diff options
Diffstat (limited to 'src/mds')
114 files changed, 84915 insertions, 0 deletions
diff --git a/src/mds/Anchor.cc b/src/mds/Anchor.cc new file mode 100644 index 00000000..02cc2d2b --- /dev/null +++ b/src/mds/Anchor.cc @@ -0,0 +1,60 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2018 Red Hat + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "mds/Anchor.h" + +#include "common/Formatter.h" + +void Anchor::encode(bufferlist &bl) const +{ + ENCODE_START(1, 1, bl); + encode(ino, bl); + encode(dirino, bl); + encode(d_name, bl); + encode(d_type, bl); + ENCODE_FINISH(bl); +} + +void Anchor::decode(bufferlist::const_iterator &bl) +{ + DECODE_START(1, bl); + decode(ino, bl); + decode(dirino, bl); + decode(d_name, bl); + decode(d_type, bl); + DECODE_FINISH(bl); +} + +void Anchor::dump(Formatter *f) const +{ + f->dump_unsigned("ino", ino); + f->dump_unsigned("dirino", dirino); + f->dump_string("d_name", d_name); + f->dump_unsigned("d_type", d_type); +} + +void Anchor::generate_test_instances(list<Anchor*>& ls) +{ + ls.push_back(new Anchor); + ls.push_back(new Anchor); + ls.back()->ino = 1; + ls.back()->dirino = 2; + ls.back()->d_name = "hello"; + ls.back()->d_type = DT_DIR; +} + +ostream& operator<<(ostream& out, const Anchor &a) +{ + return out << "a(" << a.ino << " " << a.dirino << "/'" << a.d_name << "' " << a.d_type << ")"; +} diff --git a/src/mds/Anchor.h b/src/mds/Anchor.h new file mode 100644 index 00000000..49b592b9 --- /dev/null +++ b/src/mds/Anchor.h @@ -0,0 +1,73 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2018 Red Hat + * + * This is free software; you can redistribute it and/or + * modify it under the terms of 
the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_ANCHOR_H +#define CEPH_ANCHOR_H + +#include <string> + +#include "include/types.h" +#include "mdstypes.h" +#include "include/buffer.h" + +/* + * Anchor represents primary linkage of an inode. When adding inode to an + * anchor table, MDS ensures that the table also contains inode's ancestor + * inodes. MDS can get inode's path by looking up anchor table recursively. + */ +class Anchor { +public: + inodeno_t ino; // anchored ino + inodeno_t dirino; + std::string d_name; + __u8 d_type = 0; + + int omap_idx = -1; // stored in which omap object + + Anchor() {} + Anchor(inodeno_t i, inodeno_t di, std::string_view str, __u8 tp) : + ino(i), dirino(di), d_name(str), d_type(tp) {} + + void encode(bufferlist &bl) const; + void decode(bufferlist::const_iterator &bl); + void dump(Formatter *f) const; + static void generate_test_instances(list<Anchor*>& ls); +}; +WRITE_CLASS_ENCODER(Anchor) + +inline bool operator==(const Anchor &l, const Anchor &r) { + return l.ino == r.ino && l.dirino == r.dirino && + l.d_name == r.d_name && l.d_type == r.d_type; +} + +ostream& operator<<(ostream& out, const Anchor &a); + +class RecoveredAnchor : public Anchor { +public: + RecoveredAnchor() {} + + mds_rank_t auth = MDS_RANK_NONE; // auth hint +}; + +class OpenedAnchor : public Anchor { +public: + OpenedAnchor(inodeno_t i, inodeno_t di, std::string_view str, __u8 tp, int nr) : + Anchor(i, di, str, tp), + nref(nr) + {} + + mutable int nref = 0; // how many children +}; + +#endif diff --git a/src/mds/Beacon.cc b/src/mds/Beacon.cc new file mode 100644 index 00000000..b66550bd --- /dev/null +++ b/src/mds/Beacon.cc @@ -0,0 +1,484 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2012 Red Hat + * + * This is free software; you can 
redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#include "common/dout.h" +#include "common/HeartbeatMap.h" + +#include "include/stringify.h" +#include "include/util.h" + +#include "mon/MonClient.h" +#include "mds/MDLog.h" +#include "mds/MDSRank.h" +#include "mds/MDSMap.h" +#include "mds/Locker.h" + +#include "Beacon.h" + +#include <chrono> + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mds +#undef dout_prefix +#define dout_prefix *_dout << "mds.beacon." << name << ' ' + +using namespace std::chrono_literals; + +Beacon::Beacon(CephContext *cct, MonClient *monc, std::string_view name) + : + Dispatcher(cct), + beacon_interval(g_conf()->mds_beacon_interval), + monc(monc), + name(name) +{ +} + +Beacon::~Beacon() +{ + shutdown(); +} + +void Beacon::shutdown() +{ + std::unique_lock<std::mutex> lock(mutex); + if (!finished) { + finished = true; + lock.unlock(); + if (sender.joinable()) + sender.join(); + } +} + +void Beacon::init(const MDSMap &mdsmap) +{ + std::unique_lock lock(mutex); + + _notify_mdsmap(mdsmap); + + sender = std::thread([this]() { + std::unique_lock<std::mutex> lock(mutex); + std::condition_variable c; // no one wakes us + while (!finished) { + auto now = clock::now(); + auto since = std::chrono::duration<double>(now-last_send).count(); + auto interval = beacon_interval; + if (since >= interval*.90) { + if (!_send()) { + interval = 0.5; /* 500ms */ + } + } else { + interval -= since; + } + dout(20) << "sender thread waiting interval " << interval << "s" << dendl; + c.wait_for(lock, interval*1s); + } + }); +} + +bool Beacon::ms_can_fast_dispatch2(const Message::const_ref& m) const +{ + return m->get_type() == MSG_MDS_BEACON; +} + +void Beacon::ms_fast_dispatch2(const Message::ref& m) +{ + bool handled = ms_dispatch2(m); + ceph_assert(handled); +} + +bool Beacon::ms_dispatch2(const 
Message::ref& m) +{ + if (m->get_type() == MSG_MDS_BEACON) { + if (m->get_connection()->get_peer_type() == CEPH_ENTITY_TYPE_MON) { + handle_mds_beacon(MMDSBeacon::msgref_cast(m)); + } + return true; + } + + return false; +} + + +/** + * Update lagginess state based on response from remote MDSMonitor + * + * This function puts the passed message before returning + */ +void Beacon::handle_mds_beacon(const MMDSBeacon::const_ref &m) +{ + std::unique_lock lock(mutex); + + version_t seq = m->get_seq(); + + // update lab + auto it = seq_stamp.find(seq); + if (it != seq_stamp.end()) { + auto now = clock::now(); + + last_acked_stamp = it->second; + auto rtt = std::chrono::duration<double>(now - last_acked_stamp).count(); + + dout(5) << "received beacon reply " << ceph_mds_state_name(m->get_state()) << " seq " << m->get_seq() << " rtt " << rtt << dendl; + + if (laggy && rtt < g_conf()->mds_beacon_grace) { + dout(0) << " MDS is no longer laggy" << dendl; + laggy = false; + last_laggy = now; + } + + // clean up seq_stamp map + seq_stamp.erase(seq_stamp.begin(), ++it); + + // Wake a waiter up if present + cvar.notify_all(); + } else { + dout(1) << "discarding unexpected beacon reply " << ceph_mds_state_name(m->get_state()) + << " seq " << m->get_seq() << " dne" << dendl; + } +} + + +void Beacon::send() +{ + std::unique_lock lock(mutex); + _send(); +} + + +void Beacon::send_and_wait(const double duration) +{ + std::unique_lock lock(mutex); + _send(); + auto awaiting_seq = last_seq; + dout(20) << __func__ << ": awaiting " << awaiting_seq + << " for up to " << duration << "s" << dendl; + + auto start = clock::now(); + while (!seq_stamp.empty() && seq_stamp.begin()->first <= awaiting_seq) { + auto now = clock::now(); + auto s = duration*.95-std::chrono::duration<double>(now-start).count(); + if (s < 0) break; + cvar.wait_for(lock, s*1s); + } +} + + +/** + * Call periodically, or when you have updated the desired state + */ +bool Beacon::_send() +{ + auto now = clock::now(); + auto 
since = std::chrono::duration<double>(now-last_acked_stamp).count(); + + if (!cct->get_heartbeat_map()->is_healthy()) { + /* If anything isn't progressing, let avoid sending a beacon so that + * the MDS will consider us laggy */ + dout(0) << "Skipping beacon heartbeat to monitors (last acked " << since << "s ago); MDS internal heartbeat is not healthy!" << dendl; + return false; + } + + ++last_seq; + dout(5) << "Sending beacon " << ceph_mds_state_name(want_state) << " seq " << last_seq << dendl; + + seq_stamp[last_seq] = now; + + ceph_assert(want_state != MDSMap::STATE_NULL); + + auto beacon = MMDSBeacon::create( + monc->get_fsid(), mds_gid_t(monc->get_global_id()), + name, + epoch, + want_state, + last_seq, + CEPH_FEATURES_SUPPORTED_DEFAULT); + + beacon->set_health(health); + beacon->set_compat(compat); + // piggyback the sys info on beacon msg + if (want_state == MDSMap::STATE_BOOT) { + map<string, string> sys_info; + collect_sys_info(&sys_info, cct); + sys_info["addr"] = stringify(monc->get_myaddrs()); + beacon->set_sys_info(sys_info); + } + monc->send_mon_message(beacon.detach()); + last_send = now; + return true; +} + +/** + * Call this when there is a new MDSMap available + */ +void Beacon::notify_mdsmap(const MDSMap &mdsmap) +{ + std::unique_lock lock(mutex); + + _notify_mdsmap(mdsmap); +} + +void Beacon::_notify_mdsmap(const MDSMap &mdsmap) +{ + ceph_assert(mdsmap.get_epoch() >= epoch); + + if (mdsmap.get_epoch() != epoch) { + epoch = mdsmap.get_epoch(); + compat = MDSMap::get_compat_set_default(); + compat.merge(mdsmap.compat); + } +} + + +bool Beacon::is_laggy() +{ + std::unique_lock lock(mutex); + + auto now = clock::now(); + auto since = std::chrono::duration<double>(now-last_acked_stamp).count(); + if (since > g_conf()->mds_beacon_grace) { + if (!laggy) { + dout(1) << "MDS connection to Monitors appears to be laggy; " << since + << "s since last acked beacon" << dendl; + } + laggy = true; + return true; + } + return false; +} + +void 
Beacon::set_want_state(const MDSMap &mdsmap, MDSMap::DaemonState newstate) +{ + std::unique_lock lock(mutex); + + // Update mdsmap epoch atomically with updating want_state, so that when + // we send a beacon with the new want state it has the latest epoch, and + // once we have updated to the latest epoch, we are not sending out + // a stale want_state (i.e. one from before making it through MDSMap + // handling) + _notify_mdsmap(mdsmap); + + if (want_state != newstate) { + dout(5) << __func__ << ": " + << ceph_mds_state_name(want_state) << " -> " + << ceph_mds_state_name(newstate) << dendl; + want_state = newstate; + } +} + + +/** + * We are 'shown' an MDS briefly in order to update + * some health metrics that we will send in the next + * beacon. + */ +void Beacon::notify_health(MDSRank const *mds) +{ + std::unique_lock lock(mutex); + if (!mds) { + // No MDS rank held + return; + } + + // I'm going to touch this MDS, so it must be locked + ceph_assert(mds->mds_lock.is_locked_by_me()); + + health.metrics.clear(); + + // Detect presence of entries in DamageTable + if (!mds->damage_table.empty()) { + MDSHealthMetric m(MDS_HEALTH_DAMAGE, HEALTH_ERR, std::string( + "Metadata damage detected")); + health.metrics.push_back(m); + } + + // Detect MDS_HEALTH_TRIM condition + // Indicates MDS is not trimming promptly + { + if (mds->mdlog->get_num_segments() > (size_t)(g_conf()->mds_log_max_segments * g_conf().get_val<double>("mds_log_warn_factor"))) { + std::ostringstream oss; + oss << "Behind on trimming (" << mds->mdlog->get_num_segments() + << "/" << g_conf()->mds_log_max_segments << ")"; + + MDSHealthMetric m(MDS_HEALTH_TRIM, HEALTH_WARN, oss.str()); + m.metadata["num_segments"] = stringify(mds->mdlog->get_num_segments()); + m.metadata["max_segments"] = stringify(g_conf()->mds_log_max_segments); + health.metrics.push_back(m); + } + } + + // Detect clients failing to respond to modifications to capabilities in + // CLIENT_CAPS messages. 
+ { + std::list<client_t> late_clients; + mds->locker->get_late_revoking_clients(&late_clients, + mds->mdsmap->get_session_timeout()); + std::list<MDSHealthMetric> late_cap_metrics; + + for (std::list<client_t>::iterator i = late_clients.begin(); i != late_clients.end(); ++i) { + + // client_t is equivalent to session.info.inst.name.num + // Construct an entity_name_t to lookup into SessionMap + entity_name_t ename(CEPH_ENTITY_TYPE_CLIENT, i->v); + Session const *s = mds->sessionmap.get_session(ename); + if (s == NULL) { + // Shouldn't happen, but not worth crashing if it does as this is + // just health-reporting code. + derr << "Client ID without session: " << i->v << dendl; + continue; + } + + std::ostringstream oss; + oss << "Client " << s->get_human_name() << " failing to respond to capability release"; + MDSHealthMetric m(MDS_HEALTH_CLIENT_LATE_RELEASE, HEALTH_WARN, oss.str()); + m.metadata["client_id"] = stringify(i->v); + late_cap_metrics.push_back(m); + } + + if (late_cap_metrics.size() <= (size_t)g_conf()->mds_health_summarize_threshold) { + health.metrics.splice(health.metrics.end(), late_cap_metrics); + } else { + std::ostringstream oss; + oss << "Many clients (" << late_cap_metrics.size() + << ") failing to respond to capability release"; + MDSHealthMetric m(MDS_HEALTH_CLIENT_LATE_RELEASE_MANY, HEALTH_WARN, oss.str()); + m.metadata["client_count"] = stringify(late_cap_metrics.size()); + health.metrics.push_back(m); + late_cap_metrics.clear(); + } + } + + // Detect clients failing to generate cap releases from CEPH_SESSION_RECALL_STATE + // messages. May be due to buggy client or resource-hogging application. 
+ // + // Detect clients failing to advance their old_client_tid + { + set<Session*> sessions; + mds->sessionmap.get_client_session_set(sessions); + + const auto min_caps_working_set = g_conf().get_val<uint64_t>("mds_min_caps_working_set"); + const auto recall_warning_threshold = g_conf().get_val<Option::size_t>("mds_recall_warning_threshold"); + const auto max_completed_requests = g_conf()->mds_max_completed_requests; + const auto max_completed_flushes = g_conf()->mds_max_completed_flushes; + std::list<MDSHealthMetric> late_recall_metrics; + std::list<MDSHealthMetric> large_completed_requests_metrics; + for (auto& session : sessions) { + const uint64_t num_caps = session->get_num_caps(); + const uint64_t recall_caps = session->get_recall_caps(); + if (recall_caps > recall_warning_threshold && num_caps > min_caps_working_set) { + dout(2) << "Session " << *session << + " is not releasing caps fast enough. Recalled caps at " << recall_caps + << " > " << recall_warning_threshold << " (mds_recall_warning_threshold)." << dendl; + std::ostringstream oss; + oss << "Client " << session->get_human_name() << " failing to respond to cache pressure"; + MDSHealthMetric m(MDS_HEALTH_CLIENT_RECALL, HEALTH_WARN, oss.str()); + m.metadata["client_id"] = stringify(session->get_client()); + late_recall_metrics.push_back(m); + } + if ((session->get_num_trim_requests_warnings() > 0 && + session->get_num_completed_requests() >= max_completed_requests) || + (session->get_num_trim_flushes_warnings() > 0 && + session->get_num_completed_flushes() >= max_completed_flushes)) { + std::ostringstream oss; + oss << "Client " << session->get_human_name() << " failing to advance its oldest client/flush tid. 
"; + MDSHealthMetric m(MDS_HEALTH_CLIENT_OLDEST_TID, HEALTH_WARN, oss.str()); + m.metadata["client_id"] = stringify(session->get_client()); + large_completed_requests_metrics.push_back(m); + } + } + + if (late_recall_metrics.size() <= (size_t)g_conf()->mds_health_summarize_threshold) { + health.metrics.splice(health.metrics.end(), late_recall_metrics); + } else { + std::ostringstream oss; + oss << "Many clients (" << late_recall_metrics.size() + << ") failing to respond to cache pressure"; + MDSHealthMetric m(MDS_HEALTH_CLIENT_RECALL_MANY, HEALTH_WARN, oss.str()); + m.metadata["client_count"] = stringify(late_recall_metrics.size()); + health.metrics.push_back(m); + late_recall_metrics.clear(); + } + + if (large_completed_requests_metrics.size() <= (size_t)g_conf()->mds_health_summarize_threshold) { + health.metrics.splice(health.metrics.end(), large_completed_requests_metrics); + } else { + std::ostringstream oss; + oss << "Many clients (" << large_completed_requests_metrics.size() + << ") failing to advance their oldest client/flush tid"; + MDSHealthMetric m(MDS_HEALTH_CLIENT_OLDEST_TID_MANY, HEALTH_WARN, oss.str()); + m.metadata["client_count"] = stringify(large_completed_requests_metrics.size()); + health.metrics.push_back(m); + large_completed_requests_metrics.clear(); + } + } + + // Detect MDS_HEALTH_SLOW_REQUEST condition + { + int slow = mds->get_mds_slow_req_count(); + if (slow) { + dout(20) << slow << " slow request found" << dendl; + std::ostringstream oss; + oss << slow << " slow requests are blocked > " << g_conf()->mds_op_complaint_time << " secs"; + + MDSHealthMetric m(MDS_HEALTH_SLOW_REQUEST, HEALTH_WARN, oss.str()); + health.metrics.push_back(m); + } + } + + { + auto complaint_time = g_conf()->osd_op_complaint_time; + auto now = clock::now(); + auto cutoff = now - ceph::make_timespan(complaint_time); + + std::string count; + ceph::coarse_mono_time oldest; + if (MDSIOContextBase::check_ios_in_flight(cutoff, count, oldest)) { + dout(20) << count << " 
slow metadata IOs found" << dendl; + + auto oldest_secs = std::chrono::duration<double>(now - oldest).count(); + std::ostringstream oss; + oss << count << " slow metadata IOs are blocked > " << complaint_time + << " secs, oldest blocked for " << (int64_t)oldest_secs << " secs"; + + MDSHealthMetric m(MDS_HEALTH_SLOW_METADATA_IO, HEALTH_WARN, oss.str()); + health.metrics.push_back(m); + } + } + + // Report a health warning if we are readonly + if (mds->mdcache->is_readonly()) { + MDSHealthMetric m(MDS_HEALTH_READ_ONLY, HEALTH_WARN, + "MDS in read-only mode"); + health.metrics.push_back(m); + } + + // Report if we have significantly exceeded our cache size limit + if (mds->mdcache->cache_overfull()) { + std::ostringstream oss; + oss << "MDS cache is too large (" << bytes2str(mds->mdcache->cache_size()) + << "/" << bytes2str(mds->mdcache->cache_limit_memory()) << "); " + << mds->mdcache->num_inodes_with_caps << " inodes in use by clients, " + << mds->mdcache->get_num_strays() << " stray files"; + + MDSHealthMetric m(MDS_HEALTH_CACHE_OVERSIZED, HEALTH_WARN, oss.str()); + health.metrics.push_back(m); + } +} + +MDSMap::DaemonState Beacon::get_want_state() const +{ + std::unique_lock lock(mutex); + return want_state; +} + diff --git a/src/mds/Beacon.h b/src/mds/Beacon.h new file mode 100644 index 00000000..2e84aa6c --- /dev/null +++ b/src/mds/Beacon.h @@ -0,0 +1,114 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2012 Red Hat + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#ifndef BEACON_STATE_H +#define BEACON_STATE_H + +#include <mutex> +#include <string_view> +#include <thread> + +#include "include/types.h" +#include "include/Context.h" +#include "msg/Dispatcher.h" + +#include "messages/MMDSBeacon.h" + +class MonClient; +class MDSRank; + + +/** + * One of these per MDS. Handle beacon logic in this separate class so + * that a busy MDS holding its own lock does not hold up sending beacon + * messages to the mon and cause false lagginess. + * + * So that we can continue to operate while the MDS is holding its own lock, + * we keep copies of the data needed to generate beacon messages. The MDS is + * responsible for calling Beacon::notify_* when things change. + */ +class Beacon : public Dispatcher +{ +public: + using clock = ceph::coarse_mono_clock; + using time = ceph::coarse_mono_time; + + Beacon(CephContext *cct, MonClient *monc, std::string_view name); + ~Beacon() override; + + void init(const MDSMap &mdsmap); + void shutdown(); + + bool ms_can_fast_dispatch_any() const override { return true; } + bool ms_can_fast_dispatch2(const Message::const_ref& m) const override; + void ms_fast_dispatch2(const Message::ref& m) override; + bool ms_dispatch2(const Message::ref &m) override; + void ms_handle_connect(Connection *c) override {} + bool ms_handle_reset(Connection *c) override {return false;} + void ms_handle_remote_reset(Connection *c) override {} + bool ms_handle_refused(Connection *c) override {return false;} + + void notify_mdsmap(const MDSMap &mdsmap); + void notify_health(const MDSRank *mds); + + void handle_mds_beacon(const MMDSBeacon::const_ref &m); + void send(); + + void set_want_state(const MDSMap &mdsmap, MDSMap::DaemonState newstate); + MDSMap::DaemonState get_want_state() const; + + /** + * Send a beacon, and block until the ack is received from the mon + * or `duration` seconds pass, whichever happens sooner. Useful + * for emitting a last message on shutdown. 
+ */ + void send_and_wait(const double duration); + + bool is_laggy(); + double last_cleared_laggy() const { + std::unique_lock lock(mutex); + return std::chrono::duration<double>(clock::now()-last_laggy).count(); + } + +private: + void _notify_mdsmap(const MDSMap &mdsmap); + bool _send(); + + mutable std::mutex mutex; + std::thread sender; + std::condition_variable cvar; + time last_send = clock::zero(); + double beacon_interval = 5.0; + bool finished = false; + MonClient* monc; + + // Items we duplicate from the MDS to have access under our own lock + std::string name; + version_t epoch = 0; + CompatSet compat; + MDSMap::DaemonState want_state = MDSMap::STATE_BOOT; + + // Internal beacon state + version_t last_seq = 0; // last seq sent to monitor + std::map<version_t,time> seq_stamp; // seq # -> time sent + time last_acked_stamp = clock::zero(); // last time we sent a beacon that got acked + bool laggy = false; + time last_laggy = clock::zero(); + + // Health status to be copied into each beacon message + MDSHealth health; +}; + +#endif // BEACON_STATE_H diff --git a/src/mds/CDentry.cc b/src/mds/CDentry.cc new file mode 100644 index 00000000..b2a7db1e --- /dev/null +++ b/src/mds/CDentry.cc @@ -0,0 +1,630 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + + +#include "CDentry.h" +#include "CInode.h" +#include "CDir.h" + +#include "MDSRank.h" +#include "MDCache.h" +#include "Locker.h" +#include "LogSegment.h" + +#include "messages/MLock.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mds +#undef dout_prefix +#define dout_prefix *_dout << "mds." 
<< dir->cache->mds->get_nodeid() << ".cache.den(" << dir->dirfrag() << " " << name << ") " + + +ostream& CDentry::print_db_line_prefix(ostream& out) +{ + return out << ceph_clock_now() << " mds." << dir->cache->mds->get_nodeid() << ".cache.den(" << dir->ino() << " " << name << ") "; +} + +LockType CDentry::lock_type(CEPH_LOCK_DN); +LockType CDentry::versionlock_type(CEPH_LOCK_DVERSION); + + +// CDentry + +ostream& operator<<(ostream& out, const CDentry& dn) +{ + filepath path; + dn.make_path(path); + + out << "[dentry " << path; + + if (true || dn.first != 0 || dn.last != CEPH_NOSNAP) { + out << " [" << dn.first << ","; + if (dn.last == CEPH_NOSNAP) + out << "head"; + else + out << dn.last; + out << ']'; + } + + if (dn.is_auth()) { + out << " auth"; + if (dn.is_replicated()) + out << dn.get_replicas(); + } else { + out << " rep@" << dn.authority(); + out << "." << dn.get_replica_nonce(); + } + + if (dn.get_linkage()->is_null()) out << " NULL"; + if (dn.get_linkage()->is_remote()) { + out << " REMOTE("; + out << dn.get_linkage()->get_remote_d_type_string(); + out << ")"; + } + + if (!dn.lock.is_sync_and_unlocked()) + out << " " << dn.lock; + if (!dn.versionlock.is_sync_and_unlocked()) + out << " " << dn.versionlock; + + if (dn.get_projected_version() != dn.get_version()) + out << " pv=" << dn.get_projected_version(); + out << " v=" << dn.get_version(); + + if (dn.get_num_auth_pins()) { + out << " ap=" << dn.get_num_auth_pins(); +#ifdef MDS_AUTHPIN_SET + dn.print_authpin_set(out); +#endif + } + + { + const CInode *inode = dn.get_linkage()->get_inode(); + out << " ino="; + if (inode) { + out << inode->ino(); + } else { + out << "(nil)"; + } + } + + out << " state=" << dn.get_state(); + if (dn.is_new()) out << "|new"; + if (dn.state_test(CDentry::STATE_BOTTOMLRU)) out << "|bottomlru"; + + if (dn.get_num_ref()) { + out << " |"; + dn.print_pin_set(out); + } + + out << " " << &dn; + out << "]"; + return out; +} + + +bool operator<(const CDentry& l, const CDentry& r) +{ + 
if ((l.get_dir()->ino() < r.get_dir()->ino()) || + (l.get_dir()->ino() == r.get_dir()->ino() && + (l.get_name() < r.get_name() || + (l.get_name() == r.get_name() && l.last < r.last)))) + return true; + return false; +} + + +void CDentry::print(ostream& out) +{ + out << *this; +} + + +/* +inodeno_t CDentry::get_ino() +{ + if (get_inode()) + return get_inode()->ino(); + return inodeno_t(); +} +*/ + +mds_authority_t CDentry::authority() const +{ + return dir->authority(); +} + + +void CDentry::add_waiter(uint64_t tag, MDSContext *c) +{ + // wait on the directory? + if (tag & (WAIT_UNFREEZE|WAIT_SINGLEAUTH)) { + dir->add_waiter(tag, c); + return; + } + MDSCacheObject::add_waiter(tag, c); +} + + +version_t CDentry::pre_dirty(version_t min) +{ + projected_version = dir->pre_dirty(min); + dout(10) << __func__ << " " << *this << dendl; + return projected_version; +} + + +void CDentry::_mark_dirty(LogSegment *ls) +{ + // state+pin + if (!state_test(STATE_DIRTY)) { + state_set(STATE_DIRTY); + get(PIN_DIRTY); + dir->inc_num_dirty(); + dir->dirty_dentries.push_back(&item_dir_dirty); + ceph_assert(ls); + } + if (ls) + ls->dirty_dentries.push_back(&item_dirty); +} + +void CDentry::mark_dirty(version_t pv, LogSegment *ls) +{ + dout(10) << __func__ << " " << *this << dendl; + + // i now live in this new dir version + ceph_assert(pv <= projected_version); + version = pv; + _mark_dirty(ls); + + // mark dir too + dir->mark_dirty(pv, ls); +} + + +void CDentry::mark_clean() +{ + dout(10) << __func__ << " " << *this << dendl; + ceph_assert(is_dirty()); + + // not always true for recalc_auth_bits during resolve finish + //assert(dir->get_version() == 0 || version <= dir->get_version()); // hmm? 
+ + state_clear(STATE_DIRTY|STATE_NEW); + dir->dec_num_dirty(); + + item_dir_dirty.remove_myself(); + item_dirty.remove_myself(); + + put(PIN_DIRTY); +} + +void CDentry::mark_new() +{ + dout(10) << __func__ << " " << *this << dendl; + state_set(STATE_NEW); +} + +void CDentry::make_path_string(string& s, bool projected) const +{ + if (dir) { + dir->inode->make_path_string(s, projected); + } else { + s = "???"; + } + s += "/"; + s.append(name.data(), name.length()); +} + +void CDentry::make_path(filepath& fp, bool projected) const +{ + ceph_assert(dir); + dir->inode->make_path(fp, projected); + fp.push_dentry(get_name()); +} + +/* + * we only add ourselves to remote_parents when the linkage is + * active (no longer projected). if the passed dnl is projected, + * don't link in, and do that work later in pop_projected_linkage(). + */ +void CDentry::link_remote(CDentry::linkage_t *dnl, CInode *in) +{ + ceph_assert(dnl->is_remote()); + ceph_assert(in->ino() == dnl->get_remote_ino()); + dnl->inode = in; + + if (dnl == &linkage) + in->add_remote_parent(this); +} + +void CDentry::unlink_remote(CDentry::linkage_t *dnl) +{ + ceph_assert(dnl->is_remote()); + ceph_assert(dnl->inode); + + if (dnl == &linkage) + dnl->inode->remove_remote_parent(this); + + dnl->inode = 0; +} + +void CDentry::push_projected_linkage() +{ + _project_linkage(); + + if (is_auth()) { + CInode *diri = dir->inode; + if (diri->is_stray()) + diri->mdcache->notify_stray_removed(); + } +} + + +void CDentry::push_projected_linkage(CInode *inode) +{ + // dirty rstat tracking is in the projected plane + bool dirty_rstat = inode->is_dirty_rstat(); + if (dirty_rstat) + inode->clear_dirty_rstat(); + + _project_linkage()->inode = inode; + inode->push_projected_parent(this); + + if (dirty_rstat) + inode->mark_dirty_rstat(); + + if (is_auth()) { + CInode *diri = dir->inode; + if (diri->is_stray()) + diri->mdcache->notify_stray_created(); + } +} + +CDentry::linkage_t *CDentry::pop_projected_linkage() +{ + 
ceph_assert(projected.size()); + + linkage_t& n = projected.front(); + + /* + * the idea here is that the link_remote_inode(), link_primary_inode(), + * etc. calls should make linkage identical to &n (and we assert as + * much). + */ + + if (n.remote_ino) { + dir->link_remote_inode(this, n.remote_ino, n.remote_d_type); + if (n.inode) { + linkage.inode = n.inode; + linkage.inode->add_remote_parent(this); + } + } else if (n.inode) { + dir->link_primary_inode(this, n.inode); + n.inode->pop_projected_parent(); + } + + ceph_assert(n.inode == linkage.inode); + ceph_assert(n.remote_ino == linkage.remote_ino); + ceph_assert(n.remote_d_type == linkage.remote_d_type); + + projected.pop_front(); + + return &linkage; +} + + + +// ---------------------------- +// auth pins + +int CDentry::get_num_dir_auth_pins() const +{ + ceph_assert(!is_projected()); + if (get_linkage()->is_primary()) + return auth_pins + get_linkage()->get_inode()->get_num_auth_pins(); + return auth_pins; +} + +bool CDentry::can_auth_pin(int *err_ret) const +{ + ceph_assert(dir); + return dir->can_auth_pin(err_ret); +} + +void CDentry::auth_pin(void *by) +{ + if (auth_pins == 0) + get(PIN_AUTHPIN); + auth_pins++; + +#ifdef MDS_AUTHPIN_SET + auth_pin_set.insert(by); +#endif + + dout(10) << "auth_pin by " << by << " on " << *this << " now " << auth_pins << dendl; + + dir->adjust_nested_auth_pins(1, by); +} + +void CDentry::auth_unpin(void *by) +{ + auth_pins--; + +#ifdef MDS_AUTHPIN_SET + { + auto it = auth_pin_set.find(by); + ceph_assert(it != auth_pin_set.end()); + auth_pin_set.erase(it); + } +#endif + + if (auth_pins == 0) + put(PIN_AUTHPIN); + + dout(10) << "auth_unpin by " << by << " on " << *this << " now " << auth_pins << dendl; + ceph_assert(auth_pins >= 0); + + dir->adjust_nested_auth_pins(-1, by); +} + +void CDentry::adjust_nested_auth_pins(int diradj, void *by) +{ + dir->adjust_nested_auth_pins(diradj, by); +} + +bool CDentry::is_frozen() const +{ + return dir->is_frozen(); +} + +bool 
CDentry::is_freezing() const +{ + return dir->is_freezing(); +} + +void CDentry::decode_replica(bufferlist::const_iterator& p, bool is_new) +{ + __u32 nonce; + decode(nonce, p); + replica_nonce = nonce; + + decode(first, p); + + inodeno_t rino; + unsigned char rdtype; + decode(rino, p); + decode(rdtype, p); + lock.decode_state(p, is_new); + + bool need_recover; + decode(need_recover, p); + + if (is_new) { + if (rino) + dir->link_remote_inode(this, rino, rdtype); + if (need_recover) + lock.mark_need_recover(); + } +} + +// ---------------------------- +// locking + +void CDentry::set_object_info(MDSCacheObjectInfo &info) +{ + info.dirfrag = dir->dirfrag(); + info.dname = name; + info.snapid = last; +} + +void CDentry::encode_lock_state(int type, bufferlist& bl) +{ + encode(first, bl); + + // null, ino, or remote_ino? + char c; + if (linkage.is_primary()) { + c = 1; + encode(c, bl); + encode(linkage.get_inode()->inode.ino, bl); + } + else if (linkage.is_remote()) { + c = 2; + encode(c, bl); + encode(linkage.get_remote_ino(), bl); + } + else if (linkage.is_null()) { + // encode nothing. + } + else ceph_abort(); +} + +void CDentry::decode_lock_state(int type, const bufferlist& bl) +{ + auto p = bl.cbegin(); + + snapid_t newfirst; + decode(newfirst, p); + + if (!is_auth() && newfirst != first) { + dout(10) << __func__ << " first " << first << " -> " << newfirst << dendl; + ceph_assert(newfirst > first); + first = newfirst; + } + + if (p.end()) { + // null + ceph_assert(linkage.is_null()); + return; + } + + char c; + inodeno_t ino; + decode(c, p); + + switch (c) { + case 1: + case 2: + decode(ino, p); + // newly linked? + if (linkage.is_null() && !is_auth()) { + // force trim from cache! + dout(10) << __func__ << " replica dentry null -> non-null, must trim" << dendl; + //assert(get_num_ref() == 0); + } else { + // verify? 
// --- continuation of CDentry::decode_lock_state() ---

    }
    break;
  default:
    ceph_abort();
  }
}


// Get-or-create the lease record for client c.  The first lease on this
// dentry pins it (PIN_CLIENTLEASE) and bumps the lock's lease count; a
// fresh lease gets its seq from the session's monotonic lease_seq.
// NOTE(review): the new ClientLease is not attached to any session/lock
// elists here — presumably the caller does that; verify against callers.
ClientLease *CDentry::add_client_lease(client_t c, Session *session)
{
  ClientLease *l;
  if (client_lease_map.count(c))
    l = client_lease_map[c];
  else {
    dout(20) << __func__ << " client." << c << " on " << lock << dendl;
    if (client_lease_map.empty()) {
      get(PIN_CLIENTLEASE);
      lock.get_client_lease();
    }
    l = client_lease_map[c] = new ClientLease(c, this);
    l->seq = ++session->lease_seq;

  }

  return l;
}

// Drop one client's lease; when the last lease goes away, release the
// lock's lease count and PIN_CLIENTLEASE, and re-evaluate the lock if it
// was mid-transition (not stable).
void CDentry::remove_client_lease(ClientLease *l, Locker *locker)
{
  ceph_assert(l->parent == this);

  bool gather = false;

  dout(20) << __func__ << " client." << l->client << " on " << lock << dendl;

  client_lease_map.erase(l->client);
  l->item_lease.remove_myself();
  l->item_session_lease.remove_myself();
  delete l;

  if (client_lease_map.empty()) {
    gather = !lock.is_stable();
    lock.put_client_lease();
    put(PIN_CLIENTLEASE);
  }

  if (gather)
    locker->eval_gather(&lock);
}

void CDentry::remove_client_leases(Locker *locker)
{
  while (!client_lease_map.empty())
    remove_client_lease(client_lease_map.begin()->second, locker);
}

// Called as references drop: when only the dirty flag (plus the ref being
// released) remains, a primary-linked inode may be a stray candidate, so
// ask the cache to re-evaluate it.
void CDentry::_put()
{
  if (get_num_ref() <= ((int)is_dirty() + 1)) {
    CDentry::linkage_t *dnl = get_projected_linkage();
    if (dnl->is_primary()) {
      CInode *in = dnl->get_inode();
      if (get_num_ref() == (int)is_dirty() + !!in->get_num_ref())
        in->mdcache->maybe_eval_stray(in, true);
    }
  }
}

// Emit this dentry's full state (path, snap range, linkage, versions,
// locks, state flags) into a Formatter, for admin-socket style output.
void CDentry::dump(Formatter *f) const
{
  ceph_assert(f != NULL);

  filepath path;
  make_path(path);

  f->dump_string("path", path.get_path());
  f->dump_unsigned("path_ino", path.get_ino().val);
  f->dump_unsigned("snap_first", first);
  f->dump_unsigned("snap_last", last);

  f->dump_bool("is_primary", get_linkage()->is_primary());
  f->dump_bool("is_remote", get_linkage()->is_remote());
  f->dump_bool("is_null", get_linkage()->is_null());
  f->dump_bool("is_new", is_new());
  if (get_linkage()->get_inode()) {
// --- continuation of CDentry::dump() ---
    f->dump_unsigned("inode", get_linkage()->get_inode()->ino());
  } else {
    f->dump_unsigned("inode", 0);
  }

  if (linkage.is_remote()) {
    f->dump_string("remote_type", linkage.get_remote_d_type_string());
  } else {
    f->dump_string("remote_type", "");
  }

  f->dump_unsigned("version", get_version());
  f->dump_unsigned("projected_version", get_projected_version());

  f->dump_int("auth_pins", auth_pins);

  MDSCacheObject::dump(f);

  f->open_object_section("lock");
  lock.dump(f);
  f->close_section();

  f->open_object_section("versionlock");
  versionlock.dump(f);
  f->close_section();

  f->open_array_section("states");
  MDSCacheObject::dump_states(f);
  if (state_test(STATE_NEW))
    f->dump_string("state", "new");
  if (state_test(STATE_FRAGMENTING))
    f->dump_string("state", "fragmenting");
  if (state_test(STATE_PURGING))
    f->dump_string("state", "purging");
  if (state_test(STATE_BADREMOTEINO))
    f->dump_string("state", "badremoteino");
  if (state_test(STATE_STRAY))
    f->dump_string("state", "stray");
  f->close_section();
}

// Human-readable name for the remote link's dirent type.  remote_d_type
// holds DT_* bits; DTTOIF converts them to S_IF* mode bits for the switch.
std::string CDentry::linkage_t::get_remote_d_type_string() const
{
  switch (DTTOIF(remote_d_type)) {
  case S_IFSOCK: return "sock";
  case S_IFLNK: return "lnk";
  case S_IFREG: return "reg";
  case S_IFBLK: return "blk";
  case S_IFDIR: return "dir";
  case S_IFCHR: return "chr";
  case S_IFIFO: return "fifo";
  default: ceph_abort(); return "";
  }
}

// allocate CDentry objects from the mds_co mempool
MEMPOOL_DEFINE_OBJECT_FACTORY(CDentry, co_dentry, mds_co);
diff --git a/src/mds/CDentry.h b/src/mds/CDentry.h
new file mode 100644
index 00000000..56aa58c5
--- /dev/null
+++ b/src/mds/CDentry.h
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version
 * 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */



#ifndef CEPH_CDENTRY_H
#define CEPH_CDENTRY_H

#include <string>
#include <string_view>
#include <set>

#include "include/counter.h"
#include "include/types.h"
#include "include/buffer_fwd.h"
#include "include/lru.h"
#include "include/elist.h"
#include "include/filepath.h"

#include "MDSCacheObject.h"
#include "MDSContext.h"
#include "SimpleLock.h"
#include "LocalLock.h"
#include "ScrubHeader.h"

class CInode;
class CDir;
class Locker;
class CDentry;
class LogSegment;

class Session;



// define an ordering
bool operator<(const CDentry& l, const CDentry& r);

// dentry: an in-memory directory entry, keyed by (name, snapid range),
// linking a CDir to a CInode (primary), a remote inode number (remote),
// or nothing (null).
class CDentry : public MDSCacheObject, public LRUObject, public Counter<CDentry> {
public:
  MEMPOOL_CLASS_HELPERS();
  friend class CDir;

  // The link target of a dentry.  Exactly one of three shapes:
  //   primary: inode set, remote_ino == 0
  //   remote:  remote_ino > 0 (inode pointer optional, cached)
  //   null:    neither set
  struct linkage_t {
    CInode *inode = nullptr;
    inodeno_t remote_ino = 0;
    unsigned char remote_d_type = 0;  // DT_* dirent type of the remote target

    linkage_t() {}

    // dentry type is primary || remote || null
    // inode ptr is required for primary, optional for remote, undefined for null
    bool is_primary() const { return remote_ino == 0 && inode != 0; }
    bool is_remote() const { return remote_ino > 0; }
    bool is_null() const { return remote_ino == 0 && inode == 0; }

    CInode *get_inode() { return inode; }
    const CInode *get_inode() const { return inode; }
    inodeno_t get_remote_ino() const { return remote_ino; }
    unsigned char get_remote_d_type() const { return remote_d_type; }
    std::string get_remote_d_type_string() const;

    void set_remote(inodeno_t ino, unsigned char d_type) {
      remote_ino = ino;
      remote_d_type = d_type;
      inode = 0;
    }
    void link_remote(CInode *in);
  };


  // -- state --
  static const int STATE_NEW =          (1<<0);
  static const int STATE_FRAGMENTING =  (1<<1);
  static const int STATE_PURGING =      (1<<2);
  static const int STATE_BADREMOTEINO = (1<<3);
  static const int STATE_EVALUATINGSTRAY = (1<<4);
  static const int STATE_PURGINGPINNED = (1<<5);
  static const int STATE_BOTTOMLRU =    (1<<6);
  // stray dentry needs notification of releasing reference
  static const int STATE_STRAY = STATE_NOTIFYREF;
  // only STATE_BOTTOMLRU survives a cross-MDS import (see decode_import)
  static const int MASK_STATE_IMPORT_KEPT = STATE_BOTTOMLRU;

  // -- pins --
  static const int PIN_INODEPIN =     1;  // linked inode is pinned
  static const int PIN_FRAGMENTING = -2;  // containing dir is refragmenting
  static const int PIN_PURGING =      3;
  static const int PIN_SCRUBPARENT =  4;

  static const unsigned EXPORT_NONCE = 1;


  // null-dentry constructor
  CDentry(std::string_view n, __u32 h,
          snapid_t f, snapid_t l) :
    hash(h),
    first(f), last(l),
    item_dirty(this),
    lock(this, &lock_type),
    versionlock(this, &versionlock_type),
    name(n)
  {}
  // remote-dentry constructor: starts life linked to (ino, d_type)
  CDentry(std::string_view n, __u32 h, inodeno_t ino, unsigned char dt,
          snapid_t f, snapid_t l) :
    hash(h),
    first(f), last(l),
    item_dirty(this),
    lock(this, &lock_type),
    versionlock(this, &versionlock_type),
    name(n)
  {
    linkage.remote_ino = ino;
    linkage.remote_d_type = dt;
  }

  std::string_view pin_name(int p) const override {
    switch (p) {
    case PIN_INODEPIN: return "inodepin";
    case PIN_FRAGMENTING: return "fragmenting";
    case PIN_PURGING: return "purging";
    case PIN_SCRUBPARENT: return "scrubparent";
    default: return generic_pin_name(p);
    }
  }

  // -- wait --
  //static const int WAIT_LOCK_OFFSET = 8;

  void add_waiter(uint64_t tag, MDSContext *c) override;

  bool is_lt(const MDSCacheObject *r) const override {
    return *this < *static_cast<const CDentry*>(r);
  }

  // map key within the containing CDir: (last snapid, name, hash)
  dentry_key_t key() {
    return dentry_key_t(last, name.c_str(), hash);
  }

  const CDir *get_dir() const { return dir; }
  CDir *get_dir() { return dir; }
  std::string_view get_name() const { return std::string_view(name); }

  __u32 get_hash() const { return hash; }

  // linkage
  const linkage_t *get_linkage() const { return &linkage; }
  linkage_t *get_linkage() { return &linkage; }

  // start a new (uncommitted) projected linkage; popped back into the
  // live linkage by pop_projected_linkage()
  linkage_t *_project_linkage() {
    projected.push_back(linkage_t());
    return &projected.back();
  }
  void push_projected_linkage();
  void push_projected_linkage(inodeno_t ino, char d_type) {
    linkage_t *p = _project_linkage();
    p->remote_ino = ino;
    p->remote_d_type = d_type;
  }
  void push_projected_linkage(CInode *inode);
  linkage_t *pop_projected_linkage();

  bool is_projected() const { return !projected.empty(); }

  linkage_t *get_projected_linkage() {
    if (!projected.empty())
      return &projected.back();
    return &linkage;
  }

  const linkage_t *get_projected_linkage() const {
    if (!projected.empty())
      return &projected.back();
    return &linkage;
  }

  CInode *get_projected_inode() {
    return get_projected_linkage()->inode;
  }

  // whether this client/mutation may observe the projected (not yet
  // committed) linkage instead of the live one
  bool use_projected(client_t client, const MutationRef& mut) const {
    return lock.can_read_projected(client) ||
           lock.get_xlock_by() == mut;
  }
  linkage_t *get_linkage(client_t client, const MutationRef& mut) {
    return use_projected(client, mut) ? get_projected_linkage() : get_linkage();
  }

  // ref counts: pin ourselves in the LRU when we're pinned.
  void first_get() override {
    lru_pin();
  }
  void last_put() override {
    lru_unpin();
  }
  void _put() override;

  // auth pins
  bool can_auth_pin(int *err_ret=nullptr) const override;
  void auth_pin(void *by) override;
  void auth_unpin(void *by) override;
  void adjust_nested_auth_pins(int diradj, void *by);
  bool is_frozen() const override;
  bool is_freezing() const override;
  int get_num_dir_auth_pins() const;

  // remote links
  void link_remote(linkage_t *dnl, CInode *in);
  void unlink_remote(linkage_t *dnl);

  // copy cons
  CDentry(const CDentry& m);
  const CDentry& operator= (const CDentry& right);

  // misc
  void make_path_string(std::string& s, bool projected=false) const;
  void make_path(filepath& fp, bool projected=false) const;

  // -- version --
  version_t get_version() const { return version; }
  void set_version(version_t v) { projected_version = version = v; }
  version_t get_projected_version() const { return projected_version; }
  void set_projected_version(version_t v) { projected_version = v; }

  mds_authority_t authority() const override;

  version_t pre_dirty(version_t min=0);
  void _mark_dirty(LogSegment *ls);
  void mark_dirty(version_t projected_dirv, LogSegment *ls);
  void mark_clean();

  void mark_new();
  bool is_new() const { return state_test(STATE_NEW); }
  void clear_new() { state_clear(STATE_NEW); }

  // -- replication
  // Encode state for a replica MDS.  Field order must match
  // decode_replica(): nonce, first, remote ino, remote d_type, lock
  // state, need_recover flag.
  void encode_replica(mds_rank_t mds, bufferlist& bl, bool need_recover) {
    __u32 nonce = add_replica(mds);
    encode(nonce, bl);
    encode(first, bl);
    encode(linkage.remote_ino, bl);
    encode(linkage.remote_d_type, bl);
    lock.encode_state_for_replica(bl);
    encode(need_recover, bl);
  }
  void decode_replica(bufferlist::const_iterator& p, bool is_new);

  // -- exporting
  // note: this assumes the dentry already exists.
  // i.e., the name is already extracted... so we just need the other state.
  void encode_export(bufferlist& bl) {
    encode(first, bl);
    encode(state, bl);
    encode(version, bl);
    encode(projected_version, bl);
    encode(lock, bl);
    encode(get_replicas(), bl);
    get(PIN_TEMPEXPORTING);  // pin until finish_export()/abort_export()
  }
  void finish_export() {
    // twiddle
    clear_replica_map();
    replica_nonce = EXPORT_NONCE;
    state_clear(CDentry::STATE_AUTH);
    if (is_dirty())
      mark_clean();
    put(PIN_TEMPEXPORTING);
  }
  void abort_export() {
    put(PIN_TEMPEXPORTING);
  }
  // Counterpart of encode_export(); same field order.  Becomes auth and
  // re-derives dirty/replicated pins from the imported state word.
  void decode_import(bufferlist::const_iterator& blp, LogSegment *ls) {
    decode(first, blp);
    __u32 nstate;
    decode(nstate, blp);
    decode(version, blp);
    decode(projected_version, blp);
    decode(lock, blp);
    decode(get_replicas(), blp);

    // twiddle
    state &= MASK_STATE_IMPORT_KEPT;
    state_set(CDentry::STATE_AUTH);
    if (nstate & STATE_DIRTY)
      _mark_dirty(ls);
    if (is_replicated())
      get(PIN_REPLICATED);
    replica_nonce = 0;
  }

  // -- locking --
  SimpleLock* get_lock(int type) override {
    ceph_assert(type == CEPH_LOCK_DN);
    return &lock;
  }
  void set_object_info(MDSCacheObjectInfo &info) override;
  void encode_lock_state(int type, bufferlist& bl) override;
  void decode_lock_state(int type, const bufferlist& bl) override;

  // ---------------------------------------------
  // replicas (on clients)

  bool is_any_leases() const {
    return !client_lease_map.empty();
  }
  const ClientLease *get_client_lease(client_t c) const {
    if (client_lease_map.count(c))
      return client_lease_map.find(c)->second;
    return 0;
  }
  ClientLease *get_client_lease(client_t c) {
    if (client_lease_map.count(c))
      return client_lease_map.find(c)->second;
    return 0;
  }
  bool have_client_lease(client_t c) const {
    const ClientLease *l = get_client_lease(c);
    if (l)
      return true;
    else
      return false;
  }

  ClientLease *add_client_lease(client_t c, Session *session);
  void remove_client_lease(ClientLease *r, Locker *locker);  // returns remaining mask (if any), and kicks locker eval_gathers
  void remove_client_leases(Locker *locker);

  ostream& print_db_line_prefix(ostream& out) override;
  void print(ostream& out) override;
  void dump(Formatter *f) const;


  __u32 hash;              // cached hash of 'name'
  snapid_t first, last;    // snapid range this dentry covers

  elist<CDentry*>::item item_dirty, item_dir_dirty;
  elist<CDentry*>::item item_stray;

  // lock
  static LockType lock_type;
  static LockType versionlock_type;

  SimpleLock lock; // FIXME referenced containers not in mempool
  LocalLock versionlock; // FIXME referenced containers not in mempool

  mempool::mds_co::map<client_t,ClientLease*> client_lease_map;


protected:
  friend class Migrator;
  friend class Locker;
  friend class MDCache;
  friend class StrayManager;
  friend class CInode;
  friend class C_MDC_XlockRequest;

  CDir *dir = nullptr;     // containing dirfrag
  linkage_t linkage;       // live (committed) link target
  mempool::mds_co::list<linkage_t> projected;  // pending linkages, oldest first

  version_t version = 0;  // dir version when last touched.
  version_t projected_version = 0;  // what it will be when i unlock/commit.

private:
  mempool::mds_co::string name;
};

ostream& operator<<(ostream& out, const CDentry& dn);


#endif
diff --git a/src/mds/CDir.cc b/src/mds/CDir.cc
new file mode 100755
index 00000000..e6576542
--- /dev/null
+++ b/src/mds/CDir.cc
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
+ * + */ + +#include <string_view> + +#include "include/types.h" + +#include "CDir.h" +#include "CDentry.h" +#include "CInode.h" +#include "Mutation.h" + +#include "MDSMap.h" +#include "MDSRank.h" +#include "MDCache.h" +#include "Locker.h" +#include "MDLog.h" +#include "LogSegment.h" + +#include "common/bloom_filter.hpp" +#include "include/Context.h" +#include "common/Clock.h" + +#include "osdc/Objecter.h" + +#include "common/config.h" +#include "include/ceph_assert.h" +#include "include/compat.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mds +#undef dout_prefix +#define dout_prefix *_dout << "mds." << cache->mds->get_nodeid() << ".cache.dir(" << this->dirfrag() << ") " + +int CDir::num_frozen_trees = 0; +int CDir::num_freezing_trees = 0; + +class CDirContext : public MDSContext +{ +protected: + CDir *dir; + MDSRank* get_mds() override {return dir->cache->mds;} + +public: + explicit CDirContext(CDir *d) : dir(d) { + ceph_assert(dir != NULL); + } +}; + + +class CDirIOContext : public MDSIOContextBase +{ +protected: + CDir *dir; + MDSRank* get_mds() override {return dir->cache->mds;} + +public: + explicit CDirIOContext(CDir *d) : dir(d) { + ceph_assert(dir != NULL); + } +}; + + +// PINS +//int cdir_pins[CDIR_NUM_PINS] = { 0,0,0,0,0,0,0,0,0,0,0,0,0,0 }; + + +ostream& operator<<(ostream& out, const CDir& dir) +{ + out << "[dir " << dir.dirfrag() << " " << dir.get_path() << "/" + << " [" << dir.first << ",head]"; + if (dir.is_auth()) { + out << " auth"; + if (dir.is_replicated()) + out << dir.get_replicas(); + + if (dir.is_projected()) + out << " pv=" << dir.get_projected_version(); + out << " v=" << dir.get_version(); + out << " cv=" << dir.get_committing_version(); + out << "/" << dir.get_committed_version(); + } else { + mds_authority_t a = dir.authority(); + out << " rep@" << a.first; + if (a.second != CDIR_AUTH_UNKNOWN) + out << "," << a.second; + out << "." 
<< dir.get_replica_nonce(); + } + + if (dir.is_rep()) out << " REP"; + + if (dir.get_dir_auth() != CDIR_AUTH_DEFAULT) { + if (dir.get_dir_auth().second == CDIR_AUTH_UNKNOWN) + out << " dir_auth=" << dir.get_dir_auth().first; + else + out << " dir_auth=" << dir.get_dir_auth(); + } + + if (dir.get_auth_pins() || dir.get_dir_auth_pins()) { + out << " ap=" << dir.get_auth_pins() + << "+" << dir.get_dir_auth_pins(); +#ifdef MDS_AUTHPIN_SET + dir.print_authpin_set(out); +#endif + } + + out << " state=" << dir.get_state(); + if (dir.state_test(CDir::STATE_COMPLETE)) out << "|complete"; + if (dir.state_test(CDir::STATE_FREEZINGTREE)) out << "|freezingtree"; + if (dir.state_test(CDir::STATE_FROZENTREE)) out << "|frozentree"; + if (dir.state_test(CDir::STATE_AUXSUBTREE)) out << "|auxsubtree"; + if (dir.state_test(CDir::STATE_FROZENDIR)) out << "|frozendir"; + if (dir.state_test(CDir::STATE_FREEZINGDIR)) out << "|freezingdir"; + if (dir.state_test(CDir::STATE_EXPORTBOUND)) out << "|exportbound"; + if (dir.state_test(CDir::STATE_IMPORTBOUND)) out << "|importbound"; + if (dir.state_test(CDir::STATE_BADFRAG)) out << "|badfrag"; + if (dir.state_test(CDir::STATE_FRAGMENTING)) out << "|fragmenting"; + if (dir.state_test(CDir::STATE_CREATING)) out << "|creating"; + if (dir.state_test(CDir::STATE_COMMITTING)) out << "|committing"; + if (dir.state_test(CDir::STATE_FETCHING)) out << "|fetching"; + if (dir.state_test(CDir::STATE_EXPORTING)) out << "|exporting"; + if (dir.state_test(CDir::STATE_IMPORTING)) out << "|importing"; + if (dir.state_test(CDir::STATE_STICKY)) out << "|sticky"; + if (dir.state_test(CDir::STATE_DNPINNEDFRAG)) out << "|dnpinnedfrag"; + if (dir.state_test(CDir::STATE_ASSIMRSTAT)) out << "|assimrstat"; + + // fragstat + out << " " << dir.fnode.fragstat; + if (!(dir.fnode.fragstat == dir.fnode.accounted_fragstat)) + out << "/" << dir.fnode.accounted_fragstat; + if (g_conf()->mds_debug_scatterstat && dir.is_projected()) { + const fnode_t *pf = 
dir.get_projected_fnode(); + out << "->" << pf->fragstat; + if (!(pf->fragstat == pf->accounted_fragstat)) + out << "/" << pf->accounted_fragstat; + } + + // rstat + out << " " << dir.fnode.rstat; + if (!(dir.fnode.rstat == dir.fnode.accounted_rstat)) + out << "/" << dir.fnode.accounted_rstat; + if (g_conf()->mds_debug_scatterstat && dir.is_projected()) { + const fnode_t *pf = dir.get_projected_fnode(); + out << "->" << pf->rstat; + if (!(pf->rstat == pf->accounted_rstat)) + out << "/" << pf->accounted_rstat; + } + + out << " hs=" << dir.get_num_head_items() << "+" << dir.get_num_head_null(); + out << ",ss=" << dir.get_num_snap_items() << "+" << dir.get_num_snap_null(); + if (dir.get_num_dirty()) + out << " dirty=" << dir.get_num_dirty(); + + if (dir.get_num_ref()) { + out << " |"; + dir.print_pin_set(out); + } + + out << " " << &dir; + return out << "]"; +} + + +void CDir::print(ostream& out) +{ + out << *this; +} + + + + +ostream& CDir::print_db_line_prefix(ostream& out) +{ + return out << ceph_clock_now() << " mds." 
<< cache->mds->get_nodeid() << ".cache.dir(" << this->dirfrag() << ") "; +} + + + +// ------------------------------------------------------------------- +// CDir + +CDir::CDir(CInode *in, frag_t fg, MDCache *mdcache, bool auth) : + cache(mdcache), inode(in), frag(fg), + first(2), + dirty_rstat_inodes(member_offset(CInode, dirty_rstat_item)), + projected_version(0), + dirty_dentries(member_offset(CDentry, item_dir_dirty)), + item_dirty(this), item_new(this), + num_head_items(0), num_head_null(0), + num_snap_items(0), num_snap_null(0), + num_dirty(0), committing_version(0), committed_version(0), + dir_auth_pins(0), + dir_rep(REP_NONE), + pop_me(mdcache->decayrate), + pop_nested(mdcache->decayrate), + pop_auth_subtree(mdcache->decayrate), + pop_auth_subtree_nested(mdcache->decayrate), + pop_spread(mdcache->decayrate), + pop_lru_subdirs(member_offset(CInode, item_pop_lru)), + num_dentries_nested(0), num_dentries_auth_subtree(0), + num_dentries_auth_subtree_nested(0), + dir_auth(CDIR_AUTH_DEFAULT) +{ + // auth + ceph_assert(in->is_dir()); + if (auth) state_set(STATE_AUTH); +} + +/** + * Check the recursive statistics on size for consistency. + * If mds_debug_scatterstat is enabled, assert for correctness, + * otherwise just print out the mismatch and continue. + */ +bool CDir::check_rstats(bool scrub) +{ + if (!g_conf()->mds_debug_scatterstat && !scrub) + return true; + + dout(25) << "check_rstats on " << this << dendl; + if (!is_complete() || !is_auth() || is_frozen()) { + dout(3) << "check_rstats " << (scrub ? 
"(scrub) " : "") + << "bailing out -- incomplete or non-auth or frozen dir on " + << *this << dendl; + return !scrub; + } + + frag_info_t frag_info; + nest_info_t nest_info; + for (auto i = items.begin(); i != items.end(); ++i) { + if (i->second->last != CEPH_NOSNAP) + continue; + CDentry::linkage_t *dnl = i->second->get_linkage(); + if (dnl->is_primary()) { + CInode *in = dnl->get_inode(); + nest_info.add(in->inode.accounted_rstat); + if (in->is_dir()) + frag_info.nsubdirs++; + else + frag_info.nfiles++; + } else if (dnl->is_remote()) + frag_info.nfiles++; + } + + bool good = true; + // fragstat + if(!frag_info.same_sums(fnode.fragstat)) { + dout(1) << "mismatch between head items and fnode.fragstat! printing dentries" << dendl; + dout(1) << "get_num_head_items() = " << get_num_head_items() + << "; fnode.fragstat.nfiles=" << fnode.fragstat.nfiles + << " fnode.fragstat.nsubdirs=" << fnode.fragstat.nsubdirs << dendl; + good = false; + } else { + dout(20) << "get_num_head_items() = " << get_num_head_items() + << "; fnode.fragstat.nfiles=" << fnode.fragstat.nfiles + << " fnode.fragstat.nsubdirs=" << fnode.fragstat.nsubdirs << dendl; + } + + // rstat + if (!nest_info.same_sums(fnode.rstat)) { + dout(1) << "mismatch between child accounted_rstats and my rstats!" 
<< dendl; + dout(1) << "total of child dentrys: " << nest_info << dendl; + dout(1) << "my rstats: " << fnode.rstat << dendl; + good = false; + } else { + dout(20) << "total of child dentrys: " << nest_info << dendl; + dout(20) << "my rstats: " << fnode.rstat << dendl; + } + + if (!good) { + if (!scrub) { + for (auto i = items.begin(); i != items.end(); ++i) { + CDentry *dn = i->second; + if (dn->get_linkage()->is_primary()) { + CInode *in = dn->get_linkage()->inode; + dout(1) << *dn << " rstat " << in->inode.accounted_rstat << dendl; + } else { + dout(1) << *dn << dendl; + } + } + + ceph_assert(frag_info.nfiles == fnode.fragstat.nfiles); + ceph_assert(frag_info.nsubdirs == fnode.fragstat.nsubdirs); + ceph_assert(nest_info.rbytes == fnode.rstat.rbytes); + ceph_assert(nest_info.rfiles == fnode.rstat.rfiles); + ceph_assert(nest_info.rsubdirs == fnode.rstat.rsubdirs); + } + } + dout(10) << "check_rstats complete on " << this << dendl; + return good; +} + +void CDir::adjust_num_inodes_with_caps(int d) +{ + // FIXME: smarter way to decide if adding 'this' to open file table + if (num_inodes_with_caps == 0 && d > 0) + cache->open_file_table.add_dirfrag(this); + else if (num_inodes_with_caps > 0 && num_inodes_with_caps == -d) + cache->open_file_table.remove_dirfrag(this); + + num_inodes_with_caps += d; + ceph_assert(num_inodes_with_caps >= 0); +} + +CDentry *CDir::lookup(std::string_view name, snapid_t snap) +{ + dout(20) << "lookup (" << snap << ", '" << name << "')" << dendl; + auto iter = items.lower_bound(dentry_key_t(snap, name, inode->hash_dentry_name(name))); + if (iter == items.end()) + return 0; + if (iter->second->get_name() == name && + iter->second->first <= snap && + iter->second->last >= snap) { + dout(20) << " hit -> " << iter->first << dendl; + return iter->second; + } + dout(20) << " miss -> " << iter->first << dendl; + return 0; +} + +CDentry *CDir::lookup_exact_snap(std::string_view name, snapid_t last) { + dout(20) << __func__ << " (" << last << ", '" 
<< name << "')" << dendl; + auto p = items.find(dentry_key_t(last, name, inode->hash_dentry_name(name))); + if (p == items.end()) + return NULL; + return p->second; +} + +/*** + * linking fun + */ + +CDentry* CDir::add_null_dentry(std::string_view dname, + snapid_t first, snapid_t last) +{ + // foreign + ceph_assert(lookup_exact_snap(dname, last) == 0); + + // create dentry + CDentry* dn = new CDentry(dname, inode->hash_dentry_name(dname), first, last); + if (is_auth()) + dn->state_set(CDentry::STATE_AUTH); + + cache->bottom_lru.lru_insert_mid(dn); + dn->state_set(CDentry::STATE_BOTTOMLRU); + + dn->dir = this; + dn->version = get_projected_version(); + + // add to dir + ceph_assert(items.count(dn->key()) == 0); + //assert(null_items.count(dn->get_name()) == 0); + + items[dn->key()] = dn; + if (last == CEPH_NOSNAP) + num_head_null++; + else + num_snap_null++; + + if (state_test(CDir::STATE_DNPINNEDFRAG)) { + dn->get(CDentry::PIN_FRAGMENTING); + dn->state_set(CDentry::STATE_FRAGMENTING); + } + + dout(12) << __func__ << " " << *dn << dendl; + + // pin? 
+ if (get_num_any() == 1) + get(PIN_CHILD); + + ceph_assert(get_num_any() == items.size()); + return dn; +} + + +CDentry* CDir::add_primary_dentry(std::string_view dname, CInode *in, + snapid_t first, snapid_t last) +{ + // primary + ceph_assert(lookup_exact_snap(dname, last) == 0); + + // create dentry + CDentry* dn = new CDentry(dname, inode->hash_dentry_name(dname), first, last); + if (is_auth()) + dn->state_set(CDentry::STATE_AUTH); + if (is_auth() || !inode->is_stray()) { + cache->lru.lru_insert_mid(dn); + } else { + cache->bottom_lru.lru_insert_mid(dn); + dn->state_set(CDentry::STATE_BOTTOMLRU); + } + + dn->dir = this; + dn->version = get_projected_version(); + + // add to dir + ceph_assert(items.count(dn->key()) == 0); + //assert(null_items.count(dn->get_name()) == 0); + + items[dn->key()] = dn; + + dn->get_linkage()->inode = in; + + link_inode_work(dn, in); + + if (dn->last == CEPH_NOSNAP) + num_head_items++; + else + num_snap_items++; + + if (state_test(CDir::STATE_DNPINNEDFRAG)) { + dn->get(CDentry::PIN_FRAGMENTING); + dn->state_set(CDentry::STATE_FRAGMENTING); + } + + dout(12) << __func__ << " " << *dn << dendl; + + // pin? 
+ if (get_num_any() == 1) + get(PIN_CHILD); + ceph_assert(get_num_any() == items.size()); + return dn; +} + +CDentry* CDir::add_remote_dentry(std::string_view dname, inodeno_t ino, unsigned char d_type, + snapid_t first, snapid_t last) +{ + // foreign + ceph_assert(lookup_exact_snap(dname, last) == 0); + + // create dentry + CDentry* dn = new CDentry(dname, inode->hash_dentry_name(dname), ino, d_type, first, last); + if (is_auth()) + dn->state_set(CDentry::STATE_AUTH); + cache->lru.lru_insert_mid(dn); + + dn->dir = this; + dn->version = get_projected_version(); + + // add to dir + ceph_assert(items.count(dn->key()) == 0); + //assert(null_items.count(dn->get_name()) == 0); + + items[dn->key()] = dn; + if (last == CEPH_NOSNAP) + num_head_items++; + else + num_snap_items++; + + if (state_test(CDir::STATE_DNPINNEDFRAG)) { + dn->get(CDentry::PIN_FRAGMENTING); + dn->state_set(CDentry::STATE_FRAGMENTING); + } + + dout(12) << __func__ << " " << *dn << dendl; + + // pin? + if (get_num_any() == 1) + get(PIN_CHILD); + + ceph_assert(get_num_any() == items.size()); + return dn; +} + + + +void CDir::remove_dentry(CDentry *dn) +{ + dout(12) << __func__ << " " << *dn << dendl; + + // there should be no client leases at this point! + ceph_assert(dn->client_lease_map.empty()); + + if (state_test(CDir::STATE_DNPINNEDFRAG)) { + dn->put(CDentry::PIN_FRAGMENTING); + dn->state_clear(CDentry::STATE_FRAGMENTING); + } + + if (dn->get_linkage()->is_null()) { + if (dn->last == CEPH_NOSNAP) + num_head_null--; + else + num_snap_null--; + } else { + if (dn->last == CEPH_NOSNAP) + num_head_items--; + else + num_snap_items--; + } + + if (!dn->get_linkage()->is_null()) + // detach inode and dentry + unlink_inode_work(dn); + + // remove from list + ceph_assert(items.count(dn->key()) == 1); + items.erase(dn->key()); + + // clean? 
+ if (dn->is_dirty()) + dn->mark_clean(); + + if (dn->state_test(CDentry::STATE_BOTTOMLRU)) + cache->bottom_lru.lru_remove(dn); + else + cache->lru.lru_remove(dn); + delete dn; + + // unpin? + if (get_num_any() == 0) + put(PIN_CHILD); + ceph_assert(get_num_any() == items.size()); +} + +void CDir::link_remote_inode(CDentry *dn, CInode *in) +{ + link_remote_inode(dn, in->ino(), IFTODT(in->get_projected_inode()->mode)); +} + +void CDir::link_remote_inode(CDentry *dn, inodeno_t ino, unsigned char d_type) +{ + dout(12) << __func__ << " " << *dn << " remote " << ino << dendl; + ceph_assert(dn->get_linkage()->is_null()); + + dn->get_linkage()->set_remote(ino, d_type); + + if (dn->state_test(CDentry::STATE_BOTTOMLRU)) { + cache->bottom_lru.lru_remove(dn); + cache->lru.lru_insert_mid(dn); + dn->state_clear(CDentry::STATE_BOTTOMLRU); + } + + if (dn->last == CEPH_NOSNAP) { + num_head_items++; + num_head_null--; + } else { + num_snap_items++; + num_snap_null--; + } + ceph_assert(get_num_any() == items.size()); +} + +void CDir::link_primary_inode(CDentry *dn, CInode *in) +{ + dout(12) << __func__ << " " << *dn << " " << *in << dendl; + ceph_assert(dn->get_linkage()->is_null()); + + dn->get_linkage()->inode = in; + + link_inode_work(dn, in); + + if (dn->state_test(CDentry::STATE_BOTTOMLRU) && + (is_auth() || !inode->is_stray())) { + cache->bottom_lru.lru_remove(dn); + cache->lru.lru_insert_mid(dn); + dn->state_clear(CDentry::STATE_BOTTOMLRU); + } + + if (dn->last == CEPH_NOSNAP) { + num_head_items++; + num_head_null--; + } else { + num_snap_items++; + num_snap_null--; + } + + ceph_assert(get_num_any() == items.size()); +} + +void CDir::link_inode_work( CDentry *dn, CInode *in) +{ + ceph_assert(dn->get_linkage()->get_inode() == in); + in->set_primary_parent(dn); + + // set inode version + //in->inode.version = dn->get_version(); + + // pin dentry? 
+ if (in->get_num_ref()) + dn->get(CDentry::PIN_INODEPIN); + + if (in->state_test(CInode::STATE_TRACKEDBYOFT)) + inode->mdcache->open_file_table.notify_link(in); + if (in->is_any_caps()) + adjust_num_inodes_with_caps(1); + + // adjust auth pin count + if (in->auth_pins) + dn->adjust_nested_auth_pins(in->auth_pins, NULL); + + // verify open snaprealm parent + if (in->snaprealm) + in->snaprealm->adjust_parent(); + else if (in->is_any_caps()) + in->move_to_realm(inode->find_snaprealm()); +} + +void CDir::unlink_inode(CDentry *dn, bool adjust_lru) +{ + if (dn->get_linkage()->is_primary()) { + dout(12) << __func__ << " " << *dn << " " << *dn->get_linkage()->get_inode() << dendl; + } else { + dout(12) << __func__ << " " << *dn << dendl; + } + + unlink_inode_work(dn); + + if (adjust_lru && !dn->state_test(CDentry::STATE_BOTTOMLRU)) { + cache->lru.lru_remove(dn); + cache->bottom_lru.lru_insert_mid(dn); + dn->state_set(CDentry::STATE_BOTTOMLRU); + } + + if (dn->last == CEPH_NOSNAP) { + num_head_items--; + num_head_null++; + } else { + num_snap_items--; + num_snap_null++; + } + ceph_assert(get_num_any() == items.size()); +} + + +void CDir::try_remove_unlinked_dn(CDentry *dn) +{ + ceph_assert(dn->dir == this); + ceph_assert(dn->get_linkage()->is_null()); + + // no pins (besides dirty)? + if (dn->get_num_ref() != dn->is_dirty()) + return; + + // was the dn new? + if (dn->is_new()) { + dout(10) << __func__ << " " << *dn << " in " << *this << dendl; + if (dn->is_dirty()) + dn->mark_clean(); + remove_dentry(dn); + + // NOTE: we may not have any more dirty dentries, but the fnode + // still changed, so the directory must remain dirty. + } +} + + +void CDir::unlink_inode_work(CDentry *dn) +{ + CInode *in = dn->get_linkage()->get_inode(); + + if (dn->get_linkage()->is_remote()) { + // remote + if (in) + dn->unlink_remote(dn->get_linkage()); + + dn->get_linkage()->set_remote(0, 0); + } else if (dn->get_linkage()->is_primary()) { + // primary + // unpin dentry? 
+ if (in->get_num_ref()) + dn->put(CDentry::PIN_INODEPIN); + + if (in->state_test(CInode::STATE_TRACKEDBYOFT)) + inode->mdcache->open_file_table.notify_unlink(in); + if (in->is_any_caps()) + adjust_num_inodes_with_caps(-1); + + // unlink auth_pin count + if (in->auth_pins) + dn->adjust_nested_auth_pins(-in->auth_pins, nullptr); + + // detach inode + in->remove_primary_parent(dn); + if (in->is_dir()) + in->item_pop_lru.remove_myself(); + dn->get_linkage()->inode = 0; + } else { + ceph_assert(!dn->get_linkage()->is_null()); + } +} + +void CDir::add_to_bloom(CDentry *dn) +{ + ceph_assert(dn->last == CEPH_NOSNAP); + if (!bloom) { + /* not create bloom filter for incomplete dir that was added by log replay */ + if (!is_complete()) + return; + + /* don't maintain bloom filters in standby replay (saves cycles, and also + * avoids need to implement clearing it in EExport for #16924) */ + if (cache->mds->is_standby_replay()) { + return; + } + + unsigned size = get_num_head_items() + get_num_snap_items(); + if (size < 100) size = 100; + bloom.reset(new bloom_filter(size, 1.0 / size, 0)); + } + /* This size and false positive probability is completely random.*/ + bloom->insert(dn->get_name().data(), dn->get_name().size()); +} + +bool CDir::is_in_bloom(std::string_view name) +{ + if (!bloom) + return false; + return bloom->contains(name.data(), name.size()); +} + +void CDir::remove_null_dentries() { + dout(12) << __func__ << " " << *this << dendl; + + auto p = items.begin(); + while (p != items.end()) { + CDentry *dn = p->second; + ++p; + if (dn->get_linkage()->is_null() && !dn->is_projected()) + remove_dentry(dn); + } + + ceph_assert(num_snap_null == 0); + ceph_assert(num_head_null == 0); + ceph_assert(get_num_any() == items.size()); +} + +/** remove dirty null dentries for deleted directory. the dirfrag will be + * deleted soon, so it's safe to not commit dirty dentries. 
+ * + * This is called when a directory is being deleted, a prerequisite + * of which is that its children have been unlinked: we expect to only see + * null, unprojected dentries here. + */ +void CDir::try_remove_dentries_for_stray() +{ + dout(10) << __func__ << dendl; + ceph_assert(get_parent_dir()->inode->is_stray()); + + // clear dirty only when the directory was not snapshotted + bool clear_dirty = !inode->snaprealm; + + auto p = items.begin(); + while (p != items.end()) { + CDentry *dn = p->second; + ++p; + if (dn->last == CEPH_NOSNAP) { + ceph_assert(!dn->is_projected()); + ceph_assert(dn->get_linkage()->is_null()); + if (clear_dirty && dn->is_dirty()) + dn->mark_clean(); + // It's OK to remove lease prematurely because we will never link + // the dentry to inode again. + if (dn->is_any_leases()) + dn->remove_client_leases(cache->mds->locker); + if (dn->get_num_ref() == 0) + remove_dentry(dn); + } else { + ceph_assert(!dn->is_projected()); + CDentry::linkage_t *dnl= dn->get_linkage(); + CInode *in = NULL; + if (dnl->is_primary()) { + in = dnl->get_inode(); + if (clear_dirty && in->is_dirty()) + in->mark_clean(); + } + if (clear_dirty && dn->is_dirty()) + dn->mark_clean(); + if (dn->get_num_ref() == 0) { + remove_dentry(dn); + if (in) + cache->remove_inode(in); + } + } + } + + if (clear_dirty && is_dirty()) + mark_clean(); +} + +bool CDir::try_trim_snap_dentry(CDentry *dn, const set<snapid_t>& snaps) +{ + ceph_assert(dn->last != CEPH_NOSNAP); + set<snapid_t>::const_iterator p = snaps.lower_bound(dn->first); + CDentry::linkage_t *dnl= dn->get_linkage(); + CInode *in = 0; + if (dnl->is_primary()) + in = dnl->get_inode(); + if ((p == snaps.end() || *p > dn->last) && + (dn->get_num_ref() == dn->is_dirty()) && + (!in || in->get_num_ref() == in->is_dirty())) { + dout(10) << " purging snapped " << *dn << dendl; + if (in && in->is_dirty()) + in->mark_clean(); + remove_dentry(dn); + if (in) { + dout(10) << " purging snapped " << *in << dendl; + 
cache->remove_inode(in); + } + return true; + } + return false; +} + + +void CDir::purge_stale_snap_data(const set<snapid_t>& snaps) +{ + dout(10) << __func__ << " " << snaps << dendl; + + auto p = items.begin(); + while (p != items.end()) { + CDentry *dn = p->second; + ++p; + + if (dn->last == CEPH_NOSNAP) + continue; + + try_trim_snap_dentry(dn, snaps); + } +} + + +/** + * steal_dentry -- semi-violently move a dentry from one CDir to another + * (*) violently, in that nitems, most pins, etc. are not correctly maintained + * on the old CDir corpse; must call finish_old_fragment() when finished. + */ +void CDir::steal_dentry(CDentry *dn) +{ + dout(15) << __func__ << " " << *dn << dendl; + + items[dn->key()] = dn; + + dn->dir->items.erase(dn->key()); + if (dn->dir->items.empty()) + dn->dir->put(PIN_CHILD); + + if (get_num_any() == 0) + get(PIN_CHILD); + if (dn->get_linkage()->is_null()) { + if (dn->last == CEPH_NOSNAP) + num_head_null++; + else + num_snap_null++; + } else if (dn->last == CEPH_NOSNAP) { + num_head_items++; + + if (dn->get_linkage()->is_primary()) { + CInode *in = dn->get_linkage()->get_inode(); + auto pi = in->get_projected_inode(); + if (in->is_dir()) { + fnode.fragstat.nsubdirs++; + if (in->item_pop_lru.is_on_list()) + pop_lru_subdirs.push_back(&in->item_pop_lru); + } else { + fnode.fragstat.nfiles++; + } + fnode.rstat.rbytes += pi->accounted_rstat.rbytes; + fnode.rstat.rfiles += pi->accounted_rstat.rfiles; + fnode.rstat.rsubdirs += pi->accounted_rstat.rsubdirs; + fnode.rstat.rsnaps += pi->accounted_rstat.rsnaps; + if (pi->accounted_rstat.rctime > fnode.rstat.rctime) + fnode.rstat.rctime = pi->accounted_rstat.rctime; + + if (in->is_any_caps()) + adjust_num_inodes_with_caps(1); + + // move dirty inode rstat to new dirfrag + if (in->is_dirty_rstat()) + dirty_rstat_inodes.push_back(&in->dirty_rstat_item); + } else if (dn->get_linkage()->is_remote()) { + if (dn->get_linkage()->get_remote_d_type() == DT_DIR) + fnode.fragstat.nsubdirs++; + else + 
fnode.fragstat.nfiles++; + } + } else { + num_snap_items++; + if (dn->get_linkage()->is_primary()) { + CInode *in = dn->get_linkage()->get_inode(); + if (in->is_dirty_rstat()) + dirty_rstat_inodes.push_back(&in->dirty_rstat_item); + } + } + + { + int dap = dn->get_num_dir_auth_pins(); + if (dap) { + adjust_nested_auth_pins(dap, NULL); + dn->dir->adjust_nested_auth_pins(-dap, NULL); + } + } + + if (dn->is_dirty()) { + dirty_dentries.push_back(&dn->item_dir_dirty); + num_dirty++; + } + + dn->dir = this; +} + +void CDir::prepare_old_fragment(map<string_snap_t, MDSContext::vec >& dentry_waiters, bool replay) +{ + // auth_pin old fragment for duration so that any auth_pinning + // during the dentry migration doesn't trigger side effects + if (!replay && is_auth()) + auth_pin(this); + + if (!waiting_on_dentry.empty()) { + for (const auto &p : waiting_on_dentry) { + auto &e = dentry_waiters[p.first]; + for (const auto &waiter : p.second) { + e.push_back(waiter); + } + } + waiting_on_dentry.clear(); + put(PIN_DNWAITER); + } +} + +void CDir::prepare_new_fragment(bool replay) +{ + if (!replay && is_auth()) { + _freeze_dir(); + mark_complete(); + } + inode->add_dirfrag(this); +} + +void CDir::finish_old_fragment(MDSContext::vec& waiters, bool replay) +{ + // take waiters _before_ unfreeze... 
+ if (!replay) { + take_waiting(WAIT_ANY_MASK, waiters); + if (is_auth()) { + auth_unpin(this); // pinned in prepare_old_fragment + ceph_assert(is_frozen_dir()); + unfreeze_dir(); + } + } + + ceph_assert(dir_auth_pins == 0); + ceph_assert(auth_pins == 0); + + num_head_items = num_head_null = 0; + num_snap_items = num_snap_null = 0; + adjust_num_inodes_with_caps(-num_inodes_with_caps); + + // this mirrors init_fragment_pins() + if (is_auth()) + clear_replica_map(); + if (is_dirty()) + mark_clean(); + if (state_test(STATE_IMPORTBOUND)) + put(PIN_IMPORTBOUND); + if (state_test(STATE_EXPORTBOUND)) + put(PIN_EXPORTBOUND); + if (is_subtree_root()) + put(PIN_SUBTREE); + + if (auth_pins > 0) + put(PIN_AUTHPIN); + + ceph_assert(get_num_ref() == (state_test(STATE_STICKY) ? 1:0)); +} + +void CDir::init_fragment_pins() +{ + if (is_replicated()) + get(PIN_REPLICATED); + if (state_test(STATE_DIRTY)) + get(PIN_DIRTY); + if (state_test(STATE_EXPORTBOUND)) + get(PIN_EXPORTBOUND); + if (state_test(STATE_IMPORTBOUND)) + get(PIN_IMPORTBOUND); + if (is_subtree_root()) + get(PIN_SUBTREE); +} + +void CDir::split(int bits, list<CDir*>& subs, MDSContext::vec& waiters, bool replay) +{ + dout(10) << "split by " << bits << " bits on " << *this << dendl; + + ceph_assert(replay || is_complete() || !is_auth()); + + frag_vec_t frags; + frag.split(bits, frags); + + vector<CDir*> subfrags(1 << bits); + + double fac = 1.0 / (double)(1 << bits); // for scaling load vecs + + version_t rstat_version = inode->get_projected_inode()->rstat.version; + version_t dirstat_version = inode->get_projected_inode()->dirstat.version; + + nest_info_t rstatdiff; + frag_info_t fragstatdiff; + if (fnode.accounted_rstat.version == rstat_version) + rstatdiff.add_delta(fnode.accounted_rstat, fnode.rstat); + if (fnode.accounted_fragstat.version == dirstat_version) + fragstatdiff.add_delta(fnode.accounted_fragstat, fnode.fragstat); + dout(10) << " rstatdiff " << rstatdiff << " fragstatdiff " << fragstatdiff << dendl; + + 
map<string_snap_t, MDSContext::vec > dentry_waiters; + prepare_old_fragment(dentry_waiters, replay); + + // create subfrag dirs + int n = 0; + for (const auto& fg : frags) { + CDir *f = new CDir(inode, fg, cache, is_auth()); + f->state_set(state & (MASK_STATE_FRAGMENT_KEPT | STATE_COMPLETE)); + f->get_replicas() = get_replicas(); + f->set_version(get_version()); + f->pop_me = pop_me; + f->pop_me.scale(fac); + + // FIXME; this is an approximation + f->pop_nested = pop_nested; + f->pop_nested.scale(fac); + f->pop_auth_subtree = pop_auth_subtree; + f->pop_auth_subtree.scale(fac); + f->pop_auth_subtree_nested = pop_auth_subtree_nested; + f->pop_auth_subtree_nested.scale(fac); + + dout(10) << " subfrag " << fg << " " << *f << dendl; + subfrags[n++] = f; + subs.push_back(f); + + f->set_dir_auth(get_dir_auth()); + f->freeze_tree_state = freeze_tree_state; + f->prepare_new_fragment(replay); + f->init_fragment_pins(); + } + + // repartition dentries + while (!items.empty()) { + auto p = items.begin(); + + CDentry *dn = p->second; + frag_t subfrag = inode->pick_dirfrag(dn->get_name()); + int n = (subfrag.value() & (subfrag.mask() ^ frag.mask())) >> subfrag.mask_shift(); + dout(15) << " subfrag " << subfrag << " n=" << n << " for " << p->first << dendl; + CDir *f = subfrags[n]; + f->steal_dentry(dn); + } + + for (const auto &p : dentry_waiters) { + frag_t subfrag = inode->pick_dirfrag(p.first.name); + int n = (subfrag.value() & (subfrag.mask() ^ frag.mask())) >> subfrag.mask_shift(); + CDir *f = subfrags[n]; + + if (f->waiting_on_dentry.empty()) + f->get(PIN_DNWAITER); + auto &e = f->waiting_on_dentry[p.first]; + for (const auto &waiter : p.second) { + e.push_back(waiter); + } + } + + // FIXME: handle dirty old rstat + + // fix up new frag fragstats + for (int i=0; i<n; i++) { + CDir *f = subfrags[i]; + f->fnode.rstat.version = rstat_version; + f->fnode.accounted_rstat = f->fnode.rstat; + f->fnode.fragstat.version = dirstat_version; + f->fnode.accounted_fragstat = 
f->fnode.fragstat; + dout(10) << " rstat " << f->fnode.rstat << " fragstat " << f->fnode.fragstat + << " on " << *f << dendl; + } + + // give any outstanding frag stat differential to first frag + dout(10) << " giving rstatdiff " << rstatdiff << " fragstatdiff" << fragstatdiff + << " to " << *subfrags[0] << dendl; + subfrags[0]->fnode.accounted_rstat.add(rstatdiff); + subfrags[0]->fnode.accounted_fragstat.add(fragstatdiff); + + finish_old_fragment(waiters, replay); +} + +void CDir::merge(list<CDir*>& subs, MDSContext::vec& waiters, bool replay) +{ + dout(10) << "merge " << subs << dendl; + + set_dir_auth(subs.front()->get_dir_auth()); + freeze_tree_state = subs.front()->freeze_tree_state; + + for (auto dir : subs) { + ceph_assert(get_dir_auth() == dir->get_dir_auth()); + ceph_assert(freeze_tree_state == dir->freeze_tree_state); + } + + prepare_new_fragment(replay); + + nest_info_t rstatdiff; + frag_info_t fragstatdiff; + bool touched_mtime, touched_chattr; + version_t rstat_version = inode->get_projected_inode()->rstat.version; + version_t dirstat_version = inode->get_projected_inode()->dirstat.version; + + map<string_snap_t, MDSContext::vec > dentry_waiters; + + for (auto dir : subs) { + dout(10) << " subfrag " << dir->get_frag() << " " << *dir << dendl; + ceph_assert(!dir->is_auth() || dir->is_complete() || replay); + + if (dir->fnode.accounted_rstat.version == rstat_version) + rstatdiff.add_delta(dir->fnode.accounted_rstat, dir->fnode.rstat); + if (dir->fnode.accounted_fragstat.version == dirstat_version) + fragstatdiff.add_delta(dir->fnode.accounted_fragstat, dir->fnode.fragstat, + &touched_mtime, &touched_chattr); + + dir->prepare_old_fragment(dentry_waiters, replay); + + // steal dentries + while (!dir->items.empty()) + steal_dentry(dir->items.begin()->second); + + // merge replica map + for (const auto &p : dir->get_replicas()) { + unsigned cur = get_replicas()[p.first]; + if (p.second > cur) + get_replicas()[p.first] = p.second; + } + + // merge version + 
if (dir->get_version() > get_version()) + set_version(dir->get_version()); + + // merge state + state_set(dir->get_state() & MASK_STATE_FRAGMENT_KEPT); + + dir->finish_old_fragment(waiters, replay); + inode->close_dirfrag(dir->get_frag()); + } + + if (!dentry_waiters.empty()) { + get(PIN_DNWAITER); + for (const auto &p : dentry_waiters) { + auto &e = waiting_on_dentry[p.first]; + for (const auto &waiter : p.second) { + e.push_back(waiter); + } + } + } + + if (is_auth() && !replay) + mark_complete(); + + // FIXME: merge dirty old rstat + fnode.rstat.version = rstat_version; + fnode.accounted_rstat = fnode.rstat; + fnode.accounted_rstat.add(rstatdiff); + + fnode.fragstat.version = dirstat_version; + fnode.accounted_fragstat = fnode.fragstat; + fnode.accounted_fragstat.add(fragstatdiff); + + init_fragment_pins(); +} + + + + +void CDir::resync_accounted_fragstat() +{ + fnode_t *pf = get_projected_fnode(); + auto pi = inode->get_projected_inode(); + + if (pf->accounted_fragstat.version != pi->dirstat.version) { + pf->fragstat.version = pi->dirstat.version; + dout(10) << __func__ << " " << pf->accounted_fragstat << " -> " << pf->fragstat << dendl; + pf->accounted_fragstat = pf->fragstat; + } +} + +/* + * resync rstat and accounted_rstat with inode + */ +void CDir::resync_accounted_rstat() +{ + fnode_t *pf = get_projected_fnode(); + auto pi = inode->get_projected_inode(); + + if (pf->accounted_rstat.version != pi->rstat.version) { + pf->rstat.version = pi->rstat.version; + dout(10) << __func__ << " " << pf->accounted_rstat << " -> " << pf->rstat << dendl; + pf->accounted_rstat = pf->rstat; + dirty_old_rstat.clear(); + } +} + +void CDir::assimilate_dirty_rstat_inodes() +{ + dout(10) << __func__ << dendl; + for (elist<CInode*>::iterator p = dirty_rstat_inodes.begin_use_current(); + !p.end(); ++p) { + CInode *in = *p; + ceph_assert(in->is_auth()); + if (in->is_frozen()) + continue; + + auto &pi = in->project_inode(); + pi.inode.version = in->pre_dirty(); + + 
inode->mdcache->project_rstat_inode_to_frag(in, this, 0, 0, NULL); + } + state_set(STATE_ASSIMRSTAT); + dout(10) << __func__ << " done" << dendl; +} + +void CDir::assimilate_dirty_rstat_inodes_finish(MutationRef& mut, EMetaBlob *blob) +{ + if (!state_test(STATE_ASSIMRSTAT)) + return; + state_clear(STATE_ASSIMRSTAT); + dout(10) << __func__ << dendl; + elist<CInode*>::iterator p = dirty_rstat_inodes.begin_use_current(); + while (!p.end()) { + CInode *in = *p; + ++p; + + if (in->is_frozen()) + continue; + + CDentry *dn = in->get_projected_parent_dn(); + + mut->auth_pin(in); + mut->add_projected_inode(in); + + in->clear_dirty_rstat(); + blob->add_primary_dentry(dn, in, true); + } + + if (!dirty_rstat_inodes.empty()) + inode->mdcache->mds->locker->mark_updated_scatterlock(&inode->nestlock); +} + + + + +/**************************************** + * WAITING + */ + +void CDir::add_dentry_waiter(std::string_view dname, snapid_t snapid, MDSContext *c) +{ + if (waiting_on_dentry.empty()) + get(PIN_DNWAITER); + waiting_on_dentry[string_snap_t(dname, snapid)].push_back(c); + dout(10) << __func__ << " dentry " << dname + << " snap " << snapid + << " " << c << " on " << *this << dendl; +} + +void CDir::take_dentry_waiting(std::string_view dname, snapid_t first, snapid_t last, + MDSContext::vec& ls) +{ + if (waiting_on_dentry.empty()) + return; + + string_snap_t lb(dname, first); + string_snap_t ub(dname, last); + auto it = waiting_on_dentry.lower_bound(lb); + while (it != waiting_on_dentry.end() && + !(ub < it->first)) { + dout(10) << __func__ << " " << dname + << " [" << first << "," << last << "] found waiter on snap " + << it->first.snapid + << " on " << *this << dendl; + for (const auto &waiter : it->second) { + ls.push_back(waiter); + } + waiting_on_dentry.erase(it++); + } + + if (waiting_on_dentry.empty()) + put(PIN_DNWAITER); +} + +void CDir::take_sub_waiting(MDSContext::vec& ls) +{ + dout(10) << __func__ << dendl; + if (!waiting_on_dentry.empty()) { + for (const auto &p : 
waiting_on_dentry) { + for (const auto &waiter : p.second) { + ls.push_back(waiter); + } + } + waiting_on_dentry.clear(); + put(PIN_DNWAITER); + } +} + + + +void CDir::add_waiter(uint64_t tag, MDSContext *c) +{ + // hierarchical? + + // at subtree root? + if (tag & WAIT_ATSUBTREEROOT) { + if (!is_subtree_root()) { + // try parent + dout(10) << "add_waiter " << std::hex << tag << std::dec << " " << c << " should be ATSUBTREEROOT, " << *this << " is not root, trying parent" << dendl; + inode->parent->dir->add_waiter(tag, c); + return; + } + } + + ceph_assert(!(tag & WAIT_CREATED) || state_test(STATE_CREATING)); + + MDSCacheObject::add_waiter(tag, c); +} + + + +/* NOTE: this checks dentry waiters too */ +void CDir::take_waiting(uint64_t mask, MDSContext::vec& ls) +{ + if ((mask & WAIT_DENTRY) && !waiting_on_dentry.empty()) { + // take all dentry waiters + for (const auto &p : waiting_on_dentry) { + dout(10) << "take_waiting dentry " << p.first.name + << " snap " << p.first.snapid << " on " << *this << dendl; + for (const auto &waiter : p.second) { + ls.push_back(waiter); + } + } + waiting_on_dentry.clear(); + put(PIN_DNWAITER); + } + + // waiting + MDSCacheObject::take_waiting(mask, ls); +} + + +void CDir::finish_waiting(uint64_t mask, int result) +{ + dout(11) << __func__ << " mask " << hex << mask << dec << " result " << result << " on " << *this << dendl; + + MDSContext::vec finished; + take_waiting(mask, finished); + if (result < 0) + finish_contexts(g_ceph_context, finished, result); + else + cache->mds->queue_waiters(finished); +} + + + +// dirty/clean + +fnode_t *CDir::project_fnode() +{ + ceph_assert(get_version() != 0); + auto &p = projected_fnode.emplace_back(*get_projected_fnode()); + + if (scrub_infop && scrub_infop->last_scrub_dirty) { + p.localized_scrub_stamp = scrub_infop->last_local.time; + p.localized_scrub_version = scrub_infop->last_local.version; + p.recursive_scrub_stamp = scrub_infop->last_recursive.time; + p.recursive_scrub_version = 
scrub_infop->last_recursive.version; + scrub_infop->last_scrub_dirty = false; + scrub_maybe_delete_info(); + } + + dout(10) << __func__ << " " << &p << dendl; + return &p; +} + +void CDir::pop_and_dirty_projected_fnode(LogSegment *ls) +{ + ceph_assert(!projected_fnode.empty()); + auto &front = projected_fnode.front(); + dout(15) << __func__ << " " << &front << " v" << front.version << dendl; + fnode = front; + _mark_dirty(ls); + projected_fnode.pop_front(); +} + + +version_t CDir::pre_dirty(version_t min) +{ + if (min > projected_version) + projected_version = min; + ++projected_version; + dout(10) << __func__ << " " << projected_version << dendl; + return projected_version; +} + +void CDir::mark_dirty(version_t pv, LogSegment *ls) +{ + ceph_assert(get_version() < pv); + ceph_assert(pv <= projected_version); + fnode.version = pv; + _mark_dirty(ls); +} + +void CDir::_mark_dirty(LogSegment *ls) +{ + if (!state_test(STATE_DIRTY)) { + dout(10) << __func__ << " (was clean) " << *this << " version " << get_version() << dendl; + _set_dirty_flag(); + ceph_assert(ls); + } else { + dout(10) << __func__ << " (already dirty) " << *this << " version " << get_version() << dendl; + } + if (ls) { + ls->dirty_dirfrags.push_back(&item_dirty); + + // if i've never committed, i need to be before _any_ mention of me is trimmed from the journal. 
+ if (committed_version == 0 && !item_new.is_on_list()) + ls->new_dirfrags.push_back(&item_new); + } +} + +void CDir::mark_new(LogSegment *ls) +{ + ls->new_dirfrags.push_back(&item_new); + state_clear(STATE_CREATING); + + MDSContext::vec waiters; + take_waiting(CDir::WAIT_CREATED, waiters); + cache->mds->queue_waiters(waiters); +} + +void CDir::mark_clean() +{ + dout(10) << __func__ << " " << *this << " version " << get_version() << dendl; + if (state_test(STATE_DIRTY)) { + item_dirty.remove_myself(); + item_new.remove_myself(); + + state_clear(STATE_DIRTY); + put(PIN_DIRTY); + } +} + +// caller should hold auth pin of this +void CDir::log_mark_dirty() +{ + if (is_dirty() || projected_version > get_version()) + return; // noop if it is already dirty or will be dirty + + version_t pv = pre_dirty(); + mark_dirty(pv, cache->mds->mdlog->get_current_segment()); +} + +void CDir::mark_complete() { + state_set(STATE_COMPLETE); + bloom.reset(); +} + +void CDir::first_get() +{ + inode->get(CInode::PIN_DIRFRAG); +} + +void CDir::last_put() +{ + inode->put(CInode::PIN_DIRFRAG); +} + + + +/****************************************************************************** + * FETCH and COMMIT + */ + +// ----------------------- +// FETCH +void CDir::fetch(MDSContext *c, bool ignore_authpinnability) +{ + string want; + return fetch(c, want, ignore_authpinnability); +} + +void CDir::fetch(MDSContext *c, std::string_view want_dn, bool ignore_authpinnability) +{ + dout(10) << "fetch on " << *this << dendl; + + ceph_assert(is_auth()); + ceph_assert(!is_complete()); + + if (!can_auth_pin() && !ignore_authpinnability) { + if (c) { + dout(7) << "fetch waiting for authpinnable" << dendl; + add_waiter(WAIT_UNFREEZE, c); + } else + dout(7) << "fetch not authpinnable and no context" << dendl; + return; + } + + // unlinked directory inode shouldn't have any entry + if (!inode->is_base() && get_parent_dir()->inode->is_stray() && + !inode->snaprealm) { + dout(7) << "fetch dirfrag for unlinked 
directory, mark complete" << dendl; + if (get_version() == 0) { + ceph_assert(inode->is_auth()); + set_version(1); + + if (state_test(STATE_REJOINUNDEF)) { + ceph_assert(cache->mds->is_rejoin()); + state_clear(STATE_REJOINUNDEF); + cache->opened_undef_dirfrag(this); + } + } + mark_complete(); + + if (c) + cache->mds->queue_waiter(c); + return; + } + + if (c) add_waiter(WAIT_COMPLETE, c); + if (!want_dn.empty()) wanted_items.insert(mempool::mds_co::string(want_dn)); + + // already fetching? + if (state_test(CDir::STATE_FETCHING)) { + dout(7) << "already fetching; waiting" << dendl; + return; + } + + auth_pin(this); + state_set(CDir::STATE_FETCHING); + + if (cache->mds->logger) cache->mds->logger->inc(l_mds_dir_fetch); + + std::set<dentry_key_t> empty; + _omap_fetch(NULL, empty); +} + +void CDir::fetch(MDSContext *c, const std::set<dentry_key_t>& keys) +{ + dout(10) << "fetch " << keys.size() << " keys on " << *this << dendl; + + ceph_assert(is_auth()); + ceph_assert(!is_complete()); + + if (!can_auth_pin()) { + dout(7) << "fetch keys waiting for authpinnable" << dendl; + add_waiter(WAIT_UNFREEZE, c); + return; + } + if (state_test(CDir::STATE_FETCHING)) { + dout(7) << "fetch keys waiting for full fetch" << dendl; + add_waiter(WAIT_COMPLETE, c); + return; + } + + auth_pin(this); + if (cache->mds->logger) cache->mds->logger->inc(l_mds_dir_fetch); + + _omap_fetch(c, keys); +} + +class C_IO_Dir_OMAP_FetchedMore : public CDirIOContext { + MDSContext *fin; +public: + bufferlist hdrbl; + bool more = false; + map<string, bufferlist> omap; ///< carry-over from before + map<string, bufferlist> omap_more; ///< new batch + int ret; + C_IO_Dir_OMAP_FetchedMore(CDir *d, MDSContext *f) : + CDirIOContext(d), fin(f), ret(0) { } + void finish(int r) { + // merge results + if (omap.empty()) { + omap.swap(omap_more); + } else { + omap.insert(omap_more.begin(), omap_more.end()); + } + if (more) { + dir->_omap_fetch_more(hdrbl, omap, fin); + } else { + dir->_omap_fetched(hdrbl, omap, 
!fin, r); + if (fin) + fin->complete(r); + } + } + void print(ostream& out) const override { + out << "dirfrag_fetch_more(" << dir->dirfrag() << ")"; + } +}; + +class C_IO_Dir_OMAP_Fetched : public CDirIOContext { + MDSContext *fin; +public: + bufferlist hdrbl; + bool more = false; + map<string, bufferlist> omap; + bufferlist btbl; + int ret1, ret2, ret3; + + C_IO_Dir_OMAP_Fetched(CDir *d, MDSContext *f) : + CDirIOContext(d), fin(f), ret1(0), ret2(0), ret3(0) { } + void finish(int r) override { + // check the correctness of backtrace + if (r >= 0 && ret3 != -ECANCELED) + dir->inode->verify_diri_backtrace(btbl, ret3); + if (r >= 0) r = ret1; + if (r >= 0) r = ret2; + if (more) { + dir->_omap_fetch_more(hdrbl, omap, fin); + } else { + dir->_omap_fetched(hdrbl, omap, !fin, r); + if (fin) + fin->complete(r); + } + } + void print(ostream& out) const override { + out << "dirfrag_fetch(" << dir->dirfrag() << ")"; + } +}; + +void CDir::_omap_fetch(MDSContext *c, const std::set<dentry_key_t>& keys) +{ + C_IO_Dir_OMAP_Fetched *fin = new C_IO_Dir_OMAP_Fetched(this, c); + object_t oid = get_ondisk_object(); + object_locator_t oloc(cache->mds->mdsmap->get_metadata_pool()); + ObjectOperation rd; + rd.omap_get_header(&fin->hdrbl, &fin->ret1); + if (keys.empty()) { + ceph_assert(!c); + rd.omap_get_vals("", "", g_conf()->mds_dir_keys_per_op, + &fin->omap, &fin->more, &fin->ret2); + } else { + ceph_assert(c); + std::set<std::string> str_keys; + for (auto p : keys) { + string str; + p.encode(str); + str_keys.insert(str); + } + rd.omap_get_vals_by_keys(str_keys, &fin->omap, &fin->ret2); + } + // check the correctness of backtrace + if (g_conf()->mds_verify_backtrace > 0 && frag == frag_t()) { + rd.getxattr("parent", &fin->btbl, &fin->ret3); + rd.set_last_op_flags(CEPH_OSD_OP_FLAG_FAILOK); + } else { + fin->ret3 = -ECANCELED; + } + + cache->mds->objecter->read(oid, oloc, rd, CEPH_NOSNAP, NULL, 0, + new C_OnFinisher(fin, cache->mds->finisher)); +} + +void CDir::_omap_fetch_more( + 
bufferlist& hdrbl, + map<string, bufferlist>& omap, + MDSContext *c) +{ + // we have more omap keys to fetch! + object_t oid = get_ondisk_object(); + object_locator_t oloc(cache->mds->mdsmap->get_metadata_pool()); + C_IO_Dir_OMAP_FetchedMore *fin = new C_IO_Dir_OMAP_FetchedMore(this, c); + fin->hdrbl.claim(hdrbl); + fin->omap.swap(omap); + ObjectOperation rd; + rd.omap_get_vals(fin->omap.rbegin()->first, + "", /* filter prefix */ + g_conf()->mds_dir_keys_per_op, + &fin->omap_more, + &fin->more, + &fin->ret); + cache->mds->objecter->read(oid, oloc, rd, CEPH_NOSNAP, NULL, 0, + new C_OnFinisher(fin, cache->mds->finisher)); +} + +CDentry *CDir::_load_dentry( + std::string_view key, + std::string_view dname, + const snapid_t last, + bufferlist &bl, + const int pos, + const std::set<snapid_t> *snaps, + bool *force_dirty) +{ + auto q = bl.cbegin(); + + snapid_t first; + decode(first, q); + + // marker + char type; + decode(type, q); + + dout(20) << "_fetched pos " << pos << " marker '" << type << "' dname '" << dname + << " [" << first << "," << last << "]" + << dendl; + + bool stale = false; + if (snaps && last != CEPH_NOSNAP) { + set<snapid_t>::const_iterator p = snaps->lower_bound(first); + if (p == snaps->end() || *p > last) { + dout(10) << " skipping stale dentry on [" << first << "," << last << "]" << dendl; + stale = true; + } + } + + /* + * look for existing dentry for _last_ snap, because unlink + + * create may leave a "hole" (epochs during which the dentry + * doesn't exist) but for which no explicit negative dentry is in + * the cache. 
+ */ + CDentry *dn; + if (stale) + dn = lookup_exact_snap(dname, last); + else + dn = lookup(dname, last); + + if (type == 'L') { + // hard link + inodeno_t ino; + unsigned char d_type; + decode(ino, q); + decode(d_type, q); + + if (stale) { + if (!dn) { + stale_items.insert(mempool::mds_co::string(key)); + *force_dirty = true; + } + return dn; + } + + if (dn) { + CDentry::linkage_t *dnl = dn->get_linkage(); + dout(12) << "_fetched had " << (dnl->is_null() ? "NEG" : "") << " dentry " << *dn << dendl; + if (committed_version == 0 && + dnl->is_remote() && + dn->is_dirty() && + ino == dnl->get_remote_ino() && + d_type == dnl->get_remote_d_type()) { + // see comment below + dout(10) << "_fetched had underwater dentry " << *dn << ", marking clean" << dendl; + dn->mark_clean(); + } + } else { + // (remote) link + dn = add_remote_dentry(dname, ino, d_type, first, last); + + // link to inode? + CInode *in = cache->get_inode(ino); // we may or may not have it. + if (in) { + dn->link_remote(dn->get_linkage(), in); + dout(12) << "_fetched got remote link " << ino << " which we have " << *in << dendl; + } else { + dout(12) << "_fetched got remote link " << ino << " (don't have it)" << dendl; + } + } + } + else if (type == 'I') { + // inode + + // Load inode data before looking up or constructing CInode + InodeStore inode_data; + inode_data.decode_bare(q); + + if (stale) { + if (!dn) { + stale_items.insert(mempool::mds_co::string(key)); + *force_dirty = true; + } + return dn; + } + + bool undef_inode = false; + if (dn) { + CDentry::linkage_t *dnl = dn->get_linkage(); + dout(12) << "_fetched had " << (dnl->is_null() ? "NEG" : "") << " dentry " << *dn << dendl; + + if (dnl->is_primary()) { + CInode *in = dnl->get_inode(); + if (in->state_test(CInode::STATE_REJOINUNDEF)) { + undef_inode = true; + } else if (committed_version == 0 && + dn->is_dirty() && + inode_data.inode.ino == in->ino() && + inode_data.inode.version == in->get_version()) { + /* clean underwater item? 
+ * Underwater item is something that is dirty in our cache from + * journal replay, but was previously flushed to disk before the + * mds failed. + * + * We only do this is committed_version == 0. that implies either + * - this is a fetch after from a clean/empty CDir is created + * (and has no effect, since the dn won't exist); or + * - this is a fetch after _recovery_, which is what we're worried + * about. Items that are marked dirty from the journal should be + * marked clean if they appear on disk. + */ + dout(10) << "_fetched had underwater dentry " << *dn << ", marking clean" << dendl; + dn->mark_clean(); + dout(10) << "_fetched had underwater inode " << *dnl->get_inode() << ", marking clean" << dendl; + in->mark_clean(); + } + } + } + + if (!dn || undef_inode) { + // add inode + CInode *in = cache->get_inode(inode_data.inode.ino, last); + if (!in || undef_inode) { + if (undef_inode && in) + in->first = first; + else + in = new CInode(cache, true, first, last); + + in->inode = inode_data.inode; + // symlink? 
+ if (in->is_symlink()) + in->symlink = inode_data.symlink; + + in->dirfragtree.swap(inode_data.dirfragtree); + in->xattrs.swap(inode_data.xattrs); + in->old_inodes.swap(inode_data.old_inodes); + if (!in->old_inodes.empty()) { + snapid_t min_first = in->old_inodes.rbegin()->first + 1; + if (min_first > in->first) + in->first = min_first; + } + + in->oldest_snap = inode_data.oldest_snap; + in->decode_snap_blob(inode_data.snap_blob); + if (snaps && !in->snaprealm) + in->purge_stale_snap_data(*snaps); + + if (!undef_inode) { + cache->add_inode(in); // add + dn = add_primary_dentry(dname, in, first, last); // link + } + dout(12) << "_fetched got " << *dn << " " << *in << dendl; + + if (in->inode.is_dirty_rstat()) + in->mark_dirty_rstat(); + + //in->hack_accessed = false; + //in->hack_load_stamp = ceph_clock_now(); + //num_new_inodes_loaded++; + } else if (g_conf().get_val<bool>("mds_hack_allow_loading_invalid_metadata")) { + dout(20) << "hack: adding duplicate dentry for " << *in << dendl; + dn = add_primary_dentry(dname, in, first, last); + } else { + dout(0) << "_fetched badness: got (but i already had) " << *in + << " mode " << in->inode.mode + << " mtime " << in->inode.mtime << dendl; + string dirpath, inopath; + this->inode->make_path_string(dirpath); + in->make_path_string(inopath); + cache->mds->clog->error() << "loaded dup inode " << inode_data.inode.ino + << " [" << first << "," << last << "] v" << inode_data.inode.version + << " at " << dirpath << "/" << dname + << ", but inode " << in->vino() << " v" << in->inode.version + << " already exists at " << inopath; + return dn; + } + } + } else { + std::ostringstream oss; + oss << "Invalid tag char '" << type << "' pos " << pos; + throw buffer::malformed_input(oss.str()); + } + + return dn; +} + +void CDir::_omap_fetched(bufferlist& hdrbl, map<string, bufferlist>& omap, + bool complete, int r) +{ + LogChannelRef clog = cache->mds->clog; + dout(10) << "_fetched header " << hdrbl.length() << " bytes " + << 
omap.size() << " keys for " << *this << dendl;

  // Only these results are expected from the OSD read; anything else is a bug.
  ceph_assert(r == 0 || r == -ENOENT || r == -ENODATA);
  ceph_assert(is_auth());
  ceph_assert(!is_frozen());

  if (hdrbl.length() == 0) {
    // The dirfrag object (or its omap header) is missing on disk entirely.
    // Report to the cluster log and route through the damage path.
    dout(0) << "_fetched missing object for " << *this << dendl;

    clog->error() << "dir " << dirfrag() << " object missing on disk; some "
                     "files may be lost (" << get_path() << ")";

    go_bad(complete);
    return;
  }

  // Decode the fnode from the omap header; a decode failure or trailing
  // bytes both indicate on-disk corruption and send us to go_bad().
  fnode_t got_fnode;
  {
    auto p = hdrbl.cbegin();
    try {
      decode(got_fnode, p);
    } catch (const buffer::error &err) {
      derr << "Corrupt fnode in dirfrag " << dirfrag()
           << ": " << err << dendl;
      clog->warn() << "Corrupt fnode header in " << dirfrag() << ": "
                   << err << " (" << get_path() << ")";
      go_bad(complete);
      return;
    }
    if (!p.end()) {
      // Extra bytes after a clean decode also count as corruption.
      clog->warn() << "header buffer of dir " << dirfrag() << " has "
                   << hdrbl.length() - p.get_off() << " extra bytes ("
                   << get_path() << ")";
      go_bad(complete);
      return;
    }
  }

  dout(10) << "_fetched version " << got_fnode.version << dendl;

  // take the loaded fnode?
  // only if we are a fresh CDir* with no prior state.
  if (get_version() == 0) {
    ceph_assert(!is_projected());
    ceph_assert(!state_test(STATE_COMMITTING));
    fnode = got_fnode;
    // A freshly loaded dirfrag is clean: projected/committing/committed all
    // start at the on-disk version.
    projected_version = committing_version = committed_version = got_fnode.version;

    if (state_test(STATE_REJOINUNDEF)) {
      // This dirfrag was a placeholder created during rejoin; it is now real.
      ceph_assert(cache->mds->is_rejoin());
      state_clear(STATE_REJOINUNDEF);
      cache->opened_undef_dirfrag(this);
    }
  }

  list<CInode*> undef_inodes;

  // purge stale snaps?
  // only if we have past_parents open!
+ bool force_dirty = false; + const set<snapid_t> *snaps = NULL; + SnapRealm *realm = inode->find_snaprealm(); + if (!realm->have_past_parents_open()) { + dout(10) << " no snap purge, one or more past parents NOT open" << dendl; + } else if (fnode.snap_purged_thru < realm->get_last_destroyed()) { + snaps = &realm->get_snaps(); + dout(10) << " snap_purged_thru " << fnode.snap_purged_thru + << " < " << realm->get_last_destroyed() + << ", snap purge based on " << *snaps << dendl; + if (get_num_snap_items() == 0) { + fnode.snap_purged_thru = realm->get_last_destroyed(); + force_dirty = true; + } + } + + unsigned pos = omap.size() - 1; + for (map<string, bufferlist>::reverse_iterator p = omap.rbegin(); + p != omap.rend(); + ++p, --pos) { + string dname; + snapid_t last; + dentry_key_t::decode_helper(p->first, dname, last); + + CDentry *dn = NULL; + try { + dn = _load_dentry( + p->first, dname, last, p->second, pos, snaps, + &force_dirty); + } catch (const buffer::error &err) { + cache->mds->clog->warn() << "Corrupt dentry '" << dname << "' in " + "dir frag " << dirfrag() << ": " + << err << "(" << get_path() << ")"; + + // Remember that this dentry is damaged. Subsequent operations + // that try to act directly on it will get their EIOs, but this + // dirfrag as a whole will continue to look okay (minus the + // mysteriously-missing dentry) + go_bad_dentry(last, dname); + + // Anyone who was WAIT_DENTRY for this guy will get kicked + // to RetryRequest, and hit the DamageTable-interrogating path. + // Stats will now be bogus because we will think we're complete, + // but have 1 or more missing dentries. 
      continue;
    }

    if (!dn)
      continue;

    // Inodes that were undefined placeholders from rejoin are collected and
    // resolved below, after the whole omap has been processed.
    CDentry::linkage_t *dnl = dn->get_linkage();
    if (dnl->is_primary() && dnl->get_inode()->state_test(CInode::STATE_REJOINUNDEF))
      undef_inodes.push_back(dnl->get_inode());

    if (wanted_items.count(mempool::mds_co::string(dname)) > 0 || !complete) {
      dout(10) << " touching wanted dn " << *dn << dendl;
      inode->mdcache->touch_dentry(dn);
    }
  }

  //cache->mds->logger->inc("newin", num_new_inodes_loaded);

  // mark complete, !fetching
  if (complete) {
    wanted_items.clear();
    mark_complete();
    state_clear(STATE_FETCHING);

    // A local scrub was deferred until the fetch finished; run it now.
    if (scrub_infop && scrub_infop->need_scrub_local) {
      scrub_infop->need_scrub_local = false;
      scrub_local();
    }
  }

  // open & force frags
  while (!undef_inodes.empty()) {
    CInode *in = undef_inodes.front();
    undef_inodes.pop_front();
    in->state_clear(CInode::STATE_REJOINUNDEF);
    cache->opened_undef_inode(in);
  }

  // dirty myself to remove stale snap dentries
  if (force_dirty && !inode->mdcache->is_readonly())
    log_mark_dirty();

  // Matches the auth_pin taken when the fetch was submitted.
  auth_unpin(this);

  if (complete) {
    // kick waiters
    finish_waiting(WAIT_COMPLETE, 0);
  }
}

/**
 * Record a single damaged dentry in the DamageTable so that subsequent
 * lookups of it return errors, while the rest of the dirfrag stays usable.
 * If the damage table decides the damage is fatal, the MDS respawns.
 *
 * @param last   snapid of the damaged dentry key
 * @param dname  name of the damaged dentry
 */
void CDir::go_bad_dentry(snapid_t last, std::string_view dname)
{
  dout(10) << __func__ << " " << dname << dendl;
  std::string path(get_path());
  path += "/";
  path += dname;
  const bool fatal = cache->mds->damage_table.notify_dentry(
      inode->ino(), frag, last, dname, path);
  if (fatal) {
    cache->mds->damaged();
    ceph_abort();  // unreachable, damaged() respawns us
  }
}

/**
 * Mark this whole dirfrag as damaged (missing/corrupt on disk).  Registers
 * the dirfrag in the DamageTable; if that is fatal the MDS respawns.
 *
 * @param complete  whether this was a full (not partial) fetch; if so the
 *                  frag is marked complete-but-BADFRAG so waiters can fail.
 */
void CDir::go_bad(bool complete)
{
  dout(10) << __func__ << " " << frag << dendl;
  const bool fatal = cache->mds->damage_table.notify_dirfrag(
      inode->ino(), frag, get_path());
  if (fatal) {
    cache->mds->damaged();
    ceph_abort();  // unreachable, damaged() respawns us
  }

  if (complete) {
    // Give the frag a nonzero version so it doesn't look freshly created.
    if (get_version() == 0)
      set_version(1);

    state_set(STATE_BADFRAG);
    mark_complete();
  }

  state_clear(STATE_FETCHING);
  auth_unpin(this);
finish_waiting(WAIT_COMPLETE, -EIO); +} + +// ----------------------- +// COMMIT + +/** + * commit + * + * @param want - min version i want committed + * @param c - callback for completion + */ +void CDir::commit(version_t want, MDSContext *c, bool ignore_authpinnability, int op_prio) +{ + dout(10) << "commit want " << want << " on " << *this << dendl; + if (want == 0) want = get_version(); + + // preconditions + ceph_assert(want <= get_version() || get_version() == 0); // can't commit the future + ceph_assert(want > committed_version); // the caller is stupid + ceph_assert(is_auth()); + ceph_assert(ignore_authpinnability || can_auth_pin()); + + // note: queue up a noop if necessary, so that we always + // get an auth_pin. + if (!c) + c = new C_MDSInternalNoop; + + // auth_pin on first waiter + if (waiting_for_commit.empty()) + auth_pin(this); + waiting_for_commit[want].push_back(c); + + // ok. + _commit(want, op_prio); +} + +class C_IO_Dir_Committed : public CDirIOContext { + version_t version; +public: + C_IO_Dir_Committed(CDir *d, version_t v) : CDirIOContext(d), version(v) { } + void finish(int r) override { + dir->_committed(r, version); + } + void print(ostream& out) const override { + out << "dirfrag_commit(" << dir->dirfrag() << ")"; + } +}; + +/** + * Flush out the modified dentries in this dir. Keep the bufferlist + * below max_write_size; + */ +void CDir::_omap_commit(int op_prio) +{ + dout(10) << __func__ << dendl; + + unsigned max_write_size = cache->max_dir_commit_size; + unsigned write_size = 0; + + if (op_prio < 0) + op_prio = CEPH_MSG_PRIO_DEFAULT; + + // snap purge? 
+ const set<snapid_t> *snaps = NULL; + SnapRealm *realm = inode->find_snaprealm(); + if (!realm->have_past_parents_open()) { + dout(10) << " no snap purge, one or more past parents NOT open" << dendl; + } else if (fnode.snap_purged_thru < realm->get_last_destroyed()) { + snaps = &realm->get_snaps(); + dout(10) << " snap_purged_thru " << fnode.snap_purged_thru + << " < " << realm->get_last_destroyed() + << ", snap purge based on " << *snaps << dendl; + // fnode.snap_purged_thru = realm->get_last_destroyed(); + } + + set<string> to_remove; + map<string, bufferlist> to_set; + + C_GatherBuilder gather(g_ceph_context, + new C_OnFinisher(new C_IO_Dir_Committed(this, + get_version()), + cache->mds->finisher)); + + SnapContext snapc; + object_t oid = get_ondisk_object(); + object_locator_t oloc(cache->mds->mdsmap->get_metadata_pool()); + + if (!stale_items.empty()) { + for (const auto &p : stale_items) { + to_remove.insert(std::string(p)); + write_size += p.length(); + } + stale_items.clear(); + } + + auto write_one = [&](CDentry *dn) { + string key; + dn->key().encode(key); + + if (dn->last != CEPH_NOSNAP && + snaps && try_trim_snap_dentry(dn, *snaps)) { + dout(10) << " rm " << key << dendl; + write_size += key.length(); + to_remove.insert(key); + return; + } + + if (dn->get_linkage()->is_null()) { + dout(10) << " rm " << dn->get_name() << " " << *dn << dendl; + write_size += key.length(); + to_remove.insert(key); + } else { + dout(10) << " set " << dn->get_name() << " " << *dn << dendl; + bufferlist dnbl; + _encode_dentry(dn, dnbl, snaps); + write_size += key.length() + dnbl.length(); + to_set[key].swap(dnbl); + } + + if (write_size >= max_write_size) { + ObjectOperation op; + op.priority = op_prio; + + // don't create new dirfrag blindly + if (!is_new()) + op.stat(NULL, (ceph::real_time*) NULL, NULL); + + if (!to_set.empty()) + op.omap_set(to_set); + if (!to_remove.empty()) + op.omap_rm_keys(to_remove); + + cache->mds->objecter->mutate(oid, oloc, op, snapc, + 
ceph::real_clock::now(), + 0, gather.new_sub()); + + write_size = 0; + to_set.clear(); + to_remove.clear(); + } + }; + + if (state_test(CDir::STATE_FRAGMENTING) && is_new()) { + assert(committed_version == 0); + for (auto p = items.begin(); p != items.end(); ) { + CDentry *dn = p->second; + ++p; + if (dn->get_linkage()->is_null()) + continue; + write_one(dn); + } + } else { + for (auto p = dirty_dentries.begin(); !p.end(); ) { + CDentry *dn = *p; + ++p; + write_one(dn); + } + } + + ObjectOperation op; + op.priority = op_prio; + + // don't create new dirfrag blindly + if (!is_new()) + op.stat(NULL, (ceph::real_time*)NULL, NULL); + + /* + * save the header at the last moment.. If we were to send it off before other + * updates, but die before sending them all, we'd think that the on-disk state + * was fully committed even though it wasn't! However, since the messages are + * strictly ordered between the MDS and the OSD, and since messages to a given + * PG are strictly ordered, if we simply send the message containing the header + * off last, we cannot get our header into an incorrect state. + */ + bufferlist header; + encode(fnode, header); + op.omap_set_header(header); + + if (!to_set.empty()) + op.omap_set(to_set); + if (!to_remove.empty()) + op.omap_rm_keys(to_remove); + + cache->mds->objecter->mutate(oid, oloc, op, snapc, + ceph::real_clock::now(), + 0, gather.new_sub()); + + gather.activate(); +} + +void CDir::_encode_dentry(CDentry *dn, bufferlist& bl, + const set<snapid_t> *snaps) +{ + // clear dentry NEW flag, if any. we can no longer silently drop it. + dn->clear_new(); + + encode(dn->first, bl); + + // primary or remote? 
  if (dn->linkage.is_remote()) {
    // Remote link: store only the target ino and its d_type.
    inodeno_t ino = dn->linkage.get_remote_ino();
    unsigned char d_type = dn->linkage.get_remote_d_type();
    dout(14) << " pos " << bl.length() << " dn '" << dn->get_name() << "' remote ino " << ino << dendl;

    // marker, name, ino
    bl.append('L');  // remote link
    encode(ino, bl);
    encode(d_type, bl);
  } else if (dn->linkage.is_primary()) {
    // primary link
    CInode *in = dn->linkage.get_inode();
    ceph_assert(in);

    dout(14) << " pos " << bl.length() << " dn '" << dn->get_name() << "' inode " << *in << dendl;

    // marker, name, inode, [symlink string]
    bl.append('I');  // inode

    if (in->is_multiversion()) {
      // Drop snap data that is no longer referenced by any live snapshot
      // before persisting the inode.
      if (!in->snaprealm) {
        if (snaps)
          in->purge_stale_snap_data(*snaps);
      } else if (in->snaprealm->have_past_parents_open()) {
        in->purge_stale_snap_data(in->snaprealm->get_snaps());
      }
    }

    bufferlist snap_blob;
    in->encode_snap_blob(snap_blob);
    in->encode_bare(bl, cache->mds->mdsmap->get_up_features(), &snap_blob);
  } else {
    // Null dentries are removed, not encoded; reaching here is a bug.
    ceph_assert(!dn->linkage.is_null());
  }
}

/**
 * _commit
 *
 * Internal worker for commit(): marks the dirfrag COMMITTING and kicks off
 * the omap write for the current version, unless a commit satisfying
 * 'want' is already done or in flight.
 *
 * @param want     minimum version that must end up committed
 * @param op_prio  objecter op priority (-1 for default)
 */
void CDir::_commit(version_t want, int op_prio)
{
  dout(10) << "_commit want " << want << " on " << *this << dendl;

  // we can't commit things in the future.
  // (even the projected future.)
  ceph_assert(want <= get_version() || get_version() == 0);

  // check pre+postconditions.
  ceph_assert(is_auth());

  // already committed?
  if (committed_version >= want) {
    dout(10) << "already committed " << committed_version << " >= " << want << dendl;
    return;
  }
  // already committing >= want?
  if (committing_version >= want) {
    dout(10) << "already committing " << committing_version << " >= " << want << dendl;
    ceph_assert(state_test(STATE_COMMITTING));
    return;
  }

  // already committing an older version?  wait for it; _committed() will
  // restart the commit for the newer version.
  if (committing_version > committed_version) {
    dout(10) << "already committing older " << committing_version << ", waiting for that to finish" << dendl;
    return;
  }

  // commit.
+ committing_version = get_version(); + + // mark committing (if not already) + if (!state_test(STATE_COMMITTING)) { + dout(10) << "marking committing" << dendl; + state_set(STATE_COMMITTING); + } + + if (cache->mds->logger) cache->mds->logger->inc(l_mds_dir_commit); + + _omap_commit(op_prio); +} + + +/** + * _committed + * + * @param v version i just committed + */ +void CDir::_committed(int r, version_t v) +{ + if (r < 0) { + // the directory could be partly purged during MDS failover + if (r == -ENOENT && committed_version == 0 && + !inode->is_base() && get_parent_dir()->inode->is_stray()) { + r = 0; + if (inode->snaprealm) + inode->state_set(CInode::STATE_MISSINGOBJS); + } + if (r < 0) { + dout(1) << "commit error " << r << " v " << v << dendl; + cache->mds->clog->error() << "failed to commit dir " << dirfrag() << " object," + << " errno " << r; + cache->mds->handle_write_error(r); + return; + } + } + + dout(10) << "_committed v " << v << " on " << *this << dendl; + ceph_assert(is_auth()); + + bool stray = inode->is_stray(); + + // take note. + ceph_assert(v > committed_version); + ceph_assert(v <= committing_version); + committed_version = v; + + // _all_ commits done? + if (committing_version == committed_version) + state_clear(CDir::STATE_COMMITTING); + + // _any_ commit, even if we've been redirtied, means we're no longer new. + item_new.remove_myself(); + + // dir clean? + if (committed_version == get_version()) + mark_clean(); + + // dentries clean? + for (auto p = dirty_dentries.begin(); !p.end(); ) { + CDentry *dn = *p; + ++p; + + // inode? 
+ if (dn->linkage.is_primary()) { + CInode *in = dn->linkage.get_inode(); + ceph_assert(in); + ceph_assert(in->is_auth()); + + if (committed_version >= in->get_version()) { + if (in->is_dirty()) { + dout(15) << " dir " << committed_version << " >= inode " << in->get_version() << " now clean " << *in << dendl; + in->mark_clean(); + } + } else { + dout(15) << " dir " << committed_version << " < inode " << in->get_version() << " still dirty " << *in << dendl; + ceph_assert(in->is_dirty() || in->last < CEPH_NOSNAP); // special case for cow snap items (not predirtied) + } + } + + // dentry + if (committed_version >= dn->get_version()) { + dout(15) << " dir " << committed_version << " >= dn " << dn->get_version() << " now clean " << *dn << dendl; + dn->mark_clean(); + + // drop clean null stray dentries immediately + if (stray && + dn->get_num_ref() == 0 && + !dn->is_projected() && + dn->get_linkage()->is_null()) + remove_dentry(dn); + } else { + dout(15) << " dir " << committed_version << " < dn " << dn->get_version() << " still dirty " << *dn << dendl; + ceph_assert(dn->is_dirty()); + } + } + + // finishers? + bool were_waiters = !waiting_for_commit.empty(); + + auto it = waiting_for_commit.begin(); + while (it != waiting_for_commit.end()) { + auto _it = it; + ++_it; + if (it->first > committed_version) { + dout(10) << " there are waiters for " << it->first << ", committing again" << dendl; + _commit(it->first, -1); + break; + } + MDSContext::vec t; + for (const auto &waiter : it->second) + t.push_back(waiter); + cache->mds->queue_waiters(t); + waiting_for_commit.erase(it); + it = _it; + } + + // try drop dentries in this dirfrag if it's about to be purged + if (!inode->is_base() && get_parent_dir()->inode->is_stray() && + inode->snaprealm) + cache->maybe_eval_stray(inode, true); + + // unpin if we kicked the last waiter. 
+ if (were_waiters && + waiting_for_commit.empty()) + auth_unpin(this); +} + + + + +// IMPORT/EXPORT + +void CDir::encode_export(bufferlist& bl) +{ + ceph_assert(!is_projected()); + encode(first, bl); + encode(fnode, bl); + encode(dirty_old_rstat, bl); + encode(committed_version, bl); + + encode(state, bl); + encode(dir_rep, bl); + + encode(pop_me, bl); + encode(pop_auth_subtree, bl); + + encode(dir_rep_by, bl); + encode(get_replicas(), bl); + + get(PIN_TEMPEXPORTING); +} + +void CDir::finish_export() +{ + state &= MASK_STATE_EXPORT_KEPT; + pop_nested.sub(pop_auth_subtree); + pop_auth_subtree_nested.sub(pop_auth_subtree); + pop_me.zero(); + pop_auth_subtree.zero(); + put(PIN_TEMPEXPORTING); + dirty_old_rstat.clear(); +} + +void CDir::decode_import(bufferlist::const_iterator& blp, LogSegment *ls) +{ + decode(first, blp); + decode(fnode, blp); + decode(dirty_old_rstat, blp); + projected_version = fnode.version; + decode(committed_version, blp); + committing_version = committed_version; + + unsigned s; + decode(s, blp); + state &= MASK_STATE_IMPORT_KEPT; + state_set(STATE_AUTH | (s & MASK_STATE_EXPORTED)); + + if (is_dirty()) { + get(PIN_DIRTY); + _mark_dirty(ls); + } + + decode(dir_rep, blp); + + decode(pop_me, blp); + decode(pop_auth_subtree, blp); + pop_nested.add(pop_auth_subtree); + pop_auth_subtree_nested.add(pop_auth_subtree); + + decode(dir_rep_by, blp); + decode(get_replicas(), blp); + if (is_replicated()) get(PIN_REPLICATED); + + replica_nonce = 0; // no longer defined + + // did we import some dirty scatterlock data? 
  if (dirty_old_rstat.size() ||
      !(fnode.rstat == fnode.accounted_rstat)) {
    // Imported unaccounted rstat deltas: the parent's nestlock scatter data
    // must be flushed in this log segment.
    cache->mds->locker->mark_updated_scatterlock(&inode->nestlock);
    ls->dirty_dirfrag_nest.push_back(&inode->item_dirty_dirfrag_nest);
  }
  if (!(fnode.fragstat == fnode.accounted_fragstat)) {
    // Same for unaccounted fragstat deltas via the filelock.
    cache->mds->locker->mark_updated_scatterlock(&inode->filelock);
    ls->dirty_dirfrag_dir.push_back(&inode->item_dirty_dirfrag_dir);
  }
  if (is_dirty_dft()) {
    if (inode->dirfragtreelock.get_state() != LOCK_MIX &&
        inode->dirfragtreelock.is_stable()) {
      // clear stale dirtydft
      state_clear(STATE_DIRTYDFT);
    } else {
      cache->mds->locker->mark_updated_scatterlock(&inode->dirfragtreelock);
      ls->dirty_dirfrag_dirfragtree.push_back(&inode->item_dirty_dirfrag_dirfragtree);
    }
  }
}

/**
 * Undo a partially applied decode_import(): drop authority, discard
 * replica/bloom state, revert the popularity vectors that decode_import()
 * added, and mark the frag clean again.
 */
void CDir::abort_import()
{
  ceph_assert(is_auth());
  state_clear(CDir::STATE_AUTH);
  remove_bloom();
  clear_replica_map();
  set_replica_nonce(CDir::EXPORT_NONCE);
  if (is_dirty())
    mark_clean();

  pop_nested.sub(pop_auth_subtree);
  pop_auth_subtree_nested.sub(pop_auth_subtree);
  pop_me.zero();
  pop_auth_subtree.zero();
}

/**
 * Encode a DirStat for a client reply.  Clients with the REPLY_ENCODING
 * feature get a versioned (ENCODE_START/FINISH) envelope; legacy clients
 * get the bare fields in the same order.
 */
void CDir::encode_dirstat(bufferlist& bl, const session_info_t& info, const DirStat& ds) {
  if (info.has_feature(CEPHFS_FEATURE_REPLY_ENCODING)) {
    ENCODE_START(1, 1, bl);
    encode(ds.frag, bl);
    encode(ds.auth, bl);
    encode(ds.dist, bl);
    ENCODE_FINISH(bl);
  }
  else {
    encode(ds.frag, bl);
    encode(ds.auth, bl);
    encode(ds.dist, bl);
  }
}

/********************************
 * AUTHORITY
 */

/*
 * if dir_auth.first == parent, auth is same as inode.
 * unless .second != unknown, in which case that sticks.
 */
mds_authority_t CDir::authority() const
{
  if (is_subtree_root())
    return dir_auth;
  else
    return inode->authority();
}

/** is_subtree_root()
 * true if this is an auth delegation point.
 * that is, dir_auth != default (parent,unknown)
 *
 * some key observations:
 *  if i am auth:
 *    - any region bound will be an export, or frozen.
+ * + * note that this DOES heed dir_auth.pending + */ +/* +bool CDir::is_subtree_root() +{ + if (dir_auth == CDIR_AUTH_DEFAULT) { + //dout(10) << "is_subtree_root false " << dir_auth << " != " << CDIR_AUTH_DEFAULT + //<< " on " << ino() << dendl; + return false; + } else { + //dout(10) << "is_subtree_root true " << dir_auth << " != " << CDIR_AUTH_DEFAULT + //<< " on " << ino() << dendl; + return true; + } +} +*/ + +/** contains(x) + * true if we are x, or an ancestor of x + */ +bool CDir::contains(CDir *x) +{ + while (1) { + if (x == this) + return true; + x = x->get_inode()->get_projected_parent_dir(); + if (x == 0) + return false; + } +} + + + +/** set_dir_auth + */ +void CDir::set_dir_auth(const mds_authority_t &a) +{ + dout(10) << "setting dir_auth=" << a + << " from " << dir_auth + << " on " << *this << dendl; + + bool was_subtree = is_subtree_root(); + bool was_ambiguous = dir_auth.second >= 0; + + // set it. + dir_auth = a; + + // new subtree root? + if (!was_subtree && is_subtree_root()) { + dout(10) << " new subtree root, adjusting auth_pins" << dendl; + + if (freeze_tree_state) { + // only by CDir::_freeze_tree() + ceph_assert(is_freezing_tree_root()); + } + + inode->num_subtree_roots++; + + // unpin parent of frozen dir/tree? + if (inode->is_auth()) { + ceph_assert(!is_frozen_tree_root()); + if (is_frozen_dir()) + inode->auth_unpin(this); + } + } + if (was_subtree && !is_subtree_root()) { + dout(10) << " old subtree root, adjusting auth_pins" << dendl; + + inode->num_subtree_roots--; + + // pin parent of frozen dir/tree? + if (inode->is_auth()) { + ceph_assert(!is_frozen_tree_root()); + if (is_frozen_dir()) + inode->auth_pin(this); + } + } + + // newly single auth? 
  if (was_ambiguous && dir_auth.second == CDIR_AUTH_UNKNOWN) {
    // Authority just collapsed from ambiguous to single: wake anyone who
    // was waiting for an unambiguous owner.
    MDSContext::vec ls;
    take_waiting(WAIT_SINGLEAUTH, ls);
    cache->mds->queue_waiters(ls);
  }
}

/*****************************************
 * AUTH PINS and FREEZING
 *
 * the basic plan is that auth_pins only exist in auth regions, and they
 * prevent a freeze (and subsequent auth change).
 *
 * however, we also need to prevent a parent from freezing if a child is frozen.
 * for that reason, the parent inode of a frozen directory is auth_pinned.
 *
 * the oddity is when the frozen directory is a subtree root.  if that's the case,
 * the parent inode isn't frozen.  which means that when subtree authority is adjusted
 * at the bounds, inodes for any frozen bound directories need to get auth_pins at that
 * time.
 *
 */

/**
 * Take an auth pin on this dirfrag, preventing it from freezing.
 * The first pin also takes a PIN_AUTHPIN cache-object reference.
 * If a tree freeze is in progress, the shared freeze counter is bumped
 * so the freeze cannot finish while this pin is held.
 *
 * @param by  opaque tag identifying the pinner (debug only)
 */
void CDir::auth_pin(void *by)
{
  if (auth_pins == 0)
    get(PIN_AUTHPIN);
  auth_pins++;

#ifdef MDS_AUTHPIN_SET
  auth_pin_set.insert(by);
#endif

  dout(10) << "auth_pin by " << by << " on " << *this << " count now " << auth_pins << dendl;

  if (freeze_tree_state)
    freeze_tree_state->auth_pins += 1;
}

/**
 * Release an auth pin taken with auth_pin().  Dropping the last pin releases
 * the PIN_AUTHPIN reference, decrements any in-progress tree-freeze counter,
 * and may allow a pending freeze to complete (maybe_finish_freeze()).
 *
 * @param by  the same tag that was passed to the matching auth_pin()
 */
void CDir::auth_unpin(void *by)
{
  auth_pins--;

#ifdef MDS_AUTHPIN_SET
  {
    auto it = auth_pin_set.find(by);
    ceph_assert(it != auth_pin_set.end());
    auth_pin_set.erase(it);
  }
#endif
  if (auth_pins == 0)
    put(PIN_AUTHPIN);

  dout(10) << "auth_unpin by " << by << " on " << *this << " count now " << auth_pins << dendl;
  ceph_assert(auth_pins >= 0);

  if (freeze_tree_state)
    freeze_tree_state->auth_pins -= 1;

  maybe_finish_freeze();  // pending freeze?
+} + +void CDir::adjust_nested_auth_pins(int dirinc, void *by) +{ + ceph_assert(dirinc); + dir_auth_pins += dirinc; + + dout(15) << __func__ << " " << dirinc << " on " << *this + << " by " << by << " count now " + << auth_pins << "/" << dir_auth_pins << dendl; + ceph_assert(dir_auth_pins >= 0); + + if (freeze_tree_state) + freeze_tree_state->auth_pins += dirinc; + + if (dirinc < 0) + maybe_finish_freeze(); // pending freeze? +} + +#ifdef MDS_VERIFY_FRAGSTAT +void CDir::verify_fragstat() +{ + ceph_assert(is_complete()); + if (inode->is_stray()) + return; + + frag_info_t c; + memset(&c, 0, sizeof(c)); + + for (auto it = items.begin(); + it != items.end(); + ++it) { + CDentry *dn = it->second; + if (dn->is_null()) + continue; + + dout(10) << " " << *dn << dendl; + if (dn->is_primary()) + dout(10) << " " << *dn->inode << dendl; + + if (dn->is_primary()) { + if (dn->inode->is_dir()) + c.nsubdirs++; + else + c.nfiles++; + } + if (dn->is_remote()) { + if (dn->get_remote_d_type() == DT_DIR) + c.nsubdirs++; + else + c.nfiles++; + } + } + + if (c.nsubdirs != fnode.fragstat.nsubdirs || + c.nfiles != fnode.fragstat.nfiles) { + dout(0) << "verify_fragstat failed " << fnode.fragstat << " on " << *this << dendl; + dout(0) << " i count " << c << dendl; + ceph_abort(); + } else { + dout(0) << "verify_fragstat ok " << fnode.fragstat << " on " << *this << dendl; + } +} +#endif + +/***************************************************************************** + * FREEZING + */ + +// FREEZE TREE + +void CDir::_walk_tree(std::function<bool(CDir*)> callback) +{ + + deque<CDir*> dfq; + dfq.push_back(this); + + vector<CDir*> dfv; + while (!dfq.empty()) { + CDir *dir = dfq.front(); + dfq.pop_front(); + + for (auto& p : *dir) { + CDentry *dn = p.second; + if (!dn->get_linkage()->is_primary()) + continue; + CInode *in = dn->get_linkage()->get_inode(); + if (!in->is_dir()) + continue; + + in->get_nested_dirfrags(dfv); + for (auto& dir : dfv) { + auto ret = callback(dir); + if (ret) + 
          dfq.push_back(dir);
      }
      dfv.clear();
    }
  }
}

/**
 * Begin freezing this subtree.  Returns true if the tree froze immediately
 * (no outstanding auth pins), false if we must wait (STATE_FREEZINGTREE is
 * set and the freeze completes later via maybe_finish_freeze()).
 */
bool CDir::freeze_tree()
{
  ceph_assert(!is_frozen());
  ceph_assert(!is_freezing());
  ceph_assert(!freeze_tree_state);

  auth_pin(this);

  // Traverse the subtree to mark dirfrags as 'freezing' (set freeze_tree_state)
  // and to accumulate auth pins, recording the total count in freeze_tree_state.
  // When a 'freezing' object is auth-unpinned, the counter in freeze_tree_state
  // also gets decreased.  The subtree becomes 'frozen' when the counter reaches
  // zero.
  freeze_tree_state = std::make_shared<freeze_tree_state_t>(this);
  freeze_tree_state->auth_pins += get_auth_pins() + get_dir_auth_pins();

  _walk_tree([this](CDir *dir) {
      if (dir->freeze_tree_state)
        return false;
      dir->freeze_tree_state = freeze_tree_state;
      freeze_tree_state->auth_pins += dir->get_auth_pins() + dir->get_dir_auth_pins();
      return true;
    }
  );

  if (is_freezeable(true)) {
    _freeze_tree();
    auth_unpin(this);
    return true;
  } else {
    state_set(STATE_FREEZINGTREE);
    ++num_freezing_trees;
    dout(10) << "freeze_tree waiting " << *this << dendl;
    return false;
  }
}

/**
 * Complete a tree freeze: flip freeze_tree_state->frozen, and (on the auth
 * side) turn this dirfrag into an unambiguous subtree root, re-homing any
 * nested dirfrags that belong to other freeze domains.
 */
void CDir::_freeze_tree()
{
  dout(10) << __func__ << " " << *this << dendl;
  ceph_assert(is_freezeable(true));

  if (freeze_tree_state) {
    ceph_assert(is_auth());
  } else {
    // Replica side (import): no freeze was initiated locally, so create the
    // shared state here.
    ceph_assert(!is_auth());
    freeze_tree_state = std::make_shared<freeze_tree_state_t>(this);
  }
  freeze_tree_state->frozen = true;

  if (is_auth()) {
    mds_authority_t auth;
    bool was_subtree = is_subtree_root();
    if (was_subtree) {
      auth = get_dir_auth();
    } else {
      // temporarily prevent parent subtree from becoming frozen.
+ inode->auth_pin(this); + // create new subtree + auth = authority(); + } + + _walk_tree([this, &auth] (CDir *dir) { + if (dir->freeze_tree_state != freeze_tree_state) { + inode->mdcache->adjust_subtree_auth(dir, auth); + return false; + } + return true; + } + ); + + ceph_assert(auth.first >= 0); + ceph_assert(auth.second == CDIR_AUTH_UNKNOWN); + auth.second = auth.first; + inode->mdcache->adjust_subtree_auth(this, auth); + if (!was_subtree) + inode->auth_unpin(this); + } else { + // importing subtree ? + _walk_tree([this] (CDir *dir) { + ceph_assert(!dir->freeze_tree_state); + dir->freeze_tree_state = freeze_tree_state; + return true; + } + ); + } + + // twiddle state + if (state_test(STATE_FREEZINGTREE)) { + state_clear(STATE_FREEZINGTREE); + --num_freezing_trees; + } + + state_set(STATE_FROZENTREE); + ++num_frozen_trees; + get(PIN_FROZEN); +} + +void CDir::unfreeze_tree() +{ + dout(10) << __func__ << " " << *this << dendl; + + MDSContext::vec unfreeze_waiters; + take_waiting(WAIT_UNFREEZE, unfreeze_waiters); + + if (freeze_tree_state) { + _walk_tree([this, &unfreeze_waiters](CDir *dir) { + if (dir->freeze_tree_state != freeze_tree_state) + return false; + dir->freeze_tree_state.reset(); + dir->take_waiting(WAIT_UNFREEZE, unfreeze_waiters); + return true; + } + ); + } + + if (state_test(STATE_FROZENTREE)) { + // frozen. unfreeze. + state_clear(STATE_FROZENTREE); + --num_frozen_trees; + + put(PIN_FROZEN); + + if (is_auth()) { + // must be subtree + ceph_assert(is_subtree_root()); + // for debug purpose, caller should ensure 'dir_auth.second == dir_auth.first' + mds_authority_t auth = get_dir_auth(); + ceph_assert(auth.first >= 0); + ceph_assert(auth.second == auth.first); + auth.second = CDIR_AUTH_UNKNOWN; + inode->mdcache->adjust_subtree_auth(this, auth); + } + freeze_tree_state.reset(); + } else { + ceph_assert(state_test(STATE_FREEZINGTREE)); + + // freezing. stop it. 
+ state_clear(STATE_FREEZINGTREE); + --num_freezing_trees; + freeze_tree_state.reset(); + + finish_waiting(WAIT_FROZEN, -1); + auth_unpin(this); + } + + cache->mds->queue_waiters(unfreeze_waiters); +} + +void CDir::adjust_freeze_after_rename(CDir *dir) +{ + if (!freeze_tree_state || dir->freeze_tree_state != freeze_tree_state) + return; + CDir *newdir = dir->get_inode()->get_parent_dir(); + if (newdir == this || newdir->freeze_tree_state == freeze_tree_state) + return; + + ceph_assert(!freeze_tree_state->frozen); + ceph_assert(get_dir_auth_pins() > 0); + + MDSContext::vec unfreeze_waiters; + + auto unfreeze = [this, &unfreeze_waiters](CDir *dir) { + if (dir->freeze_tree_state != freeze_tree_state) + return false; + int dec = dir->get_auth_pins() + dir->get_dir_auth_pins(); + // shouldn't become zero because srcdn of rename was auth pinned + ceph_assert(freeze_tree_state->auth_pins > dec); + freeze_tree_state->auth_pins -= dec; + dir->freeze_tree_state.reset(); + dir->take_waiting(WAIT_UNFREEZE, unfreeze_waiters); + return true; + }; + + unfreeze(dir); + dir->_walk_tree(unfreeze); + + cache->mds->queue_waiters(unfreeze_waiters); +} + +bool CDir::can_auth_pin(int *err_ret) const +{ + int err; + if (!is_auth()) { + err = ERR_NOT_AUTH; + } else if (is_freezing_dir() || is_frozen_dir()) { + err = ERR_FRAGMENTING_DIR; + } else { + auto p = is_freezing_or_frozen_tree(); + if (p.first || p.second) { + err = ERR_EXPORTING_TREE; + } else { + err = 0; + } + } + if (err && err_ret) + *err_ret = err; + return !err; +} + +class C_Dir_AuthUnpin : public CDirContext { + public: + explicit C_Dir_AuthUnpin(CDir *d) : CDirContext(d) {} + void finish(int r) override { + dir->auth_unpin(dir->get_inode()); + } +}; + +void CDir::maybe_finish_freeze() +{ + if (dir_auth_pins != 0) + return; + + // we can freeze the _dir_ even with nested pins... 
+ if (state_test(STATE_FREEZINGDIR)) { + if (auth_pins == 1) { + _freeze_dir(); + auth_unpin(this); + finish_waiting(WAIT_FROZEN); + } + } + + if (freeze_tree_state) { + if (freeze_tree_state->frozen || + freeze_tree_state->auth_pins != 1) + return; + + if (freeze_tree_state->dir != this) { + freeze_tree_state->dir->maybe_finish_freeze(); + return; + } + + ceph_assert(state_test(STATE_FREEZINGTREE)); + + if (!is_subtree_root() && inode->is_frozen()) { + dout(10) << __func__ << " !subtree root and frozen inode, waiting for unfreeze on " << inode << dendl; + // retake an auth_pin... + auth_pin(inode); + // and release it when the parent inode unfreezes + inode->add_waiter(WAIT_UNFREEZE, new C_Dir_AuthUnpin(this)); + return; + } + + _freeze_tree(); + auth_unpin(this); + finish_waiting(WAIT_FROZEN); + } +} + + + +// FREEZE DIR + +bool CDir::freeze_dir() +{ + ceph_assert(!is_frozen()); + ceph_assert(!is_freezing()); + + auth_pin(this); + if (is_freezeable_dir(true)) { + _freeze_dir(); + auth_unpin(this); + return true; + } else { + state_set(STATE_FREEZINGDIR); + dout(10) << "freeze_dir + wait " << *this << dendl; + return false; + } +} + +void CDir::_freeze_dir() +{ + dout(10) << __func__ << " " << *this << dendl; + //assert(is_freezeable_dir(true)); + // not always true during split because the original fragment may have frozen a while + // ago and we're just now getting around to breaking it up. + + state_clear(STATE_FREEZINGDIR); + state_set(STATE_FROZENDIR); + get(PIN_FROZEN); + + if (is_auth() && !is_subtree_root()) + inode->auth_pin(this); // auth_pin for duration of freeze +} + + +void CDir::unfreeze_dir() +{ + dout(10) << __func__ << " " << *this << dendl; + + if (state_test(STATE_FROZENDIR)) { + state_clear(STATE_FROZENDIR); + put(PIN_FROZEN); + + // unpin (may => FREEZEABLE) FIXME: is this order good? 
+    if (is_auth() && !is_subtree_root())
+      inode->auth_unpin(this);
+
+    finish_waiting(WAIT_UNFREEZE);
+  } else {
+    // freeze never completed; notify FROZEN waiters of failure.
+    finish_waiting(WAIT_FROZEN, -1);
+
+    // still freezing. stop.
+    ceph_assert(state_test(STATE_FREEZINGDIR));
+    state_clear(STATE_FREEZINGDIR);
+    auth_unpin(this);
+
+    finish_waiting(WAIT_UNFREEZE);
+  }
+}
+
+/**
+ * Slightly less complete than operator<<, because this is intended
+ * for identifying a directory and its state rather than for dumping
+ * debug output.
+ */
+void CDir::dump(Formatter *f, int flags) const
+{
+  ceph_assert(f != NULL);
+  if (flags & DUMP_PATH) {
+    f->dump_stream("path") << get_path();
+  }
+  if (flags & DUMP_DIRFRAG) {
+    f->dump_stream("dirfrag") << dirfrag();
+  }
+  if (flags & DUMP_SNAPID_FIRST) {
+    f->dump_int("snapid_first", first);
+  }
+  if (flags & DUMP_VERSIONS) {
+    f->dump_stream("projected_version") << get_projected_version();
+    f->dump_stream("version") << get_version();
+    f->dump_stream("committing_version") << get_committing_version();
+    f->dump_stream("committed_version") << get_committed_version();
+  }
+  if (flags & DUMP_REP) {
+    f->dump_bool("is_rep", is_rep());
+  }
+  if (flags & DUMP_DIR_AUTH) {
+    if (get_dir_auth() != CDIR_AUTH_DEFAULT) {
+      if (get_dir_auth().second == CDIR_AUTH_UNKNOWN) {
+        f->dump_stream("dir_auth") << get_dir_auth().first;
+      } else {
+        f->dump_stream("dir_auth") << get_dir_auth();
+      }
+    } else {
+      f->dump_string("dir_auth", "");
+    }
+  }
+  if (flags & DUMP_STATES) {
+    f->open_array_section("states");
+    MDSCacheObject::dump_states(f);
+    if (state_test(CDir::STATE_COMPLETE)) f->dump_string("state", "complete");
+    if (state_test(CDir::STATE_FREEZINGTREE)) f->dump_string("state", "freezingtree");
+    if (state_test(CDir::STATE_FROZENTREE)) f->dump_string("state", "frozentree");
+    if (state_test(CDir::STATE_FROZENDIR)) f->dump_string("state", "frozendir");
+    if (state_test(CDir::STATE_FREEZINGDIR)) f->dump_string("state", "freezingdir");
+    if (state_test(CDir::STATE_EXPORTBOUND)) f->dump_string("state", "exportbound");
+    if (state_test(CDir::STATE_IMPORTBOUND)) f->dump_string("state", "importbound");
+    if (state_test(CDir::STATE_BADFRAG)) f->dump_string("state", "badfrag");
+    f->close_section();
+  }
+  if (flags & DUMP_MDS_CACHE_OBJECT) {
+    MDSCacheObject::dump(f);
+  }
+  if (flags & DUMP_ITEMS) {
+    f->open_array_section("dentries");
+    for (auto &p : items) {
+      CDentry *dn = p.second;
+      f->open_object_section("dentry");
+      dn->dump(f);
+      f->close_section();
+    }
+    f->close_section();
+  }
+}
+
+// Dump this dirfrag's popularity/load vectors.
+void CDir::dump_load(Formatter *f)
+{
+  f->dump_stream("path") << get_path();
+  f->dump_stream("dirfrag") << dirfrag();
+
+  f->open_object_section("pop_me");
+  pop_me.dump(f);
+  f->close_section();
+
+  f->open_object_section("pop_nested");
+  pop_nested.dump(f);
+  f->close_section();
+
+  f->open_object_section("pop_auth_subtree");
+  pop_auth_subtree.dump(f);
+  f->close_section();
+
+  f->open_object_section("pop_auth_subtree_nested");
+  pop_auth_subtree_nested.dump(f);
+  f->close_section();
+}
+
+/****** Scrub Stuff *******/
+
+// Lazily allocate scrub_infop, seeding its stamps from the scrub
+// versions/timestamps recorded in the projected fnode.
+void CDir::scrub_info_create() const
+{
+  ceph_assert(!scrub_infop);
+
+  // break out of const-land to set up implicit initial state
+  CDir *me = const_cast<CDir*>(this);
+  fnode_t *fn = me->get_projected_fnode();
+
+  std::unique_ptr<scrub_info_t> si(new scrub_info_t());
+
+  si->last_recursive.version = si->recursive_start.version =
+      fn->recursive_scrub_version;
+  si->last_recursive.time = si->recursive_start.time =
+      fn->recursive_scrub_stamp;
+
+  si->last_local.version = fn->localized_scrub_version;
+  si->last_local.time = fn->localized_scrub_stamp;
+
+  me->scrub_infop.swap(si);
+}
+
+// Begin a recursive scrub of this (complete) dirfrag: record the start
+// stamp and rebuild the per-pass dentry work sets.
+void CDir::scrub_initialize(const ScrubHeaderRefConst& header)
+{
+  dout(20) << __func__ << dendl;
+  ceph_assert(is_complete());
+  ceph_assert(header != nullptr);
+
+  // FIXME: weird implicit construction, is someone else meant
+  // to be calling scrub_info_create first?
+  scrub_info();
+  ceph_assert(scrub_infop && !scrub_infop->directory_scrubbing);
+
+  // stamp where this recursive pass started
+  scrub_infop->recursive_start.version = get_projected_version();
+  scrub_infop->recursive_start.time = ceph_clock_now();
+
+  scrub_infop->directories_to_scrub.clear();
+  scrub_infop->directories_scrubbing.clear();
+  scrub_infop->directories_scrubbed.clear();
+  scrub_infop->others_to_scrub.clear();
+  scrub_infop->others_scrubbing.clear();
+  scrub_infop->others_scrubbed.clear();
+
+  // partition head (non-snap) primary dentries into dirs vs others
+  for (auto i = items.begin();
+      i != items.end();
+      ++i) {
+    // TODO: handle snapshot scrubbing
+    if (i->first.snapid != CEPH_NOSNAP)
+      continue;
+
+    CDentry::linkage_t *dnl = i->second->get_projected_linkage();
+    if (dnl->is_primary()) {
+      if (dnl->get_inode()->is_dir())
+	scrub_infop->directories_to_scrub.insert(i->first);
+      else
+	scrub_infop->others_to_scrub.insert(i->first);
+    } else if (dnl->is_remote()) {
+      // TODO: check remote linkage
+    }
+  }
+  scrub_infop->directory_scrubbing = true;
+  scrub_infop->header = header;
+}
+
+// Finalize a recursive scrub: all queued dentries must have completed;
+// roll recursive_start into last_recursive and mark the stamps dirty.
+void CDir::scrub_finished()
+{
+  dout(20) << __func__ << dendl;
+  ceph_assert(scrub_infop && scrub_infop->directory_scrubbing);
+
+  ceph_assert(scrub_infop->directories_to_scrub.empty());
+  ceph_assert(scrub_infop->directories_scrubbing.empty());
+  scrub_infop->directories_scrubbed.clear();
+  ceph_assert(scrub_infop->others_to_scrub.empty());
+  ceph_assert(scrub_infop->others_scrubbing.empty());
+  scrub_infop->others_scrubbed.clear();
+  scrub_infop->directory_scrubbing = false;
+
+  scrub_infop->last_recursive = scrub_infop->recursive_start;
+  scrub_infop->last_scrub_dirty = true;
+}
+
+// Pop the next key out of 'dns' and resolve it to a CDentry, skipping
+// entries unchanged since the last scrub (unless forced) and entries
+// that are no longer primary.  NOTE: returns POSITIVE errno values:
+// 0 (dn in *dnout), EAGAIN (dirfrag fetch in flight, cb queued),
+// ENOENT (set exhausted, *dnout = NULL).
+int CDir::_next_dentry_on_set(dentry_key_set &dns, bool missing_okay,
+                              MDSContext *cb, CDentry **dnout)
+{
+  dentry_key_t dnkey;
+  CDentry *dn;
+
+  while (!dns.empty()) {
+    set<dentry_key_t>::iterator front = dns.begin();
+    dnkey = *front;
+    dn = lookup(dnkey.name);
+    if (!dn) {
+      if (!is_complete() &&
+	  (!has_bloom() || is_in_bloom(dnkey.name))) {
+	// need to re-read this dirfrag
+	fetch(cb);
+	return EAGAIN;
+      }
+      // okay, we lost it
+      if (missing_okay) {
+	dout(15) << " we no longer have directory dentry "
+		 << dnkey.name << ", assuming it got renamed" << dendl;
+	dns.erase(dnkey);
+	continue;
+      } else {
+	dout(5) << " we lost dentry " << dnkey.name
+		<< ", bailing out because that's impossible!" << dendl;
+	ceph_abort();
+      }
+    }
+    // okay, we got a dentry
+    dns.erase(dnkey);
+
+    if (dn->get_projected_version() < scrub_infop->last_recursive.version &&
+	!(scrub_infop->header->get_force())) {
+      dout(15) << " skip dentry " << dnkey.name
+	       << ", no change since last scrub" << dendl;
+      continue;
+    }
+
+    if (!dn->get_linkage()->is_primary()) {
+      dout(15) << " skip dentry " << dnkey.name
+	       << ", no longer primary" << dendl;
+      continue;
+    }
+
+    *dnout = dn;
+    return 0;
+  }
+  *dnout = NULL;
+  return ENOENT;
+}
+
+// Hand out the next dentry to scrub: all directories first, then the
+// rest.  Return codes follow _next_dentry_on_set (0 / EAGAIN / ENOENT).
+int CDir::scrub_dentry_next(MDSContext *cb, CDentry **dnout)
+{
+  dout(20) << __func__ << dendl;
+  ceph_assert(scrub_infop && scrub_infop->directory_scrubbing);
+
+  dout(20) << "trying to scrub directories underneath us" << dendl;
+  int rval = _next_dentry_on_set(scrub_infop->directories_to_scrub, true,
+                                 cb, dnout);
+  if (rval == 0) {
+    dout(20) << __func__ << " inserted to directories scrubbing: "
+      << *dnout << dendl;
+    scrub_infop->directories_scrubbing.insert((*dnout)->key());
+  } else if (rval == EAGAIN) {
+    // we don't need to do anything else
+  } else { // we emptied out the directory scrub set
+    ceph_assert(rval == ENOENT);
+    dout(20) << "no directories left, moving on to other kinds of dentries"
+             << dendl;
+
+    rval = _next_dentry_on_set(scrub_infop->others_to_scrub, false, cb, dnout);
+    if (rval == 0) {
+      dout(20) << __func__ << " inserted to others scrubbing: "
+        << *dnout << dendl;
+      scrub_infop->others_scrubbing.insert((*dnout)->key());
+    }
+  }
+  dout(20) << " returning " << rval << " with dn=" << *dnout << dendl;
+  return rval;
+}
+
+// Collect the CDentry*s currently mid-scrub (from both scrubbing sets).
+void CDir::scrub_dentries_scrubbing(list<CDentry*> *out_dentries)
+{
+  dout(20) << __func__ << dendl;
+  ceph_assert(scrub_infop && scrub_infop->directory_scrubbing);
+
+  for (set<dentry_key_t>::iterator i =
+	scrub_infop->directories_scrubbing.begin();
+      i != scrub_infop->directories_scrubbing.end();
+      ++i) {
+    CDentry *d = lookup(i->name, i->snapid);
+    ceph_assert(d);
+    out_dentries->push_back(d);
+  }
+  for (set<dentry_key_t>::iterator i = scrub_infop->others_scrubbing.begin();
+      i != scrub_infop->others_scrubbing.end();
+      ++i) {
+    CDentry *d = lookup(i->name, i->snapid);
+    ceph_assert(d);
+    out_dentries->push_back(d);
+  }
+}
+
+// Move dn from its "scrubbing" set into the matching "scrubbed" set.
+void CDir::scrub_dentry_finished(CDentry *dn)
+{
+  dout(20) << __func__ << " on dn " << *dn << dendl;
+  ceph_assert(scrub_infop && scrub_infop->directory_scrubbing);
+  dentry_key_t dn_key = dn->key();
+  if (scrub_infop->directories_scrubbing.erase(dn_key)) {
+    scrub_infop->directories_scrubbed.insert(dn_key);
+  } else {
+    ceph_assert(scrub_infop->others_scrubbing.count(dn_key));
+    scrub_infop->others_scrubbing.erase(dn_key);
+    scrub_infop->others_scrubbed.insert(dn_key);
+  }
+}
+
+// Release scrub_infop once it carries no state worth keeping.
+void CDir::scrub_maybe_delete_info()
+{
+  if (scrub_infop &&
+      !scrub_infop->directory_scrubbing &&
+      !scrub_infop->need_scrub_local &&
+      !scrub_infop->last_scrub_dirty &&
+      !scrub_infop->pending_scrub_error &&
+      scrub_infop->dirty_scrub_stamps.empty()) {
+    scrub_infop.reset();
+  }
+}
+
+// Local (non-recursive) scrub: validate rstats; on success stamp
+// last_local and dirty the stamps, otherwise record the error and
+// optionally kick off a dirfrag-stats repair.
+bool CDir::scrub_local()
+{
+  ceph_assert(is_complete());
+  bool rval = check_rstats(true);
+
+  scrub_info();
+  if (rval) {
+    scrub_infop->last_local.time = ceph_clock_now();
+    scrub_infop->last_local.version = get_projected_version();
+    scrub_infop->pending_scrub_error = false;
+    scrub_infop->last_scrub_dirty = true;
+  } else {
+    scrub_infop->pending_scrub_error = true;
+    if (scrub_infop->header->get_repair())
+      cache->repair_dirfrag_stats(this);
+  }
+  return rval;
+}
+
+std::string CDir::get_path() const
+{
+  std::string path;
+  get_inode()->make_path_string(path, true);
+  return path;
+}
+
+bool CDir::should_split_fast() const
+{
+  // Max size a fragment can be before triggering fast splitting
+  int fast_limit = g_conf()->mds_bal_split_size * g_conf()->mds_bal_fragment_fast_factor;
+
+  // Fast path: the sum of accounted size and null dentries does not
+  // exceed threshold: we definitely are not over it.
+  if (get_frag_size() + get_num_head_null() <= fast_limit) {
+    return false;
+  }
+
+  // Fast path: the accounted size of the frag exceeds threshold: we
+  // definitely are over it
+  if (get_frag_size() > fast_limit) {
+    return true;
+  }
+
+  // Slow path: count the non-null dentries one by one.
+  int64_t effective_size = 0;
+
+  for (const auto &p : items) {
+    const CDentry *dn = p.second;
+    if (!dn->get_projected_linkage()->is_null()) {
+      effective_size++;
+    }
+  }
+
+  return effective_size > fast_limit;
+}
+
+MEMPOOL_DEFINE_OBJECT_FACTORY(CDir, co_dir, mds_co);
diff --git a/src/mds/CDir.h b/src/mds/CDir.h
new file mode 100644
index 00000000..23c94c8b
--- /dev/null
+++ b/src/mds/CDir.h
@@ -0,0 +1,782 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ * + */ + + + +#ifndef CEPH_CDIR_H +#define CEPH_CDIR_H + +#include <iosfwd> +#include <list> +#include <map> +#include <set> +#include <string> +#include <string_view> + +#include "common/bloom_filter.hpp" +#include "common/config.h" +#include "include/buffer_fwd.h" +#include "include/counter.h" +#include "include/types.h" + +#include "CInode.h" +#include "MDSCacheObject.h" +#include "MDSContext.h" +#include "cephfs_features.h" +#include "SessionMap.h" +#include "messages/MClientReply.h" + +class CDentry; +class MDCache; + +struct ObjectOperation; + +ostream& operator<<(ostream& out, const class CDir& dir); +class CDir : public MDSCacheObject, public Counter<CDir> { + using time = ceph::coarse_mono_time; + using clock = ceph::coarse_mono_clock; + + friend ostream& operator<<(ostream& out, const class CDir& dir); + +public: + MEMPOOL_CLASS_HELPERS(); + // -- pins -- + static const int PIN_DNWAITER = 1; + static const int PIN_INOWAITER = 2; + static const int PIN_CHILD = 3; + static const int PIN_FROZEN = 4; + static const int PIN_SUBTREE = 5; + static const int PIN_IMPORTING = 7; + static const int PIN_IMPORTBOUND = 9; + static const int PIN_EXPORTBOUND = 10; + static const int PIN_STICKY = 11; + static const int PIN_SUBTREETEMP = 12; // used by MDCache::trim_non_auth() + std::string_view pin_name(int p) const override { + switch (p) { + case PIN_DNWAITER: return "dnwaiter"; + case PIN_INOWAITER: return "inowaiter"; + case PIN_CHILD: return "child"; + case PIN_FROZEN: return "frozen"; + case PIN_SUBTREE: return "subtree"; + case PIN_IMPORTING: return "importing"; + case PIN_IMPORTBOUND: return "importbound"; + case PIN_EXPORTBOUND: return "exportbound"; + case PIN_STICKY: return "sticky"; + case PIN_SUBTREETEMP: return "subtreetemp"; + default: return generic_pin_name(p); + } + } + + // -- state -- + static const unsigned STATE_COMPLETE = (1<< 0); // the complete contents are in cache + static const unsigned STATE_FROZENTREE = (1<< 1); // root of tree (bounded by 
exports)
+  static const unsigned STATE_FREEZINGTREE =  (1<< 2);  // in process of freezing
+  static const unsigned STATE_FROZENDIR =     (1<< 3);
+  static const unsigned STATE_FREEZINGDIR =   (1<< 4);
+  static const unsigned STATE_COMMITTING =    (1<< 5);  // mid-commit
+  static const unsigned STATE_FETCHING =      (1<< 6);  // currently fetching
+  static const unsigned STATE_CREATING =      (1<< 7);
+  static const unsigned STATE_IMPORTBOUND =   (1<< 8);
+  static const unsigned STATE_EXPORTBOUND =   (1<< 9);
+  static const unsigned STATE_EXPORTING =     (1<<10);
+  static const unsigned STATE_IMPORTING =     (1<<11);
+  static const unsigned STATE_FRAGMENTING =   (1<<12);
+  static const unsigned STATE_STICKY =        (1<<13);  // sticky pin due to inode stickydirs
+  static const unsigned STATE_DNPINNEDFRAG =  (1<<14);  // dir is refragmenting
+  static const unsigned STATE_ASSIMRSTAT =    (1<<15);  // assimilating inode->frag rstats
+  static const unsigned STATE_DIRTYDFT =      (1<<16);  // dirty dirfragtree
+  static const unsigned STATE_BADFRAG =       (1<<17);  // bad dirfrag
+  static const unsigned STATE_TRACKEDBYOFT =  (1<<18);  // tracked by open file table
+  static const unsigned STATE_AUXSUBTREE =    (1<<19);  // no subtree merge
+
+  // common states
+  static const unsigned STATE_CLEAN =  0;
+
+  // these state bits are preserved by an import/export
+  // ...except if the directory is hashed, in which case none of them are!
+ static const unsigned MASK_STATE_EXPORTED = + (STATE_COMPLETE|STATE_DIRTY|STATE_DIRTYDFT|STATE_BADFRAG); + static const unsigned MASK_STATE_IMPORT_KEPT = + ( + STATE_IMPORTING | + STATE_IMPORTBOUND | + STATE_EXPORTBOUND | + STATE_FROZENTREE | + STATE_STICKY | + STATE_TRACKEDBYOFT); + static const unsigned MASK_STATE_EXPORT_KEPT = + (STATE_EXPORTING | + STATE_IMPORTBOUND | + STATE_EXPORTBOUND | + STATE_FROZENTREE | + STATE_FROZENDIR | + STATE_STICKY | + STATE_TRACKEDBYOFT); + static const unsigned MASK_STATE_FRAGMENT_KEPT = + (STATE_DIRTY | + STATE_EXPORTBOUND | + STATE_IMPORTBOUND | + STATE_AUXSUBTREE | + STATE_REJOINUNDEF); + + // -- rep spec -- + static const int REP_NONE = 0; + static const int REP_ALL = 1; + static const int REP_LIST = 2; + + + static const unsigned EXPORT_NONCE = 1; + + + // -- wait masks -- + static const uint64_t WAIT_DENTRY = (1<<0); // wait for item to be in cache + static const uint64_t WAIT_COMPLETE = (1<<1); // wait for complete dir contents + static const uint64_t WAIT_FROZEN = (1<<2); // auth pins removed + static const uint64_t WAIT_CREATED = (1<<3); // new dirfrag is logged + + static const int WAIT_DNLOCK_OFFSET = 4; + + static const uint64_t WAIT_ANY_MASK = (uint64_t)(-1); + static const uint64_t WAIT_ATSUBTREEROOT = (WAIT_SINGLEAUTH); + + // -- dump flags -- + static const int DUMP_PATH = (1 << 0); + static const int DUMP_DIRFRAG = (1 << 1); + static const int DUMP_SNAPID_FIRST = (1 << 2); + static const int DUMP_VERSIONS = (1 << 3); + static const int DUMP_REP = (1 << 4); + static const int DUMP_DIR_AUTH = (1 << 5); + static const int DUMP_STATES = (1 << 6); + static const int DUMP_MDS_CACHE_OBJECT = (1 << 7); + static const int DUMP_ITEMS = (1 << 8); + static const int DUMP_ALL = (-1); + static const int DUMP_DEFAULT = DUMP_ALL & (~DUMP_ITEMS); + + public: + // context + MDCache *cache; + + CInode *inode; // my inode + frag_t frag; // my frag + + bool is_lt(const MDSCacheObject *r) const override { + return dirfrag() < 
(static_cast<const CDir*>(r))->dirfrag(); + } + + fnode_t fnode; + snapid_t first; + mempool::mds_co::compact_map<snapid_t,old_rstat_t> dirty_old_rstat; // [value.first,key] + + // my inodes with dirty rstat data + elist<CInode*> dirty_rstat_inodes; + + void resync_accounted_fragstat(); + void resync_accounted_rstat(); + void assimilate_dirty_rstat_inodes(); + void assimilate_dirty_rstat_inodes_finish(MutationRef& mut, EMetaBlob *blob); + + void mark_exporting() { + state_set(CDir::STATE_EXPORTING); + inode->num_exporting_dirs++; + } + void clear_exporting() { + state_clear(CDir::STATE_EXPORTING); + inode->num_exporting_dirs--; + } + +protected: + version_t projected_version; + mempool::mds_co::list<fnode_t> projected_fnode; + +public: + elist<CDentry*> dirty_dentries; + elist<CDir*>::item item_dirty, item_new; + +public: + version_t get_version() const { return fnode.version; } + void set_version(version_t v) { + ceph_assert(projected_fnode.empty()); + projected_version = fnode.version = v; + } + version_t get_projected_version() const { return projected_version; } + + const fnode_t *get_projected_fnode() const { + if (projected_fnode.empty()) + return &fnode; + else + return &projected_fnode.back(); + } + + fnode_t *get_projected_fnode() { + if (projected_fnode.empty()) + return &fnode; + else + return &projected_fnode.back(); + } + fnode_t *project_fnode(); + + void pop_and_dirty_projected_fnode(LogSegment *ls); + bool is_projected() const { return !projected_fnode.empty(); } + version_t pre_dirty(version_t min=0); + void _mark_dirty(LogSegment *ls); + void _set_dirty_flag() { + if (!state_test(STATE_DIRTY)) { + state_set(STATE_DIRTY); + get(PIN_DIRTY); + } + } + void mark_dirty(version_t pv, LogSegment *ls); + void mark_clean(); + + bool is_new() { return item_new.is_on_list(); } + void mark_new(LogSegment *ls); + + bool is_bad() { return state_test(STATE_BADFRAG); } +private: + void log_mark_dirty(); + +public: + typedef mempool::mds_co::map<dentry_key_t, 
CDentry*> dentry_key_map; + typedef mempool::mds_co::set<dentry_key_t> dentry_key_set; + + class scrub_info_t { + public: + /// inodes we contain with dirty scrub stamps + dentry_key_map dirty_scrub_stamps; // TODO: make use of this! + struct scrub_stamps { + version_t version; + utime_t time; + scrub_stamps() : version(0) {} + void operator=(const scrub_stamps &o) { + version = o.version; + time = o.time; + } + }; + + scrub_stamps recursive_start; // when we last started a recursive scrub + scrub_stamps last_recursive; // when we last finished a recursive scrub + scrub_stamps last_local; // when we last did a local scrub + + bool directory_scrubbing; /// safety check + bool need_scrub_local; + bool last_scrub_dirty; /// is scrub info dirty or is it flushed to fnode? + bool pending_scrub_error; + + /// these are lists of children in each stage of scrubbing + dentry_key_set directories_to_scrub; + dentry_key_set directories_scrubbing; + dentry_key_set directories_scrubbed; + dentry_key_set others_to_scrub; + dentry_key_set others_scrubbing; + dentry_key_set others_scrubbed; + + ScrubHeaderRefConst header; + + scrub_info_t() : + directory_scrubbing(false), + need_scrub_local(false), + last_scrub_dirty(false), + pending_scrub_error(false) {} + }; + /** + * Call to start this CDir on a new scrub. + * @pre It is not currently scrubbing + * @pre The CDir is marked complete. + * @post It has set up its internal scrubbing state. + */ + void scrub_initialize(const ScrubHeaderRefConst& header); + /** + * Get the next dentry to scrub. Gives you a CDentry* and its meaning. This + * function will give you all directory-representing dentries before any + * others. + * 0: success, you should scrub this CDentry right now + * EAGAIN: is currently fetching the next CDentry into memory for you. + * It will activate your callback when done; try again when it does! 
+ * ENOENT: there are no remaining dentries to scrub + * <0: There was an unexpected error + * + * @param cb An MDSContext which will be activated only if + * we return EAGAIN via rcode, or else ignored + * @param dnout CDentry * which you should next scrub, or NULL + * @returns a value as described above + */ + int scrub_dentry_next(MDSContext *cb, CDentry **dnout); + /** + * Get the currently scrubbing dentries. When returned, the passed-in + * list will be filled with all CDentry * which have been returned + * from scrub_dentry_next() but not sent back via scrub_dentry_finished(). + */ + void scrub_dentries_scrubbing(std::list<CDentry*> *out_dentries); + /** + * Report to the CDir that a CDentry has been scrubbed. Call this + * for every CDentry returned from scrub_dentry_next(). + * @param dn The CDentry which has been scrubbed. + */ + void scrub_dentry_finished(CDentry *dn); + /** + * Call this once all CDentries have been scrubbed, according to + * scrub_dentry_next's listing. It finalizes the scrub statistics. + */ + void scrub_finished(); + /** + * Tell the CDir to do a local scrub of itself. + * @pre The CDir is_complete(). + * @returns true if the rstats and directory contents match, false otherwise. + */ + bool scrub_local(); +private: + /** + * Create a scrub_info_t struct for the scrub_infop pointer. + */ + void scrub_info_create() const; + /** + * Delete the scrub_infop if it's not got any useful data. + */ + void scrub_maybe_delete_info(); + /** + * Check the given set (presumably one of those in scrub_info_t) for the + * next key to scrub and look it up (or fail!). 
+ */ + int _next_dentry_on_set(dentry_key_set &dns, bool missing_okay, + MDSContext *cb, CDentry **dnout); + + +protected: + std::unique_ptr<scrub_info_t> scrub_infop; // FIXME not in mempool + + // contents of this directory + dentry_key_map items; // non-null AND null + unsigned num_head_items; + unsigned num_head_null; + unsigned num_snap_items; + unsigned num_snap_null; + + int num_dirty; + + int num_inodes_with_caps = 0; + + // state + version_t committing_version; + version_t committed_version; + + mempool::mds_co::compact_set<mempool::mds_co::string> stale_items; + + // lock nesting, freeze + static int num_frozen_trees; + static int num_freezing_trees; + + int dir_auth_pins; + + // cache control (defined for authority; hints for replicas) + __s32 dir_rep; + mempool::mds_co::compact_set<__s32> dir_rep_by; // if dir_rep == REP_LIST + + // popularity + dirfrag_load_vec_t pop_me; + dirfrag_load_vec_t pop_nested; + dirfrag_load_vec_t pop_auth_subtree; + dirfrag_load_vec_t pop_auth_subtree_nested; + + time last_popularity_sample = clock::zero(); + + load_spread_t pop_spread; + + elist<CInode*> pop_lru_subdirs; + + // and to provide density + int num_dentries_nested; + int num_dentries_auth_subtree; + int num_dentries_auth_subtree_nested; + + + // friends + friend class Migrator; + friend class CInode; + friend class MDCache; + friend class MDiscover; + friend class MDBalancer; + + friend class CDirDiscover; + friend class CDirExport; + friend class C_IO_Dir_TMAP_Fetched; + friend class C_IO_Dir_OMAP_Fetched; + friend class C_IO_Dir_OMAP_FetchedMore; + friend class C_IO_Dir_Committed; + + std::unique_ptr<bloom_filter> bloom; // XXX not part of mempool::mds_co + /* If you set up the bloom filter, you must keep it accurate! 
+ * It's deleted when you mark_complete() and is deliberately not serialized.*/ + + public: + CDir(CInode *in, frag_t fg, MDCache *mdcache, bool auth); + + const scrub_info_t *scrub_info() const { + if (!scrub_infop) { + scrub_info_create(); + } + return scrub_infop.get(); + } + + + // -- accessors -- + inodeno_t ino() const { return inode->ino(); } // deprecate me? + frag_t get_frag() const { return frag; } + dirfrag_t dirfrag() const { return dirfrag_t(inode->ino(), frag); } + + CInode *get_inode() { return inode; } + const CInode *get_inode() const { return inode; } + CDir *get_parent_dir() { return inode->get_parent_dir(); } + + dentry_key_map::iterator begin() { return items.begin(); } + dentry_key_map::iterator end() { return items.end(); } + dentry_key_map::iterator lower_bound(dentry_key_t key) { return items.lower_bound(key); } + + unsigned get_num_head_items() const { return num_head_items; } + unsigned get_num_head_null() const { return num_head_null; } + unsigned get_num_snap_items() const { return num_snap_items; } + unsigned get_num_snap_null() const { return num_snap_null; } + unsigned get_num_any() const { return num_head_items + num_head_null + num_snap_items + num_snap_null; } + + bool check_rstats(bool scrub=false); + + void inc_num_dirty() { num_dirty++; } + void dec_num_dirty() { + ceph_assert(num_dirty > 0); + num_dirty--; + } + int get_num_dirty() const { + return num_dirty; + } + + void adjust_num_inodes_with_caps(int d); + + int64_t get_frag_size() const { + return get_projected_fnode()->fragstat.size(); + } + + // -- dentries and inodes -- + public: + CDentry* lookup_exact_snap(std::string_view dname, snapid_t last); + CDentry* lookup(std::string_view n, snapid_t snap=CEPH_NOSNAP); + + CDentry* add_null_dentry(std::string_view dname, + snapid_t first=2, snapid_t last=CEPH_NOSNAP); + CDentry* add_primary_dentry(std::string_view dname, CInode *in, + snapid_t first=2, snapid_t last=CEPH_NOSNAP); + CDentry* add_remote_dentry(std::string_view 
dname, inodeno_t ino, unsigned char d_type, + snapid_t first=2, snapid_t last=CEPH_NOSNAP); + void remove_dentry( CDentry *dn ); // delete dentry + void link_remote_inode( CDentry *dn, inodeno_t ino, unsigned char d_type); + void link_remote_inode( CDentry *dn, CInode *in ); + void link_primary_inode( CDentry *dn, CInode *in ); + void unlink_inode(CDentry *dn, bool adjust_lru=true); + void try_remove_unlinked_dn(CDentry *dn); + + void add_to_bloom(CDentry *dn); + bool is_in_bloom(std::string_view name); + bool has_bloom() { return (bloom ? true : false); } + void remove_bloom() { + bloom.reset(); + } +private: + void link_inode_work( CDentry *dn, CInode *in ); + void unlink_inode_work( CDentry *dn ); + void remove_null_dentries(); + void purge_stale_snap_data(const std::set<snapid_t>& snaps); +public: + void try_remove_dentries_for_stray(); + bool try_trim_snap_dentry(CDentry *dn, const std::set<snapid_t>& snaps); + + +public: + void split(int bits, std::list<CDir*>& subs, MDSContext::vec& waiters, bool replay); + void merge(std::list<CDir*>& subs, MDSContext::vec& waiters, bool replay); + + bool should_split() const { + return (int)get_frag_size() > g_conf()->mds_bal_split_size; + } + bool should_split_fast() const; + bool should_merge() const { + return (int)get_frag_size() < g_conf()->mds_bal_merge_size; + } + +private: + void prepare_new_fragment(bool replay); + void prepare_old_fragment(map<string_snap_t, MDSContext::vec >& dentry_waiters, bool replay); + void steal_dentry(CDentry *dn); // from another dir. used by merge/split. 
+ void finish_old_fragment(MDSContext::vec& waiters, bool replay); + void init_fragment_pins(); + + + // -- authority -- + /* + * normal: <parent,unknown> !subtree_root + * delegation: <mds,unknown> subtree_root + * ambiguous: <mds1,mds2> subtree_root + * <parent,mds2> subtree_root + */ + mds_authority_t dir_auth; + + std::string get_path() const; + + public: + mds_authority_t authority() const override; + mds_authority_t get_dir_auth() const { return dir_auth; } + void set_dir_auth(const mds_authority_t &a); + void set_dir_auth(mds_rank_t a) { set_dir_auth(mds_authority_t(a, CDIR_AUTH_UNKNOWN)); } + bool is_ambiguous_dir_auth() const { + return dir_auth.second != CDIR_AUTH_UNKNOWN; + } + bool is_full_dir_auth() const { + return is_auth() && !is_ambiguous_dir_auth(); + } + bool is_full_dir_nonauth() const { + return !is_auth() && !is_ambiguous_dir_auth(); + } + + bool is_subtree_root() const { + return dir_auth != CDIR_AUTH_DEFAULT; + } + + bool contains(CDir *x); // true if we are x or an ancestor of x + + + // for giving to clients + void get_dist_spec(std::set<mds_rank_t>& ls, mds_rank_t auth) { + if (is_auth()) { + list_replicas(ls); + if (!ls.empty()) + ls.insert(auth); + } + } + + static void encode_dirstat(bufferlist& bl, const session_info_t& info, const DirStat& ds); + + void _encode_base(bufferlist& bl) { + encode(first, bl); + encode(fnode, bl); + encode(dir_rep, bl); + encode(dir_rep_by, bl); + } + void _decode_base(bufferlist::const_iterator& p) { + decode(first, p); + decode(fnode, p); + decode(dir_rep, p); + decode(dir_rep_by, p); + } + void encode_replica(mds_rank_t who, bufferlist& bl) { + __u32 nonce = add_replica(who); + encode(nonce, bl); + _encode_base(bl); + } + void decode_replica(bufferlist::const_iterator& p) { + __u32 nonce; + decode(nonce, p); + replica_nonce = nonce; + _decode_base(p); + } + + + + // -- state -- + bool is_complete() { return state & STATE_COMPLETE; } + bool is_exporting() { return state & STATE_EXPORTING; } + bool 
is_importing() { return state & STATE_IMPORTING; } + bool is_dirty_dft() { return state & STATE_DIRTYDFT; } + + int get_dir_rep() const { return dir_rep; } + bool is_rep() const { + if (dir_rep == REP_NONE) return false; + return true; + } + + // -- fetch -- + object_t get_ondisk_object() { + return file_object_t(ino(), frag); + } + void fetch(MDSContext *c, bool ignore_authpinnability=false); + void fetch(MDSContext *c, std::string_view want_dn, bool ignore_authpinnability=false); + void fetch(MDSContext *c, const std::set<dentry_key_t>& keys); +protected: + mempool::mds_co::compact_set<mempool::mds_co::string> wanted_items; + + void _omap_fetch(MDSContext *fin, const std::set<dentry_key_t>& keys); + void _omap_fetch_more( + bufferlist& hdrbl, std::map<std::string, bufferlist>& omap, + MDSContext *fin); + CDentry *_load_dentry( + std::string_view key, + std::string_view dname, + snapid_t last, + bufferlist &bl, + int pos, + const std::set<snapid_t> *snaps, + bool *force_dirty); + + /** + * Go bad due to a damaged dentry (register with damagetable and go BADFRAG) + */ + void go_bad_dentry(snapid_t last, std::string_view dname); + + /** + * Go bad due to a damaged header (register with damagetable and go BADFRAG) + */ + void go_bad(bool complete); + + void _omap_fetched(bufferlist& hdrbl, std::map<std::string, bufferlist>& omap, + bool complete, int r); + + // -- commit -- + mempool::mds_co::compact_map<version_t, MDSContext::vec_alloc<mempool::mds_co::pool_allocator> > waiting_for_commit; + void _commit(version_t want, int op_prio); + void _omap_commit(int op_prio); + void _encode_dentry(CDentry *dn, bufferlist& bl, const std::set<snapid_t> *snaps); + void _committed(int r, version_t v); +public: +#if 0 // unused? 
+ void wait_for_commit(Context *c, version_t v=0); +#endif + void commit_to(version_t want); + void commit(version_t want, MDSContext *c, + bool ignore_authpinnability=false, int op_prio=-1); + + // -- dirtyness -- + version_t get_committing_version() const { return committing_version; } + version_t get_committed_version() const { return committed_version; } + void set_committed_version(version_t v) { committed_version = v; } + + void mark_complete(); + + + // -- reference counting -- + void first_get() override; + void last_put() override; + + // -- waiters -- +protected: + mempool::mds_co::compact_map< string_snap_t, MDSContext::vec_alloc<mempool::mds_co::pool_allocator> > waiting_on_dentry; // FIXME string_snap_t not in mempool + +public: + bool is_waiting_for_dentry(std::string_view dname, snapid_t snap) { + return waiting_on_dentry.count(string_snap_t(dname, snap)); + } + void add_dentry_waiter(std::string_view dentry, snapid_t snap, MDSContext *c); + void take_dentry_waiting(std::string_view dentry, snapid_t first, snapid_t last, MDSContext::vec& ls); + void take_sub_waiting(MDSContext::vec& ls); // dentry or ino + + void add_waiter(uint64_t mask, MDSContext *c) override; + void take_waiting(uint64_t mask, MDSContext::vec& ls) override; // may include dentry waiters + void finish_waiting(uint64_t mask, int result = 0); // ditto + + + // -- import/export -- + void encode_export(bufferlist& bl); + void finish_export(); + void abort_export() { + put(PIN_TEMPEXPORTING); + } + void decode_import(bufferlist::const_iterator& blp, LogSegment *ls); + void abort_import(); + + // -- auth pins -- + bool can_auth_pin(int *err_ret=nullptr) const override; + int get_auth_pins() const { return auth_pins; } + int get_dir_auth_pins() const { return dir_auth_pins; } + void auth_pin(void *who) override; + void auth_unpin(void *who) override; + + void adjust_nested_auth_pins(int dirinc, void *by); + void verify_fragstat(); + + // -- freezing -- + struct freeze_tree_state_t { + 
CDir *dir; // freezing/frozen tree root + int auth_pins = 0; + bool frozen = false; + freeze_tree_state_t(CDir *d) : dir(d) {} + }; + // all dirfrags within freezing/frozen tree reference the 'state' + std::shared_ptr<freeze_tree_state_t> freeze_tree_state; + + void _walk_tree(std::function<bool(CDir*)> cb); + + bool freeze_tree(); + void _freeze_tree(); + void unfreeze_tree(); + void adjust_freeze_after_rename(CDir *dir); + + bool freeze_dir(); + void _freeze_dir(); + void unfreeze_dir(); + + void maybe_finish_freeze(); + + pair<bool,bool> is_freezing_or_frozen_tree() const { + if (freeze_tree_state) { + if (freeze_tree_state->frozen) + return make_pair(false, true); + return make_pair(true, false); + } + return make_pair(false, false); + } + + bool is_freezing() const override { return is_freezing_dir() || is_freezing_tree(); } + bool is_freezing_tree() const { + if (!num_freezing_trees) + return false; + return is_freezing_or_frozen_tree().first; + } + bool is_freezing_tree_root() const { return state & STATE_FREEZINGTREE; } + bool is_freezing_dir() const { return state & STATE_FREEZINGDIR; } + + bool is_frozen() const override { return is_frozen_dir() || is_frozen_tree(); } + bool is_frozen_tree() const { + if (!num_frozen_trees) + return false; + return is_freezing_or_frozen_tree().second; + } + bool is_frozen_tree_root() const { return state & STATE_FROZENTREE; } + bool is_frozen_dir() const { return state & STATE_FROZENDIR; } + + bool is_freezeable(bool freezing=false) const { + // no nested auth pins. + if (auth_pins - (freezing ? 1 : 0) > 0 || + (freeze_tree_state && freeze_tree_state->auth_pins != auth_pins)) + return false; + + // inode must not be frozen. + if (!is_subtree_root() && inode->is_frozen()) + return false; + + return true; + } + + bool is_freezeable_dir(bool freezing=false) const { + if ((auth_pins - freezing) > 0 || dir_auth_pins > 0) + return false; + + // if not subtree root, inode must not be frozen (tree--frozen_dir is okay). 
+ if (!is_subtree_root() && inode->is_frozen() && !inode->is_frozen_dir()) + return false; + + return true; + } + + ostream& print_db_line_prefix(ostream& out) override; + void print(ostream& out) override; + void dump(Formatter *f, int flags = DUMP_DEFAULT) const; + void dump_load(Formatter *f); +}; + +#endif diff --git a/src/mds/CInode.cc b/src/mds/CInode.cc new file mode 100644 index 00000000..e5491171 --- /dev/null +++ b/src/mds/CInode.cc @@ -0,0 +1,4959 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "include/int_types.h" +#include "common/errno.h" + +#include <string> +#include <stdio.h> + +#include "CInode.h" +#include "CDir.h" +#include "CDentry.h" + +#include "MDSRank.h" +#include "MDCache.h" +#include "MDLog.h" +#include "Locker.h" +#include "Mutation.h" + +#include "events/EUpdate.h" + +#include "osdc/Objecter.h" + +#include "snap.h" + +#include "LogSegment.h" + +#include "common/Clock.h" + +#include "common/config.h" +#include "global/global_context.h" +#include "include/ceph_assert.h" + +#include "mds/MDSContinuation.h" +#include "mds/InoTable.h" +#include "cephfs_features.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mds +#undef dout_prefix +#define dout_prefix *_dout << "mds." 
<< mdcache->mds->get_nodeid() << ".cache.ino(" << inode.ino << ") " + + +class CInodeIOContext : public MDSIOContextBase +{ +protected: + CInode *in; + MDSRank *get_mds() override {return in->mdcache->mds;} +public: + explicit CInodeIOContext(CInode *in_) : in(in_) { + ceph_assert(in != NULL); + } +}; + +sr_t* const CInode::projected_inode::UNDEF_SRNODE = (sr_t*)(unsigned long)-1; + +LockType CInode::versionlock_type(CEPH_LOCK_IVERSION); +LockType CInode::authlock_type(CEPH_LOCK_IAUTH); +LockType CInode::linklock_type(CEPH_LOCK_ILINK); +LockType CInode::dirfragtreelock_type(CEPH_LOCK_IDFT); +LockType CInode::filelock_type(CEPH_LOCK_IFILE); +LockType CInode::xattrlock_type(CEPH_LOCK_IXATTR); +LockType CInode::snaplock_type(CEPH_LOCK_ISNAP); +LockType CInode::nestlock_type(CEPH_LOCK_INEST); +LockType CInode::flocklock_type(CEPH_LOCK_IFLOCK); +LockType CInode::policylock_type(CEPH_LOCK_IPOLICY); + +//int cinode_pins[CINODE_NUM_PINS]; // counts +ostream& CInode::print_db_line_prefix(ostream& out) +{ + return out << ceph_clock_now() << " mds." << mdcache->mds->get_nodeid() << ".cache.ino(" << inode.ino << ") "; +} + +/* + * write caps and lock ids + */ +struct cinode_lock_info_t cinode_lock_info[] = { + { CEPH_LOCK_IFILE, CEPH_CAP_ANY_FILE_WR }, + { CEPH_LOCK_IAUTH, CEPH_CAP_AUTH_EXCL }, + { CEPH_LOCK_ILINK, CEPH_CAP_LINK_EXCL }, + { CEPH_LOCK_IXATTR, CEPH_CAP_XATTR_EXCL }, +}; +int num_cinode_locks = sizeof(cinode_lock_info) / sizeof(cinode_lock_info[0]); + + + +ostream& operator<<(ostream& out, const CInode& in) +{ + string path; + in.make_path_string(path, true); + + out << "[inode " << in.inode.ino; + out << " [" + << (in.is_multiversion() ? "...":"") + << in.first << "," << in.last << "]"; + out << " " << path << (in.is_dir() ? 
"/":""); + + if (in.is_auth()) { + out << " auth"; + if (in.is_replicated()) + out << in.get_replicas(); + } else { + mds_authority_t a = in.authority(); + out << " rep@" << a.first; + if (a.second != CDIR_AUTH_UNKNOWN) + out << "," << a.second; + out << "." << in.get_replica_nonce(); + } + + if (in.is_symlink()) + out << " symlink='" << in.symlink << "'"; + if (in.is_dir() && !in.dirfragtree.empty()) + out << " " << in.dirfragtree; + + out << " v" << in.get_version(); + if (in.get_projected_version() > in.get_version()) + out << " pv" << in.get_projected_version(); + + if (in.get_num_auth_pins()) { + out << " ap=" << in.get_num_auth_pins(); +#ifdef MDS_AUTHPIN_SET + in.print_authpin_set(out); +#endif + } + + if (in.snaprealm) + out << " snaprealm=" << in.snaprealm; + + if (in.state_test(CInode::STATE_AMBIGUOUSAUTH)) out << " AMBIGAUTH"; + if (in.state_test(CInode::STATE_NEEDSRECOVER)) out << " needsrecover"; + if (in.state_test(CInode::STATE_RECOVERING)) out << " recovering"; + if (in.state_test(CInode::STATE_DIRTYPARENT)) out << " dirtyparent"; + if (in.state_test(CInode::STATE_MISSINGOBJS)) out << " missingobjs"; + if (in.is_freezing_inode()) out << " FREEZING=" << in.auth_pin_freeze_allowance; + if (in.is_frozen_inode()) out << " FROZEN"; + if (in.is_frozen_auth_pin()) out << " FROZEN_AUTHPIN"; + + const CInode::mempool_inode *pi = in.get_projected_inode(); + if (pi->is_truncating()) + out << " truncating(" << pi->truncate_from << " to " << pi->truncate_size << ")"; + + if (in.inode.is_dir()) { + out << " " << in.inode.dirstat; + if (g_conf()->mds_debug_scatterstat && in.is_projected()) { + const CInode::mempool_inode *pi = in.get_projected_inode(); + out << "->" << pi->dirstat; + } + } else { + out << " s=" << in.inode.size; + if (in.inode.nlink != 1) + out << " nl=" << in.inode.nlink; + } + + // rstat + out << " " << in.inode.rstat; + if (!(in.inode.rstat == in.inode.accounted_rstat)) + out << "/" << in.inode.accounted_rstat; + if 
(g_conf()->mds_debug_scatterstat && in.is_projected()) { + const CInode::mempool_inode *pi = in.get_projected_inode(); + out << "->" << pi->rstat; + if (!(pi->rstat == pi->accounted_rstat)) + out << "/" << pi->accounted_rstat; + } + + if (!in.client_need_snapflush.empty()) + out << " need_snapflush=" << in.client_need_snapflush; + + + // locks + if (!in.authlock.is_sync_and_unlocked()) + out << " " << in.authlock; + if (!in.linklock.is_sync_and_unlocked()) + out << " " << in.linklock; + if (in.inode.is_dir()) { + if (!in.dirfragtreelock.is_sync_and_unlocked()) + out << " " << in.dirfragtreelock; + if (!in.snaplock.is_sync_and_unlocked()) + out << " " << in.snaplock; + if (!in.nestlock.is_sync_and_unlocked()) + out << " " << in.nestlock; + if (!in.policylock.is_sync_and_unlocked()) + out << " " << in.policylock; + } else { + if (!in.flocklock.is_sync_and_unlocked()) + out << " " << in.flocklock; + } + if (!in.filelock.is_sync_and_unlocked()) + out << " " << in.filelock; + if (!in.xattrlock.is_sync_and_unlocked()) + out << " " << in.xattrlock; + if (!in.versionlock.is_sync_and_unlocked()) + out << " " << in.versionlock; + + // hack: spit out crap on which clients have caps + if (in.inode.client_ranges.size()) + out << " cr=" << in.inode.client_ranges; + + if (!in.get_client_caps().empty()) { + out << " caps={"; + bool first = true; + for (const auto &p : in.get_client_caps()) { + if (!first) out << ","; + out << p.first << "=" + << ccap_string(p.second.pending()); + if (p.second.issued() != p.second.pending()) + out << "/" << ccap_string(p.second.issued()); + out << "/" << ccap_string(p.second.wanted()) + << "@" << p.second.get_last_seq(); + first = false; + } + out << "}"; + if (in.get_loner() >= 0 || in.get_wanted_loner() >= 0) { + out << ",l=" << in.get_loner(); + if (in.get_loner() != in.get_wanted_loner()) + out << "(" << in.get_wanted_loner() << ")"; + } + } + if (!in.get_mds_caps_wanted().empty()) { + out << " mcw={"; + bool first = true; + for (const auto &p 
: in.get_mds_caps_wanted()) { + if (!first) + out << ','; + out << p.first << '=' << ccap_string(p.second); + first = false; + } + out << '}'; + } + + if (in.get_num_ref()) { + out << " |"; + in.print_pin_set(out); + } + + if (in.inode.export_pin != MDS_RANK_NONE) { + out << " export_pin=" << in.inode.export_pin; + } + + out << " " << ∈ + out << "]"; + return out; +} + +ostream& operator<<(ostream& out, const CInode::scrub_stamp_info_t& si) +{ + out << "{scrub_start_version: " << si.scrub_start_version + << ", scrub_start_stamp: " << si.scrub_start_stamp + << ", last_scrub_version: " << si.last_scrub_version + << ", last_scrub_stamp: " << si.last_scrub_stamp; + return out; +} + +CInode::CInode(MDCache *c, bool auth, snapid_t f, snapid_t l) + : + mdcache(c), + first(f), last(l), + item_dirty(this), + item_caps(this), + item_open_file(this), + item_dirty_parent(this), + item_dirty_dirfrag_dir(this), + item_dirty_dirfrag_nest(this), + item_dirty_dirfrag_dirfragtree(this), + pop(c->decayrate), + versionlock(this, &versionlock_type), + authlock(this, &authlock_type), + linklock(this, &linklock_type), + dirfragtreelock(this, &dirfragtreelock_type), + filelock(this, &filelock_type), + xattrlock(this, &xattrlock_type), + snaplock(this, &snaplock_type), + nestlock(this, &nestlock_type), + flocklock(this, &flocklock_type), + policylock(this, &policylock_type) +{ + if (auth) state_set(STATE_AUTH); +} + +void CInode::print(ostream& out) +{ + out << *this; +} + +void CInode::add_need_snapflush(CInode *snapin, snapid_t snapid, client_t client) +{ + dout(10) << __func__ << " client." << client << " snapid " << snapid << " on " << snapin << dendl; + + if (client_need_snapflush.empty()) { + get(CInode::PIN_NEEDSNAPFLUSH); + + // FIXME: this is non-optimal, as we'll block freezes/migrations for potentially + // long periods waiting for clients to flush their snaps. + auth_pin(this); // pin head inode... 
  }

  // One auth_pin is held on the snapped inode per distinct snapid that still
  // needs a flush (taken when the snapid's client set first becomes non-empty).
  auto &clients = client_need_snapflush[snapid];
  if (clients.empty())
    snapin->auth_pin(this);   // ...and pin snapped/old inode!

  clients.insert(client);
}

// Reverse of add_need_snapflush: drop 'client' from the snapid's waiter set,
// releasing the per-snapid pin on 'snapin' (and the head-inode pin once no
// snapids remain). Missing snapid/client entries are tolerated and logged.
void CInode::remove_need_snapflush(CInode *snapin, snapid_t snapid, client_t client)
{
  dout(10) << __func__ << " client." << client << " snapid " << snapid << " on " << snapin << dendl;
  auto it = client_need_snapflush.find(snapid);
  if (it == client_need_snapflush.end()) {
    dout(10) << " snapid not found" << dendl;
    return;
  }
  size_t n = it->second.erase(client);
  if (n == 0) {
    dout(10) << " client not found" << dendl;
    return;
  }
  if (it->second.empty()) {
    client_need_snapflush.erase(it);
    snapin->auth_unpin(this);

    // last snapid gone: drop the head-inode pin taken in add_need_snapflush
    if (client_need_snapflush.empty()) {
      put(CInode::PIN_NEEDSNAPFLUSH);
      auth_unpin(this);
    }
  }
}

// When 'in' is copy-on-written into 'cowin', move the pending-snapflush pins
// for snapids older than in->first from 'in' onto 'cowin' (or drop them when
// cowin does not cover the snapid either). Returns (cowin needs flush,
// original inode still needs flush).
pair<bool,bool> CInode::split_need_snapflush(CInode *cowin, CInode *in)
{
  dout(10) << __func__ << " [" << cowin->first << "," << cowin->last << "] for " << *cowin << dendl;
  bool cowin_need_flush = false;
  bool orig_need_flush = false;
  auto it = client_need_snapflush.lower_bound(cowin->first);
  while (it != client_need_snapflush.end() && it->first < in->first) {
    ceph_assert(!it->second.empty());
    if (cowin->last >= it->first) {
      // snapid now covered by cowin: transfer the pin
      cowin->auth_pin(this);
      cowin_need_flush = true;
      ++it;
    } else {
      // covered by neither inode any longer: forget it
      it = client_need_snapflush.erase(it);
    }
    // either way this snapid no longer pins 'in'
    in->auth_unpin(this);
  }

  if (it != client_need_snapflush.end() && it->first <= in->last)
    orig_need_flush = true;

  return make_pair(cowin_need_flush, orig_need_flush);
}

// Flag this inode's rstat as dirty and queue it on the auth parent dirfrag's
// dirty_rstat list so the nestlock scatter-gather propagates it upward.
void CInode::mark_dirty_rstat()
{
  if (!state_test(STATE_DIRTYRSTAT)) {
    dout(10) << __func__ << dendl;
    state_set(STATE_DIRTYRSTAT);
    get(PIN_DIRTYRSTAT);
    CDentry *pdn = get_projected_parent_dn();
    if (pdn->is_auth()) {
      CDir *pdir = pdn->dir;
      pdir->dirty_rstat_inodes.push_back(&dirty_rstat_item);
      mdcache->mds->locker->mark_updated_scatterlock(&pdir->inode->nestlock);
    } else {
      // under cross-MDS rename.
      // DIRTYRSTAT flag will get cleared when rename finishes
      ceph_assert(state_test(STATE_AMBIGUOUSAUTH));
    }
  }
}

// Undo mark_dirty_rstat: clear the flag, drop the pin, and unhook from the
// parent dirfrag's dirty_rstat list.
void CInode::clear_dirty_rstat()
{
  if (state_test(STATE_DIRTYRSTAT)) {
    dout(10) << __func__ << dendl;
    state_clear(STATE_DIRTYRSTAT);
    put(PIN_DIRTYRSTAT);
    dirty_rstat_item.remove_myself();
  }
}

// Push a new projected (uncommitted) copy of the inode onto projected_nodes,
// seeded from the most recent projection (or the stable inode if none).
// Optionally also projects the xattr map and/or the snaprealm node.
CInode::projected_inode &CInode::project_inode(bool xattr, bool snap)
{
  auto &pi = projected_nodes.empty() ?
    projected_nodes.emplace_back(inode) :
    projected_nodes.emplace_back(projected_nodes.back().inode);

  // fold any pending scrub stamps into the projected copy before they are lost
  if (scrub_infop && scrub_infop->last_scrub_dirty) {
    pi.inode.last_scrub_stamp = scrub_infop->last_scrub_stamp;
    pi.inode.last_scrub_version = scrub_infop->last_scrub_version;
    scrub_infop->last_scrub_dirty = false;
    scrub_maybe_delete_info();
  }

  if (xattr) {
    pi.xattrs.reset(new mempool_xattr_map(*get_projected_xattrs()));
    ++num_projected_xattrs;
  }

  if (snap) {
    project_snaprealm();
  }

  dout(15) << __func__ << " " << pi.inode.ino << dendl;
  return pi;
}

// Commit the oldest projection: copy it into the stable inode, mark the inode
// dirty in log segment 'ls', and apply any projected xattrs/snaprealm.
void CInode::pop_and_dirty_projected_inode(LogSegment *ls)
{
  ceph_assert(!projected_nodes.empty());
  auto &front = projected_nodes.front();
  dout(15) << __func__ << " " << front.inode.ino
           << " v" << front.inode.version << dendl;
  int64_t old_pool = inode.layout.pool_id;

  mark_dirty(front.inode.version, ls);
  bool new_export_pin = inode.export_pin != front.inode.export_pin;
  inode = front.inode;
  if (new_export_pin)
    maybe_export_pin(true);

  // a pool change requires the backtrace to be rewritten
  if (inode.is_backtrace_updated())
    mark_dirty_parent(ls, old_pool != inode.layout.pool_id);

  if (front.xattrs) {
    --num_projected_xattrs;
    xattrs = *front.xattrs;
  }

  if (projected_nodes.front().snapnode != projected_inode::UNDEF_SRNODE) {
    pop_projected_snaprealm(projected_nodes.front().snapnode, false);
    --num_projected_srnodes;
  }

  projected_nodes.pop_front();
}

// Build a fresh sr_t for projection: either a copy of the current projected
// srnode (converting legacy past_parents to past_parent_snaps), or a brand
// new one seeded at 'snapid' (defaulting to the global snaprealm's seq).
sr_t *CInode::prepare_new_srnode(snapid_t snapid)
{
  const sr_t *cur_srnode = get_projected_srnode();
  sr_t *new_srnode;

  if (cur_srnode) {
    new_srnode = new sr_t(*cur_srnode);
    if (!new_srnode->past_parents.empty()) {
      // convert past_parents to past_parent_snaps
      ceph_assert(snaprealm);
      auto& snaps = snaprealm->get_snaps();
      for (auto p : snaps) {
        if (p >= new_srnode->current_parent_since)
          break;
        if (!new_srnode->snaps.count(p))
          new_srnode->past_parent_snaps.insert(p);
      }
      new_srnode->seq = snaprealm->get_newest_seq();
      new_srnode->past_parents.clear();
    }
    if (snaprealm)
      snaprealm->past_parents_dirty = false;
  } else {
    if (snapid == 0)
      snapid = mdcache->get_global_snaprealm()->get_newest_seq();
    new_srnode = new sr_t();
    new_srnode->seq = snapid;
    new_srnode->created = snapid;
    new_srnode->current_parent_since = get_oldest_snap();
  }
  return new_srnode;
}

// Attach 'new_srnode' to the newest projection; exactly one snaprealm
// projection is allowed per projected node (UNDEF_SRNODE = none yet).
void CInode::project_snaprealm(sr_t *new_srnode)
{
  dout(10) << __func__ << " " << new_srnode << dendl;
  ceph_assert(projected_nodes.back().snapnode == projected_inode::UNDEF_SRNODE);
  projected_nodes.back().snapnode = new_srnode;
  ++num_projected_srnodes;
}

void CInode::mark_snaprealm_global(sr_t *new_srnode)
{
  ceph_assert(!is_dir());
  // 'last_destroyed' is no longer used, use it to store origin 'current_parent_since'
  new_srnode->last_destroyed = new_srnode->current_parent_since;
  new_srnode->current_parent_since = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
  new_srnode->mark_parent_global();
}

void CInode::clear_snaprealm_global(sr_t *new_srnode)
{
  // restore 'current_parent_since'
  new_srnode->current_parent_since = new_srnode->last_destroyed;
  new_srnode->last_destroyed = 0;
  new_srnode->seq = mdcache->get_global_snaprealm()->get_newest_seq();
  new_srnode->clear_parent_global();
}

// True when the (projected) snaprealm node is flagged parent-global.
bool CInode::is_projected_snaprealm_global() const
{
  const sr_t *srnode = get_projected_srnode();
  if (srnode && srnode->is_parent_global())
    return true;
  return false;
}

void CInode::project_snaprealm_past_parent(SnapRealm *newparent)
{
  sr_t *new_snap = project_snaprealm();
  record_snaprealm_past_parent(new_snap, newparent);
}


/* if newparent != parent, add parent to past_parents
   if parent DNE, we need to find what the parent actually is and fill that in */
void CInode::record_snaprealm_past_parent(sr_t *new_snap, SnapRealm *newparent)
{
  ceph_assert(!new_snap->is_parent_global());
  SnapRealm *oldparent;
  if (!snaprealm) {
    oldparent = find_snaprealm();
  } else {
    oldparent = snaprealm->parent;
  }

  if (newparent != oldparent) {
    snapid_t oldparentseq = oldparent->get_newest_seq();
    if (oldparentseq + 1 > new_snap->current_parent_since) {
      // copy old parent's snaps
      const set<snapid_t>& snaps = oldparent->get_snaps();
      auto p = snaps.lower_bound(new_snap->current_parent_since);
      if (p != snaps.end())
        new_snap->past_parent_snaps.insert(p, snaps.end());
      if (oldparentseq > new_snap->seq)
        new_snap->seq = oldparentseq;
    }
    new_snap->current_parent_since = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
  }
}

// Record the old parent realm's snaps in a parent-global srnode when a dentry
// for this inode changes; primary_dn selects which epoch marker is used as
// the lower bound for the copy.
void CInode::record_snaprealm_parent_dentry(sr_t *new_snap, SnapRealm *oldparent,
                                            CDentry *dn, bool primary_dn)
{
  ceph_assert(new_snap->is_parent_global());

  if (!oldparent)
    oldparent = dn->get_dir()->inode->find_snaprealm();
  auto& snaps = oldparent->get_snaps();

  if (!primary_dn) {
    auto p = snaps.lower_bound(dn->first);
    if (p != snaps.end())
      new_snap->past_parent_snaps.insert(p, snaps.end());
  } else {
    // 'last_destroyed' is used as 'current_parent_since'
    auto p = snaps.lower_bound(new_snap->last_destroyed);
    if (p != snaps.end())
      new_snap->past_parent_snaps.insert(p, snaps.end());
    new_snap->last_destroyed = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
  }
}

// Apply the front projection's snaprealm ahead of the normal pop (the slot is
// reset to UNDEF_SRNODE so pop_and_dirty_projected_inode won't apply it twice).
void CInode::early_pop_projected_snaprealm()
{
  ceph_assert(!projected_nodes.empty());
  if (projected_nodes.front().snapnode != projected_inode::UNDEF_SRNODE) {
    pop_projected_snaprealm(projected_nodes.front().snapnode, true);
    projected_nodes.front().snapnode = projected_inode::UNDEF_SRNODE;
    --num_projected_srnodes;
  }
}

// Install a projected srnode into the live snaprealm (opening it if needed),
// or merge the realm away when the projection is null. Takes ownership of and
// deletes 'next_snaprealm'.
void CInode::pop_projected_snaprealm(sr_t *next_snaprealm, bool early)
{
  if (next_snaprealm) {
    dout(10) << __func__ << (early ? " (early) " : " ")
             << next_snaprealm << " seq " << next_snaprealm->seq << dendl;
    bool invalidate_cached_snaps = false;
    if (!snaprealm) {
      open_snaprealm();
    } else if (!snaprealm->srnode.past_parents.empty()) {
      invalidate_cached_snaps = true;
      // re-open past parents
      snaprealm->close_parents();

      dout(10) << " realm " << *snaprealm << " past_parents " << snaprealm->srnode.past_parents
               << " -> " << next_snaprealm->past_parents << dendl;
    }
    auto old_flags = snaprealm->srnode.flags;
    snaprealm->srnode = *next_snaprealm;
    delete next_snaprealm;

    // PARENT_GLOBAL flipped: the parent link must be recomputed
    if ((snaprealm->srnode.flags ^ old_flags) & sr_t::PARENT_GLOBAL) {
      snaprealm->close_parents();
      snaprealm->adjust_parent();
    }

    // we should be able to open these up (or have them already be open).
    bool ok = snaprealm->_open_parents(NULL);
    ceph_assert(ok);

    if (invalidate_cached_snaps)
      snaprealm->invalidate_cached_snaps();

    if (snaprealm->parent)
      dout(10) << " realm " << *snaprealm << " parent " << *snaprealm->parent << dendl;
  } else {
    dout(10) << __func__ << (early ? " (early) null" : " null") << dendl;
    ceph_assert(snaprealm);
    snaprealm->merge_to(NULL);
  }
}


// ====== CInode =======

// dirfrags

// Hash a dentry name with the directory's configured hash (default: linux).
__u32 InodeStoreBase::hash_dentry_name(std::string_view dn)
{
  int which = inode.dir_layout.dl_dir_hash;
  if (!which)
    which = CEPH_STR_HASH_LINUX;
  ceph_assert(ceph_str_hash_valid(which));
  return ceph_str_hash(which, dn.data(), dn.length());
}

// Map a dentry name to the dirfrag that holds it via the fragtree.
frag_t InodeStoreBase::pick_dirfrag(std::string_view dn)
{
  if (dirfragtree.empty())
    return frag_t();   // avoid the string hash if we can.
  __u32 h = hash_dentry_name(dn);
  return dirfragtree[h];
}

// Collect into 'ls' the open CDirs under fragment 'fg'. Returns true when
// every leaf under 'fg' is open (i.e. 'ls' is complete). The second pass
// handles open dirfrags that are not leaves of the current fragtree.
bool CInode::get_dirfrags_under(frag_t fg, list<CDir*>& ls)
{
  bool all = true;
  {
    frag_vec_t leaves;
    dirfragtree.get_leaves_under(fg, leaves);
    for (const auto &leaf : leaves) {
      if (auto it = dirfrags.find(leaf); it != dirfrags.end()) {
        ls.push_back(it->second);
      } else {
        all = false;
      }
    }
  }

  if (all)
    return all;

  // build a temp fragtree that includes every open dirfrag as a leaf, then
  // re-check coverage against it
  fragtree_t tmpdft;
  tmpdft.force_to_leaf(g_ceph_context, fg);
  for (auto &p : dirfrags) {
    tmpdft.force_to_leaf(g_ceph_context, p.first);
    if (fg.contains(p.first) && !dirfragtree.is_leaf(p.first))
      ls.push_back(p.second);
  }

  all = true;
  {
    frag_vec_t leaves;
    tmpdft.get_leaves_under(fg, leaves);
    for (const auto& leaf : leaves) {
      if (!dirfrags.count(leaf)) {
        all = false;
        break;
      }
    }
  }

  return all;
}

// Assert that every open dirfrag is a leaf of the fragtree (debug check).
void CInode::verify_dirfrags()
{
  bool bad = false;
  for (const auto &p : dirfrags) {
    if (!dirfragtree.is_leaf(p.first)) {
      dout(0) << "have open dirfrag " << p.first << " but not leaf in " << dirfragtree
              << ": " << *p.second << dendl;
      bad = true;
    }
  }
  ceph_assert(!bad);
}

// If any open dirfrag disagrees with the fragtree, force-open every fragtree
// leaf so the in-memory set matches, then re-verify.
void CInode::force_dirfrags()
{
  bool bad = false;
  for (auto &p : dirfrags) {
    if (!dirfragtree.is_leaf(p.first)) {
      dout(0) << "have open dirfrag " << p.first << " but not leaf in " << dirfragtree
              << ": " << *p.second << dendl;
      bad = true;
    }
  }

  if (bad) {
    frag_vec_t leaves;
    dirfragtree.get_leaves(leaves);
    for (const auto& leaf : leaves) {
      mdcache->get_force_dirfrag(dirfrag_t(ino(), leaf), true);
    }
  }

  verify_dirfrags();
}

// Best-effort dirfrag lookup: exact frag, else any open child, else the
// nearest open ancestor fragment.
CDir *CInode::get_approx_dirfrag(frag_t fg)
{
  CDir *dir = get_dirfrag(fg);
  if (dir) return dir;

  // find a child?
  list<CDir*> ls;
  get_dirfrags_under(fg, ls);
  if (!ls.empty())
    return ls.front();

  // try parents?
  while (fg.bits() > 0) {
    fg = fg.parent();
    dir = get_dirfrag(fg);
    if (dir) return dir;
  }
  return NULL;
}

// Return the CDir for 'fg', creating and registering it when absent. Creation
// is only legal on the auth MDS or during replay.
CDir *CInode::get_or_open_dirfrag(MDCache *mdcache, frag_t fg)
{
  ceph_assert(is_dir());

  // have it?
  CDir *dir = get_dirfrag(fg);
  if (!dir) {
    // create it.
    ceph_assert(is_auth() || mdcache->mds->is_any_replay());
    dir = new CDir(this, fg, mdcache, is_auth());
    add_dirfrag(dir);
  }
  return dir;
}

// Register a newly created dirfrag; applies the sticky pin if sticky refs are
// outstanding and re-evaluates export pinning.
CDir *CInode::add_dirfrag(CDir *dir)
{
  auto em = dirfrags.emplace(std::piecewise_construct, std::forward_as_tuple(dir->dirfrag().frag), std::forward_as_tuple(dir));
  ceph_assert(em.second);

  if (stickydir_ref > 0) {
    dir->state_set(CDir::STATE_STICKY);
    dir->get(CDir::PIN_STICKY);
  }

  maybe_export_pin();

  return dir;
}

// Tear down and delete the dirfrag 'fg'; it must end up with zero refs after
// null dentries, the dirty flag, and the sticky pin are cleared.
void CInode::close_dirfrag(frag_t fg)
{
  dout(14) << __func__ << " " << fg << dendl;
  ceph_assert(dirfrags.count(fg));

  CDir *dir = dirfrags[fg];
  dir->remove_null_dentries();

  // clear dirty flag
  if (dir->is_dirty())
    dir->mark_clean();

  if (stickydir_ref > 0) {
    dir->state_clear(CDir::STATE_STICKY);
    dir->put(CDir::PIN_STICKY);
  }

  if (dir->is_subtree_root())
    num_subtree_roots--;

  // dump any remaining dentries, for debugging purposes
  for (const auto &p : dir->items)
    dout(14) << __func__ << " LEFTOVER dn " << *p.second << dendl;

  ceph_assert(dir->get_num_ref() == 0);
  delete dir;
  dirfrags.erase(fg);
}

void CInode::close_dirfrags()
{
  while (!dirfrags.empty())
    close_dirfrag(dirfrags.begin()->first);
}

// True when some dirfrag is a subtree root; auth == -1 matches any authority,
// otherwise the subtree root's first auth must equal 'auth'.
bool CInode::has_subtree_root_dirfrag(int auth)
{
  if (num_subtree_roots > 0) {
    if (auth == -1)
      return true;
    for (const auto &p : dirfrags) {
      if (p.second->is_subtree_root() &&
          p.second->dir_auth.first == auth)
        return true;
    }
  }
  return false;
}

bool CInode::has_subtree_or_exporting_dirfrag()
{
  if (num_subtree_roots > 0 || num_exporting_dirs > 0)
    return true;
  return false;
}

// Reference-counted "sticky" mode: while held, all dirfrags stay pinned in
// cache (the flag/pin is applied on the 0 -> 1 transition).
void CInode::get_stickydirs()
{
  if (stickydir_ref == 0) {
    get(PIN_STICKYDIRS);
    for (const auto &p : dirfrags) {
      p.second->state_set(CDir::STATE_STICKY);
      p.second->get(CDir::PIN_STICKY);
    }
  }
  stickydir_ref++;
}

// Drop a sticky reference; on the 1 -> 0 transition the sticky flag/pin is
// removed from every dirfrag.
void CInode::put_stickydirs()
{
  ceph_assert(stickydir_ref > 0);
  stickydir_ref--;
  if (stickydir_ref == 0) {
    put(PIN_STICKYDIRS);
    for (const auto &p : dirfrags) {
      p.second->state_clear(CDir::STATE_STICKY);
      p.second->put(CDir::PIN_STICKY);
    }
  }
}


// pins

// First reference taken on this inode: pin the linking dentry too.
void CInode::first_get()
{
  // pin my dentry?
  if (parent)
    parent->get(CDentry::PIN_INODEPIN);
}

// Last reference dropped: release the dentry pin.
void CInode::last_put()
{
  // unpin my dentry?
  if (parent)
    parent->put(CDentry::PIN_INODEPIN);
}

// Called when only the dirty/dirty-parent pins remain; lets the cache decide
// whether a stray inode can be reaped.
void CInode::_put()
{
  if (get_num_ref() == (int)is_dirty() + (int)is_dirty_parent())
    mdcache->maybe_eval_stray(this, true);
}

void CInode::add_remote_parent(CDentry *p)
{
  if (remote_parents.empty())
    get(PIN_REMOTEPARENT);
  remote_parents.insert(p);
}
void CInode::remove_remote_parent(CDentry *p)
{
  remote_parents.erase(p);
  if (remote_parents.empty())
    put(PIN_REMOTEPARENT);
}

CDir *CInode::get_parent_dir()
{
  if (parent)
    return parent->dir;
  return NULL;
}
CDir *CInode::get_projected_parent_dir()
{
  CDentry *p = get_projected_parent_dn();
  if (p)
    return p->dir;
  return NULL;
}
CInode *CInode::get_parent_inode()
{
  if (parent)
    return parent->dir->inode;
  return NULL;
}

// Walk up the (oldest) parent-dentry chain; true when 'this' appears on the
// ancestor path of 'other'.
bool CInode::is_ancestor_of(const CInode *other) const
{
  while (other) {
    if (other == this)
      return true;
    const CDentry *pdn = other->get_oldest_parent_dn();
    if (!pdn) {
      ceph_assert(other->is_base());
      break;
    }
    other = pdn->get_dir()->get_inode();
  }
  return false;
}

// Same as is_ancestor_of but follows projected (uncommitted) parent links.
bool CInode::is_projected_ancestor_of(const CInode *other) const
{
  while (other) {
    if (other == this)
      return true;
    const CDentry *pdn = other->get_projected_parent_dn();
    if (!pdn) {
      ceph_assert(other->is_base());
      break;
    }
    other = pdn->get_dir()->get_inode();
  }
  return false;
}

/*
 * Because a non-directory inode may have multiple links, the use_parent
 * argument allows selecting which parent to use for path construction. This
 * argument is only meaningful for the final component (i.e. the first of the
 * nested calls) because directories cannot have multiple hard links. If
 * use_parent is NULL and projected is true, the primary parent's projected
 * inode is used all the way up the path chain. Otherwise the primary parent
 * stable inode is used.
 */
void CInode::make_path_string(string& s, bool projected, const CDentry *use_parent) const
{
  if (!use_parent) {
    use_parent = projected ? get_projected_parent_dn() : parent;
  }

  if (use_parent) {
    use_parent->make_path_string(s, projected);
  } else if (is_root()) {
    s = "";
  } else if (is_mdsdir()) {
    // per-rank mds dir: render as "~mds<rank>"
    char t[40];
    uint64_t eino(ino());
    eino -= MDS_INO_MDSDIR_OFFSET;
    snprintf(t, sizeof(t), "~mds%" PRId64, eino);
    s = t;
  } else {
    // detached inode: render as "#<hex ino>"
    char n[40];
    uint64_t eino(ino());
    snprintf(n, sizeof(n), "#%" PRIx64, eino);
    s += n;
  }
}

// filepath variant of path construction; base inodes yield an ino-anchored
// (empty-component) filepath.
void CInode::make_path(filepath& fp, bool projected) const
{
  const CDentry *use_parent = projected ? get_projected_parent_dn() : parent;
  if (use_parent) {
    ceph_assert(!is_base());
    use_parent->make_path(fp, projected);
  } else {
    fp = filepath(ino());
  }
}

// Stray dentries are named by the inode number in lowercase hex.
void CInode::name_stray_dentry(string& dname)
{
  char s[20];
  snprintf(s, sizeof(s), "%llx", (unsigned long long)inode.ino.val);
  dname = s;
}

// Reserve the next projected version for this inode (delegating to the
// projected parent dentry when one exists; base inodes bump locally).
version_t CInode::pre_dirty()
{
  version_t pv;
  CDentry* _cdentry = get_projected_parent_dn();
  if (_cdentry) {
    pv = _cdentry->pre_dirty(get_projected_version());
    dout(10) << "pre_dirty " << pv << " (current v " << inode.version << ")" << dendl;
  } else {
    ceph_assert(is_base());
    pv = get_projected_version() + 1;
  }
  // force update backtrace for old format inode (see mempool_inode::decode)
  if (inode.backtrace_version == 0 && !projected_nodes.empty()) {
    mempool_inode &pi = projected_nodes.back().inode;
    if (pi.backtrace_version == 0)
      pi.update_backtrace(pv);
  }
  return pv;
}

// Set the DIRTY flag/pin (first time only) and (re)queue this inode on the
// given log segment's dirty list.
void CInode::_mark_dirty(LogSegment *ls)
{
  if (!state_test(STATE_DIRTY)) {
    state_set(STATE_DIRTY);
    get(PIN_DIRTY);
    ceph_assert(ls);
  }

  // move myself to this segment's dirty list
  if (ls)
    ls->dirty_inodes.push_back(&item_dirty);
}

void CInode::mark_dirty(version_t pv, LogSegment *ls) {

  dout(10) << __func__ << " " << *this << dendl;

  /*
    NOTE: I may already be dirty, but this fn _still_ needs to be called so that
    the directory is (perhaps newly) dirtied, and so that parent_dir_version is
    updated below.
  */

  // only auth can get dirty.  "dirty" async data in replicas is relative to
  // filelock state, not the dirty flag.
  ceph_assert(is_auth());

  // touch my private version
  ceph_assert(inode.version < pv);
  inode.version = pv;
  _mark_dirty(ls);

  // mark dentry too
  if (parent)
    parent->mark_dirty(pv, ls);
}


// Clear the DIRTY flag/pin and detach from the log segment's dirty list.
void CInode::mark_clean()
{
  dout(10) << __func__ << " " << *this << dendl;
  if (state_test(STATE_DIRTY)) {
    state_clear(STATE_DIRTY);
    put(PIN_DIRTY);

    // remove myself from ls dirty list
    item_dirty.remove_myself();
  }
}


// --------------
// per-inode storage
// (currently for root inode only)

// Completion for CInode::store(): forwards the result to _stored() with the
// version that was written.
struct C_IO_Inode_Stored : public CInodeIOContext {
  version_t version;
  Context *fin;
  C_IO_Inode_Stored(CInode *i, version_t v, Context *f) : CInodeIOContext(i), version(v), fin(f) {}
  void finish(int r) override {
    in->_stored(r, version, fin);
  }
  void print(ostream& out) const override {
    out << "inode_store(" << in->ino() << ")";
  }
};

// RADOS object name for an inode/dirfrag: "<ino>.<frag>" plus an optional
// suffix (e.g. ".inode"); asserts the result fits the fixed buffer.
object_t InodeStoreBase::get_object_name(inodeno_t ino, frag_t fg, std::string_view suffix)
{
  char n[60];
  snprintf(n, sizeof(n), "%llx.%08llx", (long long unsigned)ino, (long long unsigned)fg);
  ceph_assert(strlen(n) + suffix.size() < sizeof n);
  strncat(n, suffix.data(), suffix.size());
  return object_t(n);
}

// Persist a base inode (e.g. root) to its ".inode" object in the metadata
// pool; 'fin' fires from _stored() when the write completes.
void CInode::store(MDSContext *fin)
{
  dout(10) << __func__ << " " << get_version() << dendl;
  ceph_assert(is_base());

  if (snaprealm)
    purge_stale_snap_data(snaprealm->get_snaps());

  // encode
  bufferlist bl;
  string magic = CEPH_FS_ONDISK_MAGIC;
  using ceph::encode;
  encode(magic, bl);
  encode_store(bl, mdcache->mds->mdsmap->get_up_features());

  // write it.
  SnapContext snapc;
  ObjectOperation m;
  m.write_full(bl);

  object_t oid = CInode::get_object_name(ino(), frag_t(), ".inode");
  object_locator_t oloc(mdcache->mds->mdsmap->get_metadata_pool());

  Context *newfin =
    new C_OnFinisher(new C_IO_Inode_Stored(this, get_version(), fin),
                     mdcache->mds->finisher);
  mdcache->mds->objecter->mutate(oid, oloc, m, snapc,
                                 ceph::real_clock::now(), 0,
                                 newfin);
}

// Store completion: on error, log/propagate via the MDS write-error path; on
// success, mark clean only if nothing newer has been projected since.
void CInode::_stored(int r, version_t v, Context *fin)
{
  if (r < 0) {
    dout(1) << "store error " << r << " v " << v << " on " << *this << dendl;
    mdcache->mds->clog->error() << "failed to store inode " << ino()
                                << " object: " << cpp_strerror(r);
    mdcache->mds->handle_write_error(r);
    fin->complete(r);
    return;
  }

  dout(10) << __func__ << " " << v << " on " << *this << dendl;
  if (v == get_projected_version())
    mark_clean();

  fin->complete(0);
}

// Flush this inode's dirty state: backtrace if dirty-parent, and either the
// base-inode object or the parent dirfrag commit if dirty. 'fin' fires once
// all gathered sub-ops finish (immediately if nothing to do).
void CInode::flush(MDSContext *fin)
{
  dout(10) << __func__ << " " << *this << dendl;
  ceph_assert(is_auth() && can_auth_pin());

  MDSGatherBuilder gather(g_ceph_context);

  if (is_dirty_parent()) {
    store_backtrace(gather.new_sub());
  }
  if (is_dirty()) {
    if (is_base()) {
      store(gather.new_sub());
    } else {
      parent->dir->commit(0, gather.new_sub());
    }
  }

  if (gather.has_subs()) {
    gather.set_finisher(fin);
    gather.activate();
  } else {
    fin->complete(0);
  }
}

// Completion for CInode::fetch(): carries both the legacy-xattr and the
// .inode-object buffers to _fetched().
struct C_IO_Inode_Fetched : public CInodeIOContext {
  bufferlist bl, bl2;
  Context *fin;
  C_IO_Inode_Fetched(CInode *i, Context *f) : CInodeIOContext(i), fin(f) {}
  void finish(int r) override {
    // Ignore 'r', because we fetch from two places, so r is usually ENOENT
    in->_fetched(bl, bl2, fin);
  }
  void print(ostream& out) const override {
    out << "inode_fetch(" << in->ino() << ")";
  }
};

// Read a base inode from RADOS, probing both the legacy location (an "inode"
// xattr on the dirfrag object) and the current ".inode" object in parallel.
void CInode::fetch(MDSContext *fin)
{
  dout(10) << __func__ << dendl;

  C_IO_Inode_Fetched *c = new C_IO_Inode_Fetched(this, fin);
  C_GatherBuilder gather(g_ceph_context, new C_OnFinisher(c,
mdcache->mds->finisher)); + + object_t oid = CInode::get_object_name(ino(), frag_t(), ""); + object_locator_t oloc(mdcache->mds->mdsmap->get_metadata_pool()); + + // Old on-disk format: inode stored in xattr of a dirfrag + ObjectOperation rd; + rd.getxattr("inode", &c->bl, NULL); + mdcache->mds->objecter->read(oid, oloc, rd, CEPH_NOSNAP, (bufferlist*)NULL, 0, gather.new_sub()); + + // Current on-disk format: inode stored in a .inode object + object_t oid2 = CInode::get_object_name(ino(), frag_t(), ".inode"); + mdcache->mds->objecter->read(oid2, oloc, 0, 0, CEPH_NOSNAP, &c->bl2, 0, gather.new_sub()); + + gather.activate(); +} + +void CInode::_fetched(bufferlist& bl, bufferlist& bl2, Context *fin) +{ + dout(10) << __func__ << " got " << bl.length() << " and " << bl2.length() << dendl; + bufferlist::const_iterator p; + if (bl2.length()) { + p = bl2.cbegin(); + } else if (bl.length()) { + p = bl.cbegin(); + } else { + derr << "No data while reading inode " << ino() << dendl; + fin->complete(-ENOENT); + return; + } + + using ceph::decode; + // Attempt decode + try { + string magic; + decode(magic, p); + dout(10) << " magic is '" << magic << "' (expecting '" + << CEPH_FS_ONDISK_MAGIC << "')" << dendl; + if (magic != CEPH_FS_ONDISK_MAGIC) { + dout(0) << "on disk magic '" << magic << "' != my magic '" << CEPH_FS_ONDISK_MAGIC + << "'" << dendl; + fin->complete(-EINVAL); + } else { + decode_store(p); + dout(10) << "_fetched " << *this << dendl; + fin->complete(0); + } + } catch (buffer::error &err) { + derr << "Corrupt inode " << ino() << ": " << err << dendl; + fin->complete(-EINVAL); + return; + } +} + +void CInode::build_backtrace(int64_t pool, inode_backtrace_t& bt) +{ + bt.ino = inode.ino; + bt.ancestors.clear(); + bt.pool = pool; + + CInode *in = this; + CDentry *pdn = get_parent_dn(); + while (pdn) { + CInode *diri = pdn->get_dir()->get_inode(); + bt.ancestors.push_back(inode_backpointer_t(diri->ino(), pdn->get_name(), in->inode.version)); + in = diri; + pdn = 
in->get_parent_dn();
  }
  for (auto &p : inode.old_pools) {
    // don't add our own pool id to old_pools to avoid looping (e.g. setlayout 0, 1, 0)
    if (p != pool)
      bt.old_pools.insert(p);
  }
}

// Completion context for store_backtrace(); forwards to _stored_backtrace().
struct C_IO_Inode_StoredBacktrace : public CInodeIOContext {
  version_t version;
  Context *fin;
  C_IO_Inode_StoredBacktrace(CInode *i, version_t v, Context *f) : CInodeIOContext(i), version(v), fin(f) {}
  void finish(int r) override {
    in->_stored_backtrace(r, version, fin);
  }
  void print(ostream& out) const override {
    out << "backtrace_store(" << in->ino() << ")";
  }
};

// Persist this inode's backtrace (and layout) as "parent"/"layout" xattrs on
// its object in the backtrace pool.  Holds an auth_pin until _stored_backtrace
// runs.  If STATE_DIRTYPOOL is set, also rewrite the "parent" xattr in every
// old pool so stale copies point readers at the current pool.
void CInode::store_backtrace(MDSContext *fin, int op_prio)
{
  dout(10) << __func__ << " on " << *this << dendl;
  ceph_assert(is_dirty_parent());

  if (op_prio < 0)
    op_prio = CEPH_MSG_PRIO_DEFAULT;

  auth_pin(this);

  const int64_t pool = get_backtrace_pool();
  inode_backtrace_t bt;
  build_backtrace(pool, bt);
  bufferlist parent_bl;
  using ceph::encode;
  encode(bt, parent_bl);

  ObjectOperation op;
  op.priority = op_prio;
  op.create(false);
  op.setxattr("parent", parent_bl);

  bufferlist layout_bl;
  encode(inode.layout, layout_bl, mdcache->mds->mdsmap->get_up_features());
  op.setxattr("layout", layout_bl);

  SnapContext snapc;
  object_t oid = get_object_name(ino(), frag_t(), "");
  object_locator_t oloc(pool);
  Context *fin2 = new C_OnFinisher(
    new C_IO_Inode_StoredBacktrace(this, inode.backtrace_version, fin),
    mdcache->mds->finisher);

  if (!state_test(STATE_DIRTYPOOL) || inode.old_pools.empty()) {
    // fast path: only the current pool needs the backtrace
    dout(20) << __func__ << ": no dirtypool or no old pools" << dendl;
    mdcache->mds->objecter->mutate(oid, oloc, op, snapc,
				   ceph::real_clock::now(),
				   0, fin2);
    return;
  }

  C_GatherBuilder gather(g_ceph_context, fin2);
  mdcache->mds->objecter->mutate(oid, oloc, op, snapc,
				 ceph::real_clock::now(),
				 0, gather.new_sub());

  // In the case where DIRTYPOOL is set, we update all old pools backtraces
  // such that anyone reading them will see the new pool ID in
  // inode_backtrace_t::pool and go read everything else from there.
  for (const auto &p : inode.old_pools) {
    if (p == pool)
      continue;

    dout(20) << __func__ << ": updating old pool " << p << dendl;

    ObjectOperation op;
    op.priority = op_prio;
    op.create(false);
    op.setxattr("parent", parent_bl);

    object_locator_t oloc(p);
    mdcache->mds->objecter->mutate(oid, oloc, op, snapc,
				   ceph::real_clock::now(),
				   0, gather.new_sub());
  }
  gather.activate();
}

// Completion for store_backtrace(): tolerate ENOENT caused by a deleted data
// pool, otherwise surface write errors; drop the auth_pin taken by
// store_backtrace() and clear the dirty-parent state if nothing newer is
// pending.
void CInode::_stored_backtrace(int r, version_t v, Context *fin)
{
  if (r == -ENOENT) {
    const int64_t pool = get_backtrace_pool();
    bool exists = mdcache->mds->objecter->with_osdmap(
      [pool](const OSDMap &osd_map) {
        return osd_map.have_pg_pool(pool);
      });

    // This ENOENT is because the pool doesn't exist (the user deleted it
    // out from under us), so the backtrace can never be written, so pretend
    // to succeed so that the user can proceed to e.g. delete the file.
    if (!exists) {
      dout(4) << __func__ << " got ENOENT: a data pool was deleted "
                 "beneath us!"
<< dendl;
      r = 0;
    }
  }

  if (r < 0) {
    dout(1) << "store backtrace error " << r << " v " << v << dendl;
    mdcache->mds->clog->error() << "failed to store backtrace on ino "
				<< ino() << " object"
				<< ", pool " << get_backtrace_pool()
				<< ", errno " << r;
    mdcache->mds->handle_write_error(r);
    if (fin)
      fin->complete(r);
    return;
  }

  dout(10) << __func__ << " v " << v <<  dendl;

  // balances the auth_pin taken in store_backtrace()
  auth_unpin(this);
  if (v == inode.backtrace_version)
    clear_dirty_parent();
  if (fin)
    fin->complete(0);
}

// Asynchronously read this inode's "parent" xattr from its backtrace pool.
void CInode::fetch_backtrace(Context *fin, bufferlist *backtrace)
{
  mdcache->fetch_backtrace(inode.ino, get_backtrace_pool(), *backtrace, fin);
}

// Mark this inode's backtrace dirty (and optionally its pool list), pinning
// it and queueing it on the log segment's dirty-parent list so the backtrace
// gets persisted before the segment is trimmed.
void CInode::mark_dirty_parent(LogSegment *ls, bool dirty_pool)
{
  if (!state_test(STATE_DIRTYPARENT)) {
    dout(10) << __func__ << dendl;
    state_set(STATE_DIRTYPARENT);
    get(PIN_DIRTYPARENT);
    ceph_assert(ls);
  }
  if (dirty_pool)
    state_set(STATE_DIRTYPOOL);
  if (ls)
    ls->dirty_parent_inodes.push_back(&item_dirty_parent);
}

// Undo mark_dirty_parent(): clear both dirty flags, drop the pin and leave
// the log segment's dirty-parent list.
void CInode::clear_dirty_parent()
{
  if (state_test(STATE_DIRTYPARENT)) {
    dout(10) << __func__ << dendl;
    state_clear(STATE_DIRTYPARENT);
    state_clear(STATE_DIRTYPOOL);
    put(PIN_DIRTYPARENT);
    item_dirty_parent.remove_myself();
  }
}

// Scrub helper: compare a directory inode's on-disk backtrace (first
// ancestor) against its in-cache parent dentry.  On mismatch or read error,
// log it and re-mark the parent dirty so a fresh backtrace is written.
void CInode::verify_diri_backtrace(bufferlist &bl, int err)
{
  if (is_base() || is_dirty_parent() || !is_auth())
    return;

  dout(10) << __func__ << dendl;

  if (err == 0) {
    inode_backtrace_t backtrace;
    using ceph::decode;
    decode(backtrace, bl);
    CDentry *pdn = get_parent_dn();
    if (backtrace.ancestors.empty() ||
	backtrace.ancestors[0].dname != pdn->get_name() ||
	backtrace.ancestors[0].dirino != pdn->get_dir()->ino())
      err = -EINVAL;
  }

  if (err) {
    MDSRank *mds = mdcache->mds;
    mds->clog->error() << "bad backtrace on directory inode " << ino();
    // abort only when mds_verify_backtrace > 1 (strict verification mode)
    ceph_assert(!"bad backtrace" == (g_conf()->mds_verify_backtrace > 1));

    mark_dirty_parent(mds->mdlog->get_current_segment(), false);
    mds->mdlog->flush();
  }
}

// ------------------
// parent dir


// Serialize the InodeStore payload without a version envelope; the symlink
// target is only encoded for symlinks, and a (possibly empty) snap blob is
// always written so the field layout is stable.
void InodeStoreBase::encode_bare(bufferlist &bl, uint64_t features,
				 const bufferlist *snap_blob) const
{
  using ceph::encode;
  encode(inode, bl, features);
  if (is_symlink())
    encode(symlink, bl);
  encode(dirfragtree, bl);
  encode(xattrs, bl);
  if (snap_blob)
    encode(*snap_blob, bl);
  else
    encode(bufferlist(), bl);
  encode(old_inodes, bl, features);
  encode(oldest_snap, bl);
  encode(damage_flags, bl);
}

// Versioned wrapper around encode_bare().
// NOTE(review): encodes struct version 6 while decode() below uses
// DECODE_START_LEGACY_COMPAT_LEN(5, ...) — confirm the macro pair is
// intentional for this version bump.
void InodeStoreBase::encode(bufferlist &bl, uint64_t features,
			    const bufferlist *snap_blob) const
{
  ENCODE_START(6, 4, bl);
  encode_bare(bl, features, snap_blob);
  ENCODE_FINISH(bl);
}

// Full-inode serialization used by store(): payload plus the snap blob.
// NOTE(review): the 'features' parameter is ignored; the mdsmap up-features
// are passed instead.
void CInode::encode_store(bufferlist& bl, uint64_t features)
{
  bufferlist snap_blob;
  encode_snap_blob(snap_blob);
  InodeStoreBase::encode(bl, mdcache->mds->mdsmap->get_up_features(),
			 &snap_blob);
}

// Inverse of encode_bare(); 'struct_v' drives the legacy-format branches
// (v2 inline default layout, v5+ trailing oldest_snap/damage_flags).
void InodeStoreBase::decode_bare(bufferlist::const_iterator &bl,
			      bufferlist& snap_blob, __u8 struct_v)
{
  using ceph::decode;
  decode(inode, bl);
  if (is_symlink()) {
    std::string tmp;
    decode(tmp, bl);
    symlink = std::string_view(tmp);
  }
  decode(dirfragtree, bl);
  decode_noshare(xattrs, bl);
  decode(snap_blob, bl);

  decode(old_inodes, bl);
  if (struct_v == 2 && inode.is_dir()) {
    bool default_layout_exists;
    decode(default_layout_exists, bl);
    if (default_layout_exists) {
      decode(struct_v, bl); // this was a default_file_layout
      decode(inode.layout, bl); // but we only care about the layout portion
    }
  }

  if (struct_v >= 5) {
    // InodeStore is embedded in dentries without proper versioning, so
    // we consume up to the end of the buffer
    if (!bl.end()) {
      decode(oldest_snap, bl);
    }

    if (!bl.end()) {
      decode(damage_flags, bl);
    }
  }
}


// Versioned wrapper around decode_bare().
void InodeStoreBase::decode(bufferlist::const_iterator &bl, bufferlist& snap_blob)
{
  DECODE_START_LEGACY_COMPAT_LEN(5, 4, 4, bl);
  decode_bare(bl, snap_blob, struct_v);
  DECODE_FINISH(bl);
}

void
CInode::decode_store(bufferlist::const_iterator& bl) +{ + bufferlist snap_blob; + InodeStoreBase::decode(bl, snap_blob); + decode_snap_blob(snap_blob); +} + +// ------------------ +// locking + +void CInode::set_object_info(MDSCacheObjectInfo &info) +{ + info.ino = ino(); + info.snapid = last; +} + +void CInode::encode_lock_state(int type, bufferlist& bl) +{ + using ceph::encode; + encode(first, bl); + if (!is_base()) + encode(parent->first, bl); + + switch (type) { + case CEPH_LOCK_IAUTH: + encode(inode.version, bl); + encode(inode.ctime, bl); + encode(inode.mode, bl); + encode(inode.uid, bl); + encode(inode.gid, bl); + break; + + case CEPH_LOCK_ILINK: + encode(inode.version, bl); + encode(inode.ctime, bl); + encode(inode.nlink, bl); + break; + + case CEPH_LOCK_IDFT: + if (is_auth()) { + encode(inode.version, bl); + } else { + // treat flushing as dirty when rejoining cache + bool dirty = dirfragtreelock.is_dirty_or_flushing(); + encode(dirty, bl); + } + { + // encode the raw tree + encode(dirfragtree, bl); + + // also specify which frags are mine + set<frag_t> myfrags; + list<CDir*> dfls; + get_dirfrags(dfls); + for (list<CDir*>::iterator p = dfls.begin(); p != dfls.end(); ++p) + if ((*p)->is_auth()) { + frag_t fg = (*p)->get_frag(); + myfrags.insert(fg); + } + encode(myfrags, bl); + } + break; + + case CEPH_LOCK_IFILE: + if (is_auth()) { + encode(inode.version, bl); + encode(inode.ctime, bl); + encode(inode.mtime, bl); + encode(inode.atime, bl); + encode(inode.time_warp_seq, bl); + if (!is_dir()) { + encode(inode.layout, bl, mdcache->mds->mdsmap->get_up_features()); + encode(inode.size, bl); + encode(inode.truncate_seq, bl); + encode(inode.truncate_size, bl); + encode(inode.client_ranges, bl); + encode(inode.inline_data, bl); + } + } else { + // treat flushing as dirty when rejoining cache + bool dirty = filelock.is_dirty_or_flushing(); + encode(dirty, bl); + } + + { + dout(15) << __func__ << " inode.dirstat is " << inode.dirstat << dendl; + 
encode(inode.dirstat, bl); // only meaningful if i am auth. + bufferlist tmp; + __u32 n = 0; + for (const auto &p : dirfrags) { + frag_t fg = p.first; + CDir *dir = p.second; + if (is_auth() || dir->is_auth()) { + fnode_t *pf = dir->get_projected_fnode(); + dout(15) << fg << " " << *dir << dendl; + dout(20) << fg << " fragstat " << pf->fragstat << dendl; + dout(20) << fg << " accounted_fragstat " << pf->accounted_fragstat << dendl; + encode(fg, tmp); + encode(dir->first, tmp); + encode(pf->fragstat, tmp); + encode(pf->accounted_fragstat, tmp); + n++; + } + } + encode(n, bl); + bl.claim_append(tmp); + } + break; + + case CEPH_LOCK_INEST: + if (is_auth()) { + encode(inode.version, bl); + } else { + // treat flushing as dirty when rejoining cache + bool dirty = nestlock.is_dirty_or_flushing(); + encode(dirty, bl); + } + { + dout(15) << __func__ << " inode.rstat is " << inode.rstat << dendl; + encode(inode.rstat, bl); // only meaningful if i am auth. + bufferlist tmp; + __u32 n = 0; + for (const auto &p : dirfrags) { + frag_t fg = p.first; + CDir *dir = p.second; + if (is_auth() || dir->is_auth()) { + fnode_t *pf = dir->get_projected_fnode(); + dout(10) << fg << " " << *dir << dendl; + dout(10) << fg << " " << pf->rstat << dendl; + dout(10) << fg << " " << pf->rstat << dendl; + dout(10) << fg << " " << dir->dirty_old_rstat << dendl; + encode(fg, tmp); + encode(dir->first, tmp); + encode(pf->rstat, tmp); + encode(pf->accounted_rstat, tmp); + encode(dir->dirty_old_rstat, tmp); + n++; + } + } + encode(n, bl); + bl.claim_append(tmp); + } + break; + + case CEPH_LOCK_IXATTR: + encode(inode.version, bl); + encode(inode.ctime, bl); + encode(xattrs, bl); + break; + + case CEPH_LOCK_ISNAP: + encode(inode.version, bl); + encode(inode.ctime, bl); + encode_snap(bl); + break; + + case CEPH_LOCK_IFLOCK: + encode(inode.version, bl); + _encode_file_locks(bl); + break; + + case CEPH_LOCK_IPOLICY: + if (inode.is_dir()) { + encode(inode.version, bl); + encode(inode.ctime, bl); + 
encode(inode.layout, bl, mdcache->mds->mdsmap->get_up_features()); + encode(inode.quota, bl); + encode(inode.export_pin, bl); + } + break; + + default: + ceph_abort(); + } +} + + +/* for more info on scatterlocks, see comments by Locker::scatter_writebehind */ + +void CInode::decode_lock_state(int type, const bufferlist& bl) +{ + auto p = bl.cbegin(); + utime_t tm; + + snapid_t newfirst; + using ceph::decode; + decode(newfirst, p); + if (!is_auth() && newfirst != first) { + dout(10) << __func__ << " first " << first << " -> " << newfirst << dendl; + first = newfirst; + } + if (!is_base()) { + decode(newfirst, p); + if (!parent->is_auth() && newfirst != parent->first) { + dout(10) << __func__ << " parent first " << first << " -> " << newfirst << dendl; + parent->first = newfirst; + } + } + + switch (type) { + case CEPH_LOCK_IAUTH: + decode(inode.version, p); + decode(tm, p); + if (inode.ctime < tm) inode.ctime = tm; + decode(inode.mode, p); + decode(inode.uid, p); + decode(inode.gid, p); + break; + + case CEPH_LOCK_ILINK: + decode(inode.version, p); + decode(tm, p); + if (inode.ctime < tm) inode.ctime = tm; + decode(inode.nlink, p); + break; + + case CEPH_LOCK_IDFT: + if (is_auth()) { + bool replica_dirty; + decode(replica_dirty, p); + if (replica_dirty) { + dout(10) << __func__ << " setting dftlock dirty flag" << dendl; + dirfragtreelock.mark_dirty(); // ok bc we're auth and caller will handle + } + } else { + decode(inode.version, p); + } + { + fragtree_t temp; + decode(temp, p); + set<frag_t> authfrags; + decode(authfrags, p); + if (is_auth()) { + // auth. believe replica's auth frags only. + for (set<frag_t>::iterator p = authfrags.begin(); p != authfrags.end(); ++p) + if (!dirfragtree.is_leaf(*p)) { + dout(10) << " forcing frag " << *p << " to leaf (split|merge)" << dendl; + dirfragtree.force_to_leaf(g_ceph_context, *p); + dirfragtreelock.mark_dirty(); // ok bc we're auth and caller will handle + } + } else { + // replica. 
take the new tree, BUT make sure any open + // dirfrags remain leaves (they may have split _after_ this + // dft was scattered, or we may still be be waiting on the + // notify from the auth) + dirfragtree.swap(temp); + for (const auto &p : dirfrags) { + if (!dirfragtree.is_leaf(p.first)) { + dout(10) << " forcing open dirfrag " << p.first << " to leaf (racing with split|merge)" << dendl; + dirfragtree.force_to_leaf(g_ceph_context, p.first); + } + if (p.second->is_auth()) + p.second->state_clear(CDir::STATE_DIRTYDFT); + } + } + if (g_conf()->mds_debug_frag) + verify_dirfrags(); + } + break; + + case CEPH_LOCK_IFILE: + if (!is_auth()) { + decode(inode.version, p); + decode(tm, p); + if (inode.ctime < tm) inode.ctime = tm; + decode(inode.mtime, p); + decode(inode.atime, p); + decode(inode.time_warp_seq, p); + if (!is_dir()) { + decode(inode.layout, p); + decode(inode.size, p); + decode(inode.truncate_seq, p); + decode(inode.truncate_size, p); + decode(inode.client_ranges, p); + decode(inode.inline_data, p); + } + } else { + bool replica_dirty; + decode(replica_dirty, p); + if (replica_dirty) { + dout(10) << __func__ << " setting filelock dirty flag" << dendl; + filelock.mark_dirty(); // ok bc we're auth and caller will handle + } + } + { + frag_info_t dirstat; + decode(dirstat, p); + if (!is_auth()) { + dout(10) << " taking inode dirstat " << dirstat << " for " << *this << dendl; + inode.dirstat = dirstat; // take inode summation if replica + } + __u32 n; + decode(n, p); + dout(10) << " ...got " << n << " fragstats on " << *this << dendl; + while (n--) { + frag_t fg; + snapid_t fgfirst; + frag_info_t fragstat; + frag_info_t accounted_fragstat; + decode(fg, p); + decode(fgfirst, p); + decode(fragstat, p); + decode(accounted_fragstat, p); + dout(10) << fg << " [" << fgfirst << ",head] " << dendl; + dout(10) << fg << " fragstat " << fragstat << dendl; + dout(20) << fg << " accounted_fragstat " << accounted_fragstat << dendl; + + CDir *dir = get_dirfrag(fg); + if 
(is_auth()) { + ceph_assert(dir); // i am auth; i had better have this dir open + dout(10) << fg << " first " << dir->first << " -> " << fgfirst + << " on " << *dir << dendl; + dir->first = fgfirst; + dir->fnode.fragstat = fragstat; + dir->fnode.accounted_fragstat = accounted_fragstat; + dir->first = fgfirst; + if (!(fragstat == accounted_fragstat)) { + dout(10) << fg << " setting filelock updated flag" << dendl; + filelock.mark_dirty(); // ok bc we're auth and caller will handle + } + } else { + if (dir && dir->is_auth()) { + dout(10) << fg << " first " << dir->first << " -> " << fgfirst + << " on " << *dir << dendl; + dir->first = fgfirst; + fnode_t *pf = dir->get_projected_fnode(); + finish_scatter_update(&filelock, dir, + inode.dirstat.version, pf->accounted_fragstat.version); + } + } + } + } + break; + + case CEPH_LOCK_INEST: + if (is_auth()) { + bool replica_dirty; + decode(replica_dirty, p); + if (replica_dirty) { + dout(10) << __func__ << " setting nestlock dirty flag" << dendl; + nestlock.mark_dirty(); // ok bc we're auth and caller will handle + } + } else { + decode(inode.version, p); + } + { + nest_info_t rstat; + decode(rstat, p); + if (!is_auth()) { + dout(10) << " taking inode rstat " << rstat << " for " << *this << dendl; + inode.rstat = rstat; // take inode summation if replica + } + __u32 n; + decode(n, p); + while (n--) { + frag_t fg; + snapid_t fgfirst; + nest_info_t rstat; + nest_info_t accounted_rstat; + decltype(CDir::dirty_old_rstat) dirty_old_rstat; + decode(fg, p); + decode(fgfirst, p); + decode(rstat, p); + decode(accounted_rstat, p); + decode(dirty_old_rstat, p); + dout(10) << fg << " [" << fgfirst << ",head]" << dendl; + dout(10) << fg << " rstat " << rstat << dendl; + dout(10) << fg << " accounted_rstat " << accounted_rstat << dendl; + dout(10) << fg << " dirty_old_rstat " << dirty_old_rstat << dendl; + + CDir *dir = get_dirfrag(fg); + if (is_auth()) { + ceph_assert(dir); // i am auth; i had better have this dir open + dout(10) << fg 
<< " first " << dir->first << " -> " << fgfirst + << " on " << *dir << dendl; + dir->first = fgfirst; + dir->fnode.rstat = rstat; + dir->fnode.accounted_rstat = accounted_rstat; + dir->dirty_old_rstat.swap(dirty_old_rstat); + if (!(rstat == accounted_rstat) || !dir->dirty_old_rstat.empty()) { + dout(10) << fg << " setting nestlock updated flag" << dendl; + nestlock.mark_dirty(); // ok bc we're auth and caller will handle + } + } else { + if (dir && dir->is_auth()) { + dout(10) << fg << " first " << dir->first << " -> " << fgfirst + << " on " << *dir << dendl; + dir->first = fgfirst; + fnode_t *pf = dir->get_projected_fnode(); + finish_scatter_update(&nestlock, dir, + inode.rstat.version, pf->accounted_rstat.version); + } + } + } + } + break; + + case CEPH_LOCK_IXATTR: + decode(inode.version, p); + decode(tm, p); + if (inode.ctime < tm) inode.ctime = tm; + decode(xattrs, p); + break; + + case CEPH_LOCK_ISNAP: + { + decode(inode.version, p); + decode(tm, p); + if (inode.ctime < tm) inode.ctime = tm; + decode_snap(p); + } + break; + + case CEPH_LOCK_IFLOCK: + decode(inode.version, p); + _decode_file_locks(p); + break; + + case CEPH_LOCK_IPOLICY: + if (inode.is_dir()) { + decode(inode.version, p); + decode(tm, p); + if (inode.ctime < tm) inode.ctime = tm; + decode(inode.layout, p); + decode(inode.quota, p); + mds_rank_t old_pin = inode.export_pin; + decode(inode.export_pin, p); + maybe_export_pin(old_pin != inode.export_pin); + } + break; + + default: + ceph_abort(); + } +} + + +bool CInode::is_dirty_scattered() +{ + return + filelock.is_dirty_or_flushing() || + nestlock.is_dirty_or_flushing() || + dirfragtreelock.is_dirty_or_flushing(); +} + +void CInode::clear_scatter_dirty() +{ + filelock.remove_dirty(); + nestlock.remove_dirty(); + dirfragtreelock.remove_dirty(); +} + +void CInode::clear_dirty_scattered(int type) +{ + dout(10) << __func__ << " " << type << " on " << *this << dendl; + ceph_assert(is_dir()); + switch (type) { + case CEPH_LOCK_IFILE: + 
item_dirty_dirfrag_dir.remove_myself(); + break; + + case CEPH_LOCK_INEST: + item_dirty_dirfrag_nest.remove_myself(); + break; + + case CEPH_LOCK_IDFT: + item_dirty_dirfrag_dirfragtree.remove_myself(); + break; + + default: + ceph_abort(); + } +} + + +/* + * when we initially scatter a lock, we need to check if any of the dirfrags + * have out of date accounted_rstat/fragstat. if so, mark the lock stale. + */ +/* for more info on scatterlocks, see comments by Locker::scatter_writebehind */ +void CInode::start_scatter(ScatterLock *lock) +{ + dout(10) << __func__ << " " << *lock << " on " << *this << dendl; + ceph_assert(is_auth()); + mempool_inode *pi = get_projected_inode(); + + for (const auto &p : dirfrags) { + frag_t fg = p.first; + CDir *dir = p.second; + fnode_t *pf = dir->get_projected_fnode(); + dout(20) << fg << " " << *dir << dendl; + + if (!dir->is_auth()) + continue; + + switch (lock->get_type()) { + case CEPH_LOCK_IFILE: + finish_scatter_update(lock, dir, pi->dirstat.version, pf->accounted_fragstat.version); + break; + + case CEPH_LOCK_INEST: + finish_scatter_update(lock, dir, pi->rstat.version, pf->accounted_rstat.version); + break; + + case CEPH_LOCK_IDFT: + dir->state_clear(CDir::STATE_DIRTYDFT); + break; + } + } +} + + +class C_Inode_FragUpdate : public MDSLogContextBase { +protected: + CInode *in; + CDir *dir; + MutationRef mut; + MDSRank *get_mds() override {return in->mdcache->mds;} + void finish(int r) override { + in->_finish_frag_update(dir, mut); + } + +public: + C_Inode_FragUpdate(CInode *i, CDir *d, MutationRef& m) : in(i), dir(d), mut(m) {} +}; + +void CInode::finish_scatter_update(ScatterLock *lock, CDir *dir, + version_t inode_version, version_t dir_accounted_version) +{ + frag_t fg = dir->get_frag(); + ceph_assert(dir->is_auth()); + + if (dir->is_frozen()) { + dout(10) << __func__ << " " << fg << " frozen, marking " << *lock << " stale " << *dir << dendl; + } else if (dir->get_version() == 0) { + dout(10) << __func__ << " " << fg << " 
not loaded, marking " << *lock << " stale " << *dir << dendl; + } else { + if (dir_accounted_version != inode_version) { + dout(10) << __func__ << " " << fg << " journaling accounted scatterstat update v" << inode_version << dendl; + + MDLog *mdlog = mdcache->mds->mdlog; + MutationRef mut(new MutationImpl()); + mut->ls = mdlog->get_current_segment(); + + mempool_inode *pi = get_projected_inode(); + fnode_t *pf = dir->project_fnode(); + + std::string_view ename = 0; + switch (lock->get_type()) { + case CEPH_LOCK_IFILE: + pf->fragstat.version = pi->dirstat.version; + pf->accounted_fragstat = pf->fragstat; + ename = "lock ifile accounted scatter stat update"; + break; + case CEPH_LOCK_INEST: + pf->rstat.version = pi->rstat.version; + pf->accounted_rstat = pf->rstat; + ename = "lock inest accounted scatter stat update"; + + if (!is_auth() && lock->get_state() == LOCK_MIX) { + dout(10) << __func__ << " try to assimilate dirty rstat on " + << *dir << dendl; + dir->assimilate_dirty_rstat_inodes(); + } + + break; + default: + ceph_abort(); + } + + pf->version = dir->pre_dirty(); + mut->add_projected_fnode(dir); + + EUpdate *le = new EUpdate(mdlog, ename); + mdlog->start_entry(le); + le->metablob.add_dir_context(dir); + le->metablob.add_dir(dir, true); + + ceph_assert(!dir->is_frozen()); + mut->auth_pin(dir); + + if (lock->get_type() == CEPH_LOCK_INEST && + !is_auth() && lock->get_state() == LOCK_MIX) { + dout(10) << __func__ << " finish assimilating dirty rstat on " + << *dir << dendl; + dir->assimilate_dirty_rstat_inodes_finish(mut, &le->metablob); + + if (!(pf->rstat == pf->accounted_rstat)) { + if (!mut->is_wrlocked(&nestlock)) { + mdcache->mds->locker->wrlock_force(&nestlock, mut); + } + + mdcache->mds->locker->mark_updated_scatterlock(&nestlock); + mut->ls->dirty_dirfrag_nest.push_back(&item_dirty_dirfrag_nest); + } + } + + mdlog->submit_entry(le, new C_Inode_FragUpdate(this, dir, mut)); + } else { + dout(10) << __func__ << " " << fg << " accounted " << *lock + << " 
scatter stat unchanged at v" << dir_accounted_version << dendl; + } + } +} + +void CInode::_finish_frag_update(CDir *dir, MutationRef& mut) +{ + dout(10) << __func__ << " on " << *dir << dendl; + mut->apply(); + mdcache->mds->locker->drop_locks(mut.get()); + mut->cleanup(); +} + + +/* + * when we gather a lock, we need to assimilate dirfrag changes into the inode + * state. it's possible we can't update the dirfrag accounted_rstat/fragstat + * because the frag is auth and frozen, or that the replica couldn't for the same + * reason. hopefully it will get updated the next time the lock cycles. + * + * we have two dimensions of behavior: + * - we may be (auth and !frozen), and able to update, or not. + * - the frag may be stale, or not. + * + * if the frag is non-stale, we want to assimilate the diff into the + * inode, regardless of whether it's auth or updateable. + * + * if we update the frag, we want to set accounted_fragstat = frag, + * both if we took the diff or it was stale and we are making it + * un-stale. 
 */
/* for more info on scatterlocks, see comments by Locker::scatter_writebehind */
// Gather phase: fold every dirfrag's (fragstat|rstat) delta into the
// projected inode stats, bump the stat version, and reset updateable frags'
// accounted stats to match.  Detects and (in repair mode silently, otherwise
// loudly) corrects negative counts and inode/dirfrag sum mismatches.
void CInode::finish_scatter_gather_update(int type)
{
  LogChannelRef clog = mdcache->mds->clog;

  dout(10) << __func__ << " " << type << " on " << *this << dendl;
  ceph_assert(is_auth());

  switch (type) {
  case CEPH_LOCK_IFILE:
    {
      fragtree_t tmpdft = dirfragtree;
      struct frag_info_t dirstat;
      bool dirstat_valid = true;

      // adjust summation
      ceph_assert(is_auth());
      mempool_inode *pi = get_projected_inode();

      bool touched_mtime = false, touched_chattr = false;
      dout(20) << "  orig dirstat " << pi->dirstat << dendl;
      pi->dirstat.version++;
      for (const auto &p : dirfrags) {
	frag_t fg = p.first;
	CDir *dir = p.second;
	dout(20) << fg << " " << *dir << dendl;

	// a frag can only be updated if we're auth for it and it isn't
	// frozen; an unloaded frag (version 0) also invalidates the
	// full-sum cross-check below
	bool update;
	if (dir->get_version() != 0) {
	  update = dir->is_auth() && !dir->is_frozen();
	} else {
	  update = false;
	  dirstat_valid = false;
	}

	fnode_t *pf = dir->get_projected_fnode();
	if (update)
	  pf = dir->project_fnode();

	if (pf->accounted_fragstat.version == pi->dirstat.version - 1) {
	  dout(20) << fg << "           fragstat " << pf->fragstat << dendl;
	  dout(20) << fg << " accounted_fragstat " << pf->accounted_fragstat << dendl;
	  pi->dirstat.add_delta(pf->fragstat, pf->accounted_fragstat, &touched_mtime, &touched_chattr);
	} else {
	  dout(20) << fg << " skipping STALE accounted_fragstat " << pf->accounted_fragstat << dendl;
	}

	if (pf->fragstat.nfiles < 0 ||
	    pf->fragstat.nsubdirs < 0) {
	  clog->error() << "bad/negative dir size on "
			<< dir->dirfrag() << " " << pf->fragstat;
	  ceph_assert(!"bad/negative fragstat" == g_conf()->mds_verify_scatter);

	  // clamp instead of crashing when verify_scatter is off
	  if (pf->fragstat.nfiles < 0)
	    pf->fragstat.nfiles = 0;
	  if (pf->fragstat.nsubdirs < 0)
	    pf->fragstat.nsubdirs = 0;
	}

	if (update) {
	  pf->accounted_fragstat = pf->fragstat;
	  pf->fragstat.version = pf->accounted_fragstat.version = pi->dirstat.version;
	  dout(10) << fg << " updated accounted_fragstat " << pf->fragstat << " on " << *dir << dendl;
	}

	tmpdft.force_to_leaf(g_ceph_context, fg);
	dirstat.add(pf->fragstat);
      }
      if (touched_mtime)
	pi->mtime = pi->ctime = pi->dirstat.mtime;
      if (touched_chattr)
	pi->change_attr = pi->dirstat.change_attr;
      dout(20) << " final dirstat " << pi->dirstat << dendl;

      if (dirstat_valid && !dirstat.same_sums(pi->dirstat)) {
	// the sum check is only meaningful if every leaf frag was seen
	frag_vec_t leaves;
	tmpdft.get_leaves_under(frag_t(), leaves);
	for (const auto& leaf : leaves) {
	  if (!dirfrags.count(leaf)) {
	    dirstat_valid = false;
	    break;
	  }
	}
	if (dirstat_valid) {
	  if (state_test(CInode::STATE_REPAIRSTATS)) {
	    dout(20) << " dirstat mismatch, fixing" << dendl;
	  } else {
	    clog->error() << "unmatched fragstat on " << ino() << ", inode has "
			  << pi->dirstat << ", dirfrags have " << dirstat;
	    ceph_assert(!"unmatched fragstat" == g_conf()->mds_verify_scatter);
	  }
	  // trust the dirfrags for now
	  version_t v = pi->dirstat.version;
	  if (pi->dirstat.mtime > dirstat.mtime)
	    dirstat.mtime = pi->dirstat.mtime;
	  if (pi->dirstat.change_attr > dirstat.change_attr)
	    dirstat.change_attr = pi->dirstat.change_attr;
	  pi->dirstat = dirstat;
	  pi->dirstat.version = v;
	}
      }

      if (pi->dirstat.nfiles < 0 || pi->dirstat.nsubdirs < 0)
      {
	std::string path;
	make_path_string(path);
	clog->error() << "Inconsistent statistics detected: fragstat on inode "
		      << ino() << " (" << path << "), inode has " << pi->dirstat;
	ceph_assert(!"bad/negative fragstat" == g_conf()->mds_verify_scatter);

	if (pi->dirstat.nfiles < 0)
	  pi->dirstat.nfiles = 0;
	if (pi->dirstat.nsubdirs < 0)
	  pi->dirstat.nsubdirs = 0;
      }
    }
    break;

  case CEPH_LOCK_INEST:
    {
      // adjust summation
      ceph_assert(is_auth());

      fragtree_t tmpdft = dirfragtree;
      nest_info_t rstat;
      bool rstat_valid = true;

      // the inode itself counts as one subdir; snaps come from the srnode
      rstat.rsubdirs = 1;
      if (const sr_t *srnode = get_projected_srnode(); srnode)
	rstat.rsnaps = srnode->snaps.size();

      mempool_inode *pi = get_projected_inode();
      dout(20) << "  orig rstat " << pi->rstat << dendl;
      pi->rstat.version++;
      for (const auto &p : dirfrags) {
	frag_t fg = p.first;
	CDir *dir = p.second;
	dout(20) << fg << " " << *dir << dendl;

	bool update;
	if (dir->get_version() != 0) {
	  update = dir->is_auth() && !dir->is_frozen();
	} else {
	  update = false;
	  rstat_valid = false;
	}

	fnode_t *pf = dir->get_projected_fnode();
	if (update)
	  pf = dir->project_fnode();

	if (pf->accounted_rstat.version == pi->rstat.version-1) {
	  // only pull this frag's dirty rstat inodes into the frag if
	  // the frag is non-stale and updateable.  if it's stale,
	  // that info will just get thrown out!
	  if (update)
	    dir->assimilate_dirty_rstat_inodes();

	  dout(20) << fg << "           rstat " << pf->rstat << dendl;
	  dout(20) << fg << " accounted_rstat " << pf->accounted_rstat << dendl;
	  dout(20) << fg << " dirty_old_rstat " << dir->dirty_old_rstat << dendl;
	  mdcache->project_rstat_frag_to_inode(pf->rstat, pf->accounted_rstat,
					       dir->first, CEPH_NOSNAP, this, true);
	  for (auto &p : dir->dirty_old_rstat) {
	    mdcache->project_rstat_frag_to_inode(p.second.rstat, p.second.accounted_rstat,
						 p.second.first, p.first, this, true);
	  }
	  if (update)  // dir contents not valid if frozen or non-auth
	    dir->check_rstats();
	} else {
	  dout(20) << fg << " skipping STALE accounted_rstat " << pf->accounted_rstat << dendl;
	}
	if (update) {
	  pf->accounted_rstat = pf->rstat;
	  dir->dirty_old_rstat.clear();
	  pf->rstat.version = pf->accounted_rstat.version = pi->rstat.version;
	  dir->check_rstats();
	  dout(10) << fg << " updated accounted_rstat " << pf->rstat << " on " << *dir << dendl;
	}

	tmpdft.force_to_leaf(g_ceph_context, fg);
	rstat.add(pf->rstat);
      }
      dout(20) << " final rstat " << pi->rstat << dendl;

      if (rstat_valid && !rstat.same_sums(pi->rstat)) {
	frag_vec_t leaves;
	tmpdft.get_leaves_under(frag_t(), leaves);
	for (const auto& leaf : leaves) {
	  if (!dirfrags.count(leaf)) {
	    rstat_valid = false;
	    break;
	  }
	}
	if (rstat_valid) {
	  if (state_test(CInode::STATE_REPAIRSTATS)) {
	    dout(20) << " rstat mismatch, fixing" << dendl;
	  } else {
	    clog->error() << "inconsistent rstat on inode " << ino()
			  << ", inode has " << pi->rstat
			  << ", directory fragments have " << rstat;
	    ceph_assert(!"unmatched rstat" == g_conf()->mds_verify_scatter);
	  }
	  // trust the dirfrag for now
	  version_t v = pi->rstat.version;
	  if (pi->rstat.rctime > rstat.rctime)
	    rstat.rctime = pi->rstat.rctime;
	  pi->rstat = rstat;
	  pi->rstat.version = v;
	}
      }

      mdcache->broadcast_quota_to_client(this);
    }
    break;

  case CEPH_LOCK_IDFT:
    break;

  default:
    ceph_abort();
  }
}

// Journal the per-dirfrag accounted-stat updates made by
// finish_scatter_gather_update(): pre-dirty and add each updateable frag's
// projected fnode to the mutation/metablob.  IDFT has no per-frag stats.
void CInode::finish_scatter_gather_update_accounted(int type, MutationRef& mut, EMetaBlob *metablob)
{
  dout(10) << __func__ << " " << type << " on " << *this << dendl;
  ceph_assert(is_auth());

  for (const auto &p : dirfrags) {
    CDir *dir = p.second;
    if (!dir->is_auth() || dir->get_version() == 0 || dir->is_frozen())
      continue;

    if (type == CEPH_LOCK_IDFT)
      continue;  // nothing to do.
    // journal the dirfrag's updated accounted_* stats as part of this mutation
    dout(10) << " journaling updated frag accounted_ on " << *dir << dendl;
    ceph_assert(dir->is_projected());
    fnode_t *pf = dir->get_projected_fnode();
    pf->version = dir->pre_dirty();
    mut->add_projected_fnode(dir);
    metablob->add_dir(dir, true);
    mut->auth_pin(dir);

    if (type == CEPH_LOCK_INEST)
      dir->assimilate_dirty_rstat_inodes_finish(mut, metablob);
  }
}

// waiting

// An inode is frozen if it is itself frozen or any ancestor dirfrag is frozen.
bool CInode::is_frozen() const
{
  if (is_frozen_inode()) return true;
  if (parent && parent->dir->is_frozen()) return true;
  return false;
}

// Frozen-dir state comes only from the containing dirfrag, not the inode itself.
bool CInode::is_frozen_dir() const
{
  if (parent && parent->dir->is_frozen_dir()) return true;
  return false;
}

bool CInode::is_freezing() const
{
  if (is_freezing_inode()) return true;
  if (parent && parent->dir->is_freezing()) return true;
  return false;
}

// Queue a waiter for dirfrag 'fg'; pin the inode while any dir waiters exist.
void CInode::add_dir_waiter(frag_t fg, MDSContext *c)
{
  if (waiting_on_dir.empty())
    get(PIN_DIRWAITER);
  waiting_on_dir[fg].push_back(c);
  dout(10) << __func__ << " frag " << fg << " " << c << " on " << *this << dendl;
}

// Move waiters registered on frag 'fg' into 'ls'; drop the pin with the last one.
void CInode::take_dir_waiting(frag_t fg, MDSContext::vec& ls)
{
  if (waiting_on_dir.empty())
    return;

  auto it = waiting_on_dir.find(fg);
  if (it != waiting_on_dir.end()) {
    dout(10) << __func__ << " frag " << fg << " on " << *this << dendl;
    auto& waiting = it->second;
    ls.insert(ls.end(), waiting.begin(), waiting.end());
    waiting_on_dir.erase(it);

    if (waiting_on_dir.empty())
      put(PIN_DIRWAITER);
  }
}

// Register a waiter; if this inode is not the cause of the wait condition,
// the waiter is pushed up to the parent dirfrag instead.
void CInode::add_waiter(uint64_t tag, MDSContext *c)
{
  dout(10) << __func__ << " tag " << std::hex << tag << std::dec << " " << c
	   << " !ambig " << !state_test(STATE_AMBIGUOUSAUTH)
	   << " !frozen " << !is_frozen_inode()
	   << " !freezing " << !is_freezing_inode()
	   << dendl;
  // wait on the directory?
  // make sure its not the inode that is explicitly ambiguous|freezing|frozen
  if (((tag & WAIT_SINGLEAUTH) && !state_test(STATE_AMBIGUOUSAUTH)) ||
      ((tag & WAIT_UNFREEZE) &&
       !is_frozen_inode() && !is_freezing_inode() && !is_frozen_auth_pin())) {
    dout(15) << "passing waiter up tree" << dendl;
    parent->dir->add_waiter(tag, c);
    return;
  }
  dout(15) << "taking waiter here" << dendl;
  MDSCacheObject::add_waiter(tag, c);
}

void CInode::take_waiting(uint64_t mask, MDSContext::vec& ls)
{
  if ((mask & WAIT_DIR) && !waiting_on_dir.empty()) {
    // take all dentry waiters
    while (!waiting_on_dir.empty()) {
      auto it = waiting_on_dir.begin();
      dout(10) << __func__ << " dirfrag " << it->first << " on " << *this << dendl;
      auto& waiting = it->second;
      ls.insert(ls.end(), waiting.begin(), waiting.end());
      waiting_on_dir.erase(it);
    }
    put(PIN_DIRWAITER);
  }

  // waiting
  MDSCacheObject::take_waiting(mask, ls);
}

// Try to freeze: returns true (FROZEN) once auth_pins has dropped to the
// allowed count; otherwise marks FREEZING and returns false — auth_unpin()
// completes the transition later.
bool CInode::freeze_inode(int auth_pin_allowance)
{
  ceph_assert(auth_pin_allowance > 0); // otherwise we need to adjust parent's nested_auth_pins
  ceph_assert(auth_pins >= auth_pin_allowance);
  if (auth_pins > auth_pin_allowance) {
    dout(10) << "freeze_inode - waiting for auth_pins to drop to " << auth_pin_allowance << dendl;
    auth_pin_freeze_allowance = auth_pin_allowance;
    get(PIN_FREEZING);
    state_set(STATE_FREEZING);
    return false;
  }

  dout(10) << "freeze_inode - frozen" << dendl;
  ceph_assert(auth_pins == auth_pin_allowance);
  if (!state_test(STATE_FROZEN)) {
    get(PIN_FROZEN);
    state_set(STATE_FROZEN);
  }
  return true;
}

// Clear FREEZING or FROZEN (aborts if neither is set) and collect the
// WAIT_UNFREEZE waiters into 'finished' for the caller to run.
void CInode::unfreeze_inode(MDSContext::vec& finished)
{
  dout(10) << __func__ << dendl;
  if (state_test(STATE_FREEZING)) {
    state_clear(STATE_FREEZING);
    put(PIN_FREEZING);
  } else if (state_test(STATE_FROZEN)) {
    state_clear(STATE_FROZEN);
    put(PIN_FROZEN);
  } else
    ceph_abort();
  take_waiting(WAIT_UNFREEZE, finished);
}

// Convenience form that queues the waiters immediately.
void CInode::unfreeze_inode()
{
  MDSContext::vec
finished;
  unfreeze_inode(finished);
  mdcache->mds->queue_waiters(finished);
}

// Block further auth pins; only legal while already FROZEN.
void CInode::freeze_auth_pin()
{
  ceph_assert(state_test(CInode::STATE_FROZEN));
  state_set(CInode::STATE_FROZENAUTHPIN);
}

void CInode::unfreeze_auth_pin()
{
  ceph_assert(state_test(CInode::STATE_FROZENAUTHPIN));
  state_clear(CInode::STATE_FROZENAUTHPIN);
  // wake UNFREEZE waiters only if the inode is not still freezing/frozen
  if (!state_test(STATE_FREEZING|STATE_FROZEN)) {
    MDSContext::vec finished;
    take_waiting(WAIT_UNFREEZE, finished);
    mdcache->mds->queue_waiters(finished);
  }
}

void CInode::clear_ambiguous_auth(MDSContext::vec& finished)
{
  ceph_assert(state_test(CInode::STATE_AMBIGUOUSAUTH));
  state_clear(CInode::STATE_AMBIGUOUSAUTH);
  take_waiting(CInode::WAIT_SINGLEAUTH, finished);
}

void CInode::clear_ambiguous_auth()
{
  MDSContext::vec finished;
  clear_ambiguous_auth(finished);
  mdcache->mds->queue_waiters(finished);
}

// auth_pins
// Whether another auth pin may be taken; on refusal the reason is stored in
// *err_ret (if non-null).  Delegates to the parent dentry when one exists.
bool CInode::can_auth_pin(int *err_ret) const {
  int err;
  if (!is_auth()) {
    err = ERR_NOT_AUTH;
  } else if (is_freezing_inode() || is_frozen_inode() || is_frozen_auth_pin()) {
    err = ERR_EXPORTING_INODE;
  } else {
    if (parent)
      return parent->can_auth_pin(err_ret);
    err = 0;
  }
  if (err && err_ret)
    *err_ret = err;
  return !err;
}

void CInode::auth_pin(void *by)
{
  if (auth_pins == 0)
    get(PIN_AUTHPIN);
  auth_pins++;

#ifdef MDS_AUTHPIN_SET
  auth_pin_set.insert(by);
#endif

  dout(10) << "auth_pin by " << by << " on " << *this << " now " << auth_pins << dendl;

  if (parent)
    parent->adjust_nested_auth_pins(1, this);
}

void CInode::auth_unpin(void *by)
{
  auth_pins--;

#ifdef MDS_AUTHPIN_SET
  {
    auto it = auth_pin_set.find(by);
    ceph_assert(it != auth_pin_set.end());
    auth_pin_set.erase(it);
  }
#endif

  if (auth_pins == 0)
    put(PIN_AUTHPIN);

  dout(10) << "auth_unpin by " << by << " on " << *this << " now " << auth_pins << dendl;

  ceph_assert(auth_pins >= 0);

  if (parent)
    parent->adjust_nested_auth_pins(-1, by);

  // a pending freeze_inode() completes once pins reach the allowance
  if (is_freezing_inode() &&
      auth_pins == auth_pin_freeze_allowance) {
    dout(10) << "auth_unpin freezing!" << dendl;
    get(PIN_FROZEN);
    put(PIN_FREEZING);
    state_clear(STATE_FREEZING);
    state_set(STATE_FROZEN);
    finish_waiting(WAIT_FROZEN);
  }
}

// authority

mds_authority_t CInode::authority() const
{
  if (inode_auth.first >= 0)
    return inode_auth;

  if (parent)
    return parent->dir->authority();

  // new items that are not yet linked in (in the committed plane) belong
  // to their first parent.
  if (!projected_parent.empty())
    return projected_parent.front()->dir->authority();

  return CDIR_AUTH_UNDEF;
}


// SNAP

snapid_t CInode::get_oldest_snap()
{
  snapid_t t = first;
  if (!old_inodes.empty())
    t = old_inodes.begin()->second.first;
  return std::min(t, oldest_snap);
}

// Copy-on-write the (previous) projected inode+xattrs into old_inodes,
// preserving [first,follows]; the live inode then starts at follows+1.
CInode::mempool_old_inode& CInode::cow_old_inode(snapid_t follows, bool cow_head)
{
  ceph_assert(follows >= first);

  mempool_inode *pi = cow_head ? get_projected_inode() : get_previous_projected_inode();
  mempool_xattr_map *px = cow_head ? get_projected_xattrs() : get_previous_projected_xattrs();

  mempool_old_inode &old = old_inodes[follows];
  old.first = first;
  old.inode = *pi;
  old.xattrs = *px;

  if (first < oldest_snap)
    oldest_snap = first;

  dout(10) << " " << px->size() << " xattrs cowed, " << *px << dendl;

  old.inode.trim_client_ranges(follows);

  if (g_conf()->mds_snap_rstat &&
      !(old.inode.rstat == old.inode.accounted_rstat))
    dirty_old_rstats.insert(follows);

  first = follows+1;

  dout(10) << __func__ << " " << (cow_head ?
"head" : "previous_head" )
	   << " to [" << old.first << "," << follows << "] on "
	   << *this << dendl;

  return old;
}

// Split the old_inode span containing 'snap' into [?,snap-1] and [snap,?].
void CInode::split_old_inode(snapid_t snap)
{
  auto it = old_inodes.lower_bound(snap);
  ceph_assert(it != old_inodes.end() && it->second.first < snap);

  mempool_old_inode &old = old_inodes[snap - 1];
  old = it->second;

  it->second.first = snap;
  dout(10) << __func__ << " " << "[" << old.first << "," << it->first
	   << "] to [" << snap << "," << it->first << "] on " << *this << dendl;
}

// Preemptively COW the head inode if any snapshot could still refer to it.
void CInode::pre_cow_old_inode()
{
  snapid_t follows = mdcache->get_global_snaprealm()->get_newest_seq();
  if (first <= follows)
    cow_old_inode(follows, true);
}

// True if 'snapid' is covered by the live [first,last] span or by an
// old_inode interval on a multiversion inode.
bool CInode::has_snap_data(snapid_t snapid)
{
  bool found = snapid >= first && snapid <= last;
  if (!found && is_multiversion()) {
    auto p = old_inodes.lower_bound(snapid);
    if (p != old_inodes.end()) {
      if (p->second.first > snapid) {
	if (p != old_inodes.begin())
	  --p;
      }
      if (p->second.first <= snapid && snapid <= p->first) {
	found = true;
      }
    }
  }
  return found;
}

// Drop old_inode intervals that no longer intersect any live snapshot.
void CInode::purge_stale_snap_data(const set<snapid_t>& snaps)
{
  dout(10) << __func__ << " " << snaps << dendl;

  for (auto it = old_inodes.begin(); it != old_inodes.end(); ) {
    const snapid_t &id = it->first;
    const auto &s = snaps.lower_bound(it->second.first);
    if (s == snaps.end() || *s > id) {
      dout(10) << " purging old_inode [" << it->second.first << "," << id << "]" << dendl;
      it = old_inodes.erase(it);
    } else {
      ++it;
    }
  }
}

/*
 * pick/create an old_inode
 */
CInode::mempool_old_inode * CInode::pick_old_inode(snapid_t snap)
{
  auto it = old_inodes.lower_bound(snap);  // p is first key >= to snap
  if (it != old_inodes.end() && it->second.first <= snap) {
    dout(10) << __func__ << " snap " << snap << " -> [" << it->second.first << "," << it->first << "]" << dendl;
    return &it->second;
  }
  dout(10) << __func__ << " snap " << snap << " -> nothing" << dendl;
  return NULL;
}

// Create this inode's SnapRealm (if absent) and link it under the nearest
// ancestor realm, optionally splitting cached items into the new realm.
void CInode::open_snaprealm(bool nosplit)
{
  if (!snaprealm) {
    SnapRealm *parent = find_snaprealm();
    snaprealm = new SnapRealm(mdcache, this);
    if (parent) {
      dout(10) << __func__ << " " << snaprealm
	       << " parent is " << parent
	       << dendl;
      dout(30) << " siblings are " << parent->open_children << dendl;
      snaprealm->parent = parent;
      if (!nosplit)
	parent->split_at(snaprealm);
      parent->open_children.insert(snaprealm);
    }
  }
}
void CInode::close_snaprealm(bool nojoin)
{
  if (snaprealm) {
    dout(15) << __func__ << " " << *snaprealm << dendl;
    snaprealm->close_parents();
    if (snaprealm->parent) {
      snaprealm->parent->open_children.erase(snaprealm);
      //if (!nojoin)
      //snaprealm->parent->join(snaprealm);
    }
    delete snaprealm;
    snaprealm = 0;
  }
}

// Walk up the (oldest) parent chain to the nearest inode with a SnapRealm.
SnapRealm *CInode::find_snaprealm() const
{
  const CInode *cur = this;
  while (!cur->snaprealm) {
    const CDentry *pdn = cur->get_oldest_parent_dn();
    if (!pdn)
      break;
    cur = pdn->get_dir()->get_inode();
  }
  return cur->snaprealm;
}

void CInode::encode_snap_blob(bufferlist &snapbl)
{
  if (snaprealm) {
    using ceph::encode;
    encode(snaprealm->srnode, snapbl);
    dout(20) << __func__ << " " << *snaprealm << dendl;
  }
}
void CInode::decode_snap_blob(const bufferlist& snapbl)
{
  using ceph::decode;
  if (snapbl.length()) {
    open_snaprealm();
    auto old_flags = snaprealm->srnode.flags;
    auto p = snapbl.cbegin();
    decode(snaprealm->srnode, p);
    if (is_base()) {
      bool ok = snaprealm->_open_parents(NULL);
      ceph_assert(ok);
    } else {
      // a PARENT_GLOBAL flip changes which realm tree we hang under
      if ((snaprealm->srnode.flags ^ old_flags) & sr_t::PARENT_GLOBAL) {
	snaprealm->close_parents();
	snaprealm->adjust_parent();
      }
    }
    dout(20) << __func__ << " " << *snaprealm << dendl;
  } else if (snaprealm &&
	     !is_root() && !is_mdsdir()) { // see https://tracker.ceph.com/issues/42675
    ceph_assert(mdcache->mds->is_any_replay());
    snaprealm->merge_to(NULL);
  }
}

// Encode the snaprealm blob plus oldest_snap; decode_snap() is the inverse.
void CInode::encode_snap(bufferlist& bl)
{
  using
ceph::encode;
  bufferlist snapbl;
  encode_snap_blob(snapbl);
  encode(snapbl, bl);
  encode(oldest_snap, bl);
}

void CInode::decode_snap(bufferlist::const_iterator& p)
{
  using ceph::decode;
  bufferlist snapbl;
  decode(snapbl, p);
  decode(oldest_snap, p);
  decode_snap_blob(snapbl);
}

// =============================================

// Pick the "loner" client: the single non-stale client that wants write/read
// caps (or any client on a dir with no subtree root frag).  -1 if zero or
// more than one candidate, or if the cluster is readonly / other MDSs want caps.
client_t CInode::calc_ideal_loner()
{
  if (mdcache->is_readonly())
    return -1;
  if (!get_mds_caps_wanted().empty())
    return -1;

  int n = 0;
  client_t loner = -1;
  for (const auto &p : client_caps) {
    if (!p.second.is_stale() &&
	((p.second.wanted() & (CEPH_CAP_ANY_WR|CEPH_CAP_FILE_WR|CEPH_CAP_FILE_RD)) ||
	 (inode.is_dir() && !has_subtree_root_dirfrag()))) {
      if (n)
	return -1;
      n++;
      loner = p.first;
    }
  }
  return loner;
}

// Recompute want_loner_cap and try to converge loner_cap to it.
// Returns true if the loner assignment changed.
bool CInode::choose_ideal_loner()
{
  want_loner_cap = calc_ideal_loner();
  int changed = false;
  if (loner_cap >= 0 && loner_cap != want_loner_cap) {
    if (!try_drop_loner())
      return false;
    changed = true;
  }

  if (want_loner_cap >= 0) {
    if (loner_cap < 0) {
      set_loner_cap(want_loner_cap);
      changed = true;
    } else
      ceph_assert(loner_cap == want_loner_cap);
  }
  return changed;
}

bool CInode::try_set_loner()
{
  ceph_assert(want_loner_cap >= 0);
  if (loner_cap >= 0 && loner_cap != want_loner_cap)
    return false;
  set_loner_cap(want_loner_cap);
  return true;
}

// Record the loner and propagate it as the exclusive client on the cap locks.
void CInode::set_loner_cap(client_t l)
{
  loner_cap = l;
  authlock.set_excl_client(loner_cap);
  filelock.set_excl_client(loner_cap);
  linklock.set_excl_client(loner_cap);
  xattrlock.set_excl_client(loner_cap);
}

// Drop the loner only if that client holds nothing beyond what any client
// would be allowed anyway.
bool CInode::try_drop_loner()
{
  if (loner_cap < 0)
    return true;

  int other_allowed = get_caps_allowed_by_type(CAP_ANY);
  Capability *cap = get_client_cap(loner_cap);
  if (!cap ||
      (cap->issued() & ~other_allowed) == 0) {
    set_loner_cap(-1);
    return true;
  }
  return false;
}


// choose new lock state during recovery, based on issued caps
void CInode::choose_lock_state(SimpleLock *lock, int allissued)
{
  int shift = lock->get_cap_shift();
  int issued = (allissued >> shift) & lock->get_cap_mask();
  if (is_auth()) {
    if (lock->is_xlocked()) {
      // do nothing here
    } else if (lock->get_state() != LOCK_MIX) {
      if (issued & (CEPH_CAP_GEXCL | CEPH_CAP_GBUFFER))
	lock->set_state(LOCK_EXCL);
      else if (issued & CEPH_CAP_GWR) {
	if (issued & (CEPH_CAP_GCACHE | CEPH_CAP_GSHARED))
	  lock->set_state(LOCK_EXCL);
	else
	  lock->set_state(LOCK_MIX);
      } else if (lock->is_dirty()) {
	if (is_replicated())
	  lock->set_state(LOCK_MIX);
	else
	  lock->set_state(LOCK_LOCK);
      } else
	lock->set_state(LOCK_SYNC);
    }
  } else {
    // our states have already been chosen during rejoin.
    if (lock->is_xlocked())
      ceph_assert(lock->get_state() == LOCK_LOCK);
  }
}

void CInode::choose_lock_states(int dirty_caps)
{
  int issued = get_caps_issued() | dirty_caps;
  if (is_auth() && (issued & (CEPH_CAP_ANY_EXCL|CEPH_CAP_ANY_WR)))
    choose_ideal_loner();
  choose_lock_state(&filelock, issued);
  choose_lock_state(&nestlock, issued);
  choose_lock_state(&dirfragtreelock, issued);
  choose_lock_state(&authlock, issued);
  choose_lock_state(&xattrlock, issued);
  choose_lock_state(&linklock, issued);
}

// Replace the whole mds->wanted map, keeping the open-file-table refcount
// (num_caps_wanted) in sync with the empty/non-empty transition.
void CInode::set_mds_caps_wanted(mempool::mds_co::compact_map<int32_t,int32_t>& m)
{
  bool old_empty = mds_caps_wanted.empty();
  mds_caps_wanted.swap(m);
  if (old_empty != (bool)mds_caps_wanted.empty()) {
    if (old_empty)
      adjust_num_caps_wanted(1);
    else
      adjust_num_caps_wanted(-1);
  }
}

void CInode::set_mds_caps_wanted(mds_rank_t mds, int32_t wanted)
{
  bool old_empty = mds_caps_wanted.empty();
  if (wanted) {
    mds_caps_wanted[mds] = wanted;
    if (old_empty)
      adjust_num_caps_wanted(1);
  } else if (!old_empty) {
    mds_caps_wanted.erase(mds);
    if (mds_caps_wanted.empty())
      adjust_num_caps_wanted(-1);
  }
}

// Track this inode in the open file table while anything wants caps on it.
void CInode::adjust_num_caps_wanted(int d)
{
  if (!num_caps_wanted && d > 0)
    mdcache->open_file_table.add_inode(this);
  else if (num_caps_wanted > 0 && num_caps_wanted == -d)
    mdcache->open_file_table.remove_inode(this);

  num_caps_wanted +=d;
  ceph_assert(num_caps_wanted >= 0);
}

// Create a capability for 'client'.  The first cap pins the inode and joins
// it to a snaprealm ('conrealm' if given, else the nearest ancestor realm).
Capability *CInode::add_client_cap(client_t client, Session *session, SnapRealm *conrealm)
{
  ceph_assert(last == CEPH_NOSNAP);
  if (client_caps.empty()) {
    get(PIN_CAPS);
    if (conrealm)
      containing_realm = conrealm;
    else
      containing_realm = find_snaprealm();
    containing_realm->inodes_with_caps.push_back(&item_caps);
    dout(10) << __func__ << " first cap, joining realm " << *containing_realm << dendl;

    mdcache->num_inodes_with_caps++;
    if (parent)
      parent->dir->adjust_num_inodes_with_caps(1);
  }

  uint64_t cap_id = ++mdcache->last_cap_id;
  auto ret = client_caps.emplace(std::piecewise_construct, std::forward_as_tuple(client),
				 std::forward_as_tuple(this, session, cap_id));
  ceph_assert(ret.second == true);
  Capability *cap = &ret.first->second;

  cap->client_follows = first-1;
  containing_realm->add_cap(client, cap);

  return cap;
}

// Tear down 'client''s capability: unlink it from session/realm lists, clear
// loner if needed, drop the last-cap pin, and wake any flock waiters.
void CInode::remove_client_cap(client_t client)
{
  auto it = client_caps.find(client);
  ceph_assert(it != client_caps.end());
  Capability *cap = &it->second;

  cap->item_session_caps.remove_myself();
  cap->item_revoking_caps.remove_myself();
  cap->item_client_revoking_caps.remove_myself();
  containing_realm->remove_cap(client, cap);

  if (client == loner_cap)
    loner_cap = -1;

  if (cap->wanted())
    adjust_num_caps_wanted(-1);

  client_caps.erase(it);
  if (client_caps.empty()) {
    dout(10) << __func__ << " last cap, leaving realm " << *containing_realm << dendl;
    put(PIN_CAPS);
    item_caps.remove_myself();
    containing_realm = NULL;
    mdcache->num_inodes_with_caps--;
    if (parent)
      parent->dir->adjust_num_inodes_with_caps(-1);
  }

  //clean up advisory locks
  bool fcntl_removed = fcntl_locks ? fcntl_locks->remove_all_from(client) : false;
  bool flock_removed = flock_locks ? flock_locks->remove_all_from(client) : false;
  if (fcntl_removed || flock_removed) {
    MDSContext::vec waiters;
    take_waiting(CInode::WAIT_FLOCK, waiters);
    mdcache->mds->queue_waiters(waiters);
  }
}

// Re-home every client cap from containing_realm into 'realm'.
void CInode::move_to_realm(SnapRealm *realm)
{
  dout(10) << __func__ << " joining realm " << *realm
	   << ", leaving realm " << *containing_realm << dendl;
  for (auto& p : client_caps) {
    containing_realm->remove_cap(p.first, &p.second);
    realm->add_cap(p.first, &p.second);
  }
  item_caps.remove_myself();
  realm->inodes_with_caps.push_back(&item_caps);
  containing_realm = realm;
}

// Rebuild (or merge into) a client cap from reconnect state after failover.
Capability *CInode::reconnect_cap(client_t client, const cap_reconnect_t& icr, Session *session)
{
  Capability *cap = get_client_cap(client);
  if (cap) {
    // FIXME?
    cap->merge(icr.capinfo.wanted, icr.capinfo.issued);
  } else {
    cap = add_client_cap(client, session);
    cap->set_cap_id(icr.capinfo.cap_id);
    cap->set_wanted(icr.capinfo.wanted);
    cap->issue_norevoke(icr.capinfo.issued);
    cap->reset_seq();
  }
  cap->set_last_issue_stamp(ceph_clock_now());
  return cap;
}

// Drop all caps and loner state once the inode has been exported elsewhere.
void CInode::clear_client_caps_after_export()
{
  while (!client_caps.empty())
    remove_client_cap(client_caps.begin()->first);
  loner_cap = -1;
  want_loner_cap = -1;
  if (!get_mds_caps_wanted().empty()) {
    mempool::mds_co::compact_map<int32_t,int32_t> empty;
    set_mds_caps_wanted(empty);
  }
}

void CInode::export_client_caps(map<client_t,Capability::Export>& cl)
{
  for (const auto &p : client_caps) {
    cl[p.first] = p.second.make_export();
  }
}

  // caps allowed
int CInode::get_caps_liked() const
{
  if (is_dir())
    return CEPH_CAP_PIN | CEPH_CAP_ANY_EXCL | CEPH_CAP_ANY_SHARED; // but not, say, FILE_RD|WR|WRBUFFER
  else
    return CEPH_CAP_ANY & ~CEPH_CAP_FILE_LAZYIO;
}

// Upper bound of caps ever issuable, masked by what each lock permits.
int CInode::get_caps_allowed_ever() const
{
  int allowed;
  if (is_dir())
    allowed = CEPH_CAP_PIN | CEPH_CAP_ANY_EXCL | CEPH_CAP_ANY_SHARED;
  else
    allowed = CEPH_CAP_ANY;
  return allowed &
    (CEPH_CAP_PIN |
     (filelock.gcaps_allowed_ever() << filelock.get_cap_shift()) |
     (authlock.gcaps_allowed_ever() << authlock.get_cap_shift()) |
     (xattrlock.gcaps_allowed_ever() << xattrlock.get_cap_shift()) |
     (linklock.gcaps_allowed_ever() << linklock.get_cap_shift()));
}

// Caps currently allowed for a given requester class (loner/xlocker/any),
// assembled from the four cap-bearing locks shifted into their cap fields.
int CInode::get_caps_allowed_by_type(int type) const
{
  return
    CEPH_CAP_PIN |
    (filelock.gcaps_allowed(type) << filelock.get_cap_shift()) |
    (authlock.gcaps_allowed(type) << authlock.get_cap_shift()) |
    (xattrlock.gcaps_allowed(type) << xattrlock.get_cap_shift()) |
    (linklock.gcaps_allowed(type) << linklock.get_cap_shift());
}

int CInode::get_caps_careful() const
{
  return
    (filelock.gcaps_careful() << filelock.get_cap_shift()) |
    (authlock.gcaps_careful() << authlock.get_cap_shift()) |
    (xattrlock.gcaps_careful() << xattrlock.get_cap_shift()) |
    (linklock.gcaps_careful() << linklock.get_cap_shift());
}

int CInode::get_xlocker_mask(client_t client) const
{
  return
    (filelock.gcaps_xlocker_mask(client) << filelock.get_cap_shift()) |
    (authlock.gcaps_xlocker_mask(client) << authlock.get_cap_shift()) |
    (xattrlock.gcaps_xlocker_mask(client) << xattrlock.get_cap_shift()) |
    (linklock.gcaps_xlocker_mask(client) << linklock.get_cap_shift());
}

// Caps this specific session may hold: loner gets loner+xlocker caps, others
// get CAP_ANY; FILE_RD/WR are masked off when the client/connection cannot
// handle inline data or a non-empty pool namespace.
int CInode::get_caps_allowed_for_client(Session *session, Capability *cap,
					mempool_inode *file_i) const
{
  client_t client = session->get_client();
  int allowed;
  if (client == get_loner()) {
    // as the loner, we get the loner_caps AND any xlocker_caps for things we have xlocked
    allowed =
      get_caps_allowed_by_type(CAP_LONER) |
      (get_caps_allowed_by_type(CAP_XLOCKER) & get_xlocker_mask(client));
  } else {
    allowed = get_caps_allowed_by_type(CAP_ANY);
  }

  if (!is_dir()) {
    if (file_i->inline_data.version == CEPH_INLINE_NONE &&
	file_i->layout.pool_ns.empty()) {
      // noop
    } else if (cap) {
      if ((file_i->inline_data.version != CEPH_INLINE_NONE &&
	   cap->is_noinline()) ||
	  (!file_i->layout.pool_ns.empty() &&
	   cap->is_nopoolns()))
	allowed &= ~(CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR);
    } else {
      auto& conn = session->get_connection();
      if ((file_i->inline_data.version != CEPH_INLINE_NONE &&
	   !conn->has_feature(CEPH_FEATURE_MDS_INLINE_DATA)) ||
	  (!file_i->layout.pool_ns.empty() &&
	   !conn->has_feature(CEPH_FEATURE_FS_FILE_LAYOUT_V2)))
	allowed &= ~(CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR);
    }
  }
  return allowed;
}

// caps issued, wanted
// Union of issued caps across clients, optionally split into loner / other /
// xlocker components, each shifted+masked to one lock's cap field.
int CInode::get_caps_issued(int *ploner, int *pother, int *pxlocker,
			    int shift, int mask)
{
  int c = 0;
  int loner = 0, other = 0, xlocker = 0;
  if (!is_auth()) {
    loner_cap = -1;
  }

  for (const auto &p : client_caps) {
    int i = p.second.issued();
    c |= i;
    if (p.first == loner_cap)
      loner |= i;
    else
      other |= i;
    xlocker |= get_xlocker_mask(p.first) & i;
  }
  if (ploner) *ploner = (loner >> shift) & mask;
  if (pother) *pother = (other >> shift) & mask;
  if (pxlocker) *pxlocker = (xlocker >> shift) & mask;
  return (c >> shift) & mask;
}

bool CInode::is_any_caps_wanted() const
{
  for (const auto &p : client_caps) {
    if (p.second.wanted())
      return true;
  }
  return false;
}

// Union of wanted caps from non-stale clients (plus, on the auth, caps wanted
// by other MDS ranks), split into loner/other like get_caps_issued().
int CInode::get_caps_wanted(int *ploner, int *pother, int shift, int mask) const
{
  int w = 0;
  int loner = 0, other = 0;
  for (const auto &p : client_caps) {
    if (!p.second.is_stale()) {
      int t = p.second.wanted();
      w |= t;
      if (p.first == loner_cap)
	loner |= t;
      else
	other |= t;
    }
    //cout << " get_caps_wanted client " << it->first << " " << cap_string(it->second.wanted()) << endl;
  }
  if (is_auth())
    for (const auto &p : mds_caps_wanted) {
      w |= p.second;
      other |= p.second;
      //cout << " get_caps_wanted mds " << it->first << " " << cap_string(it->second) << endl;
    }
  if (ploner) *ploner = (loner >> shift) & mask;
  if (pother) *pother = (other >> shift) & mask;
  return (w >> shift) & mask;
}

// True if some client holds caps that the lock's target state does not allow,
// i.e. a revoke/gather round is needed before the lock can transition.
bool CInode::issued_caps_need_gather(SimpleLock *lock)
{
  int loner_issued, other_issued,
xlocker_issued;
  get_caps_issued(&loner_issued, &other_issued, &xlocker_issued,
		  lock->get_cap_shift(), lock->get_cap_mask());
  if ((loner_issued & ~lock->gcaps_allowed(CAP_LONER)) ||
      (other_issued & ~lock->gcaps_allowed(CAP_ANY)) ||
      (xlocker_issued & ~lock->gcaps_allowed(CAP_XLOCKER)))
    return true;
  return false;
}


// =============================================

// Encode this inode's stat/cap reply payload for 'client' into 'bl'
// (matches MClientReply::InodeStat).  Returns 'valid' (false when a snapped
// non-auth view could be stale) or -ENOSPC when max_bytes would be exceeded.
int CInode::encode_inodestat(bufferlist& bl, Session *session,
			     SnapRealm *dir_realm,
			     snapid_t snapid,
			     unsigned max_bytes,
			     int getattr_caps)
{
  client_t client = session->get_client();
  ceph_assert(snapid);

  bool valid = true;

  // pick a version!
  mempool_inode *oi = &inode;
  mempool_inode *pi = get_projected_inode();

  CInode::mempool_xattr_map *pxattrs = nullptr;

  if (snapid != CEPH_NOSNAP) {

    // for now at least, old_inodes is only defined/valid on the auth
    if (!is_auth())
      valid = false;

    if (is_multiversion()) {
      auto it = old_inodes.lower_bound(snapid);
      if (it != old_inodes.end()) {
	if (it->second.first > snapid) {
	  if (it != old_inodes.begin())
	    --it;
	}
	if (it->second.first <= snapid && snapid <= it->first) {
	  dout(15) << __func__ << " snapid " << snapid
		   << " to old_inode [" << it->second.first << "," << it->first << "]"
		   << " " << it->second.inode.rstat
		   << dendl;
	  auto &p = it->second;
	  pi = oi = &p.inode;
	  pxattrs = &p.xattrs;
	} else {
	  // snapshoted remote dentry can result this
	  dout(0) << __func__ << " old_inode for snapid " << snapid
		  << " not found" << dendl;
	}
      }
    } else if (snapid < first || snapid > last) {
      // snapshoted remote dentry can result this
      dout(0) << __func__ << " [" << first << "," << last << "]"
	      << " not match snapid " << snapid << dendl;
    }
  }

  utime_t snap_btime;
  SnapRealm *realm = find_snaprealm();
  if (snapid != CEPH_NOSNAP && realm) {
    // add snapshot timestamp vxattr
    map<snapid_t,const SnapInfo*> infomap;
    realm->get_snap_info(infomap,
			 snapid,  // min
			 snapid); // max
    if (!infomap.empty()) {
      ceph_assert(infomap.size() == 1);
      const SnapInfo *si = infomap.begin()->second;
      snap_btime = si->stamp;
    }
  }


  // caps are withheld for stale sessions, realm mismatches, frozen inodes
  // and inodes mid cap-export
  bool no_caps = !valid ||
		 session->is_stale() ||
		 (dir_realm && realm != dir_realm) ||
		 is_frozen() ||
		 state_test(CInode::STATE_EXPORTINGCAPS);
  if (no_caps)
    dout(20) << __func__ << " no caps"
	     << (!valid?", !valid":"")
	     << (session->is_stale()?", session stale ":"")
	     << ((dir_realm && realm != dir_realm)?", snaprealm differs ":"")
	     << (is_frozen()?", frozen inode":"")
	     << (state_test(CInode::STATE_EXPORTINGCAPS)?", exporting caps":"")
	     << dendl;


  // "fake" a version that is old (stable) version, +1 if projected.
  version_t version = (oi->version * 2) + is_projected();

  // per-field: use the projected inode when this client xlocks the relevant
  // lock or is the loner
  Capability *cap = get_client_cap(client);
  bool pfile = filelock.is_xlocked_by_client(client) || get_loner() == client;
  //(cap && (cap->issued() & CEPH_CAP_FILE_EXCL));
  bool pauth = authlock.is_xlocked_by_client(client) || get_loner() == client;
  bool plink = linklock.is_xlocked_by_client(client) || get_loner() == client;
  bool pxattr = xattrlock.is_xlocked_by_client(client) || get_loner() == client;

  bool plocal = versionlock.get_last_wrlock_client() == client;
  bool ppolicy = policylock.is_xlocked_by_client(client) || get_loner()==client;

  mempool_inode *any_i = (pfile|pauth|plink|pxattr|plocal) ? pi : oi;

  dout(20) << " pfile " << pfile << " pauth " << pauth
	   << " plink " << plink << " pxattr " << pxattr
	   << " plocal " << plocal
	   << " ctime " << any_i->ctime
	   << " valid=" << valid << dendl;

  // file
  mempool_inode *file_i = pfile ? pi:oi;
  file_layout_t layout;
  if (is_dir()) {
    layout = (ppolicy ? pi : oi)->layout;
  } else {
    layout = file_i->layout;
  }

  // max_size is min of projected, actual
  uint64_t max_size =
    std::min(oi->client_ranges.count(client) ?
	     oi->client_ranges[client].range.last : 0,
	     pi->client_ranges.count(client) ?
	     pi->client_ranges[client].range.last : 0);

  // inline data
  version_t inline_version = 0;
  bufferlist inline_data;
  if (file_i->inline_data.version == CEPH_INLINE_NONE) {
    inline_version = CEPH_INLINE_NONE;
  } else if ((!cap && !no_caps) ||
	     (cap && cap->client_inline_version < file_i->inline_data.version) ||
	     (getattr_caps & CEPH_CAP_FILE_RD)) { // client requests inline data
    inline_version = file_i->inline_data.version;
    if (file_i->inline_data.length() > 0)
      inline_data = file_i->inline_data.get_data();
  }

  // nest (do same as file... :/)
  if (cap) {
    cap->last_rbytes = file_i->rstat.rbytes;
    cap->last_rsize = file_i->rstat.rsize();
  }

  // auth
  mempool_inode *auth_i = pauth ? pi:oi;

  // link
  mempool_inode *link_i = plink ? pi:oi;

  // xattr
  mempool_inode *xattr_i = pxattr ? pi:oi;

  using ceph::encode;
  // xattr
  version_t xattr_version;
  if ((!cap && !no_caps) ||
      (cap && cap->client_xattr_version < xattr_i->xattr_version) ||
      (getattr_caps & CEPH_CAP_XATTR_SHARED)) { // client requests xattrs
    if (!pxattrs)
      pxattrs = pxattr ? get_projected_xattrs() : &xattrs;
    xattr_version = xattr_i->xattr_version;
  } else {
    xattr_version = 0;
  }

  // do we have room?
  if (max_bytes) {
    // conservative size estimate of the encoded InodeStat
    unsigned bytes =
      8 + 8 + 4 + 8 + 8 + sizeof(ceph_mds_reply_cap) +
      sizeof(struct ceph_file_layout) +
      sizeof(struct ceph_timespec) * 3 + 4 + // ctime ~ time_warp_seq
      8 + 8 + 8 + 4 + 4 + 4 + 4 + 4 + // size ~ nlink
      8 + 8 + 8 + 8 + 8 + sizeof(struct ceph_timespec) + // dirstat.nfiles ~ rstat.rctime
      sizeof(__u32) + sizeof(__u32) * 2 * dirfragtree._splits.size() + // dirfragtree
      sizeof(__u32) + symlink.length() + // symlink
      sizeof(struct ceph_dir_layout); // dir_layout

    if (xattr_version) {
      bytes += sizeof(__u32) + sizeof(__u32); // xattr buffer len + number entries
      if (pxattrs) {
	for (const auto &p : *pxattrs)
	  bytes += sizeof(__u32) * 2 + p.first.length() + p.second.length();
      }
    } else {
      bytes += sizeof(__u32); // xattr buffer len
    }
    bytes +=
      sizeof(version_t) + sizeof(__u32) + inline_data.length() + // inline data
      1 + 1 + 8 + 8 + 4 + // quota
      4 + layout.pool_ns.size() + // pool ns
      sizeof(struct ceph_timespec) + 8; // btime + change_attr

    if (bytes > max_bytes)
      return -ENOSPC;
  }


  // encode caps
  struct ceph_mds_reply_cap ecap;
  if (snapid != CEPH_NOSNAP) {
    /*
     * snapped inodes (files or dirs) only get read-only caps.  always
     * issue everything possible, since it is read only.
     *
     * if a snapped inode has caps, limit issued caps based on the
     * lock state.
     *
     * if it is a live inode, limit issued caps based on the lock
     * state.
     *
     * do NOT adjust cap issued state, because the client always
     * tracks caps per-snap and the mds does either per-interval or
     * multiversion.
     */
    ecap.caps = valid ?
get_caps_allowed_by_type(CAP_ANY) : CEPH_STAT_CAP_INODE;
    if (last == CEPH_NOSNAP || is_any_caps())
      ecap.caps = ecap.caps & get_caps_allowed_for_client(session, nullptr, file_i);
    ecap.seq = 0;
    ecap.mseq = 0;
    ecap.realm = 0;
  } else {
    if (!no_caps && !cap) {
      // add a new cap
      cap = add_client_cap(client, session, realm);
      if (is_auth())
	choose_ideal_loner();
    }

    int issue = 0;
    if (!no_caps && cap) {
      // normal path: issue (wanted | liked) & allowed
      int likes = get_caps_liked();
      int allowed = get_caps_allowed_for_client(session, cap, file_i);
      issue = (cap->wanted() | likes) & allowed;
      cap->issue_norevoke(issue, true);
      issue = cap->pending();
      dout(10) << "encode_inodestat issuing " << ccap_string(issue)
	       << " seq " << cap->get_last_seq() << dendl;
    } else if (cap && cap->is_new() && !dir_realm) {
      // alway issue new caps to client, otherwise the caps get lost
      ceph_assert(cap->is_stale());
      ceph_assert(!cap->pending());
      issue = CEPH_CAP_PIN;
      cap->issue_norevoke(issue, true);
      dout(10) << "encode_inodestat issuing " << ccap_string(issue)
	       << " seq " << cap->get_last_seq()
	       << "(stale&new caps)" << dendl;
    }

    if (issue) {
      cap->set_last_issue();
      cap->set_last_issue_stamp(ceph_clock_now());
      ecap.caps = issue;
      ecap.wanted = cap->wanted();
      ecap.cap_id = cap->get_cap_id();
      ecap.seq = cap->get_last_seq();
      ecap.mseq = cap->get_mseq();
      ecap.realm = realm->inode->ino();
    } else {
      ecap.cap_id = 0;
      ecap.caps = 0;
      ecap.seq = 0;
      ecap.mseq = 0;
      ecap.realm = 0;
      ecap.wanted = 0;
    }
  }
  ecap.flags = is_auth() ? CEPH_CAP_FLAG_AUTH : 0;
  dout(10) << "encode_inodestat caps " << ccap_string(ecap.caps)
	   << " seq " << ecap.seq << " mseq " << ecap.mseq
	   << " xattrv " << xattr_version << dendl;

  // only ship inline data if the client can/should see it
  if (inline_data.length() && cap) {
    if ((cap->pending() | getattr_caps) & CEPH_CAP_FILE_SHARED) {
      dout(10) << "including inline version " << inline_version << dendl;
      cap->client_inline_version = inline_version;
    } else {
      dout(10) << "dropping inline version " << inline_version << dendl;
      inline_version = 0;
      inline_data.clear();
    }
  }

  // include those xattrs?
  if (xattr_version && cap) {
    if ((cap->pending() | getattr_caps) & CEPH_CAP_XATTR_SHARED) {
      dout(10) << "including xattrs version " << xattr_version << dendl;
      cap->client_xattr_version = xattr_version;
    } else {
      dout(10) << "dropping xattrs version " << xattr_version << dendl;
      xattr_version = 0;
    }
  }

  // The end result of encode_xattrs() is equivalent to:
  // {
  //   bufferlist xbl;
  //   if (xattr_version) {
  //     if (pxattrs)
  //       encode(*pxattrs, bl);
  //     else
  //       encode((__u32)0, bl);
  //   }
  //   encode(xbl, bl);
  // }
  //
  // But encoding xattrs into the 'xbl' requires a memory allocation.
  // The 'bl' should have enough pre-allocated memory in most cases.
  // Encoding xattrs directly into it can avoid the extra allocation.
  auto encode_xattrs = [xattr_version, pxattrs, &bl]() {
    using ceph::encode;
    if (xattr_version) {
      ceph_le32 xbl_len;
      auto filler = bl.append_hole(sizeof(xbl_len));
      const auto starting_bl_len = bl.length();
      if (pxattrs)
	encode(*pxattrs, bl);
      else
	encode((__u32)0, bl);
      xbl_len = bl.length() - starting_bl_len;
      filler.copy_in(sizeof(xbl_len), (char *)&xbl_len);
    } else {
      encode((__u32)0, bl);
    }
  };

  /*
   * note: encoding matches MClientReply::InodeStat
   */
  if (session->info.has_feature(CEPHFS_FEATURE_REPLY_ENCODING)) {
    // new-style versioned encoding
    ENCODE_START(3, 1, bl);
    encode(oi->ino, bl);
    encode(snapid, bl);
    encode(oi->rdev, bl);
    encode(version, bl);
    encode(xattr_version, bl);
    encode(ecap, bl);
    {
      ceph_file_layout legacy_layout;
      layout.to_legacy(&legacy_layout);
      encode(legacy_layout, bl);
    }
    encode(any_i->ctime, bl);
    encode(file_i->mtime, bl);
    encode(file_i->atime, bl);
    encode(file_i->time_warp_seq, bl);
    encode(file_i->size, bl);
    encode(max_size, bl);
    encode(file_i->truncate_size, bl);
    encode(file_i->truncate_seq, bl);
    encode(auth_i->mode, bl);
    encode((uint32_t)auth_i->uid, bl);
    encode((uint32_t)auth_i->gid, bl);
    encode(link_i->nlink, bl);
    encode(file_i->dirstat.nfiles, bl);
    encode(file_i->dirstat.nsubdirs, bl);
    encode(file_i->rstat.rbytes, bl);
    encode(file_i->rstat.rfiles, bl);
    encode(file_i->rstat.rsubdirs, bl);
    encode(file_i->rstat.rctime, bl);
    dirfragtree.encode(bl);
    encode(symlink, bl);
    encode(file_i->dir_layout, bl);
    encode_xattrs();
    encode(inline_version, bl);
    encode(inline_data, bl);
    mempool_inode *policy_i = ppolicy ? pi : oi;
    encode(policy_i->quota, bl);
    encode(layout.pool_ns, bl);
    encode(any_i->btime, bl);
    encode(any_i->change_attr, bl);
    encode(file_i->export_pin, bl);
    encode(snap_btime, bl);
    ENCODE_FINISH(bl);
  }
  else {
    // legacy encoding, gated field-by-field on connection features
    ceph_assert(session->get_connection());

    encode(oi->ino, bl);
    encode(snapid, bl);
    encode(oi->rdev, bl);
    encode(version, bl);
    encode(xattr_version, bl);
    encode(ecap, bl);
    {
      ceph_file_layout legacy_layout;
      layout.to_legacy(&legacy_layout);
      encode(legacy_layout, bl);
    }
    encode(any_i->ctime, bl);
    encode(file_i->mtime, bl);
    encode(file_i->atime, bl);
    encode(file_i->time_warp_seq, bl);
    encode(file_i->size, bl);
    encode(max_size, bl);
    encode(file_i->truncate_size, bl);
    encode(file_i->truncate_seq, bl);
    encode(auth_i->mode, bl);
    encode((uint32_t)auth_i->uid, bl);
    encode((uint32_t)auth_i->gid, bl);
    encode(link_i->nlink, bl);
    encode(file_i->dirstat.nfiles, bl);
    encode(file_i->dirstat.nsubdirs, bl);
    encode(file_i->rstat.rbytes, bl);
    encode(file_i->rstat.rfiles, bl);
    encode(file_i->rstat.rsubdirs, bl);
    encode(file_i->rstat.rctime, bl);
    dirfragtree.encode(bl);
    encode(symlink, bl);
    auto& conn = session->get_connection();
    if (conn->has_feature(CEPH_FEATURE_DIRLAYOUTHASH)) {
      encode(file_i->dir_layout, bl);
    }
    encode_xattrs();
    if (conn->has_feature(CEPH_FEATURE_MDS_INLINE_DATA)) {
      encode(inline_version, bl);
      encode(inline_data, bl);
    }
    if (conn->has_feature(CEPH_FEATURE_MDS_QUOTA)) {
      mempool_inode *policy_i = ppolicy ?
pi : oi; + encode(policy_i->quota, bl); + } + if (conn->has_feature(CEPH_FEATURE_FS_FILE_LAYOUT_V2)) { + encode(layout.pool_ns, bl); + } + if (conn->has_feature(CEPH_FEATURE_FS_BTIME)) { + encode(any_i->btime, bl); + encode(any_i->change_attr, bl); + } + } + + return valid; +} + +void CInode::encode_cap_message(const MClientCaps::ref &m, Capability *cap) +{ + ceph_assert(cap); + + client_t client = cap->get_client(); + + bool pfile = filelock.is_xlocked_by_client(client) || (cap->issued() & CEPH_CAP_FILE_EXCL); + bool pauth = authlock.is_xlocked_by_client(client); + bool plink = linklock.is_xlocked_by_client(client); + bool pxattr = xattrlock.is_xlocked_by_client(client); + + mempool_inode *oi = &inode; + mempool_inode *pi = get_projected_inode(); + mempool_inode *i = (pfile|pauth|plink|pxattr) ? pi : oi; + + dout(20) << __func__ << " pfile " << pfile + << " pauth " << pauth << " plink " << plink << " pxattr " << pxattr + << " ctime " << i->ctime << dendl; + + i = pfile ? pi:oi; + m->set_layout(i->layout); + m->size = i->size; + m->truncate_seq = i->truncate_seq; + m->truncate_size = i->truncate_size; + m->mtime = i->mtime; + m->atime = i->atime; + m->ctime = i->ctime; + m->change_attr = i->change_attr; + m->time_warp_seq = i->time_warp_seq; + m->nfiles = i->dirstat.nfiles; + m->nsubdirs = i->dirstat.nsubdirs; + + if (cap->client_inline_version < i->inline_data.version) { + m->inline_version = cap->client_inline_version = i->inline_data.version; + if (i->inline_data.length() > 0) + m->inline_data = i->inline_data.get_data(); + } else { + m->inline_version = 0; + } + + // max_size is min of projected, actual. + uint64_t oldms = oi->client_ranges.count(client) ? oi->client_ranges[client].range.last : 0; + uint64_t newms = pi->client_ranges.count(client) ? pi->client_ranges[client].range.last : 0; + m->max_size = std::min(oldms, newms); + + i = pauth ? pi:oi; + m->head.mode = i->mode; + m->head.uid = i->uid; + m->head.gid = i->gid; + + i = plink ? 
pi:oi;
  m->head.nlink = i->nlink;

  using ceph::encode;
  // xattrs: only ship them if the client holds XATTR_SHARED and our
  // version is newer than what the client already has.
  i = pxattr ? pi:oi;
  auto ix = pxattr ? get_projected_xattrs() : &xattrs;
  if ((cap->pending() & CEPH_CAP_XATTR_SHARED) &&
      i->xattr_version > cap->client_xattr_version) {
    dout(10) << " including xattrs v " << i->xattr_version << dendl;
    encode(*ix, m->xattrbl);
    m->head.xattr_version = i->xattr_version;
    cap->client_xattr_version = i->xattr_version;
  }
}



// Encode this inode's core metadata (inode struct, symlink target,
// dirfragtree, xattrs, old_inodes, damage flags, snap info) for
// replication/migration.  Must stay in sync with _decode_base().
void CInode::_encode_base(bufferlist& bl, uint64_t features)
{
  using ceph::encode;
  encode(first, bl);
  encode(inode, bl, features);
  encode(symlink, bl);
  encode(dirfragtree, bl);
  encode(xattrs, bl);
  encode(old_inodes, bl, features);
  encode(damage_flags, bl);
  encode_snap(bl);
}

// Inverse of _encode_base(); fields must be decoded in the same order.
void CInode::_decode_base(bufferlist::const_iterator& p)
{
  using ceph::decode;
  decode(first, p);
  decode(inode, p);
  {
    // decode into a temporary string first, then copy into the
    // mempool-backed symlink member
    std::string tmp;
    decode(tmp, p);
    symlink = std::string_view(tmp);
  }
  decode(dirfragtree, p);
  // decode_noshare forces a private copy of the xattr buffers
  decode_noshare(xattrs, p);
  decode(old_inodes, p);
  decode(damage_flags, p);
  decode_snap(p);
}

// Encode the complete state of every lock on this inode, plus the
// loner cap.  Must stay in sync with _decode_locks_full().
void CInode::_encode_locks_full(bufferlist& bl)
{
  using ceph::encode;
  encode(authlock, bl);
  encode(linklock, bl);
  encode(dirfragtreelock, bl);
  encode(filelock, bl);
  encode(xattrlock, bl);
  encode(snaplock, bl);
  encode(nestlock, bl);
  encode(flocklock, bl);
  encode(policylock, bl);

  encode(loner_cap, bl);
}

// Inverse of _encode_locks_full(); same lock order.
void CInode::_decode_locks_full(bufferlist::const_iterator& p)
{
  using ceph::decode;
  decode(authlock, p);
  decode(linklock, p);
  decode(dirfragtreelock, p);
  decode(filelock, p);
  decode(xattrlock, p);
  decode(snaplock, p);
  decode(nestlock, p);
  decode(flocklock, p);
  decode(policylock, p);

  decode(loner_cap, p);
  set_loner_cap(loner_cap);
  want_loner_cap = loner_cap;  // for now, we'll eval() shortly.
}

// Encode per-lock replica state plus a need_recover flag telling the
// replica whether the auth MDS is still recovering.
void CInode::_encode_locks_state_for_replica(bufferlist& bl, bool need_recover)
{
  authlock.encode_state_for_replica(bl);
  linklock.encode_state_for_replica(bl);
  dirfragtreelock.encode_state_for_replica(bl);
  filelock.encode_state_for_replica(bl);
  nestlock.encode_state_for_replica(bl);
  xattrlock.encode_state_for_replica(bl);
  snaplock.encode_state_for_replica(bl);
  flocklock.encode_state_for_replica(bl);
  policylock.encode_state_for_replica(bl);
  using ceph::encode;
  encode(need_recover, bl);
}

// Like _encode_locks_state_for_replica(), but the scatterlocks
// (dirfragtree/file/nest) use the rejoin-specific encoding for replica
// 'rep'.  No need_recover flag is appended in the rejoin path.
void CInode::_encode_locks_state_for_rejoin(bufferlist& bl, int rep)
{
  authlock.encode_state_for_replica(bl);
  linklock.encode_state_for_replica(bl);
  dirfragtreelock.encode_state_for_rejoin(bl, rep);
  filelock.encode_state_for_rejoin(bl, rep);
  nestlock.encode_state_for_rejoin(bl, rep);
  xattrlock.encode_state_for_replica(bl);
  snaplock.encode_state_for_replica(bl);
  flocklock.encode_state_for_replica(bl);
  policylock.encode_state_for_replica(bl);
}

// Inverse of _encode_locks_state_for_replica().  If the auth MDS was
// recovering when it replicated us, mark every lock need_recover.
void CInode::_decode_locks_state(bufferlist::const_iterator& p, bool is_new)
{
  authlock.decode_state(p, is_new);
  linklock.decode_state(p, is_new);
  dirfragtreelock.decode_state(p, is_new);
  filelock.decode_state(p, is_new);
  nestlock.decode_state(p, is_new);
  xattrlock.decode_state(p, is_new);
  snaplock.decode_state(p, is_new);
  flocklock.decode_state(p, is_new);
  policylock.decode_state(p, is_new);

  using ceph::decode;
  bool need_recover;
  decode(need_recover, p);
  if (need_recover && is_new) {
    // Auth mds replicated this inode while it's recovering. Auth mds may take xlock on the lock
    // and change the object when replaying unsafe requests.
    authlock.mark_need_recover();
    linklock.mark_need_recover();
    dirfragtreelock.mark_need_recover();
    filelock.mark_need_recover();
    nestlock.mark_need_recover();
    xattrlock.mark_need_recover();
    snaplock.mark_need_recover();
    flocklock.mark_need_recover();
    policylock.mark_need_recover();
  }
}

// Inverse of _encode_locks_state_for_rejoin().  Unstable, un-wrlocked
// scatterlocks are queued on eval_locks for re-evaluation by the caller.
void CInode::_decode_locks_rejoin(bufferlist::const_iterator& p, MDSContext::vec& waiters,
				  list<SimpleLock*>& eval_locks, bool survivor)
{
  authlock.decode_state_rejoin(p, waiters, survivor);
  linklock.decode_state_rejoin(p, waiters, survivor);
  dirfragtreelock.decode_state_rejoin(p, waiters, survivor);
  filelock.decode_state_rejoin(p, waiters, survivor);
  nestlock.decode_state_rejoin(p, waiters, survivor);
  xattrlock.decode_state_rejoin(p, waiters, survivor);
  snaplock.decode_state_rejoin(p, waiters, survivor);
  flocklock.decode_state_rejoin(p, waiters, survivor);
  policylock.decode_state_rejoin(p, waiters, survivor);

  if (!dirfragtreelock.is_stable() && !dirfragtreelock.is_wrlocked())
    eval_locks.push_back(&dirfragtreelock);
  if (!filelock.is_stable() && !filelock.is_wrlocked())
    eval_locks.push_back(&filelock);
  if (!nestlock.is_stable() && !nestlock.is_wrlocked())
    eval_locks.push_back(&nestlock);
}


// IMPORT/EXPORT

// Serialize this inode for migration to another MDS: base metadata,
// state bits, popularity, replica map, scatterlock info for bounding
// dirfrags, full lock state, and file locks.
void CInode::encode_export(bufferlist& bl)
{
  ENCODE_START(5, 4, bl);
  _encode_base(bl, mdcache->mds->mdsmap->get_up_features());

  encode(state, bl);

  encode(pop, bl);

  encode(get_replicas(), bl);

  // include scatterlock info for any bounding CDirs
  bufferlist bounding;
  if (inode.is_dir())
    for (const auto &p : dirfrags) {
      CDir *dir = p.second;
      if (dir->state_test(CDir::STATE_EXPORTBOUND)) {
	encode(p.first, bounding);
	encode(dir->fnode.fragstat, bounding);
	encode(dir->fnode.accounted_fragstat, bounding);
	encode(dir->fnode.rstat, bounding);
	encode(dir->fnode.accounted_rstat, bounding);
	dout(10) << " encoded fragstat/rstat info for " << *dir << dendl;
      }
    }
  encode(bounding, bl);

  _encode_locks_full(bl);

  _encode_file_locks(bl);

  ENCODE_FINISH(bl);

  // pin the inode until the peer acks the export
  get(PIN_TEMPEXPORTING);
}

// Strip exported state after a successful migration; keep only the
// MASK_STATE_EXPORT_KEPT bits and drop the temp export pin.
void CInode::finish_export()
{
  state &= MASK_STATE_EXPORT_KEPT;

  pop.zero();

  // just in case!
  //dirlock.clear_updated();

  loner_cap = -1;

  put(PIN_TEMPEXPORTING);
}

// Inverse of encode_export(): install migrated inode state on this
// (newly auth) MDS, journaling dirty state into LogSegment 'ls'.
void CInode::decode_import(bufferlist::const_iterator& p,
			   LogSegment *ls)
{
  DECODE_START(5, p);

  _decode_base(p);

  unsigned s;
  decode(s, p);
  // we become auth; keep only the exported state bits
  state_set(STATE_AUTH | (s & MASK_STATE_EXPORTED));

  if (is_dirty()) {
    get(PIN_DIRTY);
    _mark_dirty(ls);
  }
  if (is_dirty_parent()) {
    get(PIN_DIRTYPARENT);
    mark_dirty_parent(ls);
  }

  decode(pop, p);

  decode(get_replicas(), p);
  if (is_replicated())
    get(PIN_REPLICATED);
  replica_nonce = 0;

  // decode fragstat info on bounding cdirs
  bufferlist bounding;
  decode(bounding, p);
  auto q = bounding.cbegin();
  while (!q.end()) {
    frag_t fg;
    decode(fg, q);
    CDir *dir = get_dirfrag(fg);
    ceph_assert(dir);  // we should have all bounds open

    // Only take the remote's fragstat/rstat if we are non-auth for
    // this dirfrag AND the lock is NOT in a scattered (MIX) state.
    // We know lock is stable, and MIX is the only state in which
    // the inode auth (who sent us this data) may not have the best
    // info.

    // HMM: Are there cases where dir->is_auth() is an insufficient
    // check because the dirfrag is under migration? That implies
    // it is frozen (and in a SYNC or LOCK state).  FIXME.

    if (dir->is_auth() ||
        filelock.get_state() == LOCK_MIX) {
      dout(10) << " skipped fragstat info for " << *dir << dendl;
      // still consume the encoded values to keep the stream aligned
      frag_info_t f;
      decode(f, q);
      decode(f, q);
    } else {
      decode(dir->fnode.fragstat, q);
      decode(dir->fnode.accounted_fragstat, q);
      dout(10) << " took fragstat info for " << *dir << dendl;
    }
    if (dir->is_auth() ||
        nestlock.get_state() == LOCK_MIX) {
      dout(10) << " skipped rstat info for " << *dir << dendl;
      // still consume the encoded values to keep the stream aligned
      nest_info_t n;
      decode(n, q);
      decode(n, q);
    } else {
      decode(dir->fnode.rstat, q);
      decode(dir->fnode.accounted_rstat, q);
      dout(10) << " took rstat info for " << *dir << dendl;
    }
  }

  _decode_locks_full(p);

  _decode_file_locks(p);

  DECODE_FINISH(p);
}


// Dump the on-disk inode representation (inode, symlink, old_inodes,
// dirfragtree) to a Formatter for admin-socket/debug output.
void InodeStoreBase::dump(Formatter *f) const
{
  inode.dump(f);
  f->dump_string("symlink", symlink);
  f->open_array_section("old_inodes");
  for (const auto &p : old_inodes) {
    f->open_object_section("old_inode");
    // The key is the last snapid, the first is in the mempool_old_inode
    f->dump_int("last", p.first);
    p.second.dump(f);
    f->close_section();  // old_inode
  }
  f->close_section();  // old_inodes

  f->open_object_section("dirfragtree");
  dirfragtree.dump(f);
  f->close_section();  // dirfragtree
}


// ceph-dencoder test instances.
void InodeStore::generate_test_instances(list<InodeStore*> &ls)
{
  InodeStore *populated = new InodeStore;
  populated->inode.ino = 0xdeadbeef;
  populated->symlink = "rhubarb";
  ls.push_back(populated);
}

// ceph-dencoder test instances.
void InodeStoreBare::generate_test_instances(list<InodeStoreBare*> &ls)
{
  InodeStoreBare *populated = new InodeStoreBare;
  populated->inode.ino = 0xdeadbeef;
  populated->symlink = "rhubarb";
  ls.push_back(populated);
}

// Scrub entry point: asynchronously validate this inode's on-disk state
// (backtrace, inode object, dirfrag rstats, snaprealm format) and fill
// in 'results'; 'fin' is completed when validation finishes.
void CInode::validate_disk_state(CInode::validated_data *results,
				 MDSContext *fin)
{
  class ValidationContinuation : public MDSContinuation {
  public:
    MDSContext *fin;
    CInode *in;
    CInode::validated_data *results;
    bufferlist bl;
    CInode *shadow_in;

    enum {
      START = 0,
      BACKTRACE,
      INODE,
      DIRFRAGS,
      SNAPREALM,
    };

    // Wire up the continuation stages:
    // START -> BACKTRACE -> (INODE | DIRFRAGS) -> SNAPREALM.
    ValidationContinuation(CInode *i,
                           CInode::validated_data *data_r,
                           MDSContext *fin_) :
      MDSContinuation(i->mdcache->mds->server),
      fin(fin_),
      in(i),
      results(data_r),
      shadow_in(NULL) {
      set_callback(START, static_cast<Continuation::stagePtr>(&ValidationContinuation::_start));
      set_callback(BACKTRACE, static_cast<Continuation::stagePtr>(&ValidationContinuation::_backtrace));
      set_callback(INODE, static_cast<Continuation::stagePtr>(&ValidationContinuation::_inode_disk));
      set_callback(DIRFRAGS, static_cast<Continuation::stagePtr>(&ValidationContinuation::_dirfrags));
      set_callback(SNAPREALM, static_cast<Continuation::stagePtr>(&ValidationContinuation::_snaprealm));
    }

    ~ValidationContinuation() override {
      if (shadow_in) {
	delete shadow_in;
	in->mdcache->num_shadow_inodes--;
      }
    }

    /**
     * Fetch backtrace and set tag if tag is non-empty
     */
    void fetch_backtrace_and_tag(CInode *in,
                                 std::string_view tag, bool is_internal,
                                 Context *fin, int *bt_r, bufferlist *bt)
    {
      const int64_t pool = in->get_backtrace_pool();
      object_t oid = CInode::get_object_name(in->ino(), frag_t(), "");

      // read the "parent" xattr (the backtrace) from the backing object
      ObjectOperation fetch;
      fetch.getxattr("parent", bt, bt_r);
      in->mdcache->mds->objecter->read(oid, object_locator_t(pool), fetch, CEPH_NOSNAP,
				       NULL, 0, fin);
      using ceph::encode;
      if (!is_internal) {
	// tag the object so a later pass can tell it was scrubbed;
	// fire-and-forget (no completion context)
	ObjectOperation scrub_tag;
	bufferlist tag_bl;
	encode(tag, tag_bl);
	scrub_tag.setxattr("scrub_tag", tag_bl);
	SnapContext snapc;
	in->mdcache->mds->objecter->mutate(oid, object_locator_t(pool), scrub_tag, snapc,
					   ceph::real_clock::now(),
					   0, NULL);
      }
    }

    // Stage START: kick off the async backtrace read (and tag write).
    // Returns true (done) immediately for symlinks.
    bool _start(int rval) {
      if (in->is_dirty()) {
	MDCache *mdcache = in->mdcache;       // for dout
	mempool_inode& inode = in->inode;     // for dout
	dout(20) << "validating a dirty CInode; results will be inconclusive"
		 << dendl;
      }
      if (in->is_symlink()) {
	// there's nothing to do for symlinks!
	return true;
      }

      // prefetch snaprealm's past parents
      if (in->snaprealm && !in->snaprealm->have_past_parents_open())
	in->snaprealm->open_parents(nullptr);

      C_OnFinisher *conf = new C_OnFinisher(get_io_callback(BACKTRACE),
					    in->mdcache->mds->finisher);

      std::string_view tag = in->scrub_infop->header->get_tag();
      bool is_internal = in->scrub_infop->header->is_internal_tag();
      // Rather than using the usual CInode::fetch_backtrace,
      // use a special variant that optionally writes a tag in the same
      // operation.
      fetch_backtrace_and_tag(in, tag, is_internal, conf, &results->backtrace.ondisk_read_retval, &bl);
      return false;
    }

    // Stage BACKTRACE: compare the on-disk backtrace against a freshly
    // built in-memory one, repairing it if requested; also checks the
    // InoTable and then dispatches to the dir or snaprealm checks.
    bool _backtrace(int rval) {
      // set up basic result reporting and make sure we got the data
      results->performed_validation = true; // at least, some of it!
      results->backtrace.checked = true;

      const int64_t pool = in->get_backtrace_pool();
      inode_backtrace_t& memory_backtrace = results->backtrace.memory_value;
      in->build_backtrace(pool, memory_backtrace);
      bool equivalent, divergent;
      int memory_newer;

      MDCache *mdcache = in->mdcache;  // For the benefit of dout
      const mempool_inode& inode = in->inode;  // For the benefit of dout

      // Ignore rval because it's the result of a FAILOK operation
      // from fetch_backtrace_and_tag: the real result is in
      // backtrace.ondisk_read_retval
      dout(20) << "ondisk_read_retval: " << results->backtrace.ondisk_read_retval << dendl;
      if (results->backtrace.ondisk_read_retval != 0) {
	results->backtrace.error_str << "failed to read off disk; see retval";
	// we probably have a new unwritten file!
	// so skip the backtrace scrub for this entry and say that all's well
	if (in->is_dirty_parent())
	  results->backtrace.passed = true;
	goto next;
      }

      // extract the backtrace, and compare it to a newly-constructed one
      try {
	auto p = bl.cbegin();
	using ceph::decode;
	decode(results->backtrace.ondisk_value, p);
	dout(10) << "decoded " << bl.length() << " bytes of backtrace successfully" << dendl;
      } catch (buffer::error&) {
	if (results->backtrace.ondisk_read_retval == 0 && rval != 0) {
	  // Cases where something has clearly gone wrong with the overall
	  // fetch op, though we didn't get a nonzero rc from the getxattr
	  // operation.  e.g. object missing.
	  results->backtrace.ondisk_read_retval = rval;
	}
	results->backtrace.error_str << "failed to decode on-disk backtrace ("
				     << bl.length() << " bytes)!";
	// we probably have a new unwritten file!
	// so skip the backtrace scrub for this entry and say that all's well
	if (in->is_dirty_parent())
	  results->backtrace.passed = true;

	goto next;
      }

      memory_newer = memory_backtrace.compare(results->backtrace.ondisk_value,
					      &equivalent, &divergent);

      if (divergent || memory_newer < 0) {
	// we're divergent, or on-disk version is newer
	results->backtrace.error_str << "On-disk backtrace is divergent or newer";
	// we probably have a new unwritten file!
	// so skip the backtrace scrub for this entry and say that all's well
	if (divergent && in->is_dirty_parent())
	  results->backtrace.passed = true;
      } else {
	results->backtrace.passed = true;
      }
next:

      if (!results->backtrace.passed && in->scrub_infop->header->get_repair()) {
	std::string path;
	in->make_path_string(path);
	in->mdcache->mds->clog->warn() << "bad backtrace on inode " << in->ino()
				       << "(" << path << "), rewriting it";
	in->mark_dirty_parent(in->mdcache->mds->mdlog->get_current_segment(),
			      false);
	// Flag that we repaired this BT so that it won't go into damagetable
	results->backtrace.repaired = true;
      }

      // If the inode's number was free in the InoTable, fix that
      // (#15619)
      {
	InoTable *inotable = mdcache->mds->inotable;

	dout(10) << "scrub: inotable ino = " << inode.ino << dendl;
	dout(10) << "scrub: inotable free says "
		 << inotable->is_marked_free(inode.ino) << dendl;

	if (inotable->is_marked_free(inode.ino)) {
	  LogChannelRef clog = in->mdcache->mds->clog;
	  clog->error() << "scrub: inode wrongly marked free: " << inode.ino;

	  if (in->scrub_infop->header->get_repair()) {
	    bool repaired = inotable->repair(inode.ino);
	    if (repaired) {
	      clog->error() << "inode table repaired for inode: " << inode.ino;

	      inotable->save();
	    } else {
	      clog->error() << "Cannot repair inotable while other operations"
		" are in progress";
	    }
	  }
	}
      }


      if (in->is_dir()) {
	return validate_directory_data();
      } else {
	// TODO: validate on-disk inode for normal files
	return check_inode_snaprealm();
      }
    }

    // For base dirs, fetch a shadow copy of the on-disk inode (stage
    // INODE); otherwise go straight to the rstat checks.
    bool validate_directory_data() {
      ceph_assert(in->is_dir());

      if (in->is_base()) {
	if (!shadow_in) {
	  shadow_in = new CInode(in->mdcache);
	  in->mdcache->create_unlinked_system_inode(shadow_in, in->inode.ino, in->inode.mode);
	  in->mdcache->num_shadow_inodes++;
	}
	shadow_in->fetch(get_internal_callback(INODE));
	return false;
      } else {
	// TODO: validate on-disk inode for non-base directories
	results->inode.passed = true;
	return check_dirfrag_rstats();
      }
    }

    // Stage INODE: compare the fetched on-disk inode (shadow_in) with
    // the in-memory one.
    bool _inode_disk(int rval) {
      results->inode.checked = true;
      results->inode.ondisk_read_retval = rval;
      results->inode.ondisk_value = shadow_in->inode;
      results->inode.memory_value = in->inode;

      mempool_inode& si = shadow_in->inode;
      mempool_inode& i = in->inode;
      if (si.version > i.version) {
	// uh, what?
	results->inode.error_str << "On-disk inode is newer than in-memory one; ";
	goto next;
      } else {
	bool divergent = false;
	int r = i.compare(si, &divergent);
	results->inode.passed = !divergent && r >= 0;
	if (!results->inode.passed) {
	  results->inode.error_str <<
	    "On-disk inode is divergent or newer than in-memory one; ";
	  goto next;
	}
      }
next:
      return check_dirfrag_rstats();
    }

    // Ensure every dirfrag leaf is loaded and locally scrubbed, then
    // continue at stage DIRFRAGS (possibly after async fetches).
    bool check_dirfrag_rstats() {
      MDSGatherBuilder gather(g_ceph_context);
      frag_vec_t leaves;
      in->dirfragtree.get_leaves(leaves);
      for (const auto& leaf : leaves) {
	CDir *dir = in->get_or_open_dirfrag(in->mdcache, leaf);
	dir->scrub_info();
	if (!dir->scrub_infop->header)
	  dir->scrub_infop->header = in->scrub_infop->header;
	if (dir->is_complete()) {
	  dir->scrub_local();
	} else {
	  dir->scrub_infop->need_scrub_local = true;
	  dir->fetch(gather.new_sub(), false);
	}
      }
      if (gather.has_subs()) {
	gather.set_finisher(get_internal_callback(DIRFRAGS));
	gather.activate();
	return false;
      } else {
	return immediate(DIRFRAGS, 0);
      }
    }

    // Stage DIRFRAGS: sum accounted fragstats/rstats across dirfrags
    // and compare against the inode's dirstat/rstat, repairing on request.
    bool _dirfrags(int rval) {
      int frags_errors = 0;
      // basic reporting setup
      results->raw_stats.checked = true;
      results->raw_stats.ondisk_read_retval = rval;

      results->raw_stats.memory_value.dirstat = in->inode.dirstat;
      results->raw_stats.memory_value.rstat = in->inode.rstat;
      frag_info_t& dir_info = results->raw_stats.ondisk_value.dirstat;
      nest_info_t& nest_info = results->raw_stats.ondisk_value.rstat;

      if (rval != 0) {
	results->raw_stats.error_str << "Failed to read dirfrags off disk";
	goto next;
      }

      // check each dirfrag...
      for (const auto &p : in->dirfrags) {
	CDir *dir = p.second;
	ceph_assert(dir->get_version() > 0);
	nest_info.add(dir->fnode.accounted_rstat);
	dir_info.add(dir->fnode.accounted_fragstat);
	if (dir->scrub_infop->pending_scrub_error) {
	  dir->scrub_infop->pending_scrub_error = false;
	  if (dir->scrub_infop->header->get_repair()) {
	    results->raw_stats.repaired = true;
	    results->raw_stats.error_str
	      << "dirfrag(" << p.first << ") has bad stats (will be fixed); ";
	  } else {
	    results->raw_stats.error_str
	      << "dirfrag(" << p.first << ") has bad stats; ";
	  }
	  frags_errors++;
	}
      }
      nest_info.rsubdirs++; // it gets one to account for self
      if (const sr_t *srnode = in->get_projected_srnode(); srnode)
	nest_info.rsnaps += srnode->snaps.size();

      // ...and that their sum matches our inode settings
      if (!dir_info.same_sums(in->inode.dirstat) ||
	  !nest_info.same_sums(in->inode.rstat)) {
	if (in->scrub_infop->header->get_repair()) {
	  results->raw_stats.error_str
	    << "freshly-calculated rstats don't match existing ones (will be fixed)";
	  in->mdcache->repair_inode_stats(in);
	  results->raw_stats.repaired = true;
	} else {
	  results->raw_stats.error_str
	    << "freshly-calculated rstats don't match existing ones";
	}
	goto next;
      }
      if (frags_errors > 0)
	goto next;

      results->raw_stats.passed = true;
next:
      // snaprealm
      return check_inode_snaprealm();
    }

    // If the inode has a snaprealm, make sure its past parents are open
    // before the SNAPREALM stage runs.
    bool check_inode_snaprealm() {
      if (!in->snaprealm)
	return true;

      if (!in->snaprealm->have_past_parents_open()) {
	in->snaprealm->open_parents(get_internal_callback(SNAPREALM));
	return false;
      } else {
	return immediate(SNAPREALM, 0);
      }
    }

    // Stage SNAPREALM: detect (and optionally upgrade) old-format
    // snaprealms that still carry past_parents.
    bool _snaprealm(int rval) {

      if (in->snaprealm->past_parents_dirty ||
	  !in->get_projected_srnode()->past_parents.empty()) {
	// temporarily store the error in the on-disk inode validation field
	results->inode.checked = true;
	results->inode.passed = false;
	if (in->scrub_infop->header->get_repair()) {
	  results->inode.error_str << "Inode has old format snaprealm (will upgrade)";
	  results->inode.repaired = true;
	  in->mdcache->upgrade_inode_snaprealm(in);
	} else {
	  results->inode.error_str << "Inode has old format snaprealm";
	}
      }
      return true;
    }

    // Final stage: compute the overall pass/fail verdict, record any
    // repairs in the scrub header, and complete the caller's context.
    void _done() override {
      if ((!results->raw_stats.checked || results->raw_stats.passed) &&
	  (!results->backtrace.checked || results->backtrace.passed) &&
	  (!results->inode.checked || results->inode.passed))
	results->passed_validation = true;

      // Flag that we did some repair work so that our repair operation
      // can be flushed at end of scrub
      if (results->backtrace.repaired ||
	  results->inode.repaired ||
	  results->raw_stats.repaired)
	in->scrub_infop->header->set_repaired();
      if (fin)
	fin->complete(get_rval());
    }
  };


  dout(10) << "scrub starting validate_disk_state on " << *this << dendl;
  ValidationContinuation *vc = new ValidationContinuation(this,
                                                          results,
                                                          fin);
  vc->begin();
}

// Dump the scrub validation results to a Formatter.
void CInode::validated_data::dump(Formatter *f) const
{
  f->open_object_section("results");
  {
    f->dump_bool("performed_validation", performed_validation);
    f->dump_bool("passed_validation", passed_validation);
    f->open_object_section("backtrace");
    {
      f->dump_bool("checked", backtrace.checked);
      f->dump_bool("passed", backtrace.passed);
      f->dump_int("read_ret_val", backtrace.ondisk_read_retval);
      f->dump_stream("ondisk_value") << backtrace.ondisk_value;
      f->dump_stream("memoryvalue") << backtrace.memory_value;
      f->dump_string("error_str", backtrace.error_str.str());
    }
    f->close_section();  // backtrace
    f->open_object_section("raw_stats");
    {
      f->dump_bool("checked", raw_stats.checked);
      f->dump_bool("passed", raw_stats.passed);
      f->dump_int("read_ret_val", raw_stats.ondisk_read_retval);
      f->dump_stream("ondisk_value.dirstat") << raw_stats.ondisk_value.dirstat;
      f->dump_stream("ondisk_value.rstat") << raw_stats.ondisk_value.rstat;
      f->dump_stream("memory_value.dirrstat")
	<< raw_stats.memory_value.dirstat;
      f->dump_stream("memory_value.rstat") << raw_stats.memory_value.rstat;
      f->dump_string("error_str", raw_stats.error_str.str());
    }
    f->close_section();  // raw_stats
    // dump failure return code
    int rc = 0;
    if (backtrace.checked && backtrace.ondisk_read_retval)
      rc = backtrace.ondisk_read_retval;
    if (inode.checked && inode.ondisk_read_retval)
      rc = inode.ondisk_read_retval;
    if (raw_stats.checked && raw_stats.ondisk_read_retval)
      rc = raw_stats.ondisk_read_retval;
    f->dump_int("return_code", rc);
  }
  f->close_section();  // results
}

// True unless some check was performed, failed, and was not repaired.
bool CInode::validated_data::all_damage_repaired() const
{
  bool unrepaired =
    (raw_stats.checked && !raw_stats.passed && !raw_stats.repaired)
    ||
    (backtrace.checked && !backtrace.passed && !backtrace.repaired)
    ||
    (inode.checked && !inode.passed && !inode.repaired);

  return !unrepaired;
}

// Dump this inode to a Formatter; 'flags' selects which sections
// (path, inode store, cache object, locks, state, caps, dirfrags)
// are emitted.
void CInode::dump(Formatter *f, int flags) const
{
  if (flags & DUMP_PATH) {
    std::string path;
    make_path_string(path, true);
    if (path.empty())
      path = "/";
    f->dump_string("path", path);
  }

  if (flags & DUMP_INODE_STORE_BASE)
    InodeStoreBase::dump(f);

  if (flags & DUMP_MDS_CACHE_OBJECT)
    MDSCacheObject::dump(f);

  if (flags & DUMP_LOCKS) {
    f->open_object_section("versionlock");
    versionlock.dump(f);
    f->close_section();

    f->open_object_section("authlock");
    authlock.dump(f);
    f->close_section();

    f->open_object_section("linklock");
    linklock.dump(f);
    f->close_section();

    f->open_object_section("dirfragtreelock");
    dirfragtreelock.dump(f);
    f->close_section();

    f->open_object_section("filelock");
    filelock.dump(f);
    f->close_section();

    f->open_object_section("xattrlock");
    xattrlock.dump(f);
    f->close_section();

    f->open_object_section("snaplock");
    snaplock.dump(f);
    f->close_section();

    f->open_object_section("nestlock");
    nestlock.dump(f);
    f->close_section();

    f->open_object_section("flocklock");
    flocklock.dump(f);
    f->close_section();

    f->open_object_section("policylock");
    policylock.dump(f);
    f->close_section();
  }

  if (flags & DUMP_STATE) {
    f->open_array_section("states");
    MDSCacheObject::dump_states(f);
    if (state_test(STATE_EXPORTING))
      f->dump_string("state", "exporting");
    if (state_test(STATE_OPENINGDIR))
      f->dump_string("state", "openingdir");
    if (state_test(STATE_FREEZING))
      f->dump_string("state", "freezing");
    if (state_test(STATE_FROZEN))
      f->dump_string("state", "frozen");
    if (state_test(STATE_AMBIGUOUSAUTH))
      f->dump_string("state", "ambiguousauth");
    if (state_test(STATE_EXPORTINGCAPS))
      f->dump_string("state", "exportingcaps");
    if (state_test(STATE_NEEDSRECOVER))
      f->dump_string("state", "needsrecover");
    if (state_test(STATE_PURGING))
      f->dump_string("state", "purging");
    if (state_test(STATE_DIRTYPARENT))
      f->dump_string("state", "dirtyparent");
    if (state_test(STATE_DIRTYRSTAT))
      f->dump_string("state", "dirtyrstat");
    if (state_test(STATE_STRAYPINNED))
      f->dump_string("state", "straypinned");
    if (state_test(STATE_FROZENAUTHPIN))
      f->dump_string("state", "frozenauthpin");
    if (state_test(STATE_DIRTYPOOL))
      f->dump_string("state", "dirtypool");
    if (state_test(STATE_ORPHAN))
      f->dump_string("state", "orphan");
    if (state_test(STATE_MISSINGOBJS))
      f->dump_string("state", "missingobjs");
    f->close_section();
  }

  if (flags & DUMP_CAPS) {
    f->open_array_section("client_caps");
    for (const auto &p : client_caps) {
      auto &client = p.first;
      auto cap = &p.second;
      f->open_object_section("client_cap");
      f->dump_int("client_id", client.v);
      f->dump_string("pending", ccap_string(cap->pending()));
      f->dump_string("issued", ccap_string(cap->issued()));
      f->dump_string("wanted", ccap_string(cap->wanted()));
      f->dump_int("last_sent", cap->get_last_seq());
      f->close_section();
    }
    f->close_section();

    f->dump_int("loner", loner_cap.v);
    f->dump_int("want_loner", want_loner_cap.v);

    f->open_array_section("mds_caps_wanted");
    for (const auto &p : mds_caps_wanted) {
      f->open_object_section("mds_cap_wanted");
      f->dump_int("rank", p.first);
      f->dump_string("cap", ccap_string(p.second));
      f->close_section();
    }
    f->close_section();
  }

  if (flags & DUMP_DIRFRAGS) {
    f->open_array_section("dirfrags");
    list<CDir*> dfs;
    get_dirfrags(dfs);
    for(const auto &dir: dfs) {
      f->open_object_section("dir");
      dir->dump(f, CDir::DUMP_DEFAULT | CDir::DUMP_ITEMS);
      dir->check_rstats();
      f->close_section();
    }
    f->close_section();
  }
}

/****** Scrub Stuff *****/

// Lazily allocate scrub_infop, seeding last-scrub stamps/versions from
// the projected inode.  const because it only materializes bookkeeping.
void CInode::scrub_info_create() const
{
  dout(25) << __func__ << dendl;
  ceph_assert(!scrub_infop);

  // break out of const-land to set up implicit initial state
  CInode *me = const_cast<CInode*>(this);
  mempool_inode *in = me->get_projected_inode();

  scrub_info_t *si = new scrub_info_t();
  si->scrub_start_stamp = si->last_scrub_stamp = in->last_scrub_stamp;
  si->scrub_start_version = si->last_scrub_version = in->last_scrub_version;

  me->scrub_infop = si;
}

// Free scrub_infop once no scrub is running and nothing is left to flush.
void CInode::scrub_maybe_delete_info()
{
  if (scrub_infop &&
      !scrub_infop->scrub_in_progress &&
      !scrub_infop->last_scrub_dirty) {
    delete scrub_infop;
    scrub_infop = NULL;
  }
}

// Begin scrubbing this inode: record the parent dentry, completion
// context and header, and prime per-dirfrag stamps.  Handles the case
// where the inode moved while a scrub was already in flight.
void CInode::scrub_initialize(CDentry *scrub_parent,
			      ScrubHeaderRef& header,
			      MDSContext *f)
{
  dout(20) << __func__ << " with scrub_version " << get_version() << dendl;
  if (scrub_is_in_progress()) {
    dout(20) << __func__ << " inode moved during scrub, reinitializing "
	     << dendl;
    ceph_assert(scrub_infop->scrub_parent);
    CDentry *dn = scrub_infop->scrub_parent;
    CDir *dir = dn->dir;
    dn->put(CDentry::PIN_SCRUBPARENT);
    ceph_assert(dir->scrub_infop && dir->scrub_infop->directory_scrubbing);
    dir->scrub_infop->directories_scrubbing.erase(dn->key());
    dir->scrub_infop->others_scrubbing.erase(dn->key());
  }
  scrub_info();
  // NOTE(review): scrub_info() already creates scrub_infop, so this
  // check looks redundant — confirm before removing.
  if (!scrub_infop)
    scrub_infop = new scrub_info_t();

  if (get_projected_inode()->is_dir()) {
    // fill in dirfrag_stamps with initial state
    frag_vec_t leaves;
    dirfragtree.get_leaves(leaves);
    for (const auto& leaf : leaves) {
      if (header->get_force())
	scrub_infop->dirfrag_stamps[leaf].reset();
      else
	scrub_infop->dirfrag_stamps[leaf];  // default-construct entry
    }
  }

  if (scrub_parent)
    scrub_parent->get(CDentry::PIN_SCRUBPARENT);
  scrub_infop->scrub_parent = scrub_parent;
  scrub_infop->on_finish = f;
  scrub_infop->scrub_in_progress = true;
  scrub_infop->children_scrubbed = false;
  scrub_infop->header = header;

  scrub_infop->scrub_start_version = get_version();
  scrub_infop->scrub_start_stamp = ceph_clock_now();
  // right now we don't handle remote inodes
}

// Pick the next dirfrag to scrub (one not yet stamped for this scrub
// pass) and stamp it.  Returns 0 on success, -ENOTDIR for non-dirs,
// and positive ENOENT when no frags remain.
// NOTE(review): the positive ENOENT vs negative -ENOTDIR asymmetry is
// surprising — verify callers compare against positive ENOENT before
// normalizing.
int CInode::scrub_dirfrag_next(frag_t* out_dirfrag)
{
  dout(20) << __func__ << dendl;
  ceph_assert(scrub_is_in_progress());

  if (!is_dir()) {
    return -ENOTDIR;
  }

  std::map<frag_t, scrub_stamp_info_t>::iterator i =
      scrub_infop->dirfrag_stamps.begin();

  while (i != scrub_infop->dirfrag_stamps.end()) {
    if (i->second.scrub_start_version < scrub_infop->scrub_start_version) {
      i->second.scrub_start_version = get_projected_version();
      i->second.scrub_start_stamp = ceph_clock_now();
      *out_dirfrag = i->first;
      dout(20) << " return frag " << *out_dirfrag << dendl;
      return 0;
    }
    ++i;
  }

  dout(20) << " no frags left, ENOENT " << dendl;
  return ENOENT;
}

// Collect the dirfrags that have been handed out by scrub_dirfrag_next()
// for this pass but have not yet finished.
void CInode::scrub_dirfrags_scrubbing(frag_vec_t* out_dirfrags)
{
  ceph_assert(out_dirfrags != NULL);
  ceph_assert(scrub_infop != NULL);

  out_dirfrags->clear();
  std::map<frag_t, scrub_stamp_info_t>::iterator i =
      scrub_infop->dirfrag_stamps.begin();

  while (i != scrub_infop->dirfrag_stamps.end()) {
    if (i->second.scrub_start_version >= scrub_infop->scrub_start_version) {
      if (i->second.last_scrub_version < scrub_infop->scrub_start_version)
	out_dirfrags->push_back(i->first);
    } else {
      // stamps are only advanced in order, so we can stop early
      return;
    }

    ++i;
  }
}

// Mark one dirfrag's scrub as complete by promoting its start
// stamp/version to last-scrub values.
void CInode::scrub_dirfrag_finished(frag_t dirfrag)
{
dout(20) << __func__ << " on frag " << dirfrag << dendl; + ceph_assert(scrub_is_in_progress()); + + std::map<frag_t, scrub_stamp_info_t>::iterator i = + scrub_infop->dirfrag_stamps.find(dirfrag); + ceph_assert(i != scrub_infop->dirfrag_stamps.end()); + + scrub_stamp_info_t &si = i->second; + si.last_scrub_stamp = si.scrub_start_stamp; + si.last_scrub_version = si.scrub_start_version; +} + +void CInode::scrub_aborted(MDSContext **c) { + dout(20) << __func__ << dendl; + ceph_assert(scrub_is_in_progress()); + + *c = nullptr; + std::swap(*c, scrub_infop->on_finish); + + if (scrub_infop->scrub_parent) { + CDentry *dn = scrub_infop->scrub_parent; + scrub_infop->scrub_parent = NULL; + dn->dir->scrub_dentry_finished(dn); + dn->put(CDentry::PIN_SCRUBPARENT); + } + + delete scrub_infop; + scrub_infop = nullptr; +} + +void CInode::scrub_finished(MDSContext **c) { + dout(20) << __func__ << dendl; + ceph_assert(scrub_is_in_progress()); + for (std::map<frag_t, scrub_stamp_info_t>::iterator i = + scrub_infop->dirfrag_stamps.begin(); + i != scrub_infop->dirfrag_stamps.end(); + ++i) { + if(i->second.last_scrub_version != i->second.scrub_start_version) { + derr << i->second.last_scrub_version << " != " + << i->second.scrub_start_version << dendl; + } + ceph_assert(i->second.last_scrub_version == i->second.scrub_start_version); + } + + scrub_infop->last_scrub_version = scrub_infop->scrub_start_version; + scrub_infop->last_scrub_stamp = scrub_infop->scrub_start_stamp; + scrub_infop->last_scrub_dirty = true; + scrub_infop->scrub_in_progress = false; + + if (scrub_infop->scrub_parent) { + CDentry *dn = scrub_infop->scrub_parent; + scrub_infop->scrub_parent = NULL; + dn->dir->scrub_dentry_finished(dn); + dn->put(CDentry::PIN_SCRUBPARENT); + } + + *c = scrub_infop->on_finish; + scrub_infop->on_finish = NULL; + + if (scrub_infop->header->get_origin() == this) { + // We are at the point that a tagging scrub was initiated + LogChannelRef clog = mdcache->mds->clog; + clog->info() << "scrub 
complete with tag '" + << scrub_infop->header->get_tag() << "'"; + } +} + +int64_t CInode::get_backtrace_pool() const +{ + if (is_dir()) { + return mdcache->mds->mdsmap->get_metadata_pool(); + } else { + // Files are required to have an explicit layout that specifies + // a pool + ceph_assert(inode.layout.pool_id != -1); + return inode.layout.pool_id; + } +} + +void CInode::maybe_export_pin(bool update) +{ + if (!g_conf()->mds_bal_export_pin) + return; + if (!is_dir() || !is_normal()) + return; + + mds_rank_t export_pin = get_export_pin(false); + if (export_pin == MDS_RANK_NONE && !update) + return; + + if (state_test(CInode::STATE_QUEUEDEXPORTPIN)) + return; + + bool queue = false; + for (auto p = dirfrags.begin(); p != dirfrags.end(); p++) { + CDir *dir = p->second; + if (!dir->is_auth()) + continue; + if (export_pin != MDS_RANK_NONE) { + if (dir->is_subtree_root()) { + // set auxsubtree bit or export it + if (!dir->state_test(CDir::STATE_AUXSUBTREE) || + export_pin != dir->get_dir_auth().first) + queue = true; + } else { + // create aux subtree or export it + queue = true; + } + } else { + // clear aux subtrees ? + queue = dir->state_test(CDir::STATE_AUXSUBTREE); + } + if (queue) { + state_set(CInode::STATE_QUEUEDEXPORTPIN); + mdcache->export_pin_queue.insert(this); + break; + } + } +} + +void CInode::set_export_pin(mds_rank_t rank) +{ + ceph_assert(is_dir()); + ceph_assert(is_projected()); + get_projected_inode()->export_pin = rank; +} + +mds_rank_t CInode::get_export_pin(bool inherit) const +{ + /* An inode that is export pinned may not necessarily be a subtree root, we + * need to traverse the parents. A base or system inode cannot be pinned. + * N.B. inodes not yet linked into a dir (i.e. anonymous inodes) will not + * have a parent yet. 
   */
  const CInode *in = this;
  while (true) {
    if (in->is_system())
      break;
    const CDentry *pdn = in->get_parent_dn();
    if (!pdn)
      break;
    // ignore export pin for unlinked directory
    if (in->get_inode().nlink == 0)
      break;
    if (in->get_inode().export_pin >= 0)
      return in->get_inode().export_pin;

    // only walk up towards the root when inheritance was requested
    if (!inherit)
      break;
    in = pdn->get_dir()->inode;
  }
  return MDS_RANK_NONE;
}

// An inode may be exported to 'dest' unless it is explicitly pinned
// (inherited pins included) to some other rank.
bool CInode::is_exportable(mds_rank_t dest) const
{
  mds_rank_t pin = get_export_pin();
  if (pin == dest) {
    return true;
  } else if (pin >= 0) {
    return false;
  } else {
    return true;
  }
}

MEMPOOL_DEFINE_OBJECT_FACTORY(CInode, co_inode, mds_co);
diff --git a/src/mds/CInode.h b/src/mds/CInode.h
new file mode 100644
index 00000000..cbe8779a
--- /dev/null
+++ b/src/mds/CInode.h
@@ -0,0 +1,1227 @@
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */



#ifndef CEPH_CINODE_H
#define CEPH_CINODE_H

#include <list>
#include <map>
#include <set>
#include <string_view>

#include "common/config.h"
#include "include/counter.h"
#include "include/elist.h"
#include "include/types.h"
#include "include/lru.h"
#include "include/compact_set.h"

#include "MDSCacheObject.h"
#include "MDSContext.h"
#include "flock.h"

#include "CDentry.h"
#include "SimpleLock.h"
#include "ScatterLock.h"
#include "LocalLock.h"
#include "Capability.h"
#include "SnapRealm.h"
#include "Mutation.h"

#include "messages/MClientCaps.h"

#define dout_context g_ceph_context

class Context;
class CDentry;
class CDir;
class CInode;
class MDCache;
class LogSegment;
struct SnapRealm;
class Session;
struct ObjectOperation;
class EMetaBlob;


ostream& operator<<(ostream& out, const CInode& in);

// pairs an inode lock id with the cap bits that allow writing under it
struct cinode_lock_info_t {
  int lock;
  int wr_caps;
};

extern cinode_lock_info_t cinode_lock_info[];
extern int num_cinode_locks;


/**
 * Base class for CInode, containing the backing store data and
 * serialization methods. This exists so that we can read and
 * handle CInodes from the backing store without hitting all
 * the business logic in CInode proper.
 */
class InodeStoreBase {
public:
  typedef inode_t<mempool::mds_co::pool_allocator> mempool_inode;
  typedef old_inode_t<mempool::mds_co::pool_allocator> mempool_old_inode;
  typedef mempool::mds_co::compact_map<snapid_t, mempool_old_inode> mempool_old_inode_map;
  typedef xattr_map<mempool::mds_co::pool_allocator> mempool_xattr_map; // FIXME bufferptr not in mempool

  mempool_inode inode;             // the inode itself
  mempool::mds_co::string symlink; // symlink dest, if symlink
  mempool_xattr_map xattrs;
  fragtree_t dirfragtree;          // dir frag tree, if any. always consistent with our dirfrag map.
  mempool_old_inode_map old_inodes; // key = last, value.first = first
  snapid_t oldest_snap = CEPH_NOSNAP;
  damage_flags_t damage_flags = 0;

  InodeStoreBase() {}

  /* Helpers */
  bool is_file() const { return inode.is_file(); }
  bool is_symlink() const { return inode.is_symlink(); }
  bool is_dir() const { return inode.is_dir(); }
  static object_t get_object_name(inodeno_t ino, frag_t fg, std::string_view suffix);

  /* Full serialization for use in ".inode" root inode objects */
  void encode(bufferlist &bl, uint64_t features, const bufferlist *snap_blob=NULL) const;
  void decode(bufferlist::const_iterator &bl, bufferlist& snap_blob);

  /* Serialization without ENCODE_START/FINISH blocks for use embedded in dentry */
  void encode_bare(bufferlist &bl, uint64_t features, const bufferlist *snap_blob=NULL) const;
  void decode_bare(bufferlist::const_iterator &bl, bufferlist &snap_blob, __u8 struct_v=5);

  /* For test/debug output */
  void dump(Formatter *f) const;

  /* For use by offline tools */
  __u32 hash_dentry_name(std::string_view dn);
  frag_t pick_dirfrag(std::string_view dn);
};

// non-template entry point that forwards to the allocator-templated
// decode_noshare overload for the mds_co xattr map
inline void decode_noshare(InodeStoreBase::mempool_xattr_map& xattrs,
                           ceph::buffer::list::const_iterator &p)
{
  decode_noshare<mempool::mds_co::pool_allocator>(xattrs, p);
}

class InodeStore : public InodeStoreBase {
public:
  // FIXME bufferlist not part of mempool
  bufferlist snap_blob;  // Encoded copy of SnapRealm, because we can't
                         // rehydrate it without full MDCache
  void encode(bufferlist &bl, uint64_t features) const {
    InodeStoreBase::encode(bl, features, &snap_blob);
  }
  void decode(bufferlist::const_iterator &bl) {
    InodeStoreBase::decode(bl, snap_blob);
  }
  void encode_bare(bufferlist &bl, uint64_t features) const {
    InodeStoreBase::encode_bare(bl, features, &snap_blob);
  }
  void decode_bare(bufferlist::const_iterator &bl) {
    InodeStoreBase::decode_bare(bl, snap_blob);
  }

  static void
generate_test_instances(std::list<InodeStore*>& ls); +}; +WRITE_CLASS_ENCODER_FEATURES(InodeStore) + +// just for ceph-dencoder +class InodeStoreBare : public InodeStore { +public: + void encode(bufferlist &bl, uint64_t features) const { + InodeStore::encode_bare(bl, features); + } + void decode(bufferlist::const_iterator &bl) { + InodeStore::decode_bare(bl); + } + static void generate_test_instances(std::list<InodeStoreBare*>& ls); +}; +WRITE_CLASS_ENCODER_FEATURES(InodeStoreBare) + +// cached inode wrapper +class CInode : public MDSCacheObject, public InodeStoreBase, public Counter<CInode> { + public: + MEMPOOL_CLASS_HELPERS(); + // -- pins -- + static const int PIN_DIRFRAG = -1; + static const int PIN_CAPS = 2; // client caps + static const int PIN_IMPORTING = -4; // importing + static const int PIN_OPENINGDIR = 7; + static const int PIN_REMOTEPARENT = 8; + static const int PIN_BATCHOPENJOURNAL = 9; + static const int PIN_SCATTERED = 10; + static const int PIN_STICKYDIRS = 11; + //static const int PIN_PURGING = -12; + static const int PIN_FREEZING = 13; + static const int PIN_FROZEN = 14; + static const int PIN_IMPORTINGCAPS = -15; + static const int PIN_PASTSNAPPARENT = -16; + static const int PIN_OPENINGSNAPPARENTS = 17; + static const int PIN_TRUNCATING = 18; + static const int PIN_STRAY = 19; // we pin our stray inode while active + static const int PIN_NEEDSNAPFLUSH = 20; + static const int PIN_DIRTYRSTAT = 21; + static const int PIN_EXPORTINGCAPS = 22; + static const int PIN_DIRTYPARENT = 23; + static const int PIN_DIRWAITER = 24; + static const int PIN_SCRUBQUEUE = 25; + + std::string_view pin_name(int p) const override { + switch (p) { + case PIN_DIRFRAG: return "dirfrag"; + case PIN_CAPS: return "caps"; + case PIN_IMPORTING: return "importing"; + case PIN_OPENINGDIR: return "openingdir"; + case PIN_REMOTEPARENT: return "remoteparent"; + case PIN_BATCHOPENJOURNAL: return "batchopenjournal"; + case PIN_SCATTERED: return "scattered"; + case PIN_STICKYDIRS: 
return "stickydirs"; + //case PIN_PURGING: return "purging"; + case PIN_FREEZING: return "freezing"; + case PIN_FROZEN: return "frozen"; + case PIN_IMPORTINGCAPS: return "importingcaps"; + case PIN_EXPORTINGCAPS: return "exportingcaps"; + case PIN_PASTSNAPPARENT: return "pastsnapparent"; + case PIN_OPENINGSNAPPARENTS: return "openingsnapparents"; + case PIN_TRUNCATING: return "truncating"; + case PIN_STRAY: return "stray"; + case PIN_NEEDSNAPFLUSH: return "needsnapflush"; + case PIN_DIRTYRSTAT: return "dirtyrstat"; + case PIN_DIRTYPARENT: return "dirtyparent"; + case PIN_DIRWAITER: return "dirwaiter"; + case PIN_SCRUBQUEUE: return "scrubqueue"; + default: return generic_pin_name(p); + } + } + + // -- dump flags -- + static const int DUMP_INODE_STORE_BASE = (1 << 0); + static const int DUMP_MDS_CACHE_OBJECT = (1 << 1); + static const int DUMP_LOCKS = (1 << 2); + static const int DUMP_STATE = (1 << 3); + static const int DUMP_CAPS = (1 << 4); + static const int DUMP_PATH = (1 << 5); + static const int DUMP_DIRFRAGS = (1 << 6); + static const int DUMP_ALL = (-1); + static const int DUMP_DEFAULT = DUMP_ALL & (~DUMP_PATH) & (~DUMP_DIRFRAGS); + + // -- state -- + static const int STATE_EXPORTING = (1<<0); // on nonauth bystander. 
+ static const int STATE_OPENINGDIR = (1<<1); + static const int STATE_FREEZING = (1<<2); + static const int STATE_FROZEN = (1<<3); + static const int STATE_AMBIGUOUSAUTH = (1<<4); + static const int STATE_EXPORTINGCAPS = (1<<5); + static const int STATE_NEEDSRECOVER = (1<<6); + static const int STATE_RECOVERING = (1<<7); + static const int STATE_PURGING = (1<<8); + static const int STATE_DIRTYPARENT = (1<<9); + static const int STATE_DIRTYRSTAT = (1<<10); + static const int STATE_STRAYPINNED = (1<<11); + static const int STATE_FROZENAUTHPIN = (1<<12); + static const int STATE_DIRTYPOOL = (1<<13); + static const int STATE_REPAIRSTATS = (1<<14); + static const int STATE_MISSINGOBJS = (1<<15); + static const int STATE_EVALSTALECAPS = (1<<16); + static const int STATE_QUEUEDEXPORTPIN = (1<<17); + static const int STATE_TRACKEDBYOFT = (1<<18); // tracked by open file table + static const int STATE_DELAYEDEXPORTPIN = (1<<19); + // orphan inode needs notification of releasing reference + static const int STATE_ORPHAN = STATE_NOTIFYREF; + + static const int MASK_STATE_EXPORTED = + (STATE_DIRTY|STATE_NEEDSRECOVER|STATE_DIRTYPARENT|STATE_DIRTYPOOL); + static const int MASK_STATE_EXPORT_KEPT = + (STATE_FROZEN|STATE_AMBIGUOUSAUTH|STATE_EXPORTINGCAPS| + STATE_QUEUEDEXPORTPIN|STATE_TRACKEDBYOFT|STATE_DELAYEDEXPORTPIN); + + // -- waiters -- + static const uint64_t WAIT_DIR = (1<<0); + static const uint64_t WAIT_FROZEN = (1<<1); + static const uint64_t WAIT_TRUNC = (1<<2); + static const uint64_t WAIT_FLOCK = (1<<3); + + static const uint64_t WAIT_ANY_MASK = (uint64_t)(-1); + + // misc + static const unsigned EXPORT_NONCE = 1; // nonce given to replicas created by export + + ostream& print_db_line_prefix(ostream& out) override; + + public: + MDCache *mdcache; + + SnapRealm *snaprealm = nullptr; + SnapRealm *containing_realm = nullptr; + snapid_t first, last; + mempool::mds_co::compact_set<snapid_t> dirty_old_rstats; + + class scrub_stamp_info_t { + public: + /// version we 
started our latest scrub (whether in-progress or finished) + version_t scrub_start_version = 0; + /// time we started our latest scrub (whether in-progress or finished) + utime_t scrub_start_stamp; + /// version we started our most recent finished scrub + version_t last_scrub_version = 0; + /// time we started our most recent finished scrub + utime_t last_scrub_stamp; + scrub_stamp_info_t() {} + void reset() { + scrub_start_version = last_scrub_version = 0; + scrub_start_stamp = last_scrub_stamp = utime_t(); + } + }; + + class scrub_info_t : public scrub_stamp_info_t { + public: + CDentry *scrub_parent = nullptr; + MDSContext *on_finish = nullptr; + + bool last_scrub_dirty = false; /// are our stamps dirty with respect to disk state? + bool scrub_in_progress = false; /// are we currently scrubbing? + bool children_scrubbed = false; + + /// my own (temporary) stamps and versions for each dirfrag we have + std::map<frag_t, scrub_stamp_info_t> dirfrag_stamps; // XXX not part of mempool + + ScrubHeaderRef header; + + scrub_info_t() {} + }; + + const scrub_info_t *scrub_info() const{ + if (!scrub_infop) + scrub_info_create(); + return scrub_infop; + } + + ScrubHeaderRef get_scrub_header() { + if (scrub_infop == nullptr) { + return nullptr; + } else { + return scrub_infop->header; + } + } + + bool scrub_is_in_progress() const { + return (scrub_infop && scrub_infop->scrub_in_progress); + } + /** + * Start scrubbing on this inode. That could be very short if it's + * a file, or take a long time if we're recursively scrubbing a directory. + * @pre It is not currently scrubbing + * @post it has set up internal scrubbing state + * @param scrub_version What version are we scrubbing at (usually, parent + * directory's get_projected_version()) + */ + void scrub_initialize(CDentry *scrub_parent, + ScrubHeaderRef& header, + MDSContext *f); + /** + * Get the next dirfrag to scrub. Gives you a frag_t in output param which + * you must convert to a CDir (and possibly load off disk). 
+ * @param dir A pointer to frag_t, will be filled in with the next dirfrag to + * scrub if there is one. + * @returns 0 on success, you should scrub the passed-out frag_t right now; + * ENOENT: There are no remaining dirfrags to scrub + * <0 There was some other error (It will return -ENOTDIR if not a directory) + */ + int scrub_dirfrag_next(frag_t* out_dirfrag); + /** + * Get the currently scrubbing dirfrags. When returned, the + * passed-in list will be filled in with all frag_ts which have + * been returned from scrub_dirfrag_next but not sent back + * via scrub_dirfrag_finished. + */ + void scrub_dirfrags_scrubbing(frag_vec_t *out_dirfrags); + /** + * Report to the CInode that a dirfrag it owns has been scrubbed. Call + * this for every frag_t returned from scrub_dirfrag_next(). + * @param dirfrag The frag_t that was scrubbed + */ + void scrub_dirfrag_finished(frag_t dirfrag); + /** + * Call this once the scrub has been completed, whether it's a full + * recursive scrub on a directory or simply the data on a file (or + * anything in between). + * @param c An out param which is filled in with a Context* that must + * be complete()ed. + */ + void scrub_finished(MDSContext **c); + + void scrub_aborted(MDSContext **c); + + /** + * Report to the CInode that alldirfrags it owns have been scrubbed. + */ + void scrub_children_finished() { + scrub_infop->children_scrubbed = true; + } + void scrub_set_finisher(MDSContext *c) { + ceph_assert(!scrub_infop->on_finish); + scrub_infop->on_finish = c; + } + +private: + /** + * Create a scrub_info_t struct for the scrub_infop pointer. 
+ */ + void scrub_info_create() const; + /** + * Delete the scrub_info_t struct if it's not got any useful data + */ + void scrub_maybe_delete_info(); +public: + + bool is_multiversion() const { + return snaprealm || // other snaprealms will link to me + inode.is_dir() || // links to me in other snaps + inode.nlink > 1 || // there are remote links, possibly snapped, that will need to find me + !old_inodes.empty(); // once multiversion, always multiversion. until old_inodes gets cleaned out. + } + snapid_t get_oldest_snap(); + + uint64_t last_journaled = 0; // log offset for the last time i was journaled + //loff_t last_open_journaled; // log offset for the last journaled EOpen + utime_t last_dirstat_prop; + + + // list item node for when we have unpropagated rstat data + elist<CInode*>::item dirty_rstat_item; + + bool is_dirty_rstat() { + return state_test(STATE_DIRTYRSTAT); + } + void mark_dirty_rstat(); + void clear_dirty_rstat(); + + //bool hack_accessed = false; + //utime_t hack_load_stamp; + + /** + * Projection methods, used to store inode changes until they have been journaled, + * at which point they are popped. + * Usage: + * project_inode as needed. If you're changing xattrs or sr_t, then pass true + * as needed then change the xattrs/snapnode member as needed. (Dirty + * exception: project_past_snaprealm_parent allows you to project the + * snapnode after doing project_inode (i.e. you don't need to pass + * snap=true). + * + * Then, journal. Once journaling is done, pop_and_dirty_projected_inode. + * This function will take care of the inode itself, the xattrs, and the snaprealm. 
+ */ + + class projected_inode { + public: + static sr_t* const UNDEF_SRNODE; + + mempool_inode inode; + std::unique_ptr<mempool_xattr_map> xattrs; + sr_t *snapnode = UNDEF_SRNODE; + + projected_inode() = delete; + explicit projected_inode(const mempool_inode &in) : inode(in) {} + }; + +private: + mempool::mds_co::list<projected_inode> projected_nodes; // projected values (only defined while dirty) + size_t num_projected_xattrs = 0; + size_t num_projected_srnodes = 0; + +public: + CInode::projected_inode &project_inode(bool xattr = false, bool snap = false); + void pop_and_dirty_projected_inode(LogSegment *ls); + + projected_inode *get_projected_node() { + if (projected_nodes.empty()) + return NULL; + else + return &projected_nodes.back(); + } + + version_t get_projected_version() const { + if (projected_nodes.empty()) + return inode.version; + else + return projected_nodes.back().inode.version; + } + bool is_projected() const { + return !projected_nodes.empty(); + } + + const mempool_inode *get_projected_inode() const { + if (projected_nodes.empty()) + return &inode; + else + return &projected_nodes.back().inode; + } + mempool_inode *get_projected_inode() { + if (projected_nodes.empty()) + return &inode; + else + return &projected_nodes.back().inode; + } + mempool_inode *get_previous_projected_inode() { + ceph_assert(!projected_nodes.empty()); + auto it = projected_nodes.rbegin(); + ++it; + if (it != projected_nodes.rend()) + return &it->inode; + else + return &inode; + } + + mempool_xattr_map *get_projected_xattrs() { + if (num_projected_xattrs > 0) { + for (auto it = projected_nodes.rbegin(); it != projected_nodes.rend(); ++it) + if (it->xattrs) + return it->xattrs.get(); + } + return &xattrs; + } + mempool_xattr_map *get_previous_projected_xattrs() { + if (num_projected_xattrs > 0) { + for (auto it = ++projected_nodes.rbegin(); it != projected_nodes.rend(); ++it) + if (it->xattrs) + return it->xattrs.get(); + } + return &xattrs; + } + + sr_t 
*prepare_new_srnode(snapid_t snapid); + void project_snaprealm(sr_t *new_srnode); + sr_t *project_snaprealm(snapid_t snapid=0) { + sr_t* new_srnode = prepare_new_srnode(snapid); + project_snaprealm(new_srnode); + return new_srnode; + } + const sr_t *get_projected_srnode() const { + if (num_projected_srnodes > 0) { + for (auto it = projected_nodes.rbegin(); it != projected_nodes.rend(); ++it) + if (it->snapnode != projected_inode::UNDEF_SRNODE) + return it->snapnode; + } + if (snaprealm) + return &snaprealm->srnode; + else + return NULL; + } + + void mark_snaprealm_global(sr_t *new_srnode); + void clear_snaprealm_global(sr_t *new_srnode); + bool is_projected_snaprealm_global() const; + + void record_snaprealm_past_parent(sr_t *new_snap, SnapRealm *newparent); + void record_snaprealm_parent_dentry(sr_t *new_snap, SnapRealm *newparent, + CDentry *dn, bool primary_dn); + void project_snaprealm_past_parent(SnapRealm *newparent); + void early_pop_projected_snaprealm(); + +private: + void pop_projected_snaprealm(sr_t *next_snaprealm, bool early); + +public: + mempool_old_inode& cow_old_inode(snapid_t follows, bool cow_head); + void split_old_inode(snapid_t snap); + mempool_old_inode *pick_old_inode(snapid_t last); + void pre_cow_old_inode(); + bool has_snap_data(snapid_t s); + void purge_stale_snap_data(const std::set<snapid_t>& snaps); + + // -- cache infrastructure -- +private: + mempool::mds_co::compact_map<frag_t,CDir*> dirfrags; // cached dir fragments under this Inode + + //for the purpose of quickly determining whether there's a subtree root or exporting dir + int num_subtree_roots = 0; + int num_exporting_dirs = 0; + + int stickydir_ref = 0; + scrub_info_t *scrub_infop = nullptr; + +public: + bool has_dirfrags() { return !dirfrags.empty(); } + CDir* get_dirfrag(frag_t fg) { + auto pi = dirfrags.find(fg); + if (pi != dirfrags.end()) { + //assert(g_conf()->debug_mds < 2 || dirfragtree.is_leaf(fg)); // performance hack FIXME + return pi->second; + } + return NULL; + 
} + bool get_dirfrags_under(frag_t fg, std::list<CDir*>& ls); + CDir* get_approx_dirfrag(frag_t fg); + + template<typename Container> + void get_dirfrags(Container& ls) const { + // all dirfrags + if constexpr (std::is_same_v<Container, std::vector<CDir*>>) + ls.reserve(ls.size() + dirfrags.size()); + for (const auto &p : dirfrags) + ls.push_back(p.second); + } + template<typename Container> + void get_nested_dirfrags(Container& ls) const { + // dirfrags in same subtree + if constexpr (std::is_same_v<Container, std::vector<CDir*>>) + ls.reserve(ls.size() + dirfrags.size() - num_subtree_roots); + for (const auto &p : dirfrags) { + typename Container::value_type dir = p.second; + if (!dir->is_subtree_root()) + ls.push_back(dir); + } + } + template<typename Container> + void get_subtree_dirfrags(Container& ls) { + // dirfrags that are roots of new subtrees + if constexpr (std::is_same_v<Container, std::vector<CDir*>>) + ls.reserve(ls.size() + num_subtree_roots); + for (const auto &p : dirfrags) { + typename Container::value_type dir = p.second; + if (dir->is_subtree_root()) + ls.push_back(dir); + } + } + int get_num_subtree_roots() const { + return num_subtree_roots; + } + + CDir *get_or_open_dirfrag(MDCache *mdcache, frag_t fg); + CDir *add_dirfrag(CDir *dir); + void close_dirfrag(frag_t fg); + void close_dirfrags(); + bool has_subtree_root_dirfrag(int auth=-1); + bool has_subtree_or_exporting_dirfrag(); + + void force_dirfrags(); + void verify_dirfrags(); + + void get_stickydirs(); + void put_stickydirs(); + + protected: + // parent dentries in cache + CDentry *parent = nullptr; // primary link + mempool::mds_co::compact_set<CDentry*> remote_parents; // if hard linked + + mempool::mds_co::list<CDentry*> projected_parent; // for in-progress rename, (un)link, etc. 
+ + mds_authority_t inode_auth = CDIR_AUTH_DEFAULT; + + // -- distributed state -- +protected: + // file capabilities + using mempool_cap_map = mempool::mds_co::map<client_t, Capability>; + mempool_cap_map client_caps; // client -> caps + mempool::mds_co::compact_map<int32_t, int32_t> mds_caps_wanted; // [auth] mds -> caps wanted + int replica_caps_wanted = 0; // [replica] what i've requested from auth + int num_caps_wanted = 0; + +public: + mempool::mds_co::set<client_t> client_snap_caps; + mempool::mds_co::compact_map<snapid_t, mempool::mds_co::set<client_t> > client_need_snapflush; + + void add_need_snapflush(CInode *snapin, snapid_t snapid, client_t client); + void remove_need_snapflush(CInode *snapin, snapid_t snapid, client_t client); + pair<bool,bool> split_need_snapflush(CInode *cowin, CInode *in); + +protected: + + ceph_lock_state_t *fcntl_locks = nullptr; + ceph_lock_state_t *flock_locks = nullptr; + + ceph_lock_state_t *get_fcntl_lock_state() { + if (!fcntl_locks) + fcntl_locks = new ceph_lock_state_t(g_ceph_context, CEPH_LOCK_FCNTL); + return fcntl_locks; + } + void clear_fcntl_lock_state() { + delete fcntl_locks; + fcntl_locks = NULL; + } + ceph_lock_state_t *get_flock_lock_state() { + if (!flock_locks) + flock_locks = new ceph_lock_state_t(g_ceph_context, CEPH_LOCK_FLOCK); + return flock_locks; + } + void clear_flock_lock_state() { + delete flock_locks; + flock_locks = NULL; + } + void clear_file_locks() { + clear_fcntl_lock_state(); + clear_flock_lock_state(); + } + void _encode_file_locks(bufferlist& bl) const { + using ceph::encode; + bool has_fcntl_locks = fcntl_locks && !fcntl_locks->empty(); + encode(has_fcntl_locks, bl); + if (has_fcntl_locks) + encode(*fcntl_locks, bl); + bool has_flock_locks = flock_locks && !flock_locks->empty(); + encode(has_flock_locks, bl); + if (has_flock_locks) + encode(*flock_locks, bl); + } + void _decode_file_locks(bufferlist::const_iterator& p) { + using ceph::decode; + bool has_fcntl_locks; + 
decode(has_fcntl_locks, p); + if (has_fcntl_locks) + decode(*get_fcntl_lock_state(), p); + else + clear_fcntl_lock_state(); + bool has_flock_locks; + decode(has_flock_locks, p); + if (has_flock_locks) + decode(*get_flock_lock_state(), p); + else + clear_flock_lock_state(); + } + + // LogSegment lists i (may) belong to +public: + elist<CInode*>::item item_dirty; + elist<CInode*>::item item_caps; + elist<CInode*>::item item_open_file; + elist<CInode*>::item item_dirty_parent; + elist<CInode*>::item item_dirty_dirfrag_dir; + elist<CInode*>::item item_dirty_dirfrag_nest; + elist<CInode*>::item item_dirty_dirfrag_dirfragtree; + elist<CInode*>::item item_scrub; + + // also update RecoveryQueue::RecoveryQueue() if you change this + elist<CInode*>::item& item_recover_queue = item_dirty_dirfrag_dir; + elist<CInode*>::item& item_recover_queue_front = item_dirty_dirfrag_nest; + +public: + int auth_pin_freeze_allowance = 0; + + inode_load_vec_t pop; + elist<CInode*>::item item_pop_lru; + + // friends + friend class Server; + friend class Locker; + friend class Migrator; + friend class MDCache; + friend class StrayManager; + friend class CDir; + friend class CInodeExport; + + // --------------------------- + CInode() = delete; + CInode(MDCache *c, bool auth=true, snapid_t f=2, snapid_t l=CEPH_NOSNAP); + ~CInode() override { + close_dirfrags(); + close_snaprealm(); + clear_file_locks(); + ceph_assert(num_projected_xattrs == 0); + ceph_assert(num_projected_srnodes == 0); + ceph_assert(num_caps_wanted == 0); + ceph_assert(num_subtree_roots == 0); + ceph_assert(num_exporting_dirs == 0); + } + + + // -- accessors -- + bool is_root() const { return inode.ino == MDS_INO_ROOT; } + bool is_stray() const { return MDS_INO_IS_STRAY(inode.ino); } + mds_rank_t get_stray_owner() const { + return (mds_rank_t)MDS_INO_STRAY_OWNER(inode.ino); + } + bool is_mdsdir() const { return MDS_INO_IS_MDSDIR(inode.ino); } + bool is_base() const { return MDS_INO_IS_BASE(inode.ino); } + bool is_system() const 
{ return inode.ino < MDS_INO_SYSTEM_BASE; } + bool is_normal() const { return !(is_base() || is_system() || is_stray()); } + + bool is_head() const { return last == CEPH_NOSNAP; } + + // note: this overloads MDSCacheObject + bool is_ambiguous_auth() const { + return state_test(STATE_AMBIGUOUSAUTH) || + MDSCacheObject::is_ambiguous_auth(); + } + void set_ambiguous_auth() { + state_set(STATE_AMBIGUOUSAUTH); + } + void clear_ambiguous_auth(MDSContext::vec& finished); + void clear_ambiguous_auth(); + + inodeno_t ino() const { return inode.ino; } + vinodeno_t vino() const { return vinodeno_t(inode.ino, last); } + int d_type() const { return IFTODT(inode.mode); } + + mempool_inode& get_inode() { return inode; } + const mempool_inode& get_inode() const { return inode; } + CDentry* get_parent_dn() { return parent; } + const CDentry* get_parent_dn() const { return parent; } + CDentry* get_projected_parent_dn() { return !projected_parent.empty() ? projected_parent.back() : parent; } + const CDentry* get_projected_parent_dn() const { return !projected_parent.empty() ? projected_parent.back() : parent; } + const CDentry* get_oldest_parent_dn() const { + if (parent) + return parent; + return !projected_parent.empty() ? 
projected_parent.front(): NULL; + } + CDir *get_parent_dir(); + const CDir *get_projected_parent_dir() const; + CDir *get_projected_parent_dir(); + CInode *get_parent_inode(); + + bool is_lt(const MDSCacheObject *r) const override { + const CInode *o = static_cast<const CInode*>(r); + return ino() < o->ino() || + (ino() == o->ino() && last < o->last); + } + + // -- misc -- + bool is_ancestor_of(const CInode *other) const; + bool is_projected_ancestor_of(const CInode *other) const; + + void make_path_string(std::string& s, bool projected=false, const CDentry *use_parent=NULL) const; + void make_path(filepath& s, bool projected=false) const; + void name_stray_dentry(std::string& dname); + + // -- dirtyness -- + version_t get_version() const { return inode.version; } + + version_t pre_dirty(); + void _mark_dirty(LogSegment *ls); + void mark_dirty(version_t projected_dirv, LogSegment *ls); + void mark_clean(); + + void store(MDSContext *fin); + void _stored(int r, version_t cv, Context *fin); + /** + * Flush a CInode to disk. This includes the backtrace, the parent + * directory's link, and the Inode object itself (if a base directory). + * @pre is_auth() on both the inode and its containing directory + * @pre can_auth_pin() + * @param fin The Context to call when the flush is completed. 
+ */ + void flush(MDSContext *fin); + void fetch(MDSContext *fin); + void _fetched(bufferlist& bl, bufferlist& bl2, Context *fin); + + + void build_backtrace(int64_t pool, inode_backtrace_t& bt); + void store_backtrace(MDSContext *fin, int op_prio=-1); + void _stored_backtrace(int r, version_t v, Context *fin); + void fetch_backtrace(Context *fin, bufferlist *backtrace); +protected: + /** + * Return the pool ID where we currently write backtraces for + * this inode (in addition to inode.old_pools) + * + * @returns a pool ID >=0 + */ + int64_t get_backtrace_pool() const; +public: + void mark_dirty_parent(LogSegment *ls, bool dirty_pool=false); + void clear_dirty_parent(); + void verify_diri_backtrace(bufferlist &bl, int err); + bool is_dirty_parent() { return state_test(STATE_DIRTYPARENT); } + bool is_dirty_pool() { return state_test(STATE_DIRTYPOOL); } + + void encode_snap_blob(bufferlist &bl); + void decode_snap_blob(const bufferlist &bl); + void encode_store(bufferlist& bl, uint64_t features); + void decode_store(bufferlist::const_iterator& bl); + + void encode_replica(mds_rank_t rep, bufferlist& bl, uint64_t features, bool need_recover) { + ceph_assert(is_auth()); + + __u32 nonce = add_replica(rep); + using ceph::encode; + encode(nonce, bl); + + _encode_base(bl, features); + _encode_locks_state_for_replica(bl, need_recover); + } + void decode_replica(bufferlist::const_iterator& p, bool is_new) { + using ceph::decode; + __u32 nonce; + decode(nonce, p); + replica_nonce = nonce; + + _decode_base(p); + _decode_locks_state(p, is_new); + } + + // -- waiting -- +protected: + mempool::mds_co::compact_map<frag_t, MDSContext::vec > waiting_on_dir; +public: + void add_dir_waiter(frag_t fg, MDSContext *c); + void take_dir_waiting(frag_t fg, MDSContext::vec& ls); + bool is_waiting_for_dir(frag_t fg) { + return waiting_on_dir.count(fg); + } + void add_waiter(uint64_t tag, MDSContext *c) override; + void take_waiting(uint64_t tag, MDSContext::vec& ls) override; + + // -- 
encode/decode helpers -- + void _encode_base(bufferlist& bl, uint64_t features); + void _decode_base(bufferlist::const_iterator& p); + void _encode_locks_full(bufferlist& bl); + void _decode_locks_full(bufferlist::const_iterator& p); + void _encode_locks_state_for_replica(bufferlist& bl, bool need_recover); + void _encode_locks_state_for_rejoin(bufferlist& bl, int rep); + void _decode_locks_state(bufferlist::const_iterator& p, bool is_new); + void _decode_locks_rejoin(bufferlist::const_iterator& p, MDSContext::vec& waiters, + std::list<SimpleLock*>& eval_locks, bool survivor); + + // -- import/export -- + void encode_export(bufferlist& bl); + void finish_export(); + void abort_export() { + put(PIN_TEMPEXPORTING); + ceph_assert(state_test(STATE_EXPORTINGCAPS)); + state_clear(STATE_EXPORTINGCAPS); + put(PIN_EXPORTINGCAPS); + } + void decode_import(bufferlist::const_iterator& p, LogSegment *ls); + + + // for giving to clients + int encode_inodestat(bufferlist& bl, Session *session, SnapRealm *realm, + snapid_t snapid=CEPH_NOSNAP, unsigned max_bytes=0, + int getattr_wants=0); + void encode_cap_message(const MClientCaps::ref &m, Capability *cap); + + + // -- locks -- +public: + static LockType versionlock_type; + static LockType authlock_type; + static LockType linklock_type; + static LockType dirfragtreelock_type; + static LockType filelock_type; + static LockType xattrlock_type; + static LockType snaplock_type; + static LockType nestlock_type; + static LockType flocklock_type; + static LockType policylock_type; + + // FIXME not part of mempool + LocalLock versionlock; + SimpleLock authlock; + SimpleLock linklock; + ScatterLock dirfragtreelock; + ScatterLock filelock; + SimpleLock xattrlock; + SimpleLock snaplock; + ScatterLock nestlock; + SimpleLock flocklock; + SimpleLock policylock; + + SimpleLock* get_lock(int type) override { + switch (type) { + case CEPH_LOCK_IFILE: return &filelock; + case CEPH_LOCK_IAUTH: return &authlock; + case CEPH_LOCK_ILINK: return 
&linklock; + case CEPH_LOCK_IDFT: return &dirfragtreelock; + case CEPH_LOCK_IXATTR: return &xattrlock; + case CEPH_LOCK_ISNAP: return &snaplock; + case CEPH_LOCK_INEST: return &nestlock; + case CEPH_LOCK_IFLOCK: return &flocklock; + case CEPH_LOCK_IPOLICY: return &policylock; + } + return 0; + } + + void set_object_info(MDSCacheObjectInfo &info) override; + void encode_lock_state(int type, bufferlist& bl) override; + void decode_lock_state(int type, const bufferlist& bl) override; + + void _finish_frag_update(CDir *dir, MutationRef& mut); + + void clear_dirty_scattered(int type) override; + bool is_dirty_scattered(); + void clear_scatter_dirty(); // on rejoin ack + + void start_scatter(ScatterLock *lock); + void finish_scatter_update(ScatterLock *lock, CDir *dir, + version_t inode_version, version_t dir_accounted_version); + void finish_scatter_gather_update(int type); + void finish_scatter_gather_update_accounted(int type, MutationRef& mut, EMetaBlob *metablob); + + // -- snap -- + void open_snaprealm(bool no_split=false); + void close_snaprealm(bool no_join=false); + SnapRealm *find_snaprealm() const; + void encode_snap(bufferlist& bl); + void decode_snap(bufferlist::const_iterator& p); + + // -- caps -- (new) + // client caps + client_t loner_cap = -1, want_loner_cap = -1; + + client_t get_loner() const { return loner_cap; } + client_t get_wanted_loner() const { return want_loner_cap; } + + // this is the loner state our locks should aim for + client_t get_target_loner() const { + if (loner_cap == want_loner_cap) + return loner_cap; + else + return -1; + } + + client_t calc_ideal_loner(); + void set_loner_cap(client_t l); + bool choose_ideal_loner(); + bool try_set_loner(); + bool try_drop_loner(); + + // choose new lock state during recovery, based on issued caps + void choose_lock_state(SimpleLock *lock, int allissued); + void choose_lock_states(int dirty_caps); + + int count_nonstale_caps() { + int n = 0; + for (const auto &p : client_caps) { + if 
(!p.second.is_stale()) + n++; + } + return n; + } + bool multiple_nonstale_caps() { + int n = 0; + for (const auto &p : client_caps) { + if (!p.second.is_stale()) { + if (n) + return true; + n++; + } + } + return false; + } + + bool is_any_caps() { return !client_caps.empty(); } + bool is_any_nonstale_caps() { return count_nonstale_caps(); } + + const mempool::mds_co::compact_map<int32_t,int32_t>& get_mds_caps_wanted() const { return mds_caps_wanted; } + void set_mds_caps_wanted(mempool::mds_co::compact_map<int32_t,int32_t>& m); + void set_mds_caps_wanted(mds_rank_t mds, int32_t wanted); + + const mempool_cap_map& get_client_caps() const { return client_caps; } + Capability *get_client_cap(client_t client) { + auto client_caps_entry = client_caps.find(client); + if (client_caps_entry != client_caps.end()) + return &client_caps_entry->second; + return 0; + } + int get_client_cap_pending(client_t client) const { + auto client_caps_entry = client_caps.find(client); + if (client_caps_entry != client_caps.end()) { + return client_caps_entry->second.pending(); + } else { + return 0; + } + } + + int get_num_caps_wanted() const { return num_caps_wanted; } + void adjust_num_caps_wanted(int d); + + Capability *add_client_cap(client_t client, Session *session, SnapRealm *conrealm=0); + void remove_client_cap(client_t client); + void move_to_realm(SnapRealm *realm); + + Capability *reconnect_cap(client_t client, const cap_reconnect_t& icr, Session *session); + void clear_client_caps_after_export(); + void export_client_caps(std::map<client_t,Capability::Export>& cl); + + // caps allowed + int get_caps_liked() const; + int get_caps_allowed_ever() const; + int get_caps_allowed_by_type(int type) const; + int get_caps_careful() const; + int get_xlocker_mask(client_t client) const; + int get_caps_allowed_for_client(Session *s, Capability *cap, mempool_inode *file_i) const; + + // caps issued, wanted + int get_caps_issued(int *ploner = 0, int *pother = 0, int *pxlocker = 0, + int 
shift = 0, int mask = -1); + bool is_any_caps_wanted() const; + int get_caps_wanted(int *ploner = 0, int *pother = 0, int shift = 0, int mask = -1) const; + bool issued_caps_need_gather(SimpleLock *lock); + + // -- authority -- + mds_authority_t authority() const override; + + // -- auth pins -- + bool can_auth_pin(int *err_ret=nullptr) const override; + void auth_pin(void *by) override; + void auth_unpin(void *by) override; + + // -- freeze -- + bool is_freezing_inode() const { return state_test(STATE_FREEZING); } + bool is_frozen_inode() const { return state_test(STATE_FROZEN); } + bool is_frozen_auth_pin() const { return state_test(STATE_FROZENAUTHPIN); } + bool is_frozen() const override; + bool is_frozen_dir() const; + bool is_freezing() const override; + + /* Freeze the inode. auth_pin_allowance lets the caller account for any + * auth_pins it is itself holding/responsible for. */ + bool freeze_inode(int auth_pin_allowance=0); + void unfreeze_inode(MDSContext::vec& finished); + void unfreeze_inode(); + + void freeze_auth_pin(); + void unfreeze_auth_pin(); + + // -- reference counting -- + void bad_put(int by) override { + generic_dout(0) << " bad put " << *this << " by " << by << " " << pin_name(by) << " was " << ref +#ifdef MDS_REF_SET + << " (" << ref_map << ")" +#endif + << dendl; +#ifdef MDS_REF_SET + ceph_assert(ref_map[by] > 0); +#endif + ceph_assert(ref > 0); + } + void bad_get(int by) override { + generic_dout(0) << " bad get " << *this << " by " << by << " " << pin_name(by) << " was " << ref +#ifdef MDS_REF_SET + << " (" << ref_map << ")" +#endif + << dendl; +#ifdef MDS_REF_SET + ceph_assert(ref_map[by] >= 0); +#endif + } + void first_get() override; + void last_put() override; + void _put() override; + + + // -- hierarchy stuff -- +public: + void set_primary_parent(CDentry *p) { + ceph_assert(parent == 0 || + g_conf().get_val<bool>("mds_hack_allow_loading_invalid_metadata")); + parent = p; + } + void remove_primary_parent(CDentry *dn) { + 
ceph_assert(dn == parent); + parent = 0; + } + void add_remote_parent(CDentry *p); + void remove_remote_parent(CDentry *p); + int num_remote_parents() { + return remote_parents.size(); + } + + void push_projected_parent(CDentry *dn) { + projected_parent.push_back(dn); + } + void pop_projected_parent() { + ceph_assert(projected_parent.size()); + parent = projected_parent.front(); + projected_parent.pop_front(); + } + +public: + void maybe_export_pin(bool update=false); + void set_export_pin(mds_rank_t rank); + mds_rank_t get_export_pin(bool inherit=true) const; + bool is_exportable(mds_rank_t dest) const; + + void print(ostream& out) override; + void dump(Formatter *f, int flags = DUMP_DEFAULT) const; + + /** + * @defgroup Scrubbing and fsck + * @{ + */ + + /** + * Report the results of validation against a particular inode. + * Each member is a pair of bools. + * <member>.first represents if validation was performed against the member. + * <member.second represents if the member passed validation. + * performed_validation is set to true if the validation was actually + * run. It might not be run if, for instance, the inode is marked as dirty. + * passed_validation is set to true if everything that was checked + * passed its validation. 
+ */ + struct validated_data { + template<typename T>struct member_status { + bool checked = false; + bool passed = false; + bool repaired = false; + int ondisk_read_retval = 0; + T ondisk_value; + T memory_value; + std::stringstream error_str; + }; + + bool performed_validation = false; + bool passed_validation = false; + + struct raw_stats_t { + frag_info_t dirstat; + nest_info_t rstat; + }; + + member_status<inode_backtrace_t> backtrace; + member_status<mempool_inode> inode; // XXX should not be in mempool; wait for pmr + member_status<raw_stats_t> raw_stats; + + validated_data() {} + + void dump(Formatter *f) const; + + bool all_damage_repaired() const; + }; + + /** + * Validate that the on-disk state of an inode matches what + * we expect from our memory state. Currently this checks that: + * 1) The backtrace associated with the file data exists and is correct + * 2) For directories, the actual inode metadata matches our memory state, + * 3) For directories, the rstats match + * + * @param results A freshly-created validated_data struct, with values set + * as described in the struct documentation. 
+ * @param mdr The request to be responeded upon the completion of the + * validation (or NULL) + * @param fin Context to call back on completion (or NULL) + */ + void validate_disk_state(validated_data *results, + MDSContext *fin); + static void dump_validation_results(const validated_data& results, + Formatter *f); +private: + bool _validate_disk_state(class ValidationContinuation *c, + int rval, int stage); + friend class ValidationContinuation; + /** @} Scrubbing and fsck */ +}; + +ostream& operator<<(ostream& out, const CInode::scrub_stamp_info_t& si); + +#undef dout_context +#endif diff --git a/src/mds/CMakeLists.txt b/src/mds/CMakeLists.txt new file mode 100644 index 00000000..025dbdd7 --- /dev/null +++ b/src/mds/CMakeLists.txt @@ -0,0 +1,47 @@ +set(mds_srcs + Capability.cc + MDSDaemon.cc + MDSRank.cc + Beacon.cc + flock.cc + locks.c + journal.cc + Server.cc + Mutation.cc + MDCache.cc + RecoveryQueue.cc + StrayManager.cc + PurgeQueue.cc + Locker.cc + Migrator.cc + MDBalancer.cc + CDentry.cc + CDir.cc + CInode.cc + LogEvent.cc + MDSTable.cc + InoTable.cc + JournalPointer.cc + MDSTableClient.cc + MDSTableServer.cc + ScrubStack.cc + DamageTable.cc + SimpleLock.cc + SnapRealm.cc + SnapServer.cc + SnapClient.cc + snap.cc + SessionMap.cc + MDSContext.cc + MDSAuthCaps.cc + MDLog.cc + MDSCacheObject.cc + Mantle.cc + Anchor.cc + OpenFileTable.cc + ${CMAKE_SOURCE_DIR}/src/common/TrackedOp.cc + ${CMAKE_SOURCE_DIR}/src/common/MemoryModel.cc + ${CMAKE_SOURCE_DIR}/src/osdc/Journaler.cc) +add_library(mds STATIC ${mds_srcs}) +target_link_libraries(mds PRIVATE + heap_profiler cpu_profiler osdc liblua) diff --git a/src/mds/Capability.cc b/src/mds/Capability.cc new file mode 100644 index 00000000..1ce1803b --- /dev/null +++ b/src/mds/Capability.cc @@ -0,0 +1,299 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + 
* This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "Capability.h" +#include "CInode.h" +#include "SessionMap.h" + +#include "common/Formatter.h" + + +/* + * Capability::Export + */ + +void Capability::Export::encode(bufferlist &bl) const +{ + ENCODE_START(3, 2, bl); + encode(cap_id, bl); + encode(wanted, bl); + encode(issued, bl); + encode(pending, bl); + encode(client_follows, bl); + encode(seq, bl); + encode(mseq, bl); + encode(last_issue_stamp, bl); + encode(state, bl); + ENCODE_FINISH(bl); +} + +void Capability::Export::decode(bufferlist::const_iterator &p) +{ + DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, p); + decode(cap_id, p); + decode(wanted, p); + decode(issued, p); + decode(pending, p); + decode(client_follows, p); + decode(seq, p); + decode(mseq, p); + decode(last_issue_stamp, p); + if (struct_v >= 3) + decode(state, p); + DECODE_FINISH(p); +} + +void Capability::Export::dump(Formatter *f) const +{ + f->dump_unsigned("cap_id", cap_id); + f->dump_unsigned("wanted", wanted); + f->dump_unsigned("issued", issued); + f->dump_unsigned("pending", pending); + f->dump_unsigned("client_follows", client_follows); + f->dump_unsigned("seq", seq); + f->dump_unsigned("migrate_seq", mseq); + f->dump_stream("last_issue_stamp") << last_issue_stamp; +} + +void Capability::Export::generate_test_instances(list<Capability::Export*>& ls) +{ + ls.push_back(new Export); + ls.push_back(new Export); + ls.back()->wanted = 1; + ls.back()->issued = 2; + ls.back()->pending = 3; + ls.back()->client_follows = 4; + ls.back()->mseq = 5; + ls.back()->last_issue_stamp = utime_t(6, 7); +} + +void Capability::Import::encode(bufferlist &bl) const +{ + ENCODE_START(1, 1, bl); + encode(cap_id, bl); + encode(issue_seq, bl); + encode(mseq, bl); + ENCODE_FINISH(bl); +} + +void 
Capability::Import::decode(bufferlist::const_iterator &bl) +{ + DECODE_START(1, bl); + decode(cap_id, bl); + decode(issue_seq, bl); + decode(mseq, bl); + DECODE_FINISH(bl); +} + +void Capability::Import::dump(Formatter *f) const +{ + f->dump_unsigned("cap_id", cap_id); + f->dump_unsigned("issue_seq", issue_seq); + f->dump_unsigned("migrate_seq", mseq); +} + +/* + * Capability::revoke_info + */ + +void Capability::revoke_info::encode(bufferlist& bl) const +{ + ENCODE_START(2, 2, bl) + encode(before, bl); + encode(seq, bl); + encode(last_issue, bl); + ENCODE_FINISH(bl); +} + +void Capability::revoke_info::decode(bufferlist::const_iterator& bl) +{ + DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl); + decode(before, bl); + decode(seq, bl); + decode(last_issue, bl); + DECODE_FINISH(bl); +} + +void Capability::revoke_info::dump(Formatter *f) const +{ + f->dump_unsigned("before", before); + f->dump_unsigned("seq", seq); + f->dump_unsigned("last_issue", last_issue); +} + +void Capability::revoke_info::generate_test_instances(list<Capability::revoke_info*>& ls) +{ + ls.push_back(new revoke_info); + ls.push_back(new revoke_info); + ls.back()->before = 1; + ls.back()->seq = 2; + ls.back()->last_issue = 3; +} + + +/* + * Capability + */ +Capability::Capability(CInode *i, Session *s, uint64_t id) : + client_follows(0), + client_xattr_version(0), client_inline_version(0), + last_rbytes(0), last_rsize(0), + item_session_caps(this), item_snaprealm_caps(this), + item_revoking_caps(this), item_client_revoking_caps(this), + inode(i), session(s), + cap_id(id), _wanted(0), num_revoke_warnings(0), + _pending(0), _issued(0), last_sent(0), last_issue(0), mseq(0), + suppress(0), state(0) +{ + if (session) { + session->touch_cap_bottom(this); + cap_gen = session->get_cap_gen(); + if (session->is_stale()) + --cap_gen; // not valid + + auto& conn = session->get_connection(); + if (conn) { + if (!conn->has_feature(CEPH_FEATURE_MDS_INLINE_DATA)) + state |= STATE_NOINLINE; + if 
(!conn->has_feature(CEPH_FEATURE_FS_FILE_LAYOUT_V2)) + state |= STATE_NOPOOLNS; + if (!conn->has_feature(CEPH_FEATURE_MDS_QUOTA)) + state |= STATE_NOQUOTA; + } + } +} + +client_t Capability::get_client() const +{ + return session ? session->get_client() : client_t(-1); +} + +bool Capability::is_stale() const +{ + return session ? session->is_stale() : false; +} + +bool Capability::is_valid() const +{ + return !session || session->get_cap_gen() == cap_gen; +} + +void Capability::revalidate() +{ + if (!is_valid()) + cap_gen = session->get_cap_gen(); +} + +void Capability::mark_notable() +{ + state |= STATE_NOTABLE; + session->touch_cap(this); +} + +void Capability::maybe_clear_notable() +{ + if ((_issued == _pending) && + !is_clientwriteable() && + !is_wanted_notable(_wanted)) { + ceph_assert(is_notable()); + state &= ~STATE_NOTABLE; + session->touch_cap_bottom(this); + } +} + +void Capability::set_wanted(int w) { + CInode *in = get_inode(); + if (in) { + if (!_wanted && w) { + in->adjust_num_caps_wanted(1); + } else if (_wanted && !w) { + in->adjust_num_caps_wanted(-1); + } + if (!is_wanted_notable(_wanted) && is_wanted_notable(w)) { + if (!is_notable()) + mark_notable(); + } else if (is_wanted_notable(_wanted) && !is_wanted_notable(w)) { + maybe_clear_notable(); + } + } + _wanted = w; +} + +void Capability::encode(bufferlist& bl) const +{ + ENCODE_START(2, 2, bl) + encode(last_sent, bl); + encode(last_issue_stamp, bl); + + encode(_wanted, bl); + encode(_pending, bl); + encode(_revokes, bl); + ENCODE_FINISH(bl); +} + +void Capability::decode(bufferlist::const_iterator &bl) +{ + DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl) + decode(last_sent, bl); + decode(last_issue_stamp, bl); + + __u32 tmp_wanted; + decode(tmp_wanted, bl); + set_wanted(tmp_wanted); + decode(_pending, bl); + decode(_revokes, bl); + DECODE_FINISH(bl); + + calc_issued(); +} + +void Capability::dump(Formatter *f) const +{ + f->dump_unsigned("last_sent", last_sent); + 
f->dump_unsigned("last_issue_stamp", last_issue_stamp); + f->dump_unsigned("wanted", _wanted); + f->dump_unsigned("pending", _pending); + + f->open_array_section("revokes"); + for (const auto &r : _revokes) { + f->open_object_section("revoke"); + r.dump(f); + f->close_section(); + } + f->close_section(); +} + +void Capability::generate_test_instances(list<Capability*>& ls) +{ + ls.push_back(new Capability); + ls.push_back(new Capability); + ls.back()->last_sent = 11; + ls.back()->last_issue_stamp = utime_t(12, 13); + ls.back()->set_wanted(14); + ls.back()->_pending = 15; + { + auto &r = ls.back()->_revokes.emplace_back(); + r.before = 16; + r.seq = 17; + r.last_issue = 18; + } + { + auto &r = ls.back()->_revokes.emplace_back(); + r.before = 19; + r.seq = 20; + r.last_issue = 21; + } +} + +MEMPOOL_DEFINE_OBJECT_FACTORY(Capability, co_cap, mds_co); diff --git a/src/mds/Capability.h b/src/mds/Capability.h new file mode 100644 index 00000000..a54f013c --- /dev/null +++ b/src/mds/Capability.h @@ -0,0 +1,406 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#ifndef CEPH_CAPABILITY_H +#define CEPH_CAPABILITY_H + +#include "include/buffer_fwd.h" +#include "include/counter.h" +#include "include/mempool.h" +#include "include/xlist.h" + +#include "common/config.h" + +#include "mdstypes.h" + + +/* + + Capability protocol notes. + +- two types of cap events from mds -> client: + - cap "issue" in a MClientReply, or an MClientCaps IMPORT op. + - cap "update" (revocation or grant) .. an MClientCaps message. +- if client has cap, the mds should have it too. 
+ +- if client has no dirty data, it can release it without waiting for an mds ack. + - client may thus get a cap _update_ and not have the cap. ignore it. + +- mds should track seq of last issue. any release + attempt will only succeed if the client has seen the latest. + +- a UPDATE updates the clients issued caps, wanted, etc. it may also flush dirty metadata. + - 'caps' are which caps the client retains. + - if 0, client wishes to release the cap + - 'wanted' is which caps the client wants. + - 'dirty' is which metadata is to be written. + - client gets a FLUSH_ACK with matching dirty flags indicating which caps were written. + +- a FLUSH_ACK acks a FLUSH. + - 'dirty' is the _original_ FLUSH's dirty (i.e., which metadata was written back) + - 'seq' is the _original_ FLUSH's seq. + - 'caps' is the _original_ FLUSH's caps (not actually important) + - client can conclude that (dirty & ~caps) bits were successfully cleaned. + +- a FLUSHSNAP flushes snapshot metadata. + - 'dirty' indicates which caps, were dirty, if any. + - mds writes metadata. if dirty!=0, replies with FLUSHSNAP_ACK. 
+ + */ + +class CInode; +class Session; + +namespace ceph { + class Formatter; +} + +class Capability : public Counter<Capability> { +public: + MEMPOOL_CLASS_HELPERS(); + + struct Export { + int64_t cap_id = 0; + int32_t wanted = 0; + int32_t issued = 0; + int32_t pending = 0; + snapid_t client_follows; + ceph_seq_t seq = 0; + ceph_seq_t mseq = 0; + utime_t last_issue_stamp; + uint32_t state = 0; + Export() {} + Export(int64_t id, int w, int i, int p, snapid_t cf, + ceph_seq_t s, ceph_seq_t m, utime_t lis, unsigned st) : + cap_id(id), wanted(w), issued(i), pending(p), client_follows(cf), + seq(s), mseq(m), last_issue_stamp(lis), state(st) {} + void encode(bufferlist &bl) const; + void decode(bufferlist::const_iterator &p); + void dump(Formatter *f) const; + static void generate_test_instances(list<Export*>& ls); + }; + struct Import { + int64_t cap_id; + ceph_seq_t issue_seq; + ceph_seq_t mseq; + Import() : cap_id(0), issue_seq(0), mseq(0) {} + Import(int64_t i, ceph_seq_t s, ceph_seq_t m) : cap_id(i), issue_seq(s), mseq(m) {} + void encode(bufferlist &bl) const; + void decode(bufferlist::const_iterator &p); + void dump(Formatter *f) const; + }; + struct revoke_info { + __u32 before; + ceph_seq_t seq, last_issue; + revoke_info() : before(0), seq(0), last_issue(0) {} + revoke_info(__u32 b, ceph_seq_t s, ceph_seq_t li) : before(b), seq(s), last_issue(li) {} + void encode(bufferlist& bl) const; + void decode(bufferlist::const_iterator& bl); + void dump(Formatter *f) const; + static void generate_test_instances(list<revoke_info*>& ls); + }; + + const static unsigned STATE_NOTABLE = (1<<0); + const static unsigned STATE_NEW = (1<<1); + const static unsigned STATE_IMPORTING = (1<<2); + const static unsigned STATE_NEEDSNAPFLUSH = (1<<3); + const static unsigned STATE_CLIENTWRITEABLE = (1<<4); + const static unsigned STATE_NOINLINE = (1<<5); + const static unsigned STATE_NOPOOLNS = (1<<6); + const static unsigned STATE_NOQUOTA = (1<<7); + + const static unsigned 
MASK_STATE_EXPORTED = + (STATE_CLIENTWRITEABLE | STATE_NOINLINE | STATE_NOPOOLNS | STATE_NOQUOTA); + + Capability(CInode *i=nullptr, Session *s=nullptr, uint64_t id=0); + Capability(const Capability& other) = delete; + + const Capability& operator=(const Capability& other) = delete; + + int pending() const { + return _pending; + } + int issued() const { + return _issued; + } + int revoking() const { + return _issued & ~_pending; + } + ceph_seq_t issue(unsigned c, bool reval=false) { + if (reval) + revalidate(); + + if (_pending & ~c) { + // revoking (and maybe adding) bits. note caps prior to this revocation + _revokes.emplace_back(_pending, last_sent, last_issue); + _pending = c; + _issued |= c; + if (!is_notable()) + mark_notable(); + } else if (~_pending & c) { + // adding bits only. remove obsolete revocations? + _pending |= c; + _issued |= c; + // drop old _revokes with no bits we don't have + while (!_revokes.empty() && + (_revokes.back().before & ~_pending) == 0) + _revokes.pop_back(); + } else { + // no change. + ceph_assert(_pending == c); + } + //last_issue = + inc_last_seq(); + return last_sent; + } + ceph_seq_t issue_norevoke(unsigned c, bool reval=false) { + if (reval) + revalidate(); + + _pending |= c; + _issued |= c; + clear_new(); + + inc_last_seq(); + return last_sent; + } + void confirm_receipt(ceph_seq_t seq, unsigned caps) { + bool was_revoking = (_issued & ~_pending); + if (seq == last_sent) { + _revokes.clear(); + _issued = caps; + // don't add bits + _pending &= caps; + } else { + // can i forget any revocations? 
+ while (!_revokes.empty() && _revokes.front().seq < seq) + _revokes.pop_front(); + if (!_revokes.empty()) { + if (_revokes.front().seq == seq) + _revokes.begin()->before = caps; + calc_issued(); + } else { + // seq < last_sent + _issued = caps | _pending; + } + } + + if (was_revoking && _issued == _pending) { + item_revoking_caps.remove_myself(); + item_client_revoking_caps.remove_myself(); + maybe_clear_notable(); + } + //check_rdcaps_list(); + } + // we may get a release racing with revocations, which means our revokes will be ignored + // by the client. clean them out of our _revokes history so we don't wait on them. + void clean_revoke_from(ceph_seq_t li) { + bool changed = false; + while (!_revokes.empty() && _revokes.front().last_issue <= li) { + _revokes.pop_front(); + changed = true; + } + if (changed) { + bool was_revoking = (_issued & ~_pending); + calc_issued(); + if (was_revoking && _issued == _pending) { + item_revoking_caps.remove_myself(); + item_client_revoking_caps.remove_myself(); + maybe_clear_notable(); + } + } + } + ceph_seq_t get_mseq() const { return mseq; } + void inc_mseq() { mseq++; } + + utime_t get_last_issue_stamp() const { return last_issue_stamp; } + utime_t get_last_revoke_stamp() const { return last_revoke_stamp; } + + void set_last_issue() { last_issue = last_sent; } + void set_last_issue_stamp(utime_t t) { last_issue_stamp = t; } + void set_last_revoke_stamp(utime_t t) { last_revoke_stamp = t; } + void reset_num_revoke_warnings() { num_revoke_warnings = 0; } + void inc_num_revoke_warnings() { ++num_revoke_warnings; } + unsigned get_num_revoke_warnings() const { return num_revoke_warnings; } + + void set_cap_id(uint64_t i) { cap_id = i; } + uint64_t get_cap_id() const { return cap_id; } + + //ceph_seq_t get_last_issue() { return last_issue; } + + bool is_suppress() const { return suppress > 0; } + void inc_suppress() { suppress++; } + void dec_suppress() { suppress--; } + + static bool is_wanted_notable(int wanted) { + return 
wanted & (CEPH_CAP_ANY_WR|CEPH_CAP_FILE_WR|CEPH_CAP_FILE_RD); + } + bool is_notable() const { return state & STATE_NOTABLE; } + + bool is_stale() const; + bool is_valid() const; + bool is_new() const { return state & STATE_NEW; } + void mark_new() { state |= STATE_NEW; } + void clear_new() { state &= ~STATE_NEW; } + bool is_importing() const { return state & STATE_IMPORTING; } + void mark_importing() { state |= STATE_IMPORTING; } + void clear_importing() { state &= ~STATE_IMPORTING; } + bool need_snapflush() const { return state & STATE_NEEDSNAPFLUSH; } + void mark_needsnapflush() { state |= STATE_NEEDSNAPFLUSH; } + void clear_needsnapflush() { state &= ~STATE_NEEDSNAPFLUSH; } + + bool is_clientwriteable() const { return state & STATE_CLIENTWRITEABLE; } + void mark_clientwriteable() { + if (!is_clientwriteable()) { + state |= STATE_CLIENTWRITEABLE; + if (!is_notable()) + mark_notable(); + } + } + void clear_clientwriteable() { + if (is_clientwriteable()) { + state &= ~STATE_CLIENTWRITEABLE; + maybe_clear_notable(); + } + } + + bool is_noinline() const { return state & STATE_NOINLINE; } + bool is_nopoolns() const { return state & STATE_NOPOOLNS; } + bool is_noquota() const { return state & STATE_NOQUOTA; } + + CInode *get_inode() const { return inode; } + Session *get_session() const { return session; } + client_t get_client() const; + + // caps this client wants to hold + int wanted() const { return _wanted; } + void set_wanted(int w); + + void inc_last_seq() { last_sent++; } + ceph_seq_t get_last_seq() const { + return last_sent; + } + ceph_seq_t get_last_issue() const { return last_issue; } + + void reset_seq() { + last_sent = 0; + last_issue = 0; + } + + // -- exports -- + Export make_export() const { + return Export(cap_id, wanted(), issued(), pending(), client_follows, get_last_seq(), mseq+1, last_issue_stamp, state); + } + void merge(const Export& other, bool auth_cap) { + // issued + pending + int newpending = other.pending | pending(); + if (other.issued & 
~newpending) + issue(other.issued | newpending); + else + issue(newpending); + last_issue_stamp = other.last_issue_stamp; + + client_follows = other.client_follows; + + state |= other.state & MASK_STATE_EXPORTED; + if ((other.state & STATE_CLIENTWRITEABLE) && !is_notable()) + mark_notable(); + + // wanted + set_wanted(wanted() | other.wanted); + if (auth_cap) + mseq = other.mseq; + } + void merge(int otherwanted, int otherissued) { + // issued + pending + int newpending = pending(); + if (otherissued & ~newpending) + issue(otherissued | newpending); + else + issue(newpending); + + // wanted + set_wanted(wanted() | otherwanted); + } + + void revoke() { + if (revoking()) + confirm_receipt(last_sent, pending()); + } + + // serializers + void encode(bufferlist &bl) const; + void decode(bufferlist::const_iterator &bl); + void dump(Formatter *f) const; + static void generate_test_instances(list<Capability*>& ls); + + snapid_t client_follows; + version_t client_xattr_version; + version_t client_inline_version; + int64_t last_rbytes; + int64_t last_rsize; + + xlist<Capability*>::item item_session_caps; + xlist<Capability*>::item item_snaprealm_caps; + xlist<Capability*>::item item_revoking_caps; + xlist<Capability*>::item item_client_revoking_caps; + +private: + CInode *inode; + Session *session; + + uint64_t cap_id; + uint32_t cap_gen; + + __u32 _wanted; // what the client wants (ideally) + + utime_t last_issue_stamp; + utime_t last_revoke_stamp; + unsigned num_revoke_warnings; + + // track in-flight caps -------------- + // - add new caps to _pending + // - track revocations in _revokes list + __u32 _pending, _issued; + mempool::mds_co::list<revoke_info> _revokes; + + ceph_seq_t last_sent; + ceph_seq_t last_issue; + ceph_seq_t mseq; + + int suppress; + unsigned state; + + void calc_issued() { + _issued = _pending; + for (const auto &r : _revokes) { + _issued |= r.before; + } + } + + void revalidate(); + + void mark_notable(); + void maybe_clear_notable(); +}; + 
+WRITE_CLASS_ENCODER(Capability::Export) +WRITE_CLASS_ENCODER(Capability::Import) +WRITE_CLASS_ENCODER(Capability::revoke_info) +WRITE_CLASS_ENCODER(Capability) + + + +#endif diff --git a/src/mds/DamageTable.cc b/src/mds/DamageTable.cc new file mode 100644 index 00000000..c474b078 --- /dev/null +++ b/src/mds/DamageTable.cc @@ -0,0 +1,280 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "common/debug.h" + +#include "mds/CDir.h" + +#include "DamageTable.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mds +#undef dout_prefix +#define dout_prefix *_dout << "mds." << rank << ".damage " << __func__ << " " + +namespace { +/** + * Record damage to a particular dirfrag, implicitly affecting + * any dentries within it. 
+ */ +class DirFragDamage : public DamageEntry +{ + public: + inodeno_t ino; + frag_t frag; + + DirFragDamage(inodeno_t ino_, frag_t frag_) + : ino(ino_), frag(frag_) + {} + + damage_entry_type_t get_type() const override + { + return DAMAGE_ENTRY_DIRFRAG; + } + + void dump(Formatter *f) const override + { + f->open_object_section("dir_frag_damage"); + f->dump_string("damage_type", "dir_frag"); + f->dump_int("id", id); + f->dump_int("ino", ino); + f->dump_stream("frag") << frag; + f->dump_string("path", path); + f->close_section(); + } +}; + + +/** + * Record damage to a particular dname within a particular dirfrag + */ +class DentryDamage : public DamageEntry +{ + public: + inodeno_t ino; + frag_t frag; + std::string dname; + snapid_t snap_id; + + DentryDamage( + inodeno_t ino_, + frag_t frag_, + std::string_view dname_, + snapid_t snap_id_) + : ino(ino_), frag(frag_), dname(dname_), snap_id(snap_id_) + {} + + damage_entry_type_t get_type() const override + { + return DAMAGE_ENTRY_DENTRY; + } + + void dump(Formatter *f) const override + { + f->open_object_section("dentry_damage"); + f->dump_string("damage_type", "dentry"); + f->dump_int("id", id); + f->dump_int("ino", ino); + f->dump_stream("frag") << frag; + f->dump_string("dname", dname); + f->dump_stream("snap_id") << snap_id; + f->dump_string("path", path); + f->close_section(); + } +}; + + +/** + * Record damage to our ability to look up an ino by number + */ +class BacktraceDamage : public DamageEntry +{ + public: + inodeno_t ino; + + BacktraceDamage(inodeno_t ino_) + : ino(ino_) + {} + + damage_entry_type_t get_type() const override + { + return DAMAGE_ENTRY_BACKTRACE; + } + + void dump(Formatter *f) const override + { + f->open_object_section("backtrace_damage"); + f->dump_string("damage_type", "backtrace"); + f->dump_int("id", id); + f->dump_int("ino", ino); + f->dump_string("path", path); + f->close_section(); + } +}; +} + +DamageEntry::~DamageEntry() +{} + +bool DamageTable::notify_dentry( + inodeno_t 
ino, frag_t frag, + snapid_t snap_id, std::string_view dname, std::string_view path) +{ + if (oversized()) { + return true; + } + + // Special cases: damage to these dirfrags is considered fatal to + // the MDS rank that owns them. + if ( + (MDS_INO_IS_MDSDIR(ino) && MDS_INO_MDSDIR_OWNER(ino) == rank) + || + (MDS_INO_IS_STRAY(ino) && MDS_INO_STRAY_OWNER(ino) == rank) + ) { + derr << "Damage to dentries in fragment " << frag << " of ino " << ino + << "is fatal because it is a system directory for this rank" << dendl; + return true; + } + + auto key = DirFragIdent(ino, frag); + if (dentries.count(key) == 0) { + DamageEntryRef entry = std::make_shared<DentryDamage>( + ino, frag, dname, snap_id); + entry->path = path; + dentries[key][DentryIdent(dname, snap_id)] = entry; + by_id[entry->id] = std::move(entry); + } + + return false; +} + +bool DamageTable::notify_dirfrag(inodeno_t ino, frag_t frag, + std::string_view path) +{ + // Special cases: damage to these dirfrags is considered fatal to + // the MDS rank that owns them. 
+ if ( + (MDS_INO_IS_STRAY(ino) && MDS_INO_STRAY_OWNER(ino) == rank) + || + (ino == MDS_INO_ROOT) + ) { + derr << "Damage to fragment " << frag << " of ino " << ino + << " is fatal because it is a system directory for this rank" << dendl; + return true; + } + + if (oversized()) { + return true; + } + + auto key = DirFragIdent(ino, frag); + if (dirfrags.count(key) == 0) { + DamageEntryRef entry = std::make_shared<DirFragDamage>(ino, frag); + entry->path = path; + dirfrags[key] = entry; + by_id[entry->id] = std::move(entry); + } + + return false; +} + +bool DamageTable::notify_remote_damaged(inodeno_t ino, std::string_view path) +{ + if (oversized()) { + return true; + } + + if (remotes.count(ino) == 0) { + auto entry = std::make_shared<BacktraceDamage>(ino); + entry->path = path; + remotes[ino] = entry; + by_id[entry->id] = std::move(entry); + } + + return false; +} + +bool DamageTable::oversized() const +{ + return by_id.size() > (size_t)(g_conf()->mds_damage_table_max_entries); +} + +bool DamageTable::is_dentry_damaged( + const CDir *dir_frag, + std::string_view dname, + const snapid_t snap_id) const +{ + if (dentries.count( + DirFragIdent(dir_frag->inode->ino(), dir_frag->frag) + ) == 0) { + return false; + } + + const std::map<DentryIdent, DamageEntryRef> &frag_dentries = + dentries.at(DirFragIdent(dir_frag->inode->ino(), dir_frag->frag)); + + return frag_dentries.count(DentryIdent(dname, snap_id)) > 0; +} + +bool DamageTable::is_dirfrag_damaged( + const CDir *dir_frag) const +{ + return dirfrags.count( + DirFragIdent(dir_frag->inode->ino(), dir_frag->frag)) > 0; +} + +bool DamageTable::is_remote_damaged( + const inodeno_t ino) const +{ + return remotes.count(ino) > 0; +} + +void DamageTable::dump(Formatter *f) const +{ + f->open_array_section("damage_table"); + for (const auto &i : by_id) + { + i.second->dump(f); + } + f->close_section(); +} + +void DamageTable::erase(damage_entry_id_t damage_id) +{ + auto by_id_entry = by_id.find(damage_id); + if (by_id_entry 
== by_id.end()) { + return; + } + + DamageEntryRef entry = by_id_entry->second; + ceph_assert(entry->id == damage_id); // Sanity + + const auto type = entry->get_type(); + if (type == DAMAGE_ENTRY_DIRFRAG) { + auto dirfrag_entry = std::static_pointer_cast<DirFragDamage>(entry); + dirfrags.erase(DirFragIdent(dirfrag_entry->ino, dirfrag_entry->frag)); + } else if (type == DAMAGE_ENTRY_DENTRY) { + auto dentry_entry = std::static_pointer_cast<DentryDamage>(entry); + dentries.erase(DirFragIdent(dentry_entry->ino, dentry_entry->frag)); + } else if (type == DAMAGE_ENTRY_BACKTRACE) { + auto backtrace_entry = std::static_pointer_cast<BacktraceDamage>(entry); + remotes.erase(backtrace_entry->ino); + } else { + derr << "Invalid type " << type << dendl; + ceph_abort(); + } + + by_id.erase(by_id_entry); +} + diff --git a/src/mds/DamageTable.h b/src/mds/DamageTable.h new file mode 100644 index 00000000..a408036c --- /dev/null +++ b/src/mds/DamageTable.h @@ -0,0 +1,207 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#ifndef DAMAGE_TABLE_H_ +#define DAMAGE_TABLE_H_ + +#include <string_view> + +#include "mdstypes.h" +#include "include/random.h" + +class CDir; + +typedef uint64_t damage_entry_id_t; + +typedef enum +{ + DAMAGE_ENTRY_DIRFRAG, + DAMAGE_ENTRY_DENTRY, + DAMAGE_ENTRY_BACKTRACE + +} damage_entry_type_t; + +class DamageEntry +{ + public: + damage_entry_id_t id; + utime_t reported_at; + + // path is optional, advisory. Used to give the admin an idea of what + // part of his tree the damage affects. 
+ std::string path; + + DamageEntry() + { + id = ceph::util::generate_random_number<damage_entry_id_t>(0, 0xffffffff); + reported_at = ceph_clock_now(); + } + + virtual damage_entry_type_t get_type() const = 0; + + virtual ~DamageEntry(); + + virtual void dump(Formatter *f) const = 0; +}; + + +typedef std::shared_ptr<DamageEntry> DamageEntryRef; + + +class DirFragIdent +{ + public: + inodeno_t ino; + frag_t frag; + + bool operator<(const DirFragIdent &rhs) const + { + if (ino == rhs.ino) { + return frag < rhs.frag; + } else { + return ino < rhs.ino; + } + } + + DirFragIdent(inodeno_t ino_, frag_t frag_) + : ino(ino_), frag(frag_) + {} +}; + +class DentryIdent +{ + public: + std::string dname; + snapid_t snap_id; + + bool operator<(const DentryIdent &rhs) const + { + if (dname == rhs.dname) { + return snap_id < rhs.snap_id; + } else { + return dname < rhs.dname; + } + } + + DentryIdent(std::string_view dname_, snapid_t snap_id_) + : dname(dname_), snap_id(snap_id_) + {} +}; + +/** + * Registry of in-RADOS metadata damage identified + * during forward scrub or during normal fetches. + * + * Used to indicate damage to the administrator, and + * to cache known-bad paths so that we don't hit them + * repeatedly. + * + * Callers notifying damage must check return code; if + * an fatal condition is indicated then they should mark the MDS + * rank damaged. + * + * An artificial limit on the number of damage entries + * is imposed to avoid this structure growing indefinitely. If + * a notification causes the limit to be exceeded, the fatal + * condition will be indicated in the return code and the MDS + * rank should be marked damaged. 
+ * + * Protected by MDS::mds_lock + */ +class DamageTable +{ +protected: + + // Map of all dirfrags reported damaged + std::map<DirFragIdent, DamageEntryRef> dirfrags; + + // Store dentries in a map per dirfrag, so that we can + // readily look up all the bad dentries in a particular + // dirfrag + std::map<DirFragIdent, std::map<DentryIdent, DamageEntryRef> > dentries; + + // Map of all inodes which could not be resolved remotely + // (i.e. have probably/possibly missing backtraces) + std::map<inodeno_t, DamageEntryRef> remotes; + + // All damage, by ID. This is a secondary index + // to the dirfrag, dentry, remote maps. It exists + // to enable external tools to unambiguously operate + // on particular entries. + std::map<damage_entry_id_t, DamageEntryRef> by_id; + + // I need to know my MDS rank so that I can check if + // metadata items are part of my mydir. + const mds_rank_t rank; + + bool oversized() const; + +public: + + /** + * Return true if no damage entries exist + */ + bool empty() const + { + return by_id.empty(); + } + + /** + * Indicate that a dirfrag cannot be loaded. + * + * @return true if fatal + */ + bool notify_dirfrag(inodeno_t ino, frag_t frag, std::string_view path); + + /** + * Indicate that a particular dentry cannot be loaded. 
+ * + * @return true if fatal + */ + bool notify_dentry( + inodeno_t ino, frag_t frag, + snapid_t snap_id, std::string_view dname, std::string_view path); + + /** + * Indicate that a particular Inode could not be loaded by number + */ + bool notify_remote_damaged( + inodeno_t ino, std::string_view path); + + bool is_dentry_damaged( + const CDir *dir_frag, + std::string_view dname, + const snapid_t snap_id) const; + + bool is_dirfrag_damaged( + const CDir *dir_frag) const; + + bool is_remote_damaged( + const inodeno_t ino) const; + + + explicit DamageTable(const mds_rank_t rank_) + : rank(rank_) + { + ceph_assert(rank_ != MDS_RANK_NONE); + } + + void dump(Formatter *f) const; + + void erase(damage_entry_id_t damage_id); +}; + +#endif // DAMAGE_TABLE_H_ + diff --git a/src/mds/FSMap.cc b/src/mds/FSMap.cc new file mode 100644 index 00000000..623e1748 --- /dev/null +++ b/src/mds/FSMap.cc @@ -0,0 +1,1029 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
 *
 */


#include "FSMap.h"

#include "common/StackStringStream.h"

#include <sstream>
#ifdef WITH_SEASTAR
#include "crimson/common/config_proxy.h"
#else
#include "common/config_proxy.h"
#endif
#include "global/global_context.h"
#include "mon/health_check.h"

using std::stringstream;

// Dump this filesystem as {"mdsmap": {...}, "id": fscid}.
void Filesystem::dump(Formatter *f) const
{
  f->open_object_section("mdsmap");
  mds_map.dump(f);
  f->close_section();
  f->dump_int("id", fscid);
}

// Dump the whole FSMap: epoch, default fscid, compat, feature flags,
// standby daemons (with their standby epochs), and each filesystem.
void FSMap::dump(Formatter *f) const
{
  f->dump_int("epoch", epoch);
  // Use 'default' naming to match 'set-default' CLI
  f->dump_int("default_fscid", legacy_client_fscid);

  f->open_object_section("compat");
  compat.dump(f);
  f->close_section();

  f->open_object_section("feature_flags");
  f->dump_bool("enable_multiple", enable_multiple);
  f->dump_bool("ever_enabled_multiple", ever_enabled_multiple);
  f->close_section();

  f->open_array_section("standbys");
  for (const auto& [gid, info] : standby_daemons) {
    f->open_object_section("info");
    info.dump(f);
    // Each standby also carries the epoch at which it became a standby.
    f->dump_int("epoch", standby_epochs.at(gid));
    f->close_section();
  }
  f->close_section();

  f->open_array_section("filesystems");
  for (const auto &fs : filesystems) {
    f->open_object_section("filesystem");
    fs.second->dump(f);
    f->close_section();
  }
  f->close_section();
}

// Build test FSMaps by wrapping each MDSMap test instance in a
// Filesystem with a distinct fscid (starting at 20).
void FSMap::generate_test_instances(list<FSMap*>& ls)
{
  FSMap *m = new FSMap();

  std::list<MDSMap*> mds_map_instances;
  MDSMap::generate_test_instances(mds_map_instances);

  int k = 20;
  for (auto i : mds_map_instances) {
    auto fs = Filesystem::create();
    fs->fscid = k++;
    fs->mds_map = *i;  // copy, then free the source instance
    delete i;
    m->filesystems[fs->fscid] = fs;
  }
  mds_map_instances.clear();

  ls.push_back(m);
}

// Human-readable multi-line dump: header, each filesystem, then standbys.
void FSMap::print(ostream& out) const
{
  out << "e" << epoch << std::endl;
  out << "enable_multiple, ever_enabled_multiple: " << enable_multiple << ","
      << ever_enabled_multiple << std::endl;
  out << "compat: " << compat << std::endl;
  out << "legacy client fscid: " << legacy_client_fscid << std::endl;
  out << " " << std::endl;

  if (filesystems.empty()) {
    out << "No filesystems configured" << std::endl;
  }

  for (const auto& p : filesystems) {
    p.second->print(out);
    out << " " << std::endl << " " << std::endl;  // Space out a bit
  }

  if (!standby_daemons.empty()) {
    out << "Standby daemons:" << std::endl << " " << std::endl;
  }

  for (const auto& p : standby_daemons) {
    out << p.second << std::endl;
  }
}


// One-line (or structured) cluster summary, as shown in `ceph status`.
// With a Formatter, emits structured per-fs and per-mds records; with a
// plain stream, emits the compact "name:up/in ... N failed" form.
void FSMap::print_summary(Formatter *f, ostream *out) const
{
  if (f) {
    f->dump_unsigned("epoch", get_epoch());
    for (const auto &p : filesystems) {
      auto& fs = p.second;
      f->dump_unsigned("id", fs->fscid);
      f->dump_unsigned("up", fs->mds_map.up.size());
      f->dump_unsigned("in", fs->mds_map.in.size());
      f->dump_unsigned("max", fs->mds_map.max_mds);
    }
  } else {
    auto count = filesystems.size();
    if (count <= 3) {
      // Few filesystems: list each one; degraded ones show up/in.
      bool first = true;
      for (const auto& p : filesystems) {
        const auto& fs = p.second;
        if (!first) {
          *out << " ";
        }
        if (fs->mds_map.is_degraded()) {
          *out << fs->mds_map.fs_name << ":" << fs->mds_map.up.size() << "/" << fs->mds_map.in.size();
        } else {
          *out << fs->mds_map.fs_name << ":" << fs->mds_map.in.size();
        }
        first = false;
      }
    } else {
      // Many filesystems: just a count, plus up to 3 degraded ones.
      *out << count << " fs";
      unsigned degraded = 0;
      CachedStackStringStream css;
      *css << " (degraded: ";
      // NOTE(review): multiple degraded entries are appended with no
      // separator between them — confirm whether that is intended.
      for (const auto& p : filesystems) {
        const auto& fs = p.second;
        if (fs->mds_map.is_degraded()) {
          degraded++;
          if (degraded <= 3) {
            *css << fs->mds_map.fs_name << ":" << fs->mds_map.up.size() << "/" << fs->mds_map.in.size();
          }
        }
      }
      if (degraded > 0) {
        if (degraded <= 3) {
          *css << ")";
          *out << css->strv();
        } else {
          // Too many to list: discard the accumulated names, print a count.
          *out << " (degraded: " << degraded << " fs)";
        }
      }
    }
  }

  if (f) {
    f->open_array_section("by_rank");
  }

  std::map<MDSMap::DaemonState,unsigned> by_state;
  std::map<mds_role_t, std::pair<MDSMap::DaemonState, std::string>> by_rank;

  // Seed the standby count; per-daemon states are tallied below.
  by_state[MDSMap::DaemonState::STATE_STANDBY] = standby_daemons.size();
  for (const auto& [gid, fscid] : mds_roles) {
    if (fscid == FS_CLUSTER_ID_NONE)
      continue;  // standby, already counted above

    const auto& info = filesystems.at(fscid)->mds_map.get_info_gid(gid);
    auto s = std::string(ceph_mds_state_name(info.state));
    if (info.laggy()) {
      s += "(laggy or crashed)";
    }

    if (f) {
      f->open_object_section("mds");
      f->dump_unsigned("filesystem_id", fscid);
      f->dump_unsigned("rank", info.rank);
      f->dump_string("name", info.name);
      f->dump_string("status", s);
      f->dump_unsigned("gid", gid);
      f->close_section();
    } else if (info.state != MDSMap::DaemonState::STATE_STANDBY_REPLAY) {
      by_rank[mds_role_t(fscid, info.rank)] = std::make_pair(info.state, info.name + "=" + s);
    }
    by_state[info.state]++;
  }

  if (f) {
    f->close_section();
  } else {
    // Small clusters (1-4 ranks): print per-rank detail, then only the
    // residual state counts (detailed ranks are decremented out).
    if (0 < by_rank.size() && by_rank.size() < 5) {
      if (filesystems.size() > 1) {
        // Disambiguate filesystems
        std::map<std::string, std::string> pretty;
        for (const auto& [role,status] : by_rank) {
          const auto &fs_name = filesystems.at(role.fscid)->mds_map.fs_name;
          CachedStackStringStream css;
          *css << fs_name << ":" << role.rank;
          pretty.emplace(std::piecewise_construct, std::forward_as_tuple(css->strv()), std::forward_as_tuple(status.second));
          --by_state[status.first]; /* already printed! */
        }
        *out << " " << pretty;
      } else {
        // Omit FSCID in output when only one filesystem exists
        std::map<mds_rank_t, std::string> shortened;
        for (const auto& [role,status] : by_rank) {
          shortened[role.rank] = status.second;
          --by_state[status.first]; /* already printed! */
        }
        *out << " " << shortened;
      }
    }
    for (const auto& [state, count] : by_state) {
      if (count > 0) {
        auto s = std::string_view(ceph_mds_state_name(state));
        *out << " " << count << " " << s;
      }
    }
  }

  if (f) {
    const auto state = MDSMap::DaemonState::STATE_STANDBY;
    auto&& name = ceph_mds_state_name(state);
    auto count = standby_daemons.size();
    f->dump_unsigned(name, count);
  }

  // Aggregate failed/damaged rank counts across all filesystems.
  size_t failed = 0;
  size_t damaged = 0;
  for (const auto& p : filesystems) {
    auto& fs = p.second;
    failed += fs->mds_map.failed.size();
    damaged += fs->mds_map.damaged.size();
  }

  if (failed > 0) {
    if (f) {
      f->dump_unsigned("failed", failed);
    } else {
      *out << ", " << failed << " failed";
    }
  }

  if (damaged > 0) {
    if (f) {
      f->dump_unsigned("damaged", damaged);
    } else {
      *out << ", " << damaged << " damaged";
    }
  }
  //if (stopped.size())
  //out << ", " << stopped.size() << " stopped";
}


// Create and register a new Filesystem with the given name and pools.
// Assigns a fresh fscid when the mon quorum supports Jewel encoding,
// otherwise uses FS_CLUSTER_ID_ANONYMOUS (legacy single-fs mode).
// The first filesystem created becomes the legacy-client default.
Filesystem::ref FSMap::create_filesystem(std::string_view name,
    int64_t metadata_pool, int64_t data_pool, uint64_t features)
{
  auto fs = Filesystem::create();
  fs->mds_map.epoch = epoch;
  fs->mds_map.fs_name = name;
  fs->mds_map.data_pools.push_back(data_pool);
  fs->mds_map.metadata_pool = metadata_pool;
  fs->mds_map.cas_pool = -1;
  fs->mds_map.compat = compat;
  fs->mds_map.created = ceph_clock_now();
  fs->mds_map.modified = ceph_clock_now();
  fs->mds_map.enabled = true;
  if (features & CEPH_FEATURE_SERVER_JEWEL) {
    fs->fscid = next_filesystem_id++;
    // ANONYMOUS is only for upgrades from legacy mdsmaps, we should
    // have initialized next_filesystem_id such that it's never used here.
    ceph_assert(fs->fscid != FS_CLUSTER_ID_ANONYMOUS);
  } else {
    // Use anon fscid because this will get thrown away when encoding
    // as legacy MDSMap for legacy mons.
    ceph_assert(filesystems.empty());
    fs->fscid = FS_CLUSTER_ID_ANONYMOUS;
  }
  filesystems[fs->fscid] = fs;

  // Created first filesystem? Set it as the one
  // for legacy clients to use
  if (filesystems.size() == 1) {
    legacy_client_fscid = fs->fscid;
  }

  return fs;
}

// Replace an existing filesystem's MDSMap with a fresh one (fs reset),
// keeping its fscid, pools, name and standby settings, and remembering
// previously-started ranks in `stopped` so they reuse their inotables.
void FSMap::reset_filesystem(fs_cluster_id_t fscid)
{
  auto fs = get_filesystem(fscid);
  auto new_fs = Filesystem::create();

  // Populate rank 0 as existing (so don't go into CREATING)
  // but failed (so that next available MDS is assigned the rank)
  new_fs->mds_map.in.insert(mds_rank_t(0));
  new_fs->mds_map.failed.insert(mds_rank_t(0));

  // Carry forward what makes sense
  new_fs->fscid = fs->fscid;
  new_fs->mds_map.inline_data_enabled = fs->mds_map.inline_data_enabled;
  new_fs->mds_map.data_pools = fs->mds_map.data_pools;
  new_fs->mds_map.metadata_pool = fs->mds_map.metadata_pool;
  new_fs->mds_map.cas_pool = fs->mds_map.cas_pool;
  new_fs->mds_map.fs_name = fs->mds_map.fs_name;
  new_fs->mds_map.compat = compat;
  new_fs->mds_map.created = ceph_clock_now();
  new_fs->mds_map.modified = ceph_clock_now();
  new_fs->mds_map.standby_count_wanted = fs->mds_map.standby_count_wanted;
  new_fs->mds_map.enabled = true;

  // Remember mds ranks that have ever started. (They should load old inotable
  // instead of creating new one if they start again.)
  new_fs->mds_map.stopped.insert(fs->mds_map.in.begin(), fs->mds_map.in.end());
  new_fs->mds_map.stopped.insert(fs->mds_map.stopped.begin(), fs->mds_map.stopped.end());
  new_fs->mds_map.stopped.erase(mds_rank_t(0));

  // Persist the new FSMap
  filesystems[new_fs->fscid] = new_fs;
}

// Legacy health reporting: collect per-filesystem health plus a warning
// when fewer standbys exist than the largest per-fs wanted count.
void FSMap::get_health(list<pair<health_status_t,string> >& summary,
                       list<pair<health_status_t,string> > *detail) const
{
  mds_rank_t standby_count_wanted = 0;
  for (const auto &i : filesystems) {
    const auto &fs = i.second;

    // TODO: move get_health up into here so that we can qualify
    // all the messages with what filesystem they're talking about
    fs->mds_map.get_health(summary, detail);

    standby_count_wanted = std::max(standby_count_wanted, fs->mds_map.get_standby_count_wanted((mds_rank_t)standby_daemons.size()));
  }

  if (standby_count_wanted) {
    std::ostringstream oss;
    // NOTE(review): message says "want N more" but N is the wanted total
    // computed above, not a delta — confirm against get_standby_count_wanted.
    oss << "insufficient standby daemons available: have " << standby_daemons.size() << "; want " << standby_count_wanted << " more";
    summary.push_back(make_pair(HEALTH_WARN, oss.str()));
  }
}

// Run per-filesystem health checks; returns true if any map changed.
bool FSMap::check_health(void)
{
  bool changed = false;
  for (auto &i : filesystems) {
    changed |= i.second->mds_map.check_health((mds_rank_t)standby_daemons.size());
  }
  return changed;
}

// Modern structured health checks (FS_WITH_FAILED_MDS,
// MDS_INSUFFICIENT_STANDBY), merged into *checks.
void FSMap::get_health_checks(health_check_map_t *checks) const
{
  mds_rank_t standby_count_wanted = 0;
  for (const auto &i : filesystems) {
    const auto &fs = i.second;
    health_check_map_t fschecks;

    fs->mds_map.get_health_checks(&fschecks);

    // Some of the failed ranks might be transient (i.e. there are standbys
    // ready to replace them). We will report only on "stuck" failed, i.e.
    // ranks which are failed and have no standby replacement available.
    std::set<mds_rank_t> stuck_failed;

    // A failed rank is "stuck" when find_replacement_for yields no GID.
    for (const auto &rank : fs->mds_map.failed) {
      auto&& replacement = find_replacement_for({fs->fscid, rank}, {});
      if (replacement == MDS_GID_NONE) {
        stuck_failed.insert(rank);
      }
    }

    // FS_WITH_FAILED_MDS
    if (!stuck_failed.empty()) {
      health_check_t& fscheck = checks->get_or_add(
        "FS_WITH_FAILED_MDS", HEALTH_WARN,
        "%num% filesystem%plurals% %hasorhave% a failed mds daemon");
      ostringstream ss;
      ss << "fs " << fs->mds_map.fs_name << " has " << stuck_failed.size()
         << " failed mds" << (stuck_failed.size() > 1 ? "s" : "");
      fscheck.detail.push_back(ss.str());
    }

    checks->merge(fschecks);
    standby_count_wanted = std::max(
      standby_count_wanted,
      fs->mds_map.get_standby_count_wanted((mds_rank_t)standby_daemons.size()));
  }

  // MDS_INSUFFICIENT_STANDBY
  if (standby_count_wanted) {
    std::ostringstream oss, dss;
    oss << "insufficient standby MDS daemons available";
    auto& d = checks->get_or_add("MDS_INSUFFICIENT_STANDBY", HEALTH_WARN, oss.str());
    dss << "have " << standby_daemons.size() << "; want " << standby_count_wanted
        << " more";
    d.detail.push_back(dss.str());
  }
}

// Encode the FSMap for the wire/store.  With Jewel-capable peers this is
// the native FSMap encoding (struct_v 7); otherwise it degrades to a
// single legacy MDSMap (standbys folded into mds_info) for old mons.
// Field order here IS the on-disk/on-wire format — do not reorder.
void FSMap::encode(bufferlist& bl, uint64_t features) const
{
  if (features & CEPH_FEATURE_SERVER_JEWEL) {
    ENCODE_START(7, 6, bl);
    encode(epoch, bl);
    encode(next_filesystem_id, bl);
    encode(legacy_client_fscid, bl);
    encode(compat, bl);
    encode(enable_multiple, bl);
    {
      // Encoded as a vector of Filesystem (keys are recoverable from
      // each Filesystem's own fscid on decode).
      std::vector<Filesystem::ref> v;
      v.reserve(filesystems.size());
      for (auto& p : filesystems) v.emplace_back(p.second);
      encode(v, bl, features);
    }
    encode(mds_roles, bl);
    encode(standby_daemons, bl, features);
    encode(standby_epochs, bl);
    encode(ever_enabled_multiple, bl);
    ENCODE_FINISH(bl);
  } else {
    if (filesystems.empty()) {
      // No filesystem: emit an empty (disabled) MDSMap at our epoch.
      MDSMap disabled_map;
      disabled_map.epoch = epoch;
      disabled_map.encode(bl, features);
    } else {
      // MDSMonitor should never have created multiple filesystems
      // until the quorum features indicated Jewel
      ceph_assert(filesystems.size() == 1);
      auto fs = filesystems.begin()->second;

      // Take the MDSMap for the enabled filesystem, and populate its
      // mds_info with the standbys to get a pre-jewel-style mon MDSMap.
      MDSMap full_mdsmap = fs->mds_map;
      full_mdsmap.epoch = epoch;
      for (const auto &p : standby_daemons) {
        full_mdsmap.mds_info[p.first] = p.second;
      }

      // Old MDSMaps don't set rank on standby replay daemons
      for (auto &i : full_mdsmap.mds_info) {
        auto &info = i.second;
        if (info.state == MDSMap::STATE_STANDBY_REPLAY) {
          info.rank = MDS_RANK_NONE;
        }
      }

      full_mdsmap.encode(bl, features);
    }
  }
}

// Decode either a native FSMap (struct_v >= 6) or, for upgrades, a
// legacy MDSMap (struct_v < 6) which is then converted into a
// single-filesystem FSMap.
void FSMap::decode(bufferlist::const_iterator& p)
{
  // The highest MDSMap encoding version before we changed the
  // MDSMonitor to store an FSMap instead of an MDSMap was
  // 5, so anything older than 6 is decoded as an MDSMap,
  // and anything newer is decoded as an FSMap.
  DECODE_START_LEGACY_COMPAT_LEN_16(7, 4, 4, p);
  if (struct_v < 6) {
    // Because the mon used to store an MDSMap where we now
    // store an FSMap, FSMap knows how to decode the legacy
    // MDSMap format (it never needs to encode it though).
    MDSMap legacy_mds_map;

    // Decoding an MDSMap (upgrade)
    decode(epoch, p);
    decode(legacy_mds_map.flags, p);
    decode(legacy_mds_map.last_failure, p);
    decode(legacy_mds_map.root, p);
    decode(legacy_mds_map.session_timeout, p);
    decode(legacy_mds_map.session_autoclose, p);
    decode(legacy_mds_map.max_file_size, p);
    decode(legacy_mds_map.max_mds, p);
    decode(legacy_mds_map.mds_info, p);
    if (struct_v < 3) {
      // Very old maps encoded pools as explicit 32-bit lists.
      __u32 n;
      decode(n, p);
      while (n--) {
        __u32 m;
        decode(m, p);
        legacy_mds_map.data_pools.push_back(m);
      }
      __s32 s;
      decode(s, p);
      legacy_mds_map.cas_pool = s;
    } else {
      decode(legacy_mds_map.data_pools, p);
      decode(legacy_mds_map.cas_pool, p);
    }

    // kclient ignores everything from here
    // `ev` is the legacy "extended version" embedded in the payload.
    __u16 ev = 1;
    if (struct_v >= 2)
      decode(ev, p);
    if (ev >= 3)
      decode(legacy_mds_map.compat, p);
    else
      legacy_mds_map.compat = MDSMap::get_compat_set_base();
    if (ev < 5) {
      __u32 n;
      decode(n, p);
      legacy_mds_map.metadata_pool = n;
    } else {
      decode(legacy_mds_map.metadata_pool, p);
    }
    decode(legacy_mds_map.created, p);
    decode(legacy_mds_map.modified, p);
    decode(legacy_mds_map.tableserver, p);
    decode(legacy_mds_map.in, p);
    std::map<mds_rank_t,int32_t> inc;  // Legacy field, parse and drop
    decode(inc, p);
    decode(legacy_mds_map.up, p);
    decode(legacy_mds_map.failed, p);
    decode(legacy_mds_map.stopped, p);
    if (ev >= 4)
      decode(legacy_mds_map.last_failure_osd_epoch, p);
    if (ev >= 6) {
      if (ev < 10) {
        // previously this was a bool about snaps, not a flag map
        bool flag;
        decode(flag, p);
        legacy_mds_map.ever_allowed_features = flag ?
          CEPH_MDSMAP_ALLOW_SNAPS : 0;
        decode(flag, p);
        legacy_mds_map.explicitly_allowed_features = flag ?
          CEPH_MDSMAP_ALLOW_SNAPS : 0;
      } else {
        decode(legacy_mds_map.ever_allowed_features, p);
        decode(legacy_mds_map.explicitly_allowed_features, p);
      }
    } else {
      legacy_mds_map.ever_allowed_features = 0;
      legacy_mds_map.explicitly_allowed_features = 0;
    }
    if (ev >= 7)
      decode(legacy_mds_map.inline_data_enabled, p);

    if (ev >= 8) {
      ceph_assert(struct_v >= 5);
      decode(legacy_mds_map.enabled, p);
      decode(legacy_mds_map.fs_name, p);
    } else {
      legacy_mds_map.fs_name = "default";
      if (epoch > 1) {
        // If an MDS has ever been started, epoch will be greater than 1,
        // assume filesystem is enabled.
        legacy_mds_map.enabled = true;
      } else {
        // Upgrading from a cluster that never used an MDS, switch off
        // filesystem until it's explicitly enabled.
        legacy_mds_map.enabled = false;
      }
    }

    if (ev >= 9) {
      decode(legacy_mds_map.damaged, p);
    }

    // We're upgrading, populate filesystems from the legacy fields
    filesystems.clear();
    standby_daemons.clear();
    standby_epochs.clear();
    mds_roles.clear();
    compat = legacy_mds_map.compat;
    enable_multiple = false;

    // Synthesise a Filesystem from legacy_mds_map, if enabled
    if (legacy_mds_map.enabled) {
      // Construct a Filesystem from the legacy MDSMap
      auto migrate_fs = Filesystem::create();
      migrate_fs->fscid = FS_CLUSTER_ID_ANONYMOUS;
      migrate_fs->mds_map = legacy_mds_map;
      migrate_fs->mds_map.epoch = epoch;
      filesystems[migrate_fs->fscid] = migrate_fs;

      // List of GIDs that had invalid states
      std::set<mds_gid_t> drop_gids;

      // Construct mds_roles, standby_daemons, and remove
      // standbys from the MDSMap in the Filesystem.
      for (const auto& [gid, info] : migrate_fs->mds_map.mds_info) {
        if (info.state == MDSMap::STATE_STANDBY_REPLAY) {
          /* drop any legacy standby-replay daemons */
          drop_gids.insert(gid);
        } else if (info.rank == MDS_RANK_NONE) {
          if (info.state != MDSMap::STATE_STANDBY) {
            // Old MDSMaps can have down:dne here, which
            // is invalid in an FSMap (#17837)
            drop_gids.insert(gid);
          } else {
            insert(info); // into standby_daemons
          }
        } else {
          mds_roles[gid] = migrate_fs->fscid;
        }
      }
      for (const auto &p : standby_daemons) {
        // Erase from this Filesystem's MDSMap, because it has
        // been copied into FSMap::Standby_daemons above
        migrate_fs->mds_map.mds_info.erase(p.first);
      }
      for (const auto &gid : drop_gids) {
        // Throw away all info for this MDS because it was identified
        // as having invalid state above.
        migrate_fs->mds_map.mds_info.erase(gid);
      }

      legacy_client_fscid = migrate_fs->fscid;
    } else {
      legacy_client_fscid = FS_CLUSTER_ID_NONE;
    }
  } else {
    // Native FSMap encoding (struct_v >= 6); mirrors encode() order.
    decode(epoch, p);
    decode(next_filesystem_id, p);
    decode(legacy_client_fscid, p);
    decode(compat, p);
    decode(enable_multiple, p);
    {
      std::vector<Filesystem::ref> v;
      decode(v, p);
      filesystems.clear();
      for (auto& ref : v) {
        auto em = filesystems.emplace(std::piecewise_construct, std::forward_as_tuple(ref->fscid), std::forward_as_tuple(std::move(ref)));
        ceph_assert(em.second);
      }
    }
    decode(mds_roles, p);
    decode(standby_daemons, p);
    decode(standby_epochs, p);
    if (struct_v >= 7) {
      decode(ever_enabled_multiple, p);
    }
  }

  DECODE_FINISH(p);
}

// Drop references to pools that no longer exist, in every filesystem.
void FSMap::sanitize(const std::function<bool(int64_t pool)>& pool_exists)
{
  for (auto &fs : filesystems) {
    fs.second->mds_map.sanitize(pool_exists);
  }
}

// Encode a Filesystem: fscid plus its MDSMap nested in its own bufferlist.
void Filesystem::encode(bufferlist& bl, uint64_t features) const
{
  ENCODE_START(1, 1, bl);
  encode(fscid, bl);
  bufferlist mdsmap_bl;
  mds_map.encode(mdsmap_bl, features);
  encode(mdsmap_bl, bl);
  ENCODE_FINISH(bl);
}

// Inverse of Filesystem::encode.
void Filesystem::decode(bufferlist::const_iterator& p)
{
  DECODE_START(1, p);
  decode(fscid, p);
  bufferlist mdsmap_bl;
  decode(mdsmap_bl, p);
  auto mdsmap_bl_iter = mdsmap_bl.cbegin();
  mds_map.decode(mdsmap_bl_iter);
  DECODE_FINISH(p);
}

// Resolve a filesystem by numeric fscid or, failing that, by name.
// Returns 0 and sets *result on success, -ENOENT if nothing matches.
int FSMap::parse_filesystem(
    std::string_view ns_str,
    Filesystem::const_ref* result
    ) const
{
  std::string ns_err;
  std::string s(ns_str);
  fs_cluster_id_t fscid = strict_strtol(s.c_str(), 10, &ns_err);
  if (!ns_err.empty() || filesystems.count(fscid) == 0) {
    // Not a known numeric id: fall back to a name lookup.
    for (auto &fs : filesystems) {
      if (fs.second->mds_map.fs_name == s) {
        *result = std::const_pointer_cast<const Filesystem>(fs.second);
        return 0;
      }
    }
    return -ENOENT;
  } else {
    *result = get_filesystem(fscid);
    return 0;
  }
}

// Human-readable dump of one filesystem: header line plus its MDSMap.
void Filesystem::print(std::ostream &out) const
{
  out << "Filesystem '" << mds_map.fs_name
      << "' (" << fscid << ")" << std::endl;
  mds_map.print(out);
}

// Pick any usable standby (not laggy, not frozen); MDS_GID_NONE if none.
mds_gid_t FSMap::get_available_standby() const
{
  for (const auto& [gid, info] : standby_daemons) {
    ceph_assert(info.rank == MDS_RANK_NONE);
    ceph_assert(info.state == MDSMap::STATE_STANDBY);

    if (info.laggy() || info.is_frozen()) {
      continue;
    }

    return gid;
  }
  return MDS_GID_NONE;
}

// Find a daemon to take over `role`: prefer that rank's standby-replay
// daemon if one exists (unless frozen), else any available standby.
// NOTE(review): the `name` parameter is currently unused here.
mds_gid_t FSMap::find_replacement_for(mds_role_t role, std::string_view name) const
{
  auto&& fs = get_filesystem(role.fscid);

  // First see if we have a STANDBY_REPLAY
  for (const auto& [gid, info] : fs->mds_map.mds_info) {
    if (info.rank == role.rank && info.state == MDSMap::STATE_STANDBY_REPLAY) {
      if (info.is_frozen()) {
        /* the standby-replay is frozen, do nothing! */
        return MDS_GID_NONE;
      } else {
        return gid;
      }
    }
  }

  return get_available_standby();
}

// Assert the cross-index invariants between filesystems, mds_roles,
// standby_daemons and standby_epochs.  Debug/consistency aid only.
void FSMap::sanity() const
{
  if (legacy_client_fscid != FS_CLUSTER_ID_NONE) {
    ceph_assert(filesystems.count(legacy_client_fscid) == 1);
  }

  for (const auto &i : filesystems) {
    auto fs = i.second;
    ceph_assert(fs->mds_map.compat.compare(compat) == 0);
    ceph_assert(fs->fscid == i.first);
    for (const auto &j : fs->mds_map.mds_info) {
      ceph_assert(j.second.rank != MDS_RANK_NONE);
      ceph_assert(mds_roles.count(j.first) == 1);
      ceph_assert(standby_daemons.count(j.first) == 0);
      ceph_assert(standby_epochs.count(j.first) == 0);
      ceph_assert(mds_roles.at(j.first) == i.first);
      if (j.second.state != MDSMap::STATE_STANDBY_REPLAY) {
        ceph_assert(fs->mds_map.up.at(j.second.rank) == j.first);
        ceph_assert(fs->mds_map.failed.count(j.second.rank) == 0);
        ceph_assert(fs->mds_map.damaged.count(j.second.rank) == 0);
      }
    }

    for (const auto &j : fs->mds_map.up) {
      mds_rank_t rank = j.first;
      ceph_assert(fs->mds_map.in.count(rank) == 1);
      mds_gid_t gid = j.second;
      ceph_assert(fs->mds_map.mds_info.count(gid) == 1);
    }
  }

  for (const auto &i : standby_daemons) {
    ceph_assert(i.second.state == MDSMap::STATE_STANDBY);
    ceph_assert(i.second.rank == MDS_RANK_NONE);
    ceph_assert(i.second.global_id == i.first);
    ceph_assert(standby_epochs.count(i.first) == 1);
    ceph_assert(mds_roles.count(i.first) == 1);
    ceph_assert(mds_roles.at(i.first) == FS_CLUSTER_ID_NONE);
  }

  for (const auto &i : standby_epochs) {
    ceph_assert(standby_daemons.count(i.first) == 1);
  }

  for (const auto &i : mds_roles) {
    if (i.second == FS_CLUSTER_ID_NONE) {
      ceph_assert(standby_daemons.count(i.first) == 1);
    } else {
      ceph_assert(filesystems.count(i.second) == 1);
      ceph_assert(filesystems.at(i.second)->mds_map.mds_info.count(i.first) == 1);
    }
  }
}

// Promote a standby (or that rank's standby-replay) daemon to hold
// `assigned_rank` in `filesystem`, updating role/standby bookkeeping.
void FSMap::promote(
    mds_gid_t standby_gid,
    Filesystem& filesystem,
    mds_rank_t assigned_rank)
{
ceph_assert(gid_exists(standby_gid)); + bool is_standby_replay = mds_roles.at(standby_gid) != FS_CLUSTER_ID_NONE; + if (!is_standby_replay) { + ceph_assert(standby_daemons.count(standby_gid)); + ceph_assert(standby_daemons.at(standby_gid).state == MDSMap::STATE_STANDBY); + } + + MDSMap &mds_map = filesystem.mds_map; + + // Insert daemon state to Filesystem + if (!is_standby_replay) { + mds_map.mds_info[standby_gid] = standby_daemons.at(standby_gid); + } else { + ceph_assert(mds_map.mds_info.count(standby_gid)); + ceph_assert(mds_map.mds_info.at(standby_gid).state == MDSMap::STATE_STANDBY_REPLAY); + ceph_assert(mds_map.mds_info.at(standby_gid).rank == assigned_rank); + } + MDSMap::mds_info_t &info = mds_map.mds_info[standby_gid]; + + if (mds_map.stopped.erase(assigned_rank)) { + // The cluster is being expanded with a stopped rank + info.state = MDSMap::STATE_STARTING; + } else if (!mds_map.is_in(assigned_rank)) { + // The cluster is being expanded with a new rank + info.state = MDSMap::STATE_CREATING; + } else { + // An existing rank is being assigned to a replacement + info.state = MDSMap::STATE_REPLAY; + mds_map.failed.erase(assigned_rank); + } + info.rank = assigned_rank; + info.inc = epoch; + mds_roles[standby_gid] = filesystem.fscid; + + // Update the rank state in Filesystem + mds_map.in.insert(assigned_rank); + mds_map.up[assigned_rank] = standby_gid; + + // Remove from the list of standbys + if (!is_standby_replay) { + standby_daemons.erase(standby_gid); + standby_epochs.erase(standby_gid); + } + + // Indicate that Filesystem has been modified + mds_map.epoch = epoch; +} + +void FSMap::assign_standby_replay( + const mds_gid_t standby_gid, + const fs_cluster_id_t leader_ns, + const mds_rank_t leader_rank) +{ + ceph_assert(mds_roles.at(standby_gid) == FS_CLUSTER_ID_NONE); + ceph_assert(gid_exists(standby_gid)); + ceph_assert(!gid_has_rank(standby_gid)); + ceph_assert(standby_daemons.count(standby_gid)); + + // Insert to the filesystem + auto fs = 
filesystems.at(leader_ns); + fs->mds_map.mds_info[standby_gid] = standby_daemons.at(standby_gid); + fs->mds_map.mds_info[standby_gid].rank = leader_rank; + fs->mds_map.mds_info[standby_gid].state = MDSMap::STATE_STANDBY_REPLAY; + mds_roles[standby_gid] = leader_ns; + + // Remove from the list of standbys + standby_daemons.erase(standby_gid); + standby_epochs.erase(standby_gid); + + // Indicate that Filesystem has been modified + fs->mds_map.epoch = epoch; +} + +void FSMap::erase(mds_gid_t who, epoch_t blacklist_epoch) +{ + if (mds_roles.at(who) == FS_CLUSTER_ID_NONE) { + standby_daemons.erase(who); + standby_epochs.erase(who); + } else { + auto &fs = filesystems.at(mds_roles.at(who)); + const auto &info = fs->mds_map.mds_info.at(who); + if (info.state != MDSMap::STATE_STANDBY_REPLAY) { + if (info.state == MDSMap::STATE_CREATING) { + // If this gid didn't make it past CREATING, then forget + // the rank ever existed so that next time it's handed out + // to a gid it'll go back into CREATING. + fs->mds_map.in.erase(info.rank); + } else { + // Put this rank into the failed list so that the next available + // STANDBY will pick it up. 
+ fs->mds_map.failed.insert(info.rank); + } + ceph_assert(fs->mds_map.up.at(info.rank) == info.global_id); + fs->mds_map.up.erase(info.rank); + } + fs->mds_map.mds_info.erase(who); + fs->mds_map.last_failure_osd_epoch = blacklist_epoch; + fs->mds_map.epoch = epoch; + } + + mds_roles.erase(who); +} + +void FSMap::damaged(mds_gid_t who, epoch_t blacklist_epoch) +{ + ceph_assert(mds_roles.at(who) != FS_CLUSTER_ID_NONE); + auto fs = filesystems.at(mds_roles.at(who)); + mds_rank_t rank = fs->mds_map.mds_info[who].rank; + + erase(who, blacklist_epoch); + fs->mds_map.failed.erase(rank); + fs->mds_map.damaged.insert(rank); + + ceph_assert(fs->mds_map.epoch == epoch); +} + +/** + * Update to indicate that the rank `rank` is to be removed + * from the damaged list of the filesystem `fscid` + */ +bool FSMap::undamaged(const fs_cluster_id_t fscid, const mds_rank_t rank) +{ + auto fs = filesystems.at(fscid); + + if (fs->mds_map.damaged.erase(rank)) { + fs->mds_map.failed.insert(rank); + fs->mds_map.epoch = epoch; + return true; + } else { + return false; + } +} + +void FSMap::insert(const MDSMap::mds_info_t &new_info) +{ + ceph_assert(new_info.state == MDSMap::STATE_STANDBY); + ceph_assert(new_info.rank == MDS_RANK_NONE); + mds_roles[new_info.global_id] = FS_CLUSTER_ID_NONE; + standby_daemons[new_info.global_id] = new_info; + standby_epochs[new_info.global_id] = epoch; +} + +std::list<mds_gid_t> FSMap::stop(mds_gid_t who) +{ + ceph_assert(mds_roles.at(who) != FS_CLUSTER_ID_NONE); + auto fs = filesystems.at(mds_roles.at(who)); + const auto &info = fs->mds_map.mds_info.at(who); + fs->mds_map.up.erase(info.rank); + fs->mds_map.in.erase(info.rank); + fs->mds_map.stopped.insert(info.rank); + + // Also drop any standby replays that were following this rank + std::list<mds_gid_t> standbys; + for (const auto &i : fs->mds_map.mds_info) { + const auto &other_gid = i.first; + const auto &other_info = i.second; + if (other_info.rank == info.rank + && other_info.state == 
MDSMap::STATE_STANDBY_REPLAY) { + standbys.push_back(other_gid); + erase(other_gid, 0); + } + } + + fs->mds_map.mds_info.erase(who); + mds_roles.erase(who); + + fs->mds_map.epoch = epoch; + + return standbys; +} + + +/** + * Given one of the following forms: + * <fs name>:<rank> + * <fs id>:<rank> + * <rank> + * + * Parse into a mds_role_t. The rank-only form is only valid + * if legacy_client_ns is set. + */ +int FSMap::parse_role( + std::string_view role_str, + mds_role_t *role, + std::ostream &ss) const +{ + size_t colon_pos = role_str.find(":"); + size_t rank_pos; + Filesystem::const_ref fs; + if (colon_pos == std::string::npos) { + if (legacy_client_fscid == FS_CLUSTER_ID_NONE) { + ss << "No filesystem selected"; + return -ENOENT; + } + fs = get_filesystem(legacy_client_fscid); + rank_pos = 0; + } else { + if (parse_filesystem(role_str.substr(0, colon_pos), &fs) < 0) { + ss << "Invalid filesystem"; + return -ENOENT; + } + rank_pos = colon_pos+1; + } + + mds_rank_t rank; + std::string err; + std::string rank_str(role_str.substr(rank_pos)); + long rank_i = strict_strtol(rank_str.c_str(), 10, &err); + if (rank_i < 0 || !err.empty()) { + ss << "Invalid rank '" << rank_str << "'"; + return -EINVAL; + } else { + rank = rank_i; + } + + if (fs->mds_map.in.count(rank) == 0) { + ss << "Rank '" << rank << "' not found"; + return -ENOENT; + } + + *role = {fs->fscid, rank}; + + return 0; +} diff --git a/src/mds/FSMap.h b/src/mds/FSMap.h new file mode 100644 index 00000000..e02a3d72 --- /dev/null +++ b/src/mds/FSMap.h @@ -0,0 +1,532 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#ifndef CEPH_FSMAP_H +#define CEPH_FSMAP_H + +#include <map> +#include <memory> +#include <set> +#include <string> +#include <string_view> + +#include <errno.h> + +#include "include/types.h" +#include "common/Clock.h" +#include "mds/MDSMap.h" + +#include "include/CompatSet.h" +#include "include/ceph_features.h" +#include "common/Formatter.h" +#include "mds/mdstypes.h" + +class CephContext; +class health_check_map_t; + +#define MDS_FS_NAME_DEFAULT "cephfs" + +/** + * The MDSMap and any additional fields describing a particular + * filesystem (a unique fs_cluster_id_t). + */ +class Filesystem +{ +public: + using ref = std::shared_ptr<Filesystem>; + using const_ref = std::shared_ptr<Filesystem const>; + + template<typename... Args> + static ref create(Args&&... args) + { + return std::make_shared<Filesystem>(std::forward<Args>(args)...); + } + + void encode(bufferlist& bl, uint64_t features) const; + void decode(bufferlist::const_iterator& p); + + void dump(Formatter *f) const; + void print(std::ostream& out) const; + + /** + * Return true if a daemon is already assigned as + * STANDBY_REPLAY for the gid `who` + */ + bool has_standby_replay(mds_gid_t who) const + { + return get_standby_replay(who) != MDS_GID_NONE; + } + mds_gid_t get_standby_replay(mds_gid_t who) const + { + for (const auto &i : mds_map.mds_info) { + const auto &info = i.second; + if (info.state == MDSMap::STATE_STANDBY_REPLAY + && info.rank == mds_map.mds_info.at(who).rank) { + return info.global_id; + } + } + return MDS_GID_NONE; + } + bool is_standby_replay(mds_gid_t who) const + { + auto p = mds_map.mds_info.find(who); + if (p != mds_map.mds_info.end() && + p->second.state == MDSMap::STATE_STANDBY_REPLAY) { + return true; + } + return false; + } + + fs_cluster_id_t fscid = FS_CLUSTER_ID_NONE; + MDSMap mds_map; +}; +WRITE_CLASS_ENCODER_FEATURES(Filesystem) + +class FSMap { +protected: + epoch_t epoch = 0; + uint64_t next_filesystem_id = FS_CLUSTER_ID_ANONYMOUS + 1; + fs_cluster_id_t 
legacy_client_fscid = FS_CLUSTER_ID_NONE; + CompatSet compat; + bool enable_multiple = false; + bool ever_enabled_multiple = false; // < the cluster had multiple MDSes enabled once + + std::map<fs_cluster_id_t, Filesystem::ref> filesystems; + + // Remember which Filesystem an MDS daemon's info is stored in + // (or in standby_daemons for FS_CLUSTER_ID_NONE) + std::map<mds_gid_t, fs_cluster_id_t> mds_roles; + + // For MDS daemons not yet assigned to a Filesystem + std::map<mds_gid_t, MDSMap::mds_info_t> standby_daemons; + std::map<mds_gid_t, epoch_t> standby_epochs; + +public: + + friend class MDSMonitor; + friend class PaxosFSMap; + + FSMap() : compat(MDSMap::get_compat_set_default()) {} + + FSMap(const FSMap &rhs) + : + epoch(rhs.epoch), + next_filesystem_id(rhs.next_filesystem_id), + legacy_client_fscid(rhs.legacy_client_fscid), + compat(rhs.compat), + enable_multiple(rhs.enable_multiple), + ever_enabled_multiple(rhs.ever_enabled_multiple), + mds_roles(rhs.mds_roles), + standby_daemons(rhs.standby_daemons), + standby_epochs(rhs.standby_epochs) + { + filesystems.clear(); + for (const auto &i : rhs.filesystems) { + const auto &fs = i.second; + filesystems[fs->fscid] = std::make_shared<Filesystem>(*fs); + } + } + + FSMap &operator=(const FSMap &rhs) + { + epoch = rhs.epoch; + next_filesystem_id = rhs.next_filesystem_id; + legacy_client_fscid = rhs.legacy_client_fscid; + compat = rhs.compat; + enable_multiple = rhs.enable_multiple; + mds_roles = rhs.mds_roles; + standby_daemons = rhs.standby_daemons; + standby_epochs = rhs.standby_epochs; + + filesystems.clear(); + for (const auto &i : rhs.filesystems) { + const auto &fs = i.second; + filesystems[fs->fscid] = std::make_shared<Filesystem>(*fs); + } + + return *this; + } + + const CompatSet &get_compat() const {return compat;} + + void set_enable_multiple(const bool v) + { + enable_multiple = v; + if (true == v) { + ever_enabled_multiple = true; + } + } + + bool get_enable_multiple() const + { + return enable_multiple; 
+ } + + void set_legacy_client_fscid(fs_cluster_id_t fscid) + { + ceph_assert(fscid == FS_CLUSTER_ID_NONE || filesystems.count(fscid)); + legacy_client_fscid = fscid; + } + + fs_cluster_id_t get_legacy_client_fscid() const + { + return legacy_client_fscid; + } + + size_t get_num_standby() const { + return standby_daemons.size(); + } + + bool is_any_degraded() const { + for (auto& i : filesystems) { + if (i.second->mds_map.is_degraded()) { + return true; + } + } + return false; + } + + /** + * Get state of all daemons (for all filesystems, including all standbys) + */ + std::map<mds_gid_t, MDSMap::mds_info_t> get_mds_info() const + { + std::map<mds_gid_t, MDSMap::mds_info_t> result; + for (const auto &i : standby_daemons) { + result[i.first] = i.second; + } + + for (const auto &i : filesystems) { + const auto &fs_info = i.second->mds_map.get_mds_info(); + for (const auto &j : fs_info) { + result[j.first] = j.second; + } + } + + return result; + } + + mds_gid_t get_available_standby() const; + + /** + * Resolve daemon name to GID + */ + mds_gid_t find_mds_gid_by_name(std::string_view s) const + { + const auto info = get_mds_info(); + for (const auto &p : info) { + if (p.second.name == s) { + return p.first; + } + } + return MDS_GID_NONE; + } + + /** + * Resolve daemon name to status + */ + const MDSMap::mds_info_t* find_by_name(std::string_view name) const + { + std::map<mds_gid_t, MDSMap::mds_info_t> result; + for (const auto &i : standby_daemons) { + if (i.second.name == name) { + return &(i.second); + } + } + + for (const auto &i : filesystems) { + const auto &fs_info = i.second->mds_map.get_mds_info(); + for (const auto &j : fs_info) { + if (j.second.name == name) { + return &(j.second); + } + } + } + + return nullptr; + } + + /** + * Does a daemon exist with this GID? + */ + bool gid_exists(mds_gid_t gid) const + { + return mds_roles.count(gid) > 0; + } + + /** + * Does a daemon with this GID exist, *and* have an MDS rank assigned? 
+ */ + bool gid_has_rank(mds_gid_t gid) const + { + return gid_exists(gid) && mds_roles.at(gid) != FS_CLUSTER_ID_NONE; + } + + /** + * Insert a new MDS daemon, as a standby + */ + void insert(const MDSMap::mds_info_t &new_info); + + /** + * Assign an MDS cluster standby replay rank to a standby daemon + */ + void assign_standby_replay( + const mds_gid_t standby_gid, + const fs_cluster_id_t leader_ns, + const mds_rank_t leader_rank); + + /** + * Assign an MDS cluster rank to a standby daemon + */ + void promote( + mds_gid_t standby_gid, + Filesystem& filesystem, + mds_rank_t assigned_rank); + + /** + * A daemon reports that it is STATE_STOPPED: remove it, + * and the rank it held. + * + * @returns a list of any additional GIDs that were removed from the map + * as a side effect (like standby replays) + */ + std::list<mds_gid_t> stop(mds_gid_t who); + + /** + * The rank held by 'who', if any, is to be relinquished, and + * the state for the daemon GID is to be forgotten. + */ + void erase(mds_gid_t who, epoch_t blacklist_epoch); + + /** + * Update to indicate that the rank held by 'who' is damaged + */ + void damaged(mds_gid_t who, epoch_t blacklist_epoch); + + /** + * Update to indicate that the rank `rank` is to be removed + * from the damaged list of the filesystem `fscid` + */ + bool undamaged(const fs_cluster_id_t fscid, const mds_rank_t rank); + + /** + * Initialize a Filesystem and assign a fscid. Update legacy_client_fscid + * to point to the new filesystem if it's the only one. + * + * Caller must already have validated all arguments vs. the existing + * FSMap and OSDMap contents. + */ + Filesystem::ref create_filesystem( + std::string_view name, int64_t metadata_pool, + int64_t data_pool, uint64_t features); + + /** + * Remove the filesystem (it must exist). Caller should already + * have failed out any MDSs that were assigned to the filesystem. 
+ */ + void erase_filesystem(fs_cluster_id_t fscid) + { + filesystems.erase(fscid); + } + + /** + * Reset all the state information (not configuration information) + * in a particular filesystem. Caller must have verified that + * the filesystem already exists. + */ + void reset_filesystem(fs_cluster_id_t fscid); + + /** + * Mutator helper for Filesystem objects: expose a non-const + * Filesystem pointer to `fn` and update epochs appropriately. + */ + template<typename T> + void modify_filesystem(fs_cluster_id_t fscid, T&& fn) + { + auto& fs = filesystems.at(fscid); + fn(fs); + fs->mds_map.epoch = epoch; + } + + /** + * Apply a mutation to the mds_info_t structure for a particular + * daemon (identified by GID), and make appropriate updates to epochs. + */ + template<typename T> + void modify_daemon(mds_gid_t who, T&& fn) + { + const auto& fscid = mds_roles.at(who); + if (fscid == FS_CLUSTER_ID_NONE) { + auto& info = standby_daemons.at(who); + fn(info); + ceph_assert(info.state == MDSMap::STATE_STANDBY); + standby_epochs[who] = epoch; + } else { + auto& fs = filesystems.at(fscid); + auto& info = fs->mds_map.mds_info.at(who); + fn(info); + fs->mds_map.epoch = epoch; + } + } + + /** + * Given that gid exists in a filesystem or as a standby, return + * a reference to its info. + */ + const MDSMap::mds_info_t& get_info_gid(mds_gid_t gid) const + { + auto fscid = mds_roles.at(gid); + if (fscid == FS_CLUSTER_ID_NONE) { + return standby_daemons.at(gid); + } else { + return filesystems.at(fscid)->mds_map.mds_info.at(gid); + } + } + + bool is_standby_replay(mds_gid_t who) const + { + return filesystems.at(mds_roles.at(who))->is_standby_replay(who); + } + + mds_gid_t get_standby_replay(mds_gid_t who) const + { + return filesystems.at(mds_roles.at(who))->get_standby_replay(who); + } + + /** + * A daemon has told us it's compat, and it's too new + * for the one we had previously. Impose the new one + * on all filesystems. 
+ */ + void update_compat(const CompatSet &c) + { + // We could do something more complicated here to enable + // different filesystems to be served by different MDS versions, + // but this is a lot simpler because it doesn't require us to + // track the compat versions for standby daemons. + compat = c; + for (const auto &i : filesystems) { + MDSMap &mds_map = i.second->mds_map; + mds_map.compat = c; + mds_map.epoch = epoch; + } + } + + Filesystem::const_ref get_legacy_filesystem() + { + if (legacy_client_fscid == FS_CLUSTER_ID_NONE) { + return nullptr; + } else { + return filesystems.at(legacy_client_fscid); + } + } + + /** + * A daemon has informed us of its offload targets + */ + void update_export_targets(mds_gid_t who, const std::set<mds_rank_t> &targets) + { + auto fscid = mds_roles.at(who); + modify_filesystem(fscid, [who, &targets](auto&& fs) { + fs->mds_map.mds_info.at(who).export_targets = targets; + }); + } + + epoch_t get_epoch() const { return epoch; } + void inc_epoch() { epoch++; } + + size_t filesystem_count() const {return filesystems.size();} + bool filesystem_exists(fs_cluster_id_t fscid) const {return filesystems.count(fscid) > 0;} + Filesystem::const_ref get_filesystem(fs_cluster_id_t fscid) const {return std::const_pointer_cast<const Filesystem>(filesystems.at(fscid));} + Filesystem::ref get_filesystem(fs_cluster_id_t fscid) {return filesystems.at(fscid);} + Filesystem::const_ref get_filesystem(void) const {return std::const_pointer_cast<const Filesystem>(filesystems.begin()->second);} + Filesystem::const_ref get_filesystem(std::string_view name) const + { + for (const auto& p : filesystems) { + if (p.second->mds_map.fs_name == name) { + return p.second; + } + } + return nullptr; + } + std::vector<Filesystem::const_ref> get_filesystems(void) const + { + std::vector<Filesystem::const_ref> ret; + for (const auto& p : filesystems) { + ret.push_back(p.second); + } + return ret; + } + + int parse_filesystem( + std::string_view ns_str, + 
Filesystem::const_ref *result + ) const; + + int parse_role( + std::string_view role_str, + mds_role_t *role, + std::ostream &ss) const; + + /** + * Return true if this pool is in use by any of the filesystems + */ + bool pool_in_use(int64_t poolid) const { + for (auto const &i : filesystems) { + if (i.second->mds_map.is_data_pool(poolid) + || i.second->mds_map.metadata_pool == poolid) { + return true; + } + } + return false; + } + + mds_gid_t find_replacement_for(mds_role_t mds, std::string_view name) const; + + void get_health(list<pair<health_status_t,std::string> >& summary, + list<pair<health_status_t,std::string> > *detail) const; + + void get_health_checks(health_check_map_t *checks) const; + + bool check_health(void); + + /** + * Assert that the FSMap, Filesystem, MDSMap, mds_info_t relations are + * all self-consistent. + */ + void sanity() const; + + void encode(bufferlist& bl, uint64_t features) const; + void decode(bufferlist::const_iterator& p); + void decode(bufferlist& bl) { + auto p = bl.cbegin(); + decode(p); + } + void sanitize(const std::function<bool(int64_t pool)>& pool_exists); + + void print(ostream& out) const; + void print_summary(Formatter *f, ostream *out) const; + + void dump(Formatter *f) const; + static void generate_test_instances(list<FSMap*>& ls); +}; +WRITE_CLASS_ENCODER_FEATURES(FSMap) + +inline ostream& operator<<(ostream& out, const FSMap& m) { + m.print_summary(NULL, &out); + return out; +} + +#endif diff --git a/src/mds/FSMapUser.cc b/src/mds/FSMapUser.cc new file mode 100644 index 00000000..47d5f19c --- /dev/null +++ b/src/mds/FSMapUser.cc @@ -0,0 +1,81 @@ +#include "FSMapUser.h" + +void FSMapUser::encode(bufferlist& bl, uint64_t features) const +{ + ENCODE_START(1, 1, bl); + encode(epoch, bl); + encode(legacy_client_fscid, bl); + std::vector<fs_info_t> fs_list; + for (auto p = filesystems.begin(); p != filesystems.end(); ++p) + fs_list.push_back(p->second); + encode(fs_list, bl, features); + ENCODE_FINISH(bl); +} + +void 
FSMapUser::decode(bufferlist::const_iterator& p) +{ + DECODE_START(1, p); + decode(epoch, p); + decode(legacy_client_fscid, p); + std::vector<fs_info_t> fs_list; + decode(fs_list, p); + filesystems.clear(); + for (auto p = fs_list.begin(); p != fs_list.end(); ++p) + filesystems[p->cid] = *p; + DECODE_FINISH(p); +} + +void FSMapUser::fs_info_t::encode(bufferlist& bl, uint64_t features) const +{ + ENCODE_START(1, 1, bl); + encode(cid, bl); + encode(name, bl); + ENCODE_FINISH(bl); +} + +void FSMapUser::fs_info_t::decode(bufferlist::const_iterator& p) +{ + DECODE_START(1, p); + decode(cid, p); + decode(name, p); + DECODE_FINISH(p); +} + +void FSMapUser::generate_test_instances(list<FSMapUser*>& ls) +{ + FSMapUser *m = new FSMapUser(); + m->epoch = 2; + m->legacy_client_fscid = 1; + m->filesystems[1].cid = 1; + m->filesystems[2].name = "cephfs2"; + m->filesystems[2].cid = 2; + m->filesystems[1].name = "cephfs1"; + ls.push_back(m); +} + + +void FSMapUser::print(ostream& out) const +{ + out << "e" << epoch << std::endl; + out << "legacy_client_fscid: " << legacy_client_fscid << std::endl; + for (auto &p : filesystems) + out << " id " << p.second.cid << " name " << p.second.name << std::endl; +} + +void FSMapUser::print_summary(Formatter *f, ostream *out) +{ + map<mds_role_t,string> by_rank; + map<string,int> by_state; + + if (f) { + f->dump_unsigned("epoch", get_epoch()); + for (auto &p : filesystems) { + f->dump_unsigned("id", p.second.cid); + f->dump_string("name", p.second.name); + } + } else { + *out << "e" << get_epoch() << ":"; + for (auto &p : filesystems) + *out << " " << p.second.name << "(" << p.second.cid << ")"; + } +} diff --git a/src/mds/FSMapUser.h b/src/mds/FSMapUser.h new file mode 100644 index 00000000..23af8473 --- /dev/null +++ b/src/mds/FSMapUser.h @@ -0,0 +1,65 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil 
<sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + */ + +#ifndef CEPH_FSMAPCOMPACT_H +#define CEPH_FSMAPCOMPACT_H + +#include <map> +#include <string> +#include <string_view> + +#include "mds/mdstypes.h" + +class FSMapUser { +public: + struct fs_info_t { + fs_cluster_id_t cid; + std::string name; + fs_info_t() : cid(FS_CLUSTER_ID_NONE) {} + void encode(bufferlist& bl, uint64_t features) const; + void decode(bufferlist::const_iterator &bl); + }; + + epoch_t epoch; + fs_cluster_id_t legacy_client_fscid; + std::map<fs_cluster_id_t, fs_info_t> filesystems; + + FSMapUser() + : epoch(0), legacy_client_fscid(FS_CLUSTER_ID_NONE) { } + + epoch_t get_epoch() const { return epoch; } + + fs_cluster_id_t get_fs_cid(std::string_view name) const { + for (auto &p : filesystems) { + if (p.second.name == name) + return p.first; + } + return FS_CLUSTER_ID_NONE; + } + + void encode(bufferlist& bl, uint64_t features) const; + void decode(bufferlist::const_iterator& bl); + + void print(ostream& out) const; + void print_summary(Formatter *f, ostream *out); + + static void generate_test_instances(list<FSMapUser*>& ls); +}; +WRITE_CLASS_ENCODER_FEATURES(FSMapUser::fs_info_t) +WRITE_CLASS_ENCODER_FEATURES(FSMapUser) + +inline ostream& operator<<(ostream& out, FSMapUser& m) { + m.print_summary(NULL, &out); + return out; +} +#endif diff --git a/src/mds/InoTable.cc b/src/mds/InoTable.cc new file mode 100644 index 00000000..dfb6a41d --- /dev/null +++ b/src/mds/InoTable.cc @@ -0,0 +1,235 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser 
General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "InoTable.h" +#include "MDSRank.h" + +#include "include/types.h" + +#include "common/config.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mds +#undef dout_prefix +#define dout_prefix *_dout << "mds." << rank << "." << table_name << ": " + +void InoTable::reset_state() +{ + // use generic range. FIXME THIS IS CRAP + free.clear(); + //#ifdef __LP64__ + uint64_t start = (uint64_t)(rank+1) << 40; + uint64_t len = (uint64_t)1 << 40; + //#else + //# warning this looks like a 32-bit system, using small inode numbers. + // uint64_t start = (uint64_t)(mds->get_nodeid()+1) << 25; + // uint64_t end = ((uint64_t)(mds->get_nodeid()+2) << 25) - 1; + //#endif + free.insert(start, len); + + projected_free = free; +} + +inodeno_t InoTable::project_alloc_id(inodeno_t id) +{ + dout(10) << "project_alloc_id " << id << " to " << projected_free << "/" << free << dendl; + ceph_assert(is_active()); + if (!id) + id = projected_free.range_start(); + projected_free.erase(id); + ++projected_version; + return id; +} +void InoTable::apply_alloc_id(inodeno_t id) +{ + dout(10) << "apply_alloc_id " << id << " to " << projected_free << "/" << free << dendl; + free.erase(id); + ++version; +} + +void InoTable::project_alloc_ids(interval_set<inodeno_t>& ids, int want) +{ + ceph_assert(is_active()); + while (want > 0) { + inodeno_t start = projected_free.range_start(); + inodeno_t end = projected_free.end_after(start); + inodeno_t num = end - start; + if (num > (inodeno_t)want) + num = want; + projected_free.erase(start, num); + ids.insert(start, num); + want -= num; + } + dout(10) << "project_alloc_ids " << ids << " to " << projected_free << "/" << free << dendl; + ++projected_version; +} +void InoTable::apply_alloc_ids(interval_set<inodeno_t>& ids) +{ + dout(10) << "apply_alloc_ids " << ids << " to " << projected_free << "/" << free << dendl; + 
free.subtract(ids); + ++version; +} + + +void InoTable::project_release_ids(interval_set<inodeno_t>& ids) +{ + dout(10) << "project_release_ids " << ids << " to " << projected_free << "/" << free << dendl; + projected_free.insert(ids); + ++projected_version; +} +void InoTable::apply_release_ids(interval_set<inodeno_t>& ids) +{ + dout(10) << "apply_release_ids " << ids << " to " << projected_free << "/" << free << dendl; + free.insert(ids); + ++version; +} + + +// + +void InoTable::replay_alloc_id(inodeno_t id) +{ + ceph_assert(mds); // Only usable in online mode + + dout(10) << "replay_alloc_id " << id << dendl; + if (free.contains(id)) { + free.erase(id); + projected_free.erase(id); + } else { + mds->clog->error() << "journal replay alloc " << id + << " not in free " << free; + } + projected_version = ++version; +} +void InoTable::replay_alloc_ids(interval_set<inodeno_t>& ids) +{ + ceph_assert(mds); // Only usable in online mode + + dout(10) << "replay_alloc_ids " << ids << dendl; + interval_set<inodeno_t> is; + is.intersection_of(free, ids); + if (!(is==ids)) { + mds->clog->error() << "journal replay alloc " << ids << ", only " + << is << " is in free " << free; + } + free.subtract(is); + projected_free.subtract(is); + + projected_version = ++version; +} +void InoTable::replay_release_ids(interval_set<inodeno_t>& ids) +{ + dout(10) << "replay_release_ids " << ids << dendl; + free.insert(ids); + projected_free.insert(ids); + projected_version = ++version; +} + + +void InoTable::replay_reset() +{ + dout(10) << "replay_reset " << free << dendl; + skip_inos(inodeno_t(10000000)); // a lot! 
+ projected_free = free; + projected_version = ++version; +} + + +void InoTable::skip_inos(inodeno_t i) +{ + dout(10) << "skip_inos was " << free << dendl; + inodeno_t first = free.range_start(); + interval_set<inodeno_t> s; + s.insert(first, i); + s.intersection_of(free); + free.subtract(s); + projected_free = free; + projected_version = ++version; + dout(10) << "skip_inos now " << free << dendl; +} + +void InoTable::dump(Formatter *f) const +{ + f->open_object_section("inotable"); + + f->open_array_section("projected_free"); + for (interval_set<inodeno_t>::const_iterator i = projected_free.begin(); i != projected_free.end(); ++i) { + f->open_object_section("range"); + f->dump_int("start", (*i).first); + f->dump_int("len", (*i).second); + f->close_section(); + } + f->close_section(); + + f->open_array_section("free"); + for (interval_set<inodeno_t>::const_iterator i = free.begin(); i != free.end(); ++i) { + f->open_object_section("range"); + f->dump_int("start", (*i).first); + f->dump_int("len", (*i).second); + f->close_section(); + } + f->close_section(); + + f->close_section(); +} + + +void InoTable::generate_test_instances(list<InoTable*>& ls) +{ + ls.push_back(new InoTable()); +} + + +bool InoTable::is_marked_free(inodeno_t id) const +{ + return free.contains(id) || projected_free.contains(id); +} + +bool InoTable::intersects_free( + const interval_set<inodeno_t> &other, + interval_set<inodeno_t> *intersection) +{ + interval_set<inodeno_t> i; + i.intersection_of(free, other); + if (intersection != nullptr) { + *intersection = i; + } + return !(i.empty()); +} + +bool InoTable::repair(inodeno_t id) +{ + if (projected_version != version) { + // Can't do the repair while other things are in flight + return false; + } + + ceph_assert(is_marked_free(id)); + dout(10) << "repair: before status. 
ino = " << id << " pver =" << projected_version << " ver= " << version << dendl; + free.erase(id); + projected_free.erase(id); + projected_version = ++version; + dout(10) << "repair: after status. ino = " << id << " pver =" << projected_version << " ver= " << version << dendl; + return true; +} + +bool InoTable::force_consume_to(inodeno_t ino) +{ + inodeno_t first = free.range_start(); + if (first > ino) + return false; + + skip_inos(inodeno_t(ino + 1 - first)); + return true; +} diff --git a/src/mds/InoTable.h b/src/mds/InoTable.h new file mode 100644 index 00000000..0e26e1e9 --- /dev/null +++ b/src/mds/InoTable.h @@ -0,0 +1,103 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#ifndef CEPH_INOTABLE_H +#define CEPH_INOTABLE_H + +#include "MDSTable.h" +#include "include/interval_set.h" + +class MDSRank; + +class InoTable : public MDSTable { + interval_set<inodeno_t> free; // unused ids + interval_set<inodeno_t> projected_free; + + public: + explicit InoTable(MDSRank *m) : MDSTable(m, "inotable", true) { } + + inodeno_t project_alloc_id(inodeno_t id=0); + void apply_alloc_id(inodeno_t id); + + void project_alloc_ids(interval_set<inodeno_t>& inos, int want); + void apply_alloc_ids(interval_set<inodeno_t>& inos); + + void project_release_ids(interval_set<inodeno_t>& inos); + void apply_release_ids(interval_set<inodeno_t>& inos); + + void replay_alloc_id(inodeno_t ino); + void replay_alloc_ids(interval_set<inodeno_t>& inos); + void replay_release_ids(interval_set<inodeno_t>& inos); + void replay_reset(); + bool repair(inodeno_t id); + bool is_marked_free(inodeno_t id) const; + bool intersects_free( + const interval_set<inodeno_t> &other, + interval_set<inodeno_t> *intersection); + + void reset_state() override; + void encode_state(bufferlist& bl) const override { + ENCODE_START(2, 2, bl); + encode(free, bl); + ENCODE_FINISH(bl); + } + void decode_state(bufferlist::const_iterator& bl) override { + DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl); + decode(free, bl); + projected_free = free; + DECODE_FINISH(bl); + } + + // To permit enc/decoding in isolation in dencoder + InoTable() : MDSTable(NULL, "inotable", true) {} + void encode(bufferlist& bl) const { + encode_state(bl); + } + void decode(bufferlist::const_iterator& bl) { + decode_state(bl); + } + void dump(Formatter *f) const; + static void generate_test_instances(list<InoTable*>& ls); + + void skip_inos(inodeno_t i); + + /** + * If the specified inode is marked as free, mark it as used. + * For use in tools, not normal operations. 
+ * + * @returns true if the inode was previously marked as free + */ + bool force_consume(inodeno_t ino) + { + if (free.contains(ino)) { + free.erase(ino); + return true; + } else { + return false; + } + } + + /** + * If this ino is in this rank's range, consume up to and including it. + * For use in tools, when we know the max ino in use and want to make + * sure we're only allocating new inodes from above it. + * + * @return true if the table was modified + */ + bool force_consume_to(inodeno_t ino); +}; +WRITE_CLASS_ENCODER(InoTable) + +#endif diff --git a/src/mds/JournalPointer.cc b/src/mds/JournalPointer.cc new file mode 100644 index 00000000..797798aa --- /dev/null +++ b/src/mds/JournalPointer.cc @@ -0,0 +1,122 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#include "common/debug.h" +#include "common/errno.h" +#include "common/Cond.h" +#include "osdc/Objecter.h" +#include "mds/mdstypes.h" +#include "msg/Messenger.h" + +#include "mds/JournalPointer.h" + + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_journaler +#undef dout_prefix +#define dout_prefix *_dout << objecter->messenger->get_myname() << ".journalpointer " + + +std::string JournalPointer::get_object_id() const +{ + inodeno_t const pointer_ino = MDS_INO_LOG_POINTER_OFFSET + node_id; + char buf[32]; + snprintf(buf, sizeof(buf), "%llx.%08llx", (long long unsigned)pointer_ino, (long long unsigned)0); + + return std::string(buf); +} + + +/** + * Blocking read of JournalPointer for this MDS + */ +int JournalPointer::load(Objecter *objecter) +{ + ceph_assert(objecter != NULL); + + // Blocking read of data + std::string const object_id = get_object_id(); + dout(4) << "Reading journal pointer '" << object_id << "'" << dendl; + bufferlist data; + C_SaferCond waiter; + objecter->read_full(object_t(object_id), object_locator_t(pool_id), + CEPH_NOSNAP, &data, 0, &waiter); + int r = waiter.wait(); + + // Construct JournalPointer result, null or decoded data + if (r == 0) { + auto q = data.cbegin(); + try { + decode(q); + } catch (const buffer::error &e) { + return -EINVAL; + } + } else { + dout(1) << "Journal pointer '" << object_id << "' read failed: " << cpp_strerror(r) << dendl; + } + return r; +} + + +/** + * Blocking write of JournalPointer for this MDS + * + * @return objecter write op status code + */ +int JournalPointer::save(Objecter *objecter) const +{ + ceph_assert(objecter != NULL); + // It is not valid to persist a null pointer + ceph_assert(!is_null()); + + // Serialize JournalPointer object + bufferlist data; + encode(data); + + // Write to RADOS and wait for durability + std::string const object_id = get_object_id(); + dout(4) << "Writing pointer object '" << object_id << "': 0x" + << std::hex << front << ":0x" << back 
<< std::dec << dendl; + + C_SaferCond waiter; + objecter->write_full(object_t(object_id), object_locator_t(pool_id), + SnapContext(), data, + ceph::real_clock::now(), 0, + &waiter); + int write_result = waiter.wait(); + if (write_result < 0) { + derr << "Error writing pointer object '" << object_id << "': " << cpp_strerror(write_result) << dendl; + } + return write_result; +} + + +/** + * Non-blocking variant of save() that assumes objecter lock already held by + * caller + */ +void JournalPointer::save(Objecter *objecter, Context *completion) const +{ + ceph_assert(objecter != NULL); + + bufferlist data; + encode(data); + + objecter->write_full(object_t(get_object_id()), object_locator_t(pool_id), + SnapContext(), data, + ceph::real_clock::now(), 0, + completion); +} + diff --git a/src/mds/JournalPointer.h b/src/mds/JournalPointer.h new file mode 100644 index 00000000..0f423266 --- /dev/null +++ b/src/mds/JournalPointer.h @@ -0,0 +1,87 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#ifndef JOURNAL_POINTER_H +#define JOURNAL_POINTER_H + +#include "include/encoding.h" +#include "mdstypes.h" + +class Objecter; +class Mutex; + +// This always lives in the same location for a given MDS +// instance, it tells the daemon where to look for the journal. 
+class JournalPointer { + // MDS rank + int node_id; + // Metadata pool ID + int64_t pool_id; + + std::string get_object_id() const; + + public: + // The currently active journal + inodeno_t front; + // The backup journal, if any (may be 0) + inodeno_t back; + + void encode(bufferlist &bl) const { + ENCODE_START(1, 1, bl); + encode(front, bl); + encode(back, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator &bl) { + DECODE_START(1, bl); + decode(front, bl); + decode(back, bl); + DECODE_FINISH(bl); + } + + JournalPointer(int node_id_, int64_t pool_id_) : node_id(node_id_), pool_id(pool_id_), + front(0), back(0) {} + + JournalPointer() : node_id(-1), pool_id(-1), front(0), back(0) {} + + int load(Objecter *objecter); + int save(Objecter *objecter) const; + void save(Objecter *objecter, Context *completion) const; + + bool is_null() const { + return front == 0 && back == 0; + } + + void dump(Formatter *f) const { + f->open_object_section("journal_pointer"); + { + f->dump_unsigned("front", front); + f->dump_unsigned("back", back); + } + f->close_section(); // journal_header + } + + static void generate_test_instances(std::list<JournalPointer*> &ls) + { + ls.push_back(new JournalPointer()); + ls.push_back(new JournalPointer()); + ls.back()->front = 0xdeadbeef; + ls.back()->back = 0xfeedbead; + } +}; +WRITE_CLASS_ENCODER(JournalPointer) + +#endif // JOURNAL_POINTER_H diff --git a/src/mds/LocalLock.h b/src/mds/LocalLock.h new file mode 100644 index 00000000..d405a6b3 --- /dev/null +++ b/src/mds/LocalLock.h @@ -0,0 +1,65 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#ifndef CEPH_LOCALLOCK_H +#define CEPH_LOCALLOCK_H + +#include "SimpleLock.h" + +class LocalLock : public SimpleLock { +public: + client_t last_wrlock_client; + + LocalLock(MDSCacheObject *o, LockType *t) : + SimpleLock(o, t) { + set_state(LOCK_LOCK); // always. + } + + bool is_locallock() const override { + return true; + } + + bool can_xlock_local() const { + return !is_wrlocked() && (get_xlock_by() == MutationRef()); + } + + bool can_wrlock() const { + return !is_xlocked(); + } + void get_wrlock(client_t client) { + ceph_assert(can_wrlock()); + SimpleLock::get_wrlock(); + last_wrlock_client = client; + } + void put_wrlock() { + SimpleLock::put_wrlock(); + if (get_num_wrlocks() == 0) + last_wrlock_client = client_t(); + } + client_t get_last_wrlock_client() const { + return last_wrlock_client; + } + + void print(ostream& out) const override { + out << "("; + _print(out); + if (last_wrlock_client >= 0) + out << " last_client=" << last_wrlock_client; + out << ")"; + } +}; + + +#endif diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc new file mode 100644 index 00000000..284cb254 --- /dev/null +++ b/src/mds/Locker.cc @@ -0,0 +1,5479 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#include <string_view> + +#include "MDSRank.h" +#include "MDCache.h" +#include "Locker.h" +#include "MDBalancer.h" +#include "Migrator.h" +#include "CInode.h" +#include "CDir.h" +#include "CDentry.h" +#include "Mutation.h" +#include "MDSContext.h" + +#include "MDLog.h" +#include "MDSMap.h" + +#include "events/EUpdate.h" +#include "events/EOpen.h" + +#include "msg/Messenger.h" +#include "osdc/Objecter.h" + +#include "messages/MInodeFileCaps.h" +#include "messages/MLock.h" +#include "messages/MClientLease.h" +#include "messages/MClientReply.h" +#include "messages/MClientCaps.h" +#include "messages/MClientCapRelease.h" + +#include "messages/MMDSSlaveRequest.h" + +#include <errno.h> + +#include "common/config.h" + + +#define dout_subsys ceph_subsys_mds +#undef dout_prefix +#define dout_context g_ceph_context +#define dout_prefix _prefix(_dout, mds) +static ostream& _prefix(std::ostream *_dout, MDSRank *mds) { + return *_dout << "mds." << mds->get_nodeid() << ".locker "; +} + + +class LockerContext : public MDSContext { +protected: + Locker *locker; + MDSRank *get_mds() override + { + return locker->mds; + } + +public: + explicit LockerContext(Locker *locker_) : locker(locker_) { + ceph_assert(locker != NULL); + } +}; + +class LockerLogContext : public MDSLogContextBase { +protected: + Locker *locker; + MDSRank *get_mds() override + { + return locker->mds; + } + +public: + explicit LockerLogContext(Locker *locker_) : locker(locker_) { + ceph_assert(locker != NULL); + } +}; + +Locker::Locker(MDSRank *m, MDCache *c) : + mds(m), mdcache(c), need_snapflush_inodes(member_offset(CInode, item_caps)) {} + + +void Locker::dispatch(const Message::const_ref &m) +{ + + switch (m->get_type()) { + // inter-mds locking + case MSG_MDS_LOCK: + handle_lock(MLock::msgref_cast(m)); + break; + // inter-mds caps + case MSG_MDS_INODEFILECAPS: + handle_inode_file_caps(MInodeFileCaps::msgref_cast(m)); + break; + // client sync + case CEPH_MSG_CLIENT_CAPS: + 
handle_client_caps(MClientCaps::msgref_cast(m)); + break; + case CEPH_MSG_CLIENT_CAPRELEASE: + handle_client_cap_release(MClientCapRelease::msgref_cast(m)); + break; + case CEPH_MSG_CLIENT_LEASE: + handle_client_lease(MClientLease::msgref_cast(m)); + break; + default: + derr << "locker unknown message " << m->get_type() << dendl; + ceph_abort_msg("locker unknown message"); + } +} + +void Locker::tick() +{ + scatter_tick(); + caps_tick(); +} + +/* + * locks vs rejoin + * + * + * + */ + +void Locker::send_lock_message(SimpleLock *lock, int msg) +{ + for (const auto &it : lock->get_parent()->get_replicas()) { + if (mds->is_cluster_degraded() && + mds->mdsmap->get_state(it.first) < MDSMap::STATE_REJOIN) + continue; + auto m = MLock::create(lock, msg, mds->get_nodeid()); + mds->send_message_mds(m, it.first); + } +} + +void Locker::send_lock_message(SimpleLock *lock, int msg, const bufferlist &data) +{ + for (const auto &it : lock->get_parent()->get_replicas()) { + if (mds->is_cluster_degraded() && + mds->mdsmap->get_state(it.first) < MDSMap::STATE_REJOIN) + continue; + auto m = MLock::create(lock, msg, mds->get_nodeid()); + m->set_data(data); + mds->send_message_mds(m, it.first); + } +} + + + + +void Locker::include_snap_rdlocks(CInode *in, MutationImpl::LockOpVec& lov) +{ + // rdlock ancestor snaps + CInode *t = in; + while (t->get_projected_parent_dn()) { + t = t->get_projected_parent_dn()->get_dir()->get_inode(); + lov.add_rdlock(&t->snaplock); + } + lov.add_rdlock(&in->snaplock); +} + +void Locker::include_snap_rdlocks_wlayout(CInode *in, MutationImpl::LockOpVec& lov, + file_layout_t **layout) +{ + //rdlock ancestor snaps + CInode *t = in; + lov.add_rdlock(&in->snaplock); + lov.add_rdlock(&in->policylock); + bool found_layout = false; + while (t) { + lov.add_rdlock(&t->snaplock); + if (!found_layout) { + lov.add_rdlock(&t->policylock); + if (t->get_projected_inode()->has_layout()) { + *layout = &t->get_projected_inode()->layout; + found_layout = true; + } + } + if 
(t->get_projected_parent_dn() && + t->get_projected_parent_dn()->get_dir()) + t = t->get_projected_parent_dn()->get_dir()->get_inode(); + else t = NULL; + } +} + +struct MarkEventOnDestruct { + MDRequestRef& mdr; + std::string_view message; + bool mark_event; + MarkEventOnDestruct(MDRequestRef& _mdr, std::string_view _message) : + mdr(_mdr), + message(_message), + mark_event(true) {} + ~MarkEventOnDestruct() { + if (mark_event) + mdr->mark_event(message); + } +}; + +/* If this function returns false, the mdr has been placed + * on the appropriate wait list */ +bool Locker::acquire_locks(MDRequestRef& mdr, + MutationImpl::LockOpVec& lov, + CInode *auth_pin_freeze, + bool auth_pin_nonblock) +{ + if (mdr->done_locking && + !mdr->is_slave()) { // not on slaves! master requests locks piecemeal. + dout(10) << "acquire_locks " << *mdr << " - done locking" << dendl; + return true; // at least we had better be! + } + dout(10) << "acquire_locks " << *mdr << dendl; + + MarkEventOnDestruct marker(mdr, "failed to acquire_locks"); + + client_t client = mdr->get_client(); + + set<MDSCacheObject*> mustpin; // items to authpin + + // xlocks + for (int i = 0, size = lov.size(); i < size; ++i) { + auto& p = lov[i]; + SimpleLock *lock = p.lock; + MDSCacheObject *object = lock->get_parent(); + + if (p.is_xlock()) { + if ((lock->get_type() == CEPH_LOCK_ISNAP || + lock->get_type() == CEPH_LOCK_IPOLICY) && + mds->is_cluster_degraded() && + mdr->is_master() && + !mdr->is_queued_for_replay()) { + // waiting for recovering mds, to guarantee replayed requests and mksnap/setlayout + // get processed in proper order. + bool wait = false; + if (object->is_auth()) { + if (!mdr->locks.count(lock)) { + set<mds_rank_t> ls; + object->list_replicas(ls); + for (auto m : ls) { + if (mds->mdsmap->get_state(m) < MDSMap::STATE_ACTIVE) { + wait = true; + break; + } + } + } + } else { + // if the lock is the latest locked one, it's possible that slave mds got the lock + // while there are recovering mds. 
+ if (!mdr->locks.count(lock) || lock == *mdr->locks.rbegin()) + wait = true; + } + if (wait) { + dout(10) << " must xlock " << *lock << " " << *object + << ", waiting for cluster recovered" << dendl; + mds->locker->drop_locks(mdr.get(), NULL); + mdr->drop_local_auth_pins(); + mds->wait_for_cluster_recovered(new C_MDS_RetryRequest(mdcache, mdr)); + return false; + } + } + + dout(20) << " must xlock " << *lock << " " << *object << dendl; + + mustpin.insert(object); + + // augment xlock with a versionlock? + if (lock->get_type() == CEPH_LOCK_DN) { + CDentry *dn = static_cast<CDentry*>(object); + if (!dn->is_auth()) + continue; + if (mdr->is_master()) { + // master. wrlock versionlock so we can pipeline dentry updates to journal. + lov.add_wrlock(&dn->versionlock); + } else { + // slave. exclusively lock the dentry version (i.e. block other journal updates). + // this makes rollback safe. + lov.add_xlock(&dn->versionlock); + } + } + if (lock->get_type() > CEPH_LOCK_IVERSION) { + // inode version lock? + CInode *in = static_cast<CInode*>(object); + if (!in->is_auth()) + continue; + if (mdr->is_master()) { + // master. wrlock versionlock so we can pipeline inode updates to journal. + lov.add_wrlock(&in->versionlock); + } else { + // slave. exclusively lock the inode version (i.e. block other journal updates). + // this makes rollback safe. + lov.add_xlock(&in->versionlock); + } + } + } else if (p.is_wrlock()) { + dout(20) << " must wrlock " << *lock << " " << *object << dendl; + if (object->is_auth()) { + mustpin.insert(object); + } else if (!object->is_auth() && + !lock->can_wrlock(client) && // we might have to request a scatter + !mdr->is_slave()) { // if we are slave (remote_wrlock), the master already authpinned + dout(15) << " will also auth_pin " << *object + << " in case we need to request a scatter" << dendl; + mustpin.insert(object); + } + } else if (p.is_remote_wrlock()) { + dout(20) << " must remote_wrlock on mds." 
<< p.wrlock_target << " " + << *lock << " " << *object << dendl; + mustpin.insert(object); + } else if (p.is_rdlock()) { + + dout(20) << " must rdlock " << *lock << " " << *object << dendl; + if (object->is_auth()) { + mustpin.insert(object); + } else if (!object->is_auth() && + !lock->can_rdlock(client)) { // we might have to request an rdlock + dout(15) << " will also auth_pin " << *object + << " in case we need to request a rdlock" << dendl; + mustpin.insert(object); + } + } else { + ceph_assert(0 == "locker unknown lock operation"); + } + } + + lov.sort_and_merge(); + + // AUTH PINS + map<mds_rank_t, set<MDSCacheObject*> > mustpin_remote; // mds -> (object set) + + // can i auth pin them all now? + marker.message = "failed to authpin local pins"; + for (const auto &p : mustpin) { + MDSCacheObject *object = p; + + dout(10) << " must authpin " << *object << dendl; + + if (mdr->is_auth_pinned(object)) { + if (object != (MDSCacheObject*)auth_pin_freeze) + continue; + if (mdr->more()->is_remote_frozen_authpin) { + if (mdr->more()->rename_inode == auth_pin_freeze) + continue; + // unfreeze auth pin for the wrong inode + mustpin_remote[mdr->more()->rename_inode->authority().first].size(); + } + } + + if (!object->is_auth()) { + if (!mdr->locks.empty()) + drop_locks(mdr.get()); + if (object->is_ambiguous_auth()) { + // wait + marker.message = "waiting for single auth, object is being migrated"; + dout(10) << " ambiguous auth, waiting to authpin " << *object << dendl; + object->add_waiter(MDSCacheObject::WAIT_SINGLEAUTH, new C_MDS_RetryRequest(mdcache, mdr)); + mdr->drop_local_auth_pins(); + return false; + } + mustpin_remote[object->authority().first].insert(object); + continue; + } + int err = 0; + if (!object->can_auth_pin(&err)) { + // wait + drop_locks(mdr.get()); + mdr->drop_local_auth_pins(); + if (auth_pin_nonblock) { + dout(10) << " can't auth_pin (freezing?) 
" << *object << ", nonblocking" << dendl; + mdr->aborted = true; + return false; + } + if (err == MDSCacheObject::ERR_EXPORTING_TREE) { + marker.message = "failed to authpin, subtree is being exported"; + } else if (err == MDSCacheObject::ERR_FRAGMENTING_DIR) { + marker.message = "failed to authpin, dir is being fragmented"; + } else if (err == MDSCacheObject::ERR_EXPORTING_INODE) { + marker.message = "failed to authpin, inode is being exported"; + } + dout(10) << " can't auth_pin (freezing?), waiting to authpin " << *object << dendl; + object->add_waiter(MDSCacheObject::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr)); + + if (!mdr->remote_auth_pins.empty()) + notify_freeze_waiter(object); + + return false; + } + } + + // ok, grab local auth pins + for (const auto& p : mustpin) { + MDSCacheObject *object = p; + if (mdr->is_auth_pinned(object)) { + dout(10) << " already auth_pinned " << *object << dendl; + } else if (object->is_auth()) { + dout(10) << " auth_pinning " << *object << dendl; + mdr->auth_pin(object); + } + } + + // request remote auth_pins + if (!mustpin_remote.empty()) { + marker.message = "requesting remote authpins"; + for (const auto& p : mdr->remote_auth_pins) { + if (mustpin.count(p.first)) { + ceph_assert(p.second == p.first->authority().first); + map<mds_rank_t, set<MDSCacheObject*> >::iterator q = mustpin_remote.find(p.second); + if (q != mustpin_remote.end()) + q->second.insert(p.first); + } + } + for (map<mds_rank_t, set<MDSCacheObject*> >::iterator p = mustpin_remote.begin(); + p != mustpin_remote.end(); + ++p) { + dout(10) << "requesting remote auth_pins from mds." << p->first << dendl; + + // wait for active auth + if (mds->is_cluster_degraded() && + !mds->mdsmap->is_clientreplay_or_active_or_stopping(p->first)) { + dout(10) << " mds." 
<< p->first << " is not active" << dendl; + if (mdr->more()->waiting_on_slave.empty()) + mds->wait_for_active_peer(p->first, new C_MDS_RetryRequest(mdcache, mdr)); + return false; + } + + auto req = MMDSSlaveRequest::create(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_AUTHPIN); + for (set<MDSCacheObject*>::iterator q = p->second.begin(); + q != p->second.end(); + ++q) { + dout(10) << " req remote auth_pin of " << **q << dendl; + MDSCacheObjectInfo info; + (*q)->set_object_info(info); + req->get_authpins().push_back(info); + if (*q == auth_pin_freeze) + (*q)->set_object_info(req->get_authpin_freeze()); + mdr->pin(*q); + } + if (auth_pin_nonblock) + req->mark_nonblock(); + mds->send_message_mds(req, p->first); + + // put in waiting list + ceph_assert(mdr->more()->waiting_on_slave.count(p->first) == 0); + mdr->more()->waiting_on_slave.insert(p->first); + } + return false; + } + + // caps i'll need to issue + set<CInode*> issue_set; + bool result = false; + + // acquire locks. + // make sure they match currently acquired locks. + auto existing = mdr->locks.begin(); + for (const auto& p : lov) { + bool need_wrlock = p.is_wrlock(); + bool need_remote_wrlock = p.is_remote_wrlock(); + + // already locked? + if (existing != mdr->locks.end() && existing->lock == p.lock) { + // right kind? + auto it = existing++; + auto have = *it; // don't reference + + if (have.is_xlock() && p.is_xlock()) { + dout(10) << " already xlocked " << *have.lock << " " << *have.lock->get_parent() << dendl; + continue; + } + + if (have.is_remote_wrlock() && + (!need_remote_wrlock || have.wrlock_target != p.wrlock_target)) { + dout(10) << " unlocking remote_wrlock on wrong mds." 
<< have.wrlock_target + << " " << *have.lock << " " << *have.lock->get_parent() << dendl; + remote_wrlock_finish(it, mdr.get()); + have.clear_remote_wrlock(); + } + + if (need_wrlock || need_remote_wrlock) { + if (need_wrlock == have.is_wrlock() && + need_remote_wrlock == have.is_remote_wrlock()) { + if (need_wrlock) + dout(10) << " already wrlocked " << *have.lock << " " << *have.lock->get_parent() << dendl; + if (need_remote_wrlock) + dout(10) << " already remote_wrlocked " << *have.lock << " " << *have.lock->get_parent() << dendl; + continue; + } + + if (have.is_wrlock()) { + if (!need_wrlock) + dout(10) << " unlocking extra " << *have.lock << " " << *have.lock->get_parent() << dendl; + else if (need_remote_wrlock) // acquire remote_wrlock first + dout(10) << " unlocking out-of-order " << *have.lock << " " << *have.lock->get_parent() << dendl; + bool need_issue = false; + wrlock_finish(it, mdr.get(), &need_issue); + if (need_issue) + issue_set.insert(static_cast<CInode*>(have.lock->get_parent())); + } + } else if (have.is_rdlock() && p.is_rdlock()) { + dout(10) << " already rdlocked " << *have.lock << " " << *have.lock->get_parent() << dendl; + continue; + } + } + + // hose any stray locks + while (existing != mdr->locks.end()) { + auto it = existing++; + auto stray = *it; // don't reference + dout(10) << " unlocking out-of-order " << *stray.lock << " " << *stray.lock->get_parent() << dendl; + bool need_issue = false; + if (stray.is_xlock()) { + xlock_finish(it, mdr.get(), &need_issue); + } else if (stray.is_rdlock()) { + rdlock_finish(it, mdr.get(), &need_issue); + } else { + // may have acquired both wrlock and remore wrlock + if (stray.is_wrlock()) + wrlock_finish(it, mdr.get(), &need_issue); + if (stray.is_remote_wrlock()) + remote_wrlock_finish(it, mdr.get()); + } + if (need_issue) + issue_set.insert(static_cast<CInode*>(stray.lock->get_parent())); + } + + // lock + if (mdr->locking && p.lock != mdr->locking) { + cancel_locking(mdr.get(), &issue_set); + } + 
if (p.is_xlock()) { + marker.message = "failed to xlock, waiting"; + if (!xlock_start(p.lock, mdr)) + goto out; + dout(10) << " got xlock on " << *p.lock << " " << *p.lock->get_parent() << dendl; + } else if (need_wrlock || need_remote_wrlock) { + if (need_remote_wrlock && !mdr->is_remote_wrlocked(p)) { + marker.message = "waiting for remote wrlocks"; + remote_wrlock_start(p, p.wrlock_target, mdr); + goto out; + } + if (need_wrlock) { + marker.message = "failed to wrlock, waiting"; + if (need_remote_wrlock && !p.lock->can_wrlock(mdr->get_client())) { + marker.message = "failed to wrlock, dropping remote wrlock and waiting"; + // can't take the wrlock because the scatter lock is gathering. need to + // release the remote wrlock, so that the gathering process can finish. + auto it = mdr->locks.end(); + ++it; + remote_wrlock_finish(it, mdr.get()); + remote_wrlock_start(p, p.wrlock_target, mdr); + goto out; + } + // nowait if we have already gotten remote wrlock + if (!wrlock_start(p, mdr, need_remote_wrlock)) + goto out; + dout(10) << " got wrlock on " << *p.lock << " " << *p.lock->get_parent() << dendl; + } + } else { + ceph_assert(mdr->is_master()); + if (p.lock->needs_recover()) { + if (mds->is_cluster_degraded()) { + if (!mdr->is_queued_for_replay()) { + // see comments in SimpleLock::set_state_rejoin() and + // ScatterLock::encode_state_for_rejoin() + drop_locks(mdr.get()); + mds->wait_for_cluster_recovered(new C_MDS_RetryRequest(mdcache, mdr)); + dout(10) << " rejoin recovering " << *p.lock << " " << *p.lock->get_parent() + << ", waiting for cluster recovered" << dendl; + marker.message = "rejoin recovering lock, waiting for cluster recovered"; + return false; + } + } else { + p.lock->clear_need_recover(); + } + } + + marker.message = "failed to rdlock, waiting"; + if (!rdlock_start(p, mdr)) + goto out; + dout(10) << " got rdlock on " << *p.lock << " " << *p.lock->get_parent() << dendl; + } + } + + // any extra unneeded locks? 
+ while (existing != mdr->locks.end()) { + auto it = existing++; + auto stray = *it; + dout(10) << " unlocking extra " << *stray.lock << " " << *stray.lock->get_parent() << dendl; + bool need_issue = false; + if (stray.is_xlock()) { + xlock_finish(it, mdr.get(), &need_issue); + } else if (stray.is_rdlock()) { + rdlock_finish(it, mdr.get(), &need_issue); + } else { + // may have acquired both wrlock and remore wrlock + if (stray.is_wrlock()) + wrlock_finish(it, mdr.get(), &need_issue); + if (stray.is_remote_wrlock()) + remote_wrlock_finish(it, mdr.get()); + } + if (need_issue) + issue_set.insert(static_cast<CInode*>(stray.lock->get_parent())); + } + + mdr->done_locking = true; + mdr->set_mds_stamp(ceph_clock_now()); + result = true; + marker.message = "acquired locks"; + + out: + issue_caps_set(issue_set); + return result; +} + +void Locker::notify_freeze_waiter(MDSCacheObject *o) +{ + CDir *dir = NULL; + if (CInode *in = dynamic_cast<CInode*>(o)) { + if (!in->is_root()) + dir = in->get_parent_dir(); + } else if (CDentry *dn = dynamic_cast<CDentry*>(o)) { + dir = dn->get_dir(); + } else { + dir = dynamic_cast<CDir*>(o); + ceph_assert(dir); + } + if (dir) { + if (dir->is_freezing_dir()) + mdcache->fragment_freeze_inc_num_waiters(dir); + if (dir->is_freezing_tree()) { + while (!dir->is_freezing_tree_root()) + dir = dir->get_parent_dir(); + mdcache->migrator->export_freeze_inc_num_waiters(dir); + } + } +} + +void Locker::set_xlocks_done(MutationImpl *mut, bool skip_dentry) +{ + for (const auto &p : mut->locks) { + if (!p.is_xlock()) + continue; + MDSCacheObject *obj = p.lock->get_parent(); + ceph_assert(obj->is_auth()); + if (skip_dentry && + (p.lock->get_type() == CEPH_LOCK_DN || p.lock->get_type() == CEPH_LOCK_DVERSION)) + continue; + dout(10) << "set_xlocks_done on " << *p.lock << " " << *obj << dendl; + p.lock->set_xlock_done(); + } +} + +void Locker::_drop_locks(MutationImpl *mut, set<CInode*> *pneed_issue, + bool drop_rdlocks) +{ + set<mds_rank_t> slaves; + + for 
(auto it = mut->locks.begin(); it != mut->locks.end(); ) { + SimpleLock *lock = it->lock; + MDSCacheObject *obj = lock->get_parent(); + + if (it->is_xlock()) { + if (obj->is_auth()) { + bool ni = false; + xlock_finish(it++, mut, &ni); + if (ni) + pneed_issue->insert(static_cast<CInode*>(obj)); + } else { + ceph_assert(lock->get_sm()->can_remote_xlock); + slaves.insert(obj->authority().first); + lock->put_xlock(); + mut->locks.erase(it++); + } + } else if (it->is_wrlock() || it->is_remote_wrlock()) { + if (it->is_remote_wrlock()) { + slaves.insert(it->wrlock_target); + it->clear_remote_wrlock(); + } + if (it->is_wrlock()) { + bool ni = false; + wrlock_finish(it++, mut, &ni); + if (ni) + pneed_issue->insert(static_cast<CInode*>(obj)); + } else { + mut->locks.erase(it++); + } + } else if (drop_rdlocks && it->is_rdlock()) { + bool ni = false; + rdlock_finish(it++, mut, &ni); + if (ni) + pneed_issue->insert(static_cast<CInode*>(obj)); + } else { + ++it; + } + } + + for (set<mds_rank_t>::iterator p = slaves.begin(); p != slaves.end(); ++p) { + if (!mds->is_cluster_degraded() || + mds->mdsmap->get_state(*p) >= MDSMap::STATE_REJOIN) { + dout(10) << "_drop_non_rdlocks dropping remote locks on mds." 
<< *p << dendl; + auto slavereq = MMDSSlaveRequest::create(mut->reqid, mut->attempt, MMDSSlaveRequest::OP_DROPLOCKS); + mds->send_message_mds(slavereq, *p); + } + } +} + +void Locker::cancel_locking(MutationImpl *mut, set<CInode*> *pneed_issue) +{ + SimpleLock *lock = mut->locking; + ceph_assert(lock); + dout(10) << "cancel_locking " << *lock << " on " << *mut << dendl; + + if (lock->get_parent()->is_auth()) { + bool need_issue = false; + if (lock->get_state() == LOCK_PREXLOCK) { + _finish_xlock(lock, -1, &need_issue); + } else if (lock->get_state() == LOCK_LOCK_XLOCK) { + lock->set_state(LOCK_XLOCKDONE); + eval_gather(lock, true, &need_issue); + } + if (need_issue) + pneed_issue->insert(static_cast<CInode *>(lock->get_parent())); + } + mut->finish_locking(lock); +} + +void Locker::drop_locks(MutationImpl *mut, set<CInode*> *pneed_issue) +{ + // leftover locks + set<CInode*> my_need_issue; + if (!pneed_issue) + pneed_issue = &my_need_issue; + + if (mut->locking) + cancel_locking(mut, pneed_issue); + _drop_locks(mut, pneed_issue, true); + + if (pneed_issue == &my_need_issue) + issue_caps_set(*pneed_issue); + mut->done_locking = false; +} + +void Locker::drop_non_rdlocks(MutationImpl *mut, set<CInode*> *pneed_issue) +{ + set<CInode*> my_need_issue; + if (!pneed_issue) + pneed_issue = &my_need_issue; + + _drop_locks(mut, pneed_issue, false); + + if (pneed_issue == &my_need_issue) + issue_caps_set(*pneed_issue); +} + +void Locker::drop_rdlocks_for_early_reply(MutationImpl *mut) +{ + set<CInode*> need_issue; + + for (auto it = mut->locks.begin(); it != mut->locks.end(); ) { + if (!it->is_rdlock()) { + ++it; + continue; + } + SimpleLock *lock = it->lock; + // make later mksnap/setlayout (at other mds) wait for this unsafe request + if (lock->get_type() == CEPH_LOCK_ISNAP || + lock->get_type() == CEPH_LOCK_IPOLICY) { + ++it; + continue; + } + bool ni = false; + rdlock_finish(it++, mut, &ni); + if (ni) + need_issue.insert(static_cast<CInode*>(lock->get_parent())); + } + + 
issue_caps_set(need_issue); +} + +void Locker::drop_locks_for_fragment_unfreeze(MutationImpl *mut) +{ + set<CInode*> need_issue; + + for (auto it = mut->locks.begin(); it != mut->locks.end(); ) { + SimpleLock *lock = it->lock; + if (lock->get_type() == CEPH_LOCK_IDFT) { + ++it; + continue; + } + bool ni = false; + wrlock_finish(it++, mut, &ni); + if (ni) + need_issue.insert(static_cast<CInode*>(lock->get_parent())); + } + issue_caps_set(need_issue); +} + +// generics + +void Locker::eval_gather(SimpleLock *lock, bool first, bool *pneed_issue, MDSContext::vec *pfinishers) +{ + dout(10) << "eval_gather " << *lock << " on " << *lock->get_parent() << dendl; + ceph_assert(!lock->is_stable()); + + int next = lock->get_next_state(); + + CInode *in = 0; + bool caps = lock->get_cap_shift(); + if (lock->get_type() != CEPH_LOCK_DN) + in = static_cast<CInode *>(lock->get_parent()); + + bool need_issue = false; + + int loner_issued = 0, other_issued = 0, xlocker_issued = 0; + ceph_assert(!caps || in != NULL); + if (caps && in->is_head()) { + in->get_caps_issued(&loner_issued, &other_issued, &xlocker_issued, + lock->get_cap_shift(), lock->get_cap_mask()); + dout(10) << " next state is " << lock->get_state_name(next) + << " issued/allows loner " << gcap_string(loner_issued) + << "/" << gcap_string(lock->gcaps_allowed(CAP_LONER, next)) + << " xlocker " << gcap_string(xlocker_issued) + << "/" << gcap_string(lock->gcaps_allowed(CAP_XLOCKER, next)) + << " other " << gcap_string(other_issued) + << "/" << gcap_string(lock->gcaps_allowed(CAP_ANY, next)) + << dendl; + + if (first && ((~lock->gcaps_allowed(CAP_ANY, next) & other_issued) || + (~lock->gcaps_allowed(CAP_LONER, next) & loner_issued) || + (~lock->gcaps_allowed(CAP_XLOCKER, next) & xlocker_issued))) + need_issue = true; + } + +#define IS_TRUE_AND_LT_AUTH(x, auth) (x && ((auth && x <= AUTH) || (!auth && x < AUTH))) + bool auth = lock->get_parent()->is_auth(); + if (!lock->is_gathering() && + 
      // gather is done only when no remaining holder conflicts with the
      // next state, no scatter flush is pending, no caps beyond what the
      // next state allows remain issued, and we are not in a *2 state
      // that waits for an explicit trigger from the auth mds.
      (IS_TRUE_AND_LT_AUTH(lock->get_sm()->states[next].can_rdlock, auth) || !lock->is_rdlocked()) &&
      (IS_TRUE_AND_LT_AUTH(lock->get_sm()->states[next].can_wrlock, auth) || !lock->is_wrlocked()) &&
      (IS_TRUE_AND_LT_AUTH(lock->get_sm()->states[next].can_xlock, auth) || !lock->is_xlocked()) &&
      (IS_TRUE_AND_LT_AUTH(lock->get_sm()->states[next].can_lease, auth) || !lock->is_leased()) &&
      !(lock->get_parent()->is_auth() && lock->is_flushing()) &&  // i.e. wait for scatter_writebehind!
      (!caps || ((~lock->gcaps_allowed(CAP_ANY, next) & other_issued) == 0 &&
                 (~lock->gcaps_allowed(CAP_LONER, next) & loner_issued) == 0 &&
                 (~lock->gcaps_allowed(CAP_XLOCKER, next) & xlocker_issued) == 0)) &&
      lock->get_state() != LOCK_SYNC_MIX2 &&  // these states need an explicit trigger from the auth mds
      lock->get_state() != LOCK_MIX_SYNC2
      ) {
    dout(7) << "eval_gather finished gather on " << *lock
            << " on " << *lock->get_parent() << dendl;

    if (lock->get_sm() == &sm_filelock) {
      ceph_assert(in);
      // file data must be recovered before the filelock can move on
      if (in->state_test(CInode::STATE_RECOVERING)) {
        dout(7) << "eval_gather finished gather, but still recovering" << dendl;
        return;
      } else if (in->state_test(CInode::STATE_NEEDSRECOVER)) {
        dout(7) << "eval_gather finished gather, but need to recover" << dendl;
        mds->mdcache->queue_file_recover(in);
        mds->mdcache->do_file_recover();
        return;
      }
    }

    if (!lock->get_parent()->is_auth()) {
      // replica: tell auth
      mds_rank_t auth = lock->get_parent()->authority().first;

      if (lock->get_parent()->is_rejoining() &&
          mds->mdsmap->get_state(auth) == MDSMap::STATE_REJOIN) {
        dout(7) << "eval_gather finished gather, but still rejoining "
                << *lock->get_parent() << dendl;
        return;
      }

      if (!mds->is_cluster_degraded() ||
          mds->mdsmap->get_state(auth) >= MDSMap::STATE_REJOIN) {
        switch (lock->get_state()) {
        case LOCK_SYNC_LOCK:
          mds->send_message_mds(MLock::create(lock, LOCK_AC_LOCKACK, mds->get_nodeid()), auth);
          break;

        case LOCK_MIX_SYNC:
          {
            auto reply = MLock::create(lock, LOCK_AC_SYNCACK, mds->get_nodeid());
            lock->encode_locked_state(reply->get_data());
            mds->send_message_mds(reply, auth);
            next = LOCK_MIX_SYNC2;
            (static_cast<ScatterLock *>(lock))->start_flush();
          }
          break;

        case LOCK_MIX_SYNC2:
          (static_cast<ScatterLock *>(lock))->finish_flush();
          (static_cast<ScatterLock *>(lock))->clear_flushed();
          // fall-thru: already acked in MIX_SYNC; nothing more to send

        case LOCK_SYNC_MIX2:
          // do nothing, we already acked
          break;

        case LOCK_SYNC_MIX:
          {
            auto reply = MLock::create(lock, LOCK_AC_MIXACK, mds->get_nodeid());
            mds->send_message_mds(reply, auth);
            next = LOCK_SYNC_MIX2;
          }
          break;

        case LOCK_MIX_LOCK:
          {
            bufferlist data;
            lock->encode_locked_state(data);
            mds->send_message_mds(MLock::create(lock, LOCK_AC_LOCKACK, mds->get_nodeid(), data), auth);
            (static_cast<ScatterLock *>(lock))->start_flush();
            // we'll get an AC_LOCKFLUSHED to complete
          }
          break;

        default:
          ceph_abort();
        }
      }
    } else {
      // auth

      // once the first (local) stage of mix->lock gather complete we can
      // gather from replicas
      if (lock->get_state() == LOCK_MIX_LOCK &&
          lock->get_parent()->is_replicated()) {
        dout(10) << " finished (local) gather for mix->lock, now gathering from replicas" << dendl;
        send_lock_message(lock, LOCK_AC_LOCK);
        lock->init_gather();
        lock->set_state(LOCK_MIX_LOCK2);
        return;
      }

      // dirty scatter data must hit the journal before we change state
      if (lock->is_dirty() && !lock->is_flushed()) {
        scatter_writebehind(static_cast<ScatterLock *>(lock));
        mds->mdlog->flush();
        return;
      }
      lock->clear_flushed();

      switch (lock->get_state()) {
        // to mixed
      case LOCK_TSYN_MIX:
      case LOCK_SYNC_MIX:
      case LOCK_EXCL_MIX:
      case LOCK_XSYN_MIX:
        in->start_scatter(static_cast<ScatterLock *>(lock));
        if (lock->get_parent()->is_replicated()) {
          bufferlist softdata;
          lock->encode_locked_state(softdata);
          send_lock_message(lock, LOCK_AC_MIX, softdata);
        }
        (static_cast<ScatterLock *>(lock))->clear_scatter_wanted();
        break;

      case LOCK_XLOCK:
      case LOCK_XLOCKDONE:
        if (next != LOCK_SYNC)
          break;
        // fall-thru

        // to sync
      case LOCK_EXCL_SYNC:
      case LOCK_LOCK_SYNC:
      case LOCK_MIX_SYNC:
      case LOCK_XSYN_SYNC:
        if (lock->get_parent()->is_replicated()) {
          bufferlist softdata;
          lock->encode_locked_state(softdata);
          send_lock_message(lock, LOCK_AC_SYNC, softdata);
        }
        break;
      }

    }

    lock->set_state(next);

    if (lock->get_parent()->is_auth() &&
        lock->is_stable())
      lock->get_parent()->auth_unpin(lock);

    // drop loner before doing waiters
    if (caps &&
        in->is_head() &&
        in->is_auth() &&
        in->get_wanted_loner() != in->get_loner()) {
      dout(10) << " trying to drop loner" << dendl;
      if (in->try_drop_loner()) {
        dout(10) << " dropped loner" << dendl;
        need_issue = true;
      }
    }

    // wake anyone waiting for stability or a rd/wr/xlock opportunity;
    // callers may want to collect the contexts instead of running them
    if (pfinishers)
      lock->take_waiting(SimpleLock::WAIT_STABLE|SimpleLock::WAIT_WR|SimpleLock::WAIT_RD|SimpleLock::WAIT_XLOCK,
                         *pfinishers);
    else
      lock->finish_waiters(SimpleLock::WAIT_STABLE|SimpleLock::WAIT_WR|SimpleLock::WAIT_RD|SimpleLock::WAIT_XLOCK);

    if (caps && in->is_head())
      need_issue = true;

    if (lock->get_parent()->is_auth() &&
        lock->is_stable())
      try_eval(lock, &need_issue);
  }

  if (need_issue) {
    if (pneed_issue)
      *pneed_issue = true;
    else if (in->is_head())
      issue_caps(in);
  }

}

// Re-evaluate the inode locks selected by 'mask' (-1 = all), choosing
// or dropping the loner client as wanted.  Returns true if caps were
// (or must be) reissued.
bool Locker::eval(CInode *in, int mask, bool caps_imported)
{
  bool need_issue = caps_imported;
  MDSContext::vec finishers;

  dout(10) << "eval " << mask << " " << *in << dendl;

  // choose loner?
  if (in->is_auth() && in->is_head()) {
    client_t orig_loner = in->get_loner();
    if (in->choose_ideal_loner()) {
      dout(10) << "eval set loner: client." << orig_loner << " -> client." << in->get_loner() << dendl;
      need_issue = true;
      mask = -1;  // loner changed: re-evaluate every lock
    } else if (in->get_wanted_loner() != in->get_loner()) {
      dout(10) << "eval want loner: client." << in->get_wanted_loner() << " but failed to set it" << dendl;
      mask = -1;
    }
  }

 retry:
  if (mask & CEPH_LOCK_IFILE)
    eval_any(&in->filelock, &need_issue, &finishers, caps_imported);
  if (mask & CEPH_LOCK_IAUTH)
    eval_any(&in->authlock, &need_issue, &finishers, caps_imported);
  if (mask & CEPH_LOCK_ILINK)
    eval_any(&in->linklock, &need_issue, &finishers, caps_imported);
  if (mask & CEPH_LOCK_IXATTR)
    eval_any(&in->xattrlock, &need_issue, &finishers, caps_imported);
  if (mask & CEPH_LOCK_INEST)
    eval_any(&in->nestlock, &need_issue, &finishers, caps_imported);
  if (mask & CEPH_LOCK_IFLOCK)
    eval_any(&in->flocklock, &need_issue, &finishers, caps_imported);
  if (mask & CEPH_LOCK_IPOLICY)
    eval_any(&in->policylock, &need_issue, &finishers, caps_imported);

  // drop loner?
  if (in->is_auth() && in->is_head() && in->get_wanted_loner() != in->get_loner()) {
    if (in->try_drop_loner()) {
      need_issue = true;
      if (in->get_wanted_loner() >= 0) {
        dout(10) << "eval end set loner to client." << in->get_loner() << dendl;
        bool ok = in->try_set_loner();
        ceph_assert(ok);
        mask = -1;
        goto retry;  // new loner: run all locks through eval_any again
      }
    }
  }

  finish_contexts(g_ceph_context, finishers);

  if (need_issue && in->is_head())
    issue_caps(in);

  dout(10) << "eval done" << dendl;
  return need_issue;
}

// Waiter context that retries try_eval(p, mask) once the condition it
// was queued on (unfreeze, single auth, ...) clears.
class C_Locker_Eval : public LockerContext {
  MDSCacheObject *p;
  int mask;
public:
  C_Locker_Eval(Locker *l, MDSCacheObject *pp, int m) : LockerContext(l), p(pp), mask(m) {
    // We are used as an MDSCacheObject waiter, so should
    // only be invoked by someone already holding the big lock.
    ceph_assert(locker->mds->mds_lock.is_locked_by_me());
    p->get(MDSCacheObject::PIN_PTRWAITER);
  }
  void finish(int r) override {
    locker->try_eval(p, mask);
    p->put(MDSCacheObject::PIN_PTRWAITER);
  }
};

// Evaluate the locks selected by 'mask' on a cache object (dentry or
// inode), deferring via a waiter if auth is ambiguous or the object is
// frozen.
void Locker::try_eval(MDSCacheObject *p, int mask)
{
  // unstable and ambiguous auth?
  if (p->is_ambiguous_auth()) {
    dout(7) << "try_eval ambiguous auth, waiting on " << *p << dendl;
    p->add_waiter(MDSCacheObject::WAIT_SINGLEAUTH, new C_Locker_Eval(this, p, mask));
    return;
  }

  if (p->is_auth() && p->is_frozen()) {
    dout(7) << "try_eval frozen, waiting on " << *p << dendl;
    p->add_waiter(MDSCacheObject::WAIT_UNFREEZE, new C_Locker_Eval(this, p, mask));
    return;
  }

  if (mask & CEPH_LOCK_DN) {
    ceph_assert(mask == CEPH_LOCK_DN);
    bool need_issue = false;  // ignore this, no caps on dentries
    CDentry *dn = static_cast<CDentry *>(p);
    eval_any(&dn->lock, &need_issue);
  } else {
    CInode *in = static_cast<CInode *>(p);
    eval(in, mask);
  }
}

// Evaluate a single lock, deferring (via waiters) on ambiguous auth,
// non-auth, frozen, or freezing parents; honors pending scatter /
// unscatter requests on scatterlocks first.
void Locker::try_eval(SimpleLock *lock, bool *pneed_issue)
{
  MDSCacheObject *p = lock->get_parent();

  // unstable and ambiguous auth?
  if (p->is_ambiguous_auth()) {
    dout(7) << "try_eval " << *lock << " ambiguousauth, waiting on " << *p << dendl;
    p->add_waiter(MDSCacheObject::WAIT_SINGLEAUTH, new C_Locker_Eval(this, p, lock->get_type()));
    return;
  }

  if (!p->is_auth()) {
    dout(7) << "try_eval " << *lock << " not auth for " << *p << dendl;
    return;
  }

  if (p->is_frozen()) {
    dout(7) << "try_eval " << *lock << " frozen, waiting on " << *p << dendl;
    p->add_waiter(MDSCacheObject::WAIT_UNFREEZE, new C_Locker_Eval(this, p, lock->get_type()));
    return;
  }

  /*
   * We could have a situation like:
   *
   * - mds A authpins item on mds B
   * - mds B starts to freeze tree containing item
   * - mds A tries wrlock_start on A, sends REQSCATTER to B
   * - mds B lock is unstable, sets scatter_wanted
   * - mds B lock stabilizes, calls try_eval.
   *
   * We can defer while freezing without causing a deadlock.  Honor
   * scatter_wanted flag here.  This will never get deferred by the
   * checks above due to the auth_pin held by the master.
   */
  if (lock->is_scatterlock()) {
    ScatterLock *slock = static_cast<ScatterLock *>(lock);
    if (slock->get_scatter_wanted() &&
        slock->get_state() != LOCK_MIX) {
      scatter_mix(slock, pneed_issue);
      if (!lock->is_stable())
        return;
    } else if (slock->get_unscatter_wanted() &&
        slock->get_state() != LOCK_LOCK) {
      simple_lock(slock, pneed_issue);
      if (!lock->is_stable()) {
        return;
      }
    }
  }

  // dentry and snap locks may still be evaluated while freezing;
  // everything else waits for the unfreeze
  if (lock->get_type() != CEPH_LOCK_DN &&
      lock->get_type() != CEPH_LOCK_ISNAP &&
      p->is_freezing()) {
    dout(7) << "try_eval " << *lock << " freezing, waiting on " << *p << dendl;
    p->add_waiter(MDSCacheObject::WAIT_UNFREEZE, new C_Locker_Eval(this, p, lock->get_type()));
    return;
  }

  eval(lock, pneed_issue);
}

// Kick any cap-related locks (file/auth/link/xattr) that are mid-gather
// so pending cap revocations can complete; affected inodes are added to
// *issue_set (or caps are issued directly if issue_set is null).
void Locker::eval_cap_gather(CInode *in, set<CInode*> *issue_set)
{
  bool need_issue = false;
  MDSContext::vec finishers;

  // kick locks now
  if (!in->filelock.is_stable())
    eval_gather(&in->filelock, false, &need_issue, &finishers);
  if (!in->authlock.is_stable())
    eval_gather(&in->authlock, false, &need_issue, &finishers);
  if (!in->linklock.is_stable())
    eval_gather(&in->linklock, false, &need_issue, &finishers);
  if (!in->xattrlock.is_stable())
    eval_gather(&in->xattrlock, false, &need_issue, &finishers);

  if (need_issue && in->is_head()) {
    if (issue_set)
      issue_set->insert(in);
    else
      issue_caps(in);
  }

  finish_contexts(g_ceph_context, finishers);
}

// Kick the scatterlocks (file/nest/dirfragtree) that are mid-gather.
void Locker::eval_scatter_gathers(CInode *in)
{
  bool need_issue = false;
  MDSContext::vec finishers;

  dout(10) << "eval_scatter_gathers " << *in << dendl;

  // kick locks now
  if (!in->filelock.is_stable())
    eval_gather(&in->filelock, false, &need_issue, &finishers);
  if (!in->nestlock.is_stable())
    eval_gather(&in->nestlock, false, &need_issue, &finishers);
  if (!in->dirfragtreelock.is_stable())
    eval_gather(&in->dirfragtreelock, false, &need_issue, &finishers);

  if (need_issue && in->is_head())
    issue_caps(in);

  finish_contexts(g_ceph_context, finishers);
}

// Dispatch to the type-specific eval routine for a stable lock.
void Locker::eval(SimpleLock *lock, bool *need_issue)
{
  switch (lock->get_type()) {
  case CEPH_LOCK_IFILE:
    return file_eval(static_cast<ScatterLock*>(lock), need_issue);
  case CEPH_LOCK_IDFT:
  case CEPH_LOCK_INEST:
    return scatter_eval(static_cast<ScatterLock*>(lock), need_issue);
  default:
    return simple_eval(lock, need_issue);
  }
}


// ------------------
// rdlock

// Try to push a lock toward a rdlock-able state.  Returns true if we
// initiated a transition locally (auth); on a replica we may instead
// ask the auth for a rdlock-friendly state and return false.
bool Locker::_rdlock_kick(SimpleLock *lock, bool as_anon)
{
  // kick the lock
  if (lock->is_stable()) {
    if (lock->get_parent()->is_auth()) {
      if (lock->get_sm() == &sm_scatterlock) {
        // not until tempsync is fully implemented
        //if (lock->get_parent()->is_replicated())
        //scatter_tempsync((ScatterLock*)lock);
        //else
        simple_sync(lock);
      } else if (lock->get_sm() == &sm_filelock) {
        CInode *in = static_cast<CInode*>(lock->get_parent());
        if (lock->get_state() == LOCK_EXCL &&
            in->get_target_loner() >= 0 &&
            !in->is_dir() && !as_anon)  // as_anon => caller wants SYNC, not XSYN
          file_xsyn(lock);
        else
          simple_sync(lock);
      } else
        simple_sync(lock);
      return true;
    } else {
      // request rdlock state change from auth
      mds_rank_t auth = lock->get_parent()->authority().first;
      if (!mds->is_cluster_degraded() ||
          mds->mdsmap->is_clientreplay_or_active_or_stopping(auth)) {
        dout(10) << "requesting rdlock from auth on "
                 << *lock << " on " << *lock->get_parent() << dendl;
        mds->send_message_mds(MLock::create(lock, LOCK_AC_REQRDLOCK, mds->get_nodeid()), auth);
      }
      return false;
    }
  }
  if (lock->get_type() == CEPH_LOCK_IFILE) {
    CInode *in = static_cast<CInode *>(lock->get_parent());
    if (in->state_test(CInode::STATE_RECOVERING)) {
      // recovery is what is blocking us; bump its priority
      mds->mdcache->recovery_queue.prioritize(in);
    }
  }

  return false;
}

// Non-blocking rdlock attempt: returns true on success without taking a
// reference; otherwise optionally queues 'con' to be woken for a retry.
bool Locker::rdlock_try(SimpleLock *lock, client_t client, MDSContext *con)
{
  dout(7) << "rdlock_try on " << *lock << " on " << *lock->get_parent() << dendl;

  // can read? grab ref.
  if (lock->can_rdlock(client))
    return true;

  _rdlock_kick(lock, false);

  if (lock->can_rdlock(client))
    return true;

  // wait!
  if (con) {
    dout(7) << "rdlock_try waiting on " << *lock << " on " << *lock->get_parent() << dendl;
    lock->add_waiter(SimpleLock::WAIT_STABLE|SimpleLock::WAIT_RD, con);
  }
  return false;
}

// Acquire a rdlock for a client request, kicking the lock state as
// needed.  Returns true and records the lock in mut->locks on success;
// otherwise queues a retry of the request and returns false.
bool Locker::rdlock_start(SimpleLock *lock, MDRequestRef& mut, bool as_anon)
{
  dout(7) << "rdlock_start on " << *lock << " on " << *lock->get_parent() << dendl;

  // client may be allowed to rdlock the same item it has xlocked.
  // UNLESS someone passes in as_anon, or we're reading snapped version here.
  if (mut->snapid != CEPH_NOSNAP)
    as_anon = true;
  client_t client = as_anon ? -1 : mut->get_client();

  CInode *in = 0;
  if (lock->get_type() != CEPH_LOCK_DN)
    in = static_cast<CInode *>(lock->get_parent());

  /*
  if (!lock->get_parent()->is_auth() &&
      lock->fw_rdlock_to_auth()) {
    mdcache->request_forward(mut, lock->get_parent()->authority().first);
    return false;
  }
  */

  while (1) {
    // can read? grab ref.
    if (lock->can_rdlock(client)) {
      lock->get_rdlock();
      mut->locks.emplace_hint(mut->locks.end(), lock, MutationImpl::LockOp::RDLOCK);
      return true;
    }

    // hmm, wait a second.
    // a snapped (non-head) inode's lock may need the head inode's lock
    // to go SYNC before it can itself be read
    if (in && !in->is_head() && in->is_auth() &&
        lock->get_state() == LOCK_SNAP_SYNC) {
      // okay, we actually need to kick the head's lock to get ourselves synced up.
      CInode *head = mdcache->get_inode(in->ino());
      ceph_assert(head);
      SimpleLock *hlock = head->get_lock(CEPH_LOCK_IFILE);
      if (hlock->get_state() == LOCK_SYNC)
        hlock = head->get_lock(lock->get_type());

      if (hlock->get_state() != LOCK_SYNC) {
        dout(10) << "rdlock_start trying head inode " << *head << dendl;
        if (!rdlock_start(hlock, mut, true))  // ** as_anon, no rdlock on EXCL **
          return false;
        // oh, check our lock again then
      }
    }

    if (!_rdlock_kick(lock, as_anon))
      break;
  }

  // wait!
  int wait_on;
  if (lock->get_parent()->is_auth() && lock->is_stable())
    wait_on = SimpleLock::WAIT_RD;
  else
    wait_on = SimpleLock::WAIT_STABLE;  // REQRDLOCK is ignored if lock is unstable, so we need to retry.
  dout(7) << "rdlock_start waiting on " << *lock << " on " << *lock->get_parent() << dendl;
  lock->add_waiter(wait_on, new C_MDS_RetryRequest(mdcache, mut));
  nudge_log(lock);
  return false;
}

// Flush the mdlog if this lock's progress is blocked on a journal write
// (e.g. xlockdone or a cap flush).
void Locker::nudge_log(SimpleLock *lock)
{
  dout(10) << "nudge_log " << *lock << " on " << *lock->get_parent() << dendl;
  if (lock->get_parent()->is_auth() && lock->is_unstable_and_locked())  // as with xlockdone, or cap flush
    mds->mdlog->flush();
}

// Release one rdlock (by mutation lock iterator) and re-evaluate the
// lock if we were the last reader.
void Locker::rdlock_finish(const MutationImpl::lock_iterator& it, MutationImpl *mut, bool *pneed_issue)
{
  ceph_assert(it->is_rdlock());
  SimpleLock *lock = it->lock;
  // drop ref
  lock->put_rdlock();
  if (mut)
    mut->locks.erase(it);

  dout(7) << "rdlock_finish on " << *lock << " on " << *lock->get_parent() << dendl;

  // last one?
  if (!lock->is_rdlocked()) {
    if (!lock->is_stable())
      eval_gather(lock, false, pneed_issue);
    else if (lock->get_parent()->is_auth())
      try_eval(lock, pneed_issue);
  }
}


// Check (without taking anything) that every lock in the vector can be
// rdlocked anonymously right now.
bool Locker::can_rdlock_set(MutationImpl::LockOpVec& lov)
{
  dout(10) << "can_rdlock_set " << dendl;
  for (const auto& p : lov) {
    ceph_assert(p.is_rdlock());
    if (!p.lock->can_rdlock(-1)) {
      dout(10) << "can_rdlock_set can't rdlock " << *p << " on " << *p.lock->get_parent() << dendl;
      return false;
    }
  }
  return true;
}


// Take all rdlocks in the vector unconditionally (caller must have
// verified availability, e.g. via can_rdlock_set).
void Locker::rdlock_take_set(MutationImpl::LockOpVec& lov, MutationRef& mut)
{
  dout(10) << "rdlock_take_set " << dendl;
  for (const auto& p : lov) {
    ceph_assert(p.is_rdlock());
    p.lock->get_rdlock();
    mut->locks.emplace(p.lock, MutationImpl::LockOp::RDLOCK);
  }
}

// ------------------
// wrlock

// Take a wrlock unconditionally (force=true), bypassing state checks;
// version locks are delegated to the local-lock variant.
void Locker::wrlock_force(SimpleLock *lock, MutationRef& mut)
{
  if (lock->get_type() == CEPH_LOCK_IVERSION ||
      lock->get_type() == CEPH_LOCK_DVERSION)
    return local_wrlock_grab(static_cast<LocalLock*>(lock), mut);

  dout(7) << "wrlock_force on " << *lock
          << " on " << *lock->get_parent() << dendl;
  lock->get_wrlock(true);
  mut->locks.emplace(lock, MutationImpl::LockOp::WRLOCK);
}

// Acquire a wrlock, scattering the lock (to MIX) first when the inode
// has subtree/exporting dirfrags.  With nowait=true, never blocks and
// never starts a nested log event (the caller holds one open).
bool Locker::wrlock_start(SimpleLock *lock, MDRequestRef& mut, bool nowait)
{
  if (lock->get_type() == CEPH_LOCK_IVERSION ||
      lock->get_type() == CEPH_LOCK_DVERSION)
    return local_wrlock_start(static_cast<LocalLock*>(lock), mut);

  dout(10) << "wrlock_start " << *lock << " on " << *lock->get_parent() << dendl;

  CInode *in = static_cast<CInode *>(lock->get_parent());
  client_t client = mut->get_client();
  bool want_scatter = !nowait && lock->get_parent()->is_auth() &&
                      (in->has_subtree_or_exporting_dirfrag() ||
                       static_cast<ScatterLock*>(lock)->get_scatter_wanted());

  while (1) {
    // wrlock?
    if (lock->can_wrlock(client) &&
        (!want_scatter || lock->get_state() == LOCK_MIX)) {
      lock->get_wrlock();
      auto it = mut->locks.emplace_hint(mut->locks.end(), lock, MutationImpl::LockOp::WRLOCK);
      it->flags |= MutationImpl::LockOp::WRLOCK;  // may already remote_wrlocked
      return true;
    }

    if (lock->get_type() == CEPH_LOCK_IFILE &&
        in->state_test(CInode::STATE_RECOVERING)) {
      // file recovery is blocking the lock; bump its priority
      mds->mdcache->recovery_queue.prioritize(in);
    }

    if (!lock->is_stable())
      break;

    if (in->is_auth()) {
      // don't do nested lock state change if we have dirty scatterdata and
      // may scatter_writebehind or start_scatter, because nowait==true implies
      // that the caller already has a log entry open!
      if (nowait && lock->is_dirty())
        return false;

      if (want_scatter)
        scatter_mix(static_cast<ScatterLock*>(lock));
      else
        simple_lock(lock);

      if (nowait && !lock->can_wrlock(client))
        return false;

    } else {
      // replica.
      // auth should be auth_pinned (see acquire_locks wrlock weird mustpin case).
      mds_rank_t auth = lock->get_parent()->authority().first;
      if (!mds->is_cluster_degraded() ||
          mds->mdsmap->is_clientreplay_or_active_or_stopping(auth)) {
        dout(10) << "requesting scatter from auth on "
                 << *lock << " on " << *lock->get_parent() << dendl;
        mds->send_message_mds(MLock::create(lock, LOCK_AC_REQSCATTER, mds->get_nodeid()), auth);
      }
      break;
    }
  }

  if (!nowait) {
    dout(7) << "wrlock_start waiting on " << *lock << " on " << *lock->get_parent() << dendl;
    lock->add_waiter(SimpleLock::WAIT_STABLE, new C_MDS_RetryRequest(mdcache, mut));
    nudge_log(lock);
  }

  return false;
}

// Release one wrlock and re-evaluate the lock if we were the last
// writer.  If the op also holds a remote wrlock, only the local wrlock
// flag is cleared and the entry stays in mut->locks.
void Locker::wrlock_finish(const MutationImpl::lock_iterator& it, MutationImpl *mut, bool *pneed_issue)
{
  ceph_assert(it->is_wrlock());
  SimpleLock* lock = it->lock;

  if (lock->get_type() == CEPH_LOCK_IVERSION ||
      lock->get_type() == CEPH_LOCK_DVERSION)
    return local_wrlock_finish(it, mut);

  dout(7) << "wrlock_finish on " << *lock << " on " << *lock->get_parent() << dendl;
  lock->put_wrlock();

  if (it->is_remote_wrlock())
    it->clear_wrlock();
  else
    mut->locks.erase(it);

  if (!lock->is_wrlocked()) {
    if (!lock->is_stable())
      eval_gather(lock, false, pneed_issue);
    else if (lock->get_parent()->is_auth())
      try_eval(lock, pneed_issue);
  }
}


// remote wrlock

// Ask mds.target (the lock's auth) to take a wrlock on our behalf; the
// reply is awaited via mut->more()->waiting_on_slave.
void Locker::remote_wrlock_start(SimpleLock *lock, mds_rank_t target, MDRequestRef& mut)
{
  dout(7) << "remote_wrlock_start mds." << target << " on " << *lock << " on " << *lock->get_parent() << dendl;

  // wait for active target
  if (mds->is_cluster_degraded() &&
      !mds->mdsmap->is_clientreplay_or_active_or_stopping(target)) {
    dout(7) << " mds." << target << " is not active" << dendl;
    if (mut->more()->waiting_on_slave.empty())
      mds->wait_for_active_peer(target, new C_MDS_RetryRequest(mdcache, mut));
    return;
  }

  // send lock request
  mut->start_locking(lock, target);
  mut->more()->slaves.insert(target);
  auto r = MMDSSlaveRequest::create(mut->reqid, mut->attempt, MMDSSlaveRequest::OP_WRLOCK);
  r->set_lock_type(lock->get_type());
  lock->get_parent()->set_object_info(r->get_object_info());
  mds->send_message_mds(r, target);

  ceph_assert(mut->more()->waiting_on_slave.count(target) == 0);
  mut->more()->waiting_on_slave.insert(target);
}

// Release a remote wrlock by telling the target mds to drop it (unless
// the cluster is degraded and the target has not reached rejoin).
void Locker::remote_wrlock_finish(const MutationImpl::lock_iterator& it, MutationImpl *mut)
{
  ceph_assert(it->is_remote_wrlock());
  SimpleLock *lock = it->lock;
  mds_rank_t target = it->wrlock_target;

  // keep the entry if a local wrlock is still held on the same lock
  if (it->is_wrlock())
    it->clear_remote_wrlock();
  else
    mut->locks.erase(it);

  dout(7) << "remote_wrlock_finish releasing remote wrlock on mds." << target
          << " " << *lock->get_parent() << dendl;
  if (!mds->is_cluster_degraded() ||
      mds->mdsmap->get_state(target) >= MDSMap::STATE_REJOIN) {
    auto slavereq = MMDSSlaveRequest::create(mut->reqid, mut->attempt, MMDSSlaveRequest::OP_UNWRLOCK);
    slavereq->set_lock_type(lock->get_type());
    lock->get_parent()->set_object_info(slavereq->get_object_info());
    mds->send_message_mds(slavereq, target);
  }
}


// ------------------
// xlock

// Acquire an exclusive lock.  On the auth mds this drives the local
// state machine; on a replica it forwards an OP_XLOCK slave request to
// the auth and waits for the reply.
bool Locker::xlock_start(SimpleLock *lock, MDRequestRef& mut)
{
  if (lock->get_type() == CEPH_LOCK_IVERSION ||
      lock->get_type() == CEPH_LOCK_DVERSION)
    return local_xlock_start(static_cast<LocalLock*>(lock), mut);

  dout(7) << "xlock_start on " << *lock << " on " << *lock->get_parent() << dendl;
  client_t client = mut->get_client();

  CInode *in = nullptr;
  if (lock->get_cap_shift())
    in = static_cast<CInode *>(lock->get_parent());

  // auth?
  if (lock->get_parent()->is_auth()) {
    // auth
    while (1) {
      if (mut->locking &&  // started xlock (not preempt other request)
          lock->can_xlock(client) &&
          !(lock->get_state() == LOCK_LOCK_XLOCK &&      // client is not xlocker or
            in && in->issued_caps_need_gather(lock))) {  // xlocker does not hold shared cap
        lock->set_state(LOCK_XLOCK);
        lock->get_xlock(mut, client);
        mut->locks.emplace_hint(mut->locks.end(), lock, MutationImpl::LockOp::XLOCK);
        mut->finish_locking(lock);
        return true;
      }

      if (lock->get_type() == CEPH_LOCK_IFILE &&
          in->state_test(CInode::STATE_RECOVERING)) {
        mds->mdcache->recovery_queue.prioritize(in);
      }

      if (!lock->is_stable() && (lock->get_state() != LOCK_XLOCKDONE ||
                                 lock->get_xlock_by_client() != client ||
                                 lock->is_waiter_for(SimpleLock::WAIT_STABLE)))
        break;

      if (lock->get_state() == LOCK_LOCK || lock->get_state() == LOCK_XLOCKDONE) {
        mut->start_locking(lock);
        simple_xlock(lock);
      } else {
        simple_lock(lock);
      }
    }

    lock->add_waiter(SimpleLock::WAIT_WR|SimpleLock::WAIT_STABLE, new C_MDS_RetryRequest(mdcache, mut));
    nudge_log(lock);
    return false;
  } else {
    // replica
    ceph_assert(lock->get_sm()->can_remote_xlock);
    ceph_assert(!mut->slave_request);

    // wait for single auth
    if (lock->get_parent()->is_ambiguous_auth()) {
      lock->get_parent()->add_waiter(MDSCacheObject::WAIT_SINGLEAUTH,
                                     new C_MDS_RetryRequest(mdcache, mut));
      return false;
    }

    // wait for active auth
    mds_rank_t auth = lock->get_parent()->authority().first;
    if (mds->is_cluster_degraded() &&
        !mds->mdsmap->is_clientreplay_or_active_or_stopping(auth)) {
      dout(7) << " mds." << auth << " is not active" << dendl;
      if (mut->more()->waiting_on_slave.empty())
        mds->wait_for_active_peer(auth, new C_MDS_RetryRequest(mdcache, mut));
      return false;
    }

    // send lock request
    mut->more()->slaves.insert(auth);
    mut->start_locking(lock, auth);
    auto r = MMDSSlaveRequest::create(mut->reqid, mut->attempt, MMDSSlaveRequest::OP_XLOCK);
    r->set_lock_type(lock->get_type());
    lock->get_parent()->set_object_info(r->get_object_info());
    mds->send_message_mds(r, auth);

    ceph_assert(mut->more()->waiting_on_slave.count(auth) == 0);
    mut->more()->waiting_on_slave.insert(auth);

    return false;
  }
}

// Settle a lock after its last xlock was dropped: move straight to
// LOCK_EXCL if the (target) loner is the old xlocker (or there was
// none) and nothing else holds it; otherwise fall back to a gather.
void Locker::_finish_xlock(SimpleLock *lock, client_t xlocker, bool *pneed_issue)
{
  ceph_assert(!lock->is_stable());
  if (lock->get_type() != CEPH_LOCK_DN &&
      lock->get_type() != CEPH_LOCK_ISNAP &&
      lock->get_num_rdlocks() == 0 &&
      lock->get_num_wrlocks() == 0 &&
      !lock->is_leased() &&
      lock->get_state() != LOCK_XLOCKSNAP) {
    CInode *in = static_cast<CInode*>(lock->get_parent());
    client_t loner = in->get_target_loner();
    if (loner >= 0 && (xlocker < 0 || xlocker == loner)) {
      lock->set_state(LOCK_EXCL);
      lock->get_parent()->auth_unpin(lock);
      lock->finish_waiters(SimpleLock::WAIT_STABLE|SimpleLock::WAIT_WR|SimpleLock::WAIT_RD);
      if (lock->get_cap_shift())
        *pneed_issue = true;
      if (lock->get_parent()->is_auth() &&
          lock->is_stable())
        try_eval(lock, pneed_issue);
      return;
    }
  }
  // the xlocker may have CEPH_CAP_GSHARED, need to revoke it if next state is LOCK_LOCK
  eval_gather(lock, lock->get_state() != LOCK_XLOCKSNAP, pneed_issue);
}

// Release an xlock.  For a remotely-held xlock, notify the auth mds
// with OP_UNXLOCK; locally, settle the lock via _finish_xlock once no
// xlocks remain.
void Locker::xlock_finish(const MutationImpl::lock_iterator& it, MutationImpl *mut, bool *pneed_issue)
{
  ceph_assert(it->is_xlock());
  SimpleLock *lock = it->lock;

  if (lock->get_type() == CEPH_LOCK_IVERSION ||
      lock->get_type() == CEPH_LOCK_DVERSION)
    return local_xlock_finish(it, mut);

  dout(10) << "xlock_finish on " << *lock << " " << *lock->get_parent() << dendl;

  client_t xlocker = lock->get_xlock_by_client();

  // drop ref
  lock->put_xlock();
  ceph_assert(mut);
  mut->locks.erase(it);

  bool do_issue = false;

  // remote xlock?
  if (!lock->get_parent()->is_auth()) {
    ceph_assert(lock->get_sm()->can_remote_xlock);

    // tell auth
    dout(7) << "xlock_finish releasing remote xlock on " << *lock->get_parent() << dendl;
    mds_rank_t auth = lock->get_parent()->authority().first;
    if (!mds->is_cluster_degraded() ||
        mds->mdsmap->get_state(auth) >= MDSMap::STATE_REJOIN) {
      auto slavereq = MMDSSlaveRequest::create(mut->reqid, mut->attempt, MMDSSlaveRequest::OP_UNXLOCK);
      slavereq->set_lock_type(lock->get_type());
      lock->get_parent()->set_object_info(slavereq->get_object_info());
      mds->send_message_mds(slavereq, auth);
    }
    // others waiting?
    lock->finish_waiters(SimpleLock::WAIT_STABLE |
                         SimpleLock::WAIT_WR |
                         SimpleLock::WAIT_RD, 0);
  } else {
    if (lock->get_num_xlocks() == 0 &&
        lock->get_state() != LOCK_LOCK_XLOCK) {  // no one is taking xlock
      _finish_xlock(lock, xlocker, &do_issue);
    }
  }

  if (do_issue) {
    CInode *in = static_cast<CInode*>(lock->get_parent());
    if (in->is_head()) {
      if (pneed_issue)
        *pneed_issue = true;
      else
        issue_caps(in);
    }
  }
}

// Drop an xlock as part of exporting its parent object to another mds;
// the lock ends in LOCK_LOCK and any auth_pin from instability is put.
void Locker::xlock_export(const MutationImpl::lock_iterator& it, MutationImpl *mut)
{
  ceph_assert(it->is_xlock());
  SimpleLock *lock = it->lock;
  dout(10) << "xlock_export on " << *lock << " " << *lock->get_parent() << dendl;

  lock->put_xlock();
  mut->locks.erase(it);

  MDSCacheObject *p = lock->get_parent();
  ceph_assert(p->state_test(CInode::STATE_AMBIGUOUSAUTH));  // we are exporting this (inode)

  if (!lock->is_stable())
    lock->get_parent()->auth_unpin(lock);

  lock->set_state(LOCK_LOCK);
}

// Counterpart of xlock_export on the importing mds: take the auth_pin
// that the exported xlock holds on its parent.
void Locker::xlock_import(SimpleLock *lock)
{
  dout(10) << "xlock_import on " << *lock << " " << *lock->get_parent() << dendl;
  lock->get_parent()->auth_pin(lock);
}



// file i/o
// -----------------------------------------

// Return the inode's current file data version.
version_t Locker::issue_file_data_version(CInode *in)
{
  dout(7) << "issue_file_data_version on " << *in << dendl;
  return in->inode.file_data_version;
}

// Log-completion context: runs file_update_finish() after the journal
// entry for a cap-driven inode update commits.
class C_Locker_FileUpdate_finish : public LockerLogContext {
  CInode *in;
  MutationRef mut;
  unsigned flags;   // UPDATE_* bits below
  client_t client;
  MClientCaps::ref ack;
public:
  C_Locker_FileUpdate_finish(Locker *l, CInode *i, MutationRef& m, unsigned f,
                             const MClientCaps::ref &ack, client_t c=-1)
    : LockerLogContext(l), in(i), mut(m), flags(f), client(c), ack(ack) {
    in->get(CInode::PIN_PTRWAITER);
  }
  void finish(int r) override {
    locker->file_update_finish(in, mut, flags, client, ack);
    in->put(CInode::PIN_PTRWAITER);
  }
};

// flags for file_update_finish()
enum {
  UPDATE_SHAREMAX = 1,    // consider broadcasting a new max_size
  UPDATE_NEEDSISSUE = 2,  // reissue caps to 'client' if not already done
  UPDATE_SNAPFLUSH = 4,   // this was a snap writeback; track completion
};

// Complete a journaled inode update: apply the projected inode, ack the
// client's cap flush, drop the mutation's locks, and handle the
// follow-up work selected by 'flags'.
void Locker::file_update_finish(CInode *in, MutationRef& mut, unsigned flags,
                                client_t client, const MClientCaps::ref &ack)
{
  dout(10) << "file_update_finish on " << *in << dendl;
  in->pop_and_dirty_projected_inode(mut->ls);

  mut->apply();

  if (ack) {
    Session *session = mds->get_session(client);
    if (session && !session->is_closed()) {
      // "oldest flush tid" > 0 means client uses unique TID for each flush
      if (ack->get_oldest_flush_tid() > 0)
        session->add_completed_flush(ack->get_client_tid());
      mds->send_message_client_counted(ack, session);
    } else {
      dout(10) << " no session for client." << client << " " << *ack << dendl;
    }
  }

  set<CInode*> need_issue;
  drop_locks(mut.get(), &need_issue);

  if (in->is_head()) {
    if ((flags & UPDATE_NEEDSISSUE) && need_issue.count(in) == 0) {
      Capability *cap = in->get_client_cap(client);
      if (cap && (cap->wanted() & ~cap->pending()))
        issue_caps(in, cap);
    }

    if ((flags & UPDATE_SHAREMAX) && in->is_auth() &&
        (in->filelock.gcaps_allowed(CAP_LONER) & (CEPH_CAP_GWR|CEPH_CAP_GBUFFER)))
      share_inode_max_size(in);

  } else if ((flags & UPDATE_SNAPFLUSH) && !in->client_snap_caps.empty()) {
    dout(10) << " client_snap_caps " << in->client_snap_caps << dendl;
    // check for snap writeback completion
    in->client_snap_caps.erase(client);
    if (in->client_snap_caps.empty()) {
      // last snap flusher: drop the wrlocks held for the snap flush
      for (int i = 0; i < num_cinode_locks; i++) {
        SimpleLock *lock = in->get_lock(cinode_lock_info[i].lock);
        ceph_assert(lock);
        lock->put_wrlock();
      }
      in->item_open_file.remove_myself();
      in->item_caps.remove_myself();
      eval_cap_gather(in, &need_issue);
    }
  }
  issue_caps_set(need_issue);

  mds->balancer->hit_inode(in, META_POP_IWR);

  // auth unpin after issuing caps
  mut->cleanup();
}

// Register (or augment) a capability for a client opening an inode with
// the given mode, and kick lock evaluation so the caps can be issued.
// On replay this only tries to reconnect an existing cap.
Capability* Locker::issue_new_caps(CInode *in,
                                   int mode,
                                   Session *session,
                                   SnapRealm *realm,
                                   bool is_replay)
{
  dout(7) << "issue_new_caps for mode " << mode << " on " << *in << dendl;
  bool is_new;

  // if replay, try to reconnect cap, and otherwise do nothing.
  if (is_replay)
    return mds->mdcache->try_reconnect_cap(in, session);


  // my needs
  ceph_assert(session->info.inst.name.is_client());
  client_t my_client = session->get_client();
  int my_want = ceph_caps_for_mode(mode);

  // register a capability
  Capability *cap = in->get_client_cap(my_client);
  if (!cap) {
    // new cap
    cap = in->add_client_cap(my_client, session, realm);
    cap->set_wanted(my_want);
    cap->mark_new();
    cap->inc_suppress();  // suppress file cap messages for new cap (we'll bundle with the open() reply)
    is_new = true;
  } else {
    is_new = false;
    // make sure it wants sufficient caps
    if (my_want & ~cap->wanted()) {
      // augment wanted caps for this client
      cap->set_wanted(cap->wanted() | my_want);
    }
  }

  if (in->is_auth()) {
    // [auth] twiddle mode?
    eval(in, CEPH_CAP_LOCKS);

    if (_need_flush_mdlog(in, my_want))
      mds->mdlog->flush();

  } else {
    // [replica] tell auth about any new caps wanted
    request_inode_file_caps(in);
  }

  // issue caps (pot. incl new one)
  //issue_caps(in);  // note: _eval above may have done this already...

  // re-issue whatever we can
  //cap->issue(cap->pending());

  if (is_new)
    cap->dec_suppress();

  return cap;
}


// Issue caps on every inode in the set.
void Locker::issue_caps_set(set<CInode*>& inset)
{
  for (set<CInode*>::iterator p = inset.begin(); p != inset.end(); ++p)
    issue_caps(*p);
}

// Deferred context that revokes a stale client's cap on an inode (the
// inode is pinned across the deferral).
class C_Locker_RevokeStaleCap : public LockerContext {
  CInode *in;
  client_t client;
public:
  C_Locker_RevokeStaleCap(Locker *l, CInode *i, client_t c) :
    LockerContext(l), in(i), client(c) {
    in->get(CInode::PIN_PTRWAITER);
  }
  void finish(int r) override {
    locker->revoke_stale_cap(in, client);
    in->put(CInode::PIN_PTRWAITER);
  }
};

// (body continues past this chunk)
int Locker::issue_caps(CInode *in, Capability *only_cap)
{
  // allowed caps are determined by the lock mode.
  // allowed cap masks per role: regular client, loner, and xlocker.
  int all_allowed = in->get_caps_allowed_by_type(CAP_ANY);
  int loner_allowed = in->get_caps_allowed_by_type(CAP_LONER);
  int xlocker_allowed = in->get_caps_allowed_by_type(CAP_XLOCKER);

  client_t loner = in->get_loner();
  if (loner >= 0) {
    dout(7) << "issue_caps loner client." << loner
	    << " allowed=" << ccap_string(loner_allowed)
	    << ", xlocker allowed=" << ccap_string(xlocker_allowed)
	    << ", others allowed=" << ccap_string(all_allowed)
	    << " on " << *in << dendl;
  } else {
    dout(7) << "issue_caps allowed=" << ccap_string(all_allowed)
	    << ", xlocker allowed=" << ccap_string(xlocker_allowed)
	    << " on " << *in << dendl;
  }

  ceph_assert(in->is_head());

  // count conflicts with
  int nissued = 0;

  // client caps: either just the requested client's cap, or all of them.
  map<client_t, Capability>::iterator it;
  if (only_cap)
    it = in->client_caps.find(only_cap->get_client());
  else
    it = in->client_caps.begin();
  for (; it != in->client_caps.end(); ++it) {
    Capability *cap = &it->second;

    // do not issue _new_ bits when size|mtime is projected
    int allowed;
    if (loner == it->first)
      allowed = loner_allowed;
    else
      allowed = all_allowed;

    // add in any xlocker-only caps (for locks this client is the xlocker for)
    allowed |= xlocker_allowed & in->get_xlocker_mask(it->first);

    // clients that cannot handle inline data or pool namespaces must not
    // get FILE_RD/WR caps on inodes that use those features.
    if ((in->inode.inline_data.version != CEPH_INLINE_NONE &&
	 cap->is_noinline()) ||
	(!in->inode.layout.pool_ns.empty() &&
	 cap->is_nopoolns()))
      allowed &= ~(CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR);

    int pending = cap->pending();
    int wanted = cap->wanted();

    dout(20) << " client." << it->first
	     << " pending " << ccap_string(pending)
	     << " allowed " << ccap_string(allowed)
	     << " wanted " << ccap_string(wanted)
	     << dendl;

    if (!(pending & ~allowed)) {
      // skip if suppress or new, and not revocation
      if (cap->is_new() || cap->is_suppress() || cap->is_stale()) {
	dout(20) << "  !revoke and new|suppressed|stale, skipping client."
		 << it->first << dendl;
	continue;
      }
    } else {
      ceph_assert(!cap->is_new());
      if (cap->is_stale()) {
	// stale cap needing revocation: record the revoke locally and defer
	// the actual revoke (possibly client eviction) to a queued context.
	dout(20) << "  revoke stale cap from client." << it->first << dendl;
	ceph_assert(!cap->is_valid());
	cap->issue(allowed & pending, false);
	mds->queue_waiter_front(new C_Locker_RevokeStaleCap(this, in, it->first));
	continue;
      }

      if (!cap->is_valid() && (pending & ~CEPH_CAP_PIN)) {
	// After stale->resume circle, client thinks it only has CEPH_CAP_PIN.
	// mds needs to re-issue caps, then do revocation.
	long seq = cap->issue(pending, true);

	dout(7) << "   sending MClientCaps to client." << it->first
		<< " seq " << seq << " re-issue " << ccap_string(pending) << dendl;

	auto m = MClientCaps::create(CEPH_CAP_OP_GRANT, in->ino(),
				     in->find_snaprealm()->inode->ino(),
				     cap->get_cap_id(), cap->get_last_seq(),
				     pending, wanted, 0, cap->get_mseq(),
				     mds->get_osd_epoch_barrier());
	in->encode_cap_message(m, cap);

	mds->send_message_client_counted(m, cap->get_session());
      }
    }

    // notify clients about deleted inode, to make sure they release caps ASAP.
    if (in->inode.nlink == 0)
      wanted |= CEPH_CAP_LINK_SHARED;

    // are there caps that the client _wants_ and can have, but aren't pending?
    // or do we need to revoke?
    if ((pending & ~allowed) ||			// need to revoke ~allowed caps.
	((wanted & allowed) & ~pending) ||	// missing wanted+allowed caps
	!cap->is_valid()) {			// after stale->resume circle
      // issue
      nissued++;

      // include caps that clients generally like, while we're at it.
      int likes = in->get_caps_liked();
      int before = pending;
      long seq;
      if (pending & ~allowed)
	seq = cap->issue((wanted|likes) & allowed & pending, true);  // if revoking, don't issue anything new.
      else
	seq = cap->issue((wanted|likes) & allowed, true);
      int after = cap->pending();

      dout(7) << "   sending MClientCaps to client." << it->first
	      << " seq " << seq << " new pending " << ccap_string(after)
	      << " was " << ccap_string(before) << dendl;

      // if any bits were removed this is a revoke; track it so we can warn
      // about (and time out) clients that do not respond.
      int op = (before & ~after) ? CEPH_CAP_OP_REVOKE : CEPH_CAP_OP_GRANT;
      if (op == CEPH_CAP_OP_REVOKE) {
	revoking_caps.push_back(&cap->item_revoking_caps);
	revoking_caps_by_client[cap->get_client()].push_back(&cap->item_client_revoking_caps);
	cap->set_last_revoke_stamp(ceph_clock_now());
	cap->reset_num_revoke_warnings();
      }

      auto m = MClientCaps::create(op, in->ino(),
				   in->find_snaprealm()->inode->ino(),
				   cap->get_cap_id(), cap->get_last_seq(),
				   after, wanted, 0, cap->get_mseq(),
				   mds->get_osd_epoch_barrier());
      in->encode_cap_message(m, cap);

      mds->send_message_client_counted(m, cap->get_session());
    }

    if (only_cap)
      break;
  }

  return nissued;
}

/*
 * Notify every cap-holding client that the file was truncated (TRUNC op),
 * then, on the auth MDS, reconsider max_size since writers may need room.
 */
void Locker::issue_truncate(CInode *in)
{
  dout(7) << "issue_truncate on " << *in << dendl;

  for (auto &p : in->client_caps) {
    Capability *cap = &p.second;
    auto m = MClientCaps::create(CEPH_CAP_OP_TRUNC,
                                 in->ino(),
                                 in->find_snaprealm()->inode->ino(),
                                 cap->get_cap_id(), cap->get_last_seq(),
                                 cap->pending(), cap->wanted(), 0,
                                 cap->get_mseq(),
                                 mds->get_osd_epoch_barrier());
    in->encode_cap_message(m, cap);
    mds->send_message_client_counted(m, p.first);
  }

  // should we increase max_size?
  if (in->is_auth() && in->is_file())
    check_inode_max_size(in);
}


/*
 * Finish revoking a stale cap from one client.  If the client still holds
 * writeable caps under revocation it cannot be trusted and is evicted;
 * otherwise the revoke is applied locally and the inode locks re-evaluated.
 */
void Locker::revoke_stale_cap(CInode *in, client_t client)
{
  dout(7) << __func__ << " client."
	  << client << " on " << *in << dendl;
  Capability *cap = in->get_client_cap(client);
  if (!cap)
    return;

  if (cap->revoking() & CEPH_CAP_ANY_WR) {
    // client may have dirty data it can no longer flush safely; evict it.
    std::stringstream ss;
    mds->evict_client(client.v, false, g_conf()->mds_session_blacklist_on_timeout, ss, nullptr);
    return;
  }

  cap->revoke();

  // if the client had a writeable range, file size/mtime may need recovery.
  if (in->is_auth() && in->inode.client_ranges.count(cap->get_client()))
    in->state_set(CInode::STATE_NEEDSRECOVER);

  if (in->state_test(CInode::STATE_EXPORTINGCAPS))
    return;

  // revoked bits may let pending lock-state gathers complete.
  if (!in->filelock.is_stable())
    eval_gather(&in->filelock);
  if (!in->linklock.is_stable())
    eval_gather(&in->linklock);
  if (!in->authlock.is_stable())
    eval_gather(&in->authlock);
  if (!in->xattrlock.is_stable())
    eval_gather(&in->xattrlock);

  if (in->is_auth())
    try_eval(in, CEPH_CAP_LOCKS);
  else
    request_inode_file_caps(in);
}

/*
 * A session went stale: invalidate all of its caps (by bumping the cap
 * generation) and apply any outstanding revocations.
 *
 * @return false if some cap under revocation includes writeable bits --
 *         in that case the caller must not simply proceed (the client may
 *         have unflushable dirty data).
 */
bool Locker::revoke_stale_caps(Session *session)
{
  dout(10) << "revoke_stale_caps for " << session->info.inst.name << dendl;

  // invalidate all caps
  session->inc_cap_gen();

  bool ret = true;
  std::vector<CInode*> to_eval;

  for (auto p = session->caps.begin(); !p.end(); ) {
    Capability *cap = *p;
    ++p;
    if (!cap->is_notable()) {
      // the rest ones are not being revoked and don't have writeable range
      // and don't want exclusive caps or want file read/write. They don't
      // need recover, they don't affect eval_gather()/try_eval()
      break;
    }

    int revoking = cap->revoking();
    if (!revoking)
      continue;

    if (revoking & CEPH_CAP_ANY_WR) {
      ret = false;
      break;
    }

    int issued = cap->issued();
    CInode *in = cap->get_inode();
    dout(10) << " revoking " << ccap_string(issued) << " on " << *in << dendl;
    cap->revoke();

    if (in->is_auth() &&
	in->inode.client_ranges.count(cap->get_client()))
      in->state_set(CInode::STATE_NEEDSRECOVER);

    // eval lock/inode may finish contexts, which may modify other cap's position
    // in the session->caps.
    to_eval.push_back(in);
  }

  // second pass: re-evaluate locks now that the cap list walk is done.
  for (auto in : to_eval) {
    if (in->state_test(CInode::STATE_EXPORTINGCAPS))
      continue;

    if (!in->filelock.is_stable())
      eval_gather(&in->filelock);
    if (!in->linklock.is_stable())
      eval_gather(&in->linklock);
    if (!in->authlock.is_stable())
      eval_gather(&in->authlock);
    if (!in->xattrlock.is_stable())
      eval_gather(&in->xattrlock);

    if (in->is_auth())
      try_eval(in, CEPH_CAP_LOCKS);
    else
      request_inode_file_caps(in);
  }

  return ret;
}

/*
 * A stale session came back: clear the stale state and re-issue caps
 * where the lock evaluation did not already do so.
 */
void Locker::resume_stale_caps(Session *session)
{
  dout(10) << "resume_stale_caps for " << session->info.inst.name << dendl;

  bool lazy = session->info.has_feature(CEPHFS_FEATURE_LAZY_CAP_WANTED);
  for (xlist<Capability*>::iterator p = session->caps.begin(); !p.end(); ) {
    Capability *cap = *p;
    ++p;
    if (lazy && !cap->is_notable())
      break; // see revoke_stale_caps()

    CInode *in = cap->get_inode();
    ceph_assert(in->is_head());
    dout(10) << " clearing stale flag on " << *in << dendl;

    if (in->state_test(CInode::STATE_EXPORTINGCAPS)) {
      // if export succeeds, the cap will be removed. if export fails,
      // we need to re-issue the cap if it's not stale.
      in->state_set(CInode::STATE_EVALSTALECAPS);
      continue;
    }

    // if lock evaluation didn't issue caps (or we're not auth), do it here.
    if (!in->is_auth() || !eval(in, CEPH_CAP_LOCKS))
      issue_caps(in, cap);
  }
}

// Drop every dentry lease held by a (stale) session.
void Locker::remove_stale_leases(Session *session)
{
  dout(10) << "remove_stale_leases for " << session->info.inst.name << dendl;
  xlist<ClientLease*>::iterator p = session->leases.begin();
  while (!p.end()) {
    ClientLease *l = *p;
    ++p;
    CDentry *parent = static_cast<CDentry*>(l->parent);
    dout(15) << " removing lease on " << *parent << dendl;
    parent->remove_client_lease(l, this);
  }
}


// Retry context for request_inode_file_caps(); pins the inode, and only
// retries if we are still a replica when it fires.
class C_MDL_RequestInodeFileCaps : public LockerContext {
  CInode *in;
public:
  C_MDL_RequestInodeFileCaps(Locker *l, CInode *i) : LockerContext(l), in(i) {
    in->get(CInode::PIN_PTRWAITER);
  }
  void finish(int r) override {
    if (!in->is_auth())
      locker->request_inode_file_caps(in);
    in->put(CInode::PIN_PTRWAITER);
  }
};

/*
 * [replica] Tell the auth MDS what cap bits clients want on this inode,
 * if that changed since we last told it.  Defers (via a waiter) while the
 * inode's authority is ambiguous or the auth MDS is still rejoining.
 */
void Locker::request_inode_file_caps(CInode *in)
{
  ceph_assert(!in->is_auth());

  int wanted = in->get_caps_wanted() & in->get_caps_allowed_ever() & ~CEPH_CAP_PIN;
  if (wanted != in->replica_caps_wanted) {
    // wait for single auth
    if (in->is_ambiguous_auth()) {
      in->add_waiter(MDSCacheObject::WAIT_SINGLEAUTH,
                     new C_MDL_RequestInodeFileCaps(this, in));
      return;
    }

    mds_rank_t auth = in->authority().first;
    if (mds->is_cluster_degraded() &&
	mds->mdsmap->get_state(auth) == MDSMap::STATE_REJOIN) {
      mds->wait_for_active_peer(auth, new C_MDL_RequestInodeFileCaps(this, in));
      return;
    }

    dout(7) << "request_inode_file_caps " << ccap_string(wanted)
            << " was " << ccap_string(in->replica_caps_wanted)
            << " on " << *in << " to mds." << auth << dendl;

    in->replica_caps_wanted = wanted;

    if (!mds->is_cluster_degraded() ||
	mds->mdsmap->is_clientreplay_or_active_or_stopping(auth))
      mds->send_message_mds(MInodeFileCaps::create(in->ino(), in->replica_caps_wanted), auth);
  }
}

/*
 * [auth] Handle a replica MDS's report of the caps its clients want
 * (counterpart of request_inode_file_caps above), then re-evaluate locks.
 */
void Locker::handle_inode_file_caps(const MInodeFileCaps::const_ref &m)
{
  // nobody should be talking to us during recovery.
  if (mds->get_state() < MDSMap::STATE_CLIENTREPLAY) {
    if (mds->get_want_state() >= MDSMap::STATE_CLIENTREPLAY) {
      mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
      return;
    }
    ceph_abort_msg("got unexpected message during recovery");
  }

  // ok
  CInode *in = mdcache->get_inode(m->get_ino());
  mds_rank_t from = mds_rank_t(m->get_source().num());

  ceph_assert(in);
  ceph_assert(in->is_auth());

  dout(7) << "handle_inode_file_caps replica mds." << from << " wants caps " << ccap_string(m->get_caps()) << " on " << *in << dendl;

  in->set_mds_caps_wanted(from, m->get_caps());

  try_eval(in, CEPH_CAP_LOCKS);
}


// Retry context for check_inode_max_size(); pins the inode and re-runs the
// check (auth only) once the blocking condition (frozen/unstable lock) clears.
class C_MDL_CheckMaxSize : public LockerContext {
  CInode *in;
  uint64_t new_max_size;
  uint64_t newsize;
  utime_t mtime;

public:
  C_MDL_CheckMaxSize(Locker *l, CInode *i, uint64_t _new_max_size,
                     uint64_t _newsize, utime_t _mtime) :
    LockerContext(l), in(i),
    new_max_size(_new_max_size), newsize(_newsize), mtime(_mtime)
  {
    in->get(CInode::PIN_PTRWAITER);
  }
  void finish(int r) override {
    if (in->is_auth())
      locker->check_inode_max_size(in, false, new_max_size, newsize, mtime);
    in->put(CInode::PIN_PTRWAITER);
  }
};

/*
 * Compute the next max_size for a file of the given size: roughly double it,
 * bounded by mds_client_writeable_range_max_inc_objs objects (if configured),
 * rounded up to the layout's size increment.
 */
uint64_t Locker::calc_new_max_size(CInode::mempool_inode *pi, uint64_t size)
{
  uint64_t new_max = (size + 1) << 1;
  uint64_t max_inc = g_conf()->mds_client_writeable_range_max_inc_objs;
  if (max_inc > 0) {
    max_inc *= pi->layout.object_size;
    new_max = std::min(new_max, size + max_inc);
  }
  return round_up_to(new_max, pi->get_layout_size_increment());
}

/*
 * Recompute the per-client writeable ranges for @in (see body in next hunk):
 * clients with FILE_WR/BUFFER caps keep or grow their range; others lose it.
 */
void Locker::calc_new_client_ranges(CInode *in, uint64_t
 size, bool update,
				    CInode::mempool_inode::client_range_map *new_ranges,
				    bool *max_increased)
{
  auto latest = in->get_projected_inode();
  uint64_t ms;
  if (latest->has_layout()) {
    ms = calc_new_max_size(latest, size);
  } else {
    // Layout-less directories like ~mds0/, have zero size
    ms = 0;
  }

  // increase ranges as appropriate.
  // shrink to 0 if no WR|BUFFER caps issued.
  for (auto &p : in->client_caps) {
    if ((p.second.issued() | p.second.wanted()) & CEPH_CAP_ANY_FILE_WR) {
      client_writeable_range_t& nr = (*new_ranges)[p.first];
      nr.range.first = 0;
      if (latest->client_ranges.count(p.first)) {
	// existing range: never shrink; only grow up to the new max.
	client_writeable_range_t& oldr = latest->client_ranges[p.first];
	if (ms > oldr.range.last)
	  *max_increased = true;
	nr.range.last = std::max(ms, oldr.range.last);
	nr.follows = oldr.follows;
      } else {
	*max_increased = true;
	nr.range.last = ms;
	nr.follows = in->first - 1;
      }
      if (update)
	p.second.mark_clientwriteable();
    } else {
      if (update)
	p.second.clear_clientwriteable();
    }
  }
}

/*
 * [auth] Reconsider (and if needed, journal) this file's size, mtime and
 * per-client writeable ranges (max_size).
 *
 * @param force_wrlock  caller already guarantees the filelock is wrlockable
 * @param new_max_size  client-requested max_size hint (0 = none)
 * @param new_size      new file size reported by a client (0 = none)
 * @param new_mtime     mtime accompanying the new size
 * @return true if an update was journaled, false if nothing changed or the
 *         work was deferred (frozen inode / unstable filelock).
 */
bool Locker::check_inode_max_size(CInode *in, bool force_wrlock,
				  uint64_t new_max_size, uint64_t new_size,
				  utime_t new_mtime)
{
  ceph_assert(in->is_auth());
  ceph_assert(in->is_file());

  CInode::mempool_inode *latest = in->get_projected_inode();
  CInode::mempool_inode::client_range_map new_ranges;
  uint64_t size = latest->size;
  bool update_size = new_size > 0;
  bool update_max = false;
  bool max_increased = false;

  if (update_size) {
    // sizes and mtimes only move forward.
    new_size = size = std::max(size, new_size);
    new_mtime = std::max(new_mtime, latest->mtime);
    if (latest->size == new_size && latest->mtime == new_mtime)
      update_size = false;
  }

  // can_update: 1 = ok, -1 = frozen (wait for unfreeze),
  // -2 = filelock not wrlockable (wait for stable lock).
  int can_update = 1;
  if (in->is_frozen()) {
    can_update = -1;
  } else if (!force_wrlock && !in->filelock.can_wrlock(in->get_loner())) {
    // lock?
    if (in->filelock.is_stable()) {
      if (in->get_target_loner() >= 0)
	file_excl(&in->filelock);
      else
	simple_lock(&in->filelock);
    }
    if (!in->filelock.can_wrlock(in->get_loner()))
      can_update = -2;
  }

  calc_new_client_ranges(in, std::max(new_max_size, size), can_update > 0,
			 &new_ranges, &max_increased);

  if (max_increased || latest->client_ranges != new_ranges)
    update_max = true;

  if (!update_size && !update_max) {
    dout(20) << "check_inode_max_size no-op on " << *in << dendl;
    return false;
  }

  dout(10) << "check_inode_max_size new_ranges " << new_ranges
	   << " update_size " << update_size
	   << " on " << *in << dendl;

  if (can_update < 0) {
    // cannot update now; queue a retry for when the blocker clears.
    auto cms = new C_MDL_CheckMaxSize(this, in, new_max_size, new_size, new_mtime);
    if (can_update == -1) {
      dout(10) << "check_inode_max_size frozen, waiting on " << *in << dendl;
      in->add_waiter(CInode::WAIT_UNFREEZE, cms);
    } else {
      in->filelock.add_waiter(SimpleLock::WAIT_STABLE, cms);
      dout(10) << "check_inode_max_size can't wrlock, waiting on " << *in << dendl;
    }
    return false;
  }

  MutationRef mut(new MutationImpl());
  mut->ls = mds->mdlog->get_current_segment();

  auto &pi = in->project_inode();
  pi.inode.version = in->pre_dirty();

  if (update_max) {
    dout(10) << "check_inode_max_size client_ranges " << pi.inode.client_ranges << " -> " << new_ranges << dendl;
    pi.inode.client_ranges = new_ranges;
  }

  if (update_size) {
    dout(10) << "check_inode_max_size size " << pi.inode.size << " -> " << new_size << dendl;
    pi.inode.size = new_size;
    pi.inode.rstat.rbytes = new_size;
    dout(10) << "check_inode_max_size mtime " << pi.inode.mtime << " -> " << new_mtime << dendl;
    pi.inode.mtime = new_mtime;
    if (new_mtime > pi.inode.ctime) {
      pi.inode.ctime = new_mtime;
      if (new_mtime > pi.inode.rstat.rctime)
	pi.inode.rstat.rctime = new_mtime;
    }
  }

  // use EOpen if the file is still open; otherwise, use EUpdate.
  // this is just an optimization to push open files forward into
  // newer log segments.
  LogEvent *le;
  EMetaBlob *metablob;
  if (in->is_any_caps_wanted() && in->last == CEPH_NOSNAP) {
    EOpen *eo = new EOpen(mds->mdlog);
    eo->add_ino(in->ino());
    metablob = &eo->metablob;
    le = eo;
  } else {
    EUpdate *eu = new EUpdate(mds->mdlog, "check_inode_max_size");
    metablob = &eu->metablob;
    le = eu;
  }
  mds->mdlog->start_entry(le);
  if (update_size) {  // FIXME if/when we do max_size nested accounting
    mdcache->predirty_journal_parents(mut, metablob, in, 0, PREDIRTY_PRIMARY);
    // no cow, here!
    CDentry *parent = in->get_projected_parent_dn();
    metablob->add_primary_dentry(parent, in, true);
  } else {
    metablob->add_dir_context(in->get_projected_parent_dn()->get_dir());
    mdcache->journal_dirty_inode(mut.get(), metablob, in);
  }
  mds->mdlog->submit_entry(le, new C_Locker_FileUpdate_finish(this, in, mut,
      UPDATE_SHAREMAX, MClientCaps::ref()));
  wrlock_force(&in->filelock, mut);  // wrlock for duration of journal
  mut->auth_pin(in);

  // make max_size _increase_ timely
  if (max_increased)
    mds->mdlog->flush();

  return true;
}


/*
 * Broadcast the (possibly grown) max_size to clients currently holding
 * FILE_WR|FILE_BUFFER caps -- others will learn it when they get the cap.
 */
void Locker::share_inode_max_size(CInode *in, Capability *only_cap)
{
  /*
   * only share if currently issued a WR cap.  if client doesn't have it,
   * file_max doesn't matter, and the client will get it if/when they get
   * the cap later.
   */
  dout(10) << "share_inode_max_size on " << *in << dendl;
  map<client_t, Capability>::iterator it;
  if (only_cap)
    it = in->client_caps.find(only_cap->get_client());
  else
    it = in->client_caps.begin();
  for (; it != in->client_caps.end(); ++it) {
    const client_t client = it->first;
    Capability *cap = &it->second;
    if (cap->is_suppress())
      continue;
    if (cap->pending() & (CEPH_CAP_FILE_WR|CEPH_CAP_FILE_BUFFER)) {
      dout(10) << "share_inode_max_size with client."
	       << client << dendl;
      cap->inc_last_seq();
      auto m = MClientCaps::create(CEPH_CAP_OP_GRANT,
                                   in->ino(),
                                   in->find_snaprealm()->inode->ino(),
                                   cap->get_cap_id(),
                                   cap->get_last_seq(),
                                   cap->pending(),
                                   cap->wanted(), 0,
                                   cap->get_mseq(),
                                   mds->get_osd_epoch_barrier());
      in->encode_cap_message(m, cap);
      mds->send_message_client_counted(m, client);
    }
    if (only_cap)
      break;
  }
}

/*
 * Should the MDS log be flushed so @wanted caps can be issued?  True when a
 * corresponding inode lock is unstable and held by uncommitted mutations.
 */
bool Locker::_need_flush_mdlog(CInode *in, int wanted)
{
  /* flush log if caps are wanted by client but corresponding lock is unstable and locked by
   * pending mutations. */
  if (((wanted & (CEPH_CAP_FILE_RD|CEPH_CAP_FILE_WR|CEPH_CAP_FILE_SHARED|CEPH_CAP_FILE_EXCL)) &&
       in->filelock.is_unstable_and_locked()) ||
      ((wanted & (CEPH_CAP_AUTH_SHARED|CEPH_CAP_AUTH_EXCL)) &&
       in->authlock.is_unstable_and_locked()) ||
      ((wanted & (CEPH_CAP_LINK_SHARED|CEPH_CAP_LINK_EXCL)) &&
       in->linklock.is_unstable_and_locked()) ||
      ((wanted & (CEPH_CAP_XATTR_SHARED|CEPH_CAP_XATTR_EXCL)) &&
       in->xattrlock.is_unstable_and_locked()))
    return true;
  return false;
}

/*
 * Update a cap's wanted bits from a client message.  If the client's
 * issue_seq matches our last issue the new mask replaces the old one;
 * on a seq mismatch we only ever ADD bits (never drop, since the message
 * may predate caps we issued since).
 */
void Locker::adjust_cap_wanted(Capability *cap, int wanted, int issue_seq)
{
  if (ceph_seq_cmp(issue_seq, cap->get_last_issue()) == 0) {
    dout(10) << " wanted " << ccap_string(cap->wanted())
	     << " -> " << ccap_string(wanted) << dendl;
    cap->set_wanted(wanted);
  } else if (wanted & ~cap->wanted()) {
    dout(10) << " wanted " << ccap_string(cap->wanted())
	     << " -> " << ccap_string(wanted)
	     << " (added caps even though we had seq mismatch!)" << dendl;
    cap->set_wanted(wanted | cap->wanted());
  } else {
    dout(10) << " NOT changing wanted " << ccap_string(cap->wanted())
	     << " -> " << ccap_string(wanted)
	     << " (issue_seq " << issue_seq << " != last_issue "
	     << cap->get_last_issue() << ")" << dendl;
    return;
  }

  CInode *cur = cap->get_inode();
  if (!cur->is_auth()) {
    // replica: forward the new wanted mask to the auth MDS.
    request_inode_file_caps(cur);
    return;
  }

  if (cap->wanted()) {
    if (cur->state_test(CInode::STATE_RECOVERING) &&
	(cap->wanted() & (CEPH_CAP_FILE_RD |
			  CEPH_CAP_FILE_WR))) {
      // a client actively wants file access; recover this inode first.
      mds->mdcache->recovery_queue.prioritize(cur);
    }

    if (mdcache->open_file_table.should_log_open(cur)) {
      ceph_assert(cur->last == CEPH_NOSNAP);
      EOpen *le = new EOpen(mds->mdlog);
      mds->mdlog->start_entry(le);
      le->add_clean_inode(cur);
      mds->mdlog->submit_entry(le);
    }
  }
}

/*
 * Try to make progress on a stuck snapflush for snapped inode @in by
 * kicking an eligible lock on the head inode.
 */
void Locker::snapflush_nudge(CInode *in)
{
  ceph_assert(in->last != CEPH_NOSNAP);
  if (in->client_snap_caps.empty())
    return;

  CInode *head = mdcache->get_inode(in->ino());
  // head inode gets unpinned when snapflush starts. It might get trimmed
  // before snapflush finishes.
  if (!head)
    return;

  ceph_assert(head->is_auth());
  if (head->client_need_snapflush.empty())
    return;

  // pick a stable, non-SYNC lock to kick; prefer the file lock.
  SimpleLock *hlock = head->get_lock(CEPH_LOCK_IFILE);
  if (hlock->get_state() == LOCK_SYNC || !hlock->is_stable()) {
    hlock = NULL;
    for (int i = 0; i < num_cinode_locks; i++) {
      SimpleLock *lock = head->get_lock(cinode_lock_info[i].lock);
      if (lock->get_state() != LOCK_SYNC && lock->is_stable()) {
	hlock = lock;
	break;
      }
    }
  }
  if (hlock) {
    _rdlock_kick(hlock, true);
  } else {
    // also, requeue, in case of unstable lock
    need_snapflush_inodes.push_back(&in->item_caps);
  }
}

// Queue a snapped inode so its pending snapflush will be nudged later.
void Locker::mark_need_snapflush_inode(CInode *in)
{
  ceph_assert(in->last != CEPH_NOSNAP);
  if (!in->item_caps.is_on_list()) {
    need_snapflush_inodes.push_back(&in->item_caps);
    utime_t now = ceph_clock_now();
    in->last_dirstat_prop = now;
    dout(10) << "mark_need_snapflush_inode " << *in << " - added at " << now << dendl;
  }
}

// True if any cap revocation is outstanding for this client.
bool Locker::is_revoking_any_caps_from(client_t client)
{
  auto it = revoking_caps_by_client.find(client);
  if (it == revoking_caps_by_client.end())
    return false;
  return !it->second.empty();
}

/*
 * Perform empty (no-dirty-data) snapflushes on behalf of @client for every
 * snap before @last that is still waiting on it -- used once we know the
 * client will not send real FLUSHSNAPs for them.
 */
void Locker::_do_null_snapflush(CInode *head_in, client_t client, snapid_t last)
{
  dout(10) << "_do_null_snapflush client."
	   << client << " on " << *head_in << dendl;
  for (auto p = head_in->client_need_snapflush.begin();
       p != head_in->client_need_snapflush.end() && p->first < last; ) {
    snapid_t snapid = p->first;
    auto &clients = p->second;
    ++p;  // be careful, q loop below depends on this

    if (clients.count(client)) {
      dout(10) << " doing async NULL snapflush on " << snapid << " from client." << client << dendl;
      CInode *sin = mdcache->pick_inode_snap(head_in, snapid - 1);
      ceph_assert(sin);
      ceph_assert(sin->first <= snapid);
      // dirty=0: journal an empty snap update so the gather can complete.
      _do_snap_update(sin, snapid, 0, sin->first - 1, client, MClientCaps::ref(), MClientCaps::ref());
      head_in->remove_need_snapflush(sin, snapid, client);
    }
  }
}


/*
 * Should handling of a client cap message/release be deferred because the
 * inode is freezing or frozen?  See the invariant discussion below.
 */
bool Locker::should_defer_client_cap_frozen(CInode *in)
{
  /*
   * This policy needs to be AT LEAST as permissive as allowing a client request
   * to go forward, or else a client request can release something, the release
   * gets deferred, but the request gets processed and deadlocks because when the
   * caps can't get revoked.
   *
   * Currently, a request wait if anything locked is freezing (can't
   * auth_pin), which would avoid any deadlock with cap release. Thus @in
   * _MUST_ be in the lock/auth_pin set.
   *
   * auth_pins==0 implies no unstable lock and not auth pinnned by
   * client request, otherwise continue even it's freezing.
   */
  return (in->is_freezing() && in->get_num_auth_pins() == 0) || in->is_frozen();
}

/*
 * Main entry point for MClientCaps messages from clients: cap updates,
 * flushes, flushsnaps and releases.  Validates session/recovery state,
 * handles duplicate (already-completed) flush tids, then dispatches to the
 * flushsnap or cap-update paths (continued in the following hunks).
 */
void Locker::handle_client_caps(const MClientCaps::const_ref &m)
{
  client_t client = m->get_source().num();
  snapid_t follows = m->get_snap_follows();
  auto op = m->get_op();
  auto dirty = m->get_dirty();
  dout(7) << "handle_client_caps "
	  << " on " << m->get_ino()
	  << " tid " << m->get_client_tid() << " follows " << follows
	  << " op " << ceph_cap_op_name(op)
	  << " flags 0x" << std::hex << m->flags << std::dec << dendl;

  Session *session = mds->get_session(m);
  if (!mds->is_clientreplay() && !mds->is_active() && !mds->is_stopping()) {
    // not ready to process client caps yet; either drop or park the message.
    if (!session) {
      dout(5) << " no session, dropping " << *m << dendl;
      return;
    }
    if (session->is_closed() ||
	session->is_closing() ||
	session->is_killing()) {
      dout(7) << " session closed|closing|killing, dropping " << *m << dendl;
      return;
    }
    if ((mds->is_reconnect() || mds->get_want_state() == MDSMap::STATE_RECONNECT) &&
	dirty && m->get_client_tid() > 0 &&
	!session->have_completed_flush(m->get_client_tid())) {
      // remember dirty caps seen during reconnect so replay can account for them.
      mdcache->set_reconnected_dirty_caps(client, m->get_ino(), dirty,
					  op == CEPH_CAP_OP_FLUSHSNAP);
    }
    mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
    return;
  }

  if (m->get_client_tid() > 0 && session &&
      session->have_completed_flush(m->get_client_tid())) {
    // duplicate of a flush we already journaled; just re-send the ack.
    dout(7) << "handle_client_caps already flushed tid " << m->get_client_tid()
	    << " for client."
<< client << dendl; + MClientCaps::ref ack; + if (op == CEPH_CAP_OP_FLUSHSNAP) { + ack = MClientCaps::create(CEPH_CAP_OP_FLUSHSNAP_ACK, m->get_ino(), 0, 0, 0, 0, 0, dirty, 0, mds->get_osd_epoch_barrier()); + } else { + ack = MClientCaps::create(CEPH_CAP_OP_FLUSH_ACK, m->get_ino(), 0, m->get_cap_id(), m->get_seq(), m->get_caps(), 0, dirty, 0, mds->get_osd_epoch_barrier()); + } + ack->set_snap_follows(follows); + ack->set_client_tid(m->get_client_tid()); + mds->send_message_client_counted(ack, m->get_connection()); + if (op == CEPH_CAP_OP_FLUSHSNAP) { + return; + } else { + // fall-thru because the message may release some caps + dirty = false; + op = CEPH_CAP_OP_UPDATE; + } + } + + // "oldest flush tid" > 0 means client uses unique TID for each flush + if (m->get_oldest_flush_tid() > 0 && session) { + if (session->trim_completed_flushes(m->get_oldest_flush_tid())) { + mds->mdlog->get_current_segment()->touched_sessions.insert(session->info.inst.name); + + if (session->get_num_trim_flushes_warnings() > 0 && + session->get_num_completed_flushes() * 2 < g_conf()->mds_max_completed_flushes) + session->reset_num_trim_flushes_warnings(); + } else { + if (session->get_num_completed_flushes() >= + (g_conf()->mds_max_completed_flushes << session->get_num_trim_flushes_warnings())) { + session->inc_num_trim_flushes_warnings(); + stringstream ss; + ss << "client." 
<< session->get_client() << " does not advance its oldest_flush_tid (" + << m->get_oldest_flush_tid() << "), " + << session->get_num_completed_flushes() + << " completed flushes recorded in session"; + mds->clog->warn() << ss.str(); + dout(20) << __func__ << " " << ss.str() << dendl; + } + } + } + + CInode *head_in = mdcache->get_inode(m->get_ino()); + if (!head_in) { + if (mds->is_clientreplay()) { + dout(7) << "handle_client_caps on unknown ino " << m->get_ino() + << ", will try again after replayed client requests" << dendl; + mdcache->wait_replay_cap_reconnect(m->get_ino(), new C_MDS_RetryMessage(mds, m)); + return; + } + + /* + * "handle_client_caps on unknown ino xxx” is normal after migrating a subtree + * Sequence of events that cause this are: + * - client sends caps message to mds.a + * - mds finishes subtree migration, send cap export to client + * - mds trim its cache + * - mds receives cap messages from client + */ + dout(7) << "handle_client_caps on unknown ino " << m->get_ino() << ", dropping" << dendl; + return; + } + + if (m->osd_epoch_barrier && !mds->objecter->have_map(m->osd_epoch_barrier)) { + // Pause RADOS operations until we see the required epoch + mds->objecter->set_epoch_barrier(m->osd_epoch_barrier); + } + + if (mds->get_osd_epoch_barrier() < m->osd_epoch_barrier) { + // Record the barrier so that we will retransmit it to clients + mds->set_osd_epoch_barrier(m->osd_epoch_barrier); + } + + dout(10) << " head inode " << *head_in << dendl; + + Capability *cap = 0; + cap = head_in->get_client_cap(client); + if (!cap) { + dout(7) << "handle_client_caps no cap for client." << client << " on " << *head_in << dendl; + return; + } + ceph_assert(cap); + + // freezing|frozen? 
+ if (should_defer_client_cap_frozen(head_in)) { + dout(7) << "handle_client_caps freezing|frozen on " << *head_in << dendl; + head_in->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryMessage(mds, m)); + return; + } + if (ceph_seq_cmp(m->get_mseq(), cap->get_mseq()) < 0) { + dout(7) << "handle_client_caps mseq " << m->get_mseq() << " < " << cap->get_mseq() + << ", dropping" << dendl; + return; + } + + bool need_unpin = false; + + // flushsnap? + if (op == CEPH_CAP_OP_FLUSHSNAP) { + if (!head_in->is_auth()) { + dout(7) << " not auth, ignoring flushsnap on " << *head_in << dendl; + goto out; + } + + SnapRealm *realm = head_in->find_snaprealm(); + snapid_t snap = realm->get_snap_following(follows); + dout(10) << " flushsnap follows " << follows << " -> snap " << snap << dendl; + + auto p = head_in->client_need_snapflush.begin(); + if (p != head_in->client_need_snapflush.end() && p->first < snap) { + head_in->auth_pin(this); // prevent subtree frozen + need_unpin = true; + _do_null_snapflush(head_in, client, snap); + } + + CInode *in = head_in; + if (snap != CEPH_NOSNAP) { + in = mdcache->pick_inode_snap(head_in, snap - 1); + if (in != head_in) + dout(10) << " snapped inode " << *in << dendl; + } + + // we can prepare the ack now, since this FLUSHEDSNAP is independent of any + // other cap ops. (except possibly duplicate FLUSHSNAP requests, but worst + // case we get a dup response, so whatever.) + MClientCaps::ref ack; + if (dirty) { + ack = MClientCaps::create(CEPH_CAP_OP_FLUSHSNAP_ACK, in->ino(), 0, 0, 0, 0, 0, dirty, 0, mds->get_osd_epoch_barrier()); + ack->set_snap_follows(follows); + ack->set_client_tid(m->get_client_tid()); + ack->set_oldest_flush_tid(m->get_oldest_flush_tid()); + } + + if (in == head_in || + (head_in->client_need_snapflush.count(snap) && + head_in->client_need_snapflush[snap].count(client))) { + dout(7) << " flushsnap snap " << snap + << " client." << client << " on " << *in << dendl; + + // this cap now follows a later snap (i.e. 
the one initiating this flush, or later) + if (in == head_in) + cap->client_follows = snap < CEPH_NOSNAP ? snap : realm->get_newest_seq(); + + _do_snap_update(in, snap, dirty, follows, client, m, ack); + + if (in != head_in) + head_in->remove_need_snapflush(in, snap, client); + } else { + dout(7) << " not expecting flushsnap " << snap << " from client." << client << " on " << *in << dendl; + if (ack) + mds->send_message_client_counted(ack, m->get_connection()); + } + goto out; + } + + if (cap->get_cap_id() != m->get_cap_id()) { + dout(7) << " ignoring client capid " << m->get_cap_id() << " != my " << cap->get_cap_id() << dendl; + } else { + CInode *in = head_in; + if (follows > 0) { + in = mdcache->pick_inode_snap(head_in, follows); + // intermediate snap inodes + while (in != head_in) { + ceph_assert(in->last != CEPH_NOSNAP); + if (in->is_auth() && dirty) { + dout(10) << " updating intermediate snapped inode " << *in << dendl; + _do_cap_update(in, NULL, dirty, follows, m, MClientCaps::ref()); + } + in = mdcache->pick_inode_snap(head_in, in->last); + } + } + + // head inode, and cap + MClientCaps::ref ack; + + int caps = m->get_caps(); + if (caps & ~cap->issued()) { + dout(10) << " confirming not issued caps " << ccap_string(caps & ~cap->issued()) << dendl; + caps &= cap->issued(); + } + + cap->confirm_receipt(m->get_seq(), caps); + dout(10) << " follows " << follows + << " retains " << ccap_string(m->get_caps()) + << " dirty " << ccap_string(dirty) + << " on " << *in << dendl; + + + // missing/skipped snapflush? + // The client MAY send a snapflush if it is issued WR/EXCL caps, but + // presently only does so when it has actual dirty metadata. But, we + // set up the need_snapflush stuff based on the issued caps. + // We can infer that the client WONT send a FLUSHSNAP once they have + // released all WR/EXCL caps (the FLUSHSNAP always comes before the cap + // update/release). 
+ if (!head_in->client_need_snapflush.empty()) { + if (!(cap->issued() & CEPH_CAP_ANY_FILE_WR) && + !(m->flags & MClientCaps::FLAG_PENDING_CAPSNAP)) { + head_in->auth_pin(this); // prevent subtree frozen + need_unpin = true; + _do_null_snapflush(head_in, client); + } else { + dout(10) << " revocation in progress, not making any conclusions about null snapflushes" << dendl; + } + } + if (cap->need_snapflush() && !(m->flags & MClientCaps::FLAG_PENDING_CAPSNAP)) + cap->clear_needsnapflush(); + + if (dirty && in->is_auth()) { + dout(7) << " flush client." << client << " dirty " << ccap_string(dirty) + << " seq " << m->get_seq() << " on " << *in << dendl; + ack = MClientCaps::create(CEPH_CAP_OP_FLUSH_ACK, in->ino(), 0, cap->get_cap_id(), m->get_seq(), + m->get_caps(), 0, dirty, 0, mds->get_osd_epoch_barrier()); + ack->set_client_tid(m->get_client_tid()); + ack->set_oldest_flush_tid(m->get_oldest_flush_tid()); + } + + // filter wanted based on what we could ever give out (given auth/replica status) + bool need_flush = m->flags & MClientCaps::FLAG_SYNC; + int new_wanted = m->get_wanted(); + if (new_wanted != cap->wanted()) { + if (!need_flush && in->is_auth() && (new_wanted & ~cap->pending())) { + // exapnding caps. make sure we aren't waiting for a log flush + need_flush = _need_flush_mdlog(head_in, new_wanted & ~cap->pending()); + } + + adjust_cap_wanted(cap, new_wanted, m->get_issue_seq()); + } + + if (in->is_auth() && + _do_cap_update(in, cap, dirty, follows, m, ack, &need_flush)) { + // updated + eval(in, CEPH_CAP_LOCKS); + + if (!need_flush && (cap->wanted() & ~cap->pending())) + need_flush = _need_flush_mdlog(in, cap->wanted() & ~cap->pending()); + } else { + // no update, ack now. 
      // --- tail of Locker::handle_client_caps() (function begins earlier in the file) ---
      // No journaled update was needed: ack the flush right away (if one was built).
      if (ack)
        mds->send_message_client_counted(ack, m->get_connection());

      // Re-evaluate lock state; if that did not already issue caps and the
      // client still wants more than is pending, issue explicitly.
      bool did_issue = eval(in, CEPH_CAP_LOCKS);
      if (!did_issue && (cap->wanted() & ~cap->pending()))
        issue_caps(in, cap);

      if (cap->get_last_seq() == 0 &&
          (cap->pending() & (CEPH_CAP_FILE_WR|CEPH_CAP_FILE_BUFFER))) {
        share_inode_max_size(in, cap);
      }
    }

    if (need_flush)
      mds->mdlog->flush();
  }

 out:
  if (need_unpin)
    head_in->auth_unpin(this);
}


/**
 * Context: re-run process_request_cap_release() (with a null mdr) for a
 * deferred release once the inode is no longer frozen.
 */
class C_Locker_RetryRequestCapRelease : public LockerContext {
  client_t client;
  ceph_mds_request_release item;
public:
  C_Locker_RetryRequestCapRelease(Locker *l, client_t c, const ceph_mds_request_release& it) :
    LockerContext(l), client(c), item(it) { }
  void finish(int r) override {
    string dname;
    MDRequestRef null_ref;
    locker->process_request_cap_release(null_ref, client, item, dname);
  }
};

/**
 * Apply a cap/lease release embedded in a client request: drop the dentry
 * lease named by @dname (if any), confirm receipt of the released caps,
 * run any pending null snapflushes once the client holds no file write
 * caps, adjust wanted, and re-evaluate the cap-related locks.
 *
 * @param mdr    originating request; null when retried after unfreeze
 * @param client releasing client
 * @param item   the release record from the request
 * @param dname  dentry name whose lease is being dropped ("" for none)
 */
void Locker::process_request_cap_release(MDRequestRef& mdr, client_t client, const ceph_mds_request_release& item,
                                         std::string_view dname)
{
  inodeno_t ino = (uint64_t)item.ino;
  uint64_t cap_id = item.cap_id;
  int caps = item.caps;
  int wanted = item.wanted;
  int seq = item.seq;
  int issue_seq = item.issue_seq;
  int mseq = item.mseq;

  CInode *in = mdcache->get_inode(ino);
  if (!in)
    return;

  // optional dentry lease release
  if (dname.length()) {
    frag_t fg = in->pick_dirfrag(dname);
    CDir *dir = in->get_dirfrag(fg);
    if (dir) {
      CDentry *dn = dir->lookup(dname);
      if (dn) {
        ClientLease *l = dn->get_client_lease(client);
        if (l) {
          dout(10) << __func__ << " removing lease on " << *dn << dendl;
          dn->remove_client_lease(l, this);
        } else {
          dout(7) << __func__ << " client." << client
                  << " doesn't have lease on " << *dn << dendl;
        }
      } else {
        dout(7) << __func__ << " client." << client << " released lease on dn "
                << dir->dirfrag() << "/" << dname << " which dne" << dendl;
      }
    }
  }

  Capability *cap = in->get_client_cap(client);
  if (!cap)
    return;

  dout(10) << __func__ << " client."
           << client << " " << ccap_string(caps) << " on " << *in
           << (mdr ? "" : " (DEFERRED, no mdr)")
           << dendl;

  // stale migration sequence or cap id: this release is for an old cap
  if (ceph_seq_cmp(mseq, cap->get_mseq()) < 0) {
    dout(7) << " mseq " << mseq << " < " << cap->get_mseq() << ", dropping" << dendl;
    return;
  }

  if (cap->get_cap_id() != cap_id) {
    dout(7) << " cap_id " << cap_id << " != " << cap->get_cap_id() << ", dropping" << dendl;
    return;
  }

  // can't process while frozen; retry via context after unfreeze
  if (should_defer_client_cap_frozen(in)) {
    dout(7) << " frozen, deferring" << dendl;
    in->add_waiter(CInode::WAIT_UNFREEZE, new C_Locker_RetryRequestCapRelease(this, client, item));
    return;
  }

  // never record retained caps the MDS didn't actually issue
  if (caps & ~cap->issued()) {
    dout(10) << " confirming not issued caps " << ccap_string(caps & ~cap->issued()) << dendl;
    caps &= cap->issued();
  }
  cap->confirm_receipt(seq, caps);

  // client has released all file write caps: any still-pending snapflush
  // entries will never arrive, so record them as null snapflushes
  if (!in->client_need_snapflush.empty() &&
      (cap->issued() & CEPH_CAP_ANY_FILE_WR) == 0) {
    _do_null_snapflush(in, client);
  }

  adjust_cap_wanted(cap, wanted, issue_seq);

  // suppress re-issue while evaluating inside a request context
  if (mdr)
    cap->inc_suppress();
  eval(in, CEPH_CAP_LOCKS);
  if (mdr)
    cap->dec_suppress();

  // take note; we may need to reissue on this cap later
  if (mdr)
    mdr->cap_releases[in->vino()] = cap->get_last_seq();
}

/**
 * Context: retry kick_issue_caps() once the inode is unfrozen.
 * Pins the inode so the pointer stays valid across the wait.
 */
class C_Locker_RetryKickIssueCaps : public LockerContext {
  CInode *in;
  client_t client;
  ceph_seq_t seq;
public:
  C_Locker_RetryKickIssueCaps(Locker *l, CInode *i, client_t c, ceph_seq_t s) :
    LockerContext(l), in(i), client(c), seq(s) {
    in->get(CInode::PIN_PTRWAITER);
  }
  void finish(int r) override {
    locker->kick_issue_caps(in, client, seq);
    in->put(CInode::PIN_PTRWAITER);
  }
};

/**
 * Re-issue caps to @client if its cap is still at sequence @seq (i.e. no
 * newer issue happened since the release was recorded).  Defers on frozen
 * inodes.
 */
void Locker::kick_issue_caps(CInode *in, client_t client, ceph_seq_t seq)
{
  Capability *cap = in->get_client_cap(client);
  if (!cap || cap->get_last_seq() != seq)
    return;
  if (in->is_frozen()) {
    dout(10) << "kick_issue_caps waiting for unfreeze on " << *in << dendl;
    in->add_waiter(CInode::WAIT_UNFREEZE,
                   new C_Locker_RetryKickIssueCaps(this, in, client, seq));
    return;
  }
  dout(10) << "kick_issue_caps released at current seq " << seq
           << ", reissuing" << dendl;
  issue_caps(in, cap);
}

/**
 * For every inode whose cap release was recorded on this request
 * (mdr->cap_releases), re-issue caps if the cap is still at the recorded
 * sequence.
 */
void Locker::kick_cap_releases(MDRequestRef& mdr)
{
  client_t client = mdr->get_client();
  for (map<vinodeno_t,ceph_seq_t>::iterator p = mdr->cap_releases.begin();
       p != mdr->cap_releases.end();
       ++p) {
    CInode *in = mdcache->get_inode(p->first);
    if (!in)
      continue;
    kick_issue_caps(in, client, p->second);
  }
}

/**
 * Journal a client FLUSHSNAP: apply the dirty cap metadata to the
 * appropriate (old or projected) inode for @snap and submit an EUpdate.
 *
 * m and ack might be NULL, so don't dereference them unless dirty != 0
 */
void Locker::_do_snap_update(CInode *in, snapid_t snap, int dirty, snapid_t follows, client_t client, const MClientCaps::const_ref &m, const MClientCaps::ref &ack)
{
  dout(10) << "_do_snap_update dirty " << ccap_string(dirty)
           << " follows " << follows << " snap " << snap
           << " on " << *in << dendl;

  if (snap == CEPH_NOSNAP) {
    // hmm, i guess snap was already deleted?  just ack!
    dout(10) << " wow, the snap following " << follows
             << " was already deleted. nothing to record, just ack." << dendl;
    if (ack)
      mds->send_message_client_counted(ack, m->get_connection());
    return;
  }

  EUpdate *le = new EUpdate(mds->mdlog, "snap flush");
  mds->mdlog->start_entry(le);
  MutationRef mut = new MutationImpl();
  mut->ls = mds->mdlog->get_current_segment();

  // normal metadata updates that we can apply to the head as well.

  // update xattrs?
  CInode::mempool_xattr_map *px = nullptr;
  bool xattrs = (dirty & CEPH_CAP_XATTR_EXCL) &&
                m->xattrbl.length() &&
                m->head.xattr_version > in->get_projected_inode()->xattr_version;

  // multiversion inodes keep per-snap old_inode copies; write there if
  // this snap maps to one
  CInode::mempool_old_inode *oi = 0;
  if (in->is_multiversion()) {
    oi = in->pick_old_inode(snap);
  }

  CInode::mempool_inode *i;
  if (oi) {
    dout(10) << " writing into old inode" << dendl;
    auto &pi = in->project_inode();
    pi.inode.version = in->pre_dirty();
    if (snap > oi->first)
      in->split_old_inode(snap);
    i = &oi->inode;
    if (xattrs)
      px = &oi->xattrs;
  } else {
    auto &pi = in->project_inode(xattrs);
    pi.inode.version = in->pre_dirty();
    i = &pi.inode;
    if (xattrs)
      px = pi.xattrs.get();
  }

  _update_cap_fields(in, dirty, m, i);

  // xattr
  if (xattrs) {
    dout(7) << " xattrs v" << i->xattr_version << " -> " << m->head.xattr_version
            << " len " << m->xattrbl.length() << dendl;
    i->xattr_version = m->head.xattr_version;
    auto p = m->xattrbl.cbegin();
    decode(*px, p);
  }

  // trim or terminate this client's byte range at the flushed snap
  {
    auto it = i->client_ranges.find(client);
    if (it != i->client_ranges.end()) {
      if (in->last == snap) {
        dout(10) << " removing client_range entirely" << dendl;
        i->client_ranges.erase(it);
      } else {
        dout(10) << " client_range now follows " << snap << dendl;
        it->second.follows = snap;
      }
    }
  }

  mut->auth_pin(in);
  mdcache->predirty_journal_parents(mut, &le->metablob, in, 0, PREDIRTY_PRIMARY, 0, follows);
  mdcache->journal_dirty_inode(mut.get(), &le->metablob, in, follows);

  // "oldest flush tid" > 0 means client uses unique TID for each flush
  if (ack && ack->get_oldest_flush_tid() > 0)
    le->metablob.add_client_flush(metareqid_t(m->get_source(), ack->get_client_tid()),
                                  ack->get_oldest_flush_tid());

  mds->mdlog->submit_entry(le, new C_Locker_FileUpdate_finish(this, in, mut, UPDATE_SNAPFLUSH,
                                                              ack, client));
}

/**
 * Copy the client-supplied metadata covered by the @dirty cap bits out of
 * the cap message @m into inode @pi (ctime/change_attr always; file
 * fields under FILE_EXCL/FILE_WR; ownership/mode/btime under AUTH_EXCL).
 * No-op when dirty == 0.
 */
void Locker::_update_cap_fields(CInode *in, int dirty, const MClientCaps::const_ref &m, CInode::mempool_inode *pi)
{
  if (dirty == 0)
    return;

  /* m must be valid if there are dirty caps */
  ceph_assert(m);
  uint64_t features = m->get_connection()->get_features();

  // ctime only moves forward
  if (m->get_ctime() > pi->ctime) {
    dout(7) << " ctime " << pi->ctime << " -> " << m->get_ctime()
            << " for " << *in << dendl;
    pi->ctime = m->get_ctime();
    if (m->get_ctime() > pi->rstat.rctime)
      pi->rstat.rctime = m->get_ctime();
  }

  if ((features & CEPH_FEATURE_FS_CHANGE_ATTR) &&
      m->get_change_attr() > pi->change_attr) {
    dout(7) << " change_attr " << pi->change_attr << " -> " << m->get_change_attr()
            << " for " << *in << dendl;
    pi->change_attr = m->get_change_attr();
  }

  // file
  if (dirty & (CEPH_CAP_FILE_EXCL|CEPH_CAP_FILE_WR)) {
    utime_t atime = m->get_atime();
    utime_t mtime = m->get_mtime();
    uint64_t size = m->get_size();
    version_t inline_version = m->inline_version;

    // WR may only advance mtime; EXCL may set it to anything
    if (((dirty & CEPH_CAP_FILE_WR) && mtime > pi->mtime) ||
        ((dirty & CEPH_CAP_FILE_EXCL) && mtime != pi->mtime)) {
      dout(7) << " mtime " << pi->mtime << " -> " << mtime
              << " for " << *in << dendl;
      pi->mtime = mtime;
      if (mtime > pi->rstat.rctime)
        pi->rstat.rctime = mtime;
    }
    if (in->inode.is_file() && // ONLY if regular file
        size > pi->size) {
      dout(7) << " size " << pi->size << " -> " << size
              << " for " << *in << dendl;
      pi->size = size;
      pi->rstat.rbytes = size;
    }
    if (in->inode.is_file() &&
        (dirty & CEPH_CAP_FILE_WR) &&
        inline_version > pi->inline_data.version) {
      pi->inline_data.version = inline_version;
      if (inline_version != CEPH_INLINE_NONE && m->inline_data.length() > 0)
        pi->inline_data.get_data() = m->inline_data;
      else
        pi->inline_data.free_data();
    }
    if ((dirty & CEPH_CAP_FILE_EXCL) && atime != pi->atime) {
      dout(7) << " atime " << pi->atime << " -> " << atime
              << " for " << *in << dendl;
      pi->atime = atime;
    }
    if ((dirty & CEPH_CAP_FILE_EXCL) &&
        ceph_seq_cmp(pi->time_warp_seq, m->get_time_warp_seq()) < 0) {
      dout(7) << " time_warp_seq " << pi->time_warp_seq << " -> "
              << m->get_time_warp_seq()
              << " for " << *in << dendl;
      pi->time_warp_seq = m->get_time_warp_seq();
    }
  }
  // auth
  if (dirty & CEPH_CAP_AUTH_EXCL) {
    if (m->head.uid != pi->uid) {
      dout(7) << " uid " << pi->uid
              << " -> " << m->head.uid
              << " for " << *in << dendl;
      pi->uid = m->head.uid;
    }
    if (m->head.gid != pi->gid) {
      dout(7) << " gid " << pi->gid
              << " -> " << m->head.gid
              << " for " << *in << dendl;
      pi->gid = m->head.gid;
    }
    if (m->head.mode != pi->mode) {
      dout(7) << " mode " << oct << pi->mode
              << " -> " << m->head.mode << dec
              << " for " << *in << dendl;
      pi->mode = m->head.mode;
    }
    if ((features & CEPH_FEATURE_FS_BTIME) && m->get_btime() != pi->btime) {
      dout(7) << " btime " << oct << pi->btime
              << " -> " << m->get_btime() << dec
              << " for " << *in << dendl;
      pi->btime = m->get_btime();
    }
  }
}

/*
 * update inode based on cap flush|flushsnap|wanted.
 * adjust max_size, if needed.
 * if we update, return true; otherwise, false (no updated needed).
 */
bool Locker::_do_cap_update(CInode *in, Capability *cap,
                            int dirty, snapid_t follows,
                            const MClientCaps::const_ref &m, const MClientCaps::ref &ack,
                            bool *need_flush)
{
  dout(10) << "_do_cap_update dirty " << ccap_string(dirty)
           << " issued " << ccap_string(cap ? cap->issued() : 0)
           << " wanted " << ccap_string(cap ? cap->wanted() : 0)
           << " on " << *in << dendl;
  ceph_assert(in->is_auth());
  client_t client = m->get_source().num();
  CInode::mempool_inode *latest = in->get_projected_inode();

  // increase or zero max_size?
  uint64_t size = m->get_size();
  bool change_max = false;
  uint64_t old_max = latest->client_ranges.count(client) ?
    latest->client_ranges[client].range.last : 0;
  uint64_t new_max = old_max;

  if (in->is_file()) {
    bool forced_change_max = false;
    dout(20) << "inode is file" << dendl;
    if (cap && ((cap->issued() | cap->wanted()) & CEPH_CAP_ANY_FILE_WR)) {
      // client holds/wants write caps: grow max_size toward an explicit
      // client request, or ahead of the current size; never shrink here
      dout(20) << "client has write caps; m->get_max_size="
               << m->get_max_size() << "; old_max=" << old_max << dendl;
      if (m->get_max_size() > new_max) {
        dout(10) << "client requests file_max " << m->get_max_size()
                 << " > max " << old_max << dendl;
        change_max = true;
        forced_change_max = true;
        new_max = calc_new_max_size(latest, m->get_max_size());
      } else {
        new_max = calc_new_max_size(latest, size);

        if (new_max > old_max)
          change_max = true;
        else
          new_max = old_max;
      }
    } else {
      // no write caps issued or wanted: drop the client byte range
      if (old_max) {
        change_max = true;
        new_max = 0;
      }
    }

    // changing max_size requires a wrlock on filelock; if we can't get
    // one, try to move the lock toward a wrlockable state and re-check
    if (in->last == CEPH_NOSNAP &&
        change_max &&
        !in->filelock.can_wrlock(client) &&
        !in->filelock.can_force_wrlock(client)) {
      dout(10) << " i want to change file_max, but lock won't allow it (yet)" << dendl;
      if (in->filelock.is_stable()) {
        bool need_issue = false;
        if (cap)
          cap->inc_suppress();
        if (in->get_mds_caps_wanted().empty() &&
            (in->get_loner() >= 0 || (in->get_wanted_loner() >= 0 && in->try_set_loner()))) {
          if (in->filelock.get_state() != LOCK_EXCL)
            file_excl(&in->filelock, &need_issue);
        } else
          simple_lock(&in->filelock, &need_issue);
        if (need_issue)
          issue_caps(in);
        if (cap)
          cap->dec_suppress();
      }
      if (!in->filelock.can_wrlock(client) &&
          !in->filelock.can_force_wrlock(client)) {
        // still not wrlockable: queue a CheckMaxSize for when the lock
        // stabilizes, and skip the max_size change in this update
        C_MDL_CheckMaxSize *cms = new C_MDL_CheckMaxSize(this, in,
                                                         forced_change_max ?
                                                         new_max : 0,
                                                         0, utime_t());

        in->filelock.add_waiter(SimpleLock::WAIT_STABLE, cms);
        change_max = false;
      }
    }
  }

  // import client-held fcntl/flock file-lock state, if supplied
  if (m->flockbl.length()) {
    int32_t num_locks;
    auto bli = m->flockbl.cbegin();
    decode(num_locks, bli);
    for ( int i=0; i < num_locks; ++i) {
      ceph_filelock decoded_lock;
      decode(decoded_lock, bli);
      in->get_fcntl_lock_state()->held_locks.
        insert(pair<uint64_t, ceph_filelock>(decoded_lock.start, decoded_lock));
      ++in->get_fcntl_lock_state()->client_held_lock_counts[(client_t)(decoded_lock.client)];
    }
    decode(num_locks, bli);
    for ( int i=0; i < num_locks; ++i) {
      ceph_filelock decoded_lock;
      decode(decoded_lock, bli);
      in->get_flock_lock_state()->held_locks.
        insert(pair<uint64_t, ceph_filelock>(decoded_lock.start, decoded_lock));
      ++in->get_flock_lock_state()->client_held_lock_counts[(client_t)(decoded_lock.client)];
    }
  }

  if (!dirty && !change_max)
    return false;

  Session *session = mds->get_session(m);
  if (session->check_access(in, MAY_WRITE,
                            m->caller_uid, m->caller_gid, NULL, 0, 0) < 0) {
    dout(10) << "check_access failed, dropping cap update on " << *in << dendl;
    return false;
  }

  // do the update.
  EUpdate *le = new EUpdate(mds->mdlog, "cap update");
  mds->mdlog->start_entry(le);

  bool xattr = (dirty & CEPH_CAP_XATTR_EXCL) &&
               m->xattrbl.length() &&
               m->head.xattr_version > in->get_projected_inode()->xattr_version;

  auto &pi = in->project_inode(xattr);
  pi.inode.version = in->pre_dirty();

  MutationRef mut(new MutationImpl());
  mut->ls = mds->mdlog->get_current_segment();

  _update_cap_fields(in, dirty, m, &pi.inode);

  // apply the max_size decision made above
  if (change_max) {
    dout(7) << " max_size " << old_max << " -> " << new_max
            << " for " << *in << dendl;
    if (new_max) {
      auto &cr = pi.inode.client_ranges[client];
      cr.range.first = 0;
      cr.range.last = new_max;
      cr.follows = in->first - 1;
      if (cap)
        cap->mark_clientwriteable();
    } else {
      pi.inode.client_ranges.erase(client);
      if (cap)
        cap->clear_clientwriteable();
    }
  }

  if (change_max || (dirty & (CEPH_CAP_FILE_EXCL|CEPH_CAP_FILE_WR)))
    wrlock_force(&in->filelock, mut); // wrlock for duration of journal

  // auth
  if (dirty & CEPH_CAP_AUTH_EXCL)
    wrlock_force(&in->authlock, mut);

  // xattrs update?
  if (xattr) {
    dout(7) << " xattrs v" << pi.inode.xattr_version << " -> " << m->head.xattr_version << dendl;
    pi.inode.xattr_version = m->head.xattr_version;
    auto p = m->xattrbl.cbegin();
    decode_noshare(*pi.xattrs, p);
    wrlock_force(&in->xattrlock, mut);
  }

  mut->auth_pin(in);
  mdcache->predirty_journal_parents(mut, &le->metablob, in, 0, PREDIRTY_PRIMARY, 0, follows);
  mdcache->journal_dirty_inode(mut.get(), &le->metablob, in, follows);

  // "oldest flush tid" > 0 means client uses unique TID for each flush
  if (ack && ack->get_oldest_flush_tid() > 0)
    le->metablob.add_client_flush(metareqid_t(m->get_source(), ack->get_client_tid()),
                                  ack->get_oldest_flush_tid());

  unsigned update_flags = 0;
  if (change_max)
    update_flags |= UPDATE_SHAREMAX;
  if (cap)
    update_flags |= UPDATE_NEEDSISSUE;
  mds->mdlog->submit_entry(le, new C_Locker_FileUpdate_finish(this, in, mut, update_flags,
                                                              ack, client));
  if (need_flush && !*need_flush &&
      ((change_max && new_max) || // max INCREASE
       _need_flush_mdlog(in, dirty)))
    *need_flush = true;

  return true;
}

/**
 * Handle an MClientCapRelease message: propagate the OSD epoch barrier,
 * then run _do_cap_release() for each released cap and account the
 * releases against the session.
 */
void Locker::handle_client_cap_release(const MClientCapRelease::const_ref &m)
{
  client_t client = m->get_source().num();
  dout(10) << "handle_client_cap_release " << *m << dendl;

  // only process in clientreplay/active/stopping; otherwise wait for replay
  if (!mds->is_clientreplay() && !mds->is_active() && !mds->is_stopping()) {
    mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
    return;
  }

  if (m->osd_epoch_barrier && !mds->objecter->have_map(m->osd_epoch_barrier)) {
    // Pause RADOS operations until we see the required epoch
    mds->objecter->set_epoch_barrier(m->osd_epoch_barrier);
  }

  if (mds->get_osd_epoch_barrier() < m->osd_epoch_barrier) {
    // Record the barrier so that we will retransmit it to clients
    mds->set_osd_epoch_barrier(m->osd_epoch_barrier);
  }

  Session *session = mds->get_session(m);

  for (const auto &cap : m->caps) {
    _do_cap_release(client, inodeno_t((uint64_t)cap.ino) , cap.cap_id, cap.migrate_seq, cap.seq);
  }

  if (session) {
    session->notify_cap_release(m->caps.size());
  }
}

/**
 * Context: retry _do_cap_release() once the inode is unfrozen.
 */
class C_Locker_RetryCapRelease : public LockerContext {
  client_t client;
  inodeno_t ino;
  uint64_t cap_id;
  ceph_seq_t migrate_seq;
  ceph_seq_t issue_seq;
public:
  C_Locker_RetryCapRelease(Locker *l, client_t c, inodeno_t i, uint64_t id,
                           ceph_seq_t mseq, ceph_seq_t seq) :
    LockerContext(l), client(c), ino(i), cap_id(id), migrate_seq(mseq), issue_seq(seq) {}
  void finish(int r) override {
    locker->_do_cap_release(client, ino, cap_id, migrate_seq, issue_seq);
  }
};

/**
 * Remove a client's cap in response to a full release, after validating
 * the cap id, migration seq, and issue seq (a mismatched issue seq means
 * the release raced a newer issue; only revoke history is cleaned then).
 */
void Locker::_do_cap_release(client_t client, inodeno_t ino, uint64_t cap_id,
                             ceph_seq_t mseq, ceph_seq_t seq)
{
  CInode *in = mdcache->get_inode(ino);
  if (!in) {
    dout(7) << "_do_cap_release missing ino " << ino << dendl;
    return;
  }
  Capability *cap = in->get_client_cap(client);
  if (!cap) {
    dout(7) << "_do_cap_release no cap for client" << client << " on "<< *in << dendl;
    return;
  }

  dout(7) << "_do_cap_release for client."
          << client << " on "<< *in << dendl;
  if (cap->get_cap_id() != cap_id) {
    dout(7) << " capid " << cap_id << " != " << cap->get_cap_id() << ", ignore" << dendl;
    return;
  }
  if (ceph_seq_cmp(mseq, cap->get_mseq()) < 0) {
    dout(7) << " mseq " << mseq << " < " << cap->get_mseq() << ", ignore" << dendl;
    return;
  }
  if (should_defer_client_cap_frozen(in)) {
    dout(7) << " freezing|frozen, deferring" << dendl;
    in->add_waiter(CInode::WAIT_UNFREEZE,
                   new C_Locker_RetryCapRelease(this, client, ino, cap_id, mseq, seq));
    return;
  }
  if (seq != cap->get_last_issue()) {
    dout(7) << " issue_seq " << seq << " != " << cap->get_last_issue() << dendl;
    // clean out any old revoke history
    cap->clean_revoke_from(seq);
    eval_cap_gather(in);
    return;
  }
  remove_client_cap(in, cap);
}

/**
 * Drop @cap from @in: record null snapflushes for any pending entries,
 * remove the cap, and (on the auth MDS) clean up the client's byte range
 * — triggering recovery instead of a max_size check when @kill is set.
 */
void Locker::remove_client_cap(CInode *in, Capability *cap, bool kill)
{
  client_t client = cap->get_client();
  // clean out any pending snapflush state
  if (!in->client_need_snapflush.empty())
    _do_null_snapflush(in, client);

  bool notable = cap->is_notable();
  in->remove_client_cap(client);
  if (!notable)
    return;

  if (in->is_auth()) {
    // make sure we clear out the client byte range
    if (in->get_projected_inode()->client_ranges.count(client) &&
        !(in->inode.nlink == 0 && !in->is_any_caps())) { // unless it's unlink + stray
      if (kill)
        in->state_set(CInode::STATE_NEEDSRECOVER);
      else
        check_inode_max_size(in);
    }
  } else {
    request_inode_file_caps(in);
  }

  try_eval(in, CEPH_CAP_LOCKS);
}


/**
 * Return true if any currently revoking caps exceed the
 * session_timeout threshold.
 */
bool Locker::any_late_revoking_caps(xlist<Capability*> const &revoking,
                                    double timeout) const
{
  // only the head of the list is examined — NOTE(review): this presumably
  // relies on the revoking xlist being ordered oldest-revoke first; confirm
  // against where entries are appended
  xlist<Capability*>::const_iterator p = revoking.begin();
  if (p.end()) {
    // No revoking caps at the moment
    return false;
  } else {
    utime_t now = ceph_clock_now();
    utime_t age = now - (*p)->get_last_revoke_stamp();
    if (age <= timeout) {
      return false;
    } else {
      return true;
    }
  }
}

/**
 * Append to @result the clients that have at least one cap revoking for
 * longer than @timeout, without introducing duplicates.
 */
void Locker::get_late_revoking_clients(std::list<client_t> *result,
                                       double timeout) const
{
  if (!any_late_revoking_caps(revoking_caps, timeout)) {
    // Fast path: no misbehaving clients, execute in O(1)
    return;
  }

  // Slow path: execute in O(N_clients)
  for (auto &p : revoking_caps_by_client) {
    if (any_late_revoking_caps(p.second, timeout)) {
      // Search the list for duplicate and only insert if unique
      std::list<client_t>::const_iterator it = std::find(result->begin(), result->end(), p.first);
      if (it == result->end())
        result->push_back(p.first);
    }
  }
}

// Hard-code instead of surfacing a config setting because this is
// really a hack that should go away at some point when we have better
// inspection tools for getting at detailed cap state (#7316)
#define MAX_WARN_CAPS 100

/**
 * Periodic tick: nudge stale need-snapflush inodes, then scan revoking
 * caps and warn (with exponential backoff) about clients that have not
 * acked a revoke within the session timeout.
 */
void Locker::caps_tick()
{
  utime_t now = ceph_clock_now();

  if (!need_snapflush_inodes.empty()) {
    // snap inodes that need flush are auth pinned, they affect
    // subtree/dirfrag freeze.
    utime_t cutoff = now;
    cutoff -= g_conf()->mds_freeze_tree_timeout / 3;

    // nudge inodes older than the cutoff; stop at the recorded tail so a
    // re-queued inode can't make this loop endless
    CInode *last = need_snapflush_inodes.back();
    while (!need_snapflush_inodes.empty()) {
      CInode *in = need_snapflush_inodes.front();
      if (in->last_dirstat_prop >= cutoff)
        break;
      in->item_caps.remove_myself();
      snapflush_nudge(in);
      if (in == last)
        break;
    }
  }

  dout(20) << __func__ << " " << revoking_caps.size() << " revoking caps" << dendl;

  now = ceph_clock_now();
  int n = 0;
  for (xlist<Capability*>::iterator p = revoking_caps.begin(); !p.end(); ++p) {
    Capability *cap = *p;

    utime_t age = now - cap->get_last_revoke_stamp();
    dout(20) << __func__ << " age = " << age << " client." << cap->get_client() << "." << cap->get_inode()->ino() << dendl;
    if (age <= mds->mdsmap->get_session_timeout()) {
      dout(20) << __func__ << " age below timeout " << mds->mdsmap->get_session_timeout() << dendl;
      break;
    } else {
      ++n;
      if (n > MAX_WARN_CAPS) {
        dout(1) << __func__ << " more than " << MAX_WARN_CAPS << " caps are late"
                << "revoking, ignoring subsequent caps" << dendl;
        break;
      }
    }
    // exponential backoff of warning intervals
    if (age > mds->mdsmap->get_session_timeout() * (1 << cap->get_num_revoke_warnings())) {
      cap->inc_num_revoke_warnings();
      stringstream ss;
      ss << "client." << cap->get_client() << " isn't responding to mclientcaps(revoke), ino "
         << cap->get_inode()->ino() << " pending " << ccap_string(cap->pending())
         << " issued " << ccap_string(cap->issued()) << ", sent " << age << " seconds ago";
      mds->clog->warn() << ss.str();
      dout(20) << __func__ << " " << ss.str() << dendl;
    } else {
      dout(20) << __func__ << " silencing log message (backoff) for " << "client." << cap->get_client() << "."
               << cap->get_inode()->ino() << dendl;
    }
  }
}


/**
 * Handle an MClientLease message from a client: RELEASE/REVOKE_ACK drop
 * the matching dentry lease (if the seq matches); RENEW re-issues the
 * lease when the dentry lock still permits leasing.
 */
void Locker::handle_client_lease(const MClientLease::const_ref &m)
{
  dout(10) << "handle_client_lease " << *m << dendl;

  ceph_assert(m->get_source().is_client());
  client_t client = m->get_source().num();

  CInode *in = mdcache->get_inode(m->get_ino(), m->get_last());
  if (!in) {
    dout(7) << "handle_client_lease don't have ino " << m->get_ino() << "." << m->get_last() << dendl;
    return;
  }
  CDentry *dn = 0;

  frag_t fg = in->pick_dirfrag(m->dname);
  CDir *dir = in->get_dirfrag(fg);
  if (dir)
    dn = dir->lookup(m->dname);
  if (!dn) {
    dout(7) << "handle_client_lease don't have dn " << m->get_ino() << " " << m->dname << dendl;
    return;
  }
  dout(10) << " on " << *dn << dendl;

  // replica and lock
  ClientLease *l = dn->get_client_lease(client);
  if (!l) {
    dout(7) << "handle_client_lease didn't have lease for client." << client << " of " << *dn << dendl;
    return;
  }

  switch (m->get_action()) {
  case CEPH_MDS_LEASE_REVOKE_ACK:
  case CEPH_MDS_LEASE_RELEASE:
    // stale seq means the message refers to an older lease instance
    if (l->seq != m->get_seq()) {
      dout(7) << "handle_client_lease release - seq " << l->seq << " != provided " << m->get_seq() << dendl;
    } else {
      dout(7) << "handle_client_lease client." << client
              << " on " << *dn << dendl;
      dn->remove_client_lease(l, this);
    }
    break;

  case CEPH_MDS_LEASE_RENEW:
    {
      dout(7) << "handle_client_lease client." << client << " renew on " << *dn
              << (!dn->lock.can_lease(client)?", revoking lease":"") << dendl;
      if (dn->lock.can_lease(client)) {
        auto reply = MClientLease::create(*m);
        int pool = 1; // fixme.. do something smart!
        reply->h.duration_ms = (int)(1000 * mdcache->client_lease_durations[pool]);
        reply->h.seq = ++l->seq;
        reply->clear_payload();

        utime_t now = ceph_clock_now();
        now += mdcache->client_lease_durations[pool];
        mdcache->touch_client_lease(l, pool, now);

        mds->send_message_client_counted(reply, m->get_connection());
      }
    }
    break;

  default:
    ceph_abort(); // implement me
    break;
  }
}


/**
 * Encode a dentry lease for @client into @bl (appended to a reply).
 * Issues a real lease only when the dentry lock allows it, the dir is not
 * a stray dir, and the client doesn't already cover the dentry via dir
 * caps/lease; otherwise encodes a null lease.
 */
void Locker::issue_client_lease(CDentry *dn, client_t client,
                               bufferlist &bl, utime_t now, Session *session)
{
  CInode *diri = dn->get_dir()->get_inode();
  if (!diri->is_stray() &&  // do not issue dn leases in stray dir!
      ((!diri->filelock.can_lease(client) &&
        (diri->get_client_cap_pending(client) & (CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL)) == 0)) &&
      dn->lock.can_lease(client)) {
    int pool = 1; // fixme.. do something smart!
    // issue a dentry lease
    ClientLease *l = dn->add_client_lease(client, session);
    session->touch_lease(l);

    now += mdcache->client_lease_durations[pool];
    mdcache->touch_client_lease(l, pool, now);

    LeaseStat lstat;
    lstat.mask = 1 | CEPH_LOCK_DN; // old and new bit values
    lstat.duration_ms = (uint32_t)(1000 * mdcache->client_lease_durations[pool]);
    lstat.seq = ++l->seq;
    encode_lease(bl, session->info, lstat);
    dout(20) << "issue_client_lease seq " << lstat.seq << " dur " << lstat.duration_ms << "ms "
             << " on " << *dn << dendl;
  } else {
    // null lease
    LeaseStat lstat;
    encode_lease(bl, session->info, lstat);
    dout(20) << "issue_client_lease no/null lease on " << *dn << dendl;
  }
}


/**
 * Send a LEASE_REVOKE to every client holding a lease on the dentry
 * guarded by @lock.
 */
void Locker::revoke_client_leases(SimpleLock *lock)
{
  int n = 0;
  CDentry *dn = static_cast<CDentry*>(lock->get_parent());
  for (map<client_t, ClientLease*>::iterator p = dn->client_lease_map.begin();
       p != dn->client_lease_map.end();
       ++p) {
    ClientLease *l = p->second;

    n++;
    ceph_assert(lock->get_type() == CEPH_LOCK_DN);

    CDentry *dn = static_cast<CDentry*>(lock->get_parent());
    int mask = 1 |
               CEPH_LOCK_DN; // old and new bits

    // i should also revoke the dir ICONTENT lease, if they have it!
    CInode *diri = dn->get_dir()->get_inode();
    auto lease = MClientLease::create(CEPH_MDS_LEASE_REVOKE, l->seq, mask, diri->ino(), diri->first, CEPH_NOSNAP, dn->get_name());
    mds->send_message_client_counted(lease, l->client);
  }
}

/**
 * Append a LeaseStat to @bl, using the versioned (ENCODE_START) format
 * only when the session advertises CEPHFS_FEATURE_REPLY_ENCODING; older
 * clients get the bare field encoding.
 */
void Locker::encode_lease(bufferlist& bl, const session_info_t& info,
                          const LeaseStat& ls)
{
  if (info.has_feature(CEPHFS_FEATURE_REPLY_ENCODING)) {
    ENCODE_START(1, 1, bl);
    encode(ls.mask, bl);
    encode(ls.duration_ms, bl);
    encode(ls.seq, bl);
    ENCODE_FINISH(bl);
  }
  else {
    encode(ls.mask, bl);
    encode(ls.duration_ms, bl);
    encode(ls.seq, bl);
  }
}

// locks ----------------------------------------------------------------

/**
 * Resolve an (lock_type, object info) pair from a peer MDS to the local
 * SimpleLock instance, or nullptr if the object is not in cache.
 */
SimpleLock *Locker::get_lock(int lock_type, const MDSCacheObjectInfo &info)
{
  switch (lock_type) {
  case CEPH_LOCK_DN:
    {
      // be careful; info.dirfrag may have incorrect frag; recalculate based on dname.
      CInode *diri = mdcache->get_inode(info.dirfrag.ino);
      frag_t fg;
      CDir *dir = 0;
      CDentry *dn = 0;
      if (diri) {
        fg = diri->pick_dirfrag(info.dname);
        dir = diri->get_dirfrag(fg);
        if (dir)
          dn = dir->lookup(info.dname, info.snapid);
      }
      if (!dn) {
        dout(7) << "get_lock don't have dn " << info.dirfrag.ino << " " << info.dname << dendl;
        return 0;
      }
      return &dn->lock;
    }

  case CEPH_LOCK_IAUTH:
  case CEPH_LOCK_ILINK:
  case CEPH_LOCK_IDFT:
  case CEPH_LOCK_IFILE:
  case CEPH_LOCK_INEST:
  case CEPH_LOCK_IXATTR:
  case CEPH_LOCK_ISNAP:
  case CEPH_LOCK_IFLOCK:
  case CEPH_LOCK_IPOLICY:
    {
      CInode *in = mdcache->get_inode(info.ino, info.snapid);
      if (!in) {
        dout(7) << "get_lock don't have ino " << info.ino << dendl;
        return 0;
      }
      // map lock type to the corresponding member lock on the inode
      switch (lock_type) {
      case CEPH_LOCK_IAUTH: return &in->authlock;
      case CEPH_LOCK_ILINK: return &in->linklock;
      case CEPH_LOCK_IDFT: return &in->dirfragtreelock;
      case CEPH_LOCK_IFILE: return &in->filelock;
      case CEPH_LOCK_INEST: return &in->nestlock;
      case CEPH_LOCK_IXATTR: return &in->xattrlock;
      case CEPH_LOCK_ISNAP: return &in->snaplock;
      case CEPH_LOCK_IFLOCK: return &in->flocklock;
      case CEPH_LOCK_IPOLICY: return &in->policylock;
      }
    }

  default:
    dout(7) << "get_lock don't know lock_type " << lock_type << dendl;
    ceph_abort();
    break;
  }

  return 0;
}

/**
 * Dispatch an inter-MDS MLock message to the handler for the lock's type.
 */
void Locker::handle_lock(const MLock::const_ref &m)
{
  // nobody should be talking to us during recovery.
  ceph_assert(mds->is_rejoin() || mds->is_clientreplay() || mds->is_active() || mds->is_stopping());

  SimpleLock *lock = get_lock(m->get_lock_type(), m->get_object_info());
  if (!lock) {
    dout(10) << "don't have object " << m->get_object_info() << ", must have trimmed, dropping" << dendl;
    return;
  }

  switch (lock->get_type()) {
  case CEPH_LOCK_DN:
  case CEPH_LOCK_IAUTH:
  case CEPH_LOCK_ILINK:
  case CEPH_LOCK_ISNAP:
  case CEPH_LOCK_IXATTR:
  case CEPH_LOCK_IFLOCK:
  case CEPH_LOCK_IPOLICY:
    handle_simple_lock(lock, m);
    break;

  case CEPH_LOCK_IDFT:
  case CEPH_LOCK_INEST:
    // deliberate fall-through: scatter locks are handled by the file-lock path
    //handle_scatter_lock((ScatterLock*)lock, m);
    //break;

  case CEPH_LOCK_IFILE:
    handle_file_lock(static_cast<ScatterLock*>(lock), m);
    break;

  default:
    dout(7) << "handle_lock got otype " << m->get_lock_type() << dendl;
    ceph_abort();
    break;
  }
}


// ==========================================================================
// simple lock

/** This function may take a reference to m if it needs one, but does
 * not put references. */
void Locker::handle_reqrdlock(SimpleLock *lock, const MLock::const_ref &m)
{
  MDSCacheObject *parent = lock->get_parent();
  if (parent->is_auth() &&
      lock->get_state() != LOCK_SYNC &&
      !parent->is_frozen()) {
    dout(7) << "handle_reqrdlock got rdlock request on " << *lock
            << " on " << *parent << dendl;
    ceph_assert(parent->is_auth()); // replica auth pinned if they're doing this!
    // auth: move the lock to SYNC (now if stable, else once stable)
    if (lock->is_stable()) {
      simple_sync(lock);
    } else {
      dout(7) << "handle_reqrdlock delaying request until lock is stable" << dendl;
      lock->add_waiter(SimpleLock::WAIT_STABLE | MDSCacheObject::WAIT_UNFREEZE,
                       new C_MDS_RetryMessage(mds, m));
    }
  } else {
    dout(7) << "handle_reqrdlock dropping rdlock request on " << *lock
            << " on " << *parent << dendl;
    // replica should retry
  }
}

/**
 * Handle an MLock action for a simple lock: SYNC/LOCK actions apply on a
 * replica; LOCKACK gathers acks on the auth; REQRDLOCK asks the auth to
 * sync the lock so a replica can rdlock.
 */
void Locker::handle_simple_lock(SimpleLock *lock, const MLock::const_ref &m)
{
  int from = m->get_asker();

  dout(10) << "handle_simple_lock " << *m
           << " on " << *lock << " " << *lock->get_parent() << dendl;

  if (mds->is_rejoin()) {
    if (lock->get_parent()->is_rejoining()) {
      dout(7) << "handle_simple_lock still rejoining " << *lock->get_parent()
              << ", dropping " << *m << dendl;
      return;
    }
  }

  switch (m->get_action()) {
    // -- replica --
  case LOCK_AC_SYNC:
    ceph_assert(lock->get_state() == LOCK_LOCK);
    lock->decode_locked_state(m->get_data());
    lock->set_state(LOCK_SYNC);
    lock->finish_waiters(SimpleLock::WAIT_RD|SimpleLock::WAIT_STABLE);
    break;

  case LOCK_AC_LOCK:
    ceph_assert(lock->get_state() == LOCK_SYNC);
    lock->set_state(LOCK_SYNC_LOCK);
    // losing SYNC: any client dentry leases must be revoked
    if (lock->is_leased())
      revoke_client_leases(lock);
    eval_gather(lock, true);
    if (lock->is_unstable_and_locked())
      mds->mdlog->flush();
    break;


    // -- auth --
  case LOCK_AC_LOCKACK:
    ceph_assert(lock->get_state() == LOCK_SYNC_LOCK ||
                lock->get_state() == LOCK_SYNC_EXCL);
    ceph_assert(lock->is_gathering(from));
    lock->remove_gather(from);

    if (lock->is_gathering()) {
      dout(7) << "handle_simple_lock " << *lock << " on " << *lock->get_parent() << " from " << from
              << ", still gathering " << lock->get_gather_set() << dendl;
    } else {
      dout(7) << "handle_simple_lock " << *lock << " on " << *lock->get_parent() << " from " << from
              << ", last one" << dendl;
      eval_gather(lock);
    }
    break;

  case LOCK_AC_REQRDLOCK:
    handle_reqrdlock(lock, m);
    break;

  }
}

/* unused, currently.

class C_Locker_SimpleEval : public Context {
  Locker *locker;
  SimpleLock *lock;
public:
  C_Locker_SimpleEval(Locker *l, SimpleLock *lk) : locker(l), lock(lk) {}
  void finish(int r) {
    locker->try_simple_eval(lock);
  }
};

void Locker::try_simple_eval(SimpleLock *lock)
{
  // unstable and ambiguous auth?
  if (!lock->is_stable() &&
      lock->get_parent()->is_ambiguous_auth()) {
    dout(7) << "simple_eval not stable and ambiguous auth, waiting on " << *lock->get_parent() << dendl;
    //if (!lock->get_parent()->is_waiter(MDSCacheObject::WAIT_SINGLEAUTH))
    lock->get_parent()->add_waiter(MDSCacheObject::WAIT_SINGLEAUTH, new C_Locker_SimpleEval(this, lock));
    return;
  }

  if (!lock->get_parent()->is_auth()) {
    dout(7) << "try_simple_eval not auth for " << *lock->get_parent() << dendl;
    return;
  }

  if (!lock->get_parent()->can_auth_pin()) {
    dout(7) << "try_simple_eval can't auth_pin, waiting on " << *lock->get_parent() << dendl;
    //if (!lock->get_parent()->is_waiter(MDSCacheObject::WAIT_SINGLEAUTH))
    lock->get_parent()->add_waiter(MDSCacheObject::WAIT_UNFREEZE, new C_Locker_SimpleEval(this, lock));
    return;
  }

  if (lock->is_stable())
    simple_eval(lock);
}
*/


// --- Locker::simple_eval() continues past the end of this chunk ---
void Locker::simple_eval(SimpleLock *lock, bool *need_issue)
{
  dout(10) << "simple_eval " << *lock << " on " << *lock->get_parent() << dendl;

  ceph_assert(lock->get_parent()->is_auth());
  ceph_assert(lock->is_stable());

  if (lock->get_parent()->is_freezing_or_frozen()) {
    // dentry/snap lock in unreadable state can block path traverse
    if ((lock->get_type() != CEPH_LOCK_DN &&
         lock->get_type() != CEPH_LOCK_ISNAP) ||
        lock->get_state() == LOCK_SYNC ||
        lock->get_parent()->is_frozen())
      return;
  }

  if (mdcache->is_readonly()) {
    if (lock->get_state() != LOCK_SYNC) {
      dout(10) << "simple_eval read-only FS, syncing " << *lock << " on " << *lock->get_parent() << dendl;
      simple_sync(lock, need_issue);
    }
    return;
  }

  CInode *in = 0;
  int wanted
= 0; + if (lock->get_cap_shift()) { + in = static_cast<CInode*>(lock->get_parent()); + in->get_caps_wanted(&wanted, NULL, lock->get_cap_shift()); + } + + // -> excl? + if (lock->get_state() != LOCK_EXCL && + in && in->get_target_loner() >= 0 && + (wanted & CEPH_CAP_GEXCL)) { + dout(7) << "simple_eval stable, going to excl " << *lock + << " on " << *lock->get_parent() << dendl; + simple_excl(lock, need_issue); + } + + // stable -> sync? + else if (lock->get_state() != LOCK_SYNC && + !lock->is_wrlocked() && + ((!(wanted & CEPH_CAP_GEXCL) && !lock->is_waiter_for(SimpleLock::WAIT_WR)) || + (lock->get_state() == LOCK_EXCL && in && in->get_target_loner() < 0))) { + dout(7) << "simple_eval stable, syncing " << *lock + << " on " << *lock->get_parent() << dendl; + simple_sync(lock, need_issue); + } +} + + +// mid + +bool Locker::simple_sync(SimpleLock *lock, bool *need_issue) +{ + dout(7) << "simple_sync on " << *lock << " on " << *lock->get_parent() << dendl; + ceph_assert(lock->get_parent()->is_auth()); + ceph_assert(lock->is_stable()); + + CInode *in = 0; + if (lock->get_cap_shift()) + in = static_cast<CInode *>(lock->get_parent()); + + int old_state = lock->get_state(); + + if (old_state != LOCK_TSYN) { + + switch (lock->get_state()) { + case LOCK_MIX: lock->set_state(LOCK_MIX_SYNC); break; + case LOCK_LOCK: lock->set_state(LOCK_LOCK_SYNC); break; + case LOCK_XSYN: lock->set_state(LOCK_XSYN_SYNC); break; + case LOCK_EXCL: lock->set_state(LOCK_EXCL_SYNC); break; + default: ceph_abort(); + } + + int gather = 0; + if (lock->is_wrlocked()) + gather++; + + if (lock->get_parent()->is_replicated() && old_state == LOCK_MIX) { + send_lock_message(lock, LOCK_AC_SYNC); + lock->init_gather(); + gather++; + } + + if (in && in->is_head()) { + if (in->issued_caps_need_gather(lock)) { + if (need_issue) + *need_issue = true; + else + issue_caps(in); + gather++; + } + } + + bool need_recover = false; + if (lock->get_type() == CEPH_LOCK_IFILE) { + ceph_assert(in); + if 
(in->state_test(CInode::STATE_NEEDSRECOVER)) { + mds->mdcache->queue_file_recover(in); + need_recover = true; + gather++; + } + } + + if (!gather && lock->is_dirty()) { + lock->get_parent()->auth_pin(lock); + scatter_writebehind(static_cast<ScatterLock*>(lock)); + mds->mdlog->flush(); + return false; + } + + if (gather) { + lock->get_parent()->auth_pin(lock); + if (need_recover) + mds->mdcache->do_file_recover(); + return false; + } + } + + if (lock->get_parent()->is_replicated()) { // FIXME + bufferlist data; + lock->encode_locked_state(data); + send_lock_message(lock, LOCK_AC_SYNC, data); + } + lock->set_state(LOCK_SYNC); + lock->finish_waiters(SimpleLock::WAIT_RD|SimpleLock::WAIT_STABLE); + if (in && in->is_head()) { + if (need_issue) + *need_issue = true; + else + issue_caps(in); + } + return true; +} + +void Locker::simple_excl(SimpleLock *lock, bool *need_issue) +{ + dout(7) << "simple_excl on " << *lock << " on " << *lock->get_parent() << dendl; + ceph_assert(lock->get_parent()->is_auth()); + ceph_assert(lock->is_stable()); + + CInode *in = 0; + if (lock->get_cap_shift()) + in = static_cast<CInode *>(lock->get_parent()); + + switch (lock->get_state()) { + case LOCK_LOCK: lock->set_state(LOCK_LOCK_EXCL); break; + case LOCK_SYNC: lock->set_state(LOCK_SYNC_EXCL); break; + case LOCK_XSYN: lock->set_state(LOCK_XSYN_EXCL); break; + default: ceph_abort(); + } + + int gather = 0; + if (lock->is_rdlocked()) + gather++; + if (lock->is_wrlocked()) + gather++; + + if (lock->get_parent()->is_replicated() && + lock->get_state() != LOCK_LOCK_EXCL && + lock->get_state() != LOCK_XSYN_EXCL) { + send_lock_message(lock, LOCK_AC_LOCK); + lock->init_gather(); + gather++; + } + + if (in && in->is_head()) { + if (in->issued_caps_need_gather(lock)) { + if (need_issue) + *need_issue = true; + else + issue_caps(in); + gather++; + } + } + + if (gather) { + lock->get_parent()->auth_pin(lock); + } else { + lock->set_state(LOCK_EXCL); + 
lock->finish_waiters(SimpleLock::WAIT_WR|SimpleLock::WAIT_STABLE); + if (in) { + if (need_issue) + *need_issue = true; + else + issue_caps(in); + } + } +} + +void Locker::simple_lock(SimpleLock *lock, bool *need_issue) +{ + dout(7) << "simple_lock on " << *lock << " on " << *lock->get_parent() << dendl; + ceph_assert(lock->get_parent()->is_auth()); + ceph_assert(lock->is_stable()); + ceph_assert(lock->get_state() != LOCK_LOCK); + + CInode *in = 0; + if (lock->get_cap_shift()) + in = static_cast<CInode *>(lock->get_parent()); + + int old_state = lock->get_state(); + + switch (lock->get_state()) { + case LOCK_SYNC: lock->set_state(LOCK_SYNC_LOCK); break; + case LOCK_XSYN: lock->set_state(LOCK_XSYN_LOCK); break; + case LOCK_EXCL: lock->set_state(LOCK_EXCL_LOCK); break; + case LOCK_MIX: lock->set_state(LOCK_MIX_LOCK); + (static_cast<ScatterLock *>(lock))->clear_unscatter_wanted(); + break; + case LOCK_TSYN: lock->set_state(LOCK_TSYN_LOCK); break; + default: ceph_abort(); + } + + int gather = 0; + if (lock->is_leased()) { + gather++; + revoke_client_leases(lock); + } + if (lock->is_rdlocked()) + gather++; + if (in && in->is_head()) { + if (in->issued_caps_need_gather(lock)) { + if (need_issue) + *need_issue = true; + else + issue_caps(in); + gather++; + } + } + + bool need_recover = false; + if (lock->get_type() == CEPH_LOCK_IFILE) { + ceph_assert(in); + if(in->state_test(CInode::STATE_NEEDSRECOVER)) { + mds->mdcache->queue_file_recover(in); + need_recover = true; + gather++; + } + } + + if (lock->get_parent()->is_replicated() && + lock->get_state() == LOCK_MIX_LOCK && + gather) { + dout(10) << " doing local stage of mix->lock gather before gathering from replicas" << dendl; + } else { + // move to second stage of gather now, so we don't send the lock action later. 
+ if (lock->get_state() == LOCK_MIX_LOCK) + lock->set_state(LOCK_MIX_LOCK2); + + if (lock->get_parent()->is_replicated() && + lock->get_sm()->states[old_state].replica_state != LOCK_LOCK) { // replica may already be LOCK + gather++; + send_lock_message(lock, LOCK_AC_LOCK); + lock->init_gather(); + } + } + + if (!gather && lock->is_dirty()) { + lock->get_parent()->auth_pin(lock); + scatter_writebehind(static_cast<ScatterLock*>(lock)); + mds->mdlog->flush(); + return; + } + + if (gather) { + lock->get_parent()->auth_pin(lock); + if (need_recover) + mds->mdcache->do_file_recover(); + } else { + lock->set_state(LOCK_LOCK); + lock->finish_waiters(ScatterLock::WAIT_XLOCK|ScatterLock::WAIT_WR|ScatterLock::WAIT_STABLE); + } +} + + +void Locker::simple_xlock(SimpleLock *lock) +{ + dout(7) << "simple_xlock on " << *lock << " on " << *lock->get_parent() << dendl; + ceph_assert(lock->get_parent()->is_auth()); + //assert(lock->is_stable()); + ceph_assert(lock->get_state() != LOCK_XLOCK); + + CInode *in = 0; + if (lock->get_cap_shift()) + in = static_cast<CInode *>(lock->get_parent()); + + if (lock->is_stable()) + lock->get_parent()->auth_pin(lock); + + switch (lock->get_state()) { + case LOCK_LOCK: + case LOCK_XLOCKDONE: lock->set_state(LOCK_LOCK_XLOCK); break; + default: ceph_abort(); + } + + int gather = 0; + if (lock->is_rdlocked()) + gather++; + if (lock->is_wrlocked()) + gather++; + + if (in && in->is_head()) { + if (in->issued_caps_need_gather(lock)) { + issue_caps(in); + gather++; + } + } + + if (!gather) { + lock->set_state(LOCK_PREXLOCK); + //assert("shouldn't be called if we are already xlockable" == 0); + } +} + + + + + +// ========================================================================== +// scatter lock + +/* + +Some notes on scatterlocks. + + - The scatter/gather is driven by the inode lock. The scatter always + brings in the latest metadata from the fragments. 
+ + - When in a scattered/MIX state, fragments are only allowed to + update/be written to if the accounted stat matches the inode's + current version. + + - That means, on gather, we _only_ assimilate diffs for frag metadata + that match the current version, because those are the only ones + written during this scatter/gather cycle. (Others didn't permit + it.) We increment the version and journal this to disk. + + - When possible, we also simultaneously update our local frag + accounted stats to match. + + - On scatter, the new inode info is broadcast to frags, both local + and remote. If possible (auth and !frozen), the dirfrag auth + should update the accounted state (if it isn't already up to date). + Note that this may occur on both the local inode auth node and + inode replicas, so there are two potential paths. If it is NOT + possible, they need to mark_stale to prevent any possible writes. + + - A scatter can be to MIX (potentially writeable) or to SYNC (read + only). Both are opportunities to update the frag accounted stats, + even though only the MIX case is affected by a stale dirfrag. + + - Because many scatter/gather cycles can potentially go by without a + frag being able to update its accounted stats (due to being frozen + by exports/refragments in progress), the frag may have (even very) + old stat versions. That's fine. If when we do want to update it, + we can update accounted_* and the version first. 
+ +*/ + +class C_Locker_ScatterWB : public LockerLogContext { + ScatterLock *lock; + MutationRef mut; +public: + C_Locker_ScatterWB(Locker *l, ScatterLock *sl, MutationRef& m) : + LockerLogContext(l), lock(sl), mut(m) {} + void finish(int r) override { + locker->scatter_writebehind_finish(lock, mut); + } +}; + +void Locker::scatter_writebehind(ScatterLock *lock) +{ + CInode *in = static_cast<CInode*>(lock->get_parent()); + dout(10) << "scatter_writebehind " << in->inode.mtime << " on " << *lock << " on " << *in << dendl; + + // journal + MutationRef mut(new MutationImpl()); + mut->ls = mds->mdlog->get_current_segment(); + + // forcefully take a wrlock + lock->get_wrlock(true); + mut->locks.emplace(lock, MutationImpl::LockOp::WRLOCK); + + in->pre_cow_old_inode(); // avoid cow mayhem + + auto &pi = in->project_inode(); + pi.inode.version = in->pre_dirty(); + + in->finish_scatter_gather_update(lock->get_type()); + lock->start_flush(); + + EUpdate *le = new EUpdate(mds->mdlog, "scatter_writebehind"); + mds->mdlog->start_entry(le); + + mdcache->predirty_journal_parents(mut, &le->metablob, in, 0, PREDIRTY_PRIMARY); + mdcache->journal_dirty_inode(mut.get(), &le->metablob, in); + + in->finish_scatter_gather_update_accounted(lock->get_type(), mut, &le->metablob); + + mds->mdlog->submit_entry(le, new C_Locker_ScatterWB(this, lock, mut)); +} + +void Locker::scatter_writebehind_finish(ScatterLock *lock, MutationRef& mut) +{ + CInode *in = static_cast<CInode*>(lock->get_parent()); + dout(10) << "scatter_writebehind_finish on " << *lock << " on " << *in << dendl; + in->pop_and_dirty_projected_inode(mut->ls); + + lock->finish_flush(); + + // if replicas may have flushed in a mix->lock state, send another + // message so they can finish_flush(). 
+ if (in->is_replicated()) { + switch (lock->get_state()) { + case LOCK_MIX_LOCK: + case LOCK_MIX_LOCK2: + case LOCK_MIX_EXCL: + case LOCK_MIX_TSYN: + send_lock_message(lock, LOCK_AC_LOCKFLUSHED); + } + } + + mut->apply(); + drop_locks(mut.get()); + mut->cleanup(); + + if (lock->is_stable()) + lock->finish_waiters(ScatterLock::WAIT_STABLE); + + //scatter_eval_gather(lock); +} + +void Locker::scatter_eval(ScatterLock *lock, bool *need_issue) +{ + dout(10) << "scatter_eval " << *lock << " on " << *lock->get_parent() << dendl; + + ceph_assert(lock->get_parent()->is_auth()); + ceph_assert(lock->is_stable()); + + if (lock->get_parent()->is_freezing_or_frozen()) { + dout(20) << " freezing|frozen" << dendl; + return; + } + + if (mdcache->is_readonly()) { + if (lock->get_state() != LOCK_SYNC) { + dout(10) << "scatter_eval read-only FS, syncing " << *lock << " on " << *lock->get_parent() << dendl; + simple_sync(lock, need_issue); + } + return; + } + + if (!lock->is_rdlocked() && + lock->get_state() != LOCK_MIX && + lock->get_scatter_wanted()) { + dout(10) << "scatter_eval scatter_wanted, bump to mix " << *lock + << " on " << *lock->get_parent() << dendl; + scatter_mix(lock, need_issue); + return; + } + + if (lock->get_type() == CEPH_LOCK_INEST) { + // in general, we want to keep INEST writable at all times. + if (!lock->is_rdlocked()) { + if (lock->get_parent()->is_replicated()) { + if (lock->get_state() != LOCK_MIX) + scatter_mix(lock, need_issue); + } else { + if (lock->get_state() != LOCK_LOCK) + simple_lock(lock, need_issue); + } + } + return; + } + + CInode *in = static_cast<CInode*>(lock->get_parent()); + if (!in->has_subtree_or_exporting_dirfrag() || in->is_base()) { + // i _should_ be sync. 
+ if (!lock->is_wrlocked() && + lock->get_state() != LOCK_SYNC) { + dout(10) << "scatter_eval no wrlocks|xlocks, not subtree root inode, syncing" << dendl; + simple_sync(lock, need_issue); + } + } +} + + +/* + * mark a scatterlock to indicate that the dir fnode has some dirty data + */ +void Locker::mark_updated_scatterlock(ScatterLock *lock) +{ + lock->mark_dirty(); + if (lock->get_updated_item()->is_on_list()) { + dout(10) << "mark_updated_scatterlock " << *lock + << " - already on list since " << lock->get_update_stamp() << dendl; + } else { + updated_scatterlocks.push_back(lock->get_updated_item()); + utime_t now = ceph_clock_now(); + lock->set_update_stamp(now); + dout(10) << "mark_updated_scatterlock " << *lock + << " - added at " << now << dendl; + } +} + +/* + * this is called by scatter_tick and LogSegment::try_to_trim() when + * trying to flush dirty scattered data (i.e. updated fnode) back to + * the inode. + * + * we need to lock|scatter in order to push fnode changes into the + * inode.dirstat. + */ +void Locker::scatter_nudge(ScatterLock *lock, MDSContext *c, bool forcelockchange) +{ + CInode *p = static_cast<CInode *>(lock->get_parent()); + + if (p->is_frozen() || p->is_freezing()) { + dout(10) << "scatter_nudge waiting for unfreeze on " << *p << dendl; + if (c) + p->add_waiter(MDSCacheObject::WAIT_UNFREEZE, c); + else if (lock->is_dirty()) + // just requeue. not ideal.. starvation prone.. + updated_scatterlocks.push_back(lock->get_updated_item()); + return; + } + + if (p->is_ambiguous_auth()) { + dout(10) << "scatter_nudge waiting for single auth on " << *p << dendl; + if (c) + p->add_waiter(MDSCacheObject::WAIT_SINGLEAUTH, c); + else if (lock->is_dirty()) + // just requeue. not ideal.. starvation prone.. + updated_scatterlocks.push_back(lock->get_updated_item()); + return; + } + + if (p->is_auth()) { + int count = 0; + while (true) { + if (lock->is_stable()) { + // can we do it now? + // (only if we're not replicated.. 
if we are, we really do need + // to nudge the lock state!) + /* + actually, even if we're not replicated, we can't stay in MIX, because another mds + could discover and replicate us at any time. if that happens while we're flushing, + they end up in MIX but their inode has the old scatterstat version. + + if (!forcelockchange && !lock->get_parent()->is_replicated() && lock->can_wrlock(-1)) { + dout(10) << "scatter_nudge auth, propagating " << *lock << " on " << *p << dendl; + scatter_writebehind(lock); + if (c) + lock->add_waiter(SimpleLock::WAIT_STABLE, c); + return; + } + */ + + if (mdcache->is_readonly()) { + if (lock->get_state() != LOCK_SYNC) { + dout(10) << "scatter_nudge auth, read-only FS, syncing " << *lock << " on " << *p << dendl; + simple_sync(static_cast<ScatterLock*>(lock)); + } + break; + } + + // adjust lock state + dout(10) << "scatter_nudge auth, scatter/unscattering " << *lock << " on " << *p << dendl; + switch (lock->get_type()) { + case CEPH_LOCK_IFILE: + if (p->is_replicated() && lock->get_state() != LOCK_MIX) + scatter_mix(static_cast<ScatterLock*>(lock)); + else if (lock->get_state() != LOCK_LOCK) + simple_lock(static_cast<ScatterLock*>(lock)); + else + simple_sync(static_cast<ScatterLock*>(lock)); + break; + + case CEPH_LOCK_IDFT: + case CEPH_LOCK_INEST: + if (p->is_replicated() && lock->get_state() != LOCK_MIX) + scatter_mix(lock); + else if (lock->get_state() != LOCK_LOCK) + simple_lock(lock); + else + simple_sync(lock); + break; + default: + ceph_abort(); + } + ++count; + if (lock->is_stable() && count == 2) { + dout(10) << "scatter_nudge oh, stable after two cycles." << dendl; + // this should only realy happen when called via + // handle_file_lock due to AC_NUDGE, because the rest of the + // time we are replicated or have dirty data and won't get + // called. bailing here avoids an infinite loop. 
+ ceph_assert(!c); + break; + } + } else { + dout(10) << "scatter_nudge auth, waiting for stable " << *lock << " on " << *p << dendl; + if (c) + lock->add_waiter(SimpleLock::WAIT_STABLE, c); + return; + } + } + } else { + dout(10) << "scatter_nudge replica, requesting scatter/unscatter of " + << *lock << " on " << *p << dendl; + // request unscatter? + mds_rank_t auth = lock->get_parent()->authority().first; + if (!mds->is_cluster_degraded() || mds->mdsmap->is_clientreplay_or_active_or_stopping(auth)) { + mds->send_message_mds(MLock::create(lock, LOCK_AC_NUDGE, mds->get_nodeid()), auth); + } + + // wait... + if (c) + lock->add_waiter(SimpleLock::WAIT_STABLE, c); + + // also, requeue, in case we had wrong auth or something + if (lock->is_dirty()) + updated_scatterlocks.push_back(lock->get_updated_item()); + } +} + +void Locker::scatter_tick() +{ + dout(10) << "scatter_tick" << dendl; + + // updated + utime_t now = ceph_clock_now(); + int n = updated_scatterlocks.size(); + while (!updated_scatterlocks.empty()) { + ScatterLock *lock = updated_scatterlocks.front(); + + if (n-- == 0) break; // scatter_nudge() may requeue; avoid looping + + if (!lock->is_dirty()) { + updated_scatterlocks.pop_front(); + dout(10) << " removing from updated_scatterlocks " + << *lock << " " << *lock->get_parent() << dendl; + continue; + } + if (now - lock->get_update_stamp() < g_conf()->mds_scatter_nudge_interval) + break; + updated_scatterlocks.pop_front(); + scatter_nudge(lock, 0); + } + mds->mdlog->flush(); +} + + +void Locker::scatter_tempsync(ScatterLock *lock, bool *need_issue) +{ + dout(10) << "scatter_tempsync " << *lock + << " on " << *lock->get_parent() << dendl; + ceph_assert(lock->get_parent()->is_auth()); + ceph_assert(lock->is_stable()); + + ceph_abort_msg("not fully implemented, at least not for filelock"); + + CInode *in = static_cast<CInode *>(lock->get_parent()); + + switch (lock->get_state()) { + case LOCK_SYNC: ceph_abort(); // this shouldn't happen + case LOCK_LOCK: 
lock->set_state(LOCK_LOCK_TSYN); break; + case LOCK_MIX: lock->set_state(LOCK_MIX_TSYN); break; + default: ceph_abort(); + } + + int gather = 0; + if (lock->is_wrlocked()) + gather++; + + if (lock->get_cap_shift() && + in->is_head() && + in->issued_caps_need_gather(lock)) { + if (need_issue) + *need_issue = true; + else + issue_caps(in); + gather++; + } + + if (lock->get_state() == LOCK_MIX_TSYN && + in->is_replicated()) { + lock->init_gather(); + send_lock_message(lock, LOCK_AC_LOCK); + gather++; + } + + if (gather) { + in->auth_pin(lock); + } else { + // do tempsync + lock->set_state(LOCK_TSYN); + lock->finish_waiters(ScatterLock::WAIT_RD|ScatterLock::WAIT_STABLE); + if (lock->get_cap_shift()) { + if (need_issue) + *need_issue = true; + else + issue_caps(in); + } + } +} + + + +// ========================================================================== +// local lock + +void Locker::local_wrlock_grab(LocalLock *lock, MutationRef& mut) +{ + dout(7) << "local_wrlock_grab on " << *lock + << " on " << *lock->get_parent() << dendl; + + ceph_assert(lock->get_parent()->is_auth()); + ceph_assert(lock->can_wrlock()); + lock->get_wrlock(mut->get_client()); + + auto ret = mut->locks.emplace(lock, MutationImpl::LockOp::WRLOCK); + ceph_assert(ret.second); +} + +bool Locker::local_wrlock_start(LocalLock *lock, MDRequestRef& mut) +{ + dout(7) << "local_wrlock_start on " << *lock + << " on " << *lock->get_parent() << dendl; + + ceph_assert(lock->get_parent()->is_auth()); + if (lock->can_wrlock()) { + lock->get_wrlock(mut->get_client()); + auto it = mut->locks.emplace_hint(mut->locks.end(), lock, MutationImpl::LockOp::WRLOCK); + ceph_assert(it->is_wrlock()); + return true; + } else { + lock->add_waiter(SimpleLock::WAIT_WR|SimpleLock::WAIT_STABLE, new C_MDS_RetryRequest(mdcache, mut)); + return false; + } +} + +void Locker::local_wrlock_finish(const MutationImpl::lock_iterator& it, MutationImpl *mut) +{ + ceph_assert(it->is_wrlock()); + LocalLock *lock = 
static_cast<LocalLock*>(it->lock); + dout(7) << "local_wrlock_finish on " << *lock + << " on " << *lock->get_parent() << dendl; + lock->put_wrlock(); + mut->locks.erase(it); + if (lock->get_num_wrlocks() == 0) { + lock->finish_waiters(SimpleLock::WAIT_STABLE | + SimpleLock::WAIT_WR | + SimpleLock::WAIT_RD); + } +} + +bool Locker::local_xlock_start(LocalLock *lock, MDRequestRef& mut) +{ + dout(7) << "local_xlock_start on " << *lock + << " on " << *lock->get_parent() << dendl; + + ceph_assert(lock->get_parent()->is_auth()); + if (!lock->can_xlock_local()) { + lock->add_waiter(SimpleLock::WAIT_WR|SimpleLock::WAIT_STABLE, new C_MDS_RetryRequest(mdcache, mut)); + return false; + } + + lock->get_xlock(mut, mut->get_client()); + mut->locks.emplace_hint(mut->locks.end(), lock, MutationImpl::LockOp::XLOCK); + return true; +} + +void Locker::local_xlock_finish(const MutationImpl::lock_iterator& it, MutationImpl *mut) +{ + ceph_assert(it->is_xlock()); + LocalLock *lock = static_cast<LocalLock*>(it->lock); + dout(7) << "local_xlock_finish on " << *lock + << " on " << *lock->get_parent() << dendl; + lock->put_xlock(); + mut->locks.erase(it); + + lock->finish_waiters(SimpleLock::WAIT_STABLE | + SimpleLock::WAIT_WR | + SimpleLock::WAIT_RD); +} + + + +// ========================================================================== +// file lock + + +void Locker::file_eval(ScatterLock *lock, bool *need_issue) +{ + CInode *in = static_cast<CInode*>(lock->get_parent()); + int loner_wanted, other_wanted; + int wanted = in->get_caps_wanted(&loner_wanted, &other_wanted, CEPH_CAP_SFILE); + dout(7) << "file_eval wanted=" << gcap_string(wanted) + << " loner_wanted=" << gcap_string(loner_wanted) + << " other_wanted=" << gcap_string(other_wanted) + << " filelock=" << *lock << " on " << *lock->get_parent() + << dendl; + + ceph_assert(lock->get_parent()->is_auth()); + ceph_assert(lock->is_stable()); + + if (lock->get_parent()->is_freezing_or_frozen()) + return; + + if (mdcache->is_readonly()) { + 
if (lock->get_state() != LOCK_SYNC) { + dout(10) << "file_eval read-only FS, syncing " << *lock << " on " << *lock->get_parent() << dendl; + simple_sync(lock, need_issue); + } + return; + } + + // excl -> *? + if (lock->get_state() == LOCK_EXCL) { + dout(20) << " is excl" << dendl; + int loner_issued, other_issued, xlocker_issued; + in->get_caps_issued(&loner_issued, &other_issued, &xlocker_issued, CEPH_CAP_SFILE); + dout(7) << "file_eval loner_issued=" << gcap_string(loner_issued) + << " other_issued=" << gcap_string(other_issued) + << " xlocker_issued=" << gcap_string(xlocker_issued) + << dendl; + if (!((loner_wanted|loner_issued) & (CEPH_CAP_GEXCL|CEPH_CAP_GWR|CEPH_CAP_GBUFFER)) || + (other_wanted & (CEPH_CAP_GEXCL|CEPH_CAP_GWR|CEPH_CAP_GRD)) || + (in->inode.is_dir() && in->multiple_nonstale_caps())) { // FIXME.. :/ + dout(20) << " should lose it" << dendl; + // we should lose it. + // loner other want + // R R SYNC + // R R|W MIX + // R W MIX + // R|W R MIX + // R|W R|W MIX + // R|W W MIX + // W R MIX + // W R|W MIX + // W W MIX + // -> any writer means MIX; RD doesn't matter. + if (((other_wanted|loner_wanted) & CEPH_CAP_GWR) || + lock->is_waiter_for(SimpleLock::WAIT_WR)) + scatter_mix(lock, need_issue); + else if (!lock->is_wrlocked()) // let excl wrlocks drain first + simple_sync(lock, need_issue); + else + dout(10) << " waiting for wrlock to drain" << dendl; + } + } + + // * -> excl? + else if (lock->get_state() != LOCK_EXCL && + !lock->is_rdlocked() && + //!lock->is_waiter_for(SimpleLock::WAIT_WR) && + ((wanted & (CEPH_CAP_GWR|CEPH_CAP_GBUFFER)) || + (in->inode.is_dir() && !in->has_subtree_or_exporting_dirfrag())) && + in->get_target_loner() >= 0) { + dout(7) << "file_eval stable, bump to loner " << *lock + << " on " << *lock->get_parent() << dendl; + file_excl(lock, need_issue); + } + + // * -> mixed? 
+ else if (lock->get_state() != LOCK_MIX && + !lock->is_rdlocked() && + //!lock->is_waiter_for(SimpleLock::WAIT_WR) && + (lock->get_scatter_wanted() || + (in->get_target_loner() < 0 && (wanted & CEPH_CAP_GWR)))) { + dout(7) << "file_eval stable, bump to mixed " << *lock + << " on " << *lock->get_parent() << dendl; + scatter_mix(lock, need_issue); + } + + // * -> sync? + else if (lock->get_state() != LOCK_SYNC && + !lock->is_wrlocked() && // drain wrlocks first! + !lock->is_waiter_for(SimpleLock::WAIT_WR) && + !(wanted & CEPH_CAP_GWR) && + !((lock->get_state() == LOCK_MIX) && + in->is_dir() && in->has_subtree_or_exporting_dirfrag()) // if we are a delegation point, stay where we are + //((wanted & CEPH_CAP_RD) || + //in->is_replicated() || + //lock->is_leased() || + //(!loner && lock->get_state() == LOCK_EXCL)) && + ) { + dout(7) << "file_eval stable, bump to sync " << *lock + << " on " << *lock->get_parent() << dendl; + simple_sync(lock, need_issue); + } +} + + + +void Locker::scatter_mix(ScatterLock *lock, bool *need_issue) +{ + dout(7) << "scatter_mix " << *lock << " on " << *lock->get_parent() << dendl; + + CInode *in = static_cast<CInode*>(lock->get_parent()); + ceph_assert(in->is_auth()); + ceph_assert(lock->is_stable()); + + if (lock->get_state() == LOCK_LOCK) { + in->start_scatter(lock); + if (in->is_replicated()) { + // data + bufferlist softdata; + lock->encode_locked_state(softdata); + + // bcast to replicas + send_lock_message(lock, LOCK_AC_MIX, softdata); + } + + // change lock + lock->set_state(LOCK_MIX); + lock->clear_scatter_wanted(); + if (lock->get_cap_shift()) { + if (need_issue) + *need_issue = true; + else + issue_caps(in); + } + } else { + // gather? 
+ switch (lock->get_state()) { + case LOCK_SYNC: lock->set_state(LOCK_SYNC_MIX); break; + case LOCK_EXCL: lock->set_state(LOCK_EXCL_MIX); break; + case LOCK_XSYN: lock->set_state(LOCK_XSYN_MIX); break; + case LOCK_TSYN: lock->set_state(LOCK_TSYN_MIX); break; + default: ceph_abort(); + } + + int gather = 0; + if (lock->is_rdlocked()) + gather++; + if (in->is_replicated()) { + if (lock->get_state() == LOCK_SYNC_MIX) { // for the rest states, replicas are already LOCK + send_lock_message(lock, LOCK_AC_MIX); + lock->init_gather(); + gather++; + } + } + if (lock->is_leased()) { + revoke_client_leases(lock); + gather++; + } + if (lock->get_cap_shift() && + in->is_head() && + in->issued_caps_need_gather(lock)) { + if (need_issue) + *need_issue = true; + else + issue_caps(in); + gather++; + } + bool need_recover = false; + if (in->state_test(CInode::STATE_NEEDSRECOVER)) { + mds->mdcache->queue_file_recover(in); + need_recover = true; + gather++; + } + + if (gather) { + lock->get_parent()->auth_pin(lock); + if (need_recover) + mds->mdcache->do_file_recover(); + } else { + in->start_scatter(lock); + lock->set_state(LOCK_MIX); + lock->clear_scatter_wanted(); + if (in->is_replicated()) { + bufferlist softdata; + lock->encode_locked_state(softdata); + send_lock_message(lock, LOCK_AC_MIX, softdata); + } + if (lock->get_cap_shift()) { + if (need_issue) + *need_issue = true; + else + issue_caps(in); + } + } + } +} + + +void Locker::file_excl(ScatterLock *lock, bool *need_issue) +{ + CInode *in = static_cast<CInode*>(lock->get_parent()); + dout(7) << "file_excl " << *lock << " on " << *lock->get_parent() << dendl; + + ceph_assert(in->is_auth()); + ceph_assert(lock->is_stable()); + + ceph_assert((in->get_loner() >= 0 && in->get_mds_caps_wanted().empty()) || + (lock->get_state() == LOCK_XSYN)); // must do xsyn -> excl -> <anything else> + + switch (lock->get_state()) { + case LOCK_SYNC: lock->set_state(LOCK_SYNC_EXCL); break; + case LOCK_MIX: lock->set_state(LOCK_MIX_EXCL); break; + 
case LOCK_LOCK: lock->set_state(LOCK_LOCK_EXCL); break; + case LOCK_XSYN: lock->set_state(LOCK_XSYN_EXCL); break; + default: ceph_abort(); + } + int gather = 0; + + if (lock->is_rdlocked()) + gather++; + if (lock->is_wrlocked()) + gather++; + + if (in->is_replicated() && + lock->get_state() != LOCK_LOCK_EXCL && + lock->get_state() != LOCK_XSYN_EXCL) { // if we were lock, replicas are already lock. + send_lock_message(lock, LOCK_AC_LOCK); + lock->init_gather(); + gather++; + } + if (lock->is_leased()) { + revoke_client_leases(lock); + gather++; + } + if (in->is_head() && + in->issued_caps_need_gather(lock)) { + if (need_issue) + *need_issue = true; + else + issue_caps(in); + gather++; + } + bool need_recover = false; + if (in->state_test(CInode::STATE_NEEDSRECOVER)) { + mds->mdcache->queue_file_recover(in); + need_recover = true; + gather++; + } + + if (gather) { + lock->get_parent()->auth_pin(lock); + if (need_recover) + mds->mdcache->do_file_recover(); + } else { + lock->set_state(LOCK_EXCL); + if (need_issue) + *need_issue = true; + else + issue_caps(in); + } +} + +void Locker::file_xsyn(SimpleLock *lock, bool *need_issue) +{ + dout(7) << "file_xsyn on " << *lock << " on " << *lock->get_parent() << dendl; + CInode *in = static_cast<CInode *>(lock->get_parent()); + ceph_assert(in->is_auth()); + ceph_assert(in->get_loner() >= 0 && in->get_mds_caps_wanted().empty()); + + switch (lock->get_state()) { + case LOCK_EXCL: lock->set_state(LOCK_EXCL_XSYN); break; + default: ceph_abort(); + } + + int gather = 0; + if (lock->is_wrlocked()) + gather++; + + if (in->is_head() && + in->issued_caps_need_gather(lock)) { + if (need_issue) + *need_issue = true; + else + issue_caps(in); + gather++; + } + + if (gather) { + lock->get_parent()->auth_pin(lock); + } else { + lock->set_state(LOCK_XSYN); + lock->finish_waiters(SimpleLock::WAIT_RD|SimpleLock::WAIT_STABLE); + if (need_issue) + *need_issue = true; + else + issue_caps(in); + } +} + +void Locker::file_recover(ScatterLock *lock) 
+{ + CInode *in = static_cast<CInode *>(lock->get_parent()); + dout(7) << "file_recover " << *lock << " on " << *in << dendl; + + ceph_assert(in->is_auth()); + //assert(lock->is_stable()); + ceph_assert(lock->get_state() == LOCK_PRE_SCAN); // only called from MDCache::start_files_to_recover() + + int gather = 0; + + /* + if (in->is_replicated() + lock->get_sm()->states[oldstate].replica_state != LOCK_LOCK) { + send_lock_message(lock, LOCK_AC_LOCK); + lock->init_gather(); + gather++; + } + */ + if (in->is_head() && + in->issued_caps_need_gather(lock)) { + issue_caps(in); + gather++; + } + + lock->set_state(LOCK_SCAN); + if (gather) + in->state_set(CInode::STATE_NEEDSRECOVER); + else + mds->mdcache->queue_file_recover(in); +} + + +// messenger +void Locker::handle_file_lock(ScatterLock *lock, const MLock::const_ref &m) +{ + CInode *in = static_cast<CInode*>(lock->get_parent()); + int from = m->get_asker(); + + if (mds->is_rejoin()) { + if (in->is_rejoining()) { + dout(7) << "handle_file_lock still rejoining " << *in + << ", dropping " << *m << dendl; + return; + } + } + + dout(7) << "handle_file_lock a=" << lock->get_lock_action_name(m->get_action()) + << " on " << *lock + << " from mds." 
<< from << " " + << *in << dendl; + + bool caps = lock->get_cap_shift(); + + switch (m->get_action()) { + // -- replica -- + case LOCK_AC_SYNC: + ceph_assert(lock->get_state() == LOCK_LOCK || + lock->get_state() == LOCK_MIX || + lock->get_state() == LOCK_MIX_SYNC2); + + if (lock->get_state() == LOCK_MIX) { + lock->set_state(LOCK_MIX_SYNC); + eval_gather(lock, true); + if (lock->is_unstable_and_locked()) + mds->mdlog->flush(); + break; + } + + (static_cast<ScatterLock *>(lock))->finish_flush(); + (static_cast<ScatterLock *>(lock))->clear_flushed(); + + // ok + lock->decode_locked_state(m->get_data()); + lock->set_state(LOCK_SYNC); + + lock->get_rdlock(); + if (caps) + issue_caps(in); + lock->finish_waiters(SimpleLock::WAIT_RD|SimpleLock::WAIT_STABLE); + lock->put_rdlock(); + break; + + case LOCK_AC_LOCK: + switch (lock->get_state()) { + case LOCK_SYNC: lock->set_state(LOCK_SYNC_LOCK); break; + case LOCK_MIX: lock->set_state(LOCK_MIX_LOCK); break; + default: ceph_abort(); + } + + eval_gather(lock, true); + if (lock->is_unstable_and_locked()) + mds->mdlog->flush(); + + break; + + case LOCK_AC_LOCKFLUSHED: + (static_cast<ScatterLock *>(lock))->finish_flush(); + (static_cast<ScatterLock *>(lock))->clear_flushed(); + // wake up scatter_nudge waiters + if (lock->is_stable()) + lock->finish_waiters(SimpleLock::WAIT_STABLE); + break; + + case LOCK_AC_MIX: + ceph_assert(lock->get_state() == LOCK_SYNC || + lock->get_state() == LOCK_LOCK || + lock->get_state() == LOCK_SYNC_MIX2); + + if (lock->get_state() == LOCK_SYNC) { + // MIXED + lock->set_state(LOCK_SYNC_MIX); + eval_gather(lock, true); + if (lock->is_unstable_and_locked()) + mds->mdlog->flush(); + break; + } + + // ok + lock->set_state(LOCK_MIX); + lock->decode_locked_state(m->get_data()); + + if (caps) + issue_caps(in); + + lock->finish_waiters(SimpleLock::WAIT_WR|SimpleLock::WAIT_STABLE); + break; + + + // -- auth -- + case LOCK_AC_LOCKACK: + ceph_assert(lock->get_state() == LOCK_SYNC_LOCK || + lock->get_state() == 
LOCK_MIX_LOCK || + lock->get_state() == LOCK_MIX_LOCK2 || + lock->get_state() == LOCK_MIX_EXCL || + lock->get_state() == LOCK_SYNC_EXCL || + lock->get_state() == LOCK_SYNC_MIX || + lock->get_state() == LOCK_MIX_TSYN); + ceph_assert(lock->is_gathering(from)); + lock->remove_gather(from); + + if (lock->get_state() == LOCK_MIX_LOCK || + lock->get_state() == LOCK_MIX_LOCK2 || + lock->get_state() == LOCK_MIX_EXCL || + lock->get_state() == LOCK_MIX_TSYN) { + lock->decode_locked_state(m->get_data()); + // replica is waiting for AC_LOCKFLUSHED, eval_gather() should not + // delay calling scatter_writebehind(). + lock->clear_flushed(); + } + + if (lock->is_gathering()) { + dout(7) << "handle_file_lock " << *in << " from " << from + << ", still gathering " << lock->get_gather_set() << dendl; + } else { + dout(7) << "handle_file_lock " << *in << " from " << from + << ", last one" << dendl; + eval_gather(lock); + } + break; + + case LOCK_AC_SYNCACK: + ceph_assert(lock->get_state() == LOCK_MIX_SYNC); + ceph_assert(lock->is_gathering(from)); + lock->remove_gather(from); + + lock->decode_locked_state(m->get_data()); + + if (lock->is_gathering()) { + dout(7) << "handle_file_lock " << *in << " from " << from + << ", still gathering " << lock->get_gather_set() << dendl; + } else { + dout(7) << "handle_file_lock " << *in << " from " << from + << ", last one" << dendl; + eval_gather(lock); + } + break; + + case LOCK_AC_MIXACK: + ceph_assert(lock->get_state() == LOCK_SYNC_MIX); + ceph_assert(lock->is_gathering(from)); + lock->remove_gather(from); + + if (lock->is_gathering()) { + dout(7) << "handle_file_lock " << *in << " from " << from + << ", still gathering " << lock->get_gather_set() << dendl; + } else { + dout(7) << "handle_file_lock " << *in << " from " << from + << ", last one" << dendl; + eval_gather(lock); + } + break; + + + // requests.... + case LOCK_AC_REQSCATTER: + if (lock->is_stable()) { + /* NOTE: we can do this _even_ if !can_auth_pin (i.e. 
freezing) + * because the replica should be holding an auth_pin if they're + * doing this (and thus, we are freezing, not frozen, and indefinite + * starvation isn't an issue). + */ + dout(7) << "handle_file_lock got scatter request on " << *lock + << " on " << *lock->get_parent() << dendl; + if (lock->get_state() != LOCK_MIX) // i.e., the reqscatter didn't race with an actual mix/scatter + scatter_mix(lock); + } else { + dout(7) << "handle_file_lock got scatter request, !stable, marking scatter_wanted on " << *lock + << " on " << *lock->get_parent() << dendl; + lock->set_scatter_wanted(); + } + break; + + case LOCK_AC_REQUNSCATTER: + if (lock->is_stable()) { + /* NOTE: we can do this _even_ if !can_auth_pin (i.e. freezing) + * because the replica should be holding an auth_pin if they're + * doing this (and thus, we are freezing, not frozen, and indefinite + * starvation isn't an issue). + */ + dout(7) << "handle_file_lock got unscatter request on " << *lock + << " on " << *lock->get_parent() << dendl; + if (lock->get_state() == LOCK_MIX) // i.e., the reqscatter didn't race with an actual mix/scatter + simple_lock(lock); // FIXME tempsync? 
+ } else { + dout(7) << "handle_file_lock ignoring unscatter request on " << *lock + << " on " << *lock->get_parent() << dendl; + lock->set_unscatter_wanted(); + } + break; + + case LOCK_AC_REQRDLOCK: + handle_reqrdlock(lock, m); + break; + + case LOCK_AC_NUDGE: + if (!lock->get_parent()->is_auth()) { + dout(7) << "handle_file_lock IGNORING nudge on non-auth " << *lock + << " on " << *lock->get_parent() << dendl; + } else if (!lock->get_parent()->is_replicated()) { + dout(7) << "handle_file_lock IGNORING nudge on non-replicated " << *lock + << " on " << *lock->get_parent() << dendl; + } else { + dout(7) << "handle_file_lock trying nudge on " << *lock + << " on " << *lock->get_parent() << dendl; + scatter_nudge(lock, 0, true); + mds->mdlog->flush(); + } + break; + + default: + ceph_abort(); + } +} diff --git a/src/mds/Locker.h b/src/mds/Locker.h new file mode 100644 index 00000000..c4dd65ee --- /dev/null +++ b/src/mds/Locker.h @@ -0,0 +1,291 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef CEPH_MDS_LOCKER_H +#define CEPH_MDS_LOCKER_H + +#include "include/types.h" + +#include "messages/MClientCaps.h" +#include "messages/MClientCapRelease.h" +#include "messages/MClientLease.h" +#include "messages/MLock.h" + +#include <map> +#include <list> +#include <set> +#include <string_view> + +class MDSRank; +class Session; +class CDentry; +struct SnapRealm; + +class Capability; + +class SimpleLock; +class ScatterLock; +class LocalLock; + +#include "CInode.h" +#include "SimpleLock.h" +#include "MDSContext.h" +#include "Mutation.h" +#include "messages/MClientReply.h" + +class Locker { +private: + MDSRank *mds; + MDCache *mdcache; + + public: + Locker(MDSRank *m, MDCache *c); + + SimpleLock *get_lock(int lock_type, const MDSCacheObjectInfo &info); + + void dispatch(const Message::const_ref &m); + void handle_lock(const MLock::const_ref &m); + + void tick(); + + void nudge_log(SimpleLock *lock); + +protected: + void send_lock_message(SimpleLock *lock, int msg); + void send_lock_message(SimpleLock *lock, int msg, const bufferlist &data); + + // -- locks -- + void _drop_locks(MutationImpl *mut, std::set<CInode*> *pneed_issue, bool drop_rdlocks); +public: + void include_snap_rdlocks(CInode *in, MutationImpl::LockOpVec& lov); + void include_snap_rdlocks_wlayout(CInode *in, MutationImpl::LockOpVec& lov, + file_layout_t **layout); + + bool acquire_locks(MDRequestRef& mdr, + MutationImpl::LockOpVec& lov, + CInode *auth_pin_freeze=NULL, + bool auth_pin_nonblock=false); + + void notify_freeze_waiter(MDSCacheObject *o); + void cancel_locking(MutationImpl *mut, std::set<CInode*> *pneed_issue); + void drop_locks(MutationImpl *mut, std::set<CInode*> *pneed_issue=0); + void set_xlocks_done(MutationImpl *mut, bool skip_dentry=false); + void drop_non_rdlocks(MutationImpl *mut, std::set<CInode*> *pneed_issue=0); + void drop_rdlocks_for_early_reply(MutationImpl *mut); + void drop_locks_for_fragment_unfreeze(MutationImpl *mut); + + void eval_gather(SimpleLock *lock, 
bool first=false, bool *need_issue=0, MDSContext::vec *pfinishers=0); + void eval(SimpleLock *lock, bool *need_issue); + void eval_any(SimpleLock *lock, bool *need_issue, MDSContext::vec *pfinishers=0, bool first=false) { + if (!lock->is_stable()) + eval_gather(lock, first, need_issue, pfinishers); + else if (lock->get_parent()->is_auth()) + eval(lock, need_issue); + } + + void eval_scatter_gathers(CInode *in); + + void eval_cap_gather(CInode *in, std::set<CInode*> *issue_set=0); + + bool eval(CInode *in, int mask, bool caps_imported=false); + void try_eval(MDSCacheObject *p, int mask); + void try_eval(SimpleLock *lock, bool *pneed_issue); + + bool _rdlock_kick(SimpleLock *lock, bool as_anon); + bool rdlock_try(SimpleLock *lock, client_t client, MDSContext *c); + bool rdlock_start(SimpleLock *lock, MDRequestRef& mut, bool as_anon=false); + void rdlock_finish(const MutationImpl::lock_iterator& it, MutationImpl *mut, bool *pneed_issue); + bool can_rdlock_set(MutationImpl::LockOpVec& lov); + void rdlock_take_set(MutationImpl::LockOpVec& lov, MutationRef& mut); + + void wrlock_force(SimpleLock *lock, MutationRef& mut); + bool wrlock_start(SimpleLock *lock, MDRequestRef& mut, bool nowait=false); + void wrlock_finish(const MutationImpl::lock_iterator& it, MutationImpl *mut, bool *pneed_issue); + + void remote_wrlock_start(SimpleLock *lock, mds_rank_t target, MDRequestRef& mut); + void remote_wrlock_finish(const MutationImpl::lock_iterator& it, MutationImpl *mut); + + bool xlock_start(SimpleLock *lock, MDRequestRef& mut); + void _finish_xlock(SimpleLock *lock, client_t xlocker, bool *pneed_issue); + void xlock_finish(const MutationImpl::lock_iterator& it, MutationImpl *mut, bool *pneed_issue); + + void xlock_export(const MutationImpl::lock_iterator& it, MutationImpl *mut); + void xlock_import(SimpleLock *lock); + + + // simple +public: + void try_simple_eval(SimpleLock *lock); + bool simple_rdlock_try(SimpleLock *lock, MDSContext *con); +protected: + void 
simple_eval(SimpleLock *lock, bool *need_issue); + void handle_simple_lock(SimpleLock *lock, const MLock::const_ref &m); + +public: + bool simple_sync(SimpleLock *lock, bool *need_issue=0); +protected: + void simple_lock(SimpleLock *lock, bool *need_issue=0); + void simple_excl(SimpleLock *lock, bool *need_issue=0); + void simple_xlock(SimpleLock *lock); + + + // scatter +public: + void scatter_eval(ScatterLock *lock, bool *need_issue); // public for MDCache::adjust_subtree_auth() + + void scatter_tick(); + void scatter_nudge(ScatterLock *lock, MDSContext *c, bool forcelockchange=false); + +protected: + void handle_scatter_lock(ScatterLock *lock, const MLock::const_ref &m); + bool scatter_scatter_fastpath(ScatterLock *lock); + void scatter_scatter(ScatterLock *lock, bool nowait=false); + void scatter_tempsync(ScatterLock *lock, bool *need_issue=0); + + void scatter_writebehind(ScatterLock *lock); + + void scatter_writebehind_finish(ScatterLock *lock, MutationRef& mut); + + xlist<ScatterLock*> updated_scatterlocks; +public: + void mark_updated_scatterlock(ScatterLock *lock); + + + void handle_reqrdlock(SimpleLock *lock, const MLock::const_ref &m); + + + + // caps + + // when to defer processing client cap release or writeback due to being + // frozen. the condition must be consistent across handle_client_caps and + // process_request_cap_release to preserve ordering. 
+ bool should_defer_client_cap_frozen(CInode *in); + + void process_request_cap_release(MDRequestRef& mdr, client_t client, const ceph_mds_request_release& r, + std::string_view dname); + + void kick_cap_releases(MDRequestRef& mdr); + void kick_issue_caps(CInode *in, client_t client, ceph_seq_t seq); + + void remove_client_cap(CInode *in, Capability *cap, bool kill=false); + + void get_late_revoking_clients(std::list<client_t> *result, double timeout) const; + +private: + bool any_late_revoking_caps(xlist<Capability*> const &revoking, double timeout) const; + +protected: + bool _need_flush_mdlog(CInode *in, int wanted_caps); + void adjust_cap_wanted(Capability *cap, int wanted, int issue_seq); + void handle_client_caps(const MClientCaps::const_ref &m); + void _update_cap_fields(CInode *in, int dirty, const MClientCaps::const_ref &m, CInode::mempool_inode *pi); + void _do_snap_update(CInode *in, snapid_t snap, int dirty, snapid_t follows, client_t client, const MClientCaps::const_ref &m, const MClientCaps::ref &ack); + void _do_null_snapflush(CInode *head_in, client_t client, snapid_t last=CEPH_NOSNAP); + bool _do_cap_update(CInode *in, Capability *cap, int dirty, snapid_t follows, const MClientCaps::const_ref &m, + const MClientCaps::ref &ack, bool *need_flush=NULL); + void handle_client_cap_release(const MClientCapRelease::const_ref &m); + void _do_cap_release(client_t client, inodeno_t ino, uint64_t cap_id, ceph_seq_t mseq, ceph_seq_t seq); + void caps_tick(); + + // Maintain a global list to quickly find if any caps are late revoking + xlist<Capability*> revoking_caps; + // Maintain a per-client list to find clients responsible for late ones quickly + std::map<client_t, xlist<Capability*> > revoking_caps_by_client; + + elist<CInode*> need_snapflush_inodes; +public: + void snapflush_nudge(CInode *in); + void mark_need_snapflush_inode(CInode *in); + bool is_revoking_any_caps_from(client_t client); + + // local +public: + void local_wrlock_grab(LocalLock *lock, 
MutationRef& mut); +protected: + bool local_wrlock_start(LocalLock *lock, MDRequestRef& mut); + void local_wrlock_finish(const MutationImpl::lock_iterator& it, MutationImpl *mut); + bool local_xlock_start(LocalLock *lock, MDRequestRef& mut); + void local_xlock_finish(const MutationImpl::lock_iterator& it, MutationImpl *mut); + + + // file +public: + void file_eval(ScatterLock *lock, bool *need_issue); +protected: + void handle_file_lock(ScatterLock *lock, const MLock::const_ref &m); + void scatter_mix(ScatterLock *lock, bool *need_issue=0); + void file_excl(ScatterLock *lock, bool *need_issue=0); + void file_xsyn(SimpleLock *lock, bool *need_issue=0); + +public: + void file_recover(ScatterLock *lock); + +private: + xlist<ScatterLock*> updated_filelocks; +public: + void mark_updated_Filelock(ScatterLock *lock); + + // -- file i/o -- +public: + version_t issue_file_data_version(CInode *in); + Capability* issue_new_caps(CInode *in, int mode, Session *session, SnapRealm *conrealm, bool is_replay); + int issue_caps(CInode *in, Capability *only_cap=0); + void issue_caps_set(std::set<CInode*>& inset); + void issue_truncate(CInode *in); + void revoke_stale_cap(CInode *in, client_t client); + bool revoke_stale_caps(Session *session); + void resume_stale_caps(Session *session); + void remove_stale_leases(Session *session); + +public: + void request_inode_file_caps(CInode *in); +protected: + void handle_inode_file_caps(const MInodeFileCaps::const_ref &m); + + void file_update_finish(CInode *in, MutationRef& mut, unsigned flags, + client_t client, const MClientCaps::ref &ack); +private: + uint64_t calc_new_max_size(CInode::mempool_inode *pi, uint64_t size); +public: + void calc_new_client_ranges(CInode *in, uint64_t size, bool update, + CInode::mempool_inode::client_range_map* new_ranges, + bool *max_increased); + bool check_inode_max_size(CInode *in, bool force_wrlock=false, + uint64_t newmax=0, uint64_t newsize=0, + utime_t mtime=utime_t()); + void 
share_inode_max_size(CInode *in, Capability *only_cap=0); + +private: + friend class C_MDL_CheckMaxSize; + friend class C_MDL_RequestInodeFileCaps; + friend class C_Locker_FileUpdate_finish; + friend class C_Locker_RetryCapRelease; + friend class C_Locker_Eval; + friend class C_Locker_ScatterWB; + friend class LockerContext; + friend class LockerLogContext; + + + // -- client leases -- +public: + void handle_client_lease(const MClientLease::const_ref &m); + + void issue_client_lease(CDentry *dn, client_t client, bufferlist &bl, utime_t now, Session *session); + void revoke_client_leases(SimpleLock *lock); + static void encode_lease(bufferlist& bl, const session_info_t& info, const LeaseStat& ls); +}; + + +#endif diff --git a/src/mds/LogEvent.cc b/src/mds/LogEvent.cc new file mode 100644 index 00000000..3e321531 --- /dev/null +++ b/src/mds/LogEvent.cc @@ -0,0 +1,209 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#include "common/config.h" +#include "LogEvent.h" + +#include "MDSRank.h" + +// events i know of +#include "events/ESubtreeMap.h" +#include "events/EExport.h" +#include "events/EImportStart.h" +#include "events/EImportFinish.h" +#include "events/EFragment.h" + +#include "events/EResetJournal.h" +#include "events/ESession.h" +#include "events/ESessions.h" + +#include "events/EUpdate.h" +#include "events/ESlaveUpdate.h" +#include "events/EOpen.h" +#include "events/ECommitted.h" + +#include "events/ETableClient.h" +#include "events/ETableServer.h" + +#include "events/ENoOp.h" + +#define dout_context g_ceph_context + + +std::unique_ptr<LogEvent> LogEvent::decode_event(bufferlist::const_iterator p) +{ + // parse type, length + EventType type; + std::unique_ptr<LogEvent> event; + using ceph::decode; + decode(type, p); + + if (EVENT_NEW_ENCODING == type) { + try { + DECODE_START(1, p); + decode(type, p); + event = decode_event(p, type); + DECODE_FINISH(p); + } + catch (const buffer::error &e) { + generic_dout(0) << "failed to decode LogEvent (type maybe " << type << ")" << dendl; + return NULL; + } + } else { // we are using classic encoding + event = decode_event(p, type); + } + return event; +} + + +std::string_view LogEvent::get_type_str() const +{ + switch(_type) { + case EVENT_SUBTREEMAP: return "SUBTREEMAP"; + case EVENT_SUBTREEMAP_TEST: return "SUBTREEMAP_TEST"; + case EVENT_EXPORT: return "EXPORT"; + case EVENT_IMPORTSTART: return "IMPORTSTART"; + case EVENT_IMPORTFINISH: return "IMPORTFINISH"; + case EVENT_FRAGMENT: return "FRAGMENT"; + case EVENT_RESETJOURNAL: return "RESETJOURNAL"; + case EVENT_SESSION: return "SESSION"; + case EVENT_SESSIONS_OLD: return "SESSIONS_OLD"; + case EVENT_SESSIONS: return "SESSIONS"; + case EVENT_UPDATE: return "UPDATE"; + case EVENT_SLAVEUPDATE: return "SLAVEUPDATE"; + case EVENT_OPEN: return "OPEN"; + case EVENT_COMMITTED: return "COMMITTED"; + case EVENT_TABLECLIENT: return "TABLECLIENT"; + case EVENT_TABLESERVER: 
return "TABLESERVER"; + case EVENT_NOOP: return "NOOP"; + + default: + generic_dout(0) << "get_type_str: unknown type " << _type << dendl; + return "UNKNOWN"; + } +} + +const std::map<std::string, LogEvent::EventType> LogEvent::types = { + {"SUBTREEMAP", EVENT_SUBTREEMAP}, + {"SUBTREEMAP_TEST", EVENT_SUBTREEMAP_TEST}, + {"EXPORT", EVENT_EXPORT}, + {"IMPORTSTART", EVENT_IMPORTSTART}, + {"IMPORTFINISH", EVENT_IMPORTFINISH}, + {"FRAGMENT", EVENT_FRAGMENT}, + {"RESETJOURNAL", EVENT_RESETJOURNAL}, + {"SESSION", EVENT_SESSION}, + {"SESSIONS_OLD", EVENT_SESSIONS_OLD}, + {"SESSIONS", EVENT_SESSIONS}, + {"UPDATE", EVENT_UPDATE}, + {"SLAVEUPDATE", EVENT_SLAVEUPDATE}, + {"OPEN", EVENT_OPEN}, + {"COMMITTED", EVENT_COMMITTED}, + {"TABLECLIENT", EVENT_TABLECLIENT}, + {"TABLESERVER", EVENT_TABLESERVER}, + {"NOOP", EVENT_NOOP} +}; + +/* + * Resolve type string to type enum + * + * Return -1 if not found + */ +LogEvent::EventType LogEvent::str_to_type(std::string_view str) +{ + return LogEvent::types.at(std::string(str)); +} + + +std::unique_ptr<LogEvent> LogEvent::decode_event(bufferlist::const_iterator& p, LogEvent::EventType type) +{ + const auto length = p.get_remaining(); + generic_dout(15) << "decode_log_event type " << type << ", size " << length << dendl; + + // create event + std::unique_ptr<LogEvent> le; + switch (type) { + case EVENT_SUBTREEMAP: + le = std::make_unique<ESubtreeMap>(); + break; + case EVENT_SUBTREEMAP_TEST: + le = std::make_unique<ESubtreeMap>(); + le->set_type(type); + break; + case EVENT_EXPORT: + le = std::make_unique<EExport>(); + break; + case EVENT_IMPORTSTART: + le = std::make_unique<EImportStart>(); + break; + case EVENT_IMPORTFINISH: + le = std::make_unique<EImportFinish>(); + break; + case EVENT_FRAGMENT: + le = std::make_unique<EFragment>(); + break; + case EVENT_RESETJOURNAL: + le = std::make_unique<EResetJournal>(); + break; + case EVENT_SESSION: + le = std::make_unique<ESession>(); + break; + case EVENT_SESSIONS_OLD: + { + auto e = 
std::make_unique<ESessions>(); + e->mark_old_encoding(); + le = std::move(e); + } + break; + case EVENT_SESSIONS: + le = std::make_unique<ESessions>(); + break; + case EVENT_UPDATE: + le = std::make_unique<EUpdate>(); + break; + case EVENT_SLAVEUPDATE: + le = std::make_unique<ESlaveUpdate>(); + break; + case EVENT_OPEN: + le = std::make_unique<EOpen>(); + break; + case EVENT_COMMITTED: + le = std::make_unique<ECommitted>(); + break; + case EVENT_TABLECLIENT: + le = std::make_unique<ETableClient>(); + break; + case EVENT_TABLESERVER: + le = std::make_unique<ETableServer>(); + break; + case EVENT_NOOP: + le = std::make_unique<ENoOp>(); + break; + default: + generic_dout(0) << "uh oh, unknown log event type " << type << " length " << length << dendl; + return nullptr; + } + + // decode + try { + le->decode(p); + } + catch (const buffer::error &e) { + generic_dout(0) << "failed to decode LogEvent type " << type << dendl; + return nullptr; + } + + ceph_assert(p.end()); + return le; +} + diff --git a/src/mds/LogEvent.h b/src/mds/LogEvent.h new file mode 100644 index 00000000..7c7273f8 --- /dev/null +++ b/src/mds/LogEvent.h @@ -0,0 +1,132 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef CEPH_LOGEVENT_H +#define CEPH_LOGEVENT_H + +#define EVENT_NEW_ENCODING 0 // indicates that the encoding is versioned +#define EVENT_UNUSED 1 // was previously EVENT_STRING + +#define EVENT_SUBTREEMAP 2 +#define EVENT_EXPORT 3 +#define EVENT_IMPORTSTART 4 +#define EVENT_IMPORTFINISH 5 +#define EVENT_FRAGMENT 6 + +#define EVENT_RESETJOURNAL 9 + +#define EVENT_SESSION 10 +#define EVENT_SESSIONS_OLD 11 +#define EVENT_SESSIONS 12 + +#define EVENT_UPDATE 20 +#define EVENT_SLAVEUPDATE 21 +#define EVENT_OPEN 22 +#define EVENT_COMMITTED 23 + +#define EVENT_TABLECLIENT 42 +#define EVENT_TABLESERVER 43 + +#define EVENT_SUBTREEMAP_TEST 50 +#define EVENT_NOOP 51 + + +#include "include/buffer_fwd.h" +#include "include/Context.h" +#include "include/utime.h" + +class MDSRank; +class LogSegment; +class EMetaBlob; + +// generic log event +class LogEvent { +public: + friend class MDLog; + typedef __u32 EventType; + + LogEvent() = delete; + explicit LogEvent(int t) : _type(t) {} + LogEvent(const LogEvent&) = delete; + LogEvent& operator=(const LogEvent&) = delete; + virtual ~LogEvent() {} + + std::string_view get_type_str() const; + static EventType str_to_type(std::string_view str); + EventType get_type() const { return _type; } + void set_type(EventType t) { _type = t; } + + uint64_t get_start_off() const { return _start_off; } + void set_start_off(uint64_t o) { _start_off = o; } + + utime_t get_stamp() const { return stamp; } + void set_stamp(utime_t t) { stamp = t; } + + // encoding + virtual void encode(bufferlist& bl, uint64_t features) const = 0; + virtual void decode(bufferlist::const_iterator &) = 0; + static std::unique_ptr<LogEvent> decode_event(bufferlist::const_iterator); + virtual void dump(Formatter *f) const = 0; + + void encode_with_header(bufferlist& bl, uint64_t features) { + using ceph::encode; + encode(EVENT_NEW_ENCODING, bl); + ENCODE_START(1, 1, bl) + encode(_type, bl); + this->encode(bl, features); + ENCODE_FINISH(bl); + } + + virtual void 
print(ostream& out) const { + out << "event(" << _type << ")"; + } + + /*** live journal ***/ + /* update_segment() - adjust any state we need to in the LogSegment + */ + virtual void update_segment() { } + + /*** recovery ***/ + /* replay() - replay given event. this is idempotent. + */ + virtual void replay(MDSRank *m) { ceph_abort(); } + + /** + * If the subclass embeds a MetaBlob, return it here so that + * tools can examine metablobs while traversing lists of LogEvent. + */ + virtual EMetaBlob *get_metablob() { return NULL; } + +protected: + utime_t stamp; + + LogSegment* get_segment() { return _segment; } + LogSegment const* get_segment() const { return _segment; } + +private: + static const std::map<std::string, LogEvent::EventType> types; + + static std::unique_ptr<LogEvent> decode_event(bufferlist::const_iterator&, EventType); + + EventType _type = 0; + uint64_t _start_off = 0; + LogSegment *_segment = nullptr; +}; + +inline ostream& operator<<(ostream& out, const LogEvent &le) { + le.print(out); + return out; +} + +#endif diff --git a/src/mds/LogSegment.h b/src/mds/LogSegment.h new file mode 100644 index 00000000..c1c8e7ea --- /dev/null +++ b/src/mds/LogSegment.h @@ -0,0 +1,98 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef CEPH_LOGSEGMENT_H +#define CEPH_LOGSEGMENT_H + +#include "include/elist.h" +#include "include/interval_set.h" +#include "include/Context.h" +#include "MDSContext.h" +#include "mdstypes.h" +#include "CInode.h" +#include "CDentry.h" +#include "CDir.h" + +#include "include/unordered_set.h" + +using ceph::unordered_set; + +class CDir; +class CInode; +class CDentry; +class MDSRank; +struct MDSlaveUpdate; + +class LogSegment { + public: + using seq_t = uint64_t; + + LogSegment(uint64_t _seq, loff_t off=-1) : + seq(_seq), offset(off), end(off), + dirty_dirfrags(member_offset(CDir, item_dirty)), + new_dirfrags(member_offset(CDir, item_new)), + dirty_inodes(member_offset(CInode, item_dirty)), + dirty_dentries(member_offset(CDentry, item_dirty)), + open_files(member_offset(CInode, item_open_file)), + dirty_parent_inodes(member_offset(CInode, item_dirty_parent)), + dirty_dirfrag_dir(member_offset(CInode, item_dirty_dirfrag_dir)), + dirty_dirfrag_nest(member_offset(CInode, item_dirty_dirfrag_nest)), + dirty_dirfrag_dirfragtree(member_offset(CInode, item_dirty_dirfrag_dirfragtree)) + {} + + void try_to_expire(MDSRank *mds, MDSGatherBuilder &gather_bld, int op_prio); + + void wait_for_expiry(MDSContext *c) + { + ceph_assert(c != NULL); + expiry_waiters.push_back(c); + } + + const seq_t seq; + uint64_t offset, end; + int num_events = 0; + + // dirty items + elist<CDir*> dirty_dirfrags, new_dirfrags; + elist<CInode*> dirty_inodes; + elist<CDentry*> dirty_dentries; + + elist<CInode*> open_files; + elist<CInode*> dirty_parent_inodes; + elist<CInode*> dirty_dirfrag_dir; + elist<CInode*> dirty_dirfrag_nest; + elist<CInode*> dirty_dirfrag_dirfragtree; + + set<CInode*> truncating_inodes; + + map<int, ceph::unordered_set<version_t> > pending_commit_tids; // mdstable + set<metareqid_t> uncommitted_masters; + set<metareqid_t> uncommitted_slaves; + set<dirfrag_t> uncommitted_fragments; + + // client request ids + map<int, ceph_tid_t> last_client_tids; + + // potentially 
dirty sessions + std::set<entity_name_t> touched_sessions; + + // table version + version_t inotablev = 0; + version_t sessionmapv = 0; + map<int,version_t> tablev; + + MDSContext::vec expiry_waiters; +}; + +#endif diff --git a/src/mds/MDBalancer.cc b/src/mds/MDBalancer.cc new file mode 100644 index 00000000..b6f6bbc1 --- /dev/null +++ b/src/mds/MDBalancer.cc @@ -0,0 +1,1456 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "include/compat.h" +#include "mdstypes.h" + +#include "mon/MonClient.h" +#include "MDBalancer.h" +#include "MDSRank.h" +#include "MDSMap.h" +#include "CInode.h" +#include "CDir.h" +#include "MDCache.h" +#include "Migrator.h" +#include "Mantle.h" + +#include "include/Context.h" +#include "msg/Messenger.h" + +#include <fstream> +#include <iostream> +#include <vector> +#include <map> +using std::map; +using std::vector; +using std::chrono::duration_cast; + +#include "common/config.h" +#include "common/errno.h" + +#define dout_context g_ceph_context +#undef dout_prefix +#define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".bal " +#undef dout +#define dout(lvl) \ + do {\ + auto subsys = ceph_subsys_mds;\ + if ((dout_context)->_conf->subsys.should_gather(ceph_subsys_mds_balancer, lvl)) {\ + subsys = ceph_subsys_mds_balancer;\ + }\ + dout_impl(dout_context, ceph::dout::need_dynamic(subsys), lvl) dout_prefix +#undef dendl +#define dendl dendl_impl; } while (0) + + +#define MIN_LOAD 50 // ?? 
+#define MIN_REEXPORT 5 // will automatically reexport +#define MIN_OFFLOAD 10 // point at which i stop trying, close enough + + +int MDBalancer::proc_message(const Message::const_ref &m) +{ + switch (m->get_type()) { + + case MSG_MDS_HEARTBEAT: + handle_heartbeat(MHeartbeat::msgref_cast(m)); + break; + + default: + derr << " balancer unknown message " << m->get_type() << dendl_impl; + ceph_abort_msg("balancer unknown message"); + } + + return 0; +} + +MDBalancer::MDBalancer(MDSRank *m, Messenger *msgr, MonClient *monc) : + mds(m), messenger(msgr), mon_client(monc) +{ + bal_fragment_dirs = g_conf().get_val<bool>("mds_bal_fragment_dirs"); + bal_fragment_interval = g_conf().get_val<int64_t>("mds_bal_fragment_interval"); +} + +void MDBalancer::handle_conf_change(const std::set<std::string>& changed, const MDSMap& mds_map) +{ + if (changed.count("mds_bal_fragment_dirs")) + bal_fragment_dirs = g_conf().get_val<bool>("mds_bal_fragment_dirs"); + if (changed.count("mds_bal_fragment_interval")) + bal_fragment_interval = g_conf().get_val<int64_t>("mds_bal_fragment_interval"); +} + +void MDBalancer::handle_export_pins(void) +{ + auto &q = mds->mdcache->export_pin_queue; + auto it = q.begin(); + dout(20) << "export_pin_queue size=" << q.size() << dendl; + while (it != q.end()) { + auto cur = it++; + CInode *in = *cur; + ceph_assert(in->is_dir()); + mds_rank_t export_pin = in->get_export_pin(false); + if (export_pin >= mds->mdsmap->get_max_mds()) { + dout(20) << " delay export pin on " << *in << dendl; + in->state_clear(CInode::STATE_QUEUEDEXPORTPIN); + q.erase(cur); + + in->state_set(CInode::STATE_DELAYEDEXPORTPIN); + mds->mdcache->export_pin_delayed_queue.insert(in); + continue; + } + + bool remove = true; + list<CDir*> dfls; + in->get_dirfrags(dfls); + for (auto dir : dfls) { + if (!dir->is_auth()) + continue; + + if (export_pin == MDS_RANK_NONE) { + if (dir->state_test(CDir::STATE_AUXSUBTREE)) { + if (dir->is_frozen() || dir->is_freezing()) { + // try again later + remove = 
false; + continue; + } + dout(10) << " clear auxsubtree on " << *dir << dendl; + dir->state_clear(CDir::STATE_AUXSUBTREE); + mds->mdcache->try_subtree_merge(dir); + } + } else if (export_pin == mds->get_nodeid()) { + if (dir->state_test(CDir::STATE_CREATING) || + dir->is_frozen() || dir->is_freezing()) { + // try again later + remove = false; + continue; + } + if (!dir->is_subtree_root()) { + dir->state_set(CDir::STATE_AUXSUBTREE); + mds->mdcache->adjust_subtree_auth(dir, mds->get_nodeid()); + dout(10) << " create aux subtree on " << *dir << dendl; + } else if (!dir->state_test(CDir::STATE_AUXSUBTREE)) { + dout(10) << " set auxsubtree bit on " << *dir << dendl; + dir->state_set(CDir::STATE_AUXSUBTREE); + } + } else { + mds->mdcache->migrator->export_dir(dir, export_pin); + remove = false; + } + } + + if (remove) { + in->state_clear(CInode::STATE_QUEUEDEXPORTPIN); + q.erase(cur); + } + } + + std::vector<CDir *> authsubs = mds->mdcache->get_auth_subtrees(); + bool print_auth_subtrees = true; + + if (authsubs.size() > AUTH_TREES_THRESHOLD && + !g_conf()->subsys.should_gather<ceph_subsys_mds, 25>()) { + dout(15) << "number of auth trees = " << authsubs.size() << "; not " + "printing auth trees" << dendl; + print_auth_subtrees = false; + } + + for (auto &cd : authsubs) { + mds_rank_t export_pin = cd->inode->get_export_pin(); + + if (print_auth_subtrees) { + dout(25) << "auth tree " << *cd << " export_pin=" << export_pin << + dendl; + } + + if (export_pin >= 0 && export_pin < mds->mdsmap->get_max_mds() + && export_pin != mds->get_nodeid()) { + mds->mdcache->migrator->export_dir(cd, export_pin); + } + } +} + +void MDBalancer::tick() +{ + static int num_bal_times = g_conf()->mds_bal_max; + auto bal_interval = g_conf().get_val<int64_t>("mds_bal_interval"); + auto bal_max_until = g_conf().get_val<int64_t>("mds_bal_max_until"); + time now = clock::now(); + + if (g_conf()->mds_bal_export_pin) { + handle_export_pins(); + } + + // sample? 
+ if (chrono::duration<double>(now-last_sample).count() > + g_conf()->mds_bal_sample_interval) { + dout(15) << "tick last_sample now " << now << dendl; + last_sample = now; + } + + // We can use duration_cast below, although the result is an int, + // because the values from g_conf are also integers. + // balance? + if (mds->get_nodeid() == 0 + && mds->is_active() + && bal_interval > 0 + && duration_cast<chrono::seconds>(now - last_heartbeat).count() >= bal_interval + && (num_bal_times || (bal_max_until >= 0 && mds->get_uptime().count() > bal_max_until))) { + last_heartbeat = now; + send_heartbeat(); + num_bal_times--; + } + + mds->mdcache->show_subtrees(10, true); +} + + + + +class C_Bal_SendHeartbeat : public MDSInternalContext { +public: + explicit C_Bal_SendHeartbeat(MDSRank *mds_) : MDSInternalContext(mds_) { } + void finish(int f) override { + mds->balancer->send_heartbeat(); + } +}; + + +double mds_load_t::mds_load() const +{ + switch(g_conf()->mds_bal_mode) { + case 0: + return + .8 * auth.meta_load() + + .2 * all.meta_load() + + req_rate + + 10.0 * queue_len; + + case 1: + return req_rate + 10.0*queue_len; + + case 2: + return cpu_load_avg; + + } + ceph_abort(); + return 0; +} + +mds_load_t MDBalancer::get_load() +{ + auto now = clock::now(); + + mds_load_t load{DecayRate()}; /* zero DecayRate! 
*/ + + if (mds->mdcache->get_root()) { + list<CDir*> ls; + mds->mdcache->get_root()->get_dirfrags(ls); + for (auto &d : ls) { + load.auth.add(d->pop_auth_subtree_nested); + load.all.add(d->pop_nested); + } + } else { + dout(20) << "get_load no root, no load" << dendl; + } + + uint64_t num_requests = mds->get_num_requests(); + + uint64_t cpu_time = 1; + { + string stat_path = PROCPREFIX "/proc/self/stat"; + ifstream stat_file(stat_path); + if (stat_file.is_open()) { + vector<string> stat_vec(std::istream_iterator<string>{stat_file}, + std::istream_iterator<string>()); + if (stat_vec.size() >= 15) { + // utime + stime + cpu_time = strtoll(stat_vec[13].c_str(), nullptr, 10) + + strtoll(stat_vec[14].c_str(), nullptr, 10); + } else { + derr << "input file '" << stat_path << "' not resolvable" << dendl_impl; + } + } else { + derr << "input file '" << stat_path << "' not found" << dendl_impl; + } + } + + load.queue_len = messenger->get_dispatch_queue_len(); + + bool update_last = true; + if (last_get_load != clock::zero() && + now > last_get_load) { + double el = std::chrono::duration<double>(now-last_get_load).count(); + if (el >= 1.0) { + if (num_requests > last_num_requests) + load.req_rate = (num_requests - last_num_requests) / el; + if (cpu_time > last_cpu_time) + load.cpu_load_avg = (cpu_time - last_cpu_time) / el; + } else { + auto p = mds_load.find(mds->get_nodeid()); + if (p != mds_load.end()) { + load.req_rate = p->second.req_rate; + load.cpu_load_avg = p->second.cpu_load_avg; + } + if (num_requests >= last_num_requests && cpu_time >= last_cpu_time) + update_last = false; + } + } + + if (update_last) { + last_num_requests = num_requests; + last_cpu_time = cpu_time; + last_get_load = now; + } + + dout(15) << "get_load " << load << dendl; + return load; +} + +/* + * Read synchronously from RADOS using a timeout. We cannot do daemon-local + * fallbacks (i.e. 
kick off async read when we are processing the map and + * check status when we get here) with the way the mds is structured. + */ +int MDBalancer::localize_balancer() +{ + /* reset everything */ + bool ack = false; + int r = 0; + bufferlist lua_src; + Mutex lock("lock"); + Cond cond; + + /* we assume that balancer is in the metadata pool */ + object_t oid = object_t(mds->mdsmap->get_balancer()); + object_locator_t oloc(mds->mdsmap->get_metadata_pool()); + ceph_tid_t tid = mds->objecter->read(oid, oloc, 0, 0, CEPH_NOSNAP, &lua_src, 0, + new C_SafeCond(&lock, &cond, &ack, &r)); + dout(15) << "launched non-blocking read tid=" << tid + << " oid=" << oid << " oloc=" << oloc << dendl; + + /* timeout: if we waste half our time waiting for RADOS, then abort! */ + auto bal_interval = g_conf().get_val<int64_t>("mds_bal_interval"); + lock.Lock(); + int ret_t = cond.WaitInterval(lock, utime_t(bal_interval / 2, 0)); + lock.Unlock(); + + /* success: store the balancer in memory and set the version. */ + if (!r) { + if (ret_t == ETIMEDOUT) { + mds->objecter->op_cancel(tid, -ECANCELED); + return -ETIMEDOUT; + } + bal_code.assign(lua_src.to_str()); + bal_version.assign(oid.name); + dout(10) << "localized balancer, bal_code=" << bal_code << dendl; + } + return r; +} + +void MDBalancer::send_heartbeat() +{ + if (mds->is_cluster_degraded()) { + dout(10) << "send_heartbeat degraded" << dendl; + return; + } + + if (!mds->mdcache->is_open()) { + dout(5) << "not open" << dendl; + mds->mdcache->wait_for_open(new C_Bal_SendHeartbeat(mds)); + return; + } + + if (mds->get_nodeid() == 0) { + beat_epoch++; + mds_load.clear(); + } + + // my load + mds_load_t load = get_load(); + mds->logger->set(l_mds_load_cent, 100 * load.mds_load()); + mds->logger->set(l_mds_dispatch_queue_len, load.queue_len); + + auto em = mds_load.emplace(std::piecewise_construct, std::forward_as_tuple(mds->get_nodeid()), std::forward_as_tuple(load)); + if (!em.second) { + em.first->second = load; + } + + // import_map -- 
how much do i import from whom + map<mds_rank_t, float> import_map; + for (auto& im : mds->mdcache->get_auth_subtrees()) { + mds_rank_t from = im->inode->authority().first; + if (from == mds->get_nodeid()) continue; + if (im->get_inode()->is_stray()) continue; + import_map[from] += im->pop_auth_subtree.meta_load(); + } + mds_import_map[ mds->get_nodeid() ] = import_map; + + + dout(5) << "mds." << mds->get_nodeid() << " epoch " << beat_epoch << " load " << load << dendl; + for (map<mds_rank_t, float>::iterator it = import_map.begin(); + it != import_map.end(); + ++it) { + dout(5) << " import_map from " << it->first << " -> " << it->second << dendl; + } + + + set<mds_rank_t> up; + mds->get_mds_map()->get_up_mds_set(up); + for (const auto& r : up) { + if (r == mds->get_nodeid()) + continue; + auto hb = MHeartbeat::create(load, beat_epoch); + hb->get_import_map() = import_map; + mds->send_message_mds(hb, r); + } +} + +void MDBalancer::handle_heartbeat(const MHeartbeat::const_ref &m) +{ + mds_rank_t who = mds_rank_t(m->get_source().num()); + dout(25) << "=== got heartbeat " << m->get_beat() << " from " << m->get_source().num() << " " << m->get_load() << dendl; + + if (!mds->is_active()) + return; + + if (!mds->mdcache->is_open()) { + dout(10) << "opening root on handle_heartbeat" << dendl; + mds->mdcache->wait_for_open(new C_MDS_RetryMessage(mds, m)); + return; + } + + if (mds->is_cluster_degraded()) { + dout(10) << " degraded, ignoring" << dendl; + return; + } + + if (mds->get_nodeid() != 0 && m->get_beat() > beat_epoch) { + dout(10) << "receive next epoch " << m->get_beat() << " from mds." 
<< who << " before mds0" << dendl; + + beat_epoch = m->get_beat(); + // clear the mds load info whose epoch is less than beat_epoch + mds_load.clear(); + } + + if (who == 0) { + dout(20) << " from mds0, new epoch " << m->get_beat() << dendl; + if (beat_epoch != m->get_beat()) { + beat_epoch = m->get_beat(); + mds_load.clear(); + } + + send_heartbeat(); + + mds->mdcache->show_subtrees(); + } else if (mds->get_nodeid() == 0) { + if (beat_epoch != m->get_beat()) { + dout(10) << " old heartbeat epoch, ignoring" << dendl; + return; + } + } + + { + auto em = mds_load.emplace(std::piecewise_construct, std::forward_as_tuple(who), std::forward_as_tuple(m->get_load())); + if (!em.second) { + em.first->second = m->get_load(); + } + } + mds_import_map[who] = m->get_import_map(); + + { + unsigned cluster_size = mds->get_mds_map()->get_num_in_mds(); + if (mds_load.size() == cluster_size) { + // let's go! + //export_empties(); // no! + + /* avoid spamming ceph -w if user does not turn mantle on */ + if (mds->mdsmap->get_balancer() != "") { + int r = mantle_prep_rebalance(); + if (!r) return; + mds->clog->warn() << "using old balancer; mantle failed for " + << "balancer=" << mds->mdsmap->get_balancer() + << " : " << cpp_strerror(r); + } + prep_rebalance(m->get_beat()); + } + } +} + +double MDBalancer::try_match(balance_state_t& state, mds_rank_t ex, double& maxex, + mds_rank_t im, double& maxim) +{ + if (maxex <= 0 || maxim <= 0) return 0.0; + + double howmuch = std::min(maxex, maxim); + if (howmuch <= 0) return 0.0; + + dout(5) << " - mds." << ex << " exports " << howmuch << " to mds." 
<< im << dendl; + + if (ex == mds->get_nodeid()) + state.targets[im] += howmuch; + + state.exported[ex] += howmuch; + state.imported[im] += howmuch; + + maxex -= howmuch; + maxim -= howmuch; + + return howmuch; +} + +void MDBalancer::queue_split(const CDir *dir, bool fast) +{ + dout(10) << __func__ << " enqueuing " << *dir + << " (fast=" << fast << ")" << dendl; + + const dirfrag_t frag = dir->dirfrag(); + + auto callback = [this, frag](int r) { + if (split_pending.erase(frag) == 0) { + // Someone beat me to it. This can happen in the fast splitting + // path, because we spawn two contexts, one with mds->timer and + // one with mds->queue_waiter. The loser can safely just drop + // out. + return; + } + + CDir *split_dir = mds->mdcache->get_dirfrag(frag); + if (!split_dir) { + dout(10) << "drop split on " << frag << " because not in cache" << dendl; + return; + } + if (!split_dir->is_auth()) { + dout(10) << "drop split on " << frag << " because non-auth" << dendl; + return; + } + + // Pass on to MDCache: note that the split might still not + // happen if the checks in MDCache::can_fragment fail. + dout(10) << __func__ << " splitting " << *split_dir << dendl; + mds->mdcache->split_dir(split_dir, g_conf()->mds_bal_split_bits); + }; + + bool is_new = false; + if (split_pending.count(frag) == 0) { + split_pending.insert(frag); + is_new = true; + } + + if (fast) { + // Do the split ASAP: enqueue it in the MDSRank waiters which are + // run at the end of dispatching the current request + mds->queue_waiter(new MDSInternalContextWrapper(mds, + new FunctionContext(callback))); + } else if (is_new) { + // Set a timer to really do the split: we don't do it immediately + // so that bursts of ops on a directory have a chance to go through + // before we freeze it. 
+ mds->timer.add_event_after(bal_fragment_interval, + new FunctionContext(callback)); + } +} + +void MDBalancer::queue_merge(CDir *dir) +{ + const auto frag = dir->dirfrag(); + auto callback = [this, frag](int r) { + ceph_assert(frag.frag != frag_t()); + + // frag must be in this set because only one context is in flight + // for a given frag at a time (because merge_pending is checked before + // starting one), and this context is the only one that erases it. + merge_pending.erase(frag); + + CDir *dir = mds->mdcache->get_dirfrag(frag); + if (!dir) { + dout(10) << "drop merge on " << frag << " because not in cache" << dendl; + return; + } + ceph_assert(dir->dirfrag() == frag); + + if(!dir->is_auth()) { + dout(10) << "drop merge on " << *dir << " because lost auth" << dendl; + return; + } + + dout(10) << "merging " << *dir << dendl; + + CInode *diri = dir->get_inode(); + + frag_t fg = dir->get_frag(); + while (fg != frag_t()) { + frag_t sibfg = fg.get_sibling(); + list<CDir*> sibs; + bool complete = diri->get_dirfrags_under(sibfg, sibs); + if (!complete) { + dout(10) << " not all sibs under " << sibfg << " in cache (have " << sibs << ")" << dendl; + break; + } + bool all = true; + for (list<CDir*>::iterator p = sibs.begin(); p != sibs.end(); ++p) { + CDir *sib = *p; + if (!sib->is_auth() || !sib->should_merge()) { + all = false; + break; + } + } + if (!all) { + dout(10) << " not all sibs under " << sibfg << " " << sibs << " should_merge" << dendl; + break; + } + dout(10) << " all sibs under " << sibfg << " " << sibs << " should merge" << dendl; + fg = fg.parent(); + } + + if (fg != dir->get_frag()) + mds->mdcache->merge_dir(diri, fg); + }; + + if (merge_pending.count(frag) == 0) { + dout(20) << __func__ << " enqueued dir " << *dir << dendl; + merge_pending.insert(frag); + mds->timer.add_event_after(bal_fragment_interval, + new FunctionContext(callback)); + } else { + dout(20) << __func__ << " dir already in queue " << *dir << dendl; + } +} + +void 
MDBalancer::prep_rebalance(int beat) +{ + balance_state_t state; + + if (g_conf()->mds_thrash_exports) { + //we're going to randomly export to all the mds in the cluster + set<mds_rank_t> up_mds; + mds->get_mds_map()->get_up_mds_set(up_mds); + for (const auto &rank : up_mds) { + state.targets[rank] = 0.0; + } + } else { + int cluster_size = mds->get_mds_map()->get_num_in_mds(); + mds_rank_t whoami = mds->get_nodeid(); + rebalance_time = clock::now(); + + dout(5) << " prep_rebalance: cluster loads are" << dendl; + + mds->mdcache->migrator->clear_export_queue(); + + // rescale! turn my mds_load back into meta_load units + double load_fac = 1.0; + map<mds_rank_t, mds_load_t>::iterator m = mds_load.find(whoami); + if ((m != mds_load.end()) && (m->second.mds_load() > 0)) { + double metald = m->second.auth.meta_load(); + double mdsld = m->second.mds_load(); + load_fac = metald / mdsld; + dout(7) << " load_fac is " << load_fac + << " <- " << m->second.auth << " " << metald + << " / " << mdsld + << dendl; + } + + mds_meta_load.clear(); + + double total_load = 0.0; + multimap<double,mds_rank_t> load_map; + for (mds_rank_t i=mds_rank_t(0); i < mds_rank_t(cluster_size); i++) { + mds_load_t& load = mds_load.at(i); + + double l = load.mds_load() * load_fac; + mds_meta_load[i] = l; + + if (whoami == 0) + dout(5) << " mds." << i + << " " << load + << " = " << load.mds_load() + << " ~ " << l << dendl; + + if (whoami == i) my_load = l; + total_load += l; + + load_map.insert(pair<double,mds_rank_t>( l, i )); + } + + // target load + target_load = total_load / (double)cluster_size; + dout(5) << "prep_rebalance: my load " << my_load + << " target " << target_load + << " total " << total_load + << dendl; + + // under or over? + for (auto p : load_map) { + if (p.first < target_load * (1.0 + g_conf()->mds_bal_min_rebalance)) { + dout(5) << " mds." << p.second << " is underloaded or barely overloaded." 
<< dendl; + mds_last_epoch_under_map[p.second] = beat_epoch; + } + } + + int last_epoch_under = mds_last_epoch_under_map[whoami]; + if (last_epoch_under == beat_epoch) { + dout(5) << " i am underloaded or barely overloaded, doing nothing." << dendl; + return; + } + // am i over long enough? + if (last_epoch_under && beat_epoch - last_epoch_under < 2) { + dout(5) << " i am overloaded, but only for " << (beat_epoch - last_epoch_under) << " epochs" << dendl; + return; + } + + dout(5) << " i am sufficiently overloaded" << dendl; + + + // first separate exporters and importers + multimap<double,mds_rank_t> importers; + multimap<double,mds_rank_t> exporters; + set<mds_rank_t> importer_set; + set<mds_rank_t> exporter_set; + + for (multimap<double,mds_rank_t>::iterator it = load_map.begin(); + it != load_map.end(); + ++it) { + if (it->first < target_load) { + dout(15) << " mds." << it->second << " is importer" << dendl; + importers.insert(pair<double,mds_rank_t>(it->first,it->second)); + importer_set.insert(it->second); + } else { + int mds_last_epoch_under = mds_last_epoch_under_map[it->second]; + if (!(mds_last_epoch_under && beat_epoch - mds_last_epoch_under < 2)) { + dout(15) << " mds." << it->second << " is exporter" << dendl; + exporters.insert(pair<double,mds_rank_t>(it->first,it->second)); + exporter_set.insert(it->second); + } + } + } + + + // determine load transfer mapping + + if (true) { + // analyze import_map; do any matches i can + + dout(15) << " matching exporters to import sources" << dendl; + + // big -> small exporters + for (multimap<double,mds_rank_t>::reverse_iterator ex = exporters.rbegin(); + ex != exporters.rend(); + ++ex) { + double maxex = get_maxex(state, ex->second); + if (maxex <= .001) continue; + + // check importers. for now, just in arbitrary order (no intelligent matching). 
+ for (map<mds_rank_t, float>::iterator im = mds_import_map[ex->second].begin(); + im != mds_import_map[ex->second].end(); + ++im) { + double maxim = get_maxim(state, im->first); + if (maxim <= .001) continue; + try_match(state, ex->second, maxex, im->first, maxim); + if (maxex <= .001) break; + } + } + } + + // old way + if (beat % 2 == 1) { + dout(15) << " matching big exporters to big importers" << dendl; + // big exporters to big importers + multimap<double,mds_rank_t>::reverse_iterator ex = exporters.rbegin(); + multimap<double,mds_rank_t>::iterator im = importers.begin(); + while (ex != exporters.rend() && + im != importers.end()) { + double maxex = get_maxex(state, ex->second); + double maxim = get_maxim(state, im->second); + if (maxex < .001 || maxim < .001) break; + try_match(state, ex->second, maxex, im->second, maxim); + if (maxex <= .001) ++ex; + if (maxim <= .001) ++im; + } + } else { // new way + dout(15) << " matching small exporters to big importers" << dendl; + // small exporters to big importers + multimap<double,mds_rank_t>::iterator ex = exporters.begin(); + multimap<double,mds_rank_t>::iterator im = importers.begin(); + while (ex != exporters.end() && + im != importers.end()) { + double maxex = get_maxex(state, ex->second); + double maxim = get_maxim(state, im->second); + if (maxex < .001 || maxim < .001) break; + try_match(state, ex->second, maxex, im->second, maxim); + if (maxex <= .001) ++ex; + if (maxim <= .001) ++im; + } + } + } + try_rebalance(state); +} + +int MDBalancer::mantle_prep_rebalance() +{ + balance_state_t state; + + /* refresh balancer if it has changed */ + if (bal_version != mds->mdsmap->get_balancer()) { + bal_version.assign(""); + int r = localize_balancer(); + if (r) return r; + + /* only spam the cluster log from 1 mds on version changes */ + if (mds->get_nodeid() == 0) + mds->clog->info() << "mantle balancer version changed: " << bal_version; + } + + /* prepare for balancing */ + int cluster_size = 
+      continue; // export probably already in progress
<< from << dendl; + mds->mdcache->migrator->export_dir_nicely(dir, from); + continue; + } + + dout(15) << " map: i imported " << *dir << " from " << from << dendl; + import_pop_map.insert(make_pair(pop, dir)); + import_from_map.insert(make_pair(from, make_pair(dir, pop))); + } + + // do my exports! + map<mds_rank_t, double> export_pop_map; + + for (auto &it : state.targets) { + mds_rank_t target = it.first; + double amount = it.second; + + if (amount < MIN_OFFLOAD) + continue; + if (amount * 10 * state.targets.size() < target_load) + continue; + + dout(5) << "want to send " << amount << " to mds." << target + //<< " .. " << (*it).second << " * " << load_fac + << " -> " << amount + << dendl;//" .. fudge is " << fudge << dendl; + + double& have = export_pop_map[target]; + + mds->mdcache->show_subtrees(); + + // search imports from target + if (import_from_map.count(target)) { + dout(5) << " aha, looking through imports from target mds." << target << dendl; + for (auto p = import_from_map.equal_range(target); + p.first != p.second; ) { + CDir *dir = p.first->second.first; + double pop = p.first->second.second; + dout(5) << "considering " << *dir << " from " << (*p.first).first << dendl; + auto plast = p.first++; + + if (dir->inode->is_base()) + continue; + ceph_assert(dir->inode->authority().first == target); // cuz that's how i put it in the map, dummy + + if (pop <= amount-have) { + dout(5) << "reexporting " << *dir << " pop " << pop + << " back to mds." 
<< target << dendl; + mds->mdcache->migrator->export_dir_nicely(dir, target); + have += pop; + import_from_map.erase(plast); + for (auto q = import_pop_map.equal_range(pop); + q.first != q.second; ) { + if (q.first->second == dir) { + import_pop_map.erase(q.first); + break; + } + q.first++; + } + } else { + dout(5) << "can't reexport " << *dir << ", too big " << pop << dendl; + } + if (amount-have < MIN_OFFLOAD) + break; + } + } + } + + // any other imports + for (auto &it : state.targets) { + mds_rank_t target = it.first; + double amount = it.second; + + if (!export_pop_map.count(target)) + continue; + double& have = export_pop_map[target]; + if (amount-have < MIN_OFFLOAD) + continue; + + for (auto p = import_pop_map.begin(); + p != import_pop_map.end(); ) { + CDir *dir = p->second; + if (dir->inode->is_base()) { + ++p; + continue; + } + + double pop = p->first; + if (pop <= amount-have && pop > MIN_REEXPORT) { + dout(0) << "reexporting " << *dir << " pop " << pop + << " to mds." << target << dendl; + have += pop; + mds->mdcache->migrator->export_dir_nicely(dir, target); + import_pop_map.erase(p++); + } else { + ++p; + } + if (amount-have < MIN_OFFLOAD) + break; + } + } + + set<CDir*> already_exporting; + + for (auto &it : state.targets) { + mds_rank_t target = it.first; + double amount = it.second; + + if (!export_pop_map.count(target)) + continue; + double& have = export_pop_map[target]; + if (amount-have < MIN_OFFLOAD) + continue; + + // okay, search for fragments of my workload + list<CDir*> exports; + + for (auto p = import_pop_map.rbegin(); + p != import_pop_map.rend(); + ++p) { + CDir *dir = p->second; + find_exports(dir, amount, exports, have, already_exporting); + if (amount-have < MIN_OFFLOAD) + break; + } + //fudge = amount - have; + + for (auto dir : exports) { + dout(5) << " - exporting " << dir->pop_auth_subtree + << " " << dir->pop_auth_subtree.meta_load() + << " to mds." 
<< target << " " << *dir << dendl; + mds->mdcache->migrator->export_dir_nicely(dir, target); + } + } + + dout(5) << "rebalance done" << dendl; + mds->mdcache->show_subtrees(); +} + +void MDBalancer::find_exports(CDir *dir, + double amount, + list<CDir*>& exports, + double& have, + set<CDir*>& already_exporting) +{ + auto now = clock::now(); + auto duration = std::chrono::duration<double>(now-rebalance_time).count(); + if (duration > 0.1) { + derr << " balancer runs too long" << dendl_impl; + have = amount; + return; + } + + ceph_assert(dir->is_auth()); + + double need = amount - have; + if (need < amount * g_conf()->mds_bal_min_start) + return; // good enough! + + double needmax = need * g_conf()->mds_bal_need_max; + double needmin = need * g_conf()->mds_bal_need_min; + double midchunk = need * g_conf()->mds_bal_midchunk; + double minchunk = need * g_conf()->mds_bal_minchunk; + + list<CDir*> bigger_rep, bigger_unrep; + multimap<double, CDir*> smaller; + + double dir_pop = dir->pop_auth_subtree.meta_load(); + dout(7) << " find_exports in " << dir_pop << " " << *dir << " need " << need << " (" << needmin << " - " << needmax << ")" << dendl; + + double subdir_sum = 0; + for (elist<CInode*>::iterator it = dir->pop_lru_subdirs.begin_use_current(); + !it.end(); ) { + CInode *in = *it; + ++it; + + ceph_assert(in->is_dir()); + ceph_assert(in->get_parent_dir() == dir); + + list<CDir*> dfls; + in->get_nested_dirfrags(dfls); + + size_t num_idle_frags = 0; + for (list<CDir*>::iterator p = dfls.begin(); + p != dfls.end(); + ++p) { + CDir *subdir = *p; + if (already_exporting.count(subdir)) + continue; + + // we know all ancestor dirfrags up to subtree root are not freezing or frozen. + // It's more efficient to use CDir::is_{freezing,frozen}_tree_root() + if (subdir->is_frozen_dir() || subdir->is_frozen_tree_root() || + subdir->is_freezing_dir() || subdir->is_freezing_tree_root()) + continue; // can't export this right now! + + // how popular? 
+  // apparently not enough; drill deeper into the hierarchy (if non-replicated)
return; + } +} + +void MDBalancer::hit_inode(CInode *in, int type, int who) +{ + // hit inode + in->pop.get(type).hit(); + + if (in->get_parent_dn()) + hit_dir(in->get_parent_dn()->get_dir(), type, who); +} + +void MDBalancer::maybe_fragment(CDir *dir, bool hot) +{ + // split/merge + if (bal_fragment_dirs && bal_fragment_interval > 0 && + dir->is_auth() && + !dir->inode->is_base() && // not root/mdsdir (for now at least) + !dir->inode->is_stray()) { // not straydir + + // split + if (g_conf()->mds_bal_split_size > 0 && (dir->should_split() || hot)) { + if (split_pending.count(dir->dirfrag()) == 0) { + queue_split(dir, false); + } else { + if (dir->should_split_fast()) { + queue_split(dir, true); + } else { + dout(10) << __func__ << ": fragment already enqueued to split: " + << *dir << dendl; + } + } + } + + // merge? + if (dir->get_frag() != frag_t() && dir->should_merge() && + merge_pending.count(dir->dirfrag()) == 0) { + queue_merge(dir); + } + } +} + +void MDBalancer::hit_dir(CDir *dir, int type, int who, double amount) +{ + // hit me + double v = dir->pop_me.get(type).hit(amount); + + const bool hot = (v > g_conf()->mds_bal_split_rd && type == META_POP_IRD) || + (v > g_conf()->mds_bal_split_wr && type == META_POP_IWR); + + dout(20) << "hit_dir " << type << " pop is " << v << ", frag " << dir->get_frag() + << " size " << dir->get_frag_size() << " " << dir->pop_me << dendl; + + maybe_fragment(dir, hot); + + // replicate? + if (type == META_POP_IRD && who >= 0) { + dir->pop_spread.hit(who); + } + + double rd_adj = 0.0; + if (type == META_POP_IRD && + dir->last_popularity_sample < last_sample) { + double dir_pop = dir->pop_auth_subtree.get(type).get(); // hmm?? 
+ dir->last_popularity_sample = last_sample; + double pop_sp = dir->pop_spread.get(); + dir_pop += pop_sp * 10; + + //if (dir->ino() == inodeno_t(0x10000000002)) + if (pop_sp > 0) { + dout(20) << "hit_dir " << type << " pop " << dir_pop << " spread " << pop_sp + << " " << dir->pop_spread.last[0] + << " " << dir->pop_spread.last[1] + << " " << dir->pop_spread.last[2] + << " " << dir->pop_spread.last[3] + << " in " << *dir << dendl; + } + + if (dir->is_auth() && !dir->is_ambiguous_auth()) { + if (!dir->is_rep() && + dir_pop >= g_conf()->mds_bal_replicate_threshold) { + // replicate + double rdp = dir->pop_me.get(META_POP_IRD).get(); + rd_adj = rdp / mds->get_mds_map()->get_num_in_mds() - rdp; + rd_adj /= 2.0; // temper somewhat + + dout(5) << "replicating dir " << *dir << " pop " << dir_pop << " .. rdp " << rdp << " adj " << rd_adj << dendl; + + dir->dir_rep = CDir::REP_ALL; + mds->mdcache->send_dir_updates(dir, true); + + // fixme this should adjust the whole pop hierarchy + dir->pop_me.get(META_POP_IRD).adjust(rd_adj); + dir->pop_auth_subtree.get(META_POP_IRD).adjust(rd_adj); + } + + if (dir->ino() != 1 && + dir->is_rep() && + dir_pop < g_conf()->mds_bal_unreplicate_threshold) { + // unreplicate + dout(5) << "unreplicating dir " << *dir << " pop " << dir_pop << dendl; + + dir->dir_rep = CDir::REP_NONE; + mds->mdcache->send_dir_updates(dir); + } + } + } + + // adjust ancestors + bool hit_subtree = dir->is_auth(); // current auth subtree (if any) + bool hit_subtree_nested = dir->is_auth(); // all nested auth subtrees + + while (true) { + CDir *pdir = dir->inode->get_parent_dir(); + dir->pop_nested.get(type).hit(amount); + if (rd_adj != 0.0) + dir->pop_nested.get(META_POP_IRD).adjust(rd_adj); + + if (hit_subtree) { + dir->pop_auth_subtree.get(type).hit(amount); + + if (rd_adj != 0.0) + dir->pop_auth_subtree.get(META_POP_IRD).adjust(rd_adj); + + if (dir->is_subtree_root()) + hit_subtree = false; // end of auth domain, stop hitting auth counters. 
+ else if (pdir) + pdir->pop_lru_subdirs.push_front(&dir->get_inode()->item_pop_lru); + } + + if (hit_subtree_nested) { + dir->pop_auth_subtree_nested.get(type).hit(amount); + if (rd_adj != 0.0) + dir->pop_auth_subtree_nested.get(META_POP_IRD).adjust(rd_adj); + } + if (!pdir) break; + dir = pdir; + } +} + + +/* + * subtract off an exported chunk. + * this excludes *dir itself (encode_export_dir should have take care of that) + * we _just_ do the parents' nested counters. + * + * NOTE: call me _after_ forcing *dir into a subtree root, + * but _before_ doing the encode_export_dirs. + */ +void MDBalancer::subtract_export(CDir *dir) +{ + dirfrag_load_vec_t subload = dir->pop_auth_subtree; + + while (true) { + dir = dir->inode->get_parent_dir(); + if (!dir) break; + + dir->pop_nested.sub(subload); + dir->pop_auth_subtree_nested.sub(subload); + } +} + + +void MDBalancer::add_import(CDir *dir) +{ + dirfrag_load_vec_t subload = dir->pop_auth_subtree; + + while (true) { + dir = dir->inode->get_parent_dir(); + if (!dir) break; + + dir->pop_nested.add(subload); + dir->pop_auth_subtree_nested.add(subload); + } +} + +void MDBalancer::adjust_pop_for_rename(CDir *pdir, CDir *dir, bool inc) +{ + bool adjust_subtree_nest = dir->is_auth(); + bool adjust_subtree = adjust_subtree_nest && !dir->is_subtree_root(); + CDir *cur = dir; + while (true) { + if (inc) { + pdir->pop_nested.add(dir->pop_nested); + if (adjust_subtree) { + pdir->pop_auth_subtree.add(dir->pop_auth_subtree); + pdir->pop_lru_subdirs.push_front(&cur->get_inode()->item_pop_lru); + } + + if (adjust_subtree_nest) + pdir->pop_auth_subtree_nested.add(dir->pop_auth_subtree_nested); + } else { + pdir->pop_nested.sub(dir->pop_nested); + if (adjust_subtree) + pdir->pop_auth_subtree.sub(dir->pop_auth_subtree); + + if (adjust_subtree_nest) + pdir->pop_auth_subtree_nested.sub(dir->pop_auth_subtree_nested); + } + + if (pdir->is_subtree_root()) + adjust_subtree = false; + cur = pdir; + pdir = pdir->inode->get_parent_dir(); + if 
(!pdir) break; + } +} + +void MDBalancer::handle_mds_failure(mds_rank_t who) +{ + if (0 == who) { + mds_last_epoch_under_map.clear(); + } +} + +int MDBalancer::dump_loads(Formatter *f) +{ + list<CDir*> dfs; + if (mds->mdcache->get_root()) { + mds->mdcache->get_root()->get_dirfrags(dfs); + } else { + dout(5) << "dump_load no root" << dendl; + } + + f->open_object_section("loads"); + + f->open_array_section("dirfrags"); + while (!dfs.empty()) { + CDir *dir = dfs.front(); + dfs.pop_front(); + + f->open_object_section("dir"); + dir->dump_load(f); + f->close_section(); + + for (auto it = dir->begin(); it != dir->end(); ++it) { + CInode *in = it->second->get_linkage()->get_inode(); + if (!in || !in->is_dir()) + continue; + + list<CDir*> ls; + in->get_dirfrags(ls); + for (auto subdir : ls) { + if (subdir->pop_nested.meta_load() < .001) + continue; + dfs.push_back(subdir); + } + } + } + f->close_section(); // dirfrags array + + f->open_object_section("mds_load"); + { + + auto dump_mds_load = [f](mds_load_t& load) { + f->dump_float("request_rate", load.req_rate); + f->dump_float("cache_hit_rate", load.cache_hit_rate); + f->dump_float("queue_length", load.queue_len); + f->dump_float("cpu_load", load.cpu_load_avg); + f->dump_float("mds_load", load.mds_load()); + + f->open_object_section("auth_dirfrags"); + load.auth.dump(f); + f->close_section(); + f->open_object_section("all_dirfrags"); + load.all.dump(f); + f->close_section(); + }; + + for (auto p : mds_load) { + stringstream name; + name << "mds." << p.first; + f->open_object_section(name.str().c_str()); + dump_mds_load(p.second); + f->close_section(); + } + } + f->close_section(); // mds_load + + f->open_object_section("mds_meta_load"); + for (auto p : mds_meta_load) { + stringstream name; + name << "mds." 
<< p.first; + f->dump_float(name.str().c_str(), p.second); + } + f->close_section(); // mds_meta_load + + f->open_object_section("mds_import_map"); + for (auto p : mds_import_map) { + stringstream name1; + name1 << "mds." << p.first; + f->open_array_section(name1.str().c_str()); + for (auto q : p.second) { + f->open_object_section("from"); + stringstream name2; + name2 << "mds." << q.first; + f->dump_float(name2.str().c_str(), q.second); + f->close_section(); + } + f->close_section(); // mds.? array + } + f->close_section(); // mds_import_map + + f->close_section(); // loads + return 0; +} diff --git a/src/mds/MDBalancer.h b/src/mds/MDBalancer.h new file mode 100644 index 00000000..4050eac9 --- /dev/null +++ b/src/mds/MDBalancer.h @@ -0,0 +1,161 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + + +#ifndef CEPH_MDBALANCER_H +#define CEPH_MDBALANCER_H + +#include <list> +#include <map> + +#include "include/types.h" +#include "common/Clock.h" +#include "common/Cond.h" + +#include "msg/Message.h" +#include "messages/MHeartbeat.h" + +#include "MDSMap.h" + +class MDSRank; +class MHeartbeat; +class CInode; +class CDir; +class Messenger; +class MonClient; + +class MDBalancer { +public: + using clock = ceph::coarse_mono_clock; + using time = ceph::coarse_mono_time; + friend class C_Bal_SendHeartbeat; + + MDBalancer(MDSRank *m, Messenger *msgr, MonClient *monc); + + void handle_conf_change(const std::set<std::string>& changed, const MDSMap& mds_map); + + int proc_message(const Message::const_ref &m); + + /** + * Regularly called upkeep function. 
+ * + * Sends MHeartbeat messages to the mons. + */ + void tick(); + + void subtract_export(CDir *ex); + void add_import(CDir *im); + void adjust_pop_for_rename(CDir *pdir, CDir *dir, bool inc); + + void hit_inode(CInode *in, int type, int who=-1); + void hit_dir(CDir *dir, int type, int who=-1, double amount=1.0); + + void queue_split(const CDir *dir, bool fast); + void queue_merge(CDir *dir); + + /** + * Based on size and configuration, decide whether to issue a queue_split + * or queue_merge for this CDir. + * + * \param hot whether the directory's temperature is enough to split it + */ + void maybe_fragment(CDir *dir, bool hot); + + void handle_mds_failure(mds_rank_t who); + + int dump_loads(Formatter *f); + +private: + bool bal_fragment_dirs; + int64_t bal_fragment_interval; + static const unsigned int AUTH_TREES_THRESHOLD = 5; + + typedef struct { + std::map<mds_rank_t, double> targets; + std::map<mds_rank_t, double> imported; + std::map<mds_rank_t, double> exported; + } balance_state_t; + + //set up the rebalancing targets for export and do one if the + //MDSMap is up to date + void prep_rebalance(int beat); + int mantle_prep_rebalance(); + + void handle_export_pins(void); + + mds_load_t get_load(); + int localize_balancer(); + void send_heartbeat(); + void handle_heartbeat(const MHeartbeat::const_ref &m); + void find_exports(CDir *dir, + double amount, + std::list<CDir*>& exports, + double& have, + set<CDir*>& already_exporting); + + double try_match(balance_state_t &state, + mds_rank_t ex, double& maxex, + mds_rank_t im, double& maxim); + + double get_maxim(balance_state_t &state, mds_rank_t im) { + return target_load - mds_meta_load[im] - state.imported[im]; + } + double get_maxex(balance_state_t &state, mds_rank_t ex) { + return mds_meta_load[ex] - target_load - state.exported[ex]; + } + + /** + * Try to rebalance. + * + * Check if the monitor has recorded the current export targets; + * if it has then do the actual export. 
Otherwise send off our + * export targets message again. + */ + void try_rebalance(balance_state_t& state); + + MDSRank *mds; + Messenger *messenger; + MonClient *mon_client; + int beat_epoch = 0; + + string bal_code; + string bal_version; + + time last_heartbeat = clock::zero(); + time last_sample = clock::zero(); + time rebalance_time = clock::zero(); //ensure a consistent view of load for rebalance + + time last_get_load = clock::zero(); + uint64_t last_num_requests = 0; + uint64_t last_cpu_time = 0; + + // Dirfrags which are marked to be passed on to MDCache::[split|merge]_dir + // just as soon as a delayed context comes back and triggers it. + // These sets just prevent us from spawning extra timer contexts for + // dirfrags that already have one in flight. + set<dirfrag_t> split_pending, merge_pending; + + // per-epoch scatter/gathered info + std::map<mds_rank_t, mds_load_t> mds_load; + std::map<mds_rank_t, double> mds_meta_load; + std::map<mds_rank_t, map<mds_rank_t, float> > mds_import_map; + std::map<mds_rank_t, int> mds_last_epoch_under_map; + + // per-epoch state + double my_load = 0; + double target_load = 0; +}; + +#endif diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc new file mode 100644 index 00000000..eb0a706f --- /dev/null +++ b/src/mds/MDCache.cc @@ -0,0 +1,13084 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#include <errno.h> +#include <fstream> +#include <iostream> +#include <sstream> +#include <string> +#include <string_view> +#include <map> + +#include "MDCache.h" +#include "MDSRank.h" +#include "Server.h" +#include "Locker.h" +#include "MDLog.h" +#include "MDBalancer.h" +#include "Migrator.h" +#include "ScrubStack.h" + +#include "SnapClient.h" + +#include "MDSMap.h" + +#include "CInode.h" +#include "CDir.h" + +#include "Mutation.h" + +#include "include/ceph_fs.h" +#include "include/filepath.h" +#include "include/util.h" + +#include "messages/MClientCaps.h" + +#include "msg/Message.h" +#include "msg/Messenger.h" + +#include "common/MemoryModel.h" +#include "common/errno.h" +#include "common/perf_counters.h" +#include "common/safe_io.h" + +#include "osdc/Journaler.h" +#include "osdc/Filer.h" + +#include "events/ESubtreeMap.h" +#include "events/EUpdate.h" +#include "events/ESlaveUpdate.h" +#include "events/EImportFinish.h" +#include "events/EFragment.h" +#include "events/ECommitted.h" +#include "events/ESessions.h" + +#include "InoTable.h" + +#include "common/Timer.h" + +#include "perfglue/heap_profiler.h" + + +#include "common/config.h" +#include "include/ceph_assert.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mds +#undef dout_prefix +#define dout_prefix _prefix(_dout, mds) +static ostream& _prefix(std::ostream *_dout, MDSRank *mds) { + return *_dout << "mds." << mds->get_nodeid() << ".cache "; +} + +set<int> SimpleLock::empty_gather_set; + + +/** + * All non-I/O contexts that require a reference + * to an MDCache instance descend from this. 
+ */ +class MDCacheContext : public virtual MDSContext { +protected: + MDCache *mdcache; + MDSRank *get_mds() override + { + ceph_assert(mdcache != NULL); + return mdcache->mds; + } +public: + explicit MDCacheContext(MDCache *mdc_) : mdcache(mdc_) {} +}; + + +/** + * Only for contexts called back from an I/O completion + * + * Note: duplication of members wrt MDCacheContext, because + * it'ls the lesser of two evils compared with introducing + * yet another piece of (multiple) inheritance. + */ +class MDCacheIOContext : public virtual MDSIOContextBase { +protected: + MDCache *mdcache; + MDSRank *get_mds() override + { + ceph_assert(mdcache != NULL); + return mdcache->mds; + } +public: + explicit MDCacheIOContext(MDCache *mdc_, bool track=true) : + MDSIOContextBase(track), mdcache(mdc_) {} +}; + +class MDCacheLogContext : public virtual MDSLogContextBase { +protected: + MDCache *mdcache; + MDSRank *get_mds() override + { + ceph_assert(mdcache != NULL); + return mdcache->mds; + } +public: + explicit MDCacheLogContext(MDCache *mdc_) : mdcache(mdc_) {} +}; + +MDCache::MDCache(MDSRank *m, PurgeQueue &purge_queue_) : + mds(m), + filer(m->objecter, m->finisher), + recovery_queue(m), + stray_manager(m, purge_queue_), + trim_counter(g_conf().get_val<double>("mds_cache_trim_decay_rate")), + open_file_table(m) +{ + migrator.reset(new Migrator(mds, this)); + + max_dir_commit_size = g_conf()->mds_dir_max_commit_size ? 
+ (g_conf()->mds_dir_max_commit_size << 20) : + (0.9 *(g_conf()->osd_max_write_size << 20)); + + cache_inode_limit = g_conf().get_val<int64_t>("mds_cache_size"); + cache_memory_limit = g_conf().get_val<Option::size_t>("mds_cache_memory_limit"); + cache_reservation = g_conf().get_val<double>("mds_cache_reservation"); + cache_health_threshold = g_conf().get_val<double>("mds_health_cache_threshold"); + forward_all_requests_to_auth = g_conf().get_val<bool>("mds_forward_all_requests_to_auth"); + + lru.lru_set_midpoint(g_conf().get_val<double>("mds_cache_mid")); + + bottom_lru.lru_set_midpoint(0); + + decayrate.set_halflife(g_conf()->mds_decay_halflife); + + upkeeper = std::thread([this]() { + std::unique_lock lock(upkeep_mutex); + while (!upkeep_trim_shutdown.load()) { + auto now = clock::now(); + auto since = now-upkeep_last_trim; + auto trim_interval = clock::duration(g_conf().get_val<std::chrono::seconds>("mds_cache_trim_interval")); + if (since >= trim_interval*.90) { + lock.unlock(); /* mds_lock -> upkeep_mutex */ + std::scoped_lock mds_lock(mds->mds_lock); + lock.lock(); + if (upkeep_trim_shutdown.load()) + return; + if (mds->is_cache_trimmable()) { + dout(20) << "upkeep thread trimming cache; last trim " << since << " ago" << dendl; + trim_client_leases(); + trim(); + check_memory_usage(); + auto flags = Server::RecallFlags::ENFORCE_MAX|Server::RecallFlags::ENFORCE_LIVENESS; + mds->server->recall_client_state(nullptr, flags); + upkeep_last_trim = clock::now(); + upkeep_last_trim = now = clock::now(); + } else { + dout(10) << "cache not ready for trimming" << dendl; + } + } else { + trim_interval -= since; + } + since = now-upkeep_last_release; + auto release_interval = clock::duration(g_conf().get_val<std::chrono::seconds>("mds_cache_release_free_interval")); + if (since >= release_interval*.90) { + /* XXX not necessary once MDCache uses PriorityCache */ + dout(10) << "releasing free memory" << dendl; + ceph_heap_release_free_memory(); + upkeep_last_release = 
clock::now(); + } else { + release_interval -= since; + } + auto interval = std::min(release_interval, trim_interval); + dout(20) << "upkeep thread waiting interval " << interval << dendl; + upkeep_cvar.wait_for(lock, interval); + } + }); +} + +MDCache::~MDCache() +{ + if (logger) { + g_ceph_context->get_perfcounters_collection()->remove(logger.get()); + } + if (upkeeper.joinable()) + upkeeper.join(); +} + +void MDCache::handle_conf_change(const std::set<std::string>& changed, const MDSMap& mdsmap) +{ + if (changed.count("mds_cache_size")) + cache_inode_limit = g_conf().get_val<int64_t>("mds_cache_size"); + if (changed.count("mds_cache_memory_limit")) + cache_memory_limit = g_conf().get_val<Option::size_t>("mds_cache_memory_limit"); + if (changed.count("mds_cache_reservation")) + cache_reservation = g_conf().get_val<double>("mds_cache_reservation"); + if (changed.count("mds_health_cache_threshold")) + cache_health_threshold = g_conf().get_val<double>("mds_health_cache_threshold"); + if (changed.count("mds_cache_mid")) + lru.lru_set_midpoint(g_conf().get_val<double>("mds_cache_mid")); + if (changed.count("mds_cache_trim_decay_rate")) { + trim_counter = DecayCounter(g_conf().get_val<double>("mds_cache_trim_decay_rate")); + } + if (changed.count("mds_forward_all_requests_to_auth")){ + forward_all_requests_to_auth = g_conf().get_val<bool>("mds_forward_all_requests_to_auth"); + } + + migrator->handle_conf_change(changed, mdsmap); + mds->balancer->handle_conf_change(changed, mdsmap); +} + +void MDCache::log_stat() +{ + mds->logger->set(l_mds_inode_max, cache_inode_limit ? 
: INT_MAX); + mds->logger->set(l_mds_inodes, lru.lru_get_size()); + mds->logger->set(l_mds_inodes_pinned, lru.lru_get_num_pinned()); + mds->logger->set(l_mds_inodes_top, lru.lru_get_top()); + mds->logger->set(l_mds_inodes_bottom, lru.lru_get_bot()); + mds->logger->set(l_mds_inodes_pin_tail, lru.lru_get_pintail()); + mds->logger->set(l_mds_inodes_with_caps, num_inodes_with_caps); + mds->logger->set(l_mds_caps, Capability::count()); + if (root) { + mds->logger->set(l_mds_root_rfiles, root->inode.rstat.rfiles); + mds->logger->set(l_mds_root_rbytes, root->inode.rstat.rbytes); + mds->logger->set(l_mds_root_rsnaps, root->inode.rstat.rsnaps); + } +} + + +// + +bool MDCache::shutdown() +{ + { + std::scoped_lock lock(upkeep_mutex); + upkeep_trim_shutdown = true; + upkeep_cvar.notify_one(); + } + if (lru.lru_get_size() > 0) { + dout(7) << "WARNING: mdcache shutdown with non-empty cache" << dendl; + //show_cache(); + show_subtrees(); + //dump(); + } + return true; +} + + +// ==================================================================== +// some inode functions + +void MDCache::add_inode(CInode *in) +{ + // add to lru, inode map + if (in->last == CEPH_NOSNAP) { + auto &p = inode_map[in->ino()]; + ceph_assert(!p); // should be no dup inos! + p = in; + } else { + auto &p = snap_inode_map[in->vino()]; + ceph_assert(!p); // should be no dup inos! + p = in; + } + + if (in->ino() < MDS_INO_SYSTEM_BASE) { + if (in->ino() == MDS_INO_ROOT) + root = in; + else if (in->ino() == MDS_INO_MDSDIR(mds->get_nodeid())) + myin = in; + else if (in->is_stray()) { + if (MDS_INO_STRAY_OWNER(in->ino()) == mds->get_nodeid()) { + strays[MDS_INO_STRAY_INDEX(in->ino())] = in; + } + } + if (in->is_base()) + base_inodes.insert(in); + } + + if (cache_toofull()) { + exceeded_size_limit = true; + } +} + +void MDCache::remove_inode(CInode *o) +{ + dout(14) << "remove_inode " << *o << dendl; + + if (o->get_parent_dn()) { + // FIXME: multiple parents? 
+ CDentry *dn = o->get_parent_dn(); + ceph_assert(!dn->is_dirty()); + dn->dir->unlink_inode(dn); // leave dentry ... FIXME? + } + + if (o->is_dirty()) + o->mark_clean(); + if (o->is_dirty_parent()) + o->clear_dirty_parent(); + + o->clear_scatter_dirty(); + + o->item_open_file.remove_myself(); + + if (o->state_test(CInode::STATE_QUEUEDEXPORTPIN)) + export_pin_queue.erase(o); + + if (o->state_test(CInode::STATE_DELAYEDEXPORTPIN)) + export_pin_delayed_queue.erase(o); + + // remove from inode map + if (o->last == CEPH_NOSNAP) { + inode_map.erase(o->ino()); + } else { + o->item_caps.remove_myself(); + snap_inode_map.erase(o->vino()); + } + + if (o->ino() < MDS_INO_SYSTEM_BASE) { + if (o == root) root = 0; + if (o == myin) myin = 0; + if (o->is_stray()) { + if (MDS_INO_STRAY_OWNER(o->ino()) == mds->get_nodeid()) { + strays[MDS_INO_STRAY_INDEX(o->ino())] = 0; + } + } + if (o->is_base()) + base_inodes.erase(o); + } + + // delete it + ceph_assert(o->get_num_ref() == 0); + delete o; +} + +file_layout_t MDCache::gen_default_file_layout(const MDSMap &mdsmap) +{ + file_layout_t result = file_layout_t::get_default(); + result.pool_id = mdsmap.get_first_data_pool(); + return result; +} + +file_layout_t MDCache::gen_default_log_layout(const MDSMap &mdsmap) +{ + file_layout_t result = file_layout_t::get_default(); + result.pool_id = mdsmap.get_metadata_pool(); + if (g_conf()->mds_log_segment_size > 0) { + result.object_size = g_conf()->mds_log_segment_size; + result.stripe_unit = g_conf()->mds_log_segment_size; + } + return result; +} + +void MDCache::init_layouts() +{ + default_file_layout = gen_default_file_layout(*(mds->mdsmap)); + default_log_layout = gen_default_log_layout(*(mds->mdsmap)); +} + +void MDCache::create_unlinked_system_inode(CInode *in, inodeno_t ino, + int mode) const +{ + in->inode.ino = ino; + in->inode.version = 1; + in->inode.xattr_version = 1; + in->inode.mode = 0500 | mode; + in->inode.size = 0; + in->inode.ctime = + in->inode.mtime = + in->inode.btime = 
ceph_clock_now(); + in->inode.nlink = 1; + in->inode.truncate_size = -1ull; + in->inode.change_attr = 0; + in->inode.export_pin = MDS_RANK_NONE; + + // FIPS zeroization audit 20191117: this memset is not security related. + memset(&in->inode.dir_layout, 0, sizeof(in->inode.dir_layout)); + if (in->inode.is_dir()) { + in->inode.dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash; + in->inode.rstat.rsubdirs = 1; /* itself */ + in->inode.rstat.rctime = in->inode.ctime; + } else { + in->inode.layout = default_file_layout; + ++in->inode.rstat.rfiles; + } + in->inode.accounted_rstat = in->inode.rstat; + + if (in->is_base()) { + if (in->is_root()) + in->inode_auth = mds_authority_t(mds->get_nodeid(), CDIR_AUTH_UNKNOWN); + else + in->inode_auth = mds_authority_t(mds_rank_t(in->ino() - MDS_INO_MDSDIR_OFFSET), CDIR_AUTH_UNKNOWN); + in->open_snaprealm(); // empty snaprealm + ceph_assert(!in->snaprealm->parent); // created its own + in->snaprealm->srnode.seq = 1; + } +} + +CInode *MDCache::create_system_inode(inodeno_t ino, int mode) +{ + dout(0) << "creating system inode with ino:" << ino << dendl; + CInode *in = new CInode(this); + create_unlinked_system_inode(in, ino, mode); + add_inode(in); + return in; +} + +CInode *MDCache::create_root_inode() +{ + CInode *i = create_system_inode(MDS_INO_ROOT, S_IFDIR|0755); + i->inode.uid = g_conf()->mds_root_ino_uid; + i->inode.gid = g_conf()->mds_root_ino_gid; + i->inode.layout = default_file_layout; + i->inode.layout.pool_id = mds->mdsmap->get_first_data_pool(); + return i; +} + +void MDCache::create_empty_hierarchy(MDSGather *gather) +{ + // create root dir + CInode *root = create_root_inode(); + + // force empty root dir + CDir *rootdir = root->get_or_open_dirfrag(this, frag_t()); + adjust_subtree_auth(rootdir, mds->get_nodeid()); + rootdir->dir_rep = CDir::REP_ALL; //NONE; + + ceph_assert(rootdir->fnode.accounted_fragstat == rootdir->fnode.fragstat); + ceph_assert(rootdir->fnode.fragstat == root->inode.dirstat); + 
ceph_assert(rootdir->fnode.accounted_rstat == rootdir->fnode.rstat); + /* Do no update rootdir rstat information of the fragment, rstat upkeep magic + * assume version 0 is stale/invalid. + */ + + rootdir->mark_complete(); + rootdir->mark_dirty(rootdir->pre_dirty(), mds->mdlog->get_current_segment()); + rootdir->commit(0, gather->new_sub()); + + root->mark_clean(); + root->mark_dirty(root->pre_dirty(), mds->mdlog->get_current_segment()); + root->mark_dirty_parent(mds->mdlog->get_current_segment(), true); + root->flush(gather->new_sub()); +} + +void MDCache::create_mydir_hierarchy(MDSGather *gather) +{ + // create mds dir + CInode *my = create_system_inode(MDS_INO_MDSDIR(mds->get_nodeid()), S_IFDIR); + + CDir *mydir = my->get_or_open_dirfrag(this, frag_t()); + adjust_subtree_auth(mydir, mds->get_nodeid()); + + LogSegment *ls = mds->mdlog->get_current_segment(); + + // stray dir + for (int i = 0; i < NUM_STRAY; ++i) { + CInode *stray = create_system_inode(MDS_INO_STRAY(mds->get_nodeid(), i), S_IFDIR); + CDir *straydir = stray->get_or_open_dirfrag(this, frag_t()); + stringstream name; + name << "stray" << i; + CDentry *sdn = mydir->add_primary_dentry(name.str(), stray); + sdn->_mark_dirty(mds->mdlog->get_current_segment()); + + stray->inode.dirstat = straydir->fnode.fragstat; + + mydir->fnode.rstat.add(stray->inode.rstat); + mydir->fnode.fragstat.nsubdirs++; + // save them + straydir->mark_complete(); + straydir->mark_dirty(straydir->pre_dirty(), ls); + straydir->commit(0, gather->new_sub()); + stray->mark_dirty_parent(ls, true); + stray->store_backtrace(gather->new_sub()); + } + + mydir->fnode.accounted_fragstat = mydir->fnode.fragstat; + mydir->fnode.accounted_rstat = mydir->fnode.rstat; + + myin->inode.dirstat = mydir->fnode.fragstat; + myin->inode.rstat = mydir->fnode.rstat; + ++myin->inode.rstat.rsubdirs; + myin->inode.accounted_rstat = myin->inode.rstat; + + mydir->mark_complete(); + mydir->mark_dirty(mydir->pre_dirty(), ls); + mydir->commit(0, 
gather->new_sub()); + + myin->store(gather->new_sub()); +} + +struct C_MDC_CreateSystemFile : public MDCacheLogContext { + MutationRef mut; + CDentry *dn; + version_t dpv; + MDSContext *fin; + C_MDC_CreateSystemFile(MDCache *c, MutationRef& mu, CDentry *d, version_t v, MDSContext *f) : + MDCacheLogContext(c), mut(mu), dn(d), dpv(v), fin(f) {} + void finish(int r) override { + mdcache->_create_system_file_finish(mut, dn, dpv, fin); + } +}; + +void MDCache::_create_system_file(CDir *dir, std::string_view name, CInode *in, MDSContext *fin) +{ + dout(10) << "_create_system_file " << name << " in " << *dir << dendl; + CDentry *dn = dir->add_null_dentry(name); + + dn->push_projected_linkage(in); + version_t dpv = dn->pre_dirty(); + + CDir *mdir = 0; + if (in->inode.is_dir()) { + in->inode.rstat.rsubdirs = 1; + + mdir = in->get_or_open_dirfrag(this, frag_t()); + mdir->mark_complete(); + mdir->pre_dirty(); + } else + in->inode.rstat.rfiles = 1; + in->inode.version = dn->pre_dirty(); + + SnapRealm *realm = dir->get_inode()->find_snaprealm(); + dn->first = in->first = realm->get_newest_seq() + 1; + + MutationRef mut(new MutationImpl()); + + // force some locks. hacky. 
+ mds->locker->wrlock_force(&dir->inode->filelock, mut); + mds->locker->wrlock_force(&dir->inode->nestlock, mut); + + mut->ls = mds->mdlog->get_current_segment(); + EUpdate *le = new EUpdate(mds->mdlog, "create system file"); + mds->mdlog->start_entry(le); + + if (!in->is_mdsdir()) { + predirty_journal_parents(mut, &le->metablob, in, dir, PREDIRTY_PRIMARY|PREDIRTY_DIR, 1); + le->metablob.add_primary_dentry(dn, in, true); + } else { + predirty_journal_parents(mut, &le->metablob, in, dir, PREDIRTY_DIR, 1); + journal_dirty_inode(mut.get(), &le->metablob, in); + dn->push_projected_linkage(in->ino(), in->d_type()); + le->metablob.add_remote_dentry(dn, true, in->ino(), in->d_type()); + le->metablob.add_root(true, in); + } + if (mdir) + le->metablob.add_new_dir(mdir); // dirty AND complete AND new + + mds->mdlog->submit_entry(le, new C_MDC_CreateSystemFile(this, mut, dn, dpv, fin)); + mds->mdlog->flush(); +} + +void MDCache::_create_system_file_finish(MutationRef& mut, CDentry *dn, version_t dpv, MDSContext *fin) +{ + dout(10) << "_create_system_file_finish " << *dn << dendl; + + dn->pop_projected_linkage(); + dn->mark_dirty(dpv, mut->ls); + + CInode *in = dn->get_linkage()->get_inode(); + in->inode.version--; + in->mark_dirty(in->inode.version + 1, mut->ls); + + if (in->inode.is_dir()) { + CDir *dir = in->get_dirfrag(frag_t()); + ceph_assert(dir); + dir->mark_dirty(1, mut->ls); + dir->mark_new(mut->ls); + } + + mut->apply(); + mds->locker->drop_locks(mut.get()); + mut->cleanup(); + + fin->complete(0); + + //if (dir && MDS_INO_IS_MDSDIR(in->ino())) + //migrator->export_dir(dir, (int)in->ino() - MDS_INO_MDSDIR_OFFSET); +} + + + +struct C_MDS_RetryOpenRoot : public MDSInternalContext { + MDCache *cache; + explicit C_MDS_RetryOpenRoot(MDCache *c) : MDSInternalContext(c->mds), cache(c) {} + void finish(int r) override { + if (r < 0) { + // If we can't open root, something disastrous has happened: mark + // this rank damaged for operator intervention. 
Note that + // it is not okay to call suicide() here because we are in + // a Finisher callback. + cache->mds->damaged(); + ceph_abort(); // damaged should never return + } else { + cache->open_root(); + } + } +}; + +void MDCache::open_root_inode(MDSContext *c) +{ + if (mds->get_nodeid() == mds->mdsmap->get_root()) { + CInode *in; + in = create_system_inode(MDS_INO_ROOT, S_IFDIR|0755); // initially inaccurate! + in->fetch(c); + } else { + discover_base_ino(MDS_INO_ROOT, c, mds->mdsmap->get_root()); + } +} + +void MDCache::open_mydir_inode(MDSContext *c) +{ + CInode *in = create_system_inode(MDS_INO_MDSDIR(mds->get_nodeid()), S_IFDIR|0755); // initially inaccurate! + in->fetch(c); +} + +void MDCache::open_mydir_frag(MDSContext *c) +{ + open_mydir_inode( + new MDSInternalContextWrapper(mds, + new FunctionContext([this, c](int r) { + if (r < 0) { + c->complete(r); + return; + } + CDir *mydir = myin->get_or_open_dirfrag(this, frag_t()); + ceph_assert(mydir); + adjust_subtree_auth(mydir, mds->get_nodeid()); + mydir->fetch(c); + }) + ) + ); +} + +void MDCache::open_root() +{ + dout(10) << "open_root" << dendl; + + if (!root) { + open_root_inode(new C_MDS_RetryOpenRoot(this)); + return; + } + if (mds->get_nodeid() == mds->mdsmap->get_root()) { + ceph_assert(root->is_auth()); + CDir *rootdir = root->get_or_open_dirfrag(this, frag_t()); + ceph_assert(rootdir); + if (!rootdir->is_subtree_root()) + adjust_subtree_auth(rootdir, mds->get_nodeid()); + if (!rootdir->is_complete()) { + rootdir->fetch(new C_MDS_RetryOpenRoot(this)); + return; + } + } else { + ceph_assert(!root->is_auth()); + CDir *rootdir = root->get_dirfrag(frag_t()); + if (!rootdir) { + open_remote_dirfrag(root, frag_t(), new C_MDS_RetryOpenRoot(this)); + return; + } + } + + if (!myin) { + CInode *in = create_system_inode(MDS_INO_MDSDIR(mds->get_nodeid()), S_IFDIR|0755); // initially inaccurate! 
+ in->fetch(new C_MDS_RetryOpenRoot(this)); + return; + } + CDir *mydir = myin->get_or_open_dirfrag(this, frag_t()); + ceph_assert(mydir); + adjust_subtree_auth(mydir, mds->get_nodeid()); + + populate_mydir(); +} + +void MDCache::populate_mydir() +{ + ceph_assert(myin); + CDir *mydir = myin->get_or_open_dirfrag(this, frag_t()); + ceph_assert(mydir); + + dout(10) << "populate_mydir " << *mydir << dendl; + + if (!mydir->is_complete()) { + mydir->fetch(new C_MDS_RetryOpenRoot(this)); + return; + } + + if (mydir->get_version() == 0 && mydir->state_test(CDir::STATE_BADFRAG)) { + // A missing dirfrag, we will recreate it. Before that, we must dirty + // it before dirtying any of the strays we create within it. + mds->clog->warn() << "fragment " << mydir->dirfrag() << " was unreadable, " + "recreating it now"; + LogSegment *ls = mds->mdlog->get_current_segment(); + mydir->state_clear(CDir::STATE_BADFRAG); + mydir->mark_complete(); + mydir->mark_dirty(mydir->pre_dirty(), ls); + } + + // open or create stray + uint64_t num_strays = 0; + for (int i = 0; i < NUM_STRAY; ++i) { + stringstream name; + name << "stray" << i; + CDentry *straydn = mydir->lookup(name.str()); + + // allow for older fs's with stray instead of stray0 + if (straydn == NULL && i == 0) + straydn = mydir->lookup("stray"); + + if (!straydn || !straydn->get_linkage()->get_inode()) { + _create_system_file(mydir, name.str().c_str(), create_system_inode(MDS_INO_STRAY(mds->get_nodeid(), i), S_IFDIR), + new C_MDS_RetryOpenRoot(this)); + return; + } + ceph_assert(straydn); + ceph_assert(strays[i]); + // we make multiple passes through this method; make sure we only pin each stray once. 
+ if (!strays[i]->state_test(CInode::STATE_STRAYPINNED)) { + strays[i]->get(CInode::PIN_STRAY); + strays[i]->state_set(CInode::STATE_STRAYPINNED); + strays[i]->get_stickydirs(); + } + dout(20) << " stray num " << i << " is " << *strays[i] << dendl; + + // open all frags + frag_vec_t leaves; + strays[i]->dirfragtree.get_leaves(leaves); + for (const auto& leaf : leaves) { + CDir *dir = strays[i]->get_dirfrag(leaf); + if (!dir) { + dir = strays[i]->get_or_open_dirfrag(this, leaf); + } + + // DamageTable applies special handling to strays: it will + // have damaged() us out if one is damaged. + ceph_assert(!dir->state_test(CDir::STATE_BADFRAG)); + + if (dir->get_version() == 0) { + dir->fetch(new C_MDS_RetryOpenRoot(this)); + return; + } + + if (dir->get_frag_size() > 0) + num_strays += dir->get_frag_size(); + } + } + + // okay! + dout(10) << "populate_mydir done" << dendl; + ceph_assert(!open); + open = true; + mds->queue_waiters(waiting_for_open); + + stray_manager.set_num_strays(num_strays); + stray_manager.activate(); + + scan_stray_dir(); +} + +void MDCache::open_foreign_mdsdir(inodeno_t ino, MDSContext *fin) +{ + discover_base_ino(ino, fin, mds_rank_t(ino & (MAX_MDS-1))); +} + +CDir *MDCache::get_stray_dir(CInode *in) +{ + string straydname; + in->name_stray_dentry(straydname); + + CInode *strayi = get_stray(); + ceph_assert(strayi); + frag_t fg = strayi->pick_dirfrag(straydname); + CDir *straydir = strayi->get_dirfrag(fg); + ceph_assert(straydir); + return straydir; +} + +CDentry *MDCache::get_or_create_stray_dentry(CInode *in) +{ + CDir *straydir = get_stray_dir(in); + string straydname; + in->name_stray_dentry(straydname); + CDentry *straydn = straydir->lookup(straydname); + if (!straydn) { + straydn = straydir->add_null_dentry(straydname); + straydn->mark_new(); + } else { + ceph_assert(straydn->get_projected_linkage()->is_null()); + } + + straydn->state_set(CDentry::STATE_STRAY); + return straydn; +} + + + +MDSCacheObject *MDCache::get_object(const 
MDSCacheObjectInfo &info)
{
  // inode?
  if (info.ino)
    return get_inode(info.ino, info.snapid);

  // dir or dentry.
  CDir *dir = get_dirfrag(info.dirfrag);
  if (!dir) return 0;  // dirfrag (hence dentry too) not in cache

  if (info.dname.length())
    return dir->lookup(info.dname, info.snapid);
  else
    return dir;
}




// ====================================================================
// subtree management

/*
 * adjust the dir_auth of a subtree.
 * merge with parent and/or child subtrees, if is it appropriate.
 * merge can ONLY happen if both parent and child have unambiguous auth.
 */
void MDCache::adjust_subtree_auth(CDir *dir, mds_authority_t auth, bool adjust_pop)
{
  dout(7) << "adjust_subtree_auth " << dir->get_dir_auth() << " -> " << auth
          << " on " << *dir << dendl;

  show_subtrees();

  CDir *root;
  if (dir->inode->is_base()) {
    root = dir;  // bootstrap hack.
    if (subtrees.count(root) == 0) {
      subtrees[root];
      root->get(CDir::PIN_SUBTREE);
    }
  } else {
    root = get_subtree_root(dir);  // subtree root
  }
  ceph_assert(root);
  ceph_assert(subtrees.count(root));
  dout(7) << " current root is " << *root << dendl;

  if (root == dir) {
    // i am already a subtree.
    dir->set_dir_auth(auth);
  } else {
    // i am a new subtree.
    dout(10) << " new subtree at " << *dir << dendl;
    ceph_assert(subtrees.count(dir) == 0);
    subtrees[dir];  // create empty subtree bounds list for me.
    dir->get(CDir::PIN_SUBTREE);

    // set dir_auth
    dir->set_dir_auth(auth);

    // move items nested beneath me, under me.
    set<CDir*>::iterator p = subtrees[root].begin();
    while (p != subtrees[root].end()) {
      set<CDir*>::iterator next = p;
      ++next;  // grab next now; erase(p) below invalidates p
      if (get_subtree_root((*p)->get_parent_dir()) == dir) {
        // move under me
        dout(10) << " claiming child bound " << **p << dendl;
        subtrees[dir].insert(*p);
        subtrees[root].erase(p);
      }
      p = next;
    }

    // i am a bound of the parent subtree.
    subtrees[root].insert(dir);

    // i am now the subtree root.
    root = dir;

    // adjust recursive pop counters
    if (adjust_pop && dir->is_auth()) {
      CDir *p = dir->get_parent_dir();
      while (p) {
        p->pop_auth_subtree.sub(dir->pop_auth_subtree);
        if (p->is_subtree_root()) break;
        p = p->inode->get_parent_dir();
      }
    }
  }

  show_subtrees();
}


// Try to merge the subtree rooted at 'dir' (and each of its former
// bounds) with the surrounding subtree where auth now matches.
void MDCache::try_subtree_merge(CDir *dir)
{
  dout(7) << "try_subtree_merge " << *dir << dendl;
  // record my old bounds
  auto oldbounds = subtrees.at(dir);

  set<CInode*> to_eval;
  // try merge at my root
  try_subtree_merge_at(dir, &to_eval);

  // try merge at my old bounds
  for (auto bound : oldbounds)
    try_subtree_merge_at(bound, &to_eval);

  if (!(mds->is_any_replay() || mds->is_resolve())) {
    for(auto in : to_eval)
      eval_subtree_root(in);
  }
}

// Journal context: completes a subtree-merge writeback once the log
// entry is safe.
class C_MDC_SubtreeMergeWB : public MDCacheLogContext {
  CInode *in;
  MutationRef mut;
public:
  C_MDC_SubtreeMergeWB(MDCache *mdc, CInode *i, MutationRef& m) : MDCacheLogContext(mdc), in(i), mut(m) {}
  void finish(int r) override {
    mdcache->subtree_merge_writebehind_finish(in, mut);
  }
};

// Merge the subtree rooted at 'dir' into its parent subtree if both
// have the same unambiguous auth.  Skips ambiguous auth, export bounds
// and aux subtrees.  Inodes whose locks should be re-evaluated are
// added to *to_eval (if non-null).
void MDCache::try_subtree_merge_at(CDir *dir, set<CInode*> *to_eval, bool adjust_pop)
{
  dout(10) << "try_subtree_merge_at " << *dir << dendl;

  if (dir->dir_auth.second != CDIR_AUTH_UNKNOWN ||
      dir->state_test(CDir::STATE_EXPORTBOUND) ||
      dir->state_test(CDir::STATE_AUXSUBTREE))
    return;

  auto it = subtrees.find(dir);
  ceph_assert(it != subtrees.end());

  // merge with parent?
  CDir *parent = dir;
  if (!dir->inode->is_base())
    parent = get_subtree_root(dir->get_parent_dir());

  if (parent != dir &&                          // we have a parent,
      parent->dir_auth == dir->dir_auth) {      // auth matches,
    // merge with parent.
    dout(10) << " subtree merge at " << *dir << dendl;
    dir->set_dir_auth(CDIR_AUTH_DEFAULT);

    // move our bounds under the parent
    subtrees[parent].insert(it->second.begin(), it->second.end());

    // we are no longer a subtree or bound
    dir->put(CDir::PIN_SUBTREE);
    subtrees.erase(it);
    subtrees[parent].erase(dir);

    // adjust popularity?
    if (adjust_pop && dir->is_auth()) {
      CDir *cur = dir;
      CDir *p = dir->get_parent_dir();
      while (p) {
        p->pop_auth_subtree.add(dir->pop_auth_subtree);
        p->pop_lru_subdirs.push_front(&cur->get_inode()->item_pop_lru);
        if (p->is_subtree_root()) break;
        cur = p;
        p = p->inode->get_parent_dir();
      }
    }

    if (to_eval && dir->get_inode()->is_auth())
      to_eval->insert(dir->get_inode());

    show_subtrees(15);
  }
}

// Apply the deferred (write-behind) part of a subtree merge after the
// journal entry committed: pop the projected inode and drop locks.
void MDCache::subtree_merge_writebehind_finish(CInode *in, MutationRef& mut)
{
  dout(10) << "subtree_merge_writebehind_finish on " << in << dendl;
  in->pop_and_dirty_projected_inode(mut->ls);

  mut->apply();
  mds->locker->drop_locks(mut.get());
  mut->cleanup();

  in->auth_unpin(this);
}

void MDCache::eval_subtree_root(CInode *diri)
{
  // evaluate subtree inode filelock?
  // (we should scatter the filelock on subtree bounds)
  ceph_assert(diri->is_auth());
  mds->locker->try_eval(diri, CEPH_LOCK_IFILE | CEPH_LOCK_INEST);
}


// Like adjust_subtree_auth(), but also reconciles the subtree's bound
// set against the given 'bounds' (creating, collapsing or swallowing
// intervening subtrees as needed).
void MDCache::adjust_bounded_subtree_auth(CDir *dir, const set<CDir*>& bounds, mds_authority_t auth)
{
  dout(7) << "adjust_bounded_subtree_auth " << dir->get_dir_auth() << " -> " << auth
          << " on " << *dir
          << " bounds " << bounds
          << dendl;

  show_subtrees();

  CDir *root;
  if (dir->ino() == MDS_INO_ROOT) {
    root = dir;  // bootstrap hack.
    // (continuation of adjust_bounded_subtree_auth(), begun above)
    if (subtrees.count(root) == 0) {
      subtrees[root];
      root->get(CDir::PIN_SUBTREE);
    }
  } else {
    root = get_subtree_root(dir);  // subtree root
  }
  ceph_assert(root);
  ceph_assert(subtrees.count(root));
  dout(7) << " current root is " << *root << dendl;

  mds_authority_t oldauth = dir->authority();

  if (root == dir) {
    // i am already a subtree.
    dir->set_dir_auth(auth);
  } else {
    // i am a new subtree.
    dout(10) << " new subtree at " << *dir << dendl;
    ceph_assert(subtrees.count(dir) == 0);
    subtrees[dir];  // create empty subtree bounds list for me.
    dir->get(CDir::PIN_SUBTREE);

    // set dir_auth
    dir->set_dir_auth(auth);

    // move items nested beneath me, under me.
    set<CDir*>::iterator p = subtrees[root].begin();
    while (p != subtrees[root].end()) {
      set<CDir*>::iterator next = p;
      ++next;  // grab next now; erase(p) below invalidates p
      if (get_subtree_root((*p)->get_parent_dir()) == dir) {
        // move under me
        dout(10) << " claiming child bound " << **p << dendl;
        subtrees[dir].insert(*p);
        subtrees[root].erase(p);
      }
      p = next;
    }

    // i am a bound of the parent subtree.
    subtrees[root].insert(dir);

    // i am now the subtree root.
    root = dir;
  }

  set<CInode*> to_eval;

  // verify/adjust bounds.
  // - these may be new, or
  // - beneath existing ambiguous bounds (which will be collapsed),
  // - but NOT beneath unambiguous bounds.
  for (const auto& bound : bounds) {
    // new bound?
    if (subtrees[dir].count(bound) == 0) {
      if (get_subtree_root(bound) == dir) {
        dout(10) << " new bound " << *bound << ", adjusting auth back to old " << oldauth << dendl;
        adjust_subtree_auth(bound, oldauth);  // otherwise, adjust at bound.
      }
      else {
        dout(10) << " want bound " << *bound << dendl;
        CDir *t = get_subtree_root(bound->get_parent_dir());
        if (subtrees[t].count(bound) == 0) {
          ceph_assert(t != dir);
          dout(10) << " new bound " << *bound << dendl;
          adjust_subtree_auth(bound, t->authority());
        }
        // make sure it's nested beneath ambiguous subtree(s)
        while (1) {
          while (subtrees[dir].count(t) == 0)
            t = get_subtree_root(t->get_parent_dir());
          dout(10) << " swallowing intervening subtree at " << *t << dendl;
          adjust_subtree_auth(t, auth);
          try_subtree_merge_at(t, &to_eval);
          t = get_subtree_root(bound->get_parent_dir());
          if (t == dir) break;
        }
      }
    }
    else {
      dout(10) << " already have bound " << *bound << dendl;
    }
  }
  // merge stray bounds?
  while (!subtrees[dir].empty()) {
    // iterate over a copy: adjust/merge below mutate subtrees[dir]
    set<CDir*> copy = subtrees[dir];
    for (set<CDir*>::iterator p = copy.begin(); p != copy.end(); ++p) {
      if (bounds.count(*p) == 0) {
        CDir *stray = *p;
        dout(10) << " swallowing extra subtree at " << *stray << dendl;
        adjust_subtree_auth(stray, auth);
        try_subtree_merge_at(stray, &to_eval);
      }
    }
    // swallowing subtree may add new subtree bounds
    if (copy == subtrees[dir])
      break;
  }

  // bound should now match.
  verify_subtree_bounds(dir, bounds);

  show_subtrees();

  if (!(mds->is_any_replay() || mds->is_resolve())) {
    for(auto in : to_eval)
      eval_subtree_root(in);
  }
}


/*
 * return a set of CDir*'s that correspond to the given bound set. Only adjust
 * fragmentation as necessary to get an equivalent bounding set. That is, only
 * split if one of our frags spans the provided bounding set. Never merge.
 */
void MDCache::get_force_dirfrag_bound_set(const vector<dirfrag_t>& dfs, set<CDir*>& bounds)
{
  dout(10) << "get_force_dirfrag_bound_set " << dfs << dendl;

  // sort by ino
  map<inodeno_t, fragset_t> byino;
  for (auto& frag : dfs) {
    byino[frag.ino].insert(frag.frag);
  }
  dout(10) << " by ino: " << byino << dendl;

  for (map<inodeno_t,fragset_t>::iterator p = byino.begin(); p != byino.end(); ++p) {
    CInode *diri = get_inode(p->first);
    if (!diri)
      continue;
    dout(10) << " checking fragset " << p->second.get() << " on " << *diri << dendl;

    // build a fragtree that has every requested frag as a leaf
    fragtree_t tmpdft;
    for (set<frag_t>::iterator q = p->second.begin(); q != p->second.end(); ++q)
      tmpdft.force_to_leaf(g_ceph_context, *q);

    for (const auto& fg : p->second) {
      frag_vec_t leaves;
      diri->dirfragtree.get_leaves_under(fg, leaves);
      if (leaves.empty()) {
        // our tree is coarser than the requested frag; split as needed
        bool all = true;
        frag_t approx_fg = diri->dirfragtree[fg.value()];
        frag_vec_t approx_leaves;
        tmpdft.get_leaves_under(approx_fg, approx_leaves);
        for (const auto& leaf : approx_leaves) {
          if (p->second.get().count(leaf) == 0) {
            // not bound, so the resolve message is from auth MDS of the dirfrag
            force_dir_fragment(diri, leaf);
            all = false;
          }
        }
        if (all)
          leaves.push_back(approx_fg);
        else
          diri->dirfragtree.get_leaves_under(fg, leaves);
      }
      dout(10) << " frag " << fg << " contains " << leaves << dendl;
      for (const auto& leaf : leaves) {
        CDir *dir = diri->get_dirfrag(leaf);
        if (dir)
          bounds.insert(dir);
      }
    }
  }
}

// Convenience overload: resolve dirfrag bounds to CDir*'s (forcing
// fragmentation where necessary), then adjust.
void MDCache::adjust_bounded_subtree_auth(CDir *dir, const vector<dirfrag_t>& bound_dfs, const mds_authority_t &auth)
{
  dout(7) << "adjust_bounded_subtree_auth " << dir->get_dir_auth() << " -> " << auth
          << " on " << *dir << " bound_dfs " << bound_dfs << dendl;

  set<CDir*> bounds;
  get_force_dirfrag_bound_set(bound_dfs, bounds);
  adjust_bounded_subtree_auth(dir, bounds, auth);
}

// Map a list of dirfrag_t's to the open CDir*'s that cover them
// (leaves under each requested frag); uncached inodes/frags are skipped.
void MDCache::map_dirfrag_set(const list<dirfrag_t>& dfs, set<CDir*>& result)
{
  dout(10) <<
"map_dirfrag_set " << dfs << dendl; + + // group by inode + map<inodeno_t, fragset_t> ino_fragset; + for (const auto &df : dfs) { + ino_fragset[df.ino].insert(df.frag); + } + + // get frags + for (map<inodeno_t, fragset_t>::iterator p = ino_fragset.begin(); + p != ino_fragset.end(); + ++p) { + CInode *in = get_inode(p->first); + if (!in) + continue; + + frag_vec_t fgs; + for (const auto& fg : p->second) { + in->dirfragtree.get_leaves_under(fg, fgs); + } + + dout(15) << "map_dirfrag_set " << p->second << " -> " << fgs + << " on " << *in << dendl; + + for (const auto& fg : fgs) { + CDir *dir = in->get_dirfrag(fg); + if (dir) + result.insert(dir); + } + } +} + + + +CDir *MDCache::get_subtree_root(CDir *dir) +{ + // find the underlying dir that delegates (or is about to delegate) auth + while (true) { + if (dir->is_subtree_root()) + return dir; + dir = dir->get_inode()->get_parent_dir(); + if (!dir) + return 0; // none + } +} + +CDir *MDCache::get_projected_subtree_root(CDir *dir) +{ + // find the underlying dir that delegates (or is about to delegate) auth + while (true) { + if (dir->is_subtree_root()) + return dir; + dir = dir->get_inode()->get_projected_parent_dir(); + if (!dir) + return 0; // none + } +} + +void MDCache::remove_subtree(CDir *dir) +{ + dout(10) << "remove_subtree " << *dir << dendl; + ceph_assert(subtrees.count(dir)); + ceph_assert(subtrees[dir].empty()); + subtrees.erase(dir); + dir->put(CDir::PIN_SUBTREE); + if (dir->get_parent_dir()) { + CDir *p = get_subtree_root(dir->get_parent_dir()); + ceph_assert(subtrees[p].count(dir)); + subtrees[p].erase(dir); + } +} + +void MDCache::get_subtree_bounds(CDir *dir, set<CDir*>& bounds) +{ + ceph_assert(subtrees.count(dir)); + bounds = subtrees[dir]; +} + +void MDCache::get_wouldbe_subtree_bounds(CDir *dir, set<CDir*>& bounds) +{ + if (subtrees.count(dir)) { + // just copy them, dir is a subtree. 
+ get_subtree_bounds(dir, bounds); + } else { + // find them + CDir *root = get_subtree_root(dir); + for (set<CDir*>::iterator p = subtrees[root].begin(); + p != subtrees[root].end(); + ++p) { + CDir *t = *p; + while (t != root) { + t = t->get_parent_dir(); + ceph_assert(t); + if (t == dir) { + bounds.insert(*p); + continue; + } + } + } + } +} + +void MDCache::verify_subtree_bounds(CDir *dir, const set<CDir*>& bounds) +{ + // for debugging only. + ceph_assert(subtrees.count(dir)); + if (bounds != subtrees[dir]) { + dout(0) << "verify_subtree_bounds failed" << dendl; + set<CDir*> b = bounds; + for (auto &cd : subtrees[dir]) { + if (bounds.count(cd)) { + b.erase(cd); + continue; + } + dout(0) << " missing bound " << *cd << dendl; + } + for (const auto &cd : b) + dout(0) << " extra bound " << *cd << dendl; + } + ceph_assert(bounds == subtrees[dir]); +} + +void MDCache::verify_subtree_bounds(CDir *dir, const list<dirfrag_t>& bounds) +{ + // for debugging only. + ceph_assert(subtrees.count(dir)); + + // make sure that any bounds i do have are properly noted as such. 
+ int failed = 0; + for (const auto &fg : bounds) { + CDir *bd = get_dirfrag(fg); + if (!bd) continue; + if (subtrees[dir].count(bd) == 0) { + dout(0) << "verify_subtree_bounds failed: extra bound " << *bd << dendl; + failed++; + } + } + ceph_assert(failed == 0); +} + +void MDCache::project_subtree_rename(CInode *diri, CDir *olddir, CDir *newdir) +{ + dout(10) << "project_subtree_rename " << *diri << " from " << *olddir + << " to " << *newdir << dendl; + projected_subtree_renames[diri].push_back(pair<CDir*,CDir*>(olddir, newdir)); +} + +void MDCache::adjust_subtree_after_rename(CInode *diri, CDir *olddir, bool pop) +{ + dout(10) << "adjust_subtree_after_rename " << *diri << " from " << *olddir << dendl; + + CDir *newdir = diri->get_parent_dir(); + + if (pop) { + map<CInode*,list<pair<CDir*,CDir*> > >::iterator p = projected_subtree_renames.find(diri); + ceph_assert(p != projected_subtree_renames.end()); + ceph_assert(!p->second.empty()); + ceph_assert(p->second.front().first == olddir); + ceph_assert(p->second.front().second == newdir); + p->second.pop_front(); + if (p->second.empty()) + projected_subtree_renames.erase(p); + } + + vector<CDir*> dfls; + + // adjust total auth pin of freezing subtree + if (olddir != newdir) { + diri->get_nested_dirfrags(dfls); + for (auto dir : dfls) + olddir->adjust_freeze_after_rename(dir); + dfls.clear(); + } + + // adjust subtree + // make sure subtree dirfrags are at the front of the list + diri->get_subtree_dirfrags(dfls); + diri->get_nested_dirfrags(dfls); + for (auto dir : dfls) { + dout(10) << "dirfrag " << *dir << dendl; + CDir *oldparent = get_subtree_root(olddir); + dout(10) << " old parent " << *oldparent << dendl; + CDir *newparent = get_subtree_root(newdir); + dout(10) << " new parent " << *newparent << dendl; + + if (olddir != newdir) + mds->balancer->adjust_pop_for_rename(olddir, dir, false); + + if (oldparent == newparent) { + dout(10) << "parent unchanged for " << *dir << " at " << *oldparent << dendl; + } else if 
(dir->is_subtree_root()) { + // children are fine. change parent. + dout(10) << "moving " << *dir << " from " << *oldparent << " to " << *newparent << dendl; + ceph_assert(subtrees[oldparent].count(dir)); + subtrees[oldparent].erase(dir); + ceph_assert(subtrees.count(newparent)); + subtrees[newparent].insert(dir); + // caller is responsible for 'eval diri' + try_subtree_merge_at(dir, NULL, false); + } else { + // mid-subtree. + + // see if any old bounds move to the new parent. + list<CDir*> tomove; + for (set<CDir*>::iterator p = subtrees[oldparent].begin(); + p != subtrees[oldparent].end(); + ++p) { + CDir *bound = *p; + CDir *broot = get_subtree_root(bound->get_parent_dir()); + if (broot != oldparent) { + ceph_assert(broot == newparent); + tomove.push_back(bound); + } + } + for (list<CDir*>::iterator p = tomove.begin(); p != tomove.end(); ++p) { + CDir *bound = *p; + dout(10) << "moving bound " << *bound << " from " << *oldparent << " to " << *newparent << dendl; + subtrees[oldparent].erase(bound); + subtrees[newparent].insert(bound); + } + + // did auth change? + if (oldparent->authority() != newparent->authority()) { + adjust_subtree_auth(dir, oldparent->authority(), false); + // caller is responsible for 'eval diri' + try_subtree_merge_at(dir, NULL, false); + } + } + + if (olddir != newdir) + mds->balancer->adjust_pop_for_rename(newdir, dir, true); + } + + show_subtrees(); +} + +// =================================== +// journal and snap/cow helpers + + +/* + * find first inode in cache that follows given snapid. otherwise, return current. 
 */
CInode *MDCache::pick_inode_snap(CInode *in, snapid_t follows)
{
  dout(10) << "pick_inode_snap follows " << follows << " on " << *in << dendl;
  ceph_assert(in->last == CEPH_NOSNAP);

  auto p = snap_inode_map.upper_bound(vinodeno_t(in->ino(), follows));
  if (p != snap_inode_map.end() && p->second->ino() == in->ino()) {
    dout(10) << "pick_inode_snap found " << *p->second << dendl;
    in = p->second;
  }

  return in;
}


/*
 * note: i'm currently cheating wrt dirty and inode.version on cow
 * items. instead of doing a full dir predirty, i just take the
 * original item's version, and set the dirty flag (via
 * mutation::add_cow_{inode,dentry}() and mutation::apply(). that
 * means a special case in the dir commit clean sweep assertions.
 * bah.
 */
CInode *MDCache::cow_inode(CInode *in, snapid_t last)
{
  ceph_assert(last >= in->first);

  // clone the previous projected state into a new [in->first,last] inode
  CInode *oldin = new CInode(this, true, in->first, last);
  oldin->inode = *in->get_previous_projected_inode();
  oldin->xattrs = *in->get_previous_projected_xattrs();
  oldin->symlink = in->symlink;
  oldin->inode.trim_client_ranges(last);

  if (in->first < in->oldest_snap)
    in->oldest_snap = in->first;

  in->first = last+1;

  dout(10) << "cow_inode " << *in << " to " << *oldin << dendl;
  add_inode(oldin);

  if (in->last != CEPH_NOSNAP) {
    // 'in' is itself a snapped inode: rearrange need_snapflush tracking
    CInode *head_in = get_inode(in->ino());
    ceph_assert(head_in);
    auto ret = head_in->split_need_snapflush(oldin, in);
    if (ret.first) {
      // oldin takes over the pending snapflush gathering state
      oldin->client_snap_caps = in->client_snap_caps;
      if (!oldin->client_snap_caps.empty()) {
        for (int i = 0; i < num_cinode_locks; i++) {
          SimpleLock *lock = oldin->get_lock(cinode_lock_info[i].lock);
          ceph_assert(lock);
          if (lock->get_state() != LOCK_SNAP_SYNC) {
            ceph_assert(lock->is_stable());
            lock->set_state(LOCK_SNAP_SYNC);  // gathering
            oldin->auth_pin(lock);
          }
          lock->get_wrlock(true);
        }
      }
    }
    if (!ret.second) {
      // 'in' no longer needs snapflushes; release its gathering wrlocks
      auto client_snap_caps = std::move(in->client_snap_caps);
      in->client_snap_caps.clear();
      in->item_open_file.remove_myself();
      in->item_caps.remove_myself();

      if (!client_snap_caps.empty()) {
        MDSContext::vec finished;
        for (int i = 0; i < num_cinode_locks; i++) {
          SimpleLock *lock = in->get_lock(cinode_lock_info[i].lock);
          ceph_assert(lock);
          ceph_assert(lock->get_state() == LOCK_SNAP_SYNC);  // gathering
          lock->put_wrlock();
          if (!lock->get_num_wrlocks()) {
            lock->set_state(LOCK_SYNC);
            lock->take_waiting(SimpleLock::WAIT_STABLE|SimpleLock::WAIT_RD, finished);
            in->auth_unpin(lock);
          }
        }
        mds->queue_waiters(finished);
      }
    }
    return oldin;
  }

  if (!in->client_caps.empty()) {
    const set<snapid_t>& snaps = in->find_snaprealm()->get_snaps();
    // clone caps?
    for (auto &p : in->client_caps) {
      client_t client = p.first;
      Capability *cap = &p.second;
      int issued = cap->need_snapflush() ? CEPH_CAP_ANY_WR : cap->issued();
      if ((issued & CEPH_CAP_ANY_WR) &&
          cap->client_follows < last) {
        // client may have dirty data for the snapped range; expect a snapflush
        dout(10) << " client." << client << " cap " << ccap_string(issued) << dendl;
        oldin->client_snap_caps.insert(client);
        cap->client_follows = last;

        // we need snapflushes for any intervening snaps
        dout(10) << " snaps " << snaps << dendl;
        for (auto q = snaps.lower_bound(oldin->first);
             q != snaps.end() && *q <= last;
             ++q) {
          in->add_need_snapflush(oldin, *q, client);
        }
      } else {
        dout(10) << " ignoring client." << client << " cap follows " << cap->client_follows << dendl;
      }
    }

    if (!oldin->client_snap_caps.empty()) {
      // put the relevant locks into gathering state until snapflushes arrive
      for (int i = 0; i < num_cinode_locks; i++) {
        SimpleLock *lock = oldin->get_lock(cinode_lock_info[i].lock);
        ceph_assert(lock);
        if (lock->get_state() != LOCK_SNAP_SYNC) {
          ceph_assert(lock->is_stable());
          lock->set_state(LOCK_SNAP_SYNC);  // gathering
          oldin->auth_pin(lock);
        }
        lock->get_wrlock(true);
      }
    }
  }
  return oldin;
}

// Copy-on-write a dentry (and its primary inode, if any) for snapshots
// before a mutation, journaling the old version via 'metablob'.
// *pcow_inode (if non-null) receives the cloned inode.
void MDCache::journal_cow_dentry(MutationImpl *mut, EMetaBlob *metablob,
                                 CDentry *dn, snapid_t follows,
                                 CInode **pcow_inode, CDentry::linkage_t *dnl)
{
  if (!dn) {
    dout(10) << "journal_cow_dentry got null CDentry, returning" << dendl;
    return;
  }
  dout(10) << "journal_cow_dentry follows " << follows << " on " << *dn << dendl;
  ceph_assert(dn->is_auth());

  // nothing to cow on a null dentry, fix caller
  if (!dnl)
    dnl = dn->get_projected_linkage();
  ceph_assert(!dnl->is_null());

  CInode *in = dnl->is_primary() ? dnl->get_inode() : NULL;
  bool cow_head = false;
  if (in && in->state_test(CInode::STATE_AMBIGUOUSAUTH)) {
    ceph_assert(in->is_frozen_inode());
    cow_head = true;
  }
  if (in && (in->is_multiversion() || cow_head)) {
    // multiversion inode.
    SnapRealm *realm = NULL;

    if (in->get_projected_parent_dn() != dn) {
      // 'dn' is a remote link; cow the dentry itself against the dir's realm
      ceph_assert(follows == CEPH_NOSNAP);
      realm = dn->dir->inode->find_snaprealm();
      snapid_t dir_follows = get_global_snaprealm()->get_newest_seq();
      ceph_assert(dir_follows >= realm->get_newest_seq());

      if (dir_follows+1 > dn->first) {
        snapid_t oldfirst = dn->first;
        dn->first = dir_follows+1;
        if (realm->has_snaps_in_range(oldfirst, dir_follows)) {
          CDentry *olddn = dn->dir->add_remote_dentry(dn->get_name(), in->ino(), in->d_type(),
                                                      oldfirst, dir_follows);
          olddn->pre_dirty();
          dout(10) << " olddn " << *olddn << dendl;
          metablob->add_remote_dentry(olddn, true);
          mut->add_cow_dentry(olddn);
          // FIXME: adjust link count here? hmm.

          if (dir_follows+1 > in->first)
            in->cow_old_inode(dir_follows, cow_head);
        }
      }

      follows = dir_follows;
      if (in->snaprealm) {
        realm = in->snaprealm;
        ceph_assert(follows >= realm->get_newest_seq());
      }
    } else {
      realm = in->find_snaprealm();
      if (follows == CEPH_NOSNAP) {
        follows = get_global_snaprealm()->get_newest_seq();
        ceph_assert(follows >= realm->get_newest_seq());
      }
    }

    // already cloned?
    if (follows < in->first) {
      dout(10) << "journal_cow_dentry follows " << follows << " < first on " << *in << dendl;
      return;
    }

    if (!realm->has_snaps_in_range(in->first, follows)) {
      dout(10) << "journal_cow_dentry no snapshot follows " << follows << " on " << *in << dendl;
      in->first = follows + 1;
      return;
    }

    in->cow_old_inode(follows, cow_head);

  } else {
    // non-multiversion: cow the dentry (and primary inode) explicitly
    SnapRealm *realm = dn->dir->inode->find_snaprealm();
    if (follows == CEPH_NOSNAP) {
      follows = get_global_snaprealm()->get_newest_seq();
      ceph_assert(follows >= realm->get_newest_seq());
    }

    // already cloned?
    if (follows < dn->first) {
      dout(10) << "journal_cow_dentry follows " << follows << " < first on " << *dn << dendl;
      return;
    }

    // update dn.first before adding old dentry to cdir's map
    snapid_t oldfirst = dn->first;
    dn->first = follows+1;

    if (!realm->has_snaps_in_range(oldfirst, follows)) {
      dout(10) << "journal_cow_dentry no snapshot follows " << follows << " on " << *dn << dendl;
      if (in)
        in->first = follows+1;
      return;
    }

    dout(10) << " dn " << *dn << dendl;
    if (in) {
      CInode *oldin = cow_inode(in, follows);
      mut->add_cow_inode(oldin);
      if (pcow_inode)
        *pcow_inode = oldin;
      CDentry *olddn = dn->dir->add_primary_dentry(dn->get_name(), oldin, oldfirst, follows);
      oldin->inode.version = olddn->pre_dirty();
      dout(10) << " olddn " << *olddn << dendl;
      bool need_snapflush = !oldin->client_snap_caps.empty();
      if (need_snapflush) {
        mut->ls->open_files.push_back(&oldin->item_open_file);
        mds->locker->mark_need_snapflush_inode(oldin);
      }
      metablob->add_primary_dentry(olddn, 0, true, false, false, need_snapflush);
      mut->add_cow_dentry(olddn);
    } else {
      ceph_assert(dnl->is_remote());
      CDentry *olddn = dn->dir->add_remote_dentry(dn->get_name(), dnl->get_remote_ino(), dnl->get_remote_d_type(),
                                                  oldfirst, follows);
      olddn->pre_dirty();
      dout(10) << " olddn " << *olddn << dendl;
      metablob->add_remote_dentry(olddn, true);
      mut->add_cow_dentry(olddn);
    }
  }
}


// Convenience wrapper: cow an inode via its projected parent dentry.
void MDCache::journal_cow_inode(MutationRef& mut, EMetaBlob *metablob,
                                CInode *in, snapid_t follows,
                                CInode **pcow_inode)
{
  dout(10) << "journal_cow_inode follows " << follows << " on " << *in << dendl;
  CDentry *dn = in->get_projected_parent_dn();
  journal_cow_dentry(mut.get(), metablob, dn, follows, pcow_inode);
}

// Journal a dirty inode: cow its parent dentry as needed, then add the
// (primary or root) dentry to the metablob.
void MDCache::journal_dirty_inode(MutationImpl *mut, EMetaBlob *metablob, CInode *in, snapid_t follows)
{
  if (in->is_base()) {
    metablob->add_root(true, in);
  } else {
    if (follows == CEPH_NOSNAP && in->last != CEPH_NOSNAP)
      follows
= in->first - 1;
    CDentry *dn = in->get_projected_parent_dn();
    if (!dn->get_projected_linkage()->is_null())  // no need to cow a null dentry
      journal_cow_dentry(mut, metablob, dn, follows);
    if (in->get_projected_inode()->is_backtrace_updated()) {
      bool dirty_pool = in->get_projected_inode()->layout.pool_id !=
        in->get_previous_projected_inode()->layout.pool_id;
      metablob->add_primary_dentry(dn, in, true, true, dirty_pool);
    } else {
      metablob->add_primary_dentry(dn, in, true);
    }
  }
}



// nested ---------------------------------------------------------------

// Propagate an inode's (and its dirty old snapshots') rstat deltas into
// the parent dirfrag's fnode.  linkunlink is -1/0/+1 for unlink/update/link.
void MDCache::project_rstat_inode_to_frag(CInode *cur, CDir *parent, snapid_t first,
                                          int linkunlink, SnapRealm *prealm)
{
  CDentry *parentdn = cur->get_projected_parent_dn();
  CInode::mempool_inode *curi = cur->get_projected_inode();

  if (cur->first > first)
    first = cur->first;

  dout(10) << "projected_rstat_inode_to_frag first " << first << " linkunlink " << linkunlink
           << " " << *cur << dendl;
  dout(20) << " frag head is [" << parent->first << ",head] " << dendl;
  dout(20) << " inode update is [" << first << "," << cur->last << "]" << dendl;

  /*
   * FIXME. this incompletely propagates rstats to _old_ parents
   * (i.e. shortly after a directory rename). but we need full
   * blown hard link backpointers to make this work properly...
   */
  snapid_t floor = parentdn->first;
  dout(20) << " floor of " << floor << " from parent dn " << *parentdn << dendl;

  if (!prealm)
    prealm = parent->inode->find_snaprealm();
  const set<snapid_t> snaps = prealm->get_snaps();

  if (cur->last != CEPH_NOSNAP) {
    ceph_assert(cur->dirty_old_rstats.empty());
    set<snapid_t>::const_iterator q = snaps.lower_bound(std::max(first, floor));
    if (q == snaps.end() || *q > cur->last)
      return;  // no snapshot covers this range; nothing to propagate
  }

  if (cur->last >= floor) {
    bool update = true;
    if (cur->state_test(CInode::STATE_AMBIGUOUSAUTH) && cur->is_auth()) {
      // rename src inode is not projected in the slave rename prep case. so we should
      // avoid updating the inode.
      ceph_assert(linkunlink < 0);
      ceph_assert(cur->is_frozen_inode());
      update = false;
    }
    _project_rstat_inode_to_frag(*curi, std::max(first, floor), cur->last, parent,
                                 linkunlink, update);
  }

  if (g_conf()->mds_snap_rstat) {
    // also project each dirty old-snapshot rstat that a snap still covers
    for (const auto &p : cur->dirty_old_rstats) {
      auto &old = cur->old_inodes[p];
      snapid_t ofirst = std::max(old.first, floor);
      auto it = snaps.lower_bound(ofirst);
      if (it == snaps.end() || *it > p)
        continue;
      if (p >= floor)
        _project_rstat_inode_to_frag(old.inode, ofirst, p, parent, 0, false);
    }
  }
  cur->dirty_old_rstats.clear();
}


// Apply one inode's rstat delta to the parent frag over [ofirst,last],
// splitting the frag's head rstat / dirty_old_rstat segments as needed
// so each journal segment is updated exactly once.
void MDCache::_project_rstat_inode_to_frag(CInode::mempool_inode& inode, snapid_t ofirst, snapid_t last,
                                           CDir *parent, int linkunlink, bool update_inode)
{
  dout(10) << "_project_rstat_inode_to_frag [" << ofirst << "," << last << "]" << dendl;
  dout(20) << " inode rstat " << inode.rstat << dendl;
  dout(20) << " inode accounted_rstat " << inode.accounted_rstat << dendl;
  nest_info_t delta;
  if (linkunlink == 0) {
    delta.add(inode.rstat);
    delta.sub(inode.accounted_rstat);
  } else if (linkunlink < 0) {
    delta.sub(inode.accounted_rstat);
  } else {
    delta.add(inode.rstat);
  }
  dout(20) << " delta " << delta << dendl;

  if (update_inode)
    inode.accounted_rstat = inode.rstat;

  while (last >= ofirst) {
    /*
     * pick fnode version to update. at each iteration, we want to
     * pick a segment ending in 'last' to update. split as necessary
     * to make that work. then, adjust first up so that we only
     * update one segment at a time. then loop to cover the whole
     * [ofirst,last] interval.
     */
    nest_info_t *prstat;
    snapid_t first;
    fnode_t *pf = parent->get_projected_fnode();
    if (last == CEPH_NOSNAP) {
      if (g_conf()->mds_snap_rstat)
        first = std::max(ofirst, parent->first);
      else
        first = parent->first;
      prstat = &pf->rstat;
      dout(20) << " projecting to head [" << first << "," << last << "] " << *prstat << dendl;

      if (first > parent->first &&
          !(pf->rstat == pf->accounted_rstat)) {
        dout(10) << " target snapped and not fully accounted, cow to dirty_old_rstat ["
                 << parent->first << "," << (first-1) << "] "
                 << " " << *prstat << "/" << pf->accounted_rstat
                 << dendl;
        parent->dirty_old_rstat[first-1].first = parent->first;
        parent->dirty_old_rstat[first-1].rstat = pf->rstat;
        parent->dirty_old_rstat[first-1].accounted_rstat = pf->accounted_rstat;
      }
      parent->first = first;
    } else if (!g_conf()->mds_snap_rstat) {
      // drop snapshots' rstats
      break;
    } else if (last >= parent->first) {
      first = parent->first;
      parent->dirty_old_rstat[last].first = first;
      parent->dirty_old_rstat[last].rstat = pf->rstat;
      parent->dirty_old_rstat[last].accounted_rstat = pf->accounted_rstat;
      prstat = &parent->dirty_old_rstat[last].rstat;
      dout(10) << " projecting to newly split dirty_old_fnode [" << first << "," << last << "] "
               << " " << *prstat << "/" << pf->accounted_rstat << dendl;
    } else {
      // be careful, dirty_old_rstat is a _sparse_ map.
      // sorry, this is ugly.
      first = ofirst;

      // find any intersection with last
      auto it = parent->dirty_old_rstat.lower_bound(last);
      if (it == parent->dirty_old_rstat.end()) {
        dout(20) << " no dirty_old_rstat with last >= last " << last << dendl;
        if (!parent->dirty_old_rstat.empty() && parent->dirty_old_rstat.rbegin()->first >= first) {
          dout(20) << " last dirty_old_rstat ends at " << parent->dirty_old_rstat.rbegin()->first << dendl;
          first = parent->dirty_old_rstat.rbegin()->first+1;
        }
      } else {
        // *it last is >= last
        if (it->second.first <= last) {
          // *it intersects [first,last]
          if (it->second.first < first) {
            dout(10) << " splitting off left bit [" << it->second.first << "," << first-1 << "]" << dendl;
            parent->dirty_old_rstat[first-1] = it->second;
            it->second.first = first;
          }
          if (it->second.first > first)
            first = it->second.first;
          if (last < it->first) {
            dout(10) << " splitting off right bit [" << last+1 << "," << it->first << "]" << dendl;
            parent->dirty_old_rstat[last] = it->second;
            it->second.first = last+1;
          }
        } else {
          // *it is to the _right_ of [first,last]
          it = parent->dirty_old_rstat.lower_bound(first);
          // new *it last is >= first
          if (it->second.first <= last &&  // new *it isn't also to the right, and
              it->first >= first) {        // it intersects our first bit,
            dout(10) << " staying to the right of [" << it->second.first << "," << it->first << "]..." << dendl;
            first = it->first+1;
          }
          dout(10) << " projecting to new dirty_old_rstat [" << first << "," << last << "]" << dendl;
        }
      }
      dout(20) << " projecting to dirty_old_rstat [" << first << "," << last << "]" << dendl;
      parent->dirty_old_rstat[last].first = first;
      prstat = &parent->dirty_old_rstat[last].rstat;
    }

    // apply
    dout(20) << " project to [" << first << "," << last << "] " << *prstat << dendl;
    ceph_assert(last >= first);
    prstat->add(delta);
    if (update_inode)
      inode.accounted_rstat = inode.rstat;
    dout(20) << " result [" << first << "," << last << "] " << *prstat << " " << *parent << dendl;

    last = first-1;  // continue with the next-older segment
  }
}

// Propagate a frag's rstat delta up into the parent inode 'pin' over
// [ofirst,last], cowing/splitting old_inodes as needed per segment.
void MDCache::project_rstat_frag_to_inode(nest_info_t& rstat, nest_info_t& accounted_rstat,
                                          snapid_t ofirst, snapid_t last,
                                          CInode *pin, bool cow_head)
{
  dout(10) << "project_rstat_frag_to_inode [" << ofirst << "," << last << "]" << dendl;
  dout(20) << " frag rstat " << rstat << dendl;
  dout(20) << " frag accounted_rstat " << accounted_rstat << dendl;
  nest_info_t delta = rstat;
  delta.sub(accounted_rstat);
  dout(20) << " delta " << delta << dendl;

  while (last >= ofirst) {
    CInode::mempool_inode *pi;
    snapid_t first;
    if (last == pin->last) {
      pi = pin->get_projected_inode();
      first = std::max(ofirst, pin->first);
      if (first > pin->first) {
        auto &old = pin->cow_old_inode(first-1, cow_head);
        dout(20) << " cloned old_inode rstat is " << old.inode.rstat << dendl;
      }
    } else {
      if (last >= pin->first) {
        first = pin->first;
        pin->cow_old_inode(last, cow_head);
      } else {
        // our life is easier here because old_inodes is not sparse
        // (although it may not begin at snapid 1)
        auto it = pin->old_inodes.lower_bound(last);
        if (it == pin->old_inodes.end()) {
          dout(10) << " no old_inode <= " << last << ", done." << dendl;
          break;
        }
        first = it->second.first;
        if (first > last) {
          dout(10) << " oldest old_inode is [" << first << "," << it->first << "], done." << dendl;
          //assert(p == pin->old_inodes.begin());
          break;
        }
        if (it->first > last) {
          dout(10) << " splitting right old_inode [" << first << "," << it->first << "] to ["
                   << (last+1) << "," << it->first << "]" << dendl;
          pin->old_inodes[last] = it->second;
          it->second.first = last+1;
          pin->dirty_old_rstats.insert(it->first);
        }
      }
      if (first < ofirst) {
        dout(10) << " splitting left old_inode [" << first << "," << last << "] to ["
                 << first << "," << ofirst-1 << "]" << dendl;
        pin->old_inodes[ofirst-1] = pin->old_inodes[last];
        pin->dirty_old_rstats.insert(ofirst-1);
        pin->old_inodes[last].first = first = ofirst;
      }
      pi = &pin->old_inodes[last].inode;
      pin->dirty_old_rstats.insert(last);
    }
    dout(20) << " projecting to [" << first << "," << last << "] " << pi->rstat << dendl;
    pi->rstat.add(delta);
    dout(20) << " result [" << first << "," << last << "] " << pi->rstat << dendl;

    last = first-1;
  }
}

// (head of broadcast_quota_to_client(); the remainder is beyond this chunk)
void MDCache::broadcast_quota_to_client(CInode *in, client_t exclude_ct, bool quota_change)
{
  if (!(mds->is_active() || mds->is_stopping()))
    return;

  if (!in->is_auth() || in->is_frozen())
    return;

  auto i = in->get_projected_inode();

  if (!i->quota.is_enable() &&
      !quota_change)
    return;

  // create snaprealm for quota inode (quota was set before mimic)
  if (!in->get_projected_srnode())
    mds->server->create_quota_realm(in);

  for (auto &p : in->client_caps) {
    Capability *cap = &p.second;
    if (cap->is_noquota())
      continue;

    if (exclude_ct >= 0 && exclude_ct != p.first)
      goto update;

    if (cap->last_rbytes == i->rstat.rbytes &&
        cap->last_rsize == i->rstat.rsize())
      continue;

    if (i->quota.max_files > 0) {
      if (i->rstat.rsize() >= i->quota.max_files)
        goto update;

      if ((abs(cap->last_rsize - i->quota.max_files) >> 4) <
          abs(cap->last_rsize - i->rstat.rsize()))
        goto update;
    }

    if (i->quota.max_bytes > 0) {
      if (i->rstat.rbytes > i->quota.max_bytes - (i->quota.max_bytes >> 3))
        goto
update; + + if ((abs(cap->last_rbytes - i->quota.max_bytes) >> 4) < + abs(cap->last_rbytes - i->rstat.rbytes)) + goto update; + } + + continue; + +update: + cap->last_rsize = i->rstat.rsize(); + cap->last_rbytes = i->rstat.rbytes; + + auto msg = MClientQuota::create(); + msg->ino = in->ino(); + msg->rstat = i->rstat; + msg->quota = i->quota; + mds->send_message_client_counted(msg, cap->get_session()); + } + for (const auto &it : in->get_replicas()) { + auto msg = MGatherCaps::create(); + msg->ino = in->ino(); + mds->send_message_mds(msg, it.first); + } +} + +/* + * NOTE: we _have_ to delay the scatter if we are called during a + * rejoin, because we can't twiddle locks between when the + * rejoin_(weak|strong) is received and when we send the rejoin_ack. + * normally, this isn't a problem: a recover mds doesn't twiddle locks + * (no requests), and a survivor acks immediately. _except_ that + * during rejoin_(weak|strong) processing, we may complete a lock + * gather, and do a scatter_writebehind.. and we _can't_ twiddle the + * scatterlock state in that case or the lock states will get out of + * sync between the auth and replica. + * + * the simple solution is to never do the scatter here. instead, put + * the scatterlock on a list if it isn't already wrlockable. this is + * probably the best plan anyway, since we avoid too many + * scatters/locks under normal usage. + */ +/* + * some notes on dirlock/nestlock scatterlock semantics: + * + * the fragstat (dirlock) will never be updated without + * dirlock+nestlock wrlock held by the caller. + * + * the rstat (nestlock) _may_ get updated without a wrlock when nested + * data is pushed up the tree. this could be changed with some + * restructuring here, but in its current form we ensure that the + * fragstat+rstat _always_ reflect an accurrate summation over the dir + * frag, which is nice. 
and, we only need to track frags that need to + * be nudged (and not inodes with pending rstat changes that need to + * be pushed into the frag). a consequence of this is that the + * accounted_rstat on scatterlock sync may not match our current + * rstat. this is normal and expected. + */ +void MDCache::predirty_journal_parents(MutationRef mut, EMetaBlob *blob, + CInode *in, CDir *parent, + int flags, int linkunlink, + snapid_t cfollows) +{ + bool primary_dn = flags & PREDIRTY_PRIMARY; + bool do_parent_mtime = flags & PREDIRTY_DIR; + bool shallow = flags & PREDIRTY_SHALLOW; + + ceph_assert(mds->mdlog->entry_is_open()); + + // make sure stamp is set + if (mut->get_mds_stamp() == utime_t()) + mut->set_mds_stamp(ceph_clock_now()); + + if (in->is_base()) + return; + + dout(10) << "predirty_journal_parents" + << (do_parent_mtime ? " do_parent_mtime":"") + << " linkunlink=" << linkunlink + << (primary_dn ? " primary_dn":" remote_dn") + << (shallow ? " SHALLOW":"") + << " follows " << cfollows + << " " << *in << dendl; + + if (!parent) { + ceph_assert(primary_dn); + parent = in->get_projected_parent_dn()->get_dir(); + } + + if (flags == 0 && linkunlink == 0) { + dout(10) << " no flags/linkunlink, just adding dir context to blob(s)" << dendl; + blob->add_dir_context(parent); + return; + } + + // build list of inodes to wrlock, dirty, and update + list<CInode*> lsi; + CInode *cur = in; + CDentry *parentdn = NULL; + bool first = true; + while (parent) { + //assert(cur->is_auth() || !primary_dn); // this breaks the rename auth twiddle hack + ceph_assert(parent->is_auth()); + + // opportunistically adjust parent dirfrag + CInode *pin = parent->get_inode(); + + // inode -> dirfrag + mut->auth_pin(parent); + mut->add_projected_fnode(parent); + + fnode_t *pf = parent->project_fnode(); + pf->version = parent->pre_dirty(); + + if (do_parent_mtime || linkunlink) { + ceph_assert(mut->is_wrlocked(&pin->filelock)); + ceph_assert(mut->is_wrlocked(&pin->nestlock)); + 
ceph_assert(cfollows == CEPH_NOSNAP); + + // update stale fragstat/rstat? + parent->resync_accounted_fragstat(); + parent->resync_accounted_rstat(); + + if (do_parent_mtime) { + pf->fragstat.mtime = mut->get_op_stamp(); + pf->fragstat.change_attr++; + dout(10) << "predirty_journal_parents bumping change_attr to " << pf->fragstat.change_attr << " on " << parent << dendl; + if (pf->fragstat.mtime > pf->rstat.rctime) { + dout(10) << "predirty_journal_parents updating mtime on " << *parent << dendl; + pf->rstat.rctime = pf->fragstat.mtime; + } else { + dout(10) << "predirty_journal_parents updating mtime UNDERWATER on " << *parent << dendl; + } + } + if (linkunlink) { + dout(10) << "predirty_journal_parents updating size on " << *parent << dendl; + if (in->is_dir()) { + pf->fragstat.nsubdirs += linkunlink; + //pf->rstat.rsubdirs += linkunlink; + } else { + pf->fragstat.nfiles += linkunlink; + //pf->rstat.rfiles += linkunlink; + } + } + } + + // rstat + if (!primary_dn) { + // don't update parent this pass + } else if (!linkunlink && !(pin->nestlock.can_wrlock(-1) && + pin->versionlock.can_wrlock())) { + dout(20) << " unwritable parent nestlock " << pin->nestlock + << ", marking dirty rstat on " << *cur << dendl; + cur->mark_dirty_rstat(); + } else { + // if we don't hold a wrlock reference on this nestlock, take one, + // because we are about to write into the dirfrag fnode and that needs + // to commit before the lock can cycle. 
+ if (linkunlink) { + ceph_assert(pin->nestlock.get_num_wrlocks() || mut->is_slave()); + } + + if (!mut->is_wrlocked(&pin->nestlock)) { + dout(10) << " taking wrlock on " << pin->nestlock << " on " << *pin << dendl; + mds->locker->wrlock_force(&pin->nestlock, mut); + } + + // now we can project the inode rstat diff the dirfrag + SnapRealm *prealm = pin->find_snaprealm(); + + snapid_t follows = cfollows; + if (follows == CEPH_NOSNAP) + follows = prealm->get_newest_seq(); + + snapid_t first = follows+1; + + // first, if the frag is stale, bring it back in sync. + parent->resync_accounted_rstat(); + + // now push inode rstats into frag + project_rstat_inode_to_frag(cur, parent, first, linkunlink, prealm); + cur->clear_dirty_rstat(); + } + + bool stop = false; + if (!pin->is_auth() || (!mut->is_auth_pinned(pin) && !pin->can_auth_pin())) { + dout(10) << "predirty_journal_parents !auth or ambig or can't authpin on " << *pin << dendl; + stop = true; + } + + // delay propagating until later? + if (!stop && !first && + g_conf()->mds_dirstat_min_interval > 0) { + double since_last_prop = mut->get_mds_stamp() - pin->last_dirstat_prop; + if (since_last_prop < g_conf()->mds_dirstat_min_interval) { + dout(10) << "predirty_journal_parents last prop " << since_last_prop + << " < " << g_conf()->mds_dirstat_min_interval + << ", stopping" << dendl; + stop = true; + } else { + dout(10) << "predirty_journal_parents last prop " << since_last_prop << " ago, continuing" << dendl; + } + } + + // can cast only because i'm passing nowait=true in the sole user + MDRequestRef mdmut = static_cast<MDRequestImpl*>(mut.get()); + if (!stop && + !mut->is_wrlocked(&pin->nestlock) && + (!pin->versionlock.can_wrlock() || // make sure we can take versionlock, too + //true + !mds->locker->wrlock_start(&pin->nestlock, mdmut, true) + )) { // ** do not initiate.. 
see above comment ** + dout(10) << "predirty_journal_parents can't wrlock one of " << pin->versionlock << " or " << pin->nestlock + << " on " << *pin << dendl; + stop = true; + } + if (stop) { + dout(10) << "predirty_journal_parents stop. marking nestlock on " << *pin << dendl; + mds->locker->mark_updated_scatterlock(&pin->nestlock); + mut->ls->dirty_dirfrag_nest.push_back(&pin->item_dirty_dirfrag_nest); + mut->add_updated_lock(&pin->nestlock); + if (do_parent_mtime || linkunlink) { + mds->locker->mark_updated_scatterlock(&pin->filelock); + mut->ls->dirty_dirfrag_dir.push_back(&pin->item_dirty_dirfrag_dir); + mut->add_updated_lock(&pin->filelock); + } + break; + } + if (!mut->is_wrlocked(&pin->versionlock)) + mds->locker->local_wrlock_grab(&pin->versionlock, mut); + + ceph_assert(mut->is_wrlocked(&pin->nestlock) || mut->is_slave()); + + pin->last_dirstat_prop = mut->get_mds_stamp(); + + // dirfrag -> diri + mut->auth_pin(pin); + mut->add_projected_inode(pin); + lsi.push_front(pin); + + pin->pre_cow_old_inode(); // avoid cow mayhem! 
+ + auto &pi = pin->project_inode(); + pi.inode.version = pin->pre_dirty(); + + // dirstat + if (do_parent_mtime || linkunlink) { + dout(20) << "predirty_journal_parents add_delta " << pf->fragstat << dendl; + dout(20) << "predirty_journal_parents - " << pf->accounted_fragstat << dendl; + bool touched_mtime = false, touched_chattr = false; + pi.inode.dirstat.add_delta(pf->fragstat, pf->accounted_fragstat, &touched_mtime, &touched_chattr); + pf->accounted_fragstat = pf->fragstat; + if (touched_mtime) + pi.inode.mtime = pi.inode.ctime = pi.inode.dirstat.mtime; + if (touched_chattr) + pi.inode.change_attr = pi.inode.dirstat.change_attr; + dout(20) << "predirty_journal_parents gives " << pi.inode.dirstat << " on " << *pin << dendl; + + if (parent->get_frag() == frag_t()) { // i.e., we are the only frag + if (pi.inode.dirstat.size() < 0) + ceph_assert(!"negative dirstat size" == g_conf()->mds_verify_scatter); + if (pi.inode.dirstat.size() != pf->fragstat.size()) { + mds->clog->error() << "unmatched fragstat size on single dirfrag " + << parent->dirfrag() << ", inode has " << pi.inode.dirstat + << ", dirfrag has " << pf->fragstat; + + // trust the dirfrag for now + pi.inode.dirstat = pf->fragstat; + + ceph_assert(!"unmatched fragstat size" == g_conf()->mds_verify_scatter); + } + } + } + + /* + * the rule here is to follow the _oldest_ parent with dirty rstat + * data. if we don't propagate all data, we add ourselves to the + * nudge list. that way all rstat data will (eventually) get + * pushed up the tree. + * + * actually, no. for now, silently drop rstats for old parents. we need + * hard link backpointers to do the above properly. + */ + + // stop? + if (pin->is_base()) + break; + parentdn = pin->get_projected_parent_dn(); + ceph_assert(parentdn); + + // rstat + dout(10) << "predirty_journal_parents frag->inode on " << *parent << dendl; + + // first, if the frag is stale, bring it back in sync. 
+ parent->resync_accounted_rstat(); + + if (g_conf()->mds_snap_rstat) { + for (auto &p : parent->dirty_old_rstat) { + project_rstat_frag_to_inode(p.second.rstat, p.second.accounted_rstat, p.second.first, + p.first, pin, true); + } + } + parent->dirty_old_rstat.clear(); + project_rstat_frag_to_inode(pf->rstat, pf->accounted_rstat, parent->first, CEPH_NOSNAP, pin, true);//false); + + pf->accounted_rstat = pf->rstat; + + if (parent->get_frag() == frag_t()) { // i.e., we are the only frag + if (pi.inode.rstat.rbytes != pf->rstat.rbytes) { + mds->clog->error() << "unmatched rstat rbytes on single dirfrag " + << parent->dirfrag() << ", inode has " << pi.inode.rstat + << ", dirfrag has " << pf->rstat; + + // trust the dirfrag for now + pi.inode.rstat = pf->rstat; + + ceph_assert(!"unmatched rstat rbytes" == g_conf()->mds_verify_scatter); + } + } + + parent->check_rstats(); + broadcast_quota_to_client(pin); + // next parent! + cur = pin; + parent = parentdn->get_dir(); + linkunlink = 0; + do_parent_mtime = false; + primary_dn = true; + first = false; + } + + // now, stick it in the blob + ceph_assert(parent); + ceph_assert(parent->is_auth()); + blob->add_dir_context(parent); + blob->add_dir(parent, true); + for (list<CInode*>::iterator p = lsi.begin(); + p != lsi.end(); + ++p) { + CInode *cur = *p; + journal_dirty_inode(mut.get(), blob, cur); + } + +} + + + + + +// =================================== +// slave requests + + +/* + * some handlers for master requests with slaves. we need to make + * sure slaves journal commits before we forget we mastered them and + * remove them from the uncommitted_masters map (used during recovery + * to commit|abort slaves). 
+ */ +struct C_MDC_CommittedMaster : public MDCacheLogContext { + metareqid_t reqid; + C_MDC_CommittedMaster(MDCache *s, metareqid_t r) : MDCacheLogContext(s), reqid(r) {} + void finish(int r) override { + mdcache->_logged_master_commit(reqid); + } +}; + +void MDCache::log_master_commit(metareqid_t reqid) +{ + dout(10) << "log_master_commit " << reqid << dendl; + uncommitted_masters[reqid].committing = true; + mds->mdlog->start_submit_entry(new ECommitted(reqid), + new C_MDC_CommittedMaster(this, reqid)); +} + +void MDCache::_logged_master_commit(metareqid_t reqid) +{ + dout(10) << "_logged_master_commit " << reqid << dendl; + ceph_assert(uncommitted_masters.count(reqid)); + uncommitted_masters[reqid].ls->uncommitted_masters.erase(reqid); + mds->queue_waiters(uncommitted_masters[reqid].waiters); + uncommitted_masters.erase(reqid); +} + +// while active... + +void MDCache::committed_master_slave(metareqid_t r, mds_rank_t from) +{ + dout(10) << "committed_master_slave mds." << from << " on " << r << dendl; + ceph_assert(uncommitted_masters.count(r)); + uncommitted_masters[r].slaves.erase(from); + if (!uncommitted_masters[r].recovering && uncommitted_masters[r].slaves.empty()) + log_master_commit(r); +} + +void MDCache::logged_master_update(metareqid_t reqid) +{ + dout(10) << "logged_master_update " << reqid << dendl; + ceph_assert(uncommitted_masters.count(reqid)); + uncommitted_masters[reqid].safe = true; + auto p = pending_masters.find(reqid); + if (p != pending_masters.end()) { + pending_masters.erase(p); + if (pending_masters.empty()) + process_delayed_resolve(); + } +} + +/* + * Master may crash after receiving all slaves' commit acks, but before journalling + * the final commit. Slaves may crash after journalling the slave commit, but before + * sending commit ack to the master. Commit masters with no uncommitted slave when + * resolve finishes. 
+ */ +void MDCache::finish_committed_masters() +{ + for (map<metareqid_t, umaster>::iterator p = uncommitted_masters.begin(); + p != uncommitted_masters.end(); + ++p) { + p->second.recovering = false; + if (!p->second.committing && p->second.slaves.empty()) { + dout(10) << "finish_committed_masters " << p->first << dendl; + log_master_commit(p->first); + } + } +} + +/* + * at end of resolve... we must journal a commit|abort for all slave + * updates, before moving on. + * + * this is so that the master can safely journal ECommitted on ops it + * masters when it reaches up:active (all other recovering nodes must + * complete resolve before that happens). + */ +struct C_MDC_SlaveCommit : public MDCacheLogContext { + mds_rank_t from; + metareqid_t reqid; + C_MDC_SlaveCommit(MDCache *c, int f, metareqid_t r) : MDCacheLogContext(c), from(f), reqid(r) {} + void finish(int r) override { + mdcache->_logged_slave_commit(from, reqid); + } +}; + +void MDCache::_logged_slave_commit(mds_rank_t from, metareqid_t reqid) +{ + dout(10) << "_logged_slave_commit from mds." 
<< from << " " << reqid << dendl; + + // send a message + auto req = MMDSSlaveRequest::create(reqid, 0, MMDSSlaveRequest::OP_COMMITTED); + mds->send_message_mds(req, from); +} + + + + + + +// ==================================================================== +// import map, recovery + +void MDCache::_move_subtree_map_bound(dirfrag_t df, dirfrag_t oldparent, dirfrag_t newparent, + map<dirfrag_t,vector<dirfrag_t> >& subtrees) +{ + if (subtrees.count(oldparent)) { + vector<dirfrag_t>& v = subtrees[oldparent]; + dout(10) << " removing " << df << " from " << oldparent << " bounds " << v << dendl; + for (vector<dirfrag_t>::iterator it = v.begin(); it != v.end(); ++it) + if (*it == df) { + v.erase(it); + break; + } + } + if (subtrees.count(newparent)) { + vector<dirfrag_t>& v = subtrees[newparent]; + dout(10) << " adding " << df << " to " << newparent << " bounds " << v << dendl; + v.push_back(df); + } +} + +ESubtreeMap *MDCache::create_subtree_map() +{ + dout(10) << "create_subtree_map " << num_subtrees() << " subtrees, " + << num_subtrees_fullauth() << " fullauth" + << dendl; + + show_subtrees(); + + ESubtreeMap *le = new ESubtreeMap(); + mds->mdlog->_start_entry(le); + + map<dirfrag_t, CDir*> dirs_to_add; + + if (myin) { + CDir* mydir = myin->get_dirfrag(frag_t()); + dirs_to_add[mydir->dirfrag()] = mydir; + } + + // include all auth subtrees, and their bounds. + // and a spanning tree to tie it to the root. + for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin(); + p != subtrees.end(); + ++p) { + CDir *dir = p->first; + + // journal subtree as "ours" if we are + // me, -2 + // me, me + // me, !me (may be importing and ambiguous!) 
+ + // so not + // !me, * + if (dir->get_dir_auth().first != mds->get_nodeid()) + continue; + + if (migrator->is_ambiguous_import(dir->dirfrag()) || + my_ambiguous_imports.count(dir->dirfrag())) { + dout(15) << " ambig subtree " << *dir << dendl; + le->ambiguous_subtrees.insert(dir->dirfrag()); + } else { + dout(15) << " subtree " << *dir << dendl; + } + + dirs_to_add[dir->dirfrag()] = dir; + le->subtrees[dir->dirfrag()].clear(); + + + // bounds + for (set<CDir*>::iterator q = p->second.begin(); + q != p->second.end(); + ++q) { + CDir *bound = *q; + dout(15) << " subtree bound " << *bound << dendl; + dirs_to_add[bound->dirfrag()] = bound; + le->subtrees[dir->dirfrag()].push_back(bound->dirfrag()); + } + } + + // apply projected renames + for (map<CInode*,list<pair<CDir*,CDir*> > >::iterator p = projected_subtree_renames.begin(); + p != projected_subtree_renames.end(); + ++p) { + for (list<pair<CDir*,CDir*> >::iterator q = p->second.begin(); q != p->second.end(); ++q) { + CInode *diri = p->first; + CDir *olddir = q->first; + CDir *newdir = q->second; + dout(10) << " adjusting for projected rename of " << *diri << " to " << *newdir << dendl; + + list<CDir*> dfls; + diri->get_dirfrags(dfls); + for (list<CDir*>::iterator p = dfls.begin(); p != dfls.end(); ++p) { + CDir *dir = *p; + dout(10) << "dirfrag " << dir->dirfrag() << " " << *dir << dendl; + CDir *oldparent = get_projected_subtree_root(olddir); + dout(10) << " old parent " << oldparent->dirfrag() << " " << *oldparent << dendl; + CDir *newparent = get_projected_subtree_root(newdir); + dout(10) << " new parent " << newparent->dirfrag() << " " << *newparent << dendl; + + if (oldparent == newparent) { + dout(10) << "parent unchanged for " << dir->dirfrag() << " at " + << oldparent->dirfrag() << dendl; + continue; + } + + if (dir->is_subtree_root()) { + if (le->subtrees.count(newparent->dirfrag()) && + oldparent->get_dir_auth() != newparent->get_dir_auth()) + dirs_to_add[dir->dirfrag()] = dir; + // children are fine. 
change parent. + _move_subtree_map_bound(dir->dirfrag(), oldparent->dirfrag(), newparent->dirfrag(), + le->subtrees); + } else { + // mid-subtree. + + if (oldparent->get_dir_auth() != newparent->get_dir_auth()) { + dout(10) << " creating subtree for " << dir->dirfrag() << dendl; + // if oldparent is auth, subtree is mine; include it. + if (le->subtrees.count(oldparent->dirfrag())) { + dirs_to_add[dir->dirfrag()] = dir; + le->subtrees[dir->dirfrag()].clear(); + } + // if newparent is auth, subtree is a new bound + if (le->subtrees.count(newparent->dirfrag())) { + dirs_to_add[dir->dirfrag()] = dir; + le->subtrees[newparent->dirfrag()].push_back(dir->dirfrag()); // newparent is auth; new bound + } + newparent = dir; + } + + // see if any old bounds move to the new parent. + for (set<CDir*>::iterator p = subtrees[oldparent].begin(); + p != subtrees[oldparent].end(); + ++p) { + CDir *bound = *p; + if (dir->contains(bound->get_parent_dir())) + _move_subtree_map_bound(bound->dirfrag(), oldparent->dirfrag(), newparent->dirfrag(), + le->subtrees); + } + } + } + } + } + + // simplify the journaled map. our in memory map may have more + // subtrees than needed due to migrations that are just getting + // started or just completing. but on replay, the "live" map will + // be simple and we can do a straight comparison. 
+ for (map<dirfrag_t, vector<dirfrag_t> >::iterator p = le->subtrees.begin(); p != le->subtrees.end(); ++p) { + if (le->ambiguous_subtrees.count(p->first)) + continue; + unsigned i = 0; + while (i < p->second.size()) { + dirfrag_t b = p->second[i]; + if (le->subtrees.count(b) && + le->ambiguous_subtrees.count(b) == 0) { + vector<dirfrag_t>& bb = le->subtrees[b]; + dout(10) << "simplify: " << p->first << " swallowing " << b << " with bounds " << bb << dendl; + for (vector<dirfrag_t>::iterator r = bb.begin(); r != bb.end(); ++r) + p->second.push_back(*r); + dirs_to_add.erase(b); + le->subtrees.erase(b); + p->second.erase(p->second.begin() + i); + } else { + ++i; + } + } + } + + for (auto &p : dirs_to_add) { + CDir *dir = p.second; + le->metablob.add_dir_context(dir, EMetaBlob::TO_ROOT); + le->metablob.add_dir(dir, false); + } + + dout(15) << " subtrees " << le->subtrees << dendl; + dout(15) << " ambiguous_subtrees " << le->ambiguous_subtrees << dendl; + + //le->metablob.print(cout); + le->expire_pos = mds->mdlog->journaler->get_expire_pos(); + return le; +} + +void MDCache::dump_resolve_status(Formatter *f) const +{ + f->open_object_section("resolve_status"); + f->dump_stream("resolve_gather") << resolve_gather; + f->dump_stream("resolve_ack_gather") << resolve_gather; + f->close_section(); +} + +void MDCache::resolve_start(MDSContext *resolve_done_) +{ + dout(10) << "resolve_start" << dendl; + ceph_assert(!resolve_done); + resolve_done.reset(resolve_done_); + + if (mds->mdsmap->get_root() != mds->get_nodeid()) { + // if we don't have the root dir, adjust it to UNKNOWN. during + // resolve we want mds0 to explicit claim the portion of it that + // it owns, so that anything beyond its bounds get left as + // unknown. 
+ CDir *rootdir = root->get_dirfrag(frag_t()); + if (rootdir) + adjust_subtree_auth(rootdir, CDIR_AUTH_UNKNOWN); + } + resolve_gather = recovery_set; + + resolve_snapclient_commits = mds->snapclient->get_journaled_tids(); +} + +void MDCache::send_resolves() +{ + send_slave_resolves(); + + if (!resolve_done) { + // I'm survivor: refresh snap cache + mds->snapclient->sync( + new MDSInternalContextWrapper(mds, + new FunctionContext([this](int r) { + maybe_finish_slave_resolve(); + }) + ) + ); + dout(10) << "send_resolves waiting for snapclient cache to sync" << dendl; + return; + } + if (!resolve_ack_gather.empty()) { + dout(10) << "send_resolves still waiting for resolve ack from (" + << resolve_ack_gather << ")" << dendl; + return; + } + if (!resolve_need_rollback.empty()) { + dout(10) << "send_resolves still waiting for rollback to commit on (" + << resolve_need_rollback << ")" << dendl; + return; + } + + send_subtree_resolves(); +} + +void MDCache::send_slave_resolves() +{ + dout(10) << "send_slave_resolves" << dendl; + + map<mds_rank_t, MMDSResolve::ref> resolves; + + if (mds->is_resolve()) { + for (map<metareqid_t, uslave>::iterator p = uncommitted_slaves.begin(); + p != uncommitted_slaves.end(); + ++p) { + mds_rank_t master = p->second.master; + auto &m = resolves[master]; + if (!m) m = MMDSResolve::create(); + m->add_slave_request(p->first, false); + } + } else { + set<mds_rank_t> resolve_set; + mds->mdsmap->get_mds_set(resolve_set, MDSMap::STATE_RESOLVE); + for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin(); + p != active_requests.end(); + ++p) { + MDRequestRef& mdr = p->second; + if (!mdr->is_slave()) + continue; + if (!mdr->slave_did_prepare() && !mdr->committing) { + continue; + } + mds_rank_t master = mdr->slave_to_mds; + if (resolve_set.count(master) || is_ambiguous_slave_update(p->first, master)) { + dout(10) << " including uncommitted " << *mdr << dendl; + if (!resolves.count(master)) + resolves[master] = 
MMDSResolve::create(); + if (!mdr->committing && + mdr->has_more() && mdr->more()->is_inode_exporter) { + // re-send cap exports + CInode *in = mdr->more()->rename_inode; + map<client_t, Capability::Export> cap_map; + in->export_client_caps(cap_map); + bufferlist bl; + encode(in->ino(), bl); + encode(cap_map, bl); + resolves[master]->add_slave_request(p->first, bl); + } else { + resolves[master]->add_slave_request(p->first, mdr->committing); + } + } + } + } + + for (auto &p : resolves) { + dout(10) << "sending slave resolve to mds." << p.first << dendl; + mds->send_message_mds(p.second, p.first); + resolve_ack_gather.insert(p.first); + } +} + +void MDCache::send_subtree_resolves() +{ + dout(10) << "send_subtree_resolves" << dendl; + + if (migrator->is_exporting() || migrator->is_importing()) { + dout(7) << "send_subtree_resolves waiting, imports/exports still in progress" << dendl; + migrator->show_importing(); + migrator->show_exporting(); + resolves_pending = true; + return; // not now + } + + map<mds_rank_t, MMDSResolve::ref> resolves; + for (set<mds_rank_t>::iterator p = recovery_set.begin(); + p != recovery_set.end(); + ++p) { + if (*p == mds->get_nodeid()) + continue; + if (mds->is_resolve() || mds->mdsmap->is_resolve(*p)) + resolves[*p] = MMDSResolve::create(); + } + + map<dirfrag_t, vector<dirfrag_t> > my_subtrees; + map<dirfrag_t, vector<dirfrag_t> > my_ambig_imports; + + // known + for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin(); + p != subtrees.end(); + ++p) { + CDir *dir = p->first; + + // only our subtrees + if (dir->authority().first != mds->get_nodeid()) + continue; + + if (mds->is_resolve() && my_ambiguous_imports.count(dir->dirfrag())) + continue; // we'll add it below + + if (migrator->is_ambiguous_import(dir->dirfrag())) { + // ambiguous (mid-import) + set<CDir*> bounds; + get_subtree_bounds(dir, bounds); + vector<dirfrag_t> dfls; + for (set<CDir*>::iterator q = bounds.begin(); q != bounds.end(); ++q) + dfls.push_back((*q)->dirfrag()); 
+ + my_ambig_imports[dir->dirfrag()] = dfls; + dout(10) << " ambig " << dir->dirfrag() << " " << dfls << dendl; + } else { + // not ambiguous. + for (auto &q : resolves) { + resolves[q.first]->add_subtree(dir->dirfrag()); + } + // bounds too + vector<dirfrag_t> dfls; + for (set<CDir*>::iterator q = subtrees[dir].begin(); + q != subtrees[dir].end(); + ++q) { + CDir *bound = *q; + dfls.push_back(bound->dirfrag()); + } + + my_subtrees[dir->dirfrag()] = dfls; + dout(10) << " claim " << dir->dirfrag() << " " << dfls << dendl; + } + } + + // ambiguous + for (map<dirfrag_t, vector<dirfrag_t> >::iterator p = my_ambiguous_imports.begin(); + p != my_ambiguous_imports.end(); + ++p) { + my_ambig_imports[p->first] = p->second; + dout(10) << " ambig " << p->first << " " << p->second << dendl; + } + + // simplify the claimed subtree. + for (auto p = my_subtrees.begin(); p != my_subtrees.end(); ++p) { + unsigned i = 0; + while (i < p->second.size()) { + dirfrag_t b = p->second[i]; + if (my_subtrees.count(b)) { + vector<dirfrag_t>& bb = my_subtrees[b]; + dout(10) << " simplify: " << p->first << " swallowing " << b << " with bounds " << bb << dendl; + for (vector<dirfrag_t>::iterator r = bb.begin(); r != bb.end(); ++r) + p->second.push_back(*r); + my_subtrees.erase(b); + p->second.erase(p->second.begin() + i); + } else { + ++i; + } + } + } + + // send + for (auto &p : resolves) { + const MMDSResolve::ref &m = p.second; + if (mds->is_resolve()) { + m->add_table_commits(TABLE_SNAP, resolve_snapclient_commits); + } else { + m->add_table_commits(TABLE_SNAP, mds->snapclient->get_journaled_tids()); + } + m->subtrees = my_subtrees; + m->ambiguous_imports = my_ambig_imports; + dout(10) << "sending subtee resolve to mds." 
<< p.first << dendl; + mds->send_message_mds(m, p.first); + } + resolves_pending = false; +} + +void MDCache::maybe_finish_slave_resolve() { + if (resolve_ack_gather.empty() && resolve_need_rollback.empty()) { + // snap cache get synced or I'm in resolve state + if (mds->snapclient->is_synced() || resolve_done) + send_subtree_resolves(); + process_delayed_resolve(); + } +} + +void MDCache::handle_mds_failure(mds_rank_t who) +{ + dout(7) << "handle_mds_failure mds." << who << dendl; + + dout(1) << "handle_mds_failure mds." << who << " : recovery peers are " << recovery_set << dendl; + + resolve_gather.insert(who); + discard_delayed_resolve(who); + ambiguous_slave_updates.erase(who); + + rejoin_gather.insert(who); + rejoin_sent.erase(who); // i need to send another + rejoin_ack_sent.erase(who); // i need to send another + rejoin_ack_gather.erase(who); // i'll need/get another. + + dout(10) << " resolve_gather " << resolve_gather << dendl; + dout(10) << " resolve_ack_gather " << resolve_ack_gather << dendl; + dout(10) << " rejoin_sent " << rejoin_sent << dendl; + dout(10) << " rejoin_gather " << rejoin_gather << dendl; + dout(10) << " rejoin_ack_gather " << rejoin_ack_gather << dendl; + + + // tell the migrator too. + migrator->handle_mds_failure_or_stop(who); + + // tell the balancer too. + mds->balancer->handle_mds_failure(who); + + // clean up any requests slave to/from this node + list<MDRequestRef> finish; + for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin(); + p != active_requests.end(); + ++p) { + MDRequestRef& mdr = p->second; + // slave to the failed node? 
+ if (mdr->slave_to_mds == who) { + if (mdr->slave_did_prepare()) { + dout(10) << " slave request " << *mdr << " uncommitted, will resolve shortly" << dendl; + if (is_ambiguous_slave_update(p->first, mdr->slave_to_mds)) + remove_ambiguous_slave_update(p->first, mdr->slave_to_mds); + + if (!mdr->more()->waiting_on_slave.empty()) { + ceph_assert(mdr->more()->srcdn_auth_mds == mds->get_nodeid()); + // will rollback, no need to wait + mdr->reset_slave_request(); + mdr->more()->waiting_on_slave.clear(); + } + } else if (!mdr->committing) { + dout(10) << " slave request " << *mdr << " has no prepare, finishing up" << dendl; + if (mdr->slave_request || mdr->slave_rolling_back()) + mdr->aborted = true; + else + finish.push_back(mdr); + } + } + + if (mdr->is_slave() && mdr->slave_did_prepare()) { + if (mdr->more()->waiting_on_slave.count(who)) { + ceph_assert(mdr->more()->srcdn_auth_mds == mds->get_nodeid()); + dout(10) << " slave request " << *mdr << " no longer need rename notity ack from mds." + << who << dendl; + mdr->more()->waiting_on_slave.erase(who); + if (mdr->more()->waiting_on_slave.empty() && mdr->slave_request) + mds->queue_waiter(new C_MDS_RetryRequest(this, mdr)); + } + + if (mdr->more()->srcdn_auth_mds == who && + mds->mdsmap->is_clientreplay_or_active_or_stopping(mdr->slave_to_mds)) { + // rename srcdn's auth mds failed, resolve even I'm a survivor. + dout(10) << " slave request " << *mdr << " uncommitted, will resolve shortly" << dendl; + add_ambiguous_slave_update(p->first, mdr->slave_to_mds); + } + } else if (mdr->slave_request) { + const MMDSSlaveRequest::const_ref &slave_req = mdr->slave_request; + // FIXME: Slave rename request can arrive after we notice mds failure. + // This can cause mds to crash (does not affect integrity of FS). + if (slave_req->get_op() == MMDSSlaveRequest::OP_RENAMEPREP && + slave_req->srcdn_auth == who) + slave_req->mark_interrupted(); + } + + // failed node is slave? 
+ if (mdr->is_master() && !mdr->committing) { + if (mdr->more()->srcdn_auth_mds == who) { + dout(10) << " master request " << *mdr << " waiting for rename srcdn's auth mds." + << who << " to recover" << dendl; + ceph_assert(mdr->more()->witnessed.count(who) == 0); + if (mdr->more()->is_ambiguous_auth) + mdr->clear_ambiguous_auth(); + // rename srcdn's auth mds failed, all witnesses will rollback + mdr->more()->witnessed.clear(); + pending_masters.erase(p->first); + } + + if (mdr->more()->witnessed.count(who)) { + mds_rank_t srcdn_auth = mdr->more()->srcdn_auth_mds; + if (srcdn_auth >= 0 && mdr->more()->waiting_on_slave.count(srcdn_auth)) { + dout(10) << " master request " << *mdr << " waiting for rename srcdn's auth mds." + << mdr->more()->srcdn_auth_mds << " to reply" << dendl; + // waiting for the slave (rename srcdn's auth mds), delay sending resolve ack + // until either the request is committing or the slave also fails. + ceph_assert(mdr->more()->waiting_on_slave.size() == 1); + pending_masters.insert(p->first); + } else { + dout(10) << " master request " << *mdr << " no longer witnessed by slave mds." + << who << " to recover" << dendl; + if (srcdn_auth >= 0) + ceph_assert(mdr->more()->witnessed.count(srcdn_auth) == 0); + + // discard this peer's prepare (if any) + mdr->more()->witnessed.erase(who); + } + } + + if (mdr->more()->waiting_on_slave.count(who)) { + dout(10) << " master request " << *mdr << " waiting for slave mds." 
<< who + << " to recover" << dendl; + // retry request when peer recovers + mdr->more()->waiting_on_slave.erase(who); + if (mdr->more()->waiting_on_slave.empty()) + mds->wait_for_active_peer(who, new C_MDS_RetryRequest(this, mdr)); + } + + if (mdr->locking && mdr->locking_target_mds == who) + mdr->finish_locking(mdr->locking); + } + } + + for (map<metareqid_t, umaster>::iterator p = uncommitted_masters.begin(); + p != uncommitted_masters.end(); + ++p) { + // The failed MDS may have already committed the slave update + if (p->second.slaves.count(who)) { + p->second.recovering = true; + p->second.slaves.erase(who); + } + } + + while (!finish.empty()) { + dout(10) << "cleaning up slave request " << *finish.front() << dendl; + request_finish(finish.front()); + finish.pop_front(); + } + + kick_find_ino_peers(who); + kick_open_ino_peers(who); + + for (map<dirfrag_t,fragment_info_t>::iterator p = fragments.begin(); + p != fragments.end(); ) { + dirfrag_t df = p->first; + fragment_info_t& info = p->second; + + if (info.is_fragmenting()) { + if (info.notify_ack_waiting.erase(who) && + info.notify_ack_waiting.empty()) { + fragment_drop_locks(info); + fragment_maybe_finish(p++); + } else { + ++p; + } + continue; + } + + ++p; + dout(10) << "cancelling fragment " << df << " bit " << info.bits << dendl; + list<CDir*> dirs; + info.dirs.swap(dirs); + fragments.erase(df); + fragment_unmark_unfreeze_dirs(dirs); + } + + // MDCache::shutdown_export_strays() always exports strays to mds.0 + if (who == mds_rank_t(0)) + shutdown_exporting_strays.clear(); + + show_subtrees(); +} + +/* + * handle_mds_recovery - called on another node's transition + * from resolve -> active. + */ +void MDCache::handle_mds_recovery(mds_rank_t who) +{ + dout(7) << "handle_mds_recovery mds." << who << dendl; + + // exclude all discover waiters. 
kick_discovers() will do the job + static const uint64_t i_mask = CInode::WAIT_ANY_MASK & ~CInode::WAIT_DIR; + static const uint64_t d_mask = CDir::WAIT_ANY_MASK & ~CDir::WAIT_DENTRY; + + MDSContext::vec waiters; + + // wake up any waiters in their subtrees + for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin(); + p != subtrees.end(); + ++p) { + CDir *dir = p->first; + + if (dir->authority().first != who || + dir->authority().second == mds->get_nodeid()) + continue; + ceph_assert(!dir->is_auth()); + + // wake any waiters + list<CDir*> q; + q.push_back(dir); + + while (!q.empty()) { + CDir *d = q.front(); + q.pop_front(); + d->take_waiting(d_mask, waiters); + + // inode waiters too + for (auto &p : d->items) { + CDentry *dn = p.second; + CDentry::linkage_t *dnl = dn->get_linkage(); + if (dnl->is_primary()) { + dnl->get_inode()->take_waiting(i_mask, waiters); + + // recurse? + list<CDir*> ls; + dnl->get_inode()->get_dirfrags(ls); + for (list<CDir*>::iterator p = ls.begin(); + p != ls.end(); + ++p) { + CDir *subdir = *p; + if (!subdir->is_subtree_root()) + q.push_back(subdir); + } + } + } + } + } + + kick_open_ino_peers(who); + kick_find_ino_peers(who); + + // queue them up. + mds->queue_waiters(waiters); +} + +void MDCache::set_recovery_set(set<mds_rank_t>& s) +{ + dout(7) << "set_recovery_set " << s << dendl; + recovery_set = s; +} + + +/* + * during resolve state, we share resolves to determine who + * is authoritative for which trees. we expect to get an resolve + * from _everyone_ in the recovery_set (the mds cluster at the time of + * the first failure). 
+ * + * This functions puts the passed message before returning + */ +void MDCache::handle_resolve(const MMDSResolve::const_ref &m) +{ + dout(7) << "handle_resolve from " << m->get_source() << dendl; + mds_rank_t from = mds_rank_t(m->get_source().num()); + + if (mds->get_state() < MDSMap::STATE_RESOLVE) { + if (mds->get_want_state() == CEPH_MDS_STATE_RESOLVE) { + mds->wait_for_resolve(new C_MDS_RetryMessage(mds, m)); + return; + } + // wait until we reach the resolve stage! + return; + } + + discard_delayed_resolve(from); + + // ambiguous slave requests? + if (!m->slave_requests.empty()) { + if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping()) { + for (auto p = m->slave_requests.begin(); p != m->slave_requests.end(); ++p) { + if (uncommitted_masters.count(p->first) && !uncommitted_masters[p->first].safe) { + ceph_assert(!p->second.committing); + pending_masters.insert(p->first); + } + } + + if (!pending_masters.empty()) { + dout(10) << " still have pending updates, delay processing slave resolve" << dendl; + delayed_resolve[from] = m; + return; + } + } + + auto ack = MMDSResolveAck::create(); + for (const auto &p : m->slave_requests) { + if (uncommitted_masters.count(p.first)) { //mds->sessionmap.have_completed_request(p.first)) { + // COMMIT + if (p.second.committing) { + // already committing, waiting for the OP_COMMITTED slave reply + dout(10) << " already committing slave request " << p << " noop "<< dendl; + } else { + dout(10) << " ambiguous slave request " << p << " will COMMIT" << dendl; + ack->add_commit(p.first); + } + uncommitted_masters[p.first].slaves.insert(from); // wait for slave OP_COMMITTED before we log ECommitted + + if (p.second.inode_caps.length() > 0) { + // slave wants to export caps (rename) + ceph_assert(mds->is_resolve()); + + inodeno_t ino; + map<client_t,Capability::Export> cap_exports; + auto q = p.second.inode_caps.cbegin(); + decode(ino, q); + decode(cap_exports, q); + + ceph_assert(get_inode(ino)); + + for 
(map<client_t,Capability::Export>::iterator q = cap_exports.begin(); + q != cap_exports.end(); + ++q) { + Capability::Import& im = rejoin_imported_caps[from][ino][q->first]; + im.cap_id = ++last_cap_id; // assign a new cap ID + im.issue_seq = 1; + im.mseq = q->second.mseq; + + Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v)); + if (session) + rejoin_client_map.emplace(q->first, session->info.inst); + } + + // will process these caps in rejoin stage + rejoin_slave_exports[ino].first = from; + rejoin_slave_exports[ino].second.swap(cap_exports); + + // send information of imported caps back to slave + encode(rejoin_imported_caps[from][ino], ack->commit[p.first]); + } + } else { + // ABORT + dout(10) << " ambiguous slave request " << p << " will ABORT" << dendl; + ceph_assert(!p.second.committing); + ack->add_abort(p.first); + } + } + mds->send_message(ack, m->get_connection()); + return; + } + + if (!resolve_ack_gather.empty() || !resolve_need_rollback.empty()) { + dout(10) << "delay processing subtree resolve" << dendl; + delayed_resolve[from] = m; + return; + } + + bool survivor = false; + // am i a surviving ambiguous importer? 
+ if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping()) { + survivor = true; + // check for any import success/failure (from this node) + map<dirfrag_t, vector<dirfrag_t> >::iterator p = my_ambiguous_imports.begin(); + while (p != my_ambiguous_imports.end()) { + map<dirfrag_t, vector<dirfrag_t> >::iterator next = p; + ++next; + CDir *dir = get_dirfrag(p->first); + ceph_assert(dir); + dout(10) << "checking ambiguous import " << *dir << dendl; + if (migrator->is_importing(dir->dirfrag()) && + migrator->get_import_peer(dir->dirfrag()) == from) { + ceph_assert(migrator->get_import_state(dir->dirfrag()) == Migrator::IMPORT_ACKING); + + // check if sender claims the subtree + bool claimed_by_sender = false; + for (const auto &q : m->subtrees) { + // an ambiguous import won't race with a refragmentation; it's appropriate to force here. + CDir *base = get_force_dirfrag(q.first, false); + if (!base || !base->contains(dir)) + continue; // base not dir or an ancestor of dir, clearly doesn't claim dir. + + bool inside = true; + set<CDir*> bounds; + get_force_dirfrag_bound_set(q.second, bounds); + for (set<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p) { + CDir *bound = *p; + if (bound->contains(dir)) { + inside = false; // nope, bound is dir or parent of dir, not inside. + break; + } + } + if (inside) + claimed_by_sender = true; + } + + my_ambiguous_imports.erase(p); // no longer ambiguous. 
+ if (claimed_by_sender) {
+ // sender claims this subtree too: our import raced with its export
+ // and lost, so undo it.
+ dout(7) << "ambiguous import failed on " << *dir << dendl;
+ migrator->import_reverse(dir);
+ } else {
+ dout(7) << "ambiguous import succeeded on " << *dir << dendl;
+ migrator->import_finish(dir, true);
+ }
+ }
+ p = next;
+ }
+ }
+
+ // update my dir_auth values
+ // need to do this on recovering nodes _and_ bystanders (to resolve ambiguous
+ // migrations between other nodes)
+ for (const auto& p : m->subtrees) {
+ dout(10) << "peer claims " << p.first << " bounds " << p.second << dendl;
+ // survivors must not fabricate dirfrags; only a recovering mds forces them.
+ CDir *dir = get_force_dirfrag(p.first, !survivor);
+ if (!dir)
+ continue;
+ adjust_bounded_subtree_auth(dir, p.second, from);
+ try_subtree_merge(dir);
+ }
+
+ show_subtrees();
+
+ // note ambiguous imports too
+ for (const auto& p : m->ambiguous_imports) {
+ dout(10) << "noting ambiguous import on " << p.first << " bounds " << p.second << dendl;
+ other_ambiguous_imports[from][p.first] = p.second;
+ }
+
+ // learn other mds' pending snaptable commits. later, when resolve finishes,
+ // we will reload the snaptable cache from the snapserver; this way the
+ // snaptable cache gets synced among all mds
+ for (const auto& p : m->table_clients) {
+ dout(10) << " noting " << get_mdstable_name(p.type)
+ << " pending_commits " << p.pending_commits << dendl;
+ MDSTableClient *client = mds->get_table_client(p.type);
+ for (const auto& q : p.pending_commits)
+ client->notify_commit(q);
+ }
+
+ // did i get them all?
+ resolve_gather.erase(from);
+
+ maybe_resolve_finish();
+}
+
+// Re-run handle_resolve() for messages we previously deferred (e.g. while
+// pending master updates or rollbacks were outstanding). We swap the map
+// into a local first so a re-deferred message lands in a fresh
+// delayed_resolve rather than the one being iterated.
+void MDCache::process_delayed_resolve()
+{
+ dout(10) << "process_delayed_resolve" << dendl;
+ map<mds_rank_t, MMDSResolve::const_ref> tmp;
+ tmp.swap(delayed_resolve);
+ for (auto &p : tmp) {
+ handle_resolve(p.second);
+ }
+}
+
+// Drop any deferred resolve from 'who' (e.g. because it failed again).
+void MDCache::discard_delayed_resolve(mds_rank_t who)
+{
+ delayed_resolve.erase(who);
+}
+
+// Called once resolve acks and rollbacks are settled; if resolves from the
+// whole recovery set have arrived, disambiguate imports and either finish
+// the resolve stage (recovering mds) or kick pending rejoins (survivor).
+void MDCache::maybe_resolve_finish()
+{
+ // callers guarantee acks and rollbacks are fully processed first
+ ceph_assert(resolve_ack_gather.empty());
+ ceph_assert(resolve_need_rollback.empty());
+
+ if (!resolve_gather.empty()) {
+ dout(10) << "maybe_resolve_finish still waiting for resolves ("
+ << resolve_gather << ")" << dendl;
+ return;
+ }
+
+ dout(10) << "maybe_resolve_finish got all resolves+resolve_acks, done." << dendl;
+ disambiguate_my_imports();
+ finish_committed_masters();
+
+ if (resolve_done) {
+ // recovering: clean up cache and complete the resolve-stage callback
+ ceph_assert(mds->is_resolve());
+ trim_unlinked_inodes();
+ recalc_auth_bits(false);
+ resolve_done.release()->complete(0);
+ } else {
+ // I am survivor.
+ maybe_send_pending_rejoins();
+ }
+}
+
+// Process a master's verdict on our uncommitted slave updates: commit or
+// abort (rollback) each listed request.
+void MDCache::handle_resolve_ack(const MMDSResolveAck::const_ref &ack)
+{
+ dout(10) << "handle_resolve_ack " << *ack << " from " << ack->get_source() << dendl;
+ mds_rank_t from = mds_rank_t(ack->get_source().num());
+
+ // ignore stale acks from ranks we are not gathering from
+ if (!resolve_ack_gather.count(from) ||
+ mds->mdsmap->get_state(from) < MDSMap::STATE_RESOLVE) {
+ return;
+ }
+
+ if (ambiguous_slave_updates.count(from)) {
+ ceph_assert(mds->mdsmap->is_clientreplay_or_active_or_stopping(from));
+ ceph_assert(mds->is_clientreplay() || mds->is_active() || mds->is_stopping());
+ }
+
+ for (const auto &p : ack->commit) {
+ dout(10) << " commit on slave " << p.first << dendl;
+
+ if (ambiguous_slave_updates.count(from)) {
+ remove_ambiguous_slave_update(p.first, from);
+ continue;
+ }
+
+ if (mds->is_resolve()) {
+ // replay
+ MDSlaveUpdate *su = get_uncommitted_slave(p.first, from);
+ ceph_assert(su);
+
+ // log commit
+ mds->mdlog->start_submit_entry(new ESlaveUpdate(mds->mdlog, "unknown",
p.first, from, + ESlaveUpdate::OP_COMMIT, su->origop), + new C_MDC_SlaveCommit(this, from, p.first)); + mds->mdlog->flush(); + + finish_uncommitted_slave(p.first); + } else { + MDRequestRef mdr = request_get(p.first); + // information about master imported caps + if (p.second.length() > 0) + mdr->more()->inode_import.share(p.second); + + ceph_assert(mdr->slave_request == 0); // shouldn't be doing anything! + request_finish(mdr); + } + } + + for (const auto &metareq : ack->abort) { + dout(10) << " abort on slave " << metareq << dendl; + + if (mds->is_resolve()) { + MDSlaveUpdate *su = get_uncommitted_slave(metareq, from); + ceph_assert(su); + + // perform rollback (and journal a rollback entry) + // note: this will hold up the resolve a bit, until the rollback entries journal. + MDRequestRef null_ref; + switch (su->origop) { + case ESlaveUpdate::LINK: + mds->server->do_link_rollback(su->rollback, from, null_ref); + break; + case ESlaveUpdate::RENAME: + mds->server->do_rename_rollback(su->rollback, from, null_ref); + break; + case ESlaveUpdate::RMDIR: + mds->server->do_rmdir_rollback(su->rollback, from, null_ref); + break; + default: + ceph_abort(); + } + } else { + MDRequestRef mdr = request_get(metareq); + mdr->aborted = true; + if (mdr->slave_request) { + if (mdr->slave_did_prepare()) // journaling slave prepare ? 
+ add_rollback(metareq, from); + } else { + request_finish(mdr); + } + } + } + + if (!ambiguous_slave_updates.count(from)) { + resolve_ack_gather.erase(from); + maybe_finish_slave_resolve(); + } +} + +void MDCache::add_uncommitted_slave(metareqid_t reqid, LogSegment *ls, mds_rank_t master, MDSlaveUpdate *su) +{ + auto const &ret = uncommitted_slaves.emplace(std::piecewise_construct, + std::forward_as_tuple(reqid), + std::forward_as_tuple()); + ceph_assert(ret.second); + ls->uncommitted_slaves.insert(reqid); + uslave &u = ret.first->second; + u.master = master; + u.ls = ls; + u.su = su; + if (su == nullptr) { + return; + } + for(set<CInode*>::iterator p = su->olddirs.begin(); p != su->olddirs.end(); ++p) + uncommitted_slave_rename_olddir[*p]++; + for(set<CInode*>::iterator p = su->unlinked.begin(); p != su->unlinked.end(); ++p) + uncommitted_slave_unlink[*p]++; +} + +void MDCache::finish_uncommitted_slave(metareqid_t reqid, bool assert_exist) +{ + auto it = uncommitted_slaves.find(reqid); + if (it == uncommitted_slaves.end()) { + ceph_assert(!assert_exist); + return; + } + uslave &u = it->second; + MDSlaveUpdate* su = u.su; + + if (!u.waiters.empty()) { + mds->queue_waiters(u.waiters); + } + u.ls->uncommitted_slaves.erase(reqid); + uncommitted_slaves.erase(it); + + if (su == nullptr) { + return; + } + // discard the non-auth subtree we renamed out of + for(set<CInode*>::iterator p = su->olddirs.begin(); p != su->olddirs.end(); ++p) { + CInode *diri = *p; + map<CInode*, int>::iterator it = uncommitted_slave_rename_olddir.find(diri); + ceph_assert(it != uncommitted_slave_rename_olddir.end()); + it->second--; + if (it->second == 0) { + uncommitted_slave_rename_olddir.erase(it); + list<CDir*> ls; + diri->get_dirfrags(ls); + for (list<CDir*>::iterator q = ls.begin(); q != ls.end(); ++q) { + CDir *root = get_subtree_root(*q); + if (root->get_dir_auth() == CDIR_AUTH_UNDEF) { + try_trim_non_auth_subtree(root); + if (*q != root) + break; + } + } + } else + 
ceph_assert(it->second > 0); + } + // removed the inodes that were unlinked by slave update + for(set<CInode*>::iterator p = su->unlinked.begin(); p != su->unlinked.end(); ++p) { + CInode *in = *p; + map<CInode*, int>::iterator it = uncommitted_slave_unlink.find(in); + ceph_assert(it != uncommitted_slave_unlink.end()); + it->second--; + if (it->second == 0) { + uncommitted_slave_unlink.erase(it); + if (!in->get_projected_parent_dn()) + mds->mdcache->remove_inode_recursive(in); + } else + ceph_assert(it->second > 0); + } + delete su; +} + +MDSlaveUpdate* MDCache::get_uncommitted_slave(metareqid_t reqid, mds_rank_t master) +{ + + MDSlaveUpdate* su = nullptr; + auto it = uncommitted_slaves.find(reqid); + if (it != uncommitted_slaves.end() && + it->second.master == master) { + su = it->second.su; + } + return su; +} + +void MDCache::finish_rollback(metareqid_t reqid, MDRequestRef& mdr) { + auto p = resolve_need_rollback.find(reqid); + ceph_assert(p != resolve_need_rollback.end()); + if (mds->is_resolve()) { + finish_uncommitted_slave(reqid, false); + } else if (mdr) { + finish_uncommitted_slave(mdr->reqid, mdr->more()->slave_update_journaled); + } + resolve_need_rollback.erase(p); + maybe_finish_slave_resolve(); +} + +void MDCache::disambiguate_other_imports() +{ + dout(10) << "disambiguate_other_imports" << dendl; + + bool recovering = !(mds->is_clientreplay() || mds->is_active() || mds->is_stopping()); + // other nodes' ambiguous imports + for (map<mds_rank_t, map<dirfrag_t, vector<dirfrag_t> > >::iterator p = other_ambiguous_imports.begin(); + p != other_ambiguous_imports.end(); + ++p) { + mds_rank_t who = p->first; + dout(10) << "ambiguous imports for mds." << who << dendl; + + for (map<dirfrag_t, vector<dirfrag_t> >::iterator q = p->second.begin(); + q != p->second.end(); + ++q) { + dout(10) << " ambiguous import " << q->first << " bounds " << q->second << dendl; + // an ambiguous import will not race with a refragmentation; it's appropriate to force here. 
+ CDir *dir = get_force_dirfrag(q->first, recovering); + if (!dir) continue; + + if (dir->is_ambiguous_auth() || // works for me_ambig or if i am a surviving bystander + dir->authority() == CDIR_AUTH_UNDEF) { // resolving + dout(10) << " mds." << who << " did import " << *dir << dendl; + adjust_bounded_subtree_auth(dir, q->second, who); + try_subtree_merge(dir); + } else { + dout(10) << " mds." << who << " did not import " << *dir << dendl; + } + } + } + other_ambiguous_imports.clear(); +} + +void MDCache::disambiguate_my_imports() +{ + dout(10) << "disambiguate_my_imports" << dendl; + + if (!mds->is_resolve()) { + ceph_assert(my_ambiguous_imports.empty()); + return; + } + + disambiguate_other_imports(); + + // my ambiguous imports + mds_authority_t me_ambig(mds->get_nodeid(), mds->get_nodeid()); + while (!my_ambiguous_imports.empty()) { + map<dirfrag_t, vector<dirfrag_t> >::iterator q = my_ambiguous_imports.begin(); + + CDir *dir = get_dirfrag(q->first); + ceph_assert(dir); + + if (dir->authority() != me_ambig) { + dout(10) << "ambiguous import auth known, must not be me " << *dir << dendl; + cancel_ambiguous_import(dir); + + mds->mdlog->start_submit_entry(new EImportFinish(dir, false)); + + // subtree may have been swallowed by another node claiming dir + // as their own. + CDir *root = get_subtree_root(dir); + if (root != dir) + dout(10) << " subtree root is " << *root << dendl; + ceph_assert(root->dir_auth.first != mds->get_nodeid()); // no us! + try_trim_non_auth_subtree(root); + } else { + dout(10) << "ambiguous import auth unclaimed, must be me " << *dir << dendl; + finish_ambiguous_import(q->first); + mds->mdlog->start_submit_entry(new EImportFinish(dir, true)); + } + } + ceph_assert(my_ambiguous_imports.empty()); + mds->mdlog->flush(); + + // verify all my subtrees are unambiguous! 
+ for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin(); + p != subtrees.end(); + ++p) { + CDir *dir = p->first; + if (dir->is_ambiguous_dir_auth()) { + dout(0) << "disambiguate_imports uh oh, dir_auth is still ambiguous for " << *dir << dendl; + } + ceph_assert(!dir->is_ambiguous_dir_auth()); + } + + show_subtrees(); +} + + +void MDCache::add_ambiguous_import(dirfrag_t base, const vector<dirfrag_t>& bounds) +{ + ceph_assert(my_ambiguous_imports.count(base) == 0); + my_ambiguous_imports[base] = bounds; +} + + +void MDCache::add_ambiguous_import(CDir *base, const set<CDir*>& bounds) +{ + // make a list + vector<dirfrag_t> binos; + for (set<CDir*>::iterator p = bounds.begin(); + p != bounds.end(); + ++p) + binos.push_back((*p)->dirfrag()); + + // note: this can get called twice if the exporter fails during recovery + if (my_ambiguous_imports.count(base->dirfrag())) + my_ambiguous_imports.erase(base->dirfrag()); + + add_ambiguous_import(base->dirfrag(), binos); +} + +void MDCache::cancel_ambiguous_import(CDir *dir) +{ + dirfrag_t df = dir->dirfrag(); + ceph_assert(my_ambiguous_imports.count(df)); + dout(10) << "cancel_ambiguous_import " << df + << " bounds " << my_ambiguous_imports[df] + << " " << *dir + << dendl; + my_ambiguous_imports.erase(df); +} + +void MDCache::finish_ambiguous_import(dirfrag_t df) +{ + ceph_assert(my_ambiguous_imports.count(df)); + vector<dirfrag_t> bounds; + bounds.swap(my_ambiguous_imports[df]); + my_ambiguous_imports.erase(df); + + dout(10) << "finish_ambiguous_import " << df + << " bounds " << bounds + << dendl; + CDir *dir = get_dirfrag(df); + ceph_assert(dir); + + // adjust dir_auth, import maps + adjust_bounded_subtree_auth(dir, bounds, mds->get_nodeid()); + try_subtree_merge(dir); +} + +void MDCache::remove_inode_recursive(CInode *in) +{ + dout(10) << "remove_inode_recursive " << *in << dendl; + list<CDir*> ls; + in->get_dirfrags(ls); + list<CDir*>::iterator p = ls.begin(); + while (p != ls.end()) { + CDir *subdir = *p++; + + 
dout(10) << " removing dirfrag " << subdir << dendl; + auto it = subdir->items.begin(); + while (it != subdir->items.end()) { + CDentry *dn = it->second; + ++it; + CDentry::linkage_t *dnl = dn->get_linkage(); + if (dnl->is_primary()) { + CInode *tin = dnl->get_inode(); + subdir->unlink_inode(dn, false); + remove_inode_recursive(tin); + } + subdir->remove_dentry(dn); + } + + if (subdir->is_subtree_root()) + remove_subtree(subdir); + in->close_dirfrag(subdir->dirfrag().frag); + } + remove_inode(in); +} + +bool MDCache::expire_recursive(CInode *in, expiremap &expiremap) +{ + ceph_assert(!in->is_auth()); + + dout(10) << __func__ << ":" << *in << dendl; + + // Recurse into any dirfrags beneath this inode + list<CDir*> ls; + in->get_dirfrags(ls); + for (auto subdir : ls) { + if (!in->is_mdsdir() && subdir->is_subtree_root()) { + dout(10) << __func__ << ": stray still has subtree " << *in << dendl; + return true; + } + + for (auto &it : subdir->items) { + CDentry *dn = it.second; + CDentry::linkage_t *dnl = dn->get_linkage(); + if (dnl->is_primary()) { + CInode *tin = dnl->get_inode(); + + /* Remote strays with linkage (i.e. 
hardlinks) should not be + * expired, because they may be the target of + * a rename() as the owning MDS shuts down */ + if (!tin->is_stray() && tin->inode.nlink) { + dout(10) << __func__ << ": stray still has linkage " << *tin << dendl; + return true; + } + + const bool abort = expire_recursive(tin, expiremap); + if (abort) { + return true; + } + } + if (dn->lru_is_expireable()) { + trim_dentry(dn, expiremap); + } else { + dout(10) << __func__ << ": stray dn is not expireable " << *dn << dendl; + return true; + } + } + } + + return false; +} + +void MDCache::trim_unlinked_inodes() +{ + dout(7) << "trim_unlinked_inodes" << dendl; + int count = 0; + vector<CInode*> q; + for (auto &p : inode_map) { + CInode *in = p.second; + if (in->get_parent_dn() == NULL && !in->is_base()) { + dout(7) << " will trim from " << *in << dendl; + q.push_back(in); + } + + if (!(++count % 1000)) + mds->heartbeat_reset(); + } + + for (auto& in : q) { + remove_inode_recursive(in); + + if (!(++count % 1000)) + mds->heartbeat_reset(); + } +} + +/** recalc_auth_bits() + * once subtree auth is disambiguated, we need to adjust all the + * auth and dirty bits in our cache before moving on. + */ +void MDCache::recalc_auth_bits(bool replay) +{ + dout(7) << "recalc_auth_bits " << (replay ? 
"(replay)" : "") << dendl; + + if (root) { + root->inode_auth.first = mds->mdsmap->get_root(); + bool auth = mds->get_nodeid() == root->inode_auth.first; + if (auth) { + root->state_set(CInode::STATE_AUTH); + } else { + root->state_clear(CInode::STATE_AUTH); + if (!replay) + root->state_set(CInode::STATE_REJOINING); + } + } + + set<CInode*> subtree_inodes; + for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin(); + p != subtrees.end(); + ++p) { + if (p->first->dir_auth.first == mds->get_nodeid()) + subtree_inodes.insert(p->first->inode); + } + + for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin(); + p != subtrees.end(); + ++p) { + if (p->first->inode->is_mdsdir()) { + CInode *in = p->first->inode; + bool auth = in->ino() == MDS_INO_MDSDIR(mds->get_nodeid()); + if (auth) { + in->state_set(CInode::STATE_AUTH); + } else { + in->state_clear(CInode::STATE_AUTH); + if (!replay) + in->state_set(CInode::STATE_REJOINING); + } + } + + list<CDir*> dfq; // dirfrag queue + dfq.push_back(p->first); + + bool auth = p->first->authority().first == mds->get_nodeid(); + dout(10) << " subtree auth=" << auth << " for " << *p->first << dendl; + + while (!dfq.empty()) { + CDir *dir = dfq.front(); + dfq.pop_front(); + + // dir + if (auth) { + dir->state_set(CDir::STATE_AUTH); + } else { + dir->state_clear(CDir::STATE_AUTH); + if (!replay) { + // close empty non-auth dirfrag + if (!dir->is_subtree_root() && dir->get_num_any() == 0) { + dir->inode->close_dirfrag(dir->get_frag()); + continue; + } + dir->state_set(CDir::STATE_REJOINING); + dir->state_clear(CDir::STATE_COMPLETE); + if (dir->is_dirty()) + dir->mark_clean(); + } + } + + // dentries in this dir + for (auto &p : dir->items) { + // dn + CDentry *dn = p.second; + CDentry::linkage_t *dnl = dn->get_linkage(); + if (auth) { + dn->state_set(CDentry::STATE_AUTH); + } else { + dn->state_clear(CDentry::STATE_AUTH); + if (!replay) { + dn->state_set(CDentry::STATE_REJOINING); + if (dn->is_dirty()) + dn->mark_clean(); + } + } + + 
if (dnl->is_primary()) { + // inode + CInode *in = dnl->get_inode(); + if (auth) { + in->state_set(CInode::STATE_AUTH); + } else { + in->state_clear(CInode::STATE_AUTH); + if (!replay) { + in->state_set(CInode::STATE_REJOINING); + if (in->is_dirty()) + in->mark_clean(); + if (in->is_dirty_parent()) + in->clear_dirty_parent(); + // avoid touching scatterlocks for our subtree roots! + if (subtree_inodes.count(in) == 0) + in->clear_scatter_dirty(); + } + } + // recurse? + if (in->is_dir()) + in->get_nested_dirfrags(dfq); + } + } + } + } + + show_subtrees(); + show_cache(); +} + + + +// =========================================================================== +// REJOIN + +/* + * notes on scatterlock recovery: + * + * - recovering inode replica sends scatterlock data for any subtree + * roots (the only ones that are possibly dirty). + * + * - surviving auth incorporates any provided scatterlock data. any + * pending gathers are then finished, as with the other lock types. + * + * that takes care of surviving auth + (recovering replica)*. + * + * - surviving replica sends strong_inode, which includes current + * scatterlock state, AND any dirty scatterlock data. this + * provides the recovering auth with everything it might need. + * + * - recovering auth must pick initial scatterlock state based on + * (weak|strong) rejoins. + * - always assimilate scatterlock data (it can't hurt) + * - any surviving replica in SCATTER state -> SCATTER. otherwise, SYNC. + * - include base inode in ack for all inodes that saw scatterlock content + * + * also, for scatter gather, + * + * - auth increments {frag,r}stat.version on completion of any gather. + * + * - auth incorporates changes in a gather _only_ if the version + * matches. + * + * - replica discards changes any time the scatterlock syncs, and + * after recovery. 
+ */ + +void MDCache::dump_rejoin_status(Formatter *f) const +{ + f->open_object_section("rejoin_status"); + f->dump_stream("rejoin_gather") << rejoin_gather; + f->dump_stream("rejoin_ack_gather") << rejoin_ack_gather; + f->dump_unsigned("num_opening_inodes", cap_imports_num_opening); + f->close_section(); +} + +void MDCache::rejoin_start(MDSContext *rejoin_done_) +{ + dout(10) << "rejoin_start" << dendl; + ceph_assert(!rejoin_done); + rejoin_done.reset(rejoin_done_); + + rejoin_gather = recovery_set; + // need finish opening cap inodes before sending cache rejoins + rejoin_gather.insert(mds->get_nodeid()); + process_imported_caps(); +} + +/* + * rejoin phase! + * + * this initiates rejoin. it should be called before we get any + * rejoin or rejoin_ack messages (or else mdsmap distribution is broken). + * + * we start out by sending rejoins to everyone in the recovery set. + * + * if we are rejoin, send for all regions in our cache. + * if we are active|stopping, send only to nodes that are rejoining. + */ +void MDCache::rejoin_send_rejoins() +{ + dout(10) << "rejoin_send_rejoins with recovery_set " << recovery_set << dendl; + + if (rejoin_gather.count(mds->get_nodeid())) { + dout(7) << "rejoin_send_rejoins still processing imported caps, delaying" << dendl; + rejoins_pending = true; + return; + } + if (!resolve_gather.empty()) { + dout(7) << "rejoin_send_rejoins still waiting for resolves (" + << resolve_gather << ")" << dendl; + rejoins_pending = true; + return; + } + + ceph_assert(!migrator->is_importing()); + ceph_assert(!migrator->is_exporting()); + + if (!mds->is_rejoin()) { + disambiguate_other_imports(); + } + + map<mds_rank_t, MMDSCacheRejoin::ref> rejoins; + + + // if i am rejoining, send a rejoin to everyone. + // otherwise, just send to others who are rejoining. + for (set<mds_rank_t>::iterator p = recovery_set.begin(); + p != recovery_set.end(); + ++p) { + if (*p == mds->get_nodeid()) continue; // nothing to myself! 
+ if (rejoin_sent.count(*p)) continue; // already sent a rejoin to this node! + if (mds->is_rejoin()) + rejoins[*p] = MMDSCacheRejoin::create(MMDSCacheRejoin::OP_WEAK); + else if (mds->mdsmap->is_rejoin(*p)) + rejoins[*p] = MMDSCacheRejoin::create(MMDSCacheRejoin::OP_STRONG); + } + + if (mds->is_rejoin()) { + map<client_t, pair<Session*, set<mds_rank_t> > > client_exports; + for (auto& p : cap_exports) { + mds_rank_t target = p.second.first; + if (rejoins.count(target) == 0) + continue; + for (auto q = p.second.second.begin(); q != p.second.second.end(); ) { + Session *session = nullptr; + auto it = client_exports.find(q->first); + if (it != client_exports.end()) { + session = it->second.first; + if (session) + it->second.second.insert(target); + } else { + session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v)); + auto& r = client_exports[q->first]; + r.first = session; + if (session) + r.second.insert(target); + } + if (session) { + ++q; + } else { + // remove reconnect with no session + p.second.second.erase(q++); + } + } + rejoins[target]->cap_exports[p.first] = p.second.second; + } + for (auto& p : client_exports) { + Session *session = p.second.first; + for (auto& q : p.second.second) { + auto rejoin = rejoins[q]; + rejoin->client_map[p.first] = session->info.inst; + rejoin->client_metadata_map[p.first] = session->info.client_metadata; + } + } + } + + + // check all subtrees + for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin(); + p != subtrees.end(); + ++p) { + CDir *dir = p->first; + ceph_assert(dir->is_subtree_root()); + if (dir->is_ambiguous_dir_auth()) { + // exporter is recovering, importer is survivor. + ceph_assert(rejoins.count(dir->authority().first)); + ceph_assert(!rejoins.count(dir->authority().second)); + continue; + } + + // my subtree? + if (dir->is_auth()) + continue; // skip my own regions! 
+ + mds_rank_t auth = dir->get_dir_auth().first; + ceph_assert(auth >= 0); + if (rejoins.count(auth) == 0) + continue; // don't care about this node's subtrees + + rejoin_walk(dir, rejoins[auth]); + } + + // rejoin root inodes, too + for (auto &p : rejoins) { + if (mds->is_rejoin()) { + // weak + if (p.first == 0 && root) { + p.second->add_weak_inode(root->vino()); + if (root->is_dirty_scattered()) { + dout(10) << " sending scatterlock state on root " << *root << dendl; + p.second->add_scatterlock_state(root); + } + } + if (CInode *in = get_inode(MDS_INO_MDSDIR(p.first))) { + if (in) + p.second->add_weak_inode(in->vino()); + } + } else { + // strong + if (p.first == 0 && root) { + p.second->add_strong_inode(root->vino(), + root->get_replica_nonce(), + root->get_caps_wanted(), + root->filelock.get_state(), + root->nestlock.get_state(), + root->dirfragtreelock.get_state()); + root->state_set(CInode::STATE_REJOINING); + if (root->is_dirty_scattered()) { + dout(10) << " sending scatterlock state on root " << *root << dendl; + p.second->add_scatterlock_state(root); + } + } + + if (CInode *in = get_inode(MDS_INO_MDSDIR(p.first))) { + p.second->add_strong_inode(in->vino(), + in->get_replica_nonce(), + in->get_caps_wanted(), + in->filelock.get_state(), + in->nestlock.get_state(), + in->dirfragtreelock.get_state()); + in->state_set(CInode::STATE_REJOINING); + } + } + } + + if (!mds->is_rejoin()) { + // i am survivor. send strong rejoin. 
+ // note request remote_auth_pins, xlocks + for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin(); + p != active_requests.end(); + ++p) { + MDRequestRef& mdr = p->second; + if (mdr->is_slave()) + continue; + // auth pins + for (const auto& q : mdr->remote_auth_pins) { + if (!q.first->is_auth()) { + ceph_assert(q.second == q.first->authority().first); + if (rejoins.count(q.second) == 0) continue; + const MMDSCacheRejoin::ref &rejoin = rejoins[q.second]; + + dout(15) << " " << *mdr << " authpin on " << *q.first << dendl; + MDSCacheObjectInfo i; + q.first->set_object_info(i); + if (i.ino) + rejoin->add_inode_authpin(vinodeno_t(i.ino, i.snapid), mdr->reqid, mdr->attempt); + else + rejoin->add_dentry_authpin(i.dirfrag, i.dname, i.snapid, mdr->reqid, mdr->attempt); + + if (mdr->has_more() && mdr->more()->is_remote_frozen_authpin && + mdr->more()->rename_inode == q.first) + rejoin->add_inode_frozen_authpin(vinodeno_t(i.ino, i.snapid), + mdr->reqid, mdr->attempt); + } + } + // xlocks + for (const auto& q : mdr->locks) { + auto lock = q.lock; + auto obj = lock->get_parent(); + if (q.is_xlock() && !obj->is_auth()) { + mds_rank_t who = obj->authority().first; + if (rejoins.count(who) == 0) continue; + const MMDSCacheRejoin::ref &rejoin = rejoins[who]; + + dout(15) << " " << *mdr << " xlock on " << *lock << " " << *obj << dendl; + MDSCacheObjectInfo i; + obj->set_object_info(i); + if (i.ino) + rejoin->add_inode_xlock(vinodeno_t(i.ino, i.snapid), lock->get_type(), + mdr->reqid, mdr->attempt); + else + rejoin->add_dentry_xlock(i.dirfrag, i.dname, i.snapid, + mdr->reqid, mdr->attempt); + } else if (q.is_remote_wrlock()) { + mds_rank_t who = q.wrlock_target; + if (rejoins.count(who) == 0) continue; + const MMDSCacheRejoin::ref &rejoin = rejoins[who]; + + dout(15) << " " << *mdr << " wrlock on " << *lock << " " << *obj << dendl; + MDSCacheObjectInfo i; + obj->set_object_info(i); + ceph_assert(i.ino); + rejoin->add_inode_wrlock(vinodeno_t(i.ino, 
i.snapid), lock->get_type(), + mdr->reqid, mdr->attempt); + } + } + } + } + + // send the messages + for (auto &p : rejoins) { + ceph_assert(rejoin_sent.count(p.first) == 0); + ceph_assert(rejoin_ack_gather.count(p.first) == 0); + rejoin_sent.insert(p.first); + rejoin_ack_gather.insert(p.first); + mds->send_message_mds(p.second, p.first); + } + rejoin_ack_gather.insert(mds->get_nodeid()); // we need to complete rejoin_gather_finish, too + rejoins_pending = false; + + // nothing? + if (mds->is_rejoin() && rejoin_gather.empty()) { + dout(10) << "nothing to rejoin" << dendl; + rejoin_gather_finish(); + } +} + + +/** + * rejoin_walk - build rejoin declarations for a subtree + * + * @param dir subtree root + * @param rejoin rejoin message + * + * from a rejoining node: + * weak dirfrag + * weak dentries (w/ connectivity) + * + * from a surviving node: + * strong dirfrag + * strong dentries (no connectivity!) + * strong inodes + */ +void MDCache::rejoin_walk(CDir *dir, const MMDSCacheRejoin::ref &rejoin) +{ + dout(10) << "rejoin_walk " << *dir << dendl; + + list<CDir*> nested; // finish this dir, then do nested items + + if (mds->is_rejoin()) { + // WEAK + rejoin->add_weak_dirfrag(dir->dirfrag()); + for (auto &p : dir->items) { + CDentry *dn = p.second; + ceph_assert(dn->last == CEPH_NOSNAP); + CDentry::linkage_t *dnl = dn->get_linkage(); + dout(15) << " add_weak_primary_dentry " << *dn << dendl; + ceph_assert(dnl->is_primary()); + CInode *in = dnl->get_inode(); + ceph_assert(dnl->get_inode()->is_dir()); + rejoin->add_weak_primary_dentry(dir->ino(), dn->get_name(), dn->first, dn->last, in->ino()); + in->get_nested_dirfrags(nested); + if (in->is_dirty_scattered()) { + dout(10) << " sending scatterlock state on " << *in << dendl; + rejoin->add_scatterlock_state(in); + } + } + } else { + // STRONG + dout(15) << " add_strong_dirfrag " << *dir << dendl; + rejoin->add_strong_dirfrag(dir->dirfrag(), dir->get_replica_nonce(), dir->get_dir_rep()); + 
dir->state_set(CDir::STATE_REJOINING); + + for (auto it = dir->items.begin(); it != dir->items.end(); ) { + CDentry *dn = it->second; + ++it; + dn->state_set(CDentry::STATE_REJOINING); + CDentry::linkage_t *dnl = dn->get_linkage(); + CInode *in = dnl->is_primary() ? dnl->get_inode() : NULL; + + // trim snap dentries. because they may have been pruned by + // their auth mds (snap deleted) + if (dn->last != CEPH_NOSNAP) { + if (in && !in->remote_parents.empty()) { + // unlink any stale remote snap dentry. + for (auto it2 = in->remote_parents.begin(); it2 != in->remote_parents.end(); ) { + CDentry *remote_dn = *it2; + ++it2; + ceph_assert(remote_dn->last != CEPH_NOSNAP); + remote_dn->unlink_remote(remote_dn->get_linkage()); + } + } + if (dn->lru_is_expireable()) { + if (!dnl->is_null()) + dir->unlink_inode(dn, false); + if (in) + remove_inode(in); + dir->remove_dentry(dn); + continue; + } else { + // Inventing null/remote dentry shouldn't cause problem + ceph_assert(!dnl->is_primary()); + } + } + + dout(15) << " add_strong_dentry " << *dn << dendl; + rejoin->add_strong_dentry(dir->dirfrag(), dn->get_name(), dn->first, dn->last, + dnl->is_primary() ? dnl->get_inode()->ino():inodeno_t(0), + dnl->is_remote() ? dnl->get_remote_ino():inodeno_t(0), + dnl->is_remote() ? 
dnl->get_remote_d_type():0, + dn->get_replica_nonce(), + dn->lock.get_state()); + dn->state_set(CDentry::STATE_REJOINING); + if (dnl->is_primary()) { + CInode *in = dnl->get_inode(); + dout(15) << " add_strong_inode " << *in << dendl; + rejoin->add_strong_inode(in->vino(), + in->get_replica_nonce(), + in->get_caps_wanted(), + in->filelock.get_state(), + in->nestlock.get_state(), + in->dirfragtreelock.get_state()); + in->state_set(CInode::STATE_REJOINING); + in->get_nested_dirfrags(nested); + if (in->is_dirty_scattered()) { + dout(10) << " sending scatterlock state on " << *in << dendl; + rejoin->add_scatterlock_state(in); + } + } + } + } + + // recurse into nested dirs + for (list<CDir*>::iterator p = nested.begin(); + p != nested.end(); + ++p) + rejoin_walk(*p, rejoin); +} + + +/* + * i got a rejoin. + * - reply with the lockstate + * + * if i am active|stopping, + * - remove source from replica list for everything not referenced here. + */ +void MDCache::handle_cache_rejoin(const MMDSCacheRejoin::const_ref &m) +{ + dout(7) << "handle_cache_rejoin " << *m << " from " << m->get_source() + << " (" << m->get_payload().length() << " bytes)" + << dendl; + + switch (m->op) { + case MMDSCacheRejoin::OP_WEAK: + handle_cache_rejoin_weak(m); + break; + case MMDSCacheRejoin::OP_STRONG: + handle_cache_rejoin_strong(m); + break; + case MMDSCacheRejoin::OP_ACK: + handle_cache_rejoin_ack(m); + break; + + default: + ceph_abort(); + } +} + + +/* + * handle_cache_rejoin_weak + * + * the sender + * - is recovering from their journal. + * - may have incorrect (out of date) inode contents + * - will include weak dirfrag if sender is dirfrag auth and parent inode auth is recipient + * + * if the sender didn't trim_non_auth(), they + * - may have incorrect (out of date) dentry/inode linkage + * - may have deleted/purged inodes + * and i may have to go to disk to get accurate inode contents. yuck. 
+ */ +void MDCache::handle_cache_rejoin_weak(const MMDSCacheRejoin::const_ref &weak) +{ + mds_rank_t from = mds_rank_t(weak->get_source().num()); + + // possible response(s) + MMDSCacheRejoin::ref ack; // if survivor + set<vinodeno_t> acked_inodes; // if survivor + set<SimpleLock *> gather_locks; // if survivor + bool survivor = false; // am i a survivor? + + if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping()) { + survivor = true; + dout(10) << "i am a surivivor, and will ack immediately" << dendl; + ack = MMDSCacheRejoin::create(MMDSCacheRejoin::OP_ACK); + + map<inodeno_t,map<client_t,Capability::Import> > imported_caps; + + // check cap exports + for (auto p = weak->cap_exports.begin(); p != weak->cap_exports.end(); ++p) { + CInode *in = get_inode(p->first); + ceph_assert(!in || in->is_auth()); + for (auto q = p->second.begin(); q != p->second.end(); ++q) { + dout(10) << " claiming cap import " << p->first << " client." << q->first << " on " << *in << dendl; + Capability *cap = rejoin_import_cap(in, q->first, q->second, from); + Capability::Import& im = imported_caps[p->first][q->first]; + if (cap) { + im.cap_id = cap->get_cap_id(); + im.issue_seq = cap->get_last_seq(); + im.mseq = cap->get_mseq(); + } else { + // all are zero + } + } + mds->locker->eval(in, CEPH_CAP_LOCKS, true); + } + + encode(imported_caps, ack->imported_caps); + } else { + ceph_assert(mds->is_rejoin()); + + // we may have already received a strong rejoin from the sender. + rejoin_scour_survivor_replicas(from, NULL, acked_inodes, gather_locks); + ceph_assert(gather_locks.empty()); + + // check cap exports. 
+ rejoin_client_map.insert(weak->client_map.begin(), weak->client_map.end()); + rejoin_client_metadata_map.insert(weak->client_metadata_map.begin(), + weak->client_metadata_map.end()); + + for (auto p = weak->cap_exports.begin(); p != weak->cap_exports.end(); ++p) { + CInode *in = get_inode(p->first); + ceph_assert(!in || in->is_auth()); + // note + for (auto q = p->second.begin(); q != p->second.end(); ++q) { + dout(10) << " claiming cap import " << p->first << " client." << q->first << dendl; + cap_imports[p->first][q->first][from] = q->second; + } + } + } + + // assimilate any potentially dirty scatterlock state + for (const auto &p : weak->inode_scatterlocks) { + CInode *in = get_inode(p.first); + ceph_assert(in); + in->decode_lock_state(CEPH_LOCK_IFILE, p.second.file); + in->decode_lock_state(CEPH_LOCK_INEST, p.second.nest); + in->decode_lock_state(CEPH_LOCK_IDFT, p.second.dft); + if (!survivor) + rejoin_potential_updated_scatterlocks.insert(in); + } + + // recovering peer may send incorrect dirfrags here. we need to + // infer which dirfrag they meant. the ack will include a + // strong_dirfrag that will set them straight on the fragmentation. 
+ + // walk weak map + set<CDir*> dirs_to_share; + for (const auto &p : weak->weak_dirfrags) { + CInode *diri = get_inode(p.ino); + if (!diri) + dout(0) << " missing dir ino " << p.ino << dendl; + ceph_assert(diri); + + frag_vec_t leaves; + if (diri->dirfragtree.is_leaf(p.frag)) { + leaves.push_back(p.frag); + } else { + diri->dirfragtree.get_leaves_under(p.frag, leaves); + if (leaves.empty()) + leaves.push_back(diri->dirfragtree[p.frag.value()]); + } + for (const auto& leaf : leaves) { + CDir *dir = diri->get_dirfrag(leaf); + if (!dir) { + dout(0) << " missing dir for " << p.frag << " (which maps to " << leaf << ") on " << *diri << dendl; + continue; + } + ceph_assert(dir); + if (dirs_to_share.count(dir)) { + dout(10) << " already have " << p.frag << " -> " << leaf << " " << *dir << dendl; + } else { + dirs_to_share.insert(dir); + unsigned nonce = dir->add_replica(from); + dout(10) << " have " << p.frag << " -> " << leaf << " " << *dir << dendl; + if (ack) { + ack->add_strong_dirfrag(dir->dirfrag(), nonce, dir->dir_rep); + ack->add_dirfrag_base(dir); + } + } + } + } + + for (const auto &p : weak->weak) { + CInode *diri = get_inode(p.first); + if (!diri) + dout(0) << " missing dir ino " << p.first << dendl; + ceph_assert(diri); + + // weak dentries + CDir *dir = 0; + for (const auto &q : p.second) { + // locate proper dirfrag. 
+ // optimize for common case (one dirfrag) to avoid dirs_to_share set check + frag_t fg = diri->pick_dirfrag(q.first.name); + if (!dir || dir->get_frag() != fg) { + dir = diri->get_dirfrag(fg); + if (!dir) + dout(0) << " missing dir frag " << fg << " on " << *diri << dendl; + ceph_assert(dir); + ceph_assert(dirs_to_share.count(dir)); + } + + // and dentry + CDentry *dn = dir->lookup(q.first.name, q.first.snapid); + ceph_assert(dn); + CDentry::linkage_t *dnl = dn->get_linkage(); + ceph_assert(dnl->is_primary()); + + if (survivor && dn->is_replica(from)) + dentry_remove_replica(dn, from, gather_locks); + unsigned dnonce = dn->add_replica(from); + dout(10) << " have " << *dn << dendl; + if (ack) + ack->add_strong_dentry(dir->dirfrag(), dn->get_name(), dn->first, dn->last, + dnl->get_inode()->ino(), inodeno_t(0), 0, + dnonce, dn->lock.get_replica_state()); + + // inode + CInode *in = dnl->get_inode(); + ceph_assert(in); + + if (survivor && in->is_replica(from)) + inode_remove_replica(in, from, true, gather_locks); + unsigned inonce = in->add_replica(from); + dout(10) << " have " << *in << dendl; + + // scatter the dirlock, just in case? + if (!survivor && in->is_dir() && in->has_subtree_root_dirfrag()) + in->filelock.set_state(LOCK_MIX); + + if (ack) { + acked_inodes.insert(in->vino()); + ack->add_inode_base(in, mds->mdsmap->get_up_features()); + bufferlist bl; + in->_encode_locks_state_for_rejoin(bl, from); + ack->add_inode_locks(in, inonce, bl); + } + } + } + + // weak base inodes? (root, stray, etc.) + for (set<vinodeno_t>::iterator p = weak->weak_inodes.begin(); + p != weak->weak_inodes.end(); + ++p) { + CInode *in = get_inode(*p); + ceph_assert(in); // hmm fixme wrt stray? 
+ if (survivor && in->is_replica(from)) + inode_remove_replica(in, from, true, gather_locks); + unsigned inonce = in->add_replica(from); + dout(10) << " have base " << *in << dendl; + + if (ack) { + acked_inodes.insert(in->vino()); + ack->add_inode_base(in, mds->mdsmap->get_up_features()); + bufferlist bl; + in->_encode_locks_state_for_rejoin(bl, from); + ack->add_inode_locks(in, inonce, bl); + } + } + + ceph_assert(rejoin_gather.count(from)); + rejoin_gather.erase(from); + if (survivor) { + // survivor. do everything now. + for (const auto &p : weak->inode_scatterlocks) { + CInode *in = get_inode(p.first); + ceph_assert(in); + dout(10) << " including base inode (due to potential scatterlock update) " << *in << dendl; + acked_inodes.insert(in->vino()); + ack->add_inode_base(in, mds->mdsmap->get_up_features()); + } + + rejoin_scour_survivor_replicas(from, ack, acked_inodes, gather_locks); + mds->send_message(ack, weak->get_connection()); + + for (set<SimpleLock*>::iterator p = gather_locks.begin(); p != gather_locks.end(); ++p) { + if (!(*p)->is_stable()) + mds->locker->eval_gather(*p); + } + } else { + // done? + if (rejoin_gather.empty() && rejoin_ack_gather.count(mds->get_nodeid())) { + rejoin_gather_finish(); + } else { + dout(7) << "still need rejoin from (" << rejoin_gather << ")" << dendl; + } + } +} + +/* + * rejoin_scour_survivor_replica - remove source from replica list on unmentioned objects + * + * all validated replicas are acked with a strong nonce, etc. if that isn't in the + * ack, the replica dne, and we can remove it from our replica maps. + */ +void MDCache::rejoin_scour_survivor_replicas(mds_rank_t from, const MMDSCacheRejoin::const_ref &ack, + set<vinodeno_t>& acked_inodes, + set<SimpleLock *>& gather_locks) +{ + dout(10) << "rejoin_scour_survivor_replicas from mds." << from << dendl; + + auto scour_func = [this, from, ack, &acked_inodes, &gather_locks] (CInode *in) { + // inode? 
+ if (in->is_auth() && + in->is_replica(from) && + (ack == NULL || acked_inodes.count(in->vino()) == 0)) { + inode_remove_replica(in, from, false, gather_locks); + dout(10) << " rem " << *in << dendl; + } + + if (!in->is_dir()) + return; + + list<CDir*> dfs; + in->get_dirfrags(dfs); + for (list<CDir*>::iterator p = dfs.begin(); + p != dfs.end(); + ++p) { + CDir *dir = *p; + if (!dir->is_auth()) + continue; + + if (dir->is_replica(from) && + (ack == NULL || ack->strong_dirfrags.count(dir->dirfrag()) == 0)) { + dir->remove_replica(from); + dout(10) << " rem " << *dir << dendl; + } + + // dentries + for (auto &p : dir->items) { + CDentry *dn = p.second; + + if (dn->is_replica(from)) { + if (ack) { + const auto it = ack->strong_dentries.find(dir->dirfrag()); + if (it != ack->strong_dentries.end() && it->second.count(string_snap_t(dn->get_name(), dn->last)) > 0) { + continue; + } + } + dentry_remove_replica(dn, from, gather_locks); + dout(10) << " rem " << *dn << dendl; + } + } + } + }; + + for (auto &p : inode_map) + scour_func(p.second); + for (auto &p : snap_inode_map) + scour_func(p.second); +} + + +CInode *MDCache::rejoin_invent_inode(inodeno_t ino, snapid_t last) +{ + CInode *in = new CInode(this, true, 1, last); + in->inode.ino = ino; + in->state_set(CInode::STATE_REJOINUNDEF); + add_inode(in); + rejoin_undef_inodes.insert(in); + dout(10) << " invented " << *in << dendl; + return in; +} + +CDir *MDCache::rejoin_invent_dirfrag(dirfrag_t df) +{ + CInode *in = get_inode(df.ino); + if (!in) + in = rejoin_invent_inode(df.ino, CEPH_NOSNAP); + if (!in->is_dir()) { + ceph_assert(in->state_test(CInode::STATE_REJOINUNDEF)); + in->inode.mode = S_IFDIR; + in->inode.dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash; + } + CDir *dir = in->get_or_open_dirfrag(this, df.frag); + dir->state_set(CDir::STATE_REJOINUNDEF); + rejoin_undef_dirfrags.insert(dir); + dout(10) << " invented " << *dir << dendl; + return dir; +} + +void MDCache::handle_cache_rejoin_strong(const 
MMDSCacheRejoin::const_ref &strong) +{ + mds_rank_t from = mds_rank_t(strong->get_source().num()); + + // only a recovering node will get a strong rejoin. + if (!mds->is_rejoin()) { + if (mds->get_want_state() == MDSMap::STATE_REJOIN) { + mds->wait_for_rejoin(new C_MDS_RetryMessage(mds, strong)); + return; + } + ceph_abort_msg("got unexpected rejoin message during recovery"); + } + + // assimilate any potentially dirty scatterlock state + for (const auto &p : strong->inode_scatterlocks) { + CInode *in = get_inode(p.first); + ceph_assert(in); + in->decode_lock_state(CEPH_LOCK_IFILE, p.second.file); + in->decode_lock_state(CEPH_LOCK_INEST, p.second.nest); + in->decode_lock_state(CEPH_LOCK_IDFT, p.second.dft); + rejoin_potential_updated_scatterlocks.insert(in); + } + + rejoin_unlinked_inodes[from].clear(); + + // surviving peer may send incorrect dirfrag here (maybe they didn't + // get the fragment notify, or maybe we rolled back?). we need to + // infer the right frag and get them with the program. somehow. + // we don't normally send ACK.. so we'll need to bundle this with + // MISSING or something. + + // strong dirfrags/dentries. + // also process auth_pins, xlocks. 
+ for (const auto &p : strong->strong_dirfrags) { + auto& dirfrag = p.first; + CInode *diri = get_inode(dirfrag.ino); + if (!diri) + diri = rejoin_invent_inode(dirfrag.ino, CEPH_NOSNAP); + CDir *dir = diri->get_dirfrag(dirfrag.frag); + bool refragged = false; + if (dir) { + dout(10) << " have " << *dir << dendl; + } else { + if (diri->state_test(CInode::STATE_REJOINUNDEF)) + dir = rejoin_invent_dirfrag(dirfrag_t(diri->ino(), frag_t())); + else if (diri->dirfragtree.is_leaf(dirfrag.frag)) + dir = rejoin_invent_dirfrag(dirfrag); + } + if (dir) { + dir->add_replica(from, p.second.nonce); + dir->dir_rep = p.second.dir_rep; + } else { + dout(10) << " frag " << dirfrag << " doesn't match dirfragtree " << *diri << dendl; + frag_vec_t leaves; + diri->dirfragtree.get_leaves_under(dirfrag.frag, leaves); + if (leaves.empty()) + leaves.push_back(diri->dirfragtree[dirfrag.frag.value()]); + dout(10) << " maps to frag(s) " << leaves << dendl; + for (const auto& leaf : leaves) { + CDir *dir = diri->get_dirfrag(leaf); + if (!dir) + dir = rejoin_invent_dirfrag(dirfrag_t(diri->ino(), leaf)); + else + dout(10) << " have(approx) " << *dir << dendl; + dir->add_replica(from, p.second.nonce); + dir->dir_rep = p.second.dir_rep; + } + refragged = true; + } + + const auto it = strong->strong_dentries.find(dirfrag); + if (it != strong->strong_dentries.end()) { + const map<string_snap_t,MMDSCacheRejoin::dn_strong>& dmap = it->second; + for (const auto &q : dmap) { + const string_snap_t& ss = q.first; + const MMDSCacheRejoin::dn_strong& d = q.second; + CDentry *dn; + if (!refragged) + dn = dir->lookup(ss.name, ss.snapid); + else { + frag_t fg = diri->pick_dirfrag(ss.name); + dir = diri->get_dirfrag(fg); + ceph_assert(dir); + dn = dir->lookup(ss.name, ss.snapid); + } + if (!dn) { + if (d.is_remote()) { + dn = dir->add_remote_dentry(ss.name, d.remote_ino, d.remote_d_type, d.first, ss.snapid); + } else if (d.is_null()) { + dn = dir->add_null_dentry(ss.name, d.first, ss.snapid); + } else { + CInode 
*in = get_inode(d.ino, ss.snapid); + if (!in) in = rejoin_invent_inode(d.ino, ss.snapid); + dn = dir->add_primary_dentry(ss.name, in, d.first, ss.snapid); + } + dout(10) << " invented " << *dn << dendl; + } + CDentry::linkage_t *dnl = dn->get_linkage(); + + // dn auth_pin? + const auto pinned_it = strong->authpinned_dentries.find(dirfrag); + if (pinned_it != strong->authpinned_dentries.end()) { + const auto slave_reqid_it = pinned_it->second.find(ss); + if (slave_reqid_it != pinned_it->second.end()) { + for (const auto &r : slave_reqid_it->second) { + dout(10) << " dn authpin by " << r << " on " << *dn << dendl; + + // get/create slave mdrequest + MDRequestRef mdr; + if (have_request(r.reqid)) + mdr = request_get(r.reqid); + else + mdr = request_start_slave(r.reqid, r.attempt, strong); + mdr->auth_pin(dn); + } + } + } + + // dn xlock? + const auto xlocked_it = strong->xlocked_dentries.find(dirfrag); + if (xlocked_it != strong->xlocked_dentries.end()) { + const auto ss_req_it = xlocked_it->second.find(ss); + if (ss_req_it != xlocked_it->second.end()) { + const MMDSCacheRejoin::slave_reqid& r = ss_req_it->second; + dout(10) << " dn xlock by " << r << " on " << *dn << dendl; + MDRequestRef mdr = request_get(r.reqid); // should have this from auth_pin above. + ceph_assert(mdr->is_auth_pinned(dn)); + if (!mdr->is_xlocked(&dn->versionlock)) { + ceph_assert(dn->versionlock.can_xlock_local()); + dn->versionlock.get_xlock(mdr, mdr->get_client()); + mdr->locks.emplace(&dn->versionlock, MutationImpl::LockOp::XLOCK); + } + if (dn->lock.is_stable()) + dn->auth_pin(&dn->lock); + dn->lock.set_state(LOCK_XLOCK); + dn->lock.get_xlock(mdr, mdr->get_client()); + mdr->locks.emplace(&dn->lock, MutationImpl::LockOp::XLOCK); + } + } + + dn->add_replica(from, d.nonce); + dout(10) << " have " << *dn << dendl; + + if (dnl->is_primary()) { + if (d.is_primary()) { + if (vinodeno_t(d.ino, ss.snapid) != dnl->get_inode()->vino()) { + // the survivor missed MDentryUnlink+MDentryLink messages ? 
+ ceph_assert(strong->strong_inodes.count(dnl->get_inode()->vino()) == 0); + CInode *in = get_inode(d.ino, ss.snapid); + ceph_assert(in); + ceph_assert(in->get_parent_dn()); + rejoin_unlinked_inodes[from].insert(in); + dout(7) << " sender has primary dentry but wrong inode" << dendl; + } + } else { + // the survivor missed MDentryLink message ? + ceph_assert(strong->strong_inodes.count(dnl->get_inode()->vino()) == 0); + dout(7) << " sender doesn't have primay dentry" << dendl; + } + } else { + if (d.is_primary()) { + // the survivor missed MDentryUnlink message ? + CInode *in = get_inode(d.ino, ss.snapid); + ceph_assert(in); + ceph_assert(in->get_parent_dn()); + rejoin_unlinked_inodes[from].insert(in); + dout(7) << " sender has primary dentry but we don't" << dendl; + } + } + } + } + } + + for (const auto &p : strong->strong_inodes) { + CInode *in = get_inode(p.first); + ceph_assert(in); + in->add_replica(from, p.second.nonce); + dout(10) << " have " << *in << dendl; + + const MMDSCacheRejoin::inode_strong& is = p.second; + + // caps_wanted + if (is.caps_wanted) { + in->set_mds_caps_wanted(from, is.caps_wanted); + dout(15) << " inode caps_wanted " << ccap_string(is.caps_wanted) + << " on " << *in << dendl; + } + + // scatterlocks? + // infer state from replica state: + // * go to MIX if they might have wrlocks + // * go to LOCK if they are LOCK (just bc identify_files_to_recover might start twiddling filelock) + in->filelock.infer_state_from_strong_rejoin(is.filelock, !in->is_dir()); // maybe also go to LOCK + in->nestlock.infer_state_from_strong_rejoin(is.nestlock, false); + in->dirfragtreelock.infer_state_from_strong_rejoin(is.dftlock, false); + + // auth pin? 
+ const auto authpinned_inodes_it = strong->authpinned_inodes.find(in->vino()); + if (authpinned_inodes_it != strong->authpinned_inodes.end()) { + for (const auto& r : authpinned_inodes_it->second) { + dout(10) << " inode authpin by " << r << " on " << *in << dendl; + + // get/create slave mdrequest + MDRequestRef mdr; + if (have_request(r.reqid)) + mdr = request_get(r.reqid); + else + mdr = request_start_slave(r.reqid, r.attempt, strong); + if (strong->frozen_authpin_inodes.count(in->vino())) { + ceph_assert(!in->get_num_auth_pins()); + mdr->freeze_auth_pin(in); + } else { + ceph_assert(!in->is_frozen_auth_pin()); + } + mdr->auth_pin(in); + } + } + // xlock(s)? + const auto xlocked_inodes_it = strong->xlocked_inodes.find(in->vino()); + if (xlocked_inodes_it != strong->xlocked_inodes.end()) { + for (const auto &q : xlocked_inodes_it->second) { + SimpleLock *lock = in->get_lock(q.first); + dout(10) << " inode xlock by " << q.second << " on " << *lock << " on " << *in << dendl; + MDRequestRef mdr = request_get(q.second.reqid); // should have this from auth_pin above. + ceph_assert(mdr->is_auth_pinned(in)); + if (!mdr->is_xlocked(&in->versionlock)) { + ceph_assert(in->versionlock.can_xlock_local()); + in->versionlock.get_xlock(mdr, mdr->get_client()); + mdr->locks.emplace(&in->versionlock, MutationImpl::LockOp::XLOCK); + } + if (lock->is_stable()) + in->auth_pin(lock); + lock->set_state(LOCK_XLOCK); + if (lock == &in->filelock) + in->loner_cap = -1; + lock->get_xlock(mdr, mdr->get_client()); + mdr->locks.emplace(lock, MutationImpl::LockOp::XLOCK); + } + } + } + // wrlock(s)? + for (const auto &p : strong->wrlocked_inodes) { + CInode *in = get_inode(p.first); + for (const auto &q : p.second) { + SimpleLock *lock = in->get_lock(q.first); + for (const auto &r : q.second) { + dout(10) << " inode wrlock by " << r << " on " << *lock << " on " << *in << dendl; + MDRequestRef mdr = request_get(r.reqid); // should have this from auth_pin above. 
+ if (in->is_auth()) + ceph_assert(mdr->is_auth_pinned(in)); + lock->set_state(LOCK_MIX); + if (lock == &in->filelock) + in->loner_cap = -1; + lock->get_wrlock(true); + mdr->locks.emplace(lock, MutationImpl::LockOp::WRLOCK); + } + } + } + + // done? + ceph_assert(rejoin_gather.count(from)); + rejoin_gather.erase(from); + if (rejoin_gather.empty() && rejoin_ack_gather.count(mds->get_nodeid())) { + rejoin_gather_finish(); + } else { + dout(7) << "still need rejoin from (" << rejoin_gather << ")" << dendl; + } +} + +void MDCache::handle_cache_rejoin_ack(const MMDSCacheRejoin::const_ref &ack) +{ + dout(7) << "handle_cache_rejoin_ack from " << ack->get_source() << dendl; + mds_rank_t from = mds_rank_t(ack->get_source().num()); + + ceph_assert(mds->get_state() >= MDSMap::STATE_REJOIN); + bool survivor = !mds->is_rejoin(); + + // for sending cache expire message + set<CInode*> isolated_inodes; + set<CInode*> refragged_inodes; + list<pair<CInode*,int> > updated_realms; + + // dirs + for (const auto &p : ack->strong_dirfrags) { + // we may have had incorrect dir fragmentation; refragment based + // on what they auth tells us. + CDir *dir = get_dirfrag(p.first); + if (!dir) { + dir = get_force_dirfrag(p.first, false); + if (dir) + refragged_inodes.insert(dir->get_inode()); + } + if (!dir) { + CInode *diri = get_inode(p.first.ino); + if (!diri) { + // barebones inode; the full inode loop below will clean up. + diri = new CInode(this, false); + diri->inode.ino = p.first.ino; + diri->inode.mode = S_IFDIR; + diri->inode.dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash; + add_inode(diri); + if (MDS_INO_MDSDIR(from) == p.first.ino) { + diri->inode_auth = mds_authority_t(from, CDIR_AUTH_UNKNOWN); + dout(10) << " add inode " << *diri << dendl; + } else { + diri->inode_auth = CDIR_AUTH_DEFAULT; + isolated_inodes.insert(diri); + dout(10) << " unconnected dirfrag " << p.first << dendl; + } + } + // barebones dirfrag; the full dirfrag loop below will clean up. 
+ dir = diri->add_dirfrag(new CDir(diri, p.first.frag, this, false)); + if (MDS_INO_MDSDIR(from) == p.first.ino || + (dir->authority() != CDIR_AUTH_UNDEF && + dir->authority().first != from)) + adjust_subtree_auth(dir, from); + dout(10) << " add dirfrag " << *dir << dendl; + } + + dir->set_replica_nonce(p.second.nonce); + dir->state_clear(CDir::STATE_REJOINING); + dout(10) << " got " << *dir << dendl; + + // dentries + auto it = ack->strong_dentries.find(p.first); + if (it != ack->strong_dentries.end()) { + for (const auto &q : it->second) { + CDentry *dn = dir->lookup(q.first.name, q.first.snapid); + if(!dn) + dn = dir->add_null_dentry(q.first.name, q.second.first, q.first.snapid); + + CDentry::linkage_t *dnl = dn->get_linkage(); + + ceph_assert(dn->last == q.first.snapid); + if (dn->first != q.second.first) { + dout(10) << " adjust dn.first " << dn->first << " -> " << q.second.first << " on " << *dn << dendl; + dn->first = q.second.first; + } + + // may have bad linkage if we missed dentry link/unlink messages + if (dnl->is_primary()) { + CInode *in = dnl->get_inode(); + if (!q.second.is_primary() || + vinodeno_t(q.second.ino, q.first.snapid) != in->vino()) { + dout(10) << " had bad linkage for " << *dn << ", unlinking " << *in << dendl; + dir->unlink_inode(dn); + } + } else if (dnl->is_remote()) { + if (!q.second.is_remote() || + q.second.remote_ino != dnl->get_remote_ino() || + q.second.remote_d_type != dnl->get_remote_d_type()) { + dout(10) << " had bad linkage for " << *dn << dendl; + dir->unlink_inode(dn); + } + } else { + if (!q.second.is_null()) + dout(10) << " had bad linkage for " << *dn << dendl; + } + + // hmm, did we have the proper linkage here? 
+ if (dnl->is_null() && !q.second.is_null()) { + if (q.second.is_remote()) { + dn->dir->link_remote_inode(dn, q.second.remote_ino, q.second.remote_d_type); + } else { + CInode *in = get_inode(q.second.ino, q.first.snapid); + if (!in) { + // barebones inode; assume it's dir, the full inode loop below will clean up. + in = new CInode(this, false, q.second.first, q.first.snapid); + in->inode.ino = q.second.ino; + in->inode.mode = S_IFDIR; + in->inode.dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash; + add_inode(in); + dout(10) << " add inode " << *in << dendl; + } else if (in->get_parent_dn()) { + dout(10) << " had bad linkage for " << *(in->get_parent_dn()) + << ", unlinking " << *in << dendl; + in->get_parent_dir()->unlink_inode(in->get_parent_dn()); + } + dn->dir->link_primary_inode(dn, in); + isolated_inodes.erase(in); + } + } + + dn->set_replica_nonce(q.second.nonce); + dn->lock.set_state_rejoin(q.second.lock, rejoin_waiters, survivor); + dn->state_clear(CDentry::STATE_REJOINING); + dout(10) << " got " << *dn << dendl; + } + } + } + + for (set<CInode*>::iterator p = refragged_inodes.begin(); + p != refragged_inodes.end(); + ++p) { + list<CDir*> ls; + (*p)->get_nested_dirfrags(ls); + for (list<CDir*>::iterator q = ls.begin(); q != ls.end(); ++q) { + if ((*q)->is_auth() || ack->strong_dirfrags.count((*q)->dirfrag())) + continue; + ceph_assert((*q)->get_num_any() == 0); + (*p)->close_dirfrag((*q)->get_frag()); + } + } + + // full dirfrags + for (const auto &p : ack->dirfrag_bases) { + CDir *dir = get_dirfrag(p.first); + ceph_assert(dir); + auto q = p.second.cbegin(); + dir->_decode_base(q); + dout(10) << " got dir replica " << *dir << dendl; + } + + // full inodes + auto p = ack->inode_base.cbegin(); + while (!p.end()) { + inodeno_t ino; + snapid_t last; + bufferlist basebl; + decode(ino, p); + decode(last, p); + decode(basebl, p); + CInode *in = get_inode(ino, last); + ceph_assert(in); + auto q = basebl.cbegin(); + snapid_t sseq = 0; + if (in->snaprealm) + 
      sseq = in->snaprealm->srnode.seq;
+    in->_decode_base(q);
+    // If decoding the base bumped the snaprealm's seq, remember the inode so
+    // the realm change is propagated in the updated_realms loop further down.
+    // sseq == 0 means the realm did not exist before the decode (SPLIT),
+    // otherwise it changed in place (UPDATE).
+    if (in->snaprealm && in->snaprealm->srnode.seq != sseq) {
+      int snap_op = sseq > 0 ? CEPH_SNAP_OP_UPDATE : CEPH_SNAP_OP_SPLIT;
+      updated_realms.push_back(pair<CInode*,int>(in, snap_op));
+    }
+    dout(10) << " got inode base " << *in << dendl;
+  }
+
+  // inodes: replica nonces and lock states for inodes we replicate
+  p = ack->inode_locks.cbegin();
+  //dout(10) << "inode_locks len " << ack->inode_locks.length() << " is " << ack->inode_locks << dendl;
+  while (!p.end()) {
+    inodeno_t ino;
+    snapid_t last;
+    __u32 nonce;
+    bufferlist lockbl;
+    decode(ino, p);
+    decode(last, p);
+    decode(nonce, p);
+    decode(lockbl, p);
+
+    CInode *in = get_inode(ino, last);
+    ceph_assert(in);
+    in->set_replica_nonce(nonce);
+    auto q = lockbl.cbegin();
+    in->_decode_locks_rejoin(q, rejoin_waiters, rejoin_eval_locks, survivor);
+    in->state_clear(CInode::STATE_REJOINING);
+    dout(10) << " got inode locks " << *in << dendl;
+  }
+
+  // FIXME: This can happen if entire subtree, together with the inode subtree root
+  // belongs to, were trimmed between sending cache rejoin and receiving rejoin ack.
+  ceph_assert(isolated_inodes.empty());
+
+  // caps the auth imported on our behalf; tell the clients their caps moved.
+  map<inodeno_t,map<client_t,Capability::Import> > peer_imported;
+  auto bp = ack->imported_caps.cbegin();
+  decode(peer_imported, bp);
+
+  for (map<inodeno_t,map<client_t,Capability::Import> >::iterator p = peer_imported.begin();
+       p != peer_imported.end();
+       ++p) {
+    auto& ex = cap_exports.at(p->first);
+    ceph_assert(ex.first == from);
+    for (map<client_t,Capability::Import>::iterator q = p->second.begin();
+         q != p->second.end();
+         ++q) {
+      auto r = ex.second.find(q->first);
+      ceph_assert(r != ex.second.end());
+
+      dout(10) << " exporting caps for client." << q->first << " ino " << p->first << dendl;
+      Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
+      if (!session) {
+        // NOTE(review): p->first is the inode number; the client whose
+        // session is missing is q->first -- this log line probably meant
+        // q->first. Confirm before changing the message.
+        dout(10) << " no session for client." << p->first << dendl;
+        ex.second.erase(r);
+        continue;
+      }
+
+      // mark client caps stale.
+ auto m = MClientCaps::create(CEPH_CAP_OP_EXPORT, p->first, 0, + r->second.capinfo.cap_id, 0, + mds->get_osd_epoch_barrier()); + m->set_cap_peer(q->second.cap_id, q->second.issue_seq, q->second.mseq, + (q->second.cap_id > 0 ? from : -1), 0); + mds->send_message_client_counted(m, session); + + ex.second.erase(r); + } + ceph_assert(ex.second.empty()); + } + + for (auto p : updated_realms) { + CInode *in = p.first; + bool notify_clients; + if (mds->is_rejoin()) { + if (!rejoin_pending_snaprealms.count(in)) { + in->get(CInode::PIN_OPENINGSNAPPARENTS); + rejoin_pending_snaprealms.insert(in); + } + notify_clients = false; + } else { + // notify clients if I'm survivor + notify_clients = true; + } + do_realm_invalidate_and_update_notify(in, p.second, notify_clients); + } + + // done? + ceph_assert(rejoin_ack_gather.count(from)); + rejoin_ack_gather.erase(from); + if (!survivor) { + if (rejoin_gather.empty()) { + // eval unstable scatter locks after all wrlocks are rejoined. + while (!rejoin_eval_locks.empty()) { + SimpleLock *lock = rejoin_eval_locks.front(); + rejoin_eval_locks.pop_front(); + if (!lock->is_stable()) + mds->locker->eval_gather(lock); + } + } + + if (rejoin_gather.empty() && // make sure we've gotten our FULL inodes, too. + rejoin_ack_gather.empty()) { + // finally, kickstart past snap parent opens + open_snaprealms(); + } else { + dout(7) << "still need rejoin from (" << rejoin_gather << ")" + << ", rejoin_ack from (" << rejoin_ack_gather << ")" << dendl; + } + } else { + // survivor. + mds->queue_waiters(rejoin_waiters); + } +} + +/** + * rejoin_trim_undef_inodes() -- remove REJOINUNDEF flagged inodes + * + * FIXME: wait, can this actually happen? a survivor should generate cache trim + * messages that clean these guys up... 
+ */
+void MDCache::rejoin_trim_undef_inodes()
+{
+  dout(10) << "rejoin_trim_undef_inodes" << dendl;
+
+  while (!rejoin_undef_inodes.empty()) {
+    set<CInode*>::iterator p = rejoin_undef_inodes.begin();
+    CInode *in = *p;
+    rejoin_undef_inodes.erase(p);
+
+    in->clear_replica_map();
+
+    // close out dirfrags
+    if (in->is_dir()) {
+      list<CDir*> dfls;
+      in->get_dirfrags(dfls);
+      // note: the inner loop variables deliberately shadow the outer 'p'.
+      for (list<CDir*>::iterator p = dfls.begin();
+           p != dfls.end();
+           ++p) {
+        CDir *dir = *p;
+        dir->clear_replica_map();
+
+        // drop every dentry in the dirfrag before closing it
+        for (auto &p : dir->items) {
+          CDentry *dn = p.second;
+          dn->clear_replica_map();
+
+          dout(10) << " trimming " << *dn << dendl;
+          dir->remove_dentry(dn);
+        }
+
+        dout(10) << " trimming " << *dir << dendl;
+        in->close_dirfrag(dir->dirfrag().frag);
+      }
+    }
+
+    // unlink from the parent dentry if there is one; an unconnected inode
+    // can be removed from the cache directly.
+    CDentry *dn = in->get_parent_dn();
+    if (dn) {
+      dn->clear_replica_map();
+      dout(10) << " trimming " << *dn << dendl;
+      dn->dir->remove_dentry(dn);
+    } else {
+      dout(10) << " trimming " << *in << dendl;
+      remove_inode(in);
+    }
+  }
+
+  ceph_assert(rejoin_undef_inodes.empty());
+}
+
+// Invoked once rejoin_gather has drained while our own entry is still in
+// rejoin_ack_gather: first fetch undefined inodes/dirfrags and process
+// imported caps (each returns true when it has to wait and will re-enter
+// here itself), then choose lock states, identify files needing recovery,
+// and send rejoin acks to the recovery set.
+void MDCache::rejoin_gather_finish()
+{
+  dout(10) << "rejoin_gather_finish" << dendl;
+  ceph_assert(mds->is_rejoin());
+  ceph_assert(rejoin_ack_gather.count(mds->get_nodeid()));
+
+  if (open_undef_inodes_dirfrags())
+    return;
+
+  if (process_imported_caps())
+    return;
+
+  choose_lock_states_and_reconnect_caps();
+
+  identify_files_to_recover();
+  rejoin_send_acks();
+
+  // signal completion of fetches, rejoin_gather_finish, etc.
+  rejoin_ack_gather.erase(mds->get_nodeid());
+
+  // did we already get our acks too?
+  if (rejoin_ack_gather.empty()) {
+    // finally, open snaprealms
+    open_snaprealms();
+  }
+}
+
+// Completion context for the open_ino() calls issued by
+// process_imported_caps(): forwards to rejoin_open_ino_finish().
+class C_MDC_RejoinOpenInoFinish: public MDCacheContext {
+  inodeno_t ino;
+public:
+  C_MDC_RejoinOpenInoFinish(MDCache *c, inodeno_t i) : MDCacheContext(c), ino(i) {}
+  void finish(int r) override {
+    mdcache->rejoin_open_ino_finish(ino, r);
+  }
+};
+
+// Finish an open_ino() lookup for an ino we hold client cap imports for:
+// ret < 0 marks the ino missing; ret == our rank means the inode must now
+// be in cache; any other rank is auth, so re-export the pending caps there
+// and drop the local import record.
+// NOTE(review): the log tag "open_caps_inode_finish" does not match the
+// function name -- looks like a leftover from a rename.
+void MDCache::rejoin_open_ino_finish(inodeno_t ino, int ret)
+{
+  dout(10) << "open_caps_inode_finish ino " << ino << " ret " << ret << dendl;
+
+  if (ret < 0) {
+    cap_imports_missing.insert(ino);
+  } else if (ret == mds->get_nodeid()) {
+    ceph_assert(get_inode(ino));
+  } else {
+    auto p = cap_imports.find(ino);
+    ceph_assert(p != cap_imports.end());
+    for (auto q = p->second.begin(); q != p->second.end(); ++q) {
+      // imports not yet tied to a source rank are keyed MDS_RANK_NONE
+      ceph_assert(q->second.count(MDS_RANK_NONE));
+      ceph_assert(q->second.size() == 1);
+      rejoin_export_caps(p->first, q->first, q->second[MDS_RANK_NONE], ret);
+    }
+    cap_imports.erase(p);
+  }
+
+  ceph_assert(cap_imports_num_opening > 0);
+  cap_imports_num_opening--;
+
+  // once the last outstanding open_ino completes, resume rejoin processing
+  if (cap_imports_num_opening == 0) {
+    if (rejoin_gather.empty())
+      rejoin_gather_finish();
+    else if (rejoin_gather.count(mds->get_nodeid()))
+      process_imported_caps();
+  }
+}
+
+// Log context fired when the ESessions entry for force-opened client
+// sessions commits; hands the opened session map back to the cache.
+class C_MDC_RejoinSessionsOpened : public MDCacheLogContext {
+public:
+  map<client_t,pair<Session*,uint64_t> > session_map;
+  C_MDC_RejoinSessionsOpened(MDCache *c) : MDCacheLogContext(c) {}
+  void finish(int r) override {
+    ceph_assert(r == 0);
+    mdcache->rejoin_open_sessions_finish(session_map);
+  }
+};
+
+void MDCache::rejoin_open_sessions_finish(map<client_t,pair<Session*,uint64_t> >& session_map)
+{
+  dout(10) << "rejoin_open_sessions_finish" << dendl;
+  mds->server->finish_force_open_sessions(session_map);
+  rejoin_session_map.swap(session_map);
+  if (rejoin_gather.empty())
+    rejoin_gather_finish();
+}
+
+// Like rejoin_open_ino_finish() but for prefetch lookups: only acts if the
+// ino still has pending cap imports, and never touches the opening counter.
+void MDCache::rejoin_prefetch_ino_finish(inodeno_t ino, int ret)
+{
+  auto p = cap_imports.find(ino);
+  if (p != cap_imports.end()) {
+    dout(10) << __func__ << " ino " << ino << " ret " << ret << dendl;
+    if (ret < 0) {
+      cap_imports_missing.insert(ino);
+    } else if (ret != mds->get_nodeid()) {
+      for (auto q = p->second.begin(); q != p->second.end(); ++q) {
+        ceph_assert(q->second.count(MDS_RANK_NONE));
+        ceph_assert(q->second.size() == 1);
+        rejoin_export_caps(p->first, q->first, q->second[MDS_RANK_NONE], ret);
+      }
+      cap_imports.erase(p);
+    }
+  }
+}
+
+// Process reconnected client caps gathered during rejoin. Returns true
+// while it still has to wait (open file table prefetch in progress, or
+// missing inos being opened); returns false once processing is complete.
+bool MDCache::process_imported_caps()
+{
+  dout(10) << "process_imported_caps" << dendl;
+
+  // wait for the open file table prefetch before resolving cap imports
+  if (!open_file_table.is_prefetched() &&
+      open_file_table.prefetch_inodes()) {
+    open_file_table.wait_for_prefetch(
+      new MDSInternalContextWrapper(mds,
+        new FunctionContext([this](int r) {
+          ceph_assert(rejoin_gather.count(mds->get_nodeid()));
+          process_imported_caps();
+        })
+      )
+    );
+    return true;
+  }
+
+  // kick off open_ino() for any import whose inode is not in cache yet
+  for (auto p = cap_imports.begin(); p != cap_imports.end(); ++p) {
+    CInode *in = get_inode(p->first);
+    if (in) {
+      ceph_assert(in->is_auth());
+      cap_imports_missing.erase(p->first);
+      continue;
+    }
+    if (cap_imports_missing.count(p->first) > 0)
+      continue;
+
+    cap_imports_num_opening++;
+    dout(10) << " opening missing ino " << p->first << dendl;
+    open_ino(p->first, (int64_t)-1, new C_MDC_RejoinOpenInoFinish(this, p->first), false);
+    if (!(cap_imports_num_opening % 1000))
+      mds->heartbeat_reset();
+  }
+
+  if (cap_imports_num_opening > 0)
+    return true;
+
+  // called by rejoin_gather_finish() ?
+ if (rejoin_gather.count(mds->get_nodeid()) == 0) { + if (!rejoin_client_map.empty() && + rejoin_session_map.empty()) { + C_MDC_RejoinSessionsOpened *finish = new C_MDC_RejoinSessionsOpened(this); + version_t pv = mds->server->prepare_force_open_sessions(rejoin_client_map, + rejoin_client_metadata_map, + finish->session_map); + ESessions *le = new ESessions(pv, std::move(rejoin_client_map), + std::move(rejoin_client_metadata_map)); + mds->mdlog->start_submit_entry(le, finish); + mds->mdlog->flush(); + rejoin_client_map.clear(); + rejoin_client_metadata_map.clear(); + return true; + } + + // process caps that were exported by slave rename + for (map<inodeno_t,pair<mds_rank_t,map<client_t,Capability::Export> > >::iterator p = rejoin_slave_exports.begin(); + p != rejoin_slave_exports.end(); + ++p) { + CInode *in = get_inode(p->first); + ceph_assert(in); + for (map<client_t,Capability::Export>::iterator q = p->second.second.begin(); + q != p->second.second.end(); + ++q) { + auto r = rejoin_session_map.find(q->first); + if (r == rejoin_session_map.end()) + continue; + + Session *session = r->second.first; + Capability *cap = in->get_client_cap(q->first); + if (!cap) { + cap = in->add_client_cap(q->first, session); + // add empty item to reconnected_caps + (void)reconnected_caps[p->first][q->first]; + } + cap->merge(q->second, true); + + Capability::Import& im = rejoin_imported_caps[p->second.first][p->first][q->first]; + ceph_assert(cap->get_last_seq() == im.issue_seq); + ceph_assert(cap->get_mseq() == im.mseq); + cap->set_cap_id(im.cap_id); + // send cap import because we assigned a new cap ID + do_cap_import(session, in, cap, q->second.cap_id, q->second.seq, q->second.mseq - 1, + p->second.first, CEPH_CAP_FLAG_AUTH); + } + } + rejoin_slave_exports.clear(); + rejoin_imported_caps.clear(); + + // process cap imports + // ino -> client -> frommds -> capex + for (auto p = cap_imports.begin(); p != cap_imports.end(); ) { + CInode *in = get_inode(p->first); + if (!in) { + 
dout(10) << " still missing ino " << p->first + << ", will try again after replayed client requests" << dendl; + ++p; + continue; + } + ceph_assert(in->is_auth()); + for (auto q = p->second.begin(); q != p->second.end(); ++q) { + Session *session; + { + auto r = rejoin_session_map.find(q->first); + session = (r != rejoin_session_map.end() ? r->second.first : nullptr); + } + + for (auto r = q->second.begin(); r != q->second.end(); ++r) { + if (!session) { + if (r->first >= 0) + (void)rejoin_imported_caps[r->first][p->first][q->first]; // all are zero + continue; + } + + Capability *cap = in->reconnect_cap(q->first, r->second, session); + add_reconnected_cap(q->first, in->ino(), r->second); + if (r->first >= 0) { + if (cap->get_last_seq() == 0) // don't increase mseq if cap already exists + cap->inc_mseq(); + do_cap_import(session, in, cap, r->second.capinfo.cap_id, 0, 0, r->first, 0); + + Capability::Import& im = rejoin_imported_caps[r->first][p->first][q->first]; + im.cap_id = cap->get_cap_id(); + im.issue_seq = cap->get_last_seq(); + im.mseq = cap->get_mseq(); + } + } + } + cap_imports.erase(p++); // remove and move on + } + } else { + trim_non_auth(); + + ceph_assert(rejoin_gather.count(mds->get_nodeid())); + rejoin_gather.erase(mds->get_nodeid()); + ceph_assert(!rejoin_ack_gather.count(mds->get_nodeid())); + maybe_send_pending_rejoins(); + } + return false; +} + +void MDCache::rebuild_need_snapflush(CInode *head_in, SnapRealm *realm, + client_t client, snapid_t snap_follows) +{ + dout(10) << "rebuild_need_snapflush " << snap_follows << " on " << *head_in << dendl; + + if (!realm->has_snaps_in_range(snap_follows + 1, head_in->first - 1)) + return; + + const set<snapid_t>& snaps = realm->get_snaps(); + snapid_t follows = snap_follows; + + while (true) { + CInode *in = pick_inode_snap(head_in, follows); + if (in == head_in) + break; + + bool need_snapflush = false; + for (auto p = snaps.lower_bound(std::max<snapid_t>(in->first, (follows + 1))); + p != snaps.end() 
&& *p <= in->last; + ++p) { + head_in->add_need_snapflush(in, *p, client); + need_snapflush = true; + } + follows = in->last; + if (!need_snapflush) + continue; + + dout(10) << " need snapflush from client." << client << " on " << *in << dendl; + + if (in->client_snap_caps.empty()) { + for (int i = 0; i < num_cinode_locks; i++) { + int lockid = cinode_lock_info[i].lock; + SimpleLock *lock = in->get_lock(lockid); + ceph_assert(lock); + in->auth_pin(lock); + lock->set_state(LOCK_SNAP_SYNC); + lock->get_wrlock(true); + } + } + in->client_snap_caps.insert(client); + mds->locker->mark_need_snapflush_inode(in); + } +} + +/* + * choose lock states based on reconnected caps + */ +void MDCache::choose_lock_states_and_reconnect_caps() +{ + dout(10) << "choose_lock_states_and_reconnect_caps" << dendl; + + int count = 0; + for (auto p : inode_map) { + CInode *in = p.second; + if (in->last != CEPH_NOSNAP) + continue; + + if (in->is_auth() && !in->is_base() && in->inode.is_dirty_rstat()) + in->mark_dirty_rstat(); + + int dirty_caps = 0; + auto q = reconnected_caps.find(in->ino()); + if (q != reconnected_caps.end()) { + for (const auto &it : q->second) + dirty_caps |= it.second.dirty_caps; + } + in->choose_lock_states(dirty_caps); + dout(15) << " chose lock states on " << *in << dendl; + + if (in->snaprealm && !rejoin_pending_snaprealms.count(in)) { + in->get(CInode::PIN_OPENINGSNAPPARENTS); + rejoin_pending_snaprealms.insert(in); + } + + if (!(++count % 1000)) + mds->heartbeat_reset(); + } +} + +void MDCache::prepare_realm_split(SnapRealm *realm, client_t client, inodeno_t ino, + map<client_t,MClientSnap::ref>& splits) +{ + MClientSnap::ref snap; + auto it = splits.find(client); + if (it != splits.end()) { + snap = it->second; + snap->head.op = CEPH_SNAP_OP_SPLIT; + } else { + snap = MClientSnap::create(CEPH_SNAP_OP_SPLIT); + splits.emplace(std::piecewise_construct, std::forward_as_tuple(client), std::forward_as_tuple(snap)); + snap->head.split = realm->inode->ino(); + snap->bl 
= realm->get_snap_trace(); + + for (const auto& child : realm->open_children) + snap->split_realms.push_back(child->inode->ino()); + } + snap->split_inos.push_back(ino); +} + +void MDCache::prepare_realm_merge(SnapRealm *realm, SnapRealm *parent_realm, + map<client_t,MClientSnap::ref>& splits) +{ + ceph_assert(parent_realm); + + vector<inodeno_t> split_inos; + vector<inodeno_t> split_realms; + + for (elist<CInode*>::iterator p = realm->inodes_with_caps.begin(member_offset(CInode, item_caps)); + !p.end(); + ++p) + split_inos.push_back((*p)->ino()); + for (set<SnapRealm*>::iterator p = realm->open_children.begin(); + p != realm->open_children.end(); + ++p) + split_realms.push_back((*p)->inode->ino()); + + for (const auto& p : realm->client_caps) { + ceph_assert(!p.second->empty()); + auto em = splits.emplace(std::piecewise_construct, std::forward_as_tuple(p.first), std::forward_as_tuple()); + if (em.second) { + auto update = MClientSnap::create(CEPH_SNAP_OP_SPLIT); + update->head.split = parent_realm->inode->ino(); + update->split_inos = split_inos; + update->split_realms = split_realms; + update->bl = parent_realm->get_snap_trace(); + em.first->second = std::move(update); + } + } +} + +void MDCache::send_snaps(map<client_t,MClientSnap::ref>& splits) +{ + dout(10) << "send_snaps" << dendl; + + for (auto &p : splits) { + Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(p.first.v)); + if (session) { + dout(10) << " client." << p.first + << " split " << p.second->head.split + << " inos " << p.second->split_inos + << dendl; + mds->send_message_client_counted(p.second, session); + } else { + dout(10) << " no session for client." 
                 << p.first << dendl;
+    }
+  }
+  splits.clear();
+}
+
+
+/*
+ * remove any items from logsegment open_file lists that don't have
+ * any caps
+ */
+void MDCache::clean_open_file_lists()
+{
+  dout(10) << "clean_open_file_lists" << dendl;
+
+  for (map<uint64_t,LogSegment*>::iterator p = mds->mdlog->segments.begin();
+       p != mds->mdlog->segments.end();
+       ++p) {
+    LogSegment *ls = p->second;
+
+    elist<CInode*>::iterator q = ls->open_files.begin(member_offset(CInode, item_open_file));
+    while (!q.end()) {
+      CInode *in = *q;
+      ++q;  // advance before remove_myself() unlinks the current item
+      if (in->last == CEPH_NOSNAP) {
+        // NOTE(review): this unconditionally unlists every head inode,
+        // whereas dump_openfiles() below also checks is_any_caps_wanted();
+        // confirm a caps-wanted guard wasn't dropped here.
+        dout(10) << " unlisting unwanted/capless inode " << *in << dendl;
+        in->item_open_file.remove_myself();
+      } else {
+        if (in->client_snap_caps.empty()) {
+          dout(10) << " unlisting flushed snap inode " << *in << dendl;
+          in->item_open_file.remove_myself();
+        }
+      }
+    }
+  }
+}
+
+// Emit the per-segment open file lists (path, inode base, caps) as a
+// formatted "openfiles" array, skipping entries that would be unlisted by
+// clean_open_file_lists().
+void MDCache::dump_openfiles(Formatter *f)
+{
+  f->open_array_section("openfiles");
+  for (auto p = mds->mdlog->segments.begin();
+       p != mds->mdlog->segments.end();
+       ++p) {
+    LogSegment *ls = p->second;
+
+    auto q = ls->open_files.begin(member_offset(CInode, item_open_file));
+    while (!q.end()) {
+      CInode *in = *q;
+      ++q;
+      if ((in->last == CEPH_NOSNAP && !in->is_any_caps_wanted())
+          || (in->last != CEPH_NOSNAP && in->client_snap_caps.empty()))
+        continue;
+      f->open_object_section("file");
+      in->dump(f, CInode::DUMP_PATH | CInode::DUMP_INODE_STORE_BASE | CInode::DUMP_CAPS);
+      f->close_section();
+    }
+  }
+  f->close_section();
+}
+
+// Re-establish a client's cap on 'in' from reconnect data sent by another
+// MDS. Returns the cap, or NULL when the client has no session. When the
+// cap came from a peer rank (frommds >= 0) an import message is sent to
+// the client via do_cap_import().
+Capability* MDCache::rejoin_import_cap(CInode *in, client_t client, const cap_reconnect_t& icr, mds_rank_t frommds)
+{
+  dout(10) << "rejoin_import_cap for client." << client << " from mds." << frommds
+           << " on " << *in << dendl;
+  Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(client.v));
+  if (!session) {
+    dout(10) << " no session for client."
             << client << dendl;
+    return NULL;
+  }
+
+  Capability *cap = in->reconnect_cap(client, icr, session);
+
+  if (frommds >= 0) {
+    if (cap->get_last_seq() == 0) // don't increase mseq if cap already exists
+      cap->inc_mseq();
+    do_cap_import(session, in, cap, icr.capinfo.cap_id, 0, 0, frommds, 0);
+  }
+
+  return cap;
+}
+
+// Give up on any cap imports whose inodes never materialized: tell the
+// clients their caps are gone (stale EXPORT with no peer), wake anyone
+// waiting on a cap reconnect, and log a cluster warning listing the inos.
+void MDCache::export_remaining_imported_caps()
+{
+  dout(10) << "export_remaining_imported_caps" << dendl;
+
+  stringstream warn_str;
+
+  int count = 0;
+  for (auto p = cap_imports.begin(); p != cap_imports.end(); ++p) {
+    warn_str << " ino " << p->first << "\n";
+    for (auto q = p->second.begin(); q != p->second.end(); ++q) {
+      Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
+      if (session) {
+        // mark client caps stale.
+        auto stale = MClientCaps::create(CEPH_CAP_OP_EXPORT, p->first, 0, 0, 0, mds->get_osd_epoch_barrier());
+        stale->set_cap_peer(0, 0, 0, -1, 0);
+        mds->send_message_client_counted(stale, q->first);
+      }
+    }
+
+    // avoid tripping the heartbeat when there are many broken imports
+    if (!(++count % 1000))
+      mds->heartbeat_reset();
+  }
+
+  for (map<inodeno_t, MDSContext::vec >::iterator p = cap_reconnect_waiters.begin();
+       p != cap_reconnect_waiters.end();
+       ++p)
+    mds->queue_waiters(p->second);
+
+  cap_imports.clear();
+  cap_reconnect_waiters.clear();
+
+  // peek() != EOF <=> at least one ino was written into warn_str above
+  if (warn_str.peek() != EOF) {
+    mds->clog->warn() << "failed to reconnect caps for missing inodes:";
+    mds->clog->warn(warn_str);
+  }
+}
+
+// Re-establish this session's cap on 'in' from replayed reconnect data,
+// if any was recorded for (ino, client). Returns the cap or nullptr.
+Capability* MDCache::try_reconnect_cap(CInode *in, Session *session)
+{
+  client_t client = session->info.get_client();
+  Capability *cap = nullptr;
+  const cap_reconnect_t *rc = get_replay_cap_reconnect(in->ino(), client);
+  if (rc) {
+    cap = in->reconnect_cap(client, *rc, session);
+    dout(10) << "try_reconnect_cap client."
             << client
+             << " reconnect wanted " << ccap_string(rc->capinfo.wanted)
+             << " issue " << ccap_string(rc->capinfo.issued)
+             << " on " << *in << dendl;
+    remove_replay_cap_reconnect(in->ino(), client);
+
+    if (in->is_replicated()) {
+      mds->locker->try_eval(in, CEPH_CAP_LOCKS);
+    } else {
+      // unreplicated: pick lock states directly from the reconnected
+      // client's dirty caps (if any were recorded for this ino/client)
+      int dirty_caps = 0;
+      auto p = reconnected_caps.find(in->ino());
+      if (p != reconnected_caps.end()) {
+        auto q = p->second.find(client);
+        if (q != p->second.end())
+          dirty_caps = q->second.dirty_caps;
+      }
+      in->choose_lock_states(dirty_caps);
+      dout(15) << " chose lock states on " << *in << dendl;
+    }
+
+    // wake anyone who was waiting for this ino's cap to be reconnected
+    map<inodeno_t, MDSContext::vec >::iterator it =
+      cap_reconnect_waiters.find(in->ino());
+    if (it != cap_reconnect_waiters.end()) {
+      mds->queue_waiters(it->second);
+      cap_reconnect_waiters.erase(it);
+    }
+  }
+  return cap;
+}
+
+
+
+// -------
+// cap imports and delayed snap parent opens
+
+// Send the client a cap IMPORT for 'cap' on 'in', recording the peer cap
+// identity (p_cap_id/p_seq/p_mseq/peer/p_flags) so the client can match it
+// against the corresponding EXPORT. The snaprealm's past parents must
+// already be open -- callers guarantee this, hence the ceph_abort().
+void MDCache::do_cap_import(Session *session, CInode *in, Capability *cap,
+                            uint64_t p_cap_id, ceph_seq_t p_seq, ceph_seq_t p_mseq,
+                            int peer, int p_flags)
+{
+  SnapRealm *realm = in->find_snaprealm();
+  if (realm->have_past_parents_open()) {
+    dout(10) << "do_cap_import " << session->info.inst.name << " mseq " << cap->get_mseq() << " on " << *in << dendl;
+    if (cap->get_last_seq() == 0) // reconnected cap
+      cap->inc_last_seq();
+    cap->set_last_issue();
+    cap->set_last_issue_stamp(ceph_clock_now());
+    cap->clear_new();
+    auto reap = MClientCaps::create(CEPH_CAP_OP_IMPORT, in->ino(), realm->inode->ino(), cap->get_cap_id(), cap->get_last_seq(), cap->pending(), cap->wanted(), 0, cap->get_mseq(), mds->get_osd_epoch_barrier());
+    in->encode_cap_message(reap, cap);
+    reap->snapbl = realm->get_snap_trace();
+    reap->set_cap_peer(p_cap_id, p_seq, p_mseq, peer, p_flags);
+    mds->send_message_client_counted(reap, session);
+  } else {
+    ceph_abort();
+  }
+}
+
+// Nothing is ever deferred any more; just assert the delayed set is empty.
+void MDCache::do_delayed_cap_imports()
+{
+  dout(10) << "do_delayed_cap_imports" << dendl;
+
+  ceph_assert(delayed_imported_caps.empty());
+}
+
+// Retry context: re-enter open_snaprealms() once pending past-parent
+// realm fetches complete.
+struct C_MDC_OpenSnapRealms : public MDCacheContext {
+  explicit C_MDC_OpenSnapRealms(MDCache *c) : MDCacheContext(c) {}
+  void finish(int r) override {
+    mdcache->open_snaprealms();
+  }
+};
+
+void MDCache::open_snaprealms()
+{
+  dout(10) << "open_snaprealms" << dendl;
+
+  MDSGatherBuilder gather(g_ceph_context);
+
+  auto it = rejoin_pending_snaprealms.begin();
+  while (it != rejoin_pending_snaprealms.end()) {
+    CInode *in = *it;
+    SnapRealm *realm = in->snaprealm;
+    ceph_assert(realm);
+    // open_parents() returns immediately-true when everything is already
+    // in cache; otherwise it registers the gather sub and we retry later.
+    if (realm->have_past_parents_open() ||
+        realm->open_parents(gather.new_sub())) {
+      dout(10) << " past parents now open on " << *in << dendl;
+
+      map<client_t,MClientSnap::ref> splits;
+      // finish off client snaprealm reconnects?
+      map<inodeno_t,map<client_t,snapid_t> >::iterator q = reconnected_snaprealms.find(in->ino());
+      if (q != reconnected_snaprealms.end()) {
+        for (const auto& r : q->second)
+          finish_snaprealm_reconnect(r.first, realm, r.second, splits);
+        reconnected_snaprealms.erase(q);
+      }
+
+      // walk every cap-bearing inode in this realm and patch up snapflush
+      // state recorded at reconnect time
+      for (elist<CInode*>::iterator p = realm->inodes_with_caps.begin(member_offset(CInode, item_caps));
+           !p.end(); ++p) {
+        CInode *child = *p;
+        auto q = reconnected_caps.find(child->ino());
+        ceph_assert(q != reconnected_caps.end());
+        for (auto r = q->second.begin(); r != q->second.end(); ++r) {
+          Capability *cap = child->get_client_cap(r->first);
+          if (!cap)
+            continue;
+          if (r->second.snap_follows > 0) {
+            if (r->second.snap_follows < child->first - 1) {
+              rebuild_need_snapflush(child, realm, r->first, r->second.snap_follows);
+            } else if (r->second.snapflush) {
+              // When processing a cap flush message that is re-sent, it's possible
+              // that the sender has already released all WR caps. So we should
+              // force MDCache::cow_inode() to setup CInode::client_need_snapflush.
+              cap->mark_needsnapflush();
+            }
+          }
+          // make sure client's cap is in the correct snaprealm.
+ if (r->second.realm_ino != in->ino()) { + prepare_realm_split(realm, r->first, child->ino(), splits); + } + } + } + + rejoin_pending_snaprealms.erase(it++); + in->put(CInode::PIN_OPENINGSNAPPARENTS); + + send_snaps(splits); + } else { + dout(10) << " opening past parents on " << *in << dendl; + ++it; + } + } + + if (gather.has_subs()) { + if (gather.num_subs_remaining() == 0) { + // cleanup gather + gather.set_finisher(new C_MDSInternalNoop); + gather.activate(); + } else { + // for multimds, must succeed the first time + ceph_assert(recovery_set.empty()); + + dout(10) << "open_snaprealms - waiting for " + << gather.num_subs_remaining() << dendl; + gather.set_finisher(new C_MDC_OpenSnapRealms(this)); + gather.activate(); + return; + } + } + + notify_global_snaprealm_update(CEPH_SNAP_OP_UPDATE); + + if (!reconnected_snaprealms.empty()) { + dout(5) << "open_snaprealms has unconnected snaprealm:" << dendl; + for (auto& p : reconnected_snaprealms) { + stringstream warn_str; + warn_str << " " << p.first << " {"; + bool first = true; + for (auto& q : p.second) { + if (!first) + warn_str << ", "; + warn_str << "client." 
<< q.first << "/" << q.second; + } + warn_str << "}"; + dout(5) << warn_str.str() << dendl; + } + } + ceph_assert(rejoin_waiters.empty()); + ceph_assert(rejoin_pending_snaprealms.empty()); + dout(10) << "open_snaprealms - all open" << dendl; + do_delayed_cap_imports(); + + ceph_assert(rejoin_done); + rejoin_done.release()->complete(0); + reconnected_caps.clear(); +} + +bool MDCache::open_undef_inodes_dirfrags() +{ + dout(10) << "open_undef_inodes_dirfrags " + << rejoin_undef_inodes.size() << " inodes " + << rejoin_undef_dirfrags.size() << " dirfrags" << dendl; + + set<CDir*> fetch_queue = rejoin_undef_dirfrags; + + for (set<CInode*>::iterator p = rejoin_undef_inodes.begin(); + p != rejoin_undef_inodes.end(); + ++p) { + CInode *in = *p; + ceph_assert(!in->is_base()); + fetch_queue.insert(in->get_parent_dir()); + } + + if (fetch_queue.empty()) + return false; + + MDSGatherBuilder gather(g_ceph_context, + new MDSInternalContextWrapper(mds, + new FunctionContext([this](int r) { + if (rejoin_gather.empty()) + rejoin_gather_finish(); + }) + ) + ); + + for (set<CDir*>::iterator p = fetch_queue.begin(); + p != fetch_queue.end(); + ++p) { + CDir *dir = *p; + CInode *diri = dir->get_inode(); + if (diri->state_test(CInode::STATE_REJOINUNDEF)) + continue; + if (dir->state_test(CDir::STATE_REJOINUNDEF)) + ceph_assert(diri->dirfragtree.is_leaf(dir->get_frag())); + dir->fetch(gather.new_sub()); + } + ceph_assert(gather.has_subs()); + gather.activate(); + return true; +} + +void MDCache::opened_undef_inode(CInode *in) { + dout(10) << "opened_undef_inode " << *in << dendl; + rejoin_undef_inodes.erase(in); + if (in->is_dir()) { + // FIXME: re-hash dentries if necessary + ceph_assert(in->inode.dir_layout.dl_dir_hash == g_conf()->mds_default_dir_hash); + if (in->has_dirfrags() && !in->dirfragtree.is_leaf(frag_t())) { + CDir *dir = in->get_dirfrag(frag_t()); + ceph_assert(dir); + rejoin_undef_dirfrags.erase(dir); + in->force_dirfrags(); + list<CDir*> ls; + in->get_dirfrags(ls); + for 
(list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) + rejoin_undef_dirfrags.insert(*p); + } + } +} + +void MDCache::finish_snaprealm_reconnect(client_t client, SnapRealm *realm, snapid_t seq, + map<client_t,MClientSnap::ref>& updates) +{ + if (seq < realm->get_newest_seq()) { + dout(10) << "finish_snaprealm_reconnect client." << client << " has old seq " << seq << " < " + << realm->get_newest_seq() << " on " << *realm << dendl; + auto snap = MClientSnap::create(CEPH_SNAP_OP_UPDATE); + snap->bl = realm->get_snap_trace(); + for (const auto& child : realm->open_children) + snap->split_realms.push_back(child->inode->ino()); + updates.emplace(std::piecewise_construct, std::forward_as_tuple(client), std::forward_as_tuple(snap)); + } else { + dout(10) << "finish_snaprealm_reconnect client." << client << " up to date" + << " on " << *realm << dendl; + } +} + + + +void MDCache::rejoin_send_acks() +{ + dout(7) << "rejoin_send_acks" << dendl; + + // replicate stray + for (map<mds_rank_t, set<CInode*> >::iterator p = rejoin_unlinked_inodes.begin(); + p != rejoin_unlinked_inodes.end(); + ++p) { + for (set<CInode*>::iterator q = p->second.begin(); + q != p->second.end(); + ++q) { + CInode *in = *q; + dout(7) << " unlinked inode " << *in << dendl; + // inode expired + if (!in->is_replica(p->first)) + continue; + while (1) { + CDentry *dn = in->get_parent_dn(); + if (dn->is_replica(p->first)) + break; + dn->add_replica(p->first); + CDir *dir = dn->get_dir(); + if (dir->is_replica(p->first)) + break; + dir->add_replica(p->first); + in = dir->get_inode(); + if (in->is_replica(p->first)) + break; + in->add_replica(p->first); + if (in->is_base()) + break; + } + } + } + rejoin_unlinked_inodes.clear(); + + // send acks to everyone in the recovery set + map<mds_rank_t,MMDSCacheRejoin::ref> acks; + for (set<mds_rank_t>::iterator p = recovery_set.begin(); + p != recovery_set.end(); + ++p) { + if (rejoin_ack_sent.count(*p)) + continue; + acks[*p] = 
MMDSCacheRejoin::create(MMDSCacheRejoin::OP_ACK); + } + + rejoin_ack_sent = recovery_set; + + // walk subtrees + for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin(); + p != subtrees.end(); + ++p) { + CDir *dir = p->first; + if (!dir->is_auth()) + continue; + dout(10) << "subtree " << *dir << dendl; + + // auth items in this subtree + list<CDir*> dq; + dq.push_back(dir); + + while (!dq.empty()) { + CDir *dir = dq.front(); + dq.pop_front(); + + // dir + for (auto &r : dir->get_replicas()) { + auto it = acks.find(r.first); + if (it == acks.end()) + continue; + it->second->add_strong_dirfrag(dir->dirfrag(), ++r.second, dir->dir_rep); + it->second->add_dirfrag_base(dir); + } + + for (auto &p : dir->items) { + CDentry *dn = p.second; + CDentry::linkage_t *dnl = dn->get_linkage(); + + // inode + CInode *in = NULL; + if (dnl->is_primary()) + in = dnl->get_inode(); + + // dentry + for (auto &r : dn->get_replicas()) { + auto it = acks.find(r.first); + if (it == acks.end()) + continue; + it->second->add_strong_dentry(dir->dirfrag(), dn->get_name(), dn->first, dn->last, + dnl->is_primary() ? dnl->get_inode()->ino():inodeno_t(0), + dnl->is_remote() ? dnl->get_remote_ino():inodeno_t(0), + dnl->is_remote() ? dnl->get_remote_d_type():0, + ++r.second, + dn->lock.get_replica_state()); + // peer missed MDentrylink message ? + if (in && !in->is_replica(r.first)) + in->add_replica(r.first); + } + + if (!in) + continue; + + for (auto &r : in->get_replicas()) { + auto it = acks.find(r.first); + if (it == acks.end()) + continue; + it->second->add_inode_base(in, mds->mdsmap->get_up_features()); + bufferlist bl; + in->_encode_locks_state_for_rejoin(bl, r.first); + it->second->add_inode_locks(in, ++r.second, bl); + } + + // subdirs in this subtree? 
+ in->get_nested_dirfrags(dq); + } + } + } + + // base inodes too + if (root && root->is_auth()) + for (auto &r : root->get_replicas()) { + auto it = acks.find(r.first); + if (it == acks.end()) + continue; + it->second->add_inode_base(root, mds->mdsmap->get_up_features()); + bufferlist bl; + root->_encode_locks_state_for_rejoin(bl, r.first); + it->second->add_inode_locks(root, ++r.second, bl); + } + if (myin) + for (auto &r : myin->get_replicas()) { + auto it = acks.find(r.first); + if (it == acks.end()) + continue; + it->second->add_inode_base(myin, mds->mdsmap->get_up_features()); + bufferlist bl; + myin->_encode_locks_state_for_rejoin(bl, r.first); + it->second->add_inode_locks(myin, ++r.second, bl); + } + + // include inode base for any inodes whose scatterlocks may have updated + for (set<CInode*>::iterator p = rejoin_potential_updated_scatterlocks.begin(); + p != rejoin_potential_updated_scatterlocks.end(); + ++p) { + CInode *in = *p; + for (const auto &r : in->get_replicas()) { + auto it = acks.find(r.first); + if (it == acks.end()) + continue; + it->second->add_inode_base(in, mds->mdsmap->get_up_features()); + } + } + + // send acks + for (auto p = acks.begin(); p != acks.end(); ++p) { + encode(rejoin_imported_caps[p->first], p->second->imported_caps); + mds->send_message_mds(p->second, p->first); + } + + rejoin_imported_caps.clear(); +} + +class C_MDC_ReIssueCaps : public MDCacheContext { + CInode *in; +public: + C_MDC_ReIssueCaps(MDCache *mdc, CInode *i) : + MDCacheContext(mdc), in(i) + { + in->get(CInode::PIN_PTRWAITER); + } + void finish(int r) override { + if (!mdcache->mds->locker->eval(in, CEPH_CAP_LOCKS)) + mdcache->mds->locker->issue_caps(in); + in->put(CInode::PIN_PTRWAITER); + } +}; + +void MDCache::reissue_all_caps() +{ + dout(10) << "reissue_all_caps" << dendl; + + int count = 0; + for (auto &p : inode_map) { + int n = 1; + CInode *in = p.second; + if (in->is_head() && in->is_any_caps()) { + // called by MDSRank::active_start(). 
There shouldn't be any frozen subtree. + if (in->is_frozen_inode()) { + in->add_waiter(CInode::WAIT_UNFREEZE, new C_MDC_ReIssueCaps(this, in)); + continue; + } + if (!mds->locker->eval(in, CEPH_CAP_LOCKS)) + n += mds->locker->issue_caps(in); + } + + if ((count % 1000) + n >= 1000) + mds->heartbeat_reset(); + count += n; + } +} + + +// =============================================================================== + +struct C_MDC_QueuedCow : public MDCacheContext { + CInode *in; + MutationRef mut; + C_MDC_QueuedCow(MDCache *mdc, CInode *i, MutationRef& m) : + MDCacheContext(mdc), in(i), mut(m) {} + void finish(int r) override { + mdcache->_queued_file_recover_cow(in, mut); + } +}; + + +void MDCache::queue_file_recover(CInode *in) +{ + dout(10) << "queue_file_recover " << *in << dendl; + ceph_assert(in->is_auth()); + + // cow? + /* + SnapRealm *realm = in->find_snaprealm(); + set<snapid_t> s = realm->get_snaps(); + while (!s.empty() && *s.begin() < in->first) + s.erase(s.begin()); + while (!s.empty() && *s.rbegin() > in->last) + s.erase(*s.rbegin()); + dout(10) << " snaps in [" << in->first << "," << in->last << "] are " << s << dendl; + if (s.size() > 1) { + CInode::mempool_inode pi = in->project_inode(); + pi->version = in->pre_dirty(); + + auto mut(std::make_shared<MutationImpl>()); + mut->ls = mds->mdlog->get_current_segment(); + EUpdate *le = new EUpdate(mds->mdlog, "queue_file_recover cow"); + mds->mdlog->start_entry(le); + predirty_journal_parents(mut, &le->metablob, in, 0, PREDIRTY_PRIMARY); + + s.erase(*s.begin()); + while (!s.empty()) { + snapid_t snapid = *s.begin(); + CInode *cow_inode = 0; + journal_cow_inode(mut, &le->metablob, in, snapid-1, &cow_inode); + ceph_assert(cow_inode); + recovery_queue.enqueue(cow_inode); + s.erase(*s.begin()); + } + + in->parent->first = in->first; + le->metablob.add_primary_dentry(in->parent, in, true); + mds->mdlog->submit_entry(le, new C_MDC_QueuedCow(this, in, mut)); + mds->mdlog->flush(); + } + */ + + 
  recovery_queue.enqueue(in);
}

// Finish a queued copy-on-write for file recovery: apply the projected
// inode and the mutation, then drop locks and clean up.
// NOTE(review): only reachable via C_MDC_QueuedCow, which is created inside
// the commented-out cow path of queue_file_recover() above.
void MDCache::_queued_file_recover_cow(CInode *in, MutationRef& mut)
{
  in->pop_and_dirty_projected_inode(mut->ls);
  mut->apply();
  mds->locker->drop_locks(mut.get());
  mut->cleanup();
}


/*
 * called after recovery to recover file sizes for previously opened (for write)
 * files. that is, those where max_size > size.
 */
void MDCache::identify_files_to_recover()
{
  dout(10) << "identify_files_to_recover" << dendl;
  int count = 0;
  // Scan every auth, head (last == CEPH_NOSNAP), regular-file inode and
  // sort it into either rejoin_recover_q (needs size recovery) or
  // rejoin_check_q (just needs a max_size re-check).
  for (auto &p : inode_map) {
    CInode *in = p.second;
    if (!in->is_auth())
      continue;

    if (in->last != CEPH_NOSNAP)
      continue;

    // Only normal files need file size recovery
    if (!in->is_file()) {
      continue;
    }

    // Recovery is needed if some client holds a writeable range on this
    // file but we no longer hold a cap for that client: the client may
    // have written within that range before the failure.
    bool recover = false;
    for (map<client_t,client_writeable_range_t>::iterator p = in->inode.client_ranges.begin();
         p != in->inode.client_ranges.end();
         ++p) {
      Capability *cap = in->get_client_cap(p->first);
      if (cap) {
        cap->mark_clientwriteable();
      } else {
        dout(10) << " client." << p->first << " has range " << p->second << " but no cap on " << *in << dendl;
        recover = true;
        break;
      }
    }

    if (recover) {
      if (in->filelock.is_stable()) {
        in->auth_pin(&in->filelock);
      } else {
        // only a snap xlock is expected here; anything else is a bug
        ceph_assert(in->filelock.get_state() == LOCK_XLOCKSNAP);
      }
      in->filelock.set_state(LOCK_PRE_SCAN);
      rejoin_recover_q.push_back(in);
    } else {
      rejoin_check_q.push_back(in);
    }

    // reset the heartbeat periodically so a huge cache scan does not
    // trigger an mds health timeout
    if (!(++count % 1000))
      mds->heartbeat_reset();
  }
}

// Process the queues built by identify_files_to_recover(): re-issue caps /
// re-check max_size for files that kept their caps, and start filelock
// recovery (then the recovery queue) for the rest.
void MDCache::start_files_to_recover()
{
  for (CInode *in : rejoin_check_q) {
    if (in->filelock.get_state() == LOCK_XLOCKSNAP)
      mds->locker->issue_caps(in);
    mds->locker->check_inode_max_size(in);
  }
  rejoin_check_q.clear();
  for (CInode *in : rejoin_recover_q) {
    mds->locker->file_recover(&in->filelock);
  }
  if (!rejoin_recover_q.empty()) {
    rejoin_recover_q.clear();
    do_file_recover();
  }
}

// Advance the file recovery queue.
void MDCache::do_file_recover()
{
  recovery_queue.advance();
}

// ===============================================================================


// ----------------------------
// truncate

// Context to retry _truncate_inode() once the condition that blocked it
// (pending snapflush data, see truncate_inode) has been resolved.
class C_MDC_RetryTruncate : public MDCacheContext {
  CInode *in;
  LogSegment *ls;
public:
  C_MDC_RetryTruncate(MDCache *c, CInode *i, LogSegment *l) :
    MDCacheContext(c), in(i), ls(l) {}
  void finish(int r) override {
    mdcache->_truncate_inode(in, ls);
  }
};

// Begin truncating an inode: record it in the log segment's truncating set,
// pin it, and either issue the object-store truncate now or — if clients
// still have buffered snap data to flush — defer until the snapflush
// completes.
void MDCache::truncate_inode(CInode *in, LogSegment *ls)
{
  auto pi = in->get_projected_inode();
  dout(10) << "truncate_inode "
           << pi->truncate_from << " -> " << pi->truncate_size
           << " on " << *in
           << dendl;

  // pinned until truncate_inode_logged() drops these
  ls->truncating_inodes.insert(in);
  in->get(CInode::PIN_TRUNCATING);
  in->auth_pin(this);

  if (!in->client_need_snapflush.empty() &&
      (in->get_caps_issued() & CEPH_CAP_FILE_BUFFER)) {
    // wait for the buffered snap data to be flushed before truncating
    ceph_assert(in->filelock.is_xlocked());
    in->filelock.set_xlock_snap_sync(new C_MDC_RetryTruncate(this, in, ls));
    mds->locker->issue_caps(in);
    return;
  }

  _truncate_inode(in, ls);
}

struct C_IO_MDC_TruncateFinish : public
// I/O completion for the object-store truncate issued by _truncate_inode();
// bounces back into truncate_inode_finish() on the MDS finisher.
MDCacheIOContext {
  CInode *in;
  LogSegment *ls;
  C_IO_MDC_TruncateFinish(MDCache *c, CInode *i, LogSegment *l) :
    MDCacheIOContext(c, false), in(i), ls(l) {
  }
  void finish(int r) override {
    // -ENOENT is fine: the objects being truncated may not exist
    ceph_assert(r == 0 || r == -ENOENT);
    mdcache->truncate_inode_finish(in, ls);
  }
  void print(ostream& out) const override {
    out << "file_truncate(" << in->ino() << ")";
  }
};

// Issue the actual object-store truncate for an inode already registered by
// truncate_inode(); completion continues in truncate_inode_finish().
void MDCache::_truncate_inode(CInode *in, LogSegment *ls)
{
  auto pi = &in->inode;
  dout(10) << "_truncate_inode "
           << pi->truncate_from << " -> " << pi->truncate_size
           << " on " << *in << dendl;

  // sanity: a truncate must be pending and shrink the file
  ceph_assert(pi->is_truncating());
  ceph_assert(pi->truncate_size < (1ULL << 63));
  ceph_assert(pi->truncate_from < (1ULL << 63));
  ceph_assert(pi->truncate_size < pi->truncate_from);


  // use the inode's snap context so truncated extents are preserved in
  // snapshots; fall back to a null context when there is no realm
  SnapRealm *realm = in->find_snaprealm();
  SnapContext nullsnap;
  const SnapContext *snapc;
  if (realm) {
    dout(10) << " realm " << *realm << dendl;
    snapc = &realm->get_snap_context();
  } else {
    dout(10) << " NO realm, using null context" << dendl;
    snapc = &nullsnap;
    ceph_assert(in->last == CEPH_NOSNAP);
  }
  dout(10) << "_truncate_inode snapc " << snapc << " on " << *in << dendl;
  filer.truncate(in->inode.ino, &in->inode.layout, *snapc,
                 pi->truncate_size, pi->truncate_from-pi->truncate_size,
                 pi->truncate_seq, ceph::real_time::min(), 0,
                 new C_OnFinisher(new C_IO_MDC_TruncateFinish(this, in, ls),
                                  mds->finisher));
}

// Log completion for the "truncate finish" journal entry; continues in
// truncate_inode_logged().
struct C_MDC_TruncateLogged : public MDCacheLogContext {
  CInode *in;
  MutationRef mut;
  C_MDC_TruncateLogged(MDCache *m, CInode *i, MutationRef& mu) :
    MDCacheLogContext(m), in(i), mut(mu) {}
  void finish(int r) override {
    mdcache->truncate_inode_logged(in, mut);
  }
};

// Object-store truncate completed: drop the inode from the segment's
// truncating set, clear the pending-truncate state in a projected inode,
// and journal a "truncate finish" EUpdate.
void MDCache::truncate_inode_finish(CInode *in, LogSegment *ls)
{
  dout(10) << "truncate_inode_finish " << *in << dendl;

  set<CInode*>::iterator p = ls->truncating_inodes.find(in);
  ceph_assert(p != ls->truncating_inodes.end());
  ls->truncating_inodes.erase(p);

  // update
  auto &pi = in->project_inode();
  pi.inode.version = in->pre_dirty();
  pi.inode.truncate_from = 0;
  pi.inode.truncate_pending--;

  MutationRef mut(new MutationImpl());
  mut->ls = mds->mdlog->get_current_segment();
  mut->add_projected_inode(in);

  EUpdate *le = new EUpdate(mds->mdlog, "truncate finish");
  mds->mdlog->start_entry(le);
  CDentry *dn = in->get_projected_parent_dn();
  le->metablob.add_dir_context(dn->get_dir());
  le->metablob.add_primary_dentry(dn, in, true);
  le->metablob.add_truncate_finish(in->ino(), ls->seq);

  journal_dirty_inode(mut.get(), &le->metablob, in);
  mds->mdlog->submit_entry(le, new C_MDC_TruncateLogged(this, in, mut));

  // flush immediately if there are readers/writers waiting
  if (in->is_waiter_for(CInode::WAIT_TRUNC) ||
      (in->get_caps_wanted() & (CEPH_CAP_FILE_RD|CEPH_CAP_FILE_WR)))
    mds->mdlog->flush();
}

// "truncate finish" was journaled: apply the mutation, drop the pins taken
// in truncate_inode(), and wake anyone waiting on the truncate.
void MDCache::truncate_inode_logged(CInode *in, MutationRef& mut)
{
  dout(10) << "truncate_inode_logged " << *in << dendl;
  mut->apply();
  mds->locker->drop_locks(mut.get());
  mut->cleanup();

  in->put(CInode::PIN_TRUNCATING);
  in->auth_unpin(this);

  MDSContext::vec waiters;
  in->take_waiting(CInode::WAIT_TRUNC, waiters);
  mds->queue_waiters(waiters);
}


// Re-register (after journal replay) an in-progress truncate with its log
// segment and pin the inode until it completes.
void MDCache::add_recovered_truncate(CInode *in, LogSegment *ls)
{
  dout(20) << "add_recovered_truncate " << *in << " in log segment "
           << ls->seq << "/" << ls->offset << dendl;
  ls->truncating_inodes.insert(in);
  in->get(CInode::PIN_TRUNCATING);
}

// Undo add_recovered_truncate() when replay shows the truncate finished.
void MDCache::remove_recovered_truncate(CInode *in, LogSegment *ls)
{
  dout(20) << "remove_recovered_truncate " << *in << " in log segment "
           << ls->seq << "/" << ls->offset << dendl;
  // if we have the logseg the truncate started in, it must be in our list.
  set<CInode*>::iterator p = ls->truncating_inodes.find(in);
  ceph_assert(p != ls->truncating_inodes.end());
  ls->truncating_inodes.erase(p);
  in->put(CInode::PIN_TRUNCATING);
}

// After replay, resume every truncate recorded in the log segments'
// truncating sets (deferring, as truncate_inode() does, any inode that
// still has buffered snap data to flush).
void MDCache::start_recovered_truncates()
{
  dout(10) << "start_recovered_truncates" << dendl;
  for (map<uint64_t,LogSegment*>::iterator p = mds->mdlog->segments.begin();
       p != mds->mdlog->segments.end();
       ++p) {
    LogSegment *ls = p->second;
    for (set<CInode*>::iterator q = ls->truncating_inodes.begin();
         q != ls->truncating_inodes.end();
         ++q) {
      CInode *in = *q;
      in->auth_pin(this);

      if (!in->client_need_snapflush.empty() &&
          (in->get_caps_issued() & CEPH_CAP_FILE_BUFFER)) {
        // defer until the snapflush is written back
        ceph_assert(in->filelock.is_stable());
        in->filelock.set_state(LOCK_XLOCKDONE);
        in->auth_pin(&in->filelock);
        in->filelock.set_xlock_snap_sync(new C_MDC_RetryTruncate(this, in, ls));
        // start_files_to_recover will revoke caps
        continue;
      }
      _truncate_inode(in, ls);
    }
  }
}


// ================================================================================
// cache trimming

// Expire up to `count` dentries from the bottom LRU and the main LRU,
// collecting expire messages for replicas in `expiremap`.
// Returns {throttled, trimmed}: `throttled` is true if the
// mds_cache_trim_threshold rate limit stopped us early, `trimmed` is the
// number of dentries actually expired.
std::pair<bool, uint64_t> MDCache::trim_lru(uint64_t count, expiremap& expiremap)
{
  bool is_standby_replay = mds->is_standby_replay();
  std::vector<CDentry *> unexpirables;
  uint64_t trimmed = 0;

  auto trim_threshold = g_conf().get_val<Option::size_t>("mds_cache_trim_threshold");

  dout(7) << "trim_lru trimming " << count
          << " items from LRU"
          << " size=" << lru.lru_get_size()
          << " mid=" << lru.lru_get_top()
          << " pintail=" << lru.lru_get_pintail()
          << " pinned=" << lru.lru_get_num_pinned()
          << dendl;

  const uint64_t trim_counter_start = trim_counter.get();
  bool throttled = false;
  // first drain the bottom LRU entirely (subject to the rate limit);
  // trim_dentry() returning true means the dentry could not be expired
  while (1) {
    throttled |= trim_counter_start+trimmed >= trim_threshold;
    if (throttled) break;
    CDentry *dn = static_cast<CDentry*>(bottom_lru.lru_expire());
    if (!dn)
      break;
    if (trim_dentry(dn, expiremap)) {
      unexpirables.push_back(dn);
    } else {
      trimmed++;
    }
  }

  // put unexpirable dentries back where they came from
  for (auto &dn : unexpirables) {
    bottom_lru.lru_insert_mid(dn);
  }
  unexpirables.clear();

  // trim dentries from the LRU until count is reached
  // if mds is in standbyreplay and will trim all inodes which aren't in segments
  while (!throttled && (cache_toofull() || count > 0 || is_standby_replay)) {
    throttled |= trim_counter_start+trimmed >= trim_threshold;
    if (throttled) break;
    CDentry *dn = static_cast<CDentry*>(lru.lru_expire());
    if (!dn) {
      break;
    }
    if ((is_standby_replay && dn->get_linkage()->inode &&
        dn->get_linkage()->inode->item_open_file.is_on_list())) {
      // we move the inodes that need to be trimmed to the end of the lru queue.
      // refer to MDCache::standby_trim_segment
      lru.lru_insert_bot(dn);
      break;
    } else if (trim_dentry(dn, expiremap)) {
      unexpirables.push_back(dn);
    } else {
      trimmed++;
      if (count > 0) count--;
    }
  }
  trim_counter.hit(trimmed);

  for (auto &dn : unexpirables) {
    lru.lru_insert_mid(dn);
  }
  unexpirables.clear();

  dout(7) << "trim_lru trimmed " << trimmed << " items" << dendl;
  return std::pair<bool, uint64_t>(throttled, trimmed);
}

/*
 * note: only called while MDS is active or stopping... NOT during recovery.
 * however, we may expire a replica whose authority is recovering.
+ * + * @param count is number of dentries to try to expire + */ +std::pair<bool, uint64_t> MDCache::trim(uint64_t count) +{ + uint64_t used = cache_size(); + uint64_t limit = cache_memory_limit; + expiremap expiremap; + + dout(7) << "trim bytes_used=" << bytes2str(used) + << " limit=" << bytes2str(limit) + << " reservation=" << cache_reservation + << "% count=" << count << dendl; + + // process delayed eval_stray() + stray_manager.advance_delayed(); + + auto result = trim_lru(count, expiremap); + auto& trimmed = result.second; + + // trim non-auth, non-bound subtrees + for (auto p = subtrees.begin(); p != subtrees.end();) { + CDir *dir = p->first; + ++p; + CInode *diri = dir->get_inode(); + if (dir->is_auth()) { + if (!diri->is_auth() && !diri->is_base() && + dir->get_num_head_items() == 0) { + if (dir->state_test(CDir::STATE_EXPORTING) || + !(mds->is_active() || mds->is_stopping()) || + dir->is_freezing() || dir->is_frozen()) + continue; + + migrator->export_empty_import(dir); + ++trimmed; + } + } else { + if (!diri->is_auth()) { + if (dir->get_num_ref() > 1) // only subtree pin + continue; + if (diri->get_num_ref() > diri->get_num_subtree_roots()) + continue; + + // don't trim subtree root if its auth MDS is recovering. + // This simplify the cache rejoin code. + if (dir->is_subtree_root() && + rejoin_ack_gather.count(dir->get_dir_auth().first)) + continue; + trim_dirfrag(dir, 0, expiremap); + ++trimmed; + } + } + } + + // trim root? 
+ if (mds->is_stopping() && root) { + list<CDir*> ls; + root->get_dirfrags(ls); + for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) { + CDir *dir = *p; + if (dir->get_num_ref() == 1) { // subtree pin + trim_dirfrag(dir, 0, expiremap); + ++trimmed; + } + } + if (root->get_num_ref() == 0) { + trim_inode(0, root, 0, expiremap); + ++trimmed; + } + } + + std::set<mds_rank_t> stopping; + mds->mdsmap->get_mds_set(stopping, MDSMap::STATE_STOPPING); + stopping.erase(mds->get_nodeid()); + for (auto rank : stopping) { + CInode* mdsdir_in = get_inode(MDS_INO_MDSDIR(rank)); + if (!mdsdir_in) + continue; + + auto em = expiremap.emplace(std::piecewise_construct, std::forward_as_tuple(rank), std::forward_as_tuple()); + if (em.second) { + em.first->second = MCacheExpire::create(mds->get_nodeid()); + } + + dout(20) << __func__ << ": try expiring " << *mdsdir_in << " for stopping mds." << mds << dendl; + + const bool aborted = expire_recursive(mdsdir_in, expiremap); + if (!aborted) { + dout(20) << __func__ << ": successfully expired mdsdir" << dendl; + list<CDir*> ls; + mdsdir_in->get_dirfrags(ls); + for (auto dir : ls) { + if (dir->get_num_ref() == 1) { // subtree pin + trim_dirfrag(dir, dir, expiremap); + ++trimmed; + } + } + if (mdsdir_in->get_num_ref() == 0) { + trim_inode(NULL, mdsdir_in, NULL, expiremap); + ++trimmed; + } + } else { + dout(20) << __func__ << ": some unexpirable contents in mdsdir" << dendl; + } + } + + // Other rank's base inodes (when I'm stopping) + if (mds->is_stopping()) { + for (set<CInode*>::iterator p = base_inodes.begin(); + p != base_inodes.end();) { + CInode *base_in = *p; + ++p; + if (MDS_INO_IS_MDSDIR(base_in->ino()) && + MDS_INO_MDSDIR_OWNER(base_in->ino()) != mds->get_nodeid()) { + dout(20) << __func__ << ": maybe trimming base: " << *base_in << dendl; + if (base_in->get_num_ref() == 0) { + trim_inode(NULL, base_in, NULL, expiremap); + ++trimmed; + } + } + } + } + + // send any expire messages + send_expire_messages(expiremap); + + 
  return result;
}

// Send the MCacheExpire messages accumulated during trimming, skipping
// ranks that are still rejoining and to which we have not yet sent our
// cache rejoin (they would not understand the expire yet).
void MDCache::send_expire_messages(expiremap& expiremap)
{
  // send expires
  for (const auto &p : expiremap) {
    if (mds->is_cluster_degraded() &&
        (mds->mdsmap->get_state(p.first) < MDSMap::STATE_REJOIN ||
         (mds->mdsmap->get_state(p.first) == MDSMap::STATE_REJOIN &&
          rejoin_sent.count(p.first) == 0))) {
      continue;
    }
    dout(7) << "sending cache_expire to " << p.first << dendl;
    mds->send_message_mds(p.second, p.first);
  }
  expiremap.clear();
}


// Try to expire one dentry (and, for a primary link, its inode) from the
// cache, queueing expire notifications to the authority in `expiremap`.
// Returns true if the dentry could NOT be trimmed (caller should put it
// back on the LRU), false if it was removed.
bool MDCache::trim_dentry(CDentry *dn, expiremap& expiremap)
{
  dout(12) << "trim_dentry " << *dn << dendl;

  CDentry::linkage_t *dnl = dn->get_linkage();

  CDir *dir = dn->get_dir();
  ceph_assert(dir);

  CDir *con = get_subtree_root(dir);
  if (con)
    dout(12) << " in container " << *con << dendl;
  else {
    dout(12) << " no container; under a not-yet-linked dir" << dendl;
    ceph_assert(dn->is_auth());
  }

  // If replica dentry is not readable, it's likely we will receive
  // MDentryLink/MDentryUnlink message soon (It's possible we first
  // receive a MDentryUnlink message, then MDentryLink message)
  // MDentryLink message only replicates an inode, so we should
  // avoid trimming the inode's parent dentry. This is because that
  // unconnected replicas are problematic for subtree migration.
  if (!dn->is_auth() && !dn->lock.can_read(-1) &&
      !dn->get_dir()->get_inode()->is_stray())
    return true;

  // adjust the dir state
  // NOTE: we can safely remove a clean, null dentry without effecting
  //       directory completeness.
  // (check this _before_ we unlink the inode, below!)
  bool clear_complete = false;
  if (!(dnl->is_null() && dn->is_clean()))
    clear_complete = true;

  // unlink the dentry
  if (dnl->is_remote()) {
    // just unlink.
    dir->unlink_inode(dn, false);
  } else if (dnl->is_primary()) {
    // expire the inode, too.
    CInode *in = dnl->get_inode();
    ceph_assert(in);
    if (trim_inode(dn, in, con, expiremap))
      return true; // purging stray instead of trimming
  } else {
    ceph_assert(dnl->is_null());
  }

  if (!dn->is_auth()) {
    // notify dentry authority.
    mds_authority_t auth = dn->authority();

    // loop over the (up to two) auth ranks
    for (int p=0; p<2; p++) {
      mds_rank_t a = auth.first;
      if (p) a = auth.second;
      if (a < 0 || (p == 1 && auth.second == auth.first)) break;
      if (mds->get_nodeid() == auth.second &&
          con->is_importing()) break; // don't send any expire while importing.
      if (a == mds->get_nodeid()) continue; // on export, ignore myself.

      dout(12) << " sending expire to mds." << a << " on " << *dn << dendl;
      ceph_assert(a != mds->get_nodeid());
      auto em = expiremap.emplace(std::piecewise_construct, std::forward_as_tuple(a), std::forward_as_tuple());
      if (em.second)
        em.first->second = MCacheExpire::create(mds->get_nodeid());
      em.first->second->add_dentry(con->dirfrag(), dir->dirfrag(), dn->get_name(), dn->last, dn->get_replica_nonce());
    }
  }

  // remove dentry
  if (dn->last == CEPH_NOSNAP && dir->is_auth())
    dir->add_to_bloom(dn);
  dir->remove_dentry(dn);

  if (clear_complete)
    dir->state_clear(CDir::STATE_COMPLETE);

  if (mds->logger) mds->logger->inc(l_mds_inodes_expired);
  return false;
}


// Expire a dirfrag (refcount must already be zero), notifying the
// authority if we are a replica, and close it on its inode.
void MDCache::trim_dirfrag(CDir *dir, CDir *con, expiremap& expiremap)
{
  dout(15) << "trim_dirfrag " << *dir << dendl;

  if (dir->is_subtree_root()) {
    ceph_assert(!dir->is_auth() ||
                (!dir->is_replicated() && dir->inode->is_base()));
    remove_subtree(dir);	// remove from subtree map
  }
  ceph_assert(dir->get_num_ref() == 0);

  CInode *in = dir->get_inode();

  if (!dir->is_auth()) {
    mds_authority_t auth = dir->authority();

    // was this an auth delegation?
(if so, slightly modified container) + dirfrag_t condf; + if (dir->is_subtree_root()) { + dout(12) << " subtree root, container is " << *dir << dendl; + con = dir; + condf = dir->dirfrag(); + } else { + condf = con->dirfrag(); + } + + for (int p=0; p<2; p++) { + mds_rank_t a = auth.first; + if (p) a = auth.second; + if (a < 0 || (p == 1 && auth.second == auth.first)) break; + if (mds->get_nodeid() == auth.second && + con->is_importing()) break; // don't send any expire while importing. + if (a == mds->get_nodeid()) continue; // on export, ignore myself. + + dout(12) << " sending expire to mds." << a << " on " << *dir << dendl; + ceph_assert(a != mds->get_nodeid()); + auto em = expiremap.emplace(std::piecewise_construct, std::forward_as_tuple(a), std::forward_as_tuple()); + if (em.second) + em.first->second = MCacheExpire::create(mds->get_nodeid()); /* new */ + em.first->second->add_dir(condf, dir->dirfrag(), dir->replica_nonce); + } + } + + in->close_dirfrag(dir->dirfrag().frag); +} + +/** + * Try trimming an inode from the cache + * + * @return true if the inode is still in cache, else false if it was trimmed + */ +bool MDCache::trim_inode(CDentry *dn, CInode *in, CDir *con, expiremap& expiremap) +{ + dout(15) << "trim_inode " << *in << dendl; + ceph_assert(in->get_num_ref() == 0); + + if (in->is_dir()) { + // If replica inode's dirfragtreelock is not readable, it's likely + // some dirfrags of the inode are being fragmented and we will receive + // MMDSFragmentNotify soon. MMDSFragmentNotify only replicates the new + // dirfrags, so we should avoid trimming these dirfrags' parent inode. + // This is because that unconnected replicas are problematic for + // subtree migration. 
+ // + if (!in->is_auth() && !mds->locker->rdlock_try(&in->dirfragtreelock, -1, nullptr)) { + return true; + } + + // DIR + list<CDir*> dfls; + in->get_dirfrags(dfls); + for (list<CDir*>::iterator p = dfls.begin(); p != dfls.end(); ++p) { + CDir *dir = *p; + ceph_assert(!dir->is_subtree_root()); + trim_dirfrag(dir, con ? con:dir, expiremap); // if no container (e.g. root dirfrag), use *p + } + } + + // INODE + if (in->is_auth()) { + // eval stray after closing dirfrags + if (dn && !dn->state_test(CDentry::STATE_PURGING)) { + maybe_eval_stray(in); + if (dn->state_test(CDentry::STATE_PURGING) || dn->get_num_ref() > 0) + return true; + } + } else { + mds_authority_t auth = in->authority(); + + dirfrag_t df; + if (con) + df = con->dirfrag(); + else + df = dirfrag_t(0,frag_t()); // must be a root or stray inode. + + for (int p=0; p<2; p++) { + mds_rank_t a = auth.first; + if (p) a = auth.second; + if (a < 0 || (p == 1 && auth.second == auth.first)) break; + if (con && mds->get_nodeid() == auth.second && + con->is_importing()) break; // don't send any expire while importing. + if (a == mds->get_nodeid()) continue; // on export, ignore myself. + + dout(12) << " sending expire to mds." 
<< a << " on " << *in << dendl; + ceph_assert(a != mds->get_nodeid()); + auto em = expiremap.emplace(std::piecewise_construct, std::forward_as_tuple(a), std::forward_as_tuple()); + if (em.second) + em.first->second = MCacheExpire::create(mds->get_nodeid()); /* new */ + em.first->second->add_inode(df, in->vino(), in->get_replica_nonce()); + } + } + + /* + if (in->is_auth()) { + if (in->hack_accessed) + mds->logger->inc("outt"); + else { + mds->logger->inc("outut"); + mds->logger->fset("oututl", ceph_clock_now() - in->hack_load_stamp); + } + } + */ + + // unlink + if (dn) + dn->get_dir()->unlink_inode(dn, false); + remove_inode(in); + return false; +} + + +/** + * trim_non_auth - remove any non-auth items from our cache + * + * this reduces the amount of non-auth metadata in our cache, reducing the + * load incurred by the rejoin phase. + * + * the only non-auth items that remain are those that are needed to + * attach our own subtrees to the root. + * + * when we are done, all dentries will be in the top bit of the lru. + * + * why we have to do this: + * we may not have accurate linkage for non-auth items. which means we will + * know which subtree it falls into, and can not be sure to declare it to the + * correct authority. 
+ */ +void MDCache::trim_non_auth() +{ + dout(7) << "trim_non_auth" << dendl; + + // temporarily pin all subtree roots + for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin(); + p != subtrees.end(); + ++p) + p->first->get(CDir::PIN_SUBTREETEMP); + + list<CDentry*> auth_list; + + // trim non-auth items from the lru + for (;;) { + CDentry *dn = NULL; + if (bottom_lru.lru_get_size() > 0) + dn = static_cast<CDentry*>(bottom_lru.lru_expire()); + if (!dn && lru.lru_get_size() > 0) + dn = static_cast<CDentry*>(lru.lru_expire()); + if (!dn) + break; + + CDentry::linkage_t *dnl = dn->get_linkage(); + + if (dn->is_auth()) { + // add back into lru (at the top) + auth_list.push_back(dn); + + if (dnl->is_remote() && dnl->get_inode() && !dnl->get_inode()->is_auth()) + dn->unlink_remote(dnl); + } else { + // non-auth. expire. + CDir *dir = dn->get_dir(); + ceph_assert(dir); + + // unlink the dentry + dout(10) << " removing " << *dn << dendl; + if (dnl->is_remote()) { + dir->unlink_inode(dn, false); + } + else if (dnl->is_primary()) { + CInode *in = dnl->get_inode(); + dout(10) << " removing " << *in << dendl; + list<CDir*> ls; + in->get_dirfrags(ls); + for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) { + CDir *subdir = *p; + ceph_assert(!subdir->is_subtree_root()); + in->close_dirfrag(subdir->dirfrag().frag); + } + dir->unlink_inode(dn, false); + remove_inode(in); + } + else { + ceph_assert(dnl->is_null()); + } + + ceph_assert(!dir->has_bloom()); + dir->remove_dentry(dn); + // adjust the dir state + dir->state_clear(CDir::STATE_COMPLETE); // dir incomplete! + // close empty non-auth dirfrag + if (!dir->is_subtree_root() && dir->get_num_any() == 0) + dir->inode->close_dirfrag(dir->get_frag()); + } + } + + for (auto dn : auth_list) { + if (dn->state_test(CDentry::STATE_BOTTOMLRU)) + bottom_lru.lru_insert_mid(dn); + else + lru.lru_insert_top(dn); + } + + // move everything in the pintail to the top bit of the lru. 
+ lru.lru_touch_entire_pintail(); + + // unpin all subtrees + for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin(); + p != subtrees.end(); + ++p) + p->first->put(CDir::PIN_SUBTREETEMP); + + if (lru.lru_get_size() == 0 && + bottom_lru.lru_get_size() == 0) { + // root, stray, etc.? + auto p = inode_map.begin(); + while (p != inode_map.end()) { + CInode *in = p->second; + ++p; + if (!in->is_auth()) { + list<CDir*> ls; + in->get_dirfrags(ls); + for (list<CDir*>::iterator p = ls.begin(); + p != ls.end(); + ++p) { + dout(10) << " removing " << **p << dendl; + ceph_assert((*p)->get_num_ref() == 1); // SUBTREE + remove_subtree((*p)); + in->close_dirfrag((*p)->dirfrag().frag); + } + dout(10) << " removing " << *in << dendl; + ceph_assert(!in->get_parent_dn()); + ceph_assert(in->get_num_ref() == 0); + remove_inode(in); + } + } + } + + show_subtrees(); +} + +/** + * Recursively trim the subtree rooted at directory to remove all + * CInodes/CDentrys/CDirs that aren't links to remote MDSes, or ancestors + * of those links. This is used to clear invalid data out of the cache. + * Note that it doesn't clear the passed-in directory, since that's not + * always safe. 
+ */ +bool MDCache::trim_non_auth_subtree(CDir *dir) +{ + dout(10) << "trim_non_auth_subtree(" << dir << ") " << *dir << dendl; + + bool keep_dir = !can_trim_non_auth_dirfrag(dir); + + auto j = dir->begin(); + auto i = j; + while (j != dir->end()) { + i = j++; + CDentry *dn = i->second; + dout(10) << "trim_non_auth_subtree(" << dir << ") Checking dentry " << dn << dendl; + CDentry::linkage_t *dnl = dn->get_linkage(); + if (dnl->is_primary()) { // check for subdirectories, etc + CInode *in = dnl->get_inode(); + bool keep_inode = false; + if (in->is_dir()) { + list<CDir*> subdirs; + in->get_dirfrags(subdirs); + for (list<CDir*>::iterator subdir = subdirs.begin(); + subdir != subdirs.end(); + ++subdir) { + if ((*subdir)->is_subtree_root()) { + keep_inode = true; + dout(10) << "trim_non_auth_subtree(" << dir << ") keeping " << **subdir << dendl; + } else { + if (trim_non_auth_subtree(*subdir)) + keep_inode = true; + else { + in->close_dirfrag((*subdir)->get_frag()); + dir->state_clear(CDir::STATE_COMPLETE); // now incomplete! + } + } + } + + } + if (!keep_inode) { // remove it! + dout(20) << "trim_non_auth_subtree(" << dir << ") removing inode " << in << " with dentry" << dn << dendl; + dir->unlink_inode(dn, false); + remove_inode(in); + ceph_assert(!dir->has_bloom()); + dir->remove_dentry(dn); + } else { + dout(20) << "trim_non_auth_subtree(" << dir << ") keeping inode " << in << " with dentry " << dn <<dendl; + dn->state_clear(CDentry::STATE_AUTH); + in->state_clear(CInode::STATE_AUTH); + } + } else if (keep_dir && dnl->is_null()) { // keep null dentry for slave rollback + dout(20) << "trim_non_auth_subtree(" << dir << ") keeping dentry " << dn <<dendl; + } else { // just remove it + dout(20) << "trim_non_auth_subtree(" << dir << ") removing dentry " << dn << dendl; + if (dnl->is_remote()) + dir->unlink_inode(dn, false); + dir->remove_dentry(dn); + } + } + dir->state_clear(CDir::STATE_AUTH); + /** + * We've now checked all our children and deleted those that need it. 
+ * Now return to caller, and tell them if *we're* a keeper. + */ + return keep_dir || dir->get_num_any(); +} + +/* + * during replay, when we determine a subtree is no longer ours, we + * try to trim it from our cache. because subtrees must be connected + * to the root, the fact that we can trim this tree may mean that our + * children or parents can also be trimmed. + */ +void MDCache::try_trim_non_auth_subtree(CDir *dir) +{ + dout(10) << "try_trim_nonauth_subtree " << *dir << dendl; + + // can we now trim child subtrees? + set<CDir*> bounds; + get_subtree_bounds(dir, bounds); + for (set<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p) { + CDir *bd = *p; + if (bd->get_dir_auth().first != mds->get_nodeid() && // we are not auth + bd->get_num_any() == 0 && // and empty + can_trim_non_auth_dirfrag(bd)) { + CInode *bi = bd->get_inode(); + dout(10) << " closing empty non-auth child subtree " << *bd << dendl; + remove_subtree(bd); + bd->mark_clean(); + bi->close_dirfrag(bd->get_frag()); + } + } + + if (trim_non_auth_subtree(dir)) { + // keep + try_subtree_merge(dir); + } else { + // can we trim this subtree (and possibly our ancestors) too? + while (true) { + CInode *diri = dir->get_inode(); + if (diri->is_base()) { + if (!diri->is_root() && diri->authority().first != mds->get_nodeid()) { + dout(10) << " closing empty non-auth subtree " << *dir << dendl; + remove_subtree(dir); + dir->mark_clean(); + diri->close_dirfrag(dir->get_frag()); + + dout(10) << " removing " << *diri << dendl; + ceph_assert(!diri->get_parent_dn()); + ceph_assert(diri->get_num_ref() == 0); + remove_inode(diri); + } + break; + } + + CDir *psub = get_subtree_root(diri->get_parent_dir()); + dout(10) << " parent subtree is " << *psub << dendl; + if (psub->get_dir_auth().first == mds->get_nodeid()) + break; // we are auth, keep. 
+ + dout(10) << " closing empty non-auth subtree " << *dir << dendl; + remove_subtree(dir); + dir->mark_clean(); + diri->close_dirfrag(dir->get_frag()); + + dout(10) << " parent subtree also non-auth: " << *psub << dendl; + if (trim_non_auth_subtree(psub)) + break; + dir = psub; + } + } + + show_subtrees(); +} + +void MDCache::standby_trim_segment(LogSegment *ls) +{ + auto try_trim_inode = [this](CInode *in) { + if (in->get_num_ref() == 0 && + !in->item_open_file.is_on_list() && + in->parent != NULL && + in->parent->get_num_ref() == 0){ + touch_dentry_bottom(in->parent); + } + }; + + auto try_trim_dentry = [this](CDentry *dn) { + if (dn->get_num_ref() > 0) + return; + auto in = dn->get_linkage()->inode; + if(in && in->item_open_file.is_on_list()) + return; + touch_dentry_bottom(dn); + }; + + ls->new_dirfrags.clear_list(); + ls->open_files.clear_list(); + + while (!ls->dirty_dirfrags.empty()) { + CDir *dir = ls->dirty_dirfrags.front(); + dir->mark_clean(); + if (dir->inode) + try_trim_inode(dir->inode); + } + while (!ls->dirty_inodes.empty()) { + CInode *in = ls->dirty_inodes.front(); + in->mark_clean(); + try_trim_inode(in); + } + while (!ls->dirty_dentries.empty()) { + CDentry *dn = ls->dirty_dentries.front(); + dn->mark_clean(); + try_trim_dentry(dn); + } + while (!ls->dirty_parent_inodes.empty()) { + CInode *in = ls->dirty_parent_inodes.front(); + in->clear_dirty_parent(); + try_trim_inode(in); + } + while (!ls->dirty_dirfrag_dir.empty()) { + CInode *in = ls->dirty_dirfrag_dir.front(); + in->filelock.remove_dirty(); + try_trim_inode(in); + } + while (!ls->dirty_dirfrag_nest.empty()) { + CInode *in = ls->dirty_dirfrag_nest.front(); + in->nestlock.remove_dirty(); + try_trim_inode(in); + } + while (!ls->dirty_dirfrag_dirfragtree.empty()) { + CInode *in = ls->dirty_dirfrag_dirfragtree.front(); + in->dirfragtreelock.remove_dirty(); + try_trim_inode(in); + } + while (!ls->truncating_inodes.empty()) { + auto it = ls->truncating_inodes.begin(); + CInode *in = *it; + 
ls->truncating_inodes.erase(it); + in->put(CInode::PIN_TRUNCATING); + try_trim_inode(in); + } +} + +void MDCache::handle_cache_expire(const MCacheExpire::const_ref &m) +{ + mds_rank_t from = mds_rank_t(m->get_from()); + + dout(7) << "cache_expire from mds." << from << dendl; + + if (mds->get_state() < MDSMap::STATE_REJOIN) { + return; + } + + set<SimpleLock *> gather_locks; + // loop over realms + for (const auto &p : m->realms) { + // check container? + if (p.first.ino > 0) { + CInode *expired_inode = get_inode(p.first.ino); + ceph_assert(expired_inode); // we had better have this. + CDir *parent_dir = expired_inode->get_approx_dirfrag(p.first.frag); + ceph_assert(parent_dir); + + int export_state = -1; + if (parent_dir->is_auth() && parent_dir->is_exporting()) { + export_state = migrator->get_export_state(parent_dir); + ceph_assert(export_state >= 0); + } + + if (!parent_dir->is_auth() || + (export_state != -1 && + ((export_state == Migrator::EXPORT_WARNING && + migrator->export_has_warned(parent_dir,from)) || + export_state == Migrator::EXPORT_EXPORTING || + export_state == Migrator::EXPORT_LOGGINGFINISH || + (export_state == Migrator::EXPORT_NOTIFYING && + !migrator->export_has_notified(parent_dir,from))))) { + + // not auth. 
+ dout(7) << "delaying nonauth|warned expires for " << *parent_dir << dendl; + ceph_assert(parent_dir->is_frozen_tree_root()); + + // make a message container + + auto em = delayed_expire[parent_dir].emplace(std::piecewise_construct, std::forward_as_tuple(from), std::forward_as_tuple()); + if (em.second) + em.first->second = MCacheExpire::create(from); /* new */ + + // merge these expires into it + em.first->second->add_realm(p.first, p.second); + continue; + } + ceph_assert(export_state <= Migrator::EXPORT_PREPPING || + (export_state == Migrator::EXPORT_WARNING && + !migrator->export_has_warned(parent_dir, from))); + + dout(7) << "expires for " << *parent_dir << dendl; + } else { + dout(7) << "containerless expires (root, stray inodes)" << dendl; + } + + // INODES + for (const auto &q : p.second.inodes) { + CInode *in = get_inode(q.first); + unsigned nonce = q.second; + + if (!in) { + dout(0) << " inode expire on " << q.first << " from " << from + << ", don't have it" << dendl; + ceph_assert(in); + } + ceph_assert(in->is_auth()); + dout(20) << __func__ << ": expiring inode " << *in << dendl; + + // check nonce + if (nonce == in->get_replica_nonce(from)) { + // remove from our cached_by + dout(7) << " inode expire on " << *in << " from mds." << from + << " cached_by was " << in->get_replicas() << dendl; + inode_remove_replica(in, from, false, gather_locks); + } + else { + // this is an old nonce, ignore expire. + dout(7) << " inode expire on " << *in << " from mds." 
<< from + << " with old nonce " << nonce + << " (current " << in->get_replica_nonce(from) << "), dropping" + << dendl; + } + } + + // DIRS + for (const auto &q : p.second.dirs) { + CDir *dir = get_dirfrag(q.first); + unsigned nonce = q.second; + + if (!dir) { + CInode *diri = get_inode(q.first.ino); + if (diri) { + if (mds->is_rejoin() && + rejoin_ack_gather.count(mds->get_nodeid()) && // haven't sent rejoin ack yet + !diri->is_replica(from)) { + list<CDir*> ls; + diri->get_nested_dirfrags(ls); + dout(7) << " dir expire on dirfrag " << q.first << " from mds." << from + << " while rejoining, inode isn't replicated" << dendl; + for (list<CDir*>::iterator q = ls.begin(); q != ls.end(); ++q) { + dir = *q; + if (dir->is_replica(from)) { + dout(7) << " dir expire on " << *dir << " from mds." << from << dendl; + dir->remove_replica(from); + } + } + continue; + } + CDir *other = diri->get_approx_dirfrag(q.first.frag); + if (other) { + dout(7) << " dir expire on dirfrag " << q.first << " from mds." << from + << " have " << *other << ", mismatched frags, dropping" << dendl; + continue; + } + } + dout(0) << " dir expire on " << q.first << " from " << from + << ", don't have it" << dendl; + ceph_assert(dir); + } + dout(20) << __func__ << ": expiring dirfrag " << *dir << dendl; + + ceph_assert(dir->is_auth()); + + // check nonce + if (nonce == dir->get_replica_nonce(from)) { + // remove from our cached_by + dout(7) << " dir expire on " << *dir << " from mds." << from + << " replicas was " << dir->get_replicas() << dendl; + dir->remove_replica(from); + } + else { + // this is an old nonce, ignore expire. + dout(7) << " dir expire on " << *dir << " from mds." 
<< from + << " with old nonce " << nonce << " (current " << dir->get_replica_nonce(from) + << "), dropping" << dendl; + } + } + + // DENTRIES + for (const auto &pd : p.second.dentries) { + dout(10) << " dn expires in dir " << pd.first << dendl; + CInode *diri = get_inode(pd.first.ino); + ceph_assert(diri); + CDir *dir = diri->get_dirfrag(pd.first.frag); + + if (!dir) { + dout(0) << " dn expires on " << pd.first << " from " << from + << ", must have refragmented" << dendl; + } else { + ceph_assert(dir->is_auth()); + } + + for (const auto &p : pd.second) { + unsigned nonce = p.second; + CDentry *dn; + + if (dir) { + dn = dir->lookup(p.first.first, p.first.second); + } else { + // which dirfrag for this dentry? + CDir *dir = diri->get_dirfrag(diri->pick_dirfrag(p.first.first)); + ceph_assert(dir); + ceph_assert(dir->is_auth()); + dn = dir->lookup(p.first.first, p.first.second); + } + + if (!dn) { + if (dir) + dout(0) << " missing dentry for " << p.first.first << " snap " << p.first.second << " in " << *dir << dendl; + else + dout(0) << " missing dentry for " << p.first.first << " snap " << p.first.second << dendl; + } + ceph_assert(dn); + + if (nonce == dn->get_replica_nonce(from)) { + dout(7) << " dentry_expire on " << *dn << " from mds." << from << dendl; + dentry_remove_replica(dn, from, gather_locks); + } + else { + dout(7) << " dentry_expire on " << *dn << " from mds." 
<< from + << " with old nonce " << nonce << " (current " << dn->get_replica_nonce(from) + << "), dropping" << dendl; + } + } + } + } + + for (set<SimpleLock*>::iterator p = gather_locks.begin(); p != gather_locks.end(); ++p) { + if (!(*p)->is_stable()) + mds->locker->eval_gather(*p); + } +} + +void MDCache::process_delayed_expire(CDir *dir) +{ + dout(7) << "process_delayed_expire on " << *dir << dendl; + for (const auto &p : delayed_expire[dir]) { + handle_cache_expire(p.second); + } + delayed_expire.erase(dir); +} + +void MDCache::discard_delayed_expire(CDir *dir) +{ + dout(7) << "discard_delayed_expire on " << *dir << dendl; + delayed_expire.erase(dir); +} + +void MDCache::inode_remove_replica(CInode *in, mds_rank_t from, bool rejoin, + set<SimpleLock *>& gather_locks) +{ + in->remove_replica(from); + in->set_mds_caps_wanted(from, 0); + + // note: this code calls _eval more often than it needs to! + // fix lock + if (in->authlock.remove_replica(from)) gather_locks.insert(&in->authlock); + if (in->linklock.remove_replica(from)) gather_locks.insert(&in->linklock); + if (in->snaplock.remove_replica(from)) gather_locks.insert(&in->snaplock); + if (in->xattrlock.remove_replica(from)) gather_locks.insert(&in->xattrlock); + if (in->flocklock.remove_replica(from)) gather_locks.insert(&in->flocklock); + if (in->policylock.remove_replica(from)) gather_locks.insert(&in->policylock); + + // If 'rejoin' is true and the scatter lock is in LOCK_MIX_* state. + // Don't remove the recovering mds from lock's gathering list because + // it may hold rejoined wrlocks. 
+ if (in->dirfragtreelock.remove_replica(from, rejoin)) gather_locks.insert(&in->dirfragtreelock); + if (in->filelock.remove_replica(from, rejoin)) gather_locks.insert(&in->filelock); + if (in->nestlock.remove_replica(from, rejoin)) gather_locks.insert(&in->nestlock); +} + +void MDCache::dentry_remove_replica(CDentry *dn, mds_rank_t from, set<SimpleLock *>& gather_locks) +{ + dn->remove_replica(from); + + // fix lock + if (dn->lock.remove_replica(from)) + gather_locks.insert(&dn->lock); + + // Replicated strays might now be elegible for purge + CDentry::linkage_t *dnl = dn->get_projected_linkage(); + if (dnl->is_primary()) { + maybe_eval_stray(dnl->get_inode()); + } +} + +void MDCache::trim_client_leases() +{ + utime_t now = ceph_clock_now(); + + dout(10) << "trim_client_leases" << dendl; + + std::size_t pool = 0; + for (const auto& list : client_leases) { + pool += 1; + if (list.empty()) + continue; + + auto before = list.size(); + while (!list.empty()) { + ClientLease *r = list.front(); + if (r->ttl > now) break; + CDentry *dn = static_cast<CDentry*>(r->parent); + dout(10) << " expiring client." 
<< r->client << " lease of " << *dn << dendl; + dn->remove_client_lease(r, mds->locker); + } + auto after = list.size(); + dout(10) << "trim_client_leases pool " << pool << " trimmed " + << (before-after) << " leases, " << after << " left" << dendl; + } +} + + +void MDCache::check_memory_usage() +{ + static MemoryModel mm(g_ceph_context); + static MemoryModel::snap last; + mm.sample(&last); + static MemoryModel::snap baseline = last; + + // check client caps + ceph_assert(CInode::count() == inode_map.size() + snap_inode_map.size() + num_shadow_inodes); + double caps_per_inode = 0.0; + if (CInode::count()) + caps_per_inode = (double)Capability::count() / (double)CInode::count(); + + dout(2) << "Memory usage: " + << " total " << last.get_total() + << ", rss " << last.get_rss() + << ", heap " << last.get_heap() + << ", baseline " << baseline.get_heap() + << ", " << num_inodes_with_caps << " / " << CInode::count() << " inodes have caps" + << ", " << Capability::count() << " caps, " << caps_per_inode << " caps per inode" + << dendl; + + mds->update_mlogger(); + mds->mlogger->set(l_mdm_rss, last.get_rss()); + mds->mlogger->set(l_mdm_heap, last.get_heap()); + + if (cache_toofull()) { + mds->server->recall_client_state(nullptr, Server::RecallFlags::TRIM); + } + + // If the cache size had exceeded its limit, but we're back in bounds + // now, free any unused pool memory so that our memory usage isn't + // permanently bloated. 
+ if (exceeded_size_limit && !cache_toofull()) { + // Only do this once we are back in bounds: otherwise the releases would + // slow down whatever process caused us to exceed bounds to begin with + if (ceph_using_tcmalloc()) { + dout(5) << "check_memory_usage: releasing unused space from tcmalloc" + << dendl; + ceph_heap_release_free_memory(); + } + exceeded_size_limit = false; + } +} + + + +// ========================================================================================= +// shutdown + +class C_MDC_ShutdownCheck : public MDCacheContext { +public: + explicit C_MDC_ShutdownCheck(MDCache *m) : MDCacheContext(m) {} + void finish(int) override { + mdcache->shutdown_check(); + } +}; + +void MDCache::shutdown_check() +{ + dout(0) << "shutdown_check at " << ceph_clock_now() << dendl; + + // cache + char old_val[32] = { 0 }; + char *o = old_val; + g_conf().get_val("debug_mds", &o, sizeof(old_val)); + g_conf().set_val("debug_mds", "10"); + g_conf().apply_changes(nullptr); + show_cache(); + g_conf().set_val("debug_mds", old_val); + g_conf().apply_changes(nullptr); + mds->timer.add_event_after(g_conf()->mds_shutdown_check, new C_MDC_ShutdownCheck(this)); + + // this + dout(0) << "lru size now " << lru.lru_get_size() << "/" << bottom_lru.lru_get_size() << dendl; + dout(0) << "log len " << mds->mdlog->get_num_events() << dendl; + + + if (mds->objecter->is_active()) { + dout(0) << "objecter still active" << dendl; + mds->objecter->dump_active(); + } +} + + +void MDCache::shutdown_start() +{ + dout(5) << "shutdown_start" << dendl; + + if (g_conf()->mds_shutdown_check) + mds->timer.add_event_after(g_conf()->mds_shutdown_check, new C_MDC_ShutdownCheck(this)); + + // g_conf()->debug_mds = 10; +} + + + +bool MDCache::shutdown_pass() +{ + dout(7) << "shutdown_pass" << dendl; + + if (mds->is_stopped()) { + dout(7) << " already shut down" << dendl; + show_cache(); + show_subtrees(); + return true; + } + + // empty stray dir + bool strays_all_exported = 
shutdown_export_strays(); + + // trim cache + trim(UINT64_MAX); + dout(5) << "lru size now " << lru.lru_get_size() << "/" << bottom_lru.lru_get_size() << dendl; + + // Export all subtrees to another active (usually rank 0) if not rank 0 + int num_auth_subtree = 0; + if (!subtrees.empty() && + mds->get_nodeid() != 0) { + dout(7) << "looking for subtrees to export to mds0" << dendl; + list<CDir*> ls; + for (map<CDir*, set<CDir*> >::iterator it = subtrees.begin(); + it != subtrees.end(); + ++it) { + CDir *dir = it->first; + if (dir->get_inode()->is_mdsdir()) + continue; + if (dir->is_auth()) { + num_auth_subtree++; + if (dir->is_frozen() || + dir->is_freezing() || + dir->is_ambiguous_dir_auth() || + dir->state_test(CDir::STATE_EXPORTING)) + continue; + ls.push_back(dir); + } + } + + migrator->clear_export_queue(); + for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) { + CDir *dir = *p; + mds_rank_t dest = dir->get_inode()->authority().first; + if (dest > 0 && !mds->mdsmap->is_active(dest)) + dest = 0; + dout(7) << "sending " << *dir << " back to mds." << dest << dendl; + migrator->export_dir_nicely(dir, dest); + } + } + + if (!strays_all_exported) { + dout(7) << "waiting for strays to migrate" << dendl; + return false; + } + + if (num_auth_subtree > 0) { + ceph_assert(mds->get_nodeid() > 0); + dout(7) << "still have " << num_auth_subtree << " auth subtrees" << dendl; + show_subtrees(); + return false; + } + + // close out any sessions (and open files!) before we try to trim the log, etc. + if (mds->sessionmap.have_unclosed_sessions()) { + if (!mds->server->terminating_sessions) + mds->server->terminate_sessions(); + return false; + } + + // Fully trim the log so that all objects in cache are clean and may be + // trimmed by a future MDCache::trim. Note that MDSRank::tick does not + // trim the log such that the cache eventually becomes clean. 
+ if (mds->mdlog->get_num_segments() > 0) { + auto ls = mds->mdlog->get_current_segment(); + if (ls->num_events > 1 || !ls->dirty_dirfrags.empty()) { + // Current segment contains events other than subtreemap or + // there are dirty dirfrags (see CDir::log_mark_dirty()) + mds->mdlog->start_new_segment(); + mds->mdlog->flush(); + } + } + mds->mdlog->trim_all(); + if (mds->mdlog->get_num_segments() > 1) { + dout(7) << "still >1 segments, waiting for log to trim" << dendl; + return false; + } + + // drop our reference to our stray dir inode + for (int i = 0; i < NUM_STRAY; ++i) { + if (strays[i] && + strays[i]->state_test(CInode::STATE_STRAYPINNED)) { + strays[i]->state_clear(CInode::STATE_STRAYPINNED); + strays[i]->put(CInode::PIN_STRAY); + strays[i]->put_stickydirs(); + } + } + + CDir *mydir = myin ? myin->get_dirfrag(frag_t()) : NULL; + if (mydir && !mydir->is_subtree_root()) + mydir = NULL; + + // subtrees map not empty yet? + if (subtrees.size() > (mydir ? 1 : 0)) { + dout(7) << "still have " << num_subtrees() << " subtrees" << dendl; + show_subtrees(); + migrator->show_importing(); + migrator->show_exporting(); + if (!migrator->is_importing() && !migrator->is_exporting()) + show_cache(); + return false; + } + ceph_assert(!migrator->is_exporting()); + ceph_assert(!migrator->is_importing()); + + // replicas may dirty scatter locks + if (myin && myin->is_replicated()) { + dout(7) << "still have replicated objects" << dendl; + return false; + } + + if ((myin && myin->get_num_auth_pins()) || + (mydir && (mydir->get_auth_pins() || mydir->get_dir_auth_pins()))) { + dout(7) << "still have auth pinned objects" << dendl; + return false; + } + + // (only do this once!) + if (!mds->mdlog->is_capped()) { + dout(7) << "capping the log" << dendl; + mds->mdlog->cap(); + } + + if (!mds->mdlog->empty()) + mds->mdlog->trim(0); + + if (!mds->mdlog->empty()) { + dout(7) << "waiting for log to flush.. 
" << mds->mdlog->get_num_events() + << " in " << mds->mdlog->get_num_segments() << " segments" << dendl; + return false; + } + + if (!did_shutdown_log_cap) { + // flush journal header + dout(7) << "writing header for (now-empty) journal" << dendl; + ceph_assert(mds->mdlog->empty()); + mds->mdlog->write_head(0); + // NOTE: filer active checker below will block us until this completes. + did_shutdown_log_cap = true; + return false; + } + + // filer active? + if (mds->objecter->is_active()) { + dout(7) << "objecter still active" << dendl; + mds->objecter->dump_active(); + return false; + } + + // trim what we can from the cache + if (lru.lru_get_size() > 0 || bottom_lru.lru_get_size() > 0) { + dout(7) << "there's still stuff in the cache: " << lru.lru_get_size() << "/" << bottom_lru.lru_get_size() << dendl; + show_cache(); + //dump(); + return false; + } + + // make mydir subtree go away + if (mydir) { + if (mydir->get_num_ref() > 1) { // subtree pin + dout(7) << "there's still reference to mydir " << *mydir << dendl; + show_cache(); + return false; + } + + remove_subtree(mydir); + myin->close_dirfrag(mydir->get_frag()); + } + ceph_assert(subtrees.empty()); + + if (myin) { + remove_inode(myin); + ceph_assert(!myin); + } + + if (global_snaprealm) { + remove_inode(global_snaprealm->inode); + global_snaprealm = nullptr; + } + + // done! + dout(5) << "shutdown done." 
<< dendl; + return true; +} + +bool MDCache::shutdown_export_strays() +{ + static const unsigned MAX_EXPORTING = 100; + + if (mds->get_nodeid() == 0) + return true; + + if (shutdown_exporting_strays.size() * 3 >= MAX_EXPORTING * 2) + return false; + + dout(10) << "shutdown_export_strays " << shutdown_export_next.first + << " '" << shutdown_export_next.second << "'" << dendl; + + bool mds0_active = mds->mdsmap->is_active(mds_rank_t(0)); + bool all_exported = false; + +again: + auto next = shutdown_export_next; + + for (int i = 0; i < NUM_STRAY; ++i) { + CInode *strayi = strays[i]; + if (!strayi || + !strayi->state_test(CInode::STATE_STRAYPINNED)) + continue; + if (strayi->ino() < next.first.ino) + continue; + + deque<CDir*> dfls; + strayi->get_dirfrags(dfls); + + while (!dfls.empty()) { + CDir *dir = dfls.front(); + dfls.pop_front(); + + if (dir->dirfrag() < next.first) + continue; + if (next.first < dir->dirfrag()) { + next.first = dir->dirfrag(); + next.second.clear(); + } + + if (!dir->is_complete()) { + MDSContext *fin = nullptr; + if (shutdown_exporting_strays.empty()) { + fin = new MDSInternalContextWrapper(mds, + new FunctionContext([this](int r) { + shutdown_export_strays(); + }) + ); + } + dir->fetch(fin); + goto done; + } + + CDir::dentry_key_map::iterator it; + if (next.second.empty()) { + it = dir->begin(); + } else { + auto hash = ceph_frag_value(strayi->hash_dentry_name(next.second)); + it = dir->lower_bound(dentry_key_t(0, next.second, hash)); + } + + for (; it != dir->end(); ++it) { + CDentry *dn = it->second; + CDentry::linkage_t *dnl = dn->get_projected_linkage(); + if (dnl->is_null()) + continue; + + if (!mds0_active && !dn->state_test(CDentry::STATE_PURGING)) { + next.second = it->first.name; + goto done; + } + + auto ret = shutdown_exporting_strays.insert(dnl->get_inode()->ino()); + if (!ret.second) { + dout(10) << "already exporting/purging " << *dn << dendl; + continue; + } + + // Don't try to migrate anything that is actually + // being 
purged right now + if (!dn->state_test(CDentry::STATE_PURGING)) + stray_manager.migrate_stray(dn, mds_rank_t(0)); // send to root! + + if (shutdown_exporting_strays.size() >= MAX_EXPORTING) { + ++it; + if (it != dir->end()) { + next.second = it->first.name; + } else { + if (dfls.empty()) + next.first.ino.val++; + else + next.first = dfls.front()->dirfrag(); + next.second.clear(); + } + goto done; + } + } + } + } + + if (shutdown_exporting_strays.empty()) { + dirfrag_t first_df(MDS_INO_STRAY(mds->get_nodeid(), 0), 0); + if (first_df < shutdown_export_next.first || + !shutdown_export_next.second.empty()) { + shutdown_export_next.first = first_df; + shutdown_export_next.second.clear(); + goto again; + } + all_exported = true; + } + +done: + shutdown_export_next = next; + return all_exported; +} + +// ========= messaging ============== + +void MDCache::dispatch(const Message::const_ref &m) +{ + switch (m->get_type()) { + + // RESOLVE + case MSG_MDS_RESOLVE: + handle_resolve(MMDSResolve::msgref_cast(m)); + break; + case MSG_MDS_RESOLVEACK: + handle_resolve_ack(MMDSResolveAck::msgref_cast(m)); + break; + + // REJOIN + case MSG_MDS_CACHEREJOIN: + handle_cache_rejoin(MMDSCacheRejoin::msgref_cast(m)); + break; + + case MSG_MDS_DISCOVER: + handle_discover(MDiscover::msgref_cast(m)); + break; + case MSG_MDS_DISCOVERREPLY: + handle_discover_reply(MDiscoverReply::msgref_cast(m)); + break; + + case MSG_MDS_DIRUPDATE: + handle_dir_update(MDirUpdate::msgref_cast(m)); + break; + + case MSG_MDS_CACHEEXPIRE: + handle_cache_expire(MCacheExpire::msgref_cast(m)); + break; + + case MSG_MDS_DENTRYLINK: + handle_dentry_link(MDentryLink::msgref_cast(m)); + break; + case MSG_MDS_DENTRYUNLINK: + handle_dentry_unlink(MDentryUnlink::msgref_cast(m)); + break; + + case MSG_MDS_FRAGMENTNOTIFY: + handle_fragment_notify(MMDSFragmentNotify::msgref_cast(m)); + break; + case MSG_MDS_FRAGMENTNOTIFYACK: + handle_fragment_notify_ack(MMDSFragmentNotifyAck::msgref_cast(m)); + break; + + case 
MSG_MDS_FINDINO: + handle_find_ino(MMDSFindIno::msgref_cast(m)); + break; + case MSG_MDS_FINDINOREPLY: + handle_find_ino_reply(MMDSFindInoReply::msgref_cast(m)); + break; + + case MSG_MDS_OPENINO: + handle_open_ino(MMDSOpenIno::msgref_cast(m)); + break; + case MSG_MDS_OPENINOREPLY: + handle_open_ino_reply(MMDSOpenInoReply::msgref_cast(m)); + break; + + case MSG_MDS_SNAPUPDATE: + handle_snap_update(MMDSSnapUpdate::msgref_cast(m)); + break; + + default: + derr << "cache unknown message " << m->get_type() << dendl; + ceph_abort_msg("cache unknown message"); + } +} + +int MDCache::path_traverse(MDRequestRef& mdr, MDSContextFactory& cf, // who + const filepath& path, // what + vector<CDentry*> *pdnvec, // result + CInode **pin, + int onfail) +{ + bool discover = (onfail == MDS_TRAVERSE_DISCOVER); + bool null_okay = (onfail == MDS_TRAVERSE_DISCOVERXLOCK); + bool forward = (onfail == MDS_TRAVERSE_FORWARD); + + ceph_assert(!forward || mdr); // forward requires a request + + snapid_t snapid = CEPH_NOSNAP; + if (mdr) + mdr->snapid = snapid; + + client_t client = (mdr && mdr->reqid.name.is_client()) ? mdr->reqid.name.num() : -1; + + if (mds->logger) mds->logger->inc(l_mds_traverse); + + dout(7) << "traverse: opening base ino " << path.get_ino() << " snap " << snapid << dendl; + CInode *cur = get_inode(path.get_ino()); + if (cur == NULL) { + if (MDS_INO_IS_MDSDIR(path.get_ino())) + open_foreign_mdsdir(path.get_ino(), cf.build()); + else { + //ceph_abort(); // hrm.. broken + return -ESTALE; + } + return 1; + } + if (cur->state_test(CInode::STATE_PURGING)) + return -ESTALE; + + // make sure snaprealm are open... 
+ if (mdr && cur->snaprealm && !cur->snaprealm->have_past_parents_open() && + !cur->snaprealm->open_parents(cf.build())) { + return 1; + } + + // start trace + if (pdnvec) + pdnvec->clear(); + if (pin) + *pin = cur; + + unsigned depth = 0; + while (depth < path.depth()) { + dout(12) << "traverse: path seg depth " << depth << " '" << path[depth] + << "' snapid " << snapid << dendl; + + if (!cur->is_dir()) { + dout(7) << "traverse: " << *cur << " not a dir " << dendl; + return -ENOTDIR; + } + + // walk into snapdir? + if (path[depth].length() == 0) { + dout(10) << "traverse: snapdir" << dendl; + if (!mdr) + return -EINVAL; + snapid = CEPH_SNAPDIR; + mdr->snapid = snapid; + depth++; + continue; + } + // walk thru snapdir? + if (snapid == CEPH_SNAPDIR) { + if (!mdr) + return -EINVAL; + SnapRealm *realm = cur->find_snaprealm(); + snapid = realm->resolve_snapname(path[depth], cur->ino()); + dout(10) << "traverse: snap " << path[depth] << " -> " << snapid << dendl; + if (!snapid) { + CInode *t = cur; + while (t) { + // if snaplock isn't readable, it's possible that other mds is creating + // snapshot, but snap update message hasn't been received. + if (!t->snaplock.can_read(client)) { + dout(10) << " non-readable snaplock on " << *t << dendl; + t->snaplock.add_waiter(SimpleLock::WAIT_RD, cf.build()); + return 1; + } + CDentry *pdn = t->get_projected_parent_dn(); + t = pdn ? pdn->get_dir()->get_inode() : NULL; + } + return -ENOENT; + } + mdr->snapid = snapid; + depth++; + continue; + } + + // open dir + frag_t fg = cur->pick_dirfrag(path[depth]); + CDir *curdir = cur->get_dirfrag(fg); + if (!curdir) { + if (cur->is_auth()) { + // parent dir frozen_dir? + if (cur->is_frozen()) { + dout(7) << "traverse: " << *cur << " is frozen, waiting" << dendl; + cur->add_waiter(CDir::WAIT_UNFREEZE, cf.build()); + return 1; + } + curdir = cur->get_or_open_dirfrag(this, fg); + } else { + // discover? 
+ dout(10) << "traverse: need dirfrag " << fg << ", doing discover from " << *cur << dendl; + discover_path(cur, snapid, path.postfixpath(depth), cf.build(), + null_okay); + if (mds->logger) mds->logger->inc(l_mds_traverse_discover); + return 1; + } + } + ceph_assert(curdir); + +#ifdef MDS_VERIFY_FRAGSTAT + if (curdir->is_complete()) + curdir->verify_fragstat(); +#endif + + // frozen? + /* + if (curdir->is_frozen()) { + // doh! + // FIXME: traverse is allowed? + dout(7) << "traverse: " << *curdir << " is frozen, waiting" << dendl; + curdir->add_waiter(CDir::WAIT_UNFREEZE, _get_waiter(mdr, req, fin)); + if (onfinish) delete onfinish; + return 1; + } + */ + + // Before doing dirfrag->dn lookup, compare with DamageTable's + // record of which dentries were unreadable + if (mds->damage_table.is_dentry_damaged(curdir, path[depth], snapid)) { + dout(4) << "traverse: stopped lookup at damaged dentry " + << *curdir << "/" << path[depth] << " snap=" << snapid << dendl; + return -EIO; + } + + // dentry + CDentry *dn = curdir->lookup(path[depth], snapid); + CDentry::linkage_t *dnl = dn ? dn->get_projected_linkage() : 0; + + // null and last_bit and xlocked by me? + if (dnl && dnl->is_null() && null_okay) { + dout(10) << "traverse: hit null dentry at tail of traverse, succeeding" << dendl; + if (pdnvec) + pdnvec->push_back(dn); + if (pin) + *pin = 0; + break; // done! + } + + if (dnl && + dn->lock.is_xlocked() && + dn->lock.get_xlock_by() != mdr && + !dn->lock.can_read(client) && + (dnl->is_null() || forward)) { + dout(10) << "traverse: xlocked dentry at " << *dn << dendl; + dn->lock.add_waiter(SimpleLock::WAIT_RD, cf.build()); + if (mds->logger) mds->logger->inc(l_mds_traverse_lock); + mds->mdlog->flush(); + return 1; + } + + // can we conclude ENOENT? 
+ if (dnl && dnl->is_null()) { + if (dn->lock.can_read(client) || + (dn->lock.is_xlocked() && dn->lock.get_xlock_by() == mdr)) { + dout(10) << "traverse: miss on null+readable dentry " << path[depth] << " " << *dn << dendl; + if (pdnvec) { + if (depth == path.depth() - 1) + pdnvec->push_back(dn); + else + pdnvec->clear(); // do not confuse likes of rdlock_path_pin_ref(); + } + return -ENOENT; + } else { + dout(10) << "miss on dentry " << *dn << ", can't read due to lock" << dendl; + dn->lock.add_waiter(SimpleLock::WAIT_RD, cf.build()); + return 1; + } + } + + if (dnl && !dnl->is_null()) { + CInode *in = dnl->get_inode(); + + // do we have inode? + if (!in) { + ceph_assert(dnl->is_remote()); + // do i have it? + in = get_inode(dnl->get_remote_ino()); + if (in) { + dout(7) << "linking in remote in " << *in << dendl; + dn->link_remote(dnl, in); + } else { + dout(7) << "remote link to " << dnl->get_remote_ino() << ", which i don't have" << dendl; + ceph_assert(mdr); // we shouldn't hit non-primary dentries doing a non-mdr traversal! + if (mds->damage_table.is_remote_damaged(dnl->get_remote_ino())) { + dout(4) << "traverse: remote dentry points to damaged ino " + << *dn << dendl; + return -EIO; + } + open_remote_dentry(dn, true, cf.build(), + (null_okay && depth == path.depth() - 1)); + if (mds->logger) mds->logger->inc(l_mds_traverse_remote_ino); + return 1; + } + } + + cur = in; + // make sure snaprealm are open... + if (mdr && cur->snaprealm && !cur->snaprealm->have_past_parents_open() && + !cur->snaprealm->open_parents(cf.build())) { + return 1; + } + + // add to trace, continue. + touch_inode(cur); + if (pdnvec) + pdnvec->push_back(dn); + if (pin) + *pin = cur; + depth++; + continue; + } + + + // MISS. dentry doesn't exist. + dout(12) << "traverse: miss on dentry " << path[depth] << " in " << *curdir << dendl; + + if (curdir->is_auth()) { + // dentry is mine. 
+ if (curdir->is_complete() || + (snapid == CEPH_NOSNAP && + curdir->has_bloom() && + !curdir->is_in_bloom(path[depth]))) { + // file not found + if (pdnvec) { + // instantiate a null dn? + if (depth < path.depth()-1){ + dout(20) << " didn't traverse full path; not returning pdnvec" << dendl; + dn = NULL; + } else if (dn) { + ceph_abort(); // should have fallen out in ->is_null() check above + } else if (curdir->is_frozen()) { + dout(20) << " not adding null to frozen dir " << dendl; + } else if (snapid < CEPH_MAXSNAP) { + dout(20) << " not adding null for snapid " << snapid << dendl; + } else { + // create a null dentry + dn = curdir->add_null_dentry(path[depth]); + dout(20) << " added null " << *dn << dendl; + } + if (dn) + pdnvec->push_back(dn); + else + pdnvec->clear(); // do not confuse likes of rdlock_path_pin_ref(); + } + return -ENOENT; + } else { + + // Check DamageTable for missing fragments before trying to fetch + // this + if (mds->damage_table.is_dirfrag_damaged(curdir)) { + dout(4) << "traverse: damaged dirfrag " << *curdir + << ", blocking fetch" << dendl; + return -EIO; + } + + // directory isn't complete; reload + dout(7) << "traverse: incomplete dir contents for " << *cur << ", fetching" << dendl; + touch_inode(cur); + curdir->fetch(cf.build(), path[depth]); + if (mds->logger) mds->logger->inc(l_mds_traverse_dir_fetch); + return 1; + } + } else { + // dirfrag/dentry is not mine. 
+ mds_authority_t dauth = curdir->authority(); + + if (!forward_all_requests_to_auth && + forward && + mdr && mdr->client_request && + (int)depth < mdr->client_request->get_num_fwd()){ + dout(7) << "traverse: snap " << snapid << " and depth " << depth + << " < fwd " << mdr->client_request->get_num_fwd() + << ", discovering instead of forwarding" << dendl; + discover = true; + } + + if ((discover || null_okay)) { + dout(7) << "traverse: discover from " << path[depth] << " from " << *curdir << dendl; + discover_path(curdir, snapid, path.postfixpath(depth), cf.build(), + null_okay); + if (mds->logger) mds->logger->inc(l_mds_traverse_discover); + return 1; + } + if (forward) { + // forward + dout(7) << "traverse: not auth for " << path << " in " << *curdir << dendl; + + if (curdir->is_ambiguous_auth()) { + // wait + dout(7) << "traverse: waiting for single auth in " << *curdir << dendl; + curdir->add_waiter(CDir::WAIT_SINGLEAUTH, cf.build()); + return 1; + } + + dout(7) << "traverse: forwarding, not auth for " << *curdir << dendl; + + request_forward(mdr, dauth.first); + + if (mds->logger) mds->logger->inc(l_mds_traverse_forward); + return 2; + } + } + + ceph_abort(); // i shouldn't get here + } + + // success. 
  if (mds->logger) mds->logger->inc(l_mds_traverse_hit);
  dout(10) << "path_traverse finish on snapid " << snapid << dendl;
  if (mdr)
    ceph_assert(mdr->snapid == snapid);
  return 0;
}

/**
 * cache_traverse - resolve a path using only what is already in cache
 *
 * Starts from the inode named by fp.get_ino() (or the root inode when the
 * filepath carries no ino) and walks each path component through the cached
 * dirfrags.  Purely read-only: never fetches, discovers, or waits.
 *
 * @param fp path to resolve
 * @return the target CInode, or NULL as soon as any component (inode,
 *         dirfrag, dentry, or linked inode) is not present in cache.
 */
CInode *MDCache::cache_traverse(const filepath& fp)
{
  dout(10) << "cache_traverse " << fp << dendl;

  CInode *in;
  if (fp.get_ino())
    in = get_inode(fp.get_ino());
  else
    in = root;
  if (!in)
    return NULL;

  for (unsigned i = 0; i < fp.depth(); i++) {
    std::string_view dname = fp[i];
    // pick the dirfrag that would contain this name, then look the
    // dentry up at CEPH_NOSNAP (head version only).
    frag_t fg = in->pick_dirfrag(dname);
    dout(20) << " " << i << " " << dname << " frag " << fg << " from " << *in << dendl;
    CDir *curdir = in->get_dirfrag(fg);
    if (!curdir)
      return NULL;
    CDentry *dn = curdir->lookup(dname, CEPH_NOSNAP);
    if (!dn)
      return NULL;
    in = dn->get_linkage()->get_inode();
    if (!in)
      return NULL;
  }
  dout(10) << " got " << *in << dendl;
  return in;
}


/**
 * open_remote_dir -- open up a remote dirfrag
 *
 * Only valid on a non-auth directory inode whose fragment is not yet in
 * cache; delegates to discover_dir_frag() to fetch it from the auth MDS.
 *
 * @param diri base inode
 * @param approxfg approximate fragment.
 * @param fin completion callback
 */
void MDCache::open_remote_dirfrag(CInode *diri, frag_t approxfg, MDSContext *fin)
{
  dout(10) << "open_remote_dir on " << *diri << dendl;
  ceph_assert(diri->is_dir());
  ceph_assert(!diri->is_auth());
  ceph_assert(diri->get_dirfrag(approxfg) == 0);

  discover_dir_frag(diri, approxfg, fin);
}


/**
 * get_dentry_inode - get or open inode
 *
 * @param dn the dentry
 * @param mdr current request
 *
 * will return inode for primary, or link up/open up remote link's inode as necessary.
 * If it's not available right now, puts mdr on wait list and returns null.
 */
CInode *MDCache::get_dentry_inode(CDentry *dn, MDRequestRef& mdr, bool projected)
{
  // Choose the projected (in-flight) or committed linkage as requested.
  CDentry::linkage_t *dnl;
  if (projected)
    dnl = dn->get_projected_linkage();
  else
    dnl = dn->get_linkage();

  // Callers must not hand us a null dentry.
  ceph_assert(!dnl->is_null());

  // Primary link: the inode lives right here.
  if (dnl->is_primary())
    return dnl->inode;

  ceph_assert(dnl->is_remote());
  CInode *in = get_inode(dnl->get_remote_ino());
  if (in) {
    // Remote target already cached: just wire it up to this dentry.
    dout(7) << "get_dentry_inode linking in remote in " << *in << dendl;
    dn->link_remote(dnl, in);
    return in;
  } else {
    // Remote target not in cache: start opening it and retry the request
    // once the open completes; return null to the caller meanwhile.
    dout(10) << "get_dentry_inode on remote dn, opening inode for " << *dn << dendl;
    open_remote_dentry(dn, projected, new C_MDS_RetryRequest(this, mdr));
    return 0;
  }
}

/**
 * Completion context for open_remote_dentry(): pins the dentry for the
 * duration of the open_ino() call and forwards the result to
 * _open_remote_dentry_finish().
 */
struct C_MDC_OpenRemoteDentry : public MDCacheContext {
  CDentry *dn;
  inodeno_t ino;
  MDSContext *onfinish;
  bool want_xlocked;
  C_MDC_OpenRemoteDentry(MDCache *m, CDentry *d, inodeno_t i, MDSContext *f, bool wx) :
    MDCacheContext(m), dn(d), ino(i), onfinish(f), want_xlocked(wx) {
    // keep the dentry alive while we wait for open_ino
    dn->get(MDSCacheObject::PIN_PTRWAITER);
  }
  void finish(int r) override {
    mdcache->_open_remote_dentry_finish(dn, ino, onfinish, want_xlocked, r);
    dn->put(MDSCacheObject::PIN_PTRWAITER);
  }
};

/**
 * open_remote_dentry - bring the inode behind a remote dentry into cache
 *
 * Kicks off open_ino() on the remote ino; fin fires when it completes.
 * For directories the backtrace lives in the metadata pool; for other
 * types a pool of -1 is passed (resolved by open_ino itself).
 */
void MDCache::open_remote_dentry(CDentry *dn, bool projected, MDSContext *fin, bool want_xlocked)
{
  dout(10) << "open_remote_dentry " << *dn << dendl;
  CDentry::linkage_t *dnl = projected ? dn->get_projected_linkage() : dn->get_linkage();
  inodeno_t ino = dnl->get_remote_ino();
  int64_t pool = dnl->get_remote_d_type() == DT_DIR ?
mds->mdsmap->get_metadata_pool() : -1; + open_ino(ino, pool, + new C_MDC_OpenRemoteDentry(this, dn, ino, fin, want_xlocked), true, want_xlocked); // backtrace +} + +void MDCache::_open_remote_dentry_finish(CDentry *dn, inodeno_t ino, MDSContext *fin, + bool want_xlocked, int r) +{ + if (r < 0) { + CDentry::linkage_t *dnl = dn->get_projected_linkage(); + if (dnl->is_remote() && dnl->get_remote_ino() == ino) { + dout(0) << "open_remote_dentry_finish bad remote dentry " << *dn << dendl; + dn->state_set(CDentry::STATE_BADREMOTEINO); + + std::string path; + CDir *dir = dn->get_dir(); + if (dir) { + dir->get_inode()->make_path_string(path); + path += "/"; + path += dn->get_name(); + } + + bool fatal = mds->damage_table.notify_remote_damaged(ino, path); + if (fatal) { + mds->damaged(); + ceph_abort(); // unreachable, damaged() respawns us + } + } else { + r = 0; + } + } + fin->complete(r < 0 ? r : 0); +} + + +void MDCache::make_trace(vector<CDentry*>& trace, CInode *in) +{ + // empty trace if we're a base inode + if (in->is_base()) + return; + + CInode *parent = in->get_parent_inode(); + ceph_assert(parent); + make_trace(trace, parent); + + CDentry *dn = in->get_parent_dn(); + dout(15) << "make_trace adding " << *dn << dendl; + trace.push_back(dn); +} + + +// ------------------------------------------------------------------------------- +// Open inode by inode number + +class C_IO_MDC_OpenInoBacktraceFetched : public MDCacheIOContext { + inodeno_t ino; + public: + bufferlist bl; + C_IO_MDC_OpenInoBacktraceFetched(MDCache *c, inodeno_t i) : + MDCacheIOContext(c), ino(i) {} + void finish(int r) override { + mdcache->_open_ino_backtrace_fetched(ino, bl, r); + } + void print(ostream& out) const override { + out << "openino_backtrace_fetch" << ino << ")"; + } +}; + +struct C_MDC_OpenInoTraverseDir : public MDCacheContext { + inodeno_t ino; + MMDSOpenIno::const_ref msg; + bool parent; + public: + C_MDC_OpenInoTraverseDir(MDCache *c, inodeno_t i, const MMDSOpenIno::const_ref 
&m, bool p) : + MDCacheContext(c), ino(i), msg(m), parent(p) {} + void finish(int r) override { + if (r < 0 && !parent) + r = -EAGAIN; + if (msg) { + mdcache->handle_open_ino(msg, r); + return; + } + auto& info = mdcache->opening_inodes.at(ino); + mdcache->_open_ino_traverse_dir(ino, info, r); + } +}; + +struct C_MDC_OpenInoParentOpened : public MDCacheContext { + inodeno_t ino; + public: + C_MDC_OpenInoParentOpened(MDCache *c, inodeno_t i) : MDCacheContext(c), ino(i) {} + void finish(int r) override { + mdcache->_open_ino_parent_opened(ino, r); + } +}; + +void MDCache::_open_ino_backtrace_fetched(inodeno_t ino, bufferlist& bl, int err) +{ + dout(10) << "_open_ino_backtrace_fetched ino " << ino << " errno " << err << dendl; + + open_ino_info_t& info = opening_inodes.at(ino); + + CInode *in = get_inode(ino); + if (in) { + dout(10) << " found cached " << *in << dendl; + open_ino_finish(ino, info, in->authority().first); + return; + } + + inode_backtrace_t backtrace; + if (err == 0) { + try { + decode(backtrace, bl); + } catch (const buffer::error &decode_exc) { + derr << "corrupt backtrace on ino x0" << std::hex << ino + << std::dec << ": " << decode_exc << dendl; + open_ino_finish(ino, info, -EIO); + return; + } + if (backtrace.pool != info.pool && backtrace.pool != -1) { + dout(10) << " old object in pool " << info.pool + << ", retrying pool " << backtrace.pool << dendl; + info.pool = backtrace.pool; + C_IO_MDC_OpenInoBacktraceFetched *fin = + new C_IO_MDC_OpenInoBacktraceFetched(this, ino); + fetch_backtrace(ino, info.pool, fin->bl, + new C_OnFinisher(fin, mds->finisher)); + return; + } + } else if (err == -ENOENT) { + int64_t meta_pool = mds->mdsmap->get_metadata_pool(); + if (info.pool != meta_pool) { + dout(10) << " no object in pool " << info.pool + << ", retrying pool " << meta_pool << dendl; + info.pool = meta_pool; + C_IO_MDC_OpenInoBacktraceFetched *fin = + new C_IO_MDC_OpenInoBacktraceFetched(this, ino); + fetch_backtrace(ino, info.pool, fin->bl, + new 
C_OnFinisher(fin, mds->finisher)); + return; + } + err = 0; // backtrace.ancestors.empty() is checked below + } + + if (err == 0) { + if (backtrace.ancestors.empty()) { + dout(10) << " got empty backtrace " << dendl; + err = -ESTALE; + } else if (!info.ancestors.empty()) { + if (info.ancestors[0] == backtrace.ancestors[0]) { + dout(10) << " got same parents " << info.ancestors[0] << " 2 times" << dendl; + err = -EINVAL; + } else { + info.last_err = 0; + } + } + } + if (err) { + dout(0) << " failed to open ino " << ino << " err " << err << "/" << info.last_err << dendl; + if (info.last_err) + err = info.last_err; + open_ino_finish(ino, info, err); + return; + } + + dout(10) << " got backtrace " << backtrace << dendl; + info.ancestors = backtrace.ancestors; + + _open_ino_traverse_dir(ino, info, 0); +} + +void MDCache::_open_ino_parent_opened(inodeno_t ino, int ret) +{ + dout(10) << "_open_ino_parent_opened ino " << ino << " ret " << ret << dendl; + + open_ino_info_t& info = opening_inodes.at(ino); + + CInode *in = get_inode(ino); + if (in) { + dout(10) << " found cached " << *in << dendl; + open_ino_finish(ino, info, in->authority().first); + return; + } + + if (ret == mds->get_nodeid()) { + _open_ino_traverse_dir(ino, info, 0); + } else { + if (ret >= 0) { + mds_rank_t checked_rank = mds_rank_t(ret); + info.check_peers = true; + info.auth_hint = checked_rank; + info.checked.erase(checked_rank); + } + do_open_ino(ino, info, ret); + } +} + +void MDCache::_open_ino_traverse_dir(inodeno_t ino, open_ino_info_t& info, int ret) +{ + dout(10) << __func__ << ": ino " << ino << " ret " << ret << dendl; + + CInode *in = get_inode(ino); + if (in) { + dout(10) << " found cached " << *in << dendl; + open_ino_finish(ino, info, in->authority().first); + return; + } + + if (ret) { + do_open_ino(ino, info, ret); + return; + } + + mds_rank_t hint = info.auth_hint; + ret = open_ino_traverse_dir(ino, NULL, info.ancestors, + info.discover, info.want_xlocked, &hint); + if (ret > 0) + 
return; + if (hint != mds->get_nodeid()) + info.auth_hint = hint; + do_open_ino(ino, info, ret); +} + +void MDCache::_open_ino_fetch_dir(inodeno_t ino, const MMDSOpenIno::const_ref &m, CDir *dir, bool parent) +{ + if (dir->state_test(CDir::STATE_REJOINUNDEF)) + ceph_assert(dir->get_inode()->dirfragtree.is_leaf(dir->get_frag())); + dir->fetch(new C_MDC_OpenInoTraverseDir(this, ino, m, parent)); + if (mds->logger) + mds->logger->inc(l_mds_openino_dir_fetch); +} + +int MDCache::open_ino_traverse_dir(inodeno_t ino, const MMDSOpenIno::const_ref &m, + const vector<inode_backpointer_t>& ancestors, + bool discover, bool want_xlocked, mds_rank_t *hint) +{ + dout(10) << "open_ino_traverse_dir ino " << ino << " " << ancestors << dendl; + int err = 0; + for (unsigned i = 0; i < ancestors.size(); i++) { + const auto& ancestor = ancestors.at(i); + CInode *diri = get_inode(ancestor.dirino); + + if (!diri) { + if (discover && MDS_INO_IS_MDSDIR(ancestor.dirino)) { + open_foreign_mdsdir(ancestor.dirino, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0)); + return 1; + } + continue; + } + + if (diri->state_test(CInode::STATE_REJOINUNDEF)) { + CDir *dir = diri->get_parent_dir(); + while (dir->state_test(CDir::STATE_REJOINUNDEF) && + dir->get_inode()->state_test(CInode::STATE_REJOINUNDEF)) + dir = dir->get_inode()->get_parent_dir(); + _open_ino_fetch_dir(ino, m, dir, i == 0); + return 1; + } + + if (!diri->is_dir()) { + dout(10) << " " << *diri << " is not dir" << dendl; + if (i == 0) + err = -ENOTDIR; + break; + } + + const string& name = ancestor.dname; + frag_t fg = diri->pick_dirfrag(name); + CDir *dir = diri->get_dirfrag(fg); + if (!dir) { + if (diri->is_auth()) { + if (diri->is_frozen()) { + dout(10) << " " << *diri << " is frozen, waiting " << dendl; + diri->add_waiter(CDir::WAIT_UNFREEZE, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0)); + return 1; + } + dir = diri->get_or_open_dirfrag(this, fg); + } else if (discover) { + open_remote_dirfrag(diri, fg, new 
C_MDC_OpenInoTraverseDir(this, ino, m, i == 0)); + return 1; + } + } + if (dir) { + inodeno_t next_ino = i > 0 ? ancestors.at(i-1).dirino : ino; + CDentry *dn = dir->lookup(name); + CDentry::linkage_t *dnl = dn ? dn->get_linkage() : NULL; + if (dir->is_auth()) { + if (dnl && dnl->is_primary() && + dnl->get_inode()->state_test(CInode::STATE_REJOINUNDEF)) { + dout(10) << " fetching undef " << *dnl->get_inode() << dendl; + _open_ino_fetch_dir(ino, m, dir, i == 0); + return 1; + } + + if (!dnl && !dir->is_complete() && + (!dir->has_bloom() || dir->is_in_bloom(name))) { + dout(10) << " fetching incomplete " << *dir << dendl; + _open_ino_fetch_dir(ino, m, dir, i == 0); + return 1; + } + + dout(10) << " no ino " << next_ino << " in " << *dir << dendl; + if (i == 0) + err = -ENOENT; + } else if (discover) { + if (!dnl) { + filepath path(name, 0); + discover_path(dir, CEPH_NOSNAP, path, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0), + (i == 0 && want_xlocked)); + return 1; + } + if (dnl->is_null() && !dn->lock.can_read(-1)) { + dout(10) << " null " << *dn << " is not readable, waiting" << dendl; + dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0)); + return 1; + } + dout(10) << " no ino " << next_ino << " in " << *dir << dendl; + if (i == 0) + err = -ENOENT; + } + } + if (hint && i == 0) + *hint = dir ? 
    dir->authority().first : diri->authority().first;
    break;
  }
  return err;
}

/**
 * open_ino_finish - complete an open_ino() operation
 *
 * Drops the tracking entry for this ino and wakes all registered waiters
 * with the final result (an mds rank on success, negative errno on error).
 */
void MDCache::open_ino_finish(inodeno_t ino, open_ino_info_t& info, int ret)
{
  dout(10) << "open_ino_finish ino " << ino << " ret " << ret << dendl;

  MDSContext::vec waiters;
  waiters.swap(info.waiters);
  opening_inodes.erase(ino);
  finish_contexts(g_ceph_context, waiters, ret);
}

/**
 * do_open_ino - advance the open_ino state machine for one inode
 *
 * Depending on the per-ino state flags, either asks peer MDSs
 * (do_open_ino_peer), fetches the on-disk backtrace, or recursively opens
 * the parent directory recorded in the backtrace.  A hard error (other
 * than -EAGAIN) resets the state so the whole sequence is retried from
 * scratch, remembering the error in last_err (except -ENOENT/-ENOTDIR,
 * which are expected during normal races).
 */
void MDCache::do_open_ino(inodeno_t ino, open_ino_info_t& info, int err)
{
  if (err < 0 && err != -EAGAIN) {
    // reset and start over: re-check peers and re-fetch the backtrace
    info.checked.clear();
    info.checking = MDS_RANK_NONE;
    info.check_peers = true;
    info.fetch_backtrace = true;
    if (info.discover) {
      info.discover = false;
      info.ancestors.clear();
    }
    if (err != -ENOENT && err != -ENOTDIR)
      info.last_err = err;
  }

  if (info.check_peers || info.discover) {
    if (info.discover) {
      // got backtrace from peer, but failed to find inode. re-check peers
      info.discover = false;
      info.ancestors.clear();
      info.checked.clear();
    }
    info.check_peers = false;
    info.checking = MDS_RANK_NONE;
    do_open_ino_peer(ino, info);
  } else if (info.fetch_backtrace) {
    // read the backtrace object from info.pool; result arrives in
    // _open_ino_backtrace_fetched via the finisher thread
    info.check_peers = true;
    info.fetch_backtrace = false;
    info.checking = mds->get_nodeid();
    info.checked.clear();
    C_IO_MDC_OpenInoBacktraceFetched *fin =
      new C_IO_MDC_OpenInoBacktraceFetched(this, ino);
    fetch_backtrace(ino, info.pool, fin->bl,
		    new C_OnFinisher(fin, mds->finisher));
  } else {
    // we have ancestors: open the immediate parent dir inode first
    ceph_assert(!info.ancestors.empty());
    info.checking = mds->get_nodeid();
    open_ino(info.ancestors[0].dirino, mds->mdsmap->get_metadata_pool(),
	     new C_MDC_OpenInoParentOpened(this, ino), info.want_replica);
  }
}

/**
 * do_open_ino_peer - pick the next peer MDS to ask about an inode
 *
 * Prefers the auth hint if that rank is active; otherwise walks the
 * active set, skipping ourselves and peers already checked.
 */
void MDCache::do_open_ino_peer(inodeno_t ino, open_ino_info_t& info)
{
  set<mds_rank_t> all, active;
  mds->mdsmap->get_mds_set(all);
  // during rejoin, peers in rejoin state can also answer
  if (mds->get_state() == MDSMap::STATE_REJOIN)
    mds->mdsmap->get_mds_set_lower_bound(active, MDSMap::STATE_REJOIN);
  else
    mds->mdsmap->get_mds_set_lower_bound(active, MDSMap::STATE_CLIENTREPLAY);

  dout(10) << "do_open_ino_peer "
<< ino << " active " << active + << " all " << all << " checked " << info.checked << dendl; + + mds_rank_t whoami = mds->get_nodeid(); + mds_rank_t peer = MDS_RANK_NONE; + if (info.auth_hint >= 0 && info.auth_hint != whoami) { + if (active.count(info.auth_hint)) { + peer = info.auth_hint; + info.auth_hint = MDS_RANK_NONE; + } + } else { + for (set<mds_rank_t>::iterator p = active.begin(); p != active.end(); ++p) + if (*p != whoami && info.checked.count(*p) == 0) { + peer = *p; + break; + } + } + if (peer < 0) { + all.erase(whoami); + if (all != info.checked) { + dout(10) << " waiting for more peers to be active" << dendl; + } else { + dout(10) << " all MDS peers have been checked " << dendl; + do_open_ino(ino, info, 0); + } + } else { + info.checking = peer; + vector<inode_backpointer_t> *pa = NULL; + // got backtrace from peer or backtrace just fetched + if (info.discover || !info.fetch_backtrace) + pa = &info.ancestors; + mds->send_message_mds(MMDSOpenIno::create(info.tid, ino, pa), peer); + if (mds->logger) + mds->logger->inc(l_mds_openino_peer_discover); + } +} + +void MDCache::handle_open_ino(const MMDSOpenIno::const_ref &m, int err) +{ + if (mds->get_state() < MDSMap::STATE_REJOIN && + mds->get_want_state() != CEPH_MDS_STATE_REJOIN) { + return; + } + + dout(10) << "handle_open_ino " << *m << " err " << err << dendl; + + auto from = mds_rank_t(m->get_source().num()); + inodeno_t ino = m->ino; + MMDSOpenInoReply::ref reply; + CInode *in = get_inode(ino); + if (in) { + dout(10) << " have " << *in << dendl; + reply = MMDSOpenInoReply::create(m->get_tid(), ino, mds_rank_t(0)); + if (in->is_auth()) { + touch_inode(in); + while (1) { + CDentry *pdn = in->get_parent_dn(); + if (!pdn) + break; + CInode *diri = pdn->get_dir()->get_inode(); + reply->ancestors.push_back(inode_backpointer_t(diri->ino(), pdn->get_name(), + in->inode.version)); + in = diri; + } + } else { + reply->hint = in->authority().first; + } + } else if (err < 0) { + reply = 
MMDSOpenInoReply::create(m->get_tid(), ino, MDS_RANK_NONE, err); + } else { + mds_rank_t hint = MDS_RANK_NONE; + int ret = open_ino_traverse_dir(ino, m, m->ancestors, false, false, &hint); + if (ret > 0) + return; + reply = MMDSOpenInoReply::create(m->get_tid(), ino, hint, ret); + } + mds->send_message_mds(reply, from); +} + +void MDCache::handle_open_ino_reply(const MMDSOpenInoReply::const_ref &m) +{ + dout(10) << "handle_open_ino_reply " << *m << dendl; + + inodeno_t ino = m->ino; + mds_rank_t from = mds_rank_t(m->get_source().num()); + auto it = opening_inodes.find(ino); + if (it != opening_inodes.end() && it->second.checking == from) { + open_ino_info_t& info = it->second; + info.checking = MDS_RANK_NONE; + info.checked.insert(from); + + CInode *in = get_inode(ino); + if (in) { + dout(10) << " found cached " << *in << dendl; + open_ino_finish(ino, info, in->authority().first); + } else if (!m->ancestors.empty()) { + dout(10) << " found ino " << ino << " on mds." << from << dendl; + if (!info.want_replica) { + open_ino_finish(ino, info, from); + return; + } + + info.ancestors = m->ancestors; + info.auth_hint = from; + info.checking = mds->get_nodeid(); + info.discover = true; + _open_ino_traverse_dir(ino, info, 0); + } else if (m->error) { + dout(10) << " error " << m->error << " from mds." << from << dendl; + do_open_ino(ino, info, m->error); + } else { + if (m->hint >= 0 && m->hint != mds->get_nodeid()) { + info.auth_hint = m->hint; + info.checked.erase(m->hint); + } + do_open_ino_peer(ino, info); + } + } +} + +void MDCache::kick_open_ino_peers(mds_rank_t who) +{ + dout(10) << "kick_open_ino_peers mds." << who << dendl; + + for (map<inodeno_t, open_ino_info_t>::iterator p = opening_inodes.begin(); + p != opening_inodes.end(); + ++p) { + open_ino_info_t& info = p->second; + if (info.checking == who) { + dout(10) << " kicking ino " << p->first << " who was checking mds." 
<< who << dendl; + info.checking = MDS_RANK_NONE; + do_open_ino_peer(p->first, info); + } else if (info.checking == MDS_RANK_NONE) { + dout(10) << " kicking ino " << p->first << " who was waiting" << dendl; + do_open_ino_peer(p->first, info); + } + } +} + +void MDCache::open_ino(inodeno_t ino, int64_t pool, MDSContext* fin, + bool want_replica, bool want_xlocked) +{ + dout(10) << "open_ino " << ino << " pool " << pool << " want_replica " + << want_replica << dendl; + + auto it = opening_inodes.find(ino); + if (it != opening_inodes.end()) { + open_ino_info_t& info = it->second; + if (want_replica) { + info.want_replica = true; + if (want_xlocked && !info.want_xlocked) { + if (!info.ancestors.empty()) { + CInode *diri = get_inode(info.ancestors[0].dirino); + if (diri) { + frag_t fg = diri->pick_dirfrag(info.ancestors[0].dname); + CDir *dir = diri->get_dirfrag(fg); + if (dir && !dir->is_auth()) { + filepath path(info.ancestors[0].dname, 0); + discover_path(dir, CEPH_NOSNAP, path, NULL, true); + } + } + } + info.want_xlocked = true; + } + } + info.waiters.push_back(fin); + } else { + open_ino_info_t& info = opening_inodes[ino]; + info.want_replica = want_replica; + info.want_xlocked = want_xlocked; + info.tid = ++open_ino_last_tid; + info.pool = pool >= 0 ? pool : default_file_layout.pool_id; + info.waiters.push_back(fin); + if (mds->is_rejoin() && + open_file_table.get_ancestors(ino, info.ancestors, info.auth_hint)) { + info.fetch_backtrace = false; + info.checking = mds->get_nodeid(); + _open_ino_traverse_dir(ino, info, 0); + } else { + do_open_ino(ino, info, 0); + } + } +} + +/* ---------------------------- */ + +/* + * search for a given inode on MDS peers. optionally start with the given node. 
 *

 TODO
  - recover from mds node failure, recovery
  - traverse path

 */
/**
 * find_ino_peers - locate which MDS has a given inode in cache
 *
 * Registers a find_ino_peer_info_t keyed by a fresh tid and starts polling
 * peers via _do_find_ino_peer().  The callback completes with 0 once the
 * inode shows up locally, or -ESTALE if it is purging or no peer has it.
 *
 * @param ino inode to find
 * @param c completion callback
 * @param hint rank to ask first, if any
 */
void MDCache::find_ino_peers(inodeno_t ino, MDSContext *c, mds_rank_t hint)
{
  dout(5) << "find_ino_peers " << ino << " hint " << hint << dendl;
  CInode *in = get_inode(ino);
  if (in && in->state_test(CInode::STATE_PURGING)) {
    // inode is on its way out; report stale immediately
    c->complete(-ESTALE);
    return;
  }
  // caller should only get here when the inode is not cached locally
  ceph_assert(!in);

  ceph_tid_t tid = ++find_ino_peer_last_tid;
  find_ino_peer_info_t& fip = find_ino_peer[tid];
  fip.ino = ino;
  fip.tid = tid;
  fip.fin = c;
  fip.hint = hint;
  _do_find_ino_peer(fip);
}

/**
 * _do_find_ino_peer - send MMDSFindIno to the next unchecked active peer
 *
 * Uses the hint first if set; otherwise scans the active set.  When every
 * active peer has been checked and none had the inode, fails the caller
 * with -ESTALE and drops the tracking entry; if some peers are not yet
 * active, simply waits (kick_find_ino_peers re-drives us later).
 */
void MDCache::_do_find_ino_peer(find_ino_peer_info_t& fip)
{
  set<mds_rank_t> all, active;
  mds->mdsmap->get_mds_set(all);
  mds->mdsmap->get_mds_set_lower_bound(active, MDSMap::STATE_CLIENTREPLAY);

  dout(10) << "_do_find_ino_peer " << fip.tid << " " << fip.ino
	   << " active " << active << " all " << all
	   << " checked " << fip.checked
	   << dendl;

  mds_rank_t m = MDS_RANK_NONE;
  if (fip.hint >= 0) {
    // try the hinted rank once, then fall back to scanning
    m = fip.hint;
    fip.hint = MDS_RANK_NONE;
  } else {
    for (set<mds_rank_t>::iterator p = active.begin(); p != active.end(); ++p)
      if (*p != mds->get_nodeid() &&
	  fip.checked.count(*p) == 0) {
	m = *p;
	break;
      }
  }
  if (m == MDS_RANK_NONE) {
    all.erase(mds->get_nodeid());
    if (all != fip.checked) {
      dout(10) << "_do_find_ino_peer waiting for more peers to be active" << dendl;
    } else {
      dout(10) << "_do_find_ino_peer failed on " << fip.ino << dendl;
      fip.fin->complete(-ESTALE);
      find_ino_peer.erase(fip.tid);
    }
  } else {
    fip.checking = m;
    mds->send_message_mds(MMDSFindIno::create(fip.tid, fip.ino), m);
  }
}

/**
 * handle_find_ino - answer a peer's MMDSFindIno query
 *
 * Replies with the full path of the inode if we have it cached, or an
 * empty reply otherwise.  Ignored entirely before rejoin.
 */
void MDCache::handle_find_ino(const MMDSFindIno::const_ref &m)
{
  if (mds->get_state() < MDSMap::STATE_REJOIN) {
    return;
  }

  dout(10) << "handle_find_ino " << *m << dendl;
  auto r = MMDSFindInoReply::create(m->tid);
  CInode *in = get_inode(m->ino);
  if (in) {
    in->make_path(r->path);
    dout(10) << " have " << r->path << " " << *in << dendl;
  }
mds->send_message_mds(r, mds_rank_t(m->get_source().num())); +} + + +void MDCache::handle_find_ino_reply(const MMDSFindInoReply::const_ref &m) +{ + map<ceph_tid_t, find_ino_peer_info_t>::iterator p = find_ino_peer.find(m->tid); + if (p != find_ino_peer.end()) { + dout(10) << "handle_find_ino_reply " << *m << dendl; + find_ino_peer_info_t& fip = p->second; + + // success? + if (get_inode(fip.ino)) { + dout(10) << "handle_find_ino_reply successfully found " << fip.ino << dendl; + mds->queue_waiter(fip.fin); + find_ino_peer.erase(p); + return; + } + + mds_rank_t from = mds_rank_t(m->get_source().num()); + if (fip.checking == from) + fip.checking = MDS_RANK_NONE; + fip.checked.insert(from); + + if (!m->path.empty()) { + // we got a path! + vector<CDentry*> trace; + CF_MDS_RetryMessageFactory cf(mds, m); + MDRequestRef null_ref; + int r = path_traverse(null_ref, cf, m->path, &trace, NULL, MDS_TRAVERSE_DISCOVER); + if (r > 0) + return; + dout(0) << "handle_find_ino_reply failed with " << r << " on " << m->path + << ", retrying" << dendl; + fip.checked.clear(); + _do_find_ino_peer(fip); + } else { + // nope, continue. + _do_find_ino_peer(fip); + } + } else { + dout(10) << "handle_find_ino_reply tid " << m->tid << " dne" << dendl; + } +} + +void MDCache::kick_find_ino_peers(mds_rank_t who) +{ + // find_ino_peers requests we should move on from + for (map<ceph_tid_t,find_ino_peer_info_t>::iterator p = find_ino_peer.begin(); + p != find_ino_peer.end(); + ++p) { + find_ino_peer_info_t& fip = p->second; + if (fip.checking == who) { + dout(10) << "kicking find_ino_peer " << fip.tid << " who was checking mds." 
	  << who << dendl;
      fip.checking = MDS_RANK_NONE;
      _do_find_ino_peer(fip);
    } else if (fip.checking == MDS_RANK_NONE) {
      dout(10) << "kicking find_ino_peer " << fip.tid << " who was waiting" << dendl;
      _do_find_ino_peer(fip);
    }
  }
}

/* ---------------------------- */

/**
 * get_num_client_requests - count active client-initiated master requests
 *
 * Slave requests and internally generated requests are excluded.
 */
int MDCache::get_num_client_requests()
{
  int count = 0;
  for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
      p != active_requests.end();
      ++p) {
    MDRequestRef& mdr = p->second;
    if (mdr->reqid.name.is_client() && !mdr->is_slave())
      count++;
  }
  return count;
}

/**
 * request_start - register a new client request as an MDRequest
 *
 * Returns a null MDRequestRef if a request with the same reqid is already
 * active: a duplicate of a request being processed is dropped, while a
 * message racing with a still-finishing slave request is queued to be
 * retried once the slave request completes.
 */
MDRequestRef MDCache::request_start(const MClientRequest::const_ref& req)
{
  // did we win a forward race against a slave?
  if (active_requests.count(req->get_reqid())) {
    MDRequestRef& mdr = active_requests[req->get_reqid()];
    ceph_assert(mdr);
    if (mdr->is_slave()) {
      dout(10) << "request_start already had " << *mdr << ", waiting for finish" << dendl;
      mdr->more()->waiting_for_finish.push_back(new C_MDS_RetryMessage(mds, req));
    } else {
      dout(10) << "request_start already processing " << *mdr << ", dropping new msg" << dendl;
    }
    return MDRequestRef();
  }

  // register new client request
  MDRequestImpl::Params params;
  params.reqid = req->get_reqid();
  params.attempt = req->get_num_fwd();
  params.client_req = req;
  // carry the message's timing stamps over for op tracking
  params.initiated = req->get_recv_stamp();
  params.throttled = req->get_throttle_stamp();
  params.all_read = req->get_recv_complete_stamp();
  params.dispatched = req->get_dispatch_stamp();

  MDRequestRef mdr =
      mds->op_tracker.create_request<MDRequestImpl,MDRequestImpl::Params*>(&params);
  active_requests[params.reqid] = mdr;
  mdr->set_op_stamp(req->get_stamp());
  dout(7) << "request_start " << *mdr << dendl;
  return mdr;
}

/**
 * request_start_slave - register a slave request driven by a peer MDS
 *
 * @param ri reqid chosen by the master MDS
 * @param attempt master's attempt number
 * @param m the triggering slave-request message
 */
MDRequestRef MDCache::request_start_slave(metareqid_t ri, __u32 attempt, const Message::const_ref &m)
{
  int by = m->get_source().num();
  MDRequestImpl::Params params;
  params.reqid = ri;
  params.attempt =
attempt; + params.triggering_slave_req = m; + params.slave_to = by; + params.initiated = m->get_recv_stamp(); + params.throttled = m->get_throttle_stamp(); + params.all_read = m->get_recv_complete_stamp(); + params.dispatched = m->get_dispatch_stamp(); + MDRequestRef mdr = + mds->op_tracker.create_request<MDRequestImpl,MDRequestImpl::Params*>(¶ms); + ceph_assert(active_requests.count(mdr->reqid) == 0); + active_requests[mdr->reqid] = mdr; + dout(7) << "request_start_slave " << *mdr << " by mds." << by << dendl; + return mdr; +} + +MDRequestRef MDCache::request_start_internal(int op) +{ + utime_t now = ceph_clock_now(); + MDRequestImpl::Params params; + params.reqid.name = entity_name_t::MDS(mds->get_nodeid()); + params.reqid.tid = mds->issue_tid(); + params.initiated = now; + params.throttled = now; + params.all_read = now; + params.dispatched = now; + params.internal_op = op; + MDRequestRef mdr = + mds->op_tracker.create_request<MDRequestImpl,MDRequestImpl::Params*>(¶ms); + + ceph_assert(active_requests.count(mdr->reqid) == 0); + active_requests[mdr->reqid] = mdr; + dout(7) << "request_start_internal " << *mdr << " op " << op << dendl; + return mdr; +} + +MDRequestRef MDCache::request_get(metareqid_t rid) +{ + ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.find(rid); + ceph_assert(p != active_requests.end()); + dout(7) << "request_get " << rid << " " << *p->second << dendl; + return p->second; +} + +void MDCache::request_finish(MDRequestRef& mdr) +{ + dout(7) << "request_finish " << *mdr << dendl; + mdr->mark_event("finishing request"); + + // slave finisher? + if (mdr->has_more() && mdr->more()->slave_commit) { + Context *fin = mdr->more()->slave_commit; + mdr->more()->slave_commit = 0; + int ret; + if (mdr->aborted) { + mdr->aborted = false; + ret = -1; + mdr->more()->slave_rolling_back = true; + } else { + ret = 0; + mdr->committing = true; + } + fin->complete(ret); // this must re-call request_finish. 
+ return; + } + + switch(mdr->internal_op) { + case CEPH_MDS_OP_FRAGMENTDIR: + logger->inc(l_mdss_ireq_fragmentdir); + break; + case CEPH_MDS_OP_EXPORTDIR: + logger->inc(l_mdss_ireq_exportdir); + break; + case CEPH_MDS_OP_ENQUEUE_SCRUB: + logger->inc(l_mdss_ireq_enqueue_scrub); + break; + case CEPH_MDS_OP_FLUSH: + logger->inc(l_mdss_ireq_flush); + break; + case CEPH_MDS_OP_REPAIR_FRAGSTATS: + logger->inc(l_mdss_ireq_fragstats); + break; + case CEPH_MDS_OP_REPAIR_INODESTATS: + logger->inc(l_mdss_ireq_inodestats); + break; + } + + request_cleanup(mdr); +} + + +void MDCache::request_forward(MDRequestRef& mdr, mds_rank_t who, int port) +{ + CachedStackStringStream css; + *css << "forwarding request to mds." << who; + mdr->mark_event(css->strv()); + if (mdr->client_request && mdr->client_request->get_source().is_client()) { + dout(7) << "request_forward " << *mdr << " to mds." << who << " req " + << *mdr->client_request << dendl; + mds->forward_message_mds(mdr->release_client_request(), who); + if (mds->logger) mds->logger->inc(l_mds_forward); + } else if (mdr->internal_op >= 0) { + dout(10) << "request_forward on internal op; cancelling" << dendl; + mdr->internal_op_finish->complete(-EXDEV); + } else { + dout(7) << "request_forward drop " << *mdr << " req " << *mdr->client_request + << " was from mds" << dendl; + } + request_cleanup(mdr); +} + + +void MDCache::dispatch_request(MDRequestRef& mdr) +{ + if (mdr->client_request) { + mds->server->dispatch_client_request(mdr); + } else if (mdr->slave_request) { + mds->server->dispatch_slave_request(mdr); + } else { + switch (mdr->internal_op) { + case CEPH_MDS_OP_FRAGMENTDIR: + dispatch_fragment_dir(mdr); + break; + case CEPH_MDS_OP_EXPORTDIR: + migrator->dispatch_export_dir(mdr, 0); + break; + case CEPH_MDS_OP_ENQUEUE_SCRUB: + enqueue_scrub_work(mdr); + break; + case CEPH_MDS_OP_FLUSH: + flush_dentry_work(mdr); + break; + case CEPH_MDS_OP_REPAIR_FRAGSTATS: + repair_dirfrag_stats_work(mdr); + break; + case 
CEPH_MDS_OP_REPAIR_INODESTATS: + repair_inode_stats_work(mdr); + break; + case CEPH_MDS_OP_UPGRADE_SNAPREALM: + upgrade_inode_snaprealm_work(mdr); + break; + default: + ceph_abort(); + } + } +} + + +void MDCache::request_drop_foreign_locks(MDRequestRef& mdr) +{ + if (!mdr->has_more()) + return; + + // clean up slaves + // (will implicitly drop remote dn pins) + for (set<mds_rank_t>::iterator p = mdr->more()->slaves.begin(); + p != mdr->more()->slaves.end(); + ++p) { + auto r = MMDSSlaveRequest::create(mdr->reqid, mdr->attempt, + MMDSSlaveRequest::OP_FINISH); + + if (mdr->killed && !mdr->committing) { + r->mark_abort(); + } else if (mdr->more()->srcdn_auth_mds == *p && + mdr->more()->inode_import.length() > 0) { + // information about rename imported caps + r->inode_export.claim(mdr->more()->inode_import); + } + + mds->send_message_mds(r, *p); + } + + /* strip foreign xlocks out of lock lists, since the OP_FINISH drops them + * implicitly. Note that we don't call the finishers -- there shouldn't + * be any on a remote lock and the request finish wakes up all + * the waiters anyway! */ + + for (auto it = mdr->locks.begin(); it != mdr->locks.end(); ) { + SimpleLock *lock = it->lock; + if (it->is_xlock() && !lock->get_parent()->is_auth()) { + dout(10) << "request_drop_foreign_locks forgetting lock " << *lock + << " on " << lock->get_parent() << dendl; + lock->put_xlock(); + mdr->locks.erase(it++); + } else if (it->is_remote_wrlock()) { + dout(10) << "request_drop_foreign_locks forgetting remote_wrlock " << *lock + << " on mds." 
		 << it->wrlock_target << " on " << *lock->get_parent() << dendl;
      if (it->is_wrlock()) {
	// still locally wrlocked; keep the entry, drop only the remote part
	it->clear_remote_wrlock();
	++it;
      } else {
	mdr->locks.erase(it++);
      }
    } else {
      ++it;
    }
  }

  mdr->more()->slaves.clear(); /* we no longer have requests out to them, and
				* leaving them in can cause double-notifies as
				* this function can get called more than once */
}

// Drop foreign locks plus every local lock except rdlocks.
void MDCache::request_drop_non_rdlocks(MDRequestRef& mdr)
{
  request_drop_foreign_locks(mdr);
  mds->locker->drop_non_rdlocks(mdr.get());
}

// Drop foreign locks plus all local locks held by this request.
void MDCache::request_drop_locks(MDRequestRef& mdr)
{
  request_drop_foreign_locks(mdr);
  mds->locker->drop_locks(mdr.get());
}

/**
 * request_cleanup - tear down an MDRequest
 *
 * Releases everything the request accumulated (locks, auth pins,
 * stickydirs, cache pins), wakes any queued waiters, detaches it from its
 * session and removes it from the active_requests map.
 */
void MDCache::request_cleanup(MDRequestRef& mdr)
{
  dout(15) << "request_cleanup " << *mdr << dendl;

  if (mdr->has_more()) {
    if (mdr->more()->is_ambiguous_auth)
      mdr->clear_ambiguous_auth();
    if (!mdr->more()->waiting_for_finish.empty())
      mds->queue_waiters(mdr->more()->waiting_for_finish);
  }

  request_drop_locks(mdr);

  // drop (local) auth pins
  mdr->drop_local_auth_pins();

  // drop stickydirs
  mdr->put_stickydirs();

  mds->locker->kick_cap_releases(mdr);

  // drop cache pins
  mdr->drop_pins();

  // remove from session
  mdr->item_session_request.remove_myself();

  // remove from map
  active_requests.erase(mdr->reqid);

  if (mds->logger)
    log_stat();

  mdr->mark_event("cleaned up request");
}

void MDCache::request_kill(MDRequestRef& mdr)
{
  // rollback slave requests is tricky. just let the request proceed.
  // (request_kill continued) if slaves have witnessed or are still being
  // waited on, we cannot simply roll back; detach from the session and
  // let the request run to completion (or abort before slave prep).
  if (mdr->has_more() &&
      (!mdr->more()->witnessed.empty() || !mdr->more()->waiting_on_slave.empty())) {
    if (!mdr->done_locking) {
      ceph_assert(mdr->more()->witnessed.empty());
      mdr->aborted = true;
      dout(10) << "request_kill " << *mdr << " -- waiting for slave reply, delaying" << dendl;
    } else {
      dout(10) << "request_kill " << *mdr << " -- already started slave prep, no-op" << dendl;
    }

    ceph_assert(mdr->used_prealloc_ino == 0);
    ceph_assert(mdr->prealloc_inos.empty());

    mdr->session = NULL;
    mdr->item_session_request.remove_myself();
    return;
  }

  mdr->killed = true;
  mdr->mark_event("killing request");

  if (mdr->committing) {
    dout(10) << "request_kill " << *mdr << " -- already committing, remove it from sesssion requests" << dendl;
    mdr->item_session_request.remove_myself();
  } else {
    dout(10) << "request_kill " << *mdr << dendl;
    request_cleanup(mdr);
  }
}

// -------------------------------------------------------------------------------
// SNAPREALMS

// Create the in-memory global snaprealm, rooted at a dummy (unlinked)
// system inode with ino MDS_INO_GLOBAL_SNAPREALM.
void MDCache::create_global_snaprealm()
{
  CInode *in = new CInode(this); // dummy inode
  create_unlinked_system_inode(in, MDS_INO_GLOBAL_SNAPREALM, S_IFDIR|0755);
  add_inode(in);
  global_snaprealm = in->snaprealm;
}

// Invalidate cached snap sets for in's snaprealm and all open descendant
// realms, and (optionally) send MClientSnap notifications to every client
// holding caps in the affected realms.  snapop selects split/update/destroy
// specific behavior (e.g. DESTROY also re-evaluates strays at the end).
void MDCache::do_realm_invalidate_and_update_notify(CInode *in, int snapop, bool notify_clients)
{
  dout(10) << "do_realm_invalidate_and_update_notify " << *in->snaprealm << " " << *in << dendl;

  vector<inodeno_t> split_inos;
  vector<inodeno_t> split_realms;

  if (notify_clients) {
    ceph_assert(in->snaprealm->have_past_parents_open());
    if (snapop == CEPH_SNAP_OP_SPLIT) {
      // notify clients of update|split
      for (elist<CInode*>::iterator p = in->snaprealm->inodes_with_caps.begin(member_offset(CInode, item_caps));
           !p.end(); ++p)
        split_inos.push_back((*p)->ino());

      for (set<SnapRealm*>::iterator p = in->snaprealm->open_children.begin();
           p != in->snaprealm->open_children.end();
           ++p)
        split_realms.push_back((*p)->inode->ino());
    }
  }

  set<SnapRealm*> past_children;
  map<client_t, MClientSnap::ref> updates;
  // BFS over the open-children realm tree rooted at in->snaprealm
  list<SnapRealm*> q;
  q.push_back(in->snaprealm);
  while (!q.empty()) {
    SnapRealm *realm = q.front();
    q.pop_front();

    dout(10) << " realm " << *realm << " on " << *realm->inode << dendl;
    realm->invalidate_cached_snaps();

    if (notify_clients) {
      // build at most one MClientSnap per client, shared across realms
      for (const auto& p : realm->client_caps) {
        const auto& client = p.first;
        const auto& caps = p.second;
        ceph_assert(!caps->empty());

        auto em = updates.emplace(std::piecewise_construct, std::forward_as_tuple(client), std::forward_as_tuple());
        if (em.second) {
          auto update = MClientSnap::create(CEPH_SNAP_OP_SPLIT);
          update->head.split = in->ino();
          update->split_inos = split_inos;
          update->split_realms = split_realms;
          update->bl = in->snaprealm->get_snap_trace();
          em.first->second = std::move(update);
        }
      }
    }

    if (snapop == CEPH_SNAP_OP_UPDATE || snapop == CEPH_SNAP_OP_DESTROY) {
      // collect past children for the second pass below
      for (set<SnapRealm*>::iterator p = realm->open_past_children.begin();
           p != realm->open_past_children.end();
           ++p)
        past_children.insert(*p);
    }

    // notify for active children, too.
    dout(10) << " " << realm << " open_children are " << realm->open_children << dendl;
    for (set<SnapRealm*>::iterator p = realm->open_children.begin();
         p != realm->open_children.end();
         ++p)
      q.push_back(*p);
  }

  if (notify_clients)
    send_snaps(updates);

  // notify past children and their descendants if we update/delete old snapshots
  for (set<SnapRealm*>::iterator p = past_children.begin();
       p != past_children.end();
       ++p)
    q.push_back(*p);

  while (!q.empty()) {
    SnapRealm *realm = q.front();
    q.pop_front();

    realm->invalidate_cached_snaps();

    for (set<SnapRealm*>::iterator p = realm->open_children.begin();
         p != realm->open_children.end();
         ++p) {
      if (past_children.count(*p) == 0)
        q.push_back(*p);
    }

    for (set<SnapRealm*>::iterator p = realm->open_past_children.begin();
         p != realm->open_past_children.end();
         ++p) {
      if (past_children.count(*p) == 0) {
        q.push_back(*p);
        past_children.insert(*p);
      }
    }
  }

  if (snapop == CEPH_SNAP_OP_DESTROY) {
    // eval stray inodes if we delete snapshot from their past ancestor snaprealm
    for (set<SnapRealm*>::iterator p = past_children.begin();
         p != past_children.end();
         ++p)
      maybe_eval_stray((*p)->inode, true);
  }
}

// Broadcast in's snaprealm blob to peer MDS ranks: to all resolve-or-later
// ranks when tied to a snap table transaction (stid > 0), otherwise just to
// ranks holding a replica of in.
void MDCache::send_snap_update(CInode *in, version_t stid, int snap_op)
{
  dout(10) << __func__ << " " << *in << " stid " << stid << dendl;
  ceph_assert(in->is_auth());

  set<mds_rank_t> mds_set;
  if (stid > 0) {
    mds->mdsmap->get_mds_set_lower_bound(mds_set, MDSMap::STATE_RESOLVE);
    mds_set.erase(mds->get_nodeid());
  } else {
    in->list_replicas(mds_set);
  }

  if (!mds_set.empty()) {
    bufferlist snap_blob;
    in->encode_snap(snap_blob);

    for (auto p : mds_set) {
      auto m = MMDSSnapUpdate::create(in->ino(), stid, snap_op);
      m->snap_blob = snap_blob;
      mds->send_message_mds(m, p);
    }
  }

  if (stid > 0)
    notify_global_snaprealm_update(snap_op);
}

// Apply a peer MDS's snaprealm update to our replica (if any) and propagate
// invalidation/notification locally.
void MDCache::handle_snap_update(const MMDSSnapUpdate::const_ref &m)
{
  mds_rank_t
from = mds_rank_t(m->get_source().num());
  dout(10) << __func__ << " " << *m << " from mds." << from << dendl;

  if (mds->get_state() < MDSMap::STATE_RESOLVE &&
      mds->get_want_state() != CEPH_MDS_STATE_RESOLVE) {
    return;
  }

  // null rejoin_done means open_snaprealms() has already been called
  bool notify_clients = mds->get_state() > MDSMap::STATE_REJOIN ||
                        (mds->is_rejoin() && !rejoin_done);

  if (m->get_tid() > 0) {
    mds->snapclient->notify_commit(m->get_tid());
    if (notify_clients)
      notify_global_snaprealm_update(m->get_snap_op());
  }

  CInode *in = get_inode(m->get_ino());
  if (in) {
    ceph_assert(!in->is_auth());
    if (mds->get_state() > MDSMap::STATE_REJOIN ||
        (mds->is_rejoin() && !in->is_rejoining())) {
      auto p = m->snap_blob.cbegin();
      in->decode_snap(p);

      if (!notify_clients) {
        // defer client notification until rejoin finishes; pin the inode so
        // it survives until open_snaprealms() processes it
        if (!rejoin_pending_snaprealms.count(in)) {
          in->get(CInode::PIN_OPENINGSNAPPARENTS);
          rejoin_pending_snaprealms.insert(in);
        }
      }
      do_realm_invalidate_and_update_notify(in, m->get_snap_op(), notify_clients);
    }
  }
}

// Tell every open/stale client session that the global snaprealm changed.
// All ops except DESTROY are reported to clients as UPDATE.
void MDCache::notify_global_snaprealm_update(int snap_op)
{
  if (snap_op != CEPH_SNAP_OP_DESTROY)
    snap_op = CEPH_SNAP_OP_UPDATE;
  set<Session*> sessions;
  mds->sessionmap.get_client_session_set(sessions);
  for (auto &session : sessions) {
    if (!session->is_open() && !session->is_stale())
      continue;
    auto update = MClientSnap::create(snap_op);
    update->head.split = global_snaprealm->inode->ino();
    update->bl = global_snaprealm->get_snap_trace();
    mds->send_message_client_counted(update, session);
  }
}

// -------------------------------------------------------------------------------
// STRAYS

// Context to resume scan_stray_dir() at the given dirfrag after a fetch.
struct C_MDC_RetryScanStray : public MDCacheContext {
  dirfrag_t next;
  C_MDC_RetryScanStray(MDCache *c, dirfrag_t n) : MDCacheContext(c), next(n) { }
  void finish(int r) override {
    mdcache->scan_stray_dir(next);
  }
};

// Walk the stray directories starting at 'next', marking dentries STRAY
// (and nlink==0 primaries ORPHAN) and queueing them for stray evaluation.
// Restarts itself via C_MDC_RetryScanStray when a dirfrag must be fetched.
void MDCache::scan_stray_dir(dirfrag_t next)
{
  dout(10) << "scan_stray_dir " << next << dendl;

  list<CDir*> ls;
  for (int i = 0; i < NUM_STRAY; ++i) {
    if (strays[i]->ino() < next.ino)
      continue;  // already scanned on a previous pass
    strays[i]->get_dirfrags(ls);
  }

  for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
    CDir *dir = *p;
    if (dir->dirfrag() < next)
      continue;
    if (!dir->is_complete()) {
      dir->fetch(new C_MDC_RetryScanStray(this, dir->dirfrag()));
      return;
    }
    for (auto &p : dir->items) {
      CDentry *dn = p.second;
      dn->state_set(CDentry::STATE_STRAY);
      CDentry::linkage_t *dnl = dn->get_projected_linkage();
      if (dnl->is_primary()) {
        CInode *in = dnl->get_inode();
        if (in->inode.nlink == 0)
          in->state_set(CInode::STATE_ORPHAN);
        maybe_eval_stray(in);
      }
    }
  }
}

// Asynchronously read the "parent" xattr (backtrace) of ino's first object
// in the given pool into bl; fin fires on completion.
void MDCache::fetch_backtrace(inodeno_t ino, int64_t pool, bufferlist& bl, Context *fin)
{
  object_t oid = CInode::get_object_name(ino, frag_t(), "");
  mds->objecter->getxattr(oid, object_locator_t(pool), "parent", CEPH_NOSNAP, &bl, 0, fin);
  if (mds->logger)
    mds->logger->inc(l_mds_openino_backtrace_fetch);
}


// ========================================================================================
// DISCOVER
/*

  - for all discovers (except base_inos, e.g. root, stray), waiters are attached
  to the parent metadata object in the cache (pinning it).

  - all discovers are tracked by tid, so that we can ignore potentially dup replies.

*/

// Send (or resend) the MDiscover described by d to its target MDS.
void MDCache::_send_discover(discover_info_t& d)
{
  auto dis = MDiscover::create(d.ino, d.frag, d.snap, d.want_path, d.want_base_dir, d.want_xlocked);
  dis->set_tid(d.tid);
  mds->send_message_mds(dis, d.mds);
}

// Discover a base inode (root / mdsdir / stray) from the given MDS.  Only
// one discover per (from, ino) is in flight; onfinish waits regardless.
void MDCache::discover_base_ino(inodeno_t want_ino,
                                MDSContext *onfinish,
                                mds_rank_t from)
{
  dout(7) << "discover_base_ino " << want_ino << " from mds." << from << dendl;
  if (waiting_for_base_ino[from].count(want_ino) == 0) {
    discover_info_t& d = _create_discover(from);
    d.ino = want_ino;
    _send_discover(d);
  }
  waiting_for_base_ino[from][want_ino].push_back(onfinish);
}


// Discover a dirfrag of 'base' (approx_fg need not be an exact leaf).
// from < 0 means "ask base's auth".
void MDCache::discover_dir_frag(CInode *base,
                                frag_t approx_fg,
                                MDSContext *onfinish,
                                mds_rank_t from)
{
  if (from < 0)
    from = base->authority().first;

  dirfrag_t df(base->ino(), approx_fg);
  dout(7) << "discover_dir_frag " << df
          << " from mds." << from << dendl;

  // only send if nobody is already waiting (or if fire-and-forget)
  if (!base->is_waiting_for_dir(approx_fg) || !onfinish) {
    discover_info_t& d = _create_discover(from);
    d.pin_base(base);
    d.ino = base->ino();
    d.frag = approx_fg;
    d.want_base_dir = true;
    _send_discover(d);
  }

  if (onfinish)
    base->add_dir_waiter(approx_fg, onfinish);
}

// Context to retry discover_path(CInode*) once an obstacle clears.
struct C_MDC_RetryDiscoverPath : public MDCacheContext {
  CInode *base;
  snapid_t snapid;
  filepath path;
  mds_rank_t from;
  C_MDC_RetryDiscoverPath(MDCache *c, CInode *b, snapid_t s, filepath &p, mds_rank_t f) :
    MDCacheContext(c), base(b), snapid(s), path(p), from(f) {}
  void finish(int r) override {
    mdcache->discover_path(base, snapid, path, 0, from);
  }
};

// Discover want_path (relative to inode 'base') at snap 'snap' from another
// MDS; onfinish is queued as a dir waiter on the first path component.
void MDCache::discover_path(CInode *base,
                            snapid_t snap,
                            filepath want_path,
                            MDSContext *onfinish,
                            bool want_xlocked,
                            mds_rank_t from)
{
  if (from < 0)
    from = base->authority().first;

  dout(7) << "discover_path " << base->ino() << " " << want_path << " snap " << snap << " from mds." << from
          << (want_xlocked ?
" want_xlocked":"")
          << dendl;

  if (base->is_ambiguous_auth()) {
    // auth is migrating -- wait until it settles before asking anyone
    dout(10) << " waiting for single auth on " << *base << dendl;
    if (!onfinish)
      onfinish = new C_MDC_RetryDiscoverPath(this, base, snap, want_path, from);
    base->add_waiter(CInode::WAIT_SINGLEAUTH, onfinish);
    return;
  } else if (from == mds->get_nodeid()) {
    // we became auth ourselves; just wake existing waiters
    MDSContext::vec finished;
    base->take_waiting(CInode::WAIT_DIR, finished);
    mds->queue_waiters(finished);
    return;
  }

  frag_t fg = base->pick_dirfrag(want_path[0]);
  if ((want_xlocked && want_path.depth() == 1) ||
      !base->is_waiting_for_dir(fg) || !onfinish) {
    discover_info_t& d = _create_discover(from);
    d.ino = base->ino();
    d.pin_base(base);
    d.frag = fg;
    d.snap = snap;
    d.want_path = want_path;
    d.want_base_dir = true;
    d.want_xlocked = want_xlocked;
    _send_discover(d);
  }

  // register + wait
  if (onfinish)
    base->add_dir_waiter(fg, onfinish);
}

// Context to retry discover_path(CDir*) once an obstacle clears.
struct C_MDC_RetryDiscoverPath2 : public MDCacheContext {
  CDir *base;
  snapid_t snapid;
  filepath path;
  C_MDC_RetryDiscoverPath2(MDCache *c, CDir *b, snapid_t s, filepath &p) :
    MDCacheContext(c), base(b), snapid(s), path(p) {}
  void finish(int r) override {
    mdcache->discover_path(base, snapid, path, 0);
  }
};

// Discover want_path relative to dirfrag 'base' (base dir itself is not
// requested); onfinish is queued as a dentry waiter on the first component.
void MDCache::discover_path(CDir *base,
                            snapid_t snap,
                            filepath want_path,
                            MDSContext *onfinish,
                            bool want_xlocked)
{
  mds_rank_t from = base->authority().first;

  dout(7) << "discover_path " << base->dirfrag() << " " << want_path << " snap " << snap << " from mds." << from
          << (want_xlocked ? " want_xlocked":"")
          << dendl;

  if (base->is_ambiguous_auth()) {
    dout(7) << " waiting for single auth on " << *base << dendl;
    if (!onfinish)
      onfinish = new C_MDC_RetryDiscoverPath2(this, base, snap, want_path);
    base->add_waiter(CDir::WAIT_SINGLEAUTH, onfinish);
    return;
  } else if (from == mds->get_nodeid()) {
    MDSContext::vec finished;
    base->take_sub_waiting(finished);
    mds->queue_waiters(finished);
    return;
  }

  if ((want_xlocked && want_path.depth() == 1) ||
      !base->is_waiting_for_dentry(want_path[0].c_str(), snap) || !onfinish) {
    discover_info_t& d = _create_discover(from);
    d.ino = base->ino();
    d.pin_base(base->inode);
    d.frag = base->get_frag();
    d.snap = snap;
    d.want_path = want_path;
    d.want_base_dir = false;
    d.want_xlocked = want_xlocked;
    _send_discover(d);
  }

  // register + wait
  if (onfinish)
    base->add_dentry_waiter(want_path[0], snap, onfinish);
}

// Resend all in-flight discovers that target 'who' (e.g. after it restarts).
void MDCache::kick_discovers(mds_rank_t who)
{
  for (map<ceph_tid_t,discover_info_t>::iterator p = discovers.begin();
       p != discovers.end();
       ++p) {
    if (p->second.mds != who)
      continue;
    _send_discover(p->second);
  }
}


// Serve a discover request from a peer MDS: walk the requested path,
// appending replicated (dir, dentry, inode) triples to the reply, stopping
// (or waiting, or hinting) at frozen/non-auth/xlocked obstacles.
void MDCache::handle_discover(const MDiscover::const_ref &dis)
{
  mds_rank_t whoami = mds->get_nodeid();
  mds_rank_t from = mds_rank_t(dis->get_source().num());

  ceph_assert(from != whoami);

  if (mds->get_state() <= MDSMap::STATE_REJOIN) {
    if (mds->get_state() < MDSMap::STATE_REJOIN &&
        mds->get_want_state() < CEPH_MDS_STATE_REJOIN) {
      return;
    }

    // proceed if requester is in the REJOIN stage, the request is from parallel_fetch().
    // delay processing request from survivor because we may not yet choose lock states.
    if (!mds->mdsmap->is_rejoin(from)) {
      dout(0) << "discover_reply not yet active(|still rejoining), delaying" << dendl;
      mds->wait_for_replay(new C_MDS_RetryMessage(mds, dis));
      return;
    }
  }


  CInode *cur = 0;
  auto reply = MDiscoverReply::create(*dis);

  snapid_t snapid = dis->get_snapid();

  // get started.
  if (MDS_INO_IS_BASE(dis->get_base_ino()) &&
      !dis->wants_base_dir() && dis->get_want().depth() == 0) {
    // wants root
    dout(7) << "handle_discover from mds." << from
            << " wants base + " << dis->get_want().get_path()
            << " snap " << snapid
            << dendl;

    cur = get_inode(dis->get_base_ino());
    ceph_assert(cur);

    // add root
    reply->starts_with = MDiscoverReply::INODE;
    replicate_inode(cur, from, reply->trace, mds->mdsmap->get_up_features());
    dout(10) << "added base " << *cur << dendl;
  }
  else {
    // there's a base inode
    cur = get_inode(dis->get_base_ino(), snapid);
    if (!cur && snapid != CEPH_NOSNAP) {
      // fall back to the head inode if it covers multiple snaps
      cur = get_inode(dis->get_base_ino());
      if (cur && !cur->is_multiversion())
        cur = NULL;  // nope!
    }

    if (!cur) {
      dout(7) << "handle_discover mds." << from
              << " don't have base ino " << dis->get_base_ino() << "." << snapid
              << dendl;
      if (!dis->wants_base_dir() && dis->get_want().depth() > 0)
        reply->set_error_dentry(dis->get_dentry(0));
      reply->set_flag_error_dir();
    } else if (dis->wants_base_dir()) {
      dout(7) << "handle_discover mds." << from
              << " wants basedir+" << dis->get_want().get_path()
              << " has " << *cur
              << dendl;
    } else {
      dout(7) << "handle_discover mds." << from
              << " wants " << dis->get_want().get_path()
              << " has " << *cur
              << dendl;
    }
  }

  ceph_assert(reply);

  // add content
  // do some fidgeting to include a dir if they asked for the base dir, or just root.
  for (unsigned i = 0;
       cur && (i < dis->get_want().depth() || dis->get_want().depth() == 0);
       i++) {

    // -- figure out the dir

    // is *cur even a dir at all?
    // (handle_discover continued: body of the per-path-component loop)
    if (!cur->is_dir()) {
      dout(7) << *cur << " not a dir" << dendl;
      reply->set_flag_error_dir();
      break;
    }

    // pick frag
    frag_t fg;
    if (dis->get_want().depth()) {
      // dentry specifies
      fg = cur->pick_dirfrag(dis->get_dentry(i));
    } else {
      // requester explicitly specified the frag
      ceph_assert(dis->wants_base_dir() || MDS_INO_IS_BASE(dis->get_base_ino()));
      fg = dis->get_base_dir_frag();
      if (!cur->dirfragtree.is_leaf(fg))
        fg = cur->dirfragtree[fg.value()];
    }
    CDir *curdir = cur->get_dirfrag(fg);

    if ((!curdir && !cur->is_auth()) ||
        (curdir && !curdir->is_auth())) {

      /* before:
       * ONLY set flag if empty!!
       * otherwise requester will wake up waiter(s) _and_ continue with discover,
       * resulting in duplicate discovers in flight,
       * which can wreak havoc when discovering rename srcdn (which may move)
       */

      if (reply->is_empty()) {
        // only hint if empty.
        //  someday this could be better, but right now the waiter logic isn't smart enough.

        // hint
        if (curdir) {
          dout(7) << " not dirfrag auth, setting dir_auth_hint for " << *curdir << dendl;
          reply->set_dir_auth_hint(curdir->authority().first);
        } else {
          dout(7) << " dirfrag not open, not inode auth, setting dir_auth_hint for "
                  << *cur << dendl;
          reply->set_dir_auth_hint(cur->authority().first);
        }

        // note error dentry, if any
        //  NOTE: important, as it allows requester to issue an equivalent discover
        //        to whomever we hint at.
        if (dis->get_want().depth() > i)
          reply->set_error_dentry(dis->get_dentry(i));
      }

      break;
    }

    if (!curdir) { // open dir?
      if (cur->is_frozen()) {
        if (!reply->is_empty()) {
          dout(7) << *cur << " is frozen, non-empty reply, stopping" << dendl;
          break;
        }
        dout(7) << *cur << " is frozen, empty reply, waiting" << dendl;
        cur->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryMessage(mds, dis));
        return;
      }
      curdir = cur->get_or_open_dirfrag(this, fg);
    } else if (curdir->is_frozen_tree() ||
               (curdir->is_frozen_dir() && fragment_are_all_frozen(curdir))) {
      if (!reply->is_empty()) {
        dout(7) << *curdir << " is frozen, non-empty reply, stopping" << dendl;
        break;
      }
      if (dis->wants_base_dir() && dis->get_base_dir_frag() != curdir->get_frag()) {
        dout(7) << *curdir << " is frozen, dirfrag mismatch, stopping" << dendl;
        reply->set_flag_error_dir();
        break;
      }
      dout(7) << *curdir << " is frozen, empty reply, waiting" << dendl;
      curdir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryMessage(mds, dis));
      return;
    }

    // add dir
    if (curdir->get_version() == 0) {
      // fetch newly opened dir
    } else if (reply->is_empty() && !dis->wants_base_dir()) {
      dout(7) << "handle_discover not adding unwanted base dir " << *curdir << dendl;
      // make sure the base frag is correct, though, in case there was a refragment since the
      // original request was sent.
      reply->set_base_dir_frag(curdir->get_frag());
    } else {
      ceph_assert(!curdir->is_ambiguous_auth()); // would be frozen.
      if (!reply->trace.length())
        reply->starts_with = MDiscoverReply::DIR;
      replicate_dir(curdir, from, reply->trace);
      dout(7) << "handle_discover added dir " << *curdir << dendl;
    }

    // lookup
    CDentry *dn = 0;
    if (curdir->get_version() == 0) {
      // fetch newly opened dir
      ceph_assert(!curdir->has_bloom());
    } else if (dis->get_want().depth() > 0) {
      // lookup dentry
      dn = curdir->lookup(dis->get_dentry(i), snapid);
    } else
      break; // done!

    // incomplete dir?
    if (!dn) {
      if (!curdir->is_complete() &&
          !(snapid == CEPH_NOSNAP &&
            curdir->has_bloom() &&
            !curdir->is_in_bloom(dis->get_dentry(i)))) {
        // readdir
        dout(7) << "incomplete dir contents for " << *curdir << ", fetching" << dendl;
        if (reply->is_empty()) {
          // fetch and wait
          curdir->fetch(new C_MDS_RetryMessage(mds, dis),
                        dis->wants_base_dir() && curdir->get_version() == 0);
          return;
        } else {
          // initiate fetch, but send what we have so far
          curdir->fetch(0);
          break;
        }
      }

      if (snapid != CEPH_NOSNAP && !reply->is_empty()) {
        dout(7) << "dentry " << dis->get_dentry(i) << " snap " << snapid
                << " dne, non-empty reply, stopping" << dendl;
        break;
      }

      // send null dentry
      dout(7) << "dentry " << dis->get_dentry(i) << " dne, returning null in "
              << *curdir << dendl;
      if (snapid == CEPH_NOSNAP)
        dn = curdir->add_null_dentry(dis->get_dentry(i));
      else
        dn = curdir->add_null_dentry(dis->get_dentry(i), snapid, snapid);
    }
    ceph_assert(dn);

    // don't add replica to purging dentry/inode
    if (dn->state_test(CDentry::STATE_PURGING)) {
      if (reply->is_empty())
        reply->set_flag_error_dn(dis->get_dentry(i));
      break;
    }

    CDentry::linkage_t *dnl = dn->get_linkage();

    // xlocked dentry?
    //  ...always block on non-tail items (they are unrelated)
    //  ...allow xlocked tail discovery _only_ if explicitly requested
    bool tailitem = (dis->get_want().depth() == 0) || (i == dis->get_want().depth() - 1);
    if (dn->lock.is_xlocked()) {
      // is this the last (tail) item in the discover traversal?
      if (tailitem && dis->wants_xlocked()) {
        dout(7) << "handle_discover allowing discovery of xlocked tail " << *dn << dendl;
      } else if (reply->is_empty()) {
        dout(7) << "handle_discover blocking on xlocked " << *dn << dendl;
        dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryMessage(mds, dis));
        return;
      } else {
        dout(7) << "handle_discover non-empty reply, xlocked tail " << *dn << dendl;
        break;
      }
    }

    // frozen inode?
    // (handle_discover continued) same tail-item policy for frozen inodes
    if (dnl->is_primary() && dnl->get_inode()->is_frozen_inode()) {
      if (tailitem && dis->wants_xlocked()) {
        dout(7) << "handle_discover allowing discovery of frozen tail " << *dnl->get_inode() << dendl;
      } else if (reply->is_empty()) {
        dout(7) << *dnl->get_inode() << " is frozen, empty reply, waiting" << dendl;
        dnl->get_inode()->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryMessage(mds, dis));
        return;
      } else {
        dout(7) << *dnl->get_inode() << " is frozen, non-empty reply, stopping" << dendl;
        break;
      }
    }

    // add dentry
    if (!reply->trace.length())
      reply->starts_with = MDiscoverReply::DENTRY;
    replicate_dentry(dn, from, reply->trace);
    dout(7) << "handle_discover added dentry " << *dn << dendl;

    if (!dnl->is_primary()) break;  // stop on null or remote link.

    // add inode
    CInode *next = dnl->get_inode();
    ceph_assert(next->is_auth());

    replicate_inode(next, from, reply->trace, mds->mdsmap->get_up_features());
    dout(7) << "handle_discover added inode " << *next << dendl;

    // descend, keep going.
    cur = next;
    continue;
  }

  // how did we do?
  ceph_assert(!reply->is_empty());
  dout(7) << "handle_discover sending result back to asker mds." << from << dendl;
  mds->send_message(reply, dis->get_connection());
}

// Process a peer's MDiscoverReply: decode the ([dir] dentry inode)* trace
// into replica objects, wake waiters registered on each step, and handle
// error/auth-hint outcomes (possibly re-issuing the discover elsewhere).
void MDCache::handle_discover_reply(const MDiscoverReply::const_ref &m)
{
  /*
  if (mds->get_state() < MDSMap::STATE_ACTIVE) {
    dout(0) << "discover_reply NOT ACTIVE YET" << dendl;
    return;
  }
  */
  dout(7) << "discover_reply " << *m << dendl;
  if (m->is_flag_error_dir())
    dout(7) << " flag error, dir" << dendl;
  if (m->is_flag_error_dn())
    dout(7) << " flag error, dentry = " << m->get_error_dentry() << dendl;

  MDSContext::vec finished, error;
  mds_rank_t from = mds_rank_t(m->get_source().num());

  // starting point
  CInode *cur = get_inode(m->get_base_ino());
  auto p = m->trace.cbegin();

  int next = m->starts_with;

  // decrement discover counters
  if (m->get_tid()) {
    map<ceph_tid_t,discover_info_t>::iterator p = discovers.find(m->get_tid());
    if (p != discovers.end()) {
      dout(10) << " found tid " << m->get_tid() << dendl;
      discovers.erase(p);
    } else {
      dout(10) << " tid " << m->get_tid() << " not found, must be dup reply" << dendl;
    }
  }

  // discover may start with an inode
  if (!p.end() && next == MDiscoverReply::INODE) {
    cur = add_replica_inode(p, NULL, finished);
    dout(7) << "discover_reply got base inode " << *cur << dendl;
    ceph_assert(cur->is_base());

    next = MDiscoverReply::DIR;

    // take waiters?
    if (cur->is_base() &&
        waiting_for_base_ino[from].count(cur->ino())) {
      finished.swap(waiting_for_base_ino[from][cur->ino()]);
      waiting_for_base_ino[from].erase(cur->ino());
    }
  }
  ceph_assert(cur);

  // loop over discover results.
  // indexes follow each ([[dir] dentry] inode)
  // can start, end with any type.
  while (!p.end()) {
    // dir
    frag_t fg;
    CDir *curdir = 0;
    if (next == MDiscoverReply::DIR) {
      curdir = add_replica_dir(p, cur, mds_rank_t(m->get_source().num()), finished);
      if (cur->ino() == m->get_base_ino() && curdir->get_frag() != m->get_base_dir_frag()) {
        // the replier re-based us onto a different (refragmented) frag;
        // wake anyone waiting on the frag we originally asked for
        ceph_assert(m->get_wanted_base_dir());
        cur->take_dir_waiting(m->get_base_dir_frag(), finished);
      }
    } else {
      // note: this can only happen our first way around this loop.
      if (p.end() && m->is_flag_error_dn()) {
        fg = cur->pick_dirfrag(m->get_error_dentry());
        curdir = cur->get_dirfrag(fg);
      } else
        curdir = cur->get_dirfrag(m->get_base_dir_frag());
    }

    if (p.end())
      break;

    // dentry
    CDentry *dn = add_replica_dentry(p, curdir, finished);

    if (p.end())
      break;

    // inode
    cur = add_replica_inode(p, dn, finished);

    next = MDiscoverReply::DIR;
  }

  // dir error?
  // or dir_auth hint?
  if (m->is_flag_error_dir() && !cur->is_dir()) {
    // not a dir.
    cur->take_waiting(CInode::WAIT_DIR, error);
  } else if (m->is_flag_error_dir() || m->get_dir_auth_hint() != CDIR_AUTH_UNKNOWN) {
    mds_rank_t who = m->get_dir_auth_hint();
    if (who == mds->get_nodeid()) who = -1;
    if (who >= 0)
      dout(7) << " dir_auth_hint is " << m->get_dir_auth_hint() << dendl;


    if (m->get_wanted_base_dir()) {
      frag_t fg = m->get_base_dir_frag();
      CDir *dir = cur->get_dirfrag(fg);

      if (cur->is_waiting_for_dir(fg)) {
        if (cur->is_auth())
          cur->take_waiting(CInode::WAIT_DIR, finished);
        else if (dir || !cur->dirfragtree.is_leaf(fg))
          cur->take_dir_waiting(fg, finished);
        else
          discover_dir_frag(cur, fg, 0, who);  // re-discover at the hinted auth
      } else
        dout(7) << " doing nothing, nobody is waiting for dir" << dendl;
    }

    // try again?
    // (handle_discover_reply continued) re-issue the dentry discover at the
    // hinted auth, or hand the waiters their result/error.
    if (m->get_error_dentry().length()) {
      frag_t fg = cur->pick_dirfrag(m->get_error_dentry());
      CDir *dir = cur->get_dirfrag(fg);
      // wanted a dentry
      if (dir && dir->is_waiting_for_dentry(m->get_error_dentry(), m->get_wanted_snapid())) {
        if (dir->is_auth() || dir->lookup(m->get_error_dentry())) {
          dir->take_dentry_waiting(m->get_error_dentry(), m->get_wanted_snapid(),
                                   m->get_wanted_snapid(), finished);
        } else {
          filepath relpath(m->get_error_dentry(), 0);
          discover_path(dir, m->get_wanted_snapid(), relpath, 0, m->get_wanted_xlocked());
        }
      } else
        dout(7) << " doing nothing, have dir but nobody is waiting on dentry "
                << m->get_error_dentry() << dendl;
    }
  } else if (m->is_flag_error_dn()) {
    frag_t fg = cur->pick_dirfrag(m->get_error_dentry());
    CDir *dir = cur->get_dirfrag(fg);
    if (dir) {
      if (dir->is_auth()) {
        dir->take_sub_waiting(finished);
      } else {
        dir->take_dentry_waiting(m->get_error_dentry(), m->get_wanted_snapid(),
                                 m->get_wanted_snapid(), error);
      }
    }
  }

  // waiters
  finish_contexts(g_ceph_context, error, -ENOENT);  // finish errors directly
  mds->queue_waiters(finished);
}



// ----------------------------
// REPLICAS


// Append a replica encoding of dir (dirfrag id + replica state) to bl for
// MDS 'to'.
void MDCache::replicate_dir(CDir *dir, mds_rank_t to, bufferlist& bl)
{
  dirfrag_t df = dir->dirfrag();
  encode(df, bl);
  dir->encode_replica(to, bl);
}

// Append a replica encoding of dn (name + last snapid + replica state) to bl.
void MDCache::replicate_dentry(CDentry *dn, mds_rank_t to, bufferlist& bl)
{
  encode(dn->get_name(), bl);
  encode(dn->last, bl);
  dn->encode_replica(to, bl, mds->get_state() < MDSMap::STATE_ACTIVE);
}

// Append a replica encoding of in (ino + last snapid + replica state) to bl,
// gated on the peer feature set.
void MDCache::replicate_inode(CInode *in, mds_rank_t to, bufferlist& bl,
                              uint64_t features)
{
  encode(in->inode.ino, bl);  // bleh, minor asymmetry here
  encode(in->last, bl);
  in->encode_replica(to, bl, features, mds->get_state() < MDSMap::STATE_ACTIVE);
}

// Decode one replicated dirfrag under diri: update the existing replica or
// instantiate a new CDir (adjusting subtree auth at delegation boundaries)
// and collect any waiters on that frag into 'finished'.
CDir *MDCache::add_replica_dir(bufferlist::const_iterator& p, CInode *diri, mds_rank_t from,
                               MDSContext::vec& finished)
{
  dirfrag_t df;
  decode(df, p);

  ceph_assert(diri->ino() == df.ino);

  // add it (_replica_)
  CDir *dir = diri->get_dirfrag(df.frag);

  if (dir) {
    // had replica. update w/ new nonce.
    dir->decode_replica(p);
    dout(7) << "add_replica_dir had " << *dir << " nonce " << dir->replica_nonce << dendl;
  } else {
    // force frag to leaf in the diri tree
    if (!diri->dirfragtree.is_leaf(df.frag)) {
      dout(7) << "add_replica_dir forcing frag " << df.frag << " to leaf in the fragtree "
              << diri->dirfragtree << dendl;
      diri->dirfragtree.force_to_leaf(g_ceph_context, df.frag);
    }

    // add replica.
    dir = diri->add_dirfrag( new CDir(diri, df.frag, this, false) );
    dir->decode_replica(p);

    // is this a dir_auth delegation boundary?
    if (from != diri->authority().first ||
        diri->is_ambiguous_auth() ||
        diri->is_base())
      adjust_subtree_auth(dir, from);

    dout(7) << "add_replica_dir added " << *dir << " nonce " << dir->replica_nonce << dendl;

    // get waiters
    diri->take_dir_waiting(df.frag, finished);
  }

  return dir;
}

// Decode one replicated dentry in dir: update the existing replica or add a
// null dentry first, then collect matching dentry waiters into 'finished'.
CDentry *MDCache::add_replica_dentry(bufferlist::const_iterator& p, CDir *dir, MDSContext::vec& finished)
{
  string name;
  snapid_t last;
  decode(name, p);
  decode(last, p);

  CDentry *dn = dir->lookup(name, last);

  // have it?
  if (dn) {
    dn->decode_replica(p, false);
    dout(7) << "add_replica_dentry had " << *dn << dendl;
  } else {
    dn = dir->add_null_dentry(name, 1 /* this will get updated below */, last);
    dn->decode_replica(p, true);
    dout(7) << "add_replica_dentry added " << *dn << dendl;
  }

  dir->take_dentry_waiting(name, dn->first, dn->last, finished);

  return dn;
}

// Decode one replicated inode: update the existing replica, or instantiate
// a new non-auth CInode (fixing inode_auth for root/mdsdir) and, if dn is a
// null dentry, link it as dn's primary inode.
CInode *MDCache::add_replica_inode(bufferlist::const_iterator& p, CDentry *dn, MDSContext::vec& finished)
{
  inodeno_t ino;
  snapid_t last;
  decode(ino, p);
  decode(last, p);
  CInode *in = get_inode(ino, last);
  if (!in) {
    in = new CInode(this, false, 1, last);
    in->decode_replica(p, true);
    add_inode(in);
    if (in->ino() == MDS_INO_ROOT)
      in->inode_auth.first = 0;
    else if (in->is_mdsdir())
      in->inode_auth.first = in->ino() - MDS_INO_MDSDIR_OFFSET;
    dout(10) << "add_replica_inode added " << *in << dendl;
    if (dn) {
      ceph_assert(dn->get_linkage()->is_null());
      dn->dir->link_primary_inode(dn, in);
    }
  } else {
    in->decode_replica(p, false);
    dout(10) << "add_replica_inode had " << *in << dendl;
  }

  if (dn) {
    if (!dn->get_linkage()->is_primary() || dn->get_linkage()->get_inode() != in)
      dout(10) << "add_replica_inode different linkage in dentry " << *dn << dendl;
  }

  return in;
}


// Encode the full chain needed to replicate a stray dentry on MDS 'who':
// myin inode, stray dir's parent dir+dentry, stray dir inode, stray dirfrag,
// and finally the stray dentry itself (matches add_replica_stray's decode
// order).
void MDCache::replicate_stray(CDentry *straydn, mds_rank_t who, bufferlist& bl)
{
  uint64_t features = mds->mdsmap->get_up_features();
  replicate_inode(get_myin(), who, bl, features);
  replicate_dir(straydn->get_dir()->inode->get_parent_dn()->get_dir(), who, bl);
  replicate_dentry(straydn->get_dir()->inode->get_parent_dn(), who, bl);
  replicate_inode(straydn->get_dir()->inode, who, bl, features);
  replicate_dir(straydn->get_dir(), who, bl);
  replicate_dentry(straydn, who, bl);
}

// Decode the chain produced by replicate_stray() and return the replicated
// stray dentry; decode order must mirror the encode order above.
CDentry *MDCache::add_replica_stray(const bufferlist &bl, mds_rank_t from)
{
  MDSContext::vec finished;
  auto p = bl.cbegin();

  CInode *mdsin = add_replica_inode(p, NULL, finished);
  CDir *mdsdir
= add_replica_dir(p, mdsin, from, finished); + CDentry *straydirdn = add_replica_dentry(p, mdsdir, finished); + CInode *strayin = add_replica_inode(p, straydirdn, finished); + CDir *straydir = add_replica_dir(p, strayin, from, finished); + CDentry *straydn = add_replica_dentry(p, straydir, finished); + if (!finished.empty()) + mds->queue_waiters(finished); + + return straydn; +} + + +int MDCache::send_dir_updates(CDir *dir, bool bcast) +{ + // this is an FYI, re: replication + + set<mds_rank_t> who; + if (bcast) { + mds->get_mds_map()->get_active_mds_set(who); + } else { + for (const auto &p : dir->get_replicas()) { + who.insert(p.first); + } + } + + dout(7) << "sending dir_update on " << *dir << " bcast " << bcast << " to " << who << dendl; + + filepath path; + dir->inode->make_path(path); + + mds_rank_t whoami = mds->get_nodeid(); + for (set<mds_rank_t>::iterator it = who.begin(); + it != who.end(); + ++it) { + if (*it == whoami) continue; + //if (*it == except) continue; + dout(7) << "sending dir_update on " << *dir << " to " << *it << dendl; + + std::set<int32_t> s; + for (const auto &r : dir->dir_rep_by) { + s.insert(r); + } + mds->send_message_mds(MDirUpdate::create(mds->get_nodeid(), dir->dirfrag(), dir->dir_rep, s, path, bcast), *it); + } + + return 0; +} + +void MDCache::handle_dir_update(const MDirUpdate::const_ref &m) +{ + dirfrag_t df = m->get_dirfrag(); + CDir *dir = get_dirfrag(df); + if (!dir) { + dout(5) << "dir_update on " << df << ", don't have it" << dendl; + + // discover it? + if (m->should_discover()) { + // only try once! + // this is key to avoid a fragtree update race, among other things. 
      m->inc_tried_discover();
      vector<CDentry*> trace;
      CInode *in;
      filepath path = m->get_path();
      dout(5) << "trying discover on dir_update for " << path << dendl;
      CF_MDS_RetryMessageFactory cf(mds, m);
      MDRequestRef null_ref;
      int r = path_traverse(null_ref, cf, path, &trace, &in, MDS_TRAVERSE_DISCOVER);
      if (r > 0)
        return;  // traversal in progress; the retry factory re-dispatches this message
      if (r == 0 &&
          in->ino() == df.ino &&
          in->get_approx_dirfrag(df.frag) == NULL) {
        // we have the inode but not the dirfrag: fetch it, then retry
        open_remote_dirfrag(in, df.frag, new C_MDS_RetryMessage(mds, m));
        return;
      }
    }

    return;
  }

  if (!m->has_tried_discover()) {
    // Update if it already exists. Otherwise it got updated by discover reply.
    dout(5) << "dir_update on " << *dir << dendl;
    dir->dir_rep = m->get_dir_rep();
    dir->dir_rep_by.clear();
    for (const auto &e : m->get_dir_rep_by()) {
      dir->dir_rep_by.insert(e);
    }
  }
}




// LINK

// Tell replicas of dn's dir that dn has been linked.  For a primary
// linkage the inode replica is bundled into the message; for a remote
// linkage only the (ino, d_type) pair is encoded.  (Rename) witnesses
// and peers that have not completed rejoin are skipped.
void MDCache::send_dentry_link(CDentry *dn, MDRequestRef& mdr)
{
  dout(7) << "send_dentry_link " << *dn << dendl;

  CDir *subtree = get_subtree_root(dn->get_dir());
  for (const auto &p : dn->get_replicas()) {
    // don't tell (rename) witnesses; they already know
    if (mdr.get() && mdr->more()->witnessed.count(p.first))
      continue;
    // skip peers that have not finished rejoin yet
    if (mds->mdsmap->get_state(p.first) < MDSMap::STATE_REJOIN ||
        (mds->mdsmap->get_state(p.first) == MDSMap::STATE_REJOIN &&
         rejoin_gather.count(p.first)))
      continue;
    CDentry::linkage_t *dnl = dn->get_linkage();
    auto m = MDentryLink::create(subtree->dirfrag(), dn->get_dir()->dirfrag(), dn->get_name(), dnl->is_primary());
    if (dnl->is_primary()) {
      dout(10) << " primary " << *dnl->get_inode() << dendl;
      replicate_inode(dnl->get_inode(), p.first, m->bl,
                      mds->mdsmap->get_up_features());
    } else if (dnl->is_remote()) {
      inodeno_t ino = dnl->get_remote_ino();
      __u8 d_type = dnl->get_remote_d_type();
      dout(10) << " remote " << ino << " " << d_type << dendl;
      encode(ino, m->bl);
      encode(d_type, m->bl);
    } else
      ceph_abort();   // aie, bad caller!
+ mds->send_message_mds(m, p.first); + } +} + +void MDCache::handle_dentry_link(const MDentryLink::const_ref &m) +{ + CDentry *dn = NULL; + CDir *dir = get_dirfrag(m->get_dirfrag()); + if (!dir) { + dout(7) << "handle_dentry_link don't have dirfrag " << m->get_dirfrag() << dendl; + } else { + dn = dir->lookup(m->get_dn()); + if (!dn) { + dout(7) << "handle_dentry_link don't have dentry " << *dir << " dn " << m->get_dn() << dendl; + } else { + dout(7) << "handle_dentry_link on " << *dn << dendl; + CDentry::linkage_t *dnl = dn->get_linkage(); + + ceph_assert(!dn->is_auth()); + ceph_assert(dnl->is_null()); + } + } + + auto p = m->bl.cbegin(); + MDSContext::vec finished; + if (dn) { + if (m->get_is_primary()) { + // primary link. + add_replica_inode(p, dn, finished); + } else { + // remote link, easy enough. + inodeno_t ino; + __u8 d_type; + decode(ino, p); + decode(d_type, p); + dir->link_remote_inode(dn, ino, d_type); + } + } else { + ceph_abort(); + } + + if (!finished.empty()) + mds->queue_waiters(finished); + + return; +} + + +// UNLINK + +void MDCache::send_dentry_unlink(CDentry *dn, CDentry *straydn, MDRequestRef& mdr) +{ + dout(10) << "send_dentry_unlink " << *dn << dendl; + // share unlink news with replicas + set<mds_rank_t> replicas; + dn->list_replicas(replicas); + bufferlist snapbl; + if (straydn) { + straydn->list_replicas(replicas); + CInode *strayin = straydn->get_linkage()->get_inode(); + strayin->encode_snap_blob(snapbl); + } + for (set<mds_rank_t>::iterator it = replicas.begin(); + it != replicas.end(); + ++it) { + // don't tell (rmdir) witnesses; they already know + if (mdr.get() && mdr->more()->witnessed.count(*it)) + continue; + + if (mds->mdsmap->get_state(*it) < MDSMap::STATE_REJOIN || + (mds->mdsmap->get_state(*it) == MDSMap::STATE_REJOIN && + rejoin_gather.count(*it))) + continue; + + auto unlink = MDentryUnlink::create(dn->get_dir()->dirfrag(), dn->get_name()); + if (straydn) { + replicate_stray(straydn, *it, unlink->straybl); + 
      unlink->snapbl = snapbl;
    }
    mds->send_message_mds(unlink, *it);
  }
}

// Replica-side handler for an unlink broadcast from the auth mds.
// For a primary dentry the inode is re-linked under the replicated
// stray dentry (decoded from m->straybl above via add_replica_stray);
// for a remote dentry the link is simply dropped.
void MDCache::handle_dentry_unlink(const MDentryUnlink::const_ref &m)
{
  // straydn
  CDentry *straydn = NULL;
  if (m->straybl.length())
    straydn = add_replica_stray(m->straybl, mds_rank_t(m->get_source().num()));

  CDir *dir = get_dirfrag(m->get_dirfrag());
  if (!dir) {
    dout(7) << "handle_dentry_unlink don't have dirfrag " << m->get_dirfrag() << dendl;
  } else {
    CDentry *dn = dir->lookup(m->get_dn());
    if (!dn) {
      dout(7) << "handle_dentry_unlink don't have dentry " << *dir << " dn " << m->get_dn() << dendl;
    } else {
      dout(7) << "handle_dentry_unlink on " << *dn << dendl;
      CDentry::linkage_t *dnl = dn->get_linkage();

      // open inode?
      if (dnl->is_primary()) {
        CInode *in = dnl->get_inode();
        dn->dir->unlink_inode(dn);
        ceph_assert(straydn);
        straydn->dir->link_primary_inode(straydn, in);

        // in->first is lazily updated on replica; drag it forward so
        // that we always keep it in sync with the dentry
        ceph_assert(straydn->first >= in->first);
        in->first = straydn->first;

        // update subtree map?
        if (in->is_dir())
          adjust_subtree_after_rename(in, dir, false);

        if (m->snapbl.length()) {
          bool hadrealm = (in->snaprealm ?
true : false); + in->decode_snap_blob(m->snapbl); + ceph_assert(in->snaprealm); + ceph_assert(in->snaprealm->have_past_parents_open()); + if (!hadrealm) + do_realm_invalidate_and_update_notify(in, CEPH_SNAP_OP_SPLIT, false); + } + + // send caps to auth (if we're not already) + if (in->is_any_caps() && + !in->state_test(CInode::STATE_EXPORTINGCAPS)) + migrator->export_caps(in); + + straydn = NULL; + } else { + ceph_assert(!straydn); + ceph_assert(dnl->is_remote()); + dn->dir->unlink_inode(dn); + } + ceph_assert(dnl->is_null()); + } + } + + // race with trim_dentry() + if (straydn) { + ceph_assert(straydn->get_num_ref() == 0); + ceph_assert(straydn->get_linkage()->is_null()); + expiremap ex; + trim_dentry(straydn, ex); + send_expire_messages(ex); + } +} + + + + + + +// =================================================================== + + + +// =================================================================== +// FRAGMENT + + +/** + * adjust_dir_fragments -- adjust fragmentation for a directory + * + * @param diri directory inode + * @param basefrag base fragment + * @param bits bit adjustment. positive for split, negative for merge. + */ +void MDCache::adjust_dir_fragments(CInode *diri, frag_t basefrag, int bits, + list<CDir*>& resultfrags, + MDSContext::vec& waiters, + bool replay) +{ + dout(10) << "adjust_dir_fragments " << basefrag << " " << bits + << " on " << *diri << dendl; + + list<CDir*> srcfrags; + diri->get_dirfrags_under(basefrag, srcfrags); + + adjust_dir_fragments(diri, srcfrags, basefrag, bits, resultfrags, waiters, replay); +} + +CDir *MDCache::force_dir_fragment(CInode *diri, frag_t fg, bool replay) +{ + CDir *dir = diri->get_dirfrag(fg); + if (dir) + return dir; + + dout(10) << "force_dir_fragment " << fg << " on " << *diri << dendl; + + list<CDir*> src, result; + MDSContext::vec waiters; + + // split a parent? 
+ frag_t parent = diri->dirfragtree.get_branch_or_leaf(fg); + while (1) { + CDir *pdir = diri->get_dirfrag(parent); + if (pdir) { + int split = fg.bits() - parent.bits(); + dout(10) << " splitting parent by " << split << " " << *pdir << dendl; + src.push_back(pdir); + adjust_dir_fragments(diri, src, parent, split, result, waiters, replay); + dir = diri->get_dirfrag(fg); + if (dir) { + dout(10) << "force_dir_fragment result " << *dir << dendl; + break; + } + } + if (parent == frag_t()) + break; + frag_t last = parent; + parent = parent.parent(); + dout(10) << " " << last << " parent is " << parent << dendl; + } + + if (!dir) { + // hoover up things under fg? + diri->get_dirfrags_under(fg, src); + if (src.empty()) { + dout(10) << "force_dir_fragment no frags under " << fg << dendl; + } else { + dout(10) << " will combine frags under " << fg << ": " << src << dendl; + adjust_dir_fragments(diri, src, fg, 0, result, waiters, replay); + dir = result.front(); + dout(10) << "force_dir_fragment result " << *dir << dendl; + } + } + if (!replay) + mds->queue_waiters(waiters); + return dir; +} + +void MDCache::adjust_dir_fragments(CInode *diri, + list<CDir*>& srcfrags, + frag_t basefrag, int bits, + list<CDir*>& resultfrags, + MDSContext::vec& waiters, + bool replay) +{ + dout(10) << "adjust_dir_fragments " << basefrag << " bits " << bits + << " srcfrags " << srcfrags + << " on " << *diri << dendl; + + // adjust fragtree + // yuck. we may have discovered the inode while it was being fragmented. 
+ if (!diri->dirfragtree.is_leaf(basefrag)) + diri->dirfragtree.force_to_leaf(g_ceph_context, basefrag); + + if (bits > 0) + diri->dirfragtree.split(basefrag, bits); + dout(10) << " new fragtree is " << diri->dirfragtree << dendl; + + if (srcfrags.empty()) + return; + + // split + CDir *parent_dir = diri->get_parent_dir(); + CDir *parent_subtree = 0; + if (parent_dir) + parent_subtree = get_subtree_root(parent_dir); + + if (bits > 0) { + // SPLIT + ceph_assert(srcfrags.size() == 1); + CDir *dir = srcfrags.front(); + + dir->split(bits, resultfrags, waiters, replay); + + // did i change the subtree map? + if (dir->is_subtree_root()) { + // new frags are now separate subtrees + for (list<CDir*>::iterator p = resultfrags.begin(); + p != resultfrags.end(); + ++p) + subtrees[*p].clear(); // new frag is now its own subtree + + // was i a bound? + if (parent_subtree) { + ceph_assert(subtrees[parent_subtree].count(dir)); + subtrees[parent_subtree].erase(dir); + for (list<CDir*>::iterator p = resultfrags.begin(); + p != resultfrags.end(); + ++p) { + ceph_assert((*p)->is_subtree_root()); + subtrees[parent_subtree].insert(*p); + } + } + + // adjust my bounds. + set<CDir*> bounds; + bounds.swap(subtrees[dir]); + subtrees.erase(dir); + for (set<CDir*>::iterator p = bounds.begin(); + p != bounds.end(); + ++p) { + CDir *frag = get_subtree_root((*p)->get_parent_dir()); + subtrees[frag].insert(*p); + } + + show_subtrees(10); + } + + diri->close_dirfrag(dir->get_frag()); + + } else { + // MERGE + + // are my constituent bits subtrees? if so, i will be too. + // (it's all or none, actually.) 
    bool any_subtree = false, any_non_subtree = false;
    for (CDir *dir : srcfrags) {
      if (dir->is_subtree_root())
        any_subtree = true;
      else
        any_non_subtree = true;
    }
    // mixing subtree-root and non-subtree-root srcfrags is not allowed
    ceph_assert(!any_subtree || !any_non_subtree);

    set<CDir*> new_bounds;
    if (any_subtree) {
      for (CDir *dir : srcfrags) {
        // this simplifies the code that finds subtrees underneath the dirfrag
        if (!dir->is_subtree_root()) {
          dir->state_set(CDir::STATE_AUXSUBTREE);
          adjust_subtree_auth(dir, mds->get_nodeid());
        }
      }

      // collect each srcfrag's subtree bounds; the merged frag inherits
      // them all below
      for (CDir *dir : srcfrags) {
        ceph_assert(dir->is_subtree_root());
        dout(10) << " taking srcfrag subtree bounds from " << *dir << dendl;
        map<CDir*, set<CDir*> >::iterator q = subtrees.find(dir);
        set<CDir*>::iterator r = q->second.begin();
        while (r != subtrees[dir].end()) {
          new_bounds.insert(*r);
          subtrees[dir].erase(r++);  // post-increment: erase without invalidating r
        }
        subtrees.erase(q);

        // remove myself as my parent's bound
        if (parent_subtree)
          subtrees[parent_subtree].erase(dir);
      }
    }

    // merge
    CDir *f = new CDir(diri, basefrag, this, srcfrags.front()->is_auth());
    f->merge(srcfrags, waiters, replay);

    if (any_subtree) {
      // the merged frag takes over as subtree root with the union of bounds
      ceph_assert(f->is_subtree_root());
      subtrees[f].swap(new_bounds);
      if (parent_subtree)
        subtrees[parent_subtree].insert(f);

      show_subtrees(10);
    }

    resultfrags.push_back(f);
  }
}


// Completion callback: fired once all dirfrags involved in a
// fragmentation are frozen; continues with fragment_frozen().
class C_MDC_FragmentFrozen : public MDSInternalContext {
  MDCache *mdcache;
  MDRequestRef mdr;
public:
  C_MDC_FragmentFrozen(MDCache *m, MDRequestRef& r) :
    MDSInternalContext(m->mds), mdcache(m), mdr(r) {}
  void finish(int r) override {
    mdcache->fragment_frozen(mdr, r);
  }
};

// Check whether the given dirfrags of diri may be split/merged right
// now.  Returns false (with a dout explaining why) if the fs is
// read-only, the cluster is degraded, diri lives in a stray/mdsdir/.ceph
// directory, a scrub is running, or any dirfrag is already fragmenting,
// non-auth, bad, or frozen/freezing.
bool MDCache::can_fragment(CInode *diri, list<CDir*>& dirs)
{
  if (is_readonly()) {
    dout(7) << "can_fragment: read-only FS, no fragmenting for now" << dendl;
    return false;
  }
  if (mds->is_cluster_degraded()) {
    dout(7) << "can_fragment: cluster degraded, no fragmenting for now" << dendl;
    return false;
  }
  if (diri->get_parent_dir() &&
diri->get_parent_dir()->get_inode()->is_stray()) { + dout(7) << "can_fragment: i won't merge|split anything in stray" << dendl; + return false; + } + if (diri->is_mdsdir() || diri->is_stray() || diri->ino() == MDS_INO_CEPH) { + dout(7) << "can_fragment: i won't fragment the mdsdir or straydir or .ceph" << dendl; + return false; + } + + if (diri->scrub_is_in_progress()) { + dout(7) << "can_fragment: scrub in progress" << dendl; + return false; + } + + for (list<CDir*>::iterator p = dirs.begin(); p != dirs.end(); ++p) { + CDir *dir = *p; + if (dir->state_test(CDir::STATE_FRAGMENTING)) { + dout(7) << "can_fragment: already fragmenting " << *dir << dendl; + return false; + } + if (!dir->is_auth()) { + dout(7) << "can_fragment: not auth on " << *dir << dendl; + return false; + } + if (dir->is_bad()) { + dout(7) << "can_fragment: bad dirfrag " << *dir << dendl; + return false; + } + if (dir->is_frozen() || + dir->is_freezing()) { + dout(7) << "can_fragment: can't merge, freezing|frozen. wait for other exports to finish first." 
<< dendl; + return false; + } + } + + return true; +} + +void MDCache::split_dir(CDir *dir, int bits) +{ + dout(7) << __func__ << " " << *dir << " bits " << bits << dendl; + ceph_assert(dir->is_auth()); + CInode *diri = dir->inode; + + list<CDir*> dirs; + dirs.push_back(dir); + + if (!can_fragment(diri, dirs)) { + dout(7) << __func__ << " cannot fragment right now, dropping" << dendl; + return; + } + + if (dir->frag.bits() + bits > 24) { + dout(7) << __func__ << " frag bits > 24, dropping" << dendl; + return; + } + + MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_FRAGMENTDIR); + mdr->more()->fragment_base = dir->dirfrag(); + + ceph_assert(fragments.count(dir->dirfrag()) == 0); + fragment_info_t& info = fragments[dir->dirfrag()]; + info.mdr = mdr; + info.dirs.push_back(dir); + info.bits = bits; + info.last_cum_auth_pins_change = ceph_clock_now(); + + fragment_freeze_dirs(dirs); + // initial mark+complete pass + fragment_mark_and_complete(mdr); +} + +void MDCache::merge_dir(CInode *diri, frag_t frag) +{ + dout(7) << "merge_dir to " << frag << " on " << *diri << dendl; + + list<CDir*> dirs; + if (!diri->get_dirfrags_under(frag, dirs)) { + dout(7) << "don't have all frags under " << frag << " for " << *diri << dendl; + return; + } + + if (diri->dirfragtree.is_leaf(frag)) { + dout(10) << " " << frag << " already a leaf for " << *diri << dendl; + return; + } + + if (!can_fragment(diri, dirs)) + return; + + CDir *first = dirs.front(); + int bits = first->get_frag().bits() - frag.bits(); + dout(10) << " we are merging by " << bits << " bits" << dendl; + + dirfrag_t basedirfrag(diri->ino(), frag); + MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_FRAGMENTDIR); + mdr->more()->fragment_base = basedirfrag; + + ceph_assert(fragments.count(basedirfrag) == 0); + fragment_info_t& info = fragments[basedirfrag]; + info.mdr = mdr; + info.dirs = dirs; + info.bits = -bits; + info.last_cum_auth_pins_change = ceph_clock_now(); + + fragment_freeze_dirs(dirs); + // initial 
mark+complete pass + fragment_mark_and_complete(mdr); +} + +void MDCache::fragment_freeze_dirs(list<CDir*>& dirs) +{ + bool any_subtree = false, any_non_subtree = false; + for (CDir* dir : dirs) { + dir->auth_pin(dir); // until we mark and complete them + dir->state_set(CDir::STATE_FRAGMENTING); + dir->freeze_dir(); + ceph_assert(dir->is_freezing_dir()); + + if (dir->is_subtree_root()) + any_subtree = true; + else + any_non_subtree = true; + } + + if (any_subtree && any_non_subtree) { + // either all dirfrags are subtree roots or all are not. + for (CDir *dir : dirs) { + if (dir->is_subtree_root()) { + ceph_assert(dir->state_test(CDir::STATE_AUXSUBTREE)); + } else { + dir->state_set(CDir::STATE_AUXSUBTREE); + adjust_subtree_auth(dir, mds->get_nodeid()); + } + } + } +} + +class C_MDC_FragmentMarking : public MDCacheContext { + MDRequestRef mdr; +public: + C_MDC_FragmentMarking(MDCache *m, MDRequestRef& r) : MDCacheContext(m), mdr(r) {} + void finish(int r) override { + mdcache->fragment_mark_and_complete(mdr); + } +}; + +void MDCache::fragment_mark_and_complete(MDRequestRef& mdr) +{ + dirfrag_t basedirfrag = mdr->more()->fragment_base; + map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag); + if (it == fragments.end() || it->second.mdr != mdr) { + dout(7) << "fragment_mark_and_complete " << basedirfrag << " must have aborted" << dendl; + request_finish(mdr); + return; + } + + fragment_info_t& info = it->second; + CInode *diri = info.dirs.front()->get_inode(); + dout(10) << "fragment_mark_and_complete " << info.dirs << " on " << *diri << dendl; + + MDSGatherBuilder gather(g_ceph_context); + + for (list<CDir*>::iterator p = info.dirs.begin(); + p != info.dirs.end(); + ++p) { + CDir *dir = *p; + + bool ready = true; + if (!dir->is_complete()) { + dout(15) << " fetching incomplete " << *dir << dendl; + dir->fetch(gather.new_sub(), true); // ignore authpinnability + ready = false; + } else if (dir->get_frag() == frag_t()) { + // The COMPLETE flag 
gets lost if we fragment a new dirfrag, then rollback + // the operation. To avoid CDir::fetch() complaining about missing object, + // we commit new dirfrag first. + if (dir->state_test(CDir::STATE_CREATING)) { + dout(15) << " waiting until new dir gets journaled " << *dir << dendl; + dir->add_waiter(CDir::WAIT_CREATED, gather.new_sub()); + ready = false; + } else if (dir->is_new()) { + dout(15) << " committing new " << *dir << dendl; + ceph_assert(dir->is_dirty()); + dir->commit(0, gather.new_sub(), true); + ready = false; + } + } + if (!ready) + continue; + + if (!dir->state_test(CDir::STATE_DNPINNEDFRAG)) { + dout(15) << " marking " << *dir << dendl; + for (auto &p : dir->items) { + CDentry *dn = p.second; + dn->get(CDentry::PIN_FRAGMENTING); + ceph_assert(!dn->state_test(CDentry::STATE_FRAGMENTING)); + dn->state_set(CDentry::STATE_FRAGMENTING); + } + dir->state_set(CDir::STATE_DNPINNEDFRAG); + dir->auth_unpin(dir); + } else { + dout(15) << " already marked " << *dir << dendl; + } + } + if (gather.has_subs()) { + gather.set_finisher(new C_MDC_FragmentMarking(this, mdr)); + gather.activate(); + return; + } + + for (list<CDir*>::iterator p = info.dirs.begin(); + p != info.dirs.end(); + ++p) { + CDir *dir = *p; + if (!dir->is_frozen_dir()) { + ceph_assert(dir->is_freezing_dir()); + dir->add_waiter(CDir::WAIT_FROZEN, gather.new_sub()); + } + } + if (gather.has_subs()) { + gather.set_finisher(new C_MDC_FragmentFrozen(this, mdr)); + gather.activate(); + // flush log so that request auth_pins are retired + mds->mdlog->flush(); + return; + } + + fragment_frozen(mdr, 0); +} + +void MDCache::fragment_unmark_unfreeze_dirs(list<CDir*>& dirs) +{ + dout(10) << "fragment_unmark_unfreeze_dirs " << dirs << dendl; + for (list<CDir*>::iterator p = dirs.begin(); p != dirs.end(); ++p) { + CDir *dir = *p; + dout(10) << " frag " << *dir << dendl; + + ceph_assert(dir->state_test(CDir::STATE_FRAGMENTING)); + dir->state_clear(CDir::STATE_FRAGMENTING); + + if 
(dir->state_test(CDir::STATE_DNPINNEDFRAG)) { + dir->state_clear(CDir::STATE_DNPINNEDFRAG); + + for (auto &p : dir->items) { + CDentry *dn = p.second; + ceph_assert(dn->state_test(CDentry::STATE_FRAGMENTING)); + dn->state_clear(CDentry::STATE_FRAGMENTING); + dn->put(CDentry::PIN_FRAGMENTING); + } + } else { + dir->auth_unpin(dir); + } + + dir->unfreeze_dir(); + } +} + +bool MDCache::fragment_are_all_frozen(CDir *dir) +{ + ceph_assert(dir->is_frozen_dir()); + map<dirfrag_t,fragment_info_t>::iterator p; + for (p = fragments.lower_bound(dirfrag_t(dir->ino(), 0)); + p != fragments.end() && p->first.ino == dir->ino(); + ++p) { + if (p->first.frag.contains(dir->get_frag())) + return p->second.all_frozen; + } + ceph_abort(); + return false; +} + +void MDCache::fragment_freeze_inc_num_waiters(CDir *dir) +{ + map<dirfrag_t,fragment_info_t>::iterator p; + for (p = fragments.lower_bound(dirfrag_t(dir->ino(), 0)); + p != fragments.end() && p->first.ino == dir->ino(); + ++p) { + if (p->first.frag.contains(dir->get_frag())) { + p->second.num_remote_waiters++; + return; + } + } + ceph_abort(); +} + +void MDCache::find_stale_fragment_freeze() +{ + dout(10) << "find_stale_fragment_freeze" << dendl; + // see comment in Migrator::find_stale_export_freeze() + utime_t now = ceph_clock_now(); + utime_t cutoff = now; + cutoff -= g_conf()->mds_freeze_tree_timeout; + + for (map<dirfrag_t,fragment_info_t>::iterator p = fragments.begin(); + p != fragments.end(); ) { + dirfrag_t df = p->first; + fragment_info_t& info = p->second; + ++p; + if (info.all_frozen) + continue; + CDir *dir; + int total_auth_pins = 0; + for (list<CDir*>::iterator q = info.dirs.begin(); + q != info.dirs.end(); + ++q) { + dir = *q; + if (!dir->state_test(CDir::STATE_DNPINNEDFRAG)) { + total_auth_pins = -1; + break; + } + if (dir->is_frozen_dir()) + continue; + total_auth_pins += dir->get_auth_pins() + dir->get_dir_auth_pins(); + } + if (total_auth_pins < 0) + continue; + if (info.last_cum_auth_pins != total_auth_pins) 
{ + info.last_cum_auth_pins = total_auth_pins; + info.last_cum_auth_pins_change = now; + continue; + } + if (info.last_cum_auth_pins_change >= cutoff) + continue; + dir = info.dirs.front(); + if (info.num_remote_waiters > 0 || + (!dir->inode->is_root() && dir->get_parent_dir()->is_freezing())) { + dout(10) << " cancel fragmenting " << df << " bit " << info.bits << dendl; + list<CDir*> dirs; + info.dirs.swap(dirs); + fragments.erase(df); + fragment_unmark_unfreeze_dirs(dirs); + } + } +} + +class C_MDC_FragmentPrep : public MDCacheLogContext { + MDRequestRef mdr; +public: + C_MDC_FragmentPrep(MDCache *m, MDRequestRef& r) : MDCacheLogContext(m), mdr(r) {} + void finish(int r) override { + mdcache->_fragment_logged(mdr); + } +}; + +class C_MDC_FragmentStore : public MDCacheContext { + MDRequestRef mdr; +public: + C_MDC_FragmentStore(MDCache *m, MDRequestRef& r) : MDCacheContext(m), mdr(r) {} + void finish(int r) override { + mdcache->_fragment_stored(mdr); + } +}; + +class C_MDC_FragmentCommit : public MDCacheLogContext { + dirfrag_t basedirfrag; + MDRequestRef mdr; +public: + C_MDC_FragmentCommit(MDCache *m, dirfrag_t df, const MDRequestRef& r) : + MDCacheLogContext(m), basedirfrag(df), mdr(r) {} + void finish(int r) override { + mdcache->_fragment_committed(basedirfrag, mdr); + } +}; + +class C_IO_MDC_FragmentPurgeOld : public MDCacheIOContext { + dirfrag_t basedirfrag; + int bits; + MDRequestRef mdr; +public: + C_IO_MDC_FragmentPurgeOld(MDCache *m, dirfrag_t f, int b, + const MDRequestRef& r) : + MDCacheIOContext(m), basedirfrag(f), bits(b), mdr(r) {} + void finish(int r) override { + ceph_assert(r == 0 || r == -ENOENT); + mdcache->_fragment_old_purged(basedirfrag, bits, mdr); + } + void print(ostream& out) const override { + out << "fragment_purge_old(" << basedirfrag << ")"; + } +}; + +void MDCache::fragment_frozen(MDRequestRef& mdr, int r) +{ + dirfrag_t basedirfrag = mdr->more()->fragment_base; + map<dirfrag_t,fragment_info_t>::iterator it = 
fragments.find(basedirfrag); + if (it == fragments.end() || it->second.mdr != mdr) { + dout(7) << "fragment_frozen " << basedirfrag << " must have aborted" << dendl; + request_finish(mdr); + return; + } + + ceph_assert(r == 0); + fragment_info_t& info = it->second; + dout(10) << "fragment_frozen " << basedirfrag.frag << " by " << info.bits + << " on " << info.dirs.front()->get_inode() << dendl; + + info.all_frozen = true; + dispatch_fragment_dir(mdr); +} + +void MDCache::dispatch_fragment_dir(MDRequestRef& mdr) +{ + dirfrag_t basedirfrag = mdr->more()->fragment_base; + map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag); + if (it == fragments.end() || it->second.mdr != mdr) { + dout(7) << "dispatch_fragment_dir " << basedirfrag << " must have aborted" << dendl; + request_finish(mdr); + return; + } + + fragment_info_t& info = it->second; + CInode *diri = info.dirs.front()->get_inode(); + + dout(10) << "dispatch_fragment_dir " << basedirfrag << " bits " << info.bits + << " on " << *diri << dendl; + if (!mdr->aborted) { + MutationImpl::LockOpVec lov; + lov.add_wrlock(&diri->dirfragtreelock); + // prevent a racing gather on any other scatterlocks too + lov.add_wrlock(&diri->nestlock); + lov.add_wrlock(&diri->filelock); + if (!mds->locker->acquire_locks(mdr, lov, NULL, true)) + if (!mdr->aborted) + return; + } + + if (mdr->aborted) { + dout(10) << " can't auth_pin " << *diri << ", requeuing dir " + << info.dirs.front()->dirfrag() << dendl; + if (info.bits > 0) + mds->balancer->queue_split(info.dirs.front(), false); + else + mds->balancer->queue_merge(info.dirs.front()); + fragment_unmark_unfreeze_dirs(info.dirs); + fragments.erase(it); + request_finish(mdr); + return; + } + + mdr->ls = mds->mdlog->get_current_segment(); + EFragment *le = new EFragment(mds->mdlog, EFragment::OP_PREPARE, basedirfrag, info.bits); + mds->mdlog->start_entry(le); + + for (list<CDir*>::iterator p = info.dirs.begin(); p != info.dirs.end(); ++p) { + CDir *dir = *p; + 
dirfrag_rollback rollback; + rollback.fnode = dir->fnode; + le->add_orig_frag(dir->get_frag(), &rollback); + } + + // refragment + MDSContext::vec waiters; + adjust_dir_fragments(diri, info.dirs, basedirfrag.frag, info.bits, + info.resultfrags, waiters, false); + if (g_conf()->mds_debug_frag) + diri->verify_dirfrags(); + mds->queue_waiters(waiters); + + for (const auto& fg : le->orig_frags) + ceph_assert(!diri->dirfragtree.is_leaf(fg)); + + le->metablob.add_dir_context(*info.resultfrags.begin()); + for (list<CDir*>::iterator p = info.resultfrags.begin(); + p != info.resultfrags.end(); + ++p) { + if (diri->is_auth()) { + le->metablob.add_fragmented_dir(*p, false, false); + } else { + (*p)->state_set(CDir::STATE_DIRTYDFT); + le->metablob.add_fragmented_dir(*p, false, true); + } + } + + // dft lock + if (diri->is_auth()) { + // journal dirfragtree + auto &pi = diri->project_inode(); + pi.inode.version = diri->pre_dirty(); + journal_dirty_inode(mdr.get(), &le->metablob, diri); + } else { + mds->locker->mark_updated_scatterlock(&diri->dirfragtreelock); + mdr->ls->dirty_dirfrag_dirfragtree.push_back(&diri->item_dirty_dirfrag_dirfragtree); + mdr->add_updated_lock(&diri->dirfragtreelock); + } + + /* + // filelock + mds->locker->mark_updated_scatterlock(&diri->filelock); + mut->ls->dirty_dirfrag_dir.push_back(&diri->item_dirty_dirfrag_dir); + mut->add_updated_lock(&diri->filelock); + + // dirlock + mds->locker->mark_updated_scatterlock(&diri->nestlock); + mut->ls->dirty_dirfrag_nest.push_back(&diri->item_dirty_dirfrag_nest); + mut->add_updated_lock(&diri->nestlock); + */ + + add_uncommitted_fragment(basedirfrag, info.bits, le->orig_frags, mdr->ls); + mds->server->submit_mdlog_entry(le, new C_MDC_FragmentPrep(this, mdr), + mdr, __func__); + mds->mdlog->flush(); +} + +void MDCache::_fragment_logged(MDRequestRef& mdr) +{ + dirfrag_t basedirfrag = mdr->more()->fragment_base; + auto& info = fragments.at(basedirfrag); + CInode *diri = info.resultfrags.front()->get_inode(); + + 
  dout(10) << "fragment_logged " << basedirfrag << " bits " << info.bits
           << " on " << *diri << dendl;
  mdr->mark_event("prepare logged");

  if (diri->is_auth())
    diri->pop_and_dirty_projected_inode(mdr->ls);

  mdr->apply();  // mark scatterlock

  // store resulting frags
  MDSGatherBuilder gather(g_ceph_context, new C_MDC_FragmentStore(this, mdr));

  for (list<CDir*>::iterator p = info.resultfrags.begin();
       p != info.resultfrags.end();
       ++p) {
    CDir *dir = *p;
    dout(10) << " storing result frag " << *dir << dendl;

    // dirty the new frag in the prepare log segment
    dir->mark_dirty(dir->pre_dirty(), mdr->ls);
    dir->mark_new(mdr->ls);

    // freeze and store them too
    dir->auth_pin(this);
    dir->state_set(CDir::STATE_FRAGMENTING);
    dir->commit(0, gather.new_sub(), true);  // ignore authpinnability
  }

  gather.activate();
}

// Called (via C_MDC_FragmentStore) once all resulting frags have been
// committed: replicate the new frags to peers, journal
// EFragment::OP_COMMIT, and unfreeze the results.
void MDCache::_fragment_stored(MDRequestRef& mdr)
{
  dirfrag_t basedirfrag = mdr->more()->fragment_base;
  fragment_info_t &info = fragments.at(basedirfrag);
  CDir *first = info.resultfrags.front();
  CInode *diri = first->get_inode();

  dout(10) << "fragment_stored " << basedirfrag << " bits " << info.bits
           << " on " << *diri << dendl;
  mdr->mark_event("new frags stored");

  // tell peers
  mds_rank_t diri_auth = (first->is_subtree_root() && !diri->is_auth()) ?
                          diri->authority().first : CDIR_AUTH_UNKNOWN;
  for (const auto &p : first->get_replicas()) {
    // skip peers that have not finished rejoin yet
    if (mds->mdsmap->get_state(p.first) < MDSMap::STATE_REJOIN ||
        (mds->mdsmap->get_state(p.first) == MDSMap::STATE_REJOIN &&
         rejoin_gather.count(p.first)))
      continue;

    auto notify = MMDSFragmentNotify::create(basedirfrag, info.bits, mdr->reqid.tid);
    if (diri_auth != CDIR_AUTH_UNKNOWN && // subtree root
        diri_auth != p.first) { // not auth mds of diri
      /*
       * In the normal case, mds does not trim dir inode whose child dirfrags
       * are likely being fragmented (see trim_inode()).
But when fragmenting + * subtree roots, following race can happen: + * + * - mds.a (auth mds of dirfrag) sends fragment_notify message to + * mds.c and drops wrlock on dirfragtreelock. + * - mds.b (auth mds of dir inode) changes dirfragtreelock state to + * SYNC and send lock message mds.c + * - mds.c receives the lock message and changes dirfragtreelock state + * to SYNC + * - mds.c trim dirfrag and dir inode from its cache + * - mds.c receives the fragment_notify message + * + * So we need to ensure replicas have received the notify, then unlock + * the dirfragtreelock. + */ + notify->mark_ack_wanted(); + info.notify_ack_waiting.insert(p.first); + } + + // freshly replicate new dirs to peers + for (list<CDir*>::iterator q = info.resultfrags.begin(); + q != info.resultfrags.end(); + ++q) + replicate_dir(*q, p.first, notify->basebl); + + mds->send_message_mds(notify, p.first); + } + + // journal commit + EFragment *le = new EFragment(mds->mdlog, EFragment::OP_COMMIT, basedirfrag, info.bits); + mds->mdlog->start_submit_entry(le, new C_MDC_FragmentCommit(this, basedirfrag, mdr)); + + + // unfreeze resulting frags + for (list<CDir*>::iterator p = info.resultfrags.begin(); + p != info.resultfrags.end(); + ++p) { + CDir *dir = *p; + dout(10) << " result frag " << *dir << dendl; + + for (auto &p : dir->items) { + CDentry *dn = p.second; + ceph_assert(dn->state_test(CDentry::STATE_FRAGMENTING)); + dn->state_clear(CDentry::STATE_FRAGMENTING); + dn->put(CDentry::PIN_FRAGMENTING); + } + + // unfreeze + dir->unfreeze_dir(); + } + + if (info.notify_ack_waiting.empty()) { + fragment_drop_locks(info); + } else { + mds->locker->drop_locks_for_fragment_unfreeze(mdr.get()); + } +} + +void MDCache::_fragment_committed(dirfrag_t basedirfrag, const MDRequestRef& mdr) +{ + dout(10) << "fragment_committed " << basedirfrag << dendl; + if (mdr) + mdr->mark_event("commit logged"); + + ufragment &uf = uncommitted_fragments.at(basedirfrag); + + // remove old frags + C_GatherBuilder gather( + 
    g_ceph_context,
    new C_OnFinisher(
      new C_IO_MDC_FragmentPurgeOld(this, basedirfrag, uf.bits, mdr),
      mds->finisher));

  SnapContext nullsnapc;
  object_locator_t oloc(mds->mdsmap->get_metadata_pool());
  // remove the now-stale pre-fragmentation dirfrag objects from the
  // metadata pool
  for (const auto& fg : uf.old_frags) {
    object_t oid = CInode::get_object_name(basedirfrag.ino, fg, "");
    ObjectOperation op;
    if (fg == frag_t()) {
      // backtrace object
      dout(10) << " truncate orphan dirfrag " << oid << dendl;
      op.truncate(0);
      op.omap_clear();
    } else {
      dout(10) << " removing orphan dirfrag " << oid << dendl;
      op.remove();
    }
    mds->objecter->mutate(oid, oloc, op, nullsnapc,
                          ceph::real_clock::now(),
                          0, gather.new_sub());
  }

  ceph_assert(gather.has_subs());
  gather.activate();
}

// Called (via C_IO_MDC_FragmentPurgeOld) after the old dirfrag objects
// were purged: journal EFragment::OP_FINISH, bump the split/merge perf
// counters, and let the local fragment_info finish once any outstanding
// notify acks have arrived.
void MDCache::_fragment_old_purged(dirfrag_t basedirfrag, int bits, const MDRequestRef& mdr)
{
  dout(10) << "fragment_old_purged " << basedirfrag << dendl;
  if (mdr)
    mdr->mark_event("old frags purged");

  EFragment *le = new EFragment(mds->mdlog, EFragment::OP_FINISH, basedirfrag, bits);
  mds->mdlog->start_submit_entry(le);

  finish_uncommitted_fragment(basedirfrag, EFragment::OP_FINISH);

  if (mds->logger) {
    if (bits > 0) {
      mds->logger->inc(l_mds_dir_split);
    } else {
      mds->logger->inc(l_mds_dir_merge);
    }
  }

  if (mdr) {
    auto it = fragments.find(basedirfrag);
    ceph_assert(it != fragments.end());
    it->second.finishing = true;
    if (it->second.notify_ack_waiting.empty())
      fragment_maybe_finish(it);
    else
      mdr->mark_event("wating for notify acks");  // NOTE(review): "wating" typo is in a runtime event string; left unchanged
  }
}

// Drop the locks held by the fragment op's internal request and retire it.
void MDCache::fragment_drop_locks(fragment_info_t& info)
{
  mds->locker->drop_locks(info.mdr.get());
  request_finish(info.mdr);
  //info.mdr.reset();
}

// Tear down the fragment_info once OP_FINISH has been journaled
// (finishing == true): clear FRAGMENTING, drop the auth pins taken when
// the result frags were stored, and erase the fragments entry.
void MDCache::fragment_maybe_finish(const fragment_info_iterator& it)
{
  if (!it->second.finishing)
    return;

  // unmark & auth_unpin
  for (const auto &dir : it->second.resultfrags) {
    dir->state_clear(CDir::STATE_FRAGMENTING);
    dir->auth_unpin(this);

    // In case the resulting fragments are
beyond the split size, + // we might need to split them again right away (they could + // have been taking inserts between unfreezing and getting + // here) + mds->balancer->maybe_fragment(dir, false); + } + + fragments.erase(it); +} + + +void MDCache::handle_fragment_notify_ack(const MMDSFragmentNotifyAck::const_ref &ack) +{ + dout(10) << "handle_fragment_notify_ack " << *ack << " from " << ack->get_source() << dendl; + mds_rank_t from = mds_rank_t(ack->get_source().num()); + + if (mds->get_state() < MDSMap::STATE_ACTIVE) { + return; + } + + auto it = fragments.find(ack->get_base_dirfrag()); + if (it == fragments.end() || + it->second.get_tid() != ack->get_tid()) { + dout(10) << "handle_fragment_notify_ack obsolete message, dropping" << dendl; + return; + } + + if (it->second.notify_ack_waiting.erase(from) && + it->second.notify_ack_waiting.empty()) { + fragment_drop_locks(it->second); + fragment_maybe_finish(it); + } +} + +void MDCache::handle_fragment_notify(const MMDSFragmentNotify::const_ref ¬ify) +{ + dout(10) << "handle_fragment_notify " << *notify << " from " << notify->get_source() << dendl; + mds_rank_t from = mds_rank_t(notify->get_source().num()); + + if (mds->get_state() < MDSMap::STATE_REJOIN) { + return; + } + + CInode *diri = get_inode(notify->get_ino()); + if (diri) { + frag_t base = notify->get_basefrag(); + int bits = notify->get_bits(); + +/* + if ((bits < 0 && diri->dirfragtree.is_leaf(base)) || + (bits > 0 && !diri->dirfragtree.is_leaf(base))) { + dout(10) << " dft " << diri->dirfragtree << " state doesn't match " << base << " by " << bits + << ", must have found out during resolve/rejoin? ignoring. 
" << *diri << dendl; + return; + } +*/ + + // refragment + MDSContext::vec waiters; + list<CDir*> resultfrags; + adjust_dir_fragments(diri, base, bits, resultfrags, waiters, false); + if (g_conf()->mds_debug_frag) + diri->verify_dirfrags(); + + for (list<CDir*>::iterator p = resultfrags.begin(); p != resultfrags.end(); ++p) + diri->take_dir_waiting((*p)->get_frag(), waiters); + + // add new replica dirs values + auto p = notify->basebl.cbegin(); + while (!p.end()) + add_replica_dir(p, diri, from, waiters); + + mds->queue_waiters(waiters); + } else { + ceph_abort(); + } + + if (notify->is_ack_wanted()) { + auto ack = MMDSFragmentNotifyAck::create(notify->get_base_dirfrag(), + notify->get_bits(), notify->get_tid()); + mds->send_message_mds(ack, from); + } +} + +void MDCache::add_uncommitted_fragment(dirfrag_t basedirfrag, int bits, const frag_vec_t& old_frags, + LogSegment *ls, bufferlist *rollback) +{ + dout(10) << "add_uncommitted_fragment: base dirfrag " << basedirfrag << " bits " << bits << dendl; + ceph_assert(!uncommitted_fragments.count(basedirfrag)); + ufragment& uf = uncommitted_fragments[basedirfrag]; + uf.old_frags = old_frags; + uf.bits = bits; + uf.ls = ls; + ls->uncommitted_fragments.insert(basedirfrag); + if (rollback) + uf.rollback.swap(*rollback); +} + +void MDCache::finish_uncommitted_fragment(dirfrag_t basedirfrag, int op) +{ + dout(10) << "finish_uncommitted_fragments: base dirfrag " << basedirfrag + << " op " << EFragment::op_name(op) << dendl; + map<dirfrag_t, ufragment>::iterator it = uncommitted_fragments.find(basedirfrag); + if (it != uncommitted_fragments.end()) { + ufragment& uf = it->second; + if (op != EFragment::OP_FINISH && !uf.old_frags.empty()) { + uf.committed = true; + } else { + uf.ls->uncommitted_fragments.erase(basedirfrag); + mds->queue_waiters(uf.waiters); + uncommitted_fragments.erase(it); + } + } +} + +void MDCache::rollback_uncommitted_fragment(dirfrag_t basedirfrag, frag_vec_t&& old_frags) +{ + dout(10) << 
"rollback_uncommitted_fragment: base dirfrag " << basedirfrag + << " old_frags (" << old_frags << ")" << dendl; + map<dirfrag_t, ufragment>::iterator it = uncommitted_fragments.find(basedirfrag); + if (it != uncommitted_fragments.end()) { + ufragment& uf = it->second; + if (!uf.old_frags.empty()) { + uf.old_frags = std::move(old_frags); + uf.committed = true; + } else { + uf.ls->uncommitted_fragments.erase(basedirfrag); + uncommitted_fragments.erase(it); + } + } +} + +void MDCache::wait_for_uncommitted_fragments(MDSContext* finisher) +{ + MDSGatherBuilder gather(g_ceph_context, finisher); + for (auto& p : uncommitted_fragments) { + p.second.waiters.push_back(gather.new_sub()); + } + gather.activate(); +} + +void MDCache::rollback_uncommitted_fragments() +{ + dout(10) << "rollback_uncommitted_fragments: " << uncommitted_fragments.size() << " pending" << dendl; + for (map<dirfrag_t, ufragment>::iterator p = uncommitted_fragments.begin(); + p != uncommitted_fragments.end(); + ++p) { + ufragment &uf = p->second; + CInode *diri = get_inode(p->first.ino); + ceph_assert(diri); + + if (uf.committed) { + _fragment_committed(p->first, MDRequestRef()); + continue; + } + + dout(10) << " rolling back " << p->first << " refragment by " << uf.bits << " bits" << dendl; + + LogSegment *ls = mds->mdlog->get_current_segment(); + EFragment *le = new EFragment(mds->mdlog, EFragment::OP_ROLLBACK, p->first, uf.bits); + mds->mdlog->start_entry(le); + bool diri_auth = (diri->authority() != CDIR_AUTH_UNDEF); + + frag_vec_t old_frags; + diri->dirfragtree.get_leaves_under(p->first.frag, old_frags); + + list<CDir*> resultfrags; + if (uf.old_frags.empty()) { + // created by old format EFragment + MDSContext::vec waiters; + adjust_dir_fragments(diri, p->first.frag, -uf.bits, resultfrags, waiters, true); + } else { + auto bp = uf.rollback.cbegin(); + for (const auto& fg : uf.old_frags) { + CDir *dir = force_dir_fragment(diri, fg); + resultfrags.push_back(dir); + + dirfrag_rollback rollback; + 
decode(rollback, bp); + + dir->set_version(rollback.fnode.version); + dir->fnode = rollback.fnode; + + dir->_mark_dirty(ls); + + if (!(dir->fnode.rstat == dir->fnode.accounted_rstat)) { + dout(10) << " dirty nestinfo on " << *dir << dendl; + mds->locker->mark_updated_scatterlock(&dir->inode->nestlock); + ls->dirty_dirfrag_nest.push_back(&dir->inode->item_dirty_dirfrag_nest); + } + if (!(dir->fnode.fragstat == dir->fnode.accounted_fragstat)) { + dout(10) << " dirty fragstat on " << *dir << dendl; + mds->locker->mark_updated_scatterlock(&dir->inode->filelock); + ls->dirty_dirfrag_dir.push_back(&dir->inode->item_dirty_dirfrag_dir); + } + + le->add_orig_frag(dir->get_frag()); + le->metablob.add_dir_context(dir); + if (diri_auth) { + le->metablob.add_fragmented_dir(dir, true, false); + } else { + dout(10) << " dirty dirfragtree on " << *dir << dendl; + dir->state_set(CDir::STATE_DIRTYDFT); + le->metablob.add_fragmented_dir(dir, true, true); + } + } + } + + if (diri_auth) { + auto &pi = diri->project_inode(); + pi.inode.version = diri->pre_dirty(); + diri->pop_and_dirty_projected_inode(ls); // hacky + le->metablob.add_primary_dentry(diri->get_projected_parent_dn(), diri, true); + } else { + mds->locker->mark_updated_scatterlock(&diri->dirfragtreelock); + ls->dirty_dirfrag_dirfragtree.push_back(&diri->item_dirty_dirfrag_dirfragtree); + } + + if (g_conf()->mds_debug_frag) + diri->verify_dirfrags(); + + for (const auto& leaf : old_frags) { + ceph_assert(!diri->dirfragtree.is_leaf(leaf)); + } + + mds->mdlog->submit_entry(le); + + uf.old_frags.swap(old_frags); + _fragment_committed(p->first, MDRequestRef()); + } +} + +void MDCache::force_readonly() +{ + if (is_readonly()) + return; + + dout(1) << "force file system read-only" << dendl; + mds->clog->warn() << "force file system read-only"; + + set_readonly(); + + mds->server->force_clients_readonly(); + + // revoke write caps + int count = 0; + for (auto &p : inode_map) { + CInode *in = p.second; + if (in->is_head()) + 
mds->locker->eval(in, CEPH_CAP_LOCKS); + if (!(++count % 1000)) + mds->heartbeat_reset(); + } + + mds->mdlog->flush(); +} + + +// ============================================================== +// debug crap + +void MDCache::show_subtrees(int dbl, bool force_print) +{ + if (g_conf()->mds_thrash_exports) + dbl += 15; + + //dout(10) << "show_subtrees" << dendl; + + if (!g_conf()->subsys.should_gather(ceph_subsys_mds, dbl)) + return; // i won't print anything. + + if (subtrees.empty()) { + dout(ceph::dout::need_dynamic(dbl)) << "show_subtrees - no subtrees" + << dendl; + return; + } + + if (!force_print && subtrees.size() > SUBTREES_COUNT_THRESHOLD && + !g_conf()->subsys.should_gather<ceph_subsys_mds, 25>()) { + dout(ceph::dout::need_dynamic(dbl)) << "number of subtrees = " << subtrees.size() << "; not " + "printing subtrees" << dendl; + return; + } + + // root frags + list<CDir*> basefrags; + for (set<CInode*>::iterator p = base_inodes.begin(); + p != base_inodes.end(); + ++p) + (*p)->get_dirfrags(basefrags); + //dout(15) << "show_subtrees, base dirfrags " << basefrags << dendl; + dout(15) << "show_subtrees" << dendl; + + // queue stuff + list<pair<CDir*,int> > q; + string indent; + set<CDir*> seen; + + // calc max depth + for (list<CDir*>::iterator p = basefrags.begin(); p != basefrags.end(); ++p) + q.push_back(pair<CDir*,int>(*p, 0)); + + set<CDir*> subtrees_seen; + + unsigned int depth = 0; + while (!q.empty()) { + CDir *dir = q.front().first; + unsigned int d = q.front().second; + q.pop_front(); + + if (subtrees.count(dir) == 0) continue; + + subtrees_seen.insert(dir); + + if (d > depth) depth = d; + + // sanity check + //dout(25) << "saw depth " << d << " " << *dir << dendl; + if (seen.count(dir)) dout(0) << "aah, already seen " << *dir << dendl; + ceph_assert(seen.count(dir) == 0); + seen.insert(dir); + + // nested items? 
+ if (!subtrees[dir].empty()) { + for (set<CDir*>::iterator p = subtrees[dir].begin(); + p != subtrees[dir].end(); + ++p) { + //dout(25) << " saw sub " << **p << dendl; + q.push_front(pair<CDir*,int>(*p, d+1)); + } + } + } + + if (!force_print && depth > SUBTREES_DEPTH_THRESHOLD && + !g_conf()->subsys.should_gather<ceph_subsys_mds, 25>()) { + dout(ceph::dout::need_dynamic(dbl)) << "max depth among subtrees = " << depth << "; not printing " + "subtrees" << dendl; + return; + } + + // print tree + for (list<CDir*>::iterator p = basefrags.begin(); p != basefrags.end(); ++p) + q.push_back(pair<CDir*,int>(*p, 0)); + + while (!q.empty()) { + CDir *dir = q.front().first; + int d = q.front().second; + q.pop_front(); + + if (subtrees.count(dir) == 0) continue; + + // adjust indenter + while ((unsigned)d < indent.size()) + indent.resize(d); + + // pad + string pad = "______________________________________"; + pad.resize(depth*2+1-indent.size()); + if (!subtrees[dir].empty()) + pad[0] = '.'; // parent + + + string auth; + if (dir->is_auth()) + auth = "auth "; + else + auth = " rep "; + + char s[10]; + if (dir->get_dir_auth().second == CDIR_AUTH_UNKNOWN) + snprintf(s, sizeof(s), "%2d ", int(dir->get_dir_auth().first)); + else + snprintf(s, sizeof(s), "%2d,%2d", int(dir->get_dir_auth().first), int(dir->get_dir_auth().second)); + + // print + dout(ceph::dout::need_dynamic(dbl)) << indent << "|_" << pad << s + << " " << auth << *dir << dendl; + + if (dir->ino() == MDS_INO_ROOT) + ceph_assert(dir->inode == root); + if (dir->ino() == MDS_INO_MDSDIR(mds->get_nodeid())) + ceph_assert(dir->inode == myin); + if (dir->inode->is_stray() && (MDS_INO_STRAY_OWNER(dir->ino()) == mds->get_nodeid())) + ceph_assert(strays[MDS_INO_STRAY_INDEX(dir->ino())] == dir->inode); + + // nested items? + if (!subtrees[dir].empty()) { + // more at my level? 
+ if (!q.empty() && q.front().second == d) + indent += "| "; + else + indent += " "; + + for (set<CDir*>::iterator p = subtrees[dir].begin(); + p != subtrees[dir].end(); + ++p) + q.push_front(pair<CDir*,int>(*p, d+2)); + } + } + + // verify there isn't stray crap in subtree map + int lost = 0; + for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin(); + p != subtrees.end(); + ++p) { + if (subtrees_seen.count(p->first)) continue; + dout(10) << "*** stray/lost entry in subtree map: " << *p->first << dendl; + lost++; + } + ceph_assert(lost == 0); +} + +void MDCache::show_cache() +{ + dout(7) << "show_cache" << dendl; + + auto show_func = [this](CInode *in) { + // unlinked? + if (!in->parent) + dout(7) << " unlinked " << *in << dendl; + + // dirfrags? + list<CDir*> dfs; + in->get_dirfrags(dfs); + for (list<CDir*>::iterator p = dfs.begin(); p != dfs.end(); ++p) { + CDir *dir = *p; + dout(7) << " dirfrag " << *dir << dendl; + + for (auto &p : dir->items) { + CDentry *dn = p.second; + dout(7) << " dentry " << *dn << dendl; + CDentry::linkage_t *dnl = dn->get_linkage(); + if (dnl->is_primary() && dnl->get_inode()) + dout(7) << " inode " << *dnl->get_inode() << dendl; + } + } + }; + + for (auto &p : inode_map) + show_func(p.second); + for (auto &p : snap_inode_map) + show_func(p.second); +} + +void MDCache::cache_status(Formatter *f) +{ + f->open_object_section("cache"); + + f->open_object_section("pool"); + mempool::get_pool(mempool::mds_co::id).dump(f); + f->close_section(); + + f->close_section(); +} + +void MDCache::dump_tree(CInode *in, const int cur_depth, const int max_depth, Formatter *f) +{ + ceph_assert(in); + if ((max_depth >= 0) && (cur_depth > max_depth)) { + return; + } + list<CDir*> ls; + in->get_dirfrags(ls); + for (const auto &subdir : ls) { + for (const auto &p : subdir->items) { + CDentry *dn = p.second; + CInode *in = dn->get_linkage()->get_inode(); + if (in) { + dump_tree(in, cur_depth + 1, max_depth, f); + } + } + } + 
f->open_object_section("inode"); + in->dump(f, CInode::DUMP_DEFAULT | CInode::DUMP_DIRFRAGS); + f->close_section(); +} + +int MDCache::dump_cache(std::string_view file_name) +{ + return dump_cache(file_name, NULL); +} + +int MDCache::dump_cache(Formatter *f) +{ + return dump_cache(std::string_view(""), f); +} + +/** + * Dump the metadata cache, either to a Formatter, if + * provided, else to a plain text file. + */ +int MDCache::dump_cache(std::string_view fn, Formatter *f) +{ + int r = 0; + + // dumping large caches may cause mds to hang or worse get killed. + // so, disallow the dump if the cache size exceeds the configured + // threshold, which is 1G for formatter and unlimited for file (note + // that this can be jacked up by the admin... and is nothing but foot + // shooting, but the option itself is for devs and hence dangerous to + // tune). TODO: remove this when fixed. + uint64_t threshold = f ? + g_conf().get_val<Option::size_t>("mds_dump_cache_threshold_formatter") : + g_conf().get_val<Option::size_t>("mds_dump_cache_threshold_file"); + + if (threshold && cache_size() > threshold) { + if (f) { + std::stringstream ss; + ss << "cache usage exceeds dump threshold"; + f->open_object_section("result"); + f->dump_string("error", ss.str()); + f->close_section(); + } else { + derr << "cache usage exceeds dump threshold" << dendl; + r = -EINVAL; + } + return r; + } + + r = 0; + int fd = -1; + + if (f) { + f->open_array_section("inodes"); + } else { + char path[PATH_MAX] = ""; + if (fn.length()) { + snprintf(path, sizeof path, "%s", fn.data()); + } else { + snprintf(path, sizeof path, "cachedump.%d.mds%d", (int)mds->mdsmap->get_epoch(), int(mds->get_nodeid())); + } + + dout(1) << "dump_cache to " << path << dendl; + + fd = ::open(path, O_WRONLY|O_CREAT|O_EXCL|O_CLOEXEC, 0600); + if (fd < 0) { + derr << "failed to open " << path << ": " << cpp_strerror(errno) << dendl; + return errno; + } + } + + auto dump_func = [fd, f](CInode *in) { + int r; + if (f) { + 
f->open_object_section("inode"); + in->dump(f, CInode::DUMP_DEFAULT | CInode::DUMP_DIRFRAGS); + f->close_section(); + return 1; + } + ostringstream ss; + ss << *in << std::endl; + std::string s = ss.str(); + r = safe_write(fd, s.c_str(), s.length()); + if (r < 0) + return r; + list<CDir*> dfs; + in->get_dirfrags(dfs); + for (auto &dir : dfs) { + ostringstream tt; + tt << " " << *dir << std::endl; + std::string t = tt.str(); + r = safe_write(fd, t.c_str(), t.length()); + if (r < 0) + return r; + for (auto &p : dir->items) { + CDentry *dn = p.second; + ostringstream uu; + uu << " " << *dn << std::endl; + std::string u = uu.str(); + r = safe_write(fd, u.c_str(), u.length()); + if (r < 0) + return r; + } + dir->check_rstats(); + } + return 1; + }; + + for (auto &p : inode_map) { + r = dump_func(p.second); + if (r < 0) + goto out; + } + for (auto &p : snap_inode_map) { + r = dump_func(p.second); + if (r < 0) + goto out; + } + r = 0; + + out: + if (f) { + f->close_section(); // inodes + } else { + ::close(fd); + } + return r; +} + + + +C_MDS_RetryRequest::C_MDS_RetryRequest(MDCache *c, MDRequestRef& r) + : MDSInternalContext(c->mds), cache(c), mdr(r) +{} + +void C_MDS_RetryRequest::finish(int r) +{ + mdr->retry++; + cache->dispatch_request(mdr); +} + + +class C_MDS_EnqueueScrub : public Context +{ + std::string tag; + Formatter *formatter; + Context *on_finish; +public: + ScrubHeaderRef header; + C_MDS_EnqueueScrub(std::string_view tag, Formatter *f, Context *fin) : + tag(tag), formatter(f), on_finish(fin), header(nullptr) {} + + Context *take_finisher() { + Context *fin = on_finish; + on_finish = NULL; + return fin; + } + + void finish(int r) override { + if (r == 0) { + // since recursive scrub is asynchronous, dump minimal output + // to not upset cli tools. 
+ if (header && header->get_recursive()) { + formatter->open_object_section("results"); + formatter->dump_int("return_code", 0); + formatter->dump_string("scrub_tag", tag); + formatter->dump_string("mode", "asynchronous"); + formatter->close_section(); // results + } + } else { // we failed the lookup or something; dump ourselves + formatter->open_object_section("results"); + formatter->dump_int("return_code", r); + formatter->close_section(); // results + r = 0; // already dumped in formatter + } + if (on_finish) + on_finish->complete(r); + } +}; + +void MDCache::enqueue_scrub( + std::string_view path, + std::string_view tag, + bool force, bool recursive, bool repair, + Formatter *f, Context *fin) +{ + dout(10) << __func__ << " " << path << dendl; + MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_ENQUEUE_SCRUB); + if (path == "~mdsdir") { + filepath fp(MDS_INO_MDSDIR(mds->get_nodeid())); + mdr->set_filepath(fp); + } else { + filepath fp(path); + mdr->set_filepath(path); + } + + bool is_internal = false; + std::string tag_str(tag); + if (tag_str.empty()) { + uuid_d uuid_gen; + uuid_gen.generate_random(); + tag_str = uuid_gen.to_string(); + is_internal = true; + } + + C_MDS_EnqueueScrub *cs = new C_MDS_EnqueueScrub(tag_str, f, fin); + cs->header = std::make_shared<ScrubHeader>( + tag_str, is_internal, force, recursive, repair, f); + + mdr->internal_op_finish = cs; + enqueue_scrub_work(mdr); +} + +void MDCache::enqueue_scrub_work(MDRequestRef& mdr) +{ + MutationImpl::LockOpVec lov; + CInode *in = mds->server->rdlock_path_pin_ref(mdr, 0, lov, true); + if (NULL == in) + return; + + // TODO: Remove this restriction + ceph_assert(in->is_auth()); + + bool locked = mds->locker->acquire_locks(mdr, lov); + if (!locked) + return; + + C_MDS_EnqueueScrub *cs = static_cast<C_MDS_EnqueueScrub*>(mdr->internal_op_finish); + ScrubHeaderRef header = cs->header; + + // Cannot scrub same dentry twice at same time + if (in->scrub_is_in_progress()) { + 
mds->server->respond_to_request(mdr, -EBUSY); + return; + } else { + in->scrub_info(); + } + + header->set_origin(in); + + Context *fin; + if (header->get_recursive()) { + header->get_origin()->get(CInode::PIN_SCRUBQUEUE); + fin = new MDSInternalContextWrapper(mds, + new FunctionContext([this, header](int r) { + recursive_scrub_finish(header); + header->get_origin()->put(CInode::PIN_SCRUBQUEUE); + }) + ); + } else { + fin = cs->take_finisher(); + } + + // If the scrub did some repair, then flush the journal at the end of + // the scrub. Otherwise in the case of e.g. rewriting a backtrace + // the on disk state will still look damaged. + auto scrub_finish = new FunctionContext([this, header, fin](int r){ + if (!header->get_repaired()) { + if (fin) + fin->complete(r); + return; + } + + auto flush_finish = new FunctionContext([this, fin](int r){ + dout(4) << "Expiring log segments because scrub did some repairs" << dendl; + mds->mdlog->trim_all(); + + if (fin) { + MDSGatherBuilder gather(g_ceph_context); + auto& expiring_segments = mds->mdlog->get_expiring_segments(); + for (auto logseg : expiring_segments) + logseg->wait_for_expiry(gather.new_sub()); + ceph_assert(gather.has_subs()); + gather.set_finisher(new MDSInternalContextWrapper(mds, fin)); + gather.activate(); + } + }); + + dout(4) << "Flushing journal because scrub did some repairs" << dendl; + mds->mdlog->start_new_segment(); + mds->mdlog->flush(); + mds->mdlog->wait_for_safe(new MDSInternalContextWrapper(mds, flush_finish)); + }); + + if (!header->get_recursive()) { + mds->scrubstack->enqueue_inode_top(in, header, + new MDSInternalContextWrapper(mds, scrub_finish)); + } else { + mds->scrubstack->enqueue_inode_bottom(in, header, + new MDSInternalContextWrapper(mds, scrub_finish)); + } + + mds->server->respond_to_request(mdr, 0); + return; +} + +void MDCache::recursive_scrub_finish(const ScrubHeaderRef& header) +{ + if (header->get_origin()->is_base() && + header->get_force() && header->get_repair()) { + // 
notify snapserver that base directory is recursively scrubbed. + // After both root and mdsdir are recursively scrubbed, snapserver + // knows that all old format snaprealms are converted to the new + // format. + if (mds->mdsmap->get_num_in_mds() == 1 && + mds->mdsmap->get_num_failed_mds() == 0 && + mds->mdsmap->get_tableserver() == mds->get_nodeid()) { + mds->mark_base_recursively_scrubbed(header->get_origin()->ino()); + } + } +} + +struct C_MDC_RespondInternalRequest : public MDCacheLogContext { + MDRequestRef mdr; + C_MDC_RespondInternalRequest(MDCache *c, MDRequestRef& m) : + MDCacheLogContext(c), mdr(m) {} + void finish(int r) override { + mdr->apply(); + get_mds()->server->respond_to_request(mdr, r); + } +}; + +void MDCache::repair_dirfrag_stats(CDir *dir) +{ + MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_REPAIR_FRAGSTATS); + mdr->pin(dir); + mdr->internal_op_private = dir; + mdr->internal_op_finish = new C_MDSInternalNoop; + repair_dirfrag_stats_work(mdr); +} + +void MDCache::repair_dirfrag_stats_work(MDRequestRef& mdr) +{ + CDir *dir = static_cast<CDir*>(mdr->internal_op_private); + dout(10) << __func__ << " " << *dir << dendl; + + if (!dir->is_auth()) { + mds->server->respond_to_request(mdr, -ESTALE); + return; + } + + if (!mdr->is_auth_pinned(dir) && !dir->can_auth_pin()) { + dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(this, mdr)); + + mds->locker->drop_locks(mdr.get()); + mdr->drop_local_auth_pins(); + if (!mdr->remote_auth_pins.empty()) + mds->locker->notify_freeze_waiter(dir); + return; + } + + mdr->auth_pin(dir); + + MutationImpl::LockOpVec lov; + CInode *diri = dir->inode; + lov.add_rdlock(&diri->dirfragtreelock); + lov.add_wrlock(&diri->nestlock); + lov.add_wrlock(&diri->filelock); + if (!mds->locker->acquire_locks(mdr, lov)) + return; + + if (!dir->is_complete()) { + dir->fetch(new C_MDS_RetryRequest(this, mdr)); + return; + } + + frag_info_t frag_info; + nest_info_t nest_info; + for (auto it = dir->begin(); it != 
dir->end(); ++it) { + CDentry *dn = it->second; + if (dn->last != CEPH_NOSNAP) + continue; + CDentry::linkage_t *dnl = dn->get_projected_linkage(); + if (dnl->is_primary()) { + CInode *in = dnl->get_inode(); + nest_info.add(in->get_projected_inode()->accounted_rstat); + if (in->is_dir()) + frag_info.nsubdirs++; + else + frag_info.nfiles++; + } else if (dnl->is_remote()) + frag_info.nfiles++; + } + + fnode_t *pf = dir->get_projected_fnode(); + bool good_fragstat = frag_info.same_sums(pf->fragstat); + bool good_rstat = nest_info.same_sums(pf->rstat); + if (good_fragstat && good_rstat) { + dout(10) << __func__ << " no corruption found" << dendl; + mds->server->respond_to_request(mdr, 0); + return; + } + + pf = dir->project_fnode(); + pf->version = dir->pre_dirty(); + mdr->add_projected_fnode(dir); + + mdr->ls = mds->mdlog->get_current_segment(); + EUpdate *le = new EUpdate(mds->mdlog, "repair_dirfrag"); + mds->mdlog->start_entry(le); + + if (!good_fragstat) { + if (pf->fragstat.mtime > frag_info.mtime) + frag_info.mtime = pf->fragstat.mtime; + if (pf->fragstat.change_attr > frag_info.change_attr) + frag_info.change_attr = pf->fragstat.change_attr; + pf->fragstat = frag_info; + mds->locker->mark_updated_scatterlock(&diri->filelock); + mdr->ls->dirty_dirfrag_dir.push_back(&diri->item_dirty_dirfrag_dir); + mdr->add_updated_lock(&diri->filelock); + } + + if (!good_rstat) { + if (pf->rstat.rctime > nest_info.rctime) + nest_info.rctime = pf->rstat.rctime; + pf->rstat = nest_info; + mds->locker->mark_updated_scatterlock(&diri->nestlock); + mdr->ls->dirty_dirfrag_nest.push_back(&diri->item_dirty_dirfrag_nest); + mdr->add_updated_lock(&diri->nestlock); + } + + le->metablob.add_dir_context(dir); + le->metablob.add_dir(dir, true); + + mds->mdlog->submit_entry(le, new C_MDC_RespondInternalRequest(this, mdr)); +} + +void MDCache::repair_inode_stats(CInode *diri) +{ + MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_REPAIR_INODESTATS); + mdr->pin(diri); + 
mdr->internal_op_private = diri; + mdr->internal_op_finish = new C_MDSInternalNoop; + repair_inode_stats_work(mdr); +} + +void MDCache::repair_inode_stats_work(MDRequestRef& mdr) +{ + CInode *diri = static_cast<CInode*>(mdr->internal_op_private); + dout(10) << __func__ << " " << *diri << dendl; + + if (!diri->is_auth()) { + mds->server->respond_to_request(mdr, -ESTALE); + return; + } + if (!diri->is_dir()) { + mds->server->respond_to_request(mdr, -ENOTDIR); + return; + } + + MutationImpl::LockOpVec lov; + + if (mdr->ls) // already marked filelock/nestlock dirty ? + goto do_rdlocks; + + lov.add_rdlock(&diri->dirfragtreelock); + lov.add_wrlock(&diri->nestlock); + lov.add_wrlock(&diri->filelock); + if (!mds->locker->acquire_locks(mdr, lov)) + return; + + // Fetch all dirfrags and mark filelock/nestlock dirty. This will tirgger + // the scatter-gather process, which will fix any fragstat/rstat errors. + { + frag_vec_t leaves; + diri->dirfragtree.get_leaves(leaves); + for (const auto& leaf : leaves) { + CDir *dir = diri->get_dirfrag(leaf); + if (!dir) { + ceph_assert(mdr->is_auth_pinned(diri)); + dir = diri->get_or_open_dirfrag(this, leaf); + } + if (dir->get_version() == 0) { + ceph_assert(dir->is_auth()); + dir->fetch(new C_MDS_RetryRequest(this, mdr)); + return; + } + } + } + + diri->state_set(CInode::STATE_REPAIRSTATS); + mdr->ls = mds->mdlog->get_current_segment(); + mds->locker->mark_updated_scatterlock(&diri->filelock); + mdr->ls->dirty_dirfrag_dir.push_back(&diri->item_dirty_dirfrag_dir); + mds->locker->mark_updated_scatterlock(&diri->nestlock); + mdr->ls->dirty_dirfrag_nest.push_back(&diri->item_dirty_dirfrag_nest); + + mds->locker->drop_locks(mdr.get()); + +do_rdlocks: + // force the scatter-gather process + lov.clear(); + lov.add_rdlock(&diri->dirfragtreelock); + lov.add_rdlock(&diri->nestlock); + lov.add_rdlock(&diri->filelock); + if (!mds->locker->acquire_locks(mdr, lov)) + return; + + diri->state_clear(CInode::STATE_REPAIRSTATS); + + frag_info_t dir_info; 
+ nest_info_t nest_info; + nest_info.rsubdirs = 1; // it gets one to account for self + if (const sr_t *srnode = diri->get_projected_srnode(); srnode) + nest_info.rsnaps = srnode->snaps.size(); + + { + frag_vec_t leaves; + diri->dirfragtree.get_leaves(leaves); + for (const auto& leaf : leaves) { + CDir *dir = diri->get_dirfrag(leaf); + ceph_assert(dir); + ceph_assert(dir->get_version() > 0); + dir_info.add(dir->fnode.accounted_fragstat); + nest_info.add(dir->fnode.accounted_rstat); + } + } + + if (!dir_info.same_sums(diri->inode.dirstat) || + !nest_info.same_sums(diri->inode.rstat)) { + dout(10) << __func__ << " failed to fix fragstat/rstat on " + << *diri << dendl; + } + + mds->server->respond_to_request(mdr, 0); +} + +void MDCache::upgrade_inode_snaprealm(CInode *in) +{ + MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_UPGRADE_SNAPREALM); + mdr->pin(in); + mdr->internal_op_private = in; + mdr->internal_op_finish = new C_MDSInternalNoop; + upgrade_inode_snaprealm_work(mdr); +} + +void MDCache::upgrade_inode_snaprealm_work(MDRequestRef& mdr) +{ + CInode *in = static_cast<CInode*>(mdr->internal_op_private); + dout(10) << __func__ << " " << *in << dendl; + + if (!in->is_auth()) { + mds->server->respond_to_request(mdr, -ESTALE); + return; + } + + MutationImpl::LockOpVec lov; + mds->locker->include_snap_rdlocks(in, lov); + lov.erase_rdlock(&in->snaplock); + lov.add_xlock(&in->snaplock); + + if (!mds->locker->acquire_locks(mdr, lov)) + return; + + // project_snaprealm() upgrades snaprealm format + auto &pi = in->project_inode(false, true); + mdr->add_projected_inode(in); + pi.inode.version = in->pre_dirty(); + + mdr->ls = mds->mdlog->get_current_segment(); + EUpdate *le = new EUpdate(mds->mdlog, "upgrade_snaprealm"); + mds->mdlog->start_entry(le); + + if (in->is_base()) { + le->metablob.add_root(true, in); + } else { + CDentry *pdn = in->get_projected_parent_dn(); + le->metablob.add_dir_context(pdn->get_dir()); + le->metablob.add_primary_dentry(pdn, in, true); + } 
+ + mds->mdlog->submit_entry(le, new C_MDC_RespondInternalRequest(this, mdr)); +} + +void MDCache::flush_dentry(std::string_view path, Context *fin) +{ + if (is_readonly()) { + dout(10) << __func__ << ": read-only FS" << dendl; + fin->complete(-EROFS); + return; + } + dout(10) << "flush_dentry " << path << dendl; + MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_FLUSH); + filepath fp(path); + mdr->set_filepath(fp); + mdr->internal_op_finish = fin; + flush_dentry_work(mdr); +} + +class C_FinishIOMDR : public MDSContext { +protected: + MDSRank *mds; + MDRequestRef mdr; + MDSRank *get_mds() override { return mds; } +public: + C_FinishIOMDR(MDSRank *mds_, MDRequestRef& mdr_) : mds(mds_), mdr(mdr_) {} + void finish(int r) override { mds->server->respond_to_request(mdr, r); } +}; + +void MDCache::flush_dentry_work(MDRequestRef& mdr) +{ + MutationImpl::LockOpVec lov; + CInode *in = mds->server->rdlock_path_pin_ref(mdr, 0, lov, true); + if (NULL == in) + return; + + // TODO: Is this necessary? Fix it if so + ceph_assert(in->is_auth()); + bool locked = mds->locker->acquire_locks(mdr, lov); + if (!locked) + return; + in->flush(new C_FinishIOMDR(mds, mdr)); +} + + +/** + * Initialize performance counters with global perfcounter + * collection. 
+ */ +void MDCache::register_perfcounters() +{ + PerfCountersBuilder pcb(g_ceph_context, "mds_cache", l_mdc_first, l_mdc_last); + + // Stray/purge statistics + pcb.add_u64(l_mdc_num_strays, "num_strays", "Stray dentries", "stry", + PerfCountersBuilder::PRIO_INTERESTING); + pcb.add_u64(l_mdc_num_recovering_enqueued, + "num_recovering_enqueued", "Files waiting for recovery", "recy", + PerfCountersBuilder::PRIO_INTERESTING); + pcb.add_u64_counter(l_mdc_recovery_completed, + "recovery_completed", "File recoveries completed", "recd", + PerfCountersBuilder::PRIO_INTERESTING); + + // useful recovery queue statistics + pcb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL); + pcb.add_u64(l_mdc_num_recovering_processing, "num_recovering_processing", + "Files currently being recovered"); + pcb.add_u64(l_mdc_num_recovering_prioritized, "num_recovering_prioritized", + "Files waiting for recovery with elevated priority"); + pcb.add_u64_counter(l_mdc_recovery_started, "recovery_started", + "File recoveries started"); + + // along with other stray dentries stats + pcb.add_u64(l_mdc_num_strays_delayed, "num_strays_delayed", + "Stray dentries delayed"); + pcb.add_u64(l_mdc_num_strays_enqueuing, "num_strays_enqueuing", + "Stray dentries enqueuing for purge"); + pcb.add_u64_counter(l_mdc_strays_created, "strays_created", + "Stray dentries created"); + pcb.add_u64_counter(l_mdc_strays_enqueued, "strays_enqueued", + "Stray dentries enqueued for purge"); + pcb.add_u64_counter(l_mdc_strays_reintegrated, "strays_reintegrated", + "Stray dentries reintegrated"); + pcb.add_u64_counter(l_mdc_strays_migrated, "strays_migrated", + "Stray dentries migrated"); + + // low prio internal request stats + pcb.add_u64_counter(l_mdss_ireq_enqueue_scrub, "ireq_enqueue_scrub", + "Internal Request type enqueue scrub"); + pcb.add_u64_counter(l_mdss_ireq_exportdir, "ireq_exportdir", + "Internal Request type export dir"); + pcb.add_u64_counter(l_mdss_ireq_flush, "ireq_flush", + "Internal Request type flush"); 
+ pcb.add_u64_counter(l_mdss_ireq_fragmentdir, "ireq_fragmentdir", + "Internal Request type fragmentdir"); + pcb.add_u64_counter(l_mdss_ireq_fragstats, "ireq_fragstats", + "Internal Request type frag stats"); + pcb.add_u64_counter(l_mdss_ireq_inodestats, "ireq_inodestats", + "Internal Request type inode stats"); + + logger.reset(pcb.create_perf_counters()); + g_ceph_context->get_perfcounters_collection()->add(logger.get()); + recovery_queue.set_logger(logger.get()); + stray_manager.set_logger(logger.get()); +} + +/** + * Call this when putting references to an inode/dentry or + * when attempting to trim it. + * + * If this inode is no longer linked by anyone, and this MDS + * rank holds the primary dentry, and that dentry is in a stray + * directory, then give up the dentry to the StrayManager, never + * to be seen again by MDCache. + * + * @param delay if true, then purgeable inodes are stashed til + * the next trim(), rather than being purged right + * away. + */ +void MDCache::maybe_eval_stray(CInode *in, bool delay) { + if (in->inode.nlink > 0 || in->is_base() || is_readonly() || + mds->get_state() <= MDSMap::STATE_REJOIN) + return; + + CDentry *dn = in->get_projected_parent_dn(); + + if (dn->state_test(CDentry::STATE_PURGING)) { + /* We have already entered the purging process, no need + * to re-evaluate me ! 
*/ + return; + } + + if (dn->get_dir()->get_inode()->is_stray()) { + if (delay) + stray_manager.queue_delayed(dn); + else + stray_manager.eval_stray(dn); + } +} + +void MDCache::clear_dirty_bits_for_stray(CInode* diri) { + dout(10) << __func__ << " " << *diri << dendl; + ceph_assert(diri->get_projected_parent_dir()->inode->is_stray()); + list<CDir*> ls; + diri->get_dirfrags(ls); + for (auto &p : ls) { + if (p->is_auth() && !(p->is_frozen() || p->is_freezing())) + p->try_remove_dentries_for_stray(); + } + if (!diri->snaprealm) { + if (diri->is_auth()) + diri->clear_dirty_rstat(); + diri->clear_scatter_dirty(); + } +} + +bool MDCache::dump_inode(Formatter *f, uint64_t number) { + CInode *in = get_inode(number); + if (!in) { + return false; + } + f->open_object_section("inode"); + in->dump(f, CInode::DUMP_DEFAULT | CInode::DUMP_PATH); + f->close_section(); + return true; +} + +void MDCache::handle_mdsmap(const MDSMap &mdsmap) { + // process export_pin_delayed_queue whenever a new MDSMap received + auto &q = export_pin_delayed_queue; + for (auto it = q.begin(); it != q.end(); ) { + auto *in = *it; + mds_rank_t export_pin = in->get_export_pin(false); + dout(10) << " delayed export_pin=" << export_pin << " on " << *in + << " max_mds=" << mdsmap.get_max_mds() << dendl; + if (export_pin >= mdsmap.get_max_mds()) { + it++; + continue; + } + + in->state_clear(CInode::STATE_DELAYEDEXPORTPIN); + it = q.erase(it); + in->maybe_export_pin(); + } +} + diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h new file mode 100644 index 00000000..ab5adb68 --- /dev/null +++ b/src/mds/MDCache.h @@ -0,0 +1,1363 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free 
Software + * Foundation. See file COPYING. + * + */ + + + +#ifndef CEPH_MDCACHE_H +#define CEPH_MDCACHE_H + +#include <atomic> +#include <string_view> +#include <thread> + +#include "common/DecayCounter.h" +#include "include/types.h" +#include "include/filepath.h" +#include "include/elist.h" + +#include "messages/MCacheExpire.h" +#include "messages/MClientQuota.h" +#include "messages/MClientRequest.h" +#include "messages/MClientSnap.h" +#include "messages/MDentryLink.h" +#include "messages/MDentryUnlink.h" +#include "messages/MDirUpdate.h" +#include "messages/MDiscover.h" +#include "messages/MDiscoverReply.h" +#include "messages/MGatherCaps.h" +#include "messages/MGenericMessage.h" +#include "messages/MInodeFileCaps.h" +#include "messages/MLock.h" +#include "messages/MMDSCacheRejoin.h" +#include "messages/MMDSFindIno.h" +#include "messages/MMDSFindInoReply.h" +#include "messages/MMDSFragmentNotify.h" +#include "messages/MMDSFragmentNotifyAck.h" +#include "messages/MMDSOpenIno.h" +#include "messages/MMDSOpenInoReply.h" +#include "messages/MMDSResolve.h" +#include "messages/MMDSResolveAck.h" +#include "messages/MMDSSlaveRequest.h" +#include "messages/MMDSSnapUpdate.h" + + +#include "osdc/Filer.h" +#include "CInode.h" +#include "CDentry.h" +#include "CDir.h" +#include "include/Context.h" +#include "events/EMetaBlob.h" +#include "RecoveryQueue.h" +#include "StrayManager.h" +#include "OpenFileTable.h" +#include "MDSContext.h" +#include "MDSMap.h" +#include "Mutation.h" + + +class PerfCounters; + +class MDSRank; +class Session; +class Migrator; + +class Session; + +class ESubtreeMap; + +enum { + l_mdc_first = 3000, + // How many inodes currently in stray dentries + l_mdc_num_strays, + // How many stray dentries are currently delayed for purge due to refs + l_mdc_num_strays_delayed, + // How many stray dentries are currently being enqueued for purge + l_mdc_num_strays_enqueuing, + + // How many dentries have ever been added to stray dir + l_mdc_strays_created, + // How 
many dentries have been passed on to PurgeQueue + l_mdc_strays_enqueued, + // How many strays have been reintegrated? + l_mdc_strays_reintegrated, + // How many strays have been migrated? + l_mdc_strays_migrated, + + // How many inode sizes currently being recovered + l_mdc_num_recovering_processing, + // How many inodes currently waiting to have size recovered + l_mdc_num_recovering_enqueued, + // How many inodes waiting with elevated priority for recovery + l_mdc_num_recovering_prioritized, + // How many inodes ever started size recovery + l_mdc_recovery_started, + // How many inodes ever completed size recovery + l_mdc_recovery_completed, + + l_mdss_ireq_enqueue_scrub, + l_mdss_ireq_exportdir, + l_mdss_ireq_flush, + l_mdss_ireq_fragmentdir, + l_mdss_ireq_fragstats, + l_mdss_ireq_inodestats, + + l_mdc_last, +}; + + +// flags for predirty_journal_parents() +static const int PREDIRTY_PRIMARY = 1; // primary dn, adjust nested accounting +static const int PREDIRTY_DIR = 2; // update parent dir mtime/size +static const int PREDIRTY_SHALLOW = 4; // only go to immediate parent (for easier rollback) + +class MDCache { + public: + using clock = ceph::coarse_mono_clock; + using time = ceph::coarse_mono_time; + + typedef std::map<mds_rank_t, MCacheExpire::ref> expiremap; + + // my master + MDSRank *mds; + + // -- my cache -- + LRU lru; // dentry lru for expiring items from cache + LRU bottom_lru; // dentries that should be trimmed ASAP + protected: + ceph::unordered_map<inodeno_t,CInode*> inode_map; // map of head inodes by ino + map<vinodeno_t, CInode*> snap_inode_map; // map of snap inodes by ino + CInode *root = nullptr; // root inode + CInode *myin = nullptr; // .ceph/mds%d dir + + bool readonly = false; + void set_readonly() { readonly = true; } + + std::array<CInode *, NUM_STRAY> strays{}; // my stray dir + int stray_index = 0; + + CInode *get_stray() { + return strays[stray_index]; + } + + set<CInode*> base_inodes; + + std::unique_ptr<PerfCounters> logger; + + Filer 
filer; + + bool exceeded_size_limit = false; + +private: + uint64_t cache_inode_limit; + uint64_t cache_memory_limit; + double cache_reservation; + double cache_health_threshold; + bool forward_all_requests_to_auth; + +public: + uint64_t cache_limit_inodes(void) { + return cache_inode_limit; + } + bool forward_all_reqs_to_auth() const { + return forward_all_requests_to_auth; + } + uint64_t cache_limit_memory(void) { + return cache_memory_limit; + } + double cache_toofull_ratio(void) const { + double inode_reserve = cache_inode_limit*(1.0-cache_reservation); + double memory_reserve = cache_memory_limit*(1.0-cache_reservation); + return fmax(0.0, fmax((cache_size()-memory_reserve)/memory_reserve, cache_inode_limit == 0 ? 0.0 : (CInode::count()-inode_reserve)/inode_reserve)); + } + bool cache_toofull(void) const { + return cache_toofull_ratio() > 0.0; + } + uint64_t cache_size(void) const { + return mempool::get_pool(mempool::mds_co::id).allocated_bytes(); + } + bool cache_overfull(void) const { + return (cache_inode_limit > 0 && CInode::count() > cache_inode_limit*cache_health_threshold) || (cache_size() > cache_memory_limit*cache_health_threshold); + } + + void advance_stray() { + stray_index = (stray_index+1)%NUM_STRAY; + } + + /** + * Call this when you know that a CDentry is ready to be passed + * on to StrayManager (i.e. 
this is a stray you've just created) + */ + void notify_stray(CDentry *dn) { + ceph_assert(dn->get_dir()->get_inode()->is_stray()); + if (dn->state_test(CDentry::STATE_PURGING)) + return; + + stray_manager.eval_stray(dn); + } + + void maybe_eval_stray(CInode *in, bool delay=false); + void clear_dirty_bits_for_stray(CInode* diri); + + bool is_readonly() { return readonly; } + void force_readonly(); + + DecayRate decayrate; + + int num_shadow_inodes = 0; + + int num_inodes_with_caps = 0; + + unsigned max_dir_commit_size; + + static file_layout_t gen_default_file_layout(const MDSMap &mdsmap); + static file_layout_t gen_default_log_layout(const MDSMap &mdsmap); + + file_layout_t default_file_layout; + file_layout_t default_log_layout; + + void register_perfcounters(); + + // -- client leases -- +public: + static constexpr std::size_t client_lease_pools = 3; + std::array<float, client_lease_pools> client_lease_durations{5.0, 30.0, 300.0}; + +protected: + std::array<xlist<ClientLease*>, client_lease_pools> client_leases{}; +public: + void touch_client_lease(ClientLease *r, int pool, utime_t ttl) { + client_leases[pool].push_back(&r->item_lease); + r->ttl = ttl; + } + + void notify_stray_removed() + { + stray_manager.notify_stray_removed(); + } + + void notify_stray_created() + { + stray_manager.notify_stray_created(); + } + + void eval_remote(CDentry *dn) + { + stray_manager.eval_remote(dn); + } + + // -- client caps -- + uint64_t last_cap_id = 0; + + // -- discover -- + struct discover_info_t { + ceph_tid_t tid; + mds_rank_t mds; + inodeno_t ino; + frag_t frag; + snapid_t snap; + filepath want_path; + CInode *basei; + bool want_base_dir; + bool want_xlocked; + + discover_info_t() : + tid(0), mds(-1), snap(CEPH_NOSNAP), basei(NULL), + want_base_dir(false), want_xlocked(false) {} + ~discover_info_t() { + if (basei) + basei->put(MDSCacheObject::PIN_DISCOVERBASE); + } + void pin_base(CInode *b) { + basei = b; + basei->get(MDSCacheObject::PIN_DISCOVERBASE); + } + }; + + 
map<ceph_tid_t, discover_info_t> discovers; + ceph_tid_t discover_last_tid = 0; + + void _send_discover(discover_info_t& dis); + discover_info_t& _create_discover(mds_rank_t mds) { + ceph_tid_t t = ++discover_last_tid; + discover_info_t& d = discovers[t]; + d.tid = t; + d.mds = mds; + return d; + } + + // waiters + map<int, map<inodeno_t, MDSContext::vec > > waiting_for_base_ino; + + void discover_base_ino(inodeno_t want_ino, MDSContext *onfinish, mds_rank_t from=MDS_RANK_NONE); + void discover_dir_frag(CInode *base, frag_t approx_fg, MDSContext *onfinish, + mds_rank_t from=MDS_RANK_NONE); + void discover_path(CInode *base, snapid_t snap, filepath want_path, MDSContext *onfinish, + bool want_xlocked=false, mds_rank_t from=MDS_RANK_NONE); + void discover_path(CDir *base, snapid_t snap, filepath want_path, MDSContext *onfinish, + bool want_xlocked=false); + void kick_discovers(mds_rank_t who); // after a failure. + + + // -- subtrees -- +private: + static const unsigned int SUBTREES_COUNT_THRESHOLD = 5; + static const unsigned int SUBTREES_DEPTH_THRESHOLD = 5; +protected: + /* subtree keys and each tree's non-recursive nested subtrees (the "bounds") */ + map<CDir*,set<CDir*> > subtrees; + map<CInode*,list<pair<CDir*,CDir*> > > projected_subtree_renames; // renamed ino -> target dir + + // adjust subtree auth specification + // dir->dir_auth + // imports/exports/nested_exports + // join/split subtrees as appropriate +public: + bool is_subtrees() { return !subtrees.empty(); } + template<typename T> + void get_subtrees(T& c) { + if constexpr (std::is_same_v<T, std::vector<CDir*>>) + c.reserve(c.size() + subtrees.size()); + for (const auto& p : subtrees) { + c.push_back(p.first); + } + } + void adjust_subtree_auth(CDir *root, mds_authority_t auth, bool adjust_pop=true); + void adjust_subtree_auth(CDir *root, mds_rank_t a, mds_rank_t b=CDIR_AUTH_UNKNOWN) { + adjust_subtree_auth(root, mds_authority_t(a,b)); + } + void adjust_bounded_subtree_auth(CDir *dir, const 
set<CDir*>& bounds, mds_authority_t auth); + void adjust_bounded_subtree_auth(CDir *dir, const set<CDir*>& bounds, mds_rank_t a) { + adjust_bounded_subtree_auth(dir, bounds, mds_authority_t(a, CDIR_AUTH_UNKNOWN)); + } + void adjust_bounded_subtree_auth(CDir *dir, const vector<dirfrag_t>& bounds, const mds_authority_t &auth); + void adjust_bounded_subtree_auth(CDir *dir, const vector<dirfrag_t>& bounds, mds_rank_t a) { + adjust_bounded_subtree_auth(dir, bounds, mds_authority_t(a, CDIR_AUTH_UNKNOWN)); + } + void map_dirfrag_set(const list<dirfrag_t>& dfs, set<CDir*>& result); + void try_subtree_merge(CDir *root); + void try_subtree_merge_at(CDir *root, set<CInode*> *to_eval, bool adjust_pop=true); + void subtree_merge_writebehind_finish(CInode *in, MutationRef& mut); + void eval_subtree_root(CInode *diri); + CDir *get_subtree_root(CDir *dir); + CDir *get_projected_subtree_root(CDir *dir); + bool is_leaf_subtree(CDir *dir) { + ceph_assert(subtrees.count(dir)); + return subtrees[dir].empty(); + } + void remove_subtree(CDir *dir); + bool is_subtree(CDir *root) { + return subtrees.count(root); + } + void get_subtree_bounds(CDir *root, set<CDir*>& bounds); + void get_wouldbe_subtree_bounds(CDir *root, set<CDir*>& bounds); + void verify_subtree_bounds(CDir *root, const set<CDir*>& bounds); + void verify_subtree_bounds(CDir *root, const list<dirfrag_t>& bounds); + + void project_subtree_rename(CInode *diri, CDir *olddir, CDir *newdir); + void adjust_subtree_after_rename(CInode *diri, CDir *olddir, bool pop); + + auto get_auth_subtrees() { + std::vector<CDir*> c; + for (auto& p : subtrees) { + auto& root = p.first; + if (root->is_auth()) { + c.push_back(root); + } + } + return c; + } + + auto get_fullauth_subtrees() { + std::vector<CDir*> c; + for (auto& p : subtrees) { + auto& root = p.first; + if (root->is_full_dir_auth()) { + c.push_back(root); + } + } + return c; + } + auto num_subtrees_fullauth() const { + std::size_t n = 0; + for (auto& p : subtrees) { + auto& root = 
p.first; + if (root->is_full_dir_auth()) { + ++n; + } + } + return n; + } + + auto num_subtrees_fullnonauth() const { + std::size_t n = 0; + for (auto& p : subtrees) { + auto& root = p.first; + if (root->is_full_dir_nonauth()) { + ++n; + } + } + return n; + } + + auto num_subtrees() const { + return subtrees.size(); + } + + +protected: + // -- requests -- + ceph::unordered_map<metareqid_t, MDRequestRef> active_requests; + +public: + int get_num_client_requests(); + + MDRequestRef request_start(const MClientRequest::const_ref& req); + MDRequestRef request_start_slave(metareqid_t rid, __u32 attempt, const Message::const_ref &m); + MDRequestRef request_start_internal(int op); + bool have_request(metareqid_t rid) { + return active_requests.count(rid); + } + MDRequestRef request_get(metareqid_t rid); + void request_pin_ref(MDRequestRef& r, CInode *ref, vector<CDentry*>& trace); + void request_finish(MDRequestRef& mdr); + void request_forward(MDRequestRef& mdr, mds_rank_t mds, int port=0); + void dispatch_request(MDRequestRef& mdr); + void request_drop_foreign_locks(MDRequestRef& mdr); + void request_drop_non_rdlocks(MDRequestRef& r); + void request_drop_locks(MDRequestRef& r); + void request_cleanup(MDRequestRef& r); + + void request_kill(MDRequestRef& r); // called when session closes + + // journal/snap helpers + CInode *pick_inode_snap(CInode *in, snapid_t follows); + CInode *cow_inode(CInode *in, snapid_t last); + void journal_cow_dentry(MutationImpl *mut, EMetaBlob *metablob, CDentry *dn, + snapid_t follows=CEPH_NOSNAP, + CInode **pcow_inode=0, CDentry::linkage_t *dnl=0); + void journal_cow_inode(MutationRef& mut, EMetaBlob *metablob, CInode *in, snapid_t follows=CEPH_NOSNAP, + CInode **pcow_inode=0); + void journal_dirty_inode(MutationImpl *mut, EMetaBlob *metablob, CInode *in, snapid_t follows=CEPH_NOSNAP); + + void project_rstat_inode_to_frag(CInode *cur, CDir *parent, snapid_t first, + int linkunlink, SnapRealm *prealm); + void 
_project_rstat_inode_to_frag(CInode::mempool_inode & inode, snapid_t ofirst, snapid_t last, + CDir *parent, int linkunlink, bool update_inode); + void project_rstat_frag_to_inode(nest_info_t& rstat, nest_info_t& accounted_rstat, + snapid_t ofirst, snapid_t last, + CInode *pin, bool cow_head); + void broadcast_quota_to_client(CInode *in, client_t exclude_ct = -1, bool quota_change = false); + void predirty_journal_parents(MutationRef mut, EMetaBlob *blob, + CInode *in, CDir *parent, + int flags, int linkunlink=0, + snapid_t follows=CEPH_NOSNAP); + + // slaves + void add_uncommitted_master(metareqid_t reqid, LogSegment *ls, set<mds_rank_t> &slaves, bool safe=false) { + uncommitted_masters[reqid].ls = ls; + uncommitted_masters[reqid].slaves = slaves; + uncommitted_masters[reqid].safe = safe; + } + void wait_for_uncommitted_master(metareqid_t reqid, MDSContext *c) { + uncommitted_masters[reqid].waiters.push_back(c); + } + bool have_uncommitted_master(metareqid_t reqid, mds_rank_t from) { + auto p = uncommitted_masters.find(reqid); + return p != uncommitted_masters.end() && p->second.slaves.count(from) > 0; + } + void log_master_commit(metareqid_t reqid); + void logged_master_update(metareqid_t reqid); + void _logged_master_commit(metareqid_t reqid); + void committed_master_slave(metareqid_t r, mds_rank_t from); + void finish_committed_masters(); + + void add_uncommitted_slave(metareqid_t reqid, LogSegment*, mds_rank_t, MDSlaveUpdate *su=nullptr); + void wait_for_uncommitted_slave(metareqid_t reqid, MDSContext *c) { + uncommitted_slaves.at(reqid).waiters.push_back(c); + } + void finish_uncommitted_slave(metareqid_t reqid, bool assert_exist=true); + MDSlaveUpdate* get_uncommitted_slave(metareqid_t reqid, mds_rank_t master); + void _logged_slave_commit(mds_rank_t from, metareqid_t reqid); + + // -- recovery -- +protected: + set<mds_rank_t> recovery_set; + +public: + void set_recovery_set(set<mds_rank_t>& s); + void handle_mds_failure(mds_rank_t who); + void 
handle_mds_recovery(mds_rank_t who); + +protected: + // [resolve] + // from EImportStart w/o EImportFinish during journal replay + map<dirfrag_t, vector<dirfrag_t> > my_ambiguous_imports; + // from MMDSResolves + map<mds_rank_t, map<dirfrag_t, vector<dirfrag_t> > > other_ambiguous_imports; + + map<CInode*, int> uncommitted_slave_rename_olddir; // slave: preserve the non-auth dir until seeing commit. + map<CInode*, int> uncommitted_slave_unlink; // slave: preserve the unlinked inode until seeing commit. + + // track master requests whose slaves haven't acknowledged commit + struct umaster { + set<mds_rank_t> slaves; + LogSegment *ls; + MDSContext::vec waiters; + bool safe; + bool committing; + bool recovering; + umaster() : ls(NULL), safe(false), committing(false), recovering(false) {} + }; + map<metareqid_t, umaster> uncommitted_masters; // master: req -> slave set + + struct uslave { + uslave() {} + mds_rank_t master; + LogSegment *ls = nullptr; + MDSlaveUpdate *su = nullptr; + MDSContext::vec waiters; + }; + map<metareqid_t, uslave> uncommitted_slaves; // slave: preserve the slave req until seeing commit. 
+ + set<metareqid_t> pending_masters; + map<int, set<metareqid_t> > ambiguous_slave_updates; + + friend class ESlaveUpdate; + friend class ECommitted; + + bool resolves_pending = false; + set<mds_rank_t> resolve_gather; // nodes i need resolves from + set<mds_rank_t> resolve_ack_gather; // nodes i need a resolve_ack from + set<version_t> resolve_snapclient_commits; + map<metareqid_t, mds_rank_t> resolve_need_rollback; // rollbacks i'm writing to the journal + map<mds_rank_t, MMDSResolve::const_ref> delayed_resolve; + + void handle_resolve(const MMDSResolve::const_ref &m); + void handle_resolve_ack(const MMDSResolveAck::const_ref &m); + void process_delayed_resolve(); + void discard_delayed_resolve(mds_rank_t who); + void maybe_resolve_finish(); + void disambiguate_my_imports(); + void disambiguate_other_imports(); + void trim_unlinked_inodes(); + + void send_slave_resolves(); + void send_subtree_resolves(); + void maybe_finish_slave_resolve(); + +public: + void recalc_auth_bits(bool replay); + void remove_inode_recursive(CInode *in); + + bool is_ambiguous_slave_update(metareqid_t reqid, mds_rank_t master) { + auto p = ambiguous_slave_updates.find(master); + return p != ambiguous_slave_updates.end() && p->second.count(reqid); + } + void add_ambiguous_slave_update(metareqid_t reqid, mds_rank_t master) { + ambiguous_slave_updates[master].insert(reqid); + } + void remove_ambiguous_slave_update(metareqid_t reqid, mds_rank_t master) { + auto p = ambiguous_slave_updates.find(master); + auto q = p->second.find(reqid); + ceph_assert(q != p->second.end()); + p->second.erase(q); + if (p->second.empty()) + ambiguous_slave_updates.erase(p); + } + + void add_rollback(metareqid_t reqid, mds_rank_t master) { + resolve_need_rollback[reqid] = master; + } + void finish_rollback(metareqid_t reqid, MDRequestRef& mdr); + + // ambiguous imports + void add_ambiguous_import(dirfrag_t base, const vector<dirfrag_t>& bounds); + void add_ambiguous_import(CDir *base, const set<CDir*>& bounds); 
+ bool have_ambiguous_import(dirfrag_t base) { + return my_ambiguous_imports.count(base); + } + void get_ambiguous_import_bounds(dirfrag_t base, vector<dirfrag_t>& bounds) { + ceph_assert(my_ambiguous_imports.count(base)); + bounds = my_ambiguous_imports[base]; + } + void cancel_ambiguous_import(CDir *); + void finish_ambiguous_import(dirfrag_t dirino); + void resolve_start(MDSContext *resolve_done_); + void send_resolves(); + void maybe_send_pending_resolves() { + if (resolves_pending) + send_subtree_resolves(); + } + + void _move_subtree_map_bound(dirfrag_t df, dirfrag_t oldparent, dirfrag_t newparent, + map<dirfrag_t,vector<dirfrag_t> >& subtrees); + ESubtreeMap *create_subtree_map(); + + + void clean_open_file_lists(); + void dump_openfiles(Formatter *f); + bool dump_inode(Formatter *f, uint64_t number); +protected: + // [rejoin] + bool rejoins_pending = false; + set<mds_rank_t> rejoin_gather; // nodes from whom i need a rejoin + set<mds_rank_t> rejoin_sent; // nodes i sent a rejoin to + set<mds_rank_t> rejoin_ack_sent; // nodes i sent a rejoin to + set<mds_rank_t> rejoin_ack_gather; // nodes from whom i need a rejoin ack + map<mds_rank_t,map<inodeno_t,map<client_t,Capability::Import> > > rejoin_imported_caps; + map<inodeno_t,pair<mds_rank_t,map<client_t,Capability::Export> > > rejoin_slave_exports; + + map<client_t,entity_inst_t> rejoin_client_map; + map<client_t,client_metadata_t> rejoin_client_metadata_map; + map<client_t,pair<Session*,uint64_t> > rejoin_session_map; + + map<inodeno_t,pair<mds_rank_t,map<client_t,cap_reconnect_t> > > cap_exports; // ino -> target, client -> capex + + map<inodeno_t,map<client_t,map<mds_rank_t,cap_reconnect_t> > > cap_imports; // ino -> client -> frommds -> capex + set<inodeno_t> cap_imports_missing; + map<inodeno_t, MDSContext::vec > cap_reconnect_waiters; + int cap_imports_num_opening = 0; + + set<CInode*> rejoin_undef_inodes; + set<CInode*> rejoin_potential_updated_scatterlocks; + set<CDir*> rejoin_undef_dirfrags; + 
map<mds_rank_t, set<CInode*> > rejoin_unlinked_inodes; + + vector<CInode*> rejoin_recover_q, rejoin_check_q; + list<SimpleLock*> rejoin_eval_locks; + MDSContext::vec rejoin_waiters; + + void rejoin_walk(CDir *dir, const MMDSCacheRejoin::ref &rejoin); + void handle_cache_rejoin(const MMDSCacheRejoin::const_ref &m); + void handle_cache_rejoin_weak(const MMDSCacheRejoin::const_ref &m); + CInode* rejoin_invent_inode(inodeno_t ino, snapid_t last); + CDir* rejoin_invent_dirfrag(dirfrag_t df); + void handle_cache_rejoin_strong(const MMDSCacheRejoin::const_ref &m); + void rejoin_scour_survivor_replicas(mds_rank_t from, const MMDSCacheRejoin::const_ref &ack, + set<vinodeno_t>& acked_inodes, + set<SimpleLock *>& gather_locks); + void handle_cache_rejoin_ack(const MMDSCacheRejoin::const_ref &m); + void rejoin_send_acks(); + void rejoin_trim_undef_inodes(); + void maybe_send_pending_rejoins() { + if (rejoins_pending) + rejoin_send_rejoins(); + } + std::unique_ptr<MDSContext> rejoin_done; + std::unique_ptr<MDSContext> resolve_done; +public: + void rejoin_start(MDSContext *rejoin_done_); + void rejoin_gather_finish(); + void rejoin_send_rejoins(); + void rejoin_export_caps(inodeno_t ino, client_t client, const cap_reconnect_t& icr, + int target=-1, bool drop_path=false) { + auto& ex = cap_exports[ino]; + ex.first = target; + auto &_icr = ex.second[client] = icr; + if (drop_path) + _icr.path.clear(); + } + void rejoin_recovered_caps(inodeno_t ino, client_t client, const cap_reconnect_t& icr, + mds_rank_t frommds=MDS_RANK_NONE, bool drop_path=false) { + auto &_icr = cap_imports[ino][client][frommds] = icr; + if (drop_path) + _icr.path.clear(); + } + void rejoin_recovered_client(client_t client, const entity_inst_t& inst) { + rejoin_client_map.emplace(client, inst); + } + bool rejoin_has_cap_reconnect(inodeno_t ino) const { + return cap_imports.count(ino); + } + void add_replay_ino_alloc(inodeno_t ino) { + cap_imports_missing.insert(ino); // avoid opening ino during cache rejoin + 
} + const cap_reconnect_t *get_replay_cap_reconnect(inodeno_t ino, client_t client) { + if (cap_imports.count(ino) && + cap_imports[ino].count(client) && + cap_imports[ino][client].count(MDS_RANK_NONE)) { + return &cap_imports[ino][client][MDS_RANK_NONE]; + } + return NULL; + } + void remove_replay_cap_reconnect(inodeno_t ino, client_t client) { + ceph_assert(cap_imports[ino].size() == 1); + ceph_assert(cap_imports[ino][client].size() == 1); + cap_imports.erase(ino); + } + void wait_replay_cap_reconnect(inodeno_t ino, MDSContext *c) { + cap_reconnect_waiters[ino].push_back(c); + } + + // [reconnect/rejoin caps] + struct reconnected_cap_info_t { + inodeno_t realm_ino; + snapid_t snap_follows; + int dirty_caps; + bool snapflush; + reconnected_cap_info_t() : + realm_ino(0), snap_follows(0), dirty_caps(0), snapflush(false) {} + }; + map<inodeno_t,map<client_t, reconnected_cap_info_t> > reconnected_caps; // inode -> client -> snap_follows,realmino + map<inodeno_t,map<client_t, snapid_t> > reconnected_snaprealms; // realmino -> client -> realmseq + + void add_reconnected_cap(client_t client, inodeno_t ino, const cap_reconnect_t& icr) { + reconnected_cap_info_t &info = reconnected_caps[ino][client]; + info.realm_ino = inodeno_t(icr.capinfo.snaprealm); + info.snap_follows = icr.snap_follows; + } + void set_reconnected_dirty_caps(client_t client, inodeno_t ino, int dirty, bool snapflush) { + reconnected_cap_info_t &info = reconnected_caps[ino][client]; + info.dirty_caps |= dirty; + if (snapflush) + info.snapflush = snapflush; + } + void add_reconnected_snaprealm(client_t client, inodeno_t ino, snapid_t seq) { + reconnected_snaprealms[ino][client] = seq; + } + + friend class C_MDC_RejoinOpenInoFinish; + friend class C_MDC_RejoinSessionsOpened; + void rejoin_open_ino_finish(inodeno_t ino, int ret); + void rejoin_prefetch_ino_finish(inodeno_t ino, int ret); + void rejoin_open_sessions_finish(map<client_t,pair<Session*,uint64_t> >& session_map); + bool process_imported_caps(); 
+ void choose_lock_states_and_reconnect_caps(); + void prepare_realm_split(SnapRealm *realm, client_t client, inodeno_t ino, + map<client_t,MClientSnap::ref>& splits); + void prepare_realm_merge(SnapRealm *realm, SnapRealm *parent_realm, map<client_t,MClientSnap::ref>& splits); + void send_snaps(map<client_t,MClientSnap::ref>& splits); + Capability* rejoin_import_cap(CInode *in, client_t client, const cap_reconnect_t& icr, mds_rank_t frommds); + void finish_snaprealm_reconnect(client_t client, SnapRealm *realm, snapid_t seq, + map<client_t,MClientSnap::ref>& updates); + Capability* try_reconnect_cap(CInode *in, Session *session); + void export_remaining_imported_caps(); + + // realm inodes + set<CInode*> rejoin_pending_snaprealms; + // cap imports. delayed snap parent opens. + map<client_t,set<CInode*> > delayed_imported_caps; + + void do_cap_import(Session *session, CInode *in, Capability *cap, + uint64_t p_cap_id, ceph_seq_t p_seq, ceph_seq_t p_mseq, + int peer, int p_flags); + void do_delayed_cap_imports(); + void rebuild_need_snapflush(CInode *head_in, SnapRealm *realm, client_t client, + snapid_t snap_follows); + void open_snaprealms(); + + bool open_undef_inodes_dirfrags(); + void opened_undef_inode(CInode *in); + void opened_undef_dirfrag(CDir *dir) { + rejoin_undef_dirfrags.erase(dir); + } + + void reissue_all_caps(); + + + friend class Locker; + friend class Migrator; + friend class MDBalancer; + + // StrayManager needs to be able to remove_inode() from us + // when it is done purging + friend class StrayManager; + + // File size recovery +private: + RecoveryQueue recovery_queue; + void identify_files_to_recover(); +public: + void start_files_to_recover(); + void do_file_recover(); + void queue_file_recover(CInode *in); + void _queued_file_recover_cow(CInode *in, MutationRef& mut); + + // subsystems + std::unique_ptr<Migrator> migrator; + + public: + explicit MDCache(MDSRank *m, PurgeQueue &purge_queue_); + ~MDCache(); + void handle_conf_change(const 
std::set<std::string>& changed, const MDSMap& mds_map); + + // debug + void log_stat(); + + // root inode + CInode *get_root() { return root; } + CInode *get_myin() { return myin; } + + size_t get_cache_size() { return lru.lru_get_size(); } + + // trimming + std::pair<bool, uint64_t> trim(uint64_t count=0); +private: + std::pair<bool, uint64_t> trim_lru(uint64_t count, expiremap& expiremap); + bool trim_dentry(CDentry *dn, expiremap& expiremap); + void trim_dirfrag(CDir *dir, CDir *con, expiremap& expiremap); + bool trim_inode(CDentry *dn, CInode *in, CDir *con, expiremap&); + void send_expire_messages(expiremap& expiremap); + void trim_non_auth(); // trim out trimmable non-auth items +public: + bool trim_non_auth_subtree(CDir *directory); + void standby_trim_segment(LogSegment *ls); + void try_trim_non_auth_subtree(CDir *dir); + bool can_trim_non_auth_dirfrag(CDir *dir) { + return my_ambiguous_imports.count((dir)->dirfrag()) == 0 && + uncommitted_slave_rename_olddir.count(dir->inode) == 0; + } + + /** + * For all unreferenced inodes, dirs, dentries below an inode, compose + * expiry messages. This is used when giving up all replicas of entities + * for an MDS peer in the 'stopping' state, such that the peer can + * empty its cache and finish shutting down. + * + * We have to make sure we're only expiring un-referenced items to + * avoid interfering with ongoing stray-movement (we can't distinguish + * between the "moving my strays" and "waiting for my cache to empty" + * phases within 'stopping') + * + * @return false if we completed cleanly, true if caller should stop + * expiring because we hit something with refs. 
+ */ + bool expire_recursive(CInode *in, expiremap& expiremap); + + void trim_client_leases(); + void check_memory_usage(); + + // shutdown +private: + set<inodeno_t> shutdown_exporting_strays; + pair<dirfrag_t, string> shutdown_export_next; +public: + void shutdown_start(); + void shutdown_check(); + bool shutdown_pass(); + bool shutdown(); // clear cache (ie at shutodwn) + bool shutdown_export_strays(); + void shutdown_export_stray_finish(inodeno_t ino) { + if (shutdown_exporting_strays.erase(ino)) + shutdown_export_strays(); + } + + bool did_shutdown_log_cap = false; + + // inode_map + bool have_inode(vinodeno_t vino) { + if (vino.snapid == CEPH_NOSNAP) + return inode_map.count(vino.ino) ? true : false; + else + return snap_inode_map.count(vino) ? true : false; + } + bool have_inode(inodeno_t ino, snapid_t snap=CEPH_NOSNAP) { + return have_inode(vinodeno_t(ino, snap)); + } + CInode* get_inode(vinodeno_t vino) { + if (vino.snapid == CEPH_NOSNAP) { + auto p = inode_map.find(vino.ino); + if (p != inode_map.end()) + return p->second; + } else { + auto p = snap_inode_map.find(vino); + if (p != snap_inode_map.end()) + return p->second; + } + return NULL; + } + CInode* get_inode(inodeno_t ino, snapid_t s=CEPH_NOSNAP) { + return get_inode(vinodeno_t(ino, s)); + } + CInode* lookup_snap_inode(vinodeno_t vino) { + auto p = snap_inode_map.lower_bound(vino); + if (p != snap_inode_map.end() && + p->second->ino() == vino.ino && p->second->first <= vino.snapid) + return p->second; + return NULL; + } + + CDir* get_dirfrag(dirfrag_t df) { + CInode *in = get_inode(df.ino); + if (!in) + return NULL; + return in->get_dirfrag(df.frag); + } + CDir* get_dirfrag(inodeno_t ino, std::string_view dn) { + CInode *in = get_inode(ino); + if (!in) + return NULL; + frag_t fg = in->pick_dirfrag(dn); + return in->get_dirfrag(fg); + } + CDir* get_force_dirfrag(dirfrag_t df, bool replay) { + CInode *diri = get_inode(df.ino); + if (!diri) + return NULL; + CDir *dir = force_dir_fragment(diri, 
df.frag, replay); + if (!dir) + dir = diri->get_dirfrag(df.frag); + return dir; + } + + MDSCacheObject *get_object(const MDSCacheObjectInfo &info); + + + + public: + void add_inode(CInode *in); + + void remove_inode(CInode *in); + protected: + void touch_inode(CInode *in) { + if (in->get_parent_dn()) + touch_dentry(in->get_projected_parent_dn()); + } +public: + void touch_dentry(CDentry *dn) { + if (dn->state_test(CDentry::STATE_BOTTOMLRU)) { + bottom_lru.lru_midtouch(dn); + } else { + if (dn->is_auth()) + lru.lru_touch(dn); + else + lru.lru_midtouch(dn); + } + } + void touch_dentry_bottom(CDentry *dn) { + if (dn->state_test(CDentry::STATE_BOTTOMLRU)) + return; + lru.lru_bottouch(dn); + } +protected: + + void inode_remove_replica(CInode *in, mds_rank_t rep, bool rejoin, + set<SimpleLock *>& gather_locks); + void dentry_remove_replica(CDentry *dn, mds_rank_t rep, set<SimpleLock *>& gather_locks); + + void rename_file(CDentry *srcdn, CDentry *destdn); + + public: + // truncate + void truncate_inode(CInode *in, LogSegment *ls); + void _truncate_inode(CInode *in, LogSegment *ls); + void truncate_inode_finish(CInode *in, LogSegment *ls); + void truncate_inode_logged(CInode *in, MutationRef& mut); + + void add_recovered_truncate(CInode *in, LogSegment *ls); + void remove_recovered_truncate(CInode *in, LogSegment *ls); + void start_recovered_truncates(); + + + public: + CDir *get_auth_container(CDir *in); + CDir *get_export_container(CDir *dir); + void find_nested_exports(CDir *dir, set<CDir*>& s); + void find_nested_exports_under(CDir *import, CDir *dir, set<CDir*>& s); + + +private: + bool opening_root = false, open = false; + MDSContext::vec waiting_for_open; + +public: + void init_layouts(); + void create_unlinked_system_inode(CInode *in, inodeno_t ino, + int mode) const; + CInode *create_system_inode(inodeno_t ino, int mode); + CInode *create_root_inode(); + + void create_empty_hierarchy(MDSGather *gather); + void create_mydir_hierarchy(MDSGather *gather); + + bool 
is_open() { return open; } + void wait_for_open(MDSContext *c) { + waiting_for_open.push_back(c); + } + + void open_root_inode(MDSContext *c); + void open_root(); + void open_mydir_inode(MDSContext *c); + void open_mydir_frag(MDSContext *c); + void populate_mydir(); + + void _create_system_file(CDir *dir, std::string_view name, CInode *in, MDSContext *fin); + void _create_system_file_finish(MutationRef& mut, CDentry *dn, + version_t dpv, MDSContext *fin); + + void open_foreign_mdsdir(inodeno_t ino, MDSContext *c); + CDir *get_stray_dir(CInode *in); + CDentry *get_or_create_stray_dentry(CInode *in); + + /** + * Find the given dentry (and whether it exists or not), its ancestors, + * and get them all into memory and usable on this MDS. This function + * makes a best-effort attempt to load everything; if it needs to + * go away and do something then it will put the request on a waitlist. + * It prefers the mdr, then the req, then the fin. (At least one of these + * must be non-null.) + * + * At least one of the params mdr, req, and fin must be non-null. + * + * @param mdr The MDRequest associated with the path. Can be null. + * @param cf A MDSContextFactory for waiter building. + * @param path The path to traverse to. + * @param pdnvec Data return parameter -- on success, contains a + * vector of dentries. On failure, is either empty or contains the + * full trace of traversable dentries. + * @param pin Data return parameter -- if successful, points to the inode + * associated with filepath. If unsuccessful, is null. + * @param onfail Specifies different lookup failure behaviors. If set to + * MDS_TRAVERSE_DISCOVERXLOCK, path_traverse will succeed on null + * dentries (instead of returning -ENOENT). If set to + * MDS_TRAVERSE_FORWARD, it will forward the request to the auth + * MDS if that becomes appropriate (ie, if it doesn't know the contents + * of a directory). 
If set to MDS_TRAVERSE_DISCOVER, it + * will attempt to look up the path from a different MDS (and bring them + * into its cache as replicas). + * + * @returns 0 on success, 1 on "not done yet", 2 on "forwarding", -errno otherwise. + * If it returns 1, the requester associated with this call has been placed + * on the appropriate waitlist, and it should unwind itself and back out. + * If it returns 2 the request has been forwarded, and again the requester + * should unwind itself and back out. + */ + int path_traverse(MDRequestRef& mdr, MDSContextFactory& cf, const filepath& path, + vector<CDentry*> *pdnvec, CInode **pin, int onfail); + + CInode *cache_traverse(const filepath& path); + + void open_remote_dirfrag(CInode *diri, frag_t fg, MDSContext *fin); + CInode *get_dentry_inode(CDentry *dn, MDRequestRef& mdr, bool projected=false); + + bool parallel_fetch(map<inodeno_t,filepath>& pathmap, set<inodeno_t>& missing); + bool parallel_fetch_traverse_dir(inodeno_t ino, filepath& path, + set<CDir*>& fetch_queue, set<inodeno_t>& missing, + C_GatherBuilder &gather_bld); + + void open_remote_dentry(CDentry *dn, bool projected, MDSContext *fin, + bool want_xlocked=false); + void _open_remote_dentry_finish(CDentry *dn, inodeno_t ino, MDSContext *fin, + bool want_xlocked, int r); + + void make_trace(vector<CDentry*>& trace, CInode *in); + +protected: + struct open_ino_info_t { + vector<inode_backpointer_t> ancestors; + set<mds_rank_t> checked; + mds_rank_t checking; + mds_rank_t auth_hint; + bool check_peers; + bool fetch_backtrace; + bool discover; + bool want_replica; + bool want_xlocked; + version_t tid; + int64_t pool; + int last_err; + MDSContext::vec waiters; + open_ino_info_t() : checking(MDS_RANK_NONE), auth_hint(MDS_RANK_NONE), + check_peers(true), fetch_backtrace(true), discover(false), + want_replica(false), want_xlocked(false), tid(0), pool(-1), + last_err(0) {} + }; + ceph_tid_t open_ino_last_tid = 0; + map<inodeno_t,open_ino_info_t> opening_inodes; + + void 
_open_ino_backtrace_fetched(inodeno_t ino, bufferlist& bl, int err); + void _open_ino_parent_opened(inodeno_t ino, int ret); + void _open_ino_traverse_dir(inodeno_t ino, open_ino_info_t& info, int err); + void _open_ino_fetch_dir(inodeno_t ino, const MMDSOpenIno::const_ref &m, CDir *dir, bool parent); + int open_ino_traverse_dir(inodeno_t ino, const MMDSOpenIno::const_ref &m, + const vector<inode_backpointer_t>& ancestors, + bool discover, bool want_xlocked, mds_rank_t *hint); + void open_ino_finish(inodeno_t ino, open_ino_info_t& info, int err); + void do_open_ino(inodeno_t ino, open_ino_info_t& info, int err); + void do_open_ino_peer(inodeno_t ino, open_ino_info_t& info); + void handle_open_ino(const MMDSOpenIno::const_ref &m, int err=0); + void handle_open_ino_reply(const MMDSOpenInoReply::const_ref &m); + friend class C_IO_MDC_OpenInoBacktraceFetched; + friend struct C_MDC_OpenInoTraverseDir; + friend struct C_MDC_OpenInoParentOpened; + +public: + void kick_open_ino_peers(mds_rank_t who); + void open_ino(inodeno_t ino, int64_t pool, MDSContext *fin, + bool want_replica=true, bool want_xlocked=false); + + // -- find_ino_peer -- + struct find_ino_peer_info_t { + inodeno_t ino; + ceph_tid_t tid; + MDSContext *fin; + mds_rank_t hint; + mds_rank_t checking; + set<mds_rank_t> checked; + + find_ino_peer_info_t() : tid(0), fin(NULL), hint(MDS_RANK_NONE), checking(MDS_RANK_NONE) {} + }; + + map<ceph_tid_t, find_ino_peer_info_t> find_ino_peer; + ceph_tid_t find_ino_peer_last_tid = 0; + + void find_ino_peers(inodeno_t ino, MDSContext *c, mds_rank_t hint=MDS_RANK_NONE); + void _do_find_ino_peer(find_ino_peer_info_t& fip); + void handle_find_ino(const MMDSFindIno::const_ref &m); + void handle_find_ino_reply(const MMDSFindInoReply::const_ref &m); + void kick_find_ino_peers(mds_rank_t who); + + // -- snaprealms -- +private: + SnapRealm *global_snaprealm = nullptr; +public: + SnapRealm *get_global_snaprealm() const { return global_snaprealm; } + void create_global_snaprealm(); 
+ void do_realm_invalidate_and_update_notify(CInode *in, int snapop, bool notify_clients=true); + void send_snap_update(CInode *in, version_t stid, int snap_op); + void handle_snap_update(const MMDSSnapUpdate::const_ref &m); + void notify_global_snaprealm_update(int snap_op); + + // -- stray -- +public: + void fetch_backtrace(inodeno_t ino, int64_t pool, bufferlist& bl, Context *fin); + uint64_t get_num_strays() const { return stray_manager.get_num_strays(); } + +protected: + void scan_stray_dir(dirfrag_t next=dirfrag_t()); + StrayManager stray_manager; + friend struct C_MDC_RetryScanStray; + + // == messages == + public: + void dispatch(const Message::const_ref &m); + + protected: + // -- replicas -- + void handle_discover(const MDiscover::const_ref &dis); + void handle_discover_reply(const MDiscoverReply::const_ref &m); + friend class C_MDC_Join; + +public: + void replicate_dir(CDir *dir, mds_rank_t to, bufferlist& bl); + void replicate_dentry(CDentry *dn, mds_rank_t to, bufferlist& bl); + void replicate_inode(CInode *in, mds_rank_t to, bufferlist& bl, + uint64_t features); + + CDir* add_replica_dir(bufferlist::const_iterator& p, CInode *diri, mds_rank_t from, MDSContext::vec& finished); + CDentry *add_replica_dentry(bufferlist::const_iterator& p, CDir *dir, MDSContext::vec& finished); + CInode *add_replica_inode(bufferlist::const_iterator& p, CDentry *dn, MDSContext::vec& finished); + + void replicate_stray(CDentry *straydn, mds_rank_t who, bufferlist& bl); + CDentry *add_replica_stray(const bufferlist &bl, mds_rank_t from); + + // -- namespace -- +public: + void send_dentry_link(CDentry *dn, MDRequestRef& mdr); + void send_dentry_unlink(CDentry *dn, CDentry *straydn, MDRequestRef& mdr); +protected: + void handle_dentry_link(const MDentryLink::const_ref &m); + void handle_dentry_unlink(const MDentryUnlink::const_ref &m); + + + // -- fragmenting -- +private: + struct ufragment { + int bits; + bool committed; + LogSegment *ls; + MDSContext::vec waiters; + 
frag_vec_t old_frags; + bufferlist rollback; + ufragment() : bits(0), committed(false), ls(NULL) {} + }; + map<dirfrag_t, ufragment> uncommitted_fragments; + + struct fragment_info_t { + int bits; + list<CDir*> dirs; + list<CDir*> resultfrags; + MDRequestRef mdr; + set<mds_rank_t> notify_ack_waiting; + bool finishing = false; + + // for deadlock detection + bool all_frozen = false; + utime_t last_cum_auth_pins_change; + int last_cum_auth_pins = 0; + int num_remote_waiters = 0; // number of remote authpin waiters + fragment_info_t() {} + bool is_fragmenting() { return !resultfrags.empty(); } + uint64_t get_tid() { return mdr ? mdr->reqid.tid : 0; } + }; + map<dirfrag_t,fragment_info_t> fragments; + typedef map<dirfrag_t,fragment_info_t>::iterator fragment_info_iterator; + + void adjust_dir_fragments(CInode *diri, frag_t basefrag, int bits, + list<CDir*>& frags, MDSContext::vec& waiters, bool replay); + void adjust_dir_fragments(CInode *diri, + list<CDir*>& srcfrags, + frag_t basefrag, int bits, + list<CDir*>& resultfrags, + MDSContext::vec& waiters, + bool replay); + CDir *force_dir_fragment(CInode *diri, frag_t fg, bool replay=true); + void get_force_dirfrag_bound_set(const vector<dirfrag_t>& dfs, set<CDir*>& bounds); + + bool can_fragment(CInode *diri, list<CDir*>& dirs); + void fragment_freeze_dirs(list<CDir*>& dirs); + void fragment_mark_and_complete(MDRequestRef& mdr); + void fragment_frozen(MDRequestRef& mdr, int r); + void fragment_unmark_unfreeze_dirs(list<CDir*>& dirs); + void fragment_drop_locks(fragment_info_t &info); + void fragment_maybe_finish(const fragment_info_iterator& it); + void dispatch_fragment_dir(MDRequestRef& mdr); + void _fragment_logged(MDRequestRef& mdr); + void _fragment_stored(MDRequestRef& mdr); + void _fragment_committed(dirfrag_t f, const MDRequestRef& mdr); + void _fragment_old_purged(dirfrag_t f, int bits, const MDRequestRef& mdr); + + friend class EFragment; + friend class C_MDC_FragmentFrozen; + friend class 
C_MDC_FragmentMarking;
+  friend class C_MDC_FragmentPrep;
+  friend class C_MDC_FragmentStore;
+  friend class C_MDC_FragmentCommit;
+  friend class C_IO_MDC_FragmentPurgeOld;
+
+  void handle_fragment_notify(const MMDSFragmentNotify::const_ref &m);
+  void handle_fragment_notify_ack(const MMDSFragmentNotifyAck::const_ref &m);
+
+  void add_uncommitted_fragment(dirfrag_t basedirfrag, int bits, const frag_vec_t& old_frag,
+				LogSegment *ls, bufferlist *rollback=NULL);
+  void finish_uncommitted_fragment(dirfrag_t basedirfrag, int op);
+  void rollback_uncommitted_fragment(dirfrag_t basedirfrag, frag_vec_t&& old_frags);
+
+
+  DecayCounter trim_counter;
+
+public:
+  // Queue a waiter on an in-flight fragment operation.  Note: map::at
+  // throws std::out_of_range if basedirfrag has no uncommitted entry.
+  void wait_for_uncommitted_fragment(dirfrag_t dirfrag, MDSContext *c) {
+    uncommitted_fragments.at(dirfrag).waiters.push_back(c);
+  }
+  bool is_any_uncommitted_fragment() const {
+    return !uncommitted_fragments.empty();
+  }
+  void wait_for_uncommitted_fragments(MDSContext* finisher);
+  void rollback_uncommitted_fragments();
+
+  void split_dir(CDir *dir, int byn);
+  void merge_dir(CInode *diri, frag_t fg);
+
+  void find_stale_fragment_freeze();
+  void fragment_freeze_inc_num_waiters(CDir *dir);
+  bool fragment_are_all_frozen(CDir *dir);
+  int get_num_fragmenting_dirs() { return fragments.size(); }
+
+  // -- updates --
+  //int send_inode_updates(CInode *in);
+  //void handle_inode_update(MInodeUpdate *m);
+
+  int send_dir_updates(CDir *in, bool bcast=false);
+  void handle_dir_update(const MDirUpdate::const_ref &m);
+
+  // -- cache expiration --
+  void handle_cache_expire(const MCacheExpire::const_ref &m);
+  // delayed cache expire
+  map<CDir*, expiremap> delayed_expire; // subtree root -> expire msg
+  void process_delayed_expire(CDir *dir);
+  void discard_delayed_expire(CDir *dir);
+
+  // -- mdsmap --
+  void handle_mdsmap(const MDSMap &mdsmap);
+
+protected:
+  int dump_cache(std::string_view fn, Formatter *f);
+public:
+  // Constructing std::string_view from a null char* is undefined behavior;
+  // pass an explicit empty view (and nullptr Formatter) instead of NULL.
+  int dump_cache() { return dump_cache(std::string_view(), nullptr); }
+  int dump_cache(std::string_view filename);
+
int dump_cache(Formatter *f); + void dump_tree(CInode *in, const int cur_depth, const int max_depth, Formatter *f); + + void cache_status(Formatter *f); + + void dump_resolve_status(Formatter *f) const; + void dump_rejoin_status(Formatter *f) const; + + // == crap fns == + public: + void show_cache(); + void show_subtrees(int dbl=10, bool force_print=false); + + CInode *hack_pick_random_inode() { + ceph_assert(!inode_map.empty()); + int n = rand() % inode_map.size(); + auto p = inode_map.begin(); + while (n--) ++p; + return p->second; + } + +protected: + void flush_dentry_work(MDRequestRef& mdr); + /** + * Resolve path to a dentry and pass it onto the ScrubStack. + * + * TODO: return enough information to the original mdr formatter + * and completion that they can subsequeuntly check the progress of + * this scrub (we won't block them on a whole scrub as it can take a very + * long time) + */ + void enqueue_scrub_work(MDRequestRef& mdr); + void recursive_scrub_finish(const ScrubHeaderRef& header); + void repair_inode_stats_work(MDRequestRef& mdr); + void repair_dirfrag_stats_work(MDRequestRef& mdr); + void upgrade_inode_snaprealm_work(MDRequestRef& mdr); + friend class C_MDC_RespondInternalRequest; +public: + void flush_dentry(std::string_view path, Context *fin); + /** + * Create and start an OP_ENQUEUE_SCRUB + */ + void enqueue_scrub(std::string_view path, std::string_view tag, + bool force, bool recursive, bool repair, + Formatter *f, Context *fin); + void repair_inode_stats(CInode *diri); + void repair_dirfrag_stats(CDir *dir); + void upgrade_inode_snaprealm(CInode *in); + +public: + /* Because exports may fail, this set lets us keep track of inodes that need exporting. 
*/ + std::set<CInode *> export_pin_queue; + std::set<CInode *> export_pin_delayed_queue; + + OpenFileTable open_file_table; + +private: + std::thread upkeeper; + ceph::mutex upkeep_mutex = ceph::make_mutex("MDCache::upkeep_mutex"); + ceph::condition_variable upkeep_cvar; + time upkeep_last_trim = time::min(); + time upkeep_last_release = time::min(); + std::atomic<bool> upkeep_trim_shutdown{false}; +}; + +class C_MDS_RetryRequest : public MDSInternalContext { + MDCache *cache; + MDRequestRef mdr; + public: + C_MDS_RetryRequest(MDCache *c, MDRequestRef& r); + void finish(int r) override; +}; + +#endif diff --git a/src/mds/MDLog.cc b/src/mds/MDLog.cc new file mode 100644 index 00000000..5277f9af --- /dev/null +++ b/src/mds/MDLog.cc @@ -0,0 +1,1530 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "MDSRank.h" +#include "MDLog.h" +#include "MDCache.h" +#include "LogEvent.h" +#include "MDSContext.h" + +#include "osdc/Journaler.h" +#include "mds/JournalPointer.h" + +#include "common/entity_name.h" +#include "common/perf_counters.h" +#include "common/Cond.h" + +#include "events/ESubtreeMap.h" + +#include "common/config.h" +#include "common/errno.h" +#include "include/ceph_assert.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mds +#undef dout_prefix +#define dout_prefix *_dout << "mds." 
<< mds->get_nodeid() << ".log " + +// cons/des +MDLog::~MDLog() +{ + if (journaler) { delete journaler; journaler = 0; } + if (logger) { + g_ceph_context->get_perfcounters_collection()->remove(logger); + delete logger; + logger = 0; + } +} + + +void MDLog::create_logger() +{ + PerfCountersBuilder plb(g_ceph_context, "mds_log", l_mdl_first, l_mdl_last); + + plb.add_u64_counter(l_mdl_evadd, "evadd", "Events submitted", "subm", + PerfCountersBuilder::PRIO_INTERESTING); + plb.add_u64(l_mdl_ev, "ev", "Events", "evts", + PerfCountersBuilder::PRIO_INTERESTING); + plb.add_u64(l_mdl_seg, "seg", "Segments", "segs", + PerfCountersBuilder::PRIO_INTERESTING); + + plb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL); + plb.add_u64(l_mdl_evexg, "evexg", "Expiring events"); + plb.add_u64(l_mdl_evexd, "evexd", "Current expired events"); + plb.add_u64(l_mdl_segexg, "segexg", "Expiring segments"); + plb.add_u64(l_mdl_segexd, "segexd", "Current expired segments"); + plb.add_u64_counter(l_mdl_replayed, "replayed", "Events replayed", + "repl", PerfCountersBuilder::PRIO_INTERESTING); + plb.add_time_avg(l_mdl_jlat, "jlat", "Journaler flush latency"); + plb.add_u64_counter(l_mdl_evex, "evex", "Total expired events"); + plb.add_u64_counter(l_mdl_evtrm, "evtrm", "Trimmed events"); + plb.add_u64_counter(l_mdl_segadd, "segadd", "Segments added"); + plb.add_u64_counter(l_mdl_segex, "segex", "Total expired segments"); + plb.add_u64_counter(l_mdl_segtrm, "segtrm", "Trimmed segments"); + + plb.set_prio_default(PerfCountersBuilder::PRIO_DEBUGONLY); + plb.add_u64(l_mdl_expos, "expos", "Journaler xpire position"); + plb.add_u64(l_mdl_wrpos, "wrpos", "Journaler write position"); + plb.add_u64(l_mdl_rdpos, "rdpos", "Journaler read position"); + + // logger + logger = plb.create_perf_counters(); + g_ceph_context->get_perfcounters_collection()->add(logger); +} + +void MDLog::set_write_iohint(unsigned iohint_flags) +{ + journaler->set_write_iohint(iohint_flags); +} + +class C_MDL_WriteError : public 
MDSIOContextBase { + protected: + MDLog *mdlog; + MDSRank *get_mds() override {return mdlog->mds;} + + void finish(int r) override { + MDSRank *mds = get_mds(); + // assume journal is reliable, so don't choose action based on + // g_conf()->mds_action_on_write_error. + if (r == -EBLACKLISTED) { + derr << "we have been blacklisted (fenced), respawning..." << dendl; + mds->respawn(); + } else { + derr << "unhandled error " << cpp_strerror(r) << ", shutting down..." << dendl; + // Although it's possible that this could be something transient, + // it's severe and scary, so disable this rank until an administrator + // intervenes. + mds->clog->error() << "Unhandled journal write error on MDS rank " << + mds->get_nodeid() << ": " << cpp_strerror(r) << ", shutting down."; + mds->damaged(); + ceph_abort(); // damaged should never return + } + } + + public: + explicit C_MDL_WriteError(MDLog *m) : + MDSIOContextBase(false), mdlog(m) {} + void print(ostream& out) const override { + out << "mdlog_write_error"; + } +}; + + +void MDLog::write_head(MDSContext *c) +{ + Context *fin = NULL; + if (c != NULL) { + fin = new C_IO_Wrapper(mds, c); + } + journaler->write_head(fin); +} + +uint64_t MDLog::get_read_pos() const +{ + return journaler->get_read_pos(); +} + +uint64_t MDLog::get_write_pos() const +{ + return journaler->get_write_pos(); +} + +uint64_t MDLog::get_safe_pos() const +{ + return journaler->get_write_safe_pos(); +} + + + +void MDLog::create(MDSContext *c) +{ + dout(5) << "create empty log" << dendl; + + C_GatherBuilder gather(g_ceph_context); + // This requires an OnFinisher wrapper because Journaler will call back the completion for write_head inside its own lock + // XXX but should maybe that be handled inside Journaler? 
+  gather.set_finisher(new C_IO_Wrapper(mds, c));
+
+  // The inode of the default Journaler we will create
+  ino = MDS_INO_LOG_OFFSET + mds->get_nodeid();
+
+  // Instantiate Journaler and start async write to RADOS
+  ceph_assert(journaler == NULL);
+  journaler = new Journaler("mdlog", ino, mds->mdsmap->get_metadata_pool(),
+                            CEPH_FS_ONDISK_MAGIC, mds->objecter, logger,
+                            l_mdl_jlat, mds->finisher);
+  ceph_assert(journaler->is_readonly());
+  journaler->set_write_error_handler(new C_MDL_WriteError(this));
+  journaler->set_writeable();
+  journaler->create(&mds->mdcache->default_log_layout, g_conf()->mds_journal_format);
+  // head write is one sub of the gather; completion fires when both the
+  // journal head and the JournalPointer below have been persisted.
+  journaler->write_head(gather.new_sub());
+
+  // Async write JournalPointer to RADOS
+  JournalPointer jp(mds->get_nodeid(), mds->mdsmap->get_metadata_pool());
+  jp.front = ino;
+  jp.back = 0;
+  jp.save(mds->objecter, gather.new_sub());
+
+  gather.activate();
+
+  logger->set(l_mdl_expos, journaler->get_expire_pos());
+  logger->set(l_mdl_wrpos, journaler->get_write_pos());
+
+  submit_thread.create("md_submit");
+}
+
+// Start journal recovery: recovery_thread discovers the journal bounds
+// asynchronously and completes 'c'; the submit thread is started here as
+// well since either append() or replay() will follow.
+void MDLog::open(MDSContext *c)
+{
+  dout(5) << "open discovering log bounds" << dendl;
+
+  ceph_assert(!recovery_thread.is_started());
+  recovery_thread.set_completion(c);
+  recovery_thread.create("md_recov_open");
+
+  submit_thread.create("md_submit");
+  // either append() or replay() will follow.
+}
+
+/**
+ * Final part of reopen() procedure, after recovery_thread
+ * has done its thing we call append()
+ */
+class C_ReopenComplete : public MDSInternalContext {
+  MDLog *mdlog;
+  MDSContext *on_complete;
+public:
+  C_ReopenComplete(MDLog *mdlog_, MDSContext *on_complete_) : MDSInternalContext(mdlog_->mds), mdlog(mdlog_), on_complete(on_complete_) {}
+  void finish(int r) override {
+    mdlog->append();
+    on_complete->complete(r);
+  }
+};
+
+/**
+ * Given that open() has been called in the past, go through the journal
+ * recovery procedure again, potentially reformatting the journal if it
+ * was in an old format.
+ */
+void MDLog::reopen(MDSContext *c)
+{
+  dout(5) << "reopen" << dendl;
+
+  // Because we will call append() at the completion of this, check that we have already
+  // read the whole journal.
+  ceph_assert(journaler != NULL);
+  ceph_assert(journaler->get_read_pos() == journaler->get_write_pos());
+
+  delete journaler;
+  journaler = NULL;
+
+  // recovery_thread was started at some point in the past.  Although
+  // it has called its completion if we made it back here, it might
+  // still not have been cleaned up: join it.
+  recovery_thread.join();
+
+  // restart recovery; C_ReopenComplete will call append() then 'c'.
+  recovery_thread.set_completion(new C_ReopenComplete(this, c));
+  recovery_thread.create("md_recov_reopen");
+}
+
+// Position the journal at its end and mark it writeable so new events
+// can be appended; also resets the expire position to the write position.
+void MDLog::append()
+{
+  dout(5) << "append positioning at end and marking writeable" << dendl;
+  journaler->set_read_pos(journaler->get_write_pos());
+  journaler->set_expire_pos(journaler->get_write_pos());
+
+  journaler->set_writeable();
+
+  logger->set(l_mdl_expos, journaler->get_write_pos());
+}
+
+
+
+// -------------------------------------------------
+
+// Begin a new log event: record it as cur_event, bump event_seq, and
+// stamp the event's metablob with the sequence / last subtree map.
+// Caller must hold submit_mutex; only one event may be open at a time.
+void MDLog::_start_entry(LogEvent *e)
+{
+  ceph_assert(submit_mutex.is_locked_by_me());
+
+  ceph_assert(cur_event == NULL);
+  cur_event = e;
+
+  event_seq++;
+
+  EMetaBlob *metablob = e->get_metablob();
+  if (metablob) {
+    metablob->event_seq = event_seq;
+    metablob->last_subtree_map = get_last_segment_seq();
+  }
+}
+
+// Abandon the event started by _start_entry() without journaling it.
+void MDLog::cancel_entry(LogEvent *le)
+{
+  ceph_assert(le == cur_event);
+  cur_event = NULL;
+  delete le;
+}
+
+// Queue the current event for the submit thread.  Caller must hold
+// submit_mutex; not legal during replay or after the log is capped.
+void MDLog::_submit_entry(LogEvent *le, MDSLogContextBase *c)
+{
+  ceph_assert(submit_mutex.is_locked_by_me());
+  ceph_assert(!mds->is_any_replay());
+  ceph_assert(!capped);
+
+  ceph_assert(le == cur_event);
+  cur_event = NULL;
+
+  // let the event register itself in the segment
+  ceph_assert(!segments.empty());
+  LogSegment *ls = segments.rbegin()->second;
+  ls->num_events++;
+
+  le->_segment = ls;
+  le->update_segment();
+  le->set_stamp(ceph_clock_now());
+
+  mdsmap_up_features = mds->mdsmap->get_up_features();
+
pending_events[ls->seq].push_back(PendingEvent(le, c)); + num_events++; + + if (logger) { + logger->inc(l_mdl_evadd); + logger->set(l_mdl_ev, num_events); + } + + unflushed++; + + uint64_t period = journaler->get_layout_period(); + // start a new segment? + if (le->get_type() == EVENT_SUBTREEMAP || + (le->get_type() == EVENT_IMPORTFINISH && mds->is_resolve())) { + // avoid infinite loop when ESubtreeMap is very large. + // do not insert ESubtreeMap among EImportFinish events that finish + // disambiguate imports. Because the ESubtreeMap reflects the subtree + // state when all EImportFinish events are replayed. + } else if (ls->end/period != ls->offset/period || + ls->num_events >= g_conf()->mds_log_events_per_segment) { + dout(10) << "submit_entry also starting new segment: last = " + << ls->seq << "/" << ls->offset << ", event seq = " << event_seq << dendl; + _start_new_segment(); + } else if (g_conf()->mds_debug_subtrees && + le->get_type() != EVENT_SUBTREEMAP_TEST) { + // debug: journal this every time to catch subtree replay bugs. + // use a different event id so it doesn't get interpreted as a + // LogSegment boundary on replay. 
+ LogEvent *sle = mds->mdcache->create_subtree_map(); + sle->set_type(EVENT_SUBTREEMAP_TEST); + _submit_entry(sle, NULL); + } +} + +/** + * Invoked on the flush after each entry submitted + */ +class C_MDL_Flushed : public MDSLogContextBase { +protected: + MDLog *mdlog; + MDSRank *get_mds() override {return mdlog->mds;} + MDSContext *wrapped; + + void finish(int r) override { + if (wrapped) + wrapped->complete(r); + } + +public: + C_MDL_Flushed(MDLog *m, MDSContext *w) + : mdlog(m), wrapped(w) {} + C_MDL_Flushed(MDLog *m, uint64_t wp) : mdlog(m), wrapped(NULL) { + set_write_pos(wp); + } +}; + +void MDLog::_submit_thread() +{ + dout(10) << "_submit_thread start" << dendl; + + submit_mutex.Lock(); + + while (!mds->is_daemon_stopping()) { + if (g_conf()->mds_log_pause) { + submit_cond.Wait(submit_mutex); + continue; + } + + map<uint64_t,list<PendingEvent> >::iterator it = pending_events.begin(); + if (it == pending_events.end()) { + submit_cond.Wait(submit_mutex); + continue; + } + + if (it->second.empty()) { + pending_events.erase(it); + continue; + } + + int64_t features = mdsmap_up_features; + PendingEvent data = it->second.front(); + it->second.pop_front(); + + submit_mutex.Unlock(); + + if (data.le) { + LogEvent *le = data.le; + LogSegment *ls = le->_segment; + // encode it, with event type + bufferlist bl; + le->encode_with_header(bl, features); + + uint64_t write_pos = journaler->get_write_pos(); + + le->set_start_off(write_pos); + if (le->get_type() == EVENT_SUBTREEMAP) + ls->offset = write_pos; + + dout(5) << "_submit_thread " << write_pos << "~" << bl.length() + << " : " << *le << dendl; + + // journal it. + const uint64_t new_write_pos = journaler->append_entry(bl); // bl is destroyed. 
+ ls->end = new_write_pos; + + MDSLogContextBase *fin; + if (data.fin) { + fin = dynamic_cast<MDSLogContextBase*>(data.fin); + ceph_assert(fin); + fin->set_write_pos(new_write_pos); + } else { + fin = new C_MDL_Flushed(this, new_write_pos); + } + + journaler->wait_for_flush(fin); + + if (data.flush) + journaler->flush(); + + if (logger) + logger->set(l_mdl_wrpos, ls->end); + + delete le; + } else { + if (data.fin) { + MDSContext* fin = + dynamic_cast<MDSContext*>(data.fin); + ceph_assert(fin); + C_MDL_Flushed *fin2 = new C_MDL_Flushed(this, fin); + fin2->set_write_pos(journaler->get_write_pos()); + journaler->wait_for_flush(fin2); + } + if (data.flush) + journaler->flush(); + } + + submit_mutex.Lock(); + if (data.flush) + unflushed = 0; + else if (data.le) + unflushed++; + } + + submit_mutex.Unlock(); +} + +void MDLog::wait_for_safe(MDSContext *c) +{ + submit_mutex.Lock(); + + bool no_pending = true; + if (!pending_events.empty()) { + pending_events.rbegin()->second.push_back(PendingEvent(NULL, c)); + no_pending = false; + submit_cond.Signal(); + } + + submit_mutex.Unlock(); + + if (no_pending && c) + journaler->wait_for_flush(new C_IO_Wrapper(mds, c)); +} + +void MDLog::flush() +{ + submit_mutex.Lock(); + + bool do_flush = unflushed > 0; + unflushed = 0; + if (!pending_events.empty()) { + pending_events.rbegin()->second.push_back(PendingEvent(NULL, NULL, true)); + do_flush = false; + submit_cond.Signal(); + } + + submit_mutex.Unlock(); + + if (do_flush) + journaler->flush(); +} + +void MDLog::kick_submitter() +{ + std::lock_guard l(submit_mutex); + submit_cond.Signal(); +} + +void MDLog::cap() +{ + dout(5) << "cap" << dendl; + capped = true; +} + +void MDLog::shutdown() +{ + ceph_assert(mds->mds_lock.is_locked_by_me()); + + dout(5) << "shutdown" << dendl; + if (submit_thread.is_started()) { + ceph_assert(mds->is_daemon_stopping()); + + if (submit_thread.am_self()) { + // Called suicide from the thread: trust it to do no work after + // returning from suicide, and 
subsequently respect mds->is_daemon_stopping() + // and fall out of its loop. + } else { + mds->mds_lock.Unlock(); + // Because MDS::stopping is true, it's safe to drop mds_lock: nobody else + // picking it up will do anything with it. + + submit_mutex.Lock(); + submit_cond.Signal(); + submit_mutex.Unlock(); + + mds->mds_lock.Lock(); + + submit_thread.join(); + } + } + + // Replay thread can be stuck inside e.g. Journaler::wait_for_readable, + // so we need to shutdown the journaler first. + if (journaler) { + journaler->shutdown(); + } + + if (replay_thread.is_started() && !replay_thread.am_self()) { + mds->mds_lock.Unlock(); + replay_thread.join(); + mds->mds_lock.Lock(); + } + + if (recovery_thread.is_started() && !recovery_thread.am_self()) { + mds->mds_lock.Unlock(); + recovery_thread.join(); + mds->mds_lock.Lock(); + } +} + + +// ----------------------------- +// segments + +void MDLog::_start_new_segment() +{ + _prepare_new_segment(); + _journal_segment_subtree_map(NULL); +} + +void MDLog::_prepare_new_segment() +{ + ceph_assert(submit_mutex.is_locked_by_me()); + + uint64_t seq = event_seq + 1; + dout(7) << __func__ << " seq " << seq << dendl; + + segments[seq] = new LogSegment(seq); + + logger->inc(l_mdl_segadd); + logger->set(l_mdl_seg, segments.size()); + + // Adjust to next stray dir + dout(10) << "Advancing to next stray directory on mds " << mds->get_nodeid() + << dendl; + mds->mdcache->advance_stray(); +} + +void MDLog::_journal_segment_subtree_map(MDSContext *onsync) +{ + ceph_assert(submit_mutex.is_locked_by_me()); + + dout(7) << __func__ << dendl; + ESubtreeMap *sle = mds->mdcache->create_subtree_map(); + sle->event_seq = get_last_segment_seq(); + + _submit_entry(sle, new C_MDL_Flushed(this, onsync)); +} + +class C_OFT_Committed : public MDSInternalContext { + MDLog *mdlog; + uint64_t seq; +public: + C_OFT_Committed(MDLog *l, uint64_t s) : + MDSInternalContext(l->mds), mdlog(l), seq(s) {} + void finish(int ret) override { + 
mdlog->trim_expired_segments(); + } +}; + +void MDLog::trim(int m) +{ + unsigned max_segments = g_conf()->mds_log_max_segments; + int max_events = g_conf()->mds_log_max_events; + if (m >= 0) + max_events = m; + + if (mds->mdcache->is_readonly()) { + dout(10) << "trim, ignoring read-only FS" << dendl; + return; + } + + // Clamp max_events to not be smaller than events per segment + if (max_events > 0 && max_events <= g_conf()->mds_log_events_per_segment) { + max_events = g_conf()->mds_log_events_per_segment + 1; + } + + submit_mutex.Lock(); + + // trim! + dout(10) << "trim " + << segments.size() << " / " << max_segments << " segments, " + << num_events << " / " << max_events << " events" + << ", " << expiring_segments.size() << " (" << expiring_events << ") expiring" + << ", " << expired_segments.size() << " (" << expired_events << ") expired" + << dendl; + + if (segments.empty()) { + submit_mutex.Unlock(); + return; + } + + // hack: only trim for a few seconds at a time + utime_t stop = ceph_clock_now(); + stop += 2.0; + + int op_prio = CEPH_MSG_PRIO_LOW + + (CEPH_MSG_PRIO_HIGH - CEPH_MSG_PRIO_LOW) * + expiring_segments.size() / max_segments; + if (op_prio > CEPH_MSG_PRIO_HIGH) + op_prio = CEPH_MSG_PRIO_HIGH; + + unsigned new_expiring_segments = 0; + + unsigned max_expiring_segments = 0; + if (pre_segments_size > 0){ + max_expiring_segments = max_segments/2; + assert(segments.size() >= pre_segments_size); + max_expiring_segments = std::max<unsigned>(max_expiring_segments,segments.size() - pre_segments_size); + } + + map<uint64_t,LogSegment*>::iterator p = segments.begin(); + while (p != segments.end()) { + if (stop < ceph_clock_now()) + break; + + unsigned num_remaining_segments = (segments.size() - expired_segments.size() - expiring_segments.size()); + if ((num_remaining_segments <= max_segments) && + (max_events < 0 || num_events - expiring_events - expired_events <= max_events)) + break; + + // Do not trim too many segments at once for peak workload. 
If mds keeps creating N segments each tick, + // the upper bound of 'num_remaining_segments - max_segments' is '2 * N' + if (new_expiring_segments * 2 > num_remaining_segments) + break; + + if (max_expiring_segments > 0 && + expiring_segments.size() >= max_expiring_segments) + break; + + // look at first segment + LogSegment *ls = p->second; + ceph_assert(ls); + ++p; + + if (pending_events.count(ls->seq) || + ls->end > safe_pos) { + dout(5) << "trim segment " << ls->seq << "/" << ls->offset << ", not fully flushed yet, safe " + << journaler->get_write_safe_pos() << " < end " << ls->end << dendl; + break; + } + + if (expiring_segments.count(ls)) { + dout(5) << "trim already expiring segment " << ls->seq << "/" << ls->offset + << ", " << ls->num_events << " events" << dendl; + } else if (expired_segments.count(ls)) { + dout(5) << "trim already expired segment " << ls->seq << "/" << ls->offset + << ", " << ls->num_events << " events" << dendl; + } else { + ceph_assert(expiring_segments.count(ls) == 0); + new_expiring_segments++; + expiring_segments.insert(ls); + expiring_events += ls->num_events; + submit_mutex.Unlock(); + + uint64_t last_seq = ls->seq; + try_expire(ls, op_prio); + + submit_mutex.Lock(); + p = segments.lower_bound(last_seq + 1); + } + } + + if (!capped && + !mds->mdcache->open_file_table.is_any_committing()) { + uint64_t last_seq = get_last_segment_seq(); + if (mds->mdcache->open_file_table.is_any_dirty() || + last_seq > mds->mdcache->open_file_table.get_committed_log_seq()) { + submit_mutex.Unlock(); + mds->mdcache->open_file_table.commit(new C_OFT_Committed(this, last_seq), + last_seq, CEPH_MSG_PRIO_HIGH); + submit_mutex.Lock(); + } + } + + // discard expired segments and unlock submit_mutex + _trim_expired_segments(); +} + +class C_MaybeExpiredSegment : public MDSInternalContext { + MDLog *mdlog; + LogSegment *ls; + int op_prio; + public: + C_MaybeExpiredSegment(MDLog *mdl, LogSegment *s, int p) : + MDSInternalContext(mdl->mds), mdlog(mdl), ls(s), 
op_prio(p) {} + void finish(int res) override { + if (res < 0) + mdlog->mds->handle_write_error(res); + mdlog->_maybe_expired(ls, op_prio); + } +}; + +/** + * Like MDLog::trim, but instead of trimming to max_segments, trim all but the latest + * segment. + */ +int MDLog::trim_all() +{ + submit_mutex.Lock(); + + dout(10) << __func__ << ": " + << segments.size() + << "/" << expiring_segments.size() + << "/" << expired_segments.size() << dendl; + + uint64_t last_seq = 0; + if (!segments.empty()) { + last_seq = get_last_segment_seq(); + if (!capped && + !mds->mdcache->open_file_table.is_any_committing() && + last_seq > mds->mdcache->open_file_table.get_committing_log_seq()) { + submit_mutex.Unlock(); + mds->mdcache->open_file_table.commit(new C_OFT_Committed(this, last_seq), + last_seq, CEPH_MSG_PRIO_DEFAULT); + submit_mutex.Lock(); + } + } + + map<uint64_t,LogSegment*>::iterator p = segments.begin(); + while (p != segments.end() && + p->first < last_seq && + p->second->end < safe_pos) { // next segment should have been started + LogSegment *ls = p->second; + ++p; + + // Caller should have flushed journaler before calling this + if (pending_events.count(ls->seq)) { + dout(5) << __func__ << ": segment " << ls->seq << " has pending events" << dendl; + submit_mutex.Unlock(); + return -EAGAIN; + } + + if (expiring_segments.count(ls)) { + dout(5) << "trim already expiring segment " << ls->seq << "/" << ls->offset + << ", " << ls->num_events << " events" << dendl; + } else if (expired_segments.count(ls)) { + dout(5) << "trim already expired segment " << ls->seq << "/" << ls->offset + << ", " << ls->num_events << " events" << dendl; + } else { + ceph_assert(expiring_segments.count(ls) == 0); + expiring_segments.insert(ls); + expiring_events += ls->num_events; + submit_mutex.Unlock(); + + uint64_t next_seq = ls->seq + 1; + try_expire(ls, CEPH_MSG_PRIO_DEFAULT); + + submit_mutex.Lock(); + p = segments.lower_bound(next_seq); + } + } + + _trim_expired_segments(); + + return 0; +} 
+ + +void MDLog::try_expire(LogSegment *ls, int op_prio) +{ + MDSGatherBuilder gather_bld(g_ceph_context); + ls->try_to_expire(mds, gather_bld, op_prio); + + if (gather_bld.has_subs()) { + dout(5) << "try_expire expiring segment " << ls->seq << "/" << ls->offset << dendl; + gather_bld.set_finisher(new C_MaybeExpiredSegment(this, ls, op_prio)); + gather_bld.activate(); + } else { + dout(10) << "try_expire expired segment " << ls->seq << "/" << ls->offset << dendl; + submit_mutex.Lock(); + ceph_assert(expiring_segments.count(ls)); + expiring_segments.erase(ls); + expiring_events -= ls->num_events; + _expired(ls); + submit_mutex.Unlock(); + } + + logger->set(l_mdl_segexg, expiring_segments.size()); + logger->set(l_mdl_evexg, expiring_events); +} + +void MDLog::_maybe_expired(LogSegment *ls, int op_prio) +{ + if (mds->mdcache->is_readonly()) { + dout(10) << "_maybe_expired, ignoring read-only FS" << dendl; + return; + } + + dout(10) << "_maybe_expired segment " << ls->seq << "/" << ls->offset + << ", " << ls->num_events << " events" << dendl; + try_expire(ls, op_prio); +} + +void MDLog::_trim_expired_segments() +{ + ceph_assert(submit_mutex.is_locked_by_me()); + + uint64_t oft_committed_seq = mds->mdcache->open_file_table.get_committed_log_seq(); + + // trim expired segments? 
+ bool trimmed = false; + while (!segments.empty()) { + LogSegment *ls = segments.begin()->second; + if (!expired_segments.count(ls)) { + dout(10) << "_trim_expired_segments waiting for " << ls->seq << "/" << ls->offset + << " to expire" << dendl; + break; + } + + if (!capped && ls->seq >= oft_committed_seq) { + dout(10) << "_trim_expired_segments open file table committedseq " << oft_committed_seq + << " <= " << ls->seq << "/" << ls->offset << dendl; + break; + } + + dout(10) << "_trim_expired_segments trimming expired " + << ls->seq << "/0x" << std::hex << ls->offset << std::dec << dendl; + expired_events -= ls->num_events; + expired_segments.erase(ls); + if (pre_segments_size > 0) + pre_segments_size--; + num_events -= ls->num_events; + + // this was the oldest segment, adjust expire pos + if (journaler->get_expire_pos() < ls->end) { + journaler->set_expire_pos(ls->end); + logger->set(l_mdl_expos, ls->end); + } else { + logger->set(l_mdl_expos, ls->offset); + } + + logger->inc(l_mdl_segtrm); + logger->inc(l_mdl_evtrm, ls->num_events); + + segments.erase(ls->seq); + delete ls; + trimmed = true; + } + + submit_mutex.Unlock(); + + if (trimmed) + journaler->write_head(0); +} + +void MDLog::trim_expired_segments() +{ + submit_mutex.Lock(); + _trim_expired_segments(); +} + +void MDLog::_expired(LogSegment *ls) +{ + ceph_assert(submit_mutex.is_locked_by_me()); + + dout(5) << "_expired segment " << ls->seq << "/" << ls->offset + << ", " << ls->num_events << " events" << dendl; + + if (!capped && ls == peek_current_segment()) { + dout(5) << "_expired not expiring " << ls->seq << "/" << ls->offset + << ", last one and !capped" << dendl; + } else { + // expired. 
+ expired_segments.insert(ls); + expired_events += ls->num_events; + + // Trigger all waiters + finish_contexts(g_ceph_context, ls->expiry_waiters); + + logger->inc(l_mdl_evex, ls->num_events); + logger->inc(l_mdl_segex); + } + + logger->set(l_mdl_ev, num_events); + logger->set(l_mdl_evexd, expired_events); + logger->set(l_mdl_seg, segments.size()); + logger->set(l_mdl_segexd, expired_segments.size()); +} + + + +void MDLog::replay(MDSContext *c) +{ + ceph_assert(journaler->is_active()); + ceph_assert(journaler->is_readonly()); + + // empty? + if (journaler->get_read_pos() == journaler->get_write_pos()) { + dout(10) << "replay - journal empty, done." << dendl; + mds->mdcache->trim(); + if (mds->is_standby_replay()) + mds->update_mlogger(); + if (c) { + c->complete(0); + } + return; + } + + // add waiter + if (c) + waitfor_replay.push_back(c); + + // go! + dout(10) << "replay start, from " << journaler->get_read_pos() + << " to " << journaler->get_write_pos() << dendl; + + ceph_assert(num_events == 0 || already_replayed); + if (already_replayed) { + // Ensure previous instance of ReplayThread is joined before + // we create another one + replay_thread.join(); + } + already_replayed = true; + + replay_thread.create("md_log_replay"); +} + + +/** + * Resolve the JournalPointer object to a journal file, and + * instantiate a Journaler object. This may re-write the journal + * if the journal in RADOS appears to be in an old format. + * + * This is a separate thread because of the way it is initialized from inside + * the mds lock, which is also the global objecter lock -- rather than split + * it up into hard-to-read async operations linked up by contexts, + * + * When this function completes, the `journaler` attribute will be set to + * a Journaler instance using the latest available serialization format. 
+ */ +void MDLog::_recovery_thread(MDSContext *completion) +{ + ceph_assert(journaler == NULL); + if (g_conf()->mds_journal_format > JOURNAL_FORMAT_MAX) { + dout(0) << "Configuration value for mds_journal_format is out of bounds, max is " + << JOURNAL_FORMAT_MAX << dendl; + + // Oh dear, something unreadable in the store for this rank: require + // operator intervention. + mds->damaged(); + ceph_abort(); // damaged should not return + } + + // First, read the pointer object. + // If the pointer object is not present, then create it with + // front = default ino and back = null + JournalPointer jp(mds->get_nodeid(), mds->mdsmap->get_metadata_pool()); + const int read_result = jp.load(mds->objecter); + if (read_result == -ENOENT) { + inodeno_t const default_log_ino = MDS_INO_LOG_OFFSET + mds->get_nodeid(); + jp.front = default_log_ino; + int write_result = jp.save(mds->objecter); + // Nothing graceful we can do for this + ceph_assert(write_result >= 0); + } else if (read_result == -EBLACKLISTED) { + derr << "Blacklisted during JournalPointer read! Respawning..." << dendl; + mds->respawn(); + ceph_abort(); // Should be unreachable because respawn calls execv + } else if (read_result != 0) { + mds->clog->error() << "failed to read JournalPointer: " << read_result + << " (" << cpp_strerror(read_result) << ")"; + mds->damaged_unlocked(); + ceph_abort(); // Should be unreachable because damaged() calls respawn() + } + + // If the back pointer is non-null, that means that a journal + // rewrite failed part way through. Erase the back journal + // to clean up. 
+ if (jp.back) { + if (mds->is_standby_replay()) { + dout(1) << "Journal " << jp.front << " is being rewritten, " + << "cannot replay in standby until an active MDS completes rewrite" << dendl; + std::lock_guard l(mds->mds_lock); + if (mds->is_daemon_stopping()) { + return; + } + completion->complete(-EAGAIN); + return; + } + dout(1) << "Erasing journal " << jp.back << dendl; + C_SaferCond erase_waiter; + Journaler back("mdlog", jp.back, mds->mdsmap->get_metadata_pool(), + CEPH_FS_ONDISK_MAGIC, mds->objecter, logger, l_mdl_jlat, + mds->finisher); + + // Read all about this journal (header + extents) + C_SaferCond recover_wait; + back.recover(&recover_wait); + int recovery_result = recover_wait.wait(); + if (recovery_result == -EBLACKLISTED) { + derr << "Blacklisted during journal recovery! Respawning..." << dendl; + mds->respawn(); + ceph_abort(); // Should be unreachable because respawn calls execv + } else if (recovery_result != 0) { + // Journaler.recover succeeds if no journal objects are present: an error + // means something worse like a corrupt header, which we can't handle here. + mds->clog->error() << "Error recovering journal " << jp.front << ": " + << cpp_strerror(recovery_result); + mds->damaged_unlocked(); + ceph_assert(recovery_result == 0); // Unreachable because damaged() calls respawn() + } + + // We could read journal, so we can erase it. + back.erase(&erase_waiter); + int erase_result = erase_waiter.wait(); + + // If we are successful, or find no data, we can update the JournalPointer to + // reflect that the back journal is gone. 
+ if (erase_result != 0 && erase_result != -ENOENT) { + derr << "Failed to erase journal " << jp.back << ": " << cpp_strerror(erase_result) << dendl; + } else { + dout(1) << "Successfully erased journal, updating journal pointer" << dendl; + jp.back = 0; + int write_result = jp.save(mds->objecter); + // Nothing graceful we can do for this + ceph_assert(write_result >= 0); + } + } + + /* Read the header from the front journal */ + Journaler *front_journal = new Journaler("mdlog", jp.front, + mds->mdsmap->get_metadata_pool(), CEPH_FS_ONDISK_MAGIC, mds->objecter, + logger, l_mdl_jlat, mds->finisher); + + // Assign to ::journaler so that we can be aborted by ::shutdown while + // waiting for journaler recovery + { + std::lock_guard l(mds->mds_lock); + journaler = front_journal; + } + + C_SaferCond recover_wait; + front_journal->recover(&recover_wait); + dout(4) << "Waiting for journal " << jp.front << " to recover..." << dendl; + int recovery_result = recover_wait.wait(); + dout(4) << "Journal " << jp.front << " recovered." << dendl; + + if (recovery_result == -EBLACKLISTED) { + derr << "Blacklisted during journal recovery! Respawning..." << dendl; + mds->respawn(); + ceph_abort(); // Should be unreachable because respawn calls execv + } else if (recovery_result != 0) { + mds->clog->error() << "Error recovering journal " << jp.front << ": " + << cpp_strerror(recovery_result); + mds->damaged_unlocked(); + ceph_assert(recovery_result == 0); // Unreachable because damaged() calls respawn() + } + + /* Check whether the front journal format is acceptable or needs re-write */ + if (front_journal->get_stream_format() > JOURNAL_FORMAT_MAX) { + dout(0) << "Journal " << jp.front << " is in unknown format " << front_journal->get_stream_format() + << ", does this MDS daemon require upgrade?" 
<< dendl; + { + std::lock_guard l(mds->mds_lock); + if (mds->is_daemon_stopping()) { + journaler = NULL; + delete front_journal; + return; + } + completion->complete(-EINVAL); + } + } else if (mds->is_standby_replay() || front_journal->get_stream_format() >= g_conf()->mds_journal_format) { + /* The journal is of configured format, or we are in standbyreplay and will + * tolerate replaying old journals until we have to go active. Use front_journal as + * our journaler attribute and complete */ + dout(4) << "Recovered journal " << jp.front << " in format " << front_journal->get_stream_format() << dendl; + journaler->set_write_error_handler(new C_MDL_WriteError(this)); + { + std::lock_guard l(mds->mds_lock); + if (mds->is_daemon_stopping()) { + return; + } + completion->complete(0); + } + } else { + /* Hand off to reformat routine, which will ultimately set the + * completion when it has done its thing */ + dout(1) << "Journal " << jp.front << " has old format " + << front_journal->get_stream_format() << ", it will now be updated" << dendl; + _reformat_journal(jp, front_journal, completion); + } +} + +/** + * Blocking rewrite of the journal to a new file, followed by + * swap of journal pointer to point to the new one. + * + * We write the new journal to the 'back' journal from the JournalPointer, + * swapping pointers to make that one the front journal only when we have + * safely completed. + */ +void MDLog::_reformat_journal(JournalPointer const &jp_in, Journaler *old_journal, MDSContext *completion) +{ + ceph_assert(!jp_in.is_null()); + ceph_assert(completion != NULL); + ceph_assert(old_journal != NULL); + + JournalPointer jp = jp_in; + + /* Set JournalPointer.back to the location we will write the new journal */ + inodeno_t primary_ino = MDS_INO_LOG_OFFSET + mds->get_nodeid(); + inodeno_t secondary_ino = MDS_INO_LOG_BACKUP_OFFSET + mds->get_nodeid(); + jp.back = (jp.front == primary_ino ? 
secondary_ino : primary_ino); + int write_result = jp.save(mds->objecter); + ceph_assert(write_result == 0); + + /* Create the new Journaler file */ + Journaler *new_journal = new Journaler("mdlog", jp.back, + mds->mdsmap->get_metadata_pool(), CEPH_FS_ONDISK_MAGIC, mds->objecter, logger, l_mdl_jlat, mds->finisher); + dout(4) << "Writing new journal header " << jp.back << dendl; + file_layout_t new_layout = old_journal->get_layout(); + new_journal->set_writeable(); + new_journal->create(&new_layout, g_conf()->mds_journal_format); + + /* Write the new journal header to RADOS */ + C_SaferCond write_head_wait; + new_journal->write_head(&write_head_wait); + write_head_wait.wait(); + + // Read in the old journal, and whenever we have readable events, + // write them to the new journal. + int r = 0; + + // In old format journals before event_seq was introduced, the serialized + // offset of a SubtreeMap message in the log is used as the unique ID for + // a log segment. Because we change serialization, this will end up changing + // for us, so we have to explicitly update the fields that point back to that + // log segment. + std::map<LogSegment::seq_t, LogSegment::seq_t> segment_pos_rewrite; + + // The logic in here borrowed from replay_thread expects mds_lock to be held, + // e.g. between checking readable and doing wait_for_readable so that journaler + // state doesn't change in between. 
+ uint32_t events_transcribed = 0; + while (1) { + while (!old_journal->is_readable() && + old_journal->get_read_pos() < old_journal->get_write_pos() && + !old_journal->get_error()) { + + // Issue a journal prefetch + C_SaferCond readable_waiter; + old_journal->wait_for_readable(&readable_waiter); + + // Wait for a journal prefetch to complete + readable_waiter.wait(); + } + if (old_journal->get_error()) { + r = old_journal->get_error(); + dout(0) << "_replay journaler got error " << r << ", aborting" << dendl; + break; + } + + if (!old_journal->is_readable() && + old_journal->get_read_pos() == old_journal->get_write_pos()) + break; + + // Read one serialized LogEvent + ceph_assert(old_journal->is_readable()); + bufferlist bl; + uint64_t le_pos = old_journal->get_read_pos(); + bool r = old_journal->try_read_entry(bl); + if (!r && old_journal->get_error()) + continue; + ceph_assert(r); + + // Update segment_pos_rewrite + auto le = LogEvent::decode_event(bl.cbegin()); + if (le) { + bool modified = false; + + if (le->get_type() == EVENT_SUBTREEMAP || + le->get_type() == EVENT_RESETJOURNAL) { + auto sle = dynamic_cast<ESubtreeMap*>(le.get()); + if (sle == NULL || sle->event_seq == 0) { + // A non-explicit event seq: the effective sequence number + // of this segment is it's position in the old journal and + // the new effective sequence number will be its position + // in the new journal. 
+ segment_pos_rewrite[le_pos] = new_journal->get_write_pos(); + dout(20) << __func__ << " discovered segment seq mapping " + << le_pos << " -> " << new_journal->get_write_pos() << dendl; + } + } else { + event_seq++; + } + + // Rewrite segment references if necessary + EMetaBlob *blob = le->get_metablob(); + if (blob) { + modified = blob->rewrite_truncate_finish(mds, segment_pos_rewrite); + } + + // Zero-out expire_pos in subtreemap because offsets have changed + // (expire_pos is just an optimization so it's safe to eliminate it) + if (le->get_type() == EVENT_SUBTREEMAP + || le->get_type() == EVENT_SUBTREEMAP_TEST) { + auto& sle = dynamic_cast<ESubtreeMap&>(*le); + dout(20) << __func__ << " zeroing expire_pos in subtreemap event at " + << le_pos << " seq=" << sle.event_seq << dendl; + sle.expire_pos = 0; + modified = true; + } + + if (modified) { + bl.clear(); + le->encode_with_header(bl, mds->mdsmap->get_up_features()); + } + } else { + // Failure from LogEvent::decode, our job is to change the journal wrapper, + // not validate the contents, so pass it through. + dout(1) << __func__ << " transcribing un-decodable LogEvent at old position " + << old_journal->get_read_pos() << ", new position " << new_journal->get_write_pos() + << dendl; + } + + // Write (buffered, synchronous) one serialized LogEvent + events_transcribed += 1; + new_journal->append_entry(bl); + } + + dout(1) << "Transcribed " << events_transcribed << " events, flushing new journal" << dendl; + C_SaferCond flush_waiter; + new_journal->flush(&flush_waiter); + flush_waiter.wait(); + + // If failed to rewrite journal, leave the part written journal + // as garbage to be cleaned up next startup. 
+ ceph_assert(r == 0); + + /* Now that the new journal is safe, we can flip the pointers */ + inodeno_t const tmp = jp.front; + jp.front = jp.back; + jp.back = tmp; + write_result = jp.save(mds->objecter); + ceph_assert(write_result == 0); + + /* Delete the old journal to free space */ + dout(1) << "New journal flushed, erasing old journal" << dendl; + C_SaferCond erase_waiter; + old_journal->erase(&erase_waiter); + int erase_result = erase_waiter.wait(); + ceph_assert(erase_result == 0); + { + std::lock_guard l(mds->mds_lock); + if (mds->is_daemon_stopping()) { + delete new_journal; + return; + } + ceph_assert(journaler == old_journal); + journaler = NULL; + delete old_journal; + } + + /* Update the pointer to reflect we're back in clean single journal state. */ + jp.back = 0; + write_result = jp.save(mds->objecter); + ceph_assert(write_result == 0); + + /* Reset the Journaler object to its default state */ + dout(1) << "Journal rewrite complete, continuing with normal startup" << dendl; + { + std::lock_guard l(mds->mds_lock); + if (mds->is_daemon_stopping()) { + delete new_journal; + return; + } + journaler = new_journal; + journaler->set_readonly(); + journaler->set_write_error_handler(new C_MDL_WriteError(this)); + } + + /* Trigger completion */ + { + std::lock_guard l(mds->mds_lock); + if (mds->is_daemon_stopping()) { + return; + } + completion->complete(0); + } +} + + +// i am a separate thread +void MDLog::_replay_thread() +{ + dout(10) << "_replay_thread start" << dendl; + + // loop + int r = 0; + while (1) { + // wait for read? 
+ while (!journaler->is_readable() && + journaler->get_read_pos() < journaler->get_write_pos() && + !journaler->get_error()) { + C_SaferCond readable_waiter; + journaler->wait_for_readable(&readable_waiter); + r = readable_waiter.wait(); + } + if (journaler->get_error()) { + r = journaler->get_error(); + dout(0) << "_replay journaler got error " << r << ", aborting" << dendl; + if (r == -ENOENT) { + if (mds->is_standby_replay()) { + // journal has been trimmed by somebody else + r = -EAGAIN; + } else { + mds->clog->error() << "missing journal object"; + mds->damaged_unlocked(); + ceph_abort(); // Should be unreachable because damaged() calls respawn() + } + } else if (r == -EINVAL) { + if (journaler->get_read_pos() < journaler->get_expire_pos()) { + // this should only happen if you're following somebody else + if(journaler->is_readonly()) { + dout(0) << "expire_pos is higher than read_pos, returning EAGAIN" << dendl; + r = -EAGAIN; + } else { + mds->clog->error() << "invalid journaler offsets"; + mds->damaged_unlocked(); + ceph_abort(); // Should be unreachable because damaged() calls respawn() + } + } else { + /* re-read head and check it + * Given that replay happens in a separate thread and + * the MDS is going to either shut down or restart when + * we return this error, doing it synchronously is fine + * -- as long as we drop the main mds lock--. */ + C_SaferCond reread_fin; + journaler->reread_head(&reread_fin); + int err = reread_fin.wait(); + if (err) { + if (err == -ENOENT && mds->is_standby_replay()) { + r = -EAGAIN; + dout(1) << "Journal header went away while in standby replay, journal rewritten?" 
+ << dendl; + break; + } else { + dout(0) << "got error while reading head: " << cpp_strerror(err) + << dendl; + + mds->clog->error() << "error reading journal header"; + mds->damaged_unlocked(); + ceph_abort(); // Should be unreachable because damaged() calls + // respawn() + } + } + standby_trim_segments(); + if (journaler->get_read_pos() < journaler->get_expire_pos()) { + dout(0) << "expire_pos is higher than read_pos, returning EAGAIN" << dendl; + r = -EAGAIN; + } + } + } + break; + } + + if (!journaler->is_readable() && + journaler->get_read_pos() == journaler->get_write_pos()) + break; + + ceph_assert(journaler->is_readable() || mds->is_daemon_stopping()); + + // read it + uint64_t pos = journaler->get_read_pos(); + bufferlist bl; + bool r = journaler->try_read_entry(bl); + if (!r && journaler->get_error()) + continue; + ceph_assert(r); + + // unpack event + auto le = LogEvent::decode_event(bl.cbegin()); + if (!le) { + dout(0) << "_replay " << pos << "~" << bl.length() << " / " << journaler->get_write_pos() + << " -- unable to decode event" << dendl; + dout(0) << "dump of unknown or corrupt event:\n"; + bl.hexdump(*_dout); + *_dout << dendl; + + mds->clog->error() << "corrupt journal event at " << pos << "~" + << bl.length() << " / " + << journaler->get_write_pos(); + if (g_conf()->mds_log_skip_corrupt_events) { + continue; + } else { + mds->damaged_unlocked(); + ceph_abort(); // Should be unreachable because damaged() calls + // respawn() + } + + } + le->set_start_off(pos); + + // new segment? + if (le->get_type() == EVENT_SUBTREEMAP || + le->get_type() == EVENT_RESETJOURNAL) { + auto sle = dynamic_cast<ESubtreeMap*>(le.get()); + if (sle && sle->event_seq > 0) + event_seq = sle->event_seq; + else + event_seq = pos; + segments[event_seq] = new LogSegment(event_seq, pos); + logger->set(l_mdl_seg, segments.size()); + } else { + event_seq++; + } + + // have we seen an import map yet? 
+ if (segments.empty()) { + dout(10) << "_replay " << pos << "~" << bl.length() << " / " << journaler->get_write_pos() + << " " << le->get_stamp() << " -- waiting for subtree_map. (skipping " << *le << ")" << dendl; + } else { + dout(10) << "_replay " << pos << "~" << bl.length() << " / " << journaler->get_write_pos() + << " " << le->get_stamp() << ": " << *le << dendl; + le->_segment = get_current_segment(); // replay may need this + le->_segment->num_events++; + le->_segment->end = journaler->get_read_pos(); + num_events++; + + { + std::lock_guard l(mds->mds_lock); + if (mds->is_daemon_stopping()) { + return; + } + logger->inc(l_mdl_replayed); + le->replay(mds); + } + } + + logger->set(l_mdl_rdpos, pos); + } + + // done! + if (r == 0) { + ceph_assert(journaler->get_read_pos() == journaler->get_write_pos()); + dout(10) << "_replay - complete, " << num_events + << " events" << dendl; + + logger->set(l_mdl_expos, journaler->get_expire_pos()); + } + + safe_pos = journaler->get_write_safe_pos(); + + dout(10) << "_replay_thread kicking waiters" << dendl; + { + std::lock_guard l(mds->mds_lock); + if (mds->is_daemon_stopping()) { + return; + } + pre_segments_size = segments.size(); // get num of logs when replay is finished + finish_contexts(g_ceph_context, waitfor_replay, r); + } + + dout(10) << "_replay_thread finish" << dendl; +} + +void MDLog::standby_trim_segments() +{ + dout(10) << "standby_trim_segments" << dendl; + uint64_t expire_pos = journaler->get_expire_pos(); + dout(10) << " expire_pos=" << expire_pos << dendl; + + mds->mdcache->open_file_table.trim_destroyed_inos(expire_pos); + + bool removed_segment = false; + while (have_any_segments()) { + LogSegment *seg = get_oldest_segment(); + dout(10) << " segment seq=" << seg->seq << " " << seg->offset << + "~" << seg->end - seg->offset << dendl; + + if (seg->end > expire_pos) { + dout(10) << " won't remove, not expired!" 
<< dendl; + break; + } + + if (segments.size() == 1) { + dout(10) << " won't remove, last segment!" << dendl; + break; + } + + dout(10) << " removing segment" << dendl; + mds->mdcache->standby_trim_segment(seg); + remove_oldest_segment(); + removed_segment = true; + } + + if (removed_segment) { + dout(20) << " calling mdcache->trim!" << dendl; + mds->mdcache->trim(); + } else { + dout(20) << " removed no segments!" << dendl; + } +} + +void MDLog::dump_replay_status(Formatter *f) const +{ + f->open_object_section("replay_status"); + f->dump_unsigned("journal_read_pos", journaler ? journaler->get_read_pos() : 0); + f->dump_unsigned("journal_write_pos", journaler ? journaler->get_write_pos() : 0); + f->dump_unsigned("journal_expire_pos", journaler ? journaler->get_expire_pos() : 0); + f->dump_unsigned("num_events", get_num_events()); + f->dump_unsigned("num_segments", get_num_segments()); + f->close_section(); +} diff --git a/src/mds/MDLog.h b/src/mds/MDLog.h new file mode 100644 index 00000000..ea74180b --- /dev/null +++ b/src/mds/MDLog.h @@ -0,0 +1,337 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#ifndef CEPH_MDLOG_H +#define CEPH_MDLOG_H + +enum { + l_mdl_first = 5000, + l_mdl_evadd, + l_mdl_evex, + l_mdl_evtrm, + l_mdl_ev, + l_mdl_evexg, + l_mdl_evexd, + l_mdl_segadd, + l_mdl_segex, + l_mdl_segtrm, + l_mdl_seg, + l_mdl_segexg, + l_mdl_segexd, + l_mdl_expos, + l_mdl_wrpos, + l_mdl_rdpos, + l_mdl_jlat, + l_mdl_replayed, + l_mdl_last, +}; + +#include "include/types.h" +#include "include/Context.h" + +#include "MDSContext.h" +#include "common/Thread.h" +#include "common/Cond.h" + +#include "LogSegment.h" + +#include <list> + +class Journaler; +class JournalPointer; +class LogEvent; +class MDSRank; +class LogSegment; +class ESubtreeMap; + +class PerfCounters; + +#include <map> +using std::map; + +#include "common/Finisher.h" + + +class MDLog { +public: + MDSRank *mds; +protected: + int num_events; // in events + + int unflushed; + + bool capped; + + // Log position which is persistent *and* for which + // submit_entry wait_for_safe callbacks have already + // been called. 
+ uint64_t safe_pos; + + inodeno_t ino; + Journaler *journaler; + + PerfCounters *logger; + + + // -- replay -- + class ReplayThread : public Thread { + MDLog *log; + public: + explicit ReplayThread(MDLog *l) : log(l) {} + void* entry() override { + log->_replay_thread(); + return 0; + } + } replay_thread; + bool already_replayed; + + friend class ReplayThread; + friend class C_MDL_Replay; + + MDSContext::vec waitfor_replay; + + void _replay(); // old way + void _replay_thread(); // new way + + // Journal recovery/rewrite logic + class RecoveryThread : public Thread { + MDLog *log; + MDSContext *completion; + public: + void set_completion(MDSContext *c) {completion = c;} + explicit RecoveryThread(MDLog *l) : log(l), completion(NULL) {} + void* entry() override { + log->_recovery_thread(completion); + return 0; + } + } recovery_thread; + void _recovery_thread(MDSContext *completion); + void _reformat_journal(JournalPointer const &jp, Journaler *old_journal, MDSContext *completion); + + // -- segments -- + map<uint64_t,LogSegment*> segments; + set<LogSegment*> expiring_segments; + set<LogSegment*> expired_segments; + std::size_t pre_segments_size = 0; // the num of segments when the mds finished replay-journal, to calc the num of segments growing + uint64_t event_seq; + int expiring_events; + int expired_events; + + struct PendingEvent { + LogEvent *le; + MDSContext *fin; + bool flush; + PendingEvent(LogEvent *e, MDSContext *c, bool f=false) : le(e), fin(c), flush(f) {} + }; + + int64_t mdsmap_up_features; + map<uint64_t,list<PendingEvent> > pending_events; // log segment -> event list + Mutex submit_mutex; + Cond submit_cond; + + void set_safe_pos(uint64_t pos) + { + std::lock_guard l(submit_mutex); + ceph_assert(pos >= safe_pos); + safe_pos = pos; + } + friend class MDSLogContextBase; + + void _submit_thread(); + class SubmitThread : public Thread { + MDLog *log; + public: + explicit SubmitThread(MDLog *l) : log(l) {} + void* entry() override { + 
log->_submit_thread(); + return 0; + } + } submit_thread; + friend class SubmitThread; + +public: + const std::set<LogSegment*> &get_expiring_segments() const + { + return expiring_segments; + } +protected: + + // -- subtreemaps -- + friend class ESubtreeMap; + friend class MDCache; + + uint64_t get_last_segment_seq() const { + ceph_assert(!segments.empty()); + return segments.rbegin()->first; + } + LogSegment *get_oldest_segment() { + return segments.begin()->second; + } + void remove_oldest_segment() { + map<uint64_t, LogSegment*>::iterator p = segments.begin(); + delete p->second; + segments.erase(p); + } + +public: + void create_logger(); + + // replay state + map<inodeno_t, set<inodeno_t> > pending_exports; + + void set_write_iohint(unsigned iohint_flags); + +public: + explicit MDLog(MDSRank *m) : mds(m), + num_events(0), + unflushed(0), + capped(false), + safe_pos(0), + journaler(0), + logger(0), + replay_thread(this), + already_replayed(false), + recovery_thread(this), + event_seq(0), expiring_events(0), expired_events(0), + mdsmap_up_features(0), + submit_mutex("MDLog::submit_mutex"), + submit_thread(this), + cur_event(NULL) { } + ~MDLog(); + + +private: + // -- segments -- + void _start_new_segment(); + void _prepare_new_segment(); + void _journal_segment_subtree_map(MDSContext *onsync); +public: + void start_new_segment() { + std::lock_guard l(submit_mutex); + _start_new_segment(); + } + void prepare_new_segment() { + std::lock_guard l(submit_mutex); + _prepare_new_segment(); + } + void journal_segment_subtree_map(MDSContext *onsync=NULL) { + submit_mutex.Lock(); + _journal_segment_subtree_map(onsync); + submit_mutex.Unlock(); + if (onsync) + flush(); + } + + LogSegment *peek_current_segment() { + return segments.empty() ? 
NULL : segments.rbegin()->second; + } + + LogSegment *get_current_segment() { + ceph_assert(!segments.empty()); + return segments.rbegin()->second; + } + + LogSegment *get_segment(LogSegment::seq_t seq) { + if (segments.count(seq)) + return segments[seq]; + return NULL; + } + + bool have_any_segments() const { + return !segments.empty(); + } + + void flush_logger(); + + size_t get_num_events() const { return num_events; } + size_t get_num_segments() const { return segments.size(); } + + uint64_t get_read_pos() const; + uint64_t get_write_pos() const; + uint64_t get_safe_pos() const; + Journaler *get_journaler() { return journaler; } + bool empty() const { return segments.empty(); } + + bool is_capped() const { return capped; } + void cap(); + + void kick_submitter(); + void shutdown(); + + // -- events -- +private: + LogEvent *cur_event; +public: + void _start_entry(LogEvent *e); + void start_entry(LogEvent *e) { + std::lock_guard l(submit_mutex); + _start_entry(e); + } + void cancel_entry(LogEvent *e); + void _submit_entry(LogEvent *e, MDSLogContextBase *c); + void submit_entry(LogEvent *e, MDSLogContextBase *c = 0) { + std::lock_guard l(submit_mutex); + _submit_entry(e, c); + submit_cond.Signal(); + } + void start_submit_entry(LogEvent *e, MDSLogContextBase *c = 0) { + std::lock_guard l(submit_mutex); + _start_entry(e); + _submit_entry(e, c); + submit_cond.Signal(); + } + bool entry_is_open() const { return cur_event != NULL; } + + void wait_for_safe( MDSContext *c ); + void flush(); + bool is_flushed() const { + return unflushed == 0; + } + +private: + void try_expire(LogSegment *ls, int op_prio); + void _maybe_expired(LogSegment *ls, int op_prio); + void _expired(LogSegment *ls); + void _trim_expired_segments(); + + friend class C_MaybeExpiredSegment; + friend class C_MDL_Flushed; + friend class C_OFT_Committed; + +public: + void trim_expired_segments(); + void trim(int max=-1); + int trim_all(); + bool expiry_done() const + { + return expiring_segments.empty() 
&& expired_segments.empty(); + }; + +private: + void write_head(MDSContext *onfinish); + +public: + void create(MDSContext *onfinish); // fresh, empty log! + void open(MDSContext *onopen); // append() or replay() to follow! + void reopen(MDSContext *onopen); + void append(); + void replay(MDSContext *onfinish); + + void standby_trim_segments(); + + void dump_replay_status(Formatter *f) const; +}; + +#endif diff --git a/src/mds/MDSAuthCaps.cc b/src/mds/MDSAuthCaps.cc new file mode 100644 index 00000000..949ac62c --- /dev/null +++ b/src/mds/MDSAuthCaps.cc @@ -0,0 +1,434 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2014 Red Hat + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include <string_view> + +#include <errno.h> +#include <fcntl.h> + +#include <boost/spirit/include/qi.hpp> +#include <boost/spirit/include/phoenix_operator.hpp> +#include <boost/spirit/include/phoenix.hpp> + +#include "common/debug.h" +#include "MDSAuthCaps.h" +#include "include/ipaddr.h" + +#define dout_subsys ceph_subsys_mds + +#undef dout_prefix +#define dout_prefix *_dout << "MDSAuthCap " + +using std::ostream; +using std::string; +namespace qi = boost::spirit::qi; +namespace ascii = boost::spirit::ascii; +namespace phoenix = boost::phoenix; + +template <typename Iterator> +struct MDSCapParser : qi::grammar<Iterator, MDSAuthCaps()> +{ + MDSCapParser() : MDSCapParser::base_type(mdscaps) + { + using qi::char_; + using qi::int_; + using qi::uint_; + using qi::lexeme; + using qi::alnum; + using qi::_val; + using qi::_1; + using qi::_2; + using qi::_3; + using qi::eps; + using qi::lit; + + spaces = +(lit(' ') | lit('\n') | lit('\t')); + + quoted_path %= + lexeme[lit("\"") >> *(char_ - '"') >> 
'"'] | + lexeme[lit("'") >> *(char_ - '\'') >> '\'']; + unquoted_path %= +char_("a-zA-Z0-9_./-"); + network_str %= +char_("/.:a-fA-F0-9]["); + + // match := [path=<path>] [uid=<uid> [gids=<gid>[,<gid>...]] + path %= (spaces >> lit("path") >> lit('=') >> (quoted_path | unquoted_path)); + uid %= (spaces >> lit("uid") >> lit('=') >> uint_); + uintlist %= (uint_ % lit(',')); + gidlist %= -(spaces >> lit("gids") >> lit('=') >> uintlist); + match = -( + (uid >> gidlist)[_val = phoenix::construct<MDSCapMatch>(_1, _2)] | + (path >> uid >> gidlist)[_val = phoenix::construct<MDSCapMatch>(_1, _2, _3)] | + (path)[_val = phoenix::construct<MDSCapMatch>(_1)]); + + // capspec = * | r[w][p][s] + capspec = spaces >> ( + lit("*")[_val = MDSCapSpec(MDSCapSpec::ALL)] + | + lit("all")[_val = MDSCapSpec(MDSCapSpec::ALL)] + | + (lit("rwps"))[_val = MDSCapSpec(MDSCapSpec::RWPS)] + | + (lit("rwp"))[_val = MDSCapSpec(MDSCapSpec::RWP)] + | + (lit("rws"))[_val = MDSCapSpec(MDSCapSpec::RWS)] + | + (lit("rw"))[_val = MDSCapSpec(MDSCapSpec::RW)] + | + (lit("r"))[_val = MDSCapSpec(MDSCapSpec::READ)] + ); + + grant = lit("allow") >> (capspec >> match >> + -(spaces >> lit("network") >> spaces >> network_str)) + [_val = phoenix::construct<MDSCapGrant>(_1, _2, _3)]; + grants %= (grant % (*lit(' ') >> (lit(';') | lit(',')) >> *lit(' '))); + mdscaps = grants [_val = phoenix::construct<MDSAuthCaps>(_1)]; + } + qi::rule<Iterator> spaces; + qi::rule<Iterator, string()> quoted_path, unquoted_path, network_str; + qi::rule<Iterator, MDSCapSpec()> capspec; + qi::rule<Iterator, string()> path; + qi::rule<Iterator, uint32_t()> uid; + qi::rule<Iterator, std::vector<uint32_t>() > uintlist; + qi::rule<Iterator, std::vector<uint32_t>() > gidlist; + qi::rule<Iterator, MDSCapMatch()> match; + qi::rule<Iterator, MDSCapGrant()> grant; + qi::rule<Iterator, std::vector<MDSCapGrant>()> grants; + qi::rule<Iterator, MDSAuthCaps()> mdscaps; +}; + +void MDSCapMatch::normalize_path() +{ + // drop any leading / + while 
(path.length() && path[0] == '/') { + path = path.substr(1); + } + + // drop dup // + // drop . + // drop .. +} + +bool MDSCapMatch::match(std::string_view target_path, + const int caller_uid, + const int caller_gid, + const vector<uint64_t> *caller_gid_list) const +{ + if (uid != MDS_AUTH_UID_ANY) { + if (uid != caller_uid) + return false; + if (!gids.empty()) { + bool gid_matched = false; + if (std::find(gids.begin(), gids.end(), caller_gid) != gids.end()) + gid_matched = true; + if (caller_gid_list) { + for (auto i = caller_gid_list->begin(); i != caller_gid_list->end(); ++i) { + if (std::find(gids.begin(), gids.end(), *i) != gids.end()) { + gid_matched = true; + break; + } + } + } + if (!gid_matched) + return false; + } + } + + if (!match_path(target_path)) { + return false; + } + + return true; +} + +bool MDSCapMatch::match_path(std::string_view target_path) const +{ + if (path.length()) { + if (target_path.find(path) != 0) + return false; + // if path doesn't already have a trailing /, make sure the target + // does so that path=/foo doesn't match target_path=/food + if (target_path.length() > path.length() && + path[path.length()-1] != '/' && + target_path[path.length()] != '/') + return false; + } + + return true; +} + +void MDSCapGrant::parse_network() +{ + network_valid = ::parse_network(network.c_str(), &network_parsed, + &network_prefix); +} + +/** + * Is the client *potentially* able to access this path? Actual + * permission will depend on uids/modes in the full is_capable. + */ +bool MDSAuthCaps::path_capable(std::string_view inode_path) const +{ + for (const auto &i : grants) { + if (i.match.match_path(inode_path)) { + return true; + } + } + + return false; +} + +/** + * For a given filesystem path, query whether this capability carries` + * authorization to read or write. + * + * This is true if any of the 'grant' clauses in the capability match the + * requested path + op. 
+ */ +bool MDSAuthCaps::is_capable(std::string_view inode_path, + uid_t inode_uid, gid_t inode_gid, + unsigned inode_mode, + uid_t caller_uid, gid_t caller_gid, + const vector<uint64_t> *caller_gid_list, + unsigned mask, + uid_t new_uid, gid_t new_gid, + const entity_addr_t& addr) const +{ + if (cct) + ldout(cct, 10) << __func__ << " inode(path /" << inode_path + << " owner " << inode_uid << ":" << inode_gid + << " mode 0" << std::oct << inode_mode << std::dec + << ") by caller " << caller_uid << ":" << caller_gid +// << "[" << caller_gid_list << "]"; + << " mask " << mask + << " new " << new_uid << ":" << new_gid + << " cap: " << *this << dendl; + + for (std::vector<MDSCapGrant>::const_iterator i = grants.begin(); + i != grants.end(); + ++i) { + if (i->network.size() && + (!i->network_valid || + !network_contains(i->network_parsed, + i->network_prefix, + addr))) { + continue; + } + + if (i->match.match(inode_path, caller_uid, caller_gid, caller_gid_list) && + i->spec.allows(mask & (MAY_READ|MAY_EXECUTE), mask & MAY_WRITE)) { + // we have a match; narrow down GIDs to those specifically allowed here + vector<uint64_t> gids; + if (std::find(i->match.gids.begin(), i->match.gids.end(), caller_gid) != + i->match.gids.end()) { + gids.push_back(caller_gid); + } + if (caller_gid_list) { + std::set_intersection(i->match.gids.begin(), i->match.gids.end(), + caller_gid_list->begin(), caller_gid_list->end(), + std::back_inserter(gids)); + std::sort(gids.begin(), gids.end()); + } + + + // Spec is non-allowing if caller asked for set pool but spec forbids it + if (mask & MAY_SET_VXATTR) { + if (!i->spec.allow_set_vxattr()) { + continue; + } + } + + if (mask & MAY_SNAPSHOT) { + if (!i->spec.allow_snapshot()) { + continue; + } + } + + // check unix permissions? 
+ if (i->match.uid == MDSCapMatch::MDS_AUTH_UID_ANY) { + return true; + } + + // chown/chgrp + if (mask & MAY_CHOWN) { + if (new_uid != caller_uid || // you can't chown to someone else + inode_uid != caller_uid) { // you can't chown from someone else + continue; + } + } + if (mask & MAY_CHGRP) { + // you can only chgrp *to* one of your groups... if you own the file. + if (inode_uid != caller_uid || + std::find(gids.begin(), gids.end(), new_gid) == + gids.end()) { + continue; + } + } + + if (inode_uid == caller_uid) { + if ((!(mask & MAY_READ) || (inode_mode & S_IRUSR)) && + (!(mask & MAY_WRITE) || (inode_mode & S_IWUSR)) && + (!(mask & MAY_EXECUTE) || (inode_mode & S_IXUSR))) { + return true; + } + } else if (std::find(gids.begin(), gids.end(), + inode_gid) != gids.end()) { + if ((!(mask & MAY_READ) || (inode_mode & S_IRGRP)) && + (!(mask & MAY_WRITE) || (inode_mode & S_IWGRP)) && + (!(mask & MAY_EXECUTE) || (inode_mode & S_IXGRP))) { + return true; + } + } else { + if ((!(mask & MAY_READ) || (inode_mode & S_IROTH)) && + (!(mask & MAY_WRITE) || (inode_mode & S_IWOTH)) && + (!(mask & MAY_EXECUTE) || (inode_mode & S_IXOTH))) { + return true; + } + } + } + } + + return false; +} + +void MDSAuthCaps::set_allow_all() +{ + grants.clear(); + grants.push_back(MDSCapGrant(MDSCapSpec(MDSCapSpec::ALL), MDSCapMatch(), + {})); +} + +bool MDSAuthCaps::parse(CephContext *c, std::string_view str, ostream *err) +{ + // Special case for legacy caps + if (str == "allow") { + grants.clear(); + grants.push_back(MDSCapGrant(MDSCapSpec(MDSCapSpec::RWPS), MDSCapMatch(), + {})); + return true; + } + + auto iter = str.begin(); + auto end = str.end(); + MDSCapParser<decltype(iter)> g; + + bool r = qi::phrase_parse(iter, end, g, ascii::space, *this); + cct = c; // set after parser self-assignment + if (r && iter == end) { + for (auto& grant : grants) { + std::sort(grant.match.gids.begin(), grant.match.gids.end()); + grant.parse_network(); + } + return true; + } else { + // Make sure no grants 
are kept after parsing failed! + grants.clear(); + + if (err) + *err << "mds capability parse failed, stopped at '" + << std::string(iter, end) + << "' of '" << str << "'"; + return false; + } +} + + +bool MDSAuthCaps::allow_all() const +{ + for (std::vector<MDSCapGrant>::const_iterator i = grants.begin(); i != grants.end(); ++i) { + if (i->match.is_match_all() && i->spec.allow_all()) { + return true; + } + } + + return false; +} + + +ostream &operator<<(ostream &out, const MDSCapMatch &match) +{ + if (match.path.length()) { + out << "path=\"/" << match.path << "\""; + if (match.uid != MDSCapMatch::MDS_AUTH_UID_ANY) { + out << " "; + } + } + if (match.uid != MDSCapMatch::MDS_AUTH_UID_ANY) { + out << "uid=" << match.uid; + if (!match.gids.empty()) { + out << " gids="; + for (std::vector<gid_t>::const_iterator p = match.gids.begin(); + p != match.gids.end(); + ++p) { + if (p != match.gids.begin()) + out << ','; + out << *p; + } + } + } + + return out; +} + + +ostream &operator<<(ostream &out, const MDSCapSpec &spec) +{ + if (spec.allow_all()) { + out << "*"; + } else { + if (spec.allow_read()) { + out << "r"; + } + if (spec.allow_write()) { + out << "w"; + } + if (spec.allow_set_vxattr()) { + out << "p"; + } + if (spec.allow_snapshot()) { + out << "s"; + } + } + + return out; +} + + +ostream &operator<<(ostream &out, const MDSCapGrant &grant) +{ + out << "allow "; + out << grant.spec; + if (!grant.match.is_match_all()) { + out << " " << grant.match; + } + if (grant.network.size()) { + out << " network " << grant.network; + } + return out; +} + + +ostream &operator<<(ostream &out, const MDSAuthCaps &cap) +{ + out << "MDSAuthCaps["; + for (size_t i = 0; i < cap.grants.size(); ++i) { + out << cap.grants[i]; + if (i < cap.grants.size() - 1) { + out << ", "; + } + } + out << "]"; + + return out; +} + diff --git a/src/mds/MDSAuthCaps.h b/src/mds/MDSAuthCaps.h new file mode 100644 index 00000000..cc1006cd --- /dev/null +++ b/src/mds/MDSAuthCaps.h @@ -0,0 +1,191 @@ +// -*- 
mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2014 Red Hat + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#ifndef MDS_AUTH_CAPS_H +#define MDS_AUTH_CAPS_H + +#include <sstream> +#include <string> +#include <string_view> +#include <vector> + +#include "include/types.h" +#include "common/debug.h" + +// unix-style capabilities +enum { + MAY_READ = (1 << 0), + MAY_WRITE = (1 << 1), + MAY_EXECUTE = (1 << 2), + MAY_CHOWN = (1 << 4), + MAY_CHGRP = (1 << 5), + MAY_SET_VXATTR = (1 << 6), + MAY_SNAPSHOT = (1 << 7), +}; + +class CephContext; + +// what we can do +struct MDSCapSpec { + static const unsigned ALL = (1 << 0); + static const unsigned READ = (1 << 1); + static const unsigned WRITE = (1 << 2); + // if the capability permits setting vxattrs (layout, quota, etc) + static const unsigned SET_VXATTR = (1 << 3); + // if the capability permits mksnap/rmsnap + static const unsigned SNAPSHOT = (1 << 4); + + static const unsigned RW = (READ|WRITE); + static const unsigned RWP = (READ|WRITE|SET_VXATTR); + static const unsigned RWS = (READ|WRITE|SNAPSHOT); + static const unsigned RWPS = (READ|WRITE|SET_VXATTR|SNAPSHOT); + + MDSCapSpec() = default; + MDSCapSpec(unsigned _caps) : caps(_caps) { + if (caps & ALL) + caps |= RWPS; + } + + bool allow_all() const { + return (caps & ALL); + } + bool allow_read() const { + return (caps & READ); + } + bool allow_write() const { + return (caps & WRITE); + } + + bool allows(bool r, bool w) const { + if (allow_all()) + return true; + if (r && !allow_read()) + return false; + if (w && !allow_write()) + return false; + return true; + } + + bool allow_snapshot() const { + return (caps & SNAPSHOT); + } + bool allow_set_vxattr() const { + return (caps 
& SET_VXATTR); + } +private: + unsigned caps = 0; +}; + +// conditions before we are allowed to do it +struct MDSCapMatch { + static const int64_t MDS_AUTH_UID_ANY = -1; + + int64_t uid; // Require UID to be equal to this, if !=MDS_AUTH_UID_ANY + std::vector<gid_t> gids; // Use these GIDs + std::string path; // Require path to be child of this (may be "" or "/" for any) + + MDSCapMatch() : uid(MDS_AUTH_UID_ANY) {} + MDSCapMatch(int64_t uid_, std::vector<gid_t>& gids_) : uid(uid_), gids(gids_) {} + explicit MDSCapMatch(const std::string &path_) + : uid(MDS_AUTH_UID_ANY), path(path_) { + normalize_path(); + } + MDSCapMatch(const std::string& path_, int64_t uid_, std::vector<gid_t>& gids_) + : uid(uid_), gids(gids_), path(path_) { + normalize_path(); + } + + void normalize_path(); + + bool is_match_all() const + { + return uid == MDS_AUTH_UID_ANY && path == ""; + } + + // check whether this grant matches against a given file and caller uid:gid + bool match(std::string_view target_path, + const int caller_uid, + const int caller_gid, + const vector<uint64_t> *caller_gid_list) const; + + /** + * Check whether this path *might* be accessible (actual permission + * depends on the stronger check in match()). 
+ * + * @param target_path filesystem path without leading '/' + */ + bool match_path(std::string_view target_path) const; +}; + +struct MDSCapGrant { + MDSCapSpec spec; + MDSCapMatch match; + + std::string network; + + entity_addr_t network_parsed; + unsigned network_prefix = 0; + bool network_valid = true; + + MDSCapGrant(const MDSCapSpec &spec_, const MDSCapMatch &match_, + boost::optional<std::string> n) + : spec(spec_), match(match_) { + if (n) { + network = *n; + parse_network(); + } + } + MDSCapGrant() {} + + void parse_network(); +}; + +class MDSAuthCaps +{ + CephContext *cct = nullptr; + std::vector<MDSCapGrant> grants; + +public: + MDSAuthCaps() = default; + explicit MDSAuthCaps(CephContext *cct_) : cct(cct_) {} + + // this ctor is used by spirit/phoenix; doesn't need cct. + explicit MDSAuthCaps(const std::vector<MDSCapGrant>& grants_) : grants(grants_) {} + + void clear() { + grants.clear(); + } + + void set_allow_all(); + bool parse(CephContext *cct, std::string_view str, std::ostream *err); + + bool allow_all() const; + bool is_capable(std::string_view inode_path, + uid_t inode_uid, gid_t inode_gid, unsigned inode_mode, + uid_t uid, gid_t gid, const vector<uint64_t> *caller_gid_list, + unsigned mask, uid_t new_uid, gid_t new_gid, + const entity_addr_t& addr) const; + bool path_capable(std::string_view inode_path) const; + + friend std::ostream &operator<<(std::ostream &out, const MDSAuthCaps &cap); +}; + + +std::ostream &operator<<(std::ostream &out, const MDSCapMatch &match); +std::ostream &operator<<(std::ostream &out, const MDSCapSpec &spec); +std::ostream &operator<<(std::ostream &out, const MDSCapGrant &grant); +std::ostream &operator<<(std::ostream &out, const MDSAuthCaps &cap); + +#endif // MDS_AUTH_CAPS_H diff --git a/src/mds/MDSCacheObject.cc b/src/mds/MDSCacheObject.cc new file mode 100644 index 00000000..3ad8190b --- /dev/null +++ b/src/mds/MDSCacheObject.cc @@ -0,0 +1,71 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t 
-*- +// vim: ts=8 sw=2 smarttab + +#include "MDSCacheObject.h" +#include "MDSContext.h" +#include "common/Formatter.h" + +uint64_t MDSCacheObject::last_wait_seq = 0; + +void MDSCacheObject::finish_waiting(uint64_t mask, int result) { + MDSContext::vec finished; + take_waiting(mask, finished); + finish_contexts(g_ceph_context, finished, result); +} + +void MDSCacheObject::dump(Formatter *f) const +{ + f->dump_bool("is_auth", is_auth()); + + // Fields only meaningful for auth + f->open_object_section("auth_state"); + { + f->open_object_section("replicas"); + for (const auto &it : get_replicas()) { + std::ostringstream rank_str; + rank_str << it.first; + f->dump_int(rank_str.str().c_str(), it.second); + } + f->close_section(); + } + f->close_section(); // auth_state + + // Fields only meaningful for replica + f->open_object_section("replica_state"); + { + f->open_array_section("authority"); + f->dump_int("first", authority().first); + f->dump_int("second", authority().second); + f->close_section(); + f->dump_unsigned("replica_nonce", get_replica_nonce()); + } + f->close_section(); // replica_state + + f->dump_int("auth_pins", auth_pins); + f->dump_bool("is_frozen", is_frozen()); + f->dump_bool("is_freezing", is_freezing()); + +#ifdef MDS_REF_SET + f->open_object_section("pins"); + for(const auto& p : ref_map) { + f->dump_int(pin_name(p.first).data(), p.second); + } + f->close_section(); +#endif + f->dump_int("nref", ref); +} + +/* + * Use this in subclasses when printing their specialized + * states too. 
+ */ +void MDSCacheObject::dump_states(Formatter *f) const +{ + if (state_test(STATE_AUTH)) f->dump_string("state", "auth"); + if (state_test(STATE_DIRTY)) f->dump_string("state", "dirty"); + if (state_test(STATE_NOTIFYREF)) f->dump_string("state", "notifyref"); + if (state_test(STATE_REJOINING)) f->dump_string("state", "rejoining"); + if (state_test(STATE_REJOINUNDEF)) + f->dump_string("state", "rejoinundef"); +} + diff --git a/src/mds/MDSCacheObject.h b/src/mds/MDSCacheObject.h new file mode 100644 index 00000000..e17089bb --- /dev/null +++ b/src/mds/MDSCacheObject.h @@ -0,0 +1,415 @@ +#ifndef CEPH_MDSCACHEOBJECT_H +#define CEPH_MDSCACHEOBJECT_H + +#include <ostream> +#include <string_view> + +#include "common/config.h" + +#include "include/Context.h" +#include "include/ceph_assert.h" +#include "include/mempool.h" +#include "include/types.h" +#include "include/xlist.h" + +#include "mdstypes.h" +#include "MDSContext.h" + +#define MDS_REF_SET // define me for improved debug output, sanity checking +//#define MDS_AUTHPIN_SET // define me for debugging auth pin leaks +//#define MDS_VERIFY_FRAGSTAT // do (slow) sanity checking on frags + + +class MLock; +class SimpleLock; +class MDSCacheObject; +class MDSContext; + +/* + * for metadata leases to clients + */ +struct ClientLease { + client_t client; + MDSCacheObject *parent; + + ceph_seq_t seq = 0; + utime_t ttl; + xlist<ClientLease*>::item item_session_lease; // per-session list + xlist<ClientLease*>::item item_lease; // global list + + ClientLease(client_t c, MDSCacheObject *p) : + client(c), parent(p), + item_session_lease(this), + item_lease(this) { } + ClientLease() = delete; +}; + + +// print hack +struct mdsco_db_line_prefix { + MDSCacheObject *object; + explicit mdsco_db_line_prefix(MDSCacheObject *o) : object(o) {} +}; +std::ostream& operator<<(std::ostream& out, const mdsco_db_line_prefix& o); + +// printer +std::ostream& operator<<(std::ostream& out, const MDSCacheObject &o); + +class MDSCacheObject { + 
public: + // -- pins -- + const static int PIN_REPLICATED = 1000; + const static int PIN_DIRTY = 1001; + const static int PIN_LOCK = -1002; + const static int PIN_REQUEST = -1003; + const static int PIN_WAITER = 1004; + const static int PIN_DIRTYSCATTERED = -1005; + static const int PIN_AUTHPIN = 1006; + static const int PIN_PTRWAITER = -1007; + const static int PIN_TEMPEXPORTING = 1008; // temp pin between encode_ and finish_export + static const int PIN_CLIENTLEASE = 1009; + static const int PIN_DISCOVERBASE = 1010; + + std::string_view generic_pin_name(int p) const { + switch (p) { + case PIN_REPLICATED: return "replicated"; + case PIN_DIRTY: return "dirty"; + case PIN_LOCK: return "lock"; + case PIN_REQUEST: return "request"; + case PIN_WAITER: return "waiter"; + case PIN_DIRTYSCATTERED: return "dirtyscattered"; + case PIN_AUTHPIN: return "authpin"; + case PIN_PTRWAITER: return "ptrwaiter"; + case PIN_TEMPEXPORTING: return "tempexporting"; + case PIN_CLIENTLEASE: return "clientlease"; + case PIN_DISCOVERBASE: return "discoverbase"; + default: ceph_abort(); return std::string_view(); + } + } + + // -- state -- + const static int STATE_AUTH = (1<<30); + const static int STATE_DIRTY = (1<<29); + const static int STATE_NOTIFYREF = (1<<28); // notify dropping ref drop through _put() + const static int STATE_REJOINING = (1<<27); // replica has not joined w/ primary copy + const static int STATE_REJOINUNDEF = (1<<26); // contents undefined. 
+ + + // -- wait -- + const static uint64_t WAIT_ORDERED = (1ull<<61); + const static uint64_t WAIT_SINGLEAUTH = (1ull<<60); + const static uint64_t WAIT_UNFREEZE = (1ull<<59); // pka AUTHPINNABLE + + + // ============================================ + // cons + public: + MDSCacheObject() {} + virtual ~MDSCacheObject() {} + + // printing + virtual void print(std::ostream& out) = 0; + virtual std::ostream& print_db_line_prefix(std::ostream& out) { + return out << "mdscacheobject(" << this << ") "; + } + + // -------------------------------------------- + // state + protected: + __u32 state = 0; // state bits + + public: + unsigned get_state() const { return state; } + unsigned state_test(unsigned mask) const { return (state & mask); } + void state_clear(unsigned mask) { state &= ~mask; } + void state_set(unsigned mask) { state |= mask; } + void state_reset(unsigned s) { state = s; } + + bool is_auth() const { return state_test(STATE_AUTH); } + bool is_dirty() const { return state_test(STATE_DIRTY); } + bool is_clean() const { return !is_dirty(); } + bool is_rejoining() const { return state_test(STATE_REJOINING); } + + // -------------------------------------------- + // authority + virtual mds_authority_t authority() const = 0; + bool is_ambiguous_auth() const { + return authority().second != CDIR_AUTH_UNKNOWN; + } + + // -------------------------------------------- + // pins +protected: + __s32 ref = 0; // reference count +#ifdef MDS_REF_SET + mempool::mds_co::flat_map<int,int> ref_map; +#endif + + public: + int get_num_ref(int by = -1) const { +#ifdef MDS_REF_SET + if (by >= 0) { + if (ref_map.find(by) == ref_map.end()) { + return 0; + } else { + return ref_map.find(by)->second; + } + } +#endif + return ref; + } + virtual std::string_view pin_name(int by) const = 0; + //bool is_pinned_by(int by) { return ref_set.count(by); } + //multiset<int>& get_ref_set() { return ref_set; } + + virtual void last_put() {} + virtual void bad_put(int by) { +#ifdef MDS_REF_SET + 
ceph_assert(ref_map[by] > 0); +#endif + ceph_assert(ref > 0); + } + virtual void _put() {} + void put(int by) { +#ifdef MDS_REF_SET + if (ref == 0 || ref_map[by] == 0) { +#else + if (ref == 0) { +#endif + bad_put(by); + } else { + ref--; +#ifdef MDS_REF_SET + ref_map[by]--; +#endif + if (ref == 0) + last_put(); + if (state_test(STATE_NOTIFYREF)) + _put(); + } + } + + virtual void first_get() {} + virtual void bad_get(int by) { +#ifdef MDS_REF_SET + ceph_assert(by < 0 || ref_map[by] == 0); +#endif + ceph_abort(); + } + void get(int by) { + if (ref == 0) + first_get(); + ref++; +#ifdef MDS_REF_SET + if (ref_map.find(by) == ref_map.end()) + ref_map[by] = 0; + ref_map[by]++; +#endif + } + + void print_pin_set(std::ostream& out) const { +#ifdef MDS_REF_SET + for(auto const &p : ref_map) { + out << " " << pin_name(p.first) << "=" << p.second; + } +#else + out << " nref=" << ref; +#endif + } + +protected: + int auth_pins = 0; +#ifdef MDS_AUTHPIN_SET + mempool::mds_co::multiset<void*> auth_pin_set; +#endif + +public: + int get_num_auth_pins() const { return auth_pins; } +#ifdef MDS_AUTHPIN_SET + void print_authpin_set(std::ostream& out) const { + out << " (" << auth_pin_set << ")"; + } +#endif + + void dump_states(Formatter *f) const; + void dump(Formatter *f) const; + + // -------------------------------------------- + // auth pins + enum { + // can_auth_pin() error codes + ERR_NOT_AUTH = 1, + ERR_EXPORTING_TREE, + ERR_FRAGMENTING_DIR, + ERR_EXPORTING_INODE, + }; + virtual bool can_auth_pin(int *err_code=nullptr) const = 0; + virtual void auth_pin(void *who) = 0; + virtual void auth_unpin(void *who) = 0; + virtual bool is_frozen() const = 0; + virtual bool is_freezing() const = 0; + virtual bool is_freezing_or_frozen() const { + return is_frozen() || is_freezing(); + } + + + // -------------------------------------------- + // replication (across mds cluster) + protected: + unsigned replica_nonce = 0; // [replica] defined on replica + typedef 
mempool::mds_co::compact_map<mds_rank_t,unsigned> replica_map_type; + replica_map_type replica_map; // [auth] mds -> nonce + + public: + bool is_replicated() const { return !get_replicas().empty(); } + bool is_replica(mds_rank_t mds) const { return get_replicas().count(mds); } + int num_replicas() const { return get_replicas().size(); } + unsigned add_replica(mds_rank_t mds) { + if (get_replicas().count(mds)) + return ++get_replicas()[mds]; // inc nonce + if (get_replicas().empty()) + get(PIN_REPLICATED); + return get_replicas()[mds] = 1; + } + void add_replica(mds_rank_t mds, unsigned nonce) { + if (get_replicas().empty()) + get(PIN_REPLICATED); + get_replicas()[mds] = nonce; + } + unsigned get_replica_nonce(mds_rank_t mds) { + ceph_assert(get_replicas().count(mds)); + return get_replicas()[mds]; + } + void remove_replica(mds_rank_t mds) { + ceph_assert(get_replicas().count(mds)); + get_replicas().erase(mds); + if (get_replicas().empty()) { + put(PIN_REPLICATED); + } + } + void clear_replica_map() { + if (!get_replicas().empty()) + put(PIN_REPLICATED); + replica_map.clear(); + } + replica_map_type& get_replicas() { return replica_map; } + const replica_map_type& get_replicas() const { return replica_map; } + void list_replicas(std::set<mds_rank_t>& ls) const { + for (const auto &p : get_replicas()) { + ls.insert(p.first); + } + } + + unsigned get_replica_nonce() const { return replica_nonce; } + void set_replica_nonce(unsigned n) { replica_nonce = n; } + + + // --------------------------------------------- + // waiting + private: + mempool::mds_co::compact_multimap<uint64_t, std::pair<uint64_t, MDSContext*>> waiting; + static uint64_t last_wait_seq; + + public: + bool is_waiter_for(uint64_t mask, uint64_t min=0) { + if (!min) { + min = mask; + while (min & (min-1)) // if more than one bit is set + min &= min-1; // clear LSB + } + for (auto p = waiting.lower_bound(min); p != waiting.end(); ++p) { + if (p->first & mask) return true; + if (p->first > mask) return 
false; + } + return false; + } + virtual void add_waiter(uint64_t mask, MDSContext *c) { + if (waiting.empty()) + get(PIN_WAITER); + + uint64_t seq = 0; + if (mask & WAIT_ORDERED) { + seq = ++last_wait_seq; + mask &= ~WAIT_ORDERED; + } + waiting.insert(pair<uint64_t, pair<uint64_t, MDSContext*> >( + mask, + pair<uint64_t, MDSContext*>(seq, c))); +// pdout(10,g_conf()->debug_mds) << (mdsco_db_line_prefix(this)) +// << "add_waiter " << hex << mask << dec << " " << c +// << " on " << *this +// << dendl; + + } + virtual void take_waiting(uint64_t mask, MDSContext::vec& ls) { + if (waiting.empty()) return; + + // process ordered waiters in the same order that they were added. + std::map<uint64_t, MDSContext*> ordered_waiters; + + for (auto it = waiting.begin(); it != waiting.end(); ) { + if (it->first & mask) { + if (it->second.first > 0) { + ordered_waiters.insert(it->second); + } else { + ls.push_back(it->second.second); + } +// pdout(10,g_conf()->debug_mds) << (mdsco_db_line_prefix(this)) +// << "take_waiting mask " << hex << mask << dec << " took " << it->second +// << " tag " << hex << it->first << dec +// << " on " << *this +// << dendl; + waiting.erase(it++); + } else { +// pdout(10,g_conf()->debug_mds) << "take_waiting mask " << hex << mask << dec << " SKIPPING " << it->second +// << " tag " << hex << it->first << dec +// << " on " << *this +// << dendl; + ++it; + } + } + for (auto it = ordered_waiters.begin(); it != ordered_waiters.end(); ++it) { + ls.push_back(it->second); + } + if (waiting.empty()) { + put(PIN_WAITER); + waiting.clear(); + } + } + void finish_waiting(uint64_t mask, int result = 0); + + // --------------------------------------------- + // locking + // noop unless overloaded. 
+ virtual SimpleLock* get_lock(int type) { ceph_abort(); return 0; } + virtual void set_object_info(MDSCacheObjectInfo &info) { ceph_abort(); } + virtual void encode_lock_state(int type, bufferlist& bl) { ceph_abort(); } + virtual void decode_lock_state(int type, const bufferlist& bl) { ceph_abort(); } + virtual void finish_lock_waiters(int type, uint64_t mask, int r=0) { ceph_abort(); } + virtual void add_lock_waiter(int type, uint64_t mask, MDSContext *c) { ceph_abort(); } + virtual bool is_lock_waiting(int type, uint64_t mask) { ceph_abort(); return false; } + + virtual void clear_dirty_scattered(int type) { ceph_abort(); } + + // --------------------------------------------- + // ordering + virtual bool is_lt(const MDSCacheObject *r) const = 0; + struct ptr_lt { + bool operator()(const MDSCacheObject* l, const MDSCacheObject* r) const { + return l->is_lt(r); + } + }; + +}; + +inline std::ostream& operator<<(std::ostream& out, MDSCacheObject &o) { + o.print(out); + return out; +} + +inline std::ostream& operator<<(std::ostream& out, const mdsco_db_line_prefix& o) { + o.object->print_db_line_prefix(out); + return out; +} + +#endif diff --git a/src/mds/MDSContext.cc b/src/mds/MDSContext.cc new file mode 100644 index 00000000..b5b76847 --- /dev/null +++ b/src/mds/MDSContext.cc @@ -0,0 +1,140 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2012 Red Hat + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#include "MDSRank.h" + +#include "MDSContext.h" + +#include "common/dout.h" +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mds + +void MDSContext::complete(int r) { + MDSRank *mds = get_mds(); + ceph_assert(mds != nullptr); + ceph_assert(mds->mds_lock.is_locked_by_me()); + dout(10) << "MDSContext::complete: " << typeid(*this).name() << dendl; + return Context::complete(r); +} + +void MDSInternalContextWrapper::finish(int r) +{ + fin->complete(r); +} + +struct MDSIOContextList { + elist<MDSIOContextBase*> list; + ceph::spinlock lock; + MDSIOContextList() : list(member_offset(MDSIOContextBase, list_item)) {} + ~MDSIOContextList() { + list.clear(); // avoid assertion in elist's destructor + } +} ioctx_list; + +MDSIOContextBase::MDSIOContextBase(bool track) +{ + created_at = ceph::coarse_mono_clock::now(); + if (track) { + ioctx_list.lock.lock(); + ioctx_list.list.push_back(&list_item); + ioctx_list.lock.unlock(); + } +} + +MDSIOContextBase::~MDSIOContextBase() +{ + ioctx_list.lock.lock(); + list_item.remove_myself(); + ioctx_list.lock.unlock(); +} + +bool MDSIOContextBase::check_ios_in_flight(ceph::coarse_mono_time cutoff, + std::string& slow_count, + ceph::coarse_mono_time& oldest) +{ + static const unsigned MAX_COUNT = 100; + unsigned slow = 0; + + ioctx_list.lock.lock(); + for (elist<MDSIOContextBase*>::iterator p = ioctx_list.list.begin(); !p.end(); ++p) { + MDSIOContextBase *c = *p; + if (c->created_at >= cutoff) + break; + ++slow; + if (slow > MAX_COUNT) + break; + if (slow == 1) + oldest = c->created_at; + } + ioctx_list.lock.unlock(); + + if (slow > 0) { + if (slow > MAX_COUNT) + slow_count = std::to_string(MAX_COUNT) + "+"; + else + slow_count = std::to_string(slow); + return true; + } else { + return false; + } +} + +void MDSIOContextBase::complete(int r) { + MDSRank *mds = get_mds(); + + dout(10) << "MDSIOContextBase::complete: " << typeid(*this).name() << dendl; + ceph_assert(mds != NULL); + // Note, MDSIOContext is 
passed outside the MDS and, strangely, we grab the
  // lock here when MDSContext::complete would otherwise assume the lock is
  // already acquired.
  std::lock_guard l(mds->mds_lock);

  if (mds->is_daemon_stopping()) {
    // rank is shutting down: silently drop the completion
    dout(4) << "MDSIOContextBase::complete: dropping for stopping "
	    << typeid(*this).name() << dendl;
    return;
  }

  if (r == -EBLACKLISTED) {
    // our objecter session was blacklisted; respawn rather than continue
    derr << "MDSIOContextBase: blacklisted! Restarting..." << dendl;
    mds->respawn();
  } else {
    MDSContext::complete(r);
  }
}

void MDSLogContextBase::complete(int r) {
  // copy members into locals first: MDSIOContextBase::complete() below may
  // free `this`, so no member may be touched afterwards
  MDLog *mdlog = get_mds()->mdlog;
  uint64_t safe_pos = write_pos;
  pre_finish(r);
  // MDSIOContextBase::complete() may free this
  MDSIOContextBase::complete(r);
  // advance the journal's safe position only after the finisher has run
  mdlog->set_safe_pos(safe_pos);
}

void MDSIOContextWrapper::finish(int r)
{
  fin->complete(r);
}

// First call re-queues this context on the MDS finisher; the second
// (re-entered with async already cleared) completes it for real.
void C_IO_Wrapper::complete(int r)
{
  if (async) {
    async = false;
    get_mds()->finisher->queue(this, r);
  } else {
    MDSIOContext::complete(r);
  }
}
diff --git a/src/mds/MDSContext.h b/src/mds/MDSContext.h
new file mode 100644
index 00000000..24269008
--- /dev/null
+++ b/src/mds/MDSContext.h
@@ -0,0 +1,212 @@
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2012 Red Hat
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */


#ifndef MDS_CONTEXT_H
#define MDS_CONTEXT_H

#include <vector>
#include <deque>

#include "include/Context.h"
#include "include/elist.h"
#include "include/spinlock.h"
#include "common/ceph_time.h"

class MDSRank;

/**
 * Completion which has access to a reference to the global MDS instance.
 *
 * This class exists so that Context subclasses can provide the MDS pointer
 * from a pointer they already had, e.g.
MDCache or Locker, rather than
 * necessarily having to carry around an extra MDS* pointer.
 */
class MDSContext : public Context
{
public:
template<template<typename> class A>
  using vec_alloc = std::vector<MDSContext*, A<MDSContext*>>;
  // common container aliases for batches of waiters/completions
  using vec = vec_alloc<std::allocator>;

template<template<typename> class A>
  using que_alloc = std::deque<MDSContext*, A<MDSContext*>>;
  using que = que_alloc<std::allocator>;

  // asserts that the caller already holds mds_lock, then runs finish()
  void complete(int r) override;
  virtual MDSRank *get_mds() = 0;
};

/* Children of this could have used multiple inheritance with MDSHolder and
 * MDSContext but then get_mds() would be ambiguous.
 */
template<class T>
class MDSHolder : public T
{
public:
  MDSRank* get_mds() override {
    return mds;
  }

protected:
  MDSHolder() = delete;
  // mds must be non-null; the pointer is stored raw — lifetime is managed
  // elsewhere, not by this holder
  MDSHolder(MDSRank* mds) : mds(mds) {
    ceph_assert(mds != nullptr);
  }

  MDSRank* mds;
};

/**
 * General purpose, lets you pass in an MDS pointer.
 */
class MDSInternalContext : public MDSHolder<MDSContext>
{
public:
  MDSInternalContext() = delete;

protected:
  explicit MDSInternalContext(MDSRank *mds_) : MDSHolder(mds_) {}
};

/**
 * Wrap a regular Context up as an Internal context. Useful
 * if you're trying to work with one of our more generic frameworks.
 */
class MDSInternalContextWrapper : public MDSInternalContext
{
protected:
  Context *fin = nullptr;  // wrapped completion, invoked from finish()
  void finish(int r) override;
public:
  MDSInternalContextWrapper(MDSRank *m, Context *c) : MDSInternalContext(m), fin(c) {}
};

class MDSIOContextBase : public MDSContext
{
public:
  // track=true registers this context in the global in-flight IO list so
  // check_ios_in_flight() can report it as a slow IO
  MDSIOContextBase(bool track=true);
  virtual ~MDSIOContextBase();
  MDSIOContextBase(const MDSIOContextBase&) = delete;
  MDSIOContextBase& operator=(const MDSIOContextBase&) = delete;

  void complete(int r) override;

  virtual void print(ostream& out) const = 0;

  // scan the in-flight list for IOs created before `cutoff`; returns true
  // and fills slow_count/oldest if any were found
  static bool check_ios_in_flight(ceph::coarse_mono_time cutoff,
				  std::string& slow_count,
				  ceph::coarse_mono_time& oldest);
private:
  ceph::coarse_mono_time created_at;         // when this IO was issued
  elist<MDSIOContextBase*>::item list_item;  // membership in the global list

  friend struct MDSIOContextList;
};

/**
 * Completion for a log operation; takes the big MDSRank lock
 * before executing the finish function, and updates the log's safe pos
 * after the finish function returns.
 */
class MDSLogContextBase : public MDSIOContextBase
{
protected:
  uint64_t write_pos = 0;  // journal position this event was written at
public:
  MDSLogContextBase() = default;
  void complete(int r) final;
  void set_write_pos(uint64_t wp) { write_pos = wp; }
  // hook invoked before the wrapped completion runs
  virtual void pre_finish(int r) {}
  void print(ostream& out) const override {
    out << "log_event(" << write_pos << ")";
  }
};

/**
 * Completion for an I/O operation, takes big MDSRank lock
 * before executing finish function.
 */
class MDSIOContext : public MDSHolder<MDSIOContextBase>
{
public:
  explicit MDSIOContext(MDSRank *mds_) : MDSHolder(mds_) {}
};

/**
 * Wrap a regular Context up as an IO Context. Useful
 * if you're trying to work with one of our more generic frameworks.
 */
class MDSIOContextWrapper : public MDSHolder<MDSIOContextBase>
{
protected:
  Context *fin;  // wrapped completion, invoked from finish()
public:
  MDSIOContextWrapper(MDSRank *m, Context *c) : MDSHolder(m), fin(c) {}
  void finish(int r) override;
  void print(ostream& out) const override {
    out << "io_context_wrapper(" << fin << ")";
  }
};

/**
 * No-op for callers expecting MDSInternalContext
 */
class C_MDSInternalNoop : public MDSContext
{
public:
  void finish(int r) override {}
  // deletes itself without running finish(); get_mds() is never reached
  void complete(int r) override { delete this; }
protected:
  MDSRank* get_mds() override final {ceph_abort();}
};


/**
 * This class is used where you have an MDSInternalContext but
 * you sometimes want to call it back from an I/O completion.
 */
class C_IO_Wrapper : public MDSIOContext
{
protected:
  bool async;           // true until re-queued on the finisher once
  MDSContext *wrapped;  // owned until handed to complete(); see dtor
  void finish(int r) override {
    wrapped->complete(r);
    // complete() is expected to have freed it; clear so the destructor
    // does not double-delete
    wrapped = nullptr;
  }
public:
  C_IO_Wrapper(MDSRank *mds_, MDSContext *wrapped_) :
    MDSIOContext(mds_), async(true), wrapped(wrapped_) {
    ceph_assert(wrapped != NULL);
  }

  ~C_IO_Wrapper() override {
    if (wrapped != nullptr) {
      delete wrapped;
      wrapped = nullptr;
    }
  }
  void complete(int r) final;
  void print(ostream& out) const override {
    out << "io_wrapper(" << wrapped << ")";
  }
};

using MDSGather = C_GatherBase<MDSContext, C_MDSInternalNoop>;
using MDSGatherBuilder = C_GatherBuilderBase<MDSContext, MDSGather>;

using MDSContextFactory = ContextFactory<MDSContext>;

#endif // MDS_CONTEXT_H
diff --git a/src/mds/MDSContinuation.h b/src/mds/MDSContinuation.h
new file mode 100644
index 00000000..97bae912
--- /dev/null
+++ b/src/mds/MDSContinuation.h
@@ -0,0 +1,33 @@
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2014 Red Hat
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License
version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "common/Continuation.h" +#include "mds/Mutation.h" +#include "mds/Server.h" + +#include "MDSContext.h" + +class MDSContinuation : public Continuation { +protected: + Server *server; + MDSContext *get_internal_callback(int stage) { + return new MDSInternalContextWrapper(server->mds, get_callback(stage)); + } + MDSIOContextBase *get_io_callback(int stage) { + return new MDSIOContextWrapper(server->mds, get_callback(stage)); + } +public: + MDSContinuation(Server *s) : + Continuation(NULL), server(s) {} +}; diff --git a/src/mds/MDSDaemon.cc b/src/mds/MDSDaemon.cc new file mode 100644 index 00000000..4ef06740 --- /dev/null +++ b/src/mds/MDSDaemon.cc @@ -0,0 +1,1268 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#include <unistd.h> + +#include "include/compat.h" +#include "include/types.h" +#include "include/str_list.h" + +#include "common/Clock.h" +#include "common/HeartbeatMap.h" +#include "common/Timer.h" +#include "common/ceph_argparse.h" +#include "common/config.h" +#include "common/entity_name.h" +#include "common/errno.h" +#include "common/perf_counters.h" +#include "common/signal.h" +#include "common/version.h" + +#include "global/signal_handler.h" + +#include "msg/Messenger.h" +#include "mon/MonClient.h" + +#include "osdc/Objecter.h" + +#include "MDSMap.h" + +#include "MDSDaemon.h" +#include "Server.h" +#include "Locker.h" + +#include "SnapServer.h" +#include "SnapClient.h" + +#include "events/ESession.h" +#include "events/ESubtreeMap.h" + +#include "auth/AuthAuthorizeHandler.h" +#include "auth/RotatingKeyRing.h" +#include "auth/KeyRing.h" + +#include "perfglue/cpu_profiler.h" +#include "perfglue/heap_profiler.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mds +#undef dout_prefix +#define dout_prefix *_dout << "mds." << name << ' ' + +// cons/des +MDSDaemon::MDSDaemon(std::string_view n, Messenger *m, MonClient *mc) : + Dispatcher(m->cct), + mds_lock("MDSDaemon::mds_lock"), + stopping(false), + timer(m->cct, mds_lock), + gss_ktfile_client(m->cct->_conf.get_val<std::string>("gss_ktab_client_file")), + beacon(m->cct, mc, n), + name(n), + messenger(m), + monc(mc), + mgrc(m->cct, m), + log_client(m->cct, messenger, &mc->monmap, LogClient::NO_FLAGS), + mds_rank(NULL), + asok_hook(NULL), + starttime(mono_clock::now()) +{ + orig_argc = 0; + orig_argv = NULL; + + clog = log_client.create_channel(); + if (!gss_ktfile_client.empty()) { + // Assert we can export environment variable + /* + The default client keytab is used, if it is present and readable, + to automatically obtain initial credentials for GSSAPI client + applications. 
The principal name of the first entry in the client + keytab is used by default when obtaining initial credentials. + 1. The KRB5_CLIENT_KTNAME environment variable. + 2. The default_client_keytab_name profile variable in [libdefaults]. + 3. The hardcoded default, DEFCKTNAME. + */ + const int32_t set_result(setenv("KRB5_CLIENT_KTNAME", + gss_ktfile_client.c_str(), 1)); + ceph_assert(set_result == 0); + } + + monc->set_messenger(messenger); + + mdsmap.reset(new MDSMap); +} + +MDSDaemon::~MDSDaemon() { + std::lock_guard lock(mds_lock); + + delete mds_rank; + mds_rank = NULL; +} + +class MDSSocketHook : public AdminSocketHook { + MDSDaemon *mds; +public: + explicit MDSSocketHook(MDSDaemon *m) : mds(m) {} + bool call(std::string_view command, const cmdmap_t& cmdmap, + std::string_view format, bufferlist& out) override { + stringstream ss; + bool r = mds->asok_command(command, cmdmap, format, ss); + out.append(ss); + return r; + } +}; + +bool MDSDaemon::asok_command(std::string_view command, const cmdmap_t& cmdmap, + std::string_view format, std::ostream& ss) +{ + dout(1) << "asok_command: " << command << " (starting...)" << dendl; + + Formatter *f = Formatter::create(format, "json-pretty", "json-pretty"); + bool handled = false; + if (command == "status") { + dump_status(f); + handled = true; + } else { + if (mds_rank == NULL) { + dout(1) << "Can't run that command on an inactive MDS!" 
<< dendl; + f->dump_string("error", "mds_not_active"); + } else { + try { + handled = mds_rank->handle_asok_command(command, cmdmap, f, ss); + } catch (const bad_cmd_get& e) { + ss << e.what(); + } + } + } + f->flush(ss); + delete f; + + dout(1) << "asok_command: " << command << " (complete)" << dendl; + + return handled; +} + +void MDSDaemon::dump_status(Formatter *f) +{ + f->open_object_section("status"); + f->dump_stream("cluster_fsid") << monc->get_fsid(); + if (mds_rank) { + f->dump_int("whoami", mds_rank->get_nodeid()); + } else { + f->dump_int("whoami", MDS_RANK_NONE); + } + + f->dump_int("id", monc->get_global_id()); + f->dump_string("want_state", ceph_mds_state_name(beacon.get_want_state())); + f->dump_string("state", ceph_mds_state_name(mdsmap->get_state_gid(mds_gid_t( + monc->get_global_id())))); + if (mds_rank) { + std::lock_guard l(mds_lock); + mds_rank->dump_status(f); + } + + f->dump_unsigned("mdsmap_epoch", mdsmap->get_epoch()); + if (mds_rank) { + f->dump_unsigned("osdmap_epoch", mds_rank->get_osd_epoch()); + f->dump_unsigned("osdmap_epoch_barrier", mds_rank->get_osd_epoch_barrier()); + } else { + f->dump_unsigned("osdmap_epoch", 0); + f->dump_unsigned("osdmap_epoch_barrier", 0); + } + + f->dump_float("uptime", get_uptime().count()); + + f->close_section(); // status +} + +void MDSDaemon::set_up_admin_socket() +{ + int r; + AdminSocket *admin_socket = g_ceph_context->get_admin_socket(); + ceph_assert(asok_hook == nullptr); + asok_hook = new MDSSocketHook(this); + r = admin_socket->register_command("status", "status", asok_hook, + "high-level status of MDS"); + ceph_assert(r == 0); + r = admin_socket->register_command("dump_ops_in_flight", + "dump_ops_in_flight", asok_hook, + "show the ops currently in flight"); + ceph_assert(r == 0); + r = admin_socket->register_command("ops", + "ops", asok_hook, + "show the ops currently in flight"); + ceph_assert(r == 0); + r = admin_socket->register_command("dump_blocked_ops", "dump_blocked_ops", + asok_hook, + 
"show the blocked ops currently in flight"); + ceph_assert(r == 0); + r = admin_socket->register_command("dump_historic_ops", "dump_historic_ops", + asok_hook, + "show recent ops"); + ceph_assert(r == 0); + r = admin_socket->register_command("dump_historic_ops_by_duration", "dump_historic_ops_by_duration", + asok_hook, + "show recent ops, sorted by op duration"); + ceph_assert(r == 0); + r = admin_socket->register_command("scrub_path", + "scrub_path name=path,type=CephString " + "name=scrubops,type=CephChoices," + "strings=force|recursive|repair,n=N,req=false", + asok_hook, + "scrub an inode and output results"); + ceph_assert(r == 0); + r = admin_socket->register_command("tag path", + "tag path name=path,type=CephString" + " name=tag,type=CephString", + asok_hook, + "Apply scrub tag recursively"); + ceph_assert(r == 0); + r = admin_socket->register_command("flush_path", + "flush_path name=path,type=CephString", + asok_hook, + "flush an inode (and its dirfrags)"); + ceph_assert(r == 0); + r = admin_socket->register_command("export dir", + "export dir " + "name=path,type=CephString " + "name=rank,type=CephInt", + asok_hook, + "migrate a subtree to named MDS"); + ceph_assert(r == 0); + r = admin_socket->register_command("dump cache", + "dump cache name=path,type=CephString,req=false", + asok_hook, + "dump metadata cache (optionally to a file)"); + ceph_assert(r == 0); + r = admin_socket->register_command("cache status", + "cache status", + asok_hook, + "show cache status"); + ceph_assert(r == 0); + r = admin_socket->register_command("dump tree", + "dump tree " + "name=root,type=CephString,req=true " + "name=depth,type=CephInt,req=false ", + asok_hook, + "dump metadata cache for subtree"); + ceph_assert(r == 0); + r = admin_socket->register_command("dump loads", + "dump loads", + asok_hook, + "dump metadata loads"); + ceph_assert(r == 0); + r = admin_socket->register_command("dump snaps", + "dump snaps name=server,type=CephChoices,strings=--server,req=false", + 
asok_hook, + "dump snapshots"); + ceph_assert(r == 0); + r = admin_socket->register_command("session evict", + "session evict name=client_id,type=CephString", + asok_hook, + "Evict a CephFS client"); + ceph_assert(r == 0); + r = admin_socket->register_command("session ls", + "session ls", + asok_hook, + "Enumerate connected CephFS clients"); + ceph_assert(r == 0); + r = admin_socket->register_command("session config", + "session config name=client_id,type=CephInt,req=true " + "name=option,type=CephString,req=true " + "name=value,type=CephString,req=false ", + asok_hook, + "Config a CephFS client session"); + assert(r == 0); + r = admin_socket->register_command("osdmap barrier", + "osdmap barrier name=target_epoch,type=CephInt", + asok_hook, + "Wait until the MDS has this OSD map epoch"); + ceph_assert(r == 0); + r = admin_socket->register_command("flush journal", + "flush journal", + asok_hook, + "Flush the journal to the backing store"); + ceph_assert(r == 0); + r = admin_socket->register_command("force_readonly", + "force_readonly", + asok_hook, + "Force MDS to read-only mode"); + ceph_assert(r == 0); + r = admin_socket->register_command("get subtrees", + "get subtrees", + asok_hook, + "Return the subtree map"); + ceph_assert(r == 0); + r = admin_socket->register_command("dirfrag split", + "dirfrag split " + "name=path,type=CephString,req=true " + "name=frag,type=CephString,req=true " + "name=bits,type=CephInt,req=true ", + asok_hook, + "Fragment directory by path"); + ceph_assert(r == 0); + r = admin_socket->register_command("dirfrag merge", + "dirfrag merge " + "name=path,type=CephString,req=true " + "name=frag,type=CephString,req=true", + asok_hook, + "De-fragment directory by path"); + ceph_assert(r == 0); + r = admin_socket->register_command("dirfrag ls", + "dirfrag ls " + "name=path,type=CephString,req=true", + asok_hook, + "List fragments in directory"); + ceph_assert(r == 0); + r = admin_socket->register_command("openfiles ls", + "openfiles ls", + 
asok_hook, + "List the opening files and their caps"); + ceph_assert(r == 0); + r = admin_socket->register_command("dump inode", + "dump inode " + "name=number,type=CephInt,req=true", + asok_hook, + "dump inode by inode number"); + ceph_assert(r == 0); +} + +void MDSDaemon::clean_up_admin_socket() +{ + g_ceph_context->get_admin_socket()->unregister_commands(asok_hook); + delete asok_hook; + asok_hook = NULL; +} + +int MDSDaemon::init() +{ + dout(10) << sizeof(MDSCacheObject) << "\tMDSCacheObject" << dendl; + dout(10) << sizeof(CInode) << "\tCInode" << dendl; + dout(10) << sizeof(elist<void*>::item) << "\t elist<>::item *7=" << 7*sizeof(elist<void*>::item) << dendl; + dout(10) << sizeof(CInode::mempool_inode) << "\t inode " << dendl; + dout(10) << sizeof(CInode::mempool_old_inode) << "\t old_inode " << dendl; + dout(10) << sizeof(nest_info_t) << "\t nest_info_t " << dendl; + dout(10) << sizeof(frag_info_t) << "\t frag_info_t " << dendl; + dout(10) << sizeof(SimpleLock) << "\t SimpleLock *5=" << 5*sizeof(SimpleLock) << dendl; + dout(10) << sizeof(ScatterLock) << "\t ScatterLock *3=" << 3*sizeof(ScatterLock) << dendl; + dout(10) << sizeof(CDentry) << "\tCDentry" << dendl; + dout(10) << sizeof(elist<void*>::item) << "\t elist<>::item" << dendl; + dout(10) << sizeof(SimpleLock) << "\t SimpleLock" << dendl; + dout(10) << sizeof(CDir) << "\tCDir " << dendl; + dout(10) << sizeof(elist<void*>::item) << "\t elist<>::item *2=" << 2*sizeof(elist<void*>::item) << dendl; + dout(10) << sizeof(fnode_t) << "\t fnode_t " << dendl; + dout(10) << sizeof(nest_info_t) << "\t nest_info_t *2" << dendl; + dout(10) << sizeof(frag_info_t) << "\t frag_info_t *2" << dendl; + dout(10) << sizeof(Capability) << "\tCapability " << dendl; + dout(10) << sizeof(xlist<void*>::item) << "\t xlist<>::item *2=" << 2*sizeof(xlist<void*>::item) << dendl; + + messenger->add_dispatcher_tail(&beacon); + messenger->add_dispatcher_tail(this); + + // init monc + monc->set_messenger(messenger); + + 
monc->set_want_keys(CEPH_ENTITY_TYPE_MON | CEPH_ENTITY_TYPE_OSD | + CEPH_ENTITY_TYPE_MDS | CEPH_ENTITY_TYPE_MGR); + int r = 0; + r = monc->init(); + if (r < 0) { + derr << "ERROR: failed to init monc: " << cpp_strerror(-r) << dendl; + mds_lock.Lock(); + suicide(); + mds_lock.Unlock(); + return r; + } + + messenger->set_auth_client(monc); + messenger->set_auth_server(monc); + monc->set_handle_authentication_dispatcher(this); + + // tell monc about log_client so it will know about mon session resets + monc->set_log_client(&log_client); + + r = monc->authenticate(); + if (r < 0) { + derr << "ERROR: failed to authenticate: " << cpp_strerror(-r) << dendl; + mds_lock.Lock(); + suicide(); + mds_lock.Unlock(); + return r; + } + + int rotating_auth_attempts = 0; + auto rotating_auth_timeout = + g_conf().get_val<int64_t>("rotating_keys_bootstrap_timeout"); + while (monc->wait_auth_rotating(rotating_auth_timeout) < 0) { + if (++rotating_auth_attempts <= g_conf()->max_rotating_auth_attempts) { + derr << "unable to obtain rotating service keys; retrying" << dendl; + continue; + } + derr << "ERROR: failed to refresh rotating keys, " + << "maximum retry time reached." 
<< dendl; + mds_lock.Lock(); + suicide(); + mds_lock.Unlock(); + return -ETIMEDOUT; + } + + mds_lock.Lock(); + if (beacon.get_want_state() == CEPH_MDS_STATE_DNE) { + dout(4) << __func__ << ": terminated already, dropping out" << dendl; + mds_lock.Unlock(); + return 0; + } + + monc->sub_want("mdsmap", 0, 0); + monc->renew_subs(); + + mds_lock.Unlock(); + + // Set up admin socket before taking mds_lock, so that ordering + // is consistent (later we take mds_lock within asok callbacks) + set_up_admin_socket(); + mds_lock.Lock(); + if (beacon.get_want_state() == MDSMap::STATE_DNE) { + suicide(); // we could do something more graceful here + dout(4) << __func__ << ": terminated already, dropping out" << dendl; + mds_lock.Unlock(); + return 0; + } + + timer.init(); + + beacon.init(*mdsmap); + messenger->set_myname(entity_name_t::MDS(MDS_RANK_NONE)); + + // schedule tick + reset_tick(); + mds_lock.Unlock(); + + return 0; +} + +void MDSDaemon::reset_tick() +{ + // cancel old + if (tick_event) timer.cancel_event(tick_event); + + // schedule + tick_event = timer.add_event_after( + g_conf()->mds_tick_interval, + new FunctionContext([this](int) { + ceph_assert(mds_lock.is_locked_by_me()); + tick(); + })); +} + +void MDSDaemon::tick() +{ + // reschedule + reset_tick(); + + // Call through to subsystems' tick functions + if (mds_rank) { + mds_rank->tick(); + } +} + +void MDSDaemon::send_command_reply(const MCommand::const_ref &m, MDSRank *mds_rank, + int r, bufferlist outbl, + std::string_view outs) +{ + auto priv = m->get_connection()->get_priv(); + auto session = static_cast<Session *>(priv.get()); + ceph_assert(session != NULL); + // If someone is using a closed session for sending commands (e.g. + // the ceph CLI) then we should feel free to clean up this connection + // as soon as we've sent them a response. 
+ const bool live_session = + session->get_state_seq() > 0 && + mds_rank && + mds_rank->sessionmap.get_session(session->info.inst.name); + + if (!live_session) { + // This session only existed to issue commands, so terminate it + // as soon as we can. + ceph_assert(session->is_closed()); + session->get_connection()->mark_disposable(); + } + priv.reset(); + + auto reply = MCommandReply::create(r, outs); + reply->set_tid(m->get_tid()); + reply->set_data(outbl); + m->get_connection()->send_message2(reply); +} + +void MDSDaemon::handle_command(const MCommand::const_ref &m) +{ + auto priv = m->get_connection()->get_priv(); + auto session = static_cast<Session *>(priv.get()); + ceph_assert(session != NULL); + + int r = 0; + cmdmap_t cmdmap; + std::stringstream ss; + std::string outs; + bufferlist outbl; + Context *run_after = NULL; + bool need_reply = true; + + if (!session->auth_caps.allow_all()) { + dout(1) << __func__ + << ": received command from client without `tell` capability: " + << *m->get_connection()->peer_addrs << dendl; + + ss << "permission denied"; + r = -EPERM; + } else if (m->cmd.empty()) { + r = -EINVAL; + ss << "no command given"; + outs = ss.str(); + } else if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) { + r = -EINVAL; + outs = ss.str(); + } else { + try { + r = _handle_command(cmdmap, m, &outbl, &outs, &run_after, &need_reply); + } catch (const bad_cmd_get& e) { + outs = e.what(); + r = -EINVAL; + } + } + priv.reset(); + + if (need_reply) { + send_command_reply(m, mds_rank, r, outbl, outs); + } + + if (run_after) { + run_after->complete(0); + } +} + +const std::vector<MDSDaemon::MDSCommand>& MDSDaemon::get_commands() +{ + static const std::vector<MDSCommand> commands = { + MDSCommand("injectargs name=injected_args,type=CephString,n=N", "inject configuration arguments into running MDS"), + MDSCommand("config set name=key,type=CephString name=value,type=CephString", "Set a configuration option at runtime (not persistent)"), + MDSCommand("config unset 
name=key,type=CephString", "Unset a configuration option at runtime (not persistent)"), + MDSCommand("exit", "Terminate this MDS"), + MDSCommand("respawn", "Restart this MDS"), + MDSCommand("session kill name=session_id,type=CephInt", "End a client session"), + MDSCommand("cpu_profiler name=arg,type=CephChoices,strings=status|flush", "run cpu profiling on daemon"), + MDSCommand("session ls name=filters,type=CephString,n=N,req=false", "List client sessions"), + MDSCommand("client ls name=filters,type=CephString,n=N,req=false", "List client sessions"), + MDSCommand("session evict name=filters,type=CephString,n=N,req=false", "Evict client session(s)"), + MDSCommand("client evict name=filters,type=CephString,n=N,req=false", "Evict client session(s)"), + MDSCommand("session config name=client_id,type=CephInt name=option,type=CephString name=value,type=CephString,req=false", + "Config a client session"), + MDSCommand("client config name=client_id,type=CephInt name=option,type=CephString name=value,type=CephString,req=false", + "Config a client session"), + MDSCommand("damage ls", "List detected metadata damage"), + MDSCommand("damage rm name=damage_id,type=CephInt", "Remove a damage table entry"), + MDSCommand("version", "report version of MDS"), + MDSCommand("heap " + "name=heapcmd,type=CephChoices,strings=dump|start_profiler|stop_profiler|release|stats", + "show heap usage info (available only if compiled with tcmalloc)"), + MDSCommand("cache drop name=timeout,type=CephInt,range=0,req=false", "trim cache and optionally request client to release all caps and flush the journal"), + MDSCommand("scrub start name=path,type=CephString name=scrubops,type=CephChoices,strings=force|recursive|repair,n=N,req=false name=tag,type=CephString,req=false", + "scrub an inode and output results"), + MDSCommand("scrub abort", "Abort in progress scrub operation(s)"), + MDSCommand("scrub pause", "Pause in progress scrub operation(s)"), + MDSCommand("scrub resume", "Resume paused scrub 
operation(s)"), + MDSCommand("scrub status", "Status of scrub operation"), + }; + return commands; +}; + +int MDSDaemon::_handle_command( + const cmdmap_t &cmdmap, + const MCommand::const_ref &m, + bufferlist *outbl, + std::string *outs, + Context **run_later, + bool *need_reply) +{ + ceph_assert(outbl != NULL); + ceph_assert(outs != NULL); + + class SuicideLater : public Context + { + MDSDaemon *mds; + + public: + explicit SuicideLater(MDSDaemon *mds_) : mds(mds_) {} + void finish(int r) override { + // Wait a little to improve chances of caller getting + // our response before seeing us disappear from mdsmap + sleep(1); + + mds->suicide(); + } + }; + + + class RespawnLater : public Context + { + MDSDaemon *mds; + + public: + + explicit RespawnLater(MDSDaemon *mds_) : mds(mds_) {} + void finish(int r) override { + // Wait a little to improve chances of caller getting + // our response before seeing us disappear from mdsmap + sleep(1); + + mds->respawn(); + } + }; + + std::stringstream ds; + std::stringstream ss; + std::string prefix; + std::string format; + std::unique_ptr<Formatter> f(Formatter::create(format)); + cmd_getval(cct, cmdmap, "prefix", prefix); + + int r = 0; + + if (prefix == "get_command_descriptions") { + int cmdnum = 0; + std::unique_ptr<JSONFormatter> f(std::make_unique<JSONFormatter>()); + f->open_object_section("command_descriptions"); + for (auto& c : get_commands()) { + ostringstream secname; + secname << "cmd" << setfill('0') << std::setw(3) << cmdnum; + dump_cmddesc_to_json(f.get(), m->get_connection()->get_features(), + secname.str(), c.cmdstring, c.helpstring, + c.module, "*", 0); + cmdnum++; + } + f->close_section(); // command_descriptions + + f->flush(ds); + goto out; + } + + cmd_getval(cct, cmdmap, "format", format); + if (prefix == "version") { + if (f) { + f->open_object_section("version"); + f->dump_string("version", pretty_version_to_str()); + f->close_section(); + f->flush(ds); + } else { + ds << pretty_version_to_str(); + } + } 
else if (prefix == "injectargs") { + vector<string> argsvec; + cmd_getval(cct, cmdmap, "injected_args", argsvec); + + if (argsvec.empty()) { + r = -EINVAL; + ss << "ignoring empty injectargs"; + goto out; + } + string args = argsvec.front(); + for (vector<string>::iterator a = ++argsvec.begin(); a != argsvec.end(); ++a) + args += " " + *a; + r = cct->_conf.injectargs(args, &ss); + } else if (prefix == "config set") { + std::string key; + cmd_getval(cct, cmdmap, "key", key); + std::string val; + cmd_getval(cct, cmdmap, "value", val); + r = cct->_conf.set_val(key, val, &ss); + if (r == 0) { + cct->_conf.apply_changes(nullptr); + } + } else if (prefix == "config unset") { + std::string key; + cmd_getval(cct, cmdmap, "key", key); + r = cct->_conf.rm_val(key); + if (r == 0) { + cct->_conf.apply_changes(nullptr); + } + if (r == -ENOENT) { + r = 0; // idempotent + } + } else if (prefix == "exit") { + // We will send response before executing + ss << "Exiting..."; + *run_later = new SuicideLater(this); + } else if (prefix == "respawn") { + // We will send response before executing + ss << "Respawning..."; + *run_later = new RespawnLater(this); + } else if (prefix == "session kill") { + if (mds_rank == NULL) { + r = -EINVAL; + ss << "MDS not active"; + goto out; + } + // FIXME harmonize `session kill` with admin socket session evict + int64_t session_id = 0; + bool got = cmd_getval(cct, cmdmap, "session_id", session_id); + ceph_assert(got); + bool killed = mds_rank->evict_client(session_id, false, + g_conf()->mds_session_blacklist_on_evict, + ss); + if (!killed) + r = -ENOENT; + } else if (prefix == "heap") { + if (!ceph_using_tcmalloc()) { + r = -EOPNOTSUPP; + ss << "could not issue heap profiler command -- not using tcmalloc!"; + } else { + string heapcmd; + cmd_getval(cct, cmdmap, "heapcmd", heapcmd); + vector<string> heapcmd_vec; + get_str_vec(heapcmd, heapcmd_vec); + string value; + if (cmd_getval(cct, cmdmap, "value", value)) + heapcmd_vec.push_back(value); + 
ceph_heap_profiler_handle_command(heapcmd_vec, ds); + } + } else if (prefix == "cpu_profiler") { + string arg; + cmd_getval(cct, cmdmap, "arg", arg); + vector<string> argvec; + get_str_vec(arg, argvec); + cpu_profiler_handle_command(argvec, ds); + } else { + // Give MDSRank a shot at the command + if (!mds_rank) { + ss << "MDS not active"; + r = -EINVAL; + } + else { + bool handled; + try { + handled = mds_rank->handle_command(cmdmap, m, &r, &ds, &ss, + run_later, need_reply); + if (!handled) { + // MDSDaemon doesn't know this command + ss << "unrecognized command! " << prefix; + r = -EINVAL; + } + } catch (const bad_cmd_get& e) { + ss << e.what(); + r = -EINVAL; + } + } + } + +out: + *outs = ss.str(); + outbl->append(ds); + return r; +} + +void MDSDaemon::handle_mds_map(const MMDSMap::const_ref &m) +{ + version_t epoch = m->get_epoch(); + + // is it new? + if (epoch <= mdsmap->get_epoch()) { + dout(5) << "handle_mds_map old map epoch " << epoch << " <= " + << mdsmap->get_epoch() << ", discarding" << dendl; + return; + } + + dout(1) << "Updating MDS map to version " << epoch << " from " << m->get_source() << dendl; + + // keep old map, for a moment + std::unique_ptr<MDSMap> oldmap; + oldmap.swap(mdsmap); + + // decode and process + mdsmap.reset(new MDSMap); + mdsmap->decode(m->get_encoded()); + + monc->sub_got("mdsmap", mdsmap->get_epoch()); + + // verify compatset + CompatSet mdsmap_compat(MDSMap::get_compat_set_all()); + dout(10) << " my compat " << mdsmap_compat << dendl; + dout(10) << " mdsmap compat " << mdsmap->compat << dendl; + if (!mdsmap_compat.writeable(mdsmap->compat)) { + dout(0) << "handle_mds_map mdsmap compatset " << mdsmap->compat + << " not writeable with daemon features " << mdsmap_compat + << ", killing myself" << dendl; + suicide(); + return; + } + + // Calculate my effective rank (either my owned rank or the rank I'm following if STATE_STANDBY_REPLAY + const auto addrs = messenger->get_myaddrs(); + const auto myid = monc->get_global_id(); + 
const auto mygid = mds_gid_t(myid); + const auto whoami = mdsmap->get_rank_gid(mygid); + const auto old_state = oldmap->get_state_gid(mygid); + const auto new_state = mdsmap->get_state_gid(mygid); + const auto incarnation = mdsmap->get_inc_gid(mygid); + dout(10) << "my gid is " << myid << dendl; + dout(10) << "map says I am mds." << whoami << "." << incarnation + << " state " << ceph_mds_state_name(new_state) << dendl; + dout(10) << "msgr says I am " << addrs << dendl; + + // If we're removed from the MDSMap, stop all processing. + using DS = MDSMap::DaemonState; + if (old_state != DS::STATE_NULL && new_state == DS::STATE_NULL) { + const auto& oldinfo = oldmap->get_info_gid(mygid); + dout(1) << "Map removed me " << oldinfo + << " from cluster; respawning! See cluster/monitor logs for details." << dendl; + respawn(); + } + + if (old_state == DS::STATE_NULL && new_state != DS::STATE_NULL) { + /* The MDS has been added to the FSMap, now we can init the MgrClient */ + mgrc.init(); + messenger->add_dispatcher_tail(&mgrc); + monc->sub_want("mgrmap", 0, 0); + monc->renew_subs(); /* MgrMap receipt drives connection to ceph-mgr */ + } + + // mark down any failed peers + for (const auto& [gid, info] : oldmap->get_mds_info()) { + if (mdsmap->get_mds_info().count(gid) == 0) { + dout(10) << " peer mds gid " << gid << " removed from map" << dendl; + messenger->mark_down_addrs(info.addrs); + } + } + + if (whoami == MDS_RANK_NONE) { + // We do not hold a rank: + dout(10) << __func__ << ": handling map in rankless mode" << dendl; + + if (new_state == DS::STATE_STANDBY) { + /* Note: STATE_BOOT is never an actual state in the FSMap. The Monitors + * generally mark a new MDS as STANDBY (although it's possible to + * immediately be assigned a rank). + */ + if (old_state == DS::STATE_NULL) { + dout(1) << "Monitors have assigned me to become a standby." 
<< dendl; + beacon.set_want_state(*mdsmap, new_state); + } else if (old_state == DS::STATE_STANDBY) { + dout(5) << "I am still standby" << dendl; + } + } else if (new_state == DS::STATE_NULL) { + /* We are not in the MDSMap yet! Keep waiting: */ + ceph_assert(beacon.get_want_state() == DS::STATE_BOOT); + dout(10) << "not in map yet" << dendl; + } else { + /* We moved to standby somehow from another state */ + ceph_abort("invalid transition to standby"); + } + } else { + // Did we already hold a different rank? MDSMonitor shouldn't try + // to change that out from under me! + if (mds_rank && whoami != mds_rank->get_nodeid()) { + derr << "Invalid rank transition " << mds_rank->get_nodeid() << "->" + << whoami << dendl; + respawn(); + } + + // Did I previously not hold a rank? Initialize! + if (mds_rank == NULL) { + mds_rank = new MDSRankDispatcher(whoami, mds_lock, clog, + timer, beacon, mdsmap, messenger, monc, &mgrc, + new FunctionContext([this](int r){respawn();}), + new FunctionContext([this](int r){suicide();})); + dout(10) << __func__ << ": initializing MDS rank " + << mds_rank->get_nodeid() << dendl; + mds_rank->init(); + } + + // MDSRank is active: let him process the map, we have no say. + dout(10) << __func__ << ": handling map as rank " + << mds_rank->get_nodeid() << dendl; + mds_rank->handle_mds_map(m, *oldmap); + } + + beacon.notify_mdsmap(*mdsmap); +} + +void MDSDaemon::handle_signal(int signum) +{ + ceph_assert(signum == SIGINT || signum == SIGTERM); + derr << "*** got signal " << sig_str(signum) << " ***" << dendl; + { + std::lock_guard l(mds_lock); + if (stopping) { + return; + } + suicide(); + } +} + +void MDSDaemon::suicide() +{ + ceph_assert(mds_lock.is_locked()); + + // make sure we don't suicide twice + ceph_assert(stopping == false); + stopping = true; + + dout(1) << "suicide! 
Wanted state " + << ceph_mds_state_name(beacon.get_want_state()) << dendl; + + if (tick_event) { + timer.cancel_event(tick_event); + tick_event = 0; + } + + clean_up_admin_socket(); + + // Inform MDS we are going away, then shut down beacon + beacon.set_want_state(*mdsmap, MDSMap::STATE_DNE); + if (!mdsmap->is_dne_gid(mds_gid_t(monc->get_global_id()))) { + // Notify the MDSMonitor that we're dying, so that it doesn't have to + // wait for us to go laggy. Only do this if we're actually in the + // MDSMap, because otherwise the MDSMonitor will drop our message. + beacon.send_and_wait(1); + } + beacon.shutdown(); + + if (mgrc.is_initialized()) + mgrc.shutdown(); + + if (mds_rank) { + mds_rank->shutdown(); + } else { + timer.shutdown(); + + monc->shutdown(); + messenger->shutdown(); + } +} + +void MDSDaemon::respawn() +{ + // --- WARNING TO FUTURE COPY/PASTERS --- + // You must also add a call like + // + // ceph_pthread_setname(pthread_self(), "ceph-mds"); + // + // to main() so that /proc/$pid/stat field 2 contains "(ceph-mds)" + // instead of "(exe)", so that killall (and log rotation) will work. + + dout(1) << "respawn!" << dendl; + + /* Dump recent in case the MDS was stuck doing something which caused it to + * be removed from the MDSMap leading to respawn. */ + g_ceph_context->_log->dump_recent(); + + char *new_argv[orig_argc+1]; + dout(1) << " e: '" << orig_argv[0] << "'" << dendl; + for (int i=0; i<orig_argc; i++) { + new_argv[i] = (char *)orig_argv[i]; + dout(1) << " " << i << ": '" << orig_argv[i] << "'" << dendl; + } + new_argv[orig_argc] = NULL; + + /* Determine the path to our executable, test if Linux /proc/self/exe exists. + * This allows us to exec the same executable even if it has since been + * unlinked. 
+ */ + char exe_path[PATH_MAX] = ""; +#ifdef PROCPREFIX + if (readlink(PROCPREFIX "/proc/self/exe", exe_path, PATH_MAX-1) != -1) { + dout(1) << "respawning with exe " << exe_path << dendl; + strcpy(exe_path, PROCPREFIX "/proc/self/exe"); + } else { +#else + { +#endif + /* Print CWD for the user's interest */ + char buf[PATH_MAX]; + char *cwd = getcwd(buf, sizeof(buf)); + ceph_assert(cwd); + dout(1) << " cwd " << cwd << dendl; + + /* Fall back to a best-effort: just running in our CWD */ + strncpy(exe_path, orig_argv[0], PATH_MAX-1); + } + + dout(1) << " exe_path " << exe_path << dendl; + + unblock_all_signals(NULL); + execv(exe_path, new_argv); + + dout(0) << "respawn execv " << orig_argv[0] + << " failed with " << cpp_strerror(errno) << dendl; + + // We have to assert out here, because suicide() returns, and callers + // to respawn expect it never to return. + ceph_abort(); +} + + + +bool MDSDaemon::ms_dispatch2(const Message::ref &m) +{ + std::lock_guard l(mds_lock); + if (stopping) { + return false; + } + + // Drop out early if shutting down + if (beacon.get_want_state() == CEPH_MDS_STATE_DNE) { + dout(10) << " stopping, discarding " << *m << dendl; + return true; + } + + // First see if it's a daemon message + const bool handled_core = handle_core_message(m); + if (handled_core) { + return true; + } + + // Not core, try it as a rank message + if (mds_rank) { + return mds_rank->ms_dispatch(m); + } else { + return false; + } +} + +bool MDSDaemon::ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer) +{ + dout(10) << "MDSDaemon::ms_get_authorizer type=" + << ceph_entity_type_name(dest_type) << dendl; + + /* monitor authorization is being handled on different layer */ + if (dest_type == CEPH_ENTITY_TYPE_MON) + return true; + + *authorizer = monc->build_authorizer(dest_type); + return *authorizer != NULL; +} + + +/* + * high priority messages we always process + */ + +#define ALLOW_MESSAGES_FROM(peers) \ + do { \ + if (m->get_connection() && 
(m->get_connection()->get_peer_type() & (peers)) == 0) { \ + dout(0) << __FILE__ << "." << __LINE__ << ": filtered out request, peer=" \ + << m->get_connection()->get_peer_type() << " allowing=" \ + << #peers << " message=" << *m << dendl; \ + return true; \ + } \ + } while (0) + +bool MDSDaemon::handle_core_message(const Message::const_ref &m) +{ + switch (m->get_type()) { + case CEPH_MSG_MON_MAP: + ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_MON); + break; + + // MDS + case CEPH_MSG_MDS_MAP: + ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_MON | CEPH_ENTITY_TYPE_MDS); + handle_mds_map(MMDSMap::msgref_cast(m)); + break; + + // OSD + case MSG_COMMAND: + handle_command(MCommand::msgref_cast(m)); + break; + case CEPH_MSG_OSD_MAP: + ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_MON | CEPH_ENTITY_TYPE_OSD); + + if (mds_rank) { + mds_rank->handle_osd_map(); + } + break; + + case MSG_MON_COMMAND: + ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_MON); + clog->warn() << "dropping `mds tell` command from legacy monitor"; + break; + + default: + return false; + } + return true; +} + +void MDSDaemon::ms_handle_connect(Connection *con) +{ +} + +bool MDSDaemon::ms_handle_reset(Connection *con) +{ + if (con->get_peer_type() != CEPH_ENTITY_TYPE_CLIENT) + return false; + + std::lock_guard l(mds_lock); + if (stopping) { + return false; + } + dout(5) << "ms_handle_reset on " << con->get_peer_socket_addr() << dendl; + if (beacon.get_want_state() == CEPH_MDS_STATE_DNE) + return false; + + auto priv = con->get_priv(); + if (auto session = static_cast<Session *>(priv.get()); session) { + if (session->is_closed()) { + dout(3) << "ms_handle_reset closing connection for session " << session->info.inst << dendl; + con->mark_down(); + con->set_priv(nullptr); + } + } else { + con->mark_down(); + } + return false; +} + + +void MDSDaemon::ms_handle_remote_reset(Connection *con) +{ + if (con->get_peer_type() != CEPH_ENTITY_TYPE_CLIENT) + return; + + std::lock_guard l(mds_lock); + if (stopping) { + return; + } + + dout(5) << 
"ms_handle_remote_reset on " << con->get_peer_socket_addr() << dendl; + if (beacon.get_want_state() == CEPH_MDS_STATE_DNE) + return; + + auto priv = con->get_priv(); + if (auto session = static_cast<Session *>(priv.get()); session) { + if (session->is_closed()) { + dout(3) << "ms_handle_remote_reset closing connection for session " << session->info.inst << dendl; + con->mark_down(); + con->set_priv(nullptr); + } + } +} + +bool MDSDaemon::ms_handle_refused(Connection *con) +{ + // do nothing for now + return false; +} + +KeyStore *MDSDaemon::ms_get_auth1_authorizer_keystore() +{ + return monc->rotating_secrets.get(); +} + +bool MDSDaemon::parse_caps(const AuthCapsInfo& info, MDSAuthCaps& caps) +{ + caps.clear(); + if (info.allow_all) { + caps.set_allow_all(); + return true; + } else { + auto it = info.caps.begin(); + string auth_cap_str; + try { + decode(auth_cap_str, it); + } catch (const buffer::error& e) { + dout(1) << __func__ << ": cannot decode auth caps buffer of length " << info.caps.length() << dendl; + return false; + } + + dout(10) << __func__ << ": parsing auth_cap_str='" << auth_cap_str << "'" << dendl; + CachedStackStringStream cs; + if (caps.parse(g_ceph_context, auth_cap_str, cs.get())) { + return true; + } else { + dout(1) << __func__ << ": auth cap parse error: " << cs->strv() << " parsing '" << auth_cap_str << "'" << dendl; + return false; + } + } +} + +int MDSDaemon::ms_handle_authentication(Connection *con) +{ + /* N.B. without mds_lock! */ + MDSAuthCaps caps; + return parse_caps(con->get_peer_caps_info(), caps) ? 0 : -1; +} + +void MDSDaemon::ms_handle_accept(Connection *con) +{ + entity_name_t n(con->get_peer_type(), con->get_peer_global_id()); + std::lock_guard l(mds_lock); + if (stopping) { + return; + } + + // We allow connections and assign Session instances to connections + // even if we have not been assigned a rank, because clients with + // "allow *" are allowed to connect and do 'tell' operations before + // we have a rank. 
+ Session *s = NULL; + if (mds_rank) { + // If we do hold a rank, see if this is an existing client establishing + // a new connection, rather than a new client + s = mds_rank->sessionmap.get_session(n); + } + + // Wire up a Session* to this connection + // It doesn't go into a SessionMap instance until it sends an explicit + // request to open a session (initial state of Session is `closed`) + if (!s) { + s = new Session(con); + dout(10) << " new session " << s << " for " << s->info.inst + << " con " << con << dendl; + con->set_priv(RefCountedPtr{s, false}); + if (mds_rank) { + mds_rank->kick_waiters_for_any_client_connection(); + } + } else { + dout(10) << " existing session " << s << " for " << s->info.inst + << " existing con " << s->get_connection() + << ", new/authorizing con " << con << dendl; + con->set_priv(RefCountedPtr{s}); + } + + parse_caps(con->get_peer_caps_info(), s->auth_caps); + + dout(10) << "ms_handle_accept " << con->get_peer_socket_addr() << " con " << con << " session " << s << dendl; + if (s) { + if (s->get_connection() != con) { + dout(10) << " session connection " << s->get_connection() + << " -> " << con << dendl; + s->set_connection(con); + + // send out any queued messages + while (!s->preopen_out_queue.empty()) { + con->send_message2(s->preopen_out_queue.front()); + s->preopen_out_queue.pop_front(); + } + } + } +} + +bool MDSDaemon::is_clean_shutdown() +{ + if (mds_rank) { + return mds_rank->is_stopped(); + } else { + return true; + } +} diff --git a/src/mds/MDSDaemon.h b/src/mds/MDSDaemon.h new file mode 100644 index 00000000..8add46d6 --- /dev/null +++ b/src/mds/MDSDaemon.h @@ -0,0 +1,176 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License 
version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_MDS_H +#define CEPH_MDS_H + +#include <string_view> + +#include "messages/MCommand.h" +#include "messages/MCommandReply.h" +#include "messages/MGenericMessage.h" +#include "messages/MMDSMap.h" +#include "messages/MMonCommand.h" + +#include "common/LogClient.h" +#include "common/Mutex.h" +#include "common/Timer.h" +#include "include/Context.h" +#include "include/types.h" +#include "mgr/MgrClient.h" +#include "msg/Dispatcher.h" + +#include "Beacon.h" +#include "MDSMap.h" +#include "MDSRank.h" + +#define CEPH_MDS_PROTOCOL 34 /* cluster internal */ + +class Messenger; +class MonClient; + +class MDSDaemon : public Dispatcher { + public: + /* Global MDS lock: every time someone takes this, they must + * also check the `stopping` flag. If stopping is true, you + * must either do nothing and immediately drop the lock, or + * never drop the lock again (i.e. call respawn()) */ + Mutex mds_lock; + bool stopping; + + SafeTimer timer; + std::string gss_ktfile_client{}; + + mono_time get_starttime() const { + return starttime; + } + chrono::duration<double> get_uptime() const { + mono_time now = mono_clock::now(); + return chrono::duration<double>(now-starttime); + } + + protected: + Beacon beacon; + + std::string name; + + Messenger *messenger; + MonClient *monc; + MgrClient mgrc; + std::unique_ptr<MDSMap> mdsmap; + LogClient log_client; + LogChannelRef clog; + + MDSRankDispatcher *mds_rank; + + public: + MDSDaemon(std::string_view n, Messenger *m, MonClient *mc); + ~MDSDaemon() override; + int orig_argc; + const char **orig_argv; + + // handle a signal (e.g., SIGTERM) + void handle_signal(int signum); + + int init(); + + /** + * Hint at whether we were shutdown gracefully (i.e. we were only + * in standby, or our rank was stopped). Should be removed once + * we handle shutdown properly (e.g. clear out all message queues) + * such that deleting xlists doesn't assert. 
+ */ + bool is_clean_shutdown(); + protected: + // tick and other timer fun + Context *tick_event = nullptr; + void reset_tick(); + + void wait_for_omap_osds(); + + private: + bool ms_dispatch2(const Message::ref &m) override; + bool ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer) override; + int ms_handle_authentication(Connection *con) override; + KeyStore *ms_get_auth1_authorizer_keystore() override; + void ms_handle_accept(Connection *con) override; + void ms_handle_connect(Connection *con) override; + bool ms_handle_reset(Connection *con) override; + void ms_handle_remote_reset(Connection *con) override; + bool ms_handle_refused(Connection *con) override; + + protected: + // admin socket handling + friend class MDSSocketHook; + class MDSSocketHook *asok_hook; + void set_up_admin_socket(); + void clean_up_admin_socket(); + void check_ops_in_flight(); // send off any slow ops to monitor + bool asok_command(std::string_view command, const cmdmap_t& cmdmap, + std::string_view format, ostream& ss); + + void dump_status(Formatter *f); + + /** + * Terminate this daemon process. + * + * This function will return, but once it does so the calling thread + * must do no more work as all subsystems will have been shut down. 
+ */ + void suicide(); + + /** + * Start a new daemon process with the same command line parameters that + * this process was run with, then terminate this process + */ + void respawn(); + + void tick(); + +protected: + bool handle_core_message(const Message::const_ref &m); + + // special message types + friend class C_MDS_Send_Command_Reply; + static void send_command_reply(const MCommand::const_ref &m, MDSRank* mds_rank, int r, + bufferlist outbl, std::string_view outs); + int _handle_command( + const cmdmap_t &cmdmap, + const MCommand::const_ref &m, + bufferlist *outbl, + std::string *outs, + Context **run_later, + bool *need_reply); + void handle_command(const MCommand::const_ref &m); + void handle_mds_map(const MMDSMap::const_ref &m); + +private: + struct MDSCommand { + MDSCommand(std::string_view signature, std::string_view help) + : cmdstring(signature), helpstring(help) + {} + + std::string cmdstring; + std::string helpstring; + std::string module = "mds"; + }; + + static const std::vector<MDSCommand>& get_commands(); + + bool parse_caps(const AuthCapsInfo&, MDSAuthCaps&); + + mono_time starttime = mono_clock::zero(); +}; + +#endif diff --git a/src/mds/MDSMap.cc b/src/mds/MDSMap.cc new file mode 100644 index 00000000..27753c6e --- /dev/null +++ b/src/mds/MDSMap.cc @@ -0,0 +1,930 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#include "common/debug.h" +#include "mon/health_check.h" + +#include "MDSMap.h" + +#include <sstream> +using std::stringstream; + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_ + +// features +CompatSet MDSMap::get_compat_set_all() { + CompatSet::FeatureSet feature_compat; + CompatSet::FeatureSet feature_ro_compat; + CompatSet::FeatureSet feature_incompat; + feature_incompat.insert(MDS_FEATURE_INCOMPAT_BASE); + feature_incompat.insert(MDS_FEATURE_INCOMPAT_CLIENTRANGES); + feature_incompat.insert(MDS_FEATURE_INCOMPAT_FILELAYOUT); + feature_incompat.insert(MDS_FEATURE_INCOMPAT_DIRINODE); + feature_incompat.insert(MDS_FEATURE_INCOMPAT_ENCODING); + feature_incompat.insert(MDS_FEATURE_INCOMPAT_OMAPDIRFRAG); + feature_incompat.insert(MDS_FEATURE_INCOMPAT_INLINE); + feature_incompat.insert(MDS_FEATURE_INCOMPAT_NOANCHOR); + feature_incompat.insert(MDS_FEATURE_INCOMPAT_FILE_LAYOUT_V2); + feature_incompat.insert(MDS_FEATURE_INCOMPAT_SNAPREALM_V2); + + return CompatSet(feature_compat, feature_ro_compat, feature_incompat); +} + +CompatSet MDSMap::get_compat_set_default() { + CompatSet::FeatureSet feature_compat; + CompatSet::FeatureSet feature_ro_compat; + CompatSet::FeatureSet feature_incompat; + feature_incompat.insert(MDS_FEATURE_INCOMPAT_BASE); + feature_incompat.insert(MDS_FEATURE_INCOMPAT_CLIENTRANGES); + feature_incompat.insert(MDS_FEATURE_INCOMPAT_FILELAYOUT); + feature_incompat.insert(MDS_FEATURE_INCOMPAT_DIRINODE); + feature_incompat.insert(MDS_FEATURE_INCOMPAT_ENCODING); + feature_incompat.insert(MDS_FEATURE_INCOMPAT_OMAPDIRFRAG); + feature_incompat.insert(MDS_FEATURE_INCOMPAT_NOANCHOR); + feature_incompat.insert(MDS_FEATURE_INCOMPAT_FILE_LAYOUT_V2); + feature_incompat.insert(MDS_FEATURE_INCOMPAT_SNAPREALM_V2); + + return CompatSet(feature_compat, feature_ro_compat, feature_incompat); +} + +// base (pre v0.20) +CompatSet MDSMap::get_compat_set_base() { + CompatSet::FeatureSet feature_compat_base; + CompatSet::FeatureSet 
feature_incompat_base; + feature_incompat_base.insert(MDS_FEATURE_INCOMPAT_BASE); + CompatSet::FeatureSet feature_ro_compat_base; + + return CompatSet(feature_compat_base, feature_ro_compat_base, feature_incompat_base); +} + +void MDSMap::mds_info_t::dump(Formatter *f) const +{ + f->dump_unsigned("gid", global_id); + f->dump_string("name", name); + f->dump_int("rank", rank); + f->dump_int("incarnation", inc); + f->dump_stream("state") << ceph_mds_state_name(state); + f->dump_int("state_seq", state_seq); + f->dump_stream("addr") << addrs.get_legacy_str(); + f->dump_object("addrs", addrs); + if (laggy_since != utime_t()) + f->dump_stream("laggy_since") << laggy_since; + + f->open_array_section("export_targets"); + for (set<mds_rank_t>::iterator p = export_targets.begin(); + p != export_targets.end(); ++p) { + f->dump_int("mds", *p); + } + f->close_section(); + f->dump_unsigned("features", mds_features); + f->dump_unsigned("flags", flags); +} + +void MDSMap::mds_info_t::dump(std::ostream& o) const +{ + o << "[mds." 
<< name << "{" << rank << ":" << global_id << "}" + << " state " << ceph_mds_state_name(state) + << " seq " << state_seq; + if (laggy()) { + o << " laggy since " << laggy_since; + } + if (!export_targets.empty()) { + o << " export targets " << export_targets; + } + if (is_frozen()) { + o << " frozen"; + } + o << " addr " << addrs << "]"; +} + +void MDSMap::mds_info_t::generate_test_instances(list<mds_info_t*>& ls) +{ + mds_info_t *sample = new mds_info_t(); + ls.push_back(sample); + sample = new mds_info_t(); + sample->global_id = 1; + sample->name = "test_instance"; + sample->rank = 0; + ls.push_back(sample); +} + +void MDSMap::dump(Formatter *f) const +{ + f->dump_int("epoch", epoch); + f->dump_unsigned("flags", flags); + f->dump_unsigned("ever_allowed_features", ever_allowed_features); + f->dump_unsigned("explicitly_allowed_features", explicitly_allowed_features); + f->dump_stream("created") << created; + f->dump_stream("modified") << modified; + f->dump_int("tableserver", tableserver); + f->dump_int("root", root); + f->dump_int("session_timeout", session_timeout); + f->dump_int("session_autoclose", session_autoclose); + f->dump_stream("min_compat_client") << (int)min_compat_client << " (" + << ceph_release_name(min_compat_client) << ")"; + f->dump_int("max_file_size", max_file_size); + f->dump_int("last_failure", last_failure); + f->dump_int("last_failure_osd_epoch", last_failure_osd_epoch); + f->open_object_section("compat"); + compat.dump(f); + f->close_section(); + f->dump_int("max_mds", max_mds); + f->open_array_section("in"); + for (set<mds_rank_t>::const_iterator p = in.begin(); p != in.end(); ++p) + f->dump_int("mds", *p); + f->close_section(); + f->open_object_section("up"); + for (map<mds_rank_t,mds_gid_t>::const_iterator p = up.begin(); p != up.end(); ++p) { + char s[14]; + sprintf(s, "mds_%d", int(p->first)); + f->dump_int(s, p->second); + } + f->close_section(); + f->open_array_section("failed"); + for (set<mds_rank_t>::const_iterator p = 
failed.begin(); p != failed.end(); ++p) + f->dump_int("mds", *p); + f->close_section(); + f->open_array_section("damaged"); + for (set<mds_rank_t>::const_iterator p = damaged.begin(); p != damaged.end(); ++p) + f->dump_int("mds", *p); + f->close_section(); + f->open_array_section("stopped"); + for (set<mds_rank_t>::const_iterator p = stopped.begin(); p != stopped.end(); ++p) + f->dump_int("mds", *p); + f->close_section(); + f->open_object_section("info"); + for (const auto& [gid, info] : mds_info) { + char s[25]; // 'gid_' + len(str(ULLONG_MAX)) + '\0' + sprintf(s, "gid_%llu", (long long unsigned)gid); + f->open_object_section(s); + info.dump(f); + f->close_section(); + } + f->close_section(); + f->open_array_section("data_pools"); + for (const auto& p: data_pools) + f->dump_int("pool", p); + f->close_section(); + f->dump_int("metadata_pool", metadata_pool); + f->dump_bool("enabled", enabled); + f->dump_string("fs_name", fs_name); + f->dump_string("balancer", balancer); + f->dump_int("standby_count_wanted", std::max(0, standby_count_wanted)); +} + +void MDSMap::generate_test_instances(list<MDSMap*>& ls) +{ + MDSMap *m = new MDSMap(); + m->max_mds = 1; + m->data_pools.push_back(0); + m->metadata_pool = 1; + m->cas_pool = 2; + m->compat = get_compat_set_all(); + + // these aren't the defaults, just in case anybody gets confused + m->session_timeout = 61; + m->session_autoclose = 301; + m->max_file_size = 1<<24; + ls.push_back(m); +} + +void MDSMap::print(ostream& out) const +{ + out << "fs_name\t" << fs_name << "\n"; + out << "epoch\t" << epoch << "\n"; + out << "flags\t" << hex << flags << dec << "\n"; + out << "created\t" << created << "\n"; + out << "modified\t" << modified << "\n"; + out << "tableserver\t" << tableserver << "\n"; + out << "root\t" << root << "\n"; + out << "session_timeout\t" << session_timeout << "\n" + << "session_autoclose\t" << session_autoclose << "\n"; + out << "max_file_size\t" << max_file_size << "\n"; + out << "min_compat_client\t" << 
(int)min_compat_client << " (" + << ceph_release_name(min_compat_client) << ")\n"; + out << "last_failure\t" << last_failure << "\n" + << "last_failure_osd_epoch\t" << last_failure_osd_epoch << "\n"; + out << "compat\t" << compat << "\n"; + out << "max_mds\t" << max_mds << "\n"; + out << "in\t" << in << "\n" + << "up\t" << up << "\n" + << "failed\t" << failed << "\n" + << "damaged\t" << damaged << "\n" + << "stopped\t" << stopped << "\n"; + out << "data_pools\t" << data_pools << "\n"; + out << "metadata_pool\t" << metadata_pool << "\n"; + out << "inline_data\t" << (inline_data_enabled ? "enabled" : "disabled") << "\n"; + out << "balancer\t" << balancer << "\n"; + out << "standby_count_wanted\t" << std::max(0, standby_count_wanted) << "\n"; + + multimap< pair<mds_rank_t, unsigned>, mds_gid_t > foo; + for (const auto &p : mds_info) { + foo.insert(std::make_pair( + std::make_pair(p.second.rank, p.second.inc-1), p.first)); + } + + for (const auto &p : foo) { + out << mds_info.at(p.second) << "\n"; + } +} + +void MDSMap::print_summary(Formatter *f, ostream *out) const +{ + map<mds_rank_t,string> by_rank; + map<string,int> by_state; + + if (f) { + f->dump_unsigned("epoch", get_epoch()); + f->dump_unsigned("up", up.size()); + f->dump_unsigned("in", in.size()); + f->dump_unsigned("max", max_mds); + } else { + *out << "e" << get_epoch() << ": " << up.size() << "/" << in.size() << "/" << max_mds << " up"; + } + + if (f) + f->open_array_section("by_rank"); + for (const auto &p : mds_info) { + string s = ceph_mds_state_name(p.second.state); + if (p.second.laggy()) + s += "(laggy or crashed)"; + + if (p.second.rank >= 0 && p.second.state != MDSMap::STATE_STANDBY_REPLAY) { + if (f) { + f->open_object_section("mds"); + f->dump_unsigned("rank", p.second.rank); + f->dump_string("name", p.second.name); + f->dump_string("status", s); + f->close_section(); + } else { + by_rank[p.second.rank] = p.second.name + "=" + s; + } + } else { + by_state[s]++; + } + } + if (f) { + 
f->close_section(); + } else { + if (!by_rank.empty()) + *out << " " << by_rank; + } + + for (map<string,int>::reverse_iterator p = by_state.rbegin(); p != by_state.rend(); ++p) { + if (f) { + f->dump_unsigned(p->first.c_str(), p->second); + } else { + *out << ", " << p->second << " " << p->first; + } + } + + if (!failed.empty()) { + if (f) { + f->dump_unsigned("failed", failed.size()); + } else { + *out << ", " << failed.size() << " failed"; + } + } + + if (!damaged.empty()) { + if (f) { + f->dump_unsigned("damaged", damaged.size()); + } else { + *out << ", " << damaged.size() << " damaged"; + } + } + //if (stopped.size()) + //out << ", " << stopped.size() << " stopped"; +} + +void MDSMap::get_health(list<pair<health_status_t,string> >& summary, + list<pair<health_status_t,string> > *detail) const +{ + if (!failed.empty()) { + std::ostringstream oss; + oss << "mds rank" + << ((failed.size() > 1) ? "s ":" ") + << failed + << ((failed.size() > 1) ? " have":" has") + << " failed"; + summary.push_back(make_pair(HEALTH_ERR, oss.str())); + if (detail) { + for (set<mds_rank_t>::const_iterator p = failed.begin(); p != failed.end(); ++p) { + std::ostringstream oss; + oss << "mds." << *p << " has failed"; + detail->push_back(make_pair(HEALTH_ERR, oss.str())); + } + } + } + + if (!damaged.empty()) { + std::ostringstream oss; + oss << "mds rank" + << ((damaged.size() > 1) ? "s ":" ") + << damaged + << ((damaged.size() > 1) ? " are":" is") + << " damaged"; + summary.push_back(make_pair(HEALTH_ERR, oss.str())); + if (detail) { + for (set<mds_rank_t>::const_iterator p = damaged.begin(); p != damaged.end(); ++p) { + std::ostringstream oss; + oss << "mds." 
<< *p << " is damaged"; + detail->push_back(make_pair(HEALTH_ERR, oss.str())); + } + } + } + + if (is_degraded()) { + summary.push_back(make_pair(HEALTH_WARN, "mds cluster is degraded")); + if (detail) { + detail->push_back(make_pair(HEALTH_WARN, "mds cluster is degraded")); + for (mds_rank_t i = mds_rank_t(0); i< get_max_mds(); i++) { + if (!is_up(i)) + continue; + mds_gid_t gid = up.find(i)->second; + const auto& info = mds_info.at(gid); + stringstream ss; + if (is_resolve(i)) + ss << "mds." << info.name << " at " << info.addrs + << " rank " << i << " is resolving"; + if (is_replay(i)) + ss << "mds." << info.name << " at " << info.addrs + << " rank " << i << " is replaying journal"; + if (is_rejoin(i)) + ss << "mds." << info.name << " at " << info.addrs + << " rank " << i << " is rejoining"; + if (is_reconnect(i)) + ss << "mds." << info.name << " at " << info.addrs + << " rank " << i << " is reconnecting to clients"; + if (ss.str().length()) + detail->push_back(make_pair(HEALTH_WARN, ss.str())); + } + } + } + + { + stringstream ss; + ss << fs_name << " max_mds " << max_mds; + summary.push_back(make_pair(HEALTH_WARN, ss.str())); + } + + if ((mds_rank_t)up.size() < max_mds) { + stringstream ss; + ss << fs_name << " has " << up.size() + << " active MDS(s), but has max_mds of " << max_mds; + summary.push_back(make_pair(HEALTH_WARN, ss.str())); + } + + set<string> laggy; + for (const auto &u : up) { + const auto& info = mds_info.at(u.second); + if (info.laggy()) { + laggy.insert(info.name); + if (detail) { + std::ostringstream oss; + oss << "mds." << info.name << " at " << info.addrs + << " is laggy/unresponsive"; + detail->push_back(make_pair(HEALTH_WARN, oss.str())); + } + } + } + + if (!laggy.empty()) { + std::ostringstream oss; + oss << "mds " << laggy + << ((laggy.size() > 1) ? 
" are":" is") + << " laggy"; + summary.push_back(make_pair(HEALTH_WARN, oss.str())); + } + + if (get_max_mds() > 1 && + was_snaps_ever_allowed() && !allows_multimds_snaps()) { + std::ostringstream oss; + oss << "multi-active mds while there are snapshots possibly created by pre-mimic MDS"; + summary.push_back(make_pair(HEALTH_WARN, oss.str())); + } +} + +void MDSMap::get_health_checks(health_check_map_t *checks) const +{ + // MDS_DAMAGE + if (!damaged.empty()) { + health_check_t& check = checks->get_or_add("MDS_DAMAGE", HEALTH_ERR, + "%num% mds daemon%plurals% damaged"); + for (auto p : damaged) { + std::ostringstream oss; + oss << "fs " << fs_name << " mds." << p << " is damaged"; + check.detail.push_back(oss.str()); + } + } + + // FS_DEGRADED + if (is_degraded()) { + health_check_t& fscheck = checks->get_or_add( + "FS_DEGRADED", HEALTH_WARN, + "%num% filesystem%plurals% %isorare% degraded"); + ostringstream ss; + ss << "fs " << fs_name << " is degraded"; + fscheck.detail.push_back(ss.str()); + + list<string> detail; + for (mds_rank_t i = mds_rank_t(0); i< get_max_mds(); i++) { + if (!is_up(i)) + continue; + mds_gid_t gid = up.find(i)->second; + const auto& info = mds_info.at(gid); + stringstream ss; + ss << "fs " << fs_name << " mds." 
<< info.name << " at " + << info.addrs << " rank " << i; + if (is_resolve(i)) + ss << " is resolving"; + if (is_replay(i)) + ss << " is replaying journal"; + if (is_rejoin(i)) + ss << " is rejoining"; + if (is_reconnect(i)) + ss << " is reconnecting to clients"; + if (ss.str().length()) + detail.push_back(ss.str()); + } + } + + // MDS_UP_LESS_THAN_MAX + if ((mds_rank_t)get_num_in_mds() < get_max_mds()) { + health_check_t& check = checks->add( + "MDS_UP_LESS_THAN_MAX", HEALTH_WARN, + "%num% filesystem%plurals% %isorare% online with fewer MDS than max_mds"); + stringstream ss; + ss << "fs " << fs_name << " has " << get_num_in_mds() + << " MDS online, but wants " << get_max_mds(); + check.detail.push_back(ss.str()); + } + + // MDS_ALL_DOWN + if ((mds_rank_t)get_num_up_mds() == 0 && get_max_mds() > 0) { + health_check_t &check = checks->add( + "MDS_ALL_DOWN", HEALTH_ERR, + "%num% filesystem%plurals% %isorare% offline"); + stringstream ss; + ss << "fs " << fs_name << " is offline because no MDS is active for it."; + check.detail.push_back(ss.str()); + } + + if (get_max_mds() > 1 && + was_snaps_ever_allowed() && !allows_multimds_snaps()) { + health_check_t &check = checks->add( + "MULTIMDS_WITH_OLDSNAPS", HEALTH_ERR, + "%num% filesystem%plurals% %isorare% multi-active mds with old snapshots"); + stringstream ss; + ss << "multi-active mds while there are snapshots possibly created by pre-mimic MDS"; + check.detail.push_back(ss.str()); + } +} + +void MDSMap::mds_info_t::encode_versioned(bufferlist& bl, uint64_t features) const +{ + __u8 v = 9; + if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) { + v = 7; + } + ENCODE_START(v, 4, bl); + encode(global_id, bl); + encode(name, bl); + encode(rank, bl); + encode(inc, bl); + encode((int32_t)state, bl); + encode(state_seq, bl); + if (v < 8) { + encode(addrs.legacy_addr(), bl, features); + } else { + encode(addrs, bl, features); + } + encode(laggy_since, bl); + encode(MDS_RANK_NONE, bl); /* standby_for_rank */ + encode(std::string(), 
bl); /* standby_for_name */ + encode(export_targets, bl); + encode(mds_features, bl); + encode(FS_CLUSTER_ID_NONE, bl); /* standby_for_fscid */ + encode(false, bl); + if (v >= 9) { + encode(flags, bl); + } + ENCODE_FINISH(bl); +} + +void MDSMap::mds_info_t::encode_unversioned(bufferlist& bl) const +{ + __u8 struct_v = 3; + using ceph::encode; + encode(struct_v, bl); + encode(global_id, bl); + encode(name, bl); + encode(rank, bl); + encode(inc, bl); + encode((int32_t)state, bl); + encode(state_seq, bl); + encode(addrs.legacy_addr(), bl, 0); + encode(laggy_since, bl); + encode(MDS_RANK_NONE, bl); + encode(std::string(), bl); + encode(export_targets, bl); +} + +void MDSMap::mds_info_t::decode(bufferlist::const_iterator& bl) +{ + DECODE_START_LEGACY_COMPAT_LEN(9, 4, 4, bl); + decode(global_id, bl); + decode(name, bl); + decode(rank, bl); + decode(inc, bl); + int32_t raw_state; + decode(raw_state, bl); + state = (MDSMap::DaemonState)raw_state; + decode(state_seq, bl); + decode(addrs, bl); + decode(laggy_since, bl); + { + mds_rank_t standby_for_rank; + decode(standby_for_rank, bl); + } + { + std::string standby_for_name; + decode(standby_for_name, bl); + } + if (struct_v >= 2) + decode(export_targets, bl); + if (struct_v >= 5) + decode(mds_features, bl); + if (struct_v >= 6) { + fs_cluster_id_t standby_for_fscid; + decode(standby_for_fscid, bl); + } + if (struct_v >= 7) { + bool standby_replay; + decode(standby_replay, bl); + } + if (struct_v >= 9) { + decode(flags, bl); + } + DECODE_FINISH(bl); +} + +std::string MDSMap::mds_info_t::human_name() const +{ + // Like "daemon mds.myhost restarted", "Activating daemon mds.myhost" + std::ostringstream out; + out << "daemon mds." 
<< name; + return out.str(); +} + +void MDSMap::encode(bufferlist& bl, uint64_t features) const +{ + std::map<mds_rank_t,int32_t> inc; // Legacy field, fake it so that + // old-mon peers have something sane + // during upgrade + for (const auto rank : in) { + inc.insert(std::make_pair(rank, epoch)); + } + + using ceph::encode; + if ((features & CEPH_FEATURE_PGID64) == 0) { + __u16 v = 2; + encode(v, bl); + encode(epoch, bl); + encode(flags, bl); + encode(last_failure, bl); + encode(root, bl); + encode(session_timeout, bl); + encode(session_autoclose, bl); + encode(max_file_size, bl); + encode(max_mds, bl); + __u32 n = mds_info.size(); + encode(n, bl); + for (map<mds_gid_t, mds_info_t>::const_iterator i = mds_info.begin(); + i != mds_info.end(); ++i) { + encode(i->first, bl); + encode(i->second, bl, features); + } + n = data_pools.size(); + encode(n, bl); + for (const auto p: data_pools) { + n = p; + encode(n, bl); + } + + int32_t m = cas_pool; + encode(m, bl); + return; + } else if ((features & CEPH_FEATURE_MDSENC) == 0) { + __u16 v = 3; + encode(v, bl); + encode(epoch, bl); + encode(flags, bl); + encode(last_failure, bl); + encode(root, bl); + encode(session_timeout, bl); + encode(session_autoclose, bl); + encode(max_file_size, bl); + encode(max_mds, bl); + __u32 n = mds_info.size(); + encode(n, bl); + for (map<mds_gid_t, mds_info_t>::const_iterator i = mds_info.begin(); + i != mds_info.end(); ++i) { + encode(i->first, bl); + encode(i->second, bl, features); + } + encode(data_pools, bl); + encode(cas_pool, bl); + + // kclient ignores everything from here + __u16 ev = 5; + encode(ev, bl); + encode(compat, bl); + encode(metadata_pool, bl); + encode(created, bl); + encode(modified, bl); + encode(tableserver, bl); + encode(in, bl); + encode(inc, bl); + encode(up, bl); + encode(failed, bl); + encode(stopped, bl); + encode(last_failure_osd_epoch, bl); + return; + } + + ENCODE_START(5, 4, bl); + encode(epoch, bl); + encode(flags, bl); + encode(last_failure, bl); + 
encode(root, bl); + encode(session_timeout, bl); + encode(session_autoclose, bl); + encode(max_file_size, bl); + encode(max_mds, bl); + encode(mds_info, bl, features); + encode(data_pools, bl); + encode(cas_pool, bl); + + // kclient ignores everything from here + __u16 ev = 14; + encode(ev, bl); + encode(compat, bl); + encode(metadata_pool, bl); + encode(created, bl); + encode(modified, bl); + encode(tableserver, bl); + encode(in, bl); + encode(inc, bl); + encode(up, bl); + encode(failed, bl); + encode(stopped, bl); + encode(last_failure_osd_epoch, bl); + encode(ever_allowed_features, bl); + encode(explicitly_allowed_features, bl); + encode(inline_data_enabled, bl); + encode(enabled, bl); + encode(fs_name, bl); + encode(damaged, bl); + encode(balancer, bl); + encode(standby_count_wanted, bl); + encode(old_max_mds, bl); + encode(min_compat_client, bl); + ENCODE_FINISH(bl); +} + +void MDSMap::sanitize(const std::function<bool(int64_t pool)>& pool_exists) +{ + /* Before we did stricter checking, it was possible to remove a data pool + * without also deleting it from the MDSMap. Check for that here after + * decoding the data pools. 
+ */ + + for (auto it = data_pools.begin(); it != data_pools.end();) { + if (!pool_exists(*it)) { + dout(0) << "removed non-existant data pool " << *it << " from MDSMap" << dendl; + it = data_pools.erase(it); + } else { + it++; + } + } +} + +void MDSMap::decode(bufferlist::const_iterator& p) +{ + std::map<mds_rank_t,int32_t> inc; // Legacy field, parse and drop + + cached_up_features = 0; + DECODE_START_LEGACY_COMPAT_LEN_16(5, 4, 4, p); + decode(epoch, p); + decode(flags, p); + decode(last_failure, p); + decode(root, p); + decode(session_timeout, p); + decode(session_autoclose, p); + decode(max_file_size, p); + decode(max_mds, p); + decode(mds_info, p); + if (struct_v < 3) { + __u32 n; + decode(n, p); + while (n--) { + __u32 m; + decode(m, p); + data_pools.push_back(m); + } + __s32 s; + decode(s, p); + cas_pool = s; + } else { + decode(data_pools, p); + decode(cas_pool, p); + } + + // kclient ignores everything from here + __u16 ev = 1; + if (struct_v >= 2) + decode(ev, p); + if (ev >= 3) + decode(compat, p); + else + compat = get_compat_set_base(); + if (ev < 5) { + __u32 n; + decode(n, p); + metadata_pool = n; + } else { + decode(metadata_pool, p); + } + decode(created, p); + decode(modified, p); + decode(tableserver, p); + decode(in, p); + decode(inc, p); + decode(up, p); + decode(failed, p); + decode(stopped, p); + if (ev >= 4) + decode(last_failure_osd_epoch, p); + if (ev >= 6) { + if (ev < 10) { + // previously this was a bool about snaps, not a flag map + bool flag; + decode(flag, p); + ever_allowed_features = flag ? CEPH_MDSMAP_ALLOW_SNAPS : 0; + decode(flag, p); + explicitly_allowed_features = flag ? 
CEPH_MDSMAP_ALLOW_SNAPS : 0; + } else { + decode(ever_allowed_features, p); + decode(explicitly_allowed_features, p); + } + } else { + ever_allowed_features = 0; + explicitly_allowed_features = 0; + } + if (ev >= 7) + decode(inline_data_enabled, p); + + if (ev >= 8) { + ceph_assert(struct_v >= 5); + decode(enabled, p); + decode(fs_name, p); + } else { + if (epoch > 1) { + // If an MDS has ever been started, epoch will be greater than 1, + // assume filesystem is enabled. + enabled = true; + } else { + // Upgrading from a cluster that never used an MDS, switch off + // filesystem until it's explicitly enabled. + enabled = false; + } + } + + if (ev >= 9) { + decode(damaged, p); + } + + if (ev >= 11) { + decode(balancer, p); + } + + if (ev >= 12) { + decode(standby_count_wanted, p); + } + + if (ev >= 13) { + decode(old_max_mds, p); + } + + if (ev >= 14) { + decode(min_compat_client, p); + } + + DECODE_FINISH(p); +} + +MDSMap::availability_t MDSMap::is_cluster_available() const +{ + if (epoch == 0) { + // If I'm a client, this means I'm looking at an MDSMap instance + // that was never actually initialized from the mons. Client should + // wait. + return TRANSIENT_UNAVAILABLE; + } + + // If a rank is marked damage (unavailable until operator intervenes) + if (damaged.size()) { + return STUCK_UNAVAILABLE; + } + + // If no ranks are created (filesystem not initialized) + if (in.empty()) { + return STUCK_UNAVAILABLE; + } + + for (const auto rank : in) { + if (up.count(rank) && mds_info.at(up.at(rank)).laggy()) { + // This might only be transient, but because we can't see + // standbys, we have no way of knowing whether there is a + // standby available to replace the laggy guy. + return STUCK_UNAVAILABLE; + } + } + + if (get_num_mds(CEPH_MDS_STATE_ACTIVE) > 0) { + // Nobody looks stuck, so indicate to client they should go ahead + // and try mounting if anybody is active. This may include e.g. 
+ // one MDS failing over and another active: the client should + // proceed to start talking to the active one and let the + // transiently-unavailable guy catch up later. + return AVAILABLE; + } else { + // Nothing indicating we were stuck, but nobody active (yet) + //return TRANSIENT_UNAVAILABLE; + + // Because we don't have standbys in the MDSMap any more, we can't + // reliably indicate transient vs. stuck, so always say stuck so + // that the client doesn't block. + return STUCK_UNAVAILABLE; + } +} + +bool MDSMap::state_transition_valid(DaemonState prev, DaemonState next) +{ + bool state_valid = true; + if (next != prev) { + if (prev == MDSMap::STATE_REPLAY) { + if (next != MDSMap::STATE_RESOLVE && next != MDSMap::STATE_RECONNECT) { + state_valid = false; + } + } else if (prev == MDSMap::STATE_REJOIN) { + if (next != MDSMap::STATE_ACTIVE && + next != MDSMap::STATE_CLIENTREPLAY && + next != MDSMap::STATE_STOPPED) { + state_valid = false; + } + } else if (prev >= MDSMap::STATE_RESOLVE && prev < MDSMap::STATE_ACTIVE) { + // Once I have entered replay, the only allowable transitions are to + // the next next along in the sequence. + if (next != prev + 1) { + state_valid = false; + } + } + } + + return state_valid; +} + +bool MDSMap::check_health(mds_rank_t standby_daemon_count) +{ + std::set<mds_rank_t> standbys; + get_standby_replay_mds_set(standbys); + std::set<mds_rank_t> actives; + get_active_mds_set(actives); + mds_rank_t standbys_avail = (mds_rank_t)standbys.size()+standby_daemon_count; + + /* If there are standby daemons available/replaying and + * standby_count_wanted is unset (default), then we set it to 1. This will + * happen during health checks by the mons. Also, during initial creation + * of the FS we will have no actives so we don't want to change the default + * yet. 
+ */ + if (standby_count_wanted == -1 && actives.size() > 0 && standbys_avail > 0) { + set_standby_count_wanted(1); + return true; + } + return false; +} diff --git a/src/mds/MDSMap.h b/src/mds/MDSMap.h new file mode 100644 index 00000000..031319da --- /dev/null +++ b/src/mds/MDSMap.h @@ -0,0 +1,686 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#ifndef CEPH_MDSMAP_H +#define CEPH_MDSMAP_H + +#include <algorithm> +#include <map> +#include <set> +#include <string> +#include <string_view> + +#include <errno.h> + +#include "include/types.h" +#include "common/Clock.h" +#include "include/health.h" + +#include "common/config.h" + +#include "include/CompatSet.h" +#include "include/ceph_features.h" +#include "common/Formatter.h" +#include "mds/mdstypes.h" + +class CephContext; +class health_check_map_t; + +#define MDS_FEATURE_INCOMPAT_BASE CompatSet::Feature(1, "base v0.20") +#define MDS_FEATURE_INCOMPAT_CLIENTRANGES CompatSet::Feature(2, "client writeable ranges") +#define MDS_FEATURE_INCOMPAT_FILELAYOUT CompatSet::Feature(3, "default file layouts on dirs") +#define MDS_FEATURE_INCOMPAT_DIRINODE CompatSet::Feature(4, "dir inode in separate object") +#define MDS_FEATURE_INCOMPAT_ENCODING CompatSet::Feature(5, "mds uses versioned encoding") +#define MDS_FEATURE_INCOMPAT_OMAPDIRFRAG CompatSet::Feature(6, "dirfrag is stored in omap") +#define MDS_FEATURE_INCOMPAT_INLINE CompatSet::Feature(7, "mds uses inline data") +#define MDS_FEATURE_INCOMPAT_NOANCHOR CompatSet::Feature(8, "no anchor table") +#define MDS_FEATURE_INCOMPAT_FILE_LAYOUT_V2 CompatSet::Feature(9, "file layout v2") +#define 
MDS_FEATURE_INCOMPAT_SNAPREALM_V2 CompatSet::Feature(10, "snaprealm v2") + +#define MDS_FS_NAME_DEFAULT "cephfs" + +class MDSMap { +public: + /* These states are the union of the set of possible states of an MDS daemon, + * and the set of possible states of an MDS rank. See + * doc/cephfs/mds-states.rst for state descriptions, + * doc/cephfs/mds-state-diagram.svg for a visual state diagram, and + * doc/cephfs/mds-state-diagram.dot to update mds-state-diagram.svg. + */ + typedef enum { + // States of an MDS daemon not currently holding a rank + // ==================================================== + STATE_NULL = CEPH_MDS_STATE_NULL, // null value for fns returning this type. + STATE_BOOT = CEPH_MDS_STATE_BOOT, // up, boot announcement. destiny unknown. + STATE_STANDBY = CEPH_MDS_STATE_STANDBY, // up, idle. waiting for assignment by monitor. + STATE_STANDBY_REPLAY = CEPH_MDS_STATE_STANDBY_REPLAY, // up, replaying active node, ready to take over. + + // States of an MDS rank, and of any MDS daemon holding that rank + // ============================================================== + STATE_STOPPED = CEPH_MDS_STATE_STOPPED, // down, once existed, but no subtrees. empty log. may not be held by a daemon. + + STATE_CREATING = CEPH_MDS_STATE_CREATING, // up, creating MDS instance (new journal, idalloc..). + STATE_STARTING = CEPH_MDS_STATE_STARTING, // up, starting prior stopped MDS instance. + + STATE_REPLAY = CEPH_MDS_STATE_REPLAY, // up, starting prior failed instance. scanning journal. + STATE_RESOLVE = CEPH_MDS_STATE_RESOLVE, // up, disambiguating distributed operations (import, rename, etc.) 
+ STATE_RECONNECT = CEPH_MDS_STATE_RECONNECT, // up, reconnect to clients + STATE_REJOIN = CEPH_MDS_STATE_REJOIN, // up, replayed journal, rejoining distributed cache + STATE_CLIENTREPLAY = CEPH_MDS_STATE_CLIENTREPLAY, // up, active + STATE_ACTIVE = CEPH_MDS_STATE_ACTIVE, // up, active + STATE_STOPPING = CEPH_MDS_STATE_STOPPING, // up, exporting metadata (-> standby or out) + STATE_DNE = CEPH_MDS_STATE_DNE, // down, rank does not exist + + // State which a daemon may send to MDSMonitor in its beacon + // to indicate that offline repair is required. Daemon must stop + // immediately after indicating this state. + STATE_DAMAGED = CEPH_MDS_STATE_DAMAGED + + /* + * In addition to explicit states, an MDS rank implicitly in state: + * - STOPPED if it is not currently associated with an MDS daemon gid but it + * is in MDSMap::stopped + * - FAILED if it is not currently associated with an MDS daemon gid but it + * is in MDSMap::failed + * - DNE if it is not currently associated with an MDS daemon gid and it is + * missing from both MDSMap::failed and MDSMap::stopped + */ + } DaemonState; + + struct mds_info_t { + mds_gid_t global_id = MDS_GID_NONE; + std::string name; + mds_rank_t rank = MDS_RANK_NONE; + int32_t inc = 0; + MDSMap::DaemonState state = STATE_STANDBY; + version_t state_seq = 0; + entity_addrvec_t addrs; + utime_t laggy_since; + std::set<mds_rank_t> export_targets; + uint64_t mds_features = 0; + uint64_t flags = 0; + enum mds_flags : uint64_t { + FROZEN = 1 << 0, + }; + + mds_info_t() = default; + + bool laggy() const { return !(laggy_since == utime_t()); } + void clear_laggy() { laggy_since = utime_t(); } + + bool is_degraded() const { + return STATE_REPLAY <= state && state <= STATE_CLIENTREPLAY; + } + + void freeze() { flags |= mds_flags::FROZEN; } + void unfreeze() { flags &= ~mds_flags::FROZEN; } + bool is_frozen() const { return flags&mds_flags::FROZEN; } + + const entity_addrvec_t& get_addrs() const { + return addrs; + } + + void encode(bufferlist& bl, 
uint64_t features) const { + if ((features & CEPH_FEATURE_MDSENC) == 0 ) encode_unversioned(bl); + else encode_versioned(bl, features); + } + void decode(bufferlist::const_iterator& p); + void dump(Formatter *f) const; + void dump(std::ostream&) const; + + // The long form name for use in cluster log messages` + std::string human_name() const; + + static void generate_test_instances(list<mds_info_t*>& ls); + private: + void encode_versioned(bufferlist& bl, uint64_t features) const; + void encode_unversioned(bufferlist& bl) const; + }; + + static CompatSet get_compat_set_all(); + static CompatSet get_compat_set_default(); + static CompatSet get_compat_set_base(); // pre v0.20 + +protected: + // base map + epoch_t epoch = 0; + bool enabled = false; + std::string fs_name = MDS_FS_NAME_DEFAULT; + uint32_t flags = CEPH_MDSMAP_DEFAULTS; // flags + epoch_t last_failure = 0; // mds epoch of last failure + epoch_t last_failure_osd_epoch = 0; // osd epoch of last failure; any mds entering replay needs + // at least this osdmap to ensure the blacklist propagates. + utime_t created; + utime_t modified; + + mds_rank_t tableserver = 0; // which MDS has snaptable + mds_rank_t root = 0; // which MDS has root directory + + __u32 session_timeout = 60; + __u32 session_autoclose = 300; + uint64_t max_file_size = 1ULL<<40; /* 1TB */ + + int8_t min_compat_client = -1; + + std::vector<int64_t> data_pools; // file data pools available to clients (via an ioctl). first is the default. + int64_t cas_pool = -1; // where CAS objects go + int64_t metadata_pool = -1; // where fs metadata objects go + + /* + * in: the set of logical mds #'s that define the cluster. this is the set + * of mds's the metadata may be distributed over. + * up: map from logical mds #'s to the addrs filling those roles. + * failed: subset of @in that are failed. + * stopped: set of nodes that have been initialized, but are not active. + * + * @up + @failed = @in. @in * @stopped = {}. 
+ */ + + mds_rank_t max_mds = 1; /* The maximum number of active MDSes. Also, the maximum rank. */ + mds_rank_t old_max_mds = 0; /* Value to restore when MDS cluster is marked up */ + mds_rank_t standby_count_wanted = -1; + string balancer; /* The name/version of the mantle balancer (i.e. the rados obj name) */ + + std::set<mds_rank_t> in; // currently defined cluster + + // which ranks are failed, stopped, damaged (i.e. not held by a daemon) + std::set<mds_rank_t> failed, stopped, damaged; + std::map<mds_rank_t, mds_gid_t> up; // who is in those roles + std::map<mds_gid_t, mds_info_t> mds_info; + + uint8_t ever_allowed_features = 0; //< bitmap of features the cluster has allowed + uint8_t explicitly_allowed_features = 0; //< bitmap of features explicitly enabled + + bool inline_data_enabled = false; + + uint64_t cached_up_features = 0; + +public: + CompatSet compat; + + friend class MDSMonitor; + friend class Filesystem; + friend class FSMap; + +public: + bool get_inline_data_enabled() const { return inline_data_enabled; } + void set_inline_data_enabled(bool enabled) { inline_data_enabled = enabled; } + + utime_t get_session_timeout() const { + return utime_t(session_timeout,0); + } + void set_session_timeout(uint32_t t) { + session_timeout = t; + } + + utime_t get_session_autoclose() const { + return utime_t(session_autoclose, 0); + } + void set_session_autoclose(uint32_t t) { + session_autoclose = t; + } + + uint64_t get_max_filesize() const { return max_file_size; } + void set_max_filesize(uint64_t m) { max_file_size = m; } + + uint8_t get_min_compat_client() const { return min_compat_client; } + void set_min_compat_client(uint8_t version) { min_compat_client = version; } + + int get_flags() const { return flags; } + bool test_flag(int f) const { return flags & f; } + void set_flag(int f) { flags |= f; } + void clear_flag(int f) { flags &= ~f; } + + std::string_view get_fs_name() const {return fs_name;} + + void set_snaps_allowed() { + 
set_flag(CEPH_MDSMAP_ALLOW_SNAPS); + ever_allowed_features |= CEPH_MDSMAP_ALLOW_SNAPS; + explicitly_allowed_features |= CEPH_MDSMAP_ALLOW_SNAPS; + } + void clear_snaps_allowed() { clear_flag(CEPH_MDSMAP_ALLOW_SNAPS); } + bool allows_snaps() const { return test_flag(CEPH_MDSMAP_ALLOW_SNAPS); } + bool was_snaps_ever_allowed() const { return ever_allowed_features & CEPH_MDSMAP_ALLOW_SNAPS; } + + void set_standby_replay_allowed() { + set_flag(CEPH_MDSMAP_ALLOW_STANDBY_REPLAY); + ever_allowed_features |= CEPH_MDSMAP_ALLOW_STANDBY_REPLAY; + explicitly_allowed_features |= CEPH_MDSMAP_ALLOW_STANDBY_REPLAY; + } + void clear_standby_replay_allowed() { clear_flag(CEPH_MDSMAP_ALLOW_STANDBY_REPLAY); } + bool allows_standby_replay() const { return test_flag(CEPH_MDSMAP_ALLOW_STANDBY_REPLAY); } + bool was_standby_replay_ever_allowed() const { return ever_allowed_features & CEPH_MDSMAP_ALLOW_STANDBY_REPLAY; } + + void set_multimds_snaps_allowed() { + set_flag(CEPH_MDSMAP_ALLOW_MULTIMDS_SNAPS); + ever_allowed_features |= CEPH_MDSMAP_ALLOW_MULTIMDS_SNAPS; + explicitly_allowed_features |= CEPH_MDSMAP_ALLOW_MULTIMDS_SNAPS; + } + void clear_multimds_snaps_allowed() { clear_flag(CEPH_MDSMAP_ALLOW_MULTIMDS_SNAPS); } + bool allows_multimds_snaps() const { return test_flag(CEPH_MDSMAP_ALLOW_MULTIMDS_SNAPS); } + + epoch_t get_epoch() const { return epoch; } + void inc_epoch() { epoch++; } + + bool get_enabled() const { return enabled; } + + const utime_t& get_created() const { return created; } + void set_created(utime_t ct) { modified = created = ct; } + const utime_t& get_modified() const { return modified; } + void set_modified(utime_t mt) { modified = mt; } + + epoch_t get_last_failure() const { return last_failure; } + epoch_t get_last_failure_osd_epoch() const { return last_failure_osd_epoch; } + + mds_rank_t get_max_mds() const { return max_mds; } + void set_max_mds(mds_rank_t m) { max_mds = m; } + void set_old_max_mds() { old_max_mds = max_mds; } + mds_rank_t get_old_max_mds() const 
{ return old_max_mds; } + + mds_rank_t get_standby_count_wanted(mds_rank_t standby_daemon_count) const { + ceph_assert(standby_daemon_count >= 0); + std::set<mds_rank_t> s; + get_standby_replay_mds_set(s); + mds_rank_t standbys_avail = (mds_rank_t)s.size()+standby_daemon_count; + mds_rank_t wanted = std::max(0, standby_count_wanted); + return wanted > standbys_avail ? wanted - standbys_avail : 0; + } + void set_standby_count_wanted(mds_rank_t n) { standby_count_wanted = n; } + bool check_health(mds_rank_t standby_daemon_count); + + const std::string get_balancer() const { return balancer; } + void set_balancer(std::string val) { balancer.assign(val); } + + mds_rank_t get_tableserver() const { return tableserver; } + mds_rank_t get_root() const { return root; } + + const std::vector<int64_t> &get_data_pools() const { return data_pools; } + int64_t get_first_data_pool() const { return *data_pools.begin(); } + int64_t get_metadata_pool() const { return metadata_pool; } + bool is_data_pool(int64_t poolid) const { + auto p = std::find(data_pools.begin(), data_pools.end(), poolid); + if (p == data_pools.end()) + return false; + return true; + } + + bool pool_in_use(int64_t poolid) const { + return get_enabled() && (is_data_pool(poolid) || metadata_pool == poolid); + } + + const auto& get_mds_info() const { return mds_info; } + const auto& get_mds_info_gid(mds_gid_t gid) const { + return mds_info.at(gid); + } + const mds_info_t& get_mds_info(mds_rank_t m) const { + ceph_assert(up.count(m) && mds_info.count(up.at(m))); + return mds_info.at(up.at(m)); + } + mds_gid_t find_mds_gid_by_name(std::string_view s) const { + for (const auto& [gid, info] : mds_info) { + if (info.name == s) { + return gid; + } + } + return MDS_GID_NONE; + } + + // counts + unsigned get_num_in_mds() const { + return in.size(); + } + unsigned get_num_up_mds() const { + return up.size(); + } + mds_rank_t get_last_in_mds() const { + auto p = in.rbegin(); + return p == in.rend() ? 
MDS_RANK_NONE : *p; + } + int get_num_failed_mds() const { + return failed.size(); + } + unsigned get_num_mds(int state) const { + unsigned n = 0; + for (std::map<mds_gid_t,mds_info_t>::const_iterator p = mds_info.begin(); + p != mds_info.end(); + ++p) + if (p->second.state == state) ++n; + return n; + } + + // data pools + void add_data_pool(int64_t poolid) { + data_pools.push_back(poolid); + } + int remove_data_pool(int64_t poolid) { + std::vector<int64_t>::iterator p = std::find(data_pools.begin(), data_pools.end(), poolid); + if (p == data_pools.end()) + return -ENOENT; + data_pools.erase(p); + return 0; + } + + // sets + void get_mds_set(std::set<mds_rank_t>& s) const { + s = in; + } + void get_up_mds_set(std::set<mds_rank_t>& s) const { + for (std::map<mds_rank_t, mds_gid_t>::const_iterator p = up.begin(); + p != up.end(); + ++p) + s.insert(p->first); + } + void get_active_mds_set(std::set<mds_rank_t>& s) const { + get_mds_set(s, MDSMap::STATE_ACTIVE); + } + void get_standby_replay_mds_set(std::set<mds_rank_t>& s) const { + get_mds_set(s, MDSMap::STATE_STANDBY_REPLAY); + } + void get_failed_mds_set(std::set<mds_rank_t>& s) const { + s = failed; + } + + // features + uint64_t get_up_features() { + if (!cached_up_features) { + bool first = true; + for (std::map<mds_rank_t, mds_gid_t>::const_iterator p = up.begin(); + p != up.end(); + ++p) { + std::map<mds_gid_t, mds_info_t>::const_iterator q = + mds_info.find(p->second); + ceph_assert(q != mds_info.end()); + if (first) { + cached_up_features = q->second.mds_features; + first = false; + } else { + cached_up_features &= q->second.mds_features; + } + } + } + return cached_up_features; + } + + /** + * Get MDS ranks which are in but not up. 
+ */ + void get_down_mds_set(std::set<mds_rank_t> *s) const + { + ceph_assert(s != NULL); + s->insert(failed.begin(), failed.end()); + s->insert(damaged.begin(), damaged.end()); + } + + int get_failed() const { + if (!failed.empty()) return *failed.begin(); + return -1; + } + void get_stopped_mds_set(std::set<mds_rank_t>& s) const { + s = stopped; + } + void get_recovery_mds_set(std::set<mds_rank_t>& s) const { + s = failed; + for (const auto& p : damaged) + s.insert(p); + for (const auto& p : mds_info) + if (p.second.state >= STATE_REPLAY && p.second.state <= STATE_STOPPING) + s.insert(p.second.rank); + } + + void get_mds_set_lower_bound(std::set<mds_rank_t>& s, DaemonState first) const { + for (std::map<mds_gid_t, mds_info_t>::const_iterator p = mds_info.begin(); + p != mds_info.end(); + ++p) + if (p->second.state >= first && p->second.state <= STATE_STOPPING) + s.insert(p->second.rank); + } + void get_mds_set(std::set<mds_rank_t>& s, DaemonState state) const { + for (std::map<mds_gid_t, mds_info_t>::const_iterator p = mds_info.begin(); + p != mds_info.end(); + ++p) + if (p->second.state == state) + s.insert(p->second.rank); + } + + void get_health(list<pair<health_status_t,std::string> >& summary, + list<pair<health_status_t,std::string> > *detail) const; + + void get_health_checks(health_check_map_t *checks) const; + + typedef enum + { + AVAILABLE = 0, + TRANSIENT_UNAVAILABLE = 1, + STUCK_UNAVAILABLE = 2 + + } availability_t; + + /** + * Return indication of whether cluster is available. This is a + * heuristic for clients to see if they should bother waiting to talk to + * MDSs, or whether they should error out at startup/mount. + * + * A TRANSIENT_UNAVAILABLE result indicates that the cluster is in a + * transition state like replaying, or is potentially about the fail over. + * Clients should wait for an updated map before making a final decision + * about whether the filesystem is mountable. 
+ * + * A STUCK_UNAVAILABLE result indicates that we can't see a way that + * the cluster is about to recover on its own, so it'll probably require + * administrator intervention: clients should probably not bother trying + * to mount. + */ + availability_t is_cluster_available() const; + + /** + * Return whether this MDSMap is suitable for resizing based on the state + * of the ranks. + */ + bool is_resizeable() const { + return !is_degraded() && + get_num_mds(CEPH_MDS_STATE_CREATING) == 0 && + get_num_mds(CEPH_MDS_STATE_STARTING) == 0 && + get_num_mds(CEPH_MDS_STATE_STOPPING) == 0; + } + + // mds states + bool is_down(mds_rank_t m) const { return up.count(m) == 0; } + bool is_up(mds_rank_t m) const { return up.count(m); } + bool is_in(mds_rank_t m) const { return up.count(m) || failed.count(m); } + bool is_out(mds_rank_t m) const { return !is_in(m); } + + bool is_failed(mds_rank_t m) const { return failed.count(m); } + bool is_stopped(mds_rank_t m) const { return stopped.count(m); } + + bool is_dne(mds_rank_t m) const { return in.count(m) == 0; } + bool is_dne_gid(mds_gid_t gid) const { return mds_info.count(gid) == 0; } + + /** + * Get MDS daemon status by GID + */ + auto get_state_gid(mds_gid_t gid) const { + auto it = mds_info.find(gid); + if (it == mds_info.end()) + return STATE_NULL; + return it->second.state; + } + + /** + * Get MDS rank state if the rank is up, else STATE_NULL + */ + auto get_state(mds_rank_t m) const { + auto it = up.find(m); + if (it == up.end()) + return STATE_NULL; + return get_state_gid(it->second); + } + + const auto& get_info(mds_rank_t m) const { + return mds_info.at(up.at(m)); + } + const auto& get_info_gid(mds_gid_t gid) const { + return mds_info.at(gid); + } + + bool is_boot(mds_rank_t m) const { return get_state(m) == STATE_BOOT; } + bool is_creating(mds_rank_t m) const { return get_state(m) == STATE_CREATING; } + bool is_starting(mds_rank_t m) const { return get_state(m) == STATE_STARTING; } + bool is_replay(mds_rank_t m) 
const { return get_state(m) == STATE_REPLAY; } + bool is_resolve(mds_rank_t m) const { return get_state(m) == STATE_RESOLVE; } + bool is_reconnect(mds_rank_t m) const { return get_state(m) == STATE_RECONNECT; } + bool is_rejoin(mds_rank_t m) const { return get_state(m) == STATE_REJOIN; } + bool is_clientreplay(mds_rank_t m) const { return get_state(m) == STATE_CLIENTREPLAY; } + bool is_active(mds_rank_t m) const { return get_state(m) == STATE_ACTIVE; } + bool is_stopping(mds_rank_t m) const { return get_state(m) == STATE_STOPPING; } + bool is_active_or_stopping(mds_rank_t m) const { + return is_active(m) || is_stopping(m); + } + bool is_clientreplay_or_active_or_stopping(mds_rank_t m) const { + return is_clientreplay(m) || is_active(m) || is_stopping(m); + } + + mds_gid_t get_standby_replay(mds_rank_t r) const { + for (auto& [gid,info] : mds_info) { + if (info.rank == r && info.state == STATE_STANDBY_REPLAY) { + return gid; + } + } + return MDS_GID_NONE; + } + bool has_standby_replay(mds_rank_t r) const { + return get_standby_replay(r) != MDS_GID_NONE; + } + + bool is_followable(mds_rank_t r) const { + if (auto it1 = up.find(r); it1 != up.end()) { + if (auto it2 = mds_info.find(it1->second); it2 != mds_info.end()) { + auto& info = it2->second; + if (!info.is_degraded() && !has_standby_replay(r)) { + return true; + } + } + } + return false; + } + + bool is_laggy_gid(mds_gid_t gid) const { + auto it = mds_info.find(gid); + return it == mds_info.end() ? false : it->second.laggy(); + } + + // degraded = some recovery in process. fixes active membership and + // recovery_set. 
+ bool is_degraded() const { + if (!failed.empty() || !damaged.empty()) + return true; + for (const auto& p : mds_info) { + if (p.second.is_degraded()) + return true; + } + return false; + } + bool is_any_failed() const { + return failed.size(); + } + bool is_resolving() const { + return + get_num_mds(STATE_RESOLVE) > 0 && + get_num_mds(STATE_REPLAY) == 0 && + failed.empty() && damaged.empty(); + } + bool is_rejoining() const { + // nodes are rejoining cache state + return + get_num_mds(STATE_REJOIN) > 0 && + get_num_mds(STATE_REPLAY) == 0 && + get_num_mds(STATE_RECONNECT) == 0 && + get_num_mds(STATE_RESOLVE) == 0 && + failed.empty() && damaged.empty(); + } + bool is_stopped() const { + return up.empty(); + } + + /** + * Get whether a rank is 'up', i.e. has + * an MDS daemon's entity_inst_t associated + * with it. + */ + bool have_inst(mds_rank_t m) const { + return up.count(m); + } + + /** + * Get the MDS daemon entity_inst_t for a rank + * known to be up. + */ + entity_addrvec_t get_addrs(mds_rank_t m) const { + return mds_info.at(up.at(m)).get_addrs(); + } + + mds_rank_t get_rank_gid(mds_gid_t gid) const { + if (mds_info.count(gid)) { + return mds_info.at(gid).rank; + } else { + return MDS_RANK_NONE; + } + } + + /** + * Get MDS rank incarnation if the rank is up, else -1 + */ + mds_gid_t get_incarnation(mds_rank_t m) const { + std::map<mds_rank_t, mds_gid_t>::const_iterator u = up.find(m); + if (u == up.end()) + return MDS_GID_NONE; + return (mds_gid_t)get_inc_gid(u->second); + } + + int get_inc_gid(mds_gid_t gid) const { + auto mds_info_entry = mds_info.find(gid); + if (mds_info_entry != mds_info.end()) + return mds_info_entry->second.inc; + return -1; + } + void encode(bufferlist& bl, uint64_t features) const; + void decode(bufferlist::const_iterator& p); + void decode(const bufferlist& bl) { + auto p = bl.cbegin(); + decode(p); + } + void sanitize(const std::function<bool(int64_t pool)>& pool_exists); + + void print(ostream& out) const; + void 
print_summary(Formatter *f, ostream *out) const; + + void dump(Formatter *f) const; + static void generate_test_instances(list<MDSMap*>& ls); + + static bool state_transition_valid(DaemonState prev, DaemonState next); +}; +WRITE_CLASS_ENCODER_FEATURES(MDSMap::mds_info_t) +WRITE_CLASS_ENCODER_FEATURES(MDSMap) + +inline ostream& operator<<(ostream &out, const MDSMap &m) { + m.print_summary(NULL, &out); + return out; +} + +inline std::ostream& operator<<(std::ostream& o, const MDSMap::mds_info_t& info) { + info.dump(o); + return o; +} + +#endif diff --git a/src/mds/MDSRank.cc b/src/mds/MDSRank.cc new file mode 100644 index 00000000..83a9d127 --- /dev/null +++ b/src/mds/MDSRank.cc @@ -0,0 +1,3824 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 Red Hat + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include <string_view> + +#include "common/debug.h" +#include "common/errno.h" + +#include "messages/MClientRequestForward.h" +#include "messages/MMDSLoadTargets.h" +#include "messages/MMDSTableRequest.h" + +#include "mgr/MgrClient.h" + +#include "MDSDaemon.h" +#include "MDSMap.h" +#include "SnapClient.h" +#include "SnapServer.h" +#include "MDBalancer.h" +#include "Migrator.h" +#include "Locker.h" +#include "InoTable.h" +#include "mon/MonClient.h" +#include "common/HeartbeatMap.h" +#include "ScrubStack.h" + + +#include "MDSRank.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mds +#undef dout_prefix +#define dout_prefix *_dout << "mds." << whoami << '.' 
<< incarnation << ' ' + +class C_Flush_Journal : public MDSInternalContext { +public: + C_Flush_Journal(MDCache *mdcache, MDLog *mdlog, MDSRank *mds, + std::ostream *ss, Context *on_finish) + : MDSInternalContext(mds), + mdcache(mdcache), mdlog(mdlog), ss(ss), on_finish(on_finish), + whoami(mds->whoami), incarnation(mds->incarnation) { + } + + void send() { + assert(mds->mds_lock.is_locked()); + + dout(20) << __func__ << dendl; + + if (mdcache->is_readonly()) { + dout(5) << __func__ << ": read-only FS" << dendl; + complete(-EROFS); + return; + } + + if (!mds->is_active()) { + dout(5) << __func__ << ": MDS not active, no-op" << dendl; + complete(0); + return; + } + + flush_mdlog(); + } + +private: + + void flush_mdlog() { + dout(20) << __func__ << dendl; + + // I need to seal off the current segment, and then mark all + // previous segments for expiry + mdlog->start_new_segment(); + + Context *ctx = new FunctionContext([this](int r) { + handle_flush_mdlog(r); + }); + + // Flush initially so that all the segments older than our new one + // will be elegible for expiry + mdlog->flush(); + mdlog->wait_for_safe(new MDSInternalContextWrapper(mds, ctx)); + } + + void handle_flush_mdlog(int r) { + dout(20) << __func__ << ": r=" << r << dendl; + + if (r != 0) { + *ss << "Error " << r << " (" << cpp_strerror(r) << ") while flushing journal"; + complete(r); + return; + } + + clear_mdlog(); + } + + void clear_mdlog() { + dout(20) << __func__ << dendl; + + Context *ctx = new FunctionContext([this](int r) { + handle_clear_mdlog(r); + }); + + // Because we may not be the last wait_for_safe context on MDLog, + // and subsequent contexts might wake up in the middle of our + // later trim_all and interfere with expiry (by e.g. marking + // dirs/dentries dirty on previous log segments), we run a second + // wait_for_safe here. 
See #10368 + mdlog->wait_for_safe(new MDSInternalContextWrapper(mds, ctx)); + } + + void handle_clear_mdlog(int r) { + dout(20) << __func__ << ": r=" << r << dendl; + + if (r != 0) { + *ss << "Error " << r << " (" << cpp_strerror(r) << ") while flushing journal"; + complete(r); + return; + } + + trim_mdlog(); + } + + void trim_mdlog() { + // Put all the old log segments into expiring or expired state + dout(5) << __func__ << ": beginning segment expiry" << dendl; + + int ret = mdlog->trim_all(); + if (ret != 0) { + *ss << "Error " << ret << " (" << cpp_strerror(ret) << ") while trimming log"; + complete(ret); + return; + } + + expire_segments(); + } + + void expire_segments() { + dout(20) << __func__ << dendl; + + // Attach contexts to wait for all expiring segments to expire + MDSGatherBuilder expiry_gather(g_ceph_context); + + const auto &expiring_segments = mdlog->get_expiring_segments(); + for (auto p : expiring_segments) { + p->wait_for_expiry(expiry_gather.new_sub()); + } + dout(5) << __func__ << ": waiting for " << expiry_gather.num_subs_created() + << " segments to expire" << dendl; + + if (!expiry_gather.has_subs()) { + trim_segments(); + return; + } + + Context *ctx = new FunctionContext([this](int r) { + handle_expire_segments(r); + }); + expiry_gather.set_finisher(new MDSInternalContextWrapper(mds, ctx)); + expiry_gather.activate(); + } + + void handle_expire_segments(int r) { + dout(20) << __func__ << ": r=" << r << dendl; + + ceph_assert(r == 0); // MDLog is not allowed to raise errors via + // wait_for_expiry + trim_segments(); + } + + void trim_segments() { + dout(20) << __func__ << dendl; + + Context *ctx = new C_OnFinisher(new FunctionContext([this](int _) { + std::lock_guard locker(mds->mds_lock); + trim_expired_segments(); + }), mds->finisher); + ctx->complete(0); + } + + void trim_expired_segments() { + dout(5) << __func__ << ": expiry complete, expire_pos/trim_pos is now " + << std::hex << mdlog->get_journaler()->get_expire_pos() << "/" + << 
mdlog->get_journaler()->get_trimmed_pos() << dendl; + + // Now everyone I'm interested in is expired + mdlog->trim_expired_segments(); + + dout(5) << __func__ << ": trim complete, expire_pos/trim_pos is now " + << std::hex << mdlog->get_journaler()->get_expire_pos() << "/" + << mdlog->get_journaler()->get_trimmed_pos() << dendl; + + write_journal_head(); + } + + void write_journal_head() { + dout(20) << __func__ << dendl; + + Context *ctx = new FunctionContext([this](int r) { + std::lock_guard locker(mds->mds_lock); + handle_write_head(r); + }); + // Flush the journal header so that readers will start from after + // the flushed region + mdlog->get_journaler()->write_head(ctx); + } + + void handle_write_head(int r) { + if (r != 0) { + *ss << "Error " << r << " (" << cpp_strerror(r) << ") while writing header"; + } else { + dout(5) << __func__ << ": write_head complete, all done!" << dendl; + } + + complete(r); + } + + void finish(int r) override { + dout(20) << __func__ << ": r=" << r << dendl; + on_finish->complete(r); + } + + MDCache *mdcache; + MDLog *mdlog; + std::ostream *ss; + Context *on_finish; + + // so as to use dout + mds_rank_t whoami; + int incarnation; +}; + +class C_Drop_Cache : public MDSInternalContext { +public: + C_Drop_Cache(Server *server, MDCache *mdcache, MDLog *mdlog, + MDSRank *mds, uint64_t recall_timeout, + Formatter *f, Context *on_finish) + : MDSInternalContext(mds), + server(server), mdcache(mdcache), mdlog(mdlog), + recall_timeout(recall_timeout), recall_start(mono_clock::now()), + f(f), on_finish(on_finish), + whoami(mds->whoami), incarnation(mds->incarnation) { + } + + void send() { + // not really a hard requirement here, but lets ensure this in + // case we change the logic here. 
+ assert(mds->mds_lock.is_locked()); + + dout(20) << __func__ << dendl; + f->open_object_section("result"); + recall_client_state(); + } + +private: + // context which completes itself (with -ETIMEDOUT) after a specified + // timeout or when explicitly completed, whichever comes first. Note + // that the context does not detroy itself after completion -- it + // needs to be explicitly freed. + class C_ContextTimeout : public MDSInternalContext { + public: + C_ContextTimeout(MDSRank *mds, uint64_t timeout, Context *on_finish) + : MDSInternalContext(mds), + timeout(timeout), + lock("mds::context::timeout", false, true), + on_finish(on_finish) { + } + ~C_ContextTimeout() { + ceph_assert(timer_task == nullptr); + } + + void start_timer() { + if (!timeout) { + return; + } + + timer_task = new FunctionContext([this](int _) { + timer_task = nullptr; + complete(-ETIMEDOUT); + }); + mds->timer.add_event_after(timeout, timer_task); + } + + void finish(int r) override { + Context *ctx = nullptr; + { + std::lock_guard locker(lock); + std::swap(on_finish, ctx); + } + if (ctx != nullptr) { + ctx->complete(r); + } + } + void complete(int r) override { + if (timer_task != nullptr) { + mds->timer.cancel_event(timer_task); + } + + finish(r); + } + + uint64_t timeout; + Mutex lock; + Context *on_finish = nullptr; + Context *timer_task = nullptr; + }; + + auto do_trim() { + auto [throttled, count] = mdcache->trim(UINT64_MAX); + dout(10) << __func__ + << (throttled ? 
" (throttled)" : "") + << " trimmed " << count << " caps" << dendl; + dentries_trimmed += count; + return std::make_pair(throttled, count); + } + + void recall_client_state() { + dout(20) << __func__ << dendl; + auto now = mono_clock::now(); + auto duration = std::chrono::duration<double>(now-recall_start).count(); + + MDSGatherBuilder gather(g_ceph_context); + auto flags = Server::RecallFlags::STEADY|Server::RecallFlags::TRIM; + auto [throttled, count] = server->recall_client_state(&gather, flags); + dout(10) << __func__ + << (throttled ? " (throttled)" : "") + << " recalled " << count << " caps" << dendl; + + caps_recalled += count; + if ((throttled || count > 0) && (recall_timeout == 0 || duration < recall_timeout)) { + C_ContextTimeout *ctx = new C_ContextTimeout( + mds, 1, new FunctionContext([this](int r) { + recall_client_state(); + })); + ctx->start_timer(); + gather.set_finisher(new MDSInternalContextWrapper(mds, ctx)); + gather.activate(); + mdlog->flush(); /* use down-time to incrementally flush log */ + do_trim(); /* use down-time to incrementally trim cache */ + } else { + if (!gather.has_subs()) { + return handle_recall_client_state(0); + } else if (recall_timeout > 0 && duration > recall_timeout) { + gather.set_finisher(new C_MDSInternalNoop); + gather.activate(); + return handle_recall_client_state(-ETIMEDOUT); + } else { + uint64_t remaining = (recall_timeout == 0 ? 
0 : recall_timeout-duration); + C_ContextTimeout *ctx = new C_ContextTimeout( + mds, remaining, new FunctionContext([this](int r) { + handle_recall_client_state(r); + })); + + ctx->start_timer(); + gather.set_finisher(new MDSInternalContextWrapper(mds, ctx)); + gather.activate(); + } + } + } + + void handle_recall_client_state(int r) { + dout(20) << __func__ << ": r=" << r << dendl; + + // client recall section + f->open_object_section("client_recall"); + f->dump_int("return_code", r); + f->dump_string("message", cpp_strerror(r)); + f->dump_int("recalled", caps_recalled); + f->close_section(); + + // we can still continue after recall timeout + flush_journal(); + } + + void flush_journal() { + dout(20) << __func__ << dendl; + + Context *ctx = new FunctionContext([this](int r) { + handle_flush_journal(r); + }); + + C_Flush_Journal *flush_journal = new C_Flush_Journal(mdcache, mdlog, mds, &ss, ctx); + flush_journal->send(); + } + + void handle_flush_journal(int r) { + dout(20) << __func__ << ": r=" << r << dendl; + + if (r != 0) { + cmd_err(f, ss.str()); + complete(r); + return; + } + + // journal flush section + f->open_object_section("flush_journal"); + f->dump_int("return_code", r); + f->dump_string("message", ss.str()); + f->close_section(); + + trim_cache(); + } + + void trim_cache() { + dout(20) << __func__ << dendl; + + auto [throttled, count] = do_trim(); + if (throttled && count > 0) { + auto timer = new FunctionContext([this](int _) { + trim_cache(); + }); + mds->timer.add_event_after(1.0, timer); + } else { + cache_status(); + } + } + + void cache_status() { + dout(20) << __func__ << dendl; + + f->open_object_section("trim_cache"); + f->dump_int("trimmed", dentries_trimmed); + f->close_section(); + + // cache status section + mdcache->cache_status(f); + + complete(0); + } + + void finish(int r) override { + dout(20) << __func__ << ": r=" << r << dendl; + + auto d = std::chrono::duration<double>(mono_clock::now()-recall_start); + f->dump_float("duration", 
d.count()); + + f->close_section(); + on_finish->complete(r); + } + + Server *server; + MDCache *mdcache; + MDLog *mdlog; + uint64_t recall_timeout; + mono_time recall_start; + Formatter *f; + Context *on_finish; + + int retval = 0; + std::stringstream ss; + uint64_t caps_recalled = 0; + uint64_t dentries_trimmed = 0; + + // so as to use dout + mds_rank_t whoami; + int incarnation; + + void cmd_err(Formatter *f, std::string_view err) { + f->reset(); + f->open_object_section("result"); + f->dump_string("error", err); + f->close_section(); + } +}; + +MDSRank::MDSRank( + mds_rank_t whoami_, + Mutex &mds_lock_, + LogChannelRef &clog_, + SafeTimer &timer_, + Beacon &beacon_, + std::unique_ptr<MDSMap>& mdsmap_, + Messenger *msgr, + MonClient *monc_, + MgrClient *mgrc, + Context *respawn_hook_, + Context *suicide_hook_) + : + whoami(whoami_), incarnation(0), + mds_lock(mds_lock_), cct(msgr->cct), clog(clog_), timer(timer_), + mdsmap(mdsmap_), + objecter(new Objecter(g_ceph_context, msgr, monc_, nullptr, 0, 0)), + server(NULL), mdcache(NULL), locker(NULL), mdlog(NULL), + balancer(NULL), scrubstack(NULL), + damage_table(whoami_), + inotable(NULL), snapserver(NULL), snapclient(NULL), + sessionmap(this), logger(NULL), mlogger(NULL), + op_tracker(g_ceph_context, g_conf()->mds_enable_op_tracker, + g_conf()->osd_num_op_tracker_shard), + last_state(MDSMap::STATE_BOOT), + state(MDSMap::STATE_BOOT), + cluster_degraded(false), stopping(false), + purge_queue(g_ceph_context, whoami_, + mdsmap_->get_metadata_pool(), objecter, + new FunctionContext( + [this](int r){ + // Purge Queue operates inside mds_lock when we're calling into + // it, and outside when in background, so must handle both cases. 
+ if (mds_lock.is_locked_by_me()) { + handle_write_error(r); + } else { + std::lock_guard l(mds_lock); + handle_write_error(r); + } + } + ) + ), + progress_thread(this), dispatch_depth(0), + hb(NULL), last_tid(0), osd_epoch_barrier(0), beacon(beacon_), + mds_slow_req_count(0), + last_client_mdsmap_bcast(0), + messenger(msgr), monc(monc_), mgrc(mgrc), + respawn_hook(respawn_hook_), + suicide_hook(suicide_hook_), + standby_replaying(false), + starttime(mono_clock::now()) +{ + hb = g_ceph_context->get_heartbeat_map()->add_worker("MDSRank", pthread_self()); + + purge_queue.update_op_limit(*mdsmap); + + objecter->unset_honor_osdmap_full(); + + finisher = new Finisher(cct); + + mdcache = new MDCache(this, purge_queue); + mdlog = new MDLog(this); + balancer = new MDBalancer(this, messenger, monc); + + scrubstack = new ScrubStack(mdcache, clog, finisher); + + inotable = new InoTable(this); + snapserver = new SnapServer(this, monc); + snapclient = new SnapClient(this); + + server = new Server(this); + locker = new Locker(this, mdcache); + + op_tracker.set_complaint_and_threshold(cct->_conf->mds_op_complaint_time, + cct->_conf->mds_op_log_threshold); + op_tracker.set_history_size_and_duration(cct->_conf->mds_op_history_size, + cct->_conf->mds_op_history_duration); + + schedule_update_timer_task(); +} + +MDSRank::~MDSRank() +{ + if (hb) { + g_ceph_context->get_heartbeat_map()->remove_worker(hb); + } + + if (scrubstack) { delete scrubstack; scrubstack = NULL; } + if (mdcache) { delete mdcache; mdcache = NULL; } + if (mdlog) { delete mdlog; mdlog = NULL; } + if (balancer) { delete balancer; balancer = NULL; } + if (inotable) { delete inotable; inotable = NULL; } + if (snapserver) { delete snapserver; snapserver = NULL; } + if (snapclient) { delete snapclient; snapclient = NULL; } + + if (server) { delete server; server = 0; } + if (locker) { delete locker; locker = 0; } + + if (logger) { + g_ceph_context->get_perfcounters_collection()->remove(logger); + delete logger; + logger 
= 0; + } + if (mlogger) { + g_ceph_context->get_perfcounters_collection()->remove(mlogger); + delete mlogger; + mlogger = 0; + } + + delete finisher; + finisher = NULL; + + delete suicide_hook; + suicide_hook = NULL; + + delete respawn_hook; + respawn_hook = NULL; + + delete objecter; + objecter = nullptr; +} + +void MDSRankDispatcher::init() +{ + objecter->init(); + messenger->add_dispatcher_head(objecter); + + objecter->start(); + + update_log_config(); + create_logger(); + + // Expose the OSDMap (already populated during MDS::init) to anyone + // who is interested in it. + handle_osd_map(); + + progress_thread.create("mds_rank_progr"); + + purge_queue.init(); + + finisher->start(); +} + +void MDSRank::update_targets() +{ + // get MonMap's idea of my export_targets + const set<mds_rank_t>& map_targets = mdsmap->get_mds_info(get_nodeid()).export_targets; + + dout(20) << "updating export targets, currently " << map_targets.size() << " ranks are targets" << dendl; + + bool send = false; + set<mds_rank_t> new_map_targets; + + auto it = export_targets.begin(); + while (it != export_targets.end()) { + mds_rank_t rank = it->first; + auto &counter = it->second; + dout(20) << "export target mds." << rank << " is " << counter << dendl; + + double val = counter.get(); + if (val <= 0.01) { + dout(15) << "export target mds." << rank << " is no longer an export target" << dendl; + export_targets.erase(it++); + send = true; + continue; + } + if (!map_targets.count(rank)) { + dout(15) << "export target mds." 
<< rank << " not in map's export_targets" << dendl; + send = true; + } + new_map_targets.insert(rank); + it++; + } + if (new_map_targets.size() < map_targets.size()) { + dout(15) << "export target map holds stale targets, sending update" << dendl; + send = true; + } + + if (send) { + dout(15) << "updating export_targets, now " << new_map_targets.size() << " ranks are targets" << dendl; + auto m = MMDSLoadTargets::create(mds_gid_t(monc->get_global_id()), new_map_targets); + monc->send_mon_message(m.detach()); + } +} + +void MDSRank::hit_export_target(mds_rank_t rank, double amount) +{ + double rate = g_conf()->mds_bal_target_decay; + if (amount < 0.0) { + amount = 100.0/g_conf()->mds_bal_target_decay; /* a good default for "i am trying to keep this export_target active" */ + } + auto em = export_targets.emplace(std::piecewise_construct, std::forward_as_tuple(rank), std::forward_as_tuple(DecayRate(rate))); + auto &counter = em.first->second; + counter.hit(amount); + if (em.second) { + dout(15) << "hit export target (new) is " << counter << dendl; + } else { + dout(15) << "hit export target is " << counter << dendl; + } +} + +class C_MDS_MonCommand : public MDSInternalContext { + std::string cmd; +public: + std::string outs; + C_MDS_MonCommand(MDSRank *m, std::string_view c) + : MDSInternalContext(m), cmd(c) {} + void finish(int r) override { + mds->_mon_command_finish(r, cmd, outs); + } +}; + +void MDSRank::_mon_command_finish(int r, std::string_view cmd, std::string_view outs) +{ + if (r < 0) { + dout(0) << __func__ << ": mon command " << cmd << " failed with errno " << r + << " (" << outs << ")" << dendl; + } else { + dout(1) << __func__ << ": mon command " << cmd << " succeed" << dendl; + } +} + +void MDSRank::set_mdsmap_multimds_snaps_allowed() +{ + static bool already_sent = false; + if (already_sent) + return; + + stringstream ss; + ss << "{\"prefix\":\"fs set\", \"fs_name\":\"" << mdsmap->get_fs_name() << "\", "; + ss << "\"var\":\"allow_multimds_snaps\", 
\"val\":\"true\", "; + ss << "\"confirm\":\"--yes-i-am-really-a-mds\"}"; + std::vector<std::string> cmd = {ss.str()}; + + dout(0) << __func__ << ": sending mon command: " << cmd[0] << dendl; + + C_MDS_MonCommand *fin = new C_MDS_MonCommand(this, cmd[0]); + monc->start_mon_command(cmd, {}, nullptr, &fin->outs, new C_IO_Wrapper(this, fin)); + + already_sent = true; +} + +void MDSRank::mark_base_recursively_scrubbed(inodeno_t ino) +{ + if (mdsmap->get_tableserver() == whoami) + snapserver->mark_base_recursively_scrubbed(ino); +} + +void MDSRankDispatcher::tick() +{ + heartbeat_reset(); + + if (beacon.is_laggy()) { + dout(1) << "skipping upkeep work because connection to Monitors appears laggy" << dendl; + return; + } + + check_ops_in_flight(); + + // Wake up thread in case we use to be laggy and have waiting_for_nolaggy + // messages to progress. + progress_thread.signal(); + + // make sure mds log flushes, trims periodically + mdlog->flush(); + + // update average session uptime + sessionmap.update_average_session_age(); + + if (is_active() || is_stopping()) { + mdlog->trim(); // NOT during recovery! + } + + // ... + if (is_cache_trimmable()) { + server->find_idle_sessions(); + server->evict_cap_revoke_non_responders(); + locker->tick(); + } + + // log + if (logger) { + logger->set(l_mds_subtrees, mdcache->num_subtrees()); + mdcache->log_stat(); + } + + if (is_reconnect()) + server->reconnect_tick(); + + if (is_active()) { + balancer->tick(); + mdcache->find_stale_fragment_freeze(); + mdcache->migrator->find_stale_export_freeze(); + + if (mdsmap->get_tableserver() == whoami) { + snapserver->check_osd_map(false); + // Filesystem was created by pre-mimic mds. Allow multi-active mds after + // all old snapshots are deleted. + if (!mdsmap->allows_multimds_snaps() && + snapserver->can_allow_multimds_snaps()) { + set_mdsmap_multimds_snaps_allowed(); + } + } + } + + if (is_active() || is_stopping()) { + update_targets(); + } + + // shut down? 
+ if (is_stopping()) { + mdlog->trim(); + if (mdcache->shutdown_pass()) { + uint64_t pq_progress = 0 ; + uint64_t pq_total = 0; + size_t pq_in_flight = 0; + if (!purge_queue.drain(&pq_progress, &pq_total, &pq_in_flight)) { + dout(7) << "shutdown_pass=true, but still waiting for purge queue" + << dendl; + // This takes unbounded time, so we must indicate progress + // to the administrator: we do it in a slightly imperfect way + // by sending periodic (tick frequency) clog messages while + // in this state. + clog->info() << "MDS rank " << whoami << " waiting for purge queue (" + << std::dec << pq_progress << "/" << pq_total << " " << pq_in_flight + << " files purging" << ")"; + } else { + dout(7) << "shutdown_pass=true, finished w/ shutdown, moving to " + "down:stopped" << dendl; + stopping_done(); + } + } + else { + dout(7) << "shutdown_pass=false" << dendl; + } + } + + // Expose ourselves to Beacon to update health indicators + beacon.notify_health(this); +} + +void MDSRankDispatcher::shutdown() +{ + // It should never be possible for shutdown to get called twice, because + // anyone picking up mds_lock checks if stopping is true and drops + // out if it is. + ceph_assert(stopping == false); + stopping = true; + + dout(1) << __func__ << ": shutting down rank " << whoami << dendl; + + g_conf().remove_observer(this); + + timer.shutdown(); + + // MDLog has to shut down before the finisher, because some of its + // threads block on IOs that require finisher to complete. + mdlog->shutdown(); + + // shut down cache + mdcache->shutdown(); + + purge_queue.shutdown(); + + mds_lock.Unlock(); + finisher->stop(); // no flushing + mds_lock.Lock(); + + if (objecter->initialized) + objecter->shutdown(); + + monc->shutdown(); + + op_tracker.on_shutdown(); + + progress_thread.shutdown(); + + // release mds_lock for finisher/messenger threads (e.g. + // MDSDaemon::ms_handle_reset called from Messenger). 
+ mds_lock.Unlock(); + + // shut down messenger + messenger->shutdown(); + + mds_lock.Lock(); + + // Workaround unclean shutdown: HeartbeatMap will assert if + // worker is not removed (as we do in ~MDS), but ~MDS is not + // always called after suicide. + if (hb) { + g_ceph_context->get_heartbeat_map()->remove_worker(hb); + hb = NULL; + } +} + +/** + * Helper for simple callbacks that call a void fn with no args. + */ +class C_MDS_VoidFn : public MDSInternalContext +{ + typedef void (MDSRank::*fn_ptr)(); + protected: + fn_ptr fn; + public: + C_MDS_VoidFn(MDSRank *mds_, fn_ptr fn_) + : MDSInternalContext(mds_), fn(fn_) + { + ceph_assert(mds_); + ceph_assert(fn_); + } + + void finish(int r) override + { + (mds->*fn)(); + } +}; + +int64_t MDSRank::get_metadata_pool() +{ + return mdsmap->get_metadata_pool(); +} + +MDSTableClient *MDSRank::get_table_client(int t) +{ + switch (t) { + case TABLE_ANCHOR: return NULL; + case TABLE_SNAP: return snapclient; + default: ceph_abort(); + } +} + +MDSTableServer *MDSRank::get_table_server(int t) +{ + switch (t) { + case TABLE_ANCHOR: return NULL; + case TABLE_SNAP: return snapserver; + default: ceph_abort(); + } +} + +void MDSRank::suicide() +{ + if (suicide_hook) { + suicide_hook->complete(0); + suicide_hook = NULL; + } +} + +void MDSRank::respawn() +{ + if (respawn_hook) { + respawn_hook->complete(0); + respawn_hook = NULL; + } +} + +void MDSRank::damaged() +{ + ceph_assert(whoami != MDS_RANK_NONE); + ceph_assert(mds_lock.is_locked_by_me()); + + beacon.set_want_state(*mdsmap, MDSMap::STATE_DAMAGED); + monc->flush_log(); // Flush any clog error from before we were called + beacon.notify_health(this); // Include latest status in our swan song + beacon.send_and_wait(g_conf()->mds_mon_shutdown_timeout); + + // It's okay if we timed out and the mon didn't get our beacon, because + // another daemon (or ourselves after respawn) will eventually take the + // rank and report DAMAGED again when it hits same problem we did. 
+ + respawn(); // Respawn into standby in case mon has other work for us +} + +void MDSRank::damaged_unlocked() +{ + std::lock_guard l(mds_lock); + damaged(); +} + +void MDSRank::handle_write_error(int err) +{ + if (err == -EBLACKLISTED) { + derr << "we have been blacklisted (fenced), respawning..." << dendl; + respawn(); + return; + } + + if (g_conf()->mds_action_on_write_error >= 2) { + derr << "unhandled write error " << cpp_strerror(err) << ", suicide..." << dendl; + respawn(); + } else if (g_conf()->mds_action_on_write_error == 1) { + derr << "unhandled write error " << cpp_strerror(err) << ", force readonly..." << dendl; + mdcache->force_readonly(); + } else { + // ignore; + derr << "unhandled write error " << cpp_strerror(err) << ", ignore..." << dendl; + } +} + +void *MDSRank::ProgressThread::entry() +{ + std::lock_guard l(mds->mds_lock); + while (true) { + while (!mds->stopping && + mds->finished_queue.empty() && + (mds->waiting_for_nolaggy.empty() || mds->beacon.is_laggy())) { + cond.Wait(mds->mds_lock); + } + + if (mds->stopping) { + break; + } + + mds->_advance_queues(); + } + + return NULL; +} + + +void MDSRank::ProgressThread::shutdown() +{ + ceph_assert(mds->mds_lock.is_locked_by_me()); + ceph_assert(mds->stopping); + + if (am_self()) { + // Stopping is set, we will fall out of our main loop naturally + } else { + // Kick the thread to notice mds->stopping, and join it + cond.Signal(); + mds->mds_lock.Unlock(); + if (is_started()) + join(); + mds->mds_lock.Lock(); + } +} + +bool MDSRankDispatcher::ms_dispatch(const Message::const_ref &m) +{ + if (m->get_source().is_client()) { + Session *session = static_cast<Session*>(m->get_connection()->get_priv().get()); + if (session) + session->last_seen = Session::clock::now(); + } + + inc_dispatch_depth(); + bool ret = _dispatch(m, true); + dec_dispatch_depth(); + return ret; +} + +bool MDSRank::_dispatch(const Message::const_ref &m, bool new_msg) +{ + if (is_stale_message(m)) { + return true; + } + // do not 
proceed if this message cannot be handled + if (!is_valid_message(m)) { + return false; + } + + if (beacon.is_laggy()) { + dout(5) << " laggy, deferring " << *m << dendl; + waiting_for_nolaggy.push_back(m); + } else if (new_msg && !waiting_for_nolaggy.empty()) { + dout(5) << " there are deferred messages, deferring " << *m << dendl; + waiting_for_nolaggy.push_back(m); + } else { + handle_message(m); + heartbeat_reset(); + } + + if (dispatch_depth > 1) + return true; + + // finish any triggered contexts + _advance_queues(); + + if (beacon.is_laggy()) { + // We've gone laggy during dispatch, don't do any + // more housekeeping + return true; + } + + // hack: thrash exports + static utime_t start; + utime_t now = ceph_clock_now(); + if (start == utime_t()) + start = now; + /*double el = now - start; + if (el > 30.0 && + el < 60.0)*/ + for (int i=0; i<g_conf()->mds_thrash_exports; i++) { + set<mds_rank_t> s; + if (!is_active()) break; + mdsmap->get_mds_set(s, MDSMap::STATE_ACTIVE); + if (s.size() < 2 || CInode::count() < 10) + break; // need peers for this to work. + if (mdcache->migrator->get_num_exporting() > g_conf()->mds_thrash_exports * 5 || + mdcache->migrator->get_export_queue_size() > g_conf()->mds_thrash_exports * 10) + break; + + dout(7) << "mds thrashing exports pass " << (i+1) << "/" << g_conf()->mds_thrash_exports << dendl; + + // pick a random dir inode + CInode *in = mdcache->hack_pick_random_inode(); + + list<CDir*> ls; + in->get_dirfrags(ls); + if (!ls.empty()) { // must be an open dir. + list<CDir*>::iterator p = ls.begin(); + int n = rand() % ls.size(); + while (n--) + ++p; + CDir *dir = *p; + if (!dir->get_parent_dir()) continue; // must be linked. + if (!dir->is_auth()) continue; // must be auth. 
+ + mds_rank_t dest; + do { + int k = rand() % s.size(); + set<mds_rank_t>::iterator p = s.begin(); + while (k--) ++p; + dest = *p; + } while (dest == whoami); + mdcache->migrator->export_dir_nicely(dir,dest); + } + } + // hack: thrash fragments + for (int i=0; i<g_conf()->mds_thrash_fragments; i++) { + if (!is_active()) break; + if (mdcache->get_num_fragmenting_dirs() > 5 * g_conf()->mds_thrash_fragments) break; + dout(7) << "mds thrashing fragments pass " << (i+1) << "/" << g_conf()->mds_thrash_fragments << dendl; + + // pick a random dir inode + CInode *in = mdcache->hack_pick_random_inode(); + + list<CDir*> ls; + in->get_dirfrags(ls); + if (ls.empty()) continue; // must be an open dir. + CDir *dir = ls.front(); + if (!dir->get_parent_dir()) continue; // must be linked. + if (!dir->is_auth()) continue; // must be auth. + frag_t fg = dir->get_frag(); + if ((fg == frag_t() || (rand() % (1 << fg.bits()) == 0))) { + mdcache->split_dir(dir, 1); + } else { + balancer->queue_merge(dir); + } + } + + // hack: force hash root? 
+ /* + if (false && + mdcache->get_root() && + mdcache->get_root()->dir && + !(mdcache->get_root()->dir->is_hashed() || + mdcache->get_root()->dir->is_hashing())) { + dout(0) << "hashing root" << dendl; + mdcache->migrator->hash_dir(mdcache->get_root()->dir); + } + */ + + update_mlogger(); + return true; +} + +void MDSRank::update_mlogger() +{ + if (mlogger) { + mlogger->set(l_mdm_ino, CInode::count()); + mlogger->set(l_mdm_dir, CDir::count()); + mlogger->set(l_mdm_dn, CDentry::count()); + mlogger->set(l_mdm_cap, Capability::count()); + mlogger->set(l_mdm_inoa, CInode::increments()); + mlogger->set(l_mdm_inos, CInode::decrements()); + mlogger->set(l_mdm_dira, CDir::increments()); + mlogger->set(l_mdm_dirs, CDir::decrements()); + mlogger->set(l_mdm_dna, CDentry::increments()); + mlogger->set(l_mdm_dns, CDentry::decrements()); + mlogger->set(l_mdm_capa, Capability::increments()); + mlogger->set(l_mdm_caps, Capability::decrements()); + } +} + +// message types that the mds can handle +bool MDSRank::is_valid_message(const Message::const_ref &m) { + int port = m->get_type() & 0xff00; + int type = m->get_type(); + + if (port == MDS_PORT_CACHE || + port == MDS_PORT_MIGRATOR || + type == CEPH_MSG_CLIENT_SESSION || + type == CEPH_MSG_CLIENT_RECONNECT || + type == CEPH_MSG_CLIENT_RECLAIM || + type == CEPH_MSG_CLIENT_REQUEST || + type == MSG_MDS_SLAVE_REQUEST || + type == MSG_MDS_HEARTBEAT || + type == MSG_MDS_TABLE_REQUEST || + type == MSG_MDS_LOCK || + type == MSG_MDS_INODEFILECAPS || + type == CEPH_MSG_CLIENT_CAPS || + type == CEPH_MSG_CLIENT_CAPRELEASE || + type == CEPH_MSG_CLIENT_LEASE) { + return true; + } + + return false; +} + +/* + * lower priority messages we defer if we seem laggy + */ + +#define ALLOW_MESSAGES_FROM(peers) \ + do { \ + if (m->get_connection() && (m->get_connection()->get_peer_type() & (peers)) == 0) { \ + dout(0) << __FILE__ << "." 
<< __LINE__ << ": filtered out request, peer=" << m->get_connection()->get_peer_type() \ + << " allowing=" << #peers << " message=" << *m << dendl; \ + return; \ + } \ + } while (0) + +void MDSRank::handle_message(const Message::const_ref &m) +{ + int port = m->get_type() & 0xff00; + + switch (port) { + case MDS_PORT_CACHE: + ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_MDS); + mdcache->dispatch(m); + break; + + case MDS_PORT_MIGRATOR: + ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_MDS); + mdcache->migrator->dispatch(m); + break; + + default: + switch (m->get_type()) { + // SERVER + case CEPH_MSG_CLIENT_SESSION: + case CEPH_MSG_CLIENT_RECONNECT: + case CEPH_MSG_CLIENT_RECLAIM: + ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_CLIENT); + // fall-thru + case CEPH_MSG_CLIENT_REQUEST: + server->dispatch(m); + break; + case MSG_MDS_SLAVE_REQUEST: + ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_MDS); + server->dispatch(m); + break; + + case MSG_MDS_HEARTBEAT: + ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_MDS); + balancer->proc_message(m); + break; + + case MSG_MDS_TABLE_REQUEST: + ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_MDS); + { + const MMDSTableRequest::const_ref &req = MMDSTableRequest::msgref_cast(m); + if (req->op < 0) { + MDSTableClient *client = get_table_client(req->table); + client->handle_request(req); + } else { + MDSTableServer *server = get_table_server(req->table); + server->handle_request(req); + } + } + break; + + case MSG_MDS_LOCK: + case MSG_MDS_INODEFILECAPS: + ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_MDS); + locker->dispatch(m); + break; + + case CEPH_MSG_CLIENT_CAPS: + case CEPH_MSG_CLIENT_CAPRELEASE: + case CEPH_MSG_CLIENT_LEASE: + ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_CLIENT); + locker->dispatch(m); + break; + + default: + derr << "unrecognized message " << *m << dendl; + } + } +} + +/** + * Advance finished_queue and waiting_for_nolaggy. + * + * Usually drain both queues, but may not drain waiting_for_nolaggy + * if beacon is currently laggy. 
+ */ +void MDSRank::_advance_queues() +{ + ceph_assert(mds_lock.is_locked_by_me()); + + if (!finished_queue.empty()) { + dout(7) << "mds has " << finished_queue.size() << " queued contexts" << dendl; + while (!finished_queue.empty()) { + auto fin = finished_queue.front(); + finished_queue.pop_front(); + + dout(10) << " finish " << fin << dendl; + fin->complete(0); + + heartbeat_reset(); + } + } + + while (!waiting_for_nolaggy.empty()) { + // stop if we're laggy now! + if (beacon.is_laggy()) + break; + + Message::const_ref old = waiting_for_nolaggy.front(); + waiting_for_nolaggy.pop_front(); + + if (!is_stale_message(old)) { + dout(7) << " processing laggy deferred " << *old << dendl; + ceph_assert(is_valid_message(old)); + handle_message(old); + } + + heartbeat_reset(); + } +} + +/** + * Call this when you take mds_lock, or periodically if you're going to + * hold the lock for a long time (e.g. iterating over clients/inodes) + */ +void MDSRank::heartbeat_reset() +{ + // Any thread might jump into mds_lock and call us immediately + // after a call to suicide() completes, in which case MDSRank::hb + // has been freed and we are a no-op. + if (!hb) { + ceph_assert(stopping); + return; + } + + // NB not enabling suicide grace, because the mon takes care of killing us + // (by blacklisting us) when we fail to send beacons, and it's simpler to + // only have one way of dying. + auto grace = g_conf().get_val<double>("mds_heartbeat_grace"); + g_ceph_context->get_heartbeat_map()->reset_timeout(hb, grace, 0); +} + +bool MDSRank::is_stale_message(const Message::const_ref &m) const +{ + // from bad mds? + if (m->get_source().is_mds()) { + mds_rank_t from = mds_rank_t(m->get_source().num()); + bool bad = false; + if (mdsmap->is_down(from)) { + bad = true; + } else { + // FIXME: this is a convoluted check. we should be maintaining a nice + // clean map of current ConnectionRefs for current mdses!!! 
+ auto c = messenger->connect_to(CEPH_ENTITY_TYPE_MDS, + mdsmap->get_addrs(from)); + if (c != m->get_connection()) { + bad = true; + dout(5) << " mds." << from << " should be " << c << " " + << c->get_peer_addrs() << " but this message is " + << m->get_connection() << " " << m->get_source_addrs() + << dendl; + } + } + if (bad) { + // bogus mds? + if (m->get_type() == CEPH_MSG_MDS_MAP) { + dout(5) << "got " << *m << " from old/bad/imposter mds " << m->get_source() + << ", but it's an mdsmap, looking at it" << dendl; + } else if (m->get_type() == MSG_MDS_CACHEEXPIRE && + mdsmap->get_addrs(from) == m->get_source_addrs()) { + dout(5) << "got " << *m << " from down mds " << m->get_source() + << ", but it's a cache_expire, looking at it" << dendl; + } else { + dout(5) << "got " << *m << " from down/old/bad/imposter mds " << m->get_source() + << ", dropping" << dendl; + return true; + } + } + } + return false; +} + +Session *MDSRank::get_session(const Message::const_ref &m) +{ + // do not carry ref + auto session = static_cast<Session *>(m->get_connection()->get_priv().get()); + if (session) { + dout(20) << "get_session have " << session << " " << session->info.inst + << " state " << session->get_state_name() << dendl; + // Check if we've imported an open session since (new sessions start closed) + if (session->is_closed()) { + Session *imported_session = sessionmap.get_session(session->info.inst.name); + if (imported_session && imported_session != session) { + dout(10) << __func__ << " replacing connection bootstrap session " + << session << " with imported session " << imported_session + << dendl; + imported_session->info.auth_name = session->info.auth_name; + //assert(session->info.auth_name == imported_session->info.auth_name); + ceph_assert(session->info.inst == imported_session->info.inst); + imported_session->set_connection(session->get_connection().get()); + // send out any queued messages + while (!session->preopen_out_queue.empty()) { + 
imported_session->get_connection()->send_message2(std::move(session->preopen_out_queue.front())); + session->preopen_out_queue.pop_front(); + } + imported_session->auth_caps = session->auth_caps; + imported_session->last_seen = session->last_seen; + ceph_assert(session->get_nref() == 1); + imported_session->get_connection()->set_priv(imported_session->get()); + session = imported_session; + } + } + } else { + dout(20) << "get_session dne for " << m->get_source_inst() << dendl; + } + return session; +} + +void MDSRank::send_message(const Message::ref& m, const ConnectionRef& c) +{ + ceph_assert(c); + c->send_message2(m); +} + + +void MDSRank::send_message_mds(const Message::ref& m, mds_rank_t mds) +{ + if (!mdsmap->is_up(mds)) { + dout(10) << "send_message_mds mds." << mds << " not up, dropping " << *m << dendl; + return; + } + + // send mdsmap first? + if (mds != whoami && peer_mdsmap_epoch[mds] < mdsmap->get_epoch()) { + auto _m = MMDSMap::create(monc->get_fsid(), *mdsmap); + messenger->send_to_mds(_m.detach(), mdsmap->get_addrs(mds)); + peer_mdsmap_epoch[mds] = mdsmap->get_epoch(); + } + + // send message + messenger->send_to_mds(Message::ref(m).detach(), mdsmap->get_addrs(mds)); +} + +void MDSRank::forward_message_mds(const MClientRequest::const_ref& m, mds_rank_t mds) +{ + ceph_assert(mds != whoami); + + /* + * don't actually forward if non-idempotent! + * client has to do it. although the MDS will ignore duplicate requests, + * the affected metadata may migrate, in which case the new authority + * won't have the metareq_id in the completed request map. + */ + // NEW: always make the client resend! 
+ bool client_must_resend = true; //!creq->can_forward(); + + // tell the client where it should go + auto session = get_session(m); + auto f = MClientRequestForward::create(m->get_tid(), mds, m->get_num_fwd()+1, client_must_resend); + send_message_client(f, session); +} + +void MDSRank::send_message_client_counted(const Message::ref& m, client_t client) +{ + Session *session = sessionmap.get_session(entity_name_t::CLIENT(client.v)); + if (session) { + send_message_client_counted(m, session); + } else { + dout(10) << "send_message_client_counted no session for client." << client << " " << *m << dendl; + } +} + +void MDSRank::send_message_client_counted(const Message::ref& m, const ConnectionRef& connection) +{ + // do not carry ref + auto session = static_cast<Session *>(connection->get_priv().get()); + if (session) { + send_message_client_counted(m, session); + } else { + dout(10) << "send_message_client_counted has no session for " << m->get_source_inst() << dendl; + // another Connection took over the Session + } +} + +void MDSRank::send_message_client_counted(const Message::ref& m, Session* session) +{ + version_t seq = session->inc_push_seq(); + dout(10) << "send_message_client_counted " << session->info.inst.name << " seq " + << seq << " " << *m << dendl; + if (session->get_connection()) { + session->get_connection()->send_message2(m); + } else { + session->preopen_out_queue.push_back(m); + } +} + +void MDSRank::send_message_client(const Message::ref& m, Session* session) +{ + dout(10) << "send_message_client " << session->info.inst << " " << *m << dendl; + if (session->get_connection()) { + session->get_connection()->send_message2(m); + } else { + session->preopen_out_queue.push_back(m); + } +} + +/** + * This is used whenever a RADOS operation has been cancelled + * or a RADOS client has been blacklisted, to cause the MDS and + * any clients to wait for this OSD epoch before using any new caps. 
+ * + * See doc/cephfs/eviction + */ +void MDSRank::set_osd_epoch_barrier(epoch_t e) +{ + dout(4) << __func__ << ": epoch=" << e << dendl; + osd_epoch_barrier = e; +} + +void MDSRank::retry_dispatch(const Message::const_ref &m) +{ + inc_dispatch_depth(); + _dispatch(m, false); + dec_dispatch_depth(); +} + +double MDSRank::get_dispatch_queue_max_age(utime_t now) const +{ + return messenger->get_dispatch_queue_max_age(now); +} + +bool MDSRank::is_daemon_stopping() const +{ + return stopping; +} + +void MDSRank::request_state(MDSMap::DaemonState s) +{ + dout(3) << "request_state " << ceph_mds_state_name(s) << dendl; + beacon.set_want_state(*mdsmap, s); + beacon.send(); +} + + +class C_MDS_BootStart : public MDSInternalContext { + MDSRank::BootStep nextstep; +public: + C_MDS_BootStart(MDSRank *m, MDSRank::BootStep n) + : MDSInternalContext(m), nextstep(n) {} + void finish(int r) override { + mds->boot_start(nextstep, r); + } +}; + + +void MDSRank::boot_start(BootStep step, int r) +{ + // Handle errors from previous step + if (r < 0) { + if (is_standby_replay() && (r == -EAGAIN)) { + dout(0) << "boot_start encountered an error EAGAIN" + << ", respawning since we fell behind journal" << dendl; + respawn(); + } else if (r == -EINVAL || r == -ENOENT) { + // Invalid or absent data, indicates damaged on-disk structures + clog->error() << "Error loading MDS rank " << whoami << ": " + << cpp_strerror(r); + damaged(); + ceph_assert(r == 0); // Unreachable, damaged() calls respawn() + } else if (r == -EROFS) { + dout(0) << "boot error forcing transition to read-only; MDS will try to continue" << dendl; + } else { + // Completely unexpected error, give up and die + dout(0) << "boot_start encountered an error, failing" << dendl; + suicide(); + return; + } + } + + ceph_assert(is_starting() || is_any_replay()); + + switch(step) { + case MDS_BOOT_INITIAL: + { + mdcache->init_layouts(); + + MDSGatherBuilder gather(g_ceph_context, + new C_MDS_BootStart(this, MDS_BOOT_OPEN_ROOT)); + 
dout(2) << "Booting: " << step << ": opening inotable" << dendl; + inotable->set_rank(whoami); + inotable->load(gather.new_sub()); + + dout(2) << "Booting: " << step << ": opening sessionmap" << dendl; + sessionmap.set_rank(whoami); + sessionmap.load(gather.new_sub()); + + dout(2) << "Booting: " << step << ": opening mds log" << dendl; + mdlog->open(gather.new_sub()); + + if (is_starting()) { + dout(2) << "Booting: " << step << ": opening purge queue" << dendl; + purge_queue.open(new C_IO_Wrapper(this, gather.new_sub())); + } else if (!standby_replaying) { + dout(2) << "Booting: " << step << ": opening purge queue (async)" << dendl; + purge_queue.open(NULL); + dout(2) << "Booting: " << step << ": loading open file table (async)" << dendl; + mdcache->open_file_table.load(nullptr); + } + + if (mdsmap->get_tableserver() == whoami) { + dout(2) << "Booting: " << step << ": opening snap table" << dendl; + snapserver->set_rank(whoami); + snapserver->load(gather.new_sub()); + } + + gather.activate(); + } + break; + case MDS_BOOT_OPEN_ROOT: + { + dout(2) << "Booting: " << step << ": loading/discovering base inodes" << dendl; + + MDSGatherBuilder gather(g_ceph_context, + new C_MDS_BootStart(this, MDS_BOOT_PREPARE_LOG)); + + if (is_starting()) { + // load mydir frag for the first log segment (creating subtree map) + mdcache->open_mydir_frag(gather.new_sub()); + } else { + mdcache->open_mydir_inode(gather.new_sub()); + } + + mdcache->create_global_snaprealm(); + + if (whoami == mdsmap->get_root()) { // load root inode off disk if we are auth + mdcache->open_root_inode(gather.new_sub()); + } else if (is_any_replay()) { + // replay. 
make up fake root inode to start with + mdcache->create_root_inode(); + } + gather.activate(); + } + break; + case MDS_BOOT_PREPARE_LOG: + if (is_any_replay()) { + dout(2) << "Booting: " << step << ": replaying mds log" << dendl; + MDSGatherBuilder gather(g_ceph_context, + new C_MDS_BootStart(this, MDS_BOOT_REPLAY_DONE)); + + if (!standby_replaying) { + dout(2) << "Booting: " << step << ": waiting for purge queue recovered" << dendl; + purge_queue.wait_for_recovery(new C_IO_Wrapper(this, gather.new_sub())); + } + + mdlog->replay(gather.new_sub()); + gather.activate(); + } else { + dout(2) << "Booting: " << step << ": positioning at end of old mds log" << dendl; + mdlog->append(); + starting_done(); + } + break; + case MDS_BOOT_REPLAY_DONE: + ceph_assert(is_any_replay()); + + // Sessiontable and inotable should be in sync after replay, validate + // that they are consistent. + validate_sessions(); + + replay_done(); + break; + } +} + +void MDSRank::validate_sessions() +{ + ceph_assert(mds_lock.is_locked_by_me()); + bool valid = true; + + // Identify any sessions which have state inconsistent with other, + // after they have been loaded from rados during startup. 
+ // Mitigate bugs like: http://tracker.ceph.com/issues/16842 + for (const auto &i : sessionmap.get_sessions()) { + Session *session = i.second; + interval_set<inodeno_t> badones; + if (inotable->intersects_free(session->info.prealloc_inos, &badones)) { + clog->error() << "client " << *session + << "loaded with preallocated inodes that are inconsistent with inotable"; + valid = false; + } + } + + if (!valid) { + damaged(); + ceph_assert(valid); + } +} + +void MDSRank::starting_done() +{ + dout(3) << "starting_done" << dendl; + ceph_assert(is_starting()); + request_state(MDSMap::STATE_ACTIVE); + + mdlog->start_new_segment(); + + // sync snaptable cache + snapclient->sync(new C_MDSInternalNoop); +} + + +void MDSRank::calc_recovery_set() +{ + // initialize gather sets + set<mds_rank_t> rs; + mdsmap->get_recovery_mds_set(rs); + rs.erase(whoami); + mdcache->set_recovery_set(rs); + + dout(1) << " recovery set is " << rs << dendl; +} + + +void MDSRank::replay_start() +{ + dout(1) << "replay_start" << dendl; + + if (is_standby_replay()) + standby_replaying = true; + + // Check if we need to wait for a newer OSD map before starting + Context *fin = new C_IO_Wrapper(this, new C_MDS_BootStart(this, MDS_BOOT_INITIAL)); + bool const ready = objecter->wait_for_map( + mdsmap->get_last_failure_osd_epoch(), + fin); + + if (ready) { + delete fin; + boot_start(); + } else { + dout(1) << " waiting for osdmap " << mdsmap->get_last_failure_osd_epoch() + << " (which blacklists prior instance)" << dendl; + } +} + + +class MDSRank::C_MDS_StandbyReplayRestartFinish : public MDSIOContext { + uint64_t old_read_pos; +public: + C_MDS_StandbyReplayRestartFinish(MDSRank *mds_, uint64_t old_read_pos_) : + MDSIOContext(mds_), old_read_pos(old_read_pos_) {} + void finish(int r) override { + mds->_standby_replay_restart_finish(r, old_read_pos); + } + void print(ostream& out) const override { + out << "standby_replay_restart"; + } +}; + +void MDSRank::_standby_replay_restart_finish(int r, uint64_t 
old_read_pos) +{ + if (old_read_pos < mdlog->get_journaler()->get_trimmed_pos()) { + dout(0) << "standby MDS fell behind active MDS journal's expire_pos, restarting" << dendl; + respawn(); /* we're too far back, and this is easier than + trying to reset everything in the cache, etc */ + } else { + mdlog->standby_trim_segments(); + boot_start(MDS_BOOT_PREPARE_LOG, r); + } +} + +class MDSRank::C_MDS_StandbyReplayRestart : public MDSInternalContext { +public: + explicit C_MDS_StandbyReplayRestart(MDSRank *m) : MDSInternalContext(m) {} + void finish(int r) override { + ceph_assert(!r); + mds->standby_replay_restart(); + } +}; + +void MDSRank::standby_replay_restart() +{ + if (standby_replaying) { + /* Go around for another pass of replaying in standby */ + dout(5) << "Restarting replay as standby-replay" << dendl; + mdlog->get_journaler()->reread_head_and_probe( + new C_MDS_StandbyReplayRestartFinish( + this, + mdlog->get_journaler()->get_read_pos())); + } else { + /* We are transitioning out of standby: wait for OSD map update + before making final pass */ + dout(1) << "standby_replay_restart (final takeover pass)" << dendl; + Context *fin = new C_IO_Wrapper(this, new C_MDS_StandbyReplayRestart(this)); + bool ready = objecter->wait_for_map(mdsmap->get_last_failure_osd_epoch(), fin); + if (ready) { + delete fin; + mdlog->get_journaler()->reread_head_and_probe( + new C_MDS_StandbyReplayRestartFinish( + this, + mdlog->get_journaler()->get_read_pos())); + + dout(1) << " opening purge_queue (async)" << dendl; + purge_queue.open(NULL); + dout(1) << " opening open_file_table (async)" << dendl; + mdcache->open_file_table.load(nullptr); + } else { + dout(1) << " waiting for osdmap " << mdsmap->get_last_failure_osd_epoch() + << " (which blacklists prior instance)" << dendl; + } + } +} + +void MDSRank::replay_done() +{ + if (!standby_replaying) { + dout(1) << "Finished replaying journal" << dendl; + } else { + dout(5) << "Finished replaying journal as standby-replay" << dendl; + 
} + + if (is_standby_replay()) { + // The replay was done in standby state, and we are still in that state + ceph_assert(standby_replaying); + dout(10) << "setting replay timer" << dendl; + timer.add_event_after(g_conf()->mds_replay_interval, + new C_MDS_StandbyReplayRestart(this)); + return; + } else if (standby_replaying) { + // The replay was done in standby state, we have now _left_ that state + dout(10) << " last replay pass was as a standby; making final pass" << dendl; + standby_replaying = false; + standby_replay_restart(); + return; + } else { + // Replay is complete, journal read should be up to date + ceph_assert(mdlog->get_journaler()->get_read_pos() == mdlog->get_journaler()->get_write_pos()); + ceph_assert(!is_standby_replay()); + + // Reformat and come back here + if (mdlog->get_journaler()->get_stream_format() < g_conf()->mds_journal_format) { + dout(4) << "reformatting journal on standby-replay->replay transition" << dendl; + mdlog->reopen(new C_MDS_BootStart(this, MDS_BOOT_REPLAY_DONE)); + return; + } + } + + dout(1) << "making mds journal writeable" << dendl; + mdlog->get_journaler()->set_writeable(); + mdlog->get_journaler()->trim_tail(); + + if (mdsmap->get_tableserver() == whoami && + snapserver->upgrade_format()) { + dout(1) << "upgrading snaptable format" << dendl; + snapserver->save(new C_MDSInternalNoop); + } + + if (g_conf()->mds_wipe_sessions) { + dout(1) << "wiping out client sessions" << dendl; + sessionmap.wipe(); + sessionmap.save(new C_MDSInternalNoop); + } + if (g_conf()->mds_wipe_ino_prealloc) { + dout(1) << "wiping out ino prealloc from sessions" << dendl; + sessionmap.wipe_ino_prealloc(); + sessionmap.save(new C_MDSInternalNoop); + } + if (g_conf()->mds_skip_ino) { + inodeno_t i = g_conf()->mds_skip_ino; + dout(1) << "skipping " << i << " inodes" << dendl; + inotable->skip_inos(i); + inotable->save(new C_MDSInternalNoop); + } + + if (mdsmap->get_num_in_mds() == 1 && + mdsmap->get_num_failed_mds() == 0) { // just me! 
+ dout(2) << "i am alone, moving to state reconnect" << dendl; + request_state(MDSMap::STATE_RECONNECT); + // sync snaptable cache + snapclient->sync(new C_MDSInternalNoop); + } else { + dout(2) << "i am not alone, moving to state resolve" << dendl; + request_state(MDSMap::STATE_RESOLVE); + } +} + +void MDSRank::reopen_log() +{ + dout(1) << "reopen_log" << dendl; + mdcache->rollback_uncommitted_fragments(); +} + +void MDSRank::resolve_start() +{ + dout(1) << "resolve_start" << dendl; + + reopen_log(); + + calc_recovery_set(); + + mdcache->resolve_start(new C_MDS_VoidFn(this, &MDSRank::resolve_done)); + finish_contexts(g_ceph_context, waiting_for_resolve); +} + +void MDSRank::resolve_done() +{ + dout(1) << "resolve_done" << dendl; + request_state(MDSMap::STATE_RECONNECT); + // sync snaptable cache + snapclient->sync(new C_MDSInternalNoop); +} + +void MDSRank::reconnect_start() +{ + dout(1) << "reconnect_start" << dendl; + + if (last_state == MDSMap::STATE_REPLAY) { + reopen_log(); + } + + // Drop any blacklisted clients from the SessionMap before going + // into reconnect, so that we don't wait for them. 
+ objecter->enable_blacklist_events(); + std::set<entity_addr_t> blacklist; + epoch_t epoch = 0; + objecter->with_osdmap([&blacklist, &epoch](const OSDMap& o) { + o.get_blacklist(&blacklist); + epoch = o.get_epoch(); + }); + auto killed = server->apply_blacklist(blacklist); + dout(4) << "reconnect_start: killed " << killed << " blacklisted sessions (" + << blacklist.size() << " blacklist entries, " + << sessionmap.get_sessions().size() << ")" << dendl; + if (killed) { + set_osd_epoch_barrier(epoch); + } + + server->reconnect_clients(new C_MDS_VoidFn(this, &MDSRank::reconnect_done)); + finish_contexts(g_ceph_context, waiting_for_reconnect); +} +void MDSRank::reconnect_done() +{ + dout(1) << "reconnect_done" << dendl; + request_state(MDSMap::STATE_REJOIN); // move to rejoin state +} + +void MDSRank::rejoin_joint_start() +{ + dout(1) << "rejoin_joint_start" << dendl; + mdcache->rejoin_send_rejoins(); +} +void MDSRank::rejoin_start() +{ + dout(1) << "rejoin_start" << dendl; + mdcache->rejoin_start(new C_MDS_VoidFn(this, &MDSRank::rejoin_done)); + finish_contexts(g_ceph_context, waiting_for_rejoin); +} +void MDSRank::rejoin_done() +{ + dout(1) << "rejoin_done" << dendl; + mdcache->show_subtrees(); + mdcache->show_cache(); + + if (mdcache->is_any_uncommitted_fragment()) { + dout(1) << " waiting for uncommitted fragments" << dendl; + mdcache->wait_for_uncommitted_fragments(new C_MDS_VoidFn(this, &MDSRank::rejoin_done)); + return; + } + + // funny case: is our cache empty? no subtrees? + if (!mdcache->is_subtrees()) { + if (whoami == 0) { + // The root should always have a subtree! 
+ clog->error() << "No subtrees found for root MDS rank!"; + damaged(); + ceph_assert(mdcache->is_subtrees()); + } else { + dout(1) << " empty cache, no subtrees, leaving cluster" << dendl; + request_state(MDSMap::STATE_STOPPED); + } + return; + } + + if (replay_queue.empty() && !server->get_num_pending_reclaim()) { + request_state(MDSMap::STATE_ACTIVE); + } else { + replaying_requests_done = replay_queue.empty(); + request_state(MDSMap::STATE_CLIENTREPLAY); + } +} + +void MDSRank::clientreplay_start() +{ + dout(1) << "clientreplay_start" << dendl; + finish_contexts(g_ceph_context, waiting_for_replay); // kick waiters + queue_one_replay(); +} + +bool MDSRank::queue_one_replay() +{ + if (!replay_queue.empty()) { + queue_waiter(replay_queue.front()); + replay_queue.pop_front(); + return true; + } + if (!replaying_requests_done) { + replaying_requests_done = true; + mdlog->flush(); + } + maybe_clientreplay_done(); + return false; +} + +void MDSRank::maybe_clientreplay_done() +{ + if (is_clientreplay() && get_want_state() == MDSMap::STATE_CLIENTREPLAY) { + + // don't go to active if there are session waiting for being reclaimed + if (replaying_requests_done && !server->get_num_pending_reclaim()) { + mdlog->wait_for_safe(new C_MDS_VoidFn(this, &MDSRank::clientreplay_done)); + return; + } + + dout(1) << " still have " << replay_queue.size() + (int)!replaying_requests_done + << " requests need to be replayed, " << server->get_num_pending_reclaim() + << " sessions need to be reclaimed" << dendl; + } +} + +void MDSRank::clientreplay_done() +{ + dout(1) << "clientreplay_done" << dendl; + request_state(MDSMap::STATE_ACTIVE); +} + +void MDSRank::active_start() +{ + dout(1) << "active_start" << dendl; + + if (last_state == MDSMap::STATE_CREATING || + last_state == MDSMap::STATE_STARTING) { + mdcache->open_root(); + } + + mdcache->clean_open_file_lists(); + mdcache->export_remaining_imported_caps(); + finish_contexts(g_ceph_context, waiting_for_replay); // kick waiters + + 
mdcache->reissue_all_caps(); + + finish_contexts(g_ceph_context, waiting_for_active); // kick waiters +} + +void MDSRank::recovery_done(int oldstate) +{ + dout(1) << "recovery_done -- successful recovery!" << dendl; + ceph_assert(is_clientreplay() || is_active()); + + if (oldstate == MDSMap::STATE_CREATING) + return; + + mdcache->start_recovered_truncates(); + mdcache->start_files_to_recover(); + + // tell connected clients + //bcast_mds_map(); // not anymore, they get this from the monitor + + mdcache->populate_mydir(); +} + +void MDSRank::creating_done() +{ + dout(1)<< "creating_done" << dendl; + request_state(MDSMap::STATE_ACTIVE); + // sync snaptable cache + snapclient->sync(new C_MDSInternalNoop); +} + +void MDSRank::boot_create() +{ + dout(3) << "boot_create" << dendl; + + MDSGatherBuilder fin(g_ceph_context, new C_MDS_VoidFn(this, &MDSRank::creating_done)); + + mdcache->init_layouts(); + + inotable->set_rank(whoami); + sessionmap.set_rank(whoami); + + // start with a fresh journal + dout(10) << "boot_create creating fresh journal" << dendl; + mdlog->create(fin.new_sub()); + + // open new journal segment, but do not journal subtree map (yet) + mdlog->prepare_new_segment(); + + if (whoami == mdsmap->get_root()) { + dout(3) << "boot_create creating fresh hierarchy" << dendl; + mdcache->create_empty_hierarchy(fin.get()); + } + + dout(3) << "boot_create creating mydir hierarchy" << dendl; + mdcache->create_mydir_hierarchy(fin.get()); + + dout(3) << "boot_create creating global snaprealm" << dendl; + mdcache->create_global_snaprealm(); + + // fixme: fake out inotable (reset, pretend loaded) + dout(10) << "boot_create creating fresh inotable table" << dendl; + inotable->reset(); + inotable->save(fin.new_sub()); + + // write empty sessionmap + sessionmap.save(fin.new_sub()); + + // Create empty purge queue + purge_queue.create(new C_IO_Wrapper(this, fin.new_sub())); + + // initialize tables + if (mdsmap->get_tableserver() == whoami) { + dout(10) << "boot_create 
creating fresh snaptable" << dendl; + snapserver->set_rank(whoami); + snapserver->reset(); + snapserver->save(fin.new_sub()); + } + + ceph_assert(g_conf()->mds_kill_create_at != 1); + + // ok now journal it + mdlog->journal_segment_subtree_map(fin.new_sub()); + mdlog->flush(); + + // Usually we do this during reconnect, but creation skips that. + objecter->enable_blacklist_events(); + + fin.activate(); +} + +void MDSRank::stopping_start() +{ + dout(2) << "Stopping..." << dendl; + + if (mdsmap->get_num_in_mds() == 1 && !sessionmap.empty()) { + std::vector<Session*> victims; + const auto& sessions = sessionmap.get_sessions(); + for (const auto& p : sessions) { + if (!p.first.is_client()) { + continue; + } + + Session *s = p.second; + victims.push_back(s); + } + + dout(20) << __func__ << " matched " << victims.size() << " sessions" << dendl; + ceph_assert(!victims.empty()); + + C_GatherBuilder gather(g_ceph_context, new C_MDSInternalNoop); + for (const auto &s : victims) { + std::stringstream ss; + evict_client(s->get_client().v, false, + g_conf()->mds_session_blacklist_on_evict, ss, gather.new_sub()); + } + gather.activate(); + } + + mdcache->shutdown_start(); +} + +void MDSRank::stopping_done() +{ + dout(2) << "Finished stopping..." << dendl; + + // tell monitor we shut down cleanly. 
+ request_state(MDSMap::STATE_STOPPED); +} + +void MDSRankDispatcher::handle_mds_map( + const MMDSMap::const_ref &m, + const MDSMap &oldmap) +{ + // I am only to be passed MDSMaps in which I hold a rank + ceph_assert(whoami != MDS_RANK_NONE); + + MDSMap::DaemonState oldstate = state; + mds_gid_t mds_gid = mds_gid_t(monc->get_global_id()); + state = mdsmap->get_state_gid(mds_gid); + if (state != oldstate) { + last_state = oldstate; + incarnation = mdsmap->get_inc_gid(mds_gid); + } + + version_t epoch = m->get_epoch(); + + // note source's map version + if (m->get_source().is_mds() && + peer_mdsmap_epoch[mds_rank_t(m->get_source().num())] < epoch) { + dout(15) << " peer " << m->get_source() + << " has mdsmap epoch >= " << epoch + << dendl; + peer_mdsmap_epoch[mds_rank_t(m->get_source().num())] = epoch; + } + + // Validate state transitions while I hold a rank + if (!MDSMap::state_transition_valid(oldstate, state)) { + derr << "Invalid state transition " << ceph_mds_state_name(oldstate) + << "->" << ceph_mds_state_name(state) << dendl; + respawn(); + } + + if (oldstate != state) { + // update messenger. + if (state == MDSMap::STATE_STANDBY_REPLAY) { + dout(1) << "handle_mds_map i am now mds." << mds_gid << "." << incarnation + << " replaying mds." << whoami << "." << incarnation << dendl; + messenger->set_myname(entity_name_t::MDS(mds_gid)); + } else { + dout(1) << "handle_mds_map i am now mds." << whoami << "." << incarnation << dendl; + messenger->set_myname(entity_name_t::MDS(whoami)); + } + } + + // tell objecter my incarnation + if (objecter->get_client_incarnation() != incarnation) + objecter->set_client_incarnation(incarnation); + + if (oldmap.get_min_compat_client() != mdsmap->get_min_compat_client()) + server->update_required_client_features(); + + // for debug + if (g_conf()->mds_dump_cache_on_map) + mdcache->dump_cache(); + + cluster_degraded = mdsmap->is_degraded(); + + // mdsmap and oldmap can be discontinuous. failover might happen in the missing mdsmap. 
+ // the 'restart' set tracks ranks that have restarted since the old mdsmap + set<mds_rank_t> restart; + // replaying mds does not communicate with other ranks + if (state >= MDSMap::STATE_RESOLVE) { + // did someone fail? + // new down? + set<mds_rank_t> olddown, down; + oldmap.get_down_mds_set(&olddown); + mdsmap->get_down_mds_set(&down); + for (const auto& r : down) { + if (oldmap.have_inst(r) && olddown.count(r) == 0) { + messenger->mark_down_addrs(oldmap.get_addrs(r)); + handle_mds_failure(r); + } + } + + // did someone fail? + // did their addr/inst change? + set<mds_rank_t> up; + mdsmap->get_up_mds_set(up); + for (const auto& r : up) { + auto& info = mdsmap->get_info(r); + if (oldmap.have_inst(r)) { + auto& oldinfo = oldmap.get_info(r); + if (info.inc != oldinfo.inc) { + messenger->mark_down_addrs(oldinfo.get_addrs()); + if (info.state == MDSMap::STATE_REPLAY || + info.state == MDSMap::STATE_RESOLVE) { + restart.insert(r); + handle_mds_failure(r); + } else { + ceph_assert(info.state == MDSMap::STATE_STARTING || + info.state == MDSMap::STATE_ACTIVE); + // -> stopped (missing) -> starting -> active + restart.insert(r); + mdcache->migrator->handle_mds_failure_or_stop(r); + if (mdsmap->get_tableserver() == whoami) + snapserver->handle_mds_failure_or_stop(r); + } + } + } else { + if (info.state == MDSMap::STATE_REPLAY || + info.state == MDSMap::STATE_RESOLVE) { + // -> starting/creating (missing) -> active (missing) -> replay -> resolve + restart.insert(r); + handle_mds_failure(r); + } else { + ceph_assert(info.state == MDSMap::STATE_CREATING || + info.state == MDSMap::STATE_STARTING || + info.state == MDSMap::STATE_ACTIVE); + } + } + } + } + + // did it change? + if (oldstate != state) { + dout(1) << "handle_mds_map state change " + << ceph_mds_state_name(oldstate) << " --> " + << ceph_mds_state_name(state) << dendl; + beacon.set_want_state(*mdsmap, state); + + if (oldstate == MDSMap::STATE_STANDBY_REPLAY) { + dout(10) << "Monitor activated us! 
Deactivating replay loop" << dendl; + assert (state == MDSMap::STATE_REPLAY); + } else { + // did i just recover? + if ((is_active() || is_clientreplay()) && + (oldstate == MDSMap::STATE_CREATING || + oldstate == MDSMap::STATE_REJOIN || + oldstate == MDSMap::STATE_RECONNECT)) + recovery_done(oldstate); + + if (is_active()) { + active_start(); + } else if (is_any_replay()) { + replay_start(); + } else if (is_resolve()) { + resolve_start(); + } else if (is_reconnect()) { + reconnect_start(); + } else if (is_rejoin()) { + rejoin_start(); + } else if (is_clientreplay()) { + clientreplay_start(); + } else if (is_creating()) { + boot_create(); + } else if (is_starting()) { + boot_start(); + } else if (is_stopping()) { + ceph_assert(oldstate == MDSMap::STATE_ACTIVE); + stopping_start(); + } + } + } + + // RESOLVE + // is someone else newly resolving? + if (state >= MDSMap::STATE_RESOLVE) { + // recover snaptable + if (mdsmap->get_tableserver() == whoami) { + if (oldstate < MDSMap::STATE_RESOLVE) { + set<mds_rank_t> s; + mdsmap->get_mds_set_lower_bound(s, MDSMap::STATE_RESOLVE); + snapserver->finish_recovery(s); + } else { + set<mds_rank_t> old_set, new_set; + oldmap.get_mds_set_lower_bound(old_set, MDSMap::STATE_RESOLVE); + mdsmap->get_mds_set_lower_bound(new_set, MDSMap::STATE_RESOLVE); + for (const auto& r : new_set) { + if (r == whoami) + continue; // not me + if (!old_set.count(r) || restart.count(r)) { // newly so? + snapserver->handle_mds_recovery(r); + } + } + } + } + + if ((!oldmap.is_resolving() || !restart.empty()) && mdsmap->is_resolving()) { + set<mds_rank_t> resolve; + mdsmap->get_mds_set(resolve, MDSMap::STATE_RESOLVE); + dout(10) << " resolve set is " << resolve << dendl; + calc_recovery_set(); + mdcache->send_resolves(); + } + } + + // REJOIN + // is everybody finally rejoining? + if (state >= MDSMap::STATE_REJOIN) { + // did we start? + if (!oldmap.is_rejoining() && mdsmap->is_rejoining()) + rejoin_joint_start(); + + // did we finish? 
+ if (g_conf()->mds_dump_cache_after_rejoin && + oldmap.is_rejoining() && !mdsmap->is_rejoining()) + mdcache->dump_cache(); // for DEBUG only + + if (oldstate >= MDSMap::STATE_REJOIN || + oldstate == MDSMap::STATE_STARTING) { + // ACTIVE|CLIENTREPLAY|REJOIN => we can discover from them. + set<mds_rank_t> olddis, dis; + oldmap.get_mds_set_lower_bound(olddis, MDSMap::STATE_REJOIN); + mdsmap->get_mds_set_lower_bound(dis, MDSMap::STATE_REJOIN); + for (const auto& r : dis) { + if (r == whoami) + continue; // not me + if (!olddis.count(r) || restart.count(r)) { // newly so? + mdcache->kick_discovers(r); + mdcache->kick_open_ino_peers(r); + } + } + } + } + + if (oldmap.is_degraded() && !cluster_degraded && state >= MDSMap::STATE_ACTIVE) { + dout(1) << "cluster recovered." << dendl; + auto it = waiting_for_active_peer.find(MDS_RANK_NONE); + if (it != waiting_for_active_peer.end()) { + queue_waiters(it->second); + waiting_for_active_peer.erase(it); + } + } + + // did someone go active? + if (state >= MDSMap::STATE_CLIENTREPLAY && + oldstate >= MDSMap::STATE_CLIENTREPLAY) { + set<mds_rank_t> oldactive, active; + oldmap.get_mds_set_lower_bound(oldactive, MDSMap::STATE_CLIENTREPLAY); + mdsmap->get_mds_set_lower_bound(active, MDSMap::STATE_CLIENTREPLAY); + for (const auto& r : active) { + if (r == whoami) + continue; // not me + if (!oldactive.count(r) || restart.count(r)) // newly so? + handle_mds_recovery(r); + } + } + + if (is_clientreplay() || is_active() || is_stopping()) { + // did anyone stop? + set<mds_rank_t> oldstopped, stopped; + oldmap.get_stopped_mds_set(oldstopped); + mdsmap->get_stopped_mds_set(stopped); + for (const auto& r : stopped) + if (oldstopped.count(r) == 0) { // newly so? 
+ mdcache->migrator->handle_mds_failure_or_stop(r); + if (mdsmap->get_tableserver() == whoami) + snapserver->handle_mds_failure_or_stop(r); + } + } + + { + map<epoch_t,MDSContext::vec >::iterator p = waiting_for_mdsmap.begin(); + while (p != waiting_for_mdsmap.end() && p->first <= mdsmap->get_epoch()) { + MDSContext::vec ls; + ls.swap(p->second); + waiting_for_mdsmap.erase(p++); + queue_waiters(ls); + } + } + + if (is_active()) { + // Before going active, set OSD epoch barrier to latest (so that + // we don't risk handing out caps to clients with old OSD maps that + // might not include barriers from the previous incarnation of this MDS) + set_osd_epoch_barrier(objecter->with_osdmap( + std::mem_fn(&OSDMap::get_epoch))); + + /* Now check if we should hint to the OSD that a read may follow */ + if (mdsmap->has_standby_replay(whoami)) + mdlog->set_write_iohint(0); + else + mdlog->set_write_iohint(CEPH_OSD_OP_FLAG_FADVISE_DONTNEED); + } + + if (oldmap.get_max_mds() != mdsmap->get_max_mds()) { + purge_queue.update_op_limit(*mdsmap); + } + + if (scrubstack->is_scrubbing()) { + if (mdsmap->get_max_mds() > 1) { + auto c = new C_MDSInternalNoop; + scrubstack->scrub_abort(c); + } + } + mdcache->handle_mdsmap(*mdsmap); +} + +void MDSRank::handle_mds_recovery(mds_rank_t who) +{ + dout(5) << "handle_mds_recovery mds." << who << dendl; + + mdcache->handle_mds_recovery(who); + + queue_waiters(waiting_for_active_peer[who]); + waiting_for_active_peer.erase(who); +} + +void MDSRank::handle_mds_failure(mds_rank_t who) +{ + if (who == whoami) { + dout(5) << "handle_mds_failure for myself; not doing anything" << dendl; + return; + } + dout(5) << "handle_mds_failure mds." 
<< who << dendl; + + mdcache->handle_mds_failure(who); + + if (mdsmap->get_tableserver() == whoami) + snapserver->handle_mds_failure_or_stop(who); + + snapclient->handle_mds_failure(who); +} + +bool MDSRankDispatcher::handle_asok_command(std::string_view command, + const cmdmap_t& cmdmap, + Formatter *f, + std::ostream& ss) +{ + if (command == "dump_ops_in_flight" || + command == "ops") { + if (!op_tracker.dump_ops_in_flight(f)) { + ss << "op_tracker tracking is not enabled now, so no ops are tracked currently, even those get stuck. \ + please enable \"mds_enable_op_tracker\", and the tracker will start to track new ops received afterwards."; + } + } else if (command == "dump_blocked_ops") { + if (!op_tracker.dump_ops_in_flight(f, true)) { + ss << "op_tracker tracking is not enabled now, so no ops are tracked currently, even those get stuck. \ + Please enable \"mds_enable_op_tracker\", and the tracker will start to track new ops received afterwards."; + } + } else if (command == "dump_historic_ops") { + if (!op_tracker.dump_historic_ops(f)) { + ss << "op_tracker tracking is not enabled now, so no ops are tracked currently, even those get stuck. \ + please enable \"mds_enable_op_tracker\", and the tracker will start to track new ops received afterwards."; + } + } else if (command == "dump_historic_ops_by_duration") { + if (!op_tracker.dump_historic_ops(f, true)) { + ss << "op_tracker tracking is not enabled now, so no ops are tracked currently, even those get stuck. 
\ + please enable \"mds_enable_op_tracker\", and the tracker will start to track new ops received afterwards."; + } + } else if (command == "osdmap barrier") { + int64_t target_epoch = 0; + bool got_val = cmd_getval(g_ceph_context, cmdmap, "target_epoch", target_epoch); + + if (!got_val) { + ss << "no target epoch given"; + return true; + } + + mds_lock.Lock(); + set_osd_epoch_barrier(target_epoch); + mds_lock.Unlock(); + + C_SaferCond cond; + bool already_got = objecter->wait_for_map(target_epoch, &cond); + if (!already_got) { + dout(4) << __func__ << ": waiting for OSD epoch " << target_epoch << dendl; + cond.wait(); + } + } else if (command == "session ls") { + std::lock_guard l(mds_lock); + + heartbeat_reset(); + + dump_sessions(SessionFilter(), f); + } else if (command == "session evict") { + std::string client_id; + const bool got_arg = cmd_getval(g_ceph_context, cmdmap, "client_id", client_id); + if(!got_arg) { + ss << "Invalid client_id specified"; + return true; + } + + mds_lock.Lock(); + std::stringstream dss; + bool evicted = evict_client(strtol(client_id.c_str(), 0, 10), true, + g_conf()->mds_session_blacklist_on_evict, dss); + if (!evicted) { + dout(15) << dss.str() << dendl; + ss << dss.str(); + } + mds_lock.Unlock(); + } else if (command == "session config") { + int64_t client_id; + std::string option; + std::string value; + + cmd_getval(g_ceph_context, cmdmap, "client_id", client_id); + cmd_getval(g_ceph_context, cmdmap, "option", option); + bool got_value = cmd_getval(g_ceph_context, cmdmap, "value", value); + + mds_lock.Lock(); + config_client(client_id, !got_value, option, value, ss); + mds_lock.Unlock(); + } else if (command == "scrub_path") { + string path; + vector<string> scrubop_vec; + cmd_getval(g_ceph_context, cmdmap, "scrubops", scrubop_vec); + cmd_getval(g_ceph_context, cmdmap, "path", path); + + /* Multiple MDS scrub is not currently supported. 
See also: https://tracker.ceph.com/issues/12274 */ + if (mdsmap->get_max_mds() > 1) { + ss << "Scrub is not currently supported for multiple active MDS. Please reduce max_mds to 1 and then scrub."; + return true; + } + + C_SaferCond cond; + command_scrub_start(f, path, "", scrubop_vec, &cond); + cond.wait(); + } else if (command == "tag path") { + string path; + cmd_getval(g_ceph_context, cmdmap, "path", path); + string tag; + cmd_getval(g_ceph_context, cmdmap, "tag", tag); + command_tag_path(f, path, tag); + } else if (command == "flush_path") { + string path; + cmd_getval(g_ceph_context, cmdmap, "path", path); + command_flush_path(f, path); + } else if (command == "flush journal") { + command_flush_journal(f); + } else if (command == "get subtrees") { + command_get_subtrees(f); + } else if (command == "export dir") { + string path; + if(!cmd_getval(g_ceph_context, cmdmap, "path", path)) { + ss << "malformed path"; + return true; + } + int64_t rank; + if(!cmd_getval(g_ceph_context, cmdmap, "rank", rank)) { + ss << "malformed rank"; + return true; + } + command_export_dir(f, path, (mds_rank_t)rank); + } else if (command == "dump cache") { + std::lock_guard l(mds_lock); + string path; + int r; + if(!cmd_getval(g_ceph_context, cmdmap, "path", path)) { + r = mdcache->dump_cache(f); + } else { + r = mdcache->dump_cache(path); + } + + if (r != 0) { + ss << "Failed to dump cache: " << cpp_strerror(r); + f->reset(); + } + } else if (command == "cache status") { + std::lock_guard l(mds_lock); + mdcache->cache_status(f); + } else if (command == "dump tree") { + command_dump_tree(cmdmap, ss, f); + } else if (command == "dump loads") { + std::lock_guard l(mds_lock); + int r = balancer->dump_loads(f); + if (r != 0) { + ss << "Failed to dump loads: " << cpp_strerror(r); + f->reset(); + } + } else if (command == "dump snaps") { + std::lock_guard l(mds_lock); + string server; + cmd_getval(g_ceph_context, cmdmap, "server", server); + if (server == "--server") { + if 
(mdsmap->get_tableserver() == whoami) { + snapserver->dump(f); + } else { + ss << "Not snapserver"; + } + } else { + int r = snapclient->dump_cache(f); + if (r != 0) { + ss << "Failed to dump snapclient: " << cpp_strerror(r); + f->reset(); + } + } + } else if (command == "force_readonly") { + std::lock_guard l(mds_lock); + mdcache->force_readonly(); + } else if (command == "dirfrag split") { + command_dirfrag_split(cmdmap, ss); + } else if (command == "dirfrag merge") { + command_dirfrag_merge(cmdmap, ss); + } else if (command == "dirfrag ls") { + command_dirfrag_ls(cmdmap, ss, f); + } else if (command == "openfiles ls") { + command_openfiles_ls(f); + } else if (command == "dump inode") { + command_dump_inode(f, cmdmap, ss); + } else { + return false; + } + + return true; +} + +class C_MDS_Send_Command_Reply : public MDSInternalContext { +protected: + MCommand::const_ref m; +public: + C_MDS_Send_Command_Reply(MDSRank *_mds, const MCommand::const_ref &_m) : + MDSInternalContext(_mds), m(_m) {} + + void send(int r, std::string_view ss) { + std::stringstream ds; + send(r, ss, ds); + } + + void send(int r, std::string_view ss, std::stringstream &ds) { + bufferlist bl; + bl.append(ds); + MDSDaemon::send_command_reply(m, mds, r, bl, ss); + } + + void finish(int r) override { + send(r, ""); + } +}; + +class C_ExecAndReply : public C_MDS_Send_Command_Reply { +public: + C_ExecAndReply(MDSRank *mds, const MCommand::const_ref &m) + : C_MDS_Send_Command_Reply(mds, m), f(true) { + } + + void finish(int r) override { + std::stringstream ds; + std::stringstream ss; + if (r != 0) { + f.flush(ss); + } else { + f.flush(ds); + } + + send(r, ss.str(), ds); + } + + virtual void exec() = 0; + +protected: + JSONFormatter f; +}; + +class C_CacheDropExecAndReply : public C_ExecAndReply { +public: + C_CacheDropExecAndReply(MDSRank *mds, const MCommand::const_ref &m, + uint64_t timeout) + : C_ExecAndReply(mds, m), timeout(timeout) { + } + + void exec() override { + 
mds->command_cache_drop(timeout, &f, this); + } + +private: + uint64_t timeout; +}; + +class C_ScrubExecAndReply : public C_ExecAndReply { +public: + C_ScrubExecAndReply(MDSRank *mds, const MCommand::const_ref &m, + const std::string &path, const std::string &tag, + const std::vector<std::string> &scrubop) + : C_ExecAndReply(mds, m), path(path), tag(tag), scrubop(scrubop) { + } + + void exec() override { + mds->command_scrub_start(&f, path, tag, scrubop, this); + } + +private: + std::string path; + std::string tag; + std::vector<std::string> scrubop; +}; + +class C_ScrubControlExecAndReply : public C_ExecAndReply { +public: + C_ScrubControlExecAndReply(MDSRank *mds, const MCommand::const_ref &m, + const std::string &command) + : C_ExecAndReply(mds, m), command(command) { + } + + void exec() override { + if (command == "abort") { + mds->command_scrub_abort(&f, this); + } else if (command == "pause") { + mds->command_scrub_pause(&f, this); + } else { + ceph_abort(); + } + } + + void finish(int r) override { + f.open_object_section("result"); + f.dump_int("return_code", r); + f.close_section(); + C_ExecAndReply::finish(r); + } + +private: + std::string command; +}; + +/** + * This function drops the mds_lock, so don't do anything with + * MDSRank after calling it (we could have gone into shutdown): just + * send your result back to the calling client and finish. 
+ */ +void MDSRankDispatcher::evict_clients(const SessionFilter &filter, const MCommand::const_ref &m) +{ + C_MDS_Send_Command_Reply *reply = new C_MDS_Send_Command_Reply(this, m); + + if (is_any_replay()) { + reply->send(-EAGAIN, "MDS is replaying log"); + delete reply; + return; + } + + std::vector<Session*> victims; + const auto& sessions = sessionmap.get_sessions(); + for (const auto& p : sessions) { + if (!p.first.is_client()) { + continue; + } + + Session *s = p.second; + + if (filter.match(*s, std::bind(&Server::waiting_for_reconnect, server, std::placeholders::_1))) { + victims.push_back(s); + } + } + + dout(20) << __func__ << " matched " << victims.size() << " sessions" << dendl; + + if (victims.empty()) { + reply->send(0, ""); + delete reply; + return; + } + + C_GatherBuilder gather(g_ceph_context, reply); + for (const auto s : victims) { + std::stringstream ss; + evict_client(s->get_client().v, false, + g_conf()->mds_session_blacklist_on_evict, ss, gather.new_sub()); + } + gather.activate(); +} + +void MDSRankDispatcher::dump_sessions(const SessionFilter &filter, Formatter *f) const +{ + // Dump sessions, decorated with recovery/replay status + f->open_array_section("sessions"); + for (auto& [name, s] : sessionmap.get_sessions()) { + if (!name.is_client()) { + continue; + } + + if (!filter.match(*s, std::bind(&Server::waiting_for_reconnect, server, std::placeholders::_1))) { + continue; + } + + f->dump_object("session", *s); + } + f->close_section(); // sessions +} + +void MDSRank::command_scrub_start(Formatter *f, + std::string_view path, std::string_view tag, + const vector<string>& scrubop_vec, Context *on_finish) +{ + bool force = false; + bool recursive = false; + bool repair = false; + for (auto &op : scrubop_vec) { + if (op == "force") + force = true; + else if (op == "recursive") + recursive = true; + else if (op == "repair") + repair = true; + } + + std::lock_guard l(mds_lock); + mdcache->enqueue_scrub(path, tag, force, recursive, repair, f, 
on_finish); + // scrub_dentry() finishers will dump the data for us; we're done! +} + +void MDSRank::command_tag_path(Formatter *f, + std::string_view path, std::string_view tag) +{ + C_SaferCond scond; + { + std::lock_guard l(mds_lock); + mdcache->enqueue_scrub(path, tag, true, true, false, f, &scond); + } + scond.wait(); +} + +void MDSRank::command_scrub_abort(Formatter *f, Context *on_finish) { + std::lock_guard l(mds_lock); + scrubstack->scrub_abort(on_finish); +} + +void MDSRank::command_scrub_pause(Formatter *f, Context *on_finish) { + std::lock_guard l(mds_lock); + scrubstack->scrub_pause(on_finish); +} + +void MDSRank::command_scrub_resume(Formatter *f) { + int r = scrubstack->scrub_resume(); + + f->open_object_section("result"); + f->dump_int("return_code", r); + f->close_section(); +} + +void MDSRank::command_scrub_status(Formatter *f) { + scrubstack->scrub_status(f); +} + +void MDSRank::command_flush_path(Formatter *f, std::string_view path) +{ + C_SaferCond scond; + { + std::lock_guard l(mds_lock); + mdcache->flush_dentry(path, &scond); + } + int r = scond.wait(); + f->open_object_section("results"); + f->dump_int("return_code", r); + f->close_section(); // results +} + +// synchronous wrapper around "journal flush" asynchronous context +// execution. 
+void MDSRank::command_flush_journal(Formatter *f) { + ceph_assert(f != NULL); + + C_SaferCond cond; + std::stringstream ss; + { + std::lock_guard locker(mds_lock); + C_Flush_Journal *flush_journal = new C_Flush_Journal(mdcache, mdlog, this, &ss, &cond); + flush_journal->send(); + } + int r = cond.wait(); + + f->open_object_section("result"); + f->dump_string("message", ss.str()); + f->dump_int("return_code", r); + f->close_section(); +} + +void MDSRank::command_get_subtrees(Formatter *f) +{ + ceph_assert(f != NULL); + std::lock_guard l(mds_lock); + + std::vector<CDir*> subtrees; + mdcache->get_subtrees(subtrees); + + f->open_array_section("subtrees"); + for (const auto& dir : subtrees) { + f->open_object_section("subtree"); + { + f->dump_bool("is_auth", dir->is_auth()); + f->dump_int("auth_first", dir->get_dir_auth().first); + f->dump_int("auth_second", dir->get_dir_auth().second); + f->dump_int("export_pin", dir->inode->get_export_pin()); + f->open_object_section("dir"); + dir->dump(f); + f->close_section(); + } + f->close_section(); + } + f->close_section(); +} + + +void MDSRank::command_export_dir(Formatter *f, + std::string_view path, + mds_rank_t target) +{ + int r = _command_export_dir(path, target); + f->open_object_section("results"); + f->dump_int("return_code", r); + f->close_section(); // results +} + +int MDSRank::_command_export_dir( + std::string_view path, + mds_rank_t target) +{ + std::lock_guard l(mds_lock); + filepath fp(path); + + if (target == whoami || !mdsmap->is_up(target) || !mdsmap->is_in(target)) { + derr << "bad MDS target " << target << dendl; + return -ENOENT; + } + + CInode *in = mdcache->cache_traverse(fp); + if (!in) { + derr << "Bath path '" << path << "'" << dendl; + return -ENOENT; + } + CDir *dir = in->get_dirfrag(frag_t()); + if (!dir || !(dir->is_auth())) { + derr << "bad export_dir path dirfrag frag_t() or dir not auth" << dendl; + return -EINVAL; + } + + mdcache->migrator->export_dir(dir, target); + return 0; +} + +void 
MDSRank::command_dump_tree(const cmdmap_t &cmdmap, std::ostream &ss, Formatter *f) +{ + std::string root; + int64_t depth; + cmd_getval(g_ceph_context, cmdmap, "root", root); + if (!cmd_getval(g_ceph_context, cmdmap, "depth", depth)) + depth = -1; + std::lock_guard l(mds_lock); + CInode *in = mdcache->cache_traverse(filepath(root.c_str())); + if (!in) { + ss << "root inode is not in cache"; + return; + } + f->open_array_section("inodes"); + mdcache->dump_tree(in, 0, depth, f); + f->close_section(); +} + +CDir *MDSRank::_command_dirfrag_get( + const cmdmap_t &cmdmap, + std::ostream &ss) +{ + std::string path; + bool got = cmd_getval(g_ceph_context, cmdmap, "path", path); + if (!got) { + ss << "missing path argument"; + return NULL; + } + + std::string frag_str; + if (!cmd_getval(g_ceph_context, cmdmap, "frag", frag_str)) { + ss << "missing frag argument"; + return NULL; + } + + CInode *in = mdcache->cache_traverse(filepath(path.c_str())); + if (!in) { + // TODO really we should load something in if it's not in cache, + // but the infrastructure is harder, and we might still be unable + // to act on it if someone else is auth. 
+ ss << "directory '" << path << "' inode not in cache"; + return NULL; + } + + frag_t fg; + + if (!fg.parse(frag_str.c_str())) { + ss << "frag " << frag_str << " failed to parse"; + return NULL; + } + + CDir *dir = in->get_dirfrag(fg); + if (!dir) { + ss << "frag " << in->ino() << "/" << fg << " not in cache (" + "use `dirfrag ls` to see if it should exist)"; + return NULL; + } + + if (!dir->is_auth()) { + ss << "frag " << dir->dirfrag() << " not auth (auth = " + << dir->authority() << ")"; + return NULL; + } + + return dir; +} + +bool MDSRank::command_dirfrag_split( + cmdmap_t cmdmap, + std::ostream &ss) +{ + std::lock_guard l(mds_lock); + int64_t by = 0; + if (!cmd_getval(g_ceph_context, cmdmap, "bits", by)) { + ss << "missing bits argument"; + return false; + } + + if (by <= 0) { + ss << "must split by >0 bits"; + return false; + } + + CDir *dir = _command_dirfrag_get(cmdmap, ss); + if (!dir) { + return false; + } + + mdcache->split_dir(dir, by); + + return true; +} + +bool MDSRank::command_dirfrag_merge( + cmdmap_t cmdmap, + std::ostream &ss) +{ + std::lock_guard l(mds_lock); + std::string path; + bool got = cmd_getval(g_ceph_context, cmdmap, "path", path); + if (!got) { + ss << "missing path argument"; + return false; + } + + std::string frag_str; + if (!cmd_getval(g_ceph_context, cmdmap, "frag", frag_str)) { + ss << "missing frag argument"; + return false; + } + + CInode *in = mdcache->cache_traverse(filepath(path.c_str())); + if (!in) { + ss << "directory '" << path << "' inode not in cache"; + return false; + } + + frag_t fg; + if (!fg.parse(frag_str.c_str())) { + ss << "frag " << frag_str << " failed to parse"; + return false; + } + + mdcache->merge_dir(in, fg); + + return true; +} + +bool MDSRank::command_dirfrag_ls( + cmdmap_t cmdmap, + std::ostream &ss, + Formatter *f) +{ + std::lock_guard l(mds_lock); + std::string path; + bool got = cmd_getval(g_ceph_context, cmdmap, "path", path); + if (!got) { + ss << "missing path argument"; + return false; + } + 
+ CInode *in = mdcache->cache_traverse(filepath(path.c_str())); + if (!in) { + ss << "directory inode not in cache"; + return false; + } + + f->open_array_section("frags"); + frag_vec_t leaves; + // NB using get_leaves_under instead of get_dirfrags to give + // you the list of what dirfrags may exist, not which are in cache + in->dirfragtree.get_leaves_under(frag_t(), leaves); + for (const auto& leaf : leaves) { + f->open_object_section("frag"); + f->dump_int("value", leaf.value()); + f->dump_int("bits", leaf.bits()); + CachedStackStringStream css; + *css << std::hex << leaf.value() << "/" << std::dec << leaf.bits(); + f->dump_string("str", css->strv()); + f->close_section(); + } + f->close_section(); + + return true; +} + +void MDSRank::command_openfiles_ls(Formatter *f) +{ + std::lock_guard l(mds_lock); + mdcache->dump_openfiles(f); +} + +void MDSRank::command_dump_inode(Formatter *f, const cmdmap_t &cmdmap, std::ostream &ss) +{ + std::lock_guard l(mds_lock); + int64_t number; + bool got = cmd_getval(g_ceph_context, cmdmap, "number", number); + if (!got) { + ss << "missing inode number"; + return; + } + + bool success = mdcache->dump_inode(f, number); + if (!success) { + ss << "dump inode failed, wrong inode number or the inode is not cached"; + } +} + +void MDSRank::dump_status(Formatter *f) const +{ + if (state == MDSMap::STATE_REPLAY || + state == MDSMap::STATE_STANDBY_REPLAY) { + mdlog->dump_replay_status(f); + } else if (state == MDSMap::STATE_RESOLVE) { + mdcache->dump_resolve_status(f); + } else if (state == MDSMap::STATE_RECONNECT) { + server->dump_reconnect_status(f); + } else if (state == MDSMap::STATE_REJOIN) { + mdcache->dump_rejoin_status(f); + } else if (state == MDSMap::STATE_CLIENTREPLAY) { + dump_clientreplay_status(f); + } + f->dump_float("rank_uptime", get_uptime().count()); +} + +void MDSRank::dump_clientreplay_status(Formatter *f) const +{ + f->open_object_section("clientreplay_status"); + f->dump_unsigned("clientreplay_queue", 
replay_queue.size()); + f->dump_unsigned("active_replay", mdcache->get_num_client_requests()); + f->close_section(); +} + +void MDSRankDispatcher::update_log_config() +{ + map<string,string> log_to_monitors; + map<string,string> log_to_syslog; + map<string,string> log_channel; + map<string,string> log_prio; + map<string,string> log_to_graylog; + map<string,string> log_to_graylog_host; + map<string,string> log_to_graylog_port; + uuid_d fsid; + string host; + + if (parse_log_client_options(g_ceph_context, log_to_monitors, log_to_syslog, + log_channel, log_prio, log_to_graylog, + log_to_graylog_host, log_to_graylog_port, + fsid, host) == 0) + clog->update_config(log_to_monitors, log_to_syslog, + log_channel, log_prio, log_to_graylog, + log_to_graylog_host, log_to_graylog_port, + fsid, host); + dout(10) << __func__ << " log_to_monitors " << log_to_monitors << dendl; +} + +void MDSRank::create_logger() +{ + dout(10) << "create_logger" << dendl; + { + PerfCountersBuilder mds_plb(g_ceph_context, "mds", l_mds_first, l_mds_last); + + // super useful (high prio) perf stats + mds_plb.add_u64_counter(l_mds_request, "request", "Requests", "req", + PerfCountersBuilder::PRIO_CRITICAL); + mds_plb.add_time_avg(l_mds_reply_latency, "reply_latency", "Reply latency", "rlat", + PerfCountersBuilder::PRIO_CRITICAL); + mds_plb.add_u64(l_mds_inodes, "inodes", "Inodes", "inos", + PerfCountersBuilder::PRIO_CRITICAL); + mds_plb.add_u64_counter(l_mds_forward, "forward", "Forwarding request", "fwd", + PerfCountersBuilder::PRIO_INTERESTING); + mds_plb.add_u64(l_mds_caps, "caps", "Capabilities", "caps", + PerfCountersBuilder::PRIO_INTERESTING); + mds_plb.add_u64_counter(l_mds_exported_inodes, "exported_inodes", "Exported inodes", + "exi", PerfCountersBuilder::PRIO_INTERESTING); + mds_plb.add_u64_counter(l_mds_imported_inodes, "imported_inodes", "Imported inodes", + "imi", PerfCountersBuilder::PRIO_INTERESTING); + + // useful dir/inode/subtree stats + 
mds_plb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL); + mds_plb.add_u64(l_mds_root_rfiles, "root_rfiles", "root inode rfiles"); + mds_plb.add_u64(l_mds_root_rbytes, "root_rbytes", "root inode rbytes"); + mds_plb.add_u64(l_mds_root_rsnaps, "root_rsnaps", "root inode rsnaps"); + mds_plb.add_u64_counter(l_mds_dir_fetch, "dir_fetch", "Directory fetch"); + mds_plb.add_u64_counter(l_mds_dir_commit, "dir_commit", "Directory commit"); + mds_plb.add_u64_counter(l_mds_dir_split, "dir_split", "Directory split"); + mds_plb.add_u64_counter(l_mds_dir_merge, "dir_merge", "Directory merge"); + mds_plb.add_u64(l_mds_inode_max, "inode_max", "Max inodes, cache size"); + mds_plb.add_u64(l_mds_inodes_pinned, "inodes_pinned", "Inodes pinned"); + mds_plb.add_u64(l_mds_inodes_expired, "inodes_expired", "Inodes expired"); + mds_plb.add_u64(l_mds_inodes_with_caps, "inodes_with_caps", + "Inodes with capabilities"); + mds_plb.add_u64(l_mds_subtrees, "subtrees", "Subtrees"); + mds_plb.add_u64(l_mds_load_cent, "load_cent", "Load per cent"); + mds_plb.add_u64_counter(l_mds_openino_dir_fetch, "openino_dir_fetch", + "OpenIno incomplete directory fetchings"); + + // low prio stats + mds_plb.set_prio_default(PerfCountersBuilder::PRIO_DEBUGONLY); + mds_plb.add_u64_counter(l_mds_reply, "reply", "Replies"); + mds_plb.add_u64(l_mds_inodes_top, "inodes_top", "Inodes on top"); + mds_plb.add_u64(l_mds_inodes_bottom, "inodes_bottom", "Inodes on bottom"); + mds_plb.add_u64( + l_mds_inodes_pin_tail, "inodes_pin_tail", "Inodes on pin tail"); + mds_plb.add_u64_counter(l_mds_traverse, "traverse", "Traverses"); + mds_plb.add_u64_counter(l_mds_traverse_hit, "traverse_hit", "Traverse hits"); + mds_plb.add_u64_counter(l_mds_traverse_forward, "traverse_forward", + "Traverse forwards"); + mds_plb.add_u64_counter(l_mds_traverse_discover, "traverse_discover", + "Traverse directory discovers"); + mds_plb.add_u64_counter(l_mds_traverse_dir_fetch, "traverse_dir_fetch", + "Traverse incomplete directory content 
fetchings"); + mds_plb.add_u64_counter(l_mds_traverse_remote_ino, "traverse_remote_ino", + "Traverse remote dentries"); + mds_plb.add_u64_counter(l_mds_traverse_lock, "traverse_lock", + "Traverse locks"); + mds_plb.add_u64(l_mds_dispatch_queue_len, "q", "Dispatch queue length"); + mds_plb.add_u64_counter(l_mds_exported, "exported", "Exports"); + mds_plb.add_u64_counter(l_mds_imported, "imported", "Imports"); + mds_plb.add_u64_counter(l_mds_openino_backtrace_fetch, "openino_backtrace_fetch", + "OpenIno backtrace fetchings"); + mds_plb.add_u64_counter(l_mds_openino_peer_discover, "openino_peer_discover", + "OpenIno peer inode discovers"); + + logger = mds_plb.create_perf_counters(); + g_ceph_context->get_perfcounters_collection()->add(logger); + } + + { + PerfCountersBuilder mdm_plb(g_ceph_context, "mds_mem", l_mdm_first, l_mdm_last); + mdm_plb.add_u64(l_mdm_ino, "ino", "Inodes", "ino", + PerfCountersBuilder::PRIO_INTERESTING); + mdm_plb.add_u64(l_mdm_dn, "dn", "Dentries", "dn", + PerfCountersBuilder::PRIO_INTERESTING); + + mdm_plb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL); + mdm_plb.add_u64_counter(l_mdm_inoa, "ino+", "Inodes opened"); + mdm_plb.add_u64_counter(l_mdm_inos, "ino-", "Inodes closed"); + mdm_plb.add_u64(l_mdm_dir, "dir", "Directories"); + mdm_plb.add_u64_counter(l_mdm_dira, "dir+", "Directories opened"); + mdm_plb.add_u64_counter(l_mdm_dirs, "dir-", "Directories closed"); + mdm_plb.add_u64_counter(l_mdm_dna, "dn+", "Dentries opened"); + mdm_plb.add_u64_counter(l_mdm_dns, "dn-", "Dentries closed"); + mdm_plb.add_u64(l_mdm_cap, "cap", "Capabilities"); + mdm_plb.add_u64_counter(l_mdm_capa, "cap+", "Capabilities added"); + mdm_plb.add_u64_counter(l_mdm_caps, "cap-", "Capabilities removed"); + mdm_plb.add_u64(l_mdm_heap, "heap", "Heap size"); + + mdm_plb.set_prio_default(PerfCountersBuilder::PRIO_DEBUGONLY); + mdm_plb.add_u64(l_mdm_rss, "rss", "RSS"); + + mlogger = mdm_plb.create_perf_counters(); + 
g_ceph_context->get_perfcounters_collection()->add(mlogger); + } + + mdlog->create_logger(); + server->create_logger(); + purge_queue.create_logger(); + sessionmap.register_perfcounters(); + mdcache->register_perfcounters(); +} + +void MDSRank::check_ops_in_flight() +{ + string summary; + vector<string> warnings; + int slow = 0; + if (op_tracker.check_ops_in_flight(&summary, warnings, &slow)) { + clog->warn() << summary; + for (const auto& warning : warnings) { + clog->warn() << warning; + } + } + + // set mds slow request count + mds_slow_req_count = slow; + return; +} + +void MDSRankDispatcher::handle_osd_map() +{ + if (is_active() && + mdsmap->get_tableserver() == whoami) { + snapserver->check_osd_map(true); + } + + server->handle_osd_map(); + + purge_queue.update_op_limit(*mdsmap); + + std::set<entity_addr_t> newly_blacklisted; + objecter->consume_blacklist_events(&newly_blacklisted); + auto epoch = objecter->with_osdmap([](const OSDMap &o){return o.get_epoch();}); + dout(4) << "handle_osd_map epoch " << epoch << ", " + << newly_blacklisted.size() << " new blacklist entries" << dendl; + auto victims = server->apply_blacklist(newly_blacklisted); + if (victims) { + set_osd_epoch_barrier(epoch); + } + + + // By default the objecter only requests OSDMap updates on use, + // we would like to always receive the latest maps in order to + // apply policy based on the FULL flag. 
+ objecter->maybe_request_map(); +} + +int MDSRank::config_client(int64_t session_id, bool remove, + const std::string& option, const std::string& value, + std::ostream& ss) +{ + Session *session = sessionmap.get_session(entity_name_t(CEPH_ENTITY_TYPE_CLIENT, session_id)); + if (!session) { + ss << "session " << session_id << " not in sessionmap!"; + return -ENOENT; + } + + if (option == "timeout") { + if (remove) { + auto it = session->info.client_metadata.find("timeout"); + if (it == session->info.client_metadata.end()) { + ss << "Nonexistent config: " << option; + return -ENODATA; + } + session->info.client_metadata.erase(it); + } else { + char *end; + strtoul(value.c_str(), &end, 0); + if (*end) { + ss << "Invalid config for timeout: " << value; + return -EINVAL; + } + session->info.client_metadata[option] = value; + } + //sessionmap._mark_dirty(session, true); + } else { + ss << "Invalid config option: " << option; + return -EINVAL; + } + + return 0; +} + +bool MDSRank::evict_client(int64_t session_id, + bool wait, bool blacklist, std::ostream& err_ss, + Context *on_killed) +{ + ceph_assert(mds_lock.is_locked_by_me()); + + // Mutually exclusive args + ceph_assert(!(wait && on_killed != nullptr)); + + if (is_any_replay()) { + err_ss << "MDS is replaying log"; + return false; + } + + Session *session = sessionmap.get_session( + entity_name_t(CEPH_ENTITY_TYPE_CLIENT, session_id)); + if (!session) { + err_ss << "session " << session_id << " not in sessionmap!"; + return false; + } + + auto& addr = session->info.inst.addr; + { + CachedStackStringStream css; + *css << "Evicting " << (blacklist ? "(and blacklisting) " : "") + << "client session " << session_id << " (" << addr << ")"; + dout(1) << css->strv() << dendl; + clog->info() << css->strv(); + } + + dout(4) << "Preparing blacklist command... 
(wait=" << wait << ")" << dendl; + stringstream ss; + ss << "{\"prefix\":\"osd blacklist\", \"blacklistop\":\"add\","; + ss << "\"addr\":\""; + ss << addr; + ss << "\"}"; + std::string tmp = ss.str(); + std::vector<std::string> cmd = {tmp}; + + auto kill_client_session = [this, session_id, wait, on_killed](){ + ceph_assert(mds_lock.is_locked_by_me()); + Session *session = sessionmap.get_session( + entity_name_t(CEPH_ENTITY_TYPE_CLIENT, session_id)); + if (session) { + if (on_killed || !wait) { + server->kill_session(session, on_killed); + } else { + C_SaferCond on_safe; + server->kill_session(session, &on_safe); + + mds_lock.Unlock(); + on_safe.wait(); + mds_lock.Lock(); + } + } else { + dout(1) << "session " << session_id << " was removed while we waited " + "for blacklist" << dendl; + + // Even though it wasn't us that removed it, kick our completion + // as the session has been removed. + if (on_killed) { + on_killed->complete(0); + } + } + }; + + auto apply_blacklist = [this, cmd](std::function<void ()> fn){ + ceph_assert(mds_lock.is_locked_by_me()); + + Context *on_blacklist_done = new FunctionContext([this, fn](int r) { + objecter->wait_for_latest_osdmap( + new C_OnFinisher( + new FunctionContext([this, fn](int r) { + std::lock_guard l(mds_lock); + auto epoch = objecter->with_osdmap([](const OSDMap &o){ + return o.get_epoch(); + }); + + set_osd_epoch_barrier(epoch); + + fn(); + }), finisher) + ); + }); + + dout(4) << "Sending mon blacklist command: " << cmd[0] << dendl; + monc->start_mon_command(cmd, {}, nullptr, nullptr, on_blacklist_done); + }; + + if (wait) { + if (blacklist) { + C_SaferCond inline_ctx; + apply_blacklist([&inline_ctx](){inline_ctx.complete(0);}); + mds_lock.Unlock(); + inline_ctx.wait(); + mds_lock.Lock(); + } + + // We dropped mds_lock, so check that session still exists + session = sessionmap.get_session(entity_name_t(CEPH_ENTITY_TYPE_CLIENT, + session_id)); + if (!session) { + dout(1) << "session " << session_id << " was removed while 
we waited " + "for blacklist" << dendl; + return true; + } + kill_client_session(); + } else { + if (blacklist) { + apply_blacklist(kill_client_session); + } else { + kill_client_session(); + } + } + + return true; +} + +void MDSRank::bcast_mds_map() +{ + dout(7) << "bcast_mds_map " << mdsmap->get_epoch() << dendl; + + // share the map with mounted clients + set<Session*> clients; + sessionmap.get_client_session_set(clients); + for (const auto &session : clients) { + auto m = MMDSMap::create(monc->get_fsid(), *mdsmap); + session->get_connection()->send_message2(std::move(m)); + } + last_client_mdsmap_bcast = mdsmap->get_epoch(); +} + +Context *MDSRank::create_async_exec_context(C_ExecAndReply *ctx) { + return new C_OnFinisher(new FunctionContext([ctx](int _) { + ctx->exec(); + }), finisher); +} + +MDSRankDispatcher::MDSRankDispatcher( + mds_rank_t whoami_, + Mutex &mds_lock_, + LogChannelRef &clog_, + SafeTimer &timer_, + Beacon &beacon_, + std::unique_ptr<MDSMap> &mdsmap_, + Messenger *msgr, + MonClient *monc_, + MgrClient *mgrc, + Context *respawn_hook_, + Context *suicide_hook_) + : MDSRank(whoami_, mds_lock_, clog_, timer_, beacon_, mdsmap_, + msgr, monc_, mgrc, respawn_hook_, suicide_hook_) +{ + g_conf().add_observer(this); +} + +bool MDSRankDispatcher::handle_command( + const cmdmap_t &cmdmap, + const MCommand::const_ref &m, + int *r, + std::stringstream *ds, + std::stringstream *ss, + Context **run_later, + bool *need_reply) +{ + ceph_assert(r != nullptr); + ceph_assert(ds != nullptr); + ceph_assert(ss != nullptr); + + *need_reply = true; + + std::string prefix; + cmd_getval(g_ceph_context, cmdmap, "prefix", prefix); + + if (prefix == "session ls" || prefix == "client ls") { + std::vector<std::string> filter_args; + cmd_getval(g_ceph_context, cmdmap, "filters", filter_args); + + SessionFilter filter; + *r = filter.parse(filter_args, ss); + if (*r != 0) { + return true; + } + + JSONFormatter f(true); + dump_sessions(filter, &f); + f.flush(*ds); + return true; 
+ } else if (prefix == "session evict" || prefix == "client evict") { + std::vector<std::string> filter_args; + cmd_getval(g_ceph_context, cmdmap, "filters", filter_args); + + SessionFilter filter; + *r = filter.parse(filter_args, ss); + if (*r != 0) { + return true; + } + + evict_clients(filter, m); + + *need_reply = false; + return true; + } else if (prefix == "session config" || prefix == "client config") { + int64_t client_id; + std::string option; + std::string value; + + cmd_getval(g_ceph_context, cmdmap, "client_id", client_id); + cmd_getval(g_ceph_context, cmdmap, "option", option); + bool got_value = cmd_getval(g_ceph_context, cmdmap, "value", value); + + *r = config_client(client_id, !got_value, option, value, *ss); + return true; + } else if (prefix == "damage ls") { + JSONFormatter f(true); + damage_table.dump(&f); + f.flush(*ds); + return true; + } else if (prefix == "damage rm") { + damage_entry_id_t id = 0; + bool got = cmd_getval(g_ceph_context, cmdmap, "damage_id", (int64_t&)id); + if (!got) { + *r = -EINVAL; + return true; + } + + damage_table.erase(id); + return true; + } else if (prefix == "cache drop") { + int64_t timeout; + if (!cmd_getval(g_ceph_context, cmdmap, "timeout", timeout)) { + timeout = 0; + } + + *need_reply = false; + *run_later = create_async_exec_context(new C_CacheDropExecAndReply + (this, m, (uint64_t)timeout)); + return true; + } else if (prefix == "scrub start") { + string path; + string tag; + vector<string> scrubop_vec; + cmd_getval(g_ceph_context, cmdmap, "scrubops", scrubop_vec); + cmd_getval(g_ceph_context, cmdmap, "path", path); + cmd_getval(g_ceph_context, cmdmap, "tag", tag); + + /* Multiple MDS scrub is not currently supported. See also: https://tracker.ceph.com/issues/12274 */ + if (mdsmap->get_max_mds() > 1) { + *ss << "Scrub is not currently supported for multiple active MDS. 
Please reduce max_mds to 1 and then scrub."; + *r = ENOTSUP; + return true; + } + + *need_reply = false; + *run_later = create_async_exec_context(new C_ScrubExecAndReply + (this, m, path, tag, scrubop_vec)); + return true; + } else if (prefix == "scrub abort") { + *need_reply = false; + *run_later = create_async_exec_context(new C_ScrubControlExecAndReply + (this, m, "abort")); + return true; + } else if (prefix == "scrub pause") { + *need_reply = false; + *run_later = create_async_exec_context(new C_ScrubControlExecAndReply + (this, m, "pause")); + return true; + } else if (prefix == "scrub resume") { + JSONFormatter f(true); + command_scrub_resume(&f); + f.flush(*ds); + return true; + } else if (prefix == "scrub status") { + JSONFormatter f(true); + command_scrub_status(&f); + f.flush(*ds); + return true; + } else { + return false; + } +} + +void MDSRank::command_cache_drop(uint64_t timeout, Formatter *f, Context *on_finish) { + dout(20) << __func__ << dendl; + + std::lock_guard locker(mds_lock); + C_Drop_Cache *request = new C_Drop_Cache(server, mdcache, mdlog, this, + timeout, f, on_finish); + request->send(); +} + +epoch_t MDSRank::get_osd_epoch() const +{ + return objecter->with_osdmap(std::mem_fn(&OSDMap::get_epoch)); +} + +const char** MDSRankDispatcher::get_tracked_conf_keys() const +{ + static const char* KEYS[] = { + "clog_to_graylog", + "clog_to_graylog_host", + "clog_to_graylog_port", + "clog_to_monitors", + "clog_to_syslog", + "clog_to_syslog_facility", + "clog_to_syslog_level", + "fsid", + "host", + "mds_bal_fragment_dirs", + "mds_bal_fragment_interval", + "mds_cache_memory_limit", + "mds_cache_mid", + "mds_cache_reservation", + "mds_cache_size", + "mds_cache_trim_decay_rate", + "mds_cap_revoke_eviction_timeout", + "mds_dump_cache_threshold_file", + "mds_dump_cache_threshold_formatter", + "mds_enable_op_tracker", + "mds_health_cache_threshold", + "mds_inject_migrator_session_race", + "mds_log_pause", + "mds_max_export_size", + "mds_max_purge_files", 
+ "mds_forward_all_requests_to_auth", + "mds_max_purge_ops", + "mds_max_purge_ops_per_pg", + "mds_max_snaps_per_dir", + "mds_op_complaint_time", + "mds_op_history_duration", + "mds_op_history_size", + "mds_op_log_threshold", + "mds_recall_max_decay_rate", + "mds_recall_warning_decay_rate", + "mds_request_load_average_decay_rate", + "mds_session_cache_liveness_decay_rate", + "mds_replay_unsafe_with_closed_session", + "mds_session_cap_acquisition_decay_rate", + "mds_max_caps_per_client", + "mds_session_cap_acquisition_throttle", + "mds_session_max_caps_throttle_ratio", + "mds_cap_acquisition_throttle_retry_request_time", + NULL + }; + return KEYS; +} + +void MDSRankDispatcher::handle_conf_change(const ConfigProxy& conf, const std::set<std::string>& changed) +{ + // XXX with or without mds_lock! + + if (changed.count("mds_op_complaint_time") || changed.count("mds_op_log_threshold")) { + op_tracker.set_complaint_and_threshold(conf->mds_op_complaint_time, conf->mds_op_log_threshold); + } + if (changed.count("mds_op_history_size") || changed.count("mds_op_history_duration")) { + op_tracker.set_history_size_and_duration(conf->mds_op_history_size, conf->mds_op_history_duration); + } + if (changed.count("mds_enable_op_tracker")) { + op_tracker.set_tracking(conf->mds_enable_op_tracker); + } + if (changed.count("clog_to_monitors") || + changed.count("clog_to_syslog") || + changed.count("clog_to_syslog_level") || + changed.count("clog_to_syslog_facility") || + changed.count("clog_to_graylog") || + changed.count("clog_to_graylog_host") || + changed.count("clog_to_graylog_port") || + changed.count("host") || + changed.count("fsid")) { + update_log_config(); + } + + finisher->queue(new FunctionContext([this, changed](int r) { + std::scoped_lock lock(mds_lock); + + if (changed.count("mds_log_pause") && !g_conf()->mds_log_pause) { + mdlog->kick_submitter(); + } + sessionmap.handle_conf_change(changed); + server->handle_conf_change(changed); + mdcache->handle_conf_change(changed, 
*mdsmap); + purge_queue.handle_conf_change(changed, *mdsmap); + })); +} + +void MDSRank::get_task_status(std::map<std::string, std::string> *status) { + dout(20) << __func__ << dendl; + + // scrub summary for now.. + std::string_view scrub_summary = scrubstack->scrub_summary(); + status->emplace(SCRUB_STATUS_KEY, std::move(scrub_summary)); +} + +void MDSRank::schedule_update_timer_task() { + dout(20) << __func__ << dendl; + + timer.add_event_after(g_conf().get_val<double>("mds_task_status_update_interval"), + new FunctionContext([this](int _) { + send_task_status(); + })); +} + +void MDSRank::send_task_status() { + std::map<std::string, std::string> status; + get_task_status(&status); + + if (!status.empty()) { + dout(20) << __func__ << ": updating " << status.size() << " status keys" << dendl; + + int r = mgrc->service_daemon_update_task_status(std::move(status)); + if (r < 0) { + derr << ": failed to update service daemon status: " << cpp_strerror(r) << dendl; + } + } + + schedule_update_timer_task(); +} diff --git a/src/mds/MDSRank.h b/src/mds/MDSRank.h new file mode 100644 index 00000000..c10bad23 --- /dev/null +++ b/src/mds/MDSRank.h @@ -0,0 +1,673 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 Red Hat + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef MDS_RANK_H_ +#define MDS_RANK_H_ + +#include <string_view> + +#include "common/DecayCounter.h" +#include "common/LogClient.h" +#include "common/Timer.h" +#include "common/TrackedOp.h" + +#include "messages/MClientRequest.h" +#include "messages/MCommand.h" +#include "messages/MMDSMap.h" + +#include "Beacon.h" +#include "DamageTable.h" +#include "MDSMap.h" +#include "SessionMap.h" +#include "MDCache.h" +#include "MDLog.h" +#include "MDSContext.h" +#include "PurgeQueue.h" +#include "Server.h" +#include "osdc/Journaler.h" + +// Full .h import instead of forward declaration for PerfCounter, for the +// benefit of those including this header and using MDSRank::logger +#include "common/perf_counters.h" + +enum { + l_mds_first = 2000, + l_mds_request, + l_mds_reply, + l_mds_reply_latency, + l_mds_forward, + l_mds_dir_fetch, + l_mds_dir_commit, + l_mds_dir_split, + l_mds_dir_merge, + l_mds_inode_max, + l_mds_inodes, + l_mds_inodes_top, + l_mds_inodes_bottom, + l_mds_inodes_pin_tail, + l_mds_inodes_pinned, + l_mds_inodes_expired, + l_mds_inodes_with_caps, + l_mds_caps, + l_mds_subtrees, + l_mds_traverse, + l_mds_traverse_hit, + l_mds_traverse_forward, + l_mds_traverse_discover, + l_mds_traverse_dir_fetch, + l_mds_traverse_remote_ino, + l_mds_traverse_lock, + l_mds_load_cent, + l_mds_dispatch_queue_len, + l_mds_exported, + l_mds_exported_inodes, + l_mds_imported, + l_mds_imported_inodes, + l_mds_openino_dir_fetch, + l_mds_openino_backtrace_fetch, + l_mds_openino_peer_discover, + l_mds_root_rfiles, + l_mds_root_rbytes, + l_mds_root_rsnaps, + l_mds_last, +}; + +// memory utilization +enum { + l_mdm_first = 2500, + l_mdm_ino, + l_mdm_inoa, + l_mdm_inos, + l_mdm_dir, + l_mdm_dira, + l_mdm_dirs, + l_mdm_dn, + l_mdm_dna, + l_mdm_dns, + l_mdm_cap, + l_mdm_capa, + l_mdm_caps, + l_mdm_rss, + l_mdm_heap, + l_mdm_last, +}; + +namespace ceph { + struct heartbeat_handle_d; +} + +class Locker; +class MDCache; +class MDLog; +class MDBalancer; +class InoTable; +class 
SnapServer; +class SnapClient; +class MDSTableServer; +class MDSTableClient; +class Messenger; +class Objecter; +class MonClient; +class MgrClient; +class Finisher; +class ScrubStack; +class C_MDS_Send_Command_Reply; +class C_ExecAndReply; + +/** + * The public part of this class's interface is what's exposed to all + * the various subsystems (server, mdcache, etc), such as pointers + * to the other subsystems, and message-sending calls. + */ +class MDSRank { + protected: + const mds_rank_t whoami; + + // Incarnation as seen in MDSMap at the point where a rank is + // assigned. + int incarnation; + + public: + + friend class C_Flush_Journal; + friend class C_Drop_Cache; + + friend class C_CacheDropExecAndReply; + friend class C_ScrubExecAndReply; + friend class C_ScrubControlExecAndReply; + + mds_rank_t get_nodeid() const { return whoami; } + int64_t get_metadata_pool(); + + // Reference to global MDS::mds_lock, so that users of MDSRank don't + // carry around references to the outer MDS, and we can substitute + // a separate lock here in future potentially. + Mutex &mds_lock; + + mono_time get_starttime() const { + return starttime; + } + chrono::duration<double> get_uptime() const { + mono_time now = mono_clock::now(); + return chrono::duration<double>(now-starttime); + } + + class CephContext *cct; + + bool is_daemon_stopping() const; + + // Reference to global cluster log client, just to avoid initialising + // a separate one here. + LogChannelRef &clog; + + // Reference to global timer utility, because MDSRank and MDSDaemon + // currently both use the same mds_lock, so it makes sense for them + // to share a timer. 
+ SafeTimer &timer; + + std::unique_ptr<MDSMap> &mdsmap; /* MDSDaemon::mdsmap */ + + Objecter *objecter; + + // sub systems + Server *server; + MDCache *mdcache; + Locker *locker; + MDLog *mdlog; + MDBalancer *balancer; + ScrubStack *scrubstack; + DamageTable damage_table; + + + InoTable *inotable; + + SnapServer *snapserver; + SnapClient *snapclient; + + MDSTableClient *get_table_client(int t); + MDSTableServer *get_table_server(int t); + + SessionMap sessionmap; + Session *get_session(client_t client) { + return sessionmap.get_session(entity_name_t::CLIENT(client.v)); + } + Session *get_session(const Message::const_ref &m); + + PerfCounters *logger, *mlogger; + OpTracker op_tracker; + + // The last different state I held before current + MDSMap::DaemonState last_state; + // The state assigned to me by the MDSMap + MDSMap::DaemonState state; + + bool cluster_degraded; + + MDSMap::DaemonState get_state() const { return state; } + MDSMap::DaemonState get_want_state() const { return beacon.get_want_state(); } + + bool is_creating() const { return state == MDSMap::STATE_CREATING; } + bool is_starting() const { return state == MDSMap::STATE_STARTING; } + bool is_standby() const { return state == MDSMap::STATE_STANDBY; } + bool is_replay() const { return state == MDSMap::STATE_REPLAY; } + bool is_standby_replay() const { return state == MDSMap::STATE_STANDBY_REPLAY; } + bool is_resolve() const { return state == MDSMap::STATE_RESOLVE; } + bool is_reconnect() const { return state == MDSMap::STATE_RECONNECT; } + bool is_rejoin() const { return state == MDSMap::STATE_REJOIN; } + bool is_clientreplay() const { return state == MDSMap::STATE_CLIENTREPLAY; } + bool is_active() const { return state == MDSMap::STATE_ACTIVE; } + bool is_stopping() const { return state == MDSMap::STATE_STOPPING; } + bool is_any_replay() const { return (is_replay() || is_standby_replay()); } + bool is_stopped() const { return mdsmap->is_stopped(whoami); } + bool is_cluster_degraded() const { return 
cluster_degraded; } + bool allows_multimds_snaps() const { return mdsmap->allows_multimds_snaps(); } + + bool is_cache_trimmable() const { + return is_clientreplay() || is_active() || is_stopping(); + } + + void handle_write_error(int err); + + void update_mlogger(); + protected: + // Flag to indicate we entered shutdown: anyone seeing this to be true + // after taking mds_lock must drop out. + bool stopping; + + // PurgeQueue is only used by StrayManager, but it is owned by MDSRank + // because its init/shutdown happens at the top level. + PurgeQueue purge_queue; + + class ProgressThread : public Thread { + MDSRank *mds; + Cond cond; + public: + explicit ProgressThread(MDSRank *mds_) : mds(mds_) {} + void * entry() override; + void shutdown(); + void signal() {cond.Signal();} + } progress_thread; + + list<Message::const_ref> waiting_for_nolaggy; + MDSContext::que finished_queue; + // Dispatch, retry, queues + int dispatch_depth; + void inc_dispatch_depth() { ++dispatch_depth; } + void dec_dispatch_depth() { --dispatch_depth; } + void retry_dispatch(const Message::const_ref &m); + bool is_valid_message(const Message::const_ref &m); + void handle_message(const Message::const_ref &m); + void _advance_queues(); + bool _dispatch(const Message::const_ref &m, bool new_msg); + + ceph::heartbeat_handle_d *hb; // Heartbeat for threads using mds_lock + + bool is_stale_message(const Message::const_ref &m) const; + + map<mds_rank_t, version_t> peer_mdsmap_epoch; + + ceph_tid_t last_tid; // for mds-initiated requests (e.g. 
stray rename) + + MDSContext::vec waiting_for_active, waiting_for_replay, waiting_for_rejoin, + waiting_for_reconnect, waiting_for_resolve; + MDSContext::vec waiting_for_any_client_connection; + MDSContext::que replay_queue; + bool replaying_requests_done = false; + + map<mds_rank_t, MDSContext::vec > waiting_for_active_peer; + map<epoch_t, MDSContext::vec > waiting_for_mdsmap; + + epoch_t osd_epoch_barrier; + + // Const reference to the beacon so that we can behave differently + // when it's laggy. + Beacon &beacon; + + /** + * Emit clog warnings for any ops reported as warnings by optracker + */ + void check_ops_in_flight(); + + int mds_slow_req_count; + + /** + * Share MDSMap with clients + */ + void bcast_mds_map(); // to mounted clients + epoch_t last_client_mdsmap_bcast; + + map<mds_rank_t,DecayCounter> export_targets; /* targets this MDS is exporting to or wants/tries to */ + + void create_logger(); + public: + void queue_waiter(MDSContext *c) { + finished_queue.push_back(c); + progress_thread.signal(); + } + void queue_waiter_front(MDSContext *c) { + finished_queue.push_front(c); + progress_thread.signal(); + } + void queue_waiters(MDSContext::vec& ls) { + MDSContext::vec v; + v.swap(ls); + std::copy(v.begin(), v.end(), std::back_inserter(finished_queue)); + progress_thread.signal(); + } + void queue_waiters_front(MDSContext::vec& ls) { + MDSContext::vec v; + v.swap(ls); + std::copy(v.rbegin(), v.rend(), std::front_inserter(finished_queue)); + progress_thread.signal(); + } + + MDSRank( + mds_rank_t whoami_, + Mutex &mds_lock_, + LogChannelRef &clog_, + SafeTimer &timer_, + Beacon &beacon_, + std::unique_ptr<MDSMap> & mdsmap_, + Messenger *msgr, + MonClient *monc_, + MgrClient *mgrc, + Context *respawn_hook_, + Context *suicide_hook_); + + protected: + ~MDSRank(); + + public: + + // Daemon lifetime functions: these guys break the abstraction + // and call up into the parent MDSDaemon instance. 
It's kind + // of unavoidable: if we want any depth into our calls + // to be able to e.g. tear down the whole process, we have to + // have a reference going all the way down. + // >>> + void suicide(); + void respawn(); + // <<< + + /** + * Call this periodically if inside a potentially long running piece + * of code while holding the mds_lock + */ + void heartbeat_reset(); + + /** + * Report state DAMAGED to the mon, and then pass on to respawn(). Call + * this when an unrecoverable error is encountered while attempting + * to load an MDS rank's data structures. This is *not* for use with + * errors affecting normal dirfrag/inode objects -- they should be handled + * through cleaner scrub/repair mechanisms. + * + * Callers must already hold mds_lock. + */ + void damaged(); + + /** + * Wrapper around `damaged` for users who are not + * already holding mds_lock. + * + * Callers must not already hold mds_lock. + */ + void damaged_unlocked(); + + double last_cleared_laggy() const { + return beacon.last_cleared_laggy(); + } + + double get_dispatch_queue_max_age(utime_t now) const; + + void send_message_mds(const Message::ref& m, mds_rank_t mds); + void forward_message_mds(const MClientRequest::const_ref& req, mds_rank_t mds); + void send_message_client_counted(const Message::ref& m, client_t client); + void send_message_client_counted(const Message::ref& m, Session* session); + void send_message_client_counted(const Message::ref& m, const ConnectionRef& connection); + void send_message_client(const Message::ref& m, Session* session); + void send_message(const Message::ref& m, const ConnectionRef& c); + + void wait_for_active_peer(mds_rank_t who, MDSContext *c) { + waiting_for_active_peer[who].push_back(c); + } + void wait_for_cluster_recovered(MDSContext *c) { + ceph_assert(cluster_degraded); + waiting_for_active_peer[MDS_RANK_NONE].push_back(c); + } + + void wait_for_any_client_connection(MDSContext *c) { + waiting_for_any_client_connection.push_back(c); + } + void 
kick_waiters_for_any_client_connection(void) { + finish_contexts(g_ceph_context, waiting_for_any_client_connection); + } + void wait_for_active(MDSContext *c) { + waiting_for_active.push_back(c); + } + void wait_for_replay(MDSContext *c) { + waiting_for_replay.push_back(c); + } + void wait_for_rejoin(MDSContext *c) { + waiting_for_rejoin.push_back(c); + } + void wait_for_reconnect(MDSContext *c) { + waiting_for_reconnect.push_back(c); + } + void wait_for_resolve(MDSContext *c) { + waiting_for_resolve.push_back(c); + } + void wait_for_mdsmap(epoch_t e, MDSContext *c) { + waiting_for_mdsmap[e].push_back(c); + } + void enqueue_replay(MDSContext *c) { + replay_queue.push_back(c); + } + + bool queue_one_replay(); + void maybe_clientreplay_done(); + + void set_osd_epoch_barrier(epoch_t e); + epoch_t get_osd_epoch_barrier() const {return osd_epoch_barrier;} + epoch_t get_osd_epoch() const; + + ceph_tid_t issue_tid() { return ++last_tid; } + + Finisher *finisher; + + MDSMap *get_mds_map() { return mdsmap.get(); } + + uint64_t get_num_requests() const { return logger->get(l_mds_request); } + + int get_mds_slow_req_count() const { return mds_slow_req_count; } + + void dump_status(Formatter *f) const; + + void hit_export_target(mds_rank_t rank, double amount=-1.0); + bool is_export_target(mds_rank_t rank) { + const set<mds_rank_t>& map_targets = mdsmap->get_mds_info(get_nodeid()).export_targets; + return map_targets.count(rank); + } + + bool evict_client(int64_t session_id, bool wait, bool blacklist, + std::ostream& ss, Context *on_killed=nullptr); + int config_client(int64_t session_id, bool remove, + const std::string& option, const std::string& value, + std::ostream& ss); + + void mark_base_recursively_scrubbed(inodeno_t ino); + + protected: + void dump_clientreplay_status(Formatter *f) const; + void command_scrub_start(Formatter *f, + std::string_view path, std::string_view tag, + const vector<string>& scrubop_vec, Context *on_finish); + void command_tag_path(Formatter 
*f, std::string_view path, + std::string_view tag); + // scrub control commands + void command_scrub_abort(Formatter *f, Context *on_finish); + void command_scrub_pause(Formatter *f, Context *on_finish); + void command_scrub_resume(Formatter *f); + void command_scrub_status(Formatter *f); + + void command_flush_path(Formatter *f, std::string_view path); + void command_flush_journal(Formatter *f); + void command_get_subtrees(Formatter *f); + void command_export_dir(Formatter *f, + std::string_view path, mds_rank_t dest); + bool command_dirfrag_split( + cmdmap_t cmdmap, + std::ostream &ss); + bool command_dirfrag_merge( + cmdmap_t cmdmap, + std::ostream &ss); + bool command_dirfrag_ls( + cmdmap_t cmdmap, + std::ostream &ss, + Formatter *f); + int _command_export_dir(std::string_view path, mds_rank_t dest); + CDir *_command_dirfrag_get( + const cmdmap_t &cmdmap, + std::ostream &ss); + void command_openfiles_ls(Formatter *f); + void command_dump_tree(const cmdmap_t &cmdmap, std::ostream &ss, Formatter *f); + void command_dump_inode(Formatter *f, const cmdmap_t &cmdmap, std::ostream &ss); + void command_cache_drop(uint64_t timeout, Formatter *f, Context *on_finish); + + protected: + Messenger *messenger; + MonClient *monc; + MgrClient *mgrc; + + Context *respawn_hook; + Context *suicide_hook; + + // Friended to access retry_dispatch + friend class C_MDS_RetryMessage; + + // FIXME the state machine logic should be separable from the dispatch + // logic that calls it. 
+ // >>> + void calc_recovery_set(); + void request_state(MDSMap::DaemonState s); + + bool standby_replaying; // true if current replay pass is in standby-replay mode + + typedef enum { + // The MDSMap is available, configure default layouts and structures + MDS_BOOT_INITIAL = 0, + // We are ready to open some inodes + MDS_BOOT_OPEN_ROOT, + // We are ready to do a replay if needed + MDS_BOOT_PREPARE_LOG, + // Replay is complete + MDS_BOOT_REPLAY_DONE + } BootStep; + friend class C_MDS_BootStart; + friend class C_MDS_InternalBootStart; + void boot_create(); // i am new mds. + void boot_start(BootStep step=MDS_BOOT_INITIAL, int r=0); // starting|replay + + void replay_start(); + void creating_done(); + void starting_done(); + void replay_done(); + void standby_replay_restart(); + void _standby_replay_restart_finish(int r, uint64_t old_read_pos); + class C_MDS_StandbyReplayRestart; + class C_MDS_StandbyReplayRestartFinish; + + void reopen_log(); + + void resolve_start(); + void resolve_done(); + void reconnect_start(); + void reconnect_done(); + void rejoin_joint_start(); + void rejoin_start(); + void rejoin_done(); + void recovery_done(int oldstate); + void clientreplay_start(); + void clientreplay_done(); + void active_start(); + void stopping_start(); + void stopping_done(); + + void validate_sessions(); + // <<< + + // >>> + void handle_mds_recovery(mds_rank_t who); + void handle_mds_failure(mds_rank_t who); + // <<< + + /* Update MDSMap export_targets for this rank. Called on ::tick(). 
*/ + void update_targets(); + + friend class C_MDS_MonCommand; + void _mon_command_finish(int r, std::string_view cmd, std::string_view outs); + void set_mdsmap_multimds_snaps_allowed(); +private: + mono_time starttime = mono_clock::zero(); + + // "task" string that gets displayed in ceph status + inline static const std::string SCRUB_STATUS_KEY = "scrub status"; + + void get_task_status(std::map<std::string, std::string> *status); + void schedule_update_timer_task(); + void send_task_status(); + +protected: + Context *create_async_exec_context(C_ExecAndReply *ctx); +}; + +/* This expects to be given a reference which it is responsible for. + * The finish function calls functions which + * will put the Message exactly once.*/ +class C_MDS_RetryMessage : public MDSInternalContext { +public: + C_MDS_RetryMessage(MDSRank *mds, const Message::const_ref &m) + : MDSInternalContext(mds), m(m) {} + void finish(int r) override { + get_mds()->retry_dispatch(m); + } +protected: + Message::const_ref m; +}; + +class CF_MDS_RetryMessageFactory : public MDSContextFactory { +public: + CF_MDS_RetryMessageFactory(MDSRank *mds, const Message::const_ref &m) + : mds(mds), m(m) {} + + MDSContext *build() { + return new C_MDS_RetryMessage(mds, m); + } + +private: + MDSRank *mds; + Message::const_ref m; +}; + +/** + * The aspect of MDSRank exposed to MDSDaemon but not subsystems: i.e. + * the service/dispatcher stuff like init/shutdown that subsystems should + * never touch. 
+ */ +class MDSRankDispatcher : public MDSRank, public md_config_obs_t +{ +public: + void init(); + void tick(); + void shutdown(); + bool handle_asok_command(std::string_view command, const cmdmap_t& cmdmap, + Formatter *f, std::ostream& ss); + void handle_mds_map(const MMDSMap::const_ref &m, const MDSMap &oldmap); + void handle_osd_map(); + void update_log_config(); + + const char** get_tracked_conf_keys() const override final; + void handle_conf_change(const ConfigProxy& conf, const std::set<std::string>& changed) override; + + bool handle_command( + const cmdmap_t &cmdmap, + const MCommand::const_ref &m, + int *r, + std::stringstream *ds, + std::stringstream *ss, + Context **run_later, + bool *need_reply); + + void dump_sessions(const SessionFilter &filter, Formatter *f) const; + void evict_clients(const SessionFilter &filter, const MCommand::const_ref &m); + + // Call into me from MDS::ms_dispatch + bool ms_dispatch(const Message::const_ref &m); + + MDSRankDispatcher( + mds_rank_t whoami_, + Mutex &mds_lock_, + LogChannelRef &clog_, + SafeTimer &timer_, + Beacon &beacon_, + std::unique_ptr<MDSMap> &mdsmap_, + Messenger *msgr, + MonClient *monc_, + MgrClient *mgrc, + Context *respawn_hook_, + Context *suicide_hook_); +}; + +#endif // MDS_RANK_H_ + diff --git a/src/mds/MDSTable.cc b/src/mds/MDSTable.cc new file mode 100644 index 00000000..b0809f50 --- /dev/null +++ b/src/mds/MDSTable.cc @@ -0,0 +1,201 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ *
+ */
+
+#include "MDSTable.h"
+
+#include "MDSRank.h"
+#include "MDLog.h"
+
+#include "osdc/Filer.h"
+
+#include "include/types.h"
+
+#include "common/config.h"
+#include "common/errno.h"
+#include "common/Finisher.h"
+
+#include "include/ceph_assert.h"
+
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+#undef dout_prefix
+#define dout_prefix *_dout << "mds." << rank << "." << table_name << ": "
+
+
+// Base IO completion for table load/save; routes back to the owning MDSRank.
+class MDSTableIOContext : public MDSIOContextBase
+{
+  protected:
+    MDSTable *ida;
+    MDSRank *get_mds() override {return ida->mds;}
+  public:
+    explicit MDSTableIOContext(MDSTable *ida_) : ida(ida_) {
+      ceph_assert(ida != NULL);
+    }
+};
+
+
+// Completion for an async table write: remembers which version was written.
+class C_IO_MT_Save : public MDSTableIOContext {
+  version_t version;
+public:
+  C_IO_MT_Save(MDSTable *i, version_t v) : MDSTableIOContext(i), version(v) {}
+  void finish(int r) override {
+    ida->save_2(r, version);
+  }
+  void print(ostream& out) const override {
+    out << "table_save(" << ida->table_name << ")";
+  }
+};
+
+// Persist the current table state to RADOS (async full-object write).
+// If a save covering version v is already in flight, just queue onfinish
+// behind it instead of issuing another write.
+void MDSTable::save(MDSContext *onfinish, version_t v)
+{
+  if (v > 0 && v <= committing_version) {
+    dout(10) << "save v " << version << " - already saving "
+             << committing_version << " >= needed " << v << dendl;
+    if (onfinish)
+      waitfor_save[v].push_back(onfinish);
+    return;
+  }
+
+  dout(10) << "save v " << version << dendl;
+  ceph_assert(is_active());
+
+  // Serialized layout: version, then subclass state (encode_state).
+  bufferlist bl;
+  encode(version, bl);
+  encode_state(bl);
+
+  committing_version = version;
+
+  if (onfinish)
+    waitfor_save[version].push_back(onfinish);
+
+  // write (async)
+  SnapContext snapc;
+  object_t oid = get_object_name();
+  object_locator_t oloc(mds->mdsmap->get_metadata_pool());
+  mds->objecter->write_full(oid, oloc,
+                            snapc,
+                            bl, ceph::real_clock::now(), 0,
+                            new C_OnFinisher(new C_IO_MT_Save(this, version),
+                                             mds->finisher));
+}
+
+// Write completion: on success, advance committed_version and fire every
+// waiter whose version is now covered; on error, escalate to the rank.
+void MDSTable::save_2(int r, version_t v)
+{
+  if (r < 0) {
+    dout(1) << "save error " << r << " v " << v << dendl;
+    mds->clog->error() << "failed to store table " << table_name << " object,"
+                       << " errno " << r;
+    mds->handle_write_error(r);
+    return;
+  }
+
+  dout(10) << "save_2 v " << v << dendl;
+  committed_version = v;
+
+  MDSContext::vec ls;
+  while (!waitfor_save.empty()) {
+    auto it = waitfor_save.begin();
+    if (it->first > v) break;
+    // NOTE(review): this local `v` shadows the version_t parameter `v`;
+    // harmless here (the break test above uses the parameter, before the
+    // shadow is in scope) but worth renaming.
+    auto& v = it->second;
+    ls.insert(ls.end(), v.begin(), v.end());
+    waitfor_save.erase(it);
+  }
+  finish_contexts(g_ceph_context, ls, 0);
+}
+
+
+// Reinitialize to a fresh (empty) state and mark the table usable.
+void MDSTable::reset()
+{
+  reset_state();
+  projected_version = version;
+  state = STATE_ACTIVE;
+}
+
+
+
+// -----------------------
+
+// Completion for an async table read; `bl` receives the object contents.
+class C_IO_MT_Load : public MDSTableIOContext {
+public:
+  Context *onfinish;
+  bufferlist bl;
+  C_IO_MT_Load(MDSTable *i, Context *o) : MDSTableIOContext(i), onfinish(o) {}
+  void finish(int r) override {
+    ida->load_2(r, bl, onfinish);
+  }
+  void print(ostream& out) const override {
+    out << "table_load(" << ida->table_name << ")";
+  }
+};
+
+// Object name is "mds<rank>_<table>" for per-MDS tables, "mds_<table>"
+// for the shared (global) ones.
+object_t MDSTable::get_object_name() const
+{
+  char n[50];
+  if (per_mds)
+    snprintf(n, sizeof(n), "mds%d_%s", int(rank), table_name.c_str());
+  else
+    snprintf(n, sizeof(n), "mds_%s", table_name.c_str());
+  return object_t(n);
+}
+
+// Kick off the async read of the table object; load_2 finishes the job.
+void MDSTable::load(MDSContext *onfinish)
+{
+  dout(10) << "load" << dendl;
+
+  ceph_assert(is_undef());
+  state = STATE_OPENING;
+
+  C_IO_MT_Load *c = new C_IO_MT_Load(this, onfinish);
+  object_t oid = get_object_name();
+  object_locator_t oloc(mds->mdsmap->get_metadata_pool());
+  mds->objecter->read_full(oid, oloc, CEPH_NOSNAP, &c->bl, 0,
+                           new C_OnFinisher(c, mds->finisher));
+}
+
+// Read completion: decode version + subclass state.  Read or decode
+// failures mark the rank damaged (which respawns), so the asserts below
+// are documentation of unreachability, not real checks.
+void MDSTable::load_2(int r, bufferlist& bl, Context *onfinish)
+{
+  ceph_assert(is_opening());
+  state = STATE_ACTIVE;
+  if (r == -EBLACKLISTED) {
+    mds->respawn();
+    return;
+  }
+  if (r < 0) {
+    derr << "load_2 could not read table: " << r << dendl;
+    mds->clog->error() << "error reading table object '" << get_object_name()
+                       << "' " << r << " (" << cpp_strerror(r) << ")";
+    mds->damaged();
+    ceph_assert(r >= 0);  // Should be unreachable because damaged() calls respawn()
+
+  }
+
+  dout(10) << "load_2 got " << bl.length() << " bytes" << dendl;
+  auto p = bl.cbegin();
+
+  try {
+    decode(version, p);
+    projected_version = committed_version = version;
+    dout(10) << "load_2 loaded v" << version << dendl;
+    decode_state(p);
+  } catch (buffer::error &e) {
+    mds->clog->error() << "error decoding table object '" << get_object_name()
+                       << "': " << e.what();
+    mds->damaged();
+    ceph_assert(r >= 0);  // Should be unreachable because damaged() calls respawn()
+  }
+
+  if (onfinish) {
+    onfinish->complete(0);
+  }
+}
diff --git a/src/mds/MDSTable.h b/src/mds/MDSTable.h
new file mode 100644
index 00000000..6ad52b20
--- /dev/null
+++ b/src/mds/MDSTable.h
@@ -0,0 +1,95 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_MDSTABLE_H
+#define CEPH_MDSTABLE_H
+
+#include "mdstypes.h"
+#include "mds_table_types.h"
+#include "include/buffer_fwd.h"
+
+#include "MDSContext.h"
+
+class MDSRank;
+
+// Base class for persistent MDS tables (snap table, inode table, ...).
+// Handles versioned load/save of the table object in the metadata pool;
+// subclasses supply the actual state via encode_state/decode_state.
+class MDSTable {
+public:
+  MDSRank *mds;
+protected:
+  std::string table_name;
+  bool per_mds;        // true => one object per rank ("mds<rank>_<name>")
+  mds_rank_t rank;
+
+
+  // Load state machine: UNDEF -> OPENING (read in flight) -> ACTIVE.
+  static const int STATE_UNDEF = 0;
+  static const int STATE_OPENING = 1;
+  static const int STATE_ACTIVE = 2;
+  //static const int STATE_COMMITTING = 3;
+  int state;
+
+  // version: current in-memory version; committing/committed track the
+  // save pipeline; projected_version is used by subclasses for journaling.
+  version_t version, committing_version, committed_version, projected_version;
+
+  // Contexts to fire once a save covering the keyed version completes.
+  map<version_t, MDSContext::vec > waitfor_save;
+
+public:
+  MDSTable(MDSRank *m, std::string_view n, bool is_per_mds) :
+    mds(m), table_name(n), per_mds(is_per_mds), rank(MDS_RANK_NONE),
+    state(STATE_UNDEF),
+    version(0), committing_version(0), committed_version(0), projected_version(0) {}
+  virtual ~MDSTable() {}
+
+  void set_rank(mds_rank_t r)
+  {
+    rank = r;
+  }
+
+  version_t get_version() const { return version; }
+  version_t get_committed_version() const { return committed_version; }
+  version_t get_committing_version() const { return committing_version; }
+  version_t get_projected_version() const { return projected_version; }
+
+  // Used during journal replay to force the version forward.
+  void force_replay_version(version_t v) {
+    version = projected_version = v;
+  }
+
+  //version_t project_version() { return ++projected_version; }
+  //version_t inc_version() { return ++version; }
+
+  // load/save from disk (hack)
+  bool is_undef() const { return state == STATE_UNDEF; }
+  bool is_active() const { return state == STATE_ACTIVE; }
+  bool is_opening() const { return state == STATE_OPENING; }
+
+  void reset();
+  void save(MDSContext *onfinish=0, version_t need=0);
+  void save_2(int r, version_t v);
+
+  void shutdown() {
+    if (is_active()) save(0);
+  }
+
+  object_t get_object_name() const;
+  void load(MDSContext *onfinish);
+  void load_2(int, bufferlist&, Context *onfinish);
+
+  // child must overload these
+  virtual void reset_state() = 0;
+  virtual void decode_state(bufferlist::const_iterator& p) = 0;
+  virtual void encode_state(bufferlist& bl) const = 0;
+
+  friend class C_IO_MT_Load;
+  friend class C_IO_MT_Save;
+};
+
+#endif
diff --git a/src/mds/MDSTableClient.cc b/src/mds/MDSTableClient.cc
new file mode 100644
index 00000000..6418b130
--- /dev/null
+++ b/src/mds/MDSTableClient.cc
@@ -0,0 +1,264 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <iostream>
+
+#include "MDSMap.h"
+
+#include "MDSContext.h"
+#include "msg/Messenger.h"
+
+#include "MDSRank.h"
+#include "MDLog.h"
+#include "LogSegment.h"
+
+#include "MDSTableClient.h"
+#include "events/ETableClient.h"
+
+#include "common/config.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+#undef dout_prefix
+#define dout_prefix *_dout << "mds." 
<< mds->get_nodeid() << ".tableclient(" << get_mdstable_name(table) << ") "
+
+
+// Journal context: fires after a TABLESERVER_OP_ACK has been logged.
+class C_LoggedAck : public MDSLogContextBase {
+  MDSTableClient *tc;
+  version_t tid;
+  MDSRank *get_mds() override { return tc->mds; }
+public:
+  C_LoggedAck(MDSTableClient *a, version_t t) : tc(a), tid(t) {}
+  void finish(int r) override {
+    tc->_logged_ack(tid);
+  }
+};
+
+
+// Dispatch a message from the table server.  Drives the two-phase
+// prepare/commit protocol on the client side, including replay of
+// state after a server restart (SERVER_READY).
+void MDSTableClient::handle_request(const MMDSTableRequest::const_ref &m)
+{
+  dout(10) << "handle_request " << *m << dendl;
+  ceph_assert(m->table == table);
+
+  if (mds->get_state() < MDSMap::STATE_RESOLVE) {
+    // too early; requeue once we reach resolve (otherwise drop).
+    if (mds->get_want_state() == CEPH_MDS_STATE_RESOLVE) {
+      mds->wait_for_resolve(new C_MDS_RetryMessage(mds, m));
+    }
+    return;
+  }
+
+  version_t tid = m->get_tid();
+  uint64_t reqid = m->reqid;
+
+  switch (m->op) {
+  case TABLESERVER_OP_QUERY_REPLY:
+    handle_query_result(m);
+    break;
+
+  case TABLESERVER_OP_NOTIFY_PREP:
+    ceph_assert(g_conf()->mds_kill_mdstable_at != 9);  // test kill point
+    handle_notify_prep(m);
+    break;
+
+  case TABLESERVER_OP_AGREE:
+    if (pending_prepare.count(reqid)) {
+      // normal path: server agreed to our prepare; hand back tid/result.
+      dout(10) << "got agree on " << reqid << " atid " << tid << dendl;
+
+      ceph_assert(g_conf()->mds_kill_mdstable_at != 3);
+
+      MDSContext *onfinish = pending_prepare[reqid].onfinish;
+      *pending_prepare[reqid].ptid = tid;
+      if (pending_prepare[reqid].pbl)
+        *pending_prepare[reqid].pbl = m->bl;
+      pending_prepare.erase(reqid);
+      prepared_update[tid] = reqid;
+      if (onfinish) {
+        onfinish->complete(0);
+      }
+    }
+    else if (prepared_update.count(tid)) {
+      // duplicate AGREE resent by a recovering server.
+      dout(10) << "got duplicated agree on " << reqid << " atid " << tid << dendl;
+      ceph_assert(prepared_update[tid] == reqid);
+      ceph_assert(!server_ready);
+    }
+    else if (pending_commit.count(tid)) {
+      dout(10) << "stray agree on " << reqid << " tid " << tid
+               << ", already committing, will resend COMMIT" << dendl;
+      ceph_assert(!server_ready);
+      // will re-send commit when receiving the server ready message
+    }
+    else {
+      // we no longer know this tid at all; tell the server to undo it.
+      dout(10) << "stray agree on " << reqid << " tid " << tid
+               << ", sending ROLLBACK" << dendl;
+      ceph_assert(!server_ready);
+      auto req = MMDSTableRequest::create(table, TABLESERVER_OP_ROLLBACK, 0, tid);
+      mds->send_message_mds(req, mds->get_mds_map()->get_tableserver());
+    }
+    break;
+
+  case TABLESERVER_OP_ACK:
+    if (pending_commit.count(tid) &&
+        pending_commit[tid]->pending_commit_tids[table].count(tid)) {
+      dout(10) << "got ack on tid " << tid << ", logging" << dendl;
+
+      ceph_assert(g_conf()->mds_kill_mdstable_at != 7);
+
+      // remove from committing list
+      pending_commit[tid]->pending_commit_tids[table].erase(tid);
+      pending_commit.erase(tid);
+
+      // log ACK.
+      mds->mdlog->start_submit_entry(new ETableClient(table, TABLESERVER_OP_ACK, tid),
+                                     new C_LoggedAck(this, tid));
+    } else {
+      dout(10) << "got stray ack on tid " << tid << ", ignoring" << dendl;
+    }
+    break;
+
+  case TABLESERVER_OP_SERVER_READY:
+    // server (re)started; reqid carries the next request id to use.
+    ceph_assert(!server_ready);
+    server_ready = true;
+
+    if (last_reqid == ~0ULL)
+      last_reqid = reqid;
+
+    resend_queries();
+    resend_prepares();
+    resend_commits();
+    break;
+
+  default:
+    ceph_abort_msg("unrecognized mds_table_client request op");
+  }
+}
+
+
+// Called once the ACK journal entry is durable; releases LogSegment
+// trimming waiters registered via wait_for_ack().
+void MDSTableClient::_logged_ack(version_t tid)
+{
+  dout(10) << "_logged_ack " << tid << dendl;
+  // kick any waiters (LogSegment trim)
+  if (ack_waiters.count(tid)) {
+    dout(15) << "kicking ack waiters on tid " << tid << dendl;
+    mds->queue_waiters(ack_waiters[tid]);
+    ack_waiters.erase(tid);
+  }
+}
+
+// Phase 1: ask the server to prepare `mutation`.  *ptid receives the
+// transaction id (and *pbl the reply payload) once the server agrees,
+// then onfinish fires.  Queued if we don't yet know our request id.
+void MDSTableClient::_prepare(bufferlist& mutation, version_t *ptid, bufferlist *pbl,
+                              MDSContext *onfinish)
+{
+  if (last_reqid == ~0ULL) {
+    // ~0ULL is the "no reqid assigned yet" sentinel (see SERVER_READY).
+    dout(10) << "tableserver is not ready yet, waiting for request id" << dendl;
+    waiting_for_reqid.push_back(_pending_prepare(onfinish, ptid, pbl, mutation));
+    return;
+  }
+
+  uint64_t reqid = ++last_reqid;
+  dout(10) << "_prepare " << reqid << dendl;
+
+  pending_prepare[reqid].mutation = mutation;
+  pending_prepare[reqid].ptid = ptid;
+  pending_prepare[reqid].pbl = pbl;
+  pending_prepare[reqid].onfinish = onfinish;
+
+  if (server_ready) {
+    // send message
+    auto req = MMDSTableRequest::create(table, TABLESERVER_OP_PREPARE, reqid);
+    req->bl = mutation;
+    mds->send_message_mds(req, mds->get_mds_map()->get_tableserver());
+  } else
+    dout(10) << "tableserver is not ready yet, deferring request" << dendl;
+}
+
+// Phase 2: commit a previously-agreed tid; the LogSegment tracks it so
+// the commit is replayed if we restart before the server ACKs.
+void MDSTableClient::commit(version_t tid, LogSegment *ls)
+{
+  dout(10) << "commit " << tid << dendl;
+
+  ceph_assert(prepared_update.count(tid));
+  prepared_update.erase(tid);
+
+  ceph_assert(pending_commit.count(tid) == 0);
+  pending_commit[tid] = ls;
+  ls->pending_commit_tids[table].insert(tid);
+
+  notify_commit(tid);
+
+  ceph_assert(g_conf()->mds_kill_mdstable_at != 4);
+
+  if (server_ready) {
+    // send message
+    auto req = MMDSTableRequest::create(table, TABLESERVER_OP_COMMIT, 0, tid);
+    mds->send_message_mds(req, mds->get_mds_map()->get_tableserver());
+  } else
+    dout(10) << "tableserver is not ready yet, deferring request" << dendl;
+}
+
+
+
+// recovery
+
+// Journal replay saw an AGREE for tid: reconstruct pending_commit state.
+void MDSTableClient::got_journaled_agree(version_t tid, LogSegment *ls)
+{
+  dout(10) << "got_journaled_agree " << tid << dendl;
+  ls->pending_commit_tids[table].insert(tid);
+  pending_commit[tid] = ls;
+
+  notify_commit(tid);
+}
+
+// Journal replay saw an ACK for tid: the commit is fully settled.
+void MDSTableClient::got_journaled_ack(version_t tid)
+{
+  dout(10) << "got_journaled_ack " << tid << dendl;
+  if (pending_commit.count(tid)) {
+    pending_commit[tid]->pending_commit_tids[table].erase(tid);
+    pending_commit.erase(tid);
+  }
+}
+
+// Resend COMMIT for every tid still awaiting an ACK (post SERVER_READY).
+void MDSTableClient::resend_commits()
+{
+  for (map<version_t,LogSegment*>::iterator p = pending_commit.begin();
+       p != pending_commit.end();
+       ++p) {
+    dout(10) << "resending commit on " << p->first << dendl;
+    auto req = MMDSTableRequest::create(table, TABLESERVER_OP_COMMIT, 0, p->first);
+    mds->send_message_mds(req, mds->get_mds_map()->get_tableserver());
+  }
+}
+
+// Assign reqids to prepares queued before SERVER_READY, then resend all
+// outstanding prepares.
+void MDSTableClient::resend_prepares()
+{
+  while (!waiting_for_reqid.empty()) {
+    pending_prepare[++last_reqid] = waiting_for_reqid.front();
+    waiting_for_reqid.pop_front();
+  }
+
+  for (map<uint64_t, _pending_prepare>::iterator p = pending_prepare.begin();
+       p != pending_prepare.end();
+       ++p) {
+    dout(10) << "resending prepare on " << p->first << dendl;
+    auto req = MMDSTableRequest::create(table, TABLESERVER_OP_PREPARE, p->first);
+    req->bl = p->second.mutation;
+    mds->send_message_mds(req, mds->get_mds_map()->get_tableserver());
+  }
+}
+
+// If the rank hosting the table server fails, stop sending until we see
+// SERVER_READY again.
+void MDSTableClient::handle_mds_failure(mds_rank_t who)
+{
+  if (who != mds->get_mds_map()->get_tableserver())
+    return; // do nothing.
+
+  dout(7) << "tableserver mds." << who << " fails" << dendl;
+  server_ready = false;
+}
diff --git a/src/mds/MDSTableClient.h b/src/mds/MDSTableClient.h
new file mode 100644
index 00000000..f2bf461a
--- /dev/null
+++ b/src/mds/MDSTableClient.h
@@ -0,0 +1,103 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_MDSTABLECLIENT_H
+#define CEPH_MDSTABLECLIENT_H
+
+#include "include/types.h"
+#include "MDSContext.h"
+#include "mds_table_types.h"
+
+#include "messages/MMDSTableRequest.h"
+
+class MDSRank;
+class LogSegment;
+
+// Client side of the MDS table two-phase commit protocol.  Subclasses
+// (e.g. the snap client) implement the table-specific hooks at the bottom.
+class MDSTableClient {
+protected:
+  MDSRank *mds;
+  int table;           // which table (see mds_table_types.h)
+
+  uint64_t last_reqid; // ~0ULL until the server assigns our reqid range
+
+  bool server_ready;   // true between SERVER_READY and server failure
+
+  // prepares
+  struct _pending_prepare {
+    MDSContext *onfinish;
+    version_t *ptid;     // out: tid assigned by the server's AGREE
+    bufferlist *pbl;     // out: optional reply payload
+    bufferlist mutation; // the proposed change, kept for resend
+
+    _pending_prepare() : onfinish(0), ptid(0), pbl(0) {}
+    _pending_prepare(MDSContext *c, version_t *pt, bufferlist *pb, bufferlist& m) :
+      onfinish(c), ptid(pt), pbl(pb), mutation(m) {}
+  };
+
+  map<uint64_t, _pending_prepare> pending_prepare;  // reqid -> prepare awaiting AGREE
+  map<version_t, uint64_t> prepared_update;         // tid -> reqid (agreed, not committed)
+  list<_pending_prepare> waiting_for_reqid;         // prepares queued before SERVER_READY
+
+  // pending commits
+  map<version_t, LogSegment*> pending_commit;       // tid -> segment awaiting server ACK
+  map<version_t, MDSContext::vec > ack_waiters;     // tid -> contexts for logged ACK
+
+  void handle_reply(class MMDSTableQuery *m);
+  void _logged_ack(version_t tid);
+  friend class C_LoggedAck;
+
+public:
+  MDSTableClient(MDSRank *m, int tab) :
+    mds(m), table(tab), last_reqid(~0ULL), server_ready(false) {}
+  virtual ~MDSTableClient() {}
+
+  void handle_request(const MMDSTableRequest::const_ref &m);
+
+  void _prepare(bufferlist& mutation, version_t *ptid, bufferlist *pbl, MDSContext *onfinish);
+  void commit(version_t tid, LogSegment *ls);
+
+  void resend_commits();
+  void resend_prepares();
+
+  // for recovery (by me)
+  void got_journaled_agree(version_t tid, LogSegment *ls);
+  void got_journaled_ack(version_t tid);
+
+  bool has_committed(version_t tid) const {
+    return pending_commit.count(tid) == 0;
+  }
+  void wait_for_ack(version_t tid, MDSContext *c) {
+    ack_waiters[tid].push_back(c);
+  }
+
+  set<version_t> get_journaled_tids() const {
+    set<version_t> tids;
+    // NOTE(review): `auto p` copies each pair; `const auto& p` (or just
+    // inserting p.first) would avoid the copies.
+    for (auto p : pending_commit)
+      tids.insert(p.first);
+    return tids;
+  }
+
+  void handle_mds_failure(mds_rank_t mds);
+
+  // child must implement
+  virtual void resend_queries() = 0;
+  virtual void handle_query_result(const MMDSTableRequest::const_ref &m) = 0;
+  virtual void handle_notify_prep(const MMDSTableRequest::const_ref &m) = 0;
+  virtual void notify_commit(version_t tid) = 0;
+
+  // and friendly front-end for _prepare.
+
+};
+
+#endif
diff --git a/src/mds/MDSTableServer.cc b/src/mds/MDSTableServer.cc
new file mode 100644
index 00000000..cd7724f5
--- /dev/null
+++ b/src/mds/MDSTableServer.cc
@@ -0,0 +1,373 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "MDSTableServer.h"
+#include "MDSRank.h"
+#include "MDLog.h"
+#include "msg/Messenger.h"
+
+#include "events/ETableServer.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+#undef dout_prefix
+#define dout_prefix *_dout << "mds." 
<< rank << ".tableserver(" << get_mdstable_name(table) << ") "
+
+// Dispatch one client request to the matching handler.
+void MDSTableServer::handle_request(const MMDSTableRequest::const_ref &req)
+{
+  ceph_assert(req->op >= 0);
+  switch (req->op) {
+  case TABLESERVER_OP_QUERY: return handle_query(req);
+  case TABLESERVER_OP_PREPARE: return handle_prepare(req);
+  case TABLESERVER_OP_COMMIT: return handle_commit(req);
+  case TABLESERVER_OP_ROLLBACK: return handle_rollback(req);
+  case TABLESERVER_OP_NOTIFY_ACK: return handle_notify_ack(req);
+  default: ceph_abort_msg("unrecognized mds_table_server request op");
+  }
+}
+
+// Journal context: fires once a PREPARE entry is durable.
+class C_Prepare : public MDSLogContextBase {
+  MDSTableServer *server;
+  MMDSTableRequest::const_ref req;
+  version_t tid;
+  MDSRank *get_mds() override { return server->mds; }
+public:
+
+  C_Prepare(MDSTableServer *s, const MMDSTableRequest::const_ref r, version_t v) : server(s), req(r), tid(v) {}
+  void finish(int r) override {
+    server->_prepare_logged(req, tid);
+  }
+};
+
+// prepare
+// Journal the PREPARE first; the actual table mutation happens in
+// _prepare_logged once the entry is durable.
+void MDSTableServer::handle_prepare(const MMDSTableRequest::const_ref &req)
+{
+  dout(7) << "handle_prepare " << *req << dendl;
+  mds_rank_t from = mds_rank_t(req->get_source().num());
+
+  ceph_assert(g_conf()->mds_kill_mdstable_at != 1);
+
+  projected_version++;
+
+  ETableServer *le = new ETableServer(table, TABLESERVER_OP_PREPARE, req->reqid, from,
+                                      projected_version, projected_version);
+  mds->mdlog->start_entry(le);
+  le->mutation = req->bl;
+  mds->mdlog->submit_entry(le, new C_Prepare(this, req, projected_version));
+  mds->mdlog->flush();
+}
+
+// Apply the prepared mutation, then either reply AGREE directly or wait
+// for notify-acks from all active clients (when _notify_prep says so).
+void MDSTableServer::_prepare_logged(const MMDSTableRequest::const_ref &req, version_t tid)
+{
+  // NOTE(review): log message says "_create_logged" but this is
+  // _prepare_logged — the string is stale.
+  dout(7) << "_create_logged " << *req << " tid " << tid << dendl;
+  mds_rank_t from = mds_rank_t(req->get_source().num());
+
+  ceph_assert(g_conf()->mds_kill_mdstable_at != 2);
+
+  _note_prepare(from, req->reqid);
+  bufferlist out;
+  _prepare(req->bl, req->reqid, from, out);
+  ceph_assert(version == tid);
+
+  auto reply = MMDSTableRequest::create(table, TABLESERVER_OP_AGREE, req->reqid, tid);
+  reply->bl = std::move(out);
+
+  if (_notify_prep(tid)) {
+    auto& p = pending_notifies[tid];
+    p.notify_ack_gather = active_clients;
+    p.mds = from;
+    p.reply = reply;
+  } else {
+    mds->send_message_mds(reply, from);
+  }
+}
+
+// Collect notify-acks; when the gather set drains, send the deferred
+// AGREE (or run the recovery finisher).
+void MDSTableServer::handle_notify_ack(const MMDSTableRequest::const_ref &m)
+{
+  dout(7) << __func__ << " " << *m << dendl;
+  mds_rank_t from = mds_rank_t(m->get_source().num());
+  version_t tid = m->get_tid();
+
+  auto p = pending_notifies.find(tid);
+  if (p != pending_notifies.end()) {
+    if (p->second.notify_ack_gather.erase(from)) {
+      if (p->second.notify_ack_gather.empty()) {
+        if (p->second.onfinish)
+          p->second.onfinish->complete(0);
+        else
+          mds->send_message_mds(p->second.reply, p->second.mds);
+        pending_notifies.erase(p);
+      }
+    } else {
+      dout(0) << "got unexpected notify ack for tid " << tid << " from mds." << from << dendl;
+    }
+  } else {
+    // NOTE(review): intentionally-empty else (stray ack for an unknown
+    // tid is silently ignored); a debug log line here would help.
+  }
+}
+
+// Journal context: fires once a COMMIT entry is durable.
+class C_Commit : public MDSLogContextBase {
+  MDSTableServer *server;
+  MMDSTableRequest::const_ref req;
+  MDSRank *get_mds() override { return server->mds; }
+public:
+  C_Commit(MDSTableServer *s, const MMDSTableRequest::const_ref &r) : server(s), req(r) {}
+  void finish(int r) override {
+    server->_commit_logged(req);
+  }
+};
+
+// commit
+// Journal the COMMIT for a pending tid; duplicate commits for already-
+// committed tids get an immediate ACK.
+void MDSTableServer::handle_commit(const MMDSTableRequest::const_ref &req)
+{
+  dout(7) << "handle_commit " << *req << dendl;
+
+  version_t tid = req->get_tid();
+
+  if (pending_for_mds.count(tid)) {
+
+    if (committing_tids.count(tid)) {
+      dout(0) << "got commit for tid " << tid << ", already committing, waiting." << dendl;
+      return;
+    }
+
+    ceph_assert(g_conf()->mds_kill_mdstable_at != 5);
+
+    projected_version++;
+    committing_tids.insert(tid);
+
+    mds->mdlog->start_submit_entry(new ETableServer(table, TABLESERVER_OP_COMMIT, 0, MDS_RANK_NONE,
+                                                    tid, projected_version),
+                                   new C_Commit(this, req));
+  }
+  else if (tid <= version) {
+    dout(0) << "got commit for tid " << tid << " <= " << version
+            << ", already committed, sending ack." << dendl;
+    auto reply = MMDSTableRequest::create(table, TABLESERVER_OP_ACK, req->reqid, tid);
+    mds->send_message(reply, req->get_connection());
+  }
+  else {
+    // wtf.
+    dout(0) << "got commit for tid " << tid << " > " << version << dendl;
+    ceph_assert(tid <= version);
+  }
+}
+
+// Apply the commit to the table and ACK the requesting client.
+void MDSTableServer::_commit_logged(const MMDSTableRequest::const_ref &req)
+{
+  dout(7) << "_commit_logged, sending ACK" << dendl;
+
+  ceph_assert(g_conf()->mds_kill_mdstable_at != 6);
+  version_t tid = req->get_tid();
+
+  pending_for_mds.erase(tid);
+  committing_tids.erase(tid);
+
+  _commit(tid, req);
+  _note_commit(tid);
+
+  auto reply = MMDSTableRequest::create(table, TABLESERVER_OP_ACK, req->reqid, req->get_tid());
+  mds->send_message_mds(reply, mds_rank_t(req->get_source().num()));
+}
+
+// Journal context: fires once a ROLLBACK entry is durable.
+class C_Rollback : public MDSLogContextBase {
+  MDSTableServer *server;
+  MMDSTableRequest::const_ref req;
+  MDSRank *get_mds() override { return server->mds; }
+public:
+  C_Rollback(MDSTableServer *s, const MMDSTableRequest::const_ref &r) : server(s), req(r) {}
+  void finish(int r) override {
+    server->_rollback_logged(req);
+  }
+};
+
+// ROLLBACK
+// Journal the rollback of a prepared-but-uncommitted tid.
+void MDSTableServer::handle_rollback(const MMDSTableRequest::const_ref &req)
+{
+  dout(7) << "handle_rollback " << *req << dendl;
+
+  ceph_assert(g_conf()->mds_kill_mdstable_at != 8);
+  version_t tid = req->get_tid();
+  ceph_assert(pending_for_mds.count(tid));
+  ceph_assert(!committing_tids.count(tid));
+
+  projected_version++;
+  committing_tids.insert(tid);
+
+  mds->mdlog->start_submit_entry(new ETableServer(table, TABLESERVER_OP_ROLLBACK, 0, MDS_RANK_NONE,
+                                                  tid, projected_version),
+                                 new C_Rollback(this, req));
+}
+
+// Undo the prepared mutation once the ROLLBACK entry is durable.
+void MDSTableServer::_rollback_logged(const MMDSTableRequest::const_ref &req)
+{
+  dout(7) << "_rollback_logged " << *req << dendl;
+
+  version_t tid = req->get_tid();
+
+  pending_for_mds.erase(tid);
+  committing_tids.erase(tid);
+
+  _rollback(tid);
+  _note_rollback(tid);
+}
+
+
+
+// SERVER UPDATE
+// Journal context for a server-initiated (not client-driven) update.
+class C_ServerUpdate : public MDSLogContextBase {
+  MDSTableServer *server;
+  bufferlist bl;
+  MDSRank *get_mds() override { return server->mds; }
+public:
+  C_ServerUpdate(MDSTableServer *s, bufferlist &b) : server(s), bl(b) {}
+  void finish(int r) override {
+    server->_server_update_logged(bl);
+  }
+};
+
+// Journal a server-side update (e.g. internal table maintenance).
+void MDSTableServer::do_server_update(bufferlist& bl)
+{
+  dout(10) << "do_server_update len " << bl.length() << dendl;
+
+  projected_version++;
+
+  ETableServer *le = new ETableServer(table, TABLESERVER_OP_SERVER_UPDATE, 0, MDS_RANK_NONE, 0, projected_version);
+  mds->mdlog->start_entry(le);
+  le->mutation = bl;
+  mds->mdlog->submit_entry(le, new C_ServerUpdate(this, bl));
+}
+
+void MDSTableServer::_server_update_logged(bufferlist& bl)
+{
+  dout(10) << "_server_update_logged len " << bl.length() << dendl;
+  _server_update(bl);
+  _note_server_update(bl);
+}
+
+// recovery
+
+// Finisher used when recovery must first gather notify-acks.
+class C_ServerRecovery : public MDSContext {
+  MDSTableServer *server;
+  MDSRank *get_mds() override { return server->mds; }
+public:
+  C_ServerRecovery(MDSTableServer *s) : server(s) {}
+  void finish(int r) override {
+    server->_do_server_recovery();
+  }
+};
+
+// Re-send AGREEs for all still-pending tids to their owning (active)
+// clients, then tell every active client the server is ready along with
+// its next request id.
+void MDSTableServer::_do_server_recovery()
+{
+  dout(7) << __func__ << " " << active_clients << dendl;
+  map<mds_rank_t, uint64_t> next_reqids;
+
+  // NOTE(review): `auto p` copies each (version_t, mds_table_pending_t)
+  // pair per iteration; `const auto& p` would avoid that.
+  for (auto p : pending_for_mds) {
+    mds_rank_t who = p.second.mds;
+    if (!active_clients.count(who))
+      continue;
+
+    if (p.second.reqid >= next_reqids[who])
+      next_reqids[who] = p.second.reqid + 1;
+
+    version_t tid = p.second.tid;
+    auto reply = MMDSTableRequest::create(table, TABLESERVER_OP_AGREE, p.second.reqid, tid);
+    _get_reply_buffer(tid, &reply->bl);
+    mds->send_message_mds(reply, who);
+  }
+
+  for (auto p : active_clients) {
+    auto reply = MMDSTableRequest::create(table, TABLESERVER_OP_SERVER_READY, next_reqids[p]);
+    mds->send_message_mds(reply, p);
+  }
+  recovered = true;
+}
+
+// Entry point once the rank knows the active client set after restart.
+void MDSTableServer::finish_recovery(set<mds_rank_t>& active)
+{
+  dout(7) << __func__ << dendl;
+
+  active_clients = active;
+
+  // don't know if survivor mds have received all 'notify prep' messages.
+  // so we need to send 'notify prep' again.
+  if (!pending_for_mds.empty() && _notify_prep(version)) {
+    auto& q = pending_notifies[version];
+    q.notify_ack_gather = active_clients;
+    q.mds = MDS_RANK_NONE;
+    q.onfinish = new C_ServerRecovery(this);
+  } else {
+    _do_server_recovery();
+  }
+}
+
+// A client rank recovered: replay its pending AGREEs and send it
+// SERVER_READY with the next reqid it should use.
+void MDSTableServer::handle_mds_recovery(mds_rank_t who)
+{
+  dout(7) << "handle_mds_recovery mds." << who << dendl;
+
+  active_clients.insert(who);
+  if (!recovered) {
+    dout(7) << " still not recovered, delaying" << dendl;
+    return;
+  }
+
+  uint64_t next_reqid = 0;
+  // resend agrees for recovered mds
+  for (auto p = pending_for_mds.begin(); p != pending_for_mds.end(); ++p) {
+    if (p->second.mds != who)
+      continue;
+    ceph_assert(!pending_notifies.count(p->second.tid));
+
+    if (p->second.reqid >= next_reqid)
+      next_reqid = p->second.reqid + 1;
+
+    auto reply = MMDSTableRequest::create(table, TABLESERVER_OP_AGREE, p->second.reqid, p->second.tid);
+    _get_reply_buffer(p->second.tid, &reply->bl);
+    mds->send_message_mds(reply, who);
+  }
+
+  auto reply = MMDSTableRequest::create(table, TABLESERVER_OP_SERVER_READY, next_reqid);
+  mds->send_message_mds(reply, who);
+}
+
+// A client rank failed/stopped: drop it from gathers; any prepare whose
+// AGREE was never sent to it gets rolled back.
+void MDSTableServer::handle_mds_failure_or_stop(mds_rank_t who)
+{
+  dout(7) << __func__ << " mds." << who << dendl;
+
+  active_clients.erase(who);
+
+  list<MMDSTableRequest::ref> rollback;
+  for (auto p = pending_notifies.begin(); p != pending_notifies.end(); ) {
+    auto q = p++;
+    if (q->second.mds == who) {
+      // haven't sent reply yet.
+      rollback.push_back(q->second.reply);
+      pending_notifies.erase(q);
+    } else if (q->second.notify_ack_gather.erase(who)) {
+      // the failed mds will reload snaptable when it recovers.
+      // so we can remove it from the gather set.
+      if (q->second.notify_ack_gather.empty()) {
+        if (q->second.onfinish)
+          q->second.onfinish->complete(0);
+        else
+          mds->send_message_mds(q->second.reply, q->second.mds);
+        pending_notifies.erase(q);
+      }
+    }
+  }
+
+  for (auto &req : rollback) {
+    req->op = TABLESERVER_OP_ROLLBACK;
+    handle_rollback(req);
+  }
+}
diff --git a/src/mds/MDSTableServer.h b/src/mds/MDSTableServer.h
new file mode 100644
index 00000000..83f10315
--- /dev/null
+++ b/src/mds/MDSTableServer.h
@@ -0,0 +1,126 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_MDSTABLESERVER_H
+#define CEPH_MDSTABLESERVER_H
+
+#include "MDSTable.h"
+#include "MDSContext.h"
+
+#include "messages/MMDSTableRequest.h"
+
+// Server side of the MDS table two-phase commit protocol; persists
+// state via the MDSTable base and journals every state transition.
+class MDSTableServer : public MDSTable {
+protected:
+  int table;
+  bool recovered;                  // true once _do_server_recovery ran
+  set<mds_rank_t> active_clients;  // ranks that participate in notifies
+private:
+  map<version_t,mds_table_pending_t> pending_for_mds;  // ** child should encode this! **
+  set<version_t> committing_tids;  // tids with COMMIT/ROLLBACK journaling in flight
+
+  // A deferred AGREE waiting on notify-acks from active_clients.
+  struct notify_info_t {
+    set<mds_rank_t> notify_ack_gather;
+    mds_rank_t mds;
+    MMDSTableRequest::ref reply;
+    MDSContext *onfinish;
+    notify_info_t() : reply(NULL), onfinish(NULL) {}
+  };
+  map<version_t, notify_info_t> pending_notifies;
+
+  void handle_prepare(const MMDSTableRequest::const_ref &m);
+  void _prepare_logged(const MMDSTableRequest::const_ref &m, version_t tid);
+  friend class C_Prepare;
+
+  void handle_commit(const MMDSTableRequest::const_ref &m);
+  void _commit_logged(const MMDSTableRequest::const_ref &m);
+  friend class C_Commit;
+
+  void handle_rollback(const MMDSTableRequest::const_ref &m);
+  void _rollback_logged(const MMDSTableRequest::const_ref &m);
+  friend class C_Rollback;
+
+  void _server_update_logged(bufferlist& bl);
+  friend class C_ServerUpdate;
+
+  void handle_notify_ack(const MMDSTableRequest::const_ref &m);
+
+public:
+  // Table-specific hooks implemented by the concrete server.
+  virtual void handle_query(const MMDSTableRequest::const_ref &m) = 0;
+  virtual void _prepare(const bufferlist &bl, uint64_t reqid, mds_rank_t bymds, bufferlist& out) = 0;
+  virtual void _get_reply_buffer(version_t tid, bufferlist *pbl) const = 0;
+  virtual void _commit(version_t tid, MMDSTableRequest::const_ref req) = 0;
+  virtual void _rollback(version_t tid) = 0;
+  virtual void _server_update(bufferlist& bl) { ceph_abort(); }
+  virtual bool _notify_prep(version_t tid) { return false; };
+
+  // Bookkeeping used both live and during journal replay; in replay
+  // mode the projected version is pinned to the applied version.
+  void _note_prepare(mds_rank_t mds, uint64_t reqid, bool replay=false) {
+    version++;
+    if (replay)
+      projected_version = version;
+    pending_for_mds[version].mds = mds;
+    pending_for_mds[version].reqid = reqid;
+    pending_for_mds[version].tid = version;
+  }
+  void _note_commit(uint64_t tid, bool replay=false) {
+    version++;
+    if (replay)
+      projected_version = version;
+    pending_for_mds.erase(tid);
+  }
+  void _note_rollback(uint64_t tid, bool replay=false) {
+    version++;
+    if (replay)
+      projected_version = version;
+    pending_for_mds.erase(tid);
+  }
+  void _note_server_update(bufferlist& bl, bool replay=false) {
+    version++;
+    if (replay)
+      projected_version = version;
+  }
+
+  MDSTableServer(MDSRank *m, int tab) :
+    MDSTable(m, get_mdstable_name(tab), false), table(tab), recovered(false) {}
+  ~MDSTableServer() override {}
+
+  void reset_state() override {
+    pending_for_mds.clear();
+    ++version;
+  }
+
+  void handle_request(const MMDSTableRequest::const_ref &m);
+  void do_server_update(bufferlist& bl);
+
+  virtual void encode_server_state(bufferlist& bl) const = 0;
+  virtual void decode_server_state(bufferlist::const_iterator& bl) = 0;
+
+  void encode_state(bufferlist& bl) const override {
+    encode_server_state(bl);
+    encode(pending_for_mds, bl);
+  }
+  void decode_state(bufferlist::const_iterator& bl) override {
+    decode_server_state(bl);
+    decode(pending_for_mds, bl);
+  }
+
+  // recovery
+  void finish_recovery(set<mds_rank_t>& active);
+  void _do_server_recovery();
+  friend class C_ServerRecovery;
+
+  void handle_mds_recovery(mds_rank_t who);
+  void handle_mds_failure_or_stop(mds_rank_t who);
+};
+
+#endif
diff --git a/src/mds/Mantle.cc b/src/mds/Mantle.cc
new file mode 100644
index 00000000..15d325e8
--- /dev/null
+++ b/src/mds/Mantle.cc
@@ -0,0 +1,139 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 Michael Sevilla <mikesevilla3@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ * + */ + +#include "mdstypes.h" +#include "MDSRank.h" +#include "Mantle.h" +#include "msg/Messenger.h" +#include "common/Clock.h" +#include "CInode.h" + +#include <fstream> + +#define dout_context g_ceph_context +#undef dout_prefix +#define dout_prefix *_dout << "mds.mantle " +#define mantle_dout(lvl) \ + do {\ + auto subsys = ceph_subsys_mds;\ + if ((dout_context)->_conf->subsys.should_gather(ceph_subsys_mds_balancer, lvl)) {\ + subsys = ceph_subsys_mds_balancer;\ + }\ + dout_impl(dout_context, ceph::dout::need_dynamic(subsys), lvl) dout_prefix + +#define mantle_dendl dendl; } while (0) + + +static int dout_wrapper(lua_State *L) +{ + int level = luaL_checkinteger(L, 1); + lua_concat(L, lua_gettop(L)-1); + mantle_dout(ceph::dout::need_dynamic(level)) << lua_tostring(L, 2) + << mantle_dendl; + return 0; +} + +int Mantle::balance(std::string_view script, + mds_rank_t whoami, + const std::vector<std::map<std::string, double>> &metrics, + std::map<mds_rank_t, double> &my_targets) +{ + lua_settop(L, 0); /* clear the stack */ + + /* load the balancer */ + if (luaL_loadstring(L, script.data())) { + mantle_dout(0) << "WARNING: mantle could not load balancer: " + << lua_tostring(L, -1) << mantle_dendl; + return -EINVAL; + } + + /* tell the balancer which mds is making the decision */ + lua_pushinteger(L, (lua_Integer)whoami); + lua_setglobal(L, "whoami"); + + /* global mds metrics to hold all dictionaries */ + lua_newtable(L); + + /* push name of mds (i) and its metrics onto Lua stack */ + for (size_t i=0; i < metrics.size(); i++) { + lua_newtable(L); + + /* push values into this mds's table; setfield assigns key/pops val */ + for (const auto &it : metrics[i]) { + lua_pushnumber(L, it.second); + lua_setfield(L, -2, it.first.c_str()); + } + + /* in global mds table at stack[-3], set k=stack[-1] to v=stack[-2] */ + lua_seti(L, -2, i); + } + + /* set the name of the global mds table */ + lua_setglobal(L, "mds"); + + ceph_assert(lua_gettop(L) == 1); + if (lua_pcall(L, 0, 1, 
0) != LUA_OK) { + mantle_dout(0) << "WARNING: mantle could not execute script: " + << lua_tostring(L, -1) << mantle_dendl; + return -EINVAL; + } + + /* parse response by iterating over Lua stack */ + if (lua_istable(L, -1) == 0) { + mantle_dout(0) << "WARNING: mantle script returned a malformed response" << mantle_dendl; + return -EINVAL; + } + + /* fill in return value */ + for (lua_pushnil(L); lua_next(L, -2); lua_pop(L, 1)) { + if (!lua_isinteger(L, -2) || !lua_isnumber(L, -1)) { + mantle_dout(0) << "WARNING: mantle script returned a malformed response" << mantle_dendl; + return -EINVAL; + } + mds_rank_t rank(lua_tointeger(L, -2)); + my_targets[rank] = lua_tonumber(L, -1); + } + + return 0; +} + +Mantle::Mantle (void) +{ + /* build lua vm state */ + L = luaL_newstate(); + if (!L) { + mantle_dout(0) << "WARNING: mantle could not load Lua state" << mantle_dendl; + throw std::bad_alloc(); + } + + /* balancer policies can use basic Lua functions */ + static const luaL_Reg loadedlibs[] = { + {"_G", luaopen_base}, + {LUA_COLIBNAME, luaopen_coroutine}, + {LUA_STRLIBNAME, luaopen_string}, + {LUA_MATHLIBNAME, luaopen_math}, + {LUA_TABLIBNAME, luaopen_table}, + {LUA_UTF8LIBNAME, luaopen_utf8}, + {NULL, NULL} + }; + + const luaL_Reg *lib; + for (lib = loadedlibs; lib->func; lib++) { + luaL_requiref(L, lib->name, lib->func, 1); + lua_pop(L, 1); /* remove lib */ + } + + /* setup debugging */ + lua_register(L, "BAL_LOG", dout_wrapper); +} diff --git a/src/mds/Mantle.h b/src/mds/Mantle.h new file mode 100644 index 00000000..ffc1843a --- /dev/null +++ b/src/mds/Mantle.h @@ -0,0 +1,40 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 Michael Sevilla <mikesevilla3@gmail.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free 
Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_MANTLE_H +#define CEPH_MANTLE_H + +#include <string_view> + +#include <lua.hpp> +#include <vector> +#include <map> +#include <string> + +#include "mdstypes.h" + +class Mantle { + public: + Mantle(); + ~Mantle() { if (L) lua_close(L); } + int balance(std::string_view script, + mds_rank_t whoami, + const std::vector <std::map<std::string, double>> &metrics, + std::map<mds_rank_t,double> &my_targets); + + protected: + lua_State *L; +}; + +#endif diff --git a/src/mds/Migrator.cc b/src/mds/Migrator.cc new file mode 100644 index 00000000..98bf78c3 --- /dev/null +++ b/src/mds/Migrator.cc @@ -0,0 +1,3611 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#include "MDSRank.h" +#include "MDCache.h" +#include "CInode.h" +#include "CDir.h" +#include "CDentry.h" +#include "Migrator.h" +#include "Locker.h" +#include "Server.h" + +#include "MDBalancer.h" +#include "MDLog.h" +#include "MDSMap.h" +#include "Mutation.h" + +#include "include/filepath.h" +#include "common/likely.h" + +#include "events/EExport.h" +#include "events/EImportStart.h" +#include "events/EImportFinish.h" +#include "events/ESessions.h" + +#include "msg/Messenger.h" + +#include "messages/MClientCaps.h" + +/* + * this is what the dir->dir_auth values look like + * + * dir_auth authbits + * export + * me me - before + * me, me me - still me, but preparing for export + * me, them me - send MExportDir (peer is preparing) + * them, me me - journaled EExport + * them them - done + * + * import: + * them them - before + * me, them me - journaled EImportStart + * me me - done + * + * which implies: + * - auth bit is set if i am listed as first _or_ second dir_auth. + */ + +#include "common/config.h" + + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mds +#undef dout_prefix +#define dout_prefix *_dout << "mds." 
<< mds->get_nodeid() << ".migrator " + + +class MigratorContext : public MDSContext { +protected: + Migrator *mig; + MDSRank *get_mds() override { + return mig->mds; + } +public: + explicit MigratorContext(Migrator *mig_) : mig(mig_) { + ceph_assert(mig != NULL); + } +}; + +class MigratorLogContext : public MDSLogContextBase { +protected: + Migrator *mig; + MDSRank *get_mds() override { + return mig->mds; + } +public: + explicit MigratorLogContext(Migrator *mig_) : mig(mig_) { + ceph_assert(mig != NULL); + } +}; + +void Migrator::dispatch(const Message::const_ref &m) +{ + switch (m->get_type()) { + // import + case MSG_MDS_EXPORTDIRDISCOVER: + handle_export_discover(MExportDirDiscover::msgref_cast(m)); + break; + case MSG_MDS_EXPORTDIRPREP: + handle_export_prep(MExportDirPrep::msgref_cast(m)); + break; + case MSG_MDS_EXPORTDIR: + if (unlikely(inject_session_race)) { + dout(0) << "waiting for inject_session_race" << dendl; + mds->wait_for_any_client_connection(new C_MDS_RetryMessage(mds, m)); + } else { + handle_export_dir(MExportDir::msgref_cast(m)); + } + break; + case MSG_MDS_EXPORTDIRFINISH: + handle_export_finish(MExportDirFinish::msgref_cast(m)); + break; + case MSG_MDS_EXPORTDIRCANCEL: + handle_export_cancel(MExportDirCancel::msgref_cast(m)); + break; + + // export + case MSG_MDS_EXPORTDIRDISCOVERACK: + handle_export_discover_ack(MExportDirDiscoverAck::msgref_cast(m)); + break; + case MSG_MDS_EXPORTDIRPREPACK: + handle_export_prep_ack(MExportDirPrepAck::msgref_cast(m)); + break; + case MSG_MDS_EXPORTDIRACK: + handle_export_ack(MExportDirAck::msgref_cast(m)); + break; + case MSG_MDS_EXPORTDIRNOTIFYACK: + handle_export_notify_ack(MExportDirNotifyAck::msgref_cast(m)); + break; + + // export 3rd party (dir_auth adjustments) + case MSG_MDS_EXPORTDIRNOTIFY: + handle_export_notify(MExportDirNotify::msgref_cast(m)); + break; + + // caps + case MSG_MDS_EXPORTCAPS: + handle_export_caps(MExportCaps::msgref_cast(m)); + break; + case MSG_MDS_EXPORTCAPSACK: + 
handle_export_caps_ack(MExportCapsAck::msgref_cast(m)); + break; + case MSG_MDS_GATHERCAPS: + handle_gather_caps(MGatherCaps::msgref_cast(m)); + break; + + default: + derr << "migrator unknown message " << m->get_type() << dendl; + ceph_abort_msg("migrator unknown message"); + } +} + + +class C_MDC_EmptyImport : public MigratorContext { + CDir *dir; +public: + C_MDC_EmptyImport(Migrator *m, CDir *d) : + MigratorContext(m), dir(d) { + dir->get(CDir::PIN_PTRWAITER); + } + void finish(int r) override { + mig->export_empty_import(dir); + dir->put(CDir::PIN_PTRWAITER); + } +}; + + +void Migrator::export_empty_import(CDir *dir) +{ + dout(7) << "export_empty_import " << *dir << dendl; + ceph_assert(dir->is_subtree_root()); + + if (dir->inode->is_auth()) { + dout(7) << " inode is auth" << dendl; + return; + } + if (!dir->is_auth()) { + dout(7) << " not auth" << dendl; + return; + } + if (dir->is_freezing() || dir->is_frozen()) { + dout(7) << " freezing or frozen" << dendl; + return; + } + if (dir->get_num_head_items() > 0) { + dout(7) << " not actually empty" << dendl; + return; + } + if (dir->inode->is_root()) { + dout(7) << " root" << dendl; + return; + } + + mds_rank_t dest = dir->inode->authority().first; + //if (mds->is_shutting_down()) dest = 0; // this is more efficient. + + dout(7) << " really empty, exporting to " << dest << dendl; + assert (dest != mds->get_nodeid()); + + dout(7) << "exporting to mds." 
<< dest + << " empty import " << *dir << dendl; + export_dir( dir, dest ); +} + +void Migrator::find_stale_export_freeze() +{ + utime_t now = ceph_clock_now(); + utime_t cutoff = now; + cutoff -= g_conf()->mds_freeze_tree_timeout; + + + /* + * We could have situations like: + * + * - mds.0 authpins an item in subtree A + * - mds.0 sends request to mds.1 to authpin an item in subtree B + * - mds.0 freezes subtree A + * - mds.1 authpins an item in subtree B + * - mds.1 sends request to mds.0 to authpin an item in subtree A + * - mds.1 freezes subtree B + * - mds.1 receives the remote authpin request from mds.0 + * (wait because subtree B is freezing) + * - mds.0 receives the remote authpin request from mds.1 + * (wait because subtree A is freezing) + * + * + * - client request authpins items in subtree B + * - freeze subtree B + * - import subtree A which is parent of subtree B + * (authpins parent inode of subtree B, see CDir::set_dir_auth()) + * - freeze subtree A + * - client request tries authpinning items in subtree A + * (wait because subtree A is freezing) + */ + for (map<CDir*,export_state_t>::iterator p = export_state.begin(); + p != export_state.end(); ) { + CDir* dir = p->first; + export_state_t& stat = p->second; + ++p; + if (stat.state != EXPORT_DISCOVERING && stat.state != EXPORT_FREEZING) + continue; + ceph_assert(dir->freeze_tree_state); + if (stat.last_cum_auth_pins != dir->freeze_tree_state->auth_pins) { + stat.last_cum_auth_pins = dir->freeze_tree_state->auth_pins; + stat.last_cum_auth_pins_change = now; + continue; + } + if (stat.last_cum_auth_pins_change >= cutoff) + continue; + if (stat.num_remote_waiters > 0 || + (!dir->inode->is_root() && dir->get_parent_dir()->is_freezing())) { + export_try_cancel(dir); + } + } +} + +void Migrator::export_try_cancel(CDir *dir, bool notify_peer) +{ + dout(10) << "export_try_cancel " << *dir << dendl; + + map<CDir*,export_state_t>::iterator it = export_state.find(dir); + ceph_assert(it != export_state.end()); + 
+ int state = it->second.state; + switch (state) { + case EXPORT_LOCKING: + dout(10) << "export state=locking : dropping locks and removing auth_pin" << dendl; + num_locking_exports--; + it->second.state = EXPORT_CANCELLED; + dir->auth_unpin(this); + break; + case EXPORT_DISCOVERING: + dout(10) << "export state=discovering : canceling freeze and removing auth_pin" << dendl; + it->second.state = EXPORT_CANCELLED; + dir->unfreeze_tree(); // cancel the freeze + dir->auth_unpin(this); + if (notify_peer && + (!mds->is_cluster_degraded() || + mds->mdsmap->is_clientreplay_or_active_or_stopping(it->second.peer))) // tell them. + mds->send_message_mds(MExportDirCancel::create(dir->dirfrag(), it->second.tid), it->second.peer); + break; + + case EXPORT_FREEZING: + dout(10) << "export state=freezing : canceling freeze" << dendl; + it->second.state = EXPORT_CANCELLED; + dir->unfreeze_tree(); // cancel the freeze + if (dir->is_subtree_root()) + cache->try_subtree_merge(dir); + if (notify_peer && + (!mds->is_cluster_degraded() || + mds->mdsmap->is_clientreplay_or_active_or_stopping(it->second.peer))) // tell them. 
+ mds->send_message_mds(MExportDirCancel::create(dir->dirfrag(), it->second.tid), it->second.peer); + break; + + // NOTE: state order reversal, warning comes after prepping + case EXPORT_WARNING: + dout(10) << "export state=warning : unpinning bounds, unfreezing, notifying" << dendl; + it->second.state = EXPORT_CANCELLING; + // fall-thru + + case EXPORT_PREPPING: + if (state != EXPORT_WARNING) { + dout(10) << "export state=prepping : unpinning bounds, unfreezing" << dendl; + it->second.state = EXPORT_CANCELLED; + } + + { + // unpin bounds + set<CDir*> bounds; + cache->get_subtree_bounds(dir, bounds); + for (set<CDir*>::iterator q = bounds.begin(); + q != bounds.end(); + ++q) { + CDir *bd = *q; + bd->put(CDir::PIN_EXPORTBOUND); + bd->state_clear(CDir::STATE_EXPORTBOUND); + } + if (state == EXPORT_WARNING) { + // notify bystanders + export_notify_abort(dir, it->second, bounds); + // process delayed expires + cache->process_delayed_expire(dir); + } + } + dir->unfreeze_tree(); + cache->try_subtree_merge(dir); + if (notify_peer && + (!mds->is_cluster_degraded() || + mds->mdsmap->is_clientreplay_or_active_or_stopping(it->second.peer))) // tell them. + mds->send_message_mds(MExportDirCancel::create(dir->dirfrag(), it->second.tid), it->second.peer); + break; + + case EXPORT_EXPORTING: + dout(10) << "export state=exporting : reversing, and unfreezing" << dendl; + it->second.state = EXPORT_CANCELLING; + export_reverse(dir, it->second); + break; + + case EXPORT_LOGGINGFINISH: + case EXPORT_NOTIFYING: + dout(10) << "export state=loggingfinish|notifying : ignoring dest failure, we were successful." << dendl; + // leave export_state, don't clean up now. + break; + case EXPORT_CANCELLING: + break; + + default: + ceph_abort(); + } + + // finish clean-up? 
+ if (it->second.state == EXPORT_CANCELLING || + it->second.state == EXPORT_CANCELLED) { + MutationRef mut; + mut.swap(it->second.mut); + + if (it->second.state == EXPORT_CANCELLED) { + export_cancel_finish(it); + } + + // drop locks + if (state == EXPORT_LOCKING || state == EXPORT_DISCOVERING) { + MDRequestRef mdr = static_cast<MDRequestImpl*>(mut.get()); + ceph_assert(mdr); + mds->mdcache->request_kill(mdr); + } else if (mut) { + mds->locker->drop_locks(mut.get()); + mut->cleanup(); + } + + cache->show_subtrees(); + + maybe_do_queued_export(); + } +} + +void Migrator::export_cancel_finish(export_state_iterator& it) +{ + CDir *dir = it->first; + bool unpin = (it->second.state == EXPORT_CANCELLING); + auto parent = std::move(it->second.parent); + + total_exporting_size -= it->second.approx_size; + export_state.erase(it); + + ceph_assert(dir->state_test(CDir::STATE_EXPORTING)); + dir->clear_exporting(); + + if (unpin) { + // pinned by Migrator::export_notify_abort() + dir->auth_unpin(this); + } + // send pending import_maps? (these need to go out when all exports have finished.) + cache->maybe_send_pending_resolves(); + + if (parent) + child_export_finish(parent, false); +} + +// ========================================================== +// mds failure handling + +void Migrator::handle_mds_failure_or_stop(mds_rank_t who) +{ + dout(5) << "handle_mds_failure_or_stop mds." << who << dendl; + + // check my exports + + // first add an extra auth_pin on any freezes, so that canceling a + // nested freeze doesn't complete one further up the hierarchy and + // confuse the shit out of us. we'll remove it after canceling the + // freeze. this way no freeze completions run before we want them + // to. 
+ list<CDir*> pinned_dirs; + for (map<CDir*,export_state_t>::iterator p = export_state.begin(); + p != export_state.end(); + ++p) { + if (p->second.state == EXPORT_FREEZING) { + CDir *dir = p->first; + dout(10) << "adding temp auth_pin on freezing " << *dir << dendl; + dir->auth_pin(this); + pinned_dirs.push_back(dir); + } + } + + map<CDir*,export_state_t>::iterator p = export_state.begin(); + while (p != export_state.end()) { + map<CDir*,export_state_t>::iterator next = p; + ++next; + CDir *dir = p->first; + + // abort exports: + // - that are going to the failed node + // - that aren't frozen yet (to avoid auth_pin deadlock) + // - they havne't prepped yet (they may need to discover bounds to do that) + if ((p->second.peer == who && + p->second.state != EXPORT_CANCELLING) || + p->second.state == EXPORT_LOCKING || + p->second.state == EXPORT_DISCOVERING || + p->second.state == EXPORT_FREEZING || + p->second.state == EXPORT_PREPPING) { + // the guy i'm exporting to failed, or we're just freezing. + dout(10) << "cleaning up export state (" << p->second.state << ")" + << get_export_statename(p->second.state) << " of " << *dir << dendl; + export_try_cancel(dir); + } else if (p->second.peer != who) { + // bystander failed. + if (p->second.warning_ack_waiting.erase(who)) { + if (p->second.state == EXPORT_WARNING) { + p->second.notify_ack_waiting.erase(who); // they won't get a notify either. + // exporter waiting for warning acks, let's fake theirs. + dout(10) << "faking export_warning_ack from mds." << who + << " on " << *dir << " to mds." << p->second.peer + << dendl; + if (p->second.warning_ack_waiting.empty()) + export_go(dir); + } + } + if (p->second.notify_ack_waiting.erase(who)) { + // exporter is waiting for notify acks, fake it + dout(10) << "faking export_notify_ack from mds." << who + << " on " << *dir << " to mds." 
<< p->second.peer + << dendl; + if (p->second.state == EXPORT_NOTIFYING) { + if (p->second.notify_ack_waiting.empty()) + export_finish(dir); + } else if (p->second.state == EXPORT_CANCELLING) { + if (p->second.notify_ack_waiting.empty()) { + export_cancel_finish(p); + } + } + } + } + + // next! + p = next; + } + + + // check my imports + map<dirfrag_t,import_state_t>::iterator q = import_state.begin(); + while (q != import_state.end()) { + map<dirfrag_t,import_state_t>::iterator next = q; + ++next; + dirfrag_t df = q->first; + CInode *diri = mds->mdcache->get_inode(df.ino); + CDir *dir = mds->mdcache->get_dirfrag(df); + + if (q->second.peer == who) { + if (dir) + dout(10) << "cleaning up import state (" << q->second.state << ")" + << get_import_statename(q->second.state) << " of " << *dir << dendl; + else + dout(10) << "cleaning up import state (" << q->second.state << ")" + << get_import_statename(q->second.state) << " of " << df << dendl; + + switch (q->second.state) { + case IMPORT_DISCOVERING: + dout(10) << "import state=discovering : clearing state" << dendl; + import_reverse_discovering(df); + break; + + case IMPORT_DISCOVERED: + ceph_assert(diri); + dout(10) << "import state=discovered : unpinning inode " << *diri << dendl; + import_reverse_discovered(df, diri); + break; + + case IMPORT_PREPPING: + ceph_assert(dir); + dout(10) << "import state=prepping : unpinning base+bounds " << *dir << dendl; + import_reverse_prepping(dir, q->second); + break; + + case IMPORT_PREPPED: + ceph_assert(dir); + dout(10) << "import state=prepped : unpinning base+bounds, unfreezing " << *dir << dendl; + { + set<CDir*> bounds; + cache->get_subtree_bounds(dir, bounds); + import_remove_pins(dir, bounds); + + // adjust auth back to the exporter + cache->adjust_subtree_auth(dir, q->second.peer); + + // notify bystanders ; wait in aborting state + q->second.state = IMPORT_ABORTING; + import_notify_abort(dir, bounds); + ceph_assert(g_conf()->mds_kill_import_at != 10); + } + break; + + 
case IMPORT_LOGGINGSTART: + ceph_assert(dir); + dout(10) << "import state=loggingstart : reversing import on " << *dir << dendl; + import_reverse(dir); + break; + + case IMPORT_ACKING: + ceph_assert(dir); + // hrm. make this an ambiguous import, and wait for exporter recovery to disambiguate + dout(10) << "import state=acking : noting ambiguous import " << *dir << dendl; + { + set<CDir*> bounds; + cache->get_subtree_bounds(dir, bounds); + cache->add_ambiguous_import(dir, bounds); + } + break; + + case IMPORT_FINISHING: + ceph_assert(dir); + dout(10) << "import state=finishing : finishing import on " << *dir << dendl; + import_finish(dir, true); + break; + + case IMPORT_ABORTING: + ceph_assert(dir); + dout(10) << "import state=aborting : ignoring repeat failure " << *dir << dendl; + break; + } + } else { + auto bystanders_entry = q->second.bystanders.find(who); + if (bystanders_entry != q->second.bystanders.end()) { + q->second.bystanders.erase(bystanders_entry); + if (q->second.state == IMPORT_ABORTING) { + ceph_assert(dir); + dout(10) << "faking export_notify_ack from mds." << who + << " on aborting import " << *dir << " from mds." << q->second.peer + << dendl; + if (q->second.bystanders.empty()) + import_reverse_unfreeze(dir); + } + } + } + + // next! 
+ q = next; + } + + while (!pinned_dirs.empty()) { + CDir *dir = pinned_dirs.front(); + dout(10) << "removing temp auth_pin on " << *dir << dendl; + dir->auth_unpin(this); + pinned_dirs.pop_front(); + } +} + + + +void Migrator::show_importing() +{ + dout(10) << "show_importing" << dendl; + for (map<dirfrag_t,import_state_t>::iterator p = import_state.begin(); + p != import_state.end(); + ++p) { + CDir *dir = mds->mdcache->get_dirfrag(p->first); + if (dir) { + dout(10) << " importing from " << p->second.peer + << ": (" << p->second.state << ") " << get_import_statename(p->second.state) + << " " << p->first << " " << *dir << dendl; + } else { + dout(10) << " importing from " << p->second.peer + << ": (" << p->second.state << ") " << get_import_statename(p->second.state) + << " " << p->first << dendl; + } + } +} + +void Migrator::show_exporting() +{ + dout(10) << "show_exporting" << dendl; + for (map<CDir*,export_state_t>::iterator p = export_state.begin(); + p != export_state.end(); + ++p) + dout(10) << " exporting to " << p->second.peer + << ": (" << p->second.state << ") " << get_export_statename(p->second.state) + << " " << p->first->dirfrag() << " " << *p->first << dendl; +} + + + +void Migrator::audit() +{ + if (!g_conf()->subsys.should_gather<ceph_subsys_mds, 5>()) + return; // hrm. 
+ + // import_state + show_importing(); + for (map<dirfrag_t,import_state_t>::iterator p = import_state.begin(); + p != import_state.end(); + ++p) { + if (p->second.state == IMPORT_DISCOVERING) + continue; + if (p->second.state == IMPORT_DISCOVERED) { + CInode *in = cache->get_inode(p->first.ino); + ceph_assert(in); + continue; + } + CDir *dir = cache->get_dirfrag(p->first); + ceph_assert(dir); + if (p->second.state == IMPORT_PREPPING) + continue; + if (p->second.state == IMPORT_ABORTING) { + ceph_assert(!dir->is_ambiguous_dir_auth()); + ceph_assert(dir->get_dir_auth().first != mds->get_nodeid()); + continue; + } + ceph_assert(dir->is_ambiguous_dir_auth()); + ceph_assert(dir->authority().first == mds->get_nodeid() || + dir->authority().second == mds->get_nodeid()); + } + + // export_state + show_exporting(); + for (map<CDir*,export_state_t>::iterator p = export_state.begin(); + p != export_state.end(); + ++p) { + CDir *dir = p->first; + if (p->second.state == EXPORT_LOCKING || + p->second.state == EXPORT_DISCOVERING || + p->second.state == EXPORT_FREEZING || + p->second.state == EXPORT_CANCELLING) + continue; + ceph_assert(dir->is_ambiguous_dir_auth()); + ceph_assert(dir->authority().first == mds->get_nodeid() || + dir->authority().second == mds->get_nodeid()); + } + + // ambiguous+me subtrees should be importing|exporting + + // write me +} + + + + + +// ========================================================== +// EXPORT + +void Migrator::export_dir_nicely(CDir *dir, mds_rank_t dest) +{ + // enqueue + dout(7) << "export_dir_nicely " << *dir << " to " << dest << dendl; + export_queue.push_back(pair<dirfrag_t,mds_rank_t>(dir->dirfrag(), dest)); + + maybe_do_queued_export(); +} + +void Migrator::maybe_do_queued_export() +{ + static bool running; + if (running) + return; + running = true; + + uint64_t max_total_size = max_export_size * 2; + + while (!export_queue.empty() && + max_total_size > total_exporting_size && + max_total_size - total_exporting_size >= + 
max_export_size * (num_locking_exports + 1)) { + + dirfrag_t df = export_queue.front().first; + mds_rank_t dest = export_queue.front().second; + export_queue.pop_front(); + + CDir *dir = mds->mdcache->get_dirfrag(df); + if (!dir) continue; + if (!dir->is_auth()) continue; + + dout(0) << "nicely exporting to mds." << dest << " " << *dir << dendl; + + export_dir(dir, dest); + } + + running = false; +} + + + + +class C_MDC_ExportFreeze : public MigratorContext { + CDir *dir; // dir i'm exporting + uint64_t tid; +public: + C_MDC_ExportFreeze(Migrator *m, CDir *e, uint64_t t) : + MigratorContext(m), dir(e), tid(t) { + dir->get(CDir::PIN_PTRWAITER); + } + void finish(int r) override { + if (r >= 0) + mig->export_frozen(dir, tid); + dir->put(CDir::PIN_PTRWAITER); + } +}; + + +void Migrator::get_export_lock_set(CDir *dir, MutationImpl::LockOpVec& lov) +{ + // path + vector<CDentry*> trace; + cache->make_trace(trace, dir->inode); + + set<CDir*> wouldbe_bounds; + cache->get_wouldbe_subtree_bounds(dir, wouldbe_bounds); + + lov.reserve(trace.size() + wouldbe_bounds.size() + 8); + + for (auto& dn : trace) + lov.add_rdlock(&dn->lock); + + // prevent scatter gather race + lov.add_rdlock(&dir->get_inode()->dirfragtreelock); + + // bound dftlocks: + // NOTE: We need to take an rdlock on bounding dirfrags during + // migration for a rather irritating reason: when we export the + // bound inode, we need to send scatterlock state for the dirfrags + // as well, so that the new auth also gets the correct info. If we + // race with a refragment, this info is useless, as we can't + // redivvy it up. And it's needed for the scatterlocks to work + // properly: when the auth is in a sync/lock state it keeps each + // dirfrag's portion in the local (auth OR replica) dirfrag. 
+ for (auto& dir : wouldbe_bounds) + lov.add_rdlock(&dir->get_inode()->dirfragtreelock); + + // above code may add duplicated locks + lov.sort_and_merge(); +} + + +/** export_dir(dir, dest) + * public method to initiate an export. + * will fail if the directory is freezing, frozen, unpinnable, or root. + */ +void Migrator::export_dir(CDir *dir, mds_rank_t dest) +{ + dout(7) << "export_dir " << *dir << " to " << dest << dendl; + ceph_assert(dir->is_auth()); + ceph_assert(dest != mds->get_nodeid()); + + if (!mds->is_stopping() && !dir->inode->is_exportable(dest)) { + dout(25) << "dir is export pinned" << dendl; + return; + } + + if (!(mds->is_active() || mds->is_stopping())) { + dout(7) << "i'm not active, no exports for now" << dendl; + return; + } + if (mds->mdcache->is_readonly()) { + dout(7) << "read-only FS, no exports for now" << dendl; + return; + } + if (!mds->mdsmap->is_active(dest)) { + dout(7) << "dest not active, no exports for now" << dendl; + return; + } + if (mds->is_cluster_degraded()) { + dout(7) << "cluster degraded, no exports for now" << dendl; + return; + } + if (dir->inode->is_system()) { + dout(7) << "i won't export system dirs (root, mdsdirs, stray, /.ceph, etc.)" << dendl; + //ceph_abort(); + return; + } + + CDir* parent_dir = dir->inode->get_projected_parent_dir(); + if (parent_dir && parent_dir->inode->is_stray()) { + if (parent_dir->get_parent_dir()->ino() != MDS_INO_MDSDIR(dest)) { + dout(7) << "i won't export anything in stray" << dendl; + return; + } + } + + if (dir->is_frozen() || + dir->is_freezing()) { + dout(7) << " can't export, freezing|frozen. wait for other exports to finish first." 
<< dendl; + return; + } + if (dir->state_test(CDir::STATE_EXPORTING)) { + dout(7) << "already exporting" << dendl; + return; + } + + if (g_conf()->mds_thrash_exports) { + // create random subtree bound (which will not be exported) + list<CDir*> ls; + for (auto p = dir->begin(); p != dir->end(); ++p) { + auto dn = p->second; + CDentry::linkage_t *dnl= dn->get_linkage(); + if (dnl->is_primary()) { + CInode *in = dnl->get_inode(); + if (in->is_dir()) + in->get_nested_dirfrags(ls); + } + } + if (ls.size() > 0) { + int n = rand() % ls.size(); + auto p = ls.begin(); + while (n--) ++p; + CDir *bd = *p; + if (!(bd->is_frozen() || bd->is_freezing())) { + ceph_assert(bd->is_auth()); + dir->state_set(CDir::STATE_AUXSUBTREE); + mds->mdcache->adjust_subtree_auth(dir, mds->get_nodeid()); + dout(0) << "export_dir: create aux subtree " << *bd << " under " << *dir << dendl; + } + } + } + + mds->hit_export_target(dest, -1); + + dir->auth_pin(this); + dir->mark_exporting(); + + MDRequestRef mdr = mds->mdcache->request_start_internal(CEPH_MDS_OP_EXPORTDIR); + mdr->more()->export_dir = dir; + mdr->pin(dir); + + ceph_assert(export_state.count(dir) == 0); + export_state_t& stat = export_state[dir]; + num_locking_exports++; + stat.state = EXPORT_LOCKING; + stat.peer = dest; + stat.tid = mdr->reqid.tid; + stat.mut = mdr; + + mds->mdcache->dispatch_request(mdr); +} + +/* + * check if directory is too large to be export in whole. If it is, + * choose some subdirs, whose total size is suitable. 
 */
// Estimate the export cost of the subtree rooted at 'dir' with an iterative
// depth-first walk, using rough per-object byte constants.  If the whole
// subtree fits under max_size, 'results' gets the single entry (dir, size);
// otherwise it gets a set of sub-dirfrags whose combined estimated size is
// suitable to export on their own.  Dirfrags already exporting (or
// freezing/frozen) are skipped, and all their ancestors are marked
// incomplete so they are never merged into a parent entry.  If everything
// was skipped and null_okay is true, 'results' may be left empty.
void Migrator::maybe_split_export(CDir* dir, uint64_t max_size, bool null_okay,
				  vector<pair<CDir*, size_t> >& results)
{
  // rough per-item size estimates (bytes) used to approximate export cost
  static const unsigned frag_size = 800;
  static const unsigned inode_size = 1000;
  static const unsigned cap_size = 80;
  static const unsigned remote_size = 10;
  static const unsigned null_size = 1;

  // state for depth-first search
  struct LevelData {
    CDir *dir;
    CDir::dentry_key_map::iterator iter;    // resume point within this dirfrag
    size_t dirfrag_size = frag_size;        // estimated size of this dirfrag alone
    size_t subdirs_size = 0;                // estimated size of completed child subtrees
    bool complete = true;                   // false if any descendant dirfrag was skipped
    vector<CDir*> siblings;                 // dirfrags still to visit at this level
    vector<pair<CDir*, size_t> > subdirs;   // completed child subtrees (dir, size)
    LevelData(const LevelData&) = default;
    LevelData(CDir *d) :
      dir(d), iter(d->begin()) {}
  };

  vector<LevelData> stack;
  stack.emplace_back(dir);

  size_t found_size = 0;    // total size accounted to complete dirfrags
  size_t skipped_size = 0;  // total size of dirfrags that could not be merged

  for (;;) {
    auto& data = stack.back();
    CDir *cur = data.dir;
    auto& it = data.iter;
    auto& dirfrag_size = data.dirfrag_size;

    while(it != cur->end()) {
      CDentry *dn = it->second;
      ++it;

      dirfrag_size += dn->name.size();
      if (dn->get_linkage()->is_null()) {
	dirfrag_size += null_size;
	continue;
      }
      if (dn->get_linkage()->is_remote()) {
	dirfrag_size += remote_size;
	continue;
      }

      CInode *in = dn->get_linkage()->get_inode();
      dirfrag_size += inode_size;
      dirfrag_size += in->get_client_caps().size() * cap_size;

      if (in->is_dir()) {
	vector<CDir*> ls;
	in->get_nested_dirfrags(ls);
	// reversed so that pop_back() below visits frags in forward order
	std::reverse(ls.begin(), ls.end());

	bool complete = true;
	for (auto p = ls.begin(); p != ls.end(); ) {
	  if ((*p)->state_test(CDir::STATE_EXPORTING) ||
	      (*p)->is_freezing_dir() || (*p)->is_frozen_dir()) {
	    complete = false;
	    p = ls.erase(p);
	  } else {
	    ++p;
	  }
	}
	if (!complete) {
	  // skip exporting dir's ancestors. because they can't get
	  // frozen (exporting dir's parent inode is auth pinned).
	  for (auto p = stack.rbegin(); p < stack.rend(); ++p) {
	    if (!p->complete)
	      break;
	    p->complete = false;
	  }
	}
	if (!ls.empty()) {
	  // descend into the first child dirfrag; stash the rest as siblings
	  stack.emplace_back(ls.back());
	  ls.pop_back();
	  stack.back().siblings.swap(ls);
	  break;
	}
      }
    }
    // did above loop push new dirfrag into the stack?
    if (stack.back().dir != cur)
      continue;

    // finished scanning 'cur'; fold its size into its parent or the results
    if (data.complete) {
      auto cur_size = data.subdirs_size + dirfrag_size;
      // we can do nothing with large dirfrag
      if (cur_size >= max_size && found_size * 2 > max_size)
	break;

      found_size += dirfrag_size;

      if (stack.size() > 1) {
	auto& parent = stack[stack.size() - 2];
	parent.subdirs.emplace_back(cur, cur_size);
	parent.subdirs_size += cur_size;
      }
    } else {
      // can't merge current dirfrag to its parent if there is skipped subdir
      results.insert(results.end(), data.subdirs.begin(), data.subdirs.end());
      skipped_size += dirfrag_size;
    }

    vector<CDir*> ls;
    ls.swap(data.siblings);

    stack.pop_back();
    if (stack.empty())
      break;

    if (found_size >= max_size)
      break;

    // next dirfrag
    if (!ls.empty()) {
      stack.emplace_back(ls.back());
      ls.pop_back();
      stack.back().siblings.swap(ls);
    }
  }

  // anything left on the stack when we bailed out still holds usable subdirs
  for (auto& p : stack)
    results.insert(results.end(), p.subdirs.begin(), p.subdirs.end());

  if (results.empty() && (!skipped_size || !null_okay))
    results.emplace_back(dir, found_size + skipped_size);
}

// Completion context: re-enters dispatch_export_dir() once the MDSMap (or the
// dirfrag) is ready; 'count' tracks how many MDSMap epochs we have waited.
class C_M_ExportDirWait : public MigratorContext {
  MDRequestRef mdr;
  int count;
public:
  C_M_ExportDirWait(Migrator *m, MDRequestRef mdr, int count)
    : MigratorContext(m), mdr(mdr), count(count) {}
  void finish(int r) override {
    mig->dispatch_export_dir(mdr, count);
  }
};

// Second stage of an export (state EXPORT_LOCKING): verify the target is an
// export target, take the export lock set, then either start the normal
// discover/freeze sequence or — if maybe_split_export() decided the subtree
// is too large — spawn one child export request per piece and cancel this one.
void Migrator::dispatch_export_dir(MDRequestRef& mdr, int count)
{
  CDir *dir = mdr->more()->export_dir;
  dout(7) << "dispatch_export_dir " << *mdr << " " << *dir << dendl;

  map<CDir*,export_state_t>::iterator it = export_state.find(dir);
  if (it == export_state.end() || it->second.tid != mdr->reqid.tid) {
    // export must have aborted.
    dout(7) << "export must have aborted " << *mdr << dendl;
    ceph_assert(mdr->killed || mdr->aborted);
    if (mdr->aborted) {
      mdr->aborted = false;
      mds->mdcache->request_kill(mdr);
    }
    return;
  }
  ceph_assert(it->second.state == EXPORT_LOCKING);

  mds_rank_t dest = it->second.peer;

  if (!mds->is_export_target(dest)) {
    dout(7) << "dest is not yet an export target" << dendl;
    if (count > 3) {
      dout(5) << "dest has not been added as export target after three MDSMap epochs, canceling export" << dendl;
      export_try_cancel(dir);
      return;
    }

    mds->locker->drop_locks(mdr.get());
    mdr->drop_local_auth_pins();

    // retry after the next MDSMap epoch
    mds->wait_for_mdsmap(mds->mdsmap->get_epoch(), new C_M_ExportDirWait(this, mdr, count+1));
    return;
  }

  if (!dir->inode->get_parent_dn()) {
    dout(7) << "waiting for dir to become stable before export: " << *dir << dendl;
    dir->add_waiter(CDir::WAIT_CREATED, new C_M_ExportDirWait(this, mdr, 1));
    return;
  }

  if (mdr->aborted || dir->is_frozen() || dir->is_freezing()) {
    dout(7) << "wouldblock|freezing|frozen, canceling export" << dendl;
    export_try_cancel(dir);
    return;
  }

  // locks?
  MutationImpl::LockOpVec lov;
  get_export_lock_set(dir, lov);
  // If auth MDS of the subtree root inode is neither the exporter MDS
  // nor the importer MDS and it gathers subtree root's fragstat/neststat
  // while the subtree is exporting. It's possible that the exporter MDS
  // and the importer MDS both are auth MDS of the subtree root or both
  // are not auth MDS of the subtree root at the time they receive the
  // lock messages. So the auth MDS of the subtree root inode may get no
  // or duplicated fragstat/neststat for the subtree root dirfrag.
  lov.add_wrlock(&dir->get_inode()->filelock);
  lov.add_wrlock(&dir->get_inode()->nestlock);
  if (dir->get_inode()->is_auth()) {
    dir->get_inode()->filelock.set_scatter_wanted();
    dir->get_inode()->nestlock.set_scatter_wanted();
  }

  if (!mds->locker->acquire_locks(mdr, lov, NULL, true)) {
    if (mdr->aborted)
      export_try_cancel(dir);
    return;
  }

  ceph_assert(g_conf()->mds_kill_export_at != 1);

  auto parent = it->second.parent;

  vector<pair<CDir*, size_t> > results;
  maybe_split_export(dir, max_export_size, (bool)parent, results);

  if (results.size() == 1 && results.front().first == dir) {
    // whole subtree fits: proceed to EXPORT_DISCOVERING
    num_locking_exports--;
    it->second.state = EXPORT_DISCOVERING;
    // send ExportDirDiscover (ask target)
    filepath path;
    dir->inode->make_path(path);
    auto discover = MExportDirDiscover::create(dir->dirfrag(), path,
					       mds->get_nodeid(), it->second.tid);
    mds->send_message_mds(discover, dest);
    ceph_assert(g_conf()->mds_kill_export_at != 2);

    it->second.last_cum_auth_pins_change = ceph_clock_now();
    it->second.approx_size = results.front().second;
    total_exporting_size += it->second.approx_size;

    // start the freeze, but hold it up with an auth_pin.
    dir->freeze_tree();
    ceph_assert(dir->is_freezing_tree());
    dir->add_waiter(CDir::WAIT_FROZEN, new C_MDC_ExportFreeze(this, dir, it->second.tid));
    return;
  }

  if (parent) {
    parent->pending_children += results.size();
  } else {
    parent = std::make_shared<export_base_t>(dir->dirfrag(), dest,
					     results.size(), export_queue_gen);
  }

  if (results.empty()) {
    dout(7) << "subtree's children all are under exporting, retry rest parts of parent export "
	    << parent->dirfrag << dendl;
    parent->restart = true;
  } else {
    dout(7) << "subtree is too large, splitting it into: " << dendl;
  }

  // launch one child export request per split piece
  for (auto& p : results) {
    CDir *sub = p.first;
    ceph_assert(sub != dir);
    dout(7) << " sub " << *sub << dendl;

    sub->auth_pin(this);
    sub->mark_exporting();

    MDRequestRef _mdr = mds->mdcache->request_start_internal(CEPH_MDS_OP_EXPORTDIR);
    _mdr->more()->export_dir = sub;
    _mdr->pin(sub);

    ceph_assert(export_state.count(sub) == 0);
    auto& stat = export_state[sub];
    num_locking_exports++;
    stat.state = EXPORT_LOCKING;
    stat.peer = dest;
    stat.tid = _mdr->reqid.tid;
    stat.mut = _mdr;
    stat.parent = parent;
    mds->mdcache->dispatch_request(_mdr);
  }

  // cancel the original one
  export_try_cancel(dir);
}

// Called when a child export created by splitting finishes (success or not).
// Once the last child completes, requeue the original dirfrag if any child
// succeeded and the export queue has not been regenerated since the split.
void Migrator::child_export_finish(std::shared_ptr<export_base_t>& parent, bool success)
{
  if (success)
    parent->restart = true;
  if (--parent->pending_children == 0) {
    if (parent->restart &&
	parent->export_queue_gen == export_queue_gen) {
      CDir *origin = mds->mdcache->get_dirfrag(parent->dirfrag);
      if (origin && origin->is_auth()) {
	dout(7) << "child_export_finish requeue " << *origin << dendl;
	export_queue.emplace_front(origin->dirfrag(), parent->dest);
      }
    }
  }
}

/*
 * called on receipt of MExportDirDiscoverAck
 * the importer now has the directory's _inode_ in memory, and pinned.
+ */ +void Migrator::handle_export_discover_ack(const MExportDirDiscoverAck::const_ref &m) +{ + CDir *dir = cache->get_dirfrag(m->get_dirfrag()); + mds_rank_t dest(m->get_source().num()); + ceph_assert(dir); + + dout(7) << "export_discover_ack from " << m->get_source() + << " on " << *dir << dendl; + + mds->hit_export_target(dest, -1); + + map<CDir*,export_state_t>::iterator it = export_state.find(dir); + if (it == export_state.end() || + it->second.tid != m->get_tid() || + it->second.peer != dest) { + dout(7) << "must have aborted" << dendl; + } else { + ceph_assert(it->second.state == EXPORT_DISCOVERING); + + if (m->is_success()) { + // release locks to avoid deadlock + MDRequestRef mdr = static_cast<MDRequestImpl*>(it->second.mut.get()); + ceph_assert(mdr); + mds->mdcache->request_finish(mdr); + it->second.mut.reset(); + // freeze the subtree + it->second.state = EXPORT_FREEZING; + dir->auth_unpin(this); + ceph_assert(g_conf()->mds_kill_export_at != 3); + + } else { + dout(7) << "peer failed to discover (not active?), canceling" << dendl; + export_try_cancel(dir, false); + } + } +} + +class C_M_ExportSessionsFlushed : public MigratorContext { + CDir *dir; + uint64_t tid; +public: + C_M_ExportSessionsFlushed(Migrator *m, CDir *d, uint64_t t) : + MigratorContext(m), dir(d), tid(t) { + dir->get(CDir::PIN_PTRWAITER); + } + void finish(int r) override { + mig->export_sessions_flushed(dir, tid); + dir->put(CDir::PIN_PTRWAITER); + } +}; + +void Migrator::export_sessions_flushed(CDir *dir, uint64_t tid) +{ + dout(7) << "export_sessions_flushed " << *dir << dendl; + + map<CDir*,export_state_t>::iterator it = export_state.find(dir); + if (it == export_state.end() || + it->second.state == EXPORT_CANCELLING || + it->second.tid != tid) { + // export must have aborted. 
+ dout(7) << "export must have aborted on " << dir << dendl; + return; + } + + ceph_assert(it->second.state == EXPORT_PREPPING || it->second.state == EXPORT_WARNING); + ceph_assert(it->second.warning_ack_waiting.count(MDS_RANK_NONE) > 0); + it->second.warning_ack_waiting.erase(MDS_RANK_NONE); + if (it->second.state == EXPORT_WARNING && it->second.warning_ack_waiting.empty()) + export_go(dir); // start export. +} + +void Migrator::export_frozen(CDir *dir, uint64_t tid) +{ + dout(7) << "export_frozen on " << *dir << dendl; + + map<CDir*,export_state_t>::iterator it = export_state.find(dir); + if (it == export_state.end() || it->second.tid != tid) { + dout(7) << "export must have aborted" << dendl; + return; + } + + ceph_assert(it->second.state == EXPORT_FREEZING); + ceph_assert(dir->is_frozen_tree_root()); + + CInode *diri = dir->get_inode(); + + // ok, try to grab all my locks. + MutationImpl::LockOpVec lov; + get_export_lock_set(dir, lov); + if ((diri->is_auth() && diri->is_frozen()) || + !mds->locker->can_rdlock_set(lov) || + !diri->filelock.can_wrlock(-1) || + !diri->nestlock.can_wrlock(-1)) { + dout(7) << "export_dir couldn't acquire all needed locks, failing. " + << *dir << dendl; + export_try_cancel(dir); + return; + } + + it->second.mut = new MutationImpl(); + if (diri->is_auth()) + it->second.mut->auth_pin(diri); + mds->locker->rdlock_take_set(lov, it->second.mut); + mds->locker->wrlock_force(&diri->filelock, it->second.mut); + mds->locker->wrlock_force(&diri->nestlock, it->second.mut); + + cache->show_subtrees(); + + // CDir::_freeze_tree() should have forced it into subtree. + ceph_assert(dir->get_dir_auth() == mds_authority_t(mds->get_nodeid(), mds->get_nodeid())); + // note the bounds. + set<CDir*> bounds; + cache->get_subtree_bounds(dir, bounds); + + // generate prep message, log entry. 
+ auto prep = MExportDirPrep::create(dir->dirfrag(), it->second.tid); + + // include list of bystanders + for (const auto &p : dir->get_replicas()) { + if (p.first != it->second.peer) { + dout(10) << "bystander mds." << p.first << dendl; + prep->add_bystander(p.first); + } + } + + // include base dirfrag + cache->replicate_dir(dir, it->second.peer, prep->basedir); + + /* + * include spanning tree for all nested exports. + * these need to be on the destination _before_ the final export so that + * dir_auth updates on any nested exports are properly absorbed. + * this includes inodes and dirfrags included in the subtree, but + * only the inodes at the bounds. + * + * each trace is: df ('-' | ('f' dir | 'd') dentry inode (dir dentry inode)*) + */ + set<inodeno_t> inodes_added; + set<dirfrag_t> dirfrags_added; + + // check bounds + for (set<CDir*>::iterator p = bounds.begin(); + p != bounds.end(); + ++p) { + CDir *bound = *p; + + // pin it. + bound->get(CDir::PIN_EXPORTBOUND); + bound->state_set(CDir::STATE_EXPORTBOUND); + + dout(7) << " export bound " << *bound << dendl; + prep->add_bound( bound->dirfrag() ); + + // trace to bound + bufferlist tracebl; + CDir *cur = bound; + + char start = '-'; + while (1) { + // don't repeat inodes + if (inodes_added.count(cur->inode->ino())) + break; + inodes_added.insert(cur->inode->ino()); + + // prepend dentry + inode + ceph_assert(cur->inode->is_auth()); + bufferlist bl; + cache->replicate_dentry(cur->inode->parent, it->second.peer, bl); + dout(7) << " added " << *cur->inode->parent << dendl; + cache->replicate_inode(cur->inode, it->second.peer, bl, + mds->mdsmap->get_up_features()); + dout(7) << " added " << *cur->inode << dendl; + bl.claim_append(tracebl); + tracebl.claim(bl); + + cur = cur->get_parent_dir(); + + // don't repeat dirfrags + if (dirfrags_added.count(cur->dirfrag()) || + cur == dir) { + start = 'd'; // start with dentry + break; + } + dirfrags_added.insert(cur->dirfrag()); + + // prepend dir + 
cache->replicate_dir(cur, it->second.peer, bl); + dout(7) << " added " << *cur << dendl; + bl.claim_append(tracebl); + tracebl.claim(bl); + + start = 'f'; // start with dirfrag + } + bufferlist final_bl; + dirfrag_t df = cur->dirfrag(); + encode(df, final_bl); + encode(start, final_bl); + final_bl.claim_append(tracebl); + prep->add_trace(final_bl); + } + + // send. + it->second.state = EXPORT_PREPPING; + mds->send_message_mds(prep, it->second.peer); + assert (g_conf()->mds_kill_export_at != 4); + + // make sure any new instantiations of caps are flushed out + ceph_assert(it->second.warning_ack_waiting.empty()); + + set<client_t> export_client_set; + get_export_client_set(dir, export_client_set); + + MDSGatherBuilder gather(g_ceph_context); + mds->server->flush_client_sessions(export_client_set, gather); + if (gather.has_subs()) { + it->second.warning_ack_waiting.insert(MDS_RANK_NONE); + gather.set_finisher(new C_M_ExportSessionsFlushed(this, dir, it->second.tid)); + gather.activate(); + } +} + +void Migrator::get_export_client_set(CDir *dir, set<client_t>& client_set) +{ + deque<CDir*> dfs; + dfs.push_back(dir); + while (!dfs.empty()) { + CDir *dir = dfs.front(); + dfs.pop_front(); + for (auto& p : *dir) { + CDentry *dn = p.second; + if (!dn->get_linkage()->is_primary()) + continue; + CInode *in = dn->get_linkage()->get_inode(); + if (in->is_dir()) { + // directory? 
+ vector<CDir*> ls; + in->get_dirfrags(ls); + for (auto& q : ls) { + if (!q->state_test(CDir::STATE_EXPORTBOUND)) { + // include nested dirfrag + ceph_assert(q->get_dir_auth().first == CDIR_AUTH_PARENT); + dfs.push_back(q); // it's ours, recurse (later) + } + } + } + for (auto& q : in->get_client_caps()) { + client_set.insert(q.first); + } + } + } +} + +void Migrator::get_export_client_set(CInode *in, set<client_t>& client_set) +{ + for (const auto &p : in->get_client_caps()) { + client_set.insert(p.first); + } +} + +void Migrator::handle_export_prep_ack(const MExportDirPrepAck::const_ref &m) +{ + CDir *dir = cache->get_dirfrag(m->get_dirfrag()); + mds_rank_t dest(m->get_source().num()); + ceph_assert(dir); + + dout(7) << "export_prep_ack " << *dir << dendl; + + mds->hit_export_target(dest, -1); + + map<CDir*,export_state_t>::iterator it = export_state.find(dir); + if (it == export_state.end() || + it->second.tid != m->get_tid() || + it->second.peer != mds_rank_t(m->get_source().num())) { + // export must have aborted. 
+ dout(7) << "export must have aborted" << dendl; + return; + } + ceph_assert(it->second.state == EXPORT_PREPPING); + + if (!m->is_success()) { + dout(7) << "peer couldn't acquire all needed locks or wasn't active, canceling" << dendl; + export_try_cancel(dir, false); + return; + } + + assert (g_conf()->mds_kill_export_at != 5); + // send warnings + set<CDir*> bounds; + cache->get_subtree_bounds(dir, bounds); + + ceph_assert(it->second.warning_ack_waiting.empty() || + (it->second.warning_ack_waiting.size() == 1 && + it->second.warning_ack_waiting.count(MDS_RANK_NONE) > 0)); + ceph_assert(it->second.notify_ack_waiting.empty()); + + for (const auto &p : dir->get_replicas()) { + if (p.first == it->second.peer) continue; + if (mds->is_cluster_degraded() && + !mds->mdsmap->is_clientreplay_or_active_or_stopping(p.first)) + continue; // only if active + it->second.warning_ack_waiting.insert(p.first); + it->second.notify_ack_waiting.insert(p.first); // we'll eventually get a notifyack, too! + + auto notify = MExportDirNotify::create(dir->dirfrag(), it->second.tid, true, + mds_authority_t(mds->get_nodeid(),CDIR_AUTH_UNKNOWN), + mds_authority_t(mds->get_nodeid(),it->second.peer)); + for (auto &cdir : bounds) { + notify->get_bounds().push_back(cdir->dirfrag()); + } + mds->send_message_mds(notify, p.first); + + } + + it->second.state = EXPORT_WARNING; + + ceph_assert(g_conf()->mds_kill_export_at != 6); + // nobody to warn? + if (it->second.warning_ack_waiting.empty()) + export_go(dir); // start export. 
+} + + +class C_M_ExportGo : public MigratorContext { + CDir *dir; + uint64_t tid; +public: + C_M_ExportGo(Migrator *m, CDir *d, uint64_t t) : + MigratorContext(m), dir(d), tid(t) { + dir->get(CDir::PIN_PTRWAITER); + } + void finish(int r) override { + mig->export_go_synced(dir, tid); + dir->put(CDir::PIN_PTRWAITER); + } +}; + +void Migrator::export_go(CDir *dir) +{ + auto it = export_state.find(dir); + ceph_assert(it != export_state.end()); + dout(7) << "export_go " << *dir << " to " << it->second.peer << dendl; + + // first sync log to flush out e.g. any cap imports + mds->mdlog->wait_for_safe(new C_M_ExportGo(this, dir, it->second.tid)); + mds->mdlog->flush(); +} + +void Migrator::export_go_synced(CDir *dir, uint64_t tid) +{ + map<CDir*,export_state_t>::iterator it = export_state.find(dir); + if (it == export_state.end() || + it->second.state == EXPORT_CANCELLING || + it->second.tid != tid) { + // export must have aborted. + dout(7) << "export must have aborted on " << dir << dendl; + return; + } + ceph_assert(it->second.state == EXPORT_WARNING); + mds_rank_t dest = it->second.peer; + + dout(7) << "export_go_synced " << *dir << " to " << dest << dendl; + + cache->show_subtrees(); + + it->second.state = EXPORT_EXPORTING; + ceph_assert(g_conf()->mds_kill_export_at != 7); + + ceph_assert(dir->is_frozen_tree_root()); + + // set ambiguous auth + cache->adjust_subtree_auth(dir, mds->get_nodeid(), dest); + + // take away the popularity we're sending. 
+ mds->balancer->subtract_export(dir); + + // fill export message with cache data + auto req = MExportDir::create(dir->dirfrag(), it->second.tid); + map<client_t,entity_inst_t> exported_client_map; + map<client_t,client_metadata_t> exported_client_metadata_map; + uint64_t num_exported_inodes = encode_export_dir(req->export_data, + dir, // recur start point + exported_client_map, + exported_client_metadata_map); + encode(exported_client_map, req->client_map, mds->mdsmap->get_up_features()); + encode(exported_client_metadata_map, req->client_map); + + // add bounds to message + set<CDir*> bounds; + cache->get_subtree_bounds(dir, bounds); + for (set<CDir*>::iterator p = bounds.begin(); + p != bounds.end(); + ++p) + req->add_export((*p)->dirfrag()); + + // send + mds->send_message_mds(req, dest); + ceph_assert(g_conf()->mds_kill_export_at != 8); + + mds->hit_export_target(dest, num_exported_inodes+1); + + // stats + if (mds->logger) mds->logger->inc(l_mds_exported); + if (mds->logger) mds->logger->inc(l_mds_exported_inodes, num_exported_inodes); + + cache->show_subtrees(); +} + + +/** encode_export_inode + * update our local state for this inode to export. + * encode relevant state to be sent over the wire. + * used by: encode_export_dir, file_rename (if foreign) + * + * FIXME: the separation between CInode.encode_export and these methods + * is pretty arbitrary and dumb. 
+ */ +void Migrator::encode_export_inode(CInode *in, bufferlist& enc_state, + map<client_t,entity_inst_t>& exported_client_map, + map<client_t,client_metadata_t>& exported_client_metadata_map) +{ + dout(7) << "encode_export_inode " << *in << dendl; + ceph_assert(!in->is_replica(mds->get_nodeid())); + + encode(in->inode.ino, enc_state); + encode(in->last, enc_state); + in->encode_export(enc_state); + + // caps + encode_export_inode_caps(in, true, enc_state, exported_client_map, exported_client_metadata_map); +} + +void Migrator::encode_export_inode_caps(CInode *in, bool auth_cap, bufferlist& bl, + map<client_t,entity_inst_t>& exported_client_map, + map<client_t,client_metadata_t>& exported_client_metadata_map) +{ + dout(20) << "encode_export_inode_caps " << *in << dendl; + + // encode caps + map<client_t,Capability::Export> cap_map; + in->export_client_caps(cap_map); + encode(cap_map, bl); + if (auth_cap) { + encode(in->get_mds_caps_wanted(), bl); + + in->state_set(CInode::STATE_EXPORTINGCAPS); + in->get(CInode::PIN_EXPORTINGCAPS); + } + + // make note of clients named by exported capabilities + for (const auto &p : in->get_client_caps()) { + if (exported_client_map.count(p.first)) + continue; + Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(p.first.v)); + exported_client_map[p.first] = session->info.inst; + exported_client_metadata_map[p.first] = session->info.client_metadata; + } +} + +void Migrator::finish_export_inode_caps(CInode *in, mds_rank_t peer, + map<client_t,Capability::Import>& peer_imported) +{ + dout(20) << "finish_export_inode_caps " << *in << dendl; + + in->state_clear(CInode::STATE_EXPORTINGCAPS); + in->put(CInode::PIN_EXPORTINGCAPS); + + // tell (all) clients about migrating caps.. + for (const auto &p : in->get_client_caps()) { + const Capability *cap = &p.second; + dout(7) << "finish_export_inode_caps telling client." 
<< p.first + << " exported caps on " << *in << dendl; + auto m = MClientCaps::create(CEPH_CAP_OP_EXPORT, in->ino(), 0, + cap->get_cap_id(), cap->get_mseq(), mds->get_osd_epoch_barrier()); + + map<client_t,Capability::Import>::iterator q = peer_imported.find(p.first); + ceph_assert(q != peer_imported.end()); + m->set_cap_peer(q->second.cap_id, q->second.issue_seq, q->second.mseq, + (q->second.cap_id > 0 ? peer : -1), 0); + mds->send_message_client_counted(m, p.first); + } + in->clear_client_caps_after_export(); + mds->locker->eval(in, CEPH_CAP_LOCKS); +} + +void Migrator::finish_export_inode(CInode *in, mds_rank_t peer, + map<client_t,Capability::Import>& peer_imported, + MDSContext::vec& finished) +{ + dout(12) << "finish_export_inode " << *in << dendl; + + // clean + if (in->is_dirty()) + in->mark_clean(); + + // clear/unpin cached_by (we're no longer the authority) + in->clear_replica_map(); + + // twiddle lock states for auth -> replica transition + in->authlock.export_twiddle(); + in->linklock.export_twiddle(); + in->dirfragtreelock.export_twiddle(); + in->filelock.export_twiddle(); + in->nestlock.export_twiddle(); + in->xattrlock.export_twiddle(); + in->snaplock.export_twiddle(); + in->flocklock.export_twiddle(); + in->policylock.export_twiddle(); + + // mark auth + ceph_assert(in->is_auth()); + in->state_clear(CInode::STATE_AUTH); + in->replica_nonce = CInode::EXPORT_NONCE; + + in->clear_dirty_rstat(); + + // no more auth subtree? 
clear scatter dirty + if (!in->has_subtree_root_dirfrag(mds->get_nodeid())) + in->clear_scatter_dirty(); + + in->clear_dirty_parent(); + + in->clear_file_locks(); + + // waiters + in->take_waiting(CInode::WAIT_ANY_MASK, finished); + + in->finish_export(); + + finish_export_inode_caps(in, peer, peer_imported); +} + +uint64_t Migrator::encode_export_dir(bufferlist& exportbl, + CDir *dir, + map<client_t,entity_inst_t>& exported_client_map, + map<client_t,client_metadata_t>& exported_client_metadata_map) +{ + uint64_t num_exported = 0; + + dout(7) << "encode_export_dir " << *dir << " " << dir->get_num_head_items() << " head items" << dendl; + + ceph_assert(dir->get_projected_version() == dir->get_version()); + +#ifdef MDS_VERIFY_FRAGSTAT + if (dir->is_complete()) + dir->verify_fragstat(); +#endif + + // dir + dirfrag_t df = dir->dirfrag(); + encode(df, exportbl); + dir->encode_export(exportbl); + + __u32 nden = dir->items.size(); + encode(nden, exportbl); + + // dentries + list<CDir*> subdirs; + for (auto &p : *dir) { + CDentry *dn = p.second; + CInode *in = dn->get_linkage()->get_inode(); + + num_exported++; + + // -- dentry + dout(7) << "encode_export_dir exporting " << *dn << dendl; + + // dn name + encode(dn->get_name(), exportbl); + encode(dn->last, exportbl); + + // state + dn->encode_export(exportbl); + + // points to... + + // null dentry? + if (dn->get_linkage()->is_null()) { + exportbl.append("N", 1); // null dentry + continue; + } + + if (dn->get_linkage()->is_remote()) { + // remote link + exportbl.append("L", 1); // remote link + + inodeno_t ino = dn->get_linkage()->get_remote_ino(); + unsigned char d_type = dn->get_linkage()->get_remote_d_type(); + encode(ino, exportbl); + encode(d_type, exportbl); + continue; + } + + // primary link + // -- inode + exportbl.append("I", 1); // inode dentry + + encode_export_inode(in, exportbl, exported_client_map, exported_client_metadata_map); // encode, and (update state for) export + + // directory? 
+ list<CDir*> dfs; + in->get_dirfrags(dfs); + for (list<CDir*>::iterator p = dfs.begin(); p != dfs.end(); ++p) { + CDir *t = *p; + if (!t->state_test(CDir::STATE_EXPORTBOUND)) { + // include nested dirfrag + ceph_assert(t->get_dir_auth().first == CDIR_AUTH_PARENT); + subdirs.push_front(t); // it's ours, recurse (later) + } + } + } + + // subdirs + for (auto &dir : subdirs) + num_exported += encode_export_dir(exportbl, dir, exported_client_map, exported_client_metadata_map); + + return num_exported; +} + +void Migrator::finish_export_dir(CDir *dir, mds_rank_t peer, + map<inodeno_t,map<client_t,Capability::Import> >& peer_imported, + MDSContext::vec& finished, int *num_dentries) +{ + dout(10) << "finish_export_dir " << *dir << dendl; + + // release open_by + dir->clear_replica_map(); + + // mark + ceph_assert(dir->is_auth()); + dir->state_clear(CDir::STATE_AUTH); + dir->remove_bloom(); + dir->replica_nonce = CDir::EXPORT_NONCE; + + if (dir->is_dirty()) + dir->mark_clean(); + + // suck up all waiters + dir->take_waiting(CDir::WAIT_ANY_MASK, finished); // all dir waiters + + // pop + dir->finish_export(); + + // dentries + list<CDir*> subdirs; + for (auto &p : *dir) { + CDentry *dn = p.second; + CInode *in = dn->get_linkage()->get_inode(); + + // dentry + dn->finish_export(); + + // inode? + if (dn->get_linkage()->is_primary()) { + finish_export_inode(in, peer, peer_imported[in->ino()], finished); + + // subdirs? 
+ in->get_nested_dirfrags(subdirs); + } + + cache->touch_dentry_bottom(dn); // move dentry to tail of LRU + ++(*num_dentries); + } + + // subdirs + for (list<CDir*>::iterator it = subdirs.begin(); it != subdirs.end(); ++it) + finish_export_dir(*it, peer, peer_imported, finished, num_dentries); +} + +class C_MDS_ExportFinishLogged : public MigratorLogContext { + CDir *dir; +public: + C_MDS_ExportFinishLogged(Migrator *m, CDir *d) : MigratorLogContext(m), dir(d) {} + void finish(int r) override { + mig->export_logged_finish(dir); + } +}; + + +/* + * i should get an export_ack from the export target. + */ +void Migrator::handle_export_ack(const MExportDirAck::const_ref &m) +{ + CDir *dir = cache->get_dirfrag(m->get_dirfrag()); + mds_rank_t dest(m->get_source().num()); + ceph_assert(dir); + ceph_assert(dir->is_frozen_tree_root()); // i'm exporting! + + // yay! + dout(7) << "handle_export_ack " << *dir << dendl; + + mds->hit_export_target(dest, -1); + + map<CDir*,export_state_t>::iterator it = export_state.find(dir); + ceph_assert(it != export_state.end()); + ceph_assert(it->second.state == EXPORT_EXPORTING); + ceph_assert(it->second.tid == m->get_tid()); + + auto bp = m->imported_caps.cbegin(); + decode(it->second.peer_imported, bp); + + it->second.state = EXPORT_LOGGINGFINISH; + assert (g_conf()->mds_kill_export_at != 9); + set<CDir*> bounds; + cache->get_subtree_bounds(dir, bounds); + + // log completion. + // include export bounds, to ensure they're in the journal. + EExport *le = new EExport(mds->mdlog, dir, it->second.peer);; + mds->mdlog->start_entry(le); + + le->metablob.add_dir_context(dir, EMetaBlob::TO_ROOT); + le->metablob.add_dir(dir, false); + for (set<CDir*>::iterator p = bounds.begin(); + p != bounds.end(); + ++p) { + CDir *bound = *p; + le->get_bounds().insert(bound->dirfrag()); + le->metablob.add_dir_context(bound); + le->metablob.add_dir(bound, false); + } + + // list us second, them first. 
+ // this keeps authority().first in sync with subtree auth state in the journal. + cache->adjust_subtree_auth(dir, it->second.peer, mds->get_nodeid()); + + // log export completion, then finish (unfreeze, trigger finish context, etc.) + mds->mdlog->submit_entry(le, new C_MDS_ExportFinishLogged(this, dir)); + mds->mdlog->flush(); + assert (g_conf()->mds_kill_export_at != 10); +} + +void Migrator::export_notify_abort(CDir *dir, export_state_t& stat, set<CDir*>& bounds) +{ + dout(7) << "export_notify_abort " << *dir << dendl; + + ceph_assert(stat.state == EXPORT_CANCELLING); + + if (stat.notify_ack_waiting.empty()) { + stat.state = EXPORT_CANCELLED; + return; + } + + dir->auth_pin(this); + + for (set<mds_rank_t>::iterator p = stat.notify_ack_waiting.begin(); + p != stat.notify_ack_waiting.end(); + ++p) { + auto notify = MExportDirNotify::create(dir->dirfrag(), stat.tid, true, + pair<int,int>(mds->get_nodeid(), stat.peer), + pair<int,int>(mds->get_nodeid(), CDIR_AUTH_UNKNOWN)); + for (set<CDir*>::iterator i = bounds.begin(); i != bounds.end(); ++i) + notify->get_bounds().push_back((*i)->dirfrag()); + mds->send_message_mds(notify, *p); + } +} + +/* + * this happens if hte dest failes after i send teh export data but before it is acked + * that is, we don't know they safely received and logged it, so we reverse our changes + * and go on. 
 */
// Roll back a half-finished export: undo the export marks on every dirfrag,
// dentry and inode in the subtree, unpin the bounds, notify bystanders of
// the abort, re-take sole authority, unfreeze, and re-evaluate any caps that
// went stale during the attempt.
void Migrator::export_reverse(CDir *dir, export_state_t& stat)
{
  dout(7) << "export_reverse " << *dir << dendl;

  // inodes whose caps must be re-evaluated after the rollback
  set<CInode*> to_eval;

  set<CDir*> bounds;
  cache->get_subtree_bounds(dir, bounds);

  // remove exporting pins
  list<CDir*> rq;
  rq.push_back(dir);
  while (!rq.empty()) {
    CDir *t = rq.front();
    rq.pop_front();
    t->abort_export();
    for (auto &p : *t) {
      CDentry *dn = p.second;
      dn->abort_export();
      if (!dn->get_linkage()->is_primary())
	continue;
      CInode *in = dn->get_linkage()->get_inode();
      in->abort_export();
      if (in->state_test(CInode::STATE_EVALSTALECAPS)) {
	in->state_clear(CInode::STATE_EVALSTALECAPS);
	to_eval.insert(in);
      }
      if (in->is_dir())
	in->get_nested_dirfrags(rq);
    }
  }

  // unpin bounds
  for (auto bd : bounds) {
    bd->put(CDir::PIN_EXPORTBOUND);
    bd->state_clear(CDir::STATE_EXPORTBOUND);
  }

  // notify bystanders
  export_notify_abort(dir, stat, bounds);

  // unfreeze tree, with possible subtree merge.
  cache->adjust_subtree_auth(dir, mds->get_nodeid(), mds->get_nodeid());

  // process delayed expires
  cache->process_delayed_expire(dir);

  dir->unfreeze_tree();
  cache->try_subtree_merge(dir);

  // revoke/resume stale caps
  for (auto in : to_eval) {
    bool need_issue = false;
    for (auto &p : in->client_caps) {
      Capability *cap = &p.second;
      if (!cap->is_stale()) {
	need_issue = true;
	break;
      }
    }
    if (need_issue &&
	(!in->is_auth() || !mds->locker->eval(in, CEPH_CAP_LOCKS)))
      mds->locker->issue_caps(in);
  }

  cache->show_cache();
}


/*
 * once i get the ack, and logged the EExportFinish(true),
 * send notifies (if any), otherwise go straight to finish.
+ * + */ +void Migrator::export_logged_finish(CDir *dir) +{ + dout(7) << "export_logged_finish " << *dir << dendl; + + export_state_t& stat = export_state[dir]; + + // send notifies + set<CDir*> bounds; + cache->get_subtree_bounds(dir, bounds); + + for (set<mds_rank_t>::iterator p = stat.notify_ack_waiting.begin(); + p != stat.notify_ack_waiting.end(); + ++p) { + auto notify = MExportDirNotify::create(dir->dirfrag(), stat.tid, true, + pair<int,int>(mds->get_nodeid(), stat.peer), + pair<int,int>(stat.peer, CDIR_AUTH_UNKNOWN)); + + for (set<CDir*>::iterator i = bounds.begin(); i != bounds.end(); ++i) + notify->get_bounds().push_back((*i)->dirfrag()); + + mds->send_message_mds(notify, *p); + } + + // wait for notifyacks + stat.state = EXPORT_NOTIFYING; + assert (g_conf()->mds_kill_export_at != 11); + + // no notifies to wait for? + if (stat.notify_ack_waiting.empty()) { + export_finish(dir); // skip notify/notify_ack stage. + } else { + // notify peer to send cap import messages to clients + if (!mds->is_cluster_degraded() || + mds->mdsmap->is_clientreplay_or_active_or_stopping(stat.peer)) { + mds->send_message_mds(MExportDirFinish::create(dir->dirfrag(), false, stat.tid), stat.peer); + } else { + dout(7) << "not sending MExportDirFinish, dest has failed" << dendl; + } + } +} + +/* + * warning: + * i'll get an ack from each bystander. + * when i get them all, do the export. + * notify: + * i'll get an ack from each bystander. + * when i get them all, unfreeze and send the finish. 
 */
/*
 * Bystander acked one of our notifies.  Which ack this is depends on
 * which state the export (or an aborting import) is in:
 *  - EXPORT_WARNING:    warning ack; when all arrive, start the export.
 *  - EXPORT_NOTIFYING:  notify ack; when all arrive, finish the export.
 *  - EXPORT_CANCELLING: notify ack for the cancel path.
 *  - IMPORT_ABORTING:   we are the (failed) importer reversing an import.
 */
void Migrator::handle_export_notify_ack(const MExportDirNotifyAck::const_ref &m)
{
  CDir *dir = cache->get_dirfrag(m->get_dirfrag());
  mds_rank_t dest(m->get_source().num());
  ceph_assert(dir);
  // NOTE(review): 'from' duplicates 'dest' (both m->get_source().num());
  // kept as-is to preserve behavior.
  mds_rank_t from = mds_rank_t(m->get_source().num());

  mds->hit_export_target(dest, -1);

  auto export_state_entry = export_state.find(dir);
  if (export_state_entry != export_state.end()) {
    export_state_t& stat = export_state_entry->second;
    if (stat.state == EXPORT_WARNING &&
        stat.warning_ack_waiting.erase(from)) {
      // exporting. process warning.
      dout(7) << "handle_export_notify_ack from " << m->get_source()
              << ": exporting, processing warning on " << *dir << dendl;
      if (stat.warning_ack_waiting.empty())
        export_go(dir);     // start export.
    } else if (stat.state == EXPORT_NOTIFYING &&
               stat.notify_ack_waiting.erase(from)) {
      // exporting. process notify.
      dout(7) << "handle_export_notify_ack from " << m->get_source()
              << ": exporting, processing notify on " << *dir << dendl;
      if (stat.notify_ack_waiting.empty())
        export_finish(dir);
    } else if (stat.state == EXPORT_CANCELLING &&
               m->get_new_auth().second == CDIR_AUTH_UNKNOWN && // not warning ack
               stat.notify_ack_waiting.erase(from)) {
      dout(7) << "handle_export_notify_ack from " << m->get_source()
              << ": cancelling export, processing notify on " << *dir << dendl;
      if (stat.notify_ack_waiting.empty()) {
        export_cancel_finish(export_state_entry);
      }
    }
  }
  else {
    auto import_state_entry = import_state.find(dir->dirfrag());
    if (import_state_entry != import_state.end()) {
      import_state_t& stat = import_state_entry->second;
      if (stat.state == IMPORT_ABORTING) {
        // reversing import
        dout(7) << "handle_export_notify_ack from " << m->get_source()
                << ": aborting import on " << *dir << dendl;
        ceph_assert(stat.bystanders.count(from));
        stat.bystanders.erase(from);
        if (stat.bystanders.empty())
          import_reverse_unfreeze(dir);
      }
    }
  }
}

/*
 * All acks are in (or the peer failed after the export became durable):
 * send the final commit to the importer, strip our local cache state
 * for the subtree, unpin bounds, unfreeze, and clean up export_state.
 */
void Migrator::export_finish(CDir *dir)
{
  dout(5) << "export_finish " << *dir << dendl;

  assert (g_conf()->mds_kill_export_at != 12);
  map<CDir*,export_state_t>::iterator it = export_state.find(dir);
  if (it == export_state.end()) {
    dout(7) << "target must have failed, not sending final commit message. export succeeded anyway." << dendl;
    return;
  }

  // send finish/commit to new auth
  if (!mds->is_cluster_degraded() ||
      mds->mdsmap->is_clientreplay_or_active_or_stopping(it->second.peer)) {
    mds->send_message_mds(MExportDirFinish::create(dir->dirfrag(), true, it->second.tid), it->second.peer);
  } else {
    dout(7) << "not sending MExportDirFinish last, dest has failed" << dendl;
  }
  ceph_assert(g_conf()->mds_kill_export_at != 13);

  // finish export (adjust local cache state)
  int num_dentries = 0;
  MDSContext::vec finished;
  finish_export_dir(dir, it->second.peer,
                    it->second.peer_imported, finished, &num_dentries);

  ceph_assert(!dir->is_auth());
  cache->adjust_subtree_auth(dir, it->second.peer);

  // unpin bounds
  set<CDir*> bounds;
  cache->get_subtree_bounds(dir, bounds);
  for (set<CDir*>::iterator p = bounds.begin();
       p != bounds.end();
       ++p) {
    CDir *bd = *p;
    bd->put(CDir::PIN_EXPORTBOUND);
    bd->state_clear(CDir::STATE_EXPORTBOUND);
  }

  if (dir->state_test(CDir::STATE_AUXSUBTREE))
    dir->state_clear(CDir::STATE_AUXSUBTREE);

  // discard delayed expires
  cache->discard_delayed_expire(dir);

  dout(7) << "export_finish unfreezing" << dendl;

  // unfreeze tree, with possible subtree merge.
  // (we do this _after_ removing EXPORTBOUND pins, to allow merges)
  dir->unfreeze_tree();
  cache->try_subtree_merge(dir);

  // no more auth subtree? clear scatter dirty
  if (!dir->get_inode()->is_auth() &&
      !dir->get_inode()->has_subtree_root_dirfrag(mds->get_nodeid())) {
    dir->get_inode()->clear_scatter_dirty();
    // wake up scatter_nudge waiters
    dir->get_inode()->take_waiting(CInode::WAIT_ANY_MASK, finished);
  }

  if (!finished.empty())
    mds->queue_waiters(finished);

  // grab what we still need before erasing the export_state entry
  MutationRef mut = std::move(it->second.mut);
  auto parent = std::move(it->second.parent);
  // remove from exporting list, clean up state
  total_exporting_size -= it->second.approx_size;
  export_state.erase(it);

  ceph_assert(dir->state_test(CDir::STATE_EXPORTING));
  dir->clear_exporting();

  cache->show_subtrees();
  audit();

  cache->trim(num_dentries); // try trimming exported dentries

  // send pending import_maps?
  mds->mdcache->maybe_send_pending_resolves();

  // drop locks, unpin path
  if (mut) {
    mds->locker->drop_locks(mut.get());
    mut->cleanup();
  }

  if (parent)
    child_export_finish(parent, true);

  maybe_do_queued_export();
}



// Re-dispatches a retried export_discover once an async wait completes.
class C_MDS_ExportDiscover : public MigratorContext {
public:
  C_MDS_ExportDiscover(Migrator *mig, const MExportDirDiscover::const_ref& m) : MigratorContext(mig), m(m) {}
  void finish(int r) override {
    mig->handle_export_discover(m, true);
  }
private:
  MExportDirDiscover::const_ref m;
};

// Factory handed to waiters that may fire more than once; each build()
// produces a fresh retry context for the same discover message.
class C_MDS_ExportDiscoverFactory : public MDSContextFactory {
public:
  C_MDS_ExportDiscoverFactory(Migrator *mig, MExportDirDiscover::const_ref m) : mig(mig), m(m) {}
  MDSContext *build() {
    return new C_MDS_ExportDiscover(mig, m);
  }
private:
  Migrator *mig;
  MExportDirDiscover::const_ref m;
};

// ==========================================================
// IMPORT

/*
 * Importer side, step 1: the exporter asked us to discover (fetch into
 * our cache) the base inode of the subtree.  'started' is true on a
 * retry after an async wait (root open / path traverse).
 */
void Migrator::handle_export_discover(const MExportDirDiscover::const_ref &m, bool started)
{
  mds_rank_t from = m->get_source_mds();
  ceph_assert(from != mds->get_nodeid());

  dout(7) << "handle_export_discover on " << m->get_path() << dendl;

  // note import state
  dirfrag_t df = m->get_dirfrag();

  if (!mds->is_active()) {
    dout(7) << " not active, send NACK " << dendl;
    mds->send_message_mds(MExportDirDiscoverAck::create(df, m->get_tid(), false), from);
    return;
  }

  // only start discovering on this message once.
  import_state_t *p_state;
  map<dirfrag_t,import_state_t>::iterator it = import_state.find(df);
  if (!started) {
    ceph_assert(it == import_state.end());
    p_state = &import_state[df];
    p_state->state = IMPORT_DISCOVERING;
    p_state->peer = from;
    p_state->tid = m->get_tid();
  } else {
    // am i retrying after ancient path_traverse results?
    if (it == import_state.end() ||
        it->second.peer != from ||
        it->second.tid != m->get_tid()) {
      dout(7) << " dropping obsolete message" << dendl;
      return;
    }
    ceph_assert(it->second.state == IMPORT_DISCOVERING);
    p_state = &it->second;
  }

  C_MDS_ExportDiscoverFactory cf(this, m);
  if (!mds->mdcache->is_open()) {
    dout(5) << " waiting for root" << dendl;
    mds->mdcache->wait_for_open(cf.build());
    return;
  }

  assert (g_conf()->mds_kill_import_at != 1);

  // do we have it?
  CInode *in = cache->get_inode(m->get_dirfrag().ino);
  if (!in) {
    // must discover it!
    filepath fpath(m->get_path());
    vector<CDentry*> trace;
    MDRequestRef null_ref;
    // r > 0: traverse is in progress and will re-invoke us via cf
    int r = cache->path_traverse(null_ref, cf, fpath, &trace, NULL, MDS_TRAVERSE_DISCOVER);
    if (r > 0) return;
    if (r < 0) {
      dout(7) << "handle_export_discover failed to discover or not dir " << m->get_path() << ", NAK" << dendl;
      ceph_abort();    // this shouldn't happen if the auth pins its path properly!!!!
    }

    ceph_abort(); // this shouldn't happen; the get_inode above would have succeeded.
  }

  // yay
  dout(7) << "handle_export_discover have " << df << " inode " << *in << dendl;

  p_state->state = IMPORT_DISCOVERED;

  // pin inode in the cache (for now)
  ceph_assert(in->is_dir());
  in->get(CInode::PIN_IMPORTING);

  // reply
  dout(7) << " sending export_discover_ack on " << *in << dendl;
  mds->send_message_mds(MExportDirDiscoverAck::create(df, m->get_tid()), p_state->peer);
  assert (g_conf()->mds_kill_import_at != 2);
}

// Abort an import that never got past DISCOVERING: just drop the state.
void Migrator::import_reverse_discovering(dirfrag_t df)
{
  import_state.erase(df);
}

// Abort an import in DISCOVERED state: drop the base-inode pin too.
void Migrator::import_reverse_discovered(dirfrag_t df, CInode *diri)
{
  // unpin base
  diri->put(CInode::PIN_IMPORTING);
  import_state.erase(df);
}

// Abort an import in PREPPING state: unpin bounds and tear down state.
void Migrator::import_reverse_prepping(CDir *dir, import_state_t& stat)
{
  set<CDir*> bounds;
  cache->map_dirfrag_set(stat.bound_ls, bounds);
  import_remove_pins(dir, bounds);
  import_reverse_final(dir);
}

/*
 * The exporter cancelled the export before handing off the data; undo
 * whatever import progress we have made, dispatching on our state.
 */
void Migrator::handle_export_cancel(const MExportDirCancel::const_ref &m)
{
  dout(7) << "handle_export_cancel on " << m->get_dirfrag() << dendl;
  dirfrag_t df = m->get_dirfrag();
  map<dirfrag_t,import_state_t>::iterator it = import_state.find(df);
  if (it == import_state.end()) {
    ceph_abort_msg("got export_cancel in weird state");
  } else if (it->second.state == IMPORT_DISCOVERING) {
    import_reverse_discovering(df);
  } else if (it->second.state == IMPORT_DISCOVERED) {
    CInode *in = cache->get_inode(df.ino);
    ceph_assert(in);
    import_reverse_discovered(df, in);
  } else if (it->second.state == IMPORT_PREPPING) {
    CDir *dir = mds->mdcache->get_dirfrag(df);
    ceph_assert(dir);
    import_reverse_prepping(dir, it->second);
  } else if (it->second.state == IMPORT_PREPPED) {
    CDir *dir = mds->mdcache->get_dirfrag(df);
    ceph_assert(dir);
    set<CDir*> bounds;
    cache->get_subtree_bounds(dir, bounds);
    import_remove_pins(dir, bounds);
    // adjust auth back to the exporter
    cache->adjust_subtree_auth(dir, it->second.peer);
    import_reverse_unfreeze(dir);
  } else {
    ceph_abort_msg("got export_cancel in weird state");
  }
}

// Re-dispatches a retried export_prep once an async wait completes.
class C_MDS_ExportPrep : public MigratorContext {
public:
  C_MDS_ExportPrep(Migrator *mig, const MExportDirPrep::const_ref& m) : MigratorContext(mig), m(m) {}
  void finish(int r) override {
    mig->handle_export_prep(m, true);
  }
private:
  MExportDirPrep::const_ref m;
};

// Factory for retry contexts for the same prep message (see discover
// factory above).
class C_MDS_ExportPrepFactory : public MDSContextFactory {
public:
  C_MDS_ExportPrepFactory(Migrator *mig, MExportDirPrep::const_ref m) : mig(mig), m(m) {}
  MDSContext *build() {
    return new C_MDS_ExportPrep(mig, m);
  }
private:
  Migrator *mig;
  MExportDirPrep::const_ref m;
};

/*
 * Importer side, step 2: assimilate the replica metadata the exporter
 * sent (base dir, dentry/inode traces to the bounds), open and pin all
 * bound dirfrags, then mark ourselves ambiguous auth and freeze the
 * region.  'did_assim' is true when re-entered after waiting for
 * remote bound dirfrags to open.
 */
void Migrator::handle_export_prep(const MExportDirPrep::const_ref &m, bool did_assim)
{
  mds_rank_t oldauth = mds_rank_t(m->get_source().num());
  ceph_assert(oldauth != mds->get_nodeid());

  CDir *dir;
  CInode *diri;
  MDSContext::vec finished;

  // assimilate root dir.
  map<dirfrag_t,import_state_t>::iterator it = import_state.find(m->get_dirfrag());
  if (!did_assim) {
    ceph_assert(it != import_state.end());
    ceph_assert(it->second.state == IMPORT_DISCOVERED);
    ceph_assert(it->second.peer == oldauth);
    diri = cache->get_inode(m->get_dirfrag().ino);
    ceph_assert(diri);
    auto p = m->basedir.cbegin();
    dir = cache->add_replica_dir(p, diri, oldauth, finished);
    dout(7) << "handle_export_prep on " << *dir << " (first pass)" << dendl;
  } else {
    if (it == import_state.end() ||
        it->second.peer != oldauth ||
        it->second.tid != m->get_tid()) {
      dout(7) << "handle_export_prep obsolete message, dropping" << dendl;
      return;
    }
    ceph_assert(it->second.state == IMPORT_PREPPING);
    ceph_assert(it->second.peer == oldauth);

    dir = cache->get_dirfrag(m->get_dirfrag());
    ceph_assert(dir);
    dout(7) << "handle_export_prep on " << *dir << " (subsequent pass)" << dendl;
    diri = dir->get_inode();
  }
  ceph_assert(dir->is_auth() == false);

  cache->show_subtrees();

  // build import bound map
  map<inodeno_t, fragset_t> import_bound_fragset;
  for (const auto &bound : m->get_bounds()) {
    dout(10) << " bound " << bound << dendl;
    import_bound_fragset[bound.ino].insert(bound.frag);
  }

  // assimilate contents?
  if (!did_assim) {
    dout(7) << "doing assim on " << *dir << dendl;

    // change import state
    it->second.state = IMPORT_PREPPING;
    it->second.bound_ls = m->get_bounds();
    it->second.bystanders = m->get_bystanders();
    ceph_assert(g_conf()->mds_kill_import_at != 3);

    // bystander list
    dout(7) << "bystanders are " << it->second.bystanders << dendl;

    // move pin to dir
    diri->put(CInode::PIN_IMPORTING);
    dir->get(CDir::PIN_IMPORTING);
    dir->state_set(CDir::STATE_IMPORTING);

    // assimilate traces to exports
    // each trace is: df ('-' | ('f' dir | 'd') dentry inode (dir dentry inode)*)
    for (const auto &bl : m->traces) {
      auto q = bl.cbegin();
      dirfrag_t df;
      decode(df, q);
      char start;
      decode(start, q);
      dout(10) << " trace from " << df << " start " << start << " len " << bl.length() << dendl;

      CDir *cur = 0;
      if (start == 'd') {
        cur = cache->get_dirfrag(df);
        ceph_assert(cur);
        dout(10) << " had " << *cur << dendl;
      } else if (start == 'f') {
        CInode *in = cache->get_inode(df.ino);
        ceph_assert(in);
        dout(10) << " had " << *in << dendl;
        cur = cache->add_replica_dir(q, in, oldauth, finished);
        dout(10) << " added " << *cur << dendl;
      } else if (start == '-') {
        // nothing
      } else
        ceph_abort_msg("unrecognized start char");

      // walk the rest of the trace: dentry, inode, (dir, dentry, inode)*
      while (!q.end()) {
        CDentry *dn = cache->add_replica_dentry(q, cur, finished);
        dout(10) << " added " << *dn << dendl;
        CInode *in = cache->add_replica_inode(q, dn, finished);
        dout(10) << " added " << *in << dendl;
        if (q.end())
          break;
        cur = cache->add_replica_dir(q, in, oldauth, finished);
        dout(10) << " added " << *cur << dendl;
      }
    }

    // make bound sticky
    // keep all dirfrags of bound inodes pinned in cache while prepping
    for (map<inodeno_t,fragset_t>::iterator p = import_bound_fragset.begin();
         p != import_bound_fragset.end();
         ++p) {
      CInode *in = cache->get_inode(p->first);
      ceph_assert(in);
      in->get_stickydirs();
      dout(7) << " set stickydirs on bound inode " << *in << dendl;
    }

  } else {
    dout(7) << " not doing assim on " << *dir << dendl;
  }

  MDSGatherBuilder gather(g_ceph_context);

  if (!finished.empty())
    mds->queue_waiters(finished);


  bool success = true;
  if (mds->is_active()) {
    // open all bounds
    set<CDir*> import_bounds;
    for (map<inodeno_t,fragset_t>::iterator p = import_bound_fragset.begin();
         p != import_bound_fragset.end();
         ++p) {
      CInode *in = cache->get_inode(p->first);
      ceph_assert(in);

      // map fragset into a frag_t list, based on the inode fragtree
      frag_vec_t leaves;
      for (const auto& frag : p->second) {
        in->dirfragtree.get_leaves_under(frag, leaves);
      }
      dout(10) << " bound inode " << p->first << " fragset " << p->second << " maps to " << leaves << dendl;

      for (const auto& leaf : leaves) {
        CDir *bound = cache->get_dirfrag(dirfrag_t(p->first, leaf));
        if (!bound) {
          dout(7) << " opening bounding dirfrag " << leaf << " on " << *in << dendl;
          cache->open_remote_dirfrag(in, leaf, gather.new_sub());
          continue;
        }

        if (!bound->state_test(CDir::STATE_IMPORTBOUND)) {
          dout(7) << " pinning import bound " << *bound << dendl;
          bound->get(CDir::PIN_IMPORTBOUND);
          bound->state_set(CDir::STATE_IMPORTBOUND);
        } else {
          dout(7) << " already pinned import bound " << *bound << dendl;
        }
        import_bounds.insert(bound);
      }
    }

    if (gather.has_subs()) {
      // some bounds need to be fetched; retry this handler once done
      C_MDS_ExportPrepFactory cf(this, m);
      gather.set_finisher(cf.build());
      gather.activate();
      return;
    }

    dout(7) << " all ready, noting auth and freezing import region" << dendl;

    if (!mds->mdcache->is_readonly() &&
        diri->filelock.can_wrlock(-1) &&
        diri->nestlock.can_wrlock(-1)) {
      it->second.mut = new MutationImpl();
      // force some locks. hacky.
      mds->locker->wrlock_force(&dir->inode->filelock, it->second.mut);
      mds->locker->wrlock_force(&dir->inode->nestlock, it->second.mut);

      // note that i am an ambiguous auth for this subtree.
      // specify bounds, since the exporter explicitly defines the region.
      cache->adjust_bounded_subtree_auth(dir, import_bounds,
                                         pair<int,int>(oldauth, mds->get_nodeid()));
      cache->verify_subtree_bounds(dir, import_bounds);
      // freeze.
      dir->_freeze_tree();
      // note new state
      it->second.state = IMPORT_PREPPED;
    } else {
      dout(7) << " couldn't acquire all needed locks, failing. " << *dir << dendl;
      success = false;
    }
  } else {
    dout(7) << " not active, failing. " << *dir << dendl;
    success = false;
  }

  if (!success)
    import_reverse_prepping(dir, it->second);

  // ok!
  // NOTE(review): the prep ack is sent even on the failure paths above
  // (with success=false), telling the exporter to cancel — confirm this
  // is the intended contract with handle_export_prep_ack.
  dout(7) << " sending export_prep_ack on " << *dir << dendl;
  mds->send_message(MExportDirPrepAck::create(dir->dirfrag(), success, m->get_tid()), m->get_connection());

  ceph_assert(g_conf()->mds_kill_import_at != 4);
}




// Completion for the EImportStart journal write; carries the sessions
// force-opened for imported client caps into import_logged_start().
class C_MDS_ImportDirLoggedStart : public MigratorLogContext {
  dirfrag_t df;
  CDir *dir;
  mds_rank_t from;
public:
  map<client_t,pair<Session*,uint64_t> > imported_session_map;

  C_MDS_ImportDirLoggedStart(Migrator *m, CDir *d, mds_rank_t f) :
    MigratorLogContext(m), df(d->dirfrag()), dir(d), from(f) {
    dir->get(CDir::PIN_PTRWAITER);
  }
  void finish(int r) override {
    mig->import_logged_start(df, dir, from, imported_session_map);
    dir->put(CDir::PIN_PTRWAITER);
  }
};

/*
 * Importer side, step 3: the exporter sent the actual subtree payload.
 * Decode it into our cache, journal an EImportStart, and adjust auth to
 * the ambiguous (me, exporter) pair until the ack round-trip completes.
 */
void Migrator::handle_export_dir(const MExportDir::const_ref &m)
{
  assert (g_conf()->mds_kill_import_at != 5);
  CDir *dir = cache->get_dirfrag(m->dirfrag);
  ceph_assert(dir);

  mds_rank_t oldauth = mds_rank_t(m->get_source().num());
  dout(7) << "handle_export_dir importing " << *dir << " from " << oldauth << dendl;

  ceph_assert(!dir->is_auth());
  ceph_assert(dir->freeze_tree_state);

  map<dirfrag_t,import_state_t>::iterator it = import_state.find(m->dirfrag);
  ceph_assert(it != import_state.end());
  ceph_assert(it->second.state == IMPORT_PREPPED);
  ceph_assert(it->second.tid == m->get_tid());
  ceph_assert(it->second.peer == oldauth);

  if (!dir->get_inode()->dirfragtree.is_leaf(dir->get_frag()))
    dir->get_inode()->dirfragtree.force_to_leaf(g_ceph_context, dir->get_frag());

  cache->show_subtrees();

  C_MDS_ImportDirLoggedStart *onlogged = new C_MDS_ImportDirLoggedStart(this, dir, oldauth);

  // start the journal entry
  EImportStart *le = new EImportStart(mds->mdlog, dir->dirfrag(), m->bounds, oldauth);
  mds->mdlog->start_entry(le);

  le->metablob.add_dir_context(dir);

  // adjust auth (list us _first_)
  cache->adjust_subtree_auth(dir, mds->get_nodeid(), oldauth);

  // new client sessions, open these after we journal
  // include imported sessions in EImportStart
  auto cmp = m->client_map.cbegin();
  map<client_t,entity_inst_t> client_map;
  map<client_t,client_metadata_t> client_metadata_map;
  decode(client_map, cmp);
  decode(client_metadata_map, cmp);
  ceph_assert(cmp.end());
  le->cmapv = mds->server->prepare_force_open_sessions(client_map, client_metadata_map,
                                                       onlogged->imported_session_map);
  encode(client_map, le->client_map, mds->mdsmap->get_up_features());
  encode(client_metadata_map, le->client_map);

  // decode the dirfrag payloads (one decode_import_dir per dirfrag)
  auto blp = m->export_data.cbegin();
  int num_imported_inodes = 0;
  while (!blp.end()) {
    num_imported_inodes +=
      decode_import_dir(blp,
                        oldauth,
                        dir,                 // import root
                        le,
                        mds->mdlog->get_current_segment(),
                        it->second.peer_exports,
                        it->second.updated_scatterlocks);
  }
  dout(10) << " " << m->bounds.size() << " imported bounds" << dendl;

  // include bounds in EImportStart
  set<CDir*> import_bounds;
  for (const auto &bound : m->bounds) {
    CDir *bd = cache->get_dirfrag(bound);
    ceph_assert(bd);
    le->metablob.add_dir(bd, false);  // note that parent metadata is already in the event
    import_bounds.insert(bd);
  }
  cache->verify_subtree_bounds(dir, import_bounds);

  // adjust popularity
  mds->balancer->add_import(dir);

  dout(7) << "handle_export_dir did " << *dir << dendl;

  // note state
  it->second.state = IMPORT_LOGGINGSTART;
  assert (g_conf()->mds_kill_import_at != 6);

  // log it
  mds->mdlog->submit_entry(le, onlogged);
  mds->mdlog->flush();

  // some stats
  if (mds->logger) {
    mds->logger->inc(l_mds_imported);
    mds->logger->inc(l_mds_imported_inodes, num_imported_inodes);
  }
}


/*
 * this is an import helper
 *  called by import_finish, and import_reverse and friends.
 * drops the IMPORTING pin/state on the root and the sticky/bound pins
 * taken during prep, as appropriate for the import's current state.
 */
void Migrator::import_remove_pins(CDir *dir, set<CDir*>& bounds)
{
  import_state_t& stat = import_state[dir->dirfrag()];
  // root
  dir->put(CDir::PIN_IMPORTING);
  dir->state_clear(CDir::STATE_IMPORTING);

  // bounding inodes
  set<inodeno_t> did;
  for (list<dirfrag_t>::iterator p = stat.bound_ls.begin();
       p != stat.bound_ls.end();
       ++p) {
    if (did.count(p->ino))
      continue;
    did.insert(p->ino);
    CInode *in = cache->get_inode(p->ino);
    ceph_assert(in);
    in->put_stickydirs();
  }

  if (stat.state == IMPORT_PREPPING) {
    // prepping may have been interrupted mid-way: only some bounds are
    // pinned, so the IMPORTBOUND flag must be checked per bound.
    for (auto bd : bounds) {
      if (bd->state_test(CDir::STATE_IMPORTBOUND)) {
        bd->put(CDir::PIN_IMPORTBOUND);
        bd->state_clear(CDir::STATE_IMPORTBOUND);
      }
    }
  } else if (stat.state >= IMPORT_PREPPED) {
    // bounding dirfrags
    for (auto bd : bounds) {
      ceph_assert(bd->state_test(CDir::STATE_IMPORTBOUND));
      bd->put(CDir::PIN_IMPORTBOUND);
      bd->state_clear(CDir::STATE_IMPORTBOUND);
    }
  }
}

// Queues its collected waiters at the front of the MDS finisher queue,
// so they run immediately after this context itself.
class C_MDC_QueueContexts : public MigratorContext {
public:
  MDSContext::vec contexts;
  C_MDC_QueueContexts(Migrator *m) : MigratorContext(m) {}
  void finish(int r) override {
    // execute contexts immediately after 'this' context
    get_mds()->queue_waiters_front(contexts);
  }
};

/*
 * note: this does the full work of reversing an import and cleaning up
 *  state.
 * called by both handle_mds_failure and by handle_resolve (if we are
 *  a survivor coping with an exporter failure+recovery.
 */
/*
 * Fully unwind an import: hand authority back to the exporter, strip
 * AUTH bits and replica/lock state from everything we assimilated,
 * drop imported caps, journal the failure, and notify bystanders.
 */
void Migrator::import_reverse(CDir *dir)
{
  dout(7) << "import_reverse " << *dir << dendl;

  import_state_t& stat = import_state[dir->dirfrag()];
  stat.state = IMPORT_ABORTING;

  set<CDir*> bounds;
  cache->get_subtree_bounds(dir, bounds);

  // remove pins
  import_remove_pins(dir, bounds);

  // update auth, with possible subtree merge.
  ceph_assert(dir->is_subtree_root());
  if (mds->is_resolve())
    cache->trim_non_auth_subtree(dir);

  cache->adjust_subtree_auth(dir, stat.peer);

  auto fin = new C_MDC_QueueContexts(this);
  if (!dir->get_inode()->is_auth() &&
      !dir->get_inode()->has_subtree_root_dirfrag(mds->get_nodeid())) {
    dir->get_inode()->clear_scatter_dirty();
    // wake up scatter_nudge waiters
    dir->get_inode()->take_waiting(CInode::WAIT_ANY_MASK, fin->contexts);
  }

  int num_dentries = 0;
  // adjust auth bits.
  // breadth-first walk of all imported dirfrags, stopping at bounds
  list<CDir*> q;
  q.push_back(dir);
  while (!q.empty()) {
    CDir *cur = q.front();
    q.pop_front();

    // dir
    cur->abort_import();

    for (auto &p : *cur) {
      CDentry *dn = p.second;

      // dentry
      dn->state_clear(CDentry::STATE_AUTH);
      dn->clear_replica_map();
      dn->set_replica_nonce(CDentry::EXPORT_NONCE);
      if (dn->is_dirty())
        dn->mark_clean();

      // inode?
      if (dn->get_linkage()->is_primary()) {
        CInode *in = dn->get_linkage()->get_inode();
        // STATE_AUTH is inherited from MDSCacheObject, so the
        // CDentry:: qualifier works here too; kept as-is.
        in->state_clear(CDentry::STATE_AUTH);
        in->clear_replica_map();
        in->set_replica_nonce(CInode::EXPORT_NONCE);
        if (in->is_dirty())
          in->mark_clean();
        in->clear_dirty_rstat();
        if (!in->has_subtree_root_dirfrag(mds->get_nodeid())) {
          in->clear_scatter_dirty();
          in->take_waiting(CInode::WAIT_ANY_MASK, fin->contexts);
        }

        in->clear_dirty_parent();

        in->authlock.clear_gather();
        in->linklock.clear_gather();
        in->dirfragtreelock.clear_gather();
        in->filelock.clear_gather();

        in->clear_file_locks();

        // non-bounding dir?
        list<CDir*> dfs;
        in->get_dirfrags(dfs);
        for (list<CDir*>::iterator p = dfs.begin(); p != dfs.end(); ++p)
          if (bounds.count(*p) == 0)
            q.push_back(*p);
      }

      cache->touch_dentry_bottom(dn); // move dentry to tail of LRU
      ++num_dentries;
    }
  }

  dir->add_waiter(CDir::WAIT_UNFREEZE, fin);

  // NOTE(review): stat.state was unconditionally set to IMPORT_ABORTING
  // at the top of this function, so this IMPORT_ACKING branch looks
  // unreachable from here — confirm against upstream history whether
  // the state capture should happen before the assignment.
  if (stat.state == IMPORT_ACKING) {
    // remove imported caps
    for (map<CInode*,map<client_t,Capability::Export> >::iterator p = stat.peer_exports.begin();
         p != stat.peer_exports.end();
         ++p) {
      CInode *in = p->first;
      for (map<client_t,Capability::Export>::iterator q = p->second.begin();
           q != p->second.end();
           ++q) {
        Capability *cap = in->get_client_cap(q->first);
        if (!cap) {
          ceph_assert(!stat.session_map.count(q->first));
          continue;
        }
        if (cap->is_importing())
          in->remove_client_cap(q->first);
      }
      in->put(CInode::PIN_IMPORTINGCAPS);
    }
    for (auto& p : stat.session_map) {
      Session *session = p.second.first;
      session->dec_importing();
    }
  }

  // log our failure
  mds->mdlog->start_submit_entry(new EImportFinish(dir, false));  // log failure

  cache->trim(num_dentries); // try trimming dentries

  // notify bystanders; wait in aborting state
  import_notify_abort(dir, bounds);
}

/*
 * Tell every bystander the import committed: auth goes from the
 * ambiguous (exporter, me) pair to (me, unknown).
 */
void Migrator::import_notify_finish(CDir *dir, set<CDir*>& bounds)
{
  dout(7) << "import_notify_finish " << *dir << dendl;

  import_state_t& stat = import_state[dir->dirfrag()];
  for (set<mds_rank_t>::iterator p = stat.bystanders.begin();
       p != stat.bystanders.end();
       ++p) {
    auto notify = MExportDirNotify::create(dir->dirfrag(), stat.tid, false,
        pair<int,int>(stat.peer, mds->get_nodeid()),
        pair<int,int>(mds->get_nodeid(), CDIR_AUTH_UNKNOWN));
    for (set<CDir*>::iterator i = bounds.begin(); i != bounds.end(); ++i)
      notify->get_bounds().push_back((*i)->dirfrag());
    mds->send_message_mds(notify, *p);
  }
}

/*
 * Tell every live bystander the import is being aborted: auth reverts
 * to the exporter.  If no bystanders remain, unfreeze immediately.
 */
void Migrator::import_notify_abort(CDir *dir, set<CDir*>& bounds)
{
  dout(7) << "import_notify_abort " << *dir << dendl;

  import_state_t& stat = import_state[dir->dirfrag()];
  for (set<mds_rank_t>::iterator p = stat.bystanders.begin();
       p != stat.bystanders.end(); ) {
    if (mds->is_cluster_degraded() &&
        !mds->mdsmap->is_clientreplay_or_active_or_stopping(*p)) {
      // this can happen if both exporter and bystander fail in the same mdsmap epoch
      stat.bystanders.erase(p++);
      continue;
    }
    auto notify = MExportDirNotify::create(dir->dirfrag(), stat.tid, true,
        mds_authority_t(stat.peer, mds->get_nodeid()),
        mds_authority_t(stat.peer, CDIR_AUTH_UNKNOWN));
    for (set<CDir*>::iterator i = bounds.begin(); i != bounds.end(); ++i)
      notify->get_bounds().push_back((*i)->dirfrag());
    mds->send_message_mds(notify, *p);
    ++p;
  }
  if (stat.bystanders.empty()) {
    dout(7) << "no bystanders, finishing reverse now" << dendl;
    import_reverse_unfreeze(dir);
  } else {
    assert (g_conf()->mds_kill_import_at != 10);
  }
}

// Final unfreeze step of an aborted import, once all bystanders acked.
void Migrator::import_reverse_unfreeze(CDir *dir)
{
  dout(7) << "import_reverse_unfreeze " << *dir << dendl;
  ceph_assert(!dir->is_auth());
  cache->discard_delayed_expire(dir);
  dir->unfreeze_tree();
  if (dir->is_subtree_root())
    cache->try_subtree_merge(dir);
  import_reverse_final(dir);
}

// Tear down the import_state entry and release any forced locks.
void Migrator::import_reverse_final(CDir *dir)
{
  dout(7) << "import_reverse_final " << *dir << dendl;

  // clean up
  map<dirfrag_t, import_state_t>::iterator it = import_state.find(dir->dirfrag());
  ceph_assert(it != import_state.end());

  MutationRef mut = it->second.mut;
  import_state.erase(it);

  // send pending import_maps?
  mds->mdcache->maybe_send_pending_resolves();

  if (mut) {
    mds->locker->drop_locks(mut.get());
    mut->cleanup();
  }

  cache->show_subtrees();
  //audit();  // this fails, bc we munge up the subtree map during handle_import_map (resolve phase)
}




/*
 * EImportStart is durable: finish force-opening the imported client
 * sessions, stage the imported caps, and ack back to the old auth.
 */
void Migrator::import_logged_start(dirfrag_t df, CDir *dir, mds_rank_t from,
                                   map<client_t,pair<Session*,uint64_t> >& imported_session_map)
{
  dout(7) << "import_logged " << *dir << dendl;

  map<dirfrag_t, import_state_t>::iterator it = import_state.find(dir->dirfrag());
  if (it == import_state.end() ||
      it->second.state != IMPORT_LOGGINGSTART) {
    dout(7) << "import " << df << " must have aborted" << dendl;
    mds->server->finish_force_open_sessions(imported_session_map);
    return;
  }

  // note state
  it->second.state = IMPORT_ACKING;

  assert (g_conf()->mds_kill_import_at != 7);

  // force open client sessions and finish cap import
  mds->server->finish_force_open_sessions(imported_session_map, false);

  map<inodeno_t,map<client_t,Capability::Import> > imported_caps;
  for (map<CInode*, map<client_t,Capability::Export> >::iterator p = it->second.peer_exports.begin();
       p != it->second.peer_exports.end();
       ++p) {
    // parameter 'peer' is NONE, delay sending cap import messages to client
    finish_import_inode_caps(p->first, MDS_RANK_NONE, true, imported_session_map,
                             p->second, imported_caps[p->first->ino()]);
  }

  it->second.session_map.swap(imported_session_map);

  // send notify's etc.
  dout(7) << "sending ack for " << *dir << " to old auth mds." << from << dendl;

  // test surviving observer of a failed migration that did not complete
  //assert(dir->replica_map.size() < 2 || mds->get_nodeid() != 0);

  auto ack = MExportDirAck::create(dir->dirfrag(), it->second.tid);
  encode(imported_caps, ack->imported_caps);

  mds->send_message_mds(ack, from);
  assert (g_conf()->mds_kill_import_at != 8);

  cache->show_subtrees();
}

/*
 * The exporter committed (or sent the intermediate non-last finish):
 * drive import_finish for the corresponding phase.
 */
void Migrator::handle_export_finish(const MExportDirFinish::const_ref &m)
{
  CDir *dir = cache->get_dirfrag(m->get_dirfrag());
  ceph_assert(dir);
  dout(7) << "handle_export_finish on " << *dir << (m->is_last() ? " last" : "") << dendl;

  map<dirfrag_t,import_state_t>::iterator it = import_state.find(m->get_dirfrag());
  ceph_assert(it != import_state.end());
  ceph_assert(it->second.tid == m->get_tid());

  import_finish(dir, false, m->is_last());
}

/*
 * Commit the import.  On the first (ACKING) pass we take full auth and
 * push imported caps out to clients; on the last pass we journal
 * EImportFinish(true), unpin, unfreeze, and drop the import_state.
 */
void Migrator::import_finish(CDir *dir, bool notify, bool last)
{
  dout(7) << "import_finish on " << *dir << dendl;

  map<dirfrag_t,import_state_t>::iterator it = import_state.find(dir->dirfrag());
  ceph_assert(it != import_state.end());
  ceph_assert(it->second.state == IMPORT_ACKING || it->second.state == IMPORT_FINISHING);

  if (it->second.state == IMPORT_ACKING) {
    ceph_assert(dir->is_auth());
    cache->adjust_subtree_auth(dir, mds->get_nodeid(), mds->get_nodeid());
  }

  // log finish
  ceph_assert(g_conf()->mds_kill_import_at != 9);

  if (it->second.state == IMPORT_ACKING) {
    // merge the imported cap state and tell the clients about their
    // caps' new home
    for (map<CInode*, map<client_t,Capability::Export> >::iterator p = it->second.peer_exports.begin();
         p != it->second.peer_exports.end();
         ++p) {
      CInode *in = p->first;
      ceph_assert(in->is_auth());
      for (map<client_t,Capability::Export>::iterator q = p->second.begin();
           q != p->second.end();
           ++q) {
        auto r = it->second.session_map.find(q->first);
        if (r == it->second.session_map.end())
          continue;

        Session *session = r->second.first;
        Capability *cap = in->get_client_cap(q->first);
        ceph_assert(cap);
        cap->merge(q->second, true);
        cap->clear_importing();
        mds->mdcache->do_cap_import(session, in, cap, q->second.cap_id, q->second.seq,
                                    q->second.mseq - 1, it->second.peer, CEPH_CAP_FLAG_AUTH);
      }
      p->second.clear();
      in->replica_caps_wanted = 0;
    }
    for (auto& p : it->second.session_map) {
      Session *session = p.second.first;
      session->dec_importing();
    }
  }

  if (!last) {
    ceph_assert(it->second.state == IMPORT_ACKING);
    it->second.state = IMPORT_FINISHING;
    return;
  }

  // remove pins
  set<CDir*> bounds;
  cache->get_subtree_bounds(dir, bounds);

  if (notify)
    import_notify_finish(dir, bounds);

  import_remove_pins(dir, bounds);

  map<CInode*, map<client_t,Capability::Export> > peer_exports;
  it->second.peer_exports.swap(peer_exports);

  // clear import state (we're done!)
  MutationRef mut = it->second.mut;
  import_state.erase(it);

  mds->mdlog->start_submit_entry(new EImportFinish(dir, true));

  // process delayed expires
  cache->process_delayed_expire(dir);

  // unfreeze tree, with possible subtree merge.
  dir->unfreeze_tree();
  cache->try_subtree_merge(dir);

  cache->show_subtrees();
  //audit();  // this fails, bc we munge up the subtree map during handle_import_map (resolve phase)

  if (mut) {
    mds->locker->drop_locks(mut.get());
    mut->cleanup();
  }

  // re-eval imported caps
  for (map<CInode*, map<client_t,Capability::Export> >::iterator p = peer_exports.begin();
       p != peer_exports.end();
       ++p) {
    if (p->first->is_auth())
      mds->locker->eval(p->first, CEPH_CAP_LOCKS, true);
    p->first->put(CInode::PIN_IMPORTINGCAPS);
  }

  // send pending import_maps?
  mds->mdcache->maybe_send_pending_resolves();

  // did i just import mydir?
  if (dir->ino() == MDS_INO_MDSDIR(mds->get_nodeid()))
    cache->populate_mydir();

  // is it empty?
  if (dir->get_num_head_items() == 0 &&
      !dir->inode->is_auth()) {
    // reexport!
    export_empty_import(dir);
  }
}


/*
 * Decode one imported inode under dentry dn: recreate or reuse the
 * CInode, absorb its on-wire state and caps, link it in, and refresh
 * replica/scatterlock bookkeeping.
 */
void Migrator::decode_import_inode(CDentry *dn, bufferlist::const_iterator& blp,
                                   mds_rank_t oldauth, LogSegment *ls,
                                   map<CInode*, map<client_t,Capability::Export> >& peer_exports,
                                   list<ScatterLock*>& updated_scatterlocks)
{
  dout(15) << "decode_import_inode on " << *dn << dendl;

  inodeno_t ino;
  snapid_t last;
  decode(ino, blp);
  decode(last, blp);

  bool added = false;
  CInode *in = cache->get_inode(ino, last);
  if (!in) {
    in = new CInode(mds->mdcache, true, 1, last);
    added = true;
  }

  // state after link -- or not! -sage
  in->decode_import(blp, ls);  // cap imports are noted for later action

  // caps
  decode_import_inode_caps(in, true, blp, peer_exports);

  // link before state -- or not! -sage
  if (dn->get_linkage()->get_inode() != in) {
    ceph_assert(!dn->get_linkage()->get_inode());
    dn->dir->link_primary_inode(dn, in);
  }

  if (in->is_dir())
    dn->dir->pop_lru_subdirs.push_back(&in->item_pop_lru);

  // add inode?
  if (added) {
    cache->add_inode(in);
    dout(10) << "added " << *in << dendl;
  } else {
    dout(10) << " had " << *in << dendl;
  }

  if (in->inode.is_dirty_rstat())
    in->mark_dirty_rstat();

  // clear if dirtyscattered, since we're going to journal this
  //  but not until we _actually_ finish the import...
  if (in->filelock.is_dirty()) {
    updated_scatterlocks.push_back(&in->filelock);
    mds->locker->mark_updated_scatterlock(&in->filelock);
  }

  if (in->dirfragtreelock.is_dirty()) {
    updated_scatterlocks.push_back(&in->dirfragtreelock);
    mds->locker->mark_updated_scatterlock(&in->dirfragtreelock);
  }

  // adjust replica list
  //assert(!in->is_replica(oldauth));  // not true on failed export
  in->add_replica(oldauth, CInode::EXPORT_NONCE);
  if (in->is_replica(mds->get_nodeid()))
    in->remove_replica(mds->get_nodeid());

  if (in->snaplock.is_stable() &&
      in->snaplock.get_state() != LOCK_SYNC)
    mds->locker->try_eval(&in->snaplock, NULL);
}

/*
 * Decode the per-client cap exports (and, on the auth cap, the
 * mds_caps_wanted map) for inode in; stage them in peer_exports for
 * finish_import_inode_caps, pinning the inode while they're pending.
 */
void Migrator::decode_import_inode_caps(CInode *in, bool auth_cap,
                                        bufferlist::const_iterator &blp,
                                        map<CInode*, map<client_t,Capability::Export> >& peer_exports)
{
  map<client_t,Capability::Export> cap_map;
  decode(cap_map, blp);
  if (auth_cap) {
    mempool::mds_co::compact_map<int32_t,int32_t> mds_wanted;
    decode(mds_wanted, blp);
    mds_wanted.erase(mds->get_nodeid());
    in->set_mds_caps_wanted(mds_wanted);
  }
  if (!cap_map.empty() ||
      (auth_cap && (in->get_caps_wanted() & ~CEPH_CAP_PIN))) {
    peer_exports[in].swap(cap_map);
    in->get(CInode::PIN_IMPORTINGCAPS);
  }
}

void Migrator::finish_import_inode_caps(CInode *in, mds_rank_t peer, bool auth_cap,
                                        const map<client_t,pair<Session*,uint64_t> >& session_map,
                                        const map<client_t,Capability::Export> &export_map,
                                        map<client_t,Capability::Import> &import_map)
{
  for (auto& it : export_map) {
    dout(10) << "finish_import_inode_caps for client." << it.first << " on " << *in << dendl;

    auto p = session_map.find(it.first);
    if (p == session_map.end()) {
      dout(10) << " no session for client."
<< it.first << dendl; + (void)import_map[it.first]; + continue; + } + + Session *session = p->second.first; + + Capability *cap = in->get_client_cap(it.first); + if (!cap) { + cap = in->add_client_cap(it.first, session); + if (peer < 0) + cap->mark_importing(); + } + + // Always ask exporter mds to send cap export messages for auth caps. + // For non-auth caps, ask exporter mds to send cap export messages to + // clients who haven't opened sessions. The cap export messages will + // make clients open sessions. + if (auth_cap || !session->get_connection()) { + Capability::Import& im = import_map[it.first]; + im.cap_id = cap->get_cap_id(); + im.mseq = auth_cap ? it.second.mseq : cap->get_mseq(); + im.issue_seq = cap->get_last_seq() + 1; + } + + if (peer >= 0) { + cap->merge(it.second, auth_cap); + mds->mdcache->do_cap_import(session, in, cap, it.second.cap_id, + it.second.seq, it.second.mseq - 1, peer, + auth_cap ? CEPH_CAP_FLAG_AUTH : CEPH_CAP_FLAG_RELEASE); + } + } + + if (peer >= 0) { + in->replica_caps_wanted = 0; + in->put(CInode::PIN_IMPORTINGCAPS); + } +} + +int Migrator::decode_import_dir(bufferlist::const_iterator& blp, + mds_rank_t oldauth, + CDir *import_root, + EImportStart *le, + LogSegment *ls, + map<CInode*,map<client_t,Capability::Export> >& peer_exports, + list<ScatterLock*>& updated_scatterlocks) +{ + // set up dir + dirfrag_t df; + decode(df, blp); + + CInode *diri = cache->get_inode(df.ino); + ceph_assert(diri); + CDir *dir = diri->get_or_open_dirfrag(mds->mdcache, df.frag); + ceph_assert(dir); + + dout(7) << "decode_import_dir " << *dir << dendl; + + if (!dir->freeze_tree_state) { + ceph_assert(dir->get_version() == 0); + dir->freeze_tree_state = import_root->freeze_tree_state; + } + + // assimilate state + dir->decode_import(blp, ls); + + // adjust replica list + //assert(!dir->is_replica(oldauth)); // not true on failed export + dir->add_replica(oldauth, CDir::EXPORT_NONCE); + if (dir->is_replica(mds->get_nodeid())) + 
dir->remove_replica(mds->get_nodeid()); + + // add to journal entry + if (le) + le->metablob.add_import_dir(dir); + + int num_imported = 0; + + // take all waiters on this dir + // NOTE: a pass of imported data is guaranteed to get all of my waiters because + // a replica's presense in my cache implies/forces it's presense in authority's. + MDSContext::vec waiters; + dir->take_waiting(CDir::WAIT_ANY_MASK, waiters); + for (auto c : waiters) + dir->add_waiter(CDir::WAIT_UNFREEZE, c); // UNFREEZE will get kicked both on success or failure + + dout(15) << "doing contents" << dendl; + + // contents + __u32 nden; + decode(nden, blp); + + for (; nden>0; nden--) { + num_imported++; + + // dentry + string dname; + snapid_t last; + decode(dname, blp); + decode(last, blp); + + CDentry *dn = dir->lookup_exact_snap(dname, last); + if (!dn) + dn = dir->add_null_dentry(dname, 1, last); + + dn->decode_import(blp, ls); + + dn->add_replica(oldauth, CDentry::EXPORT_NONCE); + if (dn->is_replica(mds->get_nodeid())) + dn->remove_replica(mds->get_nodeid()); + + // dentry lock in unreadable state can block path traverse + if (dn->lock.get_state() != LOCK_SYNC) + mds->locker->try_eval(&dn->lock, NULL); + + dout(15) << "decode_import_dir got " << *dn << dendl; + + // points to... 
+ char icode; + decode(icode, blp); + + if (icode == 'N') { + // null dentry + ceph_assert(dn->get_linkage()->is_null()); + + // fall thru + } + else if (icode == 'L') { + // remote link + inodeno_t ino; + unsigned char d_type; + decode(ino, blp); + decode(d_type, blp); + if (dn->get_linkage()->is_remote()) { + ceph_assert(dn->get_linkage()->get_remote_ino() == ino); + } else { + dir->link_remote_inode(dn, ino, d_type); + } + } + else if (icode == 'I') { + // inode + ceph_assert(le); + decode_import_inode(dn, blp, oldauth, ls, + peer_exports, updated_scatterlocks); + } + + // add dentry to journal entry + if (le) + le->metablob.add_import_dentry(dn); + } + +#ifdef MDS_VERIFY_FRAGSTAT + if (dir->is_complete()) + dir->verify_fragstat(); +#endif + + dir->inode->maybe_export_pin(); + + dout(7) << "decode_import_dir done " << *dir << dendl; + return num_imported; +} + + + + + +// authority bystander + +void Migrator::handle_export_notify(const MExportDirNotify::const_ref &m) +{ + if (!(mds->is_clientreplay() || mds->is_active() || mds->is_stopping())) { + return; + } + + CDir *dir = cache->get_dirfrag(m->get_dirfrag()); + + mds_rank_t from = mds_rank_t(m->get_source().num()); + mds_authority_t old_auth = m->get_old_auth(); + mds_authority_t new_auth = m->get_new_auth(); + + if (!dir) { + dout(7) << "handle_export_notify " << old_auth << " -> " << new_auth + << " on missing dir " << m->get_dirfrag() << dendl; + } else if (dir->authority() != old_auth) { + dout(7) << "handle_export_notify old_auth was " << dir->authority() + << " != " << old_auth << " -> " << new_auth + << " on " << *dir << dendl; + } else { + dout(7) << "handle_export_notify " << old_auth << " -> " << new_auth + << " on " << *dir << dendl; + // adjust auth + set<CDir*> have; + cache->map_dirfrag_set(m->get_bounds(), have); + cache->adjust_bounded_subtree_auth(dir, have, new_auth); + + // induce a merge? 
+ cache->try_subtree_merge(dir); + } + + // send ack + if (m->wants_ack()) { + mds->send_message_mds(MExportDirNotifyAck::create(m->get_dirfrag(), m->get_tid(), m->get_new_auth()), from); + } else { + // aborted. no ack. + dout(7) << "handle_export_notify no ack requested" << dendl; + } +} + +/** cap exports **/ +void Migrator::export_caps(CInode *in) +{ + mds_rank_t dest = in->authority().first; + dout(7) << "export_caps to mds." << dest << " " << *in << dendl; + + ceph_assert(in->is_any_caps()); + ceph_assert(!in->is_auth()); + ceph_assert(!in->is_ambiguous_auth()); + ceph_assert(!in->state_test(CInode::STATE_EXPORTINGCAPS)); + + auto ex = MExportCaps::create(); + ex->ino = in->ino(); + + encode_export_inode_caps(in, false, ex->cap_bl, ex->client_map, ex->client_metadata_map); + + mds->send_message_mds(ex, dest); +} + +void Migrator::handle_export_caps_ack(const MExportCapsAck::const_ref &ack) +{ + mds_rank_t from = ack->get_source().num(); + CInode *in = cache->get_inode(ack->ino); + if (in) { + ceph_assert(!in->is_auth()); + + dout(10) << "handle_export_caps_ack " << *ack << " from " + << ack->get_source() << " on " << *in << dendl; + + map<client_t,Capability::Import> imported_caps; + map<client_t,uint64_t> caps_ids; + auto blp = ack->cap_bl.cbegin(); + decode(imported_caps, blp); + decode(caps_ids, blp); + + for (auto& it : imported_caps) { + Capability *cap = in->get_client_cap(it.first); + if (!cap || cap->get_cap_id() != caps_ids.at(it.first)) + continue; + + dout(7) << __func__ << " telling client." 
<< it.first + << " exported caps on " << *in << dendl; + auto m = MClientCaps::create(CEPH_CAP_OP_EXPORT, in->ino(), 0, + cap->get_cap_id(), cap->get_mseq(), + mds->get_osd_epoch_barrier()); + m->set_cap_peer(it.second.cap_id, it.second.issue_seq, it.second.mseq, from, 0); + mds->send_message_client_counted(m, it.first); + + in->remove_client_cap(it.first); + } + + mds->locker->request_inode_file_caps(in); + mds->locker->try_eval(in, CEPH_CAP_LOCKS); + } +} + +void Migrator::handle_gather_caps(const MGatherCaps::const_ref &m) +{ + CInode *in = cache->get_inode(m->ino); + if (!in) + return; + + dout(10) << "handle_gather_caps " << *m << " from " << m->get_source() + << " on " << *in << dendl; + + if (in->is_any_caps() && + !in->is_auth() && + !in->is_ambiguous_auth() && + !in->state_test(CInode::STATE_EXPORTINGCAPS)) + export_caps(in); +} + +class C_M_LoggedImportCaps : public MigratorLogContext { + CInode *in; + mds_rank_t from; +public: + map<client_t,pair<Session*,uint64_t> > imported_session_map; + map<CInode*, map<client_t,Capability::Export> > peer_exports; + + C_M_LoggedImportCaps(Migrator *m, CInode *i, mds_rank_t f) : MigratorLogContext(m), in(i), from(f) {} + void finish(int r) override { + mig->logged_import_caps(in, from, imported_session_map, peer_exports); + } +}; + +void Migrator::handle_export_caps(const MExportCaps::const_ref &ex) +{ + dout(10) << "handle_export_caps " << *ex << " from " << ex->get_source() << dendl; + CInode *in = cache->get_inode(ex->ino); + + ceph_assert(in); + ceph_assert(in->is_auth()); + + // FIXME + if (!in->can_auth_pin()) { + return; + } + + in->auth_pin(this); + + map<client_t,entity_inst_t> client_map{ex->client_map}; + map<client_t,client_metadata_t> client_metadata_map{ex->client_metadata_map}; + + C_M_LoggedImportCaps *finish = new C_M_LoggedImportCaps( + this, in, mds_rank_t(ex->get_source().num())); + + version_t pv = mds->server->prepare_force_open_sessions(client_map, client_metadata_map, + 
finish->imported_session_map); + // decode new caps + auto blp = ex->cap_bl.cbegin(); + decode_import_inode_caps(in, false, blp, finish->peer_exports); + ceph_assert(!finish->peer_exports.empty()); // thus, inode is pinned. + + // journal open client sessions + ESessions *le = new ESessions(pv, std::move(client_map), + std::move(client_metadata_map)); + mds->mdlog->start_submit_entry(le, finish); + mds->mdlog->flush(); +} + + +void Migrator::logged_import_caps(CInode *in, + mds_rank_t from, + map<client_t,pair<Session*,uint64_t> >& imported_session_map, + map<CInode*, map<client_t,Capability::Export> >& peer_exports) +{ + dout(10) << "logged_import_caps on " << *in << dendl; + // see export_go() vs export_go_synced() + ceph_assert(in->is_auth()); + + // force open client sessions and finish cap import + mds->server->finish_force_open_sessions(imported_session_map); + + auto it = peer_exports.find(in); + ceph_assert(it != peer_exports.end()); + + // clients will release caps from the exporter when they receive the cap import message. 
+ map<client_t,Capability::Import> imported_caps; + finish_import_inode_caps(in, from, false, imported_session_map, it->second, imported_caps); + mds->locker->eval(in, CEPH_CAP_LOCKS, true); + + if (!imported_caps.empty()) { + auto ack = MExportCapsAck::create(in->ino()); + map<client_t,uint64_t> peer_caps_ids; + for (auto &p : imported_caps ) + peer_caps_ids[p.first] = it->second.at(p.first).cap_id; + + encode(imported_caps, ack->cap_bl); + encode(peer_caps_ids, ack->cap_bl); + mds->send_message_mds(ack, from); + } + + in->auth_unpin(this); +} + +Migrator::Migrator(MDSRank *m, MDCache *c) : mds(m), cache(c) { + max_export_size = g_conf().get_val<Option::size_t>("mds_max_export_size"); + inject_session_race = g_conf().get_val<bool>("mds_inject_migrator_session_race"); +} + +void Migrator::handle_conf_change(const std::set<std::string>& changed, const MDSMap& mds_map) +{ + if (changed.count("mds_max_export_size")) + max_export_size = g_conf().get_val<Option::size_t>("mds_max_export_size"); + if (changed.count("mds_inject_migrator_session_race")) { + inject_session_race = g_conf().get_val<bool>("mds_inject_migrator_session_race"); + dout(0) << "mds_inject_migrator_session_race is " << inject_session_race << dendl; + } +} diff --git a/src/mds/Migrator.h b/src/mds/Migrator.h new file mode 100644 index 00000000..de35b427 --- /dev/null +++ b/src/mds/Migrator.h @@ -0,0 +1,376 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + * Handles the import and export of mds authorities and actual cache data. + * See src/doc/exports.txt for a description. 
+ */ + +#ifndef CEPH_MDS_MIGRATOR_H +#define CEPH_MDS_MIGRATOR_H + +#include "include/types.h" + +#include "MDSContext.h" + +#include <map> +#include <list> +#include <set> +#include <string_view> + +class MDSRank; +class CDir; +class CInode; +class CDentry; +class Session; + +#include "messages/MExportCaps.h" +#include "messages/MExportCapsAck.h" +#include "messages/MExportDir.h" +#include "messages/MExportDirAck.h" +#include "messages/MExportDirCancel.h" +#include "messages/MExportDirDiscover.h" +#include "messages/MExportDirDiscoverAck.h" +#include "messages/MExportDirFinish.h" +#include "messages/MExportDirNotify.h" +#include "messages/MExportDirNotifyAck.h" +#include "messages/MExportDirPrep.h" +#include "messages/MExportDirPrepAck.h" +#include "messages/MGatherCaps.h" + +class EImportStart; + +class Migrator { +public: + // export stages. used to clean up intelligently if there's a failure. + const static int EXPORT_CANCELLED = 0; // cancelled + const static int EXPORT_CANCELLING = 1; // waiting for cancel notifyacks + const static int EXPORT_LOCKING = 2; // acquiring locks + const static int EXPORT_DISCOVERING = 3; // dest is disovering export dir + const static int EXPORT_FREEZING = 4; // we're freezing the dir tree + const static int EXPORT_PREPPING = 5; // sending dest spanning tree to export bounds + const static int EXPORT_WARNING = 6; // warning bystanders of dir_auth_pending + const static int EXPORT_EXPORTING = 7; // sent actual export, waiting for ack + const static int EXPORT_LOGGINGFINISH = 8; // logging EExportFinish + const static int EXPORT_NOTIFYING = 9; // waiting for notifyacks + static std::string_view get_export_statename(int s) { + switch (s) { + case EXPORT_CANCELLING: return "cancelling"; + case EXPORT_LOCKING: return "locking"; + case EXPORT_DISCOVERING: return "discovering"; + case EXPORT_FREEZING: return "freezing"; + case EXPORT_PREPPING: return "prepping"; + case EXPORT_WARNING: return "warning"; + case EXPORT_EXPORTING: return 
"exporting"; + case EXPORT_LOGGINGFINISH: return "loggingfinish"; + case EXPORT_NOTIFYING: return "notifying"; + default: ceph_abort(); return std::string_view(); + } + } + + // -- imports -- + const static int IMPORT_DISCOVERING = 1; // waiting for prep + const static int IMPORT_DISCOVERED = 2; // waiting for prep + const static int IMPORT_PREPPING = 3; // opening dirs on bounds + const static int IMPORT_PREPPED = 4; // opened bounds, waiting for import + const static int IMPORT_LOGGINGSTART = 5; // got import, logging EImportStart + const static int IMPORT_ACKING = 6; // logged EImportStart, sent ack, waiting for finish + const static int IMPORT_FINISHING = 7; // sent cap imports, waiting for finish + const static int IMPORT_ABORTING = 8; // notifying bystanders of an abort before unfreezing + static std::string_view get_import_statename(int s) { + switch (s) { + case IMPORT_DISCOVERING: return "discovering"; + case IMPORT_DISCOVERED: return "discovered"; + case IMPORT_PREPPING: return "prepping"; + case IMPORT_PREPPED: return "prepped"; + case IMPORT_LOGGINGSTART: return "loggingstart"; + case IMPORT_ACKING: return "acking"; + case IMPORT_FINISHING: return "finishing"; + case IMPORT_ABORTING: return "aborting"; + default: ceph_abort(); return std::string_view(); + } + } + + // -- cons -- + Migrator(MDSRank *m, MDCache *c); + + void handle_conf_change(const std::set<std::string>& changed, const MDSMap& mds_map); + +protected: + struct export_base_t { + dirfrag_t dirfrag; + mds_rank_t dest; + unsigned pending_children; + uint64_t export_queue_gen; + bool restart = false; + export_base_t(dirfrag_t df, mds_rank_t d, unsigned c, uint64_t g) : + dirfrag(df), dest(d), pending_children(c), export_queue_gen(g) {} + }; + + // export fun + struct export_state_t { + int state = 0; + mds_rank_t peer = MDS_RANK_NONE; + uint64_t tid = 0; + std::set<mds_rank_t> warning_ack_waiting; + std::set<mds_rank_t> notify_ack_waiting; + 
std::map<inodeno_t,std::map<client_t,Capability::Import> > peer_imported; + MutationRef mut; + size_t approx_size = 0; + // for freeze tree deadlock detection + utime_t last_cum_auth_pins_change; + int last_cum_auth_pins = 0; + int num_remote_waiters = 0; // number of remote authpin waiters + export_state_t() {} + + std::shared_ptr<export_base_t> parent; + }; + std::map<CDir*, export_state_t> export_state; + typedef map<CDir*, export_state_t>::iterator export_state_iterator; + + uint64_t total_exporting_size = 0; + unsigned num_locking_exports = 0; // exports in locking state (approx_size == 0) + + std::list<pair<dirfrag_t,mds_rank_t> > export_queue; + uint64_t export_queue_gen = 1; + + // import fun + struct import_state_t { + int state; + mds_rank_t peer; + uint64_t tid; + std::set<mds_rank_t> bystanders; + std::list<dirfrag_t> bound_ls; + std::list<ScatterLock*> updated_scatterlocks; + std::map<client_t,pair<Session*,uint64_t> > session_map; + std::map<CInode*, std::map<client_t,Capability::Export> > peer_exports; + MutationRef mut; + import_state_t() : state(0), peer(0), tid(0), mut() {} + }; + + std::map<dirfrag_t, import_state_t> import_state; + + void handle_export_discover_ack(const MExportDirDiscoverAck::const_ref &m); + void export_frozen(CDir *dir, uint64_t tid); + void handle_export_prep_ack(const MExportDirPrepAck::const_ref &m); + void export_sessions_flushed(CDir *dir, uint64_t tid); + void export_go(CDir *dir); + void export_go_synced(CDir *dir, uint64_t tid); + void export_try_cancel(CDir *dir, bool notify_peer=true); + void export_cancel_finish(export_state_iterator& it); + void export_reverse(CDir *dir, export_state_t& stat); + void export_notify_abort(CDir *dir, export_state_t& stat, std::set<CDir*>& bounds); + void handle_export_ack(const MExportDirAck::const_ref &m); + void export_logged_finish(CDir *dir); + void handle_export_notify_ack(const MExportDirNotifyAck::const_ref &m); + void export_finish(CDir *dir); + + void 
handle_gather_caps(const MGatherCaps::const_ref &m); + + friend class C_MDC_ExportFreeze; + friend class C_MDS_ExportFinishLogged; + friend class C_M_ExportGo; + friend class C_M_ExportSessionsFlushed; + friend class C_MDS_ExportDiscover; + friend class C_MDS_ExportPrep; + friend class MigratorContext; + friend class MigratorLogContext; + + // importer + void handle_export_discover(const MExportDirDiscover::const_ref &m, bool started=false); + void handle_export_cancel(const MExportDirCancel::const_ref &m); + void handle_export_prep(const MExportDirPrep::const_ref &m, bool did_assim=false); + void handle_export_dir(const MExportDir::const_ref &m); + + void import_reverse_discovering(dirfrag_t df); + void import_reverse_discovered(dirfrag_t df, CInode *diri); + void import_reverse_prepping(CDir *dir, import_state_t& stat); + void import_remove_pins(CDir *dir, std::set<CDir*>& bounds); + void import_reverse_unfreeze(CDir *dir); + void import_reverse_final(CDir *dir); + void import_notify_abort(CDir *dir, std::set<CDir*>& bounds); + void import_notify_finish(CDir *dir, std::set<CDir*>& bounds); + void import_logged_start(dirfrag_t df, CDir *dir, mds_rank_t from, + std::map<client_t,pair<Session*,uint64_t> >& imported_session_map); + void handle_export_finish(const MExportDirFinish::const_ref &m); + + void handle_export_caps(const MExportCaps::const_ref &m); + void handle_export_caps_ack(const MExportCapsAck::const_ref &m); + void logged_import_caps(CInode *in, + mds_rank_t from, + std::map<client_t,pair<Session*,uint64_t> >& imported_session_map, + std::map<CInode*, std::map<client_t,Capability::Export> >& cap_imports); + + + friend class C_MDS_ImportDirLoggedStart; + friend class C_MDS_ImportDirLoggedFinish; + friend class C_M_LoggedImportCaps; + + // bystander + void handle_export_notify(const MExportDirNotify::const_ref &m); + + +public: + + void dispatch(const Message::const_ref &); + + void show_importing(); + void show_exporting(); + + int get_num_exporting() 
const { return export_state.size(); } + int get_export_queue_size() const { return export_queue.size(); } + + // -- status -- + int is_exporting(CDir *dir) const { + auto it = export_state.find(dir); + if (it != export_state.end()) return it->second.state; + return 0; + } + bool is_exporting() const { return !export_state.empty(); } + int is_importing(dirfrag_t df) const { + auto it = import_state.find(df); + if (it != import_state.end()) return it->second.state; + return 0; + } + bool is_importing() const { return !import_state.empty(); } + + bool is_ambiguous_import(dirfrag_t df) const { + auto it = import_state.find(df); + if (it == import_state.end()) + return false; + if (it->second.state >= IMPORT_LOGGINGSTART && + it->second.state < IMPORT_ABORTING) + return true; + return false; + } + + int get_import_state(dirfrag_t df) const { + auto it = import_state.find(df); + ceph_assert(it != import_state.end()); + return it->second.state; + } + int get_import_peer(dirfrag_t df) const { + auto it = import_state.find(df); + ceph_assert(it != import_state.end()); + return it->second.peer; + } + + int get_export_state(CDir *dir) const { + auto it = export_state.find(dir); + ceph_assert(it != export_state.end()); + return it->second.state; + } + // this returns true if we are export @dir, + // and are not waiting for @who to be + // be warned of ambiguous auth. + // only returns meaningful results during EXPORT_WARNING state. 
+ bool export_has_warned(CDir *dir, mds_rank_t who) { + auto it = export_state.find(dir); + ceph_assert(it != export_state.end()); + ceph_assert(it->second.state == EXPORT_WARNING); + return (it->second.warning_ack_waiting.count(who) == 0); + } + + bool export_has_notified(CDir *dir, mds_rank_t who) const { + auto it = export_state.find(dir); + ceph_assert(it != export_state.end()); + ceph_assert(it->second.state == EXPORT_NOTIFYING); + return (it->second.notify_ack_waiting.count(who) == 0); + } + + void export_freeze_inc_num_waiters(CDir *dir) { + auto it = export_state.find(dir); + ceph_assert(it != export_state.end()); + it->second.num_remote_waiters++; + } + void find_stale_export_freeze(); + + // -- misc -- + void handle_mds_failure_or_stop(mds_rank_t who); + + void audit(); + + // -- import/export -- + // exporter + void dispatch_export_dir(MDRequestRef& mdr, int count); + void export_dir(CDir *dir, mds_rank_t dest); + void export_empty_import(CDir *dir); + + void export_dir_nicely(CDir *dir, mds_rank_t dest); + void maybe_do_queued_export(); + void clear_export_queue() { + export_queue.clear(); + export_queue_gen++; + } + + void maybe_split_export(CDir* dir, uint64_t max_size, bool null_okay, + vector<pair<CDir*, size_t> >& results); + void child_export_finish(std::shared_ptr<export_base_t>& parent, bool success); + + void get_export_lock_set(CDir *dir, MutationImpl::LockOpVec& lov); + void get_export_client_set(CDir *dir, std::set<client_t> &client_set); + void get_export_client_set(CInode *in, std::set<client_t> &client_set); + + void encode_export_inode(CInode *in, bufferlist& bl, + std::map<client_t,entity_inst_t>& exported_client_map, + std::map<client_t,client_metadata_t>& exported_client_metadata_map); + void encode_export_inode_caps(CInode *in, bool auth_cap, bufferlist& bl, + std::map<client_t,entity_inst_t>& exported_client_map, + std::map<client_t,client_metadata_t>& exported_client_metadata_map); + void finish_export_inode(CInode *in, mds_rank_t 
target, + std::map<client_t,Capability::Import>& peer_imported, + MDSContext::vec& finished); + void finish_export_inode_caps(CInode *in, mds_rank_t target, + std::map<client_t,Capability::Import>& peer_imported); + + + uint64_t encode_export_dir(bufferlist& exportbl, + CDir *dir, + std::map<client_t,entity_inst_t>& exported_client_map, + std::map<client_t,client_metadata_t>& exported_client_metadata_map); + void finish_export_dir(CDir *dir, mds_rank_t target, + std::map<inodeno_t,std::map<client_t,Capability::Import> >& peer_imported, + MDSContext::vec& finished, int *num_dentries); + + void clear_export_proxy_pins(CDir *dir); + + void export_caps(CInode *in); + + void decode_import_inode(CDentry *dn, bufferlist::const_iterator& blp, + mds_rank_t oldauth, LogSegment *ls, + std::map<CInode*, std::map<client_t,Capability::Export> >& cap_imports, + std::list<ScatterLock*>& updated_scatterlocks); + void decode_import_inode_caps(CInode *in, bool auth_cap, bufferlist::const_iterator &blp, + std::map<CInode*, std::map<client_t,Capability::Export> >& cap_imports); + void finish_import_inode_caps(CInode *in, mds_rank_t from, bool auth_cap, + const std::map<client_t,pair<Session*,uint64_t> >& smap, + const std::map<client_t,Capability::Export> &export_map, + std::map<client_t,Capability::Import> &import_map); + int decode_import_dir(bufferlist::const_iterator& blp, + mds_rank_t oldauth, + CDir *import_root, + EImportStart *le, + LogSegment *ls, + std::map<CInode*, std::map<client_t,Capability::Export> >& cap_imports, + std::list<ScatterLock*>& updated_scatterlocks); + + void import_reverse(CDir *dir); + + void import_finish(CDir *dir, bool notify, bool last=true); + +private: + MDSRank *mds; + MDCache *cache; + uint64_t max_export_size = 0; + bool inject_session_race = false; +}; + +#endif diff --git a/src/mds/Mutation.cc b/src/mds/Mutation.cc new file mode 100644 index 00000000..ee1978e5 --- /dev/null +++ b/src/mds/Mutation.cc @@ -0,0 +1,473 @@ +// -*- mode:C++; 
tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "Mutation.h" +#include "ScatterLock.h" +#include "CDir.h" + +// MutationImpl + +void MutationImpl::pin(MDSCacheObject *o) +{ + if (pins.count(o) == 0) { + o->get(MDSCacheObject::PIN_REQUEST); + pins.insert(o); + } +} + +void MutationImpl::unpin(MDSCacheObject *o) +{ + ceph_assert(pins.count(o)); + o->put(MDSCacheObject::PIN_REQUEST); + pins.erase(o); +} + +void MutationImpl::set_stickydirs(CInode *in) +{ + if (!stickydiri || stickydiri != in) { + in->get_stickydirs(); + if (stickydiri) + stickydiri->put_stickydirs(); + stickydiri = in; + } +} + +void MutationImpl::put_stickydirs() +{ + if (stickydiri) { + stickydiri->put_stickydirs(); + stickydiri = nullptr; + + } +} + +void MutationImpl::drop_pins() +{ + for (auto& o : pins) + o->put(MDSCacheObject::PIN_REQUEST); + pins.clear(); +} + +void MutationImpl::start_locking(SimpleLock *lock, int target) +{ + ceph_assert(locking == NULL); + pin(lock->get_parent()); + locking = lock; + locking_target_mds = target; +} + +void MutationImpl::finish_locking(SimpleLock *lock) +{ + ceph_assert(locking == lock); + locking = NULL; + locking_target_mds = -1; +} + +void MutationImpl::LockOpVec::erase_rdlock(SimpleLock* lock) +{ + for (int i = size() - 1; i >= 0; --i) { + auto& op = (*this)[i]; + if (op.lock == lock && op.is_rdlock()) { + erase(begin() + i); + return; + } + } +} + +void MutationImpl::LockOpVec::sort_and_merge() +{ + std::sort(begin(), end(), SimpleLock::ptr_lt()); + // merge ops on the same lock + for (auto i = end() - 1; i > begin(); ) { + auto j = i; + while (--j >= begin()) { + if (i->lock 
!= j->lock) + break; + } + if (i - j == 1) { + i = j; + continue; + } + + // merge + ++j; + for (auto k = i; k > j; --k) { + if (k->is_remote_wrlock()) { + ceph_assert(!j->is_remote_wrlock()); + j->wrlock_target = k->wrlock_target; + } + j->flags |= k->flags; + } + if (j->is_xlock()) { + // xlock overwrites other types + ceph_assert(!j->is_remote_wrlock()); + j->flags = MutationImpl::LockOp::XLOCK; + } + erase(j + 1, i + 1); + i = j - 1; + } +} + +// auth pins +bool MutationImpl::is_auth_pinned(MDSCacheObject *object) const +{ + return auth_pins.count(object) || remote_auth_pins.count(object); +} + +void MutationImpl::auth_pin(MDSCacheObject *object) +{ + if (!is_auth_pinned(object)) { + object->auth_pin(this); + auth_pins.insert(object); + } +} + +void MutationImpl::auth_unpin(MDSCacheObject *object) +{ + ceph_assert(auth_pins.count(object)); + object->auth_unpin(this); + auth_pins.erase(object); +} + +void MutationImpl::drop_local_auth_pins() +{ + for (const auto& p : auth_pins) { + ceph_assert(p->is_auth()); + p->auth_unpin(this); + } + auth_pins.clear(); +} + +void MutationImpl::add_projected_inode(CInode *in) +{ + projected_inodes.push_back(in); +} + +void MutationImpl::pop_and_dirty_projected_inodes() +{ + while (!projected_inodes.empty()) { + CInode *in = projected_inodes.front(); + projected_inodes.pop_front(); + in->pop_and_dirty_projected_inode(ls); + } +} + +void MutationImpl::add_projected_fnode(CDir *dir) +{ + projected_fnodes.push_back(dir); +} + +void MutationImpl::pop_and_dirty_projected_fnodes() +{ + while (!projected_fnodes.empty()) { + CDir *dir = projected_fnodes.front(); + projected_fnodes.pop_front(); + dir->pop_and_dirty_projected_fnode(ls); + } +} + +void MutationImpl::add_updated_lock(ScatterLock *lock) +{ + updated_locks.push_back(lock); +} + +void MutationImpl::add_cow_inode(CInode *in) +{ + pin(in); + dirty_cow_inodes.push_back(in); +} + +void MutationImpl::add_cow_dentry(CDentry *dn) +{ + pin(dn); + 
dirty_cow_dentries.push_back(pair<CDentry*,version_t>(dn, dn->get_projected_version())); +} + +void MutationImpl::apply() +{ + pop_and_dirty_projected_inodes(); + pop_and_dirty_projected_fnodes(); + + for (list<CInode*>::iterator p = dirty_cow_inodes.begin(); + p != dirty_cow_inodes.end(); + ++p) + (*p)->_mark_dirty(ls); + for (list<pair<CDentry*,version_t> >::iterator p = dirty_cow_dentries.begin(); + p != dirty_cow_dentries.end(); + ++p) + p->first->mark_dirty(p->second, ls); + + for (list<ScatterLock*>::iterator p = updated_locks.begin(); + p != updated_locks.end(); + ++p) + (*p)->mark_dirty(); +} + +void MutationImpl::cleanup() +{ + drop_local_auth_pins(); + drop_pins(); +} + +void MutationImpl::_dump_op_descriptor_unlocked(ostream& stream) const +{ + stream << "Mutation"; +} + +// MDRequestImpl + +MDRequestImpl::~MDRequestImpl() +{ + delete _more; +} + +MDRequestImpl::More* MDRequestImpl::more() +{ + if (!_more) + _more = new More(); + return _more; +} + +bool MDRequestImpl::has_more() const +{ + return _more != nullptr; +} + +bool MDRequestImpl::has_witnesses() +{ + return (_more != nullptr) && (!_more->witnessed.empty()); +} + +bool MDRequestImpl::slave_did_prepare() +{ + return has_more() && more()->slave_commit; +} + +bool MDRequestImpl::slave_rolling_back() +{ + return has_more() && more()->slave_rolling_back; +} + +bool MDRequestImpl::did_ino_allocation() const +{ + return alloc_ino || used_prealloc_ino || prealloc_inos.size(); +} + +bool MDRequestImpl::freeze_auth_pin(CInode *inode) +{ + ceph_assert(!more()->rename_inode || more()->rename_inode == inode); + more()->rename_inode = inode; + more()->is_freeze_authpin = true; + auth_pin(inode); + if (!inode->freeze_inode(1)) { + return false; + } + inode->freeze_auth_pin(); + inode->unfreeze_inode(); + return true; +} + +void MDRequestImpl::unfreeze_auth_pin(bool clear_inode) +{ + ceph_assert(more()->is_freeze_authpin); + CInode *inode = more()->rename_inode; + if (inode->is_frozen_auth_pin()) + 
inode->unfreeze_auth_pin(); + else + inode->unfreeze_inode(); + more()->is_freeze_authpin = false; + if (clear_inode) + more()->rename_inode = NULL; +} + +void MDRequestImpl::set_remote_frozen_auth_pin(CInode *inode) +{ + more()->rename_inode = inode; + more()->is_remote_frozen_authpin = true; +} + +void MDRequestImpl::set_ambiguous_auth(CInode *inode) +{ + ceph_assert(!more()->rename_inode || more()->rename_inode == inode); + ceph_assert(!more()->is_ambiguous_auth); + + inode->set_ambiguous_auth(); + more()->rename_inode = inode; + more()->is_ambiguous_auth = true; +} + +void MDRequestImpl::clear_ambiguous_auth() +{ + CInode *inode = more()->rename_inode; + ceph_assert(inode && more()->is_ambiguous_auth); + inode->clear_ambiguous_auth(); + more()->is_ambiguous_auth = false; +} + +bool MDRequestImpl::can_auth_pin(MDSCacheObject *object) +{ + return object->can_auth_pin() || + (is_auth_pinned(object) && has_more() && + more()->is_freeze_authpin && + more()->rename_inode == object); +} + +void MDRequestImpl::drop_local_auth_pins() +{ + if (has_more() && more()->is_freeze_authpin) + unfreeze_auth_pin(true); + MutationImpl::drop_local_auth_pins(); +} + +const filepath& MDRequestImpl::get_filepath() +{ + if (client_request) + return client_request->get_filepath(); + return more()->filepath1; +} + +const filepath& MDRequestImpl::get_filepath2() +{ + if (client_request) + return client_request->get_filepath2(); + return more()->filepath2; +} + +void MDRequestImpl::set_filepath(const filepath& fp) +{ + ceph_assert(!client_request); + more()->filepath1 = fp; +} + +void MDRequestImpl::set_filepath2(const filepath& fp) +{ + ceph_assert(!client_request); + more()->filepath2 = fp; +} + +bool MDRequestImpl::is_queued_for_replay() const +{ + return client_request ? 
client_request->is_queued_for_replay() : false; +} + +MClientRequest::const_ref MDRequestImpl::release_client_request() +{ + msg_lock.lock(); + MClientRequest::const_ref req; + req.swap(client_request); + client_request = req; + msg_lock.unlock(); + return req; +} + +void MDRequestImpl::reset_slave_request(const MMDSSlaveRequest::const_ref& req) +{ + msg_lock.lock(); + MMDSSlaveRequest::const_ref old; + old.swap(slave_request); + slave_request = req; + msg_lock.unlock(); + old.reset(); +} + +void MDRequestImpl::print(ostream &out) const +{ + out << "request(" << reqid; + //if (request) out << " " << *request; + if (is_slave()) out << " slave_to mds." << slave_to_mds; + if (client_request) out << " cr=" << client_request; + if (slave_request) out << " sr=" << slave_request; + out << ")"; +} + +void MDRequestImpl::dump(Formatter *f) const +{ + _dump(f); +} + +void MDRequestImpl::_dump(Formatter *f) const +{ + f->dump_string("flag_point", state_string()); + f->dump_stream("reqid") << reqid; + { + msg_lock.lock(); + auto _client_request = client_request; + auto _slave_request =slave_request; + msg_lock.unlock(); + + if (_client_request) { + f->dump_string("op_type", "client_request"); + f->open_object_section("client_info"); + f->dump_stream("client") << _client_request->get_orig_source(); + f->dump_int("tid", _client_request->get_tid()); + f->close_section(); // client_info + } else if (is_slave() && _slave_request) { // replies go to an existing mdr + f->dump_string("op_type", "slave_request"); + f->open_object_section("master_info"); + f->dump_stream("master") << _slave_request->get_orig_source(); + f->close_section(); // master_info + + f->open_object_section("request_info"); + f->dump_int("attempt", _slave_request->get_attempt()); + f->dump_string("op_type", + MMDSSlaveRequest::get_opname(_slave_request->get_op())); + f->dump_int("lock_type", _slave_request->get_lock_type()); + f->dump_stream("object_info") << _slave_request->get_object_info(); + 
f->dump_stream("srcdnpath") << _slave_request->srcdnpath; + f->dump_stream("destdnpath") << _slave_request->destdnpath; + f->dump_stream("witnesses") << _slave_request->witnesses; + f->dump_bool("has_inode_export", + _slave_request->inode_export_v != 0); + f->dump_int("inode_export_v", _slave_request->inode_export_v); + f->dump_stream("op_stamp") << _slave_request->op_stamp; + f->close_section(); // request_info + } + else if (internal_op != -1) { // internal request + f->dump_string("op_type", "internal_op"); + f->dump_int("internal_op", internal_op); + f->dump_string("op_name", ceph_mds_op_name(internal_op)); + } + else { + f->dump_string("op_type", "no_available_op_found"); + } + } + { + f->open_array_section("events"); + std::lock_guard l(lock); + for (auto& i : events) { + f->dump_object("event", i); + } + f->close_section(); // events + } +} + +void MDRequestImpl::_dump_op_descriptor_unlocked(ostream& stream) const +{ + msg_lock.lock(); + auto _client_request = client_request; + auto _slave_request = slave_request; + msg_lock.unlock(); + + if (_client_request) { + _client_request->print(stream); + } else if (_slave_request) { + _slave_request->print(stream); + } else if (internal_op >= 0) { + stream << "internal op " << ceph_mds_op_name(internal_op) << ":" << reqid; + } else { + // drat, it's triggered by a slave request, but we don't have a message + // FIXME + stream << "rejoin:" << reqid; + } +} diff --git a/src/mds/Mutation.h b/src/mds/Mutation.h new file mode 100644 index 00000000..3177b1d4 --- /dev/null +++ b/src/mds/Mutation.h @@ -0,0 +1,432 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. 
See file COPYING. + * + */ + +#ifndef CEPH_MDS_MUTATION_H +#define CEPH_MDS_MUTATION_H + +#include "include/interval_set.h" +#include "include/elist.h" +#include "include/filepath.h" + +#include "MDSCacheObject.h" +#include "MDSContext.h" + +#include "SimpleLock.h" +#include "Capability.h" + +#include "common/TrackedOp.h" +#include "messages/MClientRequest.h" +#include "messages/MMDSSlaveRequest.h" + +class LogSegment; +class Capability; +class CInode; +class CDir; +class CDentry; +class Session; +class ScatterLock; +struct sr_t; + +struct MutationImpl : public TrackedOp { + metareqid_t reqid; + __u32 attempt = 0; // which attempt for this request + LogSegment *ls = nullptr; // the log segment i'm committing to + +private: + utime_t mds_stamp; ///< mds-local timestamp (real time) + utime_t op_stamp; ///< op timestamp (client provided) + +public: + // flag mutation as slave + mds_rank_t slave_to_mds = MDS_RANK_NONE; // this is a slave request if >= 0. + + // -- my pins and locks -- + // cache pins (so things don't expire) + set< MDSCacheObject* > pins; + CInode* stickydiri = nullptr; + + // auth pins + map<MDSCacheObject*, mds_rank_t> remote_auth_pins; + set<MDSCacheObject*> auth_pins; + + // held locks + struct LockOp { + enum { + RDLOCK = 1, + WRLOCK = 2, + XLOCK = 4, + REMOTE_WRLOCK = 8, + }; + SimpleLock* lock; + mutable unsigned flags; + mutable mds_rank_t wrlock_target; + operator SimpleLock*() const { + return lock; + } + LockOp(SimpleLock *l, unsigned f=0, mds_rank_t t=MDS_RANK_NONE) : + lock(l), flags(f), wrlock_target(t) {} + bool is_rdlock() const { return !!(flags & RDLOCK); } + bool is_xlock() const { return !!(flags & XLOCK); } + bool is_wrlock() const { return !!(flags & WRLOCK); } + void clear_wrlock() const { flags &= ~WRLOCK; } + bool is_remote_wrlock() const { return !!(flags & REMOTE_WRLOCK); } + void clear_remote_wrlock() const { + flags &= ~REMOTE_WRLOCK; + wrlock_target = MDS_RANK_NONE; + } + }; + + struct LockOpVec : public vector<LockOp> { + 
void add_rdlock(SimpleLock *lock) { + emplace_back(lock, LockOp::RDLOCK); + } + void erase_rdlock(SimpleLock *lock); + void add_xlock(SimpleLock *lock) { + emplace_back(lock, LockOp::XLOCK); + } + void add_wrlock(SimpleLock *lock) { + emplace_back(lock, LockOp::WRLOCK); + } + void add_remote_wrlock(SimpleLock *lock, mds_rank_t rank) { + ceph_assert(rank != MDS_RANK_NONE); + emplace_back(lock, LockOp::REMOTE_WRLOCK, rank); + } + void sort_and_merge(); + + LockOpVec() { + reserve(32); + } + }; + typedef set<LockOp, SimpleLock::ptr_lt> lock_set; + typedef lock_set::iterator lock_iterator; + lock_set locks; // full ordering + + bool is_rdlocked(SimpleLock *lock) const { + auto it = locks.find(lock); + return it != locks.end() && it->is_rdlock(); + } + bool is_xlocked(SimpleLock *lock) const { + auto it = locks.find(lock); + return it != locks.end() && it->is_xlock(); + } + bool is_wrlocked(SimpleLock *lock) const { + auto it = locks.find(lock); + return it != locks.end() && it->is_wrlock(); + } + bool is_remote_wrlocked(SimpleLock *lock) const { + auto it = locks.find(lock); + return it != locks.end() && it->is_remote_wrlock(); + } + + // lock we are currently trying to acquire. if we give up for some reason, + // be sure to eval() this. + SimpleLock *locking = nullptr; + mds_rank_t locking_target_mds = -1; + + // if this flag is set, do not attempt to acquire further locks. 
+ // (useful for wrlock, which may be a moving auth target) + bool done_locking = false; + bool committing = false; + bool aborted = false; + bool killed = false; + + // for applying projected inode changes + list<CInode*> projected_inodes; + list<CDir*> projected_fnodes; + list<ScatterLock*> updated_locks; + + list<CInode*> dirty_cow_inodes; + list<pair<CDentry*,version_t> > dirty_cow_dentries; + + // keep our default values synced with MDRequestParam's + MutationImpl() : TrackedOp(nullptr, utime_t()) {} + MutationImpl(OpTracker *tracker, utime_t initiated, + const metareqid_t &ri, __u32 att=0, mds_rank_t slave_to=MDS_RANK_NONE) + : TrackedOp(tracker, initiated), + reqid(ri), attempt(att), + slave_to_mds(slave_to) { } + ~MutationImpl() override { + ceph_assert(locking == NULL); + ceph_assert(pins.empty()); + ceph_assert(auth_pins.empty()); + } + + bool is_master() const { return slave_to_mds == MDS_RANK_NONE; } + bool is_slave() const { return slave_to_mds != MDS_RANK_NONE; } + + client_t get_client() const { + if (reqid.name.is_client()) + return client_t(reqid.name.num()); + return -1; + } + + void set_mds_stamp(utime_t t) { + mds_stamp = t; + } + utime_t get_mds_stamp() const { + return mds_stamp; + } + void set_op_stamp(utime_t t) { + op_stamp = t; + } + utime_t get_op_stamp() const { + if (op_stamp != utime_t()) + return op_stamp; + return get_mds_stamp(); + } + + // pin items in cache + void pin(MDSCacheObject *o); + void unpin(MDSCacheObject *o); + void set_stickydirs(CInode *in); + void put_stickydirs(); + void drop_pins(); + + void start_locking(SimpleLock *lock, int target=-1); + void finish_locking(SimpleLock *lock); + + // auth pins + bool is_auth_pinned(MDSCacheObject *object) const; + void auth_pin(MDSCacheObject *object); + void auth_unpin(MDSCacheObject *object); + void drop_local_auth_pins(); + void add_projected_inode(CInode *in); + void pop_and_dirty_projected_inodes(); + void add_projected_fnode(CDir *dir); + void 
pop_and_dirty_projected_fnodes(); + void add_updated_lock(ScatterLock *lock); + void add_cow_inode(CInode *in); + void add_cow_dentry(CDentry *dn); + void apply(); + void cleanup(); + + virtual void print(ostream &out) const { + out << "mutation(" << this << ")"; + } + + virtual void dump(Formatter *f) const {} + void _dump_op_descriptor_unlocked(ostream& stream) const override; +}; + +inline ostream& operator<<(ostream &out, const MutationImpl &mut) +{ + mut.print(out); + return out; +} + +typedef boost::intrusive_ptr<MutationImpl> MutationRef; + + + +/** + * MDRequestImpl: state we track for requests we are currently processing. + * mostly information about locks held, so that we can drop them all + * the request is finished or forwarded. see request_*(). + */ +struct MDRequestImpl : public MutationImpl { + Session *session; + elist<MDRequestImpl*>::item item_session_request; // if not on list, op is aborted. + + // -- i am a client (master) request + MClientRequest::const_ref client_request; // client request (if any) + + // store up to two sets of dn vectors, inode pointers, for request path1 and path2. 
+ vector<CDentry*> dn[2]; + CDentry *straydn; + CInode *in[2]; + snapid_t snapid; + + CInode *tracei; + CDentry *tracedn; + + inodeno_t alloc_ino, used_prealloc_ino; + interval_set<inodeno_t> prealloc_inos; + + int snap_caps = 0; + int getattr_caps = 0; ///< caps requested by getattr + bool no_early_reply = false; + bool did_early_reply = false; + bool o_trunc = false; ///< request is an O_TRUNC mutation + bool has_completed = false; ///< request has already completed + + bufferlist reply_extra_bl; + + // inos we did a embedded cap release on, and may need to eval if we haven't since reissued + map<vinodeno_t, ceph_seq_t> cap_releases; + + // -- i am a slave request + MMDSSlaveRequest::const_ref slave_request; // slave request (if one is pending; implies slave == true) + + // -- i am an internal op + int internal_op; + Context *internal_op_finish; + void *internal_op_private; + + // indicates how may retries of request have been made + int retry; + + // indicator for vxattr osdmap update + bool waited_for_osdmap; + + // break rarely-used fields into a separately allocated structure + // to save memory for most ops + struct More { + int slave_error = 0; + set<mds_rank_t> slaves; // mds nodes that have slave requests to me (implies client_request) + set<mds_rank_t> waiting_on_slave; // peers i'm waiting for slavereq replies from. 
+ + // for rename/link/unlink + set<mds_rank_t> witnessed; // nodes who have journaled a RenamePrepare + map<MDSCacheObject*,version_t> pvmap; + + bool has_journaled_slaves = false; + bool slave_update_journaled = false; + bool slave_rolling_back = false; + + // for rename + set<mds_rank_t> extra_witnesses; // replica list from srcdn auth (rename) + mds_rank_t srcdn_auth_mds = MDS_RANK_NONE; + bufferlist inode_import; + version_t inode_import_v = 0; + CInode* rename_inode = nullptr; + bool is_freeze_authpin = false; + bool is_ambiguous_auth = false; + bool is_remote_frozen_authpin = false; + bool is_inode_exporter = false; + + map<client_t, pair<Session*, uint64_t> > imported_session_map; + map<CInode*, map<client_t,Capability::Export> > cap_imports; + + // for lock/flock + bool flock_was_waiting = false; + + // for snaps + version_t stid = 0; + bufferlist snapidbl; + + sr_t *srci_srnode = nullptr; + sr_t *desti_srnode = nullptr; + + // called when slave commits or aborts + Context *slave_commit = nullptr; + bufferlist rollback_bl; + + MDSContext::vec waiting_for_finish; + + // export & fragment + CDir* export_dir = nullptr; + dirfrag_t fragment_base; + + // for internal ops doing lookup + filepath filepath1; + filepath filepath2; + + More() {} + } *_more; + + + // --------------------------------------------------- + struct Params { + metareqid_t reqid; + __u32 attempt; + MClientRequest::const_ref client_req; + Message::const_ref triggering_slave_req; + mds_rank_t slave_to; + utime_t initiated; + utime_t throttled, all_read, dispatched; + int internal_op; + // keep these default values synced to MutationImpl's + Params() : attempt(0), slave_to(MDS_RANK_NONE), internal_op(-1) {} + const utime_t& get_recv_stamp() const { + return initiated; + } + const utime_t& get_throttle_stamp() const { + return throttled; + } + const utime_t& get_recv_complete_stamp() const { + return all_read; + } + const utime_t& get_dispatch_stamp() const { + return dispatched; + } + }; + 
MDRequestImpl(const Params* params, OpTracker *tracker) : + MutationImpl(tracker, params->initiated, + params->reqid, params->attempt, params->slave_to), + session(NULL), item_session_request(this), + client_request(params->client_req), straydn(NULL), snapid(CEPH_NOSNAP), + tracei(NULL), tracedn(NULL), alloc_ino(0), used_prealloc_ino(0), + internal_op(params->internal_op), internal_op_finish(NULL), + internal_op_private(NULL), + retry(0), + waited_for_osdmap(false), _more(NULL) { + in[0] = in[1] = NULL; + } + ~MDRequestImpl() override; + + More* more(); + bool has_more() const; + bool has_witnesses(); + bool slave_did_prepare(); + bool slave_rolling_back(); + bool did_ino_allocation() const; + bool freeze_auth_pin(CInode *inode); + void unfreeze_auth_pin(bool clear_inode=false); + void set_remote_frozen_auth_pin(CInode *inode); + bool can_auth_pin(MDSCacheObject *object); + void drop_local_auth_pins(); + void set_ambiguous_auth(CInode *inode); + void clear_ambiguous_auth(); + const filepath& get_filepath(); + const filepath& get_filepath2(); + void set_filepath(const filepath& fp); + void set_filepath2(const filepath& fp); + bool is_queued_for_replay() const; + + void print(ostream &out) const override; + void dump(Formatter *f) const override; + + MClientRequest::const_ref release_client_request(); + void reset_slave_request(const MMDSSlaveRequest::const_ref& req=nullptr); + + // TrackedOp stuff + typedef boost::intrusive_ptr<MDRequestImpl> Ref; +protected: + void _dump(Formatter *f) const override; + void _dump_op_descriptor_unlocked(ostream& stream) const override; +private: + mutable ceph::spinlock msg_lock; +}; + +typedef boost::intrusive_ptr<MDRequestImpl> MDRequestRef; + + +struct MDSlaveUpdate { + int origop; + bufferlist rollback; + Context *waiter = nullptr; + set<CInode*> olddirs; + set<CInode*> unlinked; + MDSlaveUpdate(int oo, bufferlist &rbl) : + origop(oo) { + rollback.claim(rbl); + } + ~MDSlaveUpdate() { + if (waiter) + waiter->complete(0); + } +}; 
+ + +#endif diff --git a/src/mds/OpenFileTable.cc b/src/mds/OpenFileTable.cc new file mode 100644 index 00000000..5e0d2ba5 --- /dev/null +++ b/src/mds/OpenFileTable.cc @@ -0,0 +1,1189 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2018 Red Hat + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "acconfig.h" +#include "mds/CInode.h" +#include "mds/CDir.h" +#include "mds/MDSRank.h" +#include "mds/MDCache.h" +#include "osdc/Objecter.h" +#include "OpenFileTable.h" + +#include "common/config.h" +#include "common/errno.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mds +#undef dout_prefix +#define dout_prefix _prefix(_dout, mds) +static ostream& _prefix(std::ostream *_dout, MDSRank *mds) { + return *_dout << "mds." << mds->get_nodeid() << ".openfiles "; +} + +void OpenFileTable::get_ref(CInode *in) +{ + do { + auto p = anchor_map.find(in->ino()); + if (p != anchor_map.end()) { + ceph_assert(in->state_test(CInode::STATE_TRACKEDBYOFT)); + ceph_assert(p->second.nref > 0); + p->second.nref++; + break; + } + + CDentry *dn = in->get_parent_dn(); + CInode *pin = dn ? dn->get_dir()->get_inode() : nullptr; + + auto ret = anchor_map.emplace(std::piecewise_construct, std::forward_as_tuple(in->ino()), + std::forward_as_tuple(in->ino(), (pin ? pin->ino() : inodeno_t(0)), + (dn ? 
dn->get_name() : string()), in->d_type(), 1)); + ceph_assert(ret.second == true); + in->state_set(CInode::STATE_TRACKEDBYOFT); + + auto ret1 = dirty_items.emplace(in->ino(), (int)DIRTY_NEW); + if (!ret1.second) { + int omap_idx = ret1.first->second; + ceph_assert(omap_idx >= 0); + ret.first->second.omap_idx = omap_idx; + } + + in = pin; + } while (in); +} + +void OpenFileTable::put_ref(CInode *in) +{ + do { + ceph_assert(in->state_test(CInode::STATE_TRACKEDBYOFT)); + auto p = anchor_map.find(in->ino()); + ceph_assert(p != anchor_map.end()); + ceph_assert(p->second.nref > 0); + + if (p->second.nref > 1) { + p->second.nref--; + break; + } + + CDentry *dn = in->get_parent_dn(); + CInode *pin = dn ? dn->get_dir()->get_inode() : nullptr; + if (dn) { + ceph_assert(p->second.dirino == pin->ino()); + ceph_assert(p->second.d_name == dn->get_name()); + } else { + ceph_assert(p->second.dirino == inodeno_t(0)); + ceph_assert(p->second.d_name == ""); + } + + int omap_idx = p->second.omap_idx; + anchor_map.erase(p); + in->state_clear(CInode::STATE_TRACKEDBYOFT); + + auto ret = dirty_items.emplace(in->ino(), omap_idx); + if (!ret.second) { + if (ret.first->second == DIRTY_NEW) { + ceph_assert(omap_idx < 0); + dirty_items.erase(ret.first); + } else { + ceph_assert(omap_idx >= 0); + ret.first->second = omap_idx; + } + } + + in = pin; + } while (in); +} + +void OpenFileTable::add_inode(CInode *in) +{ + dout(10) << __func__ << " " << *in << dendl; + if (!in->is_dir()) { + auto p = anchor_map.find(in->ino()); + ceph_assert(p == anchor_map.end()); + } + get_ref(in); +} + +void OpenFileTable::remove_inode(CInode *in) +{ + dout(10) << __func__ << " " << *in << dendl; + if (!in->is_dir()) { + auto p = anchor_map.find(in->ino()); + ceph_assert(p != anchor_map.end()); + ceph_assert(p->second.nref == 1); + } + put_ref(in); +} + +void OpenFileTable::add_dirfrag(CDir *dir) +{ + dout(10) << __func__ << " " << *dir << dendl; + ceph_assert(!dir->state_test(CDir::STATE_TRACKEDBYOFT)); + 
dir->state_set(CDir::STATE_TRACKEDBYOFT); + auto ret = dirfrags.insert(dir->dirfrag()); + ceph_assert(ret.second); + get_ref(dir->get_inode()); + dirty_items.emplace(dir->ino(), (int)DIRTY_UNDEF); +} + +void OpenFileTable::remove_dirfrag(CDir *dir) +{ + dout(10) << __func__ << " " << *dir << dendl; + ceph_assert(dir->state_test(CDir::STATE_TRACKEDBYOFT)); + dir->state_clear(CDir::STATE_TRACKEDBYOFT); + auto p = dirfrags.find(dir->dirfrag()); + ceph_assert(p != dirfrags.end()); + dirfrags.erase(p); + dirty_items.emplace(dir->ino(), (int)DIRTY_UNDEF); + put_ref(dir->get_inode()); +} + +void OpenFileTable::notify_link(CInode *in) +{ + dout(10) << __func__ << " " << *in << dendl; + auto p = anchor_map.find(in->ino()); + ceph_assert(p != anchor_map.end()); + ceph_assert(p->second.nref > 0); + ceph_assert(p->second.dirino == inodeno_t(0)); + ceph_assert(p->second.d_name == ""); + + CDentry *dn = in->get_parent_dn(); + CInode *pin = dn->get_dir()->get_inode(); + + p->second.dirino = pin->ino(); + p->second.d_name = dn->get_name(); + dirty_items.emplace(in->ino(), (int)DIRTY_UNDEF); + + get_ref(pin); +} + +void OpenFileTable::notify_unlink(CInode *in) +{ + dout(10) << __func__ << " " << *in << dendl; + auto p = anchor_map.find(in->ino()); + ceph_assert(p != anchor_map.end()); + ceph_assert(p->second.nref > 0); + + CDentry *dn = in->get_parent_dn(); + CInode *pin = dn->get_dir()->get_inode(); + ceph_assert(p->second.dirino == pin->ino()); + ceph_assert(p->second.d_name == dn->get_name()); + + p->second.dirino = inodeno_t(0); + p->second.d_name = ""; + dirty_items.emplace(in->ino(), (int)DIRTY_UNDEF); + + put_ref(pin); +} + +object_t OpenFileTable::get_object_name(unsigned idx) const +{ + char s[30]; + snprintf(s, sizeof(s), "mds%d_openfiles.%x", int(mds->get_nodeid()), idx); + return object_t(s); +} + +void OpenFileTable::_encode_header(bufferlist &bl, int j_state) +{ + std::string_view magic = CEPH_FS_ONDISK_MAGIC; + encode(magic, bl); + ENCODE_START(1, 1, bl); + 
encode(omap_version, bl); + encode(omap_num_objs, bl); + encode((__u8)j_state, bl); + ENCODE_FINISH(bl); +} + +class C_IO_OFT_Save : public MDSIOContextBase { +protected: + OpenFileTable *oft; + uint64_t log_seq; + MDSContext *fin; + MDSRank *get_mds() override { return oft->mds; } +public: + C_IO_OFT_Save(OpenFileTable *t, uint64_t s, MDSContext *c) : + oft(t), log_seq(s), fin(c) {} + void finish(int r) { + oft->_commit_finish(r, log_seq, fin); + } + void print(ostream& out) const override { + out << "openfiles_save"; + } +}; + +void OpenFileTable::_commit_finish(int r, uint64_t log_seq, MDSContext *fin) +{ + dout(10) << __func__ << " log_seq " << log_seq << dendl; + if (r < 0) { + mds->handle_write_error(r); + return; + } + + ceph_assert(log_seq <= committing_log_seq); + ceph_assert(log_seq >= committed_log_seq); + committed_log_seq = log_seq; + num_pending_commit--; + + if (fin) + fin->complete(r); +} + +class C_IO_OFT_Journal : public MDSIOContextBase { +protected: + OpenFileTable *oft; + uint64_t log_seq; + MDSContext *fin; + std::map<unsigned, std::vector<ObjectOperation> > ops_map; + MDSRank *get_mds() override { return oft->mds; } +public: + C_IO_OFT_Journal(OpenFileTable *t, uint64_t s, MDSContext *c, + std::map<unsigned, std::vector<ObjectOperation> >& ops) : + oft(t), log_seq(s), fin(c) { + ops_map.swap(ops); + } + void finish(int r) { + oft->_journal_finish(r, log_seq, fin, ops_map); + } + void print(ostream& out) const override { + out << "openfiles_journal"; + } +}; + +void OpenFileTable::_journal_finish(int r, uint64_t log_seq, MDSContext *c, + std::map<unsigned, std::vector<ObjectOperation> >& ops_map) +{ + dout(10) << __func__ << " log_seq " << log_seq << dendl; + if (r < 0) { + mds->handle_write_error(r); + return; + } + + C_GatherBuilder gather(g_ceph_context, + new C_OnFinisher(new C_IO_OFT_Save(this, log_seq, c), + mds->finisher)); + SnapContext snapc; + object_locator_t oloc(mds->mdsmap->get_metadata_pool()); + for (auto& it : ops_map) { + 
object_t oid = get_object_name(it.first); + for (auto& op : it.second) { + mds->objecter->mutate(oid, oloc, op, snapc, ceph::real_clock::now(), + 0, gather.new_sub()); + } + } + gather.activate(); + + journal_state = JOURNAL_NONE; + return; +} + +void OpenFileTable::commit(MDSContext *c, uint64_t log_seq, int op_prio) +{ + dout(10) << __func__ << " log_seq " << log_seq << dendl; + + ceph_assert(num_pending_commit == 0); + num_pending_commit++; + ceph_assert(log_seq >= committing_log_seq); + committing_log_seq = log_seq; + + omap_version++; + + C_GatherBuilder gather(g_ceph_context); + + SnapContext snapc; + object_locator_t oloc(mds->mdsmap->get_metadata_pool()); + + const unsigned max_write_size = mds->mdcache->max_dir_commit_size; + + struct omap_update_ctl { + unsigned write_size = 0; + unsigned journal_idx = 0; + bool clear = false; + std::map<string, bufferlist> to_update, journaled_update; + std::set<string> to_remove, journaled_remove; + }; + std::vector<omap_update_ctl> omap_updates(omap_num_objs); + + using ceph::encode; + auto journal_func = [&](unsigned idx) { + auto& ctl = omap_updates.at(idx); + + ObjectOperation op; + op.priority = op_prio; + + if (ctl.clear) { + ctl.clear = false; + op.omap_clear(); + op.set_last_op_flags(CEPH_OSD_OP_FLAG_FAILOK); + } + + if (ctl.journal_idx == 0) { + if (journal_state == JOURNAL_NONE) + journal_state = JOURNAL_START; + else + ceph_assert(journal_state == JOURNAL_START); + + bufferlist header; + _encode_header(header, journal_state); + op.omap_set_header(header); + } + + bufferlist bl; + encode(omap_version, bl); + encode(ctl.to_update, bl); + encode(ctl.to_remove, bl); + + char key[32]; + snprintf(key, sizeof(key), "_journal.%x", ctl.journal_idx++); + std::map<string, bufferlist> tmp_map; + tmp_map[key].swap(bl); + op.omap_set(tmp_map); + + object_t oid = get_object_name(idx); + mds->objecter->mutate(oid, oloc, op, snapc, ceph::real_clock::now(), 0, + gather.new_sub()); + +#ifdef HAVE_STDLIB_MAP_SPLICING + 
ctl.journaled_update.merge(ctl.to_update); + ctl.journaled_remove.merge(ctl.to_remove); +#else + ctl.journaled_update.insert(make_move_iterator(begin(ctl.to_update)), + make_move_iterator(end(ctl.to_update))); + ctl.journaled_remove.insert(make_move_iterator(begin(ctl.to_remove)), + make_move_iterator(end(ctl.to_remove))); +#endif + ctl.to_update.clear(); + ctl.to_remove.clear(); + }; + + std::map<unsigned, std::vector<ObjectOperation> > ops_map; + + auto create_op_func = [&](unsigned idx, bool update_header) { + auto& ctl = omap_updates.at(idx); + + auto& op_vec = ops_map[idx]; + op_vec.resize(op_vec.size() + 1); + ObjectOperation& op = op_vec.back(); + op.priority = op_prio; + + if (ctl.clear) { + ctl.clear = false; + op.omap_clear(); + op.set_last_op_flags(CEPH_OSD_OP_FLAG_FAILOK); + } + + if (update_header) { + bufferlist header; + _encode_header(header, journal_state); + op.omap_set_header(header); + } + + if (!ctl.to_update.empty()) { + op.omap_set(ctl.to_update); + ctl.to_update.clear(); + } + if (!ctl.to_remove.empty()) { + op.omap_rm_keys(ctl.to_remove); + ctl.to_remove.clear(); + } + }; + + auto submit_ops_func = [&]() { + gather.set_finisher(new C_OnFinisher(new C_IO_OFT_Save(this, log_seq, c), + mds->finisher)); + for (auto& it : ops_map) { + object_t oid = get_object_name(it.first); + for (auto& op : it.second) { + mds->objecter->mutate(oid, oloc, op, snapc, ceph::real_clock::now(), + 0, gather.new_sub()); + } + } + gather.activate(); + }; + + bool first_commit = !loaded_anchor_map.empty(); + + unsigned first_free_idx = 0; + unsigned old_num_objs = omap_num_objs; + if (omap_num_objs == 0) { + omap_num_objs = 1; + omap_num_items.resize(omap_num_objs); + omap_updates.resize(omap_num_objs); + omap_updates.back().clear = true; + } + + for (auto& it : dirty_items) { + frag_vec_t frags; + auto p = anchor_map.find(it.first); + if (p != anchor_map.end()) { + for (auto q = dirfrags.lower_bound(dirfrag_t(it.first, 0)); + q != dirfrags.end() && q->ino == 
it.first; + ++q) + frags.push_back(q->frag); + } + + if (first_commit) { + auto q = loaded_anchor_map.find(it.first); + if (q != loaded_anchor_map.end()) { + ceph_assert(p != anchor_map.end()); + p->second.omap_idx = q->second.omap_idx; + bool same = p->second == q->second; + if (same) { + auto r = loaded_dirfrags.lower_bound(dirfrag_t(it.first, 0)); + for (const auto& fg : frags) { + if (r == loaded_dirfrags.end() || !(*r == dirfrag_t(it.first, fg))) { + same = false; + break; + } + ++r; + } + if (same && r != loaded_dirfrags.end() && r->ino == it.first) + same = false; + } + loaded_anchor_map.erase(q); + if (same) + continue; + } + } + + char key[32]; + int len = snprintf(key, sizeof(key), "%llx", (unsigned long long)it.first.val); + + int omap_idx; + if (p != anchor_map.end()) { + omap_idx = p->second.omap_idx; + if (omap_idx < 0) { + ceph_assert(it.second == DIRTY_NEW); + // find omap object to store the key + for (unsigned i = first_free_idx; i < omap_num_objs; i++) { + if (omap_num_items[i] < MAX_ITEMS_PER_OBJ) { + omap_idx = i; + break; + } + } + if (omap_idx < 0) { + ++omap_num_objs; + ceph_assert(omap_num_objs <= MAX_OBJECTS); + omap_num_items.resize(omap_num_objs); + omap_updates.resize(omap_num_objs); + omap_updates.back().clear = true; + omap_idx = omap_num_objs - 1; + } + first_free_idx = omap_idx; + + p->second.omap_idx = omap_idx; + ++omap_num_items[omap_idx]; + } + } else { + omap_idx = it.second; + unsigned& count = omap_num_items.at(omap_idx); + ceph_assert(count > 0); + --count; + if ((unsigned)omap_idx < first_free_idx && count < MAX_ITEMS_PER_OBJ) + first_free_idx = omap_idx; + } + auto& ctl = omap_updates.at(omap_idx); + + if (p != anchor_map.end()) { + bufferlist bl; + encode(p->second, bl); + encode(frags, bl); + + ctl.write_size += bl.length() + len + 2 * sizeof(__u32); + ctl.to_update[key].swap(bl); + } else { + ctl.write_size += len + sizeof(__u32); + ctl.to_remove.emplace(key); + } + + if (ctl.write_size >= max_write_size) { + 
journal_func(omap_idx); + ctl.write_size = 0; + } + } + + dirty_items.clear(); + + if (first_commit) { + for (auto& it : loaded_anchor_map) { + char key[32]; + int len = snprintf(key, sizeof(key), "%llx", (unsigned long long)it.first.val); + + int omap_idx = it.second.omap_idx; + unsigned& count = omap_num_items.at(omap_idx); + ceph_assert(count > 0); + --count; + + auto& ctl = omap_updates.at(omap_idx); + ctl.write_size += len + sizeof(__u32); + ctl.to_remove.emplace(key); + + if (ctl.write_size >= max_write_size) { + journal_func(omap_idx); + ctl.write_size = 0; + } + } + loaded_anchor_map.clear(); + loaded_dirfrags.clear(); + } + + { + size_t total_items = 0; + unsigned used_objs = 1; + std::list<unsigned> objs_to_write; + bool journaled = false; + for (unsigned i = 0; i < omap_num_objs; i++) { + total_items += omap_num_items[i]; + if (omap_updates[i].journal_idx) + journaled = true; + else if (omap_updates[i].write_size) + objs_to_write.push_back(i); + + if (omap_num_items[i] > 0) + used_objs = i + 1; + } + ceph_assert(total_items == anchor_map.size()); + // adjust omap object count + if (used_objs < omap_num_objs) { + omap_num_objs = used_objs; + omap_num_items.resize(omap_num_objs); + } + // skip journal if only one osd request is required and object count + // does not change. + if (!journaled && old_num_objs == omap_num_objs && + objs_to_write.size() <= 1) { + ceph_assert(journal_state == JOURNAL_NONE); + ceph_assert(!gather.has_subs()); + + unsigned omap_idx = objs_to_write.empty() ? 
0 : objs_to_write.front(); + create_op_func(omap_idx, true); + submit_ops_func(); + return; + } + } + + for (unsigned omap_idx = 0; omap_idx < omap_updates.size(); omap_idx++) { + auto& ctl = omap_updates[omap_idx]; + if (ctl.write_size > 0) { + journal_func(omap_idx); + ctl.write_size = 0; + } + } + + if (journal_state == JOURNAL_START) { + ceph_assert(gather.has_subs()); + journal_state = JOURNAL_FINISH; + } else { + // only object count changes + ceph_assert(journal_state == JOURNAL_NONE); + ceph_assert(!gather.has_subs()); + } + + for (unsigned omap_idx = 0; omap_idx < omap_updates.size(); omap_idx++) { + auto& ctl = omap_updates[omap_idx]; + ceph_assert(ctl.to_update.empty() && ctl.to_remove.empty()); + if (ctl.journal_idx == 0) + ceph_assert(ctl.journaled_update.empty() && ctl.journaled_remove.empty()); + + bool first = true; + for (auto& it : ctl.journaled_update) { + ctl.write_size += it.first.length() + it.second.length() + 2 * sizeof(__u32); + ctl.to_update[it.first].swap(it.second); + if (ctl.write_size >= max_write_size) { + create_op_func(omap_idx, first); + ctl.write_size = 0; + first = false; + } + } + + for (auto& key : ctl.journaled_remove) { + ctl.write_size += key.length() + sizeof(__u32); + ctl.to_remove.emplace(key); + if (ctl.write_size >= max_write_size) { + create_op_func(omap_idx, first); + ctl.write_size = 0; + first = false; + } + } + + for (unsigned i = 0; i < ctl.journal_idx; ++i) { + char key[32]; + snprintf(key, sizeof(key), "_journal.%x", i); + ctl.to_remove.emplace(key); + } + + // update first object's omap header if object count changes + if (ctl.clear || + ctl.journal_idx > 0 || + (omap_idx == 0 && old_num_objs != omap_num_objs)) + create_op_func(omap_idx, first); + } + + ceph_assert(!ops_map.empty()); + if (journal_state == JOURNAL_FINISH) { + gather.set_finisher(new C_OnFinisher(new C_IO_OFT_Journal(this, log_seq, c, ops_map), + mds->finisher)); + gather.activate(); + } else { + submit_ops_func(); + } +} + +class C_IO_OFT_Load 
: public MDSIOContextBase { +protected: + OpenFileTable *oft; + MDSRank *get_mds() override { return oft->mds; } + +public: + int header_r = 0; //< Return value from OMAP header read + int values_r = 0; //< Return value from OMAP value read + bufferlist header_bl; + std::map<std::string, bufferlist> values; + unsigned index; + bool first; + bool more = false; + + C_IO_OFT_Load(OpenFileTable *t, unsigned i, bool f) : + oft(t), index(i), first(f) {} + void finish(int r) override { + oft->_load_finish(r, header_r, values_r, index, first, more, header_bl, values); + } + void print(ostream& out) const override { + out << "openfiles_load"; + } +}; + +class C_IO_OFT_Recover : public MDSIOContextBase { +protected: + OpenFileTable *oft; + MDSRank *get_mds() override { return oft->mds; } +public: + C_IO_OFT_Recover(OpenFileTable *t) : oft(t) {} + void finish(int r) override { + oft->_recover_finish(r); + } + void print(ostream& out) const override { + out << "openfiles_recover"; + } +}; + +void OpenFileTable::_recover_finish(int r) +{ + if (r < 0) { + derr << __func__ << " got " << cpp_strerror(r) << dendl; + _reset_states(); + } else { + dout(10) << __func__ << ": load complete" << dendl; + } + + journal_state = JOURNAL_NONE; + load_done = true; + finish_contexts(g_ceph_context, waiting_for_load); + waiting_for_load.clear(); +} + +void OpenFileTable::_load_finish(int op_r, int header_r, int values_r, + unsigned idx, bool first, bool more, + bufferlist &header_bl, + std::map<std::string, bufferlist> &values) +{ + using ceph::decode; + int err = -EINVAL; + + auto decode_func = [this](unsigned idx, inodeno_t ino, bufferlist &bl) { + auto p = bl.cbegin(); + + size_t count = loaded_anchor_map.size(); + auto it = loaded_anchor_map.emplace_hint(loaded_anchor_map.end(), + std::piecewise_construct, + std::make_tuple(ino), + std::make_tuple()); + RecoveredAnchor& anchor = it->second; + decode(anchor, p); + ceph_assert(ino == anchor.ino); + anchor.omap_idx = idx; + anchor.auth = 
MDS_RANK_NONE; + + frag_vec_t frags; + decode(frags, p); + for (const auto& fg : frags) + loaded_dirfrags.insert(loaded_dirfrags.end(), dirfrag_t(anchor.ino, fg)); + + if (loaded_anchor_map.size() > count) + ++omap_num_items[idx]; + }; + + if (op_r < 0) { + derr << __func__ << " got " << cpp_strerror(op_r) << dendl; + err = op_r; + goto out; + } + + try { + if (first) { + auto p = header_bl.cbegin(); + + string magic; + version_t version; + unsigned num_objs; + __u8 jstate; + + if (header_bl.length() == 13) { + // obsolete format. + decode(version, p); + decode(num_objs, p); + decode(jstate, p); + } else { + decode(magic, p); + if (magic != CEPH_FS_ONDISK_MAGIC) { + std::ostringstream oss; + oss << "invalid magic '" << magic << "'"; + throw buffer::malformed_input(oss.str()); + } + + DECODE_START(1, p); + decode(version, p); + decode(num_objs, p); + decode(jstate, p); + DECODE_FINISH(p); + } + + if (num_objs > MAX_OBJECTS) { + std::ostringstream oss; + oss << "invalid object count '" << num_objs << "'"; + throw buffer::malformed_input(oss.str()); + } + if (jstate > JOURNAL_FINISH) { + std::ostringstream oss; + oss << "invalid journal state '" << jstate << "'"; + throw buffer::malformed_input(oss.str()); + } + + if (version > omap_version) { + omap_version = version; + omap_num_objs = num_objs; + omap_num_items.resize(omap_num_objs); + journal_state = jstate; + } else if (version == omap_version) { + ceph_assert(omap_num_objs == num_objs); + if (jstate > journal_state) + journal_state = jstate; + } + } + + for (auto& it : values) { + if (it.first.compare(0, 9, "_journal.") == 0) { + if (idx >= loaded_journals.size()) + loaded_journals.resize(idx + 1); + + if (journal_state == JOURNAL_FINISH) { + loaded_journals[idx][it.first].swap(it.second); + } else { // incomplete journal + loaded_journals[idx][it.first].length(); + } + continue; + } + + inodeno_t ino; + sscanf(it.first.c_str(), "%llx", (unsigned long long*)&ino.val); + decode_func(idx, ino, it.second); + } + } 
catch (buffer::error &e) { + derr << __func__ << ": corrupted header/values: " << e.what() << dendl; + goto out; + } + + if (more || idx + 1 < omap_num_objs) { + // Issue another read if we're not at the end of the omap + std::string last_key; + if (more) + last_key = values.rbegin()->first; + else + idx++; + dout(10) << __func__ << ": continue to load from '" << last_key << "'" << dendl; + object_t oid = get_object_name(idx); + object_locator_t oloc(mds->mdsmap->get_metadata_pool()); + C_IO_OFT_Load *c = new C_IO_OFT_Load(this, idx, !more); + ObjectOperation op; + if (!more) + op.omap_get_header(&c->header_bl, &c->header_r); + op.omap_get_vals(last_key, "", uint64_t(-1), + &c->values, &c->more, &c->values_r); + mds->objecter->read(oid, oloc, op, CEPH_NOSNAP, nullptr, 0, + new C_OnFinisher(c, mds->finisher)); + return; + } + + // replay journal + if (loaded_journals.size() > 0) { + dout(10) << __func__ << ": recover journal" << dendl; + + C_GatherBuilder gather(g_ceph_context, + new C_OnFinisher(new C_IO_OFT_Recover(this), + mds->finisher)); + object_locator_t oloc(mds->mdsmap->get_metadata_pool()); + SnapContext snapc; + + for (unsigned omap_idx = 0; omap_idx < loaded_journals.size(); omap_idx++) { + auto& loaded_journal = loaded_journals[omap_idx]; + + std::vector<ObjectOperation> op_vec; + try { + for (auto& it : loaded_journal) { + if (journal_state != JOURNAL_FINISH) + continue; + auto p = it.second.cbegin(); + version_t version; + std::map<string, bufferlist> to_update; + std::set<string> to_remove; + decode(version, p); + if (version != omap_version) + continue; + decode(to_update, p); + decode(to_remove, p); + it.second.clear(); + + for (auto& q : to_update) { + inodeno_t ino; + sscanf(q.first.c_str(), "%llx", (unsigned long long*)&ino.val); + decode_func(omap_idx, ino, q.second); + } + for (auto& q : to_remove) { + inodeno_t ino; + sscanf(q.c_str(), "%llx",(unsigned long long*)&ino.val); + ceph_assert(ino.val > 0); + if (loaded_anchor_map.erase(ino)) { + 
unsigned& count = omap_num_items[omap_idx]; + ceph_assert(count > 0); + --count; + } + auto r = loaded_dirfrags.lower_bound(dirfrag_t(ino, 0)); + while (r != loaded_dirfrags.end() && r->ino == ino) + loaded_dirfrags.erase(r++); + } + + op_vec.resize(op_vec.size() + 1); + ObjectOperation& op = op_vec.back(); + op.priority = CEPH_MSG_PRIO_HIGH; + if (!to_update.empty()) + op.omap_set(to_update); + if (!to_remove.empty()) + op.omap_rm_keys(to_remove); + } + } catch (buffer::error &e) { + derr << __func__ << ": corrupted journal: " << e.what() << dendl; + goto out; + } + + op_vec.resize(op_vec.size() + 1); + ObjectOperation& op = op_vec.back(); + { + bufferlist header; + if (journal_state == JOURNAL_FINISH) + _encode_header(header, JOURNAL_FINISH); + else + _encode_header(header, JOURNAL_NONE); + op.omap_set_header(header); + } + { + // remove journal + std::set<string> to_remove; + for (auto &it : loaded_journal) + to_remove.emplace(it.first); + op.omap_rm_keys(to_remove); + } + loaded_journal.clear(); + + object_t oid = get_object_name(omap_idx); + for (auto& op : op_vec) { + mds->objecter->mutate(oid, oloc, op, snapc, ceph::real_clock::now(), + 0, gather.new_sub()); + } + } + gather.activate(); + return; + } + + journal_state = JOURNAL_NONE; + err = 0; + dout(10) << __func__ << ": load complete" << dendl; +out: + + if (err < 0) + _reset_states(); + + load_done = true; + finish_contexts(g_ceph_context, waiting_for_load); + waiting_for_load.clear(); +} + +void OpenFileTable::load(MDSContext *onload) +{ + dout(10) << __func__ << dendl; + ceph_assert(!load_done); + if (onload) + waiting_for_load.push_back(onload); + + C_IO_OFT_Load *c = new C_IO_OFT_Load(this, 0, true); + object_t oid = get_object_name(0); + object_locator_t oloc(mds->mdsmap->get_metadata_pool()); + + ObjectOperation op; + op.omap_get_header(&c->header_bl, &c->header_r); + op.omap_get_vals("", "", uint64_t(-1), + &c->values, &c->more, &c->values_r); + + mds->objecter->read(oid, oloc, op, CEPH_NOSNAP, 
nullptr, 0,
		      new C_OnFinisher(c, mds->finisher));
}

// Resolve the ancestor chain of 'ino' purely from the anchors recovered from
// the omap objects (loaded_anchor_map), without touching the cache.
// Each hop pushes (parent dir ino, dentry name) onto 'ancestors'.
// 'auth_hint' is set from the first ancestor's recorded auth rank, if known.
// Returns false if 'ino' is unknown or has no parent recorded (dirino 0).
bool OpenFileTable::get_ancestors(inodeno_t ino, vector<inode_backpointer_t>& ancestors,
				  mds_rank_t& auth_hint)
{
  auto p = loaded_anchor_map.find(ino);
  if (p == loaded_anchor_map.end())
    return false;

  inodeno_t dirino = p->second.dirino;
  if (dirino == inodeno_t(0))
    return false;

  bool first = true;
  ancestors.clear();
  while (true) {
    // dversion is unknown here; 0 is a placeholder in the backpointer
    ancestors.push_back(inode_backpointer_t(dirino, p->second.d_name, 0));

    p = loaded_anchor_map.find(dirino);
    if (p == loaded_anchor_map.end())
      break;

    // only the immediate parent's auth rank is a useful hint for open_ino
    if (first)
      auth_hint = p->second.auth;

    dirino = p->second.dirino;
    if (dirino == inodeno_t(0))
      break;

    first = false;
  }
  return true;
}

// Completion context for MDCache::open_ino() issued during prefetch; funnels
// the result back into OpenFileTable::_open_ino_finish.
class C_OFT_OpenInoFinish: public MDSContext {
  OpenFileTable *oft;
  inodeno_t ino;
  MDSRank *get_mds() override { return oft->mds; }
public:
  C_OFT_OpenInoFinish(OpenFileTable *t, inodeno_t i) : oft(t), ino(i) {}
  void finish(int r) override {
    oft->_open_ino_finish(ino, r);
  }
};

// Called once per open_ino() completion during prefetch. 'r', when >= 0, is
// the rank that is authoritative for 'ino'.  When the last in-flight open
// finishes, advance the prefetch state machine:
//   DIR_INODES -> DIRFRAGS (fetch dirfrags next)
//   FILE_INODES -> DONE    (prefetch complete, release bookkeeping)
void OpenFileTable::_open_ino_finish(inodeno_t ino, int r)
{
  if (prefetch_state == DIR_INODES && r >= 0 && ino != inodeno_t(0)) {
    // remember the auth rank so get_ancestors() can hint it later
    auto p = loaded_anchor_map.find(ino);
    ceph_assert(p != loaded_anchor_map.end());
    p->second.auth = mds_rank_t(r);
  }

  if (r != mds->get_nodeid())
    mds->mdcache->rejoin_prefetch_ino_finish(ino, r);

  num_opening_inodes--;
  if (num_opening_inodes == 0) {
    if (prefetch_state == DIR_INODES) {
      prefetch_state = DIRFRAGS;
      _prefetch_dirfrags();
    } else if (prefetch_state == FILE_INODES) {
      prefetch_state = DONE;
      logseg_destroyed_inos.clear();
      destroyed_inos_set.clear();
      finish_contexts(g_ceph_context, waiting_for_prefetch);
      waiting_for_prefetch.clear();
    } else {
      ceph_abort();
    }
  }
}

// DIRFRAGS phase of prefetch: queue fetches for every dirfrag recorded in
// loaded_dirfrags whose inode is in cache and which we are auth for but have
// not yet completed.
void OpenFileTable::_prefetch_dirfrags()
{
  dout(10) << __func__ << dendl;
  ceph_assert(prefetch_state == DIRFRAGS);

  MDCache *mdcache = mds->mdcache;
  list<CDir*> fetch_queue;

  // loaded_dirfrags is sorted by ino, so cache the last looked-up inode
  CInode *last_in = nullptr;
for (auto df : loaded_dirfrags) { + CInode *diri; + if (last_in && last_in->ino() == df.ino) { + diri = last_in; + } else { + diri = mdcache->get_inode(df.ino); + if (!diri) + continue; + last_in = diri; + } + if (diri->state_test(CInode::STATE_REJOINUNDEF)) + continue; + + CDir *dir = diri->get_dirfrag(df.frag); + if (dir) { + if (dir->is_auth() && !dir->is_complete()) + fetch_queue.push_back(dir); + } else { + frag_vec_t leaves; + diri->dirfragtree.get_leaves_under(df.frag, leaves); + for (const auto& leaf : leaves) { + if (diri->is_auth()) { + dir = diri->get_or_open_dirfrag(mdcache, leaf); + } else { + dir = diri->get_dirfrag(leaf); + } + if (dir && dir->is_auth() && !dir->is_complete()) + fetch_queue.push_back(dir); + } + } + } + + MDSGatherBuilder gather(g_ceph_context); + int num_opening_dirfrags = 0; + for (auto dir : fetch_queue) { + if (dir->state_test(CDir::STATE_REJOINUNDEF)) + ceph_assert(dir->get_inode()->dirfragtree.is_leaf(dir->get_frag())); + dir->fetch(gather.new_sub()); + + if (!(++num_opening_dirfrags % 1000)) + mds->heartbeat_reset(); + } + + auto finish_func = [this](int r) { + prefetch_state = FILE_INODES; + _prefetch_inodes(); + }; + if (gather.has_subs()) { + gather.set_finisher( + new MDSInternalContextWrapper(mds, + new FunctionContext(finish_func))); + gather.activate(); + } else { + finish_func(0); + } +} + +void OpenFileTable::_prefetch_inodes() +{ + dout(10) << __func__ << " state " << prefetch_state << dendl; + ceph_assert(!num_opening_inodes); + num_opening_inodes = 1; + + int64_t pool; + if (prefetch_state == DIR_INODES) + pool = mds->mdsmap->get_metadata_pool(); + else if (prefetch_state == FILE_INODES) + pool = mds->mdsmap->get_first_data_pool(); + else + ceph_abort(); + + MDCache *mdcache = mds->mdcache; + + if (destroyed_inos_set.empty()) { + for (auto& it : logseg_destroyed_inos) + destroyed_inos_set.insert(it.second.begin(), it.second.end()); + } + + for (auto& it : loaded_anchor_map) { + if 
(destroyed_inos_set.count(it.first)) + continue; + if (it.second.d_type == DT_DIR) { + if (prefetch_state != DIR_INODES) + continue; + if (MDS_INO_IS_MDSDIR(it.first)) { + it.second.auth = MDS_INO_MDSDIR_OWNER(it.first); + continue; + } + if (MDS_INO_IS_STRAY(it.first)) { + it.second.auth = MDS_INO_STRAY_OWNER(it.first); + continue; + } + } else { + if (prefetch_state != FILE_INODES) + continue; + // load all file inodes for MDCache::identify_files_to_recover() + } + CInode *in = mdcache->get_inode(it.first); + if (in) + continue; + + num_opening_inodes++; + mdcache->open_ino(it.first, pool, new C_OFT_OpenInoFinish(this, it.first), false); + + if (!(num_opening_inodes % 1000)) + mds->heartbeat_reset(); + } + + _open_ino_finish(inodeno_t(0), 0); +} + +bool OpenFileTable::prefetch_inodes() +{ + dout(10) << __func__ << dendl; + ceph_assert(!prefetch_state); + prefetch_state = DIR_INODES; + + if (!load_done) { + wait_for_load( + new MDSInternalContextWrapper(mds, + new FunctionContext([this](int r) { + _prefetch_inodes(); + }) + ) + ); + return true; + } + + _prefetch_inodes(); + return !is_prefetched(); +} + +bool OpenFileTable::should_log_open(CInode *in) +{ + if (in->state_test(CInode::STATE_TRACKEDBYOFT)) { + // inode just journaled + if (in->last_journaled >= committing_log_seq) + return false; + // item not dirty. 
it means the item has already been saved + auto p = dirty_items.find(in->ino()); + if (p == dirty_items.end()) + return false; + } + return true; +} + +void OpenFileTable::note_destroyed_inos(uint64_t seq, const vector<inodeno_t>& inos) +{ + auto& vec = logseg_destroyed_inos[seq]; + vec.insert(vec.end(), inos.begin(), inos.end()); +} + +void OpenFileTable::trim_destroyed_inos(uint64_t seq) +{ + auto p = logseg_destroyed_inos.begin(); + while (p != logseg_destroyed_inos.end()) { + if (p->first >= seq) + break; + logseg_destroyed_inos.erase(p++); + } +} diff --git a/src/mds/OpenFileTable.h b/src/mds/OpenFileTable.h new file mode 100644 index 00000000..70d4c09b --- /dev/null +++ b/src/mds/OpenFileTable.h @@ -0,0 +1,151 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2018 Red Hat + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
 *
 */

#ifndef OPEN_FILE_TABLE_H
#define OPEN_FILE_TABLE_H

#include "mdstypes.h"
#include "Anchor.h"

#include "MDSContext.h"

class CDir;
class CInode;
class MDSRank;

// Tracks "open" (pinned/in-use) inodes and their dirfrags, persisting them to
// a set of omap objects in the metadata pool so that an MDS can prefetch the
// relevant metadata after a restart/rejoin instead of discovering it lazily.
class OpenFileTable
{
public:
  explicit OpenFileTable(MDSRank *m) : mds(m) {}

  // track/untrack inodes and dirfrags in the in-memory table
  void add_inode(CInode *in);
  void remove_inode(CInode *in);
  void add_dirfrag(CDir *dir);
  void remove_dirfrag(CDir *dir);
  void notify_link(CInode *in);
  void notify_unlink(CInode *in);
  bool is_any_dirty() const { return !dirty_items.empty(); }

  // persist dirty entries to the omap objects; 'c' completes when the
  // writeback for 'log_seq' is durable
  void commit(MDSContext *c, uint64_t log_seq, int op_prio);
  uint64_t get_committed_log_seq() const { return committed_log_seq; }
  uint64_t get_committing_log_seq() const { return committing_log_seq; }
  bool is_any_committing() const { return num_pending_commit > 0; }

  // read the table back from the omap objects (replaying any journal keys)
  void load(MDSContext *c);
  bool is_loaded() const { return load_done; }
  void wait_for_load(MDSContext *c) {
    ceph_assert(!load_done);
    waiting_for_load.push_back(c);
  }

  // resolve an ino's ancestry from the loaded (recovered) anchors
  bool get_ancestors(inodeno_t ino, vector<inode_backpointer_t>& ancestors,
		     mds_rank_t& auth_hint);

  // kick off the DIR_INODES -> DIRFRAGS -> FILE_INODES -> DONE prefetch
  // state machine; returns true if prefetch is (still) in progress
  bool prefetch_inodes();
  bool is_prefetched() const { return prefetch_state == DONE; }
  void wait_for_prefetch(MDSContext *c) {
    ceph_assert(!is_prefetched());
    waiting_for_prefetch.push_back(c);
  }

  // whether an 'open' event for this inode still needs to be journaled
  bool should_log_open(CInode *in);

  // record inos destroyed in log segment 'seq' so prefetch can skip them
  void note_destroyed_inos(uint64_t seq, const vector<inodeno_t>& inos);
  void trim_destroyed_inos(uint64_t seq);

protected:
  friend class C_IO_OFT_Recover;
  friend class C_IO_OFT_Load;
  friend class C_IO_OFT_Save;
  friend class C_IO_OFT_Journal;
  friend class C_OFT_OpenInoFinish;

  // cap on omap keys per object, taken from the scrub warning threshold so
  // the table never trips the large-omap health warning
  uint64_t MAX_ITEMS_PER_OBJ = g_conf().get_val<uint64_t>("osd_deep_scrub_large_omap_object_key_threshold");
  static const unsigned MAX_OBJECTS = 1024; // (1024 * osd_deep_scrub_large_omap_object_key_threshold) items at most

  // dirty_items states for entries with no omap slot assigned yet
  static const int DIRTY_NEW = -1;
  static const int DIRTY_UNDEF = -2;

  unsigned num_pending_commit = 0;
  void _encode_header(bufferlist& bl, int j_state);
  void _commit_finish(int r, uint64_t log_seq, MDSContext *fin);
  void _journal_finish(int r, uint64_t log_seq, MDSContext *fin,
		       std::map<unsigned, std::vector<ObjectOperation> >& ops);

  void get_ref(CInode *in);
  void put_ref(CInode *in);

  object_t get_object_name(unsigned idx) const;

  // drop all loaded/journal state after an unrecoverable load error
  void _reset_states() {
    omap_num_objs = 0;
    omap_num_items.resize(0);
    journal_state = JOURNAL_NONE;
    loaded_journals.clear();
    loaded_anchor_map.clear();
    loaded_dirfrags.clear();
  }
  void _load_finish(int op_r, int header_r, int values_r,
		    unsigned idx, bool first, bool more,
		    bufferlist &header_bl,
		    std::map<std::string, bufferlist> &values);
  void _recover_finish(int r);

  void _open_ino_finish(inodeno_t ino, int r);
  void _prefetch_inodes();
  void _prefetch_dirfrags();

  MDSRank *mds;

  version_t omap_version = 0;

  // layout of the on-disk table: number of omap objects and items per object
  unsigned omap_num_objs = 0;
  std::vector<unsigned> omap_num_items;

  // live, in-memory table
  map<inodeno_t, OpenedAnchor> anchor_map;
  set<dirfrag_t> dirfrags;

  std::map<inodeno_t, int> dirty_items; // ino -> dirty state

  uint64_t committed_log_seq = 0;
  uint64_t committing_log_seq = 0;

  // two-phase commit marker stored in the omap header
  enum {
    JOURNAL_NONE = 0,
    JOURNAL_START = 1,
    JOURNAL_FINISH = 2,
  };
  int journal_state = 0;

  // state recovered by load(); consumed during rejoin/prefetch
  std::vector<std::map<std::string, bufferlist> > loaded_journals;
  map<inodeno_t, RecoveredAnchor> loaded_anchor_map;
  set<dirfrag_t> loaded_dirfrags;
  MDSContext::vec waiting_for_load;
  bool load_done = false;

  // prefetch state machine phases
  enum {
    DIR_INODES = 1,
    DIRFRAGS = 2,
    FILE_INODES = 3,
    DONE = 4,
  };
  unsigned prefetch_state = 0;
  unsigned num_opening_inodes = 0;
  MDSContext::vec waiting_for_prefetch;

  // log seq -> inos destroyed in that segment (skipped by prefetch)
  std::map<uint64_t, vector<inodeno_t> > logseg_destroyed_inos;
  std::set<inodeno_t> destroyed_inos_set;
};

#endif
diff --git a/src/mds/PurgeQueue.cc b/src/mds/PurgeQueue.cc
new file mode 100644
index 00000000..cc30d9d2
--- /dev/null
+++ b/src/mds/PurgeQueue.cc
@@ -0,0 +1,776 @@
// -*- mode:C++; tab-width:8; c-basic-offset:2;
indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 Red Hat + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "common/debug.h" +#include "mds/mdstypes.h" +#include "mds/CInode.h" +#include "mds/MDCache.h" + +#include "PurgeQueue.h" + +#include <string.h> + +#define dout_context cct +#define dout_subsys ceph_subsys_mds +#undef dout_prefix +#define dout_prefix _prefix(_dout, rank) << __func__ << ": " +static ostream& _prefix(std::ostream *_dout, mds_rank_t rank) { + return *_dout << "mds." << rank << ".purge_queue "; +} + +const std::map<std::string, PurgeItem::Action> PurgeItem::actions = { + {"NONE", PurgeItem::NONE}, + {"PURGE_FILE", PurgeItem::PURGE_FILE}, + {"TRUNCATE_FILE", PurgeItem::TRUNCATE_FILE}, + {"PURGE_DIR", PurgeItem::PURGE_DIR} +}; + +void PurgeItem::encode(bufferlist &bl) const +{ + ENCODE_START(2, 1, bl); + encode((uint8_t)action, bl); + encode(ino, bl); + encode(size, bl); + encode(layout, bl, CEPH_FEATURE_FS_FILE_LAYOUT_V2); + encode(old_pools, bl); + encode(snapc, bl); + encode(fragtree, bl); + encode(stamp, bl); + uint8_t static const pad = 0xff; + for (unsigned int i = 0; i<pad_size; i++) { + encode(pad, bl); + } + ENCODE_FINISH(bl); +} + +void PurgeItem::decode(bufferlist::const_iterator &p) +{ + DECODE_START(2, p); + bool done = false; + if (struct_v == 1) { + auto p_start = p; + try { + // bad encoding introduced by v13.2.2 + decode(stamp, p); + decode(pad_size, p); + p.advance(pad_size); + uint8_t raw_action; + decode(raw_action, p); + action = (Action)raw_action; + decode(ino, p); + decode(size, p); + decode(layout, p); + decode(old_pools, p); + decode(snapc, p); + decode(fragtree, p); + if (p.get_off() > struct_end) + throw buffer::end_of_buffer(); + done = true; + } catch (const 
buffer::error &e) { + p = p_start; + } + } + if (!done) { + uint8_t raw_action; + decode(raw_action, p); + action = (Action)raw_action; + decode(ino, p); + decode(size, p); + decode(layout, p); + decode(old_pools, p); + decode(snapc, p); + decode(fragtree, p); + if (struct_v >= 2) { + decode(stamp, p); + } + } + DECODE_FINISH(p); +} + +// TODO: if Objecter has any slow requests, take that as a hint and +// slow down our rate of purging (keep accepting pushes though) +PurgeQueue::PurgeQueue( + CephContext *cct_, + mds_rank_t rank_, + const int64_t metadata_pool_, + Objecter *objecter_, + Context *on_error_) + : + cct(cct_), + rank(rank_), + lock("PurgeQueue"), + metadata_pool(metadata_pool_), + finisher(cct, "PurgeQueue", "PQ_Finisher"), + timer(cct, lock), + filer(objecter_, &finisher), + objecter(objecter_), + journaler("pq", MDS_INO_PURGE_QUEUE + rank, metadata_pool, + CEPH_FS_ONDISK_MAGIC, objecter_, nullptr, 0, + &finisher), + on_error(on_error_), + ops_in_flight(0), + max_purge_ops(0), + drain_initial(0), + draining(false), + delayed_flush(nullptr), + recovered(false) +{ + ceph_assert(cct != nullptr); + ceph_assert(on_error != nullptr); + ceph_assert(objecter != nullptr); + journaler.set_write_error_handler(on_error); +} + +PurgeQueue::~PurgeQueue() +{ + if (logger) { + g_ceph_context->get_perfcounters_collection()->remove(logger.get()); + } + delete on_error; +} + +void PurgeQueue::create_logger() +{ + PerfCountersBuilder pcb(g_ceph_context, "purge_queue", l_pq_first, l_pq_last); + + pcb.add_u64_counter(l_pq_executed, "pq_executed", "Purge queue tasks executed", + "purg", PerfCountersBuilder::PRIO_INTERESTING); + + pcb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL); + pcb.add_u64(l_pq_executing_ops, "pq_executing_ops", "Purge queue ops in flight"); + pcb.add_u64(l_pq_executing_ops_high_water, "pq_executing_ops_high_water", "Maximum number of executing file purge ops"); + pcb.add_u64(l_pq_executing, "pq_executing", "Purge queue tasks in flight"); + 
pcb.add_u64(l_pq_executing_high_water, "pq_executing_high_water", "Maximum number of executing file purges"); + + logger.reset(pcb.create_perf_counters()); + g_ceph_context->get_perfcounters_collection()->add(logger.get()); +} + +void PurgeQueue::init() +{ + std::lock_guard l(lock); + + ceph_assert(logger != nullptr); + + finisher.start(); + timer.init(); +} + +void PurgeQueue::activate() +{ + std::lock_guard l(lock); + + if (readonly) { + dout(10) << "skipping activate: PurgeQueue is readonly" << dendl; + return; + } + + if (journaler.get_read_pos() == journaler.get_write_pos()) + return; + + if (in_flight.empty()) { + dout(4) << "start work (by drain)" << dendl; + finisher.queue(new FunctionContext([this](int r) { + std::lock_guard l(lock); + _consume(); + })); + } +} + +void PurgeQueue::shutdown() +{ + std::lock_guard l(lock); + + journaler.shutdown(); + timer.shutdown(); + finisher.stop(); +} + +void PurgeQueue::open(Context *completion) +{ + dout(4) << "opening" << dendl; + + std::lock_guard l(lock); + + if (completion) + waiting_for_recovery.push_back(completion); + + journaler.recover(new FunctionContext([this](int r){ + if (r == -ENOENT) { + dout(1) << "Purge Queue not found, assuming this is an upgrade and " + "creating it." << dendl; + create(NULL); + } else if (r == 0) { + std::lock_guard l(lock); + dout(4) << "open complete" << dendl; + + // Journaler only guarantees entries before head write_pos have been + // fully flushed. Before appending new entries, we need to find and + // drop any partial written entry. 
+ if (journaler.last_committed.write_pos < journaler.get_write_pos()) { + dout(4) << "recovering write_pos" << dendl; + journaler.set_read_pos(journaler.last_committed.write_pos); + _recover(); + return; + } + + journaler.set_writeable(); + recovered = true; + finish_contexts(g_ceph_context, waiting_for_recovery); + } else { + derr << "Error " << r << " loading Journaler" << dendl; + _go_readonly(r); + } + })); +} + +void PurgeQueue::wait_for_recovery(Context* c) +{ + std::lock_guard l(lock); + if (recovered) { + c->complete(0); + } else if (readonly) { + dout(10) << "cannot wait for recovery: PurgeQueue is readonly" << dendl; + c->complete(-EROFS); + } else { + waiting_for_recovery.push_back(c); + } +} + +void PurgeQueue::_recover() +{ + ceph_assert(lock.is_locked_by_me()); + + // Journaler::is_readable() adjusts write_pos if partial entry is encountered + while (1) { + if (!journaler.is_readable() && + !journaler.get_error() && + journaler.get_read_pos() < journaler.get_write_pos()) { + journaler.wait_for_readable(new FunctionContext([this](int r) { + std::lock_guard l(lock); + _recover(); + })); + return; + } + + if (journaler.get_error()) { + int r = journaler.get_error(); + derr << "Error " << r << " recovering write_pos" << dendl; + _go_readonly(r); + return; + } + + if (journaler.get_read_pos() == journaler.get_write_pos()) { + dout(4) << "write_pos recovered" << dendl; + // restore original read_pos + journaler.set_read_pos(journaler.last_committed.expire_pos); + journaler.set_writeable(); + recovered = true; + finish_contexts(g_ceph_context, waiting_for_recovery); + return; + } + + bufferlist bl; + bool readable = journaler.try_read_entry(bl); + ceph_assert(readable); // we checked earlier + } +} + +void PurgeQueue::create(Context *fin) +{ + dout(4) << "creating" << dendl; + std::lock_guard l(lock); + + if (fin) + waiting_for_recovery.push_back(fin); + + file_layout_t layout = file_layout_t::get_default(); + layout.pool_id = metadata_pool; + 
journaler.set_writeable(); + journaler.create(&layout, JOURNAL_FORMAT_RESILIENT); + journaler.write_head(new FunctionContext([this](int r) { + std::lock_guard l(lock); + if (r) { + _go_readonly(r); + } else { + recovered = true; + finish_contexts(g_ceph_context, waiting_for_recovery); + } + })); +} + +/** + * The `completion` context will always be called back via a Finisher + */ +void PurgeQueue::push(const PurgeItem &pi, Context *completion) +{ + dout(4) << "pushing inode " << pi.ino << dendl; + std::lock_guard l(lock); + + if (readonly) { + dout(10) << "cannot push inode: PurgeQueue is readonly" << dendl; + completion->complete(-EROFS); + return; + } + + // Callers should have waited for open() before using us + ceph_assert(!journaler.is_readonly()); + + bufferlist bl; + + encode(pi, bl); + journaler.append_entry(bl); + journaler.wait_for_flush(completion); + + // Maybe go ahead and do something with it right away + bool could_consume = _consume(); + if (!could_consume) { + // Usually, it is not necessary to explicitly flush here, because the reader + // will get flushes generated inside Journaler::is_readable. However, + // if we remain in a _can_consume()==false state for a long period then + // we should flush in order to allow MDCache to drop its strays rather + // than having them wait for purgequeue to progress. 
    // (tail of PurgeQueue::push: schedule a deferred journaler flush so that
    // strays are not stuck waiting while consumption is throttled)
    if (!delayed_flush) {
      delayed_flush = new FunctionContext([this](int r){
	    delayed_flush = nullptr;
	    journaler.flush();
	  });

      timer.add_event_after(
          g_conf()->mds_purge_queue_busy_flush_period,
          delayed_flush);
    }
  }
}

// Estimate how many RADOS ops executing this purge item will keep in flight,
// used to throttle against max_purge_ops.
uint32_t PurgeQueue::_calculate_ops(const PurgeItem &item) const
{
  uint32_t ops_required = 0;
  if (item.action == PurgeItem::PURGE_DIR) {
    // Directory, count dirfrags to be deleted
    frag_vec_t leaves;
    if (!item.fragtree.is_leaf(frag_t())) {
      item.fragtree.get_leaves(leaves);
    }
    // One for the root, plus any leaves
    ops_required = 1 + leaves.size();
  } else {
    // File, work out concurrent Filer::purge deletes
    // Account for removing (or zeroing) backtrace
    const uint64_t num = (item.size > 0) ?
      Striper::get_num_objects(item.layout, item.size) : 1;

    ops_required = std::min(num, g_conf()->filer_max_purge_ops);

    // Account for deletions for old pools
    if (item.action != PurgeItem::TRUNCATE_FILE) {
      ops_required += item.old_pools.size();
    }
  }

  return ops_required;
}

// Whether we may start executing another queued item right now, given the
// in-flight op and file limits.  Caller must hold 'lock'.
bool PurgeQueue::_can_consume()
{
  if (readonly) {
    dout(10) << "can't consume: PurgeQueue is readonly" << dendl;
    return false;
  }

  dout(20) << ops_in_flight << "/" << max_purge_ops << " ops, "
           << in_flight.size() << "/" << g_conf()->mds_max_purge_files
           << " files" << dendl;

  if (in_flight.size() == 0 && cct->_conf->mds_max_purge_files > 0) {
    // Always permit consumption if nothing is in flight, so that the ops
    // limit can never be so low as to forbid all progress (unless
    // administrator has deliberately paused purging by setting max
    // purge files to zero).
+ return true; + } + + if (ops_in_flight >= max_purge_ops) { + dout(20) << "Throttling on op limit " << ops_in_flight << "/" + << max_purge_ops << dendl; + return false; + } + + if (in_flight.size() >= cct->_conf->mds_max_purge_files) { + dout(20) << "Throttling on item limit " << in_flight.size() + << "/" << cct->_conf->mds_max_purge_files << dendl; + return false; + } else { + return true; + } +} + +void PurgeQueue::_go_readonly(int r) +{ + if (readonly) return; + dout(1) << "going readonly because internal IO failed: " << strerror(-r) << dendl; + readonly = true; + on_error->complete(r); + on_error = nullptr; + journaler.set_readonly(); + finish_contexts(g_ceph_context, waiting_for_recovery, r); +} + +bool PurgeQueue::_consume() +{ + ceph_assert(lock.is_locked_by_me()); + + bool could_consume = false; + while(_can_consume()) { + + if (delayed_flush) { + // We are now going to read from the journal, so any proactive + // flush is no longer necessary. This is not functionally necessary + // but it can avoid generating extra fragmented flush IOs. 
+ timer.cancel_event(delayed_flush); + delayed_flush = nullptr; + } + + if (int r = journaler.get_error()) { + derr << "Error " << r << " recovering write_pos" << dendl; + _go_readonly(r); + return could_consume; + } + + if (!journaler.is_readable()) { + dout(10) << " not readable right now" << dendl; + // Because we are the writer and the reader of the journal + // via the same Journaler instance, we never need to reread_head + if (!journaler.have_waiter()) { + journaler.wait_for_readable(new FunctionContext([this](int r) { + std::lock_guard l(lock); + if (r == 0) { + _consume(); + } else if (r != -EAGAIN) { + _go_readonly(r); + } + })); + } + + return could_consume; + } + + could_consume = true; + // The journaler is readable: consume an entry + bufferlist bl; + bool readable = journaler.try_read_entry(bl); + ceph_assert(readable); // we checked earlier + + dout(20) << " decoding entry" << dendl; + PurgeItem item; + auto q = bl.cbegin(); + try { + decode(item, q); + } catch (const buffer::error &err) { + derr << "Decode error at read_pos=0x" << std::hex + << journaler.get_read_pos() << dendl; + _go_readonly(EIO); + } + dout(20) << " executing item (" << item.ino << ")" << dendl; + _execute_item(item, journaler.get_read_pos()); + } + + dout(10) << " cannot consume right now" << dendl; + + return could_consume; +} + +void PurgeQueue::_execute_item( + const PurgeItem &item, + uint64_t expire_to) +{ + ceph_assert(lock.is_locked_by_me()); + + in_flight[expire_to] = item; + logger->set(l_pq_executing, in_flight.size()); + files_high_water = std::max(files_high_water, in_flight.size()); + logger->set(l_pq_executing_high_water, files_high_water); + auto ops = _calculate_ops(item); + ops_in_flight += ops; + logger->set(l_pq_executing_ops, ops_in_flight); + ops_high_water = std::max(ops_high_water, ops_in_flight); + logger->set(l_pq_executing_ops_high_water, ops_high_water); + + SnapContext nullsnapc; + + C_GatherBuilder gather(cct); + if (item.action == 
PurgeItem::PURGE_FILE) { + if (item.size > 0) { + uint64_t num = Striper::get_num_objects(item.layout, item.size); + dout(10) << " 0~" << item.size << " objects 0~" << num + << " snapc " << item.snapc << " on " << item.ino << dendl; + filer.purge_range(item.ino, &item.layout, item.snapc, + 0, num, ceph::real_clock::now(), 0, + gather.new_sub()); + } + + // remove the backtrace object if it was not purged + object_t oid = CInode::get_object_name(item.ino, frag_t(), ""); + if (!gather.has_subs() || !item.layout.pool_ns.empty()) { + object_locator_t oloc(item.layout.pool_id); + dout(10) << " remove backtrace object " << oid + << " pool " << oloc.pool << " snapc " << item.snapc << dendl; + objecter->remove(oid, oloc, item.snapc, + ceph::real_clock::now(), 0, + gather.new_sub()); + } + + // remove old backtrace objects + for (const auto &p : item.old_pools) { + object_locator_t oloc(p); + dout(10) << " remove backtrace object " << oid + << " old pool " << p << " snapc " << item.snapc << dendl; + objecter->remove(oid, oloc, item.snapc, + ceph::real_clock::now(), 0, + gather.new_sub()); + } + } else if (item.action == PurgeItem::PURGE_DIR) { + object_locator_t oloc(metadata_pool); + frag_vec_t leaves; + if (!item.fragtree.is_leaf(frag_t())) + item.fragtree.get_leaves(leaves); + leaves.push_back(frag_t()); + for (const auto &leaf : leaves) { + object_t oid = CInode::get_object_name(item.ino, leaf, ""); + dout(10) << " remove dirfrag " << oid << dendl; + objecter->remove(oid, oloc, nullsnapc, + ceph::real_clock::now(), + 0, gather.new_sub()); + } + } else if (item.action == PurgeItem::TRUNCATE_FILE) { + const uint64_t num = Striper::get_num_objects(item.layout, item.size); + dout(10) << " 0~" << item.size << " objects 0~" << num + << " snapc " << item.snapc << " on " << item.ino << dendl; + + // keep backtrace object + if (num > 1) { + filer.purge_range(item.ino, &item.layout, item.snapc, + 1, num - 1, ceph::real_clock::now(), + 0, gather.new_sub()); + } + 
filer.zero(item.ino, &item.layout, item.snapc, + 0, item.layout.object_size, + ceph::real_clock::now(), + 0, true, gather.new_sub()); + } else { + derr << "Invalid item (action=" << item.action << ") in purge queue, " + "dropping it" << dendl; + ops_in_flight -= ops; + logger->set(l_pq_executing_ops, ops_in_flight); + ops_high_water = std::max(ops_high_water, ops_in_flight); + logger->set(l_pq_executing_ops_high_water, ops_high_water); + in_flight.erase(expire_to); + logger->set(l_pq_executing, in_flight.size()); + files_high_water = std::max(files_high_water, in_flight.size()); + logger->set(l_pq_executing_high_water, files_high_water); + return; + } + ceph_assert(gather.has_subs()); + + gather.set_finisher(new C_OnFinisher( + new FunctionContext([this, expire_to](int r){ + std::lock_guard l(lock); + + if (r == -EBLACKLISTED) { + finisher.queue(on_error, r); + on_error = nullptr; + return; + } + + _execute_item_complete(expire_to); + _consume(); + + // Have we gone idle? If so, do an extra write_head now instead of + // waiting for next flush after journaler_write_head_interval. + // Also do this periodically even if not idle, so that the persisted + // expire_pos doesn't fall too far behind our progress when consuming + // a very long queue. 
+ if (in_flight.empty() || journaler.write_head_needed()) { + journaler.write_head(nullptr); + } + }), &finisher)); + + gather.activate(); +} + +void PurgeQueue::_execute_item_complete( + uint64_t expire_to) +{ + ceph_assert(lock.is_locked_by_me()); + dout(10) << "complete at 0x" << std::hex << expire_to << std::dec << dendl; + ceph_assert(in_flight.count(expire_to) == 1); + + auto iter = in_flight.find(expire_to); + ceph_assert(iter != in_flight.end()); + if (iter == in_flight.begin()) { + uint64_t pos = expire_to; + if (!pending_expire.empty()) { + auto n = iter; + ++n; + if (n == in_flight.end()) { + pos = *pending_expire.rbegin(); + pending_expire.clear(); + } else { + auto p = pending_expire.begin(); + do { + if (*p >= n->first) + break; + pos = *p; + pending_expire.erase(p++); + } while (p != pending_expire.end()); + } + } + dout(10) << "expiring to 0x" << std::hex << pos << std::dec << dendl; + journaler.set_expire_pos(pos); + } else { + // This is completely fine, we're not supposed to purge files in + // order when doing them in parallel. 
+ dout(10) << "non-sequential completion, not expiring anything" << dendl; + pending_expire.insert(expire_to); + } + + ops_in_flight -= _calculate_ops(iter->second); + logger->set(l_pq_executing_ops, ops_in_flight); + ops_high_water = std::max(ops_high_water, ops_in_flight); + logger->set(l_pq_executing_ops_high_water, ops_high_water); + + dout(10) << "completed item for ino " << iter->second.ino << dendl; + + in_flight.erase(iter); + logger->set(l_pq_executing, in_flight.size()); + files_high_water = std::max(files_high_water, in_flight.size()); + logger->set(l_pq_executing_high_water, files_high_water); + dout(10) << "in_flight.size() now " << in_flight.size() << dendl; + + logger->inc(l_pq_executed); +} + +void PurgeQueue::update_op_limit(const MDSMap &mds_map) +{ + std::lock_guard l(lock); + + if (readonly) { + dout(10) << "skipping; PurgeQueue is readonly" << dendl; + return; + } + + uint64_t pg_count = 0; + objecter->with_osdmap([&](const OSDMap& o) { + // Number of PGs across all data pools + const std::vector<int64_t> &data_pools = mds_map.get_data_pools(); + for (const auto dp : data_pools) { + if (o.get_pg_pool(dp) == NULL) { + // It is possible that we have an older OSDMap than MDSMap, + // because we don't start watching every OSDMap until after + // MDSRank is initialized + dout(4) << " data pool " << dp << " not found in OSDMap" << dendl; + continue; + } + pg_count += o.get_pg_num(dp); + } + }); + + // Work out a limit based on n_pgs / n_mdss, multiplied by the user's + // preference for how many ops per PG + max_purge_ops = uint64_t(((double)pg_count / (double)mds_map.get_max_mds()) * + cct->_conf->mds_max_purge_ops_per_pg); + + // User may also specify a hard limit, apply this if so. 
+ if (cct->_conf->mds_max_purge_ops) { + max_purge_ops = std::min(max_purge_ops, cct->_conf->mds_max_purge_ops); + } +} + +void PurgeQueue::handle_conf_change(const std::set<std::string>& changed, const MDSMap& mds_map) +{ + if (changed.count("mds_max_purge_ops") + || changed.count("mds_max_purge_ops_per_pg")) { + update_op_limit(mds_map); + } else if (changed.count("mds_max_purge_files")) { + std::lock_guard l(lock); + if (in_flight.empty()) { + // We might have gone from zero to a finite limit, so + // might need to kick off consume. + dout(4) << "maybe start work again (max_purge_files=" + << g_conf()->mds_max_purge_files << dendl; + finisher.queue(new FunctionContext([this](int r){ + std::lock_guard l(lock); + _consume(); + })); + } + } +} + +bool PurgeQueue::drain( + uint64_t *progress, + uint64_t *progress_total, + size_t *in_flight_count + ) +{ + std::lock_guard l(lock); + + if (readonly) { + dout(10) << "skipping drain; PurgeQueue is readonly" << dendl; + return true; + } + + ceph_assert(progress != nullptr); + ceph_assert(progress_total != nullptr); + ceph_assert(in_flight_count != nullptr); + + const bool done = in_flight.empty() && ( + journaler.get_read_pos() == journaler.get_write_pos()); + if (done) { + return true; + } + + const uint64_t bytes_remaining = journaler.get_write_pos() + - journaler.get_read_pos(); + + if (!draining) { + // Start of draining: remember how much there was outstanding at + // this point so that we can give a progress percentage later + draining = true; + + // Life the op throttle as this daemon now has nothing to do but + // drain the purge queue, so do it as fast as we can. 
+ max_purge_ops = 0xffff; + } + + drain_initial = std::max(bytes_remaining, drain_initial); + + *progress = drain_initial - bytes_remaining; + *progress_total = drain_initial; + *in_flight_count = in_flight.size(); + + return false; +} + +std::string_view PurgeItem::get_type_str() const +{ + switch(action) { + case PurgeItem::NONE: return "NONE"; + case PurgeItem::PURGE_FILE: return "PURGE_FILE"; + case PurgeItem::PURGE_DIR: return "PURGE_DIR"; + case PurgeItem::TRUNCATE_FILE: return "TRUNCATE_FILE"; + default: + return "UNKNOWN"; + } +} + diff --git a/src/mds/PurgeQueue.h b/src/mds/PurgeQueue.h new file mode 100644 index 00000000..9a603a26 --- /dev/null +++ b/src/mds/PurgeQueue.h @@ -0,0 +1,228 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 Red Hat + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef PURGE_QUEUE_H_ +#define PURGE_QUEUE_H_ + +#include "include/compact_set.h" +#include "mds/MDSMap.h" +#include "osdc/Journaler.h" + + +/** + * Descriptor of the work associated with purging a file. We record + * the minimal amount of information from the inode such as the size + * and layout: all other un-needed inode metadata (times, permissions, etc) + * has been discarded. + */ +class PurgeItem +{ +public: + enum Action : uint8_t { + NONE = 0, + PURGE_FILE = 1, + TRUNCATE_FILE, + PURGE_DIR + }; + + utime_t stamp; + //None PurgeItem serves as NoOp for splicing out journal entries; + //so there has to be a "pad_size" to specify the size of journal + //space to be spliced. 
+ uint32_t pad_size; + Action action; + inodeno_t ino; + uint64_t size; + file_layout_t layout; + compact_set<int64_t> old_pools; + SnapContext snapc; + fragtree_t fragtree; + + PurgeItem() + : pad_size(0), action(NONE), ino(0), size(0) + {} + + void encode(bufferlist &bl) const; + void decode(bufferlist::const_iterator &p); + + static Action str_to_type(std::string_view str) { + return PurgeItem::actions.at(std::string(str)); + } + + void dump(Formatter *f) const + { + f->dump_int("action", action); + f->dump_int("ino", ino); + f->dump_int("size", size); + f->open_object_section("layout"); + layout.dump(f); + f->close_section(); + f->open_object_section("SnapContext"); + snapc.dump(f); + f->close_section(); + f->open_object_section("fragtree"); + fragtree.dump(f); + f->close_section(); + } + + std::string_view get_type_str() const; +private: + static const std::map<std::string, PurgeItem::Action> actions; +}; +WRITE_CLASS_ENCODER(PurgeItem) + +enum { + l_pq_first = 3500, + + // How many items have been finished by PurgeQueue + l_pq_executing_ops, + l_pq_executing_ops_high_water, + l_pq_executing, + l_pq_executing_high_water, + l_pq_executed, + l_pq_last +}; + +/** + * A persistent queue of PurgeItems. This class both writes and reads + * to the queue. There is one of these per MDS rank. + * + * Note that this class does not take a reference to MDSRank: we are + * independent of all the metadata structures and do not need to + * take mds_lock for anything. 
+ */ +class PurgeQueue +{ +private: + CephContext *cct; + const mds_rank_t rank; + Mutex lock; + bool readonly = false; + + int64_t metadata_pool; + + // Don't use the MDSDaemon's Finisher and Timer, because this class + // operates outside of MDSDaemon::mds_lock + Finisher finisher; + SafeTimer timer; + Filer filer; + Objecter *objecter; + std::unique_ptr<PerfCounters> logger; + + Journaler journaler; + + Context *on_error; + + // Map of Journaler offset to PurgeItem + std::map<uint64_t, PurgeItem> in_flight; + + std::set<uint64_t> pending_expire; + + // Throttled allowances + uint64_t ops_in_flight; + + // Dynamic op limit per MDS based on PG count + uint64_t max_purge_ops; + + uint32_t _calculate_ops(const PurgeItem &item) const; + + bool _can_consume(); + + // How many bytes were remaining when drain() was first called, + // used for indicating progress. + uint64_t drain_initial; + + // Has drain() ever been called on this instance? + bool draining; + + // recover the journal write_pos (drop any partial written entry) + void _recover(); + + /** + * @return true if we were in a position to try and consume something: + * does not mean we necessarily did. + */ + bool _consume(); + + // Do we currently have a flush timer event waiting? + Context *delayed_flush; + + void _execute_item( + const PurgeItem &item, + uint64_t expire_to); + void _execute_item_complete( + uint64_t expire_to); + + bool recovered; + std::list<Context*> waiting_for_recovery; + + void _go_readonly(int r); + + uint64_t ops_high_water = 0; + uint64_t files_high_water = 0; + +public: + void init(); + void activate(); + void shutdown(); + + void create_logger(); + + // Write an empty queue, use this during MDS rank creation + void create(Context *completion); + + // Read the Journaler header for an existing queue and start consuming + void open(Context *completion); + + void wait_for_recovery(Context *c); + + // Submit one entry to the work queue. 
Call back when it is persisted + // to the queue (there is no callback for when it is executed) + void push(const PurgeItem &pi, Context *completion); + + // If the on-disk queue is empty and we are not currently processing + // anything. + bool is_idle() const; + + /** + * Signal to the PurgeQueue that you would like it to hurry up and + * finish consuming everything in the queue. Provides progress + * feedback. + * + * @param progress: bytes consumed since we started draining + * @param progress_total: max bytes that were outstanding during purge + * @param in_flight_count: number of file purges currently in flight + * + * @returns true if drain is complete + */ + bool drain( + uint64_t *progress, + uint64_t *progress_total, + size_t *in_flight_count); + + void update_op_limit(const MDSMap &mds_map); + + void handle_conf_change(const std::set<std::string>& changed, const MDSMap& mds_map); + + PurgeQueue( + CephContext *cct_, + mds_rank_t rank_, + const int64_t metadata_pool_, + Objecter *objecter_, + Context *on_error); + ~PurgeQueue(); +}; + +#endif + diff --git a/src/mds/RecoveryQueue.cc b/src/mds/RecoveryQueue.cc new file mode 100644 index 00000000..e02de367 --- /dev/null +++ b/src/mds/RecoveryQueue.cc @@ -0,0 +1,237 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "CInode.h" +#include "MDCache.h" +#include "MDSRank.h" +#include "Locker.h" +#include "osdc/Filer.h" + +#include "RecoveryQueue.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mds +#undef dout_prefix +#define dout_prefix *_dout << "mds." 
<< mds->get_nodeid() << " RecoveryQueue::" << __func__ << " " + +class C_MDC_Recover : public MDSIOContextBase { +protected: + RecoveryQueue *rq; + CInode *in; + void finish(int r) override { + rq->_recovered(in, r, size, mtime); + } + + MDSRank *get_mds() override { + return rq->mds; + } + +public: + uint64_t size; + utime_t mtime; + + C_MDC_Recover(RecoveryQueue *rq_, CInode *i) : + MDSIOContextBase(false), rq(rq_), in(i), size(0) { + ceph_assert(rq != NULL); + } + void print(ostream& out) const override { + out << "file_recover(" << in->ino() << ")"; + } +}; + + +RecoveryQueue::RecoveryQueue(MDSRank *mds_) : + file_recover_queue(member_offset(CInode, item_dirty_dirfrag_dir)), + file_recover_queue_front(member_offset(CInode, item_dirty_dirfrag_nest)), + mds(mds_), logger(NULL), filer(mds_->objecter, mds_->finisher) +{ } + + +/** + * Progress the queue. Call this after enqueuing something or on + * completion of something. + */ +void RecoveryQueue::advance() +{ + dout(10) << file_recover_queue_size << " queued, " + << file_recover_queue_front_size << " prioritized, " + << file_recovering.size() << " recovering" << dendl; + + while (file_recovering.size() < g_conf()->mds_max_file_recover) { + if (!file_recover_queue_front.empty()) { + CInode *in = file_recover_queue_front.front(); + in->item_recover_queue_front.remove_myself(); + file_recover_queue_front_size--; + _start(in); + } else if (!file_recover_queue.empty()) { + CInode *in = file_recover_queue.front(); + in->item_recover_queue.remove_myself(); + file_recover_queue_size--; + _start(in); + } else { + break; + } + } + + logger->set(l_mdc_num_recovering_processing, file_recovering.size()); + logger->set(l_mdc_num_recovering_enqueued, file_recover_queue_size + file_recover_queue_front_size); + logger->set(l_mdc_num_recovering_prioritized, file_recover_queue_front_size); +} + +void RecoveryQueue::_start(CInode *in) +{ + auto pi = in->get_projected_inode(); + + // blech + if (pi->client_ranges.size() && 
!pi->get_max_size()) { + mds->clog->warn() << "bad client_range " << pi->client_ranges + << " on ino " << pi->ino; + } + + auto p = file_recovering.find(in); + if (pi->client_ranges.size() && pi->get_max_size()) { + dout(10) << "starting " << in->inode.size << " " << pi->client_ranges + << " " << *in << dendl; + if (p == file_recovering.end()) { + file_recovering.insert(make_pair(in, false)); + + C_MDC_Recover *fin = new C_MDC_Recover(this, in); + filer.probe(in->inode.ino, &in->inode.layout, in->last, + pi->get_max_size(), &fin->size, &fin->mtime, false, + 0, fin); + } else { + p->second = true; + dout(10) << "already working on " << *in << ", set need_restart flag" << dendl; + } + } else { + dout(10) << "skipping " << in->inode.size << " " << *in << dendl; + if (p == file_recovering.end()) { + in->state_clear(CInode::STATE_RECOVERING); + mds->locker->eval(in, CEPH_LOCK_IFILE); + in->auth_unpin(this); + } + } +} + +void RecoveryQueue::prioritize(CInode *in) +{ + if (file_recovering.count(in)) { + dout(10) << "already working on " << *in << dendl; + return; + } + + if (!in->item_recover_queue_front.is_on_list()) { + dout(20) << *in << dendl; + + ceph_assert(in->item_recover_queue.is_on_list()); + in->item_recover_queue.remove_myself(); + file_recover_queue_size--; + + file_recover_queue_front.push_back(&in->item_recover_queue_front); + + file_recover_queue_front_size++; + logger->set(l_mdc_num_recovering_prioritized, file_recover_queue_front_size); + return; + } + + dout(10) << "not queued " << *in << dendl; +} + +static bool _is_in_any_recover_queue(CInode *in) +{ + return in->item_recover_queue.is_on_list() || + in->item_recover_queue_front.is_on_list(); +} + +/** + * Given an authoritative inode which is in the cache, + * enqueue it for recovery. 
+ */ +void RecoveryQueue::enqueue(CInode *in) +{ + dout(15) << "RecoveryQueue::enqueue " << *in << dendl; + ceph_assert(logger); // Caller should have done set_logger before using me + ceph_assert(in->is_auth()); + + in->state_clear(CInode::STATE_NEEDSRECOVER); + if (!in->state_test(CInode::STATE_RECOVERING)) { + in->state_set(CInode::STATE_RECOVERING); + in->auth_pin(this); + logger->inc(l_mdc_recovery_started); + } + + if (!_is_in_any_recover_queue(in)) { + file_recover_queue.push_back(&in->item_recover_queue); + file_recover_queue_size++; + logger->set(l_mdc_num_recovering_enqueued, file_recover_queue_size + file_recover_queue_front_size); + } +} + + +/** + * Call back on completion of Filer probe on an inode. + */ +void RecoveryQueue::_recovered(CInode *in, int r, uint64_t size, utime_t mtime) +{ + dout(10) << "_recovered r=" << r << " size=" << size << " mtime=" << mtime + << " for " << *in << dendl; + + if (r != 0) { + dout(0) << "recovery error! " << r << dendl; + if (r == -EBLACKLISTED) { + mds->respawn(); + return; + } else { + // Something wrong on the OSD side trying to recover the size + // of this inode. In principle we could record this as a piece + // of per-inode damage, but it's actually more likely that + // this indicates something wrong with the MDS (like maybe + // it has the wrong auth caps?) 
+ mds->clog->error() << " OSD read error while recovering size" + " for inode " << in->ino(); + mds->damaged(); + } + } + + auto p = file_recovering.find(in); + ceph_assert(p != file_recovering.end()); + bool restart = p->second; + file_recovering.erase(p); + + logger->set(l_mdc_num_recovering_processing, file_recovering.size()); + logger->inc(l_mdc_recovery_completed); + in->state_clear(CInode::STATE_RECOVERING); + + if (restart) { + if (in->item_recover_queue.is_on_list()) { + in->item_recover_queue.remove_myself(); + file_recover_queue_size--; + } + if (in->item_recover_queue_front.is_on_list()) { + in->item_recover_queue_front.remove_myself(); + file_recover_queue_front_size--; + } + logger->set(l_mdc_num_recovering_enqueued, file_recover_queue_size + file_recover_queue_front_size); + logger->set(l_mdc_num_recovering_prioritized, file_recover_queue_front_size); + _start(in); + } else if (!_is_in_any_recover_queue(in)) { + // journal + mds->locker->check_inode_max_size(in, true, 0, size, mtime); + mds->locker->eval(in, CEPH_LOCK_IFILE); + in->auth_unpin(this); + } + + advance(); +} + diff --git a/src/mds/RecoveryQueue.h b/src/mds/RecoveryQueue.h new file mode 100644 index 00000000..a1e6ac48 --- /dev/null +++ b/src/mds/RecoveryQueue.h @@ -0,0 +1,55 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +//class C_MDC_Recover; +// +#ifndef RECOVERY_QUEUE_H +#define RECOVERY_QUEUE_H + +#include <set> + +#include "osdc/Filer.h" + +class CInode; +class MDSRank; +class PerfCounters; + +class RecoveryQueue { +public: + void enqueue(CInode *in); + void advance(); + void prioritize(CInode *in); ///< do this inode now/soon + explicit RecoveryQueue(MDSRank *mds_); + + void set_logger(PerfCounters *p) {logger=p;} + +private: + void _start(CInode *in); ///< start recovering this file + void _recovered(CInode *in, int r, uint64_t size, utime_t mtime); + + size_t file_recover_queue_size = 0; + size_t file_recover_queue_front_size = 0; + + elist<CInode*> file_recover_queue; ///< the queue + elist<CInode*> file_recover_queue_front; ///< elevated priority items + std::map<CInode*, bool> file_recovering; // inode -> need_restart + + MDSRank *mds; + PerfCounters *logger; + Filer filer; + + friend class C_MDC_Recover; +}; + +#endif // RECOVERY_QUEUE_H diff --git a/src/mds/ScatterLock.h b/src/mds/ScatterLock.h new file mode 100644 index 00000000..f2fe7938 --- /dev/null +++ b/src/mds/ScatterLock.h @@ -0,0 +1,255 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#ifndef CEPH_SCATTERLOCK_H +#define CEPH_SCATTERLOCK_H + +#include "SimpleLock.h" + +#include "MDSContext.h" + +class ScatterLock : public SimpleLock { + + struct more_bits_t { + xlist<ScatterLock*>::item item_updated; + utime_t update_stamp; + + explicit more_bits_t(ScatterLock *lock) : + item_updated(lock) + {} + }; + + mutable std::unique_ptr<more_bits_t> _more; + + more_bits_t *more() { + if (!_more) + _more.reset(new more_bits_t(this)); + return _more.get(); + } + + enum { + SCATTER_WANTED = 1 << 8, + UNSCATTER_WANTED = 1 << 9, + DIRTY = 1 << 10, + FLUSHING = 1 << 11, + FLUSHED = 1 << 12, + }; + +public: + ScatterLock(MDSCacheObject *o, LockType *lt) : + SimpleLock(o, lt) {} + ~ScatterLock() override { + ceph_assert(!_more); + } + + bool is_scatterlock() const override { + return true; + } + + bool is_sync_and_unlocked() const { + return + SimpleLock::is_sync_and_unlocked() && + !is_dirty() && + !is_flushing(); + } + + bool can_scatter_pin(client_t loner) { + /* + LOCK : NOT okay because it can MIX and force replicas to journal something + TSYN : also not okay for same reason + EXCL : also not okay + + MIX : okay, replica can stall before sending AC_SYNCACK + SYNC : okay, replica can stall before sending AC_MIXACK or AC_LOCKACK + */ + return + get_state() == LOCK_SYNC || + get_state() == LOCK_MIX; + } + + void set_xlock_snap_sync(MDSContext *c) + { + ceph_assert(get_type() == CEPH_LOCK_IFILE); + ceph_assert(state == LOCK_XLOCK || state == LOCK_XLOCKDONE); + state = LOCK_XLOCKSNAP; + add_waiter(WAIT_STABLE, c); + } + + xlist<ScatterLock*>::item *get_updated_item() { return &more()->item_updated; } + + utime_t get_update_stamp() { + return _more ? 
_more->update_stamp : utime_t(); + } + + void set_update_stamp(utime_t t) { more()->update_stamp = t; } + + void set_scatter_wanted() { + state_flags |= SCATTER_WANTED; + } + void set_unscatter_wanted() { + state_flags |= UNSCATTER_WANTED; + } + void clear_scatter_wanted() { + state_flags &= ~SCATTER_WANTED; + } + void clear_unscatter_wanted() { + state_flags &= ~UNSCATTER_WANTED; + } + bool get_scatter_wanted() const { + return state_flags & SCATTER_WANTED; + } + bool get_unscatter_wanted() const { + return state_flags & UNSCATTER_WANTED; + } + + bool is_dirty() const override { + return state_flags & DIRTY; + } + bool is_flushing() const override { + return state_flags & FLUSHING; + } + bool is_flushed() const override { + return state_flags & FLUSHED; + } + bool is_dirty_or_flushing() const { + return is_dirty() || is_flushing(); + } + + void mark_dirty() { + if (!is_dirty()) { + if (!is_flushing()) + parent->get(MDSCacheObject::PIN_DIRTYSCATTERED); + set_dirty(); + } + } + void start_flush() { + if (is_dirty()) { + set_flushing(); + clear_dirty(); + } + } + void finish_flush() { + if (is_flushing()) { + clear_flushing(); + set_flushed(); + if (!is_dirty()) { + parent->put(MDSCacheObject::PIN_DIRTYSCATTERED); + parent->clear_dirty_scattered(get_type()); + } + } + } + void clear_flushed() override { + state_flags &= ~FLUSHED; + } + void remove_dirty() { + start_flush(); + finish_flush(); + clear_flushed(); + } + + void infer_state_from_strong_rejoin(int rstate, bool locktoo) { + if (rstate == LOCK_MIX || + rstate == LOCK_MIX_LOCK || // replica still has wrlocks? 
+ rstate == LOCK_MIX_SYNC) + state = LOCK_MIX; + else if (locktoo && rstate == LOCK_LOCK) + state = LOCK_LOCK; + } + + void encode_state_for_rejoin(bufferlist& bl, int rep) { + __s16 s = get_replica_state(); + if (is_gathering(rep)) { + // the recovering mds may hold rejoined wrlocks + if (state == LOCK_MIX_SYNC) + s = LOCK_MIX_SYNC; + else + s = LOCK_MIX_LOCK; + } + + // If there is a recovering mds who replcated an object when it failed + // and scatterlock in the object was in MIX state, It's possible that + // the recovering mds needs to take wrlock on the scatterlock when it + // replays unsafe requests. So this mds should delay taking rdlock on + // the scatterlock until the recovering mds finishes replaying unsafe. + // Otherwise unsafe requests may get replayed after current request. + // + // For example: + // The recovering mds is auth mds of a dirfrag, this mds is auth mds + // of corresponding inode. when 'rm -rf' the direcotry, this mds should + // delay the rmdir request until the recovering mds has replayed unlink + // requests. 
+ if (s == LOCK_MIX || s == LOCK_MIX_LOCK || s == LOCK_MIX_SYNC) + mark_need_recover(); + + using ceph::encode; + encode(s, bl); + } + + void decode_state_rejoin(bufferlist::const_iterator& p, MDSContext::vec& waiters, bool survivor) { + SimpleLock::decode_state_rejoin(p, waiters, survivor); + if (is_flushing()) { + set_dirty(); + clear_flushing(); + } + } + + bool remove_replica(int from, bool rejoin) { + if (rejoin && + (state == LOCK_MIX || + state == LOCK_MIX_SYNC || + state == LOCK_MIX_LOCK2 || + state == LOCK_MIX_TSYN || + state == LOCK_MIX_EXCL)) + return false; + return SimpleLock::remove_replica(from); + } + + void print(ostream& out) const override { + out << "("; + _print(out); + if (is_dirty()) + out << " dirty"; + if (is_flushing()) + out << " flushing"; + if (is_flushed()) + out << " flushed"; + if (get_scatter_wanted()) + out << " scatter_wanted"; + out << ")"; + } + +private: + void set_flushing() { + state_flags |= FLUSHING; + } + void clear_flushing() { + state_flags &= ~FLUSHING; + } + void set_flushed() { + state_flags |= FLUSHED; + } + void set_dirty() { + state_flags |= DIRTY; + } + void clear_dirty() { + state_flags &= ~DIRTY; + if (_more) { + _more->item_updated.remove_myself(); + _more.reset(); + } + } +}; + +#endif diff --git a/src/mds/ScrubHeader.h b/src/mds/ScrubHeader.h new file mode 100644 index 00000000..f49598d8 --- /dev/null +++ b/src/mds/ScrubHeader.h @@ -0,0 +1,68 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 Red Hat Inc + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#ifndef SCRUB_HEADER_H_ +#define SCRUB_HEADER_H_ + +#include <string_view> + +class CInode; + +/** + * Externally input parameters for a scrub, associated with the root + * of where we are doing a recursive scrub + */ +class ScrubHeader { +public: + ScrubHeader(std::string_view tag_, bool is_tag_internal_, bool force_, + bool recursive_, bool repair_, Formatter *f_) + : tag(tag_), is_tag_internal(is_tag_internal_), force(force_), + recursive(recursive_), repair(repair_), formatter(f_), origin(nullptr) + { + ceph_assert(formatter != nullptr); + } + + // Set after construction because it won't be known until we've + // started resolving path and locking + void set_origin(CInode *origin_) { origin = origin_; } + + bool get_recursive() const { return recursive; } + bool get_repair() const { return repair; } + bool get_force() const { return force; } + bool is_internal_tag() const { return is_tag_internal; } + CInode *get_origin() const { return origin; } + std::string_view get_tag() const { return tag; } + Formatter &get_formatter() const { return *formatter; } + + bool get_repaired() const { return repaired; } + void set_repaired() { repaired = true; } + +protected: + const std::string tag; + bool is_tag_internal; + const bool force; + const bool recursive; + const bool repair; + Formatter * const formatter; + CInode *origin; + + bool repaired = false; // May be set during scrub if repairs happened +}; + +typedef std::shared_ptr<ScrubHeader> ScrubHeaderRef; +typedef std::shared_ptr<const ScrubHeader> ScrubHeaderRefConst; + +#endif // SCRUB_HEADER_H_ + diff --git a/src/mds/ScrubStack.cc b/src/mds/ScrubStack.cc new file mode 100644 index 00000000..2743347e --- /dev/null +++ b/src/mds/ScrubStack.cc @@ -0,0 +1,755 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2014 Red Hat + * + * This is free software; you can redistribute it and/or 
+ * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include <iostream> + +#include "ScrubStack.h" +#include "common/Finisher.h" +#include "mds/MDSRank.h" +#include "mds/MDCache.h" +#include "mds/MDSContinuation.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mds +#undef dout_prefix +#define dout_prefix _prefix(_dout, scrubstack->mdcache->mds) +static ostream& _prefix(std::ostream *_dout, MDSRank *mds) { + return *_dout << "mds." << mds->get_nodeid() << ".scrubstack "; +} + +std::ostream &operator<<(std::ostream &os, const ScrubStack::State &state) { + switch(state) { + case ScrubStack::STATE_RUNNING: + os << "RUNNING"; + break; + case ScrubStack::STATE_IDLE: + os << "IDLE"; + break; + case ScrubStack::STATE_PAUSING: + os << "PAUSING"; + break; + case ScrubStack::STATE_PAUSED: + os << "PAUSED"; + break; + default: + ceph_abort(); + } + + return os; +} + +void ScrubStack::push_inode(CInode *in) +{ + dout(20) << "pushing " << *in << " on top of ScrubStack" << dendl; + if (!in->item_scrub.is_on_list()) { + in->get(CInode::PIN_SCRUBQUEUE); + stack_size++; + } + inode_stack.push_front(&in->item_scrub); +} + +void ScrubStack::push_inode_bottom(CInode *in) +{ + dout(20) << "pushing " << *in << " on bottom of ScrubStack" << dendl; + if (!in->item_scrub.is_on_list()) { + in->get(CInode::PIN_SCRUBQUEUE); + stack_size++; + } + inode_stack.push_back(&in->item_scrub); +} + +void ScrubStack::pop_inode(CInode *in) +{ + dout(20) << "popping " << *in + << " off of ScrubStack" << dendl; + ceph_assert(in->item_scrub.is_on_list()); + in->put(CInode::PIN_SCRUBQUEUE); + in->item_scrub.remove_myself(); + stack_size--; +} + +void ScrubStack::_enqueue_inode(CInode *in, CDentry *parent, + ScrubHeaderRef& header, + MDSContext *on_finish, bool top) +{ + dout(10) << __func__ << " with {" << *in << "}" + << ", on_finish=" << on_finish << ", top=" << top 
<< dendl; + ceph_assert(mdcache->mds->mds_lock.is_locked_by_me()); + in->scrub_initialize(parent, header, on_finish); + if (top) + push_inode(in); + else + push_inode_bottom(in); +} + +void ScrubStack::enqueue_inode(CInode *in, ScrubHeaderRef& header, + MDSContext *on_finish, bool top) +{ + // abort in progress + if (clear_inode_stack) { + on_finish->complete(-EAGAIN); + return; + } + + _enqueue_inode(in, NULL, header, on_finish, top); + kick_off_scrubs(); +} + +void ScrubStack::kick_off_scrubs() +{ + ceph_assert(mdcache->mds->mds_lock.is_locked()); + dout(20) << __func__ << ": state=" << state << dendl; + + if (clear_inode_stack || state == STATE_PAUSING || state == STATE_PAUSED) { + if (scrubs_in_progress == 0) { + dout(10) << __func__ << ": in progress scrub operations finished, " + << stack_size << " in the stack" << dendl; + + State final_state = state; + if (clear_inode_stack) { + abort_pending_scrubs(); + final_state = STATE_IDLE; + } + if (state == STATE_PAUSING) { + final_state = STATE_PAUSED; + } + + set_state(final_state); + complete_control_contexts(0); + } + + return; + } + + dout(20) << __func__ << " entering with " << scrubs_in_progress << " in " + "progress and " << stack_size << " in the stack" << dendl; + bool can_continue = true; + elist<CInode*>::iterator i = inode_stack.begin(); + while (g_conf()->mds_max_scrub_ops_in_progress > scrubs_in_progress && + can_continue) { + if (i.end()) { + if (scrubs_in_progress == 0) { + set_state(STATE_IDLE); + } + + return; + } + + assert(state == STATE_RUNNING || state == STATE_IDLE); + set_state(STATE_RUNNING); + + CInode *curi = *i; + ++i; // we have our reference, push iterator forward + + dout(20) << __func__ << " examining " << *curi << dendl; + + if (!curi->is_dir()) { + // it's a regular file, symlink, or hard link + pop_inode(curi); // we only touch it this once, so remove from stack + + if (!curi->scrub_info()->on_finish) { + scrubs_in_progress++; + curi->scrub_set_finisher(&scrub_kick); + } + 
scrub_file_inode(curi); + can_continue = true; + } else { + bool completed; // it's done, so pop it off the stack + bool terminal; // not done, but we can start ops on other directories + bool progress; // it added new dentries to the top of the stack + scrub_dir_inode(curi, &progress, &terminal, &completed); + if (completed) { + dout(20) << __func__ << " dir completed" << dendl; + pop_inode(curi); + } else if (progress) { + dout(20) << __func__ << " dir progressed" << dendl; + // we added new stuff to top of stack, so reset ourselves there + i = inode_stack.begin(); + } else { + dout(20) << __func__ << " dir no-op" << dendl; + } + + can_continue = progress || terminal || completed; + } + } +} + +void ScrubStack::scrub_dir_inode(CInode *in, + bool *added_children, + bool *terminal, + bool *done) +{ + dout(10) << __func__ << " " << *in << dendl; + + *added_children = false; + bool all_frags_terminal = true; + bool all_frags_done = true; + + ScrubHeaderRef header = in->get_scrub_header(); + ceph_assert(header != nullptr); + + if (header->get_recursive()) { + frag_vec_t scrubbing_frags; + list<CDir*> scrubbing_cdirs; + in->scrub_dirfrags_scrubbing(&scrubbing_frags); + dout(20) << __func__ << " iterating over " << scrubbing_frags.size() + << " scrubbing frags" << dendl; + for (const auto& fg : scrubbing_frags) { + // turn frags into CDir * + CDir *dir = in->get_dirfrag(fg); + if (dir) { + scrubbing_cdirs.push_back(dir); + dout(25) << __func__ << " got CDir " << *dir << " presently scrubbing" << dendl; + } else { + in->scrub_dirfrag_finished(fg); + dout(25) << __func__ << " missing dirfrag " << fg << " skip scrubbing" << dendl; + } + } + + dout(20) << __func__ << " consuming from " << scrubbing_cdirs.size() + << " scrubbing cdirs" << dendl; + + list<CDir*>::iterator i = scrubbing_cdirs.begin(); + while (g_conf()->mds_max_scrub_ops_in_progress > scrubs_in_progress) { + // select next CDir + CDir *cur_dir = NULL; + if (i != scrubbing_cdirs.end()) { + cur_dir = *i; + ++i; 
+ dout(20) << __func__ << " got cur_dir = " << *cur_dir << dendl; + } else { + bool ready = get_next_cdir(in, &cur_dir); + dout(20) << __func__ << " get_next_cdir ready=" << ready << dendl; + + if (ready && cur_dir) { + scrubbing_cdirs.push_back(cur_dir); + } else if (!ready) { + // We are waiting for load of a frag + all_frags_done = false; + all_frags_terminal = false; + break; + } else { + // Finished with all frags + break; + } + } + // scrub that CDir + bool frag_added_children = false; + bool frag_terminal = true; + bool frag_done = false; + scrub_dirfrag(cur_dir, header, + &frag_added_children, &frag_terminal, &frag_done); + if (frag_done) { + cur_dir->inode->scrub_dirfrag_finished(cur_dir->frag); + } + *added_children |= frag_added_children; + all_frags_terminal = all_frags_terminal && frag_terminal; + all_frags_done = all_frags_done && frag_done; + } + + dout(20) << "finished looping; all_frags_terminal=" << all_frags_terminal + << ", all_frags_done=" << all_frags_done << dendl; + } else { + dout(20) << "!scrub_recursive" << dendl; + } + + if (all_frags_done) { + assert (!*added_children); // can't do this if children are still pending + + // OK, so now I can... 
fire off a validate on the dir inode, and + // when it completes, come through here again, noticing that we've + // set a flag to indicate the validate happened, and + scrub_dir_inode_final(in); + } + + *terminal = all_frags_terminal; + *done = all_frags_done; + dout(10) << __func__ << " is exiting " << *terminal << " " << *done << dendl; + return; +} + +bool ScrubStack::get_next_cdir(CInode *in, CDir **new_dir) +{ + dout(20) << __func__ << " on " << *in << dendl; + frag_t next_frag; + int r = in->scrub_dirfrag_next(&next_frag); + assert (r >= 0); + + if (r == 0) { + // we got a frag to scrub, otherwise it would be ENOENT + dout(25) << "looking up new frag " << next_frag << dendl; + CDir *next_dir = in->get_or_open_dirfrag(mdcache, next_frag); + if (!next_dir->is_complete()) { + scrubs_in_progress++; + next_dir->fetch(&scrub_kick); + dout(25) << "fetching frag from RADOS" << dendl; + return false; + } + *new_dir = next_dir; + dout(25) << "returning dir " << *new_dir << dendl; + return true; + } + ceph_assert(r == ENOENT); + // there are no dirfrags left + *new_dir = NULL; + return true; +} + +class C_InodeValidated : public MDSInternalContext +{ + public: + ScrubStack *stack; + CInode::validated_data result; + CInode *target; + + C_InodeValidated(MDSRank *mds, ScrubStack *stack_, CInode *target_) + : MDSInternalContext(mds), stack(stack_), target(target_) + {} + + void finish(int r) override + { + stack->_validate_inode_done(target, r, result); + } +}; + + +void ScrubStack::scrub_dir_inode_final(CInode *in) +{ + dout(20) << __func__ << " " << *in << dendl; + + // Two passes through this function. 
First one triggers inode validation, + // second one sets finally_done + // FIXME: kind of overloading scrub_in_progress here, using it while + // dentry is still on stack to indicate that we have finished + // doing our validate_disk_state on the inode + // FIXME: the magic-constructing scrub_info() is going to leave + // an unneeded scrub_infop lying around here + if (!in->scrub_info()->children_scrubbed) { + if (!in->scrub_info()->on_finish) { + scrubs_in_progress++; + in->scrub_set_finisher(&scrub_kick); + } + + in->scrub_children_finished(); + C_InodeValidated *fin = new C_InodeValidated(mdcache->mds, this, in); + in->validate_disk_state(&fin->result, fin); + } + + return; +} + +void ScrubStack::scrub_dirfrag(CDir *dir, + ScrubHeaderRef& header, + bool *added_children, bool *is_terminal, + bool *done) +{ + ceph_assert(dir != NULL); + + dout(20) << __func__ << " on " << *dir << dendl; + *added_children = false; + *is_terminal = false; + *done = false; + + + if (!dir->scrub_info()->directory_scrubbing) { + // Get the frag complete before calling + // scrub initialize, so that it can populate its lists + // of dentries. + if (!dir->is_complete()) { + scrubs_in_progress++; + dir->fetch(&scrub_kick); + return; + } + + dir->scrub_initialize(header); + } + + int r = 0; + while(r == 0) { + CDentry *dn = NULL; + scrubs_in_progress++; + r = dir->scrub_dentry_next(&scrub_kick, &dn); + if (r != EAGAIN) { + scrubs_in_progress--; + } + + if (r == EAGAIN) { + // Drop out, CDir fetcher will call back our kicker context + dout(20) << __func__ << " waiting for fetch on " << *dir << dendl; + return; + } + + if (r == ENOENT) { + // Nothing left to scrub, are we done? 
+ std::list<CDentry*> scrubbing; + dir->scrub_dentries_scrubbing(&scrubbing); + if (scrubbing.empty()) { + dout(20) << __func__ << " dirfrag done: " << *dir << dendl; + // FIXME: greg: What's the diff meant to be between done and terminal + dir->scrub_finished(); + *done = true; + *is_terminal = true; + } else { + dout(20) << __func__ << " " << scrubbing.size() << " dentries still " + "scrubbing in " << *dir << dendl; + } + return; + } + + // scrub_dentry_next defined to only give EAGAIN, ENOENT, 0 -- we should + // never get random IO errors here. + ceph_assert(r == 0); + + _enqueue_inode(dn->get_projected_inode(), dn, header, NULL, true); + + *added_children = true; + } +} + +void ScrubStack::scrub_file_inode(CInode *in) +{ + C_InodeValidated *fin = new C_InodeValidated(mdcache->mds, this, in); + // At this stage the DN is already past scrub_initialize, so + // it's in the cache, it has PIN_SCRUBQUEUE and it is authpinned + in->validate_disk_state(&fin->result, fin); +} + +void ScrubStack::_validate_inode_done(CInode *in, int r, + const CInode::validated_data &result) +{ + LogChannelRef clog = mdcache->mds->clog; + const ScrubHeaderRefConst header = in->scrub_info()->header; + + std::string path; + if (!result.passed_validation) { + // Build path string for use in messages + in->make_path_string(path, true); + } + + if (result.backtrace.checked && !result.backtrace.passed && + !result.backtrace.repaired) + { + // Record backtrace fails as remote linkage damage, as + // we may not be able to resolve hard links to this inode + mdcache->mds->damage_table.notify_remote_damaged(in->inode.ino, path); + } else if (result.inode.checked && !result.inode.passed && + !result.inode.repaired) { + // Record damaged inode structures as damaged dentries as + // that is where they are stored + auto parent = in->get_projected_parent_dn(); + if (parent) { + auto dir = parent->get_dir(); + mdcache->mds->damage_table.notify_dentry( + dir->inode->ino(), dir->frag, parent->last, 
parent->get_name(), path); + } + } + + // Inform the cluster log if we found an error + if (!result.passed_validation) { + if (result.all_damage_repaired()) { + clog->info() << "Scrub repaired inode " << in->ino() + << " (" << path << ")"; + } else { + clog->warn() << "Scrub error on inode " << in->ino() + << " (" << path << ") see " << g_conf()->name + << " log and `damage ls` output for details"; + } + + // Put the verbose JSON output into the MDS log for later inspection + JSONFormatter f; + result.dump(&f); + std::ostringstream out; + f.flush(out); + derr << __func__ << " scrub error on inode " << *in << ": " << out.str() + << dendl; + } else { + dout(10) << __func__ << " scrub passed on inode " << *in << dendl; + } + + MDSContext *c = NULL; + in->scrub_finished(&c); + + if (in == header->get_origin()) { + scrub_origins.erase(in); + clog_scrub_summary(in); + if (!header->get_recursive()) { + if (r >= 0) { // we got into the scrubbing dump it + result.dump(&(header->get_formatter())); + } else { // we failed the lookup or something; dump ourselves + header->get_formatter().open_object_section("results"); + header->get_formatter().dump_int("return_code", r); + header->get_formatter().close_section(); // results + } + } + } + if (c) { + finisher->queue(new MDSIOContextWrapper(mdcache->mds, c), 0); + } +} + +ScrubStack::C_KickOffScrubs::C_KickOffScrubs(MDCache *mdcache, ScrubStack *s) + : MDSInternalContext(mdcache->mds), stack(s) { } + +void ScrubStack::complete_control_contexts(int r) { + ceph_assert(mdcache->mds->mds_lock.is_locked_by_me()); + + for (auto &ctx : control_ctxs) { + ctx->complete(r); + } + control_ctxs.clear(); +} + +void ScrubStack::set_state(State next_state) { + if (state != next_state) { + dout(20) << __func__ << ", from state=" << state << ", to state=" + << next_state << dendl; + state = next_state; + clog_scrub_summary(); + } +} + +bool ScrubStack::scrub_in_transition_state() { + ceph_assert(mdcache->mds->mds_lock.is_locked_by_me()); + 
dout(20) << __func__ << ": state=" << state << dendl; + + // STATE_RUNNING is considered as a transition state so as to + // "delay" the scrub control operation. + if (state == STATE_RUNNING || state == STATE_PAUSING) { + return true; + } + + return false; +} + +std::string_view ScrubStack::scrub_summary() { + ceph_assert(ceph_mutex_is_locked_by_me(mdcache->mds->mds_lock)); + + bool have_more = false; + CachedStackStringStream cs; + + if (state == STATE_IDLE) { + return "idle"; + } + + if (state == STATE_RUNNING) { + if (clear_inode_stack) { + *cs << "aborting"; + } else { + *cs << "active"; + } + } else { + if (state == STATE_PAUSING) { + have_more = true; + *cs << "pausing"; + } else if (state == STATE_PAUSED) { + have_more = true; + *cs << "paused"; + } + + if (clear_inode_stack) { + if (have_more) { + *cs << "+"; + } + *cs << "aborting"; + } + } + + if (!scrub_origins.empty()) { + *cs << " [paths:"; + for (auto inode = scrub_origins.begin(); inode != scrub_origins.end(); ++inode) { + if (inode != scrub_origins.begin()) { + *cs << ","; + } + + *cs << scrub_inode_path(*inode); + } + *cs << "]"; + } + + return cs->strv(); +} + +void ScrubStack::scrub_status(Formatter *f) { + ceph_assert(mdcache->mds->mds_lock.is_locked_by_me()); + + f->open_object_section("result"); + + std::stringstream ss; + bool have_more = false; + + if (state == STATE_IDLE) { + ss << "no active scrubs running"; + } else if (state == STATE_RUNNING) { + if (clear_inode_stack) { + ss << "ABORTING"; + } else { + ss << "scrub active"; + } + ss << " (" << stack_size << " inodes in the stack)"; + } else { + if (state == STATE_PAUSING || state == STATE_PAUSED) { + have_more = true; + ss << state; + } + if (clear_inode_stack) { + if (have_more) { + ss << "+"; + } + ss << "ABORTING"; + } + + ss << " (" << stack_size << " inodes in the stack)"; + } + f->dump_string("status", ss.str()); + + f->open_object_section("scrubs"); + for (auto &inode : scrub_origins) { + have_more = false; + ScrubHeaderRefConst 
header = inode->get_scrub_header(); + + std::string tag(header->get_tag()); + f->open_object_section(tag.c_str()); // scrub id + + f->dump_string("path", scrub_inode_path(inode)); + + std::stringstream optss; + if (header->get_recursive()) { + optss << "recursive"; + have_more = true; + } + if (header->get_repair()) { + if (have_more) { + optss << ","; + } + optss << "repair"; + have_more = true; + } + if (header->get_force()) { + if (have_more) { + optss << ","; + } + optss << "force"; + } + + f->dump_string("options", optss.str()); + f->close_section(); // scrub id + } + f->close_section(); // scrubs + f->close_section(); // result +} + +void ScrubStack::abort_pending_scrubs() { + ceph_assert(mdcache->mds->mds_lock.is_locked_by_me()); + ceph_assert(clear_inode_stack); + + for (auto inode = inode_stack.begin(); !inode.end(); ++inode) { + CInode *in = *inode; + if (in == in->scrub_info()->header->get_origin()) { + scrub_origins.erase(in); + clog_scrub_summary(in); + } + + MDSContext *ctx = nullptr; + in->scrub_aborted(&ctx); + if (ctx != nullptr) { + ctx->complete(-ECANCELED); + } + } + + stack_size = 0; + inode_stack.clear(); + clear_inode_stack = false; +} + +void ScrubStack::scrub_abort(Context *on_finish) { + ceph_assert(mdcache->mds->mds_lock.is_locked_by_me()); + ceph_assert(on_finish != nullptr); + + dout(10) << __func__ << ": aborting with " << scrubs_in_progress + << " scrubs in progress and " << stack_size << " in the" + << " stack" << dendl; + + clear_inode_stack = true; + if (scrub_in_transition_state()) { + control_ctxs.push_back(on_finish); + return; + } + + abort_pending_scrubs(); + if (state != STATE_PAUSED) { + set_state(STATE_IDLE); + } + on_finish->complete(0); +} + +void ScrubStack::scrub_pause(Context *on_finish) { + ceph_assert(mdcache->mds->mds_lock.is_locked_by_me()); + ceph_assert(on_finish != nullptr); + + dout(10) << __func__ << ": pausing with " << scrubs_in_progress + << " scrubs in progress and " << stack_size << " in the" + << " 
stack" << dendl; + + // abort is in progress + if (clear_inode_stack) { + on_finish->complete(-EINVAL); + return; + } + + bool done = scrub_in_transition_state(); + if (done) { + set_state(STATE_PAUSING); + control_ctxs.push_back(on_finish); + return; + } + + set_state(STATE_PAUSED); + on_finish->complete(0); +} + +bool ScrubStack::scrub_resume() { + ceph_assert(mdcache->mds->mds_lock.is_locked_by_me()); + dout(20) << __func__ << ": state=" << state << dendl; + + int r = 0; + + if (clear_inode_stack) { + r = -EINVAL; + } else if (state == STATE_PAUSING) { + set_state(STATE_RUNNING); + complete_control_contexts(-ECANCELED); + } else if (state == STATE_PAUSED) { + set_state(STATE_RUNNING); + kick_off_scrubs(); + } + + return r; +} + +// send current scrub summary to cluster log +void ScrubStack::clog_scrub_summary(CInode *in) { + if (in) { + std::string what; + if (clear_inode_stack) { + what = "aborted"; + } else if (scrub_origins.count(in)) { + what = "queued"; + } else { + what = "completed"; + } + clog->info() << "scrub " << what << " for path: " << scrub_inode_path(in); + } + + clog->info() << "scrub summary: " << scrub_summary(); +} diff --git a/src/mds/ScrubStack.h b/src/mds/ScrubStack.h new file mode 100644 index 00000000..3586daf2 --- /dev/null +++ b/src/mds/ScrubStack.h @@ -0,0 +1,306 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2014 Red Hat + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef SCRUBSTACK_H_ +#define SCRUBSTACK_H_ + +#include "CDir.h" +#include "CDentry.h" +#include "CInode.h" +#include "MDSContext.h" +#include "ScrubHeader.h" + +#include "common/LogClient.h" +#include "include/elist.h" + +class MDCache; +class Finisher; + +class ScrubStack { +protected: + // reference to global cluster log client + LogChannelRef &clog; + + /// A finisher needed so that we don't re-enter kick_off_scrubs + Finisher *finisher; + + /// The stack of inodes we want to scrub + elist<CInode*> inode_stack; + /// current number of dentries we're actually scrubbing + int scrubs_in_progress; + ScrubStack *scrubstack; // hack for dout + int stack_size; + + class C_KickOffScrubs : public MDSInternalContext { + ScrubStack *stack; + public: + C_KickOffScrubs(MDCache *mdcache, ScrubStack *s); + void finish(int r) override { } + void complete(int r) override { + if (r == -ECANCELED) { + return; + } + + stack->scrubs_in_progress--; + stack->kick_off_scrubs(); + // don't delete self + } + }; + C_KickOffScrubs scrub_kick; + +public: + MDCache *mdcache; + ScrubStack(MDCache *mdc, LogChannelRef &clog, Finisher *finisher_) : + clog(clog), + finisher(finisher_), + inode_stack(member_offset(CInode, item_scrub)), + scrubs_in_progress(0), + scrubstack(this), + stack_size(0), + scrub_kick(mdc, this), + mdcache(mdc) {} + ~ScrubStack() { + ceph_assert(inode_stack.empty()); + ceph_assert(!scrubs_in_progress); + } + /** + * Put a inode on the top of the scrub stack, so it is the highest priority. + * If there are other scrubs in progress, they will not continue scrubbing new + * entries until this one is completed. 
+ * @param in The inode to scrub + * @param header The ScrubHeader propagated from wherever this scrub + * was initiated + */ + void enqueue_inode_top(CInode *in, ScrubHeaderRef& header, + MDSContext *on_finish) { + enqueue_inode(in, header, on_finish, true); + scrub_origins.emplace(in); + clog_scrub_summary(in); + } + /** Like enqueue_inode_top, but we wait for all pending scrubs before + * starting this one. + */ + void enqueue_inode_bottom(CInode *in, ScrubHeaderRef& header, + MDSContext *on_finish) { + enqueue_inode(in, header, on_finish, false); + scrub_origins.emplace(in); + clog_scrub_summary(in); + } + + /** + * Abort an ongoing scrub operation. The abort operation could be + * delayed if there are in-progress scrub operations ongoing. The + * caller should provide a context which is completed after all + * in-progress scrub operations are completed and pending inodes + * are removed from the scrub stack (with the context callbacks for + * inodes completed with -ECANCELED). + * @param on_finish Context callback to invoke after abort + */ + void scrub_abort(Context *on_finish); + + /** + * Pause scrub operations. Similar to abort, pause is delayed if + * there are in-progress scrub operations ongoing. The caller + * should provide a context which is completed after all in-progress + * scrub operations are completed. Subsequent scrub operations are + * queued until scrub is resumed. + * @param on_finish Context callback to invoke after pause + */ + void scrub_pause(Context *on_finish); + + /** + * Resume a paused scrub. Unlike abort or pause, this is instantaneous. + * Pending pause operations are cancelled (context callbacks are + * invoked with -ECANCELED). + * @returns 0 (success) if resumed, -EINVAL if an abort is in-progress. + */ + bool scrub_resume(); + + /** + * Get the current scrub status as human readable string. Some basic + * information is returned such as number of inodes pending abort/pause. 
+ */ + void scrub_status(Formatter *f); + + bool is_scrubbing() const { return !inode_stack.empty(); } + + /** + * Get a high level scrub status summary such as current scrub state + * and scrub paths. + */ + std::string_view scrub_summary(); + +private: + // scrub abort is _not_ a state, rather it's an operation that's + // performed after in-progress scrubs are finished. + enum State { + STATE_RUNNING = 0, + STATE_IDLE, + STATE_PAUSING, + STATE_PAUSED, + }; + friend std::ostream &operator<<(std::ostream &os, const State &state); + + State state = STATE_IDLE; + bool clear_inode_stack = false; + + // list of pending context completions for asynchronous scrub + // control operations. + std::list<Context *> control_ctxs; + + // list of inodes for which scrub operations are running -- used + // to display in `scrub status`. + std::set<CInode *> scrub_origins; + + /** + * Put the inode at either the top or bottom of the stack, with + * the given scrub params, and then try and kick off more scrubbing. + */ + void enqueue_inode(CInode *in, ScrubHeaderRef& header, + MDSContext *on_finish, bool top); + void _enqueue_inode(CInode *in, CDentry *parent, ScrubHeaderRef& header, + MDSContext *on_finish, bool top); + /** + * Kick off as many scrubs as are appropriate, based on the current + * state of the stack. + */ + void kick_off_scrubs(); + /** + * Push an inode on top of the stack. + */ + inline void push_inode(CInode *in); + /** + * Push an inode to the bottom of the stack. + */ + inline void push_inode_bottom(CInode *in); + /** + * Pop the given inode off the stack. + */ + inline void pop_inode(CInode *in); + + /** + * Scrub a file inode. 
+ * @param in The inode to scrub + */ + void scrub_file_inode(CInode *in); + + /** + * Callback from completion of CInode::validate_disk_state + * @param in The inode we were validating + * @param r The return status from validate_disk_state + * @param result Populated results from validate_disk_state + */ + void _validate_inode_done(CInode *in, int r, + const CInode::validated_data &result); + friend class C_InodeValidated; + + /** + * Make progress on scrubbing a directory-representing dirfrag and + * its children.. + * + * 1) Select the next dirfrag which hasn't been scrubbed, and make progress + * on it if possible. + * + * 2) If not, move on to the next dirfrag and start it up, if any. + * + * 3) If waiting for results from dirfrag scrubs, do nothing. + * + * 4) If all dirfrags have been scrubbed, scrub my inode. + * + * @param in The CInode to scrub as a directory + * @param added_children set to true if we pushed some of our children + * onto the ScrubStack + * @param is_terminal set to true if there are no descendant dentries + * remaining to start scrubbing. + * @param done set to true if we and all our children have finished scrubbing + */ + void scrub_dir_inode(CInode *in, bool *added_children, bool *is_terminal, + bool *done); + /** + * Make progress on scrubbing a dirfrag. It may return after each of the + * following steps, but will report making progress on each one. + * + * 1) enqueues the next unscrubbed child directory dentry at the + * top of the stack. + * + * 2) Initiates a scrub on the next unscrubbed file dentry + * + * If there are scrubs currently in progress on child dentries, no more child + * dentries to scrub, and this function is invoked, it will report no + * progress. Try again later. + * + */ + void scrub_dirfrag(CDir *dir, ScrubHeaderRef& header, + bool *added_children, bool *is_terminal, bool *done); + /** + * Scrub a directory-representing dentry. + * + * @param in The directory inode we're doing final scrub on. 
+ */ + void scrub_dir_inode_final(CInode *in); + + /** + * Get a CDir into memory, and return it if it's already complete. + * Otherwise, fetch it and kick off scrubbing when done. + * + * @param in The Inode to get the next directory from + * @param new_dir The CDir we're returning to you. NULL if + * not ready yet or there aren't any. + * @returns false if you have to wait, true if there's no work + * left to do (we returned it, or there are none left in this inode). + */ + bool get_next_cdir(CInode *in, CDir **new_dir); + + /** + * Set scrub state + * @param next_state State to move the scrub to. + */ + void set_state(State next_state); + + /** + * Is scrub in one of transition states (running, pausing) + */ + bool scrub_in_transition_state(); + + /** + * complete queued up contexts + * @param r return value to complete contexts. + */ + void complete_control_contexts(int r); + + /** + * Abort pending scrubs for inodes waiting in the inode stack. + * Completion context is complete with -ECANCELED. + */ + void abort_pending_scrubs(); + + /** + * Return path for a given inode. + * @param in inode to make path entry. + */ + std::string scrub_inode_path(CInode *in) { + std::string path; + in->make_path_string(path, true); + return (path.empty() ? "/" : path.c_str()); + } + + /** + * Send scrub information (queued/finished scrub path and summary) + * to cluster log. + * @param in inode for which scrub has been queued or finished. 
+ */ + void clog_scrub_summary(CInode *in=nullptr); +}; + +#endif /* SCRUBSTACK_H_ */ diff --git a/src/mds/Server.cc b/src/mds/Server.cc new file mode 100644 index 00000000..5d0be194 --- /dev/null +++ b/src/mds/Server.cc @@ -0,0 +1,10206 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include <boost/lexical_cast.hpp> +#include "include/ceph_assert.h" // lexical_cast includes system assert.h + +#include <boost/config/warning_disable.hpp> +#include <boost/fusion/include/std_pair.hpp> +#include <boost/range/adaptor/reversed.hpp> + +#include "MDSRank.h" +#include "Server.h" +#include "Locker.h" +#include "MDCache.h" +#include "MDLog.h" +#include "Migrator.h" +#include "MDBalancer.h" +#include "InoTable.h" +#include "SnapClient.h" +#include "Mutation.h" +#include "cephfs_features.h" + +#include "msg/Messenger.h" + +#include "osdc/Objecter.h" + +#include "events/EUpdate.h" +#include "events/ESlaveUpdate.h" +#include "events/ESession.h" +#include "events/EOpen.h" +#include "events/ECommitted.h" + +#include "include/stringify.h" +#include "include/filepath.h" +#include "common/errno.h" +#include "common/Timer.h" +#include "common/perf_counters.h" +#include "include/compat.h" +#include "osd/OSDMap.h" + +#include <errno.h> +#include <math.h> + +#include <list> +#include <iostream> +#include <string_view> + +#include "common/config.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mds +#undef dout_prefix +#define dout_prefix *_dout << "mds." 
<< mds->get_nodeid() << ".server " + +class ServerContext : public MDSContext { + protected: + Server *server; + MDSRank *get_mds() override + { + return server->mds; + } + + public: + explicit ServerContext(Server *s) : server(s) { + ceph_assert(server != NULL); + } +}; + +class ServerLogContext : public MDSLogContextBase { +protected: + Server *server; + MDSRank *get_mds() override + { + return server->mds; + } + + MDRequestRef mdr; + void pre_finish(int r) override { + if (mdr) + mdr->mark_event("journal_committed: "); + } +public: + explicit ServerLogContext(Server *s) : server(s) { + ceph_assert(server != NULL); + } + explicit ServerLogContext(Server *s, MDRequestRef& r) : server(s), mdr(r) { + ceph_assert(server != NULL); + } +}; + +void Server::create_logger() +{ + PerfCountersBuilder plb(g_ceph_context, "mds_server", l_mdss_first, l_mdss_last); + + plb.add_u64_counter(l_mdss_handle_client_request, "handle_client_request", + "Client requests", "hcr", PerfCountersBuilder::PRIO_INTERESTING); + plb.add_u64_counter(l_mdss_handle_slave_request, "handle_slave_request", + "Slave requests", "hsr", PerfCountersBuilder::PRIO_INTERESTING); + plb.add_u64_counter(l_mdss_handle_client_session, + "handle_client_session", "Client session messages", "hcs", + PerfCountersBuilder::PRIO_INTERESTING); + plb.add_u64_counter(l_mdss_cap_revoke_eviction, "cap_revoke_eviction", + "Cap Revoke Client Eviction", "cre", PerfCountersBuilder::PRIO_INTERESTING); + plb.add_u64_counter(l_mdss_cap_acquisition_throttle, + "cap_acquisition_throttle", "Cap acquisition throttle counter", "cat", + PerfCountersBuilder::PRIO_INTERESTING); + + // fop latencies are useful + plb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL); + plb.add_time_avg(l_mdss_req_lookuphash_latency, "req_lookuphash_latency", + "Request type lookup hash of inode latency"); + plb.add_time_avg(l_mdss_req_lookupino_latency, "req_lookupino_latency", + "Request type lookup inode latency"); + 
plb.add_time_avg(l_mdss_req_lookupparent_latency, "req_lookupparent_latency", + "Request type lookup parent latency"); + plb.add_time_avg(l_mdss_req_lookupname_latency, "req_lookupname_latency", + "Request type lookup name latency"); + plb.add_time_avg(l_mdss_req_lookup_latency, "req_lookup_latency", + "Request type lookup latency"); + plb.add_time_avg(l_mdss_req_lookupsnap_latency, "req_lookupsnap_latency", + "Request type lookup snapshot latency"); + plb.add_time_avg(l_mdss_req_getattr_latency, "req_getattr_latency", + "Request type get attribute latency"); + plb.add_time_avg(l_mdss_req_setattr_latency, "req_setattr_latency", + "Request type set attribute latency"); + plb.add_time_avg(l_mdss_req_setlayout_latency, "req_setlayout_latency", + "Request type set file layout latency"); + plb.add_time_avg(l_mdss_req_setdirlayout_latency, "req_setdirlayout_latency", + "Request type set directory layout latency"); + plb.add_time_avg(l_mdss_req_setxattr_latency, "req_setxattr_latency", + "Request type set extended attribute latency"); + plb.add_time_avg(l_mdss_req_rmxattr_latency, "req_rmxattr_latency", + "Request type remove extended attribute latency"); + plb.add_time_avg(l_mdss_req_readdir_latency, "req_readdir_latency", + "Request type read directory latency"); + plb.add_time_avg(l_mdss_req_setfilelock_latency, "req_setfilelock_latency", + "Request type set file lock latency"); + plb.add_time_avg(l_mdss_req_getfilelock_latency, "req_getfilelock_latency", + "Request type get file lock latency"); + plb.add_time_avg(l_mdss_req_create_latency, "req_create_latency", + "Request type create latency"); + plb.add_time_avg(l_mdss_req_open_latency, "req_open_latency", + "Request type open latency"); + plb.add_time_avg(l_mdss_req_mknod_latency, "req_mknod_latency", + "Request type make node latency"); + plb.add_time_avg(l_mdss_req_link_latency, "req_link_latency", + "Request type link latency"); + plb.add_time_avg(l_mdss_req_unlink_latency, "req_unlink_latency", + "Request type 
unlink latency"); + plb.add_time_avg(l_mdss_req_rmdir_latency, "req_rmdir_latency", + "Request type remove directory latency"); + plb.add_time_avg(l_mdss_req_rename_latency, "req_rename_latency", + "Request type rename latency"); + plb.add_time_avg(l_mdss_req_mkdir_latency, "req_mkdir_latency", + "Request type make directory latency"); + plb.add_time_avg(l_mdss_req_symlink_latency, "req_symlink_latency", + "Request type symbolic link latency"); + plb.add_time_avg(l_mdss_req_lssnap_latency, "req_lssnap_latency", + "Request type list snapshot latency"); + plb.add_time_avg(l_mdss_req_mksnap_latency, "req_mksnap_latency", + "Request type make snapshot latency"); + plb.add_time_avg(l_mdss_req_rmsnap_latency, "req_rmsnap_latency", + "Request type remove snapshot latency"); + plb.add_time_avg(l_mdss_req_renamesnap_latency, "req_renamesnap_latency", + "Request type rename snapshot latency"); + + plb.set_prio_default(PerfCountersBuilder::PRIO_DEBUGONLY); + plb.add_u64_counter(l_mdss_dispatch_client_request, "dispatch_client_request", + "Client requests dispatched"); + plb.add_u64_counter(l_mdss_dispatch_slave_request, "dispatch_server_request", + "Server requests dispatched"); + + logger = plb.create_perf_counters(); + g_ceph_context->get_perfcounters_collection()->add(logger); +} + +Server::Server(MDSRank *m) : + mds(m), + mdcache(mds->mdcache), mdlog(mds->mdlog), + logger(0), + is_full(false), + reconnect_done(NULL), + failed_reconnects(0), + reconnect_evicting(false), + terminating_sessions(false), + recall_throttle(g_conf().get_val<double>("mds_recall_max_decay_rate")) +{ + max_snaps_per_dir = g_conf().get_val<uint64_t>("mds_max_snaps_per_dir"); + replay_unsafe_with_closed_session = g_conf().get_val<bool>("mds_replay_unsafe_with_closed_session"); + cap_revoke_eviction_timeout = g_conf().get_val<double>("mds_cap_revoke_eviction_timeout"); + max_caps_per_client = g_conf().get_val<uint64_t>("mds_max_caps_per_client"); + cap_acquisition_throttle = 
g_conf().get_val<uint64_t>("mds_session_cap_acquisition_throttle"); + max_caps_throttle_ratio = g_conf().get_val<double>("mds_session_max_caps_throttle_ratio"); + caps_throttle_retry_request_timeout = g_conf().get_val<double>("mds_cap_acquisition_throttle_retry_request_timeout"); + supported_features = feature_bitset_t(CEPHFS_FEATURES_MDS_SUPPORTED); +} + +void Server::dispatch(const Message::const_ref &m) +{ + switch (m->get_type()) { + case CEPH_MSG_CLIENT_RECONNECT: + handle_client_reconnect(MClientReconnect::msgref_cast(m)); + return; + } + +/* + * In reconnect phase, client sent unsafe requests to mds before reconnect msg. Setting sessionclosed_isok will handle scenario like this: + +1. In reconnect phase, client sent unsafe requests to mds. +2. It reached reconnect timeout. All sessions without sending reconnect msg in time, some of which may have sent unsafe requests, are marked as closed. +(Another situation is #31668, which will deny all client reconnect msg to speed up reboot). +3. So these unsafe requests from sessions without sending reconnect msg in time or being denied could be handled in clientreplay phase. + +*/ + bool sessionclosed_isok = replay_unsafe_with_closed_session; + // active?
+ // handle_slave_request()/handle_client_session() will wait if necessary + if (m->get_type() == CEPH_MSG_CLIENT_REQUEST && !mds->is_active()) { + const auto &req = MClientRequest::msgref_cast(m); + if (mds->is_reconnect() || mds->get_want_state() == CEPH_MDS_STATE_RECONNECT) { + Session *session = mds->get_session(req); + if (!session || (!session->is_open() && !sessionclosed_isok)) { + dout(5) << "session is closed, dropping " << req->get_reqid() << dendl; + return; + } + bool queue_replay = false; + if (req->is_replay()) { + dout(3) << "queuing replayed op" << dendl; + queue_replay = true; + if (req->head.ino && + !session->have_completed_request(req->get_reqid().tid, nullptr)) { + mdcache->add_replay_ino_alloc(inodeno_t(req->head.ino)); + } + } else if (req->get_retry_attempt()) { + // process completed request in clientreplay stage. The completed request + // might have created new file/directory. This guarantees MDS sends a reply + // to client before another request modifies the new file/directory. + if (session->have_completed_request(req->get_reqid().tid, NULL)) { + dout(3) << "queuing completed op" << dendl; + queue_replay = true; + } + // this request was created before the cap reconnect message, drop any embedded + // cap releases.
+ req->releases.clear(); + } + if (queue_replay) { + req->mark_queued_for_replay(); + mds->enqueue_replay(new C_MDS_RetryMessage(mds, m)); + return; + } + } + + bool wait_for_active = true; + if (mds->is_stopping()) { + wait_for_active = false; + } else if (mds->is_clientreplay()) { + if (req->is_queued_for_replay()) { + wait_for_active = false; + } + } + if (wait_for_active) { + dout(3) << "not active yet, waiting" << dendl; + mds->wait_for_active(new C_MDS_RetryMessage(mds, m)); + return; + } + } + + switch (m->get_type()) { + case CEPH_MSG_CLIENT_SESSION: + handle_client_session(MClientSession::msgref_cast(m)); + return; + case CEPH_MSG_CLIENT_REQUEST: + handle_client_request(MClientRequest::msgref_cast(m)); + return; + case CEPH_MSG_CLIENT_RECLAIM: + handle_client_reclaim(MClientReclaim::msgref_cast(m)); + return; + case MSG_MDS_SLAVE_REQUEST: + handle_slave_request(MMDSSlaveRequest::msgref_cast(m)); + return; + default: + derr << "server unknown message " << m->get_type() << dendl; + ceph_abort_msg("server unknown message"); + } +} + + + +// ---------------------------------------------------------- +// SESSION management + +class C_MDS_session_finish : public ServerLogContext { + Session *session; + uint64_t state_seq; + bool open; + version_t cmapv; + interval_set<inodeno_t> inos; + version_t inotablev; + Context *fin; +public: + C_MDS_session_finish(Server *srv, Session *se, uint64_t sseq, bool s, version_t mv, Context *fin_ = NULL) : + ServerLogContext(srv), session(se), state_seq(sseq), open(s), cmapv(mv), inotablev(0), fin(fin_) { } + C_MDS_session_finish(Server *srv, Session *se, uint64_t sseq, bool s, version_t mv, interval_set<inodeno_t>& i, version_t iv, Context *fin_ = NULL) : + ServerLogContext(srv), session(se), state_seq(sseq), open(s), cmapv(mv), inos(i), inotablev(iv), fin(fin_) { } + void finish(int r) override { + ceph_assert(r == 0); + server->_session_logged(session, state_seq, open, cmapv, inos, inotablev); + if (fin) { + fin->complete(r); 
+ } + } +}; + +Session* Server::find_session_by_uuid(std::string_view uuid) +{ + Session* session = nullptr; + for (auto& it : mds->sessionmap.get_sessions()) { + auto& metadata = it.second->info.client_metadata; + + auto p = metadata.find("uuid"); + if (p == metadata.end() || p->second != uuid) + continue; + + if (!session) { + session = it.second; + } else if (!session->reclaiming_from) { + assert(it.second->reclaiming_from == session); + session = it.second; + } else { + assert(session->reclaiming_from == it.second); + } + } + return session; +} + +void Server::reclaim_session(Session *session, const MClientReclaim::const_ref &m) +{ + if (!session->is_open() && !session->is_stale()) { + dout(10) << "session not open, dropping this req" << dendl; + return; + } + + auto reply = MClientReclaimReply::create(0); + if (m->get_uuid().empty()) { + dout(10) << __func__ << " invalid message (no uuid)" << dendl; + reply->set_result(-EINVAL); + mds->send_message_client(reply, session); + return; + } + + unsigned flags = m->get_flags(); + if (flags != CEPH_RECLAIM_RESET) { // currently only support reset + dout(10) << __func__ << " unsupported flags" << dendl; + reply->set_result(-EOPNOTSUPP); + mds->send_message_client(reply, session); + return; + } + + Session* target = find_session_by_uuid(m->get_uuid()); + if (target) { + if (session->info.auth_name != target->info.auth_name) { + dout(10) << __func__ << " session auth_name " << session->info.auth_name + << " != target auth_name " << target->info.auth_name << dendl; + reply->set_result(-EPERM); + mds->send_message_client(reply, session); + } + + assert(!target->reclaiming_from); + assert(!session->reclaiming_from); + session->reclaiming_from = target; + reply->set_addrs(entity_addrvec_t(target->info.inst.addr)); + } + + if (flags & CEPH_RECLAIM_RESET) { + finish_reclaim_session(session, reply); + return; + } + + ceph_abort(); +} + +void Server::finish_reclaim_session(Session *session, const MClientReclaimReply::ref 
&reply) +{ + Session *target = session->reclaiming_from; + if (target) { + session->reclaiming_from = nullptr; + + Context *send_reply; + if (reply) { + int64_t session_id = session->get_client().v; + send_reply = new FunctionContext([this, session_id, reply](int r) { + assert(mds->mds_lock.is_locked_by_me()); + Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(session_id)); + if (!session) { + return; + } + auto epoch = mds->objecter->with_osdmap([](const OSDMap &map){ return map.get_epoch(); }); + reply->set_epoch(epoch); + mds->send_message_client(reply, session); + }); + } else { + send_reply = nullptr; + } + + bool blacklisted = mds->objecter->with_osdmap([target](const OSDMap &map) { + return map.is_blacklisted(target->info.inst.addr); + }); + + if (blacklisted || !g_conf()->mds_session_blacklist_on_evict) { + kill_session(target, send_reply); + } else { + std::stringstream ss; + mds->evict_client(target->get_client().v, false, true, ss, send_reply); + } + } else if (reply) { + mds->send_message_client(reply, session); + } +} + +void Server::handle_client_reclaim(const MClientReclaim::const_ref &m) +{ + Session *session = mds->get_session(m); + dout(3) << __func__ << " " << *m << " from " << m->get_source() << dendl; + assert(m->get_source().is_client()); // should _not_ come from an mds! + + if (!session) { + dout(0) << " ignoring sessionless msg " << *m << dendl; + return; + } + + if (mds->get_state() < MDSMap::STATE_CLIENTREPLAY) { + mds->wait_for_replay(new C_MDS_RetryMessage(mds, m)); + return; + } + + if (m->get_flags() & MClientReclaim::FLAG_FINISH) { + finish_reclaim_session(session); + } else { + reclaim_session(session, m); + } +} + +void Server::handle_client_session(const MClientSession::const_ref &m) +{ + version_t pv; + Session *session = mds->get_session(m); + + dout(3) << "handle_client_session " << *m << " from " << m->get_source() << dendl; + ceph_assert(m->get_source().is_client()); // should _not_ come from an mds! 
+ + if (!session) { + dout(0) << " ignoring sessionless msg " << *m << dendl; + auto reply = MClientSession::create(CEPH_SESSION_REJECT); + reply->metadata["error_string"] = "sessionless"; + mds->send_message(reply, m->get_connection()); + return; + } + + if (m->get_op() == CEPH_SESSION_REQUEST_RENEWCAPS) { + // always handle renewcaps (state >= MDSMap::STATE_RECONNECT) + } else if (m->get_op() == CEPH_SESSION_REQUEST_CLOSE) { + // close requests need to be handled when mds is active + if (mds->get_state() < MDSMap::STATE_ACTIVE) { + mds->wait_for_active(new C_MDS_RetryMessage(mds, m)); + return; + } + } else { + if (mds->get_state() < MDSMap::STATE_CLIENTREPLAY) { + mds->wait_for_replay(new C_MDS_RetryMessage(mds, m)); + return; + } + } + + if (logger) + logger->inc(l_mdss_handle_client_session); + + uint64_t sseq = 0; + switch (m->get_op()) { + case CEPH_SESSION_REQUEST_OPEN: + if (session->is_opening() || + session->is_open() || + session->is_stale() || + session->is_killing() || + terminating_sessions) { + dout(10) << "currently open|opening|stale|killing, dropping this req" << dendl; + return; + } + ceph_assert(session->is_closed() || session->is_closing()); + + if (mds->is_stopping()) { + dout(10) << "mds is stopping, dropping open req" << dendl; + return; + } + + { + auto& addr = session->info.inst.addr; + session->set_client_metadata(client_metadata_t(m->metadata, m->supported_features)); + auto& client_metadata = session->info.client_metadata; + + auto log_session_status = [this, m, session](std::string_view status, std::string_view err) { + auto now = ceph_clock_now(); + auto throttle_elapsed = m->get_recv_complete_stamp() - m->get_throttle_stamp(); + auto elapsed = now - m->get_recv_stamp(); + CachedStackStringStream css; + *css << "New client session:" + << " addr=\"" << session->info.inst.addr << "\"" + << ",elapsed=" << elapsed + << ",throttled=" << throttle_elapsed + << ",status=\"" << status << "\""; + if (!err.empty()) { + *css << ",error=\"" << 
err << "\""; + } + const auto& metadata = session->info.client_metadata; + if (auto it = metadata.find("root"); it != metadata.end()) { + *css << ",root=\"" << it->second << "\""; + } + dout(2) << css->strv() << dendl; + }; + + auto send_reject_message = [this, &session, &log_session_status](std::string_view err_str) { + auto m = MClientSession::create(CEPH_SESSION_REJECT); + if (session->info.has_feature(CEPHFS_FEATURE_MIMIC)) + m->metadata["error_string"] = err_str; + mds->send_message_client(m, session); + log_session_status("REJECTED", err_str); + }; + + bool blacklisted = mds->objecter->with_osdmap( + [&addr](const OSDMap &osd_map) -> bool { + return osd_map.is_blacklisted(addr); + }); + + if (blacklisted) { + dout(10) << "rejecting blacklisted client " << addr << dendl; + send_reject_message("blacklisted"); + session->clear(); + break; + } + + if (client_metadata.features.empty()) + infer_supported_features(session, client_metadata); + + dout(20) << __func__ << " CEPH_SESSION_REQUEST_OPEN metadata entries:" << dendl; + dout(20) << " features: '" << client_metadata.features << dendl; + for (const auto& p : client_metadata) { + dout(20) << " " << p.first << ": " << p.second << dendl; + } + + feature_bitset_t missing_features = required_client_features; + missing_features -= client_metadata.features; + if (!missing_features.empty()) { + stringstream ss; + ss << "missing required features '" << missing_features << "'"; + send_reject_message(ss.str()); + mds->clog->warn() << "client session (" << session->info.inst + << ") lacks required features " << missing_features + << "; client supports " << client_metadata.features; + session->clear(); + break; + } + + // Special case for the 'root' metadata path; validate that the claimed + // root is actually within the caps of the session + if (auto it = client_metadata.find("root"); it != client_metadata.end()) { + auto claimed_root = it->second; + stringstream ss; + bool denied = false; + // claimed_root has a leading 
"/" which we strip before passing + // into caps check + if (claimed_root.empty() || claimed_root[0] != '/') { + denied = true; + ss << "invalue root '" << claimed_root << "'"; + } else if (!session->auth_caps.path_capable(claimed_root.substr(1))) { + denied = true; + ss << "non-allowable root '" << claimed_root << "'"; + } + + if (denied) { + // Tell the client we're rejecting their open + send_reject_message(ss.str()); + mds->clog->warn() << "client session with " << ss.str() + << " denied (" << session->info.inst << ")"; + session->clear(); + break; + } + } + + if (auto it = client_metadata.find("uuid"); it != client_metadata.end()) { + if (find_session_by_uuid(it->second)) { + send_reject_message("duplicated session uuid"); + mds->clog->warn() << "client session with duplicated session uuid '" + << it->second << "' denied (" << session->info.inst << ")"; + session->clear(); + break; + } + } + + if (session->is_closed()) + mds->sessionmap.add_session(session); + + pv = mds->sessionmap.mark_projected(session); + sseq = mds->sessionmap.set_state(session, Session::STATE_OPENING); + mds->sessionmap.touch_session(session); + auto fin = new FunctionContext([log_session_status = std::move(log_session_status)](int r){ + ceph_assert(r == 0); + log_session_status("ACCEPTED", ""); + }); + mdlog->start_submit_entry(new ESession(m->get_source_inst(), true, pv, client_metadata), + new C_MDS_session_finish(this, session, sseq, true, pv, fin)); + mdlog->flush(); + } + break; + + case CEPH_SESSION_REQUEST_RENEWCAPS: + if (session->is_open() || session->is_stale()) { + mds->sessionmap.touch_session(session); + if (session->is_stale()) { + mds->sessionmap.set_state(session, Session::STATE_OPEN); + mds->locker->resume_stale_caps(session); + mds->sessionmap.touch_session(session); + } + auto reply = MClientSession::create(CEPH_SESSION_RENEWCAPS, m->get_seq()); + mds->send_message_client(reply, session); + } else { + dout(10) << "ignoring renewcaps on non open|stale session (" << 
session->get_state_name() << ")" << dendl; + } + break; + + case CEPH_SESSION_REQUEST_CLOSE: + { + if (session->is_closed() || + session->is_closing() || + session->is_killing()) { + dout(10) << "already closed|closing|killing, dropping this req" << dendl; + return; + } + if (session->is_importing()) { + dout(10) << "ignoring close req on importing session" << dendl; + return; + } + ceph_assert(session->is_open() || + session->is_stale() || + session->is_opening()); + if (m->get_seq() < session->get_push_seq()) { + dout(10) << "old push seq " << m->get_seq() << " < " << session->get_push_seq() + << ", dropping" << dendl; + return; + } + // We are getting a seq that is higher than expected. + // Handle the same as any other seqn error. + // + if (m->get_seq() != session->get_push_seq()) { + dout(0) << "old push seq " << m->get_seq() << " != " << session->get_push_seq() + << ", BUGGY!" << dendl; + mds->clog->warn() << "incorrect push seq " << m->get_seq() << " != " + << session->get_push_seq() << ", dropping" << " from client : " << session->get_human_name(); + return; + } + journal_close_session(session, Session::STATE_CLOSING, NULL); + } + break; + + case CEPH_SESSION_FLUSHMSG_ACK: + finish_flush_session(session, m->get_seq()); + break; + + case CEPH_SESSION_REQUEST_FLUSH_MDLOG: + if (mds->is_active()) + mdlog->flush(); + break; + + default: + ceph_abort(); + } +} + + +void Server::flush_session(Session *session, MDSGatherBuilder& gather) { + if (!session->is_open() || + !session->get_connection() || + !session->get_connection()->has_feature(CEPH_FEATURE_EXPORT_PEER)) { + return; + } + + version_t seq = session->wait_for_flush(gather.new_sub()); + mds->send_message_client( + MClientSession::create(CEPH_SESSION_FLUSHMSG, seq), session); +} + +void Server::flush_client_sessions(set<client_t>& client_set, MDSGatherBuilder& gather) +{ + for (const auto& client : client_set) { + Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(client.v)); + 
ceph_assert(session); + flush_session(session, gather); + } +} + +void Server::finish_flush_session(Session *session, version_t seq) +{ + MDSContext::vec finished; + session->finish_flush(seq, finished); + mds->queue_waiters(finished); +} + +void Server::_session_logged(Session *session, uint64_t state_seq, bool open, version_t pv, + interval_set<inodeno_t>& inos, version_t piv) +{ + dout(10) << "_session_logged " << session->info.inst << " state_seq " << state_seq << " " << (open ? "open":"close") + << " " << pv << dendl; + + if (piv) { + ceph_assert(session->is_closing() || session->is_killing() || + session->is_opening()); // re-open closing session + session->info.prealloc_inos.subtract(inos); + mds->inotable->apply_release_ids(inos); + ceph_assert(mds->inotable->get_version() == piv); + } + + mds->sessionmap.mark_dirty(session); + + // apply + if (session->get_state_seq() != state_seq) { + dout(10) << " journaled state_seq " << state_seq << " != current " << session->get_state_seq() + << ", noop" << dendl; + // close must have been canceled (by an import?), or any number of other things.. 
+ } else if (open) { + ceph_assert(session->is_opening()); + mds->sessionmap.set_state(session, Session::STATE_OPEN); + mds->sessionmap.touch_session(session); + ceph_assert(session->get_connection()); + auto reply = MClientSession::create(CEPH_SESSION_OPEN); + if (session->info.has_feature(CEPHFS_FEATURE_MIMIC)) + reply->supported_features = supported_features; + mds->send_message_client(reply, session); + if (mdcache->is_readonly()) { + auto m = MClientSession::create(CEPH_SESSION_FORCE_RO); + mds->send_message_client(m, session); + } + } else if (session->is_closing() || + session->is_killing()) { + // kill any lingering capabilities, leases, requests + bool killing = session->is_killing(); + while (!session->caps.empty()) { + Capability *cap = session->caps.front(); + CInode *in = cap->get_inode(); + dout(20) << " killing capability " << ccap_string(cap->issued()) << " on " << *in << dendl; + mds->locker->remove_client_cap(in, cap, killing); + } + while (!session->leases.empty()) { + ClientLease *r = session->leases.front(); + CDentry *dn = static_cast<CDentry*>(r->parent); + dout(20) << " killing client lease of " << *dn << dendl; + dn->remove_client_lease(r, mds->locker); + } + if (client_reconnect_gather.erase(session->info.get_client())) { + dout(20) << " removing client from reconnect set" << dendl; + if (client_reconnect_gather.empty()) { + dout(7) << " client " << session->info.inst << " was last reconnect, finishing" << dendl; + reconnect_gather_finish(); + } + } + if (client_reclaim_gather.erase(session->info.get_client())) { + dout(20) << " removing client from reclaim set" << dendl; + if (client_reclaim_gather.empty()) { + dout(7) << " client " << session->info.inst << " was last reclaimed, finishing" << dendl; + mds->maybe_clientreplay_done(); + } + } + + if (session->is_closing()) { + // mark con disposable. if there is a fault, we will get a + // reset and clean it up. 
if the client hasn't received the + // CLOSE message yet, they will reconnect and get an + // ms_handle_remote_reset() and realize they had in fact closed. + // do this *before* sending the message to avoid a possible + // race. + if (session->get_connection()) { + // Conditional because terminate_sessions will indiscriminately + // put sessions in CLOSING whether they ever had a conn or not. + session->get_connection()->mark_disposable(); + } + + // reset session + mds->send_message_client(MClientSession::create(CEPH_SESSION_CLOSE), session); + mds->sessionmap.set_state(session, Session::STATE_CLOSED); + session->clear(); + mds->sessionmap.remove_session(session); + } else if (session->is_killing()) { + // destroy session, close connection + if (session->get_connection()) { + session->get_connection()->mark_down(); + mds->sessionmap.set_state(session, Session::STATE_CLOSED); + session->set_connection(nullptr); + } + mds->sessionmap.remove_session(session); + } else { + ceph_abort(); + } + } else { + ceph_abort(); + } +} + +/** + * Inject sessions from some source other than actual connections. + * + * For example: + * - sessions inferred from journal replay + * - sessions learned from other MDSs during rejoin + * - sessions learned from other MDSs during dir/caps migration + * - sessions learned from other MDSs during a cross-MDS rename + */ +version_t Server::prepare_force_open_sessions(map<client_t,entity_inst_t>& cm, + map<client_t,client_metadata_t>& cmm, + map<client_t, pair<Session*,uint64_t> >& smap) +{ + version_t pv = mds->sessionmap.get_projected(); + + dout(10) << "prepare_force_open_sessions " << pv + << " on " << cm.size() << " clients" + << dendl; + + mds->objecter->with_osdmap( + [this, &cm, &cmm](const OSDMap &osd_map) { + for (auto p = cm.begin(); p != cm.end(); ) { + if (osd_map.is_blacklisted(p->second.addr)) { + dout(10) << " ignoring blacklisted client."
<< p->first + << " (" << p->second.addr << ")" << dendl; + cmm.erase(p->first); + cm.erase(p++); + } else { + ++p; + } + } + }); + + for (map<client_t,entity_inst_t>::iterator p = cm.begin(); p != cm.end(); ++p) { + Session *session = mds->sessionmap.get_or_add_session(p->second); + pv = mds->sessionmap.mark_projected(session); + uint64_t sseq; + if (session->is_closed() || + session->is_closing() || + session->is_killing()) { + sseq = mds->sessionmap.set_state(session, Session::STATE_OPENING); + auto q = cmm.find(p->first); + if (q != cmm.end()) + session->info.client_metadata.merge(q->second); + } else { + ceph_assert(session->is_open() || + session->is_opening() || + session->is_stale()); + sseq = 0; + } + smap[p->first] = make_pair(session, sseq); + session->inc_importing(); + } + return pv; +} + +void Server::finish_force_open_sessions(const map<client_t,pair<Session*,uint64_t> >& smap, + bool dec_import) +{ + /* + * FIXME: need to carefully consider the race conditions between a + * client trying to close a session and an MDS doing an import + * trying to force open a session... 
+ */ + dout(10) << "finish_force_open_sessions on " << smap.size() << " clients," + << " initial v " << mds->sessionmap.get_version() << dendl; + + for (auto &it : smap) { + Session *session = it.second.first; + uint64_t sseq = it.second.second; + if (sseq > 0) { + if (session->get_state_seq() != sseq) { + dout(10) << "force_open_sessions skipping changed " << session->info.inst << dendl; + } else { + dout(10) << "force_open_sessions opened " << session->info.inst << dendl; + mds->sessionmap.set_state(session, Session::STATE_OPEN); + mds->sessionmap.touch_session(session); + + auto reply = MClientSession::create(CEPH_SESSION_OPEN); + if (session->info.has_feature(CEPHFS_FEATURE_MIMIC)) + reply->supported_features = supported_features; + mds->send_message_client(reply, session); + + if (mdcache->is_readonly()) + mds->send_message_client(MClientSession::create(CEPH_SESSION_FORCE_RO), session); + } + } else { + dout(10) << "force_open_sessions skipping already-open " << session->info.inst << dendl; + ceph_assert(session->is_open() || session->is_stale()); + } + + if (dec_import) { + session->dec_importing(); + } + + mds->sessionmap.mark_dirty(session); + } + + dout(10) << __func__ << ": final v " << mds->sessionmap.get_version() << dendl; +} + +class C_MDS_TerminatedSessions : public ServerContext { + void finish(int r) override { + server->terminating_sessions = false; + } + public: + explicit C_MDS_TerminatedSessions(Server *s) : ServerContext(s) {} +}; + +void Server::terminate_sessions() +{ + dout(5) << "terminating all sessions..." << dendl; + + terminating_sessions = true; + + // kill them off. clients will retry etc. 
+ set<Session*> sessions; + mds->sessionmap.get_client_session_set(sessions); + for (set<Session*>::const_iterator p = sessions.begin(); + p != sessions.end(); + ++p) { + Session *session = *p; + if (session->is_closing() || + session->is_killing() || + session->is_closed()) + continue; + journal_close_session(session, Session::STATE_CLOSING, NULL); + } + + mdlog->wait_for_safe(new C_MDS_TerminatedSessions(this)); +} + + +void Server::find_idle_sessions() +{ + auto now = clock::now(); + auto last_cleared_laggy = mds->last_cleared_laggy(); + + dout(10) << "find_idle_sessions. last cleared laggy state " << last_cleared_laggy << "s ago" << dendl; + + // timeout/stale + // (caps go stale, lease die) + double queue_max_age = mds->get_dispatch_queue_max_age(ceph_clock_now()); + double cutoff = queue_max_age + mds->mdsmap->get_session_timeout(); + + // don't kick clients if we've been laggy + if (last_cleared_laggy < cutoff) { + dout(10) << " last cleared laggy " << last_cleared_laggy << "s ago (< cutoff " << cutoff + << "), not marking any client stale" << dendl; + return; + } + + std::vector<Session*> to_evict; + + bool defer_session_stale = g_conf().get_val<bool>("mds_defer_session_stale"); + const auto sessions_p1 = mds->sessionmap.by_state.find(Session::STATE_OPEN); + if (sessions_p1 != mds->sessionmap.by_state.end() && !sessions_p1->second->empty()) { + std::vector<Session*> new_stale; + + for (auto session : *(sessions_p1->second)) { + auto last_cap_renew_span = std::chrono::duration<double>(now - session->last_cap_renew).count(); + if (last_cap_renew_span < cutoff) { + dout(20) << "laggiest active session is " << session->info.inst + << " and renewed caps recently (" << last_cap_renew_span << "s ago)" << dendl; + break; + } + + if (session->last_seen > session->last_cap_renew) { + last_cap_renew_span = std::chrono::duration<double>(now - session->last_seen).count(); + if (last_cap_renew_span < cutoff) { + dout(20) << "laggiest active session is " << 
session->info.inst + << " and renewed caps recently (" << last_cap_renew_span << "s ago)" << dendl; + continue; + } + } + + if (last_cap_renew_span >= mds->mdsmap->get_session_autoclose()) { + dout(20) << "evicting session " << session->info.inst << " since autoclose " + "has arrived" << dendl; + // evict session without marking it stale + to_evict.push_back(session); + continue; + } + + if (defer_session_stale && + !session->is_any_flush_waiter() && + !mds->locker->is_revoking_any_caps_from(session->get_client())) { + dout(20) << "deferring marking session " << session->info.inst << " stale " + "since it holds no caps" << dendl; + continue; + } + + auto it = session->info.client_metadata.find("timeout"); + if (it != session->info.client_metadata.end()) { + unsigned timeout = strtoul(it->second.c_str(), nullptr, 0); + if (timeout == 0) { + dout(10) << "skipping session " << session->info.inst + << ", infinite timeout specified" << dendl; + continue; + } + double cutoff = queue_max_age + timeout; + if (last_cap_renew_span < cutoff) { + dout(10) << "skipping session " << session->info.inst + << ", timeout (" << timeout << ") specified" + << " and renewed caps recently (" << last_cap_renew_span << "s ago)" << dendl; + continue; + } + + // do not go through stale, evict it directly. 
+ to_evict.push_back(session); + } else { + dout(10) << "new stale session " << session->info.inst + << " last renewed caps " << last_cap_renew_span << "s ago" << dendl; + new_stale.push_back(session); + } + } + + for (auto session : new_stale) { + mds->sessionmap.set_state(session, Session::STATE_STALE); + if (mds->locker->revoke_stale_caps(session)) { + mds->locker->remove_stale_leases(session); + finish_flush_session(session, session->get_push_seq()); + auto m = MClientSession::create(CEPH_SESSION_STALE, session->get_push_seq()); + mds->send_message_client(m, session); + } else { + to_evict.push_back(session); + } + } + } + + // autoclose + cutoff = queue_max_age + mds->mdsmap->get_session_autoclose(); + + // Collect a list of sessions exceeding the autoclose threshold + const auto sessions_p2 = mds->sessionmap.by_state.find(Session::STATE_STALE); + if (sessions_p2 != mds->sessionmap.by_state.end() && !sessions_p2->second->empty()) { + for (auto session : *(sessions_p2->second)) { + assert(session->is_stale()); + auto last_cap_renew_span = std::chrono::duration<double>(now - session->last_cap_renew).count(); + if (last_cap_renew_span < cutoff) { + dout(20) << "oldest stale session is " << session->info.inst + << " and recently renewed caps " << last_cap_renew_span << "s ago" << dendl; + break; + } + to_evict.push_back(session); + } + } + + for (auto session: to_evict) { + if (session->is_importing()) { + dout(10) << "skipping session " << session->info.inst << ", it's being imported" << dendl; + continue; + } + + auto last_cap_renew_span = std::chrono::duration<double>(now - session->last_cap_renew).count(); + mds->clog->warn() << "evicting unresponsive client " << *session + << ", after " << last_cap_renew_span << " seconds"; + dout(10) << "autoclosing stale session " << session->info.inst + << " last renewed caps " << last_cap_renew_span << "s ago" << dendl; + + if (g_conf()->mds_session_blacklist_on_timeout) { + std::stringstream ss; + 
mds->evict_client(session->get_client().v, false, true, ss, nullptr); + } else { + kill_session(session, NULL); + } + } +} + +void Server::evict_cap_revoke_non_responders() { + if (!cap_revoke_eviction_timeout) { + return; + } + + std::list<client_t> to_evict; + mds->locker->get_late_revoking_clients(&to_evict, cap_revoke_eviction_timeout); + + for (auto const &client: to_evict) { + mds->clog->warn() << "client id " << client << " has not responded to" + << " cap revoke by MDS for over " << cap_revoke_eviction_timeout + << " seconds, evicting"; + dout(1) << __func__ << ": evicting cap revoke non-responder client id " + << client << dendl; + + std::stringstream ss; + bool evicted = mds->evict_client(client.v, false, + g_conf()->mds_session_blacklist_on_evict, + ss, nullptr); + if (evicted && logger) { + logger->inc(l_mdss_cap_revoke_eviction); + } + } +} + +void Server::handle_conf_change(const std::set<std::string>& changed) { + if (changed.count("mds_replay_unsafe_with_closed_session")) { + replay_unsafe_with_closed_session = g_conf().get_val<bool>("mds_replay_unsafe_with_closed_session"); + } + if (changed.count("mds_cap_revoke_eviction_timeout")) { + cap_revoke_eviction_timeout = g_conf().get_val<double>("mds_cap_revoke_eviction_timeout"); + dout(20) << __func__ << " cap revoke eviction timeout changed to " + << cap_revoke_eviction_timeout << dendl; + } + if (changed.count("mds_recall_max_decay_rate")) { + recall_throttle = DecayCounter(g_conf().get_val<double>("mds_recall_max_decay_rate")); + } + if (changed.count("mds_max_snaps_per_dir")) { + max_snaps_per_dir = g_conf().get_val<uint64_t>("mds_max_snaps_per_dir"); + dout(20) << __func__ << " max snapshots per directory changed to " + << max_snaps_per_dir << dendl; + } + if (changed.count("mds_max_caps_per_client")) { + max_caps_per_client = g_conf().get_val<uint64_t>("mds_max_caps_per_client"); + } + if (changed.count("mds_session_cap_acquisition_throttle")) { + cap_acquisition_throttle = 
      g_conf().get_val<uint64_t>("mds_session_cap_acquisition_throttle");
  }
  if (changed.count("mds_session_max_caps_throttle_ratio")) {
    max_caps_throttle_ratio = g_conf().get_val<double>("mds_session_max_caps_throttle_ratio");
  }
  if (changed.count("mds_cap_acquisition_throttle_retry_request_timeout")) {
    caps_throttle_retry_request_timeout = g_conf().get_val<double>("mds_cap_acquisition_throttle_retry_request_timeout");
  }
}

/*
 * XXX bump in the interface here, not using an MDSContext here
 * because all the callers right now happen to use a SaferCond
 */
// Kill a session: journal a close if it is live (opening/open/stale and
// not mid-import); otherwise just arrange for on_safe to fire at the
// appropriate time. Caller must hold mds_lock.
void Server::kill_session(Session *session, Context *on_safe)
{
  ceph_assert(mds->mds_lock.is_locked_by_me());

  if ((session->is_opening() ||
       session->is_open() ||
       session->is_stale()) &&
      !session->is_importing()) {
    dout(10) << "kill_session " << session << dendl;
    journal_close_session(session, Session::STATE_KILLING, on_safe);
  } else {
    dout(10) << "kill_session importing or already closing/killing " << session << dendl;
    if (session->is_closing() ||
	session->is_killing()) {
      // a close is already journaling; piggyback on its flush
      if (on_safe)
	mdlog->wait_for_safe(new MDSInternalContextWrapper(mds, on_safe));
    } else {
      ceph_assert(session->is_closed() ||
		  session->is_importing());
      // nothing to journal; complete immediately
      if (on_safe)
	on_safe->complete(0);
    }
  }
}

// Kill every client session whose address appears in the OSDMap
// blacklist. Returns the number of sessions killed.
size_t Server::apply_blacklist(const std::set<entity_addr_t> &blacklist)
{
  bool prenautilus = mds->objecter->with_osdmap(
      [&](const OSDMap& o) {
	return o.require_osd_release < CEPH_RELEASE_NAUTILUS;
      });

  std::vector<Session*> victims;
  const auto& sessions = mds->sessionmap.get_sessions();
  for (const auto& p : sessions) {
    if (!p.first.is_client()) {
      // Do not apply OSDMap blacklist to MDS daemons, we find out
      // about their death via MDSMap.
      continue;
    }

    Session *s = p.second;
    auto inst_addr = s->info.inst.addr;
    // blacklist entries are always TYPE_ANY for nautilus+
    inst_addr.set_type(entity_addr_t::TYPE_ANY);
    if (blacklist.count(inst_addr)) {
      victims.push_back(s);
      continue;
    }
    if (prenautilus) {
      // ...except pre-nautilus, they were TYPE_LEGACY
      inst_addr.set_type(entity_addr_t::TYPE_LEGACY);
      if (blacklist.count(inst_addr)) {
	victims.push_back(s);
      }
    }
  }

  // kill outside the map iteration to avoid invalidating it
  for (const auto s : victims) {
    kill_session(s, nullptr);
  }

  dout(10) << "apply_blacklist: killed " << victims.size() << dendl;

  return victims.size();
}

// Journal an ESession close event for this session, releasing its
// preallocated inos, killing its in-flight requests, and moving it to
// `state` (STATE_CLOSING or STATE_KILLING). on_safe (may be null) fires
// once the event is durable, via C_MDS_session_finish.
void Server::journal_close_session(Session *session, int state, Context *on_safe)
{
  uint64_t sseq = mds->sessionmap.set_state(session, state);
  version_t pv = mds->sessionmap.mark_projected(session);
  version_t piv = 0;

  // release alloc and pending-alloc inos for this session
  // and wipe out session state, in case the session close aborts for some reason
  interval_set<inodeno_t> both;
  both.insert(session->info.prealloc_inos);
  both.insert(session->pending_prealloc_inos);
  if (both.size()) {
    mds->inotable->project_release_ids(both);
    piv = mds->inotable->get_projected_version();
  } else
    piv = 0;

  mdlog->start_submit_entry(new ESession(session->info.inst, false, pv, both, piv),
			    new C_MDS_session_finish(this, session, sseq, false, pv, both, piv, on_safe));
  mdlog->flush();

  // clean up requests, too
  while(!session->requests.empty()) {
    auto mdr = MDRequestRef(*session->requests.begin());
    mdcache->request_kill(mdr);
  }

  finish_flush_session(session, session->get_push_seq());
}

// Enter the reconnect phase: note every open client session as one we
// expect a reconnect from, then wait for handle_client_reconnect (or
// reconnect_tick timeouts) to drain the gather set. reconnect_done_
// fires once gathering completes.
void Server::reconnect_clients(MDSContext *reconnect_done_)
{
  reconnect_done = reconnect_done_;

  auto now = clock::now();
  set<Session*> sessions;
  mds->sessionmap.get_client_session_set(sessions);
  for (auto session : sessions) {
    if (session->is_open()) {
      client_reconnect_gather.insert(session->get_client());
      session->set_reconnecting(true);
      session->last_cap_renew = now;
    }
  }

  if (client_reconnect_gather.empty()) {
    dout(7) << "reconnect_clients -- no sessions, doing nothing." << dendl;
    reconnect_gather_finish();
    return;
  }

  // clients will get the mdsmap and discover we're reconnecting via the monitor.

  reconnect_start = now;
  dout(1) << "reconnect_clients -- " << client_reconnect_gather.size() << " sessions" << dendl;
  mds->sessionmap.dump();
}

// Process a client's MClientReconnect during the reconnect phase:
// validate the session and MDS state (denying with CLOSE if we are past
// reconnect, the session is unusable, or required features are missing),
// then re-register the client's snaprealms, caps and file locks. Large
// reconnects may arrive in several messages; has_more() marks all but
// the last.
void Server::handle_client_reconnect(const MClientReconnect::const_ref &m)
{
  dout(7) << "handle_client_reconnect " << m->get_source()
	  << (m->has_more() ? " (more)" : "") << dendl;
  client_t from = m->get_source().num();
  Session *session = mds->get_session(m);
  if (!session) {
    dout(0) << " ignoring sessionless msg " << *m << dendl;
    auto reply = MClientSession::create(CEPH_SESSION_REJECT);
    reply->metadata["error_string"] = "sessionless";
    mds->send_message(reply, m->get_connection());
    return;
  }

  if (!session->is_open()) {
    dout(0) << " ignoring msg from not-open session" << *m << dendl;
    auto reply = MClientSession::create(CEPH_SESSION_CLOSE);
    mds->send_message(reply, m->get_connection());
    return;
  }

  // we want RECONNECT but the new mdsmap hasn't been applied yet; retry
  // once the state transition lands
  if (!mds->is_reconnect() && mds->get_want_state() == CEPH_MDS_STATE_RECONNECT) {
    dout(10) << " we're almost in reconnect state (mdsmap delivery race?); waiting" << dendl;
    mds->wait_for_reconnect(new C_MDS_RetryMessage(mds, m));
    return;
  }

  auto delay = std::chrono::duration<double>(clock::now() - reconnect_start).count();
  dout(10) << " reconnect_start " << reconnect_start << " delay " << delay << dendl;

  bool deny = false;
  if (!mds->is_reconnect() || mds->get_want_state() != CEPH_MDS_STATE_RECONNECT || reconnect_evicting) {
    // XXX maybe in the future we can do better than this?
    dout(1) << " no longer in reconnect state, ignoring reconnect, sending close" << dendl;
    mds->clog->info() << "denied reconnect attempt (mds is "
		      << ceph_mds_state_name(mds->get_state())
		      << ") from " << m->get_source_inst()
		      << " after " << delay << " (allowed interval " << g_conf()->mds_reconnect_timeout << ")";
    deny = true;
  } else {
    std::string error_str;
    if (!session->is_open()) {
      error_str = "session is closed";
    } else if (mdcache->is_readonly()) {
      error_str = "mds is readonly";
    } else {
      // older clients don't send a feature set; infer one from version
      // metadata before checking requirements
      if (session->info.client_metadata.features.empty())
	infer_supported_features(session, session->info.client_metadata);

      feature_bitset_t missing_features = required_client_features;
      missing_features -= session->info.client_metadata.features;
      if (!missing_features.empty()) {
	stringstream ss;
	ss << "missing required features '" << missing_features << "'";
	error_str = ss.str();
      }
    }

    if (!error_str.empty()) {
      deny = true;
      dout(1) << " " << error_str << ", ignoring reconnect, sending close" << dendl;
      mds->clog->info() << "denied reconnect attempt from "
			<< m->get_source_inst() << " (" << error_str << ")";
    }
  }

  if (deny) {
    auto r = MClientSession::create(CEPH_SESSION_CLOSE);
    mds->send_message_client(r, session);
    if (session->is_open())
      kill_session(session, nullptr);
    return;
  }

  if (!m->has_more()) {
    // notify client of success with an OPEN
    auto reply = MClientSession::create(CEPH_SESSION_OPEN);
    if (session->info.has_feature(CEPHFS_FEATURE_MIMIC))
      reply->supported_features = supported_features;
    mds->send_message_client(reply, session);
    mds->clog->debug() << "reconnect by " << session->info.inst << " after " << delay;
  }

  session->last_cap_renew = clock::now();

  // snaprealms
  for (const auto &r : m->realms) {
    CInode *in = mdcache->get_inode(inodeno_t(r.realm.ino));
    if (in && in->state_test(CInode::STATE_PURGING))
      continue;
    if (in) {
      if (in->snaprealm) {
	dout(15) << "open snaprealm (w inode) on " << *in << dendl;
      } else {
	// this can happen if we are non-auth or we rollback snaprealm
	dout(15) << "open snaprealm (null snaprealm) on " << *in << dendl;
      }
      mdcache->add_reconnected_snaprealm(from, inodeno_t(r.realm.ino), snapid_t(r.realm.seq));
    } else {
      dout(15) << "open snaprealm (w/o inode) on " << inodeno_t(r.realm.ino)
	       << " seq " << r.realm.seq << dendl;
      mdcache->add_reconnected_snaprealm(from, inodeno_t(r.realm.ino), snapid_t(r.realm.seq));
    }
  }

  // caps
  for (const auto &p : m->caps) {
    // make sure our last_cap_id is MAX over all issued caps
    if (p.second.capinfo.cap_id > mdcache->last_cap_id)
      mdcache->last_cap_id = p.second.capinfo.cap_id;

    CInode *in = mdcache->get_inode(p.first);
    if (in && in->state_test(CInode::STATE_PURGING))
      continue;
    if (in && in->is_auth()) {
      // we recovered it, and it's ours. take note.
      dout(15) << "open cap realm " << inodeno_t(p.second.capinfo.snaprealm)
	       << " on " << *in << dendl;
      in->reconnect_cap(from, p.second, session);
      mdcache->add_reconnected_cap(from, p.first, p.second);
      recover_filelocks(in, p.second.flockbl, m->get_orig_source().num());
      continue;
    }

    if (in && !in->is_auth()) {
      // not mine.
      dout(10) << "non-auth " << *in << ", will pass off to authority" << dendl;
      // add to cap export list.
      mdcache->rejoin_export_caps(p.first, from, p.second,
				  in->authority().first, true);
    } else {
      // don't know if the inode is mine
      dout(10) << "missing ino " << p.first << ", will load later" << dendl;
      mdcache->rejoin_recovered_caps(p.first, from, p.second, MDS_RANK_NONE);
    }
  }

  reconnect_last_seen = clock::now();

  if (!m->has_more()) {
    mdcache->rejoin_recovered_client(session->get_client(), session->info.inst);

    // remove from gather set
    client_reconnect_gather.erase(from);
    session->set_reconnecting(false);
    if (client_reconnect_gather.empty())
      reconnect_gather_finish();
  }
}

// Derive a feature bitset for a client that predates explicit feature
// advertisement, using its "ceph_version"/"kernel_version" metadata and
// connection feature bits as hints. Writes the result (all features up
// to and including the inferred release) into client_metadata.features.
void Server::infer_supported_features(Session *session, client_metadata_t& client_metadata)
{
  int supported = -1;
  auto it = client_metadata.find("ceph_version");
  if (it != client_metadata.end()) {
    // user space client
    if (it->second.compare(0, 16, "ceph version 12.") == 0)
      supported = CEPHFS_FEATURE_LUMINOUS;
    else if (session->get_connection()->has_feature(CEPH_FEATURE_FS_CHANGE_ATTR))
      supported = CEPHFS_FEATURE_KRAKEN;
  } else {
    it = client_metadata.find("kernel_version");
    if (it != client_metadata.end()) {
      // kernel client
      if (session->get_connection()->has_feature(CEPH_FEATURE_NEW_OSDOP_ENCODING))
	supported = CEPHFS_FEATURE_LUMINOUS;
    }
  }
  if (supported == -1 &&
      session->get_connection()->has_feature(CEPH_FEATURE_FS_FILE_LAYOUT_V2))
    supported = CEPHFS_FEATURE_JEWEL;

  if (supported >= 0) {
    // set every feature bit up to the inferred release
    unsigned long value = (1UL << (supported + 1)) - 1;
    client_metadata.features = feature_bitset_t(value);
    dout(10) << __func__ << " got '" << client_metadata.features << "'" << dendl;
  }
}

// Recompute required_client_features from the mdsmap's minimum
// compatible client release; during/after RECONNECT, evict connected
// (non-blacklisted) clients that lack a newly-required feature.
void Server::update_required_client_features()
{
  vector<size_t> bits = CEPHFS_FEATURES_MDS_REQUIRED;

  int min_compat = mds->mdsmap->get_min_compat_client();
  if (min_compat >= CEPH_RELEASE_NAUTILUS) {
    bits.push_back(CEPHFS_FEATURE_NAUTILUS);
  } else if (min_compat >= CEPH_RELEASE_MIMIC)
    bits.push_back(CEPHFS_FEATURE_MIMIC);
  else if (min_compat >= CEPH_RELEASE_LUMINOUS)
    bits.push_back(CEPHFS_FEATURE_LUMINOUS);
  else if (min_compat >= CEPH_RELEASE_KRAKEN)
    bits.push_back(CEPHFS_FEATURE_KRAKEN);
  else if (min_compat >= CEPH_RELEASE_JEWEL)
    bits.push_back(CEPHFS_FEATURE_JEWEL);

  std::sort(bits.begin(), bits.end());
  required_client_features = feature_bitset_t(bits);
  dout(7) << "required_client_features: " << required_client_features << dendl;

  if (mds->get_state() >= MDSMap::STATE_RECONNECT) {
    set<Session*> sessions;
    mds->sessionmap.get_client_session_set(sessions);
    for (auto session : sessions) {
      feature_bitset_t missing_features = required_client_features;
      missing_features -= session->info.client_metadata.features;
      if (!missing_features.empty()) {
	// already blacklisted clients are on their way out; skip them
	bool blacklisted = mds->objecter->with_osdmap(
	    [session](const OSDMap &osd_map) -> bool {
	      return osd_map.is_blacklisted(session->info.inst.addr);
	    });
	if (blacklisted)
	  continue;

	mds->clog->warn() << "evicting session " << *session << ", missing required features '"
			  << missing_features << "'";
	std::stringstream ss;
	mds->evict_client(session->get_client().v, false,
			  g_conf()->mds_session_blacklist_on_evict, ss);
      }
    }
  }
}

// All expected clients have reconnected (or been given up on);
// complete reconnect_done, first waiting for the snaptable cache to
// sync if needed.
void Server::reconnect_gather_finish()
{
  dout(7) << "reconnect_gather_finish. failed on " << failed_reconnects << " clients" << dendl;
  ceph_assert(reconnect_done);

  if (!mds->snapclient->is_synced()) {
    // make sure snaptable cache is populated. snaprealms will be
    // extensively used in rejoin stage.
    dout(7) << " snaptable cache isn't synced, delaying state transition" << dendl;
    mds->snapclient->wait_for_sync(reconnect_done);
  } else {
    reconnect_done->complete(0);
  }
  reconnect_done = NULL;
}

// Periodic timer during the reconnect phase. Once the reconnect
// timeout has elapsed (and the laggiest client has been silent for at
// least half of it), evict the clients that never reconnected —
// except those with a "timeout" metadata entry, which are parked for
// later reclaim instead.
void Server::reconnect_tick()
{
  if (reconnect_evicting) {
    dout(7) << "reconnect_tick: waiting for evictions" << dendl;
    return;
  }

  if (client_reconnect_gather.empty())
    return;

  auto now = clock::now();
  auto elapse1 = std::chrono::duration<double>(now - reconnect_start).count();
  if (elapse1 < g_conf()->mds_reconnect_timeout)
    return;

  vector<Session*> remaining_sessions;
  remaining_sessions.reserve(client_reconnect_gather.size());
  for (auto c : client_reconnect_gather) {
    Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(c.v));
    ceph_assert(session);
    remaining_sessions.push_back(session);
    // client re-sends cap flush messages before the reconnect message
    if (session->last_seen > reconnect_last_seen)
      reconnect_last_seen = session->last_seen;
  }

  // extend the window while clients are still actively sending to us
  auto elapse2 = std::chrono::duration<double>(now - reconnect_last_seen).count();
  if (elapse2 < g_conf()->mds_reconnect_timeout / 2) {
    dout(7) << "reconnect_tick: last seen " << elapse2
	    << " seconds ago, extending reconnect interval" << dendl;
    return;
  }

  dout(7) << "reconnect timed out, " << remaining_sessions.size()
	  << " clients have not reconnected in time" << dendl;

  // If we're doing blacklist evictions, use this to wait for them before
  // proceeding to reconnect_gather_finish
  MDSGatherBuilder gather(g_ceph_context);

  for (auto session : remaining_sessions) {
    // Keep sessions that have specified timeout. These sessions will prevent
    // mds from going to active. MDS goes to active after they all have been
    // killed or reclaimed.
    if (session->info.client_metadata.find("timeout") !=
	session->info.client_metadata.end()) {
      dout(1) << "reconnect keeps " << session->info.inst
	      << ", need to be reclaimed" << dendl;
      client_reclaim_gather.insert(session->get_client());
      continue;
    }

    dout(1) << "reconnect gives up on " << session->info.inst << dendl;

    mds->clog->warn() << "evicting unresponsive client " << *session
		      << ", after waiting " << elapse1
		      << " seconds during MDS startup";

    if (g_conf()->mds_session_blacklist_on_timeout) {
      std::stringstream ss;
      mds->evict_client(session->get_client().v, false, true, ss,
			gather.new_sub());
    } else {
      kill_session(session, NULL);
    }

    failed_reconnects++;
  }
  client_reconnect_gather.clear();

  if (gather.has_subs()) {
    dout(1) << "reconnect will complete once clients are evicted" << dendl;
    gather.set_finisher(new MDSInternalContextWrapper(mds, new FunctionContext(
	    [this](int r){reconnect_gather_finish();})));
    gather.activate();
    reconnect_evicting = true;
  } else {
    reconnect_gather_finish();
  }
}

// Re-install a client's POSIX (fcntl) and flock locks on an inode from
// the bufferlist it sent in its reconnect message. The encoding is two
// counted lists of ceph_filelock: fcntl locks first, then flock locks.
void Server::recover_filelocks(CInode *in, bufferlist locks, int64_t client)
{
  if (!locks.length()) return;
  int numlocks;
  ceph_filelock lock;
  auto p = locks.cbegin();
  decode(numlocks, p);
  for (int i = 0; i < numlocks; ++i) {
    decode(lock, p);
    lock.client = client;  // trust our session, not the encoded owner
    in->get_fcntl_lock_state()->held_locks.insert(pair<uint64_t, ceph_filelock>(lock.start, lock));
    ++in->get_fcntl_lock_state()->client_held_lock_counts[client];
  }
  decode(numlocks, p);
  for (int i = 0; i < numlocks; ++i) {
    decode(lock, p);
    lock.client = client;
    in->get_flock_lock_state()->held_locks.insert(pair<uint64_t, ceph_filelock> (lock.start, lock));
    ++in->get_flock_lock_state()->client_held_lock_counts[client];
  }
}

/**
 * Call this when the MDCache is oversized, to send requests to the clients
 * to trim some caps, and consequently unpin some inodes in the MDCache so
 * that it can trim too.
+ */ +std::pair<bool, uint64_t> Server::recall_client_state(MDSGatherBuilder* gather, RecallFlags flags) +{ + const auto now = clock::now(); + const bool steady = !!(flags&RecallFlags::STEADY); + const bool enforce_max = !!(flags&RecallFlags::ENFORCE_MAX); + const bool enforce_liveness = !!(flags&RecallFlags::ENFORCE_LIVENESS); + const bool trim = !!(flags&RecallFlags::TRIM); + + const auto max_caps_per_client = g_conf().get_val<uint64_t>("mds_max_caps_per_client"); + const auto min_caps_per_client = g_conf().get_val<uint64_t>("mds_min_caps_per_client"); + const auto recall_global_max_decay_threshold = g_conf().get_val<Option::size_t>("mds_recall_global_max_decay_threshold"); + const auto recall_max_caps = g_conf().get_val<Option::size_t>("mds_recall_max_caps"); + const auto recall_max_decay_threshold = g_conf().get_val<Option::size_t>("mds_recall_max_decay_threshold"); + const auto cache_liveness_magnitude = g_conf().get_val<Option::size_t>("mds_session_cache_liveness_magnitude"); + + dout(7) << __func__ << ":" + << " min=" << min_caps_per_client + << " max=" << max_caps_per_client + << " total=" << Capability::count() + << " flags=" << flags + << dendl; + + /* trim caps of sessions with the most caps first */ + std::multimap<uint64_t, Session*> caps_session; + auto f = [&caps_session, enforce_max, enforce_liveness, trim, max_caps_per_client, cache_liveness_magnitude](auto& s) { + auto num_caps = s->caps.size(); + auto cache_liveness = s->get_session_cache_liveness(); + if (trim || (enforce_max && num_caps > max_caps_per_client) || (enforce_liveness && cache_liveness < (num_caps>>cache_liveness_magnitude))) { + caps_session.emplace(std::piecewise_construct, std::forward_as_tuple(num_caps), std::forward_as_tuple(s)); + } + }; + mds->sessionmap.get_client_sessions(std::move(f)); + + std::pair<bool, uint64_t> result = {false, 0}; + auto& [throttled, caps_recalled] = result; + last_recall_state = now; + for (const auto& [num_caps, session] : 
boost::adaptors::reverse(caps_session)) { + if (!session->is_open() || + !session->get_connection() || + !session->info.inst.name.is_client()) + continue; + + dout(10) << __func__ << ":" + << " session " << session->info.inst + << " caps " << num_caps + << ", leases " << session->leases.size() + << dendl; + + uint64_t newlim; + if (num_caps < recall_max_caps || (num_caps-recall_max_caps) < min_caps_per_client) { + newlim = min_caps_per_client; + } else { + newlim = num_caps-recall_max_caps; + } + if (num_caps > newlim) { + /* now limit the number of caps we recall at a time to prevent overloading ourselves */ + uint64_t recall = std::min<uint64_t>(recall_max_caps, num_caps-newlim); + newlim = num_caps-recall; + const uint64_t session_recall_throttle = session->get_recall_caps_throttle(); + const uint64_t session_recall_throttle2o = session->get_recall_caps_throttle2o(); + const uint64_t global_recall_throttle = recall_throttle.get(); + if (session_recall_throttle+recall > recall_max_decay_threshold) { + dout(15) << " session recall threshold (" << recall_max_decay_threshold << ") hit at " << session_recall_throttle << "; skipping!" << dendl; + throttled = true; + continue; + } else if (session_recall_throttle2o+recall > recall_max_caps*2) { + dout(15) << " session recall 2nd-order threshold (" << 2*recall_max_caps << ") hit at " << session_recall_throttle2o << "; skipping!" << dendl; + throttled = true; + continue; + } else if (global_recall_throttle+recall > recall_global_max_decay_threshold) { + dout(15) << " global recall threshold (" << recall_global_max_decay_threshold << ") hit at " << global_recall_throttle << "; skipping!" 
<< dendl; + throttled = true; + break; + } + + // now check if we've recalled caps recently and the client is unlikely to satisfy a new recall + if (steady) { + const auto session_recall = session->get_recall_caps(); + const auto session_release = session->get_release_caps(); + if (2*session_release < session_recall && 2*session_recall > recall_max_decay_threshold) { + /* The session has been unable to keep up with the number of caps + * recalled (by half); additionally, to prevent marking sessions + * we've just begun to recall from, the session_recall counter + * (decayed count of caps recently recalled) is **greater** than the + * session threshold for the session's cap recall throttle. + */ + dout(15) << " 2*session_release < session_recall" + " (2*" << session_release << " < " << session_recall << ") &&" + " 2*session_recall < recall_max_decay_threshold" + " (2*" << session_recall << " > " << recall_max_decay_threshold << ")" + " Skipping because we are unlikely to get more released." << dendl; + continue; + } else if (recall < recall_max_caps && 2*recall < session_recall) { + /* The number of caps recalled is less than the number we *could* + * recall (so there isn't much left to recall?) and the number of + * caps is less than the current recall_caps counter (decayed count + * of caps recently recalled). + */ + dout(15) << " 2*recall < session_recall " + " (2*" << recall << " < " << session_recall << ") &&" + " recall < recall_max_caps (" << recall << " < " << recall_max_caps << ");" + " Skipping because we are unlikely to get more released." 
<< dendl; + continue; + } + } + + dout(7) << " recalling " << recall << " caps; session_recall_throttle = " << session_recall_throttle << "; global_recall_throttle = " << global_recall_throttle << dendl; + + auto m = MClientSession::create(CEPH_SESSION_RECALL_STATE); + m->head.max_caps = newlim; + mds->send_message_client(m, session); + if (gather) { + flush_session(session, *gather); + } + caps_recalled += session->notify_recall_sent(newlim); + recall_throttle.hit(recall); + } + } + + dout(7) << "recalled" << (throttled ? " (throttled)" : "") << " " << caps_recalled << " client caps." << dendl; + + return result; +} + +void Server::force_clients_readonly() +{ + dout(10) << "force_clients_readonly" << dendl; + set<Session*> sessions; + mds->sessionmap.get_client_session_set(sessions); + for (set<Session*>::const_iterator p = sessions.begin(); + p != sessions.end(); + ++p) { + Session *session = *p; + if (!session->info.inst.name.is_client() || + !(session->is_open() || session->is_stale())) + continue; + mds->send_message_client(MClientSession::create(CEPH_SESSION_FORCE_RO), session); + } +} + +/******* + * some generic stuff for finishing off requests + */ +void Server::journal_and_reply(MDRequestRef& mdr, CInode *in, CDentry *dn, LogEvent *le, MDSLogContextBase *fin) +{ + dout(10) << "journal_and_reply tracei " << in << " tracedn " << dn << dendl; + ceph_assert(!mdr->has_completed); + + // note trace items for eventual reply. 
+ mdr->tracei = in; + if (in) + mdr->pin(in); + + mdr->tracedn = dn; + if (dn) + mdr->pin(dn); + + early_reply(mdr, in, dn); + + mdr->committing = true; + submit_mdlog_entry(le, fin, mdr, __func__); + + if (mdr->client_request && mdr->client_request->is_queued_for_replay()) { + if (mds->queue_one_replay()) { + dout(10) << " queued next replay op" << dendl; + } else { + dout(10) << " journaled last replay op" << dendl; + } + } else if (mdr->did_early_reply) + mds->locker->drop_rdlocks_for_early_reply(mdr.get()); + else + mdlog->flush(); +} + +void Server::submit_mdlog_entry(LogEvent *le, MDSLogContextBase *fin, MDRequestRef& mdr, + std::string_view event) +{ + if (mdr) { + string event_str("submit entry: "); + event_str += event; + mdr->mark_event(event_str); + } + mdlog->submit_entry(le, fin); +} + +/* + * send response built from mdr contents and error code; clean up mdr + */ +void Server::respond_to_request(MDRequestRef& mdr, int r) +{ + if (mdr->client_request) { + reply_client_request(mdr, MClientReply::create(*mdr->client_request, r)); + } else if (mdr->internal_op > -1) { + dout(10) << "respond_to_request on internal request " << mdr << dendl; + if (!mdr->internal_op_finish) + ceph_abort_msg("trying to respond to internal op without finisher"); + mdr->internal_op_finish->complete(r); + mdcache->request_finish(mdr); + } +} + +// statistics mds req op number and latency +void Server::perf_gather_op_latency(const MClientRequest::const_ref &req, utime_t lat) +{ + int code = l_mdss_first; + switch(req->get_op()) { + case CEPH_MDS_OP_LOOKUPHASH: + code = l_mdss_req_lookuphash_latency; + break; + case CEPH_MDS_OP_LOOKUPINO: + code = l_mdss_req_lookupino_latency; + break; + case CEPH_MDS_OP_LOOKUPPARENT: + code = l_mdss_req_lookupparent_latency; + break; + case CEPH_MDS_OP_LOOKUPNAME: + code = l_mdss_req_lookupname_latency; + break; + case CEPH_MDS_OP_LOOKUP: + code = l_mdss_req_lookup_latency; + break; + case CEPH_MDS_OP_LOOKUPSNAP: + code = 
/*
 * Send an "unsafe" reply to the client before the journal entry for this
 * request has been flushed.  The reply exposes uncommitted state, so a
 * series of guards below bail out whenever that would be unsafe.
 *
 * @param mdr     the request being replied to early
 * @param tracei  inode to include in the reply trace (may be NULL)
 * @param tracedn dentry to include in the reply trace (may be NULL)
 */
void Server::early_reply(MDRequestRef& mdr, CInode *tracei, CDentry *tracedn)
{
  // early replies can be disabled globally by config
  if (!g_conf()->mds_early_reply)
    return;

  // some ops explicitly opt out of early reply on this mdr
  if (mdr->no_early_reply) {
    dout(10) << "early_reply - flag no_early_reply is set, not allowed." << dendl;
    return;
  }

  // slaves have journaled prepare events for this op; replying early would
  // expose state that slave MDSs may still roll back
  if (mdr->has_more() && mdr->more()->has_journaled_slaves) {
    dout(10) << "early_reply - there are journaled slaves, not allowed." << dendl;
    return;
  }

  // a freshly allocated ino is not durable until the journal entry commits
  if (mdr->alloc_ino) {
    dout(10) << "early_reply - allocated ino, not allowed" << dendl;
    return;
  }

  const MClientRequest::const_ref &req = mdr->client_request;
  entity_inst_t client_inst = req->get_source_inst();
  // requests originated by another MDS never get early replies
  if (client_inst.name.is_mds())
    return;

  if (req->is_replay()) {
    dout(10) << " no early reply on replay op" << dendl;
    return;
  }


  auto reply = MClientReply::create(*req, 0);
  reply->set_unsafe();

  // mark xlocks "done", indicating that we are exposing uncommitted changes.
  //
  //_rename_finish() does not send dentry link/unlink message to replicas.
  // so do not set xlocks on dentries "done", the xlocks prevent dentries
  // that have projected linkages from getting new replica.
  mds->locker->set_xlocks_done(mdr.get(), req->get_op() == CEPH_MDS_OP_RENAME);

  dout(10) << "early_reply " << reply->get_result()
	   << " (" << cpp_strerror(reply->get_result())
	   << ") " << *req << dendl;

  if (tracei || tracedn) {
    // the trace we are about to embed supersedes any pending cap releases
    // on the same inodes, so drop those from the request
    if (tracei)
      mdr->cap_releases.erase(tracei->vino());
    if (tracedn)
      mdr->cap_releases.erase(tracedn->get_dir()->get_inode()->vino());

    set_trace_dist(mdr->session, reply, tracei, tracedn, mdr->snapid,
		   req->get_dentry_wanted(), mdr);
  }

  reply->set_extra_bl(mdr->reply_extra_bl);
  mds->send_message_client(reply, mdr->session);

  // remember that an unsafe reply went out; the final (safe) reply path
  // checks this flag to avoid double-counting stats and re-issuing leases
  mdr->did_early_reply = true;

  mds->logger->inc(l_mds_reply);
  utime_t lat = ceph_clock_now() - req->get_recv_stamp();
  mds->logger->tinc(l_mds_reply_latency, lat);
  if (client_inst.name.is_client()) {
    mds->sessionmap.hit_session(mdr->session);
  }
  perf_gather_op_latency(req, lat);
  dout(20) << "lat " << lat << dendl;

  mdr->mark_event("early_replied");
}
mdr, const MClientReply::ref &reply) +{ + ceph_assert(mdr.get()); + const MClientRequest::const_ref &req = mdr->client_request; + + dout(7) << "reply_client_request " << reply->get_result() + << " (" << cpp_strerror(reply->get_result()) + << ") " << *req << dendl; + + mdr->mark_event("replying"); + + Session *session = mdr->session; + + // note successful request in session map? + // + // setfilelock requests are special, they only modify states in MDS memory. + // The states get lost when MDS fails. If Client re-send a completed + // setfilelock request, it means that client did not receive corresponding + // setfilelock reply. So MDS should re-execute the setfilelock request. + if (req->may_write() && req->get_op() != CEPH_MDS_OP_SETFILELOCK && + reply->get_result() == 0 && session) { + inodeno_t created = mdr->alloc_ino ? mdr->alloc_ino : mdr->used_prealloc_ino; + session->add_completed_request(mdr->reqid.tid, created); + if (mdr->ls) { + mdr->ls->touched_sessions.insert(session->info.inst.name); + } + } + + // give any preallocated inos to the session + apply_allocated_inos(mdr, session); + + // get tracei/tracedn from mdr? 
+ snapid_t snapid = mdr->snapid; + CInode *tracei = mdr->tracei; + CDentry *tracedn = mdr->tracedn; + + bool is_replay = mdr->client_request->is_replay(); + bool did_early_reply = mdr->did_early_reply; + entity_inst_t client_inst = req->get_source_inst(); + int dentry_wanted = req->get_dentry_wanted(); + + if (!did_early_reply && !is_replay) { + + mds->logger->inc(l_mds_reply); + utime_t lat = ceph_clock_now() - mdr->client_request->get_recv_stamp(); + mds->logger->tinc(l_mds_reply_latency, lat); + if (session && client_inst.name.is_client()) { + mds->sessionmap.hit_session(session); + } + perf_gather_op_latency(req, lat); + dout(20) << "lat " << lat << dendl; + + if (tracei) + mdr->cap_releases.erase(tracei->vino()); + if (tracedn) + mdr->cap_releases.erase(tracedn->get_dir()->get_inode()->vino()); + } + + // drop non-rdlocks before replying, so that we can issue leases + mdcache->request_drop_non_rdlocks(mdr); + + // reply at all? + if (session && !client_inst.name.is_mds()) { + // send reply. 
/*
 * Embed a metadata trace into a client reply.
 *
 * Pass inode OR dentry (not both, or we may get confused).  The trace is
 * encoded in reverse order (i.e. root inode comes last); the encode order
 * of the sections below (parent inode, dirstat, dentry name, lease, target
 * inode) is part of the client wire format and must not be changed.
 *
 * @param session       session the reply will be sent on
 * @param reply         reply being populated
 * @param in            target inode to include (may be NULL)
 * @param dn            dentry to include (may be NULL)
 * @param snapid        snapshot id the request resolved in
 * @param dentry_wanted whether the client asked for the dentry
 * @param mdr           originating request (for o_trunc / getattr_caps)
 */
void Server::set_trace_dist(Session *session, const MClientReply::ref &reply,
			    CInode *in, CDentry *dn,
			    snapid_t snapid,
			    int dentry_wanted,
			    MDRequestRef& mdr)
{
  // skip doing this for debugging purposes?
  // (config knob deliberately produces traceless replies to exercise the
  // client's fallback path)
  if (g_conf()->mds_inject_traceless_reply_probability &&
      mdr->ls && !mdr->o_trunc &&
      (rand() % 10000 < g_conf()->mds_inject_traceless_reply_probability * 10000.0)) {
    dout(5) << "deliberately skipping trace for " << *reply << dendl;
    return;
  }

  // inode, dentry, dir, ..., inode
  bufferlist bl;
  mds_rank_t whoami = mds->get_nodeid();
  client_t client = session->get_client();
  utime_t now = ceph_clock_now();

  dout(20) << "set_trace_dist snapid " << snapid << dendl;

  //assert((bool)dn == (bool)dentry_wanted);  // not true for snapshot lookups

  // realm: only live (non-snapshot) lookups carry a snap trace
  if (snapid == CEPH_NOSNAP) {
    SnapRealm *realm;
    if (in)
      realm = in->find_snaprealm();
    else
      realm = dn->get_dir()->get_inode()->find_snaprealm();
    reply->snapbl = realm->get_snap_trace();
    dout(10) << "set_trace_dist snaprealm " << *realm << " len=" << reply->snapbl.length() << dendl;
  }

  // dir + dentry?
  if (dn) {
    reply->head.is_dentry = 1;
    CDir *dir = dn->get_dir();
    CInode *diri = dir->get_inode();

    // parent directory inode first
    diri->encode_inodestat(bl, session, NULL, snapid);
    dout(20) << "set_trace_dist added diri " << *diri << dendl;

#ifdef MDS_VERIFY_FRAGSTAT
    if (dir->is_complete())
      dir->verify_fragstat();
#endif
    DirStat ds;
    ds.frag = dir->get_frag();
    ds.auth = dir->get_dir_auth().first;
    if (dir->is_auth() && !mdcache->forward_all_reqs_to_auth())
      dir->get_dist_spec(ds.dist, whoami);

    dir->encode_dirstat(bl, session->info, ds);
    dout(20) << "set_trace_dist added dir " << *dir << dendl;

    encode(dn->get_name(), bl);
    if (snapid == CEPH_NOSNAP)
      // live lookup: hand out a dentry lease so the client can cache it
      mds->locker->issue_client_lease(dn, client, bl, now, session);
    else {
      //null lease
      LeaseStat e;
      mds->locker->encode_lease(bl, session->info, e);
    }
    dout(20) << "set_trace_dist added dn " << snapid << " " << *dn << dendl;
  } else
    reply->head.is_dentry = 0;

  // inode
  if (in) {
    in->encode_inodestat(bl, session, NULL, snapid, 0, mdr->getattr_caps);
    dout(20) << "set_trace_dist added in " << *in << dendl;
    reply->head.is_target = 1;
  } else
    reply->head.is_target = 0;

  reply->set_trace(bl);
}
+ if (req->is_replay() || + ((created == inodeno_t() || !mds->is_clientreplay()) && + req->get_op() != CEPH_MDS_OP_OPEN && + req->get_op() != CEPH_MDS_OP_CREATE)) { + dout(5) << "already completed " << req->get_reqid() << dendl; + auto reply = MClientReply::create(*req, 0); + if (created != inodeno_t()) { + bufferlist extra; + encode(created, extra); + reply->set_extra_bl(extra); + } + mds->send_message_client(reply, session); + + if (req->is_queued_for_replay()) + mds->queue_one_replay(); + + return; + } + if (req->get_op() != CEPH_MDS_OP_OPEN && + req->get_op() != CEPH_MDS_OP_CREATE) { + dout(10) << " completed request which created new inode " << created + << ", convert it to lookup request" << dendl; + req->head.op = req->get_dentry_wanted() ? CEPH_MDS_OP_LOOKUP : CEPH_MDS_OP_GETATTR; + req->head.args.getattr.mask = CEPH_STAT_CAP_INODE_ALL; + } + } + } + + // trim completed_request list + if (req->get_oldest_client_tid() > 0) { + dout(15) << " oldest_client_tid=" << req->get_oldest_client_tid() << dendl; + ceph_assert(session); + if (session->trim_completed_requests(req->get_oldest_client_tid())) { + // Sessions 'completed_requests' was dirtied, mark it to be + // potentially flushed at segment expiry. + mdlog->get_current_segment()->touched_sessions.insert(session->info.inst.name); + + if (session->get_num_trim_requests_warnings() > 0 && + session->get_num_completed_requests() * 2 < g_conf()->mds_max_completed_requests) + session->reset_num_trim_requests_warnings(); + } else { + if (session->get_num_completed_requests() >= + (g_conf()->mds_max_completed_requests << session->get_num_trim_requests_warnings())) { + session->inc_num_trim_requests_warnings(); + stringstream ss; + ss << "client." 
<< session->get_client() << " does not advance its oldest_client_tid (" + << req->get_oldest_client_tid() << "), " + << session->get_num_completed_requests() + << " completed requests recorded in session\n"; + mds->clog->warn() << ss.str(); + dout(20) << __func__ << " " << ss.str() << dendl; + } + } + } + + // register + dispatch + MDRequestRef mdr = mdcache->request_start(req); + if (!mdr.get()) + return; + + if (session) { + mdr->session = session; + session->requests.push_back(&mdr->item_session_request); + } + + if (has_completed) + mdr->has_completed = true; + + // process embedded cap releases? + // (only if NOT replay!) + if (!req->releases.empty() && req->get_source().is_client() && !req->is_replay()) { + client_t client = req->get_source().num(); + for (const auto &r : req->releases) { + mds->locker->process_request_cap_release(mdr, client, r.item, r.dname); + } + req->releases.clear(); + } + + dispatch_client_request(mdr); + return; +} + +void Server::handle_osd_map() +{ + /* Note that we check the OSDMAP_FULL flag directly rather than + * using osdmap_full_flag(), because we want to know "is the flag set" + * rather than "does the flag apply to us?" */ + mds->objecter->with_osdmap([this](const OSDMap& o) { + auto pi = o.get_pg_pool(mds->mdsmap->get_metadata_pool()); + is_full = pi && pi->has_flag(pg_pool_t::FLAG_FULL); + dout(7) << __func__ << ": full = " << is_full << " epoch = " + << o.get_epoch() << dendl; + }); +} + +void Server::dispatch_client_request(MDRequestRef& mdr) +{ + // we shouldn't be waiting on anyone. 
+ ceph_assert(!mdr->has_more() || mdr->more()->waiting_on_slave.empty()); + + if (mdr->killed) { + dout(10) << "request " << *mdr << " was killed" << dendl; + return; + } else if (mdr->aborted) { + mdr->aborted = false; + mdcache->request_kill(mdr); + return; + } + + const MClientRequest::const_ref &req = mdr->client_request; + + if (logger) logger->inc(l_mdss_dispatch_client_request); + + dout(7) << "dispatch_client_request " << *req << dendl; + + if (req->may_write()) { + if (mdcache->is_readonly()) { + dout(10) << " read-only FS" << dendl; + respond_to_request(mdr, -EROFS); + return; + } + if (mdr->has_more() && mdr->more()->slave_error) { + dout(10) << " got error from slaves" << dendl; + respond_to_request(mdr, mdr->more()->slave_error); + return; + } + } + + if (is_full) { + if (req->get_op() == CEPH_MDS_OP_SETLAYOUT || + req->get_op() == CEPH_MDS_OP_SETDIRLAYOUT || + req->get_op() == CEPH_MDS_OP_SETLAYOUT || + req->get_op() == CEPH_MDS_OP_RMXATTR || + req->get_op() == CEPH_MDS_OP_SETXATTR || + req->get_op() == CEPH_MDS_OP_CREATE || + req->get_op() == CEPH_MDS_OP_SYMLINK || + req->get_op() == CEPH_MDS_OP_MKSNAP || + ((req->get_op() == CEPH_MDS_OP_LINK || + req->get_op() == CEPH_MDS_OP_RENAME) && + (!mdr->has_more() || mdr->more()->witnessed.empty())) // haven't started slave request + ) { + + dout(20) << __func__ << ": full, responding ENOSPC to op " << ceph_mds_op_name(req->get_op()) << dendl; + respond_to_request(mdr, -ENOSPC); + return; + } else { + dout(20) << __func__ << ": full, permitting op " << ceph_mds_op_name(req->get_op()) << dendl; + } + } + + switch (req->get_op()) { + case CEPH_MDS_OP_LOOKUPHASH: + case CEPH_MDS_OP_LOOKUPINO: + handle_client_lookup_ino(mdr, false, false); + break; + case CEPH_MDS_OP_LOOKUPPARENT: + handle_client_lookup_ino(mdr, true, false); + break; + case CEPH_MDS_OP_LOOKUPNAME: + handle_client_lookup_ino(mdr, false, true); + break; + + // inodes ops. 
+ case CEPH_MDS_OP_LOOKUP: + handle_client_getattr(mdr, true); + break; + + case CEPH_MDS_OP_LOOKUPSNAP: + // lookupsnap does not reference a CDentry; treat it as a getattr + case CEPH_MDS_OP_GETATTR: + handle_client_getattr(mdr, false); + break; + + case CEPH_MDS_OP_SETATTR: + handle_client_setattr(mdr); + break; + case CEPH_MDS_OP_SETLAYOUT: + handle_client_setlayout(mdr); + break; + case CEPH_MDS_OP_SETDIRLAYOUT: + handle_client_setdirlayout(mdr); + break; + case CEPH_MDS_OP_SETXATTR: + handle_client_setxattr(mdr); + break; + case CEPH_MDS_OP_RMXATTR: + handle_client_removexattr(mdr); + break; + + case CEPH_MDS_OP_READDIR: + handle_client_readdir(mdr); + break; + + case CEPH_MDS_OP_SETFILELOCK: + handle_client_file_setlock(mdr); + break; + + case CEPH_MDS_OP_GETFILELOCK: + handle_client_file_readlock(mdr); + break; + + // funky. + case CEPH_MDS_OP_CREATE: + if (mdr->has_completed) + handle_client_open(mdr); // already created.. just open + else + handle_client_openc(mdr); + break; + + case CEPH_MDS_OP_OPEN: + handle_client_open(mdr); + break; + + // namespace. + // no prior locks. 
+ case CEPH_MDS_OP_MKNOD: + handle_client_mknod(mdr); + break; + case CEPH_MDS_OP_LINK: + handle_client_link(mdr); + break; + case CEPH_MDS_OP_UNLINK: + case CEPH_MDS_OP_RMDIR: + handle_client_unlink(mdr); + break; + case CEPH_MDS_OP_RENAME: + handle_client_rename(mdr); + break; + case CEPH_MDS_OP_MKDIR: + handle_client_mkdir(mdr); + break; + case CEPH_MDS_OP_SYMLINK: + handle_client_symlink(mdr); + break; + + + // snaps + case CEPH_MDS_OP_LSSNAP: + handle_client_lssnap(mdr); + break; + case CEPH_MDS_OP_MKSNAP: + handle_client_mksnap(mdr); + break; + case CEPH_MDS_OP_RMSNAP: + handle_client_rmsnap(mdr); + break; + case CEPH_MDS_OP_RENAMESNAP: + handle_client_renamesnap(mdr); + break; + + default: + dout(1) << " unknown client op " << req->get_op() << dendl; + respond_to_request(mdr, -EOPNOTSUPP); + } +} + + +// --------------------------------------- +// SLAVE REQUESTS + +void Server::handle_slave_request(const MMDSSlaveRequest::const_ref &m) +{ + dout(4) << "handle_slave_request " << m->get_reqid() << " from " << m->get_source() << dendl; + mds_rank_t from = mds_rank_t(m->get_source().num()); + + if (logger) logger->inc(l_mdss_handle_slave_request); + + // reply? + if (m->is_reply()) + return handle_slave_request_reply(m); + + // the purpose of rename notify is enforcing causal message ordering. making sure + // bystanders have received all messages from rename srcdn's auth MDS. + if (m->get_op() == MMDSSlaveRequest::OP_RENAMENOTIFY) { + auto reply = MMDSSlaveRequest::create(m->get_reqid(), m->get_attempt(), MMDSSlaveRequest::OP_RENAMENOTIFYACK); + mds->send_message(reply, m->get_connection()); + return; + } + + CDentry *straydn = NULL; + if (m->straybl.length() > 0) { + straydn = mdcache->add_replica_stray(m->straybl, from); + ceph_assert(straydn); + m->straybl.clear(); + } + + // am i a new slave? + MDRequestRef mdr; + if (mdcache->have_request(m->get_reqid())) { + // existing? + mdr = mdcache->request_get(m->get_reqid()); + + // is my request newer? 
+ if (mdr->attempt > m->get_attempt()) { + dout(10) << "local request " << *mdr << " attempt " << mdr->attempt << " > " << m->get_attempt() + << ", dropping " << *m << dendl; + return; + } + + + if (mdr->attempt < m->get_attempt()) { + // mine is old, close it out + dout(10) << "local request " << *mdr << " attempt " << mdr->attempt << " < " << m->get_attempt() + << ", closing out" << dendl; + mdcache->request_finish(mdr); + mdr.reset(); + } else if (mdr->slave_to_mds != from) { + dout(10) << "local request " << *mdr << " not slave to mds." << from << dendl; + return; + } + + if (m->get_op() == MMDSSlaveRequest::OP_FINISH && m->is_abort()) { + mdr->aborted = true; + if (mdr->slave_request) { + // only abort on-going xlock, wrlock and auth pin + ceph_assert(!mdr->slave_did_prepare()); + } else { + mdcache->request_finish(mdr); + } + return; + } + } + if (!mdr.get()) { + // new? + if (m->get_op() == MMDSSlaveRequest::OP_FINISH) { + dout(10) << "missing slave request for " << m->get_reqid() + << " OP_FINISH, must have lost race with a forward" << dendl; + return; + } + mdr = mdcache->request_start_slave(m->get_reqid(), m->get_attempt(), m); + mdr->set_op_stamp(m->op_stamp); + } + ceph_assert(mdr->slave_request == 0); // only one at a time, please! 
+ + if (straydn) { + mdr->pin(straydn); + mdr->straydn = straydn; + } + + if (!mds->is_clientreplay() && !mds->is_active() && !mds->is_stopping()) { + dout(3) << "not clientreplay|active yet, waiting" << dendl; + mds->wait_for_replay(new C_MDS_RetryMessage(mds, m)); + return; + } else if (mds->is_clientreplay() && !mds->mdsmap->is_clientreplay(from) && + mdr->locks.empty()) { + dout(3) << "not active yet, waiting" << dendl; + mds->wait_for_active(new C_MDS_RetryMessage(mds, m)); + return; + } + + mdr->reset_slave_request(m); + + dispatch_slave_request(mdr); +} + +void Server::handle_slave_request_reply(const MMDSSlaveRequest::const_ref &m) +{ + mds_rank_t from = mds_rank_t(m->get_source().num()); + + if (!mds->is_clientreplay() && !mds->is_active() && !mds->is_stopping()) { + metareqid_t r = m->get_reqid(); + if (!mdcache->have_uncommitted_master(r, from)) { + dout(10) << "handle_slave_request_reply ignoring slave reply from mds." + << from << " reqid " << r << dendl; + return; + } + dout(3) << "not clientreplay|active yet, waiting" << dendl; + mds->wait_for_replay(new C_MDS_RetryMessage(mds, m)); + return; + } + + if (m->get_op() == MMDSSlaveRequest::OP_COMMITTED) { + metareqid_t r = m->get_reqid(); + mdcache->committed_master_slave(r, from); + return; + } + + MDRequestRef mdr = mdcache->request_get(m->get_reqid()); + if (m->get_attempt() != mdr->attempt) { + dout(10) << "handle_slave_request_reply " << *mdr << " ignoring reply from other attempt " + << m->get_attempt() << dendl; + return; + } + + switch (m->get_op()) { + case MMDSSlaveRequest::OP_XLOCKACK: + { + // identify lock, master request + SimpleLock *lock = mds->locker->get_lock(m->get_lock_type(), + m->get_object_info()); + mdr->more()->slaves.insert(from); + lock->decode_locked_state(m->get_lock_data()); + dout(10) << "got remote xlock on " << *lock << " on " << *lock->get_parent() << dendl; + mdr->locks.emplace_hint(mdr->locks.end(), lock, MutationImpl::LockOp::XLOCK); + mdr->finish_locking(lock); + 
/*
 * Dispatch a slave request (an operation delegated to this MDS by the
 * master MDS of a multi-MDS operation) to its op-specific handler.
 * Lock grant/release cases are handled inline; the exact ordering of
 * acquire/ack and finish/reset steps is part of the inter-MDS protocol.
 *
 * @param mdr the slave request; mdr->slave_request holds the message
 */
void Server::dispatch_slave_request(MDRequestRef& mdr)
{
  dout(7) << "dispatch_slave_request " << *mdr << " " << *mdr->slave_request << dendl;

  if (mdr->aborted) {
    dout(7) << " abort flag set, finishing" << dendl;
    mdcache->request_finish(mdr);
    return;
  }

  if (logger) logger->inc(l_mdss_dispatch_slave_request);

  int op = mdr->slave_request->get_op();
  switch (op) {
  case MMDSSlaveRequest::OP_XLOCK:
  case MMDSSlaveRequest::OP_WRLOCK:
    {
      // identify object
      SimpleLock *lock = mds->locker->get_lock(mdr->slave_request->get_lock_type(),
					       mdr->slave_request->get_object_info());

      if (!lock) {
	dout(10) << "don't have object, dropping" << dendl;
	ceph_abort(); // can this happen, if we auth pinned properly.
      }
      if (op == MMDSSlaveRequest::OP_XLOCK && !lock->get_parent()->is_auth()) {
	// auth moved since the master sent this; silently drop, the master
	// will retry against the new auth
	dout(10) << "not auth for remote xlock attempt, dropping on "
		 << *lock << " on " << *lock->get_parent() << dendl;
      } else {
	// use acquire_locks so that we get auth_pinning.
	// re-list every lock we already hold so acquire_locks sees the
	// complete set and does not drop them
	MutationImpl::LockOpVec lov;
	for (const auto& p : mdr->locks) {
	  if (p.is_xlock())
	    lov.add_xlock(p.lock);
	  else if (p.is_wrlock())
	    lov.add_wrlock(p.lock);
	}

	int replycode = 0;
	switch (op) {
	case MMDSSlaveRequest::OP_XLOCK:
	  lov.add_xlock(lock);
	  replycode = MMDSSlaveRequest::OP_XLOCKACK;
	  break;
	case MMDSSlaveRequest::OP_WRLOCK:
	  lov.add_wrlock(lock);
	  replycode = MMDSSlaveRequest::OP_WRLOCKACK;
	  break;
	}

	if (!mds->locker->acquire_locks(mdr, lov))
	  return;  // blocked; we will be re-dispatched when locks are ready

	// ack
	auto r = MMDSSlaveRequest::create(mdr->reqid, mdr->attempt, replycode);
	r->set_lock_type(lock->get_type());
	lock->get_parent()->set_object_info(r->get_object_info());
	if (replycode == MMDSSlaveRequest::OP_XLOCKACK)
	  // xlock grants carry the locked state so the master can replicate it
	  lock->encode_locked_state(r->get_lock_data());
	mds->send_message(r, mdr->slave_request->get_connection());
      }

      // done.
      mdr->reset_slave_request();
    }
    break;

  case MMDSSlaveRequest::OP_UNXLOCK:
  case MMDSSlaveRequest::OP_UNWRLOCK:
    {
      SimpleLock *lock = mds->locker->get_lock(mdr->slave_request->get_lock_type(),
					       mdr->slave_request->get_object_info());
      ceph_assert(lock);
      auto it = mdr->locks.find(lock);
      ceph_assert(it != mdr->locks.end());
      bool need_issue = false;
      switch (op) {
      case MMDSSlaveRequest::OP_UNXLOCK:
	mds->locker->xlock_finish(it, mdr.get(), &need_issue);
	break;
      case MMDSSlaveRequest::OP_UNWRLOCK:
	mds->locker->wrlock_finish(it, mdr.get(), &need_issue);
	break;
      }
      if (need_issue)
	// releasing the lock may allow caps to be (re)issued to clients
	mds->locker->issue_caps(static_cast<CInode*>(lock->get_parent()));

      // done.  no ack necessary.
      mdr->reset_slave_request();
    }
    break;

  case MMDSSlaveRequest::OP_DROPLOCKS:
    mds->locker->drop_locks(mdr.get());
    mdr->reset_slave_request();
    break;

  case MMDSSlaveRequest::OP_AUTHPIN:
    handle_slave_auth_pin(mdr);
    break;

  case MMDSSlaveRequest::OP_LINKPREP:
  case MMDSSlaveRequest::OP_UNLINKPREP:
    handle_slave_link_prep(mdr);
    break;

  case MMDSSlaveRequest::OP_RMDIRPREP:
    handle_slave_rmdir_prep(mdr);
    break;

  case MMDSSlaveRequest::OP_RENAMEPREP:
    handle_slave_rename_prep(mdr);
    break;

  case MMDSSlaveRequest::OP_FINISH:
    // information about rename imported caps
    if (mdr->slave_request->inode_export.length() > 0)
      mdr->more()->inode_import = mdr->slave_request->inode_export;
    // finish off request.
    mdcache->request_finish(mdr);
    break;

  default:
    ceph_abort();
  }
}
/*
 * Handle an AUTHPINACK from a slave MDS: reconcile our record of which
 * objects the slave holds auth pins on against the list in the ack,
 * record any error the slave reported, and re-dispatch the master
 * request once no more slaves are outstanding.
 *
 * @param mdr the master request that asked for the auth pins
 * @param ack the slave's AUTHPINACK message
 */
void Server::handle_slave_auth_pin_ack(MDRequestRef& mdr, const MMDSSlaveRequest::const_ref &ack)
{
  dout(10) << "handle_slave_auth_pin_ack on " << *mdr << " " << *ack << dendl;
  mds_rank_t from = mds_rank_t(ack->get_source().num());

  // added auth pins?
  set<MDSCacheObject*> pinned;
  for (const auto &oi : ack->get_authpins()) {
    MDSCacheObject *object = mdcache->get_object(oi);
    ceph_assert(object);  // we pinned it
    dout(10) << " remote has pinned " << *object << dendl;
    if (!mdr->is_auth_pinned(object))
      mdr->remote_auth_pins[object] = from;
    // empty authpin_freeze compares unequal to every real oi, so this only
    // fires when the slave actually froze an inode for us
    if (oi == ack->get_authpin_freeze())
      mdr->set_remote_frozen_auth_pin(static_cast<CInode *>(object));
    pinned.insert(object);
  }

  // removed frozen auth pin ?
  // (a default-constructed MDSCacheObjectInfo in the ack means the slave
  // no longer holds the freeze)
  if (mdr->more()->is_remote_frozen_authpin &&
      ack->get_authpin_freeze() == MDSCacheObjectInfo()) {
    auto p = mdr->remote_auth_pins.find(mdr->more()->rename_inode);
    ceph_assert(p != mdr->remote_auth_pins.end());
    if (p->second == from) {
      mdr->more()->is_remote_frozen_authpin = false;
    }
  }

  // removed auth pins?
  // anything we recorded for this slave that is absent from the ack's list
  // has been dropped on the remote side
  auto p = mdr->remote_auth_pins.begin();
  while (p != mdr->remote_auth_pins.end()) {
    MDSCacheObject* object = p->first;
    if (p->second == from && pinned.count(object) == 0) {
      dout(10) << " remote has unpinned " << *object << dendl;
      mdr->remote_auth_pins.erase(p++);
    } else {
      ++p;
    }
  }

  // record slave-side failure; the master request will see slave_error and
  // abort when it is re-dispatched
  if (ack->is_error_rofs()) {
    mdr->more()->slave_error = -EROFS;
    mdr->aborted = true;
  } else if (ack->is_error_wouldblock()) {
    mdr->more()->slave_error = -EWOULDBLOCK;
    mdr->aborted = true;
  }

  // note slave
  mdr->more()->slaves.insert(from);

  // clear from waiting list
  ceph_assert(mdr->more()->waiting_on_slave.count(from));
  mdr->more()->waiting_on_slave.erase(from);

  // go again?
  if (mdr->more()->waiting_on_slave.empty())
    mdcache->dispatch_request(mdr);
  else
    dout(10) << "still waiting on slaves " << mdr->more()->waiting_on_slave << dendl;
}
+ */ +bool Server::check_access(MDRequestRef& mdr, CInode *in, unsigned mask) +{ + if (mdr->session) { + int r = mdr->session->check_access( + in, mask, + mdr->client_request->get_caller_uid(), + mdr->client_request->get_caller_gid(), + &mdr->client_request->get_caller_gid_list(), + mdr->client_request->head.args.setattr.uid, + mdr->client_request->head.args.setattr.gid); + if (r < 0) { + respond_to_request(mdr, r); + return false; + } + } + return true; +} + +/** + * check whether fragment has reached maximum size + * + */ +bool Server::check_fragment_space(MDRequestRef &mdr, CDir *in) +{ + const auto size = in->get_frag_size(); + if (size >= g_conf()->mds_bal_fragment_size_max) { + dout(10) << "fragment " << *in << " size exceeds " << g_conf()->mds_bal_fragment_size_max << " (ENOSPC)" << dendl; + respond_to_request(mdr, -ENOSPC); + return false; + } + + return true; +} + + +/** validate_dentry_dir + * + * verify that the dir exists and would own the dname. + * do not check if the dentry exists. + */ +CDir *Server::validate_dentry_dir(MDRequestRef& mdr, CInode *diri, std::string_view dname) +{ + // make sure parent is a dir? + if (!diri->is_dir()) { + dout(7) << "validate_dentry_dir: not a dir" << dendl; + respond_to_request(mdr, -ENOTDIR); + return NULL; + } + + // which dirfrag? + frag_t fg = diri->pick_dirfrag(dname); + CDir *dir = try_open_auth_dirfrag(diri, fg, mdr); + if (!dir) + return 0; + + // frozen? + if (dir->is_frozen()) { + dout(7) << "dir is frozen " << *dir << dendl; + dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr)); + return NULL; + } + + return dir; +} + + +/** prepare_null_dentry + * prepare a null (or existing) dentry in given dir. + * wait for any dn lock. 
 *
 * @param mdr active request
 * @param dir auth dirfrag that should own dname (must be auth here)
 * @param dname dentry name to prepare
 * @param okexist if true, an existing non-null dentry is returned rather
 *        than replying -EEXIST
 * @return the dentry, or 0 if the request was handled (replied to, or
 *         queued for retry after a dirfrag fetch)
 */
CDentry* Server::prepare_null_dentry(MDRequestRef& mdr, CDir *dir, std::string_view dname, bool okexist)
{
  dout(10) << "prepare_null_dentry " << dname << " in " << *dir << dendl;
  ceph_assert(dir->is_auth());

  client_t client = mdr->get_client();

  // does it already exist?
  CDentry *dn = dir->lookup(dname);
  if (dn) {
    /*
    if (dn->lock.is_xlocked_by_other(mdr)) {
      dout(10) << "waiting on xlocked dentry " << *dn << dendl;
      dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryRequest(mdcache, mdr));
      return 0;
    }
    */
    if (!dn->get_linkage(client, mdr)->is_null()) {
      // name already exists
      dout(10) << "dentry " << dname << " exists in " << *dir << dendl;
      if (!okexist) {
        respond_to_request(mdr, -EEXIST);
        return 0;
      }
    } else {
      // reuse the existing null dentry, but bump its 'first' so the new
      // link starts no earlier than the next global snapshot sequence.
      snapid_t next_snap = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
      dn->first = std::max(dn->first, next_snap);
    }
    return dn;
  }

  // make sure dir is complete
  // (the bloom filter lets us skip the fetch when the name certainly
  // does not exist on disk)
  if (!dir->is_complete() && (!dir->has_bloom() || dir->is_in_bloom(dname))) {
    dout(7) << " incomplete dir contents for " << *dir << ", fetching" << dendl;
    dir->fetch(new C_MDS_RetryRequest(mdcache, mdr));
    return 0;
  }

  // create
  dn = dir->add_null_dentry(dname, mdcache->get_global_snaprealm()->get_newest_seq() + 1);
  dn->mark_new();
  dout(10) << "prepare_null_dentry added " << *dn << dendl;
  return dn;
}

/** prepare_stray_dentry
 * Find or create (and pin) the stray-directory dentry that will hold
 * inode 'in' once it is unlinked.  Caches the result in mdr->straydn so
 * a retried request reuses it.  Returns NULL if the stray dirfrag is
 * full (request then already answered with -ENOSPC).
 */
CDentry* Server::prepare_stray_dentry(MDRequestRef& mdr, CInode *in)
{
  CDentry *straydn = mdr->straydn;
  if (straydn) {
    string straydname;
    in->name_stray_dentry(straydname);
    if (straydn->get_name() == straydname)
      return straydn;

    // cached stray dentry no longer matches; must not have locked yet
    ceph_assert(!mdr->done_locking);
    mdr->unpin(straydn);
  }

  CDir *straydir = mdcache->get_stray_dir(in);

  if (!mdr->client_request->is_replay() &&
      !check_fragment_space(mdr, straydir))
    return NULL;

  straydn = mdcache->get_or_create_stray_dentry(in);
  mdr->straydn = straydn;
  mdr->pin(straydn);
  return straydn;
}

/** prepare_new_inode
 *
 * create a
new inode. set c/m/atime. hit dir pop.
 *
 * Allocates an ino (preferring the session's preallocated pool),
 * refills the prealloc pool when it runs low, and initializes mode,
 * ownership, layout, times and any xattrs supplied in the request data.
 */
CInode* Server::prepare_new_inode(MDRequestRef& mdr, CDir *dir, inodeno_t useino, unsigned mode,
				  file_layout_t *layout)
{
  CInode *in = new CInode(mdcache);

  // Server::prepare_force_open_sessions() can re-open session in closing
  // state. In that corner case, session's prealloc_inos are being freed.
  // To simplify the code, we disallow using/refilling session's prealloc_ino
  // while session is opening.
  bool allow_prealloc_inos = mdr->session->is_open();

  // assign ino
  if (allow_prealloc_inos &&
      mdr->session->info.prealloc_inos.size()) {
    mdr->used_prealloc_ino =
      in->inode.ino = mdr->session->take_ino(useino);  // prealloc -> used
    mds->sessionmap.mark_projected(mdr->session);

    dout(10) << "prepare_new_inode used_prealloc " << mdr->used_prealloc_ino
	     << " (" << mdr->session->info.prealloc_inos
	     << ", " << mdr->session->info.prealloc_inos.size() << " left)"
	     << dendl;
  } else {
    mdr->alloc_ino =
      in->inode.ino = mds->inotable->project_alloc_id(useino);
    dout(10) << "prepare_new_inode alloc " << mdr->alloc_ino << dendl;
  }

  // the client's requested ino (e.g. on replay) could not be honored;
  // log loudly but continue with what we allocated.
  if (useino && useino != in->inode.ino) {
    dout(0) << "WARNING: client specified " << useino << " and i allocated " << in->inode.ino << dendl;
    mds->clog->error() << mdr->client_request->get_source()
       << " specified ino " << useino
       << " but mds." << mds->get_nodeid() << " allocated " << in->inode.ino;
    //ceph_abort(); // just for now.
  }

  // top up the session's preallocated ino pool once it drops below half
  // of mds_client_prealloc_inos
  if (allow_prealloc_inos &&
      mdr->session->get_num_projected_prealloc_inos() < g_conf()->mds_client_prealloc_inos / 2) {
    int need = g_conf()->mds_client_prealloc_inos - mdr->session->get_num_projected_prealloc_inos();
    mds->inotable->project_alloc_ids(mdr->prealloc_inos, need);
    ceph_assert(mdr->prealloc_inos.size());  // or else fix projected increment semantics
    mdr->session->pending_prealloc_inos.insert(mdr->prealloc_inos);
    mds->sessionmap.mark_projected(mdr->session);
    dout(10) << "prepare_new_inode prealloc " << mdr->prealloc_inos << dendl;
  }

  in->inode.version = 1;
  in->inode.xattr_version = 1;
  in->inode.nlink = 1;   // FIXME

  in->inode.mode = mode;

  // FIPS zeroization audit 20191117: this memset is not security related.
  memset(&in->inode.dir_layout, 0, sizeof(in->inode.dir_layout));
  if (in->inode.is_dir()) {
    in->inode.dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash;
  } else if (layout) {
    in->inode.layout = *layout;
  } else {
    in->inode.layout = mdcache->default_file_layout;
  }

  in->inode.truncate_size = -1ull;  // not truncated, yet!
  in->inode.truncate_seq = 1; /* starting with 1, 0 is kept for no-truncation logic */

  CInode *diri = dir->get_inode();

  dout(10) << oct << " dir mode 0" << diri->inode.mode << " new mode 0" << mode << dec << dendl;

  // NOTE(review): the dout text says "sticky" but the bit tested is
  // S_ISGID (setgid group inheritance), not the sticky bit.
  if (diri->inode.mode & S_ISGID) {
    dout(10) << " dir is sticky" << dendl;
    in->inode.gid = diri->inode.gid;
    if (S_ISDIR(mode)) {
      dout(10) << " new dir also sticky" << dendl;
      in->inode.mode |= S_ISGID;
    }
  } else
    in->inode.gid = mdr->client_request->get_caller_gid();

  in->inode.uid = mdr->client_request->get_caller_uid();

  in->inode.btime = in->inode.ctime = in->inode.mtime = in->inode.atime =
    mdr->get_op_stamp();

  in->inode.change_attr = 0;

  const MClientRequest::const_ref &req = mdr->client_request;
  if (req->get_data().length()) {
    auto p = req->get_data().cbegin();

    // xattrs on new inode?
    CInode::mempool_xattr_map xattrs;
    decode_noshare(xattrs, p);
    for (const auto &p : xattrs) {
      dout(10) << "prepare_new_inode setting xattr " << p.first << dendl;
      auto em = in->xattrs.emplace(std::piecewise_construct, std::forward_as_tuple(p.first), std::forward_as_tuple(p.second));
      if (!em.second)
        em.first->second = p.second;
    }
  }

  if (!mds->mdsmap->get_inline_data_enabled() ||
      !mdr->session->get_connection()->has_feature(CEPH_FEATURE_MDS_INLINE_DATA))
    in->inode.inline_data.version = CEPH_INLINE_NONE;

  mdcache->add_inode(in);  // add
  dout(10) << "prepare_new_inode " << *in << dendl;
  return in;
}

/** journal_allocated_inos
 * Record this request's ino allocations (single alloc, used prealloc,
 * and new prealloc batch) plus the projected sessionmap/inotable
 * versions into the given EMetaBlob.
 */
void Server::journal_allocated_inos(MDRequestRef& mdr, EMetaBlob *blob)
{
  dout(20) << "journal_allocated_inos sessionmapv " << mds->sessionmap.get_projected()
	   << " inotablev " << mds->inotable->get_projected_version()
	   << dendl;
  blob->set_ino_alloc(mdr->alloc_ino,
		      mdr->used_prealloc_ino,
		      mdr->prealloc_inos,
		      mdr->client_request->get_source(),
		      mds->sessionmap.get_projected(),
		      mds->inotable->get_projected_version());
}

/** apply_allocated_inos
 * Commit the projected ino allocations after journal commit: apply to
 * the inotable, move preallocs from pending to the session's pool, and
 * retire a used prealloc ino from the session.
 */
void Server::apply_allocated_inos(MDRequestRef& mdr, Session *session)
{
  dout(10) << "apply_allocated_inos " << mdr->alloc_ino
	   << " / " << mdr->prealloc_inos
	   << " / " << mdr->used_prealloc_ino << dendl;

  if (mdr->alloc_ino) {
    mds->inotable->apply_alloc_id(mdr->alloc_ino);
  }
  if (mdr->prealloc_inos.size()) {
    ceph_assert(session);
    session->pending_prealloc_inos.subtract(mdr->prealloc_inos);
    session->info.prealloc_inos.insert(mdr->prealloc_inos);
    mds->sessionmap.mark_dirty(session, !mdr->used_prealloc_ino);
    mds->inotable->apply_alloc_ids(mdr->prealloc_inos);
  }
  if (mdr->used_prealloc_ino) {
    ceph_assert(session);
    session->info.used_inos.erase(mdr->used_prealloc_ino);
    mds->sessionmap.mark_dirty(session);
  }
}

// Completion for mdcache->find_ino_peers(): re-dispatch the request once
// the inode's location is known, or fail it with -ESTALE.
class C_MDS_TryFindInode : public ServerContext {
  MDRequestRef mdr;
public:
  C_MDS_TryFindInode(Server *s, MDRequestRef& r) : ServerContext(s), mdr(r) {}
  void
finish(int r) override {
    if (r == -ESTALE) // :( find_ino_peers failed
      server->respond_to_request(mdr, r);
    else
      server->dispatch_client_request(mdr);
  }
};

// Context factory handed to path_traverse(): every wait context it
// builds retries the same MDS request.
class CF_MDS_MDRContextFactory : public MDSContextFactory {
public:
  CF_MDS_MDRContextFactory(MDCache *cache, MDRequestRef &mdr) : cache(cache), mdr(mdr) {}
  MDSContext *build() {
    return new C_MDS_RetryRequest(cache, mdr);
  }
private:
  MDCache *cache;
  MDRequestRef mdr;
};

/** traverse_to_auth_dir
 * Traverse to the parent directory of refpath's final dentry and return
 * the auth dirfrag that would own that name.  Returns 0 when the
 * request has been handled (replied to, forwarded, or queued for
 * retry); on -ESTALE the ino is searched for among peer MDSs first.
 */
CDir *Server::traverse_to_auth_dir(MDRequestRef& mdr, vector<CDentry*> &trace, filepath refpath)
{
  // figure parent dir vs dname
  if (refpath.depth() == 0) {
    dout(7) << "can't do that to root" << dendl;
    respond_to_request(mdr, -EINVAL);
    return 0;
  }
  string dname = refpath.last_dentry();
  refpath.pop_dentry();

  dout(10) << "traverse_to_auth_dir dirpath " << refpath << " dname " << dname << dendl;

  // traverse to parent dir
  CInode *diri;
  CF_MDS_MDRContextFactory cf(mdcache, mdr);
  int r = mdcache->path_traverse(mdr, cf, refpath, &trace, &diri, MDS_TRAVERSE_FORWARD);
  if (r > 0) return 0; // delayed
  if (r < 0) {
    if (r == -ESTALE) {
      dout(10) << "FAIL on ESTALE but attempting recovery" << dendl;
      mdcache->find_ino_peers(refpath.get_ino(), new C_MDS_TryFindInode(this, mdr));
      return 0;
    }
    respond_to_request(mdr, r);
    return 0;
  }

  // is it an auth dir?
  CDir *dir = validate_dentry_dir(mdr, diri, dname);
  if (!dir)
    return 0; // forwarded or waiting for freeze

  dout(10) << "traverse_to_auth_dir " << *dir << dendl;
  return dir;
}

/* If this returns null, the request has been handled
 * as appropriate: forwarded on, or the client's been replied to */
CInode* Server::rdlock_path_pin_ref(MDRequestRef& mdr, int n,
				    MutationImpl::LockOpVec& lov,
				    bool want_auth,
				    bool no_want_auth, /* for readdir, who doesn't want auth _even_if_ it's
							  a snapped dir */
				    file_layout_t **layout,
				    bool no_lookup)  // true if we cannot return a null dentry lease
{
  // n selects which of the request's two filepaths to traverse
  const filepath& refpath = n ? mdr->get_filepath2() : mdr->get_filepath();
  dout(10) << "rdlock_path_pin_ref " << *mdr << " " << refpath << dendl;

  // already locked on a previous dispatch; reuse the traversal result
  if (mdr->done_locking)
    return mdr->in[n];

  // traverse
  CF_MDS_MDRContextFactory cf(mdcache, mdr);
  int r = mdcache->path_traverse(mdr, cf, refpath, &mdr->dn[n], &mdr->in[n], MDS_TRAVERSE_FORWARD);
  if (r > 0)
    return NULL; // delayed
  if (r < 0) {  // error
    if (r == -ENOENT && n == 0 && !mdr->dn[n].empty()) {
      // let the client cache a null dentry lease unless forbidden
      if (!no_lookup) {
	mdr->tracedn = mdr->dn[n].back();
      }
      respond_to_request(mdr, r);
    } else if (r == -ESTALE) {
      dout(10) << "FAIL on ESTALE but attempting recovery" << dendl;
      MDSContext *c = new C_MDS_TryFindInode(this, mdr);
      mdcache->find_ino_peers(refpath.get_ino(), c);
    } else {
      dout(10) << "FAIL on error " << r << dendl;
      respond_to_request(mdr, r);
    }
    return 0;
  }
  CInode *ref = mdr->in[n];
  dout(10) << "ref is " << *ref << dendl;

  // fw to inode auth?
  if (mdr->snapid != CEPH_NOSNAP && !no_want_auth)
    want_auth = true;

  if (want_auth) {
    if (ref->is_ambiguous_auth()) {
      dout(10) << "waiting for single auth on " << *ref << dendl;
      ref->add_waiter(CInode::WAIT_SINGLEAUTH, new C_MDS_RetryRequest(mdcache, mdr));
      return 0;
    }
    if (!ref->is_auth()) {
      dout(10) << "fw to auth for " << *ref << dendl;
      mdcache->request_forward(mdr, ref->authority().first);
      return 0;
    }

    // auth_pin?
    //   do NOT proceed if freezing, as cap release may defer in that case, and
    //   we could deadlock when we try to lock @ref.
    // if we're already auth_pinned, continue; the release has already been processed.
    if (ref->is_frozen() || ref->is_frozen_auth_pin() ||
	(ref->is_freezing() && !mdr->is_auth_pinned(ref))) {
      dout(7) << "waiting for !frozen/authpinnable on " << *ref << dendl;
      ref->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
      /* If we have any auth pins, this will deadlock.
       * But the only way to get here if we've already got auth pins
       * is because we're on an inode with snapshots that got updated
       * between dispatches of this request. So we're going to drop
       * our locks and our auth pins and reacquire them later.
       *
       * This is safe since we're only in this function when working on
       * a single MDS request; otherwise we'd be in
       * rdlock_path_xlock_dentry.
       */
      mds->locker->drop_locks(mdr.get(), NULL);
      mdr->drop_local_auth_pins();
      if (!mdr->remote_auth_pins.empty())
	mds->locker->notify_freeze_waiter(ref);
      return 0;
    }

    mdr->auth_pin(ref);
  }

  // rdlock every traversed dentry, plus snap locks (and optionally the
  // layout-relevant locks) on the target inode
  for (int i=0; i<(int)mdr->dn[n].size(); i++)
    lov.add_rdlock(&mdr->dn[n][i]->lock);
  if (layout)
    mds->locker->include_snap_rdlocks_wlayout(ref, lov, layout);
  else
    mds->locker->include_snap_rdlocks(ref, lov);

  // set and pin ref
  mdr->pin(ref);
  return ref;
}


/** rdlock_path_xlock_dentry
 * traverse path to the directory that could/would contain dentry.
 * make sure i am auth for that dentry, forward as necessary.
 * create null dentry in place (or use existing if okexist).
 * get rdlocks on traversed dentries, xlock on new dentry.
 *
 * Returns 0 when the request was handled (replied to, forwarded, or
 * queued for retry).
 */
CDentry* Server::rdlock_path_xlock_dentry(MDRequestRef& mdr, int n,
					  MutationImpl::LockOpVec& lov,
					  bool okexist, bool mustexist, bool alwaysxlock,
					  file_layout_t **layout)
{
  const filepath& refpath = n ? mdr->get_filepath2() : mdr->get_filepath();

  dout(10) << "rdlock_path_xlock_dentry " << *mdr << " " << refpath << dendl;

  client_t client = mdr->get_client();

  // already locked on a previous dispatch of this request
  if (mdr->done_locking)
    return mdr->dn[n].back();

  CDir *dir = traverse_to_auth_dir(mdr, mdr->dn[n], refpath);
  if (!dir) return 0;

  CInode *diri = dir->get_inode();
  if (!mdr->reqid.name.is_mds()) {
    // clients may not modify system directories (mdsdir etc.)
    if (diri->is_system() && !diri->is_root()) {
      respond_to_request(mdr, -EROFS);
      return 0;
    }
  }
  // a parent that has been moved into a stray dir is effectively gone
  if (!diri->is_base() && diri->get_projected_parent_dir()->inode->is_stray()) {
    respond_to_request(mdr, -ENOENT);
    return 0;
  }

  // make a null dentry?
  std::string_view dname = refpath.last_dentry();
  CDentry *dn;
  if (mustexist) {
    dn = dir->lookup(dname);

    // make sure dir is complete
    if (!dn && !dir->is_complete() &&
	(!dir->has_bloom() || dir->is_in_bloom(dname))) {
      dout(7) << " incomplete dir contents for " << *dir << ", fetching" << dendl;
      dir->fetch(new C_MDS_RetryRequest(mdcache, mdr));
      return 0;
    }

    // readable?
    if (dn && !dn->lock.can_read(client) && dn->lock.get_xlock_by() != mdr) {
      dout(10) << "waiting on xlocked dentry " << *dn << dendl;
      dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryRequest(mdcache, mdr));
      return 0;
    }

    // exists?
    if (!dn || dn->get_linkage(client, mdr)->is_null()) {
      dout(7) << "dentry " << dname << " dne in " << *dir << dendl;
      respond_to_request(mdr, -ENOENT);
      return 0;
    }
  } else {
    dn = prepare_null_dentry(mdr, dir, dname, okexist);
    if (!dn)
      return 0;
  }

  mdr->dn[n].push_back(dn);
  CDentry::linkage_t *dnl = dn->get_linkage(client, mdr);
  mdr->in[n] = dnl->get_inode();

  // -- lock --
  // NOTE: rename takes the same set of locks for srcdn
  for (int i=0; i<(int)mdr->dn[n].size(); i++)
    lov.add_rdlock(&mdr->dn[n][i]->lock);
  if (alwaysxlock || dnl->is_null())
    lov.add_xlock(&dn->lock);                 // new dn, xlock
  else
    lov.add_rdlock(&dn->lock);  // existing dn, rdlock
  lov.add_wrlock(&dn->get_dir()->inode->filelock); // also, wrlock on dir mtime
  lov.add_wrlock(&dn->get_dir()->inode->nestlock); // also, wrlock on dir mtime
  if (layout)
    mds->locker->include_snap_rdlocks_wlayout(dn->get_dir()->inode, lov, layout);
  else
    mds->locker->include_snap_rdlocks(dn->get_dir()->inode, lov);

  return dn;
}





/**
 * try_open_auth_dirfrag -- open dirfrag, or forward to dirfrag auth
 *
 * @param diri base inode
 * @param fg the exact frag we want
 * @param mdr request
 * @returns the pointer, or NULL if it had to be delayed (but mdr is taken care of)
 */
CDir* Server::try_open_auth_dirfrag(CInode *diri, frag_t fg, MDRequestRef& mdr)
{
  CDir *dir = diri->get_dirfrag(fg);

  // not open and inode not mine?
  if (!dir && !diri->is_auth()) {
    mds_rank_t inauth = diri->authority().first;
    dout(7) << "try_open_auth_dirfrag: not open, not inode auth, fw to mds." << inauth << dendl;
    mdcache->request_forward(mdr, inauth);
    return 0;
  }

  // not open and inode frozen?
  if (!dir && diri->is_frozen()) {
    dout(10) << "try_open_auth_dirfrag: dir inode is frozen, waiting " << *diri << dendl;
    ceph_assert(diri->get_parent_dir());
    diri->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
    return 0;
  }

  // invent?
  if (!dir)
    dir = diri->get_or_open_dirfrag(mdcache, fg);

  // am i auth for the dirfrag?
  if (!dir->is_auth()) {
    mds_rank_t auth = dir->authority().first;
    dout(7) << "try_open_auth_dirfrag: not auth for " << *dir
	    << ", fw to mds." << auth << dendl;
    mdcache->request_forward(mdr, auth);
    return 0;
  }

  return dir;
}


// ===============================================================================
// STAT

/** handle_client_getattr
 * Handle CEPH_MDS_OP_GETATTR / CEPH_MDS_OP_LOOKUP: rdlock the inode
 * fields named by the getattr mask (skipping fields the client already
 * holds EXCL caps on) and reply with a trace of the inode.
 */
void Server::handle_client_getattr(MDRequestRef& mdr, bool is_lookup)
{
  const MClientRequest::const_ref &req = mdr->client_request;

  if (req->get_filepath().depth() == 0 && is_lookup) {
    // refpath can't be empty for lookup but it can for
    // getattr (we do getattr with empty refpath for mount of '/')
    respond_to_request(mdr, -EINVAL);
    return;
  }

  bool want_auth = false;
  int mask = req->head.args.getattr.mask;
  if (mask & CEPH_STAT_RSTAT)
    want_auth = true; // set want_auth for CEPH_STAT_RSTAT mask

  MutationImpl::LockOpVec lov;
  CInode *ref = rdlock_path_pin_ref(mdr, 0, lov, want_auth, false, NULL,
				    !is_lookup);
  if (!ref) return;

  /*
   * if client currently holds the EXCL cap on a field, do not rdlock
   * it; client's stat() will result in valid info if _either_ EXCL
   * cap is held or MDS rdlocks and reads the value here.
   *
   * handling this case here is easier than weakening rdlock
   * semantics... that would cause problems elsewhere.
   */
  client_t client = mdr->get_client();
  int issued = 0;
  Capability *cap = ref->get_client_cap(client);
  if (cap && (mdr->snapid == CEPH_NOSNAP ||
	      mdr->snapid <= cap->client_follows))
    issued = cap->issued();

  if ((mask & CEPH_CAP_LINK_SHARED) && !(issued & CEPH_CAP_LINK_EXCL))
    lov.add_rdlock(&ref->linklock);
  if ((mask & CEPH_CAP_AUTH_SHARED) && !(issued & CEPH_CAP_AUTH_EXCL))
    lov.add_rdlock(&ref->authlock);
  if ((mask & CEPH_CAP_XATTR_SHARED) && !(issued & CEPH_CAP_XATTR_EXCL))
    lov.add_rdlock(&ref->xattrlock);
  if ((mask & CEPH_CAP_FILE_SHARED) && !(issued & CEPH_CAP_FILE_EXCL)) {
    // Don't wait on unstable filelock if client is allowed to read file size.
    // This can reduce the response time of getattr in the case that multiple
    // clients do stat(2) and there are writers.
    // The downside of this optimization is that mds may not issue Fs caps along
    // with getattr reply. Client may need to send more getattr requests.
    if (mdr->is_rdlocked(&ref->filelock)) {
      lov.add_rdlock(&ref->filelock);
    } else if (ref->filelock.is_stable() ||
	       ref->filelock.get_num_wrlocks() > 0 ||
	       !ref->filelock.can_read(mdr->get_client())) {
      lov.add_rdlock(&ref->filelock);
      mdr->done_locking = false;
    }
  }

  if (!mds->locker->acquire_locks(mdr, lov))
    return;

  if (!check_access(mdr, ref, MAY_READ))
    return;

  utime_t now = ceph_clock_now();
  mdr->set_mds_stamp(now);

  // note which caps are requested, so we return at least a snapshot
  // value for them.  (currently this matters for xattrs and inline data)
  mdr->getattr_caps = mask;

  mds->balancer->hit_inode(ref, META_POP_IRD, req->get_source().num());

  // reply
  dout(10) << "reply to stat on " << *req << dendl;
  mdr->tracei = ref;
  if (is_lookup)
    mdr->tracedn = mdr->dn[0].back();
  respond_to_request(mdr, 0);
}

// Completion for open_ino() during lookup-by-ino: re-dispatch or fail
// via _lookup_ino_2().
struct C_MDS_LookupIno2 : public ServerContext {
  MDRequestRef mdr;
  C_MDS_LookupIno2(Server *s, MDRequestRef& r) : ServerContext(s), mdr(r) {}
  void finish(int r) override {
    server->_lookup_ino_2(mdr, r);
  }
};

/*
 * filepath: ino
 */
void Server::handle_client_lookup_ino(MDRequestRef& mdr,
				      bool want_parent, bool want_dentry)
{
  const MClientRequest::const_ref &req = mdr->client_request;

  if ((uint64_t)req->head.args.lookupino.snapid > 0)
    return _lookup_snap_ino(mdr);

  inodeno_t ino = req->get_filepath().get_ino();
  CInode *in = mdcache->get_inode(ino);
  if (in && in->state_test(CInode::STATE_PURGING)) {
    respond_to_request(mdr, -ESTALE);
    return;
  }
  if (!in) {
    // not in cache; locate/load it first, then retry via _lookup_ino_2
    mdcache->open_ino(ino, (int64_t)-1, new C_MDS_LookupIno2(this, mdr), false);
    return;
  }

  if (mdr && in->snaprealm && !in->snaprealm->have_past_parents_open() &&
      !in->snaprealm->open_parents(new C_MDS_RetryRequest(mdcache, mdr))) {
    return;
  }

  // check for nothing (not read or write); this still applies the
  // path check.
  if (!check_access(mdr, in, 0))
    return;

  CDentry *dn = in->get_projected_parent_dn();
  CInode *diri = dn ?
dn->get_dir()->inode : NULL;

  MutationImpl::LockOpVec lov;
  if (dn && (want_parent || want_dentry)) {
    mdr->pin(dn);
    lov.add_rdlock(&dn->lock);
  }

  unsigned mask = req->head.args.lookupino.mask;
  if (mask) {
    Capability *cap = in->get_client_cap(mdr->get_client());
    int issued = 0;
    if (cap && (mdr->snapid == CEPH_NOSNAP || mdr->snapid <= cap->client_follows))
      issued = cap->issued();
    // permission bits, ACL/security xattrs
    if ((mask & CEPH_CAP_AUTH_SHARED) && (issued & CEPH_CAP_AUTH_EXCL) == 0)
      lov.add_rdlock(&in->authlock);
    if ((mask & CEPH_CAP_XATTR_SHARED) && (issued & CEPH_CAP_XATTR_EXCL) == 0)
      lov.add_rdlock(&in->xattrlock);

    mdr->getattr_caps = mask;
  }

  if (!lov.empty()) {
    if (!mds->locker->acquire_locks(mdr, lov))
      return;

    if (diri != NULL) {
      // need read access to directory inode
      if (!check_access(mdr, diri, MAY_READ))
	return;
    }
  }

  if (want_parent) {
    if (in->is_base()) {
      respond_to_request(mdr, -EINVAL);
      return;
    }
    if (!diri || diri->is_stray()) {
      respond_to_request(mdr, -ESTALE);
      return;
    }
    dout(10) << "reply to lookup_parent " << *in << dendl;
    mdr->tracei = diri;
    respond_to_request(mdr, 0);
  } else {
    if (want_dentry) {
      inodeno_t dirino = req->get_filepath2().get_ino();
      if (!diri || (dirino != inodeno_t() && diri->ino() != dirino)) {
	respond_to_request(mdr, -ENOENT);
	return;
      }
      dout(10) << "reply to lookup_name " << *in << dendl;
    } else
      dout(10) << "reply to lookup_ino " << *in << dendl;

    mdr->tracei = in;
    if (want_dentry)
      mdr->tracedn = dn;
    respond_to_request(mdr, 0);
  }
}

/** _lookup_snap_ino
 * Resolve a lookup-by-ino with a nonzero snapid: try the cached snap
 * inode, else fall back to the parent dir (via the name hash) or
 * open_ino(), retrying the request as needed.
 */
void Server::_lookup_snap_ino(MDRequestRef& mdr)
{
  const MClientRequest::const_ref &req = mdr->client_request;

  vinodeno_t vino;
  vino.ino = req->get_filepath().get_ino();
  vino.snapid = (__u64)req->head.args.lookupino.snapid;
  inodeno_t parent_ino = (__u64)req->head.args.lookupino.parent;
  __u32 hash = req->head.args.lookupino.hash;

  dout(7) << "lookup_snap_ino " << vino << " parent " << parent_ino << " hash " << hash << dendl;

  CInode *in = mdcache->lookup_snap_inode(vino);
  if (!in) {
    in = mdcache->get_inode(vino.ino);
    if (in) {
      if (in->state_test(CInode::STATE_PURGING) ||
	  !in->has_snap_data(vino.snapid)) {
	if (in->is_dir() || !parent_ino) {
	  respond_to_request(mdr, -ESTALE);
	  return;
	}
	in = NULL;
      }
    }
  }

  if (in) {
    dout(10) << "reply to lookup_snap_ino " << *in << dendl;
    mdr->snapid = vino.snapid;
    mdr->tracei = in;
    respond_to_request(mdr, 0);
    return;
  }

  CInode *diri = NULL;
  if (parent_ino) {
    diri = mdcache->get_inode(parent_ino);
    if (!diri) {
      mdcache->open_ino(parent_ino, mds->mdsmap->get_metadata_pool(), new C_MDS_LookupIno2(this, mdr));
      return;
    }

    if (!diri->is_dir()) {
      respond_to_request(mdr, -EINVAL);
      return;
    }

    MutationImpl::LockOpVec lov;
    lov.add_rdlock(&diri->dirfragtreelock);
    if (!mds->locker->acquire_locks(mdr, lov))
      return;

    // map the client-supplied name hash to a dirfrag and load it
    frag_t frag = diri->dirfragtree[hash];
    CDir *dir = try_open_auth_dirfrag(diri, frag, mdr);
    if (!dir)
      return;

    if (!dir->is_complete()) {
      if (dir->is_frozen()) {
	mds->locker->drop_locks(mdr.get());
	mdr->drop_local_auth_pins();
	dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
	return;
      }
      dir->fetch(new C_MDS_RetryRequest(mdcache, mdr), true);
      return;
    }

    // dirfrag is complete and the snap inode still wasn't found
    respond_to_request(mdr, -ESTALE);
  } else {
    mdcache->open_ino(vino.ino, mds->mdsmap->get_metadata_pool(), new C_MDS_LookupIno2(this, mdr), false);
  }
}

/** _lookup_ino_2
 * Continuation after open_ino()/find_ino_peers for lookup-by-ino.
 */
void Server::_lookup_ino_2(MDRequestRef& mdr, int r)
{
  inodeno_t ino = mdr->client_request->get_filepath().get_ino();
  dout(10) << "_lookup_ino_2 " << mdr.get() << " ino " << ino << " r=" << r << dendl;

  // `r` is a rank if >=0, else an error code
  if (r >= 0) {
    mds_rank_t dest_rank(r);
    if (dest_rank == mds->get_nodeid())
      dispatch_client_request(mdr);
    else
      mdcache->request_forward(mdr, dest_rank);
    return;
  }

  // give up
  if (r == -ENOENT || r == -ENODATA)
    r = -ESTALE;
  respond_to_request(mdr, r);
}


/* This function takes responsibility for the passed mdr*/
void Server::handle_client_open(MDRequestRef& mdr)
{
  const MClientRequest::const_ref &req = mdr->client_request;
  dout(7) << "open on " << req->get_filepath() << dendl;

  int flags = req->head.args.open.flags;
  int cmode = ceph_flags_to_mode(flags);
  if (cmode < 0) {
    respond_to_request(mdr, -EINVAL);
    return;
  }

  bool need_auth = !file_mode_is_readonly(cmode) ||
		   (flags & (CEPH_O_TRUNC | CEPH_O_DIRECTORY));

  if ((cmode & CEPH_FILE_MODE_WR) && mdcache->is_readonly()) {
    dout(7) << "read-only FS" << dendl;
    respond_to_request(mdr, -EROFS);
    return;
  }

  MutationImpl::LockOpVec lov;
  CInode *cur = rdlock_path_pin_ref(mdr, 0, lov, need_auth);
  if (!cur)
    return;

  if (cur->is_frozen() || cur->state_test(CInode::STATE_EXPORTINGCAPS)) {
    ceph_assert(!need_auth);
    mdr->done_locking = false;
    // NOTE(review): this inner 'cur' shadows the outer one; the re-traversal
    // returns mdr->in[0], which is the same inode the outer 'cur' holds, so
    // behavior is unaffected — but the shadowing is confusing.
    CInode *cur = rdlock_path_pin_ref(mdr, 0, lov, true);
    if (!cur)
      return;
  }

  if (!cur->inode.is_file()) {
    // can only open non-regular inode with mode FILE_MODE_PIN, at least for now.
    cmode = CEPH_FILE_MODE_PIN;
    // the inode is symlink and client wants to follow it, ignore the O_TRUNC flag.
    if (cur->inode.is_symlink() && !(flags & CEPH_O_NOFOLLOW))
      flags &= ~CEPH_O_TRUNC;
  }

  dout(10) << "open flags = " << flags
	   << ", filemode = " << cmode
	   << ", need_auth = " << need_auth
	   << dendl;

  // regular file?
  /*if (!cur->inode.is_file() && !cur->inode.is_dir()) {
    dout(7) << "not a file or dir " << *cur << dendl;
    respond_to_request(mdr, -ENXIO);                 // FIXME what error do we want?
    return;
  }*/
  if ((flags & CEPH_O_DIRECTORY) && !cur->inode.is_dir() && !cur->inode.is_symlink()) {
    dout(7) << "specified O_DIRECTORY on non-directory " << *cur << dendl;
    respond_to_request(mdr, -EINVAL);
    return;
  }

  if ((flags & CEPH_O_TRUNC) && !cur->inode.is_file()) {
    dout(7) << "specified O_TRUNC on !(file|symlink) " << *cur << dendl;
    // we should return -EISDIR for directory, return -EINVAL for other non-regular
    respond_to_request(mdr, cur->inode.is_dir() ? -EISDIR : -EINVAL);
    return;
  }

  if (cur->inode.inline_data.version != CEPH_INLINE_NONE &&
      !mdr->session->get_connection()->has_feature(CEPH_FEATURE_MDS_INLINE_DATA)) {
    dout(7) << "old client cannot open inline data file " << *cur << dendl;
    respond_to_request(mdr, -EPERM);
    return;
  }

  // snapped data is read only
  if (mdr->snapid != CEPH_NOSNAP &&
      ((cmode & CEPH_FILE_MODE_WR) || req->may_write())) {
    dout(7) << "snap " << mdr->snapid << " is read-only " << *cur << dendl;
    respond_to_request(mdr, -EROFS);
    return;
  }

  unsigned mask = req->head.args.open.mask;
  if (mask) {
    Capability *cap = cur->get_client_cap(mdr->get_client());
    int issued = 0;
    if (cap && (mdr->snapid == CEPH_NOSNAP || mdr->snapid <= cap->client_follows))
      issued = cap->issued();
    // permission bits, ACL/security xattrs
    if ((mask & CEPH_CAP_AUTH_SHARED) && (issued & CEPH_CAP_AUTH_EXCL) == 0)
      lov.add_rdlock(&cur->authlock);
    if ((mask & CEPH_CAP_XATTR_SHARED) && (issued & CEPH_CAP_XATTR_EXCL) == 0)
      lov.add_rdlock(&cur->xattrlock);

    mdr->getattr_caps = mask;
  }

  // O_TRUNC
  if ((flags & CEPH_O_TRUNC) && !mdr->has_completed) {
    ceph_assert(cur->is_auth());

    lov.add_xlock(&cur->filelock);
    if (!mds->locker->acquire_locks(mdr, lov))
      return;

    if (!check_access(mdr, cur, MAY_WRITE))
      return;

    // wait for pending truncate?
    const auto pi = cur->get_projected_inode();
    if (pi->is_truncating()) {
      dout(10) << " waiting for pending truncate from " << pi->truncate_from
	       << " to " << pi->truncate_size << " to complete on " << *cur << dendl;
      mds->locker->drop_locks(mdr.get());
      mdr->drop_local_auth_pins();
      cur->add_waiter(CInode::WAIT_TRUNC, new C_MDS_RetryRequest(mdcache, mdr));
      return;
    }

    do_open_truncate(mdr, cmode);
    return;
  }

  // sync filelock if snapped.
  //  this makes us wait for writers to flushsnaps, ensuring we get accurate metadata,
  //  and that data itself is flushed so that we can read the snapped data off disk.
  if (mdr->snapid != CEPH_NOSNAP && !cur->is_dir()) {
    lov.add_rdlock(&cur->filelock);
  }

  if (!mds->locker->acquire_locks(mdr, lov))
    return;

  mask = MAY_READ;
  if (cmode & CEPH_FILE_MODE_WR)
    mask |= MAY_WRITE;
  if (!check_access(mdr, cur, mask))
    return;

  utime_t now = ceph_clock_now();
  mdr->set_mds_stamp(now);

  if (cur->is_file() || cur->is_dir()) {
    if (mdr->snapid == CEPH_NOSNAP) {
      // register new cap
      Capability *cap = mds->locker->issue_new_caps(cur, cmode, mdr->session, 0, req->is_replay());
      if (cap)
	dout(12) << "open issued caps " << ccap_string(cap->pending())
		 << " for " << req->get_source()
		 << " on " << *cur << dendl;
    } else {
      // snapshots are immutable; issue the full read-only cap set
      // without registering a Capability
      int caps = ceph_caps_for_mode(cmode);
      dout(12) << "open issued IMMUTABLE SNAP caps " << ccap_string(caps)
	       << " for " << req->get_source()
	       << " snapid " << mdr->snapid
	       << " on " << *cur << dendl;
      mdr->snap_caps = caps;
    }
  }

  // increase max_size?
  if (cmode & CEPH_FILE_MODE_WR)
    mds->locker->check_inode_max_size(cur);

  // make sure this inode gets into the journal
  if (cur->is_auth() && cur->last == CEPH_NOSNAP &&
      mdcache->open_file_table.should_log_open(cur)) {
    EOpen *le = new EOpen(mds->mdlog);
    mdlog->start_entry(le);
    le->add_clean_inode(cur);
    mdlog->submit_entry(le);
  }

  // hit pop
  if (cmode & CEPH_FILE_MODE_WR)
    mds->balancer->hit_inode(cur, META_POP_IWR);
  else
    mds->balancer->hit_inode(cur, META_POP_IRD,
			     mdr->client_request->get_source().num());

  CDentry *dn = 0;
  if (req->get_dentry_wanted()) {
    ceph_assert(mdr->dn[0].size());
    dn = mdr->dn[0].back();
  }

  mdr->tracei = cur;
  mdr->tracedn = dn;
  respond_to_request(mdr, 0);
}

// Journal-commit completion for open-with-create: make the new dentry
// linkage and inode dirty/visible, then reply to the client.
class C_MDS_openc_finish : public ServerLogContext {
  CDentry *dn;
  CInode *newi;
public:
  C_MDS_openc_finish(Server *s, MDRequestRef& r, CDentry *d, CInode *ni) :
    ServerLogContext(s, r), dn(d), newi(ni) {}
  void finish(int r) override {
    ceph_assert(r == 0);

    dn->pop_projected_linkage();

    // dirty inode, dn, dir
    newi->inode.version--;   // a bit hacky, see C_MDS_mknod_finish
    newi->mark_dirty(newi->inode.version+1, mdr->ls);
    newi->mark_dirty_parent(mdr->ls, true);

    mdr->apply();

    get_mds()->locker->share_inode_max_size(newi);

    MDRequestRef null_ref;
    get_mds()->mdcache->send_dentry_link(dn, null_ref);

    get_mds()->balancer->hit_inode(newi, META_POP_IWR);

    server->respond_to_request(mdr, 0);

    ceph_assert(g_conf()->mds_kill_openc_at != 1);
  }
};

/* This function takes responsibility for the passed mdr*/
void Server::handle_client_openc(MDRequestRef& mdr)
{
  const MClientRequest::const_ref &req = mdr->client_request;
  client_t client = mdr->get_client();

  dout(7) << "open w/ O_CREAT on " << req->get_filepath() << dendl;

  int cmode = ceph_flags_to_mode(req->head.args.open.flags);
  if (cmode < 0) {
    respond_to_request(mdr, -EINVAL);
    return;
  }

  bool excl = req->head.args.open.flags &
CEPH_O_EXCL; + + if (!excl) { + CF_MDS_MDRContextFactory cf(mdcache, mdr); + int r = mdcache->path_traverse(mdr, cf, req->get_filepath(), + &mdr->dn[0], NULL, MDS_TRAVERSE_FORWARD); + if (r > 0) return; + if (r == 0) { + // it existed. + handle_client_open(mdr); + return; + } + if (r < 0 && r != -ENOENT) { + if (r == -ESTALE) { + dout(10) << "FAIL on ESTALE but attempting recovery" << dendl; + MDSContext *c = new C_MDS_TryFindInode(this, mdr); + mdcache->find_ino_peers(req->get_filepath().get_ino(), c); + } else { + dout(10) << "FAIL on error " << r << dendl; + respond_to_request(mdr, r); + } + return; + } + } + + MutationImpl::LockOpVec lov; + file_layout_t *dir_layout = nullptr; + CDentry *dn = rdlock_path_xlock_dentry(mdr, 0, lov, + !excl, false, false, &dir_layout); + if (!dn) return; + if (mdr->snapid != CEPH_NOSNAP) { + respond_to_request(mdr, -EROFS); + return; + } + // set layout + file_layout_t layout; + if (dir_layout) + layout = *dir_layout; + else + layout = mdcache->default_file_layout; + + // What kind of client caps are required to complete this operation + uint64_t access = MAY_WRITE; + + const auto default_layout = layout; + + // fill in any special params from client + if (req->head.args.open.stripe_unit) + layout.stripe_unit = req->head.args.open.stripe_unit; + if (req->head.args.open.stripe_count) + layout.stripe_count = req->head.args.open.stripe_count; + if (req->head.args.open.object_size) + layout.object_size = req->head.args.open.object_size; + if (req->get_connection()->has_feature(CEPH_FEATURE_CREATEPOOLID) && + (__s32)req->head.args.open.pool >= 0) { + layout.pool_id = req->head.args.open.pool; + + // make sure we have as new a map as the client + if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) { + mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr)); + return; + } + } + + // If client doesn't have capability to modify layout pools, then + // only permit this request if the requested pool matches 
what the + // file would have inherited anyway from its parent. + if (default_layout != layout) { + access |= MAY_SET_VXATTR; + } + + if (!layout.is_valid()) { + dout(10) << " invalid initial file layout" << dendl; + respond_to_request(mdr, -EINVAL); + return; + } + if (!mds->mdsmap->is_data_pool(layout.pool_id)) { + dout(10) << " invalid data pool " << layout.pool_id << dendl; + respond_to_request(mdr, -EINVAL); + return; + } + + // created null dn. + CDir *dir = dn->get_dir(); + CInode *diri = dir->get_inode(); + lov.add_rdlock(&diri->authlock); + if (!mds->locker->acquire_locks(mdr, lov)) + return; + + if (!check_access(mdr, diri, access)) + return; + + if (!check_fragment_space(mdr, dir)) + return; + + CDentry::linkage_t *dnl = dn->get_projected_linkage(); + + if (!dnl->is_null()) { + // it existed. + ceph_assert(req->head.args.open.flags & CEPH_O_EXCL); + dout(10) << "O_EXCL, target exists, failing with -EEXIST" << dendl; + mdr->tracei = dnl->get_inode(); + mdr->tracedn = dn; + respond_to_request(mdr, -EEXIST); + return; + } + + // create inode. + CInode *in = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino), + req->head.args.open.mode | S_IFREG, &layout); + ceph_assert(in); + + // it's a file. 
+ dn->push_projected_linkage(in); + + in->inode.version = dn->pre_dirty(); + if (layout.pool_id != mdcache->default_file_layout.pool_id) + in->inode.add_old_pool(mdcache->default_file_layout.pool_id); + in->inode.update_backtrace(); + in->inode.rstat.rfiles = 1; + + SnapRealm *realm = diri->find_snaprealm(); + snapid_t follows = mdcache->get_global_snaprealm()->get_newest_seq(); + ceph_assert(follows >= realm->get_newest_seq()); + + ceph_assert(dn->first == follows+1); + in->first = dn->first; + + // do the open + Capability *cap = mds->locker->issue_new_caps(in, cmode, mdr->session, realm, req->is_replay()); + in->authlock.set_state(LOCK_EXCL); + in->xattrlock.set_state(LOCK_EXCL); + + if (cap && (cmode & CEPH_FILE_MODE_WR)) { + in->inode.client_ranges[client].range.first = 0; + in->inode.client_ranges[client].range.last = in->inode.get_layout_size_increment(); + in->inode.client_ranges[client].follows = follows; + cap->mark_clientwriteable(); + } + + // prepare finisher + mdr->ls = mdlog->get_current_segment(); + EUpdate *le = new EUpdate(mdlog, "openc"); + mdlog->start_entry(le); + le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid()); + journal_allocated_inos(mdr, &le->metablob); + mdcache->predirty_journal_parents(mdr, &le->metablob, in, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1); + le->metablob.add_primary_dentry(dn, in, true, true, true); + + // make sure this inode gets into the journal + le->metablob.add_opened_ino(in->ino()); + + C_MDS_openc_finish *fin = new C_MDS_openc_finish(this, mdr, dn, in); + + if (mdr->client_request->get_connection()->has_feature(CEPH_FEATURE_REPLY_CREATE_INODE)) { + dout(10) << "adding ino to reply to indicate inode was created" << dendl; + // add the file created flag onto the reply if create_flags features is supported + encode(in->inode.ino, mdr->reply_extra_bl); + } + + journal_and_reply(mdr, in, dn, le, fin); + + // We hit_dir (via hit_inode) in our finish callback, but by then we might + // have 
overshot the split size (multiple opencs in flight), so here is + // an early chance to split the dir if this openc makes it oversized. + mds->balancer->maybe_fragment(dir, false); +} + + + +void Server::handle_client_readdir(MDRequestRef& mdr) +{ + const MClientRequest::const_ref &req = mdr->client_request; + Session *session = mds->get_session(req); + client_t client = req->get_source().num(); + MutationImpl::LockOpVec lov; + CInode *diri = rdlock_path_pin_ref(mdr, 0, lov, false, true); + if (!diri) return; + + // it's a directory, right? + if (!diri->is_dir()) { + // not a dir + dout(10) << "reply to " << *req << " readdir -ENOTDIR" << dendl; + respond_to_request(mdr, -ENOTDIR); + return; + } + + auto num_caps = session->get_num_caps(); + auto session_cap_acquisition = session->get_cap_acquisition(); + + if (num_caps > static_cast<uint64_t>(max_caps_per_client * max_caps_throttle_ratio) && session_cap_acquisition >= cap_acquisition_throttle) { + dout(20) << "readdir throttled. max_caps_per_client: " << max_caps_per_client << " num_caps: " << num_caps + << " session_cap_acquistion: " << session_cap_acquisition << " cap_acquisition_throttle: " << cap_acquisition_throttle << dendl; + if (logger) + logger->inc(l_mdss_cap_acquisition_throttle); + + mds->timer.add_event_after(caps_throttle_retry_request_timeout, new C_MDS_RetryRequest(mdcache, mdr)); + return; + } + + lov.add_rdlock(&diri->filelock); + lov.add_rdlock(&diri->dirfragtreelock); + + if (!mds->locker->acquire_locks(mdr, lov)) + return; + + if (!check_access(mdr, diri, MAY_READ)) + return; + + // which frag? 
  // Decode which dirfrag the client wants and where to resume inside it.
  // The resume point is either a dentry name (path2) or, when the client
  // speaks hash-order readdir, a raw offset_hash value.
  frag_t fg = (__u32)req->head.args.readdir.frag;
  unsigned req_flags = (__u32)req->head.args.readdir.flags;
  string offset_str = req->get_path2();

  __u32 offset_hash = 0;
  if (!offset_str.empty())
    // derive the hash position from the dentry name the client stopped at
    offset_hash = ceph_frag_value(diri->hash_dentry_name(offset_str));
  else
    offset_hash = (__u32)req->head.args.readdir.offset_hash;

  dout(10) << " frag " << fg << " offset '" << offset_str << "'"
	   << " offset_hash " << offset_hash << " flags " << req_flags << dendl;

  // does the frag exist?
  // The client's frag may be stale (dir was split/merged since); remap it
  // against the current fragtree before opening anything.
  if (diri->dirfragtree[fg.value()] != fg) {
    frag_t newfg;
    if (req_flags & CEPH_READDIR_REPLY_BITFLAGS) {
      if (fg.contains((unsigned)offset_hash)) {
	// hash-order client: resume at the frag now covering offset_hash
	newfg = diri->dirfragtree[offset_hash];
      } else {
	// client actually wants next frag
	newfg = diri->dirfragtree[fg.value()];
      }
    } else {
      // legacy client: name offsets don't survive a remap, restart the frag
      offset_str.clear();
      newfg = diri->dirfragtree[fg.value()];
    }
    dout(10) << " adjust frag " << fg << " -> " << newfg << " " << diri->dirfragtree << dendl;
    fg = newfg;
  }

  // may forward to the auth MDS or block; NULL means the request was parked
  CDir *dir = try_open_auth_dirfrag(diri, fg, mdr);
  if (!dir) return;

  // ok!
  dout(10) << "handle_client_readdir on " << *dir << dendl;
  ceph_assert(dir->is_auth());

  if (!dir->is_complete()) {
    if (dir->is_frozen()) {
      // can't fetch while frozen: drop our locks/pins and retry after thaw
      dout(7) << "dir is frozen " << *dir << dendl;
      mds->locker->drop_locks(mdr.get());
      mdr->drop_local_auth_pins();
      dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
      return;
    }
    // fetch
    dout(10) << " incomplete dir contents for readdir on " << *dir << ", fetching" << dendl;
    dir->fetch(new C_MDS_RetryRequest(mdcache, mdr), true);
    return;
  }

#ifdef MDS_VERIFY_FRAGSTAT
  dir->verify_fragstat();
#endif

  // stamp the request so issued leases/caps share one consistent time
  utime_t now = ceph_clock_now();
  mdr->set_mds_stamp(now);

  snapid_t snapid = mdr->snapid;
  dout(10) << "snapid " << snapid << dendl;

  SnapRealm *realm = diri->find_snaprealm();

  unsigned max = req->head.args.readdir.max_entries;
  if (!max)
    max = dir->get_num_any();  // whatever, something big.
  unsigned max_bytes = req->head.args.readdir.max_bytes;
  if (!max_bytes)
    // make sure at least one item can be encoded
    max_bytes = (512 << 10) + g_conf()->mds_max_xattr_pairs_size;

  // start final blob
  bufferlist dirbl;
  DirStat ds;
  ds.frag = dir->get_frag();
  ds.auth = dir->get_dir_auth().first;
  if (dir->is_auth() && !mdcache->forward_all_reqs_to_auth())
    dir->get_dist_spec(ds.dist, mds->get_nodeid());

  dir->encode_dirstat(dirbl, mdr->session->info, ds);

  // count bytes available.
  //  this isn't perfect, but we should capture the main variable/unbounded size items!
  // front_bytes = dirstat header + numfiles (u32) + the two trailing flag bytes
  int front_bytes = dirbl.length() + sizeof(__u32) + sizeof(__u8)*2;
  int bytes_left = max_bytes - front_bytes;
  bytes_left -= realm->get_snap_trace().length();

  // build dir contents
  // dentries are accumulated in dnbl and appended to dirbl at the end
  bufferlist dnbl;
  __u32 numfiles = 0;
  bool start = !offset_hash && offset_str.empty();
  // skip all dns < dentry_key_t(snapid, offset_str, offset_hash)
  dentry_key_t skip_key(snapid, offset_str.c_str(), offset_hash);
  auto it = start ? dir->begin() : dir->lower_bound(skip_key);
  bool end = (it == dir->end());
  for (; !end && numfiles < max; end = (it == dir->end())) {
    CDentry *dn = it->second;
    ++it;

    if (dn->state_test(CDentry::STATE_PURGING))
      continue;

    // show this client its own uncommitted changes (projected linkage)
    bool dnp = dn->use_projected(client, mdr);
    CDentry::linkage_t *dnl = dnp ? dn->get_projected_linkage() : dn->get_linkage();

    if (dnl->is_null())
      continue;

    // dentry must exist at the snapshot being listed
    if (dn->last < snapid || dn->first > snapid) {
      dout(20) << "skipping non-overlapping snap " << *dn << dendl;
      continue;
    }

    if (!start) {
      // lower_bound can land on the resume dentry itself; skip up to and
      // including the client's last-seen key
      dentry_key_t offset_key(dn->last, offset_str.c_str(), offset_hash);
      if (!(offset_key < dn->key()))
	continue;
    }

    CInode *in = dnl->get_inode();

    // never expose the internal .ceph inode in listings
    if (in && in->ino() == CEPH_INO_CEPH)
      continue;

    // remote link?
    // better for the MDS to do the work, if we think the client will stat any of these files.
+ if (dnl->is_remote() && !in) { + in = mdcache->get_inode(dnl->get_remote_ino()); + if (in) { + dn->link_remote(dnl, in); + } else if (dn->state_test(CDentry::STATE_BADREMOTEINO)) { + dout(10) << "skipping bad remote ino on " << *dn << dendl; + continue; + } else { + // touch everything i _do_ have + for (auto &p : *dir) { + if (!p.second->get_linkage()->is_null()) + mdcache->lru.lru_touch(p.second); + } + + // already issued caps and leases, reply immediately. + if (dnbl.length() > 0) { + mdcache->open_remote_dentry(dn, dnp, new C_MDSInternalNoop); + dout(10) << " open remote dentry after caps were issued, stopping at " + << dnbl.length() << " < " << bytes_left << dendl; + break; + } + + mds->locker->drop_locks(mdr.get()); + mdr->drop_local_auth_pins(); + mdcache->open_remote_dentry(dn, dnp, new C_MDS_RetryRequest(mdcache, mdr)); + return; + } + } + ceph_assert(in); + + if ((int)(dnbl.length() + dn->get_name().length() + sizeof(__u32) + sizeof(LeaseStat)) > bytes_left) { + dout(10) << " ran out of room, stopping at " << dnbl.length() << " < " << bytes_left << dendl; + break; + } + + unsigned start_len = dnbl.length(); + + // dentry + dout(12) << "including dn " << *dn << dendl; + encode(dn->get_name(), dnbl); + mds->locker->issue_client_lease(dn, client, dnbl, now, mdr->session); + + // inode + dout(12) << "including inode " << *in << dendl; + int r = in->encode_inodestat(dnbl, mdr->session, realm, snapid, bytes_left - (int)dnbl.length()); + if (r < 0) { + // chop off dn->name, lease + dout(10) << " ran out of room, stopping at " << start_len << " < " << bytes_left << dendl; + bufferlist keep; + keep.substr_of(dnbl, 0, start_len); + dnbl.swap(keep); + break; + } + ceph_assert(r >= 0); + numfiles++; + + // touch dn + mdcache->lru.lru_touch(dn); + } + + session->touch_readdir_cap(numfiles); + + __u16 flags = 0; + if (end) { + flags = CEPH_READDIR_FRAG_END; + if (start) + flags |= CEPH_READDIR_FRAG_COMPLETE; // FIXME: what purpose does this serve + } + // client 
only understand END and COMPLETE flags ? + if (req_flags & CEPH_READDIR_REPLY_BITFLAGS) { + flags |= CEPH_READDIR_HASH_ORDER | CEPH_READDIR_OFFSET_HASH; + } + + // finish final blob + encode(numfiles, dirbl); + encode(flags, dirbl); + dirbl.claim_append(dnbl); + + // yay, reply + dout(10) << "reply to " << *req << " readdir num=" << numfiles + << " bytes=" << dirbl.length() + << " start=" << (int)start + << " end=" << (int)end + << dendl; + mdr->reply_extra_bl = dirbl; + + // bump popularity. NOTE: this doesn't quite capture it. + mds->balancer->hit_dir(dir, META_POP_IRD, -1, numfiles); + + // reply + mdr->tracei = diri; + respond_to_request(mdr, 0); +} + + + +// =============================================================================== +// INODE UPDATES + + +/* + * finisher for basic inode updates + */ +class C_MDS_inode_update_finish : public ServerLogContext { + CInode *in; + bool truncating_smaller, changed_ranges, adjust_realm; +public: + C_MDS_inode_update_finish(Server *s, MDRequestRef& r, CInode *i, + bool sm=false, bool cr=false, bool ar=false) : + ServerLogContext(s, r), in(i), + truncating_smaller(sm), changed_ranges(cr), adjust_realm(ar) { } + void finish(int r) override { + ceph_assert(r == 0); + + int snap_op = (in->snaprealm ? 
CEPH_SNAP_OP_UPDATE : CEPH_SNAP_OP_SPLIT); + + // apply + in->pop_and_dirty_projected_inode(mdr->ls); + mdr->apply(); + + MDSRank *mds = get_mds(); + + // notify any clients + if (truncating_smaller && in->inode.is_truncating()) { + mds->locker->issue_truncate(in); + mds->mdcache->truncate_inode(in, mdr->ls); + } + + if (adjust_realm) { + mds->mdcache->send_snap_update(in, 0, snap_op); + mds->mdcache->do_realm_invalidate_and_update_notify(in, snap_op); + } + + get_mds()->balancer->hit_inode(in, META_POP_IWR); + + server->respond_to_request(mdr, 0); + + if (changed_ranges) + get_mds()->locker->share_inode_max_size(in); + } +}; + +void Server::handle_client_file_setlock(MDRequestRef& mdr) +{ + const MClientRequest::const_ref &req = mdr->client_request; + MutationImpl::LockOpVec lov; + + // get the inode to operate on, and set up any locks needed for that + CInode *cur = rdlock_path_pin_ref(mdr, 0, lov, true); + if (!cur) + return; + + lov.add_xlock(&cur->flocklock); + /* acquire_locks will return true if it gets the locks. If it fails, + it will redeliver this request at a later date, so drop the request. + */ + if (!mds->locker->acquire_locks(mdr, lov)) { + dout(10) << "handle_client_file_setlock could not get locks!" 
<< dendl; + return; + } + + // copy the lock change into a ceph_filelock so we can store/apply it + ceph_filelock set_lock; + set_lock.start = req->head.args.filelock_change.start; + set_lock.length = req->head.args.filelock_change.length; + set_lock.client = req->get_orig_source().num(); + set_lock.owner = req->head.args.filelock_change.owner; + set_lock.pid = req->head.args.filelock_change.pid; + set_lock.type = req->head.args.filelock_change.type; + bool will_wait = req->head.args.filelock_change.wait; + + dout(10) << "handle_client_file_setlock: " << set_lock << dendl; + + ceph_lock_state_t *lock_state = NULL; + bool interrupt = false; + + // get the appropriate lock state + switch (req->head.args.filelock_change.rule) { + case CEPH_LOCK_FLOCK_INTR: + interrupt = true; + // fall-thru + case CEPH_LOCK_FLOCK: + lock_state = cur->get_flock_lock_state(); + break; + + case CEPH_LOCK_FCNTL_INTR: + interrupt = true; + // fall-thru + case CEPH_LOCK_FCNTL: + lock_state = cur->get_fcntl_lock_state(); + break; + + default: + dout(10) << "got unknown lock type " << set_lock.type + << ", dropping request!" 
<< dendl; + respond_to_request(mdr, -EOPNOTSUPP); + return; + } + + dout(10) << " state prior to lock change: " << *lock_state << dendl; + if (CEPH_LOCK_UNLOCK == set_lock.type) { + list<ceph_filelock> activated_locks; + MDSContext::vec waiters; + if (lock_state->is_waiting(set_lock)) { + dout(10) << " unlock removing waiting lock " << set_lock << dendl; + lock_state->remove_waiting(set_lock); + cur->take_waiting(CInode::WAIT_FLOCK, waiters); + } else if (!interrupt) { + dout(10) << " unlock attempt on " << set_lock << dendl; + lock_state->remove_lock(set_lock, activated_locks); + cur->take_waiting(CInode::WAIT_FLOCK, waiters); + } + mds->queue_waiters(waiters); + + respond_to_request(mdr, 0); + } else { + dout(10) << " lock attempt on " << set_lock << dendl; + bool deadlock = false; + if (mdr->more()->flock_was_waiting && + !lock_state->is_waiting(set_lock)) { + dout(10) << " was waiting for lock but not anymore, must have been canceled " << set_lock << dendl; + respond_to_request(mdr, -EINTR); + } else if (!lock_state->add_lock(set_lock, will_wait, mdr->more()->flock_was_waiting, &deadlock)) { + dout(10) << " it failed on this attempt" << dendl; + // couldn't set lock right now + if (deadlock) { + respond_to_request(mdr, -EDEADLK); + } else if (!will_wait) { + respond_to_request(mdr, -EWOULDBLOCK); + } else { + dout(10) << " added to waiting list" << dendl; + ceph_assert(lock_state->is_waiting(set_lock)); + mdr->more()->flock_was_waiting = true; + mds->locker->drop_locks(mdr.get()); + mdr->drop_local_auth_pins(); + mdr->mark_event("failed to add lock, waiting"); + mdr->mark_nowarn(); + cur->add_waiter(CInode::WAIT_FLOCK, new C_MDS_RetryRequest(mdcache, mdr)); + } + } else + respond_to_request(mdr, 0); + } + dout(10) << " state after lock change: " << *lock_state << dendl; +} + +void Server::handle_client_file_readlock(MDRequestRef& mdr) +{ + const MClientRequest::const_ref &req = mdr->client_request; + MutationImpl::LockOpVec lov; + + // get the inode to operate 
on, and set up any locks needed for that + CInode *cur = rdlock_path_pin_ref(mdr, 0, lov, true); + if (!cur) + return; + + /* acquire_locks will return true if it gets the locks. If it fails, + it will redeliver this request at a later date, so drop the request. + */ + lov.add_rdlock(&cur->flocklock); + if (!mds->locker->acquire_locks(mdr, lov)) { + dout(10) << "handle_client_file_readlock could not get locks!" << dendl; + return; + } + + // copy the lock change into a ceph_filelock so we can store/apply it + ceph_filelock checking_lock; + checking_lock.start = req->head.args.filelock_change.start; + checking_lock.length = req->head.args.filelock_change.length; + checking_lock.client = req->get_orig_source().num(); + checking_lock.owner = req->head.args.filelock_change.owner; + checking_lock.pid = req->head.args.filelock_change.pid; + checking_lock.type = req->head.args.filelock_change.type; + + // get the appropriate lock state + ceph_lock_state_t *lock_state = NULL; + switch (req->head.args.filelock_change.rule) { + case CEPH_LOCK_FLOCK: + lock_state = cur->get_flock_lock_state(); + break; + + case CEPH_LOCK_FCNTL: + lock_state = cur->get_fcntl_lock_state(); + break; + + default: + dout(10) << "got unknown lock type " << checking_lock.type << dendl; + respond_to_request(mdr, -EINVAL); + return; + } + lock_state->look_for_lock(checking_lock); + + bufferlist lock_bl; + encode(checking_lock, lock_bl); + + mdr->reply_extra_bl = lock_bl; + respond_to_request(mdr, 0); +} + +void Server::handle_client_setattr(MDRequestRef& mdr) +{ + const MClientRequest::const_ref &req = mdr->client_request; + MutationImpl::LockOpVec lov; + CInode *cur = rdlock_path_pin_ref(mdr, 0, lov, true); + if (!cur) return; + + if (mdr->snapid != CEPH_NOSNAP) { + respond_to_request(mdr, -EROFS); + return; + } + if (cur->ino() < MDS_INO_SYSTEM_BASE && !cur->is_base()) { + respond_to_request(mdr, -EPERM); + return; + } + + __u32 mask = req->head.args.setattr.mask; + __u32 access_mask = MAY_WRITE; + 
  // xlock inode
  // Take only the locks the requested attribute mask actually needs:
  // ownership/mode/btime live under authlock, times/size under filelock.
  if (mask & (CEPH_SETATTR_MODE|CEPH_SETATTR_UID|CEPH_SETATTR_GID|CEPH_SETATTR_BTIME|CEPH_SETATTR_KILL_SGUID))
    lov.add_xlock(&cur->authlock);
  if (mask & (CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME|CEPH_SETATTR_SIZE))
    lov.add_xlock(&cur->filelock);
  if (mask & CEPH_SETATTR_CTIME)
    lov.add_wrlock(&cur->versionlock);

  if (!mds->locker->acquire_locks(mdr, lov))
    return;

  // changing ownership needs extra permission bits beyond plain write
  if ((mask & CEPH_SETATTR_UID) && (cur->inode.uid != req->head.args.setattr.uid))
    access_mask |= MAY_CHOWN;

  if ((mask & CEPH_SETATTR_GID) && (cur->inode.gid != req->head.args.setattr.gid))
    access_mask |= MAY_CHGRP;

  if (!check_access(mdr, cur, access_mask))
    return;

  // trunc from bigger -> smaller?
  auto pip = cur->get_projected_inode();

  uint64_t old_size = std::max<uint64_t>(pip->size, req->head.args.setattr.old_size);

  // ENOSPC on growing file while full, but allow shrinks
  if (is_full && req->head.args.setattr.size > old_size) {
    dout(20) << __func__ << ": full, responding ENOSPC to setattr with larger size" << dendl;
    respond_to_request(mdr, -ENOSPC);
    return;
  }

  bool truncating_smaller = false;
  if (mask & CEPH_SETATTR_SIZE) {
    truncating_smaller = req->head.args.setattr.size < old_size;
    if (truncating_smaller && pip->is_truncating()) {
      // only one truncate may be in flight per inode; drop locks and
      // retry this request once the pending truncate completes
      dout(10) << " waiting for pending truncate from " << pip->truncate_from
	       << " to " << pip->truncate_size << " to complete on " << *cur << dendl;
      mds->locker->drop_locks(mdr.get());
      mdr->drop_local_auth_pins();
      cur->add_waiter(CInode::WAIT_TRUNC, new C_MDS_RetryRequest(mdcache, mdr));
      return;
    }
  }

  bool changed_ranges = false;

  // project update
  mdr->ls = mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mdlog, "setattr");
  mdlog->start_entry(le);

  auto &pi = cur->project_inode();

  if (mask & CEPH_SETATTR_UID)
    pi.inode.uid = req->head.args.setattr.uid;
  if (mask & CEPH_SETATTR_GID)
    pi.inode.gid = req->head.args.setattr.gid;

  if (mask & CEPH_SETATTR_MODE)
pi.inode.mode = (pi.inode.mode & ~07777) | (req->head.args.setattr.mode & 07777); + else if ((mask & (CEPH_SETATTR_UID|CEPH_SETATTR_GID|CEPH_SETATTR_KILL_SGUID)) && + S_ISREG(pi.inode.mode) && + (pi.inode.mode & (S_IXUSR|S_IXGRP|S_IXOTH))) { + pi.inode.mode &= ~(S_ISUID|S_ISGID); + } + + if (mask & CEPH_SETATTR_MTIME) + pi.inode.mtime = req->head.args.setattr.mtime; + if (mask & CEPH_SETATTR_ATIME) + pi.inode.atime = req->head.args.setattr.atime; + if (mask & CEPH_SETATTR_BTIME) + pi.inode.btime = req->head.args.setattr.btime; + if (mask & (CEPH_SETATTR_ATIME | CEPH_SETATTR_MTIME | CEPH_SETATTR_BTIME)) + pi.inode.time_warp_seq++; // maybe not a timewarp, but still a serialization point. + if (mask & CEPH_SETATTR_SIZE) { + if (truncating_smaller) { + pi.inode.truncate(old_size, req->head.args.setattr.size); + le->metablob.add_truncate_start(cur->ino()); + } else { + pi.inode.size = req->head.args.setattr.size; + pi.inode.rstat.rbytes = pi.inode.size; + } + pi.inode.mtime = mdr->get_op_stamp(); + + // adjust client's max_size? 
+ CInode::mempool_inode::client_range_map new_ranges; + bool max_increased = false; + mds->locker->calc_new_client_ranges(cur, pi.inode.size, true, &new_ranges, &max_increased); + if (pi.inode.client_ranges != new_ranges) { + dout(10) << " client_ranges " << pi.inode.client_ranges << " -> " << new_ranges << dendl; + pi.inode.client_ranges = new_ranges; + changed_ranges = true; + } + } + + pi.inode.version = cur->pre_dirty(); + pi.inode.ctime = mdr->get_op_stamp(); + if (mdr->get_op_stamp() > pi.inode.rstat.rctime) + pi.inode.rstat.rctime = mdr->get_op_stamp(); + pi.inode.change_attr++; + + // log + wait + le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid()); + mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY); + mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur); + + journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur, + truncating_smaller, changed_ranges)); + + // flush immediately if there are readers/writers waiting + if (mdr->is_xlocked(&cur->filelock) && + (cur->get_caps_wanted() & (CEPH_CAP_FILE_RD|CEPH_CAP_FILE_WR))) + mds->mdlog->flush(); +} + +/* Takes responsibility for mdr */ +void Server::do_open_truncate(MDRequestRef& mdr, int cmode) +{ + CInode *in = mdr->in[0]; + client_t client = mdr->get_client(); + ceph_assert(in); + + dout(10) << "do_open_truncate " << *in << dendl; + + SnapRealm *realm = in->find_snaprealm(); + Capability *cap = mds->locker->issue_new_caps(in, cmode, mdr->session, realm, mdr->client_request->is_replay()); + + mdr->ls = mdlog->get_current_segment(); + EUpdate *le = new EUpdate(mdlog, "open_truncate"); + mdlog->start_entry(le); + + // prepare + auto &pi = in->project_inode(); + pi.inode.version = in->pre_dirty(); + pi.inode.mtime = pi.inode.ctime = mdr->get_op_stamp(); + if (mdr->get_op_stamp() > pi.inode.rstat.rctime) + pi.inode.rstat.rctime = mdr->get_op_stamp(); + pi.inode.change_attr++; + + uint64_t old_size = 
    // take the larger of our size and the client's view, so the purge
    // covers everything either side believes the file held
    std::max<uint64_t>(pi.inode.size, mdr->client_request->head.args.open.old_size);
  if (old_size > 0) {
    // O_TRUNC: shrink to zero and record the truncate start in the journal
    pi.inode.truncate(old_size, 0);
    le->metablob.add_truncate_start(in->ino());
  }

  bool changed_ranges = false;
  if (cap && (cmode & CEPH_FILE_MODE_WR)) {
    // grant the writer an initial max_size window starting at zero
    pi.inode.client_ranges[client].range.first = 0;
    pi.inode.client_ranges[client].range.last = pi.inode.get_layout_size_increment();
    pi.inode.client_ranges[client].follows = realm->get_newest_seq();
    changed_ranges = true;
    cap->mark_clientwriteable();
  }

  le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());

  mdcache->predirty_journal_parents(mdr, &le->metablob, in, 0, PREDIRTY_PRIMARY);
  mdcache->journal_dirty_inode(mdr.get(), &le->metablob, in);

  // make sure ino gets into the journal
  le->metablob.add_opened_ino(in->ino());

  mdr->o_trunc = true;

  CDentry *dn = 0;
  if (mdr->client_request->get_dentry_wanted()) {
    ceph_assert(mdr->dn[0].size());
    dn = mdr->dn[0].back();
  }

  // truncating_smaller = (old_size > 0): finisher kicks off the actual purge
  journal_and_reply(mdr, in, dn, le, new C_MDS_inode_update_finish(this, mdr, in, old_size > 0,
								   changed_ranges));
  // Although the `open` part can give an early reply, the truncation won't
  // happen until our EUpdate is persistent, to give the client a prompt
  // response we must also flush that event.
  mdlog->flush();
}


/* This function cleans up the passed mdr */
// Handle SETLAYOUT on a regular file: the layout may only be changed while
// the file is still empty (no data written, never truncated).
void Server::handle_client_setlayout(MDRequestRef& mdr)
{
  const MClientRequest::const_ref &req = mdr->client_request;
  MutationImpl::LockOpVec lov;
  CInode *cur = rdlock_path_pin_ref(mdr, 0, lov, true);
  if (!cur) return;

  // snapshots are read-only
  if (mdr->snapid != CEPH_NOSNAP) {
    respond_to_request(mdr, -EROFS);
    return;
  }
  if (!cur->is_file()) {
    respond_to_request(mdr, -EINVAL);
    return;
  }
  // a non-zero size or a prior truncate means data may already be striped
  // under the old layout; refuse to change it
  if (cur->get_projected_inode()->size ||
      cur->get_projected_inode()->truncate_seq > 1) {
    respond_to_request(mdr, -ENOTEMPTY);
    return;
  }

  // validate layout
  file_layout_t layout = cur->get_projected_inode()->layout;
  // save existing layout for later
  const auto old_layout = layout;

  int access = MAY_WRITE;

  // apply only the fields the client explicitly set (> 0)
  if (req->head.args.setlayout.layout.fl_object_size > 0)
    layout.object_size = req->head.args.setlayout.layout.fl_object_size;
  if (req->head.args.setlayout.layout.fl_stripe_unit > 0)
    layout.stripe_unit = req->head.args.setlayout.layout.fl_stripe_unit;
  if (req->head.args.setlayout.layout.fl_stripe_count > 0)
    layout.stripe_count=req->head.args.setlayout.layout.fl_stripe_count;
  if (req->head.args.setlayout.layout.fl_pg_pool > 0) {
    layout.pool_id = req->head.args.setlayout.layout.fl_pg_pool;

    // make sure we have as new a map as the client
    if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
      mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
      return;
    }
  }

  // Don't permit layout modifications without 'p' caps
  if (layout != old_layout) {
    access |= MAY_SET_VXATTR;
  }

  if (!layout.is_valid()) {
    dout(10) << "bad layout" << dendl;
    respond_to_request(mdr, -EINVAL);
    return;
  }
  if (!mds->mdsmap->is_data_pool(layout.pool_id)) {
    dout(10) << " invalid data pool " << layout.pool_id << dendl;
    respond_to_request(mdr, -EINVAL);
    return;
  }

  lov.add_xlock(&cur->filelock);
  if (!mds->locker->acquire_locks(mdr, lov))
return; + + if (!check_access(mdr, cur, access)) + return; + + // project update + auto &pi = cur->project_inode(); + pi.inode.layout = layout; + // add the old pool to the inode + pi.inode.add_old_pool(old_layout.pool_id); + pi.inode.version = cur->pre_dirty(); + pi.inode.ctime = mdr->get_op_stamp(); + if (mdr->get_op_stamp() > pi.inode.rstat.rctime) + pi.inode.rstat.rctime = mdr->get_op_stamp(); + pi.inode.change_attr++; + + // log + wait + mdr->ls = mdlog->get_current_segment(); + EUpdate *le = new EUpdate(mdlog, "setlayout"); + mdlog->start_entry(le); + le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid()); + mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY); + mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur); + + journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur)); +} + +void Server::handle_client_setdirlayout(MDRequestRef& mdr) +{ + const MClientRequest::const_ref &req = mdr->client_request; + MutationImpl::LockOpVec lov; + file_layout_t *dir_layout = nullptr; + CInode *cur = rdlock_path_pin_ref(mdr, 0, lov, true, false, &dir_layout); + if (!cur) return; + + if (mdr->snapid != CEPH_NOSNAP) { + respond_to_request(mdr, -EROFS); + return; + } + + if (!cur->is_dir()) { + respond_to_request(mdr, -ENOTDIR); + return; + } + + lov.add_xlock(&cur->policylock); + if (!mds->locker->acquire_locks(mdr, lov)) + return; + + // validate layout + const auto old_pi = cur->get_projected_inode(); + file_layout_t layout; + if (old_pi->has_layout()) + layout = old_pi->layout; + else if (dir_layout) + layout = *dir_layout; + else + layout = mdcache->default_file_layout; + + // Level of access required to complete + int access = MAY_WRITE; + + const auto old_layout = layout; + + if (req->head.args.setlayout.layout.fl_object_size > 0) + layout.object_size = req->head.args.setlayout.layout.fl_object_size; + if (req->head.args.setlayout.layout.fl_stripe_unit > 0) + layout.stripe_unit = 
req->head.args.setlayout.layout.fl_stripe_unit; + if (req->head.args.setlayout.layout.fl_stripe_count > 0) + layout.stripe_count=req->head.args.setlayout.layout.fl_stripe_count; + if (req->head.args.setlayout.layout.fl_pg_pool > 0) { + layout.pool_id = req->head.args.setlayout.layout.fl_pg_pool; + // make sure we have as new a map as the client + if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) { + mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr)); + return; + } + } + + if (layout != old_layout) { + access |= MAY_SET_VXATTR; + } + + if (!layout.is_valid()) { + dout(10) << "bad layout" << dendl; + respond_to_request(mdr, -EINVAL); + return; + } + if (!mds->mdsmap->is_data_pool(layout.pool_id)) { + dout(10) << " invalid data pool " << layout.pool_id << dendl; + respond_to_request(mdr, -EINVAL); + return; + } + + if (!check_access(mdr, cur, access)) + return; + + auto &pi = cur->project_inode(); + pi.inode.layout = layout; + pi.inode.version = cur->pre_dirty(); + + // log + wait + mdr->ls = mdlog->get_current_segment(); + EUpdate *le = new EUpdate(mdlog, "setlayout"); + mdlog->start_entry(le); + le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid()); + mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY); + mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur); + + mdr->no_early_reply = true; + journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur)); +} + +// XATTRS + +int Server::parse_layout_vxattr(string name, string value, const OSDMap& osdmap, + file_layout_t *layout, bool validate) +{ + dout(20) << "parse_layout_vxattr name " << name << " value '" << value << "'" << dendl; + try { + if (name == "layout") { + string::iterator begin = value.begin(); + string::iterator end = value.end(); + keys_and_values<string::iterator> p; // create instance of parser + std::map<string, string> m; // map to receive results + if (!qi::parse(begin, end, p, m)) { 
// returns true if successful + return -EINVAL; + } + string left(begin, end); + dout(10) << " parsed " << m << " left '" << left << "'" << dendl; + if (begin != end) + return -EINVAL; + for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) { + // Skip validation on each attr, we do it once at the end (avoid + // rejecting intermediate states if the overall result is ok) + int r = parse_layout_vxattr(string("layout.") + q->first, q->second, + osdmap, layout, false); + if (r < 0) + return r; + } + } else if (name == "layout.object_size") { + layout->object_size = boost::lexical_cast<unsigned>(value); + } else if (name == "layout.stripe_unit") { + layout->stripe_unit = boost::lexical_cast<unsigned>(value); + } else if (name == "layout.stripe_count") { + layout->stripe_count = boost::lexical_cast<unsigned>(value); + } else if (name == "layout.pool") { + try { + layout->pool_id = boost::lexical_cast<unsigned>(value); + } catch (boost::bad_lexical_cast const&) { + int64_t pool = osdmap.lookup_pg_pool_name(value); + if (pool < 0) { + dout(10) << " unknown pool " << value << dendl; + return -ENOENT; + } + layout->pool_id = pool; + } + } else if (name == "layout.pool_namespace") { + layout->pool_ns = value; + } else { + dout(10) << " unknown layout vxattr " << name << dendl; + return -EINVAL; + } + } catch (boost::bad_lexical_cast const&) { + dout(10) << "bad vxattr value, unable to parse int for " << name << dendl; + return -EINVAL; + } + + if (validate && !layout->is_valid()) { + dout(10) << "bad layout" << dendl; + return -EINVAL; + } + if (!mds->mdsmap->is_data_pool(layout->pool_id)) { + dout(10) << " invalid data pool " << layout->pool_id << dendl; + return -EINVAL; + } + return 0; +} + +int Server::parse_quota_vxattr(string name, string value, quota_info_t *quota) +{ + dout(20) << "parse_quota_vxattr name " << name << " value '" << value << "'" << dendl; + try { + if (name == "quota") { + string::iterator begin = value.begin(); + string::iterator end = 
value.end(); + if (begin == end) { + // keep quota unchanged. (for create_quota_realm()) + return 0; + } + keys_and_values<string::iterator> p; // create instance of parser + std::map<string, string> m; // map to receive results + if (!qi::parse(begin, end, p, m)) { // returns true if successful + return -EINVAL; + } + string left(begin, end); + dout(10) << " parsed " << m << " left '" << left << "'" << dendl; + if (begin != end) + return -EINVAL; + for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) { + int r = parse_quota_vxattr(string("quota.") + q->first, q->second, quota); + if (r < 0) + return r; + } + } else if (name == "quota.max_bytes") { + int64_t q = boost::lexical_cast<int64_t>(value); + if (q < 0) + return -EINVAL; + quota->max_bytes = q; + } else if (name == "quota.max_files") { + int64_t q = boost::lexical_cast<int64_t>(value); + if (q < 0) + return -EINVAL; + quota->max_files = q; + } else { + dout(10) << " unknown quota vxattr " << name << dendl; + return -EINVAL; + } + } catch (boost::bad_lexical_cast const&) { + dout(10) << "bad vxattr value, unable to parse int for " << name << dendl; + return -EINVAL; + } + + if (!quota->is_valid()) { + dout(10) << "bad quota" << dendl; + return -EINVAL; + } + return 0; +} + +void Server::create_quota_realm(CInode *in) +{ + dout(10) << __func__ << " " << *in << dendl; + + auto req = MClientRequest::create(CEPH_MDS_OP_SETXATTR); + req->set_filepath(filepath(in->ino())); + req->set_string2("ceph.quota"); + // empty vxattr value + req->set_tid(mds->issue_tid()); + + mds->send_message_mds(req, in->authority().first); +} + +/* + * Verify that the file layout attribute carried by client + * is well-formatted. + * Return 0 on success, otherwise this function takes + * responsibility for the passed mdr. 
+ */ +int Server::check_layout_vxattr(MDRequestRef& mdr, + string name, + string value, + file_layout_t *layout) +{ + const MClientRequest::const_ref &req = mdr->client_request; + epoch_t epoch; + int r; + + mds->objecter->with_osdmap([&](const OSDMap& osdmap) { + r = parse_layout_vxattr(name, value, osdmap, layout); + epoch = osdmap.get_epoch(); + }); + + if (r == -ENOENT) { + + // we don't have the specified pool, make sure our map + // is newer than or as new as the client. + epoch_t req_epoch = req->get_osdmap_epoch(); + + if (req_epoch > epoch) { + + // well, our map is older. consult mds. + Context *fin = new C_IO_Wrapper(mds, new C_MDS_RetryRequest(mdcache, mdr)); + + if (!mds->objecter->wait_for_map(req_epoch, fin)) + return r; // wait, fin will retry this request later + + delete fin; + + // now we have at least as new a map as the client, try again. + mds->objecter->with_osdmap([&](const OSDMap& osdmap) { + r = parse_layout_vxattr(name, value, osdmap, layout); + epoch = osdmap.get_epoch(); + }); + + ceph_assert(epoch >= req_epoch); // otherwise wait_for_map() told a lie + + } else if (req_epoch == 0 && !mdr->waited_for_osdmap) { + + // For compatibility with client w/ old code, we still need get the + // latest map. One day if COMPACT_VERSION of MClientRequest >=3, + // we can remove those code. 
+ mdr->waited_for_osdmap = true; + mds->objecter->wait_for_latest_osdmap(new C_IO_Wrapper( + mds, new C_MDS_RetryRequest(mdcache, mdr))); + return r; + } + } + + if (r < 0) { + + if (r == -ENOENT) + r = -EINVAL; + + respond_to_request(mdr, r); + return r; + } + + // all is well + return 0; +} + +void Server::handle_set_vxattr(MDRequestRef& mdr, CInode *cur, + file_layout_t *dir_layout, + MutationImpl::LockOpVec& lov) +{ + const MClientRequest::const_ref &req = mdr->client_request; + string name(req->get_path2()); + bufferlist bl = req->get_data(); + string value (bl.c_str(), bl.length()); + dout(10) << "handle_set_vxattr " << name + << " val " << value.length() + << " bytes on " << *cur + << dendl; + + CInode::mempool_inode *pip = nullptr; + string rest; + + if (!check_access(mdr, cur, MAY_SET_VXATTR)) { + return; + } + + bool adjust_realm = false; + if (name.compare(0, 15, "ceph.dir.layout") == 0) { + if (!cur->is_dir()) { + respond_to_request(mdr, -EINVAL); + return; + } + + file_layout_t layout; + if (cur->get_projected_inode()->has_layout()) + layout = cur->get_projected_inode()->layout; + else if (dir_layout) + layout = *dir_layout; + else + layout = mdcache->default_file_layout; + + rest = name.substr(name.find("layout")); + if (check_layout_vxattr(mdr, rest, value, &layout) < 0) + return; + + lov.add_xlock(&cur->policylock); + if (!mds->locker->acquire_locks(mdr, lov)) + return; + + auto &pi = cur->project_inode(); + pi.inode.layout = layout; + mdr->no_early_reply = true; + pip = &pi.inode; + } else if (name.compare(0, 16, "ceph.file.layout") == 0) { + if (!cur->is_file()) { + respond_to_request(mdr, -EINVAL); + return; + } + if (cur->get_projected_inode()->size || + cur->get_projected_inode()->truncate_seq > 1) { + respond_to_request(mdr, -ENOTEMPTY); + return; + } + file_layout_t layout = cur->get_projected_inode()->layout; + rest = name.substr(name.find("layout")); + if (check_layout_vxattr(mdr, rest, value, &layout) < 0) + return; + + 
lov.add_xlock(&cur->filelock); + if (!mds->locker->acquire_locks(mdr, lov)) + return; + + auto &pi = cur->project_inode(); + int64_t old_pool = pi.inode.layout.pool_id; + pi.inode.add_old_pool(old_pool); + pi.inode.layout = layout; + pip = &pi.inode; + } else if (name.compare(0, 10, "ceph.quota") == 0) { + if (!cur->is_dir() || cur->is_root()) { + respond_to_request(mdr, -EINVAL); + return; + } + + quota_info_t quota = cur->get_projected_inode()->quota; + + rest = name.substr(name.find("quota")); + int r = parse_quota_vxattr(rest, value, &quota); + if (r < 0) { + respond_to_request(mdr, r); + return; + } + + lov.add_xlock(&cur->policylock); + if (quota.is_enable() && !cur->get_projected_srnode()) { + lov.add_xlock(&cur->snaplock); + adjust_realm = true; + } + + if (!mds->locker->acquire_locks(mdr, lov)) + return; + + if (cur->get_projected_inode()->quota == quota) { + respond_to_request(mdr, 0); + return; + } + + auto &pi = cur->project_inode(false, adjust_realm); + pi.inode.quota = quota; + + if (adjust_realm) + pi.snapnode->created = pi.snapnode->seq = cur->find_snaprealm()->get_newest_seq(); + + mdr->no_early_reply = true; + pip = &pi.inode; + + client_t exclude_ct = mdr->get_client(); + mdcache->broadcast_quota_to_client(cur, exclude_ct, true); + } else if (name == "ceph.dir.subvolume"sv) { + if (!cur->is_dir()) { + respond_to_request(mdr, -EINVAL); + return; + } + + bool val; + try { + val = boost::lexical_cast<bool>(value); + } catch (boost::bad_lexical_cast const&) { + dout(10) << "bad vxattr value, unable to parse bool for " << name << dendl; + respond_to_request(mdr, -EINVAL); + return; + } + + lov.add_xlock(&cur->policylock); + lov.add_xlock(&cur->snaplock); + if (!mds->locker->acquire_locks(mdr, lov)) + return; + + SnapRealm *realm = cur->find_snaprealm(); + if (val) { + inodeno_t subvol_ino = realm->get_subvolume_ino(); + // can't create subvolume inside another subvolume + if (subvol_ino && subvol_ino != cur->ino()) { + respond_to_request(mdr, -EINVAL); + 
return; + } + } + + const auto srnode = cur->get_projected_srnode(); + if (val == (srnode && srnode->is_subvolume())) { + respond_to_request(mdr, 0); + return; + } + + auto& pi = cur->project_inode(false, true); + if (!srnode) + pi.snapnode->created = pi.snapnode->seq = realm->get_newest_seq(); + if (val) + pi.snapnode->mark_subvolume(); + else + pi.snapnode->clear_subvolume(); + + mdr->no_early_reply = true; + pip = &pi.inode; + adjust_realm = true; + } else if (name == "ceph.dir.pin"sv) { + if (!cur->is_dir() || cur->is_root()) { + respond_to_request(mdr, -EINVAL); + return; + } + + mds_rank_t rank; + try { + rank = boost::lexical_cast<mds_rank_t>(value); + if (rank < 0) rank = MDS_RANK_NONE; + } catch (boost::bad_lexical_cast const&) { + dout(10) << "bad vxattr value, unable to parse int for " << name << dendl; + respond_to_request(mdr, -EINVAL); + return; + } + + lov.add_xlock(&cur->policylock); + if (!mds->locker->acquire_locks(mdr, lov)) + return; + + auto &pi = cur->project_inode(); + cur->set_export_pin(rank); + pip = &pi.inode; + } else { + dout(10) << " unknown vxattr " << name << dendl; + respond_to_request(mdr, -EINVAL); + return; + } + + pip->change_attr++; + pip->ctime = mdr->get_op_stamp(); + if (mdr->get_op_stamp() > pip->rstat.rctime) + pip->rstat.rctime = mdr->get_op_stamp(); + pip->version = cur->pre_dirty(); + if (cur->is_file()) + pip->update_backtrace(); + + // log + wait + mdr->ls = mdlog->get_current_segment(); + EUpdate *le = new EUpdate(mdlog, "set vxattr layout"); + mdlog->start_entry(le); + le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid()); + mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY); + mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur); + + journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur, + false, false, adjust_realm)); + return; +} + +void Server::handle_remove_vxattr(MDRequestRef& mdr, CInode *cur, + file_layout_t *dir_layout, + 
MutationImpl::LockOpVec& lov) +{ + const MClientRequest::const_ref &req = mdr->client_request; + string name(req->get_path2()); + + dout(10) << __func__ << " " << name << " on " << *cur << dendl; + + if (name == "ceph.dir.layout") { + if (!cur->is_dir()) { + respond_to_request(mdr, -ENODATA); + return; + } + if (cur->is_root()) { + dout(10) << "can't remove layout policy on the root directory" << dendl; + respond_to_request(mdr, -EINVAL); + return; + } + + if (!cur->get_projected_inode()->has_layout()) { + respond_to_request(mdr, -ENODATA); + return; + } + + lov.add_xlock(&cur->policylock); + if (!mds->locker->acquire_locks(mdr, lov)) + return; + + auto &pi = cur->project_inode(); + pi.inode.clear_layout(); + pi.inode.version = cur->pre_dirty(); + + // log + wait + mdr->ls = mdlog->get_current_segment(); + EUpdate *le = new EUpdate(mdlog, "remove dir layout vxattr"); + mdlog->start_entry(le); + le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid()); + mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY); + mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur); + + mdr->no_early_reply = true; + journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur)); + return; + } else if (name == "ceph.dir.layout.pool_namespace" + || name == "ceph.file.layout.pool_namespace") { + // Namespace is the only layout field that has a meaningful + // null/none value (empty string, means default layout). Is equivalent + // to a setxattr with empty string: pass through the empty payload of + // the rmxattr request to do this. 
+ handle_set_vxattr(mdr, cur, dir_layout, lov); + return; + } + + respond_to_request(mdr, -ENODATA); +} + +class C_MDS_inode_xattr_update_finish : public ServerLogContext { + CInode *in; +public: + + C_MDS_inode_xattr_update_finish(Server *s, MDRequestRef& r, CInode *i) : + ServerLogContext(s, r), in(i) { } + void finish(int r) override { + ceph_assert(r == 0); + + // apply + in->pop_and_dirty_projected_inode(mdr->ls); + + mdr->apply(); + + get_mds()->balancer->hit_inode(in, META_POP_IWR); + + server->respond_to_request(mdr, 0); + } +}; + +void Server::handle_client_setxattr(MDRequestRef& mdr) +{ + const MClientRequest::const_ref &req = mdr->client_request; + string name(req->get_path2()); + MutationImpl::LockOpVec lov; + CInode *cur; + + file_layout_t *dir_layout = NULL; + if (name.compare(0, 15, "ceph.dir.layout") == 0) + cur = rdlock_path_pin_ref(mdr, 0, lov, true, false, &dir_layout); + else + cur = rdlock_path_pin_ref(mdr, 0, lov, true); + if (!cur) + return; + + if (mdr->snapid != CEPH_NOSNAP) { + respond_to_request(mdr, -EROFS); + return; + } + + int flags = req->head.args.setxattr.flags; + + // magic ceph.* namespace? + if (name.compare(0, 5, "ceph.") == 0) { + handle_set_vxattr(mdr, cur, dir_layout, lov); + return; + } + + lov.add_xlock(&cur->xattrlock); + if (!mds->locker->acquire_locks(mdr, lov)) + return; + + if (!check_access(mdr, cur, MAY_WRITE)) + return; + + auto pxattrs = cur->get_projected_xattrs(); + size_t len = req->get_data().length(); + size_t inc = len + name.length(); + + // check xattrs kv pairs size + size_t cur_xattrs_size = 0; + for (const auto& p : *pxattrs) { + if ((flags & CEPH_XATTR_REPLACE) && (name.compare(p.first) == 0)) { + continue; + } + cur_xattrs_size += p.first.length() + p.second.length(); + } + + if (((cur_xattrs_size + inc) > g_conf()->mds_max_xattr_pairs_size)) { + dout(10) << "xattr kv pairs size too big. 
cur_xattrs_size " + << cur_xattrs_size << ", inc " << inc << dendl; + respond_to_request(mdr, -ENOSPC); + return; + } + + if ((flags & CEPH_XATTR_CREATE) && pxattrs->count(mempool::mds_co::string(name))) { + dout(10) << "setxattr '" << name << "' XATTR_CREATE and EEXIST on " << *cur << dendl; + respond_to_request(mdr, -EEXIST); + return; + } + if ((flags & CEPH_XATTR_REPLACE) && !pxattrs->count(mempool::mds_co::string(name))) { + dout(10) << "setxattr '" << name << "' XATTR_REPLACE and ENODATA on " << *cur << dendl; + respond_to_request(mdr, -ENODATA); + return; + } + + dout(10) << "setxattr '" << name << "' len " << len << " on " << *cur << dendl; + + // project update + auto &pi = cur->project_inode(true); + pi.inode.version = cur->pre_dirty(); + pi.inode.ctime = mdr->get_op_stamp(); + if (mdr->get_op_stamp() > pi.inode.rstat.rctime) + pi.inode.rstat.rctime = mdr->get_op_stamp(); + pi.inode.change_attr++; + pi.inode.xattr_version++; + auto &px = *pi.xattrs; + if ((flags & CEPH_XATTR_REMOVE)) { + px.erase(mempool::mds_co::string(name)); + } else { + bufferptr b = buffer::create(len); + if (len) + req->get_data().copy(0, len, b.c_str()); + auto em = px.emplace(std::piecewise_construct, std::forward_as_tuple(mempool::mds_co::string(name)), std::forward_as_tuple(b)); + if (!em.second) + em.first->second = b; + } + + // log + wait + mdr->ls = mdlog->get_current_segment(); + EUpdate *le = new EUpdate(mdlog, "setxattr"); + mdlog->start_entry(le); + le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid()); + mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY); + mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur); + + journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur)); +} + +void Server::handle_client_removexattr(MDRequestRef& mdr) +{ + const MClientRequest::const_ref &req = mdr->client_request; + std::string name(req->get_path2()); + + MutationImpl::LockOpVec lov; + file_layout_t 
*dir_layout = nullptr; + CInode *cur; + if (name == "ceph.dir.layout") + cur = rdlock_path_pin_ref(mdr, 0, lov, true, false, &dir_layout); + else + cur = rdlock_path_pin_ref(mdr, 0, lov, true); + if (!cur) + return; + + if (mdr->snapid != CEPH_NOSNAP) { + respond_to_request(mdr, -EROFS); + return; + } + + if (name.compare(0, 5, "ceph.") == 0) { + handle_remove_vxattr(mdr, cur, dir_layout, lov); + return; + } + + lov.add_xlock(&cur->xattrlock); + if (!mds->locker->acquire_locks(mdr, lov)) + return; + + auto pxattrs = cur->get_projected_xattrs(); + if (pxattrs->count(mempool::mds_co::string(name)) == 0) { + dout(10) << "removexattr '" << name << "' and ENODATA on " << *cur << dendl; + respond_to_request(mdr, -ENODATA); + return; + } + + dout(10) << "removexattr '" << name << "' on " << *cur << dendl; + + // project update + auto &pi = cur->project_inode(true); + auto &px = *pi.xattrs; + pi.inode.version = cur->pre_dirty(); + pi.inode.ctime = mdr->get_op_stamp(); + if (mdr->get_op_stamp() > pi.inode.rstat.rctime) + pi.inode.rstat.rctime = mdr->get_op_stamp(); + pi.inode.change_attr++; + pi.inode.xattr_version++; + px.erase(mempool::mds_co::string(name)); + + // log + wait + mdr->ls = mdlog->get_current_segment(); + EUpdate *le = new EUpdate(mdlog, "removexattr"); + mdlog->start_entry(le); + le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid()); + mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY); + mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur); + + journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur)); +} + + +// ================================================================= +// DIRECTORY and NAMESPACE OPS + + +// ------------------------------------------------ + +// MKNOD + +class C_MDS_mknod_finish : public ServerLogContext { + CDentry *dn; + CInode *newi; +public: + C_MDS_mknod_finish(Server *s, MDRequestRef& r, CDentry *d, CInode *ni) : + ServerLogContext(s, r), 
dn(d), newi(ni) {} + void finish(int r) override { + ceph_assert(r == 0); + + // link the inode + dn->pop_projected_linkage(); + + // be a bit hacky with the inode version, here.. we decrement it + // just to keep mark_dirty() happen. (we didn't bother projecting + // a new version of hte inode since it's just been created) + newi->inode.version--; + newi->mark_dirty(newi->inode.version + 1, mdr->ls); + newi->mark_dirty_parent(mdr->ls, true); + + // mkdir? + if (newi->inode.is_dir()) { + CDir *dir = newi->get_dirfrag(frag_t()); + ceph_assert(dir); + dir->fnode.version--; + dir->mark_dirty(dir->fnode.version + 1, mdr->ls); + dir->mark_new(mdr->ls); + } + + mdr->apply(); + + MDRequestRef null_ref; + get_mds()->mdcache->send_dentry_link(dn, null_ref); + + if (newi->inode.is_file()) + get_mds()->locker->share_inode_max_size(newi); + + // hit pop + get_mds()->balancer->hit_inode(newi, META_POP_IWR); + + // reply + server->respond_to_request(mdr, 0); + } +}; + + +void Server::handle_client_mknod(MDRequestRef& mdr) +{ + const MClientRequest::const_ref &req = mdr->client_request; + client_t client = mdr->get_client(); + MutationImpl::LockOpVec lov; + file_layout_t *dir_layout = nullptr; + CDentry *dn = rdlock_path_xlock_dentry(mdr, 0, lov, false, false, false, + &dir_layout); + if (!dn) return; + if (mdr->snapid != CEPH_NOSNAP) { + respond_to_request(mdr, -EROFS); + return; + } + CInode *diri = dn->get_dir()->get_inode(); + lov.add_rdlock(&diri->authlock); + if (!mds->locker->acquire_locks(mdr, lov)) + return; + + if (!check_access(mdr, diri, MAY_WRITE)) + return; + + if (!check_fragment_space(mdr, dn->get_dir())) + return; + + unsigned mode = req->head.args.mknod.mode; + if ((mode & S_IFMT) == 0) + mode |= S_IFREG; + + // set layout + file_layout_t layout; + if (dir_layout && S_ISREG(mode)) + layout = *dir_layout; + else + layout = mdcache->default_file_layout; + + CInode *newi = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino), mode, &layout); + 
ceph_assert(newi); + + dn->push_projected_linkage(newi); + + newi->inode.rdev = req->head.args.mknod.rdev; + newi->inode.version = dn->pre_dirty(); + newi->inode.rstat.rfiles = 1; + if (layout.pool_id != mdcache->default_file_layout.pool_id) + newi->inode.add_old_pool(mdcache->default_file_layout.pool_id); + newi->inode.update_backtrace(); + + snapid_t follows = mdcache->get_global_snaprealm()->get_newest_seq(); + SnapRealm *realm = dn->get_dir()->inode->find_snaprealm(); + ceph_assert(follows >= realm->get_newest_seq()); + + // if the client created a _regular_ file via MKNOD, it's highly likely they'll + // want to write to it (e.g., if they are reexporting NFS) + if (S_ISREG(newi->inode.mode)) { + // issue a cap on the file + int cmode = CEPH_FILE_MODE_RDWR; + Capability *cap = mds->locker->issue_new_caps(newi, cmode, mdr->session, realm, req->is_replay()); + if (cap) { + cap->set_wanted(0); + + // put locks in excl mode + newi->filelock.set_state(LOCK_EXCL); + newi->authlock.set_state(LOCK_EXCL); + newi->xattrlock.set_state(LOCK_EXCL); + + dout(15) << " setting a client_range too, since this is a regular file" << dendl; + newi->inode.client_ranges[client].range.first = 0; + newi->inode.client_ranges[client].range.last = newi->inode.get_layout_size_increment(); + newi->inode.client_ranges[client].follows = follows; + cap->mark_clientwriteable(); + } + } + + ceph_assert(dn->first == follows + 1); + newi->first = dn->first; + + dout(10) << "mknod mode " << newi->inode.mode << " rdev " << newi->inode.rdev << dendl; + + // prepare finisher + mdr->ls = mdlog->get_current_segment(); + EUpdate *le = new EUpdate(mdlog, "mknod"); + mdlog->start_entry(le); + le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid()); + journal_allocated_inos(mdr, &le->metablob); + + mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(), + PREDIRTY_PRIMARY|PREDIRTY_DIR, 1); + le->metablob.add_primary_dentry(dn, newi, true, true, true); + + 
journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(this, mdr, dn, newi)); + mds->balancer->maybe_fragment(dn->get_dir(), false); +} + + + +// MKDIR +/* This function takes responsibility for the passed mdr*/ +void Server::handle_client_mkdir(MDRequestRef& mdr) +{ + const MClientRequest::const_ref &req = mdr->client_request; + if (req->get_filepath().is_last_dot_or_dotdot()) { + respond_to_request(mdr, -EEXIST); + return; + } + + MutationImpl::LockOpVec lov; + CDentry *dn = rdlock_path_xlock_dentry(mdr, 0, lov, false, false, false); + if (!dn) return; + if (mdr->snapid != CEPH_NOSNAP) { + respond_to_request(mdr, -EROFS); + return; + } + CDir *dir = dn->get_dir(); + CInode *diri = dir->get_inode(); + lov.add_rdlock(&diri->authlock); + if (!mds->locker->acquire_locks(mdr, lov)) + return; + + // mkdir check access + if (!check_access(mdr, diri, MAY_WRITE)) + return; + + if (!check_fragment_space(mdr, dir)) + return; + + // new inode + unsigned mode = req->head.args.mkdir.mode; + mode &= ~S_IFMT; + mode |= S_IFDIR; + CInode *newi = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino), mode); + ceph_assert(newi); + + // it's a directory. + dn->push_projected_linkage(newi); + + newi->inode.version = dn->pre_dirty(); + newi->inode.rstat.rsubdirs = 1; + newi->inode.update_backtrace(); + + snapid_t follows = mdcache->get_global_snaprealm()->get_newest_seq(); + SnapRealm *realm = dn->get_dir()->inode->find_snaprealm(); + ceph_assert(follows >= realm->get_newest_seq()); + + dout(12) << " follows " << follows << dendl; + ceph_assert(dn->first == follows + 1); + newi->first = dn->first; + + // ...and that new dir is empty. 
+ CDir *newdir = newi->get_or_open_dirfrag(mdcache, frag_t()); + newdir->state_set(CDir::STATE_CREATING); + newdir->mark_complete(); + newdir->fnode.version = newdir->pre_dirty(); + + // prepare finisher + mdr->ls = mdlog->get_current_segment(); + EUpdate *le = new EUpdate(mdlog, "mkdir"); + mdlog->start_entry(le); + le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid()); + journal_allocated_inos(mdr, &le->metablob); + mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1); + le->metablob.add_primary_dentry(dn, newi, true, true); + le->metablob.add_new_dir(newdir); // dirty AND complete AND new + + // issue a cap on the directory + int cmode = CEPH_FILE_MODE_RDWR; + Capability *cap = mds->locker->issue_new_caps(newi, cmode, mdr->session, realm, req->is_replay()); + if (cap) { + cap->set_wanted(0); + + // put locks in excl mode + newi->filelock.set_state(LOCK_EXCL); + newi->authlock.set_state(LOCK_EXCL); + newi->xattrlock.set_state(LOCK_EXCL); + } + + // make sure this inode gets into the journal + le->metablob.add_opened_ino(newi->ino()); + + journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(this, mdr, dn, newi)); + + // We hit_dir (via hit_inode) in our finish callback, but by then we might + // have overshot the split size (multiple mkdir in flight), so here is + // an early chance to split the dir if this mkdir makes it oversized. 
+ mds->balancer->maybe_fragment(dir, false); +} + + +// SYMLINK + +void Server::handle_client_symlink(MDRequestRef& mdr) +{ + const MClientRequest::const_ref &req = mdr->client_request; + MutationImpl::LockOpVec lov; + CDentry *dn = rdlock_path_xlock_dentry(mdr, 0, lov, false, false, false); + if (!dn) return; + if (mdr->snapid != CEPH_NOSNAP) { + respond_to_request(mdr, -EROFS); + return; + } + CDir *dir = dn->get_dir(); + CInode *diri = dir->get_inode(); + lov.add_rdlock(&diri->authlock); + if (!mds->locker->acquire_locks(mdr, lov)) + return; + + if (!check_access(mdr, diri, MAY_WRITE)) + return; + + if (!check_fragment_space(mdr, dir)) + return; + + unsigned mode = S_IFLNK | 0777; + CInode *newi = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino), mode); + ceph_assert(newi); + + // it's a symlink + dn->push_projected_linkage(newi); + + newi->symlink = req->get_path2(); + newi->inode.size = newi->symlink.length(); + newi->inode.rstat.rbytes = newi->inode.size; + newi->inode.rstat.rfiles = 1; + newi->inode.version = dn->pre_dirty(); + newi->inode.update_backtrace(); + + newi->first = dn->first; + + // prepare finisher + mdr->ls = mdlog->get_current_segment(); + EUpdate *le = new EUpdate(mdlog, "symlink"); + mdlog->start_entry(le); + le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid()); + journal_allocated_inos(mdr, &le->metablob); + mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1); + le->metablob.add_primary_dentry(dn, newi, true, true); + + journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(this, mdr, dn, newi)); + mds->balancer->maybe_fragment(dir, false); +} + + + + + +// LINK + +void Server::handle_client_link(MDRequestRef& mdr) +{ + const MClientRequest::const_ref &req = mdr->client_request; + + dout(7) << "handle_client_link " << req->get_filepath() + << " to " << req->get_filepath2() + << dendl; + + MutationImpl::LockOpVec lov; + + CDentry *dn = 
rdlock_path_xlock_dentry(mdr, 0, lov, false, false, false); + if (!dn) return; + CInode *targeti = rdlock_path_pin_ref(mdr, 1, lov, false); + if (!targeti) return; + if (mdr->snapid != CEPH_NOSNAP) { + respond_to_request(mdr, -EROFS); + return; + } + + CDir *dir = dn->get_dir(); + dout(7) << "handle_client_link link " << dn->get_name() << " in " << *dir << dendl; + dout(7) << "target is " << *targeti << dendl; + if (targeti->is_dir()) { + // if srcdn is replica, need to make sure its linkage is correct + vector<CDentry*>& trace = mdr->dn[1]; + if (trace.empty() || + trace.back()->is_auth() || + trace.back()->lock.can_read(mdr->get_client())) { + dout(7) << "target is a dir, failing..." << dendl; + respond_to_request(mdr, -EINVAL); + return; + } + } + + lov.erase_rdlock(&targeti->snaplock); + lov.add_xlock(&targeti->snaplock); + lov.add_xlock(&targeti->linklock); + + if (!mds->locker->acquire_locks(mdr, lov)) + return; + + if ((!mdr->has_more() || mdr->more()->witnessed.empty())) { + if (!check_access(mdr, targeti, MAY_WRITE)) + return; + + if (!check_access(mdr, dir->get_inode(), MAY_WRITE)) + return; + + if (!check_fragment_space(mdr, dir)) + return; + } + + CInode* target_pin = targeti->get_projected_parent_dir()->inode; + SnapRealm *target_realm = target_pin->find_snaprealm(); + if (target_pin != dir->inode && + target_realm->get_subvolume_ino() != + dir->inode->find_snaprealm()->get_subvolume_ino()) { + dout(7) << "target is in different subvolume, failing..." << dendl; + respond_to_request(mdr, -EXDEV); + return; + } + + // go! + ceph_assert(g_conf()->mds_kill_link_at != 1); + + // local or remote? 
+ if (targeti->is_auth()) + _link_local(mdr, dn, targeti, target_realm); + else + _link_remote(mdr, true, dn, targeti); + mds->balancer->maybe_fragment(dir, false); +} + + +class C_MDS_link_local_finish : public ServerLogContext { + CDentry *dn; + CInode *targeti; + version_t dnpv; + version_t tipv; + bool adjust_realm; +public: + C_MDS_link_local_finish(Server *s, MDRequestRef& r, CDentry *d, CInode *ti, + version_t dnpv_, version_t tipv_, bool ar) : + ServerLogContext(s, r), dn(d), targeti(ti), + dnpv(dnpv_), tipv(tipv_), adjust_realm(ar) { } + void finish(int r) override { + ceph_assert(r == 0); + server->_link_local_finish(mdr, dn, targeti, dnpv, tipv, adjust_realm); + } +}; + + +void Server::_link_local(MDRequestRef& mdr, CDentry *dn, CInode *targeti, SnapRealm *target_realm) +{ + dout(10) << "_link_local " << *dn << " to " << *targeti << dendl; + + mdr->ls = mdlog->get_current_segment(); + + // predirty NEW dentry + version_t dnpv = dn->pre_dirty(); + version_t tipv = targeti->pre_dirty(); + + // project inode update + auto &pi = targeti->project_inode(); + pi.inode.nlink++; + pi.inode.ctime = mdr->get_op_stamp(); + if (mdr->get_op_stamp() > pi.inode.rstat.rctime) + pi.inode.rstat.rctime = mdr->get_op_stamp(); + pi.inode.change_attr++; + pi.inode.version = tipv; + + bool adjust_realm = false; + if (!target_realm->get_subvolume_ino() && !targeti->is_projected_snaprealm_global()) { + sr_t *newsnap = targeti->project_snaprealm(); + targeti->mark_snaprealm_global(newsnap); + targeti->record_snaprealm_parent_dentry(newsnap, target_realm, targeti->get_projected_parent_dn(), true); + adjust_realm = true; + } + + // log + wait + EUpdate *le = new EUpdate(mdlog, "link_local"); + mdlog->start_entry(le); + le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid()); + mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, dn->get_dir(), PREDIRTY_DIR, 1); // new dn + mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, 0, 
PREDIRTY_PRIMARY); // targeti + le->metablob.add_remote_dentry(dn, true, targeti->ino(), targeti->d_type()); // new remote + mdcache->journal_dirty_inode(mdr.get(), &le->metablob, targeti); + + // do this after predirty_*, to avoid funky extra dnl arg + dn->push_projected_linkage(targeti->ino(), targeti->d_type()); + + journal_and_reply(mdr, targeti, dn, le, + new C_MDS_link_local_finish(this, mdr, dn, targeti, dnpv, tipv, adjust_realm)); +} + +void Server::_link_local_finish(MDRequestRef& mdr, CDentry *dn, CInode *targeti, + version_t dnpv, version_t tipv, bool adjust_realm) +{ + dout(10) << "_link_local_finish " << *dn << " to " << *targeti << dendl; + + // link and unlock the NEW dentry + CDentry::linkage_t *dnl = dn->pop_projected_linkage(); + if (!dnl->get_inode()) + dn->link_remote(dnl, targeti); + dn->mark_dirty(dnpv, mdr->ls); + + // target inode + targeti->pop_and_dirty_projected_inode(mdr->ls); + + mdr->apply(); + + MDRequestRef null_ref; + mdcache->send_dentry_link(dn, null_ref); + + if (adjust_realm) { + int op = CEPH_SNAP_OP_SPLIT; + mds->mdcache->send_snap_update(targeti, 0, op); + mds->mdcache->do_realm_invalidate_and_update_notify(targeti, op); + } + + // bump target popularity + mds->balancer->hit_inode(targeti, META_POP_IWR); + mds->balancer->hit_dir(dn->get_dir(), META_POP_IWR); + + // reply + respond_to_request(mdr, 0); +} + + +// link / unlink remote + +class C_MDS_link_remote_finish : public ServerLogContext { + bool inc; + CDentry *dn; + CInode *targeti; + version_t dpv; +public: + C_MDS_link_remote_finish(Server *s, MDRequestRef& r, bool i, CDentry *d, CInode *ti) : + ServerLogContext(s, r), inc(i), dn(d), targeti(ti), + dpv(d->get_projected_version()) {} + void finish(int r) override { + ceph_assert(r == 0); + server->_link_remote_finish(mdr, inc, dn, targeti, dpv); + } +}; + +void Server::_link_remote(MDRequestRef& mdr, bool inc, CDentry *dn, CInode *targeti) +{ + dout(10) << "_link_remote " + << (inc ? 
"link ":"unlink ") + << *dn << " to " << *targeti << dendl; + + // 1. send LinkPrepare to dest (journal nlink++ prepare) + mds_rank_t linkauth = targeti->authority().first; + if (mdr->more()->witnessed.count(linkauth) == 0) { + if (mds->is_cluster_degraded() && + !mds->mdsmap->is_clientreplay_or_active_or_stopping(linkauth)) { + dout(10) << " targeti auth mds." << linkauth << " is not active" << dendl; + if (mdr->more()->waiting_on_slave.empty()) + mds->wait_for_active_peer(linkauth, new C_MDS_RetryRequest(mdcache, mdr)); + return; + } + + dout(10) << " targeti auth must prepare nlink++/--" << dendl; + int op; + if (inc) + op = MMDSSlaveRequest::OP_LINKPREP; + else + op = MMDSSlaveRequest::OP_UNLINKPREP; + auto req = MMDSSlaveRequest::create(mdr->reqid, mdr->attempt, op); + targeti->set_object_info(req->get_object_info()); + req->op_stamp = mdr->get_op_stamp(); + if (auto& desti_srnode = mdr->more()->desti_srnode) + encode(*desti_srnode, req->desti_snapbl); + mds->send_message_mds(req, linkauth); + + ceph_assert(mdr->more()->waiting_on_slave.count(linkauth) == 0); + mdr->more()->waiting_on_slave.insert(linkauth); + return; + } + dout(10) << " targeti auth has prepared nlink++/--" << dendl; + + ceph_assert(g_conf()->mds_kill_link_at != 2); + + if (auto& desti_srnode = mdr->more()->desti_srnode) { + delete desti_srnode; + desti_srnode = NULL; + } + + mdr->set_mds_stamp(ceph_clock_now()); + + // add to event + mdr->ls = mdlog->get_current_segment(); + EUpdate *le = new EUpdate(mdlog, inc ? 
"link_remote":"unlink_remote"); + mdlog->start_entry(le); + le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid()); + if (!mdr->more()->witnessed.empty()) { + dout(20) << " noting uncommitted_slaves " << mdr->more()->witnessed << dendl; + le->reqid = mdr->reqid; + le->had_slaves = true; + mdcache->add_uncommitted_master(mdr->reqid, mdr->ls, mdr->more()->witnessed); + } + + if (inc) { + dn->pre_dirty(); + mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, dn->get_dir(), PREDIRTY_DIR, 1); + le->metablob.add_remote_dentry(dn, true, targeti->ino(), targeti->d_type()); // new remote + dn->push_projected_linkage(targeti->ino(), targeti->d_type()); + } else { + dn->pre_dirty(); + mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, dn->get_dir(), PREDIRTY_DIR, -1); + mdcache->journal_cow_dentry(mdr.get(), &le->metablob, dn); + le->metablob.add_null_dentry(dn, true); + dn->push_projected_linkage(); + } + + journal_and_reply(mdr, targeti, dn, le, new C_MDS_link_remote_finish(this, mdr, inc, dn, targeti)); +} + +void Server::_link_remote_finish(MDRequestRef& mdr, bool inc, + CDentry *dn, CInode *targeti, + version_t dpv) +{ + dout(10) << "_link_remote_finish " + << (inc ? 
"link ":"unlink ") + << *dn << " to " << *targeti << dendl; + + ceph_assert(g_conf()->mds_kill_link_at != 3); + + if (!mdr->more()->witnessed.empty()) + mdcache->logged_master_update(mdr->reqid); + + if (inc) { + // link the new dentry + CDentry::linkage_t *dnl = dn->pop_projected_linkage(); + if (!dnl->get_inode()) + dn->link_remote(dnl, targeti); + dn->mark_dirty(dpv, mdr->ls); + } else { + // unlink main dentry + dn->get_dir()->unlink_inode(dn); + dn->pop_projected_linkage(); + dn->mark_dirty(dn->get_projected_version(), mdr->ls); // dirty old dentry + } + + mdr->apply(); + + MDRequestRef null_ref; + if (inc) + mdcache->send_dentry_link(dn, null_ref); + else + mdcache->send_dentry_unlink(dn, NULL, null_ref); + + // bump target popularity + mds->balancer->hit_inode(targeti, META_POP_IWR); + mds->balancer->hit_dir(dn->get_dir(), META_POP_IWR); + + // reply + respond_to_request(mdr, 0); + + if (!inc) + // removing a new dn? + dn->get_dir()->try_remove_unlinked_dn(dn); +} + + +// remote linking/unlinking + +class C_MDS_SlaveLinkPrep : public ServerLogContext { + CInode *targeti; + bool adjust_realm; +public: + C_MDS_SlaveLinkPrep(Server *s, MDRequestRef& r, CInode *t, bool ar) : + ServerLogContext(s, r), targeti(t), adjust_realm(ar) { } + void finish(int r) override { + ceph_assert(r == 0); + server->_logged_slave_link(mdr, targeti, adjust_realm); + } +}; + +class C_MDS_SlaveLinkCommit : public ServerContext { + MDRequestRef mdr; + CInode *targeti; +public: + C_MDS_SlaveLinkCommit(Server *s, MDRequestRef& r, CInode *t) : + ServerContext(s), mdr(r), targeti(t) { } + void finish(int r) override { + server->_commit_slave_link(mdr, r, targeti); + } +}; + +void Server::handle_slave_link_prep(MDRequestRef& mdr) +{ + dout(10) << "handle_slave_link_prep " << *mdr + << " on " << mdr->slave_request->get_object_info() + << dendl; + + ceph_assert(g_conf()->mds_kill_link_at != 4); + + CInode *targeti = mdcache->get_inode(mdr->slave_request->get_object_info().ino); + 
ceph_assert(targeti); + dout(10) << "targeti " << *targeti << dendl; + CDentry *dn = targeti->get_parent_dn(); + CDentry::linkage_t *dnl = dn->get_linkage(); + ceph_assert(dnl->is_primary()); + + mdr->set_op_stamp(mdr->slave_request->op_stamp); + + mdr->auth_pin(targeti); + + //ceph_abort(); // test hack: make sure master can handle a slave that fails to prepare... + ceph_assert(g_conf()->mds_kill_link_at != 5); + + // journal it + mdr->ls = mdlog->get_current_segment(); + ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_link_prep", mdr->reqid, mdr->slave_to_mds, + ESlaveUpdate::OP_PREPARE, ESlaveUpdate::LINK); + mdlog->start_entry(le); + + auto &pi = dnl->get_inode()->project_inode(); + + // update journaled target inode + bool inc; + bool adjust_realm = false; + bool realm_projected = false; + if (mdr->slave_request->get_op() == MMDSSlaveRequest::OP_LINKPREP) { + inc = true; + pi.inode.nlink++; + + CDentry *target_pdn = targeti->get_projected_parent_dn(); + SnapRealm *target_realm = target_pdn->get_dir()->inode->find_snaprealm(); + if (!target_realm->get_subvolume_ino() && !targeti->is_projected_snaprealm_global()) { + sr_t *newsnap = targeti->project_snaprealm(); + targeti->mark_snaprealm_global(newsnap); + targeti->record_snaprealm_parent_dentry(newsnap, target_realm, target_pdn, true); + adjust_realm = true; + realm_projected = true; + } + } else { + inc = false; + pi.inode.nlink--; + if (targeti->is_projected_snaprealm_global()) { + ceph_assert(mdr->slave_request->desti_snapbl.length()); + auto p = mdr->slave_request->desti_snapbl.cbegin(); + + sr_t *newsnap = targeti->project_snaprealm(); + decode(*newsnap, p); + + if (pi.inode.nlink == 0) + ceph_assert(!newsnap->is_parent_global()); + + realm_projected = true; + } else { + ceph_assert(mdr->slave_request->desti_snapbl.length() == 0); + } + } + + link_rollback rollback; + rollback.reqid = mdr->reqid; + rollback.ino = targeti->ino(); + rollback.old_ctime = targeti->inode.ctime; // we hold versionlock xlock; 
no concorrent projections + const fnode_t *pf = targeti->get_parent_dn()->get_dir()->get_projected_fnode(); + rollback.old_dir_mtime = pf->fragstat.mtime; + rollback.old_dir_rctime = pf->rstat.rctime; + rollback.was_inc = inc; + if (realm_projected) { + if (targeti->snaprealm) { + encode(true, rollback.snapbl); + targeti->encode_snap_blob(rollback.snapbl); + } else { + encode(false, rollback.snapbl); + } + } + encode(rollback, le->rollback); + mdr->more()->rollback_bl = le->rollback; + + pi.inode.ctime = mdr->get_op_stamp(); + pi.inode.version = targeti->pre_dirty(); + + dout(10) << " projected inode " << pi.inode.ino << " v " << pi.inode.version << dendl; + + // commit case + mdcache->predirty_journal_parents(mdr, &le->commit, dnl->get_inode(), 0, PREDIRTY_SHALLOW|PREDIRTY_PRIMARY); + mdcache->journal_dirty_inode(mdr.get(), &le->commit, targeti); + mdcache->add_uncommitted_slave(mdr->reqid, mdr->ls, mdr->slave_to_mds); + + // set up commit waiter + mdr->more()->slave_commit = new C_MDS_SlaveLinkCommit(this, mdr, targeti); + + mdr->more()->slave_update_journaled = true; + submit_mdlog_entry(le, new C_MDS_SlaveLinkPrep(this, mdr, targeti, adjust_realm), + mdr, __func__); + mdlog->flush(); +} + +void Server::_logged_slave_link(MDRequestRef& mdr, CInode *targeti, bool adjust_realm) +{ + dout(10) << "_logged_slave_link " << *mdr + << " " << *targeti << dendl; + + ceph_assert(g_conf()->mds_kill_link_at != 6); + + // update the target + targeti->pop_and_dirty_projected_inode(mdr->ls); + mdr->apply(); + + // hit pop + mds->balancer->hit_inode(targeti, META_POP_IWR); + + // done. 
+ mdr->reset_slave_request(); + + if (adjust_realm) { + int op = CEPH_SNAP_OP_SPLIT; + mds->mdcache->send_snap_update(targeti, 0, op); + mds->mdcache->do_realm_invalidate_and_update_notify(targeti, op); + } + + // ack + if (!mdr->aborted) { + auto reply = MMDSSlaveRequest::create(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_LINKPREPACK); + mds->send_message_mds(reply, mdr->slave_to_mds); + } else { + dout(10) << " abort flag set, finishing" << dendl; + mdcache->request_finish(mdr); + } +} + + +struct C_MDS_CommittedSlave : public ServerLogContext { + C_MDS_CommittedSlave(Server *s, MDRequestRef& m) : ServerLogContext(s, m) {} + void finish(int r) override { + server->_committed_slave(mdr); + } +}; + +void Server::_commit_slave_link(MDRequestRef& mdr, int r, CInode *targeti) +{ + dout(10) << "_commit_slave_link " << *mdr + << " r=" << r + << " " << *targeti << dendl; + + ceph_assert(g_conf()->mds_kill_link_at != 7); + + if (r == 0) { + // drop our pins, etc. + mdr->cleanup(); + + // write a commit to the journal + ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_link_commit", mdr->reqid, mdr->slave_to_mds, + ESlaveUpdate::OP_COMMIT, ESlaveUpdate::LINK); + mdlog->start_entry(le); + submit_mdlog_entry(le, new C_MDS_CommittedSlave(this, mdr), mdr, __func__); + mdlog->flush(); + } else { + do_link_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr); + } +} + +void Server::_committed_slave(MDRequestRef& mdr) +{ + dout(10) << "_committed_slave " << *mdr << dendl; + + ceph_assert(g_conf()->mds_kill_link_at != 8); + + bool assert_exist = mdr->more()->slave_update_journaled; + mdcache->finish_uncommitted_slave(mdr->reqid, assert_exist); + auto req = MMDSSlaveRequest::create(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_COMMITTED); + mds->send_message_mds(req, mdr->slave_to_mds); + mdcache->request_finish(mdr); +} + +struct C_MDS_LoggedLinkRollback : public ServerLogContext { + MutationRef mut; + map<client_t,MClientSnap::ref> splits; + 
C_MDS_LoggedLinkRollback(Server *s, MutationRef& m, MDRequestRef& r, + map<client_t,MClientSnap::ref>&& _splits) : + ServerLogContext(s, r), mut(m), splits(std::move(_splits)) { + } + void finish(int r) override { + server->_link_rollback_finish(mut, mdr, splits); + } +}; + +void Server::do_link_rollback(bufferlist &rbl, mds_rank_t master, MDRequestRef& mdr) +{ + link_rollback rollback; + auto p = rbl.cbegin(); + decode(rollback, p); + + dout(10) << "do_link_rollback on " << rollback.reqid + << (rollback.was_inc ? " inc":" dec") + << " ino " << rollback.ino + << dendl; + + ceph_assert(g_conf()->mds_kill_link_at != 9); + + mdcache->add_rollback(rollback.reqid, master); // need to finish this update before resolve finishes + ceph_assert(mdr || mds->is_resolve()); + + MutationRef mut(new MutationImpl(nullptr, utime_t(), rollback.reqid)); + mut->ls = mds->mdlog->get_current_segment(); + + CInode *in = mdcache->get_inode(rollback.ino); + ceph_assert(in); + dout(10) << " target is " << *in << dendl; + ceph_assert(!in->is_projected()); // live slave request hold versionlock xlock. 
+ + auto &pi = in->project_inode(); + pi.inode.version = in->pre_dirty(); + mut->add_projected_inode(in); + + // parent dir rctime + CDir *parent = in->get_projected_parent_dn()->get_dir(); + fnode_t *pf = parent->project_fnode(); + mut->add_projected_fnode(parent); + pf->version = parent->pre_dirty(); + if (pf->fragstat.mtime == pi.inode.ctime) { + pf->fragstat.mtime = rollback.old_dir_mtime; + if (pf->rstat.rctime == pi.inode.ctime) + pf->rstat.rctime = rollback.old_dir_rctime; + mut->add_updated_lock(&parent->get_inode()->filelock); + mut->add_updated_lock(&parent->get_inode()->nestlock); + } + + // inode + pi.inode.ctime = rollback.old_ctime; + if (rollback.was_inc) + pi.inode.nlink--; + else + pi.inode.nlink++; + + map<client_t,MClientSnap::ref> splits; + if (rollback.snapbl.length() && in->snaprealm) { + bool hadrealm; + auto p = rollback.snapbl.cbegin(); + decode(hadrealm, p); + if (hadrealm) { + if (!mds->is_resolve()) { + sr_t *new_srnode = new sr_t(); + decode(*new_srnode, p); + in->project_snaprealm(new_srnode); + } else { + decode(in->snaprealm->srnode, p); + } + } else { + SnapRealm *realm = parent->get_inode()->find_snaprealm(); + if (!mds->is_resolve()) + mdcache->prepare_realm_merge(in->snaprealm, realm, splits); + in->project_snaprealm(NULL); + } + } + + // journal it + ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_link_rollback", rollback.reqid, master, + ESlaveUpdate::OP_ROLLBACK, ESlaveUpdate::LINK); + mdlog->start_entry(le); + le->commit.add_dir_context(parent); + le->commit.add_dir(parent, true); + le->commit.add_primary_dentry(in->get_projected_parent_dn(), 0, true); + + submit_mdlog_entry(le, new C_MDS_LoggedLinkRollback(this, mut, mdr, std::move(splits)), + mdr, __func__); + mdlog->flush(); +} + +void Server::_link_rollback_finish(MutationRef& mut, MDRequestRef& mdr, + map<client_t,MClientSnap::ref>& splits) +{ + dout(10) << "_link_rollback_finish" << dendl; + + ceph_assert(g_conf()->mds_kill_link_at != 10); + + mut->apply(); + + if 
(!mds->is_resolve()) + mdcache->send_snaps(splits); + + if (mdr) + mdcache->request_finish(mdr); + + mdcache->finish_rollback(mut->reqid, mdr); + + mut->cleanup(); +} + + +void Server::handle_slave_link_prep_ack(MDRequestRef& mdr, const MMDSSlaveRequest::const_ref &m) +{ + dout(10) << "handle_slave_link_prep_ack " << *mdr + << " " << *m << dendl; + mds_rank_t from = mds_rank_t(m->get_source().num()); + + ceph_assert(g_conf()->mds_kill_link_at != 11); + + // note slave + mdr->more()->slaves.insert(from); + + // witnessed! + ceph_assert(mdr->more()->witnessed.count(from) == 0); + mdr->more()->witnessed.insert(from); + ceph_assert(!m->is_not_journaled()); + mdr->more()->has_journaled_slaves = true; + + // remove from waiting list + ceph_assert(mdr->more()->waiting_on_slave.count(from)); + mdr->more()->waiting_on_slave.erase(from); + + ceph_assert(mdr->more()->waiting_on_slave.empty()); + + dispatch_client_request(mdr); // go again! +} + + + + + +// UNLINK + +void Server::handle_client_unlink(MDRequestRef& mdr) +{ + const MClientRequest::const_ref &req = mdr->client_request; + client_t client = mdr->get_client(); + + // rmdir or unlink? 
+ bool rmdir = false; + if (req->get_op() == CEPH_MDS_OP_RMDIR) rmdir = true; + + const filepath& refpath = req->get_filepath(); + if (refpath.depth() == 0) { + respond_to_request(mdr, -EINVAL); + return; + } + if (refpath.is_last_dot_or_dotdot()) { + respond_to_request(mdr, -ENOTEMPTY); + return; + } + + // traverse to path + vector<CDentry*> trace; + CInode *in; + CF_MDS_MDRContextFactory cf(mdcache, mdr); + int r = mdcache->path_traverse(mdr, cf, refpath, &trace, &in, MDS_TRAVERSE_FORWARD); + if (r > 0) return; + if (r < 0) { + if (r == -ESTALE) { + dout(10) << "FAIL on ESTALE but attempting recovery" << dendl; + mdcache->find_ino_peers(refpath.get_ino(), new C_MDS_TryFindInode(this, mdr)); + return; + } + respond_to_request(mdr, r); + return; + } + if (mdr->snapid != CEPH_NOSNAP) { + respond_to_request(mdr, -EROFS); + return; + } + + CDentry *dn = trace.back(); + ceph_assert(dn); + if (!dn->is_auth()) { + mdcache->request_forward(mdr, dn->authority().first); + return; + } + + CInode *diri = dn->get_dir()->get_inode(); + + CDentry::linkage_t *dnl = dn->get_linkage(client, mdr); + ceph_assert(!dnl->is_null()); + + if (rmdir) { + dout(7) << "handle_client_rmdir on " << *dn << dendl; + } else { + dout(7) << "handle_client_unlink on " << *dn << dendl; + } + dout(7) << "dn links to " << *in << dendl; + + // rmdir vs is_dir + if (in->is_dir()) { + if (rmdir) { + // do empty directory checks + if (_dir_is_nonempty_unlocked(mdr, in)) { + respond_to_request(mdr, -ENOTEMPTY); + return; + } + } else { + dout(7) << "handle_client_unlink on dir " << *in << ", returning error" << dendl; + respond_to_request(mdr, -EISDIR); + return; + } + } else { + if (rmdir) { + // unlink + dout(7) << "handle_client_rmdir on non-dir " << *in << ", returning error" << dendl; + respond_to_request(mdr, -ENOTDIR); + return; + } + } + + // -- create stray dentry? 
-- + CDentry *straydn = NULL; + if (dnl->is_primary()) { + straydn = prepare_stray_dentry(mdr, dnl->get_inode()); + if (!straydn) + return; + dout(10) << " straydn is " << *straydn << dendl; + } else if (mdr->straydn) { + mdr->unpin(mdr->straydn); + mdr->straydn = NULL; + } + + // lock + MutationImpl::LockOpVec lov; + + for (int i=0; i<(int)trace.size()-1; i++) + lov.add_rdlock(&trace[i]->lock); + lov.add_xlock(&dn->lock); + lov.add_wrlock(&diri->filelock); + lov.add_wrlock(&diri->nestlock); + lov.add_xlock(&in->linklock); + if (straydn) { + lov.add_wrlock(&straydn->get_dir()->inode->filelock); + lov.add_wrlock(&straydn->get_dir()->inode->nestlock); + lov.add_xlock(&straydn->lock); + } + + mds->locker->include_snap_rdlocks(diri, lov); + lov.add_xlock(&in->snaplock); + if (in->is_dir()) + lov.add_rdlock(&in->filelock); // to verify it's empty + + if (!mds->locker->acquire_locks(mdr, lov)) + return; + + if (in->is_dir() && + _dir_is_nonempty(mdr, in)) { + respond_to_request(mdr, -ENOTEMPTY); + return; + } + + if ((!mdr->has_more() || mdr->more()->witnessed.empty())) { + if (!check_access(mdr, diri, MAY_WRITE)) + return; + } + + if (straydn) + straydn->first = mdcache->get_global_snaprealm()->get_newest_seq() + 1; + + if (!mdr->more()->desti_srnode) { + if (in->is_projected_snaprealm_global()) { + sr_t *new_srnode = in->prepare_new_srnode(0); + in->record_snaprealm_parent_dentry(new_srnode, nullptr, dn, dnl->is_primary()); + // dropping the last linkage or dropping the last remote linkage, + // detch the inode from global snaprealm + auto nlink = in->get_projected_inode()->nlink; + if (nlink == 1 || + (nlink == 2 && !dnl->is_primary() && + !in->get_projected_parent_dir()->inode->is_stray())) + in->clear_snaprealm_global(new_srnode); + mdr->more()->desti_srnode = new_srnode; + } else if (dnl->is_primary()) { + // prepare snaprealm blob for slave request + SnapRealm *realm = in->find_snaprealm(); + snapid_t follows = realm->get_newest_seq(); + if (in->snaprealm || 
follows + 1 > in->get_oldest_snap()) { + sr_t *new_srnode = in->prepare_new_srnode(follows); + in->record_snaprealm_past_parent(new_srnode, straydn->get_dir()->inode->find_snaprealm()); + mdr->more()->desti_srnode = new_srnode; + } + } + } + + // yay! + if (in->is_dir() && in->has_subtree_root_dirfrag()) { + // subtree root auths need to be witnesses + set<mds_rank_t> witnesses; + in->list_replicas(witnesses); + dout(10) << " witnesses " << witnesses << ", have " << mdr->more()->witnessed << dendl; + + for (set<mds_rank_t>::iterator p = witnesses.begin(); + p != witnesses.end(); + ++p) { + if (mdr->more()->witnessed.count(*p)) { + dout(10) << " already witnessed by mds." << *p << dendl; + } else if (mdr->more()->waiting_on_slave.count(*p)) { + dout(10) << " already waiting on witness mds." << *p << dendl; + } else { + if (!_rmdir_prepare_witness(mdr, *p, trace, straydn)) + return; + } + } + if (!mdr->more()->waiting_on_slave.empty()) + return; // we're waiting for a witness. + } + + // ok! + if (dnl->is_remote() && !dnl->get_inode()->is_auth()) + _link_remote(mdr, false, dn, dnl->get_inode()); + else + _unlink_local(mdr, dn, straydn); +} + +class C_MDS_unlink_local_finish : public ServerLogContext { + CDentry *dn; + CDentry *straydn; + version_t dnpv; // deleted dentry +public: + C_MDS_unlink_local_finish(Server *s, MDRequestRef& r, CDentry *d, CDentry *sd) : + ServerLogContext(s, r), dn(d), straydn(sd), + dnpv(d->get_projected_version()) {} + void finish(int r) override { + ceph_assert(r == 0); + server->_unlink_local_finish(mdr, dn, straydn, dnpv); + } +}; + +void Server::_unlink_local(MDRequestRef& mdr, CDentry *dn, CDentry *straydn) +{ + dout(10) << "_unlink_local " << *dn << dendl; + + CDentry::linkage_t *dnl = dn->get_projected_linkage(); + CInode *in = dnl->get_inode(); + + + // ok, let's do it. 
+ mdr->ls = mdlog->get_current_segment(); + + // prepare log entry + EUpdate *le = new EUpdate(mdlog, "unlink_local"); + mdlog->start_entry(le); + le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid()); + if (!mdr->more()->witnessed.empty()) { + dout(20) << " noting uncommitted_slaves " << mdr->more()->witnessed << dendl; + le->reqid = mdr->reqid; + le->had_slaves = true; + mdcache->add_uncommitted_master(mdr->reqid, mdr->ls, mdr->more()->witnessed); + } + + if (straydn) { + ceph_assert(dnl->is_primary()); + straydn->push_projected_linkage(in); + } + + // the unlinked dentry + dn->pre_dirty(); + + auto &pi = in->project_inode(); + { + std::string t; + dn->make_path_string(t, true); + pi.inode.stray_prior_path = std::move(t); + } + pi.inode.version = in->pre_dirty(); + pi.inode.ctime = mdr->get_op_stamp(); + if (mdr->get_op_stamp() > pi.inode.rstat.rctime) + pi.inode.rstat.rctime = mdr->get_op_stamp(); + pi.inode.change_attr++; + pi.inode.nlink--; + if (pi.inode.nlink == 0) + in->state_set(CInode::STATE_ORPHAN); + + if (mdr->more()->desti_srnode) { + auto& desti_srnode = mdr->more()->desti_srnode; + in->project_snaprealm(desti_srnode); + desti_srnode = NULL; + } + + if (straydn) { + // will manually pop projected inode + + // primary link. add stray dentry. + mdcache->predirty_journal_parents(mdr, &le->metablob, in, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, -1); + mdcache->predirty_journal_parents(mdr, &le->metablob, in, straydn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1); + + pi.inode.update_backtrace(); + le->metablob.add_primary_dentry(straydn, in, true, true); + } else { + mdr->add_projected_inode(in); + // remote link. update remote inode. 
+ mdcache->predirty_journal_parents(mdr, &le->metablob, in, dn->get_dir(), PREDIRTY_DIR, -1); + mdcache->predirty_journal_parents(mdr, &le->metablob, in, 0, PREDIRTY_PRIMARY); + mdcache->journal_dirty_inode(mdr.get(), &le->metablob, in); + } + + mdcache->journal_cow_dentry(mdr.get(), &le->metablob, dn); + le->metablob.add_null_dentry(dn, true); + + if (in->is_dir()) { + dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl; + le->metablob.renamed_dirino = in->ino(); + } + + dn->push_projected_linkage(); + + if (straydn) { + ceph_assert(in->first <= straydn->first); + in->first = straydn->first; + } + + if (in->is_dir()) { + ceph_assert(straydn); + mdcache->project_subtree_rename(in, dn->get_dir(), straydn->get_dir()); + } + + journal_and_reply(mdr, 0, dn, le, new C_MDS_unlink_local_finish(this, mdr, dn, straydn)); +} + +void Server::_unlink_local_finish(MDRequestRef& mdr, + CDentry *dn, CDentry *straydn, + version_t dnpv) +{ + dout(10) << "_unlink_local_finish " << *dn << dendl; + + if (!mdr->more()->witnessed.empty()) + mdcache->logged_master_update(mdr->reqid); + + CInode *strayin = NULL; + bool hadrealm = false; + if (straydn) { + // if there is newly created snaprealm, need to split old snaprealm's + // inodes_with_caps. So pop snaprealm before linkage changes. + strayin = dn->get_linkage()->get_inode(); + hadrealm = strayin->snaprealm ? true : false; + strayin->early_pop_projected_snaprealm(); + } + + // unlink main dentry + dn->get_dir()->unlink_inode(dn); + dn->pop_projected_linkage(); + + // relink as stray? (i.e. was primary link?) + if (straydn) { + dout(20) << " straydn is " << *straydn << dendl; + straydn->pop_projected_linkage(); + + strayin->pop_and_dirty_projected_inode(mdr->ls); + + mdcache->touch_dentry_bottom(straydn); + } + + dn->mark_dirty(dnpv, mdr->ls); + mdr->apply(); + + mdcache->send_dentry_unlink(dn, straydn, mdr); + + if (straydn) { + // update subtree map? 
+ if (strayin->is_dir()) + mdcache->adjust_subtree_after_rename(strayin, dn->get_dir(), true); + + if (strayin->snaprealm && !hadrealm) + mdcache->do_realm_invalidate_and_update_notify(strayin, CEPH_SNAP_OP_SPLIT, false); + } + + // bump pop + mds->balancer->hit_dir(dn->get_dir(), META_POP_IWR); + + // reply + respond_to_request(mdr, 0); + + // removing a new dn? + dn->get_dir()->try_remove_unlinked_dn(dn); + + // clean up ? + // respond_to_request() drops locks. So stray reintegration can race with us. + if (straydn && !straydn->get_projected_linkage()->is_null()) { + // Tip off the MDCache that this dentry is a stray that + // might be elegible for purge. + mdcache->notify_stray(straydn); + } +} + +bool Server::_rmdir_prepare_witness(MDRequestRef& mdr, mds_rank_t who, vector<CDentry*>& trace, CDentry *straydn) +{ + if (mds->is_cluster_degraded() && + !mds->mdsmap->is_clientreplay_or_active_or_stopping(who)) { + dout(10) << "_rmdir_prepare_witness mds." << who << " is not active" << dendl; + if (mdr->more()->waiting_on_slave.empty()) + mds->wait_for_active_peer(who, new C_MDS_RetryRequest(mdcache, mdr)); + return false; + } + + dout(10) << "_rmdir_prepare_witness mds." 
<< who << dendl; + auto req = MMDSSlaveRequest::create(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RMDIRPREP); + req->srcdnpath = filepath(trace.front()->get_dir()->ino()); + for (auto dn : trace) + req->srcdnpath.push_dentry(dn->get_name()); + mdcache->replicate_stray(straydn, who, req->straybl); + if (mdr->more()->desti_srnode) + encode(*mdr->more()->desti_srnode, req->desti_snapbl); + + req->op_stamp = mdr->get_op_stamp(); + mds->send_message_mds(req, who); + + ceph_assert(mdr->more()->waiting_on_slave.count(who) == 0); + mdr->more()->waiting_on_slave.insert(who); + return true; +} + +struct C_MDS_SlaveRmdirPrep : public ServerLogContext { + CDentry *dn, *straydn; + C_MDS_SlaveRmdirPrep(Server *s, MDRequestRef& r, CDentry *d, CDentry *st) + : ServerLogContext(s, r), dn(d), straydn(st) {} + void finish(int r) override { + server->_logged_slave_rmdir(mdr, dn, straydn); + } +}; + +struct C_MDS_SlaveRmdirCommit : public ServerContext { + MDRequestRef mdr; + CDentry *straydn; + C_MDS_SlaveRmdirCommit(Server *s, MDRequestRef& r, CDentry *sd) + : ServerContext(s), mdr(r), straydn(sd) { } + void finish(int r) override { + server->_commit_slave_rmdir(mdr, r, straydn); + } +}; + +void Server::handle_slave_rmdir_prep(MDRequestRef& mdr) +{ + dout(10) << "handle_slave_rmdir_prep " << *mdr + << " " << mdr->slave_request->srcdnpath + << " to " << mdr->slave_request->destdnpath + << dendl; + + vector<CDentry*> trace; + filepath srcpath(mdr->slave_request->srcdnpath); + dout(10) << " src " << srcpath << dendl; + CInode *in; + CF_MDS_MDRContextFactory cf(mdcache, mdr); + int r = mdcache->path_traverse(mdr, cf, srcpath, &trace, &in, MDS_TRAVERSE_DISCOVERXLOCK); + if (r > 0) return; + if (r == -ESTALE) { + mdcache->find_ino_peers(srcpath.get_ino(), new C_MDS_RetryRequest(mdcache, mdr), + mdr->slave_to_mds); + return; + } + ceph_assert(r == 0); + CDentry *dn = trace.back(); + dout(10) << " dn " << *dn << dendl; + mdr->pin(dn); + + ceph_assert(mdr->straydn); + CDentry *straydn = 
mdr->straydn; + dout(10) << " straydn " << *straydn << dendl; + + mdr->set_op_stamp(mdr->slave_request->op_stamp); + + rmdir_rollback rollback; + rollback.reqid = mdr->reqid; + rollback.src_dir = dn->get_dir()->dirfrag(); + rollback.src_dname = dn->get_name(); + rollback.dest_dir = straydn->get_dir()->dirfrag(); + rollback.dest_dname = straydn->get_name(); + if (mdr->slave_request->desti_snapbl.length()) { + if (in->snaprealm) { + encode(true, rollback.snapbl); + in->encode_snap_blob(rollback.snapbl); + } else { + encode(false, rollback.snapbl); + } + } + encode(rollback, mdr->more()->rollback_bl); + // FIXME: rollback snaprealm + dout(20) << " rollback is " << mdr->more()->rollback_bl.length() << " bytes" << dendl; + + // set up commit waiter + mdr->more()->slave_commit = new C_MDS_SlaveRmdirCommit(this, mdr, straydn); + + straydn->push_projected_linkage(in); + dn->push_projected_linkage(); + + ceph_assert(straydn->first >= in->first); + in->first = straydn->first; + + if (!in->has_subtree_root_dirfrag(mds->get_nodeid())) { + dout(10) << " no auth subtree in " << *in << ", skipping journal" << dendl; + _logged_slave_rmdir(mdr, dn, straydn); + return; + } + + mdr->ls = mdlog->get_current_segment(); + ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rmdir", mdr->reqid, mdr->slave_to_mds, + ESlaveUpdate::OP_PREPARE, ESlaveUpdate::RMDIR); + mdlog->start_entry(le); + le->rollback = mdr->more()->rollback_bl; + + le->commit.add_dir_context(straydn->get_dir()); + le->commit.add_primary_dentry(straydn, in, true); + // slave: no need to journal original dentry + + dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl; + le->commit.renamed_dirino = in->ino(); + + mdcache->project_subtree_rename(in, dn->get_dir(), straydn->get_dir()); + mdcache->add_uncommitted_slave(mdr->reqid, mdr->ls, mdr->slave_to_mds); + + mdr->more()->slave_update_journaled = true; + submit_mdlog_entry(le, new C_MDS_SlaveRmdirPrep(this, mdr, dn, straydn), + mdr, 
__func__); + mdlog->flush(); +} + +void Server::_logged_slave_rmdir(MDRequestRef& mdr, CDentry *dn, CDentry *straydn) +{ + dout(10) << "_logged_slave_rmdir " << *mdr << " on " << *dn << dendl; + CInode *in = dn->get_linkage()->get_inode(); + + bool new_realm; + if (mdr->slave_request->desti_snapbl.length()) { + new_realm = !in->snaprealm; + in->decode_snap_blob(mdr->slave_request->desti_snapbl); + ceph_assert(in->snaprealm); + ceph_assert(in->snaprealm->have_past_parents_open()); + } else { + new_realm = false; + } + + // update our cache now, so we are consistent with what is in the journal + // when we journal a subtree map + dn->get_dir()->unlink_inode(dn); + straydn->pop_projected_linkage(); + dn->pop_projected_linkage(); + + mdcache->adjust_subtree_after_rename(in, dn->get_dir(), mdr->more()->slave_update_journaled); + + if (new_realm) + mdcache->do_realm_invalidate_and_update_notify(in, CEPH_SNAP_OP_SPLIT, false); + + // done. + mdr->reset_slave_request(); + mdr->straydn = 0; + + if (!mdr->aborted) { + auto reply = MMDSSlaveRequest::create(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RMDIRPREPACK); + if (!mdr->more()->slave_update_journaled) + reply->mark_not_journaled(); + mds->send_message_mds(reply, mdr->slave_to_mds); + } else { + dout(10) << " abort flag set, finishing" << dendl; + mdcache->request_finish(mdr); + } +} + +void Server::handle_slave_rmdir_prep_ack(MDRequestRef& mdr, const MMDSSlaveRequest::const_ref &ack) +{ + dout(10) << "handle_slave_rmdir_prep_ack " << *mdr + << " " << *ack << dendl; + + mds_rank_t from = mds_rank_t(ack->get_source().num()); + + mdr->more()->slaves.insert(from); + mdr->more()->witnessed.insert(from); + if (!ack->is_not_journaled()) + mdr->more()->has_journaled_slaves = true; + + // remove from waiting list + ceph_assert(mdr->more()->waiting_on_slave.count(from)); + mdr->more()->waiting_on_slave.erase(from); + + if (mdr->more()->waiting_on_slave.empty()) + dispatch_client_request(mdr); // go again! 
+ else + dout(10) << "still waiting on slaves " << mdr->more()->waiting_on_slave << dendl; +} + +void Server::_commit_slave_rmdir(MDRequestRef& mdr, int r, CDentry *straydn) +{ + dout(10) << "_commit_slave_rmdir " << *mdr << " r=" << r << dendl; + + if (r == 0) { + if (mdr->more()->slave_update_journaled) { + CInode *strayin = straydn->get_projected_linkage()->get_inode(); + if (strayin && !strayin->snaprealm) + mdcache->clear_dirty_bits_for_stray(strayin); + } + + mdr->cleanup(); + + if (mdr->more()->slave_update_journaled) { + // write a commit to the journal + ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rmdir_commit", mdr->reqid, + mdr->slave_to_mds, ESlaveUpdate::OP_COMMIT, + ESlaveUpdate::RMDIR); + mdlog->start_entry(le); + submit_mdlog_entry(le, new C_MDS_CommittedSlave(this, mdr), mdr, __func__); + mdlog->flush(); + } else { + _committed_slave(mdr); + } + } else { + // abort + do_rmdir_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr); + } +} + +struct C_MDS_LoggedRmdirRollback : public ServerLogContext { + metareqid_t reqid; + CDentry *dn; + CDentry *straydn; + C_MDS_LoggedRmdirRollback(Server *s, MDRequestRef& m, metareqid_t mr, CDentry *d, CDentry *st) + : ServerLogContext(s, m), reqid(mr), dn(d), straydn(st) {} + void finish(int r) override { + server->_rmdir_rollback_finish(mdr, reqid, dn, straydn); + } +}; + +void Server::do_rmdir_rollback(bufferlist &rbl, mds_rank_t master, MDRequestRef& mdr) +{ + // unlink the other rollback methods, the rmdir rollback is only + // needed to record the subtree changes in the journal for inode + // replicas who are auth for empty dirfrags. no actual changes to + // the file system are taking place here, so there is no Mutation. 
+
+ rmdir_rollback rollback;
+ auto p = rbl.cbegin();
+ decode(rollback, p);
+
+ dout(10) << "do_rmdir_rollback on " << rollback.reqid << dendl;
+ mdcache->add_rollback(rollback.reqid, master); // need to finish this update before resolve finishes
+ ceph_assert(mdr || mds->is_resolve());
+
+ // look up the source dentry and the stray dentry recorded in the blob;
+ // both must still be in cache for a rollback to make sense.
+ CDir *dir = mdcache->get_dirfrag(rollback.src_dir);
+ if (!dir)
+ dir = mdcache->get_dirfrag(rollback.src_dir.ino, rollback.src_dname);
+ ceph_assert(dir);
+ CDentry *dn = dir->lookup(rollback.src_dname);
+ ceph_assert(dn);
+ dout(10) << " dn " << *dn << dendl;
+ CDir *straydir = mdcache->get_dirfrag(rollback.dest_dir);
+ ceph_assert(straydir);
+ CDentry *straydn = straydir->lookup(rollback.dest_dname);
+ ceph_assert(straydn);
+ dout(10) << " straydn " << *straydn << dendl;
+ CInode *in = straydn->get_linkage()->get_inode();
+
+ // project the inode back under its original dentry and the stray back to
+ // null, undoing the prepared rmdir.
+ dn->push_projected_linkage(in);
+ straydn->push_projected_linkage();
+
+ if (rollback.snapbl.length() && in->snaprealm) {
+ bool hadrealm;
+ auto p = rollback.snapbl.cbegin();
+ decode(hadrealm, p);
+ if (hadrealm) {
+ decode(in->snaprealm->srnode, p);
+ } else {
+ in->snaprealm->merge_to(dir->get_inode()->find_snaprealm());
+ }
+ }
+
+ if (mdr && !mdr->more()->slave_update_journaled) {
+ ceph_assert(!in->has_subtree_root_dirfrag(mds->get_nodeid()));
+
+ // the prepare was never journaled, so there is nothing to journal for
+ // the rollback either; finish synchronously.
+ _rmdir_rollback_finish(mdr, rollback.reqid, dn, straydn);
+ return;
+ }
+
+
+ ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rmdir_rollback", rollback.reqid, master,
+ ESlaveUpdate::OP_ROLLBACK, ESlaveUpdate::RMDIR);
+ mdlog->start_entry(le);
+
+ le->commit.add_dir_context(dn->get_dir());
+ le->commit.add_primary_dentry(dn, in, true);
+ // slave: no need to journal straydn
+
+ dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl;
+ le->commit.renamed_dirino = in->ino();
+
+ mdcache->project_subtree_rename(in, straydn->get_dir(), dn->get_dir());
+
+ submit_mdlog_entry(le,
+ new C_MDS_LoggedRmdirRollback(this, mdr,rollback.reqid,
+ dn, straydn),
+ mdr, __func__);
+
mdlog->flush();
+}
+
+// Final step of the rmdir rollback (run directly, or as the log-completion
+// callback when the rollback was journaled): pop the projected linkages so
+// the inode is back under its original dentry, restore subtree placement,
+// and finish the rollback bookkeeping.
+void Server::_rmdir_rollback_finish(MDRequestRef& mdr, metareqid_t reqid, CDentry *dn, CDentry *straydn)
+{
+ dout(10) << "_rmdir_rollback_finish " << reqid << dendl;
+
+ straydn->get_dir()->unlink_inode(straydn);
+ dn->pop_projected_linkage();
+ straydn->pop_projected_linkage();
+
+ CInode *in = dn->get_linkage()->get_inode();
+ mdcache->adjust_subtree_after_rename(in, straydn->get_dir(),
+ !mdr || mdr->more()->slave_update_journaled);
+
+ if (mds->is_resolve()) {
+ // during resolve, trim any non-auth subtree we no longer need
+ CDir *root = mdcache->get_subtree_root(straydn->get_dir());
+ mdcache->try_trim_non_auth_subtree(root);
+ }
+
+ if (mdr)
+ mdcache->request_finish(mdr);
+
+ mdcache->finish_rollback(reqid, mdr);
+}
+
+
+/** _dir_is_nonempty[_unlocked]
+ *
+ * check if a directory is non-empty (i.e. we can rmdir it).
+ *
+ * the unlocked variant is a fastpath check. we can't really be
+ * sure until we rdlock the filelock.
+ */
+bool Server::_dir_is_nonempty_unlocked(MDRequestRef& mdr, CInode *in)
+{
+ dout(10) << "dir_is_nonempty_unlocked " << *in << dendl;
+ ceph_assert(in->is_auth());
+
+ if (in->snaprealm && in->snaprealm->srnode.snaps.size())
+ return true; // in a snapshot!
+
+ list<CDir*> ls;
+ in->get_dirfrags(ls);
+ for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
+ CDir *dir = *p;
+ // is the frag obviously non-empty?
+ if (dir->is_auth()) {
+ if (dir->get_projected_fnode()->fragstat.size()) {
+ dout(10) << "dir_is_nonempty_unlocked dirstat has "
+ << dir->get_projected_fnode()->fragstat.size() << " items " << *dir << dendl;
+ return true;
+ }
+ }
+ }
+
+ return false;
+}
+
+// Authoritative emptiness check; caller must hold a read lock on the
+// inode's filelock (asserted below). Returns true if any dirfrag's
+// projected fragstat is non-empty, or if the summed fragstats disagree
+// with the projected inode dirstat.
+bool Server::_dir_is_nonempty(MDRequestRef& mdr, CInode *in)
+{
+ dout(10) << "dir_is_nonempty " << *in << dendl;
+ ceph_assert(in->is_auth());
+ ceph_assert(in->filelock.can_read(mdr->get_client()));
+
+ frag_info_t dirstat;
+ version_t dirstat_version = in->get_projected_inode()->dirstat.version;
+
+ list<CDir*> ls;
+ in->get_dirfrags(ls);
+ for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
+ CDir *dir = *p;
+ const fnode_t *pf = dir->get_projected_fnode();
+ if (pf->fragstat.size()) {
+ dout(10) << "dir_is_nonempty dirstat has "
+ << pf->fragstat.size() << " items " << *dir << dendl;
+ return true;
+ }
+
+ // prefer the accounted fragstat when it matches the inode's dirstat
+ // version; otherwise fall back to the live fragstat.
+ if (pf->accounted_fragstat.version == dirstat_version)
+ dirstat.add(pf->accounted_fragstat);
+ else
+ dirstat.add(pf->fragstat);
+ }
+
+ return dirstat.size() != in->get_projected_inode()->dirstat.size();
+}
+
+
+// ======================================================
+
+
+// Log-completion context for a master-side rename: once the EUpdate is
+// durable, applies the rename via Server::_rename_finish.
+class C_MDS_rename_finish : public ServerLogContext {
+ CDentry *srcdn;
+ CDentry *destdn;
+ CDentry *straydn;
+public:
+ C_MDS_rename_finish(Server *s, MDRequestRef& r,
+ CDentry *sdn, CDentry *ddn, CDentry *stdn) :
+ ServerLogContext(s, r),
+ srcdn(sdn), destdn(ddn), straydn(stdn) { }
+ void finish(int r) override {
+ ceph_assert(r == 0);
+ server->_rename_finish(mdr, srcdn, destdn, straydn);
+ }
+};
+
+
+/** handle_client_rename
+ *
+ * rename master is the destdn auth. this is because cached inodes
+ * must remain connected. thus, any replica of srci, must also
+ * replicate destdn, and possibly straydn, so that srci (and
+ * destdn->inode) remain connected during the rename.
+ *
+ * to do this, we freeze srci, then master (destdn auth) verifies that
+ * all other nodes have also replicated destdn and straydn.
note that + * destdn replicas need not also replicate srci. this only works when + * destdn is master. + * + * This function takes responsibility for the passed mdr. + */ +void Server::handle_client_rename(MDRequestRef& mdr) +{ + const MClientRequest::const_ref &req = mdr->client_request; + dout(7) << "handle_client_rename " << *req << dendl; + + filepath destpath = req->get_filepath(); + filepath srcpath = req->get_filepath2(); + if (destpath.depth() == 0 || srcpath.depth() == 0) { + respond_to_request(mdr, -EINVAL); + return; + } + if (srcpath.is_last_dot_or_dotdot() || destpath.is_last_dot_or_dotdot()) { + respond_to_request(mdr, -EBUSY); + return; + } + + std::string_view destname = destpath.last_dentry(); + + vector<CDentry*>& srctrace = mdr->dn[1]; + vector<CDentry*>& desttrace = mdr->dn[0]; + + MutationImpl::LockOpVec lov; + + CDentry *destdn = rdlock_path_xlock_dentry(mdr, 0, lov, true, false, true); + if (!destdn) return; + dout(10) << " destdn " << *destdn << dendl; + if (mdr->snapid != CEPH_NOSNAP) { + respond_to_request(mdr, -EROFS); + return; + } + CDentry::linkage_t *destdnl = destdn->get_projected_linkage(); + CDir *destdir = destdn->get_dir(); + ceph_assert(destdir->is_auth()); + + CF_MDS_MDRContextFactory cf(mdcache, mdr); + int r = mdcache->path_traverse(mdr, cf, srcpath, &srctrace, NULL, MDS_TRAVERSE_DISCOVER); + if (r > 0) + return; // delayed + if (r < 0) { + if (r == -ESTALE) { + dout(10) << "FAIL on ESTALE but attempting recovery" << dendl; + mdcache->find_ino_peers(srcpath.get_ino(), new C_MDS_TryFindInode(this, mdr)); + } else { + dout(10) << "FAIL on error " << r << dendl; + respond_to_request(mdr, r); + } + return; + + } + ceph_assert(!srctrace.empty()); + CDentry *srcdn = srctrace.back(); + dout(10) << " srcdn " << *srcdn << dendl; + if (srcdn->last != CEPH_NOSNAP) { + respond_to_request(mdr, -EROFS); + return; + } + CDir *srcdir = srcdn->get_dir(); + CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage(); + CInode *srci = 
srcdnl->get_inode(); + dout(10) << " srci " << *srci << dendl; + + CInode *oldin = 0; + if (!destdnl->is_null()) { + //dout(10) << "dest dn exists " << *destdn << dendl; + oldin = mdcache->get_dentry_inode(destdn, mdr, true); + if (!oldin) return; + dout(10) << " oldin " << *oldin << dendl; + + // non-empty dir? do trivial fast unlocked check, do another check later with read locks + if (oldin->is_dir() && _dir_is_nonempty_unlocked(mdr, oldin)) { + respond_to_request(mdr, -ENOTEMPTY); + return; + } + + // if srcdn is replica, need to make sure its linkage is correct + if (srcdn->is_auth() || + srcdn->lock.can_read(mdr->get_client()) || + (srcdn->lock.is_xlocked() && srcdn->lock.get_xlock_by() == mdr)) { + // mv /some/thing /to/some/existing_other_thing + if (oldin->is_dir() && !srci->is_dir()) { + respond_to_request(mdr, -EISDIR); + return; + } + if (!oldin->is_dir() && srci->is_dir()) { + respond_to_request(mdr, -ENOTDIR); + return; + } + if (srci == oldin && !srcdir->inode->is_stray()) { + respond_to_request(mdr, 0); // no-op. POSIX makes no sense. + return; + } + } + } + + // -- some sanity checks -- + + // src+dest traces _must_ share a common ancestor for locking to prevent orphans + if (destpath.get_ino() != srcpath.get_ino() && + !(req->get_source().is_mds() && + MDS_INO_IS_MDSDIR(srcpath.get_ino()))) { // <-- mds 'rename' out of stray dir is ok! + CInode *srcbase = srctrace[0]->get_dir()->get_inode(); + CInode *destbase = desttrace[0]->get_dir()->get_inode(); + // ok, extend srctrace toward root until it is an ancestor of desttrace. + while (srcbase != destbase && + !srcbase->is_projected_ancestor_of(destbase)) { + CDentry *pdn = srcbase->get_projected_parent_dn(); + srctrace.insert(srctrace.begin(), pdn); + dout(10) << "rename prepending srctrace with " << *pdn << dendl; + srcbase = pdn->get_dir()->get_inode(); + } + + // then, extend destpath until it shares the same parent inode as srcpath. 
+ while (destbase != srcbase) { + CDentry *pdn = destbase->get_projected_parent_dn(); + desttrace.insert(desttrace.begin(), pdn); + lov.add_rdlock(&pdn->lock); + dout(10) << "rename prepending desttrace with " << *pdn << dendl; + destbase = pdn->get_dir()->get_inode(); + } + dout(10) << "rename src and dest traces now share common ancestor " << *destbase << dendl; + } + + // src == dest? + if (srcdir == destdir && srcdn->get_name() == destname) { + dout(7) << "rename src=dest, noop" << dendl; + respond_to_request(mdr, 0); + return; + } + + // dest a child of src? + // e.g. mv /usr /usr/foo + CDentry *pdn = destdir->inode->get_projected_parent_dn(); + while (pdn) { + if (pdn == srcdn) { + dout(7) << "cannot rename item to be a child of itself" << dendl; + respond_to_request(mdr, -EINVAL); + return; + } + pdn = pdn->get_dir()->inode->parent; + } + + // is this a stray migration, reintegration or merge? (sanity checks!) + if (mdr->reqid.name.is_mds() && + !(MDS_INO_IS_MDSDIR(srcpath.get_ino()) && + MDS_INO_IS_MDSDIR(destpath.get_ino())) && + !(destdnl->is_remote() && + destdnl->get_remote_ino() == srci->ino())) { + respond_to_request(mdr, -EINVAL); // actually, this won't reply, but whatev. + return; + } + + bool linkmerge = srcdnl->get_inode() == destdnl->get_inode(); + if (linkmerge) + dout(10) << " this is a link merge" << dendl; + + // -- create stray dentry? -- + CDentry *straydn = NULL; + if (destdnl->is_primary() && !linkmerge) { + straydn = prepare_stray_dentry(mdr, destdnl->get_inode()); + if (!straydn) + return; + dout(10) << " straydn is " << *straydn << dendl; + } else if (mdr->straydn) { + mdr->unpin(mdr->straydn); + mdr->straydn = NULL; + } + + // -- prepare witness list -- + /* + * NOTE: we use _all_ replicas as witnesses. 
+ * this probably isn't totally necessary (esp for file renames), + * but if/when we change that, we have to make sure rejoin is + * sufficiently robust to handle strong rejoins from survivors + * with totally wrong dentry->inode linkage. + * (currently, it can ignore rename effects, because the resolve + * stage will sort them out.) + */ + set<mds_rank_t> witnesses = mdr->more()->extra_witnesses; + if (srcdn->is_auth()) + srcdn->list_replicas(witnesses); + else + witnesses.insert(srcdn->authority().first); + if (srcdnl->is_remote() && !srci->is_auth()) + witnesses.insert(srci->authority().first); + destdn->list_replicas(witnesses); + if (destdnl->is_remote() && !oldin->is_auth()) + witnesses.insert(oldin->authority().first); + dout(10) << " witnesses " << witnesses << ", have " << mdr->more()->witnessed << dendl; + + + // -- locks -- + + // srctrace items. this mirrors locks taken in rdlock_path_xlock_dentry + for (int i=0; i<(int)srctrace.size(); i++) + lov.add_rdlock(&srctrace[i]->lock); + lov.add_xlock(&srcdn->lock); + mds_rank_t srcdirauth = srcdir->authority().first; + if (srcdirauth != mds->get_nodeid()) { + dout(10) << " will remote_wrlock srcdir scatterlocks on mds." << srcdirauth << dendl; + lov.add_remote_wrlock(&srcdir->inode->filelock, srcdirauth); + lov.add_remote_wrlock(&srcdir->inode->nestlock, srcdirauth); + if (srci->is_dir()) + lov.add_rdlock(&srci->dirfragtreelock); + } else { + lov.add_wrlock(&srcdir->inode->filelock); + lov.add_wrlock(&srcdir->inode->nestlock); + } + mds->locker->include_snap_rdlocks(srcdir->inode, lov); + + // straydn? + if (straydn) { + lov.add_wrlock(&straydn->get_dir()->inode->filelock); + lov.add_wrlock(&straydn->get_dir()->inode->nestlock); + lov.add_xlock(&straydn->lock); + } + + // xlock versionlock on dentries if there are witnesses. + // replicas can't see projected dentry linkages, and will get + // confused if we try to pipeline things. 
+ if (!witnesses.empty()) { + // take xlock on all projected ancestor dentries for srcdn and destdn. + // this ensures the srcdn and destdn can be traversed to by the witnesses. + for (int i= 0; i<(int)srctrace.size(); i++) { + if (srctrace[i]->is_auth() && srctrace[i]->is_projected()) + lov.add_xlock(&srctrace[i]->versionlock); + } + for (int i=0; i<(int)desttrace.size(); i++) { + if (desttrace[i]->is_auth() && desttrace[i]->is_projected()) + lov.add_xlock(&desttrace[i]->versionlock); + } + // xlock srci and oldin's primary dentries, so witnesses can call + // open_remote_ino() with 'want_locked=true' when the srcdn or destdn + // is traversed. + if (srcdnl->is_remote()) + lov.add_xlock(&srci->get_projected_parent_dn()->lock); + if (destdnl->is_remote()) + lov.add_xlock(&oldin->get_projected_parent_dn()->lock); + } + + // we need to update srci's ctime. xlock its least contended lock to do that... + lov.add_xlock(&srci->linklock); + lov.add_xlock(&srci->snaplock); + + if (oldin) { + // xlock oldin (for nlink--) + lov.add_xlock(&oldin->linklock); + lov.add_xlock(&oldin->snaplock); + if (oldin->is_dir()) + lov.add_rdlock(&oldin->filelock); // to verify it's empty + } + + CInode *auth_pin_freeze = !srcdn->is_auth() && srcdnl->is_primary() ? 
srci : NULL; + if (!mds->locker->acquire_locks(mdr, lov, auth_pin_freeze)) + return; + + if (linkmerge) + ceph_assert(srcdir->inode->is_stray() && srcdnl->is_primary() && destdnl->is_remote()); + + if ((!mdr->has_more() || mdr->more()->witnessed.empty())) { + if (!check_access(mdr, srcdir->get_inode(), MAY_WRITE)) + return; + + if (!check_access(mdr, destdn->get_dir()->get_inode(), MAY_WRITE)) + return; + + if (!check_fragment_space(mdr, destdn->get_dir())) + return; + + if (!check_access(mdr, srci, MAY_WRITE)) + return; + } + + // with read lock, really verify oldin is empty + if (oldin && + oldin->is_dir() && + _dir_is_nonempty(mdr, oldin)) { + respond_to_request(mdr, -ENOTEMPTY); + return; + } + + /* project_snaprealm_past_parent() will do this job + * + // moving between snaprealms? + if (srcdnl->is_primary() && srci->is_multiversion() && !srci->snaprealm) { + SnapRealm *srcrealm = srci->find_snaprealm(); + SnapRealm *destrealm = destdn->get_dir()->inode->find_snaprealm(); + if (srcrealm != destrealm && + (srcrealm->get_newest_seq() + 1 > srcdn->first || + destrealm->get_newest_seq() + 1 > srcdn->first)) { + dout(10) << " renaming between snaprealms, creating snaprealm for " << *srci << dendl; + mdcache->snaprealm_create(mdr, srci); + return; + } + } + */ + + SnapRealm *dest_realm = nullptr; + SnapRealm *src_realm = nullptr; + if (!linkmerge) { + dest_realm = destdir->inode->find_snaprealm(); + if (srcdir->inode == destdir->inode) + src_realm = dest_realm; + else + src_realm = srcdir->inode->find_snaprealm(); + if (src_realm != dest_realm && + src_realm->get_subvolume_ino() != dest_realm->get_subvolume_ino()) { + respond_to_request(mdr, -EXDEV); + return; + } + } + + ceph_assert(g_conf()->mds_kill_rename_at != 1); + + // -- open all srcdn inode frags, if any -- + // we need these open so that auth can properly delegate from inode to dirfrags + // after the inode is _ours_. 
+ if (srcdnl->is_primary() && + !srcdn->is_auth() && + srci->is_dir()) { + dout(10) << "srci is remote dir, setting stickydirs and opening all frags" << dendl; + mdr->set_stickydirs(srci); + + frag_vec_t leaves; + srci->dirfragtree.get_leaves(leaves); + for (const auto& leaf : leaves) { + CDir *dir = srci->get_dirfrag(leaf); + if (!dir) { + dout(10) << " opening " << leaf << " under " << *srci << dendl; + mdcache->open_remote_dirfrag(srci, leaf, new C_MDS_RetryRequest(mdcache, mdr)); + return; + } + } + } + + // -- prepare snaprealm --- + + if (linkmerge) { + if (!mdr->more()->srci_srnode && + srci->get_projected_inode()->nlink == 1 && + srci->is_projected_snaprealm_global()) { + sr_t *new_srnode = srci->prepare_new_srnode(0); + srci->record_snaprealm_parent_dentry(new_srnode, nullptr, destdn, false); + + srci->clear_snaprealm_global(new_srnode); + mdr->more()->srci_srnode = new_srnode; + } + } else { + if (oldin && !mdr->more()->desti_srnode) { + if (oldin->is_projected_snaprealm_global()) { + sr_t *new_srnode = oldin->prepare_new_srnode(0); + oldin->record_snaprealm_parent_dentry(new_srnode, dest_realm, destdn, destdnl->is_primary()); + // dropping the last linkage or dropping the last remote linkage, + // detch the inode from global snaprealm + auto nlink = oldin->get_projected_inode()->nlink; + if (nlink == 1 || + (nlink == 2 && !destdnl->is_primary() && + !oldin->get_projected_parent_dir()->inode->is_stray())) + oldin->clear_snaprealm_global(new_srnode); + mdr->more()->desti_srnode = new_srnode; + } else if (destdnl->is_primary()) { + snapid_t follows = dest_realm->get_newest_seq(); + if (oldin->snaprealm || follows + 1 > oldin->get_oldest_snap()) { + sr_t *new_srnode = oldin->prepare_new_srnode(follows); + oldin->record_snaprealm_past_parent(new_srnode, straydn->get_dir()->inode->find_snaprealm()); + mdr->more()->desti_srnode = new_srnode; + } + } + } + if (!mdr->more()->srci_srnode) { + if (srci->is_projected_snaprealm_global()) { + sr_t *new_srnode = 
srci->prepare_new_srnode(0); + srci->record_snaprealm_parent_dentry(new_srnode, src_realm, srcdn, srcdnl->is_primary()); + mdr->more()->srci_srnode = new_srnode; + } else if (srcdnl->is_primary()) { + snapid_t follows = src_realm->get_newest_seq(); + if (src_realm != dest_realm && + (srci->snaprealm || follows + 1 > srci->get_oldest_snap())) { + sr_t *new_srnode = srci->prepare_new_srnode(follows); + srci->record_snaprealm_past_parent(new_srnode, dest_realm); + mdr->more()->srci_srnode = new_srnode; + } + } + } + } + + // -- prepare witnesses -- + + // do srcdn auth last + mds_rank_t last = MDS_RANK_NONE; + if (!srcdn->is_auth()) { + last = srcdn->authority().first; + mdr->more()->srcdn_auth_mds = last; + // ask auth of srci to mark srci as ambiguous auth if more than two MDS + // are involved in the rename operation. + if (srcdnl->is_primary() && !mdr->more()->is_ambiguous_auth) { + dout(10) << " preparing ambiguous auth for srci" << dendl; + ceph_assert(mdr->more()->is_remote_frozen_authpin); + ceph_assert(mdr->more()->rename_inode == srci); + _rename_prepare_witness(mdr, last, witnesses, srctrace, desttrace, straydn); + return; + } + } + + for (set<mds_rank_t>::iterator p = witnesses.begin(); + p != witnesses.end(); + ++p) { + if (*p == last) continue; // do it last! + if (mdr->more()->witnessed.count(*p)) { + dout(10) << " already witnessed by mds." << *p << dendl; + } else if (mdr->more()->waiting_on_slave.count(*p)) { + dout(10) << " already waiting on witness mds." << *p << dendl; + } else { + if (!_rename_prepare_witness(mdr, *p, witnesses, srctrace, desttrace, straydn)) + return; + } + } + if (!mdr->more()->waiting_on_slave.empty()) + return; // we're waiting for a witness. 
+ + if (last != MDS_RANK_NONE && mdr->more()->witnessed.count(last) == 0) { + dout(10) << " preparing last witness (srcdn auth)" << dendl; + ceph_assert(mdr->more()->waiting_on_slave.count(last) == 0); + _rename_prepare_witness(mdr, last, witnesses, srctrace, desttrace, straydn); + return; + } + + // test hack: bail after slave does prepare, so we can verify it's _live_ rollback. + if (!mdr->more()->slaves.empty() && !srci->is_dir()) + ceph_assert(g_conf()->mds_kill_rename_at != 3); + if (!mdr->more()->slaves.empty() && srci->is_dir()) + ceph_assert(g_conf()->mds_kill_rename_at != 4); + + // -- declare now -- + mdr->set_mds_stamp(ceph_clock_now()); + + // -- prepare journal entry -- + mdr->ls = mdlog->get_current_segment(); + EUpdate *le = new EUpdate(mdlog, "rename"); + mdlog->start_entry(le); + le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid()); + if (!mdr->more()->witnessed.empty()) { + dout(20) << " noting uncommitted_slaves " << mdr->more()->witnessed << dendl; + + le->reqid = mdr->reqid; + le->had_slaves = true; + + mdcache->add_uncommitted_master(mdr->reqid, mdr->ls, mdr->more()->witnessed); + // no need to send frozen auth pin to recovring auth MDS of srci + mdr->more()->is_remote_frozen_authpin = false; + } + + _rename_prepare(mdr, &le->metablob, &le->client_map, srcdn, destdn, straydn); + if (le->client_map.length()) + le->cmapv = mds->sessionmap.get_projected(); + + // -- commit locally -- + C_MDS_rename_finish *fin = new C_MDS_rename_finish(this, mdr, srcdn, destdn, straydn); + + journal_and_reply(mdr, srci, destdn, le, fin); + mds->balancer->maybe_fragment(destdn->get_dir(), false); +} + + +void Server::_rename_finish(MDRequestRef& mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn) +{ + dout(10) << "_rename_finish " << *mdr << dendl; + + if (!mdr->more()->witnessed.empty()) + mdcache->logged_master_update(mdr->reqid); + + // apply + _rename_apply(mdr, srcdn, destdn, straydn); + + mdcache->send_dentry_link(destdn, 
mdr); + + CDentry::linkage_t *destdnl = destdn->get_linkage(); + CInode *in = destdnl->get_inode(); + bool need_eval = mdr->more()->cap_imports.count(in); + + // test hack: test slave commit + if (!mdr->more()->slaves.empty() && !in->is_dir()) + ceph_assert(g_conf()->mds_kill_rename_at != 5); + if (!mdr->more()->slaves.empty() && in->is_dir()) + ceph_assert(g_conf()->mds_kill_rename_at != 6); + + // bump popularity + mds->balancer->hit_dir(srcdn->get_dir(), META_POP_IWR); + if (destdnl->is_remote() && in->is_auth()) + mds->balancer->hit_inode(in, META_POP_IWR); + + // did we import srci? if so, explicitly ack that import that, before we unlock and reply. + + ceph_assert(g_conf()->mds_kill_rename_at != 7); + + // reply + respond_to_request(mdr, 0); + + if (need_eval) + mds->locker->eval(in, CEPH_CAP_LOCKS, true); + + // clean up? + // respond_to_request() drops locks. So stray reintegration can race with us. + if (straydn && !straydn->get_projected_linkage()->is_null()) { + mdcache->notify_stray(straydn); + } +} + + + +// helpers + +bool Server::_rename_prepare_witness(MDRequestRef& mdr, mds_rank_t who, set<mds_rank_t> &witnesse, + vector<CDentry*>& srctrace, vector<CDentry*>& dsttrace, CDentry *straydn) +{ + if (mds->is_cluster_degraded() && + !mds->mdsmap->is_clientreplay_or_active_or_stopping(who)) { + dout(10) << "_rename_prepare_witness mds." << who << " is not active" << dendl; + if (mdr->more()->waiting_on_slave.empty()) + mds->wait_for_active_peer(who, new C_MDS_RetryRequest(mdcache, mdr)); + return false; + } + + dout(10) << "_rename_prepare_witness mds." 
<< who << dendl; + auto req = MMDSSlaveRequest::create(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RENAMEPREP); + + req->srcdnpath = filepath(srctrace.front()->get_dir()->ino()); + for (auto dn : srctrace) + req->srcdnpath.push_dentry(dn->get_name()); + req->destdnpath = filepath(dsttrace.front()->get_dir()->ino()); + for (auto dn : dsttrace) + req->destdnpath.push_dentry(dn->get_name()); + if (straydn) + mdcache->replicate_stray(straydn, who, req->straybl); + + if (mdr->more()->srci_srnode) + encode(*mdr->more()->srci_srnode, req->srci_snapbl); + if (mdr->more()->desti_srnode) + encode(*mdr->more()->desti_srnode, req->desti_snapbl); + + req->srcdn_auth = mdr->more()->srcdn_auth_mds; + + // srcdn auth will verify our current witness list is sufficient + req->witnesses = witnesse; + + req->op_stamp = mdr->get_op_stamp(); + mds->send_message_mds(req, who); + + ceph_assert(mdr->more()->waiting_on_slave.count(who) == 0); + mdr->more()->waiting_on_slave.insert(who); + return true; +} + +version_t Server::_rename_prepare_import(MDRequestRef& mdr, CDentry *srcdn, bufferlist *client_map_bl) +{ + version_t oldpv = mdr->more()->inode_import_v; + + CDentry::linkage_t *srcdnl = srcdn->get_linkage(); + + /* import node */ + auto blp = mdr->more()->inode_import.cbegin(); + + // imported caps + map<client_t,entity_inst_t> client_map; + map<client_t, client_metadata_t> client_metadata_map; + decode(client_map, blp); + decode(client_metadata_map, blp); + prepare_force_open_sessions(client_map, client_metadata_map, + mdr->more()->imported_session_map); + encode(client_map, *client_map_bl, mds->mdsmap->get_up_features()); + encode(client_metadata_map, *client_map_bl); + + list<ScatterLock*> updated_scatterlocks; + mdcache->migrator->decode_import_inode(srcdn, blp, srcdn->authority().first, mdr->ls, + mdr->more()->cap_imports, updated_scatterlocks); + + // hack: force back to !auth and clean, temporarily + srcdnl->get_inode()->state_clear(CInode::STATE_AUTH); + 
srcdnl->get_inode()->mark_clean();
+
+ return oldpv;
+}
+
+// Decide whether a rename must force-journal a directory inode we are not
+// auth for, because auth subtrees of ours live at or beneath its dirfrags
+// (journal replay needs them). With 'empty' set (the rmdir/stray target
+// case), only the frags themselves being auth subtree roots counts;
+// otherwise any auth subtree contained inside a frag forces journaling.
+bool Server::_need_force_journal(CInode *diri, bool empty)
+{
+ std::vector<CDir*> dirs;
+ diri->get_dirfrags(dirs);
+
+ bool force_journal = false;
+ if (empty) {
+ for (const auto& dir : dirs) {
+ if (dir->is_subtree_root() && dir->get_dir_auth().first == mds->get_nodeid()) {
+ dout(10) << " frag " << dir->get_frag() << " is auth subtree dirfrag, will force journal" << dendl;
+ force_journal = true;
+ break;
+ } else
+ dout(20) << " frag " << dir->get_frag() << " is not auth subtree dirfrag" << dendl;
+ }
+ } else {
+ // see if any children of our frags are auth subtrees.
+ std::vector<CDir*> subtrees;
+ mdcache->get_subtrees(subtrees);
+ dout(10) << " subtrees " << subtrees << " frags " << dirs << dendl;
+ for (const auto& dir : dirs) {
+ for (const auto& subtree : subtrees) {
+ if (dir->contains(subtree)) {
+ if (subtree->get_dir_auth().first == mds->get_nodeid()) {
+ dout(10) << " frag " << dir->get_frag() << " contains (maybe) auth subtree, will force journal "
+ << *subtree << dendl;
+ force_journal = true;
+ break;
+ } else
+ dout(20) << " frag " << dir->get_frag() << " contains but isn't auth for " << *subtree << dendl;
+ } else
+ dout(20) << " frag " << dir->get_frag() << " does not contain " << *subtree << dendl;
+ }
+ if (force_journal)
+ break;
+ }
+ }
+ return force_journal;
+}
+
+// Build the EMetaBlob for a rename (master or slave side): project the new
+// dentry linkages, dirty the affected inodes/dentries, and journal them.
+void Server::_rename_prepare(MDRequestRef& mdr,
+ EMetaBlob *metablob, bufferlist *client_map_bl,
+ CDentry *srcdn, CDentry *destdn, CDentry *straydn)
+{
+ dout(10) << "_rename_prepare " << *mdr << " " << *srcdn << " " << *destdn << dendl;
+ if (straydn)
+ dout(10) << " straydn " << *straydn << dendl;
+
+ CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
+ CDentry::linkage_t *destdnl = destdn->get_projected_linkage();
+ CInode *srci = srcdnl->get_inode();
+ CInode *oldin = destdnl->get_inode();
+
+ // primary+remote link merge?
+ bool linkmerge = (srci == oldin); + if (linkmerge) + ceph_assert(srcdnl->is_primary() && destdnl->is_remote()); + bool silent = srcdn->get_dir()->inode->is_stray(); + + bool force_journal_dest = false; + if (srci->is_dir() && !destdn->is_auth()) { + if (srci->is_auth()) { + // if we are auth for srci and exporting it, force journal because journal replay needs + // the source inode to create auth subtrees. + dout(10) << " we are exporting srci, will force journal destdn" << dendl; + force_journal_dest = true; + } else + force_journal_dest = _need_force_journal(srci, false); + } + + bool force_journal_stray = false; + if (oldin && oldin->is_dir() && straydn && !straydn->is_auth()) + force_journal_stray = _need_force_journal(oldin, true); + + if (linkmerge) + dout(10) << " merging remote and primary links to the same inode" << dendl; + if (silent) + dout(10) << " reintegrating stray; will avoid changing nlink or dir mtime" << dendl; + if (force_journal_dest) + dout(10) << " forcing journal destdn because we (will) have auth subtrees nested beneath it" << dendl; + if (force_journal_stray) + dout(10) << " forcing journal straydn because we (will) have auth subtrees nested beneath it" << dendl; + + if (srci->is_dir() && (destdn->is_auth() || force_journal_dest)) { + dout(10) << " noting renamed dir ino " << srci->ino() << " in metablob" << dendl; + metablob->renamed_dirino = srci->ino(); + } else if (oldin && oldin->is_dir() && force_journal_stray) { + dout(10) << " noting rename target dir " << oldin->ino() << " in metablob" << dendl; + metablob->renamed_dirino = oldin->ino(); + } + + // prepare + CInode::mempool_inode *spi = 0; // renamed inode + CInode::mempool_inode *tpi = 0; // target/overwritten inode + + // target inode + if (!linkmerge) { + if (destdnl->is_primary()) { + ceph_assert(straydn); // moving to straydn. + // link--, and move. 
+ if (destdn->is_auth()) { + auto &pi= oldin->project_inode(); //project_snaprealm + pi.inode.version = straydn->pre_dirty(pi.inode.version); + pi.inode.update_backtrace(); + tpi = &pi.inode; + } + straydn->push_projected_linkage(oldin); + } else if (destdnl->is_remote()) { + // nlink-- targeti + if (oldin->is_auth()) { + auto &pi = oldin->project_inode(); + pi.inode.version = oldin->pre_dirty(); + tpi = &pi.inode; + } + } + } + + // dest + if (srcdnl->is_remote()) { + if (!linkmerge) { + // destdn + if (destdn->is_auth()) + mdr->more()->pvmap[destdn] = destdn->pre_dirty(); + destdn->push_projected_linkage(srcdnl->get_remote_ino(), srcdnl->get_remote_d_type()); + // srci + if (srci->is_auth()) { + auto &pi = srci->project_inode(); + pi.inode.version = srci->pre_dirty(); + spi = &pi.inode; + } + } else { + dout(10) << " will merge remote onto primary link" << dendl; + if (destdn->is_auth()) { + auto &pi = oldin->project_inode(); + pi.inode.version = mdr->more()->pvmap[destdn] = destdn->pre_dirty(oldin->inode.version); + spi = &pi.inode; + } + } + } else { // primary + if (destdn->is_auth()) { + version_t oldpv; + if (srcdn->is_auth()) + oldpv = srci->get_projected_version(); + else { + oldpv = _rename_prepare_import(mdr, srcdn, client_map_bl); + + // note which dirfrags have child subtrees in the journal + // event, so that we can open those (as bounds) during replay. 
+ if (srci->is_dir()) { + list<CDir*> ls; + srci->get_dirfrags(ls); + for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) { + CDir *dir = *p; + if (!dir->is_auth()) + metablob->renamed_dir_frags.push_back(dir->get_frag()); + } + dout(10) << " noting renamed dir open frags " << metablob->renamed_dir_frags << dendl; + } + } + auto &pi = srci->project_inode(); // project snaprealm if srcdnl->is_primary + // & srcdnl->snaprealm + pi.inode.version = mdr->more()->pvmap[destdn] = destdn->pre_dirty(oldpv); + pi.inode.update_backtrace(); + spi = &pi.inode; + } + destdn->push_projected_linkage(srci); + } + + // src + if (srcdn->is_auth()) + mdr->more()->pvmap[srcdn] = srcdn->pre_dirty(); + srcdn->push_projected_linkage(); // push null linkage + + if (!silent) { + if (spi) { + spi->ctime = mdr->get_op_stamp(); + if (mdr->get_op_stamp() > spi->rstat.rctime) + spi->rstat.rctime = mdr->get_op_stamp(); + spi->change_attr++; + if (linkmerge) + spi->nlink--; + } + if (tpi) { + tpi->ctime = mdr->get_op_stamp(); + if (mdr->get_op_stamp() > tpi->rstat.rctime) + tpi->rstat.rctime = mdr->get_op_stamp(); + tpi->change_attr++; + { + std::string t; + destdn->make_path_string(t, true); + tpi->stray_prior_path = std::move(t); + } + tpi->nlink--; + if (tpi->nlink == 0) + oldin->state_set(CInode::STATE_ORPHAN); + } + } + + // prepare nesting, mtime updates + int predirty_dir = silent ? 0:PREDIRTY_DIR; + + // guarantee stray dir is processed first during journal replay. unlink the old inode, + // then link the source inode to destdn + if (destdnl->is_primary()) { + ceph_assert(straydn); + if (straydn->is_auth()) { + metablob->add_dir_context(straydn->get_dir()); + metablob->add_dir(straydn->get_dir(), true); + } + } + + // sub off target + if (destdn->is_auth() && !destdnl->is_null()) { + mdcache->predirty_journal_parents(mdr, metablob, oldin, destdn->get_dir(), + (destdnl->is_primary() ? 
PREDIRTY_PRIMARY:0)|predirty_dir, -1); + if (destdnl->is_primary()) { + ceph_assert(straydn); + mdcache->predirty_journal_parents(mdr, metablob, oldin, straydn->get_dir(), + PREDIRTY_PRIMARY|PREDIRTY_DIR, 1); + } + } + + // move srcdn + int predirty_primary = (srcdnl->is_primary() && srcdn->get_dir() != destdn->get_dir()) ? PREDIRTY_PRIMARY:0; + int flags = predirty_dir | predirty_primary; + if (srcdn->is_auth()) + mdcache->predirty_journal_parents(mdr, metablob, srci, srcdn->get_dir(), PREDIRTY_SHALLOW|flags, -1); + if (destdn->is_auth()) + mdcache->predirty_journal_parents(mdr, metablob, srci, destdn->get_dir(), flags, 1); + + // add it all to the metablob + // target inode + if (!linkmerge) { + if (destdnl->is_primary()) { + ceph_assert(straydn); + if (destdn->is_auth()) { + // project snaprealm, too + if (auto& desti_srnode = mdr->more()->desti_srnode) { + oldin->project_snaprealm(desti_srnode); + if (tpi->nlink == 0) + ceph_assert(!desti_srnode->is_parent_global()); + desti_srnode = NULL; + } + straydn->first = mdcache->get_global_snaprealm()->get_newest_seq() + 1; + metablob->add_primary_dentry(straydn, oldin, true, true); + } else if (force_journal_stray) { + dout(10) << " forced journaling straydn " << *straydn << dendl; + metablob->add_dir_context(straydn->get_dir()); + metablob->add_primary_dentry(straydn, oldin, true); + } + } else if (destdnl->is_remote()) { + if (oldin->is_auth()) { + sr_t *new_srnode = NULL; + if (mdr->slave_request) { + if (mdr->slave_request->desti_snapbl.length() > 0) { + new_srnode = new sr_t(); + auto p = mdr->slave_request->desti_snapbl.cbegin(); + decode(*new_srnode, p); + } + } else if (auto& desti_srnode = mdr->more()->desti_srnode) { + new_srnode = desti_srnode; + desti_srnode = NULL; + } + if (new_srnode) { + oldin->project_snaprealm(new_srnode); + if (tpi->nlink == 0) + ceph_assert(!new_srnode->is_parent_global()); + } + // auth for targeti + metablob->add_dir_context(oldin->get_projected_parent_dir()); + 
mdcache->journal_cow_dentry(mdr.get(), metablob, oldin->get_projected_parent_dn(), + CEPH_NOSNAP, 0, destdnl); + metablob->add_primary_dentry(oldin->get_projected_parent_dn(), oldin, true); + } + } + } + + // dest + if (srcdnl->is_remote()) { + ceph_assert(!linkmerge); + if (destdn->is_auth() && !destdnl->is_null()) + mdcache->journal_cow_dentry(mdr.get(), metablob, destdn, CEPH_NOSNAP, 0, destdnl); + else + destdn->first = mdcache->get_global_snaprealm()->get_newest_seq() + 1; + + if (destdn->is_auth()) + metablob->add_remote_dentry(destdn, true, srcdnl->get_remote_ino(), srcdnl->get_remote_d_type()); + + if (srci->is_auth() ) { // it's remote + if (mdr->slave_request) { + if (mdr->slave_request->srci_snapbl.length() > 0) { + sr_t *new_srnode = new sr_t(); + auto p = mdr->slave_request->srci_snapbl.cbegin(); + decode(*new_srnode, p); + srci->project_snaprealm(new_srnode); + } + } else if (auto& srci_srnode = mdr->more()->srci_srnode) { + srci->project_snaprealm(srci_srnode); + srci_srnode = NULL; + } + + CDentry *srci_pdn = srci->get_projected_parent_dn(); + metablob->add_dir_context(srci_pdn->get_dir()); + mdcache->journal_cow_dentry(mdr.get(), metablob, srci_pdn, CEPH_NOSNAP, 0, srcdnl); + metablob->add_primary_dentry(srci_pdn, srci, true); + } + } else if (srcdnl->is_primary()) { + // project snap parent update? 
+ if (destdn->is_auth()) { + if (auto& srci_srnode = mdr->more()->srci_srnode) { + srci->project_snaprealm(srci_srnode); + srci_srnode = NULL; + } + } + + if (destdn->is_auth() && !destdnl->is_null()) + mdcache->journal_cow_dentry(mdr.get(), metablob, destdn, CEPH_NOSNAP, 0, destdnl); + + destdn->first = mdcache->get_global_snaprealm()->get_newest_seq() + 1; + + if (destdn->is_auth()) + metablob->add_primary_dentry(destdn, srci, true, true); + else if (force_journal_dest) { + dout(10) << " forced journaling destdn " << *destdn << dendl; + metablob->add_dir_context(destdn->get_dir()); + metablob->add_primary_dentry(destdn, srci, true); + if (srcdn->is_auth() && srci->is_dir()) { + // journal new subtrees root dirfrags + list<CDir*> ls; + srci->get_dirfrags(ls); + for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) { + CDir *dir = *p; + if (dir->is_auth()) + metablob->add_dir(dir, true); + } + } + } + } + + // src + if (srcdn->is_auth()) { + dout(10) << " journaling srcdn " << *srcdn << dendl; + mdcache->journal_cow_dentry(mdr.get(), metablob, srcdn, CEPH_NOSNAP, 0, srcdnl); + // also journal the inode in case we need do slave rename rollback. It is Ok to add + // both primary and NULL dentries. Because during journal replay, null dentry is + // processed after primary dentry. 
    // (tail of Server::_rename_prepare, srcdn-auth journaling branch)
    // Journal the renamed inode itself (not only the null dentry) when a
    // slave may need it for rename rollback; replay handles the primary
    // dentry before the null dentry, so recording both is safe.
    if (srcdnl->is_primary() && !srci->is_dir() && !destdn->is_auth())
      metablob->add_primary_dentry(srcdn, srci, true);
    metablob->add_null_dentry(srcdn, true);
  } else
    dout(10) << " NOT journaling srcdn " << *srcdn << dendl;

  // make renamed inode first track the dn
  if (srcdnl->is_primary() && destdn->is_auth()) {
    ceph_assert(srci->first <= destdn->first);
    srci->first = destdn->first;
  }
  // make stray inode first track the straydn
  if (straydn && straydn->is_auth()) {
    ceph_assert(oldin->first <= straydn->first);
    oldin->first = straydn->first;
  }

  // A directory target moves under the stray dir; a directory source moves
  // under destdn's dir -- record both prospective subtree moves.
  if (oldin && oldin->is_dir()) {
    ceph_assert(straydn);
    mdcache->project_subtree_rename(oldin, destdn->get_dir(), straydn->get_dir());
  }
  if (srci->is_dir())
    mdcache->project_subtree_rename(srci, srcdn->get_dir(), destdn->get_dir());

}


/*
 * Apply a prepared rename: pop the projected linkages/inodes set up by
 * _rename_prepare() and make the rename visible in the cache.
 *
 * srcdn   - source dentry (ends up null)
 * destdn  - destination dentry (ends up linking the renamed inode)
 * straydn - stray dentry for an overwritten primary target, else NULL
 *
 * Covers both directions of a primary/remote "link merge" (source and
 * target resolving to the same inode), snaprealm pops/splits, and -- on
 * the destdn-auth side of a cross-MDS rename -- completion of the inode
 * import (caps, xlocks, auth bit).
 */
void Server::_rename_apply(MDRequestRef& mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn)
{
  dout(10) << "_rename_apply " << *mdr << " " << *srcdn << " " << *destdn << dendl;
  dout(10) << " pvs " << mdr->more()->pvmap << dendl;

  CDentry::linkage_t *srcdnl = srcdn->get_linkage();
  CDentry::linkage_t *destdnl = destdn->get_linkage();

  CInode *oldin = destdnl->get_inode();

  // primary+remote link merge?
  bool linkmerge = (srcdnl->get_inode() == oldin);
  if (linkmerge)
    ceph_assert(srcdnl->is_primary() || destdnl->is_remote());

  bool new_in_snaprealm = false;
  bool new_oldin_snaprealm = false;

  // target inode
  if (!linkmerge) {
    if (destdnl->is_primary()) {
      // overwritten primary target: it is relinked into the stray directory
      ceph_assert(straydn);
      dout(10) << "straydn is " << *straydn << dendl;

      // if there is newly created snaprealm, need to split old snaprealm's
      // inodes_with_caps. So pop snaprealm before linkage changes.
      if (destdn->is_auth()) {
        bool hadrealm = (oldin->snaprealm ? true : false);
        oldin->early_pop_projected_snaprealm();
        new_oldin_snaprealm = (oldin->snaprealm && !hadrealm);
      } else {
        // non-auth: auth MDS shipped the snap blob in the slave request
        ceph_assert(mdr->slave_request);
        if (mdr->slave_request->desti_snapbl.length()) {
          new_oldin_snaprealm = !oldin->snaprealm;
          oldin->decode_snap_blob(mdr->slave_request->desti_snapbl);
          ceph_assert(oldin->snaprealm);
          ceph_assert(oldin->snaprealm->have_past_parents_open());
        }
      }

      destdn->get_dir()->unlink_inode(destdn, false);

      straydn->pop_projected_linkage();
      if (mdr->is_slave() && !mdr->more()->slave_update_journaled)
        ceph_assert(!straydn->is_projected()); // no other projected

      // nlink-- targeti
      if (destdn->is_auth())
        oldin->pop_and_dirty_projected_inode(mdr->ls);

      mdcache->touch_dentry_bottom(straydn);  // drop dn as quickly as possible.
    } else if (destdnl->is_remote()) {
      destdn->get_dir()->unlink_inode(destdn, false);
      if (oldin->is_auth()) {
        oldin->pop_and_dirty_projected_inode(mdr->ls);
      } else if (mdr->slave_request) {
        if (mdr->slave_request->desti_snapbl.length() > 0) {
          ceph_assert(oldin->snaprealm);
          oldin->decode_snap_blob(mdr->slave_request->desti_snapbl);
        }
      } else if (auto& desti_srnode = mdr->more()->desti_srnode) {
        // projected snaprealm was never consumed; free it
        delete desti_srnode;
        desti_srnode = NULL;
      }
    }
  }

  // unlink src before we relink it at dest
  CInode *in = srcdnl->get_inode();
  ceph_assert(in);

  bool srcdn_was_remote = srcdnl->is_remote();
  if (!srcdn_was_remote) {
    // if there is newly created snaprealm, need to split old snaprealm's
    // inodes_with_caps. So pop snaprealm before linkage changes.
    if (destdn->is_auth()) {
      bool hadrealm = (in->snaprealm ? true : false);
      in->early_pop_projected_snaprealm();
      new_in_snaprealm = (in->snaprealm && !hadrealm);
    } else {
      // non-auth: decode the snap blob the auth MDS sent, if any
      ceph_assert(mdr->slave_request);
      if (mdr->slave_request->srci_snapbl.length()) {
        new_in_snaprealm = !in->snaprealm;
        in->decode_snap_blob(mdr->slave_request->srci_snapbl);
        ceph_assert(in->snaprealm);
        ceph_assert(in->snaprealm->have_past_parents_open());
      }
    }
  }

  srcdn->get_dir()->unlink_inode(srcdn);

  // dest
  if (srcdn_was_remote) {
    if (!linkmerge) {
      // destdn
      destdnl = destdn->pop_projected_linkage();
      if (mdr->is_slave() && !mdr->more()->slave_update_journaled)
        ceph_assert(!destdn->is_projected()); // no other projected

      destdn->link_remote(destdnl, in);
      if (destdn->is_auth())
        destdn->mark_dirty(mdr->more()->pvmap[destdn], mdr->ls);
      // in
      if (in->is_auth()) {
        in->pop_and_dirty_projected_inode(mdr->ls);
      } else if (mdr->slave_request) {
        if (mdr->slave_request->srci_snapbl.length() > 0) {
          ceph_assert(in->snaprealm);
          in->decode_snap_blob(mdr->slave_request->srci_snapbl);
        }
      } else if (auto& srci_srnode = mdr->more()->srci_srnode) {
        // projected snaprealm was never consumed; free it
        delete srci_srnode;
        srci_srnode = NULL;
      }
    } else {
      dout(10) << "merging remote onto primary link" << dendl;
      oldin->pop_and_dirty_projected_inode(mdr->ls);
    }
  } else { // primary
    if (linkmerge) {
      dout(10) << "merging primary onto remote link" << dendl;
      destdn->get_dir()->unlink_inode(destdn, false);
    }
    destdnl = destdn->pop_projected_linkage();
    if (mdr->is_slave() && !mdr->more()->slave_update_journaled)
      ceph_assert(!destdn->is_projected()); // no other projected

    // srcdn inode import?
    if (!srcdn->is_auth() && destdn->is_auth()) {
      // we just became auth for this inode: finish the cross-MDS import
      ceph_assert(mdr->more()->inode_import.length() > 0);

      map<client_t,Capability::Import> imported_caps;

      // finish cap imports
      finish_force_open_sessions(mdr->more()->imported_session_map);
      if (mdr->more()->cap_imports.count(destdnl->get_inode())) {
        mdcache->migrator->finish_import_inode_caps(destdnl->get_inode(),
                                                    mdr->more()->srcdn_auth_mds, true,
                                                    mdr->more()->imported_session_map,
                                                    mdr->more()->cap_imports[destdnl->get_inode()],
                                                    imported_caps);
      }

      mdr->more()->inode_import.clear();
      encode(imported_caps, mdr->more()->inode_import);

      /* hack: add an auth pin for each xlock we hold. These were
       * remote xlocks previously but now they're local and
       * we're going to try and unpin when we xlock_finish. */
      for (auto i = mdr->locks.lower_bound(&destdnl->get_inode()->versionlock);
           i != mdr->locks.end();
           ++i) {
        SimpleLock *lock = i->lock;
        if (lock->get_parent() != destdnl->get_inode())
          break;
        if (i->is_xlock() && !lock->is_locallock())
          mds->locker->xlock_import(lock);
      }

      // hack: fix auth bit
      in->state_set(CInode::STATE_AUTH);

      mdr->clear_ambiguous_auth();
    }

    if (destdn->is_auth())
      in->pop_and_dirty_projected_inode(mdr->ls);
  }

  // src
  if (srcdn->is_auth())
    srcdn->mark_dirty(mdr->more()->pvmap[srcdn], mdr->ls);
  srcdn->pop_projected_linkage();
  if (mdr->is_slave() && !mdr->more()->slave_update_journaled)
    ceph_assert(!srcdn->is_projected()); // no other projected

  // apply remaining projected inodes (nested)
  mdr->apply();

  // update subtree map?
  if (destdnl->is_primary() && in->is_dir())
    mdcache->adjust_subtree_after_rename(in, srcdn->get_dir(), true);

  if (straydn && oldin->is_dir())
    mdcache->adjust_subtree_after_rename(oldin, destdn->get_dir(), true);

  // notify clients whose snap context changed because a realm appeared
  if (new_oldin_snaprealm)
    mdcache->do_realm_invalidate_and_update_notify(oldin, CEPH_SNAP_OP_SPLIT, false);
  if (new_in_snaprealm)
    mdcache->do_realm_invalidate_and_update_notify(in, CEPH_SNAP_OP_SPLIT, true);

  // removing a new dn?
  if (srcdn->is_auth())
    srcdn->get_dir()->try_remove_unlinked_dn(srcdn);
}



// ------------
// SLAVE

// Log context: after the slave's prepare event is journaled, continue with
// _logged_slave_rename().
class C_MDS_SlaveRenamePrep : public ServerLogContext {
  CDentry *srcdn, *destdn, *straydn;
public:
  C_MDS_SlaveRenamePrep(Server *s, MDRequestRef& m, CDentry *sr, CDentry *de, CDentry *st) :
    ServerLogContext(s, m), srcdn(sr), destdn(de), straydn(st) {}
  void finish(int r) override {
    server->_logged_slave_rename(mdr, srcdn, destdn, straydn);
  }
};

// Context invoked when the master tells us to commit (r == 0) or abort the
// prepared slave rename; dispatches to _commit_slave_rename().
class C_MDS_SlaveRenameCommit : public ServerContext {
  MDRequestRef mdr;
  CDentry *srcdn, *destdn, *straydn;
public:
  C_MDS_SlaveRenameCommit(Server *s, MDRequestRef& m, CDentry *sr, CDentry *de, CDentry *st) :
    ServerContext(s), mdr(m), srcdn(sr), destdn(de), straydn(st) {}
  void finish(int r) override {
    server->_commit_slave_rename(mdr, r, srcdn, destdn, straydn);
  }
};

// Context fired once all client sessions affected by the inode export have
// been flushed; resumes handle_slave_rename_prep() processing.
class C_MDS_SlaveRenameSessionsFlushed : public ServerContext {
  MDRequestRef mdr;
public:
  C_MDS_SlaveRenameSessionsFlushed(Server *s, MDRequestRef& r) :
    ServerContext(s), mdr(r) {}
  void finish(int r) override {
    server->_slave_rename_sessions_flushed(mdr);
  }
};

/*
 * Handle an OP_RENAMEPREP slave request from the master MDS: discover the
 * source/destination dentries, optionally freeze and mark the source inode
 * ambiguous-auth (when we are srcdn auth), verify the witness list, record
 * rollback state, and journal an ESlaveUpdate prepare event.
 */
void Server::handle_slave_rename_prep(MDRequestRef& mdr)
{
  dout(10) << "handle_slave_rename_prep " << *mdr
           << " " << mdr->slave_request->srcdnpath
           << " to " << mdr->slave_request->destdnpath
           << dendl;

  if (mdr->slave_request->is_interrupted()) {
    // master no longer wants this prep; acknowledge with a no-op reply
    dout(10) << " slave request interrupted, sending noop reply" << dendl;
    auto reply = MMDSSlaveRequest::create(mdr->reqid,
                                          mdr->attempt, MMDSSlaveRequest::OP_RENAMEPREPACK);
    reply->mark_interrupted();
    mds->send_message_mds(reply, mdr->slave_to_mds);
    mdr->reset_slave_request();
    return;
  }

  // discover destdn
  filepath destpath(mdr->slave_request->destdnpath);
  dout(10) << " dest " << destpath << dendl;
  vector<CDentry*> trace;
  CF_MDS_MDRContextFactory cf(mdcache, mdr);
  int r = mdcache->path_traverse(mdr, cf, destpath, &trace, NULL, MDS_TRAVERSE_DISCOVERXLOCK);
  if (r > 0) return;  // traversal in progress; we will be re-called
  if (r == -ESTALE) {
    // our idea of the dest ino's location is stale; go find its auth
    mdcache->find_ino_peers(destpath.get_ino(), new C_MDS_RetryRequest(mdcache, mdr),
                            mdr->slave_to_mds);
    return;
  }
  ceph_assert(r == 0);  // we shouldn't get an error here!

  CDentry *destdn = trace.back();
  CDentry::linkage_t *destdnl = destdn->get_projected_linkage();
  dout(10) << " destdn " << *destdn << dendl;
  mdr->pin(destdn);

  // discover srcdn
  filepath srcpath(mdr->slave_request->srcdnpath);
  dout(10) << " src " << srcpath << dendl;
  CInode *srci = nullptr;
  r = mdcache->path_traverse(mdr, cf, srcpath, &trace, &srci, MDS_TRAVERSE_DISCOVERXLOCK);
  if (r > 0) return;
  ceph_assert(r == 0);

  // srcpath must not point to a null dentry
  ceph_assert(srci != nullptr);

  CDentry *srcdn = trace.back();
  CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
  dout(10) << " srcdn " << *srcdn << dendl;
  mdr->pin(srcdn);
  mdr->pin(srci);

  // stray?
  bool linkmerge = srcdnl->get_inode() == destdnl->get_inode();
  if (linkmerge)
    ceph_assert(srcdnl->is_primary() && destdnl->is_remote());
  CDentry *straydn = mdr->straydn;
  if (destdnl->is_primary() && !linkmerge)
    ceph_assert(straydn);

  mdr->set_op_stamp(mdr->slave_request->op_stamp);
  mdr->more()->srcdn_auth_mds = srcdn->authority().first;

  // set up commit waiter (early, to clean up any freezing etc we do)
  if (!mdr->more()->slave_commit)
    mdr->more()->slave_commit = new C_MDS_SlaveRenameCommit(this, mdr, srcdn, destdn, straydn);

  // am i srcdn auth?
  if (srcdn->is_auth()) {
    set<mds_rank_t> srcdnrep;
    srcdn->list_replicas(srcdnrep);

    bool reply_witness = false;
    if (srcdnl->is_primary() && !srcdnl->get_inode()->state_test(CInode::STATE_AMBIGUOUSAUTH)) {
      // freeze?
      // we need this to
      //  - avoid conflicting lock state changes
      //  - avoid concurrent updates to the inode
      //     (this could also be accomplished with the versionlock)
      int allowance = 3; // 1 for the mdr auth_pin, 1 for the link lock, 1 for the snap lock
      dout(10) << " freezing srci " << *srcdnl->get_inode() << " with allowance " << allowance << dendl;
      bool frozen_inode = srcdnl->get_inode()->freeze_inode(allowance);

      // unfreeze auth pin after freezing the inode to avoid queueing waiters
      if (srcdnl->get_inode()->is_frozen_auth_pin())
        mdr->unfreeze_auth_pin();

      if (!frozen_inode) {
        // outstanding auth pins; retry once the inode finishes freezing
        srcdnl->get_inode()->add_waiter(CInode::WAIT_FROZEN, new C_MDS_RetryRequest(mdcache, mdr));
        return;
      }

      /*
       * set ambiguous auth for srci
       * NOTE: we don't worry about ambiguous cache expire as we do
       * with subtree migrations because all slaves will pin
       * srcdn->get_inode() for duration of this rename.
       */
      mdr->set_ambiguous_auth(srcdnl->get_inode());

      // just mark the source inode as ambiguous auth if more than two MDS are involved.
      // the master will send another OP_RENAMEPREP slave request later.
      if (mdr->slave_request->witnesses.size() > 1) {
        dout(10) << " set srci ambiguous auth; providing srcdn replica list" << dendl;
        reply_witness = true;
      }

      // make sure bystanders have received all lock related messages
      for (set<mds_rank_t>::iterator p = srcdnrep.begin(); p != srcdnrep.end(); ++p) {
        if (*p == mdr->slave_to_mds ||
            (mds->is_cluster_degraded() &&
             !mds->mdsmap->is_clientreplay_or_active_or_stopping(*p)))
          continue;
        auto notify = MMDSSlaveRequest::create(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RENAMENOTIFY);
        mds->send_message_mds(notify, *p);
        mdr->more()->waiting_on_slave.insert(*p);
      }

      // make sure clients have received all cap related messages
      set<client_t> export_client_set;
      mdcache->migrator->get_export_client_set(srcdnl->get_inode(), export_client_set);

      MDSGatherBuilder gather(g_ceph_context);
      flush_client_sessions(export_client_set, gather);
      if (gather.has_subs()) {
        // MDS_RANK_NONE stands in for "session flush" in the wait set
        mdr->more()->waiting_on_slave.insert(MDS_RANK_NONE);
        gather.set_finisher(new C_MDS_SlaveRenameSessionsFlushed(this, mdr));
        gather.activate();
      }
    }

    // is witness list sufficient?
    for (set<mds_rank_t>::iterator p = srcdnrep.begin(); p != srcdnrep.end(); ++p) {
      if (*p == mdr->slave_to_mds ||
          mdr->slave_request->witnesses.count(*p)) continue;
      dout(10) << " witness list insufficient; providing srcdn replica list" << dendl;
      reply_witness = true;
      break;
    }

    if (reply_witness) {
      // hand the master our replica list so it can recruit more witnesses
      ceph_assert(!srcdnrep.empty());
      auto reply = MMDSSlaveRequest::create(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RENAMEPREPACK);
      reply->witnesses.swap(srcdnrep);
      mds->send_message_mds(reply, mdr->slave_to_mds);
      mdr->reset_slave_request();
      return;
    }
    dout(10) << " witness list sufficient: includes all srcdn replicas" << dendl;
    if (!mdr->more()->waiting_on_slave.empty()) {
      dout(10) << " still waiting for rename notify acks from "
               << mdr->more()->waiting_on_slave << dendl;
      return;
    }
  } else if (srcdnl->is_primary() && srcdn->authority() != destdn->authority()) {
    // set ambiguous auth for srci on witnesses
    mdr->set_ambiguous_auth(srcdnl->get_inode());
  }

  // encode everything we'd need to roll this back... basically, just the original state.
  rename_rollback rollback;

  rollback.reqid = mdr->reqid;

  rollback.orig_src.dirfrag = srcdn->get_dir()->dirfrag();
  rollback.orig_src.dirfrag_old_mtime = srcdn->get_dir()->get_projected_fnode()->fragstat.mtime;
  rollback.orig_src.dirfrag_old_rctime = srcdn->get_dir()->get_projected_fnode()->rstat.rctime;
  rollback.orig_src.dname = srcdn->get_name();
  if (srcdnl->is_primary())
    rollback.orig_src.ino = srcdnl->get_inode()->ino();
  else {
    ceph_assert(srcdnl->is_remote());
    rollback.orig_src.remote_ino = srcdnl->get_remote_ino();
    rollback.orig_src.remote_d_type = srcdnl->get_remote_d_type();
  }

  rollback.orig_dest.dirfrag = destdn->get_dir()->dirfrag();
  rollback.orig_dest.dirfrag_old_mtime = destdn->get_dir()->get_projected_fnode()->fragstat.mtime;
  rollback.orig_dest.dirfrag_old_rctime = destdn->get_dir()->get_projected_fnode()->rstat.rctime;
  rollback.orig_dest.dname = destdn->get_name();
  if (destdnl->is_primary())
    rollback.orig_dest.ino = destdnl->get_inode()->ino();
  else if (destdnl->is_remote()) {
    rollback.orig_dest.remote_ino = destdnl->get_remote_ino();
    rollback.orig_dest.remote_d_type = destdnl->get_remote_d_type();
  }

  if (straydn) {
    rollback.stray.dirfrag = straydn->get_dir()->dirfrag();
    rollback.stray.dirfrag_old_mtime = straydn->get_dir()->get_projected_fnode()->fragstat.mtime;
    rollback.stray.dirfrag_old_rctime = straydn->get_dir()->get_projected_fnode()->rstat.rctime;
    rollback.stray.dname = straydn->get_name();
  }
  // snapshot the current snaprealm blobs (prefixed with a presence bool)
  // so rollback can restore them
  if (mdr->slave_request->desti_snapbl.length()) {
    CInode *oldin = destdnl->get_inode();
    if (oldin->snaprealm) {
      encode(true, rollback.desti_snapbl);
      oldin->encode_snap_blob(rollback.desti_snapbl);
    } else {
      encode(false, rollback.desti_snapbl);
    }
  }
  if (mdr->slave_request->srci_snapbl.length()) {
    if (srci->snaprealm) {
      encode(true, rollback.srci_snapbl);
      srci->encode_snap_blob(rollback.srci_snapbl);
    } else {
      encode(false, rollback.srci_snapbl);
    }
  }
  encode(rollback,
         mdr->more()->rollback_bl);
  // FIXME: rollback snaprealm
  dout(20) << " rollback is " << mdr->more()->rollback_bl.length() << " bytes" << dendl;

  // journal.
  mdr->ls = mdlog->get_current_segment();
  ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rename_prep", mdr->reqid, mdr->slave_to_mds,
                                      ESlaveUpdate::OP_PREPARE, ESlaveUpdate::RENAME);
  mdlog->start_entry(le);
  le->rollback = mdr->more()->rollback_bl;

  bufferlist blah;   // inode import data... obviously not used if we're the slave
  _rename_prepare(mdr, &le->commit, &blah, srcdn, destdn, straydn);

  if (le->commit.empty()) {
    // nothing to journal on this rank; proceed straight to the ack path
    dout(10) << " empty metablob, skipping journal" << dendl;
    mdlog->cancel_entry(le);
    mdr->ls = NULL;
    _logged_slave_rename(mdr, srcdn, destdn, straydn);
  } else {
    mdcache->add_uncommitted_slave(mdr->reqid, mdr->ls, mdr->slave_to_mds);
    mdr->more()->slave_update_journaled = true;
    submit_mdlog_entry(le, new C_MDS_SlaveRenamePrep(this, mdr, srcdn, destdn, straydn),
                       mdr, __func__);
    mdlog->flush();
  }
}

/*
 * Continuation after the slave's prepare event is journaled (or skipped):
 * if we are exporting the source inode, encode its caps/state into the
 * OP_RENAMEPREPACK reply, apply the rename locally, then ack the master.
 */
void Server::_logged_slave_rename(MDRequestRef& mdr,
                                  CDentry *srcdn, CDentry *destdn, CDentry *straydn)
{
  dout(10) << "_logged_slave_rename " << *mdr << dendl;

  // prepare ack
  MMDSSlaveRequest::ref reply;
  if (!mdr->aborted) {
    reply = MMDSSlaveRequest::create(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RENAMEPREPACK);
    if (!mdr->more()->slave_update_journaled)
      reply->mark_not_journaled();
  }

  CDentry::linkage_t *srcdnl = srcdn->get_linkage();
  //CDentry::linkage_t *straydnl = straydn ? straydn->get_linkage() : 0;

  // export srci?
  if (srcdn->is_auth() && srcdnl->is_primary()) {
    // set export bounds for CInode::encode_export()
    if (reply) {
      list<CDir*> bounds;
      if (srcdnl->get_inode()->is_dir()) {
        srcdnl->get_inode()->get_dirfrags(bounds);
        for (list<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p)
          (*p)->state_set(CDir::STATE_EXPORTBOUND);
      }

      map<client_t,entity_inst_t> exported_client_map;
      map<client_t, client_metadata_t> exported_client_metadata_map;
      bufferlist inodebl;
      mdcache->migrator->encode_export_inode(srcdnl->get_inode(), inodebl,
                                             exported_client_map,
                                             exported_client_metadata_map);

      // bounds were only needed for encode_export_inode(); clear them again
      for (list<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p)
        (*p)->state_clear(CDir::STATE_EXPORTBOUND);

      encode(exported_client_map, reply->inode_export, mds->mdsmap->get_up_features());
      encode(exported_client_metadata_map, reply->inode_export);
      reply->inode_export.claim_append(inodebl);
      reply->inode_export_v = srcdnl->get_inode()->inode.version;
    }

    // remove mdr auth pin
    mdr->auth_unpin(srcdnl->get_inode());
    mdr->more()->is_inode_exporter = true;

    // the importer is now responsible for persisting the inode
    if (srcdnl->get_inode()->is_dirty())
      srcdnl->get_inode()->mark_clean();

    dout(10) << " exported srci " << *srcdnl->get_inode() << dendl;
  }

  // apply
  _rename_apply(mdr, srcdn, destdn, straydn);

  CDentry::linkage_t *destdnl = destdn->get_linkage();

  // bump popularity
  mds->balancer->hit_dir(srcdn->get_dir(), META_POP_IWR);
  if (destdnl->get_inode() && destdnl->get_inode()->is_auth())
    mds->balancer->hit_inode(destdnl->get_inode(), META_POP_IWR);

  // done.
  mdr->reset_slave_request();
  mdr->straydn = 0;

  if (reply) {
    mds->send_message_mds(reply, mdr->slave_to_mds);
  } else {
    // master interrupted us; there is nothing left to ack
    ceph_assert(mdr->aborted);
    dout(10) << " abort flag set, finishing" << dendl;
    mdcache->request_finish(mdr);
  }
}

/*
 * Commit (r == 0) or abort (r != 0) a prepared slave rename.  On commit:
 * finish the inode export (xlocks, caps, unfreeze), clear ambiguous auth,
 * and journal an ESlaveUpdate commit if the prepare was journaled.  On
 * abort: replay the saved rollback blob via do_rename_rollback(), or just
 * unwind freeze/ambiguous-auth state if no rollback was recorded.
 */
void Server::_commit_slave_rename(MDRequestRef& mdr, int r,
                                  CDentry *srcdn, CDentry *destdn, CDentry *straydn)
{
  dout(10) << "_commit_slave_rename " << *mdr << " r=" << r << dendl;

  CInode *in = destdn->get_linkage()->get_inode();

  inodeno_t migrated_stray;
  if (srcdn->is_auth() && srcdn->get_dir()->inode->is_stray())
    migrated_stray = in->ino();

  MDSContext::vec finished;
  if (r == 0) {
    // unfreeze+singleauth inode
    //  hmm, do i really need to delay this?
    if (mdr->more()->is_inode_exporter) {
      // drop our pins
      // we exported, clear out any xlocks that we moved to another MDS

      for (auto i = mdr->locks.lower_bound(&in->versionlock);
           i != mdr->locks.end(); ) {
        SimpleLock *lock = i->lock;
        if (lock->get_parent() != in)
          break;
        // we only care about xlocks on the exported inode
        if (i->is_xlock() && !lock->is_locallock())
          mds->locker->xlock_export(i++, mdr.get());
        else
          ++i;
      }

      map<client_t,Capability::Import> peer_imported;
      auto bp = mdr->more()->inode_import.cbegin();
      decode(peer_imported, bp);

      dout(10) << " finishing inode export on " << *in << dendl;
      mdcache->migrator->finish_export_inode(in, mdr->slave_to_mds, peer_imported, finished);
      mds->queue_waiters(finished);   // this includes SINGLEAUTH waiters.

      // unfreeze
      ceph_assert(in->is_frozen_inode());
      in->unfreeze_inode(finished);
    }

    // singleauth
    if (mdr->more()->is_ambiguous_auth) {
      mdr->more()->rename_inode->clear_ambiguous_auth(finished);
      mdr->more()->is_ambiguous_auth = false;
    }

    if (straydn && mdr->more()->slave_update_journaled) {
      CInode *strayin = straydn->get_projected_linkage()->get_inode();
      if (strayin && !strayin->snaprealm)
        mdcache->clear_dirty_bits_for_stray(strayin);
    }

    mds->queue_waiters(finished);
    mdr->cleanup();

    if (mdr->more()->slave_update_journaled) {
      // write a commit to the journal
      ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rename_commit", mdr->reqid,
                                          mdr->slave_to_mds, ESlaveUpdate::OP_COMMIT,
                                          ESlaveUpdate::RENAME);
      mdlog->start_entry(le);
      submit_mdlog_entry(le, new C_MDS_CommittedSlave(this, mdr), mdr, __func__);
      mdlog->flush();
    } else {
      _committed_slave(mdr);
    }
  } else {

    // abort
    //  rollback_bl may be empty if we froze the inode but had to provide an expanded
    // witness list from the master, and they failed before we tried prep again.
+ if (mdr->more()->rollback_bl.length()) { + if (mdr->more()->is_inode_exporter) { + dout(10) << " reversing inode export of " << *in << dendl; + in->abort_export(); + } + if (mdcache->is_ambiguous_slave_update(mdr->reqid, mdr->slave_to_mds)) { + mdcache->remove_ambiguous_slave_update(mdr->reqid, mdr->slave_to_mds); + // rollback but preserve the slave request + do_rename_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr, false); + mdr->more()->rollback_bl.clear(); + } else + do_rename_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr, true); + } else { + dout(10) << " rollback_bl empty, not rollback back rename (master failed after getting extra witnesses?)" << dendl; + // singleauth + if (mdr->more()->is_ambiguous_auth) { + if (srcdn->is_auth()) + mdr->more()->rename_inode->unfreeze_inode(finished); + + mdr->more()->rename_inode->clear_ambiguous_auth(finished); + mdr->more()->is_ambiguous_auth = false; + } + mds->queue_waiters(finished); + mdcache->request_finish(mdr); + } + } + + if (migrated_stray && mds->is_stopping()) + mdcache->shutdown_export_stray_finish(migrated_stray); +} + +void _rollback_repair_dir(MutationRef& mut, CDir *dir, rename_rollback::drec &r, utime_t ctime, + bool isdir, int linkunlink, nest_info_t &rstat) +{ + fnode_t *pf; + pf = dir->project_fnode(); + mut->add_projected_fnode(dir); + pf->version = dir->pre_dirty(); + + if (isdir) { + pf->fragstat.nsubdirs += linkunlink; + } else { + pf->fragstat.nfiles += linkunlink; + } + if (r.ino) { + pf->rstat.rbytes += linkunlink * rstat.rbytes; + pf->rstat.rfiles += linkunlink * rstat.rfiles; + pf->rstat.rsubdirs += linkunlink * rstat.rsubdirs; + pf->rstat.rsnaps += linkunlink * rstat.rsnaps; + } + if (pf->fragstat.mtime == ctime) { + pf->fragstat.mtime = r.dirfrag_old_mtime; + if (pf->rstat.rctime == ctime) + pf->rstat.rctime = r.dirfrag_old_rctime; + } + mut->add_updated_lock(&dir->get_inode()->filelock); + mut->add_updated_lock(&dir->get_inode()->nestlock); +} + +struct 
C_MDS_LoggedRenameRollback : public ServerLogContext { + MutationRef mut; + CDentry *srcdn; + version_t srcdnpv; + CDentry *destdn; + CDentry *straydn; + map<client_t,MClientSnap::ref> splits[2]; + bool finish_mdr; + C_MDS_LoggedRenameRollback(Server *s, MutationRef& m, MDRequestRef& r, + CDentry *sd, version_t pv, CDentry *dd, CDentry *st, + map<client_t,MClientSnap::ref> _splits[2], bool f) : + ServerLogContext(s, r), mut(m), srcdn(sd), srcdnpv(pv), destdn(dd), + straydn(st), finish_mdr(f) { + splits[0].swap(_splits[0]); + splits[1].swap(_splits[1]); + } + void finish(int r) override { + server->_rename_rollback_finish(mut, mdr, srcdn, srcdnpv, + destdn, straydn, splits, finish_mdr); + } +}; + +void Server::do_rename_rollback(bufferlist &rbl, mds_rank_t master, MDRequestRef& mdr, + bool finish_mdr) +{ + rename_rollback rollback; + auto p = rbl.cbegin(); + decode(rollback, p); + + dout(10) << "do_rename_rollback on " << rollback.reqid << dendl; + // need to finish this update before sending resolve to claim the subtree + mdcache->add_rollback(rollback.reqid, master); + + MutationRef mut(new MutationImpl(nullptr, utime_t(), rollback.reqid)); + mut->ls = mds->mdlog->get_current_segment(); + + CDentry *srcdn = NULL; + CDir *srcdir = mdcache->get_dirfrag(rollback.orig_src.dirfrag); + if (!srcdir) + srcdir = mdcache->get_dirfrag(rollback.orig_src.dirfrag.ino, rollback.orig_src.dname); + if (srcdir) { + dout(10) << " srcdir " << *srcdir << dendl; + srcdn = srcdir->lookup(rollback.orig_src.dname); + if (srcdn) { + dout(10) << " srcdn " << *srcdn << dendl; + ceph_assert(srcdn->get_linkage()->is_null()); + } else + dout(10) << " srcdn not found" << dendl; + } else + dout(10) << " srcdir not found" << dendl; + + CDentry *destdn = NULL; + CDir *destdir = mdcache->get_dirfrag(rollback.orig_dest.dirfrag); + if (!destdir) + destdir = mdcache->get_dirfrag(rollback.orig_dest.dirfrag.ino, rollback.orig_dest.dname); + if (destdir) { + dout(10) << " destdir " << *destdir << dendl; + 
destdn = destdir->lookup(rollback.orig_dest.dname); + if (destdn) + dout(10) << " destdn " << *destdn << dendl; + else + dout(10) << " destdn not found" << dendl; + } else + dout(10) << " destdir not found" << dendl; + + CInode *in = NULL; + if (rollback.orig_src.ino) { + in = mdcache->get_inode(rollback.orig_src.ino); + if (in && in->is_dir()) + ceph_assert(srcdn && destdn); + } else + in = mdcache->get_inode(rollback.orig_src.remote_ino); + + CDir *straydir = NULL; + CDentry *straydn = NULL; + if (rollback.stray.dirfrag.ino) { + straydir = mdcache->get_dirfrag(rollback.stray.dirfrag); + if (straydir) { + dout(10) << "straydir " << *straydir << dendl; + straydn = straydir->lookup(rollback.stray.dname); + if (straydn) { + dout(10) << " straydn " << *straydn << dendl; + ceph_assert(straydn->get_linkage()->is_primary()); + } else + dout(10) << " straydn not found" << dendl; + } else + dout(10) << "straydir not found" << dendl; + } + + CInode *target = NULL; + if (rollback.orig_dest.ino) { + target = mdcache->get_inode(rollback.orig_dest.ino); + if (target) + ceph_assert(destdn && straydn); + } else if (rollback.orig_dest.remote_ino) + target = mdcache->get_inode(rollback.orig_dest.remote_ino); + + // can't use is_auth() in the resolve stage + mds_rank_t whoami = mds->get_nodeid(); + // slave + ceph_assert(!destdn || destdn->authority().first != whoami); + ceph_assert(!straydn || straydn->authority().first != whoami); + + bool force_journal_src = false; + bool force_journal_dest = false; + if (in && in->is_dir() && srcdn->authority().first != whoami) + force_journal_src = _need_force_journal(in, false); + if (in && target && target->is_dir()) + force_journal_dest = _need_force_journal(in, true); + + version_t srcdnpv = 0; + // repair src + if (srcdn) { + if (srcdn->authority().first == whoami) + srcdnpv = srcdn->pre_dirty(); + if (rollback.orig_src.ino) { + ceph_assert(in); + srcdn->push_projected_linkage(in); + } else + 
srcdn->push_projected_linkage(rollback.orig_src.remote_ino, + rollback.orig_src.remote_d_type); + } + + map<client_t,MClientSnap::ref> splits[2]; + + CInode::mempool_inode *pip = nullptr; + if (in) { + bool projected; + if (in->get_projected_parent_dn()->authority().first == whoami) { + auto &pi = in->project_inode(); + pip = &pi.inode; + mut->add_projected_inode(in); + pip->version = in->pre_dirty(); + projected = true; + } else { + pip = in->get_projected_inode(); + projected = false; + } + if (pip->ctime == rollback.ctime) + pip->ctime = rollback.orig_src.old_ctime; + + if (rollback.srci_snapbl.length() && in->snaprealm) { + bool hadrealm; + auto p = rollback.srci_snapbl.cbegin(); + decode(hadrealm, p); + if (hadrealm) { + if (projected && !mds->is_resolve()) { + sr_t *new_srnode = new sr_t(); + decode(*new_srnode, p); + in->project_snaprealm(new_srnode); + } else + decode(in->snaprealm->srnode, p); + } else { + SnapRealm *realm; + if (rollback.orig_src.ino) { + ceph_assert(srcdir); + realm = srcdir->get_inode()->find_snaprealm(); + } else { + realm = in->snaprealm->parent; + } + if (!mds->is_resolve()) + mdcache->prepare_realm_merge(in->snaprealm, realm, splits[0]); + if (projected) + in->project_snaprealm(NULL); + else + in->snaprealm->merge_to(realm); + } + } + } + + if (srcdn && srcdn->authority().first == whoami) { + nest_info_t blah; + _rollback_repair_dir(mut, srcdir, rollback.orig_src, rollback.ctime, + in ? in->is_dir() : false, 1, pip ? 
pip->accounted_rstat : blah); + } + + // repair dest + if (destdn) { + if (rollback.orig_dest.ino && target) { + destdn->push_projected_linkage(target); + } else if (rollback.orig_dest.remote_ino) { + destdn->push_projected_linkage(rollback.orig_dest.remote_ino, + rollback.orig_dest.remote_d_type); + } else { + // the dentry will be trimmed soon, it's ok to have wrong linkage + if (rollback.orig_dest.ino) + ceph_assert(mds->is_resolve()); + destdn->push_projected_linkage(); + } + } + + if (straydn) + straydn->push_projected_linkage(); + + if (target) { + bool projected; + CInode::mempool_inode *ti = nullptr; + if (target->get_projected_parent_dn()->authority().first == whoami) { + auto &pi = target->project_inode(); + ti = &pi.inode; + mut->add_projected_inode(target); + ti->version = target->pre_dirty(); + projected = true; + } else { + ti = target->get_projected_inode(); + projected = false; + } + if (ti->ctime == rollback.ctime) + ti->ctime = rollback.orig_dest.old_ctime; + if (MDS_INO_IS_STRAY(rollback.orig_src.dirfrag.ino)) { + if (MDS_INO_IS_STRAY(rollback.orig_dest.dirfrag.ino)) + ceph_assert(!rollback.orig_dest.ino && !rollback.orig_dest.remote_ino); + else + ceph_assert(rollback.orig_dest.remote_ino && + rollback.orig_dest.remote_ino == rollback.orig_src.ino); + } else + ti->nlink++; + + if (rollback.desti_snapbl.length() && target->snaprealm) { + bool hadrealm; + auto p = rollback.desti_snapbl.cbegin(); + decode(hadrealm, p); + if (hadrealm) { + if (projected && !mds->is_resolve()) { + sr_t *new_srnode = new sr_t(); + decode(*new_srnode, p); + target->project_snaprealm(new_srnode); + } else + decode(target->snaprealm->srnode, p); + } else { + SnapRealm *realm; + if (rollback.orig_dest.ino) { + ceph_assert(destdir); + realm = destdir->get_inode()->find_snaprealm(); + } else { + realm = target->snaprealm->parent; + } + if (!mds->is_resolve()) + mdcache->prepare_realm_merge(target->snaprealm, realm, splits[1]); + if (projected) + 
target->project_snaprealm(NULL); + else + target->snaprealm->merge_to(realm); + } + } + } + + if (srcdn) + dout(0) << " srcdn back to " << *srcdn << dendl; + if (in) + dout(0) << " srci back to " << *in << dendl; + if (destdn) + dout(0) << " destdn back to " << *destdn << dendl; + if (target) + dout(0) << " desti back to " << *target << dendl; + + // journal it + ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rename_rollback", rollback.reqid, master, + ESlaveUpdate::OP_ROLLBACK, ESlaveUpdate::RENAME); + mdlog->start_entry(le); + + if (srcdn && (srcdn->authority().first == whoami || force_journal_src)) { + le->commit.add_dir_context(srcdir); + if (rollback.orig_src.ino) + le->commit.add_primary_dentry(srcdn, 0, true); + else + le->commit.add_remote_dentry(srcdn, true); + } + + if (!rollback.orig_src.ino && // remote linkage + in && in->authority().first == whoami) { + le->commit.add_dir_context(in->get_projected_parent_dir()); + le->commit.add_primary_dentry(in->get_projected_parent_dn(), in, true); + } + + if (force_journal_dest) { + ceph_assert(rollback.orig_dest.ino); + le->commit.add_dir_context(destdir); + le->commit.add_primary_dentry(destdn, 0, true); + } + + // slave: no need to journal straydn + + if (target && target != in && target->authority().first == whoami) { + ceph_assert(rollback.orig_dest.remote_ino); + le->commit.add_dir_context(target->get_projected_parent_dir()); + le->commit.add_primary_dentry(target->get_projected_parent_dn(), target, true); + } + + if (in && in->is_dir() && (srcdn->authority().first == whoami || force_journal_src)) { + dout(10) << " noting renamed dir ino " << in->ino() << " in metablob" << dendl; + le->commit.renamed_dirino = in->ino(); + if (srcdn->authority().first == whoami) { + list<CDir*> ls; + in->get_dirfrags(ls); + for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) { + CDir *dir = *p; + if (!dir->is_auth()) + le->commit.renamed_dir_frags.push_back(dir->get_frag()); + } + dout(10) << " noting renamed 
dir open frags " << le->commit.renamed_dir_frags << dendl; + } + } else if (force_journal_dest) { + dout(10) << " noting rename target ino " << target->ino() << " in metablob" << dendl; + le->commit.renamed_dirino = target->ino(); + } + + if (target && target->is_dir()) { + ceph_assert(destdn); + mdcache->project_subtree_rename(target, straydir, destdir); + } + + if (in && in->is_dir()) { + ceph_assert(srcdn); + mdcache->project_subtree_rename(in, destdir, srcdir); + } + + if (mdr && !mdr->more()->slave_update_journaled) { + ceph_assert(le->commit.empty()); + mdlog->cancel_entry(le); + mut->ls = NULL; + _rename_rollback_finish(mut, mdr, srcdn, srcdnpv, destdn, straydn, splits, finish_mdr); + } else { + ceph_assert(!le->commit.empty()); + if (mdr) + mdr->more()->slave_update_journaled = false; + MDSLogContextBase *fin = new C_MDS_LoggedRenameRollback(this, mut, mdr, + srcdn, srcdnpv, destdn, straydn, + splits, finish_mdr); + submit_mdlog_entry(le, fin, mdr, __func__); + mdlog->flush(); + } +} + +void Server::_rename_rollback_finish(MutationRef& mut, MDRequestRef& mdr, CDentry *srcdn, + version_t srcdnpv, CDentry *destdn, CDentry *straydn, + map<client_t,MClientSnap::ref> splits[2], bool finish_mdr) +{ + dout(10) << "_rename_rollback_finish " << mut->reqid << dendl; + + if (straydn) { + straydn->get_dir()->unlink_inode(straydn); + straydn->pop_projected_linkage(); + } + if (destdn) { + destdn->get_dir()->unlink_inode(destdn); + destdn->pop_projected_linkage(); + } + if (srcdn) { + srcdn->pop_projected_linkage(); + if (srcdn->authority().first == mds->get_nodeid()) { + srcdn->mark_dirty(srcdnpv, mut->ls); + if (srcdn->get_linkage()->is_primary()) + srcdn->get_linkage()->get_inode()->state_set(CInode::STATE_AUTH); + } + } + + mut->apply(); + + if (srcdn && srcdn->get_linkage()->is_primary()) { + CInode *in = srcdn->get_linkage()->get_inode(); + if (in && in->is_dir()) { + ceph_assert(destdn); + mdcache->adjust_subtree_after_rename(in, destdn->get_dir(), true); + } + } 
+ + if (destdn) { + CInode *oldin = destdn->get_linkage()->get_inode(); + // update subtree map? + if (oldin && oldin->is_dir()) { + ceph_assert(straydn); + mdcache->adjust_subtree_after_rename(oldin, straydn->get_dir(), true); + } + } + + if (mds->is_resolve()) { + CDir *root = NULL; + if (straydn) + root = mdcache->get_subtree_root(straydn->get_dir()); + else if (destdn) + root = mdcache->get_subtree_root(destdn->get_dir()); + if (root) + mdcache->try_trim_non_auth_subtree(root); + } else { + mdcache->send_snaps(splits[1]); + mdcache->send_snaps(splits[0]); + } + + if (mdr) { + MDSContext::vec finished; + if (mdr->more()->is_ambiguous_auth) { + if (srcdn->is_auth()) + mdr->more()->rename_inode->unfreeze_inode(finished); + + mdr->more()->rename_inode->clear_ambiguous_auth(finished); + mdr->more()->is_ambiguous_auth = false; + } + mds->queue_waiters(finished); + if (finish_mdr || mdr->aborted) + mdcache->request_finish(mdr); + else + mdr->more()->slave_rolling_back = false; + } + + mdcache->finish_rollback(mut->reqid, mdr); + + mut->cleanup(); +} + +void Server::handle_slave_rename_prep_ack(MDRequestRef& mdr, const MMDSSlaveRequest::const_ref &ack) +{ + dout(10) << "handle_slave_rename_prep_ack " << *mdr + << " witnessed by " << ack->get_source() + << " " << *ack << dendl; + mds_rank_t from = mds_rank_t(ack->get_source().num()); + + // note slave + mdr->more()->slaves.insert(from); + if (mdr->more()->srcdn_auth_mds == from && + mdr->more()->is_remote_frozen_authpin && + !mdr->more()->is_ambiguous_auth) { + mdr->set_ambiguous_auth(mdr->more()->rename_inode); + } + + // witnessed? or add extra witnesses? 
+ ceph_assert(mdr->more()->witnessed.count(from) == 0); + if (ack->is_interrupted()) { + dout(10) << " slave request interrupted, noop" << dendl; + } else if (ack->witnesses.empty()) { + mdr->more()->witnessed.insert(from); + if (!ack->is_not_journaled()) + mdr->more()->has_journaled_slaves = true; + } else { + dout(10) << " extra witnesses (srcdn replicas) are " << ack->witnesses << dendl; + mdr->more()->extra_witnesses = ack->witnesses; + mdr->more()->extra_witnesses.erase(mds->get_nodeid()); // not me! + } + + // srci import? + if (ack->inode_export.length()) { + dout(10) << " got srci import" << dendl; + mdr->more()->inode_import.share(ack->inode_export); + mdr->more()->inode_import_v = ack->inode_export_v; + } + + // remove from waiting list + ceph_assert(mdr->more()->waiting_on_slave.count(from)); + mdr->more()->waiting_on_slave.erase(from); + + if (mdr->more()->waiting_on_slave.empty()) + dispatch_client_request(mdr); // go again! + else + dout(10) << "still waiting on slaves " << mdr->more()->waiting_on_slave << dendl; +} + +void Server::handle_slave_rename_notify_ack(MDRequestRef& mdr, const MMDSSlaveRequest::const_ref &ack) +{ + dout(10) << "handle_slave_rename_notify_ack " << *mdr << " from mds." 
+ << ack->get_source() << dendl; + ceph_assert(mdr->is_slave()); + mds_rank_t from = mds_rank_t(ack->get_source().num()); + + if (mdr->more()->waiting_on_slave.count(from)) { + mdr->more()->waiting_on_slave.erase(from); + + if (mdr->more()->waiting_on_slave.empty()) { + if (mdr->slave_request) + dispatch_slave_request(mdr); + } else + dout(10) << " still waiting for rename notify acks from " + << mdr->more()->waiting_on_slave << dendl; + } +} + +void Server::_slave_rename_sessions_flushed(MDRequestRef& mdr) +{ + dout(10) << "_slave_rename_sessions_flushed " << *mdr << dendl; + + if (mdr->more()->waiting_on_slave.count(MDS_RANK_NONE)) { + mdr->more()->waiting_on_slave.erase(MDS_RANK_NONE); + + if (mdr->more()->waiting_on_slave.empty()) { + if (mdr->slave_request) + dispatch_slave_request(mdr); + } else + dout(10) << " still waiting for rename notify acks from " + << mdr->more()->waiting_on_slave << dendl; + } +} + +// snaps +/* This function takes responsibility for the passed mdr*/ +void Server::handle_client_lssnap(MDRequestRef& mdr) +{ + const MClientRequest::const_ref &req = mdr->client_request; + + // traverse to path + CInode *diri = mdcache->get_inode(req->get_filepath().get_ino()); + if (!diri || diri->state_test(CInode::STATE_PURGING)) { + respond_to_request(mdr, -ESTALE); + return; + } + if (!diri->is_auth()) { + mdcache->request_forward(mdr, diri->authority().first); + return; + } + if (!diri->is_dir()) { + respond_to_request(mdr, -ENOTDIR); + return; + } + dout(10) << "lssnap on " << *diri << dendl; + + // lock snap + MutationImpl::LockOpVec lov; + mds->locker->include_snap_rdlocks(diri, lov); + if (!mds->locker->acquire_locks(mdr, lov)) + return; + + if (!check_access(mdr, diri, MAY_READ)) + return; + + SnapRealm *realm = diri->find_snaprealm(); + map<snapid_t,const SnapInfo*> infomap; + realm->get_snap_info(infomap, diri->get_oldest_snap()); + + unsigned max_entries = req->head.args.readdir.max_entries; + if (!max_entries) + max_entries = 
infomap.size(); + int max_bytes = req->head.args.readdir.max_bytes; + if (!max_bytes) + // make sure at least one item can be encoded + max_bytes = (512 << 10) + g_conf()->mds_max_xattr_pairs_size; + + __u64 last_snapid = 0; + string offset_str = req->get_path2(); + if (!offset_str.empty()) + last_snapid = realm->resolve_snapname(offset_str, diri->ino()); + + //Empty DirStat + bufferlist dirbl; + static DirStat empty; + CDir::encode_dirstat(dirbl, mdr->session->info, empty); + + max_bytes -= dirbl.length() - sizeof(__u32) + sizeof(__u8) * 2; + + __u32 num = 0; + bufferlist dnbl; + auto p = infomap.upper_bound(last_snapid); + for (; p != infomap.end() && num < max_entries; ++p) { + dout(10) << p->first << " -> " << *p->second << dendl; + + // actual + string snap_name; + if (p->second->ino == diri->ino()) + snap_name = p->second->name; + else + snap_name = p->second->get_long_name(); + + unsigned start_len = dnbl.length(); + if (int(start_len + snap_name.length() + sizeof(__u32) + sizeof(LeaseStat)) > max_bytes) + break; + + encode(snap_name, dnbl); + //infinite lease + LeaseStat e(-1, -1, 0); + mds->locker->encode_lease(dnbl, mdr->session->info, e); + dout(20) << "encode_infinite_lease" << dendl; + + int r = diri->encode_inodestat(dnbl, mdr->session, realm, p->first, max_bytes - (int)dnbl.length()); + if (r < 0) { + bufferlist keep; + keep.substr_of(dnbl, 0, start_len); + dnbl.swap(keep); + break; + } + ++num; + } + + encode(num, dirbl); + __u16 flags = 0; + if (p == infomap.end()) { + flags = CEPH_READDIR_FRAG_END; + if (last_snapid == 0) + flags |= CEPH_READDIR_FRAG_COMPLETE; + } + encode(flags, dirbl); + dirbl.claim_append(dnbl); + + mdr->reply_extra_bl = dirbl; + mdr->tracei = diri; + respond_to_request(mdr, 0); +} + + +// MKSNAP + +struct C_MDS_mksnap_finish : public ServerLogContext { + CInode *diri; + SnapInfo info; + C_MDS_mksnap_finish(Server *s, MDRequestRef& r, CInode *di, SnapInfo &i) : + ServerLogContext(s, r), diri(di), info(i) {} + void finish(int r) 
override { + server->_mksnap_finish(mdr, diri, info); + } +}; + +/* This function takes responsibility for the passed mdr*/ +void Server::handle_client_mksnap(MDRequestRef& mdr) +{ + const MClientRequest::const_ref &req = mdr->client_request; + // make sure we have as new a map as the client + if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) { + mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr)); + return; + } + if (!mds->mdsmap->allows_snaps()) { + // you can't make snapshots until you set an option right now + respond_to_request(mdr, -EPERM); + return; + } + + CInode *diri = mdcache->get_inode(req->get_filepath().get_ino()); + if (!diri || diri->state_test(CInode::STATE_PURGING)) { + respond_to_request(mdr, -ESTALE); + return; + } + + if (!diri->is_auth()) { // fw to auth? + mdcache->request_forward(mdr, diri->authority().first); + return; + } + + // dir only + if (!diri->is_dir()) { + respond_to_request(mdr, -ENOTDIR); + return; + } + if (diri->is_system() && !diri->is_root()) { + // no snaps in system dirs (root is ok) + respond_to_request(mdr, -EPERM); + return; + } + + std::string_view snapname = req->get_filepath().last_dentry(); + + if (mdr->client_request->get_caller_uid() < g_conf()->mds_snap_min_uid || mdr->client_request->get_caller_uid() > g_conf()->mds_snap_max_uid) { + dout(20) << "mksnap " << snapname << " on " << *diri << " denied to uid " << mdr->client_request->get_caller_uid() << dendl; + respond_to_request(mdr, -EPERM); + return; + } + + dout(10) << "mksnap " << snapname << " on " << *diri << dendl; + + // lock snap + MutationImpl::LockOpVec lov; + + mds->locker->include_snap_rdlocks(diri, lov); + lov.erase_rdlock(&diri->snaplock); + lov.add_xlock(&diri->snaplock); + + if (!mds->locker->acquire_locks(mdr, lov)) + return; + + if (!check_access(mdr, diri, MAY_WRITE|MAY_SNAPSHOT)) + return; + + if (inodeno_t subvol_ino = diri->find_snaprealm()->get_subvolume_ino(); + (subvol_ino && subvol_ino != 
diri->ino())) { + respond_to_request(mdr, -EPERM); + return; + } + + // check if we can create any more snapshots + // we don't allow any more if we are already at or beyond the limit + if (diri->snaprealm && + diri->snaprealm->get_snaps().size() >= max_snaps_per_dir) { + respond_to_request(mdr, -EMLINK); + return; + } + + // make sure name is unique + if (diri->snaprealm && + diri->snaprealm->exists(snapname)) { + respond_to_request(mdr, -EEXIST); + return; + } + if (snapname.length() == 0 || + snapname[0] == '_') { + respond_to_request(mdr, -EINVAL); + return; + } + + // allocate a snapid + if (!mdr->more()->stid) { + // prepare an stid + mds->snapclient->prepare_create(diri->ino(), snapname, + mdr->get_mds_stamp(), + &mdr->more()->stid, &mdr->more()->snapidbl, + new C_MDS_RetryRequest(mdcache, mdr)); + return; + } + + version_t stid = mdr->more()->stid; + snapid_t snapid; + auto p = mdr->more()->snapidbl.cbegin(); + decode(snapid, p); + dout(10) << " stid " << stid << " snapid " << snapid << dendl; + + ceph_assert(mds->snapclient->get_cached_version() >= stid); + + // journal + SnapInfo info; + info.ino = diri->ino(); + info.snapid = snapid; + info.name = snapname; + info.stamp = mdr->get_op_stamp(); + + auto &pi = diri->project_inode(false, true); + pi.inode.ctime = info.stamp; + if (info.stamp > pi.inode.rstat.rctime) + pi.inode.rstat.rctime = info.stamp; + pi.inode.rstat.rsnaps++; + pi.inode.version = diri->pre_dirty(); + + // project the snaprealm + auto &newsnap = *pi.snapnode; + newsnap.created = snapid; + auto em = newsnap.snaps.emplace(std::piecewise_construct, std::forward_as_tuple(snapid), std::forward_as_tuple(info)); + if (!em.second) + em.first->second = info; + newsnap.seq = snapid; + newsnap.last_created = snapid; + + // journal the inode changes + mdr->ls = mdlog->get_current_segment(); + EUpdate *le = new EUpdate(mdlog, "mksnap"); + mdlog->start_entry(le); + + le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid()); + 
le->metablob.add_table_transaction(TABLE_SNAP, stid); + mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false); + mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri); + + // journal the snaprealm changes + submit_mdlog_entry(le, new C_MDS_mksnap_finish(this, mdr, diri, info), + mdr, __func__); + mdlog->flush(); +} + +void Server::_mksnap_finish(MDRequestRef& mdr, CInode *diri, SnapInfo &info) +{ + dout(10) << "_mksnap_finish " << *mdr << " " << info << dendl; + + int op = (diri->snaprealm? CEPH_SNAP_OP_CREATE : CEPH_SNAP_OP_SPLIT); + + diri->pop_and_dirty_projected_inode(mdr->ls); + mdr->apply(); + + mds->snapclient->commit(mdr->more()->stid, mdr->ls); + + // create snap + dout(10) << "snaprealm now " << *diri->snaprealm << dendl; + + // notify other mds + mdcache->send_snap_update(diri, mdr->more()->stid, op); + + mdcache->do_realm_invalidate_and_update_notify(diri, op); + + // yay + mdr->in[0] = diri; + mdr->snapid = info.snapid; + mdr->tracei = diri; + respond_to_request(mdr, 0); +} + + +// RMSNAP + +struct C_MDS_rmsnap_finish : public ServerLogContext { + CInode *diri; + snapid_t snapid; + C_MDS_rmsnap_finish(Server *s, MDRequestRef& r, CInode *di, snapid_t sn) : + ServerLogContext(s, r), diri(di), snapid(sn) {} + void finish(int r) override { + server->_rmsnap_finish(mdr, diri, snapid); + } +}; + +/* This function takes responsibility for the passed mdr*/ +void Server::handle_client_rmsnap(MDRequestRef& mdr) +{ + const MClientRequest::const_ref &req = mdr->client_request; + + CInode *diri = mdcache->get_inode(req->get_filepath().get_ino()); + if (!diri || diri->state_test(CInode::STATE_PURGING)) { + respond_to_request(mdr, -ESTALE); + return; + } + if (!diri->is_auth()) { // fw to auth? 
+ mdcache->request_forward(mdr, diri->authority().first); + return; + } + if (!diri->is_dir()) { + respond_to_request(mdr, -ENOTDIR); + return; + } + + std::string_view snapname = req->get_filepath().last_dentry(); + + if (mdr->client_request->get_caller_uid() < g_conf()->mds_snap_min_uid || mdr->client_request->get_caller_uid() > g_conf()->mds_snap_max_uid) { + dout(20) << "rmsnap " << snapname << " on " << *diri << " denied to uid " << mdr->client_request->get_caller_uid() << dendl; + respond_to_request(mdr, -EPERM); + return; + } + + dout(10) << "rmsnap " << snapname << " on " << *diri << dendl; + + // does snap exist? + if (snapname.length() == 0 || snapname[0] == '_') { + respond_to_request(mdr, -EINVAL); // can't prune a parent snap, currently. + return; + } + if (!diri->snaprealm || !diri->snaprealm->exists(snapname)) { + respond_to_request(mdr, -ENOENT); + return; + } + snapid_t snapid = diri->snaprealm->resolve_snapname(snapname, diri->ino()); + dout(10) << " snapname " << snapname << " is " << snapid << dendl; + + MutationImpl::LockOpVec lov; + mds->locker->include_snap_rdlocks(diri, lov); + lov.erase_rdlock(&diri->snaplock); + lov.add_xlock(&diri->snaplock); + + if (!mds->locker->acquire_locks(mdr, lov)) + return; + + if (!check_access(mdr, diri, MAY_WRITE|MAY_SNAPSHOT)) + return; + + // prepare + if (!mdr->more()->stid) { + mds->snapclient->prepare_destroy(diri->ino(), snapid, + &mdr->more()->stid, &mdr->more()->snapidbl, + new C_MDS_RetryRequest(mdcache, mdr)); + return; + } + version_t stid = mdr->more()->stid; + auto p = mdr->more()->snapidbl.cbegin(); + snapid_t seq; + decode(seq, p); + dout(10) << " stid is " << stid << ", seq is " << seq << dendl; + + ceph_assert(mds->snapclient->get_cached_version() >= stid); + + // journal + auto &pi = diri->project_inode(false, true); + pi.inode.version = diri->pre_dirty(); + pi.inode.ctime = mdr->get_op_stamp(); + if (mdr->get_op_stamp() > pi.inode.rstat.rctime) + pi.inode.rstat.rctime = mdr->get_op_stamp(); + 
pi.inode.rstat.rsnaps--; + + mdr->ls = mdlog->get_current_segment(); + EUpdate *le = new EUpdate(mdlog, "rmsnap"); + mdlog->start_entry(le); + + // project the snaprealm + auto &newnode = *pi.snapnode; + newnode.snaps.erase(snapid); + newnode.seq = seq; + newnode.last_destroyed = seq; + + le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid()); + le->metablob.add_table_transaction(TABLE_SNAP, stid); + mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false); + mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri); + + submit_mdlog_entry(le, new C_MDS_rmsnap_finish(this, mdr, diri, snapid), + mdr, __func__); + mdlog->flush(); +} + +void Server::_rmsnap_finish(MDRequestRef& mdr, CInode *diri, snapid_t snapid) +{ + dout(10) << "_rmsnap_finish " << *mdr << " " << snapid << dendl; + snapid_t stid = mdr->more()->stid; + auto p = mdr->more()->snapidbl.cbegin(); + snapid_t seq; + decode(seq, p); + + diri->pop_and_dirty_projected_inode(mdr->ls); + mdr->apply(); + + mds->snapclient->commit(stid, mdr->ls); + + dout(10) << "snaprealm now " << *diri->snaprealm << dendl; + + // notify other mds + mdcache->send_snap_update(diri, mdr->more()->stid, CEPH_SNAP_OP_DESTROY); + + mdcache->do_realm_invalidate_and_update_notify(diri, CEPH_SNAP_OP_DESTROY); + + // yay + mdr->in[0] = diri; + respond_to_request(mdr, 0); + + // purge snapshot data + if (diri->snaprealm->have_past_parents_open()) + diri->purge_stale_snap_data(diri->snaprealm->get_snaps()); +} + +struct C_MDS_renamesnap_finish : public ServerLogContext { + CInode *diri; + snapid_t snapid; + C_MDS_renamesnap_finish(Server *s, MDRequestRef& r, CInode *di, snapid_t sn) : + ServerLogContext(s, r), diri(di), snapid(sn) {} + void finish(int r) override { + server->_renamesnap_finish(mdr, diri, snapid); + } +}; + +/* This function takes responsibility for the passed mdr*/ +void Server::handle_client_renamesnap(MDRequestRef& mdr) +{ + const MClientRequest::const_ref &req = 
mdr->client_request; + if (req->get_filepath().get_ino() != req->get_filepath2().get_ino()) { + respond_to_request(mdr, -EINVAL); + return; + } + + CInode *diri = mdcache->get_inode(req->get_filepath().get_ino()); + if (!diri || diri->state_test(CInode::STATE_PURGING)) { + respond_to_request(mdr, -ESTALE); + return; + } + + if (!diri->is_auth()) { // fw to auth? + mdcache->request_forward(mdr, diri->authority().first); + return; + } + + if (!diri->is_dir()) { // dir only + respond_to_request(mdr, -ENOTDIR); + return; + } + + if (mdr->client_request->get_caller_uid() < g_conf()->mds_snap_min_uid || + mdr->client_request->get_caller_uid() > g_conf()->mds_snap_max_uid) { + respond_to_request(mdr, -EPERM); + return; + } + + std::string_view dstname = req->get_filepath().last_dentry(); + std::string_view srcname = req->get_filepath2().last_dentry(); + dout(10) << "renamesnap " << srcname << "->" << dstname << " on " << *diri << dendl; + + if (srcname.length() == 0 || srcname[0] == '_') { + respond_to_request(mdr, -EINVAL); // can't rename a parent snap. 
+ return; + } + if (!diri->snaprealm || !diri->snaprealm->exists(srcname)) { + respond_to_request(mdr, -ENOENT); + return; + } + if (dstname.length() == 0 || dstname[0] == '_') { + respond_to_request(mdr, -EINVAL); + return; + } + if (diri->snaprealm->exists(dstname)) { + respond_to_request(mdr, -EEXIST); + return; + } + + snapid_t snapid = diri->snaprealm->resolve_snapname(srcname, diri->ino()); + dout(10) << " snapname " << srcname << " is " << snapid << dendl; + + // lock snap + MutationImpl::LockOpVec lov; + + mds->locker->include_snap_rdlocks(diri, lov); + lov.erase_rdlock(&diri->snaplock); + lov.add_xlock(&diri->snaplock); + + if (!mds->locker->acquire_locks(mdr, lov)) + return; + + if (!check_access(mdr, diri, MAY_WRITE|MAY_SNAPSHOT)) + return; + + // prepare + if (!mdr->more()->stid) { + mds->snapclient->prepare_update(diri->ino(), snapid, dstname, utime_t(), + &mdr->more()->stid, + new C_MDS_RetryRequest(mdcache, mdr)); + return; + } + + version_t stid = mdr->more()->stid; + dout(10) << " stid is " << stid << dendl; + + ceph_assert(mds->snapclient->get_cached_version() >= stid); + + // journal + auto &pi = diri->project_inode(false, true); + pi.inode.ctime = mdr->get_op_stamp(); + if (mdr->get_op_stamp() > pi.inode.rstat.rctime) + pi.inode.rstat.rctime = mdr->get_op_stamp(); + pi.inode.version = diri->pre_dirty(); + + // project the snaprealm + auto &newsnap = *pi.snapnode; + auto it = newsnap.snaps.find(snapid); + ceph_assert(it != newsnap.snaps.end()); + it->second.name = dstname; + + // journal the inode changes + mdr->ls = mdlog->get_current_segment(); + EUpdate *le = new EUpdate(mdlog, "renamesnap"); + mdlog->start_entry(le); + + le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid()); + le->metablob.add_table_transaction(TABLE_SNAP, stid); + mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false); + mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri); + + // journal the snaprealm changes + 
submit_mdlog_entry(le, new C_MDS_renamesnap_finish(this, mdr, diri, snapid), + mdr, __func__); + mdlog->flush(); +} + +void Server::_renamesnap_finish(MDRequestRef& mdr, CInode *diri, snapid_t snapid) +{ + dout(10) << "_renamesnap_finish " << *mdr << " " << snapid << dendl; + + diri->pop_and_dirty_projected_inode(mdr->ls); + mdr->apply(); + + mds->snapclient->commit(mdr->more()->stid, mdr->ls); + + dout(10) << "snaprealm now " << *diri->snaprealm << dendl; + + // notify other mds + mdcache->send_snap_update(diri, mdr->more()->stid, CEPH_SNAP_OP_UPDATE); + + mdcache->do_realm_invalidate_and_update_notify(diri, CEPH_SNAP_OP_UPDATE); + + // yay + mdr->in[0] = diri; + mdr->tracei = diri; + mdr->snapid = snapid; + respond_to_request(mdr, 0); +} + +/** + * Return true if server is in state RECONNECT and this + * client has not yet reconnected. + */ +bool Server::waiting_for_reconnect(client_t c) const +{ + return client_reconnect_gather.count(c) > 0; +} + +void Server::dump_reconnect_status(Formatter *f) const +{ + f->open_object_section("reconnect_status"); + f->dump_stream("client_reconnect_gather") << client_reconnect_gather; + f->close_section(); +} diff --git a/src/mds/Server.h b/src/mds/Server.h new file mode 100644 index 00000000..715e8496 --- /dev/null +++ b/src/mds/Server.h @@ -0,0 +1,384 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef CEPH_MDS_SERVER_H +#define CEPH_MDS_SERVER_H + +#include <string_view> + +#include <common/DecayCounter.h> + +#include "messages/MClientReconnect.h" +#include "messages/MClientReply.h" +#include "messages/MClientRequest.h" +#include "messages/MClientSession.h" +#include "messages/MClientSnap.h" +#include "messages/MClientReclaim.h" +#include "messages/MClientReclaimReply.h" +#include "messages/MLock.h" + +#include "MDSRank.h" +#include "Mutation.h" +#include "MDSContext.h" + +class OSDMap; +class PerfCounters; +class LogEvent; +class EMetaBlob; +class EUpdate; +class MDLog; +struct SnapInfo; + +enum { + l_mdss_first = 1000, + l_mdss_dispatch_client_request, + l_mdss_dispatch_slave_request, + l_mdss_handle_client_request, + l_mdss_handle_client_session, + l_mdss_handle_slave_request, + l_mdss_req_create_latency, + l_mdss_req_getattr_latency, + l_mdss_req_getfilelock_latency, + l_mdss_req_link_latency, + l_mdss_req_lookup_latency, + l_mdss_req_lookuphash_latency, + l_mdss_req_lookupino_latency, + l_mdss_req_lookupname_latency, + l_mdss_req_lookupparent_latency, + l_mdss_req_lookupsnap_latency, + l_mdss_req_lssnap_latency, + l_mdss_req_mkdir_latency, + l_mdss_req_mknod_latency, + l_mdss_req_mksnap_latency, + l_mdss_req_open_latency, + l_mdss_req_readdir_latency, + l_mdss_req_rename_latency, + l_mdss_req_renamesnap_latency, + l_mdss_req_rmdir_latency, + l_mdss_req_rmsnap_latency, + l_mdss_req_rmxattr_latency, + l_mdss_req_setattr_latency, + l_mdss_req_setdirlayout_latency, + l_mdss_req_setfilelock_latency, + l_mdss_req_setlayout_latency, + l_mdss_req_setxattr_latency, + l_mdss_req_symlink_latency, + l_mdss_req_unlink_latency, + l_mdss_cap_revoke_eviction, + l_mdss_cap_acquisition_throttle, + l_mdss_last, +}; + +class Server { +public: + using clock = ceph::coarse_mono_clock; + using time = ceph::coarse_mono_time; + +private: + MDSRank *mds; + MDCache *mdcache; + MDLog *mdlog; + PerfCounters *logger; + + // OSDMap full status, used to generate ENOSPC 
on some operations + bool is_full; + + // State for while in reconnect + MDSContext *reconnect_done; + int failed_reconnects; + bool reconnect_evicting; // true if I am waiting for evictions to complete + // before proceeding to reconnect_gather_finish + time reconnect_start = clock::zero(); + time reconnect_last_seen = clock::zero(); + set<client_t> client_reconnect_gather; // clients i need a reconnect msg from. + + feature_bitset_t supported_features; + feature_bitset_t required_client_features; + + bool replay_unsafe_with_closed_session = false; + double cap_revoke_eviction_timeout = 0; + uint64_t max_snaps_per_dir = 100; + + friend class MDSContinuation; + friend class ServerContext; + friend class ServerLogContext; + +public: + bool terminating_sessions; + + explicit Server(MDSRank *m); + ~Server() { + g_ceph_context->get_perfcounters_collection()->remove(logger); + delete logger; + delete reconnect_done; + } + + void create_logger(); + + // message handler + void dispatch(const Message::const_ref &m); + + void handle_osd_map(); + + // -- sessions and recovery -- + bool waiting_for_reconnect(client_t c) const; + void dump_reconnect_status(Formatter *f) const; + + time last_recalled() const { + return last_recall_state; + } + + void handle_client_session(const MClientSession::const_ref &m); + void _session_logged(Session *session, uint64_t state_seq, + bool open, version_t pv, interval_set<inodeno_t>& inos,version_t piv); + version_t prepare_force_open_sessions(map<client_t,entity_inst_t> &cm, + map<client_t,client_metadata_t>& cmm, + map<client_t,pair<Session*,uint64_t> >& smap); + void finish_force_open_sessions(const map<client_t,pair<Session*,uint64_t> >& smap, + bool dec_import=true); + void flush_client_sessions(set<client_t>& client_set, MDSGatherBuilder& gather); + void finish_flush_session(Session *session, version_t seq); + void terminate_sessions(); + void find_idle_sessions(); + void kill_session(Session *session, Context *on_safe); + size_t 
apply_blacklist(const std::set<entity_addr_t> &blacklist); + void journal_close_session(Session *session, int state, Context *on_safe); + + set<client_t> client_reclaim_gather; + size_t get_num_pending_reclaim() const { return client_reclaim_gather.size(); } + Session *find_session_by_uuid(std::string_view uuid); + void reclaim_session(Session *session, const MClientReclaim::const_ref &m); + void finish_reclaim_session(Session *session, const MClientReclaimReply::ref &reply=nullptr); + void handle_client_reclaim(const MClientReclaim::const_ref &m); + + void reconnect_clients(MDSContext *reconnect_done_); + void handle_client_reconnect(const MClientReconnect::const_ref &m); + void infer_supported_features(Session *session, client_metadata_t& client_metadata); + void update_required_client_features(); + + //void process_reconnect_cap(CInode *in, int from, ceph_mds_cap_reconnect& capinfo); + void reconnect_gather_finish(); + void reconnect_tick(); + void recover_filelocks(CInode *in, bufferlist locks, int64_t client); + + enum class RecallFlags : uint64_t { + NONE = 0, + STEADY = (1<<0), + ENFORCE_MAX = (1<<1), + TRIM = (1<<2), + ENFORCE_LIVENESS = (1<<3), + }; + std::pair<bool, uint64_t> recall_client_state(MDSGatherBuilder* gather, RecallFlags=RecallFlags::NONE); + void force_clients_readonly(); + + // -- requests -- + void handle_client_request(const MClientRequest::const_ref &m); + + void journal_and_reply(MDRequestRef& mdr, CInode *tracei, CDentry *tracedn, + LogEvent *le, MDSLogContextBase *fin); + void submit_mdlog_entry(LogEvent *le, MDSLogContextBase *fin, + MDRequestRef& mdr, std::string_view event); + void dispatch_client_request(MDRequestRef& mdr); + void perf_gather_op_latency(const MClientRequest::const_ref &req, utime_t lat); + void early_reply(MDRequestRef& mdr, CInode *tracei, CDentry *tracedn); + void respond_to_request(MDRequestRef& mdr, int r = 0); + void set_trace_dist(Session *session, const MClientReply::ref &reply, CInode *in, CDentry *dn, + 
snapid_t snapid, + int num_dentries_wanted, + MDRequestRef& mdr); + + + void handle_slave_request(const MMDSSlaveRequest::const_ref &m); + void handle_slave_request_reply(const MMDSSlaveRequest::const_ref &m); + void dispatch_slave_request(MDRequestRef& mdr); + void handle_slave_auth_pin(MDRequestRef& mdr); + void handle_slave_auth_pin_ack(MDRequestRef& mdr, const MMDSSlaveRequest::const_ref &ack); + + // some helpers + bool check_fragment_space(MDRequestRef& mdr, CDir *in); + bool check_access(MDRequestRef& mdr, CInode *in, unsigned mask); + bool _check_access(Session *session, CInode *in, unsigned mask, int caller_uid, int caller_gid, int setattr_uid, int setattr_gid); + CDir *validate_dentry_dir(MDRequestRef& mdr, CInode *diri, std::string_view dname); + CDir *traverse_to_auth_dir(MDRequestRef& mdr, vector<CDentry*> &trace, filepath refpath); + CDentry *prepare_null_dentry(MDRequestRef& mdr, CDir *dir, std::string_view dname, bool okexist=false); + CDentry *prepare_stray_dentry(MDRequestRef& mdr, CInode *in); + CInode* prepare_new_inode(MDRequestRef& mdr, CDir *dir, inodeno_t useino, unsigned mode, + file_layout_t *layout=NULL); + void journal_allocated_inos(MDRequestRef& mdr, EMetaBlob *blob); + void apply_allocated_inos(MDRequestRef& mdr, Session *session); + + CInode* rdlock_path_pin_ref(MDRequestRef& mdr, int n, MutationImpl::LockOpVec& lov, + bool want_auth, bool no_want_auth=false, + file_layout_t **layout=nullptr, + bool no_lookup=false); + CDentry* rdlock_path_xlock_dentry(MDRequestRef& mdr, int n, + MutationImpl::LockOpVec& lov, + bool okexist, bool mustexist, bool alwaysxlock, + file_layout_t **layout=nullptr); + + CDir* try_open_auth_dirfrag(CInode *diri, frag_t fg, MDRequestRef& mdr); + + + // requests on existing inodes. 
+ void handle_client_getattr(MDRequestRef& mdr, bool is_lookup); + void handle_client_lookup_ino(MDRequestRef& mdr, + bool want_parent, bool want_dentry); + void _lookup_snap_ino(MDRequestRef& mdr); + void _lookup_ino_2(MDRequestRef& mdr, int r); + void handle_client_readdir(MDRequestRef& mdr); + void handle_client_file_setlock(MDRequestRef& mdr); + void handle_client_file_readlock(MDRequestRef& mdr); + + void handle_client_setattr(MDRequestRef& mdr); + void handle_client_setlayout(MDRequestRef& mdr); + void handle_client_setdirlayout(MDRequestRef& mdr); + + int parse_quota_vxattr(string name, string value, quota_info_t *quota); + void create_quota_realm(CInode *in); + int parse_layout_vxattr(string name, string value, const OSDMap& osdmap, + file_layout_t *layout, bool validate=true); + int check_layout_vxattr(MDRequestRef& mdr, + string name, + string value, + file_layout_t *layout); + void handle_set_vxattr(MDRequestRef& mdr, CInode *cur, + file_layout_t *dir_layout, + MutationImpl::LockOpVec& lov); + void handle_remove_vxattr(MDRequestRef& mdr, CInode *cur, + file_layout_t *dir_layout, + MutationImpl::LockOpVec& lov); + void handle_client_setxattr(MDRequestRef& mdr); + void handle_client_removexattr(MDRequestRef& mdr); + + void handle_client_fsync(MDRequestRef& mdr); + + // open + void handle_client_open(MDRequestRef& mdr); + void handle_client_openc(MDRequestRef& mdr); // O_CREAT variant. + void do_open_truncate(MDRequestRef& mdr, int cmode); // O_TRUNC variant. 
+ + // namespace changes + void handle_client_mknod(MDRequestRef& mdr); + void handle_client_mkdir(MDRequestRef& mdr); + void handle_client_symlink(MDRequestRef& mdr); + + // link + void handle_client_link(MDRequestRef& mdr); + void _link_local(MDRequestRef& mdr, CDentry *dn, CInode *targeti, SnapRealm *target_realm); + void _link_local_finish(MDRequestRef& mdr, CDentry *dn, CInode *targeti, + version_t, version_t, bool); + + void _link_remote(MDRequestRef& mdr, bool inc, CDentry *dn, CInode *targeti); + void _link_remote_finish(MDRequestRef& mdr, bool inc, CDentry *dn, CInode *targeti, + version_t); + + void handle_slave_link_prep(MDRequestRef& mdr); + void _logged_slave_link(MDRequestRef& mdr, CInode *targeti, bool adjust_realm); + void _commit_slave_link(MDRequestRef& mdr, int r, CInode *targeti); + void _committed_slave(MDRequestRef& mdr); // use for rename, too + void handle_slave_link_prep_ack(MDRequestRef& mdr, const MMDSSlaveRequest::const_ref &m); + void do_link_rollback(bufferlist &rbl, mds_rank_t master, MDRequestRef& mdr); + void _link_rollback_finish(MutationRef& mut, MDRequestRef& mdr, + map<client_t,MClientSnap::ref>& split); + + // unlink + void handle_client_unlink(MDRequestRef& mdr); + bool _dir_is_nonempty_unlocked(MDRequestRef& mdr, CInode *rmdiri); + bool _dir_is_nonempty(MDRequestRef& mdr, CInode *rmdiri); + void _unlink_local(MDRequestRef& mdr, CDentry *dn, CDentry *straydn); + void _unlink_local_finish(MDRequestRef& mdr, + CDentry *dn, CDentry *straydn, + version_t); + bool _rmdir_prepare_witness(MDRequestRef& mdr, mds_rank_t who, vector<CDentry*>& trace, CDentry *straydn); + void handle_slave_rmdir_prep(MDRequestRef& mdr); + void _logged_slave_rmdir(MDRequestRef& mdr, CDentry *srcdn, CDentry *straydn); + void _commit_slave_rmdir(MDRequestRef& mdr, int r, CDentry *straydn); + void handle_slave_rmdir_prep_ack(MDRequestRef& mdr, const MMDSSlaveRequest::const_ref &ack); + void do_rmdir_rollback(bufferlist &rbl, mds_rank_t master, MDRequestRef& 
mdr); + void _rmdir_rollback_finish(MDRequestRef& mdr, metareqid_t reqid, CDentry *dn, CDentry *straydn); + + // rename + void handle_client_rename(MDRequestRef& mdr); + void _rename_finish(MDRequestRef& mdr, + CDentry *srcdn, CDentry *destdn, CDentry *straydn); + + void handle_client_lssnap(MDRequestRef& mdr); + void handle_client_mksnap(MDRequestRef& mdr); + void _mksnap_finish(MDRequestRef& mdr, CInode *diri, SnapInfo &info); + void handle_client_rmsnap(MDRequestRef& mdr); + void _rmsnap_finish(MDRequestRef& mdr, CInode *diri, snapid_t snapid); + void handle_client_renamesnap(MDRequestRef& mdr); + void _renamesnap_finish(MDRequestRef& mdr, CInode *diri, snapid_t snapid); + + + // helpers + bool _rename_prepare_witness(MDRequestRef& mdr, mds_rank_t who, set<mds_rank_t> &witnesse, + vector<CDentry*>& srctrace, vector<CDentry*>& dsttrace, CDentry *straydn); + version_t _rename_prepare_import(MDRequestRef& mdr, CDentry *srcdn, bufferlist *client_map_bl); + bool _need_force_journal(CInode *diri, bool empty); + void _rename_prepare(MDRequestRef& mdr, + EMetaBlob *metablob, bufferlist *client_map_bl, + CDentry *srcdn, CDentry *destdn, CDentry *straydn); + /* set not_journaling=true if you're going to discard the results -- + * this bypasses the asserts to make sure we're journaling the right + * things on the right nodes */ + void _rename_apply(MDRequestRef& mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn); + + // slaving + void handle_slave_rename_prep(MDRequestRef& mdr); + void handle_slave_rename_prep_ack(MDRequestRef& mdr, const MMDSSlaveRequest::const_ref &m); + void handle_slave_rename_notify_ack(MDRequestRef& mdr, const MMDSSlaveRequest::const_ref &m); + void _slave_rename_sessions_flushed(MDRequestRef& mdr); + void _logged_slave_rename(MDRequestRef& mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn); + void _commit_slave_rename(MDRequestRef& mdr, int r, CDentry *srcdn, CDentry *destdn, CDentry *straydn); + void do_rename_rollback(bufferlist &rbl, 
mds_rank_t master, MDRequestRef& mdr, bool finish_mdr=false); + void _rename_rollback_finish(MutationRef& mut, MDRequestRef& mdr, CDentry *srcdn, version_t srcdnpv, + CDentry *destdn, CDentry *staydn, map<client_t,MClientSnap::ref> splits[2], + bool finish_mdr); + + void evict_cap_revoke_non_responders(); + void handle_conf_change(const std::set<std::string>& changed); + +private: + void reply_client_request(MDRequestRef& mdr, const MClientReply::ref &reply); + void flush_session(Session *session, MDSGatherBuilder& gather); + + DecayCounter recall_throttle; + time last_recall_state; + + // Cache cap acquisition throttle configs + uint64_t max_caps_per_client; + uint64_t cap_acquisition_throttle; + double max_caps_throttle_ratio; + double caps_throttle_retry_request_timeout; +}; + +static inline constexpr auto operator|(Server::RecallFlags a, Server::RecallFlags b) { + using T = std::underlying_type<Server::RecallFlags>::type; + return static_cast<Server::RecallFlags>(static_cast<T>(a) | static_cast<T>(b)); +} +static inline constexpr auto operator&(Server::RecallFlags a, Server::RecallFlags b) { + using T = std::underlying_type<Server::RecallFlags>::type; + return static_cast<Server::RecallFlags>(static_cast<T>(a) & static_cast<T>(b)); +} +static inline std::ostream& operator<<(std::ostream& os, const Server::RecallFlags& f) { + using T = std::underlying_type<Server::RecallFlags>::type; + return os << "0x" << std::hex << static_cast<T>(f) << std::dec; +} +static inline constexpr bool operator!(const Server::RecallFlags& f) { + using T = std::underlying_type<Server::RecallFlags>::type; + return static_cast<T>(f) == static_cast<T>(0); +} + +#endif diff --git a/src/mds/SessionMap.cc b/src/mds/SessionMap.cc new file mode 100644 index 00000000..56c71de8 --- /dev/null +++ b/src/mds/SessionMap.cc @@ -0,0 +1,1226 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * 
Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "MDSRank.h" +#include "MDCache.h" +#include "Mutation.h" +#include "SessionMap.h" +#include "osdc/Filer.h" +#include "common/Finisher.h" + +#include "common/config.h" +#include "common/errno.h" +#include "common/DecayCounter.h" +#include "include/ceph_assert.h" +#include "include/stringify.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mds +#undef dout_prefix +#define dout_prefix *_dout << "mds." << rank << ".sessionmap " + +namespace { +class SessionMapIOContext : public MDSIOContextBase +{ + protected: + SessionMap *sessionmap; + MDSRank *get_mds() override {return sessionmap->mds;} + public: + explicit SessionMapIOContext(SessionMap *sessionmap_) : sessionmap(sessionmap_) { + ceph_assert(sessionmap != NULL); + } +}; +}; + +void SessionMap::register_perfcounters() +{ + PerfCountersBuilder plb(g_ceph_context, "mds_sessions", + l_mdssm_first, l_mdssm_last); + + plb.add_u64(l_mdssm_session_count, "session_count", + "Session count", "sess", PerfCountersBuilder::PRIO_INTERESTING); + + plb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL); + plb.add_u64_counter(l_mdssm_session_add, "session_add", + "Sessions added"); + plb.add_u64_counter(l_mdssm_session_remove, "session_remove", + "Sessions removed"); + plb.add_u64(l_mdssm_session_open, "sessions_open", + "Sessions currently open"); + plb.add_u64(l_mdssm_session_stale, "sessions_stale", + "Sessions currently stale"); + plb.add_u64(l_mdssm_total_load, "total_load", "Total Load"); + plb.add_u64(l_mdssm_avg_load, "average_load", "Average Load"); + plb.add_u64(l_mdssm_avg_session_uptime, "avg_session_uptime", + "Average session uptime"); + + logger = plb.create_perf_counters(); + 
g_ceph_context->get_perfcounters_collection()->add(logger); +} + +void SessionMap::dump() +{ + dout(10) << "dump" << dendl; + for (ceph::unordered_map<entity_name_t,Session*>::iterator p = session_map.begin(); + p != session_map.end(); + ++p) + dout(10) << p->first << " " << p->second + << " state " << p->second->get_state_name() + << " completed " << p->second->info.completed_requests + << " prealloc_inos " << p->second->info.prealloc_inos + << " used_inos " << p->second->info.used_inos + << dendl; +} + + +// ---------------- +// LOAD + + +object_t SessionMap::get_object_name() const +{ + char s[30]; + snprintf(s, sizeof(s), "mds%d_sessionmap", int(mds->get_nodeid())); + return object_t(s); +} + +namespace { +class C_IO_SM_Load : public SessionMapIOContext { +public: + const bool first; //< Am I the initial (header) load? + int header_r; //< Return value from OMAP header read + int values_r; //< Return value from OMAP value read + bufferlist header_bl; + std::map<std::string, bufferlist> session_vals; + bool more_session_vals = false; + + C_IO_SM_Load(SessionMap *cm, const bool f) + : SessionMapIOContext(cm), first(f), header_r(0), values_r(0) {} + + void finish(int r) override { + sessionmap->_load_finish(r, header_r, values_r, first, header_bl, session_vals, + more_session_vals); + } + void print(ostream& out) const override { + out << "session_load"; + } +}; +} + + +/** + * Decode OMAP header. Call this once when loading. + */ +void SessionMapStore::decode_header( + bufferlist &header_bl) +{ + auto q = header_bl.cbegin(); + DECODE_START(1, q) + decode(version, q); + DECODE_FINISH(q); +} + +void SessionMapStore::encode_header( + bufferlist *header_bl) +{ + ENCODE_START(1, 1, *header_bl); + encode(version, *header_bl); + ENCODE_FINISH(*header_bl); +} + +/** + * Decode and insert some serialized OMAP values. Call this + * repeatedly to insert batched loads. 
+ */ +void SessionMapStore::decode_values(std::map<std::string, bufferlist> &session_vals) +{ + for (std::map<std::string, bufferlist>::iterator i = session_vals.begin(); + i != session_vals.end(); ++i) { + + entity_inst_t inst; + + bool parsed = inst.name.parse(i->first); + if (!parsed) { + derr << "Corrupt entity name '" << i->first << "' in sessionmap" << dendl; + throw buffer::malformed_input("Corrupt entity name in sessionmap"); + } + + Session *s = get_or_add_session(inst); + if (s->is_closed()) { + s->set_state(Session::STATE_OPEN); + s->set_load_avg_decay_rate(decay_rate); + } + auto q = i->second.cbegin(); + s->decode(q); + } +} + +/** + * An OMAP read finished. + */ +void SessionMap::_load_finish( + int operation_r, + int header_r, + int values_r, + bool first, + bufferlist &header_bl, + std::map<std::string, bufferlist> &session_vals, + bool more_session_vals) +{ + if (operation_r < 0) { + derr << "_load_finish got " << cpp_strerror(operation_r) << dendl; + mds->clog->error() << "error reading sessionmap '" << get_object_name() + << "' " << operation_r << " (" + << cpp_strerror(operation_r) << ")"; + mds->damaged(); + ceph_abort(); // Should be unreachable because damaged() calls respawn() + } + + // Decode header + if (first) { + if (header_r != 0) { + derr << __func__ << ": header error: " << cpp_strerror(header_r) << dendl; + mds->clog->error() << "error reading sessionmap header " + << header_r << " (" << cpp_strerror(header_r) << ")"; + mds->damaged(); + ceph_abort(); // Should be unreachable because damaged() calls respawn() + } + + if(header_bl.length() == 0) { + dout(4) << __func__ << ": header missing, loading legacy..." 
<< dendl; + load_legacy(); + return; + } + + try { + decode_header(header_bl); + } catch (buffer::error &e) { + mds->clog->error() << "corrupt sessionmap header: " << e.what(); + mds->damaged(); + ceph_abort(); // Should be unreachable because damaged() calls respawn() + } + dout(10) << __func__ << " loaded version " << version << dendl; + } + + if (values_r != 0) { + derr << __func__ << ": error reading values: " + << cpp_strerror(values_r) << dendl; + mds->clog->error() << "error reading sessionmap values: " + << values_r << " (" << cpp_strerror(values_r) << ")"; + mds->damaged(); + ceph_abort(); // Should be unreachable because damaged() calls respawn() + } + + // Decode session_vals + try { + decode_values(session_vals); + } catch (buffer::error &e) { + mds->clog->error() << "corrupt sessionmap values: " << e.what(); + mds->damaged(); + ceph_abort(); // Should be unreachable because damaged() calls respawn() + } + + if (more_session_vals) { + // Issue another read if we're not at the end of the omap + const std::string last_key = session_vals.rbegin()->first; + dout(10) << __func__ << ": continue omap load from '" + << last_key << "'" << dendl; + object_t oid = get_object_name(); + object_locator_t oloc(mds->mdsmap->get_metadata_pool()); + C_IO_SM_Load *c = new C_IO_SM_Load(this, false); + ObjectOperation op; + op.omap_get_vals(last_key, "", g_conf()->mds_sessionmap_keys_per_op, + &c->session_vals, &c->more_session_vals, &c->values_r); + mds->objecter->read(oid, oloc, op, CEPH_NOSNAP, NULL, 0, + new C_OnFinisher(c, mds->finisher)); + } else { + // I/O is complete. 
Update `by_state` + dout(10) << __func__ << ": omap load complete" << dendl; + for (ceph::unordered_map<entity_name_t, Session*>::iterator i = session_map.begin(); + i != session_map.end(); ++i) { + Session *s = i->second; + auto by_state_entry = by_state.find(s->get_state()); + if (by_state_entry == by_state.end()) + by_state_entry = by_state.emplace(s->get_state(), + new xlist<Session*>).first; + by_state_entry->second->push_back(&s->item_session_list); + } + + // Population is complete. Trigger load waiters. + dout(10) << __func__ << ": v " << version + << ", " << session_map.size() << " sessions" << dendl; + projected = committing = committed = version; + dump(); + finish_contexts(g_ceph_context, waiting_for_load); + } +} + +/** + * Populate session state from OMAP records in this + * rank's sessionmap object. + */ +void SessionMap::load(MDSContext *onload) +{ + dout(10) << "load" << dendl; + + if (onload) + waiting_for_load.push_back(onload); + + C_IO_SM_Load *c = new C_IO_SM_Load(this, true); + object_t oid = get_object_name(); + object_locator_t oloc(mds->mdsmap->get_metadata_pool()); + + ObjectOperation op; + op.omap_get_header(&c->header_bl, &c->header_r); + op.omap_get_vals("", "", g_conf()->mds_sessionmap_keys_per_op, + &c->session_vals, &c->more_session_vals, &c->values_r); + + mds->objecter->read(oid, oloc, op, CEPH_NOSNAP, NULL, 0, new C_OnFinisher(c, mds->finisher)); +} + +namespace { +class C_IO_SM_LoadLegacy : public SessionMapIOContext { +public: + bufferlist bl; + explicit C_IO_SM_LoadLegacy(SessionMap *cm) : SessionMapIOContext(cm) {} + void finish(int r) override { + sessionmap->_load_legacy_finish(r, bl); + } + void print(ostream& out) const override { + out << "session_load_legacy"; + } +}; +} + + +/** + * Load legacy (object data blob) SessionMap format, assuming + * that waiting_for_load has already been populated with + * the relevant completion. This is the fallback if we do not + * find an OMAP header when attempting to load normally. 
+ */ +void SessionMap::load_legacy() +{ + dout(10) << __func__ << dendl; + + C_IO_SM_LoadLegacy *c = new C_IO_SM_LoadLegacy(this); + object_t oid = get_object_name(); + object_locator_t oloc(mds->mdsmap->get_metadata_pool()); + + mds->objecter->read_full(oid, oloc, CEPH_NOSNAP, &c->bl, 0, + new C_OnFinisher(c, mds->finisher)); +} + +void SessionMap::_load_legacy_finish(int r, bufferlist &bl) +{ + auto blp = bl.cbegin(); + if (r < 0) { + derr << "_load_finish got " << cpp_strerror(r) << dendl; + ceph_abort_msg("failed to load sessionmap"); + } + dump(); + decode_legacy(blp); // note: this sets last_cap_renew = now() + dout(10) << "_load_finish v " << version + << ", " << session_map.size() << " sessions, " + << bl.length() << " bytes" + << dendl; + projected = committing = committed = version; + dump(); + + // Mark all sessions dirty, so that on next save() we will write + // a complete OMAP version of the data loaded from the legacy format + for (ceph::unordered_map<entity_name_t, Session*>::iterator i = session_map.begin(); + i != session_map.end(); ++i) { + // Don't use mark_dirty because on this occasion we want to ignore the + // keys_per_op limit and do one big write (upgrade must be atomic) + dirty_sessions.insert(i->first); + } + loaded_legacy = true; + + finish_contexts(g_ceph_context, waiting_for_load); +} + + +// ---------------- +// SAVE + +namespace { +class C_IO_SM_Save : public SessionMapIOContext { + version_t version; +public: + C_IO_SM_Save(SessionMap *cm, version_t v) : SessionMapIOContext(cm), version(v) {} + void finish(int r) override { + if (r != 0) { + get_mds()->handle_write_error(r); + } else { + sessionmap->_save_finish(version); + } + } + void print(ostream& out) const override { + out << "session_save"; + } +}; +} + +void SessionMap::save(MDSContext *onsave, version_t needv) +{ + dout(10) << __func__ << ": needv " << needv << ", v " << version << dendl; + + if (needv && committing >= needv) { + ceph_assert(committing > committed); + 
commit_waiters[committing].push_back(onsave); + return; + } + + commit_waiters[version].push_back(onsave); + + committing = version; + SnapContext snapc; + object_t oid = get_object_name(); + object_locator_t oloc(mds->mdsmap->get_metadata_pool()); + + ObjectOperation op; + + /* Compose OSD OMAP transaction for full write */ + bufferlist header_bl; + encode_header(&header_bl); + op.omap_set_header(header_bl); + + /* If we loaded a legacy sessionmap, then erase the old data. If + * an old-versioned MDS tries to read it, it'll fail out safely + * with an end_of_buffer exception */ + if (loaded_legacy) { + dout(4) << __func__ << " erasing legacy sessionmap" << dendl; + op.truncate(0); + loaded_legacy = false; // only need to truncate once. + } + + dout(20) << " updating keys:" << dendl; + map<string, bufferlist> to_set; + for(std::set<entity_name_t>::iterator i = dirty_sessions.begin(); + i != dirty_sessions.end(); ++i) { + const entity_name_t name = *i; + Session *session = session_map[name]; + + if (session->is_open() || + session->is_closing() || + session->is_stale() || + session->is_killing()) { + dout(20) << " " << name << dendl; + // Serialize K + std::ostringstream k; + k << name; + + // Serialize V + bufferlist bl; + session->info.encode(bl, mds->mdsmap->get_up_features()); + + // Add to RADOS op + to_set[k.str()] = bl; + + session->clear_dirty_completed_requests(); + } else { + dout(20) << " " << name << " (ignoring)" << dendl; + } + } + if (!to_set.empty()) { + op.omap_set(to_set); + } + + dout(20) << " removing keys:" << dendl; + set<string> to_remove; + for(std::set<entity_name_t>::const_iterator i = null_sessions.begin(); + i != null_sessions.end(); ++i) { + dout(20) << " " << *i << dendl; + std::ostringstream k; + k << *i; + to_remove.insert(k.str()); + } + if (!to_remove.empty()) { + op.omap_rm_keys(to_remove); + } + + dirty_sessions.clear(); + null_sessions.clear(); + + mds->objecter->mutate(oid, oloc, op, snapc, + ceph::real_clock::now(), + 0, + new 
C_OnFinisher(new C_IO_SM_Save(this, version), + mds->finisher)); +} + +void SessionMap::_save_finish(version_t v) +{ + dout(10) << "_save_finish v" << v << dendl; + committed = v; + + finish_contexts(g_ceph_context, commit_waiters[v]); + commit_waiters.erase(v); +} + + +/** + * Deserialize sessions, and update by_state index + */ +void SessionMap::decode_legacy(bufferlist::const_iterator &p) +{ + // Populate `sessions` + SessionMapStore::decode_legacy(p); + + // Update `by_state` + for (ceph::unordered_map<entity_name_t, Session*>::iterator i = session_map.begin(); + i != session_map.end(); ++i) { + Session *s = i->second; + auto by_state_entry = by_state.find(s->get_state()); + if (by_state_entry == by_state.end()) + by_state_entry = by_state.emplace(s->get_state(), + new xlist<Session*>).first; + by_state_entry->second->push_back(&s->item_session_list); + } +} + +uint64_t SessionMap::set_state(Session *session, int s) { + if (session->state != s) { + session->set_state(s); + auto by_state_entry = by_state.find(s); + if (by_state_entry == by_state.end()) + by_state_entry = by_state.emplace(s, new xlist<Session*>).first; + by_state_entry->second->push_back(&session->item_session_list); + + if (session->is_open() || session->is_stale()) { + session->set_load_avg_decay_rate(decay_rate); + } + + // refresh number of sessions for states which have perf + // couters associated + logger->set(l_mdssm_session_open, + get_session_count_in_state(Session::STATE_OPEN)); + logger->set(l_mdssm_session_stale, + get_session_count_in_state(Session::STATE_STALE)); + } + + return session->get_state_seq(); +} + +void SessionMapStore::decode_legacy(bufferlist::const_iterator& p) +{ + auto now = clock::now(); + uint64_t pre; + decode(pre, p); + if (pre == (uint64_t)-1) { + DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, p); + ceph_assert(struct_v >= 2); + + decode(version, p); + + while (!p.end()) { + entity_inst_t inst; + decode(inst.name, p); + Session *s = get_or_add_session(inst); + if 
(s->is_closed()) { + s->set_state(Session::STATE_OPEN); + s->set_load_avg_decay_rate(decay_rate); + } + s->decode(p); + } + + DECODE_FINISH(p); + } else { + // --- old format ---- + version = pre; + + // this is a meaningless upper bound. can be ignored. + __u32 n; + decode(n, p); + + while (n-- && !p.end()) { + auto p2 = p; + Session *s = new Session(ConnectionRef()); + s->info.decode(p); + { + auto& name = s->info.inst.name; + auto it = session_map.find(name); + if (it != session_map.end()) { + // eager client connected too fast! aie. + dout(10) << " already had session for " << name << ", recovering" << dendl; + delete s; + s = it->second; + p = p2; + s->info.decode(p); + } else { + it->second = s; + } + } + s->set_state(Session::STATE_OPEN); + s->set_load_avg_decay_rate(decay_rate); + s->last_cap_renew = now; + } + } +} + +void Session::dump(Formatter *f) const +{ + f->dump_int("id", info.inst.name.num()); + f->dump_object("entity", info.inst); + f->dump_string("state", get_state_name()); + f->dump_int("num_leases", leases.size()); + f->dump_int("num_caps", caps.size()); + if (is_open() || is_stale()) { + f->dump_unsigned("request_load_avg", get_load_avg()); + } + f->dump_float("uptime", get_session_uptime()); + f->dump_unsigned("requests_in_flight", get_request_count()); + f->dump_unsigned("completed_requests", get_num_completed_requests()); + f->dump_bool("reconnecting", reconnecting); + f->dump_object("recall_caps", recall_caps); + f->dump_object("release_caps", release_caps); + f->dump_object("recall_caps_throttle", recall_caps_throttle); + f->dump_object("recall_caps_throttle2o", recall_caps_throttle2o); + f->dump_object("session_cache_liveness", session_cache_liveness); + f->dump_object("cap_acquisition", cap_acquisition); + info.dump(f); +} + +void SessionMapStore::dump(Formatter *f) const +{ + f->open_array_section("sessions"); + for (const auto& p : session_map) { + f->dump_object("session", *p.second); + } + f->close_section(); // Sessions +} + +void 
SessionMapStore::generate_test_instances(list<SessionMapStore*>& ls) +{ + // pretty boring for now + ls.push_back(new SessionMapStore()); +} + +void SessionMap::wipe() +{ + dout(1) << "wipe start" << dendl; + dump(); + while (!session_map.empty()) { + Session *s = session_map.begin()->second; + remove_session(s); + } + version = ++projected; + dout(1) << "wipe result" << dendl; + dump(); + dout(1) << "wipe done" << dendl; +} + +void SessionMap::wipe_ino_prealloc() +{ + for (ceph::unordered_map<entity_name_t,Session*>::iterator p = session_map.begin(); + p != session_map.end(); + ++p) { + p->second->pending_prealloc_inos.clear(); + p->second->info.prealloc_inos.clear(); + p->second->info.used_inos.clear(); + } + projected = ++version; +} + +void SessionMap::add_session(Session *s) +{ + dout(10) << __func__ << " s=" << s << " name=" << s->info.inst.name << dendl; + + ceph_assert(session_map.count(s->info.inst.name) == 0); + session_map[s->info.inst.name] = s; + auto by_state_entry = by_state.find(s->state); + if (by_state_entry == by_state.end()) + by_state_entry = by_state.emplace(s->state, new xlist<Session*>).first; + by_state_entry->second->push_back(&s->item_session_list); + s->get(); + + update_average_birth_time(*s); + + logger->set(l_mdssm_session_count, session_map.size()); + logger->inc(l_mdssm_session_add); +} + +void SessionMap::remove_session(Session *s) +{ + dout(10) << __func__ << " s=" << s << " name=" << s->info.inst.name << dendl; + + update_average_birth_time(*s, false); + + s->trim_completed_requests(0); + s->item_session_list.remove_myself(); + session_map.erase(s->info.inst.name); + dirty_sessions.erase(s->info.inst.name); + null_sessions.insert(s->info.inst.name); + s->put(); + + logger->set(l_mdssm_session_count, session_map.size()); + logger->inc(l_mdssm_session_remove); +} + +void SessionMap::touch_session(Session *session) +{ + dout(10) << __func__ << " s=" << session << " name=" << session->info.inst.name << dendl; + + // Move to the back 
of the session list for this state (should + // already be on a list courtesy of add_session and set_state) + ceph_assert(session->item_session_list.is_on_list()); + auto by_state_entry = by_state.find(session->state); + if (by_state_entry == by_state.end()) + by_state_entry = by_state.emplace(session->state, + new xlist<Session*>).first; + by_state_entry->second->push_back(&session->item_session_list); + + session->last_cap_renew = clock::now(); +} + +void SessionMap::_mark_dirty(Session *s, bool may_save) +{ + if (dirty_sessions.count(s->info.inst.name)) + return; + + if (may_save && + dirty_sessions.size() >= g_conf()->mds_sessionmap_keys_per_op) { + // Pre-empt the usual save() call from journal segment trim, in + // order to avoid building up an oversized OMAP update operation + // from too many sessions modified at once + save(new C_MDSInternalNoop, version); + } + + null_sessions.erase(s->info.inst.name); + dirty_sessions.insert(s->info.inst.name); +} + +void SessionMap::mark_dirty(Session *s, bool may_save) +{ + dout(20) << __func__ << " s=" << s << " name=" << s->info.inst.name + << " v=" << version << dendl; + + _mark_dirty(s, may_save); + version++; + s->pop_pv(version); +} + +void SessionMap::replay_dirty_session(Session *s) +{ + dout(20) << __func__ << " s=" << s << " name=" << s->info.inst.name + << " v=" << version << dendl; + + _mark_dirty(s, false); + + replay_advance_version(); +} + +void SessionMap::replay_advance_version() +{ + version++; + projected = version; +} + +void SessionMap::replay_open_sessions(version_t event_cmapv, + map<client_t,entity_inst_t>& client_map, + map<client_t,client_metadata_t>& client_metadata_map) +{ + unsigned already_saved; + + if (version + client_map.size() < event_cmapv) + goto bad; + + // Server::finish_force_open_sessions() marks sessions dirty one by one. + // Marking a session dirty may flush all existing dirty sessions. So it's + // possible that some sessions are already saved in sessionmap. 
+ already_saved = client_map.size() - (event_cmapv - version); + for (const auto& p : client_map) { + Session *s = get_or_add_session(p.second); + auto q = client_metadata_map.find(p.first); + if (q != client_metadata_map.end()) + s->info.client_metadata.merge(q->second); + + if (already_saved > 0) { + if (s->is_closed()) + goto bad; + + --already_saved; + continue; + } + + set_state(s, Session::STATE_OPEN); + replay_dirty_session(s); + } + return; + +bad: + mds->clog->error() << "error replaying open sessions(" << client_map.size() + << ") sessionmap v " << event_cmapv << " table " << version; + ceph_assert(g_conf()->mds_wipe_sessions); + mds->sessionmap.wipe(); + mds->sessionmap.set_version(event_cmapv); +} + +version_t SessionMap::mark_projected(Session *s) +{ + dout(20) << __func__ << " s=" << s << " name=" << s->info.inst.name + << " pv=" << projected << " -> " << projected + 1 << dendl; + ++projected; + s->push_pv(projected); + return projected; +} + +namespace { +class C_IO_SM_Save_One : public SessionMapIOContext { + MDSContext *on_safe; +public: + C_IO_SM_Save_One(SessionMap *cm, MDSContext *on_safe_) + : SessionMapIOContext(cm), on_safe(on_safe_) {} + void finish(int r) override { + if (r != 0) { + get_mds()->handle_write_error(r); + } else { + on_safe->complete(r); + } + } + void print(ostream& out) const override { + out << "session_save_one"; + } +}; +} + + +void SessionMap::save_if_dirty(const std::set<entity_name_t> &tgt_sessions, + MDSGatherBuilder *gather_bld) +{ + ceph_assert(gather_bld != NULL); + + std::vector<entity_name_t> write_sessions; + + // Decide which sessions require a write + for (std::set<entity_name_t>::iterator i = tgt_sessions.begin(); + i != tgt_sessions.end(); ++i) { + const entity_name_t &session_id = *i; + + if (session_map.count(session_id) == 0) { + // Session isn't around any more, never mind. 
+ continue; + } + + Session *session = session_map[session_id]; + if (!session->has_dirty_completed_requests()) { + // Session hasn't had completed_requests + // modified since last write, no need to + // write it now. + continue; + } + + if (dirty_sessions.count(session_id) > 0) { + // Session is already dirtied, will be written, no + // need to pre-empt that. + continue; + } + // Okay, passed all our checks, now we write + // this session out. The version we write + // into the OMAP may now be higher-versioned + // than the version in the header, but that's + // okay because it's never a problem to have + // an overly-fresh copy of a session. + write_sessions.push_back(*i); + } + + dout(4) << __func__ << ": writing " << write_sessions.size() << dendl; + + // Batch writes into mds_sessionmap_keys_per_op + const uint32_t kpo = g_conf()->mds_sessionmap_keys_per_op; + map<string, bufferlist> to_set; + for (uint32_t i = 0; i < write_sessions.size(); ++i) { + const entity_name_t &session_id = write_sessions[i]; + Session *session = session_map[session_id]; + session->clear_dirty_completed_requests(); + + // Serialize K + std::ostringstream k; + k << session_id; + + // Serialize V + bufferlist bl; + session->info.encode(bl, mds->mdsmap->get_up_features()); + + // Add to RADOS op + to_set[k.str()] = bl; + + // Complete this write transaction? 
+ if (i == write_sessions.size() - 1 + || i % kpo == kpo - 1) { + ObjectOperation op; + op.omap_set(to_set); + to_set.clear(); // clear to start a new transaction + + SnapContext snapc; + object_t oid = get_object_name(); + object_locator_t oloc(mds->mdsmap->get_metadata_pool()); + MDSContext *on_safe = gather_bld->new_sub(); + mds->objecter->mutate(oid, oloc, op, snapc, + ceph::real_clock::now(), 0, + new C_OnFinisher( + new C_IO_SM_Save_One(this, on_safe), + mds->finisher)); + } + } +} + +// ================= +// Session + +#undef dout_prefix +#define dout_prefix *_dout << "Session " + +/** + * Calculate the length of the `requests` member list, + * because elist does not have a size() method. + * + * O(N) runtime. + */ +size_t Session::get_request_count() const +{ + size_t result = 0; + for (auto p = requests.begin(); !p.end(); ++p) + ++result; + return result; +} + +/** + * Capped in response to a CEPH_MSG_CLIENT_CAPRELEASE message, + * with n_caps equal to the number of caps that were released + * in the message. Used to update state about how many caps a + * client has released since it was last instructed to RECALL_STATE. + */ +void Session::notify_cap_release(size_t n_caps) +{ + recall_caps.hit(-(double)n_caps); + release_caps.hit(n_caps); +} + +/** + * Called when a CEPH_MSG_CLIENT_SESSION->CEPH_SESSION_RECALL_STATE + * message is sent to the client. Update our recall-related state + * in order to generate health metrics if the session doesn't see + * a commensurate number of calls to ::notify_cap_release + */ +uint64_t Session::notify_recall_sent(size_t new_limit) +{ + const auto num_caps = caps.size(); + ceph_assert(new_limit < num_caps); // Behaviour of Server::recall_client_state + const auto count = num_caps-new_limit; + uint64_t new_change; + if (recall_limit != new_limit) { + new_change = count; + } else { + new_change = 0; /* no change! 
*/ + } + + /* Always hit the session counter as a RECALL message is still sent to the + * client and we do not want the MDS to burn its global counter tokens on a + * session that is not releasing caps (i.e. allow the session counter to + * throttle future RECALL messages). + */ + recall_caps_throttle.hit(count); + recall_caps_throttle2o.hit(count); + recall_caps.hit(count); + return new_change; +} + +/** + * Use client metadata to generate a somewhat-friendlier + * name for the client than its session ID. + * + * This is *not* guaranteed to be unique, and any machine + * consumers of session-related output should always use + * the session ID as a primary capacity and use this only + * as a presentation hint. + */ +void Session::_update_human_name() +{ + auto info_client_metadata_entry = info.client_metadata.find("hostname"); + if (info_client_metadata_entry != info.client_metadata.end()) { + // Happy path, refer to clients by hostname + human_name = info_client_metadata_entry->second; + if (!info.auth_name.has_default_id()) { + // When a non-default entity ID is set by the user, assume they + // would like to see it in references to the client, if it's + // reasonable short. Limit the length because we don't want + // to put e.g. uuid-generated names into a "human readable" + // rendering. + const int arbitrarily_short = 16; + if (info.auth_name.get_id().size() < arbitrarily_short) { + human_name += std::string(":") + info.auth_name.get_id(); + } + } + } else { + // Fallback, refer to clients by ID e.g. 
client.4567 + human_name = stringify(info.inst.name.num()); + } +} + +void Session::decode(bufferlist::const_iterator &p) +{ + info.decode(p); + + _update_human_name(); +} + +int Session::check_access(CInode *in, unsigned mask, + int caller_uid, int caller_gid, + const vector<uint64_t> *caller_gid_list, + int new_uid, int new_gid) +{ + string path; + CInode *diri = NULL; + if (!in->is_base()) + diri = in->get_projected_parent_dn()->get_dir()->get_inode(); + if (diri && diri->is_stray()){ + path = in->get_projected_inode()->stray_prior_path; + dout(20) << __func__ << " stray_prior_path " << path << dendl; + } else { + in->make_path_string(path, true); + dout(20) << __func__ << " path " << path << dendl; + } + if (path.length()) + path = path.substr(1); // drop leading / + + if (in->inode.is_dir() && + in->inode.has_layout() && + in->inode.layout.pool_ns.length() && + !connection->has_feature(CEPH_FEATURE_FS_FILE_LAYOUT_V2)) { + dout(10) << __func__ << " client doesn't support FS_FILE_LAYOUT_V2" << dendl; + return -EIO; + } + + if (!auth_caps.is_capable(path, in->inode.uid, in->inode.gid, in->inode.mode, + caller_uid, caller_gid, caller_gid_list, mask, + new_uid, new_gid, + info.inst.addr)) { + return -EACCES; + } + return 0; +} + +// track total and per session load +void SessionMap::hit_session(Session *session) { + uint64_t sessions = get_session_count_in_state(Session::STATE_OPEN) + + get_session_count_in_state(Session::STATE_STALE) + + get_session_count_in_state(Session::STATE_CLOSING); + ceph_assert(sessions != 0); + + double total_load = total_load_avg.hit(); + double avg_load = total_load / sessions; + + logger->set(l_mdssm_total_load, (uint64_t)total_load); + logger->set(l_mdssm_avg_load, (uint64_t)avg_load); + + session->hit_session(); +} + +void SessionMap::handle_conf_change(const std::set<std::string>& changed) +{ + auto apply_to_open_sessions = [this](auto f) { + if (auto it = by_state.find(Session::STATE_OPEN); it != by_state.end()) { + for (const auto 
&session : *(it->second)) { + f(session); + } + } + if (auto it = by_state.find(Session::STATE_STALE); it != by_state.end()) { + for (const auto &session : *(it->second)) { + f(session); + } + } + }; + + if (changed.count("mds_request_load_average_decay_rate")) { + auto d = g_conf().get_val<double>("mds_request_load_average_decay_rate"); + + decay_rate = d; + total_load_avg = DecayCounter(d); + + auto mut = [d](auto s) { + s->set_load_avg_decay_rate(d); + }; + apply_to_open_sessions(mut); + } + if (changed.count("mds_recall_max_decay_rate")) { + auto d = g_conf().get_val<double>("mds_recall_max_decay_rate"); + auto mut = [d](auto s) { + s->recall_caps_throttle = DecayCounter(d); + }; + apply_to_open_sessions(mut); + } + if (changed.count("mds_recall_warning_decay_rate")) { + auto d = g_conf().get_val<double>("mds_recall_warning_decay_rate"); + auto mut = [d](auto s) { + s->recall_caps = DecayCounter(d); + s->release_caps = DecayCounter(d); + }; + apply_to_open_sessions(mut); + } + if (changed.count("mds_session_cache_liveness_decay_rate")) { + auto d = g_conf().get_val<double>("mds_session_cache_liveness_decay_rate"); + auto mut = [d](auto s) { + s->session_cache_liveness = DecayCounter(d); + s->session_cache_liveness.hit(s->caps.size()); /* so the MDS doesn't immediately start trimming a new session */ + }; + apply_to_open_sessions(mut); + } + if (changed.count("mds_session_cap_acquisition_decay_rate")) { + auto d = g_conf().get_val<double>("mds_session_cap_acquisition_decay_rate"); + auto mut = [d](auto s) { + s->cap_acquisition = DecayCounter(d); + }; + apply_to_open_sessions(mut); + } +} + +void SessionMap::update_average_session_age() { + if (!session_map.size()) { + return; + } + + double avg_uptime = std::chrono::duration<double>(clock::now()-avg_birth_time).count(); + logger->set(l_mdssm_avg_session_uptime, (uint64_t)avg_uptime); +} + +int SessionFilter::parse( + const std::vector<std::string> &args, + std::stringstream *ss) +{ + ceph_assert(ss != NULL); + 
+ for (const auto &s : args) { + dout(20) << __func__ << " parsing filter '" << s << "'" << dendl; + + auto eq = s.find("="); + if (eq == std::string::npos || eq == s.size()) { + *ss << "Invalid filter '" << s << "'"; + return -EINVAL; + } + + // Keys that start with this are to be taken as referring + // to freeform client metadata fields. + const std::string metadata_prefix("client_metadata."); + + auto k = s.substr(0, eq); + auto v = s.substr(eq + 1); + + dout(20) << __func__ << " parsed k='" << k << "', v='" << v << "'" << dendl; + + if (k.compare(0, metadata_prefix.size(), metadata_prefix) == 0 + && k.size() > metadata_prefix.size()) { + // Filter on arbitrary metadata key (no fixed schema for this, + // so anything after the dot is a valid field to filter on) + auto metadata_key = k.substr(metadata_prefix.size()); + metadata.insert(std::make_pair(metadata_key, v)); + } else if (k == "auth_name") { + // Filter on client entity name + auth_name = v; + } else if (k == "state") { + state = v; + } else if (k == "id") { + std::string err; + id = strict_strtoll(v.c_str(), 10, &err); + if (!err.empty()) { + *ss << err; + return -EINVAL; + } + } else if (k == "reconnecting") { + + /** + * Strict boolean parser. Allow true/false/0/1. + * Anything else is -EINVAL. 
+ */ + auto is_true = [](std::string_view bstr, bool *out) -> bool + { + ceph_assert(out != nullptr); + + if (bstr == "true" || bstr == "1") { + *out = true; + return 0; + } else if (bstr == "false" || bstr == "0") { + *out = false; + return 0; + } else { + return -EINVAL; + } + }; + + bool bval; + int r = is_true(v, &bval); + if (r == 0) { + set_reconnecting(bval); + } else { + *ss << "Invalid boolean value '" << v << "'"; + return -EINVAL; + } + } else { + *ss << "Invalid filter key '" << k << "'"; + return -EINVAL; + } + } + + return 0; +} + +bool SessionFilter::match( + const Session &session, + std::function<bool(client_t)> is_reconnecting) const +{ + for (const auto &m : metadata) { + const auto &k = m.first; + const auto &v = m.second; + auto it = session.info.client_metadata.find(k); + if (it == session.info.client_metadata.end()) { + return false; + } + if (it->second != v) { + return false; + } + } + + if (!auth_name.empty() && auth_name != session.info.auth_name.get_id()) { + return false; + } + + if (!state.empty() && state != session.get_state_name()) { + return false; + } + + if (id != 0 && id != session.info.inst.name.num()) { + return false; + } + + if (reconnecting.first) { + const bool am_reconnecting = is_reconnecting(session.info.inst.name.num()); + if (reconnecting.second != am_reconnecting) { + return false; + } + } + + return true; +} + +std::ostream& operator<<(std::ostream &out, const Session &s) +{ + if (s.get_human_name() == stringify(s.get_client())) { + out << s.get_human_name(); + } else { + out << s.get_human_name() << " (" << std::dec << s.get_client() << ")"; + } + return out; +} + diff --git a/src/mds/SessionMap.h b/src/mds/SessionMap.h new file mode 100644 index 00000000..dd7721cc --- /dev/null +++ b/src/mds/SessionMap.h @@ -0,0 +1,838 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil 
<sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_MDS_SESSIONMAP_H +#define CEPH_MDS_SESSIONMAP_H + +#include <set> +using std::set; + +#include "include/unordered_map.h" + +#include "include/Context.h" +#include "include/xlist.h" +#include "include/elist.h" +#include "include/interval_set.h" +#include "mdstypes.h" +#include "mds/MDSAuthCaps.h" +#include "common/perf_counters.h" +#include "common/DecayCounter.h" + +class CInode; +struct MDRequestImpl; + +#include "CInode.h" +#include "Capability.h" +#include "MDSContext.h" +#include "msg/Message.h" + +enum { + l_mdssm_first = 5500, + l_mdssm_session_count, + l_mdssm_session_add, + l_mdssm_session_remove, + l_mdssm_session_open, + l_mdssm_session_stale, + l_mdssm_total_load, + l_mdssm_avg_load, + l_mdssm_avg_session_uptime, + l_mdssm_last, +}; + +/* + * session + */ + +class Session : public RefCountedObject { + // -- state etc -- +public: + /* + + <deleted> <-- closed <------------+ + ^ | | + | v | + killing <-- opening <----+ | + ^ | | | + | v | | + stale <--> open --> closing ---+ + + + additional dimension of 'importing' (with counter) + + */ + + using clock = ceph::coarse_mono_clock; + using time = ceph::coarse_mono_time; + + + enum { + STATE_CLOSED = 0, + STATE_OPENING = 1, // journaling open + STATE_OPEN = 2, + STATE_CLOSING = 3, // journaling close + STATE_STALE = 4, + STATE_KILLING = 5 + }; + + static std::string_view get_state_name(int s) { + switch (s) { + case STATE_CLOSED: return "closed"; + case STATE_OPENING: return "opening"; + case STATE_OPEN: return "open"; + case STATE_CLOSING: return "closing"; + case STATE_STALE: return "stale"; + case STATE_KILLING: return "killing"; + default: return "???"; + } + } + + void dump(Formatter *f) const; + +private: + int state = STATE_CLOSED; + bool 
reconnecting = false; + uint64_t state_seq = 0; + int importing_count = 0; + friend class SessionMap; + + // Human (friendly) name is soft state generated from client metadata + void _update_human_name(); + std::string human_name; + + // Versions in this session was projected: used to verify + // that appropriate mark_dirty calls follow. + std::deque<version_t> projected; + + // request load average for this session + DecayCounter load_avg; + + // Ephemeral state for tracking progress of capability recalls + // caps being recalled recently by this session; used for Beacon warnings + DecayCounter recall_caps; + // caps that have been released + DecayCounter release_caps; + // throttle on caps recalled + DecayCounter recall_caps_throttle; + // second order throttle that prevents recalling too quickly + DecayCounter recall_caps_throttle2o; + // New limit in SESSION_RECALL + uint32_t recall_limit = 0; + + // session caps liveness + DecayCounter session_cache_liveness; + + // cap acquisition via readdir + DecayCounter cap_acquisition; + + // session start time -- used to track average session time + // note that this is initialized in the constructor rather + // than at the time of adding a session to the sessionmap + // as journal replay of sessionmap will not call add_session(). 
+ time birth_time; + +public: + Session *reclaiming_from = nullptr; + + void push_pv(version_t pv) + { + ceph_assert(projected.empty() || projected.back() != pv); + projected.push_back(pv); + } + + void pop_pv(version_t v) + { + ceph_assert(!projected.empty()); + ceph_assert(projected.front() == v); + projected.pop_front(); + } + + int get_state() const { return state; } + void set_state(int new_state) + { + if (state != new_state) { + state = new_state; + state_seq++; + } + } + + void set_reconnecting(bool s) { reconnecting = s; } + + void decode(bufferlist::const_iterator &p); + template<typename T> + void set_client_metadata(T&& meta) + { + info.client_metadata = std::forward<T>(meta); + _update_human_name(); + } + + const std::string& get_human_name() const {return human_name;} + + session_info_t info; ///< durable bits + + MDSAuthCaps auth_caps; + +protected: + ConnectionRef connection; +public: + xlist<Session*>::item item_session_list; + + list<Message::ref> preopen_out_queue; ///< messages for client, queued before they connect + + /* This is mutable to allow get_request_count to be const. elist does not + * support const iterators yet. 
+ */ + mutable elist<MDRequestImpl*> requests; + size_t get_request_count() const; + + interval_set<inodeno_t> pending_prealloc_inos; // journaling prealloc, will be added to prealloc_inos + + void notify_cap_release(size_t n_caps); + uint64_t notify_recall_sent(size_t new_limit); + auto get_recall_caps_throttle() const { + return recall_caps_throttle.get(); + } + auto get_recall_caps_throttle2o() const { + return recall_caps_throttle2o.get(); + } + auto get_recall_caps() const { + return recall_caps.get(); + } + auto get_release_caps() const { + return release_caps.get(); + } + auto get_session_cache_liveness() const { + return session_cache_liveness.get(); + } + auto get_cap_acquisition() const { + return cap_acquisition.get(); + } + + inodeno_t next_ino() const { + if (info.prealloc_inos.empty()) + return 0; + return info.prealloc_inos.range_start(); + } + inodeno_t take_ino(inodeno_t ino = 0) { + ceph_assert(!info.prealloc_inos.empty()); + + if (ino) { + if (info.prealloc_inos.contains(ino)) + info.prealloc_inos.erase(ino); + else + ino = 0; + } + if (!ino) { + ino = info.prealloc_inos.range_start(); + info.prealloc_inos.erase(ino); + } + info.used_inos.insert(ino, 1); + return ino; + } + int get_num_projected_prealloc_inos() const { + return info.prealloc_inos.size() + pending_prealloc_inos.size(); + } + + client_t get_client() const { + return info.get_client(); + } + + std::string_view get_state_name() const { return get_state_name(state); } + uint64_t get_state_seq() const { return state_seq; } + bool is_closed() const { return state == STATE_CLOSED; } + bool is_opening() const { return state == STATE_OPENING; } + bool is_open() const { return state == STATE_OPEN; } + bool is_closing() const { return state == STATE_CLOSING; } + bool is_stale() const { return state == STATE_STALE; } + bool is_killing() const { return state == STATE_KILLING; } + + void inc_importing() { + ++importing_count; + } + void dec_importing() { + ceph_assert(importing_count > 0); + 
--importing_count; + } + bool is_importing() const { return importing_count > 0; } + + void set_load_avg_decay_rate(double rate) { + ceph_assert(is_open() || is_stale()); + load_avg = DecayCounter(rate); + } + uint64_t get_load_avg() const { + return (uint64_t)load_avg.get(); + } + void hit_session() { + load_avg.adjust(); + } + + double get_session_uptime() const { + chrono::duration<double> uptime = clock::now() - birth_time; + return uptime.count(); + } + + time get_birth_time() const { + return birth_time; + } + + // -- caps -- +private: + uint32_t cap_gen = 0; + version_t cap_push_seq = 0; // cap push seq # + map<version_t, MDSContext::vec > waitfor_flush; // flush session messages + +public: + xlist<Capability*> caps; // inodes with caps; front=most recently used + xlist<ClientLease*> leases; // metadata leases to clients + time last_cap_renew = clock::zero(); + time last_seen = clock::zero(); + + void inc_cap_gen() { ++cap_gen; } + uint32_t get_cap_gen() const { return cap_gen; } + + version_t inc_push_seq() { return ++cap_push_seq; } + version_t get_push_seq() const { return cap_push_seq; } + + version_t wait_for_flush(MDSContext* c) { + waitfor_flush[get_push_seq()].push_back(c); + return get_push_seq(); + } + void finish_flush(version_t seq, MDSContext::vec& ls) { + while (!waitfor_flush.empty()) { + auto it = waitfor_flush.begin(); + if (it->first > seq) + break; + auto& v = it->second; + ls.insert(ls.end(), v.begin(), v.end()); + waitfor_flush.erase(it); + } + } + + void touch_readdir_cap(uint32_t count) { + cap_acquisition.hit(count); + } + + void touch_cap(Capability *cap) { + session_cache_liveness.hit(1.0); + caps.push_front(&cap->item_session_caps); + } + + void touch_cap_bottom(Capability *cap) { + session_cache_liveness.hit(1.0); + caps.push_back(&cap->item_session_caps); + } + + void touch_lease(ClientLease *r) { + session_cache_liveness.hit(1.0); + leases.push_back(&r->item_session_lease); + } + + bool is_any_flush_waiter() { + return 
!waitfor_flush.empty(); + } + + // -- leases -- + uint32_t lease_seq = 0; + + // -- completed requests -- +private: + // Has completed_requests been modified since the last time we + // wrote this session out? + bool completed_requests_dirty = false; + + unsigned num_trim_flushes_warnings = 0; + unsigned num_trim_requests_warnings = 0; +public: + void add_completed_request(ceph_tid_t t, inodeno_t created) { + info.completed_requests[t] = created; + completed_requests_dirty = true; + } + bool trim_completed_requests(ceph_tid_t mintid) { + // trim + bool erased_any = false; + while (!info.completed_requests.empty() && + (mintid == 0 || info.completed_requests.begin()->first < mintid)) { + info.completed_requests.erase(info.completed_requests.begin()); + erased_any = true; + } + + if (erased_any) { + completed_requests_dirty = true; + } + return erased_any; + } + bool have_completed_request(ceph_tid_t tid, inodeno_t *pcreated) const { + map<ceph_tid_t,inodeno_t>::const_iterator p = info.completed_requests.find(tid); + if (p == info.completed_requests.end()) + return false; + if (pcreated) + *pcreated = p->second; + return true; + } + + void add_completed_flush(ceph_tid_t tid) { + info.completed_flushes.insert(tid); + } + bool trim_completed_flushes(ceph_tid_t mintid) { + bool erased_any = false; + while (!info.completed_flushes.empty() && + (mintid == 0 || *info.completed_flushes.begin() < mintid)) { + info.completed_flushes.erase(info.completed_flushes.begin()); + erased_any = true; + } + if (erased_any) { + completed_requests_dirty = true; + } + return erased_any; + } + bool have_completed_flush(ceph_tid_t tid) const { + return info.completed_flushes.count(tid); + } + + uint64_t get_num_caps() const { + return caps.size(); + } + + unsigned get_num_completed_flushes() const { return info.completed_flushes.size(); } + unsigned get_num_trim_flushes_warnings() const { + return num_trim_flushes_warnings; + } + void inc_num_trim_flushes_warnings() { 
++num_trim_flushes_warnings; } + void reset_num_trim_flushes_warnings() { num_trim_flushes_warnings = 0; } + + unsigned get_num_completed_requests() const { return info.completed_requests.size(); } + unsigned get_num_trim_requests_warnings() const { + return num_trim_requests_warnings; + } + void inc_num_trim_requests_warnings() { ++num_trim_requests_warnings; } + void reset_num_trim_requests_warnings() { num_trim_requests_warnings = 0; } + + bool has_dirty_completed_requests() const + { + return completed_requests_dirty; + } + + void clear_dirty_completed_requests() + { + completed_requests_dirty = false; + } + + int check_access(CInode *in, unsigned mask, int caller_uid, int caller_gid, + const vector<uint64_t> *gid_list, int new_uid, int new_gid); + + Session() = delete; + Session(ConnectionRef con) : + recall_caps(g_conf().get_val<double>("mds_recall_warning_decay_rate")), + release_caps(g_conf().get_val<double>("mds_recall_warning_decay_rate")), + recall_caps_throttle(g_conf().get_val<double>("mds_recall_max_decay_rate")), + recall_caps_throttle2o(0.5), + session_cache_liveness(g_conf().get_val<double>("mds_session_cache_liveness_decay_rate")), + cap_acquisition(g_conf().get_val<double>("mds_session_cap_acquisition_decay_rate")), + birth_time(clock::now()), + auth_caps(g_ceph_context), + item_session_list(this), + requests(member_offset(MDRequestImpl, item_session_request)) + { + set_connection(std::move(con)); + } + ~Session() override { + if (state == STATE_CLOSED) { + item_session_list.remove_myself(); + } else { + ceph_assert(!item_session_list.is_on_list()); + } + preopen_out_queue.clear(); + } + + void set_connection(ConnectionRef con) { + connection = std::move(con); + auto& c = connection; + if (c) { + info.auth_name = c->get_peer_entity_name(); + info.inst.addr = c->get_peer_socket_addr(); + info.inst.name = entity_name_t(c->get_peer_type(), c->get_peer_global_id()); + } + } + const ConnectionRef& get_connection() const { + return connection; + } + + 
  // Reset the mutable per-session state: drop pending preallocated inos,
  // wipe the persisted session metadata, and zero the cap-push/renew
  // bookkeeping.  Leaves identity (connection, auth) untouched.
  void clear() {
    pending_prealloc_inos.clear();
    info.clear_meta();

    cap_push_seq = 0;
    last_cap_renew = clock::zero();
  }
};

/**
 * Predicate object used to select a subset of sessions, e.g. for the
 * "session ls"/"session evict" style admin commands.  Each criterion is
 * optional; an unset criterion matches everything (see match()).
 */
class SessionFilter
{
protected:
  // First is whether to filter, second is filter value
  std::pair<bool, bool> reconnecting;

public:
  std::map<std::string, std::string> metadata;  // metadata key/value equality filters
  std::string auth_name;                        // entity auth name filter ("" = any)
  std::string state;                            // session state name filter ("" = any)
  int64_t id;                                   // session (client) id filter (0 = any)

  SessionFilter()
    : reconnecting(false, false), id(0)
  {}

  // Returns true if `session` satisfies every enabled criterion.
  // `is_reconnecting` is a callback because reconnect state lives
  // outside the Session object itself.
  bool match(
      const Session &session,
      std::function<bool(client_t)> is_reconnecting) const;
  // Parse command-line style "key=value" filter args; on error returns
  // nonzero and writes a message to *ss.
  int parse(const std::vector<std::string> &args, std::stringstream *ss);
  void set_reconnecting(bool v)
  {
    reconnecting.first = true;
    reconnecting.second = v;
  }
};

/*
 * session map
 */

class MDSRank;

/**
 * Encapsulate the serialized state associated with SessionMap.  Allows
 * encode/decode outside of live MDS instance.
 */
class SessionMapStore {
public:
  using clock = Session::clock;
  using time = Session::time;

protected:
  version_t version;                                    // on-disk map version
  ceph::unordered_map<entity_name_t, Session*> session_map;  // owned Session pointers
  PerfCounters *logger;                                 // may be null (e.g. offline tools)

  // total request load avg
  double decay_rate;
  DecayCounter total_load_avg;

public:
  mds_rank_t rank;

  version_t get_version() const {return version;}

  virtual void encode_header(bufferlist *header_bl);
  virtual void decode_header(bufferlist &header_bl);
  virtual void decode_values(std::map<std::string, bufferlist> &session_vals);
  virtual void decode_legacy(bufferlist::const_iterator& blp);
  void dump(Formatter *f) const;

  void set_rank(mds_rank_t r)
  {
    rank = r;
  }

  // Look up the session for `i.name`, creating (and counting, if a
  // logger is attached) a fresh one when absent.
  Session* get_or_add_session(const entity_inst_t& i) {
    Session *s;
    auto session_map_entry = session_map.find(i.name);
    if (session_map_entry != session_map.end()) {
      s = session_map_entry->second;
    } else {
      s = session_map[i.name] = new Session(ConnectionRef());
      s->info.inst = i;
      s->last_cap_renew = Session::clock::now();
      if (logger) {
	logger->set(l_mdssm_session_count, session_map.size());
	logger->inc(l_mdssm_session_add);
      }
    }

    return s;
  }

  static void generate_test_instances(list<SessionMapStore*>& ls);

  // Drop all sessions from the map.  NOTE(review): this does not delete
  // the Session objects; presumably callers only use it on stores whose
  // sessions are owned elsewhere or already released -- confirm.
  void reset_state()
  {
    session_map.clear();
  }

  SessionMapStore()
    : version(0), logger(nullptr),
      decay_rate(g_conf().get_val<double>("mds_request_load_average_decay_rate")),
      total_load_avg(decay_rate), rank(MDS_RANK_NONE) {
  }
  virtual ~SessionMapStore() {};
};

/**
 * The live, in-MDS session table: SessionMapStore plus per-state session
 * lists, journal/backing-store version bookkeeping (projected vs
 * committing vs committed) and load/save machinery.
 */
class SessionMap : public SessionMapStore {
public:
  MDSRank *mds;

protected:
  // Versions written (projected), in flight to the store (committing)
  // and durably stored (committed); see mark_projected()/mark_dirty().
  version_t projected = 0, committing = 0, committed = 0;
public:
  // Sessions bucketed by Session::STATE_*; lists are heap-allocated and
  // owned here (freed in the destructor).
  map<int,xlist<Session*>* > by_state;
  uint64_t set_state(Session *session, int state);
  // Contexts to complete once a given version has been committed.
  map<version_t, MDSContext::vec > commit_waiters;
  void update_average_session_age();

  SessionMap() = delete;
  explicit SessionMap(MDSRank *m) : mds(m) {}

  ~SessionMap() override
  {
    for (auto p : by_state)
      delete p.second;

    if (logger) {
      g_ceph_context->get_perfcounters_collection()->remove(logger);
    }

    delete logger;
  }

  void register_perfcounters();

  // Force both the stored and projected version (used when loading).
  void set_version(const version_t v)
  {
    version = projected = v;
  }

  void set_projected(const version_t v)
  {
    projected = v;
  }

  version_t get_projected() const
  {
    return projected;
  }

  version_t get_committed() const
  {
    return committed;
  }

  version_t get_committing() const
  {
    return committing;
  }

  // sessions
  void decode_legacy(bufferlist::const_iterator& blp) override;
  bool empty() const { return session_map.empty(); }
  const auto& get_sessions() const {
    return session_map;
  }

  // True if at least one session is currently in `state`.
  bool is_any_state(int state) const {
    auto it = by_state.find(state);
    if (it == by_state.end() || it->second->empty())
      return false;
    return true;
  }

  bool have_unclosed_sessions() const {
    return
      is_any_state(Session::STATE_OPENING) ||
      is_any_state(Session::STATE_OPEN) ||
      is_any_state(Session::STATE_CLOSING) ||
      is_any_state(Session::STATE_STALE) ||
      is_any_state(Session::STATE_KILLING);
  }
  bool have_session(entity_name_t w) const {
    return session_map.count(w);
  }
  Session* get_session(entity_name_t w) {
    auto session_map_entry = session_map.find(w);
    return (session_map_entry != session_map.end() ?
	    session_map_entry-> second : nullptr);
  }
  const Session* get_session(entity_name_t w) const {
    ceph::unordered_map<entity_name_t, Session*>::const_iterator p = session_map.find(w);
    if (p == session_map.end()) {
      return NULL;
    } else {
      return p->second;
    }
  }

  void add_session(Session *s);
  void remove_session(Session *s);
  void touch_session(Session *session);

  // Front of the per-state list, i.e. the least recently touched
  // session in `state`.  NOTE(review): returns `0` as a null pointer;
  // nullptr would be the idiomatic spelling.
  Session *get_oldest_session(int state) {
    auto by_state_entry = by_state.find(state);
    if (by_state_entry == by_state.end() || by_state_entry->second->empty())
      return 0;
    return by_state_entry->second->front();
  }

  void dump();

  // Invoke f(Session*) on every session whose entity is a client.
  template<typename F>
  void get_client_sessions(F&& f) const {
    for (const auto& p : session_map) {
      auto& session = p.second;
      if (session->info.inst.name.is_client())
	f(session);
    }
  }
  template<typename C>
  void get_client_session_set(C& c) const {
    auto f = [&c](auto& s) {
      c.insert(s);
    };
    get_client_sessions(f);
  }

  // helpers
  // Asserts the session exists; callers must check have_session() first.
  entity_inst_t& get_inst(entity_name_t w) {
    ceph_assert(session_map.count(w));
    return session_map[w]->info.inst;
  }
  version_t get_push_seq(client_t client) {
    return get_session(entity_name_t::CLIENT(client.v))->get_push_seq();
  }
  bool have_completed_request(metareqid_t rid) {
    Session *session = get_session(rid.name);
    return session && session->have_completed_request(rid.tid, NULL);
  }
  void trim_completed_requests(entity_name_t c, ceph_tid_t tid) {
    Session *session = get_session(c);
    ceph_assert(session);
    session->trim_completed_requests(tid);
  }

  void wipe();
  void wipe_ino_prealloc();

  // -- loading, saving --
  inodeno_t ino;
  MDSContext::vec waiting_for_load;

  object_t get_object_name() const;

  void load(MDSContext *onload);
  void _load_finish(
      int operation_r,
      int header_r,
      int values_r,
      bool first,
      bufferlist &header_bl,
      std::map<std::string, bufferlist> &session_vals,
      bool more_session_vals);

  void load_legacy();
  void _load_legacy_finish(int r, bufferlist &bl);

  void save(MDSContext *onsave, version_t needv=0);
  void _save_finish(version_t v);

protected:
  std::set<entity_name_t> dirty_sessions;  // journalled, awaiting writeback
  std::set<entity_name_t> null_sessions;   // removed, awaiting deletion from store
  bool loaded_legacy = false;
  void _mark_dirty(Session *session, bool may_save);
public:

  /**
   * Advance the version, and mark this session
   * as dirty within the new version.
   *
   * Dirty means journalled but needing writeback
   * to the backing store.  Must have called
   * mark_projected previously for this session.
   */
  void mark_dirty(Session *session, bool may_save=true);

  /**
   * Advance the projected version, and mark this
   * session as projected within the new version
   *
   * Projected means the session is updated in memory
   * but we're waiting for the journal write of the update
   * to finish.  Must subsequently call mark_dirty
   * for sessions in the same global order as calls
   * to mark_projected.
   */
  version_t mark_projected(Session *session);

  /**
   * During replay, advance versions to account
   * for a session modification, and mark the
   * session dirty.
   */
  void replay_dirty_session(Session *session);

  /**
   * During replay, if a session no longer present
   * would have consumed a version, advance `version`
   * and `projected` to account for that.
   */
  void replay_advance_version();

  /**
   * During replay, open sessions, advance versions and
   * mark these sessions as dirty.
   */
  void replay_open_sessions(version_t event_cmapv,
			    map<client_t,entity_inst_t>& client_map,
			    map<client_t,client_metadata_t>& client_metadata_map);

  /**
   * For these session IDs, if a session exists with this ID, and it has
   * dirty completed_requests, then persist it immediately
   * (ahead of usual project/dirty versioned writes
   * of the map).
   */
  void save_if_dirty(const std::set<entity_name_t> &tgt_sessions,
		     MDSGatherBuilder *gather_bld);

private:
  time avg_birth_time = clock::zero();

  uint64_t get_session_count_in_state(int state) {
    return !is_any_state(state) ? 0 : by_state[state]->size();
  }

  // Incrementally maintain the mean session birth time.  The arithmetic
  // assumes `session_map.size()` is the post-add count when added=true
  // and the pre-remove count when added=false (i.e. call after insert,
  // before erase).
  void update_average_birth_time(const Session &s, bool added=true) {
    uint32_t sessions = session_map.size();
    time birth_time = s.get_birth_time();

    if (sessions == 1) {
      avg_birth_time = added ? birth_time : clock::zero();
      return;
    }

    if (added) {
      avg_birth_time = clock::time_point(
	((avg_birth_time - clock::zero()) / sessions) * (sessions - 1) +
	(birth_time - clock::zero()) / sessions);
    } else {
      avg_birth_time = clock::time_point(
	((avg_birth_time - clock::zero()) / (sessions - 1)) * sessions -
	(birth_time - clock::zero()) / (sessions - 1));
    }
  }

public:
  void hit_session(Session *session);
  void handle_conf_change(const std::set <std::string> &changed);
};

std::ostream& operator<<(std::ostream &out, const Session &s);


#endif
diff --git a/src/mds/SimpleLock.cc b/src/mds/SimpleLock.cc
new file mode 100644
index 00000000..c4c0ae0d
--- /dev/null
+++ b/src/mds/SimpleLock.cc
@@ -0,0 +1,43 @@
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2015 Red Hat
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
+ * + */ + + +#include "SimpleLock.h" +#include "Mutation.h" + +void SimpleLock::dump(Formatter *f) const { + ceph_assert(f != NULL); + if (is_sync_and_unlocked()) { + return; + } + + f->open_array_section("gather_set"); + if (have_more()) { + for(const auto &i : more()->gather_set) { + f->dump_int("rank", i); + } + } + f->close_section(); + + f->dump_string("state", get_state_name(get_state())); + f->dump_bool("is_leased", is_leased()); + f->dump_int("num_rdlocks", get_num_rdlocks()); + f->dump_int("num_wrlocks", get_num_wrlocks()); + f->dump_int("num_xlocks", get_num_xlocks()); + f->open_object_section("xlock_by"); + if (get_xlock_by()) { + get_xlock_by()->dump(f); + } + f->close_section(); +} diff --git a/src/mds/SimpleLock.h b/src/mds/SimpleLock.h new file mode 100644 index 00000000..2d719b27 --- /dev/null +++ b/src/mds/SimpleLock.h @@ -0,0 +1,720 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#ifndef CEPH_SIMPLELOCK_H +#define CEPH_SIMPLELOCK_H + +#include <boost/intrusive_ptr.hpp> + +#include "MDSCacheObject.h" +#include "MDSContext.h" + +// -- lock types -- +// see CEPH_LOCK_* + + +struct MutationImpl; +typedef boost::intrusive_ptr<MutationImpl> MutationRef; + +extern "C" { +#include "locks.h" +} + + +#define CAP_ANY 0 +#define CAP_LONER 1 +#define CAP_XLOCKER 2 + +struct LockType { + int type; + const sm_t *sm; + + explicit LockType(int t) : type(t) { + switch (type) { + case CEPH_LOCK_DN: + case CEPH_LOCK_IAUTH: + case CEPH_LOCK_ILINK: + case CEPH_LOCK_IXATTR: + case CEPH_LOCK_ISNAP: + case CEPH_LOCK_IFLOCK: + case CEPH_LOCK_IPOLICY: + sm = &sm_simplelock; + break; + case CEPH_LOCK_IDFT: + case CEPH_LOCK_INEST: + sm = &sm_scatterlock; + break; + case CEPH_LOCK_IFILE: + sm = &sm_filelock; + break; + case CEPH_LOCK_DVERSION: + case CEPH_LOCK_IVERSION: + sm = &sm_locallock; + break; + default: + sm = 0; + } + } + +}; + + +class SimpleLock { +public: + LockType *type; + + static std::string_view get_state_name(int n) { + switch (n) { + case LOCK_UNDEF: return "UNDEF"; + case LOCK_SYNC: return "sync"; + case LOCK_LOCK: return "lock"; + + case LOCK_PREXLOCK: return "prexlock"; + case LOCK_XLOCK: return "xlock"; + case LOCK_XLOCKDONE: return "xlockdone"; + case LOCK_XLOCKSNAP: return "xlocksnap"; + case LOCK_LOCK_XLOCK: return "lock->xlock"; + + case LOCK_SYNC_LOCK: return "sync->lock"; + case LOCK_LOCK_SYNC: return "lock->sync"; + case LOCK_REMOTEXLOCK: return "remote_xlock"; + case LOCK_EXCL: return "excl"; + case LOCK_EXCL_SYNC: return "excl->sync"; + case LOCK_EXCL_LOCK: return "excl->lock"; + case LOCK_SYNC_EXCL: return "sync->excl"; + case LOCK_LOCK_EXCL: return "lock->excl"; + + case LOCK_XSYN: return "xsyn"; + case LOCK_XSYN_EXCL: return "xsyn->excl"; + case LOCK_EXCL_XSYN: return "excl->xsyn"; + case LOCK_XSYN_SYNC: return "xsyn->sync"; + case LOCK_XSYN_LOCK: return "xsyn->lock"; + case LOCK_XSYN_MIX: return "xsyn->mix"; + + case 
LOCK_SYNC_MIX: return "sync->mix"; + case LOCK_SYNC_MIX2: return "sync->mix(2)"; + case LOCK_LOCK_TSYN: return "lock->tsyn"; + + case LOCK_MIX_LOCK: return "mix->lock"; + case LOCK_MIX_LOCK2: return "mix->lock(2)"; + case LOCK_MIX: return "mix"; + case LOCK_MIX_TSYN: return "mix->tsyn"; + + case LOCK_TSYN_MIX: return "tsyn->mix"; + case LOCK_TSYN_LOCK: return "tsyn->lock"; + case LOCK_TSYN: return "tsyn"; + + case LOCK_MIX_SYNC: return "mix->sync"; + case LOCK_MIX_SYNC2: return "mix->sync(2)"; + case LOCK_EXCL_MIX: return "excl->mix"; + case LOCK_MIX_EXCL: return "mix->excl"; + + case LOCK_PRE_SCAN: return "*->scan"; + case LOCK_SCAN: return "scan"; + + case LOCK_SNAP_SYNC: return "snap->sync"; + + default: ceph_abort(); return std::string_view(); + } + } + + static std::string_view get_lock_type_name(int t) { + switch (t) { + case CEPH_LOCK_DN: return "dn"; + case CEPH_LOCK_DVERSION: return "dversion"; + case CEPH_LOCK_IVERSION: return "iversion"; + case CEPH_LOCK_IFILE: return "ifile"; + case CEPH_LOCK_IAUTH: return "iauth"; + case CEPH_LOCK_ILINK: return "ilink"; + case CEPH_LOCK_IDFT: return "idft"; + case CEPH_LOCK_INEST: return "inest"; + case CEPH_LOCK_IXATTR: return "ixattr"; + case CEPH_LOCK_ISNAP: return "isnap"; + case CEPH_LOCK_INO: return "ino"; + case CEPH_LOCK_IFLOCK: return "iflock"; + case CEPH_LOCK_IPOLICY: return "ipolicy"; + default: ceph_abort(); return std::string_view(); + } + } + + static std::string_view get_lock_action_name(int a) { + switch (a) { + case LOCK_AC_SYNC: return "sync"; + case LOCK_AC_MIX: return "mix"; + case LOCK_AC_LOCK: return "lock"; + case LOCK_AC_LOCKFLUSHED: return "lockflushed"; + + case LOCK_AC_SYNCACK: return "syncack"; + case LOCK_AC_MIXACK: return "mixack"; + case LOCK_AC_LOCKACK: return "lockack"; + + case LOCK_AC_REQSCATTER: return "reqscatter"; + case LOCK_AC_REQUNSCATTER: return "requnscatter"; + case LOCK_AC_NUDGE: return "nudge"; + case LOCK_AC_REQRDLOCK: return "reqrdlock"; + default: return "???"; + } + } 
+ + // waiting + static const uint64_t WAIT_RD = (1<<0); // to read + static const uint64_t WAIT_WR = (1<<1); // to write + static const uint64_t WAIT_XLOCK = (1<<2); // to xlock (** dup) + static const uint64_t WAIT_STABLE = (1<<2); // for a stable state + static const uint64_t WAIT_REMOTEXLOCK = (1<<3); // for a remote xlock + static const int WAIT_BITS = 4; + static const uint64_t WAIT_ALL = ((1<<WAIT_BITS)-1); + + +protected: + // parent (what i lock) + MDSCacheObject *parent; + + // lock state + __s16 state; + __s16 state_flags; + + enum { + LEASED = 1 << 0, + NEED_RECOVER = 1 << 1, + }; + +private: + int num_rdlock; + + // XXX not in mempool + struct unstable_bits_t { + set<__s32> gather_set; // auth+rep. >= 0 is mds, < 0 is client + + // local state + int num_wrlock = 0, num_xlock = 0; + MutationRef xlock_by; + client_t xlock_by_client = -1; + client_t excl_client = -1; + + bool empty() { + return + gather_set.empty() && + num_wrlock == 0 && + num_xlock == 0 && + xlock_by.get() == NULL && + xlock_by_client == -1 && + excl_client == -1; + } + + unstable_bits_t() {} + }; + + mutable std::unique_ptr<unstable_bits_t> _unstable; + + bool have_more() const { return _unstable ? true : false; } + unstable_bits_t *more() const { + if (!_unstable) + _unstable.reset(new unstable_bits_t); + return _unstable.get(); + } + void try_clear_more() { + if (_unstable && _unstable->empty()) { + _unstable.reset(); + } + } + +public: + + client_t get_excl_client() const { + return have_more() ? 
more()->excl_client : -1; + } + void set_excl_client(client_t c) { + if (c < 0 && !have_more()) + return; // default is -1 + more()->excl_client = c; + } + + SimpleLock(MDSCacheObject *o, LockType *lt) : + type(lt), + parent(o), + state(LOCK_SYNC), + state_flags(0), + num_rdlock(0) + {} + virtual ~SimpleLock() {} + + virtual bool is_scatterlock() const { + return false; + } + virtual bool is_locallock() const { + return false; + } + + // parent + MDSCacheObject *get_parent() { return parent; } + int get_type() const { return type->type; } + const sm_t* get_sm() const { return type->sm; } + + int get_wait_shift() const { + switch (get_type()) { + case CEPH_LOCK_DN: return 8; + case CEPH_LOCK_DVERSION: return 8 + 1*SimpleLock::WAIT_BITS; + case CEPH_LOCK_IAUTH: return 8 + 2*SimpleLock::WAIT_BITS; + case CEPH_LOCK_ILINK: return 8 + 3*SimpleLock::WAIT_BITS; + case CEPH_LOCK_IDFT: return 8 + 4*SimpleLock::WAIT_BITS; + case CEPH_LOCK_IFILE: return 8 + 5*SimpleLock::WAIT_BITS; + case CEPH_LOCK_IVERSION: return 8 + 6*SimpleLock::WAIT_BITS; + case CEPH_LOCK_IXATTR: return 8 + 7*SimpleLock::WAIT_BITS; + case CEPH_LOCK_ISNAP: return 8 + 8*SimpleLock::WAIT_BITS; + case CEPH_LOCK_INEST: return 8 + 9*SimpleLock::WAIT_BITS; + case CEPH_LOCK_IFLOCK: return 8 +10*SimpleLock::WAIT_BITS; + case CEPH_LOCK_IPOLICY: return 8 +11*SimpleLock::WAIT_BITS; + default: + ceph_abort(); + } + } + + int get_cap_shift() const { + switch (get_type()) { + case CEPH_LOCK_IAUTH: return CEPH_CAP_SAUTH; + case CEPH_LOCK_ILINK: return CEPH_CAP_SLINK; + case CEPH_LOCK_IFILE: return CEPH_CAP_SFILE; + case CEPH_LOCK_IXATTR: return CEPH_CAP_SXATTR; + default: return 0; + } + } + int get_cap_mask() const { + switch (get_type()) { + case CEPH_LOCK_IFILE: return (1 << CEPH_CAP_FILE_BITS) - 1; + default: return (1 << CEPH_CAP_SIMPLE_BITS) - 1; + } + } + + struct ptr_lt { + bool operator()(const SimpleLock* l, const SimpleLock* r) const { + // first sort by object type (dn < inode) + if (!(l->type->type > 
CEPH_LOCK_DN) && (r->type->type > CEPH_LOCK_DN)) return true; + if ((l->type->type > CEPH_LOCK_DN) == (r->type->type > CEPH_LOCK_DN)) { + // then sort by object + if (l->parent->is_lt(r->parent)) return true; + if (l->parent == r->parent) { + // then sort by (inode) lock type + if (l->type->type < r->type->type) return true; + } + } + return false; + } + }; + + void decode_locked_state(const bufferlist& bl) { + parent->decode_lock_state(type->type, bl); + } + void encode_locked_state(bufferlist& bl) { + parent->encode_lock_state(type->type, bl); + } + void finish_waiters(uint64_t mask, int r=0) { + parent->finish_waiting(mask << get_wait_shift(), r); + } + void take_waiting(uint64_t mask, MDSContext::vec& ls) { + parent->take_waiting(mask << get_wait_shift(), ls); + } + void add_waiter(uint64_t mask, MDSContext *c) { + parent->add_waiter((mask << get_wait_shift()) | MDSCacheObject::WAIT_ORDERED, c); + } + bool is_waiter_for(uint64_t mask) const { + return parent->is_waiter_for(mask << get_wait_shift()); + } + + + + // state + int get_state() const { return state; } + int set_state(int s) { + state = s; + //assert(!is_stable() || gather_set.size() == 0); // gather should be empty in stable states. + return s; + } + void set_state_rejoin(int s, MDSContext::vec& waiters, bool survivor) { + ceph_assert(!get_parent()->is_auth()); + + // If lock in the replica object was not in SYNC state when auth mds of the object failed. + // Auth mds of the object may take xlock on the lock and change the object when replaying + // unsafe requests. 
+ if (!survivor || state != LOCK_SYNC) + mark_need_recover(); + + state = s; + + if (is_stable()) + take_waiting(SimpleLock::WAIT_ALL, waiters); + } + + bool is_stable() const { + return get_sm()->states[state].next == 0; + } + bool is_unstable_and_locked() const { + if (is_stable()) + return false; + return is_rdlocked() || is_wrlocked() || is_xlocked(); + } + int get_next_state() { + return get_sm()->states[state].next; + } + + + bool is_sync_and_unlocked() const { + return + get_state() == LOCK_SYNC && + !is_rdlocked() && + !is_leased() && + !is_wrlocked() && + !is_xlocked(); + } + + + /* + bool fw_rdlock_to_auth() { + return get_sm()->states[state].can_rdlock == FW; + } + */ + bool req_rdlock_from_auth() { + return get_sm()->states[state].can_rdlock == REQ; + } + + // gather set + static set<int32_t> empty_gather_set; + + // int32_t: <0 is client, >=0 is MDS rank + const set<int32_t>& get_gather_set() const { + return have_more() ? more()->gather_set : empty_gather_set; + } + + void init_gather() { + for (const auto p : parent->get_replicas()) { + more()->gather_set.insert(p.first); + } + } + bool is_gathering() const { + return have_more() && !more()->gather_set.empty(); + } + bool is_gathering(int32_t i) const { + return have_more() && more()->gather_set.count(i); + } + void clear_gather() { + if (have_more()) + more()->gather_set.clear(); + } + void remove_gather(int32_t i) { + if (have_more()) + more()->gather_set.erase(i); + } + + + + virtual bool is_dirty() const { return false; } + virtual bool is_stale() const { return false; } + virtual bool is_flushing() const { return false; } + virtual bool is_flushed() const { return false; } + virtual void clear_flushed() { } + + // can_* + bool can_lease(client_t client) const { + return get_sm()->states[state].can_lease == ANY || + (get_sm()->states[state].can_lease == AUTH && parent->is_auth()) || + (get_sm()->states[state].can_lease == XCL && client >= 0 && get_xlock_by_client() == client); + } + bool 
can_read(client_t client) const { + return get_sm()->states[state].can_read == ANY || + (get_sm()->states[state].can_read == AUTH && parent->is_auth()) || + (get_sm()->states[state].can_read == XCL && client >= 0 && get_xlock_by_client() == client); + } + bool can_read_projected(client_t client) const { + return get_sm()->states[state].can_read_projected == ANY || + (get_sm()->states[state].can_read_projected == AUTH && parent->is_auth()) || + (get_sm()->states[state].can_read_projected == XCL && client >= 0 && get_xlock_by_client() == client); + } + bool can_rdlock(client_t client) const { + return get_sm()->states[state].can_rdlock == ANY || + (get_sm()->states[state].can_rdlock == AUTH && parent->is_auth()) || + (get_sm()->states[state].can_rdlock == XCL && client >= 0 && get_xlock_by_client() == client); + } + bool can_wrlock(client_t client) const { + return get_sm()->states[state].can_wrlock == ANY || + (get_sm()->states[state].can_wrlock == AUTH && parent->is_auth()) || + (get_sm()->states[state].can_wrlock == XCL && client >= 0 && (get_xlock_by_client() == client || + get_excl_client() == client)); + } + bool can_force_wrlock(client_t client) const { + return get_sm()->states[state].can_force_wrlock == ANY || + (get_sm()->states[state].can_force_wrlock == AUTH && parent->is_auth()) || + (get_sm()->states[state].can_force_wrlock == XCL && client >= 0 && (get_xlock_by_client() == client || + get_excl_client() == client)); + } + bool can_xlock(client_t client) const { + return get_sm()->states[state].can_xlock == ANY || + (get_sm()->states[state].can_xlock == AUTH && parent->is_auth()) || + (get_sm()->states[state].can_xlock == XCL && client >= 0 && get_xlock_by_client() == client); + } + + // rdlock + bool is_rdlocked() const { return num_rdlock > 0; } + int get_rdlock() { + if (!num_rdlock) + parent->get(MDSCacheObject::PIN_LOCK); + return ++num_rdlock; + } + int put_rdlock() { + ceph_assert(num_rdlock>0); + --num_rdlock; + if (num_rdlock == 0) + 
parent->put(MDSCacheObject::PIN_LOCK); + return num_rdlock; + } + int get_num_rdlocks() const { + return num_rdlock; + } + + // wrlock + void get_wrlock(bool force=false) { + //assert(can_wrlock() || force); + if (more()->num_wrlock == 0) + parent->get(MDSCacheObject::PIN_LOCK); + ++more()->num_wrlock; + } + void put_wrlock() { + --more()->num_wrlock; + if (more()->num_wrlock == 0) { + parent->put(MDSCacheObject::PIN_LOCK); + try_clear_more(); + } + } + bool is_wrlocked() const { + return have_more() && more()->num_wrlock > 0; + } + int get_num_wrlocks() const { + return have_more() ? more()->num_wrlock : 0; + } + + // xlock + void get_xlock(MutationRef who, client_t client) { + ceph_assert(get_xlock_by() == MutationRef()); + ceph_assert(state == LOCK_XLOCK || is_locallock() || + state == LOCK_LOCK /* if we are a slave */); + parent->get(MDSCacheObject::PIN_LOCK); + more()->num_xlock++; + more()->xlock_by = who; + more()->xlock_by_client = client; + } + void set_xlock_done() { + ceph_assert(more()->xlock_by); + ceph_assert(state == LOCK_XLOCK || is_locallock() || + state == LOCK_LOCK /* if we are a slave */); + if (!is_locallock()) + state = LOCK_XLOCKDONE; + more()->xlock_by.reset(); + } + void put_xlock() { + ceph_assert(state == LOCK_XLOCK || state == LOCK_XLOCKDONE || + state == LOCK_XLOCKSNAP || state == LOCK_LOCK_XLOCK || + state == LOCK_LOCK || /* if we are a master of a slave */ + is_locallock()); + --more()->num_xlock; + parent->put(MDSCacheObject::PIN_LOCK); + if (more()->num_xlock == 0) { + more()->xlock_by.reset(); + more()->xlock_by_client = -1; + try_clear_more(); + } + } + bool is_xlocked() const { + return have_more() && more()->num_xlock > 0; + } + int get_num_xlocks() const { + return have_more() ? more()->num_xlock : 0; + } + client_t get_xlock_by_client() const { + return have_more() ? more()->xlock_by_client : -1; + } + bool is_xlocked_by_client(client_t c) const { + return have_more() ? 
more()->xlock_by_client == c : false; + } + MutationRef get_xlock_by() const { + return have_more() ? more()->xlock_by : MutationRef(); + } + + // lease + bool is_leased() const { + return state_flags & LEASED; + } + void get_client_lease() { + ceph_assert(!is_leased()); + state_flags |= LEASED; + } + void put_client_lease() { + ceph_assert(is_leased()); + state_flags &= ~LEASED; + } + + bool needs_recover() const { + return state_flags & NEED_RECOVER; + } + void mark_need_recover() { + state_flags |= NEED_RECOVER; + } + void clear_need_recover() { + state_flags &= ~NEED_RECOVER; + } + + // encode/decode + void encode(bufferlist& bl) const { + ENCODE_START(2, 2, bl); + encode(state, bl); + if (have_more()) + encode(more()->gather_set, bl); + else + encode(empty_gather_set, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator& p) { + DECODE_START(2, p); + decode(state, p); + set<__s32> g; + decode(g, p); + if (!g.empty()) + more()->gather_set.swap(g); + DECODE_FINISH(p); + } + void encode_state_for_replica(bufferlist& bl) const { + __s16 s = get_replica_state(); + using ceph::encode; + encode(s, bl); + } + void decode_state(bufferlist::const_iterator& p, bool is_new=true) { + using ceph::decode; + __s16 s; + decode(s, p); + if (is_new) + state = s; + } + void decode_state_rejoin(bufferlist::const_iterator& p, MDSContext::vec& waiters, bool survivor) { + __s16 s; + using ceph::decode; + decode(s, p); + set_state_rejoin(s, waiters, survivor); + } + + + // caps + bool is_loner_mode() const { + return get_sm()->states[state].loner; + } + int gcaps_allowed_ever() const { + return parent->is_auth() ? 
get_sm()->allowed_ever_auth : get_sm()->allowed_ever_replica; + } + int gcaps_allowed(int who, int s=-1) const { + if (s < 0) s = state; + if (parent->is_auth()) { + if (get_xlock_by_client() >= 0 && who == CAP_XLOCKER) + return get_sm()->states[s].xlocker_caps | get_sm()->states[s].caps; // xlocker always gets more + else if (is_loner_mode() && who == CAP_ANY) + return get_sm()->states[s].caps; + else + return get_sm()->states[s].loner_caps | get_sm()->states[s].caps; // loner always gets more + } else + return get_sm()->states[s].replica_caps; + } + int gcaps_careful() const { + if (get_num_wrlocks()) + return get_sm()->careful; + return 0; + } + + + int gcaps_xlocker_mask(client_t client) const { + if (client == get_xlock_by_client()) + return type->type == CEPH_LOCK_IFILE ? 0xf : (CEPH_CAP_GSHARED|CEPH_CAP_GEXCL); + return 0; + } + + // simplelock specifics + int get_replica_state() const { + return get_sm()->states[state].replica_state; + } + void export_twiddle() { + clear_gather(); + state = get_replica_state(); + } + + bool remove_replica(int from) { + if (is_gathering(from)) { + remove_gather(from); + if (!is_gathering()) + return true; + } + return false; + } + bool do_import(int from, int to) { + if (!is_stable()) { + remove_gather(from); + remove_gather(to); + if (!is_gathering()) + return true; + } + if (!is_stable() && !is_gathering()) + return true; + return false; + } + + void _print(ostream& out) const { + out << get_lock_type_name(get_type()) << " "; + out << get_state_name(get_state()); + if (!get_gather_set().empty()) + out << " g=" << get_gather_set(); + if (is_leased()) + out << " l"; + if (is_rdlocked()) + out << " r=" << get_num_rdlocks(); + if (is_wrlocked()) + out << " w=" << get_num_wrlocks(); + if (is_xlocked()) { + out << " x=" << get_num_xlocks(); + if (get_xlock_by()) + out << " by " << get_xlock_by(); + } + /*if (is_stable()) + out << " stable"; + else + out << " unstable"; + */ + } + + /** + * Write bare values (caller must be in an 
object section) + * to formatter, or nothing if is_sync_and_unlocked. + */ + void dump(Formatter *f) const; + + virtual void print(ostream& out) const { + out << "("; + _print(out); + out << ")"; + } +}; +WRITE_CLASS_ENCODER(SimpleLock) + +inline ostream& operator<<(ostream& out, const SimpleLock& l) +{ + l.print(out); + return out; +} + + +#endif diff --git a/src/mds/SnapClient.cc b/src/mds/SnapClient.cc new file mode 100644 index 00000000..fa1f56b4 --- /dev/null +++ b/src/mds/SnapClient.cc @@ -0,0 +1,316 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "MDSMap.h" +#include "MDSRank.h" +#include "msg/Messenger.h" +#include "messages/MMDSTableRequest.h" +#include "SnapClient.h" + +#include "common/config.h" +#include "include/ceph_assert.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mds +#undef dout_prefix +#define dout_prefix *_dout << "mds." 
<< mds->get_nodeid() << ".snapclient "

// Re-issue the outstanding snaptable query, for the highest version any
// waiter wants (at least 1 if a sync is pending).  If a sync was in
// flight, re-arm sync_reqid so the eventual reply flips `synced` on.
// NOTE(review): presumably invoked when the table server becomes
// reachable again -- caller not visible in this file.
void SnapClient::resend_queries()
{
  if (!waiting_for_version.empty() || (!synced && sync_reqid > 0)) {
    version_t want;
    if (!waiting_for_version.empty())
      want = std::max<version_t>(cached_version, waiting_for_version.rbegin()->first);
    else
      want = std::max<version_t>(cached_version, 1);
    refresh(want, NULL);
    if (!synced)
      sync_reqid = last_reqid;
  }
}

// Handle a reply to our 'F' (full) query.  The payload is either
// 'U' (our cached_version is already current) or 'F' (full dump of the
// snap table plus pending update/destroy sets and high-water snapids).
void SnapClient::handle_query_result(const MMDSTableRequest::const_ref &m)
{
  dout(10) << __func__ << " " << *m << dendl;

  char type;
  using ceph::decode;
  auto p = m->bl.cbegin();
  decode(type, p);

  switch (type) {
  case 'U': // uptodate
    ceph_assert(cached_version == m->get_tid());
    break;
  case 'F': // full
    {
      decode(cached_snaps, p);
      decode(cached_pending_update, p);
      decode(cached_pending_destroy, p);

      snapid_t last_created, last_destroyed;
      decode(last_created, p);
      decode(last_destroyed, p);

      // high-water marks only ever move forward
      if (last_created > cached_last_created)
	cached_last_created = last_created;
      if (last_destroyed > cached_last_destroyed)
	cached_last_destroyed = last_destroyed;

      cached_version = m->get_tid();
    }
    break;
  default:
    ceph_abort();
  };

  // Prune committing tids that the refreshed cache has caught up to:
  // a tid <= cached_version that is still listed as pending keeps its
  // high-water contribution; one no longer pending has been committed
  // server-side and can be forgotten.
  if (!committing_tids.empty()) {
    for (auto p = committing_tids.begin();
	 p != committing_tids.end() && *p <= cached_version; ) {
      if (cached_pending_update.count(*p)) {
	if (cached_pending_update[*p].snapid > cached_last_created)
	  cached_last_created = cached_pending_update[*p].snapid;
	++p;
      } else if (cached_pending_destroy.count(*p)) {
	if (cached_pending_destroy[*p].second > cached_last_destroyed)
	  cached_last_destroyed = cached_pending_destroy[*p].second;
	++p;
      } else {
	// pending update/destroy have been committed.
	committing_tids.erase(p++);  // post-increment keeps the iterator valid
      }
    }
  }

  // A direct reply whose reqid covers sync_reqid completes the sync.
  if (m->op == TABLESERVER_OP_QUERY_REPLY && m->reqid >= sync_reqid)
    synced = true;

  // Once synced, wake every waiter whose wanted version is now cached.
  if (synced && !waiting_for_version.empty()) {
    MDSContext::vec finished;
    while (!waiting_for_version.empty()) {
      auto it = waiting_for_version.begin();
      if (it->first > cached_version)
	break;
      auto& v = it->second;
      finished.insert(finished.end(), v.begin(), v.end());
      waiting_for_version.erase(it);
    }
    if (!finished.empty())
      mds->queue_waiters(finished);
  }
}

// Server-pushed notification: same payload as a query result, but we
// must acknowledge it so the server can track propagation.
void SnapClient::handle_notify_prep(const MMDSTableRequest::const_ref &m)
{
  dout(10) << __func__ << " " << *m << dendl;
  handle_query_result(m);
  auto ack = MMDSTableRequest::create(table, TABLESERVER_OP_NOTIFY_ACK, 0, m->get_tid());
  mds->send_message(ack, m->get_connection());
}

// Record that `tid`'s pending update/destroy is now being committed,
// and fold its snapid into the cached high-water marks.
void SnapClient::notify_commit(version_t tid)
{
  dout(10) << __func__ << " tid " << tid << dendl;

  ceph_assert(cached_version == 0 || cached_version >= tid);
  if (cached_version == 0) {
    // cache not loaded yet; remember the tid, classify it on refresh
    committing_tids.insert(tid);
  } else if (cached_pending_update.count(tid)) {
    committing_tids.insert(tid);
    if (cached_pending_update[tid].snapid > cached_last_created)
      cached_last_created = cached_pending_update[tid].snapid;
  } else if (cached_pending_destroy.count(tid)) {
    committing_tids.insert(tid);
    if (cached_pending_destroy[tid].second > cached_last_destroyed)
      cached_last_destroyed = cached_pending_destroy[tid].second;
  } else if (cached_version > tid) {
    // no need to record the tid if it has already been committed.
  } else {
    ceph_abort();
  }
}

// Ask the table server for the snap table at version >= `want`.
// `onfinish` (if any) is queued until that version is cached; if the
// server isn't ready yet the waiter stays queued and resend_queries()
// will issue the query later.
void SnapClient::refresh(version_t want, MDSContext *onfinish)
{
  dout(10) << __func__ << " want " << want << dendl;

  ceph_assert(want >= cached_version);
  if (onfinish)
    waiting_for_version[want].push_back(onfinish);

  if (!server_ready)
    return;

  mds_rank_t ts = mds->mdsmap->get_tableserver();
  auto req = MMDSTableRequest::create(table, TABLESERVER_OP_QUERY, ++last_reqid, 0);
  using ceph::encode;
  char op = 'F';  // request a full dump; server may answer 'U' if current
  encode(op, req->bl);
  encode(cached_version, req->bl);
  mds->send_message_mds(req, ts);
}

// Force a round trip to the server; `onfinish` fires once the reply to
// (at least) this request has been processed.
void SnapClient::sync(MDSContext *onfinish)
{
  dout(10) << __func__ << dendl;

  refresh(std::max<version_t>(cached_version, 1), onfinish);
  synced = false;
  if (server_ready)
    sync_reqid = last_reqid;            // reqid of the query just sent
  else
    // query not sent yet: arm for the next reqid refresh() will use
    // (handle ~0ULL wrap-around of last_reqid)
    sync_reqid = (last_reqid == ~0ULL) ? 1 : last_reqid + 1;
}

// Collect the effective set of snapids: everything in the cached table,
// plus committing creations, minus committing destroys.
void SnapClient::get_snaps(set<snapid_t>& result) const
{
  ceph_assert(cached_version > 0);
  for (auto& p : cached_snaps)
    result.insert(p.first);

  for (auto tid : committing_tids) {
    auto q = cached_pending_update.find(tid);
    if (q != cached_pending_update.end())
      result.insert(q->second.snapid);

    auto r = cached_pending_destroy.find(tid);
    if (r != cached_pending_destroy.end())
      result.erase(r->second.first);
  }
}

// Restrict `snaps` to ids that are live per the cached table and
// committing updates/destroys (same rules as get_snaps()).
set<snapid_t> SnapClient::filter(const set<snapid_t>& snaps) const
{
  ceph_assert(cached_version > 0);
  if (snaps.empty())
    return snaps;

  set<snapid_t> result;

  for (auto p : snaps) {
    if (cached_snaps.count(p))
      result.insert(p);
  }

  for (auto tid : committing_tids) {
    auto q = cached_pending_update.find(tid);
    if (q != cached_pending_update.end()) {
      if (snaps.count(q->second.snapid))
	result.insert(q->second.snapid);
    }

    auto r = cached_pending_destroy.find(tid);
    if (r != cached_pending_destroy.end())
      result.erase(r->second.first);
  }

  dout(10) << __func__ << " " << snaps << " -> " << result << dendl;
  return result;
}

const SnapInfo*
SnapClient::get_snap_info(snapid_t snapid) const +{ + ceph_assert(cached_version > 0); + + const SnapInfo* result = NULL; + auto it = cached_snaps.find(snapid); + if (it != cached_snaps.end()) + result = &it->second; + + for (auto tid : committing_tids) { + auto q = cached_pending_update.find(tid); + if (q != cached_pending_update.end() && q->second.snapid == snapid) { + result = &q->second; + break; + } + + auto r = cached_pending_destroy.find(tid); + if (r != cached_pending_destroy.end() && r->second.first == snapid) { + result = NULL; + break; + } + } + + dout(10) << __func__ << " snapid " << snapid << " -> " << result << dendl; + return result; +} + +void SnapClient::get_snap_infos(map<snapid_t, const SnapInfo*>& infomap, + const set<snapid_t>& snaps) const +{ + ceph_assert(cached_version > 0); + + if (snaps.empty()) + return; + + map<snapid_t, const SnapInfo*> result; + for (auto p : snaps) { + auto it = cached_snaps.find(p); + if (it != cached_snaps.end()) + result[p] = &it->second; + } + + for (auto tid : committing_tids) { + auto q = cached_pending_update.find(tid); + if (q != cached_pending_update.end()) { + if (snaps.count(q->second.snapid)) + result[q->second.snapid] = &q->second; + } + + auto r = cached_pending_destroy.find(tid); + if (r != cached_pending_destroy.end()) + result.erase(r->second.first); + } + + infomap.insert(result.begin(), result.end()); +} + +int SnapClient::dump_cache(Formatter *f) const +{ + if (!is_synced()) { + dout(5) << "dump_cache: not synced" << dendl; + return -EINVAL; + } + + map<snapid_t, const SnapInfo*> snaps; + for (auto& p : cached_snaps) + snaps[p.first] = &p.second; + + for (auto tid : committing_tids) { + auto q = cached_pending_update.find(tid); + if (q != cached_pending_update.end()) + snaps[q->second.snapid] = &q->second; + + auto r = cached_pending_destroy.find(tid); + if (r != cached_pending_destroy.end()) + snaps.erase(r->second.first); + } + + f->open_object_section("snapclient"); + + 
f->dump_int("last_created", get_last_created()); + f->dump_int("last_destroyed", get_last_destroyed()); + + f->open_array_section("snaps"); + for (auto p : snaps) { + f->open_object_section("snap"); + p.second->dump(f); + f->close_section(); + } + f->close_section(); + + f->close_section(); + + return 0; +} diff --git a/src/mds/SnapClient.h b/src/mds/SnapClient.h new file mode 100644 index 00000000..c0d595ba --- /dev/null +++ b/src/mds/SnapClient.h @@ -0,0 +1,114 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_SNAPCLIENT_H +#define CEPH_SNAPCLIENT_H + +#include <string_view> + +#include "MDSTableClient.h" +#include "snap.h" +#include "MDSContext.h" + +class MDSRank; +class LogSegment; + +class SnapClient : public MDSTableClient { + version_t cached_version; + snapid_t cached_last_created, cached_last_destroyed; + map<snapid_t, SnapInfo> cached_snaps; + map<version_t, SnapInfo> cached_pending_update; + map<version_t, pair<snapid_t,snapid_t> > cached_pending_destroy; + + set<version_t> committing_tids; + + map<version_t, MDSContext::vec > waiting_for_version; + + uint64_t sync_reqid; + bool synced; + +public: + explicit SnapClient(MDSRank *m) : + MDSTableClient(m, TABLE_SNAP), + cached_version(0), cached_last_created(0), cached_last_destroyed(0), + sync_reqid(0), synced(false) {} + + void resend_queries() override; + void handle_query_result(const MMDSTableRequest::const_ref &m) override; + void handle_notify_prep(const MMDSTableRequest::const_ref &m) override; + void notify_commit(version_t tid) override; + + void prepare_create(inodeno_t dirino, std::string_view 
name, utime_t stamp, + version_t *pstid, bufferlist *pbl, MDSContext *onfinish) { + bufferlist bl; + __u32 op = TABLE_OP_CREATE; + encode(op, bl); + encode(dirino, bl); + encode(name, bl); + encode(stamp, bl); + _prepare(bl, pstid, pbl, onfinish); + } + + void prepare_create_realm(inodeno_t ino, version_t *pstid, bufferlist *pbl, MDSContext *onfinish) { + bufferlist bl; + __u32 op = TABLE_OP_CREATE; + encode(op, bl); + encode(ino, bl); + _prepare(bl, pstid, pbl, onfinish); + } + + void prepare_destroy(inodeno_t ino, snapid_t snapid, version_t *pstid, bufferlist *pbl, MDSContext *onfinish) { + bufferlist bl; + __u32 op = TABLE_OP_DESTROY; + encode(op, bl); + encode(ino, bl); + encode(snapid, bl); + _prepare(bl, pstid, pbl, onfinish); + } + + void prepare_update(inodeno_t ino, snapid_t snapid, std::string_view name, utime_t stamp, + version_t *pstid, MDSContext *onfinish) { + bufferlist bl; + __u32 op = TABLE_OP_UPDATE; + encode(op, bl); + encode(ino, bl); + encode(snapid, bl); + encode(name, bl); + encode(stamp, bl); + _prepare(bl, pstid, NULL, onfinish); + } + + version_t get_cached_version() const { return cached_version; } + void refresh(version_t want, MDSContext *onfinish); + + void sync(MDSContext *onfinish); + + bool is_synced() const { return synced; } + void wait_for_sync(MDSContext *c) { + ceph_assert(!synced); + waiting_for_version[std::max<version_t>(cached_version, 1)].push_back(c); + } + + snapid_t get_last_created() const { return cached_last_created; } + snapid_t get_last_destroyed() const { return cached_last_destroyed; } + + void get_snaps(set<snapid_t>& snaps) const; + set<snapid_t> filter(const set<snapid_t>& snaps) const; + const SnapInfo* get_snap_info(snapid_t snapid) const; + void get_snap_infos(map<snapid_t, const SnapInfo*>& infomap, const set<snapid_t>& snaps) const; + + int dump_cache(Formatter *f) const; +}; + +#endif diff --git a/src/mds/SnapRealm.cc b/src/mds/SnapRealm.cc new file mode 100644 index 00000000..4ef775dc --- /dev/null +++ 
b/src/mds/SnapRealm.cc @@ -0,0 +1,726 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "SnapRealm.h" +#include "MDCache.h" +#include "MDSRank.h" +#include "SnapClient.h" + +#include <string_view> + + +/* + * SnapRealm + */ + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mds +#undef dout_prefix +#define dout_prefix _prefix(_dout, mdcache->mds->get_nodeid(), inode, srnode.seq, this) +static ostream& _prefix(std::ostream *_dout, int whoami, const CInode *inode, + uint64_t seq, const SnapRealm *realm) { + return *_dout << " mds." << whoami + << ".cache.snaprealm(" << inode->ino() + << " seq " << seq << " " << realm << ") "; +} + +ostream& operator<<(ostream& out, const SnapRealm& realm) +{ + out << "snaprealm(" << realm.inode->ino() + << " seq " << realm.srnode.seq + << " lc " << realm.srnode.last_created + << " cr " << realm.srnode.created; + if (realm.srnode.created != realm.srnode.current_parent_since) + out << " cps " << realm.srnode.current_parent_since; + out << " snaps=" << realm.srnode.snaps; + out << " past_parent_snaps=" << realm.srnode.past_parent_snaps; + + if (realm.srnode.past_parents.size()) { + out << " past_parents=("; + for (map<snapid_t, snaplink_t>::const_iterator p = realm.srnode.past_parents.begin(); + p != realm.srnode.past_parents.end(); + ++p) { + if (p != realm.srnode.past_parents.begin()) out << ","; + out << p->second.first << "-" << p->first + << "=" << p->second.ino; + } + out << ")"; + } + + if (realm.srnode.is_parent_global()) + out << " global "; + out << " " << &realm << ")"; + return out; +} + 
+SnapRealm::SnapRealm(MDCache *c, CInode *in) : + mdcache(c), inode(in), parent(nullptr), + num_open_past_parents(0), inodes_with_caps(0) +{ + global = (inode->ino() == MDS_INO_GLOBAL_SNAPREALM); +} + +void SnapRealm::add_open_past_parent(SnapRealm *parent, snapid_t last) +{ + auto p = open_past_parents.find(parent->inode->ino()); + if (p != open_past_parents.end()) { + ceph_assert(p->second.second.count(last) == 0); + p->second.second.insert(last); + } else { + open_past_parents[parent->inode->ino()].first = parent; + open_past_parents[parent->inode->ino()].second.insert(last); + parent->open_past_children.insert(this); + parent->inode->get(CInode::PIN_PASTSNAPPARENT); + } + ++num_open_past_parents; +} + +void SnapRealm::remove_open_past_parent(inodeno_t ino, snapid_t last) +{ + auto p = open_past_parents.find(ino); + ceph_assert(p != open_past_parents.end()); + auto q = p->second.second.find(last); + ceph_assert(q != p->second.second.end()); + p->second.second.erase(q); + --num_open_past_parents; + if (p->second.second.empty()) { + SnapRealm *parent = p->second.first; + open_past_parents.erase(p); + parent->open_past_children.erase(this); + parent->inode->put(CInode::PIN_PASTSNAPPARENT); + } +} + +struct C_SR_RetryOpenParents : public MDSContext { + SnapRealm *sr; + snapid_t first, last, parent_last; + inodeno_t parent; + MDSContext* fin; + C_SR_RetryOpenParents(SnapRealm *s, snapid_t f, snapid_t l, snapid_t pl, + inodeno_t p, MDSContext *c) : + sr(s), first(f), last(l), parent_last(pl), parent(p), fin(c) { + sr->inode->get(CInode::PIN_OPENINGSNAPPARENTS); + } + MDSRank *get_mds() override { return sr->mdcache->mds; } + void finish(int r) override { + if (r < 0) + sr->_remove_missing_parent(parent_last, parent, r); + if (sr->_open_parents(fin, first, last)) { + if (fin) + fin->complete(0); + } + sr->inode->put(CInode::PIN_OPENINGSNAPPARENTS); + } +}; + +void SnapRealm::_remove_missing_parent(snapid_t snapid, inodeno_t parent, int err) +{ + map<snapid_t, 
snaplink_t>::iterator p = srnode.past_parents.find(snapid); + if (p != srnode.past_parents.end()) { + dout(10) << __func__ << " " << parent << " [" << p->second.first << "," + << p->first << "] errno " << err << dendl; + srnode.past_parents.erase(p); + past_parents_dirty = true; + } else { + dout(10) << __func__ << " " << parent << " not found" << dendl; + } +} + +bool SnapRealm::_open_parents(MDSContext *finish, snapid_t first, snapid_t last) +{ + dout(10) << "open_parents [" << first << "," << last << "]" << dendl; + if (open) + return true; + + // make sure my current parents' parents are open... + if (parent) { + dout(10) << " current parent [" << srnode.current_parent_since << ",head] is " << *parent + << " on " << *parent->inode << dendl; + if (last >= srnode.current_parent_since && + !parent->_open_parents(finish, std::max(first, srnode.current_parent_since), last)) + return false; + } + + if (!srnode.past_parent_snaps.empty()) + ceph_assert(mdcache->mds->snapclient->get_cached_version() > 0); + + if (!srnode.past_parents.empty() && + mdcache->mds->allows_multimds_snaps()) { + dout(10) << " skip non-empty past_parents since multimds_snaps is allowed" << dendl; + open = true; + return true; + } + + // and my past parents too! 
+ ceph_assert(srnode.past_parents.size() >= num_open_past_parents); + if (srnode.past_parents.size() > num_open_past_parents) { + for (map<snapid_t, snaplink_t>::iterator p = srnode.past_parents.begin(); + p != srnode.past_parents.end(); ) { + dout(10) << " past_parent [" << p->second.first << "," << p->first << "] is " + << p->second.ino << dendl; + CInode *parent = mdcache->get_inode(p->second.ino); + if (!parent) { + C_SR_RetryOpenParents *fin = new C_SR_RetryOpenParents(this, first, last, p->first, + p->second.ino, finish); + mdcache->open_ino(p->second.ino, mdcache->mds->mdsmap->get_metadata_pool(), fin); + return false; + } + if (parent->state_test(CInode::STATE_PURGING)) { + dout(10) << " skip purging past_parent " << *parent << dendl; + srnode.past_parents.erase(p++); + past_parents_dirty = true; + continue; + } + ceph_assert(parent->snaprealm); // hmm! + if (!parent->snaprealm->_open_parents(finish, p->second.first, p->first)) + return false; + auto q = open_past_parents.find(p->second.ino); + if (q == open_past_parents.end() || + q->second.second.count(p->first) == 0) { + add_open_past_parent(parent->snaprealm, p->first); + } + ++p; + } + } + + open = true; + return true; +} + +bool SnapRealm::open_parents(MDSContext *retryorfinish) { + if (!_open_parents(retryorfinish)) + return false; + delete retryorfinish; + return true; +} + +bool SnapRealm::have_past_parents_open(snapid_t first, snapid_t last) const +{ + dout(10) << "have_past_parents_open [" << first << "," << last << "]" << dendl; + if (open) + return true; + + if (!srnode.past_parent_snaps.empty()) + ceph_assert(mdcache->mds->snapclient->get_cached_version() > 0); + + if (!srnode.past_parents.empty() && + mdcache->mds->allows_multimds_snaps()) { + dout(10) << " skip non-empty past_parents since multimds_snaps is allowed" << dendl; + open = true; + return true; + } + + for (auto p = srnode.past_parents.lower_bound(first); + p != srnode.past_parents.end(); + ++p) { + if (p->second.first > last) + 
break; + dout(10) << " past parent [" << p->second.first << "," << p->first << "] was " + << p->second.ino << dendl; + auto q = open_past_parents.find(p->second.ino); + if (q == open_past_parents.end()) { + dout(10) << " past parent " << p->second.ino << " is not open" << dendl; + return false; + } + SnapRealm *parent_realm = q->second.first; + if (!parent_realm->have_past_parents_open(std::max(first, p->second.first), + std::min(last, p->first))) + return false; + } + + open = true; + return true; +} + +void SnapRealm::close_parents() +{ + for (auto p = open_past_parents.begin(); p != open_past_parents.end(); ++p) { + num_open_past_parents -= p->second.second.size(); + p->second.first->inode->put(CInode::PIN_PASTSNAPPARENT); + p->second.first->open_past_children.erase(this); + } + open_past_parents.clear(); +} + + +/* + * get list of snaps for this realm. we must include parents' snaps + * for the intervals during which they were our parent. + */ +void SnapRealm::build_snap_set() const +{ + dout(10) << "build_snap_set on " << *this << dendl; + + cached_snaps.clear(); + + if (global) { + mdcache->mds->snapclient->get_snaps(cached_snaps); + return; + } + + // include my snaps + for (const auto& p : srnode.snaps) + cached_snaps.insert(p.first); + + if (!srnode.past_parent_snaps.empty()) { + set<snapid_t> snaps = mdcache->mds->snapclient->filter(srnode.past_parent_snaps); + if (!snaps.empty()) { + snapid_t last = *snaps.rbegin(); + cached_seq = std::max(cached_seq, last); + cached_last_created = std::max(cached_last_created, last); + } + cached_snaps.insert(snaps.begin(), snaps.end()); + } else { + // include snaps for parents + for (const auto& p : srnode.past_parents) { + const CInode *oldparent = mdcache->get_inode(p.second.ino); + ceph_assert(oldparent); // call open_parents first! 
+ ceph_assert(oldparent->snaprealm); + + const set<snapid_t>& snaps = oldparent->snaprealm->get_snaps(); + snapid_t last = 0; + for (auto q = snaps.lower_bound(p.second.first); + q != snaps.end() && *q <= p.first; + q++) { + cached_snaps.insert(*q); + last = *q; + } + cached_seq = std::max(cached_seq, last); + cached_last_created = std::max(cached_last_created, last); + } + } + + snapid_t parent_seq = parent ? parent->get_newest_seq() : snapid_t(0); + if (parent_seq >= srnode.current_parent_since) { + auto& snaps = parent->get_snaps(); + auto p = snaps.lower_bound(srnode.current_parent_since); + cached_snaps.insert(p, snaps.end()); + cached_seq = std::max(cached_seq, parent_seq); + cached_last_created = std::max(cached_last_created, parent->get_last_created()); + } +} + +void SnapRealm::check_cache() const +{ + ceph_assert(have_past_parents_open()); + snapid_t seq; + snapid_t last_created; + snapid_t last_destroyed = mdcache->mds->snapclient->get_last_destroyed(); + if (global || srnode.is_parent_global()) { + last_created = mdcache->mds->snapclient->get_last_created(); + seq = std::max(last_created, last_destroyed); + } else { + last_created = srnode.last_created; + seq = srnode.seq; + } + if (cached_seq >= seq && + cached_last_destroyed == last_destroyed) + return; + + cached_snap_context.clear(); + + cached_seq = seq; + cached_last_created = last_created; + cached_last_destroyed = last_destroyed; + + cached_subvolume_ino = 0; + if (parent) + cached_subvolume_ino = parent->get_subvolume_ino(); + if (!cached_subvolume_ino && srnode.is_subvolume()) + cached_subvolume_ino = inode->ino(); + + build_snap_set(); + + build_snap_trace(); + + dout(10) << "check_cache rebuilt " << cached_snaps + << " seq " << seq + << " cached_seq " << cached_seq + << " cached_last_created " << cached_last_created + << " cached_last_destroyed " << cached_last_destroyed + << ")" << dendl; +} + +const set<snapid_t>& SnapRealm::get_snaps() const +{ + check_cache(); + dout(10) << "get_snaps " 
<< cached_snaps + << " (seq " << srnode.seq << " cached_seq " << cached_seq << ")" + << dendl; + return cached_snaps; +} + +/* + * build vector in reverse sorted order + */ +const SnapContext& SnapRealm::get_snap_context() const +{ + check_cache(); + + if (!cached_snap_context.seq) { + cached_snap_context.seq = cached_seq; + cached_snap_context.snaps.resize(cached_snaps.size()); + unsigned i = 0; + for (set<snapid_t>::reverse_iterator p = cached_snaps.rbegin(); + p != cached_snaps.rend(); + ++p) + cached_snap_context.snaps[i++] = *p; + } + + return cached_snap_context; +} + +void SnapRealm::get_snap_info(map<snapid_t, const SnapInfo*>& infomap, snapid_t first, snapid_t last) +{ + const set<snapid_t>& snaps = get_snaps(); + dout(10) << "get_snap_info snaps " << snaps << dendl; + + // include my snaps within interval [first,last] + for (auto p = srnode.snaps.lower_bound(first); // first element >= first + p != srnode.snaps.end() && p->first <= last; + ++p) + infomap[p->first] = &p->second; + + if (!srnode.past_parent_snaps.empty()) { + set<snapid_t> snaps; + for (auto p = srnode.past_parent_snaps.lower_bound(first); // first element >= first + p != srnode.past_parent_snaps.end() && *p <= last; + ++p) { + snaps.insert(*p); + } + + map<snapid_t, const SnapInfo*> _infomap; + mdcache->mds->snapclient->get_snap_infos(_infomap, snaps); + infomap.insert(_infomap.begin(), _infomap.end()); + } else { + // include snaps for parents during intervals that intersect [first,last] + for (map<snapid_t, snaplink_t>::iterator p = srnode.past_parents.lower_bound(first); + p != srnode.past_parents.end() && p->first >= first && p->second.first <= last; + ++p) { + CInode *oldparent = mdcache->get_inode(p->second.ino); + ceph_assert(oldparent); // call open_parents first! 
+ ceph_assert(oldparent->snaprealm); + oldparent->snaprealm->get_snap_info(infomap, + std::max(first, p->second.first), + std::min(last, p->first)); + } + } + + if (srnode.current_parent_since <= last && parent) + parent->get_snap_info(infomap, std::max(first, srnode.current_parent_since), last); +} + +std::string_view SnapRealm::get_snapname(snapid_t snapid, inodeno_t atino) +{ + auto srnode_snaps_entry = srnode.snaps.find(snapid); + if (srnode_snaps_entry != srnode.snaps.end()) { + if (atino == inode->ino()) + return srnode_snaps_entry->second.name; + else + return srnode_snaps_entry->second.get_long_name(); + } + + if (!srnode.past_parent_snaps.empty()) { + if (srnode.past_parent_snaps.count(snapid)) { + const SnapInfo *sinfo = mdcache->mds->snapclient->get_snap_info(snapid); + if (sinfo) { + if (atino == sinfo->ino) + return sinfo->name; + else + return sinfo->get_long_name(); + } + } + } else { + map<snapid_t,snaplink_t>::iterator p = srnode.past_parents.lower_bound(snapid); + if (p != srnode.past_parents.end() && p->second.first <= snapid) { + CInode *oldparent = mdcache->get_inode(p->second.ino); + ceph_assert(oldparent); // call open_parents first! 
+ ceph_assert(oldparent->snaprealm); + return oldparent->snaprealm->get_snapname(snapid, atino); + } + } + + ceph_assert(srnode.current_parent_since <= snapid); + ceph_assert(parent); + return parent->get_snapname(snapid, atino); +} + +snapid_t SnapRealm::resolve_snapname(std::string_view n, inodeno_t atino, snapid_t first, snapid_t last) +{ + // first try me + dout(10) << "resolve_snapname '" << n << "' in [" << first << "," << last << "]" << dendl; + + bool actual = (atino == inode->ino()); + string pname; + inodeno_t pino; + if (n.length() && n[0] == '_') { + size_t next_ = n.find_last_of('_'); + if (next_ > 1 && next_ + 1 < n.length()) { + pname = n.substr(1, next_ - 1); + pino = atoll(n.data() + next_ + 1); + dout(10) << " " << n << " parses to name '" << pname << "' dirino " << pino << dendl; + } + } + + for (auto p = srnode.snaps.lower_bound(first); // first element >= first + p != srnode.snaps.end() && p->first <= last; + ++p) { + dout(15) << " ? " << p->second << dendl; + //if (num && p->second.snapid == num) + //return p->first; + if (actual && p->second.name == n) + return p->first; + if (!actual && p->second.name == pname && p->second.ino == pino) + return p->first; + } + + if (!srnode.past_parent_snaps.empty()) { + set<snapid_t> snaps; + for (auto p = srnode.past_parent_snaps.lower_bound(first); // first element >= first + p != srnode.past_parent_snaps.end() && *p <= last; + ++p) + snaps.insert(*p); + + map<snapid_t, const SnapInfo*> _infomap; + mdcache->mds->snapclient->get_snap_infos(_infomap, snaps); + + for (auto& it : _infomap) { + dout(15) << " ? 
" << *it.second << dendl; + actual = (it.second->ino == atino); + if (actual && it.second->name == n) + return it.first; + if (!actual && it.second->name == pname && it.second->ino == pino) + return it.first; + } + } else { + // include snaps for parents during intervals that intersect [first,last] + for (map<snapid_t, snaplink_t>::iterator p = srnode.past_parents.lower_bound(first); + p != srnode.past_parents.end() && p->first >= first && p->second.first <= last; + ++p) { + CInode *oldparent = mdcache->get_inode(p->second.ino); + ceph_assert(oldparent); // call open_parents first! + ceph_assert(oldparent->snaprealm); + snapid_t r = oldparent->snaprealm->resolve_snapname(n, atino, + std::max(first, p->second.first), + std::min(last, p->first)); + if (r) + return r; + } + } + + if (parent && srnode.current_parent_since <= last) + return parent->resolve_snapname(n, atino, std::max(first, srnode.current_parent_since), last); + return 0; +} + + +void SnapRealm::adjust_parent() +{ + SnapRealm *newparent; + if (srnode.is_parent_global()) { + newparent = mdcache->get_global_snaprealm(); + } else { + CDentry *pdn = inode->get_parent_dn(); + newparent = pdn ? pdn->get_dir()->get_inode()->find_snaprealm() : NULL; + } + if (newparent != parent) { + dout(10) << "adjust_parent " << parent << " -> " << newparent << dendl; + if (parent) + parent->open_children.erase(this); + parent = newparent; + if (parent) + parent->open_children.insert(this); + + invalidate_cached_snaps(); + } +} + +void SnapRealm::split_at(SnapRealm *child) +{ + dout(10) << "split_at " << *child + << " on " << *child->inode << dendl; + + if (inode->is_mdsdir() || !child->inode->is_dir()) { + // it's not a dir. + if (child->inode->containing_realm) { + // - no open children. + // - only need to move this child's inode's caps. + child->inode->move_to_realm(child); + } else { + // no caps, nothing to move/split. 
+ dout(20) << " split no-op, no caps to move on file " << *child->inode << dendl; + ceph_assert(!child->inode->is_any_caps()); + } + return; + } + + // it's a dir. + + // split open_children + dout(10) << " open_children are " << open_children << dendl; + for (set<SnapRealm*>::iterator p = open_children.begin(); + p != open_children.end(); ) { + SnapRealm *realm = *p; + if (realm != child && + child->inode->is_ancestor_of(realm->inode)) { + dout(20) << " child gets child realm " << *realm << " on " << *realm->inode << dendl; + realm->parent = child; + child->open_children.insert(realm); + open_children.erase(p++); + } else { + dout(20) << " keeping child realm " << *realm << " on " << *realm->inode << dendl; + ++p; + } + } + + // split inodes_with_caps + for (elist<CInode*>::iterator p = inodes_with_caps.begin(member_offset(CInode, item_caps)); + !p.end(); ) { + CInode *in = *p; + ++p; + // does inode fall within the child realm? + if (child->inode->is_ancestor_of(in)) { + dout(20) << " child gets " << *in << dendl; + in->move_to_realm(child); + } else { + dout(20) << " keeping " << *in << dendl; + } + } +} + +void SnapRealm::merge_to(SnapRealm *newparent) +{ + if (!newparent) + newparent = parent; + dout(10) << "merge to " << *newparent << " on " << *newparent->inode << dendl; + + ceph_assert(open_past_children.empty()); + + dout(10) << " open_children are " << open_children << dendl; + for (auto realm : open_children) { + dout(20) << " child realm " << *realm << " on " << *realm->inode << dendl; + newparent->open_children.insert(realm); + realm->parent = newparent; + } + open_children.clear(); + + elist<CInode*>::iterator p = inodes_with_caps.begin(member_offset(CInode, item_caps)); + while (!p.end()) { + CInode *in = *p; + ++p; + in->move_to_realm(newparent); + } + ceph_assert(inodes_with_caps.empty()); + + // delete this + inode->close_snaprealm(); +} + +const bufferlist& SnapRealm::get_snap_trace() const +{ + check_cache(); + return cached_snap_trace; +} + 
+void SnapRealm::build_snap_trace() const +{ + cached_snap_trace.clear(); + + if (global) { + SnapRealmInfo info(inode->ino(), 0, cached_seq, 0); + info.my_snaps.reserve(cached_snaps.size()); + for (auto p = cached_snaps.rbegin(); p != cached_snaps.rend(); ++p) + info.my_snaps.push_back(*p); + + dout(10) << "build_snap_trace my_snaps " << info.my_snaps << dendl; + encode(info, cached_snap_trace); + return; + } + + SnapRealmInfo info(inode->ino(), srnode.created, srnode.seq, srnode.current_parent_since); + if (parent) { + info.h.parent = parent->inode->ino(); + + set<snapid_t> past; + if (!srnode.past_parent_snaps.empty()) { + past = mdcache->mds->snapclient->filter(srnode.past_parent_snaps); + if (srnode.is_parent_global()) { + auto p = past.lower_bound(srnode.current_parent_since); + past.erase(p, past.end()); + } + } else if (!srnode.past_parents.empty()) { + const set<snapid_t>& snaps = get_snaps(); + for (const auto& p : srnode.past_parents) { + for (auto q = snaps.lower_bound(p.second.first); + q != snaps.end() && *q <= p.first; + q++) { + if (srnode.snaps.count(*q)) + continue; + past.insert(*q); + } + } + } + + if (!past.empty()) { + info.prior_parent_snaps.reserve(past.size()); + for (set<snapid_t>::reverse_iterator p = past.rbegin(); p != past.rend(); ++p) + info.prior_parent_snaps.push_back(*p); + dout(10) << "build_snap_trace prior_parent_snaps from [1," << *past.rbegin() << "] " + << info.prior_parent_snaps << dendl; + } + } + + info.my_snaps.reserve(srnode.snaps.size()); + for (auto p = srnode.snaps.rbegin(); + p != srnode.snaps.rend(); + ++p) + info.my_snaps.push_back(p->first); + dout(10) << "build_snap_trace my_snaps " << info.my_snaps << dendl; + + encode(info, cached_snap_trace); + + if (parent) + cached_snap_trace.append(parent->get_snap_trace()); +} + +void SnapRealm::prune_past_parents() +{ + dout(10) << "prune_past_parents" << dendl; + check_cache(); + + // convert past_parents to past_parent_snaps + if (!srnode.past_parents.empty()) { + for 
(auto p = cached_snaps.begin(); + p != cached_snaps.end() && *p < srnode.current_parent_since; + ++p) { + if (!srnode.snaps.count(*p)) + srnode.past_parent_snaps.insert(*p); + } + srnode.past_parents.clear(); + past_parents_dirty = true; + } + + for (auto p = srnode.past_parent_snaps.begin(); + p != srnode.past_parent_snaps.end(); ) { + auto q = cached_snaps.find(*p); + if (q == cached_snaps.end()) { + dout(10) << "prune_past_parents pruning " << *p << dendl; + srnode.past_parent_snaps.erase(p++); + } else { + dout(10) << "prune_past_parents keeping " << *p << dendl; + ++p; + } + } +} + diff --git a/src/mds/SnapRealm.h b/src/mds/SnapRealm.h new file mode 100644 index 00000000..582daa2d --- /dev/null +++ b/src/mds/SnapRealm.h @@ -0,0 +1,166 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_MDS_SNAPREALM_H +#define CEPH_MDS_SNAPREALM_H + +#include <string_view> + +#include "mdstypes.h" +#include "snap.h" +#include "include/xlist.h" +#include "include/elist.h" +#include "common/snap_types.h" +#include "MDSContext.h" + +struct SnapRealm { +protected: + // cache + mutable snapid_t cached_seq; // max seq over self and all past+present parents. 
+  mutable snapid_t cached_last_created;  // max last_created over all past+present parents
+  mutable snapid_t cached_last_destroyed;
+  mutable set<snapid_t> cached_snaps;
+  mutable SnapContext cached_snap_context;
+  mutable bufferlist cached_snap_trace;
+  mutable inodeno_t cached_subvolume_ino = 0;
+
+  // refresh the cached_* members above before a read; every get_* accessor
+  // below calls this first
+  void check_cache() const;
+
+public:
+  // realm state
+  sr_t srnode;
+
+  // in-memory state
+  MDCache *mdcache;
+  CInode *inode;
+
+  mutable bool open = false;                        // set to true once all past_parents are opened
+  bool past_parents_dirty = false;
+  bool global;
+
+  SnapRealm *parent;
+  set<SnapRealm*> open_children;    // active children that are currently open
+  set<SnapRealm*> open_past_children;  // past children who has pinned me
+  map<inodeno_t, pair<SnapRealm*, set<snapid_t> > > open_past_parents;  // these are explicitly pinned.
+  unsigned num_open_past_parents;
+
+  elist<CInode*> inodes_with_caps;             // for efficient realm splits
+  map<client_t, xlist<Capability*>* > client_caps;   // to identify clients who need snap notifications
+
+  SnapRealm(MDCache *c, CInode *in);
+
+  // true iff a snapshot with this name exists directly in this realm
+  bool exists(std::string_view name) const {
+    for (map<snapid_t,SnapInfo>::const_iterator p = srnode.snaps.begin();
+         p != srnode.snaps.end();
+         ++p) {
+      if (p->second.name == name)
+        return true;
+    }
+    return false;
+  }
+
+  bool _open_parents(MDSContext *retryorfinish, snapid_t first=1, snapid_t last=CEPH_NOSNAP);
+  bool open_parents(MDSContext *retryorfinish);
+  void _remove_missing_parent(snapid_t snapid, inodeno_t parent, int err);
+  bool have_past_parents_open(snapid_t first=1, snapid_t last=CEPH_NOSNAP) const;
+  void add_open_past_parent(SnapRealm *parent, snapid_t last);
+  void remove_open_past_parent(inodeno_t ino, snapid_t last);
+  void close_parents();
+
+  void prune_past_parents();
+  bool has_past_parents() const {
+    return !srnode.past_parent_snaps.empty() ||
+           !srnode.past_parents.empty();
+  }
+
+  void build_snap_set() const;
+  void get_snap_info(map<snapid_t, const SnapInfo*>& infomap,
+                     snapid_t first=0, snapid_t last=CEPH_NOSNAP);
+
+  const bufferlist& get_snap_trace() const;
+  void build_snap_trace() const;
+
+  std::string_view get_snapname(snapid_t snapid, inodeno_t atino);
+  snapid_t resolve_snapname(std::string_view name, inodeno_t atino, snapid_t first=0, snapid_t last=CEPH_NOSNAP);
+
+  const set<snapid_t>& get_snaps() const;
+  const SnapContext& get_snap_context() const;
+  void invalidate_cached_snaps() {
+    // NOTE(review): cached_seq == 0 appears to be the "cache is stale" marker
+    // consumed by check_cache(); confirm against SnapRealm.cc.
+    cached_seq = 0;
+  }
+  snapid_t get_last_created() {
+    check_cache();
+    return cached_last_created;
+  }
+  snapid_t get_last_destroyed() {
+    check_cache();
+    return cached_last_destroyed;
+  }
+  snapid_t get_newest_snap() {
+    check_cache();
+    if (cached_snaps.empty())
+      return 0;
+    else
+      return *cached_snaps.rbegin();
+  }
+  snapid_t get_newest_seq() {
+    check_cache();
+    return cached_seq;
+  }
+
+  // first snapid strictly greater than 'follows', or CEPH_NOSNAP if none
+  snapid_t get_snap_following(snapid_t follows) {
+    check_cache();
+    const set<snapid_t>& s = get_snaps();
+    set<snapid_t>::const_iterator p = s.upper_bound(follows);
+    if (p != s.end())
+      return *p;
+    return CEPH_NOSNAP;
+  }
+
+  bool has_snaps_in_range(snapid_t first, snapid_t last) {
+    check_cache();
+    const set<snapid_t>& s = get_snaps();
+    set<snapid_t>::const_iterator p = s.lower_bound(first);
+    return (p != s.end() && *p <= last);
+  }
+
+  inodeno_t get_subvolume_ino() {
+    check_cache();
+    return cached_subvolume_ino;
+  }
+
+  void adjust_parent();
+
+  void split_at(SnapRealm *child);
+  void merge_to(SnapRealm *newparent);
+
+  // per-client capability lists; the xlist is allocated lazily here and freed
+  // in remove_cap() once the client's list drains
+  void add_cap(client_t client, Capability *cap) {
+    auto client_caps_entry = client_caps.find(client);
+    if (client_caps_entry == client_caps.end())
+      client_caps_entry = client_caps.emplace(client,
+                                              new xlist<Capability*>).first;
+    client_caps_entry->second->push_back(&cap->item_snaprealm_caps);
+  }
+  void remove_cap(client_t client, Capability *cap) {
+    cap->item_snaprealm_caps.remove_myself();
+    auto found = client_caps.find(client);
+    if (found != client_caps.end() && found->second->empty()) {
+      delete found->second;
+      client_caps.erase(found);
+    }
+  }
+};
+
+ostream& operator<<(ostream& out, const SnapRealm &realm);
+
+#endif
diff --git a/src/mds/SnapServer.cc b/src/mds/SnapServer.cc
new file mode 100644
index 00000000..d9690d40
--- /dev/null
+++ b/src/mds/SnapServer.cc
@@ -0,0 +1,476 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "SnapServer.h"
+#include "MDSRank.h"
+#include "osd/OSDMap.h"
+#include "osdc/Objecter.h"
+#include "mon/MonClient.h"
+
+#include "include/types.h"
+#include "messages/MMDSTableRequest.h"
+#include "messages/MRemoveSnaps.h"
+
+#include "msg/Messenger.h"
+
+#include "common/config.h"
+#include "include/ceph_assert.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+#undef dout_prefix
+#define dout_prefix *_dout << "mds." << rank << ".snap "
+
+
+// Reset the snap table to its initial state.  last_snap is seeded from the
+// highest snap_seq seen in any data pool so snapids already issued in a
+// previous filesystem life are never reused.
+void SnapServer::reset_state()
+{
+  last_snap = 1; /* snapid 1 reserved for initial root snaprealm */
+  snaps.clear();
+  need_to_purge.clear();
+  pending_update.clear();
+  pending_destroy.clear();
+  pending_noop.clear();
+
+  // find any removed snapshot in data pools
+  if (mds) {  // only if I'm running in a live MDS
+    snapid_t first_free = 0;
+    mds->objecter->with_osdmap([&](const OSDMap& o) {
+        for (const auto p : mds->mdsmap->get_data_pools()) {
+          const pg_pool_t *pi = o.get_pg_pool(p);
+          if (!pi) {
+            // If pool isn't in OSDMap yet then can't have any snaps
+            // needing removal, skip.
+            continue;
+          }
+          if (pi->snap_seq > first_free) {
+            first_free = pi->snap_seq;
+          }
+        }
+      });
+    if (first_free > last_snap)
+      last_snap = first_free;
+  }
+  last_created = last_snap;
+  last_destroyed = last_snap;
+  snaprealm_v2_since = last_snap + 1;
+
+  MDSTableServer::reset_state();
+}
+
+
+// SERVER
+
+// Stage a table change (create/destroy/update) keyed by the current table
+// version; it only takes effect in _commit().  The allocated snapid/seq is
+// returned to the requester via 'out'.
+void SnapServer::_prepare(const bufferlist& bl, uint64_t reqid, mds_rank_t bymds, bufferlist& out)
+{
+  using ceph::decode;
+  using ceph::encode;
+  auto p = bl.cbegin();
+  __u32 op;
+  decode(op, p);
+
+  switch (op) {
+  case TABLE_OP_CREATE:
+    {
+      SnapInfo info;
+      decode(info.ino, p);
+      // a payload with only the ino is a no-op (no name/stamp to create)
+      if (!p.end()) {
+        decode(info.name, p);
+        decode(info.stamp, p);
+        info.snapid = ++last_snap;
+        pending_update[version] = info;
+        dout(10) << "prepare v" << version << " create " << info << dendl;
+      } else {
+        pending_noop.insert(version);
+        dout(10) << "prepare v" << version << " noop" << dendl;
+      }
+
+      encode(last_snap, out);
+    }
+    break;
+
+  case TABLE_OP_DESTROY:
+    {
+      inodeno_t ino;
+      snapid_t snapid;
+      decode(ino, p);    // not used, currently.
+      decode(snapid, p);
+
+      // bump last_snap... we use it as a version value on the snaprealm.
+      ++last_snap;
+
+      pending_destroy[version] = pair<snapid_t,snapid_t>(snapid, last_snap);
+      dout(10) << "prepare v" << version << " destroy " << snapid << " seq " << last_snap << dendl;
+
+      encode(last_snap, out);
+    }
+    break;
+
+  case TABLE_OP_UPDATE:
+    {
+      SnapInfo info;
+      decode(info.ino, p);
+      decode(info.snapid, p);
+      decode(info.name, p);
+      decode(info.stamp, p);
+
+      pending_update[version] = info;
+      dout(10) << "prepare v" << version << " update " << info << dendl;
+    }
+    break;
+
+  default:
+    ceph_abort();
+  }
+  //dump();
+}
+
+// Re-encode the reply payload for a still-pending tid: the new snapid for a
+// create, the destroy seq for a destroy, last_snap for a noop.
+void SnapServer::_get_reply_buffer(version_t tid, bufferlist *pbl) const
+{
+  using ceph::encode;
+  auto p = pending_update.find(tid);
+  if (p != pending_update.end()) {
+    if (pbl && !snaps.count(p->second.snapid)) // create
+      encode(p->second.snapid, *pbl);
+    return;
+  }
+  auto q = pending_destroy.find(tid);
+  if (q != pending_destroy.end()) {
+    if (pbl)
+      encode(q->second.second, *pbl);
+    return;
+  }
+  auto r = pending_noop.find(tid);
+  if (r != pending_noop.end()) {
+    if (pbl)
+      encode(last_snap, *pbl);
+    return;
+  }
+  assert (0 == "tid not found");
+}
+
+// Apply a previously prepared change: install/refresh the SnapInfo, or erase
+// the snap and queue it for purging in every data pool.
+void SnapServer::_commit(version_t tid, MMDSTableRequest::const_ref req)
+{
+  if (pending_update.count(tid)) {
+    SnapInfo &info = pending_update[tid];
+    string opname;
+    if (snaps.count(info.snapid)) {
+      opname = "update";
+      // keep the original creation stamp if the update didn't carry one
+      if (info.stamp == utime_t())
+        info.stamp = snaps[info.snapid].stamp;
+    } else {
+      opname = "create";
+      if (info.snapid > last_created)
+        last_created = info.snapid;
+    }
+    dout(7) << "commit " << tid << " " << opname << " " << info << dendl;
+    snaps[info.snapid] = info;
+    pending_update.erase(tid);
+  }
+
+  else if (pending_destroy.count(tid)) {
+    snapid_t sn = pending_destroy[tid].first;
+    snapid_t seq = pending_destroy[tid].second;
+    dout(7) << "commit " << tid << " destroy " << sn << " seq " << seq << dendl;
+    snaps.erase(sn);
+    if (seq > last_destroyed)
+      last_destroyed = seq;
+
+    // NOTE(review): both the destroyed snapid and its seq are queued for pool
+    // purge — presumably the seq is also a pool snapid to retire; confirm.
+    for (const auto p : mds->mdsmap->get_data_pools()) {
+      need_to_purge[p].insert(sn);
+      need_to_purge[p].insert(seq);
+    }
+
+    pending_destroy.erase(tid);
+  }
+  else if (pending_noop.count(tid)) {
+    dout(7) << "commit " << tid << " noop" << dendl;
+    pending_noop.erase(tid);
+  }
+  else
+    ceph_abort();
+
+  //dump();
+}
+
+// Abandon a prepared-but-uncommitted change.
+void SnapServer::_rollback(version_t tid)
+{
+  if (pending_update.count(tid)) {
+    SnapInfo &info = pending_update[tid];
+    string opname;
+    if (snaps.count(info.snapid))
+      opname = "update";
+    else
+      opname = "create";
+    dout(7) << "rollback " << tid << " " << opname << " " << info << dendl;
+    pending_update.erase(tid);
+  }
+
+  else if (pending_destroy.count(tid)) {
+    dout(7) << "rollback " << tid << " destroy " << pending_destroy[tid] << dendl;
+    pending_destroy.erase(tid);
+  }
+
+  else if (pending_noop.count(tid)) {
+    dout(7) << "rollback " << tid << " noop" << dendl;
+    pending_noop.erase(tid);
+  }
+
+  else
+    ceph_abort();
+
+  //dump();
+}
+
+// Journaled server-side update: remove snapids confirmed purged (per pool)
+// from need_to_purge.
+void SnapServer::_server_update(bufferlist& bl)
+{
+  using ceph::decode;
+  auto p = bl.cbegin();
+  map<int, vector<snapid_t> > purge;
+  decode(purge, p);
+
+  dout(7) << "_server_update purged " << purge << dendl;
+  for (map<int, vector<snapid_t> >::iterator p = purge.begin();
+       p != purge.end();
+       ++p) {
+    for (vector<snapid_t>::iterator q = p->second.begin();
+         q != p->second.end();
+         ++q)
+      need_to_purge[p->first].erase(*q);
+    if (need_to_purge[p->first].empty())
+      need_to_purge.erase(p->first);
+  }
+}
+
+// Broadcast the full table state ('F' payload) to all active clients before
+// committing version 'tid'.
+bool SnapServer::_notify_prep(version_t tid)
+{
+  using ceph::encode;
+  bufferlist bl;
+  char type = 'F';
+  encode(type, bl);
+  encode(snaps, bl);
+  encode(pending_update, bl);
+  encode(pending_destroy, bl);
+  encode(last_created, bl);
+  encode(last_destroyed, bl);
+  ceph_assert(version == tid);
+
+  for (auto &p : active_clients) {
+    auto m = MMDSTableRequest::create(table, TABLESERVER_OP_NOTIFY_PREP, 0, version);
+    m->bl = bl;
+    mds->send_message_mds(m, p);
+  }
+  return true;
+}
+
+// Answer a client table query: 'U' (unchanged) if the client is current,
+// otherwise a full 'F' dump of the table.
+void SnapServer::handle_query(const MMDSTableRequest::const_ref &req)
+{
+  using
+  ceph::encode;
+  using ceph::decode;
+  char op;
+  auto p = req->bl.cbegin();
+  decode(op, p);
+
+  auto reply = MMDSTableRequest::create(table, TABLESERVER_OP_QUERY_REPLY, req->reqid, version);
+
+  switch (op) {
+  case 'F': // full
+    version_t have_version;
+    decode(have_version, p);
+    ceph_assert(have_version <= version);
+    if (have_version == version) {
+      char type = 'U';
+      encode(type, reply->bl);
+    } else {
+      char type = 'F';
+      encode(type, reply->bl);
+      encode(snaps, reply->bl);
+      encode(pending_update, reply->bl);
+      encode(pending_destroy, reply->bl);
+      encode(last_created, reply->bl);
+      encode(last_destroyed, reply->bl);
+    }
+    // FIXME: implement incremental change
+    break;
+  default:
+    ceph_abort();
+  };
+
+  mds->send_message(reply, req->get_connection());
+}
+
+// Reconcile need_to_purge with the OSDMap: snapids the map already marks
+// removed are retired via do_server_update(); the remainder are sent to the
+// monitors in an MRemoveSnaps request.
+void SnapServer::check_osd_map(bool force)
+{
+  if (!force && version == last_checked_osdmap) {
+    dout(10) << "check_osd_map - version unchanged" << dendl;
+    return;
+  }
+  dout(10) << "check_osd_map need_to_purge=" << need_to_purge << dendl;
+
+  map<int, vector<snapid_t> > all_purge;
+  map<int, vector<snapid_t> > all_purged;
+
+  mds->objecter->with_osdmap(
+    [this, &all_purged, &all_purge](const OSDMap& osdmap) {
+      for (const auto& p : need_to_purge) {
+        int id = p.first;
+        const pg_pool_t *pi = osdmap.get_pg_pool(id);
+        if (pi == NULL) {
+          // The pool is gone.  So are the snapshots.
+          all_purged[id] = std::vector<snapid_t>(p.second.begin(),
+                                                 p.second.end());
+          continue;
+        }
+
+        for (const auto& q : p.second) {
+          if (pi->is_removed_snap(q)) {
+            dout(10) << " osdmap marks " << q << " as removed" << dendl;
+            all_purged[id].push_back(q);
+          } else {
+            all_purge[id].push_back(q);
+          }
+        }
+      }
+  });
+
+  if (!all_purged.empty()) {
+    // prepare to remove from need_to_purge list
+    bufferlist bl;
+    using ceph::encode;
+    encode(all_purged, bl);
+    do_server_update(bl);
+  }
+
+  if (!all_purge.empty()) {
+    dout(10) << "requesting removal of " << all_purge << dendl;
+    auto m = MRemoveSnaps::create(all_purge);
+    mon_client->send_mon_message(m.detach());
+  }
+
+  last_checked_osdmap = version;
+}
+
+
+// Dump the full server state to the Formatter.
+void SnapServer::dump(Formatter *f) const
+{
+  f->open_object_section("snapserver");
+
+  f->dump_int("last_snap", last_snap);
+  f->dump_int("last_created", last_created);
+  f->dump_int("last_destroyed", last_destroyed);
+
+  f->open_array_section("pending_noop");
+  for(set<version_t>::const_iterator i = pending_noop.begin(); i != pending_noop.end(); ++i) {
+    f->dump_unsigned("version", *i);
+  }
+  f->close_section();
+
+  f->open_array_section("snaps");
+  for (map<snapid_t, SnapInfo>::const_iterator i = snaps.begin(); i != snaps.end(); ++i) {
+    f->open_object_section("snap");
+    i->second.dump(f);
+    f->close_section();
+  }
+  f->close_section();
+
+  f->open_object_section("need_to_purge");
+  for (map<int, set<snapid_t> >::const_iterator i = need_to_purge.begin(); i != need_to_purge.end(); ++i) {
+    stringstream pool_id;
+    pool_id << i->first;
+    f->open_array_section(pool_id.str().c_str());
+    for (set<snapid_t>::const_iterator s = i->second.begin(); s != i->second.end(); ++s) {
+      f->dump_unsigned("snapid", s->val);
+    }
+    f->close_section();
+  }
+  f->close_section();
+
+  f->open_array_section("pending_update");
+  for(map<version_t, SnapInfo>::const_iterator i = pending_update.begin(); i != pending_update.end(); ++i) {
+    f->open_object_section("snap");
+    f->dump_unsigned("version", i->first);
+    f->open_object_section("snapinfo");
+    i->second.dump(f);
+    f->close_section();
+    f->close_section();
+  }
+  f->close_section();
+
+  f->open_array_section("pending_destroy");
+  for(map<version_t, pair<snapid_t, snapid_t> >::const_iterator i = pending_destroy.begin(); i != pending_destroy.end(); ++i) {
+    f->open_object_section("snap");
+    f->dump_unsigned("version", i->first);
+    f->dump_unsigned("removed_snap", i->second.first);
+    f->dump_unsigned("seq", i->second.second);
+    f->close_section();
+  }
+  f->close_section();
+
+  f->close_section();
+}
+
+// Build a blank and a populated instance for encode/decode round-trip tests.
+void SnapServer::generate_test_instances(list<SnapServer*>& ls)
+{
+  list<SnapInfo*> snapinfo_instances;
+  SnapInfo::generate_test_instances(snapinfo_instances);
+  SnapInfo populated_snapinfo = *(snapinfo_instances.back());
+  for (list<SnapInfo*>::iterator i = snapinfo_instances.begin(); i != snapinfo_instances.end(); ++i) {
+    delete *i;
+  }
+
+  SnapServer *blank = new SnapServer();
+  ls.push_back(blank);
+  SnapServer *populated = new SnapServer();
+  populated->last_snap = 123;
+  populated->snaps[456] = populated_snapinfo;
+  populated->need_to_purge[2].insert(012); // NOTE(review): 012 is an octal literal (decimal 10) — looks like an arbitrary test value, but worth confirming it isn't meant to be decimal 12
+  populated->pending_update[234] = populated_snapinfo;
+  populated->pending_destroy[345].first = 567;
+  populated->pending_destroy[345].second = 768;
+  populated->pending_noop.insert(890);
+
+  ls.push_back(populated);
+}
+
+// Forcibly overwrite the table's snap state (e.g. when rebuilding from
+// authoritative data); clears every pending op when anything changed.
+bool SnapServer::force_update(snapid_t last, snapid_t v2_since,
+                              map<snapid_t, SnapInfo>& _snaps)
+{
+  bool modified = false;
+  if (last > last_snap) {
+    derr << " updating last_snap " << last_snap << " -> " << last << dendl;
+    last_snap = last;
+    last_created = last;
+    last_destroyed = last;
+    modified = true;
+  }
+  if (v2_since > snaprealm_v2_since) {
+    derr << " updating snaprealm_v2_since " << snaprealm_v2_since
+         << " -> " << v2_since << dendl;
+    snaprealm_v2_since = v2_since;
+    modified = true;
+  }
+  if (snaps != _snaps) {
+    derr << " updating snaps {" << snaps << "} -> {" << _snaps << "}" << dendl;
+    snaps = _snaps;
+    modified = true;
+  }
+
+  if (modified) {
+    need_to_purge.clear();
+    pending_update.clear();
+    pending_destroy.clear();
+    pending_noop.clear();
+    MDSTableServer::reset_state();
+  }
+  return modified;
+}
diff --git a/src/mds/SnapServer.h b/src/mds/SnapServer.h
new file mode 100644
index 00000000..f0a92ce8
--- /dev/null
+++ b/src/mds/SnapServer.h
@@ -0,0 +1,149 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_SNAPSERVER_H
+#define CEPH_SNAPSERVER_H
+
+#include "MDSTableServer.h"
+#include "snap.h"
+
+class MDSRank;
+class MonClient;
+
+class SnapServer : public MDSTableServer {
+protected:
+  MonClient *mon_client = nullptr;
+  snapid_t last_snap = 0;
+  snapid_t last_created, last_destroyed;
+  snapid_t snaprealm_v2_since;
+  map<snapid_t, SnapInfo> snaps;
+  map<int, set<snapid_t> > need_to_purge;
+
+  map<version_t, SnapInfo> pending_update;
+  map<version_t, pair<snapid_t,snapid_t> > pending_destroy; // (removed_snap, seq)
+  set<version_t> pending_noop;
+
+  version_t last_checked_osdmap;
+
+  bool root_scrubbed = false; // all snaprealms under root are converted?
+  bool mdsdir_scrubbed = false; // all snaprealms under ~mds0 are converted?
+
+  // On-disk format.  struct_v history (see decode below): v2 pending_destroy
+  // gained the seq; v4 added last_created/last_destroyed; v5 added
+  // snaprealm_v2_since.
+  void encode_server_state(bufferlist& bl) const override {
+    ENCODE_START(5, 3, bl);
+    encode(last_snap, bl);
+    encode(snaps, bl);
+    encode(need_to_purge, bl);
+    encode(pending_update, bl);
+    encode(pending_destroy, bl);
+    encode(pending_noop, bl);
+    encode(last_created, bl);
+    encode(last_destroyed, bl);
+    encode(snaprealm_v2_since, bl);
+    ENCODE_FINISH(bl);
+  }
+  void decode_server_state(bufferlist::const_iterator& bl) override {
+    DECODE_START_LEGACY_COMPAT_LEN(5, 3, 3, bl);
+    decode(last_snap, bl);
+    decode(snaps, bl);
+    decode(need_to_purge, bl);
+    decode(pending_update, bl);
+    if (struct_v >= 2)
+      decode(pending_destroy, bl);
+    else {
+      // v1 stored only the removed snapid; seq is left default-constructed
+      map<version_t, snapid_t> t;
+      decode(t, bl);
+      for (map<version_t, snapid_t>::iterator p = t.begin(); p != t.end(); ++p)
+        pending_destroy[p->first].first = p->second;
+    }
+    decode(pending_noop, bl);
+    if (struct_v >= 4) {
+      decode(last_created, bl);
+      decode(last_destroyed, bl);
+    } else {
+      last_created = last_snap;
+      last_destroyed = last_snap;
+    }
+    if (struct_v >= 5)
+      decode(snaprealm_v2_since, bl);
+    else
+      snaprealm_v2_since = CEPH_NOSNAP; // sentinel: upgrade pending (see upgrade_format)
+
+    DECODE_FINISH(bl);
+  }
+
+  // server bits
+  void _prepare(const bufferlist &bl, uint64_t reqid, mds_rank_t bymds, bufferlist &out) override;
+  void _get_reply_buffer(version_t tid, bufferlist *pbl) const override;
+  void _commit(version_t tid, MMDSTableRequest::const_ref req) override;
+  void _rollback(version_t tid) override;
+  void _server_update(bufferlist& bl) override;
+  bool _notify_prep(version_t tid) override;
+  void handle_query(const MMDSTableRequest::const_ref &m) override;
+
+public:
+  SnapServer(MDSRank *m, MonClient *monc)
+    : MDSTableServer(m, TABLE_SNAP), mon_client(monc), last_checked_osdmap(0) {}
+  SnapServer() : MDSTableServer(NULL, TABLE_SNAP), last_checked_osdmap(0) {}
+
+  void reset_state() override;
+
+  // Returns true if any upgrade action was taken.
+  bool upgrade_format() {
+    // upgraded from old filesystem
+    ceph_assert(is_active());
+    ceph_assert(last_snap > 0);
+    bool upgraded = false;
+    if (get_version() == 0) {
+      // version 0 confuses snapclient code
+      reset();
+      upgraded = true;
+    }
+    if (snaprealm_v2_since == CEPH_NOSNAP) {
+      // new snapshots will have new format snaprealms
+      snaprealm_v2_since = last_snap + 1;
+      upgraded = true;
+    }
+    return upgraded;
+  }
+
+  void check_osd_map(bool force);
+
+  void mark_base_recursively_scrubbed(inodeno_t ino) {
+    if (ino == MDS_INO_ROOT)
+      root_scrubbed = true;
+    else if (ino == MDS_INO_MDSDIR(rank))
+      mdsdir_scrubbed = true;
+    else
+      ceph_abort();
+  }
+  // multimds snapshots are safe once both base trees are scrubbed, or when
+  // every existing snap already uses the v2 snaprealm format
+  bool can_allow_multimds_snaps() const {
+    return (root_scrubbed && mdsdir_scrubbed) ||
+           snaps.empty() || snaps.begin()->first >= snaprealm_v2_since;
+  }
+
+  void encode(bufferlist& bl) const {
+    encode_server_state(bl);
+  }
+  void decode(bufferlist::const_iterator& bl) {
+    decode_server_state(bl);
+  }
+
+  void dump(Formatter *f) const;
+  static void generate_test_instances(list<SnapServer*>& ls);
+
+  bool force_update(snapid_t last, snapid_t v2_since,
+                    map<snapid_t, SnapInfo>& _snaps);
+};
+WRITE_CLASS_ENCODER(SnapServer)
+
+#endif
diff --git a/src/mds/StrayManager.cc b/src/mds/StrayManager.cc
new file mode 100644
index 00000000..444e4ccc
--- /dev/null
+++ b/src/mds/StrayManager.cc
@@ -0,0 +1,759 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#include "common/perf_counters.h"
+
+#include "mds/MDSRank.h"
+#include "mds/MDCache.h"
+#include "mds/MDLog.h"
+#include "mds/CDir.h"
+#include "mds/CDentry.h"
+#include "events/EUpdate.h"
+#include "messages/MClientRequest.h"
+
+#include "StrayManager.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, mds)
+static ostream& _prefix(std::ostream *_dout, MDSRank *mds) {
+  return *_dout << "mds." << mds->get_nodeid() << ".cache.strays ";
+}
+
+// Thin context bases that route MDSRank lookup through the owning StrayManager.
+class StrayManagerIOContext : public virtual MDSIOContextBase {
+protected:
+  StrayManager *sm;
+  MDSRank *get_mds() override
+  {
+    return sm->mds;
+  }
+public:
+  explicit StrayManagerIOContext(StrayManager *sm_) : sm(sm_) {}
+};
+
+class StrayManagerLogContext : public virtual MDSLogContextBase {
+protected:
+  StrayManager *sm;
+  MDSRank *get_mds() override
+  {
+    return sm->mds;
+  }
+public:
+  explicit StrayManagerLogContext(StrayManager *sm_) : sm(sm_) {}
+};
+
+class StrayManagerContext : public virtual MDSContext {
+protected:
+  StrayManager *sm;
+  MDSRank *get_mds() override
+  {
+    return sm->mds;
+  }
+public:
+  explicit StrayManagerContext(StrayManager *sm_) : sm(sm_) {}
+};
+
+
+/**
+ * Context wrapper for _purge_stray_purged completion
+ */
+class C_IO_PurgeStrayPurged : public StrayManagerIOContext {
+  CDentry *dn;
+  bool only_head;
+public:
+  C_IO_PurgeStrayPurged(StrayManager *sm_, CDentry *d, bool oh) :
+    StrayManagerIOContext(sm_), dn(d), only_head(oh) { }
+  void finish(int r) override {
+    ceph_assert(r == 0 || r == -ENOENT);
+    sm->_purge_stray_purged(dn, only_head);
+  }
+  void print(ostream& out) const override {
+    CInode *in = dn->get_projected_linkage()->get_inode();
+    out << "purge_stray(" << in->ino() << ")";
+  }
+};
+
+
+// Enqueue the stray's inode for deletion in the PurgeQueue: directories purge
+// their dirfrags, files purge stripe objects up to the largest size the file
+// has ever had, under the containing realm's snap context.
+void StrayManager::purge(CDentry *dn)
+{
+  CDentry::linkage_t *dnl = dn->get_projected_linkage();
+  CInode *in = dnl->get_inode();
+  dout(10) << __func__ << " " << *dn << " " << *in << dendl;
+  ceph_assert(!dn->is_replicated());
+
+  // CHEAT.  there's no real need to journal our intent to purge, since
+  // that is implicit in the dentry's presence and non-use in the stray
+  // dir.  on recovery, we'll need to re-eval all strays anyway.
+
+  SnapContext nullsnapc;
+
+  PurgeItem item;
+  item.ino = in->inode.ino;
+  item.stamp = ceph_clock_now();
+  if (in->is_dir()) {
+    item.action = PurgeItem::PURGE_DIR;
+    item.fragtree = in->dirfragtree;
+  } else {
+    item.action = PurgeItem::PURGE_FILE;
+
+    const SnapContext *snapc;
+    SnapRealm *realm = in->find_snaprealm();
+    if (realm) {
+      dout(10) << " realm " << *realm << dendl;
+      snapc = &realm->get_snap_context();
+    } else {
+      dout(10) << " NO realm, using null context" << dendl;
+      snapc = &nullsnapc;
+      ceph_assert(in->last == CEPH_NOSNAP);
+    }
+
+    uint64_t to = 0;
+    if (in->is_file()) {
+      to = in->inode.get_max_size();
+      to = std::max(in->inode.size, to);
+      // when truncating a file, the filer does not delete stripe objects that are
+      // truncated to zero. so we need to purge stripe objects up to the max size
+      // the file has ever been.
+      to = std::max(in->inode.max_size_ever, to);
+    }
+
+    auto pi = in->get_projected_inode();
+
+    item.size = to;
+    item.layout = pi->layout;
+    item.old_pools.clear();
+    for (const auto &p : pi->old_pools)
+      item.old_pools.insert(p);
+    item.snapc = *snapc;
+  }
+
+  purge_queue.push(item, new C_IO_PurgeStrayPurged(
+        this, dn, false));
+}
+
+class C_PurgeStrayLogged : public StrayManagerLogContext {
+  CDentry *dn;
+  version_t pdv;
+  LogSegment *ls;
+public:
+  C_PurgeStrayLogged(StrayManager *sm_, CDentry *d, version_t v, LogSegment *s) :
+    StrayManagerLogContext(sm_), dn(d), pdv(v), ls(s) { }
+  void finish(int r) override {
+    sm->_purge_stray_logged(dn, pdv, ls);
+  }
+};
+
+class C_TruncateStrayLogged : public StrayManagerLogContext {
+  CDentry *dn;
+  LogSegment *ls;
+public:
+  C_TruncateStrayLogged(StrayManager *sm, CDentry *d, LogSegment *s) :
+    StrayManagerLogContext(sm), dn(d), ls(s) { }
+  void finish(int r) override {
+    sm->_truncate_stray_logged(dn, ls);
+  }
+};
+
+// PurgeQueue completion: either journal a truncate of the head revision
+// (only_head, for snapshotted files) or journal removal of the dentry/inode.
+void StrayManager::_purge_stray_purged(
+    CDentry *dn, bool only_head)
+{
+  CInode *in = dn->get_projected_linkage()->get_inode();
+  dout(10) << "_purge_stray_purged " << *dn << " " << *in << dendl;
+
+  logger->inc(l_mdc_strays_enqueued);
+  num_strays_enqueuing--;
+  logger->set(l_mdc_num_strays_enqueuing, num_strays_enqueuing);
+
+  if (only_head) {
+    /* This was a ::truncate */
+    EUpdate *le = new EUpdate(mds->mdlog, "purge_stray truncate");
+    mds->mdlog->start_entry(le);
+
+    auto &pi = in->project_inode();
+    pi.inode.size = 0;
+    pi.inode.max_size_ever = 0;
+    pi.inode.client_ranges.clear();
+    pi.inode.truncate_size = 0;
+    pi.inode.truncate_from = 0;
+    pi.inode.version = in->pre_dirty();
+
+    le->metablob.add_dir_context(dn->dir);
+    le->metablob.add_primary_dentry(dn, in, true);
+
+    mds->mdlog->submit_entry(le,
+        new C_TruncateStrayLogged(
+          this, dn, mds->mdlog->get_current_segment()));
+  } else {
+    if (in->get_num_ref() != (int)in->is_dirty() ||
+        dn->get_num_ref() != (int)dn->is_dirty() +
+          !!in->get_num_ref() + 1/*PIN_PURGING*/) {
+      // Nobody should be taking new references to an inode when it
+      // is being purged (aside from it were
+
+      derr << "Rogue reference after purge to " << *dn << dendl;
+      ceph_abort_msg("rogue reference to purging inode");
+    }
+
+    // kill dentry.
+    version_t pdv = dn->pre_dirty();
+    dn->push_projected_linkage(); // NULL
+
+    EUpdate *le = new EUpdate(mds->mdlog, "purge_stray");
+    mds->mdlog->start_entry(le);
+
+    // update dirfrag fragstat, rstat
+    CDir *dir = dn->get_dir();
+    fnode_t *pf = dir->project_fnode();
+    pf->version = dir->pre_dirty();
+    if (in->is_dir())
+      pf->fragstat.nsubdirs--;
+    else
+      pf->fragstat.nfiles--;
+    pf->rstat.sub(in->inode.accounted_rstat);
+
+    le->metablob.add_dir_context(dn->dir);
+    EMetaBlob::dirlump& dl = le->metablob.add_dir(dn->dir, true);
+    le->metablob.add_null_dentry(dl, dn, true);
+    le->metablob.add_destroyed_inode(in->ino());
+
+    mds->mdlog->submit_entry(le, new C_PurgeStrayLogged(this, dn, pdv,
+          mds->mdlog->get_current_segment()));
+  }
+}
+
+// Journal committed: actually unlink the dentry, pop the projected fnode,
+// and drop the inode from cache.
+void StrayManager::_purge_stray_logged(CDentry *dn, version_t pdv, LogSegment *ls)
+{
+  CInode *in = dn->get_linkage()->get_inode();
+  dout(10) << "_purge_stray_logged " << *dn << " " << *in << dendl;
+
+  ceph_assert(!in->state_test(CInode::STATE_RECOVERING));
+
+  bool new_dn = dn->is_new();
+
+  // unlink
+  ceph_assert(dn->get_projected_linkage()->is_null());
+  dn->dir->unlink_inode(dn, !new_dn);
+  dn->pop_projected_linkage();
+  dn->mark_dirty(pdv, ls);
+
+  dn->dir->pop_and_dirty_projected_fnode(ls);
+
+  in->state_clear(CInode::STATE_ORPHAN);
+  dn->state_clear(CDentry::STATE_PURGING | CDentry::STATE_PURGINGPINNED);
+  dn->put(CDentry::PIN_PURGING);
+
+  // drop dentry?
+  if (new_dn) {
+    dout(20) << " dn is new, removing" << dendl;
+    dn->mark_clean();
+    dn->dir->remove_dentry(dn);
+  }
+
+  // drop inode
+  inodeno_t ino = in->ino();
+  if (in->is_dirty())
+    in->mark_clean();
+  mds->mdcache->remove_inode(in);
+
+  if (mds->is_stopping())
+    mds->mdcache->shutdown_export_stray_finish(ino);
+}
+
+// Mark the stray as purging (pinning the dentry) and hand it to _enqueue();
+// trunc selects truncate-head-only instead of a full purge.
+void StrayManager::enqueue(CDentry *dn, bool trunc)
+{
+  CDentry::linkage_t *dnl = dn->get_projected_linkage();
+  ceph_assert(dnl);
+  CInode *in = dnl->get_inode();
+  ceph_assert(in);
+
+  /* We consider a stray to be purging as soon as it is enqueued, to avoid
+   * enqueing it twice */
+  dn->state_set(CDentry::STATE_PURGING);
+  in->state_set(CInode::STATE_PURGING);
+
+  /* We must clear this as soon as enqueuing it, to prevent the journal
+   * expiry code from seeing a dirty parent and trying to write a backtrace */
+  if (!trunc) {
+    if (in->is_dirty_parent()) {
+      in->clear_dirty_parent();
+    }
+  }
+
+  dout(20) << __func__ << ": purging dn: " << *dn << dendl;
+
+  if (!dn->state_test(CDentry::STATE_PURGINGPINNED)) {
+    dn->get(CDentry::PIN_PURGING);
+    dn->state_set(CDentry::STATE_PURGINGPINNED);
+  }
+
+  ++num_strays_enqueuing;
+  logger->set(l_mdc_num_strays_enqueuing, num_strays_enqueuing);
+
+  // Resources are available, acquire them and execute the purge
+  _enqueue(dn, trunc);
+
+  dout(10) << __func__ << ": purging this dentry immediately: "
+    << *dn << dendl;
+}
+
+class C_OpenSnapParents : public StrayManagerContext {
+  CDentry *dn;
+  bool trunc;
+  public:
+    C_OpenSnapParents(StrayManager *sm_, CDentry *dn_, bool t) :
+      StrayManagerContext(sm_), dn(dn_), trunc(t) { }
+    void finish(int r) override {
+      sm->_enqueue(dn, trunc);
+    }
+};
+
+// Second stage of enqueue: wait for past snap parents to open if needed,
+// then dispatch to truncate() or purge().
+void StrayManager::_enqueue(CDentry *dn, bool trunc)
+{
+  ceph_assert(started);
+
+  CInode *in = dn->get_linkage()->get_inode();
+  if (in->snaprealm &&
+      !in->snaprealm->have_past_parents_open() &&
+      !in->snaprealm->open_parents(new C_OpenSnapParents(this, dn, trunc))) {
+    // this can happen if the dentry had been trimmed from cache.
+    return;
+  }
+
+  if (trunc) {
+    truncate(dn);
+  } else {
+    purge(dn);
+  }
+}
+
+// Defer eval_stray() for this dentry to the next advance_delayed() pass.
+void StrayManager::queue_delayed(CDentry *dn)
+{
+  if (!started)
+    return;
+
+  if (dn->state_test(CDentry::STATE_EVALUATINGSTRAY))
+    return;
+
+  if (!dn->item_stray.is_on_list()) {
+    delayed_eval_stray.push_back(&dn->item_stray);
+    num_strays_delayed++;
+    logger->set(l_mdc_num_strays_delayed, num_strays_delayed);
+  }
+}
+
+// Drain the delayed-eval list, re-evaluating each stray dentry.
+void StrayManager::advance_delayed()
+{
+  if (!started)
+    return;
+
+  while (!delayed_eval_stray.empty()) {
+    CDentry *dn = delayed_eval_stray.front();
+    dn->item_stray.remove_myself();
+    num_strays_delayed--;
+
+    if (dn->get_projected_linkage()->is_null()) {
+      /* A special case: a stray dentry can go null if its inode is being
+       * re-linked into another MDS's stray dir during a shutdown migration. */
+      dout(4) << __func__ << ": delayed dentry is now null: " << *dn << dendl;
+      continue;
+    }
+
+    eval_stray(dn);
+  }
+  logger->set(l_mdc_num_strays_delayed, num_strays_delayed);
+}
+
+void StrayManager::set_num_strays(uint64_t num)
+{
+  ceph_assert(!started);
+  num_strays = num;
+  logger->set(l_mdc_num_strays, num_strays);
+}
+
+void StrayManager::notify_stray_created()
+{
+  num_strays++;
+  logger->set(l_mdc_num_strays, num_strays);
+  logger->inc(l_mdc_strays_created);
+}
+
+void StrayManager::notify_stray_removed()
+{
+  num_strays--;
+  logger->set(l_mdc_num_strays, num_strays);
+}
+
+struct C_EvalStray : public StrayManagerContext {
+  CDentry *dn;
+  C_EvalStray(StrayManager *sm_, CDentry *d) : StrayManagerContext(sm_), dn(d) {}
+  void finish(int r) override {
+    sm->eval_stray(dn);
+  }
+};
+
+struct C_MDC_EvalStray : public StrayManagerContext {
+  CDentry *dn;
+  C_MDC_EvalStray(StrayManager *sm_, CDentry *d) : StrayManagerContext(sm_), dn(d) {}
+  void finish(int r) override {
+    sm->eval_stray(dn);
+  }
+};
+
+bool StrayManager::_eval_stray(CDentry *dn)
+{
+  dout(10) << "eval_stray " << *dn << dendl;
+  CDentry::linkage_t *dnl =
dn->get_projected_linkage(); + ceph_assert(dnl->is_primary()); + dout(10) << " inode is " << *dnl->get_inode() << dendl; + CInode *in = dnl->get_inode(); + ceph_assert(in); + ceph_assert(!in->state_test(CInode::STATE_REJOINUNDEF)); + + // The only dentries eligible for purging are those + // in the stray directories + ceph_assert(dn->get_dir()->get_inode()->is_stray()); + + // Inode may not pass through this function if it + // was already identified for purging (i.e. cannot + // call eval_stray() after purge()) + ceph_assert(!dn->state_test(CDentry::STATE_PURGING)); + + if (!dn->is_auth()) + return false; + + if (!started) + return false; + + if (dn->item_stray.is_on_list()) { + dn->item_stray.remove_myself(); + num_strays_delayed--; + logger->set(l_mdc_num_strays_delayed, num_strays_delayed); + } + + // purge? + if (in->inode.nlink == 0) { + // past snaprealm parents imply snapped dentry remote links. + // only important for directories. normal file data snaps are handled + // by the object store. + if (in->snaprealm) { + if (!in->snaprealm->have_past_parents_open() && + !in->snaprealm->open_parents(new C_MDC_EvalStray(this, dn))) { + return false; + } + in->snaprealm->prune_past_parents(); + in->purge_stale_snap_data(in->snaprealm->get_snaps()); + } + if (in->is_dir()) { + if (in->snaprealm && in->snaprealm->has_past_parents()) { + dout(20) << " directory has past parents " + << in->snaprealm << dendl; + if (in->state_test(CInode::STATE_MISSINGOBJS)) { + mds->clog->error() << "previous attempt at committing dirfrag of ino " + << in->ino() << " has failed, missing object"; + mds->handle_write_error(-ENOENT); + } + return false; // not until some snaps are deleted. + } + + mds->mdcache->clear_dirty_bits_for_stray(in); + + if (!in->remote_parents.empty()) { + // unlink any stale remote snap dentry.
+ for (auto it = in->remote_parents.begin(); it != in->remote_parents.end(); ) { + CDentry *remote_dn = *it; + ++it; + ceph_assert(remote_dn->last != CEPH_NOSNAP); + remote_dn->unlink_remote(remote_dn->get_linkage()); + } + } + } + if (dn->is_replicated()) { + dout(20) << " replicated" << dendl; + return false; + } + if (dn->is_any_leases() || in->is_any_caps()) { + dout(20) << " caps | leases" << dendl; + return false; // wait + } + if (in->state_test(CInode::STATE_NEEDSRECOVER) || + in->state_test(CInode::STATE_RECOVERING)) { + dout(20) << " pending recovery" << dendl; + return false; // don't mess with file size probing + } + if (in->get_num_ref() > (int)in->is_dirty() + (int)in->is_dirty_parent()) { + dout(20) << " too many inode refs" << dendl; + return false; + } + if (dn->get_num_ref() > (int)dn->is_dirty() + !!in->get_num_ref()) { + dout(20) << " too many dn refs" << dendl; + return false; + } + // don't purge multiversion inode with snap data + if (in->snaprealm && in->snaprealm->has_past_parents() && + !in->old_inodes.empty()) { + // A file with snapshots: we will truncate the HEAD revision + // but leave the metadata intact. + ceph_assert(!in->is_dir()); + dout(20) << " file has past parents " + << in->snaprealm << dendl; + if (in->is_file() && in->get_projected_inode()->size > 0) { + enqueue(dn, true); // truncate head objects + } + } else { + // A straightforward file, ready to be purged. Enqueue it. + if (in->is_dir()) { + in->close_dirfrags(); + } + + enqueue(dn, false); + } + + return true; + } else { + /* + * Where a stray has some links, they should be remotes, check + * if we can do anything with them if we happen to have them in + * cache. 
+ */ + _eval_stray_remote(dn, NULL); + return false; + } +} + +void StrayManager::activate() +{ + dout(10) << __func__ << dendl; + started = true; + purge_queue.activate(); +} + +bool StrayManager::eval_stray(CDentry *dn) +{ + // avoid nested eval_stray + if (dn->state_test(CDentry::STATE_EVALUATINGSTRAY)) + return false; + + dn->state_set(CDentry::STATE_EVALUATINGSTRAY); + bool ret = _eval_stray(dn); + dn->state_clear(CDentry::STATE_EVALUATINGSTRAY); + return ret; +} + +void StrayManager::eval_remote(CDentry *remote_dn) +{ + dout(10) << __func__ << " " << *remote_dn << dendl; + + CDentry::linkage_t *dnl = remote_dn->get_projected_linkage(); + ceph_assert(dnl->is_remote()); + CInode *in = dnl->get_inode(); + + if (!in) { + dout(20) << __func__ << ": no inode, cannot evaluate" << dendl; + return; + } + + if (remote_dn->last != CEPH_NOSNAP) { + dout(20) << __func__ << ": snap dentry, cannot evaluate" << dendl; + return; + } + + // refers to stray? + CDentry *primary_dn = in->get_projected_parent_dn(); + ceph_assert(primary_dn != NULL); + if (primary_dn->get_dir()->get_inode()->is_stray()) { + _eval_stray_remote(primary_dn, remote_dn); + } else { + dout(20) << __func__ << ": inode's primary dn not stray" << dendl; + } +} + +class C_RetryEvalRemote : public StrayManagerContext { + CDentry *dn; + public: + C_RetryEvalRemote(StrayManager *sm_, CDentry *dn_) : + StrayManagerContext(sm_), dn(dn_) { + dn->get(CDentry::PIN_PTRWAITER); + } + void finish(int r) override { + if (dn->get_projected_linkage()->is_remote()) + sm->eval_remote(dn); + dn->put(CDentry::PIN_PTRWAITER); + } +}; + +void StrayManager::_eval_stray_remote(CDentry *stray_dn, CDentry *remote_dn) +{ + dout(20) << __func__ << " " << *stray_dn << dendl; + ceph_assert(stray_dn != NULL); + ceph_assert(stray_dn->get_dir()->get_inode()->is_stray()); + CDentry::linkage_t *stray_dnl = stray_dn->get_projected_linkage(); + ceph_assert(stray_dnl->is_primary()); + CInode *stray_in = stray_dnl->get_inode(); + 
ceph_assert(stray_in->inode.nlink >= 1); + ceph_assert(stray_in->last == CEPH_NOSNAP); + + /* If no remote_dn hinted, pick one arbitrarily */ + if (remote_dn == NULL) { + if (!stray_in->remote_parents.empty()) { + for (const auto &dn : stray_in->remote_parents) { + if (dn->last == CEPH_NOSNAP && !dn->is_projected()) { + if (dn->is_auth()) { + remote_dn = dn; + if (remote_dn->dir->can_auth_pin()) + break; + } else if (!remote_dn) { + remote_dn = dn; + } + } + } + } + if (!remote_dn) { + dout(20) << __func__ << ": not reintegrating (no remote parents in cache)" << dendl; + return; + } + } + ceph_assert(remote_dn->last == CEPH_NOSNAP); + // NOTE: we repeat this check in _rename(), since our submission path is racey. + if (!remote_dn->is_projected()) { + if (remote_dn->is_auth()) { + if (remote_dn->dir->can_auth_pin()) { + reintegrate_stray(stray_dn, remote_dn); + } else { + remote_dn->dir->add_waiter(CDir::WAIT_UNFREEZE, new C_RetryEvalRemote(this, remote_dn)); + dout(20) << __func__ << ": not reintegrating (can't authpin remote parent)" << dendl; + } + + } else if (!remote_dn->is_auth() && stray_dn->is_auth()) { + migrate_stray(stray_dn, remote_dn->authority().first); + } else { + dout(20) << __func__ << ": not reintegrating" << dendl; + } + } else { + // don't do anything if the remote parent is projected, or we may + // break user-visible semantics! + dout(20) << __func__ << ": not reintegrating (projected)" << dendl; + } +} + +void StrayManager::reintegrate_stray(CDentry *straydn, CDentry *rdn) +{ + dout(10) << __func__ << " " << *straydn << " into " << *rdn << dendl; + + logger->inc(l_mdc_strays_reintegrated); + + // rename it to another mds. 
+ filepath src; + straydn->make_path(src); + filepath dst; + rdn->make_path(dst); + + auto req = MClientRequest::create(CEPH_MDS_OP_RENAME); + req->set_filepath(dst); + req->set_filepath2(src); + req->set_tid(mds->issue_tid()); + + mds->send_message_mds(req, rdn->authority().first); +} + +void StrayManager::migrate_stray(CDentry *dn, mds_rank_t to) +{ + CInode *in = dn->get_projected_linkage()->get_inode(); + ceph_assert(in); + CInode *diri = dn->dir->get_inode(); + ceph_assert(diri->is_stray()); + dout(10) << "migrate_stray from mds." << MDS_INO_STRAY_OWNER(diri->inode.ino) + << " to mds." << to + << " " << *dn << " " << *in << dendl; + + logger->inc(l_mdc_strays_migrated); + + // rename it to another mds. + filepath src; + dn->make_path(src); + ceph_assert(src.depth() == 2); + + filepath dst(MDS_INO_MDSDIR(to)); + dst.push_dentry(src[0]); + dst.push_dentry(src[1]); + + auto req = MClientRequest::create(CEPH_MDS_OP_RENAME); + req->set_filepath(dst); + req->set_filepath2(src); + req->set_tid(mds->issue_tid()); + + mds->send_message_mds(req, to); +} + +StrayManager::StrayManager(MDSRank *mds, PurgeQueue &purge_queue_) + : delayed_eval_stray(member_offset(CDentry, item_stray)), + mds(mds), logger(NULL), started(false), num_strays(0), + num_strays_delayed(0), num_strays_enqueuing(0), + purge_queue(purge_queue_) +{ + ceph_assert(mds != NULL); +} + +void StrayManager::truncate(CDentry *dn) +{ + const CDentry::linkage_t *dnl = dn->get_projected_linkage(); + const CInode *in = dnl->get_inode(); + ceph_assert(in); + dout(10) << __func__ << ": " << *dn << " " << *in << dendl; + ceph_assert(!dn->is_replicated()); + + const SnapRealm *realm = in->find_snaprealm(); + ceph_assert(realm); + dout(10) << " realm " << *realm << dendl; + const SnapContext *snapc = &realm->get_snap_context(); + + uint64_t to = in->inode.get_max_size(); + to = std::max(in->inode.size, to); + // when truncating a file, the filer does not delete stripe objects that are + // truncated to zero. 
so we need to purge stripe objects up to the max size + // the file has ever been. + to = std::max(in->inode.max_size_ever, to); + + ceph_assert(to > 0); + + PurgeItem item; + item.action = PurgeItem::TRUNCATE_FILE; + item.ino = in->inode.ino; + item.layout = in->inode.layout; + item.snapc = *snapc; + item.size = to; + item.stamp = ceph_clock_now(); + + purge_queue.push(item, new C_IO_PurgeStrayPurged( + this, dn, true)); +} + +void StrayManager::_truncate_stray_logged(CDentry *dn, LogSegment *ls) +{ + CInode *in = dn->get_projected_linkage()->get_inode(); + + dout(10) << __func__ << ": " << *dn << " " << *in << dendl; + + in->pop_and_dirty_projected_inode(ls); + + in->state_clear(CInode::STATE_PURGING); + dn->state_clear(CDentry::STATE_PURGING | CDentry::STATE_PURGINGPINNED); + dn->put(CDentry::PIN_PURGING); + + eval_stray(dn); + + if (!dn->state_test(CDentry::STATE_PURGING) && mds->is_stopping()) + mds->mdcache->shutdown_export_stray_finish(in->ino()); +} + diff --git a/src/mds/StrayManager.h b/src/mds/StrayManager.h new file mode 100644 index 00000000..53e42110 --- /dev/null +++ b/src/mds/StrayManager.h @@ -0,0 +1,197 @@ +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 Red Hat + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef STRAY_MANAGER_H +#define STRAY_MANAGER_H + +#include "include/elist.h" +#include <list> +#include "mds/PurgeQueue.h" + +class MDSRank; +class PerfCounters; +class CInode; +class CDentry; + +class StrayManager +{ + protected: + // Has passed through eval_stray and still has refs + elist<CDentry*> delayed_eval_stray; + + // strays that have been trimmed from cache + std::set<std::string> trimmed_strays; + + // Global references for doing I/O + MDSRank *mds; + PerfCounters *logger; + + bool started; + + // Stray dentries for this rank (including those not in cache) + uint64_t num_strays; + + // Stray dentries currently on the delayed_eval_stray list + uint64_t num_strays_delayed; + + // Entries that have entered enqueue() but not been persistently + // recorded by PurgeQueue yet + uint64_t num_strays_enqueuing; + + PurgeQueue &purge_queue; + + void truncate(CDentry *dn); + + /** + * Purge a dentry from a stray directory. This function + * is called once eval_stray is satisfied and StrayManager + * throttling is also satisfied. There is no going back + * at this stage! + */ + void purge(CDentry *dn); + + /** + * Completion handler for a Filer::purge on a stray inode. + */ + void _purge_stray_purged(CDentry *dn, bool only_head); + + void _purge_stray_logged(CDentry *dn, version_t pdv, LogSegment *ls); + + /** + * Callback: we have logged the update to an inode's metadata + * reflecting its newly-zeroed length. + */ + void _truncate_stray_logged(CDentry *dn, LogSegment *ls); + + friend class StrayManagerIOContext; + friend class StrayManagerLogContext; + friend class StrayManagerContext; + + friend class C_StraysFetched; + friend class C_OpenSnapParents; + friend class C_PurgeStrayLogged; + friend class C_TruncateStrayLogged; + friend class C_IO_PurgeStrayPurged; + + + // Call this on a dentry that has been identified as + // eligible for purging. It will be passed on to PurgeQueue.
+ void enqueue(CDentry *dn, bool trunc); + + // Final part of enqueue() which we may have to retry + // after opening snap parents. + void _enqueue(CDentry *dn, bool trunc); + + + /** + * When hard links exist to an inode whose primary dentry + * is unlinked, the inode gets a stray primary dentry. + * + * We may later "reintegrate" the inode into a remaining + * non-stray dentry (one of what was previously a remote + * dentry) by issuing a rename from the stray to the other + * dentry. + */ + void reintegrate_stray(CDentry *dn, CDentry *rlink); + + /** + * Evaluate a stray dentry for purging or reintegration. + * + * purging: If the inode has no linkage, and no more references, then + * we may decide to purge it. + * + * reintegration: If the inode still has linkage, then it means someone else + * (a hard link) is still referring to it, and we should + * think about reintegrating that inode into the remote dentry. + * + * @returns true if the dentry will be purged (caller should never + * take more refs after this happens), else false. + */ + bool _eval_stray(CDentry *dn); + + void _eval_stray_remote(CDentry *stray_dn, CDentry *remote_dn); + + // My public interface is for consumption by MDCache + public: + explicit StrayManager(MDSRank *mds, PurgeQueue &purge_queue_); + void set_logger(PerfCounters *l) {logger = l;} + void activate(); + + bool eval_stray(CDentry *dn); + + void set_num_strays(uint64_t num); + uint64_t get_num_strays() const { return num_strays; } + + /** + * Queue dentry for later evaluation. (evaluate it while not in the + * middle of another metadata operation) + */ + void queue_delayed(CDentry *dn); + + /** + * Eval strays in the delayed_eval_stray list + */ + void advance_delayed(); + + /** + * Remote dentry potentially points to a stray. 
When it is touched, + * call in here to evaluate it for migration (move a stray residing + * on another MDS to this MDS) or reintegration (move a stray dentry's + * inode into a non-stray hardlink dentry and clean up the stray). + * + * @param stray_dn a stray dentry whose inode has been referenced + * by a remote dentry + * @param remote_dn (optional) which remote dentry was touched + * in an operation that led us here: this is used + * as a hint for which remote to reintegrate into + * if there are multiple remotes. + */ + void eval_remote(CDentry *remote_dn); + + /** + * Given a dentry within one of my stray directories, + * send it off to a stray directory in another MDS. + * + * This is for use: + * * Case A: when shutting down a rank, we migrate strays + * away from ourselves rather than waiting for purge + * * Case B: when a client request has a trace that refers to + * a stray inode on another MDS, we migrate that inode from + * there to here, in order that we can later re-integrate it + * here. + * + * In case B, the receiver should be calling into eval_stray + * on completion of mv (i.e. inode put), resulting in a subsequent + * reintegration. + */ + void migrate_stray(CDentry *dn, mds_rank_t dest); + + /** + * Update stats to reflect a newly created stray dentry. Needed + * because stats on strays live here, but creation happens + * in Server or MDCache. For our purposes "creation" includes + * loading a stray from a dirfrag and migrating a stray from + * another MDS, in addition to creations per-se. + */ + void notify_stray_created(); + + /** + * Update stats to reflect a removed stray dentry. Needed because + * stats on strays live here, but removal happens in Server or + * MDCache. Also includes migration (rename) of strays from + * this MDS to another MDS. 
+ */ + void notify_stray_removed(); +}; + +#endif // STRAY_MANAGER_H diff --git a/src/mds/balancers/greedyspill.lua b/src/mds/balancers/greedyspill.lua new file mode 100644 index 00000000..20576cdb --- /dev/null +++ b/src/mds/balancers/greedyspill.lua @@ -0,0 +1,49 @@ +local metrics = {"auth.meta_load", "all.meta_load", "req_rate", "queue_len", "cpu_load_avg"} + +-- Metric for balancing is the workload; also dumps metrics +local function mds_load() + for rank, mds in pairs(mds) do + local s = "MDS"..rank..": < " + for _, metric in ipairs(metrics) do + s = s..metric.."="..mds[metric].." " + end + mds.load = mds["all.meta_load"] + BAL_LOG(5, s.."> load="..mds.load) + end +end + +-- Shed load when you have load and your neighbor doesn't +local function when() + if not mds[whoami+1] then + -- i'm the last rank + BAL_LOG(5, "when: not migrating! I am the last rank, nothing to spill to."); + return false + end + my_load = mds[whoami]["load"] + his_load = mds[whoami+1]["load"] + if my_load > 0.01 and his_load < 0.01 then + BAL_LOG(5, "when: migrating! my_load="..my_load.." hisload="..his_load) + return true + end + BAL_LOG(5, "when: not migrating! my_load="..my_load.." 
hisload="..his_load) + return false +end + +-- Shed half your load to your neighbor +-- the mds/targets tables are keyed directly by rank, so the next rank's key is +-- whoami+1 (matching the mds[whoami+1] check in when()) +local function where(targets) + targets[whoami+1] = mds[whoami]["load"]/2 + return targets +end + +local targets = {} +for rank in pairs(mds) do + targets[rank] = 0 +end + +mds_load() +if when() then + where(targets) +end + +return targets diff --git a/src/mds/cephfs_features.h b/src/mds/cephfs_features.h new file mode 100644 index 00000000..66752af2 --- /dev/null +++ b/src/mds/cephfs_features.h @@ -0,0 +1,51 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPHFS_FEATURES_H +#define CEPHFS_FEATURES_H + +// Please add feature bits for later ceph releases and update +// Server::update_required_client_features(). + +// The first 5 bits are reserved for old ceph releases.
+#define CEPHFS_FEATURE_JEWEL 5 +#define CEPHFS_FEATURE_KRAKEN 6 +#define CEPHFS_FEATURE_LUMINOUS 7 +#define CEPHFS_FEATURE_MIMIC 8 +#define CEPHFS_FEATURE_REPLY_ENCODING 9 +#define CEPHFS_FEATURE_RECLAIM_CLIENT 10 +#define CEPHFS_FEATURE_LAZY_CAP_WANTED 11 +#define CEPHFS_FEATURE_MULTI_RECONNECT 12 +#define CEPHFS_FEATURE_NAUTILUS 12 + +#define CEPHFS_FEATURES_ALL { \ + 0, 1, 2, 3, 4, \ + CEPHFS_FEATURE_JEWEL, \ + CEPHFS_FEATURE_KRAKEN, \ + CEPHFS_FEATURE_LUMINOUS, \ + CEPHFS_FEATURE_MIMIC, \ + CEPHFS_FEATURE_REPLY_ENCODING, \ + CEPHFS_FEATURE_RECLAIM_CLIENT, \ + CEPHFS_FEATURE_LAZY_CAP_WANTED, \ + CEPHFS_FEATURE_MULTI_RECONNECT, \ + CEPHFS_FEATURE_NAUTILUS, \ +} + +#define CEPHFS_FEATURES_MDS_SUPPORTED CEPHFS_FEATURES_ALL +#define CEPHFS_FEATURES_MDS_REQUIRED {} + +#define CEPHFS_FEATURES_CLIENT_SUPPORTED CEPHFS_FEATURES_ALL +#define CEPHFS_FEATURES_CLIENT_REQUIRED {} + +#endif diff --git a/src/mds/events/ECommitted.h b/src/mds/events/ECommitted.h new file mode 100644 index 00000000..0459f9d0 --- /dev/null +++ b/src/mds/events/ECommitted.h @@ -0,0 +1,43 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef CEPH_MDS_ECOMMITTED_H +#define CEPH_MDS_ECOMMITTED_H + +#include "../LogEvent.h" +#include "EMetaBlob.h" + +class ECommitted : public LogEvent { +public: + metareqid_t reqid; + + ECommitted() : LogEvent(EVENT_COMMITTED) { } + explicit ECommitted(metareqid_t r) : + LogEvent(EVENT_COMMITTED), reqid(r) { } + + void print(ostream& out) const override { + out << "ECommitted " << reqid; + } + + void encode(bufferlist &bl, uint64_t features) const override; + void decode(bufferlist::const_iterator &bl) override; + void dump(Formatter *f) const override; + static void generate_test_instances(list<ECommitted*>& ls); + + void update_segment() override {} + void replay(MDSRank *mds) override; +}; +WRITE_CLASS_ENCODER_FEATURES(ECommitted) + +#endif diff --git a/src/mds/events/EExport.h b/src/mds/events/EExport.h new file mode 100644 index 00000000..94e39a84 --- /dev/null +++ b/src/mds/events/EExport.h @@ -0,0 +1,58 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef CEPH_EEXPORT_H +#define CEPH_EEXPORT_H + +#include "common/config.h" +#include "include/types.h" + +#include "../MDSRank.h" + +#include "EMetaBlob.h" +#include "../LogEvent.h" + +class EExport : public LogEvent { +public: + EMetaBlob metablob; // exported dir +protected: + dirfrag_t base; + set<dirfrag_t> bounds; + mds_rank_t target; + +public: + EExport() : + LogEvent(EVENT_EXPORT), target(MDS_RANK_NONE) { } + EExport(MDLog *mdlog, CDir *dir, mds_rank_t t) : + LogEvent(EVENT_EXPORT), + base(dir->dirfrag()), target(t) { } + + set<dirfrag_t> &get_bounds() { return bounds; } + + void print(ostream& out) const override { + out << "EExport " << base << " to mds." << target << " " << metablob; + } + + EMetaBlob *get_metablob() override { return &metablob; } + + void encode(bufferlist& bl, uint64_t features) const override; + void decode(bufferlist::const_iterator &bl) override; + void dump(Formatter *f) const override; + static void generate_test_instances(list<EExport*>& ls); + void replay(MDSRank *mds) override; + +}; +WRITE_CLASS_ENCODER_FEATURES(EExport) + +#endif diff --git a/src/mds/events/EFragment.h b/src/mds/events/EFragment.h new file mode 100644 index 00000000..90d9238b --- /dev/null +++ b/src/mds/events/EFragment.h @@ -0,0 +1,81 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef CEPH_MDS_EFRAGMENT_H +#define CEPH_MDS_EFRAGMENT_H + +#include "../LogEvent.h" +#include "EMetaBlob.h" + +struct dirfrag_rollback { + fnode_t fnode; + dirfrag_rollback() { } + void encode(bufferlist& bl) const; + void decode(bufferlist::const_iterator& bl); +}; +WRITE_CLASS_ENCODER(dirfrag_rollback) + +class EFragment : public LogEvent { +public: + EMetaBlob metablob; + __u8 op{0}; + inodeno_t ino; + frag_t basefrag; + __s32 bits{0}; // positive for split (from basefrag), negative for merge (to basefrag) + frag_vec_t orig_frags; + bufferlist rollback; + + EFragment() : LogEvent(EVENT_FRAGMENT) { } + EFragment(MDLog *mdlog, int o, dirfrag_t df, int b) : + LogEvent(EVENT_FRAGMENT), + op(o), ino(df.ino), basefrag(df.frag), bits(b) { } + + void print(ostream& out) const override { + out << "EFragment " << op_name(op) << " " << ino << " " << basefrag << " by " << bits << " " << metablob; + } + + enum { + OP_PREPARE = 1, + OP_COMMIT = 2, + OP_ROLLBACK = 3, + OP_FINISH = 4 // finish deleting orphan dirfrags + }; + static std::string_view op_name(int o) { + switch (o) { + case OP_PREPARE: return "prepare"; + case OP_COMMIT: return "commit"; + case OP_ROLLBACK: return "rollback"; + case OP_FINISH: return "finish"; + default: return "???"; + } + } + + void add_orig_frag(frag_t df, dirfrag_rollback *drb=NULL) { + using ceph::encode; + orig_frags.push_back(df); + if (drb) + encode(*drb, rollback); + } + + EMetaBlob *get_metablob() override { return &metablob; } + + void encode(bufferlist &bl, uint64_t features) const override; + void decode(bufferlist::const_iterator &bl) override; + void dump(Formatter *f) const override; + static void generate_test_instances(list<EFragment*>& ls); + void replay(MDSRank *mds) override; +}; +WRITE_CLASS_ENCODER_FEATURES(EFragment) + +#endif diff --git a/src/mds/events/EImportFinish.h b/src/mds/events/EImportFinish.h new file mode 100644 index 00000000..699c0527 --- /dev/null +++ b/src/mds/events/EImportFinish.h @@ -0,0 
+1,53 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_EIMPORTFINISH_H +#define CEPH_EIMPORTFINISH_H + +#include "common/config.h" +#include "include/types.h" + +#include "../MDSRank.h" +#include "../LogEvent.h" + +class EImportFinish : public LogEvent { + protected: + dirfrag_t base; // imported dir + bool success; + + public: + EImportFinish(CDir *dir, bool s) : LogEvent(EVENT_IMPORTFINISH), + base(dir->dirfrag()), + success(s) { } + EImportFinish() : LogEvent(EVENT_IMPORTFINISH), base(), success(false) { } + + void print(ostream& out) const override { + out << "EImportFinish " << base; + if (success) + out << " success"; + else + out << " failed"; + } + + void encode(bufferlist& bl, uint64_t features) const override; + void decode(bufferlist::const_iterator &bl) override; + void dump(Formatter *f) const override; + static void generate_test_instances(list<EImportFinish*>& ls); + + void replay(MDSRank *mds) override; + +}; +WRITE_CLASS_ENCODER_FEATURES(EImportFinish) + +#endif diff --git a/src/mds/events/EImportStart.h b/src/mds/events/EImportStart.h new file mode 100644 index 00000000..276469e8 --- /dev/null +++ b/src/mds/events/EImportStart.h @@ -0,0 +1,61 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. 
See file COPYING. + * + */ + +#ifndef CEPH_EIMPORTSTART_H +#define CEPH_EIMPORTSTART_H + +#include "common/config.h" +#include "include/types.h" + +class MDLog; +class MDSRank; + +#include "EMetaBlob.h" +#include "../LogEvent.h" + +class EImportStart : public LogEvent { +protected: + dirfrag_t base; + vector<dirfrag_t> bounds; + mds_rank_t from; + +public: + EMetaBlob metablob; + bufferlist client_map; // encoded map<__u32,entity_inst_t> + version_t cmapv{0}; + + EImportStart(MDLog *log, dirfrag_t di, const vector<dirfrag_t>& b, mds_rank_t f) : + LogEvent(EVENT_IMPORTSTART), + base(di), bounds(b), from(f) { } + EImportStart() : + LogEvent(EVENT_IMPORTSTART), from(MDS_RANK_NONE) { } + + void print(ostream& out) const override { + out << "EImportStart " << base << " from mds." << from << " " << metablob; + } + + EMetaBlob *get_metablob() override { return &metablob; } + + void encode(bufferlist &bl, uint64_t features) const override; + void decode(bufferlist::const_iterator &bl) override; + void dump(Formatter *f) const override; + static void generate_test_instances(list<EImportStart*>& ls); + + void update_segment() override; + void replay(MDSRank *mds) override; + +}; +WRITE_CLASS_ENCODER_FEATURES(EImportStart) + +#endif diff --git a/src/mds/events/EMetaBlob.h b/src/mds/events/EMetaBlob.h new file mode 100644 index 00000000..ac09a8fe --- /dev/null +++ b/src/mds/events/EMetaBlob.h @@ -0,0 +1,600 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef CEPH_MDS_EMETABLOB_H +#define CEPH_MDS_EMETABLOB_H + +#include <string_view> + +#include <stdlib.h> + +#include "../CInode.h" +#include "../CDir.h" +#include "../CDentry.h" +#include "../LogSegment.h" + +#include "include/interval_set.h" + +class MDSRank; +class MDLog; +class LogSegment; +struct MDSlaveUpdate; + +/* + * a bunch of metadata in the journal + */ + +/* notes: + * + * - make sure you adjust the inode.version for any modified inode you + * journal. CDir and CDentry maintain a projected_version, but CInode + * doesn't, since the journaled inode usually has to be modified + * manually anyway (to delay the change in the MDS's cache until after + * it is journaled). + * + */ + + +class EMetaBlob { + +public: + /* fullbit - a regular dentry + inode + * + * We encode this one a bit weirdly, just because (also, it's marginally faster + * on multiple encodes, which I think can happen): + * Encode a bufferlist on struct creation with all data members, without a struct_v. + * When encode is called, encode struct_v and then append the bufferlist. + * Decode straight into the appropriate variables. + * + * So, if you add members, encode them in the constructor and then change + * the struct_v in the encode function! 
+ */ + struct fullbit { + static const int STATE_DIRTY = (1<<0); + static const int STATE_DIRTYPARENT = (1<<1); + static const int STATE_DIRTYPOOL = (1<<2); + static const int STATE_NEED_SNAPFLUSH = (1<<3); + std::string dn; // dentry + snapid_t dnfirst, dnlast; + version_t dnv{0}; + CInode::mempool_inode inode; // if it's not XXX should not be part of mempool; wait for std::pmr to simplify + fragtree_t dirfragtree; + CInode::mempool_xattr_map xattrs; + std::string symlink; + snapid_t oldest_snap; + bufferlist snapbl; + __u8 state{0}; + CInode::mempool_old_inode_map old_inodes; // XXX should not be part of mempool; wait for std::pmr to simplify + + fullbit(std::string_view d, snapid_t df, snapid_t dl, + version_t v, const CInode::mempool_inode& i, const fragtree_t &dft, + const CInode::mempool_xattr_map &xa, std::string_view sym, + snapid_t os, const bufferlist &sbl, __u8 st, + const CInode::mempool_old_inode_map *oi = NULL) : + dn(d), dnfirst(df), dnlast(dl), dnv(v), inode(i), xattrs(xa), + oldest_snap(os), state(st) + { + if (i.is_symlink()) + symlink = sym; + if (i.is_dir()) + dirfragtree = dft; + if (oi) + old_inodes = *oi; + snapbl = sbl; + } + explicit fullbit(bufferlist::const_iterator &p) { + decode(p); + } + fullbit() {} + fullbit(const fullbit&) = delete; + ~fullbit() {} + fullbit& operator=(const fullbit&) = delete; + + void encode(bufferlist& bl, uint64_t features) const; + void decode(bufferlist::const_iterator &bl); + void dump(Formatter *f) const; + static void generate_test_instances(list<EMetaBlob::fullbit*>& ls); + + void update_inode(MDSRank *mds, CInode *in); + bool is_dirty() const { return (state & STATE_DIRTY); } + bool is_dirty_parent() const { return (state & STATE_DIRTYPARENT); } + bool is_dirty_pool() const { return (state & STATE_DIRTYPOOL); } + bool need_snapflush() const { return (state & STATE_NEED_SNAPFLUSH); } + + void print(ostream& out) const { + out << " fullbit dn " << dn << " [" << dnfirst << "," << dnlast << "] dnv " << dnv + 
<< " inode " << inode.ino + << " state=" << state << std::endl; + } + string state_string() const { + string state_string; + bool marked_already = false; + if (is_dirty()) { + state_string.append("dirty"); + marked_already = true; + } + if (is_dirty_parent()) { + state_string.append(marked_already ? "+dirty_parent" : "dirty_parent"); + if (is_dirty_pool()) + state_string.append("+dirty_pool"); + } + return state_string; + } + }; + WRITE_CLASS_ENCODER_FEATURES(fullbit) + + /* remotebit - a dentry + remote inode link (i.e. just an ino) + */ + struct remotebit { + std::string dn; + snapid_t dnfirst, dnlast; + version_t dnv; + inodeno_t ino; + unsigned char d_type; + bool dirty; + + remotebit(std::string_view d, snapid_t df, snapid_t dl, version_t v, inodeno_t i, unsigned char dt, bool dr) : + dn(d), dnfirst(df), dnlast(dl), dnv(v), ino(i), d_type(dt), dirty(dr) { } + explicit remotebit(bufferlist::const_iterator &p) { decode(p); } + remotebit(): dnfirst(0), dnlast(0), dnv(0), ino(0), + d_type('\0'), dirty(false) {} + + void encode(bufferlist& bl) const; + void decode(bufferlist::const_iterator &bl); + void print(ostream& out) const { + out << " remotebit dn " << dn << " [" << dnfirst << "," << dnlast << "] dnv " << dnv + << " ino " << ino + << " dirty=" << dirty << std::endl; + } + void dump(Formatter *f) const; + static void generate_test_instances(list<remotebit*>& ls); + }; + WRITE_CLASS_ENCODER(remotebit) + + /* + * nullbit - a null dentry + */ + struct nullbit { + std::string dn; + snapid_t dnfirst, dnlast; + version_t dnv; + bool dirty; + + nullbit(std::string_view d, snapid_t df, snapid_t dl, version_t v, bool dr) : + dn(d), dnfirst(df), dnlast(dl), dnv(v), dirty(dr) { } + explicit nullbit(bufferlist::const_iterator &p) { decode(p); } + nullbit(): dnfirst(0), dnlast(0), dnv(0), dirty(false) {} + + void encode(bufferlist& bl) const; + void decode(bufferlist::const_iterator &bl); + void dump(Formatter *f) const; + static void 
generate_test_instances(list<nullbit*>& ls); + void print(ostream& out) const { + out << " nullbit dn " << dn << " [" << dnfirst << "," << dnlast << "] dnv " << dnv + << " dirty=" << dirty << std::endl; + } + }; + WRITE_CLASS_ENCODER(nullbit) + + + /* dirlump - contains metadata for any dir we have contents for. + */ +public: + struct dirlump { + static const int STATE_COMPLETE = (1<<1); + static const int STATE_DIRTY = (1<<2); // dirty due to THIS journal item, that is! + static const int STATE_NEW = (1<<3); // new directory + static const int STATE_IMPORTING = (1<<4); // importing directory + static const int STATE_DIRTYDFT = (1<<5); // dirty dirfragtree + + //version_t dirv; + fnode_t fnode; + __u32 state; + __u32 nfull, nremote, nnull; + + private: + mutable bufferlist dnbl; + mutable bool dn_decoded; + mutable list<fullbit> dfull; + mutable vector<remotebit> dremote; + mutable vector<nullbit> dnull; + + public: + dirlump() : state(0), nfull(0), nremote(0), nnull(0), dn_decoded(true) { } + dirlump(const dirlump&) = delete; + dirlump& operator=(const dirlump&) = delete; + + bool is_complete() const { return state & STATE_COMPLETE; } + void mark_complete() { state |= STATE_COMPLETE; } + bool is_dirty() const { return state & STATE_DIRTY; } + void mark_dirty() { state |= STATE_DIRTY; } + bool is_new() const { return state & STATE_NEW; } + void mark_new() { state |= STATE_NEW; } + bool is_importing() { return state & STATE_IMPORTING; } + void mark_importing() { state |= STATE_IMPORTING; } + bool is_dirty_dft() { return state & STATE_DIRTYDFT; } + void mark_dirty_dft() { state |= STATE_DIRTYDFT; } + + const list<fullbit> &get_dfull() const { return dfull; } + list<fullbit> &_get_dfull() { return dfull; } + const vector<remotebit> &get_dremote() const { return dremote; } + const vector<nullbit> &get_dnull() const { return dnull; } + + template< class... Args> + void add_dfull(Args&&... 
args) { + dfull.emplace_back(std::forward<Args>(args)...); + } + template< class... Args> + void add_dremote(Args&&... args) { + dremote.emplace_back(std::forward<Args>(args)...); + } + template< class... Args> + void add_dnull(Args&&... args) { + dnull.emplace_back(std::forward<Args>(args)...); + } + + void print(dirfrag_t dirfrag, ostream& out) const { + out << "dirlump " << dirfrag << " v " << fnode.version + << " state " << state + << " num " << nfull << "/" << nremote << "/" << nnull + << std::endl; + _decode_bits(); + for (const auto& p : dfull) + p.print(out); + for (const auto& p : dremote) + p.print(out); + for (const auto& p : dnull) + p.print(out); + } + + string state_string() const { + string state_string; + bool marked_already = false; + if (is_complete()) { + state_string.append("complete"); + marked_already = true; + } + if (is_dirty()) { + state_string.append(marked_already ? "+dirty" : "dirty"); + marked_already = true; + } + if (is_new()) { + state_string.append(marked_already ? "+new" : "new"); + } + return state_string; + } + + // if this changes, update the versioning in encode for it! + void _encode_bits(uint64_t features) const { + using ceph::encode; + if (!dn_decoded) return; + encode(dfull, dnbl, features); + encode(dremote, dnbl); + encode(dnull, dnbl); + } + void _decode_bits() const { + using ceph::decode; + if (dn_decoded) return; + auto p = dnbl.cbegin(); + decode(dfull, p); + decode(dremote, p); + decode(dnull, p); + dn_decoded = true; + } + + void encode(bufferlist& bl, uint64_t features) const; + void decode(bufferlist::const_iterator &bl); + void dump(Formatter *f) const; + static void generate_test_instances(list<dirlump*>& ls); + }; + WRITE_CLASS_ENCODER_FEATURES(dirlump) + + // my lumps. preserve the order we added them in a list. 
+ vector<dirfrag_t> lump_order; + map<dirfrag_t, dirlump> lump_map; + list<fullbit> roots; +public: + vector<pair<__u8,version_t> > table_tids; // tableclient transactions + + inodeno_t opened_ino; +public: + inodeno_t renamed_dirino; + vector<frag_t> renamed_dir_frags; +private: + + // ino (pre)allocation. may involve both inotable AND session state. + version_t inotablev, sessionmapv; + inodeno_t allocated_ino; // inotable + interval_set<inodeno_t> preallocated_inos; // inotable + session + inodeno_t used_preallocated_ino; // session + entity_name_t client_name; // session + + // inodes i've truncated + vector<inodeno_t> truncate_start; // start truncate + map<inodeno_t, LogSegment::seq_t> truncate_finish; // finished truncate (started in segment blah) + +public: + vector<inodeno_t> destroyed_inodes; +private: + + // idempotent op(s) + vector<pair<metareqid_t,uint64_t> > client_reqs; + vector<pair<metareqid_t,uint64_t> > client_flushes; + + public: + void encode(bufferlist& bl, uint64_t features) const; + void decode(bufferlist::const_iterator& bl); + void get_inodes(std::set<inodeno_t> &inodes) const; + void get_paths(std::vector<std::string> &paths) const; + void get_dentries(std::map<dirfrag_t, std::set<std::string> > &dentries) const; + entity_name_t get_client_name() const {return client_name;} + + void dump(Formatter *f) const; + static void generate_test_instances(list<EMetaBlob*>& ls); + // soft stateadd + uint64_t last_subtree_map; + uint64_t event_seq; + + // for replay, in certain cases + //LogSegment *_segment; + + EMetaBlob() : opened_ino(0), renamed_dirino(0), + inotablev(0), sessionmapv(0), allocated_ino(0), + last_subtree_map(0), event_seq(0) + {} + EMetaBlob(const EMetaBlob&) = delete; + ~EMetaBlob() { } + EMetaBlob& operator=(const EMetaBlob&) = delete; + + void print(ostream& out) { + for (const auto &p : lump_order) + lump_map[p].print(p, out); + } + + void add_client_req(metareqid_t r, uint64_t tid=0) { + 
client_reqs.push_back(pair<metareqid_t,uint64_t>(r, tid)); + } + void add_client_flush(metareqid_t r, uint64_t tid=0) { + client_flushes.push_back(pair<metareqid_t,uint64_t>(r, tid)); + } + + void add_table_transaction(int table, version_t tid) { + table_tids.push_back(pair<__u8, version_t>(table, tid)); + } + + void add_opened_ino(inodeno_t ino) { + ceph_assert(!opened_ino); + opened_ino = ino; + } + + void set_ino_alloc(inodeno_t alloc, + inodeno_t used_prealloc, + interval_set<inodeno_t>& prealloc, + entity_name_t client, + version_t sv, version_t iv) { + allocated_ino = alloc; + used_preallocated_ino = used_prealloc; + preallocated_inos = prealloc; + client_name = client; + sessionmapv = sv; + inotablev = iv; + } + + void add_truncate_start(inodeno_t ino) { + truncate_start.push_back(ino); + } + void add_truncate_finish(inodeno_t ino, uint64_t segoff) { + truncate_finish[ino] = segoff; + } + + bool rewrite_truncate_finish(MDSRank const *mds, std::map<uint64_t, uint64_t> const &old_to_new); + + void add_destroyed_inode(inodeno_t ino) { + destroyed_inodes.push_back(ino); + } + + void add_null_dentry(CDentry *dn, bool dirty) { + add_null_dentry(add_dir(dn->get_dir(), false), dn, dirty); + } + void add_null_dentry(dirlump& lump, CDentry *dn, bool dirty) { + // add the dir + lump.nnull++; + lump.add_dnull(dn->get_name(), dn->first, dn->last, + dn->get_projected_version(), dirty); + } + + void add_remote_dentry(CDentry *dn, bool dirty) { + add_remote_dentry(add_dir(dn->get_dir(), false), dn, dirty, 0, 0); + } + void add_remote_dentry(CDentry *dn, bool dirty, inodeno_t rino, int rdt) { + add_remote_dentry(add_dir(dn->get_dir(), false), dn, dirty, rino, rdt); + } + void add_remote_dentry(dirlump& lump, CDentry *dn, bool dirty, + inodeno_t rino=0, unsigned char rdt=0) { + if (!rino) { + rino = dn->get_projected_linkage()->get_remote_ino(); + rdt = dn->get_projected_linkage()->get_remote_d_type(); + } + lump.nremote++; + lump.add_dremote(dn->get_name(), dn->first, 
dn->last, + dn->get_projected_version(), rino, rdt, dirty); + } + + // return remote pointer to to-be-journaled inode + void add_primary_dentry(CDentry *dn, CInode *in, bool dirty, + bool dirty_parent=false, bool dirty_pool=false, + bool need_snapflush=false) { + __u8 state = 0; + if (dirty) state |= fullbit::STATE_DIRTY; + if (dirty_parent) state |= fullbit::STATE_DIRTYPARENT; + if (dirty_pool) state |= fullbit::STATE_DIRTYPOOL; + if (need_snapflush) state |= fullbit::STATE_NEED_SNAPFLUSH; + add_primary_dentry(add_dir(dn->get_dir(), false), dn, in, state); + } + void add_primary_dentry(dirlump& lump, CDentry *dn, CInode *in, __u8 state) { + if (!in) + in = dn->get_projected_linkage()->get_inode(); + + // make note of where this inode was last journaled + in->last_journaled = event_seq; + //cout << "journaling " << in->inode.ino << " at " << my_offset << std::endl; + + const auto pi = in->get_projected_inode(); + if ((state & fullbit::STATE_DIRTY) && pi->is_backtrace_updated()) + state |= fullbit::STATE_DIRTYPARENT; + + bufferlist snapbl; + const sr_t *sr = in->get_projected_srnode(); + if (sr) + sr->encode(snapbl); + + lump.nfull++; + lump.add_dfull(dn->get_name(), dn->first, dn->last, dn->get_projected_version(), + *pi, in->dirfragtree, *in->get_projected_xattrs(), in->symlink, + in->oldest_snap, snapbl, state, &in->old_inodes); + } + + // convenience: primary or remote? figure it out. 
+ void add_dentry(CDentry *dn, bool dirty) { + dirlump& lump = add_dir(dn->get_dir(), false); + add_dentry(lump, dn, dirty, false, false); + } + void add_import_dentry(CDentry *dn) { + bool dirty_parent = false; + bool dirty_pool = false; + if (dn->get_linkage()->is_primary()) { + dirty_parent = dn->get_linkage()->get_inode()->is_dirty_parent(); + dirty_pool = dn->get_linkage()->get_inode()->is_dirty_pool(); + } + dirlump& lump = add_dir(dn->get_dir(), false); + add_dentry(lump, dn, dn->is_dirty(), dirty_parent, dirty_pool); + } + void add_dentry(dirlump& lump, CDentry *dn, bool dirty, bool dirty_parent, bool dirty_pool) { + // primary or remote + if (dn->get_projected_linkage()->is_remote()) { + add_remote_dentry(dn, dirty); + return; + } else if (dn->get_projected_linkage()->is_null()) { + add_null_dentry(dn, dirty); + return; + } + ceph_assert(dn->get_projected_linkage()->is_primary()); + add_primary_dentry(dn, 0, dirty, dirty_parent, dirty_pool); + } + + void add_root(bool dirty, CInode *in) { + in->last_journaled = event_seq; + //cout << "journaling " << in->inode.ino << " at " << my_offset << std::endl; + + const auto& pi = *(in->get_projected_inode()); + const auto& pdft = in->dirfragtree; + const auto& px = *(in->get_projected_xattrs()); + + bufferlist snapbl; + const sr_t *sr = in->get_projected_srnode(); + if (sr) + sr->encode(snapbl); + + for (auto p = roots.begin(); p != roots.end(); ++p) { + if (p->inode.ino == in->ino()) { + roots.erase(p); + break; + } + } + + string empty; + roots.emplace_back(empty, in->first, in->last, 0, pi, pdft, px, in->symlink, + in->oldest_snap, snapbl, (dirty ? 
fullbit::STATE_DIRTY : 0), + &in->old_inodes); + } + + dirlump& add_dir(CDir *dir, bool dirty, bool complete=false) { + return add_dir(dir->dirfrag(), dir->get_projected_fnode(), dir->get_projected_version(), + dirty, complete); + } + dirlump& add_new_dir(CDir *dir) { + return add_dir(dir->dirfrag(), dir->get_projected_fnode(), dir->get_projected_version(), + true, true, true); // dirty AND complete AND new + } + dirlump& add_import_dir(CDir *dir) { + // dirty=false would be okay in some cases + return add_dir(dir->dirfrag(), dir->get_projected_fnode(), dir->get_projected_version(), + dir->is_dirty(), dir->is_complete(), false, true, dir->is_dirty_dft()); + } + dirlump& add_fragmented_dir(CDir *dir, bool dirty, bool dirtydft) { + return add_dir(dir->dirfrag(), dir->get_projected_fnode(), dir->get_projected_version(), + dirty, false, false, false, dirtydft); + } + dirlump& add_dir(dirfrag_t df, const fnode_t *pf, version_t pv, bool dirty, + bool complete=false, bool isnew=false, + bool importing=false, bool dirty_dft=false) { + if (lump_map.count(df) == 0) + lump_order.push_back(df); + + dirlump& l = lump_map[df]; + l.fnode = *pf; + l.fnode.version = pv; + if (complete) l.mark_complete(); + if (dirty) l.mark_dirty(); + if (isnew) l.mark_new(); + if (importing) l.mark_importing(); + if (dirty_dft) l.mark_dirty_dft(); + return l; + } + + static const int TO_AUTH_SUBTREE_ROOT = 0; // default. 
+ static const int TO_ROOT = 1; + + void add_dir_context(CDir *dir, int mode = TO_AUTH_SUBTREE_ROOT); + + bool empty() { + return roots.empty() && lump_order.empty() && table_tids.empty() && + truncate_start.empty() && truncate_finish.empty() && + destroyed_inodes.empty() && client_reqs.empty() && + opened_ino == 0 && inotablev == 0 && sessionmapv == 0; + } + + void print(ostream& out) const { + out << "[metablob"; + if (!lump_order.empty()) + out << " " << lump_order.front() << ", " << lump_map.size() << " dirs"; + if (!table_tids.empty()) + out << " table_tids=" << table_tids; + if (allocated_ino || preallocated_inos.size()) { + if (allocated_ino) + out << " alloc_ino=" << allocated_ino; + if (preallocated_inos.size()) + out << " prealloc_ino=" << preallocated_inos; + if (used_preallocated_ino) + out << " used_prealloc_ino=" << used_preallocated_ino; + out << " v" << inotablev; + } + out << "]"; + } + + void update_segment(LogSegment *ls); + void replay(MDSRank *mds, LogSegment *ls, MDSlaveUpdate *su=NULL); +}; +WRITE_CLASS_ENCODER_FEATURES(EMetaBlob) +WRITE_CLASS_ENCODER_FEATURES(EMetaBlob::fullbit) +WRITE_CLASS_ENCODER(EMetaBlob::remotebit) +WRITE_CLASS_ENCODER(EMetaBlob::nullbit) +WRITE_CLASS_ENCODER_FEATURES(EMetaBlob::dirlump) + +inline ostream& operator<<(ostream& out, const EMetaBlob& t) { + t.print(out); + return out; +} + +#endif diff --git a/src/mds/events/ENoOp.h b/src/mds/events/ENoOp.h new file mode 100644 index 00000000..1bf5161e --- /dev/null +++ b/src/mds/events/ENoOp.h @@ -0,0 +1,35 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef CEPH_MDS_ENOOP_H +#define CEPH_MDS_ENOOP_H + +#include "../LogEvent.h" + +class ENoOp : public LogEvent { + uint32_t pad_size; + +public: + ENoOp() : LogEvent(EVENT_NOOP), pad_size(0) { } + explicit ENoOp(uint32_t size_) : LogEvent(EVENT_NOOP), pad_size(size_){ } + + void encode(bufferlist& bl, uint64_t features) const override; + void decode(bufferlist::const_iterator& bl) override; + void dump(Formatter *f) const override {} + + void replay(MDSRank *mds) override; +}; +WRITE_CLASS_ENCODER_FEATURES(ENoOp) + +#endif diff --git a/src/mds/events/EOpen.h b/src/mds/events/EOpen.h new file mode 100644 index 00000000..192745d9 --- /dev/null +++ b/src/mds/events/EOpen.h @@ -0,0 +1,61 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef CEPH_MDS_EOPEN_H +#define CEPH_MDS_EOPEN_H + +#include "../LogEvent.h" +#include "EMetaBlob.h" + +class EOpen : public LogEvent { +public: + EMetaBlob metablob; + vector<inodeno_t> inos; + vector<vinodeno_t> snap_inos; + + EOpen() : LogEvent(EVENT_OPEN) { } + explicit EOpen(MDLog *mdlog) : + LogEvent(EVENT_OPEN) { } + + void print(ostream& out) const override { + out << "EOpen " << metablob << ", " << inos.size() << " open files"; + } + + EMetaBlob *get_metablob() override { return &metablob; } + + void add_clean_inode(CInode *in) { + if (!in->is_base()) { + metablob.add_dir_context(in->get_projected_parent_dn()->get_dir()); + metablob.add_primary_dentry(in->get_projected_parent_dn(), 0, false); + if (in->last == CEPH_NOSNAP) + inos.push_back(in->ino()); + else + snap_inos.push_back(in->vino()); + } + } + void add_ino(inodeno_t ino) { + inos.push_back(ino); + } + + void encode(bufferlist& bl, uint64_t features) const override; + void decode(bufferlist::const_iterator& bl) override; + void dump(Formatter *f) const override; + static void generate_test_instances(list<EOpen*>& ls); + + void update_segment() override; + void replay(MDSRank *mds) override; +}; +WRITE_CLASS_ENCODER_FEATURES(EOpen) + +#endif diff --git a/src/mds/events/EResetJournal.h b/src/mds/events/EResetJournal.h new file mode 100644 index 00000000..3004978a --- /dev/null +++ b/src/mds/events/EResetJournal.h @@ -0,0 +1,39 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#ifndef CEPH_MDS_ERESETJOURNAL_H +#define CEPH_MDS_ERESETJOURNAL_H + +#include "../LogEvent.h" + +// generic log event +class EResetJournal : public LogEvent { + public: + EResetJournal() : LogEvent(EVENT_RESETJOURNAL) { } + ~EResetJournal() override {} + + void encode(bufferlist& bl, uint64_t features) const override; + void decode(bufferlist::const_iterator& bl) override; + void dump(Formatter *f) const override; + static void generate_test_instances(list<EResetJournal*>& ls); + void print(ostream& out) const override { + out << "EResetJournal"; + } + + void replay(MDSRank *mds) override; +}; +WRITE_CLASS_ENCODER_FEATURES(EResetJournal) + +#endif diff --git a/src/mds/events/ESession.h b/src/mds/events/ESession.h new file mode 100644 index 00000000..0b65765e --- /dev/null +++ b/src/mds/events/ESession.h @@ -0,0 +1,67 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef CEPH_MDS_ESESSION_H +#define CEPH_MDS_ESESSION_H + +#include "common/config.h" +#include "include/types.h" + +#include "../LogEvent.h" + +class ESession : public LogEvent { + protected: + entity_inst_t client_inst; + bool open; // open or close + version_t cmapv{0}; // client map version + + interval_set<inodeno_t> inos; + version_t inotablev{0}; + + // Client metadata stored during open + client_metadata_t client_metadata; + + public: + ESession() : LogEvent(EVENT_SESSION), open(false) { } + ESession(const entity_inst_t& inst, bool o, version_t v, + const client_metadata_t& cm) : + LogEvent(EVENT_SESSION), + client_inst(inst), open(o), cmapv(v), inotablev(0), + client_metadata(cm) { } + ESession(const entity_inst_t& inst, bool o, version_t v, + const interval_set<inodeno_t>& i, version_t iv) : + LogEvent(EVENT_SESSION), + client_inst(inst), open(o), cmapv(v), inos(i), inotablev(iv) { } + + void encode(bufferlist& bl, uint64_t features) const override; + void decode(bufferlist::const_iterator& bl) override; + void dump(Formatter *f) const override; + static void generate_test_instances(list<ESession*>& ls); + + void print(ostream& out) const override { + if (open) + out << "ESession " << client_inst << " open cmapv " << cmapv; + else + out << "ESession " << client_inst << " close cmapv " << cmapv; + if (inos.size()) + out << " (" << inos.size() << " inos, v" << inotablev << ")"; + } + + void update_segment() override; + void replay(MDSRank *mds) override; + entity_inst_t get_client_inst() const {return client_inst;} +}; +WRITE_CLASS_ENCODER_FEATURES(ESession) + +#endif diff --git a/src/mds/events/ESessions.h b/src/mds/events/ESessions.h new file mode 100644 index 00000000..aa0eeff8 --- /dev/null +++ b/src/mds/events/ESessions.h @@ -0,0 +1,61 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil 
<sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_MDS_ESESSIONS_H +#define CEPH_MDS_ESESSIONS_H + +#include "common/config.h" +#include "include/types.h" + +#include "../LogEvent.h" + +class ESessions : public LogEvent { +protected: + version_t cmapv; // client map version + bool old_style_encode; + +public: + map<client_t,entity_inst_t> client_map; + map<client_t,client_metadata_t> client_metadata_map; + + ESessions() : LogEvent(EVENT_SESSIONS), cmapv(0), old_style_encode(false) { } + ESessions(version_t pv, map<client_t,entity_inst_t>&& cm, + map<client_t,client_metadata_t>&& cmm) : + LogEvent(EVENT_SESSIONS), + cmapv(pv), old_style_encode(false), + client_map(std::move(cm)), + client_metadata_map(std::move(cmm)) {} + + void mark_old_encoding() { old_style_encode = true; } + + void encode(bufferlist &bl, uint64_t features) const override; + void decode_old(bufferlist::const_iterator &bl); + void decode_new(bufferlist::const_iterator &bl); + void decode(bufferlist::const_iterator &bl) override { + if (old_style_encode) decode_old(bl); + else decode_new(bl); + } + void dump(Formatter *f) const override; + static void generate_test_instances(list<ESessions*>& ls); + + void print(ostream& out) const override { + out << "ESessions " << client_map.size() << " opens cmapv " << cmapv; + } + + void update_segment() override; + void replay(MDSRank *mds) override; +}; +WRITE_CLASS_ENCODER_FEATURES(ESessions) + +#endif diff --git a/src/mds/events/ESlaveUpdate.h b/src/mds/events/ESlaveUpdate.h new file mode 100644 index 00000000..23ca430b --- /dev/null +++ b/src/mds/events/ESlaveUpdate.h @@ -0,0 +1,157 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * 
Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_MDS_ESLAVEUPDATE_H +#define CEPH_MDS_ESLAVEUPDATE_H + +#include <string_view> + +#include "../LogEvent.h" +#include "EMetaBlob.h" + +/* + * rollback records, for remote/slave updates, which may need to be manually + * rolled back during journal replay. (or while active if master fails, but in + * that case these records aren't needed.) + */ +struct link_rollback { + metareqid_t reqid; + inodeno_t ino; + bool was_inc; + utime_t old_ctime; + utime_t old_dir_mtime; + utime_t old_dir_rctime; + bufferlist snapbl; + + link_rollback() : ino(0), was_inc(false) {} + + void encode(bufferlist& bl) const; + void decode(bufferlist::const_iterator& bl); + void dump(Formatter *f) const; + static void generate_test_instances(list<link_rollback*>& ls); +}; +WRITE_CLASS_ENCODER(link_rollback) + +/* + * this is only used on an empty dir with a dirfrag on a remote node. + * we are auth for nothing. all we need to do is relink the directory + * in the hierarchy properly during replay to avoid breaking the + * subtree map. 
+ */ +struct rmdir_rollback { + metareqid_t reqid; + dirfrag_t src_dir; + string src_dname; + dirfrag_t dest_dir; + string dest_dname; + bufferlist snapbl; + + void encode(bufferlist& bl) const; + void decode(bufferlist::const_iterator& bl); + void dump(Formatter *f) const; + static void generate_test_instances(list<rmdir_rollback*>& ls); +}; +WRITE_CLASS_ENCODER(rmdir_rollback) + +struct rename_rollback { + struct drec { + dirfrag_t dirfrag; + utime_t dirfrag_old_mtime; + utime_t dirfrag_old_rctime; + inodeno_t ino, remote_ino; + string dname; + char remote_d_type; + utime_t old_ctime; + + drec() : remote_d_type((char)S_IFREG) {} + + void encode(bufferlist& bl) const; + void decode(bufferlist::const_iterator& bl); + void dump(Formatter *f) const; + static void generate_test_instances(list<drec*>& ls); + }; + WRITE_CLASS_MEMBER_ENCODER(drec) + + metareqid_t reqid; + drec orig_src, orig_dest; + drec stray; // we know this is null, but we want dname, old mtime/rctime + utime_t ctime; + bufferlist srci_snapbl; + bufferlist desti_snapbl; + + void encode(bufferlist& bl) const; + void decode(bufferlist::const_iterator& bl); + void dump(Formatter *f) const; + static void generate_test_instances(list<rename_rollback*>& ls); +}; +WRITE_CLASS_ENCODER(rename_rollback::drec) +WRITE_CLASS_ENCODER(rename_rollback) + + +class ESlaveUpdate : public LogEvent { +public: + const static int OP_PREPARE = 1; + const static int OP_COMMIT = 2; + const static int OP_ROLLBACK = 3; + + const static int LINK = 1; + const static int RENAME = 2; + const static int RMDIR = 3; + + /* + * we journal a rollback metablob that contains the unmodified metadata + * too, because we may be updating previously dirty metadata, which + * will allow old log segments to be trimmed. if we end of rolling back, + * those updates could be lost.. so we re-journal the unmodified metadata, + * and replay will apply _either_ commit or rollback. 
+ */ + EMetaBlob commit; + bufferlist rollback; + string type; + metareqid_t reqid; + mds_rank_t master; + __u8 op; // prepare, commit, abort + __u8 origop; // link | rename + + ESlaveUpdate() : LogEvent(EVENT_SLAVEUPDATE), master(0), op(0), origop(0) { } + ESlaveUpdate(MDLog *mdlog, std::string_view s, metareqid_t ri, int mastermds, int o, int oo) : + LogEvent(EVENT_SLAVEUPDATE), + type(s), + reqid(ri), + master(mastermds), + op(o), origop(oo) { } + + void print(ostream& out) const override { + if (type.length()) + out << type << " "; + out << " " << (int)op; + if (origop == LINK) out << " link"; + if (origop == RENAME) out << " rename"; + out << " " << reqid; + out << " for mds." << master; + out << commit; + } + + EMetaBlob *get_metablob() override { return &commit; } + + void encode(bufferlist& bl, uint64_t features) const override; + void decode(bufferlist::const_iterator& bl) override; + void dump(Formatter *f) const override; + static void generate_test_instances(list<ESlaveUpdate*>& ls); + + void replay(MDSRank *mds) override; +}; +WRITE_CLASS_ENCODER_FEATURES(ESlaveUpdate) + +#endif diff --git a/src/mds/events/ESubtreeMap.h b/src/mds/events/ESubtreeMap.h new file mode 100644 index 00000000..08d4a581 --- /dev/null +++ b/src/mds/events/ESubtreeMap.h @@ -0,0 +1,48 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ *
+ */
+
+#ifndef CEPH_MDS_ESUBTREEMAP_H
+#define CEPH_MDS_ESUBTREEMAP_H
+
+#include "../LogEvent.h"
+#include "EMetaBlob.h"
+
+// Journal event carrying the MDS's subtree partition snapshot: for each
+// subtree root dirfrag, the vector of bound dirfrags, plus the set of
+// subtree roots whose authority is ambiguous (presumably mid-migration;
+// confirm against MDCache).
+class ESubtreeMap : public LogEvent {
+public:
+  EMetaBlob metablob;                           // metadata blob replayed alongside the map
+  map<dirfrag_t, vector<dirfrag_t> > subtrees;  // subtree root -> its bounds
+  set<dirfrag_t> ambiguous_subtrees;            // roots with ambiguous authority
+  uint64_t expire_pos;  // NOTE(review): presumably the journal expire position when this map was written — confirm in MDLog
+  uint64_t event_seq;   // sequence number for this event
+
+  ESubtreeMap() : LogEvent(EVENT_SUBTREEMAP), expire_pos(0), event_seq(0) { }
+
+  // One-line human-readable summary: subtree/ambiguous counts + blob.
+  void print(ostream& out) const override {
+    out << "ESubtreeMap " << subtrees.size() << " subtrees "
+	<< ", " << ambiguous_subtrees.size() << " ambiguous "
+	<< metablob;
+  }
+
+  EMetaBlob *get_metablob() override { return &metablob; }
+
+  void encode(bufferlist& bl, uint64_t features) const override;
+  void decode(bufferlist::const_iterator& bl) override;
+  void dump(Formatter *f) const override;
+  static void generate_test_instances(list<ESubtreeMap*>& ls);
+
+  void replay(MDSRank *mds) override;
+};
+WRITE_CLASS_ENCODER_FEATURES(ESubtreeMap)
+
+#endif
diff --git a/src/mds/events/ETableClient.h b/src/mds/events/ETableClient.h
new file mode 100644
index 00000000..bf3e752d
--- /dev/null
+++ b/src/mds/events/ETableClient.h
@@ -0,0 +1,49 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_MDS_ETABLECLIENT_H
+#define CEPH_MDS_ETABLECLIENT_H
+
+#include "common/config.h"
+#include "include/types.h"
+
+#include "../mds_table_types.h"
+#include "../LogEvent.h"
+
+// Journal event recording a table-client operation against one of the
+// MDS tables: table id, opcode, and transaction id.
+struct ETableClient : public LogEvent {
+  __u16 table;    // which table (rendered via get_mdstable_name())
+  __s16 op;       // opcode (rendered via get_mdstableserver_opname())
+  version_t tid;  // transaction id; 0 means "no tid" (omitted from print())
+
+  ETableClient() : LogEvent(EVENT_TABLECLIENT), table(0), op(0), tid(0) { }
+  ETableClient(int t, int o, version_t ti) :
+    LogEvent(EVENT_TABLECLIENT),
+    table(t), op(o), tid(ti) { }
+
+  void encode(bufferlist& bl, uint64_t features) const override;
+  void decode(bufferlist::const_iterator& bl) override;
+  void dump(Formatter *f) const override;
+  static void generate_test_instances(list<ETableClient*>& ls);
+
+  // Human-readable summary; tid printed only when non-zero.
+  void print(ostream& out) const override {
+    out << "ETableClient " << get_mdstable_name(table) << " " << get_mdstableserver_opname(op);
+    if (tid) out << " tid " << tid;
+  }
+
+  //void update_segment();
+  void replay(MDSRank *mds) override;
+};
+WRITE_CLASS_ENCODER_FEATURES(ETableClient)
+
+#endif
diff --git a/src/mds/events/ETableServer.h b/src/mds/events/ETableServer.h
new file mode 100644
index 00000000..0005b132
--- /dev/null
+++ b/src/mds/events/ETableServer.h
@@ -0,0 +1,59 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_MDS_ETABLESERVER_H
+#define CEPH_MDS_ETABLESERVER_H
+
+#include "common/config.h"
+#include "include/types.h"
+
+#include "../mds_table_types.h"
+#include "../LogEvent.h"
+
+// Journal event recording a table-server operation: table id, opcode,
+// requesting mds rank, transaction id/version, and an opaque mutation
+// payload.
+struct ETableServer : public LogEvent {
+  __u16 table;          // which table (rendered via get_mdstable_name())
+  __s16 op;             // opcode (rendered via get_mdstableserver_opname())
+  uint64_t reqid;       // request id; 0 means "none" (omitted from print())
+  mds_rank_t bymds;     // requesting mds rank; MDS_RANK_NONE when unset
+  bufferlist mutation;  // opaque mutation payload; may be empty
+  version_t tid;        // transaction id; 0 means "none"
+  version_t version;    // table version; 0 means "none"
+
+  ETableServer() : LogEvent(EVENT_TABLESERVER), table(0), op(0),
+		   reqid(0), bymds(MDS_RANK_NONE), tid(0), version(0) { }
+  ETableServer(int t, int o, uint64_t ri, mds_rank_t m, version_t ti, version_t v) :
+    LogEvent(EVENT_TABLESERVER),
+    table(t), op(o), reqid(ri), bymds(m), tid(ti), version(v) { }
+
+  void encode(bufferlist& bl, uint64_t features) const override;
+  void decode(bufferlist::const_iterator& bl) override;
+  void dump(Formatter *f) const override;
+  static void generate_test_instances(list<ETableServer*>& ls);
+
+  // Human-readable summary; optional fields printed only when set.
+  void print(ostream& out) const override {
+    out << "ETableServer " << get_mdstable_name(table)
+	<< " " << get_mdstableserver_opname(op);
+    if (reqid) out << " reqid " << reqid;
+    if (bymds >= 0) out << " mds." << bymds;
+    if (tid) out << " tid " << tid;
+    if (version) out << " version " << version;
+    if (mutation.length()) out << " mutation=" << mutation.length() << " bytes";
+  }
+
+  void update_segment() override;
+  void replay(MDSRank *mds) override;
+};
+WRITE_CLASS_ENCODER_FEATURES(ETableServer)
+
+#endif
diff --git a/src/mds/events/EUpdate.h b/src/mds/events/EUpdate.h
new file mode 100644
index 00000000..dc710d52
--- /dev/null
+++ b/src/mds/events/EUpdate.h
@@ -0,0 +1,55 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_MDS_EUPDATE_H
+#define CEPH_MDS_EUPDATE_H
+
+#include <string_view>
+
+#include "../LogEvent.h"
+#include "EMetaBlob.h"
+
+// Journal event for a generic metadata update: a labelled EMetaBlob plus
+// an optional client map (with its version) and the originating request
+// id. had_slaves presumably marks updates that involved slave MDSs —
+// confirm against the journaling code.
+class EUpdate : public LogEvent {
+public:
+  EMetaBlob metablob;      // the metadata changes to replay
+  string type;             // free-form label; printed when non-empty
+  bufferlist client_map;   // optional encoded client map
+  version_t cmapv;         // version of client_map; 0 when unset
+  metareqid_t reqid;       // originating request id
+  bool had_slaves;         // whether slave MDSs participated (see note above)
+
+  EUpdate() : LogEvent(EVENT_UPDATE), cmapv(0), had_slaves(false) { }
+  // NOTE(review): the MDLog* parameter is accepted but unused here.
+  EUpdate(MDLog *mdlog, std::string_view s) :
+    LogEvent(EVENT_UPDATE),
+    type(s), cmapv(0), had_slaves(false) { }
+
+  // Human-readable summary: "EUpdate <type> <metablob>".
+  void print(ostream& out) const override {
+    if (type.length())
+      out << "EUpdate " << type << " ";
+    out << metablob;
+  }
+
+  EMetaBlob *get_metablob() override { return &metablob; }
+
+  void encode(bufferlist& bl, uint64_t features) const override;
+  void decode(bufferlist::const_iterator& bl) override;
+  void dump(Formatter *f) const override;
+  static void generate_test_instances(list<EUpdate*>& ls);
+
+  void update_segment() override;
+  void replay(MDSRank *mds) override;
+};
+WRITE_CLASS_ENCODER_FEATURES(EUpdate)
+
+#endif
diff --git a/src/mds/flock.cc b/src/mds/flock.cc
new
file mode 100644 index 00000000..2382322b --- /dev/null +++ b/src/mds/flock.cc @@ -0,0 +1,596 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#include <errno.h> + +#include "common/debug.h" +#include "mdstypes.h" +#include "mds/flock.h" + +#define dout_subsys ceph_subsys_mds + +static multimap<ceph_filelock, ceph_lock_state_t*> global_waiting_locks; + +static void remove_global_waiting(ceph_filelock &fl, ceph_lock_state_t *lock_state) +{ + for (auto p = global_waiting_locks.find(fl); + p != global_waiting_locks.end(); ) { + if (p->first != fl) + break; + if (p->second == lock_state) { + global_waiting_locks.erase(p); + break; + } + ++p; + } +} + +ceph_lock_state_t::~ceph_lock_state_t() +{ + if (type == CEPH_LOCK_FCNTL) { + for (auto p = waiting_locks.begin(); p != waiting_locks.end(); ++p) { + remove_global_waiting(p->second, this); + } + } +} + +bool ceph_lock_state_t::is_waiting(const ceph_filelock &fl) const +{ + multimap<uint64_t, ceph_filelock>::const_iterator p = waiting_locks.find(fl.start); + while (p != waiting_locks.end()) { + if (p->second.start > fl.start) + return false; + if (p->second.length == fl.length && + ceph_filelock_owner_equal(p->second, fl)) + return true; + ++p; + } + return false; +} + +void ceph_lock_state_t::remove_waiting(const ceph_filelock& fl) +{ + for (auto p = waiting_locks.find(fl.start); + p != waiting_locks.end(); ) { + if (p->second.start > fl.start) + break; + if (p->second.length == fl.length && + ceph_filelock_owner_equal(p->second, fl)) { + if (type == CEPH_LOCK_FCNTL) { + remove_global_waiting(p->second, this); + } + waiting_locks.erase(p); + --client_waiting_lock_counts[(client_t)fl.client]; + if (!client_waiting_lock_counts[(client_t)fl.client]) { + client_waiting_lock_counts.erase((client_t)fl.client); + } + break; + } + ++p; + } +} + +bool ceph_lock_state_t::is_deadlock(const ceph_filelock& fl, + list<multimap<uint64_t, ceph_filelock>::iterator>& + 
overlapping_locks, + const ceph_filelock *first_fl, unsigned depth) const +{ + ldout(cct,15) << "is_deadlock " << fl << dendl; + + // only for posix lock + if (type != CEPH_LOCK_FCNTL) + return false; + + // find conflict locks' owners + set<ceph_filelock> lock_owners; + for (auto p = overlapping_locks.begin(); + p != overlapping_locks.end(); + ++p) { + + if (fl.type == CEPH_LOCK_SHARED && + (*p)->second.type == CEPH_LOCK_SHARED) + continue; + + // circle detected + if (first_fl && ceph_filelock_owner_equal(*first_fl, (*p)->second)) { + ldout(cct,15) << " detect deadlock" << dendl; + return true; + } + + ceph_filelock tmp = (*p)->second; + tmp.start = 0; + tmp.length = 0; + tmp.type = 0; + lock_owners.insert(tmp); + } + + if (depth >= MAX_DEADLK_DEPTH) + return false; + + first_fl = first_fl ? first_fl : &fl; + for (auto p = lock_owners.begin(); + p != lock_owners.end(); + ++p) { + ldout(cct,15) << " conflict lock owner " << *p << dendl; + // if conflict lock' owner is waiting for other lock? 
+ for (auto q = global_waiting_locks.lower_bound(*p); + q != global_waiting_locks.end(); + ++q) { + if (!ceph_filelock_owner_equal(q->first, *p)) + break; + + list<multimap<uint64_t, ceph_filelock>::iterator> + _overlapping_locks, _self_overlapping_locks; + ceph_lock_state_t& state = *(q->second); + if (state.get_overlapping_locks(q->first, _overlapping_locks)) { + state.split_by_owner(q->first, _overlapping_locks, _self_overlapping_locks); + } + if (!_overlapping_locks.empty()) { + if (is_deadlock(q->first, _overlapping_locks, first_fl, depth + 1)) + return true; + } + } + } + return false; +} + +void ceph_lock_state_t::add_waiting(const ceph_filelock& fl) +{ + waiting_locks.insert(pair<uint64_t, ceph_filelock>(fl.start, fl)); + ++client_waiting_lock_counts[(client_t)fl.client]; + if (type == CEPH_LOCK_FCNTL) { + global_waiting_locks.insert(pair<ceph_filelock,ceph_lock_state_t*>(fl, this)); + } +} + +bool ceph_lock_state_t::add_lock(ceph_filelock& new_lock, + bool wait_on_fail, bool replay, + bool *deadlock) +{ + ldout(cct,15) << "add_lock " << new_lock << dendl; + bool ret = false; + list<multimap<uint64_t, ceph_filelock>::iterator> + overlapping_locks, self_overlapping_locks, neighbor_locks; + + // first, get any overlapping locks and split them into owned-by-us and not + if (get_overlapping_locks(new_lock, overlapping_locks, &neighbor_locks)) { + ldout(cct,15) << "got overlapping lock, splitting by owner" << dendl; + split_by_owner(new_lock, overlapping_locks, self_overlapping_locks); + } + if (!overlapping_locks.empty()) { //overlapping locks owned by others :( + if (CEPH_LOCK_EXCL == new_lock.type) { + //can't set, we want an exclusive + ldout(cct,15) << "overlapping lock, and this lock is exclusive, can't set" + << dendl; + if (wait_on_fail && !replay) { + if (is_deadlock(new_lock, overlapping_locks)) + *deadlock = true; + else + add_waiting(new_lock); + } + } else { //shared lock, check for any exclusive locks blocking us + if 
(contains_exclusive_lock(overlapping_locks)) { //blocked :( + ldout(cct,15) << " blocked by exclusive lock in overlapping_locks" << dendl; + if (wait_on_fail && !replay) { + if (is_deadlock(new_lock, overlapping_locks)) + *deadlock = true; + else + add_waiting(new_lock); + } + } else { + //yay, we can insert a shared lock + ldout(cct,15) << "inserting shared lock" << dendl; + remove_waiting(new_lock); + adjust_locks(self_overlapping_locks, new_lock, neighbor_locks); + held_locks.insert(pair<uint64_t, ceph_filelock>(new_lock.start, new_lock)); + ret = true; + } + } + } else { //no overlapping locks except our own + remove_waiting(new_lock); + adjust_locks(self_overlapping_locks, new_lock, neighbor_locks); + ldout(cct,15) << "no conflicts, inserting " << new_lock << dendl; + held_locks.insert(pair<uint64_t, ceph_filelock> + (new_lock.start, new_lock)); + ret = true; + } + if (ret) { + ++client_held_lock_counts[(client_t)new_lock.client]; + } + return ret; +} + +void ceph_lock_state_t::look_for_lock(ceph_filelock& testing_lock) +{ + list<multimap<uint64_t, ceph_filelock>::iterator> overlapping_locks, + self_overlapping_locks; + if (get_overlapping_locks(testing_lock, overlapping_locks)) { + split_by_owner(testing_lock, overlapping_locks, self_overlapping_locks); + } + if (!overlapping_locks.empty()) { //somebody else owns overlapping lock + if (CEPH_LOCK_EXCL == testing_lock.type) { //any lock blocks it + testing_lock = (*overlapping_locks.begin())->second; + } else { + ceph_filelock *blocking_lock; + if ((blocking_lock = contains_exclusive_lock(overlapping_locks))) { + testing_lock = *blocking_lock; + } else { //nothing blocking! 
+ testing_lock.type = CEPH_LOCK_UNLOCK; + } + } + return; + } + //if we get here, only our own locks block + testing_lock.type = CEPH_LOCK_UNLOCK; +} + +void ceph_lock_state_t::remove_lock(ceph_filelock removal_lock, + list<ceph_filelock>& activated_locks) +{ + list<multimap<uint64_t, ceph_filelock>::iterator> overlapping_locks, + self_overlapping_locks; + if (get_overlapping_locks(removal_lock, overlapping_locks)) { + ldout(cct,15) << "splitting by owner" << dendl; + split_by_owner(removal_lock, overlapping_locks, self_overlapping_locks); + } else ldout(cct,15) << "attempt to remove lock at " << removal_lock.start + << " but no locks there!" << dendl; + bool remove_to_end = (0 == removal_lock.length); + uint64_t removal_start = removal_lock.start; + uint64_t removal_end = removal_start + removal_lock.length - 1; + __s64 old_lock_client = 0; + ceph_filelock *old_lock; + + ldout(cct,15) << "examining " << self_overlapping_locks.size() + << " self-overlapping locks for removal" << dendl; + for (list<multimap<uint64_t, ceph_filelock>::iterator>::iterator + iter = self_overlapping_locks.begin(); + iter != self_overlapping_locks.end(); + ++iter) { + ldout(cct,15) << "self overlapping lock " << (*iter)->second << dendl; + old_lock = &(*iter)->second; + bool old_lock_to_end = (0 == old_lock->length); + uint64_t old_lock_end = old_lock->start + old_lock->length - 1; + old_lock_client = old_lock->client; + if (remove_to_end) { + if (old_lock->start < removal_start) { + old_lock->length = removal_start - old_lock->start; + } else { + ldout(cct,15) << "erasing " << (*iter)->second << dendl; + held_locks.erase(*iter); + --client_held_lock_counts[old_lock_client]; + } + } else if (old_lock_to_end) { + ceph_filelock append_lock = *old_lock; + append_lock.start = removal_end+1; + held_locks.insert(pair<uint64_t, ceph_filelock> + (append_lock.start, append_lock)); + ++client_held_lock_counts[(client_t)old_lock->client]; + if (old_lock->start >= removal_start) { + ldout(cct,15) << 
"erasing " << (*iter)->second << dendl; + held_locks.erase(*iter); + --client_held_lock_counts[old_lock_client]; + } else old_lock->length = removal_start - old_lock->start; + } else { + if (old_lock_end > removal_end) { + ceph_filelock append_lock = *old_lock; + append_lock.start = removal_end + 1; + append_lock.length = old_lock_end - append_lock.start + 1; + held_locks.insert(pair<uint64_t, ceph_filelock> + (append_lock.start, append_lock)); + ++client_held_lock_counts[(client_t)old_lock->client]; + } + if (old_lock->start < removal_start) { + old_lock->length = removal_start - old_lock->start; + } else { + ldout(cct,15) << "erasing " << (*iter)->second << dendl; + held_locks.erase(*iter); + --client_held_lock_counts[old_lock_client]; + } + } + if (!client_held_lock_counts[old_lock_client]) { + client_held_lock_counts.erase(old_lock_client); + } + } +} + +bool ceph_lock_state_t::remove_all_from (client_t client) +{ + bool cleared_any = false; + if (client_held_lock_counts.count(client)) { + multimap<uint64_t, ceph_filelock>::iterator iter = held_locks.begin(); + while (iter != held_locks.end()) { + if ((client_t)iter->second.client == client) { + held_locks.erase(iter++); + } else + ++iter; + } + client_held_lock_counts.erase(client); + cleared_any = true; + } + + if (client_waiting_lock_counts.count(client)) { + multimap<uint64_t, ceph_filelock>::iterator iter = waiting_locks.begin(); + while (iter != waiting_locks.end()) { + if ((client_t)iter->second.client != client) { + ++iter; + continue; + } + if (type == CEPH_LOCK_FCNTL) { + remove_global_waiting(iter->second, this); + } + waiting_locks.erase(iter++); + } + client_waiting_lock_counts.erase(client); + } + return cleared_any; +} + +void ceph_lock_state_t::adjust_locks(list<multimap<uint64_t, ceph_filelock>::iterator> old_locks, + ceph_filelock& new_lock, + list<multimap<uint64_t, ceph_filelock>::iterator> + neighbor_locks) +{ + ldout(cct,15) << "adjust_locks" << dendl; + bool new_lock_to_end = (0 == 
new_lock.length); + __s64 old_lock_client = 0; + ceph_filelock *old_lock; + for (list<multimap<uint64_t, ceph_filelock>::iterator>::iterator + iter = old_locks.begin(); + iter != old_locks.end(); + ++iter) { + old_lock = &(*iter)->second; + ldout(cct,15) << "adjusting lock: " << *old_lock << dendl; + bool old_lock_to_end = (0 == old_lock->length); + uint64_t old_lock_start = old_lock->start; + uint64_t old_lock_end = old_lock->start + old_lock->length - 1; + uint64_t new_lock_start = new_lock.start; + uint64_t new_lock_end = new_lock.start + new_lock.length - 1; + old_lock_client = old_lock->client; + if (new_lock_to_end || old_lock_to_end) { + //special code path to deal with a length set at 0 + ldout(cct,15) << "one lock extends forever" << dendl; + if (old_lock->type == new_lock.type) { + //just unify them in new lock, remove old lock + ldout(cct,15) << "same lock type, unifying" << dendl; + new_lock.start = (new_lock_start < old_lock_start) ? new_lock_start : + old_lock_start; + new_lock.length = 0; + held_locks.erase(*iter); + --client_held_lock_counts[old_lock_client]; + } else { //not same type, have to keep any remains of old lock around + ldout(cct,15) << "shrinking old lock" << dendl; + if (new_lock_to_end) { + if (old_lock_start < new_lock_start) { + old_lock->length = new_lock_start - old_lock_start; + } else { + held_locks.erase(*iter); + --client_held_lock_counts[old_lock_client]; + } + } else { //old lock extends past end of new lock + ceph_filelock appended_lock = *old_lock; + appended_lock.start = new_lock_end + 1; + held_locks.insert(pair<uint64_t, ceph_filelock> + (appended_lock.start, appended_lock)); + ++client_held_lock_counts[(client_t)old_lock->client]; + if (old_lock_start < new_lock_start) { + old_lock->length = new_lock_start - old_lock_start; + } else { + held_locks.erase(*iter); + --client_held_lock_counts[old_lock_client]; + } + } + } + } else { + if (old_lock->type == new_lock.type) { //just merge them! 
+ ldout(cct,15) << "merging locks, they're the same type" << dendl; + new_lock.start = (old_lock_start < new_lock_start ) ? old_lock_start : + new_lock_start; + int new_end = (new_lock_end > old_lock_end) ? new_lock_end : + old_lock_end; + new_lock.length = new_end - new_lock.start + 1; + ldout(cct,15) << "erasing lock " << (*iter)->second << dendl; + held_locks.erase(*iter); + --client_held_lock_counts[old_lock_client]; + } else { //we'll have to update sizes and maybe make new locks + ldout(cct,15) << "locks aren't same type, changing sizes" << dendl; + if (old_lock_end > new_lock_end) { //add extra lock after new_lock + ceph_filelock appended_lock = *old_lock; + appended_lock.start = new_lock_end + 1; + appended_lock.length = old_lock_end - appended_lock.start + 1; + held_locks.insert(pair<uint64_t, ceph_filelock> + (appended_lock.start, appended_lock)); + ++client_held_lock_counts[(client_t)old_lock->client]; + } + if (old_lock_start < new_lock_start) { + old_lock->length = new_lock_start - old_lock_start; + } else { //old_lock starts inside new_lock, so remove it + //if it extended past new_lock_end it's been replaced + held_locks.erase(*iter); + --client_held_lock_counts[old_lock_client]; + } + } + } + if (!client_held_lock_counts[old_lock_client]) { + client_held_lock_counts.erase(old_lock_client); + } + } + + //make sure to coalesce neighboring locks + for (list<multimap<uint64_t, ceph_filelock>::iterator>::iterator + iter = neighbor_locks.begin(); + iter != neighbor_locks.end(); + ++iter) { + old_lock = &(*iter)->second; + old_lock_client = old_lock->client; + ldout(cct,15) << "lock to coalesce: " << *old_lock << dendl; + /* because if it's a neighboring lock there can't be any self-overlapping + locks that covered it */ + if (old_lock->type == new_lock.type) { //merge them + if (0 == new_lock.length) { + if (old_lock->start + old_lock->length == new_lock.start) { + new_lock.start = old_lock->start; + } else ceph_abort(); /* if there's no end to new_lock, 
the neighbor + HAS TO be to left side */ + } else if (0 == old_lock->length) { + if (new_lock.start + new_lock.length == old_lock->start) { + new_lock.length = 0; + } else ceph_abort(); //same as before, but reversed + } else { + if (old_lock->start + old_lock->length == new_lock.start) { + new_lock.start = old_lock->start; + new_lock.length = old_lock->length + new_lock.length; + } else if (new_lock.start + new_lock.length == old_lock->start) { + new_lock.length = old_lock->length + new_lock.length; + } + } + held_locks.erase(*iter); + --client_held_lock_counts[old_lock_client]; + } + if (!client_held_lock_counts[old_lock_client]) { + client_held_lock_counts.erase(old_lock_client); + } + } +} + +multimap<uint64_t, ceph_filelock>::iterator +ceph_lock_state_t::get_lower_bound(uint64_t start, + multimap<uint64_t, ceph_filelock>& lock_map) +{ + multimap<uint64_t, ceph_filelock>::iterator lower_bound = + lock_map.lower_bound(start); + if ((lower_bound->first != start) + && (start != 0) + && (lower_bound != lock_map.begin())) --lower_bound; + if (lock_map.end() == lower_bound) + ldout(cct,15) << "get_lower_dout(15)eturning end()" << dendl; + else ldout(cct,15) << "get_lower_bound returning iterator pointing to " + << lower_bound->second << dendl; + return lower_bound; + } + +multimap<uint64_t, ceph_filelock>::iterator +ceph_lock_state_t::get_last_before(uint64_t end, + multimap<uint64_t, ceph_filelock>& lock_map) +{ + multimap<uint64_t, ceph_filelock>::iterator last = + lock_map.upper_bound(end); + if (last != lock_map.begin()) --last; + if (lock_map.end() == last) + ldout(cct,15) << "get_last_before returning end()" << dendl; + else ldout(cct,15) << "get_last_before returning iterator pointing to " + << last->second << dendl; + return last; +} + +bool ceph_lock_state_t::share_space( + multimap<uint64_t, ceph_filelock>::iterator& iter, + uint64_t start, uint64_t end) +{ + bool ret = ((iter->first >= start && iter->first <= end) || + ((iter->first < start) && + 
(((iter->first + iter->second.length - 1) >= start) || + (0 == iter->second.length)))); + ldout(cct,15) << "share_space got start: " << start << ", end: " << end + << ", lock: " << iter->second << ", returning " << ret << dendl; + return ret; +} + +bool ceph_lock_state_t::get_overlapping_locks(const ceph_filelock& lock, + list<multimap<uint64_t, + ceph_filelock>::iterator> & overlaps, + list<multimap<uint64_t, + ceph_filelock>::iterator> *self_neighbors) +{ + ldout(cct,15) << "get_overlapping_locks" << dendl; + // create a lock starting one earlier and ending one later + // to check for neighbors + ceph_filelock neighbor_check_lock = lock; + if (neighbor_check_lock.start != 0) { + neighbor_check_lock.start = neighbor_check_lock.start - 1; + if (neighbor_check_lock.length) + neighbor_check_lock.length = neighbor_check_lock.length + 2; + } else { + if (neighbor_check_lock.length) + neighbor_check_lock.length = neighbor_check_lock.length + 1; + } + //find the last held lock starting at the point after lock + uint64_t endpoint = lock.start; + if (lock.length) { + endpoint += lock.length; + } else { + endpoint = uint64_t(-1); // max offset + } + multimap<uint64_t, ceph_filelock>::iterator iter = + get_last_before(endpoint, held_locks); + bool cont = iter != held_locks.end(); + while(cont) { + if (share_space(iter, lock)) { + overlaps.push_front(iter); + } else if (self_neighbors && + ceph_filelock_owner_equal(neighbor_check_lock, iter->second) && + share_space(iter, neighbor_check_lock)) { + self_neighbors->push_front(iter); + } + if ((iter->first < lock.start) && (CEPH_LOCK_EXCL == iter->second.type)) { + //can't be any more overlapping locks or they'd interfere with this one + cont = false; + } else if (held_locks.begin() == iter) cont = false; + else --iter; + } + return !overlaps.empty(); +} + +bool ceph_lock_state_t::get_waiting_overlaps(const ceph_filelock& lock, + list<multimap<uint64_t, + ceph_filelock>::iterator>& + overlaps) +{ + ldout(cct,15) << 
"get_waiting_overlaps" << dendl; + multimap<uint64_t, ceph_filelock>::iterator iter = + get_last_before(lock.start + lock.length - 1, waiting_locks); + bool cont = iter != waiting_locks.end(); + while(cont) { + if (share_space(iter, lock)) overlaps.push_front(iter); + if (waiting_locks.begin() == iter) cont = false; + --iter; + } + return !overlaps.empty(); +} + +void ceph_lock_state_t::split_by_owner(const ceph_filelock& owner, + list<multimap<uint64_t, + ceph_filelock>::iterator>& locks, + list<multimap<uint64_t, + ceph_filelock>::iterator>& + owned_locks) +{ + list<multimap<uint64_t, ceph_filelock>::iterator>::iterator + iter = locks.begin(); + ldout(cct,15) << "owner lock: " << owner << dendl; + while (iter != locks.end()) { + ldout(cct,15) << "comparing to " << (*iter)->second << dendl; + if (ceph_filelock_owner_equal((*iter)->second, owner)) { + ldout(cct,15) << "success, pushing to owned_locks" << dendl; + owned_locks.push_back(*iter); + iter = locks.erase(iter); + } else { + ldout(cct,15) << "failure, something not equal in this group " + << (*iter)->second.client << ":" << owner.client << "," + << (*iter)->second.owner << ":" << owner.owner << "," + << (*iter)->second.pid << ":" << owner.pid << dendl; + ++iter; + } + } +} + +ceph_filelock * +ceph_lock_state_t::contains_exclusive_lock(list<multimap<uint64_t, + ceph_filelock>::iterator>& locks) +{ + for (list<multimap<uint64_t, ceph_filelock>::iterator>::iterator + iter = locks.begin(); + iter != locks.end(); + ++iter) { + if (CEPH_LOCK_EXCL == (*iter)->second.type) return &(*iter)->second; + } + return NULL; +} diff --git a/src/mds/flock.h b/src/mds/flock.h new file mode 100644 index 00000000..ef1793f4 --- /dev/null +++ b/src/mds/flock.h @@ -0,0 +1,290 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#ifndef CEPH_MDS_FLOCK_H +#define CEPH_MDS_FLOCK_H + +#include <errno.h> + +#include "common/debug.h" +#include "mdstypes.h" + + +inline ostream& 
operator<<(ostream& out, const ceph_filelock& l) { + out << "start: " << l.start << ", length: " << l.length + << ", client: " << l.client << ", owner: " << l.owner + << ", pid: " << l.pid << ", type: " << (int)l.type + << std::endl; + return out; +} + +inline bool ceph_filelock_owner_equal(const ceph_filelock& l, const ceph_filelock& r) +{ + if (l.client != r.client || l.owner != r.owner) + return false; + // The file lock is from old client if the most significant bit of + // 'owner' is not set. Old clients use both 'owner' and 'pid' to + // identify the owner of lock. + if (l.owner & (1ULL << 63)) + return true; + return l.pid == r.pid; +} + +inline int ceph_filelock_owner_compare(const ceph_filelock& l, const ceph_filelock& r) +{ + if (l.client != r.client) + return l.client > r.client ? 1 : -1; + if (l.owner != r.owner) + return l.owner > r.owner ? 1 : -1; + if (l.owner & (1ULL << 63)) + return 0; + if (l.pid != r.pid) + return l.pid > r.pid ? 1 : -1; + return 0; +} + +inline int ceph_filelock_compare(const ceph_filelock& l, const ceph_filelock& r) +{ + int ret = ceph_filelock_owner_compare(l, r); + if (ret) + return ret; + if (l.start != r.start) + return l.start > r.start ? 1 : -1; + if (l.length != r.length) + return l.length > r.length ? 1 : -1; + if (l.type != r.type) + return l.type > r.type ? 
1 : -1; + return 0; +} + +inline bool operator<(const ceph_filelock& l, const ceph_filelock& r) +{ + return ceph_filelock_compare(l, r) < 0; +} + +inline bool operator==(const ceph_filelock& l, const ceph_filelock& r) { + return ceph_filelock_compare(l, r) == 0; +} + +inline bool operator!=(const ceph_filelock& l, const ceph_filelock& r) { + return ceph_filelock_compare(l, r) != 0; +} + +class ceph_lock_state_t { + CephContext *cct; + int type; +public: + explicit ceph_lock_state_t(CephContext *cct_, int type_) : cct(cct_), type(type_) {} + ~ceph_lock_state_t(); + multimap<uint64_t, ceph_filelock> held_locks; // current locks + multimap<uint64_t, ceph_filelock> waiting_locks; // locks waiting for other locks + // both of the above are keyed by starting offset + map<client_t, int> client_held_lock_counts; + map<client_t, int> client_waiting_lock_counts; + + /** + * Check if a lock is on the waiting_locks list. + * + * @param fl The filelock to check for + * @returns True if the lock is waiting, false otherwise + */ + bool is_waiting(const ceph_filelock &fl) const; + /** + * Remove a lock from the waiting_locks list + * + * @param fl The filelock to remove + */ + void remove_waiting(const ceph_filelock& fl); + /* + * Try to set a new lock. If it's blocked and wait_on_fail is true, + * add the lock to waiting_locks. + * The lock needs to be of type CEPH_LOCK_EXCL or CEPH_LOCK_SHARED. + * This may merge previous locks, or convert the type of already-owned + * locks. + * + * @param new_lock The lock to set + * @param wait_on_fail whether to wait until the lock can be set. + * Otherwise it fails immediately when blocked. + * + * @returns true if set, false if not set. + */ + bool add_lock(ceph_filelock& new_lock, bool wait_on_fail, bool replay, + bool *deadlock); + /** + * See if a lock is blocked by existing locks. If the lock is blocked, + * it will be set to the value of the first blocking lock. 
Otherwise, + * it will be returned unchanged, except for setting the type field + * to CEPH_LOCK_UNLOCK. + * + * @param testing_lock The lock to check for conflicts on. + */ + void look_for_lock(ceph_filelock& testing_lock); + + /* + * Remove lock(s) described in old_lock. This may involve splitting a + * previous lock or making a previous lock smaller. + * + * @param removal_lock The lock to remove + * @param activated_locks A return parameter, holding activated wait locks. + */ + void remove_lock(const ceph_filelock removal_lock, + list<ceph_filelock>& activated_locks); + + bool remove_all_from(client_t client); +private: + static const unsigned MAX_DEADLK_DEPTH = 5; + + /** + * Check if adding the lock causes deadlock + * + * @param fl The blocking filelock + * @param overlapping_locks list of all overlapping locks + * @param first_fl + * @depth recursion call depth + */ + bool is_deadlock(const ceph_filelock& fl, + list<multimap<uint64_t, ceph_filelock>::iterator>& + overlapping_locks, + const ceph_filelock *first_fl=NULL, unsigned depth=0) const; + + /** + * Add a lock to the waiting_locks list + * + * @param fl The filelock to add + */ + void add_waiting(const ceph_filelock& fl); + + /** + * Adjust old locks owned by a single process so that process can set + * a new lock of different type. Handle any changes needed to the old locks + * (and the new lock) so that once the new lock is inserted into the + * held_locks list the process has a coherent, non-fragmented set of lock + * ranges. Make sure any overlapping locks are combined, trimmed, and removed + * as needed. + * This function should only be called once you know the lock will be + * inserted, as it DOES adjust new_lock. You can call this function + * on an empty list, in which case it does nothing. + * This function does not remove elements from old_locks, so regard the list + * as bad information following function invocation. + * + * @param new_lock The new lock the process has requested. 
+ * @param old_locks list of all locks currently held by same + * client/process that overlap new_lock. + * @param neighbor_locks locks owned by same process that neighbor new_lock on + * left or right side. + */ + void adjust_locks(list<multimap<uint64_t, ceph_filelock>::iterator> old_locks, + ceph_filelock& new_lock, + list<multimap<uint64_t, ceph_filelock>::iterator> + neighbor_locks); + + //get last lock prior to start position + multimap<uint64_t, ceph_filelock>::iterator + get_lower_bound(uint64_t start, + multimap<uint64_t, ceph_filelock>& lock_map); + //get latest-starting lock that goes over the byte "end" + multimap<uint64_t, ceph_filelock>::iterator + get_last_before(uint64_t end, + multimap<uint64_t, ceph_filelock>& lock_map); + + /* + * See if an iterator's lock covers any of the same bounds as a given range + * Rules: locks cover "length" bytes from "start", so the last covered + * byte is at start + length - 1. + * If the length is 0, the lock covers from "start" to the end of the file. + */ + bool share_space(multimap<uint64_t, ceph_filelock>::iterator& iter, + uint64_t start, uint64_t end); + + bool share_space(multimap<uint64_t, ceph_filelock>::iterator& iter, + const ceph_filelock &lock) { + uint64_t end = lock.start; + if (lock.length) { + end += lock.length - 1; + } else { // zero length means end of file + end = uint64_t(-1); + } + return share_space(iter, lock.start, end); + } + /* + *get a list of all locks overlapping with the given lock's range + * lock: the lock to compare with. + * overlaps: an empty list, to be filled. + * Returns: true if at least one lock overlaps. 
+ */ + bool get_overlapping_locks(const ceph_filelock& lock, + list<multimap<uint64_t, + ceph_filelock>::iterator> & overlaps, + list<multimap<uint64_t, + ceph_filelock>::iterator> *self_neighbors); + + + bool get_overlapping_locks(const ceph_filelock& lock, + list<multimap<uint64_t, ceph_filelock>::iterator>& overlaps) { + return get_overlapping_locks(lock, overlaps, NULL); + } + + /** + * Get a list of all waiting locks that overlap with the given lock's range. + * lock: specifies the range to compare with + * overlaps: an empty list, to be filled + * Returns: true if at least one waiting_lock overlaps + */ + bool get_waiting_overlaps(const ceph_filelock& lock, + list<multimap<uint64_t, + ceph_filelock>::iterator>& overlaps); + /* + * split a list of locks up by whether they're owned by same + * process as given lock + * owner: the owning lock + * locks: the list of locks (obtained from get_overlapping_locks, probably) + * Will have all locks owned by owner removed + * owned_locks: an empty list, to be filled with the locks owned by owner + */ + void split_by_owner(const ceph_filelock& owner, + list<multimap<uint64_t, + ceph_filelock>::iterator> & locks, + list<multimap<uint64_t, + ceph_filelock>::iterator> & owned_locks); + + ceph_filelock *contains_exclusive_lock(list<multimap<uint64_t, + ceph_filelock>::iterator>& locks); + +public: + void encode(bufferlist& bl) const { + using ceph::encode; + encode(held_locks, bl); + encode(client_held_lock_counts, bl); + } + void decode(bufferlist::const_iterator& bl) { + using ceph::decode; + decode(held_locks, bl); + decode(client_held_lock_counts, bl); + } + bool empty() const { + return held_locks.empty() && waiting_locks.empty() && + client_held_lock_counts.empty() && + client_waiting_lock_counts.empty(); + } +}; +WRITE_CLASS_ENCODER(ceph_lock_state_t) + + +inline ostream& operator<<(ostream &out, const ceph_lock_state_t &l) { + out << "ceph_lock_state_t. 
held_locks.size()=" << l.held_locks.size() + << ", waiting_locks.size()=" << l.waiting_locks.size() + << ", client_held_lock_counts -- " << l.client_held_lock_counts + << "\n client_waiting_lock_counts -- " << l.client_waiting_lock_counts + << "\n held_locks -- "; + for (auto iter = l.held_locks.begin(); + iter != l.held_locks.end(); + ++iter) + out << iter->second; + out << "\n waiting_locks -- "; + for (auto iter =l.waiting_locks.begin(); + iter != l.waiting_locks.end(); + ++iter) + out << iter->second << "\n"; + return out; +} + +#endif diff --git a/src/mds/inode_backtrace.cc b/src/mds/inode_backtrace.cc new file mode 100644 index 00000000..50e986a1 --- /dev/null +++ b/src/mds/inode_backtrace.cc @@ -0,0 +1,163 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "inode_backtrace.h" + +#include "common/Formatter.h" + +/* inode_backpointer_t */ + +void inode_backpointer_t::encode(bufferlist& bl) const +{ + ENCODE_START(2, 2, bl); + encode(dirino, bl); + encode(dname, bl); + encode(version, bl); + ENCODE_FINISH(bl); +} + +void inode_backpointer_t::decode(bufferlist::const_iterator& bl) +{ + DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl); + decode(dirino, bl); + decode(dname, bl); + decode(version, bl); + DECODE_FINISH(bl); +} + +void inode_backpointer_t::decode_old(bufferlist::const_iterator& bl) +{ + using ceph::decode; + decode(dirino, bl); + decode(dname, bl); + decode(version, bl); +} + +void inode_backpointer_t::dump(Formatter *f) const +{ + f->dump_unsigned("dirino", dirino); + f->dump_string("dname", dname); + f->dump_unsigned("version", version); +} + +void inode_backpointer_t::generate_test_instances(list<inode_backpointer_t*>& ls) +{ + ls.push_back(new inode_backpointer_t); + ls.push_back(new inode_backpointer_t); + ls.back()->dirino = 1; + ls.back()->dname = "foo"; + ls.back()->version = 123; +} + + +/* + * inode_backtrace_t + */ + +void inode_backtrace_t::encode(bufferlist& bl) const +{ + 
ENCODE_START(5, 4, bl); + encode(ino, bl); + encode(ancestors, bl); + encode(pool, bl); + encode(old_pools, bl); + ENCODE_FINISH(bl); +} + +void inode_backtrace_t::decode(bufferlist::const_iterator& bl) +{ + DECODE_START_LEGACY_COMPAT_LEN(5, 4, 4, bl); + if (struct_v < 3) + return; // sorry, the old data was crap + decode(ino, bl); + if (struct_v >= 4) { + decode(ancestors, bl); + } else { + __u32 n; + decode(n, bl); + while (n--) { + ancestors.push_back(inode_backpointer_t()); + ancestors.back().decode_old(bl); + } + } + if (struct_v >= 5) { + decode(pool, bl); + decode(old_pools, bl); + } + DECODE_FINISH(bl); +} + +void inode_backtrace_t::dump(Formatter *f) const +{ + f->dump_unsigned("ino", ino); + f->open_array_section("ancestors"); + for (vector<inode_backpointer_t>::const_iterator p = ancestors.begin(); p != ancestors.end(); ++p) { + f->open_object_section("backpointer"); + p->dump(f); + f->close_section(); + } + f->close_section(); + f->dump_int("pool", pool); + f->open_array_section("old_pools"); + for (set<int64_t>::iterator p = old_pools.begin(); p != old_pools.end(); ++p) { + f->dump_int("old_pool", *p); + } + f->close_section(); +} + +void inode_backtrace_t::generate_test_instances(list<inode_backtrace_t*>& ls) +{ + ls.push_back(new inode_backtrace_t); + ls.push_back(new inode_backtrace_t); + ls.back()->ino = 1; + ls.back()->ancestors.push_back(inode_backpointer_t()); + ls.back()->ancestors.back().dirino = 123; + ls.back()->ancestors.back().dname = "bar"; + ls.back()->ancestors.back().version = 456; + ls.back()->pool = 0; + ls.back()->old_pools.insert(10); + ls.back()->old_pools.insert(7); +} + +int inode_backtrace_t::compare(const inode_backtrace_t& other, + bool *equivalent, bool *divergent) const +{ + int min_size = std::min(ancestors.size(),other.ancestors.size()); + *equivalent = true; + *divergent = false; + if (min_size == 0) + return 0; + int comparator = 0; + if (ancestors[0].version > other.ancestors[0].version) + comparator = 1; + else if 
(ancestors[0].version < other.ancestors[0].version) + comparator = -1; + if (ancestors[0].dirino != other.ancestors[0].dirino || + ancestors[0].dname != other.ancestors[0].dname) + *divergent = true; + for (int i = 1; i < min_size; ++i) { + if (*divergent) { + /** + * we already know the dentries and versions are + * incompatible; no point checking farther + */ + break; + } + if (ancestors[i].dirino != other.ancestors[i].dirino || + ancestors[i].dname != other.ancestors[i].dname) { + *equivalent = false; + return comparator; + } else if (ancestors[i].version > other.ancestors[i].version) { + if (comparator < 0) + *divergent = true; + comparator = 1; + } else if (ancestors[i].version < other.ancestors[i].version) { + if (comparator > 0) + *divergent = true; + comparator = -1; + } + } + if (*divergent) + *equivalent = false; + return comparator; +} diff --git a/src/mds/inode_backtrace.h b/src/mds/inode_backtrace.h new file mode 100644 index 00000000..7c60865c --- /dev/null +++ b/src/mds/inode_backtrace.h @@ -0,0 +1,97 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#ifndef CEPH_INODE_BACKTRACE_H +#define CEPH_INODE_BACKTRACE_H + +#include <string_view> + +#include "mdstypes.h" + +namespace ceph { + class Formatter; +} + +/** metadata backpointers **/ + +/* + * - inode_backpointer_t is just the _pointer_ portion; it doesn't + * tell us who we point _from_. + * + * - it _does_ include a version of the source object, so we can look + * at two different pointers (from the same inode) and tell which is + * newer. 
+ */ +struct inode_backpointer_t { + inodeno_t dirino; // containing directory ino + string dname; // linking dentry name + version_t version; // child's version at time of backpointer creation + + inode_backpointer_t() : version(0) {} + inode_backpointer_t(inodeno_t i, std::string_view d, version_t v) : dirino(i), dname(d), version(v) {} + + void encode(bufferlist& bl) const; + void decode(bufferlist::const_iterator &bl); + void decode_old(bufferlist::const_iterator &bl); + void dump(Formatter *f) const; + static void generate_test_instances(list<inode_backpointer_t*>& ls); +}; +WRITE_CLASS_ENCODER(inode_backpointer_t) + +inline bool operator==(const inode_backpointer_t& l, const inode_backpointer_t& r) { + return l.dirino == r.dirino && l.version == r.version && l.dname == r.dname; +} + +inline ostream& operator<<(ostream& out, const inode_backpointer_t& ib) { + return out << "<" << ib.dirino << "/" << ib.dname << " v" << ib.version << ">"; +} + +/* + * inode_backtrace_t is a complete ancestor backtraces for a given inode. + * we include who _we_ are, so that the backtrace can stand alone (as, say, + * an xattr on an object). + */ +struct inode_backtrace_t { + inodeno_t ino; // my ino + vector<inode_backpointer_t> ancestors; + int64_t pool; + // we use a set for old_pools to avoid duplicate entries, e.g. setlayout 0, 1, 0 + set<int64_t> old_pools; + + inode_backtrace_t() : pool(-1) {} + + void encode(bufferlist& bl) const; + void decode(bufferlist::const_iterator &bl); + void dump(Formatter *f) const; + static void generate_test_instances(list<inode_backtrace_t*>& ls); + + /** + * Compare two backtraces *for the same inode*. 
+ * @pre The backtraces are for the same inode + * + * @param other The backtrace to compare ourselves with + * @param equivalent A bool pointer which will be set to true if + * the other backtrace is equivalent to our own (has the same dentries) + * @param divergent A bool pointer which will be set to true if + * the backtraces have differing entries without versions supporting them + * + * @returns 1 if we are newer than the other, 0 if equal, -1 if older + */ + int compare(const inode_backtrace_t& other, + bool *equivalent, bool *divergent) const; +}; +WRITE_CLASS_ENCODER(inode_backtrace_t) + +inline ostream& operator<<(ostream& out, const inode_backtrace_t& it) { + return out << "(" << it.pool << ")" << it.ino << ":" << it.ancestors << "//" << it.old_pools; +} + +inline bool operator==(const inode_backtrace_t& l, + const inode_backtrace_t& r) { + return l.ino == r.ino && + l.pool == r.pool && + l.old_pools == r.old_pools && + l.ancestors == r.ancestors; +} + +#endif + diff --git a/src/mds/journal.cc b/src/mds/journal.cc new file mode 100644 index 00000000..3eb24af2 --- /dev/null +++ b/src/mds/journal.cc @@ -0,0 +1,3170 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#include "common/config.h" +#include "osdc/Journaler.h" +#include "events/ESubtreeMap.h" +#include "events/ESession.h" +#include "events/ESessions.h" + +#include "events/EMetaBlob.h" +#include "events/EResetJournal.h" +#include "events/ENoOp.h" + +#include "events/EUpdate.h" +#include "events/ESlaveUpdate.h" +#include "events/EOpen.h" +#include "events/ECommitted.h" + +#include "events/EExport.h" +#include "events/EImportStart.h" +#include "events/EImportFinish.h" +#include "events/EFragment.h" + +#include "events/ETableClient.h" +#include "events/ETableServer.h" + +#include "include/stringify.h" + +#include "LogSegment.h" + +#include "MDSRank.h" +#include "MDLog.h" +#include "MDCache.h" +#include "Server.h" +#include "Migrator.h" +#include "Mutation.h" + +#include "InoTable.h" +#include "MDSTableClient.h" +#include "MDSTableServer.h" + +#include "Locker.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mds +#undef dout_prefix +#define dout_prefix *_dout << "mds." 
<< mds->get_nodeid() << ".journal " + + +// ----------------------- +// LogSegment + +void LogSegment::try_to_expire(MDSRank *mds, MDSGatherBuilder &gather_bld, int op_prio) +{ + set<CDir*> commit; + + dout(6) << "LogSegment(" << seq << "/" << offset << ").try_to_expire" << dendl; + + ceph_assert(g_conf()->mds_kill_journal_expire_at != 1); + + // commit dirs + for (elist<CDir*>::iterator p = new_dirfrags.begin(); !p.end(); ++p) { + dout(20) << " new_dirfrag " << **p << dendl; + ceph_assert((*p)->is_auth()); + commit.insert(*p); + } + for (elist<CDir*>::iterator p = dirty_dirfrags.begin(); !p.end(); ++p) { + dout(20) << " dirty_dirfrag " << **p << dendl; + ceph_assert((*p)->is_auth()); + commit.insert(*p); + } + for (elist<CDentry*>::iterator p = dirty_dentries.begin(); !p.end(); ++p) { + dout(20) << " dirty_dentry " << **p << dendl; + ceph_assert((*p)->is_auth()); + commit.insert((*p)->get_dir()); + } + for (elist<CInode*>::iterator p = dirty_inodes.begin(); !p.end(); ++p) { + dout(20) << " dirty_inode " << **p << dendl; + ceph_assert((*p)->is_auth()); + if ((*p)->is_base()) { + (*p)->store(gather_bld.new_sub()); + } else + commit.insert((*p)->get_parent_dn()->get_dir()); + } + + if (!commit.empty()) { + for (set<CDir*>::iterator p = commit.begin(); + p != commit.end(); + ++p) { + CDir *dir = *p; + ceph_assert(dir->is_auth()); + if (dir->can_auth_pin()) { + dout(15) << "try_to_expire committing " << *dir << dendl; + dir->commit(0, gather_bld.new_sub(), false, op_prio); + } else { + dout(15) << "try_to_expire waiting for unfreeze on " << *dir << dendl; + dir->add_waiter(CDir::WAIT_UNFREEZE, gather_bld.new_sub()); + } + } + } + + // master ops with possibly uncommitted slaves + for (set<metareqid_t>::iterator p = uncommitted_masters.begin(); + p != uncommitted_masters.end(); + ++p) { + dout(10) << "try_to_expire waiting for slaves to ack commit on " << *p << dendl; + mds->mdcache->wait_for_uncommitted_master(*p, gather_bld.new_sub()); + } + + // slave ops that 
haven't been committed + for (set<metareqid_t>::iterator p = uncommitted_slaves.begin(); + p != uncommitted_slaves.end(); + ++p) { + dout(10) << "try_to_expire waiting for master to ack OP_FINISH on " << *p << dendl; + mds->mdcache->wait_for_uncommitted_slave(*p, gather_bld.new_sub()); + } + + // uncommitted fragments + for (set<dirfrag_t>::iterator p = uncommitted_fragments.begin(); + p != uncommitted_fragments.end(); + ++p) { + dout(10) << "try_to_expire waiting for uncommitted fragment " << *p << dendl; + mds->mdcache->wait_for_uncommitted_fragment(*p, gather_bld.new_sub()); + } + + // nudge scatterlocks + for (elist<CInode*>::iterator p = dirty_dirfrag_dir.begin(); !p.end(); ++p) { + CInode *in = *p; + dout(10) << "try_to_expire waiting for dirlock flush on " << *in << dendl; + mds->locker->scatter_nudge(&in->filelock, gather_bld.new_sub()); + } + for (elist<CInode*>::iterator p = dirty_dirfrag_dirfragtree.begin(); !p.end(); ++p) { + CInode *in = *p; + dout(10) << "try_to_expire waiting for dirfragtreelock flush on " << *in << dendl; + mds->locker->scatter_nudge(&in->dirfragtreelock, gather_bld.new_sub()); + } + for (elist<CInode*>::iterator p = dirty_dirfrag_nest.begin(); !p.end(); ++p) { + CInode *in = *p; + dout(10) << "try_to_expire waiting for nest flush on " << *in << dendl; + mds->locker->scatter_nudge(&in->nestlock, gather_bld.new_sub()); + } + + ceph_assert(g_conf()->mds_kill_journal_expire_at != 2); + + // open files and snap inodes + if (!open_files.empty()) { + ceph_assert(!mds->mdlog->is_capped()); // hmm FIXME + EOpen *le = 0; + LogSegment *ls = mds->mdlog->get_current_segment(); + ceph_assert(ls != this); + elist<CInode*>::iterator p = open_files.begin(member_offset(CInode, item_open_file)); + while (!p.end()) { + CInode *in = *p; + ++p; + if (in->last != CEPH_NOSNAP && in->is_auth() && !in->client_snap_caps.empty()) { + // journal snap inodes that need flush. 
This simplify the mds failover hanlding + dout(20) << "try_to_expire requeueing snap needflush inode " << *in << dendl; + if (!le) { + le = new EOpen(mds->mdlog); + mds->mdlog->start_entry(le); + } + le->add_clean_inode(in); + ls->open_files.push_back(&in->item_open_file); + } else { + // open files are tracked by open file table, no need to journal them again + in->item_open_file.remove_myself(); + } + } + if (le) { + mds->mdlog->submit_entry(le); + mds->mdlog->wait_for_safe(gather_bld.new_sub()); + dout(10) << "try_to_expire waiting for open files to rejournal" << dendl; + } + } + + ceph_assert(g_conf()->mds_kill_journal_expire_at != 3); + + // backtraces to be stored/updated + for (elist<CInode*>::iterator p = dirty_parent_inodes.begin(); !p.end(); ++p) { + CInode *in = *p; + ceph_assert(in->is_auth()); + if (in->can_auth_pin()) { + dout(15) << "try_to_expire waiting for storing backtrace on " << *in << dendl; + in->store_backtrace(gather_bld.new_sub(), op_prio); + } else { + dout(15) << "try_to_expire waiting for unfreeze on " << *in << dendl; + in->add_waiter(CInode::WAIT_UNFREEZE, gather_bld.new_sub()); + } + } + + ceph_assert(g_conf()->mds_kill_journal_expire_at != 4); + + // idalloc + if (inotablev > mds->inotable->get_committed_version()) { + dout(10) << "try_to_expire saving inotable table, need " << inotablev + << ", committed is " << mds->inotable->get_committed_version() + << " (" << mds->inotable->get_committing_version() << ")" + << dendl; + mds->inotable->save(gather_bld.new_sub(), inotablev); + } + + // sessionmap + if (sessionmapv > mds->sessionmap.get_committed()) { + dout(10) << "try_to_expire saving sessionmap, need " << sessionmapv + << ", committed is " << mds->sessionmap.get_committed() + << " (" << mds->sessionmap.get_committing() << ")" + << dendl; + mds->sessionmap.save(gather_bld.new_sub(), sessionmapv); + } + + // updates to sessions for completed_requests + mds->sessionmap.save_if_dirty(touched_sessions, &gather_bld); + 
touched_sessions.clear(); + + // pending commit atids + for (map<int, ceph::unordered_set<version_t> >::iterator p = pending_commit_tids.begin(); + p != pending_commit_tids.end(); + ++p) { + MDSTableClient *client = mds->get_table_client(p->first); + ceph_assert(client); + for (ceph::unordered_set<version_t>::iterator q = p->second.begin(); + q != p->second.end(); + ++q) { + dout(10) << "try_to_expire " << get_mdstable_name(p->first) << " transaction " << *q + << " pending commit (not yet acked), waiting" << dendl; + ceph_assert(!client->has_committed(*q)); + client->wait_for_ack(*q, gather_bld.new_sub()); + } + } + + // table servers + for (map<int, version_t>::iterator p = tablev.begin(); + p != tablev.end(); + ++p) { + MDSTableServer *server = mds->get_table_server(p->first); + ceph_assert(server); + if (p->second > server->get_committed_version()) { + dout(10) << "try_to_expire waiting for " << get_mdstable_name(p->first) + << " to save, need " << p->second << dendl; + server->save(gather_bld.new_sub()); + } + } + + // truncating + for (set<CInode*>::iterator p = truncating_inodes.begin(); + p != truncating_inodes.end(); + ++p) { + dout(10) << "try_to_expire waiting for truncate of " << **p << dendl; + (*p)->add_waiter(CInode::WAIT_TRUNC, gather_bld.new_sub()); + } + + if (gather_bld.has_subs()) { + dout(6) << "LogSegment(" << seq << "/" << offset << ").try_to_expire waiting" << dendl; + mds->mdlog->flush(); + } else { + ceph_assert(g_conf()->mds_kill_journal_expire_at != 5); + dout(6) << "LogSegment(" << seq << "/" << offset << ").try_to_expire success" << dendl; + } +} + + +// ----------------------- +// EMetaBlob + +void EMetaBlob::add_dir_context(CDir *dir, int mode) +{ + MDSRank *mds = dir->cache->mds; + + list<CDentry*> parents; + + // it may be okay not to include the maybe items, if + // - we journaled the maybe child inode in this segment + // - that subtree turns out to be unambiguously auth + list<CDentry*> maybe; + bool maybenot = false; + + while 
(true) { + // already have this dir? (we must always add in order) + if (lump_map.count(dir->dirfrag())) { + dout(20) << "EMetaBlob::add_dir_context(" << dir << ") have lump " << dir->dirfrag() << dendl; + break; + } + + // stop at root/stray + CInode *diri = dir->get_inode(); + CDentry *parent = diri->get_projected_parent_dn(); + + if (mode == TO_AUTH_SUBTREE_ROOT) { + // subtree root? + if (dir->is_subtree_root()) { + // match logic in MDCache::create_subtree_map() + if (dir->get_dir_auth().first == mds->get_nodeid()) { + mds_authority_t parent_auth = parent ? parent->authority() : CDIR_AUTH_UNDEF; + if (parent_auth.first == dir->get_dir_auth().first) { + if (parent_auth.second == CDIR_AUTH_UNKNOWN && + !dir->is_ambiguous_dir_auth() && + !dir->state_test(CDir::STATE_EXPORTBOUND) && + !dir->state_test(CDir::STATE_AUXSUBTREE) && + !diri->state_test(CInode::STATE_AMBIGUOUSAUTH)) { + dout(0) << "EMetaBlob::add_dir_context unexpected subtree " << *dir << dendl; + ceph_abort(); + } + dout(20) << "EMetaBlob::add_dir_context(" << dir << ") ambiguous or transient subtree " << dendl; + } else { + // it's an auth subtree, we don't need maybe (if any), and we're done. + dout(20) << "EMetaBlob::add_dir_context(" << dir << ") reached unambig auth subtree, don't need " << maybe + << " at " << *dir << dendl; + maybe.clear(); + break; + } + } else { + dout(20) << "EMetaBlob::add_dir_context(" << dir << ") reached ambig or !auth subtree, need " << maybe + << " at " << *dir << dendl; + // we need the maybe list after all! + parents.splice(parents.begin(), maybe); + maybenot = false; + } + } + + // was the inode journaled in this blob? + if (event_seq && diri->last_journaled == event_seq) { + dout(20) << "EMetaBlob::add_dir_context(" << dir << ") already have diri this blob " << *diri << dendl; + break; + } + + // have we journaled this inode since the last subtree map? 
+ if (!maybenot && last_subtree_map && diri->last_journaled >= last_subtree_map) { + dout(20) << "EMetaBlob::add_dir_context(" << dir << ") already have diri in this segment (" + << diri->last_journaled << " >= " << last_subtree_map << "), setting maybenot flag " + << *diri << dendl; + maybenot = true; + } + } + + if (!parent) + break; + + if (maybenot) { + dout(25) << "EMetaBlob::add_dir_context(" << dir << ") maybe " << *parent << dendl; + maybe.push_front(parent); + } else { + dout(25) << "EMetaBlob::add_dir_context(" << dir << ") definitely " << *parent << dendl; + parents.push_front(parent); + } + + dir = parent->get_dir(); + } + + parents.splice(parents.begin(), maybe); + + dout(20) << "EMetaBlob::add_dir_context final: " << parents << dendl; + for (list<CDentry*>::iterator p = parents.begin(); p != parents.end(); ++p) { + ceph_assert((*p)->get_projected_linkage()->is_primary()); + add_dentry(*p, false); + } +} + +void EMetaBlob::update_segment(LogSegment *ls) +{ + // dirty inode mtimes + // -> handled directly by Server.cc, replay() + + // alloc table update? 
+ if (inotablev) + ls->inotablev = inotablev; + if (sessionmapv) + ls->sessionmapv = sessionmapv; + + // truncated inodes + // -> handled directly by Server.cc + + // client requests + // note the newest request per client + //if (!client_reqs.empty()) + // ls->last_client_tid[client_reqs.rbegin()->client] = client_reqs.rbegin()->tid); +} + +// EMetaBlob::fullbit + +void EMetaBlob::fullbit::encode(bufferlist& bl, uint64_t features) const { + ENCODE_START(8, 5, bl); + encode(dn, bl); + encode(dnfirst, bl); + encode(dnlast, bl); + encode(dnv, bl); + encode(inode, bl, features); + encode(xattrs, bl); + if (inode.is_symlink()) + encode(symlink, bl); + if (inode.is_dir()) { + encode(dirfragtree, bl); + encode(snapbl, bl); + } + encode(state, bl); + if (old_inodes.empty()) { + encode(false, bl); + } else { + encode(true, bl); + encode(old_inodes, bl, features); + } + if (!inode.is_dir()) + encode(snapbl, bl); + encode(oldest_snap, bl); + ENCODE_FINISH(bl); +} + +void EMetaBlob::fullbit::decode(bufferlist::const_iterator &bl) { + DECODE_START_LEGACY_COMPAT_LEN(7, 5, 5, bl); + decode(dn, bl); + decode(dnfirst, bl); + decode(dnlast, bl); + decode(dnv, bl); + decode(inode, bl); + decode_noshare(xattrs, bl); + if (inode.is_symlink()) + decode(symlink, bl); + if (inode.is_dir()) { + decode(dirfragtree, bl); + decode(snapbl, bl); + if ((struct_v == 2) || (struct_v == 3)) { + bool dir_layout_exists; + decode(dir_layout_exists, bl); + if (dir_layout_exists) { + __u8 dir_struct_v; + decode(dir_struct_v, bl); // default_file_layout version + decode(inode.layout, bl); // and actual layout, that we care about + } + } + } + if (struct_v >= 6) { + decode(state, bl); + } else { + bool dirty; + decode(dirty, bl); + state = dirty ? 
EMetaBlob::fullbit::STATE_DIRTY : 0; + } + + if (struct_v >= 3) { + bool old_inodes_present; + decode(old_inodes_present, bl); + if (old_inodes_present) { + decode(old_inodes, bl); + } + } + if (!inode.is_dir()) { + if (struct_v >= 7) + decode(snapbl, bl); + } + if (struct_v >= 8) + decode(oldest_snap, bl); + else + oldest_snap = CEPH_NOSNAP; + + DECODE_FINISH(bl); +} + +void EMetaBlob::fullbit::dump(Formatter *f) const +{ + f->dump_string("dentry", dn); + f->dump_stream("snapid.first") << dnfirst; + f->dump_stream("snapid.last") << dnlast; + f->dump_int("dentry version", dnv); + f->open_object_section("inode"); + inode.dump(f); + f->close_section(); // inode + f->open_object_section("xattrs"); + for (const auto &p : xattrs) { + std::string s(p.second.c_str(), p.second.length()); + f->dump_string(p.first.c_str(), s); + } + f->close_section(); // xattrs + if (inode.is_symlink()) { + f->dump_string("symlink", symlink); + } + if (inode.is_dir()) { + f->dump_stream("frag tree") << dirfragtree; + f->dump_string("has_snapbl", snapbl.length() ? 
"true" : "false"); + if (inode.has_layout()) { + f->open_object_section("file layout policy"); + // FIXME + f->dump_string("layout", "the layout exists"); + f->close_section(); // file layout policy + } + } + f->dump_string("state", state_string()); + if (!old_inodes.empty()) { + f->open_array_section("old inodes"); + for (const auto &p : old_inodes) { + f->open_object_section("inode"); + f->dump_int("snapid", p.first); + p.second.dump(f); + f->close_section(); // inode + } + f->close_section(); // old inodes + } +} + +void EMetaBlob::fullbit::generate_test_instances(list<EMetaBlob::fullbit*>& ls) +{ + CInode::mempool_inode inode; + fragtree_t fragtree; + CInode::mempool_xattr_map empty_xattrs; + bufferlist empty_snapbl; + fullbit *sample = new fullbit("/testdn", 0, 0, 0, + inode, fragtree, empty_xattrs, "", 0, empty_snapbl, + false, NULL); + ls.push_back(sample); +} + +void EMetaBlob::fullbit::update_inode(MDSRank *mds, CInode *in) +{ + in->inode = inode; + in->xattrs = xattrs; + in->maybe_export_pin(); + if (in->inode.is_dir()) { + if (!(in->dirfragtree == dirfragtree)) { + dout(10) << "EMetaBlob::fullbit::update_inode dft " << in->dirfragtree << " -> " + << dirfragtree << " on " << *in << dendl; + in->dirfragtree = dirfragtree; + in->force_dirfrags(); + if (in->has_dirfrags() && in->authority() == CDIR_AUTH_UNDEF) { + list<CDir*> ls; + in->get_nested_dirfrags(ls); + for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) { + CDir *dir = *p; + if (dir->get_num_any() == 0 && + mds->mdcache->can_trim_non_auth_dirfrag(dir)) { + dout(10) << " closing empty non-auth dirfrag " << *dir << dendl; + in->close_dirfrag(dir->get_frag()); + } + } + } + } + } else if (in->inode.is_symlink()) { + in->symlink = symlink; + } + in->old_inodes = old_inodes; + if (!in->old_inodes.empty()) { + snapid_t min_first = in->old_inodes.rbegin()->first + 1; + if (min_first > in->first) + in->first = min_first; + } + + /* + * we can do this before linking hte inode bc the split_at 
would + * be a no-op.. we have no children (namely open snaprealms) to + * divy up + */ + in->oldest_snap = oldest_snap; + in->decode_snap_blob(snapbl); + + /* + * In case there was anything malformed in the journal that we are + * replaying, do sanity checks on the inodes we're replaying and + * go damaged instead of letting any trash into a live cache + */ + if (in->is_file()) { + // Files must have valid layouts with a pool set + if (in->inode.layout.pool_id == -1 || !in->inode.layout.is_valid()) { + dout(0) << "EMetaBlob.replay invalid layout on ino " << *in + << ": " << in->inode.layout << dendl; + std::ostringstream oss; + oss << "Invalid layout for inode " << in->ino() << " in journal"; + mds->clog->error() << oss.str(); + mds->damaged(); + ceph_abort(); // Should be unreachable because damaged() calls respawn() + } + } +} + +// EMetaBlob::remotebit + +void EMetaBlob::remotebit::encode(bufferlist& bl) const +{ + ENCODE_START(2, 2, bl); + encode(dn, bl); + encode(dnfirst, bl); + encode(dnlast, bl); + encode(dnv, bl); + encode(ino, bl); + encode(d_type, bl); + encode(dirty, bl); + ENCODE_FINISH(bl); +} + +void EMetaBlob::remotebit::decode(bufferlist::const_iterator &bl) +{ + DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl); + decode(dn, bl); + decode(dnfirst, bl); + decode(dnlast, bl); + decode(dnv, bl); + decode(ino, bl); + decode(d_type, bl); + decode(dirty, bl); + DECODE_FINISH(bl); +} + +void EMetaBlob::remotebit::dump(Formatter *f) const +{ + f->dump_string("dentry", dn); + f->dump_int("snapid.first", dnfirst); + f->dump_int("snapid.last", dnlast); + f->dump_int("dentry version", dnv); + f->dump_int("inodeno", ino); + uint32_t type = DTTOIF(d_type) & S_IFMT; // convert to type entries + string type_string; + switch(type) { + case S_IFREG: + type_string = "file"; break; + case S_IFLNK: + type_string = "symlink"; break; + case S_IFDIR: + type_string = "directory"; break; + case S_IFIFO: + type_string = "fifo"; break; + case S_IFCHR: + type_string = "chr"; break; 
+ case S_IFBLK: + type_string = "blk"; break; + case S_IFSOCK: + type_string = "sock"; break; + default: + assert (0 == "unknown d_type!"); + } + f->dump_string("d_type", type_string); + f->dump_string("dirty", dirty ? "true" : "false"); +} + +void EMetaBlob::remotebit:: +generate_test_instances(list<EMetaBlob::remotebit*>& ls) +{ + remotebit *remote = new remotebit("/test/dn", 0, 10, 15, 1, IFTODT(S_IFREG), false); + ls.push_back(remote); +} + +// EMetaBlob::nullbit + +void EMetaBlob::nullbit::encode(bufferlist& bl) const +{ + ENCODE_START(2, 2, bl); + encode(dn, bl); + encode(dnfirst, bl); + encode(dnlast, bl); + encode(dnv, bl); + encode(dirty, bl); + ENCODE_FINISH(bl); +} + +void EMetaBlob::nullbit::decode(bufferlist::const_iterator &bl) +{ + DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl); + decode(dn, bl); + decode(dnfirst, bl); + decode(dnlast, bl); + decode(dnv, bl); + decode(dirty, bl); + DECODE_FINISH(bl); +} + +void EMetaBlob::nullbit::dump(Formatter *f) const +{ + f->dump_string("dentry", dn); + f->dump_int("snapid.first", dnfirst); + f->dump_int("snapid.last", dnlast); + f->dump_int("dentry version", dnv); + f->dump_string("dirty", dirty ? 
"true" : "false"); +} + +void EMetaBlob::nullbit::generate_test_instances(list<nullbit*>& ls) +{ + nullbit *sample = new nullbit("/test/dentry", 0, 10, 15, false); + nullbit *sample2 = new nullbit("/test/dirty", 10, 20, 25, true); + ls.push_back(sample); + ls.push_back(sample2); +} + +// EMetaBlob::dirlump + +void EMetaBlob::dirlump::encode(bufferlist& bl, uint64_t features) const +{ + ENCODE_START(2, 2, bl); + encode(fnode, bl); + encode(state, bl); + encode(nfull, bl); + encode(nremote, bl); + encode(nnull, bl); + _encode_bits(features); + encode(dnbl, bl); + ENCODE_FINISH(bl); +} + +void EMetaBlob::dirlump::decode(bufferlist::const_iterator &bl) +{ + DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl) + decode(fnode, bl); + decode(state, bl); + decode(nfull, bl); + decode(nremote, bl); + decode(nnull, bl); + decode(dnbl, bl); + dn_decoded = false; // don't decode bits unless we need them. + DECODE_FINISH(bl); +} + +void EMetaBlob::dirlump::dump(Formatter *f) const +{ + if (!dn_decoded) { + dirlump *me = const_cast<dirlump*>(this); + me->_decode_bits(); + } + f->open_object_section("fnode"); + fnode.dump(f); + f->close_section(); // fnode + f->dump_string("state", state_string()); + f->dump_int("nfull", nfull); + f->dump_int("nremote", nremote); + f->dump_int("nnull", nnull); + + f->open_array_section("full bits"); + for (const auto& iter : dfull) { + f->open_object_section("fullbit"); + iter.dump(f); + f->close_section(); // fullbit + } + f->close_section(); // full bits + f->open_array_section("remote bits"); + for (const auto& iter : dremote) { + f->open_object_section("remotebit"); + iter.dump(f); + f->close_section(); // remotebit + } + f->close_section(); // remote bits + f->open_array_section("null bits"); + for (const auto& iter : dnull) { + f->open_object_section("null bit"); + iter.dump(f); + f->close_section(); // null bit + } + f->close_section(); // null bits +} + +void EMetaBlob::dirlump::generate_test_instances(list<dirlump*>& ls) +{ + ls.push_back(new 
dirlump()); +} + +/** + * EMetaBlob proper + */ +void EMetaBlob::encode(bufferlist& bl, uint64_t features) const +{ + ENCODE_START(8, 5, bl); + encode(lump_order, bl); + encode(lump_map, bl, features); + encode(roots, bl, features); + encode(table_tids, bl); + encode(opened_ino, bl); + encode(allocated_ino, bl); + encode(used_preallocated_ino, bl); + encode(preallocated_inos, bl); + encode(client_name, bl); + encode(inotablev, bl); + encode(sessionmapv, bl); + encode(truncate_start, bl); + encode(truncate_finish, bl); + encode(destroyed_inodes, bl); + encode(client_reqs, bl); + encode(renamed_dirino, bl); + encode(renamed_dir_frags, bl); + { + // make MDSRank use v6 format happy + int64_t i = -1; + bool b = false; + encode(i, bl); + encode(b, bl); + } + encode(client_flushes, bl); + ENCODE_FINISH(bl); +} +void EMetaBlob::decode(bufferlist::const_iterator &bl) +{ + DECODE_START_LEGACY_COMPAT_LEN(7, 5, 5, bl); + decode(lump_order, bl); + decode(lump_map, bl); + if (struct_v >= 4) { + decode(roots, bl); + } else { + bufferlist rootbl; + decode(rootbl, bl); + if (rootbl.length()) { + auto p = rootbl.cbegin(); + roots.emplace_back(p); + } + } + decode(table_tids, bl); + decode(opened_ino, bl); + decode(allocated_ino, bl); + decode(used_preallocated_ino, bl); + decode(preallocated_inos, bl); + decode(client_name, bl); + decode(inotablev, bl); + decode(sessionmapv, bl); + decode(truncate_start, bl); + decode(truncate_finish, bl); + decode(destroyed_inodes, bl); + if (struct_v >= 2) { + decode(client_reqs, bl); + } else { + list<metareqid_t> r; + decode(r, bl); + while (!r.empty()) { + client_reqs.push_back(pair<metareqid_t,uint64_t>(r.front(), 0)); + r.pop_front(); + } + } + if (struct_v >= 3) { + decode(renamed_dirino, bl); + decode(renamed_dir_frags, bl); + } + if (struct_v >= 6) { + // ignore + int64_t i; + bool b; + decode(i, bl); + decode(b, bl); + } + if (struct_v >= 8) { + decode(client_flushes, bl); + } + DECODE_FINISH(bl); +} + + +/** + * Get all inodes touched 
by this metablob. Includes the 'bits' within + * dirlumps, and the inodes of the dirs themselves. + */ +void EMetaBlob::get_inodes( + std::set<inodeno_t> &inodes) const +{ + // For all dirlumps in this metablob + for (std::map<dirfrag_t, dirlump>::const_iterator i = lump_map.begin(); i != lump_map.end(); ++i) { + // Record inode of dirlump + inodeno_t const dir_ino = i->first.ino; + inodes.insert(dir_ino); + + // Decode dirlump bits + dirlump const &dl = i->second; + dl._decode_bits(); + + // Record inodes of fullbits + for (const auto& iter : dl.get_dfull()) { + inodes.insert(iter.inode.ino); + } + + // Record inodes of remotebits + for (const auto& iter : dl.get_dremote()) { + inodes.insert(iter.ino); + } + } +} + + +/** + * Get a map of dirfrag to set of dentries in that dirfrag which are + * touched in this operation. + */ +void EMetaBlob::get_dentries(std::map<dirfrag_t, std::set<std::string> > &dentries) const +{ + for (std::map<dirfrag_t, dirlump>::const_iterator i = lump_map.begin(); i != lump_map.end(); ++i) { + dirlump const &dl = i->second; + dirfrag_t const &df = i->first; + + // Get all bits + dl._decode_bits(); + + // For all bits, store dentry + for (const auto& iter : dl.get_dfull()) { + dentries[df].insert(iter.dn); + } + for (const auto& iter : dl.get_dremote()) { + dentries[df].insert(iter.dn); + } + for (const auto& iter : dl.get_dnull()) { + dentries[df].insert(iter.dn); + } + } +} + + + +/** + * Calculate all paths that we can infer are touched by this metablob. Only uses + * information local to this metablob so it may only be the path within the + * subtree. 
+ */ +void EMetaBlob::get_paths( + std::vector<std::string> &paths) const +{ + // Each dentry has a 'location' which is a 2-tuple of parent inode and dentry name + typedef std::pair<inodeno_t, std::string> Location; + + // Whenever we see a dentry within a dirlump, we remember it as a child of + // the dirlump's inode + std::map<inodeno_t, std::list<std::string> > children; + + // Whenever we see a location for an inode, remember it: this allows us to + // build a path given an inode + std::map<inodeno_t, Location> ino_locations; + + // Special case: operations on root inode populate roots but not dirlumps + if (lump_map.empty() && !roots.empty()) { + paths.push_back("/"); + return; + } + + // First pass + // ========== + // Build a tiny local metadata cache for the path structure in this metablob + for (std::map<dirfrag_t, dirlump>::const_iterator i = lump_map.begin(); i != lump_map.end(); ++i) { + inodeno_t const dir_ino = i->first.ino; + dirlump const &dl = i->second; + dl._decode_bits(); + + for (const auto& iter : dl.get_dfull()) { + std::string_view dentry = iter.dn; + children[dir_ino].emplace_back(dentry); + ino_locations[iter.inode.ino] = Location(dir_ino, dentry); + } + + for (const auto& iter : dl.get_dremote()) { + std::string_view dentry = iter.dn; + children[dir_ino].emplace_back(dentry); + } + + for (const auto& iter : dl.get_dnull()) { + std::string_view dentry = iter.dn; + children[dir_ino].emplace_back(dentry); + } + } + + std::vector<Location> leaf_locations; + + // Second pass + // =========== + // Output paths for all childless nodes in the metablob + for (std::map<dirfrag_t, dirlump>::const_iterator i = lump_map.begin(); i != lump_map.end(); ++i) { + inodeno_t const dir_ino = i->first.ino; + dirlump const &dl = i->second; + dl._decode_bits(); + + for (const auto& iter : dl.get_dfull()) { + std::string_view dentry = iter.dn; + if (children.find(iter.inode.ino) == children.end()) { + leaf_locations.push_back(Location(dir_ino, dentry)); + } + } + 
+ for (const auto& iter : dl.get_dremote()) { + std::string_view dentry = iter.dn; + leaf_locations.push_back(Location(dir_ino, dentry)); + } + + for (const auto& iter : dl.get_dnull()) { + std::string_view dentry = iter.dn; + leaf_locations.push_back(Location(dir_ino, dentry)); + } + } + + // For all the leaf locations identified, generate paths + for (std::vector<Location>::iterator i = leaf_locations.begin(); i != leaf_locations.end(); ++i) { + Location const &loc = *i; + std::string path = loc.second; + inodeno_t ino = loc.first; + std::map<inodeno_t, Location>::iterator iter = ino_locations.find(ino); + while(iter != ino_locations.end()) { + Location const &loc = iter->second; + if (!path.empty()) { + path = loc.second + "/" + path; + } else { + path = loc.second + path; + } + iter = ino_locations.find(loc.first); + } + + paths.push_back(path); + } +} + + +void EMetaBlob::dump(Formatter *f) const +{ + f->open_array_section("lumps"); + for (const auto& d : lump_order) { + f->open_object_section("lump"); + f->open_object_section("dirfrag"); + f->dump_stream("dirfrag") << d; + f->close_section(); // dirfrag + f->open_object_section("dirlump"); + lump_map.at(d).dump(f); + f->close_section(); // dirlump + f->close_section(); // lump + } + f->close_section(); // lumps + + f->open_array_section("roots"); + for (const auto& iter : roots) { + f->open_object_section("root"); + iter.dump(f); + f->close_section(); // root + } + f->close_section(); // roots + + f->open_array_section("tableclient tranactions"); + for (const auto& p : table_tids) { + f->open_object_section("transaction"); + f->dump_int("tid", p.first); + f->dump_int("version", p.second); + f->close_section(); // transaction + } + f->close_section(); // tableclient transactions + + f->dump_int("renamed directory inodeno", renamed_dirino); + + f->open_array_section("renamed directory fragments"); + for (const auto& p : renamed_dir_frags) { + f->dump_int("frag", p); + } + f->close_section(); // renamed 
directory fragments + + f->dump_int("inotable version", inotablev); + f->dump_int("SessionMap version", sessionmapv); + f->dump_int("allocated ino", allocated_ino); + + f->dump_stream("preallocated inos") << preallocated_inos; + f->dump_int("used preallocated ino", used_preallocated_ino); + + f->open_object_section("client name"); + client_name.dump(f); + f->close_section(); // client name + + f->open_array_section("inodes starting a truncate"); + for(const auto& ino : truncate_start) { + f->dump_int("inodeno", ino); + } + f->close_section(); // truncate inodes + f->open_array_section("inodes finishing a truncated"); + for(const auto& p : truncate_finish) { + f->open_object_section("inode+segment"); + f->dump_int("inodeno", p.first); + f->dump_int("truncate starting segment", p.second); + f->close_section(); // truncated inode + } + f->close_section(); // truncate finish inodes + + f->open_array_section("destroyed inodes"); + for(vector<inodeno_t>::const_iterator i = destroyed_inodes.begin(); + i != destroyed_inodes.end(); ++i) { + f->dump_int("inodeno", *i); + } + f->close_section(); // destroyed inodes + + f->open_array_section("client requests"); + for(const auto& p : client_reqs) { + f->open_object_section("Client request"); + f->dump_stream("request ID") << p.first; + f->dump_int("oldest request on client", p.second); + f->close_section(); // request + } + f->close_section(); // client requests +} + +void EMetaBlob::generate_test_instances(list<EMetaBlob*>& ls) +{ + ls.push_back(new EMetaBlob()); +} + +void EMetaBlob::replay(MDSRank *mds, LogSegment *logseg, MDSlaveUpdate *slaveup) +{ + dout(10) << "EMetaBlob.replay " << lump_map.size() << " dirlumps by " << client_name << dendl; + + ceph_assert(logseg); + + ceph_assert(g_conf()->mds_kill_journal_replay_at != 1); + + for (auto& p : roots) { + CInode *in = mds->mdcache->get_inode(p.inode.ino); + bool isnew = in ? 
false:true; + if (!in) + in = new CInode(mds->mdcache, false, 2, CEPH_NOSNAP); + p.update_inode(mds, in); + + if (isnew) + mds->mdcache->add_inode(in); + if (p.is_dirty()) in->_mark_dirty(logseg); + dout(10) << "EMetaBlob.replay " << (isnew ? " added root ":" updated root ") << *in << dendl; + } + + CInode *renamed_diri = 0; + CDir *olddir = 0; + if (renamed_dirino) { + renamed_diri = mds->mdcache->get_inode(renamed_dirino); + if (renamed_diri) + dout(10) << "EMetaBlob.replay renamed inode is " << *renamed_diri << dendl; + else + dout(10) << "EMetaBlob.replay don't have renamed ino " << renamed_dirino << dendl; + + int nnull = 0; + for (const auto& lp : lump_order) { + dirlump &lump = lump_map[lp]; + if (lump.nnull) { + dout(10) << "EMetaBlob.replay found null dentry in dir " << lp << dendl; + nnull += lump.nnull; + } + } + ceph_assert(nnull <= 1); + } + + // keep track of any inodes we unlink and don't relink elsewhere + map<CInode*, CDir*> unlinked; + set<CInode*> linked; + + // walk through my dirs (in order!) + int count = 0; + for (const auto& lp : lump_order) { + dout(10) << "EMetaBlob.replay dir " << lp << dendl; + dirlump &lump = lump_map[lp]; + + // the dir + CDir *dir = mds->mdcache->get_force_dirfrag(lp, true); + if (!dir) { + // hmm. do i have the inode? 
+ CInode *diri = mds->mdcache->get_inode((lp).ino); + if (!diri) { + if (MDS_INO_IS_MDSDIR(lp.ino)) { + ceph_assert(MDS_INO_MDSDIR(mds->get_nodeid()) != lp.ino); + diri = mds->mdcache->create_system_inode(lp.ino, S_IFDIR|0755); + diri->state_clear(CInode::STATE_AUTH); + dout(10) << "EMetaBlob.replay created base " << *diri << dendl; + } else { + dout(0) << "EMetaBlob.replay missing dir ino " << lp.ino << dendl; + mds->clog->error() << "failure replaying journal (EMetaBlob)"; + mds->damaged(); + ceph_abort(); // Should be unreachable because damaged() calls respawn() + } + } + + // create the dirfrag + dir = diri->get_or_open_dirfrag(mds->mdcache, lp.frag); + + if (MDS_INO_IS_BASE(lp.ino)) + mds->mdcache->adjust_subtree_auth(dir, CDIR_AUTH_UNDEF); + + dout(10) << "EMetaBlob.replay added dir " << *dir << dendl; + } + dir->set_version( lump.fnode.version ); + dir->fnode = lump.fnode; + + if (lump.is_importing()) { + dir->state_set(CDir::STATE_AUTH); + dir->state_clear(CDir::STATE_COMPLETE); + } + if (lump.is_dirty()) { + dir->_mark_dirty(logseg); + + if (!(dir->fnode.rstat == dir->fnode.accounted_rstat)) { + dout(10) << "EMetaBlob.replay dirty nestinfo on " << *dir << dendl; + mds->locker->mark_updated_scatterlock(&dir->inode->nestlock); + logseg->dirty_dirfrag_nest.push_back(&dir->inode->item_dirty_dirfrag_nest); + } else { + dout(10) << "EMetaBlob.replay clean nestinfo on " << *dir << dendl; + } + if (!(dir->fnode.fragstat == dir->fnode.accounted_fragstat)) { + dout(10) << "EMetaBlob.replay dirty fragstat on " << *dir << dendl; + mds->locker->mark_updated_scatterlock(&dir->inode->filelock); + logseg->dirty_dirfrag_dir.push_back(&dir->inode->item_dirty_dirfrag_dir); + } else { + dout(10) << "EMetaBlob.replay clean fragstat on " << *dir << dendl; + } + } + if (lump.is_dirty_dft()) { + dout(10) << "EMetaBlob.replay dirty dirfragtree on " << *dir << dendl; + dir->state_set(CDir::STATE_DIRTYDFT); + mds->locker->mark_updated_scatterlock(&dir->inode->dirfragtreelock); + 
logseg->dirty_dirfrag_dirfragtree.push_back(&dir->inode->item_dirty_dirfrag_dirfragtree); + } + if (lump.is_new()) + dir->mark_new(logseg); + if (lump.is_complete()) + dir->mark_complete(); + + dout(10) << "EMetaBlob.replay updated dir " << *dir << dendl; + + // decode bits + lump._decode_bits(); + + // full dentry+inode pairs + for (auto& fb : lump._get_dfull()) { + CDentry *dn = dir->lookup_exact_snap(fb.dn, fb.dnlast); + if (!dn) { + dn = dir->add_null_dentry(fb.dn, fb.dnfirst, fb.dnlast); + dn->set_version(fb.dnv); + if (fb.is_dirty()) dn->_mark_dirty(logseg); + dout(10) << "EMetaBlob.replay added (full) " << *dn << dendl; + } else { + dn->set_version(fb.dnv); + if (fb.is_dirty()) dn->_mark_dirty(logseg); + dout(10) << "EMetaBlob.replay for [" << fb.dnfirst << "," << fb.dnlast << "] had " << *dn << dendl; + dn->first = fb.dnfirst; + ceph_assert(dn->last == fb.dnlast); + } + if (lump.is_importing()) + dn->state_set(CDentry::STATE_AUTH); + + CInode *in = mds->mdcache->get_inode(fb.inode.ino, fb.dnlast); + if (!in) { + in = new CInode(mds->mdcache, dn->is_auth(), fb.dnfirst, fb.dnlast); + fb.update_inode(mds, in); + mds->mdcache->add_inode(in); + if (!dn->get_linkage()->is_null()) { + if (dn->get_linkage()->is_primary()) { + unlinked[dn->get_linkage()->get_inode()] = dir; + stringstream ss; + ss << "EMetaBlob.replay FIXME had dentry linked to wrong inode " << *dn + << " " << *dn->get_linkage()->get_inode() << " should be " << fb.inode.ino; + dout(0) << ss.str() << dendl; + mds->clog->warn(ss); + } + dir->unlink_inode(dn, false); + } + if (unlinked.count(in)) + linked.insert(in); + dir->link_primary_inode(dn, in); + dout(10) << "EMetaBlob.replay added " << *in << dendl; + } else { + in->first = fb.dnfirst; + fb.update_inode(mds, in); + if (dn->get_linkage()->get_inode() != in && in->get_parent_dn()) { + dout(10) << "EMetaBlob.replay unlinking " << *in << dendl; + unlinked[in] = in->get_parent_dir(); + in->get_parent_dir()->unlink_inode(in->get_parent_dn()); + } + 
if (dn->get_linkage()->get_inode() != in) { + if (!dn->get_linkage()->is_null()) { // note: might be remote. as with stray reintegration. + if (dn->get_linkage()->is_primary()) { + unlinked[dn->get_linkage()->get_inode()] = dir; + stringstream ss; + ss << "EMetaBlob.replay FIXME had dentry linked to wrong inode " << *dn + << " " << *dn->get_linkage()->get_inode() << " should be " << fb.inode.ino; + dout(0) << ss.str() << dendl; + mds->clog->warn(ss); + } + dir->unlink_inode(dn, false); + } + if (unlinked.count(in)) + linked.insert(in); + dir->link_primary_inode(dn, in); + dout(10) << "EMetaBlob.replay linked " << *in << dendl; + } else { + dout(10) << "EMetaBlob.replay for [" << fb.dnfirst << "," << fb.dnlast << "] had " << *in << dendl; + } + ceph_assert(in->first == fb.dnfirst || + (in->is_multiversion() && in->first > fb.dnfirst)); + } + if (fb.is_dirty()) + in->_mark_dirty(logseg); + if (fb.is_dirty_parent()) + in->mark_dirty_parent(logseg, fb.is_dirty_pool()); + if (fb.need_snapflush()) + logseg->open_files.push_back(&in->item_open_file); + if (dn->is_auth()) + in->state_set(CInode::STATE_AUTH); + else + in->state_clear(CInode::STATE_AUTH); + ceph_assert(g_conf()->mds_kill_journal_replay_at != 2); + + if (!(++count % 1000)) + mds->heartbeat_reset(); + } + + // remote dentries + for (const auto& rb : lump.get_dremote()) { + CDentry *dn = dir->lookup_exact_snap(rb.dn, rb.dnlast); + if (!dn) { + dn = dir->add_remote_dentry(rb.dn, rb.ino, rb.d_type, rb.dnfirst, rb.dnlast); + dn->set_version(rb.dnv); + if (rb.dirty) dn->_mark_dirty(logseg); + dout(10) << "EMetaBlob.replay added " << *dn << dendl; + } else { + if (!dn->get_linkage()->is_null()) { + dout(10) << "EMetaBlob.replay unlinking " << *dn << dendl; + if (dn->get_linkage()->is_primary()) { + unlinked[dn->get_linkage()->get_inode()] = dir; + stringstream ss; + ss << "EMetaBlob.replay FIXME had dentry linked to wrong inode " << *dn + << " " << *dn->get_linkage()->get_inode() << " should be remote " << rb.ino; + 
dout(0) << ss.str() << dendl; + } + dir->unlink_inode(dn, false); + } + dir->link_remote_inode(dn, rb.ino, rb.d_type); + dn->set_version(rb.dnv); + if (rb.dirty) dn->_mark_dirty(logseg); + dout(10) << "EMetaBlob.replay for [" << rb.dnfirst << "," << rb.dnlast << "] had " << *dn << dendl; + dn->first = rb.dnfirst; + ceph_assert(dn->last == rb.dnlast); + } + if (lump.is_importing()) + dn->state_set(CDentry::STATE_AUTH); + + if (!(++count % 1000)) + mds->heartbeat_reset(); + } + + // null dentries + for (const auto& nb : lump.get_dnull()) { + CDentry *dn = dir->lookup_exact_snap(nb.dn, nb.dnlast); + if (!dn) { + dn = dir->add_null_dentry(nb.dn, nb.dnfirst, nb.dnlast); + dn->set_version(nb.dnv); + if (nb.dirty) dn->_mark_dirty(logseg); + dout(10) << "EMetaBlob.replay added (nullbit) " << *dn << dendl; + } else { + dn->first = nb.dnfirst; + if (!dn->get_linkage()->is_null()) { + dout(10) << "EMetaBlob.replay unlinking " << *dn << dendl; + CInode *in = dn->get_linkage()->get_inode(); + // For renamed inode, We may call CInode::force_dirfrag() later. + // CInode::force_dirfrag() doesn't work well when inode is detached + // from the hierarchy. 
+ if (!renamed_diri || renamed_diri != in) { + if (dn->get_linkage()->is_primary()) + unlinked[in] = dir; + dir->unlink_inode(dn); + } + } + dn->set_version(nb.dnv); + if (nb.dirty) dn->_mark_dirty(logseg); + dout(10) << "EMetaBlob.replay had " << *dn << dendl; + ceph_assert(dn->last == nb.dnlast); + } + olddir = dir; + if (lump.is_importing()) + dn->state_set(CDentry::STATE_AUTH); + + // Make null dentries the first things we trim + dout(10) << "EMetaBlob.replay pushing to bottom of lru " << *dn << dendl; + + if (!(++count % 1000)) + mds->heartbeat_reset(); + } + } + + ceph_assert(g_conf()->mds_kill_journal_replay_at != 3); + + if (renamed_dirino) { + if (renamed_diri) { + ceph_assert(unlinked.count(renamed_diri)); + ceph_assert(linked.count(renamed_diri)); + olddir = unlinked[renamed_diri]; + } else { + // we imported a diri we haven't seen before + renamed_diri = mds->mdcache->get_inode(renamed_dirino); + ceph_assert(renamed_diri); // it was in the metablob + } + + if (olddir) { + if (olddir->authority() != CDIR_AUTH_UNDEF && + renamed_diri->authority() == CDIR_AUTH_UNDEF) { + ceph_assert(slaveup); // auth to non-auth, must be slave prepare + frag_vec_t leaves; + renamed_diri->dirfragtree.get_leaves(leaves); + for (const auto& leaf : leaves) { + CDir *dir = renamed_diri->get_dirfrag(leaf); + ceph_assert(dir); + if (dir->get_dir_auth() == CDIR_AUTH_UNDEF) + // preserve subtree bound until slave commit + slaveup->olddirs.insert(dir->inode); + else + dir->state_set(CDir::STATE_AUTH); + + if (!(++count % 1000)) + mds->heartbeat_reset(); + } + } + + mds->mdcache->adjust_subtree_after_rename(renamed_diri, olddir, false); + + // see if we can discard the subtree we renamed out of + CDir *root = mds->mdcache->get_subtree_root(olddir); + if (root->get_dir_auth() == CDIR_AUTH_UNDEF) { + if (slaveup) // preserve the old dir until slave commit + slaveup->olddirs.insert(olddir->inode); + else + mds->mdcache->try_trim_non_auth_subtree(root); + } + } + + // if we are the srci 
importer, we'll also have some dirfrags we have to open up... + if (renamed_diri->authority() != CDIR_AUTH_UNDEF) { + for (const auto& p : renamed_dir_frags) { + CDir *dir = renamed_diri->get_dirfrag(p); + if (dir) { + // we already had the inode before, and we already adjusted this subtree accordingly. + dout(10) << " already had+adjusted rename import bound " << *dir << dendl; + ceph_assert(olddir); + continue; + } + dir = renamed_diri->get_or_open_dirfrag(mds->mdcache, p); + dout(10) << " creating new rename import bound " << *dir << dendl; + dir->state_clear(CDir::STATE_AUTH); + mds->mdcache->adjust_subtree_auth(dir, CDIR_AUTH_UNDEF); + + if (!(++count % 1000)) + mds->heartbeat_reset(); + } + } + + // rename may overwrite an empty directory and move it into stray dir. + unlinked.erase(renamed_diri); + for (map<CInode*, CDir*>::iterator p = unlinked.begin(); p != unlinked.end(); ++p) { + if (!linked.count(p->first)) + continue; + ceph_assert(p->first->is_dir()); + mds->mdcache->adjust_subtree_after_rename(p->first, p->second, false); + + if (!(++count % 1000)) + mds->heartbeat_reset(); + } + } + + if (!unlinked.empty()) { + for (set<CInode*>::iterator p = linked.begin(); p != linked.end(); ++p) + unlinked.erase(*p); + dout(10) << " unlinked set contains " << unlinked << dendl; + for (map<CInode*, CDir*>::iterator p = unlinked.begin(); p != unlinked.end(); ++p) { + CInode *in = p->first; + if (slaveup) { // preserve unlinked inodes until slave commit + slaveup->unlinked.insert(in); + if (in->snaprealm) + in->snaprealm->adjust_parent(); + } else + mds->mdcache->remove_inode_recursive(in); + + if (!(++count % 1000)) + mds->heartbeat_reset(); + } + } + + // table client transactions + for (const auto& p : table_tids) { + dout(10) << "EMetaBlob.replay noting " << get_mdstable_name(p.first) + << " transaction " << p.second << dendl; + MDSTableClient *client = mds->get_table_client(p.first); + if (client) + client->got_journaled_agree(p.second, logseg); + + if 
(!(++count % 1000)) + mds->heartbeat_reset(); + } + + // opened ino? + if (opened_ino) { + CInode *in = mds->mdcache->get_inode(opened_ino); + ceph_assert(in); + dout(10) << "EMetaBlob.replay noting opened inode " << *in << dendl; + logseg->open_files.push_back(&in->item_open_file); + } + + // allocated_inos + if (inotablev) { + if (mds->inotable->get_version() >= inotablev) { + dout(10) << "EMetaBlob.replay inotable tablev " << inotablev + << " <= table " << mds->inotable->get_version() << dendl; + } else { + dout(10) << "EMetaBlob.replay inotable v " << inotablev + << " - 1 == table " << mds->inotable->get_version() + << " allocated+used " << allocated_ino + << " prealloc " << preallocated_inos + << dendl; + if (allocated_ino) + mds->inotable->replay_alloc_id(allocated_ino); + if (preallocated_inos.size()) + mds->inotable->replay_alloc_ids(preallocated_inos); + + // [repair bad inotable updates] + if (inotablev > mds->inotable->get_version()) { + mds->clog->error() << "journal replay inotablev mismatch " + << mds->inotable->get_version() << " -> " << inotablev; + mds->inotable->force_replay_version(inotablev); + } + + ceph_assert(inotablev == mds->inotable->get_version()); + } + } + if (sessionmapv) { + unsigned diff = (used_preallocated_ino && !preallocated_inos.empty()) ? 
2 : 1; + if (mds->sessionmap.get_version() >= sessionmapv) { + dout(10) << "EMetaBlob.replay sessionmap v " << sessionmapv + << " <= table " << mds->sessionmap.get_version() << dendl; + } else if (mds->sessionmap.get_version() + diff == sessionmapv) { + dout(10) << "EMetaBlob.replay sessionmap v " << sessionmapv + << " - " << diff << " == table " << mds->sessionmap.get_version() + << " prealloc " << preallocated_inos + << " used " << used_preallocated_ino + << dendl; + Session *session = mds->sessionmap.get_session(client_name); + if (session) { + dout(20) << " (session prealloc " << session->info.prealloc_inos << ")" << dendl; + if (used_preallocated_ino) { + if (!session->info.prealloc_inos.empty()) { + inodeno_t next = session->next_ino(); + inodeno_t i = session->take_ino(used_preallocated_ino); + if (next != i) + mds->clog->warn() << " replayed op " << client_reqs << " used ino " << i + << " but session next is " << next; + ceph_assert(i == used_preallocated_ino); + session->info.used_inos.clear(); + } + mds->sessionmap.replay_dirty_session(session); + } + if (!preallocated_inos.empty()) { + session->info.prealloc_inos.insert(preallocated_inos); + mds->sessionmap.replay_dirty_session(session); + } + + } else { + dout(10) << "EMetaBlob.replay no session for " << client_name << dendl; + if (used_preallocated_ino) + mds->sessionmap.replay_advance_version(); + + if (!preallocated_inos.empty()) + mds->sessionmap.replay_advance_version(); + } + ceph_assert(sessionmapv == mds->sessionmap.get_version()); + } else { + mds->clog->error() << "EMetaBlob.replay sessionmap v " << sessionmapv + << " - " << diff << " > table " << mds->sessionmap.get_version(); + ceph_assert(g_conf()->mds_wipe_sessions); + mds->sessionmap.wipe(); + mds->sessionmap.set_version(sessionmapv); + } + } + + // truncating inodes + for (const auto& ino : truncate_start) { + CInode *in = mds->mdcache->get_inode(ino); + ceph_assert(in); + mds->mdcache->add_recovered_truncate(in, logseg); + + if 
(!(++count % 1000)) + mds->heartbeat_reset(); + } + for (const auto& p : truncate_finish) { + LogSegment *ls = mds->mdlog->get_segment(p.second); + if (ls) { + CInode *in = mds->mdcache->get_inode(p.first); + ceph_assert(in); + mds->mdcache->remove_recovered_truncate(in, ls); + } + + if (!(++count % 1000)) + mds->heartbeat_reset(); + } + + // destroyed inodes + if (!destroyed_inodes.empty()) { + for (vector<inodeno_t>::iterator p = destroyed_inodes.begin(); + p != destroyed_inodes.end(); + ++p) { + CInode *in = mds->mdcache->get_inode(*p); + if (in) { + dout(10) << "EMetaBlob.replay destroyed " << *p << ", dropping " << *in << dendl; + CDentry *parent = in->get_parent_dn(); + mds->mdcache->remove_inode(in); + if (parent) { + dout(10) << "EMetaBlob.replay unlinked from dentry " << *parent << dendl; + ceph_assert(parent->get_linkage()->is_null()); + } + } else { + dout(10) << "EMetaBlob.replay destroyed " << *p << ", not in cache" << dendl; + } + + if (!(++count % 1000)) + mds->heartbeat_reset(); + } + mds->mdcache->open_file_table.note_destroyed_inos(logseg->seq, destroyed_inodes); + } + + // client requests + for (const auto& p : client_reqs) { + if (p.first.name.is_client()) { + dout(10) << "EMetaBlob.replay request " << p.first << " trim_to " << p.second << dendl; + inodeno_t created = allocated_ino ? allocated_ino : used_preallocated_ino; + // if we allocated an inode, there should be exactly one client request id. 
+ ceph_assert(created == inodeno_t() || client_reqs.size() == 1); + + Session *session = mds->sessionmap.get_session(p.first.name); + if (session) { + session->add_completed_request(p.first.tid, created); + if (p.second) + session->trim_completed_requests(p.second); + } + } + + if (!(++count % 1000)) + mds->heartbeat_reset(); + } + + // client flushes + for (const auto& p : client_flushes) { + if (p.first.name.is_client()) { + dout(10) << "EMetaBlob.replay flush " << p.first << " trim_to " << p.second << dendl; + Session *session = mds->sessionmap.get_session(p.first.name); + if (session) { + session->add_completed_flush(p.first.tid); + if (p.second) + session->trim_completed_flushes(p.second); + } + } + + if (!(++count % 1000)) + mds->heartbeat_reset(); + } + + // update segment + update_segment(logseg); + + ceph_assert(g_conf()->mds_kill_journal_replay_at != 4); +} + +// ----------------------- +// ESession + +void ESession::update_segment() +{ + get_segment()->sessionmapv = cmapv; + if (inos.size() && inotablev) + get_segment()->inotablev = inotablev; +} + +void ESession::replay(MDSRank *mds) +{ + if (mds->sessionmap.get_version() >= cmapv) { + dout(10) << "ESession.replay sessionmap " << mds->sessionmap.get_version() + << " >= " << cmapv << ", noop" << dendl; + } else if (mds->sessionmap.get_version() + 1 == cmapv) { + dout(10) << "ESession.replay sessionmap " << mds->sessionmap.get_version() + << " < " << cmapv << " " << (open ? 
"open":"close") << " " << client_inst << dendl; + Session *session; + if (open) { + session = mds->sessionmap.get_or_add_session(client_inst); + mds->sessionmap.set_state(session, Session::STATE_OPEN); + session->set_client_metadata(client_metadata); + dout(10) << " opened session " << session->info.inst << dendl; + } else { + session = mds->sessionmap.get_session(client_inst.name); + if (session) { // there always should be a session, but there's a bug + if (session->get_connection() == NULL) { + dout(10) << " removed session " << session->info.inst << dendl; + mds->sessionmap.remove_session(session); + session = NULL; + } else { + session->clear(); // the client has reconnected; keep the Session, but reset + dout(10) << " reset session " << session->info.inst << " (they reconnected)" << dendl; + } + } else { + mds->clog->error() << "replayed stray Session close event for " << client_inst + << " from time " << stamp << ", ignoring"; + } + } + if (session) { + mds->sessionmap.replay_dirty_session(session); + } else { + mds->sessionmap.replay_advance_version(); + } + ceph_assert(mds->sessionmap.get_version() == cmapv); + } else { + mds->clog->error() << "ESession.replay sessionmap v " << cmapv + << " - 1 > table " << mds->sessionmap.get_version(); + ceph_assert(g_conf()->mds_wipe_sessions); + mds->sessionmap.wipe(); + mds->sessionmap.set_version(cmapv); + } + + if (inos.size() && inotablev) { + if (mds->inotable->get_version() >= inotablev) { + dout(10) << "ESession.replay inotable " << mds->inotable->get_version() + << " >= " << inotablev << ", noop" << dendl; + } else { + dout(10) << "ESession.replay inotable " << mds->inotable->get_version() + << " < " << inotablev << " " << (open ? 
"add":"remove") << dendl; + ceph_assert(!open); // for now + mds->inotable->replay_release_ids(inos); + ceph_assert(mds->inotable->get_version() == inotablev); + } + } + + update_segment(); +} + +void ESession::encode(bufferlist &bl, uint64_t features) const +{ + ENCODE_START(5, 5, bl); + encode(stamp, bl); + encode(client_inst, bl, features); + encode(open, bl); + encode(cmapv, bl); + encode(inos, bl); + encode(inotablev, bl); + encode(client_metadata, bl); + ENCODE_FINISH(bl); +} + +void ESession::decode(bufferlist::const_iterator &bl) +{ + DECODE_START_LEGACY_COMPAT_LEN(5, 3, 3, bl); + if (struct_v >= 2) + decode(stamp, bl); + decode(client_inst, bl); + decode(open, bl); + decode(cmapv, bl); + decode(inos, bl); + decode(inotablev, bl); + if (struct_v == 4) { + decode(client_metadata.kv_map, bl); + } else if (struct_v >= 5) { + decode(client_metadata, bl); + } + DECODE_FINISH(bl); +} + +void ESession::dump(Formatter *f) const +{ + f->dump_stream("client instance") << client_inst; + f->dump_string("open", open ? 
"true" : "false"); + f->dump_int("client map version", cmapv); + f->dump_stream("inos") << inos; + f->dump_int("inotable version", inotablev); + f->open_object_section("client_metadata"); + client_metadata.dump(f); + f->close_section(); // client_metadata +} + +void ESession::generate_test_instances(list<ESession*>& ls) +{ + ls.push_back(new ESession); +} + +// ----------------------- +// ESessions + +void ESessions::encode(bufferlist &bl, uint64_t features) const +{ + ENCODE_START(2, 1, bl); + encode(client_map, bl, features); + encode(cmapv, bl); + encode(stamp, bl); + encode(client_metadata_map, bl); + ENCODE_FINISH(bl); +} + +void ESessions::decode_old(bufferlist::const_iterator &bl) +{ + using ceph::decode; + decode(client_map, bl); + decode(cmapv, bl); + if (!bl.end()) + decode(stamp, bl); +} + +void ESessions::decode_new(bufferlist::const_iterator &bl) +{ + DECODE_START(2, bl); + decode(client_map, bl); + decode(cmapv, bl); + decode(stamp, bl); + if (struct_v >= 2) + decode(client_metadata_map, bl); + DECODE_FINISH(bl); +} + +void ESessions::dump(Formatter *f) const +{ + f->dump_int("client map version", cmapv); + + f->open_array_section("client map"); + for (map<client_t,entity_inst_t>::const_iterator i = client_map.begin(); + i != client_map.end(); ++i) { + f->open_object_section("client"); + f->dump_int("client id", i->first.v); + f->dump_stream("client entity") << i->second; + f->close_section(); // client + } + f->close_section(); // client map +} + +void ESessions::generate_test_instances(list<ESessions*>& ls) +{ + ls.push_back(new ESessions()); +} + +void ESessions::update_segment() +{ + get_segment()->sessionmapv = cmapv; +} + +void ESessions::replay(MDSRank *mds) +{ + if (mds->sessionmap.get_version() >= cmapv) { + dout(10) << "ESessions.replay sessionmap " << mds->sessionmap.get_version() + << " >= " << cmapv << ", noop" << dendl; + } else { + dout(10) << "ESessions.replay sessionmap " << mds->sessionmap.get_version() + << " < " << cmapv << dendl; 
+ mds->sessionmap.replay_open_sessions(cmapv, client_map, client_metadata_map); + } + update_segment(); +} + + +// ----------------------- +// ETableServer + +void ETableServer::encode(bufferlist& bl, uint64_t features) const +{ + ENCODE_START(3, 3, bl); + encode(stamp, bl); + encode(table, bl); + encode(op, bl); + encode(reqid, bl); + encode(bymds, bl); + encode(mutation, bl); + encode(tid, bl); + encode(version, bl); + ENCODE_FINISH(bl); +} + +void ETableServer::decode(bufferlist::const_iterator &bl) +{ + DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl); + if (struct_v >= 2) + decode(stamp, bl); + decode(table, bl); + decode(op, bl); + decode(reqid, bl); + decode(bymds, bl); + decode(mutation, bl); + decode(tid, bl); + decode(version, bl); + DECODE_FINISH(bl); +} + +void ETableServer::dump(Formatter *f) const +{ + f->dump_int("table id", table); + f->dump_int("op", op); + f->dump_int("request id", reqid); + f->dump_int("by mds", bymds); + f->dump_int("tid", tid); + f->dump_int("version", version); +} + +void ETableServer::generate_test_instances(list<ETableServer*>& ls) +{ + ls.push_back(new ETableServer()); +} + + +void ETableServer::update_segment() +{ + get_segment()->tablev[table] = version; +} + +void ETableServer::replay(MDSRank *mds) +{ + MDSTableServer *server = mds->get_table_server(table); + if (!server) + return; + + if (server->get_version() >= version) { + dout(10) << "ETableServer.replay " << get_mdstable_name(table) + << " " << get_mdstableserver_opname(op) + << " event " << version + << " <= table " << server->get_version() << dendl; + return; + } + + dout(10) << " ETableServer.replay " << get_mdstable_name(table) + << " " << get_mdstableserver_opname(op) + << " event " << version << " - 1 == table " << server->get_version() << dendl; + ceph_assert(version-1 == server->get_version()); + + switch (op) { + case TABLESERVER_OP_PREPARE: { + server->_note_prepare(bymds, reqid, true); + bufferlist out; + server->_prepare(mutation, reqid, bymds, out); + 
mutation = std::move(out); + break; + } + case TABLESERVER_OP_COMMIT: + server->_commit(tid, MMDSTableRequest::ref()); + server->_note_commit(tid, true); + break; + case TABLESERVER_OP_ROLLBACK: + server->_rollback(tid); + server->_note_rollback(tid, true); + break; + case TABLESERVER_OP_SERVER_UPDATE: + server->_server_update(mutation); + server->_note_server_update(mutation, true); + break; + default: + mds->clog->error() << "invalid tableserver op in ETableServer"; + mds->damaged(); + ceph_abort(); // Should be unreachable because damaged() calls respawn() + } + + ceph_assert(version == server->get_version()); + update_segment(); +} + + +// --------------------- +// ETableClient + +void ETableClient::encode(bufferlist& bl, uint64_t features) const +{ + ENCODE_START(3, 3, bl); + encode(stamp, bl); + encode(table, bl); + encode(op, bl); + encode(tid, bl); + ENCODE_FINISH(bl); +} + +void ETableClient::decode(bufferlist::const_iterator &bl) +{ + DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl); + if (struct_v >= 2) + decode(stamp, bl); + decode(table, bl); + decode(op, bl); + decode(tid, bl); + DECODE_FINISH(bl); +} + +void ETableClient::dump(Formatter *f) const +{ + f->dump_int("table", table); + f->dump_int("op", op); + f->dump_int("tid", tid); +} + +void ETableClient::generate_test_instances(list<ETableClient*>& ls) +{ + ls.push_back(new ETableClient()); +} + +void ETableClient::replay(MDSRank *mds) +{ + dout(10) << " ETableClient.replay " << get_mdstable_name(table) + << " op " << get_mdstableserver_opname(op) + << " tid " << tid << dendl; + + MDSTableClient *client = mds->get_table_client(table); + if (!client) + return; + + ceph_assert(op == TABLESERVER_OP_ACK); + client->got_journaled_ack(tid); +} + + +// ----------------------- +// ESnap +/* +void ESnap::update_segment() +{ + get_segment()->tablev[TABLE_SNAP] = version; +} + +void ESnap::replay(MDSRank *mds) +{ + if (mds->snaptable->get_version() >= version) { + dout(10) << "ESnap.replay event " << version + << " 
<= table " << mds->snaptable->get_version() << dendl; + return; + } + + dout(10) << " ESnap.replay event " << version + << " - 1 == table " << mds->snaptable->get_version() << dendl; + ceph_assert(version-1 == mds->snaptable->get_version()); + + if (create) { + version_t v; + snapid_t s = mds->snaptable->create(snap.dirino, snap.name, snap.stamp, &v); + ceph_assert(s == snap.snapid); + } else { + mds->snaptable->remove(snap.snapid); + } + + ceph_assert(version == mds->snaptable->get_version()); +} +*/ + + + +// ----------------------- +// EUpdate + +void EUpdate::encode(bufferlist &bl, uint64_t features) const +{ + ENCODE_START(4, 4, bl); + encode(stamp, bl); + encode(type, bl); + encode(metablob, bl, features); + encode(client_map, bl); + encode(cmapv, bl); + encode(reqid, bl); + encode(had_slaves, bl); + ENCODE_FINISH(bl); +} + +void EUpdate::decode(bufferlist::const_iterator &bl) +{ + DECODE_START_LEGACY_COMPAT_LEN(4, 4, 4, bl); + if (struct_v >= 2) + decode(stamp, bl); + decode(type, bl); + decode(metablob, bl); + decode(client_map, bl); + if (struct_v >= 3) + decode(cmapv, bl); + decode(reqid, bl); + decode(had_slaves, bl); + DECODE_FINISH(bl); +} + +void EUpdate::dump(Formatter *f) const +{ + f->open_object_section("metablob"); + metablob.dump(f); + f->close_section(); // metablob + + f->dump_string("type", type); + f->dump_int("client map length", client_map.length()); + f->dump_int("client map version", cmapv); + f->dump_stream("reqid") << reqid; + f->dump_string("had slaves", had_slaves ? 
"true" : "false"); +} + +void EUpdate::generate_test_instances(list<EUpdate*>& ls) +{ + ls.push_back(new EUpdate()); +} + + +void EUpdate::update_segment() +{ + auto&& segment = get_segment(); + metablob.update_segment(segment); + + if (client_map.length()) + segment->sessionmapv = cmapv; + + if (had_slaves) + segment->uncommitted_masters.insert(reqid); +} + +void EUpdate::replay(MDSRank *mds) +{ + auto&& segment = get_segment(); + metablob.replay(mds, segment); + + if (had_slaves) { + dout(10) << "EUpdate.replay " << reqid << " had slaves, expecting a matching ECommitted" << dendl; + segment->uncommitted_masters.insert(reqid); + set<mds_rank_t> slaves; + mds->mdcache->add_uncommitted_master(reqid, segment, slaves, true); + } + + if (client_map.length()) { + if (mds->sessionmap.get_version() >= cmapv) { + dout(10) << "EUpdate.replay sessionmap v " << cmapv + << " <= table " << mds->sessionmap.get_version() << dendl; + } else { + dout(10) << "EUpdate.replay sessionmap " << mds->sessionmap.get_version() + << " < " << cmapv << dendl; + // open client sessions? 
+ map<client_t,entity_inst_t> cm; + map<client_t,client_metadata_t> cmm; + auto blp = client_map.cbegin(); + using ceph::decode; + decode(cm, blp); + if (!blp.end()) + decode(cmm, blp); + mds->sessionmap.replay_open_sessions(cmapv, cm, cmm); + } + } + update_segment(); +} + + +// ------------------------ +// EOpen + +void EOpen::encode(bufferlist &bl, uint64_t features) const { + ENCODE_START(4, 3, bl); + encode(stamp, bl); + encode(metablob, bl, features); + encode(inos, bl); + encode(snap_inos, bl); + ENCODE_FINISH(bl); +} + +void EOpen::decode(bufferlist::const_iterator &bl) { + DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl); + if (struct_v >= 2) + decode(stamp, bl); + decode(metablob, bl); + decode(inos, bl); + if (struct_v >= 4) + decode(snap_inos, bl); + DECODE_FINISH(bl); +} + +void EOpen::dump(Formatter *f) const +{ + f->open_object_section("metablob"); + metablob.dump(f); + f->close_section(); // metablob + f->open_array_section("inos involved"); + for (vector<inodeno_t>::const_iterator i = inos.begin(); + i != inos.end(); ++i) { + f->dump_int("ino", *i); + } + f->close_section(); // inos +} + +void EOpen::generate_test_instances(list<EOpen*>& ls) +{ + ls.push_back(new EOpen()); + ls.push_back(new EOpen()); + ls.back()->add_ino(0); +} + +void EOpen::update_segment() +{ + // ?? 
+} + +void EOpen::replay(MDSRank *mds) +{ + dout(10) << "EOpen.replay " << dendl; + auto&& segment = get_segment(); + metablob.replay(mds, segment); + + // note which segments inodes belong to, so we don't have to start rejournaling them + for (const auto &ino : inos) { + CInode *in = mds->mdcache->get_inode(ino); + if (!in) { + dout(0) << "EOpen.replay ino " << ino << " not in metablob" << dendl; + ceph_assert(in); + } + segment->open_files.push_back(&in->item_open_file); + } + for (const auto &vino : snap_inos) { + CInode *in = mds->mdcache->get_inode(vino); + if (!in) { + dout(0) << "EOpen.replay ino " << vino << " not in metablob" << dendl; + ceph_assert(in); + } + segment->open_files.push_back(&in->item_open_file); + } +} + + +// ----------------------- +// ECommitted + +void ECommitted::replay(MDSRank *mds) +{ + if (mds->mdcache->uncommitted_masters.count(reqid)) { + dout(10) << "ECommitted.replay " << reqid << dendl; + mds->mdcache->uncommitted_masters[reqid].ls->uncommitted_masters.erase(reqid); + mds->mdcache->uncommitted_masters.erase(reqid); + } else { + dout(10) << "ECommitted.replay " << reqid << " -- didn't see original op" << dendl; + } +} + +void ECommitted::encode(bufferlist& bl, uint64_t features) const +{ + ENCODE_START(3, 3, bl); + encode(stamp, bl); + encode(reqid, bl); + ENCODE_FINISH(bl); +} + +void ECommitted::decode(bufferlist::const_iterator& bl) +{ + DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl); + if (struct_v >= 2) + decode(stamp, bl); + decode(reqid, bl); + DECODE_FINISH(bl); +} + +void ECommitted::dump(Formatter *f) const { + f->dump_stream("stamp") << stamp; + f->dump_stream("reqid") << reqid; +} + +void ECommitted::generate_test_instances(list<ECommitted*>& ls) +{ + ls.push_back(new ECommitted); + ls.push_back(new ECommitted); + ls.back()->stamp = utime_t(1, 2); + ls.back()->reqid = metareqid_t(entity_name_t::CLIENT(123), 456); +} + +// ----------------------- +// ESlaveUpdate + +void link_rollback::encode(bufferlist &bl) const +{ + 
ENCODE_START(3, 2, bl); + encode(reqid, bl); + encode(ino, bl); + encode(was_inc, bl); + encode(old_ctime, bl); + encode(old_dir_mtime, bl); + encode(old_dir_rctime, bl); + encode(snapbl, bl); + ENCODE_FINISH(bl); +} + +void link_rollback::decode(bufferlist::const_iterator &bl) +{ + DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl); + decode(reqid, bl); + decode(ino, bl); + decode(was_inc, bl); + decode(old_ctime, bl); + decode(old_dir_mtime, bl); + decode(old_dir_rctime, bl); + if (struct_v >= 3) + decode(snapbl, bl); + DECODE_FINISH(bl); +} + +void link_rollback::dump(Formatter *f) const +{ + f->dump_stream("metareqid") << reqid; + f->dump_int("ino", ino); + f->dump_string("was incremented", was_inc ? "true" : "false"); + f->dump_stream("old_ctime") << old_ctime; + f->dump_stream("old_dir_mtime") << old_dir_mtime; + f->dump_stream("old_dir_rctime") << old_dir_rctime; +} + +void link_rollback::generate_test_instances(list<link_rollback*>& ls) +{ + ls.push_back(new link_rollback()); +} + +void rmdir_rollback::encode(bufferlist& bl) const +{ + ENCODE_START(3, 2, bl); + encode(reqid, bl); + encode(src_dir, bl); + encode(src_dname, bl); + encode(dest_dir, bl); + encode(dest_dname, bl); + encode(snapbl, bl); + ENCODE_FINISH(bl); +} + +void rmdir_rollback::decode(bufferlist::const_iterator& bl) +{ + DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl); + decode(reqid, bl); + decode(src_dir, bl); + decode(src_dname, bl); + decode(dest_dir, bl); + decode(dest_dname, bl); + if (struct_v >= 3) + decode(snapbl, bl); + DECODE_FINISH(bl); +} + +void rmdir_rollback::dump(Formatter *f) const +{ + f->dump_stream("metareqid") << reqid; + f->dump_stream("source directory") << src_dir; + f->dump_string("source dname", src_dname); + f->dump_stream("destination directory") << dest_dir; + f->dump_string("destination dname", dest_dname); +} + +void rmdir_rollback::generate_test_instances(list<rmdir_rollback*>& ls) +{ + ls.push_back(new rmdir_rollback()); +} + +void 
rename_rollback::drec::encode(bufferlist &bl) const +{ + ENCODE_START(2, 2, bl); + encode(dirfrag, bl); + encode(dirfrag_old_mtime, bl); + encode(dirfrag_old_rctime, bl); + encode(ino, bl); + encode(remote_ino, bl); + encode(dname, bl); + encode(remote_d_type, bl); + encode(old_ctime, bl); + ENCODE_FINISH(bl); +} + +void rename_rollback::drec::decode(bufferlist::const_iterator &bl) +{ + DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl); + decode(dirfrag, bl); + decode(dirfrag_old_mtime, bl); + decode(dirfrag_old_rctime, bl); + decode(ino, bl); + decode(remote_ino, bl); + decode(dname, bl); + decode(remote_d_type, bl); + decode(old_ctime, bl); + DECODE_FINISH(bl); +} + +void rename_rollback::drec::dump(Formatter *f) const +{ + f->dump_stream("directory fragment") << dirfrag; + f->dump_stream("directory old mtime") << dirfrag_old_mtime; + f->dump_stream("directory old rctime") << dirfrag_old_rctime; + f->dump_int("ino", ino); + f->dump_int("remote ino", remote_ino); + f->dump_string("dname", dname); + uint32_t type = DTTOIF(remote_d_type) & S_IFMT; // convert to type entries + string type_string; + switch(type) { + case S_IFREG: + type_string = "file"; break; + case S_IFLNK: + type_string = "symlink"; break; + case S_IFDIR: + type_string = "directory"; break; + default: + type_string = "UNKNOWN-" + stringify((int)type); break; + } + f->dump_string("remote dtype", type_string); + f->dump_stream("old ctime") << old_ctime; +} + +void rename_rollback::drec::generate_test_instances(list<drec*>& ls) +{ + ls.push_back(new drec()); + ls.back()->remote_d_type = IFTODT(S_IFREG); +} + +void rename_rollback::encode(bufferlist &bl) const +{ + ENCODE_START(3, 2, bl); + encode(reqid, bl); + encode(orig_src, bl); + encode(orig_dest, bl); + encode(stray, bl); + encode(ctime, bl); + encode(srci_snapbl, bl); + encode(desti_snapbl, bl); + ENCODE_FINISH(bl); +} + +void rename_rollback::decode(bufferlist::const_iterator &bl) +{ + DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl); + decode(reqid, bl); 
+ decode(orig_src, bl); + decode(orig_dest, bl); + decode(stray, bl); + decode(ctime, bl); + if (struct_v >= 3) { + decode(srci_snapbl, bl); + decode(desti_snapbl, bl); + } + DECODE_FINISH(bl); +} + +void rename_rollback::dump(Formatter *f) const +{ + f->dump_stream("request id") << reqid; + f->open_object_section("original src drec"); + orig_src.dump(f); + f->close_section(); // original src drec + f->open_object_section("original dest drec"); + orig_dest.dump(f); + f->close_section(); // original dest drec + f->open_object_section("stray drec"); + stray.dump(f); + f->close_section(); // stray drec + f->dump_stream("ctime") << ctime; +} + +void rename_rollback::generate_test_instances(list<rename_rollback*>& ls) +{ + ls.push_back(new rename_rollback()); + ls.back()->orig_src.remote_d_type = IFTODT(S_IFREG); + ls.back()->orig_dest.remote_d_type = IFTODT(S_IFREG); + ls.back()->stray.remote_d_type = IFTODT(S_IFREG); +} + +void ESlaveUpdate::encode(bufferlist &bl, uint64_t features) const +{ + ENCODE_START(3, 3, bl); + encode(stamp, bl); + encode(type, bl); + encode(reqid, bl); + encode(master, bl); + encode(op, bl); + encode(origop, bl); + encode(commit, bl, features); + encode(rollback, bl); + ENCODE_FINISH(bl); +} + +void ESlaveUpdate::decode(bufferlist::const_iterator &bl) +{ + DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl); + if (struct_v >= 2) + decode(stamp, bl); + decode(type, bl); + decode(reqid, bl); + decode(master, bl); + decode(op, bl); + decode(origop, bl); + decode(commit, bl); + decode(rollback, bl); + DECODE_FINISH(bl); +} + +void ESlaveUpdate::dump(Formatter *f) const +{ + f->open_object_section("metablob"); + commit.dump(f); + f->close_section(); // metablob + + f->dump_int("rollback length", rollback.length()); + f->dump_string("type", type); + f->dump_stream("metareqid") << reqid; + f->dump_int("master", master); + f->dump_int("op", op); + f->dump_int("original op", origop); +} + +void ESlaveUpdate::generate_test_instances(list<ESlaveUpdate*>& ls) +{ 
+ ls.push_back(new ESlaveUpdate()); +} + +void ESlaveUpdate::replay(MDSRank *mds) +{ + MDSlaveUpdate *su; + auto&& segment = get_segment(); + switch (op) { + case ESlaveUpdate::OP_PREPARE: + dout(10) << "ESlaveUpdate.replay prepare " << reqid << " for mds." << master + << ": applying commit, saving rollback info" << dendl; + su = new MDSlaveUpdate(origop, rollback); + commit.replay(mds, segment, su); + mds->mdcache->add_uncommitted_slave(reqid, segment, master, su); + break; + + case ESlaveUpdate::OP_COMMIT: + dout(10) << "ESlaveUpdate.replay commit " << reqid << " for mds." << master << dendl; + mds->mdcache->finish_uncommitted_slave(reqid, false); + break; + + case ESlaveUpdate::OP_ROLLBACK: + dout(10) << "ESlaveUpdate.replay abort " << reqid << " for mds." << master + << ": applying rollback commit blob" << dendl; + commit.replay(mds, segment); + mds->mdcache->finish_uncommitted_slave(reqid, false); + break; + + default: + mds->clog->error() << "invalid op in ESlaveUpdate"; + mds->damaged(); + ceph_abort(); // Should be unreachable because damaged() calls respawn() + } +} + + +// ----------------------- +// ESubtreeMap + +void ESubtreeMap::encode(bufferlist& bl, uint64_t features) const +{ + ENCODE_START(6, 5, bl); + encode(stamp, bl); + encode(metablob, bl, features); + encode(subtrees, bl); + encode(ambiguous_subtrees, bl); + encode(expire_pos, bl); + encode(event_seq, bl); + ENCODE_FINISH(bl); +} + +void ESubtreeMap::decode(bufferlist::const_iterator &bl) +{ + DECODE_START_LEGACY_COMPAT_LEN(6, 5, 5, bl); + if (struct_v >= 2) + decode(stamp, bl); + decode(metablob, bl); + decode(subtrees, bl); + if (struct_v >= 4) + decode(ambiguous_subtrees, bl); + if (struct_v >= 3) + decode(expire_pos, bl); + if (struct_v >= 6) + decode(event_seq, bl); + DECODE_FINISH(bl); +} + +void ESubtreeMap::dump(Formatter *f) const +{ + f->open_object_section("metablob"); + metablob.dump(f); + f->close_section(); // metablob + + f->open_array_section("subtrees"); + 
for(map<dirfrag_t,vector<dirfrag_t> >::const_iterator i = subtrees.begin(); + i != subtrees.end(); ++i) { + f->open_object_section("tree"); + f->dump_stream("root dirfrag") << i->first; + for (vector<dirfrag_t>::const_iterator j = i->second.begin(); + j != i->second.end(); ++j) { + f->dump_stream("bound dirfrag") << *j; + } + f->close_section(); // tree + } + f->close_section(); // subtrees + + f->open_array_section("ambiguous subtrees"); + for(set<dirfrag_t>::const_iterator i = ambiguous_subtrees.begin(); + i != ambiguous_subtrees.end(); ++i) { + f->dump_stream("dirfrag") << *i; + } + f->close_section(); // ambiguous subtrees + + f->dump_int("expire position", expire_pos); +} + +void ESubtreeMap::generate_test_instances(list<ESubtreeMap*>& ls) +{ + ls.push_back(new ESubtreeMap()); +} + +void ESubtreeMap::replay(MDSRank *mds) +{ + if (expire_pos && expire_pos > mds->mdlog->journaler->get_expire_pos()) + mds->mdlog->journaler->set_expire_pos(expire_pos); + + // suck up the subtree map? + if (mds->mdcache->is_subtrees()) { + dout(10) << "ESubtreeMap.replay -- i already have import map; verifying" << dendl; + int errors = 0; + + for (map<dirfrag_t, vector<dirfrag_t> >::iterator p = subtrees.begin(); + p != subtrees.end(); + ++p) { + CDir *dir = mds->mdcache->get_dirfrag(p->first); + if (!dir) { + mds->clog->error() << " replayed ESubtreeMap at " << get_start_off() + << " subtree root " << p->first << " not in cache"; + ++errors; + continue; + } + + if (!mds->mdcache->is_subtree(dir)) { + mds->clog->error() << " replayed ESubtreeMap at " << get_start_off() + << " subtree root " << p->first << " not a subtree in cache"; + ++errors; + continue; + } + if (dir->get_dir_auth().first != mds->get_nodeid()) { + mds->clog->error() << " replayed ESubtreeMap at " << get_start_off() + << " subtree root " << p->first + << " is not mine in cache (it's " << dir->get_dir_auth() << ")"; + ++errors; + continue; + } + + for (vector<dirfrag_t>::iterator q = p->second.begin(); q != 
p->second.end(); ++q) + mds->mdcache->get_force_dirfrag(*q, true); + + set<CDir*> bounds; + mds->mdcache->get_subtree_bounds(dir, bounds); + for (vector<dirfrag_t>::iterator q = p->second.begin(); q != p->second.end(); ++q) { + CDir *b = mds->mdcache->get_dirfrag(*q); + if (!b) { + mds->clog->error() << " replayed ESubtreeMap at " << get_start_off() + << " subtree " << p->first << " bound " << *q << " not in cache"; + ++errors; + continue; + } + if (bounds.count(b) == 0) { + mds->clog->error() << " replayed ESubtreeMap at " << get_start_off() + << " subtree " << p->first << " bound " << *q << " not a bound in cache"; + ++errors; + continue; + } + bounds.erase(b); + } + for (set<CDir*>::iterator q = bounds.begin(); q != bounds.end(); ++q) { + mds->clog->error() << " replayed ESubtreeMap at " << get_start_off() + << " subtree " << p->first << " has extra bound in cache " << (*q)->dirfrag(); + ++errors; + } + + if (ambiguous_subtrees.count(p->first)) { + if (!mds->mdcache->have_ambiguous_import(p->first)) { + mds->clog->error() << " replayed ESubtreeMap at " << get_start_off() + << " subtree " << p->first << " is ambiguous but is not in our cache"; + ++errors; + } + } else { + if (mds->mdcache->have_ambiguous_import(p->first)) { + mds->clog->error() << " replayed ESubtreeMap at " << get_start_off() + << " subtree " << p->first << " is not ambiguous but is in our cache"; + ++errors; + } + } + } + + std::vector<CDir*> dirs; + mds->mdcache->get_subtrees(dirs); + for (const auto& dir : dirs) { + if (dir->get_dir_auth().first != mds->get_nodeid()) + continue; + if (subtrees.count(dir->dirfrag()) == 0) { + mds->clog->error() << " replayed ESubtreeMap at " << get_start_off() + << " does not include cache subtree " << dir->dirfrag(); + ++errors; + } + } + + if (errors) { + dout(0) << "journal subtrees: " << subtrees << dendl; + dout(0) << "journal ambig_subtrees: " << ambiguous_subtrees << dendl; + mds->mdcache->show_subtrees(); + ceph_assert(!g_conf()->mds_debug_subtrees || 
errors == 0); + } + return; + } + + dout(10) << "ESubtreeMap.replay -- reconstructing (auth) subtree spanning tree" << dendl; + + // first, stick the spanning tree in my cache + //metablob.print(*_dout); + metablob.replay(mds, get_segment()); + + // restore import/export maps + for (map<dirfrag_t, vector<dirfrag_t> >::iterator p = subtrees.begin(); + p != subtrees.end(); + ++p) { + CDir *dir = mds->mdcache->get_dirfrag(p->first); + ceph_assert(dir); + if (ambiguous_subtrees.count(p->first)) { + // ambiguous! + mds->mdcache->add_ambiguous_import(p->first, p->second); + mds->mdcache->adjust_bounded_subtree_auth(dir, p->second, + mds_authority_t(mds->get_nodeid(), mds->get_nodeid())); + } else { + // not ambiguous + mds->mdcache->adjust_bounded_subtree_auth(dir, p->second, mds->get_nodeid()); + } + } + + mds->mdcache->recalc_auth_bits(true); + + mds->mdcache->show_subtrees(); +} + + + +// ----------------------- +// EFragment + +void EFragment::replay(MDSRank *mds) +{ + dout(10) << "EFragment.replay " << op_name(op) << " " << ino << " " << basefrag << " by " << bits << dendl; + + list<CDir*> resultfrags; + MDSContext::vec waiters; + + // in may be NULL if it wasn't in our cache yet. if it's a prepare + // it will be once we replay the metablob , but first we need to + // refragment anything we already have in the cache. 
+ CInode *in = mds->mdcache->get_inode(ino); + + auto&& segment = get_segment(); + switch (op) { + case OP_PREPARE: + mds->mdcache->add_uncommitted_fragment(dirfrag_t(ino, basefrag), bits, orig_frags, segment, &rollback); + + if (in) + mds->mdcache->adjust_dir_fragments(in, basefrag, bits, resultfrags, waiters, true); + break; + + case OP_ROLLBACK: { + frag_vec_t old_frags; + if (in) { + in->dirfragtree.get_leaves_under(basefrag, old_frags); + if (orig_frags.empty()) { + // old format EFragment + mds->mdcache->adjust_dir_fragments(in, basefrag, -bits, resultfrags, waiters, true); + } else { + for (const auto& fg : orig_frags) + mds->mdcache->force_dir_fragment(in, fg); + } + } + mds->mdcache->rollback_uncommitted_fragment(dirfrag_t(ino, basefrag), std::move(old_frags)); + break; + } + + case OP_COMMIT: + case OP_FINISH: + mds->mdcache->finish_uncommitted_fragment(dirfrag_t(ino, basefrag), op); + break; + + default: + ceph_abort(); + } + + metablob.replay(mds, segment); + if (in && g_conf()->mds_debug_frag) + in->verify_dirfrags(); +} + +void EFragment::encode(bufferlist &bl, uint64_t features) const { + ENCODE_START(5, 4, bl); + encode(stamp, bl); + encode(op, bl); + encode(ino, bl); + encode(basefrag, bl); + encode(bits, bl); + encode(metablob, bl, features); + encode(orig_frags, bl); + encode(rollback, bl); + ENCODE_FINISH(bl); +} + +void EFragment::decode(bufferlist::const_iterator &bl) { + DECODE_START_LEGACY_COMPAT_LEN(5, 4, 4, bl); + if (struct_v >= 2) + decode(stamp, bl); + if (struct_v >= 3) + decode(op, bl); + decode(ino, bl); + decode(basefrag, bl); + decode(bits, bl); + decode(metablob, bl); + if (struct_v >= 5) { + decode(orig_frags, bl); + decode(rollback, bl); + } + DECODE_FINISH(bl); +} + +void EFragment::dump(Formatter *f) const +{ + /*f->open_object_section("Metablob"); + metablob.dump(f); // sadly we don't have this; dunno if we'll get it + f->close_section();*/ + f->dump_string("op", op_name(op)); + f->dump_stream("ino") << ino; + 
f->dump_stream("base frag") << basefrag; + f->dump_int("bits", bits); +} + +void EFragment::generate_test_instances(list<EFragment*>& ls) +{ + ls.push_back(new EFragment); + ls.push_back(new EFragment); + ls.back()->op = OP_PREPARE; + ls.back()->ino = 1; + ls.back()->bits = 5; +} + +void dirfrag_rollback::encode(bufferlist &bl) const +{ + ENCODE_START(1, 1, bl); + encode(fnode, bl); + ENCODE_FINISH(bl); +} + +void dirfrag_rollback::decode(bufferlist::const_iterator &bl) +{ + DECODE_START(1, bl); + decode(fnode, bl); + DECODE_FINISH(bl); +} + + + +// ========================================================================= + +// ----------------------- +// EExport + +void EExport::replay(MDSRank *mds) +{ + dout(10) << "EExport.replay " << base << dendl; + auto&& segment = get_segment(); + metablob.replay(mds, segment); + + CDir *dir = mds->mdcache->get_dirfrag(base); + ceph_assert(dir); + + set<CDir*> realbounds; + for (set<dirfrag_t>::iterator p = bounds.begin(); + p != bounds.end(); + ++p) { + CDir *bd = mds->mdcache->get_dirfrag(*p); + ceph_assert(bd); + realbounds.insert(bd); + } + + // adjust auth away + mds->mdcache->adjust_bounded_subtree_auth(dir, realbounds, CDIR_AUTH_UNDEF); + + mds->mdcache->try_trim_non_auth_subtree(dir); +} + +void EExport::encode(bufferlist& bl, uint64_t features) const +{ + ENCODE_START(4, 3, bl); + encode(stamp, bl); + encode(metablob, bl, features); + encode(base, bl); + encode(bounds, bl); + encode(target, bl); + ENCODE_FINISH(bl); +} + +void EExport::decode(bufferlist::const_iterator &bl) +{ + DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl); + if (struct_v >= 2) + decode(stamp, bl); + decode(metablob, bl); + decode(base, bl); + decode(bounds, bl); + if (struct_v >= 4) + decode(target, bl); + DECODE_FINISH(bl); +} + +void EExport::dump(Formatter *f) const +{ + f->dump_float("stamp", (double)stamp); + /*f->open_object_section("Metablob"); + metablob.dump(f); // sadly we don't have this; dunno if we'll get it + f->close_section();*/ + 
f->dump_stream("base dirfrag") << base; + f->open_array_section("bounds dirfrags"); + for (set<dirfrag_t>::const_iterator i = bounds.begin(); + i != bounds.end(); ++i) { + f->dump_stream("dirfrag") << *i; + } + f->close_section(); // bounds dirfrags +} + +void EExport::generate_test_instances(list<EExport*>& ls) +{ + EExport *sample = new EExport(); + ls.push_back(sample); +} + + +// ----------------------- +// EImportStart + +void EImportStart::update_segment() +{ + get_segment()->sessionmapv = cmapv; +} + +void EImportStart::replay(MDSRank *mds) +{ + dout(10) << "EImportStart.replay " << base << " bounds " << bounds << dendl; + //metablob.print(*_dout); + auto&& segment = get_segment(); + metablob.replay(mds, segment); + + // put in ambiguous import list + mds->mdcache->add_ambiguous_import(base, bounds); + + // set auth partially to us so we don't trim it + CDir *dir = mds->mdcache->get_dirfrag(base); + ceph_assert(dir); + + set<CDir*> realbounds; + for (vector<dirfrag_t>::iterator p = bounds.begin(); + p != bounds.end(); + ++p) { + CDir *bd = mds->mdcache->get_dirfrag(*p); + ceph_assert(bd); + if (!bd->is_subtree_root()) + bd->state_clear(CDir::STATE_AUTH); + realbounds.insert(bd); + } + + mds->mdcache->adjust_bounded_subtree_auth(dir, realbounds, + mds_authority_t(mds->get_nodeid(), mds->get_nodeid())); + + // open client sessions? 
+ if (mds->sessionmap.get_version() >= cmapv) { + dout(10) << "EImportStart.replay sessionmap " << mds->sessionmap.get_version() + << " >= " << cmapv << ", noop" << dendl; + } else { + dout(10) << "EImportStart.replay sessionmap " << mds->sessionmap.get_version() + << " < " << cmapv << dendl; + map<client_t,entity_inst_t> cm; + map<client_t,client_metadata_t> cmm; + auto blp = client_map.cbegin(); + using ceph::decode; + decode(cm, blp); + if (!blp.end()) + decode(cmm, blp); + mds->sessionmap.replay_open_sessions(cmapv, cm, cmm); + } + update_segment(); +} + +void EImportStart::encode(bufferlist &bl, uint64_t features) const { + ENCODE_START(4, 3, bl); + encode(stamp, bl); + encode(base, bl); + encode(metablob, bl, features); + encode(bounds, bl); + encode(cmapv, bl); + encode(client_map, bl); + encode(from, bl); + ENCODE_FINISH(bl); +} + +void EImportStart::decode(bufferlist::const_iterator &bl) { + DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl); + if (struct_v >= 2) + decode(stamp, bl); + decode(base, bl); + decode(metablob, bl); + decode(bounds, bl); + decode(cmapv, bl); + decode(client_map, bl); + if (struct_v >= 4) + decode(from, bl); + DECODE_FINISH(bl); +} + +void EImportStart::dump(Formatter *f) const +{ + f->dump_stream("base dirfrag") << base; + f->open_array_section("boundary dirfrags"); + for (vector<dirfrag_t>::const_iterator iter = bounds.begin(); + iter != bounds.end(); ++iter) { + f->dump_stream("frag") << *iter; + } + f->close_section(); +} + +void EImportStart::generate_test_instances(list<EImportStart*>& ls) +{ + ls.push_back(new EImportStart); +} + +// ----------------------- +// EImportFinish + +void EImportFinish::replay(MDSRank *mds) +{ + if (mds->mdcache->have_ambiguous_import(base)) { + dout(10) << "EImportFinish.replay " << base << " success=" << success << dendl; + if (success) { + mds->mdcache->finish_ambiguous_import(base); + } else { + CDir *dir = mds->mdcache->get_dirfrag(base); + ceph_assert(dir); + vector<dirfrag_t> bounds; + 
mds->mdcache->get_ambiguous_import_bounds(base, bounds); + mds->mdcache->adjust_bounded_subtree_auth(dir, bounds, CDIR_AUTH_UNDEF); + mds->mdcache->cancel_ambiguous_import(dir); + mds->mdcache->try_trim_non_auth_subtree(dir); + } + } else { + // this shouldn't happen unless this is an old journal + dout(10) << "EImportFinish.replay " << base << " success=" << success + << " on subtree not marked as ambiguous" + << dendl; + mds->clog->error() << "failure replaying journal (EImportFinish)"; + mds->damaged(); + ceph_abort(); // Should be unreachable because damaged() calls respawn() + } +} + +void EImportFinish::encode(bufferlist& bl, uint64_t features) const +{ + ENCODE_START(3, 3, bl); + encode(stamp, bl); + encode(base, bl); + encode(success, bl); + ENCODE_FINISH(bl); +} + +void EImportFinish::decode(bufferlist::const_iterator &bl) +{ + DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl); + if (struct_v >= 2) + decode(stamp, bl); + decode(base, bl); + decode(success, bl); + DECODE_FINISH(bl); +} + +void EImportFinish::dump(Formatter *f) const +{ + f->dump_stream("base dirfrag") << base; + f->dump_string("success", success ? 
"true" : "false"); +} +void EImportFinish::generate_test_instances(list<EImportFinish*>& ls) +{ + ls.push_back(new EImportFinish); + ls.push_back(new EImportFinish); + ls.back()->success = true; +} + + +// ------------------------ +// EResetJournal + +void EResetJournal::encode(bufferlist& bl, uint64_t features) const +{ + ENCODE_START(2, 2, bl); + encode(stamp, bl); + ENCODE_FINISH(bl); +} + +void EResetJournal::decode(bufferlist::const_iterator &bl) +{ + DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl); + decode(stamp, bl); + DECODE_FINISH(bl); +} + +void EResetJournal::dump(Formatter *f) const +{ + f->dump_stream("timestamp") << stamp; +} + +void EResetJournal::generate_test_instances(list<EResetJournal*>& ls) +{ + ls.push_back(new EResetJournal()); +} + +void EResetJournal::replay(MDSRank *mds) +{ + dout(1) << "EResetJournal" << dendl; + + mds->sessionmap.wipe(); + mds->inotable->replay_reset(); + + if (mds->mdsmap->get_root() == mds->get_nodeid()) { + CDir *rootdir = mds->mdcache->get_root()->get_or_open_dirfrag(mds->mdcache, frag_t()); + mds->mdcache->adjust_subtree_auth(rootdir, mds->get_nodeid()); + } + + CDir *mydir = mds->mdcache->get_myin()->get_or_open_dirfrag(mds->mdcache, frag_t()); + mds->mdcache->adjust_subtree_auth(mydir, mds->get_nodeid()); + + mds->mdcache->recalc_auth_bits(true); + + mds->mdcache->show_subtrees(); +} + + +void ENoOp::encode(bufferlist &bl, uint64_t features) const +{ + ENCODE_START(2, 2, bl); + encode(pad_size, bl); + uint8_t const pad = 0xff; + for (unsigned int i = 0; i < pad_size; ++i) { + encode(pad, bl); + } + ENCODE_FINISH(bl); +} + + +void ENoOp::decode(bufferlist::const_iterator &bl) +{ + DECODE_START(2, bl); + decode(pad_size, bl); + if (bl.get_remaining() != pad_size) { + // This is spiritually an assertion, but expressing in a way that will let + // journal debug tools catch it and recognise a malformed entry. 
+ throw buffer::end_of_buffer(); + } else { + bl.advance(pad_size); + } + DECODE_FINISH(bl); +} + + +void ENoOp::replay(MDSRank *mds) +{ + dout(4) << "ENoOp::replay, " << pad_size << " bytes skipped in journal" << dendl; +} + +/** + * If re-formatting an old journal that used absolute log position + * references as segment sequence numbers, use this function to update + * it. + * + * @param mds + * MDSRank instance, just used for logging + * @param old_to_new + * Map of old journal segment sequence numbers to new journal segment sequence numbers + * + * @return + * True if the event was modified. + */ +bool EMetaBlob::rewrite_truncate_finish(MDSRank const *mds, + std::map<LogSegment::seq_t, LogSegment::seq_t> const &old_to_new) +{ + bool modified = false; + map<inodeno_t, LogSegment::seq_t> new_trunc_finish; + for (const auto& p : truncate_finish) { + auto q = old_to_new.find(p.second); + if (q != old_to_new.end()) { + dout(20) << __func__ << " applying segment seq mapping " + << p.second << " -> " << q->second << dendl; + new_trunc_finish.emplace(p.first, q->second); + modified = true; + } else { + dout(20) << __func__ << " no segment seq mapping found for " + << p.second << dendl; + new_trunc_finish.insert(p); + } + } + truncate_finish.swap(new_trunc_finish); + + return modified; +} diff --git a/src/mds/locks.c b/src/mds/locks.c new file mode 100644 index 00000000..25646fdd --- /dev/null +++ b/src/mds/locks.c @@ -0,0 +1,162 @@ +#include "include/int_types.h" + +#include <string.h> +#include <fcntl.h> + +#include "locks.h" + +/* Duplicated from ceph_fs.h, which we cannot include into a C file. 
*/ +#define CEPH_CAP_GSHARED 1 /* client can reads */ +#define CEPH_CAP_GEXCL 2 /* client can read and update */ +#define CEPH_CAP_GCACHE 4 /* (file) client can cache reads */ +#define CEPH_CAP_GRD 8 /* (file) client can read */ +#define CEPH_CAP_GWR 16 /* (file) client can write */ +#define CEPH_CAP_GBUFFER 32 /* (file) client can buffer writes */ +#define CEPH_CAP_GWREXTEND 64 /* (file) client can extend EOF */ +#define CEPH_CAP_GLAZYIO 128 /* (file) client can perform lazy io */ + +static const struct sm_state_t simplelock[LOCK_MAX] = { + // stable loner rep state r rp rd wr fwr l x caps,other + [LOCK_SYNC] = { 0, false, LOCK_SYNC, ANY, 0, ANY, 0, 0, ANY, 0, CEPH_CAP_GSHARED,0,0,CEPH_CAP_GSHARED }, + [LOCK_LOCK_SYNC] = { LOCK_SYNC, false, LOCK_LOCK, AUTH, XCL, XCL, 0, 0, XCL, 0, 0,0,0,0 }, + [LOCK_EXCL_SYNC] = { LOCK_SYNC, true, LOCK_LOCK, 0, 0, 0, 0, XCL, 0, 0, 0,CEPH_CAP_GSHARED,0,0 }, + [LOCK_SNAP_SYNC] = { LOCK_SYNC, false, LOCK_LOCK, 0, 0, 0, 0, AUTH,0, 0, 0,0,0,0 }, + + [LOCK_LOCK] = { 0, false, LOCK_LOCK, AUTH, 0, REQ, 0, 0, 0, 0, 0,0,0,0 }, + [LOCK_SYNC_LOCK] = { LOCK_LOCK, false, LOCK_LOCK, ANY, 0, 0, 0, 0, 0, 0, 0,0,0,0 }, + [LOCK_EXCL_LOCK] = { LOCK_LOCK, false, LOCK_LOCK, 0, 0, 0, 0, XCL, 0, 0, 0,0,0,0 }, + + [LOCK_PREXLOCK] = { LOCK_LOCK, false, LOCK_LOCK, 0, XCL, 0, 0, 0, 0, ANY, 0,0,0,0 }, + [LOCK_XLOCK] = { LOCK_SYNC, false, LOCK_LOCK, 0, XCL, 0, 0, 0, 0, 0, 0,0,0,0 }, + [LOCK_XLOCKDONE] = { LOCK_SYNC, false, LOCK_LOCK, XCL, XCL, XCL, 0, 0, XCL, 0, 0,0,CEPH_CAP_GSHARED,0 }, + [LOCK_LOCK_XLOCK]= { LOCK_PREXLOCK,false,LOCK_LOCK,0, XCL, 0, 0, 0, 0, XCL, 0,0,0,0 }, + + [LOCK_EXCL] = { 0, true, LOCK_LOCK, 0, 0, REQ, XCL, 0, 0, 0, 0,CEPH_CAP_GEXCL|CEPH_CAP_GSHARED,0,0 }, + [LOCK_SYNC_EXCL] = { LOCK_EXCL, true, LOCK_LOCK, ANY, 0, 0, 0, 0, 0, 0, 0,CEPH_CAP_GSHARED,0,0 }, + [LOCK_LOCK_EXCL] = { LOCK_EXCL, false, LOCK_LOCK, AUTH, 0, 0, 0, 0, 0, 0, CEPH_CAP_GSHARED,0,0,0 }, + + [LOCK_REMOTEXLOCK]={ LOCK_LOCK, false, LOCK_LOCK, 0, 0, 0, 0, 0, 0, 0, 0,0,0,0 
}, + +}; + +const struct sm_t sm_simplelock = { + .states = simplelock, + .allowed_ever_auth = CEPH_CAP_GSHARED | CEPH_CAP_GEXCL, + .allowed_ever_replica = CEPH_CAP_GSHARED, + .careful = CEPH_CAP_GSHARED | CEPH_CAP_GEXCL, + .can_remote_xlock = 1, +}; + + +// lock state machine states: +// Sync -- Lock -- sCatter +// Tempsync _/ +// (out of date) + +static const struct sm_state_t scatterlock[LOCK_MAX] = { + // stable loner rep state r rp rd wr fwr l x caps,other + [LOCK_SYNC] = { 0, false, LOCK_SYNC, ANY, 0, ANY, 0, 0, ANY, 0, CEPH_CAP_GSHARED,0,0,CEPH_CAP_GSHARED }, + [LOCK_LOCK_SYNC] = { LOCK_SYNC, false, LOCK_LOCK, AUTH, 0, 0, 0, 0, 0, 0, 0,0,0,0 }, + [LOCK_MIX_SYNC] = { LOCK_SYNC, false, LOCK_LOCK, 0, 0, 0, 0, 0, 0, 0, 0,0,0,0 }, + [LOCK_SNAP_SYNC] = { LOCK_SYNC, false, LOCK_LOCK, 0, 0, 0, 0, AUTH,0, 0, 0,0,0,0 }, + + [LOCK_LOCK] = { 0, false, LOCK_LOCK, AUTH, 0, REQ, AUTH,0, 0, ANY, 0,0,0,0 }, + [LOCK_SYNC_LOCK] = { LOCK_LOCK, false, LOCK_LOCK, ANY, 0, 0, 0, 0, 0, 0, 0,0,0,0 }, + [LOCK_MIX_LOCK] = { LOCK_LOCK, false, LOCK_MIX, 0, 0, 0, 0, 0, 0, 0, 0,0,0,0 }, + [LOCK_MIX_LOCK2] = { LOCK_LOCK, false, LOCK_LOCK, 0, 0, 0, 0, 0, 0, 0, 0,0,0,0 }, + [LOCK_TSYN_LOCK] = { LOCK_LOCK, false, LOCK_LOCK, AUTH, 0, 0, 0, 0, 0, 0, 0,0,0,0 }, + + [LOCK_TSYN] = { 0, false, LOCK_LOCK, AUTH, 0, AUTH,0, 0, 0, 0, 0,0,0,0 }, + [LOCK_LOCK_TSYN] = { LOCK_TSYN, false, LOCK_LOCK, 0, 0, 0, 0, 0, 0, 0, 0,0,0,0 }, + [LOCK_MIX_TSYN] = { LOCK_TSYN, false, LOCK_LOCK, 0, 0, 0, 0, 0, 0, 0, 0,0,0,0 }, + + [LOCK_MIX] = { 0, false, LOCK_MIX, 0, 0, REQ, ANY, 0, 0, 0, 0,0,0,0 }, + [LOCK_TSYN_MIX] = { LOCK_MIX, false, LOCK_LOCK, 0, 0, 0, 0, 0, 0, 0, 0,0,0,0 }, + [LOCK_SYNC_MIX] = { LOCK_MIX, false, LOCK_SYNC_MIX2,ANY,0, 0, 0, 0, 0, 0, 0,0,0,0 }, + [LOCK_SYNC_MIX2] = { LOCK_MIX, false, 0, 0, 0, 0, 0, 0, 0, 0, 0,0,0,0 }, +}; + +const struct sm_t sm_scatterlock = { + .states = scatterlock, + .allowed_ever_auth = CEPH_CAP_GSHARED | CEPH_CAP_GEXCL, + .allowed_ever_replica = CEPH_CAP_GSHARED, + .careful = 
CEPH_CAP_GSHARED | CEPH_CAP_GEXCL, + .can_remote_xlock = 0, +}; + +const struct sm_state_t filelock[LOCK_MAX] = { + // stable loner rep state r rp rd wr fwr l x caps(any,loner,xlocker,replica) + [LOCK_SYNC] = { 0, false, LOCK_SYNC, ANY, 0, ANY, 0, 0, ANY, 0, CEPH_CAP_GSHARED|CEPH_CAP_GCACHE|CEPH_CAP_GRD|CEPH_CAP_GLAZYIO,0,0,CEPH_CAP_GSHARED|CEPH_CAP_GCACHE|CEPH_CAP_GRD }, + [LOCK_LOCK_SYNC] = { LOCK_SYNC, false, LOCK_LOCK, AUTH, 0, 0, 0, 0, 0, 0, CEPH_CAP_GCACHE,0,0,0 }, + [LOCK_EXCL_SYNC] = { LOCK_SYNC, true, LOCK_LOCK, 0, 0, 0, 0, XCL, 0, 0, 0,CEPH_CAP_GSHARED|CEPH_CAP_GCACHE|CEPH_CAP_GRD,0,0 }, + [LOCK_MIX_SYNC] = { LOCK_SYNC, false, LOCK_MIX_SYNC2,0,0, 0, 0, 0, 0, 0, CEPH_CAP_GRD|CEPH_CAP_GLAZYIO,0,0,CEPH_CAP_GRD }, + [LOCK_MIX_SYNC2] = { LOCK_SYNC, false, 0, 0, 0, 0, 0, 0, 0, 0, CEPH_CAP_GRD|CEPH_CAP_GLAZYIO,0,0,CEPH_CAP_GRD }, + [LOCK_SNAP_SYNC] = { LOCK_SYNC, false, LOCK_LOCK, 0, 0, 0, 0, AUTH,0, 0, 0,0,0,0 }, + [LOCK_XSYN_SYNC] = { LOCK_SYNC, true, LOCK_LOCK, AUTH, 0, AUTH,0, 0, 0, 0, 0,CEPH_CAP_GCACHE,0,0 }, + + [LOCK_LOCK] = { 0, false, LOCK_LOCK, AUTH, 0, REQ, AUTH,0, 0, 0, CEPH_CAP_GCACHE|CEPH_CAP_GBUFFER,0,0,0 }, + [LOCK_SYNC_LOCK] = { LOCK_LOCK, false, LOCK_LOCK, ANY, 0, REQ, 0, 0, 0, 0, CEPH_CAP_GCACHE,0,0,0 }, + [LOCK_EXCL_LOCK] = { LOCK_LOCK, false, LOCK_LOCK, 0, 0, 0, 0, XCL, 0, 0, CEPH_CAP_GCACHE|CEPH_CAP_GBUFFER,0,0,0 }, + [LOCK_MIX_LOCK] = { LOCK_LOCK, false, LOCK_MIX, 0, 0, REQ, 0, 0, 0, 0, 0,0,0,0 }, + [LOCK_MIX_LOCK2] = { LOCK_LOCK, false, LOCK_LOCK, 0, 0, REQ, 0, 0, 0, 0, 0,0,0,0 }, + [LOCK_XSYN_LOCK] = { LOCK_LOCK, true, LOCK_LOCK, AUTH, 0, 0, XCL, 0, 0, 0, 0,CEPH_CAP_GCACHE|CEPH_CAP_GBUFFER,0,0 }, + + [LOCK_PREXLOCK] = { LOCK_LOCK, false, LOCK_LOCK, 0, XCL, 0, 0, 0, 0, ANY, CEPH_CAP_GCACHE|CEPH_CAP_GBUFFER,0,0,0 }, + [LOCK_XLOCK] = { LOCK_LOCK, false, LOCK_LOCK, 0, XCL, 0, 0, 0, 0, 0, CEPH_CAP_GCACHE|CEPH_CAP_GBUFFER,0,0,0 }, + [LOCK_XLOCKDONE] = { LOCK_LOCK, false, LOCK_LOCK, XCL, XCL, XCL, 0, 0, XCL, 0, 
CEPH_CAP_GCACHE|CEPH_CAP_GBUFFER,0,CEPH_CAP_GSHARED,0 }, + [LOCK_XLOCKSNAP] = { LOCK_LOCK, false, LOCK_LOCK, 0, XCL, 0, 0, 0, 0, 0, CEPH_CAP_GCACHE,0,0,0 }, + [LOCK_LOCK_XLOCK]= { LOCK_PREXLOCK,false,LOCK_LOCK,0, XCL, 0, 0, 0, 0, XCL, CEPH_CAP_GCACHE|CEPH_CAP_GBUFFER,0,0,0 }, + + [LOCK_MIX] = { 0, false, LOCK_MIX, 0, 0, REQ, ANY, 0, 0, 0, CEPH_CAP_GRD|CEPH_CAP_GWR|CEPH_CAP_GLAZYIO,0,0,CEPH_CAP_GRD }, + [LOCK_SYNC_MIX] = { LOCK_MIX, false, LOCK_SYNC_MIX2,ANY,0, 0, 0, 0, 0, 0, CEPH_CAP_GRD|CEPH_CAP_GLAZYIO,0,0,CEPH_CAP_GRD }, + [LOCK_SYNC_MIX2] = { LOCK_MIX, false, 0, 0, 0, 0, 0, 0, 0, 0, CEPH_CAP_GRD|CEPH_CAP_GLAZYIO,0,0,CEPH_CAP_GRD }, + [LOCK_EXCL_MIX] = { LOCK_MIX, true, LOCK_LOCK, 0, 0, 0, XCL, 0, 0, 0, 0,CEPH_CAP_GRD|CEPH_CAP_GWR,0,0 }, + [LOCK_XSYN_MIX] = { LOCK_MIX, true, LOCK_LOCK, 0, 0, 0, XCL, 0, 0, 0, 0,0,0,0 }, + + [LOCK_EXCL] = { 0, true, LOCK_LOCK, 0, 0, XCL, XCL, 0, 0, 0, 0,CEPH_CAP_GSHARED|CEPH_CAP_GEXCL|CEPH_CAP_GCACHE|CEPH_CAP_GRD|CEPH_CAP_GWR|CEPH_CAP_GBUFFER,0,0 }, + [LOCK_SYNC_EXCL] = { LOCK_EXCL, true, LOCK_LOCK, ANY, 0, 0, 0, 0, 0, 0, 0,CEPH_CAP_GSHARED|CEPH_CAP_GCACHE|CEPH_CAP_GRD,0,0 }, + [LOCK_MIX_EXCL] = { LOCK_EXCL, true, LOCK_LOCK, 0, 0, 0, XCL, 0, 0, 0, 0,CEPH_CAP_GRD|CEPH_CAP_GWR,0,0 }, + [LOCK_LOCK_EXCL] = { LOCK_EXCL, true, LOCK_LOCK, AUTH, 0, 0, 0, 0, 0, 0, 0,CEPH_CAP_GCACHE|CEPH_CAP_GBUFFER,0,0 }, + [LOCK_XSYN_EXCL] = { LOCK_EXCL, true, LOCK_LOCK, AUTH, 0, XCL, 0, 0, 0, 0, 0,CEPH_CAP_GCACHE|CEPH_CAP_GBUFFER,0,0 }, + + [LOCK_XSYN] = { 0, true, LOCK_LOCK, AUTH, AUTH,AUTH,XCL, 0, 0, 0, 0,CEPH_CAP_GCACHE|CEPH_CAP_GBUFFER,0,0 }, + [LOCK_EXCL_XSYN] = { LOCK_XSYN, false, LOCK_LOCK, 0, 0, XCL, 0, 0, 0, 0, 0,CEPH_CAP_GCACHE|CEPH_CAP_GBUFFER,0,0 }, + + [LOCK_PRE_SCAN] = { LOCK_SCAN, false, LOCK_LOCK, 0, 0, 0, 0, 0, 0, 0, 0,0,0,0 }, + [LOCK_SCAN] = { LOCK_LOCK, false, LOCK_LOCK, 0, 0, 0, 0, 0, 0, 0, 0,0,0,0 }, +}; + +const struct sm_t sm_filelock = { + .states = filelock, + .allowed_ever_auth = (CEPH_CAP_GSHARED | + CEPH_CAP_GEXCL | + 
CEPH_CAP_GCACHE | + CEPH_CAP_GRD | + CEPH_CAP_GWR | + CEPH_CAP_GWREXTEND | + CEPH_CAP_GBUFFER | + CEPH_CAP_GLAZYIO), + .allowed_ever_replica = (CEPH_CAP_GSHARED | + CEPH_CAP_GCACHE | + CEPH_CAP_GRD | + CEPH_CAP_GLAZYIO), + .careful = (CEPH_CAP_GSHARED | + CEPH_CAP_GEXCL | + CEPH_CAP_GCACHE | + CEPH_CAP_GBUFFER), + .can_remote_xlock = 0, +}; + + +const struct sm_state_t locallock[LOCK_MAX] = { + // stable loner rep state r rp rd wr fwr l x caps(any,loner,xlocker,replica) + [LOCK_LOCK] = { 0, false, LOCK_LOCK, ANY, 0, ANY, 0, 0, ANY, AUTH,0,0,0,0 }, +}; + +const struct sm_t sm_locallock = { + .states = locallock, + .allowed_ever_auth = 0, + .allowed_ever_replica = 0, + .careful = 0, + .can_remote_xlock = 0, +}; diff --git a/src/mds/locks.h b/src/mds/locks.h new file mode 100644 index 00000000..e6fdc1cf --- /dev/null +++ b/src/mds/locks.h @@ -0,0 +1,126 @@ +#ifndef CEPH_MDS_LOCKS_H +#define CEPH_MDS_LOCKS_H +#include <stdbool.h> + +struct sm_state_t { + int next; // 0 if stable + bool loner; + int replica_state; + char can_read; + char can_read_projected; + char can_rdlock; + char can_wrlock; + char can_force_wrlock; + char can_lease; + char can_xlock; + int caps; + int loner_caps; + int xlocker_caps; + int replica_caps; +}; + +struct sm_t { + const struct sm_state_t *states; + int allowed_ever_auth; + int allowed_ever_replica; + int careful; + int can_remote_xlock; +}; + +#define ANY 1 // auth or replica +#define AUTH 2 // auth only +#define XCL 3 // auth or exclusive client +//#define FW 4 // fw to auth, if replica +#define REQ 5 // req state change from auth, if replica + +extern const struct sm_t sm_simplelock; +extern const struct sm_t sm_filelock; +extern const struct sm_t sm_scatterlock; +extern const struct sm_t sm_locallock; + + + +// -- lock states -- +// sync <-> lock +enum { + LOCK_UNDEF = 0, + + // auth rep + LOCK_SYNC, // AR R . RD L . / C . R RD L . / C . + LOCK_LOCK, // AR R . .. . X / . . . .. . . / . . + + LOCK_PREXLOCK, // A . . .. . . / . . 
(lock) + LOCK_XLOCK, // A . . .. . . / . . (lock) + LOCK_XLOCKDONE, // A r p rd l x / . . (lock) <-- by same client only!! + LOCK_XLOCKSNAP, // also revoke Fb + LOCK_LOCK_XLOCK, + + LOCK_SYNC_LOCK, // AR R . .. . . / . . R .. . . / . . + LOCK_LOCK_SYNC, // A R p rd l . / . . (lock) <-- lc by same client only + + LOCK_EXCL, // A . . .. . . / c x * (lock) + LOCK_EXCL_SYNC, // A . . .. . . / c . * (lock) + LOCK_EXCL_LOCK, // A . . .. . . / . . (lock) + LOCK_SYNC_EXCL, // Ar R . .. . . / c . * (sync->lock) + LOCK_LOCK_EXCL, // A R . .. . . / . . (lock) + + LOCK_REMOTEXLOCK, // on NON-auth + + // * = loner mode + + LOCK_MIX, + LOCK_SYNC_MIX, + LOCK_SYNC_MIX2, + LOCK_LOCK_MIX, + LOCK_EXCL_MIX, + LOCK_MIX_SYNC, + LOCK_MIX_SYNC2, + LOCK_MIX_LOCK, + LOCK_MIX_LOCK2, + LOCK_MIX_EXCL, + + LOCK_TSYN, + LOCK_TSYN_LOCK, + LOCK_TSYN_MIX, + LOCK_LOCK_TSYN, + LOCK_MIX_TSYN, + + LOCK_PRE_SCAN, + LOCK_SCAN, + + LOCK_SNAP_SYNC, + + LOCK_XSYN, + LOCK_XSYN_EXCL, + LOCK_EXCL_XSYN, + LOCK_XSYN_SYNC, + LOCK_XSYN_LOCK, + LOCK_XSYN_MIX, + + LOCK_MAX, +}; + +// ------------------------- +// lock actions + +// for replicas +#define LOCK_AC_SYNC -1 +#define LOCK_AC_MIX -2 +#define LOCK_AC_LOCK -3 +#define LOCK_AC_LOCKFLUSHED -4 + +// for auth +#define LOCK_AC_SYNCACK 1 +#define LOCK_AC_MIXACK 2 +#define LOCK_AC_LOCKACK 3 + +#define LOCK_AC_REQSCATTER 7 +#define LOCK_AC_REQUNSCATTER 8 +#define LOCK_AC_NUDGE 9 +#define LOCK_AC_REQRDLOCK 10 + +#define LOCK_AC_FOR_REPLICA(a) ((a) < 0) +#define LOCK_AC_FOR_AUTH(a) ((a) > 0) + + +#endif diff --git a/src/mds/mds_table_types.h b/src/mds/mds_table_types.h new file mode 100644 index 00000000..bfb2baa9 --- /dev/null +++ b/src/mds/mds_table_types.h @@ -0,0 +1,81 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the 
terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_MDSTABLETYPES_H +#define CEPH_MDSTABLETYPES_H + +// MDS TABLES + +#include <string_view> + +enum { + TABLE_ANCHOR, + TABLE_SNAP, +}; + +inline std::string_view get_mdstable_name(int t) { + switch (t) { + case TABLE_ANCHOR: return "anchortable"; + case TABLE_SNAP: return "snaptable"; + default: ceph_abort(); return std::string_view(); + } +} + +enum { + TABLESERVER_OP_QUERY = 1, + TABLESERVER_OP_QUERY_REPLY = -2, + TABLESERVER_OP_PREPARE = 3, + TABLESERVER_OP_AGREE = -4, + TABLESERVER_OP_COMMIT = 5, + TABLESERVER_OP_ACK = -6, + TABLESERVER_OP_ROLLBACK = 7, + TABLESERVER_OP_SERVER_UPDATE = 8, + TABLESERVER_OP_SERVER_READY = -9, + TABLESERVER_OP_NOTIFY_ACK = 10, + TABLESERVER_OP_NOTIFY_PREP = -11, +}; + +inline std::string_view get_mdstableserver_opname(int op) { + switch (op) { + case TABLESERVER_OP_QUERY: return "query"; + case TABLESERVER_OP_QUERY_REPLY: return "query_reply"; + case TABLESERVER_OP_PREPARE: return "prepare"; + case TABLESERVER_OP_AGREE: return "agree"; + case TABLESERVER_OP_COMMIT: return "commit"; + case TABLESERVER_OP_ACK: return "ack"; + case TABLESERVER_OP_ROLLBACK: return "rollback"; + case TABLESERVER_OP_SERVER_UPDATE: return "server_update"; + case TABLESERVER_OP_SERVER_READY: return "server_ready"; + case TABLESERVER_OP_NOTIFY_ACK: return "notify_ack"; + case TABLESERVER_OP_NOTIFY_PREP: return "notify_prep"; + default: ceph_abort(); return std::string_view(); + } +} + +enum { + TABLE_OP_CREATE, + TABLE_OP_UPDATE, + TABLE_OP_DESTROY, +}; + +inline std::string_view get_mdstable_opname(int op) { + switch (op) { + case TABLE_OP_CREATE: return "create"; + case TABLE_OP_UPDATE: return "update"; + case TABLE_OP_DESTROY: return "destroy"; + default: ceph_abort(); return std::string_view(); + } +} + +#endif diff --git a/src/mds/mdstypes.cc b/src/mds/mdstypes.cc new file mode 100644 index 
00000000..a55c8559 --- /dev/null +++ b/src/mds/mdstypes.cc @@ -0,0 +1,895 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "mdstypes.h" +#include "MDSContext.h" +#include "common/Formatter.h" + +const mds_gid_t MDS_GID_NONE = mds_gid_t(0); + + +/* + * frag_info_t + */ + +void frag_info_t::encode(bufferlist &bl) const +{ + ENCODE_START(3, 2, bl); + encode(version, bl); + encode(mtime, bl); + encode(nfiles, bl); + encode(nsubdirs, bl); + encode(change_attr, bl); + ENCODE_FINISH(bl); +} + +void frag_info_t::decode(bufferlist::const_iterator &bl) +{ + DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl); + decode(version, bl); + decode(mtime, bl); + decode(nfiles, bl); + decode(nsubdirs, bl); + if (struct_v >= 3) + decode(change_attr, bl); + else + change_attr = 0; + DECODE_FINISH(bl); +} + +void frag_info_t::dump(Formatter *f) const +{ + f->dump_unsigned("version", version); + f->dump_stream("mtime") << mtime; + f->dump_unsigned("num_files", nfiles); + f->dump_unsigned("num_subdirs", nsubdirs); +} + +void frag_info_t::generate_test_instances(list<frag_info_t*>& ls) +{ + ls.push_back(new frag_info_t); + ls.push_back(new frag_info_t); + ls.back()->version = 1; + ls.back()->mtime = utime_t(2, 3); + ls.back()->nfiles = 4; + ls.back()->nsubdirs = 5; +} + +ostream& operator<<(ostream &out, const frag_info_t &f) +{ + if (f == frag_info_t()) + return out << "f()"; + out << "f(v" << f.version; + if (f.mtime != utime_t()) + out << " m" << f.mtime; + if (f.nfiles || f.nsubdirs) + out << " " << f.size() << "=" << f.nfiles << "+" << f.nsubdirs; + out << ")"; + return out; +} + + +/* + * nest_info_t + */ + +void nest_info_t::encode(bufferlist &bl) const +{ + ENCODE_START(3, 2, bl); + encode(version, bl); + encode(rbytes, bl); + encode(rfiles, bl); + encode(rsubdirs, bl); + { + // removed field + int64_t ranchors = 0; + encode(ranchors, bl); + } + encode(rsnaps, bl); + encode(rctime, bl); + ENCODE_FINISH(bl); +} + +void 
nest_info_t::decode(bufferlist::const_iterator &bl) +{ + DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl); + decode(version, bl); + decode(rbytes, bl); + decode(rfiles, bl); + decode(rsubdirs, bl); + { + int64_t ranchors; + decode(ranchors, bl); + } + decode(rsnaps, bl); + decode(rctime, bl); + DECODE_FINISH(bl); +} + +void nest_info_t::dump(Formatter *f) const +{ + f->dump_unsigned("version", version); + f->dump_unsigned("rbytes", rbytes); + f->dump_unsigned("rfiles", rfiles); + f->dump_unsigned("rsubdirs", rsubdirs); + f->dump_unsigned("rsnaps", rsnaps); + f->dump_stream("rctime") << rctime; +} + +void nest_info_t::generate_test_instances(list<nest_info_t*>& ls) +{ + ls.push_back(new nest_info_t); + ls.push_back(new nest_info_t); + ls.back()->version = 1; + ls.back()->rbytes = 2; + ls.back()->rfiles = 3; + ls.back()->rsubdirs = 4; + ls.back()->rsnaps = 6; + ls.back()->rctime = utime_t(7, 8); +} + +ostream& operator<<(ostream &out, const nest_info_t &n) +{ + if (n == nest_info_t()) + return out << "n()"; + out << "n(v" << n.version; + if (n.rctime != utime_t()) + out << " rc" << n.rctime; + if (n.rbytes) + out << " b" << n.rbytes; + if (n.rsnaps) + out << " rs" << n.rsnaps; + if (n.rfiles || n.rsubdirs) + out << " " << n.rsize() << "=" << n.rfiles << "+" << n.rsubdirs; + out << ")"; + return out; +} + +/* + * quota_info_t + */ +void quota_info_t::dump(Formatter *f) const +{ + f->dump_int("max_bytes", max_bytes); + f->dump_int("max_files", max_files); +} + +void quota_info_t::generate_test_instances(list<quota_info_t *>& ls) +{ + ls.push_back(new quota_info_t); + ls.push_back(new quota_info_t); + ls.back()->max_bytes = 16; + ls.back()->max_files = 16; +} + +ostream& operator<<(ostream &out, const quota_info_t &n) +{ + out << "quota(" + << "max_bytes = " << n.max_bytes + << " max_files = " << n.max_files + << ")"; + return out; +} + +/* + * client_writeable_range_t + */ + +void client_writeable_range_t::encode(bufferlist &bl) const +{ + ENCODE_START(2, 2, bl); + 
encode(range.first, bl); + encode(range.last, bl); + encode(follows, bl); + ENCODE_FINISH(bl); +} + +void client_writeable_range_t::decode(bufferlist::const_iterator& bl) +{ + DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl); + decode(range.first, bl); + decode(range.last, bl); + decode(follows, bl); + DECODE_FINISH(bl); +} + +void client_writeable_range_t::dump(Formatter *f) const +{ + f->open_object_section("byte range"); + f->dump_unsigned("first", range.first); + f->dump_unsigned("last", range.last); + f->close_section(); + f->dump_unsigned("follows", follows); +} + +void client_writeable_range_t::generate_test_instances(list<client_writeable_range_t*>& ls) +{ + ls.push_back(new client_writeable_range_t); + ls.push_back(new client_writeable_range_t); + ls.back()->range.first = 123; + ls.back()->range.last = 456; + ls.back()->follows = 12; +} + +ostream& operator<<(ostream& out, const client_writeable_range_t& r) +{ + return out << r.range.first << '-' << r.range.last << "@" << r.follows; +} + +/* + * inline_data_t + */ +void inline_data_t::encode(bufferlist &bl) const +{ + using ceph::encode; + encode(version, bl); + if (blp) + encode(*blp, bl); + else + encode(bufferlist(), bl); +} +void inline_data_t::decode(bufferlist::const_iterator &p) +{ + using ceph::decode; + decode(version, p); + uint32_t inline_len; + decode(inline_len, p); + if (inline_len > 0) + decode_nohead(inline_len, get_data(), p); + else + free_data(); +} + + +/* + * fnode_t + */ +void fnode_t::encode(bufferlist &bl) const +{ + ENCODE_START(4, 3, bl); + encode(version, bl); + encode(snap_purged_thru, bl); + encode(fragstat, bl); + encode(accounted_fragstat, bl); + encode(rstat, bl); + encode(accounted_rstat, bl); + encode(damage_flags, bl); + encode(recursive_scrub_version, bl); + encode(recursive_scrub_stamp, bl); + encode(localized_scrub_version, bl); + encode(localized_scrub_stamp, bl); + ENCODE_FINISH(bl); +} + +void fnode_t::decode(bufferlist::const_iterator &bl) +{ + 
DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl); + decode(version, bl); + decode(snap_purged_thru, bl); + decode(fragstat, bl); + decode(accounted_fragstat, bl); + decode(rstat, bl); + decode(accounted_rstat, bl); + if (struct_v >= 3) { + decode(damage_flags, bl); + } + if (struct_v >= 4) { + decode(recursive_scrub_version, bl); + decode(recursive_scrub_stamp, bl); + decode(localized_scrub_version, bl); + decode(localized_scrub_stamp, bl); + } + DECODE_FINISH(bl); +} + +void fnode_t::dump(Formatter *f) const +{ + f->dump_unsigned("version", version); + f->dump_unsigned("snap_purged_thru", snap_purged_thru); + + f->open_object_section("fragstat"); + fragstat.dump(f); + f->close_section(); + + f->open_object_section("accounted_fragstat"); + accounted_fragstat.dump(f); + f->close_section(); + + f->open_object_section("rstat"); + rstat.dump(f); + f->close_section(); + + f->open_object_section("accounted_rstat"); + accounted_rstat.dump(f); + f->close_section(); +} + +void fnode_t::generate_test_instances(list<fnode_t*>& ls) +{ + ls.push_back(new fnode_t); + ls.push_back(new fnode_t); + ls.back()->version = 1; + ls.back()->snap_purged_thru = 2; + list<frag_info_t*> fls; + frag_info_t::generate_test_instances(fls); + ls.back()->fragstat = *fls.back(); + ls.back()->accounted_fragstat = *fls.front(); + list<nest_info_t*> nls; + nest_info_t::generate_test_instances(nls); + ls.back()->rstat = *nls.front(); + ls.back()->accounted_rstat = *nls.back(); +} + + +/* + * old_rstat_t + */ +void old_rstat_t::encode(bufferlist& bl) const +{ + ENCODE_START(2, 2, bl); + encode(first, bl); + encode(rstat, bl); + encode(accounted_rstat, bl); + ENCODE_FINISH(bl); +} + +void old_rstat_t::decode(bufferlist::const_iterator& bl) +{ + DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl); + decode(first, bl); + decode(rstat, bl); + decode(accounted_rstat, bl); + DECODE_FINISH(bl); +} + +void old_rstat_t::dump(Formatter *f) const +{ + f->dump_unsigned("snapid", first); + f->open_object_section("rstat"); + 
rstat.dump(f); + f->close_section(); + f->open_object_section("accounted_rstat"); + accounted_rstat.dump(f); + f->close_section(); +} + +void old_rstat_t::generate_test_instances(list<old_rstat_t*>& ls) +{ + ls.push_back(new old_rstat_t()); + ls.push_back(new old_rstat_t()); + ls.back()->first = 12; + list<nest_info_t*> nls; + nest_info_t::generate_test_instances(nls); + ls.back()->rstat = *nls.back(); + ls.back()->accounted_rstat = *nls.front(); +} + +/* + * feature_bitset_t + */ +feature_bitset_t::feature_bitset_t(unsigned long value) +{ + if (value) { + for (size_t i = 0; i < sizeof(value) * 8; i += bits_per_block) { + _vec.push_back((block_type)(value >> i)); + } + } +} + +feature_bitset_t::feature_bitset_t(const vector<size_t>& array) +{ + if (!array.empty()) { + size_t n = array.back(); + n += bits_per_block; + n /= bits_per_block; + _vec.resize(n, 0); + + size_t last = 0; + for (auto& bit : array) { + if (bit > last) + last = bit; + else + ceph_assert(bit == last); + _vec[bit / bits_per_block] |= (block_type)1 << (bit % bits_per_block); + } + } +} + +feature_bitset_t& feature_bitset_t::operator-=(const feature_bitset_t& other) +{ + for (size_t i = 0; i < _vec.size(); ++i) { + if (i >= other._vec.size()) + break; + _vec[i] &= ~other._vec[i]; + } + return *this; +} + +void feature_bitset_t::encode(bufferlist& bl) const { + using ceph::encode; + using ceph::encode_nohead; + uint32_t len = _vec.size() * sizeof(block_type); + encode(len, bl); + encode_nohead(_vec, bl); +} + +void feature_bitset_t::decode(bufferlist::const_iterator &p) { + using ceph::decode; + using ceph::decode_nohead; + uint32_t len; + decode(len, p); + + _vec.clear(); + if (len >= sizeof(block_type)) + decode_nohead(len / sizeof(block_type), _vec, p); + + if (len % sizeof(block_type)) { + ceph_le64 buf{}; + p.copy(len % sizeof(block_type), (char*)&buf); + _vec.push_back((block_type)buf); + } +} + +void feature_bitset_t::print(ostream& out) const +{ + std::ios_base::fmtflags f(out.flags()); + 
out << "0x"; + for (int i = _vec.size() - 1; i >= 0; --i) + out << std::setfill('0') << std::setw(sizeof(block_type) * 2) + << std::hex << _vec[i]; + out.flags(f); +} + +/* + * client_metadata_t + */ +void client_metadata_t::encode(bufferlist& bl) const +{ + ENCODE_START(2, 1, bl); + encode(kv_map, bl); + encode(features, bl); + ENCODE_FINISH(bl); +} + +void client_metadata_t::decode(bufferlist::const_iterator& p) +{ + DECODE_START(2, p); + decode(kv_map, p); + if (struct_v >= 2) + decode(features, p); + DECODE_FINISH(p); +} + +void client_metadata_t::dump(Formatter *f) const +{ + f->dump_stream("features") << features; + for (const auto& [name, val] : kv_map) + f->dump_string(name.c_str(), val); +} + +/* + * session_info_t + */ +void session_info_t::encode(bufferlist& bl, uint64_t features) const +{ + ENCODE_START(7, 7, bl); + encode(inst, bl, features); + encode(completed_requests, bl); + encode(prealloc_inos, bl); // hacky, see below. + encode(used_inos, bl); + encode(completed_flushes, bl); + encode(auth_name, bl); + encode(client_metadata, bl); + ENCODE_FINISH(bl); +} + +void session_info_t::decode(bufferlist::const_iterator& p) +{ + DECODE_START_LEGACY_COMPAT_LEN(7, 2, 2, p); + decode(inst, p); + if (struct_v <= 2) { + set<ceph_tid_t> s; + decode(s, p); + while (!s.empty()) { + completed_requests[*s.begin()] = inodeno_t(); + s.erase(s.begin()); + } + } else { + decode(completed_requests, p); + } + decode(prealloc_inos, p); + decode(used_inos, p); + prealloc_inos.insert(used_inos); + used_inos.clear(); + if (struct_v >= 4 && struct_v < 7) { + decode(client_metadata.kv_map, p); + } + if (struct_v >= 5) { + decode(completed_flushes, p); + } + if (struct_v >= 6) { + decode(auth_name, p); + } + if (struct_v >= 7) { + decode(client_metadata, p); + } + DECODE_FINISH(p); +} + +void session_info_t::dump(Formatter *f) const +{ + f->dump_stream("inst") << inst; + + f->open_array_section("completed_requests"); + for (const auto& [tid, ino] : completed_requests) { + 
f->open_object_section("request"); + f->dump_unsigned("tid", tid); + f->dump_stream("created_ino") << ino; + f->close_section(); + } + f->close_section(); + + f->open_array_section("prealloc_inos"); + for (const auto& [start, len] : prealloc_inos) { + f->open_object_section("ino_range"); + f->dump_unsigned("start", start); + f->dump_unsigned("length", len); + f->close_section(); + } + f->close_section(); + + f->open_array_section("used_inos"); + for (const auto& [start, len] : used_inos) { + f->open_object_section("ino_range"); + f->dump_unsigned("start", start); + f->dump_unsigned("length", len); + f->close_section(); + } + f->close_section(); + + f->dump_object("client_metadata", client_metadata); +} + +void session_info_t::generate_test_instances(list<session_info_t*>& ls) +{ + ls.push_back(new session_info_t); + ls.push_back(new session_info_t); + ls.back()->inst = entity_inst_t(entity_name_t::MDS(12), entity_addr_t()); + ls.back()->completed_requests.insert(make_pair(234, inodeno_t(111222))); + ls.back()->completed_requests.insert(make_pair(237, inodeno_t(222333))); + ls.back()->prealloc_inos.insert(333, 12); + ls.back()->prealloc_inos.insert(377, 112); + // we can't add used inos; they're cleared on decode +} + + +/* + * string_snap_t + */ +void string_snap_t::encode(bufferlist& bl) const +{ + ENCODE_START(2, 2, bl); + encode(name, bl); + encode(snapid, bl); + ENCODE_FINISH(bl); +} + +void string_snap_t::decode(bufferlist::const_iterator& bl) +{ + DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl); + decode(name, bl); + decode(snapid, bl); + DECODE_FINISH(bl); +} + +void string_snap_t::dump(Formatter *f) const +{ + f->dump_string("name", name); + f->dump_unsigned("snapid", snapid); +} + +void string_snap_t::generate_test_instances(list<string_snap_t*>& ls) +{ + ls.push_back(new string_snap_t); + ls.push_back(new string_snap_t); + ls.back()->name = "foo"; + ls.back()->snapid = 123; + ls.push_back(new string_snap_t); + ls.back()->name = "bar"; + ls.back()->snapid = 
456; +} + + +/* + * MDSCacheObjectInfo + */ +void MDSCacheObjectInfo::encode(bufferlist& bl) const +{ + ENCODE_START(2, 2, bl); + encode(ino, bl); + encode(dirfrag, bl); + encode(dname, bl); + encode(snapid, bl); + ENCODE_FINISH(bl); +} + +void MDSCacheObjectInfo::decode(bufferlist::const_iterator& p) +{ + DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, p); + decode(ino, p); + decode(dirfrag, p); + decode(dname, p); + decode(snapid, p); + DECODE_FINISH(p); +} + +void MDSCacheObjectInfo::dump(Formatter *f) const +{ + f->dump_unsigned("ino", ino); + f->dump_stream("dirfrag") << dirfrag; + f->dump_string("name", dname); + f->dump_unsigned("snapid", snapid); +} + +void MDSCacheObjectInfo::generate_test_instances(list<MDSCacheObjectInfo*>& ls) +{ + ls.push_back(new MDSCacheObjectInfo); + ls.push_back(new MDSCacheObjectInfo); + ls.back()->ino = 1; + ls.back()->dirfrag = dirfrag_t(2, 3); + ls.back()->dname = "fooname"; + ls.back()->snapid = CEPH_NOSNAP; + ls.push_back(new MDSCacheObjectInfo); + ls.back()->ino = 121; + ls.back()->dirfrag = dirfrag_t(222, 0); + ls.back()->dname = "bar foo"; + ls.back()->snapid = 21322; +} + +/* + * mds_table_pending_t + */ +void mds_table_pending_t::encode(bufferlist& bl) const +{ + ENCODE_START(2, 2, bl); + encode(reqid, bl); + encode(mds, bl); + encode(tid, bl); + ENCODE_FINISH(bl); +} + +void mds_table_pending_t::decode(bufferlist::const_iterator& bl) +{ + DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl); + decode(reqid, bl); + decode(mds, bl); + decode(tid, bl); + DECODE_FINISH(bl); +} + +void mds_table_pending_t::dump(Formatter *f) const +{ + f->dump_unsigned("reqid", reqid); + f->dump_unsigned("mds", mds); + f->dump_unsigned("tid", tid); +} + +void mds_table_pending_t::generate_test_instances(list<mds_table_pending_t*>& ls) +{ + ls.push_back(new mds_table_pending_t); + ls.push_back(new mds_table_pending_t); + ls.back()->reqid = 234; + ls.back()->mds = 2; + ls.back()->tid = 35434; +} + + +/* + * inode_load_vec_t + */ +void 
inode_load_vec_t::encode(bufferlist &bl) const +{ + ENCODE_START(2, 2, bl); + for (const auto &i : vec) { + encode(i, bl); + } + ENCODE_FINISH(bl); +} + +void inode_load_vec_t::decode(bufferlist::const_iterator &p) +{ + DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, p); + for (auto &i : vec) { + decode(i, p); + } + DECODE_FINISH(p); +} + +void inode_load_vec_t::dump(Formatter *f) const +{ + f->open_array_section("Decay Counters"); + for (const auto &i : vec) { + f->open_object_section("Decay Counter"); + i.dump(f); + f->close_section(); + } + f->close_section(); +} + +void inode_load_vec_t::generate_test_instances(list<inode_load_vec_t*>& ls) +{ + ls.push_back(new inode_load_vec_t(DecayRate())); +} + + +/* + * dirfrag_load_vec_t + */ +void dirfrag_load_vec_t::dump(Formatter *f) const +{ + f->open_array_section("Decay Counters"); + for (const auto &i : vec) { + f->open_object_section("Decay Counter"); + i.dump(f); + f->close_section(); + } + f->close_section(); +} + +void dirfrag_load_vec_t::dump(Formatter *f, const DecayRate& rate) const +{ + f->dump_float("meta_load", meta_load()); + f->dump_float("IRD", get(META_POP_IRD).get()); + f->dump_float("IWR", get(META_POP_IWR).get()); + f->dump_float("READDIR", get(META_POP_READDIR).get()); + f->dump_float("FETCH", get(META_POP_FETCH).get()); + f->dump_float("STORE", get(META_POP_STORE).get()); +} + +void dirfrag_load_vec_t::generate_test_instances(std::list<dirfrag_load_vec_t*>& ls) +{ + ls.push_back(new dirfrag_load_vec_t(DecayRate())); +} + +/* + * mds_load_t + */ +void mds_load_t::encode(bufferlist &bl) const { + ENCODE_START(2, 2, bl); + encode(auth, bl); + encode(all, bl); + encode(req_rate, bl); + encode(cache_hit_rate, bl); + encode(queue_len, bl); + encode(cpu_load_avg, bl); + ENCODE_FINISH(bl); +} + +void mds_load_t::decode(bufferlist::const_iterator &bl) { + DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl); + decode(auth, bl); + decode(all, bl); + decode(req_rate, bl); + decode(cache_hit_rate, bl); + decode(queue_len, 
bl); + decode(cpu_load_avg, bl); + DECODE_FINISH(bl); +} + +void mds_load_t::dump(Formatter *f) const +{ + f->dump_float("request rate", req_rate); + f->dump_float("cache hit rate", cache_hit_rate); + f->dump_float("queue length", queue_len); + f->dump_float("cpu load", cpu_load_avg); + f->open_object_section("auth dirfrag"); + auth.dump(f); + f->close_section(); + f->open_object_section("all dirfrags"); + all.dump(f); + f->close_section(); +} + +void mds_load_t::generate_test_instances(std::list<mds_load_t*>& ls) +{ + ls.push_back(new mds_load_t(DecayRate())); +} + +/* + * cap_reconnect_t + */ +void cap_reconnect_t::encode(bufferlist& bl) const { + ENCODE_START(2, 1, bl); + encode_old(bl); // extract out when something changes + encode(snap_follows, bl); + ENCODE_FINISH(bl); +} + +void cap_reconnect_t::encode_old(bufferlist& bl) const { + using ceph::encode; + encode(path, bl); + capinfo.flock_len = flockbl.length(); + encode(capinfo, bl); + encode_nohead(flockbl, bl); +} + +void cap_reconnect_t::decode(bufferlist::const_iterator& bl) { + DECODE_START(2, bl); + decode_old(bl); // extract out when something changes + if (struct_v >= 2) + decode(snap_follows, bl); + DECODE_FINISH(bl); +} + +void cap_reconnect_t::decode_old(bufferlist::const_iterator& bl) { + using ceph::decode; + decode(path, bl); + decode(capinfo, bl); + decode_nohead(capinfo.flock_len, flockbl, bl); +} + +void cap_reconnect_t::dump(Formatter *f) const +{ + f->dump_string("path", path); + f->dump_int("cap_id", capinfo.cap_id); + f->dump_string("cap wanted", ccap_string(capinfo.wanted)); + f->dump_string("cap issued", ccap_string(capinfo.issued)); + f->dump_int("snaprealm", capinfo.snaprealm); + f->dump_int("path base ino", capinfo.pathbase); + f->dump_string("has file locks", capinfo.flock_len ? 
"true" : "false"); +} + +void cap_reconnect_t::generate_test_instances(list<cap_reconnect_t*>& ls) +{ + ls.push_back(new cap_reconnect_t); + ls.back()->path = "/test/path"; + ls.back()->capinfo.cap_id = 1; +} + +/* + * snaprealm_reconnect_t + */ +void snaprealm_reconnect_t::encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode_old(bl); // extract out when something changes + ENCODE_FINISH(bl); +} + +void snaprealm_reconnect_t::encode_old(bufferlist& bl) const { + using ceph::encode; + encode(realm, bl); +} + +void snaprealm_reconnect_t::decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode_old(bl); // extract out when something changes + DECODE_FINISH(bl); +} + +void snaprealm_reconnect_t::decode_old(bufferlist::const_iterator& bl) { + using ceph::decode; + decode(realm, bl); +} + +void snaprealm_reconnect_t::dump(Formatter *f) const +{ + f->dump_int("ino", realm.ino); + f->dump_int("seq", realm.seq); + f->dump_int("parent", realm.parent); +} + +void snaprealm_reconnect_t::generate_test_instances(list<snaprealm_reconnect_t*>& ls) +{ + ls.push_back(new snaprealm_reconnect_t); + ls.back()->realm.ino = 0x10000000001ULL; + ls.back()->realm.seq = 2; + ls.back()->realm.parent = 1; +} + + +ostream& operator<<(ostream &out, const mds_role_t &role) +{ + out << role.fscid << ":" << role.rank; + return out; +} + diff --git a/src/mds/mdstypes.h b/src/mds/mdstypes.h new file mode 100644 index 00000000..d241030a --- /dev/null +++ b/src/mds/mdstypes.h @@ -0,0 +1,1821 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#ifndef CEPH_MDSTYPES_H +#define CEPH_MDSTYPES_H + +#include "include/int_types.h" + +#include <math.h> +#include <ostream> +#include <set> +#include <map> +#include <string_view> + +#include "common/config.h" +#include "common/Clock.h" +#include "common/DecayCounter.h" +#include "common/entity_name.h" + +#include "include/Context.h" +#include "include/frag.h" +#include 
"include/xlist.h" +#include "include/interval_set.h" +#include "include/compact_map.h" +#include "include/compact_set.h" +#include "include/fs_types.h" + +#include "inode_backtrace.h" + +#include <boost/spirit/include/qi.hpp> +#include <boost/pool/pool.hpp> +#include "include/ceph_assert.h" +#include <boost/serialization/strong_typedef.hpp> + +#define CEPH_FS_ONDISK_MAGIC "ceph fs volume v011" + +#define MDS_PORT_CACHE 0x200 +#define MDS_PORT_LOCKER 0x300 +#define MDS_PORT_MIGRATOR 0x400 + +#define MAX_MDS 0x100 +#define NUM_STRAY 10 + +#define MDS_INO_ROOT 1 + +// No longer created but recognised in existing filesystems +// so that we don't try to fragment it. +#define MDS_INO_CEPH 2 + +#define MDS_INO_GLOBAL_SNAPREALM 3 + +#define MDS_INO_MDSDIR_OFFSET (1*MAX_MDS) +#define MDS_INO_STRAY_OFFSET (6*MAX_MDS) + +// Locations for journal data +#define MDS_INO_LOG_OFFSET (2*MAX_MDS) +#define MDS_INO_LOG_BACKUP_OFFSET (3*MAX_MDS) +#define MDS_INO_LOG_POINTER_OFFSET (4*MAX_MDS) +#define MDS_INO_PURGE_QUEUE (5*MAX_MDS) + +#define MDS_INO_SYSTEM_BASE ((6*MAX_MDS) + (MAX_MDS * NUM_STRAY)) + +#define MDS_INO_STRAY(x,i) (MDS_INO_STRAY_OFFSET+((((unsigned)(x))*NUM_STRAY)+((unsigned)(i)))) +#define MDS_INO_MDSDIR(x) (MDS_INO_MDSDIR_OFFSET+((unsigned)x)) + +#define MDS_INO_IS_STRAY(i) ((i) >= MDS_INO_STRAY_OFFSET && (i) < (MDS_INO_STRAY_OFFSET+(MAX_MDS*NUM_STRAY))) +#define MDS_INO_IS_MDSDIR(i) ((i) >= MDS_INO_MDSDIR_OFFSET && (i) < (MDS_INO_MDSDIR_OFFSET+MAX_MDS)) +#define MDS_INO_MDSDIR_OWNER(i) (signed ((unsigned (i)) - MDS_INO_MDSDIR_OFFSET)) +#define MDS_INO_IS_BASE(i) ((i) == MDS_INO_ROOT || (i) == MDS_INO_GLOBAL_SNAPREALM || MDS_INO_IS_MDSDIR(i)) +#define MDS_INO_STRAY_OWNER(i) (signed (((unsigned (i)) - MDS_INO_STRAY_OFFSET) / NUM_STRAY)) +#define MDS_INO_STRAY_INDEX(i) (((unsigned (i)) - MDS_INO_STRAY_OFFSET) % NUM_STRAY) + +#define MDS_TRAVERSE_FORWARD 1 +#define MDS_TRAVERSE_DISCOVER 2 // skips permissions checks etc. 
+#define MDS_TRAVERSE_DISCOVERXLOCK 3 // succeeds on (foreign?) null, xlocked dentries. + + +typedef int32_t mds_rank_t; +constexpr mds_rank_t MDS_RANK_NONE = -1; + +BOOST_STRONG_TYPEDEF(uint64_t, mds_gid_t) +extern const mds_gid_t MDS_GID_NONE; + +typedef int32_t fs_cluster_id_t; +constexpr fs_cluster_id_t FS_CLUSTER_ID_NONE = -1; +// The namespace ID of the anonymous default filesystem from legacy systems +constexpr fs_cluster_id_t FS_CLUSTER_ID_ANONYMOUS = 0; + +class mds_role_t +{ + public: + fs_cluster_id_t fscid; + mds_rank_t rank; + + mds_role_t(fs_cluster_id_t fscid_, mds_rank_t rank_) + : fscid(fscid_), rank(rank_) + {} + mds_role_t() + : fscid(FS_CLUSTER_ID_NONE), rank(MDS_RANK_NONE) + {} + bool operator<(mds_role_t const &rhs) const + { + if (fscid < rhs.fscid) { + return true; + } else if (fscid == rhs.fscid) { + return rank < rhs.rank; + } else { + return false; + } + } + + bool is_none() const + { + return (rank == MDS_RANK_NONE); + } +}; +std::ostream& operator<<(std::ostream &out, const mds_role_t &role); + + +// CAPS + +inline string gcap_string(int cap) +{ + string s; + if (cap & CEPH_CAP_GSHARED) s += "s"; + if (cap & CEPH_CAP_GEXCL) s += "x"; + if (cap & CEPH_CAP_GCACHE) s += "c"; + if (cap & CEPH_CAP_GRD) s += "r"; + if (cap & CEPH_CAP_GWR) s += "w"; + if (cap & CEPH_CAP_GBUFFER) s += "b"; + if (cap & CEPH_CAP_GWREXTEND) s += "a"; + if (cap & CEPH_CAP_GLAZYIO) s += "l"; + return s; +} +inline string ccap_string(int cap) +{ + string s; + if (cap & CEPH_CAP_PIN) s += "p"; + + int a = (cap >> CEPH_CAP_SAUTH) & 3; + if (a) s += 'A' + gcap_string(a); + + a = (cap >> CEPH_CAP_SLINK) & 3; + if (a) s += 'L' + gcap_string(a); + + a = (cap >> CEPH_CAP_SXATTR) & 3; + if (a) s += 'X' + gcap_string(a); + + a = cap >> CEPH_CAP_SFILE; + if (a) s += 'F' + gcap_string(a); + + if (s.length() == 0) + s = "-"; + return s; +} + + +struct scatter_info_t { + version_t version = 0; + + scatter_info_t() {} +}; + +struct frag_info_t : public scatter_info_t { + // this 
frag + utime_t mtime; + uint64_t change_attr = 0; + int64_t nfiles = 0; // files + int64_t nsubdirs = 0; // subdirs + + frag_info_t() {} + + int64_t size() const { return nfiles + nsubdirs; } + + void zero() { + *this = frag_info_t(); + } + + // *this += cur - acc; + void add_delta(const frag_info_t &cur, const frag_info_t &acc, bool *touched_mtime=0, bool *touched_chattr=0) { + if (cur.mtime > mtime) { + mtime = cur.mtime; + if (touched_mtime) + *touched_mtime = true; + } + if (cur.change_attr > change_attr) { + change_attr = cur.change_attr; + if (touched_chattr) + *touched_chattr = true; + } + nfiles += cur.nfiles - acc.nfiles; + nsubdirs += cur.nsubdirs - acc.nsubdirs; + } + + void add(const frag_info_t& other) { + if (other.mtime > mtime) + mtime = other.mtime; + if (other.change_attr > change_attr) + change_attr = other.change_attr; + nfiles += other.nfiles; + nsubdirs += other.nsubdirs; + } + + bool same_sums(const frag_info_t &o) const { + return mtime <= o.mtime && + nfiles == o.nfiles && + nsubdirs == o.nsubdirs; + } + + void encode(bufferlist &bl) const; + void decode(bufferlist::const_iterator& bl); + void dump(Formatter *f) const; + static void generate_test_instances(list<frag_info_t*>& ls); +}; +WRITE_CLASS_ENCODER(frag_info_t) + +inline bool operator==(const frag_info_t &l, const frag_info_t &r) { + return memcmp(&l, &r, sizeof(l)) == 0; +} +inline bool operator!=(const frag_info_t &l, const frag_info_t &r) { + return !(l == r); +} + +std::ostream& operator<<(std::ostream &out, const frag_info_t &f); + + +struct nest_info_t : public scatter_info_t { + // this frag + children + utime_t rctime; + int64_t rbytes = 0; + int64_t rfiles = 0; + int64_t rsubdirs = 0; + int64_t rsize() const { return rfiles + rsubdirs; } + + int64_t rsnaps = 0; + + nest_info_t() {} + + void zero() { + *this = nest_info_t(); + } + + void sub(const nest_info_t &other) { + add(other, -1); + } + void add(const nest_info_t &other, int fac=1) { + if (other.rctime > rctime) + 
rctime = other.rctime; + rbytes += fac*other.rbytes; + rfiles += fac*other.rfiles; + rsubdirs += fac*other.rsubdirs; + rsnaps += fac*other.rsnaps; + } + + // *this += cur - acc; + void add_delta(const nest_info_t &cur, const nest_info_t &acc) { + if (cur.rctime > rctime) + rctime = cur.rctime; + rbytes += cur.rbytes - acc.rbytes; + rfiles += cur.rfiles - acc.rfiles; + rsubdirs += cur.rsubdirs - acc.rsubdirs; + rsnaps += cur.rsnaps - acc.rsnaps; + } + + bool same_sums(const nest_info_t &o) const { + return rctime <= o.rctime && + rbytes == o.rbytes && + rfiles == o.rfiles && + rsubdirs == o.rsubdirs && + rsnaps == o.rsnaps; + } + + void encode(bufferlist &bl) const; + void decode(bufferlist::const_iterator& bl); + void dump(Formatter *f) const; + static void generate_test_instances(list<nest_info_t*>& ls); +}; +WRITE_CLASS_ENCODER(nest_info_t) + +inline bool operator==(const nest_info_t &l, const nest_info_t &r) { + return memcmp(&l, &r, sizeof(l)) == 0; +} +inline bool operator!=(const nest_info_t &l, const nest_info_t &r) { + return !(l == r); +} + +std::ostream& operator<<(std::ostream &out, const nest_info_t &n); + + +struct vinodeno_t { + inodeno_t ino; + snapid_t snapid; + vinodeno_t() {} + vinodeno_t(inodeno_t i, snapid_t s) : ino(i), snapid(s) {} + + void encode(bufferlist& bl) const { + using ceph::encode; + encode(ino, bl); + encode(snapid, bl); + } + void decode(bufferlist::const_iterator& p) { + using ceph::decode; + decode(ino, p); + decode(snapid, p); + } +}; +WRITE_CLASS_ENCODER(vinodeno_t) + +inline bool operator==(const vinodeno_t &l, const vinodeno_t &r) { + return l.ino == r.ino && l.snapid == r.snapid; +} +inline bool operator!=(const vinodeno_t &l, const vinodeno_t &r) { + return !(l == r); +} +inline bool operator<(const vinodeno_t &l, const vinodeno_t &r) { + return + l.ino < r.ino || + (l.ino == r.ino && l.snapid < r.snapid); +} + +struct quota_info_t +{ + int64_t max_bytes = 0; + int64_t max_files = 0; + + quota_info_t() {} + + void 
encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(max_bytes, bl); + encode(max_files, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator& p) { + DECODE_START_LEGACY_COMPAT_LEN(1, 1, 1, p); + decode(max_bytes, p); + decode(max_files, p); + DECODE_FINISH(p); + } + + void dump(Formatter *f) const; + static void generate_test_instances(list<quota_info_t *>& ls); + + bool is_valid() const { + return max_bytes >=0 && max_files >=0; + } + bool is_enable() const { + return max_bytes || max_files; + } +}; +WRITE_CLASS_ENCODER(quota_info_t) + +inline bool operator==(const quota_info_t &l, const quota_info_t &r) { + return memcmp(&l, &r, sizeof(l)) == 0; +} + +ostream& operator<<(ostream &out, const quota_info_t &n); + +namespace std { + template<> struct hash<vinodeno_t> { + size_t operator()(const vinodeno_t &vino) const { + hash<inodeno_t> H; + hash<uint64_t> I; + return H(vino.ino) ^ I(vino.snapid); + } + }; +} // namespace std + + + + +inline std::ostream& operator<<(std::ostream &out, const vinodeno_t &vino) { + out << vino.ino; + if (vino.snapid == CEPH_NOSNAP) + out << ".head"; + else if (vino.snapid) + out << '.' 
<< vino.snapid; + return out; +} + + +/* + * client_writeable_range_t + */ +struct client_writeable_range_t { + struct byte_range_t { + uint64_t first = 0, last = 0; // interval client can write to + byte_range_t() {} + }; + + byte_range_t range; + snapid_t follows = 0; // aka "data+metadata flushed thru" + + client_writeable_range_t() {} + + void encode(bufferlist &bl) const; + void decode(bufferlist::const_iterator& bl); + void dump(Formatter *f) const; + static void generate_test_instances(std::list<client_writeable_range_t*>& ls); +}; + +inline void decode(client_writeable_range_t::byte_range_t& range, bufferlist::const_iterator& bl) { + decode(range.first, bl); + decode(range.last, bl); +} + +WRITE_CLASS_ENCODER(client_writeable_range_t) + +std::ostream& operator<<(std::ostream& out, const client_writeable_range_t& r); + +inline bool operator==(const client_writeable_range_t& l, + const client_writeable_range_t& r) { + return l.range.first == r.range.first && l.range.last == r.range.last && + l.follows == r.follows; +} + +struct inline_data_t { +private: + std::unique_ptr<bufferlist> blp; +public: + version_t version = 1; + + void free_data() { + blp.reset(); + } + bufferlist& get_data() { + if (!blp) + blp.reset(new bufferlist); + return *blp; + } + size_t length() const { return blp ? 
blp->length() : 0; } + + inline_data_t() {} + inline_data_t(const inline_data_t& o) : version(o.version) { + if (o.blp) + get_data() = *o.blp; + } + inline_data_t& operator=(const inline_data_t& o) { + version = o.version; + if (o.blp) + get_data() = *o.blp; + else + free_data(); + return *this; + } + bool operator==(const inline_data_t& o) const { + return length() == o.length() && + (length() == 0 || + (*const_cast<bufferlist*>(blp.get()) == *const_cast<bufferlist*>(o.blp.get()))); + } + bool operator!=(const inline_data_t& o) const { + return !(*this == o); + } + void encode(bufferlist &bl) const; + void decode(bufferlist::const_iterator& bl); +}; +WRITE_CLASS_ENCODER(inline_data_t) + +enum { + DAMAGE_STATS, // statistics (dirstat, size, etc) + DAMAGE_RSTATS, // recursive statistics (rstat, accounted_rstat) + DAMAGE_FRAGTREE // fragtree -- repair by searching +}; +typedef uint32_t damage_flags_t; + +/* + * inode_t + */ +template<template<typename> class Allocator = std::allocator> +struct inode_t { + /** + * *************** + * Do not forget to add any new fields to the compare() function. + * *************** + */ + // base (immutable) + inodeno_t ino = 0; + uint32_t rdev = 0; // if special file + + // affected by any inode change... + utime_t ctime; // inode change time + utime_t btime; // birth time + + // perm (namespace permissions) + uint32_t mode = 0; + uid_t uid = 0; + gid_t gid = 0; + + // nlink + int32_t nlink = 0; + + // file (data access) + ceph_dir_layout dir_layout; // [dir only] + file_layout_t layout; + compact_set<int64_t, std::less<int64_t>, Allocator<int64_t>> old_pools; + uint64_t size = 0; // on directory, # dentries + uint64_t max_size_ever = 0; // max size the file has ever been + uint32_t truncate_seq = 0; + uint64_t truncate_size = 0, truncate_from = 0; + uint32_t truncate_pending = 0; + utime_t mtime; // file data modify time. + utime_t atime; // file data access time. 
+ uint32_t time_warp_seq = 0; // count of (potential) mtime/atime timewarps (i.e., utimes()) + inline_data_t inline_data; // FIXME check + + // change attribute + uint64_t change_attr = 0; + + using client_range_map = std::map<client_t,client_writeable_range_t,std::less<client_t>,Allocator<std::pair<const client_t,client_writeable_range_t>>>; + client_range_map client_ranges; // client(s) can write to these ranges + + // dirfrag, recursive accounting + frag_info_t dirstat; // protected by my filelock + nest_info_t rstat; // protected by my nestlock + nest_info_t accounted_rstat; // protected by parent's nestlock + + quota_info_t quota; + + mds_rank_t export_pin = MDS_RANK_NONE; + + // special stuff + version_t version = 0; // auth only + version_t file_data_version = 0; // auth only + version_t xattr_version = 0; + + utime_t last_scrub_stamp; // start time of last complete scrub + version_t last_scrub_version = 0;// (parent) start version of last complete scrub + + version_t backtrace_version = 0; + + snapid_t oldest_snap; + + std::basic_string<char,std::char_traits<char>,Allocator<char>> stray_prior_path; //stores path before unlink + + inode_t() + { + clear_layout(); + // FIPS zeroization audit 20191117: this memset is not security related.
+ memset(&dir_layout, 0, sizeof(dir_layout)); + } + + // file type + bool is_symlink() const { return (mode & S_IFMT) == S_IFLNK; } + bool is_dir() const { return (mode & S_IFMT) == S_IFDIR; } + bool is_file() const { return (mode & S_IFMT) == S_IFREG; } + + bool is_truncating() const { return (truncate_pending > 0); } + void truncate(uint64_t old_size, uint64_t new_size) { + ceph_assert(new_size < old_size); + if (old_size > max_size_ever) + max_size_ever = old_size; + truncate_from = old_size; + size = new_size; + rstat.rbytes = new_size; + truncate_size = size; + truncate_seq++; + truncate_pending++; + } + + bool has_layout() const { + return layout != file_layout_t(); + } + + void clear_layout() { + layout = file_layout_t(); + } + + uint64_t get_layout_size_increment() const { + return layout.get_period(); + } + + bool is_dirty_rstat() const { return !(rstat == accounted_rstat); } + + uint64_t get_max_size() const { + uint64_t max = 0; + for (std::map<client_t,client_writeable_range_t>::const_iterator p = client_ranges.begin(); + p != client_ranges.end(); + ++p) + if (p->second.range.last > max) + max = p->second.range.last; + return max; + } + void set_max_size(uint64_t new_max) { + if (new_max == 0) { + client_ranges.clear(); + } else { + for (std::map<client_t,client_writeable_range_t>::iterator p = client_ranges.begin(); + p != client_ranges.end(); + ++p) + p->second.range.last = new_max; + } + } + + void trim_client_ranges(snapid_t last) { + std::map<client_t, client_writeable_range_t>::iterator p = client_ranges.begin(); + while (p != client_ranges.end()) { + if (p->second.follows >= last) + client_ranges.erase(p++); + else + ++p; + } + } + + bool is_backtrace_updated() const { + return backtrace_version == version; + } + void update_backtrace(version_t pv=0) { + backtrace_version = pv ? 
pv : version; + } + + void add_old_pool(int64_t l) { + backtrace_version = version; + old_pools.insert(l); + } + + void encode(bufferlist &bl, uint64_t features) const; + void decode(bufferlist::const_iterator& bl); + void dump(Formatter *f) const; + static void generate_test_instances(std::list<inode_t*>& ls); + /** + * Compare this inode_t with another that represent *the same inode* + * at different points in time. + * @pre The inodes are the same ino + * + * @param other The inode_t to compare ourselves with + * @param divergent A bool pointer which will be set to true + * if the values are different in a way that can't be explained + * by one being a newer version than the other. + * + * @returns 1 if we are newer than the other, 0 if equal, -1 if older. + */ + int compare(const inode_t &other, bool *divergent) const; +private: + bool older_is_consistent(const inode_t &other) const; +}; + +// These methods may be moved back to mdstypes.cc when we have pmr +template<template<typename> class Allocator> +void inode_t<Allocator>::encode(bufferlist &bl, uint64_t features) const +{ + ENCODE_START(15, 6, bl); + + encode(ino, bl); + encode(rdev, bl); + encode(ctime, bl); + + encode(mode, bl); + encode(uid, bl); + encode(gid, bl); + + encode(nlink, bl); + { + // removed field + bool anchored = 0; + encode(anchored, bl); + } + + encode(dir_layout, bl); + encode(layout, bl, features); + encode(size, bl); + encode(truncate_seq, bl); + encode(truncate_size, bl); + encode(truncate_from, bl); + encode(truncate_pending, bl); + encode(mtime, bl); + encode(atime, bl); + encode(time_warp_seq, bl); + encode(client_ranges, bl); + + encode(dirstat, bl); + encode(rstat, bl); + encode(accounted_rstat, bl); + + encode(version, bl); + encode(file_data_version, bl); + encode(xattr_version, bl); + encode(backtrace_version, bl); + encode(old_pools, bl); + encode(max_size_ever, bl); + encode(inline_data, bl); + encode(quota, bl); + + encode(stray_prior_path, bl); + + 
encode(last_scrub_version, bl); + encode(last_scrub_stamp, bl); + + encode(btime, bl); + encode(change_attr, bl); + + encode(export_pin, bl); + + ENCODE_FINISH(bl); +} + +template<template<typename> class Allocator> +void inode_t<Allocator>::decode(bufferlist::const_iterator &p) +{ + DECODE_START_LEGACY_COMPAT_LEN(15, 6, 6, p); + + decode(ino, p); + decode(rdev, p); + decode(ctime, p); + + decode(mode, p); + decode(uid, p); + decode(gid, p); + + decode(nlink, p); + { + bool anchored; + decode(anchored, p); + } + + if (struct_v >= 4) + decode(dir_layout, p); + else { + // FIPS zeroization audit 20191117: this memset is not security related. + memset(&dir_layout, 0, sizeof(dir_layout)); + } + decode(layout, p); + decode(size, p); + decode(truncate_seq, p); + decode(truncate_size, p); + decode(truncate_from, p); + if (struct_v >= 5) + decode(truncate_pending, p); + else + truncate_pending = 0; + decode(mtime, p); + decode(atime, p); + decode(time_warp_seq, p); + if (struct_v >= 3) { + decode(client_ranges, p); + } else { + map<client_t, client_writeable_range_t::byte_range_t> m; + decode(m, p); + for (map<client_t, client_writeable_range_t::byte_range_t>::iterator + q = m.begin(); q != m.end(); ++q) + client_ranges[q->first].range = q->second; + } + + decode(dirstat, p); + decode(rstat, p); + decode(accounted_rstat, p); + + decode(version, p); + decode(file_data_version, p); + decode(xattr_version, p); + if (struct_v >= 2) + decode(backtrace_version, p); + if (struct_v >= 7) + decode(old_pools, p); + if (struct_v >= 8) + decode(max_size_ever, p); + if (struct_v >= 9) { + decode(inline_data, p); + } else { + inline_data.version = CEPH_INLINE_NONE; + } + if (struct_v < 10) + backtrace_version = 0; // force update backtrace + if (struct_v >= 11) + decode(quota, p); + + if (struct_v >= 12) { + std::string tmp; + decode(tmp, p); + stray_prior_path = std::string_view(tmp); + } + + if (struct_v >= 13) { + decode(last_scrub_version, p); + decode(last_scrub_stamp, p); + } + if 
(struct_v >= 14) { + decode(btime, p); + decode(change_attr, p); + } else { + btime = utime_t(); + change_attr = 0; + } + + if (struct_v >= 15) { + decode(export_pin, p); + } else { + export_pin = MDS_RANK_NONE; + } + + DECODE_FINISH(p); +} + +template<template<typename> class Allocator> +void inode_t<Allocator>::dump(Formatter *f) const +{ + f->dump_unsigned("ino", ino); + f->dump_unsigned("rdev", rdev); + f->dump_stream("ctime") << ctime; + f->dump_stream("btime") << btime; + f->dump_unsigned("mode", mode); + f->dump_unsigned("uid", uid); + f->dump_unsigned("gid", gid); + f->dump_unsigned("nlink", nlink); + + f->open_object_section("dir_layout"); + ::dump(dir_layout, f); + f->close_section(); + + f->dump_object("layout", layout); + + f->open_array_section("old_pools"); + for (const auto &p : old_pools) { + f->dump_int("pool", p); + } + f->close_section(); + + f->dump_unsigned("size", size); + f->dump_unsigned("truncate_seq", truncate_seq); + f->dump_unsigned("truncate_size", truncate_size); + f->dump_unsigned("truncate_from", truncate_from); + f->dump_unsigned("truncate_pending", truncate_pending); + f->dump_stream("mtime") << mtime; + f->dump_stream("atime") << atime; + f->dump_unsigned("time_warp_seq", time_warp_seq); + f->dump_unsigned("change_attr", change_attr); + f->dump_int("export_pin", export_pin); + + f->open_array_section("client_ranges"); + for (const auto &p : client_ranges) { + f->open_object_section("client"); + f->dump_unsigned("client", p.first.v); + p.second.dump(f); + f->close_section(); + } + f->close_section(); + + f->open_object_section("dirstat"); + dirstat.dump(f); + f->close_section(); + + f->open_object_section("rstat"); + rstat.dump(f); + f->close_section(); + + f->open_object_section("accounted_rstat"); + accounted_rstat.dump(f); + f->close_section(); + + f->dump_unsigned("version", version); + f->dump_unsigned("file_data_version", file_data_version); + f->dump_unsigned("xattr_version", xattr_version); + 
f->dump_unsigned("backtrace_version", backtrace_version); + + f->dump_string("stray_prior_path", stray_prior_path); +} + +template<template<typename> class Allocator> +void inode_t<Allocator>::generate_test_instances(list<inode_t*>& ls) +{ + ls.push_back(new inode_t<Allocator>); + ls.push_back(new inode_t<Allocator>); + ls.back()->ino = 1; + // i am lazy. +} + +template<template<typename> class Allocator> +int inode_t<Allocator>::compare(const inode_t<Allocator> &other, bool *divergent) const +{ + ceph_assert(ino == other.ino); + *divergent = false; + if (version == other.version) { + if (rdev != other.rdev || + ctime != other.ctime || + btime != other.btime || + mode != other.mode || + uid != other.uid || + gid != other.gid || + nlink != other.nlink || + memcmp(&dir_layout, &other.dir_layout, sizeof(dir_layout)) || + layout != other.layout || + old_pools != other.old_pools || + size != other.size || + max_size_ever != other.max_size_ever || + truncate_seq != other.truncate_seq || + truncate_size != other.truncate_size || + truncate_from != other.truncate_from || + truncate_pending != other.truncate_pending || + change_attr != other.change_attr || + mtime != other.mtime || + atime != other.atime || + time_warp_seq != other.time_warp_seq || + inline_data != other.inline_data || + client_ranges != other.client_ranges || + !(dirstat == other.dirstat) || + !(rstat == other.rstat) || + !(accounted_rstat == other.accounted_rstat) || + file_data_version != other.file_data_version || + xattr_version != other.xattr_version || + backtrace_version != other.backtrace_version) { + *divergent = true; + } + return 0; + } else if (version > other.version) { + *divergent = !older_is_consistent(other); + return 1; + } else { + ceph_assert(version < other.version); + *divergent = !other.older_is_consistent(*this); + return -1; + } +} + +template<template<typename> class Allocator> +bool inode_t<Allocator>::older_is_consistent(const inode_t<Allocator> &other) const +{ + if 
(max_size_ever < other.max_size_ever || + truncate_seq < other.truncate_seq || + time_warp_seq < other.time_warp_seq || + inline_data.version < other.inline_data.version || + dirstat.version < other.dirstat.version || + rstat.version < other.rstat.version || + accounted_rstat.version < other.accounted_rstat.version || + file_data_version < other.file_data_version || + xattr_version < other.xattr_version || + backtrace_version < other.backtrace_version) { + return false; + } + return true; +} + +template<template<typename> class Allocator> +inline void encode(const inode_t<Allocator> &c, ::ceph::bufferlist &bl, uint64_t features) +{ + ENCODE_DUMP_PRE(); + c.encode(bl, features); + ENCODE_DUMP_POST(cl); +} +template<template<typename> class Allocator> +inline void decode(inode_t<Allocator> &c, ::ceph::bufferlist::const_iterator &p) +{ + c.decode(p); +} + +template<template<typename> class Allocator> +using alloc_string = std::basic_string<char,std::char_traits<char>,Allocator<char>>; + +template<template<typename> class Allocator> +using xattr_map = compact_map<alloc_string<Allocator>, bufferptr, std::less<alloc_string<Allocator>>, Allocator<std::pair<const alloc_string<Allocator>, bufferptr>>>; // FIXME bufferptr not in mempool + +template<template<typename> class Allocator> +inline void decode_noshare(xattr_map<Allocator>& xattrs, ceph::buffer::list::const_iterator &p) +{ + __u32 n; + decode(n, p); + while (n-- > 0) { + alloc_string<Allocator> key; + decode(key, p); + __u32 len; + decode(len, p); + p.copy_deep(len, xattrs[key]); + } +} + +/* + * old_inode_t + */ +template<template<typename> class Allocator = std::allocator> +struct old_inode_t { + snapid_t first; + inode_t<Allocator> inode; + xattr_map<Allocator> xattrs; + + void encode(bufferlist &bl, uint64_t features) const; + void decode(bufferlist::const_iterator& bl); + void dump(Formatter *f) const; + static void generate_test_instances(std::list<old_inode_t*>& ls); +}; + +// These methods may be moved back 
to mdstypes.cc when we have pmr +template<template<typename> class Allocator> +void old_inode_t<Allocator>::encode(bufferlist& bl, uint64_t features) const +{ + ENCODE_START(2, 2, bl); + encode(first, bl); + encode(inode, bl, features); + encode(xattrs, bl); + ENCODE_FINISH(bl); +} + +template<template<typename> class Allocator> +void old_inode_t<Allocator>::decode(bufferlist::const_iterator& bl) +{ + DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl); + decode(first, bl); + decode(inode, bl); + decode_noshare<Allocator>(xattrs, bl); + DECODE_FINISH(bl); +} + +template<template<typename> class Allocator> +void old_inode_t<Allocator>::dump(Formatter *f) const +{ + f->dump_unsigned("first", first); + inode.dump(f); + f->open_object_section("xattrs"); + for (const auto &p : xattrs) { + std::string v(p.second.c_str(), p.second.length()); + f->dump_string(p.first.c_str(), v); + } + f->close_section(); +} + +template<template<typename> class Allocator> +void old_inode_t<Allocator>::generate_test_instances(std::list<old_inode_t<Allocator>*>& ls) +{ + ls.push_back(new old_inode_t<Allocator>); + ls.push_back(new old_inode_t<Allocator>); + ls.back()->first = 2; + std::list<inode_t<Allocator>*> ils; + inode_t<Allocator>::generate_test_instances(ils); + ls.back()->inode = *ils.back(); + ls.back()->xattrs["user.foo"] = buffer::copy("asdf", 4); + ls.back()->xattrs["user.unprintable"] = buffer::copy("\000\001\002", 3); +} + +template<template<typename> class Allocator> +inline void encode(const old_inode_t<Allocator> &c, ::ceph::bufferlist &bl, uint64_t features) +{ + ENCODE_DUMP_PRE(); + c.encode(bl, features); + ENCODE_DUMP_POST(cl); +} +template<template<typename> class Allocator> +inline void decode(old_inode_t<Allocator> &c, ::ceph::bufferlist::const_iterator &p) +{ + c.decode(p); +} + + +/* + * like an inode, but for a dir frag + */ +struct fnode_t { + version_t version = 0; + snapid_t snap_purged_thru; // the max_last_destroy snapid we've been purged thru + frag_info_t fragstat, 
accounted_fragstat; + nest_info_t rstat, accounted_rstat; + damage_flags_t damage_flags = 0; + + // we know we and all our descendants have been scrubbed since this version + version_t recursive_scrub_version = 0; + utime_t recursive_scrub_stamp; + // version at which we last scrubbed our personal data structures + version_t localized_scrub_version = 0; + utime_t localized_scrub_stamp; + + void encode(bufferlist &bl) const; + void decode(bufferlist::const_iterator& bl); + void dump(Formatter *f) const; + static void generate_test_instances(list<fnode_t*>& ls); + fnode_t() {} +}; +WRITE_CLASS_ENCODER(fnode_t) + + +struct old_rstat_t { + snapid_t first; + nest_info_t rstat, accounted_rstat; + + void encode(bufferlist& bl) const; + void decode(bufferlist::const_iterator& p); + void dump(Formatter *f) const; + static void generate_test_instances(list<old_rstat_t*>& ls); +}; +WRITE_CLASS_ENCODER(old_rstat_t) + +inline std::ostream& operator<<(std::ostream& out, const old_rstat_t& o) { + return out << "old_rstat(first " << o.first << " " << o.rstat << " " << o.accounted_rstat << ")"; +} + +/* + * feature_bitset_t + */ +class feature_bitset_t { +public: + typedef uint64_t block_type; + static const size_t bits_per_block = sizeof(block_type) * 8; + + feature_bitset_t(const feature_bitset_t& other) : _vec(other._vec) {} + feature_bitset_t(feature_bitset_t&& other) : _vec(std::move(other._vec)) {} + feature_bitset_t(unsigned long value = 0); + feature_bitset_t(const vector<size_t>& array); + feature_bitset_t& operator=(const feature_bitset_t& other) { + _vec = other._vec; + return *this; + } + feature_bitset_t& operator=(feature_bitset_t&& other) { + _vec = std::move(other._vec); + return *this; + } + bool empty() const { + for (auto& v : _vec) { + if (v) + return false; + } + return true; + } + bool test(size_t bit) const { + if (bit >= bits_per_block * _vec.size()) + return false; + return _vec[bit / bits_per_block] & ((block_type)1 << (bit % bits_per_block)); + } + void 
clear() { + _vec.clear(); + } + feature_bitset_t& operator-=(const feature_bitset_t& other); + void encode(bufferlist& bl) const; + void decode(bufferlist::const_iterator &p); + void print(ostream& out) const; +private: + vector<block_type> _vec; +}; +WRITE_CLASS_ENCODER(feature_bitset_t) + +inline std::ostream& operator<<(std::ostream& out, const feature_bitset_t& s) { + s.print(out); + return out; +} + +/* + * client_metadata_t + */ +struct client_metadata_t { + using kv_map_t = std::map<std::string,std::string>; + using iterator = kv_map_t::const_iterator; + + kv_map_t kv_map; + feature_bitset_t features; + + client_metadata_t() {} + client_metadata_t(const client_metadata_t& other) : + kv_map(other.kv_map), features(other.features) {} + client_metadata_t(client_metadata_t&& other) : + kv_map(std::move(other.kv_map)), features(std::move(other.features)) {} + client_metadata_t(kv_map_t&& kv, feature_bitset_t &&f) : + kv_map(std::move(kv)), features(std::move(f)) {} + client_metadata_t(const kv_map_t& kv, const feature_bitset_t &f) : + kv_map(kv), features(f) {} + client_metadata_t& operator=(const client_metadata_t& other) { + kv_map = other.kv_map; + features = other.features; + return *this; + } + + bool empty() const { return kv_map.empty() && features.empty(); } + iterator find(const std::string& key) const { return kv_map.find(key); } + iterator begin() const { return kv_map.begin(); } + iterator end() const { return kv_map.end(); } + void erase(iterator it) { kv_map.erase(it); } + std::string& operator[](const std::string& key) { return kv_map[key]; } + void merge(const client_metadata_t& other) { + kv_map.insert(other.kv_map.begin(), other.kv_map.end()); + features = other.features; + } + void clear() { + kv_map.clear(); + features.clear(); + } + + void encode(bufferlist& bl) const; + void decode(bufferlist::const_iterator& p); + void dump(Formatter *f) const; +}; +WRITE_CLASS_ENCODER(client_metadata_t) + +/* + * session_info_t + */ +struct session_info_t 
{ + entity_inst_t inst; + std::map<ceph_tid_t,inodeno_t> completed_requests; + interval_set<inodeno_t> prealloc_inos; // preallocated, ready to use. + interval_set<inodeno_t> used_inos; // journaling use + client_metadata_t client_metadata; + std::set<ceph_tid_t> completed_flushes; + EntityName auth_name; + + client_t get_client() const { return client_t(inst.name.num()); } + bool has_feature(size_t bit) const { return client_metadata.features.test(bit); } + const entity_name_t& get_source() const { return inst.name; } + + void clear_meta() { + prealloc_inos.clear(); + used_inos.clear(); + completed_requests.clear(); + completed_flushes.clear(); + client_metadata.clear(); + } + + void encode(bufferlist& bl, uint64_t features) const; + void decode(bufferlist::const_iterator& p); + void dump(Formatter *f) const; + static void generate_test_instances(list<session_info_t*>& ls); +}; +WRITE_CLASS_ENCODER_FEATURES(session_info_t) + + +// ======= +// dentries + +struct dentry_key_t { + snapid_t snapid = 0; + std::string_view name; + __u32 hash = 0; + dentry_key_t() {} + dentry_key_t(snapid_t s, std::string_view n, __u32 h=0) : + snapid(s), name(n), hash(h) {} + + bool is_valid() { return name.length() || snapid; } + + // encode into something that can be decoded as a string. 
+ // name_ (head) or name_%x (!head) + void encode(bufferlist& bl) const { + string key; + encode(key); + using ceph::encode; + encode(key, bl); + } + void encode(string& key) const { + char b[20]; + if (snapid != CEPH_NOSNAP) { + uint64_t val(snapid); + snprintf(b, sizeof(b), "%" PRIx64, val); + } else { + snprintf(b, sizeof(b), "%s", "head"); + } + ostringstream oss; + oss << name << "_" << b; + key = oss.str(); + } + static void decode_helper(bufferlist::const_iterator& bl, string& nm, snapid_t& sn) { + string key; + decode(key, bl); + decode_helper(key, nm, sn); + } + static void decode_helper(std::string_view key, string& nm, snapid_t& sn) { + size_t i = key.find_last_of('_'); + ceph_assert(i != string::npos); + if (key.compare(i+1, std::string_view::npos, "head") == 0) { + // name_head + sn = CEPH_NOSNAP; + } else { + // name_%x + long long unsigned x = 0; + std::string x_str(key.substr(i+1)); + sscanf(x_str.c_str(), "%llx", &x); + sn = x; + } + nm = key.substr(0, i); + } +}; + +inline std::ostream& operator<<(std::ostream& out, const dentry_key_t &k) +{ + return out << "(" << k.name << "," << k.snapid << ")"; +} + +inline bool operator<(const dentry_key_t& k1, const dentry_key_t& k2) +{ + /* + * order by hash, name, snap + */ + int c = ceph_frag_value(k1.hash) - ceph_frag_value(k2.hash); + if (c) + return c < 0; + c = k1.name.compare(k2.name); + if (c) + return c < 0; + return k1.snapid < k2.snapid; +} + + +/* + * string_snap_t is a simple (string, snapid_t) pair + */ +struct string_snap_t { + string name; + snapid_t snapid; + string_snap_t() {} + string_snap_t(std::string_view n, snapid_t s) : name(n), snapid(s) {} + + void encode(bufferlist& bl) const; + void decode(bufferlist::const_iterator& p); + void dump(Formatter *f) const; + static void generate_test_instances(list<string_snap_t*>& ls); +}; +WRITE_CLASS_ENCODER(string_snap_t) + +inline bool operator<(const string_snap_t& l, const string_snap_t& r) { + int c = l.name.compare(r.name); + return c < 0 
|| (c == 0 && l.snapid < r.snapid); +} + +inline std::ostream& operator<<(std::ostream& out, const string_snap_t &k) +{ + return out << "(" << k.name << "," << k.snapid << ")"; +} + +/* + * mds_table_pending_t + * + * mds's requesting any pending ops. child needs to encode the corresponding + * pending mutation state in the table. + */ +struct mds_table_pending_t { + uint64_t reqid = 0; + __s32 mds = 0; + version_t tid = 0; + mds_table_pending_t() {} + void encode(bufferlist& bl) const; + void decode(bufferlist::const_iterator& bl); + void dump(Formatter *f) const; + static void generate_test_instances(list<mds_table_pending_t*>& ls); +}; +WRITE_CLASS_ENCODER(mds_table_pending_t) + + +// ========= +// requests + +struct metareqid_t { + entity_name_t name; + uint64_t tid = 0; + metareqid_t() {} + metareqid_t(entity_name_t n, ceph_tid_t t) : name(n), tid(t) {} + void encode(bufferlist& bl) const { + using ceph::encode; + encode(name, bl); + encode(tid, bl); + } + void decode(bufferlist::const_iterator &p) { + using ceph::decode; + decode(name, p); + decode(tid, p); + } +}; +WRITE_CLASS_ENCODER(metareqid_t) + +inline std::ostream& operator<<(std::ostream& out, const metareqid_t& r) { + return out << r.name << ":" << r.tid; +} + +inline bool operator==(const metareqid_t& l, const metareqid_t& r) { + return (l.name == r.name) && (l.tid == r.tid); +} +inline bool operator!=(const metareqid_t& l, const metareqid_t& r) { + return (l.name != r.name) || (l.tid != r.tid); +} +inline bool operator<(const metareqid_t& l, const metareqid_t& r) { + return (l.name < r.name) || + (l.name == r.name && l.tid < r.tid); +} +inline bool operator<=(const metareqid_t& l, const metareqid_t& r) { + return (l.name < r.name) || + (l.name == r.name && l.tid <= r.tid); +} +inline bool operator>(const metareqid_t& l, const metareqid_t& r) { return !(l <= r); } +inline bool operator>=(const metareqid_t& l, const metareqid_t& r) { return !(l < r); } + +namespace std { + template<> struct 
hash<metareqid_t> { + size_t operator()(const metareqid_t &r) const { + hash<uint64_t> H; + return H(r.name.num()) ^ H(r.name.type()) ^ H(r.tid); + } + }; +} // namespace std + + +// cap info for client reconnect +struct cap_reconnect_t { + string path; + mutable ceph_mds_cap_reconnect capinfo; + snapid_t snap_follows; + bufferlist flockbl; + + cap_reconnect_t() { + // FIPS zeroization audit 20191117: this memset is not security related. + memset(&capinfo, 0, sizeof(capinfo)); + snap_follows = 0; + } + cap_reconnect_t(uint64_t cap_id, inodeno_t pino, std::string_view p, int w, int i, + inodeno_t sr, snapid_t sf, bufferlist& lb) : + path(p) { + capinfo.cap_id = cap_id; + capinfo.wanted = w; + capinfo.issued = i; + capinfo.snaprealm = sr; + capinfo.pathbase = pino; + capinfo.flock_len = 0; + snap_follows = sf; + flockbl.claim(lb); + } + void encode(bufferlist& bl) const; + void decode(bufferlist::const_iterator& bl); + void encode_old(bufferlist& bl) const; + void decode_old(bufferlist::const_iterator& bl); + + void dump(Formatter *f) const; + static void generate_test_instances(list<cap_reconnect_t*>& ls); +}; +WRITE_CLASS_ENCODER(cap_reconnect_t) + +struct snaprealm_reconnect_t { + mutable ceph_mds_snaprealm_reconnect realm; + + snaprealm_reconnect_t() { + // FIPS zeroization audit 20191117: this memset is not security related. 
+ memset(&realm, 0, sizeof(realm)); + } + snaprealm_reconnect_t(inodeno_t ino, snapid_t seq, inodeno_t parent) { + realm.ino = ino; + realm.seq = seq; + realm.parent = parent; + } + void encode(bufferlist& bl) const; + void decode(bufferlist::const_iterator& bl); + void encode_old(bufferlist& bl) const; + void decode_old(bufferlist::const_iterator& bl); + + void dump(Formatter *f) const; + static void generate_test_instances(list<snaprealm_reconnect_t*>& ls); +}; +WRITE_CLASS_ENCODER(snaprealm_reconnect_t) + +// compat for pre-FLOCK feature +struct old_ceph_mds_cap_reconnect { + ceph_le64 cap_id; + ceph_le32 wanted; + ceph_le32 issued; + ceph_le64 old_size; + struct ceph_timespec old_mtime, old_atime; + ceph_le64 snaprealm; + ceph_le64 pathbase; /* base ino for our path to this ino */ +} __attribute__ ((packed)); +WRITE_RAW_ENCODER(old_ceph_mds_cap_reconnect) + +struct old_cap_reconnect_t { + string path; + old_ceph_mds_cap_reconnect capinfo; + + const old_cap_reconnect_t& operator=(const cap_reconnect_t& n) { + path = n.path; + capinfo.cap_id = n.capinfo.cap_id; + capinfo.wanted = n.capinfo.wanted; + capinfo.issued = n.capinfo.issued; + capinfo.snaprealm = n.capinfo.snaprealm; + capinfo.pathbase = n.capinfo.pathbase; + return *this; + } + operator cap_reconnect_t() { + cap_reconnect_t n; + n.path = path; + n.capinfo.cap_id = capinfo.cap_id; + n.capinfo.wanted = capinfo.wanted; + n.capinfo.issued = capinfo.issued; + n.capinfo.snaprealm = capinfo.snaprealm; + n.capinfo.pathbase = capinfo.pathbase; + return n; + } + + void encode(bufferlist& bl) const { + using ceph::encode; + encode(path, bl); + encode(capinfo, bl); + } + void decode(bufferlist::const_iterator& bl) { + using ceph::decode; + decode(path, bl); + decode(capinfo, bl); + } +}; +WRITE_CLASS_ENCODER(old_cap_reconnect_t) + + +// ================================================================ +// dir frag + +struct dirfrag_t { + inodeno_t ino = 0; + frag_t frag; + + dirfrag_t() {} + dirfrag_t(inodeno_t i, 
frag_t f) : ino(i), frag(f) { } + + void encode(bufferlist& bl) const { + using ceph::encode; + encode(ino, bl); + encode(frag, bl); + } + void decode(bufferlist::const_iterator& bl) { + using ceph::decode; + decode(ino, bl); + decode(frag, bl); + } +}; +WRITE_CLASS_ENCODER(dirfrag_t) + + +inline std::ostream& operator<<(std::ostream& out, const dirfrag_t &df) { + out << df.ino; + if (!df.frag.is_root()) out << "." << df.frag; + return out; +} +inline bool operator<(dirfrag_t l, dirfrag_t r) { + if (l.ino < r.ino) return true; + if (l.ino == r.ino && l.frag < r.frag) return true; + return false; +} +inline bool operator==(dirfrag_t l, dirfrag_t r) { + return l.ino == r.ino && l.frag == r.frag; +} + +namespace std { + template<> struct hash<dirfrag_t> { + size_t operator()(const dirfrag_t &df) const { + static rjhash<uint64_t> H; + static rjhash<uint32_t> I; + return H(df.ino) ^ I(df.frag); + } + }; +} // namespace std + + + +// ================================================================ + +#define META_POP_IRD 0 +#define META_POP_IWR 1 +#define META_POP_READDIR 2 +#define META_POP_FETCH 3 +#define META_POP_STORE 4 +#define META_NPOP 5 + +class inode_load_vec_t { +public: + using time = DecayCounter::time; + using clock = DecayCounter::clock; + static const size_t NUM = 2; + + inode_load_vec_t() : vec{DecayCounter(DecayRate()), DecayCounter(DecayRate())} {} + inode_load_vec_t(const DecayRate &rate) : vec{DecayCounter(rate), DecayCounter(rate)} {} + + DecayCounter &get(int t) { + return vec[t]; + } + void zero() { + for (auto &d : vec) { + d.reset(); + } + } + void encode(bufferlist &bl) const; + void decode(bufferlist::const_iterator& p); + void dump(Formatter *f) const; + static void generate_test_instances(list<inode_load_vec_t*>& ls); + +private: + std::array<DecayCounter, NUM> vec; +}; +inline void encode(const inode_load_vec_t &c, bufferlist &bl) { + c.encode(bl); +} +inline void decode(inode_load_vec_t & c, bufferlist::const_iterator &p) { + c.decode(p); 
+} + +class dirfrag_load_vec_t { +public: + using time = DecayCounter::time; + using clock = DecayCounter::clock; + static const size_t NUM = 5; + + dirfrag_load_vec_t() : + vec{DecayCounter(DecayRate()), + DecayCounter(DecayRate()), + DecayCounter(DecayRate()), + DecayCounter(DecayRate()), + DecayCounter(DecayRate()) + } + {} + dirfrag_load_vec_t(const DecayRate &rate) : + vec{DecayCounter(rate), DecayCounter(rate), DecayCounter(rate), DecayCounter(rate), DecayCounter(rate)} + {} + + void encode(bufferlist &bl) const { + ENCODE_START(2, 2, bl); + for (const auto &i : vec) { + encode(i, bl); + } + ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator &p) { + DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, p); + for (auto &i : vec) { + decode(i, p); + } + DECODE_FINISH(p); + } + void dump(Formatter *f) const; + void dump(Formatter *f, const DecayRate& rate) const; + static void generate_test_instances(std::list<dirfrag_load_vec_t*>& ls); + + const DecayCounter &get(int t) const { + return vec[t]; + } + DecayCounter &get(int t) { + return vec[t]; + } + void adjust(double d) { + for (auto &i : vec) { + i.adjust(d); + } + } + void zero() { + for (auto &i : vec) { + i.reset(); + } + } + double meta_load() const { + return + 1*vec[META_POP_IRD].get() + + 2*vec[META_POP_IWR].get() + + 1*vec[META_POP_READDIR].get() + + 2*vec[META_POP_FETCH].get() + + 4*vec[META_POP_STORE].get(); + } + + void add(dirfrag_load_vec_t& r) { + for (size_t i=0; i<dirfrag_load_vec_t::NUM; i++) + vec[i].adjust(r.vec[i].get()); + } + void sub(dirfrag_load_vec_t& r) { + for (size_t i=0; i<dirfrag_load_vec_t::NUM; i++) + vec[i].adjust(-r.vec[i].get()); + } + void scale(double f) { + for (size_t i=0; i<dirfrag_load_vec_t::NUM; i++) + vec[i].scale(f); + } + +private: + friend inline std::ostream& operator<<(std::ostream& out, const dirfrag_load_vec_t& dl); + std::array<DecayCounter, NUM> vec; +}; + +inline void encode(const dirfrag_load_vec_t &c, bufferlist &bl) { + c.encode(bl); +} +inline void 
decode(dirfrag_load_vec_t& c, bufferlist::const_iterator &p) { + c.decode(p); +} + +inline std::ostream& operator<<(std::ostream& out, const dirfrag_load_vec_t& dl) +{ + std::ostringstream ss; + ss << std::setprecision(1) << std::fixed + << "[pop" + " IRD:" << dl.vec[0] + << " IWR:" << dl.vec[1] + << " RDR:" << dl.vec[2] + << " FET:" << dl.vec[3] + << " STR:" << dl.vec[4] + << " *LOAD:" << dl.meta_load() << "]"; + return out << ss.str() << std::endl; +} + + +/* mds_load_t + * mds load + */ + +struct mds_load_t { + using clock = dirfrag_load_vec_t::clock; + using time = dirfrag_load_vec_t::time; + + dirfrag_load_vec_t auth; + dirfrag_load_vec_t all; + + mds_load_t() : auth(DecayRate()), all(DecayRate()) {} + mds_load_t(const DecayRate &rate) : auth(rate), all(rate) {} + + double req_rate = 0.0; + double cache_hit_rate = 0.0; + double queue_len = 0.0; + + double cpu_load_avg = 0.0; + + double mds_load() const; // defiend in MDBalancer.cc + void encode(bufferlist& bl) const; + void decode(bufferlist::const_iterator& bl); + void dump(Formatter *f) const; + static void generate_test_instances(std::list<mds_load_t*>& ls); +}; +inline void encode(const mds_load_t &c, bufferlist &bl) { + c.encode(bl); +} +inline void decode(mds_load_t &c, bufferlist::const_iterator &p) { + c.decode(p); +} + +inline std::ostream& operator<<(std::ostream& out, const mds_load_t& load) +{ + return out << "mdsload<" << load.auth << "/" << load.all + << ", req " << load.req_rate + << ", hr " << load.cache_hit_rate + << ", qlen " << load.queue_len + << ", cpu " << load.cpu_load_avg + << ">"; +} + +class load_spread_t { +public: + using time = DecayCounter::time; + using clock = DecayCounter::clock; + static const int MAX = 4; + int last[MAX]; + int p = 0, n = 0; + DecayCounter count; + +public: + load_spread_t() = delete; + load_spread_t(const DecayRate &rate) : count(rate) + { + for (int i=0; i<MAX; i++) + last[i] = -1; + } + + double hit(int who) { + for (int i=0; i<n; i++) + if (last[i] == 
who) + return count.get_last(); + + // we're new(ish) + last[p++] = who; + if (n < MAX) n++; + if (n == 1) return 0.0; + + if (p == MAX) p = 0; + + return count.hit(); + } + double get() const { + return count.get(); + } +}; + + + +// ================================================================ +typedef std::pair<mds_rank_t, mds_rank_t> mds_authority_t; + +// -- authority delegation -- +// directory authority types +// >= 0 is the auth mds +#define CDIR_AUTH_PARENT mds_rank_t(-1) // default +#define CDIR_AUTH_UNKNOWN mds_rank_t(-2) +#define CDIR_AUTH_DEFAULT mds_authority_t(CDIR_AUTH_PARENT, CDIR_AUTH_UNKNOWN) +#define CDIR_AUTH_UNDEF mds_authority_t(CDIR_AUTH_UNKNOWN, CDIR_AUTH_UNKNOWN) +//#define CDIR_AUTH_ROOTINODE pair<int,int>( 0, -2) + +class MDSCacheObjectInfo { +public: + inodeno_t ino = 0; + dirfrag_t dirfrag; + string dname; + snapid_t snapid; + + MDSCacheObjectInfo() {} + + void encode(bufferlist& bl) const; + void decode(bufferlist::const_iterator& bl); + void dump(Formatter *f) const; + static void generate_test_instances(list<MDSCacheObjectInfo*>& ls); +}; + +inline std::ostream& operator<<(std::ostream& out, const MDSCacheObjectInfo &info) { + if (info.ino) return out << info.ino << "." << info.snapid; + if (info.dname.length()) return out << info.dirfrag << "/" << info.dname + << " snap " << info.snapid; + return out << info.dirfrag; +} + +inline bool operator==(const MDSCacheObjectInfo& l, const MDSCacheObjectInfo& r) { + if (l.ino || r.ino) + return l.ino == r.ino && l.snapid == r.snapid; + else + return l.dirfrag == r.dirfrag && l.dname == r.dname; +} +WRITE_CLASS_ENCODER(MDSCacheObjectInfo) + + +// parse a map of keys/values. 
+namespace qi = boost::spirit::qi; + +template <typename Iterator> +struct keys_and_values + : qi::grammar<Iterator, std::map<string, string>()> +{ + keys_and_values() + : keys_and_values::base_type(query) + { + query = pair >> *(qi::lit(' ') >> pair); + pair = key >> '=' >> value; + key = qi::char_("a-zA-Z_") >> *qi::char_("a-zA-Z_0-9"); + value = +qi::char_("a-zA-Z0-9-_."); + } + qi::rule<Iterator, std::map<string, string>()> query; + qi::rule<Iterator, std::pair<string, string>()> pair; + qi::rule<Iterator, string()> key, value; +}; + +#endif diff --git a/src/mds/snap.cc b/src/mds/snap.cc new file mode 100644 index 00000000..e53daef2 --- /dev/null +++ b/src/mds/snap.cc @@ -0,0 +1,218 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004- Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#include <string_view> + +#include "snap.h" + +#include "common/Formatter.h" + +/* + * SnapInfo + */ + +void SnapInfo::encode(bufferlist& bl) const +{ + ENCODE_START(2, 2, bl); + encode(snapid, bl); + encode(ino, bl); + encode(stamp, bl); + encode(name, bl); + ENCODE_FINISH(bl); +} + +void SnapInfo::decode(bufferlist::const_iterator& bl) +{ + DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl); + decode(snapid, bl); + decode(ino, bl); + decode(stamp, bl); + decode(name, bl); + DECODE_FINISH(bl); +} + +void SnapInfo::dump(Formatter *f) const +{ + f->dump_unsigned("snapid", snapid); + f->dump_unsigned("ino", ino); + f->dump_stream("stamp") << stamp; + f->dump_string("name", name); +} + +void SnapInfo::generate_test_instances(list<SnapInfo*>& ls) +{ + ls.push_back(new SnapInfo); + ls.push_back(new SnapInfo); + ls.back()->snapid = 1; + ls.back()->ino = 2; + ls.back()->stamp = utime_t(3, 4); + ls.back()->name = "foo"; +} + +ostream& operator<<(ostream& out, const SnapInfo &sn) +{ + return out << "snap(" << sn.snapid + << " " << sn.ino + << " '" << sn.name + << "' " << sn.stamp << ")"; +} + +std::string_view SnapInfo::get_long_name() const +{ + if (long_name.empty() || + long_name.compare(1, name.size(), name) || + long_name.find_last_of("_") != name.size() + 1) { + char nm[80]; + snprintf(nm, sizeof(nm), "_%s_%llu", name.c_str(), (unsigned long long)ino); + long_name = nm; + } + return long_name; +} + +/* + * snaplink_t + */ + +void snaplink_t::encode(bufferlist& bl) const +{ + ENCODE_START(2, 2, bl); + encode(ino, bl); + encode(first, bl); + ENCODE_FINISH(bl); +} + +void snaplink_t::decode(bufferlist::const_iterator& bl) +{ + DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl); + decode(ino, bl); + decode(first, bl); + DECODE_FINISH(bl); +} + +void snaplink_t::dump(Formatter *f) const +{ + f->dump_unsigned("ino", ino); + f->dump_unsigned("first", first); +} + +void snaplink_t::generate_test_instances(list<snaplink_t*>& ls) +{ + ls.push_back(new snaplink_t); + 
ls.push_back(new snaplink_t); + ls.back()->ino = 2; + ls.back()->first = 123; +} + +ostream& operator<<(ostream& out, const snaplink_t &l) +{ + return out << l.ino << "@" << l.first; +} + +/* + * sr_t + */ + +void sr_t::encode(bufferlist& bl) const +{ + ENCODE_START(6, 4, bl); + encode(seq, bl); + encode(created, bl); + encode(last_created, bl); + encode(last_destroyed, bl); + encode(current_parent_since, bl); + encode(snaps, bl); + encode(past_parents, bl); + encode(past_parent_snaps, bl); + encode(flags, bl); + ENCODE_FINISH(bl); +} + +void sr_t::decode(bufferlist::const_iterator& p) +{ + DECODE_START_LEGACY_COMPAT_LEN(6, 4, 4, p); + if (struct_v == 2) { + __u8 struct_v; + decode(struct_v, p); // yes, really: extra byte for v2 encoding only, see 6ee52e7d. + } + decode(seq, p); + decode(created, p); + decode(last_created, p); + decode(last_destroyed, p); + decode(current_parent_since, p); + decode(snaps, p); + decode(past_parents, p); + if (struct_v >= 5) + decode(past_parent_snaps, p); + if (struct_v >= 6) + decode(flags, p); + else + flags = 0; + DECODE_FINISH(p); +} + +void sr_t::dump(Formatter *f) const +{ + f->dump_unsigned("seq", seq); + f->dump_unsigned("created", created); + f->dump_unsigned("last_created", last_created); + f->dump_unsigned("last_destroyed", last_destroyed); + f->dump_unsigned("current_parent_since", current_parent_since); + + f->open_array_section("snaps"); + for (map<snapid_t,SnapInfo>::const_iterator p = snaps.begin(); p != snaps.end(); ++p) { + f->open_object_section("snapinfo"); + f->dump_unsigned("last", p->first); + p->second.dump(f); + f->close_section(); + } + f->close_section(); + + f->open_array_section("past_parents"); + for (map<snapid_t,snaplink_t>::const_iterator p = past_parents.begin(); p != past_parents.end(); ++p) { + f->open_object_section("past_parent"); + f->dump_unsigned("last", p->first); + p->second.dump(f); + f->close_section(); + } + f->close_section(); + + f->open_array_section("past_parent_snaps"); + for (auto 
p = past_parent_snaps.begin(); p != past_parent_snaps.end(); ++p) { + f->open_object_section("snapinfo"); + f->dump_unsigned("snapid", *p); + f->close_section(); + } + f->close_section(); +} + +void sr_t::generate_test_instances(list<sr_t*>& ls) +{ + ls.push_back(new sr_t); + ls.push_back(new sr_t); + ls.back()->seq = 1; + ls.back()->created = 2; + ls.back()->last_created = 3; + ls.back()->last_destroyed = 4; + ls.back()->current_parent_since = 5; + ls.back()->snaps[123].snapid = 7; + ls.back()->snaps[123].ino = 8; + ls.back()->snaps[123].stamp = utime_t(9, 10); + ls.back()->snaps[123].name = "name1"; + ls.back()->past_parents[12].ino = 12; + ls.back()->past_parents[12].first = 3; + + ls.back()->past_parent_snaps.insert(5); + ls.back()->past_parent_snaps.insert(6); +} + diff --git a/src/mds/snap.h b/src/mds/snap.h new file mode 100644 index 00000000..41f48d80 --- /dev/null +++ b/src/mds/snap.h @@ -0,0 +1,115 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_MDS_SNAP_H +#define CEPH_MDS_SNAP_H + +#include <string_view> + +#include "mdstypes.h" +#include "common/snap_types.h" + +/* + * generic snap descriptor. 
+ */ +struct SnapInfo { + snapid_t snapid; + inodeno_t ino; + utime_t stamp; + string name; + + mutable string long_name; ///< cached _$ino_$name + + void encode(bufferlist &bl) const; + void decode(bufferlist::const_iterator &bl); + void dump(Formatter *f) const; + static void generate_test_instances(list<SnapInfo*>& ls); + + std::string_view get_long_name() const; +}; +WRITE_CLASS_ENCODER(SnapInfo) + +inline bool operator==(const SnapInfo &l, const SnapInfo &r) +{ + return l.snapid == r.snapid && l.ino == r.ino && + l.stamp == r.stamp && l.name == r.name; +} + +ostream& operator<<(ostream& out, const SnapInfo &sn); + + +/* + * SnapRealm - a subtree that shares the same set of snapshots. + */ +struct SnapRealm; +class CInode; +class MDCache; + + + +#include "Capability.h" + +struct snaplink_t { + inodeno_t ino; + snapid_t first; + + void encode(bufferlist &bl) const; + void decode(bufferlist::const_iterator &bl); + void dump(Formatter *f) const; + static void generate_test_instances(list<snaplink_t*>& ls); +}; +WRITE_CLASS_ENCODER(snaplink_t) + +ostream& operator<<(ostream& out, const snaplink_t &l); + + +// carry data about a specific version of a SnapRealm +struct sr_t { + snapid_t seq; // basically, a version/seq # for changes to _this_ realm. + snapid_t created; // when this realm was created. + snapid_t last_created; // last snap created in _this_ realm. 
+ snapid_t last_destroyed; // seq for last removal + snapid_t current_parent_since; + map<snapid_t, SnapInfo> snaps; + map<snapid_t, snaplink_t> past_parents; // key is "last" (or NOSNAP) + set<snapid_t> past_parent_snaps; + + __u32 flags; + enum { + PARENT_GLOBAL = 1 << 0, + SUBVOLUME = 1 << 1, + }; + + void mark_parent_global() { flags |= PARENT_GLOBAL; } + void clear_parent_global() { flags &= ~PARENT_GLOBAL; } + bool is_parent_global() const { return flags & PARENT_GLOBAL; } + + void mark_subvolume() { flags |= SUBVOLUME; } + void clear_subvolume() { flags &= ~SUBVOLUME; } + bool is_subvolume() const { return flags & SUBVOLUME; } + + sr_t() + : seq(0), created(0), + last_created(0), last_destroyed(0), + current_parent_since(1), flags(0) + {} + + void encode(bufferlist &bl) const; + void decode(bufferlist::const_iterator &bl); + void dump(Formatter *f) const; + static void generate_test_instances(list<sr_t*>& ls); +}; +WRITE_CLASS_ENCODER(sr_t) + +#endif |