author      Daniel Baumann <daniel.baumann@progress-linux.org>   2024-05-23 16:45:17 +0000
committer   Daniel Baumann <daniel.baumann@progress-linux.org>   2024-05-23 16:45:44 +0000
commit      17d6a993fc17d533460c5f40f3908c708e057c18 (patch)
tree        1a3bd93e0ecd74fa02f93a528fe2f87e5314c4b5 /src/mds
parent      Releasing progress-linux version 18.2.2-0progress7.99u1. (diff)
download    ceph-17d6a993fc17d533460c5f40f3908c708e057c18.tar.xz
            ceph-17d6a993fc17d533460c5f40f3908c708e057c18.zip
Merging upstream version 18.2.3.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/mds')
-rw-r--r--  src/mds/Beacon.cc              22
-rw-r--r--  src/mds/CDentry.cc              2
-rw-r--r--  src/mds/CDentry.h               2
-rw-r--r--  src/mds/CDir.cc                 1
-rw-r--r--  src/mds/CInode.cc              32
-rw-r--r--  src/mds/CInode.h                2
-rw-r--r--  src/mds/CMakeLists.txt          1
-rw-r--r--  src/mds/Capability.h            2
-rw-r--r--  src/mds/DamageTable.cc         28
-rw-r--r--  src/mds/DamageTable.h           7
-rw-r--r--  src/mds/FSMap.cc                3
-rw-r--r--  src/mds/Locker.cc              61
-rw-r--r--  src/mds/Locker.h                4
-rw-r--r--  src/mds/MDBalancer.cc          37
-rw-r--r--  src/mds/MDCache.cc             67
-rw-r--r--  src/mds/MDCache.h               1
-rw-r--r--  src/mds/MDLog.cc                7
-rw-r--r--  src/mds/MDSAuthCaps.cc         58
-rw-r--r--  src/mds/MDSAuthCaps.h         116
-rw-r--r--  src/mds/MDSDaemon.cc            6
-rw-r--r--  src/mds/MDSMap.cc              18
-rw-r--r--  src/mds/MDSMap.h               24
-rw-r--r--  src/mds/MDSMetaRequest.h       33
-rw-r--r--  src/mds/MDSRank.cc             54
-rw-r--r--  src/mds/MDSRank.h               8
-rw-r--r--  src/mds/MetricAggregator.cc   169
-rw-r--r--  src/mds/MetricAggregator.h      5
-rw-r--r--  src/mds/Mutation.h              7
-rw-r--r--  src/mds/OpenFileTable.h         2
-rw-r--r--  src/mds/ScrubStack.cc           3
-rw-r--r--  src/mds/Server.cc             285
-rw-r--r--  src/mds/Server.h               15
-rw-r--r--  src/mds/SessionMap.cc           3
-rw-r--r--  src/mds/SessionMap.h            5
-rw-r--r--  src/mds/SnapRealm.cc           33
-rw-r--r--  src/mds/StrayManager.cc        30
-rw-r--r--  src/mds/StrayManager.h         17
-rw-r--r--  src/mds/cephfs_features.cc      1
-rw-r--r--  src/mds/cephfs_features.h       4
-rw-r--r--  src/mds/locks.c                 2
40 files changed, 949 insertions(+), 228 deletions(-)
diff --git a/src/mds/Beacon.cc b/src/mds/Beacon.cc
index 35f0f7942..5dd319a14 100644
--- a/src/mds/Beacon.cc
+++ b/src/mds/Beacon.cc
@@ -495,6 +495,28 @@ void Beacon::notify_health(MDSRank const *mds)
MDSHealthMetric m(MDS_HEALTH_CACHE_OVERSIZED, HEALTH_WARN, css->strv());
health.metrics.push_back(m);
}
+
+ // Report laggy client(s) due to laggy OSDs
+ {
+ bool defer_client_eviction =
+ g_conf().get_val<bool>("defer_client_eviction_on_laggy_osds")
+ && mds->objecter->with_osdmap([](const OSDMap &map) {
+ return map.any_osd_laggy(); });
+ auto&& laggy_clients = mds->server->get_laggy_clients();
+ if (defer_client_eviction && !laggy_clients.empty()) {
+ std::vector<MDSHealthMetric> laggy_clients_metrics;
+ for (const auto& laggy_client: laggy_clients) {
+ CachedStackStringStream css;
+ *css << "Client " << laggy_client << " is laggy; not evicted"
+ << " because some OSD(s) is/are laggy";
+ MDSHealthMetric m(MDS_HEALTH_CLIENTS_LAGGY, HEALTH_WARN, css->strv());
+ laggy_clients_metrics.emplace_back(std::move(m));
+ }
+ auto&& m = laggy_clients_metrics;
+ health.metrics.insert(std::end(health.metrics), std::cbegin(m),
+ std::cend(m));
+ }
+ }
}
MDSMap::DaemonState Beacon::get_want_state() const
diff --git a/src/mds/CDentry.cc b/src/mds/CDentry.cc
index b6d169b9e..6722f0f2a 100644
--- a/src/mds/CDentry.cc
+++ b/src/mds/CDentry.cc
@@ -702,7 +702,7 @@ bool CDentry::check_corruption(bool load)
{
auto&& snapclient = dir->mdcache->mds->snapclient;
auto next_snap = snapclient->get_last_seq()+1;
- if (first > last || (snapclient->is_server_ready() && first > next_snap)) {
+ if (first > last || (snapclient->is_synced() && first > next_snap)) {
if (load) {
dout(1) << "loaded already corrupt dentry: " << *this << dendl;
corrupt_first_loaded = true;
diff --git a/src/mds/CDentry.h b/src/mds/CDentry.h
index 4cbf24f0c..a0aa02ab3 100644
--- a/src/mds/CDentry.h
+++ b/src/mds/CDentry.h
@@ -377,6 +377,8 @@ public:
mempool::mds_co::map<client_t,ClientLease*> client_lease_map;
std::map<int, std::unique_ptr<BatchOp>> batch_ops;
+ ceph_tid_t reintegration_reqid = 0;
+
protected:
friend class Migrator;
diff --git a/src/mds/CDir.cc b/src/mds/CDir.cc
index 0484c38cc..a8aaf11c0 100644
--- a/src/mds/CDir.cc
+++ b/src/mds/CDir.cc
@@ -3752,6 +3752,7 @@ bool CDir::scrub_local()
mdcache->repair_dirfrag_stats(this);
scrub_infop->header->set_repaired();
good = true;
+ mdcache->mds->damage_table.remove_dentry_damage_entry(this);
}
return good;
}
diff --git a/src/mds/CInode.cc b/src/mds/CInode.cc
index 23cb087c8..71b6081be 100644
--- a/src/mds/CInode.cc
+++ b/src/mds/CInode.cc
@@ -965,11 +965,25 @@ CInode *CInode::get_parent_inode()
return NULL;
}
-bool CInode::is_ancestor_of(const CInode *other) const
+bool CInode::is_ancestor_of(const CInode *other, std::unordered_map<CInode const*,bool>* visited) const
{
+ std::vector<CInode const*> my_visited = {};
while (other) {
- if (other == this)
+ if (visited && other->is_dir()) {
+ if (auto it = visited->find(other); it != visited->end()) {
+ for (auto& in : my_visited) {
+ (*visited)[in] = it->second;
+ }
+ return it->second;
+ }
+ my_visited.push_back(other); /* N.B.: this being non-empty means visited is assumed non-null */
+ }
+ if (other == this) {
+ for (auto& in : my_visited) {
+ (*visited)[in] = true;
+ }
return true;
+ }
const CDentry *pdn = other->get_oldest_parent_dn();
if (!pdn) {
ceph_assert(other->is_base());
@@ -977,6 +991,9 @@ bool CInode::is_ancestor_of(const CInode *other) const
}
other = pdn->get_dir()->get_inode();
}
+ for (auto& in : my_visited) {
+ (*visited)[in] = false;
+ }
return false;
}
@@ -3457,7 +3474,7 @@ void CInode::remove_client_cap(client_t client)
void CInode::move_to_realm(SnapRealm *realm)
{
- dout(10) << __func__ << " joining realm " << *realm
+ dout(20) << __func__ << " joining realm " << *realm
<< ", leaving realm " << *containing_realm << dendl;
for (auto& p : client_caps) {
containing_realm->remove_cap(p.first, &p.second);
@@ -4788,6 +4805,7 @@ next:
false);
// Flag that we repaired this BT so that it won't go into damagetable
results->backtrace.repaired = true;
+ in->mdcache->mds->damage_table.remove_backtrace_damage_entry(in->ino());
if (in->mdcache->mds->logger)
in->mdcache->mds->logger->inc(l_mds_scrub_backtrace_repaired);
}
@@ -4926,6 +4944,9 @@ next:
<< "freshly-calculated rstats don't match existing ones (will be fixed)";
in->mdcache->repair_inode_stats(in);
results->raw_stats.repaired = true;
+ for (const auto &p : in->dirfrags){
+ in->mdcache->mds->damage_table.remove_dirfrag_damage_entry(p.second);
+ }
} else {
results->raw_stats.error_str
<< "freshly-calculated rstats don't match existing ones";
@@ -5165,6 +5186,11 @@ void CInode::dump(Formatter *f, int flags) const
}
f->close_section();
}
+
+ auto realm = find_snaprealm();
+ inodeno_t subvol_ino = realm->get_subvolume_ino();
+ bool is_subvol = (subvol_ino && subvol_ino == ino());
+ f->dump_bool("is_subvolume", is_subvol);
}
/****** Scrub Stuff *****/
diff --git a/src/mds/CInode.h b/src/mds/CInode.h
index 979b45174..6f965bffa 100644
--- a/src/mds/CInode.h
+++ b/src/mds/CInode.h
@@ -712,7 +712,7 @@ class CInode : public MDSCacheObject, public InodeStoreBase, public Counter<CIno
}
// -- misc --
- bool is_ancestor_of(const CInode *other) const;
+ bool is_ancestor_of(const CInode *other, std::unordered_map<CInode const*,bool>* visited=nullptr) const;
bool is_projected_ancestor_of(const CInode *other) const;
void make_path_string(std::string& s, bool projected=false, const CDentry *use_parent=NULL) const;
diff --git a/src/mds/CMakeLists.txt b/src/mds/CMakeLists.txt
index a12898f38..88c8a1db0 100644
--- a/src/mds/CMakeLists.txt
+++ b/src/mds/CMakeLists.txt
@@ -34,7 +34,6 @@ set(mds_srcs
snap.cc
SessionMap.cc
MDSContext.cc
- MDSAuthCaps.cc
MDLog.cc
MDSCacheObject.cc
Mantle.cc
diff --git a/src/mds/Capability.h b/src/mds/Capability.h
index 3fd6d2ce6..ebc626a22 100644
--- a/src/mds/Capability.h
+++ b/src/mds/Capability.h
@@ -381,7 +381,7 @@ private:
ceph_seq_t mseq = 0;
int suppress = 0;
- unsigned state = 0;
+ uint32_t state = 0;
int lock_cache_allowed = 0;
};
diff --git a/src/mds/DamageTable.cc b/src/mds/DamageTable.cc
index 22802079d..2079d2333 100644
--- a/src/mds/DamageTable.cc
+++ b/src/mds/DamageTable.cc
@@ -15,6 +15,7 @@
#include "common/debug.h"
#include "mds/CDir.h"
+#include "mds/CInode.h"
#include "DamageTable.h"
@@ -200,6 +201,33 @@ bool DamageTable::notify_remote_damaged(inodeno_t ino, std::string_view path)
return false;
}
+void DamageTable::remove_dentry_damage_entry(CDir *dir)
+{
+ if (dentries.count(
+ DirFragIdent(dir->inode->ino(), dir->frag)
+ ) > 0){
+ const auto frag_dentries =
+ dentries.at(DirFragIdent(dir->inode->ino(), dir->frag));
+ for(const auto &i : frag_dentries) {
+ erase(i.second->id);
+ }
+ }
+}
+
+void DamageTable::remove_dirfrag_damage_entry(CDir *dir)
+{
+ if (is_dirfrag_damaged(dir)){
+ erase(dirfrags.find(DirFragIdent(dir->inode->ino(), dir->frag))->second->id);
+ }
+}
+
+void DamageTable::remove_backtrace_damage_entry(inodeno_t ino)
+{
+ if (is_remote_damaged(ino)){
+ erase(remotes.find(ino)->second->id);
+ }
+}
+
bool DamageTable::oversized() const
{
return by_id.size() > (size_t)(g_conf()->mds_damage_table_max_entries);
diff --git a/src/mds/DamageTable.h b/src/mds/DamageTable.h
index 18a61e08b..a1b96fe22 100644
--- a/src/mds/DamageTable.h
+++ b/src/mds/DamageTable.h
@@ -22,6 +22,7 @@
#include "include/random.h"
class CDir;
+class CInode;
typedef uint64_t damage_entry_id_t;
@@ -155,6 +156,12 @@ class DamageTable
*/
bool notify_remote_damaged(inodeno_t ino, std::string_view path);
+ void remove_dentry_damage_entry(CDir *dir);
+
+ void remove_dirfrag_damage_entry(CDir *dir);
+
+ void remove_backtrace_damage_entry(inodeno_t ino);
+
bool is_dentry_damaged(
const CDir *dir_frag,
std::string_view dname,
diff --git a/src/mds/FSMap.cc b/src/mds/FSMap.cc
index b9ae05ac0..e1c98be1b 100644
--- a/src/mds/FSMap.cc
+++ b/src/mds/FSMap.cc
@@ -792,7 +792,8 @@ const MDSMap::mds_info_t* FSMap::get_available_standby(const Filesystem& fs) con
break;
} else if (info.join_fscid == FS_CLUSTER_ID_NONE) {
who = &info; /* vanilla standby */
- } else if (who == nullptr) {
+ } else if (who == nullptr &&
+ !fs.mds_map.test_flag(CEPH_MDSMAP_REFUSE_STANDBY_FOR_ANOTHER_FS)) {
who = &info; /* standby for another fs, last resort */
}
}
diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc
index 5d7ec56f2..0b1f64099 100644
--- a/src/mds/Locker.cc
+++ b/src/mds/Locker.cc
@@ -1237,6 +1237,19 @@ void Locker::eval_gather(SimpleLock *lock, bool first, bool *pneed_issue, MDSCon
send_lock_message(lock, LOCK_AC_SYNC, softdata);
}
break;
+ case LOCK_XLOCKSNAP:
+ if (lock->get_sm() == &sm_filelock) {
+ int pending = lock->gcaps_allowed(CAP_ANY) ||
+ lock->gcaps_allowed(CAP_LONER) ||
+ lock->gcaps_allowed(CAP_XLOCKER);
+ int revoke = ~pending & (loner_issued | other_issued | xlocker_issued);
+
+ // wait for 'Fb' to be revoked
+ if (revoke & CEPH_CAP_GBUFFER) {
+ return;
+ }
+ }
+ break;
}
}
@@ -3567,6 +3580,36 @@ void Locker::kick_cap_releases(MDRequestRef& mdr)
}
}
+__u32 Locker::get_xattr_total_length(CInode::mempool_xattr_map &xattr)
+{
+ __u32 total = 0;
+
+ for (const auto &p : xattr)
+ total += (p.first.length() + p.second.length());
+ return total;
+}
+
+void Locker::decode_new_xattrs(CInode::mempool_inode *inode,
+ CInode::mempool_xattr_map *px,
+ const cref_t<MClientCaps> &m)
+{
+ CInode::mempool_xattr_map tmp;
+
+ auto p = m->xattrbl.cbegin();
+ decode_noshare(tmp, p);
+ __u32 total = get_xattr_total_length(tmp);
+ inode->xattr_version = m->head.xattr_version;
+ if (total > mds->mdsmap->get_max_xattr_size()) {
+ dout(1) << "Maximum xattr size exceeded: " << total
+ << " max size: " << mds->mdsmap->get_max_xattr_size() << dendl;
+ // Ignore new xattr (!!!) but increase xattr version
+ // XXX how to force the client to drop cached xattrs?
+ inode->xattr_version++;
+ } else {
+ *px = std::move(tmp);
+ }
+}
+
/**
* m and ack might be NULL, so don't dereference them unless dirty != 0
*/
@@ -3637,10 +3680,8 @@ void Locker::_do_snap_update(CInode *in, snapid_t snap, int dirty, snapid_t foll
// xattr
if (xattrs) {
dout(7) << " xattrs v" << i->xattr_version << " -> " << m->head.xattr_version
- << " len " << m->xattrbl.length() << dendl;
- i->xattr_version = m->head.xattr_version;
- auto p = m->xattrbl.cbegin();
- decode(*px, p);
+ << " len " << m->xattrbl.length() << dendl;
+ decode_new_xattrs(i, px, m);
}
{
@@ -3879,13 +3920,6 @@ bool Locker::_do_cap_update(CInode *in, Capability *cap,
if (!dirty && !change_max)
return false;
- Session *session = mds->get_session(m);
- if (session->check_access(in, MAY_WRITE,
- m->caller_uid, m->caller_gid, NULL, 0, 0) < 0) {
- dout(10) << "check_access failed, dropping cap update on " << *in << dendl;
- return false;
- }
-
// do the update.
EUpdate *le = new EUpdate(mds->mdlog, "cap update");
mds->mdlog->start_entry(le);
@@ -3932,9 +3966,7 @@ bool Locker::_do_cap_update(CInode *in, Capability *cap,
// xattrs update?
if (xattr) {
dout(7) << " xattrs v" << pi.inode->xattr_version << " -> " << m->head.xattr_version << dendl;
- pi.inode->xattr_version = m->head.xattr_version;
- auto p = m->xattrbl.cbegin();
- decode_noshare(*pi.xattrs, p);
+ decode_new_xattrs(pi.inode.get(), pi.xattrs.get(), m);
wrlock_force(&in->xattrlock, mut);
}
@@ -5769,6 +5801,7 @@ void Locker::handle_file_lock(ScatterLock *lock, const cref_t<MLock> &m)
case LOCK_AC_SYNC:
ceph_assert(lock->get_state() == LOCK_LOCK ||
lock->get_state() == LOCK_MIX ||
+ lock->get_state() == LOCK_MIX_SYNC ||
lock->get_state() == LOCK_MIX_SYNC2);
if (lock->get_state() == LOCK_MIX) {
diff --git a/src/mds/Locker.h b/src/mds/Locker.h
index 3aff8db0b..1fe678940 100644
--- a/src/mds/Locker.h
+++ b/src/mds/Locker.h
@@ -262,6 +262,10 @@ private:
bool any_late_revoking_caps(xlist<Capability*> const &revoking, double timeout) const;
uint64_t calc_new_max_size(const CInode::inode_const_ptr& pi, uint64_t size);
+ __u32 get_xattr_total_length(CInode::mempool_xattr_map &xattr);
+ void decode_new_xattrs(CInode::mempool_inode *inode,
+ CInode::mempool_xattr_map *px,
+ const cref_t<MClientCaps> &m);
MDSRank *mds;
MDCache *mdcache;
diff --git a/src/mds/MDBalancer.cc b/src/mds/MDBalancer.cc
index baa43bb43..da785179e 100644
--- a/src/mds/MDBalancer.cc
+++ b/src/mds/MDBalancer.cc
@@ -230,6 +230,7 @@ void MDBalancer::handle_export_pins(void)
void MDBalancer::tick()
{
static int num_bal_times = g_conf()->mds_bal_max;
+ bool balance_automate = mds->mdsmap->allows_balance_automate();
auto bal_interval = g_conf().get_val<int64_t>("mds_bal_interval");
auto bal_max_until = g_conf().get_val<int64_t>("mds_bal_max_until");
time now = clock::now();
@@ -248,7 +249,8 @@ void MDBalancer::tick()
// We can use duration_cast below, although the result is an int,
// because the values from g_conf are also integers.
// balance?
- if (mds->get_nodeid() == 0
+ if (balance_automate
+ && mds->get_nodeid() == 0
&& mds->is_active()
&& bal_interval > 0
&& chrono::duration_cast<chrono::seconds>(now - last_heartbeat).count() >= bal_interval
@@ -565,7 +567,8 @@ double MDBalancer::try_match(balance_state_t& state, mds_rank_t ex, double& maxe
void MDBalancer::queue_split(const CDir *dir, bool fast)
{
- dout(10) << __func__ << " enqueuing " << *dir
+ constexpr const auto &_func_ = __func__;
+ dout(10) << _func_ << " enqueuing " << *dir
<< " (fast=" << fast << ")" << dendl;
const dirfrag_t df = dir->dirfrag();
@@ -579,6 +582,16 @@ void MDBalancer::queue_split(const CDir *dir, bool fast)
return;
}
+ if (mds->is_stopping()) {
+ // not a good time. This could have been (!mds->is_active())
+ // or at least (mds->is_stopping() || mds->is_stopped()), but
+ // is_stopped() is never true because an MDS respawns as soon as it's removed from the map;
+ // the narrow is_stopping check is to avoid potential regressions
+ // due to unknown coupling with other parts of the MDS (especially multiple ranks).
+ dout(5) << "ignoring the " << _func_ << " callback because the MDS state is '" << ceph_mds_state_name(mds->get_state()) << "'" << dendl;
+ return;
+ }
+
auto mdcache = mds->mdcache;
CDir *dir = mdcache->get_dirfrag(df);
@@ -593,7 +606,7 @@ void MDBalancer::queue_split(const CDir *dir, bool fast)
// Pass on to MDCache: note that the split might still not
// happen if the checks in MDCache::can_fragment fail.
- dout(10) << __func__ << " splitting " << *dir << dendl;
+ dout(10) << _func_ << " splitting " << *dir << dendl;
int bits = g_conf()->mds_bal_split_bits;
if (dir->inode->is_ephemeral_dist()) {
unsigned min_frag_bits = mdcache->get_ephemeral_dist_frag_bits();
@@ -623,6 +636,7 @@ void MDBalancer::queue_split(const CDir *dir, bool fast)
void MDBalancer::queue_merge(CDir *dir)
{
const auto frag = dir->dirfrag();
+ constexpr const auto &_func_ = __func__;
auto callback = [this, frag](int r) {
ceph_assert(frag.frag != frag_t());
@@ -631,6 +645,16 @@ void MDBalancer::queue_merge(CDir *dir)
// starting one), and this context is the only one that erases it.
merge_pending.erase(frag);
+ if (mds->is_stopping()) {
+ // not a good time. This could have been (!mds->is_active())
+ // or at least (mds->is_stopping() || mds->is_stopped()), but
+ // is_stopped() is never true because an MDS respawns as soon as it's removed from the map;
+ // the narrow is_stopping check is to avoid potential regressions
+ // due to unknown coupling with other parts of the MDS (especially multiple ranks).
+ dout(5) << "ignoring the " << _func_ << " callback because the MDS state is '" << ceph_mds_state_name(mds->get_state()) << "'" << dendl;
+ return;
+ }
+
auto mdcache = mds->mdcache;
CDir *dir = mdcache->get_dirfrag(frag);
if (!dir) {
@@ -662,7 +686,12 @@ void MDBalancer::queue_merge(CDir *dir)
}
bool all = true;
for (auto& sib : sibs) {
- if (!sib->is_auth() || !sib->should_merge()) {
+ auto is_auth = sib->is_auth();
+ auto should_merge = sib->should_merge();
+
+ dout(20) << ": sib=" << *sib << ", is_auth=" << is_auth << ", should_merge="
+ << should_merge << dendl;
+ if (!is_auth || !should_merge) {
all = false;
break;
}
diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
index 2ea13155e..5480e6dcd 100644
--- a/src/mds/MDCache.cc
+++ b/src/mds/MDCache.cc
@@ -6784,6 +6784,13 @@ std::pair<bool, uint64_t> MDCache::trim_lru(uint64_t count, expiremap& expiremap
<< " pinned=" << lru.lru_get_num_pinned()
<< dendl;
+ dout(20) << "bottom_lru: " << bottom_lru.lru_get_size() << " items"
+ ", " << bottom_lru.lru_get_top() << " top"
+ ", " << bottom_lru.lru_get_bot() << " bot"
+ ", " << bottom_lru.lru_get_pintail() << " pintail"
+ ", " << bottom_lru.lru_get_num_pinned() << " pinned"
+ << dendl;
+
const uint64_t trim_counter_start = trim_counter.get();
bool throttled = false;
while (1) {
@@ -6804,20 +6811,25 @@ std::pair<bool, uint64_t> MDCache::trim_lru(uint64_t count, expiremap& expiremap
}
unexpirables.clear();
+ dout(20) << "lru: " << lru.lru_get_size() << " items"
+ ", " << lru.lru_get_top() << " top"
+ ", " << lru.lru_get_bot() << " bot"
+ ", " << lru.lru_get_pintail() << " pintail"
+ ", " << lru.lru_get_num_pinned() << " pinned"
+ << dendl;
+
// trim dentries from the LRU until count is reached
- // if mds is in standby_replay and skip trimming the inodes
- while (!throttled && (cache_toofull() || count > 0 || is_standby_replay)) {
+ while (!throttled && (cache_toofull() || count > 0)) {
throttled |= trim_counter_start+trimmed >= trim_threshold;
if (throttled) break;
CDentry *dn = static_cast<CDentry*>(lru.lru_expire());
if (!dn) {
break;
}
- if (is_standby_replay && dn->get_linkage()->inode) {
- // we move the inodes that need to be trimmed to the end of the lru queue.
- // refer to MDCache::standby_trim_segment
- lru.lru_insert_bot(dn);
- break;
+ if ((is_standby_replay && dn->get_linkage()->inode &&
+ dn->get_linkage()->inode->item_open_file.is_on_list())) {
+ dout(20) << "unexpirable: " << *dn << dendl;
+ unexpirables.push_back(dn);
} else if (trim_dentry(dn, expiremap)) {
unexpirables.push_back(dn);
} else {
@@ -7463,69 +7475,42 @@ void MDCache::try_trim_non_auth_subtree(CDir *dir)
void MDCache::standby_trim_segment(LogSegment *ls)
{
- auto try_trim_inode = [this](CInode *in) {
- if (in->get_num_ref() == 0 &&
- !in->item_open_file.is_on_list() &&
- in->parent != NULL &&
- in->parent->get_num_ref() == 0){
- touch_dentry_bottom(in->parent);
- }
- };
-
- auto try_trim_dentry = [this](CDentry *dn) {
- if (dn->get_num_ref() > 0)
- return;
- auto in = dn->get_linkage()->inode;
- if(in && in->item_open_file.is_on_list())
- return;
- touch_dentry_bottom(dn);
- };
-
ls->new_dirfrags.clear_list();
ls->open_files.clear_list();
while (!ls->dirty_dirfrags.empty()) {
CDir *dir = ls->dirty_dirfrags.front();
dir->mark_clean();
- if (dir->inode)
- try_trim_inode(dir->inode);
}
while (!ls->dirty_inodes.empty()) {
CInode *in = ls->dirty_inodes.front();
in->mark_clean();
- try_trim_inode(in);
}
while (!ls->dirty_dentries.empty()) {
CDentry *dn = ls->dirty_dentries.front();
dn->mark_clean();
- try_trim_dentry(dn);
}
while (!ls->dirty_parent_inodes.empty()) {
CInode *in = ls->dirty_parent_inodes.front();
in->clear_dirty_parent();
- try_trim_inode(in);
}
while (!ls->dirty_dirfrag_dir.empty()) {
CInode *in = ls->dirty_dirfrag_dir.front();
in->filelock.remove_dirty();
- try_trim_inode(in);
}
while (!ls->dirty_dirfrag_nest.empty()) {
CInode *in = ls->dirty_dirfrag_nest.front();
in->nestlock.remove_dirty();
- try_trim_inode(in);
}
while (!ls->dirty_dirfrag_dirfragtree.empty()) {
CInode *in = ls->dirty_dirfrag_dirfragtree.front();
in->dirfragtreelock.remove_dirty();
- try_trim_inode(in);
}
while (!ls->truncating_inodes.empty()) {
auto it = ls->truncating_inodes.begin();
CInode *in = *it;
ls->truncating_inodes.erase(it);
in->put(CInode::PIN_TRUNCATING);
- try_trim_inode(in);
}
}
@@ -9897,6 +9882,12 @@ void MDCache::request_cleanup(MDRequestRef& mdr)
// remove from map
active_requests.erase(mdr->reqid);
+ // queue next replay op?
+ if (mdr->is_queued_for_replay() && !mdr->get_queued_next_replay_op()) {
+ mdr->set_queued_next_replay_op();
+ mds->queue_one_replay();
+ }
+
if (mds->logger)
log_stat();
@@ -13444,6 +13435,12 @@ bool MDCache::dump_inode(Formatter *f, uint64_t number) {
return true;
}
+void MDCache::dump_dir(Formatter *f, CDir *dir, bool dentry_dump) {
+ f->open_object_section("dir");
+ dir->dump(f, dentry_dump ? CDir::DUMP_ALL : CDir::DUMP_DEFAULT);
+ f->close_section();
+}
+
void MDCache::handle_mdsmap(const MDSMap &mdsmap, const MDSMap &oldmap) {
const mds_rank_t max_mds = mdsmap.get_max_mds();
@@ -13506,7 +13503,7 @@ void MDCache::upkeep_main(void)
if (active_with_clients) {
trim_client_leases();
}
- if (is_open()) {
+ if (is_open() || mds->is_standby_replay()) {
trim();
}
if (active_with_clients) {
diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h
index d9f173038..f1b58c28d 100644
--- a/src/mds/MDCache.h
+++ b/src/mds/MDCache.h
@@ -528,6 +528,7 @@ class MDCache {
void clean_open_file_lists();
void dump_openfiles(Formatter *f);
bool dump_inode(Formatter *f, uint64_t number);
+ void dump_dir(Formatter *f, CDir *dir, bool dentry_dump=false);
void rejoin_start(MDSContext *rejoin_done_);
void rejoin_gather_finish();
diff --git a/src/mds/MDLog.cc b/src/mds/MDLog.cc
index 82899d2da..840a43733 100644
--- a/src/mds/MDLog.cc
+++ b/src/mds/MDLog.cc
@@ -1364,11 +1364,10 @@ void MDLog::_replay_thread()
break;
}
- if (!journaler->is_readable() &&
- journaler->get_read_pos() == journaler->get_write_pos())
+ if (journaler->get_read_pos() == journaler->get_write_pos()) {
+ dout(10) << "_replay: read_pos == write_pos" << dendl;
break;
-
- ceph_assert(journaler->is_readable() || mds->is_daemon_stopping());
+ }
// read it
uint64_t pos = journaler->get_read_pos();
diff --git a/src/mds/MDSAuthCaps.cc b/src/mds/MDSAuthCaps.cc
index d983f2d58..22383445e 100644
--- a/src/mds/MDSAuthCaps.cc
+++ b/src/mds/MDSAuthCaps.cc
@@ -33,6 +33,7 @@
using std::ostream;
using std::string;
using std::vector;
+using std::string_view;
namespace qi = boost::spirit::qi;
namespace ascii = boost::spirit::ascii;
namespace phoenix = boost::phoenix;
@@ -53,6 +54,8 @@ struct MDSCapParser : qi::grammar<Iterator, MDSAuthCaps()>
using qi::_1;
using qi::_2;
using qi::_3;
+ using qi::_4;
+ using qi::_5;
using qi::eps;
using qi::lit;
@@ -65,25 +68,13 @@ struct MDSCapParser : qi::grammar<Iterator, MDSAuthCaps()>
network_str %= +char_("/.:a-fA-F0-9][");
fs_name_str %= +char_("a-zA-Z0-9_.-");
- // match := [path=<path>] [uid=<uid> [gids=<gid>[,<gid>...]]
- // TODO: allow fsname, and root_squash to be specified with uid, and gidlist
- path %= (spaces >> lit("path") >> lit('=') >> (quoted_path | unquoted_path));
- uid %= (spaces >> lit("uid") >> lit('=') >> uint_);
+ path %= -(spaces >> lit("path") >> lit('=') >> (quoted_path | unquoted_path));
+ uid %= -(spaces >> lit("uid") >> lit('=') >> uint_);
uintlist %= (uint_ % lit(','));
gidlist %= -(spaces >> lit("gids") >> lit('=') >> uintlist);
fs_name %= -(spaces >> lit("fsname") >> lit('=') >> fs_name_str);
- root_squash %= (spaces >> lit("root_squash") >> attr(true));
- match = -(
- (fs_name >> path >> root_squash)[_val = phoenix::construct<MDSCapMatch>(_2, _1, _3)] |
- (uid >> gidlist)[_val = phoenix::construct<MDSCapMatch>(_1, _2)] |
- (path >> uid >> gidlist)[_val = phoenix::construct<MDSCapMatch>(_1, _2, _3)] |
- (fs_name >> path)[_val = phoenix::construct<MDSCapMatch>(_2, _1)] |
- (fs_name >> root_squash)[_val = phoenix::construct<MDSCapMatch>(std::string(), _1, _2)] |
- (path >> root_squash)[_val = phoenix::construct<MDSCapMatch>(_1, std::string(), _2)] |
- (path)[_val = phoenix::construct<MDSCapMatch>(_1)] |
- (root_squash)[_val = phoenix::construct<MDSCapMatch>(std::string(), std::string(), _1)] |
- (fs_name)[_val = phoenix::construct<MDSCapMatch>(std::string(),
- _1)]);
+ root_squash %= -(spaces >> lit("root_squash") >> attr(true));
+ match = (fs_name >> path >> root_squash >> uid >> gidlist)[_val = phoenix::construct<MDSCapMatch>(_1, _2, _3, _4, _5)];
// capspec = * | r[w][f][p][s]
capspec = spaces >> (
@@ -122,11 +113,11 @@ struct MDSCapParser : qi::grammar<Iterator, MDSAuthCaps()>
qi::rule<Iterator, bool()> root_squash;
qi::rule<Iterator, MDSCapSpec()> capspec;
qi::rule<Iterator, uint32_t()> uid;
- qi::rule<Iterator, std::vector<uint32_t>() > uintlist;
- qi::rule<Iterator, std::vector<uint32_t>() > gidlist;
+ qi::rule<Iterator, vector<uint32_t>() > uintlist;
+ qi::rule<Iterator, vector<uint32_t>() > gidlist;
qi::rule<Iterator, MDSCapMatch()> match;
qi::rule<Iterator, MDSCapGrant()> grant;
- qi::rule<Iterator, std::vector<MDSCapGrant>()> grants;
+ qi::rule<Iterator, vector<MDSCapGrant>()> grants;
qi::rule<Iterator, MDSAuthCaps()> mdscaps;
};
@@ -142,7 +133,7 @@ void MDSCapMatch::normalize_path()
// drop ..
}
-bool MDSCapMatch::match(std::string_view target_path,
+bool MDSCapMatch::match(string_view target_path,
const int caller_uid,
const int caller_gid,
const vector<uint64_t> *caller_gid_list) const
@@ -174,7 +165,7 @@ bool MDSCapMatch::match(std::string_view target_path,
return true;
}
-bool MDSCapMatch::match_path(std::string_view target_path) const
+bool MDSCapMatch::match_path(string_view target_path) const
{
if (path.length()) {
if (target_path.find(path) != 0)
@@ -200,7 +191,7 @@ void MDSCapGrant::parse_network()
* Is the client *potentially* able to access this path? Actual
* permission will depend on uids/modes in the full is_capable.
*/
-bool MDSAuthCaps::path_capable(std::string_view inode_path) const
+bool MDSAuthCaps::path_capable(string_view inode_path) const
{
for (const auto &i : grants) {
if (i.match.match_path(inode_path)) {
@@ -218,7 +209,7 @@ bool MDSAuthCaps::path_capable(std::string_view inode_path) const
* This is true if any of the 'grant' clauses in the capability match the
* requested path + op.
*/
-bool MDSAuthCaps::is_capable(std::string_view inode_path,
+bool MDSAuthCaps::is_capable(string_view inode_path,
uid_t inode_uid, gid_t inode_gid,
unsigned inode_mode,
uid_t caller_uid, gid_t caller_gid,
@@ -338,7 +329,7 @@ void MDSAuthCaps::set_allow_all()
{}));
}
-bool MDSAuthCaps::parse(std::string_view str, ostream *err)
+bool MDSAuthCaps::parse(string_view str, ostream *err)
{
// Special case for legacy caps
if (str == "allow") {
@@ -363,10 +354,15 @@ bool MDSAuthCaps::parse(std::string_view str, ostream *err)
// Make sure no grants are kept after parsing failed!
grants.clear();
- if (err)
- *err << "mds capability parse failed, stopped at '"
- << std::string(iter, end)
- << "' of '" << str << "'";
+ if (err) {
+ if (string(iter, end).find("allow") != string::npos) {
+ *err << "Permission flags in MDS caps must start with 'r' or " <<
+ "'rw' or be '*' or 'all'";
+ } else {
+ *err << "mds capability parse failed, stopped at '"
+ << string(iter, end) << "' of '" << str << "'";
+ }
+ }
return false;
}
}
@@ -465,3 +461,9 @@ ostream &operator<<(ostream &out, const MDSAuthCaps &cap)
return out;
}
+ostream &operator<<(ostream &out, const MDSCapAuth &auth)
+{
+ out << "MDSCapAuth(" << auth.match << "readable="
+ << auth.readable << ", writeable=" << auth.writeable << ")";
+ return out;
+}
diff --git a/src/mds/MDSAuthCaps.h b/src/mds/MDSAuthCaps.h
index 5fcbb1f2f..bbb2589b3 100644
--- a/src/mds/MDSAuthCaps.h
+++ b/src/mds/MDSAuthCaps.h
@@ -19,6 +19,7 @@
#include <string_view>
#include <vector>
+#include "include/encoding.h"
#include "include/common_fwd.h"
#include "include/types.h"
#include "common/debug.h"
@@ -101,35 +102,31 @@ private:
struct MDSCapMatch {
static const int64_t MDS_AUTH_UID_ANY = -1;
- MDSCapMatch() : uid(MDS_AUTH_UID_ANY), fs_name(std::string()) {}
+ MDSCapMatch() {}
- MDSCapMatch(int64_t uid_, std::vector<gid_t>& gids_) :
- uid(uid_), gids(gids_), fs_name(std::string()) {}
+ MDSCapMatch(const std::string& fsname_, const std::string& path_,
+ bool root_squash_, int64_t uid_=MDS_AUTH_UID_ANY,
+ const std::vector<gid_t>& gids_={}) {
+ fs_name = std::move(fsname_);
+ path = std::move(path_);
+ root_squash = root_squash_;
+ uid = (uid_ == 0) ? -1 : uid_;
+ gids = gids_;
- explicit MDSCapMatch(const std::string &path_)
- : uid(MDS_AUTH_UID_ANY), path(path_), fs_name(std::string()) {
normalize_path();
}
- explicit MDSCapMatch(std::string path, std::string fs_name) :
- uid(MDS_AUTH_UID_ANY), path(std::move(path)), fs_name(std::move(fs_name))
- {
- normalize_path();
- }
-
- explicit MDSCapMatch(std::string path, std::string fs_name, bool root_squash_) :
- uid(MDS_AUTH_UID_ANY), path(std::move(path)), fs_name(std::move(fs_name)), root_squash(root_squash_)
- {
- normalize_path();
- }
-
- MDSCapMatch(const std::string& path_, int64_t uid_, std::vector<gid_t>& gids_)
- : uid(uid_), gids(gids_), path(path_), fs_name(std::string()) {
- normalize_path();
+ const MDSCapMatch& operator=(const MDSCapMatch& m) {
+ uid = m.uid;
+ gids = m.gids;
+ path = m.path;
+ fs_name = m.fs_name;
+ root_squash = m.root_squash;
+ return *this;
}
void normalize_path();
-
+
bool is_match_all() const
{
return uid == MDS_AUTH_UID_ANY && path == "";
@@ -149,12 +146,68 @@ struct MDSCapMatch {
*/
bool match_path(std::string_view target_path) const;
- int64_t uid; // Require UID to be equal to this, if !=MDS_AUTH_UID_ANY
+ void encode(ceph::buffer::list& bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(uid, bl);
+ encode(gids, bl);
+ encode(path, bl);
+ encode(fs_name, bl);
+ encode(root_squash, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(ceph::buffer::list::const_iterator& p) {
+ DECODE_START(1, p);
+ decode(uid, p);
+ decode(gids, p);
+ decode(path, p);
+ decode(fs_name, p);
+ decode(root_squash, p);
+ DECODE_FINISH(p);
+ }
+
+ // Require UID to be equal to this, if !=MDS_AUTH_UID_ANY
+ int64_t uid = MDS_AUTH_UID_ANY;
std::vector<gid_t> gids; // Use these GIDs
std::string path; // Require path to be child of this (may be "" or "/" for any)
std::string fs_name;
bool root_squash=false;
};
+WRITE_CLASS_ENCODER(MDSCapMatch)
+
+struct MDSCapAuth {
+ MDSCapAuth() {}
+ MDSCapAuth(MDSCapMatch m, bool r, bool w) :
+ match(m), readable(r), writeable(w) {}
+
+ const MDSCapAuth& operator=(const MDSCapAuth& m) {
+ match = m.match;
+ readable = m.readable;
+ writeable = m.writeable;
+ return *this;
+ }
+
+ void encode(ceph::buffer::list& bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(match, bl);
+ encode(readable, bl);
+ encode(writeable, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(ceph::buffer::list::const_iterator& p) {
+ DECODE_START(1, p);
+ decode(match, p);
+ decode(readable, p);
+ decode(writeable, p);
+ DECODE_FINISH(p);
+ }
+
+ MDSCapMatch match;
+ bool readable;
+ bool writeable;
+};
+WRITE_CLASS_ENCODER(MDSCapAuth)
struct MDSCapGrant {
MDSCapGrant(const MDSCapSpec &spec_, const MDSCapMatch &match_,
@@ -223,12 +276,31 @@ public:
return false;
}
+ void get_cap_auths(std::vector<MDSCapAuth> *cap_auths)
+ {
+ for (const auto& grant : grants) {
+ cap_auths->emplace_back(MDSCapAuth(grant.match,
+ grant.spec.allow_read(),
+ grant.spec.allow_write()));
+ }
+ }
+
+ bool root_squash_in_caps() const {
+ for (const MDSCapGrant &g : grants) {
+ if (g.match.root_squash) {
+ return true;
+ }
+ }
+ return false;
+ }
+
friend std::ostream &operator<<(std::ostream &out, const MDSAuthCaps &cap);
private:
std::vector<MDSCapGrant> grants;
};
std::ostream &operator<<(std::ostream &out, const MDSCapMatch &match);
+std::ostream &operator<<(std::ostream &out, const MDSCapAuth &auth);
std::ostream &operator<<(std::ostream &out, const MDSCapSpec &spec);
std::ostream &operator<<(std::ostream &out, const MDSCapGrant &grant);
std::ostream &operator<<(std::ostream &out, const MDSAuthCaps &cap);
diff --git a/src/mds/MDSDaemon.cc b/src/mds/MDSDaemon.cc
index d45acce06..e97fd2cf8 100644
--- a/src/mds/MDSDaemon.cc
+++ b/src/mds/MDSDaemon.cc
@@ -445,6 +445,12 @@ void MDSDaemon::set_up_admin_socket()
asok_hook,
"dump inode by inode number");
ceph_assert(r == 0);
+ r = admin_socket->register_command("dump dir "
+ "name=path,type=CephString,req=true "
+ "name=dentry_dump,type=CephBool,req=false",
+ asok_hook,
+ "dump directory by path");
+ ceph_assert(r == 0);
r = admin_socket->register_command("exit",
asok_hook,
"Terminate this MDS");
diff --git a/src/mds/MDSMap.cc b/src/mds/MDSMap.cc
index 6433d08c5..aed591d95 100644
--- a/src/mds/MDSMap.cc
+++ b/src/mds/MDSMap.cc
@@ -177,6 +177,7 @@ void MDSMap::dump(Formatter *f) const
cephfs_dump_features(f, required_client_features);
f->close_section();
f->dump_int("max_file_size", max_file_size);
+ f->dump_int("max_xattr_size", max_xattr_size);
f->dump_int("last_failure", last_failure);
f->dump_int("last_failure_osd_epoch", last_failure_osd_epoch);
f->open_object_section("compat");
@@ -235,6 +236,8 @@ void MDSMap::dump_flags_state(Formatter *f) const
f->dump_bool(flag_display.at(CEPH_MDSMAP_ALLOW_MULTIMDS_SNAPS), allows_multimds_snaps());
f->dump_bool(flag_display.at(CEPH_MDSMAP_ALLOW_STANDBY_REPLAY), allows_standby_replay());
f->dump_bool(flag_display.at(CEPH_MDSMAP_REFUSE_CLIENT_SESSION), test_flag(CEPH_MDSMAP_REFUSE_CLIENT_SESSION));
+ f->dump_bool(flag_display.at(CEPH_MDSMAP_REFUSE_STANDBY_FOR_ANOTHER_FS), test_flag(CEPH_MDSMAP_REFUSE_STANDBY_FOR_ANOTHER_FS));
+ f->dump_bool(flag_display.at(CEPH_MDSMAP_BALANCE_AUTOMATE), test_flag(CEPH_MDSMAP_BALANCE_AUTOMATE));
f->close_section();
}
@@ -268,6 +271,7 @@ void MDSMap::print(ostream& out) const
out << "session_timeout\t" << session_timeout << "\n"
<< "session_autoclose\t" << session_autoclose << "\n";
out << "max_file_size\t" << max_file_size << "\n";
+ out << "max_xattr_size\t" << max_xattr_size << "\n";
out << "required_client_features\t" << cephfs_stringify_features(required_client_features) << "\n";
out << "last_failure\t" << last_failure << "\n"
<< "last_failure_osd_epoch\t" << last_failure_osd_epoch << "\n";
@@ -376,6 +380,10 @@ void MDSMap::print_flags(std::ostream& out) const {
out << " " << flag_display.at(CEPH_MDSMAP_ALLOW_STANDBY_REPLAY);
if (test_flag(CEPH_MDSMAP_REFUSE_CLIENT_SESSION))
out << " " << flag_display.at(CEPH_MDSMAP_REFUSE_CLIENT_SESSION);
+ if (test_flag(CEPH_MDSMAP_REFUSE_STANDBY_FOR_ANOTHER_FS))
+ out << " " << flag_display.at(CEPH_MDSMAP_REFUSE_STANDBY_FOR_ANOTHER_FS);
+ if (test_flag(CEPH_MDSMAP_BALANCE_AUTOMATE))
+ out << " " << flag_display.at(CEPH_MDSMAP_BALANCE_AUTOMATE);
}
void MDSMap::get_health(list<pair<health_status_t,string> >& summary,
@@ -763,7 +771,7 @@ void MDSMap::encode(bufferlist& bl, uint64_t features) const
encode(data_pools, bl);
encode(cas_pool, bl);
- __u16 ev = 17;
+ __u16 ev = 18;
encode(ev, bl);
encode(compat, bl);
encode(metadata_pool, bl);
@@ -791,6 +799,7 @@ void MDSMap::encode(bufferlist& bl, uint64_t features) const
}
encode(required_client_features, bl);
encode(bal_rank_mask, bl);
+ encode(max_xattr_size, bl);
ENCODE_FINISH(bl);
}
@@ -842,7 +851,8 @@ void MDSMap::decode(bufferlist::const_iterator& p)
decode(cas_pool, p);
}
- // kclient ignores everything from here
+ // kclient skips most of what's below
+ // see fs/ceph/mdsmap.c for current decoding
__u16 ev = 1;
if (struct_v >= 2)
decode(ev, p);
@@ -942,6 +952,10 @@ void MDSMap::decode(bufferlist::const_iterator& p)
decode(bal_rank_mask, p);
}
+ if (ev >= 18) {
+ decode(max_xattr_size, p);
+ }
+
/* All MDS since at least v14.0.0 understand INLINE */
/* TODO: remove after R is released */
compat.incompat.insert(MDS_FEATURE_INCOMPAT_INLINE);
diff --git a/src/mds/MDSMap.h b/src/mds/MDSMap.h
index 75c44e27c..c61fc2ce1 100644
--- a/src/mds/MDSMap.h
+++ b/src/mds/MDSMap.h
@@ -50,6 +50,12 @@ static inline const auto MDS_FEATURE_INCOMPAT_SNAPREALM_V2 = CompatSet::Feature(
#define MDS_FS_NAME_DEFAULT "cephfs"
+/*
+ * Maximum size of xattrs the MDS can handle per inode by default. This
+ * includes the attribute name and 4+4 bytes for the key/value sizes.
+ */
+#define MDS_MAX_XATTR_SIZE (1<<16) /* 64K */
+
class health_check_map_t;
class MDSMap {
@@ -196,6 +202,9 @@ public:
uint64_t get_max_filesize() const { return max_file_size; }
void set_max_filesize(uint64_t m) { max_file_size = m; }
+ uint64_t get_max_xattr_size() const { return max_xattr_size; }
+ void set_max_xattr_size(uint64_t m) { max_xattr_size = m; }
+
void set_min_compat_client(ceph_release_t version);
void add_required_client_feature(size_t bit) {
@@ -234,6 +243,15 @@ public:
bool allows_standby_replay() const { return test_flag(CEPH_MDSMAP_ALLOW_STANDBY_REPLAY); }
bool was_standby_replay_ever_allowed() const { return ever_allowed_features & CEPH_MDSMAP_ALLOW_STANDBY_REPLAY; }
+ void set_balance_automate() {
+ set_flag(CEPH_MDSMAP_BALANCE_AUTOMATE);
+ ever_allowed_features |= CEPH_MDSMAP_BALANCE_AUTOMATE;
+ explicitly_allowed_features |= CEPH_MDSMAP_BALANCE_AUTOMATE;
+ }
+ void clear_balance_automate() { clear_flag(CEPH_MDSMAP_BALANCE_AUTOMATE); }
+ bool allows_balance_automate() const { return test_flag(CEPH_MDSMAP_BALANCE_AUTOMATE); }
+ bool was_balance_automate_ever_allowed() const { return ever_allowed_features & CEPH_MDSMAP_BALANCE_AUTOMATE; }
+
void set_multimds_snaps_allowed() {
set_flag(CEPH_MDSMAP_ALLOW_MULTIMDS_SNAPS);
ever_allowed_features |= CEPH_MDSMAP_ALLOW_MULTIMDS_SNAPS;
@@ -620,6 +638,8 @@ protected:
__u32 session_autoclose = 300;
uint64_t max_file_size = 1ULL<<40; /* 1TB */
+ uint64_t max_xattr_size = MDS_MAX_XATTR_SIZE;
+
feature_bitset_t required_client_features;
std::vector<int64_t> data_pools; // file data pools available to clients (via an ioctl). first is the default.
@@ -664,7 +684,9 @@ private:
{CEPH_MDSMAP_ALLOW_SNAPS, "allow_snaps"},
{CEPH_MDSMAP_ALLOW_MULTIMDS_SNAPS, "allow_multimds_snaps"},
{CEPH_MDSMAP_ALLOW_STANDBY_REPLAY, "allow_standby_replay"},
- {CEPH_MDSMAP_REFUSE_CLIENT_SESSION, "refuse_client_session"}
+ {CEPH_MDSMAP_REFUSE_CLIENT_SESSION, "refuse_client_session"},
+ {CEPH_MDSMAP_REFUSE_STANDBY_FOR_ANOTHER_FS, "refuse_standby_for_another_fs"},
+ {CEPH_MDSMAP_BALANCE_AUTOMATE, "balance_automate"}
};
};
WRITE_CLASS_ENCODER_FEATURES(MDSMap::mds_info_t)
diff --git a/src/mds/MDSMetaRequest.h b/src/mds/MDSMetaRequest.h
new file mode 100644
index 000000000..ad4720410
--- /dev/null
+++ b/src/mds/MDSMetaRequest.h
@@ -0,0 +1,33 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2023 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_MDS_META_REQUEST_H
+#define CEPH_MDS_META_REQUEST_H
+
+#include "include/types.h"
+
+struct MDSMetaRequest {
+private:
+ int op;
+ ceph_tid_t tid;
+public:
+ explicit MDSMetaRequest(int o, ceph_tid_t t) :
+ op(o), tid(t) { }
+ virtual ~MDSMetaRequest() { }
+
+ int get_op() { return op; }
+ ceph_tid_t get_tid() { return tid; }
+};
+
+#endif // !CEPH_MDS_META_REQUEST_H
diff --git a/src/mds/MDSRank.cc b/src/mds/MDSRank.cc
index 9a80534a4..aa6a8c162 100644
--- a/src/mds/MDSRank.cc
+++ b/src/mds/MDSRank.cc
@@ -551,6 +551,8 @@ MDSRank::MDSRank(
cct->_conf->mds_op_log_threshold);
op_tracker.set_history_size_and_duration(cct->_conf->mds_op_history_size,
cct->_conf->mds_op_history_duration);
+ op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->mds_op_history_slow_op_size,
+ cct->_conf->mds_op_history_slow_op_threshold);
schedule_update_timer_task();
}
@@ -744,6 +746,7 @@ void MDSRankDispatcher::tick()
// ...
if (is_clientreplay() || is_active() || is_stopping()) {
+ server->clear_laggy_clients();
server->find_idle_sessions();
server->evict_cap_revoke_non_responders();
locker->tick();
@@ -1185,6 +1188,7 @@ bool MDSRank::is_valid_message(const cref_t<Message> &m) {
type == CEPH_MSG_CLIENT_RECONNECT ||
type == CEPH_MSG_CLIENT_RECLAIM ||
type == CEPH_MSG_CLIENT_REQUEST ||
+ type == CEPH_MSG_CLIENT_REPLY ||
type == MSG_MDS_PEER_REQUEST ||
type == MSG_MDS_HEARTBEAT ||
type == MSG_MDS_TABLE_REQUEST ||
@@ -1238,6 +1242,7 @@ void MDSRank::handle_message(const cref_t<Message> &m)
ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_CLIENT);
// fall-thru
case CEPH_MSG_CLIENT_REQUEST:
+ case CEPH_MSG_CLIENT_REPLY:
server->dispatch(m);
break;
case MSG_MDS_PEER_REQUEST:
@@ -2060,6 +2065,7 @@ bool MDSRank::queue_one_replay()
if (!replay_queue.empty()) {
queue_waiter(replay_queue.front());
replay_queue.pop_front();
+ dout(10) << " queued next replay op" << dendl;
return true;
}
if (!replaying_requests_done) {
@@ -2067,6 +2073,7 @@ bool MDSRank::queue_one_replay()
mdlog->flush();
}
maybe_clientreplay_done();
+ dout(10) << " journaled last replay op" << dendl;
return false;
}
@@ -2909,6 +2916,8 @@ void MDSRankDispatcher::handle_asok_command(
command_openfiles_ls(f);
} else if (command == "dump inode") {
command_dump_inode(f, cmdmap, *css);
+ } else if (command == "dump dir") {
+ command_dump_dir(f, cmdmap, *css);
} else if (command == "damage ls") {
std::lock_guard l(mds_lock);
damage_table.dump(f);
@@ -3349,6 +3358,42 @@ void MDSRank::command_dump_inode(Formatter *f, const cmdmap_t &cmdmap, std::ostr
}
}
+void MDSRank::command_dump_dir(Formatter *f, const cmdmap_t &cmdmap, std::ostream &ss)
+{
+ std::lock_guard l(mds_lock);
+ std::string path;
+ bool got = cmd_getval(cmdmap, "path", path);
+ if (!got) {
+ ss << "missing path argument";
+ return;
+ }
+
+ bool dentry_dump = false;
+ cmd_getval(cmdmap, "dentry_dump", dentry_dump);
+
+ CInode *in = mdcache->cache_traverse(filepath(path.c_str()));
+ if (!in) {
+ ss << "directory inode not in cache";
+ return;
+ }
+
+ f->open_array_section("dirs");
+ frag_vec_t leaves;
+ in->dirfragtree.get_leaves_under(frag_t(), leaves);
+ for (const auto& leaf : leaves) {
+ CDir *dir = in->get_dirfrag(leaf);
+ if (dir) {
+ mdcache->dump_dir(f, dir, dentry_dump);
+ } else {
+ f->open_object_section("frag");
+ f->dump_stream("frag") << leaf;
+ f->dump_string("status", "dirfrag not in cache");
+ f->close_section();
+ }
+ }
+ f->close_section();
+}
+
void MDSRank::dump_status(Formatter *f) const
{
f->dump_string("fs_name", std::string(mdsmap->get_fs_name()));
@@ -3503,6 +3548,9 @@ void MDSRank::create_logger()
PerfCountersBuilder::PRIO_INTERESTING);
mdm_plb.add_u64(l_mdm_dn, "dn", "Dentries", "dn",
PerfCountersBuilder::PRIO_INTERESTING);
+ // mds rss metric is set to PRIO_USEFUL as it can be useful to detect mds cache oversizing
+ mdm_plb.add_u64(l_mdm_rss, "rss", "RSS", "rss",
+ PerfCountersBuilder::PRIO_USEFUL);
mdm_plb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL);
mdm_plb.add_u64_counter(l_mdm_inoa, "ino+", "Inodes opened");
@@ -3517,9 +3565,6 @@ void MDSRank::create_logger()
mdm_plb.add_u64_counter(l_mdm_caps, "cap-", "Capabilities removed");
mdm_plb.add_u64(l_mdm_heap, "heap", "Heap size");
- mdm_plb.set_prio_default(PerfCountersBuilder::PRIO_DEBUGONLY);
- mdm_plb.add_u64(l_mdm_rss, "rss", "RSS");
-
mlogger = mdm_plb.create_perf_counters();
g_ceph_context->get_perfcounters_collection()->add(mlogger);
}
@@ -3842,6 +3887,9 @@ void MDSRankDispatcher::handle_conf_change(const ConfigProxy& conf, const std::s
if (changed.count("mds_op_history_size") || changed.count("mds_op_history_duration")) {
op_tracker.set_history_size_and_duration(conf->mds_op_history_size, conf->mds_op_history_duration);
}
+ if (changed.count("mds_op_history_slow_op_size") || changed.count("mds_op_history_slow_op_threshold")) {
+ op_tracker.set_history_slow_op_size_and_threshold(conf->mds_op_history_slow_op_size, conf->mds_op_history_slow_op_threshold);
+ }
if (changed.count("mds_enable_op_tracker")) {
op_tracker.set_tracking(conf->mds_enable_op_tracker);
}
diff --git a/src/mds/MDSRank.h b/src/mds/MDSRank.h
index b61fc178c..a9e8da181 100644
--- a/src/mds/MDSRank.h
+++ b/src/mds/MDSRank.h
@@ -43,6 +43,7 @@
#include "Server.h"
#include "MetricsHandler.h"
#include "osdc/Journaler.h"
+#include "MDSMetaRequest.h"
// Full .h import instead of forward declaration for PerfCounter, for the
// benefit of those including this header and using MDSRank::logger
@@ -253,6 +254,10 @@ class MDSRank {
progress_thread.signal();
}
+ uint64_t get_global_id() const {
+ return monc->get_global_id();
+ }
+
// Daemon lifetime functions: these guys break the abstraction
// and call up into the parent MDSDaemon instance. It's kind
// of unavoidable: if we want any depth into our calls
@@ -423,6 +428,8 @@ class MDSRank {
PerfCounters *logger = nullptr, *mlogger = nullptr;
OpTracker op_tracker;
+ std::map<ceph_tid_t, std::unique_ptr<MDSMetaRequest>> internal_client_requests;
+
// The last different state I held before current
MDSMap::DaemonState last_state = MDSMap::STATE_BOOT;
// The state assigned to me by the MDSMap
@@ -519,6 +526,7 @@ class MDSRank {
void command_openfiles_ls(Formatter *f);
void command_dump_tree(const cmdmap_t &cmdmap, std::ostream &ss, Formatter *f);
void command_dump_inode(Formatter *f, const cmdmap_t &cmdmap, std::ostream &ss);
+ void command_dump_dir(Formatter *f, const cmdmap_t &cmdmap, std::ostream &ss);
void command_cache_drop(uint64_t timeout, Formatter *f, Context *on_finish);
// FIXME the state machine logic should be separable from the dispatch
diff --git a/src/mds/MetricAggregator.cc b/src/mds/MetricAggregator.cc
index 6487084fb..9765d4d5b 100644
--- a/src/mds/MetricAggregator.cc
+++ b/src/mds/MetricAggregator.cc
@@ -4,6 +4,9 @@
#include <boost/range/adaptor/map.hpp>
#include <boost/range/algorithm/copy.hpp>
+#include "common/ceph_context.h"
+#include "common/perf_counters_key.h"
+
#include "MDSRank.h"
#include "MetricAggregator.h"
#include "mgr/MgrClient.h"
@@ -13,8 +16,36 @@
#undef dout_prefix
#define dout_prefix *_dout << "mds.metric.aggregator" << " " << __func__
+// Performance Counters
+ enum {
+ l_mds_client_metrics_start = 10000,
+ l_mds_client_metrics_num_clients,
+ l_mds_client_metrics_last
+ };
+
+enum {
+ l_mds_per_client_metrics_start = 20000,
+ l_mds_per_client_metrics_cap_hits,
+ l_mds_per_client_metrics_cap_misses,
+ l_mds_per_client_metrics_avg_read_latency,
+ l_mds_per_client_metrics_avg_write_latency,
+ l_mds_per_client_metrics_avg_metadata_latency,
+ l_mds_per_client_metrics_dentry_lease_hits,
+ l_mds_per_client_metrics_dentry_lease_misses,
+ l_mds_per_client_metrics_opened_files,
+ l_mds_per_client_metrics_opened_inodes,
+ l_mds_per_client_metrics_pinned_icaps,
+ l_mds_per_client_metrics_total_inodes,
+ l_mds_per_client_metrics_total_read_ops,
+ l_mds_per_client_metrics_total_read_size,
+ l_mds_per_client_metrics_total_write_ops,
+ l_mds_per_client_metrics_total_write_size,
+ l_mds_per_client_metrics_last
+ };
+
MetricAggregator::MetricAggregator(CephContext *cct, MDSRank *mds, MgrClient *mgrc)
: Dispatcher(cct),
+ m_cct(cct),
mds(mds),
mgrc(mgrc),
mds_pinger(mds) {
@@ -32,6 +63,15 @@ void MetricAggregator::ping_all_active_ranks() {
int MetricAggregator::init() {
dout(10) << dendl;
+ std::string labels = ceph::perf_counters::key_create("mds_client_metrics",
+ {{"fs_name", mds->mdsmap->get_fs_name()},
+ {"id", stringify(mds->get_global_id())}});
+ PerfCountersBuilder plb(m_cct, labels, l_mds_client_metrics_start, l_mds_client_metrics_last);
+ plb.add_u64(l_mds_client_metrics_num_clients,
+ "num_clients", "Numer of client sessions", "mcli", PerfCountersBuilder::PRIO_CRITICAL);
+ m_perf_counters = plb.create_perf_counters();
+ m_cct->get_perfcounters_collection()->add(m_perf_counters);
+
pinger = std::thread([this]() {
std::unique_lock locker(lock);
while (!stopping) {
@@ -61,6 +101,24 @@ void MetricAggregator::shutdown() {
std::scoped_lock locker(lock);
ceph_assert(!stopping);
stopping = true;
+
+ // dealloc per-client perf counter
+ for (auto [crpair, pc] : client_perf_counters) {
+ PerfCounters *perf_counters = nullptr;
+ std::swap(perf_counters, pc);
+ if (perf_counters != nullptr) {
+ m_cct->get_perfcounters_collection()->remove(perf_counters);
+ delete perf_counters;
+ }
+ }
+ client_perf_counters.clear();
+
+ PerfCounters *perf_counters = nullptr;
+ std::swap(perf_counters, m_perf_counters);
+ if (perf_counters != nullptr) {
+ m_cct->get_perfcounters_collection()->remove(perf_counters);
+ delete perf_counters;
+ }
}
if (pinger.joinable()) {
@@ -97,10 +155,110 @@ void MetricAggregator::refresh_metrics_for_rank(const entity_inst_t &client,
<< metrics << dendl;
auto &p = clients_by_rank.at(rank);
+ auto crpair = std::make_pair(client, rank);
bool ins = p.insert(client).second;
if (ins) {
dout(20) << ": rank=" << rank << " has " << p.size() << " connected"
<< " client(s)" << dendl;
+ if (m_perf_counters) {
+ m_perf_counters->inc(l_mds_client_metrics_num_clients);
+ }
+
+ std::string labels = ceph::perf_counters::key_create("mds_client_metrics-" + std::string(mds->mdsmap->get_fs_name()),
+ {{"client", stringify(client.name)},
+ {"rank", stringify(rank)}});
+ PerfCountersBuilder plb(m_cct, labels, l_mds_per_client_metrics_start, l_mds_per_client_metrics_last);
+ plb.add_u64(l_mds_per_client_metrics_cap_hits,
+ "cap_hits", "Capability hits", "hcap", PerfCountersBuilder::PRIO_CRITICAL);
+ plb.add_u64(l_mds_per_client_metrics_cap_misses,
+ "cap_miss", "Capability misses", "mcap", PerfCountersBuilder::PRIO_CRITICAL);
+ plb.add_time(l_mds_per_client_metrics_avg_read_latency,
+ "avg_read_latency", "Average Read Latency", "arlt", PerfCountersBuilder::PRIO_CRITICAL);
+ plb.add_time(l_mds_per_client_metrics_avg_write_latency,
+ "avg_write_latency", "Average Write Latency", "awlt", PerfCountersBuilder::PRIO_CRITICAL);
+ plb.add_time(l_mds_per_client_metrics_avg_metadata_latency,
+ "avg_metadata_latency", "Average Metadata Latency", "amlt", PerfCountersBuilder::PRIO_CRITICAL);
+ plb.add_u64(l_mds_per_client_metrics_dentry_lease_hits,
+ "dentry_lease_hits", "Dentry Lease Hits", "hden", PerfCountersBuilder::PRIO_CRITICAL);
+ plb.add_u64(l_mds_per_client_metrics_dentry_lease_misses,
+ "dentry_lease_miss", "Dentry Lease Misses", "mden", PerfCountersBuilder::PRIO_CRITICAL);
+ plb.add_u64(l_mds_per_client_metrics_opened_files,
+ "opened_files", "Open Files", "ofil", PerfCountersBuilder::PRIO_CRITICAL);
+ plb.add_u64(l_mds_per_client_metrics_opened_inodes,
+ "opened_inodes", "Open Inodes", "oino", PerfCountersBuilder::PRIO_CRITICAL);
+ plb.add_u64(l_mds_per_client_metrics_pinned_icaps,
+ "pinned_icaps", "Pinned Inode Caps", "pino", PerfCountersBuilder::PRIO_CRITICAL);
+ plb.add_u64(l_mds_per_client_metrics_total_inodes,
+ "total_inodes", "Total Inodes", "tino", PerfCountersBuilder::PRIO_CRITICAL);
+ plb.add_u64(l_mds_per_client_metrics_total_read_ops,
+ "total_read_ops", "Total Read Operations", "rops", PerfCountersBuilder::PRIO_CRITICAL);
+ plb.add_u64(l_mds_per_client_metrics_total_read_size,
+ "total_read_size", "Total Read Size", "rsiz", PerfCountersBuilder::PRIO_CRITICAL);
+ plb.add_u64(l_mds_per_client_metrics_total_write_ops,
+ "total_write_ops", "Total Write Operations", "wops", PerfCountersBuilder::PRIO_CRITICAL);
+ plb.add_u64(l_mds_per_client_metrics_total_write_size,
+ "total_write_size", "Total Write Size", "wsiz", PerfCountersBuilder::PRIO_CRITICAL);
+ client_perf_counters[crpair] = plb.create_perf_counters();
+ m_cct->get_perfcounters_collection()->add(client_perf_counters[crpair]);
+ }
+
+ // update perf counters
+ PerfCounters *perf_counter_ptr = nullptr;
+ if (client_perf_counters.contains(crpair)) {
+ perf_counter_ptr = client_perf_counters[crpair];
+ }
+
+ if (perf_counter_ptr) {
+ // client capability hit ratio
+ perf_counter_ptr->set(l_mds_per_client_metrics_cap_hits, metrics.cap_hit_metric.hits);
+ perf_counter_ptr->set(l_mds_per_client_metrics_cap_misses, metrics.cap_hit_metric.misses);
+
+ // some averages
+ if (metrics.read_latency_metric.updated) {
+ utime_t ravg(metrics.read_latency_metric.mean.tv.tv_sec * 100,
+ metrics.read_latency_metric.mean.tv.tv_nsec / 1000000);
+ perf_counter_ptr->tset(l_mds_per_client_metrics_avg_read_latency, ravg);
+ }
+ if (metrics.write_latency_metric.updated) {
+ utime_t wavg(metrics.write_latency_metric.mean.tv.tv_sec * 100,
+ metrics.write_latency_metric.mean.tv.tv_nsec / 1000000);
+ perf_counter_ptr->set(l_mds_per_client_metrics_avg_write_latency, wavg);
+ }
+ if (metrics.metadata_latency_metric.updated) {
+ utime_t mavg(metrics.metadata_latency_metric.mean.tv.tv_sec * 100,
+ metrics.metadata_latency_metric.mean.tv.tv_nsec / 1000000);
+ perf_counter_ptr->set(l_mds_per_client_metrics_avg_metadata_latency, mavg);
+ }
+
+ // dentry leases
+ if (metrics.dentry_lease_metric.updated) {
+ perf_counter_ptr->set(l_mds_per_client_metrics_dentry_lease_hits, metrics.dentry_lease_metric.hits);
+ perf_counter_ptr->set(l_mds_per_client_metrics_dentry_lease_misses, metrics.dentry_lease_metric.misses);
+ }
+
+ // file+inode opens, pinned inode caps
+ if (metrics.opened_files_metric.updated) {
+ perf_counter_ptr->set(l_mds_per_client_metrics_opened_files, metrics.opened_files_metric.opened_files);
+ perf_counter_ptr->set(l_mds_per_client_metrics_total_inodes, metrics.opened_files_metric.total_inodes);
+ }
+ if (metrics.opened_inodes_metric.updated) {
+ perf_counter_ptr->set(l_mds_per_client_metrics_opened_inodes, metrics.opened_inodes_metric.total_inodes);
+ perf_counter_ptr->set(l_mds_per_client_metrics_total_inodes, metrics.opened_inodes_metric.total_inodes);
+ }
+ if (metrics.pinned_icaps_metric.updated) {
+ perf_counter_ptr->set(l_mds_per_client_metrics_pinned_icaps, metrics.pinned_icaps_metric.pinned_icaps);
+ perf_counter_ptr->set(l_mds_per_client_metrics_total_inodes, metrics.pinned_icaps_metric.total_inodes);
+ }
+
+ // read+write io metrics
+ if (metrics.read_io_sizes_metric.updated) {
+ perf_counter_ptr->set(l_mds_per_client_metrics_total_read_ops, metrics.read_io_sizes_metric.total_ops);
+ perf_counter_ptr->set(l_mds_per_client_metrics_total_read_size, metrics.read_io_sizes_metric.total_size);
+ }
+ if (metrics.write_io_sizes_metric.updated) {
+ perf_counter_ptr->set(l_mds_per_client_metrics_total_write_ops, metrics.write_io_sizes_metric.total_ops);
+ perf_counter_ptr->set(l_mds_per_client_metrics_total_write_size, metrics.write_io_sizes_metric.total_size);
+ }
}
auto update_counter_func = [&metrics](const MDSPerformanceCounterDescriptor &d,
@@ -260,6 +418,13 @@ void MetricAggregator::remove_metrics_for_rank(const entity_inst_t &client,
ceph_assert(rm);
dout(20) << ": rank=" << rank << " has " << p.size() << " connected"
<< " client(s)" << dendl;
+ auto crpair = std::make_pair(client, rank);
+ m_cct->get_perfcounters_collection()->remove(client_perf_counters[crpair]);
+ delete client_perf_counters[crpair];
+ client_perf_counters.erase(crpair);
+ }
+ if (m_perf_counters) {
+ m_perf_counters->dec(l_mds_client_metrics_num_clients);
}
auto sub_key_func = [client, rank](const MDSPerfMetricSubKeyDescriptor &d,
@@ -315,6 +480,10 @@ void MetricAggregator::handle_mds_metrics(const cref_t<MMDSMetrics> &m) {
<< rank << " with sequence number " << seq << dendl;
std::scoped_lock locker(lock);
+ if (stopping) {
+ dout(10) << ": stopping" << dendl;
+ return;
+ }
if (!mds_pinger.pong_received(rank, seq)) {
return;
}
diff --git a/src/mds/MetricAggregator.h b/src/mds/MetricAggregator.h
index fe9aef2e3..6d48756f7 100644
--- a/src/mds/MetricAggregator.h
+++ b/src/mds/MetricAggregator.h
@@ -11,6 +11,7 @@
#include "msg/msg_types.h"
#include "msg/Dispatcher.h"
#include "common/ceph_mutex.h"
+#include "common/perf_counters.h"
#include "include/common_fwd.h"
#include "messages/MMDSMetrics.h"
@@ -55,6 +56,7 @@ private:
// drop this lock when calling ->send_message_mds() else mds might
// deadlock
ceph::mutex lock = ceph::make_mutex("MetricAggregator::lock");
+ CephContext *m_cct;
MDSRank *mds;
MgrClient *mgrc;
@@ -72,6 +74,9 @@ private:
bool stopping = false;
+ PerfCounters *m_perf_counters;
+ std::map<std::pair<entity_inst_t, mds_rank_t>, PerfCounters*> client_perf_counters;
+
void handle_mds_metrics(const cref_t<MMDSMetrics> &m);
void refresh_metrics_for_rank(const entity_inst_t &client, mds_rank_t rank,
diff --git a/src/mds/Mutation.h b/src/mds/Mutation.h
index b963dee08..bc83f2191 100644
--- a/src/mds/Mutation.h
+++ b/src/mds/Mutation.h
@@ -387,6 +387,12 @@ struct MDRequestImpl : public MutationImpl {
void set_filepath(const filepath& fp);
void set_filepath2(const filepath& fp);
bool is_queued_for_replay() const;
+ bool get_queued_next_replay_op() const {
+ return queued_next_replay_op;
+ }
+ void set_queued_next_replay_op() {
+ queued_next_replay_op = true;
+ }
int compare_paths();
bool can_batch();
@@ -460,6 +466,7 @@ protected:
}
void _dump(ceph::Formatter *f, bool has_mds_lock) const;
void _dump_op_descriptor(std::ostream& stream) const override;
+ bool queued_next_replay_op = false;
};
struct MDPeerUpdate {
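
The new queued_next_replay_op flag gives journal_and_reply() (later in this patch) a way to queue the next replayed op exactly once, at early dispatch time, instead of waiting for the request to be cleaned up. A minimal once-only sketch of that flag, using placeholder names rather than the MDS API:

  #include <iostream>

  struct ReplayRequest {
    bool queued_next_replay_op = false;
    bool get_queued_next_replay_op() const { return queued_next_replay_op; }
    void set_queued_next_replay_op() { queued_next_replay_op = true; }
  };

  void queue_one_replay() { std::cout << "queued next replay op\n"; }

  // Called while the journal write for this replayed request is still in flight.
  void on_early_replay_dispatch(ReplayRequest& mdr) {
    if (!mdr.get_queued_next_replay_op()) {   // guard: queue at most once per request
      mdr.set_queued_next_replay_op();
      queue_one_replay();
    }
  }

  int main() {
    ReplayRequest mdr;
    on_early_replay_dispatch(mdr);
    on_early_replay_dispatch(mdr);   // second call is a no-op
  }
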
diff --git a/src/mds/OpenFileTable.h b/src/mds/OpenFileTable.h
index 1f91c2020..b18395213 100644
--- a/src/mds/OpenFileTable.h
+++ b/src/mds/OpenFileTable.h
@@ -113,7 +113,7 @@ protected:
version_t omap_version = 0;
- unsigned omap_num_objs = 0;
+ uint32_t omap_num_objs = 0;
std::vector<unsigned> omap_num_items;
std::map<inodeno_t, OpenedAnchor> anchor_map;
diff --git a/src/mds/ScrubStack.cc b/src/mds/ScrubStack.cc
index 6d799343f..742c464f4 100644
--- a/src/mds/ScrubStack.cc
+++ b/src/mds/ScrubStack.cc
@@ -320,7 +320,7 @@ void ScrubStack::scrub_dir_inode(CInode *in, bool *added_children, bool *done)
frag_vec_t frags;
in->dirfragtree.get_leaves(frags);
- dout(20) << __func__ << "recursive mode, frags " << frags << dendl;
+ dout(20) << __func__ << " recursive mode, frags " << frags << dendl;
for (auto &fg : frags) {
if (queued.contains(fg))
continue;
@@ -366,7 +366,6 @@ void ScrubStack::scrub_dir_inode(CInode *in, bool *added_children, bool *done)
scrub_r.tag = header->get_tag();
for (auto& p : scrub_remote) {
- p.second.simplify();
dout(20) << __func__ << " forward " << p.second << " to mds." << p.first << dendl;
auto r = make_message<MMDSScrub>(MMDSScrub::OP_QUEUEDIR, in->ino(),
std::move(p.second), header->get_tag(),
diff --git a/src/mds/Server.cc b/src/mds/Server.cc
index ced4ecffa..48e7b03ae 100644
--- a/src/mds/Server.cc
+++ b/src/mds/Server.cc
@@ -31,6 +31,7 @@
#include "Mutation.h"
#include "MetricsHandler.h"
#include "cephfs_features.h"
+#include "MDSContext.h"
#include "msg/Messenger.h"
@@ -305,6 +306,7 @@ void Server::dispatch(const cref_t<Message> &m)
return;
}
bool queue_replay = false;
+ dout(5) << "dispatch request in up:reconnect: " << *req << dendl;
if (req->is_replay() || req->is_async()) {
dout(3) << "queuing replayed op" << dendl;
queue_replay = true;
@@ -323,10 +325,13 @@ void Server::dispatch(const cref_t<Message> &m)
// process completed request in clientreplay stage. The completed request
// might have created new file/directorie. This guarantees MDS sends a reply
// to client before other request modifies the new file/directorie.
- if (session->have_completed_request(req->get_reqid().tid, NULL)) {
- dout(3) << "queuing completed op" << dendl;
+ bool r = session->have_completed_request(req->get_reqid().tid, NULL);
+ if (r) {
+ dout(3) << __func__ << ": queuing completed op" << dendl;
queue_replay = true;
- }
+ } else {
+ dout(20) << __func__ << ": request not complete" << dendl;
+ }
// this request was created before the cap reconnect message, drop any embedded
// cap releases.
req->releases.clear();
@@ -360,6 +365,9 @@ void Server::dispatch(const cref_t<Message> &m)
case CEPH_MSG_CLIENT_REQUEST:
handle_client_request(ref_cast<MClientRequest>(m));
return;
+ case CEPH_MSG_CLIENT_REPLY:
+ handle_client_reply(ref_cast<MClientReply>(m));
+ return;
case CEPH_MSG_CLIENT_RECLAIM:
handle_client_reclaim(ref_cast<MClientReclaim>(m));
return;
@@ -615,6 +623,7 @@ void Server::handle_client_session(const cref_t<MClientSession> &m)
session->get_push_seq());
if (session->info.has_feature(CEPHFS_FEATURE_MIMIC))
reply->supported_features = supported_features;
+ session->auth_caps.get_cap_auths(&reply->cap_auths);
mds->send_message_client(reply, session);
if (mdcache->is_readonly()) {
auto m = make_message<MClientSession>(CEPH_SESSION_FORCE_RO);
@@ -708,6 +717,17 @@ void Server::handle_client_session(const cref_t<MClientSession> &m)
break;
}
+ if (session->auth_caps.root_squash_in_caps() && !client_metadata.features.test(CEPHFS_FEATURE_MDS_AUTH_CAPS_CHECK)) {
+ CachedStackStringStream css;
+ *css << "client lacks CEPHFS_FEATURE_MDS_AUTH_CAPS_CHECK needed to enforce 'root_squash' MDS auth caps";
+ send_reject_message(css->strv());
+ mds->clog->warn() << "client session (" << session->info.inst
+ << ") lacks CEPHFS_FEATURE_MDS_AUTH_CAPS_CHECK "
+ << " needed to enforce 'root_squash' MDS auth caps";
+ session->clear();
+ break;
+
+ }
// Special case for the 'root' metadata path; validate that the claimed
// root is actually within the caps of the session
if (auto it = client_metadata.find("root"); it != client_metadata.end()) {
@@ -769,6 +789,7 @@ void Server::handle_client_session(const cref_t<MClientSession> &m)
mds->locker->resume_stale_caps(session);
mds->sessionmap.touch_session(session);
}
+ trim_completed_request_list(m->oldest_client_tid, session);
auto reply = make_message<MClientSession>(CEPH_SESSION_RENEWCAPS, m->get_seq());
mds->send_message_client(reply, session);
} else {
@@ -905,6 +926,7 @@ void Server::_session_logged(Session *session, uint64_t state_seq, bool open, ve
reply->supported_features = supported_features;
reply->metric_spec = supported_metric_spec;
}
+ session->auth_caps.get_cap_auths(&reply->cap_auths);
mds->send_message_client(reply, session);
if (mdcache->is_readonly()) {
auto m = make_message<MClientSession>(CEPH_SESSION_FORCE_RO);
@@ -1061,6 +1083,7 @@ void Server::finish_force_open_sessions(const map<client_t,pair<Session*,uint64_
reply->supported_features = supported_features;
reply->metric_spec = supported_metric_spec;
}
+ session->auth_caps.get_cap_auths(&reply->cap_auths);
mds->send_message_client(reply, session);
if (mdcache->is_readonly())
@@ -1132,10 +1155,12 @@ void Server::find_idle_sessions()
return;
}
- std::vector<Session*> to_evict;
-
bool defer_session_stale = g_conf().get_val<bool>("mds_defer_session_stale");
const auto sessions_p1 = mds->sessionmap.by_state.find(Session::STATE_OPEN);
+ bool defer_client_eviction =
+ g_conf().get_val<bool>("defer_client_eviction_on_laggy_osds")
+ && mds->objecter->with_osdmap([](const OSDMap &map) {
+ return map.any_osd_laggy(); });
if (sessions_p1 != mds->sessionmap.by_state.end() && !sessions_p1->second->empty()) {
std::vector<Session*> new_stale;
@@ -1160,7 +1185,7 @@ void Server::find_idle_sessions()
dout(20) << "evicting session " << session->info.inst << " since autoclose "
"has arrived" << dendl;
// evict session without marking it stale
- to_evict.push_back(session);
+ laggy_clients.insert(session->get_client());
continue;
}
@@ -1189,7 +1214,7 @@ void Server::find_idle_sessions()
}
// do not go through stale, evict it directly.
- to_evict.push_back(session);
+ laggy_clients.insert(session->get_client());
} else {
dout(10) << "new stale session " << session->info.inst
<< " last renewed caps " << last_cap_renew_span << "s ago" << dendl;
@@ -1205,7 +1230,7 @@ void Server::find_idle_sessions()
auto m = make_message<MClientSession>(CEPH_SESSION_STALE);
mds->send_message_client(m, session);
} else {
- to_evict.push_back(session);
+ laggy_clients.insert(session->get_client());
}
}
}
@@ -1224,11 +1249,21 @@ void Server::find_idle_sessions()
<< " and recently renewed caps " << last_cap_renew_span << "s ago" << dendl;
break;
}
- to_evict.push_back(session);
+ laggy_clients.insert(session->get_client());
}
}
- for (auto session: to_evict) {
+ // don't evict client(s) if osds are laggy
+ if(defer_client_eviction && !laggy_clients.empty()) {
+ dout(5) << "Detected " << laggy_clients.size()
+ << " laggy clients, possibly due to laggy OSDs."
+ " Eviction is skipped until the OSDs return to normal."
+ << dendl;
+ return;
+ }
+
+ for (auto client: laggy_clients) {
+ Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(client.v));
if (session->is_importing()) {
dout(10) << "skipping session " << session->info.inst << ", it's being imported" << dendl;
continue;
@@ -1247,6 +1282,8 @@ void Server::find_idle_sessions()
kill_session(session, NULL);
}
}
+ // clear as there's no use to keep the evicted clients in laggy_clients
+ clear_laggy_clients();
}
void Server::evict_cap_revoke_non_responders() {
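
Both this hunk and the next one gate eviction on the same pair of checks: the defer_client_eviction_on_laggy_osds option and a live OSDMap query for laggy OSDs. Candidates are parked in laggy_clients (which the beacon turns into a health warning) instead of being evicted while OSDs are laggy. A rough standalone sketch of that gate; the option lookup and any_osd_laggy() here are stand-ins, not the ConfigProxy/Objecter calls:

  #include <iostream>
  #include <set>

  static bool defer_on_laggy_osds = true;        // stand-in for the config option
  static bool any_osd_laggy() { return true; }   // stand-in for the OSDMap check

  std::set<int> laggy_clients;   // clients we would otherwise evict

  void find_idle_sessions(const std::set<int>& overdue) {
    bool defer = defer_on_laggy_osds && any_osd_laggy();
    laggy_clients.insert(overdue.begin(), overdue.end());
    if (defer && !laggy_clients.empty()) {
      std::cout << "deferring eviction of " << laggy_clients.size()
                << " laggy client(s) while OSDs are laggy\n";
      return;                    // keep them around for the health warning
    }
    for (int client : laggy_clients)
      std::cout << "evicting client." << client << "\n";
    laggy_clients.clear();
  }

  int main() { find_idle_sessions({4201, 4202}); }
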
@@ -1255,6 +1292,20 @@ void Server::evict_cap_revoke_non_responders() {
}
auto&& to_evict = mds->locker->get_late_revoking_clients(cap_revoke_eviction_timeout);
+ // don't evict client(s) if osds are laggy
+ bool defer_client_eviction =
+ g_conf().get_val<bool>("defer_client_eviction_on_laggy_osds")
+ && mds->objecter->with_osdmap([](const OSDMap &map) {
+ return map.any_osd_laggy(); })
+ && to_evict.size();
+ if(defer_client_eviction) {
+ laggy_clients.insert(to_evict.begin(), to_evict.end());
+ dout(0) << "Detected " << to_evict.size()
+ << " unresponsive clients, possibly due to laggy OSDs."
+ " Eviction is skipped until the OSDs return to normal."
+ << dendl;
+ return;
+ }
for (auto const &client: to_evict) {
mds->clog->warn() << "client id " << client << " has not responded to"
@@ -1522,6 +1573,12 @@ void Server::handle_client_reconnect(const cref_t<MClientReconnect> &m)
*css << "missing required features '" << missing_features << "'";
error_str = css->strv();
}
+ if (session->auth_caps.root_squash_in_caps() &&
+ !session->info.client_metadata.features.test(CEPHFS_FEATURE_MDS_AUTH_CAPS_CHECK)) {
+ CachedStackStringStream css;
+ *css << "client lacks CEPHFS_FEATURE_MDS_AUTH_CAPS_CHECK needed to enforce 'root_squash' MDS auth caps";
+ error_str = css->strv();
+ }
}
if (!error_str.empty()) {
@@ -1549,6 +1606,7 @@ void Server::handle_client_reconnect(const cref_t<MClientReconnect> &m)
reply->supported_features = supported_features;
reply->metric_spec = supported_metric_spec;
}
+ session->auth_caps.get_cap_auths(&reply->cap_auths);
mds->send_message_client(reply, session);
mds->clog->debug() << "reconnect by " << session->info.inst << " after " << delay;
}
@@ -1984,12 +2042,15 @@ void Server::journal_and_reply(MDRequestRef& mdr, CInode *in, CDentry *dn, LogEv
mdr->committing = true;
submit_mdlog_entry(le, fin, mdr, __func__);
- if (mdr->client_request && mdr->client_request->is_queued_for_replay()) {
- if (mds->queue_one_replay()) {
- dout(10) << " queued next replay op" << dendl;
- } else {
- dout(10) << " journaled last replay op" << dendl;
- }
+ if (mdr->is_queued_for_replay()) {
+
+ /* We want to queue the next replay op while waiting for the journaling, so
+ * do it now when the early (unsafe) replay is dispatched. Don't wait until
+ * this request is cleaned up in MDCache.cc.
+ */
+
+ mdr->set_queued_next_replay_op();
+ mds->queue_one_replay();
} else if (mdr->did_early_reply)
mds->locker->drop_rdlocks_for_early_reply(mdr.get());
else
@@ -2293,15 +2354,16 @@ void Server::reply_client_request(MDRequestRef& mdr, const ref_t<MClientReply> &
mds->send_message_client(reply, session);
}
- if (req->is_queued_for_replay() &&
- (mdr->has_completed || reply->get_result() < 0)) {
- if (reply->get_result() < 0) {
- int r = reply->get_result();
+ if (client_inst.name.is_mds() && reply->get_op() == CEPH_MDS_OP_RENAME) {
+ mds->send_message(reply, mdr->client_request->get_connection());
+ }
+
+ if (req->is_queued_for_replay()) {
+ if (int r = reply->get_result(); r < 0) {
derr << "reply_client_request: failed to replay " << *req
- << " error " << r << " (" << cpp_strerror(r) << ")" << dendl;
+ << " error " << r << " (" << cpp_strerror(r) << ")" << dendl;
mds->clog->warn() << "failed to replay " << req->get_reqid() << " error " << r;
}
- mds->queue_one_replay();
}
// clean up request
@@ -2391,6 +2453,35 @@ void Server::set_trace_dist(const ref_t<MClientReply> &reply,
reply->set_trace(bl);
}
+// trim completed_request list
+void Server::trim_completed_request_list(ceph_tid_t tid, Session *session)
+{
+ if (tid == UINT64_MAX || !session)
+ return;
+
+ dout(15) << " oldest_client_tid=" << tid << dendl;
+ if (session->trim_completed_requests(tid)) {
+ // Session's 'completed_requests' was dirtied, mark it to be
+ // potentially flushed at segment expiry.
+ mdlog->get_current_segment()->touched_sessions.insert(session->info.inst.name);
+
+ if (session->get_num_trim_requests_warnings() > 0 &&
+ session->get_num_completed_requests() * 2 < g_conf()->mds_max_completed_requests)
+ session->reset_num_trim_requests_warnings();
+ } else {
+ if (session->get_num_completed_requests() >=
+ (g_conf()->mds_max_completed_requests << session->get_num_trim_requests_warnings())) {
+ session->inc_num_trim_requests_warnings();
+ CachedStackStringStream css;
+ *css << "client." << session->get_client() << " does not advance its oldest_client_tid ("
+ << tid << "), " << session->get_num_completed_requests()
+ << " completed requests recorded in session\n";
+ mds->clog->warn() << css->strv();
+ dout(20) << __func__ << " " << css->strv() << dendl;
+ }
+ }
+}
+
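
The warning branch above fires when a client stops advancing oldest_client_tid, and each warning doubles the bar for the next one (mds_max_completed_requests shifted left by the warning count), so the cluster log is not flooded by the same stuck client. A small sketch of that back-off with made-up numbers:

  #include <cstdint>
  #include <iostream>

  int main() {
    const uint64_t mds_max_completed_requests = 100000;   // illustrative value
    uint64_t completed = 250000;                          // backlog held by the session
    unsigned warnings = 0;

    // Each warning raises the threshold for the next one: 100000, 200000, 400000, ...
    while (completed >= (mds_max_completed_requests << warnings)) {
      std::cout << "warning " << warnings + 1 << ", threshold was "
                << (mds_max_completed_requests << warnings) << "\n";
      ++warnings;
    }
  }
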
void Server::handle_client_request(const cref_t<MClientRequest> &req)
{
dout(4) << "handle_client_request " << *req << dendl;
@@ -2472,36 +2563,16 @@ void Server::handle_client_request(const cref_t<MClientRequest> &req)
}
// trim completed_request list
- if (req->get_oldest_client_tid() > 0) {
- dout(15) << " oldest_client_tid=" << req->get_oldest_client_tid() << dendl;
- ceph_assert(session);
- if (session->trim_completed_requests(req->get_oldest_client_tid())) {
- // Sessions 'completed_requests' was dirtied, mark it to be
- // potentially flushed at segment expiry.
- mdlog->get_current_segment()->touched_sessions.insert(session->info.inst.name);
-
- if (session->get_num_trim_requests_warnings() > 0 &&
- session->get_num_completed_requests() * 2 < g_conf()->mds_max_completed_requests)
- session->reset_num_trim_requests_warnings();
- } else {
- if (session->get_num_completed_requests() >=
- (g_conf()->mds_max_completed_requests << session->get_num_trim_requests_warnings())) {
- session->inc_num_trim_requests_warnings();
- CachedStackStringStream css;
- *css << "client." << session->get_client() << " does not advance its oldest_client_tid ("
- << req->get_oldest_client_tid() << "), "
- << session->get_num_completed_requests()
- << " completed requests recorded in session\n";
- mds->clog->warn() << css->strv();
- dout(20) << __func__ << " " << css->strv() << dendl;
- }
- }
- }
+ trim_completed_request_list(req->get_oldest_client_tid(), session);
// register + dispatch
MDRequestRef mdr = mdcache->request_start(req);
- if (!mdr.get())
+ if (!mdr.get()) {
+ dout(5) << __func__ << ": possibly duplicate op " << *req << dendl;
+ if (req->is_queued_for_replay())
+ mds->queue_one_replay();
return;
+ }
if (session) {
mdr->session = session;
@@ -2525,6 +2596,28 @@ void Server::handle_client_request(const cref_t<MClientRequest> &req)
return;
}
+void Server::handle_client_reply(const cref_t<MClientReply> &reply)
+{
+ dout(4) << "handle_client_reply " << *reply << dendl;
+
+ ceph_assert(reply->is_safe());
+ ceph_tid_t tid = reply->get_tid();
+
+ if (mds->internal_client_requests.count(tid) == 0) {
+ dout(1) << " no pending request on tid " << tid << dendl;
+ return;
+ }
+
+ switch (reply->get_op()) {
+ case CEPH_MDS_OP_RENAME:
+ break;
+ default:
+ dout(5) << " unknown client op " << reply->get_op() << dendl;
+ }
+
+ mds->internal_client_requests.erase(tid);
+}
+
void Server::handle_osd_map()
{
/* Note that we check the OSDMAP_FULL flag directly rather than
@@ -4534,6 +4627,20 @@ public:
}
};
+bool Server::is_valid_layout(file_layout_t *layout)
+{
+ if (!layout->is_valid()) {
+ dout(10) << " invalid initial file layout" << dendl;
+ return false;
+ }
+ if (!mds->mdsmap->is_data_pool(layout->pool_id)) {
+ dout(10) << " invalid data pool " << layout->pool_id << dendl;
+ return false;
+ }
+
+ return true;
+}
+
/* This function takes responsibility for the passed mdr*/
void Server::handle_client_openc(MDRequestRef& mdr)
{
@@ -4608,13 +4715,7 @@ void Server::handle_client_openc(MDRequestRef& mdr)
access |= MAY_SET_VXATTR;
}
- if (!layout.is_valid()) {
- dout(10) << " invalid initial file layout" << dendl;
- respond_to_request(mdr, -CEPHFS_EINVAL);
- return;
- }
- if (!mds->mdsmap->is_data_pool(layout.pool_id)) {
- dout(10) << " invalid data pool " << layout.pool_id << dendl;
+ if (!is_valid_layout(&layout)) {
respond_to_request(mdr, -CEPHFS_EINVAL);
return;
}
@@ -4864,7 +4965,7 @@ void Server::handle_client_readdir(MDRequestRef& mdr)
unsigned max_bytes = req->head.args.readdir.max_bytes;
if (!max_bytes)
// make sure at least one item can be encoded
- max_bytes = (512 << 10) + g_conf()->mds_max_xattr_pairs_size;
+ max_bytes = (512 << 10) + mds->mdsmap->get_max_xattr_size();
// start final blob
bufferlist dirbl;
@@ -5503,13 +5604,7 @@ void Server::handle_client_setlayout(MDRequestRef& mdr)
access |= MAY_SET_VXATTR;
}
- if (!layout.is_valid()) {
- dout(10) << "bad layout" << dendl;
- respond_to_request(mdr, -CEPHFS_EINVAL);
- return;
- }
- if (!mds->mdsmap->is_data_pool(layout.pool_id)) {
- dout(10) << " invalid data pool " << layout.pool_id << dendl;
+ if (!is_valid_layout(&layout)) {
respond_to_request(mdr, -CEPHFS_EINVAL);
return;
}
@@ -5636,14 +5731,8 @@ void Server::handle_client_setdirlayout(MDRequestRef& mdr)
if (layout != old_layout) {
access |= MAY_SET_VXATTR;
}
-
- if (!layout.is_valid()) {
- dout(10) << "bad layout" << dendl;
- respond_to_request(mdr, -CEPHFS_EINVAL);
- return;
- }
- if (!mds->mdsmap->is_data_pool(layout.pool_id)) {
- dout(10) << " invalid data pool " << layout.pool_id << dendl;
+
+ if (!is_valid_layout(&layout)) {
respond_to_request(mdr, -CEPHFS_EINVAL);
return;
}
@@ -5821,15 +5910,11 @@ int Server::parse_layout_vxattr(string name, string value, const OSDMap& osdmap,
if (r < 0) {
return r;
}
-
- if (validate && !layout->is_valid()) {
- dout(10) << __func__ << ": bad layout" << dendl;
- return -CEPHFS_EINVAL;
- }
- if (!mds->mdsmap->is_data_pool(layout->pool_id)) {
- dout(10) << __func__ << ": invalid data pool " << layout->pool_id << dendl;
- return -CEPHFS_EINVAL;
+
+ if (!is_valid_layout(layout)) {
+ return -CEPHFS_EINVAL;
}
+
return 0;
}
@@ -5859,9 +5944,13 @@ int Server::parse_quota_vxattr(string name, string value, quota_info_t *quota)
return r;
}
} else if (name == "quota.max_bytes") {
- int64_t q = boost::lexical_cast<int64_t>(value);
- if (q < 0)
+ string cast_err;
+ int64_t q = strict_iec_cast<int64_t>(value, &cast_err);
+ if(!cast_err.empty()) {
+ dout(10) << __func__ << ": failed to parse quota.max_bytes: "
+ << cast_err << dendl;
return -CEPHFS_EINVAL;
+ }
quota->max_bytes = q;
} else if (name == "quota.max_files") {
int64_t q = boost::lexical_cast<int64_t>(value);
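
With strict_iec_cast, quota.max_bytes now accepts size strings with binary-unit suffixes (for example 10T) rather than only plain byte counts, and a parse failure is reported and turned into CEPHFS_EINVAL. A rough standalone approximation of that parsing, not Ceph's strict_iec_cast, just to illustrate the accepted form:

  #include <cstdint>
  #include <iostream>
  #include <string>

  // Tiny base-1024 parser: <non-negative number>[K|M|G|T]. Illustrative only.
  bool parse_iec(const std::string& s, int64_t* out, std::string* err) {
    size_t pos = 0;
    long long n = 0;
    try {
      n = std::stoll(s, &pos);
    } catch (...) {
      *err = "not a number";
      return false;
    }
    if (n < 0) { *err = "negative size"; return false; }
    int shift = 0;
    if (pos < s.size()) {
      switch (s[pos]) {
        case 'K': shift = 10; break;
        case 'M': shift = 20; break;
        case 'G': shift = 30; break;
        case 'T': shift = 40; break;
        default:  *err = "bad suffix"; return false;
      }
    }
    *out = static_cast<int64_t>(n) << shift;
    return true;
  }

  int main() {
    int64_t q = 0;
    std::string err;
    if (parse_iec("10T", &q, &err))
      std::cout << "quota.max_bytes = " << q << "\n";   // 10 * 2^40
    else
      std::cout << "rejected: " << err << "\n";
  }
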
@@ -6127,6 +6216,10 @@ void Server::handle_set_vxattr(MDRequestRef& mdr, CInode *cur)
inodeno_t subvol_ino = realm->get_subvolume_ino();
// can't create subvolume inside another subvolume
if (subvol_ino && subvol_ino != cur->ino()) {
+ dout(20) << "subvol ino changed between rdlock release and xlock "
+ << "policylock; subvol_ino: " << subvol_ino << ", "
+ << "cur->ino: " << cur->ino()
+ << dendl;
respond_to_request(mdr, -CEPHFS_EINVAL);
return;
}
@@ -6141,10 +6234,13 @@ void Server::handle_set_vxattr(MDRequestRef& mdr, CInode *cur)
auto pi = cur->project_inode(mdr, false, true);
if (!srnode)
pi.snapnode->created = pi.snapnode->seq = realm->get_newest_seq();
- if (val)
+ if (val) {
+ dout(20) << "marking subvolume for ino: " << cur->ino() << dendl;
pi.snapnode->mark_subvolume();
- else
+ } else {
+ dout(20) << "clearing subvolume for ino: " << cur->ino() << dendl;
pi.snapnode->clear_subvolume();
+ }
mdr->no_early_reply = true;
pip = pi.inode.get();
@@ -6531,9 +6627,9 @@ void Server::handle_client_setxattr(MDRequestRef& mdr)
auto handler = Server::get_xattr_or_default_handler(name);
const auto& pxattrs = cur->get_projected_xattrs();
+ size_t cur_xattrs_size = 0;
if (pxattrs) {
// check xattrs kv pairs size
- size_t cur_xattrs_size = 0;
for (const auto& p : *pxattrs) {
if ((flags & CEPH_XATTR_REPLACE) && name.compare(p.first) == 0) {
continue;
@@ -6541,12 +6637,12 @@ void Server::handle_client_setxattr(MDRequestRef& mdr)
cur_xattrs_size += p.first.length() + p.second.length();
}
- if (((cur_xattrs_size + inc) > g_conf()->mds_max_xattr_pairs_size)) {
- dout(10) << "xattr kv pairs size too big. cur_xattrs_size "
- << cur_xattrs_size << ", inc " << inc << dendl;
- respond_to_request(mdr, -CEPHFS_ENOSPC);
- return;
- }
+ }
+ if (((cur_xattrs_size + inc) > mds->mdsmap->get_max_xattr_size())) {
+ dout(10) << "xattr kv pairs size too big. cur_xattrs_size "
+ << cur_xattrs_size << ", inc " << inc << dendl;
+ respond_to_request(mdr, -CEPHFS_ENOSPC);
+ return;
}
XattrOp xattr_op(CEPH_MDS_OP_SETXATTR, name, req->get_data(), flags);
@@ -6904,6 +7000,11 @@ void Server::handle_client_mknod(MDRequestRef& mdr)
else
layout = mdcache->default_file_layout;
+ if (!is_valid_layout(&layout)) {
+ respond_to_request(mdr, -CEPHFS_EINVAL);
+ return;
+ }
+
CInode *newi = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino), mode, &layout);
ceph_assert(newi);
@@ -10759,7 +10860,7 @@ void Server::handle_client_lssnap(MDRequestRef& mdr)
int max_bytes = req->head.args.readdir.max_bytes;
if (!max_bytes)
// make sure at least one item can be encoded
- max_bytes = (512 << 10) + g_conf()->mds_max_xattr_pairs_size;
+ max_bytes = (512 << 10) + mds->mdsmap->get_max_xattr_size();
__u64 last_snapid = 0;
string offset_str = req->get_path2();
@@ -11413,7 +11514,7 @@ void Server::handle_client_readdir_snapdiff(MDRequestRef& mdr)
unsigned max_bytes = req->head.args.snapdiff.max_bytes;
if (!max_bytes)
// make sure at least one item can be encoded
- max_bytes = (512 << 10) + g_conf()->mds_max_xattr_pairs_size;
+ max_bytes = (512 << 10) + mds->mdsmap->get_max_xattr_size();
// start final blob
bufferlist dirbl;
diff --git a/src/mds/Server.h b/src/mds/Server.h
index 81a5933ba..47f86518b 100644
--- a/src/mds/Server.h
+++ b/src/mds/Server.h
@@ -158,7 +158,9 @@ public:
void force_clients_readonly();
// -- requests --
+ void trim_completed_request_list(ceph_tid_t tid, Session *session);
void handle_client_request(const cref_t<MClientRequest> &m);
+ void handle_client_reply(const cref_t<MClientReply> &m);
void journal_and_reply(MDRequestRef& mdr, CInode *tracei, CDentry *tracedn,
LogEvent *le, MDSLogContextBase *fin);
@@ -235,6 +237,9 @@ public:
void handle_client_removexattr(MDRequestRef& mdr);
void handle_client_fsync(MDRequestRef& mdr);
+
+ // check layout
+ bool is_valid_layout(file_layout_t *layout);
// open
void handle_client_open(MDRequestRef& mdr);
@@ -328,6 +333,13 @@ public:
std::set<client_t> client_reclaim_gather;
+ std::set<client_t> get_laggy_clients() const {
+ return laggy_clients;
+ }
+ void clear_laggy_clients() {
+ laggy_clients.clear();
+ }
+
const bufferlist& get_snap_trace(Session *session, SnapRealm *realm) const;
const bufferlist& get_snap_trace(client_t client, SnapRealm *realm) const;
@@ -553,6 +565,9 @@ private:
size_t alternate_name_max = g_conf().get_val<Option::size_t>("mds_alternate_name_max");
size_t fscrypt_last_block_max_size = g_conf().get_val<Option::size_t>("mds_fscrypt_last_block_max_size");
+
+ // record laggy clients due to laggy OSDs
+ std::set<client_t> laggy_clients;
};
static inline constexpr auto operator|(Server::RecallFlags a, Server::RecallFlags b) {
diff --git a/src/mds/SessionMap.cc b/src/mds/SessionMap.cc
index 720396338..9cc2b0138 100644
--- a/src/mds/SessionMap.cc
+++ b/src/mds/SessionMap.cc
@@ -622,6 +622,9 @@ void Session::dump(Formatter *f, bool cap_dump) const
f->dump_object("session_cache_liveness", session_cache_liveness);
f->dump_object("cap_acquisition", cap_acquisition);
+ f->dump_unsigned("last_trim_completed_requests_tid", last_trim_completed_requests_tid);
+ f->dump_unsigned("last_trim_completed_flushes_tid", last_trim_completed_flushes_tid);
+
f->open_array_section("delegated_inos");
for (const auto& [start, len] : delegated_inos) {
f->open_object_section("ino_range");
diff --git a/src/mds/SessionMap.h b/src/mds/SessionMap.h
index ddf227be9..360dd66a2 100644
--- a/src/mds/SessionMap.h
+++ b/src/mds/SessionMap.h
@@ -314,6 +314,7 @@ public:
bool trim_completed_requests(ceph_tid_t mintid) {
// trim
bool erased_any = false;
+ last_trim_completed_requests_tid = mintid;
while (!info.completed_requests.empty() &&
(mintid == 0 || info.completed_requests.begin()->first < mintid)) {
info.completed_requests.erase(info.completed_requests.begin());
@@ -339,6 +340,7 @@ public:
}
bool trim_completed_flushes(ceph_tid_t mintid) {
bool erased_any = false;
+ last_trim_completed_flushes_tid = mintid;
while (!info.completed_flushes.empty() &&
(mintid == 0 || *info.completed_flushes.begin() < mintid)) {
info.completed_flushes.erase(info.completed_flushes.begin());
@@ -493,6 +495,9 @@ private:
unsigned num_trim_flushes_warnings = 0;
unsigned num_trim_requests_warnings = 0;
+
+ ceph_tid_t last_trim_completed_requests_tid = 0;
+ ceph_tid_t last_trim_completed_flushes_tid = 0;
};
class SessionFilter
diff --git a/src/mds/SnapRealm.cc b/src/mds/SnapRealm.cc
index 9d303bcb0..ac8cdf832 100644
--- a/src/mds/SnapRealm.cc
+++ b/src/mds/SnapRealm.cc
@@ -309,7 +309,7 @@ void SnapRealm::adjust_parent()
void SnapRealm::split_at(SnapRealm *child)
{
- dout(10) << "split_at " << *child
+ dout(10) << __func__ << ": " << *child
<< " on " << *child->inode << dendl;
if (inode->is_mdsdir() || !child->inode->is_dir()) {
@@ -328,8 +328,23 @@ void SnapRealm::split_at(SnapRealm *child)
// it's a dir.
+ if (child->inode->get_projected_parent_dir()->inode->is_stray()) {
+ if (child->inode->containing_realm) {
+ dout(10) << " moving unlinked directory inode" << dendl;
+ child->inode->move_to_realm(child);
+ } else {
+ /* This shouldn't happen because an unlinked directory will have caps
+ * issued to the caller executing rmdir (for today's clients).
+ */
+ dout(10) << " skipping unlinked directory inode w/o caps" << dendl;
+ }
+ return;
+ }
+
// split open_children
- dout(10) << " open_children are " << open_children << dendl;
+ if (!open_children.empty()) {
+ dout(10) << " open_children are " << open_children << dendl;
+ }
for (set<SnapRealm*>::iterator p = open_children.begin();
p != open_children.end(); ) {
SnapRealm *realm = *p;
@@ -346,17 +361,25 @@ void SnapRealm::split_at(SnapRealm *child)
}
// split inodes_with_caps
+ std::unordered_map<CInode const*,bool> visited;
+ uint64_t count = 0;
+ dout(20) << " reserving space for " << CDir::count() << " dirs" << dendl;
+ visited.reserve(CDir::count()); /* a reasonable starting point: keep in mind there may be CInode directories without fragments in cache */
for (auto p = inodes_with_caps.begin(); !p.end(); ) {
CInode *in = *p;
++p;
// does inode fall within the child realm?
- if (child->inode->is_ancestor_of(in)) {
- dout(20) << " child gets " << *in << dendl;
+ if (child->inode->is_ancestor_of(in, &visited)) {
+ dout(25) << " child gets " << *in << dendl;
in->move_to_realm(child);
+ ++count;
} else {
- dout(20) << " keeping " << *in << dendl;
+ dout(25) << " keeping " << *in << dendl;
}
}
+ dout(20) << " visited " << visited.size() << " directories" << dendl;
+
+ dout(10) << __func__ << ": split " << count << " inodes" << dendl;
}
void SnapRealm::merge_to(SnapRealm *newparent)
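
The split now threads a visited map through is_ancestor_of(), so once any inode on a walked parent chain has been classified as inside or outside the child realm, later inodes sharing that prefix are answered from the cache instead of re-walking to the root. A standalone sketch of that memoization over a parent-pointer tree (Node is a placeholder for CInode, and the cache is only valid for a single ancestor target, as in the split):

  #include <iostream>
  #include <unordered_map>
  #include <vector>

  struct Node { const Node* parent = nullptr; };

  // Is `anc` an ancestor of `n`? Every node on the walked path is cached in
  // `visited`, so repeated queries over the same subtree stop early.
  bool is_ancestor_of(const Node* anc, const Node* n,
                      std::unordered_map<const Node*, bool>* visited) {
    std::vector<const Node*> path;
    bool result = false;
    while (n) {
      if (auto it = visited->find(n); it != visited->end()) { result = it->second; break; }
      if (n == anc) { result = true; break; }
      path.push_back(n);
      n = n->parent;
    }
    for (const Node* p : path)
      (*visited)[p] = result;   // memoize the whole walked prefix
    return result;
  }

  int main() {
    Node root, dir{&root}, file{&dir}, other{&root};
    std::unordered_map<const Node*, bool> visited;
    std::cout << is_ancestor_of(&dir, &file, &visited) << "\n";   // 1
    std::cout << is_ancestor_of(&dir, &other, &visited) << "\n";  // 0
  }
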
diff --git a/src/mds/StrayManager.cc b/src/mds/StrayManager.cc
index d288ce661..1f729b9c1 100644
--- a/src/mds/StrayManager.cc
+++ b/src/mds/StrayManager.cc
@@ -675,24 +675,41 @@ void StrayManager::reintegrate_stray(CDentry *straydn, CDentry *rdn)
{
dout(10) << __func__ << " " << *straydn << " to " << *rdn << dendl;
+ if (straydn->reintegration_reqid) {
+ dout(20) << __func__ << ": stray dentry " << *straydn
+ << " is already under reintegrating" << dendl;
+ return;
+ }
+
logger->inc(l_mdc_strays_reintegrated);
-
+
// rename it to remote linkage .
filepath src(straydn->get_name(), straydn->get_dir()->ino());
filepath dst(rdn->get_name(), rdn->get_dir()->ino());
+ ceph_tid_t tid = mds->issue_tid();
+
auto req = make_message<MClientRequest>(CEPH_MDS_OP_RENAME);
req->set_filepath(dst);
req->set_filepath2(src);
- req->set_tid(mds->issue_tid());
+ req->set_tid(tid);
+
+ auto ptr = std::make_unique<StrayEvalRequest>(CEPH_MDS_OP_RENAME, tid, straydn);
+ mds->internal_client_requests.emplace(tid, std::move(ptr));
mds->send_message_mds(req, rdn->authority().first);
}
-
+
void StrayManager::migrate_stray(CDentry *dn, mds_rank_t to)
{
dout(10) << __func__ << " " << *dn << " to mds." << to << dendl;
+ if (dn->reintegration_reqid) {
+ dout(20) << __func__ << ": stray dentry " << *dn
+ << " is already under migrating" << dendl;
+ return;
+ }
+
logger->inc(l_mdc_strays_migrated);
// rename it to another mds.
@@ -702,10 +719,15 @@ void StrayManager::migrate_stray(CDentry *dn, mds_rank_t to)
filepath src(dn->get_name(), dirino);
filepath dst(dn->get_name(), MDS_INO_STRAY(to, MDS_INO_STRAY_INDEX(dirino)));
+ ceph_tid_t tid = mds->issue_tid();
+
auto req = make_message<MClientRequest>(CEPH_MDS_OP_RENAME);
req->set_filepath(dst);
req->set_filepath2(src);
- req->set_tid(mds->issue_tid());
+ req->set_tid(tid);
+
+ auto ptr = std::make_unique<StrayEvalRequest>(CEPH_MDS_OP_RENAME, tid, dn);
+ mds->internal_client_requests.emplace(tid, std::move(ptr));
mds->send_message_mds(req, to);
}
diff --git a/src/mds/StrayManager.h b/src/mds/StrayManager.h
index 86b6941a5..874fbbb9a 100644
--- a/src/mds/StrayManager.h
+++ b/src/mds/StrayManager.h
@@ -19,15 +19,30 @@
#include <list>
#include "Mutation.h"
#include "PurgeQueue.h"
+#include "MDSMetaRequest.h"
+#include "CDentry.h"
class MDSRank;
class CInode;
-class CDentry;
class StrayManager
{
// My public interface is for consumption by MDCache
public:
+ struct StrayEvalRequest : public MDSMetaRequest {
+ CDentry *dentry;
+ public:
+ explicit StrayEvalRequest(int o, ceph_tid_t t, CDentry *d) :
+ MDSMetaRequest(o, t), dentry(d) {
+ dentry->get(CDentry::PIN_PURGING);
+ dentry->reintegration_reqid = t;
+ }
+ ~StrayEvalRequest() {
+ dentry->reintegration_reqid = 0;
+ dentry->put(CDentry::PIN_PURGING);
+ }
+ };
+
explicit StrayManager(MDSRank *mds, PurgeQueue &purge_queue_);
void set_logger(PerfCounters *l) {logger = l;}
void activate();
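
Taken together with the Server.cc and StrayManager.cc hunks: StrayEvalRequest ties the lifetime of the internal rename to the stray dentry, pinning it and recording the tid in reintegration_reqid on construction, and undoing both in the destructor, which runs when handle_client_reply() erases the entry from internal_client_requests. That is what makes the 'already being reintegrated/migrated' early returns work. A minimal RAII sketch of the same pattern, with placeholder types instead of CDentry and MDSMetaRequest:

  #include <cstdint>
  #include <iostream>
  #include <map>
  #include <memory>

  struct Dentry {
    int pins = 0;
    uint64_t reintegration_reqid = 0;   // non-zero means a request is in flight
  };

  struct StrayEvalRequest {
    Dentry* dn;
    StrayEvalRequest(uint64_t tid, Dentry* d) : dn(d) {
      ++dn->pins;                       // keep the dentry alive while in flight
      dn->reintegration_reqid = tid;
    }
    ~StrayEvalRequest() {
      dn->reintegration_reqid = 0;      // allow a new attempt later
      --dn->pins;
    }
  };

  std::map<uint64_t, std::unique_ptr<StrayEvalRequest>> internal_requests;

  void reintegrate(Dentry* dn, uint64_t tid) {
    if (dn->reintegration_reqid)        // already in flight: skip
      return;
    internal_requests.emplace(tid, std::make_unique<StrayEvalRequest>(tid, dn));
    // ...send the internal CEPH_MDS_OP_RENAME to the peer here...
  }

  void on_reply(uint64_t tid) {
    internal_requests.erase(tid);       // destructor unpins and clears the marker
  }

  int main() {
    Dentry dn;
    reintegrate(&dn, 1);
    reintegrate(&dn, 2);                // ignored, first attempt still pending
    on_reply(1);
    std::cout << "pins=" << dn.pins << " reqid=" << dn.reintegration_reqid << "\n";
  }
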
diff --git a/src/mds/cephfs_features.cc b/src/mds/cephfs_features.cc
index 4a864076b..a0336c8ba 100644
--- a/src/mds/cephfs_features.cc
+++ b/src/mds/cephfs_features.cc
@@ -30,6 +30,7 @@ static const std::array feature_names
"32bits_retry_fwd",
"new_snaprealm_info",
"has_owner_uidgid",
+ "client_mds_auth_caps",
};
static_assert(feature_names.size() == CEPHFS_FEATURE_MAX + 1);
diff --git a/src/mds/cephfs_features.h b/src/mds/cephfs_features.h
index 7d215e2a3..3a67e96db 100644
--- a/src/mds/cephfs_features.h
+++ b/src/mds/cephfs_features.h
@@ -48,7 +48,8 @@ namespace ceph {
#define CEPHFS_FEATURE_32BITS_RETRY_FWD 18
#define CEPHFS_FEATURE_NEW_SNAPREALM_INFO 19
#define CEPHFS_FEATURE_HAS_OWNER_UIDGID 20
-#define CEPHFS_FEATURE_MAX 20
+#define CEPHFS_FEATURE_MDS_AUTH_CAPS_CHECK 21
+#define CEPHFS_FEATURE_MAX 21
#define CEPHFS_FEATURES_ALL { \
0, 1, 2, 3, 4, \
@@ -70,6 +71,7 @@ namespace ceph {
CEPHFS_FEATURE_32BITS_RETRY_FWD, \
CEPHFS_FEATURE_NEW_SNAPREALM_INFO, \
CEPHFS_FEATURE_HAS_OWNER_UIDGID, \
+ CEPHFS_FEATURE_MDS_AUTH_CAPS_CHECK \
}
#define CEPHFS_METRIC_FEATURES_ALL { \
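
The new CEPHFS_FEATURE_MDS_AUTH_CAPS_CHECK bit (21) is what the Server.cc hunks test before letting a session whose MDS caps use root_squash open or reconnect: the client advertises its feature bits, and the MDS rejects the session if the bit is absent, since such a client cannot enforce the squash. A tiny sketch of that kind of bit test, with std::bitset standing in for feature_bitset_t:

  #include <bitset>
  #include <iostream>

  constexpr int CEPHFS_FEATURE_MDS_AUTH_CAPS_CHECK = 21;

  int main() {
    std::bitset<64> client_features;
    client_features.set(CEPHFS_FEATURE_MDS_AUTH_CAPS_CHECK);   // a new enough client

    bool root_squash_in_caps = true;   // this session's MDS auth caps use root_squash
    if (root_squash_in_caps &&
        !client_features.test(CEPHFS_FEATURE_MDS_AUTH_CAPS_CHECK)) {
      std::cout << "reject session: client cannot enforce root_squash\n";
    } else {
      std::cout << "session allowed\n";
    }
  }
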
diff --git a/src/mds/locks.c b/src/mds/locks.c
index dbe3ab8eb..f6ff8b982 100644
--- a/src/mds/locks.c
+++ b/src/mds/locks.c
@@ -117,7 +117,7 @@ const struct sm_state_t filelock[LOCK_MAX] = {
[LOCK_XSYN_EXCL] = { LOCK_EXCL, true, LOCK_LOCK, AUTH, 0, XCL, 0, 0, 0, 0, 0,CEPH_CAP_GCACHE|CEPH_CAP_GBUFFER,0,0 },
[LOCK_XSYN] = { 0, true, LOCK_LOCK, AUTH, AUTH,AUTH,XCL, 0, 0, 0, 0,CEPH_CAP_GCACHE|CEPH_CAP_GBUFFER,0,0 },
- [LOCK_EXCL_XSYN] = { LOCK_XSYN, false, LOCK_LOCK, 0, 0, XCL, 0, 0, 0, 0, 0,CEPH_CAP_GCACHE|CEPH_CAP_GBUFFER,0,0 },
+ [LOCK_EXCL_XSYN] = { LOCK_XSYN, true, LOCK_LOCK, 0, 0, XCL, 0, 0, 0, 0, 0,CEPH_CAP_GCACHE|CEPH_CAP_GBUFFER,0,0 },
[LOCK_PRE_SCAN] = { LOCK_SCAN, false, LOCK_LOCK, 0, 0, 0, 0, 0, 0, 0, 0,0,0,0 },
[LOCK_SCAN] = { LOCK_LOCK, false, LOCK_LOCK, 0, 0, 0, 0, 0, 0, 0, 0,0,0,0 },