| author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-23 16:45:17 +0000 |
|---|---|---|
| committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-23 16:45:44 +0000 |
| commit | 17d6a993fc17d533460c5f40f3908c708e057c18 (patch) | |
| tree | 1a3bd93e0ecd74fa02f93a528fe2f87e5314c4b5 /src/mds | |
| parent | Releasing progress-linux version 18.2.2-0progress7.99u1. (diff) | |
Merging upstream version 18.2.3.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/mds')
40 files changed, 949 insertions, 228 deletions
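One of the behavioral changes in this merge is the new Beacon health check that defers client eviction while OSDs are laggy, gated by the `defer_client_eviction_on_laggy_osds` option; it appears in the first `Beacon.cc` hunk below. The following is a minimal, self-contained sketch of that check: the `HealthMetric` struct and the `report_laggy_clients()` helper are simplified stand-ins invented for illustration, not the actual MDS interfaces.

```cpp
// Sketch of the laggy-client health check added in Beacon::notify_health()
// (see the Beacon.cc hunk below). All types here are simplified stand-ins.
#include <cstdint>
#include <iostream>
#include <set>
#include <sstream>
#include <string>
#include <vector>

struct HealthMetric {
  std::string type;     // e.g. "MDS_HEALTH_CLIENTS_LAGGY"
  std::string message;  // human-readable HEALTH_WARN text
};

// If eviction deferral is enabled and at least one OSD is laggy, emit one
// warning metric per laggy client instead of evicting the client.
std::vector<HealthMetric> report_laggy_clients(bool defer_eviction_conf,
                                               bool any_osd_laggy,
                                               const std::set<uint64_t>& laggy_clients) {
  std::vector<HealthMetric> metrics;
  const bool defer = defer_eviction_conf && any_osd_laggy;
  if (!defer || laggy_clients.empty())
    return metrics;
  for (uint64_t client : laggy_clients) {
    std::ostringstream oss;
    oss << "Client " << client << " is laggy; not evicted"
        << " because some OSD(s) is/are laggy";
    metrics.push_back({"MDS_HEALTH_CLIENTS_LAGGY", oss.str()});
  }
  return metrics;
}

int main() {
  for (const auto& m : report_laggy_clients(true, true, {4242, 4243}))
    std::cout << m.type << ": " << m.message << "\n";
  return 0;
}
```

In the actual hunk these inputs come from `g_conf().get_val<bool>("defer_client_eviction_on_laggy_osds")`, an `mds->objecter->with_osdmap()` check of `OSDMap::any_osd_laggy()`, and `mds->server->get_laggy_clients()`, and each message is appended to `health.metrics` as an `MDSHealthMetric` with `HEALTH_WARN`.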
diff --git a/src/mds/Beacon.cc b/src/mds/Beacon.cc index 35f0f7942..5dd319a14 100644 --- a/src/mds/Beacon.cc +++ b/src/mds/Beacon.cc @@ -495,6 +495,28 @@ void Beacon::notify_health(MDSRank const *mds) MDSHealthMetric m(MDS_HEALTH_CACHE_OVERSIZED, HEALTH_WARN, css->strv()); health.metrics.push_back(m); } + + // Report laggy client(s) due to laggy OSDs + { + bool defer_client_eviction = + g_conf().get_val<bool>("defer_client_eviction_on_laggy_osds") + && mds->objecter->with_osdmap([](const OSDMap &map) { + return map.any_osd_laggy(); }); + auto&& laggy_clients = mds->server->get_laggy_clients(); + if (defer_client_eviction && !laggy_clients.empty()) { + std::vector<MDSHealthMetric> laggy_clients_metrics; + for (const auto& laggy_client: laggy_clients) { + CachedStackStringStream css; + *css << "Client " << laggy_client << " is laggy; not evicted" + << " because some OSD(s) is/are laggy"; + MDSHealthMetric m(MDS_HEALTH_CLIENTS_LAGGY, HEALTH_WARN, css->strv()); + laggy_clients_metrics.emplace_back(std::move(m)); + } + auto&& m = laggy_clients_metrics; + health.metrics.insert(std::end(health.metrics), std::cbegin(m), + std::cend(m)); + } + } } MDSMap::DaemonState Beacon::get_want_state() const diff --git a/src/mds/CDentry.cc b/src/mds/CDentry.cc index b6d169b9e..6722f0f2a 100644 --- a/src/mds/CDentry.cc +++ b/src/mds/CDentry.cc @@ -702,7 +702,7 @@ bool CDentry::check_corruption(bool load) { auto&& snapclient = dir->mdcache->mds->snapclient; auto next_snap = snapclient->get_last_seq()+1; - if (first > last || (snapclient->is_server_ready() && first > next_snap)) { + if (first > last || (snapclient->is_synced() && first > next_snap)) { if (load) { dout(1) << "loaded already corrupt dentry: " << *this << dendl; corrupt_first_loaded = true; diff --git a/src/mds/CDentry.h b/src/mds/CDentry.h index 4cbf24f0c..a0aa02ab3 100644 --- a/src/mds/CDentry.h +++ b/src/mds/CDentry.h @@ -377,6 +377,8 @@ public: mempool::mds_co::map<client_t,ClientLease*> client_lease_map; std::map<int, std::unique_ptr<BatchOp>> batch_ops; + ceph_tid_t reintegration_reqid = 0; + protected: friend class Migrator; diff --git a/src/mds/CDir.cc b/src/mds/CDir.cc index 0484c38cc..a8aaf11c0 100644 --- a/src/mds/CDir.cc +++ b/src/mds/CDir.cc @@ -3752,6 +3752,7 @@ bool CDir::scrub_local() mdcache->repair_dirfrag_stats(this); scrub_infop->header->set_repaired(); good = true; + mdcache->mds->damage_table.remove_dentry_damage_entry(this); } return good; } diff --git a/src/mds/CInode.cc b/src/mds/CInode.cc index 23cb087c8..71b6081be 100644 --- a/src/mds/CInode.cc +++ b/src/mds/CInode.cc @@ -965,11 +965,25 @@ CInode *CInode::get_parent_inode() return NULL; } -bool CInode::is_ancestor_of(const CInode *other) const +bool CInode::is_ancestor_of(const CInode *other, std::unordered_map<CInode const*,bool>* visited) const { + std::vector<CInode const*> my_visited = {}; while (other) { - if (other == this) + if (visited && other->is_dir()) { + if (auto it = visited->find(other); it != visited->end()) { + for (auto& in : my_visited) { + (*visited)[in] = it->second; + } + return it->second; + } + my_visited.push_back(other); /* N.B.: this being non-empty means visited is assumed non-null */ + } + if (other == this) { + for (auto& in : my_visited) { + (*visited)[in] = true; + } return true; + } const CDentry *pdn = other->get_oldest_parent_dn(); if (!pdn) { ceph_assert(other->is_base()); @@ -977,6 +991,9 @@ bool CInode::is_ancestor_of(const CInode *other) const } other = pdn->get_dir()->get_inode(); } + for (auto& in : my_visited) { + (*visited)[in] = 
false; + } return false; } @@ -3457,7 +3474,7 @@ void CInode::remove_client_cap(client_t client) void CInode::move_to_realm(SnapRealm *realm) { - dout(10) << __func__ << " joining realm " << *realm + dout(20) << __func__ << " joining realm " << *realm << ", leaving realm " << *containing_realm << dendl; for (auto& p : client_caps) { containing_realm->remove_cap(p.first, &p.second); @@ -4788,6 +4805,7 @@ next: false); // Flag that we repaired this BT so that it won't go into damagetable results->backtrace.repaired = true; + in->mdcache->mds->damage_table.remove_backtrace_damage_entry(in->ino()); if (in->mdcache->mds->logger) in->mdcache->mds->logger->inc(l_mds_scrub_backtrace_repaired); } @@ -4926,6 +4944,9 @@ next: << "freshly-calculated rstats don't match existing ones (will be fixed)"; in->mdcache->repair_inode_stats(in); results->raw_stats.repaired = true; + for (const auto &p : in->dirfrags){ + in->mdcache->mds->damage_table.remove_dirfrag_damage_entry(p.second); + } } else { results->raw_stats.error_str << "freshly-calculated rstats don't match existing ones"; @@ -5165,6 +5186,11 @@ void CInode::dump(Formatter *f, int flags) const } f->close_section(); } + + auto realm = find_snaprealm(); + inodeno_t subvol_ino = realm->get_subvolume_ino(); + bool is_subvol = (subvol_ino && subvol_ino == ino()); + f->dump_bool("is_subvolume", is_subvol); } /****** Scrub Stuff *****/ diff --git a/src/mds/CInode.h b/src/mds/CInode.h index 979b45174..6f965bffa 100644 --- a/src/mds/CInode.h +++ b/src/mds/CInode.h @@ -712,7 +712,7 @@ class CInode : public MDSCacheObject, public InodeStoreBase, public Counter<CIno } // -- misc -- - bool is_ancestor_of(const CInode *other) const; + bool is_ancestor_of(const CInode *other, std::unordered_map<CInode const*,bool>* visited=nullptr) const; bool is_projected_ancestor_of(const CInode *other) const; void make_path_string(std::string& s, bool projected=false, const CDentry *use_parent=NULL) const; diff --git a/src/mds/CMakeLists.txt b/src/mds/CMakeLists.txt index a12898f38..88c8a1db0 100644 --- a/src/mds/CMakeLists.txt +++ b/src/mds/CMakeLists.txt @@ -34,7 +34,6 @@ set(mds_srcs snap.cc SessionMap.cc MDSContext.cc - MDSAuthCaps.cc MDLog.cc MDSCacheObject.cc Mantle.cc diff --git a/src/mds/Capability.h b/src/mds/Capability.h index 3fd6d2ce6..ebc626a22 100644 --- a/src/mds/Capability.h +++ b/src/mds/Capability.h @@ -381,7 +381,7 @@ private: ceph_seq_t mseq = 0; int suppress = 0; - unsigned state = 0; + uint32_t state = 0; int lock_cache_allowed = 0; }; diff --git a/src/mds/DamageTable.cc b/src/mds/DamageTable.cc index 22802079d..2079d2333 100644 --- a/src/mds/DamageTable.cc +++ b/src/mds/DamageTable.cc @@ -15,6 +15,7 @@ #include "common/debug.h" #include "mds/CDir.h" +#include "mds/CInode.h" #include "DamageTable.h" @@ -200,6 +201,33 @@ bool DamageTable::notify_remote_damaged(inodeno_t ino, std::string_view path) return false; } +void DamageTable::remove_dentry_damage_entry(CDir *dir) +{ + if (dentries.count( + DirFragIdent(dir->inode->ino(), dir->frag) + ) > 0){ + const auto frag_dentries = + dentries.at(DirFragIdent(dir->inode->ino(), dir->frag)); + for(const auto &i : frag_dentries) { + erase(i.second->id); + } + } +} + +void DamageTable::remove_dirfrag_damage_entry(CDir *dir) +{ + if (is_dirfrag_damaged(dir)){ + erase(dirfrags.find(DirFragIdent(dir->inode->ino(), dir->frag))->second->id); + } +} + +void DamageTable::remove_backtrace_damage_entry(inodeno_t ino) +{ + if (is_remote_damaged(ino)){ + erase(remotes.find(ino)->second->id); + } +} + bool 
DamageTable::oversized() const { return by_id.size() > (size_t)(g_conf()->mds_damage_table_max_entries); diff --git a/src/mds/DamageTable.h b/src/mds/DamageTable.h index 18a61e08b..a1b96fe22 100644 --- a/src/mds/DamageTable.h +++ b/src/mds/DamageTable.h @@ -22,6 +22,7 @@ #include "include/random.h" class CDir; +class CInode; typedef uint64_t damage_entry_id_t; @@ -155,6 +156,12 @@ class DamageTable */ bool notify_remote_damaged(inodeno_t ino, std::string_view path); + void remove_dentry_damage_entry(CDir *dir); + + void remove_dirfrag_damage_entry(CDir *dir); + + void remove_backtrace_damage_entry(inodeno_t ino); + bool is_dentry_damaged( const CDir *dir_frag, std::string_view dname, diff --git a/src/mds/FSMap.cc b/src/mds/FSMap.cc index b9ae05ac0..e1c98be1b 100644 --- a/src/mds/FSMap.cc +++ b/src/mds/FSMap.cc @@ -792,7 +792,8 @@ const MDSMap::mds_info_t* FSMap::get_available_standby(const Filesystem& fs) con break; } else if (info.join_fscid == FS_CLUSTER_ID_NONE) { who = &info; /* vanilla standby */ - } else if (who == nullptr) { + } else if (who == nullptr && + !fs.mds_map.test_flag(CEPH_MDSMAP_REFUSE_STANDBY_FOR_ANOTHER_FS)) { who = &info; /* standby for another fs, last resort */ } } diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc index 5d7ec56f2..0b1f64099 100644 --- a/src/mds/Locker.cc +++ b/src/mds/Locker.cc @@ -1237,6 +1237,19 @@ void Locker::eval_gather(SimpleLock *lock, bool first, bool *pneed_issue, MDSCon send_lock_message(lock, LOCK_AC_SYNC, softdata); } break; + case LOCK_XLOCKSNAP: + if (lock->get_sm() == &sm_filelock) { + int pending = lock->gcaps_allowed(CAP_ANY) || + lock->gcaps_allowed(CAP_LONER) || + lock->gcaps_allowed(CAP_XLOCKER); + int revoke = ~pending & (loner_issued | other_issued | xlocker_issued); + + // wait for 'Fb' to be revoked + if (revoke & CEPH_CAP_GBUFFER) { + return; + } + } + break; } } @@ -3567,6 +3580,36 @@ void Locker::kick_cap_releases(MDRequestRef& mdr) } } +__u32 Locker::get_xattr_total_length(CInode::mempool_xattr_map &xattr) +{ + __u32 total = 0; + + for (const auto &p : xattr) + total += (p.first.length() + p.second.length()); + return total; +} + +void Locker::decode_new_xattrs(CInode::mempool_inode *inode, + CInode::mempool_xattr_map *px, + const cref_t<MClientCaps> &m) +{ + CInode::mempool_xattr_map tmp; + + auto p = m->xattrbl.cbegin(); + decode_noshare(tmp, p); + __u32 total = get_xattr_total_length(tmp); + inode->xattr_version = m->head.xattr_version; + if (total > mds->mdsmap->get_max_xattr_size()) { + dout(1) << "Maximum xattr size exceeded: " << total + << " max size: " << mds->mdsmap->get_max_xattr_size() << dendl; + // Ignore new xattr (!!!) but increase xattr version + // XXX how to force the client to drop cached xattrs? 
+ inode->xattr_version++; + } else { + *px = std::move(tmp); + } +} + /** * m and ack might be NULL, so don't dereference them unless dirty != 0 */ @@ -3637,10 +3680,8 @@ void Locker::_do_snap_update(CInode *in, snapid_t snap, int dirty, snapid_t foll // xattr if (xattrs) { dout(7) << " xattrs v" << i->xattr_version << " -> " << m->head.xattr_version - << " len " << m->xattrbl.length() << dendl; - i->xattr_version = m->head.xattr_version; - auto p = m->xattrbl.cbegin(); - decode(*px, p); + << " len " << m->xattrbl.length() << dendl; + decode_new_xattrs(i, px, m); } { @@ -3879,13 +3920,6 @@ bool Locker::_do_cap_update(CInode *in, Capability *cap, if (!dirty && !change_max) return false; - Session *session = mds->get_session(m); - if (session->check_access(in, MAY_WRITE, - m->caller_uid, m->caller_gid, NULL, 0, 0) < 0) { - dout(10) << "check_access failed, dropping cap update on " << *in << dendl; - return false; - } - // do the update. EUpdate *le = new EUpdate(mds->mdlog, "cap update"); mds->mdlog->start_entry(le); @@ -3932,9 +3966,7 @@ bool Locker::_do_cap_update(CInode *in, Capability *cap, // xattrs update? if (xattr) { dout(7) << " xattrs v" << pi.inode->xattr_version << " -> " << m->head.xattr_version << dendl; - pi.inode->xattr_version = m->head.xattr_version; - auto p = m->xattrbl.cbegin(); - decode_noshare(*pi.xattrs, p); + decode_new_xattrs(pi.inode.get(), pi.xattrs.get(), m); wrlock_force(&in->xattrlock, mut); } @@ -5769,6 +5801,7 @@ void Locker::handle_file_lock(ScatterLock *lock, const cref_t<MLock> &m) case LOCK_AC_SYNC: ceph_assert(lock->get_state() == LOCK_LOCK || lock->get_state() == LOCK_MIX || + lock->get_state() == LOCK_MIX_SYNC || lock->get_state() == LOCK_MIX_SYNC2); if (lock->get_state() == LOCK_MIX) { diff --git a/src/mds/Locker.h b/src/mds/Locker.h index 3aff8db0b..1fe678940 100644 --- a/src/mds/Locker.h +++ b/src/mds/Locker.h @@ -262,6 +262,10 @@ private: bool any_late_revoking_caps(xlist<Capability*> const &revoking, double timeout) const; uint64_t calc_new_max_size(const CInode::inode_const_ptr& pi, uint64_t size); + __u32 get_xattr_total_length(CInode::mempool_xattr_map &xattr); + void decode_new_xattrs(CInode::mempool_inode *inode, + CInode::mempool_xattr_map *px, + const cref_t<MClientCaps> &m); MDSRank *mds; MDCache *mdcache; diff --git a/src/mds/MDBalancer.cc b/src/mds/MDBalancer.cc index baa43bb43..da785179e 100644 --- a/src/mds/MDBalancer.cc +++ b/src/mds/MDBalancer.cc @@ -230,6 +230,7 @@ void MDBalancer::handle_export_pins(void) void MDBalancer::tick() { static int num_bal_times = g_conf()->mds_bal_max; + bool balance_automate = mds->mdsmap->allows_balance_automate(); auto bal_interval = g_conf().get_val<int64_t>("mds_bal_interval"); auto bal_max_until = g_conf().get_val<int64_t>("mds_bal_max_until"); time now = clock::now(); @@ -248,7 +249,8 @@ void MDBalancer::tick() // We can use duration_cast below, although the result is an int, // because the values from g_conf are also integers. // balance? 
- if (mds->get_nodeid() == 0 + if (balance_automate + && mds->get_nodeid() == 0 && mds->is_active() && bal_interval > 0 && chrono::duration_cast<chrono::seconds>(now - last_heartbeat).count() >= bal_interval @@ -565,7 +567,8 @@ double MDBalancer::try_match(balance_state_t& state, mds_rank_t ex, double& maxe void MDBalancer::queue_split(const CDir *dir, bool fast) { - dout(10) << __func__ << " enqueuing " << *dir + constexpr const auto &_func_ = __func__; + dout(10) << _func_ << " enqueuing " << *dir << " (fast=" << fast << ")" << dendl; const dirfrag_t df = dir->dirfrag(); @@ -579,6 +582,16 @@ void MDBalancer::queue_split(const CDir *dir, bool fast) return; } + if (mds->is_stopping()) { + // not a good time. This could have been (!mds->is_active()) + // or at least (mds->is_stopping() || mds->is_stopped()), but + // is_stopped() is never true because an MDS respawns as soon as it's removed from the map; + // the narrow is_stopping check is to avoid potential regressions + // due to unknown coupling with other parts of the MDS (especially multiple ranks). + dout(5) << "ignoring the " << _func_ << " callback because the MDS state is '" << ceph_mds_state_name(mds->get_state()) << "'" << dendl; + return; + } + auto mdcache = mds->mdcache; CDir *dir = mdcache->get_dirfrag(df); @@ -593,7 +606,7 @@ void MDBalancer::queue_split(const CDir *dir, bool fast) // Pass on to MDCache: note that the split might still not // happen if the checks in MDCache::can_fragment fail. - dout(10) << __func__ << " splitting " << *dir << dendl; + dout(10) << _func_ << " splitting " << *dir << dendl; int bits = g_conf()->mds_bal_split_bits; if (dir->inode->is_ephemeral_dist()) { unsigned min_frag_bits = mdcache->get_ephemeral_dist_frag_bits(); @@ -623,6 +636,7 @@ void MDBalancer::queue_split(const CDir *dir, bool fast) void MDBalancer::queue_merge(CDir *dir) { const auto frag = dir->dirfrag(); + constexpr const auto &_func_ = __func__; auto callback = [this, frag](int r) { ceph_assert(frag.frag != frag_t()); @@ -631,6 +645,16 @@ void MDBalancer::queue_merge(CDir *dir) // starting one), and this context is the only one that erases it. merge_pending.erase(frag); + if (mds->is_stopping()) { + // not a good time. This could have been (!mds->is_active()) + // or at least (mds->is_stopping() || mds->is_stopped()), but + // is_stopped() is never true because an MDS respawns as soon as it's removed from the map; + // the narrow is_stopping check is to avoid potential regressions + // due to unknown coupling with other parts of the MDS (especially multiple ranks). 
+ dout(5) << "ignoring the " << _func_ << " callback because the MDS state is '" << ceph_mds_state_name(mds->get_state()) << "'" << dendl; + return; + } + auto mdcache = mds->mdcache; CDir *dir = mdcache->get_dirfrag(frag); if (!dir) { @@ -662,7 +686,12 @@ void MDBalancer::queue_merge(CDir *dir) } bool all = true; for (auto& sib : sibs) { - if (!sib->is_auth() || !sib->should_merge()) { + auto is_auth = sib->is_auth(); + auto should_merge = sib->should_merge(); + + dout(20) << ": sib=" << *sib << ", is_auth=" << is_auth << ", should_merge=" + << should_merge << dendl; + if (!is_auth || !should_merge) { all = false; break; } diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index 2ea13155e..5480e6dcd 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -6784,6 +6784,13 @@ std::pair<bool, uint64_t> MDCache::trim_lru(uint64_t count, expiremap& expiremap << " pinned=" << lru.lru_get_num_pinned() << dendl; + dout(20) << "bottom_lru: " << bottom_lru.lru_get_size() << " items" + ", " << bottom_lru.lru_get_top() << " top" + ", " << bottom_lru.lru_get_bot() << " bot" + ", " << bottom_lru.lru_get_pintail() << " pintail" + ", " << bottom_lru.lru_get_num_pinned() << " pinned" + << dendl; + const uint64_t trim_counter_start = trim_counter.get(); bool throttled = false; while (1) { @@ -6804,20 +6811,25 @@ std::pair<bool, uint64_t> MDCache::trim_lru(uint64_t count, expiremap& expiremap } unexpirables.clear(); + dout(20) << "lru: " << lru.lru_get_size() << " items" + ", " << lru.lru_get_top() << " top" + ", " << lru.lru_get_bot() << " bot" + ", " << lru.lru_get_pintail() << " pintail" + ", " << lru.lru_get_num_pinned() << " pinned" + << dendl; + // trim dentries from the LRU until count is reached - // if mds is in standby_replay and skip trimming the inodes - while (!throttled && (cache_toofull() || count > 0 || is_standby_replay)) { + while (!throttled && (cache_toofull() || count > 0)) { throttled |= trim_counter_start+trimmed >= trim_threshold; if (throttled) break; CDentry *dn = static_cast<CDentry*>(lru.lru_expire()); if (!dn) { break; } - if (is_standby_replay && dn->get_linkage()->inode) { - // we move the inodes that need to be trimmed to the end of the lru queue. 
- // refer to MDCache::standby_trim_segment - lru.lru_insert_bot(dn); - break; + if ((is_standby_replay && dn->get_linkage()->inode && + dn->get_linkage()->inode->item_open_file.is_on_list())) { + dout(20) << "unexpirable: " << *dn << dendl; + unexpirables.push_back(dn); } else if (trim_dentry(dn, expiremap)) { unexpirables.push_back(dn); } else { @@ -7463,69 +7475,42 @@ void MDCache::try_trim_non_auth_subtree(CDir *dir) void MDCache::standby_trim_segment(LogSegment *ls) { - auto try_trim_inode = [this](CInode *in) { - if (in->get_num_ref() == 0 && - !in->item_open_file.is_on_list() && - in->parent != NULL && - in->parent->get_num_ref() == 0){ - touch_dentry_bottom(in->parent); - } - }; - - auto try_trim_dentry = [this](CDentry *dn) { - if (dn->get_num_ref() > 0) - return; - auto in = dn->get_linkage()->inode; - if(in && in->item_open_file.is_on_list()) - return; - touch_dentry_bottom(dn); - }; - ls->new_dirfrags.clear_list(); ls->open_files.clear_list(); while (!ls->dirty_dirfrags.empty()) { CDir *dir = ls->dirty_dirfrags.front(); dir->mark_clean(); - if (dir->inode) - try_trim_inode(dir->inode); } while (!ls->dirty_inodes.empty()) { CInode *in = ls->dirty_inodes.front(); in->mark_clean(); - try_trim_inode(in); } while (!ls->dirty_dentries.empty()) { CDentry *dn = ls->dirty_dentries.front(); dn->mark_clean(); - try_trim_dentry(dn); } while (!ls->dirty_parent_inodes.empty()) { CInode *in = ls->dirty_parent_inodes.front(); in->clear_dirty_parent(); - try_trim_inode(in); } while (!ls->dirty_dirfrag_dir.empty()) { CInode *in = ls->dirty_dirfrag_dir.front(); in->filelock.remove_dirty(); - try_trim_inode(in); } while (!ls->dirty_dirfrag_nest.empty()) { CInode *in = ls->dirty_dirfrag_nest.front(); in->nestlock.remove_dirty(); - try_trim_inode(in); } while (!ls->dirty_dirfrag_dirfragtree.empty()) { CInode *in = ls->dirty_dirfrag_dirfragtree.front(); in->dirfragtreelock.remove_dirty(); - try_trim_inode(in); } while (!ls->truncating_inodes.empty()) { auto it = ls->truncating_inodes.begin(); CInode *in = *it; ls->truncating_inodes.erase(it); in->put(CInode::PIN_TRUNCATING); - try_trim_inode(in); } } @@ -9897,6 +9882,12 @@ void MDCache::request_cleanup(MDRequestRef& mdr) // remove from map active_requests.erase(mdr->reqid); + // queue next replay op? + if (mdr->is_queued_for_replay() && !mdr->get_queued_next_replay_op()) { + mdr->set_queued_next_replay_op(); + mds->queue_one_replay(); + } + if (mds->logger) log_stat(); @@ -13444,6 +13435,12 @@ bool MDCache::dump_inode(Formatter *f, uint64_t number) { return true; } +void MDCache::dump_dir(Formatter *f, CDir *dir, bool dentry_dump) { + f->open_object_section("dir"); + dir->dump(f, dentry_dump ? 
CDir::DUMP_ALL : CDir::DUMP_DEFAULT); + f->close_section(); +} + void MDCache::handle_mdsmap(const MDSMap &mdsmap, const MDSMap &oldmap) { const mds_rank_t max_mds = mdsmap.get_max_mds(); @@ -13506,7 +13503,7 @@ void MDCache::upkeep_main(void) if (active_with_clients) { trim_client_leases(); } - if (is_open()) { + if (is_open() || mds->is_standby_replay()) { trim(); } if (active_with_clients) { diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h index d9f173038..f1b58c28d 100644 --- a/src/mds/MDCache.h +++ b/src/mds/MDCache.h @@ -528,6 +528,7 @@ class MDCache { void clean_open_file_lists(); void dump_openfiles(Formatter *f); bool dump_inode(Formatter *f, uint64_t number); + void dump_dir(Formatter *f, CDir *dir, bool dentry_dump=false); void rejoin_start(MDSContext *rejoin_done_); void rejoin_gather_finish(); diff --git a/src/mds/MDLog.cc b/src/mds/MDLog.cc index 82899d2da..840a43733 100644 --- a/src/mds/MDLog.cc +++ b/src/mds/MDLog.cc @@ -1364,11 +1364,10 @@ void MDLog::_replay_thread() break; } - if (!journaler->is_readable() && - journaler->get_read_pos() == journaler->get_write_pos()) + if (journaler->get_read_pos() == journaler->get_write_pos()) { + dout(10) << "_replay: read_pos == write_pos" << dendl; break; - - ceph_assert(journaler->is_readable() || mds->is_daemon_stopping()); + } // read it uint64_t pos = journaler->get_read_pos(); diff --git a/src/mds/MDSAuthCaps.cc b/src/mds/MDSAuthCaps.cc index d983f2d58..22383445e 100644 --- a/src/mds/MDSAuthCaps.cc +++ b/src/mds/MDSAuthCaps.cc @@ -33,6 +33,7 @@ using std::ostream; using std::string; using std::vector; +using std::string_view; namespace qi = boost::spirit::qi; namespace ascii = boost::spirit::ascii; namespace phoenix = boost::phoenix; @@ -53,6 +54,8 @@ struct MDSCapParser : qi::grammar<Iterator, MDSAuthCaps()> using qi::_1; using qi::_2; using qi::_3; + using qi::_4; + using qi::_5; using qi::eps; using qi::lit; @@ -65,25 +68,13 @@ struct MDSCapParser : qi::grammar<Iterator, MDSAuthCaps()> network_str %= +char_("/.:a-fA-F0-9]["); fs_name_str %= +char_("a-zA-Z0-9_.-"); - // match := [path=<path>] [uid=<uid> [gids=<gid>[,<gid>...]] - // TODO: allow fsname, and root_squash to be specified with uid, and gidlist - path %= (spaces >> lit("path") >> lit('=') >> (quoted_path | unquoted_path)); - uid %= (spaces >> lit("uid") >> lit('=') >> uint_); + path %= -(spaces >> lit("path") >> lit('=') >> (quoted_path | unquoted_path)); + uid %= -(spaces >> lit("uid") >> lit('=') >> uint_); uintlist %= (uint_ % lit(',')); gidlist %= -(spaces >> lit("gids") >> lit('=') >> uintlist); fs_name %= -(spaces >> lit("fsname") >> lit('=') >> fs_name_str); - root_squash %= (spaces >> lit("root_squash") >> attr(true)); - match = -( - (fs_name >> path >> root_squash)[_val = phoenix::construct<MDSCapMatch>(_2, _1, _3)] | - (uid >> gidlist)[_val = phoenix::construct<MDSCapMatch>(_1, _2)] | - (path >> uid >> gidlist)[_val = phoenix::construct<MDSCapMatch>(_1, _2, _3)] | - (fs_name >> path)[_val = phoenix::construct<MDSCapMatch>(_2, _1)] | - (fs_name >> root_squash)[_val = phoenix::construct<MDSCapMatch>(std::string(), _1, _2)] | - (path >> root_squash)[_val = phoenix::construct<MDSCapMatch>(_1, std::string(), _2)] | - (path)[_val = phoenix::construct<MDSCapMatch>(_1)] | - (root_squash)[_val = phoenix::construct<MDSCapMatch>(std::string(), std::string(), _1)] | - (fs_name)[_val = phoenix::construct<MDSCapMatch>(std::string(), - _1)]); + root_squash %= -(spaces >> lit("root_squash") >> attr(true)); + match = (fs_name >> path >> root_squash >> uid >> 
gidlist)[_val = phoenix::construct<MDSCapMatch>(_1, _2, _3, _4, _5)]; // capspec = * | r[w][f][p][s] capspec = spaces >> ( @@ -122,11 +113,11 @@ struct MDSCapParser : qi::grammar<Iterator, MDSAuthCaps()> qi::rule<Iterator, bool()> root_squash; qi::rule<Iterator, MDSCapSpec()> capspec; qi::rule<Iterator, uint32_t()> uid; - qi::rule<Iterator, std::vector<uint32_t>() > uintlist; - qi::rule<Iterator, std::vector<uint32_t>() > gidlist; + qi::rule<Iterator, vector<uint32_t>() > uintlist; + qi::rule<Iterator, vector<uint32_t>() > gidlist; qi::rule<Iterator, MDSCapMatch()> match; qi::rule<Iterator, MDSCapGrant()> grant; - qi::rule<Iterator, std::vector<MDSCapGrant>()> grants; + qi::rule<Iterator, vector<MDSCapGrant>()> grants; qi::rule<Iterator, MDSAuthCaps()> mdscaps; }; @@ -142,7 +133,7 @@ void MDSCapMatch::normalize_path() // drop .. } -bool MDSCapMatch::match(std::string_view target_path, +bool MDSCapMatch::match(string_view target_path, const int caller_uid, const int caller_gid, const vector<uint64_t> *caller_gid_list) const @@ -174,7 +165,7 @@ bool MDSCapMatch::match(std::string_view target_path, return true; } -bool MDSCapMatch::match_path(std::string_view target_path) const +bool MDSCapMatch::match_path(string_view target_path) const { if (path.length()) { if (target_path.find(path) != 0) @@ -200,7 +191,7 @@ void MDSCapGrant::parse_network() * Is the client *potentially* able to access this path? Actual * permission will depend on uids/modes in the full is_capable. */ -bool MDSAuthCaps::path_capable(std::string_view inode_path) const +bool MDSAuthCaps::path_capable(string_view inode_path) const { for (const auto &i : grants) { if (i.match.match_path(inode_path)) { @@ -218,7 +209,7 @@ bool MDSAuthCaps::path_capable(std::string_view inode_path) const * This is true if any of the 'grant' clauses in the capability match the * requested path + op. */ -bool MDSAuthCaps::is_capable(std::string_view inode_path, +bool MDSAuthCaps::is_capable(string_view inode_path, uid_t inode_uid, gid_t inode_gid, unsigned inode_mode, uid_t caller_uid, gid_t caller_gid, @@ -338,7 +329,7 @@ void MDSAuthCaps::set_allow_all() {})); } -bool MDSAuthCaps::parse(std::string_view str, ostream *err) +bool MDSAuthCaps::parse(string_view str, ostream *err) { // Special case for legacy caps if (str == "allow") { @@ -363,10 +354,15 @@ bool MDSAuthCaps::parse(std::string_view str, ostream *err) // Make sure no grants are kept after parsing failed! 
grants.clear(); - if (err) - *err << "mds capability parse failed, stopped at '" - << std::string(iter, end) - << "' of '" << str << "'"; + if (err) { + if (string(iter, end).find("allow") != string::npos) { + *err << "Permission flags in MDS caps must start with 'r' or " << + "'rw' or be '*' or 'all'"; + } else { + *err << "mds capability parse failed, stopped at '" + << string(iter, end) << "' of '" << str << "'"; + } + } return false; } } @@ -465,3 +461,9 @@ ostream &operator<<(ostream &out, const MDSAuthCaps &cap) return out; } +ostream &operator<<(ostream &out, const MDSCapAuth &auth) +{ + out << "MDSCapAuth(" << auth.match << "readable=" + << auth.readable << ", writeable=" << auth.writeable << ")"; + return out; +} diff --git a/src/mds/MDSAuthCaps.h b/src/mds/MDSAuthCaps.h index 5fcbb1f2f..bbb2589b3 100644 --- a/src/mds/MDSAuthCaps.h +++ b/src/mds/MDSAuthCaps.h @@ -19,6 +19,7 @@ #include <string_view> #include <vector> +#include "include/encoding.h" #include "include/common_fwd.h" #include "include/types.h" #include "common/debug.h" @@ -101,35 +102,31 @@ private: struct MDSCapMatch { static const int64_t MDS_AUTH_UID_ANY = -1; - MDSCapMatch() : uid(MDS_AUTH_UID_ANY), fs_name(std::string()) {} + MDSCapMatch() {} - MDSCapMatch(int64_t uid_, std::vector<gid_t>& gids_) : - uid(uid_), gids(gids_), fs_name(std::string()) {} + MDSCapMatch(const std::string& fsname_, const std::string& path_, + bool root_squash_, int64_t uid_=MDS_AUTH_UID_ANY, + const std::vector<gid_t>& gids_={}) { + fs_name = std::move(fsname_); + path = std::move(path_); + root_squash = root_squash_; + uid = (uid_ == 0) ? -1 : uid_; + gids = gids_; - explicit MDSCapMatch(const std::string &path_) - : uid(MDS_AUTH_UID_ANY), path(path_), fs_name(std::string()) { normalize_path(); } - explicit MDSCapMatch(std::string path, std::string fs_name) : - uid(MDS_AUTH_UID_ANY), path(std::move(path)), fs_name(std::move(fs_name)) - { - normalize_path(); - } - - explicit MDSCapMatch(std::string path, std::string fs_name, bool root_squash_) : - uid(MDS_AUTH_UID_ANY), path(std::move(path)), fs_name(std::move(fs_name)), root_squash(root_squash_) - { - normalize_path(); - } - - MDSCapMatch(const std::string& path_, int64_t uid_, std::vector<gid_t>& gids_) - : uid(uid_), gids(gids_), path(path_), fs_name(std::string()) { - normalize_path(); + const MDSCapMatch& operator=(const MDSCapMatch& m) { + uid = m.uid; + gids = m.gids; + path = m.path; + fs_name = m.fs_name; + root_squash = m.root_squash; + return *this; } void normalize_path(); - + bool is_match_all() const { return uid == MDS_AUTH_UID_ANY && path == ""; @@ -149,12 +146,68 @@ struct MDSCapMatch { */ bool match_path(std::string_view target_path) const; - int64_t uid; // Require UID to be equal to this, if !=MDS_AUTH_UID_ANY + void encode(ceph::buffer::list& bl) const { + ENCODE_START(1, 1, bl); + encode(uid, bl); + encode(gids, bl); + encode(path, bl); + encode(fs_name, bl); + encode(root_squash, bl); + ENCODE_FINISH(bl); + } + + void decode(ceph::buffer::list::const_iterator& p) { + DECODE_START(1, p); + decode(uid, p); + decode(gids, p); + decode(path, p); + decode(fs_name, p); + decode(root_squash, p); + DECODE_FINISH(p); + } + + // Require UID to be equal to this, if !=MDS_AUTH_UID_ANY + int64_t uid = MDS_AUTH_UID_ANY; std::vector<gid_t> gids; // Use these GIDs std::string path; // Require path to be child of this (may be "" or "/" for any) std::string fs_name; bool root_squash=false; }; +WRITE_CLASS_ENCODER(MDSCapMatch) + +struct MDSCapAuth { + MDSCapAuth() {} + 
MDSCapAuth(MDSCapMatch m, bool r, bool w) : + match(m), readable(r), writeable(w) {} + + const MDSCapAuth& operator=(const MDSCapAuth& m) { + match = m.match; + readable = m.readable; + writeable = m.writeable; + return *this; + } + + void encode(ceph::buffer::list& bl) const { + ENCODE_START(1, 1, bl); + encode(match, bl); + encode(readable, bl); + encode(writeable, bl); + ENCODE_FINISH(bl); + } + + void decode(ceph::buffer::list::const_iterator& p) { + DECODE_START(1, p); + decode(match, p); + decode(readable, p); + decode(writeable, p); + DECODE_FINISH(p); + } + + MDSCapMatch match; + bool readable; + bool writeable; +}; +WRITE_CLASS_ENCODER(MDSCapAuth) struct MDSCapGrant { MDSCapGrant(const MDSCapSpec &spec_, const MDSCapMatch &match_, @@ -223,12 +276,31 @@ public: return false; } + void get_cap_auths(std::vector<MDSCapAuth> *cap_auths) + { + for (const auto& grant : grants) { + cap_auths->emplace_back(MDSCapAuth(grant.match, + grant.spec.allow_read(), + grant.spec.allow_write())); + } + } + + bool root_squash_in_caps() const { + for (const MDSCapGrant &g : grants) { + if (g.match.root_squash) { + return true; + } + } + return false; + } + friend std::ostream &operator<<(std::ostream &out, const MDSAuthCaps &cap); private: std::vector<MDSCapGrant> grants; }; std::ostream &operator<<(std::ostream &out, const MDSCapMatch &match); +std::ostream &operator<<(std::ostream &out, const MDSCapAuth &auth); std::ostream &operator<<(std::ostream &out, const MDSCapSpec &spec); std::ostream &operator<<(std::ostream &out, const MDSCapGrant &grant); std::ostream &operator<<(std::ostream &out, const MDSAuthCaps &cap); diff --git a/src/mds/MDSDaemon.cc b/src/mds/MDSDaemon.cc index d45acce06..e97fd2cf8 100644 --- a/src/mds/MDSDaemon.cc +++ b/src/mds/MDSDaemon.cc @@ -445,6 +445,12 @@ void MDSDaemon::set_up_admin_socket() asok_hook, "dump inode by inode number"); ceph_assert(r == 0); + r = admin_socket->register_command("dump dir " + "name=path,type=CephString,req=true " + "name=dentry_dump,type=CephBool,req=false", + asok_hook, + "dump directory by path"); + ceph_assert(r == 0); r = admin_socket->register_command("exit", asok_hook, "Terminate this MDS"); diff --git a/src/mds/MDSMap.cc b/src/mds/MDSMap.cc index 6433d08c5..aed591d95 100644 --- a/src/mds/MDSMap.cc +++ b/src/mds/MDSMap.cc @@ -177,6 +177,7 @@ void MDSMap::dump(Formatter *f) const cephfs_dump_features(f, required_client_features); f->close_section(); f->dump_int("max_file_size", max_file_size); + f->dump_int("max_xattr_size", max_xattr_size); f->dump_int("last_failure", last_failure); f->dump_int("last_failure_osd_epoch", last_failure_osd_epoch); f->open_object_section("compat"); @@ -235,6 +236,8 @@ void MDSMap::dump_flags_state(Formatter *f) const f->dump_bool(flag_display.at(CEPH_MDSMAP_ALLOW_MULTIMDS_SNAPS), allows_multimds_snaps()); f->dump_bool(flag_display.at(CEPH_MDSMAP_ALLOW_STANDBY_REPLAY), allows_standby_replay()); f->dump_bool(flag_display.at(CEPH_MDSMAP_REFUSE_CLIENT_SESSION), test_flag(CEPH_MDSMAP_REFUSE_CLIENT_SESSION)); + f->dump_bool(flag_display.at(CEPH_MDSMAP_REFUSE_STANDBY_FOR_ANOTHER_FS), test_flag(CEPH_MDSMAP_REFUSE_STANDBY_FOR_ANOTHER_FS)); + f->dump_bool(flag_display.at(CEPH_MDSMAP_BALANCE_AUTOMATE), test_flag(CEPH_MDSMAP_BALANCE_AUTOMATE)); f->close_section(); } @@ -268,6 +271,7 @@ void MDSMap::print(ostream& out) const out << "session_timeout\t" << session_timeout << "\n" << "session_autoclose\t" << session_autoclose << "\n"; out << "max_file_size\t" << max_file_size << "\n"; + out << "max_xattr_size\t" << 
max_xattr_size << "\n"; out << "required_client_features\t" << cephfs_stringify_features(required_client_features) << "\n"; out << "last_failure\t" << last_failure << "\n" << "last_failure_osd_epoch\t" << last_failure_osd_epoch << "\n"; @@ -376,6 +380,10 @@ void MDSMap::print_flags(std::ostream& out) const { out << " " << flag_display.at(CEPH_MDSMAP_ALLOW_STANDBY_REPLAY); if (test_flag(CEPH_MDSMAP_REFUSE_CLIENT_SESSION)) out << " " << flag_display.at(CEPH_MDSMAP_REFUSE_CLIENT_SESSION); + if (test_flag(CEPH_MDSMAP_REFUSE_STANDBY_FOR_ANOTHER_FS)) + out << " " << flag_display.at(CEPH_MDSMAP_REFUSE_STANDBY_FOR_ANOTHER_FS); + if (test_flag(CEPH_MDSMAP_BALANCE_AUTOMATE)) + out << " " << flag_display.at(CEPH_MDSMAP_BALANCE_AUTOMATE); } void MDSMap::get_health(list<pair<health_status_t,string> >& summary, @@ -763,7 +771,7 @@ void MDSMap::encode(bufferlist& bl, uint64_t features) const encode(data_pools, bl); encode(cas_pool, bl); - __u16 ev = 17; + __u16 ev = 18; encode(ev, bl); encode(compat, bl); encode(metadata_pool, bl); @@ -791,6 +799,7 @@ void MDSMap::encode(bufferlist& bl, uint64_t features) const } encode(required_client_features, bl); encode(bal_rank_mask, bl); + encode(max_xattr_size, bl); ENCODE_FINISH(bl); } @@ -842,7 +851,8 @@ void MDSMap::decode(bufferlist::const_iterator& p) decode(cas_pool, p); } - // kclient ignores everything from here + // kclient skips most of what's below + // see fs/ceph/mdsmap.c for current decoding __u16 ev = 1; if (struct_v >= 2) decode(ev, p); @@ -942,6 +952,10 @@ void MDSMap::decode(bufferlist::const_iterator& p) decode(bal_rank_mask, p); } + if (ev >= 18) { + decode(max_xattr_size, p); + } + /* All MDS since at least v14.0.0 understand INLINE */ /* TODO: remove after R is released */ compat.incompat.insert(MDS_FEATURE_INCOMPAT_INLINE); diff --git a/src/mds/MDSMap.h b/src/mds/MDSMap.h index 75c44e27c..c61fc2ce1 100644 --- a/src/mds/MDSMap.h +++ b/src/mds/MDSMap.h @@ -50,6 +50,12 @@ static inline const auto MDS_FEATURE_INCOMPAT_SNAPREALM_V2 = CompatSet::Feature( #define MDS_FS_NAME_DEFAULT "cephfs" +/* + * Maximum size of xattrs the MDS can handle per inode by default. This + * includes the attribute name and 4+4 bytes for the key/value sizes. 
+ */ +#define MDS_MAX_XATTR_SIZE (1<<16) /* 64K */ + class health_check_map_t; class MDSMap { @@ -196,6 +202,9 @@ public: uint64_t get_max_filesize() const { return max_file_size; } void set_max_filesize(uint64_t m) { max_file_size = m; } + uint64_t get_max_xattr_size() const { return max_xattr_size; } + void set_max_xattr_size(uint64_t m) { max_xattr_size = m; } + void set_min_compat_client(ceph_release_t version); void add_required_client_feature(size_t bit) { @@ -234,6 +243,15 @@ public: bool allows_standby_replay() const { return test_flag(CEPH_MDSMAP_ALLOW_STANDBY_REPLAY); } bool was_standby_replay_ever_allowed() const { return ever_allowed_features & CEPH_MDSMAP_ALLOW_STANDBY_REPLAY; } + void set_balance_automate() { + set_flag(CEPH_MDSMAP_BALANCE_AUTOMATE); + ever_allowed_features |= CEPH_MDSMAP_BALANCE_AUTOMATE; + explicitly_allowed_features |= CEPH_MDSMAP_BALANCE_AUTOMATE; + } + void clear_balance_automate() { clear_flag(CEPH_MDSMAP_BALANCE_AUTOMATE); } + bool allows_balance_automate() const { return test_flag(CEPH_MDSMAP_BALANCE_AUTOMATE); } + bool was_balance_automate_ever_allowed() const { return ever_allowed_features & CEPH_MDSMAP_BALANCE_AUTOMATE; } + void set_multimds_snaps_allowed() { set_flag(CEPH_MDSMAP_ALLOW_MULTIMDS_SNAPS); ever_allowed_features |= CEPH_MDSMAP_ALLOW_MULTIMDS_SNAPS; @@ -620,6 +638,8 @@ protected: __u32 session_autoclose = 300; uint64_t max_file_size = 1ULL<<40; /* 1TB */ + uint64_t max_xattr_size = MDS_MAX_XATTR_SIZE; + feature_bitset_t required_client_features; std::vector<int64_t> data_pools; // file data pools available to clients (via an ioctl). first is the default. @@ -664,7 +684,9 @@ private: {CEPH_MDSMAP_ALLOW_SNAPS, "allow_snaps"}, {CEPH_MDSMAP_ALLOW_MULTIMDS_SNAPS, "allow_multimds_snaps"}, {CEPH_MDSMAP_ALLOW_STANDBY_REPLAY, "allow_standby_replay"}, - {CEPH_MDSMAP_REFUSE_CLIENT_SESSION, "refuse_client_session"} + {CEPH_MDSMAP_REFUSE_CLIENT_SESSION, "refuse_client_session"}, + {CEPH_MDSMAP_REFUSE_STANDBY_FOR_ANOTHER_FS, "refuse_standby_for_another_fs"}, + {CEPH_MDSMAP_BALANCE_AUTOMATE, "balance_automate"} }; }; WRITE_CLASS_ENCODER_FEATURES(MDSMap::mds_info_t) diff --git a/src/mds/MDSMetaRequest.h b/src/mds/MDSMetaRequest.h new file mode 100644 index 000000000..ad4720410 --- /dev/null +++ b/src/mds/MDSMetaRequest.h @@ -0,0 +1,33 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2023 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef CEPH_MDS_META_REQUEST_H +#define CEPH_MDS_META_REQUEST_H + +#include "include/types.h" + +struct MDSMetaRequest { +private: + int op; + ceph_tid_t tid; +public: + explicit MDSMetaRequest(int o, ceph_tid_t t) : + op(o), tid(t) { } + virtual ~MDSMetaRequest() { } + + int get_op() { return op; } + ceph_tid_t get_tid() { return tid; } +}; + +#endif // !CEPH_MDS_META_REQUEST_H diff --git a/src/mds/MDSRank.cc b/src/mds/MDSRank.cc index 9a80534a4..aa6a8c162 100644 --- a/src/mds/MDSRank.cc +++ b/src/mds/MDSRank.cc @@ -551,6 +551,8 @@ MDSRank::MDSRank( cct->_conf->mds_op_log_threshold); op_tracker.set_history_size_and_duration(cct->_conf->mds_op_history_size, cct->_conf->mds_op_history_duration); + op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->mds_op_history_slow_op_size, + cct->_conf->mds_op_history_slow_op_threshold); schedule_update_timer_task(); } @@ -744,6 +746,7 @@ void MDSRankDispatcher::tick() // ... if (is_clientreplay() || is_active() || is_stopping()) { + server->clear_laggy_clients(); server->find_idle_sessions(); server->evict_cap_revoke_non_responders(); locker->tick(); @@ -1185,6 +1188,7 @@ bool MDSRank::is_valid_message(const cref_t<Message> &m) { type == CEPH_MSG_CLIENT_RECONNECT || type == CEPH_MSG_CLIENT_RECLAIM || type == CEPH_MSG_CLIENT_REQUEST || + type == CEPH_MSG_CLIENT_REPLY || type == MSG_MDS_PEER_REQUEST || type == MSG_MDS_HEARTBEAT || type == MSG_MDS_TABLE_REQUEST || @@ -1238,6 +1242,7 @@ void MDSRank::handle_message(const cref_t<Message> &m) ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_CLIENT); // fall-thru case CEPH_MSG_CLIENT_REQUEST: + case CEPH_MSG_CLIENT_REPLY: server->dispatch(m); break; case MSG_MDS_PEER_REQUEST: @@ -2060,6 +2065,7 @@ bool MDSRank::queue_one_replay() if (!replay_queue.empty()) { queue_waiter(replay_queue.front()); replay_queue.pop_front(); + dout(10) << " queued next replay op" << dendl; return true; } if (!replaying_requests_done) { @@ -2067,6 +2073,7 @@ bool MDSRank::queue_one_replay() mdlog->flush(); } maybe_clientreplay_done(); + dout(10) << " journaled last replay op" << dendl; return false; } @@ -2909,6 +2916,8 @@ void MDSRankDispatcher::handle_asok_command( command_openfiles_ls(f); } else if (command == "dump inode") { command_dump_inode(f, cmdmap, *css); + } else if (command == "dump dir") { + command_dump_dir(f, cmdmap, *css); } else if (command == "damage ls") { std::lock_guard l(mds_lock); damage_table.dump(f); @@ -3349,6 +3358,42 @@ void MDSRank::command_dump_inode(Formatter *f, const cmdmap_t &cmdmap, std::ostr } } +void MDSRank::command_dump_dir(Formatter *f, const cmdmap_t &cmdmap, std::ostream &ss) +{ + std::lock_guard l(mds_lock); + std::string path; + bool got = cmd_getval(cmdmap, "path", path); + if (!got) { + ss << "missing path argument"; + return; + } + + bool dentry_dump = false; + cmd_getval(cmdmap, "dentry_dump", dentry_dump); + + CInode *in = mdcache->cache_traverse(filepath(path.c_str())); + if (!in) { + ss << "directory inode not in cache"; + return; + } + + f->open_array_section("dirs"); + frag_vec_t leaves; + in->dirfragtree.get_leaves_under(frag_t(), leaves); + for (const auto& leaf : leaves) { + CDir *dir = in->get_dirfrag(leaf); + if (dir) { + mdcache->dump_dir(f, dir, dentry_dump); + } else { + f->open_object_section("frag"); + f->dump_stream("frag") << leaf; + f->dump_string("status", "dirfrag not in cache"); + f->close_section(); + } + } + f->close_section(); +} + void MDSRank::dump_status(Formatter *f) const { f->dump_string("fs_name", std::string(mdsmap->get_fs_name())); @@ -3503,6 
+3548,9 @@ void MDSRank::create_logger() PerfCountersBuilder::PRIO_INTERESTING); mdm_plb.add_u64(l_mdm_dn, "dn", "Dentries", "dn", PerfCountersBuilder::PRIO_INTERESTING); + // mds rss metric is set to PRIO_USEFUL as it can be useful to detect mds cache oversizing + mdm_plb.add_u64(l_mdm_rss, "rss", "RSS", "rss", + PerfCountersBuilder::PRIO_USEFUL); mdm_plb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL); mdm_plb.add_u64_counter(l_mdm_inoa, "ino+", "Inodes opened"); @@ -3517,9 +3565,6 @@ void MDSRank::create_logger() mdm_plb.add_u64_counter(l_mdm_caps, "cap-", "Capabilities removed"); mdm_plb.add_u64(l_mdm_heap, "heap", "Heap size"); - mdm_plb.set_prio_default(PerfCountersBuilder::PRIO_DEBUGONLY); - mdm_plb.add_u64(l_mdm_rss, "rss", "RSS"); - mlogger = mdm_plb.create_perf_counters(); g_ceph_context->get_perfcounters_collection()->add(mlogger); } @@ -3842,6 +3887,9 @@ void MDSRankDispatcher::handle_conf_change(const ConfigProxy& conf, const std::s if (changed.count("mds_op_history_size") || changed.count("mds_op_history_duration")) { op_tracker.set_history_size_and_duration(conf->mds_op_history_size, conf->mds_op_history_duration); } + if (changed.count("mds_op_history_slow_op_size") || changed.count("mds_op_history_slow_op_threshold")) { + op_tracker.set_history_slow_op_size_and_threshold(conf->mds_op_history_slow_op_size, conf->mds_op_history_slow_op_threshold); + } if (changed.count("mds_enable_op_tracker")) { op_tracker.set_tracking(conf->mds_enable_op_tracker); } diff --git a/src/mds/MDSRank.h b/src/mds/MDSRank.h index b61fc178c..a9e8da181 100644 --- a/src/mds/MDSRank.h +++ b/src/mds/MDSRank.h @@ -43,6 +43,7 @@ #include "Server.h" #include "MetricsHandler.h" #include "osdc/Journaler.h" +#include "MDSMetaRequest.h" // Full .h import instead of forward declaration for PerfCounter, for the // benefit of those including this header and using MDSRank::logger @@ -253,6 +254,10 @@ class MDSRank { progress_thread.signal(); } + uint64_t get_global_id() const { + return monc->get_global_id(); + } + // Daemon lifetime functions: these guys break the abstraction // and call up into the parent MDSDaemon instance. 
It's kind // of unavoidable: if we want any depth into our calls @@ -423,6 +428,8 @@ class MDSRank { PerfCounters *logger = nullptr, *mlogger = nullptr; OpTracker op_tracker; + std::map<ceph_tid_t, std::unique_ptr<MDSMetaRequest>> internal_client_requests; + // The last different state I held before current MDSMap::DaemonState last_state = MDSMap::STATE_BOOT; // The state assigned to me by the MDSMap @@ -519,6 +526,7 @@ class MDSRank { void command_openfiles_ls(Formatter *f); void command_dump_tree(const cmdmap_t &cmdmap, std::ostream &ss, Formatter *f); void command_dump_inode(Formatter *f, const cmdmap_t &cmdmap, std::ostream &ss); + void command_dump_dir(Formatter *f, const cmdmap_t &cmdmap, std::ostream &ss); void command_cache_drop(uint64_t timeout, Formatter *f, Context *on_finish); // FIXME the state machine logic should be separable from the dispatch diff --git a/src/mds/MetricAggregator.cc b/src/mds/MetricAggregator.cc index 6487084fb..9765d4d5b 100644 --- a/src/mds/MetricAggregator.cc +++ b/src/mds/MetricAggregator.cc @@ -4,6 +4,9 @@ #include <boost/range/adaptor/map.hpp> #include <boost/range/algorithm/copy.hpp> +#include "common/ceph_context.h" +#include "common/perf_counters_key.h" + #include "MDSRank.h" #include "MetricAggregator.h" #include "mgr/MgrClient.h" @@ -13,8 +16,36 @@ #undef dout_prefix #define dout_prefix *_dout << "mds.metric.aggregator" << " " << __func__ +// Performance Counters + enum { + l_mds_client_metrics_start = 10000, + l_mds_client_metrics_num_clients, + l_mds_client_metrics_last + }; + +enum { + l_mds_per_client_metrics_start = 20000, + l_mds_per_client_metrics_cap_hits, + l_mds_per_client_metrics_cap_misses, + l_mds_per_client_metrics_avg_read_latency, + l_mds_per_client_metrics_avg_write_latency, + l_mds_per_client_metrics_avg_metadata_latency, + l_mds_per_client_metrics_dentry_lease_hits, + l_mds_per_client_metrics_dentry_lease_misses, + l_mds_per_client_metrics_opened_files, + l_mds_per_client_metrics_opened_inodes, + l_mds_per_client_metrics_pinned_icaps, + l_mds_per_client_metrics_total_inodes, + l_mds_per_client_metrics_total_read_ops, + l_mds_per_client_metrics_total_read_size, + l_mds_per_client_metrics_total_write_ops, + l_mds_per_client_metrics_total_write_size, + l_mds_per_client_metrics_last + }; + MetricAggregator::MetricAggregator(CephContext *cct, MDSRank *mds, MgrClient *mgrc) : Dispatcher(cct), + m_cct(cct), mds(mds), mgrc(mgrc), mds_pinger(mds) { @@ -32,6 +63,15 @@ void MetricAggregator::ping_all_active_ranks() { int MetricAggregator::init() { dout(10) << dendl; + std::string labels = ceph::perf_counters::key_create("mds_client_metrics", + {{"fs_name", mds->mdsmap->get_fs_name()}, + {"id", stringify(mds->get_global_id())}}); + PerfCountersBuilder plb(m_cct, labels, l_mds_client_metrics_start, l_mds_client_metrics_last); + plb.add_u64(l_mds_client_metrics_num_clients, + "num_clients", "Numer of client sessions", "mcli", PerfCountersBuilder::PRIO_CRITICAL); + m_perf_counters = plb.create_perf_counters(); + m_cct->get_perfcounters_collection()->add(m_perf_counters); + pinger = std::thread([this]() { std::unique_lock locker(lock); while (!stopping) { @@ -61,6 +101,24 @@ void MetricAggregator::shutdown() { std::scoped_lock locker(lock); ceph_assert(!stopping); stopping = true; + + // dealloc per-client perf counter + for (auto [crpair, pc] : client_perf_counters) { + PerfCounters *perf_counters = nullptr; + std::swap(perf_counters, pc); + if (perf_counters != nullptr) { + m_cct->get_perfcounters_collection()->remove(perf_counters); + 
delete perf_counters; + } + } + client_perf_counters.clear(); + + PerfCounters *perf_counters = nullptr; + std::swap(perf_counters, m_perf_counters); + if (perf_counters != nullptr) { + m_cct->get_perfcounters_collection()->remove(perf_counters); + delete perf_counters; + } } if (pinger.joinable()) { @@ -97,10 +155,110 @@ void MetricAggregator::refresh_metrics_for_rank(const entity_inst_t &client, << metrics << dendl; auto &p = clients_by_rank.at(rank); + auto crpair = std::make_pair(client, rank); bool ins = p.insert(client).second; if (ins) { dout(20) << ": rank=" << rank << " has " << p.size() << " connected" << " client(s)" << dendl; + if (m_perf_counters) { + m_perf_counters->inc(l_mds_client_metrics_num_clients); + } + + std::string labels = ceph::perf_counters::key_create("mds_client_metrics-" + std::string(mds->mdsmap->get_fs_name()), + {{"client", stringify(client.name)}, + {"rank", stringify(rank)}}); + PerfCountersBuilder plb(m_cct, labels, l_mds_per_client_metrics_start, l_mds_per_client_metrics_last); + plb.add_u64(l_mds_per_client_metrics_cap_hits, + "cap_hits", "Capability hits", "hcap", PerfCountersBuilder::PRIO_CRITICAL); + plb.add_u64(l_mds_per_client_metrics_cap_misses, + "cap_miss", "Capability misses", "mcap", PerfCountersBuilder::PRIO_CRITICAL); + plb.add_time(l_mds_per_client_metrics_avg_read_latency, + "avg_read_latency", "Average Read Latency", "arlt", PerfCountersBuilder::PRIO_CRITICAL); + plb.add_time(l_mds_per_client_metrics_avg_write_latency, + "avg_write_latency", "Average Write Latency", "awlt", PerfCountersBuilder::PRIO_CRITICAL); + plb.add_time(l_mds_per_client_metrics_avg_metadata_latency, + "avg_metadata_latency", "Average Metadata Latency", "amlt", PerfCountersBuilder::PRIO_CRITICAL); + plb.add_u64(l_mds_per_client_metrics_dentry_lease_hits, + "dentry_lease_hits", "Dentry Lease Hits", "hden", PerfCountersBuilder::PRIO_CRITICAL); + plb.add_u64(l_mds_per_client_metrics_dentry_lease_misses, + "dentry_lease_miss", "Dentry Lease Misses", "mden", PerfCountersBuilder::PRIO_CRITICAL); + plb.add_u64(l_mds_per_client_metrics_opened_files, + "opened_files", "Open Files", "ofil", PerfCountersBuilder::PRIO_CRITICAL); + plb.add_u64(l_mds_per_client_metrics_opened_inodes, + "opened_inodes", "Open Inodes", "oino", PerfCountersBuilder::PRIO_CRITICAL); + plb.add_u64(l_mds_per_client_metrics_pinned_icaps, + "pinned_icaps", "Pinned Inode Caps", "pino", PerfCountersBuilder::PRIO_CRITICAL); + plb.add_u64(l_mds_per_client_metrics_total_inodes, + "total_inodes", "Total Inodes", "tino", PerfCountersBuilder::PRIO_CRITICAL); + plb.add_u64(l_mds_per_client_metrics_total_read_ops, + "total_read_ops", "Total Read Operations", "rops", PerfCountersBuilder::PRIO_CRITICAL); + plb.add_u64(l_mds_per_client_metrics_total_read_size, + "total_read_size", "Total Read Size", "rsiz", PerfCountersBuilder::PRIO_CRITICAL); + plb.add_u64(l_mds_per_client_metrics_total_write_ops, + "total_write_ops", "Total Write Operations", "wops", PerfCountersBuilder::PRIO_CRITICAL); + plb.add_u64(l_mds_per_client_metrics_total_write_size, + "total_write_size", "Total Write Size", "wsiz", PerfCountersBuilder::PRIO_CRITICAL); + client_perf_counters[crpair] = plb.create_perf_counters(); + m_cct->get_perfcounters_collection()->add(client_perf_counters[crpair]); + } + + // update perf counters + PerfCounters *perf_counter_ptr = nullptr; + if (client_perf_counters.contains(crpair)) { + perf_counter_ptr = client_perf_counters[crpair]; + } + + if (perf_counter_ptr) { + // client capability hit ratio + 
perf_counter_ptr->set(l_mds_per_client_metrics_cap_hits, metrics.cap_hit_metric.hits); + perf_counter_ptr->set(l_mds_per_client_metrics_cap_misses, metrics.cap_hit_metric.misses); + + // some averages + if (metrics.read_latency_metric.updated) { + utime_t ravg(metrics.read_latency_metric.mean.tv.tv_sec * 100, + metrics.read_latency_metric.mean.tv.tv_nsec / 1000000); + perf_counter_ptr->tset(l_mds_per_client_metrics_avg_read_latency, ravg); + } + if (metrics.write_latency_metric.updated) { + utime_t wavg(metrics.write_latency_metric.mean.tv.tv_sec * 100, + metrics.write_latency_metric.mean.tv.tv_nsec / 1000000); + perf_counter_ptr->set(l_mds_per_client_metrics_avg_write_latency, wavg); + } + if (metrics.metadata_latency_metric.updated) { + utime_t mavg(metrics.metadata_latency_metric.mean.tv.tv_sec * 100, + metrics.metadata_latency_metric.mean.tv.tv_nsec / 1000000); + perf_counter_ptr->set(l_mds_per_client_metrics_avg_metadata_latency, mavg); + } + + // dentry leases + if (metrics.dentry_lease_metric.updated) { + perf_counter_ptr->set(l_mds_per_client_metrics_dentry_lease_hits, metrics.dentry_lease_metric.hits); + perf_counter_ptr->set(l_mds_per_client_metrics_dentry_lease_misses, metrics.dentry_lease_metric.misses); + } + + // file+inode opens, pinned inode caps + if (metrics.opened_files_metric.updated) { + perf_counter_ptr->set(l_mds_per_client_metrics_opened_files, metrics.opened_files_metric.opened_files); + perf_counter_ptr->set(l_mds_per_client_metrics_total_inodes, metrics.opened_files_metric.total_inodes); + } + if (metrics.opened_inodes_metric.updated) { + perf_counter_ptr->set(l_mds_per_client_metrics_opened_inodes, metrics.opened_inodes_metric.total_inodes); + perf_counter_ptr->set(l_mds_per_client_metrics_total_inodes, metrics.opened_inodes_metric.total_inodes); + } + if (metrics.pinned_icaps_metric.updated) { + perf_counter_ptr->set(l_mds_per_client_metrics_pinned_icaps, metrics.pinned_icaps_metric.pinned_icaps); + perf_counter_ptr->set(l_mds_per_client_metrics_total_inodes, metrics.pinned_icaps_metric.total_inodes); + } + + // read+write io metrics + if (metrics.read_io_sizes_metric.updated) { + perf_counter_ptr->set(l_mds_per_client_metrics_total_read_ops, metrics.read_io_sizes_metric.total_ops); + perf_counter_ptr->set(l_mds_per_client_metrics_total_read_size, metrics.read_io_sizes_metric.total_size); + } + if (metrics.write_io_sizes_metric.updated) { + perf_counter_ptr->set(l_mds_per_client_metrics_total_write_ops, metrics.write_io_sizes_metric.total_ops); + perf_counter_ptr->set(l_mds_per_client_metrics_total_write_size, metrics.write_io_sizes_metric.total_size); + } } auto update_counter_func = [&metrics](const MDSPerformanceCounterDescriptor &d, @@ -260,6 +418,13 @@ void MetricAggregator::remove_metrics_for_rank(const entity_inst_t &client, ceph_assert(rm); dout(20) << ": rank=" << rank << " has " << p.size() << " connected" << " client(s)" << dendl; + auto crpair = std::make_pair(client, rank); + m_cct->get_perfcounters_collection()->remove(client_perf_counters[crpair]); + delete client_perf_counters[crpair]; + client_perf_counters.erase(crpair); + } + if (m_perf_counters) { + m_perf_counters->dec(l_mds_client_metrics_num_clients); } auto sub_key_func = [client, rank](const MDSPerfMetricSubKeyDescriptor &d, @@ -315,6 +480,10 @@ void MetricAggregator::handle_mds_metrics(const cref_t<MMDSMetrics> &m) { << rank << " with sequence number " << seq << dendl; std::scoped_lock locker(lock); + if (stopping) { + dout(10) << ": stopping" << dendl; + return; + } if 
(!mds_pinger.pong_received(rank, seq)) { return; } diff --git a/src/mds/MetricAggregator.h b/src/mds/MetricAggregator.h index fe9aef2e3..6d48756f7 100644 --- a/src/mds/MetricAggregator.h +++ b/src/mds/MetricAggregator.h @@ -11,6 +11,7 @@ #include "msg/msg_types.h" #include "msg/Dispatcher.h" #include "common/ceph_mutex.h" +#include "common/perf_counters.h" #include "include/common_fwd.h" #include "messages/MMDSMetrics.h" @@ -55,6 +56,7 @@ private: // drop this lock when calling ->send_message_mds() else mds might // deadlock ceph::mutex lock = ceph::make_mutex("MetricAggregator::lock"); + CephContext *m_cct; MDSRank *mds; MgrClient *mgrc; @@ -72,6 +74,9 @@ private: bool stopping = false; + PerfCounters *m_perf_counters; + std::map<std::pair<entity_inst_t, mds_rank_t>, PerfCounters*> client_perf_counters; + void handle_mds_metrics(const cref_t<MMDSMetrics> &m); void refresh_metrics_for_rank(const entity_inst_t &client, mds_rank_t rank, diff --git a/src/mds/Mutation.h b/src/mds/Mutation.h index b963dee08..bc83f2191 100644 --- a/src/mds/Mutation.h +++ b/src/mds/Mutation.h @@ -387,6 +387,12 @@ struct MDRequestImpl : public MutationImpl { void set_filepath(const filepath& fp); void set_filepath2(const filepath& fp); bool is_queued_for_replay() const; + bool get_queued_next_replay_op() const { + return queued_next_replay_op; + } + void set_queued_next_replay_op() { + queued_next_replay_op = true; + } int compare_paths(); bool can_batch(); @@ -460,6 +466,7 @@ protected: } void _dump(ceph::Formatter *f, bool has_mds_lock) const; void _dump_op_descriptor(std::ostream& stream) const override; + bool queued_next_replay_op = false; }; struct MDPeerUpdate { diff --git a/src/mds/OpenFileTable.h b/src/mds/OpenFileTable.h index 1f91c2020..b18395213 100644 --- a/src/mds/OpenFileTable.h +++ b/src/mds/OpenFileTable.h @@ -113,7 +113,7 @@ protected: version_t omap_version = 0; - unsigned omap_num_objs = 0; + uint32_t omap_num_objs = 0; std::vector<unsigned> omap_num_items; std::map<inodeno_t, OpenedAnchor> anchor_map; diff --git a/src/mds/ScrubStack.cc b/src/mds/ScrubStack.cc index 6d799343f..742c464f4 100644 --- a/src/mds/ScrubStack.cc +++ b/src/mds/ScrubStack.cc @@ -320,7 +320,7 @@ void ScrubStack::scrub_dir_inode(CInode *in, bool *added_children, bool *done) frag_vec_t frags; in->dirfragtree.get_leaves(frags); - dout(20) << __func__ << "recursive mode, frags " << frags << dendl; + dout(20) << __func__ << " recursive mode, frags " << frags << dendl; for (auto &fg : frags) { if (queued.contains(fg)) continue; @@ -366,7 +366,6 @@ void ScrubStack::scrub_dir_inode(CInode *in, bool *added_children, bool *done) scrub_r.tag = header->get_tag(); for (auto& p : scrub_remote) { - p.second.simplify(); dout(20) << __func__ << " forward " << p.second << " to mds." 
<< p.first << dendl; auto r = make_message<MMDSScrub>(MMDSScrub::OP_QUEUEDIR, in->ino(), std::move(p.second), header->get_tag(), diff --git a/src/mds/Server.cc b/src/mds/Server.cc index ced4ecffa..48e7b03ae 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -31,6 +31,7 @@ #include "Mutation.h" #include "MetricsHandler.h" #include "cephfs_features.h" +#include "MDSContext.h" #include "msg/Messenger.h" @@ -305,6 +306,7 @@ void Server::dispatch(const cref_t<Message> &m) return; } bool queue_replay = false; + dout(5) << "dispatch request in up:reconnect: " << *req << dendl; if (req->is_replay() || req->is_async()) { dout(3) << "queuing replayed op" << dendl; queue_replay = true; @@ -323,10 +325,13 @@ void Server::dispatch(const cref_t<Message> &m) // process completed request in clientreplay stage. The completed request // might have created new file/directorie. This guarantees MDS sends a reply // to client before other request modifies the new file/directorie. - if (session->have_completed_request(req->get_reqid().tid, NULL)) { - dout(3) << "queuing completed op" << dendl; + bool r = session->have_completed_request(req->get_reqid().tid, NULL); + if (r) { + dout(3) << __func__ << ": queuing completed op" << dendl; queue_replay = true; - } + } else { + dout(20) << __func__ << ": request not complete" << dendl; + } // this request was created before the cap reconnect message, drop any embedded // cap releases. req->releases.clear(); @@ -360,6 +365,9 @@ void Server::dispatch(const cref_t<Message> &m) case CEPH_MSG_CLIENT_REQUEST: handle_client_request(ref_cast<MClientRequest>(m)); return; + case CEPH_MSG_CLIENT_REPLY: + handle_client_reply(ref_cast<MClientReply>(m)); + return; case CEPH_MSG_CLIENT_RECLAIM: handle_client_reclaim(ref_cast<MClientReclaim>(m)); return; @@ -615,6 +623,7 @@ void Server::handle_client_session(const cref_t<MClientSession> &m) session->get_push_seq()); if (session->info.has_feature(CEPHFS_FEATURE_MIMIC)) reply->supported_features = supported_features; + session->auth_caps.get_cap_auths(&reply->cap_auths); mds->send_message_client(reply, session); if (mdcache->is_readonly()) { auto m = make_message<MClientSession>(CEPH_SESSION_FORCE_RO); @@ -708,6 +717,17 @@ void Server::handle_client_session(const cref_t<MClientSession> &m) break; } + if (session->auth_caps.root_squash_in_caps() && !client_metadata.features.test(CEPHFS_FEATURE_MDS_AUTH_CAPS_CHECK)) { + CachedStackStringStream css; + *css << "client lacks CEPHFS_FEATURE_MDS_AUTH_CAPS_CHECK needed to enforce 'root_squash' MDS auth caps"; + send_reject_message(css->strv()); + mds->clog->warn() << "client session (" << session->info.inst + << ") lacks CEPHFS_FEATURE_MDS_AUTH_CAPS_CHECK " + << " needed to enforce 'root_squash' MDS auth caps"; + session->clear(); + break; + + } // Special case for the 'root' metadata path; validate that the claimed // root is actually within the caps of the session if (auto it = client_metadata.find("root"); it != client_metadata.end()) { @@ -769,6 +789,7 @@ void Server::handle_client_session(const cref_t<MClientSession> &m) mds->locker->resume_stale_caps(session); mds->sessionmap.touch_session(session); } + trim_completed_request_list(m->oldest_client_tid, session); auto reply = make_message<MClientSession>(CEPH_SESSION_RENEWCAPS, m->get_seq()); mds->send_message_client(reply, session); } else { @@ -905,6 +926,7 @@ void Server::_session_logged(Session *session, uint64_t state_seq, bool open, ve reply->supported_features = supported_features; reply->metric_spec = supported_metric_spec; 
} + session->auth_caps.get_cap_auths(&reply->cap_auths); mds->send_message_client(reply, session); if (mdcache->is_readonly()) { auto m = make_message<MClientSession>(CEPH_SESSION_FORCE_RO); @@ -1061,6 +1083,7 @@ void Server::finish_force_open_sessions(const map<client_t,pair<Session*,uint64_ reply->supported_features = supported_features; reply->metric_spec = supported_metric_spec; } + session->auth_caps.get_cap_auths(&reply->cap_auths); mds->send_message_client(reply, session); if (mdcache->is_readonly()) @@ -1132,10 +1155,12 @@ void Server::find_idle_sessions() return; } - std::vector<Session*> to_evict; - bool defer_session_stale = g_conf().get_val<bool>("mds_defer_session_stale"); const auto sessions_p1 = mds->sessionmap.by_state.find(Session::STATE_OPEN); + bool defer_client_eviction = + g_conf().get_val<bool>("defer_client_eviction_on_laggy_osds") + && mds->objecter->with_osdmap([](const OSDMap &map) { + return map.any_osd_laggy(); }); if (sessions_p1 != mds->sessionmap.by_state.end() && !sessions_p1->second->empty()) { std::vector<Session*> new_stale; @@ -1160,7 +1185,7 @@ void Server::find_idle_sessions() dout(20) << "evicting session " << session->info.inst << " since autoclose " "has arrived" << dendl; // evict session without marking it stale - to_evict.push_back(session); + laggy_clients.insert(session->get_client()); continue; } @@ -1189,7 +1214,7 @@ void Server::find_idle_sessions() } // do not go through stale, evict it directly. - to_evict.push_back(session); + laggy_clients.insert(session->get_client()); } else { dout(10) << "new stale session " << session->info.inst << " last renewed caps " << last_cap_renew_span << "s ago" << dendl; @@ -1205,7 +1230,7 @@ void Server::find_idle_sessions() auto m = make_message<MClientSession>(CEPH_SESSION_STALE); mds->send_message_client(m, session); } else { - to_evict.push_back(session); + laggy_clients.insert(session->get_client()); } } } @@ -1224,11 +1249,21 @@ void Server::find_idle_sessions() << " and recently renewed caps " << last_cap_renew_span << "s ago" << dendl; break; } - to_evict.push_back(session); + laggy_clients.insert(session->get_client()); } } - for (auto session: to_evict) { + // don't evict client(s) if osds are laggy + if(defer_client_eviction && !laggy_clients.empty()) { + dout(5) << "Detected " << laggy_clients.size() + << " laggy clients, possibly due to laggy OSDs." + " Eviction is skipped until the OSDs return to normal." 
+ << dendl; + return; + } + + for (auto client: laggy_clients) { + Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(client.v)); if (session->is_importing()) { dout(10) << "skipping session " << session->info.inst << ", it's being imported" << dendl; continue; @@ -1247,6 +1282,8 @@ void Server::find_idle_sessions() kill_session(session, NULL); } } + // clear as there's no use to keep the evicted clients in laggy_clients + clear_laggy_clients(); } void Server::evict_cap_revoke_non_responders() { @@ -1255,6 +1292,20 @@ void Server::evict_cap_revoke_non_responders() { } auto&& to_evict = mds->locker->get_late_revoking_clients(cap_revoke_eviction_timeout); + // don't evict client(s) if osds are laggy + bool defer_client_eviction = + g_conf().get_val<bool>("defer_client_eviction_on_laggy_osds") + && mds->objecter->with_osdmap([](const OSDMap &map) { + return map.any_osd_laggy(); }) + && to_evict.size(); + if(defer_client_eviction) { + laggy_clients.insert(to_evict.begin(), to_evict.end()); + dout(0) << "Detected " << to_evict.size() + << " unresponsive clients, possibly due to laggy OSDs." + " Eviction is skipped until the OSDs return to normal." + << dendl; + return; + } for (auto const &client: to_evict) { mds->clog->warn() << "client id " << client << " has not responded to" @@ -1522,6 +1573,12 @@ void Server::handle_client_reconnect(const cref_t<MClientReconnect> &m) *css << "missing required features '" << missing_features << "'"; error_str = css->strv(); } + if (session->auth_caps.root_squash_in_caps() && + !session->info.client_metadata.features.test(CEPHFS_FEATURE_MDS_AUTH_CAPS_CHECK)) { + CachedStackStringStream css; + *css << "client lacks CEPHFS_FEATURE_MDS_AUTH_CAPS_CHECK needed to enforce 'root_squash' MDS auth caps"; + error_str = css->strv(); + } } if (!error_str.empty()) { @@ -1549,6 +1606,7 @@ void Server::handle_client_reconnect(const cref_t<MClientReconnect> &m) reply->supported_features = supported_features; reply->metric_spec = supported_metric_spec; } + session->auth_caps.get_cap_auths(&reply->cap_auths); mds->send_message_client(reply, session); mds->clog->debug() << "reconnect by " << session->info.inst << " after " << delay; } @@ -1984,12 +2042,15 @@ void Server::journal_and_reply(MDRequestRef& mdr, CInode *in, CDentry *dn, LogEv mdr->committing = true; submit_mdlog_entry(le, fin, mdr, __func__); - if (mdr->client_request && mdr->client_request->is_queued_for_replay()) { - if (mds->queue_one_replay()) { - dout(10) << " queued next replay op" << dendl; - } else { - dout(10) << " journaled last replay op" << dendl; - } + if (mdr->is_queued_for_replay()) { + + /* We want to queue the next replay op while waiting for the journaling, so + * do it now when the early (unsafe) replay is dispatched. Don't wait until + * this request is cleaned up in MDCache.cc. 
+ */ + + mdr->set_queued_next_replay_op(); + mds->queue_one_replay(); } else if (mdr->did_early_reply) mds->locker->drop_rdlocks_for_early_reply(mdr.get()); else @@ -2293,15 +2354,16 @@ void Server::reply_client_request(MDRequestRef& mdr, const ref_t<MClientReply> & mds->send_message_client(reply, session); } - if (req->is_queued_for_replay() && - (mdr->has_completed || reply->get_result() < 0)) { - if (reply->get_result() < 0) { - int r = reply->get_result(); + if (client_inst.name.is_mds() && reply->get_op() == CEPH_MDS_OP_RENAME) { + mds->send_message(reply, mdr->client_request->get_connection()); + } + + if (req->is_queued_for_replay()) { + if (int r = reply->get_result(); r < 0) { derr << "reply_client_request: failed to replay " << *req - << " error " << r << " (" << cpp_strerror(r) << ")" << dendl; + << " error " << r << " (" << cpp_strerror(r) << ")" << dendl; mds->clog->warn() << "failed to replay " << req->get_reqid() << " error " << r; } - mds->queue_one_replay(); } // clean up request @@ -2391,6 +2453,35 @@ void Server::set_trace_dist(const ref_t<MClientReply> &reply, reply->set_trace(bl); } +// trim completed_request list +void Server::trim_completed_request_list(ceph_tid_t tid, Session *session) +{ + if (tid == UINT64_MAX || !session) + return; + + dout(15) << " oldest_client_tid=" << tid << dendl; + if (session->trim_completed_requests(tid)) { + // Sessions 'completed_requests' was dirtied, mark it to be + // potentially flushed at segment expiry. + mdlog->get_current_segment()->touched_sessions.insert(session->info.inst.name); + + if (session->get_num_trim_requests_warnings() > 0 && + session->get_num_completed_requests() * 2 < g_conf()->mds_max_completed_requests) + session->reset_num_trim_requests_warnings(); + } else { + if (session->get_num_completed_requests() >= + (g_conf()->mds_max_completed_requests << session->get_num_trim_requests_warnings())) { + session->inc_num_trim_requests_warnings(); + CachedStackStringStream css; + *css << "client." << session->get_client() << " does not advance its oldest_client_tid (" + << tid << "), " << session->get_num_completed_requests() + << " completed requests recorded in session\n"; + mds->clog->warn() << css->strv(); + dout(20) << __func__ << " " << css->strv() << dendl; + } + } +} + void Server::handle_client_request(const cref_t<MClientRequest> &req) { dout(4) << "handle_client_request " << *req << dendl; @@ -2472,36 +2563,16 @@ void Server::handle_client_request(const cref_t<MClientRequest> &req) } // trim completed_request list - if (req->get_oldest_client_tid() > 0) { - dout(15) << " oldest_client_tid=" << req->get_oldest_client_tid() << dendl; - ceph_assert(session); - if (session->trim_completed_requests(req->get_oldest_client_tid())) { - // Sessions 'completed_requests' was dirtied, mark it to be - // potentially flushed at segment expiry. - mdlog->get_current_segment()->touched_sessions.insert(session->info.inst.name); - - if (session->get_num_trim_requests_warnings() > 0 && - session->get_num_completed_requests() * 2 < g_conf()->mds_max_completed_requests) - session->reset_num_trim_requests_warnings(); - } else { - if (session->get_num_completed_requests() >= - (g_conf()->mds_max_completed_requests << session->get_num_trim_requests_warnings())) { - session->inc_num_trim_requests_warnings(); - CachedStackStringStream css; - *css << "client." 
<< session->get_client() << " does not advance its oldest_client_tid (" - << req->get_oldest_client_tid() << "), " - << session->get_num_completed_requests() - << " completed requests recorded in session\n"; - mds->clog->warn() << css->strv(); - dout(20) << __func__ << " " << css->strv() << dendl; - } - } - } + trim_completed_request_list(req->get_oldest_client_tid(), session); // register + dispatch MDRequestRef mdr = mdcache->request_start(req); - if (!mdr.get()) + if (!mdr.get()) { + dout(5) << __func__ << ": possibly duplicate op " << *req << dendl; + if (req->is_queued_for_replay()) + mds->queue_one_replay(); return; + } if (session) { mdr->session = session; @@ -2525,6 +2596,28 @@ void Server::handle_client_request(const cref_t<MClientRequest> &req) return; } +void Server::handle_client_reply(const cref_t<MClientReply> &reply) +{ + dout(4) << "handle_client_reply " << *reply << dendl; + + ceph_assert(reply->is_safe()); + ceph_tid_t tid = reply->get_tid(); + + if (mds->internal_client_requests.count(tid) == 0) { + dout(1) << " no pending request on tid " << tid << dendl; + return; + } + + switch (reply->get_op()) { + case CEPH_MDS_OP_RENAME: + break; + default: + dout(5) << " unknown client op " << reply->get_op() << dendl; + } + + mds->internal_client_requests.erase(tid); +} + void Server::handle_osd_map() { /* Note that we check the OSDMAP_FULL flag directly rather than @@ -4534,6 +4627,20 @@ public: } }; +bool Server::is_valid_layout(file_layout_t *layout) +{ + if (!layout->is_valid()) { + dout(10) << " invalid initial file layout" << dendl; + return false; + } + if (!mds->mdsmap->is_data_pool(layout->pool_id)) { + dout(10) << " invalid data pool " << layout->pool_id << dendl; + return false; + } + + return true; +} + /* This function takes responsibility for the passed mdr*/ void Server::handle_client_openc(MDRequestRef& mdr) { @@ -4608,13 +4715,7 @@ void Server::handle_client_openc(MDRequestRef& mdr) access |= MAY_SET_VXATTR; } - if (!layout.is_valid()) { - dout(10) << " invalid initial file layout" << dendl; - respond_to_request(mdr, -CEPHFS_EINVAL); - return; - } - if (!mds->mdsmap->is_data_pool(layout.pool_id)) { - dout(10) << " invalid data pool " << layout.pool_id << dendl; + if (!is_valid_layout(&layout)) { respond_to_request(mdr, -CEPHFS_EINVAL); return; } @@ -4864,7 +4965,7 @@ void Server::handle_client_readdir(MDRequestRef& mdr) unsigned max_bytes = req->head.args.readdir.max_bytes; if (!max_bytes) // make sure at least one item can be encoded - max_bytes = (512 << 10) + g_conf()->mds_max_xattr_pairs_size; + max_bytes = (512 << 10) + mds->mdsmap->get_max_xattr_size(); // start final blob bufferlist dirbl; @@ -5503,13 +5604,7 @@ void Server::handle_client_setlayout(MDRequestRef& mdr) access |= MAY_SET_VXATTR; } - if (!layout.is_valid()) { - dout(10) << "bad layout" << dendl; - respond_to_request(mdr, -CEPHFS_EINVAL); - return; - } - if (!mds->mdsmap->is_data_pool(layout.pool_id)) { - dout(10) << " invalid data pool " << layout.pool_id << dendl; + if (!is_valid_layout(&layout)) { respond_to_request(mdr, -CEPHFS_EINVAL); return; } @@ -5636,14 +5731,8 @@ void Server::handle_client_setdirlayout(MDRequestRef& mdr) if (layout != old_layout) { access |= MAY_SET_VXATTR; } - - if (!layout.is_valid()) { - dout(10) << "bad layout" << dendl; - respond_to_request(mdr, -CEPHFS_EINVAL); - return; - } - if (!mds->mdsmap->is_data_pool(layout.pool_id)) { - dout(10) << " invalid data pool " << layout.pool_id << dendl; + + if (!is_valid_layout(&layout)) { respond_to_request(mdr, 
-CEPHFS_EINVAL); return; } @@ -5821,15 +5910,11 @@ int Server::parse_layout_vxattr(string name, string value, const OSDMap& osdmap, if (r < 0) { return r; } - - if (validate && !layout->is_valid()) { - dout(10) << __func__ << ": bad layout" << dendl; - return -CEPHFS_EINVAL; - } - if (!mds->mdsmap->is_data_pool(layout->pool_id)) { - dout(10) << __func__ << ": invalid data pool " << layout->pool_id << dendl; - return -CEPHFS_EINVAL; + + if (!is_valid_layout(layout)) { + return -CEPHFS_EINVAL; } + return 0; } @@ -5859,9 +5944,13 @@ int Server::parse_quota_vxattr(string name, string value, quota_info_t *quota) return r; } } else if (name == "quota.max_bytes") { - int64_t q = boost::lexical_cast<int64_t>(value); - if (q < 0) + string cast_err; + int64_t q = strict_iec_cast<int64_t>(value, &cast_err); + if(!cast_err.empty()) { + dout(10) << __func__ << ": failed to parse quota.max_bytes: " + << cast_err << dendl; return -CEPHFS_EINVAL; + } quota->max_bytes = q; } else if (name == "quota.max_files") { int64_t q = boost::lexical_cast<int64_t>(value); @@ -6127,6 +6216,10 @@ void Server::handle_set_vxattr(MDRequestRef& mdr, CInode *cur) inodeno_t subvol_ino = realm->get_subvolume_ino(); // can't create subvolume inside another subvolume if (subvol_ino && subvol_ino != cur->ino()) { + dout(20) << "subvol ino changed between rdlock release and xlock " + << "policylock; subvol_ino: " << subvol_ino << ", " + << "cur->ino: " << cur->ino() + << dendl; respond_to_request(mdr, -CEPHFS_EINVAL); return; } @@ -6141,10 +6234,13 @@ void Server::handle_set_vxattr(MDRequestRef& mdr, CInode *cur) auto pi = cur->project_inode(mdr, false, true); if (!srnode) pi.snapnode->created = pi.snapnode->seq = realm->get_newest_seq(); - if (val) + if (val) { + dout(20) << "marking subvolume for ino: " << cur->ino() << dendl; pi.snapnode->mark_subvolume(); - else + } else { + dout(20) << "clearing subvolume for ino: " << cur->ino() << dendl; pi.snapnode->clear_subvolume(); + } mdr->no_early_reply = true; pip = pi.inode.get(); @@ -6531,9 +6627,9 @@ void Server::handle_client_setxattr(MDRequestRef& mdr) auto handler = Server::get_xattr_or_default_handler(name); const auto& pxattrs = cur->get_projected_xattrs(); + size_t cur_xattrs_size = 0; if (pxattrs) { // check xattrs kv pairs size - size_t cur_xattrs_size = 0; for (const auto& p : *pxattrs) { if ((flags & CEPH_XATTR_REPLACE) && name.compare(p.first) == 0) { continue; @@ -6541,12 +6637,12 @@ void Server::handle_client_setxattr(MDRequestRef& mdr) cur_xattrs_size += p.first.length() + p.second.length(); } - if (((cur_xattrs_size + inc) > g_conf()->mds_max_xattr_pairs_size)) { - dout(10) << "xattr kv pairs size too big. cur_xattrs_size " - << cur_xattrs_size << ", inc " << inc << dendl; - respond_to_request(mdr, -CEPHFS_ENOSPC); - return; - } + } + if (((cur_xattrs_size + inc) > mds->mdsmap->get_max_xattr_size())) { + dout(10) << "xattr kv pairs size too big. 
cur_xattrs_size " + << cur_xattrs_size << ", inc " << inc << dendl; + respond_to_request(mdr, -CEPHFS_ENOSPC); + return; } XattrOp xattr_op(CEPH_MDS_OP_SETXATTR, name, req->get_data(), flags); @@ -6904,6 +7000,11 @@ void Server::handle_client_mknod(MDRequestRef& mdr) else layout = mdcache->default_file_layout; + if (!is_valid_layout(&layout)) { + respond_to_request(mdr, -CEPHFS_EINVAL); + return; + } + CInode *newi = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino), mode, &layout); ceph_assert(newi); @@ -10759,7 +10860,7 @@ void Server::handle_client_lssnap(MDRequestRef& mdr) int max_bytes = req->head.args.readdir.max_bytes; if (!max_bytes) // make sure at least one item can be encoded - max_bytes = (512 << 10) + g_conf()->mds_max_xattr_pairs_size; + max_bytes = (512 << 10) + mds->mdsmap->get_max_xattr_size(); __u64 last_snapid = 0; string offset_str = req->get_path2(); @@ -11413,7 +11514,7 @@ void Server::handle_client_readdir_snapdiff(MDRequestRef& mdr) unsigned max_bytes = req->head.args.snapdiff.max_bytes; if (!max_bytes) // make sure at least one item can be encoded - max_bytes = (512 << 10) + g_conf()->mds_max_xattr_pairs_size; + max_bytes = (512 << 10) + mds->mdsmap->get_max_xattr_size(); // start final blob bufferlist dirbl; diff --git a/src/mds/Server.h b/src/mds/Server.h index 81a5933ba..47f86518b 100644 --- a/src/mds/Server.h +++ b/src/mds/Server.h @@ -158,7 +158,9 @@ public: void force_clients_readonly(); // -- requests -- + void trim_completed_request_list(ceph_tid_t tid, Session *session); void handle_client_request(const cref_t<MClientRequest> &m); + void handle_client_reply(const cref_t<MClientReply> &m); void journal_and_reply(MDRequestRef& mdr, CInode *tracei, CDentry *tracedn, LogEvent *le, MDSLogContextBase *fin); @@ -235,6 +237,9 @@ public: void handle_client_removexattr(MDRequestRef& mdr); void handle_client_fsync(MDRequestRef& mdr); + + // check layout + bool is_valid_layout(file_layout_t *layout); // open void handle_client_open(MDRequestRef& mdr); @@ -328,6 +333,13 @@ public: std::set<client_t> client_reclaim_gather; + std::set<client_t> get_laggy_clients() const { + return laggy_clients; + } + void clear_laggy_clients() { + laggy_clients.clear(); + } + const bufferlist& get_snap_trace(Session *session, SnapRealm *realm) const; const bufferlist& get_snap_trace(client_t client, SnapRealm *realm) const; @@ -553,6 +565,9 @@ private: size_t alternate_name_max = g_conf().get_val<Option::size_t>("mds_alternate_name_max"); size_t fscrypt_last_block_max_size = g_conf().get_val<Option::size_t>("mds_fscrypt_last_block_max_size"); + + // record laggy clients due to laggy OSDs + std::set<client_t> laggy_clients; }; static inline constexpr auto operator|(Server::RecallFlags a, Server::RecallFlags b) { diff --git a/src/mds/SessionMap.cc b/src/mds/SessionMap.cc index 720396338..9cc2b0138 100644 --- a/src/mds/SessionMap.cc +++ b/src/mds/SessionMap.cc @@ -622,6 +622,9 @@ void Session::dump(Formatter *f, bool cap_dump) const f->dump_object("session_cache_liveness", session_cache_liveness); f->dump_object("cap_acquisition", cap_acquisition); + f->dump_unsigned("last_trim_completed_requests_tid", last_trim_completed_requests_tid); + f->dump_unsigned("last_trim_completed_flushes_tid", last_trim_completed_flushes_tid); + f->open_array_section("delegated_inos"); for (const auto& [start, len] : delegated_inos) { f->open_object_section("ino_range"); diff --git a/src/mds/SessionMap.h b/src/mds/SessionMap.h index ddf227be9..360dd66a2 100644 --- a/src/mds/SessionMap.h +++ 
b/src/mds/SessionMap.h @@ -314,6 +314,7 @@ public: bool trim_completed_requests(ceph_tid_t mintid) { // trim bool erased_any = false; + last_trim_completed_requests_tid = mintid; while (!info.completed_requests.empty() && (mintid == 0 || info.completed_requests.begin()->first < mintid)) { info.completed_requests.erase(info.completed_requests.begin()); @@ -339,6 +340,7 @@ public: } bool trim_completed_flushes(ceph_tid_t mintid) { bool erased_any = false; + last_trim_completed_flushes_tid = mintid; while (!info.completed_flushes.empty() && (mintid == 0 || *info.completed_flushes.begin() < mintid)) { info.completed_flushes.erase(info.completed_flushes.begin()); @@ -493,6 +495,9 @@ private: unsigned num_trim_flushes_warnings = 0; unsigned num_trim_requests_warnings = 0; + + ceph_tid_t last_trim_completed_requests_tid = 0; + ceph_tid_t last_trim_completed_flushes_tid = 0; }; class SessionFilter diff --git a/src/mds/SnapRealm.cc b/src/mds/SnapRealm.cc index 9d303bcb0..ac8cdf832 100644 --- a/src/mds/SnapRealm.cc +++ b/src/mds/SnapRealm.cc @@ -309,7 +309,7 @@ void SnapRealm::adjust_parent() void SnapRealm::split_at(SnapRealm *child) { - dout(10) << "split_at " << *child + dout(10) << __func__ << ": " << *child << " on " << *child->inode << dendl; if (inode->is_mdsdir() || !child->inode->is_dir()) { @@ -328,8 +328,23 @@ void SnapRealm::split_at(SnapRealm *child) // it's a dir. + if (child->inode->get_projected_parent_dir()->inode->is_stray()) { + if (child->inode->containing_realm) { + dout(10) << " moving unlinked directory inode" << dendl; + child->inode->move_to_realm(child); + } else { + /* This shouldn't happen because an unlinked directory will have caps + * issued to the caller executing rmdir (for today's clients). + */ + dout(10) << " skipping unlinked directory inode w/o caps" << dendl; + } + return; + } + // split open_children - dout(10) << " open_children are " << open_children << dendl; + if (!open_children.empty()) { + dout(10) << " open_children are " << open_children << dendl; + } for (set<SnapRealm*>::iterator p = open_children.begin(); p != open_children.end(); ) { SnapRealm *realm = *p; @@ -346,17 +361,25 @@ void SnapRealm::split_at(SnapRealm *child) } // split inodes_with_caps + std::unordered_map<CInode const*,bool> visited; + uint64_t count = 0; + dout(20) << " reserving space for " << CDir::count() << " dirs" << dendl; + visited.reserve(CDir::count()); /* a reasonable starting poing: keep in mind there may be CInode directories without fragments in cache */ for (auto p = inodes_with_caps.begin(); !p.end(); ) { CInode *in = *p; ++p; // does inode fall within the child realm? 
- if (child->inode->is_ancestor_of(in)) { - dout(20) << " child gets " << *in << dendl; + if (child->inode->is_ancestor_of(in, &visited)) { + dout(25) << " child gets " << *in << dendl; in->move_to_realm(child); + ++count; } else { - dout(20) << " keeping " << *in << dendl; + dout(25) << " keeping " << *in << dendl; } } + dout(20) << " visited " << visited.size() << " directories" << dendl; + + dout(10) << __func__ << ": split " << count << " inodes" << dendl; } void SnapRealm::merge_to(SnapRealm *newparent) diff --git a/src/mds/StrayManager.cc b/src/mds/StrayManager.cc index d288ce661..1f729b9c1 100644 --- a/src/mds/StrayManager.cc +++ b/src/mds/StrayManager.cc @@ -675,24 +675,41 @@ void StrayManager::reintegrate_stray(CDentry *straydn, CDentry *rdn) { dout(10) << __func__ << " " << *straydn << " to " << *rdn << dendl; + if (straydn->reintegration_reqid) { + dout(20) << __func__ << ": stray dentry " << *straydn + << " is already under reintegrating" << dendl; + return; + } + logger->inc(l_mdc_strays_reintegrated); - + // rename it to remote linkage . filepath src(straydn->get_name(), straydn->get_dir()->ino()); filepath dst(rdn->get_name(), rdn->get_dir()->ino()); + ceph_tid_t tid = mds->issue_tid(); + auto req = make_message<MClientRequest>(CEPH_MDS_OP_RENAME); req->set_filepath(dst); req->set_filepath2(src); - req->set_tid(mds->issue_tid()); + req->set_tid(tid); + + auto ptr = std::make_unique<StrayEvalRequest>(CEPH_MDS_OP_RENAME, tid, straydn); + mds->internal_client_requests.emplace(tid, std::move(ptr)); mds->send_message_mds(req, rdn->authority().first); } - + void StrayManager::migrate_stray(CDentry *dn, mds_rank_t to) { dout(10) << __func__ << " " << *dn << " to mds." << to << dendl; + if (dn->reintegration_reqid) { + dout(20) << __func__ << ": stray dentry " << *dn + << " is already under migrating" << dendl; + return; + } + logger->inc(l_mdc_strays_migrated); // rename it to another mds. 
@@ -702,10 +719,15 @@ void StrayManager::migrate_stray(CDentry *dn, mds_rank_t to) filepath src(dn->get_name(), dirino); filepath dst(dn->get_name(), MDS_INO_STRAY(to, MDS_INO_STRAY_INDEX(dirino))); + ceph_tid_t tid = mds->issue_tid(); + auto req = make_message<MClientRequest>(CEPH_MDS_OP_RENAME); req->set_filepath(dst); req->set_filepath2(src); - req->set_tid(mds->issue_tid()); + req->set_tid(tid); + + auto ptr = std::make_unique<StrayEvalRequest>(CEPH_MDS_OP_RENAME, tid, dn); + mds->internal_client_requests.emplace(tid, std::move(ptr)); mds->send_message_mds(req, to); } diff --git a/src/mds/StrayManager.h b/src/mds/StrayManager.h index 86b6941a5..874fbbb9a 100644 --- a/src/mds/StrayManager.h +++ b/src/mds/StrayManager.h @@ -19,15 +19,30 @@ #include <list> #include "Mutation.h" #include "PurgeQueue.h" +#include "MDSMetaRequest.h" +#include "CDentry.h" class MDSRank; class CInode; -class CDentry; class StrayManager { // My public interface is for consumption by MDCache public: + struct StrayEvalRequest : public MDSMetaRequest { + CDentry *dentry; + public: + explicit StrayEvalRequest(int o, ceph_tid_t t, CDentry *d) : + MDSMetaRequest(o, t), dentry(d) { + dentry->get(CDentry::PIN_PURGING); + dentry->reintegration_reqid = t; + } + ~StrayEvalRequest() { + dentry->reintegration_reqid = 0; + dentry->put(CDentry::PIN_PURGING); + } + }; + explicit StrayManager(MDSRank *mds, PurgeQueue &purge_queue_); void set_logger(PerfCounters *l) {logger = l;} void activate(); diff --git a/src/mds/cephfs_features.cc b/src/mds/cephfs_features.cc index 4a864076b..a0336c8ba 100644 --- a/src/mds/cephfs_features.cc +++ b/src/mds/cephfs_features.cc @@ -30,6 +30,7 @@ static const std::array feature_names "32bits_retry_fwd", "new_snaprealm_info", "has_owner_uidgid", + "client_mds_auth_caps", }; static_assert(feature_names.size() == CEPHFS_FEATURE_MAX + 1); diff --git a/src/mds/cephfs_features.h b/src/mds/cephfs_features.h index 7d215e2a3..3a67e96db 100644 --- a/src/mds/cephfs_features.h +++ b/src/mds/cephfs_features.h @@ -48,7 +48,8 @@ namespace ceph { #define CEPHFS_FEATURE_32BITS_RETRY_FWD 18 #define CEPHFS_FEATURE_NEW_SNAPREALM_INFO 19 #define CEPHFS_FEATURE_HAS_OWNER_UIDGID 20 -#define CEPHFS_FEATURE_MAX 20 +#define CEPHFS_FEATURE_MDS_AUTH_CAPS_CHECK 21 +#define CEPHFS_FEATURE_MAX 21 #define CEPHFS_FEATURES_ALL { \ 0, 1, 2, 3, 4, \ @@ -70,6 +71,7 @@ namespace ceph { CEPHFS_FEATURE_32BITS_RETRY_FWD, \ CEPHFS_FEATURE_NEW_SNAPREALM_INFO, \ CEPHFS_FEATURE_HAS_OWNER_UIDGID, \ + CEPHFS_FEATURE_MDS_AUTH_CAPS_CHECK \ } #define CEPHFS_METRIC_FEATURES_ALL { \ diff --git a/src/mds/locks.c b/src/mds/locks.c index dbe3ab8eb..f6ff8b982 100644 --- a/src/mds/locks.c +++ b/src/mds/locks.c @@ -117,7 +117,7 @@ const struct sm_state_t filelock[LOCK_MAX] = { [LOCK_XSYN_EXCL] = { LOCK_EXCL, true, LOCK_LOCK, AUTH, 0, XCL, 0, 0, 0, 0, 0,CEPH_CAP_GCACHE|CEPH_CAP_GBUFFER,0,0 }, [LOCK_XSYN] = { 0, true, LOCK_LOCK, AUTH, AUTH,AUTH,XCL, 0, 0, 0, 0,CEPH_CAP_GCACHE|CEPH_CAP_GBUFFER,0,0 }, - [LOCK_EXCL_XSYN] = { LOCK_XSYN, false, LOCK_LOCK, 0, 0, XCL, 0, 0, 0, 0, 0,CEPH_CAP_GCACHE|CEPH_CAP_GBUFFER,0,0 }, + [LOCK_EXCL_XSYN] = { LOCK_XSYN, true, LOCK_LOCK, 0, 0, XCL, 0, 0, 0, 0, 0,CEPH_CAP_GCACHE|CEPH_CAP_GBUFFER,0,0 }, [LOCK_PRE_SCAN] = { LOCK_SCAN, false, LOCK_LOCK, 0, 0, 0, 0, 0, 0, 0, 0,0,0,0 }, [LOCK_SCAN] = { LOCK_LOCK, false, LOCK_LOCK, 0, 0, 0, 0, 0, 0, 0, 0,0,0,0 }, |
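Editor's note (not part of the upstream patch): several hunks above — Beacon::notify_health, Server::find_idle_sessions and Server::evict_cap_revoke_non_responders — repeat the same gating pattern: when the new defer_client_eviction_on_laggy_osds option is enabled and the current OSDMap reports at least one laggy OSD, client eviction is skipped, the affected clients are parked in laggy_clients, and a MDS_HEALTH_CLIENTS_LAGGY warning is raised per deferred client instead. The sketch below models only that decision in isolation; Config, OSDMapView and client_t here are stand-ins, not the real md_config_t/OSDMap/Objecter types.

// Standalone model of the "defer eviction while OSDs are laggy" gate.
// Only the decision logic mirrors the patch; the types are stand-ins.
#include <cstdint>
#include <iostream>
#include <set>
#include <vector>

struct OSDMapView {
  std::vector<bool> osd_laggy;
  bool any_osd_laggy() const {                 // mirrors OSDMap::any_osd_laggy()
    for (bool l : osd_laggy) if (l) return true;
    return false;
  }
};

struct Config {
  bool defer_client_eviction_on_laggy_osds = true;  // same option name as the patch
};

using client_t = uint64_t;

// Returns the clients that may be evicted now; deferred clients stay in
// laggy_clients (the MDS would report MDS_HEALTH_CLIENTS_LAGGY for them).
std::vector<client_t> clients_to_evict(const Config& conf,
                                       const OSDMapView& osdmap,
                                       std::set<client_t>& laggy_clients,
                                       const std::vector<client_t>& candidates) {
  const bool defer = conf.defer_client_eviction_on_laggy_osds && osdmap.any_osd_laggy();
  if (defer) {
    laggy_clients.insert(candidates.begin(), candidates.end());
    return {};                                 // skip eviction until OSDs recover
  }
  laggy_clients.clear();                       // nothing to defer; evict as usual
  return candidates;
}

int main() {
  Config conf;
  OSDMapView map{{false, true, false}};        // one laggy OSD
  std::set<client_t> laggy;
  auto evict = clients_to_evict(conf, map, laggy, {4181, 4242});
  std::cout << "evicting " << evict.size() << ", deferred " << laggy.size() << "\n";
}

With a laggy OSD present this prints "evicting 0, deferred 2", which corresponds to the Server.cc hunks returning early and to Beacon.cc emitting one health metric per client left in laggy_clients; find_idle_sessions clears the set once eviction actually proceeds.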
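Editor's note (not part of the upstream patch): the StrayManager, CDentry and Server hunks above cooperate: reintegrate_stray/migrate_stray now register an internal CEPH_MDS_OP_RENAME request keyed by tid in mds->internal_client_requests, the new StrayEvalRequest pins the stray dentry and stamps it with reintegration_reqid so a second reintegration attempt is skipped, and Server::handle_client_reply erases the entry when the peer MDS's safe reply arrives. The sketch below is a simplified stand-alone model of that bookkeeping; Dentry and the request map are stand-ins, not the real CDentry or MDSRank members.

// Standalone model of the StrayEvalRequest bookkeeping added in this patch:
// an RAII record pins the dentry and stamps it with the request tid, and a
// tid-keyed map lets the reply handler drop the record (and the pin) later.
#include <cstdint>
#include <iostream>
#include <map>
#include <memory>

using ceph_tid_t = uint64_t;

struct Dentry {
  int pins = 0;
  ceph_tid_t reintegration_reqid = 0;  // non-zero while a rename is in flight
};

struct StrayEvalRequest {
  Dentry* dentry;
  StrayEvalRequest(ceph_tid_t t, Dentry* d) : dentry(d) {
    ++dentry->pins;                    // analogous to get(CDentry::PIN_PURGING)
    dentry->reintegration_reqid = t;
  }
  ~StrayEvalRequest() {
    dentry->reintegration_reqid = 0;   // allow a later retry
    --dentry->pins;                    // analogous to put(CDentry::PIN_PURGING)
  }
};

int main() {
  std::map<ceph_tid_t, std::unique_ptr<StrayEvalRequest>> internal_requests;
  Dentry straydn;

  ceph_tid_t tid = 1;
  if (straydn.reintegration_reqid) {
    std::cout << "already being reintegrated, skip\n";  // the new early return
  } else {
    internal_requests.emplace(tid, std::make_unique<StrayEvalRequest>(tid, &straydn));
  }

  // ... later, when the safe MClientReply for this tid arrives:
  internal_requests.erase(tid);        // destructor clears reqid and drops the pin
  std::cout << "pins=" << straydn.pins
            << " reqid=" << straydn.reintegration_reqid << "\n";
}

The RAII shape is the design choice worth noting: because clearing reintegration_reqid and unpinning happen in the destructor, simply erasing the map entry in handle_client_reply (or dropping the whole map on shutdown) is enough to release the dentry safely.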