diff options
Diffstat (limited to 'src/mds/MDCache.cc')
-rw-r--r-- | src/mds/MDCache.cc | 13540 |
1 files changed, 13540 insertions, 0 deletions
diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc new file mode 100644 index 000000000..2ea13155e --- /dev/null +++ b/src/mds/MDCache.cc @@ -0,0 +1,13540 @@
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */

#include <errno.h>
#include <ostream>
#include <string>
#include <string_view>
#include <map>

#include "MDCache.h"
#include "MDSRank.h"
#include "Server.h"
#include "Locker.h"
#include "MDLog.h"
#include "MDBalancer.h"
#include "Migrator.h"
#include "ScrubStack.h"

#include "SnapClient.h"

#include "MDSMap.h"

#include "CInode.h"
#include "CDir.h"

#include "Mutation.h"

#include "include/ceph_fs.h"
#include "include/filepath.h"
#include "include/util.h"

#include "messages/MClientCaps.h"

#include "msg/Message.h"
#include "msg/Messenger.h"

#include "common/MemoryModel.h"
#include "common/errno.h"
#include "common/perf_counters.h"
#include "common/safe_io.h"

#include "osdc/Journaler.h"
#include "osdc/Filer.h"

#include "events/ESubtreeMap.h"
#include "events/EUpdate.h"
#include "events/EPeerUpdate.h"
#include "events/EImportFinish.h"
#include "events/EFragment.h"
#include "events/ECommitted.h"
#include "events/EPurged.h"
#include "events/ESessions.h"

#include "InoTable.h"
#include "fscrypt.h"

#include "common/Timer.h"

#include "perfglue/heap_profiler.h"


#include "common/config.h"
#include "include/ceph_assert.h"

#define dout_context g_ceph_context
#define dout_subsys ceph_subsys_mds
#undef dout_prefix
#define dout_prefix _prefix(_dout, mds)

using namespace std;

// Writes the per-file debug prefix "mds.<nodeid>.cache " to the given stream.
// Wired into the dout machinery through the dout_prefix macro above.
static ostream& _prefix(std::ostream *_dout, MDSRank *mds) {
  return *_dout << "mds." << mds->get_nodeid() << ".cache ";
}

set<int> SimpleLock::empty_gather_set;


/**
 * All non-I/O contexts that require a reference
 * to an MDCache instance descend from this.
 */
class MDCacheContext : public virtual MDSContext {
protected:
  MDCache *mdcache;
  // Resolves the owning MDSRank through the cache; asserts the cache is set.
  MDSRank *get_mds() override
  {
    ceph_assert(mdcache != NULL);
    return mdcache->mds;
  }
public:
  explicit MDCacheContext(MDCache *mdc_) : mdcache(mdc_) {}
};

/**
 * Log-context counterpart of MDCacheContext: same MDCache back-pointer,
 * but derived from MDSLogContextBase.
 */
class MDCacheLogContext : public virtual MDSLogContextBase {
protected:
  MDCache *mdcache;
  // Resolves the owning MDSRank through the cache; asserts the cache is set.
  MDSRank *get_mds() override
  {
    ceph_assert(mdcache != NULL);
    return mdcache->mds;
  }
public:
  explicit MDCacheLogContext(MDCache *mdc_) : mdcache(mdc_) {}
};

/**
 * Construct the cache for rank @p m.
 *
 * Reads the cache-related configuration values once (they are refreshed
 * later by handle_conf_change()), creates the Migrator and starts the
 * upkeep thread running MDCache::upkeep_main.
 */
MDCache::MDCache(MDSRank *m, PurgeQueue &purge_queue_) :
  mds(m),
  open_file_table(m),
  filer(m->objecter, m->finisher),
  stray_manager(m, purge_queue_),
  recovery_queue(m),
  trim_counter(g_conf().get_val<double>("mds_cache_trim_decay_rate"))
{
  migrator.reset(new Migrator(mds, this));

  // Explicit mds_dir_max_commit_size wins; otherwise fall back to 90% of
  // osd_max_write_size. Both options are shifted by 20 bits (x 2^20) here.
  max_dir_commit_size = g_conf()->mds_dir_max_commit_size ?
                        (g_conf()->mds_dir_max_commit_size << 20) :
                        (0.9 *(g_conf()->osd_max_write_size << 20));

  cache_memory_limit = g_conf().get_val<Option::size_t>("mds_cache_memory_limit");
  cache_reservation = g_conf().get_val<double>("mds_cache_reservation");
  cache_health_threshold = g_conf().get_val<double>("mds_health_cache_threshold");

  export_ephemeral_distributed_config = g_conf().get_val<bool>("mds_export_ephemeral_distributed");
  export_ephemeral_random_config = g_conf().get_val<bool>("mds_export_ephemeral_random");
  export_ephemeral_random_max = g_conf().get_val<double>("mds_export_ephemeral_random_max");

  symlink_recovery = g_conf().get_val<bool>("mds_symlink_recovery");

  lru.lru_set_midpoint(g_conf().get_val<double>("mds_cache_mid"));

  bottom_lru.lru_set_midpoint(0);

  decayrate.set_halflife(g_conf()->mds_decay_halflife);

  // Started last, after the members initialised above are in place.
  upkeeper = std::thread(&MDCache::upkeep_main, this);
}

/**
 * Unregister the perf counters (if any were created) and join the upkeep
 * thread if it is still joinable.
 */
MDCache::~MDCache()
{
  if (logger) {
    g_ceph_context->get_perfcounters_collection()->remove(logger.get());
  }
  if (upkeeper.joinable())
    upkeeper.join();
}

/**
 * Re-read every cache option named in @p changed and forward the change
 * set to the migrator and the balancer.
 *
 * If either ephemeral-pin switch changed, every inode currently in
 * export_ephemeral_pins is re-evaluated via maybe_export_pin(true).
 */
void MDCache::handle_conf_change(const std::set<std::string>& changed, const MDSMap& mdsmap)
{
  dout(20) << "config changes: " << changed << dendl;
  if (changed.count("mds_cache_memory_limit"))
    cache_memory_limit = g_conf().get_val<Option::size_t>("mds_cache_memory_limit");
  if (changed.count("mds_cache_reservation"))
    cache_reservation = g_conf().get_val<double>("mds_cache_reservation");

  bool ephemeral_pin_config_changed = false;
  if (changed.count("mds_export_ephemeral_distributed")) {
    export_ephemeral_distributed_config = g_conf().get_val<bool>("mds_export_ephemeral_distributed");
    dout(10) << "Migrating any ephemeral distributed pinned inodes" << dendl;
    /* only flag the change here; the pins are re-evaluated once below */
    ephemeral_pin_config_changed = true;
  }
  if (changed.count("mds_export_ephemeral_random")) {
    export_ephemeral_random_config = g_conf().get_val<bool>("mds_export_ephemeral_random");
    dout(10) << "Migrating any ephemeral random pinned inodes" << dendl;
    /* only flag the change here; the pins are re-evaluated once below */
    ephemeral_pin_config_changed = true;
  }
  if (ephemeral_pin_config_changed) {
    /* copy to vector to avoid removals during iteration */
    std::vector<CInode*> migrate;
    migrate.assign(export_ephemeral_pins.begin(), export_ephemeral_pins.end());
    for (auto& in : migrate) {
      in->maybe_export_pin(true);
    }
  }
  if (changed.count("mds_export_ephemeral_random_max")) {
    export_ephemeral_random_max = g_conf().get_val<double>("mds_export_ephemeral_random_max");
  }
  if (changed.count("mds_health_cache_threshold"))
    cache_health_threshold = g_conf().get_val<double>("mds_health_cache_threshold");
  if (changed.count("mds_cache_mid"))
    lru.lru_set_midpoint(g_conf().get_val<double>("mds_cache_mid"));
  if (changed.count("mds_cache_trim_decay_rate")) {
    // The counter is replaced wholesale by one built with the new rate.
    trim_counter = DecayCounter(g_conf().get_val<double>("mds_cache_trim_decay_rate"));
  }
  if (changed.count("mds_symlink_recovery")) {
    symlink_recovery = g_conf().get_val<bool>("mds_symlink_recovery");
    dout(10) << "Storing symlink targets on file object's head " << symlink_recovery << dendl;
  }

  migrator->handle_conf_change(changed, mdsmap);
  mds->balancer->handle_conf_change(changed, mdsmap);
}

/**
 * Publish LRU / capability counters, plus the root inode's recursive
 * statistics when the root inode is present, to the MDS perf counters.
 */
void MDCache::log_stat()
{
  mds->logger->set(l_mds_inodes, lru.lru_get_size());
  mds->logger->set(l_mds_inodes_pinned, lru.lru_get_num_pinned());
  mds->logger->set(l_mds_inodes_top, lru.lru_get_top());
  mds->logger->set(l_mds_inodes_bottom, lru.lru_get_bot());
  mds->logger->set(l_mds_inodes_pin_tail, lru.lru_get_pintail());
  mds->logger->set(l_mds_inodes_with_caps, num_inodes_with_caps);
  mds->logger->set(l_mds_caps, Capability::count());
  if (root) {
    mds->logger->set(l_mds_root_rfiles, root->get_inode()->rstat.rfiles);
    mds->logger->set(l_mds_root_rbytes, root->get_inode()->rstat.rbytes);
    mds->logger->set(l_mds_root_rsnaps, root->get_inode()->rstat.rsnaps);
  }
}


//

/**
 * Ask the upkeep thread to stop (flag set and condition variable signalled
 * under upkeep_mutex) and warn if the cache is not empty.
 *
 * @return always true.
 */
bool MDCache::shutdown()
{
  {
    std::scoped_lock lock(upkeep_mutex);
    upkeep_trim_shutdown = true;
    upkeep_cvar.notify_one();
  }
  if (lru.lru_get_size() > 0) {
    dout(7) << "WARNING: mdcache shutdown with non-empty cache" << dendl;
    //show_cache();
    show_subtrees();
    //dump();
  }
  return true;
}


// ====================================================================
// some inode functions

/**
 * Register @p in with the cache.
 *
 * Head inodes (last == CEPH_NOSNAP) go into inode_map keyed by ino, all
 * others into snap_inode_map keyed by vino; a duplicate key asserts.
 * Inodes below MDS_INO_SYSTEM_BASE additionally update the root / myin /
 * strays[] shortcuts and, if base, the base_inodes set.
 */
void MDCache::add_inode(CInode *in)
{
  // add to inode map
  if (in->last == CEPH_NOSNAP) {
    auto &p = inode_map[in->ino()];
    ceph_assert(!p); // should be no dup inos!
    p = in;
  } else {
    auto &p = snap_inode_map[in->vino()];
    ceph_assert(!p); // should be no dup inos!
    p = in;
  }

  if (in->ino() < MDS_INO_SYSTEM_BASE) {
    if (in->ino() == CEPH_INO_ROOT)
      root = in;
    else if (in->ino() == MDS_INO_MDSDIR(mds->get_nodeid()))
      myin = in;
    else if (in->is_stray()) {
      // only strays owned by this rank are tracked in strays[]
      if (MDS_INO_STRAY_OWNER(in->ino()) == mds->get_nodeid()) {
        strays[MDS_INO_STRAY_INDEX(in->ino())] = in;
      }
    }
    if (in->is_base())
      base_inodes.insert(in);
  }
}

/**
 * Detach @p o from the cache and delete it.
 *
 * Unlinks it from its parent dentry (which must be clean), clears dirty
 * and pin-queue state, removes it from the inode maps and the system-inode
 * shortcuts set up by add_inode(), then deletes it. The inode must have no
 * remaining references (asserted).
 */
void MDCache::remove_inode(CInode *o)
{
  dout(14) << "remove_inode " << *o << dendl;

  if (o->get_parent_dn()) {
    // FIXME: multiple parents?
    CDentry *dn = o->get_parent_dn();
    ceph_assert(!dn->is_dirty());
    dn->dir->unlink_inode(dn); // leave dentry ... FIXME?
  }

  if (o->is_dirty())
    o->mark_clean();
  if (o->is_dirty_parent())
    o->clear_dirty_parent();

  o->clear_scatter_dirty();

  o->clear_clientwriteable();

  o->item_open_file.remove_myself();

  if (o->state_test(CInode::STATE_QUEUEDEXPORTPIN))
    export_pin_queue.erase(o);

  if (o->state_test(CInode::STATE_DELAYEDEXPORTPIN))
    export_pin_delayed_queue.erase(o);

  o->clear_ephemeral_pin(true, true);

  // remove from inode map
  if (o->last == CEPH_NOSNAP) {
    inode_map.erase(o->ino());
  } else {
    o->item_caps.remove_myself();
    snap_inode_map.erase(o->vino());
  }

  clear_taken_inos(o->ino());

  // undo the system-inode bookkeeping done in add_inode()
  if (o->ino() < MDS_INO_SYSTEM_BASE) {
    if (o == root) root = 0;
    if (o == myin) myin = 0;
    if (o->is_stray()) {
      if (MDS_INO_STRAY_OWNER(o->ino()) == mds->get_nodeid()) {
        strays[MDS_INO_STRAY_INDEX(o->ino())] = 0;
      }
    }
    if (o->is_base())
      base_inodes.erase(o);
  }

  // delete it
  ceph_assert(o->get_num_ref() == 0);
  delete o;
}

/// Default file layout, placed in the MDSMap's first data pool.
file_layout_t MDCache::gen_default_file_layout(const MDSMap &mdsmap)
{
  file_layout_t result = file_layout_t::get_default();
  result.pool_id = mdsmap.get_first_data_pool();
  return result;
}

/// Default log layout, placed in the metadata pool. When
/// mds_log_segment_size is positive it is used for both object size and
/// stripe unit.
file_layout_t MDCache::gen_default_log_layout(const MDSMap &mdsmap)
{
  file_layout_t result = file_layout_t::get_default();
  result.pool_id = mdsmap.get_metadata_pool();
  if (g_conf()->mds_log_segment_size > 0) {
    result.object_size = g_conf()->mds_log_segment_size;
    result.stripe_unit = g_conf()->mds_log_segment_size;
  }
  return result;
}

/// (Re)compute the cached default file and log layouts from the current MDSMap.
void MDCache::init_layouts()
{
  default_file_layout = gen_default_file_layout(*(mds->mdsmap));
  default_log_layout = gen_default_log_layout(*(mds->mdsmap));
}

/**
 * Initialise @p in as a fresh system inode numbered @p ino without adding
 * it to the cache.
 *
 * @param mode  OR-ed with 0500 to form the inode mode.
 *
 * Directories start with rsubdirs = 1 (counting themselves); anything else
 * gets the default file layout and one rfile. Base inodes also get their
 * inode_auth set and an empty snaprealm with seq 1.
 */
void MDCache::create_unlinked_system_inode(CInode *in, inodeno_t ino, int mode) const
{
  auto _inode = in->_get_inode();
  _inode->ino = ino;
  _inode->version = 1;
  _inode->xattr_version = 1;
  _inode->mode = 0500 | mode;
  _inode->size = 0;
  _inode->ctime = _inode->mtime = _inode->btime = ceph_clock_now();
  _inode->nlink = 1;
  _inode->truncate_size = -1ull;
  _inode->change_attr = 0;
  _inode->export_pin = MDS_RANK_NONE;

  // FIPS zeroization audit 20191117: this memset is not security related.
  memset(&_inode->dir_layout, 0, sizeof(_inode->dir_layout));
  if (_inode->is_dir()) {
    _inode->dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash;
    _inode->rstat.rsubdirs = 1; /* itself */
    _inode->rstat.rctime = in->get_inode()->ctime;
  } else {
    _inode->layout = default_file_layout;
    ++_inode->rstat.rfiles;
  }
  _inode->accounted_rstat = _inode->rstat;

  if (in->is_base()) {
    if (in->is_root())
      in->inode_auth = mds_authority_t(mds->get_nodeid(), CDIR_AUTH_UNKNOWN);
    else
      // non-root base inode: the rank is derived from the ino's offset
      // from MDS_INO_MDSDIR_OFFSET
      in->inode_auth = mds_authority_t(mds_rank_t(in->ino() - MDS_INO_MDSDIR_OFFSET), CDIR_AUTH_UNKNOWN);
    in->open_snaprealm(); // empty snaprealm
    ceph_assert(!in->snaprealm->parent); // created its own
    in->snaprealm->srnode.seq = 1;
  }
}

/// Allocate a new CInode, initialise it as system inode @p ino and register
/// it with the cache. The returned inode is tracked by the cache maps.
CInode *MDCache::create_system_inode(inodeno_t ino, int mode)
{
  dout(0) << "creating system inode with ino:" << ino << dendl;
  CInode *in = new CInode(this);
  create_unlinked_system_inode(in, ino, mode);
  add_inode(in);
  return in;
}

/// Create the root directory inode (mode S_IFDIR|0755) with the configured
/// uid/gid and the default file layout in the first data pool.
CInode *MDCache::create_root_inode()
{
  CInode *in = create_system_inode(CEPH_INO_ROOT, S_IFDIR|0755);
  auto _inode = in->_get_inode();
  _inode->uid = g_conf()->mds_root_ino_uid;
  _inode->gid = g_conf()->mds_root_ino_gid;
  _inode->layout = default_file_layout;
  _inode->layout.pool_id = mds->mdsmap->get_first_data_pool();
  return in;
}

/**
 * Create the root inode with a single, empty, complete root dirfrag owned
 * by this rank, and queue the commit/store operations on @p gather.
 */
void MDCache::create_empty_hierarchy(MDSGather *gather)
{
  // create root dir
  // NOTE: this local shadows the MDCache::root member, which add_inode()
  // sets for CEPH_INO_ROOT.
  CInode *root = create_root_inode();

  // force empty root dir
  CDir *rootdir = root->get_or_open_dirfrag(this, frag_t());
  adjust_subtree_auth(rootdir, mds->get_nodeid());
  rootdir->dir_rep = CDir::REP_ALL; //NONE;

  ceph_assert(rootdir->get_fnode()->accounted_fragstat == rootdir->get_fnode()->fragstat);
  ceph_assert(rootdir->get_fnode()->fragstat == root->get_inode()->dirstat);
  ceph_assert(rootdir->get_fnode()->accounted_rstat == rootdir->get_fnode()->rstat);
  /* Do not update the rootdir rstat information of the fragment: the rstat
   * upkeep magic assumes version 0 is stale/invalid.
   */

  rootdir->mark_complete();
  rootdir->_get_fnode()->version = rootdir->pre_dirty();
  rootdir->mark_dirty(mds->mdlog->get_current_segment());
  rootdir->commit(0, gather->new_sub());

  root->store(gather->new_sub());
  root->mark_dirty_parent(mds->mdlog->get_current_segment(), true);
  root->store_backtrace(gather->new_sub());
}

/**
 * Create this rank's mdsdir inode together with NUM_STRAY stray
 * directories ("stray0" .. "stray<N-1>") linked beneath it, and queue the
 * commit/store operations on @p gather.
 */
void MDCache::create_mydir_hierarchy(MDSGather *gather)
{
  // create mds dir
  // (add_inode() also records this inode in the myin member, used below)
  CInode *my = create_system_inode(MDS_INO_MDSDIR(mds->get_nodeid()), S_IFDIR);

  CDir *mydir = my->get_or_open_dirfrag(this, frag_t());
  auto mydir_fnode = mydir->_get_fnode();

  adjust_subtree_auth(mydir, mds->get_nodeid());

  LogSegment *ls = mds->mdlog->get_current_segment();

  // stray dir
  for (int i = 0; i < NUM_STRAY; ++i) {
    CInode *stray = create_system_inode(MDS_INO_STRAY(mds->get_nodeid(), i), S_IFDIR);
    CDir *straydir = stray->get_or_open_dirfrag(this, frag_t());
    CachedStackStringStream css;
    *css << "stray" << i;
    CDentry *sdn = mydir->add_primary_dentry(css->str(), stray, "");
    sdn->_mark_dirty(mds->mdlog->get_current_segment());

    stray->_get_inode()->dirstat = straydir->get_fnode()->fragstat;

    // account the new stray directory in mydir's stats
    mydir_fnode->rstat.add(stray->get_inode()->rstat);
    mydir_fnode->fragstat.nsubdirs++;
    // save them
    straydir->mark_complete();
    straydir->_get_fnode()->version = straydir->pre_dirty();
    straydir->mark_dirty(ls);
    straydir->commit(0, gather->new_sub());
    stray->mark_dirty_parent(ls, true);
    stray->store_backtrace(gather->new_sub());
  }

  mydir_fnode->accounted_fragstat = mydir->get_fnode()->fragstat;
  mydir_fnode->accounted_rstat = mydir->get_fnode()->rstat;

  auto inode = myin->_get_inode();
  inode->dirstat = mydir->get_fnode()->fragstat;
  inode->rstat = mydir->get_fnode()->rstat;
  ++inode->rstat.rsubdirs;
  inode->accounted_rstat = inode->rstat;

  mydir->mark_complete();
  mydir_fnode->version = mydir->pre_dirty();
  mydir->mark_dirty(ls);
  mydir->commit(0, gather->new_sub());

  myin->store(gather->new_sub());
}

/// Log-completion context for _create_system_file(): forwards the saved
/// state to MDCache::_create_system_file_finish(). The result code is ignored.
struct C_MDC_CreateSystemFile : public MDCacheLogContext {
  MutationRef mut;
  CDentry *dn;
  version_t dpv;
  MDSContext *fin;
  C_MDC_CreateSystemFile(MDCache *c, MutationRef& mu, CDentry *d, version_t v, MDSContext *f) :
    MDCacheLogContext(c), mut(mu), dn(d), dpv(v), fin(f) {}
  void finish(int r) override {
    mdcache->_create_system_file_finish(mut, dn, dpv, fin);
  }
};

/**
 * Link system inode @p in into @p dir under @p name and journal the
 * creation as an EUpdate ("create system file").
 *
 * @param fin  completed with 0 from _create_system_file_finish() once the
 *             log entry has been submitted and its completion context runs.
 */
void MDCache::_create_system_file(CDir *dir, std::string_view name, CInode *in, MDSContext *fin)
{
  dout(10) << "_create_system_file " << name << " in " << *dir << dendl;
  CDentry *dn = dir->add_null_dentry(name);

  dn->push_projected_linkage(in);
  version_t dpv = dn->pre_dirty();

  CDir *mdir = 0;
  auto inode = in->_get_inode();
  if (in->is_dir()) {
    inode->rstat.rsubdirs = 1;

    mdir = in->get_or_open_dirfrag(this, frag_t());
    mdir->mark_complete();
    mdir->_get_fnode()->version = mdir->pre_dirty();
  } else {
    inode->rstat.rfiles = 1;
  }

  inode->version = dn->pre_dirty();

  SnapRealm *realm = dir->get_inode()->find_snaprealm();
  dn->first = in->first = realm->get_newest_seq() + 1;

  MutationRef mut(new MutationImpl());

  // force some locks. hacky.
  mds->locker->wrlock_force(&dir->inode->filelock, mut);
  mds->locker->wrlock_force(&dir->inode->nestlock, mut);

  mut->ls = mds->mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mds->mdlog, "create system file");
  mds->mdlog->start_entry(le);

  if (!in->is_mdsdir()) {
    predirty_journal_parents(mut, &le->metablob, in, dir, PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
    le->metablob.add_primary_dentry(dn, in, true);
  } else {
    // an mdsdir inode is journaled as a root plus a remote dentry
    predirty_journal_parents(mut, &le->metablob, in, dir, PREDIRTY_DIR, 1);
    journal_dirty_inode(mut.get(), &le->metablob, in);
    dn->push_projected_linkage(in->ino(), in->d_type());
    le->metablob.add_remote_dentry(dn, true, in->ino(), in->d_type());
    le->metablob.add_root(true, in);
  }
  if (mdir)
    le->metablob.add_new_dir(mdir); // dirty AND complete AND new

  mds->mdlog->submit_entry(le, new C_MDC_CreateSystemFile(this, mut, dn, dpv, fin));
  mds->mdlog->flush();
}

/**
 * Second half of _create_system_file(): make the projected linkage real,
 * mark the dentry / inode / dirfrag dirty in the mutation's log segment,
 * release the forced locks and complete @p fin with 0.
 */
void MDCache::_create_system_file_finish(MutationRef& mut, CDentry *dn, version_t dpv, MDSContext *fin)
{
  dout(10) << "_create_system_file_finish " << *dn << dendl;

  dn->pop_projected_linkage();
  dn->mark_dirty(dpv, mut->ls);

  CInode *in = dn->get_linkage()->get_inode();
  in->mark_dirty(mut->ls);

  if (in->is_dir()) {
    CDir *dir = in->get_dirfrag(frag_t());
    ceph_assert(dir);
    dir->mark_dirty(mut->ls);
    dir->mark_new(mut->ls);
  }

  mut->apply();
  mds->locker->drop_locks(mut.get());
  mut->cleanup();

  fin->complete(0);

  //if (dir && MDS_INO_IS_MDSDIR(in->ino()))
  //migrator->export_dir(dir, (int)in->ino() - MDS_INO_MDSDIR_OFFSET);
}



/// Retry context used throughout the open_root()/populate_mydir() sequence:
/// on success it re-enters MDCache::open_root(); on failure it marks the
/// rank damaged.
struct C_MDS_RetryOpenRoot : public MDSInternalContext {
  MDCache *cache;
  explicit C_MDS_RetryOpenRoot(MDCache *c) : MDSInternalContext(c->mds), cache(c) {}
  void finish(int r) override {
    if (r < 0) {
      // If we can't open root, something disastrous has happened: mark
      // this rank damaged for operator intervention. Note that
      // it is not okay to call suicide() here because we are in
      // a Finisher callback.
      cache->mds->damaged();
      ceph_abort(); // damaged should never return
    } else {
      cache->open_root();
    }
  }
};

/// Obtain the root inode: create and fetch it locally when this rank is the
/// MDSMap's root rank, otherwise discover it from that rank. @p c is passed
/// on to the fetch / discover call.
void MDCache::open_root_inode(MDSContext *c)
{
  if (mds->get_nodeid() == mds->mdsmap->get_root()) {
    CInode *in;
    in = create_system_inode(CEPH_INO_ROOT, S_IFDIR|0755); // initially inaccurate!
    in->fetch(c);
  } else {
    discover_base_ino(CEPH_INO_ROOT, c, mds->mdsmap->get_root());
  }
}

/// Create this rank's mdsdir inode in memory and fetch it, passing @p c to
/// the fetch.
void MDCache::open_mydir_inode(MDSContext *c)
{
  CInode *in = create_system_inode(MDS_INO_MDSDIR(mds->get_nodeid()), S_IFDIR|0755); // initially inaccurate!
  in->fetch(c);
}

/// Open the mdsdir inode, then its root dirfrag. A negative result from the
/// inode open is passed straight to @p c; otherwise the dirfrag is claimed
/// for this rank and fetched with @p c.
void MDCache::open_mydir_frag(MDSContext *c)
{
  open_mydir_inode(
    new MDSInternalContextWrapper(mds,
      new LambdaContext([this, c](int r) {
        if (r < 0) {
          c->complete(r);
          return;
        }
        CDir *mydir = myin->get_or_open_dirfrag(this, frag_t());
        ceph_assert(mydir);
        adjust_subtree_auth(mydir, mds->get_nodeid());
        mydir->fetch(c);
      })
    )
  );
}

/**
 * Open root, its dirfrag, and the mdsdir inode, then hand over to
 * populate_mydir().
 *
 * Each step that needs I/O starts it with a C_MDS_RetryOpenRoot and
 * returns; that context calls open_root() again on success, so this
 * function is written to be entered repeatedly and resume where it left off.
 */
void MDCache::open_root()
{
  dout(10) << "open_root" << dendl;

  if (!root) {
    open_root_inode(new C_MDS_RetryOpenRoot(this));
    return;
  }
  if (mds->get_nodeid() == mds->mdsmap->get_root()) {
    ceph_assert(root->is_auth());
    CDir *rootdir = root->get_or_open_dirfrag(this, frag_t());
    ceph_assert(rootdir);
    if (!rootdir->is_subtree_root())
      adjust_subtree_auth(rootdir, mds->get_nodeid());
    if (!rootdir->is_complete()) {
      rootdir->fetch(new C_MDS_RetryOpenRoot(this));
      return;
    }
  } else {
    ceph_assert(!root->is_auth());
    CDir *rootdir = root->get_dirfrag(frag_t());
    if (!rootdir) {
      open_remote_dirfrag(root, frag_t(), new C_MDS_RetryOpenRoot(this));
      return;
    }
  }

  if (!myin) {
    CInode *in = create_system_inode(MDS_INO_MDSDIR(mds->get_nodeid()), S_IFDIR|0755); // initially inaccurate!
    in->fetch(new C_MDS_RetryOpenRoot(this));
    return;
  }
  CDir *mydir = myin->get_or_open_dirfrag(this, frag_t());
  ceph_assert(mydir);
  adjust_subtree_auth(mydir, mds->get_nodeid());

  populate_mydir();
}

/**
 * Move stray_index to another stray directory, avoiding the one currently
 * being fragmented, and optionally queue a split/merge of a later stray
 * directory ahead of time.
 */
void MDCache::advance_stray() {
  // check whether the directory has been fragmented
  if (stray_fragmenting_index >= 0) {
    auto&& dfs = strays[stray_fragmenting_index]->get_dirfrags();
    bool any_fragmenting = false;
    for (const auto& dir : dfs) {
      if (dir->state_test(CDir::STATE_FRAGMENTING) ||
          mds->balancer->is_fragment_pending(dir->dirfrag())) {
        any_fragmenting = true;
        break;
      }
    }
    if (!any_fragmenting)
      stray_fragmenting_index = -1;
  }

  // NOTE(review): the step added to stray_index grows with i on each
  // iteration (cumulative), it is not a plain +1 walk — confirm intended.
  for (int i = 1; i < NUM_STRAY; i++){
    stray_index = (stray_index + i) % NUM_STRAY;
    if (stray_index != stray_fragmenting_index)
      break;
  }

  if (stray_fragmenting_index == -1 && is_open()) {
    // Fragment later stray dir in advance. We don't choose past
    // stray dir because in-flight requests may still use it.
    stray_fragmenting_index = (stray_index + 3) % NUM_STRAY;
    auto&& dfs = strays[stray_fragmenting_index]->get_dirfrags();
    bool any_fragmenting = false;
    for (const auto& dir : dfs) {
      if (dir->should_split()) {
        mds->balancer->queue_split(dir, true);
        any_fragmenting = true;
      } else if (dir->should_merge()) {
        mds->balancer->queue_merge(dir);
        any_fragmenting = true;
      }
    }
    // nothing was queued, so no directory is considered fragmenting
    if (!any_fragmenting)
      stray_fragmenting_index = -1;
  }

  dout(10) << "advance_stray to index " << stray_index
           << " fragmenting index " << stray_fragmenting_index << dendl;
}

/**
 * Make sure mydir and all NUM_STRAY stray directories exist, are pinned
 * and have their dirfrags loaded, then mark the cache open.
 *
 * Like open_root(), this returns early whenever it has to wait for I/O
 * or for a stray to be created, and is re-entered through
 * C_MDS_RetryOpenRoot.
 */
void MDCache::populate_mydir()
{
  ceph_assert(myin);
  CDir *mydir = myin->get_or_open_dirfrag(this, frag_t());
  ceph_assert(mydir);

  dout(10) << "populate_mydir " << *mydir << dendl;

  if (!mydir->is_complete()) {
    mydir->fetch(new C_MDS_RetryOpenRoot(this));
    return;
  }

  if (mydir->get_version() == 0 && mydir->state_test(CDir::STATE_BADFRAG)) {
    // A missing dirfrag, we will recreate it. Before that, we must dirty
    // it before dirtying any of the strays we create within it.
    mds->clog->warn() << "fragment " << mydir->dirfrag() << " was unreadable, "
      "recreating it now";
    LogSegment *ls = mds->mdlog->get_current_segment();
    mydir->state_clear(CDir::STATE_BADFRAG);
    mydir->mark_complete();
    mydir->_get_fnode()->version = mydir->pre_dirty();
    mydir->mark_dirty(ls);
  }

  // open or create stray
  uint64_t num_strays = 0;
  for (int i = 0; i < NUM_STRAY; ++i) {
    CachedStackStringStream css;
    *css << "stray" << i;
    CDentry *straydn = mydir->lookup(css->str());

    // allow for older fs's with stray instead of stray0
    if (straydn == NULL && i == 0)
      straydn = mydir->lookup("stray");

    if (!straydn || !straydn->get_linkage()->get_inode()) {
      // missing stray: create it, then start over via the retry context
      _create_system_file(mydir, css->strv(), create_system_inode(MDS_INO_STRAY(mds->get_nodeid(), i), S_IFDIR),
                          new C_MDS_RetryOpenRoot(this));
      return;
    }
    ceph_assert(straydn);
    ceph_assert(strays[i]);
    // we make multiple passes through this method; make sure we only pin each stray once.
    if (!strays[i]->state_test(CInode::STATE_STRAYPINNED)) {
      strays[i]->get(CInode::PIN_STRAY);
      strays[i]->state_set(CInode::STATE_STRAYPINNED);
      strays[i]->get_stickydirs();
    }
    dout(20) << " stray num " << i << " is " << *strays[i] << dendl;

    // open all frags
    frag_vec_t leaves;
    strays[i]->dirfragtree.get_leaves(leaves);
    for (const auto& leaf : leaves) {
      CDir *dir = strays[i]->get_dirfrag(leaf);
      if (!dir) {
        dir = strays[i]->get_or_open_dirfrag(this, leaf);
      }

      // DamageTable applies special handling to strays: it will
      // have damaged() us out if one is damaged.
      ceph_assert(!dir->state_test(CDir::STATE_BADFRAG));

      // version 0 is treated as "not loaded yet": fetch and retry
      if (dir->get_version() == 0) {
        dir->fetch_keys({}, new C_MDS_RetryOpenRoot(this));
        return;
      }

      if (dir->get_frag_size() > 0)
        num_strays += dir->get_frag_size();
    }
  }

  // okay!
  dout(10) << "populate_mydir done" << dendl;
  ceph_assert(!open);
  open = true;
  mds->queue_waiters(waiting_for_open);

  stray_manager.set_num_strays(num_strays);
  stray_manager.activate();

  scan_stray_dir();
}

/// Discover another rank's mdsdir inode @p ino. The target rank is taken
/// from the low bits of the ino (ino & (MAX_MDS-1)).
void MDCache::open_foreign_mdsdir(inodeno_t ino, MDSContext *fin)
{
  discover_base_ino(ino, fin, mds_rank_t(ino & (MAX_MDS-1)));
}

/// Return the dirfrag of the current stray directory that would hold the
/// stray dentry for @p in. Asserts that the stray inode and the chosen
/// dirfrag are both present.
CDir *MDCache::get_stray_dir(CInode *in)
{
  string straydname;
  in->name_stray_dentry(straydname);

  CInode *strayi = get_stray();
  ceph_assert(strayi);
  frag_t fg = strayi->pick_dirfrag(straydname);
  CDir *straydir = strayi->get_dirfrag(fg);
  ceph_assert(straydir);
  return straydir;
}

/// Resolve @p info to a cached inode, dentry or dirfrag. Returns 0 when the
/// dirfrag named by @p info is not in cache; the inode and dentry lookups
/// return whatever get_inode() / CDir::lookup() return.
MDSCacheObject *MDCache::get_object(const MDSCacheObjectInfo &info)
{
  // inode?
  if (info.ino)
    return get_inode(info.ino, info.snapid);

  // dir or dentry.
  CDir *dir = get_dirfrag(info.dirfrag);
  if (!dir) return 0;

  if (info.dname.length())
    return dir->lookup(info.dname, info.snapid);
  else
    return dir;
}


// ====================================================================
// consistent hash ring

/*
 * hashing implementation based on Lamping and Veach's Jump Consistent Hash: https://arxiv.org/pdf/1406.2294.pdf
*/
// Maps (ino, fg) to a rank in [0, max_mds); the range is asserted before
// returning. fg only contributes to the hash when it is non-zero.
mds_rank_t MDCache::hash_into_rank_bucket(inodeno_t ino, frag_t fg)
{
  const mds_rank_t max_mds = mds->mdsmap->get_max_mds();
  uint64_t hash = rjhash64(ino);
  if (fg)
    hash = rjhash64(hash + rjhash64(fg.value()));

  int64_t b = -1, j = 0;
  while (j < max_mds) {
    b = j;
    hash = hash*2862933555777941757ULL + 1;
    j = (b + 1) * (double(1LL << 31) / double((hash >> 33) + 1));
  }
  // verify bounds before returning
  auto result = mds_rank_t(b);
  ceph_assert(result >= 0 && result < max_mds);
  return result;
}


// ====================================================================
// subtree management

/*
 * adjust the dir_auth of a subtree.
 * merge with parent and/or child subtrees, if it is appropriate.
 * merge can ONLY happen if both parent and child have unambiguous auth.
 */
// If dir is not already a subtree root it becomes one: it is pinned, takes
// over the bounds of its old root that now nest beneath it, and is recorded
// as a bound of that old root. With adjust_pop set and dir auth, dir's
// pop_auth_subtree is subtracted from its ancestors up to the enclosing
// subtree root.
void MDCache::adjust_subtree_auth(CDir *dir, mds_authority_t auth, bool adjust_pop)
{
  dout(7) << "adjust_subtree_auth " << dir->get_dir_auth() << " -> " << auth
          << " on " << *dir << dendl;

  show_subtrees();

  CDir *root;
  if (dir->inode->is_base()) {
    root = dir; // bootstrap hack.
    if (subtrees.count(root) == 0) {
      subtrees[root];
      root->get(CDir::PIN_SUBTREE);
    }
  } else {
    root = get_subtree_root(dir); // subtree root
  }
  ceph_assert(root);
  ceph_assert(subtrees.count(root));
  dout(7) << " current root is " << *root << dendl;

  if (root == dir) {
    // i am already a subtree.
    dir->set_dir_auth(auth);
  } else {
    // i am a new subtree.
    dout(10) << " new subtree at " << *dir << dendl;
    ceph_assert(subtrees.count(dir) == 0);
    subtrees[dir]; // create empty subtree bounds list for me.
    dir->get(CDir::PIN_SUBTREE);

    // set dir_auth
    dir->set_dir_auth(auth);

    // move items nested beneath me, under me.
    set<CDir*>::iterator p = subtrees[root].begin();
    while (p != subtrees[root].end()) {
      // grab the successor first: erase(p) below invalidates p
      set<CDir*>::iterator next = p;
      ++next;
      if (get_subtree_root((*p)->get_parent_dir()) == dir) {
        // move under me
        dout(10) << " claiming child bound " << **p << dendl;
        subtrees[dir].insert(*p);
        subtrees[root].erase(p);
      }
      p = next;
    }

    // i am a bound of the parent subtree.
    subtrees[root].insert(dir);

    // i am now the subtree root.
    root = dir;

    // adjust recursive pop counters
    if (adjust_pop && dir->is_auth()) {
      CDir *p = dir->get_parent_dir();
      while (p) {
        p->pop_auth_subtree.sub(dir->pop_auth_subtree);
        if (p->is_subtree_root()) break;
        p = p->inode->get_parent_dir();
      }
    }
  }

  show_subtrees();
}


/**
 * Try to merge the subtree rooted at @p dir into its parent, then try the
 * same for each of its former bounds. Inodes collected by the merges are
 * evaluated afterwards unless the MDS is in replay or resolve.
 *
 * @p dir must currently be a subtree root: subtrees.at(dir) is used for
 * the lookup.
 */
void MDCache::try_subtree_merge(CDir *dir)
{
  dout(7) << "try_subtree_merge " << *dir << dendl;
  // record my old bounds
  // (a copy: the merges below modify the subtrees map)
  auto oldbounds = subtrees.at(dir);

  set<CInode*> to_eval;
  // try merge at my root
  try_subtree_merge_at(dir, &to_eval);

  // try merge at my old bounds
  for (auto bound : oldbounds)
    try_subtree_merge_at(bound, &to_eval);

  if (!(mds->is_any_replay() || mds->is_resolve())) {
    for(auto in : to_eval)
      eval_subtree_root(in);
  }
}

/**
 * Merge subtree @p dir into its parent subtree when both have the same
 * dir_auth.
 *
 * Does nothing when dir's auth is ambiguous (dir_auth.second is set) or
 * dir is an export bound / aux subtree.
 *
 * @param to_eval     if non-null, receives dir's inode when a merge
 *                    happened and that inode is auth.
 * @param adjust_pop  when true and dir is auth, dir's pop_auth_subtree is
 *                    added back to its ancestors up to the subtree root.
 */
void MDCache::try_subtree_merge_at(CDir *dir, set<CInode*> *to_eval, bool adjust_pop)
{
  dout(10) << "try_subtree_merge_at " << *dir << dendl;

  if (dir->dir_auth.second != CDIR_AUTH_UNKNOWN ||
      dir->state_test(CDir::STATE_EXPORTBOUND) ||
      dir->state_test(CDir::STATE_AUXSUBTREE))
    return;

  auto it = subtrees.find(dir);
  ceph_assert(it != subtrees.end());

  // merge with parent?
  CDir *parent = dir;
  if (!dir->inode->is_base())
    parent = get_subtree_root(dir->get_parent_dir());

  if (parent != dir && // we have a parent,
      parent->dir_auth == dir->dir_auth) { // auth matches,
    // merge with parent.
    dout(10) << " subtree merge at " << *dir << dendl;
    dir->set_dir_auth(CDIR_AUTH_DEFAULT);

    // move our bounds under the parent
    subtrees[parent].insert(it->second.begin(), it->second.end());

    // we are no longer a subtree or bound
    dir->put(CDir::PIN_SUBTREE);
    subtrees.erase(it);
    subtrees[parent].erase(dir);

    // adjust popularity?
    if (adjust_pop && dir->is_auth()) {
      CDir *cur = dir;
      CDir *p = dir->get_parent_dir();
      while (p) {
        p->pop_auth_subtree.add(dir->pop_auth_subtree);
        p->pop_lru_subdirs.push_front(&cur->get_inode()->item_pop_lru);
        if (p->is_subtree_root()) break;
        cur = p;
        p = p->inode->get_parent_dir();
      }
    }

    if (to_eval && dir->get_inode()->is_auth())
      to_eval->insert(dir->get_inode());

    show_subtrees(15);
  }
}

/// Ask the locker to re-evaluate the file and nest locks of @p diri, which
/// must be auth (asserted).
void MDCache::eval_subtree_root(CInode *diri)
{
  // evaluate subtree inode filelock?
  // (we should scatter the filelock on subtree bounds)
  ceph_assert(diri->is_auth());
  mds->locker->try_eval(diri, CEPH_LOCK_IFILE | CEPH_LOCK_INEST);
}


/**
 * Set the auth of the subtree rooted at @p dir to @p auth and make its
 * bound set equal @p bounds (checked by verify_subtree_bounds() at the end).
 *
 * Wanted bounds that are missing are created; subtrees lying between dir
 * and a wanted bound, and existing bounds not listed in @p bounds, are
 * given @p auth and merged away.
 */
void MDCache::adjust_bounded_subtree_auth(CDir *dir, const set<CDir*>& bounds, mds_authority_t auth)
{
  dout(7) << "adjust_bounded_subtree_auth " << dir->get_dir_auth() << " -> " << auth
          << " on " << *dir
          << " bounds " << bounds
          << dendl;

  show_subtrees();

  CDir *root;
  if (dir->ino() == CEPH_INO_ROOT) {
    root = dir; // bootstrap hack.
    if (subtrees.count(root) == 0) {
      subtrees[root];
      root->get(CDir::PIN_SUBTREE);
    }
  } else {
    root = get_subtree_root(dir); // subtree root
  }
  ceph_assert(root);
  ceph_assert(subtrees.count(root));
  dout(7) << " current root is " << *root << dendl;

  // remembered so that newly created bounds can keep the previous auth
  mds_authority_t oldauth = dir->authority();

  if (root == dir) {
    // i am already a subtree.
    dir->set_dir_auth(auth);
  } else {
    // i am a new subtree.
    dout(10) << " new subtree at " << *dir << dendl;
    ceph_assert(subtrees.count(dir) == 0);
    subtrees[dir]; // create empty subtree bounds list for me.
    dir->get(CDir::PIN_SUBTREE);

    // set dir_auth
    dir->set_dir_auth(auth);

    // move items nested beneath me, under me.
    set<CDir*>::iterator p = subtrees[root].begin();
    while (p != subtrees[root].end()) {
      // grab the successor first: erase(p) below invalidates p
      set<CDir*>::iterator next = p;
      ++next;
      if (get_subtree_root((*p)->get_parent_dir()) == dir) {
        // move under me
        dout(10) << " claiming child bound " << **p << dendl;
        subtrees[dir].insert(*p);
        subtrees[root].erase(p);
      }
      p = next;
    }

    // i am a bound of the parent subtree.
    subtrees[root].insert(dir);

    // i am now the subtree root.
    root = dir;
  }

  set<CInode*> to_eval;

  // verify/adjust bounds.
  // - these may be new, or
  // - beneath existing ambiguous bounds (which will be collapsed),
  // - but NOT beneath unambiguous bounds.
  for (const auto& bound : bounds) {
    // new bound?
    if (subtrees[dir].count(bound) == 0) {
      if (get_subtree_root(bound) == dir) {
        dout(10) << " new bound " << *bound << ", adjusting auth back to old " << oldauth << dendl;
        adjust_subtree_auth(bound, oldauth); // otherwise, adjust at bound.
      }
      else {
        dout(10) << " want bound " << *bound << dendl;
        CDir *t = get_subtree_root(bound->get_parent_dir());
        if (subtrees[t].count(bound) == 0) {
          ceph_assert(t != dir);
          dout(10) << " new bound " << *bound << dendl;
          adjust_subtree_auth(bound, t->authority());
        }
        // make sure it's nested beneath ambiguous subtree(s)
        while (1) {
          // climb from the bound's subtree root to the one that is a
          // direct bound of dir, swallow it, then re-check
          while (subtrees[dir].count(t) == 0)
            t = get_subtree_root(t->get_parent_dir());
          dout(10) << " swallowing intervening subtree at " << *t << dendl;
          adjust_subtree_auth(t, auth);
          try_subtree_merge_at(t, &to_eval);
          t = get_subtree_root(bound->get_parent_dir());
          if (t == dir) break;
        }
      }
    }
    else {
      dout(10) << " already have bound " << *bound << dendl;
    }
  }
  // merge stray bounds?
  while (!subtrees[dir].empty()) {
    // iterate over a snapshot: swallowing modifies subtrees[dir]
    set<CDir*> copy = subtrees[dir];
    for (set<CDir*>::iterator p = copy.begin(); p != copy.end(); ++p) {
      if (bounds.count(*p) == 0) {
        CDir *stray = *p;
        dout(10) << " swallowing extra subtree at " << *stray << dendl;
        adjust_subtree_auth(stray, auth);
        try_subtree_merge_at(stray, &to_eval);
      }
    }
    // swallowing subtree may add new subtree bounds
    if (copy == subtrees[dir])
      break;
  }

  // bound should now match.
  verify_subtree_bounds(dir, bounds);

  show_subtrees();

  if (!(mds->is_any_replay() || mds->is_resolve())) {
    for(auto in : to_eval)
      eval_subtree_root(in);
  }
}


/*
 * return a set of CDir*'s that correspond to the given bound set. Only adjust
 * fragmentation as necessary to get an equivalent bounding set. That is, only
 * split if one of our frags spans the provided bounding set. Never merge.
 */
// Inodes that are not in cache are skipped silently.
void MDCache::get_force_dirfrag_bound_set(const vector<dirfrag_t>& dfs, set<CDir*>& bounds)
{
  dout(10) << "get_force_dirfrag_bound_set " << dfs << dendl;

  // sort by ino
  map<inodeno_t, fragset_t> byino;
  for (auto& frag : dfs) {
    byino[frag.ino].insert_raw(frag.frag);
  }
  dout(10) << " by ino: " << byino << dendl;

  for (map<inodeno_t,fragset_t>::iterator p = byino.begin(); p != byino.end(); ++p) {
    p->second.simplify();
    CInode *diri = get_inode(p->first);
    if (!diri)
      continue;
    dout(10) << " checking fragset " << p->second.get() << " on " << *diri << dendl;

    // scratch frag tree in which every requested frag is forced to be a leaf
    fragtree_t tmpdft;
    for (set<frag_t>::iterator q = p->second.begin(); q != p->second.end(); ++q)
      tmpdft.force_to_leaf(g_ceph_context, *q);

    for (const auto& fg : p->second) {
      frag_vec_t leaves;
      diri->dirfragtree.get_leaves_under(fg, leaves);
      if (leaves.empty()) {
        // the inode's own frag tree has no leaves under fg: consult the
        // scratch tree under the frag the inode's tree maps fg to
        frag_t approx_fg = diri->dirfragtree[fg.value()];
        frag_vec_t approx_leaves;
        tmpdft.get_leaves_under(approx_fg, approx_leaves);
        for (const auto& leaf : approx_leaves) {
          if (p->second.get().count(leaf) == 0) {
            // not bound, so the resolve message is from auth MDS of the dirfrag
            force_dir_fragment(diri, leaf);
          }
        }
      }

      auto&& [complete, sibs] = diri->get_dirfrags_under(fg);
      for (const auto& sib : sibs)
        bounds.insert(sib);
    }
  }
}

/// Overload taking the bounds as dirfrag ids: they are converted to CDir
/// pointers with get_force_dirfrag_bound_set() and then passed to the
/// set-based overload.
void MDCache::adjust_bounded_subtree_auth(CDir *dir, const vector<dirfrag_t>& bound_dfs, const mds_authority_t &auth)
{
  dout(7) << "adjust_bounded_subtree_auth " << dir->get_dir_auth() << " -> " << auth
          << " on " << *dir << " bound_dfs " << bound_dfs << dendl;

  set<CDir*> bounds;
  get_force_dirfrag_bound_set(bound_dfs, bounds);
  adjust_bounded_subtree_auth(dir, bounds, auth);
}

/// Translate dirfrag ids into the cached CDir objects they cover: each
/// requested frag is expanded to the inode's leaves under it. Inodes or
/// dirfrags that are not in cache are skipped; matches are added to @p result.
void MDCache::map_dirfrag_set(const list<dirfrag_t>& dfs, set<CDir*>& result)
{
  dout(10) << "map_dirfrag_set " << dfs << dendl;

  // group by inode
  map<inodeno_t, fragset_t> ino_fragset;
  for (const auto &df : dfs) {
    ino_fragset[df.ino].insert_raw(df.frag);
  }
  // get frags
  for (map<inodeno_t, fragset_t>::iterator p = ino_fragset.begin();
       p != ino_fragset.end();
       ++p) {
    p->second.simplify();
    CInode *in = get_inode(p->first);
    if (!in)
      continue;

    frag_vec_t fgs;
    for (const auto& fg : p->second) {
      in->dirfragtree.get_leaves_under(fg, fgs);
    }

    dout(15) << "map_dirfrag_set " << p->second << " -> " << fgs
             << " on " << *in << dendl;

    for (const auto& fg : fgs) {
      CDir *dir = in->get_dirfrag(fg);
      if (dir)
        result.insert(dir);
    }
  }
}



/// Walk up from @p dir through parent dirfrags until a subtree root is
/// found. Returns 0 if the walk runs out of parents. @p dir itself must be
/// non-null, since it is dereferenced immediately.
CDir *MDCache::get_subtree_root(CDir *dir)
{
  // find the underlying dir that delegates (or is about to delegate) auth
  while (true) {
    if (dir->is_subtree_root())
      return dir;
    dir = dir->get_inode()->get_parent_dir();
    if (!dir)
      return 0; // none
  }
}

/// Same walk as get_subtree_root(), but following projected parent links
/// (get_projected_parent_dir()).
CDir *MDCache::get_projected_subtree_root(CDir *dir)
{
  // find the underlying dir that delegates (or is about to delegate) auth
  while (true) {
    if (dir->is_subtree_root())
      return dir;
    dir = dir->get_inode()->get_projected_parent_dir();
    if (!dir)
      return 0; // none
  }
}

void MDCache::remove_subtree(CDir
*dir) +{ + dout(10) << "remove_subtree " << *dir << dendl; + auto it = subtrees.find(dir); + ceph_assert(it != subtrees.end()); + subtrees.erase(it); + dir->put(CDir::PIN_SUBTREE); + if (dir->get_parent_dir()) { + CDir *p = get_subtree_root(dir->get_parent_dir()); + auto it = subtrees.find(p); + ceph_assert(it != subtrees.end()); + auto count = it->second.erase(dir); + ceph_assert(count == 1); + } +} + +void MDCache::get_subtree_bounds(CDir *dir, set<CDir*>& bounds) +{ + ceph_assert(subtrees.count(dir)); + bounds = subtrees[dir]; +} + +void MDCache::get_wouldbe_subtree_bounds(CDir *dir, set<CDir*>& bounds) +{ + if (subtrees.count(dir)) { + // just copy them, dir is a subtree. + get_subtree_bounds(dir, bounds); + } else { + // find them + CDir *root = get_subtree_root(dir); + for (set<CDir*>::iterator p = subtrees[root].begin(); + p != subtrees[root].end(); + ++p) { + CDir *t = *p; + while (t != root) { + t = t->get_parent_dir(); + ceph_assert(t); + if (t == dir) { + bounds.insert(*p); + continue; + } + } + } + } +} + +void MDCache::verify_subtree_bounds(CDir *dir, const set<CDir*>& bounds) +{ + // for debugging only. + ceph_assert(subtrees.count(dir)); + if (bounds != subtrees[dir]) { + dout(0) << "verify_subtree_bounds failed" << dendl; + set<CDir*> b = bounds; + for (auto &cd : subtrees[dir]) { + if (bounds.count(cd)) { + b.erase(cd); + continue; + } + dout(0) << " missing bound " << *cd << dendl; + } + for (const auto &cd : b) + dout(0) << " extra bound " << *cd << dendl; + } + ceph_assert(bounds == subtrees[dir]); +} + +void MDCache::verify_subtree_bounds(CDir *dir, const list<dirfrag_t>& bounds) +{ + // for debugging only. + ceph_assert(subtrees.count(dir)); + + // make sure that any bounds i do have are properly noted as such. 
+ int failed = 0; + for (const auto &fg : bounds) { + CDir *bd = get_dirfrag(fg); + if (!bd) continue; + if (subtrees[dir].count(bd) == 0) { + dout(0) << "verify_subtree_bounds failed: extra bound " << *bd << dendl; + failed++; + } + } + ceph_assert(failed == 0); +} + +void MDCache::project_subtree_rename(CInode *diri, CDir *olddir, CDir *newdir) +{ + dout(10) << "project_subtree_rename " << *diri << " from " << *olddir + << " to " << *newdir << dendl; + projected_subtree_renames[diri].push_back(pair<CDir*,CDir*>(olddir, newdir)); +} + +void MDCache::adjust_subtree_after_rename(CInode *diri, CDir *olddir, bool pop) +{ + dout(10) << "adjust_subtree_after_rename " << *diri << " from " << *olddir << dendl; + + CDir *newdir = diri->get_parent_dir(); + + if (pop) { + map<CInode*,list<pair<CDir*,CDir*> > >::iterator p = projected_subtree_renames.find(diri); + ceph_assert(p != projected_subtree_renames.end()); + ceph_assert(!p->second.empty()); + ceph_assert(p->second.front().first == olddir); + ceph_assert(p->second.front().second == newdir); + p->second.pop_front(); + if (p->second.empty()) + projected_subtree_renames.erase(p); + } + + // adjust total auth pin of freezing subtree + if (olddir != newdir) { + auto&& dfls = diri->get_nested_dirfrags(); + for (const auto& dir : dfls) + olddir->adjust_freeze_after_rename(dir); + } + + // adjust subtree + // N.B. 
make sure subtree dirfrags are at the front of the list + auto dfls = diri->get_subtree_dirfrags(); + diri->get_nested_dirfrags(dfls); + for (const auto& dir : dfls) { + dout(10) << "dirfrag " << *dir << dendl; + CDir *oldparent = get_subtree_root(olddir); + dout(10) << " old parent " << *oldparent << dendl; + CDir *newparent = get_subtree_root(newdir); + dout(10) << " new parent " << *newparent << dendl; + + auto& oldbounds = subtrees[oldparent]; + auto& newbounds = subtrees[newparent]; + + if (olddir != newdir) + mds->balancer->adjust_pop_for_rename(olddir, dir, false); + + if (oldparent == newparent) { + dout(10) << "parent unchanged for " << *dir << " at " << *oldparent << dendl; + } else if (dir->is_subtree_root()) { + // children are fine. change parent. + dout(10) << "moving " << *dir << " from " << *oldparent << " to " << *newparent << dendl; + { + auto n = oldbounds.erase(dir); + ceph_assert(n == 1); + } + newbounds.insert(dir); + // caller is responsible for 'eval diri' + try_subtree_merge_at(dir, NULL, false); + } else { + // mid-subtree. + + // see if any old bounds move to the new parent. + std::vector<CDir*> tomove; + for (const auto& bound : oldbounds) { + CDir *broot = get_subtree_root(bound->get_parent_dir()); + if (broot != oldparent) { + ceph_assert(broot == newparent); + tomove.push_back(bound); + } + } + for (const auto& bound : tomove) { + dout(10) << "moving bound " << *bound << " from " << *oldparent << " to " << *newparent << dendl; + oldbounds.erase(bound); + newbounds.insert(bound); + } + + // did auth change? 
+ if (oldparent->authority() != newparent->authority()) { + adjust_subtree_auth(dir, oldparent->authority(), false); + // caller is responsible for 'eval diri' + try_subtree_merge_at(dir, NULL, false); + } + } + + if (olddir != newdir) + mds->balancer->adjust_pop_for_rename(newdir, dir, true); + } + + show_subtrees(); +} + +// =================================== +// journal and snap/cow helpers + + +/* + * find first inode in cache that follows given snapid. otherwise, return current. + */ +CInode *MDCache::pick_inode_snap(CInode *in, snapid_t follows) +{ + dout(10) << "pick_inode_snap follows " << follows << " on " << *in << dendl; + ceph_assert(in->last == CEPH_NOSNAP); + + auto p = snap_inode_map.upper_bound(vinodeno_t(in->ino(), follows)); + if (p != snap_inode_map.end() && p->second->ino() == in->ino()) { + dout(10) << "pick_inode_snap found " << *p->second << dendl; + in = p->second; + } + + return in; +} + + +/* + * note: i'm currently cheating wrt dirty and inode.version on cow + * items. instead of doing a full dir predirty, i just take the + * original item's version, and set the dirty flag (via + * mutation::add_cow_{inode,dentry}() and mutation::apply(). that + * means a special case in the dir commit clean sweep assertions. + * bah. 
+ */ +CInode *MDCache::cow_inode(CInode *in, snapid_t last) +{ + ceph_assert(last >= in->first); + + CInode *oldin = new CInode(this, true, in->first, last); + auto _inode = CInode::allocate_inode(*in->get_previous_projected_inode()); + _inode->trim_client_ranges(last); + oldin->reset_inode(std::move(_inode)); + auto _xattrs = in->get_previous_projected_xattrs(); + oldin->reset_xattrs(std::move(_xattrs)); + + oldin->symlink = in->symlink; + + if (in->first < in->oldest_snap) + in->oldest_snap = in->first; + + in->first = last+1; + + dout(10) << "cow_inode " << *in << " to " << *oldin << dendl; + add_inode(oldin); + + if (in->last != CEPH_NOSNAP) { + CInode *head_in = get_inode(in->ino()); + ceph_assert(head_in); + auto ret = head_in->split_need_snapflush(oldin, in); + if (ret.first) { + oldin->client_snap_caps = in->client_snap_caps; + if (!oldin->client_snap_caps.empty()) { + for (int i = 0; i < num_cinode_locks; i++) { + SimpleLock *lock = oldin->get_lock(cinode_lock_info[i].lock); + ceph_assert(lock); + if (lock->get_state() != LOCK_SNAP_SYNC) { + ceph_assert(lock->is_stable()); + lock->set_state(LOCK_SNAP_SYNC); // gathering + oldin->auth_pin(lock); + } + lock->get_wrlock(true); + } + } + } + if (!ret.second) { + auto client_snap_caps = std::move(in->client_snap_caps); + in->client_snap_caps.clear(); + in->item_open_file.remove_myself(); + in->item_caps.remove_myself(); + + if (!client_snap_caps.empty()) { + MDSContext::vec finished; + for (int i = 0; i < num_cinode_locks; i++) { + SimpleLock *lock = in->get_lock(cinode_lock_info[i].lock); + ceph_assert(lock); + ceph_assert(lock->get_state() == LOCK_SNAP_SYNC); // gathering + lock->put_wrlock(); + if (!lock->get_num_wrlocks()) { + lock->set_state(LOCK_SYNC); + lock->take_waiting(SimpleLock::WAIT_STABLE|SimpleLock::WAIT_RD, finished); + in->auth_unpin(lock); + } + } + mds->queue_waiters(finished); + } + } + return oldin; + } + + if (!in->client_caps.empty()) { + const set<snapid_t>& snaps = 
in->find_snaprealm()->get_snaps(); + // clone caps? + for (auto &p : in->client_caps) { + client_t client = p.first; + Capability *cap = &p.second; + int issued = cap->need_snapflush() ? CEPH_CAP_ANY_WR : cap->issued(); + if ((issued & CEPH_CAP_ANY_WR) && + cap->client_follows < last) { + dout(10) << " client." << client << " cap " << ccap_string(issued) << dendl; + oldin->client_snap_caps.insert(client); + cap->client_follows = last; + + // we need snapflushes for any intervening snaps + dout(10) << " snaps " << snaps << dendl; + for (auto q = snaps.lower_bound(oldin->first); + q != snaps.end() && *q <= last; + ++q) { + in->add_need_snapflush(oldin, *q, client); + } + } else { + dout(10) << " ignoring client." << client << " cap follows " << cap->client_follows << dendl; + } + } + + if (!oldin->client_snap_caps.empty()) { + for (int i = 0; i < num_cinode_locks; i++) { + SimpleLock *lock = oldin->get_lock(cinode_lock_info[i].lock); + ceph_assert(lock); + if (lock->get_state() != LOCK_SNAP_SYNC) { + ceph_assert(lock->is_stable()); + lock->set_state(LOCK_SNAP_SYNC); // gathering + oldin->auth_pin(lock); + } + lock->get_wrlock(true); + } + } + } + return oldin; +} + +void MDCache::journal_cow_dentry(MutationImpl *mut, EMetaBlob *metablob, + CDentry *dn, snapid_t follows, + CInode **pcow_inode, CDentry::linkage_t *dnl) +{ + if (!dn) { + dout(10) << "journal_cow_dentry got null CDentry, returning" << dendl; + return; + } + dout(10) << "journal_cow_dentry follows " << follows << " on " << *dn << dendl; + ceph_assert(dn->is_auth()); + + // nothing to cow on a null dentry, fix caller + if (!dnl) + dnl = dn->get_projected_linkage(); + ceph_assert(!dnl->is_null()); + + CInode *in = dnl->is_primary() ? dnl->get_inode() : NULL; + bool cow_head = false; + if (in && in->state_test(CInode::STATE_AMBIGUOUSAUTH)) { + ceph_assert(in->is_frozen_inode()); + cow_head = true; + } + if (in && (in->is_multiversion() || cow_head)) { + // multiversion inode. 
+ SnapRealm *realm = NULL; + + if (in->get_projected_parent_dn() != dn) { + ceph_assert(follows == CEPH_NOSNAP); + realm = dn->dir->inode->find_snaprealm(); + snapid_t dir_follows = get_global_snaprealm()->get_newest_seq(); + ceph_assert(dir_follows >= realm->get_newest_seq()); + + if (dir_follows+1 > dn->first) { + snapid_t oldfirst = dn->first; + dn->first = dir_follows+1; + if (realm->has_snaps_in_range(oldfirst, dir_follows)) { + CDir *dir = dn->dir; + CDentry *olddn = dir->add_remote_dentry(dn->get_name(), in->ino(), in->d_type(), dn->alternate_name, oldfirst, dir_follows); + dout(10) << " olddn " << *olddn << dendl; + ceph_assert(dir->is_projected()); + olddn->set_projected_version(dir->get_projected_version()); + metablob->add_remote_dentry(olddn, true); + mut->add_cow_dentry(olddn); + // FIXME: adjust link count here? hmm. + + if (dir_follows+1 > in->first) + in->cow_old_inode(dir_follows, cow_head); + } + } + + follows = dir_follows; + if (in->snaprealm) { + realm = in->snaprealm; + ceph_assert(follows >= realm->get_newest_seq()); + } + } else { + realm = in->find_snaprealm(); + if (follows == CEPH_NOSNAP) { + follows = get_global_snaprealm()->get_newest_seq(); + ceph_assert(follows >= realm->get_newest_seq()); + } + } + + // already cloned? + if (follows < in->first) { + dout(10) << "journal_cow_dentry follows " << follows << " < first on " << *in << dendl; + return; + } + + if (!realm->has_snaps_in_range(in->first, follows)) { + dout(10) << "journal_cow_dentry no snapshot follows " << follows << " on " << *in << dendl; + in->first = follows + 1; + return; + } + + in->cow_old_inode(follows, cow_head); + + } else { + SnapRealm *realm = dn->dir->inode->find_snaprealm(); + if (follows == CEPH_NOSNAP) { + follows = get_global_snaprealm()->get_newest_seq(); + ceph_assert(follows >= realm->get_newest_seq()); + } + + // already cloned? 
+ if (follows < dn->first) { + dout(10) << "journal_cow_dentry follows " << follows << " < first on " << *dn << dendl; + return; + } + + // update dn.first before adding old dentry to cdir's map + snapid_t oldfirst = dn->first; + dn->first = follows+1; + + if (!realm->has_snaps_in_range(oldfirst, follows)) { + dout(10) << "journal_cow_dentry no snapshot follows " << follows << " on " << *dn << dendl; + if (in) + in->first = follows+1; + return; + } + + dout(10) << " dn " << *dn << dendl; + CDir *dir = dn->get_dir(); + ceph_assert(dir->is_projected()); + + if (in) { + CInode *oldin = cow_inode(in, follows); + ceph_assert(in->is_projected()); + mut->add_cow_inode(oldin); + if (pcow_inode) + *pcow_inode = oldin; + CDentry *olddn = dir->add_primary_dentry(dn->get_name(), oldin, dn->alternate_name, oldfirst, follows); + dout(10) << " olddn " << *olddn << dendl; + bool need_snapflush = !oldin->client_snap_caps.empty(); + if (need_snapflush) { + mut->ls->open_files.push_back(&oldin->item_open_file); + mds->locker->mark_need_snapflush_inode(oldin); + } + olddn->set_projected_version(dir->get_projected_version()); + metablob->add_primary_dentry(olddn, 0, true, false, false, need_snapflush); + mut->add_cow_dentry(olddn); + } else { + ceph_assert(dnl->is_remote()); + CDentry *olddn = dir->add_remote_dentry(dn->get_name(), dnl->get_remote_ino(), dnl->get_remote_d_type(), dn->alternate_name, oldfirst, follows); + dout(10) << " olddn " << *olddn << dendl; + + olddn->set_projected_version(dir->get_projected_version()); + metablob->add_remote_dentry(olddn, true); + mut->add_cow_dentry(olddn); + } + } +} + +void MDCache::journal_dirty_inode(MutationImpl *mut, EMetaBlob *metablob, CInode *in, snapid_t follows) +{ + if (in->is_base()) { + metablob->add_root(true, in); + } else { + if (follows == CEPH_NOSNAP && in->last != CEPH_NOSNAP) + follows = in->first - 1; + CDentry *dn = in->get_projected_parent_dn(); + if (!dn->get_projected_linkage()->is_null()) // no need to cow a null 
dentry + journal_cow_dentry(mut, metablob, dn, follows); + if (in->get_projected_inode()->is_backtrace_updated()) { + bool dirty_pool = in->get_projected_inode()->layout.pool_id != + in->get_previous_projected_inode()->layout.pool_id; + metablob->add_primary_dentry(dn, in, true, true, dirty_pool); + } else { + metablob->add_primary_dentry(dn, in, true); + } + } +} + + + +// nested --------------------------------------------------------------- + +void MDCache::project_rstat_inode_to_frag(const MutationRef& mut, + CInode *cur, CDir *parent, snapid_t first, + int linkunlink, SnapRealm *prealm) +{ + CDentry *parentdn = cur->get_projected_parent_dn(); + + if (cur->first > first) + first = cur->first; + + dout(10) << "projected_rstat_inode_to_frag first " << first << " linkunlink " << linkunlink + << " " << *cur << dendl; + dout(20) << " frag head is [" << parent->first << ",head] " << dendl; + dout(20) << " inode update is [" << first << "," << cur->last << "]" << dendl; + + /* + * FIXME. this incompletely propagates rstats to _old_ parents + * (i.e. shortly after a directory rename). but we need full + * blown hard link backpointers to make this work properly... + */ + snapid_t floor = parentdn->first; + dout(20) << " floor of " << floor << " from parent dn " << *parentdn << dendl; + + if (!prealm) + prealm = parent->inode->find_snaprealm(); + const set<snapid_t> snaps = prealm->get_snaps(); + + if (cur->last != CEPH_NOSNAP) { + ceph_assert(cur->dirty_old_rstats.empty()); + set<snapid_t>::const_iterator q = snaps.lower_bound(std::max(first, floor)); + if (q == snaps.end() || *q > cur->last) + return; + } + + if (cur->last >= floor) { + bool update = true; + if (cur->state_test(CInode::STATE_AMBIGUOUSAUTH) && cur->is_auth()) { + // rename src inode is not projected in the peer rename prep case. so we should + // avoid updateing the inode. 
+ ceph_assert(linkunlink < 0); + ceph_assert(cur->is_frozen_inode()); + update = false; + } + // hacky + const CInode::mempool_inode *pi; + if (update && mut->is_projected(cur)) { + pi = cur->_get_projected_inode(); + } else { + pi = cur->get_projected_inode().get(); + if (update) { + // new inode + ceph_assert(pi->rstat == pi->accounted_rstat); + update = false; + } + } + _project_rstat_inode_to_frag(pi, std::max(first, floor), cur->last, parent, + linkunlink, update); + } + + if (g_conf()->mds_snap_rstat) { + for (const auto &p : cur->dirty_old_rstats) { + const auto &old = cur->get_old_inodes()->at(p); + snapid_t ofirst = std::max(old.first, floor); + auto it = snaps.lower_bound(ofirst); + if (it == snaps.end() || *it > p) + continue; + if (p >= floor) + _project_rstat_inode_to_frag(&old.inode, ofirst, p, parent, 0, false); + } + } + cur->dirty_old_rstats.clear(); +} + + +void MDCache::_project_rstat_inode_to_frag(const CInode::mempool_inode* inode, snapid_t ofirst, snapid_t last, + CDir *parent, int linkunlink, bool update_inode) +{ + dout(10) << "_project_rstat_inode_to_frag [" << ofirst << "," << last << "]" << dendl; + dout(20) << " inode rstat " << inode->rstat << dendl; + dout(20) << " inode accounted_rstat " << inode->accounted_rstat << dendl; + nest_info_t delta; + if (linkunlink == 0) { + delta.add(inode->rstat); + delta.sub(inode->accounted_rstat); + } else if (linkunlink < 0) { + delta.sub(inode->accounted_rstat); + } else { + delta.add(inode->rstat); + } + dout(20) << " delta " << delta << dendl; + + + while (last >= ofirst) { + /* + * pick fnode version to update. at each iteration, we want to + * pick a segment ending in 'last' to update. split as necessary + * to make that work. then, adjust first up so that we only + * update one segment at a time. then loop to cover the whole + * [ofirst,last] interval. 
+ */ + nest_info_t *prstat; + snapid_t first; + auto pf = parent->_get_projected_fnode(); + if (last == CEPH_NOSNAP) { + if (g_conf()->mds_snap_rstat) + first = std::max(ofirst, parent->first); + else + first = parent->first; + prstat = &pf->rstat; + dout(20) << " projecting to head [" << first << "," << last << "] " << *prstat << dendl; + + if (first > parent->first && + !(pf->rstat == pf->accounted_rstat)) { + dout(10) << " target snapped and not fully accounted, cow to dirty_old_rstat [" + << parent->first << "," << (first-1) << "] " + << " " << *prstat << "/" << pf->accounted_rstat + << dendl; + parent->dirty_old_rstat[first-1].first = parent->first; + parent->dirty_old_rstat[first-1].rstat = pf->rstat; + parent->dirty_old_rstat[first-1].accounted_rstat = pf->accounted_rstat; + } + parent->first = first; + } else if (!g_conf()->mds_snap_rstat) { + // drop snapshots' rstats + break; + } else if (last >= parent->first) { + first = parent->first; + parent->dirty_old_rstat[last].first = first; + parent->dirty_old_rstat[last].rstat = pf->rstat; + parent->dirty_old_rstat[last].accounted_rstat = pf->accounted_rstat; + prstat = &parent->dirty_old_rstat[last].rstat; + dout(10) << " projecting to newly split dirty_old_fnode [" << first << "," << last << "] " + << " " << *prstat << "/" << pf->accounted_rstat << dendl; + } else { + // be careful, dirty_old_rstat is a _sparse_ map. + // sorry, this is ugly. 
+ first = ofirst; + + // find any intersection with last + auto it = parent->dirty_old_rstat.lower_bound(last); + if (it == parent->dirty_old_rstat.end()) { + dout(20) << " no dirty_old_rstat with last >= last " << last << dendl; + if (!parent->dirty_old_rstat.empty() && parent->dirty_old_rstat.rbegin()->first >= first) { + dout(20) << " last dirty_old_rstat ends at " << parent->dirty_old_rstat.rbegin()->first << dendl; + first = parent->dirty_old_rstat.rbegin()->first+1; + } + } else { + // *it last is >= last + if (it->second.first <= last) { + // *it intersects [first,last] + if (it->second.first < first) { + dout(10) << " splitting off left bit [" << it->second.first << "," << first-1 << "]" << dendl; + parent->dirty_old_rstat[first-1] = it->second; + it->second.first = first; + } + if (it->second.first > first) + first = it->second.first; + if (last < it->first) { + dout(10) << " splitting off right bit [" << last+1 << "," << it->first << "]" << dendl; + parent->dirty_old_rstat[last] = it->second; + it->second.first = last+1; + } + } else { + // *it is to the _right_ of [first,last] + it = parent->dirty_old_rstat.lower_bound(first); + // new *it last is >= first + if (it->second.first <= last && // new *it isn't also to the right, and + it->first >= first) { // it intersects our first bit, + dout(10) << " staying to the right of [" << it->second.first << "," << it->first << "]..." 
<< dendl; + first = it->first+1; + } + dout(10) << " projecting to new dirty_old_rstat [" << first << "," << last << "]" << dendl; + } + } + dout(20) << " projecting to dirty_old_rstat [" << first << "," << last << "]" << dendl; + parent->dirty_old_rstat[last].first = first; + prstat = &parent->dirty_old_rstat[last].rstat; + } + + // apply + dout(20) << " project to [" << first << "," << last << "] " << *prstat << dendl; + ceph_assert(last >= first); + prstat->add(delta); + dout(20) << " result [" << first << "," << last << "] " << *prstat << " " << *parent << dendl; + + last = first-1; + } + + if (update_inode) { + auto _inode = const_cast<CInode::mempool_inode*>(inode); + _inode->accounted_rstat = _inode->rstat; + } +} + +void MDCache::project_rstat_frag_to_inode(const nest_info_t& rstat, + const nest_info_t& accounted_rstat, + snapid_t ofirst, snapid_t last, + CInode *pin, bool cow_head) +{ + dout(10) << "project_rstat_frag_to_inode [" << ofirst << "," << last << "]" << dendl; + dout(20) << " frag rstat " << rstat << dendl; + dout(20) << " frag accounted_rstat " << accounted_rstat << dendl; + nest_info_t delta = rstat; + delta.sub(accounted_rstat); + dout(20) << " delta " << delta << dendl; + + CInode::old_inode_map_ptr _old_inodes; + while (last >= ofirst) { + CInode::mempool_inode *pi; + snapid_t first; + if (last == pin->last) { + pi = pin->_get_projected_inode(); + first = std::max(ofirst, pin->first); + if (first > pin->first) { + auto& old = pin->cow_old_inode(first-1, cow_head); + dout(20) << " cloned old_inode rstat is " << old.inode.rstat << dendl; + } + } else { + if (!_old_inodes) { + _old_inodes = CInode::allocate_old_inode_map(); + if (pin->is_any_old_inodes()) + *_old_inodes = *pin->get_old_inodes(); + } + if (last >= pin->first) { + first = pin->first; + pin->cow_old_inode(last, cow_head); + } else { + // our life is easier here because old_inodes is not sparse + // (although it may not begin at snapid 1) + auto it = 
_old_inodes->lower_bound(last); + if (it == _old_inodes->end()) { + dout(10) << " no old_inode <= " << last << ", done." << dendl; + break; + } + first = it->second.first; + if (first > last) { + dout(10) << " oldest old_inode is [" << first << "," << it->first << "], done." << dendl; + //assert(p == pin->old_inodes.begin()); + break; + } + if (it->first > last) { + dout(10) << " splitting right old_inode [" << first << "," << it->first << "] to [" + << (last+1) << "," << it->first << "]" << dendl; + (*_old_inodes)[last] = it->second; + it->second.first = last+1; + pin->dirty_old_rstats.insert(it->first); + } + } + if (first < ofirst) { + dout(10) << " splitting left old_inode [" << first << "," << last << "] to [" + << first << "," << ofirst-1 << "]" << dendl; + (*_old_inodes)[ofirst-1] = (*_old_inodes)[last]; + pin->dirty_old_rstats.insert(ofirst-1); + (*_old_inodes)[last].first = first = ofirst; + } + pi = &(*_old_inodes)[last].inode; + pin->dirty_old_rstats.insert(last); + } + dout(20) << " projecting to [" << first << "," << last << "] " << pi->rstat << dendl; + pi->rstat.add(delta); + dout(20) << " result [" << first << "," << last << "] " << pi->rstat << dendl; + + last = first-1; + } + if (_old_inodes) + pin->reset_old_inodes(std::move(_old_inodes)); +} + +void MDCache::broadcast_quota_to_client(CInode *in, client_t exclude_ct, bool quota_change) +{ + if (!(mds->is_active() || mds->is_stopping())) + return; + + if (!in->is_auth() || in->is_frozen()) + return; + + const auto& pi = in->get_projected_inode(); + if (!pi->quota.is_enabled() && !quota_change) + return; + + // creaete snaprealm for quota inode (quota was set before mimic) + if (!in->get_projected_srnode()) + mds->server->create_quota_realm(in); + + for (auto &p : in->client_caps) { + Capability *cap = &p.second; + if (cap->is_noquota()) + continue; + + if (exclude_ct >= 0 && exclude_ct != p.first) + goto update; + + if (cap->last_rbytes == pi->rstat.rbytes && + cap->last_rsize == 
pi->rstat.rsize()) + continue; + + if (pi->quota.max_files > 0) { + if (pi->rstat.rsize() >= pi->quota.max_files) + goto update; + + if ((abs(cap->last_rsize - pi->quota.max_files) >> 4) < + abs(cap->last_rsize - pi->rstat.rsize())) + goto update; + } + + if (pi->quota.max_bytes > 0) { + if (pi->rstat.rbytes > pi->quota.max_bytes - (pi->quota.max_bytes >> 3)) + goto update; + + if ((abs(cap->last_rbytes - pi->quota.max_bytes) >> 4) < + abs(cap->last_rbytes - pi->rstat.rbytes)) + goto update; + } + + continue; + +update: + cap->last_rsize = pi->rstat.rsize(); + cap->last_rbytes = pi->rstat.rbytes; + + auto msg = make_message<MClientQuota>(); + msg->ino = in->ino(); + msg->rstat = pi->rstat; + msg->quota = pi->quota; + mds->send_message_client_counted(msg, cap->get_session()); + } + for (const auto &it : in->get_replicas()) { + auto msg = make_message<MGatherCaps>(); + msg->ino = in->ino(); + mds->send_message_mds(msg, it.first); + } +} + +/* + * NOTE: we _have_ to delay the scatter if we are called during a + * rejoin, because we can't twiddle locks between when the + * rejoin_(weak|strong) is received and when we send the rejoin_ack. + * normally, this isn't a problem: a recover mds doesn't twiddle locks + * (no requests), and a survivor acks immediately. _except_ that + * during rejoin_(weak|strong) processing, we may complete a lock + * gather, and do a scatter_writebehind.. and we _can't_ twiddle the + * scatterlock state in that case or the lock states will get out of + * sync between the auth and replica. + * + * the simple solution is to never do the scatter here. instead, put + * the scatterlock on a list if it isn't already wrlockable. this is + * probably the best plan anyway, since we avoid too many + * scatters/locks under normal usage. + */ +/* + * some notes on dirlock/nestlock scatterlock semantics: + * + * the fragstat (dirlock) will never be updated without + * dirlock+nestlock wrlock held by the caller. 
+ * + * the rstat (nestlock) _may_ get updated without a wrlock when nested + * data is pushed up the tree. this could be changed with some + * restructuring here, but in its current form we ensure that the + * fragstat+rstat _always_ reflect an accurate summation over the dir + * frag, which is nice. and, we only need to track frags that need to + * be nudged (and not inodes with pending rstat changes that need to + * be pushed into the frag). a consequence of this is that the + * accounted_rstat on scatterlock sync may not match our current + * rstat. this is normal and expected. + */ +void MDCache::predirty_journal_parents(MutationRef mut, EMetaBlob *blob, + CInode *in, CDir *parent, + int flags, int linkunlink, + snapid_t cfollows) +{ + bool primary_dn = flags & PREDIRTY_PRIMARY; + bool do_parent_mtime = flags & PREDIRTY_DIR; + bool shallow = flags & PREDIRTY_SHALLOW; + + ceph_assert(mds->mdlog->entry_is_open()); + + // make sure stamp is set + if (mut->get_mds_stamp() == utime_t()) + mut->set_mds_stamp(ceph_clock_now()); + + if (in->is_base()) + return; + + dout(10) << "predirty_journal_parents" + << (do_parent_mtime ? " do_parent_mtime":"") + << " linkunlink=" << linkunlink + << (primary_dn ? " primary_dn":" remote_dn") + << (shallow ?
" SHALLOW":"") + << " follows " << cfollows + << " " << *in << dendl; + + if (!parent) { + ceph_assert(primary_dn); + parent = in->get_projected_parent_dn()->get_dir(); + } + + if (flags == 0 && linkunlink == 0) { + dout(10) << " no flags/linkunlink, just adding dir context to blob(s)" << dendl; + blob->add_dir_context(parent); + return; + } + + // build list of inodes to wrlock, dirty, and update + list<CInode*> lsi; + CInode *cur = in; + CDentry *parentdn = NULL; + bool first = true; + while (parent) { + //assert(cur->is_auth() || !primary_dn); // this breaks the rename auth twiddle hack + ceph_assert(parent->is_auth()); + + // opportunistically adjust parent dirfrag + CInode *pin = parent->get_inode(); + + // inode -> dirfrag + mut->auth_pin(parent); + + auto pf = parent->project_fnode(mut); + pf->version = parent->pre_dirty(); + + if (do_parent_mtime || linkunlink) { + ceph_assert(mut->is_wrlocked(&pin->filelock)); + ceph_assert(mut->is_wrlocked(&pin->nestlock)); + ceph_assert(cfollows == CEPH_NOSNAP); + + // update stale fragstat/rstat? 
+ parent->resync_accounted_fragstat(); + parent->resync_accounted_rstat(); + + if (do_parent_mtime) { + pf->fragstat.mtime = mut->get_op_stamp(); + pf->fragstat.change_attr++; + dout(10) << "predirty_journal_parents bumping fragstat change_attr to " << pf->fragstat.change_attr << " on " << parent << dendl; + if (pf->fragstat.mtime > pf->rstat.rctime) { + dout(10) << "predirty_journal_parents updating mtime on " << *parent << dendl; + pf->rstat.rctime = pf->fragstat.mtime; + } else { + dout(10) << "predirty_journal_parents updating mtime UNDERWATER on " << *parent << dendl; + } + } + if (linkunlink) { + dout(10) << "predirty_journal_parents updating size on " << *parent << dendl; + if (in->is_dir()) { + pf->fragstat.nsubdirs += linkunlink; + //pf->rstat.rsubdirs += linkunlink; + } else { + pf->fragstat.nfiles += linkunlink; + //pf->rstat.rfiles += linkunlink; + } + } + } + + // rstat + if (!primary_dn) { + // don't update parent this pass + } else if (!linkunlink && !(pin->nestlock.can_wrlock(-1) && + pin->versionlock.can_wrlock())) { + dout(20) << " unwritable parent nestlock " << pin->nestlock + << ", marking dirty rstat on " << *cur << dendl; + cur->mark_dirty_rstat(); + } else { + // if we don't hold a wrlock reference on this nestlock, take one, + // because we are about to write into the dirfrag fnode and that needs + // to commit before the lock can cycle. + if (linkunlink) { + ceph_assert(pin->nestlock.get_num_wrlocks() || mut->is_peer()); + } + + if (!mut->is_wrlocked(&pin->nestlock)) { + dout(10) << " taking wrlock on " << pin->nestlock << " on " << *pin << dendl; + mds->locker->wrlock_force(&pin->nestlock, mut); + } + + // now we can project the inode rstat diff the dirfrag + SnapRealm *prealm = pin->find_snaprealm(); + + snapid_t follows = cfollows; + if (follows == CEPH_NOSNAP) + follows = prealm->get_newest_seq(); + + snapid_t first = follows+1; + + // first, if the frag is stale, bring it back in sync. 
+ parent->resync_accounted_rstat(); + + // now push inode rstats into frag + project_rstat_inode_to_frag(mut, cur, parent, first, linkunlink, prealm); + cur->clear_dirty_rstat(); + } + + bool stop = false; + if (!pin->is_auth() || (!mut->is_auth_pinned(pin) && !pin->can_auth_pin())) { + dout(10) << "predirty_journal_parents !auth or ambig or can't authpin on " << *pin << dendl; + stop = true; + } + + // delay propagating until later? + if (!stop && !first && + g_conf()->mds_dirstat_min_interval > 0) { + double since_last_prop = mut->get_mds_stamp() - pin->last_dirstat_prop; + if (since_last_prop < g_conf()->mds_dirstat_min_interval) { + dout(10) << "predirty_journal_parents last prop " << since_last_prop + << " < " << g_conf()->mds_dirstat_min_interval + << ", stopping" << dendl; + stop = true; + } else { + dout(10) << "predirty_journal_parents last prop " << since_last_prop << " ago, continuing" << dendl; + } + } + + // can cast only because i'm passing nowait=true in the sole user + if (!stop && + !mut->is_wrlocked(&pin->nestlock) && + (!pin->versionlock.can_wrlock() || // make sure we can take versionlock, too + !mds->locker->wrlock_try(&pin->nestlock, mut) + )) { // ** do not initiate.. see above comment ** + dout(10) << "predirty_journal_parents can't wrlock one of " << pin->versionlock << " or " << pin->nestlock + << " on " << *pin << dendl; + stop = true; + } + if (stop) { + dout(10) << "predirty_journal_parents stop. 
marking nestlock on " << *pin << dendl; + mds->locker->mark_updated_scatterlock(&pin->nestlock); + mut->ls->dirty_dirfrag_nest.push_back(&pin->item_dirty_dirfrag_nest); + mut->add_updated_lock(&pin->nestlock); + if (do_parent_mtime || linkunlink) { + mds->locker->mark_updated_scatterlock(&pin->filelock); + mut->ls->dirty_dirfrag_dir.push_back(&pin->item_dirty_dirfrag_dir); + mut->add_updated_lock(&pin->filelock); + } + break; + } + if (!mut->is_wrlocked(&pin->versionlock)) + mds->locker->local_wrlock_grab(&pin->versionlock, mut); + + ceph_assert(mut->is_wrlocked(&pin->nestlock) || mut->is_peer()); + + pin->last_dirstat_prop = mut->get_mds_stamp(); + + // dirfrag -> diri + mut->auth_pin(pin); + lsi.push_front(pin); + + pin->pre_cow_old_inode(); // avoid cow mayhem! + + auto pi = pin->project_inode(mut); + pi.inode->version = pin->pre_dirty(); + + // dirstat + if (do_parent_mtime || linkunlink) { + dout(20) << "predirty_journal_parents add_delta " << pf->fragstat << dendl; + dout(20) << "predirty_journal_parents - " << pf->accounted_fragstat << dendl; + bool touched_mtime = false, touched_chattr = false; + pi.inode->dirstat.add_delta(pf->fragstat, pf->accounted_fragstat, &touched_mtime, &touched_chattr); + pf->accounted_fragstat = pf->fragstat; + if (touched_mtime) + pi.inode->mtime = pi.inode->ctime = pi.inode->dirstat.mtime; + if (touched_chattr) + pi.inode->change_attr++; + dout(20) << "predirty_journal_parents gives " << pi.inode->dirstat << " on " << *pin << dendl; + + if (parent->get_frag() == frag_t()) { // i.e., we are the only frag + if (pi.inode->dirstat.size() < 0) + ceph_assert(!"negative dirstat size" == g_conf()->mds_verify_scatter); + if (pi.inode->dirstat.size() != pf->fragstat.size()) { + mds->clog->error() << "unmatched fragstat size on single dirfrag " + << parent->dirfrag() << ", inode has " << pi.inode->dirstat + << ", dirfrag has " << pf->fragstat; + + // trust the dirfrag for now + pi.inode->dirstat = pf->fragstat; + + ceph_assert(!"unmatched 
fragstat size" == g_conf()->mds_verify_scatter); + } + } + } + + // rstat + dout(10) << "predirty_journal_parents frag->inode on " << *parent << dendl; + + // first, if the frag is stale, bring it back in sync. + parent->resync_accounted_rstat(); + + if (g_conf()->mds_snap_rstat) { + for (auto &p : parent->dirty_old_rstat) { + project_rstat_frag_to_inode(p.second.rstat, p.second.accounted_rstat, p.second.first, + p.first, pin, true); + } + } + parent->dirty_old_rstat.clear(); + project_rstat_frag_to_inode(pf->rstat, pf->accounted_rstat, parent->first, CEPH_NOSNAP, pin, true);//false); + + pf->accounted_rstat = pf->rstat; + + if (parent->get_frag() == frag_t()) { // i.e., we are the only frag + if (pi.inode->rstat.rbytes != pf->rstat.rbytes) { + mds->clog->error() << "unmatched rstat rbytes on single dirfrag " + << parent->dirfrag() << ", inode has " << pi.inode->rstat + << ", dirfrag has " << pf->rstat; + + // trust the dirfrag for now + pi.inode->rstat = pf->rstat; + + ceph_assert(!"unmatched rstat rbytes" == g_conf()->mds_verify_scatter); + } + } + + parent->check_rstats(); + broadcast_quota_to_client(pin); + if (pin->is_base()) + break; + // next parent! + cur = pin; + parentdn = pin->get_projected_parent_dn(); + ceph_assert(parentdn); + parent = parentdn->get_dir(); + linkunlink = 0; + do_parent_mtime = false; + primary_dn = true; + first = false; + } + + // now, stick it in the blob + ceph_assert(parent); + ceph_assert(parent->is_auth()); + blob->add_dir_context(parent); + blob->add_dir(parent, true); + for (const auto& in : lsi) { + journal_dirty_inode(mut.get(), blob, in); + } + +} + + + + + +// =================================== +// peer requests + + +/* + * some handlers for leader requests with peers. we need to make + * sure leader journal commits before we forget we leadered them and + * remove them from the uncommitted_leaders map (used during recovery + * to commit|abort peers). 
+ */ +struct C_MDC_CommittedLeader : public MDCacheLogContext { + metareqid_t reqid; + C_MDC_CommittedLeader(MDCache *s, metareqid_t r) : MDCacheLogContext(s), reqid(r) {} + void finish(int r) override { + mdcache->_logged_leader_commit(reqid); + } +}; + +void MDCache::log_leader_commit(metareqid_t reqid) +{ + dout(10) << "log_leader_commit " << reqid << dendl; + uncommitted_leaders[reqid].committing = true; + mds->mdlog->start_submit_entry(new ECommitted(reqid), + new C_MDC_CommittedLeader(this, reqid)); +} + +void MDCache::_logged_leader_commit(metareqid_t reqid) +{ + dout(10) << "_logged_leader_commit " << reqid << dendl; + ceph_assert(uncommitted_leaders.count(reqid)); + uncommitted_leaders[reqid].ls->uncommitted_leaders.erase(reqid); + mds->queue_waiters(uncommitted_leaders[reqid].waiters); + uncommitted_leaders.erase(reqid); +} + +// while active... + +void MDCache::committed_leader_peer(metareqid_t r, mds_rank_t from) +{ + dout(10) << "committed_leader_peer mds." << from << " on " << r << dendl; + ceph_assert(uncommitted_leaders.count(r)); + uncommitted_leaders[r].peers.erase(from); + if (!uncommitted_leaders[r].recovering && uncommitted_leaders[r].peers.empty()) + log_leader_commit(r); +} + +void MDCache::logged_leader_update(metareqid_t reqid) +{ + dout(10) << "logged_leader_update " << reqid << dendl; + ceph_assert(uncommitted_leaders.count(reqid)); + uncommitted_leaders[reqid].safe = true; + auto p = pending_leaders.find(reqid); + if (p != pending_leaders.end()) { + pending_leaders.erase(p); + if (pending_leaders.empty()) + process_delayed_resolve(); + } +} + +/* + * Leader may crash after receiving all peers' commit acks, but before journalling + * the final commit. Peers may crash after journalling the peer commit, but before + * sending commit ack to the leader. Commit leaders with no uncommitted peer when + * resolve finishes. 
+ */ +void MDCache::finish_committed_leaders() +{ + for (map<metareqid_t, uleader>::iterator p = uncommitted_leaders.begin(); + p != uncommitted_leaders.end(); + ++p) { + p->second.recovering = false; + if (!p->second.committing && p->second.peers.empty()) { + dout(10) << "finish_committed_leaders " << p->first << dendl; + log_leader_commit(p->first); + } + } +} + +/* + * at end of resolve... we must journal a commit|abort for all peer + * updates, before moving on. + * + * this is so that the leader can safely journal ECommitted on ops it + * leaders when it reaches up:active (all other recovering nodes must + * complete resolve before that happens). + */ +struct C_MDC_PeerCommit : public MDCacheLogContext { + mds_rank_t from; + metareqid_t reqid; + C_MDC_PeerCommit(MDCache *c, int f, metareqid_t r) : MDCacheLogContext(c), from(f), reqid(r) {} + void finish(int r) override { + mdcache->_logged_peer_commit(from, reqid); + } +}; + +void MDCache::_logged_peer_commit(mds_rank_t from, metareqid_t reqid) +{ + dout(10) << "_logged_peer_commit from mds." 
<< from << " " << reqid << dendl; + + // send a message + auto req = make_message<MMDSPeerRequest>(reqid, 0, MMDSPeerRequest::OP_COMMITTED); + mds->send_message_mds(req, from); +} + + + + + + +// ==================================================================== +// import map, recovery + +void MDCache::_move_subtree_map_bound(dirfrag_t df, dirfrag_t oldparent, dirfrag_t newparent, + map<dirfrag_t,vector<dirfrag_t> >& subtrees) +{ + if (subtrees.count(oldparent)) { + vector<dirfrag_t>& v = subtrees[oldparent]; + dout(10) << " removing " << df << " from " << oldparent << " bounds " << v << dendl; + for (vector<dirfrag_t>::iterator it = v.begin(); it != v.end(); ++it) + if (*it == df) { + v.erase(it); + break; + } + } + if (subtrees.count(newparent)) { + vector<dirfrag_t>& v = subtrees[newparent]; + dout(10) << " adding " << df << " to " << newparent << " bounds " << v << dendl; + v.push_back(df); + } +} + +ESubtreeMap *MDCache::create_subtree_map() +{ + dout(10) << "create_subtree_map " << num_subtrees() << " subtrees, " + << num_subtrees_fullauth() << " fullauth" + << dendl; + + show_subtrees(); + + ESubtreeMap *le = new ESubtreeMap(); + mds->mdlog->_start_entry(le); + + map<dirfrag_t, CDir*> dirs_to_add; + + if (myin) { + CDir* mydir = myin->get_dirfrag(frag_t()); + dirs_to_add[mydir->dirfrag()] = mydir; + } + + // include all auth subtrees, and their bounds. + // and a spanning tree to tie it to the root. + for (auto& [dir, bounds] : subtrees) { + // journal subtree as "ours" if we are + // me, -2 + // me, me + // me, !me (may be importing and ambiguous!) 
+ + // so not + // !me, * + if (dir->get_dir_auth().first != mds->get_nodeid()) + continue; + + if (migrator->is_ambiguous_import(dir->dirfrag()) || + my_ambiguous_imports.count(dir->dirfrag())) { + dout(15) << " ambig subtree " << *dir << dendl; + le->ambiguous_subtrees.insert(dir->dirfrag()); + } else { + dout(15) << " auth subtree " << *dir << dendl; + } + + dirs_to_add[dir->dirfrag()] = dir; + le->subtrees[dir->dirfrag()].clear(); + + // bounds + size_t nbounds = bounds.size(); + if (nbounds > 3) { + dout(15) << " subtree has " << nbounds << " bounds" << dendl; + } + for (auto& bound : bounds) { + if (nbounds <= 3) { + dout(15) << " subtree bound " << *bound << dendl; + } + dirs_to_add[bound->dirfrag()] = bound; + le->subtrees[dir->dirfrag()].push_back(bound->dirfrag()); + } + } + + // apply projected renames + for (const auto& [diri, renames] : projected_subtree_renames) { + for (const auto& [olddir, newdir] : renames) { + dout(15) << " adjusting for projected rename of " << *diri << " to " << *newdir << dendl; + + auto&& dfls = diri->get_dirfrags(); + for (const auto& dir : dfls) { + dout(15) << "dirfrag " << dir->dirfrag() << " " << *dir << dendl; + CDir *oldparent = get_projected_subtree_root(olddir); + dout(15) << " old parent " << oldparent->dirfrag() << " " << *oldparent << dendl; + CDir *newparent = get_projected_subtree_root(newdir); + dout(15) << " new parent " << newparent->dirfrag() << " " << *newparent << dendl; + + if (oldparent == newparent) { + dout(15) << "parent unchanged for " << dir->dirfrag() << " at " + << oldparent->dirfrag() << dendl; + continue; + } + + if (dir->is_subtree_root()) { + if (le->subtrees.count(newparent->dirfrag()) && + oldparent->get_dir_auth() != newparent->get_dir_auth()) + dirs_to_add[dir->dirfrag()] = dir; + // children are fine. change parent. + _move_subtree_map_bound(dir->dirfrag(), oldparent->dirfrag(), newparent->dirfrag(), + le->subtrees); + } else { + // mid-subtree. 
+ + if (oldparent->get_dir_auth() != newparent->get_dir_auth()) { + dout(10) << " creating subtree for " << dir->dirfrag() << dendl; + // if oldparent is auth, subtree is mine; include it. + if (le->subtrees.count(oldparent->dirfrag())) { + dirs_to_add[dir->dirfrag()] = dir; + le->subtrees[dir->dirfrag()].clear(); + } + // if newparent is auth, subtree is a new bound + if (le->subtrees.count(newparent->dirfrag())) { + dirs_to_add[dir->dirfrag()] = dir; + le->subtrees[newparent->dirfrag()].push_back(dir->dirfrag()); // newparent is auth; new bound + } + newparent = dir; + } + + // see if any old bounds move to the new parent. + for (auto& bound : subtrees.at(oldparent)) { + if (dir->contains(bound->get_parent_dir())) + _move_subtree_map_bound(bound->dirfrag(), oldparent->dirfrag(), newparent->dirfrag(), + le->subtrees); + } + } + } + } + } + + // simplify the journaled map. our in memory map may have more + // subtrees than needed due to migrations that are just getting + // started or just completing. but on replay, the "live" map will + // be simple and we can do a straight comparison. 
+ for (auto& [frag, bfrags] : le->subtrees) { + if (le->ambiguous_subtrees.count(frag)) + continue; + unsigned i = 0; + while (i < bfrags.size()) { + dirfrag_t b = bfrags[i]; + if (le->subtrees.count(b) && + le->ambiguous_subtrees.count(b) == 0) { + auto& bb = le->subtrees.at(b); + dout(10) << "simplify: " << frag << " swallowing " << b << " with bounds " << bb << dendl; + for (auto& r : bb) { + bfrags.push_back(r); + } + dirs_to_add.erase(b); + le->subtrees.erase(b); + bfrags.erase(bfrags.begin() + i); + } else { + ++i; + } + } + } + + for (auto &p : dirs_to_add) { + CDir *dir = p.second; + le->metablob.add_dir_context(dir, EMetaBlob::TO_ROOT); + le->metablob.add_dir(dir, false); + } + + dout(15) << " subtrees " << le->subtrees << dendl; + dout(15) << " ambiguous_subtrees " << le->ambiguous_subtrees << dendl; + + //le->metablob.print(cout); + le->expire_pos = mds->mdlog->journaler->get_expire_pos(); + return le; +} + +void MDCache::dump_resolve_status(Formatter *f) const +{ + f->open_object_section("resolve_status"); + f->dump_stream("resolve_gather") << resolve_gather; + f->dump_stream("resolve_ack_gather") << resolve_gather; + f->close_section(); +} + +void MDCache::resolve_start(MDSContext *resolve_done_) +{ + dout(10) << "resolve_start" << dendl; + ceph_assert(!resolve_done); + resolve_done.reset(resolve_done_); + + if (mds->mdsmap->get_root() != mds->get_nodeid()) { + // if we don't have the root dir, adjust it to UNKNOWN. during + // resolve we want mds0 to explicit claim the portion of it that + // it owns, so that anything beyond its bounds get left as + // unknown. 
+ CDir *rootdir = root->get_dirfrag(frag_t()); + if (rootdir) + adjust_subtree_auth(rootdir, CDIR_AUTH_UNKNOWN); + } + resolve_gather = recovery_set; + + resolve_snapclient_commits = mds->snapclient->get_journaled_tids(); +} + +void MDCache::send_resolves() +{ + send_peer_resolves(); + + if (!resolve_done) { + // I'm survivor: refresh snap cache + mds->snapclient->sync( + new MDSInternalContextWrapper(mds, + new LambdaContext([this](int r) { + maybe_finish_peer_resolve(); + }) + ) + ); + dout(10) << "send_resolves waiting for snapclient cache to sync" << dendl; + return; + } + if (!resolve_ack_gather.empty()) { + dout(10) << "send_resolves still waiting for resolve ack from (" + << resolve_ack_gather << ")" << dendl; + return; + } + if (!resolve_need_rollback.empty()) { + dout(10) << "send_resolves still waiting for rollback to commit on (" + << resolve_need_rollback << ")" << dendl; + return; + } + + send_subtree_resolves(); +} + +void MDCache::send_peer_resolves() +{ + dout(10) << "send_peer_resolves" << dendl; + + map<mds_rank_t, ref_t<MMDSResolve>> resolves; + + if (mds->is_resolve()) { + for (map<metareqid_t, upeer>::iterator p = uncommitted_peers.begin(); + p != uncommitted_peers.end(); + ++p) { + mds_rank_t leader = p->second.leader; + auto &m = resolves[leader]; + if (!m) m = make_message<MMDSResolve>(); + m->add_peer_request(p->first, false); + } + } else { + set<mds_rank_t> resolve_set; + mds->mdsmap->get_mds_set(resolve_set, MDSMap::STATE_RESOLVE); + for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin(); + p != active_requests.end(); + ++p) { + MDRequestRef& mdr = p->second; + if (!mdr->is_peer()) + continue; + if (!mdr->peer_did_prepare() && !mdr->committing) { + continue; + } + mds_rank_t leader = mdr->peer_to_mds; + if (resolve_set.count(leader) || is_ambiguous_peer_update(p->first, leader)) { + dout(10) << " including uncommitted " << *mdr << dendl; + if (!resolves.count(leader)) + resolves[leader] = 
make_message<MMDSResolve>(); + if (!mdr->committing && + mdr->has_more() && mdr->more()->is_inode_exporter) { + // re-send cap exports + CInode *in = mdr->more()->rename_inode; + map<client_t, Capability::Export> cap_map; + in->export_client_caps(cap_map); + bufferlist bl; + MMDSResolve::peer_inode_cap inode_caps(in->ino(), cap_map); + encode(inode_caps, bl); + resolves[leader]->add_peer_request(p->first, bl); + } else { + resolves[leader]->add_peer_request(p->first, mdr->committing); + } + } + } + } + + for (auto &p : resolves) { + dout(10) << "sending peer resolve to mds." << p.first << dendl; + mds->send_message_mds(p.second, p.first); + resolve_ack_gather.insert(p.first); + } +} + +void MDCache::send_subtree_resolves() +{ + dout(10) << "send_subtree_resolves" << dendl; + + if (migrator->is_exporting() || migrator->is_importing()) { + dout(7) << "send_subtree_resolves waiting, imports/exports still in progress" << dendl; + migrator->show_importing(); + migrator->show_exporting(); + resolves_pending = true; + return; // not now + } + + map<mds_rank_t, ref_t<MMDSResolve>> resolves; + for (set<mds_rank_t>::iterator p = recovery_set.begin(); + p != recovery_set.end(); + ++p) { + if (*p == mds->get_nodeid()) + continue; + if (mds->is_resolve() || mds->mdsmap->is_resolve(*p)) + resolves[*p] = make_message<MMDSResolve>(); + } + + map<dirfrag_t, vector<dirfrag_t> > my_subtrees; + map<dirfrag_t, vector<dirfrag_t> > my_ambig_imports; + + // known + for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin(); + p != subtrees.end(); + ++p) { + CDir *dir = p->first; + + // only our subtrees + if (dir->authority().first != mds->get_nodeid()) + continue; + + if (mds->is_resolve() && my_ambiguous_imports.count(dir->dirfrag())) + continue; // we'll add it below + + if (migrator->is_ambiguous_import(dir->dirfrag())) { + // ambiguous (mid-import) + set<CDir*> bounds; + get_subtree_bounds(dir, bounds); + vector<dirfrag_t> dfls; + for (set<CDir*>::iterator q = bounds.begin(); q != 
bounds.end(); ++q) + dfls.push_back((*q)->dirfrag()); + + my_ambig_imports[dir->dirfrag()] = dfls; + dout(10) << " ambig " << dir->dirfrag() << " " << dfls << dendl; + } else { + // not ambiguous. + for (auto &q : resolves) { + resolves[q.first]->add_subtree(dir->dirfrag()); + } + // bounds too + vector<dirfrag_t> dfls; + for (set<CDir*>::iterator q = subtrees[dir].begin(); + q != subtrees[dir].end(); + ++q) { + CDir *bound = *q; + dfls.push_back(bound->dirfrag()); + } + + my_subtrees[dir->dirfrag()] = dfls; + dout(10) << " claim " << dir->dirfrag() << " " << dfls << dendl; + } + } + + // ambiguous + for (map<dirfrag_t, vector<dirfrag_t> >::iterator p = my_ambiguous_imports.begin(); + p != my_ambiguous_imports.end(); + ++p) { + my_ambig_imports[p->first] = p->second; + dout(10) << " ambig " << p->first << " " << p->second << dendl; + } + + // simplify the claimed subtree. + for (auto p = my_subtrees.begin(); p != my_subtrees.end(); ++p) { + unsigned i = 0; + while (i < p->second.size()) { + dirfrag_t b = p->second[i]; + if (my_subtrees.count(b)) { + vector<dirfrag_t>& bb = my_subtrees[b]; + dout(10) << " simplify: " << p->first << " swallowing " << b << " with bounds " << bb << dendl; + for (vector<dirfrag_t>::iterator r = bb.begin(); r != bb.end(); ++r) + p->second.push_back(*r); + my_subtrees.erase(b); + p->second.erase(p->second.begin() + i); + } else { + ++i; + } + } + } + + // send + for (auto &p : resolves) { + const ref_t<MMDSResolve> &m = p.second; + if (mds->is_resolve()) { + m->add_table_commits(TABLE_SNAP, resolve_snapclient_commits); + } else { + m->add_table_commits(TABLE_SNAP, mds->snapclient->get_journaled_tids()); + } + m->subtrees = my_subtrees; + m->ambiguous_imports = my_ambig_imports; + dout(10) << "sending subtee resolve to mds." 
<< p.first << dendl; + mds->send_message_mds(m, p.first); + } + resolves_pending = false; +} + +void MDCache::maybe_finish_peer_resolve() { + if (resolve_ack_gather.empty() && resolve_need_rollback.empty()) { + // snap cache get synced or I'm in resolve state + if (mds->snapclient->is_synced() || resolve_done) + send_subtree_resolves(); + process_delayed_resolve(); + } +} + +void MDCache::handle_mds_failure(mds_rank_t who) +{ + dout(7) << "handle_mds_failure mds." << who << dendl; + + dout(1) << "handle_mds_failure mds." << who << " : recovery peers are " << recovery_set << dendl; + + resolve_gather.insert(who); + discard_delayed_resolve(who); + ambiguous_peer_updates.erase(who); + + rejoin_gather.insert(who); + rejoin_sent.erase(who); // i need to send another + rejoin_ack_sent.erase(who); // i need to send another + rejoin_ack_gather.erase(who); // i'll need/get another. + + dout(10) << " resolve_gather " << resolve_gather << dendl; + dout(10) << " resolve_ack_gather " << resolve_ack_gather << dendl; + dout(10) << " rejoin_sent " << rejoin_sent << dendl; + dout(10) << " rejoin_gather " << rejoin_gather << dendl; + dout(10) << " rejoin_ack_gather " << rejoin_ack_gather << dendl; + + + // tell the migrator too. + migrator->handle_mds_failure_or_stop(who); + + // tell the balancer too. + mds->balancer->handle_mds_failure(who); + + // clean up any requests peer to/from this node + list<MDRequestRef> finish; + for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin(); + p != active_requests.end(); + ++p) { + MDRequestRef& mdr = p->second; + // peer to the failed node? 
+ if (mdr->peer_to_mds == who) { + if (mdr->peer_did_prepare()) { + dout(10) << " peer request " << *mdr << " uncommitted, will resolve shortly" << dendl; + if (is_ambiguous_peer_update(p->first, mdr->peer_to_mds)) + remove_ambiguous_peer_update(p->first, mdr->peer_to_mds); + + if (!mdr->more()->waiting_on_peer.empty()) { + ceph_assert(mdr->more()->srcdn_auth_mds == mds->get_nodeid()); + // will rollback, no need to wait + mdr->reset_peer_request(); + mdr->more()->waiting_on_peer.clear(); + } + } else if (!mdr->committing) { + dout(10) << " peer request " << *mdr << " has no prepare, finishing up" << dendl; + if (mdr->peer_request || mdr->peer_rolling_back()) + mdr->aborted = true; + else + finish.push_back(mdr); + } + } + + if (mdr->is_peer() && mdr->peer_did_prepare()) { + if (mdr->more()->waiting_on_peer.count(who)) { + ceph_assert(mdr->more()->srcdn_auth_mds == mds->get_nodeid()); + dout(10) << " peer request " << *mdr << " no longer need rename notity ack from mds." + << who << dendl; + mdr->more()->waiting_on_peer.erase(who); + if (mdr->more()->waiting_on_peer.empty() && mdr->peer_request) + mds->queue_waiter(new C_MDS_RetryRequest(this, mdr)); + } + + if (mdr->more()->srcdn_auth_mds == who && + mds->mdsmap->is_clientreplay_or_active_or_stopping(mdr->peer_to_mds)) { + // rename srcdn's auth mds failed, resolve even I'm a survivor. + dout(10) << " peer request " << *mdr << " uncommitted, will resolve shortly" << dendl; + add_ambiguous_peer_update(p->first, mdr->peer_to_mds); + } + } else if (mdr->peer_request) { + const cref_t<MMDSPeerRequest> &peer_req = mdr->peer_request; + // FIXME: Peer rename request can arrive after we notice mds failure. + // This can cause mds to crash (does not affect integrity of FS). + if (peer_req->get_op() == MMDSPeerRequest::OP_RENAMEPREP && + peer_req->srcdn_auth == who) + peer_req->mark_interrupted(); + } + + // failed node is peer? 
+ if (mdr->is_leader() && !mdr->committing) { + if (mdr->more()->srcdn_auth_mds == who) { + dout(10) << " leader request " << *mdr << " waiting for rename srcdn's auth mds." + << who << " to recover" << dendl; + ceph_assert(mdr->more()->witnessed.count(who) == 0); + if (mdr->more()->is_ambiguous_auth) + mdr->clear_ambiguous_auth(); + // rename srcdn's auth mds failed, all witnesses will rollback + mdr->more()->witnessed.clear(); + pending_leaders.erase(p->first); + } + + if (mdr->more()->witnessed.count(who)) { + mds_rank_t srcdn_auth = mdr->more()->srcdn_auth_mds; + if (srcdn_auth >= 0 && mdr->more()->waiting_on_peer.count(srcdn_auth)) { + dout(10) << " leader request " << *mdr << " waiting for rename srcdn's auth mds." + << mdr->more()->srcdn_auth_mds << " to reply" << dendl; + // waiting for the peer (rename srcdn's auth mds), delay sending resolve ack + // until either the request is committing or the peer also fails. + ceph_assert(mdr->more()->waiting_on_peer.size() == 1); + pending_leaders.insert(p->first); + } else { + dout(10) << " leader request " << *mdr << " no longer witnessed by peer mds." + << who << " to recover" << dendl; + if (srcdn_auth >= 0) + ceph_assert(mdr->more()->witnessed.count(srcdn_auth) == 0); + + // discard this peer's prepare (if any) + mdr->more()->witnessed.erase(who); + } + } + + if (mdr->more()->waiting_on_peer.count(who)) { + dout(10) << " leader request " << *mdr << " waiting for peer mds." 
<< who + << " to recover" << dendl; + // retry request when peer recovers + mdr->more()->waiting_on_peer.erase(who); + if (mdr->more()->waiting_on_peer.empty()) + mds->wait_for_active_peer(who, new C_MDS_RetryRequest(this, mdr)); + } + + if (mdr->locking && mdr->locking_target_mds == who) + mdr->finish_locking(mdr->locking); + } + } + + for (map<metareqid_t, uleader>::iterator p = uncommitted_leaders.begin(); + p != uncommitted_leaders.end(); + ++p) { + // The failed MDS may have already committed the peer update + if (p->second.peers.count(who)) { + p->second.recovering = true; + p->second.peers.erase(who); + } + } + + while (!finish.empty()) { + dout(10) << "cleaning up peer request " << *finish.front() << dendl; + request_finish(finish.front()); + finish.pop_front(); + } + + kick_find_ino_peers(who); + kick_open_ino_peers(who); + + for (map<dirfrag_t,fragment_info_t>::iterator p = fragments.begin(); + p != fragments.end(); ) { + dirfrag_t df = p->first; + fragment_info_t& info = p->second; + + if (info.is_fragmenting()) { + if (info.notify_ack_waiting.erase(who) && + info.notify_ack_waiting.empty()) { + fragment_drop_locks(info); + fragment_maybe_finish(p++); + } else { + ++p; + } + continue; + } + + ++p; + dout(10) << "cancelling fragment " << df << " bit " << info.bits << dendl; + std::vector<CDir*> dirs; + info.dirs.swap(dirs); + fragments.erase(df); + fragment_unmark_unfreeze_dirs(dirs); + } + + // MDCache::shutdown_export_strays() always exports strays to mds.0 + if (who == mds_rank_t(0)) + shutdown_exporting_strays.clear(); + + show_subtrees(); +} + +/* + * handle_mds_recovery - called on another node's transition + * from resolve -> active. + */ +void MDCache::handle_mds_recovery(mds_rank_t who) +{ + dout(7) << "handle_mds_recovery mds." << who << dendl; + + // exclude all discover waiters. 
kick_discovers() will do the job + static const uint64_t i_mask = CInode::WAIT_ANY_MASK & ~CInode::WAIT_DIR; + static const uint64_t d_mask = CDir::WAIT_ANY_MASK & ~CDir::WAIT_DENTRY; + + MDSContext::vec waiters; + + // wake up any waiters in their subtrees + for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin(); + p != subtrees.end(); + ++p) { + CDir *dir = p->first; + + if (dir->authority().first != who || + dir->authority().second == mds->get_nodeid()) + continue; + ceph_assert(!dir->is_auth()); + + // wake any waiters + std::queue<CDir*> q; + q.push(dir); + + while (!q.empty()) { + CDir *d = q.front(); + q.pop(); + d->take_waiting(d_mask, waiters); + + // inode waiters too + for (auto &p : d->items) { + CDentry *dn = p.second; + CDentry::linkage_t *dnl = dn->get_linkage(); + if (dnl->is_primary()) { + dnl->get_inode()->take_waiting(i_mask, waiters); + + // recurse? + auto&& ls = dnl->get_inode()->get_dirfrags(); + for (const auto& subdir : ls) { + if (!subdir->is_subtree_root()) + q.push(subdir); + } + } + } + } + } + + kick_open_ino_peers(who); + kick_find_ino_peers(who); + + // queue them up. + mds->queue_waiters(waiters); +} + +void MDCache::set_recovery_set(set<mds_rank_t>& s) +{ + dout(7) << "set_recovery_set " << s << dendl; + recovery_set = s; +} + + +/* + * during resolve state, we share resolves to determine who + * is authoritative for which trees. we expect to get a resolve + * from _everyone_ in the recovery_set (the mds cluster at the time of + * the first failure). + * + * This function puts the passed message before returning + * (NOTE(review): no explicit put() is visible in this function and the + * message is held by a cref_t; this note may be stale -- confirm) + */ +void MDCache::handle_resolve(const cref_t<MMDSResolve> &m) +{ + dout(7) << "handle_resolve from " << m->get_source() << dendl; + mds_rank_t from = mds_rank_t(m->get_source().num()); + + if (mds->get_state() < MDSMap::STATE_RESOLVE) { + if (mds->get_want_state() == CEPH_MDS_STATE_RESOLVE) { + mds->wait_for_resolve(new C_MDS_RetryMessage(mds, m)); + return; + } + // wait until we reach the resolve stage! 
+ return; + } + + discard_delayed_resolve(from); + + // ambiguous peer requests? + if (!m->peer_requests.empty()) { + if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping()) { + for (auto p = m->peer_requests.begin(); p != m->peer_requests.end(); ++p) { + if (uncommitted_leaders.count(p->first) && !uncommitted_leaders[p->first].safe) { + ceph_assert(!p->second.committing); + pending_leaders.insert(p->first); + } + } + + if (!pending_leaders.empty()) { + dout(10) << " still have pending updates, delay processing peer resolve" << dendl; + delayed_resolve[from] = m; + return; + } + } + + auto ack = make_message<MMDSResolveAck>(); + for (const auto &p : m->peer_requests) { + if (uncommitted_leaders.count(p.first)) { //mds->sessionmap.have_completed_request(p.first)) { + // COMMIT + if (p.second.committing) { + // already committing, waiting for the OP_COMMITTED peer reply + dout(10) << " already committing peer request " << p << " noop "<< dendl; + } else { + dout(10) << " ambiguous peer request " << p << " will COMMIT" << dendl; + ack->add_commit(p.first); + } + uncommitted_leaders[p.first].peers.insert(from); // wait for peer OP_COMMITTED before we log ECommitted + + if (p.second.inode_caps.length() > 0) { + // peer wants to export caps (rename) + ceph_assert(mds->is_resolve()); + MMDSResolve::peer_inode_cap inode_caps; + auto q = p.second.inode_caps.cbegin(); + decode(inode_caps, q); + inodeno_t ino = inode_caps.ino; + map<client_t,Capability::Export> cap_exports = inode_caps.cap_exports; + ceph_assert(get_inode(ino)); + + for (map<client_t,Capability::Export>::iterator q = cap_exports.begin(); + q != cap_exports.end(); + ++q) { + Capability::Import& im = rejoin_imported_caps[from][ino][q->first]; + im.cap_id = ++last_cap_id; // assign a new cap ID + im.issue_seq = 1; + im.mseq = q->second.mseq; + + Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v)); + if (session) + rejoin_client_map.emplace(q->first, 
session->info.inst); + } + + // will process these caps in rejoin stage + rejoin_peer_exports[ino].first = from; + rejoin_peer_exports[ino].second.swap(cap_exports); + + // send information of imported caps back to peer + encode(rejoin_imported_caps[from][ino], ack->commit[p.first]); + } + } else { + // ABORT + dout(10) << " ambiguous peer request " << p << " will ABORT" << dendl; + ceph_assert(!p.second.committing); + ack->add_abort(p.first); + } + } + mds->send_message(ack, m->get_connection()); + return; + } + + if (!resolve_ack_gather.empty() || !resolve_need_rollback.empty()) { + dout(10) << "delay processing subtree resolve" << dendl; + delayed_resolve[from] = m; + return; + } + + bool survivor = false; + // am i a surviving ambiguous importer? + if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping()) { + survivor = true; + // check for any import success/failure (from this node) + map<dirfrag_t, vector<dirfrag_t> >::iterator p = my_ambiguous_imports.begin(); + while (p != my_ambiguous_imports.end()) { + map<dirfrag_t, vector<dirfrag_t> >::iterator next = p; + ++next; + CDir *dir = get_dirfrag(p->first); + ceph_assert(dir); + dout(10) << "checking ambiguous import " << *dir << dendl; + if (migrator->is_importing(dir->dirfrag()) && + migrator->get_import_peer(dir->dirfrag()) == from) { + ceph_assert(migrator->get_import_state(dir->dirfrag()) == Migrator::IMPORT_ACKING); + + // check if sender claims the subtree + bool claimed_by_sender = false; + for (const auto &q : m->subtrees) { + // an ambiguous import won't race with a refragmentation; it's appropriate to force here. + CDir *base = get_force_dirfrag(q.first, false); + if (!base || !base->contains(dir)) + continue; // base not dir or an ancestor of dir, clearly doesn't claim dir. 
+ + bool inside = true; + set<CDir*> bounds; + get_force_dirfrag_bound_set(q.second, bounds); + for (set<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p) { + CDir *bound = *p; + if (bound->contains(dir)) { + inside = false; // nope, bound is dir or parent of dir, not inside. + break; + } + } + if (inside) + claimed_by_sender = true; + } + + my_ambiguous_imports.erase(p); // no longer ambiguous. + if (claimed_by_sender) { + dout(7) << "ambiguous import failed on " << *dir << dendl; + migrator->import_reverse(dir); + } else { + dout(7) << "ambiguous import succeeded on " << *dir << dendl; + migrator->import_finish(dir, true); + } + } + p = next; + } + } + + // update my dir_auth values + // need to do this on recovering nodes _and_ bystanders (to resolve ambiguous + // migrations between other nodes) + for (const auto& p : m->subtrees) { + dout(10) << "peer claims " << p.first << " bounds " << p.second << dendl; + CDir *dir = get_force_dirfrag(p.first, !survivor); + if (!dir) + continue; + adjust_bounded_subtree_auth(dir, p.second, from); + try_subtree_merge(dir); + } + + show_subtrees(); + + // note ambiguous imports too + for (const auto& p : m->ambiguous_imports) { + dout(10) << "noting ambiguous import on " << p.first << " bounds " << p.second << dendl; + other_ambiguous_imports[from][p.first] = p.second; + } + + // learn other mds' pending snaptable commits. later when resolve finishes, we will reload + // snaptable cache from snapserver. This way, the snaptable cache gets synced among all mds + for (const auto& p : m->table_clients) { + dout(10) << " noting " << get_mdstable_name(p.type) + << " pending_commits " << p.pending_commits << dendl; + MDSTableClient *client = mds->get_table_client(p.type); + for (const auto& q : p.pending_commits) + client->notify_commit(q); + } + + // did i get them all? 
+ resolve_gather.erase(from); + + maybe_resolve_finish(); +} + +void MDCache::process_delayed_resolve() +{ + dout(10) << "process_delayed_resolve" << dendl; + map<mds_rank_t, cref_t<MMDSResolve>> tmp; + tmp.swap(delayed_resolve); + for (auto &p : tmp) { + handle_resolve(p.second); + } +} + +void MDCache::discard_delayed_resolve(mds_rank_t who) +{ + delayed_resolve.erase(who); +} + +void MDCache::maybe_resolve_finish() +{ + ceph_assert(resolve_ack_gather.empty()); + ceph_assert(resolve_need_rollback.empty()); + + if (!resolve_gather.empty()) { + dout(10) << "maybe_resolve_finish still waiting for resolves (" + << resolve_gather << ")" << dendl; + return; + } + + dout(10) << "maybe_resolve_finish got all resolves+resolve_acks, done." << dendl; + disambiguate_my_imports(); + finish_committed_leaders(); + + if (resolve_done) { + ceph_assert(mds->is_resolve()); + trim_unlinked_inodes(); + recalc_auth_bits(false); + resolve_done.release()->complete(0); + } else { + // I am a survivor. + maybe_send_pending_rejoins(); + } +} + +void MDCache::handle_resolve_ack(const cref_t<MMDSResolveAck> &ack) +{ + dout(10) << "handle_resolve_ack " << *ack << " from " << ack->get_source() << dendl; + mds_rank_t from = mds_rank_t(ack->get_source().num()); + + if (!resolve_ack_gather.count(from) || + mds->mdsmap->get_state(from) < MDSMap::STATE_RESOLVE) { + return; + } + + if (ambiguous_peer_updates.count(from)) { + ceph_assert(mds->mdsmap->is_clientreplay_or_active_or_stopping(from)); + ceph_assert(mds->is_clientreplay() || mds->is_active() || mds->is_stopping()); + } + + for (const auto &p : ack->commit) { + dout(10) << " commit on peer " << p.first << dendl; + + if (ambiguous_peer_updates.count(from)) { + remove_ambiguous_peer_update(p.first, from); + continue; + } + + if (mds->is_resolve()) { + // replay + MDPeerUpdate *su = get_uncommitted_peer(p.first, from); + ceph_assert(su); + + // log commit + mds->mdlog->start_submit_entry(new EPeerUpdate(mds->mdlog, "unknown", p.first, from, 
EPeerUpdate::OP_COMMIT, su->origop), + new C_MDC_PeerCommit(this, from, p.first)); + mds->mdlog->flush(); + + finish_uncommitted_peer(p.first); + } else { + MDRequestRef mdr = request_get(p.first); + // information about leader imported caps + if (p.second.length() > 0) + mdr->more()->inode_import.share(p.second); + + ceph_assert(mdr->peer_request == 0); // shouldn't be doing anything! + request_finish(mdr); + } + } + + for (const auto &metareq : ack->abort) { + dout(10) << " abort on peer " << metareq << dendl; + + if (mds->is_resolve()) { + MDPeerUpdate *su = get_uncommitted_peer(metareq, from); + ceph_assert(su); + + // perform rollback (and journal a rollback entry) + // note: this will hold up the resolve a bit, until the rollback entries journal. + MDRequestRef null_ref; + switch (su->origop) { + case EPeerUpdate::LINK: + mds->server->do_link_rollback(su->rollback, from, null_ref); + break; + case EPeerUpdate::RENAME: + mds->server->do_rename_rollback(su->rollback, from, null_ref); + break; + case EPeerUpdate::RMDIR: + mds->server->do_rmdir_rollback(su->rollback, from, null_ref); + break; + default: + ceph_abort(); + } + } else { + MDRequestRef mdr = request_get(metareq); + mdr->aborted = true; + if (mdr->peer_request) { + if (mdr->peer_did_prepare()) // journaling peer prepare ? 
+ add_rollback(metareq, from); + } else { + request_finish(mdr); + } + } + } + + if (!ambiguous_peer_updates.count(from)) { + resolve_ack_gather.erase(from); + maybe_finish_peer_resolve(); + } +} + +void MDCache::add_uncommitted_peer(metareqid_t reqid, LogSegment *ls, mds_rank_t leader, MDPeerUpdate *su) +{ + auto const &ret = uncommitted_peers.emplace(std::piecewise_construct, + std::forward_as_tuple(reqid), + std::forward_as_tuple()); + ceph_assert(ret.second); + ls->uncommitted_peers.insert(reqid); + upeer &u = ret.first->second; + u.leader = leader; + u.ls = ls; + u.su = su; + if (su == nullptr) { + return; + } + for(set<CInode*>::iterator p = su->olddirs.begin(); p != su->olddirs.end(); ++p) + uncommitted_peer_rename_olddir[*p]++; + for(set<CInode*>::iterator p = su->unlinked.begin(); p != su->unlinked.end(); ++p) + uncommitted_peer_unlink[*p]++; +} + +void MDCache::finish_uncommitted_peer(metareqid_t reqid, bool assert_exist) +{ + auto it = uncommitted_peers.find(reqid); + if (it == uncommitted_peers.end()) { + ceph_assert(!assert_exist); + return; + } + upeer &u = it->second; + MDPeerUpdate* su = u.su; + + if (!u.waiters.empty()) { + mds->queue_waiters(u.waiters); + } + u.ls->uncommitted_peers.erase(reqid); + uncommitted_peers.erase(it); + + if (su == nullptr) { + return; + } + // discard the non-auth subtree we renamed out of + for(set<CInode*>::iterator p = su->olddirs.begin(); p != su->olddirs.end(); ++p) { + CInode *diri = *p; + map<CInode*, int>::iterator it = uncommitted_peer_rename_olddir.find(diri); + ceph_assert(it != uncommitted_peer_rename_olddir.end()); + it->second--; + if (it->second == 0) { + uncommitted_peer_rename_olddir.erase(it); + auto&& ls = diri->get_dirfrags(); + for (const auto& dir : ls) { + CDir *root = get_subtree_root(dir); + if (root->get_dir_auth() == CDIR_AUTH_UNDEF) { + try_trim_non_auth_subtree(root); + if (dir != root) + break; + } + } + } else + ceph_assert(it->second > 0); + } + // removed the inodes that were unlinked by 
peer update + for(set<CInode*>::iterator p = su->unlinked.begin(); p != su->unlinked.end(); ++p) { + CInode *in = *p; + map<CInode*, int>::iterator it = uncommitted_peer_unlink.find(in); + ceph_assert(it != uncommitted_peer_unlink.end()); + it->second--; + if (it->second == 0) { + uncommitted_peer_unlink.erase(it); + if (!in->get_projected_parent_dn()) + mds->mdcache->remove_inode_recursive(in); + } else + ceph_assert(it->second > 0); + } + delete su; +} + +MDPeerUpdate* MDCache::get_uncommitted_peer(metareqid_t reqid, mds_rank_t leader) +{ + + MDPeerUpdate* su = nullptr; + auto it = uncommitted_peers.find(reqid); + if (it != uncommitted_peers.end() && + it->second.leader == leader) { + su = it->second.su; + } + return su; +} + +void MDCache::finish_rollback(metareqid_t reqid, MDRequestRef& mdr) { + auto p = resolve_need_rollback.find(reqid); + ceph_assert(p != resolve_need_rollback.end()); + if (mds->is_resolve()) { + finish_uncommitted_peer(reqid, false); + } else if (mdr) { + finish_uncommitted_peer(mdr->reqid, mdr->more()->peer_update_journaled); + } + resolve_need_rollback.erase(p); + maybe_finish_peer_resolve(); +} + +void MDCache::disambiguate_other_imports() +{ + dout(10) << "disambiguate_other_imports" << dendl; + + bool recovering = !(mds->is_clientreplay() || mds->is_active() || mds->is_stopping()); + // other nodes' ambiguous imports + for (map<mds_rank_t, map<dirfrag_t, vector<dirfrag_t> > >::iterator p = other_ambiguous_imports.begin(); + p != other_ambiguous_imports.end(); + ++p) { + mds_rank_t who = p->first; + dout(10) << "ambiguous imports for mds." << who << dendl; + + for (map<dirfrag_t, vector<dirfrag_t> >::iterator q = p->second.begin(); + q != p->second.end(); + ++q) { + dout(10) << " ambiguous import " << q->first << " bounds " << q->second << dendl; + // an ambiguous import will not race with a refragmentation; it's appropriate to force here. 
+ CDir *dir = get_force_dirfrag(q->first, recovering); + if (!dir) continue; + + if (dir->is_ambiguous_auth() || // works for me_ambig or if i am a surviving bystander + dir->authority() == CDIR_AUTH_UNDEF) { // resolving + dout(10) << " mds." << who << " did import " << *dir << dendl; + adjust_bounded_subtree_auth(dir, q->second, who); + try_subtree_merge(dir); + } else { + dout(10) << " mds." << who << " did not import " << *dir << dendl; + } + } + } + other_ambiguous_imports.clear(); +} + +void MDCache::disambiguate_my_imports() +{ + dout(10) << "disambiguate_my_imports" << dendl; + + if (!mds->is_resolve()) { + ceph_assert(my_ambiguous_imports.empty()); + return; + } + + disambiguate_other_imports(); + + // my ambiguous imports + mds_authority_t me_ambig(mds->get_nodeid(), mds->get_nodeid()); + while (!my_ambiguous_imports.empty()) { + map<dirfrag_t, vector<dirfrag_t> >::iterator q = my_ambiguous_imports.begin(); + + CDir *dir = get_dirfrag(q->first); + ceph_assert(dir); + + if (dir->authority() != me_ambig) { + dout(10) << "ambiguous import auth known, must not be me " << *dir << dendl; + cancel_ambiguous_import(dir); + + mds->mdlog->start_submit_entry(new EImportFinish(dir, false)); + + // subtree may have been swallowed by another node claiming dir + // as their own. + CDir *root = get_subtree_root(dir); + if (root != dir) + dout(10) << " subtree root is " << *root << dendl; + ceph_assert(root->dir_auth.first != mds->get_nodeid()); // no us! + try_trim_non_auth_subtree(root); + } else { + dout(10) << "ambiguous import auth unclaimed, must be me " << *dir << dendl; + finish_ambiguous_import(q->first); + mds->mdlog->start_submit_entry(new EImportFinish(dir, true)); + } + } + ceph_assert(my_ambiguous_imports.empty()); + mds->mdlog->flush(); + + // verify all my subtrees are unambiguous! 
+ for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin(); + p != subtrees.end(); + ++p) { + CDir *dir = p->first; + if (dir->is_ambiguous_dir_auth()) { + dout(0) << "disambiguate_imports uh oh, dir_auth is still ambiguous for " << *dir << dendl; + } + ceph_assert(!dir->is_ambiguous_dir_auth()); + } + + show_subtrees(); +} + + +void MDCache::add_ambiguous_import(dirfrag_t base, const vector<dirfrag_t>& bounds) +{ + ceph_assert(my_ambiguous_imports.count(base) == 0); + my_ambiguous_imports[base] = bounds; +} + + +void MDCache::add_ambiguous_import(CDir *base, const set<CDir*>& bounds) +{ + // make a list + vector<dirfrag_t> binos; + for (set<CDir*>::iterator p = bounds.begin(); + p != bounds.end(); + ++p) + binos.push_back((*p)->dirfrag()); + + // note: this can get called twice if the exporter fails during recovery + if (my_ambiguous_imports.count(base->dirfrag())) + my_ambiguous_imports.erase(base->dirfrag()); + + add_ambiguous_import(base->dirfrag(), binos); +} + +void MDCache::cancel_ambiguous_import(CDir *dir) +{ + dirfrag_t df = dir->dirfrag(); + ceph_assert(my_ambiguous_imports.count(df)); + dout(10) << "cancel_ambiguous_import " << df + << " bounds " << my_ambiguous_imports[df] + << " " << *dir + << dendl; + my_ambiguous_imports.erase(df); +} + +void MDCache::finish_ambiguous_import(dirfrag_t df) +{ + ceph_assert(my_ambiguous_imports.count(df)); + vector<dirfrag_t> bounds; + bounds.swap(my_ambiguous_imports[df]); + my_ambiguous_imports.erase(df); + + dout(10) << "finish_ambiguous_import " << df + << " bounds " << bounds + << dendl; + CDir *dir = get_dirfrag(df); + ceph_assert(dir); + + // adjust dir_auth, import maps + adjust_bounded_subtree_auth(dir, bounds, mds->get_nodeid()); + try_subtree_merge(dir); +} + +void MDCache::remove_inode_recursive(CInode *in) +{ + dout(10) << "remove_inode_recursive " << *in << dendl; + auto&& ls = in->get_dirfrags(); + for (const auto& subdir : ls) { + dout(10) << " removing dirfrag " << *subdir << dendl; + auto it = 
subdir->items.begin(); + while (it != subdir->items.end()) { + CDentry *dn = it->second; + ++it; + CDentry::linkage_t *dnl = dn->get_linkage(); + if (dnl->is_primary()) { + CInode *tin = dnl->get_inode(); + subdir->unlink_inode(dn, false); + remove_inode_recursive(tin); + } + subdir->remove_dentry(dn); + } + + if (subdir->is_subtree_root()) + remove_subtree(subdir); + in->close_dirfrag(subdir->dirfrag().frag); + } + remove_inode(in); +} + +bool MDCache::expire_recursive(CInode *in, expiremap &expiremap) +{ + ceph_assert(!in->is_auth()); + + dout(10) << __func__ << ":" << *in << dendl; + + // Recurse into any dirfrags beneath this inode + auto&& ls = in->get_dirfrags(); + for (const auto& subdir : ls) { + if (!in->is_mdsdir() && subdir->is_subtree_root()) { + dout(10) << __func__ << ": stray still has subtree " << *in << dendl; + return true; + } + + for (auto it = subdir->items.begin(); it != subdir->items.end();) { + CDentry *dn = it->second; + it++; + CDentry::linkage_t *dnl = dn->get_linkage(); + if (dnl->is_primary()) { + CInode *tin = dnl->get_inode(); + + /* Remote strays with linkage (i.e. 
hardlinks) should not be + * expired, because they may be the target of + * a rename() as the owning MDS shuts down */ + if (!tin->is_stray() && tin->get_inode()->nlink) { + dout(10) << __func__ << ": stray still has linkage " << *tin << dendl; + return true; + } + + const bool abort = expire_recursive(tin, expiremap); + if (abort) { + return true; + } + } + if (dn->lru_is_expireable()) { + trim_dentry(dn, expiremap); + } else { + dout(10) << __func__ << ": stray dn is not expireable " << *dn << dendl; + return true; + } + } + } + + return false; +} + +void MDCache::trim_unlinked_inodes() +{ + dout(7) << "trim_unlinked_inodes" << dendl; + int count = 0; + vector<CInode*> q; + for (auto &p : inode_map) { + CInode *in = p.second; + if (in->get_parent_dn() == NULL && !in->is_base()) { + dout(7) << " will trim from " << *in << dendl; + q.push_back(in); + } + + if (!(++count % mds->heartbeat_reset_grace())) + mds->heartbeat_reset(); + } + for (auto& in : q) { + remove_inode_recursive(in); + + if (!(++count % mds->heartbeat_reset_grace())) + mds->heartbeat_reset(); + } +} + +/** recalc_auth_bits() + * once subtree auth is disambiguated, we need to adjust all the + * auth and dirty bits in our cache before moving on. + */ +void MDCache::recalc_auth_bits(bool replay) +{ + dout(7) << "recalc_auth_bits " << (replay ? 
"(replay)" : "") << dendl; + + if (root) { + root->inode_auth.first = mds->mdsmap->get_root(); + bool auth = mds->get_nodeid() == root->inode_auth.first; + if (auth) { + root->state_set(CInode::STATE_AUTH); + } else { + root->state_clear(CInode::STATE_AUTH); + if (!replay) + root->state_set(CInode::STATE_REJOINING); + } + } + + set<CInode*> subtree_inodes; + for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin(); + p != subtrees.end(); + ++p) { + if (p->first->dir_auth.first == mds->get_nodeid()) + subtree_inodes.insert(p->first->inode); + } + + for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin(); + p != subtrees.end(); + ++p) { + if (p->first->inode->is_mdsdir()) { + CInode *in = p->first->inode; + bool auth = in->ino() == MDS_INO_MDSDIR(mds->get_nodeid()); + if (auth) { + in->state_set(CInode::STATE_AUTH); + } else { + in->state_clear(CInode::STATE_AUTH); + if (!replay) + in->state_set(CInode::STATE_REJOINING); + } + } + + std::queue<CDir*> dfq; // dirfrag queue + dfq.push(p->first); + + bool auth = p->first->authority().first == mds->get_nodeid(); + dout(10) << " subtree auth=" << auth << " for " << *p->first << dendl; + + while (!dfq.empty()) { + CDir *dir = dfq.front(); + dfq.pop(); + + // dir + if (auth) { + dir->state_set(CDir::STATE_AUTH); + } else { + dir->state_clear(CDir::STATE_AUTH); + if (!replay) { + // close empty non-auth dirfrag + if (!dir->is_subtree_root() && dir->get_num_any() == 0) { + dir->inode->close_dirfrag(dir->get_frag()); + continue; + } + dir->state_set(CDir::STATE_REJOINING); + dir->state_clear(CDir::STATE_COMPLETE); + if (dir->is_dirty()) + dir->mark_clean(); + } + } + + // dentries in this dir + for (auto &p : dir->items) { + // dn + CDentry *dn = p.second; + CDentry::linkage_t *dnl = dn->get_linkage(); + if (auth) { + dn->mark_auth(); + } else { + dn->clear_auth(); + if (!replay) { + dn->state_set(CDentry::STATE_REJOINING); + if (dn->is_dirty()) + dn->mark_clean(); + } + } + + if (dnl->is_primary()) { + // inode + CInode 
*in = dnl->get_inode(); + if (auth) { + in->state_set(CInode::STATE_AUTH); + } else { + in->state_clear(CInode::STATE_AUTH); + if (!replay) { + in->state_set(CInode::STATE_REJOINING); + if (in->is_dirty()) + in->mark_clean(); + if (in->is_dirty_parent()) + in->clear_dirty_parent(); + // avoid touching scatterlocks for our subtree roots! + if (subtree_inodes.count(in) == 0) + in->clear_scatter_dirty(); + } + } + // recurse? + if (in->is_dir()) { + auto&& dfv = in->get_nested_dirfrags(); + for (const auto& dir : dfv) { + dfq.push(dir); + } + } + } + } + } + } + + show_subtrees(); + show_cache(); +} + + + +// =========================================================================== +// REJOIN + +/* + * notes on scatterlock recovery: + * + * - recovering inode replica sends scatterlock data for any subtree + * roots (the only ones that are possibly dirty). + * + * - surviving auth incorporates any provided scatterlock data. any + * pending gathers are then finished, as with the other lock types. + * + * that takes care of surviving auth + (recovering replica)*. + * + * - surviving replica sends strong_inode, which includes current + * scatterlock state, AND any dirty scatterlock data. this + * provides the recovering auth with everything it might need. + * + * - recovering auth must pick initial scatterlock state based on + * (weak|strong) rejoins. + * - always assimilate scatterlock data (it can't hurt) + * - any surviving replica in SCATTER state -> SCATTER. otherwise, SYNC. + * - include base inode in ack for all inodes that saw scatterlock content + * + * also, for scatter gather, + * + * - auth increments {frag,r}stat.version on completion of any gather. + * + * - auth incorporates changes in a gather _only_ if the version + * matches. + * + * - replica discards changes any time the scatterlock syncs, and + * after recovery. 
+ */ + +void MDCache::dump_rejoin_status(Formatter *f) const +{ + f->open_object_section("rejoin_status"); + f->dump_stream("rejoin_gather") << rejoin_gather; + f->dump_stream("rejoin_ack_gather") << rejoin_ack_gather; + f->dump_unsigned("num_opening_inodes", cap_imports_num_opening); + f->close_section(); +} + +void MDCache::rejoin_start(MDSContext *rejoin_done_) +{ + dout(10) << "rejoin_start" << dendl; + ceph_assert(!rejoin_done); + rejoin_done.reset(rejoin_done_); + + rejoin_gather = recovery_set; + // we need to finish opening cap inodes before sending cache rejoins, + // so add ourselves to the gather set as well + rejoin_gather.insert(mds->get_nodeid()); + process_imported_caps(); +} + +/* + * rejoin phase! + * + * this initiates rejoin. it should be called before we get any + * rejoin or rejoin_ack messages (or else mdsmap distribution is broken). + * + * we start out by sending rejoins to everyone in the recovery set. + * + * if we are rejoining, send for all regions in our cache. + * if we are active|stopping, send only to nodes that are rejoining. + */ +void MDCache::rejoin_send_rejoins() +{ + dout(10) << "rejoin_send_rejoins with recovery_set " << recovery_set << dendl; + + if (rejoin_gather.count(mds->get_nodeid())) { + dout(7) << "rejoin_send_rejoins still processing imported caps, delaying" << dendl; + rejoins_pending = true; + return; + } + if (!resolve_gather.empty()) { + dout(7) << "rejoin_send_rejoins still waiting for resolves (" + << resolve_gather << ")" << dendl; + rejoins_pending = true; + return; + } + + ceph_assert(!migrator->is_importing()); + ceph_assert(!migrator->is_exporting()); + + if (!mds->is_rejoin()) { + disambiguate_other_imports(); + } + + map<mds_rank_t, ref_t<MMDSCacheRejoin>> rejoins; + + + // if i am rejoining, send a rejoin to everyone. + // otherwise, just send to others who are rejoining. + for (const auto& rank : recovery_set) { + if (rank == mds->get_nodeid()) continue; // nothing to myself! + if (rejoin_sent.count(rank)) continue; // already sent a rejoin to this node! 
+ if (mds->is_rejoin()) + rejoins[rank] = make_message<MMDSCacheRejoin>(MMDSCacheRejoin::OP_WEAK); + else if (mds->mdsmap->is_rejoin(rank)) + rejoins[rank] = make_message<MMDSCacheRejoin>(MMDSCacheRejoin::OP_STRONG); + } + + if (mds->is_rejoin()) { + map<client_t, pair<Session*, set<mds_rank_t> > > client_exports; + for (auto& p : cap_exports) { + mds_rank_t target = p.second.first; + if (rejoins.count(target) == 0) + continue; + for (auto q = p.second.second.begin(); q != p.second.second.end(); ) { + Session *session = nullptr; + auto it = client_exports.find(q->first); + if (it != client_exports.end()) { + session = it->second.first; + if (session) + it->second.second.insert(target); + } else { + session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v)); + auto& r = client_exports[q->first]; + r.first = session; + if (session) + r.second.insert(target); + } + if (session) { + ++q; + } else { + // remove reconnect with no session + p.second.second.erase(q++); + } + } + rejoins[target]->cap_exports[p.first] = p.second.second; + } + for (auto& p : client_exports) { + Session *session = p.second.first; + for (auto& q : p.second.second) { + auto rejoin = rejoins[q]; + rejoin->client_map[p.first] = session->info.inst; + rejoin->client_metadata_map[p.first] = session->info.client_metadata; + } + } + } + + + // check all subtrees + for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin(); + p != subtrees.end(); + ++p) { + CDir *dir = p->first; + ceph_assert(dir->is_subtree_root()); + if (dir->is_ambiguous_dir_auth()) { + // exporter is recovering, importer is survivor. + ceph_assert(rejoins.count(dir->authority().first)); + ceph_assert(!rejoins.count(dir->authority().second)); + continue; + } + + // my subtree? + if (dir->is_auth()) + continue; // skip my own regions! 
+ + mds_rank_t auth = dir->get_dir_auth().first; + ceph_assert(auth >= 0); + if (rejoins.count(auth) == 0) + continue; // don't care about this node's subtrees + + rejoin_walk(dir, rejoins[auth]); + } + + // rejoin root inodes, too + for (auto &p : rejoins) { + if (mds->is_rejoin()) { + // weak + if (p.first == 0 && root) { + p.second->add_weak_inode(root->vino()); + if (root->is_dirty_scattered()) { + dout(10) << " sending scatterlock state on root " << *root << dendl; + p.second->add_scatterlock_state(root); + } + } + if (CInode *in = get_inode(MDS_INO_MDSDIR(p.first))) { + if (in) + p.second->add_weak_inode(in->vino()); + } + } else { + // strong + if (p.first == 0 && root) { + p.second->add_strong_inode(root->vino(), + root->get_replica_nonce(), + root->get_caps_wanted(), + root->filelock.get_state(), + root->nestlock.get_state(), + root->dirfragtreelock.get_state()); + root->state_set(CInode::STATE_REJOINING); + if (root->is_dirty_scattered()) { + dout(10) << " sending scatterlock state on root " << *root << dendl; + p.second->add_scatterlock_state(root); + } + } + + if (CInode *in = get_inode(MDS_INO_MDSDIR(p.first))) { + p.second->add_strong_inode(in->vino(), + in->get_replica_nonce(), + in->get_caps_wanted(), + in->filelock.get_state(), + in->nestlock.get_state(), + in->dirfragtreelock.get_state()); + in->state_set(CInode::STATE_REJOINING); + } + } + } + + if (!mds->is_rejoin()) { + // i am survivor. send strong rejoin. 
+ // note request remote_auth_pins, xlocks + for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin(); + p != active_requests.end(); + ++p) { + MDRequestRef& mdr = p->second; + if (mdr->is_peer()) + continue; + // auth pins + for (const auto& q : mdr->object_states) { + if (q.second.remote_auth_pinned == MDS_RANK_NONE) + continue; + if (!q.first->is_auth()) { + mds_rank_t target = q.second.remote_auth_pinned; + ceph_assert(target == q.first->authority().first); + if (rejoins.count(target) == 0) continue; + const auto& rejoin = rejoins[target]; + + dout(15) << " " << *mdr << " authpin on " << *q.first << dendl; + MDSCacheObjectInfo i; + q.first->set_object_info(i); + if (i.ino) + rejoin->add_inode_authpin(vinodeno_t(i.ino, i.snapid), mdr->reqid, mdr->attempt); + else + rejoin->add_dentry_authpin(i.dirfrag, i.dname, i.snapid, mdr->reqid, mdr->attempt); + + if (mdr->has_more() && mdr->more()->is_remote_frozen_authpin && + mdr->more()->rename_inode == q.first) + rejoin->add_inode_frozen_authpin(vinodeno_t(i.ino, i.snapid), + mdr->reqid, mdr->attempt); + } + } + // xlocks + for (const auto& q : mdr->locks) { + auto lock = q.lock; + auto obj = lock->get_parent(); + if (q.is_xlock() && !obj->is_auth()) { + mds_rank_t who = obj->authority().first; + if (rejoins.count(who) == 0) continue; + const auto& rejoin = rejoins[who]; + + dout(15) << " " << *mdr << " xlock on " << *lock << " " << *obj << dendl; + MDSCacheObjectInfo i; + obj->set_object_info(i); + if (i.ino) + rejoin->add_inode_xlock(vinodeno_t(i.ino, i.snapid), lock->get_type(), + mdr->reqid, mdr->attempt); + else + rejoin->add_dentry_xlock(i.dirfrag, i.dname, i.snapid, + mdr->reqid, mdr->attempt); + } else if (q.is_remote_wrlock()) { + mds_rank_t who = q.wrlock_target; + if (rejoins.count(who) == 0) continue; + const auto& rejoin = rejoins[who]; + + dout(15) << " " << *mdr << " wrlock on " << *lock << " " << *obj << dendl; + MDSCacheObjectInfo i; + obj->set_object_info(i); + 
ceph_assert(i.ino); + rejoin->add_inode_wrlock(vinodeno_t(i.ino, i.snapid), lock->get_type(), + mdr->reqid, mdr->attempt); + } + } + } + } + + // send the messages + for (auto &p : rejoins) { + ceph_assert(rejoin_sent.count(p.first) == 0); + ceph_assert(rejoin_ack_gather.count(p.first) == 0); + rejoin_sent.insert(p.first); + rejoin_ack_gather.insert(p.first); + mds->send_message_mds(p.second, p.first); + } + rejoin_ack_gather.insert(mds->get_nodeid()); // we need to complete rejoin_gather_finish, too + rejoins_pending = false; + + // nothing? + if (mds->is_rejoin() && rejoin_gather.empty()) { + dout(10) << "nothing to rejoin" << dendl; + rejoin_gather_finish(); + } +} + + +/** + * rejoin_walk - build rejoin declarations for a subtree + * + * @param dir subtree root + * @param rejoin rejoin message + * + * from a rejoining node: + * weak dirfrag + * weak dentries (w/ connectivity) + * + * from a surviving node: + * strong dirfrag + * strong dentries (no connectivity!) + * strong inodes + */ +void MDCache::rejoin_walk(CDir *dir, const ref_t<MMDSCacheRejoin> &rejoin) +{ + dout(10) << "rejoin_walk " << *dir << dendl; + + std::vector<CDir*> nested; // finish this dir, then do nested items + + if (mds->is_rejoin()) { + // WEAK + rejoin->add_weak_dirfrag(dir->dirfrag()); + for (auto &p : dir->items) { + CDentry *dn = p.second; + ceph_assert(dn->last == CEPH_NOSNAP); + CDentry::linkage_t *dnl = dn->get_linkage(); + dout(15) << " add_weak_primary_dentry " << *dn << dendl; + ceph_assert(dnl->is_primary()); + CInode *in = dnl->get_inode(); + ceph_assert(dnl->get_inode()->is_dir()); + rejoin->add_weak_primary_dentry(dir->ino(), dn->get_name(), dn->first, dn->last, in->ino()); + { + auto&& dirs = in->get_nested_dirfrags(); + nested.insert(std::end(nested), std::begin(dirs), std::end(dirs)); + } + if (in->is_dirty_scattered()) { + dout(10) << " sending scatterlock state on " << *in << dendl; + rejoin->add_scatterlock_state(in); + } + } + } else { + // STRONG + dout(15) << " 
add_strong_dirfrag " << *dir << dendl; + rejoin->add_strong_dirfrag(dir->dirfrag(), dir->get_replica_nonce(), dir->get_dir_rep()); + dir->state_set(CDir::STATE_REJOINING); + + for (auto it = dir->items.begin(); it != dir->items.end(); ) { + CDentry *dn = it->second; + ++it; + dn->state_set(CDentry::STATE_REJOINING); + CDentry::linkage_t *dnl = dn->get_linkage(); + CInode *in = dnl->is_primary() ? dnl->get_inode() : NULL; + + // trim snap dentries. because they may have been pruned by + // their auth mds (snap deleted) + if (dn->last != CEPH_NOSNAP) { + if (in && !in->remote_parents.empty()) { + // unlink any stale remote snap dentry. + for (auto it2 = in->remote_parents.begin(); it2 != in->remote_parents.end(); ) { + CDentry *remote_dn = *it2; + ++it2; + ceph_assert(remote_dn->last != CEPH_NOSNAP); + remote_dn->unlink_remote(remote_dn->get_linkage()); + } + } + if (dn->lru_is_expireable()) { + if (!dnl->is_null()) + dir->unlink_inode(dn, false); + if (in) + remove_inode(in); + dir->remove_dentry(dn); + continue; + } else { + // Inventing null/remote dentry shouldn't cause problem + ceph_assert(!dnl->is_primary()); + } + } + + dout(15) << " add_strong_dentry " << *dn << dendl; + rejoin->add_strong_dentry(dir->dirfrag(), dn->get_name(), dn->get_alternate_name(), + dn->first, dn->last, + dnl->is_primary() ? dnl->get_inode()->ino():inodeno_t(0), + dnl->is_remote() ? dnl->get_remote_ino():inodeno_t(0), + dnl->is_remote() ? 
dnl->get_remote_d_type():0, + dn->get_replica_nonce(), + dn->lock.get_state()); + dn->state_set(CDentry::STATE_REJOINING); + if (dnl->is_primary()) { + CInode *in = dnl->get_inode(); + dout(15) << " add_strong_inode " << *in << dendl; + rejoin->add_strong_inode(in->vino(), + in->get_replica_nonce(), + in->get_caps_wanted(), + in->filelock.get_state(), + in->nestlock.get_state(), + in->dirfragtreelock.get_state()); + in->state_set(CInode::STATE_REJOINING); + { + auto&& dirs = in->get_nested_dirfrags(); + nested.insert(std::end(nested), std::begin(dirs), std::end(dirs)); + } + if (in->is_dirty_scattered()) { + dout(10) << " sending scatterlock state on " << *in << dendl; + rejoin->add_scatterlock_state(in); + } + } + } + } + + // recurse into nested dirs + for (const auto& dir : nested) { + rejoin_walk(dir, rejoin); + } +} + + +/* + * i got a rejoin. + * - reply with the lockstate + * + * if i am active|stopping, + * - remove source from replica list for everything not referenced here. + */ +void MDCache::handle_cache_rejoin(const cref_t<MMDSCacheRejoin> &m) +{ + dout(7) << "handle_cache_rejoin " << *m << " from " << m->get_source() + << " (" << m->get_payload().length() << " bytes)" + << dendl; + + switch (m->op) { + case MMDSCacheRejoin::OP_WEAK: + handle_cache_rejoin_weak(m); + break; + case MMDSCacheRejoin::OP_STRONG: + handle_cache_rejoin_strong(m); + break; + case MMDSCacheRejoin::OP_ACK: + handle_cache_rejoin_ack(m); + break; + + default: + ceph_abort(); + } +} + + +/* + * handle_cache_rejoin_weak + * + * the sender + * - is recovering from their journal. + * - may have incorrect (out of date) inode contents + * - will include weak dirfrag if sender is dirfrag auth and parent inode auth is recipient + * + * if the sender didn't trim_non_auth(), they + * - may have incorrect (out of date) dentry/inode linkage + * - may have deleted/purged inodes + * and i may have to go to disk to get accurate inode contents. yuck. 
+ */ +void MDCache::handle_cache_rejoin_weak(const cref_t<MMDSCacheRejoin> &weak) +{ + mds_rank_t from = mds_rank_t(weak->get_source().num()); + + // possible response(s) + ref_t<MMDSCacheRejoin> ack; // if survivor + set<vinodeno_t> acked_inodes; // if survivor + set<SimpleLock *> gather_locks; // if survivor + bool survivor = false; // am i a survivor? + + if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping()) { + survivor = true; + dout(10) << "i am a surivivor, and will ack immediately" << dendl; + ack = make_message<MMDSCacheRejoin>(MMDSCacheRejoin::OP_ACK); + + map<inodeno_t,map<client_t,Capability::Import> > imported_caps; + + // check cap exports + for (auto p = weak->cap_exports.begin(); p != weak->cap_exports.end(); ++p) { + CInode *in = get_inode(p->first); + ceph_assert(!in || in->is_auth()); + for (auto q = p->second.begin(); q != p->second.end(); ++q) { + dout(10) << " claiming cap import " << p->first << " client." << q->first << " on " << *in << dendl; + Capability *cap = rejoin_import_cap(in, q->first, q->second, from); + Capability::Import& im = imported_caps[p->first][q->first]; + if (cap) { + im.cap_id = cap->get_cap_id(); + im.issue_seq = cap->get_last_seq(); + im.mseq = cap->get_mseq(); + } else { + // all are zero + } + } + mds->locker->eval(in, CEPH_CAP_LOCKS, true); + } + + encode(imported_caps, ack->imported_caps); + } else { + ceph_assert(mds->is_rejoin()); + + // we may have already received a strong rejoin from the sender. + rejoin_scour_survivor_replicas(from, NULL, acked_inodes, gather_locks); + ceph_assert(gather_locks.empty()); + + // check cap exports. 
+ rejoin_client_map.insert(weak->client_map.begin(), weak->client_map.end()); + rejoin_client_metadata_map.insert(weak->client_metadata_map.begin(), + weak->client_metadata_map.end()); + + for (auto p = weak->cap_exports.begin(); p != weak->cap_exports.end(); ++p) { + CInode *in = get_inode(p->first); + ceph_assert(!in || in->is_auth()); + // note + for (auto q = p->second.begin(); q != p->second.end(); ++q) { + dout(10) << " claiming cap import " << p->first << " client." << q->first << dendl; + cap_imports[p->first][q->first][from] = q->second; + } + } + } + + // assimilate any potentially dirty scatterlock state + for (const auto &p : weak->inode_scatterlocks) { + CInode *in = get_inode(p.first); + ceph_assert(in); + in->decode_lock_state(CEPH_LOCK_IFILE, p.second.file); + in->decode_lock_state(CEPH_LOCK_INEST, p.second.nest); + in->decode_lock_state(CEPH_LOCK_IDFT, p.second.dft); + if (!survivor) + rejoin_potential_updated_scatterlocks.insert(in); + } + + // recovering peer may send incorrect dirfrags here. we need to + // infer which dirfrag they meant. the ack will include a + // strong_dirfrag that will set them straight on the fragmentation. 
+ + // walk weak map + set<CDir*> dirs_to_share; + for (const auto &p : weak->weak_dirfrags) { + CInode *diri = get_inode(p.ino); + if (!diri) + dout(0) << " missing dir ino " << p.ino << dendl; + ceph_assert(diri); + + frag_vec_t leaves; + if (diri->dirfragtree.is_leaf(p.frag)) { + leaves.push_back(p.frag); + } else { + diri->dirfragtree.get_leaves_under(p.frag, leaves); + if (leaves.empty()) + leaves.push_back(diri->dirfragtree[p.frag.value()]); + } + for (const auto& leaf : leaves) { + CDir *dir = diri->get_dirfrag(leaf); + if (!dir) { + dout(0) << " missing dir for " << p.frag << " (which maps to " << leaf << ") on " << *diri << dendl; + continue; + } + ceph_assert(dir); + if (dirs_to_share.count(dir)) { + dout(10) << " already have " << p.frag << " -> " << leaf << " " << *dir << dendl; + } else { + dirs_to_share.insert(dir); + unsigned nonce = dir->add_replica(from); + dout(10) << " have " << p.frag << " -> " << leaf << " " << *dir << dendl; + if (ack) { + ack->add_strong_dirfrag(dir->dirfrag(), nonce, dir->dir_rep); + ack->add_dirfrag_base(dir); + } + } + } + } + + for (const auto &p : weak->weak) { + CInode *diri = get_inode(p.first); + if (!diri) + dout(0) << " missing dir ino " << p.first << dendl; + ceph_assert(diri); + + // weak dentries + CDir *dir = 0; + for (const auto &q : p.second) { + // locate proper dirfrag. 
+ // optimize for common case (one dirfrag) to avoid dirs_to_share set check + frag_t fg = diri->pick_dirfrag(q.first.name); + if (!dir || dir->get_frag() != fg) { + dir = diri->get_dirfrag(fg); + if (!dir) + dout(0) << " missing dir frag " << fg << " on " << *diri << dendl; + ceph_assert(dir); + ceph_assert(dirs_to_share.count(dir)); + } + + // and dentry + CDentry *dn = dir->lookup(q.first.name, q.first.snapid); + ceph_assert(dn); + CDentry::linkage_t *dnl = dn->get_linkage(); + ceph_assert(dnl->is_primary()); + + if (survivor && dn->is_replica(from)) + dentry_remove_replica(dn, from, gather_locks); + unsigned dnonce = dn->add_replica(from); + dout(10) << " have " << *dn << dendl; + if (ack) + ack->add_strong_dentry(dir->dirfrag(), dn->get_name(), dn->get_alternate_name(), + dn->first, dn->last, + dnl->get_inode()->ino(), inodeno_t(0), 0, + dnonce, dn->lock.get_replica_state()); + + // inode + CInode *in = dnl->get_inode(); + ceph_assert(in); + + if (survivor && in->is_replica(from)) + inode_remove_replica(in, from, true, gather_locks); + unsigned inonce = in->add_replica(from); + dout(10) << " have " << *in << dendl; + + // scatter the dirlock, just in case? + if (!survivor && in->is_dir() && in->has_subtree_root_dirfrag()) + in->filelock.set_state(LOCK_MIX); + + if (ack) { + acked_inodes.insert(in->vino()); + ack->add_inode_base(in, mds->mdsmap->get_up_features()); + bufferlist bl; + in->_encode_locks_state_for_rejoin(bl, from); + ack->add_inode_locks(in, inonce, bl); + } + } + } + + // weak base inodes? (root, stray, etc.) + for (set<vinodeno_t>::iterator p = weak->weak_inodes.begin(); + p != weak->weak_inodes.end(); + ++p) { + CInode *in = get_inode(*p); + ceph_assert(in); // hmm fixme wrt stray? 
+ if (survivor && in->is_replica(from)) + inode_remove_replica(in, from, true, gather_locks); + unsigned inonce = in->add_replica(from); + dout(10) << " have base " << *in << dendl; + + if (ack) { + acked_inodes.insert(in->vino()); + ack->add_inode_base(in, mds->mdsmap->get_up_features()); + bufferlist bl; + in->_encode_locks_state_for_rejoin(bl, from); + ack->add_inode_locks(in, inonce, bl); + } + } + + ceph_assert(rejoin_gather.count(from)); + rejoin_gather.erase(from); + if (survivor) { + // survivor. do everything now. + for (const auto &p : weak->inode_scatterlocks) { + CInode *in = get_inode(p.first); + ceph_assert(in); + dout(10) << " including base inode (due to potential scatterlock update) " << *in << dendl; + acked_inodes.insert(in->vino()); + ack->add_inode_base(in, mds->mdsmap->get_up_features()); + } + + rejoin_scour_survivor_replicas(from, ack, acked_inodes, gather_locks); + mds->send_message(ack, weak->get_connection()); + + for (set<SimpleLock*>::iterator p = gather_locks.begin(); p != gather_locks.end(); ++p) { + if (!(*p)->is_stable()) + mds->locker->eval_gather(*p); + } + } else { + // done? + if (rejoin_gather.empty() && rejoin_ack_gather.count(mds->get_nodeid())) { + rejoin_gather_finish(); + } else { + dout(7) << "still need rejoin from (" << rejoin_gather << ")" << dendl; + } + } +} + +/* + * rejoin_scour_survivor_replica - remove source from replica list on unmentioned objects + * + * all validated replicas are acked with a strong nonce, etc. if that isn't in the + * ack, the replica dne, and we can remove it from our replica maps. + */ +void MDCache::rejoin_scour_survivor_replicas(mds_rank_t from, const cref_t<MMDSCacheRejoin> &ack, + set<vinodeno_t>& acked_inodes, + set<SimpleLock *>& gather_locks) +{ + dout(10) << "rejoin_scour_survivor_replicas from mds." << from << dendl; + + auto scour_func = [this, from, ack, &acked_inodes, &gather_locks] (CInode *in) { + // inode? 
+ if (in->is_auth() && + in->is_replica(from) && + (ack == NULL || acked_inodes.count(in->vino()) == 0)) { + inode_remove_replica(in, from, false, gather_locks); + dout(10) << " rem " << *in << dendl; + } + + if (!in->is_dir()) + return; + + const auto&& dfs = in->get_dirfrags(); + for (const auto& dir : dfs) { + if (!dir->is_auth()) + continue; + + if (dir->is_replica(from) && + (ack == NULL || ack->strong_dirfrags.count(dir->dirfrag()) == 0)) { + dir->remove_replica(from); + dout(10) << " rem " << *dir << dendl; + } + + // dentries + for (auto &p : dir->items) { + CDentry *dn = p.second; + + if (dn->is_replica(from)) { + if (ack) { + const auto it = ack->strong_dentries.find(dir->dirfrag()); + if (it != ack->strong_dentries.end() && it->second.count(string_snap_t(dn->get_name(), dn->last)) > 0) { + continue; + } + } + dentry_remove_replica(dn, from, gather_locks); + dout(10) << " rem " << *dn << dendl; + } + } + } + }; + + for (auto &p : inode_map) + scour_func(p.second); + for (auto &p : snap_inode_map) + scour_func(p.second); +} + + +CInode *MDCache::rejoin_invent_inode(inodeno_t ino, snapid_t last) +{ + CInode *in = new CInode(this, true, 2, last); + in->_get_inode()->ino = ino; + in->state_set(CInode::STATE_REJOINUNDEF); + add_inode(in); + rejoin_undef_inodes.insert(in); + dout(10) << " invented " << *in << dendl; + return in; +} + +CDir *MDCache::rejoin_invent_dirfrag(dirfrag_t df) +{ + CInode *in = get_inode(df.ino); + if (!in) + in = rejoin_invent_inode(df.ino, CEPH_NOSNAP); + if (!in->is_dir()) { + ceph_assert(in->state_test(CInode::STATE_REJOINUNDEF)); + in->_get_inode()->mode = S_IFDIR; + in->_get_inode()->dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash; + } + CDir *dir = in->get_or_open_dirfrag(this, df.frag); + dir->state_set(CDir::STATE_REJOINUNDEF); + rejoin_undef_dirfrags.insert(dir); + dout(10) << " invented " << *dir << dendl; + return dir; +} + +void MDCache::handle_cache_rejoin_strong(const cref_t<MMDSCacheRejoin> &strong) +{ + 
mds_rank_t from = mds_rank_t(strong->get_source().num()); + + // only a recovering node will get a strong rejoin. + if (!mds->is_rejoin()) { + if (mds->get_want_state() == MDSMap::STATE_REJOIN) { + mds->wait_for_rejoin(new C_MDS_RetryMessage(mds, strong)); + return; + } + ceph_abort_msg("got unexpected rejoin message during recovery"); + } + + // assimilate any potentially dirty scatterlock state + for (const auto &p : strong->inode_scatterlocks) { + CInode *in = get_inode(p.first); + ceph_assert(in); + in->decode_lock_state(CEPH_LOCK_IFILE, p.second.file); + in->decode_lock_state(CEPH_LOCK_INEST, p.second.nest); + in->decode_lock_state(CEPH_LOCK_IDFT, p.second.dft); + rejoin_potential_updated_scatterlocks.insert(in); + } + + rejoin_unlinked_inodes[from].clear(); + + // surviving peer may send incorrect dirfrag here (maybe they didn't + // get the fragment notify, or maybe we rolled back?). we need to + // infer the right frag and get them with the program. somehow. + // we don't normally send ACK.. so we'll need to bundle this with + // MISSING or something. + + // strong dirfrags/dentries. + // also process auth_pins, xlocks. 
+ for (const auto &p : strong->strong_dirfrags) { + auto& dirfrag = p.first; + CInode *diri = get_inode(dirfrag.ino); + if (!diri) + diri = rejoin_invent_inode(dirfrag.ino, CEPH_NOSNAP); + CDir *dir = diri->get_dirfrag(dirfrag.frag); + bool refragged = false; + if (dir) { + dout(10) << " have " << *dir << dendl; + } else { + if (diri->state_test(CInode::STATE_REJOINUNDEF)) + dir = rejoin_invent_dirfrag(dirfrag_t(diri->ino(), frag_t())); + else if (diri->dirfragtree.is_leaf(dirfrag.frag)) + dir = rejoin_invent_dirfrag(dirfrag); + } + if (dir) { + dir->add_replica(from, p.second.nonce); + dir->dir_rep = p.second.dir_rep; + } else { + dout(10) << " frag " << dirfrag << " doesn't match dirfragtree " << *diri << dendl; + frag_vec_t leaves; + diri->dirfragtree.get_leaves_under(dirfrag.frag, leaves); + if (leaves.empty()) + leaves.push_back(diri->dirfragtree[dirfrag.frag.value()]); + dout(10) << " maps to frag(s) " << leaves << dendl; + for (const auto& leaf : leaves) { + CDir *dir = diri->get_dirfrag(leaf); + if (!dir) + dir = rejoin_invent_dirfrag(dirfrag_t(diri->ino(), leaf)); + else + dout(10) << " have(approx) " << *dir << dendl; + dir->add_replica(from, p.second.nonce); + dir->dir_rep = p.second.dir_rep; + } + refragged = true; + } + + const auto it = strong->strong_dentries.find(dirfrag); + if (it != strong->strong_dentries.end()) { + const auto& dmap = it->second; + for (const auto &q : dmap) { + const string_snap_t& ss = q.first; + const MMDSCacheRejoin::dn_strong& d = q.second; + CDentry *dn; + if (!refragged) + dn = dir->lookup(ss.name, ss.snapid); + else { + frag_t fg = diri->pick_dirfrag(ss.name); + dir = diri->get_dirfrag(fg); + ceph_assert(dir); + dn = dir->lookup(ss.name, ss.snapid); + } + if (!dn) { + if (d.is_remote()) { + dn = dir->add_remote_dentry(ss.name, d.remote_ino, d.remote_d_type, mempool::mds_co::string(d.alternate_name), d.first, ss.snapid); + } else if (d.is_null()) { + dn = dir->add_null_dentry(ss.name, d.first, ss.snapid); + } else { + 
CInode *in = get_inode(d.ino, ss.snapid); + if (!in) in = rejoin_invent_inode(d.ino, ss.snapid); + dn = dir->add_primary_dentry(ss.name, in, mempool::mds_co::string(d.alternate_name), d.first, ss.snapid); + } + dout(10) << " invented " << *dn << dendl; + } + CDentry::linkage_t *dnl = dn->get_linkage(); + + // dn auth_pin? + const auto pinned_it = strong->authpinned_dentries.find(dirfrag); + if (pinned_it != strong->authpinned_dentries.end()) { + const auto peer_reqid_it = pinned_it->second.find(ss); + if (peer_reqid_it != pinned_it->second.end()) { + for (const auto &r : peer_reqid_it->second) { + dout(10) << " dn authpin by " << r << " on " << *dn << dendl; + + // get/create peer mdrequest + MDRequestRef mdr; + if (have_request(r.reqid)) + mdr = request_get(r.reqid); + else + mdr = request_start_peer(r.reqid, r.attempt, strong); + mdr->auth_pin(dn); + } + } + } + + // dn xlock? + const auto xlocked_it = strong->xlocked_dentries.find(dirfrag); + if (xlocked_it != strong->xlocked_dentries.end()) { + const auto ss_req_it = xlocked_it->second.find(ss); + if (ss_req_it != xlocked_it->second.end()) { + const MMDSCacheRejoin::peer_reqid& r = ss_req_it->second; + dout(10) << " dn xlock by " << r << " on " << *dn << dendl; + MDRequestRef mdr = request_get(r.reqid); // should have this from auth_pin above. 
+ ceph_assert(mdr->is_auth_pinned(dn)); + if (!mdr->is_xlocked(&dn->versionlock)) { + ceph_assert(dn->versionlock.can_xlock_local()); + dn->versionlock.get_xlock(mdr, mdr->get_client()); + mdr->emplace_lock(&dn->versionlock, MutationImpl::LockOp::XLOCK); + } + if (dn->lock.is_stable()) + dn->auth_pin(&dn->lock); + dn->lock.set_state(LOCK_XLOCK); + dn->lock.get_xlock(mdr, mdr->get_client()); + mdr->emplace_lock(&dn->lock, MutationImpl::LockOp::XLOCK); + } + } + + dn->add_replica(from, d.nonce); + dout(10) << " have " << *dn << dendl; + + if (dnl->is_primary()) { + if (d.is_primary()) { + if (vinodeno_t(d.ino, ss.snapid) != dnl->get_inode()->vino()) { + // the survivor missed MDentryUnlink+MDentryLink messages ? + ceph_assert(strong->strong_inodes.count(dnl->get_inode()->vino()) == 0); + CInode *in = get_inode(d.ino, ss.snapid); + ceph_assert(in); + ceph_assert(in->get_parent_dn()); + rejoin_unlinked_inodes[from].insert(in); + dout(7) << " sender has primary dentry but wrong inode" << dendl; + } + } else { + // the survivor missed MDentryLink message ? + ceph_assert(strong->strong_inodes.count(dnl->get_inode()->vino()) == 0); + dout(7) << " sender doesn't have primay dentry" << dendl; + } + } else { + if (d.is_primary()) { + // the survivor missed MDentryUnlink message ? + CInode *in = get_inode(d.ino, ss.snapid); + ceph_assert(in); + ceph_assert(in->get_parent_dn()); + rejoin_unlinked_inodes[from].insert(in); + dout(7) << " sender has primary dentry but we don't" << dendl; + } + } + } + } + } + + for (const auto &p : strong->strong_inodes) { + CInode *in = get_inode(p.first); + ceph_assert(in); + in->add_replica(from, p.second.nonce); + dout(10) << " have " << *in << dendl; + + const MMDSCacheRejoin::inode_strong& is = p.second; + + // caps_wanted + if (is.caps_wanted) { + in->set_mds_caps_wanted(from, is.caps_wanted); + dout(15) << " inode caps_wanted " << ccap_string(is.caps_wanted) + << " on " << *in << dendl; + } + + // scatterlocks? 
+ // infer state from replica state: + // * go to MIX if they might have wrlocks + // * go to LOCK if they are LOCK (just bc identify_files_to_recover might start twiddling filelock) + in->filelock.infer_state_from_strong_rejoin(is.filelock, !in->is_dir()); // maybe also go to LOCK + in->nestlock.infer_state_from_strong_rejoin(is.nestlock, false); + in->dirfragtreelock.infer_state_from_strong_rejoin(is.dftlock, false); + + // auth pin? + const auto authpinned_inodes_it = strong->authpinned_inodes.find(in->vino()); + if (authpinned_inodes_it != strong->authpinned_inodes.end()) { + for (const auto& r : authpinned_inodes_it->second) { + dout(10) << " inode authpin by " << r << " on " << *in << dendl; + + // get/create peer mdrequest + MDRequestRef mdr; + if (have_request(r.reqid)) + mdr = request_get(r.reqid); + else + mdr = request_start_peer(r.reqid, r.attempt, strong); + if (strong->frozen_authpin_inodes.count(in->vino())) { + ceph_assert(!in->get_num_auth_pins()); + mdr->freeze_auth_pin(in); + } else { + ceph_assert(!in->is_frozen_auth_pin()); + } + mdr->auth_pin(in); + } + } + // xlock(s)? + const auto xlocked_inodes_it = strong->xlocked_inodes.find(in->vino()); + if (xlocked_inodes_it != strong->xlocked_inodes.end()) { + for (const auto &q : xlocked_inodes_it->second) { + SimpleLock *lock = in->get_lock(q.first); + dout(10) << " inode xlock by " << q.second << " on " << *lock << " on " << *in << dendl; + MDRequestRef mdr = request_get(q.second.reqid); // should have this from auth_pin above. 
+ ceph_assert(mdr->is_auth_pinned(in)); + if (!mdr->is_xlocked(&in->versionlock)) { + ceph_assert(in->versionlock.can_xlock_local()); + in->versionlock.get_xlock(mdr, mdr->get_client()); + mdr->emplace_lock(&in->versionlock, MutationImpl::LockOp::XLOCK); + } + if (lock->is_stable()) + in->auth_pin(lock); + lock->set_state(LOCK_XLOCK); + if (lock == &in->filelock) + in->loner_cap = -1; + lock->get_xlock(mdr, mdr->get_client()); + mdr->emplace_lock(lock, MutationImpl::LockOp::XLOCK); + } + } + } + // wrlock(s)? + for (const auto &p : strong->wrlocked_inodes) { + CInode *in = get_inode(p.first); + for (const auto &q : p.second) { + SimpleLock *lock = in->get_lock(q.first); + for (const auto &r : q.second) { + dout(10) << " inode wrlock by " << r << " on " << *lock << " on " << *in << dendl; + MDRequestRef mdr = request_get(r.reqid); // should have this from auth_pin above. + if (in->is_auth()) + ceph_assert(mdr->is_auth_pinned(in)); + lock->set_state(LOCK_MIX); + if (lock == &in->filelock) + in->loner_cap = -1; + lock->get_wrlock(true); + mdr->emplace_lock(lock, MutationImpl::LockOp::WRLOCK); + } + } + } + + // done? 
+ ceph_assert(rejoin_gather.count(from)); + rejoin_gather.erase(from); + if (rejoin_gather.empty() && rejoin_ack_gather.count(mds->get_nodeid())) { + rejoin_gather_finish(); + } else { + dout(7) << "still need rejoin from (" << rejoin_gather << ")" << dendl; + } +} + +void MDCache::handle_cache_rejoin_ack(const cref_t<MMDSCacheRejoin> &ack) +{ + dout(7) << "handle_cache_rejoin_ack from " << ack->get_source() << dendl; + mds_rank_t from = mds_rank_t(ack->get_source().num()); + + ceph_assert(mds->get_state() >= MDSMap::STATE_REJOIN); + bool survivor = !mds->is_rejoin(); + + // for sending cache expire message + set<CInode*> isolated_inodes; + set<CInode*> refragged_inodes; + list<pair<CInode*,int> > updated_realms; + + // dirs + for (const auto &p : ack->strong_dirfrags) { + // we may have had incorrect dir fragmentation; refragment based + // on what they auth tells us. + CDir *dir = get_dirfrag(p.first); + if (!dir) { + dir = get_force_dirfrag(p.first, false); + if (dir) + refragged_inodes.insert(dir->get_inode()); + } + if (!dir) { + CInode *diri = get_inode(p.first.ino); + if (!diri) { + // barebones inode; the full inode loop below will clean up. + diri = new CInode(this, false); + auto _inode = diri->_get_inode(); + _inode->ino = p.first.ino; + _inode->mode = S_IFDIR; + _inode->dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash; + + add_inode(diri); + if (MDS_INO_MDSDIR(from) == p.first.ino) { + diri->inode_auth = mds_authority_t(from, CDIR_AUTH_UNKNOWN); + dout(10) << " add inode " << *diri << dendl; + } else { + diri->inode_auth = CDIR_AUTH_DEFAULT; + isolated_inodes.insert(diri); + dout(10) << " unconnected dirfrag " << p.first << dendl; + } + } + // barebones dirfrag; the full dirfrag loop below will clean up. 
+ dir = diri->add_dirfrag(new CDir(diri, p.first.frag, this, false)); + if (MDS_INO_MDSDIR(from) == p.first.ino || + (dir->authority() != CDIR_AUTH_UNDEF && + dir->authority().first != from)) + adjust_subtree_auth(dir, from); + dout(10) << " add dirfrag " << *dir << dendl; + } + + dir->set_replica_nonce(p.second.nonce); + dir->state_clear(CDir::STATE_REJOINING); + dout(10) << " got " << *dir << dendl; + + // dentries + auto it = ack->strong_dentries.find(p.first); + if (it != ack->strong_dentries.end()) { + for (const auto &q : it->second) { + CDentry *dn = dir->lookup(q.first.name, q.first.snapid); + if(!dn) + dn = dir->add_null_dentry(q.first.name, q.second.first, q.first.snapid); + + CDentry::linkage_t *dnl = dn->get_linkage(); + + ceph_assert(dn->last == q.first.snapid); + if (dn->first != q.second.first) { + dout(10) << " adjust dn.first " << dn->first << " -> " << q.second.first << " on " << *dn << dendl; + dn->first = q.second.first; + } + + // may have bad linkage if we missed dentry link/unlink messages + if (dnl->is_primary()) { + CInode *in = dnl->get_inode(); + if (!q.second.is_primary() || + vinodeno_t(q.second.ino, q.first.snapid) != in->vino()) { + dout(10) << " had bad linkage for " << *dn << ", unlinking " << *in << dendl; + dir->unlink_inode(dn); + } + } else if (dnl->is_remote()) { + if (!q.second.is_remote() || + q.second.remote_ino != dnl->get_remote_ino() || + q.second.remote_d_type != dnl->get_remote_d_type()) { + dout(10) << " had bad linkage for " << *dn << dendl; + dir->unlink_inode(dn); + } + } else { + if (!q.second.is_null()) + dout(10) << " had bad linkage for " << *dn << dendl; + } + + // hmm, did we have the proper linkage here? 
+ if (dnl->is_null() && !q.second.is_null()) { + if (q.second.is_remote()) { + dn->dir->link_remote_inode(dn, q.second.remote_ino, q.second.remote_d_type); + } else { + CInode *in = get_inode(q.second.ino, q.first.snapid); + if (!in) { + // barebones inode; assume it's dir, the full inode loop below will clean up. + in = new CInode(this, false, q.second.first, q.first.snapid); + auto _inode = in->_get_inode(); + _inode->ino = q.second.ino; + _inode->mode = S_IFDIR; + _inode->dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash; + add_inode(in); + dout(10) << " add inode " << *in << dendl; + } else if (in->get_parent_dn()) { + dout(10) << " had bad linkage for " << *(in->get_parent_dn()) + << ", unlinking " << *in << dendl; + in->get_parent_dir()->unlink_inode(in->get_parent_dn()); + } + dn->dir->link_primary_inode(dn, in); + isolated_inodes.erase(in); + } + } + + dn->set_replica_nonce(q.second.nonce); + dn->lock.set_state_rejoin(q.second.lock, rejoin_waiters, survivor); + dn->state_clear(CDentry::STATE_REJOINING); + dout(10) << " got " << *dn << dendl; + } + } + } + + for (const auto& in : refragged_inodes) { + auto&& ls = in->get_nested_dirfrags(); + for (const auto& dir : ls) { + if (dir->is_auth() || ack->strong_dirfrags.count(dir->dirfrag())) + continue; + ceph_assert(dir->get_num_any() == 0); + in->close_dirfrag(dir->get_frag()); + } + } + + // full dirfrags + for (const auto &p : ack->dirfrag_bases) { + CDir *dir = get_dirfrag(p.first); + ceph_assert(dir); + auto q = p.second.cbegin(); + dir->_decode_base(q); + dout(10) << " got dir replica " << *dir << dendl; + } + + // full inodes + auto p = ack->inode_base.cbegin(); + while (!p.end()) { + inodeno_t ino; + snapid_t last; + bufferlist basebl; + decode(ino, p); + decode(last, p); + decode(basebl, p); + CInode *in = get_inode(ino, last); + ceph_assert(in); + auto q = basebl.cbegin(); + snapid_t sseq = 0; + if (in->snaprealm) + sseq = in->snaprealm->srnode.seq; + in->_decode_base(q); + if (in->snaprealm && 
in->snaprealm->srnode.seq != sseq) { + int snap_op = sseq > 0 ? CEPH_SNAP_OP_UPDATE : CEPH_SNAP_OP_SPLIT; + updated_realms.push_back(pair<CInode*,int>(in, snap_op)); + } + dout(10) << " got inode base " << *in << dendl; + } + + // inodes + p = ack->inode_locks.cbegin(); + //dout(10) << "inode_locks len " << ack->inode_locks.length() << " is " << ack->inode_locks << dendl; + while (!p.end()) { + inodeno_t ino; + snapid_t last; + __u32 nonce; + bufferlist lockbl; + decode(ino, p); + decode(last, p); + decode(nonce, p); + decode(lockbl, p); + + CInode *in = get_inode(ino, last); + ceph_assert(in); + in->set_replica_nonce(nonce); + auto q = lockbl.cbegin(); + in->_decode_locks_rejoin(q, rejoin_waiters, rejoin_eval_locks, survivor); + in->state_clear(CInode::STATE_REJOINING); + dout(10) << " got inode locks " << *in << dendl; + } + + // FIXME: This can happen if entire subtree, together with the inode subtree root + // belongs to, were trimmed between sending cache rejoin and receiving rejoin ack. + ceph_assert(isolated_inodes.empty()); + + map<inodeno_t,map<client_t,Capability::Import> > peer_imported; + auto bp = ack->imported_caps.cbegin(); + decode(peer_imported, bp); + + for (map<inodeno_t,map<client_t,Capability::Import> >::iterator p = peer_imported.begin(); + p != peer_imported.end(); + ++p) { + auto& ex = cap_exports.at(p->first); + ceph_assert(ex.first == from); + for (map<client_t,Capability::Import>::iterator q = p->second.begin(); + q != p->second.end(); + ++q) { + auto r = ex.second.find(q->first); + ceph_assert(r != ex.second.end()); + + dout(10) << " exporting caps for client." << q->first << " ino " << p->first << dendl; + Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v)); + if (!session) { + dout(10) << " no session for client." << p->first << dendl; + ex.second.erase(r); + continue; + } + + // mark client caps stale. 
+ auto m = make_message<MClientCaps>(CEPH_CAP_OP_EXPORT, p->first, 0, + r->second.capinfo.cap_id, 0, + mds->get_osd_epoch_barrier()); + m->set_cap_peer(q->second.cap_id, q->second.issue_seq, q->second.mseq, + (q->second.cap_id > 0 ? from : -1), 0); + mds->send_message_client_counted(m, session); + + ex.second.erase(r); + } + ceph_assert(ex.second.empty()); + } + + for (auto p : updated_realms) { + CInode *in = p.first; + bool notify_clients; + if (mds->is_rejoin()) { + if (!rejoin_pending_snaprealms.count(in)) { + in->get(CInode::PIN_OPENINGSNAPPARENTS); + rejoin_pending_snaprealms.insert(in); + } + notify_clients = false; + } else { + // notify clients if I'm survivor + notify_clients = true; + } + do_realm_invalidate_and_update_notify(in, p.second, notify_clients); + } + + // done? + ceph_assert(rejoin_ack_gather.count(from)); + rejoin_ack_gather.erase(from); + if (!survivor) { + if (rejoin_gather.empty()) { + // eval unstable scatter locks after all wrlocks are rejoined. + while (!rejoin_eval_locks.empty()) { + SimpleLock *lock = rejoin_eval_locks.front(); + rejoin_eval_locks.pop_front(); + if (!lock->is_stable()) + mds->locker->eval_gather(lock); + } + } + + if (rejoin_gather.empty() && // make sure we've gotten our FULL inodes, too. + rejoin_ack_gather.empty()) { + // finally, kickstart past snap parent opens + open_snaprealms(); + } else { + dout(7) << "still need rejoin from (" << rejoin_gather << ")" + << ", rejoin_ack from (" << rejoin_ack_gather << ")" << dendl; + } + } else { + // survivor. + mds->queue_waiters(rejoin_waiters); + } +} + +/** + * rejoin_trim_undef_inodes() -- remove REJOINUNDEF flagged inodes + * + * FIXME: wait, can this actually happen? a survivor should generate cache trim + * messages that clean these guys up... 
+ */ +void MDCache::rejoin_trim_undef_inodes() +{ + dout(10) << "rejoin_trim_undef_inodes" << dendl; + + while (!rejoin_undef_inodes.empty()) { + set<CInode*>::iterator p = rejoin_undef_inodes.begin(); + CInode *in = *p; + rejoin_undef_inodes.erase(p); + + in->clear_replica_map(); + + // close out dirfrags + if (in->is_dir()) { + const auto&& dfls = in->get_dirfrags(); + for (const auto& dir : dfls) { + dir->clear_replica_map(); + + for (auto &p : dir->items) { + CDentry *dn = p.second; + dn->clear_replica_map(); + + dout(10) << " trimming " << *dn << dendl; + dir->remove_dentry(dn); + } + + dout(10) << " trimming " << *dir << dendl; + in->close_dirfrag(dir->dirfrag().frag); + } + } + + CDentry *dn = in->get_parent_dn(); + if (dn) { + dn->clear_replica_map(); + dout(10) << " trimming " << *dn << dendl; + dn->dir->remove_dentry(dn); + } else { + dout(10) << " trimming " << *in << dendl; + remove_inode(in); + } + } + + ceph_assert(rejoin_undef_inodes.empty()); +} + +void MDCache::rejoin_gather_finish() +{ + dout(10) << "rejoin_gather_finish" << dendl; + ceph_assert(mds->is_rejoin()); + ceph_assert(rejoin_ack_gather.count(mds->get_nodeid())); + + if (open_undef_inodes_dirfrags()) + return; + + if (process_imported_caps()) + return; + + choose_lock_states_and_reconnect_caps(); + + identify_files_to_recover(); + rejoin_send_acks(); + + // signal completion of fetches, rejoin_gather_finish, etc. + rejoin_ack_gather.erase(mds->get_nodeid()); + + // did we already get our acks too? 
+ if (rejoin_ack_gather.empty()) { + // finally, open snaprealms + open_snaprealms(); + } +} + +class C_MDC_RejoinOpenInoFinish: public MDCacheContext { + inodeno_t ino; +public: + C_MDC_RejoinOpenInoFinish(MDCache *c, inodeno_t i) : MDCacheContext(c), ino(i) {} + void finish(int r) override { + mdcache->rejoin_open_ino_finish(ino, r); + } +}; + +void MDCache::rejoin_open_ino_finish(inodeno_t ino, int ret) +{ + dout(10) << "open_caps_inode_finish ino " << ino << " ret " << ret << dendl; + + if (ret < 0) { + cap_imports_missing.insert(ino); + } else if (ret == mds->get_nodeid()) { + ceph_assert(get_inode(ino)); + } else { + auto p = cap_imports.find(ino); + ceph_assert(p != cap_imports.end()); + for (auto q = p->second.begin(); q != p->second.end(); ++q) { + ceph_assert(q->second.count(MDS_RANK_NONE)); + ceph_assert(q->second.size() == 1); + rejoin_export_caps(p->first, q->first, q->second[MDS_RANK_NONE], ret); + } + cap_imports.erase(p); + } + + ceph_assert(cap_imports_num_opening > 0); + cap_imports_num_opening--; + + if (cap_imports_num_opening == 0) { + if (rejoin_gather.empty() && rejoin_ack_gather.count(mds->get_nodeid())) + rejoin_gather_finish(); + else if (rejoin_gather.count(mds->get_nodeid())) + process_imported_caps(); + } +} + +class C_MDC_RejoinSessionsOpened : public MDCacheLogContext { +public: + map<client_t,pair<Session*,uint64_t> > session_map; + C_MDC_RejoinSessionsOpened(MDCache *c) : MDCacheLogContext(c) {} + void finish(int r) override { + ceph_assert(r == 0); + mdcache->rejoin_open_sessions_finish(session_map); + } +}; + +void MDCache::rejoin_open_sessions_finish(map<client_t,pair<Session*,uint64_t> >& session_map) +{ + dout(10) << "rejoin_open_sessions_finish" << dendl; + mds->server->finish_force_open_sessions(session_map); + rejoin_session_map.swap(session_map); + if (rejoin_gather.empty() && rejoin_ack_gather.count(mds->get_nodeid())) + rejoin_gather_finish(); +} + +void MDCache::rejoin_prefetch_ino_finish(inodeno_t ino, int ret) +{ + 
// rejoin_prefetch_ino_finish(), continued.
  auto p = cap_imports.find(ino);
  if (p != cap_imports.end()) {
    dout(10) << __func__ << " ino " << ino << " ret " << ret << dendl;
    if (ret < 0) {
      cap_imports_missing.insert(ino);
    } else if (ret != mds->get_nodeid()) {
      // ret is passed on as the export target; same per-client invariants
      // as in rejoin_open_ino_finish()
      for (auto q = p->second.begin(); q != p->second.end(); ++q) {
	ceph_assert(q->second.count(MDS_RANK_NONE));
	ceph_assert(q->second.size() == 1);
	rejoin_export_caps(p->first, q->first, q->second[MDS_RANK_NONE], ret);
      }
      cap_imports.erase(p);
    }
  }
}

/*
 * Process pending cap imports during rejoin.
 *
 * Returns true when asynchronous work was started (open file table
 * prefetch, inode opens, or session force-open) and the caller must wait
 * to be re-entered; returns false when this pass is complete.
 */
bool MDCache::process_imported_caps()
{
  dout(10) << "process_imported_caps" << dendl;

  // wait for the open file table prefetch first; we are re-entered from
  // the callback once it completes
  if (!open_file_table.is_prefetched() &&
      open_file_table.prefetch_inodes()) {
    open_file_table.wait_for_prefetch(
	new MDSInternalContextWrapper(mds,
	  new LambdaContext([this](int r) {
	    ceph_assert(rejoin_gather.count(mds->get_nodeid()));
	    process_imported_caps();
	    })
	  )
	);
    return true;
  }

  open_ino_batch_start();

  // issue an open_ino() for every cap-import inode that is neither in
  // cache nor already known to be missing
  for (auto& p : cap_imports) {
    CInode *in = get_inode(p.first);
    if (in) {
      ceph_assert(in->is_auth());
      cap_imports_missing.erase(p.first);
      continue;
    }
    if (cap_imports_missing.count(p.first) > 0)
      continue;

    // look for a reconnect record whose path is a single component
    // relative to pathbase; if found it is used as an ancestor hint below
    uint64_t parent_ino = 0;
    std::string_view d_name;
    for (auto& q : p.second) {
      for (auto& r : q.second) {
	auto &icr = r.second;
	if (icr.capinfo.pathbase &&
	    icr.path.length() > 0 &&
	    icr.path.find('/') == string::npos) {
	  parent_ino = icr.capinfo.pathbase;
	  d_name = icr.path;
	  break;
	}
      }
      if (parent_ino)
	break;
    }

    dout(10) << " opening missing ino " << p.first << dendl;
    cap_imports_num_opening++;
    auto fin = new C_MDC_RejoinOpenInoFinish(this, p.first);
    if (parent_ino) {
      vector<inode_backpointer_t> ancestors;
      ancestors.push_back(inode_backpointer_t(parent_ino, string{d_name}, 0));
      open_ino(p.first, (int64_t)-1, fin, false, false, &ancestors);
    } else {
      open_ino(p.first, (int64_t)-1, fin, false);
    }
    // periodically reset the heartbeat while queueing many opens
    if (!(cap_imports_num_opening % mds->heartbeat_reset_grace()))
      mds->heartbeat_reset();
  }

// process_imported_caps(), continued: submit the batched inode opens.
  open_ino_batch_submit();

  // opens are in flight; rejoin_open_ino_finish() resumes processing
  if (cap_imports_num_opening > 0)
    return true;

  // called by rejoin_gather_finish() ?
  if (rejoin_gather.count(mds->get_nodeid()) == 0) {
    // client sessions must be (force-)opened and journaled before caps can
    // be attached; rejoin_open_sessions_finish() resumes processing
    if (!rejoin_client_map.empty() &&
	rejoin_session_map.empty()) {
      C_MDC_RejoinSessionsOpened *finish = new C_MDC_RejoinSessionsOpened(this);
      version_t pv = mds->server->prepare_force_open_sessions(rejoin_client_map,
							      rejoin_client_metadata_map,
							      finish->session_map);
      ESessions *le = new ESessions(pv, std::move(rejoin_client_map),
				    std::move(rejoin_client_metadata_map));
      mds->mdlog->start_submit_entry(le, finish);
      mds->mdlog->flush();
      // both maps were moved from above; reset them to a known-empty state
      rejoin_client_map.clear();
      rejoin_client_metadata_map.clear();
      return true;
    }

    // process caps that were exported by peer rename
    for (map<inodeno_t,pair<mds_rank_t,map<client_t,Capability::Export> > >::iterator p = rejoin_peer_exports.begin();
	 p != rejoin_peer_exports.end();
	 ++p) {
      CInode *in = get_inode(p->first);
      ceph_assert(in);
      for (map<client_t,Capability::Export>::iterator q = p->second.second.begin();
	   q != p->second.second.end();
	   ++q) {
	// skip clients for which no session was opened
	auto r = rejoin_session_map.find(q->first);
	if (r == rejoin_session_map.end())
	  continue;

	Session *session = r->second.first;
	Capability *cap = in->get_client_cap(q->first);
	if (!cap) {
	  cap = in->add_client_cap(q->first, session);
	  // add empty item to reconnected_caps
	  (void)reconnected_caps[p->first][q->first];
	}
	cap->merge(q->second, true);

	// the merged cap must agree with what was recorded as imported
	Capability::Import& im = rejoin_imported_caps[p->second.first][p->first][q->first];
	ceph_assert(cap->get_last_seq() == im.issue_seq);
	ceph_assert(cap->get_mseq() == im.mseq);
	cap->set_cap_id(im.cap_id);
	// send cap import because we assigned a new cap ID
	do_cap_import(session, in, cap, q->second.cap_id, q->second.seq, q->second.mseq - 1,
		      p->second.first, CEPH_CAP_FLAG_AUTH);
      }
    }
    rejoin_peer_exports.clear();
    rejoin_imported_caps.clear();

    // process cap imports
    //  ino -> client -> frommds -> capex
// process_imported_caps(), continued: reconnect caps for inodes now in cache.
    for (auto p = cap_imports.begin(); p != cap_imports.end(); ) {
      CInode *in = get_inode(p->first);
      if (!in) {
	// leave the entry in cap_imports for a later attempt
	dout(10) << " still missing ino " << p->first
		 << ", will try again after replayed client requests" << dendl;
	++p;
	continue;
      }
      ceph_assert(in->is_auth());
      for (auto q = p->second.begin(); q != p->second.end(); ++q) {
	// session stays null when the client has no entry in rejoin_session_map
	Session *session;
	{
	  auto r = rejoin_session_map.find(q->first);
	  session = (r != rejoin_session_map.end() ? r->second.first : nullptr);
	}

	for (auto r = q->second.begin(); r != q->second.end(); ++r) {
	  if (!session) {
	    // no session: only create a default (zeroed) import record for
	    // caps that came from another rank
	    if (r->first >= 0)
	      (void)rejoin_imported_caps[r->first][p->first][q->first]; // all are zero
	    continue;
	  }

	  Capability *cap = in->reconnect_cap(q->first, r->second, session);
	  add_reconnected_cap(q->first, in->ino(), r->second);
	  if (r->first >= 0) {
	    if (cap->get_last_seq() == 0) // don't increase mseq if cap already exists
	      cap->inc_mseq();
	    do_cap_import(session, in, cap, r->second.capinfo.cap_id, 0, 0, r->first, 0);

	    // record what was imported, keyed by source rank / ino / client
	    Capability::Import& im = rejoin_imported_caps[r->first][p->first][q->first];
	    im.cap_id = cap->get_cap_id();
	    im.issue_seq = cap->get_last_seq();
	    im.mseq = cap->get_mseq();
	  }
	}
      }
      cap_imports.erase(p++);  // remove and move on
    }
  } else {
    // this rank is still listed in rejoin_gather: trim non-auth metadata,
    // take ourselves out of the gather and send any pending rejoins
    trim_non_auth();

    ceph_assert(rejoin_gather.count(mds->get_nodeid()));
    rejoin_gather.erase(mds->get_nodeid());
    ceph_assert(!rejoin_ack_gather.count(mds->get_nodeid()));
    maybe_send_pending_rejoins();
  }
  return false;
}

/*
 * Re-establish need_snapflush state on head_in for `client`, for snapshots
 * in `realm` after snap_follows.  Does nothing when the realm has no snaps
 * in (snap_follows, head_in->first - 1].
 */
void MDCache::rebuild_need_snapflush(CInode *head_in, SnapRealm *realm,
				     client_t client, snapid_t snap_follows)
{
  dout(10) << "rebuild_need_snapflush " << snap_follows << " on " << *head_in << dendl;

  if (!realm->has_snaps_in_range(snap_follows + 1, head_in->first - 1))
    return;

  const set<snapid_t>& snaps = realm->get_snaps();
  snapid_t follows = snap_follows;

  // walk the snapped inodes returned by pick_inode_snap() until it hands
  // back the head inode itself
  while (true) {
    CInode *in = pick_inode_snap(head_in, follows);
    if (in == head_in)
      break;

    bool
need_snapflush = false; + for (auto p = snaps.lower_bound(std::max<snapid_t>(in->first, (follows + 1))); + p != snaps.end() && *p <= in->last; + ++p) { + head_in->add_need_snapflush(in, *p, client); + need_snapflush = true; + } + follows = in->last; + if (!need_snapflush) + continue; + + dout(10) << " need snapflush from client." << client << " on " << *in << dendl; + + if (in->client_snap_caps.empty()) { + for (int i = 0; i < num_cinode_locks; i++) { + int lockid = cinode_lock_info[i].lock; + SimpleLock *lock = in->get_lock(lockid); + ceph_assert(lock); + in->auth_pin(lock); + lock->set_state(LOCK_SNAP_SYNC); + lock->get_wrlock(true); + } + } + in->client_snap_caps.insert(client); + mds->locker->mark_need_snapflush_inode(in); + } +} + +/* + * choose lock states based on reconnected caps + */ +void MDCache::choose_lock_states_and_reconnect_caps() +{ + dout(10) << "choose_lock_states_and_reconnect_caps" << dendl; + + int count = 0; + for (auto p : inode_map) { + CInode *in = p.second; + if (in->last != CEPH_NOSNAP) + continue; + + if (in->is_auth() && !in->is_base() && in->get_inode()->is_dirty_rstat()) + in->mark_dirty_rstat(); + + int dirty_caps = 0; + auto q = reconnected_caps.find(in->ino()); + if (q != reconnected_caps.end()) { + for (const auto &it : q->second) + dirty_caps |= it.second.dirty_caps; + } + in->choose_lock_states(dirty_caps); + dout(15) << " chose lock states on " << *in << dendl; + + if (in->snaprealm && !rejoin_pending_snaprealms.count(in)) { + in->get(CInode::PIN_OPENINGSNAPPARENTS); + rejoin_pending_snaprealms.insert(in); + } + + if (!(++count % mds->heartbeat_reset_grace())) + mds->heartbeat_reset(); + } +} + +void MDCache::prepare_realm_split(SnapRealm *realm, client_t client, inodeno_t ino, + map<client_t,ref_t<MClientSnap>>& splits) +{ + ref_t<MClientSnap> snap; + auto it = splits.find(client); + if (it != splits.end()) { + snap = it->second; + snap->head.op = CEPH_SNAP_OP_SPLIT; + } else { + snap = 
make_message<MClientSnap>(CEPH_SNAP_OP_SPLIT); + splits.emplace(std::piecewise_construct, std::forward_as_tuple(client), std::forward_as_tuple(snap)); + snap->head.split = realm->inode->ino(); + snap->bl = mds->server->get_snap_trace(client, realm); + + for (const auto& child : realm->open_children) + snap->split_realms.push_back(child->inode->ino()); + } + snap->split_inos.push_back(ino); +} + +void MDCache::prepare_realm_merge(SnapRealm *realm, SnapRealm *parent_realm, + map<client_t,ref_t<MClientSnap>>& splits) +{ + ceph_assert(parent_realm); + + vector<inodeno_t> split_inos; + vector<inodeno_t> split_realms; + + for (auto p = realm->inodes_with_caps.begin(); !p.end(); ++p) + split_inos.push_back((*p)->ino()); + for (set<SnapRealm*>::iterator p = realm->open_children.begin(); + p != realm->open_children.end(); + ++p) + split_realms.push_back((*p)->inode->ino()); + + for (const auto& p : realm->client_caps) { + ceph_assert(!p.second->empty()); + auto em = splits.emplace(std::piecewise_construct, std::forward_as_tuple(p.first), std::forward_as_tuple()); + if (em.second) { + auto update = make_message<MClientSnap>(CEPH_SNAP_OP_SPLIT); + update->head.split = parent_realm->inode->ino(); + update->split_inos = split_inos; + update->split_realms = split_realms; + update->bl = mds->server->get_snap_trace(p.first, parent_realm); + em.first->second = std::move(update); + } + } +} + +void MDCache::send_snaps(map<client_t,ref_t<MClientSnap>>& splits) +{ + dout(10) << "send_snaps" << dendl; + + for (auto &p : splits) { + Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(p.first.v)); + if (session) { + dout(10) << " client." << p.first + << " split " << p.second->head.split + << " inos " << p.second->split_inos + << dendl; + mds->send_message_client_counted(p.second, session); + } else { + dout(10) << " no session for client." 
<< p.first << dendl; + } + } + splits.clear(); +} + + +/* + * remove any items from logsegment open_file lists that don't have + * any caps + */ +void MDCache::clean_open_file_lists() +{ + dout(10) << "clean_open_file_lists" << dendl; + + for (map<uint64_t,LogSegment*>::iterator p = mds->mdlog->segments.begin(); + p != mds->mdlog->segments.end(); + ++p) { + LogSegment *ls = p->second; + + elist<CInode*>::iterator q = ls->open_files.begin(member_offset(CInode, item_open_file)); + while (!q.end()) { + CInode *in = *q; + ++q; + if (in->last == CEPH_NOSNAP) { + dout(10) << " unlisting unwanted/capless inode " << *in << dendl; + in->item_open_file.remove_myself(); + } else { + if (in->client_snap_caps.empty()) { + dout(10) << " unlisting flushed snap inode " << *in << dendl; + in->item_open_file.remove_myself(); + } + } + } + } +} + +void MDCache::dump_openfiles(Formatter *f) +{ + f->open_array_section("openfiles"); + for (auto p = mds->mdlog->segments.begin(); + p != mds->mdlog->segments.end(); + ++p) { + LogSegment *ls = p->second; + + auto q = ls->open_files.begin(member_offset(CInode, item_open_file)); + while (!q.end()) { + CInode *in = *q; + ++q; + if ((in->last == CEPH_NOSNAP && !in->is_any_caps_wanted()) + || (in->last != CEPH_NOSNAP && in->client_snap_caps.empty())) + continue; + f->open_object_section("file"); + in->dump(f, CInode::DUMP_PATH | CInode::DUMP_INODE_STORE_BASE | CInode::DUMP_CAPS); + f->close_section(); + } + } + f->close_section(); +} + +Capability* MDCache::rejoin_import_cap(CInode *in, client_t client, const cap_reconnect_t& icr, mds_rank_t frommds) +{ + dout(10) << "rejoin_import_cap for client." << client << " from mds." << frommds + << " on " << *in << dendl; + Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(client.v)); + if (!session) { + dout(10) << " no session for client." 
<< client << dendl; + return NULL; + } + + Capability *cap = in->reconnect_cap(client, icr, session); + + if (frommds >= 0) { + if (cap->get_last_seq() == 0) // don't increase mseq if cap already exists + cap->inc_mseq(); + do_cap_import(session, in, cap, icr.capinfo.cap_id, 0, 0, frommds, 0); + } + + return cap; +} + +void MDCache::export_remaining_imported_caps() +{ + dout(10) << "export_remaining_imported_caps" << dendl; + + CachedStackStringStream css; + + int count = 0; + for (auto p = cap_imports.begin(); p != cap_imports.end(); ++p) { + *css << " ino " << p->first << "\n"; + for (auto q = p->second.begin(); q != p->second.end(); ++q) { + Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v)); + if (session) { + // mark client caps stale. + auto stale = make_message<MClientCaps>(CEPH_CAP_OP_EXPORT, p->first, + 0, 0, 0, + mds->get_osd_epoch_barrier()); + stale->set_cap_peer(0, 0, 0, -1, 0); + mds->send_message_client_counted(stale, q->first); + } + } + + if (!(++count % mds->heartbeat_reset_grace())) + mds->heartbeat_reset(); + } + + for (map<inodeno_t, MDSContext::vec >::iterator p = cap_reconnect_waiters.begin(); + p != cap_reconnect_waiters.end(); + ++p) + mds->queue_waiters(p->second); + + cap_imports.clear(); + cap_reconnect_waiters.clear(); + + if (css->strv().length()) { + mds->clog->warn() << "failed to reconnect caps for missing inodes:" + << css->strv(); + } +} + +Capability* MDCache::try_reconnect_cap(CInode *in, Session *session) +{ + client_t client = session->info.get_client(); + Capability *cap = nullptr; + const cap_reconnect_t *rc = get_replay_cap_reconnect(in->ino(), client); + if (rc) { + cap = in->reconnect_cap(client, *rc, session); + dout(10) << "try_reconnect_cap client." 
// try_reconnect_cap(), continued (completes the log statement).
	     << client
	     << " reconnect wanted " << ccap_string(rc->capinfo.wanted)
	     << " issue " << ccap_string(rc->capinfo.issued)
	     << " on " << *in << dendl;
    remove_replay_cap_reconnect(in->ino(), client);

    if (in->is_replicated()) {
      mds->locker->try_eval(in, CEPH_CAP_LOCKS);
    } else {
      // not replicated: choose lock states directly from this client's
      // recorded dirty caps (0 if none recorded)
      int dirty_caps = 0;
      auto p = reconnected_caps.find(in->ino());
      if (p != reconnected_caps.end()) {
	auto q = p->second.find(client);
	if (q != p->second.end())
	  dirty_caps = q->second.dirty_caps;
      }
      in->choose_lock_states(dirty_caps);
      dout(15) << " chose lock states on " << *in << dendl;
    }

    // wake anyone waiting for a cap reconnect on this inode
    map<inodeno_t, MDSContext::vec >::iterator it =
      cap_reconnect_waiters.find(in->ino());
    if (it != cap_reconnect_waiters.end()) {
      mds->queue_waiters(it->second);
      cap_reconnect_waiters.erase(it);
    }
  }
  return cap;
}



// -------
// cap imports and delayed snap parent opens

/*
 * Send a CEPH_CAP_OP_IMPORT message for `cap` on `in` to the client behind
 * `session`.  The p_* arguments, `peer` and `p_flags` are passed through to
 * set_cap_peer() on the message.
 *
 * NOTE(review): `realm` is dereferenced without a null check -- presumably
 * find_snaprealm() always yields a realm here; confirm.
 */
void MDCache::do_cap_import(Session *session, CInode *in, Capability *cap,
			    uint64_t p_cap_id, ceph_seq_t p_seq, ceph_seq_t p_mseq,
			    int peer, int p_flags)
{
  SnapRealm *realm = in->find_snaprealm();
  dout(10) << "do_cap_import " << session->info.inst.name << " mseq " << cap->get_mseq() << " on " << *in << dendl;
  if (cap->get_last_seq() == 0) // reconnected cap
    cap->inc_last_seq();
  cap->set_last_issue();
  cap->set_last_issue_stamp(ceph_clock_now());
  cap->clear_new();
  auto reap = make_message<MClientCaps>(CEPH_CAP_OP_IMPORT,
					in->ino(), realm->inode->ino(), cap->get_cap_id(),
					cap->get_last_seq(), cap->pending(), cap->wanted(),
					0, cap->get_mseq(), mds->get_osd_epoch_barrier());
  in->encode_cap_message(reap, cap);
  reap->snapbl = mds->server->get_snap_trace(session, realm);
  reap->set_cap_peer(p_cap_id, p_seq, p_mseq, peer, p_flags);
  mds->send_message_client_counted(reap, session);
}

/*
 * Only asserts that no delayed cap imports are queued.
 */
void MDCache::do_delayed_cap_imports()
{
  dout(10) << "do_delayed_cap_imports" << dendl;

  ceph_assert(delayed_imported_caps.empty());
}

// Context that re-runs MDCache::open_snaprealms() when completed.
struct C_MDC_OpenSnapRealms : public
// C_MDC_OpenSnapRealms, continued (base class and body).
    MDCacheContext {
  explicit C_MDC_OpenSnapRealms(MDCache *c) : MDCacheContext(c) {}
  void finish(int r) override {
    mdcache->open_snaprealms();
  }
};

/*
 * Final rejoin step: for every inode queued in rejoin_pending_snaprealms,
 * finish client snaprealm reconnects, rebuild snapflush state for the
 * reconnected caps in that realm, queue realm splits for caps that were
 * reconnected under a different realm, and send the resulting snap
 * messages.
 */
void MDCache::open_snaprealms()
{
  dout(10) << "open_snaprealms" << dendl;

  auto it = rejoin_pending_snaprealms.begin();
  while (it != rejoin_pending_snaprealms.end()) {
    CInode *in = *it;
    SnapRealm *realm = in->snaprealm;
    ceph_assert(realm);

    // per-realm batch of client messages, sent at the end of this iteration
    map<client_t,ref_t<MClientSnap>> splits;
    // finish off client snaprealm reconnects?
    auto q = reconnected_snaprealms.find(in->ino());
    if (q != reconnected_snaprealms.end()) {
      for (const auto& r : q->second)
	finish_snaprealm_reconnect(r.first, realm, r.second, splits);
      reconnected_snaprealms.erase(q);
    }

    for (auto p = realm->inodes_with_caps.begin(); !p.end(); ++p) {
      CInode *child = *p;
      // every inode with caps in this realm must have a reconnected_caps entry
      auto q = reconnected_caps.find(child->ino());
      ceph_assert(q != reconnected_caps.end());
      for (auto r = q->second.begin(); r != q->second.end(); ++r) {
	Capability *cap = child->get_client_cap(r->first);
	if (!cap)
	  continue;
	if (r->second.snap_follows > 0) {
	  if (r->second.snap_follows < child->first - 1) {
	    rebuild_need_snapflush(child, realm, r->first, r->second.snap_follows);
	  } else if (r->second.snapflush) {
	    // When processing a cap flush message that is re-sent, it's possible
	    // that the sender has already released all WR caps. So we should
	    // force MDCache::cow_inode() to setup CInode::client_need_snapflush.
	    cap->mark_needsnapflush();
	  }
	}
	// make sure client's cap is in the correct snaprealm.
+ if (r->second.realm_ino != in->ino()) { + prepare_realm_split(realm, r->first, child->ino(), splits); + } + } + } + + rejoin_pending_snaprealms.erase(it++); + in->put(CInode::PIN_OPENINGSNAPPARENTS); + + send_snaps(splits); + } + + notify_global_snaprealm_update(CEPH_SNAP_OP_UPDATE); + + if (!reconnected_snaprealms.empty()) { + dout(5) << "open_snaprealms has unconnected snaprealm:" << dendl; + for (auto& p : reconnected_snaprealms) { + CachedStackStringStream css; + *css << " " << p.first << " {"; + bool first = true; + for (auto& q : p.second) { + if (!first) + *css << ", "; + *css << "client." << q.first << "/" << q.second; + } + *css << "}"; + dout(5) << css->strv() << dendl; + } + } + ceph_assert(rejoin_waiters.empty()); + ceph_assert(rejoin_pending_snaprealms.empty()); + dout(10) << "open_snaprealms - all open" << dendl; + do_delayed_cap_imports(); + + ceph_assert(rejoin_done); + rejoin_done.release()->complete(0); + reconnected_caps.clear(); +} + +bool MDCache::open_undef_inodes_dirfrags() +{ + dout(10) << "open_undef_inodes_dirfrags " + << rejoin_undef_inodes.size() << " inodes " + << rejoin_undef_dirfrags.size() << " dirfrags" << dendl; + + // dirfrag -> (fetch_complete, keys_to_fetch) + map<CDir*, pair<bool, std::vector<dentry_key_t> > > fetch_queue; + for (auto& dir : rejoin_undef_dirfrags) { + ceph_assert(dir->get_version() == 0); + fetch_queue.emplace(std::piecewise_construct, std::make_tuple(dir), std::make_tuple()); + } + + if (g_conf().get_val<bool>("mds_dir_prefetch")) { + for (auto& in : rejoin_undef_inodes) { + ceph_assert(!in->is_base()); + ceph_assert(in->get_parent_dir()); + fetch_queue.emplace(std::piecewise_construct, std::make_tuple(in->get_parent_dir()), std::make_tuple()); + } + } else { + for (auto& in : rejoin_undef_inodes) { + assert(!in->is_base()); + CDentry *dn = in->get_parent_dn(); + auto& p = fetch_queue[dn->get_dir()]; + + if (dn->last != CEPH_NOSNAP) { + p.first = true; + p.second.clear(); + } else if (!p.first) { + 
// open_undef_inodes_dirfrags(), continued: queue this dentry's key.
	p.second.push_back(dn->key());
      }
    }
  }

  if (fetch_queue.empty())
    return false;

  // once every fetch has completed, resume rejoin_gather_finish() if
  // nothing else is being gathered
  MDSGatherBuilder gather(g_ceph_context,
      new MDSInternalContextWrapper(mds,
	new LambdaContext([this](int r) {
	    if (rejoin_gather.empty() && rejoin_ack_gather.count(mds->get_nodeid()))
	      rejoin_gather_finish();
	  })
	)
      );

  for (auto& p : fetch_queue) {
    CDir *dir = p.first;
    CInode *diri = dir->get_inode();
    // skip dirfrags whose own inode is still in the REJOINUNDEF state
    if (diri->state_test(CInode::STATE_REJOINUNDEF))
      continue;
    if (dir->state_test(CDir::STATE_REJOINUNDEF))
      ceph_assert(diri->dirfragtree.is_leaf(dir->get_frag()));
    // complete fetch when flagged or when no specific keys were collected
    if (p.second.first || p.second.second.empty()) {
      dir->fetch(gather.new_sub());
    } else {
      dir->fetch_keys(p.second.second, gather.new_sub());
    }
  }
  ceph_assert(gather.has_subs());
  gather.activate();
  return true;
}

/*
 * Called once a previously undefined inode has been opened: drop it from
 * rejoin_undef_inodes and, for a directory whose root frag is not a leaf
 * of its fragtree, replace the root dirfrag in rejoin_undef_dirfrags by the
 * dirfrags present after force_dirfrags().
 */
void MDCache::opened_undef_inode(CInode *in) {
  dout(10) << "opened_undef_inode " << *in << dendl;
  rejoin_undef_inodes.erase(in);
  if (in->is_dir()) {
    // FIXME: re-hash dentries if necessary
    ceph_assert(in->get_inode()->dir_layout.dl_dir_hash == g_conf()->mds_default_dir_hash);
    if (in->get_num_dirfrags() && !in->dirfragtree.is_leaf(frag_t())) {
      CDir *dir = in->get_dirfrag(frag_t());
      ceph_assert(dir);
      rejoin_undef_dirfrags.erase(dir);
      in->force_dirfrags();
      auto&& ls = in->get_dirfrags();
      for (const auto& dir : ls) {
	rejoin_undef_dirfrags.insert(dir);
      }
    }
  }
}

/*
 * If the client's snaprealm seq is older than the realm's newest seq, queue
 * a CEPH_SNAP_OP_UPDATE message carrying the current snap trace in
 * `updates`; otherwise only log that the client is up to date.
 */
void MDCache::finish_snaprealm_reconnect(client_t client, SnapRealm *realm, snapid_t seq,
					 map<client_t,ref_t<MClientSnap>>& updates)
{
  if (seq < realm->get_newest_seq()) {
    dout(10) << "finish_snaprealm_reconnect client."
// finish_snaprealm_reconnect(), continued (completes the log statement).
	     << client << " has old seq " << seq << " < "
	     << realm->get_newest_seq() << " on " << *realm << dendl;
    auto snap = make_message<MClientSnap>(CEPH_SNAP_OP_UPDATE);
    snap->bl = mds->server->get_snap_trace(client, realm);
    updates.emplace(std::piecewise_construct, std::forward_as_tuple(client), std::forward_as_tuple(snap));
  } else {
    dout(10) << "finish_snaprealm_reconnect client." << client << " up to date"
	     << " on " << *realm << dendl;
  }
}



/*
 * Build and send an OP_ACK MMDSCacheRejoin message to every rank in
 * recovery_set that has not been acked yet.  Each ack describes the
 * dirfrags, dentries and inodes in our auth subtrees that the rank
 * replicates.
 */
void MDCache::rejoin_send_acks()
{
  dout(7) << "rejoin_send_acks" << dendl;

  // replicate stray
  for (map<mds_rank_t, set<CInode*> >::iterator p = rejoin_unlinked_inodes.begin();
       p != rejoin_unlinked_inodes.end();
       ++p) {
    for (set<CInode*>::iterator q = p->second.begin();
	 q != p->second.end();
	 ++q) {
      CInode *in = *q;
      dout(7) << " unlinked inode " << *in << dendl;
      // inode expired
      if (!in->is_replica(p->first))
	continue;
      // walk up dentry -> dirfrag -> inode, adding the rank as a replica
      // of each, until reaching something it already replicates or a base
      // inode
      while (1) {
	CDentry *dn = in->get_parent_dn();
	if (dn->is_replica(p->first))
	  break;
	dn->add_replica(p->first);
	CDir *dir = dn->get_dir();
	if (dir->is_replica(p->first))
	  break;
	dir->add_replica(p->first);
	in = dir->get_inode();
	if (in->is_replica(p->first))
	  break;
	in->add_replica(p->first);
	if (in->is_base())
	  break;
      }
    }
  }
  rejoin_unlinked_inodes.clear();

  // send acks to everyone in the recovery set
  map<mds_rank_t,ref_t<MMDSCacheRejoin>> acks;
  for (set<mds_rank_t>::iterator p = recovery_set.begin();
       p != recovery_set.end();
       ++p) {
    if (rejoin_ack_sent.count(*p))
      continue;
    acks[*p] = make_message<MMDSCacheRejoin>(MMDSCacheRejoin::OP_ACK);
  }

  rejoin_ack_sent = recovery_set;

  // walk subtrees
  for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
       p != subtrees.end();
       ++p) {
    CDir *dir = p->first;
    if (!dir->is_auth())
      continue;
    dout(10) << "subtree " << *dir << dendl;

    // auth items in this subtree
    std::queue<CDir*> dq;
    dq.push(dir);

    // breadth-first over the dirfrags of this subtree; the inner `dir`
    // shadows the subtree root above
    while (!dq.empty()) {
      CDir *dir = dq.front();
// rejoin_send_acks(), continued: describe one dirfrag and its contents.
      dq.pop();

      // dir
      for (auto &r : dir->get_replicas()) {
	auto it = acks.find(r.first);
	if (it == acks.end())
	  continue;
	// ++r.second increments the value stored for this replica and
	// passes the incremented value along in the ack
	it->second->add_strong_dirfrag(dir->dirfrag(), ++r.second, dir->dir_rep);
	it->second->add_dirfrag_base(dir);
      }

      for (auto &p : dir->items) {
	CDentry *dn = p.second;
	CDentry::linkage_t *dnl = dn->get_linkage();

	// inode
	CInode *in = NULL;
	if (dnl->is_primary())
	  in = dnl->get_inode();

	// dentry
	for (auto &r : dn->get_replicas()) {
	  auto it = acks.find(r.first);
	  if (it == acks.end())
	    continue;
	  it->second->add_strong_dentry(dir->dirfrag(), dn->get_name(), dn->get_alternate_name(),
					dn->first, dn->last,
					dnl->is_primary() ? dnl->get_inode()->ino():inodeno_t(0),
					dnl->is_remote() ? dnl->get_remote_ino():inodeno_t(0),
					dnl->is_remote() ? dnl->get_remote_d_type():0,
					++r.second,
					dn->lock.get_replica_state());
	  // peer missed MDentrylink message ?
	  if (in && !in->is_replica(r.first))
	    in->add_replica(r.first);
	}

	// only primary-linked dentries carry an inode to describe
	if (!in)
	  continue;

	for (auto &r : in->get_replicas()) {
	  auto it = acks.find(r.first);
	  if (it == acks.end())
	    continue;
	  it->second->add_inode_base(in, mds->mdsmap->get_up_features());
	  bufferlist bl;
	  in->_encode_locks_state_for_rejoin(bl, r.first);
	  it->second->add_inode_locks(in, ++r.second, bl);
	}

	// subdirs in this subtree?
// rejoin_send_acks(), continued: queue nested dirfrags for the walk.
	{
	  auto&& dirs = in->get_nested_dirfrags();
	  for (const auto& dir : dirs) {
	    dq.push(dir);
	  }
	}
      }
    }
  }

  // base inodes too
  if (root && root->is_auth())
    for (auto &r : root->get_replicas()) {
      auto it = acks.find(r.first);
      if (it == acks.end())
	continue;
      it->second->add_inode_base(root, mds->mdsmap->get_up_features());
      bufferlist bl;
      root->_encode_locks_state_for_rejoin(bl, r.first);
      it->second->add_inode_locks(root, ++r.second, bl);
    }
  if (myin)
    for (auto &r : myin->get_replicas()) {
      auto it = acks.find(r.first);
      if (it == acks.end())
	continue;
      it->second->add_inode_base(myin, mds->mdsmap->get_up_features());
      bufferlist bl;
      myin->_encode_locks_state_for_rejoin(bl, r.first);
      it->second->add_inode_locks(myin, ++r.second, bl);
    }

  // include inode base for any inodes whose scatterlocks may have updated
  for (set<CInode*>::iterator p = rejoin_potential_updated_scatterlocks.begin();
       p != rejoin_potential_updated_scatterlocks.end();
       ++p) {
    CInode *in = *p;
    for (const auto &r : in->get_replicas()) {
      auto it = acks.find(r.first);
      if (it == acks.end())
	continue;
      it->second->add_inode_base(in, mds->mdsmap->get_up_features());
    }
  }

  // send acks
  for (auto p = acks.begin(); p != acks.end(); ++p) {
    // attach the caps recorded as imported from this rank
    encode(rejoin_imported_caps[p->first], p->second->imported_caps);
    mds->send_message_mds(p->second, p->first);
  }

  rejoin_imported_caps.clear();
}

/*
 * Waiter context that re-evaluates the cap locks of `in` and issues caps
 * if eval() returned false.  Holds a PIN_PTRWAITER reference on the inode
 * from construction until finish() has run.
 */
class C_MDC_ReIssueCaps : public MDCacheContext {
  CInode *in;
public:
  C_MDC_ReIssueCaps(MDCache *mdc, CInode *i) :
    MDCacheContext(mdc), in(i)
  {
    in->get(CInode::PIN_PTRWAITER);
  }
  void finish(int r) override {
    if (!mdcache->mds->locker->eval(in, CEPH_CAP_LOCKS))
      mdcache->mds->locker->issue_caps(in);
    in->put(CInode::PIN_PTRWAITER);
  }
};

/*
 * Re-evaluate locks and reissue caps on every head inode in the cache that
 * has any client caps.
 */
void MDCache::reissue_all_caps()
{
  dout(10) << "reissue_all_caps" << dendl;

  int count = 0;
  for (auto &p : inode_map) {
    // n is this inode's contribution to the heartbeat work counter
    int n = 1;
    CInode *in = p.second;
    if (in->is_head() &&
// reissue_all_caps(), continued (second half of the `if` condition).
	in->is_any_caps()) {
      // called by MDSRank::active_start(). There shouldn't be any frozen subtree.
      if (in->is_frozen_inode()) {
	// defer until the inode is unfrozen
	in->add_waiter(CInode::WAIT_UNFREEZE, new C_MDC_ReIssueCaps(this, in));
	continue;
      }
      if (!mds->locker->eval(in, CEPH_CAP_LOCKS))
	n += mds->locker->issue_caps(in);
    }

    // reset the heartbeat whenever the running count crosses a multiple of
    // the grace value
    if ((count % mds->heartbeat_reset_grace()) + n >= mds->heartbeat_reset_grace())
      mds->heartbeat_reset();
    count += n;
  }
}


// ===============================================================================

/*
 * Completion context that calls MDCache::_queued_file_recover_cow() with
 * the stored inode and mutation.
 */
struct C_MDC_QueuedCow : public MDCacheContext {
  CInode *in;
  MutationRef mut;
  C_MDC_QueuedCow(MDCache *mdc, CInode *i, MutationRef& m) :
    MDCacheContext(mdc), in(i), mut(m) {}
  void finish(int r) override {
    mdcache->_queued_file_recover_cow(in, mut);
  }
};


/*
 * Queue an auth inode for file recovery.  The copy-on-write handling below
 * is commented out, so the function only enqueues `in` on recovery_queue.
 */
void MDCache::queue_file_recover(CInode *in)
{
  dout(10) << "queue_file_recover " << *in << dendl;
  ceph_assert(in->is_auth());

  // cow?
  /*
  SnapRealm *realm = in->find_snaprealm();
  set<snapid_t> s = realm->get_snaps();
  while (!s.empty() && *s.begin() < in->first)
    s.erase(s.begin());
  while (!s.empty() && *s.rbegin() > in->last)
    s.erase(*s.rbegin());
  dout(10) << " snaps in [" << in->first << "," << in->last << "] are " << s << dendl;
  if (s.size() > 1) {
    auto pi = in->project_inode(mut);
    pi.inode.version = in->pre_dirty();

    auto mut(std::make_shared<MutationImpl>());
    mut->ls = mds->mdlog->get_current_segment();
    EUpdate *le = new EUpdate(mds->mdlog, "queue_file_recover cow");
    mds->mdlog->start_entry(le);
    predirty_journal_parents(mut, &le->metablob, in, 0, PREDIRTY_PRIMARY);

    s.erase(*s.begin());
    while (!s.empty()) {
      snapid_t snapid = *s.begin();
      CInode *cow_inode = 0;
      journal_cow_inode(mut, &le->metablob, in, snapid-1, &cow_inode);
      ceph_assert(cow_inode);
      recovery_queue.enqueue(cow_inode);
      s.erase(*s.begin());
    }

    in->parent->first = in->first;
    le->metablob.add_primary_dentry(in->parent, in, true);
mds->mdlog->submit_entry(le, new C_MDC_QueuedCow(this, in, mut));
    mds->mdlog->flush();
  }
  */

  recovery_queue.enqueue(in);
}

/*
 * Apply the mutation, drop its locks and clean it up.  `in` is unused here.
 */
void MDCache::_queued_file_recover_cow(CInode *in, MutationRef& mut)
{
  mut->apply();
  mds->locker->drop_locks(mut.get());
  mut->cleanup();
}


/*
 * called after recovery to recover file sizes for previously opened (for write)
 * files.  that is, those where max_size > size.
 *
 * Sorts every auth, head, regular-file inode into rejoin_recover_q (some
 * client has a client_range but no cap) or rejoin_check_q (everything else).
 */
void MDCache::identify_files_to_recover()
{
  dout(10) << "identify_files_to_recover" << dendl;
  int count = 0;

  // Clear the recover and check queues in case the monitor sends rejoin mdsmap twice.
  rejoin_recover_q.clear();
  rejoin_check_q.clear();

  for (auto &p : inode_map) {
    CInode *in = p.second;
    if (!in->is_auth())
      continue;

    if (in->last != CEPH_NOSNAP)
      continue;

    // Only normal files need file size recovery
    if (!in->is_file()) {
      continue;
    }

    bool recover = false;
    const auto& client_ranges = in->get_projected_inode()->client_ranges;
    if (!client_ranges.empty()) {
      in->mark_clientwriteable();
      // the inner `p` shadows the inode_map entry above
      for (auto& p : client_ranges) {
	Capability *cap = in->get_client_cap(p.first);
	if (cap) {
	  cap->mark_clientwriteable();
	} else {
	  dout(10) << " client."
// identify_files_to_recover(), continued (completes the log statement).
		   << p.first << " has range " << p.second << " but no cap on " << *in << dendl;
	  // one range without a cap is enough to require recovery
	  recover = true;
	  break;
	}
      }
    }

    if (recover) {
      // auth-pin for the filelock only when it is stable; otherwise the
      // lock is required to be in LOCK_XLOCKSNAP
      if (in->filelock.is_stable()) {
	in->auth_pin(&in->filelock);
      } else {
	ceph_assert(in->filelock.get_state() == LOCK_XLOCKSNAP);
      }
      in->filelock.set_state(LOCK_PRE_SCAN);
      rejoin_recover_q.push_back(in);
    } else {
      rejoin_check_q.push_back(in);
    }

    if (!(++count % mds->heartbeat_reset_grace()))
      mds->heartbeat_reset();
  }
}

/*
 * Drain the two queues built by identify_files_to_recover(): check max
 * size on the rejoin_check_q inodes, start file recovery on the
 * rejoin_recover_q inodes, then advance the recovery queue if anything was
 * queued for recovery.
 */
void MDCache::start_files_to_recover()
{
  int count = 0;
  for (CInode *in : rejoin_check_q) {
    if (in->filelock.get_state() == LOCK_XLOCKSNAP)
      mds->locker->issue_caps(in);
    mds->locker->check_inode_max_size(in);
    if (!(++count % mds->heartbeat_reset_grace()))
      mds->heartbeat_reset();
  }
  rejoin_check_q.clear();
  for (CInode *in : rejoin_recover_q) {
    mds->locker->file_recover(&in->filelock);
    if (!(++count % mds->heartbeat_reset_grace()))
      mds->heartbeat_reset();
  }
  if (!rejoin_recover_q.empty()) {
    rejoin_recover_q.clear();
    do_file_recover();
  }
}

// Advance the recovery queue.
void MDCache::do_file_recover()
{
  recovery_queue.advance();
}

// ===============================================================================


// ----------------------------
// truncate

/*
 * Context that retries MDCache::_truncate_inode() for the stored inode and
 * log segment.
 */
class C_MDC_RetryTruncate : public MDCacheContext {
  CInode *in;
  LogSegment *ls;
public:
  C_MDC_RetryTruncate(MDCache *c, CInode *i, LogSegment *l) :
    MDCacheContext(c), in(i), ls(l) {}
  void finish(int r) override {
    mdcache->_truncate_inode(in, ls);
  }
};

/*
 * Begin truncating `in`: record it in the log segment's truncating_inodes,
 * pin and auth-pin it, then either truncate right away or first wait for
 * the filelock's xlock snap sync.
 */
void MDCache::truncate_inode(CInode *in, LogSegment *ls)
{
  const auto& pi = in->get_projected_inode();
  dout(10) << "truncate_inode "
	   << pi->truncate_from << " -> " << pi->truncate_size
	   << " on " << *in
	   << dendl;

  ls->truncating_inodes.insert(in);
  in->get(CInode::PIN_TRUNCATING);
  in->auth_pin(this);

  // snapflushes are outstanding and FILE_BUFFER caps are issued: defer the
  // truncate via C_MDC_RetryTruncate
  if (!in->client_need_snapflush.empty() &&
      (in->get_caps_issued() & CEPH_CAP_FILE_BUFFER)) {
    ceph_assert(in->filelock.is_xlocked());
// truncate_inode(), continued: register the retry context and reissue caps.
    in->filelock.set_xlock_snap_sync(new C_MDC_RetryTruncate(this, in, ls));
    mds->locker->issue_caps(in);
    return;
  }

  _truncate_inode(in, ls);
}

/*
 * IO completion for the last-block write issued by _truncate_inode();
 * accepts r == 0 or -CEPHFS_ENOENT and continues with
 * truncate_inode_write_finish().
 */
struct C_IO_MDC_TruncateWriteFinish : public MDCacheIOContext {
  CInode *in;
  LogSegment *ls;
  uint32_t block_size;
  C_IO_MDC_TruncateWriteFinish(MDCache *c, CInode *i, LogSegment *l, uint32_t bs) :
    MDCacheIOContext(c, false), in(i), ls(l), block_size(bs) {
  }
  void finish(int r) override {
    ceph_assert(r == 0 || r == -CEPHFS_ENOENT);
    mdcache->truncate_inode_write_finish(in, ls, block_size);
  }
  void print(ostream& out) const override {
    out << "file_truncate_write(" << in->ino() << ")";
  }
};

/*
 * IO completion for the truncate issued by _truncate_inode(); accepts
 * r == 0 or -CEPHFS_ENOENT and continues with truncate_inode_finish().
 */
struct C_IO_MDC_TruncateFinish : public MDCacheIOContext {
  CInode *in;
  LogSegment *ls;
  C_IO_MDC_TruncateFinish(MDCache *c, CInode *i, LogSegment *l) :
    MDCacheIOContext(c, false), in(i), ls(l) {
  }
  void finish(int r) override {
    ceph_assert(r == 0 || r == -CEPHFS_ENOENT);
    mdcache->truncate_inode_finish(in, ls);
  }
  void print(ostream& out) const override {
    out << "file_truncate(" << in->ino() << ")";
  }
};

/*
 * Issue the object-store IO for a truncate of `in`: either a write of the
 * fscrypt last block or a plain truncate.  Uses the (non-projected) inode.
 */
void MDCache::_truncate_inode(CInode *in, LogSegment *ls)
{
  const auto& pi = in->get_inode();
  dout(10) << "_truncate_inode "
	   << pi->truncate_from << " -> " << pi->truncate_size
	   << " fscrypt last block length is " << pi->fscrypt_last_block.length()
	   << " on " << *in << dendl;

  // sanity: a truncate is in progress, both sizes are below 2^63, and the
  // size shrinks -- or stays equal only when an fscrypt last block exists
  ceph_assert(pi->is_truncating());
  ceph_assert(pi->truncate_size < (1ULL << 63));
  ceph_assert(pi->truncate_from < (1ULL << 63));
  ceph_assert(pi->truncate_size < pi->truncate_from ||
	      (pi->truncate_size == pi->truncate_from &&
	       pi->fscrypt_last_block.length()));


  // use the realm's snap context if there is a realm, else an empty one
  SnapRealm *realm = in->find_snaprealm();
  SnapContext nullsnap;
  const SnapContext *snapc;
  if (realm) {
    dout(10) << " realm " << *realm << dendl;
    snapc = &realm->get_snap_context();
  } else {
    dout(10) << " NO realm, using null context" << dendl;
    snapc = &nullsnap;
    ceph_assert(in->last == CEPH_NOSNAP);
  }
dout(10) << "_truncate_inode snapc " << snapc << " on " << *in + << " fscrypt_last_block length is " << pi->fscrypt_last_block.length() + << dendl; + auto layout = pi->layout; + struct ceph_fscrypt_last_block_header header; + memset(&header, 0, sizeof(header)); + bufferlist data; + if (pi->fscrypt_last_block.length()) { + auto bl = pi->fscrypt_last_block.cbegin(); + DECODE_START(1, bl); + decode(header.change_attr, bl); + decode(header.file_offset, bl); + decode(header.block_size, bl); + + /* + * The block_size will be in unit of KB, so if the last block is not + * located in a file hole, the struct_len should be larger than the + * header.block_size. + */ + if (struct_len > header.block_size) { + bl.copy(header.block_size, data); + } + DECODE_FINISH(bl); + } + + if (data.length()) { + dout(10) << "_truncate_inode write on inode " << *in << " change_attr: " + << header.change_attr << " offset: " << header.file_offset << " blen: " + << header.block_size << dendl; + filer.write(in->ino(), &layout, *snapc, header.file_offset, header.block_size, + data, ceph::real_time::min(), 0, + new C_OnFinisher(new C_IO_MDC_TruncateWriteFinish(this, in, ls, + header.block_size), + mds->finisher)); + } else { // located in file hole. + uint64_t length = pi->truncate_from - pi->truncate_size; + + /* + * When the fscrypt is enabled the truncate_from and truncate_size + * possibly equal and both are aligned up to header.block_size. In + * this case we will always request a larger length to make sure the + * OSD won't miss truncating the last object. + */ + if (pi->fscrypt_last_block.length()) { + dout(10) << "_truncate_inode truncate on inode " << *in << " hits a hole!" 
<< dendl; + length += header.block_size; + } + ceph_assert(length); + + dout(10) << "_truncate_inode truncate on inode " << *in << dendl; + filer.truncate(in->ino(), &layout, *snapc, pi->truncate_size, length, + pi->truncate_seq, ceph::real_time::min(), 0, + new C_OnFinisher(new C_IO_MDC_TruncateFinish(this, in, ls), + mds->finisher)); + } + +} + +struct C_MDC_TruncateLogged : public MDCacheLogContext { + CInode *in; + MutationRef mut; + C_MDC_TruncateLogged(MDCache *m, CInode *i, MutationRef& mu) : + MDCacheLogContext(m), in(i), mut(mu) {} + void finish(int r) override { + mdcache->truncate_inode_logged(in, mut); + } +}; + +void MDCache::truncate_inode_write_finish(CInode *in, LogSegment *ls, + uint32_t block_size) +{ + const auto& pi = in->get_inode(); + dout(10) << "_truncate_inode_write " + << pi->truncate_from << " -> " << pi->truncate_size + << " on " << *in << dendl; + + ceph_assert(pi->is_truncating()); + ceph_assert(pi->truncate_size < (1ULL << 63)); + ceph_assert(pi->truncate_from < (1ULL << 63)); + ceph_assert(pi->truncate_size < pi->truncate_from || + (pi->truncate_size == pi->truncate_from && + pi->fscrypt_last_block.length())); + + + SnapRealm *realm = in->find_snaprealm(); + SnapContext nullsnap; + const SnapContext *snapc; + if (realm) { + dout(10) << " realm " << *realm << dendl; + snapc = &realm->get_snap_context(); + } else { + dout(10) << " NO realm, using null context" << dendl; + snapc = &nullsnap; + ceph_assert(in->last == CEPH_NOSNAP); + } + dout(10) << "_truncate_inode_write snapc " << snapc << " on " << *in + << " fscrypt_last_block length is " << pi->fscrypt_last_block.length() + << dendl; + auto layout = pi->layout; + /* + * When the fscrypt is enabled the truncate_from and truncate_size + * possibly equal and both are aligned up to header.block_size. In + * this case we will always request a larger length to make sure the + * OSD won't miss truncating the last object. 
+ */ + uint64_t length = pi->truncate_from - pi->truncate_size + block_size; + filer.truncate(in->ino(), &layout, *snapc, pi->truncate_size, length, + pi->truncate_seq, ceph::real_time::min(), 0, + new C_OnFinisher(new C_IO_MDC_TruncateFinish(this, in, ls), + mds->finisher)); +} + +void MDCache::truncate_inode_finish(CInode *in, LogSegment *ls) +{ + dout(10) << "truncate_inode_finish " << *in << dendl; + + set<CInode*>::iterator p = ls->truncating_inodes.find(in); + ceph_assert(p != ls->truncating_inodes.end()); + ls->truncating_inodes.erase(p); + + MutationRef mut(new MutationImpl()); + mut->ls = mds->mdlog->get_current_segment(); + + // update + auto pi = in->project_inode(mut); + pi.inode->version = in->pre_dirty(); + pi.inode->truncate_from = 0; + pi.inode->truncate_pending--; + pi.inode->fscrypt_last_block = bufferlist(); + + EUpdate *le = new EUpdate(mds->mdlog, "truncate finish"); + mds->mdlog->start_entry(le); + + predirty_journal_parents(mut, &le->metablob, in, 0, PREDIRTY_PRIMARY); + journal_dirty_inode(mut.get(), &le->metablob, in); + le->metablob.add_truncate_finish(in->ino(), ls->seq); + mds->mdlog->submit_entry(le, new C_MDC_TruncateLogged(this, in, mut)); + + // flush immediately if there are readers/writers waiting + if (in->is_waiter_for(CInode::WAIT_TRUNC) || + (in->get_caps_wanted() & (CEPH_CAP_FILE_RD|CEPH_CAP_FILE_WR))) + mds->mdlog->flush(); +} + +void MDCache::truncate_inode_logged(CInode *in, MutationRef& mut) +{ + dout(10) << "truncate_inode_logged " << *in << dendl; + mut->apply(); + mds->locker->drop_locks(mut.get()); + mut->cleanup(); + + in->put(CInode::PIN_TRUNCATING); + in->auth_unpin(this); + + MDSContext::vec waiters; + in->take_waiting(CInode::WAIT_TRUNC, waiters); + mds->queue_waiters(waiters); +} + + +void MDCache::add_recovered_truncate(CInode *in, LogSegment *ls) +{ + dout(20) << "add_recovered_truncate " << *in << " in log segment " + << ls->seq << "/" << ls->offset << dendl; + ls->truncating_inodes.insert(in); + 
in->get(CInode::PIN_TRUNCATING); +} + +void MDCache::remove_recovered_truncate(CInode *in, LogSegment *ls) +{ + dout(20) << "remove_recovered_truncate " << *in << " in log segment " + << ls->seq << "/" << ls->offset << dendl; + // if we have the logseg the truncate started in, it must be in our list. + set<CInode*>::iterator p = ls->truncating_inodes.find(in); + ceph_assert(p != ls->truncating_inodes.end()); + ls->truncating_inodes.erase(p); + in->put(CInode::PIN_TRUNCATING); +} + +void MDCache::start_recovered_truncates() +{ + dout(10) << "start_recovered_truncates" << dendl; + for (map<uint64_t,LogSegment*>::iterator p = mds->mdlog->segments.begin(); + p != mds->mdlog->segments.end(); + ++p) { + LogSegment *ls = p->second; + for (set<CInode*>::iterator q = ls->truncating_inodes.begin(); + q != ls->truncating_inodes.end(); + ++q) { + CInode *in = *q; + in->auth_pin(this); + + if (!in->client_need_snapflush.empty() && + (in->get_caps_issued() & CEPH_CAP_FILE_BUFFER)) { + ceph_assert(in->filelock.is_stable()); + in->filelock.set_state(LOCK_XLOCKDONE); + in->auth_pin(&in->filelock); + in->filelock.set_xlock_snap_sync(new C_MDC_RetryTruncate(this, in, ls)); + // start_files_to_recover will revoke caps + continue; + } + _truncate_inode(in, ls); + } + } +} + + +class C_MDS_purge_completed_finish : public MDCacheLogContext { + interval_set<inodeno_t> inos; + LogSegment *ls; + version_t inotablev; +public: + C_MDS_purge_completed_finish(MDCache *m, const interval_set<inodeno_t>& _inos, + LogSegment *_ls, version_t iv) + : MDCacheLogContext(m), inos(_inos), ls(_ls), inotablev(iv) {} + void finish(int r) override { + ceph_assert(r == 0); + if (inotablev) { + get_mds()->inotable->apply_release_ids(inos); + ceph_assert(get_mds()->inotable->get_version() == inotablev); + } + ls->purge_inodes_finish(inos); + } +}; + +void MDCache::start_purge_inodes(){ + dout(10) << "start_purge_inodes" << dendl; + for (auto& p : mds->mdlog->segments){ + LogSegment *ls = p.second; + if 
(ls->purging_inodes.size()){ + purge_inodes(ls->purging_inodes, ls); + } + } +} + +void MDCache::purge_inodes(const interval_set<inodeno_t>& inos, LogSegment *ls) +{ + dout(10) << __func__ << " purging inos " << inos << " logseg " << ls->seq << dendl; + // FIXME: handle non-default data pool and namespace + + auto cb = new LambdaContext([this, inos, ls](int r){ + ceph_assert(r == 0 || r == -2); + mds->inotable->project_release_ids(inos); + version_t piv = mds->inotable->get_projected_version(); + ceph_assert(piv != 0); + mds->mdlog->start_submit_entry(new EPurged(inos, ls->seq, piv), + new C_MDS_purge_completed_finish(this, inos, ls, piv)); + mds->mdlog->flush(); + }); + + C_GatherBuilder gather(g_ceph_context, + new C_OnFinisher(new MDSIOContextWrapper(mds, cb), mds->finisher)); + SnapContext nullsnapc; + for (const auto& [start, len] : inos) { + for (auto i = start; i < start + len ; i += 1) { + filer.purge_range(i, &default_file_layout, nullsnapc, 0, 1, + ceph::real_clock::now(), 0, gather.new_sub()); + } + } + gather.activate(); +} + +// ================================================================================ +// cache trimming + +std::pair<bool, uint64_t> MDCache::trim_lru(uint64_t count, expiremap& expiremap) +{ + bool is_standby_replay = mds->is_standby_replay(); + std::vector<CDentry *> unexpirables; + uint64_t trimmed = 0; + + auto trim_threshold = g_conf().get_val<Option::size_t>("mds_cache_trim_threshold"); + + dout(7) << "trim_lru trimming " << count + << " items from LRU" + << " size=" << lru.lru_get_size() + << " mid=" << lru.lru_get_top() + << " pintail=" << lru.lru_get_pintail() + << " pinned=" << lru.lru_get_num_pinned() + << dendl; + + const uint64_t trim_counter_start = trim_counter.get(); + bool throttled = false; + while (1) { + throttled |= trim_counter_start+trimmed >= trim_threshold; + if (throttled) break; + CDentry *dn = static_cast<CDentry*>(bottom_lru.lru_expire()); + if (!dn) + break; + if (trim_dentry(dn, expiremap)) { + 
unexpirables.push_back(dn); + } else { + trimmed++; + } + } + + for (auto &dn : unexpirables) { + bottom_lru.lru_insert_mid(dn); + } + unexpirables.clear(); + + // trim dentries from the LRU until count is reached + // if mds is in standby_replay and skip trimming the inodes + while (!throttled && (cache_toofull() || count > 0 || is_standby_replay)) { + throttled |= trim_counter_start+trimmed >= trim_threshold; + if (throttled) break; + CDentry *dn = static_cast<CDentry*>(lru.lru_expire()); + if (!dn) { + break; + } + if (is_standby_replay && dn->get_linkage()->inode) { + // we move the inodes that need to be trimmed to the end of the lru queue. + // refer to MDCache::standby_trim_segment + lru.lru_insert_bot(dn); + break; + } else if (trim_dentry(dn, expiremap)) { + unexpirables.push_back(dn); + } else { + trimmed++; + if (count > 0) count--; + } + } + trim_counter.hit(trimmed); + + for (auto &dn : unexpirables) { + lru.lru_insert_mid(dn); + } + unexpirables.clear(); + + dout(7) << "trim_lru trimmed " << trimmed << " items" << dendl; + return std::pair<bool, uint64_t>(throttled, trimmed); +} + +/* + * note: only called while MDS is active or stopping... NOT during recovery. + * however, we may expire a replica whose authority is recovering. 
+ * + * @param count is number of dentries to try to expire + */ +std::pair<bool, uint64_t> MDCache::trim(uint64_t count) +{ + uint64_t used = cache_size(); + uint64_t limit = cache_memory_limit; + expiremap expiremap; + + dout(7) << "trim bytes_used=" << bytes2str(used) + << " limit=" << bytes2str(limit) + << " reservation=" << cache_reservation + << "% count=" << count << dendl; + + // process delayed eval_stray() + stray_manager.advance_delayed(); + + auto result = trim_lru(count, expiremap); + auto& trimmed = result.second; + + // trim non-auth, non-bound subtrees + for (auto p = subtrees.begin(); p != subtrees.end();) { + CDir *dir = p->first; + ++p; + CInode *diri = dir->get_inode(); + if (dir->is_auth()) { + if (diri->is_auth() && !diri->is_base()) { + /* this situation should correspond to an export pin */ + if (dir->get_num_head_items() == 0 && dir->get_num_ref() == 1) { + /* pinned empty subtree, try to drop */ + if (dir->state_test(CDir::STATE_AUXSUBTREE)) { + dout(20) << "trimming empty pinned subtree " << *dir << dendl; + dir->state_clear(CDir::STATE_AUXSUBTREE); + remove_subtree(dir); + diri->close_dirfrag(dir->dirfrag().frag); + } + } + } else if (!diri->is_auth() && !diri->is_base() && dir->get_num_head_items() == 0) { + if (dir->state_test(CDir::STATE_EXPORTING) || + !(mds->is_active() || mds->is_stopping()) || + dir->is_freezing() || dir->is_frozen()) + continue; + + migrator->export_empty_import(dir); + ++trimmed; + } + } else if (!diri->is_auth() && dir->get_num_ref() <= 1) { + // only subtree pin + if (diri->get_num_ref() > diri->get_num_subtree_roots()) { + continue; + } + + // don't trim subtree root if its auth MDS is recovering. + // This simplify the cache rejoin code. + if (dir->is_subtree_root() && rejoin_ack_gather.count(dir->get_dir_auth().first)) + continue; + trim_dirfrag(dir, 0, expiremap); + ++trimmed; + } + } + + // trim root? 
+ if (mds->is_stopping() && root) { + auto&& ls = root->get_dirfrags(); + for (const auto& dir : ls) { + if (dir->get_num_ref() == 1) { // subtree pin + trim_dirfrag(dir, 0, expiremap); + ++trimmed; + } + } + if (root->get_num_ref() == 0) { + trim_inode(0, root, 0, expiremap); + ++trimmed; + } + } + + std::set<mds_rank_t> stopping; + mds->mdsmap->get_mds_set(stopping, MDSMap::STATE_STOPPING); + stopping.erase(mds->get_nodeid()); + for (auto rank : stopping) { + CInode* mdsdir_in = get_inode(MDS_INO_MDSDIR(rank)); + if (!mdsdir_in) + continue; + + auto em = expiremap.emplace(std::piecewise_construct, std::forward_as_tuple(rank), std::forward_as_tuple()); + if (em.second) { + em.first->second = make_message<MCacheExpire>(mds->get_nodeid()); + } + + dout(20) << __func__ << ": try expiring " << *mdsdir_in << " for stopping mds." << mds->get_nodeid() << dendl; + + const bool aborted = expire_recursive(mdsdir_in, expiremap); + if (!aborted) { + dout(20) << __func__ << ": successfully expired mdsdir" << dendl; + auto&& ls = mdsdir_in->get_dirfrags(); + for (auto dir : ls) { + if (dir->get_num_ref() == 1) { // subtree pin + trim_dirfrag(dir, dir, expiremap); + ++trimmed; + } + } + if (mdsdir_in->get_num_ref() == 0) { + trim_inode(NULL, mdsdir_in, NULL, expiremap); + ++trimmed; + } + } else { + dout(20) << __func__ << ": some unexpirable contents in mdsdir" << dendl; + } + } + + // Other rank's base inodes (when I'm stopping) + if (mds->is_stopping()) { + for (set<CInode*>::iterator p = base_inodes.begin(); + p != base_inodes.end();) { + CInode *base_in = *p; + ++p; + if (MDS_INO_IS_MDSDIR(base_in->ino()) && + MDS_INO_MDSDIR_OWNER(base_in->ino()) != mds->get_nodeid()) { + dout(20) << __func__ << ": maybe trimming base: " << *base_in << dendl; + if (base_in->get_num_ref() == 0) { + trim_inode(NULL, base_in, NULL, expiremap); + ++trimmed; + } + } + } + } + + // send any expire messages + send_expire_messages(expiremap); + + return result; +} + +void 
MDCache::send_expire_messages(expiremap& expiremap) +{ + // send expires + for (const auto &p : expiremap) { + if (mds->is_cluster_degraded() && + (mds->mdsmap->get_state(p.first) < MDSMap::STATE_REJOIN || + (mds->mdsmap->get_state(p.first) == MDSMap::STATE_REJOIN && + rejoin_sent.count(p.first) == 0))) { + continue; + } + dout(7) << "sending cache_expire to " << p.first << dendl; + mds->send_message_mds(p.second, p.first); + } + expiremap.clear(); +} + + +bool MDCache::trim_dentry(CDentry *dn, expiremap& expiremap) +{ + dout(12) << "trim_dentry " << *dn << dendl; + + CDentry::linkage_t *dnl = dn->get_linkage(); + + CDir *dir = dn->get_dir(); + ceph_assert(dir); + + CDir *con = get_subtree_root(dir); + if (con) + dout(12) << " in container " << *con << dendl; + else { + dout(12) << " no container; under a not-yet-linked dir" << dendl; + ceph_assert(dn->is_auth()); + } + + // If replica dentry is not readable, it's likely we will receive + // MDentryLink/MDentryUnlink message soon (It's possible we first + // receive a MDentryUnlink message, then MDentryLink message) + // MDentryLink message only replicates an inode, so we should + // avoid trimming the inode's parent dentry. This is because that + // unconnected replicas are problematic for subtree migration. + if (!dn->is_auth() && !dn->lock.can_read(-1) && + !dn->get_dir()->get_inode()->is_stray()) + return true; + + // adjust the dir state + // NOTE: we can safely remove a clean, null dentry without effecting + // directory completeness. + // (check this _before_ we unlink the inode, below!) + bool clear_complete = false; + if (dn->is_auth() && !(dnl->is_null() && dn->is_clean())) + clear_complete = true; + + // unlink the dentry + if (dnl->is_remote()) { + // just unlink. + dir->unlink_inode(dn, false); + } else if (dnl->is_primary()) { + // expire the inode, too. 
+ CInode *in = dnl->get_inode(); + ceph_assert(in); + if (trim_inode(dn, in, con, expiremap)) + return true; // purging stray instead of trimming + } else { + ceph_assert(dnl->is_null()); + } + + if (!dn->is_auth()) { + // notify dentry authority. + mds_authority_t auth = dn->authority(); + + for (int p=0; p<2; p++) { + mds_rank_t a = auth.first; + if (p) a = auth.second; + if (a < 0 || (p == 1 && auth.second == auth.first)) break; + if (mds->get_nodeid() == auth.second && + con->is_importing()) break; // don't send any expire while importing. + if (a == mds->get_nodeid()) continue; // on export, ignore myself. + + dout(12) << " sending expire to mds." << a << " on " << *dn << dendl; + ceph_assert(a != mds->get_nodeid()); + auto em = expiremap.emplace(std::piecewise_construct, std::forward_as_tuple(a), std::forward_as_tuple()); + if (em.second) + em.first->second = make_message<MCacheExpire>(mds->get_nodeid()); + em.first->second->add_dentry(con->dirfrag(), dir->dirfrag(), dn->get_name(), dn->last, dn->get_replica_nonce()); + } + } + + if (clear_complete) { + if (dn->last == CEPH_NOSNAP) + dir->add_to_bloom(dn); + dir->state_clear(CDir::STATE_COMPLETE); + } + + // remove dentry + dir->remove_dentry(dn); + + if (mds->logger) mds->logger->inc(l_mds_inodes_expired); + return false; +} + + +void MDCache::trim_dirfrag(CDir *dir, CDir *con, expiremap& expiremap) +{ + dout(15) << "trim_dirfrag " << *dir << dendl; + + if (dir->is_subtree_root()) { + ceph_assert(!dir->is_auth() || + (!dir->is_replicated() && dir->inode->is_base())); + remove_subtree(dir); // remove from subtree map + } + ceph_assert(dir->get_num_ref() == 0); + + CInode *in = dir->get_inode(); + + if (!dir->is_auth()) { + mds_authority_t auth = dir->authority(); + + // was this an auth delegation? 
(if so, slightly modified container) + dirfrag_t condf; + if (dir->is_subtree_root()) { + dout(12) << " subtree root, container is " << *dir << dendl; + con = dir; + condf = dir->dirfrag(); + } else { + condf = con->dirfrag(); + } + + for (int p=0; p<2; p++) { + mds_rank_t a = auth.first; + if (p) a = auth.second; + if (a < 0 || (p == 1 && auth.second == auth.first)) break; + if (mds->get_nodeid() == auth.second && + con->is_importing()) break; // don't send any expire while importing. + if (a == mds->get_nodeid()) continue; // on export, ignore myself. + + dout(12) << " sending expire to mds." << a << " on " << *dir << dendl; + ceph_assert(a != mds->get_nodeid()); + auto em = expiremap.emplace(std::piecewise_construct, std::forward_as_tuple(a), std::forward_as_tuple()); + if (em.second) + em.first->second = make_message<MCacheExpire>(mds->get_nodeid()); /* new */ + em.first->second->add_dir(condf, dir->dirfrag(), dir->replica_nonce); + } + } + + in->close_dirfrag(dir->dirfrag().frag); +} + +/** + * Try trimming an inode from the cache + * + * @return true if the inode is still in cache, else false if it was trimmed + */ +bool MDCache::trim_inode(CDentry *dn, CInode *in, CDir *con, expiremap& expiremap) +{ + dout(15) << "trim_inode " << *in << dendl; + ceph_assert(in->get_num_ref() == 0); + + if (in->is_dir()) { + // If replica inode's dirfragtreelock is not readable, it's likely + // some dirfrags of the inode are being fragmented and we will receive + // MMDSFragmentNotify soon. MMDSFragmentNotify only replicates the new + // dirfrags, so we should avoid trimming these dirfrags' parent inode. + // This is because that unconnected replicas are problematic for + // subtree migration. + // + if (!in->is_auth() && !mds->locker->rdlock_try(&in->dirfragtreelock, -1)) { + return true; + } + + // DIR + auto&& dfls = in->get_dirfrags(); + for (const auto& dir : dfls) { + ceph_assert(!dir->is_subtree_root()); + trim_dirfrag(dir, con ? 
con:dir, expiremap); // if no container (e.g. root dirfrag), use *p + } + } + + // INODE + if (in->is_auth()) { + // eval stray after closing dirfrags + if (dn && !dn->state_test(CDentry::STATE_PURGING)) { + maybe_eval_stray(in); + if (dn->state_test(CDentry::STATE_PURGING) || dn->get_num_ref() > 0) + return true; + } + } else { + mds_authority_t auth = in->authority(); + + dirfrag_t df; + if (con) + df = con->dirfrag(); + else + df = dirfrag_t(0,frag_t()); // must be a root or stray inode. + + for (int p=0; p<2; p++) { + mds_rank_t a = auth.first; + if (p) a = auth.second; + if (a < 0 || (p == 1 && auth.second == auth.first)) break; + if (con && mds->get_nodeid() == auth.second && + con->is_importing()) break; // don't send any expire while importing. + if (a == mds->get_nodeid()) continue; // on export, ignore myself. + + dout(12) << " sending expire to mds." << a << " on " << *in << dendl; + ceph_assert(a != mds->get_nodeid()); + auto em = expiremap.emplace(std::piecewise_construct, std::forward_as_tuple(a), std::forward_as_tuple()); + if (em.second) + em.first->second = make_message<MCacheExpire>(mds->get_nodeid()); /* new */ + em.first->second->add_inode(df, in->vino(), in->get_replica_nonce()); + } + } + + /* + if (in->is_auth()) { + if (in->hack_accessed) + mds->logger->inc("outt"); + else { + mds->logger->inc("outut"); + mds->logger->fset("oututl", ceph_clock_now() - in->hack_load_stamp); + } + } + */ + + // unlink + if (dn) + dn->get_dir()->unlink_inode(dn, false); + remove_inode(in); + return false; +} + + +/** + * trim_non_auth - remove any non-auth items from our cache + * + * this reduces the amount of non-auth metadata in our cache, reducing the + * load incurred by the rejoin phase. + * + * the only non-auth items that remain are those that are needed to + * attach our own subtrees to the root. + * + * when we are done, all dentries will be in the top bit of the lru. 
+ * + * why we have to do this: + * we may not have accurate linkage for non-auth items. which means we will + * know which subtree it falls into, and can not be sure to declare it to the + * correct authority. + */ +void MDCache::trim_non_auth() +{ + dout(7) << "trim_non_auth" << dendl; + + // temporarily pin all subtree roots + for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin(); + p != subtrees.end(); + ++p) + p->first->get(CDir::PIN_SUBTREETEMP); + + list<CDentry*> auth_list; + + // trim non-auth items from the lru + for (;;) { + CDentry *dn = NULL; + if (bottom_lru.lru_get_size() > 0) + dn = static_cast<CDentry*>(bottom_lru.lru_expire()); + if (!dn && lru.lru_get_size() > 0) + dn = static_cast<CDentry*>(lru.lru_expire()); + if (!dn) + break; + + CDentry::linkage_t *dnl = dn->get_linkage(); + + if (dn->is_auth()) { + // add back into lru (at the top) + auth_list.push_back(dn); + + if (dnl->is_remote() && dnl->get_inode() && !dnl->get_inode()->is_auth()) + dn->unlink_remote(dnl); + } else { + // non-auth. expire. + CDir *dir = dn->get_dir(); + ceph_assert(dir); + + // unlink the dentry + dout(10) << " removing " << *dn << dendl; + if (dnl->is_remote()) { + dir->unlink_inode(dn, false); + } + else if (dnl->is_primary()) { + CInode *in = dnl->get_inode(); + dout(10) << " removing " << *in << dendl; + auto&& ls = in->get_dirfrags(); + for (const auto& subdir : ls) { + ceph_assert(!subdir->is_subtree_root()); + in->close_dirfrag(subdir->dirfrag().frag); + } + dir->unlink_inode(dn, false); + remove_inode(in); + } + else { + ceph_assert(dnl->is_null()); + } + + ceph_assert(!dir->has_bloom()); + dir->remove_dentry(dn); + // adjust the dir state + dir->state_clear(CDir::STATE_COMPLETE); // dir incomplete! 
+ // close empty non-auth dirfrag + if (!dir->is_subtree_root() && dir->get_num_any() == 0) + dir->inode->close_dirfrag(dir->get_frag()); + } + } + + for (const auto& dn : auth_list) { + if (dn->state_test(CDentry::STATE_BOTTOMLRU)) + bottom_lru.lru_insert_mid(dn); + else + lru.lru_insert_top(dn); + } + + // move everything in the pintail to the top bit of the lru. + lru.lru_touch_entire_pintail(); + + // unpin all subtrees + for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin(); + p != subtrees.end(); + ++p) + p->first->put(CDir::PIN_SUBTREETEMP); + + if (lru.lru_get_size() == 0 && + bottom_lru.lru_get_size() == 0) { + // root, stray, etc.? + auto p = inode_map.begin(); + while (p != inode_map.end()) { + CInode *in = p->second; + ++p; + if (!in->is_auth()) { + auto&& ls = in->get_dirfrags(); + for (const auto& dir : ls) { + dout(10) << " removing " << *dir << dendl; + ceph_assert(dir->get_num_ref() == 1); // SUBTREE + remove_subtree(dir); + in->close_dirfrag(dir->dirfrag().frag); + } + dout(10) << " removing " << *in << dendl; + ceph_assert(!in->get_parent_dn()); + ceph_assert(in->get_num_ref() == 0); + remove_inode(in); + } + } + } + + show_subtrees(); +} + +/** + * Recursively trim the subtree rooted at directory to remove all + * CInodes/CDentrys/CDirs that aren't links to remote MDSes, or ancestors + * of those links. This is used to clear invalid data out of the cache. + * Note that it doesn't clear the passed-in directory, since that's not + * always safe. 
+ */ +bool MDCache::trim_non_auth_subtree(CDir *dir) +{ + dout(10) << "trim_non_auth_subtree(" << dir << ") " << *dir << dendl; + + bool keep_dir = !can_trim_non_auth_dirfrag(dir); + + auto j = dir->begin(); + auto i = j; + while (j != dir->end()) { + i = j++; + CDentry *dn = i->second; + dout(10) << "trim_non_auth_subtree(" << dir << ") Checking dentry " << dn << dendl; + CDentry::linkage_t *dnl = dn->get_linkage(); + if (dnl->is_primary()) { // check for subdirectories, etc + CInode *in = dnl->get_inode(); + bool keep_inode = false; + if (in->is_dir()) { + auto&& subdirs = in->get_dirfrags(); + for (const auto& subdir : subdirs) { + if (subdir->is_subtree_root()) { + keep_inode = true; + dout(10) << "trim_non_auth_subtree(" << dir << ") keeping " << *subdir << dendl; + } else { + if (trim_non_auth_subtree(subdir)) + keep_inode = true; + else { + in->close_dirfrag(subdir->get_frag()); + dir->state_clear(CDir::STATE_COMPLETE); // now incomplete! + } + } + } + + } + if (!keep_inode) { // remove it! + dout(20) << "trim_non_auth_subtree(" << dir << ") removing inode " << in << " with dentry" << dn << dendl; + dir->unlink_inode(dn, false); + remove_inode(in); + ceph_assert(!dir->has_bloom()); + dir->remove_dentry(dn); + } else { + dout(20) << "trim_non_auth_subtree(" << dir << ") keeping inode " << in << " with dentry " << dn <<dendl; + dn->clear_auth(); + in->state_clear(CInode::STATE_AUTH); + } + } else if (keep_dir && dnl->is_null()) { // keep null dentry for peer rollback + dout(20) << "trim_non_auth_subtree(" << dir << ") keeping dentry " << dn <<dendl; + } else { // just remove it + dout(20) << "trim_non_auth_subtree(" << dir << ") removing dentry " << dn << dendl; + if (dnl->is_remote()) + dir->unlink_inode(dn, false); + dir->remove_dentry(dn); + } + } + dir->state_clear(CDir::STATE_AUTH); + /** + * We've now checked all our children and deleted those that need it. + * Now return to caller, and tell them if *we're* a keeper. 
+ */ + return keep_dir || dir->get_num_any(); +} + +/* + * during replay, when we determine a subtree is no longer ours, we + * try to trim it from our cache. because subtrees must be connected + * to the root, the fact that we can trim this tree may mean that our + * children or parents can also be trimmed. + */ +void MDCache::try_trim_non_auth_subtree(CDir *dir) +{ + dout(10) << "try_trim_nonauth_subtree " << *dir << dendl; + + // can we now trim child subtrees? + set<CDir*> bounds; + get_subtree_bounds(dir, bounds); + for (set<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p) { + CDir *bd = *p; + if (bd->get_dir_auth().first != mds->get_nodeid() && // we are not auth + bd->get_num_any() == 0 && // and empty + can_trim_non_auth_dirfrag(bd)) { + CInode *bi = bd->get_inode(); + dout(10) << " closing empty non-auth child subtree " << *bd << dendl; + remove_subtree(bd); + bd->mark_clean(); + bi->close_dirfrag(bd->get_frag()); + } + } + + if (trim_non_auth_subtree(dir)) { + // keep + try_subtree_merge(dir); + } else { + // can we trim this subtree (and possibly our ancestors) too? + while (true) { + CInode *diri = dir->get_inode(); + if (diri->is_base()) { + if (!diri->is_root() && diri->authority().first != mds->get_nodeid()) { + dout(10) << " closing empty non-auth subtree " << *dir << dendl; + remove_subtree(dir); + dir->mark_clean(); + diri->close_dirfrag(dir->get_frag()); + + dout(10) << " removing " << *diri << dendl; + ceph_assert(!diri->get_parent_dn()); + ceph_assert(diri->get_num_ref() == 0); + remove_inode(diri); + } + break; + } + + CDir *psub = get_subtree_root(diri->get_parent_dir()); + dout(10) << " parent subtree is " << *psub << dendl; + if (psub->get_dir_auth().first == mds->get_nodeid()) + break; // we are auth, keep. 
+ + dout(10) << " closing empty non-auth subtree " << *dir << dendl; + remove_subtree(dir); + dir->mark_clean(); + diri->close_dirfrag(dir->get_frag()); + + dout(10) << " parent subtree also non-auth: " << *psub << dendl; + if (trim_non_auth_subtree(psub)) + break; + dir = psub; + } + } + + show_subtrees(); +} + +void MDCache::standby_trim_segment(LogSegment *ls) +{ + auto try_trim_inode = [this](CInode *in) { + if (in->get_num_ref() == 0 && + !in->item_open_file.is_on_list() && + in->parent != NULL && + in->parent->get_num_ref() == 0){ + touch_dentry_bottom(in->parent); + } + }; + + auto try_trim_dentry = [this](CDentry *dn) { + if (dn->get_num_ref() > 0) + return; + auto in = dn->get_linkage()->inode; + if(in && in->item_open_file.is_on_list()) + return; + touch_dentry_bottom(dn); + }; + + ls->new_dirfrags.clear_list(); + ls->open_files.clear_list(); + + while (!ls->dirty_dirfrags.empty()) { + CDir *dir = ls->dirty_dirfrags.front(); + dir->mark_clean(); + if (dir->inode) + try_trim_inode(dir->inode); + } + while (!ls->dirty_inodes.empty()) { + CInode *in = ls->dirty_inodes.front(); + in->mark_clean(); + try_trim_inode(in); + } + while (!ls->dirty_dentries.empty()) { + CDentry *dn = ls->dirty_dentries.front(); + dn->mark_clean(); + try_trim_dentry(dn); + } + while (!ls->dirty_parent_inodes.empty()) { + CInode *in = ls->dirty_parent_inodes.front(); + in->clear_dirty_parent(); + try_trim_inode(in); + } + while (!ls->dirty_dirfrag_dir.empty()) { + CInode *in = ls->dirty_dirfrag_dir.front(); + in->filelock.remove_dirty(); + try_trim_inode(in); + } + while (!ls->dirty_dirfrag_nest.empty()) { + CInode *in = ls->dirty_dirfrag_nest.front(); + in->nestlock.remove_dirty(); + try_trim_inode(in); + } + while (!ls->dirty_dirfrag_dirfragtree.empty()) { + CInode *in = ls->dirty_dirfrag_dirfragtree.front(); + in->dirfragtreelock.remove_dirty(); + try_trim_inode(in); + } + while (!ls->truncating_inodes.empty()) { + auto it = ls->truncating_inodes.begin(); + CInode *in = *it; + 
ls->truncating_inodes.erase(it); + in->put(CInode::PIN_TRUNCATING); + try_trim_inode(in); + } +} + +void MDCache::handle_cache_expire(const cref_t<MCacheExpire> &m) +{ + mds_rank_t from = mds_rank_t(m->get_from()); + + dout(7) << "cache_expire from mds." << from << dendl; + + if (mds->get_state() < MDSMap::STATE_REJOIN) { + return; + } + + set<SimpleLock *> gather_locks; + // loop over realms + for (const auto &p : m->realms) { + // check container? + if (p.first.ino > 0) { + CInode *expired_inode = get_inode(p.first.ino); + ceph_assert(expired_inode); // we had better have this. + CDir *parent_dir = expired_inode->get_approx_dirfrag(p.first.frag); + ceph_assert(parent_dir); + + int export_state = -1; + if (parent_dir->is_auth() && parent_dir->is_exporting()) { + export_state = migrator->get_export_state(parent_dir); + ceph_assert(export_state >= 0); + } + + if (!parent_dir->is_auth() || + (export_state != -1 && + ((export_state == Migrator::EXPORT_WARNING && + migrator->export_has_warned(parent_dir,from)) || + export_state == Migrator::EXPORT_EXPORTING || + export_state == Migrator::EXPORT_LOGGINGFINISH || + (export_state == Migrator::EXPORT_NOTIFYING && + !migrator->export_has_notified(parent_dir,from))))) { + + // not auth. 
+ dout(7) << "delaying nonauth|warned expires for " << *parent_dir << dendl; + ceph_assert(parent_dir->is_frozen_tree_root()); + + // make a message container + + auto em = delayed_expire[parent_dir].emplace(std::piecewise_construct, std::forward_as_tuple(from), std::forward_as_tuple()); + if (em.second) + em.first->second = make_message<MCacheExpire>(from); /* new */ + + // merge these expires into it + em.first->second->add_realm(p.first, p.second); + continue; + } + ceph_assert(export_state <= Migrator::EXPORT_PREPPING || + (export_state == Migrator::EXPORT_WARNING && + !migrator->export_has_warned(parent_dir, from))); + + dout(7) << "expires for " << *parent_dir << dendl; + } else { + dout(7) << "containerless expires (root, stray inodes)" << dendl; + } + + // INODES + for (const auto &q : p.second.inodes) { + CInode *in = get_inode(q.first); + unsigned nonce = q.second; + + if (!in) { + dout(0) << " inode expire on " << q.first << " from " << from + << ", don't have it" << dendl; + ceph_assert(in); + } + ceph_assert(in->is_auth()); + dout(20) << __func__ << ": expiring inode " << *in << dendl; + + // check nonce + if (nonce == in->get_replica_nonce(from)) { + // remove from our cached_by + dout(7) << " inode expire on " << *in << " from mds." << from + << " cached_by was " << in->get_replicas() << dendl; + inode_remove_replica(in, from, false, gather_locks); + } + else { + // this is an old nonce, ignore expire. + dout(7) << " inode expire on " << *in << " from mds." 
<< from + << " with old nonce " << nonce + << " (current " << in->get_replica_nonce(from) << "), dropping" + << dendl; + } + } + + // DIRS + for (const auto &q : p.second.dirs) { + CDir *dir = get_dirfrag(q.first); + unsigned nonce = q.second; + + if (!dir) { + CInode *diri = get_inode(q.first.ino); + if (diri) { + if (mds->is_rejoin() && + rejoin_ack_gather.count(mds->get_nodeid()) && // haven't sent rejoin ack yet + !diri->is_replica(from)) { + auto&& ls = diri->get_nested_dirfrags(); + dout(7) << " dir expire on dirfrag " << q.first << " from mds." << from + << " while rejoining, inode isn't replicated" << dendl; + for (const auto& d : ls) { + dir = d; + if (dir->is_replica(from)) { + dout(7) << " dir expire on " << *dir << " from mds." << from << dendl; + dir->remove_replica(from); + } + } + continue; + } + CDir *other = diri->get_approx_dirfrag(q.first.frag); + if (other) { + dout(7) << " dir expire on dirfrag " << q.first << " from mds." << from + << " have " << *other << ", mismatched frags, dropping" << dendl; + continue; + } + } + dout(0) << " dir expire on " << q.first << " from " << from + << ", don't have it" << dendl; + ceph_assert(dir); + } + dout(20) << __func__ << ": expiring dirfrag " << *dir << dendl; + + ceph_assert(dir->is_auth()); + + // check nonce + if (nonce == dir->get_replica_nonce(from)) { + // remove from our cached_by + dout(7) << " dir expire on " << *dir << " from mds." << from + << " replicas was " << dir->get_replicas() << dendl; + dir->remove_replica(from); + } + else { + // this is an old nonce, ignore expire. + dout(7) << " dir expire on " << *dir << " from mds." 
<< from + << " with old nonce " << nonce << " (current " << dir->get_replica_nonce(from) + << "), dropping" << dendl; + } + } + + // DENTRIES + for (const auto &pd : p.second.dentries) { + dout(10) << " dn expires in dir " << pd.first << dendl; + CInode *diri = get_inode(pd.first.ino); + ceph_assert(diri); + CDir *dir = diri->get_dirfrag(pd.first.frag); + + if (!dir) { + dout(0) << " dn expires on " << pd.first << " from " << from + << ", must have refragmented" << dendl; + } else { + ceph_assert(dir->is_auth()); + } + + for (const auto &p : pd.second) { + unsigned nonce = p.second; + CDentry *dn; + + if (dir) { + dn = dir->lookup(p.first.first, p.first.second); + } else { + // which dirfrag for this dentry? + CDir *dir = diri->get_dirfrag(diri->pick_dirfrag(p.first.first)); + ceph_assert(dir); + ceph_assert(dir->is_auth()); + dn = dir->lookup(p.first.first, p.first.second); + } + + if (!dn) { + if (dir) + dout(0) << " missing dentry for " << p.first.first << " snap " << p.first.second << " in " << *dir << dendl; + else + dout(0) << " missing dentry for " << p.first.first << " snap " << p.first.second << dendl; + } + ceph_assert(dn); + + if (nonce == dn->get_replica_nonce(from)) { + dout(7) << " dentry_expire on " << *dn << " from mds." << from << dendl; + dentry_remove_replica(dn, from, gather_locks); + } + else { + dout(7) << " dentry_expire on " << *dn << " from mds." 
<< from + << " with old nonce " << nonce << " (current " << dn->get_replica_nonce(from) + << "), dropping" << dendl; + } + } + } + } + + for (set<SimpleLock*>::iterator p = gather_locks.begin(); p != gather_locks.end(); ++p) { + if (!(*p)->is_stable()) + mds->locker->eval_gather(*p); + } +} + +void MDCache::process_delayed_expire(CDir *dir) +{ + dout(7) << "process_delayed_expire on " << *dir << dendl; + for (const auto &p : delayed_expire[dir]) { + handle_cache_expire(p.second); + } + delayed_expire.erase(dir); +} + +void MDCache::discard_delayed_expire(CDir *dir) +{ + dout(7) << "discard_delayed_expire on " << *dir << dendl; + delayed_expire.erase(dir); +} + +void MDCache::inode_remove_replica(CInode *in, mds_rank_t from, bool rejoin, + set<SimpleLock *>& gather_locks) +{ + in->remove_replica(from); + in->set_mds_caps_wanted(from, 0); + + // note: this code calls _eval more often than it needs to! + // fix lock + if (in->authlock.remove_replica(from)) gather_locks.insert(&in->authlock); + if (in->linklock.remove_replica(from)) gather_locks.insert(&in->linklock); + if (in->snaplock.remove_replica(from)) gather_locks.insert(&in->snaplock); + if (in->xattrlock.remove_replica(from)) gather_locks.insert(&in->xattrlock); + if (in->flocklock.remove_replica(from)) gather_locks.insert(&in->flocklock); + if (in->policylock.remove_replica(from)) gather_locks.insert(&in->policylock); + + // If 'rejoin' is true and the scatter lock is in LOCK_MIX_* state. + // Don't remove the recovering mds from lock's gathering list because + // it may hold rejoined wrlocks. 
+ if (in->dirfragtreelock.remove_replica(from, rejoin)) gather_locks.insert(&in->dirfragtreelock); + if (in->filelock.remove_replica(from, rejoin)) gather_locks.insert(&in->filelock); + if (in->nestlock.remove_replica(from, rejoin)) gather_locks.insert(&in->nestlock); +} + +void MDCache::dentry_remove_replica(CDentry *dn, mds_rank_t from, set<SimpleLock *>& gather_locks) +{ + dn->remove_replica(from); + + // fix lock + if (dn->lock.remove_replica(from)) + gather_locks.insert(&dn->lock); + + // Replicated strays might now be elegible for purge + CDentry::linkage_t *dnl = dn->get_projected_linkage(); + if (dnl->is_primary()) { + maybe_eval_stray(dnl->get_inode()); + } +} + +void MDCache::trim_client_leases() +{ + utime_t now = ceph_clock_now(); + + dout(10) << "trim_client_leases" << dendl; + + std::size_t pool = 0; + for (const auto& list : client_leases) { + pool += 1; + if (list.empty()) + continue; + + auto before = list.size(); + while (!list.empty()) { + ClientLease *r = list.front(); + if (r->ttl > now) break; + CDentry *dn = static_cast<CDentry*>(r->parent); + dout(10) << " expiring client." 
<< r->client << " lease of " << *dn << dendl; + dn->remove_client_lease(r, mds->locker); + } + auto after = list.size(); + dout(10) << "trim_client_leases pool " << pool << " trimmed " + << (before-after) << " leases, " << after << " left" << dendl; + } +} + +void MDCache::check_memory_usage() +{ + static MemoryModel mm(g_ceph_context); + static MemoryModel::snap last; + mm.sample(&last); + static MemoryModel::snap baseline = last; + + // check client caps + ceph_assert(CInode::count() == inode_map.size() + snap_inode_map.size() + num_shadow_inodes); + double caps_per_inode = 0.0; + if (CInode::count()) + caps_per_inode = (double)Capability::count() / (double)CInode::count(); + + dout(2) << "Memory usage: " + << " total " << last.get_total() + << ", rss " << last.get_rss() + << ", heap " << last.get_heap() + << ", baseline " << baseline.get_heap() + << ", " << num_inodes_with_caps << " / " << CInode::count() << " inodes have caps" + << ", " << Capability::count() << " caps, " << caps_per_inode << " caps per inode" + << dendl; + + mds->update_mlogger(); + mds->mlogger->set(l_mdm_rss, last.get_rss()); + mds->mlogger->set(l_mdm_heap, last.get_heap()); +} + + + +// ========================================================================================= +// shutdown + +class C_MDC_ShutdownCheck : public MDCacheContext { +public: + explicit C_MDC_ShutdownCheck(MDCache *m) : MDCacheContext(m) {} + void finish(int) override { + mdcache->shutdown_check(); + } +}; + +void MDCache::shutdown_check() +{ + dout(0) << "shutdown_check at " << ceph_clock_now() << dendl; + + // cache + char old_val[32] = { 0 }; + char *o = old_val; + g_conf().get_val("debug_mds", &o, sizeof(old_val)); + g_conf().set_val("debug_mds", "10"); + g_conf().apply_changes(nullptr); + show_cache(); + g_conf().set_val("debug_mds", old_val); + g_conf().apply_changes(nullptr); + mds->timer.add_event_after(g_conf()->mds_shutdown_check, new C_MDC_ShutdownCheck(this)); + + // this + dout(0) << "lru size now " << 
lru.lru_get_size() << "/" << bottom_lru.lru_get_size() << dendl; + dout(0) << "log len " << mds->mdlog->get_num_events() << dendl; + + + if (mds->objecter->is_active()) { + dout(0) << "objecter still active" << dendl; + mds->objecter->dump_active(); + } +} + + +void MDCache::shutdown_start() +{ + dout(5) << "shutdown_start" << dendl; + + if (g_conf()->mds_shutdown_check) + mds->timer.add_event_after(g_conf()->mds_shutdown_check, new C_MDC_ShutdownCheck(this)); + + // g_conf()->debug_mds = 10; +} + + + +bool MDCache::shutdown_pass() +{ + dout(7) << "shutdown_pass" << dendl; + + if (mds->is_stopped()) { + dout(7) << " already shut down" << dendl; + show_cache(); + show_subtrees(); + return true; + } + + // empty stray dir + bool strays_all_exported = shutdown_export_strays(); + + // trim cache + trim(UINT64_MAX); + dout(5) << "lru size now " << lru.lru_get_size() << "/" << bottom_lru.lru_get_size() << dendl; + + // Export all subtrees to another active (usually rank 0) if not rank 0 + int num_auth_subtree = 0; + if (!subtrees.empty() && mds->get_nodeid() != 0) { + dout(7) << "looking for subtrees to export" << dendl; + std::vector<CDir*> ls; + for (auto& [dir, bounds] : subtrees) { + dout(10) << " examining " << *dir << " bounds " << bounds << dendl; + if (dir->get_inode()->is_mdsdir() || !dir->is_auth()) + continue; + num_auth_subtree++; + if (dir->is_frozen() || + dir->is_freezing() || + dir->is_ambiguous_dir_auth() || + dir->state_test(CDir::STATE_EXPORTING) || + dir->get_inode()->is_ephemerally_pinned()) { + continue; + } + ls.push_back(dir); + } + + migrator->clear_export_queue(); + // stopping mds does not call MDBalancer::tick() + mds->balancer->handle_export_pins(); + for (const auto& dir : ls) { + mds_rank_t dest = dir->get_inode()->authority().first; + if (dest > 0 && !mds->mdsmap->is_active(dest)) + dest = 0; + dout(7) << "sending " << *dir << " back to mds." 
<< dest << dendl; + migrator->export_dir_nicely(dir, dest); + } + } + + if (!strays_all_exported) { + dout(7) << "waiting for strays to migrate" << dendl; + return false; + } + + if (num_auth_subtree > 0) { + ceph_assert(mds->get_nodeid() > 0); + dout(7) << "still have " << num_auth_subtree << " auth subtrees" << dendl; + show_subtrees(); + return false; + } + + // close out any sessions (and open files!) before we try to trim the log, etc. + if (mds->sessionmap.have_unclosed_sessions()) { + if (!mds->server->terminating_sessions) + mds->server->terminate_sessions(); + return false; + } + + // Fully trim the log so that all objects in cache are clean and may be + // trimmed by a future MDCache::trim. Note that MDSRank::tick does not + // trim the log such that the cache eventually becomes clean. + if (mds->mdlog->get_num_segments() > 0) { + auto ls = mds->mdlog->get_current_segment(); + if (ls->num_events > 1 || !ls->dirty_dirfrags.empty()) { + // Current segment contains events other than subtreemap or + // there are dirty dirfrags (see CDir::log_mark_dirty()) + mds->mdlog->start_new_segment(); + mds->mdlog->flush(); + } + } + mds->mdlog->trim_all(); + if (mds->mdlog->get_num_segments() > 1) { + dout(7) << "still >1 segments, waiting for log to trim" << dendl; + return false; + } + + // drop our reference to our stray dir inode + for (int i = 0; i < NUM_STRAY; ++i) { + if (strays[i] && + strays[i]->state_test(CInode::STATE_STRAYPINNED)) { + strays[i]->state_clear(CInode::STATE_STRAYPINNED); + strays[i]->put(CInode::PIN_STRAY); + strays[i]->put_stickydirs(); + } + } + + CDir *mydir = myin ? myin->get_dirfrag(frag_t()) : NULL; + if (mydir && !mydir->is_subtree_root()) + mydir = NULL; + + // subtrees map not empty yet? + if (subtrees.size() > (mydir ? 
1 : 0)) { + dout(7) << "still have " << num_subtrees() << " subtrees" << dendl; + show_subtrees(); + migrator->show_importing(); + migrator->show_exporting(); + if (!migrator->is_importing() && !migrator->is_exporting()) + show_cache(); + return false; + } + ceph_assert(!migrator->is_exporting()); + ceph_assert(!migrator->is_importing()); + + // replicas may dirty scatter locks + if (myin && myin->is_replicated()) { + dout(7) << "still have replicated objects" << dendl; + return false; + } + + if ((myin && myin->get_num_auth_pins()) || + (mydir && (mydir->get_auth_pins() || mydir->get_dir_auth_pins()))) { + dout(7) << "still have auth pinned objects" << dendl; + return false; + } + + // (only do this once!) + if (!mds->mdlog->is_capped()) { + dout(7) << "capping the mdlog" << dendl; + mds->mdlog->cap(); + } + + if (!mds->mdlog->empty()) + mds->mdlog->trim(0); + + if (!mds->mdlog->empty()) { + dout(7) << "waiting for log to flush.. " << mds->mdlog->get_num_events() + << " in " << mds->mdlog->get_num_segments() << " segments" << dendl; + return false; + } + + if (!did_shutdown_log_cap) { + // flush journal header + dout(7) << "writing header for (now-empty) journal" << dendl; + ceph_assert(mds->mdlog->empty()); + mds->mdlog->write_head(0); + // NOTE: filer active checker below will block us until this completes. + did_shutdown_log_cap = true; + return false; + } + + // filer active? 
+ if (mds->objecter->is_active()) { + dout(7) << "objecter still active" << dendl; + mds->objecter->dump_active(); + return false; + } + + // trim what we can from the cache + if (lru.lru_get_size() > 0 || bottom_lru.lru_get_size() > 0) { + dout(7) << "there's still stuff in the cache: " << lru.lru_get_size() << "/" << bottom_lru.lru_get_size() << dendl; + show_cache(); + //dump(); + return false; + } + + // make mydir subtree go away + if (mydir) { + if (mydir->get_num_ref() > 1) { // subtree pin + dout(7) << "there's still reference to mydir " << *mydir << dendl; + show_cache(); + return false; + } + + remove_subtree(mydir); + myin->close_dirfrag(mydir->get_frag()); + } + ceph_assert(subtrees.empty()); + + if (myin) { + remove_inode(myin); + ceph_assert(!myin); + } + + if (global_snaprealm) { + remove_inode(global_snaprealm->inode); + global_snaprealm = nullptr; + } + + // done! + dout(5) << "shutdown done." << dendl; + return true; +} + +bool MDCache::shutdown_export_strays() +{ + static const unsigned MAX_EXPORTING = 100; + + if (mds->get_nodeid() == 0) + return true; + + if (shutdown_exporting_strays.size() * 3 >= MAX_EXPORTING * 2) + return false; + + dout(10) << "shutdown_export_strays " << shutdown_export_next.first + << " '" << shutdown_export_next.second << "'" << dendl; + + bool mds0_active = mds->mdsmap->is_active(mds_rank_t(0)); + bool all_exported = false; + +again: + auto next = shutdown_export_next; + + for (int i = 0; i < NUM_STRAY; ++i) { + CInode *strayi = strays[i]; + if (!strayi || + !strayi->state_test(CInode::STATE_STRAYPINNED)) + continue; + if (strayi->ino() < next.first.ino) + continue; + + deque<CDir*> dfls; + strayi->get_dirfrags(dfls); + + while (!dfls.empty()) { + CDir *dir = dfls.front(); + dfls.pop_front(); + + if (dir->dirfrag() < next.first) + continue; + if (next.first < dir->dirfrag()) { + next.first = dir->dirfrag(); + next.second.clear(); + } + + if (!dir->is_complete()) { + MDSContext *fin = nullptr; + if 
(shutdown_exporting_strays.empty()) { + fin = new MDSInternalContextWrapper(mds, + new LambdaContext([this](int r) { + shutdown_export_strays(); + }) + ); + } + dir->fetch(fin); + goto done; + } + + CDir::dentry_key_map::iterator it; + if (next.second.empty()) { + it = dir->begin(); + } else { + auto hash = ceph_frag_value(strayi->hash_dentry_name(next.second)); + it = dir->lower_bound(dentry_key_t(0, next.second, hash)); + } + + for (; it != dir->end(); ++it) { + CDentry *dn = it->second; + CDentry::linkage_t *dnl = dn->get_projected_linkage(); + if (dnl->is_null()) + continue; + + if (!mds0_active && !dn->state_test(CDentry::STATE_PURGING)) { + next.second = it->first.name; + goto done; + } + + auto ret = shutdown_exporting_strays.insert(dnl->get_inode()->ino()); + if (!ret.second) { + dout(10) << "already exporting/purging " << *dn << dendl; + continue; + } + + // Don't try to migrate anything that is actually + // being purged right now + if (!dn->state_test(CDentry::STATE_PURGING)) + stray_manager.migrate_stray(dn, mds_rank_t(0)); // send to root! 
+ + if (shutdown_exporting_strays.size() >= MAX_EXPORTING) { + ++it; + if (it != dir->end()) { + next.second = it->first.name; + } else { + if (dfls.empty()) + next.first.ino.val++; + else + next.first = dfls.front()->dirfrag(); + next.second.clear(); + } + goto done; + } + } + } + } + + if (shutdown_exporting_strays.empty()) { + dirfrag_t first_df(MDS_INO_STRAY(mds->get_nodeid(), 0), 0); + if (first_df < shutdown_export_next.first || + !shutdown_export_next.second.empty()) { + shutdown_export_next.first = first_df; + shutdown_export_next.second.clear(); + goto again; + } + all_exported = true; + } + +done: + shutdown_export_next = next; + return all_exported; +} + +// ========= messaging ============== + +void MDCache::dispatch(const cref_t<Message> &m) +{ + switch (m->get_type()) { + + // RESOLVE + case MSG_MDS_RESOLVE: + handle_resolve(ref_cast<MMDSResolve>(m)); + break; + case MSG_MDS_RESOLVEACK: + handle_resolve_ack(ref_cast<MMDSResolveAck>(m)); + break; + + // REJOIN + case MSG_MDS_CACHEREJOIN: + handle_cache_rejoin(ref_cast<MMDSCacheRejoin>(m)); + break; + + case MSG_MDS_DISCOVER: + handle_discover(ref_cast<MDiscover>(m)); + break; + case MSG_MDS_DISCOVERREPLY: + handle_discover_reply(ref_cast<MDiscoverReply>(m)); + break; + + case MSG_MDS_DIRUPDATE: + handle_dir_update(ref_cast<MDirUpdate>(m)); + break; + + case MSG_MDS_CACHEEXPIRE: + handle_cache_expire(ref_cast<MCacheExpire>(m)); + break; + + case MSG_MDS_DENTRYLINK: + handle_dentry_link(ref_cast<MDentryLink>(m)); + break; + case MSG_MDS_DENTRYUNLINK: + handle_dentry_unlink(ref_cast<MDentryUnlink>(m)); + break; + + case MSG_MDS_FRAGMENTNOTIFY: + handle_fragment_notify(ref_cast<MMDSFragmentNotify>(m)); + break; + case MSG_MDS_FRAGMENTNOTIFYACK: + handle_fragment_notify_ack(ref_cast<MMDSFragmentNotifyAck>(m)); + break; + + case MSG_MDS_FINDINO: + handle_find_ino(ref_cast<MMDSFindIno>(m)); + break; + case MSG_MDS_FINDINOREPLY: + handle_find_ino_reply(ref_cast<MMDSFindInoReply>(m)); + break; + + case 
MSG_MDS_OPENINO: + handle_open_ino(ref_cast<MMDSOpenIno>(m)); + break; + case MSG_MDS_OPENINOREPLY: + handle_open_ino_reply(ref_cast<MMDSOpenInoReply>(m)); + break; + + case MSG_MDS_SNAPUPDATE: + handle_snap_update(ref_cast<MMDSSnapUpdate>(m)); + break; + + default: + derr << "cache unknown message " << m->get_type() << dendl; + ceph_abort_msg("cache unknown message"); + } +} + +int MDCache::path_traverse(MDRequestRef& mdr, MDSContextFactory& cf, + const filepath& path, int flags, + vector<CDentry*> *pdnvec, CInode **pin) +{ + bool discover = (flags & MDS_TRAVERSE_DISCOVER); + bool forward = !discover; + bool path_locked = (flags & MDS_TRAVERSE_PATH_LOCKED); + bool want_dentry = (flags & MDS_TRAVERSE_WANT_DENTRY); + bool want_inode = (flags & MDS_TRAVERSE_WANT_INODE); + bool want_auth = (flags & MDS_TRAVERSE_WANT_AUTH); + bool rdlock_snap = (flags & (MDS_TRAVERSE_RDLOCK_SNAP | MDS_TRAVERSE_RDLOCK_SNAP2)); + bool rdlock_path = (flags & MDS_TRAVERSE_RDLOCK_PATH); + bool xlock_dentry = (flags & MDS_TRAVERSE_XLOCK_DENTRY); + bool rdlock_authlock = (flags & MDS_TRAVERSE_RDLOCK_AUTHLOCK); + + if (forward) + ceph_assert(mdr); // forward requires a request + + snapid_t snapid = CEPH_NOSNAP; + if (mdr) + mdr->snapid = snapid; + + client_t client = mdr ? 
mdr->get_client() : -1; + + if (mds->logger) mds->logger->inc(l_mds_traverse); + + dout(7) << "traverse: opening base ino " << path.get_ino() << " snap " << snapid << dendl; + CInode *cur = get_inode(path.get_ino()); + if (!cur) { + if (MDS_INO_IS_MDSDIR(path.get_ino())) { + open_foreign_mdsdir(path.get_ino(), cf.build()); + return 1; + } + if (MDS_INO_IS_STRAY(path.get_ino())) { + mds_rank_t rank = MDS_INO_STRAY_OWNER(path.get_ino()); + unsigned idx = MDS_INO_STRAY_INDEX(path.get_ino()); + filepath path(strays[idx]->get_parent_dn()->get_name(), + MDS_INO_MDSDIR(rank)); + MDRequestRef null_ref; + return path_traverse(null_ref, cf, path, MDS_TRAVERSE_DISCOVER, nullptr); + } + return -CEPHFS_ESTALE; + } + if (cur->state_test(CInode::STATE_PURGING)) + return -CEPHFS_ESTALE; + + if (flags & MDS_TRAVERSE_CHECK_LOCKCACHE) + mds->locker->find_and_attach_lock_cache(mdr, cur); + + if (mdr && mdr->lock_cache) { + if (flags & MDS_TRAVERSE_WANT_DIRLAYOUT) + mdr->dir_layout = mdr->lock_cache->get_dir_layout(); + } else if (rdlock_snap) { + int n = (flags & MDS_TRAVERSE_RDLOCK_SNAP2) ? 1 : 0; + if ((n == 0 && !(mdr->locking_state & MutationImpl::SNAP_LOCKED)) || + (n == 1 && !(mdr->locking_state & MutationImpl::SNAP2_LOCKED))) { + bool want_layout = (flags & MDS_TRAVERSE_WANT_DIRLAYOUT); + if (!mds->locker->try_rdlock_snap_layout(cur, mdr, n, want_layout)) + return 1; + } + } + + // start trace + if (pdnvec) + pdnvec->clear(); + if (pin) + *pin = cur; + + CInode *target_inode = nullptr; + MutationImpl::LockOpVec lov; + int r; + + for (unsigned depth = 0; depth < path.depth(); ) { + dout(12) << "traverse: path seg depth " << depth << " '" << path[depth] + << "' snapid " << snapid << dendl; + + if (!cur->is_dir()) { + dout(7) << "traverse: " << *cur << " not a dir " << dendl; + return -CEPHFS_ENOTDIR; + } + + // walk into snapdir? 
+ if (path[depth].length() == 0) { + dout(10) << "traverse: snapdir" << dendl; + if (!mdr || depth > 0) // snapdir must be the first component + return -CEPHFS_EINVAL; + snapid = CEPH_SNAPDIR; + mdr->snapid = snapid; + depth++; + continue; + } + // walk thru snapdir? + if (snapid == CEPH_SNAPDIR) { + if (!mdr) + return -CEPHFS_EINVAL; + SnapRealm *realm = cur->find_snaprealm(); + snapid = realm->resolve_snapname(path[depth], cur->ino()); + dout(10) << "traverse: snap " << path[depth] << " -> " << snapid << dendl; + if (!snapid) { + if (pdnvec) + pdnvec->clear(); // do not confuse likes of rdlock_path_pin_ref(); + return -CEPHFS_ENOENT; + } + if (depth == path.depth() - 1) + target_inode = cur; + mdr->snapid = snapid; + depth++; + continue; + } + + // open dir + frag_t fg = cur->pick_dirfrag(path[depth]); + CDir *curdir = cur->get_dirfrag(fg); + if (!curdir) { + if (cur->is_auth()) { + // parent dir frozen_dir? + if (cur->is_frozen()) { + dout(7) << "traverse: " << *cur << " is frozen, waiting" << dendl; + cur->add_waiter(CDir::WAIT_UNFREEZE, cf.build()); + return 1; + } + curdir = cur->get_or_open_dirfrag(this, fg); + } else { + // discover? + dout(10) << "traverse: need dirfrag " << fg << ", doing discover from " << *cur << dendl; + discover_path(cur, snapid, path.postfixpath(depth), cf.build(), + path_locked); + if (mds->logger) mds->logger->inc(l_mds_traverse_discover); + return 1; + } + } + ceph_assert(curdir); + +#ifdef MDS_VERIFY_FRAGSTAT + if (curdir->is_complete()) + curdir->verify_fragstat(); +#endif + + // frozen? + /* + if (curdir->is_frozen()) { + // doh! + // FIXME: traverse is allowed? + dout(7) << "traverse: " << *curdir << " is frozen, waiting" << dendl; + curdir->add_waiter(CDir::WAIT_UNFREEZE, _get_waiter(mdr, req, fin)); + if (onfinish) delete onfinish; + return 1; + } + */ + + // Defer the auth check until the target inode is determined not to exist + // if want_inode is true. 
+ if (want_auth && want_dentry && !want_inode && depth == path.depth() - 1 && + (r = maybe_request_forward_to_auth(mdr, cf, curdir)) != 0) + return r; + + // Before doing dirfrag->dn lookup, compare with DamageTable's + // record of which dentries were unreadable + if (mds->damage_table.is_dentry_damaged(curdir, path[depth], snapid)) { + dout(4) << "traverse: stopped lookup at damaged dentry " + << *curdir << "/" << path[depth] << " snap=" << snapid << dendl; + return -CEPHFS_EIO; + } + + // dentry + CDentry *dn = curdir->lookup(path[depth], snapid); + if (dn) { + if (dn->state_test(CDentry::STATE_PURGING)) + return -CEPHFS_ENOENT; + + CDentry::linkage_t *dnl = dn->get_projected_linkage(); + // If an auth check was deferred before and the target inode is found + // not to exist now, do the auth check here if necessary. + if (want_auth && want_dentry && want_inode && depth == path.depth() - 1 && + dnl->is_null() && (r = maybe_request_forward_to_auth(mdr, cf, dn)) != 0) + return r; + + if (rdlock_path) { + lov.clear(); + // do not xlock the tail dentry if target inode exists and caller wants it + if (xlock_dentry && (dnl->is_null() || !want_inode) && + depth == path.depth() - 1) { + ceph_assert(dn->is_auth()); + if (depth > 0 || !mdr->lock_cache) { + lov.add_wrlock(&cur->filelock); + lov.add_wrlock(&cur->nestlock); + if (rdlock_authlock) + lov.add_rdlock(&cur->authlock); + } + lov.add_xlock(&dn->lock); + } else { + // force client to flush async dir operation if necessary + if (cur->filelock.is_cached()) + lov.add_wrlock(&cur->filelock); + lov.add_rdlock(&dn->lock); + } + if (!mds->locker->acquire_locks(mdr, lov)) { + dout(10) << "traverse: failed to rdlock " << dn->lock << " " << *dn << dendl; + return 1; + } + } else if (!path_locked && + !dn->lock.can_read(client) && + !(dn->lock.is_xlocked() && dn->lock.get_xlock_by() == mdr)) { + dout(10) << "traverse: non-readable dentry at " << *dn << dendl; + dn->lock.add_waiter(SimpleLock::WAIT_RD, cf.build()); + if 
(mds->logger) + mds->logger->inc(l_mds_traverse_lock); + if (dn->is_auth() && dn->lock.is_unstable_and_locked()) + mds->mdlog->flush(); + return 1; + } + + if (pdnvec) + pdnvec->push_back(dn); + + // can we conclude CEPHFS_ENOENT? + if (dnl->is_null()) { + dout(10) << "traverse: null+readable dentry at " << *dn << dendl; + if (depth == path.depth() - 1) { + if (want_dentry) + break; + } else { + if (pdnvec) + pdnvec->clear(); // do not confuse likes of rdlock_path_pin_ref(); + } + return -CEPHFS_ENOENT; + } + + // do we have inode? + CInode *in = dnl->get_inode(); + if (!in) { + ceph_assert(dnl->is_remote()); + // do i have it? + in = get_inode(dnl->get_remote_ino()); + if (in) { + dout(7) << "linking in remote in " << *in << dendl; + dn->link_remote(dnl, in); + } else { + dout(7) << "remote link to " << dnl->get_remote_ino() << ", which i don't have" << dendl; + ceph_assert(mdr); // we shouldn't hit non-primary dentries doing a non-mdr traversal! + if (mds->damage_table.is_remote_damaged(dnl->get_remote_ino())) { + dout(4) << "traverse: remote dentry points to damaged ino " + << *dn << dendl; + return -CEPHFS_EIO; + } + open_remote_dentry(dn, true, cf.build(), + (path_locked && depth == path.depth() - 1)); + if (mds->logger) mds->logger->inc(l_mds_traverse_remote_ino); + return 1; + } + } + + cur = in; + + if (rdlock_snap && !(want_dentry && !want_inode && depth == path.depth() - 1)) { + lov.clear(); + lov.add_rdlock(&cur->snaplock); + if (!mds->locker->acquire_locks(mdr, lov)) { + dout(10) << "traverse: failed to rdlock " << cur->snaplock << " " << *cur << dendl; + return 1; + } + } + + if (depth == path.depth() - 1) + target_inode = cur; + + // add to trace, continue. + touch_inode(cur); + if (pin) + *pin = cur; + depth++; + continue; + } + + ceph_assert(!dn); + + // MISS. dentry doesn't exist. + dout(12) << "traverse: miss on dentry " << path[depth] << " in " << *curdir << dendl; + + if (curdir->is_auth()) { + // dentry is mine. 
+ if (curdir->is_complete() || + (snapid == CEPH_NOSNAP && + curdir->has_bloom() && + !curdir->is_in_bloom(path[depth]))) { + // file not found + if (pdnvec) { + // instantiate a null dn? + if (depth < path.depth() - 1) { + dout(20) << " didn't traverse full path; not returning pdnvec" << dendl; + } else if (snapid < CEPH_MAXSNAP) { + dout(20) << " not adding null for snapid " << snapid << dendl; + } else if (curdir->is_frozen()) { + dout(7) << "traverse: " << *curdir << " is frozen, waiting" << dendl; + curdir->add_waiter(CDir::WAIT_UNFREEZE, cf.build()); + return 1; + } else { + // create a null dentry + dn = curdir->add_null_dentry(path[depth]); + dout(20) << " added null " << *dn << dendl; + + if (rdlock_path) { + lov.clear(); + if (xlock_dentry) { + if (depth > 0 || !mdr->lock_cache) { + lov.add_wrlock(&cur->filelock); + lov.add_wrlock(&cur->nestlock); + if (rdlock_authlock) + lov.add_rdlock(&cur->authlock); + } + lov.add_xlock(&dn->lock); + } else { + // force client to flush async dir operation if necessary + if (cur->filelock.is_cached()) + lov.add_wrlock(&cur->filelock); + lov.add_rdlock(&dn->lock); + } + if (!mds->locker->acquire_locks(mdr, lov)) { + dout(10) << "traverse: failed to rdlock " << dn->lock << " " << *dn << dendl; + return 1; + } + } + } + if (dn) { + pdnvec->push_back(dn); + if (want_dentry) + break; + } else { + pdnvec->clear(); // do not confuse likes of rdlock_path_pin_ref(); + } + } + return -CEPHFS_ENOENT; + } else { + + // Check DamageTable for missing fragments before trying to fetch + // this + if (mds->damage_table.is_dirfrag_damaged(curdir)) { + dout(4) << "traverse: damaged dirfrag " << *curdir + << ", blocking fetch" << dendl; + return -CEPHFS_EIO; + } + + // directory isn't complete; reload + dout(7) << "traverse: incomplete dir contents for " << *cur << ", fetching" << dendl; + touch_inode(cur); + curdir->fetch(path[depth], snapid, cf.build()); + if (mds->logger) mds->logger->inc(l_mds_traverse_dir_fetch); + return 1; + } + } 
else { + // dirfrag/dentry is not mine. + + if (forward && + mdr && mdr->client_request && + (int)depth < mdr->client_request->get_num_fwd()){ + dout(7) << "traverse: snap " << snapid << " and depth " << depth + << " < fwd " << mdr->client_request->get_num_fwd() + << ", discovering instead of forwarding" << dendl; + discover = true; + } + + if ((discover)) { + dout(7) << "traverse: discover from " << path[depth] << " from " << *curdir << dendl; + discover_path(curdir, snapid, path.postfixpath(depth), cf.build(), + path_locked); + if (mds->logger) mds->logger->inc(l_mds_traverse_discover); + return 1; + } + if (forward) { + // forward + dout(7) << "traverse: not auth for " << path << " in " << *curdir << dendl; + + r = maybe_request_forward_to_auth(mdr, cf, curdir); + ceph_assert(r != 0); + + if (r == 2 && mds->logger) + mds->logger->inc(l_mds_traverse_forward); + + return r; + } + } + + ceph_abort(); // i shouldn't get here + } + + if (path.depth() == 0) { + dout(7) << "no tail dentry, base " << *cur << dendl; + if (want_dentry && !want_inode) { + return -CEPHFS_ENOENT; + } + target_inode = cur; + } + + if (target_inode) { + dout(7) << "found target " << *target_inode << dendl; + if (want_auth && !(want_dentry && !want_inode) && + (r = maybe_request_forward_to_auth(mdr, cf, target_inode)) != 0) + return r; + } + + // success. 
+ if (mds->logger) mds->logger->inc(l_mds_traverse_hit); + dout(10) << "path_traverse finish on snapid " << snapid << dendl; + if (mdr) + ceph_assert(mdr->snapid == snapid); + + if (flags & MDS_TRAVERSE_RDLOCK_SNAP) + mdr->locking_state |= MutationImpl::SNAP_LOCKED; + else if (flags & MDS_TRAVERSE_RDLOCK_SNAP2) + mdr->locking_state |= MutationImpl::SNAP2_LOCKED; + + if (rdlock_path) + mdr->locking_state |= MutationImpl::PATH_LOCKED; + + return 0; +} + +int MDCache::maybe_request_forward_to_auth(MDRequestRef& mdr, MDSContextFactory& cf, + MDSCacheObject *p) +{ + if (p->is_ambiguous_auth()) { + dout(7) << "waiting for single auth on " << *p << dendl; + p->add_waiter(CInode::WAIT_SINGLEAUTH, cf.build()); + return 1; + } + if (!p->is_auth()) { + dout(7) << "fw to auth for " << *p << dendl; + request_forward(mdr, p->authority().first); + return 2; + } + return 0; +} + +CInode *MDCache::cache_traverse(const filepath& fp) +{ + dout(10) << "cache_traverse " << fp << dendl; + + CInode *in; + unsigned depth = 0; + char mdsdir_name[16]; + sprintf(mdsdir_name, "~mds%d", mds->get_nodeid()); + + if (fp.get_ino()) { + in = get_inode(fp.get_ino()); + } else if (fp.depth() > 0 && (fp[0] == "~mdsdir" || fp[0] == mdsdir_name)) { + in = myin; + depth = 1; + } else { + in = root; + } + if (!in) + return NULL; + + for (; depth < fp.depth(); depth++) { + std::string_view dname = fp[depth]; + frag_t fg = in->pick_dirfrag(dname); + dout(20) << " " << depth << " " << dname << " frag " << fg << " from " << *in << dendl; + CDir *curdir = in->get_dirfrag(fg); + if (!curdir) + return NULL; + CDentry *dn = curdir->lookup(dname, CEPH_NOSNAP); + if (!dn) + return NULL; + in = dn->get_linkage()->get_inode(); + if (!in) + return NULL; + } + dout(10) << " got " << *in << dendl; + return in; +} + + +/** + * open_remote_dir -- open up a remote dirfrag + * + * @param diri base inode + * @param approxfg approximate fragment. 
+ * @param fin completion callback + */ +void MDCache::open_remote_dirfrag(CInode *diri, frag_t approxfg, MDSContext *fin) +{ + dout(10) << "open_remote_dir on " << *diri << dendl; + ceph_assert(diri->is_dir()); + ceph_assert(!diri->is_auth()); + ceph_assert(diri->get_dirfrag(approxfg) == 0); + + discover_dir_frag(diri, approxfg, fin); +} + + +/** + * get_dentry_inode - get or open inode + * + * @param dn the dentry + * @param mdr current request + * + * will return inode for primary, or link up/open up remote link's inode as necessary. + * If it's not available right now, puts mdr on wait list and returns null. + */ +CInode *MDCache::get_dentry_inode(CDentry *dn, MDRequestRef& mdr, bool projected) +{ + CDentry::linkage_t *dnl; + if (projected) + dnl = dn->get_projected_linkage(); + else + dnl = dn->get_linkage(); + + ceph_assert(!dnl->is_null()); + + if (dnl->is_primary()) + return dnl->inode; + + ceph_assert(dnl->is_remote()); + CInode *in = get_inode(dnl->get_remote_ino()); + if (in) { + dout(7) << "get_dentry_inode linking in remote in " << *in << dendl; + dn->link_remote(dnl, in); + return in; + } else { + dout(10) << "get_dentry_inode on remote dn, opening inode for " << *dn << dendl; + open_remote_dentry(dn, projected, new C_MDS_RetryRequest(this, mdr)); + return 0; + } +} + +struct C_MDC_OpenRemoteDentry : public MDCacheContext { + CDentry *dn; + inodeno_t ino; + MDSContext *onfinish; + bool want_xlocked; + C_MDC_OpenRemoteDentry(MDCache *m, CDentry *d, inodeno_t i, MDSContext *f, bool wx) : + MDCacheContext(m), dn(d), ino(i), onfinish(f), want_xlocked(wx) { + dn->get(MDSCacheObject::PIN_PTRWAITER); + } + void finish(int r) override { + mdcache->_open_remote_dentry_finish(dn, ino, onfinish, want_xlocked, r); + dn->put(MDSCacheObject::PIN_PTRWAITER); + } +}; + +void MDCache::open_remote_dentry(CDentry *dn, bool projected, MDSContext *fin, bool want_xlocked) +{ + dout(10) << "open_remote_dentry " << *dn << dendl; + CDentry::linkage_t *dnl = projected ? 
dn->get_projected_linkage() : dn->get_linkage(); + inodeno_t ino = dnl->get_remote_ino(); + int64_t pool = dnl->get_remote_d_type() == DT_DIR ? mds->get_metadata_pool() : -1; + open_ino(ino, pool, + new C_MDC_OpenRemoteDentry(this, dn, ino, fin, want_xlocked), true, want_xlocked); // backtrace +} + +void MDCache::_open_remote_dentry_finish(CDentry *dn, inodeno_t ino, MDSContext *fin, + bool want_xlocked, int r) +{ + if (r < 0) { + CDentry::linkage_t *dnl = dn->get_projected_linkage(); + if (dnl->is_remote() && dnl->get_remote_ino() == ino) { + dout(0) << "open_remote_dentry_finish bad remote dentry " << *dn << dendl; + dn->state_set(CDentry::STATE_BADREMOTEINO); + + std::string path; + CDir *dir = dn->get_dir(); + if (dir) { + dir->get_inode()->make_path_string(path); + path += "/"; + path += dn->get_name(); + } + + bool fatal = mds->damage_table.notify_remote_damaged(ino, path); + if (fatal) { + mds->damaged(); + ceph_abort(); // unreachable, damaged() respawns us + } + } else { + r = 0; + } + } + fin->complete(r < 0 ? 
r : 0); +} + + +void MDCache::make_trace(vector<CDentry*>& trace, CInode *in) +{ + // empty trace if we're a base inode + if (in->is_base()) + return; + + CInode *parent = in->get_parent_inode(); + ceph_assert(parent); + make_trace(trace, parent); + + CDentry *dn = in->get_parent_dn(); + dout(15) << "make_trace adding " << *dn << dendl; + trace.push_back(dn); +} + + +// ------------------------------------------------------------------------------- +// Open inode by inode number + +class C_IO_MDC_OpenInoBacktraceFetched : public MDCacheIOContext { + inodeno_t ino; + public: + bufferlist bl; + C_IO_MDC_OpenInoBacktraceFetched(MDCache *c, inodeno_t i) : + MDCacheIOContext(c), ino(i) {} + void finish(int r) override { + mdcache->_open_ino_backtrace_fetched(ino, bl, r); + } + void print(ostream& out) const override { + out << "openino_backtrace_fetch" << ino << ")"; + } +}; + +struct C_MDC_OpenInoTraverseDir : public MDCacheContext { + inodeno_t ino; + cref_t<MMDSOpenIno> msg; + bool parent; + public: + C_MDC_OpenInoTraverseDir(MDCache *c, inodeno_t i, const cref_t<MMDSOpenIno> &m, bool p) : + MDCacheContext(c), ino(i), msg(m), parent(p) {} + void finish(int r) override { + if (r < 0 && !parent) + r = -CEPHFS_EAGAIN; + if (msg) { + mdcache->handle_open_ino(msg, r); + return; + } + auto& info = mdcache->opening_inodes.at(ino); + mdcache->_open_ino_traverse_dir(ino, info, r); + } +}; + +struct C_MDC_OpenInoParentOpened : public MDCacheContext { + inodeno_t ino; + public: + C_MDC_OpenInoParentOpened(MDCache *c, inodeno_t i) : MDCacheContext(c), ino(i) {} + void finish(int r) override { + mdcache->_open_ino_parent_opened(ino, r); + } +}; + +void MDCache::_open_ino_backtrace_fetched(inodeno_t ino, bufferlist& bl, int err) +{ + dout(10) << "_open_ino_backtrace_fetched ino " << ino << " errno " << err << dendl; + + open_ino_info_t& info = opening_inodes.at(ino); + + CInode *in = get_inode(ino); + if (in) { + dout(10) << " found cached " << *in << dendl; + open_ino_finish(ino, 
info, in->authority().first); + return; + } + + inode_backtrace_t backtrace; + if (err == 0) { + try { + decode(backtrace, bl); + } catch (const buffer::error &decode_exc) { + derr << "corrupt backtrace on ino x0" << std::hex << ino + << std::dec << ": " << decode_exc.what() << dendl; + open_ino_finish(ino, info, -CEPHFS_EIO); + return; + } + if (backtrace.pool != info.pool && backtrace.pool != -1) { + dout(10) << " old object in pool " << info.pool + << ", retrying pool " << backtrace.pool << dendl; + info.pool = backtrace.pool; + C_IO_MDC_OpenInoBacktraceFetched *fin = + new C_IO_MDC_OpenInoBacktraceFetched(this, ino); + fetch_backtrace(ino, info.pool, fin->bl, + new C_OnFinisher(fin, mds->finisher)); + return; + } + } else if (err == -CEPHFS_ENOENT) { + int64_t meta_pool = mds->get_metadata_pool(); + if (info.pool != meta_pool) { + dout(10) << " no object in pool " << info.pool + << ", retrying pool " << meta_pool << dendl; + info.pool = meta_pool; + C_IO_MDC_OpenInoBacktraceFetched *fin = + new C_IO_MDC_OpenInoBacktraceFetched(this, ino); + fetch_backtrace(ino, info.pool, fin->bl, + new C_OnFinisher(fin, mds->finisher)); + return; + } + err = 0; // backtrace.ancestors.empty() is checked below + } + + if (err == 0) { + if (backtrace.ancestors.empty()) { + dout(10) << " got empty backtrace " << dendl; + err = -CEPHFS_ESTALE; + } else if (!info.ancestors.empty()) { + if (info.ancestors[0] == backtrace.ancestors[0]) { + dout(10) << " got same parents " << info.ancestors[0] << " 2 times" << dendl; + err = -CEPHFS_EINVAL; + } else { + info.last_err = 0; + } + } + } + if (err) { + dout(0) << " failed to open ino " << ino << " err " << err << "/" << info.last_err << dendl; + if (info.last_err) + err = info.last_err; + open_ino_finish(ino, info, err); + return; + } + + dout(10) << " got backtrace " << backtrace << dendl; + info.ancestors = backtrace.ancestors; + + _open_ino_traverse_dir(ino, info, 0); +} + +void MDCache::_open_ino_parent_opened(inodeno_t ino, int ret) 
+{ + dout(10) << "_open_ino_parent_opened ino " << ino << " ret " << ret << dendl; + + open_ino_info_t& info = opening_inodes.at(ino); + + CInode *in = get_inode(ino); + if (in) { + dout(10) << " found cached " << *in << dendl; + open_ino_finish(ino, info, in->authority().first); + return; + } + + if (ret == mds->get_nodeid()) { + _open_ino_traverse_dir(ino, info, 0); + } else { + if (ret >= 0) { + mds_rank_t checked_rank = mds_rank_t(ret); + info.check_peers = true; + info.auth_hint = checked_rank; + info.checked.erase(checked_rank); + } + do_open_ino(ino, info, ret); + } +} + +void MDCache::_open_ino_traverse_dir(inodeno_t ino, open_ino_info_t& info, int ret) +{ + dout(10) << __func__ << ": ino " << ino << " ret " << ret << dendl; + + CInode *in = get_inode(ino); + if (in) { + dout(10) << " found cached " << *in << dendl; + open_ino_finish(ino, info, in->authority().first); + return; + } + + if (ret) { + do_open_ino(ino, info, ret); + return; + } + + mds_rank_t hint = info.auth_hint; + ret = open_ino_traverse_dir(ino, NULL, info.ancestors, + info.discover, info.want_xlocked, &hint); + if (ret > 0) + return; + if (hint != mds->get_nodeid()) + info.auth_hint = hint; + do_open_ino(ino, info, ret); +} + +void MDCache::_open_ino_fetch_dir(inodeno_t ino, const cref_t<MMDSOpenIno> &m, bool parent, + CDir *dir, std::string_view dname) +{ + if (dir->state_test(CDir::STATE_REJOINUNDEF)) + ceph_assert(dir->get_inode()->dirfragtree.is_leaf(dir->get_frag())); + + auto fin = new C_MDC_OpenInoTraverseDir(this, ino, m, parent); + if (open_ino_batch && !dname.empty()) { + auto& p = open_ino_batched_fetch[dir]; + p.first.emplace_back(dname); + p.second.emplace_back(fin); + return; + } + + dir->fetch(dname, CEPH_NOSNAP, fin); + if (mds->logger) + mds->logger->inc(l_mds_openino_dir_fetch); +} + +int MDCache::open_ino_traverse_dir(inodeno_t ino, const cref_t<MMDSOpenIno> &m, + const vector<inode_backpointer_t>& ancestors, + bool discover, bool want_xlocked, mds_rank_t *hint) +{ + 
dout(10) << "open_ino_traverse_dir ino " << ino << " " << ancestors << dendl; + int err = 0; + for (unsigned i = 0; i < ancestors.size(); i++) { + const auto& ancestor = ancestors.at(i); + CInode *diri = get_inode(ancestor.dirino); + + if (!diri) { + if (discover && MDS_INO_IS_MDSDIR(ancestor.dirino)) { + open_foreign_mdsdir(ancestor.dirino, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0)); + return 1; + } + continue; + } + + if (diri->state_test(CInode::STATE_REJOINUNDEF)) { + CDentry *dn = diri->get_parent_dn(); + CDir *dir = dn->get_dir(); + while (dir->state_test(CDir::STATE_REJOINUNDEF) && + dir->get_inode()->state_test(CInode::STATE_REJOINUNDEF)) { + dn = dir->get_inode()->get_parent_dn(); + dir = dn->get_dir(); + } + _open_ino_fetch_dir(ino, m, i == 0, dir, dn->name); + return 1; + } + + if (!diri->is_dir()) { + dout(10) << " " << *diri << " is not dir" << dendl; + if (i == 0) + err = -CEPHFS_ENOTDIR; + break; + } + + const string& name = ancestor.dname; + frag_t fg = diri->pick_dirfrag(name); + CDir *dir = diri->get_dirfrag(fg); + if (!dir) { + if (diri->is_auth()) { + if (diri->is_frozen()) { + dout(10) << " " << *diri << " is frozen, waiting " << dendl; + diri->add_waiter(CDir::WAIT_UNFREEZE, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0)); + return 1; + } + dir = diri->get_or_open_dirfrag(this, fg); + } else if (discover) { + open_remote_dirfrag(diri, fg, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0)); + return 1; + } + } + if (dir) { + inodeno_t next_ino = i > 0 ? ancestors.at(i-1).dirino : ino; + CDentry *dn = dir->lookup(name); + CDentry::linkage_t *dnl = dn ? 
dn->get_linkage() : NULL; + if (dir->is_auth()) { + if (dnl && dnl->is_primary() && + dnl->get_inode()->state_test(CInode::STATE_REJOINUNDEF)) { + dout(10) << " fetching undef " << *dnl->get_inode() << dendl; + _open_ino_fetch_dir(ino, m, i == 0, dir, name); + return 1; + } + + if (!dnl && !dir->is_complete() && + (!dir->has_bloom() || dir->is_in_bloom(name))) { + dout(10) << " fetching incomplete " << *dir << dendl; + _open_ino_fetch_dir(ino, m, i == 0, dir, name); + return 1; + } + + dout(10) << " no ino " << next_ino << " in " << *dir << dendl; + if (i == 0) + err = -CEPHFS_ENOENT; + } else if (discover) { + if (!dnl) { + filepath path(name, 0); + discover_path(dir, CEPH_NOSNAP, path, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0), + (i == 0 && want_xlocked)); + return 1; + } + if (dnl->is_null() && !dn->lock.can_read(-1)) { + dout(10) << " null " << *dn << " is not readable, waiting" << dendl; + dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0)); + return 1; + } + dout(10) << " no ino " << next_ino << " in " << *dir << dendl; + if (i == 0) + err = -CEPHFS_ENOENT; + } + } + if (hint && i == 0) + *hint = dir ? 
dir->authority().first : diri->authority().first; + break; + } + return err; +} + +void MDCache::open_ino_finish(inodeno_t ino, open_ino_info_t& info, int ret) +{ + dout(10) << "open_ino_finish ino " << ino << " ret " << ret << dendl; + + MDSContext::vec waiters; + waiters.swap(info.waiters); + opening_inodes.erase(ino); + finish_contexts(g_ceph_context, waiters, ret); +} + +void MDCache::do_open_ino(inodeno_t ino, open_ino_info_t& info, int err) +{ + if (err < 0 && err != -CEPHFS_EAGAIN) { + info.checked.clear(); + info.checking = MDS_RANK_NONE; + info.check_peers = true; + info.fetch_backtrace = true; + if (info.discover) { + info.discover = false; + info.ancestors.clear(); + } + if (err != -CEPHFS_ENOENT && err != -CEPHFS_ENOTDIR) + info.last_err = err; + } + + if (info.check_peers || info.discover) { + if (info.discover) { + // got backtrace from peer, but failed to find inode. re-check peers + info.discover = false; + info.ancestors.clear(); + info.checked.clear(); + } + info.check_peers = false; + info.checking = MDS_RANK_NONE; + do_open_ino_peer(ino, info); + } else if (info.fetch_backtrace) { + info.check_peers = true; + info.fetch_backtrace = false; + info.checking = mds->get_nodeid(); + info.checked.clear(); + C_IO_MDC_OpenInoBacktraceFetched *fin = + new C_IO_MDC_OpenInoBacktraceFetched(this, ino); + fetch_backtrace(ino, info.pool, fin->bl, + new C_OnFinisher(fin, mds->finisher)); + } else { + ceph_assert(!info.ancestors.empty()); + info.checking = mds->get_nodeid(); + open_ino(info.ancestors[0].dirino, mds->get_metadata_pool(), + new C_MDC_OpenInoParentOpened(this, ino), info.want_replica); + } +} + +void MDCache::do_open_ino_peer(inodeno_t ino, open_ino_info_t& info) +{ + set<mds_rank_t> all, active; + mds->mdsmap->get_mds_set(all); + if (mds->get_state() == MDSMap::STATE_REJOIN) + mds->mdsmap->get_mds_set_lower_bound(active, MDSMap::STATE_REJOIN); + else + mds->mdsmap->get_mds_set_lower_bound(active, MDSMap::STATE_CLIENTREPLAY); + + dout(10) << 
"do_open_ino_peer " << ino << " active " << active + << " all " << all << " checked " << info.checked << dendl; + + mds_rank_t whoami = mds->get_nodeid(); + mds_rank_t peer = MDS_RANK_NONE; + if (info.auth_hint >= 0 && info.auth_hint != whoami) { + if (active.count(info.auth_hint)) { + peer = info.auth_hint; + info.auth_hint = MDS_RANK_NONE; + } + } else { + for (set<mds_rank_t>::iterator p = active.begin(); p != active.end(); ++p) + if (*p != whoami && info.checked.count(*p) == 0) { + peer = *p; + break; + } + } + if (peer < 0) { + all.erase(whoami); + if (all != info.checked) { + dout(10) << " waiting for more peers to be active" << dendl; + } else { + dout(10) << " all MDS peers have been checked " << dendl; + do_open_ino(ino, info, 0); + } + } else { + info.checking = peer; + vector<inode_backpointer_t> *pa = NULL; + // got backtrace from peer or backtrace just fetched + if (info.discover || !info.fetch_backtrace) + pa = &info.ancestors; + mds->send_message_mds(make_message<MMDSOpenIno>(info.tid, ino, pa), peer); + if (mds->logger) + mds->logger->inc(l_mds_openino_peer_discover); + } +} + +void MDCache::handle_open_ino(const cref_t<MMDSOpenIno> &m, int err) +{ + if (mds->get_state() < MDSMap::STATE_REJOIN && + mds->get_want_state() != CEPH_MDS_STATE_REJOIN) { + return; + } + + dout(10) << "handle_open_ino " << *m << " err " << err << dendl; + + auto from = mds_rank_t(m->get_source().num()); + inodeno_t ino = m->ino; + ref_t<MMDSOpenInoReply> reply; + CInode *in = get_inode(ino); + if (in) { + dout(10) << " have " << *in << dendl; + reply = make_message<MMDSOpenInoReply>(m->get_tid(), ino, mds_rank_t(0)); + if (in->is_auth()) { + touch_inode(in); + while (1) { + CDentry *pdn = in->get_parent_dn(); + if (!pdn) + break; + CInode *diri = pdn->get_dir()->get_inode(); + reply->ancestors.push_back(inode_backpointer_t(diri->ino(), pdn->get_name(), + in->get_version())); + in = diri; + } + } else { + reply->hint = in->authority().first; + } + } else if (err < 0) { + 
reply = make_message<MMDSOpenInoReply>(m->get_tid(), ino, MDS_RANK_NONE, err); + } else { + mds_rank_t hint = MDS_RANK_NONE; + int ret = open_ino_traverse_dir(ino, m, m->ancestors, false, false, &hint); + if (ret > 0) + return; + reply = make_message<MMDSOpenInoReply>(m->get_tid(), ino, hint, ret); + } + mds->send_message_mds(reply, from); +} + +void MDCache::handle_open_ino_reply(const cref_t<MMDSOpenInoReply> &m) +{ + dout(10) << "handle_open_ino_reply " << *m << dendl; + + inodeno_t ino = m->ino; + mds_rank_t from = mds_rank_t(m->get_source().num()); + auto it = opening_inodes.find(ino); + if (it != opening_inodes.end() && it->second.checking == from) { + open_ino_info_t& info = it->second; + info.checking = MDS_RANK_NONE; + info.checked.insert(from); + + CInode *in = get_inode(ino); + if (in) { + dout(10) << " found cached " << *in << dendl; + open_ino_finish(ino, info, in->authority().first); + } else if (!m->ancestors.empty()) { + dout(10) << " found ino " << ino << " on mds." << from << dendl; + if (!info.want_replica) { + open_ino_finish(ino, info, from); + return; + } + + info.ancestors = m->ancestors; + info.auth_hint = from; + info.checking = mds->get_nodeid(); + info.discover = true; + _open_ino_traverse_dir(ino, info, 0); + } else if (m->error) { + dout(10) << " error " << m->error << " from mds." << from << dendl; + do_open_ino(ino, info, m->error); + } else { + if (m->hint >= 0 && m->hint != mds->get_nodeid()) { + info.auth_hint = m->hint; + info.checked.erase(m->hint); + } + do_open_ino_peer(ino, info); + } + } +} + +void MDCache::kick_open_ino_peers(mds_rank_t who) +{ + dout(10) << "kick_open_ino_peers mds." << who << dendl; + + for (map<inodeno_t, open_ino_info_t>::iterator p = opening_inodes.begin(); + p != opening_inodes.end(); + ++p) { + open_ino_info_t& info = p->second; + if (info.checking == who) { + dout(10) << " kicking ino " << p->first << " who was checking mds." 
<< who << dendl; + info.checking = MDS_RANK_NONE; + do_open_ino_peer(p->first, info); + } else if (info.checking == MDS_RANK_NONE) { + dout(10) << " kicking ino " << p->first << " who was waiting" << dendl; + do_open_ino_peer(p->first, info); + } + } +} + +void MDCache::open_ino_batch_start() +{ + dout(10) << __func__ << dendl; + open_ino_batch = true; +} + +void MDCache::open_ino_batch_submit() +{ + dout(10) << __func__ << dendl; + open_ino_batch = false; + + for (auto& [dir, p] : open_ino_batched_fetch) { + CInode *in = dir->inode; + std::vector<dentry_key_t> keys; + for (auto& dname : p.first) + keys.emplace_back(CEPH_NOSNAP, dname, in->hash_dentry_name(dname)); + dir->fetch_keys(keys, + new MDSInternalContextWrapper(mds, + new LambdaContext([this, waiters = std::move(p.second)](int r) mutable { + mds->queue_waiters_front(waiters); + }) + ) + ); + if (mds->logger) + mds->logger->inc(l_mds_openino_dir_fetch); + } + open_ino_batched_fetch.clear(); +} + +void MDCache::open_ino(inodeno_t ino, int64_t pool, MDSContext* fin, + bool want_replica, bool want_xlocked, + vector<inode_backpointer_t> *ancestors_hint, + mds_rank_t auth_hint) +{ + dout(10) << "open_ino " << ino << " pool " << pool << " want_replica " + << want_replica << dendl; + + auto it = opening_inodes.find(ino); + if (it != opening_inodes.end()) { + open_ino_info_t& info = it->second; + if (want_replica) { + info.want_replica = true; + if (want_xlocked && !info.want_xlocked) { + if (!info.ancestors.empty()) { + CInode *diri = get_inode(info.ancestors[0].dirino); + if (diri) { + frag_t fg = diri->pick_dirfrag(info.ancestors[0].dname); + CDir *dir = diri->get_dirfrag(fg); + if (dir && !dir->is_auth()) { + filepath path(info.ancestors[0].dname, 0); + discover_path(dir, CEPH_NOSNAP, path, NULL, true); + } + } + } + info.want_xlocked = true; + } + } + info.waiters.push_back(fin); + } else { + open_ino_info_t& info = opening_inodes[ino]; + info.want_replica = want_replica; + info.want_xlocked = want_xlocked; + 
info.tid = ++open_ino_last_tid; + info.pool = pool >= 0 ? pool : default_file_layout.pool_id; + info.waiters.push_back(fin); + if (auth_hint != MDS_RANK_NONE) + info.auth_hint = auth_hint; + if (ancestors_hint) { + info.ancestors = std::move(*ancestors_hint); + info.fetch_backtrace = false; + info.checking = mds->get_nodeid(); + _open_ino_traverse_dir(ino, info, 0); + } else { + do_open_ino(ino, info, 0); + } + } +} + +/* ---------------------------- */ + +/* + * search for a given inode on MDS peers. optionally start with the given node. + + + TODO + - recover from mds node failure, recovery + - traverse path + + */ +void MDCache::find_ino_peers(inodeno_t ino, MDSContext *c, + mds_rank_t hint, bool path_locked) +{ + dout(5) << "find_ino_peers " << ino << " hint " << hint << dendl; + CInode *in = get_inode(ino); + if (in && in->state_test(CInode::STATE_PURGING)) { + c->complete(-CEPHFS_ESTALE); + return; + } + ceph_assert(!in); + + ceph_tid_t tid = ++find_ino_peer_last_tid; + find_ino_peer_info_t& fip = find_ino_peer[tid]; + fip.ino = ino; + fip.tid = tid; + fip.fin = c; + fip.path_locked = path_locked; + fip.hint = hint; + _do_find_ino_peer(fip); +} + +void MDCache::_do_find_ino_peer(find_ino_peer_info_t& fip) +{ + set<mds_rank_t> all, active; + mds->mdsmap->get_mds_set(all); + mds->mdsmap->get_mds_set_lower_bound(active, MDSMap::STATE_CLIENTREPLAY); + + dout(10) << "_do_find_ino_peer " << fip.tid << " " << fip.ino + << " active " << active << " all " << all + << " checked " << fip.checked + << dendl; + + mds_rank_t m = MDS_RANK_NONE; + if (fip.hint >= 0) { + m = fip.hint; + fip.hint = MDS_RANK_NONE; + } else { + for (set<mds_rank_t>::iterator p = active.begin(); p != active.end(); ++p) + if (*p != mds->get_nodeid() && + fip.checked.count(*p) == 0) { + m = *p; + break; + } + } + if (m == MDS_RANK_NONE) { + all.erase(mds->get_nodeid()); + if (all != fip.checked) { + dout(10) << "_do_find_ino_peer waiting for more peers to be active" << dendl; + } else { + dout(10) 
<< "_do_find_ino_peer failed on " << fip.ino << dendl; + fip.fin->complete(-CEPHFS_ESTALE); + find_ino_peer.erase(fip.tid); + } + } else { + fip.checking = m; + mds->send_message_mds(make_message<MMDSFindIno>(fip.tid, fip.ino), m); + } +} + +void MDCache::handle_find_ino(const cref_t<MMDSFindIno> &m) +{ + if (mds->get_state() < MDSMap::STATE_REJOIN) { + return; + } + + dout(10) << "handle_find_ino " << *m << dendl; + auto r = make_message<MMDSFindInoReply>(m->tid); + CInode *in = get_inode(m->ino); + if (in) { + in->make_path(r->path); + dout(10) << " have " << r->path << " " << *in << dendl; + + /* + * If the the CInode was just created by using openc in current + * auth MDS, but the client just sends a getattr request to another + * replica MDS. Then here it will make a path of '#INODE-NUMBER' + * only because the CInode hasn't been linked yet, and the replica + * MDS will keep retrying until the auth MDS flushes the mdlog and + * the C_MDS_openc_finish and link_primary_inode are called at most + * 5 seconds later. + */ + if (!in->get_parent_dn() && in->is_auth()) { + mds->mdlog->flush(); + } + } + mds->send_message_mds(r, mds_rank_t(m->get_source().num())); +} + + +void MDCache::handle_find_ino_reply(const cref_t<MMDSFindInoReply> &m) +{ + auto p = find_ino_peer.find(m->tid); + if (p != find_ino_peer.end()) { + dout(10) << "handle_find_ino_reply " << *m << dendl; + find_ino_peer_info_t& fip = p->second; + + // success? + if (get_inode(fip.ino)) { + dout(10) << "handle_find_ino_reply successfully found " << fip.ino << dendl; + mds->queue_waiter(fip.fin); + find_ino_peer.erase(p); + return; + } + + mds_rank_t from = mds_rank_t(m->get_source().num()); + if (fip.checking == from) + fip.checking = MDS_RANK_NONE; + fip.checked.insert(from); + + if (!m->path.empty()) { + // we got a path! 
+ vector<CDentry*> trace; + CF_MDS_RetryMessageFactory cf(mds, m); + MDRequestRef null_ref; + int flags = MDS_TRAVERSE_DISCOVER; + if (fip.path_locked) + flags |= MDS_TRAVERSE_PATH_LOCKED; + int r = path_traverse(null_ref, cf, m->path, flags, &trace); + if (r > 0) + return; + dout(0) << "handle_find_ino_reply failed with " << r << " on " << m->path + << ", retrying" << dendl; + fip.checked.clear(); + _do_find_ino_peer(fip); + } else { + // nope, continue. + _do_find_ino_peer(fip); + } + } else { + dout(10) << "handle_find_ino_reply tid " << m->tid << " dne" << dendl; + } +} + +void MDCache::kick_find_ino_peers(mds_rank_t who) +{ + // find_ino_peers requests we should move on from + for (map<ceph_tid_t,find_ino_peer_info_t>::iterator p = find_ino_peer.begin(); + p != find_ino_peer.end(); + ++p) { + find_ino_peer_info_t& fip = p->second; + if (fip.checking == who) { + dout(10) << "kicking find_ino_peer " << fip.tid << " who was checking mds." << who << dendl; + fip.checking = MDS_RANK_NONE; + _do_find_ino_peer(fip); + } else if (fip.checking == MDS_RANK_NONE) { + dout(10) << "kicking find_ino_peer " << fip.tid << " who was waiting" << dendl; + _do_find_ino_peer(fip); + } + } +} + +/* ---------------------------- */ + +int MDCache::get_num_client_requests() +{ + int count = 0; + for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin(); + p != active_requests.end(); + ++p) { + MDRequestRef& mdr = p->second; + if (mdr->reqid.name.is_client() && !mdr->is_peer()) + count++; + } + return count; +} + +MDRequestRef MDCache::request_start(const cref_t<MClientRequest>& req) +{ + // did we win a forward race against a peer? 
+ if (active_requests.count(req->get_reqid())) { + MDRequestRef& mdr = active_requests[req->get_reqid()]; + ceph_assert(mdr); + if (mdr->is_peer()) { + dout(10) << "request_start already had " << *mdr << ", waiting for finish" << dendl; + mdr->more()->waiting_for_finish.push_back(new C_MDS_RetryMessage(mds, req)); + } else { + dout(10) << "request_start already processing " << *mdr << ", dropping new msg" << dendl; + } + return MDRequestRef(); + } + + // register new client request + MDRequestImpl::Params params; + params.reqid = req->get_reqid(); + params.attempt = req->get_num_fwd(); + params.client_req = req; + params.initiated = req->get_recv_stamp(); + params.throttled = req->get_throttle_stamp(); + params.all_read = req->get_recv_complete_stamp(); + params.dispatched = req->get_dispatch_stamp(); + + MDRequestRef mdr = + mds->op_tracker.create_request<MDRequestImpl,MDRequestImpl::Params*>(¶ms); + active_requests[params.reqid] = mdr; + mdr->set_op_stamp(req->get_stamp()); + dout(7) << "request_start " << *mdr << dendl; + return mdr; +} + +MDRequestRef MDCache::request_start_peer(metareqid_t ri, __u32 attempt, const cref_t<Message> &m) +{ + int by = m->get_source().num(); + MDRequestImpl::Params params; + params.reqid = ri; + params.attempt = attempt; + params.triggering_peer_req = m; + params.peer_to = by; + params.initiated = m->get_recv_stamp(); + params.throttled = m->get_throttle_stamp(); + params.all_read = m->get_recv_complete_stamp(); + params.dispatched = m->get_dispatch_stamp(); + MDRequestRef mdr = + mds->op_tracker.create_request<MDRequestImpl,MDRequestImpl::Params*>(¶ms); + ceph_assert(active_requests.count(mdr->reqid) == 0); + active_requests[mdr->reqid] = mdr; + dout(7) << "request_start_peer " << *mdr << " by mds." 
<< by << dendl; + return mdr; +} + +MDRequestRef MDCache::request_start_internal(int op) +{ + utime_t now = ceph_clock_now(); + MDRequestImpl::Params params; + params.reqid.name = entity_name_t::MDS(mds->get_nodeid()); + params.reqid.tid = mds->issue_tid(); + params.initiated = now; + params.throttled = now; + params.all_read = now; + params.dispatched = now; + params.internal_op = op; + MDRequestRef mdr = + mds->op_tracker.create_request<MDRequestImpl,MDRequestImpl::Params*>(¶ms); + + if (active_requests.count(mdr->reqid)) { + auto& _mdr = active_requests[mdr->reqid]; + dout(0) << __func__ << " existing " << *_mdr << " op " << _mdr->internal_op << dendl; + dout(0) << __func__ << " new " << *mdr << " op " << op << dendl; + ceph_abort(); + } + active_requests[mdr->reqid] = mdr; + dout(7) << __func__ << " " << *mdr << " op " << op << dendl; + return mdr; +} + +MDRequestRef MDCache::request_get(metareqid_t rid) +{ + ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.find(rid); + ceph_assert(p != active_requests.end()); + dout(7) << "request_get " << rid << " " << *p->second << dendl; + return p->second; +} + +void MDCache::request_finish(MDRequestRef& mdr) +{ + dout(7) << "request_finish " << *mdr << dendl; + mdr->mark_event("finishing request"); + + // peer finisher? + if (mdr->has_more() && mdr->more()->peer_commit) { + Context *fin = mdr->more()->peer_commit; + mdr->more()->peer_commit = 0; + int ret; + if (mdr->aborted) { + mdr->aborted = false; + ret = -1; + mdr->more()->peer_rolling_back = true; + } else { + ret = 0; + mdr->committing = true; + } + fin->complete(ret); // this must re-call request_finish. 
+ return; + } + + switch(mdr->internal_op) { + case CEPH_MDS_OP_FRAGMENTDIR: + logger->inc(l_mdss_ireq_fragmentdir); + break; + case CEPH_MDS_OP_EXPORTDIR: + logger->inc(l_mdss_ireq_exportdir); + break; + case CEPH_MDS_OP_ENQUEUE_SCRUB: + logger->inc(l_mdss_ireq_enqueue_scrub); + break; + case CEPH_MDS_OP_FLUSH: + logger->inc(l_mdss_ireq_flush); + break; + case CEPH_MDS_OP_REPAIR_FRAGSTATS: + logger->inc(l_mdss_ireq_fragstats); + break; + case CEPH_MDS_OP_REPAIR_INODESTATS: + logger->inc(l_mdss_ireq_inodestats); + break; + } + + request_cleanup(mdr); +} + + +void MDCache::request_forward(MDRequestRef& mdr, mds_rank_t who, int port) +{ + CachedStackStringStream css; + *css << "forwarding request to mds." << who; + mdr->mark_event(css->strv()); + if (mdr->client_request && mdr->client_request->get_source().is_client()) { + dout(7) << "request_forward " << *mdr << " to mds." << who << " req " + << *mdr->client_request << dendl; + if (mdr->is_batch_head()) { + mdr->release_batch_op()->forward(who); + } else { + mds->forward_message_mds(mdr, who); + } + if (mds->logger) mds->logger->inc(l_mds_forward); + } else if (mdr->internal_op >= 0) { + dout(10) << "request_forward on internal op; cancelling" << dendl; + mdr->internal_op_finish->complete(-CEPHFS_EXDEV); + } else { + dout(7) << "request_forward drop " << *mdr << " req " << *mdr->client_request + << " was from mds" << dendl; + } + request_cleanup(mdr); +} + + +void MDCache::dispatch_request(MDRequestRef& mdr) +{ + if (mdr->client_request) { + mds->server->dispatch_client_request(mdr); + } else if (mdr->peer_request) { + mds->server->dispatch_peer_request(mdr); + } else { + switch (mdr->internal_op) { + case CEPH_MDS_OP_FRAGMENTDIR: + dispatch_fragment_dir(mdr); + break; + case CEPH_MDS_OP_EXPORTDIR: + migrator->dispatch_export_dir(mdr, 0); + break; + case CEPH_MDS_OP_ENQUEUE_SCRUB: + enqueue_scrub_work(mdr); + break; + case CEPH_MDS_OP_FLUSH: + flush_dentry_work(mdr); + break; + case CEPH_MDS_OP_REPAIR_FRAGSTATS: + 
repair_dirfrag_stats_work(mdr); + break; + case CEPH_MDS_OP_REPAIR_INODESTATS: + repair_inode_stats_work(mdr); + break; + case CEPH_MDS_OP_RDLOCK_FRAGSSTATS: + rdlock_dirfrags_stats_work(mdr); + break; + default: + ceph_abort(); + } + } +} + + +void MDCache::request_drop_foreign_locks(MDRequestRef& mdr) +{ + if (!mdr->has_more()) + return; + + // clean up peers + // (will implicitly drop remote dn pins) + for (set<mds_rank_t>::iterator p = mdr->more()->peers.begin(); + p != mdr->more()->peers.end(); + ++p) { + auto r = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, + MMDSPeerRequest::OP_FINISH); + + if (mdr->killed && !mdr->committing) { + r->mark_abort(); + } else if (mdr->more()->srcdn_auth_mds == *p && + mdr->more()->inode_import.length() > 0) { + // information about rename imported caps + r->inode_export = std::move(mdr->more()->inode_import); + } + + mds->send_message_mds(r, *p); + } + + /* strip foreign xlocks out of lock lists, since the OP_FINISH drops them + * implicitly. Note that we don't call the finishers -- there shouldn't + * be any on a remote lock and the request finish wakes up all + * the waiters anyway! */ + + for (auto it = mdr->locks.begin(); it != mdr->locks.end(); ) { + SimpleLock *lock = it->lock; + if (it->is_xlock() && !lock->get_parent()->is_auth()) { + dout(10) << "request_drop_foreign_locks forgetting lock " << *lock + << " on " << lock->get_parent() << dendl; + lock->put_xlock(); + mdr->locks.erase(it++); + } else if (it->is_remote_wrlock()) { + dout(10) << "request_drop_foreign_locks forgetting remote_wrlock " << *lock + << " on mds." 
<< it->wrlock_target << " on " << *lock->get_parent() << dendl; + if (it->is_wrlock()) { + it->clear_remote_wrlock(); + ++it; + } else { + mdr->locks.erase(it++); + } + } else { + ++it; + } + } + + mdr->more()->peers.clear(); /* we no longer have requests out to them, and + * leaving them in can cause double-notifies as + * this function can get called more than once */ +} + +void MDCache::request_drop_non_rdlocks(MDRequestRef& mdr) +{ + request_drop_foreign_locks(mdr); + mds->locker->drop_non_rdlocks(mdr.get()); +} + +void MDCache::request_drop_locks(MDRequestRef& mdr) +{ + request_drop_foreign_locks(mdr); + mds->locker->drop_locks(mdr.get()); +} + +void MDCache::request_cleanup(MDRequestRef& mdr) +{ + dout(15) << "request_cleanup " << *mdr << dendl; + + if (mdr->has_more()) { + if (mdr->more()->is_ambiguous_auth) + mdr->clear_ambiguous_auth(); + if (!mdr->more()->waiting_for_finish.empty()) + mds->queue_waiters(mdr->more()->waiting_for_finish); + } + + request_drop_locks(mdr); + + // drop (local) auth pins + mdr->drop_local_auth_pins(); + + // drop stickydirs + mdr->put_stickydirs(); + + mds->locker->kick_cap_releases(mdr); + + // drop cache pins + mdr->drop_pins(); + + // remove from session + mdr->item_session_request.remove_myself(); + + // remove from map + active_requests.erase(mdr->reqid); + + if (mds->logger) + log_stat(); + + mdr->mark_event("cleaned up request"); +} + +void MDCache::request_kill(MDRequestRef& mdr) +{ + // rollback peer requests is tricky. just let the request proceed. 
+ if (mdr->has_more() && + (!mdr->more()->witnessed.empty() || !mdr->more()->waiting_on_peer.empty())) { + if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) { + ceph_assert(mdr->more()->witnessed.empty()); + mdr->aborted = true; + dout(10) << "request_kill " << *mdr << " -- waiting for peer reply, delaying" << dendl; + } else { + dout(10) << "request_kill " << *mdr << " -- already started peer prep, no-op" << dendl; + } + + ceph_assert(mdr->used_prealloc_ino == 0); + ceph_assert(mdr->prealloc_inos.empty()); + + mdr->session = NULL; + mdr->item_session_request.remove_myself(); + return; + } + + mdr->killed = true; + mdr->mark_event("killing request"); + + if (mdr->committing) { + dout(10) << "request_kill " << *mdr << " -- already committing, remove it from sesssion requests" << dendl; + mdr->item_session_request.remove_myself(); + } else { + dout(10) << "request_kill " << *mdr << dendl; + request_cleanup(mdr); + } +} + +// ------------------------------------------------------------------------------- +// SNAPREALMS + +void MDCache::create_global_snaprealm() +{ + CInode *in = new CInode(this); // dummy inode + create_unlinked_system_inode(in, CEPH_INO_GLOBAL_SNAPREALM, S_IFDIR|0755); + add_inode(in); + global_snaprealm = in->snaprealm; +} + +void MDCache::do_realm_invalidate_and_update_notify(CInode *in, int snapop, bool notify_clients) +{ + dout(10) << "do_realm_invalidate_and_update_notify " << *in->snaprealm << " " << *in << dendl; + + vector<inodeno_t> split_inos; + vector<inodeno_t> split_realms; + + if (notify_clients) { + if (snapop == CEPH_SNAP_OP_SPLIT) { + // notify clients of update|split + for (auto p = in->snaprealm->inodes_with_caps.begin(); !p.end(); ++p) + split_inos.push_back((*p)->ino()); + + for (auto& r : in->snaprealm->open_children) + split_realms.push_back(r->inode->ino()); + } + } + + map<client_t, ref_t<MClientSnap>> updates; + list<SnapRealm*> q; + q.push_back(in->snaprealm); + while (!q.empty()) { + SnapRealm *realm = q.front(); + 
q.pop_front(); + + dout(10) << " realm " << *realm << " on " << *realm->inode << dendl; + realm->invalidate_cached_snaps(); + + if (notify_clients) { + for (const auto& p : realm->client_caps) { + const auto& client = p.first; + const auto& caps = p.second; + ceph_assert(!caps->empty()); + + auto em = updates.emplace(std::piecewise_construct, std::forward_as_tuple(client), std::forward_as_tuple()); + if (em.second) { + auto update = make_message<MClientSnap>(CEPH_SNAP_OP_SPLIT); + update->head.split = in->ino(); + update->split_inos = split_inos; + update->split_realms = split_realms; + update->bl = mds->server->get_snap_trace(em.first->first, in->snaprealm); + em.first->second = std::move(update); + } + } + } + + // notify for active children, too. + dout(10) << " " << realm << " open_children are " << realm->open_children << dendl; + for (auto& r : realm->open_children) + q.push_back(r); + } + + if (notify_clients) + send_snaps(updates); +} + +void MDCache::send_snap_update(CInode *in, version_t stid, int snap_op) +{ + dout(10) << __func__ << " " << *in << " stid " << stid << dendl; + ceph_assert(in->is_auth()); + + set<mds_rank_t> mds_set; + if (stid > 0) { + mds->mdsmap->get_mds_set_lower_bound(mds_set, MDSMap::STATE_RESOLVE); + mds_set.erase(mds->get_nodeid()); + } else { + in->list_replicas(mds_set); + } + + if (!mds_set.empty()) { + bufferlist snap_blob; + in->encode_snap(snap_blob); + + for (auto p : mds_set) { + auto m = make_message<MMDSSnapUpdate>(in->ino(), stid, snap_op); + m->snap_blob = snap_blob; + mds->send_message_mds(m, p); + } + } + + if (stid > 0) + notify_global_snaprealm_update(snap_op); +} + +void MDCache::handle_snap_update(const cref_t<MMDSSnapUpdate> &m) +{ + mds_rank_t from = mds_rank_t(m->get_source().num()); + dout(10) << __func__ << " " << *m << " from mds." 
<< from << dendl; + + if (mds->get_state() < MDSMap::STATE_RESOLVE && + mds->get_want_state() != CEPH_MDS_STATE_RESOLVE) { + return; + } + + // null rejoin_done means open_snaprealms() has already been called + bool notify_clients = mds->get_state() > MDSMap::STATE_REJOIN || + (mds->is_rejoin() && !rejoin_done); + + if (m->get_tid() > 0) { + mds->snapclient->notify_commit(m->get_tid()); + if (notify_clients) + notify_global_snaprealm_update(m->get_snap_op()); + } + + CInode *in = get_inode(m->get_ino()); + if (in) { + ceph_assert(!in->is_auth()); + if (mds->get_state() > MDSMap::STATE_REJOIN || + (mds->is_rejoin() && !in->is_rejoining())) { + auto p = m->snap_blob.cbegin(); + in->decode_snap(p); + + if (!notify_clients) { + if (!rejoin_pending_snaprealms.count(in)) { + in->get(CInode::PIN_OPENINGSNAPPARENTS); + rejoin_pending_snaprealms.insert(in); + } + } + do_realm_invalidate_and_update_notify(in, m->get_snap_op(), notify_clients); + } + } +} + +void MDCache::notify_global_snaprealm_update(int snap_op) +{ + if (snap_op != CEPH_SNAP_OP_DESTROY) + snap_op = CEPH_SNAP_OP_UPDATE; + set<Session*> sessions; + mds->sessionmap.get_client_session_set(sessions); + for (auto &session : sessions) { + if (!session->is_open() && !session->is_stale()) + continue; + auto update = make_message<MClientSnap>(snap_op); + update->head.split = global_snaprealm->inode->ino(); + update->bl = mds->server->get_snap_trace(session, global_snaprealm); + mds->send_message_client_counted(update, session); + } +} + +// ------------------------------------------------------------------------------- +// STRAYS + +struct C_MDC_RetryScanStray : public MDCacheContext { + dirfrag_t next; + C_MDC_RetryScanStray(MDCache *c, dirfrag_t n) : MDCacheContext(c), next(n) { } + void finish(int r) override { + mdcache->scan_stray_dir(next); + } +}; + +void MDCache::scan_stray_dir(dirfrag_t next) +{ + dout(10) << "scan_stray_dir " << next << dendl; + + if (next.ino) + next.frag = 
strays[MDS_INO_STRAY_INDEX(next.ino)]->dirfragtree[next.frag.value()]; + + for (int i = 0; i < NUM_STRAY; ++i) { + if (strays[i]->ino() < next.ino) + continue; + + std::vector<CDir*> ls; + strays[i]->get_dirfrags(ls); + + for (const auto& dir : ls) { + if (dir->get_frag() < next.frag) + continue; + + if (!dir->can_auth_pin()) { + dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDC_RetryScanStray(this, dir->dirfrag())); + return; + } + + if (!dir->is_complete()) { + dir->fetch(new C_MDC_RetryScanStray(this, dir->dirfrag())); + return; + } + + for (auto &p : dir->items) { + CDentry *dn = p.second; + dn->state_set(CDentry::STATE_STRAY); + CDentry::linkage_t *dnl = dn->get_projected_linkage(); + if (dnl->is_primary()) { + CInode *in = dnl->get_inode(); + if (in->get_inode()->nlink == 0) + in->state_set(CInode::STATE_ORPHAN); + maybe_eval_stray(in); + } + } + } + next.frag = frag_t(); + } +} + +void MDCache::fetch_backtrace(inodeno_t ino, int64_t pool, bufferlist& bl, Context *fin) +{ + object_t oid = CInode::get_object_name(ino, frag_t(), ""); + mds->objecter->getxattr(oid, object_locator_t(pool), "parent", CEPH_NOSNAP, &bl, 0, fin); + if (mds->logger) + mds->logger->inc(l_mds_openino_backtrace_fetch); +} + + + + + +// ======================================================================================== +// DISCOVER +/* + + - for all discovers (except base_inos, e.g. root, stray), waiters are attached + to the parent metadata object in the cache (pinning it). + + - all discovers are tracked by tid, so that we can ignore potentially dup replies. + +*/ + +void MDCache::_send_discover(discover_info_t& d) +{ + auto dis = make_message<MDiscover>(d.ino, d.frag, d.snap, d.want_path, + d.want_base_dir, d.path_locked); + logger->inc(l_mdc_dir_send_discover); + dis->set_tid(d.tid); + mds->send_message_mds(dis, d.mds); +} + +void MDCache::discover_base_ino(inodeno_t want_ino, + MDSContext *onfinish, + mds_rank_t from) +{ + dout(7) << "discover_base_ino " << want_ino << " from mds." 
<< from << dendl; + if (waiting_for_base_ino[from].count(want_ino) == 0) { + discover_info_t& d = _create_discover(from); + d.ino = want_ino; + _send_discover(d); + } + waiting_for_base_ino[from][want_ino].push_back(onfinish); +} + + +void MDCache::discover_dir_frag(CInode *base, + frag_t approx_fg, + MDSContext *onfinish, + mds_rank_t from) +{ + if (from < 0) + from = base->authority().first; + + dirfrag_t df(base->ino(), approx_fg); + dout(7) << "discover_dir_frag " << df + << " from mds." << from << dendl; + + if (!base->is_waiting_for_dir(approx_fg) || !onfinish) { + discover_info_t& d = _create_discover(from); + d.pin_base(base); + d.ino = base->ino(); + d.frag = approx_fg; + d.want_base_dir = true; + _send_discover(d); + } + + if (onfinish) + base->add_dir_waiter(approx_fg, onfinish); +} + +struct C_MDC_RetryDiscoverPath : public MDCacheContext { + CInode *base; + snapid_t snapid; + filepath path; + mds_rank_t from; + C_MDC_RetryDiscoverPath(MDCache *c, CInode *b, snapid_t s, filepath &p, mds_rank_t f) : + MDCacheContext(c), base(b), snapid(s), path(p), from(f) {} + void finish(int r) override { + mdcache->discover_path(base, snapid, path, 0, from); + } +}; + +void MDCache::discover_path(CInode *base, + snapid_t snap, + filepath want_path, + MDSContext *onfinish, + bool path_locked, + mds_rank_t from) +{ + if (from < 0) + from = base->authority().first; + + dout(7) << "discover_path " << base->ino() << " " << want_path << " snap " << snap << " from mds." << from + << (path_locked ? 
" path_locked":"") + << dendl; + + if (base->is_ambiguous_auth()) { + dout(10) << " waiting for single auth on " << *base << dendl; + if (!onfinish) + onfinish = new C_MDC_RetryDiscoverPath(this, base, snap, want_path, from); + base->add_waiter(CInode::WAIT_SINGLEAUTH, onfinish); + return; + } else if (from == mds->get_nodeid()) { + MDSContext::vec finished; + base->take_waiting(CInode::WAIT_DIR, finished); + mds->queue_waiters(finished); + return; + } + + frag_t fg = base->pick_dirfrag(want_path[0]); + if ((path_locked && want_path.depth() == 1) || + !base->is_waiting_for_dir(fg) || !onfinish) { + discover_info_t& d = _create_discover(from); + d.ino = base->ino(); + d.pin_base(base); + d.frag = fg; + d.snap = snap; + d.want_path = want_path; + d.want_base_dir = true; + d.path_locked = path_locked; + _send_discover(d); + } + + // register + wait + if (onfinish) + base->add_dir_waiter(fg, onfinish); +} + +struct C_MDC_RetryDiscoverPath2 : public MDCacheContext { + CDir *base; + snapid_t snapid; + filepath path; + C_MDC_RetryDiscoverPath2(MDCache *c, CDir *b, snapid_t s, filepath &p) : + MDCacheContext(c), base(b), snapid(s), path(p) {} + void finish(int r) override { + mdcache->discover_path(base, snapid, path, 0); + } +}; + +void MDCache::discover_path(CDir *base, + snapid_t snap, + filepath want_path, + MDSContext *onfinish, + bool path_locked) +{ + mds_rank_t from = base->authority().first; + + dout(7) << "discover_path " << base->dirfrag() << " " << want_path << " snap " << snap << " from mds." << from + << (path_locked ? 
" path_locked":"") + << dendl; + + if (base->is_ambiguous_auth()) { + dout(7) << " waiting for single auth on " << *base << dendl; + if (!onfinish) + onfinish = new C_MDC_RetryDiscoverPath2(this, base, snap, want_path); + base->add_waiter(CDir::WAIT_SINGLEAUTH, onfinish); + return; + } + + if ((path_locked && want_path.depth() == 1) || + !base->is_waiting_for_dentry(want_path[0].c_str(), snap) || !onfinish) { + discover_info_t& d = _create_discover(from); + d.ino = base->ino(); + d.pin_base(base->inode); + d.frag = base->get_frag(); + d.snap = snap; + d.want_path = want_path; + d.want_base_dir = false; + d.path_locked = path_locked; + _send_discover(d); + } + + // register + wait + if (onfinish) + base->add_dentry_waiter(want_path[0], snap, onfinish); +} + +void MDCache::kick_discovers(mds_rank_t who) +{ + for (map<ceph_tid_t,discover_info_t>::iterator p = discovers.begin(); + p != discovers.end(); + ++p) { + if (p->second.mds != who) + continue; + _send_discover(p->second); + } +} + + +void MDCache::handle_discover(const cref_t<MDiscover> &dis) +{ + mds_rank_t whoami = mds->get_nodeid(); + mds_rank_t from = mds_rank_t(dis->get_source().num()); + + ceph_assert(from != whoami); + + if (mds->get_state() <= MDSMap::STATE_REJOIN) { + if (mds->get_state() < MDSMap::STATE_REJOIN && + mds->get_want_state() < CEPH_MDS_STATE_REJOIN) { + return; + } + + // proceed if requester is in the REJOIN stage, the request is from parallel_fetch(). + // delay processing request from survivor because we may not yet choose lock states. + if (!mds->mdsmap->is_rejoin(from)) { + dout(0) << "discover_reply not yet active(|still rejoining), delaying" << dendl; + mds->wait_for_replay(new C_MDS_RetryMessage(mds, dis)); + return; + } + } + + + CInode *cur = 0; + auto reply = make_message<MDiscoverReply>(*dis); + + snapid_t snapid = dis->get_snapid(); + + logger->inc(l_mdc_dir_handle_discover); + + // get started. 
+ if (MDS_INO_IS_BASE(dis->get_base_ino()) && + !dis->wants_base_dir() && dis->get_want().depth() == 0) { + // wants root + dout(7) << "handle_discover from mds." << from + << " wants base + " << dis->get_want().get_path() + << " snap " << snapid + << dendl; + + cur = get_inode(dis->get_base_ino()); + ceph_assert(cur); + + // add root + reply->starts_with = MDiscoverReply::INODE; + encode_replica_inode(cur, from, reply->trace, mds->mdsmap->get_up_features()); + dout(10) << "added base " << *cur << dendl; + } + else { + // there's a base inode + cur = get_inode(dis->get_base_ino(), snapid); + if (!cur && snapid != CEPH_NOSNAP) { + cur = get_inode(dis->get_base_ino()); + if (cur && !cur->is_multiversion()) + cur = NULL; // nope! + } + + if (!cur) { + dout(7) << "handle_discover mds." << from + << " don't have base ino " << dis->get_base_ino() << "." << snapid + << dendl; + if (!dis->wants_base_dir() && dis->get_want().depth() > 0) + reply->set_error_dentry(dis->get_dentry(0)); + reply->set_flag_error_dir(); + } else if (dis->wants_base_dir()) { + dout(7) << "handle_discover mds." << from + << " wants basedir+" << dis->get_want().get_path() + << " has " << *cur + << dendl; + } else { + dout(7) << "handle_discover mds." << from + << " wants " << dis->get_want().get_path() + << " has " << *cur + << dendl; + } + } + + ceph_assert(reply); + + // add content + // do some fidgeting to include a dir if they asked for the base dir, or just root. + for (unsigned i = 0; + cur && (i < dis->get_want().depth() || dis->get_want().depth() == 0); + i++) { + + // -- figure out the dir + + // is *cur even a dir at all? 
+ if (!cur->is_dir()) { + dout(7) << *cur << " not a dir" << dendl; + reply->set_flag_error_dir(); + break; + } + + // pick frag + frag_t fg; + if (dis->get_want().depth()) { + // dentry specifies + fg = cur->pick_dirfrag(dis->get_dentry(i)); + } else { + // requester explicity specified the frag + ceph_assert(dis->wants_base_dir() || MDS_INO_IS_BASE(dis->get_base_ino())); + fg = dis->get_base_dir_frag(); + if (!cur->dirfragtree.is_leaf(fg)) + fg = cur->dirfragtree[fg.value()]; + } + CDir *curdir = cur->get_dirfrag(fg); + + if ((!curdir && !cur->is_auth()) || + (curdir && !curdir->is_auth())) { + + /* before: + * ONLY set flag if empty!! + * otherwise requester will wake up waiter(s) _and_ continue with discover, + * resulting in duplicate discovers in flight, + * which can wreak havoc when discovering rename srcdn (which may move) + */ + + if (reply->is_empty()) { + // only hint if empty. + // someday this could be better, but right now the waiter logic isn't smart enough. + + // hint + if (curdir) { + dout(7) << " not dirfrag auth, setting dir_auth_hint for " << *curdir << dendl; + reply->set_dir_auth_hint(curdir->authority().first); + } else { + dout(7) << " dirfrag not open, not inode auth, setting dir_auth_hint for " + << *cur << dendl; + reply->set_dir_auth_hint(cur->authority().first); + } + + // note error dentry, if any + // NOTE: important, as it allows requester to issue an equivalent discover + // to whomever we hint at. + if (dis->get_want().depth() > i) + reply->set_error_dentry(dis->get_dentry(i)); + } + + break; + } + + if (!curdir) { // open dir? 
+ if (cur->is_frozen()) { + if (!reply->is_empty()) { + dout(7) << *cur << " is frozen, non-empty reply, stopping" << dendl; + break; + } + dout(7) << *cur << " is frozen, empty reply, waiting" << dendl; + cur->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryMessage(mds, dis)); + return; + } + curdir = cur->get_or_open_dirfrag(this, fg); + } else if (curdir->is_frozen_tree() || + (curdir->is_frozen_dir() && fragment_are_all_frozen(curdir))) { + if (!reply->is_empty()) { + dout(7) << *curdir << " is frozen, non-empty reply, stopping" << dendl; + break; + } + if (dis->wants_base_dir() && dis->get_base_dir_frag() != curdir->get_frag()) { + dout(7) << *curdir << " is frozen, dirfrag mismatch, stopping" << dendl; + reply->set_flag_error_dir(); + break; + } + dout(7) << *curdir << " is frozen, empty reply, waiting" << dendl; + curdir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryMessage(mds, dis)); + return; + } + + // add dir + if (curdir->get_version() == 0) { + // fetch newly opened dir + } else if (reply->is_empty() && !dis->wants_base_dir()) { + dout(7) << "handle_discover not adding unwanted base dir " << *curdir << dendl; + // make sure the base frag is correct, though, in there was a refragment since the + // original request was sent. + reply->set_base_dir_frag(curdir->get_frag()); + } else { + ceph_assert(!curdir->is_ambiguous_auth()); // would be frozen. + if (!reply->trace.length()) + reply->starts_with = MDiscoverReply::DIR; + encode_replica_dir(curdir, from, reply->trace); + dout(7) << "handle_discover added dir " << *curdir << dendl; + } + + // lookup + CDentry *dn = 0; + std::string_view dname; + if (dis->get_want().depth() > 0) + dname = dis->get_dentry(i); + if (curdir->get_version() == 0) { + // fetch newly opened dir + ceph_assert(!curdir->has_bloom()); + } else if (dname.size() > 0) { + // lookup dentry + dn = curdir->lookup(dname, snapid); + } else + break; // done! + + // incomplete dir? 
+ if (!dn) { + if (!curdir->is_complete() && + !(dname.size() > 0 && + snapid == CEPH_NOSNAP && + curdir->has_bloom() && + !curdir->is_in_bloom(dname))) { + // readdir + dout(7) << "incomplete dir contents for " << *curdir << ", fetching" << dendl; + if (reply->is_empty()) { + // fetch and wait + curdir->fetch(dname, snapid, new C_MDS_RetryMessage(mds, dis), + dis->wants_base_dir() && curdir->get_version() == 0); + return; + } else { + // initiate fetch, but send what we have so far + curdir->fetch(dname, snapid, nullptr); + break; + } + } + + if (snapid != CEPH_NOSNAP && !reply->is_empty()) { + dout(7) << "dentry " << dis->get_dentry(i) << " snap " << snapid + << " dne, non-empty reply, stopping" << dendl; + break; + } + + // send null dentry + dout(7) << "dentry " << dis->get_dentry(i) << " dne, returning null in " + << *curdir << dendl; + if (snapid == CEPH_NOSNAP) + dn = curdir->add_null_dentry(dis->get_dentry(i)); + else + dn = curdir->add_null_dentry(dis->get_dentry(i), snapid, snapid); + } + ceph_assert(dn); + + // don't add replica to purging dentry/inode + if (dn->state_test(CDentry::STATE_PURGING)) { + if (reply->is_empty()) + reply->set_flag_error_dn(dis->get_dentry(i)); + break; + } + + CDentry::linkage_t *dnl = dn->get_linkage(); + + // xlocked dentry? + // ...always block on non-tail items (they are unrelated) + // ...allow xlocked tail disocvery _only_ if explicitly requested + if (dn->lock.is_xlocked()) { + // is this the last (tail) item in the discover traversal? + if (dis->is_path_locked()) { + dout(7) << "handle_discover allowing discovery of xlocked " << *dn << dendl; + } else if (reply->is_empty()) { + dout(7) << "handle_discover blocking on xlocked " << *dn << dendl; + dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryMessage(mds, dis)); + return; + } else { + dout(7) << "handle_discover non-empty reply, xlocked tail " << *dn << dendl; + break; + } + } + + // frozen inode? 
+ bool tailitem = (dis->get_want().depth() == 0) || (i == dis->get_want().depth() - 1); + if (dnl->is_primary() && dnl->get_inode()->is_frozen_inode()) { + if (tailitem && dis->is_path_locked()) { + dout(7) << "handle_discover allowing discovery of frozen tail " << *dnl->get_inode() << dendl; + } else if (reply->is_empty()) { + dout(7) << *dnl->get_inode() << " is frozen, empty reply, waiting" << dendl; + dnl->get_inode()->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryMessage(mds, dis)); + return; + } else { + dout(7) << *dnl->get_inode() << " is frozen, non-empty reply, stopping" << dendl; + break; + } + } + + // add dentry + if (!reply->trace.length()) + reply->starts_with = MDiscoverReply::DENTRY; + encode_replica_dentry(dn, from, reply->trace); + dout(7) << "handle_discover added dentry " << *dn << dendl; + + if (!dnl->is_primary()) break; // stop on null or remote link. + + // add inode + CInode *next = dnl->get_inode(); + ceph_assert(next->is_auth()); + + encode_replica_inode(next, from, reply->trace, mds->mdsmap->get_up_features()); + dout(7) << "handle_discover added inode " << *next << dendl; + + // descend, keep going. + cur = next; + continue; + } + + // how did we do? + ceph_assert(!reply->is_empty()); + dout(7) << "handle_discover sending result back to asker mds." 
<< from << dendl; + mds->send_message(reply, dis->get_connection()); +} + +void MDCache::handle_discover_reply(const cref_t<MDiscoverReply> &m) +{ + /* + if (mds->get_state() < MDSMap::STATE_ACTIVE) { + dout(0) << "discover_reply NOT ACTIVE YET" << dendl; + return; + } + */ + dout(7) << "discover_reply " << *m << dendl; + if (m->is_flag_error_dir()) + dout(7) << " flag error, dir" << dendl; + if (m->is_flag_error_dn()) + dout(7) << " flag error, dentry = " << m->get_error_dentry() << dendl; + + MDSContext::vec finished, error; + mds_rank_t from = mds_rank_t(m->get_source().num()); + + // starting point + CInode *cur = get_inode(m->get_base_ino()); + auto p = m->trace.cbegin(); + + int next = m->starts_with; + + // decrement discover counters + if (m->get_tid()) { + map<ceph_tid_t,discover_info_t>::iterator p = discovers.find(m->get_tid()); + if (p != discovers.end()) { + dout(10) << " found tid " << m->get_tid() << dendl; + discovers.erase(p); + } else { + dout(10) << " tid " << m->get_tid() << " not found, must be dup reply" << dendl; + } + } + + // discover may start with an inode + if (!p.end() && next == MDiscoverReply::INODE) { + decode_replica_inode(cur, p, NULL, finished); + dout(7) << "discover_reply got base inode " << *cur << dendl; + ceph_assert(cur->is_base()); + + next = MDiscoverReply::DIR; + + // take waiters? + if (cur->is_base() && + waiting_for_base_ino[from].count(cur->ino())) { + finished.swap(waiting_for_base_ino[from][cur->ino()]); + waiting_for_base_ino[from].erase(cur->ino()); + } + } + ceph_assert(cur); + + // loop over discover results. + // indexes follow each ([[dir] dentry] inode) + // can start, end with any type. 
+ while (!p.end()) { + // dir + frag_t fg; + CDir *curdir = nullptr; + if (next == MDiscoverReply::DIR) { + decode_replica_dir(curdir, p, cur, mds_rank_t(m->get_source().num()), finished); + if (cur->ino() == m->get_base_ino() && curdir->get_frag() != m->get_base_dir_frag()) { + ceph_assert(m->get_wanted_base_dir()); + cur->take_dir_waiting(m->get_base_dir_frag(), finished); + } + } else { + // note: this can only happen our first way around this loop. + if (p.end() && m->is_flag_error_dn()) { + fg = cur->pick_dirfrag(m->get_error_dentry()); + curdir = cur->get_dirfrag(fg); + } else + curdir = cur->get_dirfrag(m->get_base_dir_frag()); + } + + if (p.end()) + break; + + // dentry + CDentry *dn = nullptr; + decode_replica_dentry(dn, p, curdir, finished); + + if (p.end()) + break; + + // inode + decode_replica_inode(cur, p, dn, finished); + + next = MDiscoverReply::DIR; + } + + // dir error? + // or dir_auth hint? + if (m->is_flag_error_dir() && !cur->is_dir()) { + // not a dir. + cur->take_waiting(CInode::WAIT_DIR, error); + } else if (m->is_flag_error_dir() || m->get_dir_auth_hint() != CDIR_AUTH_UNKNOWN) { + mds_rank_t who = m->get_dir_auth_hint(); + if (who == mds->get_nodeid()) who = -1; + if (who >= 0) + dout(7) << " dir_auth_hint is " << m->get_dir_auth_hint() << dendl; + + + if (m->get_wanted_base_dir()) { + frag_t fg = m->get_base_dir_frag(); + CDir *dir = cur->get_dirfrag(fg); + + if (cur->is_waiting_for_dir(fg)) { + if (cur->is_auth()) + cur->take_waiting(CInode::WAIT_DIR, finished); + else if (dir || !cur->dirfragtree.is_leaf(fg)) + cur->take_dir_waiting(fg, finished); + else + discover_dir_frag(cur, fg, 0, who); + } else + dout(7) << " doing nothing, nobody is waiting for dir" << dendl; + } + + // try again? 
+ if (m->get_error_dentry().length()) { + frag_t fg = cur->pick_dirfrag(m->get_error_dentry()); + CDir *dir = cur->get_dirfrag(fg); + // wanted a dentry + if (dir && dir->is_waiting_for_dentry(m->get_error_dentry(), m->get_wanted_snapid())) { + if (dir->is_auth() || dir->lookup(m->get_error_dentry())) { + dir->take_dentry_waiting(m->get_error_dentry(), m->get_wanted_snapid(), + m->get_wanted_snapid(), finished); + } else { + filepath relpath(m->get_error_dentry(), 0); + discover_path(dir, m->get_wanted_snapid(), relpath, 0, m->is_path_locked()); + } + } else + dout(7) << " doing nothing, have dir but nobody is waiting on dentry " + << m->get_error_dentry() << dendl; + } + } else if (m->is_flag_error_dn()) { + frag_t fg = cur->pick_dirfrag(m->get_error_dentry()); + CDir *dir = cur->get_dirfrag(fg); + if (dir && !dir->is_auth()) { + dir->take_dentry_waiting(m->get_error_dentry(), m->get_wanted_snapid(), + m->get_wanted_snapid(), error); + } + } + + // waiters + finish_contexts(g_ceph_context, error, -CEPHFS_ENOENT); // finish errors directly + mds->queue_waiters(finished); +} + + + +// ---------------------------- +// REPLICAS + + +void MDCache::encode_replica_dir(CDir *dir, mds_rank_t to, bufferlist& bl) +{ + ENCODE_START(1, 1, bl); + dirfrag_t df = dir->dirfrag(); + encode(df, bl); + __u32 nonce = dir->add_replica(to); + encode(nonce, bl); + dir->_encode_base(bl); + ENCODE_FINISH(bl); +} + +void MDCache::encode_replica_dentry(CDentry *dn, mds_rank_t to, bufferlist& bl) +{ + ENCODE_START(2, 1, bl); + encode(dn->get_name(), bl); + encode(dn->last, bl); + + __u32 nonce = dn->add_replica(to); + encode(nonce, bl); + encode(dn->first, bl); + encode(dn->linkage.remote_ino, bl); + encode(dn->linkage.remote_d_type, bl); + dn->lock.encode_state_for_replica(bl); + bool need_recover = mds->get_state() < MDSMap::STATE_ACTIVE; + encode(need_recover, bl); + encode(dn->alternate_name, bl); + ENCODE_FINISH(bl); +} + +void MDCache::encode_replica_inode(CInode *in, mds_rank_t to, 
bufferlist& bl, + uint64_t features) +{ + ceph_assert(in->is_auth()); + + ENCODE_START(2, 1, bl); + encode(in->ino(), bl); // bleh, minor assymetry here + encode(in->last, bl); + + __u32 nonce = in->add_replica(to); + encode(nonce, bl); + + in->_encode_base(bl, features); + in->_encode_locks_state_for_replica(bl, mds->get_state() < MDSMap::STATE_ACTIVE); + + __u32 state = in->state; + encode(state, bl); + + ENCODE_FINISH(bl); +} + +void MDCache::decode_replica_dir(CDir *&dir, bufferlist::const_iterator& p, CInode *diri, mds_rank_t from, + MDSContext::vec& finished) +{ + DECODE_START(1, p); + dirfrag_t df; + decode(df, p); + + ceph_assert(diri->ino() == df.ino); + + // add it (_replica_) + dir = diri->get_dirfrag(df.frag); + + if (dir) { + // had replica. update w/ new nonce. + __u32 nonce; + decode(nonce, p); + dir->set_replica_nonce(nonce); + dir->_decode_base(p); + dout(7) << __func__ << " had " << *dir << " nonce " << dir->replica_nonce << dendl; + } else { + // force frag to leaf in the diri tree + if (!diri->dirfragtree.is_leaf(df.frag)) { + dout(7) << __func__ << " forcing frag " << df.frag << " to leaf in the fragtree " + << diri->dirfragtree << dendl; + diri->dirfragtree.force_to_leaf(g_ceph_context, df.frag); + } + // add replica. + dir = diri->add_dirfrag( new CDir(diri, df.frag, this, false) ); + __u32 nonce; + decode(nonce, p); + dir->set_replica_nonce(nonce); + dir->_decode_base(p); + // is this a dir_auth delegation boundary? 
+ if (from != diri->authority().first || + diri->is_ambiguous_auth() || + diri->is_base()) + adjust_subtree_auth(dir, from); + + dout(7) << __func__ << " added " << *dir << " nonce " << dir->replica_nonce << dendl; + // get waiters + diri->take_dir_waiting(df.frag, finished); + } + DECODE_FINISH(p); +} + +void MDCache::decode_replica_dentry(CDentry *&dn, bufferlist::const_iterator& p, CDir *dir, MDSContext::vec& finished) +{ + DECODE_START(1, p); + string name; + snapid_t last; + decode(name, p); + decode(last, p); + + dn = dir->lookup(name, last); + + // have it? + bool is_new = false; + if (dn) { + is_new = false; + dout(7) << __func__ << " had " << *dn << dendl; + } else { + is_new = true; + dn = dir->add_null_dentry(name, 1 /* this will get updated below */, last); + dout(7) << __func__ << " added " << *dn << dendl; + } + + __u32 nonce; + decode(nonce, p); + dn->set_replica_nonce(nonce); + decode(dn->first, p); + + inodeno_t rino; + unsigned char rdtype; + decode(rino, p); + decode(rdtype, p); + dn->lock.decode_state(p, is_new); + + bool need_recover; + decode(need_recover, p); + + mempool::mds_co::string alternate_name; + if (struct_v >= 2) { + decode(alternate_name, p); + } + + if (is_new) { + dn->set_alternate_name(std::move(alternate_name)); + if (rino) + dir->link_remote_inode(dn, rino, rdtype); + if (need_recover) + dn->lock.mark_need_recover(); + } else { + ceph_assert(dn->alternate_name == alternate_name); + } + + dir->take_dentry_waiting(name, dn->first, dn->last, finished); + DECODE_FINISH(p); +} + +void MDCache::decode_replica_inode(CInode *&in, bufferlist::const_iterator& p, CDentry *dn, MDSContext::vec& finished) +{ + DECODE_START(2, p); + inodeno_t ino; + snapid_t last; + __u32 nonce; + decode(ino, p); + decode(last, p); + decode(nonce, p); + in = get_inode(ino, last); + if (!in) { + in = new CInode(this, false, 2, last); + in->set_replica_nonce(nonce); + in->_decode_base(p); + in->_decode_locks_state_for_replica(p, true); + add_inode(in); + if 
(in->ino() == CEPH_INO_ROOT) + in->inode_auth.first = 0; + else if (in->is_mdsdir()) + in->inode_auth.first = in->ino() - MDS_INO_MDSDIR_OFFSET; + dout(10) << __func__ << " added " << *in << dendl; + if (dn) { + ceph_assert(dn->get_linkage()->is_null()); + dn->dir->link_primary_inode(dn, in); + } + } else { + in->set_replica_nonce(nonce); + in->_decode_base(p); + in->_decode_locks_state_for_replica(p, false); + dout(10) << __func__ << " had " << *in << dendl; + } + + if (dn) { + if (!dn->get_linkage()->is_primary() || dn->get_linkage()->get_inode() != in) + dout(10) << __func__ << " different linkage in dentry " << *dn << dendl; + } + + if (struct_v >= 2) { + __u32 s; + decode(s, p); + s &= CInode::MASK_STATE_REPLICATED; + if (s & CInode::STATE_RANDEPHEMERALPIN) { + dout(10) << "replica inode is random ephemeral pinned" << dendl; + in->set_ephemeral_pin(false, true); + } + } + + DECODE_FINISH(p); +} + + +void MDCache::encode_replica_stray(CDentry *straydn, mds_rank_t who, bufferlist& bl) +{ + ceph_assert(straydn->get_num_auth_pins()); + ENCODE_START(2, 1, bl); + uint64_t features = mds->mdsmap->get_up_features(); + encode_replica_inode(get_myin(), who, bl, features); + encode_replica_dir(straydn->get_dir()->inode->get_parent_dn()->get_dir(), who, bl); + encode_replica_dentry(straydn->get_dir()->inode->get_parent_dn(), who, bl); + encode_replica_inode(straydn->get_dir()->inode, who, bl, features); + encode_replica_dir(straydn->get_dir(), who, bl); + encode_replica_dentry(straydn, who, bl); + if (!straydn->get_projected_linkage()->is_null()) { + encode_replica_inode(straydn->get_projected_linkage()->get_inode(), who, bl, features); + } + ENCODE_FINISH(bl); +} + +void MDCache::decode_replica_stray(CDentry *&straydn, CInode **in, const bufferlist &bl, mds_rank_t from) +{ + MDSContext::vec finished; + auto p = bl.cbegin(); + + DECODE_START(2, p); + CInode *mdsin = nullptr; + decode_replica_inode(mdsin, p, NULL, finished); + CDir *mdsdir = nullptr; + 
decode_replica_dir(mdsdir, p, mdsin, from, finished); + CDentry *straydirdn = nullptr; + decode_replica_dentry(straydirdn, p, mdsdir, finished); + CInode *strayin = nullptr; + decode_replica_inode(strayin, p, straydirdn, finished); + CDir *straydir = nullptr; + decode_replica_dir(straydir, p, strayin, from, finished); + + decode_replica_dentry(straydn, p, straydir, finished); + if (struct_v >= 2 && in) { + decode_replica_inode(*in, p, straydn, finished); + } + if (!finished.empty()) + mds->queue_waiters(finished); + DECODE_FINISH(p); +} + + +int MDCache::send_dir_updates(CDir *dir, bool bcast) +{ + // this is an FYI, re: replication + + set<mds_rank_t> who; + if (bcast) { + set<mds_rank_t> mds_set; + mds->get_mds_map()->get_active_mds_set(mds_set); + + set<mds_rank_t> replica_set; + for (const auto &p : dir->get_replicas()) { + replica_set.insert(p.first); + } + + std::set_difference(mds_set.begin(), mds_set.end(), + replica_set.begin(), replica_set.end(), + std::inserter(who, who.end())); + } else { + for (const auto &p : dir->get_replicas()) { + who.insert(p.first); + } + } + + dout(7) << "sending dir_update on " << *dir << " bcast " << bcast << " to " << who << dendl; + + filepath path; + dir->inode->make_path(path); + + std::set<int32_t> dir_rep_set; + for (const auto &r : dir->dir_rep_by) { + dir_rep_set.insert(r); + } + + mds_rank_t whoami = mds->get_nodeid(); + for (set<mds_rank_t>::iterator it = who.begin(); + it != who.end(); + ++it) { + if (*it == whoami) continue; + //if (*it == except) continue; + dout(7) << "sending dir_update on " << *dir << " to " << *it << dendl; + + logger->inc(l_mdc_dir_update); + mds->send_message_mds(make_message<MDirUpdate>(mds->get_nodeid(), dir->dirfrag(), dir->dir_rep, dir_rep_set, path, bcast), *it); + } + + return 0; +} + +void MDCache::handle_dir_update(const cref_t<MDirUpdate> &m) +{ + dirfrag_t df = m->get_dirfrag(); + CDir *dir = get_dirfrag(df); + logger->inc(l_mdc_dir_update_receipt); + if (!dir) { + dout(5) << 
"dir_update on " << df << ", don't have it" << dendl; + + // discover it? + if (m->should_discover()) { + // only try once! + // this is key to avoid a fragtree update race, among other things. + m->inc_tried_discover(); + vector<CDentry*> trace; + CInode *in; + filepath path = m->get_path(); + dout(5) << "trying discover on dir_update for " << path << dendl; + logger->inc(l_mdc_dir_try_discover); + CF_MDS_RetryMessageFactory cf(mds, m); + MDRequestRef null_ref; + int r = path_traverse(null_ref, cf, path, MDS_TRAVERSE_DISCOVER, &trace, &in); + if (r > 0) + return; + if (r == 0 && + in->ino() == df.ino && + in->get_approx_dirfrag(df.frag) == NULL) { + open_remote_dirfrag(in, df.frag, new C_MDS_RetryMessage(mds, m)); + return; + } + } + + return; + } + + if (!m->has_tried_discover()) { + // Update if it already exists. Othwerwise it got updated by discover reply. + dout(5) << "dir_update on " << *dir << dendl; + dir->dir_rep = m->get_dir_rep(); + dir->dir_rep_by.clear(); + for (const auto &e : m->get_dir_rep_by()) { + dir->dir_rep_by.insert(e); + } + } +} + + + + + +// LINK + +void MDCache::encode_remote_dentry_link(CDentry::linkage_t *dnl, bufferlist& bl) +{ + ENCODE_START(1, 1, bl); + inodeno_t ino = dnl->get_remote_ino(); + encode(ino, bl); + __u8 d_type = dnl->get_remote_d_type(); + encode(d_type, bl); + ENCODE_FINISH(bl); +} + +void MDCache::decode_remote_dentry_link(CDir *dir, CDentry *dn, bufferlist::const_iterator& p) +{ + DECODE_START(1, p); + inodeno_t ino; + __u8 d_type; + decode(ino, p); + decode(d_type, p); + dout(10) << __func__ << " remote " << ino << " " << d_type << dendl; + dir->link_remote_inode(dn, ino, d_type); + DECODE_FINISH(p); +} + +void MDCache::send_dentry_link(CDentry *dn, MDRequestRef& mdr) +{ + dout(7) << __func__ << " " << *dn << dendl; + + CDir *subtree = get_subtree_root(dn->get_dir()); + for (const auto &p : dn->get_replicas()) { + // don't tell (rename) witnesses; they already know + if (mdr.get() && 
mdr->more()->witnessed.count(p.first))
      continue;
    // skip peers that are not yet in rejoin, or that are in rejoin and
    // still listed in rejoin_gather
    if (mds->mdsmap->get_state(p.first) < MDSMap::STATE_REJOIN ||
        (mds->mdsmap->get_state(p.first) == MDSMap::STATE_REJOIN &&
         rejoin_gather.count(p.first)))
      continue;
    CDentry::linkage_t *dnl = dn->get_linkage();
    auto m = make_message<MDentryLink>(subtree->dirfrag(), dn->get_dir()->dirfrag(), dn->get_name(), dnl->is_primary());
    if (dnl->is_primary()) {
      // primary link: ship a replica of the inode itself
      dout(10) << __func__ << " primary " << *dnl->get_inode() << dendl;
      encode_replica_inode(dnl->get_inode(), p.first, m->bl,
                           mds->mdsmap->get_up_features());
    } else if (dnl->is_remote()) {
      // remote link: only ino + d_type are needed
      encode_remote_dentry_link(dnl, m->bl);
    } else
      ceph_abort();   // aie, bad caller!
    mds->send_message_mds(m, p.first);
  }
}

/**
 * handle_dentry_link -- a peer tells us one of our replicated null dentries
 * has been linked.
 *
 * The dentry must be a non-auth, null dentry that we already have; a missing
 * dirfrag or dentry is logged and then aborts further down.
 */
void MDCache::handle_dentry_link(const cref_t<MDentryLink> &m)
{
  CDentry *dn = NULL;
  CDir *dir = get_dirfrag(m->get_dirfrag());
  if (!dir) {
    dout(7) << __func__ << " don't have dirfrag " << m->get_dirfrag() << dendl;
  } else {
    dn = dir->lookup(m->get_dn());
    if (!dn) {
      dout(7) << __func__ << " don't have dentry " << *dir << " dn " << m->get_dn() << dendl;
    } else {
      dout(7) << __func__ << " on " << *dn << dendl;
      CDentry::linkage_t *dnl = dn->get_linkage();

      ceph_assert(!dn->is_auth());
      ceph_assert(dnl->is_null());
    }
  }

  auto p = m->bl.cbegin();
  MDSContext::vec finished;
  if (dn) {
    if (m->get_is_primary()) {
      // primary link.
      CInode *in = nullptr;
      decode_replica_inode(in, p, dn, finished);
    } else {
      // remote link, easy enough.
+ decode_remote_dentry_link(dir, dn, p); + } + } else { + ceph_abort(); + } + + if (!finished.empty()) + mds->queue_waiters(finished); + + return; +} + + +// UNLINK + +void MDCache::send_dentry_unlink(CDentry *dn, CDentry *straydn, MDRequestRef& mdr) +{ + dout(10) << __func__ << " " << *dn << dendl; + // share unlink news with replicas + set<mds_rank_t> replicas; + dn->list_replicas(replicas); + bufferlist snapbl; + if (straydn) { + straydn->list_replicas(replicas); + CInode *strayin = straydn->get_linkage()->get_inode(); + strayin->encode_snap_blob(snapbl); + } + for (set<mds_rank_t>::iterator it = replicas.begin(); + it != replicas.end(); + ++it) { + // don't tell (rmdir) witnesses; they already know + if (mdr.get() && mdr->more()->witnessed.count(*it)) + continue; + + if (mds->mdsmap->get_state(*it) < MDSMap::STATE_REJOIN || + (mds->mdsmap->get_state(*it) == MDSMap::STATE_REJOIN && + rejoin_gather.count(*it))) + continue; + + auto unlink = make_message<MDentryUnlink>(dn->get_dir()->dirfrag(), dn->get_name()); + if (straydn) { + encode_replica_stray(straydn, *it, unlink->straybl); + unlink->snapbl = snapbl; + } + mds->send_message_mds(unlink, *it); + } +} + +void MDCache::handle_dentry_unlink(const cref_t<MDentryUnlink> &m) +{ + // straydn + CDentry *straydn = nullptr; + CInode *strayin = nullptr; + if (m->straybl.length()) + decode_replica_stray(straydn, &strayin, m->straybl, mds_rank_t(m->get_source().num())); + + CDir *dir = get_dirfrag(m->get_dirfrag()); + if (!dir) { + dout(7) << __func__ << " don't have dirfrag " << m->get_dirfrag() << dendl; + } else { + CDentry *dn = dir->lookup(m->get_dn()); + if (!dn) { + dout(7) << __func__ << " don't have dentry " << *dir << " dn " << m->get_dn() << dendl; + } else { + dout(7) << __func__ << " on " << *dn << dendl; + CDentry::linkage_t *dnl = dn->get_linkage(); + + // open inode? 
if (dnl->is_primary()) {
        // primary link: move the inode from dn to the stray dentry
        CInode *in = dnl->get_inode();
        dn->dir->unlink_inode(dn);
        ceph_assert(straydn);
        straydn->dir->link_primary_inode(straydn, in);

        // in->first is lazily updated on replica; drag it forward so
        // that we always keep it in sync with the dn
        ceph_assert(straydn->first >= in->first);
        in->first = straydn->first;

        // update subtree map?
        if (in->is_dir())
          adjust_subtree_after_rename(in, dir, false);

        if (m->snapbl.length()) {
          // apply the sender's snap blob; if this gave the inode a snaprealm
          // it did not have before, notify with CEPH_SNAP_OP_SPLIT
          bool hadrealm = (in->snaprealm ? true : false);
          in->decode_snap_blob(m->snapbl);
          ceph_assert(in->snaprealm);
          if (!hadrealm)
            do_realm_invalidate_and_update_notify(in, CEPH_SNAP_OP_SPLIT, false);
        }

        // send caps to auth (if we're not already)
        if (in->is_any_caps() &&
            !in->state_test(CInode::STATE_EXPORTINGCAPS))
          migrator->export_caps(in);

        // the stray dentry is in use now; don't trim it below
        straydn = NULL;
      } else {
        // remote link: nothing moves to stray, just drop the linkage
        ceph_assert(!straydn);
        ceph_assert(dnl->is_remote());
        dn->dir->unlink_inode(dn);
      }
      ceph_assert(dnl->is_null());
    }
  }

  // race with trim_dentry()
  // (straydn is still set here only if it was decoded but not consumed above)
  if (straydn) {
    ceph_assert(straydn->get_num_ref() == 0);
    ceph_assert(straydn->get_linkage()->is_null());
    expiremap ex;
    trim_dentry(straydn, ex);
    send_expire_messages(ex);
  }
}






// ===================================================================



// ===================================================================
// FRAGMENT


/**
 * adjust_dir_fragments -- adjust fragmentation for a directory
 *
 * @param diri directory inode
 * @param basefrag base fragment
 * @param bits bit adjustment.  positive for split, negative for merge.
*
 * This overload collects the cached dirfrags under basefrag and forwards
 * them, with the remaining arguments unchanged, to the overload taking an
 * explicit srcfrags list.
 */
void MDCache::adjust_dir_fragments(CInode *diri, frag_t basefrag, int bits,
                                   std::vector<CDir*>* resultfrags,
                                   MDSContext::vec& waiters,
                                   bool replay)
{
  dout(10) << "adjust_dir_fragments " << basefrag << " " << bits
           << " on " << *diri << dendl;

  auto&& p = diri->get_dirfrags_under(basefrag);

  adjust_dir_fragments(diri, p.second, basefrag, bits, resultfrags, waiters, replay);
}

/**
 * force_dir_fragment -- make sure the cached dirfrags of diri include fg
 *
 * Returns the existing dirfrag if there is one. Otherwise it first tries to
 * split a cached ancestor of fg, and failing that combines whatever cached
 * frags lie under fg. May return null when neither is possible.
 * Collected waiters are queued unless replay is set.
 */
CDir *MDCache::force_dir_fragment(CInode *diri, frag_t fg, bool replay)
{
  CDir *dir = diri->get_dirfrag(fg);
  if (dir)
    return dir;

  dout(10) << "force_dir_fragment " << fg << " on " << *diri << dendl;

  std::vector<CDir*> src, result;
  MDSContext::vec waiters;

  // split a parent?
  // walk from the fragtree's branch/leaf for fg up towards the root frag
  frag_t parent = diri->dirfragtree.get_branch_or_leaf(fg);
  while (1) {
    CDir *pdir = diri->get_dirfrag(parent);
    if (pdir) {
      int split = fg.bits() - parent.bits();
      dout(10) << " splitting parent by " << split << " " << *pdir << dendl;
      src.push_back(pdir);
      adjust_dir_fragments(diri, src, parent, split, &result, waiters, replay);
      dir = diri->get_dirfrag(fg);
      if (dir) {
        dout(10) << "force_dir_fragment result " << *dir << dendl;
        break;
      }
    }
    if (parent == frag_t())
      break;
    frag_t last = parent;
    parent = parent.parent();
    dout(10) << " " << last << " parent is " << parent << dendl;
  }

  if (!dir) {
    // hoover up things under fg?
{
      auto&& p = diri->get_dirfrags_under(fg);
      src.insert(std::end(src), std::cbegin(p.second), std::cend(p.second));
    }
    if (src.empty()) {
      dout(10) << "force_dir_fragment no frags under " << fg << dendl;
    } else {
      dout(10) << " will combine frags under " << fg << ": " << src << dendl;
      // bits == 0 takes the merge path of adjust_dir_fragments
      adjust_dir_fragments(diri, src, fg, 0, &result, waiters, replay);
      dir = result.front();
      dout(10) << "force_dir_fragment result " << *dir << dendl;
    }
  }
  if (!replay)
    mds->queue_waiters(waiters);
  return dir;
}

/**
 * adjust_dir_fragments -- refragment the given source dirfrags of diri
 *
 * @param diri        directory inode
 * @param srcfrags    cached dirfrags to split (exactly one) or merge (one or
 *                    more); when empty only the fragtree is adjusted
 * @param basefrag    base fragment
 * @param bits        > 0 splits basefrag by that many bits; otherwise the
 *                    source frags are merged into basefrag
 * @param resultfrags receives the resulting dirfrag(s); dereferenced
 *                    unconditionally once srcfrags is non-empty
 * @param waiters     passed through to CDir::split() / CDir::merge()
 * @param replay      passed through to CDir::split() / CDir::merge()
 *
 * The subtree map is kept consistent when the affected frags are subtree
 * roots.
 */
void MDCache::adjust_dir_fragments(CInode *diri,
                                   const std::vector<CDir*>& srcfrags,
                                   frag_t basefrag, int bits,
                                   std::vector<CDir*>* resultfrags,
                                   MDSContext::vec& waiters,
                                   bool replay)
{
  dout(10) << "adjust_dir_fragments " << basefrag << " bits " << bits
           << " srcfrags " << srcfrags
           << " on " << *diri << dendl;

  // adjust fragtree
  // yuck.  we may have discovered the inode while it was being fragmented.
  if (!diri->dirfragtree.is_leaf(basefrag))
    diri->dirfragtree.force_to_leaf(g_ceph_context, basefrag);

  if (bits > 0)
    diri->dirfragtree.split(basefrag, bits);
  dout(10) << " new fragtree is " << diri->dirfragtree << dendl;

  // nothing cached to refragment; the fragtree update above is all we do
  if (srcfrags.empty())
    return;

  // split
  CDir *parent_dir = diri->get_parent_dir();
  CDir *parent_subtree = 0;
  if (parent_dir)
    parent_subtree = get_subtree_root(parent_dir);

  ceph_assert(srcfrags.size() >= 1);
  if (bits > 0) {
    // SPLIT
    ceph_assert(srcfrags.size() == 1);
    CDir *dir = srcfrags.front();

    dir->split(bits, resultfrags, waiters, replay);

    // did i change the subtree map?
    if (dir->is_subtree_root()) {
      // new frags are now separate subtrees
      for (const auto& dir : *resultfrags) {
        subtrees[dir].clear();   // new frag is now its own subtree
      }

      // was i a bound?
if (parent_subtree) {
        // replace the old frag with the new frags in the parent's bound set
        ceph_assert(subtrees[parent_subtree].count(dir));
        subtrees[parent_subtree].erase(dir);
        for (const auto& dir : *resultfrags) {
          ceph_assert(dir->is_subtree_root());
          subtrees[parent_subtree].insert(dir);
        }
      }

      // adjust my bounds.
      // each old bound is re-homed under whichever subtree root now contains it
      set<CDir*> bounds;
      bounds.swap(subtrees[dir]);
      subtrees.erase(dir);
      for (set<CDir*>::iterator p = bounds.begin();
           p != bounds.end();
           ++p) {
        CDir *frag = get_subtree_root((*p)->get_parent_dir());
        subtrees[frag].insert(*p);
      }

      show_subtrees(10);
    }

    diri->close_dirfrag(dir->get_frag());

  } else {
    // MERGE

    // are my constituent bits subtrees?  if so, i will be too.
    // (it's all or none, actually.)
    bool any_subtree = false, any_non_subtree = false;
    for (const auto& dir : srcfrags) {
      if (dir->is_subtree_root())
        any_subtree = true;
      else
        any_non_subtree = true;
    }
    ceph_assert(!any_subtree || !any_non_subtree);

    set<CDir*> new_bounds;
    if (any_subtree) {
      for (const auto& dir : srcfrags) {
        // this simplifies the code that find subtrees underneath the dirfrag
        if (!dir->is_subtree_root()) {
          dir->state_set(CDir::STATE_AUXSUBTREE);
          adjust_subtree_auth(dir, mds->get_nodeid());
        }
      }

      for (const auto& dir : srcfrags) {
        ceph_assert(dir->is_subtree_root());
        dout(10) << " taking srcfrag subtree bounds from " << *dir << dendl;
        map<CDir*, set<CDir*> >::iterator q = subtrees.find(dir);
        set<CDir*>::iterator r = q->second.begin();
        // move every bound into new_bounds; r++ advances before the erase
        while (r != subtrees[dir].end()) {
          new_bounds.insert(*r);
          subtrees[dir].erase(r++);
        }
        subtrees.erase(q);

        // remove myself as my parent's bound
        if (parent_subtree)
          subtrees[parent_subtree].erase(dir);
      }
    }

    // merge
    CDir *f = new CDir(diri, basefrag, this, srcfrags.front()->is_auth());
    f->merge(srcfrags, waiters, replay);

    if (any_subtree) {
      // the merged frag inherits the collected bounds
      ceph_assert(f->is_subtree_root());
      subtrees[f].swap(new_bounds);
      if (parent_subtree)
        subtrees[parent_subtree].insert(f);

      show_subtrees(10);
}

    resultfrags->push_back(f);
  }
}


// Completion context that calls MDCache::fragment_frozen() with the result
// code for the given request.
class C_MDC_FragmentFrozen : public MDSInternalContext {
  MDCache *mdcache;
  MDRequestRef mdr;
public:
  C_MDC_FragmentFrozen(MDCache *m, MDRequestRef& r) :
    MDSInternalContext(m->mds), mdcache(m), mdr(r) {}
  void finish(int r) override {
    mdcache->fragment_frozen(mdr, r);
  }
};

/**
 * can_fragment -- may the given dirfrags of diri be split/merged right now?
 *
 * Refuses when the FS is read-only, the cluster is degraded, diri lives in a
 * stray directory, or diri is an mdsdir or .ceph; and when any dirfrag is
 * being scrubbed, already fragmenting, non-auth, bad, frozen or freezing.
 * Every refusal is logged at level 7.
 */
bool MDCache::can_fragment(CInode *diri, const std::vector<CDir*>& dirs)
{
  if (is_readonly()) {
    dout(7) << "can_fragment: read-only FS, no fragmenting for now" << dendl;
    return false;
  }
  if (mds->is_cluster_degraded()) {
    dout(7) << "can_fragment: cluster degraded, no fragmenting for now" << dendl;
    return false;
  }
  if (diri->get_parent_dir() &&
      diri->get_parent_dir()->get_inode()->is_stray()) {
    dout(7) << "can_fragment: i won't merge|split anything in stray" << dendl;
    return false;
  }
  if (diri->is_mdsdir() || diri->ino() == CEPH_INO_CEPH) {
    dout(7) << "can_fragment: i won't fragment mdsdir or .ceph" << dendl;
    return false;
  }

  for (const auto& dir : dirs) {
    if (dir->scrub_is_in_progress()) {
      dout(7) << "can_fragment: scrub in progress " << *dir << dendl;
      return false;
    }

    if (dir->state_test(CDir::STATE_FRAGMENTING)) {
      dout(7) << "can_fragment: already fragmenting " << *dir << dendl;
      return false;
    }
    if (!dir->is_auth()) {
      dout(7) << "can_fragment: not auth on " << *dir << dendl;
      return false;
    }
    if (dir->is_bad()) {
      dout(7) << "can_fragment: bad dirfrag " << *dir << dendl;
      return false;
    }
    if (dir->is_frozen() ||
        dir->is_freezing()) {
      dout(7) << "can_fragment: can't merge, freezing|frozen.  wait for other exports to finish first." << dendl;
      return false;
    }
  }

  return true;
}

/**
 * split_dir -- start splitting an auth dirfrag by the given number of bits
 *
 * Silently drops the request when can_fragment() refuses or the resulting
 * frag would exceed 24 bits. Otherwise registers a fragment_info_t keyed by
 * the dirfrag, freezes the dirfrag and runs the first mark+complete pass.
 */
void MDCache::split_dir(CDir *dir, int bits)
{
  dout(7) << __func__ << " " << *dir << " bits " << bits << dendl;
  ceph_assert(dir->is_auth());
  CInode *diri = dir->inode;

  std::vector<CDir*> dirs;
  dirs.push_back(dir);

  if (!can_fragment(diri, dirs)) {
    dout(7) << __func__ << " cannot fragment right now, dropping" << dendl;
    return;
  }

  if (dir->frag.bits() + bits > 24) {
    dout(7) << __func__ << " frag bits > 24, dropping" << dendl;
    return;
  }

  MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_FRAGMENTDIR);
  mdr->more()->fragment_base = dir->dirfrag();

  // only one fragment operation may be registered per base dirfrag
  ceph_assert(fragments.count(dir->dirfrag()) == 0);
  fragment_info_t& info = fragments[dir->dirfrag()];
  info.mdr = mdr;
  info.dirs.push_back(dir);
  info.bits = bits;
  info.last_cum_auth_pins_change = ceph_clock_now();

  fragment_freeze_dirs(dirs);
  // initial mark+complete pass
  fragment_mark_and_complete(mdr);
}

void MDCache::merge_dir(CInode *diri, frag_t frag)
{
  dout(7) << "merge_dir to " << frag << " on " << *diri << dendl;

  auto&& [all, dirs] = diri->get_dirfrags_under(frag);
  if (!all) {
    dout(7) << "don't have all frags under " << frag << " for " << *diri << dendl;
    return;
  }

  if (diri->dirfragtree.is_leaf(frag)) {
    dout(10) << " " << frag << " already a leaf for " << *diri << dendl;
    return;
  }

  if (!can_fragment(diri, dirs))
    return;

  CDir *first = dirs.front();
  int bits = first->get_frag().bits() - frag.bits();
  dout(10) << " we are merging by " << bits << " bits" << dendl;

  dirfrag_t basedirfrag(diri->ino(), frag);
  MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_FRAGMENTDIR);
  mdr->more()->fragment_base = basedirfrag;

  ceph_assert(fragments.count(basedirfrag) == 0);
  fragment_info_t& info = fragments[basedirfrag];
  info.mdr = mdr;
  info.dirs = dirs;
  info.bits = -bits;
  info.last_cum_auth_pins_change = ceph_clock_now();

  fragment_freeze_dirs(dirs);
  //
// initial mark+complete pass
  fragment_mark_and_complete(mdr);
}

/**
 * fragment_freeze_dirs -- auth-pin, mark FRAGMENTING and start freezing dirs
 *
 * If the set mixes subtree roots and non-roots, the non-roots are turned
 * into auxiliary subtrees owned by this rank so that all of them are
 * subtree roots.
 */
void MDCache::fragment_freeze_dirs(const std::vector<CDir*>& dirs)
{
  bool any_subtree = false, any_non_subtree = false;
  for (const auto& dir : dirs) {
    dir->auth_pin(dir);  // until we mark and complete them
    dir->state_set(CDir::STATE_FRAGMENTING);
    dir->freeze_dir();
    // the auth pin taken above keeps the dir freezing rather than frozen
    ceph_assert(dir->is_freezing_dir());

    if (dir->is_subtree_root())
      any_subtree = true;
    else
      any_non_subtree = true;
  }

  if (any_subtree && any_non_subtree) {
    // either all dirfrags are subtree roots or all are not.
    for (const auto& dir : dirs) {
      if (dir->is_subtree_root()) {
        ceph_assert(dir->state_test(CDir::STATE_AUXSUBTREE));
      } else {
        dir->state_set(CDir::STATE_AUXSUBTREE);
        adjust_subtree_auth(dir, mds->get_nodeid());
      }
    }
  }
}

// Completion context that re-runs MDCache::fragment_mark_and_complete() for
// the request; the result code is ignored.
class C_MDC_FragmentMarking : public MDCacheContext {
  MDRequestRef mdr;
public:
  C_MDC_FragmentMarking(MDCache *m, MDRequestRef& r) : MDCacheContext(m), mdr(r) {}
  void finish(int r) override {
    mdcache->fragment_mark_and_complete(mdr);
  }
};

void MDCache::fragment_mark_and_complete(MDRequestRef& mdr)
{
  dirfrag_t basedirfrag = mdr->more()->fragment_base;
  map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
  // the entry may be gone, or belong to a newer request, if this one was aborted
  if (it == fragments.end() || it->second.mdr != mdr) {
    dout(7) << "fragment_mark_and_complete " << basedirfrag << " must have aborted" << dendl;
    request_finish(mdr);
    return;
  }

  fragment_info_t& info = it->second;
  CInode *diri = info.dirs.front()->get_inode();
  dout(10) << "fragment_mark_and_complete " << info.dirs << " on " << *diri << dendl;

  MDSGatherBuilder gather(g_ceph_context);

  for (const auto& dir : info.dirs) {
    bool ready = true;
    if (!dir->is_complete()) {
      dout(15) << " fetching incomplete " << *dir << dendl;
      dir->fetch(gather.new_sub(), true);  // ignore authpinnability
      ready = false;
    } else if (dir->get_frag() == frag_t()) {
      // The COMPLETE flag gets lost if we fragment a new
// dirfrag, then rollback
      // the operation. To avoid CDir::fetch() complaining about missing object,
      // we commit new dirfrag first.
      if (dir->state_test(CDir::STATE_CREATING)) {
        dout(15) << " waiting until new dir gets journaled " << *dir << dendl;
        dir->add_waiter(CDir::WAIT_CREATED, gather.new_sub());
        ready = false;
      } else if (dir->is_new()) {
        dout(15) << " committing new " << *dir << dendl;
        ceph_assert(dir->is_dirty());
        dir->commit(0, gather.new_sub(), true);
        ready = false;
      }
    }
    if (!ready)
      continue;

    if (!dir->state_test(CDir::STATE_DNPINNEDFRAG)) {
      dout(15) << " marking " << *dir << dendl;
      // pin and flag every dentry, then trade the dir's auth pin (taken in
      // fragment_freeze_dirs) for the DNPINNEDFRAG state
      for (auto &p : dir->items) {
        CDentry *dn = p.second;
        dn->get(CDentry::PIN_FRAGMENTING);
        ceph_assert(!dn->state_test(CDentry::STATE_FRAGMENTING));
        dn->state_set(CDentry::STATE_FRAGMENTING);
      }
      dir->state_set(CDir::STATE_DNPINNEDFRAG);
      dir->auth_unpin(dir);
    } else {
      dout(15) << " already marked " << *dir << dendl;
    }
  }
  // some dirs still need a fetch/commit/journal: retry this pass afterwards
  if (gather.has_subs()) {
    gather.set_finisher(new C_MDC_FragmentMarking(this, mdr));
    gather.activate();
    return;
  }

  // all dirs are marked; now wait for any that are still freezing
  for (const auto& dir : info.dirs) {
    if (!dir->is_frozen_dir()) {
      ceph_assert(dir->is_freezing_dir());
      dir->add_waiter(CDir::WAIT_FROZEN, gather.new_sub());
    }
  }
  if (gather.has_subs()) {
    gather.set_finisher(new C_MDC_FragmentFrozen(this, mdr));
    gather.activate();
    // flush log so that request auth_pins are retired
    mds->mdlog->flush();
    return;
  }

  fragment_frozen(mdr, 0);
}

void MDCache::fragment_unmark_unfreeze_dirs(const std::vector<CDir*>& dirs)
{
  dout(10) << "fragment_unmark_unfreeze_dirs " << dirs << dendl;
  for (const auto& dir : dirs) {
    dout(10) << " frag " << *dir << dendl;

    ceph_assert(dir->state_test(CDir::STATE_FRAGMENTING));
    dir->state_clear(CDir::STATE_FRAGMENTING);

    if (dir->state_test(CDir::STATE_DNPINNEDFRAG)) {
      dir->state_clear(CDir::STATE_DNPINNEDFRAG);

      for (auto &p : dir->items) {
        CDentry *dn = p.second;
ceph_assert(dn->state_test(CDentry::STATE_FRAGMENTING)); + dn->state_clear(CDentry::STATE_FRAGMENTING); + dn->put(CDentry::PIN_FRAGMENTING); + } + } else { + dir->auth_unpin(dir); + } + + dir->unfreeze_dir(); + } +} + +bool MDCache::fragment_are_all_frozen(CDir *dir) +{ + ceph_assert(dir->is_frozen_dir()); + map<dirfrag_t,fragment_info_t>::iterator p; + for (p = fragments.lower_bound(dirfrag_t(dir->ino(), 0)); + p != fragments.end() && p->first.ino == dir->ino(); + ++p) { + if (p->first.frag.contains(dir->get_frag())) + return p->second.all_frozen; + } + ceph_abort(); + return false; +} + +void MDCache::fragment_freeze_inc_num_waiters(CDir *dir) +{ + map<dirfrag_t,fragment_info_t>::iterator p; + for (p = fragments.lower_bound(dirfrag_t(dir->ino(), 0)); + p != fragments.end() && p->first.ino == dir->ino(); + ++p) { + if (p->first.frag.contains(dir->get_frag())) { + p->second.num_remote_waiters++; + return; + } + } + ceph_abort(); +} + +void MDCache::find_stale_fragment_freeze() +{ + dout(10) << "find_stale_fragment_freeze" << dendl; + // see comment in Migrator::find_stale_export_freeze() + utime_t now = ceph_clock_now(); + utime_t cutoff = now; + cutoff -= g_conf()->mds_freeze_tree_timeout; + + for (map<dirfrag_t,fragment_info_t>::iterator p = fragments.begin(); + p != fragments.end(); ) { + dirfrag_t df = p->first; + fragment_info_t& info = p->second; + ++p; + if (info.all_frozen) + continue; + CDir *dir; + int total_auth_pins = 0; + for (const auto& d : info.dirs) { + dir = d; + if (!dir->state_test(CDir::STATE_DNPINNEDFRAG)) { + total_auth_pins = -1; + break; + } + if (dir->is_frozen_dir()) + continue; + total_auth_pins += dir->get_auth_pins() + dir->get_dir_auth_pins(); + } + if (total_auth_pins < 0) + continue; + if (info.last_cum_auth_pins != total_auth_pins) { + info.last_cum_auth_pins = total_auth_pins; + info.last_cum_auth_pins_change = now; + continue; + } + if (info.last_cum_auth_pins_change >= cutoff) + continue; + dir = info.dirs.front(); + if 
(info.num_remote_waiters > 0 ||
        (!dir->inode->is_root() && dir->get_parent_dir()->is_freezing())) {
      dout(10) << " cancel fragmenting " << df << " bit " << info.bits << dendl;
      // take the dir list out of info before erasing the map entry
      std::vector<CDir*> dirs;
      info.dirs.swap(dirs);
      fragments.erase(df);
      fragment_unmark_unfreeze_dirs(dirs);
    }
  }
}

// Log completion context: calls MDCache::_fragment_logged() for the request.
class C_MDC_FragmentPrep : public MDCacheLogContext {
  MDRequestRef mdr;
public:
  C_MDC_FragmentPrep(MDCache *m, MDRequestRef& r) : MDCacheLogContext(m), mdr(r) {}
  void finish(int r) override {
    mdcache->_fragment_logged(mdr);
  }
};

// Completion context: calls MDCache::_fragment_stored() for the request.
class C_MDC_FragmentStore : public MDCacheContext {
  MDRequestRef mdr;
public:
  C_MDC_FragmentStore(MDCache *m, MDRequestRef& r) : MDCacheContext(m), mdr(r) {}
  void finish(int r) override {
    mdcache->_fragment_stored(mdr);
  }
};

// Log completion context: calls MDCache::_fragment_committed() with the base
// dirfrag and the request.
class C_MDC_FragmentCommit : public MDCacheLogContext {
  dirfrag_t basedirfrag;
  MDRequestRef mdr;
public:
  C_MDC_FragmentCommit(MDCache *m, dirfrag_t df, const MDRequestRef& r) :
    MDCacheLogContext(m), basedirfrag(df), mdr(r) {}
  void finish(int r) override {
    mdcache->_fragment_committed(basedirfrag, mdr);
  }
};

// I/O completion context: calls MDCache::_fragment_old_purged() once the old
// dirfrag objects have been handled. Only success and ENOENT are accepted.
class C_IO_MDC_FragmentPurgeOld : public MDCacheIOContext {
  dirfrag_t basedirfrag;
  int bits;
  MDRequestRef mdr;
public:
  C_IO_MDC_FragmentPurgeOld(MDCache *m, dirfrag_t f, int b,
                            const MDRequestRef& r) :
    MDCacheIOContext(m), basedirfrag(f), bits(b), mdr(r) {}
  void finish(int r) override {
    ceph_assert(r == 0 || r == -CEPHFS_ENOENT);
    mdcache->_fragment_old_purged(basedirfrag, bits, mdr);
  }
  void print(ostream& out) const override {
    out << "fragment_purge_old(" << basedirfrag << ")";
  }
};

void MDCache::fragment_frozen(MDRequestRef& mdr, int r)
{
  dirfrag_t basedirfrag = mdr->more()->fragment_base;
  map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
  if (it == fragments.end() || it->second.mdr != mdr) {
    dout(7) << "fragment_frozen " << basedirfrag << " must have aborted" << dendl;
    request_finish(mdr);
    return;
} + + ceph_assert(r == 0); + fragment_info_t& info = it->second; + dout(10) << "fragment_frozen " << basedirfrag.frag << " by " << info.bits + << " on " << info.dirs.front()->get_inode() << dendl; + + info.all_frozen = true; + dispatch_fragment_dir(mdr); +} + +void MDCache::dispatch_fragment_dir(MDRequestRef& mdr) +{ + dirfrag_t basedirfrag = mdr->more()->fragment_base; + map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag); + if (it == fragments.end() || it->second.mdr != mdr) { + dout(7) << "dispatch_fragment_dir " << basedirfrag << " must have aborted" << dendl; + request_finish(mdr); + return; + } + + fragment_info_t& info = it->second; + CInode *diri = info.dirs.front()->get_inode(); + + dout(10) << "dispatch_fragment_dir " << basedirfrag << " bits " << info.bits + << " on " << *diri << dendl; + + if (mdr->more()->peer_error) + mdr->aborted = true; + + if (!mdr->aborted) { + MutationImpl::LockOpVec lov; + lov.add_wrlock(&diri->dirfragtreelock); + // prevent a racing gather on any other scatterlocks too + lov.lock_scatter_gather(&diri->nestlock); + lov.lock_scatter_gather(&diri->filelock); + if (!mds->locker->acquire_locks(mdr, lov, NULL, true)) { + if (!mdr->aborted) + return; + } + } + + if (mdr->aborted) { + dout(10) << " can't auth_pin " << *diri << ", requeuing dir " + << info.dirs.front()->dirfrag() << dendl; + if (info.bits > 0) + mds->balancer->queue_split(info.dirs.front(), false); + else + mds->balancer->queue_merge(info.dirs.front()); + fragment_unmark_unfreeze_dirs(info.dirs); + fragments.erase(it); + request_finish(mdr); + return; + } + + mdr->ls = mds->mdlog->get_current_segment(); + EFragment *le = new EFragment(mds->mdlog, EFragment::OP_PREPARE, basedirfrag, info.bits); + mds->mdlog->start_entry(le); + + for (const auto& dir : info.dirs) { + dirfrag_rollback rollback; + rollback.fnode = dir->fnode; + le->add_orig_frag(dir->get_frag(), &rollback); + } + + // refragment + MDSContext::vec waiters; + adjust_dir_fragments(diri, 
info.dirs, basedirfrag.frag, info.bits,
                       &info.resultfrags, waiters, false);
  if (g_conf()->mds_debug_frag)
    diri->verify_dirfrags();
  mds->queue_waiters(waiters);

  // none of the original frags may still be a leaf of the fragtree
  for (const auto& fg : le->orig_frags)
    ceph_assert(!diri->dirfragtree.is_leaf(fg));

  le->metablob.add_dir_context(info.resultfrags.front());
  for (const auto& dir : info.resultfrags) {
    if (diri->is_auth()) {
      le->metablob.add_fragmented_dir(dir, false, false);
    } else {
      // not auth for the inode: mark the new frag STATE_DIRTYDFT and pass
      // true as the last argument
      dir->state_set(CDir::STATE_DIRTYDFT);
      le->metablob.add_fragmented_dir(dir, false, true);
    }
  }

  // dft lock
  if (diri->is_auth()) {
    // journal dirfragtree
    auto pi = diri->project_inode(mdr);
    pi.inode->version = diri->pre_dirty();
    predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY);
    journal_dirty_inode(mdr.get(), &le->metablob, diri);
  } else {
    mds->locker->mark_updated_scatterlock(&diri->dirfragtreelock);
    mdr->ls->dirty_dirfrag_dirfragtree.push_back(&diri->item_dirty_dirfrag_dirfragtree);
    mdr->add_updated_lock(&diri->dirfragtreelock);
  }

  /*
  // filelock
  mds->locker->mark_updated_scatterlock(&diri->filelock);
  mut->ls->dirty_dirfrag_dir.push_back(&diri->item_dirty_dirfrag_dir);
  mut->add_updated_lock(&diri->filelock);

  // dirlock
  mds->locker->mark_updated_scatterlock(&diri->nestlock);
  mut->ls->dirty_dirfrag_nest.push_back(&diri->item_dirty_dirfrag_nest);
  mut->add_updated_lock(&diri->nestlock);
  */

  add_uncommitted_fragment(basedirfrag, info.bits, le->orig_frags, mdr->ls);
  mds->server->submit_mdlog_entry(le, new C_MDC_FragmentPrep(this, mdr),
                                  mdr, __func__);
  mds->mdlog->flush();
}

void MDCache::_fragment_logged(MDRequestRef& mdr)
{
  dirfrag_t basedirfrag = mdr->more()->fragment_base;
  // at() throws if the fragment entry is missing
  auto& info = fragments.at(basedirfrag);
  CInode *diri = info.resultfrags.front()->get_inode();

  dout(10) << "fragment_logged " << basedirfrag << " bits " << info.bits
           << " on " << *diri << dendl;
  mdr->mark_event("prepare logged");

  mdr->apply();  //
// mark scatterlock

  // store resulting frags
  // C_MDC_FragmentStore runs once every commit below has completed
  MDSGatherBuilder gather(g_ceph_context, new C_MDC_FragmentStore(this, mdr));

  for (const auto& dir : info.resultfrags) {
    dout(10) << " storing result frag " << *dir << dendl;

    dir->mark_dirty(mdr->ls);
    dir->mark_new(mdr->ls);

    // freeze and store them too
    // (this auth pin is released in fragment_maybe_finish)
    dir->auth_pin(this);
    dir->state_set(CDir::STATE_FRAGMENTING);
    dir->commit(0, gather.new_sub(), true);  // ignore authpinnability
  }

  gather.activate();
}

void MDCache::_fragment_stored(MDRequestRef& mdr)
{
  dirfrag_t basedirfrag = mdr->more()->fragment_base;
  fragment_info_t &info = fragments.at(basedirfrag);
  CDir *first = info.resultfrags.front();
  CInode *diri = first->get_inode();

  dout(10) << "fragment_stored " << basedirfrag << " bits " << info.bits
           << " on " << *diri << dendl;
  mdr->mark_event("new frags stored");

  // tell peers
  // diri_auth is only meaningful when the frag is a subtree root whose inode
  // is owned by another rank; otherwise it stays CDIR_AUTH_UNKNOWN
  mds_rank_t diri_auth = (first->is_subtree_root() && !diri->is_auth()) ?
                         diri->authority().first : CDIR_AUTH_UNKNOWN;
  for (const auto &p : first->get_replicas()) {
    if (mds->mdsmap->get_state(p.first) < MDSMap::STATE_REJOIN ||
        (mds->mdsmap->get_state(p.first) == MDSMap::STATE_REJOIN &&
         rejoin_gather.count(p.first)))
      continue;

    auto notify = make_message<MMDSFragmentNotify>(basedirfrag, info.bits, mdr->reqid.tid);
    if (diri_auth != CDIR_AUTH_UNKNOWN && // subtree root
        diri_auth != p.first) { // not auth mds of diri
      /*
       * In the normal case, mds does not trim dir inode whose child dirfrags
       * are likely being fragmented (see trim_inode()). But when fragmenting
       * subtree roots, following race can happen:
       *
       * - mds.a (auth mds of dirfrag) sends fragment_notify message to
       *   mds.c and drops wrlock on dirfragtreelock.
* - mds.b (auth mds of dir inode) changes dirfragtreelock state to
       *   SYNC and send lock message mds.c
       * - mds.c receives the lock message and changes dirfragtreelock state
       *   to SYNC
       * - mds.c trim dirfrag and dir inode from its cache
       * - mds.c receives the fragment_notify message
       *
       * So we need to ensure replicas have received the notify, then unlock
       * the dirfragtreelock.
       */
      notify->mark_ack_wanted();
      info.notify_ack_waiting.insert(p.first);
    }

    // freshly replicate new dirs to peers
    for (const auto& dir : info.resultfrags) {
      encode_replica_dir(dir, p.first, notify->basebl);
    }

    mds->send_message_mds(notify, p.first);
  }

  // journal commit
  EFragment *le = new EFragment(mds->mdlog, EFragment::OP_COMMIT, basedirfrag, info.bits);
  mds->mdlog->start_submit_entry(le, new C_MDC_FragmentCommit(this, basedirfrag, mdr));


  // unfreeze resulting frags
  for (const auto& dir : info.resultfrags) {
    dout(10) << " result frag " << *dir << dendl;

    // drop the FRAGMENTING state and pin from every dentry in the result frag
    for (auto &p : dir->items) {
      CDentry *dn = p.second;
      ceph_assert(dn->state_test(CDentry::STATE_FRAGMENTING));
      dn->state_clear(CDentry::STATE_FRAGMENTING);
      dn->put(CDentry::PIN_FRAGMENTING);
    }

    // unfreeze
    dir->unfreeze_dir();
  }

  // with acks outstanding only drop_locks_for_fragment_unfreeze() is called
  // here; fragment_drop_locks() runs once the last ack arrives
  if (info.notify_ack_waiting.empty()) {
    fragment_drop_locks(info);
  } else {
    mds->locker->drop_locks_for_fragment_unfreeze(mdr.get());
  }
}

void MDCache::_fragment_committed(dirfrag_t basedirfrag, const MDRequestRef& mdr)
{
  dout(10) << "fragment_committed " << basedirfrag << dendl;
  // mdr may be null, hence the check
  if (mdr)
    mdr->mark_event("commit logged");

  ufragment &uf = uncommitted_fragments.at(basedirfrag);

  // remove old frags
  C_GatherBuilder gather(
    g_ceph_context,
    new C_OnFinisher(
      new C_IO_MDC_FragmentPurgeOld(this, basedirfrag, uf.bits, mdr),
      mds->finisher));

  SnapContext nullsnapc;
  object_locator_t oloc(mds->get_metadata_pool());
  for (const auto& fg : uf.old_frags) {
    object_t oid = CInode::get_object_name(basedirfrag.ino,
fg, ""); + ObjectOperation op; + if (fg == frag_t()) { + // backtrace object + dout(10) << " truncate orphan dirfrag " << oid << dendl; + op.truncate(0); + op.omap_clear(); + } else { + dout(10) << " removing orphan dirfrag " << oid << dendl; + op.remove(); + } + mds->objecter->mutate(oid, oloc, op, nullsnapc, + ceph::real_clock::now(), + 0, gather.new_sub()); + } + + ceph_assert(gather.has_subs()); + gather.activate(); +} + +void MDCache::_fragment_old_purged(dirfrag_t basedirfrag, int bits, const MDRequestRef& mdr) +{ + dout(10) << "fragment_old_purged " << basedirfrag << dendl; + if (mdr) + mdr->mark_event("old frags purged"); + + EFragment *le = new EFragment(mds->mdlog, EFragment::OP_FINISH, basedirfrag, bits); + mds->mdlog->start_submit_entry(le); + + finish_uncommitted_fragment(basedirfrag, EFragment::OP_FINISH); + + if (mds->logger) { + if (bits > 0) { + mds->logger->inc(l_mds_dir_split); + } else { + mds->logger->inc(l_mds_dir_merge); + } + } + + if (mdr) { + auto it = fragments.find(basedirfrag); + ceph_assert(it != fragments.end()); + it->second.finishing = true; + if (it->second.notify_ack_waiting.empty()) + fragment_maybe_finish(it); + else + mdr->mark_event("wating for notify acks"); + } +} + +void MDCache::fragment_drop_locks(fragment_info_t& info) +{ + mds->locker->drop_locks(info.mdr.get()); + request_finish(info.mdr); + //info.mdr.reset(); +} + +void MDCache::fragment_maybe_finish(const fragment_info_iterator& it) +{ + if (!it->second.finishing) + return; + + // unmark & auth_unpin + for (const auto &dir : it->second.resultfrags) { + dir->state_clear(CDir::STATE_FRAGMENTING); + dir->auth_unpin(this); + + // In case the resulting fragments are beyond the split size, + // we might need to split them again right away (they could + // have been taking inserts between unfreezing and getting + // here) + mds->balancer->maybe_fragment(dir, false); + } + + fragments.erase(it); +} + + +void MDCache::handle_fragment_notify_ack(const 
cref_t<MMDSFragmentNotifyAck> &ack) +{ + dout(10) << "handle_fragment_notify_ack " << *ack << " from " << ack->get_source() << dendl; + mds_rank_t from = mds_rank_t(ack->get_source().num()); + + if (mds->get_state() < MDSMap::STATE_ACTIVE) { + return; + } + + auto it = fragments.find(ack->get_base_dirfrag()); + if (it == fragments.end() || + it->second.get_tid() != ack->get_tid()) { + dout(10) << "handle_fragment_notify_ack obsolete message, dropping" << dendl; + return; + } + + if (it->second.notify_ack_waiting.erase(from) && + it->second.notify_ack_waiting.empty()) { + fragment_drop_locks(it->second); + fragment_maybe_finish(it); + } +} + +void MDCache::handle_fragment_notify(const cref_t<MMDSFragmentNotify> ¬ify) +{ + dout(10) << "handle_fragment_notify " << *notify << " from " << notify->get_source() << dendl; + mds_rank_t from = mds_rank_t(notify->get_source().num()); + + if (mds->get_state() < MDSMap::STATE_REJOIN) { + return; + } + + CInode *diri = get_inode(notify->get_ino()); + if (diri) { + frag_t base = notify->get_basefrag(); + int bits = notify->get_bits(); + +/* + if ((bits < 0 && diri->dirfragtree.is_leaf(base)) || + (bits > 0 && !diri->dirfragtree.is_leaf(base))) { + dout(10) << " dft " << diri->dirfragtree << " state doesn't match " << base << " by " << bits + << ", must have found out during resolve/rejoin? ignoring. 
" << *diri << dendl; + return; + } +*/ + + // refragment + MDSContext::vec waiters; + std::vector<CDir*> resultfrags; + adjust_dir_fragments(diri, base, bits, &resultfrags, waiters, false); + if (g_conf()->mds_debug_frag) + diri->verify_dirfrags(); + + for (const auto& dir : resultfrags) { + diri->take_dir_waiting(dir->get_frag(), waiters); + } + + // add new replica dirs values + auto p = notify->basebl.cbegin(); + while (!p.end()) { + CDir *tmp_dir = nullptr; + decode_replica_dir(tmp_dir, p, diri, from, waiters); + } + + mds->queue_waiters(waiters); + } else { + ceph_abort(); + } + + if (notify->is_ack_wanted()) { + auto ack = make_message<MMDSFragmentNotifyAck>(notify->get_base_dirfrag(), + notify->get_bits(), notify->get_tid()); + mds->send_message_mds(ack, from); + } +} + +void MDCache::add_uncommitted_fragment(dirfrag_t basedirfrag, int bits, const frag_vec_t& old_frags, + LogSegment *ls, bufferlist *rollback) +{ + dout(10) << "add_uncommitted_fragment: base dirfrag " << basedirfrag << " bits " << bits << dendl; + ceph_assert(!uncommitted_fragments.count(basedirfrag)); + ufragment& uf = uncommitted_fragments[basedirfrag]; + uf.old_frags = old_frags; + uf.bits = bits; + uf.ls = ls; + ls->uncommitted_fragments.insert(basedirfrag); + if (rollback) + uf.rollback.swap(*rollback); +} + +void MDCache::finish_uncommitted_fragment(dirfrag_t basedirfrag, int op) +{ + dout(10) << "finish_uncommitted_fragments: base dirfrag " << basedirfrag + << " op " << EFragment::op_name(op) << dendl; + map<dirfrag_t, ufragment>::iterator it = uncommitted_fragments.find(basedirfrag); + if (it != uncommitted_fragments.end()) { + ufragment& uf = it->second; + if (op != EFragment::OP_FINISH && !uf.old_frags.empty()) { + uf.committed = true; + } else { + uf.ls->uncommitted_fragments.erase(basedirfrag); + mds->queue_waiters(uf.waiters); + uncommitted_fragments.erase(it); + } + } +} + +void MDCache::rollback_uncommitted_fragment(dirfrag_t basedirfrag, frag_vec_t&& old_frags) +{ + dout(10) 
<< "rollback_uncommitted_fragment: base dirfrag " << basedirfrag + << " old_frags (" << old_frags << ")" << dendl; + map<dirfrag_t, ufragment>::iterator it = uncommitted_fragments.find(basedirfrag); + if (it != uncommitted_fragments.end()) { + ufragment& uf = it->second; + if (!uf.old_frags.empty()) { + uf.old_frags = std::move(old_frags); + uf.committed = true; + } else { + uf.ls->uncommitted_fragments.erase(basedirfrag); + uncommitted_fragments.erase(it); + } + } +} + +void MDCache::wait_for_uncommitted_fragments(MDSContext* finisher) +{ + MDSGatherBuilder gather(g_ceph_context, finisher); + for (auto& p : uncommitted_fragments) { + p.second.waiters.push_back(gather.new_sub()); + } + gather.activate(); +} + +struct C_MDC_FragmentRollback : public MDCacheLogContext { + MutationRef mut; + C_MDC_FragmentRollback(MDCache *c, MutationRef& m) : + MDCacheLogContext(c), mut(m) {} + void finish(int r) override { + mut->apply(); + get_mds()->locker->drop_locks(mut.get()); + mut->cleanup(); + } +}; + +void MDCache::rollback_uncommitted_fragments() +{ + dout(10) << "rollback_uncommitted_fragments: " << uncommitted_fragments.size() << " pending" << dendl; + for (map<dirfrag_t, ufragment>::iterator p = uncommitted_fragments.begin(); + p != uncommitted_fragments.end(); + ++p) { + ufragment &uf = p->second; + CInode *diri = get_inode(p->first.ino); + ceph_assert(diri); + + if (uf.committed) { + _fragment_committed(p->first, MDRequestRef()); + continue; + } + + dout(10) << " rolling back " << p->first << " refragment by " << uf.bits << " bits" << dendl; + + MutationRef mut(new MutationImpl()); + mut->ls = mds->mdlog->get_current_segment(); + EFragment *le = new EFragment(mds->mdlog, EFragment::OP_ROLLBACK, p->first, uf.bits); + mds->mdlog->start_entry(le); + bool diri_auth = (diri->authority() != CDIR_AUTH_UNDEF); + + frag_vec_t old_frags; + diri->dirfragtree.get_leaves_under(p->first.frag, old_frags); + + std::vector<CDir*> resultfrags; + if (uf.old_frags.empty()) { + // created 
by old format EFragment + MDSContext::vec waiters; + adjust_dir_fragments(diri, p->first.frag, -uf.bits, &resultfrags, waiters, true); + } else { + auto bp = uf.rollback.cbegin(); + for (const auto& fg : uf.old_frags) { + CDir *dir = force_dir_fragment(diri, fg); + resultfrags.push_back(dir); + + dirfrag_rollback rollback; + decode(rollback, bp); + + dir->fnode = rollback.fnode; + + dir->mark_dirty(mut->ls); + + if (!(dir->get_fnode()->rstat == dir->get_fnode()->accounted_rstat)) { + dout(10) << " dirty nestinfo on " << *dir << dendl; + mds->locker->mark_updated_scatterlock(&diri->nestlock); + mut->ls->dirty_dirfrag_nest.push_back(&diri->item_dirty_dirfrag_nest); + mut->add_updated_lock(&diri->nestlock); + } + if (!(dir->get_fnode()->fragstat == dir->get_fnode()->accounted_fragstat)) { + dout(10) << " dirty fragstat on " << *dir << dendl; + mds->locker->mark_updated_scatterlock(&diri->filelock); + mut->ls->dirty_dirfrag_dir.push_back(&diri->item_dirty_dirfrag_dir); + mut->add_updated_lock(&diri->filelock); + } + + le->add_orig_frag(dir->get_frag()); + le->metablob.add_dir_context(dir); + if (diri_auth) { + le->metablob.add_fragmented_dir(dir, true, false); + } else { + dout(10) << " dirty dirfragtree on " << *dir << dendl; + dir->state_set(CDir::STATE_DIRTYDFT); + le->metablob.add_fragmented_dir(dir, true, true); + } + } + } + + if (diri_auth) { + auto pi = diri->project_inode(mut); + pi.inode->version = diri->pre_dirty(); + predirty_journal_parents(mut, &le->metablob, diri, 0, PREDIRTY_PRIMARY); + le->metablob.add_primary_dentry(diri->get_projected_parent_dn(), diri, true); + } else { + mds->locker->mark_updated_scatterlock(&diri->dirfragtreelock); + mut->ls->dirty_dirfrag_dirfragtree.push_back(&diri->item_dirty_dirfrag_dirfragtree); + mut->add_updated_lock(&diri->dirfragtreelock); + } + + if (g_conf()->mds_debug_frag) + diri->verify_dirfrags(); + + for (const auto& leaf : old_frags) { + ceph_assert(!diri->dirfragtree.is_leaf(leaf)); + } + + 
mds->mdlog->submit_entry(le, new C_MDC_FragmentRollback(this, mut)); + + uf.old_frags.swap(old_frags); + _fragment_committed(p->first, MDRequestRef()); + } +} + +void MDCache::force_readonly() +{ + if (is_readonly()) + return; + + dout(1) << "force file system read-only" << dendl; + mds->clog->warn() << "force file system read-only"; + + set_readonly(); + + mds->server->force_clients_readonly(); + + // revoke write caps + int count = 0; + for (auto &p : inode_map) { + CInode *in = p.second; + if (in->is_head()) + mds->locker->eval(in, CEPH_CAP_LOCKS); + if (!(++count % mds->heartbeat_reset_grace())) + mds->heartbeat_reset(); + } + + mds->mdlog->flush(); +} + + +// ============================================================== +// debug crap + +void MDCache::show_subtrees(int dbl, bool force_print) +{ + if (g_conf()->mds_thrash_exports) + dbl += 15; + + //dout(10) << "show_subtrees" << dendl; + + if (!g_conf()->subsys.should_gather(ceph_subsys_mds, dbl)) + return; // i won't print anything. 
+ + if (subtrees.empty()) { + dout(ceph::dout::need_dynamic(dbl)) << "show_subtrees - no subtrees" + << dendl; + return; + } + + if (!force_print && subtrees.size() > SUBTREES_COUNT_THRESHOLD && + !g_conf()->subsys.should_gather<ceph_subsys_mds, 25>()) { + dout(ceph::dout::need_dynamic(dbl)) << "number of subtrees = " << subtrees.size() << "; not " + "printing subtrees" << dendl; + return; + } + + // root frags + std::vector<CDir*> basefrags; + for (set<CInode*>::iterator p = base_inodes.begin(); + p != base_inodes.end(); + ++p) + (*p)->get_dirfrags(basefrags); + //dout(15) << "show_subtrees, base dirfrags " << basefrags << dendl; + dout(15) << "show_subtrees" << dendl; + + // queue stuff + list<pair<CDir*,int> > q; + string indent; + set<CDir*> seen; + + // calc max depth + for (const auto& dir : basefrags) { + q.emplace_back(dir, 0); + } + + set<CDir*> subtrees_seen; + + unsigned int depth = 0; + while (!q.empty()) { + CDir *dir = q.front().first; + unsigned int d = q.front().second; + q.pop_front(); + + if (subtrees.count(dir) == 0) continue; + + subtrees_seen.insert(dir); + + if (d > depth) depth = d; + + // sanity check + //dout(25) << "saw depth " << d << " " << *dir << dendl; + if (seen.count(dir)) dout(0) << "aah, already seen " << *dir << dendl; + ceph_assert(seen.count(dir) == 0); + seen.insert(dir); + + // nested items? 
+ if (!subtrees[dir].empty()) { + for (set<CDir*>::iterator p = subtrees[dir].begin(); + p != subtrees[dir].end(); + ++p) { + //dout(25) << " saw sub " << **p << dendl; + q.push_front(pair<CDir*,int>(*p, d+1)); + } + } + } + + if (!force_print && depth > SUBTREES_DEPTH_THRESHOLD && + !g_conf()->subsys.should_gather<ceph_subsys_mds, 25>()) { + dout(ceph::dout::need_dynamic(dbl)) << "max depth among subtrees = " << depth << "; not printing " + "subtrees" << dendl; + return; + } + + // print tree + for (const auto& dir : basefrags) { + q.emplace_back(dir, 0); + } + + while (!q.empty()) { + CDir *dir = q.front().first; + int d = q.front().second; + q.pop_front(); + + if (subtrees.count(dir) == 0) continue; + + // adjust indenter + while ((unsigned)d < indent.size()) + indent.resize(d); + + // pad + string pad = "______________________________________"; + pad.resize(depth*2+1-indent.size()); + if (!subtrees[dir].empty()) + pad[0] = '.'; // parent + + + string auth; + if (dir->is_auth()) + auth = "auth "; + else + auth = " rep "; + + char s[10]; + if (dir->get_dir_auth().second == CDIR_AUTH_UNKNOWN) + snprintf(s, sizeof(s), "%2d ", int(dir->get_dir_auth().first)); + else + snprintf(s, sizeof(s), "%2d,%2d", int(dir->get_dir_auth().first), int(dir->get_dir_auth().second)); + + // print + dout(ceph::dout::need_dynamic(dbl)) << indent << "|_" << pad << s + << " " << auth << *dir << dendl; + + if (dir->ino() == CEPH_INO_ROOT) + ceph_assert(dir->inode == root); + if (dir->ino() == MDS_INO_MDSDIR(mds->get_nodeid())) + ceph_assert(dir->inode == myin); + if (dir->inode->is_stray() && (MDS_INO_STRAY_OWNER(dir->ino()) == mds->get_nodeid())) + ceph_assert(strays[MDS_INO_STRAY_INDEX(dir->ino())] == dir->inode); + + // nested items? + if (!subtrees[dir].empty()) { + // more at my level? 
+ if (!q.empty() && q.front().second == d) + indent += "| "; + else + indent += " "; + + for (set<CDir*>::iterator p = subtrees[dir].begin(); + p != subtrees[dir].end(); + ++p) + q.push_front(pair<CDir*,int>(*p, d+2)); + } + } + + // verify there isn't stray crap in subtree map + int lost = 0; + for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin(); + p != subtrees.end(); + ++p) { + if (subtrees_seen.count(p->first)) continue; + dout(10) << "*** stray/lost entry in subtree map: " << *p->first << dendl; + lost++; + } + ceph_assert(lost == 0); +} + +void MDCache::show_cache() +{ + if (!g_conf()->subsys.should_gather<ceph_subsys_mds, 7>()) + return; + dout(7) << "show_cache" << dendl; + + auto show_func = [this](CInode *in) { + // unlinked? + if (!in->parent) + dout(7) << " unlinked " << *in << dendl; + + // dirfrags? + auto&& dfs = in->get_dirfrags(); + for (const auto& dir : dfs) { + dout(7) << " dirfrag " << *dir << dendl; + + for (auto &p : dir->items) { + CDentry *dn = p.second; + dout(7) << " dentry " << *dn << dendl; + CDentry::linkage_t *dnl = dn->get_linkage(); + if (dnl->is_primary() && dnl->get_inode()) + dout(7) << " inode " << *dnl->get_inode() << dendl; + } + } + }; + + for (auto &p : inode_map) + show_func(p.second); + for (auto &p : snap_inode_map) + show_func(p.second); +} + +void MDCache::cache_status(Formatter *f) +{ + f->open_object_section("cache"); + + f->open_object_section("pool"); + mempool::get_pool(mempool::mds_co::id).dump(f); + f->close_section(); + + f->close_section(); +} + +void MDCache::dump_tree(CInode *in, const int cur_depth, const int max_depth, Formatter *f) +{ + ceph_assert(in); + if ((max_depth >= 0) && (cur_depth > max_depth)) { + return; + } + auto&& ls = in->get_dirfrags(); + for (const auto &subdir : ls) { + for (const auto &p : subdir->items) { + CDentry *dn = p.second; + CInode *in = dn->get_linkage()->get_inode(); + if (in) { + dump_tree(in, cur_depth + 1, max_depth, f); + } + } + } + 
f->open_object_section("inode"); + in->dump(f, CInode::DUMP_DEFAULT | CInode::DUMP_DIRFRAGS); + f->close_section(); +} + +int MDCache::dump_cache(std::string_view file_name, double timeout) +{ + return dump_cache(file_name, NULL, timeout); +} + +int MDCache::dump_cache(Formatter *f, double timeout) +{ + return dump_cache(std::string_view(""), f, timeout); +} + +/** + * Dump the metadata cache, either to a Formatter, if + * provided, else to a plain text file. + */ +int MDCache::dump_cache(std::string_view fn, Formatter *f, double timeout) +{ + int r = 0; + + // dumping large caches may cause mds to hang or worse get killed. + // so, disallow the dump if the cache size exceeds the configured + // threshold, which is 1G for formatter and unlimited for file (note + // that this can be jacked up by the admin... and is nothing but foot + // shooting, but the option itself is for devs and hence dangerous to + // tune). TODO: remove this when fixed. + uint64_t threshold = f ? + g_conf().get_val<Option::size_t>("mds_dump_cache_threshold_formatter") : + g_conf().get_val<Option::size_t>("mds_dump_cache_threshold_file"); + + if (threshold && cache_size() > threshold) { + if (f) { + CachedStackStringStream css; + *css << "cache usage exceeds dump threshold"; + f->open_object_section("result"); + f->dump_string("error", css->strv()); + f->close_section(); + } else { + derr << "cache usage exceeds dump threshold" << dendl; + r = -CEPHFS_EINVAL; + } + return r; + } + + r = 0; + int fd = -1; + + if (f) { + f->open_array_section("inodes"); + } else { + char path[PATH_MAX] = ""; + if (fn.length()) { + snprintf(path, sizeof path, "%s", fn.data()); + } else { + snprintf(path, sizeof path, "cachedump.%d.mds%d", (int)mds->mdsmap->get_epoch(), int(mds->get_nodeid())); + } + + dout(1) << "dump_cache to " << path << dendl; + + fd = ::open(path, O_WRONLY|O_CREAT|O_EXCL|O_CLOEXEC, 0600); + if (fd < 0) { + derr << "failed to open " << path << ": " << cpp_strerror(errno) << dendl; + return 
errno; + } + } + + auto dump_func = [fd, f](CInode *in) { + int r; + if (f) { + f->open_object_section("inode"); + in->dump(f, CInode::DUMP_DEFAULT | CInode::DUMP_DIRFRAGS); + f->close_section(); + return 1; + } + CachedStackStringStream css; + *css << *in << std::endl; + auto sv = css->strv(); + r = safe_write(fd, sv.data(), sv.size()); + if (r < 0) + return r; + auto&& dfs = in->get_dirfrags(); + for (auto &dir : dfs) { + CachedStackStringStream css2; + *css2 << " " << *dir << std::endl; + auto sv = css2->strv(); + r = safe_write(fd, sv.data(), sv.size()); + if (r < 0) + return r; + for (auto &p : dir->items) { + CDentry *dn = p.second; + CachedStackStringStream css3; + *css3 << " " << *dn << std::endl; + auto sv = css3->strv(); + r = safe_write(fd, sv.data(), sv.size()); + if (r < 0) + return r; + } + dir->check_rstats(); + } + return 1; + }; + + auto start = mono_clock::now(); + int64_t count = 0; + for (auto &p : inode_map) { + r = dump_func(p.second); + if (r < 0) + goto out; + if (!(++count % 1000) && + timeout > 0 && + std::chrono::duration<double>(mono_clock::now() - start).count() > timeout) { + r = -ETIMEDOUT; + goto out; + } + } + for (auto &p : snap_inode_map) { + r = dump_func(p.second); + if (r < 0) + goto out; + if (!(++count % 1000) && + timeout > 0 && + std::chrono::duration<double>(mono_clock::now() - start).count() > timeout) { + r = -ETIMEDOUT; + goto out; + } + + } + r = 0; + + out: + if (f) { + if (r == -ETIMEDOUT) + { + f->close_section(); + f->open_object_section("result"); + f->dump_string("error", "the operation timeout"); + } + f->close_section(); // inodes + } else { + if (r == -ETIMEDOUT) + { + CachedStackStringStream css; + *css << "error : the operation timeout" << std::endl; + auto sv = css->strv(); + r = safe_write(fd, sv.data(), sv.size()); + } + ::close(fd); + } + return r; +} + +void C_MDS_RetryRequest::finish(int r) +{ + mdr->retry++; + cache->dispatch_request(mdr); +} + +MDSContext *CF_MDS_RetryRequestFactory::build() +{ + if 
(drop_locks) { + mdcache->mds->locker->drop_locks(mdr.get(), nullptr); + mdr->drop_local_auth_pins(); + } + return new C_MDS_RetryRequest(mdcache, mdr); +} + +class C_MDS_EnqueueScrub : public Context +{ + std::string tag; + Formatter *formatter; + Context *on_finish; +public: + ScrubHeaderRef header; + C_MDS_EnqueueScrub(std::string_view tag, Formatter *f, Context *fin) : + tag(tag), formatter(f), on_finish(fin), header(nullptr) {} + + void finish(int r) override { + formatter->open_object_section("results"); + formatter->dump_int("return_code", r); + if (r == 0) { + formatter->dump_string("scrub_tag", tag); + formatter->dump_string("mode", "asynchronous"); + } + formatter->close_section(); + + r = 0; + if (on_finish) + on_finish->complete(r); + } +}; + +void MDCache::enqueue_scrub( + std::string_view path, + std::string_view tag, + bool force, bool recursive, bool repair, + bool scrub_mdsdir, Formatter *f, Context *fin) +{ + dout(10) << __func__ << " " << path << dendl; + + filepath fp; + if (path.compare(0, 4, "~mds") == 0) { + mds_rank_t rank; + if (path == "~mdsdir") { + rank = mds->get_nodeid(); + } else { + std::string err; + rank = strict_strtoll(path.substr(4), 10, &err); + if (!err.empty()) + rank = MDS_RANK_NONE; + } + if (rank >= 0 && rank < MAX_MDS) + fp.set_path("", MDS_INO_MDSDIR(rank)); + } + if (fp.get_ino() == inodeno_t(0)) + fp.set_path(path); + + MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_ENQUEUE_SCRUB); + mdr->set_filepath(fp); + + bool is_internal = false; + std::string tag_str(tag); + if (tag_str.empty()) { + uuid_d uuid_gen; + uuid_gen.generate_random(); + tag_str = uuid_gen.to_string(); + is_internal = true; + } + + C_MDS_EnqueueScrub *cs = new C_MDS_EnqueueScrub(tag_str, f, fin); + cs->header = std::make_shared<ScrubHeader>(tag_str, is_internal, force, + recursive, repair, scrub_mdsdir); + + mdr->internal_op_finish = cs; + enqueue_scrub_work(mdr); +} + +void MDCache::enqueue_scrub_work(MDRequestRef& mdr) +{ + CInode *in; + 
CF_MDS_RetryRequestFactory cf(this, mdr, true); + int r = path_traverse(mdr, cf, mdr->get_filepath(), + MDS_TRAVERSE_DISCOVER | MDS_TRAVERSE_RDLOCK_PATH, + nullptr, &in); + if (r > 0) + return; + if (r < 0) { + mds->server->respond_to_request(mdr, r); + return; + } + + // Cannot scrub same dentry twice at same time + if (in->scrub_is_in_progress()) { + mds->server->respond_to_request(mdr, -CEPHFS_EBUSY); + return; + } else { + in->scrub_info(); + } + + C_MDS_EnqueueScrub *cs = static_cast<C_MDS_EnqueueScrub*>(mdr->internal_op_finish); + ScrubHeaderRef& header = cs->header; + + r = mds->scrubstack->enqueue(in, header, !header->get_recursive()); + + mds->server->respond_to_request(mdr, r); +} + +struct C_MDC_RespondInternalRequest : public MDCacheLogContext { + MDRequestRef mdr; + C_MDC_RespondInternalRequest(MDCache *c, MDRequestRef& m) : + MDCacheLogContext(c), mdr(m) {} + void finish(int r) override { + mdr->apply(); + get_mds()->server->respond_to_request(mdr, r); + } +}; + +struct C_MDC_ScrubRepaired : public MDCacheContext { + ScrubHeaderRef header; +public: + C_MDC_ScrubRepaired(MDCache *m, const ScrubHeaderRef& h) + : MDCacheContext(m), header(h) { + header->inc_num_pending(); + } + void finish(int r) override { + header->dec_num_pending(); + } +}; + +void MDCache::repair_dirfrag_stats(CDir *dir) +{ + MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_REPAIR_FRAGSTATS); + mdr->pin(dir); + mdr->internal_op_private = dir; + if (dir->scrub_is_in_progress()) + mdr->internal_op_finish = new C_MDC_ScrubRepaired(this, dir->get_scrub_header()); + else + mdr->internal_op_finish = new C_MDSInternalNoop; + repair_dirfrag_stats_work(mdr); +} + +void MDCache::repair_dirfrag_stats_work(MDRequestRef& mdr) +{ + CDir *dir = static_cast<CDir*>(mdr->internal_op_private); + dout(10) << __func__ << " " << *dir << dendl; + + if (!dir->is_auth()) { + mds->server->respond_to_request(mdr, -CEPHFS_ESTALE); + return; + } + + if (!mdr->is_auth_pinned(dir) && !dir->can_auth_pin()) { + 
dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(this, mdr)); + + mds->locker->drop_locks(mdr.get()); + mdr->drop_local_auth_pins(); + if (mdr->is_any_remote_auth_pin()) + mds->locker->notify_freeze_waiter(dir); + return; + } + + mdr->auth_pin(dir); + + MutationImpl::LockOpVec lov; + CInode *diri = dir->inode; + lov.add_rdlock(&diri->dirfragtreelock); + lov.add_wrlock(&diri->nestlock); + lov.add_wrlock(&diri->filelock); + if (!mds->locker->acquire_locks(mdr, lov)) + return; + + if (!dir->is_complete()) { + dir->fetch(new C_MDS_RetryRequest(this, mdr)); + return; + } + + frag_info_t frag_info; + nest_info_t nest_info; + for (auto it = dir->begin(); it != dir->end(); ++it) { + CDentry *dn = it->second; + if (dn->last != CEPH_NOSNAP) + continue; + CDentry::linkage_t *dnl = dn->get_projected_linkage(); + if (dnl->is_primary()) { + CInode *in = dnl->get_inode(); + nest_info.add(in->get_projected_inode()->accounted_rstat); + if (in->is_dir()) + frag_info.nsubdirs++; + else + frag_info.nfiles++; + } else if (dnl->is_remote()) + frag_info.nfiles++; + } + + auto pf = dir->get_projected_fnode(); + bool good_fragstat = frag_info.same_sums(pf->fragstat); + bool good_rstat = nest_info.same_sums(pf->rstat); + if (good_fragstat && good_rstat) { + dout(10) << __func__ << " no corruption found" << dendl; + mds->server->respond_to_request(mdr, 0); + return; + } + + auto _pf = dir->project_fnode(mdr); + _pf->version = dir->pre_dirty(); + pf = _pf; + + mdr->ls = mds->mdlog->get_current_segment(); + EUpdate *le = new EUpdate(mds->mdlog, "repair_dirfrag"); + mds->mdlog->start_entry(le); + + if (!good_fragstat) { + if (pf->fragstat.mtime > frag_info.mtime) + frag_info.mtime = pf->fragstat.mtime; + if (pf->fragstat.change_attr > frag_info.change_attr) + frag_info.change_attr = pf->fragstat.change_attr; + _pf->fragstat = frag_info; + mds->locker->mark_updated_scatterlock(&diri->filelock); + mdr->ls->dirty_dirfrag_dir.push_back(&diri->item_dirty_dirfrag_dir); + 
mdr->add_updated_lock(&diri->filelock); + } + + if (!good_rstat) { + if (pf->rstat.rctime > nest_info.rctime) + nest_info.rctime = pf->rstat.rctime; + _pf->rstat = nest_info; + mds->locker->mark_updated_scatterlock(&diri->nestlock); + mdr->ls->dirty_dirfrag_nest.push_back(&diri->item_dirty_dirfrag_nest); + mdr->add_updated_lock(&diri->nestlock); + } + + le->metablob.add_dir_context(dir); + le->metablob.add_dir(dir, true); + + mds->mdlog->submit_entry(le, new C_MDC_RespondInternalRequest(this, mdr)); +} + +void MDCache::repair_inode_stats(CInode *diri) +{ + MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_REPAIR_INODESTATS); + mdr->auth_pin(diri); // already auth pinned by CInode::validate_disk_state() + mdr->internal_op_private = diri; + if (diri->scrub_is_in_progress()) + mdr->internal_op_finish = new C_MDC_ScrubRepaired(this, diri->get_scrub_header()); + else + mdr->internal_op_finish = new C_MDSInternalNoop; + repair_inode_stats_work(mdr); +} + +void MDCache::repair_inode_stats_work(MDRequestRef& mdr) +{ + CInode *diri = static_cast<CInode*>(mdr->internal_op_private); + dout(10) << __func__ << " " << *diri << dendl; + + if (!diri->is_auth()) { + mds->server->respond_to_request(mdr, -CEPHFS_ESTALE); + return; + } + if (!diri->is_dir()) { + mds->server->respond_to_request(mdr, -CEPHFS_ENOTDIR); + return; + } + + MutationImpl::LockOpVec lov; + + if (mdr->ls) // already marked filelock/nestlock dirty ? + goto do_rdlocks; + + lov.add_rdlock(&diri->dirfragtreelock); + lov.add_wrlock(&diri->nestlock); + lov.add_wrlock(&diri->filelock); + if (!mds->locker->acquire_locks(mdr, lov)) + return; + + // Fetch all dirfrags and mark filelock/nestlock dirty. This will tirgger + // the scatter-gather process, which will fix any fragstat/rstat errors. 
+ { + frag_vec_t leaves; + diri->dirfragtree.get_leaves(leaves); + for (const auto& leaf : leaves) { + CDir *dir = diri->get_dirfrag(leaf); + if (!dir) { + ceph_assert(mdr->is_auth_pinned(diri)); + dir = diri->get_or_open_dirfrag(this, leaf); + } + if (dir->get_version() == 0) { + ceph_assert(dir->is_auth()); + dir->fetch_keys({}, new C_MDS_RetryRequest(this, mdr)); + return; + } + } + } + + diri->state_set(CInode::STATE_REPAIRSTATS); + mdr->ls = mds->mdlog->get_current_segment(); + mds->locker->mark_updated_scatterlock(&diri->filelock); + mdr->ls->dirty_dirfrag_dir.push_back(&diri->item_dirty_dirfrag_dir); + mds->locker->mark_updated_scatterlock(&diri->nestlock); + mdr->ls->dirty_dirfrag_nest.push_back(&diri->item_dirty_dirfrag_nest); + + mds->locker->drop_locks(mdr.get()); + +do_rdlocks: + // force the scatter-gather process + lov.clear(); + lov.add_rdlock(&diri->dirfragtreelock); + lov.add_rdlock(&diri->nestlock); + lov.add_rdlock(&diri->filelock); + if (!mds->locker->acquire_locks(mdr, lov)) + return; + + diri->state_clear(CInode::STATE_REPAIRSTATS); + + frag_info_t dir_info; + nest_info_t nest_info; + nest_info.rsubdirs = 1; // it gets one to account for self + if (const sr_t *srnode = diri->get_projected_srnode(); srnode) + nest_info.rsnaps = srnode->snaps.size(); + + { + frag_vec_t leaves; + diri->dirfragtree.get_leaves(leaves); + for (const auto& leaf : leaves) { + CDir *dir = diri->get_dirfrag(leaf); + ceph_assert(dir); + ceph_assert(dir->get_version() > 0); + dir_info.add(dir->get_fnode()->accounted_fragstat); + nest_info.add(dir->get_fnode()->accounted_rstat); + } + } + + if (!dir_info.same_sums(diri->get_inode()->dirstat) || + !nest_info.same_sums(diri->get_inode()->rstat)) { + dout(10) << __func__ << " failed to fix fragstat/rstat on " + << *diri << dendl; + } + + mds->server->respond_to_request(mdr, 0); +} + +void MDCache::rdlock_dirfrags_stats(CInode *diri, MDSInternalContext* fin) +{ + MDRequestRef mdr = 
request_start_internal(CEPH_MDS_OP_RDLOCK_FRAGSSTATS); + mdr->auth_pin(diri); // already auth pinned by CInode::validate_disk_state() + mdr->internal_op_private = diri; + mdr->internal_op_finish = fin; + return rdlock_dirfrags_stats_work(mdr); +} + +void MDCache::rdlock_dirfrags_stats_work(MDRequestRef& mdr) +{ + CInode *diri = static_cast<CInode*>(mdr->internal_op_private); + dout(10) << __func__ << " " << *diri << dendl; + if (!diri->is_auth()) { + mds->server->respond_to_request(mdr, -CEPHFS_ESTALE); + return; + } + if (!diri->is_dir()) { + mds->server->respond_to_request(mdr, -CEPHFS_ENOTDIR); + return; + } + + MutationImpl::LockOpVec lov; + lov.add_rdlock(&diri->dirfragtreelock); + lov.add_rdlock(&diri->nestlock); + lov.add_rdlock(&diri->filelock); + if (!mds->locker->acquire_locks(mdr, lov)) + return; + dout(10) << __func__ << " start dirfrags : " << *diri << dendl; + + mds->server->respond_to_request(mdr, 0); + return; +} + +void MDCache::flush_dentry(std::string_view path, Context *fin) +{ + if (is_readonly()) { + dout(10) << __func__ << ": read-only FS" << dendl; + fin->complete(-CEPHFS_EROFS); + return; + } + dout(10) << "flush_dentry " << path << dendl; + MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_FLUSH); + filepath fp(path); + mdr->set_filepath(fp); + mdr->internal_op_finish = fin; + flush_dentry_work(mdr); +} + +class C_FinishIOMDR : public MDSContext { +protected: + MDSRank *mds; + MDRequestRef mdr; + MDSRank *get_mds() override { return mds; } +public: + C_FinishIOMDR(MDSRank *mds_, MDRequestRef& mdr_) : mds(mds_), mdr(mdr_) {} + void finish(int r) override { mds->server->respond_to_request(mdr, r); } +}; + +void MDCache::flush_dentry_work(MDRequestRef& mdr) +{ + MutationImpl::LockOpVec lov; + CInode *in = mds->server->rdlock_path_pin_ref(mdr, true); + if (!in) + return; + + ceph_assert(in->is_auth()); + in->flush(new C_FinishIOMDR(mds, mdr)); +} + + +/** + * Initialize performance counters with global perfcounter + * collection. 
+ */ +void MDCache::register_perfcounters() +{ + PerfCountersBuilder pcb(g_ceph_context, "mds_cache", l_mdc_first, l_mdc_last); + + pcb.add_u64_counter(l_mdc_dir_update, "dir_update", + "Directory replication directives"); + pcb.add_u64_counter(l_mdc_dir_update_receipt, "dir_update_receipt", + "Directory replication directives received"); + pcb.add_u64_counter(l_mdc_dir_try_discover, "dir_try_discover", + "Directory replication attempt to discover"); + pcb.add_u64_counter(l_mdc_dir_send_discover, "dir_send_discover", + "Directory replication discovery message sent"); + pcb.add_u64_counter(l_mdc_dir_handle_discover, "dir_handle_discover", + "Directory replication discovery message handled"); + + // Stray/purge statistics + pcb.add_u64(l_mdc_num_strays, "num_strays", "Stray dentries", "stry", + PerfCountersBuilder::PRIO_INTERESTING); + pcb.add_u64(l_mdc_num_recovering_enqueued, + "num_recovering_enqueued", "Files waiting for recovery", "recy", + PerfCountersBuilder::PRIO_INTERESTING); + pcb.add_u64_counter(l_mdc_recovery_completed, + "recovery_completed", "File recoveries completed", "recd", + PerfCountersBuilder::PRIO_INTERESTING); + + // useful recovery queue statistics + pcb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL); + pcb.add_u64(l_mdc_num_recovering_processing, "num_recovering_processing", + "Files currently being recovered"); + pcb.add_u64(l_mdc_num_recovering_prioritized, "num_recovering_prioritized", + "Files waiting for recovery with elevated priority"); + pcb.add_u64_counter(l_mdc_recovery_started, "recovery_started", + "File recoveries started"); + + // along with other stray dentries stats + pcb.add_u64(l_mdc_num_strays_delayed, "num_strays_delayed", + "Stray dentries delayed"); + pcb.add_u64(l_mdc_num_strays_enqueuing, "num_strays_enqueuing", + "Stray dentries enqueuing for purge"); + pcb.add_u64_counter(l_mdc_strays_created, "strays_created", + "Stray dentries created"); + pcb.add_u64_counter(l_mdc_strays_enqueued, "strays_enqueued", + "Stray 
dentries enqueued for purge"); + pcb.add_u64_counter(l_mdc_strays_reintegrated, "strays_reintegrated", + "Stray dentries reintegrated"); + pcb.add_u64_counter(l_mdc_strays_migrated, "strays_migrated", + "Stray dentries migrated"); + + // low prio internal request stats + pcb.add_u64_counter(l_mdss_ireq_enqueue_scrub, "ireq_enqueue_scrub", + "Internal Request type enqueue scrub"); + pcb.add_u64_counter(l_mdss_ireq_exportdir, "ireq_exportdir", + "Internal Request type export dir"); + pcb.add_u64_counter(l_mdss_ireq_flush, "ireq_flush", + "Internal Request type flush"); + pcb.add_u64_counter(l_mdss_ireq_fragmentdir, "ireq_fragmentdir", + "Internal Request type fragmentdir"); + pcb.add_u64_counter(l_mdss_ireq_fragstats, "ireq_fragstats", + "Internal Request type frag stats"); + pcb.add_u64_counter(l_mdss_ireq_inodestats, "ireq_inodestats", + "Internal Request type inode stats"); + + logger.reset(pcb.create_perf_counters()); + g_ceph_context->get_perfcounters_collection()->add(logger.get()); + recovery_queue.set_logger(logger.get()); + stray_manager.set_logger(logger.get()); +} + +/** + * Call this when putting references to an inode/dentry or + * when attempting to trim it. + * + * If this inode is no longer linked by anyone, and this MDS + * rank holds the primary dentry, and that dentry is in a stray + * directory, then give up the dentry to the StrayManager, never + * to be seen again by MDCache. + * + * @param delay if true, then purgeable inodes are stashed til + * the next trim(), rather than being purged right + * away. + */ +void MDCache::maybe_eval_stray(CInode *in, bool delay) { + if (in->get_inode()->nlink > 0 || in->is_base() || is_readonly() || + mds->get_state() <= MDSMap::STATE_REJOIN) + return; + + CDentry *dn = in->get_projected_parent_dn(); + + if (dn->state_test(CDentry::STATE_PURGING)) { + /* We have already entered the purging process, no need + * to re-evaluate me ! 
*/ + return; + } + + if (dn->get_dir()->get_inode()->is_stray()) { + if (delay) + stray_manager.queue_delayed(dn); + else + stray_manager.eval_stray(dn); + } +} + +void MDCache::clear_dirty_bits_for_stray(CInode* diri) { + dout(10) << __func__ << " " << *diri << dendl; + ceph_assert(diri->get_projected_parent_dir()->inode->is_stray()); + auto&& ls = diri->get_dirfrags(); + for (auto &p : ls) { + if (p->is_auth() && !(p->is_frozen() || p->is_freezing())) + p->try_remove_dentries_for_stray(); + } + if (!diri->snaprealm) { + if (diri->is_auth()) + diri->clear_dirty_rstat(); + diri->clear_scatter_dirty(); + } +} + +bool MDCache::dump_inode(Formatter *f, uint64_t number) { + CInode *in = get_inode(number); + if (!in) { + return false; + } + f->open_object_section("inode"); + in->dump(f, CInode::DUMP_DEFAULT | CInode::DUMP_PATH); + f->close_section(); + return true; +} + +void MDCache::handle_mdsmap(const MDSMap &mdsmap, const MDSMap &oldmap) { + const mds_rank_t max_mds = mdsmap.get_max_mds(); + + // process export_pin_delayed_queue whenever a new MDSMap received + auto &q = export_pin_delayed_queue; + for (auto it = q.begin(); it != q.end(); ) { + auto *in = *it; + mds_rank_t export_pin = in->get_export_pin(false); + dout(10) << " delayed export_pin=" << export_pin << " on " << *in + << " max_mds=" << max_mds << dendl; + if (export_pin >= mdsmap.get_max_mds()) { + it++; + continue; + } + + in->state_clear(CInode::STATE_DELAYEDEXPORTPIN); + it = q.erase(it); + in->queue_export_pin(export_pin); + } + + if (mdsmap.get_max_mds() != oldmap.get_max_mds()) { + dout(10) << "Checking ephemerally pinned directories for redistribute due to max_mds change." 
<< dendl; + /* copy to vector to avoid removals during iteration */ + std::vector<CInode*> migrate; + migrate.assign(export_ephemeral_pins.begin(), export_ephemeral_pins.end()); + for (auto& in : migrate) { + in->maybe_export_pin(); + } + } + + if (max_mds <= 1) { + export_ephemeral_dist_frag_bits = 0; + } else { + double want = g_conf().get_val<double>("mds_export_ephemeral_distributed_factor"); + want *= max_mds; + unsigned n = 0; + while ((1U << n) < (unsigned)want) + ++n; + export_ephemeral_dist_frag_bits = n; + } +} + +void MDCache::upkeep_main(void) +{ + std::unique_lock lock(upkeep_mutex); + while (!upkeep_trim_shutdown.load()) { + auto now = clock::now(); + auto since = now-upkeep_last_trim; + auto trim_interval = clock::duration(g_conf().get_val<std::chrono::seconds>("mds_cache_trim_interval")); + if (since >= trim_interval*.90) { + lock.unlock(); /* mds_lock -> upkeep_mutex */ + std::scoped_lock mds_lock(mds->mds_lock); + lock.lock(); + if (upkeep_trim_shutdown.load()) + return; + check_memory_usage(); + if (mds->is_cache_trimmable()) { + dout(20) << "upkeep thread trimming cache; last trim " << since << " ago" << dendl; + bool active_with_clients = mds->is_active() || mds->is_clientreplay() || mds->is_stopping(); + if (active_with_clients) { + trim_client_leases(); + } + if (is_open()) { + trim(); + } + if (active_with_clients) { + auto recall_flags = Server::RecallFlags::ENFORCE_MAX|Server::RecallFlags::ENFORCE_LIVENESS; + if (cache_toofull()) { + recall_flags = recall_flags|Server::RecallFlags::TRIM; + } + mds->server->recall_client_state(nullptr, recall_flags); + } + upkeep_last_trim = now = clock::now(); + } else { + dout(10) << "cache not ready for trimming" << dendl; + } + } else { + trim_interval -= since; + } + since = now-upkeep_last_release; + auto release_interval = clock::duration(g_conf().get_val<std::chrono::seconds>("mds_cache_release_free_interval")); + if (since >= release_interval*.90) { + /* XXX not necessary once MDCache uses 
PriorityCache */ + dout(10) << "releasing free memory" << dendl; + ceph_heap_release_free_memory(); + upkeep_last_release = clock::now(); + } else { + release_interval -= since; + } + auto interval = std::min(release_interval, trim_interval); + dout(20) << "upkeep thread waiting interval " << interval << dendl; + upkeep_cvar.wait_for(lock, interval); + } +} |