diff options
Diffstat (limited to 'src/mds/MDCache.h')
-rw-r--r-- | src/mds/MDCache.h | 1432 |
1 files changed, 1432 insertions, 0 deletions
diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h new file mode 100644 index 000000000..d9f173038 --- /dev/null +++ b/src/mds/MDCache.h @@ -0,0 +1,1432 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ +#ifndef CEPH_MDCACHE_H +#define CEPH_MDCACHE_H + +#include <atomic> +#include <string_view> +#include <thread> + +#include "common/DecayCounter.h" +#include "include/common_fwd.h" +#include "include/types.h" +#include "include/filepath.h" +#include "include/elist.h" + +#include "messages/MCacheExpire.h" +#include "messages/MClientQuota.h" +#include "messages/MClientRequest.h" +#include "messages/MClientSnap.h" +#include "messages/MDentryLink.h" +#include "messages/MDentryUnlink.h" +#include "messages/MDirUpdate.h" +#include "messages/MDiscover.h" +#include "messages/MDiscoverReply.h" +#include "messages/MGatherCaps.h" +#include "messages/MGenericMessage.h" +#include "messages/MInodeFileCaps.h" +#include "messages/MLock.h" +#include "messages/MMDSCacheRejoin.h" +#include "messages/MMDSFindIno.h" +#include "messages/MMDSFindInoReply.h" +#include "messages/MMDSFragmentNotify.h" +#include "messages/MMDSFragmentNotifyAck.h" +#include "messages/MMDSOpenIno.h" +#include "messages/MMDSOpenInoReply.h" +#include "messages/MMDSResolve.h" +#include "messages/MMDSResolveAck.h" +#include "messages/MMDSPeerRequest.h" +#include "messages/MMDSSnapUpdate.h" + +#include "osdc/Filer.h" +#include "CInode.h" +#include "CDentry.h" +#include "CDir.h" +#include "include/Context.h" +#include "events/EMetaBlob.h" +#include "RecoveryQueue.h" +#include "StrayManager.h" +#include "OpenFileTable.h" +#include "MDSContext.h" +#include "MDSMap.h" +#include "Mutation.h" + +class MDSRank; +class Session; +class Migrator; + +class Session; + +class ESubtreeMap; + +enum { + l_mdc_first = 3000, + + // dir updates for replication + l_mdc_dir_update, + l_mdc_dir_update_receipt, + l_mdc_dir_try_discover, + l_mdc_dir_send_discover, + l_mdc_dir_handle_discover, + + // How many inodes currently in stray dentries + l_mdc_num_strays, + // How many stray dentries are currently delayed for purge due to refs + l_mdc_num_strays_delayed, + // How many stray dentries are currently being enqueued for purge + l_mdc_num_strays_enqueuing, + + // How many dentries have ever been added to stray dir + l_mdc_strays_created, + // How many dentries have been passed on to PurgeQueue + l_mdc_strays_enqueued, + // How many strays have been reintegrated? + l_mdc_strays_reintegrated, + // How many strays have been migrated? + l_mdc_strays_migrated, + + // How many inode sizes currently being recovered + l_mdc_num_recovering_processing, + // How many inodes currently waiting to have size recovered + l_mdc_num_recovering_enqueued, + // How many inodes waiting with elevated priority for recovery + l_mdc_num_recovering_prioritized, + // How many inodes ever started size recovery + l_mdc_recovery_started, + // How many inodes ever completed size recovery + l_mdc_recovery_completed, + + l_mdss_ireq_enqueue_scrub, + l_mdss_ireq_exportdir, + l_mdss_ireq_flush, + l_mdss_ireq_fragmentdir, + l_mdss_ireq_fragstats, + l_mdss_ireq_inodestats, + + l_mdc_last, +}; + +// flags for path_traverse(); +static const int MDS_TRAVERSE_DISCOVER = (1 << 0); +static const int MDS_TRAVERSE_PATH_LOCKED = (1 << 1); +static const int MDS_TRAVERSE_WANT_DENTRY = (1 << 2); +static const int MDS_TRAVERSE_WANT_AUTH = (1 << 3); +static const int MDS_TRAVERSE_RDLOCK_SNAP = (1 << 4); +static const int MDS_TRAVERSE_RDLOCK_SNAP2 = (1 << 5); +static const int MDS_TRAVERSE_WANT_DIRLAYOUT = (1 << 6); +static const int MDS_TRAVERSE_RDLOCK_PATH = (1 << 7); +static const int MDS_TRAVERSE_XLOCK_DENTRY = (1 << 8); +static const int MDS_TRAVERSE_RDLOCK_AUTHLOCK = (1 << 9); +static const int MDS_TRAVERSE_CHECK_LOCKCACHE = (1 << 10); +static const int MDS_TRAVERSE_WANT_INODE = (1 << 11); + + +// flags for predirty_journal_parents() +static const int PREDIRTY_PRIMARY = 1; // primary dn, adjust nested accounting +static const int PREDIRTY_DIR = 2; // update parent dir mtime/size +static const int PREDIRTY_SHALLOW = 4; // only go to immediate parent (for easier rollback) + +class MDCache { + public: + typedef std::map<mds_rank_t, ref_t<MCacheExpire>> expiremap; + + using clock = ceph::coarse_mono_clock; + using time = ceph::coarse_mono_time; + + // -- discover -- + struct discover_info_t { + discover_info_t() {} + ~discover_info_t() { + if (basei) + basei->put(MDSCacheObject::PIN_DISCOVERBASE); + } + void pin_base(CInode *b) { + basei = b; + basei->get(MDSCacheObject::PIN_DISCOVERBASE); + } + + ceph_tid_t tid = 0; + mds_rank_t mds = -1; + inodeno_t ino; + frag_t frag; + snapid_t snap = CEPH_NOSNAP; + filepath want_path; + CInode *basei = nullptr; + bool want_base_dir = false; + bool path_locked = false; + }; + + // [reconnect/rejoin caps] + struct reconnected_cap_info_t { + reconnected_cap_info_t() {} + inodeno_t realm_ino = 0; + snapid_t snap_follows = 0; + int dirty_caps = 0; + bool snapflush = 0; + }; + + // -- find_ino_peer -- + struct find_ino_peer_info_t { + find_ino_peer_info_t() {} + inodeno_t ino; + ceph_tid_t tid = 0; + MDSContext *fin = nullptr; + bool path_locked = false; + mds_rank_t hint = MDS_RANK_NONE; + mds_rank_t checking = MDS_RANK_NONE; + std::set<mds_rank_t> checked; + }; + + friend class C_MDC_RejoinOpenInoFinish; + friend class C_MDC_RejoinSessionsOpened; + + friend class Locker; + friend class Migrator; + friend class MDBalancer; + + // StrayManager needs to be able to remove_inode() from us + // when it is done purging + friend class StrayManager; + + explicit MDCache(MDSRank *m, PurgeQueue &purge_queue_); + ~MDCache(); + + void insert_taken_inos(inodeno_t ino) { + replay_taken_inos.insert(ino); + } + void clear_taken_inos(inodeno_t ino) { + replay_taken_inos.erase(ino); + } + bool test_and_clear_taken_inos(inodeno_t ino) { + return replay_taken_inos.erase(ino) != 0; + } + bool is_taken_inos_empty(void) { + return replay_taken_inos.empty(); + } + + uint64_t cache_limit_memory(void) { + return cache_memory_limit; + } + double cache_toofull_ratio(void) const { + double memory_reserve = cache_memory_limit*(1.0-cache_reservation); + return fmax(0.0, (cache_size()-memory_reserve)/memory_reserve); + } + bool cache_toofull(void) const { + return cache_toofull_ratio() > 0.0; + } + uint64_t cache_size(void) const { + return mempool::get_pool(mempool::mds_co::id).allocated_bytes(); + } + bool cache_overfull(void) const { + return cache_size() > cache_memory_limit*cache_health_threshold; + } + + void advance_stray(); + + unsigned get_ephemeral_dist_frag_bits() const { + return export_ephemeral_dist_frag_bits; + } + bool get_export_ephemeral_distributed_config(void) const { + return export_ephemeral_distributed_config; + } + + bool get_export_ephemeral_random_config(void) const { + return export_ephemeral_random_config; + } + + bool get_symlink_recovery(void) const { + return symlink_recovery; + } + + /** + * Call this when you know that a CDentry is ready to be passed + * on to StrayManager (i.e. this is a stray you've just created) + */ + void notify_stray(CDentry *dn) { + ceph_assert(dn->get_dir()->get_inode()->is_stray()); + if (dn->state_test(CDentry::STATE_PURGING)) + return; + + stray_manager.eval_stray(dn); + } + + mds_rank_t hash_into_rank_bucket(inodeno_t ino, frag_t fg=0); + + void maybe_eval_stray(CInode *in, bool delay=false); + void clear_dirty_bits_for_stray(CInode* diri); + + bool is_readonly() { return readonly; } + void force_readonly(); + + static file_layout_t gen_default_file_layout(const MDSMap &mdsmap); + static file_layout_t gen_default_log_layout(const MDSMap &mdsmap); + + void register_perfcounters(); + + void touch_client_lease(ClientLease *r, int pool, utime_t ttl) { + client_leases[pool].push_back(&r->item_lease); + r->ttl = ttl; + } + + void notify_stray_removed() + { + stray_manager.notify_stray_removed(); + } + + void notify_stray_created() + { + stray_manager.notify_stray_created(); + } + + void eval_remote(CDentry *dn) + { + stray_manager.eval_remote(dn); + } + + void _send_discover(discover_info_t& dis); + discover_info_t& _create_discover(mds_rank_t mds) { + ceph_tid_t t = ++discover_last_tid; + discover_info_t& d = discovers[t]; + d.tid = t; + d.mds = mds; + return d; + } + + void discover_base_ino(inodeno_t want_ino, MDSContext *onfinish, mds_rank_t from=MDS_RANK_NONE); + void discover_dir_frag(CInode *base, frag_t approx_fg, MDSContext *onfinish, + mds_rank_t from=MDS_RANK_NONE); + void discover_path(CInode *base, snapid_t snap, filepath want_path, MDSContext *onfinish, + bool path_locked=false, mds_rank_t from=MDS_RANK_NONE); + void discover_path(CDir *base, snapid_t snap, filepath want_path, MDSContext *onfinish, + bool path_locked=false); + void kick_discovers(mds_rank_t who); // after a failure. + + // adjust subtree auth specification + // dir->dir_auth + // imports/exports/nested_exports + // join/split subtrees as appropriate + bool is_subtrees() { return !subtrees.empty(); } + template<typename T> + void get_subtrees(T& c) { + if constexpr (std::is_same_v<T, std::vector<CDir*>>) + c.reserve(c.size() + subtrees.size()); + for (const auto& p : subtrees) { + c.push_back(p.first); + } + } + void adjust_subtree_auth(CDir *root, mds_authority_t auth, bool adjust_pop=true); + void adjust_subtree_auth(CDir *root, mds_rank_t a, mds_rank_t b=CDIR_AUTH_UNKNOWN) { + adjust_subtree_auth(root, mds_authority_t(a,b)); + } + void adjust_bounded_subtree_auth(CDir *dir, const std::set<CDir*>& bounds, mds_authority_t auth); + void adjust_bounded_subtree_auth(CDir *dir, const std::set<CDir*>& bounds, mds_rank_t a) { + adjust_bounded_subtree_auth(dir, bounds, mds_authority_t(a, CDIR_AUTH_UNKNOWN)); + } + void adjust_bounded_subtree_auth(CDir *dir, const std::vector<dirfrag_t>& bounds, const mds_authority_t &auth); + void adjust_bounded_subtree_auth(CDir *dir, const std::vector<dirfrag_t>& bounds, mds_rank_t a) { + adjust_bounded_subtree_auth(dir, bounds, mds_authority_t(a, CDIR_AUTH_UNKNOWN)); + } + void map_dirfrag_set(const std::list<dirfrag_t>& dfs, std::set<CDir*>& result); + void try_subtree_merge(CDir *root); + void try_subtree_merge_at(CDir *root, std::set<CInode*> *to_eval, bool adjust_pop=true); + void eval_subtree_root(CInode *diri); + CDir *get_subtree_root(CDir *dir); + CDir *get_projected_subtree_root(CDir *dir); + bool is_leaf_subtree(CDir *dir) { + ceph_assert(subtrees.count(dir)); + return subtrees[dir].empty(); + } + void remove_subtree(CDir *dir); + bool is_subtree(CDir *root) { + return subtrees.count(root); + } + void get_subtree_bounds(CDir *root, std::set<CDir*>& bounds); + void get_wouldbe_subtree_bounds(CDir *root, std::set<CDir*>& bounds); + void verify_subtree_bounds(CDir *root, const std::set<CDir*>& bounds); + void verify_subtree_bounds(CDir *root, const std::list<dirfrag_t>& bounds); + + void project_subtree_rename(CInode *diri, CDir *olddir, CDir *newdir); + void adjust_subtree_after_rename(CInode *diri, CDir *olddir, bool pop); + + auto get_auth_subtrees() { + std::vector<CDir*> c; + for (auto& p : subtrees) { + auto& root = p.first; + if (root->is_auth()) { + c.push_back(root); + } + } + return c; + } + + auto get_fullauth_subtrees() { + std::vector<CDir*> c; + for (auto& p : subtrees) { + auto& root = p.first; + if (root->is_full_dir_auth()) { + c.push_back(root); + } + } + return c; + } + auto num_subtrees_fullauth() const { + std::size_t n = 0; + for (auto& p : subtrees) { + auto& root = p.first; + if (root->is_full_dir_auth()) { + ++n; + } + } + return n; + } + + auto num_subtrees_fullnonauth() const { + std::size_t n = 0; + for (auto& p : subtrees) { + auto& root = p.first; + if (root->is_full_dir_nonauth()) { + ++n; + } + } + return n; + } + + auto num_subtrees() const { + return subtrees.size(); + } + + int get_num_client_requests(); + + MDRequestRef request_start(const cref_t<MClientRequest>& req); + MDRequestRef request_start_peer(metareqid_t rid, __u32 attempt, const cref_t<Message> &m); + MDRequestRef request_start_internal(int op); + bool have_request(metareqid_t rid) { + return active_requests.count(rid); + } + MDRequestRef request_get(metareqid_t rid); + void request_pin_ref(MDRequestRef& r, CInode *ref, std::vector<CDentry*>& trace); + void request_finish(MDRequestRef& mdr); + void request_forward(MDRequestRef& mdr, mds_rank_t mds, int port=0); + void dispatch_request(MDRequestRef& mdr); + void request_drop_foreign_locks(MDRequestRef& mdr); + void request_drop_non_rdlocks(MDRequestRef& r); + void request_drop_locks(MDRequestRef& r); + void request_cleanup(MDRequestRef& r); + + void request_kill(MDRequestRef& r); // called when session closes + + // journal/snap helpers + CInode *pick_inode_snap(CInode *in, snapid_t follows); + CInode *cow_inode(CInode *in, snapid_t last); + void journal_cow_dentry(MutationImpl *mut, EMetaBlob *metablob, CDentry *dn, + snapid_t follows=CEPH_NOSNAP, + CInode **pcow_inode=0, CDentry::linkage_t *dnl=0); + void journal_dirty_inode(MutationImpl *mut, EMetaBlob *metablob, CInode *in, snapid_t follows=CEPH_NOSNAP); + + void project_rstat_inode_to_frag(const MutationRef& mut, + CInode *cur, CDir *parent, snapid_t first, + int linkunlink, SnapRealm *prealm); + void _project_rstat_inode_to_frag(const CInode::mempool_inode* inode, snapid_t ofirst, snapid_t last, + CDir *parent, int linkunlink, bool update_inode); + void project_rstat_frag_to_inode(const nest_info_t& rstat, const nest_info_t& accounted_rstat, + snapid_t ofirst, snapid_t last, CInode *pin, bool cow_head); + void broadcast_quota_to_client(CInode *in, client_t exclude_ct = -1, bool quota_change = false); + void predirty_journal_parents(MutationRef mut, EMetaBlob *blob, + CInode *in, CDir *parent, + int flags, int linkunlink=0, + snapid_t follows=CEPH_NOSNAP); + + // peers + void add_uncommitted_leader(metareqid_t reqid, LogSegment *ls, std::set<mds_rank_t> &peers, bool safe=false) { + uncommitted_leaders[reqid].ls = ls; + uncommitted_leaders[reqid].peers = peers; + uncommitted_leaders[reqid].safe = safe; + } + void wait_for_uncommitted_leader(metareqid_t reqid, MDSContext *c) { + uncommitted_leaders[reqid].waiters.push_back(c); + } + bool have_uncommitted_leader(metareqid_t reqid, mds_rank_t from) { + auto p = uncommitted_leaders.find(reqid); + return p != uncommitted_leaders.end() && p->second.peers.count(from) > 0; + } + void log_leader_commit(metareqid_t reqid); + void logged_leader_update(metareqid_t reqid); + void _logged_leader_commit(metareqid_t reqid); + void committed_leader_peer(metareqid_t r, mds_rank_t from); + void finish_committed_leaders(); + + void add_uncommitted_peer(metareqid_t reqid, LogSegment*, mds_rank_t, MDPeerUpdate *su=nullptr); + void wait_for_uncommitted_peer(metareqid_t reqid, MDSContext *c) { + uncommitted_peers.at(reqid).waiters.push_back(c); + } + void finish_uncommitted_peer(metareqid_t reqid, bool assert_exist=true); + MDPeerUpdate* get_uncommitted_peer(metareqid_t reqid, mds_rank_t leader); + void _logged_peer_commit(mds_rank_t from, metareqid_t reqid); + + void set_recovery_set(std::set<mds_rank_t>& s); + void handle_mds_failure(mds_rank_t who); + void handle_mds_recovery(mds_rank_t who); + + void recalc_auth_bits(bool replay); + void remove_inode_recursive(CInode *in); + + bool is_ambiguous_peer_update(metareqid_t reqid, mds_rank_t leader) { + auto p = ambiguous_peer_updates.find(leader); + return p != ambiguous_peer_updates.end() && p->second.count(reqid); + } + void add_ambiguous_peer_update(metareqid_t reqid, mds_rank_t leader) { + ambiguous_peer_updates[leader].insert(reqid); + } + void remove_ambiguous_peer_update(metareqid_t reqid, mds_rank_t leader) { + auto p = ambiguous_peer_updates.find(leader); + auto q = p->second.find(reqid); + ceph_assert(q != p->second.end()); + p->second.erase(q); + if (p->second.empty()) + ambiguous_peer_updates.erase(p); + } + + void add_rollback(metareqid_t reqid, mds_rank_t leader) { + resolve_need_rollback[reqid] = leader; + } + void finish_rollback(metareqid_t reqid, MDRequestRef& mdr); + + // ambiguous imports + void add_ambiguous_import(dirfrag_t base, const std::vector<dirfrag_t>& bounds); + void add_ambiguous_import(CDir *base, const std::set<CDir*>& bounds); + bool have_ambiguous_import(dirfrag_t base) { + return my_ambiguous_imports.count(base); + } + void get_ambiguous_import_bounds(dirfrag_t base, std::vector<dirfrag_t>& bounds) { + ceph_assert(my_ambiguous_imports.count(base)); + bounds = my_ambiguous_imports[base]; + } + void cancel_ambiguous_import(CDir *); + void finish_ambiguous_import(dirfrag_t dirino); + void resolve_start(MDSContext *resolve_done_); + void send_resolves(); + void maybe_send_pending_resolves() { + if (resolves_pending) + send_subtree_resolves(); + } + + void _move_subtree_map_bound(dirfrag_t df, dirfrag_t oldparent, dirfrag_t newparent, + std::map<dirfrag_t,std::vector<dirfrag_t> >& subtrees); + ESubtreeMap *create_subtree_map(); + + void clean_open_file_lists(); + void dump_openfiles(Formatter *f); + bool dump_inode(Formatter *f, uint64_t number); + + void rejoin_start(MDSContext *rejoin_done_); + void rejoin_gather_finish(); + void rejoin_send_rejoins(); + void rejoin_export_caps(inodeno_t ino, client_t client, const cap_reconnect_t& icr, + int target=-1, bool drop_path=false) { + auto& ex = cap_exports[ino]; + ex.first = target; + auto &_icr = ex.second[client] = icr; + if (drop_path) + _icr.path.clear(); + } + void rejoin_recovered_caps(inodeno_t ino, client_t client, const cap_reconnect_t& icr, + mds_rank_t frommds=MDS_RANK_NONE, bool drop_path=false) { + auto &_icr = cap_imports[ino][client][frommds] = icr; + if (drop_path) + _icr.path.clear(); + } + void rejoin_recovered_client(client_t client, const entity_inst_t& inst) { + rejoin_client_map.emplace(client, inst); + } + bool rejoin_has_cap_reconnect(inodeno_t ino) const { + return cap_imports.count(ino); + } + void add_replay_ino_alloc(inodeno_t ino) { + cap_imports_missing.insert(ino); // avoid opening ino during cache rejoin + } + const cap_reconnect_t *get_replay_cap_reconnect(inodeno_t ino, client_t client) { + if (cap_imports.count(ino) && + cap_imports[ino].count(client) && + cap_imports[ino][client].count(MDS_RANK_NONE)) { + return &cap_imports[ino][client][MDS_RANK_NONE]; + } + return NULL; + } + void remove_replay_cap_reconnect(inodeno_t ino, client_t client) { + ceph_assert(cap_imports[ino].size() == 1); + ceph_assert(cap_imports[ino][client].size() == 1); + cap_imports.erase(ino); + } + void wait_replay_cap_reconnect(inodeno_t ino, MDSContext *c) { + cap_reconnect_waiters[ino].push_back(c); + } + + void add_reconnected_cap(client_t client, inodeno_t ino, const cap_reconnect_t& icr) { + reconnected_cap_info_t &info = reconnected_caps[ino][client]; + info.realm_ino = inodeno_t(icr.capinfo.snaprealm); + info.snap_follows = icr.snap_follows; + } + void set_reconnected_dirty_caps(client_t client, inodeno_t ino, int dirty, bool snapflush) { + reconnected_cap_info_t &info = reconnected_caps[ino][client]; + info.dirty_caps |= dirty; + if (snapflush) + info.snapflush = snapflush; + } + void add_reconnected_snaprealm(client_t client, inodeno_t ino, snapid_t seq) { + reconnected_snaprealms[ino][client] = seq; + } + + void rejoin_open_ino_finish(inodeno_t ino, int ret); + void rejoin_prefetch_ino_finish(inodeno_t ino, int ret); + void rejoin_open_sessions_finish(std::map<client_t,std::pair<Session*,uint64_t> >& session_map); + bool process_imported_caps(); + void choose_lock_states_and_reconnect_caps(); + void prepare_realm_split(SnapRealm *realm, client_t client, inodeno_t ino, + std::map<client_t,ref_t<MClientSnap>>& splits); + void prepare_realm_merge(SnapRealm *realm, SnapRealm *parent_realm, std::map<client_t,ref_t<MClientSnap>>& splits); + void send_snaps(std::map<client_t,ref_t<MClientSnap>>& splits); + Capability* rejoin_import_cap(CInode *in, client_t client, const cap_reconnect_t& icr, mds_rank_t frommds); + void finish_snaprealm_reconnect(client_t client, SnapRealm *realm, snapid_t seq, + std::map<client_t,ref_t<MClientSnap>>& updates); + Capability* try_reconnect_cap(CInode *in, Session *session); + void export_remaining_imported_caps(); + + void do_cap_import(Session *session, CInode *in, Capability *cap, + uint64_t p_cap_id, ceph_seq_t p_seq, ceph_seq_t p_mseq, + int peer, int p_flags); + void do_delayed_cap_imports(); + void rebuild_need_snapflush(CInode *head_in, SnapRealm *realm, client_t client, + snapid_t snap_follows); + void open_snaprealms(); + + bool open_undef_inodes_dirfrags(); + void opened_undef_inode(CInode *in); + void opened_undef_dirfrag(CDir *dir) { + rejoin_undef_dirfrags.erase(dir); + } + + void reissue_all_caps(); + + void start_files_to_recover(); + void do_file_recover(); + void queue_file_recover(CInode *in); + void _queued_file_recover_cow(CInode *in, MutationRef& mut); + + void handle_conf_change(const std::set<std::string>& changed, const MDSMap& mds_map); + + // debug + void log_stat(); + + // root inode + CInode *get_root() { return root; } + CInode *get_myin() { return myin; } + + size_t get_cache_size() { return lru.lru_get_size(); } + + // trimming + std::pair<bool, uint64_t> trim(uint64_t count=0); + + bool trim_non_auth_subtree(CDir *directory); + void standby_trim_segment(LogSegment *ls); + void try_trim_non_auth_subtree(CDir *dir); + bool can_trim_non_auth_dirfrag(CDir *dir) { + return my_ambiguous_imports.count((dir)->dirfrag()) == 0 && + uncommitted_peer_rename_olddir.count(dir->inode) == 0; + } + + /** + * For all unreferenced inodes, dirs, dentries below an inode, compose + * expiry messages. This is used when giving up all replicas of entities + * for an MDS peer in the 'stopping' state, such that the peer can + * empty its cache and finish shutting down. + * + * We have to make sure we're only expiring un-referenced items to + * avoid interfering with ongoing stray-movement (we can't distinguish + * between the "moving my strays" and "waiting for my cache to empty" + * phases within 'stopping') + * + * @return false if we completed cleanly, true if caller should stop + * expiring because we hit something with refs. + */ + bool expire_recursive(CInode *in, expiremap& expiremap); + + void trim_client_leases(); + void check_memory_usage(); + + void shutdown_start(); + void shutdown_check(); + bool shutdown_pass(); + bool shutdown(); // clear cache (ie at shutodwn) + bool shutdown_export_strays(); + void shutdown_export_stray_finish(inodeno_t ino) { + if (shutdown_exporting_strays.erase(ino)) + shutdown_export_strays(); + } + + // inode_map + bool have_inode(vinodeno_t vino) { + if (vino.snapid == CEPH_NOSNAP) + return inode_map.count(vino.ino) ? true : false; + else + return snap_inode_map.count(vino) ? true : false; + } + bool have_inode(inodeno_t ino, snapid_t snap=CEPH_NOSNAP) { + return have_inode(vinodeno_t(ino, snap)); + } + CInode* get_inode(vinodeno_t vino) { + if (vino.snapid == CEPH_NOSNAP) { + auto p = inode_map.find(vino.ino); + if (p != inode_map.end()) + return p->second; + } else { + auto p = snap_inode_map.find(vino); + if (p != snap_inode_map.end()) + return p->second; + } + return NULL; + } + CInode* get_inode(inodeno_t ino, snapid_t s=CEPH_NOSNAP) { + return get_inode(vinodeno_t(ino, s)); + } + CInode* lookup_snap_inode(vinodeno_t vino) { + auto p = snap_inode_map.lower_bound(vino); + if (p != snap_inode_map.end() && + p->second->ino() == vino.ino && p->second->first <= vino.snapid) + return p->second; + return NULL; + } + + CDir* get_dirfrag(dirfrag_t df) { + CInode *in = get_inode(df.ino); + if (!in) + return NULL; + return in->get_dirfrag(df.frag); + } + CDir* get_dirfrag(inodeno_t ino, std::string_view dn) { + CInode *in = get_inode(ino); + if (!in) + return NULL; + frag_t fg = in->pick_dirfrag(dn); + return in->get_dirfrag(fg); + } + CDir* get_force_dirfrag(dirfrag_t df, bool replay) { + CInode *diri = get_inode(df.ino); + if (!diri) + return NULL; + CDir *dir = force_dir_fragment(diri, df.frag, replay); + if (!dir) + dir = diri->get_dirfrag(df.frag); + return dir; + } + + MDSCacheObject *get_object(const MDSCacheObjectInfo &info); + + void add_inode(CInode *in); + + void remove_inode(CInode *in); + + void touch_dentry(CDentry *dn) { + if (dn->state_test(CDentry::STATE_BOTTOMLRU)) { + bottom_lru.lru_midtouch(dn); + } else { + if (dn->is_auth()) + lru.lru_touch(dn); + else + lru.lru_midtouch(dn); + } + } + void touch_dentry_bottom(CDentry *dn) { + if (dn->state_test(CDentry::STATE_BOTTOMLRU)) + return; + lru.lru_bottouch(dn); + } + + // truncate + void truncate_inode(CInode *in, LogSegment *ls); + void _truncate_inode(CInode *in, LogSegment *ls); + void truncate_inode_finish(CInode *in, LogSegment *ls); + void truncate_inode_write_finish(CInode *in, LogSegment *ls, + uint32_t block_size); + void truncate_inode_logged(CInode *in, MutationRef& mut); + + void add_recovered_truncate(CInode *in, LogSegment *ls); + void remove_recovered_truncate(CInode *in, LogSegment *ls); + void start_recovered_truncates(); + + // purge unsafe inodes + void start_purge_inodes(); + void purge_inodes(const interval_set<inodeno_t>& i, LogSegment *ls); + + CDir *get_auth_container(CDir *in); + CDir *get_export_container(CDir *dir); + void find_nested_exports(CDir *dir, std::set<CDir*>& s); + void find_nested_exports_under(CDir *import, CDir *dir, std::set<CDir*>& s); + + void init_layouts(); + void create_unlinked_system_inode(CInode *in, inodeno_t ino, + int mode) const; + CInode *create_system_inode(inodeno_t ino, int mode); + CInode *create_root_inode(); + + void create_empty_hierarchy(MDSGather *gather); + void create_mydir_hierarchy(MDSGather *gather); + + bool is_open() { return open; } + void wait_for_open(MDSContext *c) { + waiting_for_open.push_back(c); + } + + void open_root_inode(MDSContext *c); + void open_root(); + void open_mydir_inode(MDSContext *c); + void open_mydir_frag(MDSContext *c); + void populate_mydir(); + + void _create_system_file(CDir *dir, std::string_view name, CInode *in, MDSContext *fin); + void _create_system_file_finish(MutationRef& mut, CDentry *dn, + version_t dpv, MDSContext *fin); + + void open_foreign_mdsdir(inodeno_t ino, MDSContext *c); + CDir *get_stray_dir(CInode *in); + + /** + * Find the given dentry (and whether it exists or not), its ancestors, + * and get them all into memory and usable on this MDS. This function + * makes a best-effort attempt to load everything; if it needs to + * go away and do something then it will put the request on a waitlist. + * It prefers the mdr, then the req, then the fin. (At least one of these + * must be non-null.) + * + * At least one of the params mdr, req, and fin must be non-null. + * + * @param mdr The MDRequest associated with the path. Can be null. + * @param cf A MDSContextFactory for waiter building. + * @param path The path to traverse to. + * + * @param flags Specifies different lookup behaviors. + * By default, path_traverse() forwards the request to the auth MDS if that + * is appropriate (ie, if it doesn't know the contents of a directory). + * MDS_TRAVERSE_DISCOVER: Instead of forwarding request, path_traverse() + * attempts to look up the path from a different MDS (and bring them into + * its cache as replicas). + * MDS_TRAVERSE_PATH_LOCKED: path_traverse() will procceed when xlocked + * dentry is encountered. + * MDS_TRAVERSE_WANT_DENTRY: Caller wants tail dentry. Add a null dentry if + * tail dentry does not exist. return 0 even tail dentry is null. + * MDS_TRAVERSE_WANT_INODE: Caller only wants target inode if it exists, or + * wants tail dentry if target inode does not exist and MDS_TRAVERSE_WANT_DENTRY + * is also set. + * MDS_TRAVERSE_WANT_AUTH: Always forward request to auth MDS of target inode + * or auth MDS of tail dentry (MDS_TRAVERSE_WANT_DENTRY is set). + * MDS_TRAVERSE_XLOCK_DENTRY: Caller wants to xlock tail dentry if MDS_TRAVERSE_WANT_INODE + * is not set or (MDS_TRAVERSE_WANT_INODE is set but target inode does not exist) + * + * @param pdnvec Data return parameter -- on success, contains a + * vector of dentries. On failure, is either empty or contains the + * full trace of traversable dentries. + * @param pin Data return parameter -- if successful, points to the inode + * associated with filepath. If unsuccessful, is null. + * + * @returns 0 on success, 1 on "not done yet", 2 on "forwarding", -errno otherwise. + * If it returns 1, the requester associated with this call has been placed + * on the appropriate waitlist, and it should unwind itself and back out. + * If it returns 2 the request has been forwarded, and again the requester + * should unwind itself and back out. + */ + int path_traverse(MDRequestRef& mdr, MDSContextFactory& cf, + const filepath& path, int flags, + std::vector<CDentry*> *pdnvec, CInode **pin=nullptr); + + int maybe_request_forward_to_auth(MDRequestRef& mdr, MDSContextFactory& cf, + MDSCacheObject *p); + + CInode *cache_traverse(const filepath& path); + + void open_remote_dirfrag(CInode *diri, frag_t fg, MDSContext *fin); + CInode *get_dentry_inode(CDentry *dn, MDRequestRef& mdr, bool projected=false); + + bool parallel_fetch(std::map<inodeno_t,filepath>& pathmap, std::set<inodeno_t>& missing); + bool parallel_fetch_traverse_dir(inodeno_t ino, filepath& path, + std::set<CDir*>& fetch_queue, std::set<inodeno_t>& missing, + C_GatherBuilder &gather_bld); + + void open_remote_dentry(CDentry *dn, bool projected, MDSContext *fin, + bool want_xlocked=false); + void _open_remote_dentry_finish(CDentry *dn, inodeno_t ino, MDSContext *fin, + bool want_xlocked, int r); + + void make_trace(std::vector<CDentry*>& trace, CInode *in); + + void open_ino(inodeno_t ino, int64_t pool, MDSContext *fin, + bool want_replica=true, bool want_xlocked=false, + std::vector<inode_backpointer_t> *ancestors_hint=nullptr, + mds_rank_t auth_hint=MDS_RANK_NONE); + void open_ino_batch_start(); + void open_ino_batch_submit(); + void kick_open_ino_peers(mds_rank_t who); + + void find_ino_peers(inodeno_t ino, MDSContext *c, + mds_rank_t hint=MDS_RANK_NONE, bool path_locked=false); + void _do_find_ino_peer(find_ino_peer_info_t& fip); + void handle_find_ino(const cref_t<MMDSFindIno> &m); + void handle_find_ino_reply(const cref_t<MMDSFindInoReply> &m); + void kick_find_ino_peers(mds_rank_t who); + + SnapRealm *get_global_snaprealm() const { return global_snaprealm; } + void create_global_snaprealm(); + void do_realm_invalidate_and_update_notify(CInode *in, int snapop, bool notify_clients=true); + void send_snap_update(CInode *in, version_t stid, int snap_op); + void handle_snap_update(const cref_t<MMDSSnapUpdate> &m); + void notify_global_snaprealm_update(int snap_op); + + // -- stray -- + void fetch_backtrace(inodeno_t ino, int64_t pool, bufferlist& bl, Context *fin); + uint64_t get_num_strays() const { return stray_manager.get_num_strays(); } + + // == messages == + void dispatch(const cref_t<Message> &m); + + void encode_replica_dir(CDir *dir, mds_rank_t to, bufferlist& bl); + void encode_replica_dentry(CDentry *dn, mds_rank_t to, bufferlist& bl); + void encode_replica_inode(CInode *in, mds_rank_t to, bufferlist& bl, + uint64_t features); + + void decode_replica_dir(CDir *&dir, bufferlist::const_iterator& p, CInode *diri, mds_rank_t from, MDSContext::vec& finished); + void decode_replica_dentry(CDentry *&dn, bufferlist::const_iterator& p, CDir *dir, MDSContext::vec& finished); + void decode_replica_inode(CInode *&in, bufferlist::const_iterator& p, CDentry *dn, MDSContext::vec& finished); + + void encode_replica_stray(CDentry *straydn, mds_rank_t who, bufferlist& bl); + void decode_replica_stray(CDentry *&straydn, CInode **in, const bufferlist &bl, mds_rank_t from); + + // -- namespace -- + void encode_remote_dentry_link(CDentry::linkage_t *dnl, bufferlist& bl); + void decode_remote_dentry_link(CDir *dir, CDentry *dn, bufferlist::const_iterator& p); + void send_dentry_link(CDentry *dn, MDRequestRef& mdr); + void send_dentry_unlink(CDentry *dn, CDentry *straydn, MDRequestRef& mdr); + + void wait_for_uncommitted_fragment(dirfrag_t dirfrag, MDSContext *c) { + uncommitted_fragments.at(dirfrag).waiters.push_back(c); + } + bool is_any_uncommitted_fragment() const { + return !uncommitted_fragments.empty(); + } + void wait_for_uncommitted_fragments(MDSContext* finisher); + void rollback_uncommitted_fragments(); + + void split_dir(CDir *dir, int byn); + void merge_dir(CInode *diri, frag_t fg); + + void find_stale_fragment_freeze(); + void fragment_freeze_inc_num_waiters(CDir *dir); + bool fragment_are_all_frozen(CDir *dir); + int get_num_fragmenting_dirs() { return fragments.size(); } + + // -- updates -- + //int send_inode_updates(CInode *in); + //void handle_inode_update(MInodeUpdate *m); + + int send_dir_updates(CDir *in, bool bcast=false); + void handle_dir_update(const cref_t<MDirUpdate> &m); + + // -- cache expiration -- + void handle_cache_expire(const cref_t<MCacheExpire> &m); + void process_delayed_expire(CDir *dir); + void discard_delayed_expire(CDir *dir); + + // -- mdsmap -- + void handle_mdsmap(const MDSMap &mdsmap, const MDSMap &oldmap); + + int dump_cache() { return dump_cache({}, nullptr, 0); } + int dump_cache(std::string_view filename, double timeout); + int dump_cache(Formatter *f, double timeout); + void dump_tree(CInode *in, const int cur_depth, const int max_depth, Formatter *f); + + void cache_status(Formatter *f); + + void dump_resolve_status(Formatter *f) const; + void dump_rejoin_status(Formatter *f) const; + + // == crap fns == + void show_cache(); + void show_subtrees(int dbl=10, bool force_print=false); + + CInode *hack_pick_random_inode() { + ceph_assert(!inode_map.empty()); + int n = rand() % inode_map.size(); + auto p = inode_map.begin(); + while (n--) ++p; + return p->second; + } + + void flush_dentry(std::string_view path, Context *fin); + /** + * Create and start an OP_ENQUEUE_SCRUB + */ + void enqueue_scrub(std::string_view path, std::string_view tag, + bool force, bool recursive, bool repair, + bool scrub_mdsdir, Formatter *f, Context *fin); + void repair_inode_stats(CInode *diri); + void repair_dirfrag_stats(CDir *dir); + void rdlock_dirfrags_stats(CInode *diri, MDSInternalContext *fin); + + // my leader + MDSRank *mds; + + // -- my cache -- + LRU lru; // dentry lru for expiring items from cache + LRU bottom_lru; // dentries that should be trimmed ASAP + + DecayRate decayrate; + + int num_shadow_inodes = 0; + + int num_inodes_with_caps = 0; + + unsigned max_dir_commit_size; + + file_layout_t default_file_layout; + file_layout_t default_log_layout; + + // -- client leases -- + static constexpr std::size_t client_lease_pools = 3; + std::array<float, client_lease_pools> client_lease_durations{5.0, 30.0, 300.0}; + + // -- client caps -- + uint64_t last_cap_id = 0; + + std::map<ceph_tid_t, discover_info_t> discovers; + ceph_tid_t discover_last_tid = 0; + + // waiters + std::map<int, std::map<inodeno_t, MDSContext::vec > > waiting_for_base_ino; + + std::map<inodeno_t,std::map<client_t, reconnected_cap_info_t> > reconnected_caps; // inode -> client -> snap_follows,realmino + std::map<inodeno_t,std::map<client_t, snapid_t> > reconnected_snaprealms; // realmino -> client -> realmseq + + // realm inodes + std::set<CInode*> rejoin_pending_snaprealms; + // cap imports. delayed snap parent opens. + std::map<client_t,std::set<CInode*> > delayed_imported_caps; + + // subsystems + std::unique_ptr<Migrator> migrator; + + bool did_shutdown_log_cap = false; + + std::map<ceph_tid_t, find_ino_peer_info_t> find_ino_peer; + ceph_tid_t find_ino_peer_last_tid = 0; + + // delayed cache expire + std::map<CDir*, expiremap> delayed_expire; // subtree root -> expire msg + + /* Because exports may fail, this set lets us keep track of inodes that need exporting. */ + std::set<CInode *> export_pin_queue; + std::set<CInode *> export_pin_delayed_queue; + std::set<CInode *> export_ephemeral_pins; + + OpenFileTable open_file_table; + + double export_ephemeral_random_max = 0.0; + + protected: + // track leader requests whose peers haven't acknowledged commit + struct uleader { + uleader() {} + std::set<mds_rank_t> peers; + LogSegment *ls = nullptr; + MDSContext::vec waiters; + bool safe = false; + bool committing = false; + bool recovering = false; + }; + + struct upeer { + upeer() {} + mds_rank_t leader; + LogSegment *ls = nullptr; + MDPeerUpdate *su = nullptr; + MDSContext::vec waiters; + }; + + struct open_ino_info_t { + open_ino_info_t() {} + std::vector<inode_backpointer_t> ancestors; + std::set<mds_rank_t> checked; + mds_rank_t checking = MDS_RANK_NONE; + mds_rank_t auth_hint = MDS_RANK_NONE; + bool check_peers = true; + bool fetch_backtrace = true; + bool discover = false; + bool want_replica = false; + bool want_xlocked = false; + version_t tid = 0; + int64_t pool = -1; + int last_err = 0; + MDSContext::vec waiters; + }; + + ceph_tid_t open_ino_last_tid = 0; + std::map<inodeno_t,open_ino_info_t> opening_inodes; + + bool open_ino_batch = false; + std::map<CDir*, std::pair<std::vector<std::string>, MDSContext::vec> > open_ino_batched_fetch; + + friend struct C_MDC_OpenInoTraverseDir; + friend struct C_MDC_OpenInoParentOpened; + friend struct C_MDC_RetryScanStray; + + friend class C_IO_MDC_OpenInoBacktraceFetched; + friend class C_MDC_Join; + friend class C_MDC_RespondInternalRequest; + + friend class EPeerUpdate; + friend class ECommitted; + + void set_readonly() { readonly = true; } + + void handle_resolve(const cref_t<MMDSResolve> &m); + void handle_resolve_ack(const cref_t<MMDSResolveAck> &m); + void process_delayed_resolve(); + void discard_delayed_resolve(mds_rank_t who); + void maybe_resolve_finish(); + void disambiguate_my_imports(); + void disambiguate_other_imports(); + void trim_unlinked_inodes(); + + void send_peer_resolves(); + void send_subtree_resolves(); + void maybe_finish_peer_resolve(); + + void rejoin_walk(CDir *dir, const ref_t<MMDSCacheRejoin> &rejoin); + void handle_cache_rejoin(const cref_t<MMDSCacheRejoin> &m); + void handle_cache_rejoin_weak(const cref_t<MMDSCacheRejoin> &m); + CInode* rejoin_invent_inode(inodeno_t ino, snapid_t last); + CDir* rejoin_invent_dirfrag(dirfrag_t df); + void handle_cache_rejoin_strong(const cref_t<MMDSCacheRejoin> &m); + void rejoin_scour_survivor_replicas(mds_rank_t from, const cref_t<MMDSCacheRejoin> &ack, + std::set<vinodeno_t>& acked_inodes, + std::set<SimpleLock *>& gather_locks); + void handle_cache_rejoin_ack(const cref_t<MMDSCacheRejoin> &m); + void rejoin_send_acks(); + void rejoin_trim_undef_inodes(); + void maybe_send_pending_rejoins() { + if (rejoins_pending) + rejoin_send_rejoins(); + } + + void touch_inode(CInode *in) { + if (in->get_parent_dn()) + touch_dentry(in->get_projected_parent_dn()); + } + + void inode_remove_replica(CInode *in, mds_rank_t rep, bool rejoin, + std::set<SimpleLock *>& gather_locks); + void dentry_remove_replica(CDentry *dn, mds_rank_t rep, std::set<SimpleLock *>& gather_locks); + + void rename_file(CDentry *srcdn, CDentry *destdn); + + void _open_ino_backtrace_fetched(inodeno_t ino, bufferlist& bl, int err); + void _open_ino_parent_opened(inodeno_t ino, int ret); + void _open_ino_traverse_dir(inodeno_t ino, open_ino_info_t& info, int err); + void _open_ino_fetch_dir(inodeno_t ino, const cref_t<MMDSOpenIno> &m, bool parent, + CDir *dir, std::string_view dname); + int open_ino_traverse_dir(inodeno_t ino, const cref_t<MMDSOpenIno> &m, + const std::vector<inode_backpointer_t>& ancestors, + bool discover, bool want_xlocked, mds_rank_t *hint); + void open_ino_finish(inodeno_t ino, open_ino_info_t& info, int err); + void do_open_ino(inodeno_t ino, open_ino_info_t& info, int err); + void do_open_ino_peer(inodeno_t ino, open_ino_info_t& info); + void handle_open_ino(const cref_t<MMDSOpenIno> &m, int err=0); + void handle_open_ino_reply(const cref_t<MMDSOpenInoReply> &m); + + void scan_stray_dir(dirfrag_t next=dirfrag_t()); + // -- replicas -- + void handle_discover(const cref_t<MDiscover> &dis); + void handle_discover_reply(const cref_t<MDiscoverReply> &m); + void handle_dentry_link(const cref_t<MDentryLink> &m); + void handle_dentry_unlink(const cref_t<MDentryUnlink> &m); + + int dump_cache(std::string_view fn, Formatter *f, double timeout); + + void flush_dentry_work(MDRequestRef& mdr); + /** + * Resolve path to a dentry and pass it onto the ScrubStack. + * + * TODO: return enough information to the original mdr formatter + * and completion that they can subsequeuntly check the progress of + * this scrub (we won't block them on a whole scrub as it can take a very + * long time) + */ + void enqueue_scrub_work(MDRequestRef& mdr); + void repair_inode_stats_work(MDRequestRef& mdr); + void repair_dirfrag_stats_work(MDRequestRef& mdr); + void rdlock_dirfrags_stats_work(MDRequestRef& mdr); + + ceph::unordered_map<inodeno_t,CInode*> inode_map; // map of head inodes by ino + std::map<vinodeno_t, CInode*> snap_inode_map; // map of snap inodes by ino + CInode *root = nullptr; // root inode + CInode *myin = nullptr; // .ceph/mds%d dir + + bool readonly = false; + + int stray_index = 0; + int stray_fragmenting_index = -1; + + std::set<CInode*> base_inodes; + + std::unique_ptr<PerfCounters> logger; + + Filer filer; + std::array<xlist<ClientLease*>, client_lease_pools> client_leases{}; + + /* subtree keys and each tree's non-recursive nested subtrees (the "bounds") */ + std::map<CDir*,std::set<CDir*> > subtrees; + std::map<CInode*,std::list<std::pair<CDir*,CDir*> > > projected_subtree_renames; // renamed ino -> target dir + + // -- requests -- + ceph::unordered_map<metareqid_t, MDRequestRef> active_requests; + + // -- recovery -- + std::set<mds_rank_t> recovery_set; + + // [resolve] + // from EImportStart w/o EImportFinish during journal replay + std::map<dirfrag_t, std::vector<dirfrag_t> > my_ambiguous_imports; + // from MMDSResolves + std::map<mds_rank_t, std::map<dirfrag_t, std::vector<dirfrag_t> > > other_ambiguous_imports; + + std::map<CInode*, int> uncommitted_peer_rename_olddir; // peer: preserve the non-auth dir until seeing commit. + std::map<CInode*, int> uncommitted_peer_unlink; // peer: preserve the unlinked inode until seeing commit. + + std::map<metareqid_t, uleader> uncommitted_leaders; // leader: req -> peer set + std::map<metareqid_t, upeer> uncommitted_peers; // peer: preserve the peer req until seeing commit. + + std::set<metareqid_t> pending_leaders; + std::map<int, std::set<metareqid_t> > ambiguous_peer_updates; + + bool resolves_pending = false; + std::set<mds_rank_t> resolve_gather; // nodes i need resolves from + std::set<mds_rank_t> resolve_ack_gather; // nodes i need a resolve_ack from + std::set<version_t> resolve_snapclient_commits; + std::map<metareqid_t, mds_rank_t> resolve_need_rollback; // rollbacks i'm writing to the journal + std::map<mds_rank_t, cref_t<MMDSResolve>> delayed_resolve; + + // [rejoin] + bool rejoins_pending = false; + std::set<mds_rank_t> rejoin_gather; // nodes from whom i need a rejoin + std::set<mds_rank_t> rejoin_sent; // nodes i sent a rejoin to + std::set<mds_rank_t> rejoin_ack_sent; // nodes i sent a rejoin to + std::set<mds_rank_t> rejoin_ack_gather; // nodes from whom i need a rejoin ack + std::map<mds_rank_t,std::map<inodeno_t,std::map<client_t,Capability::Import> > > rejoin_imported_caps; + std::map<inodeno_t,std::pair<mds_rank_t,std::map<client_t,Capability::Export> > > rejoin_peer_exports; + + std::map<client_t,entity_inst_t> rejoin_client_map; + std::map<client_t,client_metadata_t> rejoin_client_metadata_map; + std::map<client_t,std::pair<Session*,uint64_t> > rejoin_session_map; + + std::map<inodeno_t,std::pair<mds_rank_t,std::map<client_t,cap_reconnect_t> > > cap_exports; // ino -> target, client -> capex + + std::map<inodeno_t,std::map<client_t,std::map<mds_rank_t,cap_reconnect_t> > > cap_imports; // ino -> client -> frommds -> capex + std::set<inodeno_t> cap_imports_missing; + std::map<inodeno_t, MDSContext::vec > cap_reconnect_waiters; + int cap_imports_num_opening = 0; + + std::set<CInode*> rejoin_undef_inodes; + std::set<CInode*> rejoin_potential_updated_scatterlocks; + std::set<CDir*> rejoin_undef_dirfrags; + std::map<mds_rank_t, std::set<CInode*> > rejoin_unlinked_inodes; + + std::vector<CInode*> rejoin_recover_q, rejoin_check_q; + std::list<SimpleLock*> rejoin_eval_locks; + MDSContext::vec rejoin_waiters; + + std::unique_ptr<MDSContext> rejoin_done; + std::unique_ptr<MDSContext> resolve_done; + + StrayManager stray_manager; + + private: + std::set<inodeno_t> replay_taken_inos; // the inos have been taken when replaying + + // -- fragmenting -- + struct ufragment { + ufragment() {} + int bits = 0; + bool committed = false; + LogSegment *ls = nullptr; + MDSContext::vec waiters; + frag_vec_t old_frags; + bufferlist rollback; + }; + + struct fragment_info_t { + fragment_info_t() {} + bool is_fragmenting() { return !resultfrags.empty(); } + uint64_t get_tid() { return mdr ? mdr->reqid.tid : 0; } + int bits; + std::vector<CDir*> dirs; + std::vector<CDir*> resultfrags; + MDRequestRef mdr; + std::set<mds_rank_t> notify_ack_waiting; + bool finishing = false; + + // for deadlock detection + bool all_frozen = false; + utime_t last_cum_auth_pins_change; + int last_cum_auth_pins = 0; + int num_remote_waiters = 0; // number of remote authpin waiters + }; + + typedef std::map<dirfrag_t,fragment_info_t>::iterator fragment_info_iterator; + + friend class EFragment; + friend class C_MDC_FragmentFrozen; + friend class C_MDC_FragmentMarking; + friend class C_MDC_FragmentPrep; + friend class C_MDC_FragmentStore; + friend class C_MDC_FragmentCommit; + friend class C_MDC_FragmentRollback; + friend class C_IO_MDC_FragmentPurgeOld; + + // -- subtrees -- + static const unsigned int SUBTREES_COUNT_THRESHOLD = 5; + static const unsigned int SUBTREES_DEPTH_THRESHOLD = 5; + + CInode *get_stray() { + return strays[stray_index]; + } + + void identify_files_to_recover(); + + std::pair<bool, uint64_t> trim_lru(uint64_t count, expiremap& expiremap); + bool trim_dentry(CDentry *dn, expiremap& expiremap); + void trim_dirfrag(CDir *dir, CDir *con, expiremap& expiremap); + bool trim_inode(CDentry *dn, CInode *in, CDir *con, expiremap&); + void send_expire_messages(expiremap& expiremap); + void trim_non_auth(); // trim out trimmable non-auth items + + void adjust_dir_fragments(CInode *diri, frag_t basefrag, int bits, + std::vector<CDir*>* frags, MDSContext::vec& waiters, bool replay); + void adjust_dir_fragments(CInode *diri, + const std::vector<CDir*>& srcfrags, + frag_t basefrag, int bits, + std::vector<CDir*>* resultfrags, + MDSContext::vec& waiters, + bool replay); + CDir *force_dir_fragment(CInode *diri, frag_t fg, bool replay=true); + void get_force_dirfrag_bound_set(const std::vector<dirfrag_t>& dfs, std::set<CDir*>& bounds); + + bool can_fragment(CInode *diri, const std::vector<CDir*>& dirs); + void fragment_freeze_dirs(const std::vector<CDir*>& dirs); + void fragment_mark_and_complete(MDRequestRef& mdr); + void fragment_frozen(MDRequestRef& mdr, int r); + void fragment_unmark_unfreeze_dirs(const std::vector<CDir*>& dirs); + void fragment_drop_locks(fragment_info_t &info); + void fragment_maybe_finish(const fragment_info_iterator& it); + void dispatch_fragment_dir(MDRequestRef& mdr); + void _fragment_logged(MDRequestRef& mdr); + void _fragment_stored(MDRequestRef& mdr); + void _fragment_committed(dirfrag_t f, const MDRequestRef& mdr); + void _fragment_old_purged(dirfrag_t f, int bits, const MDRequestRef& mdr); + + void handle_fragment_notify(const cref_t<MMDSFragmentNotify> &m); + void handle_fragment_notify_ack(const cref_t<MMDSFragmentNotifyAck> &m); + + void add_uncommitted_fragment(dirfrag_t basedirfrag, int bits, const frag_vec_t& old_frag, + LogSegment *ls, bufferlist *rollback=NULL); + void finish_uncommitted_fragment(dirfrag_t basedirfrag, int op); + void rollback_uncommitted_fragment(dirfrag_t basedirfrag, frag_vec_t&& old_frags); + + void upkeep_main(void); + + uint64_t cache_memory_limit; + double cache_reservation; + double cache_health_threshold; + std::array<CInode *, NUM_STRAY> strays{}; // my stray dir + + bool export_ephemeral_distributed_config; + bool export_ephemeral_random_config; + unsigned export_ephemeral_dist_frag_bits; + + // Stores the symlink target on the file object's head + bool symlink_recovery; + + // File size recovery + RecoveryQueue recovery_queue; + + // shutdown + std::set<inodeno_t> shutdown_exporting_strays; + std::pair<dirfrag_t, std::string> shutdown_export_next; + + bool opening_root = false, open = false; + MDSContext::vec waiting_for_open; + + // -- snaprealms -- + SnapRealm *global_snaprealm = nullptr; + + std::map<dirfrag_t, ufragment> uncommitted_fragments; + + std::map<dirfrag_t,fragment_info_t> fragments; + + DecayCounter trim_counter; + + std::thread upkeeper; + ceph::mutex upkeep_mutex = ceph::make_mutex("MDCache::upkeep_mutex"); + ceph::condition_variable upkeep_cvar; + time upkeep_last_trim = time::min(); + time upkeep_last_release = time::min(); + std::atomic<bool> upkeep_trim_shutdown{false}; +}; + +class C_MDS_RetryRequest : public MDSInternalContext { + MDCache *cache; + MDRequestRef mdr; + public: + C_MDS_RetryRequest(MDCache *c, MDRequestRef& r) : + MDSInternalContext(c->mds), cache(c), mdr(r) {} + void finish(int r) override; +}; + +class CF_MDS_RetryRequestFactory : public MDSContextFactory { +public: + CF_MDS_RetryRequestFactory(MDCache *cache, MDRequestRef &mdr, bool dl) : + mdcache(cache), mdr(mdr), drop_locks(dl) {} + MDSContext *build() override; +private: + MDCache *mdcache; + MDRequestRef mdr; + bool drop_locks; +}; + +/** + * Only for contexts called back from an I/O completion + * + * Note: duplication of members wrt MDCacheContext, because + * it'ls the lesser of two evils compared with introducing + * yet another piece of (multiple) inheritance. + */ +class MDCacheIOContext : public virtual MDSIOContextBase { +protected: + MDCache *mdcache; + MDSRank *get_mds() override + { + ceph_assert(mdcache != NULL); + return mdcache->mds; + } +public: + explicit MDCacheIOContext(MDCache *mdc_, bool track=true) : + MDSIOContextBase(track), mdcache(mdc_) {} +}; + +#endif |