// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */

#include "include/int_types.h"
#include "common/errno.h"

#include <string>

#include "CInode.h"
#include "CDir.h"
#include "CDentry.h"

#include "MDSRank.h"
#include "MDCache.h"
#include "MDLog.h"
#include "Locker.h"
#include "Mutation.h"

#include "events/EUpdate.h"

#include "osdc/Objecter.h"

#include "snap.h"

#include "LogSegment.h"

#include "common/Clock.h"
#include "common/config.h"
#include "global/global_context.h"
#include "include/ceph_assert.h"

#include "mds/MDSContinuation.h"
#include "mds/InoTable.h"
#include "cephfs_features.h"
#include "osdc/Objecter.h"

#define dout_context g_ceph_context
#define dout_subsys ceph_subsys_mds
#undef dout_prefix
#define dout_prefix *_dout << "mds." 
<< mdcache->mds->get_nodeid() << ".cache.ino(" << ino() << ") "

// Fill 'op' with the rados operations that persist this inode's backtrace
// (and, for the current pool, its layout) as object xattrs.
void CInodeCommitOperation::update(ObjectOperation &op, inode_backtrace_t &bt) {
  using ceph::encode;

  op.priority = priority;
  op.create(false);

  bufferlist parent_bl;
  encode(bt, parent_bl);
  op.setxattr("parent", parent_bl);

  // for the old pool there is no need to update the layout
  if (!update_layout)
    return;

  bufferlist layout_bl;
  encode(_layout, layout_bl, _features);
  op.setxattr("layout", layout_bl);
}

// Base class for IO completions that run against a CInode; routes the
// completion back to the owning MDSRank.
class CInodeIOContext : public MDSIOContextBase
{
protected:
  CInode *in;
  MDSRank *get_mds() override {return in->mdcache->mds;}
public:
  explicit CInodeIOContext(CInode *in_) : in(in_) {
    ceph_assert(in != NULL);
  }
};

// Sentinel meaning "no srnode projected in this slot" (distinct from a
// projected null srnode, which means the snaprealm is being removed).
sr_t* const CInode::projected_inode::UNDEF_SRNODE = (sr_t*)(unsigned long)-1;

// Per-lock-type singletons shared by every CInode's SimpleLock members.
LockType CInode::versionlock_type(CEPH_LOCK_IVERSION);
LockType CInode::authlock_type(CEPH_LOCK_IAUTH);
LockType CInode::linklock_type(CEPH_LOCK_ILINK);
LockType CInode::dirfragtreelock_type(CEPH_LOCK_IDFT);
LockType CInode::filelock_type(CEPH_LOCK_IFILE);
LockType CInode::xattrlock_type(CEPH_LOCK_IXATTR);
LockType CInode::snaplock_type(CEPH_LOCK_ISNAP);
LockType CInode::nestlock_type(CEPH_LOCK_INEST);
LockType CInode::flocklock_type(CEPH_LOCK_IFLOCK);
LockType CInode::policylock_type(CEPH_LOCK_IPOLICY);

// Human-readable name for a CInode-specific pin id; falls back to the
// generic MDSCacheObject pin names for ids not listed here.
std::string_view CInode::pin_name(int p) const
{
  switch (p) {
    case PIN_DIRFRAG: return "dirfrag";
    case PIN_CAPS: return "caps";
    case PIN_IMPORTING: return "importing";
    case PIN_OPENINGDIR: return "openingdir";
    case PIN_REMOTEPARENT: return "remoteparent";
    case PIN_BATCHOPENJOURNAL: return "batchopenjournal";
    case PIN_SCATTERED: return "scattered";
    case PIN_STICKYDIRS: return "stickydirs";
    //case PIN_PURGING: return "purging";
    case PIN_FREEZING: return "freezing";
    case PIN_FROZEN: return "frozen";
    case PIN_IMPORTINGCAPS: return "importingcaps";
    case PIN_EXPORTINGCAPS: return "exportingcaps";
    case PIN_PASTSNAPPARENT: return "pastsnapparent";
    case PIN_OPENINGSNAPPARENTS: return "openingsnapparents";
    case PIN_TRUNCATING: 
return "truncating"; case PIN_STRAY: return "stray"; case PIN_NEEDSNAPFLUSH: return "needsnapflush"; case PIN_DIRTYRSTAT: return "dirtyrstat"; case PIN_DIRTYPARENT: return "dirtyparent"; case PIN_DIRWAITER: return "dirwaiter"; default: return generic_pin_name(p); } } //int cinode_pins[CINODE_NUM_PINS]; // counts ostream& CInode::print_db_line_prefix(ostream& out) { return out << ceph_clock_now() << " mds." << mdcache->mds->get_nodeid() << ".cache.ino(" << ino() << ") "; } /* * write caps and lock ids */ struct cinode_lock_info_t cinode_lock_info[] = { { CEPH_LOCK_IFILE, CEPH_CAP_ANY_FILE_WR }, { CEPH_LOCK_IAUTH, CEPH_CAP_AUTH_EXCL }, { CEPH_LOCK_ILINK, CEPH_CAP_LINK_EXCL }, { CEPH_LOCK_IXATTR, CEPH_CAP_XATTR_EXCL }, }; int num_cinode_locks = sizeof(cinode_lock_info) / sizeof(cinode_lock_info[0]); ostream& operator<<(ostream& out, const CInode& in) { string path; in.make_path_string(path, true); out << "[inode " << in.ino(); out << " [" << (in.is_multiversion() ? "...":"") << in.first << "," << in.last << "]"; out << " " << path << (in.is_dir() ? "/":""); if (in.is_auth()) { out << " auth"; if (in.is_replicated()) out << in.get_replicas(); } else { mds_authority_t a = in.authority(); out << " rep@" << a.first; if (a.second != CDIR_AUTH_UNKNOWN) out << "," << a.second; out << "." 
<< in.get_replica_nonce(); } if (in.is_symlink()) out << " symlink='" << in.symlink << "'"; if (in.is_dir() && !in.dirfragtree.empty()) out << " " << in.dirfragtree; out << " v" << in.get_version(); if (in.get_projected_version() > in.get_version()) out << " pv" << in.get_projected_version(); if (in.get_num_auth_pins()) { out << " ap=" << in.get_num_auth_pins(); #ifdef MDS_AUTHPIN_SET in.print_authpin_set(out); #endif } if (in.snaprealm) out << " snaprealm=" << in.snaprealm; if (in.state_test(CInode::STATE_AMBIGUOUSAUTH)) out << " AMBIGAUTH"; if (in.state_test(CInode::STATE_NEEDSRECOVER)) out << " NEEDSRECOVER"; if (in.state_test(CInode::STATE_RECOVERING)) out << " RECOVERING"; if (in.state_test(CInode::STATE_DIRTYPARENT)) out << " DIRTYPARENT"; if (in.state_test(CInode::STATE_MISSINGOBJS)) out << " MISSINGOBJS"; if (in.is_ephemeral_dist()) out << " DISTEPHEMERALPIN"; if (in.is_ephemeral_rand()) out << " RANDEPHEMERALPIN"; if (in.is_freezing_inode()) out << " FREEZING=" << in.auth_pin_freeze_allowance; if (in.is_frozen_inode()) out << " FROZEN"; if (in.is_frozen_auth_pin()) out << " FROZEN_AUTHPIN"; const auto& pi = in.get_projected_inode(); if (pi->is_truncating()) out << " truncating(" << pi->truncate_from << " to " << pi->truncate_size << ")"; if (in.is_dir()) { out << " " << in.get_inode()->dirstat; if (g_conf()->mds_debug_scatterstat && in.is_projected()) { out << "->" << pi->dirstat; } } else { out << " s=" << in.get_inode()->size; if (in.get_inode()->nlink != 1) out << " nl=" << in.get_inode()->nlink; } // rstat out << " " << in.get_inode()->rstat; if (!(in.get_inode()->rstat == in.get_inode()->accounted_rstat)) out << "/" << in.get_inode()->accounted_rstat; if (g_conf()->mds_debug_scatterstat && in.is_projected()) { out << "->" << pi->rstat; if (!(pi->rstat == pi->accounted_rstat)) out << "/" << pi->accounted_rstat; } if (in.is_any_old_inodes()) { out << " old_inodes=" << in.get_old_inodes()->size(); } if (!in.client_need_snapflush.empty()) out << " 
need_snapflush=" << in.client_need_snapflush; // locks if (!in.authlock.is_sync_and_unlocked()) out << " " << in.authlock; if (!in.linklock.is_sync_and_unlocked()) out << " " << in.linklock; if (in.get_inode()->is_dir()) { if (!in.dirfragtreelock.is_sync_and_unlocked()) out << " " << in.dirfragtreelock; if (!in.snaplock.is_sync_and_unlocked()) out << " " << in.snaplock; if (!in.nestlock.is_sync_and_unlocked()) out << " " << in.nestlock; if (!in.policylock.is_sync_and_unlocked()) out << " " << in.policylock; } else { if (!in.flocklock.is_sync_and_unlocked()) out << " " << in.flocklock; } if (!in.filelock.is_sync_and_unlocked()) out << " " << in.filelock; if (!in.xattrlock.is_sync_and_unlocked()) out << " " << in.xattrlock; if (!in.versionlock.is_sync_and_unlocked()) out << " " << in.versionlock; // hack: spit out crap on which clients have caps if (in.get_inode()->client_ranges.size()) out << " cr=" << in.get_inode()->client_ranges; if (!in.get_client_caps().empty()) { out << " caps={"; bool first = true; for (const auto &p : in.get_client_caps()) { if (!first) out << ","; out << p.first << "=" << ccap_string(p.second.pending()); if (p.second.issued() != p.second.pending()) out << "/" << ccap_string(p.second.issued()); out << "/" << ccap_string(p.second.wanted()) << "@" << p.second.get_last_seq(); first = false; } out << "}"; if (in.get_loner() >= 0 || in.get_wanted_loner() >= 0) { out << ",l=" << in.get_loner(); if (in.get_loner() != in.get_wanted_loner()) out << "(" << in.get_wanted_loner() << ")"; } } if (!in.get_mds_caps_wanted().empty()) { out << " mcw={"; bool first = true; for (const auto &p : in.get_mds_caps_wanted()) { if (!first) out << ','; out << p.first << '=' << ccap_string(p.second); first = false; } out << '}'; } if (in.get_num_ref()) { out << " |"; in.print_pin_set(out); } if (in.get_inode()->export_pin != MDS_RANK_NONE) { out << " export_pin=" << in.get_inode()->export_pin; } if (in.state_test(CInode::STATE_DISTEPHEMERALPIN)) { out << " distepin"; 
} if (in.state_test(CInode::STATE_RANDEPHEMERALPIN)) { out << " randepin"; } out << " " << ∈ out << "]"; return out; } CInode::CInode(MDCache *c, bool auth, snapid_t f, snapid_t l) : mdcache(c), first(f), last(l), item_dirty(this), item_caps(this), item_open_file(this), item_dirty_parent(this), item_dirty_dirfrag_dir(this), item_dirty_dirfrag_nest(this), item_dirty_dirfrag_dirfragtree(this), pop(c->decayrate), versionlock(this, &versionlock_type), authlock(this, &authlock_type), linklock(this, &linklock_type), dirfragtreelock(this, &dirfragtreelock_type), filelock(this, &filelock_type), xattrlock(this, &xattrlock_type), snaplock(this, &snaplock_type), nestlock(this, &nestlock_type), flocklock(this, &flocklock_type), policylock(this, &policylock_type) { if (auth) state_set(STATE_AUTH); } void CInode::print(ostream& out) { out << *this; } void CInode::add_need_snapflush(CInode *snapin, snapid_t snapid, client_t client) { dout(10) << __func__ << " client." << client << " snapid " << snapid << " on " << snapin << dendl; if (client_need_snapflush.empty()) { get(CInode::PIN_NEEDSNAPFLUSH); // FIXME: this is non-optimal, as we'll block freezes/migrations for potentially // long periods waiting for clients to flush their snaps. auth_pin(this); // pin head get_inode()->.. } auto &clients = client_need_snapflush[snapid]; if (clients.empty()) snapin->auth_pin(this); // ...and pin snapped/old inode! clients.insert(client); } void CInode::remove_need_snapflush(CInode *snapin, snapid_t snapid, client_t client) { dout(10) << __func__ << " client." 
<< client << " snapid " << snapid << " on " << snapin << dendl; auto it = client_need_snapflush.find(snapid); if (it == client_need_snapflush.end()) { dout(10) << " snapid not found" << dendl; return; } size_t n = it->second.erase(client); if (n == 0) { dout(10) << " client not found" << dendl; return; } if (it->second.empty()) { client_need_snapflush.erase(it); snapin->auth_unpin(this); if (client_need_snapflush.empty()) { put(CInode::PIN_NEEDSNAPFLUSH); auth_unpin(this); } } } pair CInode::split_need_snapflush(CInode *cowin, CInode *in) { dout(10) << __func__ << " [" << cowin->first << "," << cowin->last << "] for " << *cowin << dendl; bool cowin_need_flush = false; bool orig_need_flush = false; auto it = client_need_snapflush.lower_bound(cowin->first); while (it != client_need_snapflush.end() && it->first < in->first) { ceph_assert(!it->second.empty()); if (cowin->last >= it->first) { cowin->auth_pin(this); cowin_need_flush = true; ++it; } else { it = client_need_snapflush.erase(it); } in->auth_unpin(this); } if (it != client_need_snapflush.end() && it->first <= in->last) orig_need_flush = true; return make_pair(cowin_need_flush, orig_need_flush); } void CInode::mark_dirty_rstat() { if (!state_test(STATE_DIRTYRSTAT)) { dout(10) << __func__ << dendl; state_set(STATE_DIRTYRSTAT); get(PIN_DIRTYRSTAT); CDentry *pdn = get_projected_parent_dn(); if (pdn->is_auth()) { CDir *pdir = pdn->dir; pdir->dirty_rstat_inodes.push_back(&dirty_rstat_item); mdcache->mds->locker->mark_updated_scatterlock(&pdir->inode->nestlock); } else { // under cross-MDS rename. 
// DIRTYRSTAT flag will get cleared when rename finishes ceph_assert(state_test(STATE_AMBIGUOUSAUTH)); } } } void CInode::clear_dirty_rstat() { if (state_test(STATE_DIRTYRSTAT)) { dout(10) << __func__ << dendl; state_clear(STATE_DIRTYRSTAT); put(PIN_DIRTYRSTAT); dirty_rstat_item.remove_myself(); } } CInode::projected_inode CInode::project_inode(const MutationRef& mut, bool xattr, bool snap) { if (mut && mut->is_projected(this)) { ceph_assert(!xattr && !snap); auto _inode = std::const_pointer_cast(projected_nodes.back().inode); return projected_inode(std::move(_inode), xattr_map_ptr()); } auto pi = allocate_inode(*get_projected_inode()); if (scrub_infop && scrub_infop->last_scrub_dirty) { pi->last_scrub_stamp = scrub_infop->last_scrub_stamp; pi->last_scrub_version = scrub_infop->last_scrub_version; scrub_infop->last_scrub_dirty = false; scrub_maybe_delete_info(); } const auto& ox = get_projected_xattrs(); xattr_map_ptr px; if (xattr) { px = allocate_xattr_map(); if (ox) *px = *ox; } sr_t* ps = projected_inode::UNDEF_SRNODE; if (snap) { ps = prepare_new_srnode(0); ++num_projected_srnodes; } projected_nodes.emplace_back(pi, xattr ? 
px : ox , ps);
  if (mut)
    mut->add_projected_node(this);
  dout(15) << __func__ << " " << pi->ino << dendl;
  return projected_inode(std::move(pi), std::move(px), ps);
}

// Commit the oldest projection: swap it into the live inode/xattrs/srnode,
// mark the inode dirty in log segment 'ls', and trigger backtrace/export-pin
// follow-up work when the relevant fields changed.
void CInode::pop_and_dirty_projected_inode(LogSegment *ls, const MutationRef& mut)
{
  ceph_assert(!projected_nodes.empty());
  auto front = std::move(projected_nodes.front());
  dout(15) << __func__ << " v" << front.inode->version << dendl;

  projected_nodes.pop_front();
  if (mut)
    mut->remove_projected_node(this);

  // detect changes that need extra processing after the swap below
  bool pool_updated = get_inode()->layout.pool_id != front.inode->layout.pool_id;
  bool pin_updated = (get_inode()->export_pin != front.inode->export_pin) ||
		     (get_inode()->export_ephemeral_distributed_pin !=
		      front.inode->export_ephemeral_distributed_pin);

  reset_inode(std::move(front.inode));
  if (front.xattrs != get_xattrs())
    reset_xattrs(std::move(front.xattrs));

  if (front.snapnode != projected_inode::UNDEF_SRNODE) {
    --num_projected_srnodes;
    pop_projected_snaprealm(front.snapnode, false);
  }

  mark_dirty(ls);
  if (get_inode()->is_backtrace_updated())
    mark_dirty_parent(ls, pool_updated);

  if (pin_updated)
    maybe_export_pin(true);
}

// Allocate a new srnode for projection: a copy of the current projected one,
// or a fresh srnode seeded from the global snaprealm sequence when none exists.
sr_t *CInode::prepare_new_srnode(snapid_t snapid)
{
  const sr_t *cur_srnode = get_projected_srnode();
  sr_t *new_srnode;

  if (cur_srnode) {
    new_srnode = new sr_t(*cur_srnode);
  } else {
    if (snapid == 0)
      snapid = mdcache->get_global_snaprealm()->get_newest_seq();
    new_srnode = new sr_t();
    new_srnode->seq = snapid;
    new_srnode->created = snapid;
    new_srnode->current_parent_since = get_oldest_snap();
  }
  return new_srnode;
}

// Newest projected srnode, falling back to the live snaprealm's srnode;
// NULL if this inode has no snaprealm at all.
const sr_t *CInode::get_projected_srnode() const {
  if (num_projected_srnodes > 0) {
    for (auto it = projected_nodes.rbegin(); it != projected_nodes.rend(); ++it)
      if (it->snapnode != projected_inode::UNDEF_SRNODE)
	return it->snapnode;
  }
  if (snaprealm)
    return &snaprealm->srnode;
  else
    return NULL;
}

// Attach 'new_srnode' to the most recent projection (which must not already
// carry one).
void CInode::project_snaprealm(sr_t *new_srnode)
{
  dout(10) << __func__ << " " << new_srnode << dendl;
  ceph_assert(projected_nodes.back().snapnode == projected_inode::UNDEF_SRNODE);
projected_nodes.back().snapnode = new_srnode; ++num_projected_srnodes; } void CInode::mark_snaprealm_global(sr_t *new_srnode) { ceph_assert(!is_dir()); // 'last_destroyed' is no longer used, use it to store origin 'current_parent_since' new_srnode->last_destroyed = new_srnode->current_parent_since; new_srnode->current_parent_since = mdcache->get_global_snaprealm()->get_newest_seq() + 1; new_srnode->mark_parent_global(); } void CInode::clear_snaprealm_global(sr_t *new_srnode) { // restore 'current_parent_since' new_srnode->current_parent_since = new_srnode->last_destroyed; new_srnode->last_destroyed = 0; new_srnode->seq = mdcache->get_global_snaprealm()->get_newest_seq(); new_srnode->clear_parent_global(); } bool CInode::is_projected_snaprealm_global() const { const sr_t *srnode = get_projected_srnode(); if (srnode && srnode->is_parent_global()) return true; return false; } void CInode::project_snaprealm_past_parent(SnapRealm *newparent) { sr_t *new_snap = project_snaprealm(); record_snaprealm_past_parent(new_snap, newparent); } /* if newparent != parent, add parent to past_parents if parent DNE, we need to find what the parent actually is and fill that in */ void CInode::record_snaprealm_past_parent(sr_t *new_snap, SnapRealm *newparent) { ceph_assert(!new_snap->is_parent_global()); SnapRealm *oldparent; if (!snaprealm) { oldparent = find_snaprealm(); } else { oldparent = snaprealm->parent; } if (newparent != oldparent) { snapid_t oldparentseq = oldparent->get_newest_seq(); if (oldparentseq + 1 > new_snap->current_parent_since) { // copy old parent's snaps const set& snaps = oldparent->get_snaps(); auto p = snaps.lower_bound(new_snap->current_parent_since); if (p != snaps.end()) new_snap->past_parent_snaps.insert(p, snaps.end()); if (oldparentseq > new_snap->seq) new_snap->seq = oldparentseq; } new_snap->current_parent_since = mdcache->get_global_snaprealm()->get_newest_seq() + 1; } } void CInode::record_snaprealm_parent_dentry(sr_t *new_snap, SnapRealm *oldparent, 
CDentry *dn, bool primary_dn) { ceph_assert(new_snap->is_parent_global()); if (!oldparent) oldparent = dn->get_dir()->inode->find_snaprealm(); auto& snaps = oldparent->get_snaps(); if (!primary_dn) { auto p = snaps.lower_bound(dn->first); if (p != snaps.end()) new_snap->past_parent_snaps.insert(p, snaps.end()); } else { // 'last_destroyed' is used as 'current_parent_since' auto p = snaps.lower_bound(new_snap->last_destroyed); if (p != snaps.end()) new_snap->past_parent_snaps.insert(p, snaps.end()); new_snap->last_destroyed = mdcache->get_global_snaprealm()->get_newest_seq() + 1; } } void CInode::early_pop_projected_snaprealm() { ceph_assert(!projected_nodes.empty()); if (projected_nodes.front().snapnode != projected_inode::UNDEF_SRNODE) { pop_projected_snaprealm(projected_nodes.front().snapnode, true); projected_nodes.front().snapnode = projected_inode::UNDEF_SRNODE; --num_projected_srnodes; } } void CInode::pop_projected_snaprealm(sr_t *next_snaprealm, bool early) { if (next_snaprealm) { dout(10) << __func__ << (early ? " (early) " : " ") << next_snaprealm << " seq " << next_snaprealm->seq << dendl; if (!snaprealm) open_snaprealm(); auto old_flags = snaprealm->srnode.flags; snaprealm->srnode = *next_snaprealm; delete next_snaprealm; if ((snaprealm->srnode.flags ^ old_flags) & sr_t::PARENT_GLOBAL) { snaprealm->adjust_parent(); } if (snaprealm->parent) dout(10) << " realm " << *snaprealm << " parent " << *snaprealm->parent << dendl; } else { dout(10) << __func__ << (early ? 
" (early) null" : " null") << dendl; ceph_assert(snaprealm); snaprealm->merge_to(NULL); } } // ====== CInode ======= // dirfrags InodeStoreBase::inode_const_ptr InodeStoreBase::empty_inode = InodeStoreBase::allocate_inode(); __u32 InodeStoreBase::hash_dentry_name(std::string_view dn) { int which = inode->dir_layout.dl_dir_hash; if (!which) which = CEPH_STR_HASH_LINUX; ceph_assert(ceph_str_hash_valid(which)); return ceph_str_hash(which, dn.data(), dn.length()); } frag_t InodeStoreBase::pick_dirfrag(std::string_view dn) { if (dirfragtree.empty()) return frag_t(); // avoid the string hash if we can. __u32 h = hash_dentry_name(dn); return dirfragtree[h]; } std::pair> CInode::get_dirfrags_under(frag_t fg) { std::pair> result; auto& all = result.first; auto& dirs = result.second; all = false; if (auto it = dirfrags.find(fg); it != dirfrags.end()){ all = true; dirs.push_back(it->second); return result; } int total = 0; for(auto &[_fg, _dir] : dirfrags){ // frag_t.bits() can indicate the depth of the partition in the directory tree // e.g. 
// 01* : bit = 2, on the second floor // * // 0* 1* // 00* 01* 10* 11* -- > level 2, bit = 2 // so fragA.bits > fragB.bits means fragA is deeper than fragB if (fg.bits() >= _fg.bits()) { if (_fg.contains(fg)) { all = true; return result; } } else { if (fg.contains(_fg)) { dirs.push_back(_dir); // we can calculate how many sub slices a slice can be divided into // frag_t(*) can be divided into two frags belonging to the first layer(0* 1*) // or 2^2 frags belonging to the second layer(00* 01* 10* 11*) // or (1 << (24 - frag_t(*).bits)) frags belonging to the 24th level total += 1 << (24 - _fg.bits()); } } } // we convert all the frags into the frags of 24th layer to calculate whether all the frags are included in the memory cache all = ((1<<(24-fg.bits())) == total); return result; } void CInode::verify_dirfrags() { bool bad = false; for (const auto &p : dirfrags) { if (!dirfragtree.is_leaf(p.first)) { dout(0) << "have open dirfrag " << p.first << " but not leaf in " << dirfragtree << ": " << *p.second << dendl; bad = true; } } ceph_assert(!bad); } void CInode::force_dirfrags() { bool bad = false; for (auto &p : dirfrags) { if (!dirfragtree.is_leaf(p.first)) { dout(0) << "have open dirfrag " << p.first << " but not leaf in " << dirfragtree << ": " << *p.second << dendl; bad = true; } } if (bad) { frag_vec_t leaves; dirfragtree.get_leaves(leaves); for (const auto& leaf : leaves) { mdcache->get_force_dirfrag(dirfrag_t(ino(), leaf), true); } } verify_dirfrags(); } CDir *CInode::get_approx_dirfrag(frag_t fg) { CDir *dir = get_dirfrag(fg); if (dir) return dir; // find a child? auto&& p = get_dirfrags_under(fg); if (!p.second.empty()) return p.second.front(); // try parents? while (fg.bits() > 0) { fg = fg.parent(); dir = get_dirfrag(fg); if (dir) return dir; } return NULL; } CDir *CInode::get_or_open_dirfrag(MDCache *mdcache, frag_t fg) { ceph_assert(is_dir()); // have it? CDir *dir = get_dirfrag(fg); if (!dir) { // create it. 
ceph_assert(is_auth() || mdcache->mds->is_any_replay());
    dir = new CDir(this, fg, mdcache, is_auth());
    add_dirfrag(dir);
  }
  return dir;
}

// Register an open dirfrag on this inode; inherits sticky state and may
// trigger export pinning.
CDir *CInode::add_dirfrag(CDir *dir)
{
  auto em = dirfrags.emplace(std::piecewise_construct, std::forward_as_tuple(dir->dirfrag().frag), std::forward_as_tuple(dir));
  ceph_assert(em.second);

  if (stickydir_ref > 0) {
    dir->state_set(CDir::STATE_STICKY);
    dir->get(CDir::PIN_STICKY);
  }

  maybe_export_pin();

  return dir;
}

// Tear down a dirfrag: drop null dentries, dirty flag, sticky pin and
// subtree-root accounting, then delete it.  The dirfrag must end up unreferenced.
void CInode::close_dirfrag(frag_t fg)
{
  dout(14) << __func__ << " " << fg << dendl;
  ceph_assert(dirfrags.count(fg));
  
  CDir *dir = dirfrags[fg];
  dir->remove_null_dentries();
  
  // clear dirty flag
  if (dir->is_dirty())
    dir->mark_clean();
  
  if (stickydir_ref > 0) {
    dir->state_clear(CDir::STATE_STICKY);
    dir->put(CDir::PIN_STICKY);
  }

  if (dir->is_subtree_root())
    num_subtree_roots--;
  
  // dump any remaining dentries, for debugging purposes
  for (const auto &p : dir->items)
    dout(14) << __func__ << " LEFTOVER dn " << *p.second << dendl;

  ceph_assert(dir->get_num_ref() == 0);
  delete dir;
  dirfrags.erase(fg);
}

void CInode::close_dirfrags()
{
  while (!dirfrags.empty()) 
    close_dirfrag(dirfrags.begin()->first);
}

// True if any open dirfrag is a subtree root; with auth != -1, only count
// subtrees whose authority matches.
bool CInode::has_subtree_root_dirfrag(int auth)
{
  if (num_subtree_roots > 0) {
    if (auth == -1)
      return true;
    for (const auto &p : dirfrags) {
      if (p.second->is_subtree_root() &&
	  p.second->dir_auth.first == auth)
	return true;
    }
  }
  return false;
}

bool CInode::has_subtree_or_exporting_dirfrag()
{
  if (num_subtree_roots > 0 || num_exporting_dirs > 0)
    return true;
  return false;
}

// Ref-counted "sticky" mode: while held, all dirfrags stay pinned in cache.
void CInode::get_stickydirs()
{
  if (stickydir_ref == 0) {
    get(PIN_STICKYDIRS);
    for (const auto &p : dirfrags) {
      p.second->state_set(CDir::STATE_STICKY);
      p.second->get(CDir::PIN_STICKY);
    }
  }
  stickydir_ref++;
}

void CInode::put_stickydirs()
{
  ceph_assert(stickydir_ref > 0);
  stickydir_ref--;
  if (stickydir_ref == 0) {
    put(PIN_STICKYDIRS);
    for (const auto &p : dirfrags) {
      p.second->state_clear(CDir::STATE_STICKY);
      p.second->put(CDir::PIN_STICKY);
    }
  }
}





// pins

void 
CInode::first_get()
{
  // pin my dentry?
  if (parent) 
    parent->get(CDentry::PIN_INODEPIN);
}

void CInode::last_put() 
{
  // unpin my dentry?
  if (parent) 
    parent->put(CDentry::PIN_INODEPIN);
}

// Called when only the dirty/dirty-parent pins remain: give the stray
// eval machinery a chance to reap this inode.
void CInode::_put()
{
  if (get_num_ref() == (int)is_dirty() + (int)is_dirty_parent())
    mdcache->maybe_eval_stray(this, true);
}

void CInode::add_remote_parent(CDentry *p) 
{
  if (remote_parents.empty())
    get(PIN_REMOTEPARENT);
  remote_parents.insert(p);
}
void CInode::remove_remote_parent(CDentry *p) 
{
  remote_parents.erase(p);
  if (remote_parents.empty())
    put(PIN_REMOTEPARENT);
}

CDir *CInode::get_parent_dir()
{
  if (parent)
    return parent->dir;
  return NULL;
}
CDir *CInode::get_projected_parent_dir()
{
  CDentry *p = get_projected_parent_dn();
  if (p)
    return p->dir;
  return NULL;
}
CInode *CInode::get_parent_inode() 
{
  if (parent) 
    return parent->dir->inode;
  return NULL;
}

// Walk 'other's oldest parent chain upward looking for this inode.
bool CInode::is_ancestor_of(const CInode *other) const
{
  while (other) {
    if (other == this)
      return true;
    const CDentry *pdn = other->get_oldest_parent_dn();
    if (!pdn) {
      ceph_assert(other->is_base());
      break;
    }
    other = pdn->get_dir()->get_inode();
  }
  return false;
}

// As is_ancestor_of(), but following projected parent links.
bool CInode::is_projected_ancestor_of(const CInode *other) const
{
  while (other) {
    if (other == this)
      return true;
    const CDentry *pdn = other->get_projected_parent_dn();
    if (!pdn) {
      ceph_assert(other->is_base());
      break;
    }
    other = pdn->get_dir()->get_inode();
  }
  return false;
}

/*
 * Because a non-directory inode may have multiple links, the use_parent
 * argument allows selecting which parent to use for path construction. This
 * argument is only meaningful for the final component (i.e. the first of the
 * nested calls) because directories cannot have multiple hard links. If
 * use_parent is NULL and projected is true, the primary parent's projected
 * inode is used all the way up the path chain. Otherwise the primary parent
 * stable inode is used.
 */
void CInode::make_path_string(string& s, bool projected, const CDentry *use_parent) const
{
  if (!use_parent) {
    use_parent = projected ? get_projected_parent_dn() : parent;
  }

  if (use_parent) {
    use_parent->make_path_string(s, projected);
  } else if (is_root()) {
    s = "";
  } else if (is_mdsdir()) {
    // ~mdsN stray directory: recover the rank from the ino offset
    char t[40];
    uint64_t eino(ino());
    eino -= MDS_INO_MDSDIR_OFFSET;
    snprintf(t, sizeof(t), "~mds%" PRId64, eino);
    s = t;
  } else {
    // unlinked/orphan: print the raw ino
    char n[40];
    uint64_t eino(ino());
    snprintf(n, sizeof(n), "#%" PRIx64, eino);
    s += n;
  }
}

void CInode::make_path(filepath& fp, bool projected) const
{
  const CDentry *use_parent = projected ? get_projected_parent_dn() : parent;
  if (use_parent) {
    ceph_assert(!is_base());
    use_parent->make_path(fp, projected);
  } else {
    fp = filepath(ino());
  }
}

// Stray dentry names are simply the hex ino.
void CInode::name_stray_dentry(string& dname)
{
  char s[20];
  snprintf(s, sizeof(s), "%llx", (unsigned long long)ino().val);
  dname = s;
}

// Reserve the next version for a pending update, threading it through the
// projected parent dentry when one exists (base inodes just bump locally).
version_t CInode::pre_dirty()
{
  version_t pv;
  CDentry* _cdentry = get_projected_parent_dn(); 
  if (_cdentry) {
    pv = _cdentry->pre_dirty(get_projected_version());
    dout(10) << "pre_dirty " << pv << " (current v " << get_inode()->version << ")" << dendl;
  } else {
    ceph_assert(is_base());
    pv = get_projected_version() + 1;
  }
  // force update backtrace for old format inode (see mempool_inode::decode)
  if (get_inode()->backtrace_version == 0 && !projected_nodes.empty()) {
    auto pi = _get_projected_inode();
    if (pi->backtrace_version == 0)
      pi->update_backtrace(pv);
  }
  return pv;
}

// Set the dirty flag/pin and (re)attach to the log segment's dirty list.
void CInode::_mark_dirty(LogSegment *ls)
{
  if (!state_test(STATE_DIRTY)) {
    state_set(STATE_DIRTY);
    get(PIN_DIRTY);
    ceph_assert(ls);
  }
  
  // move myself to this segment's dirty list
  if (ls) 
    ls->dirty_inodes.push_back(&item_dirty);
}

void CInode::mark_dirty(LogSegment *ls) {
  
  dout(10) << __func__ << " " << *this << dendl;

  /*
    NOTE: I may already be dirty, but this fn _still_ needs to be called so that
    the directory is (perhaps newly) dirtied, and so that parent_dir_version is 
    updated below.
  */
  
  // only auth can get dirty.
"dirty" async data in replicas is relative to
  //  filelock state, not the dirty flag.
  ceph_assert(is_auth());
  
  // touch my private version
  _mark_dirty(ls);

  // mark dentry too
  if (parent)
    parent->mark_dirty(get_version(), ls);
}


void CInode::mark_clean()
{
  dout(10) << __func__ << " " << *this << dendl;
  if (state_test(STATE_DIRTY)) {
    state_clear(STATE_DIRTY);
    put(PIN_DIRTY);
    
    // remove myself from ls dirty list
    item_dirty.remove_myself();
  }
}    


// --------------
// per-inode storage
// (currently for root inode only)

// IO completion for CInode::store(): forwards to _stored() with the version
// that was written.
struct C_IO_Inode_Stored : public CInodeIOContext {
  version_t version;
  Context *fin;
  C_IO_Inode_Stored(CInode *i, version_t v, Context *f) : CInodeIOContext(i), version(v), fin(f) {}
  void finish(int r) override {
    in->_stored(r, version, fin);
  }
  void print(ostream& out) const override {
    out << "inode_store(" << in->ino() << ")";
  }
};

// Rados object name for an inode/frag, e.g. "<ino>.<frag><suffix>".
object_t InodeStoreBase::get_object_name(inodeno_t ino, frag_t fg, std::string_view suffix)
{
  char n[60];
  snprintf(n, sizeof(n), "%llx.%08llx", (long long unsigned)ino, (long long unsigned)fg);
  ceph_assert(strlen(n) + suffix.size() < sizeof n);
  strncat(n, suffix.data(), suffix.size());
  return object_t(n);
}

// Persist a base inode (e.g. root) to its own ".inode" object in the
// metadata pool; completion is delivered via C_IO_Inode_Stored.
void CInode::store(MDSContext *fin)
{
  dout(10) << __func__ << " " << get_version() << dendl;
  ceph_assert(is_base());

  if (snaprealm)
    purge_stale_snap_data(snaprealm->get_snaps());

  // encode
  bufferlist bl;
  string magic = CEPH_FS_ONDISK_MAGIC;
  using ceph::encode;
  encode(magic, bl);
  encode_store(bl, mdcache->mds->mdsmap->get_up_features());

  // write it.
SnapContext snapc;
  ObjectOperation m;
  m.write_full(bl);

  object_t oid = CInode::get_object_name(ino(), frag_t(), ".inode");
  object_locator_t oloc(mdcache->mds->get_metadata_pool());

  Context *newfin =
    new C_OnFinisher(new C_IO_Inode_Stored(this, get_version(), fin),
		     mdcache->mds->finisher);
  mdcache->mds->objecter->mutate(oid, oloc, m, snapc,
				 ceph::real_clock::now(), 0,
				 newfin);
}

// Completion for store(): log/propagate write errors; mark clean only if
// the stored version is still the latest projected one.
void CInode::_stored(int r, version_t v, Context *fin)
{
  if (r < 0) {
    dout(1) << "store error " << r << " v " << v << " on " << *this << dendl;
    mdcache->mds->clog->error() << "failed to store inode " << ino()
                                << " object: " << cpp_strerror(r);
    mdcache->mds->handle_write_error(r);
    fin->complete(r);
    return;
  }

  dout(10) << __func__ << " " << v << " on " << *this << dendl;
  if (v == get_projected_version())
    mark_clean();
  fin->complete(0);
}

// Flush this inode's dirty state: backtrace, plus either the base-inode
// object or the parent dirfrag commit; 'fin' fires when all complete.
void CInode::flush(MDSContext *fin)
{
  dout(10) << __func__ << " " << *this << dendl;
  ceph_assert(is_auth() && can_auth_pin());

  MDSGatherBuilder gather(g_ceph_context);

  if (is_dirty_parent()) {
    store_backtrace(gather.new_sub());
  }
  if (is_dirty()) {
    if (is_base()) {
      store(gather.new_sub());
    } else {
      parent->dir->commit(0, gather.new_sub());
    }
  }
  
  if (gather.has_subs()) {
    gather.set_finisher(fin);
    gather.activate();
  } else {
    fin->complete(0);
  }
}

// IO completion for CInode::fetch(): collects both read buffers, then
// hands them to _fetched() regardless of per-read status.
struct C_IO_Inode_Fetched : public CInodeIOContext {
  bufferlist bl, bl2;
  Context *fin;
  C_IO_Inode_Fetched(CInode *i, Context *f) : CInodeIOContext(i), fin(f) {}
  void finish(int r) override {
    // Ignore 'r', because we fetch from two places, so r is usually CEPHFS_ENOENT
    in->_fetched(bl, bl2, fin);
  }
  void print(ostream& out) const override {
    out << "inode_fetch(" << in->ino() << ")";
  }
};

// Read a base inode back from the metadata pool, trying both the legacy
// xattr location and the current ".inode" object.
void CInode::fetch(MDSContext *fin)
{
  dout(10) << __func__  << dendl;

  C_IO_Inode_Fetched *c = new C_IO_Inode_Fetched(this, fin);
  C_GatherBuilder gather(g_ceph_context, new C_OnFinisher(c, mdcache->mds->finisher));

  object_t oid = CInode::get_object_name(ino(), frag_t(), "");
  object_locator_t oloc(mdcache->mds->get_metadata_pool());

  // Old 
on-disk format: inode stored in xattr of a dirfrag
  ObjectOperation rd;
  rd.getxattr("inode", &c->bl, NULL);
  mdcache->mds->objecter->read(oid, oloc, rd, CEPH_NOSNAP, (bufferlist*)NULL, 0, gather.new_sub());

  // Current on-disk format: inode stored in a .inode object
  object_t oid2 = CInode::get_object_name(ino(), frag_t(), ".inode");
  mdcache->mds->objecter->read(oid2, oloc, 0, 0, CEPH_NOSNAP, &c->bl2, 0, gather.new_sub());

  gather.activate();
}

// Decode a fetched inode, preferring the new-format buffer 'bl2' over the
// legacy 'bl'; completes 'fin' with CEPHFS_ENOENT/EINVAL on missing or
// corrupt data.
void CInode::_fetched(bufferlist& bl, bufferlist& bl2, Context *fin)
{
  dout(10) << __func__ << " got " << bl.length() << " and " << bl2.length() << dendl;
  bufferlist::const_iterator p;
  if (bl2.length()) {
    p = bl2.cbegin();
  } else if (bl.length()) {
    p = bl.cbegin();
  } else {
    derr << "No data while reading inode " << ino() << dendl;
    fin->complete(-CEPHFS_ENOENT);
    return;
  }

  using ceph::decode;
  // Attempt decode
  try {
    string magic;
    decode(magic, p);
    dout(10) << " magic is '" << magic << "' (expecting '"
	     << CEPH_FS_ONDISK_MAGIC << "')" << dendl;
    if (magic != CEPH_FS_ONDISK_MAGIC) {
      dout(0) << "on disk magic '" << magic << "' != my magic '" << CEPH_FS_ONDISK_MAGIC
	      << "'" << dendl;
      fin->complete(-CEPHFS_EINVAL);
    } else {
      decode_store(p);
      dout(10) << "_fetched " << *this << dendl;
      fin->complete(0);
    }
  } catch (buffer::error &err) {
    derr << "Corrupt inode " << ino() << ": " << err.what() << dendl;
    fin->complete(-CEPHFS_EINVAL);
    return;
  }
}

// Build the backtrace for this inode: the chain of (ancestor ino, dentry
// name, version) up to the root, plus current and old data pools.
void CInode::build_backtrace(int64_t pool, inode_backtrace_t& bt)
{
  bt.ino = ino();
  bt.ancestors.clear();
  bt.pool = pool;

  CInode *in = this;
  CDentry *pdn = get_parent_dn();
  while (pdn) {
    CInode *diri = pdn->get_dir()->get_inode();
    bt.ancestors.push_back(inode_backpointer_t(diri->ino(), pdn->get_name(), in->get_inode()->version));
    in = diri;
    pdn = in->get_parent_dn();
  }
  bt.old_pools.reserve(get_inode()->old_pools.size());
  for (auto &p : get_inode()->old_pools) {
    // don't add our own pool id to old_pools to avoid looping (e.g. 
setlayout 0, 1, 0) if (p != pool) bt.old_pools.push_back(p); } } struct C_IO_Inode_StoredBacktrace : public CInodeIOContext { version_t version; Context *fin; C_IO_Inode_StoredBacktrace(CInode *i, version_t v, Context *f) : CInodeIOContext(i), version(v), fin(f) {} void finish(int r) override { in->_stored_backtrace(r, version, fin); } void print(ostream& out) const override { out << "backtrace_store(" << in->ino() << ")"; } }; void CInode::_commit_ops(int r, C_GatherBuilder &gather_bld, std::vector &ops_vec, inode_backtrace_t &bt) { dout(10) << __func__ << dendl; if (r < 0) { mdcache->mds->handle_write_error_with_lock(r); return; } SnapContext snapc; object_t oid = get_object_name(ino(), frag_t(), ""); for (auto &op : ops_vec) { ObjectOperation obj_op; object_locator_t oloc(op.get_pool()); op.update(obj_op, bt); mdcache->mds->objecter->mutate(oid, oloc, obj_op, snapc, ceph::real_clock::now(), 0, gather_bld.new_sub()); } } void CInode::_store_backtrace(std::vector &ops_vec, inode_backtrace_t &bt, int op_prio) { dout(10) << __func__ << " on " << *this << dendl; ceph_assert(is_dirty_parent()); if (op_prio < 0) op_prio = CEPH_MSG_PRIO_DEFAULT; auth_pin(this); const int64_t pool = get_backtrace_pool(); build_backtrace(pool, bt); ops_vec.emplace_back(op_prio, pool, get_inode()->layout, mdcache->mds->mdsmap->get_up_features()); if (!state_test(STATE_DIRTYPOOL) || get_inode()->old_pools.empty()) { dout(20) << __func__ << ": no dirtypool or no old pools" << dendl; return; } // In the case where DIRTYPOOL is set, we update all old pools backtraces // such that anyone reading them will see the new pool ID in // inode_backtrace_t::pool and go read everything else from there. 
for (const auto &p : get_inode()->old_pools) { if (p == pool) continue; dout(20) << __func__ << ": updating old pool " << p << dendl; ops_vec.emplace_back(op_prio, p); } } void CInode::store_backtrace(MDSContext *fin, int op_prio) { std::vector ops_vec; inode_backtrace_t bt; auto version = get_inode()->backtrace_version; _store_backtrace(ops_vec, bt, op_prio); C_GatherBuilder gather(g_ceph_context, new C_OnFinisher( new C_IO_Inode_StoredBacktrace(this, version, fin), mdcache->mds->finisher)); _commit_ops(0, gather, ops_vec, bt); ceph_assert(gather.has_subs()); gather.activate(); } void CInode::store_backtrace(CInodeCommitOperations &op, int op_prio) { op.version = get_inode()->backtrace_version; op.in = this; _store_backtrace(op.ops_vec, op.bt, op_prio); } void CInode::_stored_backtrace(int r, version_t v, Context *fin) { if (r == -CEPHFS_ENOENT) { const int64_t pool = get_backtrace_pool(); bool exists = mdcache->mds->objecter->with_osdmap( [pool](const OSDMap &osd_map) { return osd_map.have_pg_pool(pool); }); // This CEPHFS_ENOENT is because the pool doesn't exist (the user deleted it // out from under us), so the backtrace can never be written, so pretend // to succeed so that the user can proceed to e.g. delete the file. if (!exists) { dout(4) << __func__ << " got CEPHFS_ENOENT: a data pool was deleted " "beneath us!" 
<< dendl; r = 0; } } if (r < 0) { dout(1) << "store backtrace error " << r << " v " << v << dendl; mdcache->mds->clog->error() << "failed to store backtrace on ino " << ino() << " object" << ", pool " << get_backtrace_pool() << ", errno " << r; mdcache->mds->handle_write_error(r); if (fin) fin->complete(r); return; } dout(10) << __func__ << " v " << v << dendl; auth_unpin(this); if (v == get_inode()->backtrace_version) clear_dirty_parent(); if (fin) fin->complete(0); } void CInode::fetch_backtrace(Context *fin, bufferlist *backtrace) { mdcache->fetch_backtrace(ino(), get_backtrace_pool(), *backtrace, fin); } void CInode::mark_dirty_parent(LogSegment *ls, bool dirty_pool) { if (!state_test(STATE_DIRTYPARENT)) { dout(10) << __func__ << dendl; state_set(STATE_DIRTYPARENT); get(PIN_DIRTYPARENT); ceph_assert(ls); } if (dirty_pool) state_set(STATE_DIRTYPOOL); if (ls) ls->dirty_parent_inodes.push_back(&item_dirty_parent); } void CInode::clear_dirty_parent() { if (state_test(STATE_DIRTYPARENT)) { dout(10) << __func__ << dendl; state_clear(STATE_DIRTYPARENT); state_clear(STATE_DIRTYPOOL); put(PIN_DIRTYPARENT); item_dirty_parent.remove_myself(); } } void CInode::verify_diri_backtrace(bufferlist &bl, int err) { if (is_base() || is_dirty_parent() || !is_auth()) return; dout(10) << __func__ << dendl; if (err == 0) { inode_backtrace_t backtrace; using ceph::decode; decode(backtrace, bl); CDentry *pdn = get_parent_dn(); if (backtrace.ancestors.empty() || backtrace.ancestors[0].dname != pdn->get_name() || backtrace.ancestors[0].dirino != pdn->get_dir()->ino()) err = -CEPHFS_EINVAL; } if (err) { MDSRank *mds = mdcache->mds; mds->clog->error() << "bad backtrace on directory inode " << ino(); ceph_assert(!"bad backtrace" == (g_conf()->mds_verify_backtrace > 1)); mark_dirty_parent(mds->mdlog->get_current_segment(), false); mds->mdlog->flush(); } } // ------------------ // parent dir void InodeStoreBase::encode_xattrs(bufferlist &bl) const { using ceph::encode; if (xattrs) 
encode(*xattrs, bl); else encode((__u32)0, bl); } void InodeStoreBase::decode_xattrs(bufferlist::const_iterator &p) { using ceph::decode; mempool_xattr_map tmp; decode_noshare(tmp, p); if (tmp.empty()) { reset_xattrs(xattr_map_ptr()); } else { reset_xattrs(allocate_xattr_map(std::move(tmp))); } } void InodeStoreBase::encode_old_inodes(bufferlist &bl, uint64_t features) const { using ceph::encode; if (old_inodes) encode(*old_inodes, bl, features); else encode((__u32)0, bl); } void InodeStoreBase::decode_old_inodes(bufferlist::const_iterator &p) { using ceph::decode; mempool_old_inode_map tmp; decode(tmp, p); if (tmp.empty()) { reset_old_inodes(old_inode_map_ptr()); } else { reset_old_inodes(allocate_old_inode_map(std::move(tmp))); } } void InodeStoreBase::encode_bare(bufferlist &bl, uint64_t features, const bufferlist *snap_blob) const { using ceph::encode; encode(*inode, bl, features); if (inode->is_symlink()) encode(symlink, bl); encode(dirfragtree, bl); encode_xattrs(bl); if (snap_blob) encode(*snap_blob, bl); else encode(bufferlist(), bl); encode_old_inodes(bl, features); encode(oldest_snap, bl); encode(damage_flags, bl); } void InodeStoreBase::encode(bufferlist &bl, uint64_t features, const bufferlist *snap_blob) const { ENCODE_START(6, 4, bl); encode_bare(bl, features, snap_blob); ENCODE_FINISH(bl); } void CInode::encode_store(bufferlist& bl, uint64_t features) { bufferlist snap_blob; encode_snap_blob(snap_blob); InodeStoreBase::encode(bl, mdcache->mds->mdsmap->get_up_features(), &snap_blob); } void InodeStoreBase::decode_bare(bufferlist::const_iterator &bl, bufferlist& snap_blob, __u8 struct_v) { using ceph::decode; auto _inode = allocate_inode(); decode(*_inode, bl); if (_inode->is_symlink()) { std::string tmp; decode(tmp, bl); symlink = std::string_view(tmp); } decode(dirfragtree, bl); decode_xattrs(bl); decode(snap_blob, bl); decode_old_inodes(bl); if (struct_v == 2 && _inode->is_dir()) { bool default_layout_exists; decode(default_layout_exists, bl); if 
(default_layout_exists) { decode(struct_v, bl); // this was a default_file_layout decode(_inode->layout, bl); // but we only care about the layout portion } } if (struct_v >= 5) { // InodeStore is embedded in dentries without proper versioning, so // we consume up to the end of the buffer if (!bl.end()) { decode(oldest_snap, bl); } if (!bl.end()) { decode(damage_flags, bl); } } reset_inode(std::move(_inode)); } void InodeStoreBase::decode(bufferlist::const_iterator &bl, bufferlist& snap_blob) { DECODE_START_LEGACY_COMPAT_LEN(5, 4, 4, bl); decode_bare(bl, snap_blob, struct_v); DECODE_FINISH(bl); } void CInode::decode_store(bufferlist::const_iterator& bl) { bufferlist snap_blob; InodeStoreBase::decode(bl, snap_blob); decode_snap_blob(snap_blob); } // ------------------ // locking SimpleLock* CInode::get_lock(int type) { switch (type) { case CEPH_LOCK_IVERSION: return &versionlock; case CEPH_LOCK_IFILE: return &filelock; case CEPH_LOCK_IAUTH: return &authlock; case CEPH_LOCK_ILINK: return &linklock; case CEPH_LOCK_IDFT: return &dirfragtreelock; case CEPH_LOCK_IXATTR: return &xattrlock; case CEPH_LOCK_ISNAP: return &snaplock; case CEPH_LOCK_INEST: return &nestlock; case CEPH_LOCK_IFLOCK: return &flocklock; case CEPH_LOCK_IPOLICY: return &policylock; } return 0; } void CInode::set_object_info(MDSCacheObjectInfo &info) { info.ino = ino(); info.snapid = last; } void CInode::encode_lock_iauth(bufferlist& bl) { ENCODE_START(1, 1, bl); encode(get_inode()->version, bl); encode(get_inode()->ctime, bl); encode(get_inode()->mode, bl); encode(get_inode()->uid, bl); encode(get_inode()->gid, bl); ENCODE_FINISH(bl); } void CInode::decode_lock_iauth(bufferlist::const_iterator& p) { ceph_assert(!is_auth()); auto _inode = allocate_inode(*get_inode()); DECODE_START(1, p); decode(_inode->version, p); utime_t tm; decode(tm, p); if (_inode->ctime < tm) _inode->ctime = tm; decode(_inode->mode, p); decode(_inode->uid, p); decode(_inode->gid, p); DECODE_FINISH(p); 
reset_inode(std::move(_inode)); } void CInode::encode_lock_ilink(bufferlist& bl) { ENCODE_START(1, 1, bl); encode(get_inode()->version, bl); encode(get_inode()->ctime, bl); encode(get_inode()->nlink, bl); ENCODE_FINISH(bl); } void CInode::decode_lock_ilink(bufferlist::const_iterator& p) { ceph_assert(!is_auth()); auto _inode = allocate_inode(*get_inode()); DECODE_START(1, p); decode(_inode->version, p); utime_t tm; decode(tm, p); if (_inode->ctime < tm) _inode->ctime = tm; decode(_inode->nlink, p); DECODE_FINISH(p); reset_inode(std::move(_inode)); } void CInode::encode_lock_idft(bufferlist& bl) { ENCODE_START(1, 1, bl); if (is_auth()) { encode(get_inode()->version, bl); } else { // treat flushing as dirty when rejoining cache bool dirty = dirfragtreelock.is_dirty_or_flushing(); encode(dirty, bl); } { // encode the raw tree encode(dirfragtree, bl); // also specify which frags are mine set myfrags; auto&& dfls = get_dirfrags(); for (const auto& dir : dfls) { if (dir->is_auth()) { frag_t fg = dir->get_frag(); myfrags.insert(fg); } } encode(myfrags, bl); } ENCODE_FINISH(bl); } void CInode::decode_lock_idft(bufferlist::const_iterator& p) { inode_ptr _inode; DECODE_START(1, p); if (is_auth()) { bool replica_dirty; decode(replica_dirty, p); if (replica_dirty) { dout(10) << __func__ << " setting dftlock dirty flag" << dendl; dirfragtreelock.mark_dirty(); // ok bc we're auth and caller will handle } } else { _inode = allocate_inode(*get_inode()); decode(_inode->version, p); } { fragtree_t temp; decode(temp, p); set authfrags; decode(authfrags, p); if (is_auth()) { // auth. believe replica's auth frags only. for (auto fg : authfrags) { if (!dirfragtree.is_leaf(fg)) { dout(10) << " forcing frag " << fg << " to leaf (split|merge)" << dendl; dirfragtree.force_to_leaf(g_ceph_context, fg); dirfragtreelock.mark_dirty(); // ok bc we're auth and caller will handle } } } else { // replica. 
take the new tree, BUT make sure any open // dirfrags remain leaves (they may have split _after_ this // dft was scattered, or we may still be be waiting on the // notify from the auth) dirfragtree.swap(temp); for (const auto &p : dirfrags) { if (!dirfragtree.is_leaf(p.first)) { dout(10) << " forcing open dirfrag " << p.first << " to leaf (racing with split|merge)" << dendl; dirfragtree.force_to_leaf(g_ceph_context, p.first); } if (p.second->is_auth()) p.second->state_clear(CDir::STATE_DIRTYDFT); } } if (g_conf()->mds_debug_frag) verify_dirfrags(); } DECODE_FINISH(p); if (_inode) reset_inode(std::move(_inode)); } void CInode::encode_lock_ifile(bufferlist& bl) { ENCODE_START(1, 1, bl); if (is_auth()) { encode(get_inode()->version, bl); encode(get_inode()->ctime, bl); encode(get_inode()->mtime, bl); encode(get_inode()->atime, bl); encode(get_inode()->time_warp_seq, bl); if (!is_dir()) { encode(get_inode()->layout, bl, mdcache->mds->mdsmap->get_up_features()); encode(get_inode()->size, bl); encode(get_inode()->truncate_seq, bl); encode(get_inode()->truncate_size, bl); encode(get_inode()->client_ranges, bl); encode(get_inode()->inline_data, bl); } } else { // treat flushing as dirty when rejoining cache bool dirty = filelock.is_dirty_or_flushing(); encode(dirty, bl); } dout(15) << __func__ << " inode.dirstat is " << get_inode()->dirstat << dendl; encode(get_inode()->dirstat, bl); // only meaningful if i am auth. 
bufferlist tmp; __u32 n = 0; for (const auto &p : dirfrags) { frag_t fg = p.first; CDir *dir = p.second; if (is_auth() || dir->is_auth()) { const auto& pf = dir->get_projected_fnode(); dout(15) << fg << " " << *dir << dendl; dout(20) << fg << " fragstat " << pf->fragstat << dendl; dout(20) << fg << " accounted_fragstat " << pf->accounted_fragstat << dendl; encode(fg, tmp); encode(dir->first, tmp); encode(pf->fragstat, tmp); encode(pf->accounted_fragstat, tmp); n++; } } encode(n, bl); bl.claim_append(tmp); ENCODE_FINISH(bl); } void CInode::decode_lock_ifile(bufferlist::const_iterator& p) { inode_ptr _inode; DECODE_START(1, p); if (!is_auth()) { _inode = allocate_inode(*get_inode()); decode(_inode->version, p); utime_t tm; decode(tm, p); if (_inode->ctime < tm) _inode->ctime = tm; decode(_inode->mtime, p); decode(_inode->atime, p); decode(_inode->time_warp_seq, p); if (!is_dir()) { decode(_inode->layout, p); decode(_inode->size, p); decode(_inode->truncate_seq, p); decode(_inode->truncate_size, p); decode(_inode->client_ranges, p); decode(_inode->inline_data, p); } } else { bool replica_dirty; decode(replica_dirty, p); if (replica_dirty) { dout(10) << __func__ << " setting filelock dirty flag" << dendl; filelock.mark_dirty(); // ok bc we're auth and caller will handle } } frag_info_t dirstat; decode(dirstat, p); if (!is_auth()) { dout(10) << " taking inode dirstat " << dirstat << " for " << *this << dendl; _inode->dirstat = dirstat; // take inode summation if replica } __u32 n; decode(n, p); dout(10) << " ...got " << n << " fragstats on " << *this << dendl; while (n--) { frag_t fg; snapid_t fgfirst; frag_info_t fragstat; frag_info_t accounted_fragstat; decode(fg, p); decode(fgfirst, p); decode(fragstat, p); decode(accounted_fragstat, p); dout(10) << fg << " [" << fgfirst << ",head] " << dendl; dout(10) << fg << " fragstat " << fragstat << dendl; dout(20) << fg << " accounted_fragstat " << accounted_fragstat << dendl; CDir *dir = get_dirfrag(fg); if (is_auth()) { 
ceph_assert(dir); // i am auth; i had better have this dir open dout(10) << fg << " first " << dir->first << " -> " << fgfirst << " on " << *dir << dendl; dir->first = fgfirst; auto _fnode = CDir::allocate_fnode(*dir->get_fnode()); _fnode->fragstat = fragstat; _fnode->accounted_fragstat = accounted_fragstat; dir->reset_fnode(std::move(_fnode)); if (!(fragstat == accounted_fragstat)) { dout(10) << fg << " setting filelock updated flag" << dendl; filelock.mark_dirty(); // ok bc we're auth and caller will handle } } else { if (dir && dir->is_auth()) { dout(10) << fg << " first " << dir->first << " -> " << fgfirst << " on " << *dir << dendl; dir->first = fgfirst; const auto& pf = dir->get_projected_fnode(); finish_scatter_update(&filelock, dir, _inode->dirstat.version, pf->accounted_fragstat.version); } } } DECODE_FINISH(p); if (_inode) reset_inode(std::move(_inode)); } void CInode::encode_lock_inest(bufferlist& bl) { ENCODE_START(1, 1, bl); if (is_auth()) { encode(get_inode()->version, bl); } else { // treat flushing as dirty when rejoining cache bool dirty = nestlock.is_dirty_or_flushing(); encode(dirty, bl); } dout(15) << __func__ << " inode.rstat is " << get_inode()->rstat << dendl; encode(get_inode()->rstat, bl); // only meaningful if i am auth. 
bufferlist tmp; __u32 n = 0; for (const auto &p : dirfrags) { frag_t fg = p.first; CDir *dir = p.second; if (is_auth() || dir->is_auth()) { const auto& pf = dir->get_projected_fnode(); dout(10) << __func__ << " " << fg << " dir " << *dir << dendl; dout(10) << __func__ << " " << fg << " rstat " << pf->rstat << dendl; dout(10) << __func__ << " " << fg << " accounted_rstat " << pf->rstat << dendl; dout(10) << __func__ << " " << fg << " dirty_old_rstat " << dir->dirty_old_rstat << dendl; encode(fg, tmp); encode(dir->first, tmp); encode(pf->rstat, tmp); encode(pf->accounted_rstat, tmp); encode(dir->dirty_old_rstat, tmp); n++; } } encode(n, bl); bl.claim_append(tmp); ENCODE_FINISH(bl); } void CInode::decode_lock_inest(bufferlist::const_iterator& p) { inode_ptr _inode; DECODE_START(1, p); if (is_auth()) { bool replica_dirty; decode(replica_dirty, p); if (replica_dirty) { dout(10) << __func__ << " setting nestlock dirty flag" << dendl; nestlock.mark_dirty(); // ok bc we're auth and caller will handle } } else { _inode = allocate_inode(*get_inode()); decode(_inode->version, p); } nest_info_t rstat; decode(rstat, p); if (!is_auth()) { dout(10) << __func__ << " taking inode rstat " << rstat << " for " << *this << dendl; _inode->rstat = rstat; // take inode summation if replica } __u32 n; decode(n, p); while (n--) { frag_t fg; snapid_t fgfirst; nest_info_t rstat; nest_info_t accounted_rstat; decltype(CDir::dirty_old_rstat) dirty_old_rstat; decode(fg, p); decode(fgfirst, p); decode(rstat, p); decode(accounted_rstat, p); decode(dirty_old_rstat, p); dout(10) << __func__ << " " << fg << " [" << fgfirst << ",head]" << dendl; dout(10) << __func__ << " " << fg << " rstat " << rstat << dendl; dout(10) << __func__ << " " << fg << " accounted_rstat " << accounted_rstat << dendl; dout(10) << __func__ << " " << fg << " dirty_old_rstat " << dirty_old_rstat << dendl; CDir *dir = get_dirfrag(fg); if (is_auth()) { ceph_assert(dir); // i am auth; i had better have this dir open dout(10) << fg 
<< " first " << dir->first << " -> " << fgfirst << " on " << *dir << dendl; dir->first = fgfirst; auto _fnode = CDir::allocate_fnode(*dir->get_fnode()); _fnode->rstat = rstat; _fnode->accounted_rstat = accounted_rstat; dir->reset_fnode(std::move(_fnode)); dir->dirty_old_rstat.swap(dirty_old_rstat); if (!(rstat == accounted_rstat) || !dir->dirty_old_rstat.empty()) { dout(10) << fg << " setting nestlock updated flag" << dendl; nestlock.mark_dirty(); // ok bc we're auth and caller will handle } } else { if (dir && dir->is_auth()) { dout(10) << fg << " first " << dir->first << " -> " << fgfirst << " on " << *dir << dendl; dir->first = fgfirst; const auto& pf = dir->get_projected_fnode(); finish_scatter_update(&nestlock, dir, _inode->rstat.version, pf->accounted_rstat.version); } } } DECODE_FINISH(p); if (_inode) reset_inode(std::move(_inode)); } void CInode::encode_lock_ixattr(bufferlist& bl) { ENCODE_START(2, 1, bl); encode(get_inode()->version, bl); encode(get_inode()->ctime, bl); encode_xattrs(bl); encode(get_inode()->xattr_version, bl); ENCODE_FINISH(bl); } void CInode::decode_lock_ixattr(bufferlist::const_iterator& p) { ceph_assert(!is_auth()); auto _inode = allocate_inode(*get_inode()); DECODE_START(2, p); decode(_inode->version, p); utime_t tm; decode(tm, p); if (_inode->ctime < tm) _inode->ctime = tm; decode_xattrs(p); if (struct_v >= 2) { decode(_inode->xattr_version, p); } DECODE_FINISH(p); reset_inode(std::move(_inode)); } void CInode::encode_lock_isnap(bufferlist& bl) { ENCODE_START(1, 1, bl); encode(get_inode()->version, bl); encode(get_inode()->ctime, bl); encode_snap(bl); ENCODE_FINISH(bl); } void CInode::decode_lock_isnap(bufferlist::const_iterator& p) { ceph_assert(!is_auth()); auto _inode = allocate_inode(*get_inode()); DECODE_START(1, p); decode(_inode->version, p); utime_t tm; decode(tm, p); if (_inode->ctime < tm) _inode->ctime = tm; decode_snap(p); DECODE_FINISH(p); reset_inode(std::move(_inode)); } void CInode::encode_lock_iflock(bufferlist& bl) 
{ ENCODE_START(1, 1, bl); encode(get_inode()->version, bl); _encode_file_locks(bl); ENCODE_FINISH(bl); } void CInode::decode_lock_iflock(bufferlist::const_iterator& p) { ceph_assert(!is_auth()); auto _inode = allocate_inode(*get_inode()); DECODE_START(1, p); decode(_inode->version, p); _decode_file_locks(p); DECODE_FINISH(p); reset_inode(std::move(_inode)); } void CInode::encode_lock_ipolicy(bufferlist& bl) { ENCODE_START(2, 1, bl); if (is_dir()) { encode(get_inode()->version, bl); encode(get_inode()->ctime, bl); encode(get_inode()->layout, bl, mdcache->mds->mdsmap->get_up_features()); encode(get_inode()->quota, bl); encode(get_inode()->export_pin, bl); encode(get_inode()->export_ephemeral_distributed_pin, bl); encode(get_inode()->export_ephemeral_random_pin, bl); } ENCODE_FINISH(bl); } void CInode::decode_lock_ipolicy(bufferlist::const_iterator& p) { ceph_assert(!is_auth()); auto _inode = allocate_inode(*get_inode()); DECODE_START(1, p); if (is_dir()) { decode(_inode->version, p); utime_t tm; decode(tm, p); if (_inode->ctime < tm) _inode->ctime = tm; decode(_inode->layout, p); decode(_inode->quota, p); decode(_inode->export_pin, p); if (struct_v >= 2) { decode(_inode->export_ephemeral_distributed_pin, p); decode(_inode->export_ephemeral_random_pin, p); } } DECODE_FINISH(p); bool pin_updated = (get_inode()->export_pin != _inode->export_pin) || (get_inode()->export_ephemeral_distributed_pin != _inode->export_ephemeral_distributed_pin); reset_inode(std::move(_inode)); maybe_export_pin(pin_updated); } void CInode::encode_lock_state(int type, bufferlist& bl) { ENCODE_START(1, 1, bl); encode(first, bl); if (!is_base()) encode(parent->first, bl); switch (type) { case CEPH_LOCK_IAUTH: encode_lock_iauth(bl); break; case CEPH_LOCK_ILINK: encode_lock_ilink(bl); break; case CEPH_LOCK_IDFT: encode_lock_idft(bl); break; case CEPH_LOCK_IFILE: encode_lock_ifile(bl); break; case CEPH_LOCK_INEST: encode_lock_inest(bl); break; case CEPH_LOCK_IXATTR: encode_lock_ixattr(bl); break; 
case CEPH_LOCK_ISNAP: encode_lock_isnap(bl); break; case CEPH_LOCK_IFLOCK: encode_lock_iflock(bl); break; case CEPH_LOCK_IPOLICY: encode_lock_ipolicy(bl); break; default: ceph_abort(); } ENCODE_FINISH(bl); } /* for more info on scatterlocks, see comments by Locker::scatter_writebehind */ void CInode::decode_lock_state(int type, const bufferlist& bl) { auto p = bl.cbegin(); DECODE_START(1, p); utime_t tm; snapid_t newfirst; using ceph::decode; decode(newfirst, p); if (!is_auth() && newfirst != first) { dout(10) << __func__ << " first " << first << " -> " << newfirst << dendl; first = newfirst; } if (!is_base()) { decode(newfirst, p); if (!parent->is_auth() && newfirst != parent->first) { dout(10) << __func__ << " parent first " << first << " -> " << newfirst << dendl; parent->first = newfirst; } } switch (type) { case CEPH_LOCK_IAUTH: decode_lock_iauth(p); break; case CEPH_LOCK_ILINK: decode_lock_ilink(p); break; case CEPH_LOCK_IDFT: decode_lock_idft(p); break; case CEPH_LOCK_IFILE: decode_lock_ifile(p); break; case CEPH_LOCK_INEST: decode_lock_inest(p); break; case CEPH_LOCK_IXATTR: decode_lock_ixattr(p); break; case CEPH_LOCK_ISNAP: decode_lock_isnap(p); break; case CEPH_LOCK_IFLOCK: decode_lock_iflock(p); break; case CEPH_LOCK_IPOLICY: decode_lock_ipolicy(p); break; default: ceph_abort(); } DECODE_FINISH(p); } bool CInode::is_dirty_scattered() { return filelock.is_dirty_or_flushing() || nestlock.is_dirty_or_flushing() || dirfragtreelock.is_dirty_or_flushing(); } void CInode::clear_scatter_dirty() { filelock.remove_dirty(); nestlock.remove_dirty(); dirfragtreelock.remove_dirty(); } void CInode::clear_dirty_scattered(int type) { dout(10) << __func__ << " " << type << " on " << *this << dendl; ceph_assert(is_dir()); switch (type) { case CEPH_LOCK_IFILE: item_dirty_dirfrag_dir.remove_myself(); break; case CEPH_LOCK_INEST: item_dirty_dirfrag_nest.remove_myself(); break; case CEPH_LOCK_IDFT: item_dirty_dirfrag_dirfragtree.remove_myself(); break; default: ceph_abort(); 
} } /* * when we initially scatter a lock, we need to check if any of the dirfrags * have out of date accounted_rstat/fragstat. if so, mark the lock stale. */ /* for more info on scatterlocks, see comments by Locker::scatter_writebehind */ void CInode::start_scatter(ScatterLock *lock) { dout(10) << __func__ << " " << *lock << " on " << *this << dendl; ceph_assert(is_auth()); const auto& pi = get_projected_inode(); for (const auto &p : dirfrags) { frag_t fg = p.first; CDir *dir = p.second; const auto& pf = dir->get_projected_fnode(); dout(20) << fg << " " << *dir << dendl; if (!dir->is_auth()) continue; switch (lock->get_type()) { case CEPH_LOCK_IFILE: finish_scatter_update(lock, dir, pi->dirstat.version, pf->accounted_fragstat.version); break; case CEPH_LOCK_INEST: finish_scatter_update(lock, dir, pi->rstat.version, pf->accounted_rstat.version); break; case CEPH_LOCK_IDFT: dir->state_clear(CDir::STATE_DIRTYDFT); break; } } } class C_Inode_FragUpdate : public MDSLogContextBase { protected: CInode *in; CDir *dir; MutationRef mut; MDSRank *get_mds() override {return in->mdcache->mds;} void finish(int r) override { in->_finish_frag_update(dir, mut); } public: C_Inode_FragUpdate(CInode *i, CDir *d, MutationRef& m) : in(i), dir(d), mut(m) {} }; void CInode::finish_scatter_update(ScatterLock *lock, CDir *dir, version_t inode_version, version_t dir_accounted_version) { frag_t fg = dir->get_frag(); ceph_assert(dir->is_auth()); if (dir->is_frozen()) { dout(10) << __func__ << " " << fg << " frozen, marking " << *lock << " stale " << *dir << dendl; } else if (dir->get_version() == 0) { dout(10) << __func__ << " " << fg << " not loaded, marking " << *lock << " stale " << *dir << dendl; } else { if (dir_accounted_version != inode_version) { dout(10) << __func__ << " " << fg << " journaling accounted scatterstat update v" << inode_version << dendl; MDLog *mdlog = mdcache->mds->mdlog; MutationRef mut(new MutationImpl()); mut->ls = mdlog->get_current_segment(); auto pf = 
dir->project_fnode(mut); std::string_view ename; switch (lock->get_type()) { case CEPH_LOCK_IFILE: pf->fragstat.version = inode_version; pf->accounted_fragstat = pf->fragstat; ename = "lock ifile accounted scatter stat update"; break; case CEPH_LOCK_INEST: pf->rstat.version = inode_version; pf->accounted_rstat = pf->rstat; ename = "lock inest accounted scatter stat update"; if (!is_auth() && lock->get_state() == LOCK_MIX) { dout(10) << __func__ << " try to assimilate dirty rstat on " << *dir << dendl; dir->assimilate_dirty_rstat_inodes(mut); } break; default: ceph_abort(); } EUpdate *le = new EUpdate(mdlog, ename); mdlog->start_entry(le); le->metablob.add_dir_context(dir); le->metablob.add_dir(dir, true); ceph_assert(!dir->is_frozen()); mut->auth_pin(dir); if (lock->get_type() == CEPH_LOCK_INEST && !is_auth() && lock->get_state() == LOCK_MIX) { dout(10) << __func__ << " finish assimilating dirty rstat on " << *dir << dendl; dir->assimilate_dirty_rstat_inodes_finish(&le->metablob); if (!(pf->rstat == pf->accounted_rstat)) { if (!mut->is_wrlocked(&nestlock)) { mdcache->mds->locker->wrlock_force(&nestlock, mut); } mdcache->mds->locker->mark_updated_scatterlock(&nestlock); mut->ls->dirty_dirfrag_nest.push_back(&item_dirty_dirfrag_nest); } } pf->version = dir->pre_dirty(); mdlog->submit_entry(le, new C_Inode_FragUpdate(this, dir, mut)); } else { dout(10) << __func__ << " " << fg << " accounted " << *lock << " scatter stat unchanged at v" << dir_accounted_version << dendl; } } } void CInode::_finish_frag_update(CDir *dir, MutationRef& mut) { dout(10) << __func__ << " on " << *dir << dendl; mut->apply(); mdcache->mds->locker->drop_locks(mut.get()); mut->cleanup(); } /* * when we gather a lock, we need to assimilate dirfrag changes into the inode * state. it's possible we can't update the dirfrag accounted_rstat/fragstat * because the frag is auth and frozen, or that the replica couldn't for the same * reason. hopefully it will get updated the next time the lock cycles. 
* * we have two dimensions of behavior: * - we may be (auth and !frozen), and able to update, or not. * - the frag may be stale, or not. * * if the frag is non-stale, we want to assimilate the diff into the * inode, regardless of whether it's auth or updateable. * * if we update the frag, we want to set accounted_fragstat = frag, * both if we took the diff or it was stale and we are making it * un-stale. */ /* for more info on scatterlocks, see comments by Locker::scatter_writebehind */ void CInode::finish_scatter_gather_update(int type, MutationRef& mut) { LogChannelRef clog = mdcache->mds->clog; dout(10) << __func__ << " " << type << " on " << *this << dendl; ceph_assert(is_auth()); switch (type) { case CEPH_LOCK_IFILE: { fragtree_t tmpdft = dirfragtree; struct frag_info_t dirstat; bool dirstat_valid = true; // adjust summation ceph_assert(is_auth()); auto pi = _get_projected_inode(); bool touched_mtime = false, touched_chattr = false; dout(20) << " orig dirstat " << pi->dirstat << dendl; pi->dirstat.version++; for (const auto &p : dirfrags) { frag_t fg = p.first; CDir *dir = p.second; dout(20) << fg << " " << *dir << dendl; bool update; if (dir->get_version() != 0) { update = dir->is_auth() && !dir->is_frozen(); } else { update = false; dirstat_valid = false; } CDir::fnode_const_ptr pf; if (update) { mut->auth_pin(dir); pf = dir->project_fnode(mut); } else { pf = dir->get_projected_fnode(); } if (pf->accounted_fragstat.version == pi->dirstat.version - 1) { dout(20) << fg << " fragstat " << pf->fragstat << dendl; dout(20) << fg << " accounted_fragstat " << pf->accounted_fragstat << dendl; pi->dirstat.add_delta(pf->fragstat, pf->accounted_fragstat, &touched_mtime, &touched_chattr); } else { dout(20) << fg << " skipping STALE accounted_fragstat " << pf->accounted_fragstat << dendl; } if (pf->fragstat.nfiles < 0 || pf->fragstat.nsubdirs < 0) { clog->error() << "bad/negative dir size on " << dir->dirfrag() << " " << pf->fragstat; ceph_assert(!"bad/negative fragstat" 
== g_conf()->mds_verify_scatter); auto _pf = const_cast(pf.get()); if (pf->fragstat.nfiles < 0) _pf->fragstat.nfiles = 0; if (pf->fragstat.nsubdirs < 0) _pf->fragstat.nsubdirs = 0; } if (update) { auto _pf = const_cast(pf.get()); _pf->accounted_fragstat = _pf->fragstat; _pf->fragstat.version = _pf->accounted_fragstat.version = pi->dirstat.version; _pf->version = dir->pre_dirty(); dout(10) << fg << " updated accounted_fragstat " << pf->fragstat << " on " << *dir << dendl; } tmpdft.force_to_leaf(g_ceph_context, fg); dirstat.add(pf->fragstat); } if (touched_mtime) pi->mtime = pi->ctime = pi->dirstat.mtime; if (touched_chattr) pi->change_attr++; dout(20) << " final dirstat " << pi->dirstat << dendl; if (dirstat_valid && !dirstat.same_sums(pi->dirstat)) { frag_vec_t leaves; tmpdft.get_leaves_under(frag_t(), leaves); for (const auto& leaf : leaves) { if (!dirfrags.count(leaf)) { dirstat_valid = false; break; } } if (dirstat_valid) { if (state_test(CInode::STATE_REPAIRSTATS)) { dout(20) << " dirstat mismatch, fixing" << dendl; } else { clog->error() << "unmatched fragstat on " << ino() << ", inode has " << pi->dirstat << ", dirfrags have " << dirstat; ceph_assert(!"unmatched fragstat" == g_conf()->mds_verify_scatter); } // trust the dirfrags for now version_t v = pi->dirstat.version; if (pi->dirstat.mtime > dirstat.mtime) dirstat.mtime = pi->dirstat.mtime; if (pi->dirstat.change_attr > dirstat.change_attr) dirstat.change_attr = pi->dirstat.change_attr; pi->dirstat = dirstat; pi->dirstat.version = v; } } if (pi->dirstat.nfiles < 0 || pi->dirstat.nsubdirs < 0) { std::string path; make_path_string(path); clog->error() << "Inconsistent statistics detected: fragstat on inode " << ino() << " (" << path << "), inode has " << pi->dirstat; ceph_assert(!"bad/negative fragstat" == g_conf()->mds_verify_scatter); if (pi->dirstat.nfiles < 0) pi->dirstat.nfiles = 0; if (pi->dirstat.nsubdirs < 0) pi->dirstat.nsubdirs = 0; } } break; case CEPH_LOCK_INEST: { // adjust summation 
ceph_assert(is_auth()); fragtree_t tmpdft = dirfragtree; nest_info_t rstat; bool rstat_valid = true; rstat.rsubdirs = 1; if (const sr_t *srnode = get_projected_srnode(); srnode) rstat.rsnaps = srnode->snaps.size(); auto pi = _get_projected_inode(); dout(20) << " orig rstat " << pi->rstat << dendl; pi->rstat.version++; for (const auto &p : dirfrags) { frag_t fg = p.first; CDir *dir = p.second; dout(20) << fg << " " << *dir << dendl; bool update; if (dir->get_version() != 0) { update = dir->is_auth() && !dir->is_frozen(); } else { update = false; rstat_valid = false; } CDir::fnode_const_ptr pf; if (update) { mut->auth_pin(dir); pf = dir->project_fnode(mut); } else { pf = dir->get_projected_fnode(); } if (pf->accounted_rstat.version == pi->rstat.version-1) { // only pull this frag's dirty rstat inodes into the frag if // the frag is non-stale and updateable. if it's stale, // that info will just get thrown out! if (update) dir->assimilate_dirty_rstat_inodes(mut); dout(20) << fg << " rstat " << pf->rstat << dendl; dout(20) << fg << " accounted_rstat " << pf->accounted_rstat << dendl; dout(20) << fg << " dirty_old_rstat " << dir->dirty_old_rstat << dendl; mdcache->project_rstat_frag_to_inode(pf->rstat, pf->accounted_rstat, dir->first, CEPH_NOSNAP, this, true); for (auto &p : dir->dirty_old_rstat) { mdcache->project_rstat_frag_to_inode(p.second.rstat, p.second.accounted_rstat, p.second.first, p.first, this, true); } if (update) // dir contents not valid if frozen or non-auth dir->check_rstats(); } else { dout(20) << fg << " skipping STALE accounted_rstat " << pf->accounted_rstat << dendl; } if (update) { auto _pf = const_cast(pf.get()); _pf->accounted_rstat = pf->rstat; _pf->rstat.version = _pf->accounted_rstat.version = pi->rstat.version; _pf->version = dir->pre_dirty(); dir->dirty_old_rstat.clear(); dir->check_rstats(); dout(10) << fg << " updated accounted_rstat " << pf->rstat << " on " << *dir << dendl; } tmpdft.force_to_leaf(g_ceph_context, fg); 
rstat.add(pf->rstat);
    }

    dout(20) << " final rstat " << pi->rstat << dendl;

    if (rstat_valid && !rstat.same_sums(pi->rstat)) {
      // only trust the summed rstat if every fragtree leaf had a cached
      // dirfrag that contributed to the sum above
      frag_vec_t leaves;
      tmpdft.get_leaves_under(frag_t(), leaves);
      for (const auto& leaf : leaves) {
        if (!dirfrags.count(leaf)) {
          rstat_valid = false;
          break;
        }
      }
      if (rstat_valid) {
        if (state_test(CInode::STATE_REPAIRSTATS)) {
          dout(20) << " rstat mismatch, fixing" << dendl;
        } else {
          clog->error() << "inconsistent rstat on inode " << ino()
                        << ", inode has " << pi->rstat
                        << ", directory fragments have " << rstat;
          // !"..." is always false, so this assert fires exactly when
          // mds_verify_scatter is enabled
          ceph_assert(!"unmatched rstat" == g_conf()->mds_verify_scatter);
        }
        // trust the dirfrag for now
        version_t v = pi->rstat.version;
        if (pi->rstat.rctime > rstat.rctime)
          rstat.rctime = pi->rstat.rctime;
        pi->rstat = rstat;
        pi->rstat.version = v;
      }
    }

    mdcache->broadcast_quota_to_client(this);
  }
  break;

  case CEPH_LOCK_IDFT:
    break;

  default:
    ceph_abort();
  }
}

/**
 * Journal the dirfrag updates that finish_scatter_gather_update() projected:
 * add each updatable dirfrag to the metablob (and, for the nest lock, finish
 * assimilating dirty-rstat inodes into it first).
 *
 * @param type scatter/gather lock type (CEPH_LOCK_IFILE/INEST/IDFT)
 * @param metablob journal event blob to record the projected dirfrags in
 */
void CInode::finish_scatter_gather_update_accounted(int type, EMetaBlob *metablob)
{
  dout(10) << __func__ << " " << type << " on " << *this << dendl;
  ceph_assert(is_auth());

  for (const auto &p : dirfrags) {
    CDir *dir = p.second;
    // skip frags that finish_scatter_gather_update() could not have projected
    if (!dir->is_auth() || dir->get_version() == 0 || dir->is_frozen())
      continue;

    if (type == CEPH_LOCK_IDFT)
      continue;  // nothing to do.

    if (type == CEPH_LOCK_INEST)
      dir->assimilate_dirty_rstat_inodes_finish(metablob);

    dout(10) << " journaling updated frag accounted_ on " << *dir << dendl;
    ceph_assert(dir->is_projected());
    metablob->add_dir(dir, true);
  }
}

// waiting

// An inode is frozen if it is itself frozen or any ancestor dirfrag is.
bool CInode::is_frozen() const
{
  if (is_frozen_inode()) return true;
  if (parent && parent->dir->is_frozen()) return true;
  return false;
}

// True only when the containing dirfrag is frozen as a directory
// (does NOT consider the inode's own frozen state).
bool CInode::is_frozen_dir() const
{
  if (parent && parent->dir->is_frozen_dir()) return true;
  return false;
}

bool CInode::is_freezing() const
{
  if (is_freezing_inode()) return true;
  if (parent && parent->dir->is_freezing()) return true;
  return false;
}

// Queue a waiter for a dirfrag that is not open yet; the first waiter
// pins the inode with PIN_DIRWAITER.
void CInode::add_dir_waiter(frag_t fg, MDSContext *c)
{
  if (waiting_on_dir.empty())
    get(PIN_DIRWAITER);
  waiting_on_dir[fg].push_back(c);
  dout(10) << __func__ << " frag " << fg << " " << c << " on " << *this << dendl;
}

// Move any waiters registered for frag 'fg' into 'ls'; drops the
// PIN_DIRWAITER pin once the last waiting frag is drained.
void CInode::take_dir_waiting(frag_t fg, MDSContext::vec& ls)
{
  if (waiting_on_dir.empty())
    return;

  auto it = waiting_on_dir.find(fg);
  if (it != waiting_on_dir.end()) {
    dout(10) << __func__ << " frag " << fg << " on " << *this << dendl;
    auto& waiting = it->second;
    ls.insert(ls.end(), waiting.begin(), waiting.end());
    waiting_on_dir.erase(it);
    if (waiting_on_dir.empty())
      put(PIN_DIRWAITER);
  }
}

void CInode::add_waiter(uint64_t tag, MDSContext *c)
{
  dout(10) << __func__ << " tag " << std::hex << tag << std::dec << " " << c
	   << " !ambig " << !state_test(STATE_AMBIGUOUSAUTH)
	   << " !frozen " << !is_frozen_inode()
	   << " !freezing " << !is_freezing_inode()
	   << dendl;
  // wait on the directory?
// make sure it's not the inode that is explicitly ambiguous|freezing|frozen
  if (((tag & WAIT_SINGLEAUTH) && !state_test(STATE_AMBIGUOUSAUTH)) ||
      ((tag & WAIT_UNFREEZE) &&
       !is_frozen_inode() && !is_freezing_inode() && !is_frozen_auth_pin())) {
    dout(15) << "passing waiter up tree" << dendl;
    parent->dir->add_waiter(tag, c);
    return;
  }
  dout(15) << "taking waiter here" << dendl;
  MDSCacheObject::add_waiter(tag, c);
}

void CInode::take_waiting(uint64_t mask, MDSContext::vec& ls)
{
  if ((mask & WAIT_DIR) && !waiting_on_dir.empty()) {
    // take all dentry waiters
    while (!waiting_on_dir.empty()) {
      auto it = waiting_on_dir.begin();
      dout(10) << __func__ << " dirfrag " << it->first << " on " << *this << dendl;
      auto& waiting = it->second;
      ls.insert(ls.end(), waiting.begin(), waiting.end());
      waiting_on_dir.erase(it);
    }
    put(PIN_DIRWAITER);
  }

  // waiting
  MDSCacheObject::take_waiting(mask, ls);
}

// Complete a pending inode freeze once auth_pins have drained down to the
// allowance and the parent dir is no longer suppressing inode freezes.
void CInode::maybe_finish_freeze_inode()
{
  CDir *dir = get_parent_dir();
  if (auth_pins > auth_pin_freeze_allowance || dir->frozen_inode_suppressed)
    return;

  dout(10) << "maybe_finish_freeze_inode - frozen" << dendl;
  ceph_assert(auth_pins == auth_pin_freeze_allowance);
  get(PIN_FROZEN);
  put(PIN_FREEZING);
  state_clear(STATE_FREEZING);
  state_set(STATE_FROZEN);
  item_freezing_inode.remove_myself();
  dir->num_frozen_inodes++;

  finish_waiting(WAIT_FROZEN);
}

/**
 * Try to freeze this inode.
 *
 * @param auth_pin_allowance number of auth pins the caller itself holds and
 *        therefore tolerates while frozen (must be > 0)
 * @return true if the inode is FROZEN on return, false if it is now
 *         FREEZING, waiting for the remaining auth pins to drain
 */
bool CInode::freeze_inode(int auth_pin_allowance)
{
  CDir *dir = get_parent_dir();
  ceph_assert(dir);

  ceph_assert(auth_pin_allowance > 0); // otherwise we need to adjust parent's nested_auth_pins
  ceph_assert(auth_pins >= auth_pin_allowance);
  if (auth_pins == auth_pin_allowance && !dir->frozen_inode_suppressed) {
    // no extra pins held: freeze immediately
    dout(10) << "freeze_inode - frozen" << dendl;
    if (!state_test(STATE_FROZEN)) {
      get(PIN_FROZEN);
      state_set(STATE_FROZEN);
      dir->num_frozen_inodes++;
    }
    return true;
  }

  dout(10) << "freeze_inode - waiting for auth_pins to drop to "
	   << auth_pin_allowance << dendl;
  auth_pin_freeze_allowance = auth_pin_allowance;
  dir->freezing_inodes.push_back(&item_freezing_inode);
  get(PIN_FREEZING);
  state_set(STATE_FREEZING);

  // lock caches hold auth pins; invalidate them so those pins can drain
  if (!dir->lock_caches_with_auth_pins.empty())
    mdcache->mds->locker->invalidate_lock_caches(dir);

  const static int lock_types[] = {
    CEPH_LOCK_IVERSION, CEPH_LOCK_IFILE, CEPH_LOCK_IAUTH, CEPH_LOCK_ILINK,
    CEPH_LOCK_IDFT, CEPH_LOCK_IXATTR, CEPH_LOCK_ISNAP, CEPH_LOCK_INEST,
    CEPH_LOCK_IFLOCK, CEPH_LOCK_IPOLICY, 0  // 0-terminated
  };
  for (int i = 0; lock_types[i]; ++i) {
    auto lock = get_lock(lock_types[i]);
    if (lock->is_cached())
      mdcache->mds->locker->invalidate_lock_caches(lock);
  }
  // invalidate_lock_caches() may decrease dir->frozen_inode_suppressed
  // and finish freezing the inode
  return state_test(STATE_FROZEN);
}

// Undo FREEZING or FROZEN state and collect WAIT_UNFREEZE waiters into
// 'finished'; the caller is responsible for queueing them.
void CInode::unfreeze_inode(MDSContext::vec& finished)
{
  dout(10) << __func__ << dendl;
  if (state_test(STATE_FREEZING)) {
    state_clear(STATE_FREEZING);
    put(PIN_FREEZING);
    item_freezing_inode.remove_myself();
  } else if (state_test(STATE_FROZEN)) {
    state_clear(STATE_FROZEN);
    put(PIN_FROZEN);
    get_parent_dir()->num_frozen_inodes--;
  } else
    ceph_abort();  // caller must hold a freeze to undo
  take_waiting(WAIT_UNFREEZE, finished);
}

// Convenience wrapper that queues the collected waiters immediately.
void CInode::unfreeze_inode()
{
  MDSContext::vec finished;
  unfreeze_inode(finished);
  mdcache->mds->queue_waiters(finished);
}

void CInode::freeze_auth_pin()
{
  ceph_assert(state_test(CInode::STATE_FROZEN));
  state_set(CInode::STATE_FROZENAUTHPIN);
  get_parent_dir()->num_frozen_inodes++;
}

void CInode::unfreeze_auth_pin()
{
  ceph_assert(state_test(CInode::STATE_FROZENAUTHPIN));
  state_clear(CInode::STATE_FROZENAUTHPIN);
  get_parent_dir()->num_frozen_inodes--;
  // wake unfreeze waiters unless a full inode freeze is still in progress
  if (!state_test(STATE_FREEZING|STATE_FROZEN)) {
    MDSContext::vec finished;
    take_waiting(WAIT_UNFREEZE, finished);
    mdcache->mds->queue_waiters(finished);
  }
}

void CInode::clear_ambiguous_auth(MDSContext::vec& finished)
{
  ceph_assert(state_test(CInode::STATE_AMBIGUOUSAUTH));
  state_clear(CInode::STATE_AMBIGUOUSAUTH);
  take_waiting(CInode::WAIT_SINGLEAUTH, finished);
}

void CInode::clear_ambiguous_auth()
{
  MDSContext::vec finished;
clear_ambiguous_auth(finished);
  mdcache->mds->queue_waiters(finished);
}

// auth_pins

/**
 * Can this inode take another auth pin?  When it cannot, the reason is
 * reported via *err_ret (if non-null).  When the inode itself has no
 * objection, the decision is delegated up to the parent dentry.
 */
bool CInode::can_auth_pin(int *err_ret) const {
  int err;
  if (!is_auth()) {
    err = ERR_NOT_AUTH;
  } else if (is_freezing_inode() || is_frozen_inode() || is_frozen_auth_pin()) {
    err = ERR_EXPORTING_INODE;
  } else {
    if (parent)
      return parent->can_auth_pin(err_ret);
    err = 0;
  }
  if (err && err_ret)
    *err_ret = err;
  return !err;
}

void CInode::auth_pin(void *by)
{
  // first pin takes a cache-pin reference on the inode
  if (auth_pins == 0)
    get(PIN_AUTHPIN);
  auth_pins++;

#ifdef MDS_AUTHPIN_SET
  auth_pin_set.insert(by);
#endif

  dout(10) << "auth_pin by " << by << " on " << *this << " now " << auth_pins << dendl;

  // nested pin accounting propagates up through the parent dentry
  if (parent)
    parent->adjust_nested_auth_pins(1, this);
}

void CInode::auth_unpin(void *by)
{
  auth_pins--;

#ifdef MDS_AUTHPIN_SET
  {
    auto it = auth_pin_set.find(by);
    ceph_assert(it != auth_pin_set.end());
    auth_pin_set.erase(it);
  }
#endif

  if (auth_pins == 0)
    put(PIN_AUTHPIN);

  dout(10) << "auth_unpin by " << by << " on " << *this << " now " << auth_pins << dendl;

  ceph_assert(auth_pins >= 0);

  if (parent)
    parent->adjust_nested_auth_pins(-1, by);

  // dropping a pin may allow a pending freeze to complete
  if (is_freezing_inode())
    maybe_finish_freeze_inode();
}

// authority

mds_authority_t CInode::authority() const
{
  // explicit inode authority wins; otherwise inherit from the parent dir
  if (inode_auth.first >= 0)
    return inode_auth;

  if (parent)
    return parent->dir->authority();

  // new items that are not yet linked in (in the committed plane) belong
  // to their first parent.
  if (!projected_parent.empty())
    return projected_parent.front()->dir->authority();

  return CDIR_AUTH_UNDEF;
}

// SNAP

snapid_t CInode::get_oldest_snap()
{
  snapid_t t = first;
  if (is_any_old_inodes())
    t = get_old_inodes()->begin()->second.first;
  return std::min(t, oldest_snap);
}

/**
 * Copy-on-write the (head or previous) projected inode + xattrs into
 * old_inodes keyed by 'follows', then advance 'first' past it.
 *
 * NOTE(review): template argument lists appear to have been stripped from
 * this file by text extraction (e.g. const_cast, container types below);
 * restore them from the upstream source, do not treat as-is as compilable.
 */
const CInode::mempool_old_inode& CInode::cow_old_inode(snapid_t follows, bool cow_head)
{
  ceph_assert(follows >= first);

  const auto& pi = cow_head ? get_projected_inode() : get_previous_projected_inode();
  const auto& px = cow_head ? get_projected_xattrs() : get_previous_projected_xattrs();

  // build a fresh map (copy of the current one, if any) so the swap-in
  // below is atomic with respect to readers of old_inodes
  auto _old_inodes = allocate_old_inode_map();
  if (old_inodes)
    *_old_inodes = *old_inodes;

  mempool_old_inode &old = (*_old_inodes)[follows];
  old.first = first;
  old.inode = *pi;
  if (px) {
    dout(10) << " " << px->size() << " xattrs cowed, " << *px << dendl;
    old.xattrs = *px;
  }

  if (first < oldest_snap)
    oldest_snap = first;

  // snapshot copies never carry client write ranges
  old.inode.trim_client_ranges(follows);

  if (g_conf()->mds_snap_rstat &&
      !(old.inode.rstat == old.inode.accounted_rstat))
    dirty_old_rstats.insert(follows);

  first = follows+1;

  dout(10) << __func__ << " " << (cow_head ? "head" : "previous_head" )
	   << " to [" << old.first << "," << follows << "] on " << *this << dendl;

  reset_old_inodes(std::move(_old_inodes));
  return old;
}

void CInode::pre_cow_old_inode()
{
  snapid_t follows = mdcache->get_global_snaprealm()->get_newest_seq();
  // only cow when the head interval actually covers 'follows'
  if (first <= follows)
    cow_old_inode(follows, true);
}

// Does this inode (head interval or a cowed old_inode) cover 'snapid'?
bool CInode::has_snap_data(snapid_t snapid)
{
  bool found = snapid >= first && snapid <= last;
  if (!found && is_any_old_inodes()) {
    auto p = old_inodes->lower_bound(snapid);
    if (p != old_inodes->end()) {
      // lower_bound finds first entry with last >= snapid; step back if its
      // interval starts after snapid
      if (p->second.first > snapid) {
	if (p != old_inodes->begin())
	  --p;
      }
      if (p->second.first <= snapid && snapid <= p->first) {
	found = true;
      }
    }
  }
  return found;
}

// Drop cowed old_inodes whose [first,last] interval no longer intersects
// any live snapshot in 'snaps'.
void CInode::purge_stale_snap_data(const set& snaps)
{
  dout(10) << __func__ << " " << snaps << dendl;

  if (!get_old_inodes())
    return;

  std::vector to_remove;
  for (auto p : *get_old_inodes()) {
    const snapid_t &id = p.first;
    // first snapshot at or after this old_inode's interval start
    const auto &s = snaps.lower_bound(p.second.first);
    if (s == snaps.end() || *s > id) {
      dout(10) << " purging old_inode [" << p.second.first << "," << id << "]" << dendl;
      to_remove.push_back(id);
    }
  }

  if (to_remove.size() == get_old_inodes()->size()) {
    // everything stale: drop the whole map
    reset_old_inodes(old_inode_map_ptr());
  } else if (!to_remove.empty()) {
    auto _old_inodes = allocate_old_inode_map(*get_old_inodes());
    for (auto id : to_remove)
      _old_inodes->erase(id);
    reset_old_inodes(std::move(_old_inodes));
  }
}

/*
 * pick/create an
old_inode */

// Return the key ('last' snapid) of the old_inode whose interval covers
// 'snap', or 0 when no cowed copy covers it.
snapid_t CInode::pick_old_inode(snapid_t snap) const
{
  if (is_any_old_inodes()) {
    auto it = old_inodes->lower_bound(snap);  // p is first key >= to snap
    if (it != old_inodes->end() && it->second.first <= snap) {
      dout(10) << __func__ << " snap " << snap << " -> ["
	       << it->second.first << "," << it->first << "]" << dendl;
      return it->first;
    }
  }
  dout(10) << __func__ << " snap " << snap << " -> nothing" << dendl;
  return 0;
}

// Open a SnapRealm rooted at this inode; splits membership off the parent
// realm unless 'nosplit' is set.
void CInode::open_snaprealm(bool nosplit)
{
  if (!snaprealm) {
    SnapRealm *parent = find_snaprealm();
    snaprealm = new SnapRealm(mdcache, this);
    if (parent) {
      dout(10) << __func__ << " " << snaprealm
	       << " parent is " << parent << dendl;
      dout(30) << " siblings are " << parent->open_children << dendl;
      snaprealm->parent = parent;
      if (!nosplit)
	parent->split_at(snaprealm);
      parent->open_children.insert(snaprealm);
    }
  }
}

void CInode::close_snaprealm(bool nojoin)
{
  if (snaprealm) {
    dout(15) << __func__ << " " << *snaprealm << dendl;
    if (snaprealm->parent) {
      snaprealm->parent->open_children.erase(snaprealm);
      //if (!nojoin)
      //snaprealm->parent->join(snaprealm);
    }
    delete snaprealm;
    snaprealm = 0;
  }
}

// Walk up the (oldest) parent dentry chain to the nearest inode with an
// open SnapRealm.
SnapRealm *CInode::find_snaprealm() const
{
  const CInode *cur = this;
  while (!cur->snaprealm) {
    const CDentry *pdn = cur->get_oldest_parent_dn();
    if (!pdn)
      break;
    cur = pdn->get_dir()->get_inode();
  }
  return cur->snaprealm;
}

void CInode::encode_snap_blob(bufferlist &snapbl)
{
  if (snaprealm) {
    using ceph::encode;
    encode(snaprealm->srnode, snapbl);
    dout(20) << __func__ << " " << *snaprealm << dendl;
  }
}

void CInode::decode_snap_blob(const bufferlist& snapbl)
{
  using ceph::decode;
  if (snapbl.length()) {
    open_snaprealm();
    auto old_flags = snaprealm->srnode.flags;
    auto p = snapbl.cbegin();
    decode(snaprealm->srnode, p);
    if (!is_base()) {
      // a toggled PARENT_GLOBAL flag changes which realm is our parent
      if ((snaprealm->srnode.flags ^ old_flags) & sr_t::PARENT_GLOBAL) {
	snaprealm->adjust_parent();
      }
    }
    dout(20) << __func__ << " " << *snaprealm << dendl;
  } else if (snaprealm && !is_root() && !is_mdsdir()) {
    // see https://tracker.ceph.com/issues/42675
    ceph_assert(mdcache->mds->is_any_replay());
    snaprealm->merge_to(NULL);
  }
}

void CInode::encode_snap(bufferlist& bl)
{
  ENCODE_START(1, 1, bl);
  bufferlist snapbl;
  encode_snap_blob(snapbl);
  encode(snapbl, bl);
  encode(oldest_snap, bl);
  ENCODE_FINISH(bl);
}

void CInode::decode_snap(bufferlist::const_iterator& p)
{
  DECODE_START(1, p);
  bufferlist snapbl;
  decode(snapbl, p);
  decode(oldest_snap, p);
  decode_snap_blob(snapbl);
  DECODE_FINISH(p);
}

// =============================================

// Pick the single client (if any) eligible for exclusive "loner" caps:
// exactly one non-stale client cap, and (for files) that client must want
// write/read caps; never while other MDSs want caps or the cache is
// readonly.  Returns -1 when there is no ideal loner.
client_t CInode::calc_ideal_loner()
{
  if (mdcache->is_readonly())
    return -1;
  if (!get_mds_caps_wanted().empty())
    return -1;

  int n = 0;
  client_t loner = -1;
  for (const auto &p : client_caps) {
    if (!p.second.is_stale() &&
	(is_dir() ?
	 !has_subtree_or_exporting_dirfrag() :
	 (p.second.wanted() & (CEPH_CAP_ANY_WR|CEPH_CAP_FILE_RD)))) {
      if (n)
	return -1;  // more than one candidate: no loner
      n++;
      loner = p.first;
    }
  }
  return loner;
}

// Recompute want_loner_cap and move loner_cap toward it if possible.
// Returns true when the loner assignment changed.
bool CInode::choose_ideal_loner()
{
  want_loner_cap = calc_ideal_loner();
  int changed = false;  // NOTE(review): int used as a bool here
  if (loner_cap >= 0 && loner_cap != want_loner_cap) {
    if (!try_drop_loner())
      return false;
    changed = true;
  }

  if (want_loner_cap >= 0) {
    if (loner_cap < 0) {
      set_loner_cap(want_loner_cap);
      changed = true;
    } else
      ceph_assert(loner_cap == want_loner_cap);
  }
  return changed;
}

bool CInode::try_set_loner()
{
  ceph_assert(want_loner_cap >= 0);
  if (loner_cap >= 0 && loner_cap != want_loner_cap)
    return false;
  set_loner_cap(want_loner_cap);
  return true;
}

// Record the loner client on every lock that can grant loner-only caps.
void CInode::set_loner_cap(client_t l)
{
  loner_cap = l;
  authlock.set_excl_client(loner_cap);
  filelock.set_excl_client(loner_cap);
  linklock.set_excl_client(loner_cap);
  xattrlock.set_excl_client(loner_cap);
}

// Drop the loner only if that client holds nothing beyond what any
// client is allowed anyway.
bool CInode::try_drop_loner()
{
  if (loner_cap < 0)
    return true;

  int other_allowed = get_caps_allowed_by_type(CAP_ANY);
  Capability *cap = get_client_cap(loner_cap);
  if (!cap ||
      (cap->issued() & ~other_allowed) == 0) {
    set_loner_cap(-1);
    return true;
  }
  return false;
}

// choose new lock state during recovery, based on issued caps
// Pick a lock state consistent with the caps 'allissued' (recovery path):
// exclusive-ish caps force EXCL, mixed writers force MIX, otherwise a
// dirty lock stays LOCK/MIX and a clean one goes SYNC.
void CInode::choose_lock_state(SimpleLock *lock, int allissued)
{
  int shift = lock->get_cap_shift();
  int issued = (allissued >> shift) & lock->get_cap_mask();
  if (is_auth()) {
    if (lock->is_xlocked()) {
      // do nothing here
    } else if (lock->get_state() != LOCK_MIX) {
      if (issued & (CEPH_CAP_GEXCL | CEPH_CAP_GBUFFER))
	lock->set_state(LOCK_EXCL);
      else if (issued & CEPH_CAP_GWR) {
	// writers that also cache/share need EXCL; otherwise MIX suffices
	if (issued & (CEPH_CAP_GCACHE | CEPH_CAP_GSHARED))
	  lock->set_state(LOCK_EXCL);
	else
	  lock->set_state(LOCK_MIX);
      } else if (lock->is_dirty()) {
	if (is_replicated())
	  lock->set_state(LOCK_MIX);
	else
	  lock->set_state(LOCK_LOCK);
      } else
	lock->set_state(LOCK_SYNC);
    }
  } else {
    // our states have already been chosen during rejoin.
    if (lock->is_xlocked())
      ceph_assert(lock->get_state() == LOCK_LOCK);
  }
}

// Apply choose_lock_state() to every cap-bearing lock, folding in caps
// the clients reported dirty.
void CInode::choose_lock_states(int dirty_caps)
{
  int issued = get_caps_issued() | dirty_caps;
  if (is_auth() && (issued & (CEPH_CAP_ANY_EXCL|CEPH_CAP_ANY_WR)))
    choose_ideal_loner();
  choose_lock_state(&filelock, issued);
  choose_lock_state(&nestlock, issued);
  choose_lock_state(&dirfragtreelock, issued);
  choose_lock_state(&authlock, issued);
  choose_lock_state(&xattrlock, issued);
  choose_lock_state(&linklock, issued);
}

int CInode::count_nonstale_caps()
{
  int n = 0;
  for (const auto &p : client_caps) {
    if (!p.second.is_stale())
      n++;
  }
  return n;
}

// Like count_nonstale_caps() > 1, but bails out early at the second hit.
bool CInode::multiple_nonstale_caps()
{
  int n = 0;
  for (const auto &p : client_caps) {
    if (!p.second.is_stale()) {
      if (n)
	return true;
      n++;
    }
  }
  return false;
}

// Replace the whole mds_caps_wanted map, keeping num_caps_notable in sync
// with the empty/non-empty transition.
// NOTE(review): the compact_map template arguments appear stripped by text
// extraction; restore from upstream.
void CInode::set_mds_caps_wanted(mempool::mds_co::compact_map& m)
{
  bool old_empty = mds_caps_wanted.empty();
  mds_caps_wanted.swap(m);
  if (old_empty != (bool)mds_caps_wanted.empty()) {
    if (old_empty)
      adjust_num_caps_notable(1);
    else
      adjust_num_caps_notable(-1);
  }
}

// Set/clear a single MDS's wanted caps; wanted==0 erases the entry.
void CInode::set_mds_caps_wanted(mds_rank_t mds, int32_t wanted)
{
  bool old_empty = mds_caps_wanted.empty();
  if (wanted) {
    mds_caps_wanted[mds] = wanted;
    if (old_empty)
      adjust_num_caps_notable(1);
  } else if (!old_empty) {
    mds_caps_wanted.erase(mds);
    if (mds_caps_wanted.empty())
      adjust_num_caps_notable(-1);
  }
}

/**
 * Create and return a new capability for 'client' on this head inode.
 * The first cap pins the inode (PIN_CAPS) and joins it to 'conrealm'
 * (or to the nearest open snaprealm when conrealm is null); 'new_inode'
 * uses cap_id 1 instead of allocating from last_cap_id.
 */
Capability *CInode::add_client_cap(client_t client, Session *session,
				   SnapRealm *conrealm, bool new_inode)
{
  ceph_assert(last == CEPH_NOSNAP);
  if (client_caps.empty()) {
    get(PIN_CAPS);
    if (conrealm)
      containing_realm = conrealm;
    else
      containing_realm = find_snaprealm();
    containing_realm->inodes_with_caps.push_back(&item_caps);
    dout(10) << __func__ << " first cap, joining realm " << *containing_realm << dendl;

    mdcache->num_inodes_with_caps++;
    if (parent)
      parent->dir->adjust_num_inodes_with_caps(1);
  }

  uint64_t cap_id = new_inode ? 1 : ++mdcache->last_cap_id;
  auto ret = client_caps.emplace(std::piecewise_construct,
				 std::forward_as_tuple(client),
				 std::forward_as_tuple(this, session, cap_id));
  ceph_assert(ret.second == true);
  Capability *cap = &ret.first->second;

  cap->client_follows = first-1;
  containing_realm->add_cap(client, cap);

  return cap;
}

// Tear down a client's cap; the last cap unpins the inode and leaves
// the containing snaprealm.
void CInode::remove_client_cap(client_t client)
{
  auto it = client_caps.find(client);
  ceph_assert(it != client_caps.end());
  Capability *cap = &it->second;

  cap->item_session_caps.remove_myself();
  cap->item_revoking_caps.remove_myself();
  cap->item_client_revoking_caps.remove_myself();
  containing_realm->remove_cap(client, cap);

  if (client == loner_cap)
    loner_cap = -1;

  if (cap->is_wanted_notable())
    adjust_num_caps_notable(-1);

  client_caps.erase(it);
  if (client_caps.empty()) {
    dout(10) << __func__ << " last cap, leaving realm " << *containing_realm << dendl;
    put(PIN_CAPS);
    item_caps.remove_myself();
    containing_realm = NULL;
    mdcache->num_inodes_with_caps--;
    if (parent)
      parent->dir->adjust_num_inodes_with_caps(-1);
  }

  //clean up advisory locks
  bool fcntl_removed = fcntl_locks ? fcntl_locks->remove_all_from(client) : false;
  bool flock_removed = flock_locks ?
flock_locks->remove_all_from(client) : false;
  if (fcntl_removed || flock_removed) {
    // dropping this client's advisory locks may unblock other waiters
    MDSContext::vec waiters;
    take_waiting(CInode::WAIT_FLOCK, waiters);
    mdcache->mds->queue_waiters(waiters);
  }
}

// Move every client cap on this inode from its current realm to 'realm'.
void CInode::move_to_realm(SnapRealm *realm)
{
  dout(10) << __func__ << " joining realm " << *realm
	   << ", leaving realm " << *containing_realm << dendl;
  for (auto& p : client_caps) {
    containing_realm->remove_cap(p.first, &p.second);
    realm->add_cap(p.first, &p.second);
  }
  item_caps.remove_myself();
  realm->inodes_with_caps.push_back(&item_caps);
  containing_realm = realm;
}

// Re-establish a client cap from client-supplied reconnect state
// (MDS failover/rejoin path).
Capability *CInode::reconnect_cap(client_t client, const cap_reconnect_t& icr, Session *session)
{
  Capability *cap = get_client_cap(client);
  if (cap) {
    // FIXME?
    cap->merge(icr.capinfo.wanted, icr.capinfo.issued);
  } else {
    cap = add_client_cap(client, session);
    cap->set_cap_id(icr.capinfo.cap_id);
    cap->set_wanted(icr.capinfo.wanted);
    cap->issue_norevoke(icr.capinfo.issued);
    cap->reset_seq();
  }
  cap->set_last_issue_stamp(ceph_clock_now());
  return cap;
}

// Drop all client caps and loner state after exporting this inode.
void CInode::clear_client_caps_after_export()
{
  while (!client_caps.empty())
    remove_client_cap(client_caps.begin()->first);
  loner_cap = -1;
  want_loner_cap = -1;
  if (!get_mds_caps_wanted().empty()) {
    mempool::mds_co::compact_map empty;
    set_mds_caps_wanted(empty);
  }
}

// NOTE(review): the 'map' template arguments appear stripped by text
// extraction; restore from upstream.
void CInode::export_client_caps(map& cl)
{
  for (const auto &p : client_caps) {
    cl[p.first] = p.second.make_export();
  }
}

// caps allowed

int CInode::get_caps_liked() const
{
  if (is_dir())
    return CEPH_CAP_PIN | CEPH_CAP_ANY_EXCL | CEPH_CAP_ANY_SHARED;  // but not, say, FILE_RD|WR|WRBUFFER
  else
    return CEPH_CAP_ANY & ~CEPH_CAP_FILE_LAZYIO;
}

int CInode::get_caps_allowed_ever() const
{
  int allowed;
  if (is_dir())
    allowed = CEPH_CAP_PIN | CEPH_CAP_ANY_EXCL | CEPH_CAP_ANY_SHARED;
  else
    allowed = CEPH_CAP_ANY;
  // further restricted by what each lock could ever allow
  return allowed &
    (CEPH_CAP_PIN |
     (filelock.gcaps_allowed_ever() << filelock.get_cap_shift()) |
     (authlock.gcaps_allowed_ever() << authlock.get_cap_shift()) |
     (xattrlock.gcaps_allowed_ever() << xattrlock.get_cap_shift()) |
     (linklock.gcaps_allowed_ever() << linklock.get_cap_shift()));
}

int CInode::get_caps_allowed_by_type(int type) const
{
  return
    CEPH_CAP_PIN |
    (filelock.gcaps_allowed(type) << filelock.get_cap_shift()) |
    (authlock.gcaps_allowed(type) << authlock.get_cap_shift()) |
    (xattrlock.gcaps_allowed(type) << xattrlock.get_cap_shift()) |
    (linklock.gcaps_allowed(type) << linklock.get_cap_shift());
}

int CInode::get_caps_careful() const
{
  return
    (filelock.gcaps_careful() << filelock.get_cap_shift()) |
    (authlock.gcaps_careful() << authlock.get_cap_shift()) |
    (xattrlock.gcaps_careful() << xattrlock.get_cap_shift()) |
    (linklock.gcaps_careful() << linklock.get_cap_shift());
}

int CInode::get_xlocker_mask(client_t client) const
{
  return
    (filelock.gcaps_xlocker_mask(client) << filelock.get_cap_shift()) |
    (authlock.gcaps_xlocker_mask(client) << authlock.get_cap_shift()) |
    (xattrlock.gcaps_xlocker_mask(client) << xattrlock.get_cap_shift()) |
    (linklock.gcaps_xlocker_mask(client) << linklock.get_cap_shift());
}

// Caps this particular client/session may hold: loner/xlocker privileges
// for the loner, minus dir-op caps on directories, and minus FILE_RD/WR
// when the client cannot handle inline data or pool namespaces.
int CInode::get_caps_allowed_for_client(Session *session, Capability *cap,
					const mempool_inode *file_i) const
{
  client_t client = session->get_client();
  int allowed;
  if (client == get_loner()) {
    // as the loner, we get the loner_caps AND any xlocker_caps for things we have xlocked
    allowed =
      get_caps_allowed_by_type(CAP_LONER) |
      (get_caps_allowed_by_type(CAP_XLOCKER) & get_xlocker_mask(client));
  } else {
    allowed = get_caps_allowed_by_type(CAP_ANY);
  }

  if (is_dir()) {
    allowed &= ~CEPH_CAP_ANY_DIR_OPS;
    if (cap && (allowed & CEPH_CAP_FILE_EXCL))
      allowed |= cap->get_lock_cache_allowed();
  } else {
    if (file_i->inline_data.version == CEPH_INLINE_NONE &&
	file_i->layout.pool_ns.empty()) {
      // noop
    } else if (cap) {
      if ((file_i->inline_data.version != CEPH_INLINE_NONE &&
	   cap->is_noinline()) ||
	  (!file_i->layout.pool_ns.empty() &&
	   cap->is_nopoolns()))
	allowed &= ~(CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR);
    } else {
      // no cap yet: fall back to the connection's feature bits
      auto& conn = session->get_connection();
      if
((file_i->inline_data.version != CEPH_INLINE_NONE &&
	   !conn->has_feature(CEPH_FEATURE_MDS_INLINE_DATA)) ||
	  (!file_i->layout.pool_ns.empty() &&
	   !conn->has_feature(CEPH_FEATURE_FS_FILE_LAYOUT_V2)))
	allowed &= ~(CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR);
    }
  }
  return allowed;
}

// caps issued, wanted

/**
 * OR together the caps issued to all clients, optionally split into
 * loner/other/xlocker buckets, each shifted/masked down to one lock's
 * cap field.  Side effect: clears loner_cap on non-auth replicas.
 */
int CInode::get_caps_issued(int *ploner, int *pother, int *pxlocker,
			    int shift, int mask)
{
  int c = 0;
  int loner = 0, other = 0, xlocker = 0;
  if (!is_auth()) {
    loner_cap = -1;
  }

  for (const auto &p : client_caps) {
    int i = p.second.issued();
    c |= i;
    if (p.first == loner_cap)
      loner |= i;
    else
      other |= i;
    xlocker |= get_xlocker_mask(p.first) & i;
  }
  if (ploner) *ploner = (loner >> shift) & mask;
  if (pother) *pother = (other >> shift) & mask;
  if (pxlocker) *pxlocker = (xlocker >> shift) & mask;
  return (c >> shift) & mask;
}

bool CInode::is_any_caps_wanted() const
{
  for (const auto &p : client_caps) {
    if (p.second.wanted())
      return true;
  }
  return false;
}

// OR together the caps wanted by non-stale clients (plus other MDSs when
// we are auth), optionally split into loner/other buckets.
int CInode::get_caps_wanted(int *ploner, int *pother, int shift, int mask) const
{
  int w = 0;
  int loner = 0, other = 0;
  for (const auto &p : client_caps) {
    if (!p.second.is_stale()) {
      int t = p.second.wanted();
      w |= t;
      if (p.first == loner_cap)
	loner |= t;
      else
	other |= t;
    }
    //cout << " get_caps_wanted client " << it->first << " " << cap_string(it->second.wanted()) << endl;
  }
  if (is_auth())
    for (const auto &p : mds_caps_wanted) {
      w |= p.second;
      other |= p.second;
      //cout << " get_caps_wanted mds " << it->first << " " << cap_string(it->second) << endl;
    }
  if (ploner) *ploner = (loner >> shift) & mask;
  if (pother) *pother = (other >> shift) & mask;
  return (w >> shift) & mask;
}

// Does any client hold caps beyond what 'lock' currently allows, i.e. do
// we have to revoke (gather) before changing lock state?
bool CInode::issued_caps_need_gather(SimpleLock *lock)
{
  int loner_issued, other_issued, xlocker_issued;
  get_caps_issued(&loner_issued, &other_issued, &xlocker_issued,
		  lock->get_cap_shift(), lock->get_cap_mask());
  if ((loner_issued & ~lock->gcaps_allowed(CAP_LONER)) ||
      (other_issued & ~lock->gcaps_allowed(CAP_ANY)) ||
      (xlocker_issued & ~lock->gcaps_allowed(CAP_XLOCKER)))
    return true;
  return false;
}

// Track inodes with "notable" caps in the open file table; d is the delta
// (+1/-1).  While the inode is clientwriteable the table membership is
// managed by mark/clear_clientwriteable instead.
void CInode::adjust_num_caps_notable(int d)
{
  if (!is_clientwriteable()) {
    if (!num_caps_notable && d > 0)
      mdcache->open_file_table.add_inode(this);
    else if (num_caps_notable > 0 && num_caps_notable == -d)
      mdcache->open_file_table.remove_inode(this);
  }

  num_caps_notable += d;
  ceph_assert(num_caps_notable >= 0);
}

void CInode::mark_clientwriteable()
{
  // only head inodes can be client-writeable
  if (last != CEPH_NOSNAP)
    return;
  if (!state_test(STATE_CLIENTWRITEABLE)) {
    if (num_caps_notable == 0)
      mdcache->open_file_table.add_inode(this);
    state_set(STATE_CLIENTWRITEABLE);
  }
}

void CInode::clear_clientwriteable()
{
  if (state_test(STATE_CLIENTWRITEABLE)) {
    if (num_caps_notable == 0)
      mdcache->open_file_table.remove_inode(this);
    state_clear(STATE_CLIENTWRITEABLE);
  }
}

// =============================================

int CInode::encode_inodestat(bufferlist& bl, Session *session,
			     SnapRealm *dir_realm,
			     snapid_t snapid, unsigned max_bytes, int getattr_caps)
{
  client_t client = session->get_client();
  ceph_assert(snapid);

  bool valid = true;

  // pick a version!
const mempool_inode *oi = get_inode().get(); const mempool_inode *pi = get_projected_inode().get(); const mempool_xattr_map *pxattrs = nullptr; if (snapid != CEPH_NOSNAP) { // for now at least, old_inodes is only defined/valid on the auth if (!is_auth()) valid = false; if (is_any_old_inodes()) { auto it = old_inodes->lower_bound(snapid); if (it != old_inodes->end()) { if (it->second.first > snapid) { if (it != old_inodes->begin()) --it; } if (it->second.first <= snapid && snapid <= it->first) { dout(15) << __func__ << " snapid " << snapid << " to old_inode [" << it->second.first << "," << it->first << "]" << " " << it->second.inode.rstat << dendl; pi = oi = &it->second.inode; pxattrs = &it->second.xattrs; } else { // snapshoted remote dentry can result this dout(0) << __func__ << " old_inode for snapid " << snapid << " not found" << dendl; } } } else if (snapid < first || snapid > last) { // snapshoted remote dentry can result this dout(0) << __func__ << " [" << first << "," << last << "]" << " not match snapid " << snapid << dendl; } } utime_t snap_btime; std::map snap_metadata; SnapRealm *realm = find_snaprealm(); if (snapid != CEPH_NOSNAP && realm) { // add snapshot timestamp vxattr map infomap; realm->get_snap_info(infomap, snapid, // min snapid); // max if (!infomap.empty()) { ceph_assert(infomap.size() == 1); const SnapInfo *si = infomap.begin()->second; snap_btime = si->stamp; snap_metadata = si->metadata; } } bool no_caps = !valid || session->is_stale() || (dir_realm && realm != dir_realm) || is_frozen() || state_test(CInode::STATE_EXPORTINGCAPS); if (no_caps) dout(20) << __func__ << " no caps" << (!valid?", !valid":"") << (session->is_stale()?", session stale ":"") << ((dir_realm && realm != dir_realm)?", snaprealm differs ":"") << (is_frozen()?", frozen inode":"") << (state_test(CInode::STATE_EXPORTINGCAPS)?", exporting caps":"") << dendl; // "fake" a version that is old (stable) version, +1 if projected. 
version_t version = (oi->version * 2) + is_projected(); Capability *cap = get_client_cap(client); bool pfile = filelock.is_xlocked_by_client(client) || get_loner() == client; //(cap && (cap->issued() & CEPH_CAP_FILE_EXCL)); bool pauth = authlock.is_xlocked_by_client(client) || get_loner() == client; bool plink = linklock.is_xlocked_by_client(client) || get_loner() == client; bool pxattr = xattrlock.is_xlocked_by_client(client) || get_loner() == client; bool plocal = versionlock.get_last_wrlock_client() == client; bool ppolicy = policylock.is_xlocked_by_client(client) || get_loner()==client; const mempool_inode *any_i = (pfile|pauth|plink|pxattr|plocal) ? pi : oi; dout(20) << " pfile " << pfile << " pauth " << pauth << " plink " << plink << " pxattr " << pxattr << " plocal " << plocal << " mtime " << any_i->mtime << " ctime " << any_i->ctime << " change_attr " << any_i->change_attr << " valid=" << valid << dendl; // file const mempool_inode *file_i = pfile ? pi:oi; file_layout_t layout; if (is_dir()) { layout = (ppolicy ? pi : oi)->layout; } else { layout = file_i->layout; } // max_size is min of projected, actual uint64_t max_size = std::min(oi->get_client_range(client), pi->get_client_range(client)); // inline data version_t inline_version = 0; bufferlist inline_data; if (file_i->inline_data.version == CEPH_INLINE_NONE) { inline_version = CEPH_INLINE_NONE; } else if ((!cap && !no_caps) || (cap && cap->client_inline_version < file_i->inline_data.version) || (getattr_caps & CEPH_CAP_FILE_RD)) { // client requests inline data inline_version = file_i->inline_data.version; if (file_i->inline_data.length() > 0) file_i->inline_data.get_data(inline_data); } // nest (do same as file... :/) if (cap) { cap->last_rbytes = file_i->rstat.rbytes; cap->last_rsize = file_i->rstat.rsize(); } // auth const mempool_inode *auth_i = pauth ? pi:oi; // link const mempool_inode *link_i = plink ? pi:oi; // xattr const mempool_inode *xattr_i = pxattr ? 
pi:oi; using ceph::encode; // xattr version_t xattr_version; if ((!cap && !no_caps) || (cap && cap->client_xattr_version < xattr_i->xattr_version) || (getattr_caps & CEPH_CAP_XATTR_SHARED)) { // client requests xattrs if (!pxattrs) pxattrs = pxattr ? get_projected_xattrs().get() : get_xattrs().get(); xattr_version = xattr_i->xattr_version; } else { xattr_version = 0; } // do we have room? if (max_bytes) { unsigned bytes = 8 + 8 + 4 + 8 + 8 + sizeof(ceph_mds_reply_cap) + sizeof(struct ceph_file_layout) + sizeof(struct ceph_timespec) * 3 + 4 + // ctime ~ time_warp_seq 8 + 8 + 8 + 4 + 4 + 4 + 4 + 4 + // size ~ nlink 8 + 8 + 8 + 8 + 8 + sizeof(struct ceph_timespec) + // dirstat.nfiles ~ rstat.rctime sizeof(__u32) + sizeof(__u32) * 2 * dirfragtree._splits.size() + // dirfragtree sizeof(__u32) + symlink.length() + // symlink sizeof(struct ceph_dir_layout); // dir_layout if (xattr_version) { bytes += sizeof(__u32) + sizeof(__u32); // xattr buffer len + number entries if (pxattrs) { for (const auto &p : *pxattrs) bytes += sizeof(__u32) * 2 + p.first.length() + p.second.length(); } } else { bytes += sizeof(__u32); // xattr buffer len } bytes += sizeof(version_t) + sizeof(__u32) + inline_data.length() + // inline data 1 + 1 + 8 + 8 + 4 + // quota 4 + layout.pool_ns.size() + // pool ns sizeof(struct ceph_timespec) + 8; // btime + change_attr if (bytes > max_bytes) return -CEPHFS_ENOSPC; } // encode caps struct ceph_mds_reply_cap ecap; if (snapid != CEPH_NOSNAP) { /* * snapped inodes (files or dirs) only get read-only caps. always * issue everything possible, since it is read only. * * if a snapped inode has caps, limit issued caps based on the * lock state. * * if it is a live inode, limit issued caps based on the lock * state. * * do NOT adjust cap issued state, because the client always * tracks caps per-snap and the mds does either per-interval or * multiversion. */ ecap.caps = valid ? 
get_caps_allowed_by_type(CAP_ANY) : CEPH_STAT_CAP_INODE; if (last == CEPH_NOSNAP || is_any_caps()) ecap.caps = ecap.caps & get_caps_allowed_for_client(session, nullptr, file_i); ecap.seq = 0; ecap.mseq = 0; ecap.realm = 0; } else { if (!no_caps && !cap) { // add a new cap cap = add_client_cap(client, session, realm); if (is_auth()) choose_ideal_loner(); } int issue = 0; if (!no_caps && cap) { int likes = get_caps_liked(); int allowed = get_caps_allowed_for_client(session, cap, file_i); issue = (cap->wanted() | likes) & allowed; cap->issue_norevoke(issue, true); issue = cap->pending(); dout(10) << "encode_inodestat issuing " << ccap_string(issue) << " seq " << cap->get_last_seq() << dendl; } else if (cap && cap->is_new() && !dir_realm) { // alway issue new caps to client, otherwise the caps get lost ceph_assert(cap->is_stale()); ceph_assert(!cap->pending()); issue = CEPH_CAP_PIN; cap->issue_norevoke(issue, true); dout(10) << "encode_inodestat issuing " << ccap_string(issue) << " seq " << cap->get_last_seq() << "(stale&new caps)" << dendl; } if (issue) { cap->set_last_issue(); cap->set_last_issue_stamp(ceph_clock_now()); ecap.caps = issue; ecap.wanted = cap->wanted(); ecap.cap_id = cap->get_cap_id(); ecap.seq = cap->get_last_seq(); ecap.mseq = cap->get_mseq(); ecap.realm = realm->inode->ino(); } else { ecap.cap_id = 0; ecap.caps = 0; ecap.seq = 0; ecap.mseq = 0; ecap.realm = 0; ecap.wanted = 0; } } ecap.flags = is_auth() ? CEPH_CAP_FLAG_AUTH : 0; dout(10) << "encode_inodestat caps " << ccap_string(ecap.caps) << " seq " << ecap.seq << " mseq " << ecap.mseq << " xattrv " << xattr_version << dendl; if (inline_data.length() && cap) { if ((cap->pending() | getattr_caps) & CEPH_CAP_FILE_SHARED) { dout(10) << "including inline version " << inline_version << dendl; cap->client_inline_version = inline_version; } else { dout(10) << "dropping inline version " << inline_version << dendl; inline_version = 0; inline_data.clear(); } } // include those xattrs? 
if (xattr_version && cap) { if ((cap->pending() | getattr_caps) & CEPH_CAP_XATTR_SHARED) { dout(10) << "including xattrs version " << xattr_version << dendl; cap->client_xattr_version = xattr_version; } else { dout(10) << "dropping xattrs version " << xattr_version << dendl; xattr_version = 0; } } // The end result of encode_xattrs() is equivalent to: // { // bufferlist xbl; // if (xattr_version) { // if (pxattrs) // encode(*pxattrs, bl); // else // encode((__u32)0, bl); // } // encode(xbl, bl); // } // // But encoding xattrs into the 'xbl' requires a memory allocation. // The 'bl' should have enough pre-allocated memory in most cases. // Encoding xattrs directly into it can avoid the extra allocation. auto encode_xattrs = [xattr_version, pxattrs, &bl]() { using ceph::encode; if (xattr_version) { ceph_le32 xbl_len; auto filler = bl.append_hole(sizeof(xbl_len)); const auto starting_bl_len = bl.length(); if (pxattrs) encode(*pxattrs, bl); else encode((__u32)0, bl); xbl_len = bl.length() - starting_bl_len; filler.copy_in(sizeof(xbl_len), (char *)&xbl_len); } else { encode((__u32)0, bl); } }; /* * note: encoding matches MClientReply::InodeStat */ if (session->info.has_feature(CEPHFS_FEATURE_REPLY_ENCODING)) { ENCODE_START(6, 1, bl); encode(oi->ino, bl); encode(snapid, bl); encode(oi->rdev, bl); encode(version, bl); encode(xattr_version, bl); encode(ecap, bl); { ceph_file_layout legacy_layout; layout.to_legacy(&legacy_layout); encode(legacy_layout, bl); } encode(any_i->ctime, bl); encode(file_i->mtime, bl); encode(file_i->atime, bl); encode(file_i->time_warp_seq, bl); encode(file_i->size, bl); encode(max_size, bl); encode(file_i->truncate_size, bl); encode(file_i->truncate_seq, bl); encode(auth_i->mode, bl); encode((uint32_t)auth_i->uid, bl); encode((uint32_t)auth_i->gid, bl); encode(link_i->nlink, bl); encode(file_i->dirstat.nfiles, bl); encode(file_i->dirstat.nsubdirs, bl); encode(file_i->rstat.rbytes, bl); encode(file_i->rstat.rfiles, bl); 
encode(file_i->rstat.rsubdirs, bl); encode(file_i->rstat.rctime, bl); dirfragtree.encode(bl); encode(symlink, bl); encode(file_i->dir_layout, bl); encode_xattrs(); encode(inline_version, bl); encode(inline_data, bl); const mempool_inode *policy_i = ppolicy ? pi : oi; encode(policy_i->quota, bl); encode(layout.pool_ns, bl); encode(any_i->btime, bl); encode(any_i->change_attr, bl); encode(file_i->export_pin, bl); encode(snap_btime, bl); encode(file_i->rstat.rsnaps, bl); encode(snap_metadata, bl); encode(file_i->fscrypt, bl); ENCODE_FINISH(bl); } else { ceph_assert(session->get_connection()); encode(oi->ino, bl); encode(snapid, bl); encode(oi->rdev, bl); encode(version, bl); encode(xattr_version, bl); encode(ecap, bl); { ceph_file_layout legacy_layout; layout.to_legacy(&legacy_layout); encode(legacy_layout, bl); } encode(any_i->ctime, bl); encode(file_i->mtime, bl); encode(file_i->atime, bl); encode(file_i->time_warp_seq, bl); encode(file_i->size, bl); encode(max_size, bl); encode(file_i->truncate_size, bl); encode(file_i->truncate_seq, bl); encode(auth_i->mode, bl); encode((uint32_t)auth_i->uid, bl); encode((uint32_t)auth_i->gid, bl); encode(link_i->nlink, bl); encode(file_i->dirstat.nfiles, bl); encode(file_i->dirstat.nsubdirs, bl); encode(file_i->rstat.rbytes, bl); encode(file_i->rstat.rfiles, bl); encode(file_i->rstat.rsubdirs, bl); encode(file_i->rstat.rctime, bl); dirfragtree.encode(bl); encode(symlink, bl); auto& conn = session->get_connection(); if (conn->has_feature(CEPH_FEATURE_DIRLAYOUTHASH)) { encode(file_i->dir_layout, bl); } encode_xattrs(); if (conn->has_feature(CEPH_FEATURE_MDS_INLINE_DATA)) { encode(inline_version, bl); encode(inline_data, bl); } if (conn->has_feature(CEPH_FEATURE_MDS_QUOTA)) { const mempool_inode *policy_i = ppolicy ? 
pi : oi;  // continuation: projected inode if policy is projected, else stable
      encode(policy_i->quota, bl);
    }
    if (conn->has_feature(CEPH_FEATURE_FS_FILE_LAYOUT_V2)) {
      encode(layout.pool_ns, bl);
    }
    if (conn->has_feature(CEPH_FEATURE_FS_BTIME)) {
      encode(any_i->btime, bl);
      encode(any_i->change_attr, bl);
    }
  }

  return valid;
}

// Fill a cap message with the inode attributes visible to this client.
// For each lock-guarded attribute group (file/auth/link/xattr) we send
// the *projected* values when the client itself holds the xlock (or,
// for file state, exclusive caps) -- it already knows about the pending
// change -- otherwise the stable values.
void CInode::encode_cap_message(const ref_t &m, Capability *cap)
{
  ceph_assert(cap);

  client_t client = cap->get_client();

  // should this client see projected (not yet journaled) values?
  bool pfile = filelock.is_xlocked_by_client(client) ||
               (cap->issued() & CEPH_CAP_FILE_EXCL);
  bool pauth = authlock.is_xlocked_by_client(client);
  bool plink = linklock.is_xlocked_by_client(client);
  bool pxattr = xattrlock.is_xlocked_by_client(client);

  const mempool_inode *oi = get_inode().get();
  const mempool_inode *pi = get_projected_inode().get();
  const mempool_inode *i = (pfile|pauth|plink|pxattr) ? pi : oi;

  dout(20) << __func__ << " pfile " << pfile
           << " pauth " << pauth << " plink " << plink << " pxattr " << pxattr
           << " mtime " << i->mtime << " ctime " << i->ctime
           << " change_attr " << i->change_attr << dendl;

  // filelock-guarded attributes
  i = pfile ? pi:oi;
  m->set_layout(i->layout);
  m->size = i->size;
  m->truncate_seq = i->truncate_seq;
  m->truncate_size = i->truncate_size;
  m->mtime = i->mtime;
  m->atime = i->atime;
  m->ctime = i->ctime;
  m->btime = i->btime;
  m->change_attr = i->change_attr;
  m->time_warp_seq = i->time_warp_seq;
  m->nfiles = i->dirstat.nfiles;
  m->nsubdirs = i->dirstat.nsubdirs;

  // only ship inline data if the client's cached copy is out of date
  if (cap->client_inline_version < i->inline_data.version) {
    m->inline_version = cap->client_inline_version = i->inline_data.version;
    if (i->inline_data.length() > 0)
      i->inline_data.get_data(m->inline_data);
  } else {
    m->inline_version = 0;
  }

  // max_size is min of projected, actual.
  uint64_t oldms = oi->get_client_range(client);
  uint64_t newms = pi->get_client_range(client);
  m->max_size = std::min(oldms, newms);

  // authlock-guarded attributes
  i = pauth ? pi:oi;
  m->head.mode = i->mode;
  m->head.uid = i->uid;
  m->head.gid = i->gid;

  // linklock-guarded attributes
  i = plink ? pi:oi;
  m->head.nlink = i->nlink;

  using ceph::encode;
  // xattrlock-guarded attributes: resend the xattr map only when the
  // client holds XATTR_SHARED and its cached version is stale.
  i = pxattr ? pi:oi;
  const auto& ix = pxattr ?
    get_projected_xattrs() : get_xattrs();
  if ((cap->pending() & CEPH_CAP_XATTR_SHARED) &&
      i->xattr_version > cap->client_xattr_version) {
    dout(10) << " including xattrs v " << i->xattr_version << dendl;
    if (ix)
      encode(*ix, m->xattrbl);
    else
      encode((__u32)0, m->xattrbl);
    m->head.xattr_version = i->xattr_version;
    cap->client_xattr_version = i->xattr_version;
  }
}

// Encode the "base" inode state shared by replication and export:
// inode fields, symlink target, fragtree, xattrs, old (snapshotted)
// inodes, damage flags and snap data.
void CInode::_encode_base(bufferlist& bl, uint64_t features)
{
  ENCODE_START(1, 1, bl);
  encode(first, bl);
  encode(*get_inode(), bl, features);
  encode(symlink, bl);
  encode(dirfragtree, bl);
  encode_xattrs(bl);
  encode_old_inodes(bl, features);
  encode(damage_flags, bl);
  encode_snap(bl);
  ENCODE_FINISH(bl);
}

// Counterpart of _encode_base(); field order must match it exactly.
void CInode::_decode_base(bufferlist::const_iterator& p)
{
  DECODE_START(1, p);
  decode(first, p);
  {
    // decode into a fresh inode object, then swap it in atomically
    auto _inode = allocate_inode();
    decode(*_inode, p);
    reset_inode(std::move(_inode));
  }
  {
    std::string tmp;
    decode(tmp, p);
    symlink = std::string_view(tmp);
  }
  decode(dirfragtree, p);
  decode_xattrs(p);
  decode_old_inodes(p);
  decode(damage_flags, p);
  decode_snap(p);
  DECODE_FINISH(p);
}

// Encode the complete state of every inode lock (used on export).
void CInode::_encode_locks_full(bufferlist& bl)
{
  using ceph::encode;
  encode(authlock, bl);
  encode(linklock, bl);
  encode(dirfragtreelock, bl);
  encode(filelock, bl);
  encode(xattrlock, bl);
  encode(snaplock, bl);
  encode(nestlock, bl);
  encode(flocklock, bl);
  encode(policylock, bl);
  encode(loner_cap, bl);
}

// Counterpart of _encode_locks_full(); also re-establishes the loner
// client on the importing side.
void CInode::_decode_locks_full(bufferlist::const_iterator& p)
{
  using ceph::decode;
  decode(authlock, p);
  decode(linklock, p);
  decode(dirfragtreelock, p);
  decode(filelock, p);
  decode(xattrlock, p);
  decode(snaplock, p);
  decode(nestlock, p);
  decode(flocklock, p);
  decode(policylock, p);
  decode(loner_cap, p);
  set_loner_cap(loner_cap);
  want_loner_cap = loner_cap;  // for now, we'll eval() shortly.
}

// Encode per-lock state for sending to a replica MDS.  need_recover
// tells the replica whether the auth MDS may still replay changes.
void CInode::_encode_locks_state_for_replica(bufferlist& bl, bool need_recover)
{
  ENCODE_START(1, 1, bl);
  authlock.encode_state_for_replica(bl);
  linklock.encode_state_for_replica(bl);
  dirfragtreelock.encode_state_for_replica(bl);
  filelock.encode_state_for_replica(bl);
  nestlock.encode_state_for_replica(bl);
  xattrlock.encode_state_for_replica(bl);
  snaplock.encode_state_for_replica(bl);
  flocklock.encode_state_for_replica(bl);
  policylock.encode_state_for_replica(bl);
  encode(need_recover, bl);
  ENCODE_FINISH(bl);
}

// Encode lock state for cache rejoin; the scatter locks
// (dirfragtree/file/nest) carry extra rejoin info for replica `rep`.
// NOTE(review): no versioned ENCODE_START here, unlike the replica
// variant above -- presumably matched by _decode_locks_rejoin() below.
void CInode::_encode_locks_state_for_rejoin(bufferlist& bl, int rep)
{
  authlock.encode_state_for_replica(bl);
  linklock.encode_state_for_replica(bl);
  dirfragtreelock.encode_state_for_rejoin(bl, rep);
  filelock.encode_state_for_rejoin(bl, rep);
  nestlock.encode_state_for_rejoin(bl, rep);
  xattrlock.encode_state_for_replica(bl);
  snaplock.encode_state_for_replica(bl);
  flocklock.encode_state_for_replica(bl);
  policylock.encode_state_for_replica(bl);
}

// Counterpart of _encode_locks_state_for_replica().
void CInode::_decode_locks_state_for_replica(bufferlist::const_iterator& p, bool is_new)
{
  DECODE_START(1, p);
  authlock.decode_state(p, is_new);
  linklock.decode_state(p, is_new);
  dirfragtreelock.decode_state(p, is_new);
  filelock.decode_state(p, is_new);
  nestlock.decode_state(p, is_new);
  xattrlock.decode_state(p, is_new);
  snaplock.decode_state(p, is_new);
  flocklock.decode_state(p, is_new);
  policylock.decode_state(p, is_new);

  bool need_recover;
  decode(need_recover, p);
  if (need_recover && is_new) {
    // Auth mds replicated this inode while it's recovering. Auth mds may take xlock on the lock
    // and change the object when replaying unsafe requests.
    authlock.mark_need_recover();
    linklock.mark_need_recover();
    dirfragtreelock.mark_need_recover();
    filelock.mark_need_recover();
    nestlock.mark_need_recover();
    xattrlock.mark_need_recover();
    snaplock.mark_need_recover();
    flocklock.mark_need_recover();
    policylock.mark_need_recover();
  }
  DECODE_FINISH(p);
}

// Decode lock state during cache rejoin.  Scatter locks that end up
// unstable (and not write-locked) are queued for re-evaluation.
void CInode::_decode_locks_rejoin(bufferlist::const_iterator& p, MDSContext::vec& waiters,
                                  list& eval_locks, bool survivor)
{
  authlock.decode_state_rejoin(p, waiters, survivor);
  linklock.decode_state_rejoin(p, waiters, survivor);
  dirfragtreelock.decode_state_rejoin(p, waiters, survivor);
  filelock.decode_state_rejoin(p, waiters, survivor);
  nestlock.decode_state_rejoin(p, waiters, survivor);
  xattrlock.decode_state_rejoin(p, waiters, survivor);
  snaplock.decode_state_rejoin(p, waiters, survivor);
  flocklock.decode_state_rejoin(p, waiters, survivor);
  policylock.decode_state_rejoin(p, waiters, survivor);

  if (!dirfragtreelock.is_stable() && !dirfragtreelock.is_wrlocked())
    eval_locks.push_back(&dirfragtreelock);
  if (!filelock.is_stable() && !filelock.is_wrlocked())
    eval_locks.push_back(&filelock);
  if (!nestlock.is_stable() && !nestlock.is_wrlocked())
    eval_locks.push_back(&nestlock);
}


// IMPORT/EXPORT

// Serialize this inode for migration to another MDS: base state, state
// flags, popularity, replica map, fragstat/rstat of bounding dirfrags,
// full lock state and file (advisory) locks.
void CInode::encode_export(bufferlist& bl)
{
  ENCODE_START(5, 4, bl);
  _encode_base(bl, mdcache->mds->mdsmap->get_up_features());

  encode(state, bl);

  encode(pop, bl);

  encode(get_replicas(), bl);

  // include scatterlock info for any bounding CDirs
  bufferlist bounding;
  if (get_inode()->is_dir())
    for (const auto &p : dirfrags) {
      CDir *dir = p.second;
      if (dir->state_test(CDir::STATE_EXPORTBOUND)) {
        encode(p.first, bounding);
        encode(dir->get_fnode()->fragstat, bounding);
        encode(dir->get_fnode()->accounted_fragstat, bounding);
        encode(dir->get_fnode()->rstat, bounding);
        encode(dir->get_fnode()->accounted_rstat, bounding);
        dout(10) << " encoded fragstat/rstat info for " << *dir << dendl;
      }
    }
  encode(bounding, bl);

  _encode_locks_full(bl);

  _encode_file_locks(bl);

  ENCODE_FINISH(bl);
  get(PIN_TEMPEXPORTING);
}

// Drop state that must not survive once an export has committed.
void CInode::finish_export()
{
  state &= MASK_STATE_EXPORT_KEPT;

  pop.zero();

  // just in case!
  //dirlock.clear_updated();

  loner_cap = -1;

  put(PIN_TEMPEXPORTING);
}

// Counterpart of encode_export(): rebuild this inode from a migration
// bundle sent by the previous auth MDS, taking over authority.
void CInode::decode_import(bufferlist::const_iterator& p,
                           LogSegment *ls)
{
  DECODE_START(5, p);

  _decode_base(p);

  {
    unsigned s;
    decode(s, p);
    // only keep the flags that are meaningful to carry across MDSs
    s &= MASK_STATE_EXPORTED;

    set_ephemeral_pin((s & STATE_DISTEPHEMERALPIN), (s & STATE_RANDEPHEMERALPIN));
    state_set(STATE_AUTH | s);
  }

  if (is_dirty()) {
    get(PIN_DIRTY);
    _mark_dirty(ls);
  }
  if (is_dirty_parent()) {
    get(PIN_DIRTYPARENT);
    mark_dirty_parent(ls);
  }

  decode(pop, p);

  decode(get_replicas(), p);
  if (is_replicated())
    get(PIN_REPLICATED);
  replica_nonce = 0;

  // decode fragstat info on bounding cdirs
  bufferlist bounding;
  decode(bounding, p);
  auto q = bounding.cbegin();
  while (!q.end()) {
    frag_t fg;
    decode(fg, q);
    CDir *dir = get_dirfrag(fg);
    ceph_assert(dir);  // we should have all bounds open

    // Only take the remote's fragstat/rstat if we are non-auth for
    // this dirfrag AND the lock is NOT in a scattered (MIX) state.
    // We know lock is stable, and MIX is the only state in which
    // the inode auth (who sent us this data) may not have the best
    // info.

    // HMM: Are there cases where dir->is_auth() is an insufficient
    // check because the dirfrag is under migration? That implies
    // it is frozen (and in a SYNC or LOCK state). FIXME.

    auto _fnode = CDir::allocate_fnode(*dir->get_fnode());
    if (dir->is_auth() ||
        filelock.get_state() == LOCK_MIX) {
      dout(10) << " skipped fragstat info for " << *dir << dendl;
      // still consume the two encoded frag_info_t entries
      frag_info_t f;
      decode(f, q);
      decode(f, q);
    } else {
      decode(_fnode->fragstat, q);
      decode(_fnode->accounted_fragstat, q);
      dout(10) << " took fragstat info for " << *dir << dendl;
    }
    if (dir->is_auth() ||
        nestlock.get_state() == LOCK_MIX) {
      dout(10) << " skipped rstat info for " << *dir << dendl;
      // still consume the two encoded nest_info_t entries
      nest_info_t n;
      decode(n, q);
      decode(n, q);
    } else {
      decode(_fnode->rstat, q);
      decode(_fnode->accounted_rstat, q);
      dout(10) << " took rstat info for " << *dir << dendl;
    }
    dir->reset_fnode(std::move(_fnode));
  }

  _decode_locks_full(p);

  _decode_file_locks(p);

  DECODE_FINISH(p);
}

// Dump the persistent inode-store fields to a Formatter (admin socket
// and debugging output).
void InodeStoreBase::dump(Formatter *f) const
{
  inode->dump(f);
  f->dump_string("symlink", symlink);

  f->open_array_section("xattrs");
  if (xattrs) {
    for (const auto& [key, val] : *xattrs) {
      f->open_object_section("xattr");
      f->dump_string("key", key);
      std::string v(val.c_str(), val.length());
      f->dump_string("val", v);
      f->close_section();
    }
  }
  f->close_section();
  f->open_object_section("dirfragtree");
  dirfragtree.dump(f);
  f->close_section(); // dirfragtree

  f->open_array_section("old_inodes");
  if (old_inodes) {
    for (const auto &p : *old_inodes) {
      f->open_object_section("old_inode");
      // The key is the last snapid, the first is in the mempool_old_inode
      f->dump_int("last", p.first);
      p.second.dump(f);
      f->close_section(); // old_inode
    }
  }
  f->close_section(); // old_inodes

  f->dump_unsigned("oldest_snap", oldest_snap);
  f->dump_unsigned("damage_flags", damage_flags);
}

// JSON-decode specialization: mempool strings need an explicit
// conversion from the parser's data.
template <>
void decode_json_obj(mempool::mds_co::string& t, JSONObj *obj){
  t = mempool::mds_co::string(std::string_view(obj->get_data()));
}

// Rebuild inode-store fields from JSON.  Several fields (dirfragtree,
// old_inodes, snap_blob) cannot be decoded from JSON yet and are
// intentionally skipped below.
void InodeStoreBase::decode_json(JSONObj *obj)
{
  {
    auto _inode = allocate_inode();
    _inode->decode_json(obj);
    reset_inode(std::move(_inode));
  }
  JSONDecoder::decode_json("symlink", symlink, obj, true);
  // can't decode the fragtree from JSON yet:
  // JSONDecoder::decode_json("dirfragtree",
dirfragtree, obj, true); // cann't decode it now // // { mempool_xattr_map tmp; JSONDecoder::decode_json("xattrs", tmp, xattrs_cb, obj, true); if (tmp.empty()) reset_xattrs(xattr_map_ptr()); else reset_xattrs(allocate_xattr_map(std::move(tmp))); } // JSONDecoder::decode_json("old_inodes", old_inodes, InodeStoreBase::old_indoes_cb, obj, true); // cann't decode old_inodes now JSONDecoder::decode_json("oldest_snap", oldest_snap.val, obj, true); JSONDecoder::decode_json("damage_flags", damage_flags, obj, true); //sr_t srnode; //JSONDecoder::decode_json("snap_blob", srnode, obj, true); // cann't decode it now //snap_blob = srnode; } void InodeStoreBase::xattrs_cb(InodeStoreBase::mempool_xattr_map& c, JSONObj *obj){ string k; JSONDecoder::decode_json("key", k, obj, true); string v; JSONDecoder::decode_json("val", v, obj, true); c[k.c_str()] = buffer::copy(v.c_str(), v.size()); } void InodeStoreBase::old_indoes_cb(InodeStoreBase::mempool_old_inode_map& c, JSONObj *obj){ snapid_t s; JSONDecoder::decode_json("last", s.val, obj, true); InodeStoreBase::mempool_old_inode i; // i.decode_json(obj); // cann't decode now, simon c[s] = i; } void InodeStore::generate_test_instances(std::list &ls) { InodeStore *populated = new InodeStore; populated->get_inode()->ino = 0xdeadbeef; populated->symlink = "rhubarb"; ls.push_back(populated); } void InodeStoreBare::generate_test_instances(std::list &ls) { InodeStoreBare *populated = new InodeStoreBare; populated->get_inode()->ino = 0xdeadbeef; populated->symlink = "rhubarb"; ls.push_back(populated); } void CInode::validate_disk_state(CInode::validated_data *results, MDSContext *fin) { class ValidationContinuation : public MDSContinuation { public: MDSContext *fin; CInode *in; CInode::validated_data *results; bufferlist bl; CInode *shadow_in; enum { START = 0, BACKTRACE, INODE, DIRFRAGS, SNAPREALM, }; ValidationContinuation(CInode *i, CInode::validated_data *data_r, MDSContext *fin_) : MDSContinuation(i->mdcache->mds->server), fin(fin_), 
in(i),
                             results(data_r),
                             shadow_in(NULL) {
      // NOTE(review): the static_cast template arguments were lost in
      // this copy of the file; they cast the member-function pointers
      // to the continuation callback type -- confirm against upstream.
      set_callback(START, static_cast(&ValidationContinuation::_start));
      set_callback(BACKTRACE, static_cast(&ValidationContinuation::_backtrace));
      set_callback(INODE, static_cast(&ValidationContinuation::_inode_disk));
      set_callback(DIRFRAGS, static_cast(&ValidationContinuation::_dirfrags));
    }

    ~ValidationContinuation() override {
      if (shadow_in) {
        delete shadow_in;
        in->mdcache->num_shadow_inodes--;
      }
    }

    /**
     * Fetch backtrace and set tag if tag is non-empty
     */
    void fetch_backtrace_and_tag(CInode *in,
                                 std::string_view tag, bool is_internal,
                                 Context *fin, int *bt_r, bufferlist *bt)
    {
      const int64_t pool = in->get_backtrace_pool();
      object_t oid = CInode::get_object_name(in->ino(), frag_t(), "");

      // async read of the "parent" xattr (the backtrace) off the object
      ObjectOperation fetch;
      fetch.getxattr("parent", bt, bt_r);
      in->mdcache->mds->objecter->read(oid, object_locator_t(pool), fetch, CEPH_NOSNAP,
                                       NULL, 0, fin);
      if (in->mdcache->mds->logger) {
        in->mdcache->mds->logger->inc(l_mds_openino_backtrace_fetch);
        in->mdcache->mds->logger->inc(l_mds_scrub_backtrace_fetch);
      }

      using ceph::encode;
      if (!is_internal) {
        // stamp the object with the scrub tag in a separate mutation
        ObjectOperation scrub_tag;
        bufferlist tag_bl;
        encode(tag, tag_bl);
        scrub_tag.setxattr("scrub_tag", tag_bl);
        SnapContext snapc;
        in->mdcache->mds->objecter->mutate(oid, object_locator_t(pool), scrub_tag, snapc,
                                           ceph::real_clock::now(),
                                           0, NULL);
        if (in->mdcache->mds->logger)
          in->mdcache->mds->logger->inc(l_mds_scrub_set_tag);
      }
    }

    // Stage START: auth-pin the inode and kick off the async backtrace
    // fetch (plus tag write); resumes at _backtrace().
    bool _start(int rval) {
      ceph_assert(in->can_auth_pin());
      in->auth_pin(this);

      if (in->is_dirty()) {
        MDCache *mdcache = in->mdcache;  // For the benefit of dout
        auto ino = [this]() { return in->ino(); };  // For the benefit of dout
        dout(20) << "validating a dirty CInode; results will be inconclusive"
                 << dendl;
      }

      C_OnFinisher *conf = new C_OnFinisher(get_io_callback(BACKTRACE),
                                            in->mdcache->mds->finisher);

      std::string_view tag = in->scrub_infop->header->get_tag();
      bool is_internal = in->scrub_infop->header->is_internal_tag();
      // Rather than using the usual CInode::fetch_backtrace,
      // use a special variant that optionally writes a tag in the same
      // operation.
      fetch_backtrace_and_tag(in, tag, is_internal, conf,
                              &results->backtrace.ondisk_read_retval, &bl);
      return false;
    }

    // Stage BACKTRACE: compare the on-disk backtrace with a freshly
    // built in-memory one, repair/record as needed, then continue with
    // the inode / dirfrag checks.
    bool _backtrace(int rval) {
      // set up basic result reporting and make sure we got the data
      results->performed_validation = true; // at least, some of it!
      results->backtrace.checked = true;

      const int64_t pool = in->get_backtrace_pool();
      inode_backtrace_t& memory_backtrace = results->backtrace.memory_value;
      in->build_backtrace(pool, memory_backtrace);
      bool equivalent, divergent;
      int memory_newer;

      MDCache *mdcache = in->mdcache;  // For the benefit of dout
      auto ino = [this]() { return in->ino(); };  // For the benefit of dout

      // Ignore rval because it's the result of a FAILOK operation
      // from fetch_backtrace_and_tag: the real result is in
      // backtrace.ondisk_read_retval
      dout(20) << "ondisk_read_retval: " << results->backtrace.ondisk_read_retval << dendl;
      if (results->backtrace.ondisk_read_retval != 0) {
        results->backtrace.error_str << "failed to read off disk; see retval";
        // we probably have a new unwritten file!
        // so skip the backtrace scrub for this entry and say that all's well
        if (in->is_dirty_parent()) {
          dout(20) << "forcing backtrace as passed since inode is dirty parent"
                   << dendl;
          results->backtrace.passed = true;
        }
        goto next;
      }

      // extract the backtrace, and compare it to a newly-constructed one
      try {
        auto p = bl.cbegin();
        using ceph::decode;
        decode(results->backtrace.ondisk_value, p);
        dout(10) << "decoded " << bl.length() << " bytes of backtrace successfully" << dendl;
      } catch (buffer::error&) {
        if (results->backtrace.ondisk_read_retval == 0 && rval != 0) {
          // Cases where something has clearly gone wrong with the overall
          // fetch op, though we didn't get a nonzero rc from the getxattr
          // operation. e.g. object missing.
          results->backtrace.ondisk_read_retval = rval;
        }
        results->backtrace.error_str << "failed to decode on-disk backtrace ("
                                     << bl.length() << " bytes)!";
        // we probably have a new unwritten file!
        // so skip the backtrace scrub for this entry and say that all's well
        if (in->is_dirty_parent()) {
          dout(20) << "decode failed; forcing backtrace as passed since "
                      "inode is dirty parent" << dendl;
          results->backtrace.passed = true;
        }

        goto next;
      }

      memory_newer = memory_backtrace.compare(results->backtrace.ondisk_value,
                                              &equivalent, &divergent);

      if (divergent || memory_newer < 0) {
        // we're divergent, or on-disk version is newer
        results->backtrace.error_str << "On-disk backtrace is divergent or newer";
        /* if the backtraces are divergent and the link count is 0, then
         * most likely its a stray entry that's being purged and things are
         * well and there's no reason for alarm
         */
        if (divergent && (in->is_dirty_parent() || in->get_inode()->nlink == 0)) {
          results->backtrace.passed = true;
          dout(20) << "divergent backtraces are acceptable when dn "
                      "is being purged or has been renamed or moved to a "
                      "different directory " << *in << dendl;
        }
      } else {
        results->backtrace.passed = true;
      }
next:
      // repair the backtrace by rewriting it if asked to
      if (!results->backtrace.passed && in->scrub_infop->header->get_repair()) {
        std::string path;
        in->make_path_string(path);
        in->mdcache->mds->clog->warn() << "bad backtrace on inode " << in->ino()
                                       << "(" << path << "), rewriting it";
        in->mark_dirty_parent(in->mdcache->mds->mdlog->get_current_segment(),
                              false);
        // Flag that we repaired this BT so that it won't go into damagetable
        results->backtrace.repaired = true;
        if (in->mdcache->mds->logger)
          in->mdcache->mds->logger->inc(l_mds_scrub_backtrace_repaired);
      }

      // If the inode's number was free in the InoTable, fix that
      // (#15619)
      {
        InoTable *inotable = mdcache->mds->inotable;

        dout(10) << "scrub: inotable ino = " << in->ino() << dendl;
        dout(10) << "scrub: inotable free says "
          << inotable->is_marked_free(in->ino()) << dendl;

        if (inotable->is_marked_free(in->ino())) {
          LogChannelRef clog = in->mdcache->mds->clog;
          clog->error() << "scrub: inode wrongly marked free: " << in->ino();

          if (in->scrub_infop->header->get_repair()) {
            bool repaired = inotable->repair(in->ino());
            if (repaired) {
              clog->error() << "inode table repaired for inode: " << in->ino();

              inotable->save();
              if (in->mdcache->mds->logger)
                in->mdcache->mds->logger->inc(l_mds_scrub_inotable_repaired);
            } else {
              clog->error() << "Cannot repair inotable while other operations"
                " are in progress";
            }
          }
        }
      }

      if (in->is_dir()) {
        if (in->mdcache->mds->logger)
          in->mdcache->mds->logger->inc(l_mds_scrub_dir_inodes);
        return validate_directory_data();
      } else {
        if (in->mdcache->mds->logger)
          in->mdcache->mds->logger->inc(l_mds_scrub_file_inodes);
        // TODO: validate on-disk inode for normal files
        return true;
      }
    }

    // Directories: base dirs get their on-disk inode fetched via a
    // shadow inode (stage INODE); others go straight to rstat checks.
    bool validate_directory_data() {
      ceph_assert(in->is_dir());

      if (in->is_base()) {
        if (!shadow_in) {
          shadow_in = new CInode(in->mdcache);
          in->mdcache->create_unlinked_system_inode(shadow_in, in->ino(), in->get_inode()->mode);
          in->mdcache->num_shadow_inodes++;
        }
        shadow_in->fetch(get_internal_callback(INODE));
        if (in->mdcache->mds->logger)
          in->mdcache->mds->logger->inc(l_mds_scrub_dir_base_inodes);
        return false;
      } else {
        // TODO: validate on-disk inode for non-base directories
        if (in->mdcache->mds->logger)
          in->mdcache->mds->logger->inc(l_mds_scrub_dirfrag_rstats);
        results->inode.passed = true;
        return check_dirfrag_rstats();
      }
    }

    // Stage INODE (base dirs only): compare the shadow inode loaded
    // from disk with the in-memory inode.
    bool _inode_disk(int rval) {
      const auto& si = shadow_in->get_inode();
      const auto& i = in->get_inode();

      results->inode.checked = true;
      results->inode.ondisk_read_retval = rval;
      results->inode.ondisk_value = *si;
      results->inode.memory_value = *i;

      if (si->version > i->version) {
        // uh, what?
        results->inode.error_str << "On-disk inode is newer than in-memory one; ";
        goto next;
      } else {
        bool divergent = false;
        int r = i->compare(*si, &divergent);
        results->inode.passed = !divergent && r >= 0;
        if (!results->inode.passed) {
          results->inode.error_str <<
              "On-disk inode is divergent or newer than in-memory one; ";
          goto next;
        }
      }
next:
      return check_dirfrag_rstats();
    }

    // Subtree-root dirfrags need their stats rdlocked across MDSs
    // first; otherwise we can proceed to stage DIRFRAGS immediately.
    bool check_dirfrag_rstats() {
      if (in->has_subtree_root_dirfrag()) {
        in->mdcache->rdlock_dirfrags_stats(in, get_internal_callback(DIRFRAGS));
        return false;
      } else {
        return immediate(DIRFRAGS, 0);
      }
    }

    // Stage DIRFRAGS: sum accounted fragstat/rstat over all dirfrags
    // and compare with the inode's dirstat/rstat; repair if requested.
    bool _dirfrags(int rval) {
      // basic reporting setup
      results->raw_stats.checked = true;
      results->raw_stats.ondisk_read_retval = rval;

      results->raw_stats.memory_value.dirstat = in->get_inode()->dirstat;
      results->raw_stats.memory_value.rstat = in->get_inode()->rstat;
      frag_info_t& dir_info = results->raw_stats.ondisk_value.dirstat;
      nest_info_t& nest_info = results->raw_stats.ondisk_value.rstat;

      if (rval != 0) {
        results->raw_stats.error_str << "Failed to read dirfrags off disk";
        goto next;
      }

      // check each dirfrag...
      for (const auto &p : in->dirfrags) {
        CDir *dir = p.second;
        ceph_assert(dir->get_version() > 0);
        nest_info.add(dir->get_fnode()->accounted_rstat);
        dir_info.add(dir->get_fnode()->accounted_fragstat);
      }
      nest_info.rsubdirs++; // it gets one to account for self
      if (const sr_t *srnode = in->get_projected_srnode(); srnode)
        nest_info.rsnaps += srnode->snaps.size();

      // ...and that their sum matches our inode settings
      if (!dir_info.same_sums(in->get_inode()->dirstat) ||
          !nest_info.same_sums(in->get_inode()->rstat)) {
        if (in->scrub_infop->header->get_repair()) {
          results->raw_stats.error_str
            << "freshly-calculated rstats don't match existing ones (will be fixed)";
          in->mdcache->repair_inode_stats(in);
          results->raw_stats.repaired = true;
        } else {
          results->raw_stats.error_str
            << "freshly-calculated rstats don't match existing ones";
        }
        if (in->is_dirty()) {
          MDCache *mdcache = in->mdcache; // for dout()
          auto ino = [this]() { return in->ino(); }; // for dout()
          dout(20) << "raw stats most likely wont match since inode is dirty; "
                      "please rerun scrub when system is stable; "
                      "assuming passed for now;" << dendl;
          results->raw_stats.passed = true;
        }
        goto next;
      }

      results->raw_stats.passed = true;
      {
        MDCache *mdcache = in->mdcache; // for dout()
        auto ino = [this]() { return in->ino(); }; // for dout()
        dout(20) << "raw stats check passed on " << *in << dendl;
      }
next:
      return true;
    }

    // Final stage: fold per-section results into overall pass/fail,
    // flag repairs for end-of-scrub flush, complete the caller.
    void _done() override {
      if ((!results->raw_stats.checked || results->raw_stats.passed) &&
          (!results->backtrace.checked || results->backtrace.passed) &&
          (!results->inode.checked || results->inode.passed))
        results->passed_validation = true;

      // Flag that we did some repair work so that our repair operation
      // can be flushed at end of scrub
      if (results->backtrace.repaired ||
          results->inode.repaired ||
          results->raw_stats.repaired)
        in->scrub_infop->header->set_repaired();
      if (fin)
        fin->complete(get_rval());

      in->auth_unpin(this);
    }
  };

  dout(10) << "scrub starting validate_disk_state on " << *this << dendl;
  ValidationContinuation *vc = new ValidationContinuation(this,
                                                          results,
                                                          fin);
  vc->begin();
}

// Dump the results gathered by validate_disk_state() to a Formatter.
void CInode::validated_data::dump(Formatter *f) const
{
  f->open_object_section("results");
  {
    f->dump_bool("performed_validation", performed_validation);
    f->dump_bool("passed_validation", passed_validation);
    f->open_object_section("backtrace");
    {
      f->dump_bool("checked", backtrace.checked);
      f->dump_bool("passed", backtrace.passed);
      f->dump_int("read_ret_val", backtrace.ondisk_read_retval);
      f->dump_stream("ondisk_value") << backtrace.ondisk_value;
      f->dump_stream("memoryvalue") << backtrace.memory_value;
      f->dump_string("error_str", backtrace.error_str.str());
    }
    f->close_section(); // backtrace
    f->open_object_section("raw_stats");
    {
      f->dump_bool("checked", raw_stats.checked);
      f->dump_bool("passed", raw_stats.passed);
      f->dump_int("read_ret_val", raw_stats.ondisk_read_retval);
      f->dump_stream("ondisk_value.dirstat") << raw_stats.ondisk_value.dirstat;
      f->dump_stream("ondisk_value.rstat") << raw_stats.ondisk_value.rstat;
      f->dump_stream("memory_value.dirstat") << raw_stats.memory_value.dirstat;
      f->dump_stream("memory_value.rstat") << raw_stats.memory_value.rstat;
      f->dump_string("error_str", raw_stats.error_str.str());
    }
    f->close_section(); // raw_stats

    // dump failure return code (last failing section wins)
    int rc = 0;
    if (backtrace.checked && backtrace.ondisk_read_retval)
      rc = backtrace.ondisk_read_retval;
    if (inode.checked && inode.ondisk_read_retval)
      rc = inode.ondisk_read_retval;
    if (raw_stats.checked && raw_stats.ondisk_read_retval)
      rc = raw_stats.ondisk_read_retval;
    f->dump_int("return_code", rc);
  }
  f->close_section(); // results
}

// True when every section that was checked and failed was also repaired.
bool CInode::validated_data::all_damage_repaired() const
{
  bool unrepaired =
    (raw_stats.checked && !raw_stats.passed && !raw_stats.repaired)
    ||
    (backtrace.checked && !backtrace.passed && !backtrace.repaired)
    ||
    (inode.checked && !inode.passed && !inode.repaired);

  return !unrepaired;
}

// Dump this CInode to a Formatter; `flags` selects which sections
// (path, inode store, locks, state, caps, dirfrags) are included.
void CInode::dump(Formatter *f, int flags) const
{
  if (flags & DUMP_PATH) {
    std::string path;
make_path_string(path, true);
    if (path.empty())
      path = "/";  // root has an empty path string
    f->dump_string("path", path);
  }

  if (flags & DUMP_INODE_STORE_BASE)
    InodeStoreBase::dump(f);

  if (flags & DUMP_MDS_CACHE_OBJECT)
    MDSCacheObject::dump(f);

  if (flags & DUMP_LOCKS) {
    // one section per inode lock
    f->open_object_section("versionlock");
    versionlock.dump(f);
    f->close_section();

    f->open_object_section("authlock");
    authlock.dump(f);
    f->close_section();

    f->open_object_section("linklock");
    linklock.dump(f);
    f->close_section();

    f->open_object_section("dirfragtreelock");
    dirfragtreelock.dump(f);
    f->close_section();

    f->open_object_section("filelock");
    filelock.dump(f);
    f->close_section();

    f->open_object_section("xattrlock");
    xattrlock.dump(f);
    f->close_section();

    f->open_object_section("snaplock");
    snaplock.dump(f);
    f->close_section();

    f->open_object_section("nestlock");
    nestlock.dump(f);
    f->close_section();

    f->open_object_section("flocklock");
    flocklock.dump(f);
    f->close_section();

    f->open_object_section("policylock");
    policylock.dump(f);
    f->close_section();
  }

  if (flags & DUMP_STATE) {
    // generic cache-object states plus the CInode-specific ones
    f->open_array_section("states");
    MDSCacheObject::dump_states(f);
    if (state_test(STATE_EXPORTING))
      f->dump_string("state", "exporting");
    if (state_test(STATE_OPENINGDIR))
      f->dump_string("state", "openingdir");
    if (state_test(STATE_FREEZING))
      f->dump_string("state", "freezing");
    if (state_test(STATE_FROZEN))
      f->dump_string("state", "frozen");
    if (state_test(STATE_AMBIGUOUSAUTH))
      f->dump_string("state", "ambiguousauth");
    if (state_test(STATE_EXPORTINGCAPS))
      f->dump_string("state", "exportingcaps");
    if (state_test(STATE_NEEDSRECOVER))
      f->dump_string("state", "needsrecover");
    if (state_test(STATE_PURGING))
      f->dump_string("state", "purging");
    if (state_test(STATE_DIRTYPARENT))
      f->dump_string("state", "dirtyparent");
    if (state_test(STATE_DIRTYRSTAT))
      f->dump_string("state", "dirtyrstat");
    if (state_test(STATE_STRAYPINNED))
      f->dump_string("state", "straypinned");
    if (state_test(STATE_FROZENAUTHPIN))
      f->dump_string("state", "frozenauthpin");
    if (state_test(STATE_DIRTYPOOL))
      f->dump_string("state", "dirtypool");
    if (state_test(STATE_ORPHAN))
      f->dump_string("state", "orphan");
    if (state_test(STATE_MISSINGOBJS))
      f->dump_string("state", "missingobjs");
    f->close_section();
  }

  if (flags & DUMP_CAPS) {
    f->open_array_section("client_caps");
    for (const auto &p : client_caps) {
      auto &client = p.first;
      auto cap = &p.second;
      f->open_object_section("client_cap");
      f->dump_int("client_id", client.v);
      f->dump_string("pending", ccap_string(cap->pending()));
      f->dump_string("issued", ccap_string(cap->issued()));
      f->dump_string("wanted", ccap_string(cap->wanted()));
      f->dump_int("last_sent", cap->get_last_seq());
      f->close_section();
    }
    f->close_section();

    f->dump_int("loner", loner_cap.v);
    f->dump_int("want_loner", want_loner_cap.v);

    f->open_array_section("mds_caps_wanted");
    for (const auto &p : mds_caps_wanted) {
      f->open_object_section("mds_cap_wanted");
      f->dump_int("rank", p.first);
      f->dump_string("cap", ccap_string(p.second));
      f->close_section();
    }
    f->close_section();
  }

  if (flags & DUMP_DIRFRAGS) {
    f->open_array_section("dirfrags");
    auto&& dfs = get_dirfrags();
    for(const auto &dir: dfs) {
      f->open_object_section("dir");
      dir->dump(f, CDir::DUMP_DEFAULT | CDir::DUMP_ITEMS);
      dir->check_rstats();
      f->close_section();
    }
    f->close_section();
  }
}

/****** Scrub Stuff *****/

// Lazily create scrub bookkeeping for this inode, seeded from the last
// scrub stamp/version recorded in the (projected) inode.
void CInode::scrub_info_create() const
{
  dout(25) << __func__ << dendl;
  ceph_assert(!scrub_infop);

  // break out of const-land to set up implicit initial state
  CInode *me = const_cast(this);
  const auto& pi = me->get_projected_inode();

  std::unique_ptr si(new scrub_info_t());
  si->last_scrub_stamp = pi->last_scrub_stamp;
  si->last_scrub_version = pi->last_scrub_version;

  me->scrub_infop.swap(si);
}

// Free the scrub bookkeeping once nothing is in progress and nothing
// remains to be persisted.
void CInode::scrub_maybe_delete_info()
{
  if (scrub_infop &&
      !scrub_infop->scrub_in_progress &&
      !scrub_infop->last_scrub_dirty) {
    scrub_infop.reset();
  }
}

// Mark a scrub as started on this inode under the given header.
void CInode::scrub_initialize(ScrubHeaderRef& header)
{
  dout(20) << __func__ << " with scrub_version " << get_version() << dendl;

  scrub_info();
  scrub_infop->scrub_in_progress = true;
  scrub_infop->queued_frags.clear();
  scrub_infop->header = header;
  header->inc_num_pending();
  // right now we don't handle remote inodes
}

// Undo scrub_initialize() when the scrub is cancelled.
void CInode::scrub_aborted() {
  dout(20) << __func__ << dendl;
  ceph_assert(scrub_is_in_progress());

  scrub_infop->scrub_in_progress = false;
  scrub_infop->header->dec_num_pending();
  scrub_maybe_delete_info();
}

// Record a successful scrub; last_scrub_dirty marks that the stamp and
// version still need to be persisted.
void CInode::scrub_finished() {
  dout(20) << __func__ << dendl;
  ceph_assert(scrub_is_in_progress());

  scrub_infop->last_scrub_version = get_version();
  scrub_infop->last_scrub_stamp = ceph_clock_now();
  scrub_infop->last_scrub_dirty = true;
  scrub_infop->scrub_in_progress = false;
  scrub_infop->header->dec_num_pending();
}

// Pool where this inode's backtrace object lives: the metadata pool for
// directories, the file's own layout pool otherwise.
int64_t CInode::get_backtrace_pool() const
{
  if (is_dir()) {
    return mdcache->mds->get_metadata_pool();
  } else {
    // Files are required to have an explicit layout that specifies
    // a pool
    ceph_assert(get_inode()->layout.pool_id != -1);
    return get_inode()->layout.pool_id;
  }
}

// Decide whether this inode needs (re-)export-pinning work and, if so,
// queue it on the mdcache export-pin queue.
void CInode::queue_export_pin(mds_rank_t export_pin)
{
  if (state_test(CInode::STATE_QUEUEDEXPORTPIN))
    return;  // already queued

  // resolve the pin policy into a concrete target rank
  mds_rank_t target;
  if (export_pin >= 0)
    target = export_pin;
  else if (export_pin == MDS_RANK_EPHEMERAL_RAND)
    target = mdcache->hash_into_rank_bucket(ino());
  else
    target = MDS_RANK_NONE;

  unsigned min_frag_bits = mdcache->get_ephemeral_dist_frag_bits();
  bool queue = false;
  for (auto& p : dirfrags) {
    CDir *dir = p.second;
    if (!dir->is_auth())
      continue;

    if (export_pin == MDS_RANK_EPHEMERAL_DIST) {
      if (dir->get_frag().bits() < min_frag_bits) {
        // needs split
        queue = true;
        break;
      }
      // distributed pin: target depends on the frag, not just the ino
      target = mdcache->hash_into_rank_bucket(ino(), dir->get_frag());
    }

    if (target != MDS_RANK_NONE) {
      if (dir->is_subtree_root()) {
        // set auxsubtree bit or export it
        if (!dir->state_test(CDir::STATE_AUXSUBTREE) ||
            target != dir->get_dir_auth().first)
          queue = true;
      } else {
        // create aux subtree or export it
        queue = true;
      }
    } else {
      // clear aux subtrees ?
      queue = dir->state_test(CDir::STATE_AUXSUBTREE);
    }

    if (queue)
      break;
  }

  if (queue) {
    state_set(CInode::STATE_QUEUEDEXPORTPIN);
    mdcache->export_pin_queue.insert(this);
  }
}

// Re-evaluate this directory's export pin; `update` forces queuing even
// when no explicit pin is set (e.g. after a policy change).
void CInode::maybe_export_pin(bool update)
{
  if (!g_conf()->mds_bal_export_pin)
    return;
  if (!is_dir() || !is_normal())
    return;

  dout(15) << __func__ << " update=" << update << " " << *this << dendl;

  mds_rank_t export_pin = get_export_pin(false);
  if (export_pin == MDS_RANK_NONE && !update)
    return;

  check_pin_policy(export_pin);
  queue_export_pin(export_pin);
}

// Set the distributed and/or random ephemeral-pin state bits and track
// the inode in the mdcache's ephemeral-pin set.
void CInode::set_ephemeral_pin(bool dist, bool rand)
{
  unsigned state = 0;
  if (dist)
    state |= STATE_DISTEPHEMERALPIN;
  if (rand)
    state |= STATE_RANDEPHEMERALPIN;
  if (!state)
    return;  // nothing requested

  if (state_test(state) != state) {
    dout(10) << "set ephemeral (" << (dist ? "dist" : "")
             << (rand ? " rand" : "") << ") pin on " << *this << dendl;
    if (!is_ephemerally_pinned()) {
      auto p = mdcache->export_ephemeral_pins.insert(this);
      ceph_assert(p.second);
    }
    state_set(state);
  }
}

// Clear the requested ephemeral-pin state bits.
// (definition continues beyond this excerpt)
void CInode::clear_ephemeral_pin(bool dist, bool rand)
{
  unsigned state = 0;
  if (dist)
    state |= STATE_DISTEPHEMERALPIN;
  if (rand)
    state |= STATE_RANDEPHEMERALPIN;

  if (state_test(state)) {
    dout(10) << "clear ephemeral (" << (dist ? "dist" : "")
             << (rand ?
" rand" : "") << ") pin on " << *this << dendl; state_clear(state); if (!is_ephemerally_pinned()) { auto count = mdcache->export_ephemeral_pins.erase(this); ceph_assert(count == 1); } } } void CInode::maybe_ephemeral_rand(double threshold) { if (!mdcache->get_export_ephemeral_random_config()) { dout(15) << __func__ << " config false: cannot ephemeral random pin " << *this << dendl; clear_ephemeral_pin(false, true); return; } else if (!is_dir() || !is_normal()) { dout(15) << __func__ << " !dir or !normal: cannot ephemeral random pin " << *this << dendl; clear_ephemeral_pin(false, true); return; } else if (get_inode()->nlink == 0) { dout(15) << __func__ << " unlinked directory: cannot ephemeral random pin " << *this << dendl; clear_ephemeral_pin(false, true); return; } else if (state_test(CInode::STATE_RANDEPHEMERALPIN)) { dout(10) << __func__ << " already ephemeral random pinned: requeueing " << *this << dendl; queue_export_pin(MDS_RANK_EPHEMERAL_RAND); return; } /* not precomputed? */ if (threshold < 0.0) { threshold = get_ephemeral_rand(); } if (threshold <= 0.0) { return; } double n = ceph::util::generate_random_number(0.0, 1.0); dout(15) << __func__ << " rand " << n << " export_ephemeral_random_pin = probability; } void CInode::setxattr_ephemeral_dist(bool val) { ceph_assert(is_dir()); _get_projected_inode()->export_ephemeral_distributed_pin = val; } void CInode::set_export_pin(mds_rank_t rank) { ceph_assert(is_dir()); _get_projected_inode()->export_pin = rank; maybe_export_pin(true); } mds_rank_t CInode::get_export_pin(bool inherit) const { if (!g_conf()->mds_bal_export_pin) return MDS_RANK_NONE; /* An inode that is export pinned may not necessarily be a subtree root, we * need to traverse the parents. A base or system inode cannot be pinned. * N.B. inodes not yet linked into a dir (i.e. anonymous inodes) will not * have a parent yet. 
*/ mds_rank_t r_target = MDS_RANK_NONE; const CInode *in = this; const CDir *dir = nullptr; while (true) { if (in->is_system()) break; const CDentry *pdn = in->get_parent_dn(); if (!pdn) break; if (in->get_inode()->nlink == 0) { // ignore export pin for unlinked directory break; } if (in->get_inode()->export_pin >= 0) { return in->get_inode()->export_pin; } else if (in->get_inode()->export_ephemeral_distributed_pin && mdcache->get_export_ephemeral_distributed_config()) { if (in != this) return mdcache->hash_into_rank_bucket(in->ino(), dir->get_frag()); return MDS_RANK_EPHEMERAL_DIST; } else if (r_target != MDS_RANK_NONE && in->get_inode()->export_ephemeral_random_pin > 0.0) { return r_target; } else if (r_target == MDS_RANK_NONE && in->is_ephemeral_rand() && mdcache->get_export_ephemeral_random_config()) { /* If a parent overrides a grandparent ephemeral pin policy with an export pin, we use that export pin instead. */ if (!inherit) return MDS_RANK_EPHEMERAL_RAND; if (in == this) r_target = MDS_RANK_EPHEMERAL_RAND; else r_target = mdcache->hash_into_rank_bucket(in->ino()); } if (!inherit) break; dir = pdn->get_dir(); in = dir->inode; } return MDS_RANK_NONE; } void CInode::check_pin_policy(mds_rank_t export_pin) { if (export_pin == MDS_RANK_EPHEMERAL_DIST) { set_ephemeral_pin(true, false); clear_ephemeral_pin(false, true); } else if (export_pin == MDS_RANK_EPHEMERAL_RAND) { set_ephemeral_pin(false, true); clear_ephemeral_pin(true, false); } else if (is_ephemerally_pinned()) { // export_pin >= 0 || export_pin == MDS_RANK_NONE clear_ephemeral_pin(true, true); if (export_pin != get_inode()->export_pin) // inherited export_pin queue_export_pin(MDS_RANK_NONE); } } double CInode::get_ephemeral_rand() const { /* N.B. inodes not yet linked into a dir (i.e. anonymous inodes) will not * have a parent yet. 
*/ const CInode *in = this; double max = mdcache->export_ephemeral_random_max; while (true) { if (in->is_system()) break; const CDentry *pdn = in->get_parent_dn(); if (!pdn) break; // ignore export pin for unlinked directory if (in->get_inode()->nlink == 0) break; if (in->get_inode()->export_ephemeral_random_pin > 0.0) return std::min(in->get_inode()->export_ephemeral_random_pin, max); /* An export_pin overrides only if no closer parent (incl. this one) has a * random pin set. */ if (in->get_inode()->export_pin >= 0 || in->get_inode()->export_ephemeral_distributed_pin) return 0.0; in = pdn->get_dir()->inode; } return 0.0; } void CInode::get_nested_dirfrags(std::vector& v) const { for (const auto &p : dirfrags) { const auto& dir = p.second; if (!dir->is_subtree_root()) v.push_back(dir); } } void CInode::get_subtree_dirfrags(std::vector& v) const { for (const auto &p : dirfrags) { const auto& dir = p.second; if (dir->is_subtree_root()) v.push_back(dir); } } MEMPOOL_DEFINE_OBJECT_FACTORY(CInode, co_inode, mds_co);