| author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-21 11:54:28 +0000 |
|---|---|---|
| committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-21 11:54:28 +0000 |
| commit | e6918187568dbd01842d8d1d2c808ce16a894239 (patch) | |
| tree | 64f88b554b444a49f656b6c656111a145cbbaa28 /src/mds/CInode.cc | |
| parent | Initial commit. (diff) | |
| download | ceph-e6918187568dbd01842d8d1d2c808ce16a894239.tar.xz, ceph-e6918187568dbd01842d8d1d2c808ce16a894239.zip | |
Adding upstream version 18.2.2. (upstream/18.2.2)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/mds/CInode.cc')
-rw-r--r-- | src/mds/CInode.cc | 5526 |
1 file changed, 5526 insertions, 0 deletions
diff --git a/src/mds/CInode.cc b/src/mds/CInode.cc new file mode 100644 index 000000000..23cb087c8 --- /dev/null +++ b/src/mds/CInode.cc @@ -0,0 +1,5526 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "include/int_types.h" +#include "common/errno.h" + +#include <string> + +#include "CInode.h" +#include "CDir.h" +#include "CDentry.h" + +#include "MDSRank.h" +#include "MDCache.h" +#include "MDLog.h" +#include "Locker.h" +#include "Mutation.h" + +#include "events/EUpdate.h" + +#include "osdc/Objecter.h" + +#include "snap.h" + +#include "LogSegment.h" + +#include "common/Clock.h" + +#include "common/config.h" +#include "global/global_context.h" +#include "include/ceph_assert.h" + +#include "mds/MDSContinuation.h" +#include "mds/InoTable.h" +#include "cephfs_features.h" +#include "osdc/Objecter.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mds +#undef dout_prefix +#define dout_prefix *_dout << "mds." << mdcache->mds->get_nodeid() << ".cache.ino(" << ino() << ") " + +using namespace std; + +void CInodeCommitOperation::update(ObjectOperation &op, inode_backtrace_t &bt) { + using ceph::encode; + + op.priority = priority; + op.create(false); + + bufferlist parent_bl; + encode(bt, parent_bl); + op.setxattr("parent", parent_bl); + + // for the old pool there is no need to update the layout and symlink + if (!update_layout_symlink) + return; + + bufferlist layout_bl; + encode(_layout, layout_bl, _features); + op.setxattr("layout", layout_bl); + + if (!_symlink.empty()) { + bufferlist symlink_bl; + encode(_symlink, symlink_bl); + op.setxattr("symlink", symlink_bl); + } +} + +class CInodeIOContext : public MDSIOContextBase +{ +protected: + CInode *in; + MDSRank *get_mds() override {return in->mdcache->mds;} +public: + explicit CInodeIOContext(CInode *in_) : in(in_) { + ceph_assert(in != NULL); + } +}; + +sr_t* const CInode::projected_inode::UNDEF_SRNODE = (sr_t*)(unsigned long)-1; + +LockType CInode::versionlock_type(CEPH_LOCK_IVERSION); +LockType CInode::authlock_type(CEPH_LOCK_IAUTH); +LockType CInode::linklock_type(CEPH_LOCK_ILINK); +LockType CInode::dirfragtreelock_type(CEPH_LOCK_IDFT); +LockType CInode::filelock_type(CEPH_LOCK_IFILE); +LockType CInode::xattrlock_type(CEPH_LOCK_IXATTR); +LockType CInode::snaplock_type(CEPH_LOCK_ISNAP); +LockType CInode::nestlock_type(CEPH_LOCK_INEST); +LockType CInode::flocklock_type(CEPH_LOCK_IFLOCK); +LockType CInode::policylock_type(CEPH_LOCK_IPOLICY); + +std::string_view CInode::pin_name(int p) const +{ + switch (p) { + case PIN_DIRFRAG: return "dirfrag"; + case PIN_CAPS: return "caps"; + case PIN_IMPORTING: return "importing"; + case PIN_OPENINGDIR: return "openingdir"; + case PIN_REMOTEPARENT: return "remoteparent"; + case PIN_BATCHOPENJOURNAL: return "batchopenjournal"; + case PIN_SCATTERED: return "scattered"; + case PIN_STICKYDIRS: return "stickydirs"; + //case PIN_PURGING: return "purging"; + case PIN_FREEZING: return "freezing"; + case PIN_FROZEN: return "frozen"; + case PIN_IMPORTINGCAPS: return "importingcaps"; + case PIN_EXPORTINGCAPS: return "exportingcaps"; + case PIN_PASTSNAPPARENT: return "pastsnapparent"; + case 
PIN_OPENINGSNAPPARENTS: return "openingsnapparents"; + case PIN_TRUNCATING: return "truncating"; + case PIN_STRAY: return "stray"; + case PIN_NEEDSNAPFLUSH: return "needsnapflush"; + case PIN_DIRTYRSTAT: return "dirtyrstat"; + case PIN_DIRTYPARENT: return "dirtyparent"; + case PIN_DIRWAITER: return "dirwaiter"; + default: return generic_pin_name(p); + } +} + +//int cinode_pins[CINODE_NUM_PINS]; // counts +ostream& CInode::print_db_line_prefix(ostream& out) const +{ + return out << ceph_clock_now() << " mds." << mdcache->mds->get_nodeid() << ".cache.ino(" << ino() << ") "; +} + +/* + * write caps and lock ids + */ +struct cinode_lock_info_t cinode_lock_info[] = { + { CEPH_LOCK_IFILE, CEPH_CAP_ANY_FILE_WR }, + { CEPH_LOCK_IAUTH, CEPH_CAP_AUTH_EXCL }, + { CEPH_LOCK_ILINK, CEPH_CAP_LINK_EXCL }, + { CEPH_LOCK_IXATTR, CEPH_CAP_XATTR_EXCL }, +}; +int num_cinode_locks = sizeof(cinode_lock_info) / sizeof(cinode_lock_info[0]); + +ostream& operator<<(ostream& out, const CInode& in) +{ + string path; + in.make_path_string(path, true); + + out << "[inode " << in.ino(); + out << " [" + << (in.is_multiversion() ? "...":"") + << in.first << "," << in.last << "]"; + out << " " << path << (in.is_dir() ? "/":""); + + if (in.is_auth()) { + out << " auth"; + if (in.is_replicated()) + out << in.get_replicas(); + } else { + mds_authority_t a = in.authority(); + out << " rep@" << a.first; + if (a.second != CDIR_AUTH_UNKNOWN) + out << "," << a.second; + out << "." << in.get_replica_nonce(); + } + + if (in.is_symlink()) + out << " symlink='" << in.symlink << "'"; + if (in.is_dir() && !in.dirfragtree.empty()) + out << " " << in.dirfragtree; + + out << " v" << in.get_version(); + if (in.get_projected_version() > in.get_version()) + out << " pv" << in.get_projected_version(); + + if (in.get_num_auth_pins()) { + out << " ap=" << in.get_num_auth_pins(); +#ifdef MDS_AUTHPIN_SET + in.print_authpin_set(out); +#endif + } + + if (in.snaprealm) + out << " snaprealm=" << in.snaprealm; + + if (in.state_test(CInode::STATE_AMBIGUOUSAUTH)) out << " AMBIGAUTH"; + if (in.state_test(CInode::STATE_NEEDSRECOVER)) out << " NEEDSRECOVER"; + if (in.state_test(CInode::STATE_RECOVERING)) out << " RECOVERING"; + if (in.state_test(CInode::STATE_DIRTYPARENT)) out << " DIRTYPARENT"; + if (in.state_test(CInode::STATE_MISSINGOBJS)) out << " MISSINGOBJS"; + if (in.is_ephemeral_dist()) out << " DISTEPHEMERALPIN"; + if (in.is_ephemeral_rand()) out << " RANDEPHEMERALPIN"; + if (in.is_freezing_inode()) out << " FREEZING=" << in.auth_pin_freeze_allowance; + if (in.is_frozen_inode()) out << " FROZEN"; + if (in.is_frozen_auth_pin()) out << " FROZEN_AUTHPIN"; + + const auto& pi = in.get_projected_inode(); + if (pi->is_truncating()) + out << " truncating(" << pi->truncate_from << " to " << pi->truncate_size << ")"; + + if (in.is_dir()) { + out << " " << in.get_inode()->dirstat; + if (g_conf()->mds_debug_scatterstat && in.is_projected()) { + out << "->" << pi->dirstat; + } + } else { + out << " s=" << in.get_inode()->size; + if (in.get_inode()->nlink != 1) + out << " nl=" << in.get_inode()->nlink; + } + + // rstat + out << " " << in.get_inode()->rstat; + if (!(in.get_inode()->rstat == in.get_inode()->accounted_rstat)) + out << "/" << in.get_inode()->accounted_rstat; + if (g_conf()->mds_debug_scatterstat && in.is_projected()) { + out << "->" << pi->rstat; + if (!(pi->rstat == pi->accounted_rstat)) + out << "/" << pi->accounted_rstat; + } + + if (in.is_any_old_inodes()) { + out << " old_inodes=" << in.get_old_inodes()->size(); + } + + if 
(!in.client_need_snapflush.empty()) + out << " need_snapflush=" << in.client_need_snapflush; + + // locks + if (!in.authlock.is_sync_and_unlocked()) + out << " " << in.authlock; + if (!in.linklock.is_sync_and_unlocked()) + out << " " << in.linklock; + if (in.get_inode()->is_dir()) { + if (!in.dirfragtreelock.is_sync_and_unlocked()) + out << " " << in.dirfragtreelock; + if (!in.snaplock.is_sync_and_unlocked()) + out << " " << in.snaplock; + if (!in.nestlock.is_sync_and_unlocked()) + out << " " << in.nestlock; + if (!in.policylock.is_sync_and_unlocked()) + out << " " << in.policylock; + } else { + if (!in.flocklock.is_sync_and_unlocked()) + out << " " << in.flocklock; + } + if (!in.filelock.is_sync_and_unlocked()) + out << " " << in.filelock; + if (!in.xattrlock.is_sync_and_unlocked()) + out << " " << in.xattrlock; + if (!in.versionlock.is_sync_and_unlocked()) + out << " " << in.versionlock; + + // hack: spit out crap on which clients have caps + if (in.get_inode()->client_ranges.size()) + out << " cr=" << in.get_inode()->client_ranges; + + if (!in.get_client_caps().empty()) { + out << " caps={"; + bool first = true; + for (const auto &p : in.get_client_caps()) { + if (!first) out << ","; + out << p.first << "=" + << ccap_string(p.second.pending()); + if (p.second.issued() != p.second.pending()) + out << "/" << ccap_string(p.second.issued()); + out << "/" << ccap_string(p.second.wanted()) + << "@" << p.second.get_last_seq(); + first = false; + } + out << "}"; + if (in.get_loner() >= 0 || in.get_wanted_loner() >= 0) { + out << ",l=" << in.get_loner(); + if (in.get_loner() != in.get_wanted_loner()) + out << "(" << in.get_wanted_loner() << ")"; + } + } + if (!in.get_mds_caps_wanted().empty()) { + out << " mcw={"; + bool first = true; + for (const auto &p : in.get_mds_caps_wanted()) { + if (!first) + out << ','; + out << p.first << '=' << ccap_string(p.second); + first = false; + } + out << '}'; + } + + if (in.get_num_ref()) { + out << " |"; + in.print_pin_set(out); + } + + if (in.get_inode()->export_pin != MDS_RANK_NONE) { + out << " export_pin=" << in.get_inode()->export_pin; + } + if (in.state_test(CInode::STATE_DISTEPHEMERALPIN)) { + out << " distepin"; + } + if (in.state_test(CInode::STATE_RANDEPHEMERALPIN)) { + out << " randepin"; + } + + out << " " << ∈ + out << "]"; + return out; +} + +CInode::CInode(MDCache *c, bool auth, snapid_t f, snapid_t l) : + mdcache(c), first(f), last(l), + item_dirty(this), + item_caps(this), + item_open_file(this), + item_dirty_parent(this), + item_dirty_dirfrag_dir(this), + item_dirty_dirfrag_nest(this), + item_dirty_dirfrag_dirfragtree(this), + pop(c->decayrate), + versionlock(this, &versionlock_type), + authlock(this, &authlock_type), + linklock(this, &linklock_type), + dirfragtreelock(this, &dirfragtreelock_type), + filelock(this, &filelock_type), + xattrlock(this, &xattrlock_type), + snaplock(this, &snaplock_type), + nestlock(this, &nestlock_type), + flocklock(this, &flocklock_type), + policylock(this, &policylock_type) +{ + if (auth) + state_set(STATE_AUTH); +} + +void CInode::print(ostream& out) const +{ + out << *this; +} + +void CInode::add_need_snapflush(CInode *snapin, snapid_t snapid, client_t client) +{ + dout(10) << __func__ << " client." << client << " snapid " << snapid << " on " << snapin << dendl; + + if (client_need_snapflush.empty()) { + get(CInode::PIN_NEEDSNAPFLUSH); + + // FIXME: this is non-optimal, as we'll block freezes/migrations for potentially + // long periods waiting for clients to flush their snaps. 
+ auth_pin(this); // pin head get_inode()->.. + } + + auto &clients = client_need_snapflush[snapid]; + if (clients.empty()) + snapin->auth_pin(this); // ...and pin snapped/old inode! + + clients.insert(client); +} + +void CInode::remove_need_snapflush(CInode *snapin, snapid_t snapid, client_t client) +{ + dout(10) << __func__ << " client." << client << " snapid " << snapid << " on " << snapin << dendl; + auto it = client_need_snapflush.find(snapid); + if (it == client_need_snapflush.end()) { + dout(10) << " snapid not found" << dendl; + return; + } + size_t n = it->second.erase(client); + if (n == 0) { + dout(10) << " client not found" << dendl; + return; + } + if (it->second.empty()) { + client_need_snapflush.erase(it); + snapin->auth_unpin(this); + + if (client_need_snapflush.empty()) { + put(CInode::PIN_NEEDSNAPFLUSH); + auth_unpin(this); + } + } +} + +pair<bool,bool> CInode::split_need_snapflush(CInode *cowin, CInode *in) +{ + dout(10) << __func__ << " [" << cowin->first << "," << cowin->last << "] for " << *cowin << dendl; + bool cowin_need_flush = false; + bool orig_need_flush = false; + auto it = client_need_snapflush.lower_bound(cowin->first); + while (it != client_need_snapflush.end() && it->first < in->first) { + ceph_assert(!it->second.empty()); + if (cowin->last >= it->first) { + cowin->auth_pin(this); + cowin_need_flush = true; + ++it; + } else { + it = client_need_snapflush.erase(it); + } + in->auth_unpin(this); + } + + if (it != client_need_snapflush.end() && it->first <= in->last) + orig_need_flush = true; + + return make_pair(cowin_need_flush, orig_need_flush); +} + +void CInode::mark_dirty_rstat() +{ + if (!state_test(STATE_DIRTYRSTAT)) { + dout(10) << __func__ << dendl; + state_set(STATE_DIRTYRSTAT); + get(PIN_DIRTYRSTAT); + CDentry *pdn = get_projected_parent_dn(); + if (pdn->is_auth()) { + CDir *pdir = pdn->dir; + pdir->dirty_rstat_inodes.push_back(&dirty_rstat_item); + mdcache->mds->locker->mark_updated_scatterlock(&pdir->inode->nestlock); + } else { + // under cross-MDS rename. + // DIRTYRSTAT flag will get cleared when rename finishes + ceph_assert(state_test(STATE_AMBIGUOUSAUTH)); + } + } +} +void CInode::clear_dirty_rstat() +{ + if (state_test(STATE_DIRTYRSTAT)) { + dout(10) << __func__ << dendl; + state_clear(STATE_DIRTYRSTAT); + put(PIN_DIRTYRSTAT); + dirty_rstat_item.remove_myself(); + } +} + +CInode::projected_inode CInode::project_inode(const MutationRef& mut, + bool xattr, bool snap) +{ + if (mut && mut->is_projected(this)) { + ceph_assert(!xattr && !snap); + auto _inode = std::const_pointer_cast<mempool_inode>(projected_nodes.back().inode); + return projected_inode(std::move(_inode), xattr_map_ptr()); + } + + auto pi = allocate_inode(*get_projected_inode()); + + if (scrub_infop && scrub_infop->last_scrub_dirty) { + pi->last_scrub_stamp = scrub_infop->last_scrub_stamp; + pi->last_scrub_version = scrub_infop->last_scrub_version; + scrub_infop->last_scrub_dirty = false; + scrub_maybe_delete_info(); + } + + const auto& ox = get_projected_xattrs(); + xattr_map_ptr px; + if (xattr) { + px = allocate_xattr_map(); + if (ox) + *px = *ox; + } + + sr_t* ps = projected_inode::UNDEF_SRNODE; + if (snap) { + ps = prepare_new_srnode(0); + ++num_projected_srnodes; + } + + projected_nodes.emplace_back(pi, xattr ? 
px : ox , ps); + if (mut) + mut->add_projected_node(this); + dout(15) << __func__ << " " << pi->ino << dendl; + return projected_inode(std::move(pi), std::move(px), ps); +} + +void CInode::pop_and_dirty_projected_inode(LogSegment *ls, const MutationRef& mut) +{ + ceph_assert(!projected_nodes.empty()); + auto front = std::move(projected_nodes.front()); + dout(15) << __func__ << " v" << front.inode->version << dendl; + + projected_nodes.pop_front(); + if (mut) + mut->remove_projected_node(this); + + bool pool_updated = get_inode()->layout.pool_id != front.inode->layout.pool_id; + bool pin_updated = (get_inode()->export_pin != front.inode->export_pin) || + (get_inode()->export_ephemeral_distributed_pin != + front.inode->export_ephemeral_distributed_pin); + + reset_inode(std::move(front.inode)); + if (front.xattrs != get_xattrs()) + reset_xattrs(std::move(front.xattrs)); + + if (front.snapnode != projected_inode::UNDEF_SRNODE) { + --num_projected_srnodes; + pop_projected_snaprealm(front.snapnode, false); + } + + mark_dirty(ls); + if (get_inode()->is_backtrace_updated()) + mark_dirty_parent(ls, pool_updated); + + if (pin_updated) + maybe_export_pin(true); +} + +sr_t *CInode::prepare_new_srnode(snapid_t snapid) +{ + const sr_t *cur_srnode = get_projected_srnode(); + sr_t *new_srnode; + + if (cur_srnode) { + new_srnode = new sr_t(*cur_srnode); + } else { + if (snapid == 0) + snapid = mdcache->get_global_snaprealm()->get_newest_seq(); + new_srnode = new sr_t(); + new_srnode->seq = snapid; + new_srnode->created = snapid; + new_srnode->current_parent_since = get_oldest_snap(); + SnapRealm *sr = find_snaprealm(); + dout(20) << __func__ << ": inheriting change_attr from " << *sr + << dendl; + new_srnode->change_attr = sr->srnode.change_attr; + } + return new_srnode; +} + +const sr_t *CInode::get_projected_srnode() const { + if (num_projected_srnodes > 0) { + for (auto it = projected_nodes.rbegin(); it != projected_nodes.rend(); ++it) + if (it->snapnode != projected_inode::UNDEF_SRNODE) + return it->snapnode; + } + if (snaprealm) + return &snaprealm->srnode; + else + return NULL; +} + +void CInode::project_snaprealm(sr_t *new_srnode) +{ + dout(10) << __func__ << " " << new_srnode << dendl; + ceph_assert(projected_nodes.back().snapnode == projected_inode::UNDEF_SRNODE); + projected_nodes.back().snapnode = new_srnode; + ++num_projected_srnodes; +} + +void CInode::mark_snaprealm_global(sr_t *new_srnode) +{ + ceph_assert(!is_dir()); + // 'last_destroyed' is no longer used, use it to store origin 'current_parent_since' + new_srnode->last_destroyed = new_srnode->current_parent_since; + new_srnode->current_parent_since = mdcache->get_global_snaprealm()->get_newest_seq() + 1; + new_srnode->mark_parent_global(); +} + +void CInode::clear_snaprealm_global(sr_t *new_srnode) +{ + // restore 'current_parent_since' + new_srnode->current_parent_since = new_srnode->last_destroyed; + new_srnode->last_destroyed = 0; + new_srnode->seq = mdcache->get_global_snaprealm()->get_newest_seq(); + new_srnode->clear_parent_global(); +} + +bool CInode::is_projected_snaprealm_global() const +{ + const sr_t *srnode = get_projected_srnode(); + if (srnode && srnode->is_parent_global()) + return true; + return false; +} + +void CInode::project_snaprealm_past_parent(SnapRealm *newparent) +{ + sr_t *new_snap = project_snaprealm(); + record_snaprealm_past_parent(new_snap, newparent); +} + + +/* if newparent != parent, add parent to past_parents + if parent DNE, we need to find what the parent actually is and fill that in */ +void 
CInode::record_snaprealm_past_parent(sr_t *new_snap, SnapRealm *newparent) +{ + ceph_assert(!new_snap->is_parent_global()); + SnapRealm *oldparent; + if (!snaprealm) { + oldparent = find_snaprealm(); + } else { + oldparent = snaprealm->parent; + } + + if (newparent != oldparent) { + snapid_t oldparentseq = oldparent->get_newest_seq(); + if (oldparentseq + 1 > new_snap->current_parent_since) { + // copy old parent's snaps + const set<snapid_t>& snaps = oldparent->get_snaps(); + auto p = snaps.lower_bound(new_snap->current_parent_since); + if (p != snaps.end()) + new_snap->past_parent_snaps.insert(p, snaps.end()); + if (oldparentseq > new_snap->seq) + new_snap->seq = oldparentseq; + } + new_snap->current_parent_since = mdcache->get_global_snaprealm()->get_newest_seq() + 1; + } +} + +void CInode::record_snaprealm_parent_dentry(sr_t *new_snap, SnapRealm *oldparent, + CDentry *dn, bool primary_dn) +{ + ceph_assert(new_snap->is_parent_global()); + + if (!oldparent) + oldparent = dn->get_dir()->inode->find_snaprealm(); + auto& snaps = oldparent->get_snaps(); + + if (!primary_dn) { + auto p = snaps.lower_bound(dn->first); + if (p != snaps.end()) + new_snap->past_parent_snaps.insert(p, snaps.end()); + } else { + // 'last_destroyed' is used as 'current_parent_since' + auto p = snaps.lower_bound(new_snap->last_destroyed); + if (p != snaps.end()) + new_snap->past_parent_snaps.insert(p, snaps.end()); + new_snap->last_destroyed = mdcache->get_global_snaprealm()->get_newest_seq() + 1; + } +} + +void CInode::early_pop_projected_snaprealm() +{ + ceph_assert(!projected_nodes.empty()); + if (projected_nodes.front().snapnode != projected_inode::UNDEF_SRNODE) { + pop_projected_snaprealm(projected_nodes.front().snapnode, true); + projected_nodes.front().snapnode = projected_inode::UNDEF_SRNODE; + --num_projected_srnodes; + } +} + +void CInode::pop_projected_snaprealm(sr_t *next_snaprealm, bool early) +{ + if (next_snaprealm) { + dout(10) << __func__ << (early ? " (early) " : " ") + << next_snaprealm << " seq " << next_snaprealm->seq << dendl; + if (!snaprealm) + open_snaprealm(); + + auto old_flags = snaprealm->srnode.flags; + snaprealm->srnode = *next_snaprealm; + delete next_snaprealm; + + if ((snaprealm->srnode.flags ^ old_flags) & sr_t::PARENT_GLOBAL) { + snaprealm->adjust_parent(); + } + + if (snaprealm->parent) + dout(10) << " realm " << *snaprealm << " parent " << *snaprealm->parent << dendl; + } else { + dout(10) << __func__ << (early ? " (early) null" : " null") << dendl; + ceph_assert(snaprealm); + snaprealm->merge_to(NULL); + } +} + + +// ====== CInode ======= + +// dirfrags + +InodeStoreBase::inode_const_ptr InodeStoreBase::empty_inode = InodeStoreBase::allocate_inode(); + +__u32 InodeStoreBase::hash_dentry_name(std::string_view dn) +{ + int which = inode->dir_layout.dl_dir_hash; + if (!which) + which = CEPH_STR_HASH_LINUX; + ceph_assert(ceph_str_hash_valid(which)); + return ceph_str_hash(which, dn.data(), dn.length()); +} + +frag_t InodeStoreBase::pick_dirfrag(std::string_view dn) +{ + if (dirfragtree.empty()) + return frag_t(); // avoid the string hash if we can. 
+ + __u32 h = hash_dentry_name(dn); + return dirfragtree[h]; +} + +std::pair<bool, std::vector<CDir*>> CInode::get_dirfrags_under(frag_t fg) +{ + std::pair<bool, std::vector<CDir*>> result; + auto& all = result.first; + auto& dirs = result.second; + all = false; + + if (auto it = dirfrags.find(fg); it != dirfrags.end()){ + all = true; + dirs.push_back(it->second); + return result; + } + + int total = 0; + for(auto &[_fg, _dir] : dirfrags){ + // frag_t.bits() can indicate the depth of the partition in the directory tree + // e.g. + // 01* : bit = 2, on the second floor + // * + // 0* 1* + // 00* 01* 10* 11* -- > level 2, bit = 2 + // so fragA.bits > fragB.bits means fragA is deeper than fragB + + if (fg.bits() >= _fg.bits()) { + if (_fg.contains(fg)) { + all = true; + return result; + } + } else { + if (fg.contains(_fg)) { + dirs.push_back(_dir); + // we can calculate how many sub slices a slice can be divided into + // frag_t(*) can be divided into two frags belonging to the first layer(0* 1*) + // or 2^2 frags belonging to the second layer(00* 01* 10* 11*) + // or (1 << (24 - frag_t(*).bits)) frags belonging to the 24th level + total += 1 << (24 - _fg.bits()); + } + } + } + + // we convert all the frags into the frags of 24th layer to calculate whether all the frags are included in the memory cache + all = ((1<<(24-fg.bits())) == total); + return result; +} + +void CInode::verify_dirfrags() +{ + bool bad = false; + for (const auto &p : dirfrags) { + if (!dirfragtree.is_leaf(p.first)) { + dout(0) << "have open dirfrag " << p.first << " but not leaf in " << dirfragtree + << ": " << *p.second << dendl; + bad = true; + } + } + ceph_assert(!bad); +} + +void CInode::force_dirfrags() +{ + bool bad = false; + for (auto &p : dirfrags) { + if (!dirfragtree.is_leaf(p.first)) { + dout(0) << "have open dirfrag " << p.first << " but not leaf in " << dirfragtree + << ": " << *p.second << dendl; + bad = true; + } + } + + if (bad) { + frag_vec_t leaves; + dirfragtree.get_leaves(leaves); + for (const auto& leaf : leaves) { + mdcache->get_force_dirfrag(dirfrag_t(ino(), leaf), true); + } + } + + verify_dirfrags(); +} + +CDir *CInode::get_approx_dirfrag(frag_t fg) +{ + CDir *dir = get_dirfrag(fg); + if (dir) return dir; + + // find a child? + auto&& p = get_dirfrags_under(fg); + if (!p.second.empty()) + return p.second.front(); + + // try parents? + while (fg.bits() > 0) { + fg = fg.parent(); + dir = get_dirfrag(fg); + if (dir) return dir; + } + return NULL; +} + +CDir *CInode::get_or_open_dirfrag(MDCache *mdcache, frag_t fg) +{ + ceph_assert(is_dir()); + + // have it? + CDir *dir = get_dirfrag(fg); + if (!dir) { + // create it. 
+ ceph_assert(is_auth() || mdcache->mds->is_any_replay()); + dir = new CDir(this, fg, mdcache, is_auth()); + add_dirfrag(dir); + } + return dir; +} + +CDir *CInode::add_dirfrag(CDir *dir) +{ + auto em = dirfrags.emplace(std::piecewise_construct, std::forward_as_tuple(dir->dirfrag().frag), std::forward_as_tuple(dir)); + ceph_assert(em.second); + + if (stickydir_ref > 0) { + dir->state_set(CDir::STATE_STICKY); + dir->get(CDir::PIN_STICKY); + } + + maybe_export_pin(); + + return dir; +} + +void CInode::close_dirfrag(frag_t fg) +{ + dout(14) << __func__ << " " << fg << dendl; + ceph_assert(dirfrags.count(fg)); + + CDir *dir = dirfrags[fg]; + dir->remove_null_dentries(); + + // clear dirty flag + if (dir->is_dirty()) + dir->mark_clean(); + + if (stickydir_ref > 0) { + dir->state_clear(CDir::STATE_STICKY); + dir->put(CDir::PIN_STICKY); + } + + if (dir->is_subtree_root()) + num_subtree_roots--; + + // dump any remaining dentries, for debugging purposes + for (const auto &p : dir->items) + dout(14) << __func__ << " LEFTOVER dn " << *p.second << dendl; + + ceph_assert(dir->get_num_ref() == 0); + delete dir; + dirfrags.erase(fg); +} + +void CInode::close_dirfrags() +{ + while (!dirfrags.empty()) + close_dirfrag(dirfrags.begin()->first); +} + +bool CInode::has_subtree_root_dirfrag(int auth) +{ + if (num_subtree_roots > 0) { + if (auth == -1) + return true; + for (const auto &p : dirfrags) { + if (p.second->is_subtree_root() && + p.second->dir_auth.first == auth) + return true; + } + } + return false; +} + +bool CInode::has_subtree_or_exporting_dirfrag() +{ + if (num_subtree_roots > 0 || num_exporting_dirs > 0) + return true; + return false; +} + +void CInode::get_stickydirs() +{ + if (stickydir_ref == 0) { + get(PIN_STICKYDIRS); + for (const auto &p : dirfrags) { + p.second->state_set(CDir::STATE_STICKY); + p.second->get(CDir::PIN_STICKY); + } + } + stickydir_ref++; +} + +void CInode::put_stickydirs() +{ + ceph_assert(stickydir_ref > 0); + stickydir_ref--; + if (stickydir_ref == 0) { + put(PIN_STICKYDIRS); + for (const auto &p : dirfrags) { + p.second->state_clear(CDir::STATE_STICKY); + p.second->put(CDir::PIN_STICKY); + } + } +} + + + + + +// pins + +void CInode::first_get() +{ + // pin my dentry? + if (parent) + parent->get(CDentry::PIN_INODEPIN); +} + +void CInode::last_put() +{ + // unpin my dentry? 
+ if (parent) + parent->put(CDentry::PIN_INODEPIN); +} + +void CInode::_put() +{ + if (get_num_ref() == (int)is_dirty() + (int)is_dirty_parent()) + mdcache->maybe_eval_stray(this, true); +} + +void CInode::add_remote_parent(CDentry *p) +{ + if (remote_parents.empty()) + get(PIN_REMOTEPARENT); + remote_parents.insert(p); +} +void CInode::remove_remote_parent(CDentry *p) +{ + remote_parents.erase(p); + if (remote_parents.empty()) + put(PIN_REMOTEPARENT); +} + + + + +CDir *CInode::get_parent_dir() +{ + if (parent) + return parent->dir; + return NULL; +} +CDir *CInode::get_projected_parent_dir() +{ + CDentry *p = get_projected_parent_dn(); + if (p) + return p->dir; + return NULL; +} +CInode *CInode::get_parent_inode() +{ + if (parent) + return parent->dir->inode; + return NULL; +} + +bool CInode::is_ancestor_of(const CInode *other) const +{ + while (other) { + if (other == this) + return true; + const CDentry *pdn = other->get_oldest_parent_dn(); + if (!pdn) { + ceph_assert(other->is_base()); + break; + } + other = pdn->get_dir()->get_inode(); + } + return false; +} + +bool CInode::is_projected_ancestor_of(const CInode *other) const +{ + while (other) { + if (other == this) + return true; + const CDentry *pdn = other->get_projected_parent_dn(); + if (!pdn) { + ceph_assert(other->is_base()); + break; + } + other = pdn->get_dir()->get_inode(); + } + return false; +} + +/* + * Because a non-directory inode may have multiple links, the use_parent + * argument allows selecting which parent to use for path construction. This + * argument is only meaningful for the final component (i.e. the first of the + * nested calls) because directories cannot have multiple hard links. If + * use_parent is NULL and projected is true, the primary parent's projected + * inode is used all the way up the path chain. Otherwise the primary parent + * stable inode is used. + */ +void CInode::make_path_string(string& s, bool projected, const CDentry *use_parent) const +{ + if (!use_parent) { + use_parent = projected ? get_projected_parent_dn() : parent; + } + + if (use_parent) { + use_parent->make_path_string(s, projected); + } else if (is_root()) { + s = ""; + } else if (is_mdsdir()) { + char t[40]; + uint64_t eino(ino()); + eino -= MDS_INO_MDSDIR_OFFSET; + snprintf(t, sizeof(t), "~mds%" PRId64, eino); + s = t; + } else { + char n[40]; + uint64_t eino(ino()); + snprintf(n, sizeof(n), "#%" PRIx64, eino); + s += n; + } +} + +void CInode::make_path(filepath& fp, bool projected) const +{ + const CDentry *use_parent = projected ? 
get_projected_parent_dn() : parent; + if (use_parent) { + ceph_assert(!is_base()); + use_parent->make_path(fp, projected); + } else { + fp = filepath(ino()); + } +} + +void CInode::name_stray_dentry(string& dname) +{ + char s[20]; + snprintf(s, sizeof(s), "%llx", (unsigned long long)ino().val); + dname = s; +} + +version_t CInode::pre_dirty() +{ + version_t pv; + CDentry* _cdentry = get_projected_parent_dn(); + if (_cdentry) { + pv = _cdentry->pre_dirty(get_projected_version()); + dout(10) << "pre_dirty " << pv << " (current v " << get_inode()->version << ")" << dendl; + } else { + ceph_assert(is_base()); + pv = get_projected_version() + 1; + } + // force update backtrace for old format inode (see mempool_inode::decode) + if (get_inode()->backtrace_version == 0 && !projected_nodes.empty()) { + auto pi = _get_projected_inode(); + if (pi->backtrace_version == 0) + pi->update_backtrace(pv); + } + return pv; +} + +void CInode::_mark_dirty(LogSegment *ls) +{ + if (!state_test(STATE_DIRTY)) { + state_set(STATE_DIRTY); + get(PIN_DIRTY); + ceph_assert(ls); + } + + // move myself to this segment's dirty list + if (ls) + ls->dirty_inodes.push_back(&item_dirty); +} + +void CInode::mark_dirty(LogSegment *ls) { + + dout(10) << __func__ << " " << *this << dendl; + + /* + NOTE: I may already be dirty, but this fn _still_ needs to be called so that + the directory is (perhaps newly) dirtied, and so that parent_dir_version is + updated below. + */ + + // only auth can get dirty. "dirty" async data in replicas is relative to + // filelock state, not the dirty flag. + ceph_assert(is_auth()); + + // touch my private version + _mark_dirty(ls); + + // mark dentry too + if (parent) + parent->mark_dirty(get_version(), ls); +} + + +void CInode::mark_clean() +{ + dout(10) << __func__ << " " << *this << dendl; + if (state_test(STATE_DIRTY)) { + state_clear(STATE_DIRTY); + put(PIN_DIRTY); + + // remove myself from ls dirty list + item_dirty.remove_myself(); + } +} + + +// -------------- +// per-inode storage +// (currently for root inode only) + +struct C_IO_Inode_Stored : public CInodeIOContext { + version_t version; + Context *fin; + C_IO_Inode_Stored(CInode *i, version_t v, Context *f) : CInodeIOContext(i), version(v), fin(f) {} + void finish(int r) override { + in->_stored(r, version, fin); + } + void print(ostream& out) const override { + out << "inode_store(" << in->ino() << ")"; + } +}; + +object_t InodeStoreBase::get_object_name(inodeno_t ino, frag_t fg, std::string_view suffix) +{ + char n[60]; + snprintf(n, sizeof(n), "%llx.%08llx", (long long unsigned)ino, (long long unsigned)fg); + ceph_assert(strlen(n) + suffix.size() < sizeof n); + strncat(n, suffix.data(), suffix.size()); + return object_t(n); +} + +void CInode::store(MDSContext *fin) +{ + dout(10) << __func__ << " " << get_version() << dendl; + ceph_assert(is_base()); + + if (snaprealm) + purge_stale_snap_data(snaprealm->get_snaps()); + + // encode + bufferlist bl; + string magic = CEPH_FS_ONDISK_MAGIC; + using ceph::encode; + encode(magic, bl); + encode_store(bl, mdcache->mds->mdsmap->get_up_features()); + + // write it. 
+ SnapContext snapc; + ObjectOperation m; + m.write_full(bl); + + object_t oid = CInode::get_object_name(ino(), frag_t(), ".inode"); + object_locator_t oloc(mdcache->mds->get_metadata_pool()); + + Context *newfin = + new C_OnFinisher(new C_IO_Inode_Stored(this, get_version(), fin), + mdcache->mds->finisher); + mdcache->mds->objecter->mutate(oid, oloc, m, snapc, + ceph::real_clock::now(), 0, + newfin); +} + +void CInode::_stored(int r, version_t v, Context *fin) +{ + if (r < 0) { + dout(1) << "store error " << r << " v " << v << " on " << *this << dendl; + mdcache->mds->clog->error() << "failed to store inode " << ino() + << " object: " << cpp_strerror(r); + mdcache->mds->handle_write_error(r); + fin->complete(r); + return; + } + + dout(10) << __func__ << " " << v << " on " << *this << dendl; + if (v == get_projected_version()) + mark_clean(); + + fin->complete(0); +} + +void CInode::flush(MDSContext *fin) +{ + dout(10) << __func__ << " " << *this << dendl; + ceph_assert(is_auth() && can_auth_pin()); + + MDSGatherBuilder gather(g_ceph_context); + + if (is_dirty_parent()) { + store_backtrace(gather.new_sub()); + } + if (is_dirty()) { + if (is_base()) { + store(gather.new_sub()); + } else { + parent->dir->commit(0, gather.new_sub()); + } + } + + if (gather.has_subs()) { + gather.set_finisher(fin); + gather.activate(); + } else { + fin->complete(0); + } +} + +struct C_IO_Inode_Fetched : public CInodeIOContext { + bufferlist bl, bl2; + Context *fin; + C_IO_Inode_Fetched(CInode *i, Context *f) : CInodeIOContext(i), fin(f) {} + void finish(int r) override { + // Ignore 'r', because we fetch from two places, so r is usually CEPHFS_ENOENT + in->_fetched(bl, bl2, fin); + } + void print(ostream& out) const override { + out << "inode_fetch(" << in->ino() << ")"; + } +}; + +void CInode::fetch(MDSContext *fin) +{ + dout(10) << __func__ << dendl; + + C_IO_Inode_Fetched *c = new C_IO_Inode_Fetched(this, fin); + C_GatherBuilder gather(g_ceph_context, new C_OnFinisher(c, mdcache->mds->finisher)); + + object_t oid = CInode::get_object_name(ino(), frag_t(), ""); + object_locator_t oloc(mdcache->mds->get_metadata_pool()); + + // Old on-disk format: inode stored in xattr of a dirfrag + ObjectOperation rd; + rd.getxattr("inode", &c->bl, NULL); + mdcache->mds->objecter->read(oid, oloc, rd, CEPH_NOSNAP, (bufferlist*)NULL, 0, gather.new_sub()); + + // Current on-disk format: inode stored in a .inode object + object_t oid2 = CInode::get_object_name(ino(), frag_t(), ".inode"); + mdcache->mds->objecter->read(oid2, oloc, 0, 0, CEPH_NOSNAP, &c->bl2, 0, gather.new_sub()); + + gather.activate(); +} + +void CInode::_fetched(bufferlist& bl, bufferlist& bl2, Context *fin) +{ + dout(10) << __func__ << " got " << bl.length() << " and " << bl2.length() << dendl; + bufferlist::const_iterator p; + if (bl2.length()) { + p = bl2.cbegin(); + } else if (bl.length()) { + p = bl.cbegin(); + } else { + derr << "No data while reading inode " << ino() << dendl; + fin->complete(-CEPHFS_ENOENT); + return; + } + + using ceph::decode; + // Attempt decode + try { + string magic; + decode(magic, p); + dout(10) << " magic is '" << magic << "' (expecting '" + << CEPH_FS_ONDISK_MAGIC << "')" << dendl; + if (magic != CEPH_FS_ONDISK_MAGIC) { + dout(0) << "on disk magic '" << magic << "' != my magic '" << CEPH_FS_ONDISK_MAGIC + << "'" << dendl; + fin->complete(-CEPHFS_EINVAL); + } else { + decode_store(p); + dout(10) << "_fetched " << *this << dendl; + fin->complete(0); + } + } catch (buffer::error &err) { + derr << "Corrupt inode " << ino() << ": " 
<< err.what() << dendl; + fin->complete(-CEPHFS_EINVAL); + return; + } +} + +void CInode::build_backtrace(int64_t pool, inode_backtrace_t& bt) +{ + bt.ino = ino(); + bt.ancestors.clear(); + bt.pool = pool; + + CInode *in = this; + CDentry *pdn = get_parent_dn(); + while (pdn) { + CInode *diri = pdn->get_dir()->get_inode(); + bt.ancestors.push_back(inode_backpointer_t(diri->ino(), pdn->get_name(), in->get_inode()->version)); + in = diri; + pdn = in->get_parent_dn(); + } + bt.old_pools.reserve(get_inode()->old_pools.size()); + for (auto &p : get_inode()->old_pools) { + // don't add our own pool id to old_pools to avoid looping (e.g. setlayout 0, 1, 0) + if (p != pool) + bt.old_pools.push_back(p); + } +} + +struct C_IO_Inode_StoredBacktrace : public CInodeIOContext { + version_t version; + Context *fin; + C_IO_Inode_StoredBacktrace(CInode *i, version_t v, Context *f) : CInodeIOContext(i), version(v), fin(f) {} + void finish(int r) override { + in->_stored_backtrace(r, version, fin); + } + void print(ostream& out) const override { + out << "backtrace_store(" << in->ino() << ")"; + } +}; + + +void CInode::_commit_ops(int r, C_GatherBuilder &gather_bld, + std::vector<CInodeCommitOperation> &ops_vec, + inode_backtrace_t &bt) +{ + dout(10) << __func__ << dendl; + + if (r < 0) { + mdcache->mds->handle_write_error_with_lock(r); + return; + } + + SnapContext snapc; + object_t oid = get_object_name(ino(), frag_t(), ""); + + for (auto &op : ops_vec) { + ObjectOperation obj_op; + object_locator_t oloc(op.get_pool()); + op.update(obj_op, bt); + mdcache->mds->objecter->mutate(oid, oloc, obj_op, snapc, + ceph::real_clock::now(), + 0, gather_bld.new_sub()); + } +} + +void CInode::_store_backtrace(std::vector<CInodeCommitOperation> &ops_vec, + inode_backtrace_t &bt, int op_prio) +{ + dout(10) << __func__ << " on " << *this << dendl; + ceph_assert(is_dirty_parent()); + + if (op_prio < 0) + op_prio = CEPH_MSG_PRIO_DEFAULT; + + auth_pin(this); + + const int64_t pool = get_backtrace_pool(); + build_backtrace(pool, bt); + + std::string_view slink = ""; + if (is_symlink() && mdcache->get_symlink_recovery()) { + slink = symlink; + } + + ops_vec.emplace_back(op_prio, pool, get_inode()->layout, + mdcache->mds->mdsmap->get_up_features(), slink); + + if (!state_test(STATE_DIRTYPOOL) || get_inode()->old_pools.empty()) { + dout(20) << __func__ << ": no dirtypool or no old pools" << dendl; + return; + } + + // In the case where DIRTYPOOL is set, we update all old pools backtraces + // such that anyone reading them will see the new pool ID in + // inode_backtrace_t::pool and go read everything else from there. 
+ for (const auto &p : get_inode()->old_pools) { + if (p == pool) + continue; + + dout(20) << __func__ << ": updating old pool " << p << dendl; + + ops_vec.emplace_back(op_prio, p); + } +} + +void CInode::store_backtrace(MDSContext *fin, int op_prio) +{ + std::vector<CInodeCommitOperation> ops_vec; + inode_backtrace_t bt; + auto version = get_inode()->backtrace_version; + + _store_backtrace(ops_vec, bt, op_prio); + + C_GatherBuilder gather(g_ceph_context, + new C_OnFinisher( + new C_IO_Inode_StoredBacktrace(this, version, fin), + mdcache->mds->finisher)); + _commit_ops(0, gather, ops_vec, bt); + ceph_assert(gather.has_subs()); + gather.activate(); +} + +void CInode::store_backtrace(CInodeCommitOperations &op, int op_prio) +{ + op.version = get_inode()->backtrace_version; + op.in = this; + + _store_backtrace(op.ops_vec, op.bt, op_prio); +} + +void CInode::_stored_backtrace(int r, version_t v, Context *fin) +{ + if (r == -CEPHFS_ENOENT) { + const int64_t pool = get_backtrace_pool(); + bool exists = mdcache->mds->objecter->with_osdmap( + [pool](const OSDMap &osd_map) { + return osd_map.have_pg_pool(pool); + }); + + // This CEPHFS_ENOENT is because the pool doesn't exist (the user deleted it + // out from under us), so the backtrace can never be written, so pretend + // to succeed so that the user can proceed to e.g. delete the file. + if (!exists) { + dout(4) << __func__ << " got CEPHFS_ENOENT: a data pool was deleted " + "beneath us!" << dendl; + r = 0; + } + } + + if (r < 0) { + dout(1) << "store backtrace error " << r << " v " << v << dendl; + mdcache->mds->clog->error() << "failed to store backtrace on ino " + << ino() << " object" + << ", pool " << get_backtrace_pool() + << ", errno " << r; + mdcache->mds->handle_write_error(r); + if (fin) + fin->complete(r); + return; + } + + dout(10) << __func__ << " v " << v << dendl; + + auth_unpin(this); + if (v == get_inode()->backtrace_version) + clear_dirty_parent(); + if (fin) + fin->complete(0); +} + +void CInode::fetch_backtrace(Context *fin, bufferlist *backtrace) +{ + mdcache->fetch_backtrace(ino(), get_backtrace_pool(), *backtrace, fin); +} + +void CInode::mark_dirty_parent(LogSegment *ls, bool dirty_pool) +{ + if (!state_test(STATE_DIRTYPARENT)) { + dout(10) << __func__ << dendl; + state_set(STATE_DIRTYPARENT); + get(PIN_DIRTYPARENT); + ceph_assert(ls); + } + if (dirty_pool) + state_set(STATE_DIRTYPOOL); + if (ls) + ls->dirty_parent_inodes.push_back(&item_dirty_parent); +} + +void CInode::clear_dirty_parent() +{ + if (state_test(STATE_DIRTYPARENT)) { + dout(10) << __func__ << dendl; + state_clear(STATE_DIRTYPARENT); + state_clear(STATE_DIRTYPOOL); + put(PIN_DIRTYPARENT); + item_dirty_parent.remove_myself(); + } +} + +void CInode::verify_diri_backtrace(bufferlist &bl, int err) +{ + if (is_base() || is_dirty_parent() || !is_auth()) + return; + + dout(10) << __func__ << dendl; + + if (err == 0) { + inode_backtrace_t backtrace; + using ceph::decode; + decode(backtrace, bl); + CDentry *pdn = get_parent_dn(); + if (backtrace.ancestors.empty() || + backtrace.ancestors[0].dname != pdn->get_name() || + backtrace.ancestors[0].dirino != pdn->get_dir()->ino()) + err = -CEPHFS_EINVAL; + } + + if (err) { + MDSRank *mds = mdcache->mds; + mds->clog->error() << "bad backtrace on directory inode " << ino(); + ceph_assert(!"bad backtrace" == (g_conf()->mds_verify_backtrace > 1)); + + mark_dirty_parent(mds->mdlog->get_current_segment(), false); + mds->mdlog->flush(); + } +} + +// ------------------ +// parent dir + + +void 
InodeStoreBase::encode_xattrs(bufferlist &bl) const { + using ceph::encode; + if (xattrs) + encode(*xattrs, bl); + else + encode((__u32)0, bl); +} + +void InodeStoreBase::decode_xattrs(bufferlist::const_iterator &p) { + using ceph::decode; + mempool_xattr_map tmp; + decode_noshare(tmp, p); + if (tmp.empty()) { + reset_xattrs(xattr_map_ptr()); + } else { + reset_xattrs(allocate_xattr_map(std::move(tmp))); + } +} + +void InodeStoreBase::encode_old_inodes(bufferlist &bl, uint64_t features) const { + using ceph::encode; + if (old_inodes) + encode(*old_inodes, bl, features); + else + encode((__u32)0, bl); +} + +void InodeStoreBase::decode_old_inodes(bufferlist::const_iterator &p) { + using ceph::decode; + mempool_old_inode_map tmp; + decode(tmp, p); + if (tmp.empty()) { + reset_old_inodes(old_inode_map_ptr()); + } else { + reset_old_inodes(allocate_old_inode_map(std::move(tmp))); + } +} + +void InodeStoreBase::encode_bare(bufferlist &bl, uint64_t features, + const bufferlist *snap_blob) const +{ + using ceph::encode; + encode(*inode, bl, features); + if (inode->is_symlink()) + encode(symlink, bl); + encode(dirfragtree, bl); + encode_xattrs(bl); + + if (snap_blob) + encode(*snap_blob, bl); + else + encode(bufferlist(), bl); + encode_old_inodes(bl, features); + encode(oldest_snap, bl); + encode(damage_flags, bl); +} + +void InodeStoreBase::encode(bufferlist &bl, uint64_t features, + const bufferlist *snap_blob) const +{ + ENCODE_START(6, 4, bl); + encode_bare(bl, features, snap_blob); + ENCODE_FINISH(bl); +} + +void CInode::encode_store(bufferlist& bl, uint64_t features) +{ + bufferlist snap_blob; + encode_snap_blob(snap_blob); + InodeStoreBase::encode(bl, mdcache->mds->mdsmap->get_up_features(), + &snap_blob); +} + +void InodeStoreBase::decode_bare(bufferlist::const_iterator &bl, + bufferlist& snap_blob, __u8 struct_v) +{ + using ceph::decode; + + auto _inode = allocate_inode(); + decode(*_inode, bl); + + if (_inode->is_symlink()) { + std::string tmp; + decode(tmp, bl); + symlink = std::string_view(tmp); + } + decode(dirfragtree, bl); + decode_xattrs(bl); + decode(snap_blob, bl); + + decode_old_inodes(bl); + if (struct_v == 2 && _inode->is_dir()) { + bool default_layout_exists; + decode(default_layout_exists, bl); + if (default_layout_exists) { + decode(struct_v, bl); // this was a default_file_layout + decode(_inode->layout, bl); // but we only care about the layout portion + } + } + + if (struct_v >= 5) { + // InodeStore is embedded in dentries without proper versioning, so + // we consume up to the end of the buffer + if (!bl.end()) { + decode(oldest_snap, bl); + } + + if (!bl.end()) { + decode(damage_flags, bl); + } + } + + reset_inode(std::move(_inode)); +} + + +void InodeStoreBase::decode(bufferlist::const_iterator &bl, bufferlist& snap_blob) +{ + DECODE_START_LEGACY_COMPAT_LEN(5, 4, 4, bl); + decode_bare(bl, snap_blob, struct_v); + DECODE_FINISH(bl); +} + +void CInode::decode_store(bufferlist::const_iterator& bl) +{ + bufferlist snap_blob; + InodeStoreBase::decode(bl, snap_blob); + decode_snap_blob(snap_blob); +} + +// ------------------ +// locking + +SimpleLock* CInode::get_lock(int type) +{ + switch (type) { + case CEPH_LOCK_IVERSION: return &versionlock; + case CEPH_LOCK_IFILE: return &filelock; + case CEPH_LOCK_IAUTH: return &authlock; + case CEPH_LOCK_ILINK: return &linklock; + case CEPH_LOCK_IDFT: return &dirfragtreelock; + case CEPH_LOCK_IXATTR: return &xattrlock; + case CEPH_LOCK_ISNAP: return &snaplock; + case CEPH_LOCK_INEST: return &nestlock; + case CEPH_LOCK_IFLOCK: return 
&flocklock; + case CEPH_LOCK_IPOLICY: return &policylock; + } + return 0; +} + +void CInode::set_object_info(MDSCacheObjectInfo &info) +{ + info.ino = ino(); + info.snapid = last; +} + +void CInode::encode_lock_iauth(bufferlist& bl) +{ + ENCODE_START(2, 1, bl); + encode(get_inode()->version, bl); + encode(get_inode()->ctime, bl); + encode(get_inode()->mode, bl); + encode(get_inode()->uid, bl); + encode(get_inode()->gid, bl); + encode(get_inode()->fscrypt_auth, bl); + ENCODE_FINISH(bl); +} + +void CInode::decode_lock_iauth(bufferlist::const_iterator& p) +{ + ceph_assert(!is_auth()); + auto _inode = allocate_inode(*get_inode()); + DECODE_START(2, p); + decode(_inode->version, p); + utime_t tm; + decode(tm, p); + if (_inode->ctime < tm) _inode->ctime = tm; + decode(_inode->mode, p); + decode(_inode->uid, p); + decode(_inode->gid, p); + if (struct_v >= 2) + decode(_inode->fscrypt_auth, p); + DECODE_FINISH(p); + reset_inode(std::move(_inode)); +} + +void CInode::encode_lock_ilink(bufferlist& bl) +{ + ENCODE_START(1, 1, bl); + encode(get_inode()->version, bl); + encode(get_inode()->ctime, bl); + encode(get_inode()->nlink, bl); + ENCODE_FINISH(bl); +} + +void CInode::decode_lock_ilink(bufferlist::const_iterator& p) +{ + ceph_assert(!is_auth()); + auto _inode = allocate_inode(*get_inode()); + DECODE_START(1, p); + decode(_inode->version, p); + utime_t tm; + decode(tm, p); + if (_inode->ctime < tm) _inode->ctime = tm; + decode(_inode->nlink, p); + DECODE_FINISH(p); + reset_inode(std::move(_inode)); +} + +void CInode::encode_lock_idft(bufferlist& bl) +{ + ENCODE_START(1, 1, bl); + if (is_auth()) { + encode(get_inode()->version, bl); + } else { + // treat flushing as dirty when rejoining cache + bool dirty = dirfragtreelock.is_dirty_or_flushing(); + encode(dirty, bl); + } + { + // encode the raw tree + encode(dirfragtree, bl); + + // also specify which frags are mine + set<frag_t> myfrags; + auto&& dfls = get_dirfrags(); + for (const auto& dir : dfls) { + if (dir->is_auth()) { + frag_t fg = dir->get_frag(); + myfrags.insert(fg); + } + } + encode(myfrags, bl); + } + ENCODE_FINISH(bl); +} + +void CInode::decode_lock_idft(bufferlist::const_iterator& p) +{ + inode_ptr _inode; + + DECODE_START(1, p); + if (is_auth()) { + bool replica_dirty; + decode(replica_dirty, p); + if (replica_dirty) { + dout(10) << __func__ << " setting dftlock dirty flag" << dendl; + dirfragtreelock.mark_dirty(); // ok bc we're auth and caller will handle + } + } else { + _inode = allocate_inode(*get_inode()); + decode(_inode->version, p); + } + { + fragtree_t temp; + decode(temp, p); + set<frag_t> authfrags; + decode(authfrags, p); + if (is_auth()) { + // auth. believe replica's auth frags only. + for (auto fg : authfrags) { + if (!dirfragtree.is_leaf(fg)) { + dout(10) << " forcing frag " << fg << " to leaf (split|merge)" << dendl; + dirfragtree.force_to_leaf(g_ceph_context, fg); + dirfragtreelock.mark_dirty(); // ok bc we're auth and caller will handle + } + } + } else { + // replica. 
take the new tree, BUT make sure any open + // dirfrags remain leaves (they may have split _after_ this + // dft was scattered, or we may still be be waiting on the + // notify from the auth) + dirfragtree.swap(temp); + for (const auto &p : dirfrags) { + if (!dirfragtree.is_leaf(p.first)) { + dout(10) << " forcing open dirfrag " << p.first << " to leaf (racing with split|merge)" << dendl; + dirfragtree.force_to_leaf(g_ceph_context, p.first); + } + if (p.second->is_auth()) + p.second->state_clear(CDir::STATE_DIRTYDFT); + } + } + if (g_conf()->mds_debug_frag) + verify_dirfrags(); + } + DECODE_FINISH(p); + + if (_inode) + reset_inode(std::move(_inode)); +} + +void CInode::encode_lock_ifile(bufferlist& bl) +{ + ENCODE_START(2, 1, bl); + if (is_auth()) { + encode(get_inode()->version, bl); + encode(get_inode()->ctime, bl); + encode(get_inode()->mtime, bl); + encode(get_inode()->atime, bl); + encode(get_inode()->time_warp_seq, bl); + if (!is_dir()) { + encode(get_inode()->layout, bl, mdcache->mds->mdsmap->get_up_features()); + encode(get_inode()->size, bl); + encode(get_inode()->truncate_seq, bl); + encode(get_inode()->truncate_size, bl); + encode(get_inode()->client_ranges, bl); + encode(get_inode()->inline_data, bl); + } + } else { + // treat flushing as dirty when rejoining cache + bool dirty = filelock.is_dirty_or_flushing(); + encode(dirty, bl); + } + dout(15) << __func__ << " inode.dirstat is " << get_inode()->dirstat << dendl; + encode(get_inode()->dirstat, bl); // only meaningful if i am auth. + bufferlist tmp; + __u32 n = 0; + for (const auto &p : dirfrags) { + frag_t fg = p.first; + CDir *dir = p.second; + if (is_auth() || dir->is_auth()) { + const auto& pf = dir->get_projected_fnode(); + dout(15) << fg << " " << *dir << dendl; + dout(20) << fg << " fragstat " << pf->fragstat << dendl; + dout(20) << fg << " accounted_fragstat " << pf->accounted_fragstat << dendl; + encode(fg, tmp); + encode(dir->first, tmp); + encode(pf->fragstat, tmp); + encode(pf->accounted_fragstat, tmp); + n++; + } + } + encode(n, bl); + bl.claim_append(tmp); + if (is_auth()) + encode(get_inode()->fscrypt_file, bl); + ENCODE_FINISH(bl); +} + +void CInode::decode_lock_ifile(bufferlist::const_iterator& p) +{ + inode_ptr _inode; + + DECODE_START(2, p); + if (!is_auth()) { + _inode = allocate_inode(*get_inode()); + + decode(_inode->version, p); + utime_t tm; + decode(tm, p); + if (_inode->ctime < tm) _inode->ctime = tm; + decode(_inode->mtime, p); + decode(_inode->atime, p); + decode(_inode->time_warp_seq, p); + if (!is_dir()) { + decode(_inode->layout, p); + decode(_inode->size, p); + decode(_inode->truncate_seq, p); + decode(_inode->truncate_size, p); + decode(_inode->client_ranges, p); + decode(_inode->inline_data, p); + } + } else { + bool replica_dirty; + decode(replica_dirty, p); + if (replica_dirty) { + dout(10) << __func__ << " setting filelock dirty flag" << dendl; + filelock.mark_dirty(); // ok bc we're auth and caller will handle + } + } + + frag_info_t dirstat; + decode(dirstat, p); + if (!is_auth()) { + dout(10) << " taking inode dirstat " << dirstat << " for " << *this << dendl; + _inode->dirstat = dirstat; // take inode summation if replica + } + __u32 n; + decode(n, p); + dout(10) << " ...got " << n << " fragstats on " << *this << dendl; + while (n--) { + frag_t fg; + snapid_t fgfirst; + frag_info_t fragstat; + frag_info_t accounted_fragstat; + decode(fg, p); + decode(fgfirst, p); + decode(fragstat, p); + decode(accounted_fragstat, p); + dout(10) << fg << " [" << fgfirst << ",head] " << dendl; + 
dout(10) << fg << " fragstat " << fragstat << dendl; + dout(20) << fg << " accounted_fragstat " << accounted_fragstat << dendl; + + CDir *dir = get_dirfrag(fg); + if (is_auth()) { + ceph_assert(dir); // i am auth; i had better have this dir open + dout(10) << fg << " first " << dir->first << " -> " << fgfirst + << " on " << *dir << dendl; + dir->first = fgfirst; + auto _fnode = CDir::allocate_fnode(*dir->get_fnode()); + _fnode->fragstat = fragstat; + _fnode->accounted_fragstat = accounted_fragstat; + dir->reset_fnode(std::move(_fnode)); + if (!(fragstat == accounted_fragstat)) { + dout(10) << fg << " setting filelock updated flag" << dendl; + filelock.mark_dirty(); // ok bc we're auth and caller will handle + } + } else { + if (dir && dir->is_auth()) { + dout(10) << fg << " first " << dir->first << " -> " << fgfirst + << " on " << *dir << dendl; + dir->first = fgfirst; + const auto& pf = dir->get_projected_fnode(); + finish_scatter_update(&filelock, dir, + _inode->dirstat.version, pf->accounted_fragstat.version); + } + } + } + if (!is_auth() && struct_v >= 2) + decode(_inode->fscrypt_file, p); + DECODE_FINISH(p); + + if (_inode) + reset_inode(std::move(_inode)); +} + +void CInode::encode_lock_inest(bufferlist& bl) +{ + ENCODE_START(1, 1, bl); + if (is_auth()) { + encode(get_inode()->version, bl); + } else { + // treat flushing as dirty when rejoining cache + bool dirty = nestlock.is_dirty_or_flushing(); + encode(dirty, bl); + } + dout(15) << __func__ << " inode.rstat is " << get_inode()->rstat << dendl; + encode(get_inode()->rstat, bl); // only meaningful if i am auth. + bufferlist tmp; + __u32 n = 0; + for (const auto &p : dirfrags) { + frag_t fg = p.first; + CDir *dir = p.second; + if (is_auth() || dir->is_auth()) { + const auto& pf = dir->get_projected_fnode(); + dout(10) << __func__ << " " << fg << " dir " << *dir << dendl; + dout(10) << __func__ << " " << fg << " rstat " << pf->rstat << dendl; + dout(10) << __func__ << " " << fg << " accounted_rstat " << pf->rstat << dendl; + dout(10) << __func__ << " " << fg << " dirty_old_rstat " << dir->dirty_old_rstat << dendl; + encode(fg, tmp); + encode(dir->first, tmp); + encode(pf->rstat, tmp); + encode(pf->accounted_rstat, tmp); + encode(dir->dirty_old_rstat, tmp); + n++; + } + } + encode(n, bl); + bl.claim_append(tmp); + ENCODE_FINISH(bl); +} + +void CInode::decode_lock_inest(bufferlist::const_iterator& p) +{ + inode_ptr _inode; + + DECODE_START(1, p); + if (is_auth()) { + bool replica_dirty; + decode(replica_dirty, p); + if (replica_dirty) { + dout(10) << __func__ << " setting nestlock dirty flag" << dendl; + nestlock.mark_dirty(); // ok bc we're auth and caller will handle + } + } else { + _inode = allocate_inode(*get_inode()); + decode(_inode->version, p); + } + nest_info_t rstat; + decode(rstat, p); + if (!is_auth()) { + dout(10) << __func__ << " taking inode rstat " << rstat << " for " << *this << dendl; + _inode->rstat = rstat; // take inode summation if replica + } + __u32 n; + decode(n, p); + while (n--) { + frag_t fg; + snapid_t fgfirst; + nest_info_t rstat; + nest_info_t accounted_rstat; + decltype(CDir::dirty_old_rstat) dirty_old_rstat; + decode(fg, p); + decode(fgfirst, p); + decode(rstat, p); + decode(accounted_rstat, p); + decode(dirty_old_rstat, p); + dout(10) << __func__ << " " << fg << " [" << fgfirst << ",head]" << dendl; + dout(10) << __func__ << " " << fg << " rstat " << rstat << dendl; + dout(10) << __func__ << " " << fg << " accounted_rstat " << accounted_rstat << dendl; + dout(10) << __func__ << " " << fg << " 
dirty_old_rstat " << dirty_old_rstat << dendl; + CDir *dir = get_dirfrag(fg); + if (is_auth()) { + ceph_assert(dir); // i am auth; i had better have this dir open + dout(10) << fg << " first " << dir->first << " -> " << fgfirst + << " on " << *dir << dendl; + dir->first = fgfirst; + auto _fnode = CDir::allocate_fnode(*dir->get_fnode()); + _fnode->rstat = rstat; + _fnode->accounted_rstat = accounted_rstat; + dir->reset_fnode(std::move(_fnode)); + dir->dirty_old_rstat.swap(dirty_old_rstat); + if (!(rstat == accounted_rstat) || !dir->dirty_old_rstat.empty()) { + dout(10) << fg << " setting nestlock updated flag" << dendl; + nestlock.mark_dirty(); // ok bc we're auth and caller will handle + } + } else { + if (dir && dir->is_auth()) { + dout(10) << fg << " first " << dir->first << " -> " << fgfirst + << " on " << *dir << dendl; + dir->first = fgfirst; + const auto& pf = dir->get_projected_fnode(); + finish_scatter_update(&nestlock, dir, + _inode->rstat.version, pf->accounted_rstat.version); + } + } + } + DECODE_FINISH(p); + + if (_inode) + reset_inode(std::move(_inode)); +} + +void CInode::encode_lock_ixattr(bufferlist& bl) +{ + ENCODE_START(2, 1, bl); + encode(get_inode()->version, bl); + encode(get_inode()->ctime, bl); + encode_xattrs(bl); + encode(get_inode()->xattr_version, bl); + ENCODE_FINISH(bl); +} + +void CInode::decode_lock_ixattr(bufferlist::const_iterator& p) +{ + ceph_assert(!is_auth()); + auto _inode = allocate_inode(*get_inode()); + DECODE_START(2, p); + decode(_inode->version, p); + utime_t tm; + decode(tm, p); + if (_inode->ctime < tm) + _inode->ctime = tm; + decode_xattrs(p); + if (struct_v >= 2) { + decode(_inode->xattr_version, p); + } + DECODE_FINISH(p); + reset_inode(std::move(_inode)); +} + +void CInode::encode_lock_isnap(bufferlist& bl) +{ + ENCODE_START(1, 1, bl); + encode(get_inode()->version, bl); + encode(get_inode()->ctime, bl); + encode_snap(bl); + ENCODE_FINISH(bl); +} + +void CInode::decode_lock_isnap(bufferlist::const_iterator& p) +{ + ceph_assert(!is_auth()); + auto _inode = allocate_inode(*get_inode()); + DECODE_START(1, p); + decode(_inode->version, p); + utime_t tm; + decode(tm, p); + if (_inode->ctime < tm) _inode->ctime = tm; + decode_snap(p); + DECODE_FINISH(p); + reset_inode(std::move(_inode)); +} + +void CInode::encode_lock_iflock(bufferlist& bl) +{ + ENCODE_START(1, 1, bl); + encode(get_inode()->version, bl); + _encode_file_locks(bl); + ENCODE_FINISH(bl); +} + +void CInode::decode_lock_iflock(bufferlist::const_iterator& p) +{ + ceph_assert(!is_auth()); + auto _inode = allocate_inode(*get_inode()); + DECODE_START(1, p); + decode(_inode->version, p); + _decode_file_locks(p); + DECODE_FINISH(p); + reset_inode(std::move(_inode)); +} + +void CInode::encode_lock_ipolicy(bufferlist& bl) +{ + ENCODE_START(2, 1, bl); + if (is_dir()) { + encode(get_inode()->version, bl); + encode(get_inode()->ctime, bl); + encode(get_inode()->layout, bl, mdcache->mds->mdsmap->get_up_features()); + encode(get_inode()->quota, bl); + encode(get_inode()->export_pin, bl); + encode(get_inode()->export_ephemeral_distributed_pin, bl); + encode(get_inode()->export_ephemeral_random_pin, bl); + } + ENCODE_FINISH(bl); +} + +void CInode::decode_lock_ipolicy(bufferlist::const_iterator& p) +{ + ceph_assert(!is_auth()); + auto _inode = allocate_inode(*get_inode()); + DECODE_START(1, p); + if (is_dir()) { + decode(_inode->version, p); + utime_t tm; + decode(tm, p); + if (_inode->ctime < tm) + _inode->ctime = tm; + decode(_inode->layout, p); + decode(_inode->quota, p); + 
decode(_inode->export_pin, p); + if (struct_v >= 2) { + decode(_inode->export_ephemeral_distributed_pin, p); + decode(_inode->export_ephemeral_random_pin, p); + } + } + DECODE_FINISH(p); + + bool pin_updated = (get_inode()->export_pin != _inode->export_pin) || + (get_inode()->export_ephemeral_distributed_pin != + _inode->export_ephemeral_distributed_pin); + reset_inode(std::move(_inode)); + maybe_export_pin(pin_updated); +} + +void CInode::encode_lock_state(int type, bufferlist& bl) +{ + ENCODE_START(1, 1, bl); + encode(first, bl); + if (!is_base()) + encode(parent->first, bl); + + switch (type) { + case CEPH_LOCK_IAUTH: + encode_lock_iauth(bl); + break; + + case CEPH_LOCK_ILINK: + encode_lock_ilink(bl); + break; + + case CEPH_LOCK_IDFT: + encode_lock_idft(bl); + break; + + case CEPH_LOCK_IFILE: + encode_lock_ifile(bl); + break; + + case CEPH_LOCK_INEST: + encode_lock_inest(bl); + break; + + case CEPH_LOCK_IXATTR: + encode_lock_ixattr(bl); + break; + + case CEPH_LOCK_ISNAP: + encode_lock_isnap(bl); + break; + + case CEPH_LOCK_IFLOCK: + encode_lock_iflock(bl); + break; + + case CEPH_LOCK_IPOLICY: + encode_lock_ipolicy(bl); + break; + + default: + ceph_abort(); + } + ENCODE_FINISH(bl); +} + +/* for more info on scatterlocks, see comments by Locker::scatter_writebehind */ + +void CInode::decode_lock_state(int type, const bufferlist& bl) +{ + auto p = bl.cbegin(); + + DECODE_START(1, p); + + snapid_t newfirst; + using ceph::decode; + decode(newfirst, p); + if (!is_auth() && newfirst != first) { + dout(10) << __func__ << " first " << first << " -> " << newfirst << dendl; + first = newfirst; + } + if (!is_base()) { + decode(newfirst, p); + if (!parent->is_auth() && newfirst != parent->first) { + dout(10) << __func__ << " parent first " << first << " -> " << newfirst << dendl; + parent->first = newfirst; + } + } + + switch (type) { + case CEPH_LOCK_IAUTH: + decode_lock_iauth(p); + break; + + case CEPH_LOCK_ILINK: + decode_lock_ilink(p); + break; + + case CEPH_LOCK_IDFT: + decode_lock_idft(p); + break; + + case CEPH_LOCK_IFILE: + decode_lock_ifile(p); + break; + + case CEPH_LOCK_INEST: + decode_lock_inest(p); + break; + + case CEPH_LOCK_IXATTR: + decode_lock_ixattr(p); + break; + + case CEPH_LOCK_ISNAP: + decode_lock_isnap(p); + break; + + case CEPH_LOCK_IFLOCK: + decode_lock_iflock(p); + break; + + case CEPH_LOCK_IPOLICY: + decode_lock_ipolicy(p); + break; + + default: + ceph_abort(); + } + DECODE_FINISH(p); +} + + +bool CInode::is_dirty_scattered() +{ + return + filelock.is_dirty_or_flushing() || + nestlock.is_dirty_or_flushing() || + dirfragtreelock.is_dirty_or_flushing(); +} + +void CInode::clear_scatter_dirty() +{ + filelock.remove_dirty(); + nestlock.remove_dirty(); + dirfragtreelock.remove_dirty(); +} + +void CInode::clear_dirty_scattered(int type) +{ + dout(10) << __func__ << " " << type << " on " << *this << dendl; + ceph_assert(is_dir()); + switch (type) { + case CEPH_LOCK_IFILE: + item_dirty_dirfrag_dir.remove_myself(); + break; + + case CEPH_LOCK_INEST: + item_dirty_dirfrag_nest.remove_myself(); + break; + + case CEPH_LOCK_IDFT: + item_dirty_dirfrag_dirfragtree.remove_myself(); + break; + + default: + ceph_abort(); + } +} + + +/* + * when we initially scatter a lock, we need to check if any of the dirfrags + * have out of date accounted_rstat/fragstat. if so, mark the lock stale. 
+ */ +/* for more info on scatterlocks, see comments by Locker::scatter_writebehind */ +void CInode::start_scatter(ScatterLock *lock) +{ + dout(10) << __func__ << " " << *lock << " on " << *this << dendl; + ceph_assert(is_auth()); + const auto& pi = get_projected_inode(); + + for (const auto &p : dirfrags) { + frag_t fg = p.first; + CDir *dir = p.second; + const auto& pf = dir->get_projected_fnode(); + dout(20) << fg << " " << *dir << dendl; + + if (!dir->is_auth()) + continue; + + switch (lock->get_type()) { + case CEPH_LOCK_IFILE: + finish_scatter_update(lock, dir, pi->dirstat.version, pf->accounted_fragstat.version); + break; + + case CEPH_LOCK_INEST: + finish_scatter_update(lock, dir, pi->rstat.version, pf->accounted_rstat.version); + break; + + case CEPH_LOCK_IDFT: + dir->state_clear(CDir::STATE_DIRTYDFT); + break; + } + } +} + + +class C_Inode_FragUpdate : public MDSLogContextBase { +protected: + CInode *in; + CDir *dir; + MutationRef mut; + MDSRank *get_mds() override {return in->mdcache->mds;} + void finish(int r) override { + in->_finish_frag_update(dir, mut); + } + +public: + C_Inode_FragUpdate(CInode *i, CDir *d, MutationRef& m) : in(i), dir(d), mut(m) {} +}; + +void CInode::finish_scatter_update(ScatterLock *lock, CDir *dir, + version_t inode_version, version_t dir_accounted_version) +{ + frag_t fg = dir->get_frag(); + ceph_assert(dir->is_auth()); + + if (dir->is_frozen()) { + dout(10) << __func__ << " " << fg << " frozen, marking " << *lock << " stale " << *dir << dendl; + } else if (dir->get_version() == 0) { + dout(10) << __func__ << " " << fg << " not loaded, marking " << *lock << " stale " << *dir << dendl; + } else { + if (dir_accounted_version != inode_version) { + dout(10) << __func__ << " " << fg << " journaling accounted scatterstat update v" << inode_version << dendl; + + MDLog *mdlog = mdcache->mds->mdlog; + MutationRef mut(new MutationImpl()); + mut->ls = mdlog->get_current_segment(); + + auto pf = dir->project_fnode(mut); + + std::string_view ename; + switch (lock->get_type()) { + case CEPH_LOCK_IFILE: + pf->fragstat.version = inode_version; + pf->accounted_fragstat = pf->fragstat; + ename = "lock ifile accounted scatter stat update"; + break; + case CEPH_LOCK_INEST: + pf->rstat.version = inode_version; + pf->accounted_rstat = pf->rstat; + ename = "lock inest accounted scatter stat update"; + + if (!is_auth() && lock->get_state() == LOCK_MIX) { + dout(10) << __func__ << " try to assimilate dirty rstat on " + << *dir << dendl; + dir->assimilate_dirty_rstat_inodes(mut); + } + + break; + default: + ceph_abort(); + } + + EUpdate *le = new EUpdate(mdlog, ename); + mdlog->start_entry(le); + le->metablob.add_dir_context(dir); + le->metablob.add_dir(dir, true); + + ceph_assert(!dir->is_frozen()); + mut->auth_pin(dir); + + if (lock->get_type() == CEPH_LOCK_INEST && + !is_auth() && lock->get_state() == LOCK_MIX) { + dout(10) << __func__ << " finish assimilating dirty rstat on " + << *dir << dendl; + dir->assimilate_dirty_rstat_inodes_finish(&le->metablob); + + if (!(pf->rstat == pf->accounted_rstat)) { + if (!mut->is_wrlocked(&nestlock)) { + mdcache->mds->locker->wrlock_force(&nestlock, mut); + } + + mdcache->mds->locker->mark_updated_scatterlock(&nestlock); + mut->ls->dirty_dirfrag_nest.push_back(&item_dirty_dirfrag_nest); + } + } + + pf->version = dir->pre_dirty(); + + mdlog->submit_entry(le, new C_Inode_FragUpdate(this, dir, mut)); + } else { + dout(10) << __func__ << " " << fg << " accounted " << *lock + << " scatter stat unchanged at v" << dir_accounted_version << 
dendl; + } + } +} + +void CInode::_finish_frag_update(CDir *dir, MutationRef& mut) +{ + dout(10) << __func__ << " on " << *dir << dendl; + mut->apply(); + mdcache->mds->locker->drop_locks(mut.get()); + mut->cleanup(); +} + + +/* + * when we gather a lock, we need to assimilate dirfrag changes into the inode + * state. it's possible we can't update the dirfrag accounted_rstat/fragstat + * because the frag is auth and frozen, or that the replica couldn't for the same + * reason. hopefully it will get updated the next time the lock cycles. + * + * we have two dimensions of behavior: + * - we may be (auth and !frozen), and able to update, or not. + * - the frag may be stale, or not. + * + * if the frag is non-stale, we want to assimilate the diff into the + * inode, regardless of whether it's auth or updateable. + * + * if we update the frag, we want to set accounted_fragstat = frag, + * both if we took the diff or it was stale and we are making it + * un-stale. + */ +/* for more info on scatterlocks, see comments by Locker::scatter_writebehind */ +void CInode::finish_scatter_gather_update(int type, MutationRef& mut) +{ + LogChannelRef clog = mdcache->mds->clog; + + dout(10) << __func__ << " " << type << " on " << *this << dendl; + ceph_assert(is_auth()); + + switch (type) { + case CEPH_LOCK_IFILE: + { + fragtree_t tmpdft = dirfragtree; + struct frag_info_t dirstat; + bool dirstat_valid = true; + + // adjust summation + ceph_assert(is_auth()); + auto pi = _get_projected_inode(); + + bool touched_mtime = false, touched_chattr = false; + dout(20) << " orig dirstat " << pi->dirstat << dendl; + pi->dirstat.version++; + for (const auto &p : dirfrags) { + frag_t fg = p.first; + CDir *dir = p.second; + dout(20) << fg << " " << *dir << dendl; + + bool update; + if (dir->get_version() != 0) { + update = dir->is_auth() && !dir->is_frozen(); + } else { + update = false; + dirstat_valid = false; + } + + CDir::fnode_const_ptr pf; + if (update) { + mut->auth_pin(dir); + pf = dir->project_fnode(mut); + } else { + pf = dir->get_projected_fnode(); + } + + if (pf->accounted_fragstat.version == pi->dirstat.version - 1) { + dout(20) << fg << " fragstat " << pf->fragstat << dendl; + dout(20) << fg << " accounted_fragstat " << pf->accounted_fragstat << dendl; + pi->dirstat.add_delta(pf->fragstat, pf->accounted_fragstat, &touched_mtime, &touched_chattr); + } else { + dout(20) << fg << " skipping STALE accounted_fragstat " << pf->accounted_fragstat << dendl; + } + + if (pf->fragstat.nfiles < 0 || + pf->fragstat.nsubdirs < 0) { + clog->error() << "bad/negative dir size on " + << dir->dirfrag() << " " << pf->fragstat; + ceph_assert(!"bad/negative fragstat" == g_conf()->mds_verify_scatter); + + auto _pf = const_cast<fnode_t*>(pf.get()); + if (pf->fragstat.nfiles < 0) + _pf->fragstat.nfiles = 0; + if (pf->fragstat.nsubdirs < 0) + _pf->fragstat.nsubdirs = 0; + } + + if (update) { + auto _pf = const_cast<fnode_t*>(pf.get()); + _pf->accounted_fragstat = _pf->fragstat; + _pf->fragstat.version = _pf->accounted_fragstat.version = pi->dirstat.version; + _pf->version = dir->pre_dirty(); + dout(10) << fg << " updated accounted_fragstat " << pf->fragstat << " on " << *dir << dendl; + } + + tmpdft.force_to_leaf(g_ceph_context, fg); + dirstat.add(pf->fragstat); + } + if (touched_mtime) + pi->mtime = pi->ctime = pi->dirstat.mtime; + if (touched_chattr) + pi->change_attr++; + + dout(20) << " final dirstat " << pi->dirstat << dendl; + + if (dirstat_valid && !dirstat.same_sums(pi->dirstat)) { + frag_vec_t leaves; + 
tmpdft.get_leaves_under(frag_t(), leaves); + for (const auto& leaf : leaves) { + if (!dirfrags.count(leaf)) { + dirstat_valid = false; + break; + } + } + if (dirstat_valid) { + if (state_test(CInode::STATE_REPAIRSTATS)) { + dout(20) << " dirstat mismatch, fixing" << dendl; + } else { + clog->error() << "unmatched fragstat on " << ino() << ", inode has " + << pi->dirstat << ", dirfrags have " << dirstat; + ceph_assert(!"unmatched fragstat" == g_conf()->mds_verify_scatter); + } + // trust the dirfrags for now + version_t v = pi->dirstat.version; + if (pi->dirstat.mtime > dirstat.mtime) + dirstat.mtime = pi->dirstat.mtime; + if (pi->dirstat.change_attr > dirstat.change_attr) + dirstat.change_attr = pi->dirstat.change_attr; + pi->dirstat = dirstat; + pi->dirstat.version = v; + } + } + + if (pi->dirstat.nfiles < 0 || pi->dirstat.nsubdirs < 0) { + std::string path; + make_path_string(path); + clog->error() << "Inconsistent statistics detected: fragstat on inode " + << ino() << " (" << path << "), inode has " << pi->dirstat; + ceph_assert(!"bad/negative fragstat" == g_conf()->mds_verify_scatter); + + if (pi->dirstat.nfiles < 0) + pi->dirstat.nfiles = 0; + if (pi->dirstat.nsubdirs < 0) + pi->dirstat.nsubdirs = 0; + } + } + break; + + case CEPH_LOCK_INEST: + { + // adjust summation + ceph_assert(is_auth()); + + fragtree_t tmpdft = dirfragtree; + nest_info_t rstat; + bool rstat_valid = true; + + rstat.rsubdirs = 1; + if (const sr_t *srnode = get_projected_srnode(); srnode) + rstat.rsnaps = srnode->snaps.size(); + + auto pi = _get_projected_inode(); + dout(20) << " orig rstat " << pi->rstat << dendl; + pi->rstat.version++; + for (const auto &p : dirfrags) { + frag_t fg = p.first; + CDir *dir = p.second; + dout(20) << fg << " " << *dir << dendl; + + bool update; + if (dir->get_version() != 0) { + update = dir->is_auth() && !dir->is_frozen(); + } else { + update = false; + rstat_valid = false; + } + + CDir::fnode_const_ptr pf; + if (update) { + mut->auth_pin(dir); + pf = dir->project_fnode(mut); + } else { + pf = dir->get_projected_fnode(); + } + + if (pf->accounted_rstat.version == pi->rstat.version-1) { + // only pull this frag's dirty rstat inodes into the frag if + // the frag is non-stale and updateable. if it's stale, + // that info will just get thrown out! 
+ if (update) + dir->assimilate_dirty_rstat_inodes(mut); + + dout(20) << fg << " rstat " << pf->rstat << dendl; + dout(20) << fg << " accounted_rstat " << pf->accounted_rstat << dendl; + dout(20) << fg << " dirty_old_rstat " << dir->dirty_old_rstat << dendl; + mdcache->project_rstat_frag_to_inode(pf->rstat, pf->accounted_rstat, + dir->first, CEPH_NOSNAP, this, true); + for (auto &p : dir->dirty_old_rstat) { + mdcache->project_rstat_frag_to_inode(p.second.rstat, p.second.accounted_rstat, + p.second.first, p.first, this, true); + } + if (update) // dir contents not valid if frozen or non-auth + dir->check_rstats(); + } else { + dout(20) << fg << " skipping STALE accounted_rstat " << pf->accounted_rstat << dendl; + } + if (update) { + auto _pf = const_cast<fnode_t*>(pf.get()); + _pf->accounted_rstat = pf->rstat; + _pf->rstat.version = _pf->accounted_rstat.version = pi->rstat.version; + _pf->version = dir->pre_dirty(); + dir->dirty_old_rstat.clear(); + dir->check_rstats(); + dout(10) << fg << " updated accounted_rstat " << pf->rstat << " on " << *dir << dendl; + } + + tmpdft.force_to_leaf(g_ceph_context, fg); + rstat.add(pf->rstat); + } + dout(20) << " final rstat " << pi->rstat << dendl; + + if (rstat_valid && !rstat.same_sums(pi->rstat)) { + frag_vec_t leaves; + tmpdft.get_leaves_under(frag_t(), leaves); + for (const auto& leaf : leaves) { + if (!dirfrags.count(leaf)) { + rstat_valid = false; + break; + } + } + if (rstat_valid) { + if (state_test(CInode::STATE_REPAIRSTATS)) { + dout(20) << " rstat mismatch, fixing" << dendl; + } else { + clog->error() << "inconsistent rstat on inode " << ino() + << ", inode has " << pi->rstat + << ", directory fragments have " << rstat; + ceph_assert(!"unmatched rstat" == g_conf()->mds_verify_scatter); + } + // trust the dirfrag for now + version_t v = pi->rstat.version; + if (pi->rstat.rctime > rstat.rctime) + rstat.rctime = pi->rstat.rctime; + pi->rstat = rstat; + pi->rstat.version = v; + } + } + + mdcache->broadcast_quota_to_client(this); + } + break; + + case CEPH_LOCK_IDFT: + break; + + default: + ceph_abort(); + } +} + +void CInode::finish_scatter_gather_update_accounted(int type, EMetaBlob *metablob) +{ + dout(10) << __func__ << " " << type << " on " << *this << dendl; + ceph_assert(is_auth()); + + for (const auto &p : dirfrags) { + CDir *dir = p.second; + if (!dir->is_auth() || dir->get_version() == 0 || dir->is_frozen()) + continue; + + if (type == CEPH_LOCK_IDFT) + continue; // nothing to do. 
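+ // for IFILE/INEST, finish any rstat assimilation and journal the dirfrag so the projected accounted_* values reach the log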
+ + if (type == CEPH_LOCK_INEST) + dir->assimilate_dirty_rstat_inodes_finish(metablob); + + dout(10) << " journaling updated frag accounted_ on " << *dir << dendl; + ceph_assert(dir->is_projected()); + metablob->add_dir(dir, true); + } +} + +// waiting + +bool CInode::is_frozen() const +{ + if (is_frozen_inode()) return true; + if (parent && parent->dir->is_frozen()) return true; + return false; +} + +bool CInode::is_frozen_dir() const +{ + if (parent && parent->dir->is_frozen_dir()) return true; + return false; +} + +bool CInode::is_freezing() const +{ + if (is_freezing_inode()) return true; + if (parent && parent->dir->is_freezing()) return true; + return false; +} + +void CInode::add_dir_waiter(frag_t fg, MDSContext *c) +{ + if (waiting_on_dir.empty()) + get(PIN_DIRWAITER); + waiting_on_dir[fg].push_back(c); + dout(10) << __func__ << " frag " << fg << " " << c << " on " << *this << dendl; +} + +void CInode::take_dir_waiting(frag_t fg, MDSContext::vec& ls) +{ + if (waiting_on_dir.empty()) + return; + + auto it = waiting_on_dir.find(fg); + if (it != waiting_on_dir.end()) { + dout(10) << __func__ << " frag " << fg << " on " << *this << dendl; + auto& waiting = it->second; + ls.insert(ls.end(), waiting.begin(), waiting.end()); + waiting_on_dir.erase(it); + + if (waiting_on_dir.empty()) + put(PIN_DIRWAITER); + } +} + +void CInode::add_waiter(uint64_t tag, MDSContext *c) +{ + dout(10) << __func__ << " tag " << std::hex << tag << std::dec << " " << c + << " !ambig " << !state_test(STATE_AMBIGUOUSAUTH) + << " !frozen " << !is_frozen_inode() + << " !freezing " << !is_freezing_inode() + << dendl; + // wait on the directory? + // make sure its not the inode that is explicitly ambiguous|freezing|frozen + if (((tag & WAIT_SINGLEAUTH) && !state_test(STATE_AMBIGUOUSAUTH)) || + ((tag & WAIT_UNFREEZE) && + !is_frozen_inode() && !is_freezing_inode() && !is_frozen_auth_pin())) { + dout(15) << "passing waiter up tree" << dendl; + parent->dir->add_waiter(tag, c); + return; + } + dout(15) << "taking waiter here" << dendl; + MDSCacheObject::add_waiter(tag, c); +} + +void CInode::take_waiting(uint64_t mask, MDSContext::vec& ls) +{ + if ((mask & WAIT_DIR) && !waiting_on_dir.empty()) { + // take all dentry waiters + while (!waiting_on_dir.empty()) { + auto it = waiting_on_dir.begin(); + dout(10) << __func__ << " dirfrag " << it->first << " on " << *this << dendl; + auto& waiting = it->second; + ls.insert(ls.end(), waiting.begin(), waiting.end()); + waiting_on_dir.erase(it); + } + put(PIN_DIRWAITER); + } + + // waiting + MDSCacheObject::take_waiting(mask, ls); +} + +void CInode::maybe_finish_freeze_inode() +{ + CDir *dir = get_parent_dir(); + if (auth_pins > auth_pin_freeze_allowance || dir->frozen_inode_suppressed) + return; + + dout(10) << "maybe_finish_freeze_inode - frozen" << dendl; + ceph_assert(auth_pins == auth_pin_freeze_allowance); + get(PIN_FROZEN); + put(PIN_FREEZING); + state_clear(STATE_FREEZING); + state_set(STATE_FROZEN); + + item_freezing_inode.remove_myself(); + dir->num_frozen_inodes++; + + finish_waiting(WAIT_FROZEN); +} + +bool CInode::freeze_inode(int auth_pin_allowance) +{ + CDir *dir = get_parent_dir(); + ceph_assert(dir); + + ceph_assert(auth_pin_allowance > 0); // otherwise we need to adjust parent's nested_auth_pins + ceph_assert(auth_pins >= auth_pin_allowance); + if (auth_pins == auth_pin_allowance && !dir->frozen_inode_suppressed) { + dout(10) << "freeze_inode - frozen" << dendl; + if (!state_test(STATE_FROZEN)) { + get(PIN_FROZEN); + state_set(STATE_FROZEN); + 
dir->num_frozen_inodes++; + } + return true; + } + + dout(10) << "freeze_inode - waiting for auth_pins to drop to " << auth_pin_allowance << dendl; + auth_pin_freeze_allowance = auth_pin_allowance; + dir->freezing_inodes.push_back(&item_freezing_inode); + + get(PIN_FREEZING); + state_set(STATE_FREEZING); + + if (!dir->lock_caches_with_auth_pins.empty()) + mdcache->mds->locker->invalidate_lock_caches(dir); + + const static int lock_types[] = { + CEPH_LOCK_IVERSION, CEPH_LOCK_IFILE, CEPH_LOCK_IAUTH, CEPH_LOCK_ILINK, CEPH_LOCK_IDFT, + CEPH_LOCK_IXATTR, CEPH_LOCK_ISNAP, CEPH_LOCK_INEST, CEPH_LOCK_IFLOCK, CEPH_LOCK_IPOLICY, 0 + }; + for (int i = 0; lock_types[i]; ++i) { + auto lock = get_lock(lock_types[i]); + if (lock->is_cached()) + mdcache->mds->locker->invalidate_lock_caches(lock); + } + // invalidate_lock_caches() may decrease dir->frozen_inode_suppressed + // and finish freezing the inode + return state_test(STATE_FROZEN); +} + +void CInode::unfreeze_inode(MDSContext::vec& finished) +{ + dout(10) << __func__ << dendl; + if (state_test(STATE_FREEZING)) { + state_clear(STATE_FREEZING); + put(PIN_FREEZING); + item_freezing_inode.remove_myself(); + } else if (state_test(STATE_FROZEN)) { + state_clear(STATE_FROZEN); + put(PIN_FROZEN); + get_parent_dir()->num_frozen_inodes--; + } else + ceph_abort(); + take_waiting(WAIT_UNFREEZE, finished); +} + +void CInode::unfreeze_inode() +{ + MDSContext::vec finished; + unfreeze_inode(finished); + mdcache->mds->queue_waiters(finished); +} + +void CInode::freeze_auth_pin() +{ + ceph_assert(state_test(CInode::STATE_FROZEN)); + state_set(CInode::STATE_FROZENAUTHPIN); + get_parent_dir()->num_frozen_inodes++; +} + +void CInode::unfreeze_auth_pin() +{ + ceph_assert(state_test(CInode::STATE_FROZENAUTHPIN)); + state_clear(CInode::STATE_FROZENAUTHPIN); + get_parent_dir()->num_frozen_inodes--; + if (!state_test(STATE_FREEZING|STATE_FROZEN)) { + MDSContext::vec finished; + take_waiting(WAIT_UNFREEZE, finished); + mdcache->mds->queue_waiters(finished); + } +} + +void CInode::clear_ambiguous_auth(MDSContext::vec& finished) +{ + ceph_assert(state_test(CInode::STATE_AMBIGUOUSAUTH)); + state_clear(CInode::STATE_AMBIGUOUSAUTH); + take_waiting(CInode::WAIT_SINGLEAUTH, finished); +} + +void CInode::clear_ambiguous_auth() +{ + MDSContext::vec finished; + clear_ambiguous_auth(finished); + mdcache->mds->queue_waiters(finished); +} + +// auth_pins +bool CInode::can_auth_pin(int *err_ret) const { + int err; + if (!is_auth()) { + err = ERR_NOT_AUTH; + } else if (is_freezing_inode() || is_frozen_inode() || is_frozen_auth_pin()) { + err = ERR_EXPORTING_INODE; + } else { + if (parent) + return parent->can_auth_pin(err_ret); + err = 0; + } + if (err && err_ret) + *err_ret = err; + return !err; +} + +void CInode::auth_pin(void *by) +{ + if (auth_pins == 0) + get(PIN_AUTHPIN); + auth_pins++; + +#ifdef MDS_AUTHPIN_SET + auth_pin_set.insert(by); +#endif + + dout(10) << "auth_pin by " << by << " on " << *this << " now " << auth_pins << dendl; + + if (parent) + parent->adjust_nested_auth_pins(1, this); +} + +void CInode::auth_unpin(void *by) +{ + auth_pins--; + +#ifdef MDS_AUTHPIN_SET + { + auto it = auth_pin_set.find(by); + ceph_assert(it != auth_pin_set.end()); + auth_pin_set.erase(it); + } +#endif + + if (auth_pins == 0) + put(PIN_AUTHPIN); + + dout(10) << "auth_unpin by " << by << " on " << *this << " now " << auth_pins << dendl; + + ceph_assert(auth_pins >= 0); + + if (parent) + parent->adjust_nested_auth_pins(-1, by); + + if (is_freezing_inode()) + maybe_finish_freeze_inode(); +} + 
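The freeze machinery above (freeze_inode, maybe_finish_freeze_inode, the WAIT_FROZEN waiters) is a handshake with whoever needs the inode quiescent: the caller asks for a freeze while holding its own auth_pins and is woken once the remaining pins drain to the allowance. A minimal caller-side sketch, assuming the caller already holds exactly one auth_pin; the helper name and surrounding context are illustrative, only the CInode calls are from this file:

void freeze_then_continue(CInode *in, MDSContext *onfrozen)
{
  // allow our single auth_pin to remain while the inode freezes
  if (in->freeze_inode(1)) {
    onfrozen->complete(0);   // already frozen, proceed immediately
  } else {
    // maybe_finish_freeze_inode() finishes this waiter once auth_pins
    // drop to the allowance and the parent dir stops suppressing freezes
    in->add_waiter(CInode::WAIT_FROZEN, onfrozen);
  }
}

unfreeze_inode() is the symmetric teardown and requeues any WAIT_UNFREEZE waiters.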
+// authority + +mds_authority_t CInode::authority() const +{ + if (inode_auth.first >= 0) + return inode_auth; + + if (parent) + return parent->dir->authority(); + + // new items that are not yet linked in (in the committed plane) belong + // to their first parent. + if (!projected_parent.empty()) + return projected_parent.front()->dir->authority(); + + return CDIR_AUTH_UNDEF; +} + + +// SNAP + +snapid_t CInode::get_oldest_snap() +{ + snapid_t t = first; + if (is_any_old_inodes()) + t = get_old_inodes()->begin()->second.first; + return std::min(t, oldest_snap); +} + +const CInode::mempool_old_inode& CInode::cow_old_inode(snapid_t follows, bool cow_head) +{ + ceph_assert(follows >= first); + + const auto& pi = cow_head ? get_projected_inode() : get_previous_projected_inode(); + const auto& px = cow_head ? get_projected_xattrs() : get_previous_projected_xattrs(); + + auto _old_inodes = allocate_old_inode_map(); + if (old_inodes) + *_old_inodes = *old_inodes; + + mempool_old_inode &old = (*_old_inodes)[follows]; + old.first = first; + old.inode = *pi; + if (px) { + dout(10) << " " << px->size() << " xattrs cowed, " << *px << dendl; + old.xattrs = *px; + } + + if (first < oldest_snap) + oldest_snap = first; + + old.inode.trim_client_ranges(follows); + + if (g_conf()->mds_snap_rstat && + !(old.inode.rstat == old.inode.accounted_rstat)) + dirty_old_rstats.insert(follows); + + first = follows+1; + + dout(10) << __func__ << " " << (cow_head ? "head" : "previous_head" ) + << " to [" << old.first << "," << follows << "] on " + << *this << dendl; + + reset_old_inodes(std::move(_old_inodes)); + return old; +} + +void CInode::pre_cow_old_inode() +{ + snapid_t follows = mdcache->get_global_snaprealm()->get_newest_seq(); + dout(20) << __func__ << " follows " << follows << " on " << *this << dendl; + if (first <= follows) + cow_old_inode(follows, true); +} + +bool CInode::has_snap_data(snapid_t snapid) +{ + bool found = snapid >= first && snapid <= last; + if (!found && is_any_old_inodes()) { + auto p = old_inodes->lower_bound(snapid); + if (p != old_inodes->end()) { + if (p->second.first > snapid) { + if (p != old_inodes->begin()) + --p; + } + if (p->second.first <= snapid && snapid <= p->first) { + found = true; + } + } + } + return found; +} + +void CInode::purge_stale_snap_data(const set<snapid_t>& snaps) +{ + dout(10) << __func__ << " " << snaps << dendl; + + if (!get_old_inodes()) + return; + + std::vector<snapid_t> to_remove; + for (auto p : *get_old_inodes()) { + const snapid_t &id = p.first; + const auto &s = snaps.lower_bound(p.second.first); + if (s == snaps.end() || *s > id) { + dout(10) << " purging old_inode [" << p.second.first << "," << id << "]" << dendl; + to_remove.push_back(id); + } + } + + if (to_remove.size() == get_old_inodes()->size()) { + reset_old_inodes(old_inode_map_ptr()); + } else if (!to_remove.empty()) { + auto _old_inodes = allocate_old_inode_map(*get_old_inodes()); + for (auto id : to_remove) + _old_inodes->erase(id); + reset_old_inodes(std::move(_old_inodes)); + } +} + +/* + * pick/create an old_inode + */ +snapid_t CInode::pick_old_inode(snapid_t snap) const +{ + if (is_any_old_inodes()) { + auto it = old_inodes->lower_bound(snap); // p is first key >= to snap + if (it != old_inodes->end() && it->second.first <= snap) { + dout(10) << __func__ << " snap " << snap << " -> [" << it->second.first << "," << it->first << "]" << dendl; + return it->first; + } + } + dout(10) << __func__ << " snap " << snap << " -> nothing" << dendl; + return 0; +} + +void 
CInode::open_snaprealm(bool nosplit) +{ + if (!snaprealm) { + SnapRealm *parent = find_snaprealm(); + snaprealm = new SnapRealm(mdcache, this); + if (parent) { + dout(10) << __func__ << " " << snaprealm + << " parent is " << parent + << dendl; + dout(30) << " siblings are " << parent->open_children << dendl; + snaprealm->parent = parent; + if (!nosplit) + parent->split_at(snaprealm); + parent->open_children.insert(snaprealm); + } + } +} +void CInode::close_snaprealm(bool nojoin) +{ + if (snaprealm) { + dout(15) << __func__ << " " << *snaprealm << dendl; + if (snaprealm->parent) { + snaprealm->parent->open_children.erase(snaprealm); + //if (!nojoin) + //snaprealm->parent->join(snaprealm); + } + delete snaprealm; + snaprealm = 0; + } +} + +SnapRealm *CInode::find_snaprealm() const +{ + const CInode *cur = this; + while (!cur->snaprealm) { + const CDentry *pdn = cur->get_oldest_parent_dn(); + if (!pdn) + break; + cur = pdn->get_dir()->get_inode(); + } + return cur->snaprealm; +} + +void CInode::encode_snap_blob(bufferlist &snapbl) +{ + if (snaprealm) { + using ceph::encode; + encode(snaprealm->srnode, snapbl); + dout(20) << __func__ << " " << *snaprealm << dendl; + } +} +void CInode::decode_snap_blob(const bufferlist& snapbl) +{ + using ceph::decode; + if (snapbl.length()) { + open_snaprealm(); + auto old_flags = snaprealm->srnode.flags; + auto p = snapbl.cbegin(); + decode(snaprealm->srnode, p); + if (!is_base()) { + if ((snaprealm->srnode.flags ^ old_flags) & sr_t::PARENT_GLOBAL) { + snaprealm->adjust_parent(); + } + } + dout(20) << __func__ << " " << *snaprealm << dendl; + } else if (snaprealm && + !is_root() && !is_mdsdir()) { // see https://tracker.ceph.com/issues/42675 + ceph_assert(mdcache->mds->is_any_replay()); + snaprealm->merge_to(NULL); + } +} + +void CInode::encode_snap(bufferlist& bl) +{ + ENCODE_START(1, 1, bl); + bufferlist snapbl; + encode_snap_blob(snapbl); + encode(snapbl, bl); + encode(oldest_snap, bl); + ENCODE_FINISH(bl); +} + +void CInode::decode_snap(bufferlist::const_iterator& p) +{ + DECODE_START(1, p); + bufferlist snapbl; + decode(snapbl, p); + decode(oldest_snap, p); + decode_snap_blob(snapbl); + DECODE_FINISH(p); +} + +// ============================================= + +client_t CInode::calc_ideal_loner() +{ + if (mdcache->is_readonly()) + return -1; + if (!get_mds_caps_wanted().empty()) + return -1; + + int n = 0; + client_t loner = -1; + for (const auto &p : client_caps) { + if (!p.second.is_stale() && + (is_dir() ? 
+ !has_subtree_or_exporting_dirfrag() : + (p.second.wanted() & (CEPH_CAP_ANY_WR|CEPH_CAP_FILE_RD)))) { + if (n) + return -1; + n++; + loner = p.first; + } + } + return loner; +} + +bool CInode::choose_ideal_loner() +{ + want_loner_cap = calc_ideal_loner(); + int changed = false; + if (loner_cap >= 0 && loner_cap != want_loner_cap) { + if (!try_drop_loner()) + return false; + changed = true; + } + + if (want_loner_cap >= 0) { + if (loner_cap < 0) { + set_loner_cap(want_loner_cap); + changed = true; + } else + ceph_assert(loner_cap == want_loner_cap); + } + return changed; +} + +bool CInode::try_set_loner() +{ + ceph_assert(want_loner_cap >= 0); + if (loner_cap >= 0 && loner_cap != want_loner_cap) + return false; + set_loner_cap(want_loner_cap); + return true; +} + +void CInode::set_loner_cap(client_t l) +{ + loner_cap = l; + authlock.set_excl_client(loner_cap); + filelock.set_excl_client(loner_cap); + linklock.set_excl_client(loner_cap); + xattrlock.set_excl_client(loner_cap); +} + +bool CInode::try_drop_loner() +{ + if (loner_cap < 0) + return true; + + int other_allowed = get_caps_allowed_by_type(CAP_ANY); + Capability *cap = get_client_cap(loner_cap); + if (!cap || + (cap->issued() & ~other_allowed) == 0) { + set_loner_cap(-1); + return true; + } + return false; +} + + +// choose new lock state during recovery, based on issued caps +void CInode::choose_lock_state(SimpleLock *lock, int allissued) +{ + int shift = lock->get_cap_shift(); + int issued = (allissued >> shift) & lock->get_cap_mask(); + if (is_auth()) { + if (lock->is_xlocked()) { + // do nothing here + } else if (lock->get_state() != LOCK_MIX) { + if (issued & (CEPH_CAP_GEXCL | CEPH_CAP_GBUFFER)) + lock->set_state(LOCK_EXCL); + else if (issued & CEPH_CAP_GWR) { + if (issued & (CEPH_CAP_GCACHE | CEPH_CAP_GSHARED)) + lock->set_state(LOCK_EXCL); + else + lock->set_state(LOCK_MIX); + } else if (lock->is_dirty()) { + if (is_replicated()) + lock->set_state(LOCK_MIX); + else + lock->set_state(LOCK_LOCK); + } else + lock->set_state(LOCK_SYNC); + } + } else { + // our states have already been chosen during rejoin. 
+ if (lock->is_xlocked()) + ceph_assert(lock->get_state() == LOCK_LOCK); + } +} + +void CInode::choose_lock_states(int dirty_caps) +{ + int issued = get_caps_issued() | dirty_caps; + if (is_auth() && (issued & (CEPH_CAP_ANY_EXCL|CEPH_CAP_ANY_WR))) + choose_ideal_loner(); + choose_lock_state(&filelock, issued); + choose_lock_state(&nestlock, issued); + choose_lock_state(&dirfragtreelock, issued); + choose_lock_state(&authlock, issued); + choose_lock_state(&xattrlock, issued); + choose_lock_state(&linklock, issued); +} + +int CInode::count_nonstale_caps() +{ + int n = 0; + for (const auto &p : client_caps) { + if (!p.second.is_stale()) + n++; + } + return n; +} + +bool CInode::multiple_nonstale_caps() +{ + int n = 0; + for (const auto &p : client_caps) { + if (!p.second.is_stale()) { + if (n) + return true; + n++; + } + } + return false; +} + +void CInode::set_mds_caps_wanted(mempool::mds_co::compact_map<int32_t,int32_t>& m) +{ + bool old_empty = mds_caps_wanted.empty(); + mds_caps_wanted.swap(m); + if (old_empty != (bool)mds_caps_wanted.empty()) { + if (old_empty) + adjust_num_caps_notable(1); + else + adjust_num_caps_notable(-1); + } +} + +void CInode::set_mds_caps_wanted(mds_rank_t mds, int32_t wanted) +{ + bool old_empty = mds_caps_wanted.empty(); + if (wanted) { + mds_caps_wanted[mds] = wanted; + if (old_empty) + adjust_num_caps_notable(1); + } else if (!old_empty) { + mds_caps_wanted.erase(mds); + if (mds_caps_wanted.empty()) + adjust_num_caps_notable(-1); + } +} + +Capability *CInode::add_client_cap(client_t client, Session *session, + SnapRealm *conrealm, bool new_inode) +{ + ceph_assert(last == CEPH_NOSNAP); + if (client_caps.empty()) { + get(PIN_CAPS); + if (conrealm) + containing_realm = conrealm; + else + containing_realm = find_snaprealm(); + containing_realm->inodes_with_caps.push_back(&item_caps); + dout(10) << __func__ << " first cap, joining realm " << *containing_realm << dendl; + + mdcache->num_inodes_with_caps++; + if (parent) + parent->dir->adjust_num_inodes_with_caps(1); + } + + uint64_t cap_id = new_inode ? 1 : ++mdcache->last_cap_id; + auto ret = client_caps.emplace(std::piecewise_construct, std::forward_as_tuple(client), + std::forward_as_tuple(this, session, cap_id)); + ceph_assert(ret.second == true); + Capability *cap = &ret.first->second; + + cap->client_follows = first-1; + containing_realm->add_cap(client, cap); + + return cap; +} + +void CInode::remove_client_cap(client_t client) +{ + auto it = client_caps.find(client); + ceph_assert(it != client_caps.end()); + Capability *cap = &it->second; + + cap->item_session_caps.remove_myself(); + cap->item_revoking_caps.remove_myself(); + cap->item_client_revoking_caps.remove_myself(); + containing_realm->remove_cap(client, cap); + + if (client == loner_cap) + loner_cap = -1; + + if (cap->is_wanted_notable()) + adjust_num_caps_notable(-1); + + client_caps.erase(it); + if (client_caps.empty()) { + dout(10) << __func__ << " last cap, leaving realm " << *containing_realm << dendl; + put(PIN_CAPS); + item_caps.remove_myself(); + containing_realm = NULL; + mdcache->num_inodes_with_caps--; + if (parent) + parent->dir->adjust_num_inodes_with_caps(-1); + } + + //clean up advisory locks + bool fcntl_removed = fcntl_locks ? fcntl_locks->remove_all_from(client) : false; + bool flock_removed = flock_locks ? 
flock_locks->remove_all_from(client) : false; + if (fcntl_removed || flock_removed) { + MDSContext::vec waiters; + take_waiting(CInode::WAIT_FLOCK, waiters); + mdcache->mds->queue_waiters(waiters); + } +} + +void CInode::move_to_realm(SnapRealm *realm) +{ + dout(10) << __func__ << " joining realm " << *realm + << ", leaving realm " << *containing_realm << dendl; + for (auto& p : client_caps) { + containing_realm->remove_cap(p.first, &p.second); + realm->add_cap(p.first, &p.second); + } + item_caps.remove_myself(); + realm->inodes_with_caps.push_back(&item_caps); + containing_realm = realm; +} + +Capability *CInode::reconnect_cap(client_t client, const cap_reconnect_t& icr, Session *session) +{ + Capability *cap = get_client_cap(client); + if (cap) { + // FIXME? + cap->merge(icr.capinfo.wanted, icr.capinfo.issued); + } else { + cap = add_client_cap(client, session); + cap->set_cap_id(icr.capinfo.cap_id); + cap->set_wanted(icr.capinfo.wanted); + cap->issue_norevoke(icr.capinfo.issued); + cap->reset_seq(); + } + cap->set_last_issue_stamp(ceph_clock_now()); + return cap; +} + +void CInode::clear_client_caps_after_export() +{ + while (!client_caps.empty()) + remove_client_cap(client_caps.begin()->first); + loner_cap = -1; + want_loner_cap = -1; + if (!get_mds_caps_wanted().empty()) { + mempool::mds_co::compact_map<int32_t,int32_t> empty; + set_mds_caps_wanted(empty); + } +} + +void CInode::export_client_caps(map<client_t,Capability::Export>& cl) +{ + for (const auto &p : client_caps) { + cl[p.first] = p.second.make_export(); + } +} + + // caps allowed +int CInode::get_caps_liked() const +{ + if (is_dir()) + return CEPH_CAP_PIN | CEPH_CAP_ANY_EXCL | CEPH_CAP_ANY_SHARED; // but not, say, FILE_RD|WR|WRBUFFER + else + return CEPH_CAP_ANY & ~CEPH_CAP_FILE_LAZYIO; +} + +int CInode::get_caps_allowed_ever() const +{ + int allowed; + if (is_dir()) + allowed = CEPH_CAP_PIN | CEPH_CAP_ANY_EXCL | CEPH_CAP_ANY_SHARED; + else + allowed = CEPH_CAP_ANY; + return allowed & + (CEPH_CAP_PIN | + (filelock.gcaps_allowed_ever() << filelock.get_cap_shift()) | + (authlock.gcaps_allowed_ever() << authlock.get_cap_shift()) | + (xattrlock.gcaps_allowed_ever() << xattrlock.get_cap_shift()) | + (linklock.gcaps_allowed_ever() << linklock.get_cap_shift())); +} + +int CInode::get_caps_allowed_by_type(int type) const +{ + return + CEPH_CAP_PIN | + (filelock.gcaps_allowed(type) << filelock.get_cap_shift()) | + (authlock.gcaps_allowed(type) << authlock.get_cap_shift()) | + (xattrlock.gcaps_allowed(type) << xattrlock.get_cap_shift()) | + (linklock.gcaps_allowed(type) << linklock.get_cap_shift()); +} + +int CInode::get_caps_careful() const +{ + return + (filelock.gcaps_careful() << filelock.get_cap_shift()) | + (authlock.gcaps_careful() << authlock.get_cap_shift()) | + (xattrlock.gcaps_careful() << xattrlock.get_cap_shift()) | + (linklock.gcaps_careful() << linklock.get_cap_shift()); +} + +int CInode::get_xlocker_mask(client_t client) const +{ + return + (filelock.gcaps_xlocker_mask(client) << filelock.get_cap_shift()) | + (authlock.gcaps_xlocker_mask(client) << authlock.get_cap_shift()) | + (xattrlock.gcaps_xlocker_mask(client) << xattrlock.get_cap_shift()) | + (linklock.gcaps_xlocker_mask(client) << linklock.get_cap_shift()); +} + +int CInode::get_caps_allowed_for_client(Session *session, Capability *cap, + const mempool_inode *file_i) const +{ + client_t client = session->get_client(); + int allowed; + if (client == get_loner()) { + // as the loner, we get the loner_caps AND any xlocker_caps for things we have xlocked + allowed 
= + get_caps_allowed_by_type(CAP_LONER) | + (get_caps_allowed_by_type(CAP_XLOCKER) & get_xlocker_mask(client)); + } else { + allowed = get_caps_allowed_by_type(CAP_ANY); + } + + if (is_dir()) { + allowed &= ~CEPH_CAP_ANY_DIR_OPS; + if (cap && (allowed & CEPH_CAP_FILE_EXCL)) + allowed |= cap->get_lock_cache_allowed(); + } else { + if (file_i->inline_data.version == CEPH_INLINE_NONE && + file_i->layout.pool_ns.empty()) { + // noop + } else if (cap) { + if ((file_i->inline_data.version != CEPH_INLINE_NONE && + cap->is_noinline()) || + (!file_i->layout.pool_ns.empty() && + cap->is_nopoolns())) + allowed &= ~(CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR); + } else { + auto& conn = session->get_connection(); + if ((file_i->inline_data.version != CEPH_INLINE_NONE && + !conn->has_feature(CEPH_FEATURE_MDS_INLINE_DATA)) || + (!file_i->layout.pool_ns.empty() && + !conn->has_feature(CEPH_FEATURE_FS_FILE_LAYOUT_V2))) + allowed &= ~(CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR); + } + } + return allowed; +} + +// caps issued, wanted +int CInode::get_caps_issued(int *ploner, int *pother, int *pxlocker, + int shift, int mask) +{ + int c = 0; + int loner = 0, other = 0, xlocker = 0; + if (!is_auth()) { + loner_cap = -1; + } + + for (const auto &p : client_caps) { + int i = p.second.issued(); + c |= i; + if (p.first == loner_cap) + loner |= i; + else + other |= i; + xlocker |= get_xlocker_mask(p.first) & i; + } + if (ploner) *ploner = (loner >> shift) & mask; + if (pother) *pother = (other >> shift) & mask; + if (pxlocker) *pxlocker = (xlocker >> shift) & mask; + return (c >> shift) & mask; +} + +bool CInode::is_any_caps_wanted() const +{ + for (const auto &p : client_caps) { + if (p.second.wanted()) + return true; + } + return false; +} + +int CInode::get_caps_wanted(int *ploner, int *pother, int shift, int mask) const +{ + int w = 0; + int loner = 0, other = 0; + for (const auto &p : client_caps) { + if (!p.second.is_stale()) { + int t = p.second.wanted(); + w |= t; + if (p.first == loner_cap) + loner |= t; + else + other |= t; + } + //cout << " get_caps_wanted client " << it->first << " " << cap_string(it->second.wanted()) << endl; + } + if (is_auth()) + for (const auto &p : mds_caps_wanted) { + w |= p.second; + other |= p.second; + //cout << " get_caps_wanted mds " << it->first << " " << cap_string(it->second) << endl; + } + if (ploner) *ploner = (loner >> shift) & mask; + if (pother) *pother = (other >> shift) & mask; + return (w >> shift) & mask; +} + +bool CInode::issued_caps_need_gather(SimpleLock *lock) +{ + int loner_issued, other_issued, xlocker_issued; + get_caps_issued(&loner_issued, &other_issued, &xlocker_issued, + lock->get_cap_shift(), lock->get_cap_mask()); + if ((loner_issued & ~lock->gcaps_allowed(CAP_LONER)) || + (other_issued & ~lock->gcaps_allowed(CAP_ANY)) || + (xlocker_issued & ~lock->gcaps_allowed(CAP_XLOCKER))) + return true; + return false; +} + +void CInode::adjust_num_caps_notable(int d) +{ + if (!is_clientwriteable()) { + if (!num_caps_notable && d > 0) + mdcache->open_file_table.add_inode(this); + else if (num_caps_notable > 0 && num_caps_notable == -d) + mdcache->open_file_table.remove_inode(this); + } + + num_caps_notable +=d; + ceph_assert(num_caps_notable >= 0); +} + +void CInode::mark_clientwriteable() +{ + if (last != CEPH_NOSNAP) + return; + if (!state_test(STATE_CLIENTWRITEABLE)) { + if (num_caps_notable == 0) + mdcache->open_file_table.add_inode(this); + state_set(STATE_CLIENTWRITEABLE); + } +} + +void CInode::clear_clientwriteable() +{ + if (state_test(STATE_CLIENTWRITEABLE)) { + if 
(num_caps_notable == 0) + mdcache->open_file_table.remove_inode(this); + state_clear(STATE_CLIENTWRITEABLE); + } +} + +// ============================================= + +int CInode::encode_inodestat(bufferlist& bl, Session *session, + SnapRealm *dir_realm, + snapid_t snapid, + unsigned max_bytes, + int getattr_caps) +{ + client_t client = session->get_client(); + ceph_assert(snapid); + + bool valid = true; + + // pick a version! + const mempool_inode *oi = get_inode().get(); + const mempool_inode *pi = get_projected_inode().get(); + + const mempool_xattr_map *pxattrs = nullptr; + + if (snapid != CEPH_NOSNAP) { + + // for now at least, old_inodes is only defined/valid on the auth + if (!is_auth()) + valid = false; + + if (is_any_old_inodes()) { + auto it = old_inodes->lower_bound(snapid); + if (it != old_inodes->end()) { + if (it->second.first > snapid) { + if (it != old_inodes->begin()) + --it; + } + if (it->second.first <= snapid && snapid <= it->first) { + dout(15) << __func__ << " snapid " << snapid + << " to old_inode [" << it->second.first << "," << it->first << "]" + << " " << it->second.inode.rstat + << dendl; + pi = oi = &it->second.inode; + pxattrs = &it->second.xattrs; + } else { + // snapshoted remote dentry can result this + dout(0) << __func__ << " old_inode for snapid " << snapid + << " not found" << dendl; + } + } + } else if (snapid < first || snapid > last) { + // snapshoted remote dentry can result this + dout(0) << __func__ << " [" << first << "," << last << "]" + << " not match snapid " << snapid << dendl; + } + } + + utime_t snap_btime; + std::map<std::string, std::string> snap_metadata; + SnapRealm *realm = find_snaprealm(); + if (snapid != CEPH_NOSNAP && realm) { + // add snapshot timestamp vxattr + map<snapid_t,const SnapInfo*> infomap; + realm->get_snap_info(infomap, + snapid, // min + snapid); // max + if (!infomap.empty()) { + ceph_assert(infomap.size() == 1); + const SnapInfo *si = infomap.begin()->second; + snap_btime = si->stamp; + snap_metadata = si->metadata; + } + } + + + bool no_caps = !valid || + session->is_stale() || + (dir_realm && realm != dir_realm) || + is_frozen() || + state_test(CInode::STATE_EXPORTINGCAPS); + if (no_caps) + dout(20) << __func__ << " no caps" + << (!valid?", !valid":"") + << (session->is_stale()?", session stale ":"") + << ((dir_realm && realm != dir_realm)?", snaprealm differs ":"") + << (is_frozen()?", frozen inode":"") + << (state_test(CInode::STATE_EXPORTINGCAPS)?", exporting caps":"") + << dendl; + + + // "fake" a version that is odd (stable) version, +1 if projected. + version_t version = (oi->version * 2) + is_projected(); + + Capability *cap = get_client_cap(client); + bool pfile = filelock.is_xlocked_by_client(client) || get_loner() == client; + //(cap && (cap->issued() & CEPH_CAP_FILE_EXCL)); + bool pauth = authlock.is_xlocked_by_client(client) || get_loner() == client; + bool plink = linklock.is_xlocked_by_client(client) || get_loner() == client; + bool pxattr = xattrlock.is_xlocked_by_client(client) || get_loner() == client; + + bool plocal = versionlock.get_last_wrlock_client() == client; + bool ppolicy = policylock.is_xlocked_by_client(client) || get_loner()==client; + + const mempool_inode *any_i = (pfile|pauth|plink|pxattr|plocal) ? 
pi : oi; + + dout(20) << " pfile " << pfile << " pauth " << pauth + << " plink " << plink << " pxattr " << pxattr + << " plocal " << plocal + << " mtime " << any_i->mtime + << " ctime " << any_i->ctime + << " change_attr " << any_i->change_attr + << " valid=" << valid << dendl; + + // file + const mempool_inode *file_i = pfile ? pi:oi; + file_layout_t layout; + if (is_dir()) { + layout = (ppolicy ? pi : oi)->layout; + } else { + layout = file_i->layout; + } + + // max_size is min of projected, actual + uint64_t max_size = + std::min(oi->get_client_range(client), + pi->get_client_range(client)); + + // inline data + version_t inline_version = 0; + bufferlist inline_data; + if (file_i->inline_data.version == CEPH_INLINE_NONE) { + inline_version = CEPH_INLINE_NONE; + } else if ((!cap && !no_caps) || + (cap && cap->client_inline_version < file_i->inline_data.version) || + (getattr_caps & CEPH_CAP_FILE_RD)) { // client requests inline data + inline_version = file_i->inline_data.version; + if (file_i->inline_data.length() > 0) + file_i->inline_data.get_data(inline_data); + } + + // nest (do same as file... :/) + if (cap) { + cap->last_rbytes = file_i->rstat.rbytes; + cap->last_rsize = file_i->rstat.rsize(); + } + + // auth + const mempool_inode *auth_i = pauth ? pi:oi; + + // link + const mempool_inode *link_i = plink ? pi:oi; + + // xattr + const mempool_inode *xattr_i = pxattr ? pi:oi; + + using ceph::encode; + // xattr + version_t xattr_version; + if ((!cap && !no_caps) || + (cap && cap->client_xattr_version < xattr_i->xattr_version) || + (getattr_caps & CEPH_CAP_XATTR_SHARED)) { // client requests xattrs + if (!pxattrs) + pxattrs = pxattr ? get_projected_xattrs().get() : get_xattrs().get(); + xattr_version = xattr_i->xattr_version; + } else { + xattr_version = 0; + } + + // do we have room? + if (max_bytes) { + unsigned bytes = + 8 + 8 + 4 + 8 + 8 + sizeof(ceph_mds_reply_cap) + + sizeof(struct ceph_file_layout) + + sizeof(struct ceph_timespec) * 3 + 4 + // ctime ~ time_warp_seq + 8 + 8 + 8 + 4 + 4 + 4 + 4 + 4 + // size ~ nlink + 8 + 8 + 8 + 8 + 8 + sizeof(struct ceph_timespec) + // dirstat.nfiles ~ rstat.rctime + sizeof(__u32) + sizeof(__u32) * 2 * dirfragtree._splits.size() + // dirfragtree + sizeof(__u32) + symlink.length() + // symlink + sizeof(struct ceph_dir_layout); // dir_layout + + if (xattr_version) { + bytes += sizeof(__u32) + sizeof(__u32); // xattr buffer len + number entries + if (pxattrs) { + for (const auto &p : *pxattrs) + bytes += sizeof(__u32) * 2 + p.first.length() + p.second.length(); + } + } else { + bytes += sizeof(__u32); // xattr buffer len + } + bytes += + sizeof(version_t) + sizeof(__u32) + inline_data.length() + // inline data + 1 + 1 + 8 + 8 + 4 + // quota + 4 + layout.pool_ns.size() + // pool ns + sizeof(struct ceph_timespec) + 8; // btime + change_attr + + if (bytes > max_bytes) + return -CEPHFS_ENOSPC; + } + + + // encode caps + struct ceph_mds_reply_cap ecap; + if (snapid != CEPH_NOSNAP) { + /* + * snapped inodes (files or dirs) only get read-only caps. always + * issue everything possible, since it is read only. + * + * if a snapped inode has caps, limit issued caps based on the + * lock state. + * + * if it is a live inode, limit issued caps based on the lock + * state. + * + * do NOT adjust cap issued state, because the client always + * tracks caps per-snap and the mds does either per-interval or + * multiversion. + */ + ecap.caps = valid ? 
get_caps_allowed_by_type(CAP_ANY) : CEPH_STAT_CAP_INODE; + if (last == CEPH_NOSNAP || is_any_caps()) + ecap.caps = ecap.caps & get_caps_allowed_for_client(session, nullptr, file_i); + ecap.seq = 0; + ecap.mseq = 0; + ecap.realm = 0; + } else { + if (!no_caps && !cap) { + // add a new cap + cap = add_client_cap(client, session, realm); + if (is_auth()) + choose_ideal_loner(); + } + + int issue = 0; + if (!no_caps && cap) { + int likes = get_caps_liked(); + int allowed = get_caps_allowed_for_client(session, cap, file_i); + issue = (cap->wanted() | likes) & allowed; + cap->issue_norevoke(issue, true); + issue = cap->pending(); + dout(10) << "encode_inodestat issuing " << ccap_string(issue) + << " seq " << cap->get_last_seq() << dendl; + } else if (cap && cap->is_new() && !dir_realm) { + // alway issue new caps to client, otherwise the caps get lost + ceph_assert(cap->is_stale()); + ceph_assert(!cap->pending()); + issue = CEPH_CAP_PIN; + cap->issue_norevoke(issue, true); + dout(10) << "encode_inodestat issuing " << ccap_string(issue) + << " seq " << cap->get_last_seq() + << "(stale&new caps)" << dendl; + } + + if (issue) { + cap->set_last_issue(); + cap->set_last_issue_stamp(ceph_clock_now()); + ecap.caps = issue; + ecap.wanted = cap->wanted(); + ecap.cap_id = cap->get_cap_id(); + ecap.seq = cap->get_last_seq(); + ecap.mseq = cap->get_mseq(); + ecap.realm = realm->inode->ino(); + } else { + ecap.cap_id = 0; + ecap.caps = 0; + ecap.seq = 0; + ecap.mseq = 0; + ecap.realm = 0; + ecap.wanted = 0; + } + } + ecap.flags = is_auth() ? CEPH_CAP_FLAG_AUTH : 0; + dout(10) << "encode_inodestat caps " << ccap_string(ecap.caps) + << " seq " << ecap.seq << " mseq " << ecap.mseq + << " xattrv " << xattr_version << dendl; + + if (inline_data.length() && cap) { + if ((cap->pending() | getattr_caps) & CEPH_CAP_FILE_SHARED) { + dout(10) << "including inline version " << inline_version << dendl; + cap->client_inline_version = inline_version; + } else { + dout(10) << "dropping inline version " << inline_version << dendl; + inline_version = 0; + inline_data.clear(); + } + } + + // include those xattrs? + if (xattr_version && cap) { + if ((cap->pending() | getattr_caps) & CEPH_CAP_XATTR_SHARED) { + dout(10) << "including xattrs version " << xattr_version << dendl; + cap->client_xattr_version = xattr_version; + } else { + dout(10) << "dropping xattrs version " << xattr_version << dendl; + xattr_version = 0; + } + } + + // The end result of encode_xattrs() is equivalent to: + // { + // bufferlist xbl; + // if (xattr_version) { + // if (pxattrs) + // encode(*pxattrs, bl); + // else + // encode((__u32)0, bl); + // } + // encode(xbl, bl); + // } + // + // But encoding xattrs into the 'xbl' requires a memory allocation. + // The 'bl' should have enough pre-allocated memory in most cases. + // Encoding xattrs directly into it can avoid the extra allocation. 
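+ // (the lambda below reserves the 4-byte length with append_hole() and patches it via copy_in() after encoding the map in place, so the result is byte-identical to the two-step encoding shown above)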
+ auto encode_xattrs = [xattr_version, pxattrs, &bl]() { + using ceph::encode; + if (xattr_version) { + ceph_le32 xbl_len; + auto filler = bl.append_hole(sizeof(xbl_len)); + const auto starting_bl_len = bl.length(); + if (pxattrs) + encode(*pxattrs, bl); + else + encode((__u32)0, bl); + xbl_len = bl.length() - starting_bl_len; + filler.copy_in(sizeof(xbl_len), (char *)&xbl_len); + } else { + encode((__u32)0, bl); + } + }; + + /* + * note: encoding matches MClientReply::InodeStat + */ + if (session->info.has_feature(CEPHFS_FEATURE_REPLY_ENCODING)) { + ENCODE_START(7, 1, bl); + encode(oi->ino, bl); + encode(snapid, bl); + encode(oi->rdev, bl); + encode(version, bl); + encode(xattr_version, bl); + encode(ecap, bl); + { + ceph_file_layout legacy_layout; + layout.to_legacy(&legacy_layout); + encode(legacy_layout, bl); + } + encode(any_i->ctime, bl); + encode(file_i->mtime, bl); + encode(file_i->atime, bl); + encode(file_i->time_warp_seq, bl); + encode(file_i->size, bl); + encode(max_size, bl); + encode(file_i->truncate_size, bl); + encode(file_i->truncate_seq, bl); + encode(auth_i->mode, bl); + encode((uint32_t)auth_i->uid, bl); + encode((uint32_t)auth_i->gid, bl); + encode(link_i->nlink, bl); + encode(file_i->dirstat.nfiles, bl); + encode(file_i->dirstat.nsubdirs, bl); + encode(file_i->rstat.rbytes, bl); + encode(file_i->rstat.rfiles, bl); + encode(file_i->rstat.rsubdirs, bl); + encode(file_i->rstat.rctime, bl); + dirfragtree.encode(bl); + encode(symlink, bl); + encode(file_i->dir_layout, bl); + encode_xattrs(); + encode(inline_version, bl); + encode(inline_data, bl); + const mempool_inode *policy_i = ppolicy ? pi : oi; + encode(policy_i->quota, bl); + encode(layout.pool_ns, bl); + encode(any_i->btime, bl); + encode(any_i->change_attr, bl); + encode(file_i->export_pin, bl); + encode(snap_btime, bl); + encode(file_i->rstat.rsnaps, bl); + encode(snap_metadata, bl); + encode(!file_i->fscrypt_auth.empty(), bl); + encode(file_i->fscrypt_auth, bl); + encode(file_i->fscrypt_file, bl); + ENCODE_FINISH(bl); + } + else { + ceph_assert(session->get_connection()); + + encode(oi->ino, bl); + encode(snapid, bl); + encode(oi->rdev, bl); + encode(version, bl); + encode(xattr_version, bl); + encode(ecap, bl); + { + ceph_file_layout legacy_layout; + layout.to_legacy(&legacy_layout); + encode(legacy_layout, bl); + } + encode(any_i->ctime, bl); + encode(file_i->mtime, bl); + encode(file_i->atime, bl); + encode(file_i->time_warp_seq, bl); + encode(file_i->size, bl); + encode(max_size, bl); + encode(file_i->truncate_size, bl); + encode(file_i->truncate_seq, bl); + encode(auth_i->mode, bl); + encode((uint32_t)auth_i->uid, bl); + encode((uint32_t)auth_i->gid, bl); + encode(link_i->nlink, bl); + encode(file_i->dirstat.nfiles, bl); + encode(file_i->dirstat.nsubdirs, bl); + encode(file_i->rstat.rbytes, bl); + encode(file_i->rstat.rfiles, bl); + encode(file_i->rstat.rsubdirs, bl); + encode(file_i->rstat.rctime, bl); + dirfragtree.encode(bl); + encode(symlink, bl); + auto& conn = session->get_connection(); + if (conn->has_feature(CEPH_FEATURE_DIRLAYOUTHASH)) { + encode(file_i->dir_layout, bl); + } + encode_xattrs(); + if (conn->has_feature(CEPH_FEATURE_MDS_INLINE_DATA)) { + encode(inline_version, bl); + encode(inline_data, bl); + } + if (conn->has_feature(CEPH_FEATURE_MDS_QUOTA)) { + const mempool_inode *policy_i = ppolicy ? 
pi : oi; + encode(policy_i->quota, bl); + } + if (conn->has_feature(CEPH_FEATURE_FS_FILE_LAYOUT_V2)) { + encode(layout.pool_ns, bl); + } + if (conn->has_feature(CEPH_FEATURE_FS_BTIME)) { + encode(any_i->btime, bl); + encode(any_i->change_attr, bl); + } + } + + return valid; +} + +void CInode::encode_cap_message(const ref_t<MClientCaps> &m, Capability *cap) +{ + ceph_assert(cap); + + client_t client = cap->get_client(); + + bool pfile = filelock.is_xlocked_by_client(client) || (cap->issued() & CEPH_CAP_FILE_EXCL); + bool pauth = authlock.is_xlocked_by_client(client); + bool plink = linklock.is_xlocked_by_client(client); + bool pxattr = xattrlock.is_xlocked_by_client(client); + + const mempool_inode *oi = get_inode().get(); + const mempool_inode *pi = get_projected_inode().get(); + const mempool_inode *i = (pfile|pauth|plink|pxattr) ? pi : oi; + + dout(20) << __func__ << " pfile " << pfile + << " pauth " << pauth << " plink " << plink << " pxattr " << pxattr + << " mtime " << i->mtime << " ctime " << i->ctime << " change_attr " << i->change_attr << dendl; + + i = pfile ? pi:oi; + m->set_layout(i->layout); + m->size = i->size; + m->truncate_seq = i->truncate_seq; + m->truncate_size = i->truncate_size; + m->fscrypt_file = i->fscrypt_file; + m->fscrypt_auth = i->fscrypt_auth; + m->mtime = i->mtime; + m->atime = i->atime; + m->ctime = i->ctime; + m->btime = i->btime; + m->change_attr = i->change_attr; + m->time_warp_seq = i->time_warp_seq; + m->nfiles = i->dirstat.nfiles; + m->nsubdirs = i->dirstat.nsubdirs; + + if (cap->client_inline_version < i->inline_data.version) { + m->inline_version = cap->client_inline_version = i->inline_data.version; + if (i->inline_data.length() > 0) + i->inline_data.get_data(m->inline_data); + } else { + m->inline_version = 0; + } + + // max_size is min of projected, actual. + uint64_t oldms = oi->get_client_range(client); + uint64_t newms = pi->get_client_range(client); + m->max_size = std::min(oldms, newms); + + i = pauth ? pi:oi; + m->head.mode = i->mode; + m->head.uid = i->uid; + m->head.gid = i->gid; + + i = plink ? pi:oi; + m->head.nlink = i->nlink; + + using ceph::encode; + i = pxattr ? pi:oi; + const auto& ix = pxattr ? 
get_projected_xattrs() : get_xattrs(); + if ((cap->pending() & CEPH_CAP_XATTR_SHARED) && + i->xattr_version > cap->client_xattr_version) { + dout(10) << " including xattrs v " << i->xattr_version << dendl; + if (ix) + encode(*ix, m->xattrbl); + else + encode((__u32)0, m->xattrbl); + m->head.xattr_version = i->xattr_version; + cap->client_xattr_version = i->xattr_version; + } +} + + + +void CInode::_encode_base(bufferlist& bl, uint64_t features) +{ + ENCODE_START(1, 1, bl); + encode(first, bl); + encode(*get_inode(), bl, features); + encode(symlink, bl); + encode(dirfragtree, bl); + encode_xattrs(bl); + encode_old_inodes(bl, features); + encode(damage_flags, bl); + encode_snap(bl); + ENCODE_FINISH(bl); +} +void CInode::_decode_base(bufferlist::const_iterator& p) +{ + DECODE_START(1, p); + decode(first, p); + { + auto _inode = allocate_inode(); + decode(*_inode, p); + reset_inode(std::move(_inode)); + } + { + std::string tmp; + decode(tmp, p); + symlink = std::string_view(tmp); + } + decode(dirfragtree, p); + decode_xattrs(p); + decode_old_inodes(p); + decode(damage_flags, p); + decode_snap(p); + DECODE_FINISH(p); +} + +void CInode::_encode_locks_full(bufferlist& bl) +{ + using ceph::encode; + encode(authlock, bl); + encode(linklock, bl); + encode(dirfragtreelock, bl); + encode(filelock, bl); + encode(xattrlock, bl); + encode(snaplock, bl); + encode(nestlock, bl); + encode(flocklock, bl); + encode(policylock, bl); + + encode(loner_cap, bl); +} +void CInode::_decode_locks_full(bufferlist::const_iterator& p) +{ + using ceph::decode; + decode(authlock, p); + decode(linklock, p); + decode(dirfragtreelock, p); + decode(filelock, p); + decode(xattrlock, p); + decode(snaplock, p); + decode(nestlock, p); + decode(flocklock, p); + decode(policylock, p); + + decode(loner_cap, p); + set_loner_cap(loner_cap); + want_loner_cap = loner_cap; // for now, we'll eval() shortly. +} + +void CInode::_encode_locks_state_for_replica(bufferlist& bl, bool need_recover) +{ + ENCODE_START(1, 1, bl); + authlock.encode_state_for_replica(bl); + linklock.encode_state_for_replica(bl); + dirfragtreelock.encode_state_for_replica(bl); + filelock.encode_state_for_replica(bl); + nestlock.encode_state_for_replica(bl); + xattrlock.encode_state_for_replica(bl); + snaplock.encode_state_for_replica(bl); + flocklock.encode_state_for_replica(bl); + policylock.encode_state_for_replica(bl); + encode(need_recover, bl); + ENCODE_FINISH(bl); +} + +void CInode::_encode_locks_state_for_rejoin(bufferlist& bl, int rep) +{ + authlock.encode_state_for_replica(bl); + linklock.encode_state_for_replica(bl); + dirfragtreelock.encode_state_for_rejoin(bl, rep); + filelock.encode_state_for_rejoin(bl, rep); + nestlock.encode_state_for_rejoin(bl, rep); + xattrlock.encode_state_for_replica(bl); + snaplock.encode_state_for_replica(bl); + flocklock.encode_state_for_replica(bl); + policylock.encode_state_for_replica(bl); +} + +void CInode::_decode_locks_state_for_replica(bufferlist::const_iterator& p, bool is_new) +{ + DECODE_START(1, p); + authlock.decode_state(p, is_new); + linklock.decode_state(p, is_new); + dirfragtreelock.decode_state(p, is_new); + filelock.decode_state(p, is_new); + nestlock.decode_state(p, is_new); + xattrlock.decode_state(p, is_new); + snaplock.decode_state(p, is_new); + flocklock.decode_state(p, is_new); + policylock.decode_state(p, is_new); + + bool need_recover; + decode(need_recover, p); + if (need_recover && is_new) { + // Auth mds replicated this inode while it's recovering. 
Auth mds may take xlock on the lock + // and change the object when replaying unsafe requests. + authlock.mark_need_recover(); + linklock.mark_need_recover(); + dirfragtreelock.mark_need_recover(); + filelock.mark_need_recover(); + nestlock.mark_need_recover(); + xattrlock.mark_need_recover(); + snaplock.mark_need_recover(); + flocklock.mark_need_recover(); + policylock.mark_need_recover(); + } + DECODE_FINISH(p); +} +void CInode::_decode_locks_rejoin(bufferlist::const_iterator& p, MDSContext::vec& waiters, + list<SimpleLock*>& eval_locks, bool survivor) +{ + authlock.decode_state_rejoin(p, waiters, survivor); + linklock.decode_state_rejoin(p, waiters, survivor); + dirfragtreelock.decode_state_rejoin(p, waiters, survivor); + filelock.decode_state_rejoin(p, waiters, survivor); + nestlock.decode_state_rejoin(p, waiters, survivor); + xattrlock.decode_state_rejoin(p, waiters, survivor); + snaplock.decode_state_rejoin(p, waiters, survivor); + flocklock.decode_state_rejoin(p, waiters, survivor); + policylock.decode_state_rejoin(p, waiters, survivor); + + if (!dirfragtreelock.is_stable() && !dirfragtreelock.is_wrlocked()) + eval_locks.push_back(&dirfragtreelock); + if (!filelock.is_stable() && !filelock.is_wrlocked()) + eval_locks.push_back(&filelock); + if (!nestlock.is_stable() && !nestlock.is_wrlocked()) + eval_locks.push_back(&nestlock); +} + + +// IMPORT/EXPORT + +void CInode::encode_export(bufferlist& bl) +{ + ENCODE_START(5, 4, bl); + _encode_base(bl, mdcache->mds->mdsmap->get_up_features()); + + encode(state, bl); + + encode(pop, bl); + + encode(get_replicas(), bl); + + // include scatterlock info for any bounding CDirs + bufferlist bounding; + if (get_inode()->is_dir()) + for (const auto &p : dirfrags) { + CDir *dir = p.second; + if (dir->state_test(CDir::STATE_EXPORTBOUND)) { + encode(p.first, bounding); + encode(dir->get_fnode()->fragstat, bounding); + encode(dir->get_fnode()->accounted_fragstat, bounding); + encode(dir->get_fnode()->rstat, bounding); + encode(dir->get_fnode()->accounted_rstat, bounding); + dout(10) << " encoded fragstat/rstat info for " << *dir << dendl; + } + } + encode(bounding, bl); + + _encode_locks_full(bl); + + _encode_file_locks(bl); + + ENCODE_FINISH(bl); + + get(PIN_TEMPEXPORTING); +} + +void CInode::finish_export() +{ + state &= MASK_STATE_EXPORT_KEPT; + + pop.zero(); + + // just in case! + //dirlock.clear_updated(); + + loner_cap = -1; + + put(PIN_TEMPEXPORTING); +} + +void CInode::decode_import(bufferlist::const_iterator& p, + LogSegment *ls) +{ + DECODE_START(5, p); + + _decode_base(p); + + { + unsigned s; + decode(s, p); + s &= MASK_STATE_EXPORTED; + + set_ephemeral_pin((s & STATE_DISTEPHEMERALPIN), + (s & STATE_RANDEPHEMERALPIN)); + state_set(STATE_AUTH | s); + } + + if (is_dirty()) { + get(PIN_DIRTY); + _mark_dirty(ls); + } + if (is_dirty_parent()) { + get(PIN_DIRTYPARENT); + mark_dirty_parent(ls); + } + + decode(pop, p); + + decode(get_replicas(), p); + if (is_replicated()) + get(PIN_REPLICATED); + replica_nonce = 0; + + // decode fragstat info on bounding cdirs + bufferlist bounding; + decode(bounding, p); + auto q = bounding.cbegin(); + while (!q.end()) { + frag_t fg; + decode(fg, q); + CDir *dir = get_dirfrag(fg); + ceph_assert(dir); // we should have all bounds open + + // Only take the remote's fragstat/rstat if we are non-auth for + // this dirfrag AND the lock is NOT in a scattered (MIX) state. + // We know lock is stable, and MIX is the only state in which + // the inode auth (who sent us this data) may not have the best + // info. 
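+ // For example: when this importer is already auth for the dirfrag, or the
+ // filelock/nestlock is in LOCK_MIX, the encoded fragstat/rstat are decoded
+ // below and then discarded, because the local (or scattered) copy may be newer.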
+ + // HMM: Are there cases where dir->is_auth() is an insufficient + // check because the dirfrag is under migration? That implies + // it is frozen (and in a SYNC or LOCK state). FIXME. + + auto _fnode = CDir::allocate_fnode(*dir->get_fnode()); + if (dir->is_auth() || + filelock.get_state() == LOCK_MIX) { + dout(10) << " skipped fragstat info for " << *dir << dendl; + frag_info_t f; + decode(f, q); + decode(f, q); + } else { + decode(_fnode->fragstat, q); + decode(_fnode->accounted_fragstat, q); + dout(10) << " took fragstat info for " << *dir << dendl; + } + if (dir->is_auth() || + nestlock.get_state() == LOCK_MIX) { + dout(10) << " skipped rstat info for " << *dir << dendl; + nest_info_t n; + decode(n, q); + decode(n, q); + } else { + decode(_fnode->rstat, q); + decode(_fnode->accounted_rstat, q); + dout(10) << " took rstat info for " << *dir << dendl; + } + dir->reset_fnode(std::move(_fnode)); + } + + _decode_locks_full(p); + + _decode_file_locks(p); + + DECODE_FINISH(p); +} + + +void InodeStoreBase::dump(Formatter *f) const +{ + inode->dump(f); + f->dump_string("symlink", symlink); + + f->open_array_section("xattrs"); + if (xattrs) { + for (const auto& [key, val] : *xattrs) { + f->open_object_section("xattr"); + f->dump_string("key", key); + std::string v(val.c_str(), val.length()); + f->dump_string("val", v); + f->close_section(); + } + } + f->close_section(); + f->open_object_section("dirfragtree"); + dirfragtree.dump(f); + f->close_section(); // dirfragtree + + f->open_array_section("old_inodes"); + if (old_inodes) { + for (const auto &p : *old_inodes) { + f->open_object_section("old_inode"); + // The key is the last snapid, the first is in the mempool_old_inode + f->dump_int("last", p.first); + p.second.dump(f); + f->close_section(); // old_inode + } + } + f->close_section(); // old_inodes + + f->dump_unsigned("oldest_snap", oldest_snap); + f->dump_unsigned("damage_flags", damage_flags); +} + +template <> +void decode_json_obj(mempool::mds_co::string& t, JSONObj *obj){ + + t = mempool::mds_co::string(std::string_view(obj->get_data())); +} + +void InodeStoreBase::decode_json(JSONObj *obj) +{ + { + auto _inode = allocate_inode(); + _inode->decode_json(obj); + reset_inode(std::move(_inode)); + } + + JSONDecoder::decode_json("symlink", symlink, obj, true); + // JSONDecoder::decode_json("dirfragtree", dirfragtree, obj, true); // cann't decode it now + // + // + { + mempool_xattr_map tmp; + JSONDecoder::decode_json("xattrs", tmp, xattrs_cb, obj, true); + if (tmp.empty()) + reset_xattrs(xattr_map_ptr()); + else + reset_xattrs(allocate_xattr_map(std::move(tmp))); + } + // JSONDecoder::decode_json("old_inodes", old_inodes, InodeStoreBase::old_indoes_cb, obj, true); // cann't decode old_inodes now + JSONDecoder::decode_json("oldest_snap", oldest_snap.val, obj, true); + JSONDecoder::decode_json("damage_flags", damage_flags, obj, true); + //sr_t srnode; + //JSONDecoder::decode_json("snap_blob", srnode, obj, true); // cann't decode it now + //snap_blob = srnode; +} + +void InodeStoreBase::xattrs_cb(InodeStoreBase::mempool_xattr_map& c, JSONObj *obj){ + + string k; + JSONDecoder::decode_json("key", k, obj, true); + string v; + JSONDecoder::decode_json("val", v, obj, true); + c[k.c_str()] = buffer::copy(v.c_str(), v.size()); +} + +void InodeStoreBase::old_indoes_cb(InodeStoreBase::mempool_old_inode_map& c, JSONObj *obj){ + + snapid_t s; + JSONDecoder::decode_json("last", s.val, obj, true); + InodeStoreBase::mempool_old_inode i; + // i.decode_json(obj); // cann't decode now, simon + c[s] = i; +} 
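+ // Note on the JSON helpers above: InodeStoreBase::dump() emits each xattr as
+ // an object of the form {"key": <name>, "val": <raw bytes>}, which is what
+ // xattrs_cb() expects when decoding. dirfragtree, old_inodes and the snap
+ // blob are dumped but cannot be decoded back yet (see the commented-out
+ // decode_json calls above).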
+ +void InodeStore::generate_test_instances(std::list<InodeStore*> &ls) +{ + InodeStore *populated = new InodeStore; + populated->get_inode()->ino = 0xdeadbeef; + populated->symlink = "rhubarb"; + ls.push_back(populated); +} + +void InodeStoreBare::generate_test_instances(std::list<InodeStoreBare*> &ls) +{ + InodeStoreBare *populated = new InodeStoreBare; + populated->get_inode()->ino = 0xdeadbeef; + populated->symlink = "rhubarb"; + ls.push_back(populated); +} + +void CInode::validate_disk_state(CInode::validated_data *results, + MDSContext *fin) +{ + class ValidationContinuation : public MDSContinuation { + public: + MDSContext *fin; + CInode *in; + CInode::validated_data *results; + bufferlist bl; + CInode *shadow_in; + + enum { + START = 0, + BACKTRACE, + INODE, + DIRFRAGS, + SNAPREALM, + }; + + ValidationContinuation(CInode *i, + CInode::validated_data *data_r, + MDSContext *fin_) : + MDSContinuation(i->mdcache->mds->server), + fin(fin_), + in(i), + results(data_r), + shadow_in(NULL) { + set_callback(START, static_cast<Continuation::stagePtr>(&ValidationContinuation::_start)); + set_callback(BACKTRACE, static_cast<Continuation::stagePtr>(&ValidationContinuation::_backtrace)); + set_callback(INODE, static_cast<Continuation::stagePtr>(&ValidationContinuation::_inode_disk)); + set_callback(DIRFRAGS, static_cast<Continuation::stagePtr>(&ValidationContinuation::_dirfrags)); + } + + ~ValidationContinuation() override { + if (shadow_in) { + delete shadow_in; + in->mdcache->num_shadow_inodes--; + } + } + + /** + * Fetch backtrace and set tag if tag is non-empty + */ + void fetch_backtrace_and_tag(CInode *in, + std::string_view tag, bool is_internal, + Context *fin, int *bt_r, bufferlist *bt) + { + const int64_t pool = in->get_backtrace_pool(); + object_t oid = CInode::get_object_name(in->ino(), frag_t(), ""); + + ObjectOperation fetch; + fetch.getxattr("parent", bt, bt_r); + in->mdcache->mds->objecter->read(oid, object_locator_t(pool), fetch, CEPH_NOSNAP, + NULL, 0, fin); + if (in->mdcache->mds->logger) { + in->mdcache->mds->logger->inc(l_mds_openino_backtrace_fetch); + in->mdcache->mds->logger->inc(l_mds_scrub_backtrace_fetch); + } + + using ceph::encode; + if (!is_internal) { + ObjectOperation scrub_tag; + bufferlist tag_bl; + encode(tag, tag_bl); + scrub_tag.setxattr("scrub_tag", tag_bl); + SnapContext snapc; + in->mdcache->mds->objecter->mutate(oid, object_locator_t(pool), scrub_tag, snapc, + ceph::real_clock::now(), + 0, NULL); + if (in->mdcache->mds->logger) + in->mdcache->mds->logger->inc(l_mds_scrub_set_tag); + } + } + + bool _start(int rval) { + ceph_assert(in->can_auth_pin()); + in->auth_pin(this); + + if (in->is_dirty()) { + MDCache *mdcache = in->mdcache; // For the benefit of dout + auto ino = [this]() { return in->ino(); }; // For the benefit of dout + dout(20) << "validating a dirty CInode; results will be inconclusive" + << dendl; + } + + C_OnFinisher *conf = new C_OnFinisher(get_io_callback(BACKTRACE), + in->mdcache->mds->finisher); + + std::string_view tag = in->scrub_infop->header->get_tag(); + bool is_internal = in->scrub_infop->header->is_internal_tag(); + // Rather than using the usual CInode::fetch_backtrace, + // use a special variant that optionally writes a tag in the same + // operation. + fetch_backtrace_and_tag(in, tag, is_internal, conf, &results->backtrace.ondisk_read_retval, &bl); + return false; + } + + bool _backtrace(int rval) { + // set up basic result reporting and make sure we got the data + results->performed_validation = true; // at least, some of it! 
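+ // Each sub-result (backtrace, inode, raw_stats) records checked/passed/
+ // repaired independently; _done() only reports passed_validation when every
+ // checked member also passed.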
+ results->backtrace.checked = true; + + const int64_t pool = in->get_backtrace_pool(); + inode_backtrace_t& memory_backtrace = results->backtrace.memory_value; + in->build_backtrace(pool, memory_backtrace); + bool equivalent, divergent; + int memory_newer; + + MDCache *mdcache = in->mdcache; // For the benefit of dout + auto ino = [this]() { return in->ino(); }; // For the benefit of dout + + // Ignore rval because it's the result of a FAILOK operation + // from fetch_backtrace_and_tag: the real result is in + // backtrace.ondisk_read_retval + dout(20) << "ondisk_read_retval: " << results->backtrace.ondisk_read_retval << dendl; + if (results->backtrace.ondisk_read_retval != 0) { + results->backtrace.error_str << "failed to read off disk; see retval"; + // we probably have a new unwritten file! + // so skip the backtrace scrub for this entry and say that all's well + if (in->is_mdsdir()){ + dout(20) << "forcing backtrace as passed since mdsdir actually doesn't have backtrace" << dendl; + results->backtrace.passed = true; + } + if (in->is_dirty_parent()) { + dout(20) << "forcing backtrace as passed since inode is dirty parent" << dendl; + results->backtrace.passed = true; + } + goto next; + } + + // extract the backtrace, and compare it to a newly-constructed one + try { + auto p = bl.cbegin(); + using ceph::decode; + decode(results->backtrace.ondisk_value, p); + dout(10) << "decoded " << bl.length() << " bytes of backtrace successfully" << dendl; + } catch (buffer::error&) { + if (results->backtrace.ondisk_read_retval == 0 && rval != 0) { + // Cases where something has clearly gone wrong with the overall + // fetch op, though we didn't get a nonzero rc from the getxattr + // operation. e.g. object missing. + results->backtrace.ondisk_read_retval = rval; + } + results->backtrace.error_str << "failed to decode on-disk backtrace (" + << bl.length() << " bytes)!"; + // we probably have a new unwritten file! 
+ // so skip the backtrace scrub for this entry and say that all's well + if (in->is_dirty_parent()) { + dout(20) << "decode failed; forcing backtrace as passed since " + "inode is dirty parent" << dendl; + results->backtrace.passed = true; + } + + goto next; + } + + memory_newer = memory_backtrace.compare(results->backtrace.ondisk_value, + &equivalent, &divergent); + + if (divergent || memory_newer < 0) { + // we're divergent, or on-disk version is newer + results->backtrace.error_str << "On-disk backtrace is divergent or newer"; + /* if the backtraces are divergent and the link count is 0, then + * most likely its a stray entry that's being purged and things are + * well and there's no reason for alarm + */ + if (divergent && (in->is_dirty_parent() || in->get_inode()->nlink == 0)) { + results->backtrace.passed = true; + dout(20) << "divergent backtraces are acceptable when dn " + "is being purged or has been renamed or moved to a " + "different directory " << *in << dendl; + } + } else { + results->backtrace.passed = true; + } +next: + + if (!results->backtrace.passed && in->scrub_infop->header->get_repair()) { + std::string path; + in->make_path_string(path); + in->mdcache->mds->clog->warn() << "bad backtrace on inode " << in->ino() + << "(" << path << "), rewriting it"; + in->mark_dirty_parent(in->mdcache->mds->mdlog->get_current_segment(), + false); + // Flag that we repaired this BT so that it won't go into damagetable + results->backtrace.repaired = true; + if (in->mdcache->mds->logger) + in->mdcache->mds->logger->inc(l_mds_scrub_backtrace_repaired); + } + + // If the inode's number was free in the InoTable, fix that + // (#15619) + { + InoTable *inotable = mdcache->mds->inotable; + + dout(10) << "scrub: inotable ino = " << in->ino() << dendl; + dout(10) << "scrub: inotable free says " + << inotable->is_marked_free(in->ino()) << dendl; + + if (inotable->is_marked_free(in->ino())) { + LogChannelRef clog = in->mdcache->mds->clog; + clog->error() << "scrub: inode wrongly marked free: " << in->ino(); + + if (in->scrub_infop->header->get_repair()) { + bool repaired = inotable->repair(in->ino()); + if (repaired) { + clog->error() << "inode table repaired for inode: " << in->ino(); + + inotable->save(); + if (in->mdcache->mds->logger) + in->mdcache->mds->logger->inc(l_mds_scrub_inotable_repaired); + } else { + clog->error() << "Cannot repair inotable while other operations" + " are in progress"; + } + } + } + } + + + if (in->is_dir()) { + if (in->mdcache->mds->logger) + in->mdcache->mds->logger->inc(l_mds_scrub_dir_inodes); + return validate_directory_data(); + } else { + if (in->mdcache->mds->logger) + in->mdcache->mds->logger->inc(l_mds_scrub_file_inodes); + // TODO: validate on-disk inode for normal files + return true; + } + } + + bool validate_directory_data() { + ceph_assert(in->is_dir()); + + if (in->is_base()) { + if (!shadow_in) { + shadow_in = new CInode(in->mdcache); + in->mdcache->create_unlinked_system_inode(shadow_in, in->ino(), in->get_inode()->mode); + in->mdcache->num_shadow_inodes++; + } + shadow_in->fetch(get_internal_callback(INODE)); + if (in->mdcache->mds->logger) + in->mdcache->mds->logger->inc(l_mds_scrub_dir_base_inodes); + return false; + } else { + // TODO: validate on-disk inode for non-base directories + if (in->mdcache->mds->logger) + in->mdcache->mds->logger->inc(l_mds_scrub_dirfrag_rstats); + results->inode.passed = true; + return check_dirfrag_rstats(); + } + } + + bool _inode_disk(int rval) { + const auto& si = shadow_in->get_inode(); + const auto& i = 
in->get_inode(); + + results->inode.checked = true; + results->inode.ondisk_read_retval = rval; + results->inode.ondisk_value = *si; + results->inode.memory_value = *i; + + if (si->version > i->version) { + // uh, what? + results->inode.error_str << "On-disk inode is newer than in-memory one; "; + goto next; + } else { + bool divergent = false; + int r = i->compare(*si, &divergent); + results->inode.passed = !divergent && r >= 0; + if (!results->inode.passed) { + results->inode.error_str << + "On-disk inode is divergent or newer than in-memory one; "; + goto next; + } + } +next: + return check_dirfrag_rstats(); + } + + bool check_dirfrag_rstats() { + if (in->has_subtree_root_dirfrag()) { + in->mdcache->rdlock_dirfrags_stats(in, get_internal_callback(DIRFRAGS)); + return false; + } else { + return immediate(DIRFRAGS, 0); + } + } + + bool _dirfrags(int rval) { + // basic reporting setup + results->raw_stats.checked = true; + results->raw_stats.ondisk_read_retval = rval; + + results->raw_stats.memory_value.dirstat = in->get_inode()->dirstat; + results->raw_stats.memory_value.rstat = in->get_inode()->rstat; + frag_info_t& dir_info = results->raw_stats.ondisk_value.dirstat; + nest_info_t& nest_info = results->raw_stats.ondisk_value.rstat; + + if (rval != 0) { + results->raw_stats.error_str << "Failed to read dirfrags off disk"; + goto next; + } + + // check each dirfrag... + for (const auto &p : in->dirfrags) { + CDir *dir = p.second; + ceph_assert(dir->get_version() > 0); + nest_info.add(dir->get_fnode()->accounted_rstat); + dir_info.add(dir->get_fnode()->accounted_fragstat); + } + nest_info.rsubdirs++; // it gets one to account for self + if (const sr_t *srnode = in->get_projected_srnode(); srnode) + nest_info.rsnaps += srnode->snaps.size(); + + // ...and that their sum matches our inode settings + if (!dir_info.same_sums(in->get_inode()->dirstat) || + !nest_info.same_sums(in->get_inode()->rstat)) { + if (in->scrub_infop->header->get_repair()) { + results->raw_stats.error_str + << "freshly-calculated rstats don't match existing ones (will be fixed)"; + in->mdcache->repair_inode_stats(in); + results->raw_stats.repaired = true; + } else { + results->raw_stats.error_str + << "freshly-calculated rstats don't match existing ones"; + } + if (in->is_dirty()) { + MDCache *mdcache = in->mdcache; // for dout() + auto ino = [this]() { return in->ino(); }; // for dout() + dout(20) << "raw stats most likely wont match since inode is dirty; " + "please rerun scrub when system is stable; " + "assuming passed for now;" << dendl; + results->raw_stats.passed = true; + } + goto next; + } + + results->raw_stats.passed = true; + { + MDCache *mdcache = in->mdcache; // for dout() + auto ino = [this]() { return in->ino(); }; // for dout() + dout(20) << "raw stats check passed on " << *in << dendl; + } + +next: + return true; + } + + void _done() override { + if ((!results->raw_stats.checked || results->raw_stats.passed) && + (!results->backtrace.checked || results->backtrace.passed) && + (!results->inode.checked || results->inode.passed)) + results->passed_validation = true; + + // Flag that we did some repair work so that our repair operation + // can be flushed at end of scrub + if (results->backtrace.repaired || + results->inode.repaired || + results->raw_stats.repaired) + in->scrub_infop->header->set_repaired(); + if (fin) + fin->complete(get_rval()); + + in->auth_unpin(this); + } + }; + + + dout(10) << "scrub starting validate_disk_state on " << *this << dendl; + ValidationContinuation *vc = new 
ValidationContinuation(this, + results, + fin); + vc->begin(); +} + +void CInode::validated_data::dump(Formatter *f) const +{ + f->open_object_section("results"); + { + f->dump_bool("performed_validation", performed_validation); + f->dump_bool("passed_validation", passed_validation); + f->open_object_section("backtrace"); + { + f->dump_bool("checked", backtrace.checked); + f->dump_bool("passed", backtrace.passed); + f->dump_int("read_ret_val", backtrace.ondisk_read_retval); + f->dump_stream("ondisk_value") << backtrace.ondisk_value; + f->dump_stream("memoryvalue") << backtrace.memory_value; + f->dump_string("error_str", backtrace.error_str.str()); + } + f->close_section(); // backtrace + f->open_object_section("raw_stats"); + { + f->dump_bool("checked", raw_stats.checked); + f->dump_bool("passed", raw_stats.passed); + f->dump_int("read_ret_val", raw_stats.ondisk_read_retval); + f->dump_stream("ondisk_value.dirstat") << raw_stats.ondisk_value.dirstat; + f->dump_stream("ondisk_value.rstat") << raw_stats.ondisk_value.rstat; + f->dump_stream("memory_value.dirstat") << raw_stats.memory_value.dirstat; + f->dump_stream("memory_value.rstat") << raw_stats.memory_value.rstat; + f->dump_string("error_str", raw_stats.error_str.str()); + } + f->close_section(); // raw_stats + // dump failure return code + int rc = 0; + if (backtrace.checked && backtrace.ondisk_read_retval) + rc = backtrace.ondisk_read_retval; + if (inode.checked && inode.ondisk_read_retval) + rc = inode.ondisk_read_retval; + if (raw_stats.checked && raw_stats.ondisk_read_retval) + rc = raw_stats.ondisk_read_retval; + f->dump_int("return_code", rc); + } + f->close_section(); // results +} + +bool CInode::validated_data::all_damage_repaired() const +{ + bool unrepaired = + (raw_stats.checked && !raw_stats.passed && !raw_stats.repaired) + || + (backtrace.checked && !backtrace.passed && !backtrace.repaired) + || + (inode.checked && !inode.passed && !inode.repaired); + + return !unrepaired; +} + +void CInode::dump(Formatter *f, int flags) const +{ + if (flags & DUMP_PATH) { + std::string path; + make_path_string(path, true); + if (path.empty()) + path = "/"; + f->dump_string("path", path); + } + + if (flags & DUMP_INODE_STORE_BASE) + InodeStoreBase::dump(f); + + if (flags & DUMP_MDS_CACHE_OBJECT) + MDSCacheObject::dump(f); + + if (flags & DUMP_LOCKS) { + f->open_object_section("versionlock"); + versionlock.dump(f); + f->close_section(); + + f->open_object_section("authlock"); + authlock.dump(f); + f->close_section(); + + f->open_object_section("linklock"); + linklock.dump(f); + f->close_section(); + + f->open_object_section("dirfragtreelock"); + dirfragtreelock.dump(f); + f->close_section(); + + f->open_object_section("filelock"); + filelock.dump(f); + f->close_section(); + + f->open_object_section("xattrlock"); + xattrlock.dump(f); + f->close_section(); + + f->open_object_section("snaplock"); + snaplock.dump(f); + f->close_section(); + + f->open_object_section("nestlock"); + nestlock.dump(f); + f->close_section(); + + f->open_object_section("flocklock"); + flocklock.dump(f); + f->close_section(); + + f->open_object_section("policylock"); + policylock.dump(f); + f->close_section(); + } + + if (flags & DUMP_STATE) { + f->open_array_section("states"); + MDSCacheObject::dump_states(f); + if (state_test(STATE_EXPORTING)) + f->dump_string("state", "exporting"); + if (state_test(STATE_OPENINGDIR)) + f->dump_string("state", "openingdir"); + if (state_test(STATE_FREEZING)) + f->dump_string("state", "freezing"); + if (state_test(STATE_FROZEN)) + 
f->dump_string("state", "frozen"); + if (state_test(STATE_AMBIGUOUSAUTH)) + f->dump_string("state", "ambiguousauth"); + if (state_test(STATE_EXPORTINGCAPS)) + f->dump_string("state", "exportingcaps"); + if (state_test(STATE_NEEDSRECOVER)) + f->dump_string("state", "needsrecover"); + if (state_test(STATE_PURGING)) + f->dump_string("state", "purging"); + if (state_test(STATE_DIRTYPARENT)) + f->dump_string("state", "dirtyparent"); + if (state_test(STATE_DIRTYRSTAT)) + f->dump_string("state", "dirtyrstat"); + if (state_test(STATE_STRAYPINNED)) + f->dump_string("state", "straypinned"); + if (state_test(STATE_FROZENAUTHPIN)) + f->dump_string("state", "frozenauthpin"); + if (state_test(STATE_DIRTYPOOL)) + f->dump_string("state", "dirtypool"); + if (state_test(STATE_ORPHAN)) + f->dump_string("state", "orphan"); + if (state_test(STATE_MISSINGOBJS)) + f->dump_string("state", "missingobjs"); + f->close_section(); + } + + if (flags & DUMP_CAPS) { + f->open_array_section("client_caps"); + for (const auto &p : client_caps) { + auto &client = p.first; + auto cap = &p.second; + f->open_object_section("client_cap"); + f->dump_int("client_id", client.v); + f->dump_string("pending", ccap_string(cap->pending())); + f->dump_string("issued", ccap_string(cap->issued())); + f->dump_string("wanted", ccap_string(cap->wanted())); + f->dump_int("last_sent", cap->get_last_seq()); + f->close_section(); + } + f->close_section(); + + f->dump_int("loner", loner_cap.v); + f->dump_int("want_loner", want_loner_cap.v); + + f->open_array_section("mds_caps_wanted"); + for (const auto &p : mds_caps_wanted) { + f->open_object_section("mds_cap_wanted"); + f->dump_int("rank", p.first); + f->dump_string("cap", ccap_string(p.second)); + f->close_section(); + } + f->close_section(); + } + + if (flags & DUMP_DIRFRAGS) { + f->open_array_section("dirfrags"); + auto&& dfs = get_dirfrags(); + for(const auto &dir: dfs) { + f->open_object_section("dir"); + dir->dump(f, CDir::DUMP_DEFAULT | CDir::DUMP_ITEMS); + dir->check_rstats(); + f->close_section(); + } + f->close_section(); + } +} + +/****** Scrub Stuff *****/ +void CInode::scrub_info_create() const +{ + dout(25) << __func__ << dendl; + ceph_assert(!scrub_infop); + + // break out of const-land to set up implicit initial state + CInode *me = const_cast<CInode*>(this); + const auto& pi = me->get_projected_inode(); + + std::unique_ptr<scrub_info_t> si(new scrub_info_t()); + si->last_scrub_stamp = pi->last_scrub_stamp; + si->last_scrub_version = pi->last_scrub_version; + + me->scrub_infop.swap(si); +} + +void CInode::scrub_maybe_delete_info() +{ + if (scrub_infop && + !scrub_infop->scrub_in_progress && + !scrub_infop->last_scrub_dirty) { + scrub_infop.reset(); + } +} + +void CInode::scrub_initialize(ScrubHeaderRef& header) +{ + dout(20) << __func__ << " with scrub_version " << get_version() << dendl; + + scrub_info(); + scrub_infop->scrub_in_progress = true; + scrub_infop->queued_frags.clear(); + scrub_infop->header = header; + header->inc_num_pending(); + // right now we don't handle remote inodes +} + +void CInode::scrub_aborted() { + dout(20) << __func__ << dendl; + ceph_assert(scrub_is_in_progress()); + + scrub_infop->scrub_in_progress = false; + scrub_infop->header->dec_num_pending(); + scrub_maybe_delete_info(); +} + +void CInode::scrub_finished() { + dout(20) << __func__ << dendl; + ceph_assert(scrub_is_in_progress()); + + scrub_infop->last_scrub_version = get_version(); + scrub_infop->last_scrub_stamp = ceph_clock_now(); + scrub_infop->last_scrub_dirty = true; + 
scrub_infop->scrub_in_progress = false; + scrub_infop->header->dec_num_pending(); +} + +int64_t CInode::get_backtrace_pool() const +{ + if (is_dir()) { + return mdcache->mds->get_metadata_pool(); + } else { + // Files are required to have an explicit layout that specifies + // a pool + ceph_assert(get_inode()->layout.pool_id != -1); + return get_inode()->layout.pool_id; + } +} + +void CInode::queue_export_pin(mds_rank_t export_pin) +{ + if (state_test(CInode::STATE_QUEUEDEXPORTPIN)) + return; + + mds_rank_t target; + if (export_pin >= 0) + target = export_pin; + else if (export_pin == MDS_RANK_EPHEMERAL_RAND) + target = mdcache->hash_into_rank_bucket(ino()); + else + target = MDS_RANK_NONE; + + unsigned min_frag_bits = mdcache->get_ephemeral_dist_frag_bits(); + bool queue = false; + for (auto& p : dirfrags) { + CDir *dir = p.second; + if (!dir->is_auth()) + continue; + + if (export_pin == MDS_RANK_EPHEMERAL_DIST) { + if (dir->get_frag().bits() < min_frag_bits) { + // needs split + queue = true; + break; + } + target = mdcache->hash_into_rank_bucket(ino(), dir->get_frag()); + } + + if (target != MDS_RANK_NONE) { + if (dir->is_subtree_root()) { + // set auxsubtree bit or export it + if (!dir->state_test(CDir::STATE_AUXSUBTREE) || + target != dir->get_dir_auth().first) + queue = true; + } else { + // create aux subtree or export it + queue = true; + } + } else { + // clear aux subtrees ? + queue = dir->state_test(CDir::STATE_AUXSUBTREE); + } + + if (queue) + break; + } + if (queue) { + state_set(CInode::STATE_QUEUEDEXPORTPIN); + mdcache->export_pin_queue.insert(this); + } +} + +void CInode::maybe_export_pin(bool update) +{ + if (!g_conf()->mds_bal_export_pin) + return; + if (!is_dir() || !is_normal()) + return; + + dout(15) << __func__ << " update=" << update << " " << *this << dendl; + + mds_rank_t export_pin = get_export_pin(false); + if (export_pin == MDS_RANK_NONE && !update) + return; + + check_pin_policy(export_pin); + queue_export_pin(export_pin); +} + +void CInode::set_ephemeral_pin(bool dist, bool rand) +{ + unsigned state = 0; + if (dist) + state |= STATE_DISTEPHEMERALPIN; + if (rand) + state |= STATE_RANDEPHEMERALPIN; + if (!state) + return; + + if (state_test(state) != state) { + dout(10) << "set ephemeral (" << (dist ? "dist" : "") + << (rand ? " rand" : "") << ") pin on " << *this << dendl; + if (!is_ephemerally_pinned()) { + auto p = mdcache->export_ephemeral_pins.insert(this); + ceph_assert(p.second); + } + state_set(state); + } +} + +void CInode::clear_ephemeral_pin(bool dist, bool rand) +{ + unsigned state = 0; + if (dist) + state |= STATE_DISTEPHEMERALPIN; + if (rand) + state |= STATE_RANDEPHEMERALPIN; + + if (state_test(state)) { + dout(10) << "clear ephemeral (" << (dist ? "dist" : "") + << (rand ? 
" rand" : "") << ") pin on " << *this << dendl; + state_clear(state); + if (!is_ephemerally_pinned()) { + auto count = mdcache->export_ephemeral_pins.erase(this); + ceph_assert(count == 1); + } + } +} + +void CInode::maybe_ephemeral_rand(double threshold) +{ + if (!mdcache->get_export_ephemeral_random_config()) { + dout(15) << __func__ << " config false: cannot ephemeral random pin " << *this << dendl; + clear_ephemeral_pin(false, true); + return; + } else if (!is_dir() || !is_normal()) { + dout(15) << __func__ << " !dir or !normal: cannot ephemeral random pin " << *this << dendl; + clear_ephemeral_pin(false, true); + return; + } else if (get_inode()->nlink == 0) { + dout(15) << __func__ << " unlinked directory: cannot ephemeral random pin " << *this << dendl; + clear_ephemeral_pin(false, true); + return; + } else if (state_test(CInode::STATE_RANDEPHEMERALPIN)) { + dout(10) << __func__ << " already ephemeral random pinned: requeueing " << *this << dendl; + queue_export_pin(MDS_RANK_EPHEMERAL_RAND); + return; + } + + /* not precomputed? */ + if (threshold < 0.0) { + threshold = get_ephemeral_rand(); + } + if (threshold <= 0.0) { + return; + } + double n = ceph::util::generate_random_number(0.0, 1.0); + + dout(15) << __func__ << " rand " << n << " <?= " << threshold + << " " << *this << dendl; + + if (n <= threshold) { + dout(10) << __func__ << " randomly export pinning " << *this << dendl; + set_ephemeral_pin(false, true); + queue_export_pin(MDS_RANK_EPHEMERAL_RAND); + } +} + +void CInode::setxattr_ephemeral_rand(double probability) +{ + ceph_assert(is_dir()); + _get_projected_inode()->export_ephemeral_random_pin = probability; +} + +void CInode::setxattr_ephemeral_dist(bool val) +{ + ceph_assert(is_dir()); + _get_projected_inode()->export_ephemeral_distributed_pin = val; +} + +void CInode::set_export_pin(mds_rank_t rank) +{ + ceph_assert(is_dir()); + _get_projected_inode()->export_pin = rank; + maybe_export_pin(true); +} + +mds_rank_t CInode::get_export_pin(bool inherit) const +{ + if (!g_conf()->mds_bal_export_pin) + return MDS_RANK_NONE; + + /* An inode that is export pinned may not necessarily be a subtree root, we + * need to traverse the parents. A base or system inode cannot be pinned. + * N.B. inodes not yet linked into a dir (i.e. anonymous inodes) will not + * have a parent yet. + */ + mds_rank_t r_target = MDS_RANK_NONE; + const CInode *in = this; + const CDir *dir = nullptr; + while (true) { + if (in->is_system()) + break; + const CDentry *pdn = in->get_parent_dn(); + if (!pdn) + break; + if (in->get_inode()->nlink == 0) { + // ignore export pin for unlinked directory + break; + } + + if (in->get_inode()->export_pin >= 0) { + return in->get_inode()->export_pin; + } else if (in->get_inode()->export_ephemeral_distributed_pin && + mdcache->get_export_ephemeral_distributed_config()) { + if (in != this) + return mdcache->hash_into_rank_bucket(in->ino(), dir->get_frag()); + return MDS_RANK_EPHEMERAL_DIST; + } else if (r_target != MDS_RANK_NONE && in->get_inode()->export_ephemeral_random_pin > 0.0) { + return r_target; + } else if (r_target == MDS_RANK_NONE && in->is_ephemeral_rand() && + mdcache->get_export_ephemeral_random_config()) { + /* If a parent overrides a grandparent ephemeral pin policy with an export pin, we use that export pin instead. 
*/ + if (!inherit) + return MDS_RANK_EPHEMERAL_RAND; + if (in == this) + r_target = MDS_RANK_EPHEMERAL_RAND; + else + r_target = mdcache->hash_into_rank_bucket(in->ino()); + } + + if (!inherit) + break; + dir = pdn->get_dir(); + in = dir->inode; + } + return MDS_RANK_NONE; +} + +void CInode::check_pin_policy(mds_rank_t export_pin) +{ + if (export_pin == MDS_RANK_EPHEMERAL_DIST) { + set_ephemeral_pin(true, false); + clear_ephemeral_pin(false, true); + } else if (export_pin == MDS_RANK_EPHEMERAL_RAND) { + set_ephemeral_pin(false, true); + clear_ephemeral_pin(true, false); + } else if (is_ephemerally_pinned()) { + // export_pin >= 0 || export_pin == MDS_RANK_NONE + clear_ephemeral_pin(true, true); + if (export_pin != get_inode()->export_pin) // inherited export_pin + queue_export_pin(MDS_RANK_NONE); + } +} + +double CInode::get_ephemeral_rand() const +{ + /* N.B. inodes not yet linked into a dir (i.e. anonymous inodes) will not + * have a parent yet. + */ + const CInode *in = this; + double max = mdcache->export_ephemeral_random_max; + while (true) { + if (in->is_system()) + break; + const CDentry *pdn = in->get_parent_dn(); + if (!pdn) + break; + // ignore export pin for unlinked directory + if (in->get_inode()->nlink == 0) + break; + + if (in->get_inode()->export_ephemeral_random_pin > 0.0) + return std::min(in->get_inode()->export_ephemeral_random_pin, max); + + /* An export_pin overrides only if no closer parent (incl. this one) has a + * random pin set. + */ + if (in->get_inode()->export_pin >= 0 || + in->get_inode()->export_ephemeral_distributed_pin) + return 0.0; + + in = pdn->get_dir()->inode; + } + return 0.0; +} + +void CInode::get_nested_dirfrags(std::vector<CDir*>& v) const +{ + for (const auto &p : dirfrags) { + const auto& dir = p.second; + if (!dir->is_subtree_root()) + v.push_back(dir); + } +} + +void CInode::get_subtree_dirfrags(std::vector<CDir*>& v) const +{ + for (const auto &p : dirfrags) { + const auto& dir = p.second; + if (dir->is_subtree_root()) + v.push_back(dir); + } +} + +MEMPOOL_DEFINE_OBJECT_FACTORY(CInode, co_inode, mds_co); |