diff options
Diffstat (limited to 'src/mds/mdstypes.h')
-rw-r--r-- | src/mds/mdstypes.h | 1938 |
1 files changed, 1938 insertions, 0 deletions
diff --git a/src/mds/mdstypes.h b/src/mds/mdstypes.h new file mode 100644 index 000000000..bfb279108 --- /dev/null +++ b/src/mds/mdstypes.h @@ -0,0 +1,1938 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#ifndef CEPH_MDSTYPES_H +#define CEPH_MDSTYPES_H + +#include "include/int_types.h" + +#include <ostream> +#include <set> +#include <map> +#include <string_view> + +#include "common/config.h" +#include "common/Clock.h" +#include "common/DecayCounter.h" +#include "common/StackStringStream.h" +#include "common/entity_name.h" + +#include "include/compat.h" +#include "include/Context.h" +#include "include/frag.h" +#include "include/xlist.h" +#include "include/interval_set.h" +#include "include/compact_set.h" +#include "include/fs_types.h" +#include "include/ceph_fs.h" + +#include "inode_backtrace.h" + +#include <boost/spirit/include/qi.hpp> +#include <boost/pool/pool.hpp> +#include "include/ceph_assert.h" +#include <boost/serialization/strong_typedef.hpp> +#include "common/ceph_json.h" + +#define CEPH_FS_ONDISK_MAGIC "ceph fs volume v011" + +#define MDS_PORT_CACHE 0x200 +#define MDS_PORT_LOCKER 0x300 +#define MDS_PORT_MIGRATOR 0x400 + +#define MAX_MDS 0x100 +#define NUM_STRAY 10 + +// Inode numbers 1,2 and 4 please see CEPH_INO_* in include/ceph_fs.h + +#define MDS_INO_MDSDIR_OFFSET (1*MAX_MDS) +#define MDS_INO_STRAY_OFFSET (6*MAX_MDS) + +// Locations for journal data +#define MDS_INO_LOG_OFFSET (2*MAX_MDS) +#define MDS_INO_LOG_BACKUP_OFFSET (3*MAX_MDS) +#define MDS_INO_LOG_POINTER_OFFSET (4*MAX_MDS) +#define MDS_INO_PURGE_QUEUE (5*MAX_MDS) + +#define MDS_INO_SYSTEM_BASE ((6*MAX_MDS) + (MAX_MDS * NUM_STRAY)) + +#define MDS_INO_STRAY(x,i) (MDS_INO_STRAY_OFFSET+((((unsigned)(x))*NUM_STRAY)+((unsigned)(i)))) +#define MDS_INO_MDSDIR(x) (MDS_INO_MDSDIR_OFFSET+((unsigned)x)) + +#define MDS_INO_IS_STRAY(i) ((i) >= MDS_INO_STRAY_OFFSET && (i) < (MDS_INO_STRAY_OFFSET+(MAX_MDS*NUM_STRAY))) +#define MDS_INO_IS_MDSDIR(i) ((i) 
>= MDS_INO_MDSDIR_OFFSET && (i) < (MDS_INO_MDSDIR_OFFSET+MAX_MDS)) +#define MDS_INO_MDSDIR_OWNER(i) (signed ((unsigned (i)) - MDS_INO_MDSDIR_OFFSET)) +#define MDS_INO_IS_BASE(i) ((i) == CEPH_INO_ROOT || (i) == CEPH_INO_GLOBAL_SNAPREALM || MDS_INO_IS_MDSDIR(i)) +#define MDS_INO_STRAY_OWNER(i) (signed (((unsigned (i)) - MDS_INO_STRAY_OFFSET) / NUM_STRAY)) +#define MDS_INO_STRAY_INDEX(i) (((unsigned (i)) - MDS_INO_STRAY_OFFSET) % NUM_STRAY) + +#define MDS_IS_PRIVATE_INO(i) ((i) < MDS_INO_SYSTEM_BASE && (i) >= MDS_INO_MDSDIR_OFFSET) + +typedef int32_t mds_rank_t; +constexpr mds_rank_t MDS_RANK_NONE = -1; +constexpr mds_rank_t MDS_RANK_EPHEMERAL_DIST = -2; +constexpr mds_rank_t MDS_RANK_EPHEMERAL_RAND = -3; + +BOOST_STRONG_TYPEDEF(uint64_t, mds_gid_t) +extern const mds_gid_t MDS_GID_NONE; + +typedef int32_t fs_cluster_id_t; +constexpr fs_cluster_id_t FS_CLUSTER_ID_NONE = -1; + +// The namespace ID of the anonymous default filesystem from legacy systems +constexpr fs_cluster_id_t FS_CLUSTER_ID_ANONYMOUS = 0; + +class mds_role_t { +public: + mds_role_t(fs_cluster_id_t fscid_, mds_rank_t rank_) + : fscid(fscid_), rank(rank_) + {} + mds_role_t() {} + + bool operator<(mds_role_t const &rhs) const { + if (fscid < rhs.fscid) { + return true; + } else if (fscid == rhs.fscid) { + return rank < rhs.rank; + } else { + return false; + } + } + + bool is_none() const { + return (rank == MDS_RANK_NONE); + } + + fs_cluster_id_t fscid = FS_CLUSTER_ID_NONE; + mds_rank_t rank = MDS_RANK_NONE; +}; +inline std::ostream& operator<<(std::ostream& out, const mds_role_t& role) { + return out << role.fscid << ":" << role.rank; +} + +// CAPS +inline std::string gcap_string(int cap) +{ + std::string s; + if (cap & CEPH_CAP_GSHARED) s += "s"; + if (cap & CEPH_CAP_GEXCL) s += "x"; + if (cap & CEPH_CAP_GCACHE) s += "c"; + if (cap & CEPH_CAP_GRD) s += "r"; + if (cap & CEPH_CAP_GWR) s += "w"; + if (cap & CEPH_CAP_GBUFFER) s += "b"; + if (cap & CEPH_CAP_GWREXTEND) s += "a"; + if (cap & CEPH_CAP_GLAZYIO) 
s += "l"; + return s; +} +inline std::string ccap_string(int cap) +{ + std::string s; + if (cap & CEPH_CAP_PIN) s += "p"; + + int a = (cap >> CEPH_CAP_SAUTH) & 3; + if (a) s += 'A' + gcap_string(a); + + a = (cap >> CEPH_CAP_SLINK) & 3; + if (a) s += 'L' + gcap_string(a); + + a = (cap >> CEPH_CAP_SXATTR) & 3; + if (a) s += 'X' + gcap_string(a); + + a = cap >> CEPH_CAP_SFILE; + if (a) s += 'F' + gcap_string(a); + + if (s.length() == 0) + s = "-"; + return s; +} + +struct scatter_info_t { + version_t version = 0; +}; + +struct frag_info_t : public scatter_info_t { + int64_t size() const { return nfiles + nsubdirs; } + + void zero() { + *this = frag_info_t(); + } + + // *this += cur - acc; + void add_delta(const frag_info_t &cur, const frag_info_t &acc, bool *touched_mtime=0, bool *touched_chattr=0) { + if (cur.mtime > mtime) { + mtime = cur.mtime; + if (touched_mtime) + *touched_mtime = true; + } + if (cur.change_attr > change_attr) { + change_attr = cur.change_attr; + if (touched_chattr) + *touched_chattr = true; + } + nfiles += cur.nfiles - acc.nfiles; + nsubdirs += cur.nsubdirs - acc.nsubdirs; + } + + void add(const frag_info_t& other) { + if (other.mtime > mtime) + mtime = other.mtime; + if (other.change_attr > change_attr) + change_attr = other.change_attr; + nfiles += other.nfiles; + nsubdirs += other.nsubdirs; + } + + bool same_sums(const frag_info_t &o) const { + return mtime <= o.mtime && + nfiles == o.nfiles && + nsubdirs == o.nsubdirs; + } + + void encode(ceph::buffer::list &bl) const; + void decode(ceph::buffer::list::const_iterator& bl); + void dump(ceph::Formatter *f) const; + void decode_json(JSONObj *obj); + static void generate_test_instances(std::list<frag_info_t*>& ls); + + // this frag + utime_t mtime; + uint64_t change_attr = 0; + int64_t nfiles = 0; // files + int64_t nsubdirs = 0; // subdirs +}; +WRITE_CLASS_ENCODER(frag_info_t) + +inline bool operator==(const frag_info_t &l, const frag_info_t &r) { + return memcmp(&l, &r, sizeof(l)) == 0; +} 
+inline bool operator!=(const frag_info_t &l, const frag_info_t &r) { + return !(l == r); +} + +std::ostream& operator<<(std::ostream &out, const frag_info_t &f); + + +struct nest_info_t : public scatter_info_t { + int64_t rsize() const { return rfiles + rsubdirs; } + + void zero() { + *this = nest_info_t(); + } + + void sub(const nest_info_t &other) { + add(other, -1); + } + void add(const nest_info_t &other, int fac=1) { + if (other.rctime > rctime) + rctime = other.rctime; + rbytes += fac*other.rbytes; + rfiles += fac*other.rfiles; + rsubdirs += fac*other.rsubdirs; + rsnaps += fac*other.rsnaps; + } + + // *this += cur - acc; + void add_delta(const nest_info_t &cur, const nest_info_t &acc) { + if (cur.rctime > rctime) + rctime = cur.rctime; + rbytes += cur.rbytes - acc.rbytes; + rfiles += cur.rfiles - acc.rfiles; + rsubdirs += cur.rsubdirs - acc.rsubdirs; + rsnaps += cur.rsnaps - acc.rsnaps; + } + + bool same_sums(const nest_info_t &o) const { + return rctime <= o.rctime && + rbytes == o.rbytes && + rfiles == o.rfiles && + rsubdirs == o.rsubdirs && + rsnaps == o.rsnaps; + } + + void encode(ceph::buffer::list &bl) const; + void decode(ceph::buffer::list::const_iterator& bl); + void dump(ceph::Formatter *f) const; + void decode_json(JSONObj *obj); + static void generate_test_instances(std::list<nest_info_t*>& ls); + + // this frag + children + utime_t rctime; + int64_t rbytes = 0; + int64_t rfiles = 0; + int64_t rsubdirs = 0; + int64_t rsnaps = 0; +}; +WRITE_CLASS_ENCODER(nest_info_t) + +inline bool operator==(const nest_info_t &l, const nest_info_t &r) { + return memcmp(&l, &r, sizeof(l)) == 0; +} +inline bool operator!=(const nest_info_t &l, const nest_info_t &r) { + return !(l == r); +} + +std::ostream& operator<<(std::ostream &out, const nest_info_t &n); + +struct vinodeno_t { + vinodeno_t() {} + vinodeno_t(inodeno_t i, snapid_t s) : ino(i), snapid(s) {} + + void encode(ceph::buffer::list& bl) const { + using ceph::encode; + encode(ino, bl); + encode(snapid, 
bl); + } + void decode(ceph::buffer::list::const_iterator& p) { + using ceph::decode; + decode(ino, p); + decode(snapid, p); + } + + inodeno_t ino; + snapid_t snapid; +}; +WRITE_CLASS_ENCODER(vinodeno_t) + +inline bool operator==(const vinodeno_t &l, const vinodeno_t &r) { + return l.ino == r.ino && l.snapid == r.snapid; +} +inline bool operator!=(const vinodeno_t &l, const vinodeno_t &r) { + return !(l == r); +} +inline bool operator<(const vinodeno_t &l, const vinodeno_t &r) { + return + l.ino < r.ino || + (l.ino == r.ino && l.snapid < r.snapid); +} + +struct quota_info_t +{ + void encode(ceph::buffer::list& bl) const { + ENCODE_START(1, 1, bl); + encode(max_bytes, bl); + encode(max_files, bl); + ENCODE_FINISH(bl); + } + void decode(ceph::buffer::list::const_iterator& p) { + DECODE_START_LEGACY_COMPAT_LEN(1, 1, 1, p); + decode(max_bytes, p); + decode(max_files, p); + DECODE_FINISH(p); + } + + void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list<quota_info_t *>& ls); + + bool is_valid() const { + return max_bytes >=0 && max_files >=0; + } + bool is_enable() const { + return max_bytes || max_files; + } + void decode_json(JSONObj *obj); + + int64_t max_bytes = 0; + int64_t max_files = 0; +}; +WRITE_CLASS_ENCODER(quota_info_t) + +inline bool operator==(const quota_info_t &l, const quota_info_t &r) { + return memcmp(&l, &r, sizeof(l)) == 0; +} + +std::ostream& operator<<(std::ostream &out, const quota_info_t &n); + +namespace std { + template<> struct hash<vinodeno_t> { + size_t operator()(const vinodeno_t &vino) const { + hash<inodeno_t> H; + hash<uint64_t> I; + return H(vino.ino) ^ I(vino.snapid); + } + }; +} + +inline std::ostream& operator<<(std::ostream &out, const vinodeno_t &vino) { + out << vino.ino; + if (vino.snapid == CEPH_NOSNAP) + out << ".head"; + else if (vino.snapid) + out << '.' 
<< vino.snapid; + return out; +} + +struct client_writeable_range_t { + struct byte_range_t { + uint64_t first = 0, last = 0; // interval client can write to + byte_range_t() {} + void decode_json(JSONObj *obj); + }; + + void encode(ceph::buffer::list &bl) const; + void decode(ceph::buffer::list::const_iterator& bl); + void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list<client_writeable_range_t*>& ls); + + byte_range_t range; + snapid_t follows = 0; // aka "data+metadata flushed thru" +}; + +inline void decode(client_writeable_range_t::byte_range_t& range, ceph::buffer::list::const_iterator& bl) { + using ceph::decode; + decode(range.first, bl); + decode(range.last, bl); +} + +WRITE_CLASS_ENCODER(client_writeable_range_t) + +std::ostream& operator<<(std::ostream& out, const client_writeable_range_t& r); + +inline bool operator==(const client_writeable_range_t& l, + const client_writeable_range_t& r) { + return l.range.first == r.range.first && l.range.last == r.range.last && + l.follows == r.follows; +} + +struct inline_data_t { +public: + inline_data_t() {} + inline_data_t(const inline_data_t& o) : version(o.version) { + if (o.blp) + set_data(*o.blp); + } + inline_data_t& operator=(const inline_data_t& o) { + version = o.version; + if (o.blp) + set_data(*o.blp); + else + free_data(); + return *this; + } + + void free_data() { + blp.reset(); + } + void get_data(ceph::buffer::list& ret) const { + if (blp) + ret = *blp; + else + ret.clear(); + } + void set_data(const ceph::buffer::list& bl) { + if (!blp) + blp.reset(new ceph::buffer::list); + *blp = bl; + } + size_t length() const { return blp ? 
blp->length() : 0; } + + bool operator==(const inline_data_t& o) const { + return length() == o.length() && + (length() == 0 || + (*const_cast<ceph::buffer::list*>(blp.get()) == *const_cast<ceph::buffer::list*>(o.blp.get()))); + } + bool operator!=(const inline_data_t& o) const { + return !(*this == o); + } + void encode(ceph::buffer::list &bl) const; + void decode(ceph::buffer::list::const_iterator& bl); + + version_t version = 1; + +private: + std::unique_ptr<ceph::buffer::list> blp; +}; +WRITE_CLASS_ENCODER(inline_data_t) + +enum { + DAMAGE_STATS, // statistics (dirstat, size, etc) + DAMAGE_RSTATS, // recursive statistics (rstat, accounted_rstat) + DAMAGE_FRAGTREE // fragtree -- repair by searching +}; +typedef uint32_t damage_flags_t; + +template<template<typename> class Allocator = std::allocator> +struct inode_t { + /** + * *************** + * Do not forget to add any new fields to the compare() function. + * *************** + */ + using client_range_map = std::map<client_t,client_writeable_range_t,std::less<client_t>,Allocator<std::pair<const client_t,client_writeable_range_t>>>; + + inode_t() + { + clear_layout(); + } + + // file type + bool is_symlink() const { return (mode & S_IFMT) == S_IFLNK; } + bool is_dir() const { return (mode & S_IFMT) == S_IFDIR; } + bool is_file() const { return (mode & S_IFMT) == S_IFREG; } + + bool is_truncating() const { return (truncate_pending > 0); } + void truncate(uint64_t old_size, uint64_t new_size) { + ceph_assert(new_size < old_size); + if (old_size > max_size_ever) + max_size_ever = old_size; + truncate_from = old_size; + size = new_size; + rstat.rbytes = new_size; + truncate_size = size; + truncate_seq++; + truncate_pending++; + } + + bool has_layout() const { + return layout != file_layout_t(); + } + + void clear_layout() { + layout = file_layout_t(); + } + + uint64_t get_layout_size_increment() const { + return layout.get_period(); + } + + bool is_dirty_rstat() const { return !(rstat == accounted_rstat); } + + 
uint64_t get_client_range(client_t client) const { + auto it = client_ranges.find(client); + return it != client_ranges.end() ? it->second.range.last : 0; + } + + uint64_t get_max_size() const { + uint64_t max = 0; + for (std::map<client_t,client_writeable_range_t>::const_iterator p = client_ranges.begin(); + p != client_ranges.end(); + ++p) + if (p->second.range.last > max) + max = p->second.range.last; + return max; + } + void set_max_size(uint64_t new_max) { + if (new_max == 0) { + client_ranges.clear(); + } else { + for (std::map<client_t,client_writeable_range_t>::iterator p = client_ranges.begin(); + p != client_ranges.end(); + ++p) + p->second.range.last = new_max; + } + } + + void trim_client_ranges(snapid_t last) { + std::map<client_t, client_writeable_range_t>::iterator p = client_ranges.begin(); + while (p != client_ranges.end()) { + if (p->second.follows >= last) + client_ranges.erase(p++); + else + ++p; + } + } + + bool is_backtrace_updated() const { + return backtrace_version == version; + } + void update_backtrace(version_t pv=0) { + backtrace_version = pv ? pv : version; + } + + void add_old_pool(int64_t l) { + backtrace_version = version; + old_pools.insert(l); + } + + void encode(ceph::buffer::list &bl, uint64_t features) const; + void decode(ceph::buffer::list::const_iterator& bl); + void dump(ceph::Formatter *f) const; + static void client_ranges_cb(client_range_map& c, JSONObj *obj); + static void old_pools_cb(compact_set<int64_t, std::less<int64_t>, Allocator<int64_t> >& c, JSONObj *obj); + void decode_json(JSONObj *obj); + static void generate_test_instances(std::list<inode_t*>& ls); + /** + * Compare this inode_t with another that represent *the same inode* + * at different points in time. 
+ * @pre The inodes are the same ino + * + * @param other The inode_t to compare ourselves with + * @param divergent A bool pointer which will be set to true + * if the values are different in a way that can't be explained + * by one being a newer version than the other. + * + * @returns 1 if we are newer than the other, 0 if equal, -1 if older. + */ + int compare(const inode_t &other, bool *divergent) const; + + // base (immutable) + inodeno_t ino = 0; + uint32_t rdev = 0; // if special file + + // affected by any inode change... + utime_t ctime; // inode change time + utime_t btime; // birth time + + // perm (namespace permissions) + uint32_t mode = 0; + uid_t uid = 0; + gid_t gid = 0; + + // nlink + int32_t nlink = 0; + + // file (data access) + ceph_dir_layout dir_layout = {}; // [dir only] + file_layout_t layout; + compact_set<int64_t, std::less<int64_t>, Allocator<int64_t>> old_pools; + uint64_t size = 0; // on directory, # dentries + uint64_t max_size_ever = 0; // max size the file has ever been + uint32_t truncate_seq = 0; + uint64_t truncate_size = 0, truncate_from = 0; + uint32_t truncate_pending = 0; + utime_t mtime; // file data modify time. + utime_t atime; // file data access time. 
+ uint32_t time_warp_seq = 0; // count of (potential) mtime/atime timewarps (i.e., utimes()) + inline_data_t inline_data; // FIXME check + + // change attribute + uint64_t change_attr = 0; + + client_range_map client_ranges; // client(s) can write to these ranges + + // dirfrag, recursive accountin + frag_info_t dirstat; // protected by my filelock + nest_info_t rstat; // protected by my nestlock + nest_info_t accounted_rstat; // protected by parent's nestlock + + quota_info_t quota; + + mds_rank_t export_pin = MDS_RANK_NONE; + + double export_ephemeral_random_pin = 0; + bool export_ephemeral_distributed_pin = false; + + // special stuff + version_t version = 0; // auth only + version_t file_data_version = 0; // auth only + version_t xattr_version = 0; + + utime_t last_scrub_stamp; // start time of last complete scrub + version_t last_scrub_version = 0;// (parent) start version of last complete scrub + + version_t backtrace_version = 0; + + snapid_t oldest_snap; + + std::basic_string<char,std::char_traits<char>,Allocator<char>> stray_prior_path; //stores path before unlink + + bool fscrypt = false; // fscrypt enabled ? 
+ +private: + bool older_is_consistent(const inode_t &other) const; +}; + +// These methods may be moved back to mdstypes.cc when we have pmr +template<template<typename> class Allocator> +void inode_t<Allocator>::encode(ceph::buffer::list &bl, uint64_t features) const +{ + ENCODE_START(17, 6, bl); + + encode(ino, bl); + encode(rdev, bl); + encode(ctime, bl); + + encode(mode, bl); + encode(uid, bl); + encode(gid, bl); + + encode(nlink, bl); + { + // removed field + bool anchored = 0; + encode(anchored, bl); + } + + encode(dir_layout, bl); + encode(layout, bl, features); + encode(size, bl); + encode(truncate_seq, bl); + encode(truncate_size, bl); + encode(truncate_from, bl); + encode(truncate_pending, bl); + encode(mtime, bl); + encode(atime, bl); + encode(time_warp_seq, bl); + encode(client_ranges, bl); + + encode(dirstat, bl); + encode(rstat, bl); + encode(accounted_rstat, bl); + + encode(version, bl); + encode(file_data_version, bl); + encode(xattr_version, bl); + encode(backtrace_version, bl); + encode(old_pools, bl); + encode(max_size_ever, bl); + encode(inline_data, bl); + encode(quota, bl); + + encode(stray_prior_path, bl); + + encode(last_scrub_version, bl); + encode(last_scrub_stamp, bl); + + encode(btime, bl); + encode(change_attr, bl); + + encode(export_pin, bl); + + encode(export_ephemeral_random_pin, bl); + encode(export_ephemeral_distributed_pin, bl); + + encode(fscrypt, bl); + + ENCODE_FINISH(bl); +} + +template<template<typename> class Allocator> +void inode_t<Allocator>::decode(ceph::buffer::list::const_iterator &p) +{ + DECODE_START_LEGACY_COMPAT_LEN(17, 6, 6, p); + + decode(ino, p); + decode(rdev, p); + decode(ctime, p); + + decode(mode, p); + decode(uid, p); + decode(gid, p); + + decode(nlink, p); + { + bool anchored; + decode(anchored, p); + } + + if (struct_v >= 4) + decode(dir_layout, p); + else { + // FIPS zeroization audit 20191117: this memset is not security related. 
+ memset(&dir_layout, 0, sizeof(dir_layout)); + } + decode(layout, p); + decode(size, p); + decode(truncate_seq, p); + decode(truncate_size, p); + decode(truncate_from, p); + if (struct_v >= 5) + decode(truncate_pending, p); + else + truncate_pending = 0; + decode(mtime, p); + decode(atime, p); + decode(time_warp_seq, p); + if (struct_v >= 3) { + decode(client_ranges, p); + } else { + std::map<client_t, client_writeable_range_t::byte_range_t> m; + decode(m, p); + for (auto q = m.begin(); q != m.end(); ++q) + client_ranges[q->first].range = q->second; + } + + decode(dirstat, p); + decode(rstat, p); + decode(accounted_rstat, p); + + decode(version, p); + decode(file_data_version, p); + decode(xattr_version, p); + if (struct_v >= 2) + decode(backtrace_version, p); + if (struct_v >= 7) + decode(old_pools, p); + if (struct_v >= 8) + decode(max_size_ever, p); + if (struct_v >= 9) { + decode(inline_data, p); + } else { + inline_data.version = CEPH_INLINE_NONE; + } + if (struct_v < 10) + backtrace_version = 0; // force update backtrace + if (struct_v >= 11) + decode(quota, p); + + if (struct_v >= 12) { + std::string tmp; + decode(tmp, p); + stray_prior_path = std::string_view(tmp); + } + + if (struct_v >= 13) { + decode(last_scrub_version, p); + decode(last_scrub_stamp, p); + } + if (struct_v >= 14) { + decode(btime, p); + decode(change_attr, p); + } else { + btime = utime_t(); + change_attr = 0; + } + + if (struct_v >= 15) { + decode(export_pin, p); + } else { + export_pin = MDS_RANK_NONE; + } + + if (struct_v >= 16) { + decode(export_ephemeral_random_pin, p); + decode(export_ephemeral_distributed_pin, p); + } else { + export_ephemeral_random_pin = 0; + export_ephemeral_distributed_pin = false; + } + + if (struct_v >= 17) { + decode(fscrypt, p); + } else { + fscrypt = 0; + } + + DECODE_FINISH(p); +} + +template<template<typename> class Allocator> +void inode_t<Allocator>::dump(ceph::Formatter *f) const +{ + f->dump_unsigned("ino", ino); + f->dump_unsigned("rdev", rdev); + 
f->dump_stream("ctime") << ctime; + f->dump_stream("btime") << btime; + f->dump_unsigned("mode", mode); + f->dump_unsigned("uid", uid); + f->dump_unsigned("gid", gid); + f->dump_unsigned("nlink", nlink); + + f->open_object_section("dir_layout"); + ::dump(dir_layout, f); + f->close_section(); + + f->dump_object("layout", layout); + + f->open_array_section("old_pools"); + for (const auto &p : old_pools) { + f->dump_int("pool", p); + } + f->close_section(); + + f->dump_unsigned("size", size); + f->dump_unsigned("truncate_seq", truncate_seq); + f->dump_unsigned("truncate_size", truncate_size); + f->dump_unsigned("truncate_from", truncate_from); + f->dump_unsigned("truncate_pending", truncate_pending); + f->dump_stream("mtime") << mtime; + f->dump_stream("atime") << atime; + f->dump_unsigned("time_warp_seq", time_warp_seq); + f->dump_unsigned("change_attr", change_attr); + f->dump_int("export_pin", export_pin); + f->dump_int("export_ephemeral_random_pin", export_ephemeral_random_pin); + f->dump_bool("export_ephemeral_distributed_pin", export_ephemeral_distributed_pin); + + f->open_array_section("client_ranges"); + for (const auto &p : client_ranges) { + f->open_object_section("client"); + f->dump_unsigned("client", p.first.v); + p.second.dump(f); + f->close_section(); + } + f->close_section(); + + f->open_object_section("dirstat"); + dirstat.dump(f); + f->close_section(); + + f->open_object_section("rstat"); + rstat.dump(f); + f->close_section(); + + f->open_object_section("accounted_rstat"); + accounted_rstat.dump(f); + f->close_section(); + + f->dump_unsigned("version", version); + f->dump_unsigned("file_data_version", file_data_version); + f->dump_unsigned("xattr_version", xattr_version); + f->dump_unsigned("backtrace_version", backtrace_version); + + f->dump_string("stray_prior_path", stray_prior_path); + f->dump_unsigned("max_size_ever", max_size_ever); + + f->open_object_section("quota"); + quota.dump(f); + f->close_section(); + + 
f->dump_stream("last_scrub_stamp") << last_scrub_stamp; + f->dump_unsigned("last_scrub_version", last_scrub_version); +} + +template<template<typename> class Allocator> +void inode_t<Allocator>::client_ranges_cb(typename inode_t<Allocator>::client_range_map& c, JSONObj *obj){ + + int64_t client; + JSONDecoder::decode_json("client", client, obj, true); + client_writeable_range_t client_range_tmp; + JSONDecoder::decode_json("byte range", client_range_tmp.range, obj, true); + JSONDecoder::decode_json("follows", client_range_tmp.follows.val, obj, true); + c[client] = client_range_tmp; +} + +template<template<typename> class Allocator> +void inode_t<Allocator>::old_pools_cb(compact_set<int64_t, std::less<int64_t>, Allocator<int64_t> >& c, JSONObj *obj){ + + int64_t tmp; + decode_json_obj(tmp, obj); + c.insert(tmp); +} + +template<template<typename> class Allocator> +void inode_t<Allocator>::decode_json(JSONObj *obj) +{ + + JSONDecoder::decode_json("ino", ino.val, obj, true); + JSONDecoder::decode_json("rdev", rdev, obj, true); + //JSONDecoder::decode_json("ctime", ctime, obj, true); + //JSONDecoder::decode_json("btime", btime, obj, true); + JSONDecoder::decode_json("mode", mode, obj, true); + JSONDecoder::decode_json("uid", uid, obj, true); + JSONDecoder::decode_json("gid", gid, obj, true); + JSONDecoder::decode_json("nlink", nlink, obj, true); + JSONDecoder::decode_json("dir_layout", dir_layout, obj, true); + JSONDecoder::decode_json("layout", layout, obj, true); + JSONDecoder::decode_json("old_pools", old_pools, inode_t<Allocator>::old_pools_cb, obj, true); + JSONDecoder::decode_json("size", size, obj, true); + JSONDecoder::decode_json("truncate_seq", truncate_seq, obj, true); + JSONDecoder::decode_json("truncate_size", truncate_size, obj, true); + JSONDecoder::decode_json("truncate_from", truncate_from, obj, true); + JSONDecoder::decode_json("truncate_pending", truncate_pending, obj, true); + //JSONDecoder::decode_json("mtime", mtime, obj, true); + 
//JSONDecoder::decode_json("atime", atime, obj, true); + JSONDecoder::decode_json("time_warp_seq", time_warp_seq, obj, true); + JSONDecoder::decode_json("change_attr", change_attr, obj, true); + JSONDecoder::decode_json("export_pin", export_pin, obj, true); + JSONDecoder::decode_json("client_ranges", client_ranges, inode_t<Allocator>::client_ranges_cb, obj, true); + JSONDecoder::decode_json("dirstat", dirstat, obj, true); + JSONDecoder::decode_json("rstat", rstat, obj, true); + JSONDecoder::decode_json("accounted_rstat", accounted_rstat, obj, true); + JSONDecoder::decode_json("version", version, obj, true); + JSONDecoder::decode_json("file_data_version", file_data_version, obj, true); + JSONDecoder::decode_json("xattr_version", xattr_version, obj, true); + JSONDecoder::decode_json("backtrace_version", backtrace_version, obj, true); + JSONDecoder::decode_json("stray_prior_path", stray_prior_path, obj, true); + JSONDecoder::decode_json("max_size_ever", max_size_ever, obj, true); + JSONDecoder::decode_json("quota", quota, obj, true); + JSONDecoder::decode_json("last_scrub_stamp", last_scrub_stamp, obj, true); + JSONDecoder::decode_json("last_scrub_version", last_scrub_version, obj, true); +} + +template<template<typename> class Allocator> +void inode_t<Allocator>::generate_test_instances(std::list<inode_t*>& ls) +{ + ls.push_back(new inode_t<Allocator>); + ls.push_back(new inode_t<Allocator>); + ls.back()->ino = 1; + // i am lazy. 
+} + +template<template<typename> class Allocator> +int inode_t<Allocator>::compare(const inode_t<Allocator> &other, bool *divergent) const +{ + ceph_assert(ino == other.ino); + *divergent = false; + if (version == other.version) { + if (rdev != other.rdev || + ctime != other.ctime || + btime != other.btime || + mode != other.mode || + uid != other.uid || + gid != other.gid || + nlink != other.nlink || + memcmp(&dir_layout, &other.dir_layout, sizeof(dir_layout)) || + layout != other.layout || + old_pools != other.old_pools || + size != other.size || + max_size_ever != other.max_size_ever || + truncate_seq != other.truncate_seq || + truncate_size != other.truncate_size || + truncate_from != other.truncate_from || + truncate_pending != other.truncate_pending || + change_attr != other.change_attr || + mtime != other.mtime || + atime != other.atime || + time_warp_seq != other.time_warp_seq || + inline_data != other.inline_data || + client_ranges != other.client_ranges || + !(dirstat == other.dirstat) || + !(rstat == other.rstat) || + !(accounted_rstat == other.accounted_rstat) || + file_data_version != other.file_data_version || + xattr_version != other.xattr_version || + backtrace_version != other.backtrace_version) { + *divergent = true; + } + return 0; + } else if (version > other.version) { + *divergent = !older_is_consistent(other); + return 1; + } else { + ceph_assert(version < other.version); + *divergent = !other.older_is_consistent(*this); + return -1; + } +} + +template<template<typename> class Allocator> +bool inode_t<Allocator>::older_is_consistent(const inode_t<Allocator> &other) const +{ + if (max_size_ever < other.max_size_ever || + truncate_seq < other.truncate_seq || + time_warp_seq < other.time_warp_seq || + inline_data.version < other.inline_data.version || + dirstat.version < other.dirstat.version || + rstat.version < other.rstat.version || + accounted_rstat.version < other.accounted_rstat.version || + file_data_version < other.file_data_version || 
+ xattr_version < other.xattr_version || + backtrace_version < other.backtrace_version) { + return false; + } + return true; +} + +template<template<typename> class Allocator> +inline void encode(const inode_t<Allocator> &c, ::ceph::buffer::list &bl, uint64_t features) +{ + ENCODE_DUMP_PRE(); + c.encode(bl, features); + ENCODE_DUMP_POST(cl); +} +template<template<typename> class Allocator> +inline void decode(inode_t<Allocator> &c, ::ceph::buffer::list::const_iterator &p) +{ + c.decode(p); +} + +template<template<typename> class Allocator> +using alloc_string = std::basic_string<char,std::char_traits<char>,Allocator<char>>; + +template<template<typename> class Allocator> +using xattr_map = std::map<alloc_string<Allocator>, + ceph::bufferptr, + std::less<alloc_string<Allocator>>, + Allocator<std::pair<const alloc_string<Allocator>, + ceph::bufferptr>>>; // FIXME bufferptr not in mempool + +template<template<typename> class Allocator> +inline void decode_noshare(xattr_map<Allocator>& xattrs, ceph::buffer::list::const_iterator &p) +{ + __u32 n; + decode(n, p); + while (n-- > 0) { + alloc_string<Allocator> key; + decode(key, p); + __u32 len; + decode(len, p); + p.copy_deep(len, xattrs[key]); + } +} + +template<template<typename> class Allocator = std::allocator> +struct old_inode_t { + snapid_t first; + inode_t<Allocator> inode; + xattr_map<Allocator> xattrs; + + void encode(ceph::buffer::list &bl, uint64_t features) const; + void decode(ceph::buffer::list::const_iterator& bl); + void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list<old_inode_t*>& ls); +}; + +// These methods may be moved back to mdstypes.cc when we have pmr +template<template<typename> class Allocator> +void old_inode_t<Allocator>::encode(ceph::buffer::list& bl, uint64_t features) const +{ + ENCODE_START(2, 2, bl); + encode(first, bl); + encode(inode, bl, features); + encode(xattrs, bl); + ENCODE_FINISH(bl); +} + +template<template<typename> class Allocator> +void 
old_inode_t<Allocator>::decode(ceph::buffer::list::const_iterator& bl) +{ + DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl); + decode(first, bl); + decode(inode, bl); + decode_noshare<Allocator>(xattrs, bl); + DECODE_FINISH(bl); +} + +template<template<typename> class Allocator> +void old_inode_t<Allocator>::dump(ceph::Formatter *f) const +{ + f->dump_unsigned("first", first); + inode.dump(f); + f->open_object_section("xattrs"); + for (const auto &p : xattrs) { + std::string v(p.second.c_str(), p.second.length()); + f->dump_string(p.first.c_str(), v); + } + f->close_section(); +} + +template<template<typename> class Allocator> +void old_inode_t<Allocator>::generate_test_instances(std::list<old_inode_t<Allocator>*>& ls) +{ + ls.push_back(new old_inode_t<Allocator>); + ls.push_back(new old_inode_t<Allocator>); + ls.back()->first = 2; + std::list<inode_t<Allocator>*> ils; + inode_t<Allocator>::generate_test_instances(ils); + ls.back()->inode = *ils.back(); + ls.back()->xattrs["user.foo"] = ceph::buffer::copy("asdf", 4); + ls.back()->xattrs["user.unprintable"] = ceph::buffer::copy("\000\001\002", 3); +} + +template<template<typename> class Allocator> +inline void encode(const old_inode_t<Allocator> &c, ::ceph::buffer::list &bl, uint64_t features) +{ + ENCODE_DUMP_PRE(); + c.encode(bl, features); + ENCODE_DUMP_POST(cl); +} +template<template<typename> class Allocator> +inline void decode(old_inode_t<Allocator> &c, ::ceph::buffer::list::const_iterator &p) +{ + c.decode(p); +} + +/* + * like an inode, but for a dir frag + */ +struct fnode_t { + void encode(ceph::buffer::list &bl) const; + void decode(ceph::buffer::list::const_iterator& bl); + void dump(ceph::Formatter *f) const; + void decode_json(JSONObj *obj); + static void generate_test_instances(std::list<fnode_t*>& ls); + + version_t version = 0; + snapid_t snap_purged_thru; // the max_last_destroy snapid we've been purged thru + frag_info_t fragstat, accounted_fragstat; + nest_info_t rstat, accounted_rstat; + 
damage_flags_t damage_flags = 0;

  // we know we and all our descendants have been scrubbed since this version
  version_t recursive_scrub_version = 0;
  utime_t recursive_scrub_stamp;
  // version at which we last scrubbed our personal data structures
  version_t localized_scrub_version = 0;
  utime_t localized_scrub_stamp;
};
WRITE_CLASS_ENCODER(fnode_t)


/*
 * old_rstat_t pairs a snapid with two nest_info_t values (rstat and
 * accounted_rstat).  The encode/decode/dump bodies live outside this header.
 */
struct old_rstat_t {
  void encode(ceph::buffer::list& bl) const;
  void decode(ceph::buffer::list::const_iterator& p);
  void dump(ceph::Formatter *f) const;
  static void generate_test_instances(std::list<old_rstat_t*>& ls);

  snapid_t first;
  nest_info_t rstat, accounted_rstat;
};
WRITE_CLASS_ENCODER(old_rstat_t)

// Prints "old_rstat(first <first> <rstat> <accounted_rstat>)".
inline std::ostream& operator<<(std::ostream& out, const old_rstat_t& o) {
  return out << "old_rstat(first " << o.first << " " << o.rstat << " " << o.accounted_rstat << ")";
}

/*
 * feature_bitset_t is a growable bit set stored as a vector of 64-bit
 * blocks; bit i lives in block i / bits_per_block at position
 * i % bits_per_block.
 */
class feature_bitset_t {
public:
  typedef uint64_t block_type;
  static const size_t bits_per_block = sizeof(block_type) * 8;

  feature_bitset_t(const feature_bitset_t& other) : _vec(other._vec) {}
  feature_bitset_t(feature_bitset_t&& other) : _vec(std::move(other._vec)) {}
  // Also serves as the default constructor (value defaults to 0); not
  // explicit, so an unsigned long converts implicitly.  Defined elsewhere.
  feature_bitset_t(unsigned long value = 0);
  // Defined elsewhere.
  // NOTE(review): presumably `array` lists the bit indices to set -- confirm.
  feature_bitset_t(const std::vector<size_t>& array);
  feature_bitset_t& operator=(const feature_bitset_t& other) {
    _vec = other._vec;
    return *this;
  }
  feature_bitset_t& operator=(feature_bitset_t&& other) {
    _vec = std::move(other._vec);
    return *this;
  }
  feature_bitset_t& operator-=(const feature_bitset_t& other);
  // True when no bit is set, whether or not any blocks are allocated.
  bool empty() const {
    //block_type is a uint64_t.
// If the vector is only composed of 0s, then it's still "empty"
    for (auto& v : _vec) {
      if (v)
        return false;
    }
    return true;
  }
  // True if `bit` is set; bits beyond the allocated blocks read as unset.
  bool test(size_t bit) const {
    if (bit >= bits_per_block * _vec.size())
      return false;
    return _vec[bit / bits_per_block] & ((block_type)1 << (bit % bits_per_block));
  }
  // Set `bit`, growing the block vector when the bit lies past its end.
  void insert(size_t bit) {
    size_t n = bit / bits_per_block;
    if (n >= _vec.size())
      _vec.resize(n + 1);
    _vec[n] |= ((block_type)1 << (bit % bits_per_block));
  }
  // Clear `bit` (a no-op when it lies past the allocated blocks).  If the
  // bit was in the last block, trailing all-zero blocks are dropped.
  void erase(size_t bit) {
    size_t n = bit / bits_per_block;
    if (n >= _vec.size())
      return;
    _vec[n] &= ~((block_type)1 << (bit % bits_per_block));
    if (n + 1 == _vec.size()) {
      while (!_vec.empty() && _vec.back() == 0)
        _vec.pop_back();
    }
  }
  void clear() {
    _vec.clear();
  }
  // Block-by-block comparison of the underlying vectors.
  // NOTE(review): two sets with the same bits but a different number of
  // trailing zero blocks compare unequal here, while empty() ignores zero
  // blocks -- confirm whether callers can ever see untrimmed vectors.
  bool operator==(const feature_bitset_t& other) const {
    return _vec == other._vec;
  }
  bool operator!=(const feature_bitset_t& other) const {
    return _vec != other._vec;
  }
  void encode(ceph::buffer::list& bl) const;
  void decode(ceph::buffer::list::const_iterator &p);
  void dump(ceph::Formatter *f) const;
  void print(std::ostream& out) const;
private:
  std::vector<block_type> _vec;
};
WRITE_CLASS_ENCODER(feature_bitset_t)

// Stream insertion delegates to feature_bitset_t::print().
inline std::ostream& operator<<(std::ostream& out, const feature_bitset_t& s) {
  s.print(out);
  return out;
}

/*
 * metric_spec_t wraps a single feature_bitset_t (metric_flags).  It can be
 * built from, and implicitly converted from, a feature_bitset_t.
 */
struct metric_spec_t {
  metric_spec_t() {}
  metric_spec_t(const metric_spec_t& other) :
    metric_flags(other.metric_flags) {}
  metric_spec_t(metric_spec_t&& other) :
    metric_flags(std::move(other.metric_flags)) {}
  metric_spec_t(const feature_bitset_t& mf) :
    metric_flags(mf) {}
  metric_spec_t(feature_bitset_t&& mf) :
    metric_flags(std::move(mf)) {}

  metric_spec_t& operator=(const metric_spec_t& other) {
    metric_flags = other.metric_flags;
    return *this;
  }
  metric_spec_t& operator=(metric_spec_t&& other) {
    metric_flags = std::move(other.metric_flags);
    return *this;
  }

  // True when no metric flag is set.
  bool empty() const {
    return metric_flags.empty();
  }

  void
clear() {
    metric_flags.clear();
  }

  void encode(ceph::buffer::list& bl) const;
  void decode(ceph::buffer::list::const_iterator& p);
  void dump(ceph::Formatter *f) const;
  void print(std::ostream& out) const;

  // set of metrics that a client is capable of forwarding
  feature_bitset_t metric_flags;
};
WRITE_CLASS_ENCODER(metric_spec_t)

// Stream insertion delegates to metric_spec_t::print().
inline std::ostream& operator<<(std::ostream& out, const metric_spec_t& mst) {
  mst.print(out);
  return out;
}

/*
 * client_metadata_t
 *
 * A string -> string key/value map plus a feature bit set and a metric
 * spec.  find/begin/end/erase/operator[] act on the key/value map only.
 */
struct client_metadata_t {
  using kv_map_t = std::map<std::string,std::string>;
  using iterator = kv_map_t::const_iterator;

  client_metadata_t() {}
  client_metadata_t(const kv_map_t& kv, const feature_bitset_t &f, const metric_spec_t &mst) :
    kv_map(kv),
    features(f),
    metric_spec(mst) {}
  client_metadata_t& operator=(const client_metadata_t& other) {
    kv_map = other.kv_map;
    features = other.features;
    metric_spec = other.metric_spec;
    return *this;
  }

  // Empty only when the map, the feature set and the metric spec are all empty.
  bool empty() const { return kv_map.empty() && features.empty() && metric_spec.empty(); }
  iterator find(const std::string& key) const { return kv_map.find(key); }
  iterator begin() const { return kv_map.begin(); }
  iterator end() const { return kv_map.end(); }
  void erase(iterator it) { kv_map.erase(it); }
  // Inserts an empty value when `key` is not present yet (std::map semantics).
  std::string& operator[](const std::string& key) { return kv_map[key]; }
  // Merge `other` into this object.  Keys already present keep their current
  // value (map::insert does not overwrite); only missing keys are added.
  // features and metric_spec are replaced by other's.
  void merge(const client_metadata_t& other) {
    kv_map.insert(other.kv_map.begin(), other.kv_map.end());
    features = other.features;
    metric_spec = other.metric_spec;
  }
  void clear() {
    kv_map.clear();
    features.clear();
    metric_spec.clear();
  }

  void encode(ceph::buffer::list& bl) const;
  void decode(ceph::buffer::list::const_iterator& p);
  void dump(ceph::Formatter *f) const;

  kv_map_t kv_map;
  feature_bitset_t features;
  metric_spec_t metric_spec;
};
WRITE_CLASS_ENCODER(client_metadata_t)

/*
 * session_info_t - durable part of a Session
 */
struct session_info_t {
  // The client id is the numeric part of the session's entity name.
  client_t get_client() const {
return client_t(inst.name.num()); } + bool has_feature(size_t bit) const { return client_metadata.features.test(bit); } + const entity_name_t& get_source() const { return inst.name; } + + void clear_meta() { + prealloc_inos.clear(); + completed_requests.clear(); + completed_flushes.clear(); + client_metadata.clear(); + } + + void encode(ceph::buffer::list& bl, uint64_t features) const; + void decode(ceph::buffer::list::const_iterator& p); + void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list<session_info_t*>& ls); + + entity_inst_t inst; + std::map<ceph_tid_t,inodeno_t> completed_requests; + interval_set<inodeno_t> prealloc_inos; // preallocated, ready to use. + client_metadata_t client_metadata; + std::set<ceph_tid_t> completed_flushes; + EntityName auth_name; +}; +WRITE_CLASS_ENCODER_FEATURES(session_info_t) + +// dentries +struct dentry_key_t { + dentry_key_t() {} + dentry_key_t(snapid_t s, std::string_view n, __u32 h=0) : + snapid(s), name(n), hash(h) {} + + bool is_valid() { return name.length() || snapid; } + + // encode into something that can be decoded as a string. 
// name_ (head) or name_%x (!head)
  void encode(ceph::buffer::list& bl) const {
    std::string key;
    // Member overload: builds the string form.  It has to be called before
    // the using-declaration below brings ceph::encode into this scope.
    encode(key);
    using ceph::encode;
    encode(key, bl);
  }
  // Build the string form: "<name>_head" when snapid == CEPH_NOSNAP,
  // otherwise "<name>_<snapid in lowercase hex>".
  void encode(std::string& key) const {
    char b[20];  // room for 16 hex digits (64-bit value) plus the NUL
    if (snapid != CEPH_NOSNAP) {
      uint64_t val(snapid);
      snprintf(b, sizeof(b), "%" PRIx64, val);
    } else {
      snprintf(b, sizeof(b), "%s", "head");
    }
    CachedStackStringStream css;
    *css << name << "_" << b;
    key = css->strv();
  }
  // Decode a string key from bl, then split it with the overload below.
  static void decode_helper(ceph::buffer::list::const_iterator& bl, std::string& nm,
                            snapid_t& sn) {
    std::string key;
    using ceph::decode;
    decode(key, bl);
    decode_helper(key, nm, sn);
  }
  // Split "<name>_<suffix>" at the LAST '_' (names may contain '_'
  // themselves).  Asserts that a '_' exists.  Suffix "head" maps to
  // CEPH_NOSNAP; anything else is parsed as a hex snapid.
  // NOTE(review): the sscanf() result is not checked; when the suffix has no
  // hex digits x keeps its initial 0 and sn silently becomes 0.
  static void decode_helper(std::string_view key, std::string& nm, snapid_t& sn) {
    size_t i = key.find_last_of('_');
    ceph_assert(i != std::string::npos);
    if (key.compare(i+1, std::string_view::npos, "head") == 0) {
      // name_head
      sn = CEPH_NOSNAP;
    } else {
      // name_%x
      long long unsigned x = 0;
      std::string x_str(key.substr(i+1));  // copy: sscanf needs a NUL-terminated string
      sscanf(x_str.c_str(), "%llx", &x);
      sn = x;
    }
    nm = key.substr(0, i);
  }

  snapid_t snapid = 0;
  std::string_view name;  // non-owning view
  __u32 hash = 0;
};

// Prints "(<name>,<snapid>)"; the hash is not shown.
inline std::ostream& operator<<(std::ostream& out, const dentry_key_t &k)
{
  return out << "(" << k.name << "," << k.snapid << ")";
}

inline bool operator<(const dentry_key_t& k1, const dentry_key_t& k2)
{
  /*
   * order by hash, name, snap
   */
  // NOTE(review): assumes ceph_frag_value() results are small enough that
  // their difference fits in an int -- TODO confirm.
  int c = ceph_frag_value(k1.hash) - ceph_frag_value(k2.hash);
  if (c)
    return c < 0;
  c = k1.name.compare(k2.name);
  if (c)
    return c < 0;
  return k1.snapid < k2.snapid;
}

/*
 * string_snap_t is a simple (string, snapid_t) pair
 */
struct string_snap_t {
  string_snap_t() {}
  // Copies the characters of n into the owned std::string member.
  string_snap_t(std::string_view n, snapid_t s) : name(n), snapid(s) {}

  void encode(ceph::buffer::list& bl) const;
  void decode(ceph::buffer::list::const_iterator& p);
  void dump(ceph::Formatter *f) const;
  static void generate_test_instances(std::list<string_snap_t*>& ls);

  std::string name;
snapid_t snapid;
};
WRITE_CLASS_ENCODER(string_snap_t)

// Orders by name first, then by snapid.
inline bool operator<(const string_snap_t& l, const string_snap_t& r) {
  int c = l.name.compare(r.name);
  return c < 0 || (c == 0 && l.snapid < r.snapid);
}

// Prints "(<name>,<snapid>)".
inline std::ostream& operator<<(std::ostream& out, const string_snap_t &k)
{
  return out << "(" << k.name << "," << k.snapid << ")";
}

/*
 * mds_table_pending_t
 *
 * For mds's requesting any pending ops, child needs to encode the corresponding
 * pending mutation state in the table.
 *
 * Holds a request id, an mds number and a version (tid), all starting at 0.
 */
struct mds_table_pending_t {
  void encode(ceph::buffer::list& bl) const;
  void decode(ceph::buffer::list::const_iterator& bl);
  void dump(ceph::Formatter *f) const;
  static void generate_test_instances(std::list<mds_table_pending_t*>& ls);

  uint64_t reqid = 0;
  __s32 mds = 0;
  version_t tid = 0;
};
WRITE_CLASS_ENCODER(mds_table_pending_t)

// requests
//
// metareqid_t identifies a request by the entity that issued it and a tid.
// It is encoded as the name followed by the tid.
struct metareqid_t {
  metareqid_t() {}
  metareqid_t(entity_name_t n, ceph_tid_t t) : name(n), tid(t) {}
  void encode(ceph::buffer::list& bl) const {
    using ceph::encode;
    encode(name, bl);
    encode(tid, bl);
  }
  void decode(ceph::buffer::list::const_iterator &p) {
    using ceph::decode;
    decode(name, p);
    decode(tid, p);
  }

  entity_name_t name;
  uint64_t tid = 0;
};
WRITE_CLASS_ENCODER(metareqid_t)

// Prints "<name>:<tid>".
inline std::ostream& operator<<(std::ostream& out, const metareqid_t& r) {
  return out << r.name << ":" << r.tid;
}

// Comparisons: lexicographic on (name, tid).
inline bool operator==(const metareqid_t& l, const metareqid_t& r) {
  return (l.name == r.name) && (l.tid == r.tid);
}
inline bool operator!=(const metareqid_t& l, const metareqid_t& r) {
  return (l.name != r.name) || (l.tid != r.tid);
}
inline bool operator<(const metareqid_t& l, const metareqid_t& r) {
  return (l.name < r.name) ||
    (l.name == r.name && l.tid < r.tid);
}
inline bool operator<=(const metareqid_t& l, const metareqid_t& r) {
  return (l.name < r.name) ||
    (l.name == r.name && l.tid <= r.tid);
}
inline bool operator>(const
metareqid_t& l, const metareqid_t& r) { return !(l <= r); }
inline bool operator>=(const metareqid_t& l, const metareqid_t& r) { return !(l < r); }

namespace std {
  // Hash for metareqid_t: XOR of the hashes of the name's number, the
  // name's type and the tid.
  template<> struct hash<metareqid_t> {
    size_t operator()(const metareqid_t &r) const {
      hash<uint64_t> H;
      return H(r.name.num()) ^ H(r.name.type()) ^ H(r.tid);
    }
  };
} // namespace std

// cap info for client reconnect
struct cap_reconnect_t {
  cap_reconnect_t() {}
  // Fills capinfo from the arguments and sets flock_len to 0.
  // Note that `lb` is taken by non-const reference and MOVED from: after the
  // call its contents belong to flockbl and the caller's list is left in a
  // moved-from state.
  cap_reconnect_t(uint64_t cap_id, inodeno_t pino, std::string_view p, int w, int i,
                  inodeno_t sr, snapid_t sf, ceph::buffer::list& lb) :
    path(p) {
    capinfo.cap_id = cap_id;
    capinfo.wanted = w;
    capinfo.issued = i;
    capinfo.snaprealm = sr;
    capinfo.pathbase = pino;
    capinfo.flock_len = 0;
    snap_follows = sf;
    flockbl = std::move(lb);
  }
  void encode(ceph::buffer::list& bl) const;
  void decode(ceph::buffer::list::const_iterator& bl);
  void encode_old(ceph::buffer::list& bl) const;
  void decode_old(ceph::buffer::list::const_iterator& bl);

  void dump(ceph::Formatter *f) const;
  static void generate_test_instances(std::list<cap_reconnect_t*>& ls);

  std::string path;
  // Zero-initialized; `mutable` lets const member functions modify it.
  mutable ceph_mds_cap_reconnect capinfo = {};
  snapid_t snap_follows = 0;
  ceph::buffer::list flockbl;
};
WRITE_CLASS_ENCODER(cap_reconnect_t)

// Snap realm info for client reconnect: wraps a ceph_mds_snaprealm_reconnect.
struct snaprealm_reconnect_t {
  snaprealm_reconnect_t() {}
  // Sets ino, seq and parent; every other field of realm stays zero.
  snaprealm_reconnect_t(inodeno_t ino, snapid_t seq, inodeno_t parent) {
    realm.ino = ino;
    realm.seq = seq;
    realm.parent = parent;
  }
  void encode(ceph::buffer::list& bl) const;
  void decode(ceph::buffer::list::const_iterator& bl);
  void encode_old(ceph::buffer::list& bl) const;
  void decode_old(ceph::buffer::list::const_iterator& bl);

  void dump(ceph::Formatter *f) const;
  static void generate_test_instances(std::list<snaprealm_reconnect_t*>& ls);

  // Zero-initialized; `mutable` lets const member functions modify it.
  mutable ceph_mds_snaprealm_reconnect realm = {};
};
WRITE_CLASS_ENCODER(snaprealm_reconnect_t)

// compat for pre-FLOCK feature
struct old_ceph_mds_cap_reconnect {
ceph_le64 cap_id; + ceph_le32 wanted; + ceph_le32 issued; + ceph_le64 old_size; + struct ceph_timespec old_mtime, old_atime; + ceph_le64 snaprealm; + ceph_le64 pathbase; /* base ino for our path to this ino */ +} __attribute__ ((packed)); +WRITE_RAW_ENCODER(old_ceph_mds_cap_reconnect) + +struct old_cap_reconnect_t { + const old_cap_reconnect_t& operator=(const cap_reconnect_t& n) { + path = n.path; + capinfo.cap_id = n.capinfo.cap_id; + capinfo.wanted = n.capinfo.wanted; + capinfo.issued = n.capinfo.issued; + capinfo.snaprealm = n.capinfo.snaprealm; + capinfo.pathbase = n.capinfo.pathbase; + return *this; + } + operator cap_reconnect_t() { + cap_reconnect_t n; + n.path = path; + n.capinfo.cap_id = capinfo.cap_id; + n.capinfo.wanted = capinfo.wanted; + n.capinfo.issued = capinfo.issued; + n.capinfo.snaprealm = capinfo.snaprealm; + n.capinfo.pathbase = capinfo.pathbase; + return n; + } + + void encode(ceph::buffer::list& bl) const { + using ceph::encode; + encode(path, bl); + encode(capinfo, bl); + } + void decode(ceph::buffer::list::const_iterator& bl) { + using ceph::decode; + decode(path, bl); + decode(capinfo, bl); + } + + std::string path; + old_ceph_mds_cap_reconnect capinfo; +}; +WRITE_CLASS_ENCODER(old_cap_reconnect_t) + +// dir frag +struct dirfrag_t { + dirfrag_t() {} + dirfrag_t(inodeno_t i, frag_t f) : ino(i), frag(f) { } + + void encode(ceph::buffer::list& bl) const { + using ceph::encode; + encode(ino, bl); + encode(frag, bl); + } + void decode(ceph::buffer::list::const_iterator& bl) { + using ceph::decode; + decode(ino, bl); + decode(frag, bl); + } + + inodeno_t ino = 0; + frag_t frag; +}; +WRITE_CLASS_ENCODER(dirfrag_t) + +inline std::ostream& operator<<(std::ostream& out, const dirfrag_t &df) { + out << df.ino; + if (!df.frag.is_root()) out << "." 
<< df.frag;
  return out;
}
// Orders by ino first, then by frag.
inline bool operator<(dirfrag_t l, dirfrag_t r) {
  if (l.ino < r.ino) return true;
  if (l.ino == r.ino && l.frag < r.frag) return true;
  return false;
}
inline bool operator==(dirfrag_t l, dirfrag_t r) {
  return l.ino == r.ino && l.frag == r.frag;
}

namespace std {
  // Hash for dirfrag_t: XOR of rjhash(ino) and rjhash(frag).
  template<> struct hash<dirfrag_t> {
    size_t operator()(const dirfrag_t &df) const {
      static rjhash<uint64_t> H;
      static rjhash<uint32_t> I;
      return H(df.ino) ^ I(df.frag);
    }
  };
} // namespace std

// ================================================================
// Indices into a dirfrag_load_vec_t (see its meta_load()); META_NPOP is the
// number of counters.
#define META_POP_IRD     0
#define META_POP_IWR     1
#define META_POP_READDIR 2
#define META_POP_FETCH   3
#define META_POP_STORE   4
#define META_NPOP        5

/*
 * inode_load_vec_t holds NUM (2) DecayCounters.
 * NOTE(review): presumably indexed by META_POP_IRD / META_POP_IWR -- confirm
 * with the callers of get().
 */
class inode_load_vec_t {
public:
  using time = DecayCounter::time;
  using clock = DecayCounter::clock;
  static const size_t NUM = 2;

  inode_load_vec_t() : vec{DecayCounter(DecayRate()), DecayCounter(DecayRate())} {}
  inode_load_vec_t(const DecayRate &rate) : vec{DecayCounter(rate), DecayCounter(rate)} {}

  // Counter at index t; no bounds check, so t must be in [0, NUM).
  DecayCounter &get(int t) {
    return vec[t];
  }
  // Reset every counter.
  void zero() {
    for (auto &d : vec) {
      d.reset();
    }
  }
  void encode(ceph::buffer::list &bl) const;
  void decode(ceph::buffer::list::const_iterator& p);
  void dump(ceph::Formatter *f) const;
  static void generate_test_instances(std::list<inode_load_vec_t*>& ls);

private:
  std::array<DecayCounter, NUM> vec;
};
// Free-function encode/decode forward to the members.
inline void encode(const inode_load_vec_t &c, ceph::buffer::list &bl) {
  c.encode(bl);
}
inline void decode(inode_load_vec_t & c, ceph::buffer::list::const_iterator &p) {
  c.decode(p);
}

/*
 * dirfrag_load_vec_t holds NUM (5) DecayCounters, one per META_POP_* index.
 */
class dirfrag_load_vec_t {
public:
  using time = DecayCounter::time;
  using clock = DecayCounter::clock;
  static const size_t NUM = 5;

  dirfrag_load_vec_t() :
    vec{DecayCounter(DecayRate()),
        DecayCounter(DecayRate()),
        DecayCounter(DecayRate()),
        DecayCounter(DecayRate()),
        DecayCounter(DecayRate())
    }
  {}
  dirfrag_load_vec_t(const DecayRate &rate) :
vec{DecayCounter(rate), DecayCounter(rate), DecayCounter(rate), DecayCounter(rate), DecayCounter(rate)} + {} + + void encode(ceph::buffer::list &bl) const { + ENCODE_START(2, 2, bl); + for (const auto &i : vec) { + encode(i, bl); + } + ENCODE_FINISH(bl); + } + void decode(ceph::buffer::list::const_iterator &p) { + DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, p); + for (auto &i : vec) { + decode(i, p); + } + DECODE_FINISH(p); + } + void dump(ceph::Formatter *f) const; + void dump(ceph::Formatter *f, const DecayRate& rate) const; + static void generate_test_instances(std::list<dirfrag_load_vec_t*>& ls); + + const DecayCounter &get(int t) const { + return vec[t]; + } + DecayCounter &get(int t) { + return vec[t]; + } + void adjust(double d) { + for (auto &i : vec) { + i.adjust(d); + } + } + void zero() { + for (auto &i : vec) { + i.reset(); + } + } + double meta_load() const { + return + 1*vec[META_POP_IRD].get() + + 2*vec[META_POP_IWR].get() + + 1*vec[META_POP_READDIR].get() + + 2*vec[META_POP_FETCH].get() + + 4*vec[META_POP_STORE].get(); + } + + void add(dirfrag_load_vec_t& r) { + for (size_t i=0; i<dirfrag_load_vec_t::NUM; i++) + vec[i].adjust(r.vec[i].get()); + } + void sub(dirfrag_load_vec_t& r) { + for (size_t i=0; i<dirfrag_load_vec_t::NUM; i++) + vec[i].adjust(-r.vec[i].get()); + } + void scale(double f) { + for (size_t i=0; i<dirfrag_load_vec_t::NUM; i++) + vec[i].scale(f); + } + +private: + friend inline std::ostream& operator<<(std::ostream& out, const dirfrag_load_vec_t& dl); + std::array<DecayCounter, NUM> vec; +}; + +inline void encode(const dirfrag_load_vec_t &c, ceph::buffer::list &bl) { + c.encode(bl); +} +inline void decode(dirfrag_load_vec_t& c, ceph::buffer::list::const_iterator &p) { + c.decode(p); +} + +inline std::ostream& operator<<(std::ostream& out, const dirfrag_load_vec_t& dl) +{ + CachedStackStringStream css; + *css << std::setprecision(1) << std::fixed + << "[pop" + " IRD:" << dl.vec[0] + << " IWR:" << dl.vec[1] + << " RDR:" << dl.vec[2] + << " 
FET:" << dl.vec[3] + << " STR:" << dl.vec[4] + << " *LOAD:" << dl.meta_load() << "]"; + return out << css->strv() << std::endl; +} + +struct mds_load_t { + using clock = dirfrag_load_vec_t::clock; + using time = dirfrag_load_vec_t::time; + + dirfrag_load_vec_t auth; + dirfrag_load_vec_t all; + + mds_load_t() : auth(DecayRate()), all(DecayRate()) {} + mds_load_t(const DecayRate &rate) : auth(rate), all(rate) {} + + double req_rate = 0.0; + double cache_hit_rate = 0.0; + double queue_len = 0.0; + + double cpu_load_avg = 0.0; + + double mds_load() const; // defiend in MDBalancer.cc + void encode(ceph::buffer::list& bl) const; + void decode(ceph::buffer::list::const_iterator& bl); + void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list<mds_load_t*>& ls); +}; +inline void encode(const mds_load_t &c, ceph::buffer::list &bl) { + c.encode(bl); +} +inline void decode(mds_load_t &c, ceph::buffer::list::const_iterator &p) { + c.decode(p); +} + +inline std::ostream& operator<<(std::ostream& out, const mds_load_t& load) +{ + return out << "mdsload<" << load.auth << "/" << load.all + << ", req " << load.req_rate + << ", hr " << load.cache_hit_rate + << ", qlen " << load.queue_len + << ", cpu " << load.cpu_load_avg + << ">"; +} + +class load_spread_t { +public: + using time = DecayCounter::time; + using clock = DecayCounter::clock; + static const int MAX = 4; + + load_spread_t(const DecayRate &rate) : count(rate) + {} + + load_spread_t() = delete; + + double hit(int who) { + for (int i=0; i<n; i++) + if (last[i] == who) + return count.get_last(); + + // we're new(ish) + last[p++] = who; + if (n < MAX) n++; + if (n == 1) return 0.0; + + if (p == MAX) p = 0; + + return count.hit(); + } + double get() const { + return count.get(); + } + + std::array<int, MAX> last = {-1, -1, -1, -1}; + int p = 0, n = 0; + DecayCounter count; +}; + +// ================================================================ +typedef std::pair<mds_rank_t, mds_rank_t> 
mds_authority_t; + +// -- authority delegation -- +// directory authority types +// >= 0 is the auth mds +#define CDIR_AUTH_PARENT mds_rank_t(-1) // default +#define CDIR_AUTH_UNKNOWN mds_rank_t(-2) +#define CDIR_AUTH_DEFAULT mds_authority_t(CDIR_AUTH_PARENT, CDIR_AUTH_UNKNOWN) +#define CDIR_AUTH_UNDEF mds_authority_t(CDIR_AUTH_UNKNOWN, CDIR_AUTH_UNKNOWN) +//#define CDIR_AUTH_ROOTINODE pair<int,int>( 0, -2) + +class MDSCacheObjectInfo { +public: + void encode(ceph::buffer::list& bl) const; + void decode(ceph::buffer::list::const_iterator& bl); + void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list<MDSCacheObjectInfo*>& ls); + + inodeno_t ino = 0; + dirfrag_t dirfrag; + std::string dname; + snapid_t snapid; +}; + +inline std::ostream& operator<<(std::ostream& out, const MDSCacheObjectInfo &info) { + if (info.ino) return out << info.ino << "." << info.snapid; + if (info.dname.length()) return out << info.dirfrag << "/" << info.dname + << " snap " << info.snapid; + return out << info.dirfrag; +} + +inline bool operator==(const MDSCacheObjectInfo& l, const MDSCacheObjectInfo& r) { + if (l.ino || r.ino) + return l.ino == r.ino && l.snapid == r.snapid; + else + return l.dirfrag == r.dirfrag && l.dname == r.dname; +} +WRITE_CLASS_ENCODER(MDSCacheObjectInfo) + +// parse a map of keys/values. +namespace qi = boost::spirit::qi; + +template <typename Iterator> +struct keys_and_values + : qi::grammar<Iterator, std::map<std::string, std::string>()> +{ + keys_and_values() + : keys_and_values::base_type(query) + { + query = pair >> *(qi::lit(' ') >> pair); + pair = key >> '=' >> value; + key = qi::char_("a-zA-Z_") >> *qi::char_("a-zA-Z_0-9"); + value = +qi::char_("a-zA-Z0-9-_."); + } + qi::rule<Iterator, std::map<std::string, std::string>()> query; + qi::rule<Iterator, std::pair<std::string, std::string>()> pair; + qi::rule<Iterator, std::string()> key, value; +}; + +#endif |