diff options
Diffstat (limited to '')
-rw-r--r-- | src/osd/osd_types.h | 6568 |
1 files changed, 6568 insertions, 0 deletions
diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h new file mode 100644 index 000000000..93645c5f2 --- /dev/null +++ b/src/osd/osd_types.h @@ -0,0 +1,6568 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com> + * + * Author: Loic Dachary <loic@dachary.org> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_OSD_TYPES_H +#define CEPH_OSD_TYPES_H + +#include <atomic> +#include <sstream> +#include <cstdio> +#include <memory> +#include <string_view> + +#include <boost/scoped_ptr.hpp> +#include <boost/optional/optional_io.hpp> +#include <boost/variant.hpp> +#include <boost/smart_ptr/local_shared_ptr.hpp> + +#include "include/rados/rados_types.hpp" +#include "include/mempool.h" + +#include "msg/msg_types.h" +#include "include/compat.h" +#include "include/types.h" +#include "include/utime.h" +#include "include/CompatSet.h" +#include "common/ceph_context.h" +#include "common/histogram.h" +#include "include/interval_set.h" +#include "include/inline_memory.h" +#include "common/Formatter.h" +#include "common/bloom_filter.hpp" +#include "common/hobject.h" +#include "common/snap_types.h" +#include "HitSet.h" +#include "Watch.h" +#include "include/cmp.h" +#include "librados/ListObjectImpl.h" +#include "compressor/Compressor.h" +#include "osd_perf_counters.h" + +#define CEPH_OSD_ONDISK_MAGIC "ceph osd volume v026" + +#define CEPH_OSD_FEATURE_INCOMPAT_BASE CompatSet::Feature(1, "initial feature set(~v.18)") +#define CEPH_OSD_FEATURE_INCOMPAT_PGINFO CompatSet::Feature(2, "pginfo object") +#define CEPH_OSD_FEATURE_INCOMPAT_OLOC CompatSet::Feature(3, "object locator") +#define CEPH_OSD_FEATURE_INCOMPAT_LEC CompatSet::Feature(4, "last_epoch_clean") +#define CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES CompatSet::Feature(5, "categories") +#define CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL CompatSet::Feature(6, "hobjectpool") +#define CEPH_OSD_FEATURE_INCOMPAT_BIGINFO CompatSet::Feature(7, "biginfo") +#define CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO CompatSet::Feature(8, "leveldbinfo") +#define CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG CompatSet::Feature(9, "leveldblog") +#define CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER CompatSet::Feature(10, "snapmapper") +#define CEPH_OSD_FEATURE_INCOMPAT_SHARDS CompatSet::Feature(11, "sharded objects") +#define CEPH_OSD_FEATURE_INCOMPAT_HINTS CompatSet::Feature(12, "transaction hints") +#define CEPH_OSD_FEATURE_INCOMPAT_PGMETA CompatSet::Feature(13, "pg meta object") +#define CEPH_OSD_FEATURE_INCOMPAT_MISSING CompatSet::Feature(14, "explicit missing set") +#define CEPH_OSD_FEATURE_INCOMPAT_FASTINFO CompatSet::Feature(15, "fastinfo pg attr") +#define CEPH_OSD_FEATURE_INCOMPAT_RECOVERY_DELETES CompatSet::Feature(16, "deletes in missing set") +#define CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER2 CompatSet::Feature(17, "new snapmapper key structure") + + +/// pool priority range set by user +#define OSD_POOL_PRIORITY_MAX 10 +#define OSD_POOL_PRIORITY_MIN -OSD_POOL_PRIORITY_MAX + +/// min recovery priority for MBackfillReserve +#define OSD_RECOVERY_PRIORITY_MIN 0 + +/// base backfill priority for MBackfillReserve +#define OSD_BACKFILL_PRIORITY_BASE 100 + +/// base backfill priority for MBackfillReserve (degraded PG) +#define OSD_BACKFILL_DEGRADED_PRIORITY_BASE 140 + +/// base recovery priority for MBackfillReserve +#define OSD_RECOVERY_PRIORITY_BASE 180 + +/// base backfill priority for MBackfillReserve (inactive PG) +#define OSD_BACKFILL_INACTIVE_PRIORITY_BASE 220 + +/// base recovery priority for MRecoveryReserve (inactive PG) +#define OSD_RECOVERY_INACTIVE_PRIORITY_BASE 220 + +/// max manually/automatically set recovery priority for MBackfillReserve +#define OSD_RECOVERY_PRIORITY_MAX 253 + +/// backfill priority for MBackfillReserve, when forced manually +#define OSD_BACKFILL_PRIORITY_FORCED 254 + +/// recovery priority for MRecoveryReserve, when forced manually +#define OSD_RECOVERY_PRIORITY_FORCED 255 + +/// priority for pg deletion when osd is not fullish +#define OSD_DELETE_PRIORITY_NORMAL 179 + +/// priority for pg deletion when osd is approaching full +#define OSD_DELETE_PRIORITY_FULLISH 219 + +/// priority when more full +#define OSD_DELETE_PRIORITY_FULL 255 + +static std::map<int, int> max_prio_map = { + {OSD_BACKFILL_PRIORITY_BASE, OSD_BACKFILL_DEGRADED_PRIORITY_BASE - 1}, + {OSD_BACKFILL_DEGRADED_PRIORITY_BASE, OSD_RECOVERY_PRIORITY_BASE - 1}, + {OSD_RECOVERY_PRIORITY_BASE, OSD_BACKFILL_INACTIVE_PRIORITY_BASE - 1}, + {OSD_RECOVERY_INACTIVE_PRIORITY_BASE, OSD_RECOVERY_PRIORITY_MAX}, + {OSD_BACKFILL_INACTIVE_PRIORITY_BASE, OSD_RECOVERY_PRIORITY_MAX} +}; + +typedef hobject_t collection_list_handle_t; + +/// convert a single CPEH_OSD_FLAG_* to a std::string +const char *ceph_osd_flag_name(unsigned flag); +/// convert a single CEPH_OSD_OF_FLAG_* to a std::string +const char *ceph_osd_op_flag_name(unsigned flag); + +/// convert CEPH_OSD_FLAG_* op flags to a std::string +std::string ceph_osd_flag_string(unsigned flags); +/// conver CEPH_OSD_OP_FLAG_* op flags to a std::string +std::string ceph_osd_op_flag_string(unsigned flags); +/// conver CEPH_OSD_ALLOC_HINT_FLAG_* op flags to a std::string +std::string ceph_osd_alloc_hint_flag_string(unsigned flags); + +typedef std::map<std::string,std::string> osd_alert_list_t; +/// map osd id -> alert_list_t +typedef std::map<int, osd_alert_list_t> osd_alerts_t; +void dump(ceph::Formatter* f, const osd_alerts_t& alerts); + + +typedef interval_set< + snapid_t, + mempool::osdmap::flat_map> snap_interval_set_t; + + +/** + * osd request identifier + * + * caller name + incarnation# + tid to unique identify this request. + */ +struct osd_reqid_t { + entity_name_t name; // who + ceph_tid_t tid; + int32_t inc; // incarnation + + osd_reqid_t() + : tid(0), inc(0) + {} + osd_reqid_t(const entity_name_t& a, int i, ceph_tid_t t) + : name(a), tid(t), inc(i) + {} + + DENC(osd_reqid_t, v, p) { + DENC_START(2, 2, p); + denc(v.name, p); + denc(v.tid, p); + denc(v.inc, p); + DENC_FINISH(p); + } + void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list<osd_reqid_t*>& o); +}; +WRITE_CLASS_DENC(osd_reqid_t) + + + +struct pg_shard_t { + static const int32_t NO_OSD = 0x7fffffff; + int32_t osd; + shard_id_t shard; + pg_shard_t() : osd(-1), shard(shard_id_t::NO_SHARD) {} + explicit pg_shard_t(int osd) : osd(osd), shard(shard_id_t::NO_SHARD) {} + pg_shard_t(int osd, shard_id_t shard) : osd(osd), shard(shard) {} + bool is_undefined() const { + return osd == -1; + } + std::string get_osd() const { return (osd == NO_OSD ? "NONE" : std::to_string(osd)); } + void encode(ceph::buffer::list &bl) const; + void decode(ceph::buffer::list::const_iterator &bl); + void dump(ceph::Formatter *f) const { + f->dump_unsigned("osd", osd); + if (shard != shard_id_t::NO_SHARD) { + f->dump_unsigned("shard", shard); + } + } +}; +WRITE_CLASS_ENCODER(pg_shard_t) +WRITE_EQ_OPERATORS_2(pg_shard_t, osd, shard) +WRITE_CMP_OPERATORS_2(pg_shard_t, osd, shard) +std::ostream& operator<<(std::ostream &lhs, const pg_shard_t &rhs); + +using HobjToShardSetMapping = std::map<hobject_t, std::set<pg_shard_t>>; + +class IsPGRecoverablePredicate { +public: + /** + * have encodes the shards available + */ + virtual bool operator()(const std::set<pg_shard_t> &have) const = 0; + virtual ~IsPGRecoverablePredicate() {} +}; + +class IsPGReadablePredicate { +public: + /** + * have encodes the shards available + */ + virtual bool operator()(const std::set<pg_shard_t> &have) const = 0; + virtual ~IsPGReadablePredicate() {} +}; + +inline std::ostream& operator<<(std::ostream& out, const osd_reqid_t& r) { + return out << r.name << "." << r.inc << ":" << r.tid; +} + +inline bool operator==(const osd_reqid_t& l, const osd_reqid_t& r) { + return (l.name == r.name) && (l.inc == r.inc) && (l.tid == r.tid); +} +inline bool operator!=(const osd_reqid_t& l, const osd_reqid_t& r) { + return (l.name != r.name) || (l.inc != r.inc) || (l.tid != r.tid); +} +inline bool operator<(const osd_reqid_t& l, const osd_reqid_t& r) { + return (l.name < r.name) || (l.inc < r.inc) || + (l.name == r.name && l.inc == r.inc && l.tid < r.tid); +} +inline bool operator<=(const osd_reqid_t& l, const osd_reqid_t& r) { + return (l.name < r.name) || (l.inc < r.inc) || + (l.name == r.name && l.inc == r.inc && l.tid <= r.tid); +} +inline bool operator>(const osd_reqid_t& l, const osd_reqid_t& r) { return !(l <= r); } +inline bool operator>=(const osd_reqid_t& l, const osd_reqid_t& r) { return !(l < r); } + +namespace std { + template<> struct hash<osd_reqid_t> { + size_t operator()(const osd_reqid_t &r) const { + static hash<uint64_t> H; + return H(r.name.num() ^ r.tid ^ r.inc); + } + }; +} // namespace std + + +// ----- + +// a locator constrains the placement of an object. mainly, which pool +// does it go in. +struct object_locator_t { + // You specify either the hash or the key -- not both + std::int64_t pool; ///< pool id + std::string key; ///< key string (if non-empty) + std::string nspace; ///< namespace + std::int64_t hash; ///< hash position (if >= 0) + + explicit object_locator_t() + : pool(-1), hash(-1) {} + explicit object_locator_t(int64_t po) + : pool(po), hash(-1) {} + explicit object_locator_t(int64_t po, int64_t ps) + : pool(po), hash(ps) {} + explicit object_locator_t(int64_t po, std::string_view ns) + : pool(po), nspace(ns), hash(-1) {} + explicit object_locator_t(int64_t po, std::string_view ns, int64_t ps) + : pool(po), nspace(ns), hash(ps) {} + explicit object_locator_t(int64_t po, std::string_view ns, std::string_view s) + : pool(po), key(s), nspace(ns), hash(-1) {} + explicit object_locator_t(const hobject_t& soid) + : pool(soid.pool), key(soid.get_key()), nspace(soid.nspace), hash(-1) {} + + int64_t get_pool() const { + return pool; + } + + void clear() { + pool = -1; + key = ""; + nspace = ""; + hash = -1; + } + + bool empty() const { + return pool == -1; + } + + void encode(ceph::buffer::list& bl) const; + void decode(ceph::buffer::list::const_iterator& p); + void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list<object_locator_t*>& o); +}; +WRITE_CLASS_ENCODER(object_locator_t) + +inline bool operator==(const object_locator_t& l, const object_locator_t& r) { + return l.pool == r.pool && l.key == r.key && l.nspace == r.nspace && l.hash == r.hash; +} +inline bool operator!=(const object_locator_t& l, const object_locator_t& r) { + return !(l == r); +} + +inline std::ostream& operator<<(std::ostream& out, const object_locator_t& loc) +{ + out << "@" << loc.pool; + if (loc.nspace.length()) + out << ";" << loc.nspace; + if (loc.key.length()) + out << ":" << loc.key; + return out; +} + +struct request_redirect_t { +private: + object_locator_t redirect_locator; ///< this is authoritative + std::string redirect_object; ///< If non-empty, the request goes to this object name + + friend std::ostream& operator<<(std::ostream& out, const request_redirect_t& redir); +public: + + request_redirect_t() {} + explicit request_redirect_t(const object_locator_t& orig, int64_t rpool) : + redirect_locator(orig) { redirect_locator.pool = rpool; } + explicit request_redirect_t(const object_locator_t& rloc) : + redirect_locator(rloc) {} + explicit request_redirect_t(const object_locator_t& orig, + const std::string& robj) : + redirect_locator(orig), redirect_object(robj) {} + + bool empty() const { return redirect_locator.empty() && + redirect_object.empty(); } + + void combine_with_locator(object_locator_t& orig, std::string& obj) const { + orig = redirect_locator; + if (!redirect_object.empty()) + obj = redirect_object; + } + + void encode(ceph::buffer::list& bl) const; + void decode(ceph::buffer::list::const_iterator& bl); + void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list<request_redirect_t*>& o); +}; +WRITE_CLASS_ENCODER(request_redirect_t) + +inline std::ostream& operator<<(std::ostream& out, const request_redirect_t& redir) { + out << "object " << redir.redirect_object << ", locator{" << redir.redirect_locator << "}"; + return out; +} + +// Internal OSD op flags - set by the OSD based on the op types +enum { + CEPH_OSD_RMW_FLAG_READ = (1 << 1), + CEPH_OSD_RMW_FLAG_WRITE = (1 << 2), + CEPH_OSD_RMW_FLAG_CLASS_READ = (1 << 3), + CEPH_OSD_RMW_FLAG_CLASS_WRITE = (1 << 4), + CEPH_OSD_RMW_FLAG_PGOP = (1 << 5), + CEPH_OSD_RMW_FLAG_CACHE = (1 << 6), + CEPH_OSD_RMW_FLAG_FORCE_PROMOTE = (1 << 7), + CEPH_OSD_RMW_FLAG_SKIP_HANDLE_CACHE = (1 << 8), + CEPH_OSD_RMW_FLAG_SKIP_PROMOTE = (1 << 9), + CEPH_OSD_RMW_FLAG_RWORDERED = (1 << 10), + CEPH_OSD_RMW_FLAG_RETURNVEC = (1 << 11), +}; + + +// pg stuff + +#define OSD_SUPERBLOCK_GOBJECT ghobject_t(hobject_t(sobject_t(object_t("osd_superblock"), 0))) + +// placement seed (a hash value) +typedef uint32_t ps_t; + +// old (v1) pg_t encoding (wrap old struct ceph_pg) +struct old_pg_t { + ceph_pg v; + void encode(ceph::buffer::list& bl) const { + ceph::encode_raw(v, bl); + } + void decode(ceph::buffer::list::const_iterator& bl) { + ceph::decode_raw(v, bl); + } +}; +WRITE_CLASS_ENCODER(old_pg_t) + +// placement group id +struct pg_t { + uint64_t m_pool; + uint32_t m_seed; + + pg_t() : m_pool(0), m_seed(0) {} + pg_t(ps_t seed, uint64_t pool) : + m_pool(pool), m_seed(seed) {} + // cppcheck-suppress noExplicitConstructor + pg_t(const ceph_pg& cpg) : + m_pool(cpg.pool), m_seed(cpg.ps) {} + + // cppcheck-suppress noExplicitConstructor + pg_t(const old_pg_t& opg) { + *this = opg.v; + } + + old_pg_t get_old_pg() const { + old_pg_t o; + ceph_assert(m_pool < 0xffffffffull); + o.v.pool = m_pool; + o.v.ps = m_seed; + o.v.preferred = (__s16)-1; + return o; + } + + ps_t ps() const { + return m_seed; + } + int64_t pool() const { + return m_pool; + } + + static const uint8_t calc_name_buf_size = 36; // max length for max values len("18446744073709551615.ffffffff") + future suffix len("_head") + '\0' + char *calc_name(char *buf, const char *suffix_backwords) const; + + void set_ps(ps_t p) { + m_seed = p; + } + void set_pool(uint64_t p) { + m_pool = p; + } + + pg_t get_parent() const; + pg_t get_ancestor(unsigned old_pg_num) const; + + int print(char *o, int maxlen) const; + bool parse(const char *s); + + bool is_split(unsigned old_pg_num, unsigned new_pg_num, std::set<pg_t> *pchildren) const; + + bool is_merge_source(unsigned old_pg_num, unsigned new_pg_num, pg_t *parent) const; + bool is_merge_target(unsigned old_pg_num, unsigned new_pg_num) const { + return ps() < new_pg_num && is_split(new_pg_num, old_pg_num, nullptr); + } + + /** + * Returns b such that for all object o: + * ~((~0)<<b) & o.hash) == 0 iff o is in the pg for *this + */ + unsigned get_split_bits(unsigned pg_num) const; + + bool contains(int bits, const ghobject_t& oid) const { + return + (int64_t)m_pool == oid.hobj.get_logical_pool() && + oid.match(bits, ps()); + } + bool contains(int bits, const hobject_t& oid) const { + return + (int64_t)m_pool == oid.get_logical_pool() && + oid.match(bits, ps()); + } + + hobject_t get_hobj_start() const; + hobject_t get_hobj_end(unsigned pg_num) const; + + // strong ordering is supported + inline int compare(const pg_t& p) const noexcept { + if (auto delta = pool() - p.pool(); delta != 0) { + return delta; + } else if (ps() < p.ps()) { + return -1; + } else if (ps() > p.ps()) { + return 1; + } else { + return 0; + } + } + + void encode(ceph::buffer::list& bl) const { + using ceph::encode; + __u8 v = 1; + encode(v, bl); + encode(m_pool, bl); + encode(m_seed, bl); + encode((int32_t)-1, bl); // was preferred + } + void decode(ceph::buffer::list::const_iterator& bl) { + using ceph::decode; + __u8 v; + decode(v, bl); + decode(m_pool, bl); + decode(m_seed, bl); + bl += sizeof(int32_t); // was preferred + } + void decode_old(ceph::buffer::list::const_iterator& bl) { + using ceph::decode; + old_pg_t opg; + decode(opg, bl); + *this = opg; + } + void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list<pg_t*>& o); +}; +WRITE_CLASS_ENCODER(pg_t) + +inline bool operator<(const pg_t& l, const pg_t& r) { + return l.compare(r) < 0; +} +inline bool operator<=(const pg_t& l, const pg_t& r) { + return l.compare(r) <= 0; +} +inline bool operator==(const pg_t& l, const pg_t& r) { + return l.compare(r) == 0; +} +inline bool operator!=(const pg_t& l, const pg_t& r) { + return l.compare(r) != 0; +} +inline bool operator>(const pg_t& l, const pg_t& r) { + return l.compare(r) > 0; +} +inline bool operator>=(const pg_t& l, const pg_t& r) { + return l.compare(r) >= 0; +} + +std::ostream& operator<<(std::ostream& out, const pg_t &pg); + +namespace std { + template<> struct hash< pg_t > + { + size_t operator()( const pg_t& x ) const + { + static hash<uint32_t> H; + // xor (s32)-1 in there to preserve original m_preferred result (paranoia!) + return H((x.pool() & 0xffffffff) ^ (x.pool() >> 32) ^ x.ps() ^ (int32_t)(-1)); + } + }; +} // namespace std + +struct spg_t { + pg_t pgid; + shard_id_t shard; + spg_t() : shard(shard_id_t::NO_SHARD) {} + spg_t(pg_t pgid, shard_id_t shard) : pgid(pgid), shard(shard) {} + explicit spg_t(pg_t pgid) : pgid(pgid), shard(shard_id_t::NO_SHARD) {} + unsigned get_split_bits(unsigned pg_num) const { + return pgid.get_split_bits(pg_num); + } + spg_t get_parent() const { + return spg_t(pgid.get_parent(), shard); + } + ps_t ps() const { + return pgid.ps(); + } + uint64_t pool() const { + return pgid.pool(); + } + void reset_shard(shard_id_t s) { + shard = s; + } + + static const uint8_t calc_name_buf_size = pg_t::calc_name_buf_size + 4; // 36 + len('s') + len("255"); + char *calc_name(char *buf, const char *suffix_backwords) const; + + bool parse(const char *s); + bool parse(const std::string& s) { + return parse(s.c_str()); + } + + spg_t get_ancestor(unsigned old_pg_num) const { + return spg_t(pgid.get_ancestor(old_pg_num), shard); + } + + bool is_split(unsigned old_pg_num, unsigned new_pg_num, + std::set<spg_t> *pchildren) const { + std::set<pg_t> _children; + std::set<pg_t> *children = pchildren ? &_children : NULL; + bool is_split = pgid.is_split(old_pg_num, new_pg_num, children); + if (pchildren && is_split) { + for (std::set<pg_t>::iterator i = _children.begin(); + i != _children.end(); + ++i) { + pchildren->insert(spg_t(*i, shard)); + } + } + return is_split; + } + bool is_merge_target(unsigned old_pg_num, unsigned new_pg_num) const { + return pgid.is_merge_target(old_pg_num, new_pg_num); + } + bool is_merge_source(unsigned old_pg_num, unsigned new_pg_num, + spg_t *parent) const { + spg_t out = *this; + bool r = pgid.is_merge_source(old_pg_num, new_pg_num, &out.pgid); + if (r && parent) { + *parent = out; + } + return r; + } + + bool is_no_shard() const { + return shard == shard_id_t::NO_SHARD; + } + + ghobject_t make_pgmeta_oid() const { + return ghobject_t::make_pgmeta(pgid.pool(), pgid.ps(), shard); + } + + void encode(ceph::buffer::list &bl) const { + ENCODE_START(1, 1, bl); + encode(pgid, bl); + encode(shard, bl); + ENCODE_FINISH(bl); + } + void decode(ceph::buffer::list::const_iterator& bl) { + DECODE_START(1, bl); + decode(pgid, bl); + decode(shard, bl); + DECODE_FINISH(bl); + } + + ghobject_t make_temp_ghobject(const std::string& name) const { + return ghobject_t( + hobject_t(object_t(name), "", CEPH_NOSNAP, + pgid.ps(), + hobject_t::get_temp_pool(pgid.pool()), + ""), + ghobject_t::NO_GEN, + shard); + } + + unsigned hash_to_shard(unsigned num_shards) const { + return ps() % num_shards; + } +}; +WRITE_CLASS_ENCODER(spg_t) +WRITE_EQ_OPERATORS_2(spg_t, pgid, shard) +WRITE_CMP_OPERATORS_2(spg_t, pgid, shard) + +namespace std { + template<> struct hash< spg_t > + { + size_t operator()( const spg_t& x ) const + { + static hash<uint32_t> H; + return H(hash<pg_t>()(x.pgid) ^ x.shard); + } + }; +} // namespace std + +std::ostream& operator<<(std::ostream& out, const spg_t &pg); + +// ---------------------- + +class coll_t { + enum type_t { + TYPE_META = 0, + TYPE_LEGACY_TEMP = 1, /* no longer used */ + TYPE_PG = 2, + TYPE_PG_TEMP = 3, + }; + type_t type; + spg_t pgid; + uint64_t removal_seq; // note: deprecated, not encoded + + char _str_buff[spg_t::calc_name_buf_size]; + char *_str; + + void calc_str(); + + coll_t(type_t t, spg_t p, uint64_t r) + : type(t), pgid(p), removal_seq(r) { + calc_str(); + } + +public: + coll_t() : type(TYPE_META), removal_seq(0) + { + calc_str(); + } + + coll_t(const coll_t& other) + : type(other.type), pgid(other.pgid), removal_seq(other.removal_seq) { + calc_str(); + } + + explicit coll_t(spg_t pgid) + : type(TYPE_PG), pgid(pgid), removal_seq(0) + { + calc_str(); + } + + coll_t& operator=(const coll_t& rhs) + { + this->type = rhs.type; + this->pgid = rhs.pgid; + this->removal_seq = rhs.removal_seq; + this->calc_str(); + return *this; + } + + // named constructors + static coll_t meta() { + return coll_t(); + } + static coll_t pg(spg_t p) { + return coll_t(p); + } + + const std::string to_str() const { + return std::string(_str); + } + const char *c_str() const { + return _str; + } + + bool parse(const std::string& s); + + int operator<(const coll_t &rhs) const { + return type < rhs.type || + (type == rhs.type && pgid < rhs.pgid); + } + + bool is_meta() const { + return type == TYPE_META; + } + bool is_pg_prefix(spg_t *pgid_) const { + if (type == TYPE_PG || type == TYPE_PG_TEMP) { + *pgid_ = pgid; + return true; + } + return false; + } + bool is_pg() const { + return type == TYPE_PG; + } + bool is_pg(spg_t *pgid_) const { + if (type == TYPE_PG) { + *pgid_ = pgid; + return true; + } + return false; + } + bool is_temp() const { + return type == TYPE_PG_TEMP; + } + bool is_temp(spg_t *pgid_) const { + if (type == TYPE_PG_TEMP) { + *pgid_ = pgid; + return true; + } + return false; + } + int64_t pool() const { + return pgid.pool(); + } + + void encode(ceph::buffer::list& bl) const; + void decode(ceph::buffer::list::const_iterator& bl); + size_t encoded_size() const; + + inline bool operator==(const coll_t& rhs) const { + // only compare type if meta + if (type != rhs.type) + return false; + if (type == TYPE_META) + return true; + return type == rhs.type && pgid == rhs.pgid; + } + inline bool operator!=(const coll_t& rhs) const { + return !(*this == rhs); + } + + // get a TEMP collection that corresponds to the current collection, + // which we presume is a pg collection. + coll_t get_temp() const { + ceph_assert(type == TYPE_PG); + return coll_t(TYPE_PG_TEMP, pgid, 0); + } + + ghobject_t get_min_hobj() const { + ghobject_t o; + switch (type) { + case TYPE_PG: + o.hobj.pool = pgid.pool(); + o.set_shard(pgid.shard); + break; + case TYPE_META: + o.hobj.pool = -1; + break; + default: + break; + } + return o; + } + + unsigned hash_to_shard(unsigned num_shards) const { + if (type == TYPE_PG) + return pgid.hash_to_shard(num_shards); + return 0; // whatever. + } + + void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list<coll_t*>& o); +}; + +WRITE_CLASS_ENCODER(coll_t) + +inline std::ostream& operator<<(std::ostream& out, const coll_t& c) { + out << c.to_str(); + return out; +} + +namespace std { + template<> struct hash<coll_t> { + size_t operator()(const coll_t &c) const { + size_t h = 0; + std::string str(c.to_str()); + std::string::const_iterator end(str.end()); + for (std::string::const_iterator s = str.begin(); s != end; ++s) { + h += *s; + h += (h << 10); + h ^= (h >> 6); + } + h += (h << 3); + h ^= (h >> 11); + h += (h << 15); + return h; + } + }; +} // namespace std + +inline std::ostream& operator<<(std::ostream& out, const ceph_object_layout &ol) +{ + out << pg_t(ol.ol_pgid); + int su = ol.ol_stripe_unit; + if (su) + out << ".su=" << su; + return out; +} + + + +// compound rados version type +/* WARNING: If add member in eversion_t, please make sure the encode/decode function + * work well. For little-endian machine, we should make sure there is no padding + * in 32-bit machine and 64-bit machine. + */ +class eversion_t { +public: + version_t version; + epoch_t epoch; + __u32 __pad; + eversion_t() : version(0), epoch(0), __pad(0) {} + eversion_t(epoch_t e, version_t v) : version(v), epoch(e), __pad(0) {} + + // cppcheck-suppress noExplicitConstructor + eversion_t(const ceph_eversion& ce) : + version(ce.version), + epoch(ce.epoch), + __pad(0) { } + + explicit eversion_t(ceph::buffer::list& bl) : __pad(0) { decode(bl); } + + static const eversion_t& max() { + static const eversion_t max(-1,-1); + return max; + } + + operator ceph_eversion() { + ceph_eversion c; + c.epoch = epoch; + c.version = version; + return c; + } + + std::string get_key_name() const; + + // key must point to the beginning of a block of 32 chars + inline void get_key_name(char* key) const { + // Below is equivalent of sprintf("%010u.%020llu"); + key[31] = 0; + ritoa<uint64_t, 10, 20>(version, key + 31); + key[10] = '.'; + ritoa<uint32_t, 10, 10>(epoch, key + 10); + } + + void encode(ceph::buffer::list &bl) const { +#if defined(CEPH_LITTLE_ENDIAN) + bl.append((char *)this, sizeof(version_t) + sizeof(epoch_t)); +#else + using ceph::encode; + encode(version, bl); + encode(epoch, bl); +#endif + } + void decode(ceph::buffer::list::const_iterator &bl) { +#if defined(CEPH_LITTLE_ENDIAN) + bl.copy(sizeof(version_t) + sizeof(epoch_t), (char *)this); +#else + using ceph::decode; + decode(version, bl); + decode(epoch, bl); +#endif + } + void decode(ceph::buffer::list& bl) { + auto p = std::cbegin(bl); + decode(p); + } +}; +WRITE_CLASS_ENCODER(eversion_t) + +inline bool operator==(const eversion_t& l, const eversion_t& r) { + return (l.epoch == r.epoch) && (l.version == r.version); +} +inline bool operator!=(const eversion_t& l, const eversion_t& r) { + return (l.epoch != r.epoch) || (l.version != r.version); +} +inline bool operator<(const eversion_t& l, const eversion_t& r) { + return (l.epoch == r.epoch) ? (l.version < r.version):(l.epoch < r.epoch); +} +inline bool operator<=(const eversion_t& l, const eversion_t& r) { + return (l.epoch == r.epoch) ? (l.version <= r.version):(l.epoch <= r.epoch); +} +inline bool operator>(const eversion_t& l, const eversion_t& r) { + return (l.epoch == r.epoch) ? (l.version > r.version):(l.epoch > r.epoch); +} +inline bool operator>=(const eversion_t& l, const eversion_t& r) { + return (l.epoch == r.epoch) ? (l.version >= r.version):(l.epoch >= r.epoch); +} +inline std::ostream& operator<<(std::ostream& out, const eversion_t& e) { + return out << e.epoch << "'" << e.version; +} + +/** + * objectstore_perf_stat_t + * + * current perf information about the osd + */ +struct objectstore_perf_stat_t { + // cur_op_latency is in ns since double add/sub are not associative + uint64_t os_commit_latency_ns; + uint64_t os_apply_latency_ns; + + objectstore_perf_stat_t() : + os_commit_latency_ns(0), os_apply_latency_ns(0) {} + + bool operator==(const objectstore_perf_stat_t &r) const { + return os_commit_latency_ns == r.os_commit_latency_ns && + os_apply_latency_ns == r.os_apply_latency_ns; + } + + void add(const objectstore_perf_stat_t &o) { + os_commit_latency_ns += o.os_commit_latency_ns; + os_apply_latency_ns += o.os_apply_latency_ns; + } + void sub(const objectstore_perf_stat_t &o) { + os_commit_latency_ns -= o.os_commit_latency_ns; + os_apply_latency_ns -= o.os_apply_latency_ns; + } + void dump(ceph::Formatter *f) const; + void encode(ceph::buffer::list &bl, uint64_t features) const; + void decode(ceph::buffer::list::const_iterator &bl); + static void generate_test_instances(std::list<objectstore_perf_stat_t*>& o); +}; +WRITE_CLASS_ENCODER_FEATURES(objectstore_perf_stat_t) + +/* + * pg states + */ +#define PG_STATE_CREATING (1ULL << 0) // creating +#define PG_STATE_ACTIVE (1ULL << 1) // i am active. (primary: replicas too) +#define PG_STATE_CLEAN (1ULL << 2) // peers are complete, clean of stray replicas. +#define PG_STATE_DOWN (1ULL << 4) // a needed replica is down, PG offline +#define PG_STATE_RECOVERY_UNFOUND (1ULL << 5) // recovery stopped due to unfound +#define PG_STATE_BACKFILL_UNFOUND (1ULL << 6) // backfill stopped due to unfound +#define PG_STATE_PREMERGE (1ULL << 7) // i am prepare to merging +#define PG_STATE_SCRUBBING (1ULL << 8) // scrubbing +//#define PG_STATE_SCRUBQ (1ULL << 9) // queued for scrub +#define PG_STATE_DEGRADED (1ULL << 10) // pg contains objects with reduced redundancy +#define PG_STATE_INCONSISTENT (1ULL << 11) // pg replicas are inconsistent (but shouldn't be) +#define PG_STATE_PEERING (1ULL << 12) // pg is (re)peering +#define PG_STATE_REPAIR (1ULL << 13) // pg should repair on next scrub +#define PG_STATE_RECOVERING (1ULL << 14) // pg is recovering/migrating objects +#define PG_STATE_BACKFILL_WAIT (1ULL << 15) // [active] reserving backfill +#define PG_STATE_INCOMPLETE (1ULL << 16) // incomplete content, peering failed. +#define PG_STATE_STALE (1ULL << 17) // our state for this pg is stale, unknown. +#define PG_STATE_REMAPPED (1ULL << 18) // pg is explicitly remapped to different OSDs than CRUSH +#define PG_STATE_DEEP_SCRUB (1ULL << 19) // deep scrub: check CRC32 on files +#define PG_STATE_BACKFILLING (1ULL << 20) // [active] backfilling pg content +#define PG_STATE_BACKFILL_TOOFULL (1ULL << 21) // backfill can't proceed: too full +#define PG_STATE_RECOVERY_WAIT (1ULL << 22) // waiting for recovery reservations +#define PG_STATE_UNDERSIZED (1ULL << 23) // pg acting < pool size +#define PG_STATE_ACTIVATING (1ULL << 24) // pg is peered but not yet active +#define PG_STATE_PEERED (1ULL << 25) // peered, cannot go active, can recover +#define PG_STATE_SNAPTRIM (1ULL << 26) // trimming snaps +#define PG_STATE_SNAPTRIM_WAIT (1ULL << 27) // queued to trim snaps +#define PG_STATE_RECOVERY_TOOFULL (1ULL << 28) // recovery can't proceed: too full +#define PG_STATE_SNAPTRIM_ERROR (1ULL << 29) // error stopped trimming snaps +#define PG_STATE_FORCED_RECOVERY (1ULL << 30) // force recovery of this pg before any other +#define PG_STATE_FORCED_BACKFILL (1ULL << 31) // force backfill of this pg before any other +#define PG_STATE_FAILED_REPAIR (1ULL << 32) // A repair failed to fix all errors +#define PG_STATE_LAGGY (1ULL << 33) // PG is laggy/unreabable due to slow/delayed pings +#define PG_STATE_WAIT (1ULL << 34) // PG is waiting for prior intervals' readable period to expire + +std::string pg_state_string(uint64_t state); +std::string pg_vector_string(const std::vector<int32_t> &a); +std::optional<uint64_t> pg_string_state(const std::string& state); + + +/* + * pool_snap_info_t + * + * attributes for a single pool snapshot. + */ +struct pool_snap_info_t { + snapid_t snapid; + utime_t stamp; + std::string name; + + void dump(ceph::Formatter *f) const; + void encode(ceph::buffer::list& bl, uint64_t features) const; + void decode(ceph::buffer::list::const_iterator& bl); + static void generate_test_instances(std::list<pool_snap_info_t*>& o); +}; +WRITE_CLASS_ENCODER_FEATURES(pool_snap_info_t) + +inline std::ostream& operator<<(std::ostream& out, const pool_snap_info_t& si) { + return out << si.snapid << '(' << si.name << ' ' << si.stamp << ')'; +} + + +/* + * pool_opts_t + * + * pool options. + */ + +// The order of items in the list is important, therefore, +// you should always add to the end of the list when adding new options. + +class pool_opts_t { +public: + enum key_t { + SCRUB_MIN_INTERVAL, + SCRUB_MAX_INTERVAL, + DEEP_SCRUB_INTERVAL, + RECOVERY_PRIORITY, + RECOVERY_OP_PRIORITY, + SCRUB_PRIORITY, + COMPRESSION_MODE, + COMPRESSION_ALGORITHM, + COMPRESSION_REQUIRED_RATIO, + COMPRESSION_MAX_BLOB_SIZE, + COMPRESSION_MIN_BLOB_SIZE, + CSUM_TYPE, + CSUM_MAX_BLOCK, + CSUM_MIN_BLOCK, + FINGERPRINT_ALGORITHM, + PG_NUM_MIN, // min pg_num + TARGET_SIZE_BYTES, // total bytes in pool + TARGET_SIZE_RATIO, // fraction of total cluster + PG_AUTOSCALE_BIAS, + READ_LEASE_INTERVAL, + DEDUP_TIER, + DEDUP_CHUNK_ALGORITHM, + DEDUP_CDC_CHUNK_SIZE, + PG_NUM_MAX, // max pg_num + }; + + enum type_t { + STR, + INT, + DOUBLE, + }; + + struct opt_desc_t { + key_t key; + type_t type; + + opt_desc_t(key_t k, type_t t) : key(k), type(t) {} + + bool operator==(const opt_desc_t& rhs) const { + return key == rhs.key && type == rhs.type; + } + }; + + typedef boost::variant<std::string,int64_t,double> value_t; + + static bool is_opt_name(const std::string& name); + static opt_desc_t get_opt_desc(const std::string& name); + + pool_opts_t() : opts() {} + + bool is_set(key_t key) const; + + template<typename T> + void set(key_t key, const T &val) { + value_t value = val; + opts[key] = value; + } + + template<typename T> + bool get(key_t key, T *val) const { + opts_t::const_iterator i = opts.find(key); + if (i == opts.end()) { + return false; + } + *val = boost::get<T>(i->second); + return true; + } + + template<typename T> + T value_or(key_t key, T&& default_value) const { + auto i = opts.find(key); + if (i == opts.end()) { + return std::forward<T>(default_value); + } + return boost::get<T>(i->second); + } + + const value_t& get(key_t key) const; + + bool unset(key_t key); + + void dump(const std::string& name, ceph::Formatter *f) const; + + void dump(ceph::Formatter *f) const; + void encode(ceph::buffer::list &bl, uint64_t features) const; + void decode(ceph::buffer::list::const_iterator &bl); + +private: + typedef std::map<key_t, value_t> opts_t; + opts_t opts; + + friend std::ostream& operator<<(std::ostream& out, const pool_opts_t& opts); +}; +WRITE_CLASS_ENCODER_FEATURES(pool_opts_t) + +struct pg_merge_meta_t { + pg_t source_pgid; + epoch_t ready_epoch = 0; + epoch_t last_epoch_started = 0; + epoch_t last_epoch_clean = 0; + eversion_t source_version; + eversion_t target_version; + + void encode(ceph::buffer::list& bl) const { + ENCODE_START(1, 1, bl); + encode(source_pgid, bl); + encode(ready_epoch, bl); + encode(last_epoch_started, bl); + encode(last_epoch_clean, bl); + encode(source_version, bl); + encode(target_version, bl); + ENCODE_FINISH(bl); + } + void decode(ceph::buffer::list::const_iterator& p) { + DECODE_START(1, p); + decode(source_pgid, p); + decode(ready_epoch, p); + decode(last_epoch_started, p); + decode(last_epoch_clean, p); + decode(source_version, p); + decode(target_version, p); + DECODE_FINISH(p); + } + void dump(ceph::Formatter *f) const { + f->dump_stream("source_pgid") << source_pgid; + f->dump_unsigned("ready_epoch", ready_epoch); + f->dump_unsigned("last_epoch_started", last_epoch_started); + f->dump_unsigned("last_epoch_clean", last_epoch_clean); + f->dump_stream("source_version") << source_version; + f->dump_stream("target_version") << target_version; + } +}; +WRITE_CLASS_ENCODER(pg_merge_meta_t) + +class OSDMap; + +/* + * pg_pool + */ +struct pg_pool_t { + static const char *APPLICATION_NAME_CEPHFS; + static const char *APPLICATION_NAME_RBD; + static const char *APPLICATION_NAME_RGW; + + enum { + TYPE_REPLICATED = 1, // replication + //TYPE_RAID4 = 2, // raid4 (never implemented) + TYPE_ERASURE = 3, // erasure-coded + }; + static constexpr uint32_t pg_CRUSH_ITEM_NONE = 0x7fffffff; /* can't import crush.h here */ + static std::string_view get_type_name(int t) { + switch (t) { + case TYPE_REPLICATED: return "replicated"; + //case TYPE_RAID4: return "raid4"; + case TYPE_ERASURE: return "erasure"; + default: return "???"; + } + } + std::string_view get_type_name() const { + return get_type_name(type); + } + + enum { + FLAG_HASHPSPOOL = 1<<0, // hash pg seed and pool together (instead of adding) + FLAG_FULL = 1<<1, // pool is full + FLAG_EC_OVERWRITES = 1<<2, // enables overwrites, once enabled, cannot be disabled + FLAG_INCOMPLETE_CLONES = 1<<3, // may have incomplete clones (bc we are/were an overlay) + FLAG_NODELETE = 1<<4, // pool can't be deleted + FLAG_NOPGCHANGE = 1<<5, // pool's pg and pgp num can't be changed + FLAG_NOSIZECHANGE = 1<<6, // pool's size and min size can't be changed + FLAG_WRITE_FADVISE_DONTNEED = 1<<7, // write mode with LIBRADOS_OP_FLAG_FADVISE_DONTNEED + FLAG_NOSCRUB = 1<<8, // block periodic scrub + FLAG_NODEEP_SCRUB = 1<<9, // block periodic deep-scrub + FLAG_FULL_QUOTA = 1<<10, // pool is currently running out of quota, will set FLAG_FULL too + FLAG_NEARFULL = 1<<11, // pool is nearfull + FLAG_BACKFILLFULL = 1<<12, // pool is backfillfull + FLAG_SELFMANAGED_SNAPS = 1<<13, // pool uses selfmanaged snaps + FLAG_POOL_SNAPS = 1<<14, // pool has pool snaps + FLAG_CREATING = 1<<15, // initial pool PGs are being created + FLAG_BULK = 1<<17, //pool is large + }; + + static const char *get_flag_name(int f) { + switch (f) { + case FLAG_HASHPSPOOL: return "hashpspool"; + case FLAG_FULL: return "full"; + case FLAG_EC_OVERWRITES: return "ec_overwrites"; + case FLAG_INCOMPLETE_CLONES: return "incomplete_clones"; + case FLAG_NODELETE: return "nodelete"; + case FLAG_NOPGCHANGE: return "nopgchange"; + case FLAG_NOSIZECHANGE: return "nosizechange"; + case FLAG_WRITE_FADVISE_DONTNEED: return "write_fadvise_dontneed"; + case FLAG_NOSCRUB: return "noscrub"; + case FLAG_NODEEP_SCRUB: return "nodeep-scrub"; + case FLAG_FULL_QUOTA: return "full_quota"; + case FLAG_NEARFULL: return "nearfull"; + case FLAG_BACKFILLFULL: return "backfillfull"; + case FLAG_SELFMANAGED_SNAPS: return "selfmanaged_snaps"; + case FLAG_POOL_SNAPS: return "pool_snaps"; + case FLAG_CREATING: return "creating"; + case FLAG_BULK: return "bulk"; + default: return "???"; + } + } + static std::string get_flags_string(uint64_t f) { + std::string s; + for (unsigned n=0; f && n<64; ++n) { + if (f & (1ull << n)) { + if (s.length()) + s += ","; + s += get_flag_name(1ull << n); + } + } + return s; + } + std::string get_flags_string() const { + return get_flags_string(flags); + } + static uint64_t get_flag_by_name(const std::string& name) { + if (name == "hashpspool") + return FLAG_HASHPSPOOL; + if (name == "full") + return FLAG_FULL; + if (name == "ec_overwrites") + return FLAG_EC_OVERWRITES; + if (name == "incomplete_clones") + return FLAG_INCOMPLETE_CLONES; + if (name == "nodelete") + return FLAG_NODELETE; + if (name == "nopgchange") + return FLAG_NOPGCHANGE; + if (name == "nosizechange") + return FLAG_NOSIZECHANGE; + if (name == "write_fadvise_dontneed") + return FLAG_WRITE_FADVISE_DONTNEED; + if (name == "noscrub") + return FLAG_NOSCRUB; + if (name == "nodeep-scrub") + return FLAG_NODEEP_SCRUB; + if (name == "full_quota") + return FLAG_FULL_QUOTA; + if (name == "nearfull") + return FLAG_NEARFULL; + if (name == "backfillfull") + return FLAG_BACKFILLFULL; + if (name == "selfmanaged_snaps") + return FLAG_SELFMANAGED_SNAPS; + if (name == "pool_snaps") + return FLAG_POOL_SNAPS; + if (name == "creating") + return FLAG_CREATING; + if (name == "bulk") + return FLAG_BULK; + return 0; + } + + /// converts the acting/up vector to a set of pg shards + void convert_to_pg_shards(const std::vector<int> &from, std::set<pg_shard_t>* to) const; + + typedef enum { + CACHEMODE_NONE = 0, ///< no caching + CACHEMODE_WRITEBACK = 1, ///< write to cache, flush later + CACHEMODE_FORWARD = 2, ///< forward if not in cache + CACHEMODE_READONLY = 3, ///< handle reads, forward writes [not strongly consistent] + CACHEMODE_READFORWARD = 4, ///< forward reads, write to cache flush later + CACHEMODE_READPROXY = 5, ///< proxy reads, write to cache flush later + CACHEMODE_PROXY = 6, ///< proxy if not in cache + } cache_mode_t; + static const char *get_cache_mode_name(cache_mode_t m) { + switch (m) { + case CACHEMODE_NONE: return "none"; + case CACHEMODE_WRITEBACK: return "writeback"; + case CACHEMODE_FORWARD: return "forward"; + case CACHEMODE_READONLY: return "readonly"; + case CACHEMODE_READFORWARD: return "readforward"; + case CACHEMODE_READPROXY: return "readproxy"; + case CACHEMODE_PROXY: return "proxy"; + default: return "unknown"; + } + } + static cache_mode_t get_cache_mode_from_str(const std::string& s) { + if (s == "none") + return CACHEMODE_NONE; + if (s == "writeback") + return CACHEMODE_WRITEBACK; + if (s == "forward") + return CACHEMODE_FORWARD; + if (s == "readonly") + return CACHEMODE_READONLY; + if (s == "readforward") + return CACHEMODE_READFORWARD; + if (s == "readproxy") + return CACHEMODE_READPROXY; + if (s == "proxy") + return CACHEMODE_PROXY; + return (cache_mode_t)-1; + } + const char *get_cache_mode_name() const { + return get_cache_mode_name(cache_mode); + } + bool cache_mode_requires_hit_set() const { + switch (cache_mode) { + case CACHEMODE_NONE: + case CACHEMODE_FORWARD: + case CACHEMODE_READONLY: + case CACHEMODE_PROXY: + return false; + case CACHEMODE_WRITEBACK: + case CACHEMODE_READFORWARD: + case CACHEMODE_READPROXY: + return true; + default: + ceph_abort_msg("implement me"); + } + } + + enum class pg_autoscale_mode_t : uint8_t { + OFF = 0, + WARN = 1, + ON = 2, + UNKNOWN = UINT8_MAX, + }; + static const char *get_pg_autoscale_mode_name(pg_autoscale_mode_t m) { + switch (m) { + case pg_autoscale_mode_t::OFF: return "off"; + case pg_autoscale_mode_t::ON: return "on"; + case pg_autoscale_mode_t::WARN: return "warn"; + default: return "???"; + } + } + static pg_autoscale_mode_t get_pg_autoscale_mode_by_name(const std::string& m) { + if (m == "off") { + return pg_autoscale_mode_t::OFF; + } + if (m == "warn") { + return pg_autoscale_mode_t::WARN; + } + if (m == "on") { + return pg_autoscale_mode_t::ON; + } + return pg_autoscale_mode_t::UNKNOWN; + } + + utime_t create_time; + uint64_t flags = 0; ///< FLAG_* + __u8 type = 0; ///< TYPE_* + __u8 size = 0, min_size = 0; ///< number of osds in each pg + __u8 crush_rule = 0; ///< crush placement rule + __u8 object_hash = 0; ///< hash mapping object name to ps + pg_autoscale_mode_t pg_autoscale_mode = pg_autoscale_mode_t::UNKNOWN; + +private: + __u32 pg_num = 0, pgp_num = 0; ///< number of pgs + __u32 pg_num_pending = 0; ///< pg_num we are about to merge down to + __u32 pg_num_target = 0; ///< pg_num we should converge toward + __u32 pgp_num_target = 0; ///< pgp_num we should converge toward + +public: + std::map<std::string, std::string> properties; ///< OBSOLETE + std::string erasure_code_profile; ///< name of the erasure code profile in OSDMap + epoch_t last_change = 0; ///< most recent epoch changed, exclusing snapshot changes + // If non-zero, require OSDs in at least this many different instances... + uint32_t peering_crush_bucket_count = 0; + // of this bucket type... + uint32_t peering_crush_bucket_barrier = 0; + // including this one + int32_t peering_crush_mandatory_member = pg_CRUSH_ITEM_NONE; + // The per-bucket replica count is calculated with this "target" + // instead of the above crush_bucket_count. This means we can maintain a + // target size of 4 without attempting to place them all in 1 DC + uint32_t peering_crush_bucket_target = 0; + /// last epoch that forced clients to resend + epoch_t last_force_op_resend = 0; + /// last epoch that forced clients to resend (pre-nautilus clients only) + epoch_t last_force_op_resend_prenautilus = 0; + /// last epoch that forced clients to resend (pre-luminous clients only) + epoch_t last_force_op_resend_preluminous = 0; + + /// metadata for the most recent PG merge + pg_merge_meta_t last_pg_merge_meta; + + snapid_t snap_seq = 0; ///< seq for per-pool snapshot + epoch_t snap_epoch = 0; ///< osdmap epoch of last snap + uint64_t auid = 0; ///< who owns the pg + + uint64_t quota_max_bytes = 0; ///< maximum number of bytes for this pool + uint64_t quota_max_objects = 0; ///< maximum number of objects for this pool + + /* + * Pool snaps (global to this pool). These define a SnapContext for + * the pool, unless the client manually specifies an alternate + * context. + */ + std::map<snapid_t, pool_snap_info_t> snaps; + /* + * Alternatively, if we are defining non-pool snaps (e.g. via the + * Ceph MDS), we must track @removed_snaps (since @snaps is not + * used). Snaps and removed_snaps are to be used exclusive of each + * other! + */ + interval_set<snapid_t> removed_snaps; + + unsigned pg_num_mask = 0, pgp_num_mask = 0; + + std::set<uint64_t> tiers; ///< pools that are tiers of us + int64_t tier_of = -1; ///< pool for which we are a tier + // Note that write wins for read+write ops + int64_t read_tier = -1; ///< pool/tier for objecter to direct reads to + int64_t write_tier = -1; ///< pool/tier for objecter to direct writes to + cache_mode_t cache_mode = CACHEMODE_NONE; ///< cache pool mode + + bool is_tier() const { return tier_of >= 0; } + bool has_tiers() const { return !tiers.empty(); } + void clear_tier() { + tier_of = -1; + clear_read_tier(); + clear_write_tier(); + clear_tier_tunables(); + } + bool has_read_tier() const { return read_tier >= 0; } + void clear_read_tier() { read_tier = -1; } + bool has_write_tier() const { return write_tier >= 0; } + void clear_write_tier() { write_tier = -1; } + void clear_tier_tunables() { + if (cache_mode != CACHEMODE_NONE) + flags |= FLAG_INCOMPLETE_CLONES; + cache_mode = CACHEMODE_NONE; + + target_max_bytes = 0; + target_max_objects = 0; + cache_target_dirty_ratio_micro = 0; + cache_target_dirty_high_ratio_micro = 0; + cache_target_full_ratio_micro = 0; + hit_set_params = HitSet::Params(); + hit_set_period = 0; + hit_set_count = 0; + hit_set_grade_decay_rate = 0; + hit_set_search_last_n = 0; + grade_table.resize(0); + } + + bool is_stretch_pool() const { + return peering_crush_bucket_count != 0; + } + + bool stretch_set_can_peer(const set<int>& want, const OSDMap& osdmap, + std::ostream *out) const; + bool stretch_set_can_peer(const vector<int>& want, const OSDMap& osdmap, + std::ostream *out) const { + if (!is_stretch_pool()) return true; + set<int> swant; + for (auto i : want) swant.insert(i); + return stretch_set_can_peer(swant, osdmap, out); + } + + uint64_t target_max_bytes = 0; ///< tiering: target max pool size + uint64_t target_max_objects = 0; ///< tiering: target max pool size + + uint32_t cache_target_dirty_ratio_micro = 0; ///< cache: fraction of target to leave dirty + uint32_t cache_target_dirty_high_ratio_micro = 0; ///< cache: fraction of target to flush with high speed + uint32_t cache_target_full_ratio_micro = 0; ///< cache: fraction of target to fill before we evict in earnest + + uint32_t cache_min_flush_age = 0; ///< minimum age (seconds) before we can flush + uint32_t cache_min_evict_age = 0; ///< minimum age (seconds) before we can evict + + HitSet::Params hit_set_params; ///< The HitSet params to use on this pool + uint32_t hit_set_period = 0; ///< periodicity of HitSet segments (seconds) + uint32_t hit_set_count = 0; ///< number of periods to retain + bool use_gmt_hitset = true; ///< use gmt to name the hitset archive object + uint32_t min_read_recency_for_promote = 0; ///< minimum number of HitSet to check before promote on read + uint32_t min_write_recency_for_promote = 0; ///< minimum number of HitSet to check before promote on write + uint32_t hit_set_grade_decay_rate = 0; ///< current hit_set has highest priority on objects + ///< temperature count,the follow hit_set's priority decay + ///< by this params than pre hit_set + uint32_t hit_set_search_last_n = 0; ///< accumulate atmost N hit_sets for temperature + + uint32_t stripe_width = 0; ///< erasure coded stripe size in bytes + + uint64_t expected_num_objects = 0; ///< expected number of objects on this pool, a value of 0 indicates + ///< user does not specify any expected value + bool fast_read = false; ///< whether turn on fast read on the pool or not + + pool_opts_t opts; ///< options + + typedef enum { + TYPE_FINGERPRINT_NONE = 0, + TYPE_FINGERPRINT_SHA1 = 1, + TYPE_FINGERPRINT_SHA256 = 2, + TYPE_FINGERPRINT_SHA512 = 3, + } fingerprint_t; + static fingerprint_t get_fingerprint_from_str(const std::string& s) { + if (s == "none") + return TYPE_FINGERPRINT_NONE; + if (s == "sha1") + return TYPE_FINGERPRINT_SHA1; + if (s == "sha256") + return TYPE_FINGERPRINT_SHA256; + if (s == "sha512") + return TYPE_FINGERPRINT_SHA512; + return (fingerprint_t)-1; + } + const fingerprint_t get_fingerprint_type() const { + std::string fp_str; + opts.get(pool_opts_t::FINGERPRINT_ALGORITHM, &fp_str); + return get_fingerprint_from_str(fp_str); + } + const char *get_fingerprint_name() const { + std::string fp_str; + fingerprint_t fp_t; + opts.get(pool_opts_t::FINGERPRINT_ALGORITHM, &fp_str); + fp_t = get_fingerprint_from_str(fp_str); + return get_fingerprint_name(fp_t); + } + static const char *get_fingerprint_name(fingerprint_t m) { + switch (m) { + case TYPE_FINGERPRINT_NONE: return "none"; + case TYPE_FINGERPRINT_SHA1: return "sha1"; + case TYPE_FINGERPRINT_SHA256: return "sha256"; + case TYPE_FINGERPRINT_SHA512: return "sha512"; + default: return "unknown"; + } + } + + typedef enum { + TYPE_DEDUP_CHUNK_NONE = 0, + TYPE_DEDUP_CHUNK_FASTCDC = 1, + TYPE_DEDUP_CHUNK_FIXEDCDC = 2, + } dedup_chunk_algo_t; + static dedup_chunk_algo_t get_dedup_chunk_algorithm_from_str(const std::string& s) { + if (s == "none") + return TYPE_DEDUP_CHUNK_NONE; + if (s == "fastcdc") + return TYPE_DEDUP_CHUNK_FASTCDC; + if (s == "fixed") + return TYPE_DEDUP_CHUNK_FIXEDCDC; + return (dedup_chunk_algo_t)-1; + } + const dedup_chunk_algo_t get_dedup_chunk_algorithm_type() const { + std::string algo_str; + opts.get(pool_opts_t::DEDUP_CHUNK_ALGORITHM, &algo_str); + return get_dedup_chunk_algorithm_from_str(algo_str); + } + const char *get_dedup_chunk_algorithm_name() const { + std::string dedup_chunk_algo_str; + dedup_chunk_algo_t dedup_chunk_algo_t; + opts.get(pool_opts_t::DEDUP_CHUNK_ALGORITHM, &dedup_chunk_algo_str); + dedup_chunk_algo_t = get_dedup_chunk_algorithm_from_str(dedup_chunk_algo_str); + return get_dedup_chunk_algorithm_name(dedup_chunk_algo_t); + } + static const char *get_dedup_chunk_algorithm_name(dedup_chunk_algo_t m) { + switch (m) { + case TYPE_DEDUP_CHUNK_NONE: return "none"; + case TYPE_DEDUP_CHUNK_FASTCDC: return "fastcdc"; + case TYPE_DEDUP_CHUNK_FIXEDCDC: return "fixed"; + default: return "unknown"; + } + } + + int64_t get_dedup_tier() const { + int64_t tier_id = 0; + opts.get(pool_opts_t::DEDUP_TIER, &tier_id); + return tier_id; + } + int64_t get_dedup_cdc_chunk_size() const { + int64_t chunk_size = 0; + opts.get(pool_opts_t::DEDUP_CDC_CHUNK_SIZE, &chunk_size); + return chunk_size; + } + + /// application -> key/value metadata + std::map<std::string, std::map<std::string, std::string>> application_metadata; + +private: + std::vector<uint32_t> grade_table; + +public: + uint32_t get_grade(unsigned i) const { + if (grade_table.size() <= i) + return 0; + return grade_table[i]; + } + void calc_grade_table() { + unsigned v = 1000000; + grade_table.resize(hit_set_count); + for (unsigned i = 0; i < hit_set_count; i++) { + v = v * (1 - (hit_set_grade_decay_rate / 100.0)); + grade_table[i] = v; + } + } + + pg_pool_t() = default; + + void dump(ceph::Formatter *f) const; + + const utime_t &get_create_time() const { return create_time; } + uint64_t get_flags() const { return flags; } + bool has_flag(uint64_t f) const { return flags & f; } + void set_flag(uint64_t f) { flags |= f; } + void unset_flag(uint64_t f) { flags &= ~f; } + + bool require_rollback() const { + return is_erasure(); + } + + /// true if incomplete clones may be present + bool allow_incomplete_clones() const { + return cache_mode != CACHEMODE_NONE || has_flag(FLAG_INCOMPLETE_CLONES); + } + + unsigned get_type() const { return type; } + unsigned get_size() const { return size; } + unsigned get_min_size() const { return min_size; } + int get_crush_rule() const { return crush_rule; } + int get_object_hash() const { return object_hash; } + const char *get_object_hash_name() const { + return ceph_str_hash_name(get_object_hash()); + } + epoch_t get_last_change() const { return last_change; } + epoch_t get_last_force_op_resend() const { return last_force_op_resend; } + epoch_t get_last_force_op_resend_prenautilus() const { + return last_force_op_resend_prenautilus; + } + epoch_t get_last_force_op_resend_preluminous() const { + return last_force_op_resend_preluminous; + } + epoch_t get_snap_epoch() const { return snap_epoch; } + snapid_t get_snap_seq() const { return snap_seq; } + uint64_t get_auid() const { return auid; } + + void set_snap_seq(snapid_t s) { snap_seq = s; } + void set_snap_epoch(epoch_t e) { snap_epoch = e; } + + void set_stripe_width(uint32_t s) { stripe_width = s; } + uint32_t get_stripe_width() const { return stripe_width; } + + bool is_replicated() const { return get_type() == TYPE_REPLICATED; } + bool is_erasure() const { return get_type() == TYPE_ERASURE; } + + bool supports_omap() const { + return !(get_type() == TYPE_ERASURE); + } + + bool requires_aligned_append() const { + return is_erasure() && !has_flag(FLAG_EC_OVERWRITES); + } + uint64_t required_alignment() const { return stripe_width; } + + bool allows_ecoverwrites() const { + return has_flag(FLAG_EC_OVERWRITES); + } + + bool can_shift_osds() const { + switch (get_type()) { + case TYPE_REPLICATED: + return true; + case TYPE_ERASURE: + return false; + default: + ceph_abort_msg("unhandled pool type"); + } + } + + unsigned get_pg_num() const { return pg_num; } + unsigned get_pgp_num() const { return pgp_num; } + unsigned get_pg_num_target() const { return pg_num_target; } + unsigned get_pgp_num_target() const { return pgp_num_target; } + unsigned get_pg_num_pending() const { return pg_num_pending; } + + unsigned get_pg_num_mask() const { return pg_num_mask; } + unsigned get_pgp_num_mask() const { return pgp_num_mask; } + + // if pg_num is not a multiple of two, pgs are not equally sized. + // return, for a given pg, the fraction (denominator) of the total + // pool size that it represents. + unsigned get_pg_num_divisor(pg_t pgid) const; + + bool is_pending_merge(pg_t pgid, bool *target) const; + + void set_pg_num(int p) { + pg_num = p; + pg_num_pending = p; + calc_pg_masks(); + } + void set_pgp_num(int p) { + pgp_num = p; + calc_pg_masks(); + } + void set_pg_num_pending(int p) { + pg_num_pending = p; + calc_pg_masks(); + } + void set_pg_num_target(int p) { + pg_num_target = p; + } + void set_pgp_num_target(int p) { + pgp_num_target = p; + } + void dec_pg_num(pg_t source_pgid, + epoch_t ready_epoch, + eversion_t source_version, + eversion_t target_version, + epoch_t last_epoch_started, + epoch_t last_epoch_clean) { + --pg_num; + last_pg_merge_meta.source_pgid = source_pgid; + last_pg_merge_meta.ready_epoch = ready_epoch; + last_pg_merge_meta.source_version = source_version; + last_pg_merge_meta.target_version = target_version; + last_pg_merge_meta.last_epoch_started = last_epoch_started; + last_pg_merge_meta.last_epoch_clean = last_epoch_clean; + calc_pg_masks(); + } + + void set_quota_max_bytes(uint64_t m) { + quota_max_bytes = m; + } + uint64_t get_quota_max_bytes() { + return quota_max_bytes; + } + + void set_quota_max_objects(uint64_t m) { + quota_max_objects = m; + } + uint64_t get_quota_max_objects() { + return quota_max_objects; + } + + void set_last_force_op_resend(uint64_t t) { + last_force_op_resend = t; + last_force_op_resend_prenautilus = t; + last_force_op_resend_preluminous = t; + } + + void calc_pg_masks(); + + /* + * we have two snap modes: + * - pool global snaps + * - snap existence/non-existence defined by snaps[] and snap_seq + * - user managed snaps + * - removal governed by removed_snaps + * + * we know which mode we're using based on whether removed_snaps is empty. + * If nothing has been created, both functions report false. + */ + bool is_pool_snaps_mode() const; + bool is_unmanaged_snaps_mode() const; + bool is_removed_snap(snapid_t s) const; + + snapid_t snap_exists(std::string_view s) const; + void add_snap(const char *n, utime_t stamp); + uint64_t add_unmanaged_snap(bool preoctopus_compat); + void remove_snap(snapid_t s); + void remove_unmanaged_snap(snapid_t s, bool preoctopus_compat); + + SnapContext get_snap_context() const; + + /// hash a object name+namespace key to a hash position + uint32_t hash_key(const std::string& key, const std::string& ns) const; + + /// round a hash position down to a pg num + uint32_t raw_hash_to_pg(uint32_t v) const; + + /* + * map a raw pg (with full precision ps) into an actual pg, for storage + */ + pg_t raw_pg_to_pg(pg_t pg) const; + + /* + * map raw pg (full precision ps) into a placement seed. include + * pool id in that value so that different pools don't use the same + * seeds. + */ + ps_t raw_pg_to_pps(pg_t pg) const; + + /// choose a random hash position within a pg + uint32_t get_random_pg_position(pg_t pgid, uint32_t seed) const; + + void encode(ceph::buffer::list& bl, uint64_t features) const; + void decode(ceph::buffer::list::const_iterator& bl); + + static void generate_test_instances(std::list<pg_pool_t*>& o); +}; +WRITE_CLASS_ENCODER_FEATURES(pg_pool_t) + +std::ostream& operator<<(std::ostream& out, const pg_pool_t& p); + + +/** + * a summation of object stats + * + * This is just a container for object stats; we don't know what for. + * + * If you add members in object_stat_sum_t, you should make sure there are + * not padding among these members. + * You should also modify the padding_check function. + + */ +struct object_stat_sum_t { + /************************************************************************** + * WARNING: be sure to update operator==, floor, and split when + * adding/removing fields! + **************************************************************************/ + int64_t num_bytes; // in bytes + int64_t num_objects; + int64_t num_object_clones; + int64_t num_object_copies; // num_objects * num_replicas + int64_t num_objects_missing_on_primary; + int64_t num_objects_degraded; + int64_t num_objects_unfound; + int64_t num_rd; + int64_t num_rd_kb; + int64_t num_wr; + int64_t num_wr_kb; + int64_t num_scrub_errors; // total deep and shallow scrub errors + int64_t num_objects_recovered; + int64_t num_bytes_recovered; + int64_t num_keys_recovered; + int64_t num_shallow_scrub_errors; + int64_t num_deep_scrub_errors; + int64_t num_objects_dirty; + int64_t num_whiteouts; + int64_t num_objects_omap; + int64_t num_objects_hit_set_archive; + int64_t num_objects_misplaced; + int64_t num_bytes_hit_set_archive; + int64_t num_flush; + int64_t num_flush_kb; + int64_t num_evict; + int64_t num_evict_kb; + int64_t num_promote; + int32_t num_flush_mode_high; // 1 when in high flush mode, otherwise 0 + int32_t num_flush_mode_low; // 1 when in low flush mode, otherwise 0 + int32_t num_evict_mode_some; // 1 when in evict some mode, otherwise 0 + int32_t num_evict_mode_full; // 1 when in evict full mode, otherwise 0 + int64_t num_objects_pinned; + int64_t num_objects_missing; + int64_t num_legacy_snapsets; ///< upper bound on pre-luminous-style SnapSets + int64_t num_large_omap_objects = 0; + int64_t num_objects_manifest = 0; + int64_t num_omap_bytes = 0; + int64_t num_omap_keys = 0; + int64_t num_objects_repaired = 0; + + object_stat_sum_t() + : num_bytes(0), + num_objects(0), num_object_clones(0), num_object_copies(0), + num_objects_missing_on_primary(0), num_objects_degraded(0), + num_objects_unfound(0), + num_rd(0), num_rd_kb(0), num_wr(0), num_wr_kb(0), + num_scrub_errors(0), + num_objects_recovered(0), + num_bytes_recovered(0), + num_keys_recovered(0), + num_shallow_scrub_errors(0), + num_deep_scrub_errors(0), + num_objects_dirty(0), + num_whiteouts(0), + num_objects_omap(0), + num_objects_hit_set_archive(0), + num_objects_misplaced(0), + num_bytes_hit_set_archive(0), + num_flush(0), + num_flush_kb(0), + num_evict(0), + num_evict_kb(0), + num_promote(0), + num_flush_mode_high(0), num_flush_mode_low(0), + num_evict_mode_some(0), num_evict_mode_full(0), + num_objects_pinned(0), + num_objects_missing(0), + num_legacy_snapsets(0) + {} + + void floor(int64_t f) { +#define FLOOR(x) if (x < f) x = f + FLOOR(num_bytes); + FLOOR(num_objects); + FLOOR(num_object_clones); + FLOOR(num_object_copies); + FLOOR(num_objects_missing_on_primary); + FLOOR(num_objects_missing); + FLOOR(num_objects_degraded); + FLOOR(num_objects_misplaced); + FLOOR(num_objects_unfound); + FLOOR(num_rd); + FLOOR(num_rd_kb); + FLOOR(num_wr); + FLOOR(num_wr_kb); + FLOOR(num_large_omap_objects); + FLOOR(num_objects_manifest); + FLOOR(num_omap_bytes); + FLOOR(num_omap_keys); + FLOOR(num_shallow_scrub_errors); + FLOOR(num_deep_scrub_errors); + num_scrub_errors = num_shallow_scrub_errors + num_deep_scrub_errors; + FLOOR(num_objects_recovered); + FLOOR(num_bytes_recovered); + FLOOR(num_keys_recovered); + FLOOR(num_objects_dirty); + FLOOR(num_whiteouts); + FLOOR(num_objects_omap); + FLOOR(num_objects_hit_set_archive); + FLOOR(num_bytes_hit_set_archive); + FLOOR(num_flush); + FLOOR(num_flush_kb); + FLOOR(num_evict); + FLOOR(num_evict_kb); + FLOOR(num_promote); + FLOOR(num_flush_mode_high); + FLOOR(num_flush_mode_low); + FLOOR(num_evict_mode_some); + FLOOR(num_evict_mode_full); + FLOOR(num_objects_pinned); + FLOOR(num_legacy_snapsets); + FLOOR(num_objects_repaired); +#undef FLOOR + } + + void split(std::vector<object_stat_sum_t> &out) const { +#define SPLIT(PARAM) \ + for (unsigned i = 0; i < out.size(); ++i) { \ + out[i].PARAM = PARAM / out.size(); \ + if (i < (PARAM % out.size())) { \ + out[i].PARAM++; \ + } \ + } +#define SPLIT_PRESERVE_NONZERO(PARAM) \ + for (unsigned i = 0; i < out.size(); ++i) { \ + if (PARAM) \ + out[i].PARAM = 1 + PARAM / out.size(); \ + else \ + out[i].PARAM = 0; \ + } + + SPLIT(num_bytes); + SPLIT(num_objects); + SPLIT(num_object_clones); + SPLIT(num_object_copies); + SPLIT(num_objects_missing_on_primary); + SPLIT(num_objects_missing); + SPLIT(num_objects_degraded); + SPLIT(num_objects_misplaced); + SPLIT(num_objects_unfound); + SPLIT(num_rd); + SPLIT(num_rd_kb); + SPLIT(num_wr); + SPLIT(num_wr_kb); + SPLIT(num_large_omap_objects); + SPLIT(num_objects_manifest); + SPLIT(num_omap_bytes); + SPLIT(num_omap_keys); + SPLIT(num_objects_repaired); + SPLIT_PRESERVE_NONZERO(num_shallow_scrub_errors); + SPLIT_PRESERVE_NONZERO(num_deep_scrub_errors); + for (unsigned i = 0; i < out.size(); ++i) { + out[i].num_scrub_errors = out[i].num_shallow_scrub_errors + + out[i].num_deep_scrub_errors; + } + SPLIT(num_objects_recovered); + SPLIT(num_bytes_recovered); + SPLIT(num_keys_recovered); + SPLIT(num_objects_dirty); + SPLIT(num_whiteouts); + SPLIT(num_objects_omap); + SPLIT(num_objects_hit_set_archive); + SPLIT(num_bytes_hit_set_archive); + SPLIT(num_flush); + SPLIT(num_flush_kb); + SPLIT(num_evict); + SPLIT(num_evict_kb); + SPLIT(num_promote); + SPLIT(num_flush_mode_high); + SPLIT(num_flush_mode_low); + SPLIT(num_evict_mode_some); + SPLIT(num_evict_mode_full); + SPLIT(num_objects_pinned); + SPLIT_PRESERVE_NONZERO(num_legacy_snapsets); +#undef SPLIT +#undef SPLIT_PRESERVE_NONZERO + } + + void clear() { + // FIPS zeroization audit 20191117: this memset is not security related. + memset(this, 0, sizeof(*this)); + } + + void calc_copies(int nrep) { + num_object_copies = nrep * num_objects; + } + + bool is_zero() const { + return mem_is_zero((char*)this, sizeof(*this)); + } + + void add(const object_stat_sum_t& o); + void sub(const object_stat_sum_t& o); + + void dump(ceph::Formatter *f) const; + void padding_check() { + static_assert( + sizeof(object_stat_sum_t) == + sizeof(num_bytes) + + sizeof(num_objects) + + sizeof(num_object_clones) + + sizeof(num_object_copies) + + sizeof(num_objects_missing_on_primary) + + sizeof(num_objects_degraded) + + sizeof(num_objects_unfound) + + sizeof(num_rd) + + sizeof(num_rd_kb) + + sizeof(num_wr) + + sizeof(num_wr_kb) + + sizeof(num_scrub_errors) + + sizeof(num_large_omap_objects) + + sizeof(num_objects_manifest) + + sizeof(num_omap_bytes) + + sizeof(num_omap_keys) + + sizeof(num_objects_repaired) + + sizeof(num_objects_recovered) + + sizeof(num_bytes_recovered) + + sizeof(num_keys_recovered) + + sizeof(num_shallow_scrub_errors) + + sizeof(num_deep_scrub_errors) + + sizeof(num_objects_dirty) + + sizeof(num_whiteouts) + + sizeof(num_objects_omap) + + sizeof(num_objects_hit_set_archive) + + sizeof(num_objects_misplaced) + + sizeof(num_bytes_hit_set_archive) + + sizeof(num_flush) + + sizeof(num_flush_kb) + + sizeof(num_evict) + + sizeof(num_evict_kb) + + sizeof(num_promote) + + sizeof(num_flush_mode_high) + + sizeof(num_flush_mode_low) + + sizeof(num_evict_mode_some) + + sizeof(num_evict_mode_full) + + sizeof(num_objects_pinned) + + sizeof(num_objects_missing) + + sizeof(num_legacy_snapsets) + , + "object_stat_sum_t have padding"); + } + void encode(ceph::buffer::list& bl) const; + void decode(ceph::buffer::list::const_iterator& bl); + static void generate_test_instances(std::list<object_stat_sum_t*>& o); +}; +WRITE_CLASS_ENCODER(object_stat_sum_t) + +bool operator==(const object_stat_sum_t& l, const object_stat_sum_t& r); + +/** + * a collection of object stat sums + * + * This is a collection of stat sums over different categories. + */ +struct object_stat_collection_t { + /************************************************************************** + * WARNING: be sure to update the operator== when adding/removing fields! * + **************************************************************************/ + object_stat_sum_t sum; + + void calc_copies(int nrep) { + sum.calc_copies(nrep); + } + + void dump(ceph::Formatter *f) const; + void encode(ceph::buffer::list& bl) const; + void decode(ceph::buffer::list::const_iterator& bl); + static void generate_test_instances(std::list<object_stat_collection_t*>& o); + + bool is_zero() const { + return sum.is_zero(); + } + + void clear() { + sum.clear(); + } + + void floor(int64_t f) { + sum.floor(f); + } + + void add(const object_stat_sum_t& o) { + sum.add(o); + } + + void add(const object_stat_collection_t& o) { + sum.add(o.sum); + } + void sub(const object_stat_collection_t& o) { + sum.sub(o.sum); + } +}; +WRITE_CLASS_ENCODER(object_stat_collection_t) + +inline bool operator==(const object_stat_collection_t& l, + const object_stat_collection_t& r) { + return l.sum == r.sum; +} + + +/** pg_stat + * aggregate stats for a single PG. + */ +struct pg_stat_t { + /************************************************************************** + * WARNING: be sure to update the operator== when adding/removing fields! * + **************************************************************************/ + eversion_t version; + version_t reported_seq; // sequence number + epoch_t reported_epoch; // epoch of this report + uint64_t state; + utime_t last_fresh; // last reported + utime_t last_change; // new state != previous state + utime_t last_active; // state & PG_STATE_ACTIVE + utime_t last_peered; // state & PG_STATE_ACTIVE || state & PG_STATE_PEERED + utime_t last_clean; // state & PG_STATE_CLEAN + utime_t last_unstale; // (state & PG_STATE_STALE) == 0 + utime_t last_undegraded; // (state & PG_STATE_DEGRADED) == 0 + utime_t last_fullsized; // (state & PG_STATE_UNDERSIZED) == 0 + + eversion_t log_start; // (log_start,version] + eversion_t ondisk_log_start; // there may be more on disk + + epoch_t created; + epoch_t last_epoch_clean; + pg_t parent; + __u32 parent_split_bits; + + eversion_t last_scrub; + eversion_t last_deep_scrub; + utime_t last_scrub_stamp; + utime_t last_deep_scrub_stamp; + utime_t last_clean_scrub_stamp; + + object_stat_collection_t stats; + + int64_t log_size; + int64_t ondisk_log_size; // >= active_log_size + + std::vector<int32_t> up, acting; + std::vector<pg_shard_t> avail_no_missing; + std::map< std::set<pg_shard_t>, int32_t > object_location_counts; + epoch_t mapping_epoch; + + std::vector<int32_t> blocked_by; ///< osds on which the pg is blocked + + interval_set<snapid_t> purged_snaps; ///< recently removed snaps that we've purged + + utime_t last_became_active; + utime_t last_became_peered; + + /// up, acting primaries + int32_t up_primary; + int32_t acting_primary; + + // snaptrimq.size() is 64bit, but let's be serious - anything over 50k is + // absurd already, so cap it to 2^32 and save 4 bytes at the same time + uint32_t snaptrimq_len; + + bool stats_invalid:1; + /// true if num_objects_dirty is not accurate (because it was not + /// maintained starting from pool creation) + bool dirty_stats_invalid:1; + bool omap_stats_invalid:1; + bool hitset_stats_invalid:1; + bool hitset_bytes_stats_invalid:1; + bool pin_stats_invalid:1; + bool manifest_stats_invalid:1; + + pg_stat_t() + : reported_seq(0), + reported_epoch(0), + state(0), + created(0), last_epoch_clean(0), + parent_split_bits(0), + log_size(0), ondisk_log_size(0), + mapping_epoch(0), + up_primary(-1), + acting_primary(-1), + snaptrimq_len(0), + stats_invalid(false), + dirty_stats_invalid(false), + omap_stats_invalid(false), + hitset_stats_invalid(false), + hitset_bytes_stats_invalid(false), + pin_stats_invalid(false), + manifest_stats_invalid(false) + { } + + epoch_t get_effective_last_epoch_clean() const { + if (state & PG_STATE_CLEAN) { + // we are clean as of this report, and should thus take the + // reported epoch + return reported_epoch; + } else { + return last_epoch_clean; + } + } + + std::pair<epoch_t, version_t> get_version_pair() const { + return { reported_epoch, reported_seq }; + } + + void floor(int64_t f) { + stats.floor(f); + if (log_size < f) + log_size = f; + if (ondisk_log_size < f) + ondisk_log_size = f; + if (snaptrimq_len < f) + snaptrimq_len = f; + } + + void add_sub_invalid_flags(const pg_stat_t& o) { + // adding (or subtracting!) invalid stats render our stats invalid too + stats_invalid |= o.stats_invalid; + dirty_stats_invalid |= o.dirty_stats_invalid; + omap_stats_invalid |= o.omap_stats_invalid; + hitset_stats_invalid |= o.hitset_stats_invalid; + hitset_bytes_stats_invalid |= o.hitset_bytes_stats_invalid; + pin_stats_invalid |= o.pin_stats_invalid; + manifest_stats_invalid |= o.manifest_stats_invalid; + } + void add(const pg_stat_t& o) { + stats.add(o.stats); + log_size += o.log_size; + ondisk_log_size += o.ondisk_log_size; + snaptrimq_len = std::min((uint64_t)snaptrimq_len + o.snaptrimq_len, + (uint64_t)(1ull << 31)); + add_sub_invalid_flags(o); + } + void sub(const pg_stat_t& o) { + stats.sub(o.stats); + log_size -= o.log_size; + ondisk_log_size -= o.ondisk_log_size; + if (o.snaptrimq_len < snaptrimq_len) { + snaptrimq_len -= o.snaptrimq_len; + } else { + snaptrimq_len = 0; + } + add_sub_invalid_flags(o); + } + + bool is_acting_osd(int32_t osd, bool primary) const; + void dump(ceph::Formatter *f) const; + void dump_brief(ceph::Formatter *f) const; + void encode(ceph::buffer::list &bl) const; + void decode(ceph::buffer::list::const_iterator &bl); + static void generate_test_instances(std::list<pg_stat_t*>& o); +}; +WRITE_CLASS_ENCODER(pg_stat_t) + +bool operator==(const pg_stat_t& l, const pg_stat_t& r); + +/** store_statfs_t + * ObjectStore full statfs information + */ +struct store_statfs_t +{ + uint64_t total = 0; ///< Total bytes + uint64_t available = 0; ///< Free bytes available + uint64_t internally_reserved = 0; ///< Bytes reserved for internal purposes + + int64_t allocated = 0; ///< Bytes allocated by the store + + int64_t data_stored = 0; ///< Bytes actually stored by the user + int64_t data_compressed = 0; ///< Bytes stored after compression + int64_t data_compressed_allocated = 0; ///< Bytes allocated for compressed data + int64_t data_compressed_original = 0; ///< Bytes that were compressed + + int64_t omap_allocated = 0; ///< approx usage of omap data + int64_t internal_metadata = 0; ///< approx usage of internal metadata + + void reset() { + *this = store_statfs_t(); + } + void floor(int64_t f) { +#define FLOOR(x) if (int64_t(x) < f) x = f + FLOOR(total); + FLOOR(available); + FLOOR(internally_reserved); + FLOOR(allocated); + FLOOR(data_stored); + FLOOR(data_compressed); + FLOOR(data_compressed_allocated); + FLOOR(data_compressed_original); + + FLOOR(omap_allocated); + FLOOR(internal_metadata); +#undef FLOOR + } + + bool operator ==(const store_statfs_t& other) const; + bool is_zero() const { + return *this == store_statfs_t(); + } + + uint64_t get_used() const { + return total - available - internally_reserved; + } + + // this accumulates both actually used and statfs's internally_reserved + uint64_t get_used_raw() const { + return total - available; + } + + float get_used_raw_ratio() const { + if (total) { + return (float)get_used_raw() / (float)total; + } else { + return 0.0; + } + } + + // helpers to ease legacy code porting + uint64_t kb_avail() const { + return available >> 10; + } + uint64_t kb() const { + return total >> 10; + } + uint64_t kb_used() const { + return (total - available - internally_reserved) >> 10; + } + uint64_t kb_used_raw() const { + return get_used_raw() >> 10; + } + + uint64_t kb_used_data() const { + return allocated >> 10; + } + uint64_t kb_used_omap() const { + return omap_allocated >> 10; + } + + uint64_t kb_used_internal_metadata() const { + return internal_metadata >> 10; + } + + void add(const store_statfs_t& o) { + total += o.total; + available += o.available; + internally_reserved += o.internally_reserved; + allocated += o.allocated; + data_stored += o.data_stored; + data_compressed += o.data_compressed; + data_compressed_allocated += o.data_compressed_allocated; + data_compressed_original += o.data_compressed_original; + omap_allocated += o.omap_allocated; + internal_metadata += o.internal_metadata; + } + void sub(const store_statfs_t& o) { + total -= o.total; + available -= o.available; + internally_reserved -= o.internally_reserved; + allocated -= o.allocated; + data_stored -= o.data_stored; + data_compressed -= o.data_compressed; + data_compressed_allocated -= o.data_compressed_allocated; + data_compressed_original -= o.data_compressed_original; + omap_allocated -= o.omap_allocated; + internal_metadata -= o.internal_metadata; + } + void dump(ceph::Formatter *f) const; + DENC(store_statfs_t, v, p) { + DENC_START(1, 1, p); + denc(v.total, p); + denc(v.available, p); + denc(v.internally_reserved, p); + denc(v.allocated, p); + denc(v.data_stored, p); + denc(v.data_compressed, p); + denc(v.data_compressed_allocated, p); + denc(v.data_compressed_original, p); + denc(v.omap_allocated, p); + denc(v.internal_metadata, p); + DENC_FINISH(p); + } + static void generate_test_instances(std::list<store_statfs_t*>& o); +}; +WRITE_CLASS_DENC(store_statfs_t) + +std::ostream &operator<<(std::ostream &lhs, const store_statfs_t &rhs); + +/** osd_stat + * aggregate stats for an osd + */ +struct osd_stat_t { + store_statfs_t statfs; + std::vector<int> hb_peers; + int32_t snap_trim_queue_len, num_snap_trimming; + uint64_t num_shards_repaired; + + pow2_hist_t op_queue_age_hist; + + objectstore_perf_stat_t os_perf_stat; + osd_alerts_t os_alerts; + + epoch_t up_from = 0; + uint64_t seq = 0; + + uint32_t num_pgs = 0; + + uint32_t num_osds = 0; + uint32_t num_per_pool_osds = 0; + uint32_t num_per_pool_omap_osds = 0; + + struct Interfaces { + uint32_t last_update; // in seconds + uint32_t back_pingtime[3]; + uint32_t back_min[3]; + uint32_t back_max[3]; + uint32_t back_last; + uint32_t front_pingtime[3]; + uint32_t front_min[3]; + uint32_t front_max[3]; + uint32_t front_last; + }; + std::map<int, Interfaces> hb_pingtime; ///< map of osd id to Interfaces + + osd_stat_t() : snap_trim_queue_len(0), num_snap_trimming(0), + num_shards_repaired(0) {} + + void add(const osd_stat_t& o) { + statfs.add(o.statfs); + snap_trim_queue_len += o.snap_trim_queue_len; + num_snap_trimming += o.num_snap_trimming; + num_shards_repaired += o.num_shards_repaired; + op_queue_age_hist.add(o.op_queue_age_hist); + os_perf_stat.add(o.os_perf_stat); + num_pgs += o.num_pgs; + num_osds += o.num_osds; + num_per_pool_osds += o.num_per_pool_osds; + num_per_pool_omap_osds += o.num_per_pool_omap_osds; + for (const auto& a : o.os_alerts) { + auto& target = os_alerts[a.first]; + for (auto& i : a.second) { + target.emplace(i.first, i.second); + } + } + } + void sub(const osd_stat_t& o) { + statfs.sub(o.statfs); + snap_trim_queue_len -= o.snap_trim_queue_len; + num_snap_trimming -= o.num_snap_trimming; + num_shards_repaired -= o.num_shards_repaired; + op_queue_age_hist.sub(o.op_queue_age_hist); + os_perf_stat.sub(o.os_perf_stat); + num_pgs -= o.num_pgs; + num_osds -= o.num_osds; + num_per_pool_osds -= o.num_per_pool_osds; + num_per_pool_omap_osds -= o.num_per_pool_omap_osds; + for (const auto& a : o.os_alerts) { + auto& target = os_alerts[a.first]; + for (auto& i : a.second) { + target.erase(i.first); + } + if (target.empty()) { + os_alerts.erase(a.first); + } + } + } + void dump(ceph::Formatter *f, bool with_net = true) const; + void dump_ping_time(ceph::Formatter *f) const; + void encode(ceph::buffer::list &bl, uint64_t features) const; + void decode(ceph::buffer::list::const_iterator &bl); + static void generate_test_instances(std::list<osd_stat_t*>& o); +}; +WRITE_CLASS_ENCODER_FEATURES(osd_stat_t) + +inline bool operator==(const osd_stat_t& l, const osd_stat_t& r) { + return l.statfs == r.statfs && + l.snap_trim_queue_len == r.snap_trim_queue_len && + l.num_snap_trimming == r.num_snap_trimming && + l.num_shards_repaired == r.num_shards_repaired && + l.hb_peers == r.hb_peers && + l.op_queue_age_hist == r.op_queue_age_hist && + l.os_perf_stat == r.os_perf_stat && + l.num_pgs == r.num_pgs && + l.num_osds == r.num_osds && + l.num_per_pool_osds == r.num_per_pool_osds && + l.num_per_pool_omap_osds == r.num_per_pool_omap_osds; +} +inline bool operator!=(const osd_stat_t& l, const osd_stat_t& r) { + return !(l == r); +} + +inline std::ostream& operator<<(std::ostream& out, const osd_stat_t& s) { + return out << "osd_stat(" << s.statfs << ", " + << "peers " << s.hb_peers + << " op hist " << s.op_queue_age_hist.h + << ")"; +} + +/* + * summation over an entire pool + */ +struct pool_stat_t { + object_stat_collection_t stats; + store_statfs_t store_stats; + int64_t log_size; + int64_t ondisk_log_size; // >= active_log_size + int32_t up; ///< number of up replicas or shards + int32_t acting; ///< number of acting replicas or shards + int32_t num_store_stats; ///< amount of store_stats accumulated + + pool_stat_t() : log_size(0), ondisk_log_size(0), up(0), acting(0), + num_store_stats(0) + { } + + void floor(int64_t f) { + stats.floor(f); + store_stats.floor(f); + if (log_size < f) + log_size = f; + if (ondisk_log_size < f) + ondisk_log_size = f; + if (up < f) + up = f; + if (acting < f) + acting = f; + if (num_store_stats < f) + num_store_stats = f; + } + + void add(const store_statfs_t& o) { + store_stats.add(o); + ++num_store_stats; + } + void sub(const store_statfs_t& o) { + store_stats.sub(o); + --num_store_stats; + } + + void add(const pg_stat_t& o) { + stats.add(o.stats); + log_size += o.log_size; + ondisk_log_size += o.ondisk_log_size; + up += o.up.size(); + acting += o.acting.size(); + } + void sub(const pg_stat_t& o) { + stats.sub(o.stats); + log_size -= o.log_size; + ondisk_log_size -= o.ondisk_log_size; + up -= o.up.size(); + acting -= o.acting.size(); + } + + bool is_zero() const { + return (stats.is_zero() && + store_stats.is_zero() && + log_size == 0 && + ondisk_log_size == 0 && + up == 0 && + acting == 0 && + num_store_stats == 0); + } + + // helper accessors to retrieve used/netto bytes depending on the + // collection method: new per-pool objectstore report or legacy PG + // summation at OSD. + // In legacy mode used and netto values are the same. But for new per-pool + // collection 'used' provides amount of space ALLOCATED at all related OSDs + // and 'netto' is amount of stored user data. + uint64_t get_allocated_data_bytes(bool per_pool) const { + if (per_pool) { + return store_stats.allocated; + } else { + // legacy mode, use numbers from 'stats' + return stats.sum.num_bytes + stats.sum.num_bytes_hit_set_archive; + } + } + uint64_t get_allocated_omap_bytes(bool per_pool_omap) const { + if (per_pool_omap) { + return store_stats.omap_allocated; + } else { + // omap is not broken out by pool by nautilus bluestore; report the + // scrub value. this will be imprecise in that it won't account for + // any storage overhead/efficiency. + return stats.sum.num_omap_bytes; + } + } + uint64_t get_user_data_bytes(float raw_used_rate, ///< space amp factor + bool per_pool) const { + // NOTE: we need the space amp factor so that we can work backwards from + // the raw utilization to the amount of data that the user actually stored. + if (per_pool) { + return raw_used_rate ? store_stats.data_stored / raw_used_rate : 0; + } else { + // legacy mode, use numbers from 'stats'. note that we do NOT use the + // raw_used_rate factor here because we are working from the PG stats + // directly. + return stats.sum.num_bytes + stats.sum.num_bytes_hit_set_archive; + } + } + uint64_t get_user_omap_bytes(float raw_used_rate, ///< space amp factor + bool per_pool_omap) const { + if (per_pool_omap) { + return raw_used_rate ? store_stats.omap_allocated / raw_used_rate : 0; + } else { + // omap usage is lazily reported during scrub; this value may lag. + return stats.sum.num_omap_bytes; + } + } + + void dump(ceph::Formatter *f) const; + void encode(ceph::buffer::list &bl, uint64_t features) const; + void decode(ceph::buffer::list::const_iterator &bl); + static void generate_test_instances(std::list<pool_stat_t*>& o); +}; +WRITE_CLASS_ENCODER_FEATURES(pool_stat_t) + + +// ----------------------------------------- + +/** + * pg_hit_set_info_t - information about a single recorded HitSet + * + * Track basic metadata about a HitSet, like the number of insertions + * and the time range it covers. + */ +struct pg_hit_set_info_t { + utime_t begin, end; ///< time interval + eversion_t version; ///< version this HitSet object was written + bool using_gmt; ///< use gmt for creating the hit_set archive object name + + friend bool operator==(const pg_hit_set_info_t& l, + const pg_hit_set_info_t& r) { + return + l.begin == r.begin && + l.end == r.end && + l.version == r.version && + l.using_gmt == r.using_gmt; + } + + explicit pg_hit_set_info_t(bool using_gmt = true) + : using_gmt(using_gmt) {} + + void encode(ceph::buffer::list &bl) const; + void decode(ceph::buffer::list::const_iterator &bl); + void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list<pg_hit_set_info_t*>& o); +}; +WRITE_CLASS_ENCODER(pg_hit_set_info_t) + +/** + * pg_hit_set_history_t - information about a history of hitsets + * + * Include information about the currently accumulating hit set as well + * as archived/historical ones. + */ +struct pg_hit_set_history_t { + eversion_t current_last_update; ///< last version inserted into current set + std::list<pg_hit_set_info_t> history; ///< archived sets, sorted oldest -> newest + + friend bool operator==(const pg_hit_set_history_t& l, + const pg_hit_set_history_t& r) { + return + l.current_last_update == r.current_last_update && + l.history == r.history; + } + + void encode(ceph::buffer::list &bl) const; + void decode(ceph::buffer::list::const_iterator &bl); + void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list<pg_hit_set_history_t*>& o); +}; +WRITE_CLASS_ENCODER(pg_hit_set_history_t) + + +// ----------------------------------------- + +/** + * pg_history_t - information about recent pg peering/mapping history + * + * This is aggressively shared between OSDs to bound the amount of past + * history they need to worry about. + */ +struct pg_history_t { + epoch_t epoch_created = 0; // epoch in which *pg* was created (pool or pg) + epoch_t epoch_pool_created = 0; // epoch in which *pool* was created + // (note: may be pg creation epoch for + // pre-luminous clusters) + epoch_t last_epoch_started = 0;; // lower bound on last epoch started (anywhere, not necessarily locally) + // https://docs.ceph.com/docs/master/dev/osd_internals/last_epoch_started/ + epoch_t last_interval_started = 0;; // first epoch of last_epoch_started interval + epoch_t last_epoch_clean = 0;; // lower bound on last epoch the PG was completely clean. + epoch_t last_interval_clean = 0;; // first epoch of last_epoch_clean interval + epoch_t last_epoch_split = 0;; // as parent or child + epoch_t last_epoch_marked_full = 0;; // pool or cluster + + /** + * In the event of a map discontinuity, same_*_since may reflect the first + * map the osd has seen in the new map sequence rather than the actual start + * of the interval. This is ok since a discontinuity at epoch e means there + * must have been a clean interval between e and now and that we cannot be + * in the active set during the interval containing e. + */ + epoch_t same_up_since = 0;; // same acting set since + epoch_t same_interval_since = 0;; // same acting AND up set since + epoch_t same_primary_since = 0;; // same primary at least back through this epoch. + + eversion_t last_scrub; + eversion_t last_deep_scrub; + utime_t last_scrub_stamp; + utime_t last_deep_scrub_stamp; + utime_t last_clean_scrub_stamp; + + /// upper bound on how long prior interval readable (relative to encode time) + ceph::timespan prior_readable_until_ub = ceph::timespan::zero(); + + friend bool operator==(const pg_history_t& l, const pg_history_t& r) { + return + l.epoch_created == r.epoch_created && + l.epoch_pool_created == r.epoch_pool_created && + l.last_epoch_started == r.last_epoch_started && + l.last_interval_started == r.last_interval_started && + l.last_epoch_clean == r.last_epoch_clean && + l.last_interval_clean == r.last_interval_clean && + l.last_epoch_split == r.last_epoch_split && + l.last_epoch_marked_full == r.last_epoch_marked_full && + l.same_up_since == r.same_up_since && + l.same_interval_since == r.same_interval_since && + l.same_primary_since == r.same_primary_since && + l.last_scrub == r.last_scrub && + l.last_deep_scrub == r.last_deep_scrub && + l.last_scrub_stamp == r.last_scrub_stamp && + l.last_deep_scrub_stamp == r.last_deep_scrub_stamp && + l.last_clean_scrub_stamp == r.last_clean_scrub_stamp && + l.prior_readable_until_ub == r.prior_readable_until_ub; + } + + pg_history_t() {} + pg_history_t(epoch_t created, utime_t stamp) + : epoch_created(created), + epoch_pool_created(created), + same_up_since(created), + same_interval_since(created), + same_primary_since(created), + last_scrub_stamp(stamp), + last_deep_scrub_stamp(stamp), + last_clean_scrub_stamp(stamp) {} + + bool merge(const pg_history_t &other) { + // Here, we only update the fields which cannot be calculated from the OSDmap. + bool modified = false; + if (epoch_created < other.epoch_created) { + epoch_created = other.epoch_created; + modified = true; + } + if (epoch_pool_created < other.epoch_pool_created) { + // FIXME: for jewel compat only; this should either be 0 or always the + // same value across all pg instances. + epoch_pool_created = other.epoch_pool_created; + modified = true; + } + if (last_epoch_started < other.last_epoch_started) { + last_epoch_started = other.last_epoch_started; + modified = true; + } + if (last_interval_started < other.last_interval_started) { + last_interval_started = other.last_interval_started; + // if we are learning about a newer *started* interval, our + // readable_until_ub is obsolete + prior_readable_until_ub = other.prior_readable_until_ub; + modified = true; + } else if (other.last_interval_started == last_interval_started && + other.prior_readable_until_ub < prior_readable_until_ub) { + // if other is the *same* interval, than pull our upper bound in + // if they have a tighter bound. + prior_readable_until_ub = other.prior_readable_until_ub; + modified = true; + } + if (last_epoch_clean < other.last_epoch_clean) { + last_epoch_clean = other.last_epoch_clean; + modified = true; + } + if (last_interval_clean < other.last_interval_clean) { + last_interval_clean = other.last_interval_clean; + modified = true; + } + if (last_epoch_split < other.last_epoch_split) { + last_epoch_split = other.last_epoch_split; + modified = true; + } + if (last_epoch_marked_full < other.last_epoch_marked_full) { + last_epoch_marked_full = other.last_epoch_marked_full; + modified = true; + } + if (other.last_scrub > last_scrub) { + last_scrub = other.last_scrub; + modified = true; + } + if (other.last_scrub_stamp > last_scrub_stamp) { + last_scrub_stamp = other.last_scrub_stamp; + modified = true; + } + if (other.last_deep_scrub > last_deep_scrub) { + last_deep_scrub = other.last_deep_scrub; + modified = true; + } + if (other.last_deep_scrub_stamp > last_deep_scrub_stamp) { + last_deep_scrub_stamp = other.last_deep_scrub_stamp; + modified = true; + } + if (other.last_clean_scrub_stamp > last_clean_scrub_stamp) { + last_clean_scrub_stamp = other.last_clean_scrub_stamp; + modified = true; + } + return modified; + } + + void encode(ceph::buffer::list& bl) const; + void decode(ceph::buffer::list::const_iterator& p); + void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list<pg_history_t*>& o); + + ceph::signedspan refresh_prior_readable_until_ub( + ceph::signedspan now, ///< now, relative to osd startup_time + ceph::signedspan ub) { ///< ub, relative to osd startup_time + if (now >= ub) { + // prior interval(s) are unreadable; we can zero the upper bound + prior_readable_until_ub = ceph::signedspan::zero(); + return ceph::signedspan::zero(); + } else { + prior_readable_until_ub = ub - now; + return ub; + } + } + ceph::signedspan get_prior_readable_until_ub(ceph::signedspan now) { + if (prior_readable_until_ub == ceph::signedspan::zero()) { + return ceph::signedspan::zero(); + } + return now + prior_readable_until_ub; + } +}; +WRITE_CLASS_ENCODER(pg_history_t) + +inline std::ostream& operator<<(std::ostream& out, const pg_history_t& h) { + out << "ec=" << h.epoch_created << "/" << h.epoch_pool_created + << " lis/c=" << h.last_interval_started + << "/" << h.last_interval_clean + << " les/c/f=" << h.last_epoch_started << "/" << h.last_epoch_clean + << "/" << h.last_epoch_marked_full + << " sis=" << h.same_interval_since; + if (h.prior_readable_until_ub != ceph::timespan::zero()) { + out << " pruub=" << h.prior_readable_until_ub; + } + return out; +} + + +/** + * pg_info_t - summary of PG statistics. + * + * some notes: + * - last_complete implies we have all objects that existed as of that + * stamp, OR a newer object, OR have already applied a later delete. + * - if last_complete >= log.tail, then we know pg contents thru log.head. + * otherwise, we have no idea what the pg is supposed to contain. + */ +struct pg_info_t { + spg_t pgid; + eversion_t last_update; ///< last object version applied to store. + eversion_t last_complete; ///< last version pg was complete through. + epoch_t last_epoch_started; ///< last epoch at which this pg started on this osd + epoch_t last_interval_started; ///< first epoch of last_epoch_started interval + + version_t last_user_version; ///< last user object version applied to store + + eversion_t log_tail; ///< oldest log entry. + + hobject_t last_backfill; ///< objects >= this and < last_complete may be missing + + interval_set<snapid_t> purged_snaps; + + pg_stat_t stats; + + pg_history_t history; + pg_hit_set_history_t hit_set; + + friend bool operator==(const pg_info_t& l, const pg_info_t& r) { + return + l.pgid == r.pgid && + l.last_update == r.last_update && + l.last_complete == r.last_complete && + l.last_epoch_started == r.last_epoch_started && + l.last_interval_started == r.last_interval_started && + l.last_user_version == r.last_user_version && + l.log_tail == r.log_tail && + l.last_backfill == r.last_backfill && + l.purged_snaps == r.purged_snaps && + l.stats == r.stats && + l.history == r.history && + l.hit_set == r.hit_set; + } + + pg_info_t() + : last_epoch_started(0), + last_interval_started(0), + last_user_version(0), + last_backfill(hobject_t::get_max()) + { } + // cppcheck-suppress noExplicitConstructor + pg_info_t(spg_t p) + : pgid(p), + last_epoch_started(0), + last_interval_started(0), + last_user_version(0), + last_backfill(hobject_t::get_max()) + { } + + void set_last_backfill(hobject_t pos) { + last_backfill = pos; + } + + bool is_empty() const { return last_update.version == 0; } + bool dne() const { return history.epoch_created == 0; } + + bool has_missing() const { return last_complete != last_update; } + bool is_incomplete() const { return !last_backfill.is_max(); } + + void encode(ceph::buffer::list& bl) const; + void decode(ceph::buffer::list::const_iterator& p); + void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list<pg_info_t*>& o); +}; +WRITE_CLASS_ENCODER(pg_info_t) + +inline std::ostream& operator<<(std::ostream& out, const pg_info_t& pgi) +{ + out << pgi.pgid << "("; + if (pgi.dne()) + out << " DNE"; + if (pgi.is_empty()) + out << " empty"; + else { + out << " v " << pgi.last_update; + if (pgi.last_complete != pgi.last_update) + out << " lc " << pgi.last_complete; + out << " (" << pgi.log_tail << "," << pgi.last_update << "]"; + } + if (pgi.is_incomplete()) + out << " lb " << pgi.last_backfill; + //out << " c " << pgi.epoch_created; + out << " local-lis/les=" << pgi.last_interval_started + << "/" << pgi.last_epoch_started; + out << " n=" << pgi.stats.stats.sum.num_objects; + out << " " << pgi.history + << ")"; + return out; +} + +/** + * pg_fast_info_t - common pg_info_t fields + * + * These are the fields of pg_info_t (and children) that are updated for + * most IO operations. + * + * ** WARNING ** + * Because we rely on these fields to be applied to the normal + * info struct, adding a new field here that is not also new in info + * means that we must set an incompat OSD feature bit! + */ +struct pg_fast_info_t { + eversion_t last_update; + eversion_t last_complete; + version_t last_user_version; + struct { // pg_stat_t stats + eversion_t version; + version_t reported_seq; + utime_t last_fresh; + utime_t last_active; + utime_t last_peered; + utime_t last_clean; + utime_t last_unstale; + utime_t last_undegraded; + utime_t last_fullsized; + int64_t log_size; // (also ondisk_log_size, which has the same value) + struct { // object_stat_collection_t stats; + struct { // objct_stat_sum_t sum + int64_t num_bytes; // in bytes + int64_t num_objects; + int64_t num_object_copies; + int64_t num_rd; + int64_t num_rd_kb; + int64_t num_wr; + int64_t num_wr_kb; + int64_t num_objects_dirty; + } sum; + } stats; + } stats; + + void populate_from(const pg_info_t& info) { + last_update = info.last_update; + last_complete = info.last_complete; + last_user_version = info.last_user_version; + stats.version = info.stats.version; + stats.reported_seq = info.stats.reported_seq; + stats.last_fresh = info.stats.last_fresh; + stats.last_active = info.stats.last_active; + stats.last_peered = info.stats.last_peered; + stats.last_clean = info.stats.last_clean; + stats.last_unstale = info.stats.last_unstale; + stats.last_undegraded = info.stats.last_undegraded; + stats.last_fullsized = info.stats.last_fullsized; + stats.log_size = info.stats.log_size; + stats.stats.sum.num_bytes = info.stats.stats.sum.num_bytes; + stats.stats.sum.num_objects = info.stats.stats.sum.num_objects; + stats.stats.sum.num_object_copies = info.stats.stats.sum.num_object_copies; + stats.stats.sum.num_rd = info.stats.stats.sum.num_rd; + stats.stats.sum.num_rd_kb = info.stats.stats.sum.num_rd_kb; + stats.stats.sum.num_wr = info.stats.stats.sum.num_wr; + stats.stats.sum.num_wr_kb = info.stats.stats.sum.num_wr_kb; + stats.stats.sum.num_objects_dirty = info.stats.stats.sum.num_objects_dirty; + } + + bool try_apply_to(pg_info_t* info) { + if (last_update <= info->last_update) + return false; + info->last_update = last_update; + info->last_complete = last_complete; + info->last_user_version = last_user_version; + info->stats.version = stats.version; + info->stats.reported_seq = stats.reported_seq; + info->stats.last_fresh = stats.last_fresh; + info->stats.last_active = stats.last_active; + info->stats.last_peered = stats.last_peered; + info->stats.last_clean = stats.last_clean; + info->stats.last_unstale = stats.last_unstale; + info->stats.last_undegraded = stats.last_undegraded; + info->stats.last_fullsized = stats.last_fullsized; + info->stats.log_size = stats.log_size; + info->stats.ondisk_log_size = stats.log_size; + info->stats.stats.sum.num_bytes = stats.stats.sum.num_bytes; + info->stats.stats.sum.num_objects = stats.stats.sum.num_objects; + info->stats.stats.sum.num_object_copies = stats.stats.sum.num_object_copies; + info->stats.stats.sum.num_rd = stats.stats.sum.num_rd; + info->stats.stats.sum.num_rd_kb = stats.stats.sum.num_rd_kb; + info->stats.stats.sum.num_wr = stats.stats.sum.num_wr; + info->stats.stats.sum.num_wr_kb = stats.stats.sum.num_wr_kb; + info->stats.stats.sum.num_objects_dirty = stats.stats.sum.num_objects_dirty; + return true; + } + + void encode(ceph::buffer::list& bl) const { + ENCODE_START(1, 1, bl); + encode(last_update, bl); + encode(last_complete, bl); + encode(last_user_version, bl); + encode(stats.version, bl); + encode(stats.reported_seq, bl); + encode(stats.last_fresh, bl); + encode(stats.last_active, bl); + encode(stats.last_peered, bl); + encode(stats.last_clean, bl); + encode(stats.last_unstale, bl); + encode(stats.last_undegraded, bl); + encode(stats.last_fullsized, bl); + encode(stats.log_size, bl); + encode(stats.stats.sum.num_bytes, bl); + encode(stats.stats.sum.num_objects, bl); + encode(stats.stats.sum.num_object_copies, bl); + encode(stats.stats.sum.num_rd, bl); + encode(stats.stats.sum.num_rd_kb, bl); + encode(stats.stats.sum.num_wr, bl); + encode(stats.stats.sum.num_wr_kb, bl); + encode(stats.stats.sum.num_objects_dirty, bl); + ENCODE_FINISH(bl); + } + void decode(ceph::buffer::list::const_iterator& p) { + DECODE_START(1, p); + decode(last_update, p); + decode(last_complete, p); + decode(last_user_version, p); + decode(stats.version, p); + decode(stats.reported_seq, p); + decode(stats.last_fresh, p); + decode(stats.last_active, p); + decode(stats.last_peered, p); + decode(stats.last_clean, p); + decode(stats.last_unstale, p); + decode(stats.last_undegraded, p); + decode(stats.last_fullsized, p); + decode(stats.log_size, p); + decode(stats.stats.sum.num_bytes, p); + decode(stats.stats.sum.num_objects, p); + decode(stats.stats.sum.num_object_copies, p); + decode(stats.stats.sum.num_rd, p); + decode(stats.stats.sum.num_rd_kb, p); + decode(stats.stats.sum.num_wr, p); + decode(stats.stats.sum.num_wr_kb, p); + decode(stats.stats.sum.num_objects_dirty, p); + DECODE_FINISH(p); + } +}; +WRITE_CLASS_ENCODER(pg_fast_info_t) + + +/** + * PastIntervals -- information needed to determine the PriorSet and + * the might_have_unfound set + */ +class PastIntervals { +#ifdef WITH_SEASTAR + using OSDMapRef = boost::local_shared_ptr<const OSDMap>; +#else + using OSDMapRef = std::shared_ptr<const OSDMap>; +#endif +public: + struct pg_interval_t { + std::vector<int32_t> up, acting; + epoch_t first, last; + bool maybe_went_rw; + int32_t primary; + int32_t up_primary; + + pg_interval_t() + : first(0), last(0), + maybe_went_rw(false), + primary(-1), + up_primary(-1) + {} + + pg_interval_t( + std::vector<int32_t> &&up, + std::vector<int32_t> &&acting, + epoch_t first, + epoch_t last, + bool maybe_went_rw, + int32_t primary, + int32_t up_primary) + : up(up), acting(acting), first(first), last(last), + maybe_went_rw(maybe_went_rw), primary(primary), up_primary(up_primary) + {} + + void encode(ceph::buffer::list& bl) const; + void decode(ceph::buffer::list::const_iterator& bl); + void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list<pg_interval_t*>& o); + }; + + PastIntervals(); + PastIntervals(PastIntervals &&rhs) = default; + PastIntervals &operator=(PastIntervals &&rhs) = default; + + PastIntervals(const PastIntervals &rhs); + PastIntervals &operator=(const PastIntervals &rhs); + + class interval_rep { + public: + virtual size_t size() const = 0; + virtual bool empty() const = 0; + virtual void clear() = 0; + virtual std::pair<epoch_t, epoch_t> get_bounds() const = 0; + virtual std::set<pg_shard_t> get_all_participants( + bool ec_pool) const = 0; + virtual void add_interval(bool ec_pool, const pg_interval_t &interval) = 0; + virtual std::unique_ptr<interval_rep> clone() const = 0; + virtual std::ostream &print(std::ostream &out) const = 0; + virtual void encode(ceph::buffer::list &bl) const = 0; + virtual void decode(ceph::buffer::list::const_iterator &bl) = 0; + virtual void dump(ceph::Formatter *f) const = 0; + virtual void iterate_mayberw_back_to( + epoch_t les, + std::function<void(epoch_t, const std::set<pg_shard_t> &)> &&f) const = 0; + + virtual bool has_full_intervals() const { return false; } + virtual void iterate_all_intervals( + std::function<void(const pg_interval_t &)> &&f) const { + ceph_assert(!has_full_intervals()); + ceph_abort_msg("not valid for this implementation"); + } + virtual void adjust_start_backwards(epoch_t last_epoch_clean) = 0; + + virtual ~interval_rep() {} + }; + friend class pi_compact_rep; +private: + + std::unique_ptr<interval_rep> past_intervals; + + explicit PastIntervals(interval_rep *rep) : past_intervals(rep) {} + +public: + void add_interval(bool ec_pool, const pg_interval_t &interval) { + ceph_assert(past_intervals); + return past_intervals->add_interval(ec_pool, interval); + } + + void encode(ceph::buffer::list &bl) const { + ENCODE_START(1, 1, bl); + if (past_intervals) { + __u8 type = 2; + encode(type, bl); + past_intervals->encode(bl); + } else { + encode((__u8)0, bl); + } + ENCODE_FINISH(bl); + } + + void decode(ceph::buffer::list::const_iterator &bl); + + void dump(ceph::Formatter *f) const { + ceph_assert(past_intervals); + past_intervals->dump(f); + } + static void generate_test_instances(std::list<PastIntervals *> & o); + + /** + * Determines whether there is an interval change + */ + static bool is_new_interval( + int old_acting_primary, + int new_acting_primary, + const std::vector<int> &old_acting, + const std::vector<int> &new_acting, + int old_up_primary, + int new_up_primary, + const std::vector<int> &old_up, + const std::vector<int> &new_up, + int old_size, + int new_size, + int old_min_size, + int new_min_size, + unsigned old_pg_num, + unsigned new_pg_num, + unsigned old_pg_num_pending, + unsigned new_pg_num_pending, + bool old_sort_bitwise, + bool new_sort_bitwise, + bool old_recovery_deletes, + bool new_recovery_deletes, + uint32_t old_crush_count, + uint32_t new_crush_count, + uint32_t old_crush_target, + uint32_t new_crush_target, + uint32_t old_crush_barrier, + uint32_t new_crush_barrier, + int32_t old_crush_member, + int32_t new_crush_member, + pg_t pgid + ); + + /** + * Determines whether there is an interval change + */ + static bool is_new_interval( + int old_acting_primary, ///< [in] primary as of lastmap + int new_acting_primary, ///< [in] primary as of lastmap + const std::vector<int> &old_acting, ///< [in] acting as of lastmap + const std::vector<int> &new_acting, ///< [in] acting as of osdmap + int old_up_primary, ///< [in] up primary of lastmap + int new_up_primary, ///< [in] up primary of osdmap + const std::vector<int> &old_up, ///< [in] up as of lastmap + const std::vector<int> &new_up, ///< [in] up as of osdmap + const OSDMap *osdmap, ///< [in] current map + const OSDMap *lastmap, ///< [in] last map + pg_t pgid ///< [in] pgid for pg + ); + + /** + * Integrates a new map into *past_intervals, returns true + * if an interval was closed out. + */ + static bool check_new_interval( + int old_acting_primary, ///< [in] primary as of lastmap + int new_acting_primary, ///< [in] primary as of osdmap + const std::vector<int> &old_acting, ///< [in] acting as of lastmap + const std::vector<int> &new_acting, ///< [in] acting as of osdmap + int old_up_primary, ///< [in] up primary of lastmap + int new_up_primary, ///< [in] up primary of osdmap + const std::vector<int> &old_up, ///< [in] up as of lastmap + const std::vector<int> &new_up, ///< [in] up as of osdmap + epoch_t same_interval_since, ///< [in] as of osdmap + epoch_t last_epoch_clean, ///< [in] current + const OSDMap *osdmap, ///< [in] current map + const OSDMap *lastmap, ///< [in] last map + pg_t pgid, ///< [in] pgid for pg + const IsPGRecoverablePredicate &could_have_gone_active, ///< [in] predicate whether the pg can be active + PastIntervals *past_intervals, ///< [out] intervals + std::ostream *out = 0 ///< [out] debug ostream + ); + static bool check_new_interval( + int old_acting_primary, ///< [in] primary as of lastmap + int new_acting_primary, ///< [in] primary as of osdmap + const std::vector<int> &old_acting, ///< [in] acting as of lastmap + const std::vector<int> &new_acting, ///< [in] acting as of osdmap + int old_up_primary, ///< [in] up primary of lastmap + int new_up_primary, ///< [in] up primary of osdmap + const std::vector<int> &old_up, ///< [in] up as of lastmap + const std::vector<int> &new_up, ///< [in] up as of osdmap + epoch_t same_interval_since, ///< [in] as of osdmap + epoch_t last_epoch_clean, ///< [in] current + OSDMapRef osdmap, ///< [in] current map + OSDMapRef lastmap, ///< [in] last map + pg_t pgid, ///< [in] pgid for pg + const IsPGRecoverablePredicate &could_have_gone_active, ///< [in] predicate whether the pg can be active + PastIntervals *past_intervals, ///< [out] intervals + std::ostream *out = 0 ///< [out] debug ostream + ) { + return check_new_interval( + old_acting_primary, new_acting_primary, + old_acting, new_acting, + old_up_primary, new_up_primary, + old_up, new_up, + same_interval_since, last_epoch_clean, + osdmap.get(), lastmap.get(), + pgid, + could_have_gone_active, + past_intervals, + out); + } + + friend std::ostream& operator<<(std::ostream& out, const PastIntervals &i); + + template <typename F> + void iterate_mayberw_back_to( + epoch_t les, + F &&f) const { + ceph_assert(past_intervals); + past_intervals->iterate_mayberw_back_to(les, std::forward<F>(f)); + } + void clear() { + ceph_assert(past_intervals); + past_intervals->clear(); + } + + /** + * Should return a value which gives an indication of the amount + * of state contained + */ + size_t size() const { + ceph_assert(past_intervals); + return past_intervals->size(); + } + + bool empty() const { + ceph_assert(past_intervals); + return past_intervals->empty(); + } + + void swap(PastIntervals &other) { + using std::swap; + swap(other.past_intervals, past_intervals); + } + + /** + * Return all shards which have been in the acting set back to the + * latest epoch to which we have trimmed except for pg_whoami + */ + std::set<pg_shard_t> get_might_have_unfound( + pg_shard_t pg_whoami, + bool ec_pool) const { + ceph_assert(past_intervals); + auto ret = past_intervals->get_all_participants(ec_pool); + ret.erase(pg_whoami); + return ret; + } + + /** + * Return all shards which we might want to talk to for peering + */ + std::set<pg_shard_t> get_all_probe( + bool ec_pool) const { + ceph_assert(past_intervals); + return past_intervals->get_all_participants(ec_pool); + } + + /* Return the set of epochs [start, end) represented by the + * past_interval set. + */ + std::pair<epoch_t, epoch_t> get_bounds() const { + ceph_assert(past_intervals); + return past_intervals->get_bounds(); + } + + void adjust_start_backwards(epoch_t last_epoch_clean) { + ceph_assert(past_intervals); + past_intervals->adjust_start_backwards(last_epoch_clean); + } + + enum osd_state_t { + UP, + DOWN, + DNE, + LOST + }; + struct PriorSet { + bool ec_pool = false; + std::set<pg_shard_t> probe; ///< current+prior OSDs we need to probe. + std::set<int> down; ///< down osds that would normally be in @a probe and might be interesting. + std::map<int, epoch_t> blocked_by; ///< current lost_at values for any OSDs in cur set for which (re)marking them lost would affect cur set + + bool pg_down = false; ///< some down osds are included in @a cur; the DOWN pg state bit should be set. + const IsPGRecoverablePredicate* pcontdec = nullptr; + + PriorSet() = default; + PriorSet(PriorSet &&) = default; + PriorSet &operator=(PriorSet &&) = default; + + PriorSet &operator=(const PriorSet &) = delete; + PriorSet(const PriorSet &) = delete; + + bool operator==(const PriorSet &rhs) const { + return (ec_pool == rhs.ec_pool) && + (probe == rhs.probe) && + (down == rhs.down) && + (blocked_by == rhs.blocked_by) && + (pg_down == rhs.pg_down); + } + + bool affected_by_map( + const OSDMap &osdmap, + const DoutPrefixProvider *dpp) const; + + // For verifying tests + PriorSet( + bool ec_pool, + std::set<pg_shard_t> probe, + std::set<int> down, + std::map<int, epoch_t> blocked_by, + bool pg_down, + const IsPGRecoverablePredicate *pcontdec) + : ec_pool(ec_pool), probe(probe), down(down), blocked_by(blocked_by), + pg_down(pg_down), pcontdec(pcontdec) {} + + private: + template <typename F> + PriorSet( + const PastIntervals &past_intervals, + bool ec_pool, + epoch_t last_epoch_started, + const IsPGRecoverablePredicate *c, + F f, + const std::vector<int> &up, + const std::vector<int> &acting, + const DoutPrefixProvider *dpp); + + friend class PastIntervals; + }; + + template <typename... Args> + PriorSet get_prior_set(Args&&... args) const { + return PriorSet(*this, std::forward<Args>(args)...); + } +}; +WRITE_CLASS_ENCODER(PastIntervals) + +std::ostream& operator<<(std::ostream& out, const PastIntervals::pg_interval_t& i); +std::ostream& operator<<(std::ostream& out, const PastIntervals &i); +std::ostream& operator<<(std::ostream& out, const PastIntervals::PriorSet &i); + +template <typename F> +PastIntervals::PriorSet::PriorSet( + const PastIntervals &past_intervals, + bool ec_pool, + epoch_t last_epoch_started, + const IsPGRecoverablePredicate *c, + F f, + const std::vector<int> &up, + const std::vector<int> &acting, + const DoutPrefixProvider *dpp) + : ec_pool(ec_pool), pg_down(false), pcontdec(c) +{ + /* + * We have to be careful to gracefully deal with situations like + * so. Say we have a power outage or something that takes out both + * OSDs, but the monitor doesn't mark them down in the same epoch. + * The history may look like + * + * 1: A B + * 2: B + * 3: let's say B dies for good, too (say, from the power spike) + * 4: A + * + * which makes it look like B may have applied updates to the PG + * that we need in order to proceed. This sucks... + * + * To minimize the risk of this happening, we CANNOT go active if + * _any_ OSDs in the prior set are down until we send an MOSDAlive + * to the monitor such that the OSDMap sets osd_up_thru to an epoch. + * Then, we have something like + * + * 1: A B + * 2: B up_thru[B]=0 + * 3: + * 4: A + * + * -> we can ignore B, bc it couldn't have gone active (alive_thru + * still 0). + * + * or, + * + * 1: A B + * 2: B up_thru[B]=0 + * 3: B up_thru[B]=2 + * 4: + * 5: A + * + * -> we must wait for B, bc it was alive through 2, and could have + * written to the pg. + * + * If B is really dead, then an administrator will need to manually + * intervene by marking the OSD as "lost." + */ + + // Include current acting and up nodes... not because they may + // contain old data (this interval hasn't gone active, obviously), + // but because we want their pg_info to inform choose_acting(), and + // so that we know what they do/do not have explicitly before + // sending them any new info/logs/whatever. + for (unsigned i = 0; i < acting.size(); i++) { + if (acting[i] != pg_pool_t::pg_CRUSH_ITEM_NONE) + probe.insert(pg_shard_t(acting[i], ec_pool ? shard_id_t(i) : shard_id_t::NO_SHARD)); + } + // It may be possible to exclude the up nodes, but let's keep them in + // there for now. + for (unsigned i = 0; i < up.size(); i++) { + if (up[i] != pg_pool_t::pg_CRUSH_ITEM_NONE) + probe.insert(pg_shard_t(up[i], ec_pool ? shard_id_t(i) : shard_id_t::NO_SHARD)); + } + + std::set<pg_shard_t> all_probe = past_intervals.get_all_probe(ec_pool); + ldpp_dout(dpp, 10) << "build_prior all_probe " << all_probe << dendl; + for (auto &&i: all_probe) { + switch (f(0, i.osd, nullptr)) { + case UP: { + probe.insert(i); + break; + } + case DNE: + case LOST: + case DOWN: { + down.insert(i.osd); + break; + } + } + } + + past_intervals.iterate_mayberw_back_to( + last_epoch_started, + [&](epoch_t start, const std::set<pg_shard_t> &acting) { + ldpp_dout(dpp, 10) << "build_prior maybe_rw interval:" << start + << ", acting: " << acting << dendl; + + // look at candidate osds during this interval. each falls into + // one of three categories: up, down (but potentially + // interesting), or lost (down, but we won't wait for it). + std::set<pg_shard_t> up_now; + std::map<int, epoch_t> candidate_blocked_by; + // any candidates down now (that might have useful data) + bool any_down_now = false; + + // consider ACTING osds + for (auto &&so: acting) { + epoch_t lost_at = 0; + switch (f(start, so.osd, &lost_at)) { + case UP: { + // include past acting osds if they are up. + up_now.insert(so); + break; + } + case DNE: { + ldpp_dout(dpp, 10) << "build_prior prior osd." << so.osd + << " no longer exists" << dendl; + break; + } + case LOST: { + ldpp_dout(dpp, 10) << "build_prior prior osd." << so.osd + << " is down, but lost_at " << lost_at << dendl; + up_now.insert(so); + break; + } + case DOWN: { + ldpp_dout(dpp, 10) << "build_prior prior osd." << so.osd + << " is down" << dendl; + candidate_blocked_by[so.osd] = lost_at; + any_down_now = true; + break; + } + } + } + + // if not enough osds survived this interval, and we may have gone rw, + // then we need to wait for one of those osds to recover to + // ensure that we haven't lost any information. + if (!(*pcontdec)(up_now) && any_down_now) { + // fixme: how do we identify a "clean" shutdown anyway? + ldpp_dout(dpp, 10) << "build_prior possibly went active+rw," + << " insufficient up; including down osds" << dendl; + ceph_assert(!candidate_blocked_by.empty()); + pg_down = true; + blocked_by.insert( + candidate_blocked_by.begin(), + candidate_blocked_by.end()); + } + }); + + ldpp_dout(dpp, 10) << "build_prior final: probe " << probe + << " down " << down + << " blocked_by " << blocked_by + << (pg_down ? " pg_down":"") + << dendl; +} + +struct pg_notify_t { + epoch_t query_epoch; + epoch_t epoch_sent; + pg_info_t info; + shard_id_t to; + shard_id_t from; + PastIntervals past_intervals; + pg_notify_t() : + query_epoch(0), epoch_sent(0), to(shard_id_t::NO_SHARD), + from(shard_id_t::NO_SHARD) {} + pg_notify_t( + shard_id_t to, + shard_id_t from, + epoch_t query_epoch, + epoch_t epoch_sent, + const pg_info_t &info, + const PastIntervals& pi) + : query_epoch(query_epoch), + epoch_sent(epoch_sent), + info(info), to(to), from(from), + past_intervals(pi) { + ceph_assert(from == info.pgid.shard); + } + void encode(ceph::buffer::list &bl) const; + void decode(ceph::buffer::list::const_iterator &p); + void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list<pg_notify_t*> &o); +}; +WRITE_CLASS_ENCODER(pg_notify_t) +std::ostream &operator<<(std::ostream &lhs, const pg_notify_t ¬ify); + + +/** + * pg_query_t - used to ask a peer for information about a pg. + * + * note: if version=0, type=LOG, then we just provide our full log. + */ +struct pg_query_t { + enum { + INFO = 0, + LOG = 1, + MISSING = 4, + FULLLOG = 5, + }; + std::string_view get_type_name() const { + switch (type) { + case INFO: return "info"; + case LOG: return "log"; + case MISSING: return "missing"; + case FULLLOG: return "fulllog"; + default: return "???"; + } + } + + __s32 type; + eversion_t since; + pg_history_t history; + epoch_t epoch_sent; + shard_id_t to; + shard_id_t from; + + pg_query_t() : type(-1), epoch_sent(0), to(shard_id_t::NO_SHARD), + from(shard_id_t::NO_SHARD) {} + pg_query_t( + int t, + shard_id_t to, + shard_id_t from, + const pg_history_t& h, + epoch_t epoch_sent) + : type(t), + history(h), + epoch_sent(epoch_sent), + to(to), from(from) { + ceph_assert(t != LOG); + } + pg_query_t( + int t, + shard_id_t to, + shard_id_t from, + eversion_t s, + const pg_history_t& h, + epoch_t epoch_sent) + : type(t), since(s), history(h), + epoch_sent(epoch_sent), to(to), from(from) { + ceph_assert(t == LOG); + } + + void encode(ceph::buffer::list &bl, uint64_t features) const; + void decode(ceph::buffer::list::const_iterator &bl); + + void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list<pg_query_t*>& o); +}; +WRITE_CLASS_ENCODER_FEATURES(pg_query_t) + +inline std::ostream& operator<<(std::ostream& out, const pg_query_t& q) { + out << "query(" << q.get_type_name() << " " << q.since; + if (q.type == pg_query_t::LOG) + out << " " << q.history; + out << " epoch_sent " << q.epoch_sent; + out << ")"; + return out; +} + +/** + * pg_lease_t - readable lease metadata, from primary -> non-primary + * + * This metadata serves to increase either or both of the lease expiration + * and upper bound on the non-primary. + */ +struct pg_lease_t { + /// pg readable_until value; replicas must not be readable beyond this + ceph::signedspan readable_until = ceph::signedspan::zero(); + + /// upper bound on any acting osd's readable_until + ceph::signedspan readable_until_ub = ceph::signedspan::zero(); + + /// duration of the lease (in case clock deltas aren't available) + ceph::signedspan interval = ceph::signedspan::zero(); + + pg_lease_t() {} + pg_lease_t(ceph::signedspan ru, ceph::signedspan ruub, + ceph::signedspan i) + : readable_until(ru), + readable_until_ub(ruub), + interval(i) {} + + void encode(ceph::buffer::list &bl) const; + void decode(ceph::buffer::list::const_iterator &bl); + void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list<pg_lease_t*>& o); + + friend std::ostream& operator<<(std::ostream& out, const pg_lease_t& l) { + return out << "pg_lease(ru " << l.readable_until + << " ub " << l.readable_until_ub + << " int " << l.interval << ")"; + } +}; +WRITE_CLASS_ENCODER(pg_lease_t) + +/** + * pg_lease_ack_t - lease ack, from non-primary -> primary + * + * This metadata acknowledges to the primary what a non-primary's noted + * upper bound is. + */ +struct pg_lease_ack_t { + /// highest upper bound non-primary has recorded (primary's clock) + ceph::signedspan readable_until_ub = ceph::signedspan::zero(); + + pg_lease_ack_t() {} + pg_lease_ack_t(ceph::signedspan ub) + : readable_until_ub(ub) {} + + void encode(ceph::buffer::list &bl) const; + void decode(ceph::buffer::list::const_iterator &bl); + void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list<pg_lease_ack_t*>& o); + + friend std::ostream& operator<<(std::ostream& out, const pg_lease_ack_t& l) { + return out << "pg_lease_ack(ruub " << l.readable_until_ub << ")"; + } +}; +WRITE_CLASS_ENCODER(pg_lease_ack_t) + + + +class PGBackend; +class ObjectModDesc { + bool can_local_rollback; + bool rollback_info_completed; + + // version required to decode, reflected in encode/decode version + __u8 max_required_version = 1; +public: + class Visitor { + public: + virtual void append(uint64_t old_offset) {} + virtual void setattrs(std::map<std::string, std::optional<ceph::buffer::list>> &attrs) {} + virtual void rmobject(version_t old_version) {} + /** + * Used to support the unfound_lost_delete log event: if the stashed + * version exists, we unstash it, otherwise, we do nothing. This way + * each replica rolls back to whatever state it had prior to the attempt + * at mark unfound lost delete + */ + virtual void try_rmobject(version_t old_version) { + rmobject(old_version); + } + virtual void create() {} + virtual void update_snaps(const std::set<snapid_t> &old_snaps) {} + virtual void rollback_extents( + version_t gen, + const std::vector<std::pair<uint64_t, uint64_t> > &extents) {} + virtual ~Visitor() {} + }; + void visit(Visitor *visitor) const; + mutable ceph::buffer::list bl; + enum ModID { + APPEND = 1, + SETATTRS = 2, + DELETE = 3, + CREATE = 4, + UPDATE_SNAPS = 5, + TRY_DELETE = 6, + ROLLBACK_EXTENTS = 7 + }; + ObjectModDesc() : can_local_rollback(true), rollback_info_completed(false) { + bl.reassign_to_mempool(mempool::mempool_osd_pglog); + } + void claim(ObjectModDesc &other) { + bl = std::move(other.bl); + can_local_rollback = other.can_local_rollback; + rollback_info_completed = other.rollback_info_completed; + } + void claim_append(ObjectModDesc &other) { + if (!can_local_rollback || rollback_info_completed) + return; + if (!other.can_local_rollback) { + mark_unrollbackable(); + return; + } + bl.claim_append(other.bl); + rollback_info_completed = other.rollback_info_completed; + } + void swap(ObjectModDesc &other) { + bl.swap(other.bl); + + using std::swap; + swap(other.can_local_rollback, can_local_rollback); + swap(other.rollback_info_completed, rollback_info_completed); + swap(other.max_required_version, max_required_version); + } + void append_id(ModID id) { + using ceph::encode; + uint8_t _id(id); + encode(_id, bl); + } + void append(uint64_t old_size) { + if (!can_local_rollback || rollback_info_completed) + return; + ENCODE_START(1, 1, bl); + append_id(APPEND); + encode(old_size, bl); + ENCODE_FINISH(bl); + } + void setattrs(std::map<std::string, std::optional<ceph::buffer::list>> &old_attrs) { + if (!can_local_rollback || rollback_info_completed) + return; + ENCODE_START(1, 1, bl); + append_id(SETATTRS); + encode(old_attrs, bl); + ENCODE_FINISH(bl); + } + bool rmobject(version_t deletion_version) { + if (!can_local_rollback || rollback_info_completed) + return false; + ENCODE_START(1, 1, bl); + append_id(DELETE); + encode(deletion_version, bl); + ENCODE_FINISH(bl); + rollback_info_completed = true; + return true; + } + bool try_rmobject(version_t deletion_version) { + if (!can_local_rollback || rollback_info_completed) + return false; + ENCODE_START(1, 1, bl); + append_id(TRY_DELETE); + encode(deletion_version, bl); + ENCODE_FINISH(bl); + rollback_info_completed = true; + return true; + } + void create() { + if (!can_local_rollback || rollback_info_completed) + return; + rollback_info_completed = true; + ENCODE_START(1, 1, bl); + append_id(CREATE); + ENCODE_FINISH(bl); + } + void update_snaps(const std::set<snapid_t> &old_snaps) { + if (!can_local_rollback || rollback_info_completed) + return; + ENCODE_START(1, 1, bl); + append_id(UPDATE_SNAPS); + encode(old_snaps, bl); + ENCODE_FINISH(bl); + } + void rollback_extents( + version_t gen, const std::vector<std::pair<uint64_t, uint64_t> > &extents) { + ceph_assert(can_local_rollback); + ceph_assert(!rollback_info_completed); + if (max_required_version < 2) + max_required_version = 2; + ENCODE_START(2, 2, bl); + append_id(ROLLBACK_EXTENTS); + encode(gen, bl); + encode(extents, bl); + ENCODE_FINISH(bl); + } + + // cannot be rolled back + void mark_unrollbackable() { + can_local_rollback = false; + bl.clear(); + } + bool can_rollback() const { + return can_local_rollback; + } + bool empty() const { + return can_local_rollback && (bl.length() == 0); + } + + bool requires_kraken() const { + return max_required_version >= 2; + } + + /** + * Create fresh copy of bl bytes to avoid keeping large buffers around + * in the case that bl contains ptrs which point into a much larger + * message buffer + */ + void trim_bl() const { + if (bl.length() > 0) + bl.rebuild(); + } + void encode(ceph::buffer::list &bl) const; + void decode(ceph::buffer::list::const_iterator &bl); + void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list<ObjectModDesc*>& o); +}; +WRITE_CLASS_ENCODER(ObjectModDesc) + +class ObjectCleanRegions { +private: + bool new_object; + bool clean_omap; + interval_set<uint64_t> clean_offsets; + static std::atomic<uint32_t> max_num_intervals; + + /** + * trim the number of intervals if clean_offsets.num_intervals() + * exceeds the given upbound max_num_intervals + * etc. max_num_intervals=2, clean_offsets:{[5~10], [20~5]} + * then new interval [30~10] will evict out the shortest one [20~5] + * finally, clean_offsets becomes {[5~10], [30~10]} + */ + void trim(); + friend std::ostream& operator<<(std::ostream& out, const ObjectCleanRegions& ocr); +public: + ObjectCleanRegions() : new_object(false), clean_omap(true) { + clean_offsets.insert(0, (uint64_t)-1); + } + ObjectCleanRegions(uint64_t offset, uint64_t len, bool co) + : new_object(false), clean_omap(co) { + clean_offsets.insert(offset, len); + } + bool operator==(const ObjectCleanRegions &orc) const { + return new_object == orc.new_object && clean_omap == orc.clean_omap && clean_offsets == orc.clean_offsets; + } + static void set_max_num_intervals(uint32_t num); + void merge(const ObjectCleanRegions &other); + void mark_data_region_dirty(uint64_t offset, uint64_t len); + void mark_omap_dirty(); + void mark_object_new(); + void mark_fully_dirty(); + interval_set<uint64_t> get_dirty_regions() const; + bool omap_is_dirty() const; + bool object_is_exist() const; + bool is_clean_region(uint64_t offset, uint64_t len) const; + + void encode(ceph::buffer::list &bl) const; + void decode(ceph::buffer::list::const_iterator &bl); + void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list<ObjectCleanRegions*>& o); +}; +WRITE_CLASS_ENCODER(ObjectCleanRegions) +std::ostream& operator<<(std::ostream& out, const ObjectCleanRegions& ocr); + + +struct OSDOp { + ceph_osd_op op; + sobject_t soid; + + ceph::buffer::list indata, outdata; + errorcode32_t rval = 0; + + OSDOp() { + // FIPS zeroization audit 20191115: this memset clean for security + memset(&op, 0, sizeof(ceph_osd_op)); + } + + OSDOp(const int op_code) { + // FIPS zeroization audit 20191115: this memset clean for security + memset(&op, 0, sizeof(ceph_osd_op)); + op.op = op_code; + } + + /** + * split a ceph::buffer::list into constituent indata members of a vector of OSDOps + * + * @param ops [out] vector of OSDOps + * @param in [in] combined data buffer + */ + template<typename V> + static void split_osd_op_vector_in_data(V& ops, + ceph::buffer::list& in) { + ceph::buffer::list::iterator datap = in.begin(); + for (unsigned i = 0; i < ops.size(); i++) { + if (ops[i].op.payload_len) { + datap.copy(ops[i].op.payload_len, ops[i].indata); + } + } + } + + /** + * merge indata members of a vector of OSDOp into a single ceph::buffer::list + * + * Notably this also encodes certain other OSDOp data into the data + * buffer, including the sobject_t soid. + * + * @param ops [in] vector of OSDOps + * @param out [out] combined data buffer + */ + template<typename V> + static void merge_osd_op_vector_in_data(V& ops, ceph::buffer::list& out) { + for (unsigned i = 0; i < ops.size(); i++) { + if (ops[i].indata.length()) { + ops[i].op.payload_len = ops[i].indata.length(); + out.append(ops[i].indata); + } + } + } + + /** + * split a ceph::buffer::list into constituent outdata members of a vector of OSDOps + * + * @param ops [out] vector of OSDOps + * @param in [in] combined data buffer + */ + static void split_osd_op_vector_out_data(std::vector<OSDOp>& ops, ceph::buffer::list& in); + + /** + * merge outdata members of a vector of OSDOps into a single ceph::buffer::list + * + * @param ops [in] vector of OSDOps + * @param out [out] combined data buffer + */ + static void merge_osd_op_vector_out_data(std::vector<OSDOp>& ops, ceph::buffer::list& out); + + /** + * Clear data as much as possible, leave minimal data for historical op dump + * + * @param ops [in] vector of OSDOps + */ + template<typename V> + static void clear_data(V& ops) { + for (unsigned i = 0; i < ops.size(); i++) { + OSDOp& op = ops[i]; + op.outdata.clear(); + if (ceph_osd_op_type_attr(op.op.op) && + op.op.xattr.name_len && + op.indata.length() >= op.op.xattr.name_len) { + ceph::buffer::list bl; + bl.push_back(ceph::buffer::ptr_node::create(op.op.xattr.name_len)); + bl.begin().copy_in(op.op.xattr.name_len, op.indata); + op.indata = std::move(bl); + } else if (ceph_osd_op_type_exec(op.op.op) && + op.op.cls.class_len && + op.indata.length() > + (op.op.cls.class_len + op.op.cls.method_len)) { + __u8 len = op.op.cls.class_len + op.op.cls.method_len; + ceph::buffer::list bl; + bl.push_back(ceph::buffer::ptr_node::create(len)); + bl.begin().copy_in(len, op.indata); + op.indata = std::move(bl); + } else { + op.indata.clear(); + } + } + } +}; +std::ostream& operator<<(std::ostream& out, const OSDOp& op); + +struct pg_log_op_return_item_t { + int32_t rval; + ceph::buffer::list bl; + void encode(ceph::buffer::list& p) const { + using ceph::encode; + encode(rval, p); + encode(bl, p); + } + void decode(ceph::buffer::list::const_iterator& p) { + using ceph::decode; + decode(rval, p); + decode(bl, p); + } + void dump(ceph::Formatter *f) const { + f->dump_int("rval", rval); + f->dump_unsigned("bl_length", bl.length()); + } + friend bool operator==(const pg_log_op_return_item_t& lhs, + const pg_log_op_return_item_t& rhs) { + return lhs.rval == rhs.rval && + lhs.bl.contents_equal(rhs.bl); + } + friend bool operator!=(const pg_log_op_return_item_t& lhs, + const pg_log_op_return_item_t& rhs) { + return !(lhs == rhs); + } + friend std::ostream& operator<<(std::ostream& out, const pg_log_op_return_item_t& i) { + return out << "r=" << i.rval << "+" << i.bl.length() << "b"; + } +}; +WRITE_CLASS_ENCODER(pg_log_op_return_item_t) + +/** + * pg_log_entry_t - single entry/event in pg log + * + */ +struct pg_log_entry_t { + enum { + MODIFY = 1, // some unspecified modification (but not *all* modifications) + CLONE = 2, // cloned object from head + DELETE = 3, // deleted object + //BACKLOG = 4, // event invented by generate_backlog [obsolete] + LOST_REVERT = 5, // lost new version, revert to an older version. + LOST_DELETE = 6, // lost new version, revert to no object (deleted). + LOST_MARK = 7, // lost new version, now EIO + PROMOTE = 8, // promoted object from another tier + CLEAN = 9, // mark an object clean + ERROR = 10, // write that returned an error + }; + static const char *get_op_name(int op) { + switch (op) { + case MODIFY: + return "modify"; + case PROMOTE: + return "promote"; + case CLONE: + return "clone"; + case DELETE: + return "delete"; + case LOST_REVERT: + return "l_revert"; + case LOST_DELETE: + return "l_delete"; + case LOST_MARK: + return "l_mark"; + case CLEAN: + return "clean"; + case ERROR: + return "error"; + default: + return "unknown"; + } + } + const char *get_op_name() const { + return get_op_name(op); + } + + // describes state for a locally-rollbackable entry + ObjectModDesc mod_desc; + ceph::buffer::list snaps; // only for clone entries + hobject_t soid; + osd_reqid_t reqid; // caller+tid to uniquely identify request + mempool::osd_pglog::vector<std::pair<osd_reqid_t, version_t> > extra_reqids; + + /// map extra_reqids by index to error return code (if any) + mempool::osd_pglog::map<uint32_t, int> extra_reqid_return_codes; + + eversion_t version, prior_version, reverting_to; + version_t user_version; // the user version for this entry + utime_t mtime; // this is the _user_ mtime, mind you + int32_t return_code; // only stored for ERRORs for dup detection + + std::vector<pg_log_op_return_item_t> op_returns; + + __s32 op; + bool invalid_hash; // only when decoding sobject_t based entries + bool invalid_pool; // only when decoding pool-less hobject based entries + ObjectCleanRegions clean_regions; + + pg_log_entry_t() + : user_version(0), return_code(0), op(0), + invalid_hash(false), invalid_pool(false) { + snaps.reassign_to_mempool(mempool::mempool_osd_pglog); + } + pg_log_entry_t(int _op, const hobject_t& _soid, + const eversion_t& v, const eversion_t& pv, + version_t uv, + const osd_reqid_t& rid, const utime_t& mt, + int return_code) + : soid(_soid), reqid(rid), version(v), prior_version(pv), user_version(uv), + mtime(mt), return_code(return_code), op(_op), + invalid_hash(false), invalid_pool(false) { + snaps.reassign_to_mempool(mempool::mempool_osd_pglog); + } + + bool is_clone() const { return op == CLONE; } + bool is_modify() const { return op == MODIFY; } + bool is_promote() const { return op == PROMOTE; } + bool is_clean() const { return op == CLEAN; } + bool is_lost_revert() const { return op == LOST_REVERT; } + bool is_lost_delete() const { return op == LOST_DELETE; } + bool is_lost_mark() const { return op == LOST_MARK; } + bool is_error() const { return op == ERROR; } + + bool is_update() const { + return + is_clone() || is_modify() || is_promote() || is_clean() || + is_lost_revert() || is_lost_mark(); + } + bool is_delete() const { + return op == DELETE || op == LOST_DELETE; + } + + bool can_rollback() const { + return mod_desc.can_rollback(); + } + + void mark_unrollbackable() { + mod_desc.mark_unrollbackable(); + } + + bool requires_kraken() const { + return mod_desc.requires_kraken(); + } + + // Errors are only used for dup detection, whereas + // the index by objects is used by recovery, copy_get, + // and other facilities that don't expect or need to + // be aware of error entries. + bool object_is_indexed() const { + return !is_error(); + } + + bool reqid_is_indexed() const { + return reqid != osd_reqid_t() && + (op == MODIFY || op == DELETE || op == ERROR); + } + + void set_op_returns(const std::vector<OSDOp>& ops) { + op_returns.resize(ops.size()); + for (unsigned i = 0; i < ops.size(); ++i) { + op_returns[i].rval = ops[i].rval; + op_returns[i].bl = ops[i].outdata; + } + } + + std::string get_key_name() const; + void encode_with_checksum(ceph::buffer::list& bl) const; + void decode_with_checksum(ceph::buffer::list::const_iterator& p); + + void encode(ceph::buffer::list &bl) const; + void decode(ceph::buffer::list::const_iterator &bl); + void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list<pg_log_entry_t*>& o); + +}; +WRITE_CLASS_ENCODER(pg_log_entry_t) + +std::ostream& operator<<(std::ostream& out, const pg_log_entry_t& e); + +struct pg_log_dup_t { + osd_reqid_t reqid; // caller+tid to uniquely identify request + eversion_t version; + version_t user_version; // the user version for this entry + int32_t return_code; // only stored for ERRORs for dup detection + + std::vector<pg_log_op_return_item_t> op_returns; + + pg_log_dup_t() + : user_version(0), return_code(0) + {} + explicit pg_log_dup_t(const pg_log_entry_t& entry) + : reqid(entry.reqid), version(entry.version), + user_version(entry.user_version), + return_code(entry.return_code), + op_returns(entry.op_returns) + {} + pg_log_dup_t(const eversion_t& v, version_t uv, + const osd_reqid_t& rid, int return_code) + : reqid(rid), version(v), user_version(uv), + return_code(return_code) + {} + + std::string get_key_name() const; + void encode(ceph::buffer::list &bl) const; + void decode(ceph::buffer::list::const_iterator &bl); + void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list<pg_log_dup_t*>& o); + + bool operator==(const pg_log_dup_t &rhs) const { + return reqid == rhs.reqid && + version == rhs.version && + user_version == rhs.user_version && + return_code == rhs.return_code && + op_returns == rhs.op_returns; + } + bool operator!=(const pg_log_dup_t &rhs) const { + return !(*this == rhs); + } + + friend std::ostream& operator<<(std::ostream& out, const pg_log_dup_t& e); +}; +WRITE_CLASS_ENCODER(pg_log_dup_t) + +std::ostream& operator<<(std::ostream& out, const pg_log_dup_t& e); + +/** + * pg_log_t - incremental log of recent pg changes. + * + * serves as a recovery queue for recent changes. + */ +struct pg_log_t { + /* + * head - newest entry (update|delete) + * tail - entry previous to oldest (update|delete) for which we have + * complete negative information. + * i.e. we can infer pg contents for any store whose last_update >= tail. + */ + eversion_t head; // newest entry + eversion_t tail; // version prior to oldest + +protected: + // We can rollback rollback-able entries > can_rollback_to + eversion_t can_rollback_to; + + // always <= can_rollback_to, indicates how far stashed rollback + // data can be found + eversion_t rollback_info_trimmed_to; + +public: + // the actual log + mempool::osd_pglog::list<pg_log_entry_t> log; + + // entries just for dup op detection ordered oldest to newest + mempool::osd_pglog::list<pg_log_dup_t> dups; + + pg_log_t() = default; + pg_log_t(const eversion_t &last_update, + const eversion_t &log_tail, + const eversion_t &can_rollback_to, + const eversion_t &rollback_info_trimmed_to, + mempool::osd_pglog::list<pg_log_entry_t> &&entries, + mempool::osd_pglog::list<pg_log_dup_t> &&dup_entries) + : head(last_update), tail(log_tail), can_rollback_to(can_rollback_to), + rollback_info_trimmed_to(rollback_info_trimmed_to), + log(std::move(entries)), dups(std::move(dup_entries)) {} + pg_log_t(const eversion_t &last_update, + const eversion_t &log_tail, + const eversion_t &can_rollback_to, + const eversion_t &rollback_info_trimmed_to, + const std::list<pg_log_entry_t> &entries, + const std::list<pg_log_dup_t> &dup_entries) + : head(last_update), tail(log_tail), can_rollback_to(can_rollback_to), + rollback_info_trimmed_to(rollback_info_trimmed_to) { + for (auto &&entry: entries) { + log.push_back(entry); + } + for (auto &&entry: dup_entries) { + dups.push_back(entry); + } + } + + void clear() { + eversion_t z; + rollback_info_trimmed_to = can_rollback_to = head = tail = z; + log.clear(); + dups.clear(); + } + + eversion_t get_rollback_info_trimmed_to() const { + return rollback_info_trimmed_to; + } + eversion_t get_can_rollback_to() const { + return can_rollback_to; + } + + + pg_log_t split_out_child(pg_t child_pgid, unsigned split_bits) { + mempool::osd_pglog::list<pg_log_entry_t> oldlog, childlog; + oldlog.swap(log); + + eversion_t old_tail; + unsigned mask = ~((~0)<<split_bits); + for (auto i = oldlog.begin(); + i != oldlog.end(); + ) { + if ((i->soid.get_hash() & mask) == child_pgid.m_seed) { + childlog.push_back(*i); + } else { + log.push_back(*i); + } + oldlog.erase(i++); + } + + // osd_reqid is unique, so it doesn't matter if there are extra + // dup entries in each pg. To avoid storing oid with the dup + // entries, just copy the whole list. + auto childdups(dups); + + return pg_log_t( + head, + tail, + can_rollback_to, + rollback_info_trimmed_to, + std::move(childlog), + std::move(childdups)); + } + + mempool::osd_pglog::list<pg_log_entry_t> rewind_from_head(eversion_t newhead) { + ceph_assert(newhead >= tail); + + mempool::osd_pglog::list<pg_log_entry_t>::iterator p = log.end(); + mempool::osd_pglog::list<pg_log_entry_t> divergent; + while (true) { + if (p == log.begin()) { + // yikes, the whole thing is divergent! + using std::swap; + swap(divergent, log); + break; + } + --p; + if (p->version.version <= newhead.version) { + /* + * look at eversion.version here. we want to avoid a situation like: + * our log: 100'10 (0'0) m 10000004d3a.00000000/head by client4225.1:18529 + * new log: 122'10 (0'0) m 10000004d3a.00000000/head by client4225.1:18529 + * lower_bound = 100'9 + * i.e, same request, different version. If the eversion.version is > the + * lower_bound, we it is divergent. + */ + ++p; + divergent.splice(divergent.begin(), log, p, log.end()); + break; + } + ceph_assert(p->version > newhead); + } + head = newhead; + + if (can_rollback_to > newhead) + can_rollback_to = newhead; + + if (rollback_info_trimmed_to > newhead) + rollback_info_trimmed_to = newhead; + + return divergent; + } + + void merge_from(const std::vector<pg_log_t*>& slogs, eversion_t last_update) { + log.clear(); + + // sort and merge dups + std::multimap<eversion_t,pg_log_dup_t> sorted; + for (auto& d : dups) { + sorted.emplace(d.version, d); + } + for (auto l : slogs) { + for (auto& d : l->dups) { + sorted.emplace(d.version, d); + } + } + dups.clear(); + for (auto& i : sorted) { + dups.push_back(i.second); + } + + head = last_update; + tail = last_update; + can_rollback_to = last_update; + rollback_info_trimmed_to = last_update; + } + + bool empty() const { + return log.empty(); + } + + bool null() const { + return head.version == 0 && head.epoch == 0; + } + + uint64_t approx_size() const { + return head.version - tail.version; + } + + static void filter_log(spg_t import_pgid, const OSDMap &curmap, + const std::string &hit_set_namespace, const pg_log_t &in, + pg_log_t &out, pg_log_t &reject); + + /** + * copy entries from the tail of another pg_log_t + * + * @param other pg_log_t to copy from + * @param from copy entries after this version + */ + void copy_after(CephContext* cct, const pg_log_t &other, eversion_t from); + + /** + * copy up to N entries + * + * @param other source log + * @param max max number of entries to copy + */ + void copy_up_to(CephContext* cct, const pg_log_t &other, int max); + + std::ostream& print(std::ostream& out) const; + + void encode(ceph::buffer::list &bl) const; + void decode(ceph::buffer::list::const_iterator &bl, int64_t pool = -1); + void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list<pg_log_t*>& o); +}; +WRITE_CLASS_ENCODER(pg_log_t) + +inline std::ostream& operator<<(std::ostream& out, const pg_log_t& log) +{ + out << "log((" << log.tail << "," << log.head << "], crt=" + << log.get_can_rollback_to() << ")"; + return out; +} + + +/** + * pg_missing_t - summary of missing objects. + * + * kept in memory, as a supplement to pg_log_t + * also used to pass missing info in messages. + */ +struct pg_missing_item { + eversion_t need, have; + ObjectCleanRegions clean_regions; + enum missing_flags_t { + FLAG_NONE = 0, + FLAG_DELETE = 1, + } flags; + pg_missing_item() : flags(FLAG_NONE) {} + explicit pg_missing_item(eversion_t n) : need(n), flags(FLAG_NONE) {} // have no old version + pg_missing_item(eversion_t n, eversion_t h, bool is_delete=false, bool old_style = false) : + need(n), have(h) { + set_delete(is_delete); + if (old_style) + clean_regions.mark_fully_dirty(); + } + + void encode(ceph::buffer::list& bl, uint64_t features) const { + using ceph::encode; + if (HAVE_FEATURE(features, SERVER_OCTOPUS)) { + // encoding a zeroed eversion_t to differentiate between OSD_RECOVERY_DELETES、 + // SERVER_OCTOPUS and legacy unversioned encoding - a need value of 0'0 is not + // possible. This can be replaced with the legacy encoding + encode(eversion_t(), bl); + encode(eversion_t(-1, -1), bl); + encode(need, bl); + encode(have, bl); + encode(static_cast<uint8_t>(flags), bl); + encode(clean_regions, bl); + } else { + encode(eversion_t(), bl); + encode(need, bl); + encode(have, bl); + encode(static_cast<uint8_t>(flags), bl); + } + } + void decode(ceph::buffer::list::const_iterator& bl) { + using ceph::decode; + eversion_t e, l; + decode(e, bl); + decode(l, bl); + if(l == eversion_t(-1, -1)) { + // support all + decode(need, bl); + decode(have, bl); + uint8_t f; + decode(f, bl); + flags = static_cast<missing_flags_t>(f); + decode(clean_regions, bl); + } else { + // support OSD_RECOVERY_DELETES + need = l; + decode(have, bl); + uint8_t f; + decode(f, bl); + flags = static_cast<missing_flags_t>(f); + clean_regions.mark_fully_dirty(); + } + } + + void set_delete(bool is_delete) { + flags = is_delete ? FLAG_DELETE : FLAG_NONE; + } + + bool is_delete() const { + return (flags & FLAG_DELETE) == FLAG_DELETE; + } + + std::string flag_str() const { + if (flags == FLAG_NONE) { + return "none"; + } else { + return "delete"; + } + } + + void dump(ceph::Formatter *f) const { + f->dump_stream("need") << need; + f->dump_stream("have") << have; + f->dump_stream("flags") << flag_str(); + f->dump_stream("clean_regions") << clean_regions; + } + static void generate_test_instances(std::list<pg_missing_item*>& o) { + o.push_back(new pg_missing_item); + o.push_back(new pg_missing_item); + o.back()->need = eversion_t(1, 2); + o.back()->have = eversion_t(1, 1); + o.push_back(new pg_missing_item); + o.back()->need = eversion_t(3, 5); + o.back()->have = eversion_t(3, 4); + o.back()->clean_regions.mark_data_region_dirty(4096, 8192); + o.back()->clean_regions.mark_omap_dirty(); + o.back()->flags = FLAG_DELETE; + } + bool operator==(const pg_missing_item &rhs) const { + return need == rhs.need && have == rhs.have && flags == rhs.flags; + } + bool operator!=(const pg_missing_item &rhs) const { + return !(*this == rhs); + } +}; +WRITE_CLASS_ENCODER_FEATURES(pg_missing_item) +std::ostream& operator<<(std::ostream& out, const pg_missing_item &item); + +class pg_missing_const_i { +public: + virtual const std::map<hobject_t, pg_missing_item> & + get_items() const = 0; + virtual const std::map<version_t, hobject_t> &get_rmissing() const = 0; + virtual bool get_may_include_deletes() const = 0; + virtual unsigned int num_missing() const = 0; + virtual bool have_missing() const = 0; + virtual bool is_missing(const hobject_t& oid, pg_missing_item *out = nullptr) const = 0; + virtual bool is_missing(const hobject_t& oid, eversion_t v) const = 0; + virtual ~pg_missing_const_i() {} +}; + + +template <bool Track> +class ChangeTracker { +public: + void changed(const hobject_t &obj) {} + template <typename F> + void get_changed(F &&f) const {} + void flush() {} + bool is_clean() const { + return true; + } +}; +template <> +class ChangeTracker<true> { + std::set<hobject_t> _changed; +public: + void changed(const hobject_t &obj) { + _changed.insert(obj); + } + template <typename F> + void get_changed(F &&f) const { + for (auto const &i: _changed) { + f(i); + } + } + void flush() { + _changed.clear(); + } + bool is_clean() const { + return _changed.empty(); + } +}; + +template <bool TrackChanges> +class pg_missing_set : public pg_missing_const_i { + using item = pg_missing_item; + std::map<hobject_t, item> missing; // oid -> (need v, have v) + std::map<version_t, hobject_t> rmissing; // v -> oid + ChangeTracker<TrackChanges> tracker; + +public: + pg_missing_set() = default; + + template <typename missing_type> + pg_missing_set(const missing_type &m) { + missing = m.get_items(); + rmissing = m.get_rmissing(); + may_include_deletes = m.get_may_include_deletes(); + for (auto &&i: missing) + tracker.changed(i.first); + } + + bool may_include_deletes = false; + + const std::map<hobject_t, item> &get_items() const override { + return missing; + } + const std::map<version_t, hobject_t> &get_rmissing() const override { + return rmissing; + } + bool get_may_include_deletes() const override { + return may_include_deletes; + } + unsigned int num_missing() const override { + return missing.size(); + } + bool have_missing() const override { + return !missing.empty(); + } + void merge(const pg_log_entry_t& e) { + auto miter = missing.find(e.soid); + if (miter != missing.end() && miter->second.have != eversion_t() && e.version > miter->second.have) + miter->second.clean_regions.merge(e.clean_regions); + } + bool is_missing(const hobject_t& oid, pg_missing_item *out = nullptr) const override { + auto iter = missing.find(oid); + if (iter == missing.end()) + return false; + if (out) + *out = iter->second; + return true; + } + bool is_missing(const hobject_t& oid, eversion_t v) const override { + std::map<hobject_t, item>::const_iterator m = + missing.find(oid); + if (m == missing.end()) + return false; + const item &item(m->second); + if (item.need > v) + return false; + return true; + } + eversion_t get_oldest_need() const { + if (missing.empty()) { + return eversion_t(); + } + auto it = missing.find(rmissing.begin()->second); + ceph_assert(it != missing.end()); + return it->second.need; + } + + void claim(pg_missing_set&& o) { + static_assert(!TrackChanges, "Can't use claim with TrackChanges"); + missing = std::move(o.missing); + rmissing = std::move(o.rmissing); + } + + /* + * this needs to be called in log order as we extend the log. it + * assumes missing is accurate up through the previous log entry. + */ + void add_next_event(const pg_log_entry_t& e) { + std::map<hobject_t, item>::iterator missing_it; + missing_it = missing.find(e.soid); + bool is_missing_divergent_item = missing_it != missing.end(); + if (e.prior_version == eversion_t() || e.is_clone()) { + // new object. + if (is_missing_divergent_item) { // use iterator + rmissing.erase(missing_it->second.need.version); + // .have = nil + missing_it->second = item(e.version, eversion_t(), e.is_delete()); + missing_it->second.clean_regions.mark_fully_dirty(); + } else { + // create new element in missing map + // .have = nil + missing[e.soid] = item(e.version, eversion_t(), e.is_delete()); + missing[e.soid].clean_regions.mark_fully_dirty(); + } + } else if (is_missing_divergent_item) { + // already missing (prior). + rmissing.erase((missing_it->second).need.version); + missing_it->second.need = e.version; // leave .have unchanged. + missing_it->second.set_delete(e.is_delete()); + if (e.is_lost_revert()) + missing_it->second.clean_regions.mark_fully_dirty(); + else + missing_it->second.clean_regions.merge(e.clean_regions); + } else { + // not missing, we must have prior_version (if any) + ceph_assert(!is_missing_divergent_item); + missing[e.soid] = item(e.version, e.prior_version, e.is_delete()); + if (e.is_lost_revert()) + missing[e.soid].clean_regions.mark_fully_dirty(); + else + missing[e.soid].clean_regions = e.clean_regions; + } + rmissing[e.version.version] = e.soid; + tracker.changed(e.soid); + } + + void revise_need(hobject_t oid, eversion_t need, bool is_delete) { + auto p = missing.find(oid); + if (p != missing.end()) { + rmissing.erase((p->second).need.version); + p->second.need = need; // do not adjust .have + p->second.set_delete(is_delete); + p->second.clean_regions.mark_fully_dirty(); + } else { + missing[oid] = item(need, eversion_t(), is_delete); + missing[oid].clean_regions.mark_fully_dirty(); + } + rmissing[need.version] = oid; + + tracker.changed(oid); + } + + void revise_have(hobject_t oid, eversion_t have) { + auto p = missing.find(oid); + if (p != missing.end()) { + tracker.changed(oid); + (p->second).have = have; + } + } + + void mark_fully_dirty(const hobject_t& oid) { + auto p = missing.find(oid); + if (p != missing.end()) { + tracker.changed(oid); + (p->second).clean_regions.mark_fully_dirty(); + } + } + + void add(const hobject_t& oid, eversion_t need, eversion_t have, + bool is_delete) { + missing[oid] = item(need, have, is_delete, true); + rmissing[need.version] = oid; + tracker.changed(oid); + } + + void add(const hobject_t& oid, pg_missing_item&& item) { + rmissing[item.need.version] = oid; + missing.insert({oid, std::move(item)}); + tracker.changed(oid); + } + + void rm(const hobject_t& oid, eversion_t v) { + std::map<hobject_t, item>::iterator p = missing.find(oid); + if (p != missing.end() && p->second.need <= v) + rm(p); + } + + void rm(std::map<hobject_t, item>::const_iterator m) { + tracker.changed(m->first); + rmissing.erase(m->second.need.version); + missing.erase(m); + } + + void got(const hobject_t& oid, eversion_t v) { + std::map<hobject_t, item>::iterator p = missing.find(oid); + ceph_assert(p != missing.end()); + ceph_assert(p->second.need <= v || p->second.is_delete()); + got(p); + } + + void got(std::map<hobject_t, item>::const_iterator m) { + tracker.changed(m->first); + rmissing.erase(m->second.need.version); + missing.erase(m); + } + + void split_into( + pg_t child_pgid, + unsigned split_bits, + pg_missing_set *omissing) { + omissing->may_include_deletes = may_include_deletes; + unsigned mask = ~((~0)<<split_bits); + for (std::map<hobject_t, item>::iterator i = missing.begin(); + i != missing.end(); + ) { + if ((i->first.get_hash() & mask) == child_pgid.m_seed) { + omissing->add(i->first, i->second.need, i->second.have, + i->second.is_delete()); + rm(i++); + } else { + ++i; + } + } + } + + void clear() { + for (auto const &i: missing) + tracker.changed(i.first); + missing.clear(); + rmissing.clear(); + } + + void encode(ceph::buffer::list &bl, uint64_t features) const { + ENCODE_START(5, 2, bl) + encode(missing, bl, features); + encode(may_include_deletes, bl); + ENCODE_FINISH(bl); + } + void decode(ceph::buffer::list::const_iterator &bl, int64_t pool = -1) { + for (auto const &i: missing) + tracker.changed(i.first); + DECODE_START_LEGACY_COMPAT_LEN(5, 2, 2, bl); + decode(missing, bl); + if (struct_v >= 4) { + decode(may_include_deletes, bl); + } + DECODE_FINISH(bl); + + if (struct_v < 3) { + // Handle hobject_t upgrade + std::map<hobject_t, item> tmp; + for (std::map<hobject_t, item>::iterator i = + missing.begin(); + i != missing.end(); + ) { + if (!i->first.is_max() && i->first.pool == -1) { + hobject_t to_insert(i->first); + to_insert.pool = pool; + tmp[to_insert] = i->second; + missing.erase(i++); + } else { + ++i; + } + } + missing.insert(tmp.begin(), tmp.end()); + } + + for (std::map<hobject_t,item>::iterator it = + missing.begin(); + it != missing.end(); + ++it) + rmissing[it->second.need.version] = it->first; + for (auto const &i: missing) + tracker.changed(i.first); + } + void dump(ceph::Formatter *f) const { + f->open_array_section("missing"); + for (std::map<hobject_t,item>::const_iterator p = + missing.begin(); p != missing.end(); ++p) { + f->open_object_section("item"); + f->dump_stream("object") << p->first; + p->second.dump(f); + f->close_section(); + } + f->close_section(); + f->dump_bool("may_include_deletes", may_include_deletes); + } + template <typename F> + void filter_objects(F &&f) { + for (auto i = missing.begin(); i != missing.end();) { + if (f(i->first)) { + rm(i++); + } else { + ++i; + } + } + } + static void generate_test_instances(std::list<pg_missing_set*>& o) { + o.push_back(new pg_missing_set); + o.back()->may_include_deletes = true; + o.push_back(new pg_missing_set); + o.back()->add( + hobject_t(object_t("foo"), "foo", 123, 456, 0, ""), + eversion_t(5, 6), eversion_t(5, 1), false); + o.back()->may_include_deletes = true; + o.push_back(new pg_missing_set); + o.back()->add( + hobject_t(object_t("foo"), "foo", 123, 456, 0, ""), + eversion_t(5, 6), eversion_t(5, 1), true); + o.back()->may_include_deletes = true; + } + template <typename F> + void get_changed(F &&f) const { + tracker.get_changed(f); + } + void flush() { + tracker.flush(); + } + bool is_clean() const { + return tracker.is_clean(); + } + template <typename missing_t> + bool debug_verify_from_init( + const missing_t &init_missing, + std::ostream *oss) const { + if (!TrackChanges) + return true; + auto check_missing(init_missing.get_items()); + tracker.get_changed([&](const hobject_t &hoid) { + check_missing.erase(hoid); + if (missing.count(hoid)) { + check_missing.insert(*(missing.find(hoid))); + } + }); + bool ok = true; + if (check_missing.size() != missing.size()) { + if (oss) { + *oss << "Size mismatch, check: " << check_missing.size() + << ", actual: " << missing.size() << "\n"; + } + ok = false; + } + for (auto &i: missing) { + if (!check_missing.count(i.first)) { + if (oss) + *oss << "check_missing missing " << i.first << "\n"; + ok = false; + } else if (check_missing[i.first] != i.second) { + if (oss) + *oss << "check_missing missing item mismatch on " << i.first + << ", check: " << check_missing[i.first] + << ", actual: " << i.second << "\n"; + ok = false; + } + } + if (oss && !ok) { + *oss << "check_missing: " << check_missing << "\n"; + std::set<hobject_t> changed; + tracker.get_changed([&](const hobject_t &hoid) { changed.insert(hoid); }); + *oss << "changed: " << changed << "\n"; + } + return ok; + } +}; +template <bool TrackChanges> +void encode( + const pg_missing_set<TrackChanges> &c, ceph::buffer::list &bl, uint64_t features=0) { + ENCODE_DUMP_PRE(); + c.encode(bl, features); + ENCODE_DUMP_POST(cl); +} +template <bool TrackChanges> +void decode(pg_missing_set<TrackChanges> &c, ceph::buffer::list::const_iterator &p) { + c.decode(p); +} +template <bool TrackChanges> +std::ostream& operator<<(std::ostream& out, const pg_missing_set<TrackChanges> &missing) +{ + out << "missing(" << missing.num_missing() + << " may_include_deletes = " << missing.may_include_deletes; + //if (missing.num_lost()) out << ", " << missing.num_lost() << " lost"; + out << ")"; + return out; +} + +using pg_missing_t = pg_missing_set<false>; +using pg_missing_tracker_t = pg_missing_set<true>; + + + + +/** + * pg list objects response format + * + */ + +template<typename T> +struct pg_nls_response_template { + collection_list_handle_t handle; + std::vector<T> entries; + + void encode(ceph::buffer::list& bl) const { + ENCODE_START(1, 1, bl); + encode(handle, bl); + __u32 n = (__u32)entries.size(); + encode(n, bl); + for (auto i = entries.begin(); i != entries.end(); ++i) { + encode(i->nspace, bl); + encode(i->oid, bl); + encode(i->locator, bl); + } + ENCODE_FINISH(bl); + } + void decode(ceph::buffer::list::const_iterator& bl) { + DECODE_START(1, bl); + decode(handle, bl); + __u32 n; + decode(n, bl); + entries.clear(); + while (n--) { + T i; + decode(i.nspace, bl); + decode(i.oid, bl); + decode(i.locator, bl); + entries.push_back(i); + } + DECODE_FINISH(bl); + } + void dump(ceph::Formatter *f) const { + f->dump_stream("handle") << handle; + f->open_array_section("entries"); + for (auto p = entries.begin(); p != entries.end(); ++p) { + f->open_object_section("object"); + f->dump_string("namespace", p->nspace); + f->dump_string("object", p->oid); + f->dump_string("key", p->locator); + f->close_section(); + } + f->close_section(); + } + static void generate_test_instances(std::list<pg_nls_response_template<T>*>& o) { + o.push_back(new pg_nls_response_template<T>); + o.push_back(new pg_nls_response_template<T>); + o.back()->handle = hobject_t(object_t("hi"), "key", 1, 2, -1, ""); + o.back()->entries.push_back(librados::ListObjectImpl("", "one", "")); + o.back()->entries.push_back(librados::ListObjectImpl("", "two", "twokey")); + o.back()->entries.push_back(librados::ListObjectImpl("", "three", "")); + o.push_back(new pg_nls_response_template<T>); + o.back()->handle = hobject_t(object_t("hi"), "key", 3, 4, -1, ""); + o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1one", "")); + o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1two", "n1twokey")); + o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1three", "")); + o.push_back(new pg_nls_response_template<T>); + o.back()->handle = hobject_t(object_t("hi"), "key", 5, 6, -1, ""); + o.back()->entries.push_back(librados::ListObjectImpl("", "one", "")); + o.back()->entries.push_back(librados::ListObjectImpl("", "two", "twokey")); + o.back()->entries.push_back(librados::ListObjectImpl("", "three", "")); + o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1one", "")); + o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1two", "n1twokey")); + o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1three", "")); + } +}; + +using pg_nls_response_t = pg_nls_response_template<librados::ListObjectImpl>; + +WRITE_CLASS_ENCODER(pg_nls_response_t) + +// For backwards compatibility with older OSD requests +struct pg_ls_response_t { + collection_list_handle_t handle; + std::list<std::pair<object_t, std::string> > entries; + + void encode(ceph::buffer::list& bl) const { + using ceph::encode; + __u8 v = 1; + encode(v, bl); + encode(handle, bl); + encode(entries, bl); + } + void decode(ceph::buffer::list::const_iterator& bl) { + using ceph::decode; + __u8 v; + decode(v, bl); + ceph_assert(v == 1); + decode(handle, bl); + decode(entries, bl); + } + void dump(ceph::Formatter *f) const { + f->dump_stream("handle") << handle; + f->open_array_section("entries"); + for (std::list<std::pair<object_t, std::string> >::const_iterator p = entries.begin(); p != entries.end(); ++p) { + f->open_object_section("object"); + f->dump_stream("object") << p->first; + f->dump_string("key", p->second); + f->close_section(); + } + f->close_section(); + } + static void generate_test_instances(std::list<pg_ls_response_t*>& o) { + o.push_back(new pg_ls_response_t); + o.push_back(new pg_ls_response_t); + o.back()->handle = hobject_t(object_t("hi"), "key", 1, 2, -1, ""); + o.back()->entries.push_back(std::make_pair(object_t("one"), std::string())); + o.back()->entries.push_back(std::make_pair(object_t("two"), std::string("twokey"))); + } +}; + +WRITE_CLASS_ENCODER(pg_ls_response_t) + +/** + * object_copy_cursor_t + */ +struct object_copy_cursor_t { + uint64_t data_offset; + std::string omap_offset; + bool attr_complete; + bool data_complete; + bool omap_complete; + + object_copy_cursor_t() + : data_offset(0), + attr_complete(false), + data_complete(false), + omap_complete(false) + {} + + bool is_initial() const { + return !attr_complete && data_offset == 0 && omap_offset.empty(); + } + bool is_complete() const { + return attr_complete && data_complete && omap_complete; + } + + static void generate_test_instances(std::list<object_copy_cursor_t*>& o); + void encode(ceph::buffer::list& bl) const; + void decode(ceph::buffer::list::const_iterator &bl); + void dump(ceph::Formatter *f) const; +}; +WRITE_CLASS_ENCODER(object_copy_cursor_t) + +/** + * object_copy_data_t + * + * Return data from a copy request. The semantics are a little strange + * as a result of the encoding's heritage. + * + * In particular, the sender unconditionally fills in the cursor (from what + * it receives and sends), the size, and the mtime, but is responsible for + * figuring out whether it should put any data in the attrs, data, or + * omap members (corresponding to xattrs, object data, and the omap entries) + * based on external data (the client includes a max amount to return with + * the copy request). The client then looks into the attrs, data, and/or omap + * based on the contents of the cursor. + */ +struct object_copy_data_t { + enum { + FLAG_DATA_DIGEST = 1<<0, + FLAG_OMAP_DIGEST = 1<<1, + }; + object_copy_cursor_t cursor; + uint64_t size; + utime_t mtime; + uint32_t data_digest, omap_digest; + uint32_t flags; + std::map<std::string, ceph::buffer::list> attrs; + ceph::buffer::list data; + ceph::buffer::list omap_header; + ceph::buffer::list omap_data; + + /// which snaps we are defined for (if a snap and not the head) + std::vector<snapid_t> snaps; + /// latest snap seq for the object (if head) + snapid_t snap_seq; + + /// recent reqids on this object + mempool::osd_pglog::vector<std::pair<osd_reqid_t, version_t> > reqids; + + /// map reqids by index to error return code (if any) + mempool::osd_pglog::map<uint32_t, int> reqid_return_codes; + + uint64_t truncate_seq; + uint64_t truncate_size; + +public: + object_copy_data_t() : + size((uint64_t)-1), data_digest(-1), + omap_digest(-1), flags(0), + truncate_seq(0), + truncate_size(0) {} + + static void generate_test_instances(std::list<object_copy_data_t*>& o); + void encode(ceph::buffer::list& bl, uint64_t features) const; + void decode(ceph::buffer::list::const_iterator& bl); + void dump(ceph::Formatter *f) const; +}; +WRITE_CLASS_ENCODER_FEATURES(object_copy_data_t) + +/** + * pg creation info + */ +struct pg_create_t { + epoch_t created; // epoch pg created + pg_t parent; // split from parent (if != pg_t()) + __s32 split_bits; + + pg_create_t() + : created(0), split_bits(0) {} + pg_create_t(unsigned c, pg_t p, int s) + : created(c), parent(p), split_bits(s) {} + + void encode(ceph::buffer::list &bl) const; + void decode(ceph::buffer::list::const_iterator &bl); + void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list<pg_create_t*>& o); +}; +WRITE_CLASS_ENCODER(pg_create_t) + +// ----------------------------------------- + +class ObjectExtent { + /** + * ObjectExtents are used for specifying IO behavior against RADOS + * objects when one is using the ObjectCacher. + * + * To use this in a real system, *every member* must be filled + * out correctly. In particular, make sure to initialize the + * oloc correctly, as its default values are deliberate poison + * and will cause internal ObjectCacher asserts. + * + * Similarly, your buffer_extents vector *must* specify a total + * size equal to your length. If the buffer_extents inadvertently + * contain less space than the length member specifies, you + * will get unintelligible asserts deep in the ObjectCacher. + * + * If you are trying to do testing and don't care about actual + * RADOS function, the simplest thing to do is to initialize + * the ObjectExtent (truncate_size can be 0), create a single entry + * in buffer_extents matching the length, and set oloc.pool to 0. + */ + public: + object_t oid; // object id + uint64_t objectno; + uint64_t offset; // in object + uint64_t length; // in object + uint64_t truncate_size; // in object + + object_locator_t oloc; // object locator (pool etc) + + std::vector<std::pair<uint64_t,uint64_t> > buffer_extents; // off -> len. extents in buffer being mapped (may be fragmented bc of striping!) + + ObjectExtent() : objectno(0), offset(0), length(0), truncate_size(0) {} + ObjectExtent(object_t o, uint64_t ono, uint64_t off, uint64_t l, uint64_t ts) : + oid(o), objectno(ono), offset(off), length(l), truncate_size(ts) { } +}; + +inline std::ostream& operator<<(std::ostream& out, const ObjectExtent &ex) +{ + return out << "extent(" + << ex.oid << " (" << ex.objectno << ") in " << ex.oloc + << " " << ex.offset << "~" << ex.length + << " -> " << ex.buffer_extents + << ")"; +} + + +// --------------------------------------- + +class OSDSuperblock { +public: + uuid_d cluster_fsid, osd_fsid; + int32_t whoami = -1; // my role in this fs. + epoch_t current_epoch = 0; // most recent epoch + epoch_t oldest_map = 0, newest_map = 0; // oldest/newest maps we have. + double weight = 0.0; + + CompatSet compat_features; + + // last interval over which i mounted and was then active + epoch_t mounted = 0; // last epoch i mounted + epoch_t clean_thru = 0; // epoch i was active and clean thru + + epoch_t purged_snaps_last = 0; + utime_t last_purged_snaps_scrub; + + void encode(ceph::buffer::list &bl) const; + void decode(ceph::buffer::list::const_iterator &bl); + void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list<OSDSuperblock*>& o); +}; +WRITE_CLASS_ENCODER(OSDSuperblock) + +inline std::ostream& operator<<(std::ostream& out, const OSDSuperblock& sb) +{ + return out << "sb(" << sb.cluster_fsid + << " osd." << sb.whoami + << " " << sb.osd_fsid + << " e" << sb.current_epoch + << " [" << sb.oldest_map << "," << sb.newest_map << "]" + << " lci=[" << sb.mounted << "," << sb.clean_thru << "]" + << ")"; +} + + +// ------- + + + + + + +/* + * attached to object head. describes most recent snap context, and + * set of existing clones. + */ +struct SnapSet { + snapid_t seq; + // NOTE: this is for pre-octopus compatibility only! remove in Q release + std::vector<snapid_t> snaps; // descending + std::vector<snapid_t> clones; // ascending + std::map<snapid_t, interval_set<uint64_t> > clone_overlap; // overlap w/ next newest + std::map<snapid_t, uint64_t> clone_size; + std::map<snapid_t, std::vector<snapid_t>> clone_snaps; // descending + + SnapSet() : seq(0) {} + explicit SnapSet(ceph::buffer::list& bl) { + auto p = std::cbegin(bl); + decode(p); + } + + /// populate SnapSet from a librados::snap_set_t + void from_snap_set(const librados::snap_set_t& ss, bool legacy); + + /// get space accounted to clone + uint64_t get_clone_bytes(snapid_t clone) const; + + void encode(ceph::buffer::list& bl) const; + void decode(ceph::buffer::list::const_iterator& bl); + void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list<SnapSet*>& o); + + SnapContext get_ssc_as_of(snapid_t as_of) const { + SnapContext out; + out.seq = as_of; + for (auto p = clone_snaps.rbegin(); + p != clone_snaps.rend(); + ++p) { + for (auto snap : p->second) { + if (snap <= as_of) { + out.snaps.push_back(snap); + } + } + } + return out; + } + + + SnapSet get_filtered(const pg_pool_t &pinfo) const; + void filter(const pg_pool_t &pinfo); +}; +WRITE_CLASS_ENCODER(SnapSet) + +std::ostream& operator<<(std::ostream& out, const SnapSet& cs); + + + +#define OI_ATTR "_" +#define SS_ATTR "snapset" + +struct watch_info_t { + uint64_t cookie; + uint32_t timeout_seconds; + entity_addr_t addr; + + watch_info_t() : cookie(0), timeout_seconds(0) { } + watch_info_t(uint64_t c, uint32_t t, const entity_addr_t& a) : cookie(c), timeout_seconds(t), addr(a) {} + + void encode(ceph::buffer::list& bl, uint64_t features) const; + void decode(ceph::buffer::list::const_iterator& bl); + void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list<watch_info_t*>& o); +}; +WRITE_CLASS_ENCODER_FEATURES(watch_info_t) + +static inline bool operator==(const watch_info_t& l, const watch_info_t& r) { + return l.cookie == r.cookie && l.timeout_seconds == r.timeout_seconds + && l.addr == r.addr; +} + +static inline std::ostream& operator<<(std::ostream& out, const watch_info_t& w) { + return out << "watch(cookie " << w.cookie << " " << w.timeout_seconds << "s" + << " " << w.addr << ")"; +} + +struct notify_info_t { + uint64_t cookie; + uint64_t notify_id; + uint32_t timeout; + ceph::buffer::list bl; +}; + +static inline std::ostream& operator<<(std::ostream& out, const notify_info_t& n) { + return out << "notify(cookie " << n.cookie + << " notify" << n.notify_id + << " " << n.timeout << "s)"; +} + +class object_ref_delta_t { + std::map<hobject_t, int> ref_delta; + +public: + object_ref_delta_t() = default; + object_ref_delta_t(const object_ref_delta_t &) = default; + object_ref_delta_t(object_ref_delta_t &&) = default; + + object_ref_delta_t(decltype(ref_delta) &&ref_delta) + : ref_delta(std::move(ref_delta)) {} + object_ref_delta_t(const decltype(ref_delta) &ref_delta) + : ref_delta(ref_delta) {} + + object_ref_delta_t &operator=(const object_ref_delta_t &) = default; + object_ref_delta_t &operator=(object_ref_delta_t &&) = default; + + void dec_ref(const hobject_t &hoid, unsigned num=1) { + mut_ref(hoid, -num); + } + void inc_ref(const hobject_t &hoid, unsigned num=1) { + mut_ref(hoid, num); + } + void mut_ref(const hobject_t &hoid, int num) { + [[maybe_unused]] auto [iter, _] = ref_delta.try_emplace(hoid, 0); + iter->second += num; + if (iter->second == 0) + ref_delta.erase(iter); + } + + auto begin() const { return ref_delta.begin(); } + auto end() const { return ref_delta.end(); } + auto find(hobject_t &key) const { return ref_delta.find(key); } + + bool operator==(const object_ref_delta_t &rhs) const { + return ref_delta == rhs.ref_delta; + } + bool operator!=(const object_ref_delta_t &rhs) const { + return !(*this == rhs); + } + bool is_empty() { + return ref_delta.empty(); + } + uint64_t size() { + return ref_delta.size(); + } + friend std::ostream& operator<<(std::ostream& out, const object_ref_delta_t & ci); +}; + +struct chunk_info_t { + typedef enum { + FLAG_DIRTY = 1, + FLAG_MISSING = 2, + FLAG_HAS_REFERENCE = 4, + FLAG_HAS_FINGERPRINT = 8, + } cflag_t; + uint32_t offset; + uint32_t length; + hobject_t oid; + cflag_t flags; // FLAG_* + + chunk_info_t() : offset(0), length(0), flags((cflag_t)0) { } + chunk_info_t(uint32_t offset, uint32_t length, hobject_t oid) : + offset(offset), length(length), oid(oid), flags((cflag_t)0) { } + + static std::string get_flag_string(uint64_t flags) { + std::string r; + if (flags & FLAG_DIRTY) { + r += "|dirty"; + } + if (flags & FLAG_MISSING) { + r += "|missing"; + } + if (flags & FLAG_HAS_REFERENCE) { + r += "|has_reference"; + } + if (flags & FLAG_HAS_FINGERPRINT) { + r += "|has_fingerprint"; + } + if (r.length()) + return r.substr(1); + return r; + } + bool test_flag(cflag_t f) const { + return (flags & f) == f; + } + void set_flag(cflag_t f) { + flags = (cflag_t)(flags | f); + } + void set_flags(cflag_t f) { + flags = f; + } + void clear_flag(cflag_t f) { + flags = (cflag_t)(flags & ~f); + } + void clear_flags() { + flags = (cflag_t)0; + } + bool is_dirty() const { + return test_flag(FLAG_DIRTY); + } + bool is_missing() const { + return test_flag(FLAG_MISSING); + } + bool has_reference() const { + return test_flag(FLAG_HAS_REFERENCE); + } + bool has_fingerprint() const { + return test_flag(FLAG_HAS_FINGERPRINT); + } + void encode(ceph::buffer::list &bl) const; + void decode(ceph::buffer::list::const_iterator &bl); + void dump(ceph::Formatter *f) const; + friend std::ostream& operator<<(std::ostream& out, const chunk_info_t& ci); + bool operator==(const chunk_info_t& cit) const; + bool operator!=(const chunk_info_t& cit) const { + return !(cit == *this); + } +}; +WRITE_CLASS_ENCODER(chunk_info_t) +std::ostream& operator<<(std::ostream& out, const chunk_info_t& ci); + +struct object_info_t; +struct object_manifest_t { + enum { + TYPE_NONE = 0, + TYPE_REDIRECT = 1, + TYPE_CHUNKED = 2, + }; + uint8_t type; // redirect, chunked, ... + hobject_t redirect_target; + std::map<uint64_t, chunk_info_t> chunk_map; + + object_manifest_t() : type(0) { } + object_manifest_t(uint8_t type, const hobject_t& redirect_target) + : type(type), redirect_target(redirect_target) { } + + bool is_empty() const { + return type == TYPE_NONE; + } + bool is_redirect() const { + return type == TYPE_REDIRECT; + } + bool is_chunked() const { + return type == TYPE_CHUNKED; + } + static std::string_view get_type_name(uint8_t m) { + switch (m) { + case TYPE_NONE: return "none"; + case TYPE_REDIRECT: return "redirect"; + case TYPE_CHUNKED: return "chunked"; + default: return "unknown"; + } + } + std::string_view get_type_name() const { + return get_type_name(type); + } + void clear() { + type = 0; + redirect_target = hobject_t(); + chunk_map.clear(); + } + + /** + * calc_refs_to_inc_on_set + * + * Takes a manifest and returns the set of refs to + * increment upon set-chunk + * + * l should be nullptr if there are no clones, or + * l and g may each be null if the corresponding clone does not exist. + * *this contains the set of new references to set + * + */ + void calc_refs_to_inc_on_set( + const object_manifest_t* g, ///< [in] manifest for clone > *this + const object_manifest_t* l, ///< [in] manifest for clone < *this + object_ref_delta_t &delta ///< [out] set of refs to drop + ) const; + + /** + * calc_refs_to_drop_on_modify + * + * Takes a manifest and returns the set of refs to + * drop upon modification + * + * l should be nullptr if there are no clones, or + * l may be null if the corresponding clone does not exist. + * + */ + void calc_refs_to_drop_on_modify( + const object_manifest_t* l, ///< [in] manifest for previous clone + const ObjectCleanRegions& clean_regions, ///< [in] clean regions + object_ref_delta_t &delta ///< [out] set of refs to drop + ) const; + + /** + * calc_refs_to_drop_on_removal + * + * Takes the two adjacent manifests and returns the set of refs to + * drop upon removal of the clone containing *this. + * + * g should be nullptr if *this is on HEAD, l should be nullptr if + * *this is on the oldest clone (or head if there are no clones). + */ + void calc_refs_to_drop_on_removal( + const object_manifest_t* g, ///< [in] manifest for clone > *this + const object_manifest_t* l, ///< [in] manifest for clone < *this + object_ref_delta_t &delta ///< [out] set of refs to drop + ) const; + + static void generate_test_instances(std::list<object_manifest_t*>& o); + void encode(ceph::buffer::list &bl) const; + void decode(ceph::buffer::list::const_iterator &bl); + void dump(ceph::Formatter *f) const; + friend std::ostream& operator<<(std::ostream& out, const object_info_t& oi); +}; +WRITE_CLASS_ENCODER(object_manifest_t) +std::ostream& operator<<(std::ostream& out, const object_manifest_t& oi); + +struct object_info_t { + hobject_t soid; + eversion_t version, prior_version; + version_t user_version; + osd_reqid_t last_reqid; + + uint64_t size; + utime_t mtime; + utime_t local_mtime; // local mtime + + // note: these are currently encoded into a total 16 bits; see + // encode()/decode() for the weirdness. + typedef enum { + FLAG_LOST = 1<<0, + FLAG_WHITEOUT = 1<<1, // object logically does not exist + FLAG_DIRTY = 1<<2, // object has been modified since last flushed or undirtied + FLAG_OMAP = 1<<3, // has (or may have) some/any omap data + FLAG_DATA_DIGEST = 1<<4, // has data crc + FLAG_OMAP_DIGEST = 1<<5, // has omap crc + FLAG_CACHE_PIN = 1<<6, // pin the object in cache tier + FLAG_MANIFEST = 1<<7, // has manifest + FLAG_USES_TMAP = 1<<8, // deprecated; no longer used + FLAG_REDIRECT_HAS_REFERENCE = 1<<9, // has reference + } flag_t; + + flag_t flags; + + static std::string get_flag_string(flag_t flags) { + std::string s; + std::vector<std::string> sv = get_flag_vector(flags); + for (auto ss : sv) { + s += std::string("|") + ss; + } + if (s.length()) + return s.substr(1); + return s; + } + static std::vector<std::string> get_flag_vector(flag_t flags) { + std::vector<std::string> sv; + if (flags & FLAG_LOST) + sv.insert(sv.end(), "lost"); + if (flags & FLAG_WHITEOUT) + sv.insert(sv.end(), "whiteout"); + if (flags & FLAG_DIRTY) + sv.insert(sv.end(), "dirty"); + if (flags & FLAG_USES_TMAP) + sv.insert(sv.end(), "uses_tmap"); + if (flags & FLAG_OMAP) + sv.insert(sv.end(), "omap"); + if (flags & FLAG_DATA_DIGEST) + sv.insert(sv.end(), "data_digest"); + if (flags & FLAG_OMAP_DIGEST) + sv.insert(sv.end(), "omap_digest"); + if (flags & FLAG_CACHE_PIN) + sv.insert(sv.end(), "cache_pin"); + if (flags & FLAG_MANIFEST) + sv.insert(sv.end(), "manifest"); + if (flags & FLAG_REDIRECT_HAS_REFERENCE) + sv.insert(sv.end(), "redirect_has_reference"); + return sv; + } + std::string get_flag_string() const { + return get_flag_string(flags); + } + + uint64_t truncate_seq, truncate_size; + + std::map<std::pair<uint64_t, entity_name_t>, watch_info_t> watchers; + + // opportunistic checksums; may or may not be present + __u32 data_digest; ///< data crc32c + __u32 omap_digest; ///< omap crc32c + + // alloc hint attribute + uint64_t expected_object_size, expected_write_size; + uint32_t alloc_hint_flags; + + struct object_manifest_t manifest; + + void copy_user_bits(const object_info_t& other); + + bool test_flag(flag_t f) const { + return (flags & f) == f; + } + void set_flag(flag_t f) { + flags = (flag_t)(flags | f); + } + void clear_flag(flag_t f) { + flags = (flag_t)(flags & ~f); + } + bool is_lost() const { + return test_flag(FLAG_LOST); + } + bool is_whiteout() const { + return test_flag(FLAG_WHITEOUT); + } + bool is_dirty() const { + return test_flag(FLAG_DIRTY); + } + bool is_omap() const { + return test_flag(FLAG_OMAP); + } + bool is_data_digest() const { + return test_flag(FLAG_DATA_DIGEST); + } + bool is_omap_digest() const { + return test_flag(FLAG_OMAP_DIGEST); + } + bool is_cache_pinned() const { + return test_flag(FLAG_CACHE_PIN); + } + bool has_manifest() const { + return test_flag(FLAG_MANIFEST); + } + void set_data_digest(__u32 d) { + set_flag(FLAG_DATA_DIGEST); + data_digest = d; + } + void set_omap_digest(__u32 d) { + set_flag(FLAG_OMAP_DIGEST); + omap_digest = d; + } + void clear_data_digest() { + clear_flag(FLAG_DATA_DIGEST); + data_digest = -1; + } + void clear_omap_digest() { + clear_flag(FLAG_OMAP_DIGEST); + omap_digest = -1; + } + void new_object() { + clear_data_digest(); + clear_omap_digest(); + } + + void encode(ceph::buffer::list& bl, uint64_t features) const; + void decode(ceph::buffer::list::const_iterator& bl); + void decode(const ceph::buffer::list& bl) { + auto p = std::cbegin(bl); + decode(p); + } + void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list<object_info_t*>& o); + + explicit object_info_t() + : user_version(0), size(0), flags((flag_t)0), + truncate_seq(0), truncate_size(0), + data_digest(-1), omap_digest(-1), + expected_object_size(0), expected_write_size(0), + alloc_hint_flags(0) + {} + + explicit object_info_t(const hobject_t& s) + : soid(s), + user_version(0), size(0), flags((flag_t)0), + truncate_seq(0), truncate_size(0), + data_digest(-1), omap_digest(-1), + expected_object_size(0), expected_write_size(0), + alloc_hint_flags(0) + {} + + explicit object_info_t(ceph::buffer::list& bl) { + decode(bl); + } +}; +WRITE_CLASS_ENCODER_FEATURES(object_info_t) + +std::ostream& operator<<(std::ostream& out, const object_info_t& oi); + + + +// Object recovery +struct ObjectRecoveryInfo { + hobject_t soid; + eversion_t version; + uint64_t size; + object_info_t oi; + SnapSet ss; // only populated if soid is_snap() + interval_set<uint64_t> copy_subset; + std::map<hobject_t, interval_set<uint64_t>> clone_subset; + bool object_exist; + + ObjectRecoveryInfo() : size(0), object_exist(true) { } + + static void generate_test_instances(std::list<ObjectRecoveryInfo*>& o); + void encode(ceph::buffer::list &bl, uint64_t features) const; + void decode(ceph::buffer::list::const_iterator &bl, int64_t pool = -1); + std::ostream &print(std::ostream &out) const; + void dump(ceph::Formatter *f) const; +}; +WRITE_CLASS_ENCODER_FEATURES(ObjectRecoveryInfo) +std::ostream& operator<<(std::ostream& out, const ObjectRecoveryInfo &inf); + +struct ObjectRecoveryProgress { + uint64_t data_recovered_to; + std::string omap_recovered_to; + bool first; + bool data_complete; + bool omap_complete; + bool error = false; + + ObjectRecoveryProgress() + : data_recovered_to(0), + first(true), + data_complete(false), omap_complete(false) { } + + bool is_complete(const ObjectRecoveryInfo& info) const { + return (data_recovered_to >= ( + info.copy_subset.empty() ? + 0 : info.copy_subset.range_end())) && + omap_complete; + } + + static void generate_test_instances(std::list<ObjectRecoveryProgress*>& o); + void encode(ceph::buffer::list &bl) const; + void decode(ceph::buffer::list::const_iterator &bl); + std::ostream &print(std::ostream &out) const; + void dump(ceph::Formatter *f) const; +}; +WRITE_CLASS_ENCODER(ObjectRecoveryProgress) +std::ostream& operator<<(std::ostream& out, const ObjectRecoveryProgress &prog); + +struct PushReplyOp { + hobject_t soid; + + static void generate_test_instances(std::list<PushReplyOp*>& o); + void encode(ceph::buffer::list &bl) const; + void decode(ceph::buffer::list::const_iterator &bl); + std::ostream &print(std::ostream &out) const; + void dump(ceph::Formatter *f) const; + + uint64_t cost(CephContext *cct) const; +}; +WRITE_CLASS_ENCODER(PushReplyOp) +std::ostream& operator<<(std::ostream& out, const PushReplyOp &op); + +struct PullOp { + hobject_t soid; + + ObjectRecoveryInfo recovery_info; + ObjectRecoveryProgress recovery_progress; + + static void generate_test_instances(std::list<PullOp*>& o); + void encode(ceph::buffer::list &bl, uint64_t features) const; + void decode(ceph::buffer::list::const_iterator &bl); + std::ostream &print(std::ostream &out) const; + void dump(ceph::Formatter *f) const; + + uint64_t cost(CephContext *cct) const; +}; +WRITE_CLASS_ENCODER_FEATURES(PullOp) +std::ostream& operator<<(std::ostream& out, const PullOp &op); + +struct PushOp { + hobject_t soid; + eversion_t version; + ceph::buffer::list data; + interval_set<uint64_t> data_included; + ceph::buffer::list omap_header; + std::map<std::string, ceph::buffer::list> omap_entries; + std::map<std::string, ceph::buffer::list> attrset; + + ObjectRecoveryInfo recovery_info; + ObjectRecoveryProgress before_progress; + ObjectRecoveryProgress after_progress; + + static void generate_test_instances(std::list<PushOp*>& o); + void encode(ceph::buffer::list &bl, uint64_t features) const; + void decode(ceph::buffer::list::const_iterator &bl); + std::ostream &print(std::ostream &out) const; + void dump(ceph::Formatter *f) const; + + uint64_t cost(CephContext *cct) const; +}; +WRITE_CLASS_ENCODER_FEATURES(PushOp) +std::ostream& operator<<(std::ostream& out, const PushOp &op); + +enum class scrub_level_t : bool { shallow = false, deep = true }; +enum class scrub_type_t : bool { not_repair = false, do_repair = true }; + +/* + * summarize pg contents for purposes of a scrub + */ +struct ScrubMap { + struct object { + std::map<std::string, ceph::buffer::ptr> attrs; + uint64_t size; + __u32 omap_digest; ///< omap crc32c + __u32 digest; ///< data crc32c + bool negative:1; + bool digest_present:1; + bool omap_digest_present:1; + bool read_error:1; + bool stat_error:1; + bool ec_hash_mismatch:1; + bool ec_size_mismatch:1; + bool large_omap_object_found:1; + uint64_t large_omap_object_key_count = 0; + uint64_t large_omap_object_value_size = 0; + uint64_t object_omap_bytes = 0; + uint64_t object_omap_keys = 0; + + object() : + // Init invalid size so it won't match if we get a stat EIO error + size(-1), omap_digest(0), digest(0), + negative(false), digest_present(false), omap_digest_present(false), + read_error(false), stat_error(false), ec_hash_mismatch(false), + ec_size_mismatch(false), large_omap_object_found(false) {} + + void encode(ceph::buffer::list& bl) const; + void decode(ceph::buffer::list::const_iterator& bl); + void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list<object*>& o); + }; + WRITE_CLASS_ENCODER(object) + + std::map<hobject_t,object> objects; + eversion_t valid_through; + eversion_t incr_since; + bool has_large_omap_object_errors:1; + bool has_omap_keys:1; + + void merge_incr(const ScrubMap &l); + void clear_from(const hobject_t& start) { + objects.erase(objects.lower_bound(start), objects.end()); + } + void insert(const ScrubMap &r) { + objects.insert(r.objects.begin(), r.objects.end()); + } + void swap(ScrubMap &r) { + using std::swap; + swap(objects, r.objects); + swap(valid_through, r.valid_through); + swap(incr_since, r.incr_since); + } + + void encode(ceph::buffer::list& bl) const; + void decode(ceph::buffer::list::const_iterator& bl, int64_t pool=-1); + void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list<ScrubMap*>& o); +}; +WRITE_CLASS_ENCODER(ScrubMap::object) +WRITE_CLASS_ENCODER(ScrubMap) + +struct ScrubMapBuilder { + bool deep = false; + std::vector<hobject_t> ls; + size_t pos = 0; + int64_t data_pos = 0; + std::string omap_pos; + int ret = 0; + ceph::buffer::hash data_hash, omap_hash; ///< accumulatinng hash value + uint64_t omap_keys = 0; + uint64_t omap_bytes = 0; + + bool empty() { + return ls.empty(); + } + bool done() { + return pos >= ls.size(); + } + void reset() { + *this = ScrubMapBuilder(); + } + + bool data_done() { + return data_pos < 0; + } + + void next_object() { + ++pos; + data_pos = 0; + omap_pos.clear(); + omap_keys = 0; + omap_bytes = 0; + } + + friend std::ostream& operator<<(std::ostream& out, const ScrubMapBuilder& pos) { + out << "(" << pos.pos << "/" << pos.ls.size(); + if (pos.pos < pos.ls.size()) { + out << " " << pos.ls[pos.pos]; + } + if (pos.data_pos < 0) { + out << " byte " << pos.data_pos; + } + if (!pos.omap_pos.empty()) { + out << " key " << pos.omap_pos; + } + if (pos.deep) { + out << " deep"; + } + if (pos.ret) { + out << " ret " << pos.ret; + } + return out << ")"; + } +}; + +struct watch_item_t { + entity_name_t name; + uint64_t cookie; + uint32_t timeout_seconds; + entity_addr_t addr; + + watch_item_t() : cookie(0), timeout_seconds(0) { } + watch_item_t(entity_name_t name, uint64_t cookie, uint32_t timeout, + const entity_addr_t& addr) + : name(name), cookie(cookie), timeout_seconds(timeout), + addr(addr) { } + + void encode(ceph::buffer::list &bl, uint64_t features) const { + ENCODE_START(2, 1, bl); + encode(name, bl); + encode(cookie, bl); + encode(timeout_seconds, bl); + encode(addr, bl, features); + ENCODE_FINISH(bl); + } + void decode(ceph::buffer::list::const_iterator &bl) { + DECODE_START(2, bl); + decode(name, bl); + decode(cookie, bl); + decode(timeout_seconds, bl); + if (struct_v >= 2) { + decode(addr, bl); + } + DECODE_FINISH(bl); + } + void dump(ceph::Formatter *f) const { + f->dump_stream("watcher") << name; + f->dump_int("cookie", cookie); + f->dump_int("timeout", timeout_seconds); + f->open_object_section("addr"); + addr.dump(f); + f->close_section(); + } + static void generate_test_instances(std::list<watch_item_t*>& o) { + entity_addr_t ea; + ea.set_type(entity_addr_t::TYPE_LEGACY); + ea.set_nonce(1000); + ea.set_family(AF_INET); + ea.set_in4_quad(0, 127); + ea.set_in4_quad(1, 0); + ea.set_in4_quad(2, 0); + ea.set_in4_quad(3, 1); + ea.set_port(1024); + o.push_back(new watch_item_t(entity_name_t(entity_name_t::TYPE_CLIENT, 1), 10, 30, ea)); + ea.set_nonce(1001); + ea.set_in4_quad(3, 2); + ea.set_port(1025); + o.push_back(new watch_item_t(entity_name_t(entity_name_t::TYPE_CLIENT, 2), 20, 60, ea)); + } +}; +WRITE_CLASS_ENCODER_FEATURES(watch_item_t) + +struct obj_watch_item_t { + hobject_t obj; + watch_item_t wi; +}; + +/** + * obj list watch response format + * + */ +struct obj_list_watch_response_t { + std::list<watch_item_t> entries; + + void encode(ceph::buffer::list& bl, uint64_t features) const { + ENCODE_START(1, 1, bl); + encode(entries, bl, features); + ENCODE_FINISH(bl); + } + void decode(ceph::buffer::list::const_iterator& bl) { + DECODE_START(1, bl); + decode(entries, bl); + DECODE_FINISH(bl); + } + void dump(ceph::Formatter *f) const { + f->open_array_section("entries"); + for (std::list<watch_item_t>::const_iterator p = entries.begin(); p != entries.end(); ++p) { + f->open_object_section("watch"); + p->dump(f); + f->close_section(); + } + f->close_section(); + } + static void generate_test_instances(std::list<obj_list_watch_response_t*>& o) { + entity_addr_t ea; + o.push_back(new obj_list_watch_response_t); + o.push_back(new obj_list_watch_response_t); + std::list<watch_item_t*> test_watchers; + watch_item_t::generate_test_instances(test_watchers); + for (auto &e : test_watchers) { + o.back()->entries.push_back(*e); + delete e; + } + } +}; +WRITE_CLASS_ENCODER_FEATURES(obj_list_watch_response_t) + +struct clone_info { + snapid_t cloneid; + std::vector<snapid_t> snaps; // ascending + std::vector< std::pair<uint64_t,uint64_t> > overlap; + uint64_t size; + + clone_info() : cloneid(CEPH_NOSNAP), size(0) {} + + void encode(ceph::buffer::list& bl) const { + ENCODE_START(1, 1, bl); + encode(cloneid, bl); + encode(snaps, bl); + encode(overlap, bl); + encode(size, bl); + ENCODE_FINISH(bl); + } + void decode(ceph::buffer::list::const_iterator& bl) { + DECODE_START(1, bl); + decode(cloneid, bl); + decode(snaps, bl); + decode(overlap, bl); + decode(size, bl); + DECODE_FINISH(bl); + } + void dump(ceph::Formatter *f) const { + if (cloneid == CEPH_NOSNAP) + f->dump_string("cloneid", "HEAD"); + else + f->dump_unsigned("cloneid", cloneid.val); + f->open_array_section("snapshots"); + for (std::vector<snapid_t>::const_iterator p = snaps.begin(); p != snaps.end(); ++p) { + f->open_object_section("snap"); + f->dump_unsigned("id", p->val); + f->close_section(); + } + f->close_section(); + f->open_array_section("overlaps"); + for (std::vector< std::pair<uint64_t,uint64_t> >::const_iterator q = overlap.begin(); + q != overlap.end(); ++q) { + f->open_object_section("overlap"); + f->dump_unsigned("offset", q->first); + f->dump_unsigned("length", q->second); + f->close_section(); + } + f->close_section(); + f->dump_unsigned("size", size); + } + static void generate_test_instances(std::list<clone_info*>& o) { + o.push_back(new clone_info); + o.push_back(new clone_info); + o.back()->cloneid = 1; + o.back()->snaps.push_back(1); + o.back()->overlap.push_back(std::pair<uint64_t,uint64_t>(0,4096)); + o.back()->overlap.push_back(std::pair<uint64_t,uint64_t>(8192,4096)); + o.back()->size = 16384; + o.push_back(new clone_info); + o.back()->cloneid = CEPH_NOSNAP; + o.back()->size = 32768; + } +}; +WRITE_CLASS_ENCODER(clone_info) + +/** + * obj list snaps response format + * + */ +struct obj_list_snap_response_t { + std::vector<clone_info> clones; // ascending + snapid_t seq; + + void encode(ceph::buffer::list& bl) const { + ENCODE_START(2, 1, bl); + encode(clones, bl); + encode(seq, bl); + ENCODE_FINISH(bl); + } + void decode(ceph::buffer::list::const_iterator& bl) { + DECODE_START(2, bl); + decode(clones, bl); + if (struct_v >= 2) + decode(seq, bl); + else + seq = CEPH_NOSNAP; + DECODE_FINISH(bl); + } + void dump(ceph::Formatter *f) const { + f->open_array_section("clones"); + for (std::vector<clone_info>::const_iterator p = clones.begin(); p != clones.end(); ++p) { + f->open_object_section("clone"); + p->dump(f); + f->close_section(); + } + f->dump_unsigned("seq", seq); + f->close_section(); + } + static void generate_test_instances(std::list<obj_list_snap_response_t*>& o) { + o.push_back(new obj_list_snap_response_t); + o.push_back(new obj_list_snap_response_t); + clone_info cl; + cl.cloneid = 1; + cl.snaps.push_back(1); + cl.overlap.push_back(std::pair<uint64_t,uint64_t>(0,4096)); + cl.overlap.push_back(std::pair<uint64_t,uint64_t>(8192,4096)); + cl.size = 16384; + o.back()->clones.push_back(cl); + cl.cloneid = CEPH_NOSNAP; + cl.snaps.clear(); + cl.overlap.clear(); + cl.size = 32768; + o.back()->clones.push_back(cl); + o.back()->seq = 123; + } +}; + +WRITE_CLASS_ENCODER(obj_list_snap_response_t) + +// PromoteCounter + +struct PromoteCounter { + std::atomic<unsigned long long> attempts{0}; + std::atomic<unsigned long long> objects{0}; + std::atomic<unsigned long long> bytes{0}; + + void attempt() { + attempts++; + } + + void finish(uint64_t size) { + objects++; + bytes += size; + } + + void sample_and_attenuate(uint64_t *a, uint64_t *o, uint64_t *b) { + *a = attempts; + *o = objects; + *b = bytes; + attempts = *a / 2; + objects = *o / 2; + bytes = *b / 2; + } +}; + +struct pool_pg_num_history_t { + /// last epoch updated + epoch_t epoch = 0; + /// poolid -> epoch -> pg_num + std::map<int64_t, std::map<epoch_t,uint32_t>> pg_nums; + /// pair(epoch, poolid) + std::set<std::pair<epoch_t,int64_t>> deleted_pools; + + void log_pg_num_change(epoch_t epoch, int64_t pool, uint32_t pg_num) { + pg_nums[pool][epoch] = pg_num; + } + void log_pool_delete(epoch_t epoch, int64_t pool) { + deleted_pools.insert(std::make_pair(epoch, pool)); + } + + /// prune history based on oldest osdmap epoch in the cluster + void prune(epoch_t oldest_epoch) { + auto i = deleted_pools.begin(); + while (i != deleted_pools.end()) { + if (i->first >= oldest_epoch) { + break; + } + pg_nums.erase(i->second); + i = deleted_pools.erase(i); + } + for (auto& j : pg_nums) { + auto k = j.second.lower_bound(oldest_epoch); + // keep this and the entry before it (just to be paranoid) + if (k != j.second.begin()) { + --k; + j.second.erase(j.second.begin(), k); + } + } + } + + void encode(ceph::buffer::list& bl) const { + ENCODE_START(1, 1, bl); + encode(epoch, bl); + encode(pg_nums, bl); + encode(deleted_pools, bl); + ENCODE_FINISH(bl); + } + void decode(ceph::buffer::list::const_iterator& p) { + DECODE_START(1, p); + decode(epoch, p); + decode(pg_nums, p); + decode(deleted_pools, p); + DECODE_FINISH(p); + } + void dump(ceph::Formatter *f) const { + f->dump_unsigned("epoch", epoch); + f->open_object_section("pools"); + for (auto& i : pg_nums) { + f->open_object_section("pool"); + f->dump_unsigned("pool_id", i.first); + f->open_array_section("changes"); + for (auto& j : i.second) { + f->open_object_section("change"); + f->dump_unsigned("epoch", j.first); + f->dump_unsigned("pg_num", j.second); + f->close_section(); + } + f->close_section(); + f->close_section(); + } + f->close_section(); + f->open_array_section("deleted_pools"); + for (auto& i : deleted_pools) { + f->open_object_section("deletion"); + f->dump_unsigned("pool_id", i.second); + f->dump_unsigned("epoch", i.first); + f->close_section(); + } + f->close_section(); + } + static void generate_test_instances(std::list<pool_pg_num_history_t*>& ls) { + ls.push_back(new pool_pg_num_history_t); + } + friend std::ostream& operator<<(std::ostream& out, const pool_pg_num_history_t& h) { + return out << "pg_num_history(e" << h.epoch + << " pg_nums " << h.pg_nums + << " deleted_pools " << h.deleted_pools + << ")"; + } +}; +WRITE_CLASS_ENCODER(pool_pg_num_history_t) + +// prefix pgmeta_oid keys with _ so that PGLog::read_log_and_missing() can +// easily skip them +static const std::string_view infover_key = "_infover"; +static const std::string_view info_key = "_info"; +static const std::string_view biginfo_key = "_biginfo"; +static const std::string_view epoch_key = "_epoch"; +static const std::string_view fastinfo_key = "_fastinfo"; + +static const __u8 pg_latest_struct_v = 10; +// v10 is the new past_intervals encoding +// v9 was fastinfo_key addition +// v8 was the move to a per-pg pgmeta object +// v7 was SnapMapper addition in 86658392516d5175b2756659ef7ffaaf95b0f8ad +// (first appeared in cuttlefish). +static const __u8 pg_compat_struct_v = 10; + +int prepare_info_keymap( + CephContext* cct, + std::map<std::string,ceph::buffer::list> *km, + std::string *key_to_remove, + epoch_t epoch, + pg_info_t &info, + pg_info_t &last_written_info, + PastIntervals &past_intervals, + bool dirty_big_info, + bool dirty_epoch, + bool try_fast_info, + PerfCounters *logger = nullptr, + DoutPrefixProvider *dpp = nullptr); + +namespace ceph::os { + class Transaction; +}; + +void create_pg_collection( + ceph::os::Transaction& t, spg_t pgid, int bits); + +void init_pg_ondisk( + ceph::os::Transaction& t, spg_t pgid, const pg_pool_t *pool); + +// omap specific stats +struct omap_stat_t { + int large_omap_objects; + int64_t omap_bytes; + int64_t omap_keys; +}; + +// filter for pg listings +class PGLSFilter { + CephContext* cct; +protected: + std::string xattr; +public: + PGLSFilter(); + virtual ~PGLSFilter(); + virtual bool filter(const hobject_t &obj, + const ceph::buffer::list& xattr_data) const = 0; + + /** + * Arguments passed from the RADOS client. Implementations must + * handle any encoding errors, and return an appropriate error code, + * or 0 on valid input. + */ + virtual int init(ceph::buffer::list::const_iterator ¶ms) = 0; + + /** + * xattr key, or empty string. If non-empty, this xattr will be fetched + * and the value passed into ::filter + */ + virtual const std::string& get_xattr() const { return xattr; } + + /** + * If true, objects without the named xattr (if xattr name is not empty) + * will be rejected without calling ::filter + */ + virtual bool reject_empty_xattr() const { return true; } +}; + +class PGLSPlainFilter : public PGLSFilter { + std::string val; +public: + int init(ceph::buffer::list::const_iterator ¶ms) override; + ~PGLSPlainFilter() override {} + bool filter(const hobject_t& obj, + const ceph::buffer::list& xattr_data) const override; +}; + +// alias name for this structure: +using missing_map_t = std::map<hobject_t, + std::pair<std::optional<uint32_t>, + std::optional<uint32_t>>>; + +#endif |