diff options
Diffstat (limited to '')
-rw-r--r-- | src/mds/MDSCacheObject.h | 415 |
1 files changed, 415 insertions, 0 deletions
diff --git a/src/mds/MDSCacheObject.h b/src/mds/MDSCacheObject.h new file mode 100644 index 00000000..e17089bb --- /dev/null +++ b/src/mds/MDSCacheObject.h @@ -0,0 +1,415 @@ +#ifndef CEPH_MDSCACHEOBJECT_H +#define CEPH_MDSCACHEOBJECT_H + +#include <ostream> +#include <string_view> + +#include "common/config.h" + +#include "include/Context.h" +#include "include/ceph_assert.h" +#include "include/mempool.h" +#include "include/types.h" +#include "include/xlist.h" + +#include "mdstypes.h" +#include "MDSContext.h" + +#define MDS_REF_SET // define me for improved debug output, sanity checking +//#define MDS_AUTHPIN_SET // define me for debugging auth pin leaks +//#define MDS_VERIFY_FRAGSTAT // do (slow) sanity checking on frags + + +class MLock; +class SimpleLock; +class MDSCacheObject; +class MDSContext; + +/* + * for metadata leases to clients + */ +struct ClientLease { + client_t client; + MDSCacheObject *parent; + + ceph_seq_t seq = 0; + utime_t ttl; + xlist<ClientLease*>::item item_session_lease; // per-session list + xlist<ClientLease*>::item item_lease; // global list + + ClientLease(client_t c, MDSCacheObject *p) : + client(c), parent(p), + item_session_lease(this), + item_lease(this) { } + ClientLease() = delete; +}; + + +// print hack +struct mdsco_db_line_prefix { + MDSCacheObject *object; + explicit mdsco_db_line_prefix(MDSCacheObject *o) : object(o) {} +}; +std::ostream& operator<<(std::ostream& out, const mdsco_db_line_prefix& o); + +// printer +std::ostream& operator<<(std::ostream& out, const MDSCacheObject &o); + +class MDSCacheObject { + public: + // -- pins -- + const static int PIN_REPLICATED = 1000; + const static int PIN_DIRTY = 1001; + const static int PIN_LOCK = -1002; + const static int PIN_REQUEST = -1003; + const static int PIN_WAITER = 1004; + const static int PIN_DIRTYSCATTERED = -1005; + static const int PIN_AUTHPIN = 1006; + static const int PIN_PTRWAITER = -1007; + const static int PIN_TEMPEXPORTING = 1008; // temp pin between encode_ and finish_export + static const int PIN_CLIENTLEASE = 1009; + static const int PIN_DISCOVERBASE = 1010; + + std::string_view generic_pin_name(int p) const { + switch (p) { + case PIN_REPLICATED: return "replicated"; + case PIN_DIRTY: return "dirty"; + case PIN_LOCK: return "lock"; + case PIN_REQUEST: return "request"; + case PIN_WAITER: return "waiter"; + case PIN_DIRTYSCATTERED: return "dirtyscattered"; + case PIN_AUTHPIN: return "authpin"; + case PIN_PTRWAITER: return "ptrwaiter"; + case PIN_TEMPEXPORTING: return "tempexporting"; + case PIN_CLIENTLEASE: return "clientlease"; + case PIN_DISCOVERBASE: return "discoverbase"; + default: ceph_abort(); return std::string_view(); + } + } + + // -- state -- + const static int STATE_AUTH = (1<<30); + const static int STATE_DIRTY = (1<<29); + const static int STATE_NOTIFYREF = (1<<28); // notify dropping ref drop through _put() + const static int STATE_REJOINING = (1<<27); // replica has not joined w/ primary copy + const static int STATE_REJOINUNDEF = (1<<26); // contents undefined. + + + // -- wait -- + const static uint64_t WAIT_ORDERED = (1ull<<61); + const static uint64_t WAIT_SINGLEAUTH = (1ull<<60); + const static uint64_t WAIT_UNFREEZE = (1ull<<59); // pka AUTHPINNABLE + + + // ============================================ + // cons + public: + MDSCacheObject() {} + virtual ~MDSCacheObject() {} + + // printing + virtual void print(std::ostream& out) = 0; + virtual std::ostream& print_db_line_prefix(std::ostream& out) { + return out << "mdscacheobject(" << this << ") "; + } + + // -------------------------------------------- + // state + protected: + __u32 state = 0; // state bits + + public: + unsigned get_state() const { return state; } + unsigned state_test(unsigned mask) const { return (state & mask); } + void state_clear(unsigned mask) { state &= ~mask; } + void state_set(unsigned mask) { state |= mask; } + void state_reset(unsigned s) { state = s; } + + bool is_auth() const { return state_test(STATE_AUTH); } + bool is_dirty() const { return state_test(STATE_DIRTY); } + bool is_clean() const { return !is_dirty(); } + bool is_rejoining() const { return state_test(STATE_REJOINING); } + + // -------------------------------------------- + // authority + virtual mds_authority_t authority() const = 0; + bool is_ambiguous_auth() const { + return authority().second != CDIR_AUTH_UNKNOWN; + } + + // -------------------------------------------- + // pins +protected: + __s32 ref = 0; // reference count +#ifdef MDS_REF_SET + mempool::mds_co::flat_map<int,int> ref_map; +#endif + + public: + int get_num_ref(int by = -1) const { +#ifdef MDS_REF_SET + if (by >= 0) { + if (ref_map.find(by) == ref_map.end()) { + return 0; + } else { + return ref_map.find(by)->second; + } + } +#endif + return ref; + } + virtual std::string_view pin_name(int by) const = 0; + //bool is_pinned_by(int by) { return ref_set.count(by); } + //multiset<int>& get_ref_set() { return ref_set; } + + virtual void last_put() {} + virtual void bad_put(int by) { +#ifdef MDS_REF_SET + ceph_assert(ref_map[by] > 0); +#endif + ceph_assert(ref > 0); + } + virtual void _put() {} + void put(int by) { +#ifdef MDS_REF_SET + if (ref == 0 || ref_map[by] == 0) { +#else + if (ref == 0) { +#endif + bad_put(by); + } else { + ref--; +#ifdef MDS_REF_SET + ref_map[by]--; +#endif + if (ref == 0) + last_put(); + if (state_test(STATE_NOTIFYREF)) + _put(); + } + } + + virtual void first_get() {} + virtual void bad_get(int by) { +#ifdef MDS_REF_SET + ceph_assert(by < 0 || ref_map[by] == 0); +#endif + ceph_abort(); + } + void get(int by) { + if (ref == 0) + first_get(); + ref++; +#ifdef MDS_REF_SET + if (ref_map.find(by) == ref_map.end()) + ref_map[by] = 0; + ref_map[by]++; +#endif + } + + void print_pin_set(std::ostream& out) const { +#ifdef MDS_REF_SET + for(auto const &p : ref_map) { + out << " " << pin_name(p.first) << "=" << p.second; + } +#else + out << " nref=" << ref; +#endif + } + +protected: + int auth_pins = 0; +#ifdef MDS_AUTHPIN_SET + mempool::mds_co::multiset<void*> auth_pin_set; +#endif + +public: + int get_num_auth_pins() const { return auth_pins; } +#ifdef MDS_AUTHPIN_SET + void print_authpin_set(std::ostream& out) const { + out << " (" << auth_pin_set << ")"; + } +#endif + + void dump_states(Formatter *f) const; + void dump(Formatter *f) const; + + // -------------------------------------------- + // auth pins + enum { + // can_auth_pin() error codes + ERR_NOT_AUTH = 1, + ERR_EXPORTING_TREE, + ERR_FRAGMENTING_DIR, + ERR_EXPORTING_INODE, + }; + virtual bool can_auth_pin(int *err_code=nullptr) const = 0; + virtual void auth_pin(void *who) = 0; + virtual void auth_unpin(void *who) = 0; + virtual bool is_frozen() const = 0; + virtual bool is_freezing() const = 0; + virtual bool is_freezing_or_frozen() const { + return is_frozen() || is_freezing(); + } + + + // -------------------------------------------- + // replication (across mds cluster) + protected: + unsigned replica_nonce = 0; // [replica] defined on replica + typedef mempool::mds_co::compact_map<mds_rank_t,unsigned> replica_map_type; + replica_map_type replica_map; // [auth] mds -> nonce + + public: + bool is_replicated() const { return !get_replicas().empty(); } + bool is_replica(mds_rank_t mds) const { return get_replicas().count(mds); } + int num_replicas() const { return get_replicas().size(); } + unsigned add_replica(mds_rank_t mds) { + if (get_replicas().count(mds)) + return ++get_replicas()[mds]; // inc nonce + if (get_replicas().empty()) + get(PIN_REPLICATED); + return get_replicas()[mds] = 1; + } + void add_replica(mds_rank_t mds, unsigned nonce) { + if (get_replicas().empty()) + get(PIN_REPLICATED); + get_replicas()[mds] = nonce; + } + unsigned get_replica_nonce(mds_rank_t mds) { + ceph_assert(get_replicas().count(mds)); + return get_replicas()[mds]; + } + void remove_replica(mds_rank_t mds) { + ceph_assert(get_replicas().count(mds)); + get_replicas().erase(mds); + if (get_replicas().empty()) { + put(PIN_REPLICATED); + } + } + void clear_replica_map() { + if (!get_replicas().empty()) + put(PIN_REPLICATED); + replica_map.clear(); + } + replica_map_type& get_replicas() { return replica_map; } + const replica_map_type& get_replicas() const { return replica_map; } + void list_replicas(std::set<mds_rank_t>& ls) const { + for (const auto &p : get_replicas()) { + ls.insert(p.first); + } + } + + unsigned get_replica_nonce() const { return replica_nonce; } + void set_replica_nonce(unsigned n) { replica_nonce = n; } + + + // --------------------------------------------- + // waiting + private: + mempool::mds_co::compact_multimap<uint64_t, std::pair<uint64_t, MDSContext*>> waiting; + static uint64_t last_wait_seq; + + public: + bool is_waiter_for(uint64_t mask, uint64_t min=0) { + if (!min) { + min = mask; + while (min & (min-1)) // if more than one bit is set + min &= min-1; // clear LSB + } + for (auto p = waiting.lower_bound(min); p != waiting.end(); ++p) { + if (p->first & mask) return true; + if (p->first > mask) return false; + } + return false; + } + virtual void add_waiter(uint64_t mask, MDSContext *c) { + if (waiting.empty()) + get(PIN_WAITER); + + uint64_t seq = 0; + if (mask & WAIT_ORDERED) { + seq = ++last_wait_seq; + mask &= ~WAIT_ORDERED; + } + waiting.insert(pair<uint64_t, pair<uint64_t, MDSContext*> >( + mask, + pair<uint64_t, MDSContext*>(seq, c))); +// pdout(10,g_conf()->debug_mds) << (mdsco_db_line_prefix(this)) +// << "add_waiter " << hex << mask << dec << " " << c +// << " on " << *this +// << dendl; + + } + virtual void take_waiting(uint64_t mask, MDSContext::vec& ls) { + if (waiting.empty()) return; + + // process ordered waiters in the same order that they were added. + std::map<uint64_t, MDSContext*> ordered_waiters; + + for (auto it = waiting.begin(); it != waiting.end(); ) { + if (it->first & mask) { + if (it->second.first > 0) { + ordered_waiters.insert(it->second); + } else { + ls.push_back(it->second.second); + } +// pdout(10,g_conf()->debug_mds) << (mdsco_db_line_prefix(this)) +// << "take_waiting mask " << hex << mask << dec << " took " << it->second +// << " tag " << hex << it->first << dec +// << " on " << *this +// << dendl; + waiting.erase(it++); + } else { +// pdout(10,g_conf()->debug_mds) << "take_waiting mask " << hex << mask << dec << " SKIPPING " << it->second +// << " tag " << hex << it->first << dec +// << " on " << *this +// << dendl; + ++it; + } + } + for (auto it = ordered_waiters.begin(); it != ordered_waiters.end(); ++it) { + ls.push_back(it->second); + } + if (waiting.empty()) { + put(PIN_WAITER); + waiting.clear(); + } + } + void finish_waiting(uint64_t mask, int result = 0); + + // --------------------------------------------- + // locking + // noop unless overloaded. + virtual SimpleLock* get_lock(int type) { ceph_abort(); return 0; } + virtual void set_object_info(MDSCacheObjectInfo &info) { ceph_abort(); } + virtual void encode_lock_state(int type, bufferlist& bl) { ceph_abort(); } + virtual void decode_lock_state(int type, const bufferlist& bl) { ceph_abort(); } + virtual void finish_lock_waiters(int type, uint64_t mask, int r=0) { ceph_abort(); } + virtual void add_lock_waiter(int type, uint64_t mask, MDSContext *c) { ceph_abort(); } + virtual bool is_lock_waiting(int type, uint64_t mask) { ceph_abort(); return false; } + + virtual void clear_dirty_scattered(int type) { ceph_abort(); } + + // --------------------------------------------- + // ordering + virtual bool is_lt(const MDSCacheObject *r) const = 0; + struct ptr_lt { + bool operator()(const MDSCacheObject* l, const MDSCacheObject* r) const { + return l->is_lt(r); + } + }; + +}; + +inline std::ostream& operator<<(std::ostream& out, MDSCacheObject &o) { + o.print(out); + return out; +} + +inline std::ostream& operator<<(std::ostream& out, const mdsco_db_line_prefix& o) { + o.object->print_db_line_prefix(out); + return out; +} + +#endif |