diff options
Diffstat (limited to 'src/mds/MDSRank.h')
-rw-r--r-- | src/mds/MDSRank.h | 728 |
1 files changed, 728 insertions, 0 deletions
diff --git a/src/mds/MDSRank.h b/src/mds/MDSRank.h new file mode 100644 index 000000000..b61fc178c --- /dev/null +++ b/src/mds/MDSRank.h @@ -0,0 +1,728 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 Red Hat + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef MDS_RANK_H_ +#define MDS_RANK_H_ + +#include <string_view> + +#include <boost/asio/io_context.hpp> + +#include "common/DecayCounter.h" +#include "common/LogClient.h" +#include "common/Timer.h" +#include "common/fair_mutex.h" +#include "common/TrackedOp.h" +#include "common/ceph_mutex.h" + +#include "include/common_fwd.h" + +#include "messages/MClientRequest.h" +#include "messages/MCommand.h" +#include "messages/MMDSMap.h" + +#include "Beacon.h" +#include "DamageTable.h" +#include "MDSMap.h" +#include "SessionMap.h" +#include "MDCache.h" +#include "MDLog.h" +#include "MDSContext.h" +#include "PurgeQueue.h" +#include "Server.h" +#include "MetricsHandler.h" +#include "osdc/Journaler.h" + +// Full .h import instead of forward declaration for PerfCounter, for the +// benefit of those including this header and using MDSRank::logger +#include "common/perf_counters.h" + +enum { + l_mds_first = 2000, + l_mds_request, + l_mds_reply, + l_mds_reply_latency, + l_mds_slow_reply, + l_mds_forward, + l_mds_dir_fetch_complete, + l_mds_dir_fetch_keys, + l_mds_dir_commit, + l_mds_dir_split, + l_mds_dir_merge, + l_mds_inodes, + l_mds_inodes_top, + l_mds_inodes_bottom, + l_mds_inodes_pin_tail, + l_mds_inodes_pinned, + l_mds_inodes_expired, + l_mds_inodes_with_caps, + l_mds_caps, + l_mds_subtrees, + l_mds_traverse, + l_mds_traverse_hit, + l_mds_traverse_forward, + l_mds_traverse_discover, + l_mds_traverse_dir_fetch, + l_mds_traverse_remote_ino, + l_mds_traverse_lock, + l_mds_load_cent, + l_mds_dispatch_queue_len, + l_mds_exported, + l_mds_exported_inodes, + l_mds_imported, + l_mds_imported_inodes, + l_mds_openino_dir_fetch, + l_mds_openino_backtrace_fetch, + l_mds_openino_peer_discover, + l_mds_root_rfiles, + l_mds_root_rbytes, + l_mds_root_rsnaps, + l_mds_scrub_backtrace_fetch, + l_mds_scrub_set_tag, + l_mds_scrub_backtrace_repaired, + l_mds_scrub_inotable_repaired, + l_mds_scrub_dir_inodes, + l_mds_scrub_dir_base_inodes, + l_mds_scrub_dirfrag_rstats, + l_mds_scrub_file_inodes, + l_mdss_handle_inode_file_caps, + l_mdss_ceph_cap_op_revoke, + l_mdss_ceph_cap_op_grant, + l_mdss_ceph_cap_op_trunc, + l_mdss_ceph_cap_op_flushsnap_ack, + l_mdss_ceph_cap_op_flush_ack, + l_mdss_handle_client_caps, + l_mdss_handle_client_caps_dirty, + l_mdss_handle_client_cap_release, + l_mdss_process_request_cap_release, + l_mds_last, +}; + +// memory utilization +enum { + l_mdm_first = 2500, + l_mdm_ino, + l_mdm_inoa, + l_mdm_inos, + l_mdm_dir, + l_mdm_dira, + l_mdm_dirs, + l_mdm_dn, + l_mdm_dna, + l_mdm_dns, + l_mdm_cap, + l_mdm_capa, + l_mdm_caps, + l_mdm_rss, + l_mdm_heap, + l_mdm_last, +}; + +namespace ceph { + struct heartbeat_handle_d; +} + +class Locker; +class MDCache; +class MDLog; +class MDBalancer; +class InoTable; +class SnapServer; +class SnapClient; +class MDSTableServer; +class MDSTableClient; +class Messenger; +class MetricAggregator; +class Objecter; +class MonClient; +class MgrClient; +class Finisher; +class ScrubStack; +class C_ExecAndReply; + +/** + * The public part of this class's interface is what's exposed to all + * the various subsystems (server, mdcache, etc), such as pointers + * to the other subsystems, and message-sending calls. + */ +class MDSRank { + public: + friend class C_Flush_Journal; + friend class C_Drop_Cache; + friend class C_CacheDropExecAndReply; + friend class C_ScrubExecAndReply; + friend class C_ScrubControlExecAndReply; + + CephContext *cct; + + MDSRank( + mds_rank_t whoami_, + ceph::fair_mutex &mds_lock_, + LogChannelRef &clog_, + CommonSafeTimer<ceph::fair_mutex> &timer_, + Beacon &beacon_, + std::unique_ptr<MDSMap> & mdsmap_, + Messenger *msgr, + MonClient *monc_, + MgrClient *mgrc, + Context *respawn_hook_, + Context *suicide_hook_, + boost::asio::io_context& ioc); + + mds_rank_t get_nodeid() const { return whoami; } + int64_t get_metadata_pool() const + { + return metadata_pool; + } + + mono_time get_starttime() const { + return starttime; + } + std::chrono::duration<double> get_uptime() const { + mono_time now = mono_clock::now(); + return std::chrono::duration<double>(now-starttime); + } + + bool is_daemon_stopping() const; + + MDSTableClient *get_table_client(int t); + MDSTableServer *get_table_server(int t); + + Session *get_session(client_t client) { + return sessionmap.get_session(entity_name_t::CLIENT(client.v)); + } + Session *get_session(const cref_t<Message> &m); + + MDSMap::DaemonState get_state() const { return state; } + MDSMap::DaemonState get_want_state() const { return beacon.get_want_state(); } + + bool is_creating() const { return state == MDSMap::STATE_CREATING; } + bool is_starting() const { return state == MDSMap::STATE_STARTING; } + bool is_standby() const { return state == MDSMap::STATE_STANDBY; } + bool is_replay() const { return state == MDSMap::STATE_REPLAY; } + bool is_standby_replay() const { return state == MDSMap::STATE_STANDBY_REPLAY; } + bool is_resolve() const { return state == MDSMap::STATE_RESOLVE; } + bool is_reconnect() const { return state == MDSMap::STATE_RECONNECT; } + bool is_rejoin() const { return state == MDSMap::STATE_REJOIN; } + bool is_clientreplay() const { return state == MDSMap::STATE_CLIENTREPLAY; } + bool is_active() const { return state == MDSMap::STATE_ACTIVE; } + bool is_stopping() const { return state == MDSMap::STATE_STOPPING; } + bool is_any_replay() const { return (is_replay() || is_standby_replay()); } + bool is_stopped() const { return mdsmap->is_stopped(whoami); } + bool is_cluster_degraded() const { return cluster_degraded; } + bool allows_multimds_snaps() const { return mdsmap->allows_multimds_snaps(); } + + bool is_cache_trimmable() const { + return is_standby_replay() || is_clientreplay() || is_active() || is_stopping(); + } + + void handle_write_error(int err); + void handle_write_error_with_lock(int err); + + void update_mlogger(); + + void queue_waiter(MDSContext *c) { + finished_queue.push_back(c); + progress_thread.signal(); + } + void queue_waiter_front(MDSContext *c) { + finished_queue.push_front(c); + progress_thread.signal(); + } + void queue_waiters(MDSContext::vec& ls) { + MDSContext::vec v; + v.swap(ls); + std::copy(v.begin(), v.end(), std::back_inserter(finished_queue)); + progress_thread.signal(); + } + void queue_waiters_front(MDSContext::vec& ls) { + MDSContext::vec v; + v.swap(ls); + std::copy(v.rbegin(), v.rend(), std::front_inserter(finished_queue)); + progress_thread.signal(); + } + + // Daemon lifetime functions: these guys break the abstraction + // and call up into the parent MDSDaemon instance. It's kind + // of unavoidable: if we want any depth into our calls + // to be able to e.g. tear down the whole process, we have to + // have a reference going all the way down. + // >>> + void suicide(); + void respawn(); + // <<< + + /** + * Call this periodically if inside a potentially long running piece + * of code while holding the mds_lock + */ + void heartbeat_reset(); + int heartbeat_reset_grace(int count=1) { + return count * _heartbeat_reset_grace; + } + + /** + * Abort the MDS and flush any clog messages. + * + * Callers must already hold mds_lock. + */ + void abort(std::string_view msg); + + /** + * Report state DAMAGED to the mon, and then pass on to respawn(). Call + * this when an unrecoverable error is encountered while attempting + * to load an MDS rank's data structures. This is *not* for use with + * errors affecting normal dirfrag/inode objects -- they should be handled + * through cleaner scrub/repair mechanisms. + * + * Callers must already hold mds_lock. + */ + void damaged(); + + /** + * Wrapper around `damaged` for users who are not + * already holding mds_lock. + * + * Callers must not already hold mds_lock. + */ + void damaged_unlocked(); + + double last_cleared_laggy() const { + return beacon.last_cleared_laggy(); + } + + double get_dispatch_queue_max_age(utime_t now) const; + + void send_message_mds(const ref_t<Message>& m, mds_rank_t mds); + void send_message_mds(const ref_t<Message>& m, const entity_addrvec_t &addr); + void forward_message_mds(MDRequestRef& mdr, mds_rank_t mds); + void send_message_client_counted(const ref_t<Message>& m, client_t client); + void send_message_client_counted(const ref_t<Message>& m, Session* session); + void send_message_client_counted(const ref_t<Message>& m, const ConnectionRef& connection); + void send_message_client(const ref_t<Message>& m, Session* session); + void send_message(const ref_t<Message>& m, const ConnectionRef& c); + + void wait_for_bootstrapped_peer(mds_rank_t who, MDSContext *c) { + waiting_for_bootstrapping_peer[who].push_back(c); + } + void wait_for_active_peer(mds_rank_t who, MDSContext *c) { + waiting_for_active_peer[who].push_back(c); + } + void wait_for_cluster_recovered(MDSContext *c) { + ceph_assert(cluster_degraded); + waiting_for_active_peer[MDS_RANK_NONE].push_back(c); + } + + void wait_for_any_client_connection(MDSContext *c) { + waiting_for_any_client_connection.push_back(c); + } + void kick_waiters_for_any_client_connection(void) { + finish_contexts(g_ceph_context, waiting_for_any_client_connection); + } + void wait_for_active(MDSContext *c) { + waiting_for_active.push_back(c); + } + void wait_for_replay(MDSContext *c) { + waiting_for_replay.push_back(c); + } + void wait_for_rejoin(MDSContext *c) { + waiting_for_rejoin.push_back(c); + } + void wait_for_reconnect(MDSContext *c) { + waiting_for_reconnect.push_back(c); + } + void wait_for_resolve(MDSContext *c) { + waiting_for_resolve.push_back(c); + } + void wait_for_mdsmap(epoch_t e, MDSContext *c) { + waiting_for_mdsmap[e].push_back(c); + } + void enqueue_replay(MDSContext *c) { + replay_queue.push_back(c); + } + + bool queue_one_replay(); + void maybe_clientreplay_done(); + + void set_osd_epoch_barrier(epoch_t e); + epoch_t get_osd_epoch_barrier() const {return osd_epoch_barrier;} + epoch_t get_osd_epoch() const; + + ceph_tid_t issue_tid() { return ++last_tid; } + + MDSMap *get_mds_map() { return mdsmap.get(); } + + uint64_t get_num_requests() const { return logger->get(l_mds_request); } + + int get_mds_slow_req_count() const { return mds_slow_req_count; } + + void dump_status(Formatter *f) const; + + void hit_export_target(mds_rank_t rank, double amount=-1.0); + bool is_export_target(mds_rank_t rank) { + const std::set<mds_rank_t>& map_targets = mdsmap->get_mds_info(get_nodeid()).export_targets; + return map_targets.count(rank); + } + + bool evict_client(int64_t session_id, bool wait, bool blocklist, + std::ostream& ss, Context *on_killed=nullptr); + int config_client(int64_t session_id, bool remove, + const std::string& option, const std::string& value, + std::ostream& ss); + void schedule_inmemory_logger(); + + double get_inject_journal_corrupt_dentry_first() const { + return inject_journal_corrupt_dentry_first; + } + + // Reference to global MDS::mds_lock, so that users of MDSRank don't + // carry around references to the outer MDS, and we can substitute + // a separate lock here in future potentially. + ceph::fair_mutex &mds_lock; + + // Reference to global cluster log client, just to avoid initialising + // a separate one here. + LogChannelRef &clog; + + // Reference to global timer utility, because MDSRank and MDSDaemon + // currently both use the same mds_lock, so it makes sense for them + // to share a timer. + CommonSafeTimer<ceph::fair_mutex> &timer; + + std::unique_ptr<MDSMap> &mdsmap; /* MDSDaemon::mdsmap */ + + Objecter *objecter; + + // sub systems + Server *server = nullptr; + MDCache *mdcache = nullptr; + Locker *locker = nullptr; + MDLog *mdlog = nullptr; + MDBalancer *balancer = nullptr; + ScrubStack *scrubstack = nullptr; + DamageTable damage_table; + + InoTable *inotable = nullptr; + + SnapServer *snapserver = nullptr; + SnapClient *snapclient = nullptr; + + SessionMap sessionmap; + + PerfCounters *logger = nullptr, *mlogger = nullptr; + OpTracker op_tracker; + + // The last different state I held before current + MDSMap::DaemonState last_state = MDSMap::STATE_BOOT; + // The state assigned to me by the MDSMap + MDSMap::DaemonState state = MDSMap::STATE_STANDBY; + + bool cluster_degraded = false; + + Finisher *finisher; + protected: + typedef enum { + // The MDSMap is available, configure default layouts and structures + MDS_BOOT_INITIAL = 0, + // We are ready to open some inodes + MDS_BOOT_OPEN_ROOT, + // We are ready to do a replay if needed + MDS_BOOT_PREPARE_LOG, + // Replay is complete + MDS_BOOT_REPLAY_DONE + } BootStep; + + class ProgressThread : public Thread { + public: + explicit ProgressThread(MDSRank *mds_) : mds(mds_) {} + void * entry() override; + void shutdown(); + void signal() {cond.notify_all();} + private: + MDSRank *mds; + std::condition_variable_any cond; + } progress_thread; + + class C_MDS_StandbyReplayRestart; + class C_MDS_StandbyReplayRestartFinish; + // Friended to access retry_dispatch + friend class C_MDS_RetryMessage; + friend class C_MDS_BootStart; + friend class C_MDS_InternalBootStart; + friend class C_MDS_MonCommand; + + const mds_rank_t whoami; + + ~MDSRank(); + + void inc_dispatch_depth() { ++dispatch_depth; } + void dec_dispatch_depth() { --dispatch_depth; } + void retry_dispatch(const cref_t<Message> &m); + bool is_valid_message(const cref_t<Message> &m); + void handle_message(const cref_t<Message> &m); + void _advance_queues(); + bool _dispatch(const cref_t<Message> &m, bool new_msg); + bool is_stale_message(const cref_t<Message> &m) const; + + /** + * Emit clog warnings for any ops reported as warnings by optracker + */ + void check_ops_in_flight(); + + /** + * Share MDSMap with clients + */ + void create_logger(); + + void dump_clientreplay_status(Formatter *f) const; + void command_scrub_start(Formatter *f, + std::string_view path, std::string_view tag, + const std::vector<std::string>& scrubop_vec, Context *on_finish); + void command_tag_path(Formatter *f, std::string_view path, + std::string_view tag); + // scrub control commands + void command_scrub_abort(Formatter *f, Context *on_finish); + void command_scrub_pause(Formatter *f, Context *on_finish); + void command_scrub_resume(Formatter *f); + void command_scrub_status(Formatter *f); + + void command_flush_path(Formatter *f, std::string_view path); + void command_flush_journal(Formatter *f); + void command_get_subtrees(Formatter *f); + void command_export_dir(Formatter *f, + std::string_view path, mds_rank_t dest); + bool command_dirfrag_split( + cmdmap_t cmdmap, + std::ostream &ss); + bool command_dirfrag_merge( + cmdmap_t cmdmap, + std::ostream &ss); + bool command_dirfrag_ls( + cmdmap_t cmdmap, + std::ostream &ss, + Formatter *f); + int _command_export_dir(std::string_view path, mds_rank_t dest); + CDir *_command_dirfrag_get( + const cmdmap_t &cmdmap, + std::ostream &ss); + void command_openfiles_ls(Formatter *f); + void command_dump_tree(const cmdmap_t &cmdmap, std::ostream &ss, Formatter *f); + void command_dump_inode(Formatter *f, const cmdmap_t &cmdmap, std::ostream &ss); + void command_cache_drop(uint64_t timeout, Formatter *f, Context *on_finish); + + // FIXME the state machine logic should be separable from the dispatch + // logic that calls it. + // >>> + void calc_recovery_set(); + void request_state(MDSMap::DaemonState s); + + void boot_create(); // i am new mds. + void boot_start(BootStep step=MDS_BOOT_INITIAL, int r=0); // starting|replay + + void replay_start(); + void creating_done(); + void starting_done(); + void replay_done(); + void standby_replay_restart(); + void _standby_replay_restart_finish(int r, uint64_t old_read_pos); + + void reopen_log(); + + void resolve_start(); + void resolve_done(); + void reconnect_start(); + void reconnect_done(); + void rejoin_joint_start(); + void rejoin_start(); + void rejoin_done(); + void recovery_done(int oldstate); + void clientreplay_start(); + void clientreplay_done(); + void active_start(); + void stopping_start(); + void stopping_done(); + + void validate_sessions(); + + void handle_mds_recovery(mds_rank_t who); + void handle_mds_failure(mds_rank_t who); + + /* Update MDSMap export_targets for this rank. Called on ::tick(). */ + void update_targets(); + + void _mon_command_finish(int r, std::string_view cmd, std::string_view outs); + void set_mdsmap_multimds_snaps_allowed(); + + Context *create_async_exec_context(C_ExecAndReply *ctx); + + // blocklist the provided addrs and set OSD epoch barrier + // with the provided epoch. + void apply_blocklist(const std::set<entity_addr_t> &addrs, epoch_t epoch); + + void reset_event_flags(); + + // Incarnation as seen in MDSMap at the point where a rank is + // assigned. + int incarnation = 0; + + // Flag to indicate we entered shutdown: anyone seeing this to be true + // after taking mds_lock must drop out. + bool stopping = false; + + // PurgeQueue is only used by StrayManager, but it is owned by MDSRank + // because its init/shutdown happens at the top level. + PurgeQueue purge_queue; + + MetricsHandler metrics_handler; + std::unique_ptr<MetricAggregator> metric_aggregator; + + std::list<cref_t<Message>> waiting_for_nolaggy; + MDSContext::que finished_queue; + // Dispatch, retry, queues + int dispatch_depth = 0; + + ceph::heartbeat_handle_d *hb = nullptr; // Heartbeat for threads using mds_lock + double heartbeat_grace; + int _heartbeat_reset_grace; + + std::map<mds_rank_t, version_t> peer_mdsmap_epoch; + + ceph_tid_t last_tid = 0; // for mds-initiated requests (e.g. stray rename) + + MDSContext::vec waiting_for_active, waiting_for_replay, waiting_for_rejoin, + waiting_for_reconnect, waiting_for_resolve; + MDSContext::vec waiting_for_any_client_connection; + MDSContext::que replay_queue; + bool replaying_requests_done = false; + + std::map<mds_rank_t, MDSContext::vec> waiting_for_active_peer; + std::map<mds_rank_t, MDSContext::vec> waiting_for_bootstrapping_peer; + std::map<epoch_t, MDSContext::vec> waiting_for_mdsmap; + + epoch_t osd_epoch_barrier = 0; + + // Const reference to the beacon so that we can behave differently + // when it's laggy. + Beacon &beacon; + + int mds_slow_req_count = 0; + + std::map<mds_rank_t,DecayCounter> export_targets; /* targets this MDS is exporting to or wants/tries to */ + + Messenger *messenger; + MonClient *monc; + MgrClient *mgrc; + + Context *respawn_hook; + Context *suicide_hook; + + bool standby_replaying = false; // true if current replay pass is in standby-replay mode + uint64_t extraordinary_events_dump_interval = 0; + double inject_journal_corrupt_dentry_first = 0.0; +private: + bool send_status = true; + + // The metadata pool won't change in the whole life time of the fs, + // with this we can get rid of the mds_lock in many places too. + int64_t metadata_pool = -1; + + // "task" string that gets displayed in ceph status + inline static const std::string SCRUB_STATUS_KEY = "scrub status"; + + bool client_eviction_dump = false; + + void get_task_status(std::map<std::string, std::string> *status); + void schedule_update_timer_task(); + void send_task_status(); + + void inmemory_logger(); + bool is_rank0() const { + return whoami == (mds_rank_t)0; + } + + mono_time starttime = mono_clock::zero(); + boost::asio::io_context& ioc; +}; + +class C_MDS_RetryMessage : public MDSInternalContext { +public: + C_MDS_RetryMessage(MDSRank *mds, const cref_t<Message> &m) + : MDSInternalContext(mds), m(m) {} + void finish(int r) override { + get_mds()->retry_dispatch(m); + } +protected: + cref_t<Message> m; +}; + +class CF_MDS_RetryMessageFactory : public MDSContextFactory { +public: + CF_MDS_RetryMessageFactory(MDSRank *mds, const cref_t<Message> &m) + : mds(mds), m(m) {} + + MDSContext *build() { + return new C_MDS_RetryMessage(mds, m); + } +private: + MDSRank *mds; + cref_t<Message> m; +}; + +/** + * The aspect of MDSRank exposed to MDSDaemon but not subsystems: i.e. + * the service/dispatcher stuff like init/shutdown that subsystems should + * never touch. + */ +class MDSRankDispatcher : public MDSRank, public md_config_obs_t +{ +public: + MDSRankDispatcher( + mds_rank_t whoami_, + ceph::fair_mutex &mds_lock_, + LogChannelRef &clog_, + CommonSafeTimer<ceph::fair_mutex> &timer_, + Beacon &beacon_, + std::unique_ptr<MDSMap> &mdsmap_, + Messenger *msgr, + MonClient *monc_, + MgrClient *mgrc, + Context *respawn_hook_, + Context *suicide_hook_, + boost::asio::io_context& ioc); + + void init(); + void tick(); + void shutdown(); + void handle_asok_command( + std::string_view command, + const cmdmap_t& cmdmap, + Formatter *f, + const bufferlist &inbl, + std::function<void(int,const std::string&,bufferlist&)> on_finish); + void handle_mds_map(const cref_t<MMDSMap> &m, const MDSMap &oldmap); + void handle_osd_map(); + void update_log_config(); + + const char** get_tracked_conf_keys() const override final; + void handle_conf_change(const ConfigProxy& conf, const std::set<std::string>& changed) override; + + void dump_sessions(const SessionFilter &filter, Formatter *f, bool cap_dump=false) const; + void evict_clients(const SessionFilter &filter, + std::function<void(int,const std::string&,bufferlist&)> on_finish); + + // Call into me from MDS::ms_dispatch + bool ms_dispatch(const cref_t<Message> &m); +}; + +#endif // MDS_RANK_H_ |