summaryrefslogtreecommitdiffstats
path: root/src/mon/Elector.h
diff options
context:
space:
mode:
Diffstat (limited to 'src/mon/Elector.h')
-rw-r--r--src/mon/Elector.h406
1 files changed, 406 insertions, 0 deletions
diff --git a/src/mon/Elector.h b/src/mon/Elector.h
new file mode 100644
index 000000000..2a53c1fc4
--- /dev/null
+++ b/src/mon/Elector.h
@@ -0,0 +1,406 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef CEPH_MON_ELECTOR_H
+#define CEPH_MON_ELECTOR_H
+
+#include <map>
+
+#include "include/types.h"
+#include "include/Context.h"
+#include "mon/MonOpRequest.h"
+#include "mon/mon_types.h"
+#include "mon/ElectionLogic.h"
+#include "mon/ConnectionTracker.h"
+
+class Monitor;
+
+
+/**
+ * This class is responsible for handling messages and maintaining
+ * an ElectionLogic which holds the local state when electing
+ * a new Leader. We may win or we may lose. If we win, it means we became the
+ * Leader; if we lose, it means we are a Peon.
+ */
+class Elector : public ElectionOwner, RankProvider {
+ /**
+ * @defgroup Elector_h_class Elector
+ * @{
+ */
+ ElectionLogic logic;
+ // connectivity validation and scoring
+ ConnectionTracker peer_tracker;
+ map<int, utime_t> peer_acked_ping; // rank -> last ping stamp they acked
+ map<int, utime_t> peer_sent_ping; // rank -> last ping stamp we sent
+ set<int> live_pinging; // ranks which we are currently pinging
+ set<int> dead_pinging; // ranks which didn't answer (degrading scores)
+ double ping_timeout; // the timeout after which we consider a ping to be dead
+ int PING_DIVISOR = 2; // we time out pings
+
+ /**
+ * @defgroup Elector_h_internal_types Internal Types
+ * @{
+ */
+ /**
+ * This struct will hold the features from a given peer.
+ * Features may both be the cluster's (in the form of a uint64_t), or
+ * mon-specific features. Instead of keeping maps to hold them both, or
+ * a pair, which would be weird, a struct to keep them seems appropriate.
+ */
+ struct elector_info_t {
+ uint64_t cluster_features = 0;
+ mon_feature_t mon_features;
+ ceph_release_t mon_release{0};
+ std::map<std::string,std::string> metadata;
+ };
+
+ /**
+ * @}
+ */
+
+ /**
+ * The Monitor instance associated with this class.
+ */
+ Monitor *mon;
+
+ /**
+ * Event callback responsible for dealing with an expired election once a
+ * timer runs out and fires up.
+ */
+ Context *expire_event = nullptr;
+
+ /**
+ * Resets the expire_event timer, by cancelling any existing one and
+ * scheduling a new one.
+ *
+ * @remarks This function assumes as a default firing value the duration of
+ * the monitor's lease interval, and adds to it the value specified
+ * in @e plus
+ *
+ * @post expire_event is set
+ *
+ * @param plus The amount of time to be added to the default firing value.
+ */
+ void reset_timer(double plus=0.0);
+ /**
+ * Cancel the expire_event timer, if it is defined.
+ *
+ * @post expire_event is not set
+ */
+ void cancel_timer();
+
+ // electing me
+ /**
+ * @defgroup Elector_h_electing_me_vars We are being elected
+ * @{
+ */
+ /**
+ * Map containing info of all those that acked our proposal to become the Leader.
+ * Note each peer's info.
+ */
+ std::map<int, elector_info_t> peer_info;
+ /**
+ * @}
+ */
+
+ /**
+ * Handle a message from some other node proposing itself to become it
+ * the Leader.
+ *
+ * We validate that the sending Monitor is allowed to participate based on
+ * its supported features, then pass the request to our ElectionLogic.
+ *
+ * @invariant The received message is an operation of type OP_PROPOSE
+ *
+ * @pre Message epoch is from the current or a newer epoch
+ *
+ * @param m A message sent by another participant in the quorum.
+ */
+ void handle_propose(MonOpRequestRef op);
+ /**
+ * Handle a message from some other participant Acking us as the Leader.
+ *
+ * We validate that the sending Monitor is allowed to participate based on
+ * its supported features, add it to peer_info, and pass the ack to our
+ * ElectionLogic.
+ *
+ * @pre Message epoch is from the current or a newer epoch
+ *
+ * @param m A message with an operation type of OP_ACK
+ */
+ void handle_ack(MonOpRequestRef op);
+ /**
+ * Handle a message from some other participant declaring Victory.
+ *
+ * We just got a message from someone declaring themselves Victorious, thus
+ * the new Leader.
+ *
+ * We pass the Victory to our ElectionLogic, and if it confirms the
+ * victory we lose the election and start following this Leader. Otherwise,
+ * drop the message.
+ *
+ * @pre Message epoch is from the current or a newer epoch
+ * @post Election is not on-going
+ * @post Updated @p epoch
+ * @post We have a new quorum if we lost the election
+ *
+ * @param m A message with an operation type of OP_VICTORY
+ */
+ void handle_victory(MonOpRequestRef op);
+ /**
+ * Send a nak to a peer who's out of date, containing information about why.
+ *
+ * If we get a message from a peer who can't support the required quorum
+ * features, we have to ignore them. This function will at least send
+ * them a message about *why* they're being ignored -- if they're new
+ * enough to support such a message.
+ *
+ * @param m A message from a monitor not supporting required features. We
+ * take ownership of the reference.
+ */
+ void nak_old_peer(MonOpRequestRef op);
+ /**
+ * Handle a message from some other participant declaring
+ * we cannot join the quorum.
+ *
+ * Apparently the quorum requires some feature that we do not implement. Shut
+ * down gracefully.
+ *
+ * @pre Election is on-going.
+ * @post We've shut down.
+ *
+ * @param m A message with an operation type of OP_NAK
+ */
+ void handle_nak(MonOpRequestRef op);
+ /**
+ * Send a ping to the specified peer.
+ * @n optional time that we will use instead of calling ceph_clock_now()
+ */
+ bool send_peer_ping(int peer, const utime_t *n=NULL);
+ /**
+ * Check the state of pinging the specified peer. This is our
+ * "tick" for heartbeating; scheduled by itself and begin_peer_ping().
+ */
+ void ping_check(int peer);
+ /**
+ * Move the peer out of live_pinging into dead_pinging set
+ * and schedule dead_ping()ing on it.
+ */
+ void begin_dead_ping(int peer);
+ /**
+ * Checks that the peer is still marked for dead pinging,
+ * and then marks it as dead for the appropriate interval.
+ */
+ void dead_ping(int peer);
+ /**
+ * Handle a ping from another monitor and assimilate the data it contains.
+ */
+ void handle_ping(MonOpRequestRef op);
+ /**
+ * Update our view of everybody else's connectivity based on the provided
+ * tracker bufferlist
+ */
+ void assimilate_connection_reports(const bufferlist& bl);
+
+ public:
+ /**
+ * @defgroup Elector_h_ElectionOwner Functions from the ElectionOwner interface
+ * @{
+ */
+ /* Commit the given epoch to our MonStore.
+ * We also take the opportunity to persist our peer_tracker.
+ */
+ void persist_epoch(epoch_t e);
+ /* Read the epoch out of our MonStore */
+ epoch_t read_persisted_epoch() const;
+ /* Write a nonsense key "election_writeable_test" to our MonStore */
+ void validate_store();
+ /* Reset my tracking. Currently, just call Monitor::join_election() */
+ void notify_bump_epoch();
+ /* Call a new election: Invoke Monitor::start_election() */
+ void trigger_new_election();
+ /* Retrieve rank from the Monitor */
+ int get_my_rank() const;
+ /* Send MMonElection OP_PROPOSE to every monitor in the map. */
+ void propose_to_peers(epoch_t e, bufferlist &bl);
+ /* bootstrap() the Monitor */
+ void reset_election();
+ /* Retrieve the Monitor::has_ever_joined member */
+ bool ever_participated() const;
+ /* Retrieve monmap->size() */
+ unsigned paxos_size() const;
+ /* Right now we don't disallow anybody */
+ set<int> disallowed_leaders;
+ const set<int>& get_disallowed_leaders() const { return disallowed_leaders; }
+ /**
+ * Reset the expire_event timer so we can limit the amount of time we
+ * will be electing. Clean up our peer_info.
+ *
+ * @post we reset the expire_event timer
+ */
+ void _start();
+ /**
+ * Send an MMonElection message deferring to the identified monitor. We
+ * also increase the election timeout so the monitor we defer to
+ * has some time to gather deferrals and actually win. (FIXME: necessary to protocol?)
+ *
+ * @post we sent an ack message to @p who
+ * @post we reset the expire_event timer
+ *
+ * @param who Some other monitor's numeric identifier.
+ */
+ void _defer_to(int who);
+ /**
+ * Our ElectionLogic told us we won an election! Identify the quorum
+ * features, tell our new peons we've won, and invoke Monitor::win_election().
+ */
+ void message_victory(const std::set<int>& quorum);
+ /* Check if rank is in mon->quorum */
+ bool is_current_member(int rank) const;
+ /*
+ * @}
+ */
+ /**
+ * Persist our peer_tracker to disk.
+ */
+ void persist_connectivity_scores();
+
+ Elector *elector;
+
+ /**
+ * Create an Elector class
+ *
+ * @param m A Monitor instance
+ * @param strategy The election strategy to use, defined in MonMap/ElectionLogic
+ */
+ explicit Elector(Monitor *m, int strategy);
+ virtual ~Elector() {}
+
+ /**
+ * Inform this class it is supposed to shutdown.
+ *
+ * We will simply cancel the @p expire_event if any exists.
+ *
+ * @post @p expire_event is cancelled
+ */
+ void shutdown();
+
+ /**
+ * Obtain our epoch from ElectionLogic.
+ *
+ * @returns Our current epoch number
+ */
+ epoch_t get_epoch() { return logic.get_epoch(); }
+
+ /**
+ * If the Monitor knows there are no Paxos peers (so
+ * we are rank 0 and there are no others) we can declare victory.
+ */
+ void declare_standalone_victory() {
+ logic.declare_standalone_victory();
+ }
+ /**
+ * Tell the Elector to start pinging a given peer.
+ * Do this when you discover a peer and it has a rank assigned.
+ * We do it ourselves on receipt of pings and when receiving other messages.
+ */
+ void begin_peer_ping(int peer);
+ /**
+ * Handle received messages.
+ *
+ * We will ignore all messages that are not of type @p MSG_MON_ELECTION
+ * (i.e., messages whose interface is not of type @p MMonElection). All of
+ * those that are will then be dispatched to their operation-specific
+ * functions.
+ *
+ * @param m A received message
+ */
+ void dispatch(MonOpRequestRef op);
+
+ /**
+ * Call an election.
+ *
+ * This function simply calls ElectionLogic::start.
+ */
+ void call_election() {
+ logic.start();
+ }
+
+ /**
+ * Stop participating in subsequent Elections.
+ *
+ * @post @p participating is false
+ */
+ void stop_participating() { logic.participating = false; }
+ /**
+ * Start participating in Elections.
+ *
+ * If we are already participating (i.e., @p participating is true), then
+ * calling this function is moot.
+ *
+ * However, if we are not participating (i.e., @p participating is false),
+ * then we will start participating by setting @p participating to true and
+ * we will call for an Election.
+ *
+ * @post @p participating is true
+ */
+ void start_participating();
+ /**
+ * Check if our peer_tracker is self-consistent, not suffering from
+ * https://tracker.ceph.com/issues/58049
+ */
+ bool peer_tracker_is_clean();
+ /**
+ * Forget everything about our peers. :(
+ */
+ void notify_clear_peer_state();
+ /**
+ * Notify that our local rank has changed
+ * and we may need to update internal data structures.
+ */
+ void notify_rank_changed(int new_rank);
+ /**
+ * A peer has been removed so we should clean up state related to it.
+ * This is safe to call even if we haven't joined or are currently
+ * in a quorum.
+ */
+ void notify_rank_removed(int rank_removed, int new_rank);
+ void notify_strategy_maybe_changed(int strategy);
+ /**
+ * Set the disallowed leaders.
+ *
+ * If you call this and the new disallowed set
+ * contains your current leader, you are
+ * responsible for calling an election!
+ *
+ * @returns false if the set is unchanged,
+ * true if the set changed
+ */
+ bool set_disallowed_leaders(const set<int>& dl) {
+ if (dl == disallowed_leaders) return false;
+ disallowed_leaders = dl;
+ return true;
+ }
+ void dump_connection_scores(Formatter *f) {
+ f->open_object_section("connection scores");
+ peer_tracker.dump(f);
+ f->close_section();
+ }
+ /**
+ * @}
+ */
+};
+
+#endif