1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
|
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
* Ceph - scalable distributed file system
*
* Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
*
* This is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License version 2.1, as published by the Free Software
* Foundation. See file COPYING.
*
*/
#ifndef CEPH_MON_ELECTOR_H
#define CEPH_MON_ELECTOR_H
#include <map>
#include "include/types.h"
#include "include/Context.h"
#include "mon/MonOpRequest.h"
#include "mon/mon_types.h"
#include "mon/ElectionLogic.h"
#include "mon/ConnectionTracker.h"
class Monitor;
/**
* This class is responsible for handling messages and maintaining
* an ElectionLogic which holds the local state when electing
* a new Leader. We may win or we may lose. If we win, it means we became the
* Leader; if we lose, it means we are a Peon.
*/
class Elector : public ElectionOwner, RankProvider {
  // Note: ElectionOwner is inherited publicly; RankProvider is inherited
  // privately (the default for `class`). Everything declared before the
  // `public:` label below is private.
  /**
   * @defgroup Elector_h_class Elector
   * @{
   */

  /** The election state machine that this class drives and answers to. */
  ElectionLogic logic;

  // connectivity validation and scoring
  ConnectionTracker peer_tracker;
  std::map<int, utime_t> peer_acked_ping; // rank -> last ping stamp they acked
  std::map<int, utime_t> peer_sent_ping; // rank -> last ping stamp we sent
  std::set<int> live_pinging; // ranks which we are currently pinging
  std::set<int> dead_pinging; // ranks which didn't answer (degrading scores)
  double ping_timeout; // the timeout after which we consider a ping to be dead
  // Divisor used when timing out pings.
  // NOTE(review): presumably combined with ping_timeout to derive the ping
  // interval; the exact use lives outside this header -- confirm in the
  // implementation file.
  int PING_DIVISOR = 2;

  /**
   * @defgroup Elector_h_internal_types Internal Types
   * @{
   */
  /**
   * This struct will hold the features from a given peer.
   * Features may both be the cluster's (in the form of a uint64_t), or
   * mon-specific features. Instead of keeping maps to hold them both, or
   * a pair, which would be weird, a struct to keep them seems appropriate.
   */
  struct elector_info_t {
    uint64_t cluster_features = 0;
    mon_feature_t mon_features;
    ceph_release_t mon_release{0};
    std::map<std::string,std::string> metadata;
  };
  /**
   * @}
   */

  /**
   * The Monitor instance associated with this class.
   */
  Monitor *mon;

  /**
   * Event callback responsible for dealing with an expired election once a
   * timer runs out and fires up.
   */
  Context *expire_event = nullptr;

  /**
   * Resets the expire_event timer, by cancelling any existing one and
   * scheduling a new one.
   *
   * @remarks This function assumes as a default firing value the duration of
   *          the monitor's lease interval, and adds to it the value specified
   *          in @e plus
   *
   * @post expire_event is set
   *
   * @param plus The amount of time to be added to the default firing value.
   */
  void reset_timer(double plus=0.0);

  /**
   * Cancel the expire_event timer, if it is defined.
   *
   * @post expire_event is not set
   */
  void cancel_timer();

  // electing me
  /**
   * @defgroup Elector_h_electing_me_vars We are being elected
   * @{
   */
  /**
   * Map containing info of all those that acked our proposal to become the
   * Leader, keyed by rank. Note each peer's info.
   */
  std::map<int, elector_info_t> peer_info;
  /**
   * @}
   */

  /**
   * Handle a message from some other node proposing itself to become
   * the Leader.
   *
   * We validate that the sending Monitor is allowed to participate based on
   * its supported features, then pass the request to our ElectionLogic.
   *
   * @invariant The received message is an operation of type OP_PROPOSE
   *
   * @pre Message epoch is from the current or a newer epoch
   *
   * @param op A message sent by another participant in the quorum.
   */
  void handle_propose(MonOpRequestRef op);

  /**
   * Handle a message from some other participant Acking us as the Leader.
   *
   * We validate that the sending Monitor is allowed to participate based on
   * its supported features, add it to peer_info, and pass the ack to our
   * ElectionLogic.
   *
   * @pre Message epoch is from the current or a newer epoch
   *
   * @param op A message with an operation type of OP_ACK
   */
  void handle_ack(MonOpRequestRef op);

  /**
   * Handle a message from some other participant declaring Victory.
   *
   * We just got a message from someone declaring themselves Victorious, thus
   * the new Leader.
   *
   * We pass the Victory to our ElectionLogic, and if it confirms the
   * victory we lose the election and start following this Leader. Otherwise,
   * drop the message.
   *
   * @pre Message epoch is from the current or a newer epoch
   * @post Election is not on-going
   * @post Updated @p epoch
   * @post We have a new quorum if we lost the election
   *
   * @param op A message with an operation type of OP_VICTORY
   */
  void handle_victory(MonOpRequestRef op);

  /**
   * Send a nak to a peer who's out of date, containing information about why.
   *
   * If we get a message from a peer who can't support the required quorum
   * features, we have to ignore them. This function will at least send
   * them a message about *why* they're being ignored -- if they're new
   * enough to support such a message.
   *
   * @param op A message from a monitor not supporting required features. We
   *           take ownership of the reference.
   */
  void nak_old_peer(MonOpRequestRef op);

  /**
   * Handle a message from some other participant declaring
   * we cannot join the quorum.
   *
   * Apparently the quorum requires some feature that we do not implement. Shut
   * down gracefully.
   *
   * @pre Election is on-going.
   * @post We've shut down.
   *
   * @param op A message with an operation type of OP_NAK
   */
  void handle_nak(MonOpRequestRef op);

  /**
   * Send a ping to the specified peer.
   *
   * @param peer The rank of the peer to ping.
   * @param n Optional time that we will use instead of calling
   *          ceph_clock_now(); pass nullptr (the default) to use the clock.
   */
  bool send_peer_ping(int peer, const utime_t *n=nullptr);

  /**
   * Check the state of pinging the specified peer. This is our
   * "tick" for heartbeating; scheduled by itself and begin_peer_ping().
   */
  void ping_check(int peer);

  /**
   * Move the peer out of live_pinging into dead_pinging set
   * and schedule dead_ping()ing on it.
   */
  void begin_dead_ping(int peer);

  /**
   * Checks that the peer is still marked for dead pinging,
   * and then marks it as dead for the appropriate interval.
   */
  void dead_ping(int peer);

  /**
   * Handle a ping from another monitor and assimilate the data it contains.
   */
  void handle_ping(MonOpRequestRef op);

  /**
   * Update our view of everybody else's connectivity based on the provided
   * tracker bufferlist
   */
  void assimilate_connection_reports(const bufferlist& bl);

public:
  /**
   * @defgroup Elector_h_ElectionOwner Functions from the ElectionOwner interface
   * @{
   */
  /**
   * Commit the given epoch to our MonStore.
   * We also take the opportunity to persist our peer_tracker.
   */
  void persist_epoch(epoch_t e);
  /** Read the epoch out of our MonStore */
  epoch_t read_persisted_epoch() const;
  /** Write a nonsense key "election_writeable_test" to our MonStore */
  void validate_store();
  /** Reset my tracking. Currently, just call Monitor::join_election() */
  void notify_bump_epoch();
  /** Call a new election: Invoke Monitor::start_election() */
  void trigger_new_election();
  /** Retrieve rank from the Monitor */
  int get_my_rank() const;
  /** Send MMonElection OP_PROPOSE to every monitor in the map. */
  void propose_to_peers(epoch_t e, bufferlist &bl);
  /** bootstrap() the Monitor */
  void reset_election();
  /** Retrieve the Monitor::has_ever_joined member */
  bool ever_participated() const;
  /** Retrieve monmap->size() */
  unsigned paxos_size() const;
  /**
   * Ranks that are not allowed to become leader. Default-constructed here;
   * replaced through set_disallowed_leaders().
   */
  std::set<int> disallowed_leaders;
  /** Read-only access to the current set of disallowed leader ranks. */
  const std::set<int>& get_disallowed_leaders() const { return disallowed_leaders; }

  /**
   * Reset the expire_event timer so we can limit the amount of time we
   * will be electing. Clean up our peer_info.
   *
   * @post we reset the expire_event timer
   */
  void _start();

  /**
   * Send an MMonElection message deferring to the identified monitor. We
   * also increase the election timeout so the monitor we defer to
   * has some time to gather deferrals and actually win. (FIXME: necessary to protocol?)
   *
   * @post we sent an ack message to @p who
   * @post we reset the expire_event timer
   *
   * @param who Some other monitor's numeric identifier.
   */
  void _defer_to(int who);

  /**
   * Our ElectionLogic told us we won an election! Identify the quorum
   * features, tell our new peons we've won, and invoke Monitor::win_election().
   */
  void message_victory(const std::set<int>& quorum);

  /** Check if rank is in mon->quorum */
  bool is_current_member(int rank) const;
  /**
   * @}
   */

  /**
   * Persist our peer_tracker to disk.
   */
  void persist_connectivity_scores();

  /**
   * Pointer to an Elector.
   * NOTE(review): not referenced anywhere in this header; presumably set to
   * `this` by the constructor -- confirm in the implementation file before
   * relying on it.
   */
  Elector *elector;

  /**
   * Create an Elector class
   *
   * @param m A Monitor instance
   * @param strategy The election strategy to use, defined in MonMap/ElectionLogic
   */
  explicit Elector(Monitor *m, int strategy);
  virtual ~Elector() {}

  /**
   * Inform this class it is supposed to shutdown.
   *
   * We will simply cancel the @p expire_event if any exists.
   *
   * @post @p expire_event is cancelled
   */
  void shutdown();

  /**
   * Obtain our epoch from ElectionLogic.
   *
   * @returns Our current epoch number
   */
  epoch_t get_epoch() { return logic.get_epoch(); }

  /**
   * If the Monitor knows there are no Paxos peers (so
   * we are rank 0 and there are no others) we can declare victory.
   * Forwards directly to ElectionLogic::declare_standalone_victory().
   */
  void declare_standalone_victory() {
    logic.declare_standalone_victory();
  }

  /**
   * Tell the Elector to start pinging a given peer.
   * Do this when you discover a peer and it has a rank assigned.
   * We do it ourselves on receipt of pings and when receiving other messages.
   */
  void begin_peer_ping(int peer);

  /**
   * Handle received messages.
   *
   * We will ignore all messages that are not of type @p MSG_MON_ELECTION
   * (i.e., messages whose interface is not of type @p MMonElection). All of
   * those that are will then be dispatched to their operation-specific
   * functions.
   *
   * @param op A received message
   */
  void dispatch(MonOpRequestRef op);

  /**
   * Call an election.
   *
   * This function simply calls ElectionLogic::start.
   */
  void call_election() {
    logic.start();
  }

  /**
   * Stop participating in subsequent Elections.
   *
   * @post @p participating is false
   */
  void stop_participating() { logic.participating = false; }

  /**
   * Start participating in Elections.
   *
   * If we are already participating (i.e., @p participating is true), then
   * calling this function is moot.
   *
   * However, if we are not participating (i.e., @p participating is false),
   * then we will start participating by setting @p participating to true and
   * we will call for an Election.
   *
   * @post @p participating is true
   */
  void start_participating();

  /**
   * Check if our peer_tracker is self-consistent, not suffering from
   * https://tracker.ceph.com/issues/58049
   */
  bool peer_tracker_is_clean();

  /**
   * Forget everything about our peers. :(
   */
  void notify_clear_peer_state();

  /**
   * Notify that our local rank has changed
   * and we may need to update internal data structures.
   */
  void notify_rank_changed(int new_rank);

  /**
   * A peer has been removed so we should clean up state related to it.
   * This is safe to call even if we haven't joined or are currently
   * in a quorum.
   */
  void notify_rank_removed(unsigned rank_removed, unsigned new_rank);

  /**
   * Notify the Elector that the election strategy may have changed.
   * NOTE(review): the handling is implemented outside this header; see the
   * implementation file for what happens when @p strategy differs.
   *
   * @param strategy The (possibly new) election strategy.
   */
  void notify_strategy_maybe_changed(int strategy);

  /**
   * Set the disallowed leaders.
   *
   * If you call this and the new disallowed set
   * contains your current leader, you are
   * responsible for calling an election!
   *
   * @param dl The new set of ranks that may not lead.
   * @returns false if the set is unchanged,
   *   true if the set changed
   */
  bool set_disallowed_leaders(const std::set<int>& dl) {
    if (dl == disallowed_leaders) {
      // Nothing to update; tell the caller no election is needed on our account.
      return false;
    }
    disallowed_leaders = dl;
    return true;
  }

  /**
   * Dump peer_tracker through the given Formatter, wrapped in an object
   * section named "connection scores".
   *
   * @param f The Formatter to write to; dereferenced unconditionally.
   */
  void dump_connection_scores(Formatter *f) {
    f->open_object_section("connection scores");
    peer_tracker.dump(f);
    f->close_section();
  }
  /**
   * @}
   */
};
#endif
|