1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
|
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
* Ceph - scalable distributed file system
*
* Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
*
* This is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License version 2.1, as published by the Free Software
* Foundation. See file COPYING.
*
*/
#ifndef CEPH_ELECTIONLOGIC_H
#define CEPH_ELECTIONLOGIC_H
#include <map>
#include <set>
#include "include/types.h"
#include "ConnectionTracker.h"
class ElectionOwner {
public:
/**
* Write down the given epoch in persistent storage, such that it
* can later be retrieved by read_persisted_epoch even across process
* or machine restarts.
*
* @param e The epoch to write
*/
virtual void persist_epoch(epoch_t e) = 0;
/**
* Retrieve the most-previously-persisted epoch.
*
* @returns The latest epoch passed to persist_epoch()
*/
virtual epoch_t read_persisted_epoch() const = 0;
/**
* Validate that the persistent store is working by committing
* to it. (There is no interface for retrieving the value; this
* tests local functionality before doing things like triggering
* elections to try and join a quorum.)
*/
virtual void validate_store() = 0;
/**
* Notify the ElectionOwner that ElectionLogic has increased its
* election epoch. This resets an election (either on local loss or victory,
* or when trying a new election round) and the ElectionOwner
* should reset any tracking of its own to match. (The ElectionLogic
* will further trigger sending election messages if that is
* appropriate.)
*/
virtual void notify_bump_epoch() = 0;
/**
* Notify the ElectionOwner we must start a new election.
*/
virtual void trigger_new_election() = 0;
/**
* Retrieve this Paxos instance's rank.
*/
virtual int get_my_rank() const = 0;
/**
* Send a PROPOSE message to all our peers. This happens when
* we have started a new election (which may mean attempting to
* override a current one).
*
* @param e The election epoch of our proposal.
* @param bl A bufferlist containing data the logic wishes to share
*/
virtual void propose_to_peers(epoch_t e, bufferlist& bl) = 0;
/**
* The election has failed and we aren't sure what the state of the
* quorum is, so reset the entire system as if from scratch.
*/
virtual void reset_election() = 0;
/**
* Ask the ElectionOwner if we-the-Monitor have ever participated in the
* quorum (including across process restarts!).
*
* @returns true if we have participated, false otherwise
*/
virtual bool ever_participated() const = 0;
/**
* Ask the ElectionOwner for the size of the Paxos set. This includes
* those monitors which may not be in the current quorum!
* The value returned by this function can change between elections,
* but not during them. (In practical terms, it can be updated
* by making a paxos commit, but not by injecting values while
* an election is ongoing.)
*/
virtual unsigned paxos_size() const = 0;
/**
* Retrieve a set of ranks which are not allowed to become the leader.
* Like paxos_size(), This set can change between elections, but not
* during them.
*/
virtual const std::set<int>& get_disallowed_leaders() const = 0;
/**
* Tell the ElectionOwner we have started a new election.
*
* The ElectionOwner is responsible for timing out the election (by invoking
* end_election_period()) if it takes too long (as defined by the ElectionOwner).
* This function is the opportunity to do that and to clean up any other external
* election state it may be maintaining.
*/
virtual void _start() = 0;
/**
* Tell the ElectionOwner to defer to the identified peer. Tell that peer
* we have deferred to it.
*
* @post we sent an ack message to @p who
*/
virtual void _defer_to(int who) = 0;
/**
* We have won an election, so have the ElectionOwner message that to
* our new quorum!
*
* @param quorum The ranks of our peers which deferred to us and
* must be told of our victory
*/
virtual void message_victory(const std::set<int>& quorum) = 0;
/**
* Query the ElectionOwner about if a given rank is in the
* currently active quorum.
* @param rank the Paxos rank whose status we are checking
* @returns true if the rank is in our current quorum, false otherwise.
*/
virtual bool is_current_member(int rank) const = 0;
virtual ~ElectionOwner() {}
};
/**
* This class maintains local state for running an election
* between Paxos instances. It receives input requests
* and calls back out to its ElectionOwner to do persistence
* and message other entities.
*/
class ElectionLogic {
ElectionOwner *elector;
ConnectionTracker *peer_tracker;
CephContext *cct;
/**
* Latest epoch we've seen.
*
* @remarks if its value is odd, we're electing; if it's even, then we're
* stable.
*/
epoch_t epoch = 0;
/**
* The last rank which won an election we participated in
*/
int last_election_winner = -1;
/**
* Only used in the connectivity handler.
* The rank we voted for in the last election we voted in.
*/
int last_voted_for = -1;
double ignore_propose_margin = 0.0001;
/**
* Only used in the connectivity handler.
* Points at a stable copy of the peer_tracker we use to keep scores
* throughout an election period.
*/
std::unique_ptr<ConnectionTracker> stable_peer_tracker;
std::unique_ptr<ConnectionTracker> leader_peer_tracker;
/**
* Indicates who we have acked
*/
int leader_acked;
public:
enum election_strategy {
// Keep in sync with MonMap.h!
CLASSIC = 1, // the original rank-based one
DISALLOW = 2, // disallow a set from being leader
CONNECTIVITY = 3 // includes DISALLOW, extends to prefer stronger connections
};
election_strategy strategy;
/**
* Indicates if we are participating in the quorum.
*
* @remarks By default, we are created as participating. We may stop
* participating if something explicitly sets our value
* false, though. If that happens, it will
* have to set participating=true and invoke start() for us to resume
* participating in the quorum.
*/
bool participating;
/**
* Indicates if we are the ones being elected.
*
* We always attempt to be the one being elected if we are the ones starting
* the election. If we are not the ones that started it, we will only attempt
* to be elected if we think we might have a chance (i.e., the other guy's
* rank is lower than ours).
*/
bool electing_me;
/**
* Set containing all those that acked our proposal to become the Leader.
*
* If we are acked by ElectionOwner::paxos_size() peers, we will declare
* victory.
*/
std::set<int> acked_me;
ElectionLogic(ElectionOwner *e, election_strategy es, ConnectionTracker *t,
double ipm,
CephContext *c) : elector(e), peer_tracker(t), cct(c),
last_election_winner(-1), last_voted_for(-1),
ignore_propose_margin(ipm),
stable_peer_tracker(),
leader_peer_tracker(),
leader_acked(-1),
strategy(es),
participating(true),
electing_me(false) {}
/**
* Set the election strategy to use. If this is not consistent across the
* electing cluster, you're going to have a bad time.
* Defaults to CLASSIC.
*/
void set_election_strategy(election_strategy es) {
strategy = es;
}
/**
* If there are no other peers in this Paxos group, ElectionOwner
* can simply declare victory and we will make it so.
*
* @pre paxos_size() is 1
* @pre get_my_rank is 0
*/
void declare_standalone_victory();
/**
* Start a new election by proposing ourselves as the new Leader.
*
* Basically, send propose messages to all the peers.
*
* @pre participating is true
* @post epoch is an odd value
* @post electing_me is true
* @post We have invoked propose_to_peers() on our ElectionOwner
* @post We have invoked _start() on our ElectionOwner
*/
void start();
/**
* ElectionOwner has decided the election has taken too long and expired.
*
* This will happen when no one declared victory or started a new election
* during the allowed time span.
*
* When the election expires, we will check if we were the ones who won, and
* if so we will declare victory. If that is not the case, then we assume
* that the one we deferred to didn't declare victory quickly enough (in fact,
* as far as we know, it may even be dead); so, just propose ourselves as the
* Leader.
*/
void end_election_period();
/**
* Handle a proposal from some other node proposing asking to become
* the Leader.
*
* If the message appears to be old (i.e., its epoch is lower than our epoch),
* then we may take one of two actions:
*
* @li Ignore it because it's nothing more than an old proposal
* @li Start new elections if we verify that it was sent by a monitor from
* outside the quorum; given its old state, it's fair to assume it just
* started, so we should start new elections so it may rejoin. (Some
* handlers may choose to ignore even these, if they think it's flapping.)
*
* We pass the propose off to a propose_*_handler function based
* on the election strategy we're using.
* Only the Connectivity strategy cares about the ConnectionTracker; it should
* be NULL if other strategies are in use. Otherwise, it will take ownership
* of the underlying data and delete it as needed.
*
* @pre Message epoch is from the current or a newer epoch
* @param mepoch The epoch of the proposal
* @param from The rank proposing itself as leader
* @param ct Any incoming ConnectionTracker data sent with the message.
* Callers are responsible for deleting this -- we will copy it if we want
* to keep the data.
*/
void receive_propose(int from, epoch_t mepoch, const ConnectionTracker *ct);
/**
* Handle a message from some other participant Acking us as the Leader.
*
* When we receive such a message, one of three thing may be happening:
* @li We received a message with a newer epoch, which means we must have
* somehow lost track of what was going on (maybe we rebooted), thus we
* will start a new election
* @li We consider ourselves in the run for the Leader (i.e., @p electing_me
* is true), and we are actually being Acked by someone; thus simply add
* the one acking us to the @p acked_me set. If we do now have acks from
* all the participants, then we can declare victory
* @li We already deferred the election to somebody else, so we will just
* ignore this message
*
* @pre Message epoch is from the current or a newer epoch
* @post Election is on-going if we deferred to somebody else
* @post Election is on-going if we are still waiting for further Acks
* @post Election is not on-going if we are victorious
* @post Election is not on-going if we must start a new one
*
* @param from The rank which acked us
* @param from_epoch The election epoch the ack belongs to
*/
void receive_ack(int from, epoch_t from_epoch);
/**
* Handle a message from some other participant declaring Victory.
*
* We just got a message from someone declaring themselves Victorious, thus
* the new Leader.
*
* However, if the message's epoch happens to be different from our epoch+1,
* then it means we lost track of something and we must start a new election.
*
* If that is not the case, then we will simply update our epoch to the one
* in the message and invoke start() to reset the quorum.
*
* @pre from_epoch is the current or a newer epoch
* @post Election is not on-going
* @post Updated @p epoch
* @post We are a peon in a new quorum if we lost the election
*
* @param from The victory-claiming rank
* @param from_epoch The election epoch in which they claim victory
*/
bool receive_victory_claim(int from, epoch_t from_epoch);
/**
* Obtain our epoch
*
* @returns Our current epoch number
*/
epoch_t get_epoch() const { return epoch; }
int get_election_winner() { return last_election_winner; }
private:
/**
* Initiate the ElectionLogic class.
*
* Basically, we will simply read whatever epoch value we have in our stable
* storage, or consider it to be 1 if none is read.
*
* @post @p epoch is set to 1 or higher.
*/
void init();
/**
* Update our epoch.
*
* If we come across a higher epoch, we simply update ours, also making
* sure we are no longer being elected (even though we could have been,
* we no longer are since we no longer are on that old epoch).
*
* @pre Our epoch is not larger than @p e
* @post Our epoch equals @p e
*
* @param e Epoch to which we will update our epoch
*/
void bump_epoch(epoch_t e);
/**
* If the incoming proposal is newer, bump our own epoch; if
* it comes from an out-of-quorum peer, trigger a new eleciton.
* @returns true if you should drop this proposal, false otherwise.
*/
bool propose_classic_prefix(int from, epoch_t mepoch);
/**
* Handle a proposal from another rank using the classic strategy.
* We will take one of the following actions:
*
* @li Ignore it because we already acked another node with higher rank
* @li Ignore it and start a new election because we outrank it
* @li Defer to it because it outranks us and the node we previously
* acked, if any
*/
void propose_classic_handler(int from, epoch_t mepoch);
/**
* Handle a proposal from another rank using our disallow strategy.
* This is the same as the classic strategy except we also disallow
* certain ranks from becoming the leader.
*/
void propose_disallow_handler(int from, epoch_t mepoch);
/**
* Handle a proposal from another rank using the connectivity strategy.
* We will choose to defer or not based on the ordered criteria:
*
* @li Whether the other monitor (or ourself) is on the disallow list
* @li Whether the other monitor or ourself has the most connectivity to peers
* @li Whether the other monitor or ourself has the lower rank
*/
void propose_connectivity_handler(int from, epoch_t mepoch, const ConnectionTracker *ct);
/**
* Helper function for connectivity handler. Combines the disallowed list
* with ConnectionTracker scores.
*/
double connectivity_election_score(int rank);
/**
* Defer the current election to some other monitor.
*
* This means that we will ack some other monitor and drop out from the run
* to become the Leader. We will only defer an election if the monitor we
* are deferring to outranks us.
*
* @pre @p who outranks us (i.e., who < our rank)
* @pre @p who outranks any other monitor we have deferred to in the past
* @post electing_me is false
* @post leader_acked equals @p who
* @post we triggered ElectionOwner's _defer_to() on @p who
*
* @param who Some other monitor's numeric identifier.
*/
void defer(int who);
/**
* Declare Victory.
*
* We won. Or at least we believe we won, but for all intents and purposes
* that does not matter. What matters is that we Won.
*
* That said, we must now bump our epoch to reflect that the election is over
* and then we must let everybody in the quorum know we are their brand new
* Leader.
*
* Actually, the quorum will be now defined as the group of monitors that
* acked us during the election process.
*
* @pre Election is on-going
* @pre electing_me is true
* @post electing_me is false
* @post epoch is bumped up into an even value
* @post Election is not on-going
* @post We have a quorum, composed of the monitors that acked us
* @post We invoked message_victory() on the ElectionOwner
*/
void declare_victory();
/**
* This is just a helper function to validate that the victory claim we
* get from another rank makes any sense.
*/
bool victory_makes_sense(int from);
/**
* Reset some data members which we only care about while we are in an election
* or need to be set consistently during stable states.
*/
void clear_live_election_state();
void reset_stable_tracker();
/**
* Only for the connectivity handler, Bump the epoch
* when we get a message from a newer one and clear
* out leader and stable tracker
* data so that we can switch our allegiance.
*/
void connectivity_bump_epoch_in_election(epoch_t mepoch);
};
#endif
|