summaryrefslogtreecommitdiffstats
path: root/src/mon/Paxos.h
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-27 18:24:20 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-27 18:24:20 +0000
commit483eb2f56657e8e7f419ab1a4fab8dce9ade8609 (patch)
treee5d88d25d870d5dedacb6bbdbe2a966086a0a5cf /src/mon/Paxos.h
parentInitial commit. (diff)
downloadceph-upstream.tar.xz
ceph-upstream.zip
Adding upstream version 14.2.21.upstream/14.2.21upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to '')
-rw-r--r--src/mon/Paxos.h1385
1 files changed, 1385 insertions, 0 deletions
diff --git a/src/mon/Paxos.h b/src/mon/Paxos.h
new file mode 100644
index 00000000..75bac472
--- /dev/null
+++ b/src/mon/Paxos.h
@@ -0,0 +1,1385 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+/*
+time---->
+
+cccccccccccccccccca????????????????????????????????????????
+cccccccccccccccccca????????????????????????????????????????
+cccccccccccccccccca???????????????????????????????????????? leader
+cccccccccccccccccc?????????????????????????????????????????
+ccccc??????????????????????????????????????????????????????
+
+last_committed
+
+pn_from
+pn
+
+a 12v
+b 12v
+c 14v
+d
+e 12v
+*/
+
+/**
+ * Paxos storage layout and behavior
+ *
+ * Currently, we use a key/value store to hold all the Paxos-related data, but
+ * it can logically be depicted as this:
+ *
+ * paxos:
+ * first_committed -> 1
+ * last_committed -> 4
+ * 1 -> value_1
+ * 2 -> value_2
+ * 3 -> value_3
+ * 4 -> value_4
+ *
+ * Since we are relying on a k/v store supporting atomic transactions, we can
+ * guarantee that if 'last_committed' has a value of '4', then we have up to
+ * version 4 on the store, and no more than that; the same applies to
+ * 'first_committed', which holding '1' will strictly meaning that our lowest
+ * version is 1.
+ *
+ * Each version's value (value_1, value_2, ..., value_n) is a blob of data,
+ * incomprehensible to the Paxos. These values are proposed to the Paxos on
+ * propose_new_value() and each one is a transaction encoded in a bufferlist.
+ *
+ * The Paxos will write the value to disk, associating it with its version,
+ * but will take a step further: the value shall be decoded, and the operations
+ * on that transaction shall be applied during the same transaction that will
+ * write the value's encoded bufferlist to disk. This behavior ensures that
+ * whatever is being proposed will only be available on the store when it is
+ * applied by Paxos, which will then be aware of such new values, guaranteeing
+ * the store state is always consistent without requiring shady workarounds.
+ *
+ * So, let's say that FooMonitor proposes the following transaction, neatly
+ * encoded on a bufferlist of course:
+ *
+ * Tx_Foo
+ * put(foo, last_committed, 3)
+ * put(foo, 3, foo_value_3)
+ * erase(foo, 2)
+ * erase(foo, 1)
+ * put(foo, first_committed, 3)
+ *
+ * And knowing that the Paxos is proposed Tx_Foo as a bufferlist, once it is
+ * ready to commit, and assuming we are now committing version 5 of the Paxos,
+ * we will do something along the lines of:
+ *
+ * Tx proposed_tx;
+ * proposed_tx.decode(Tx_foo_bufferlist);
+ *
+ * Tx our_tx;
+ * our_tx.put(paxos, last_committed, 5);
+ * our_tx.put(paxos, 5, Tx_foo_bufferlist);
+ * our_tx.append(proposed_tx);
+ *
+ * store_apply(our_tx);
+ *
+ * And the store should look like this after we apply 'our_tx':
+ *
+ * paxos:
+ * first_committed -> 1
+ * last_committed -> 5
+ * 1 -> value_1
+ * 2 -> value_2
+ * 3 -> value_3
+ * 4 -> value_4
+ * 5 -> Tx_foo_bufferlist
+ * foo:
+ * first_committed -> 3
+ * last_committed -> 3
+ * 3 -> foo_value_3
+ *
+ */
+
+#ifndef CEPH_MON_PAXOS_H
+#define CEPH_MON_PAXOS_H
+
+#include "include/types.h"
+#include "mon_types.h"
+#include "include/buffer.h"
+#include "msg/msg_types.h"
+#include "include/Context.h"
+#include "common/perf_counters.h"
+#include <errno.h>
+
+#include "MonitorDBStore.h"
+#include "mon/MonOpRequest.h"
+
+class Monitor;
+class MMonPaxos;
+
+enum {
+ l_paxos_first = 45800,
+ l_paxos_start_leader,
+ l_paxos_start_peon,
+ l_paxos_restart,
+ l_paxos_refresh,
+ l_paxos_refresh_latency,
+ l_paxos_begin,
+ l_paxos_begin_keys,
+ l_paxos_begin_bytes,
+ l_paxos_begin_latency,
+ l_paxos_commit,
+ l_paxos_commit_keys,
+ l_paxos_commit_bytes,
+ l_paxos_commit_latency,
+ l_paxos_collect,
+ l_paxos_collect_keys,
+ l_paxos_collect_bytes,
+ l_paxos_collect_latency,
+ l_paxos_collect_uncommitted,
+ l_paxos_collect_timeout,
+ l_paxos_accept_timeout,
+ l_paxos_lease_ack_timeout,
+ l_paxos_lease_timeout,
+ l_paxos_store_state,
+ l_paxos_store_state_keys,
+ l_paxos_store_state_bytes,
+ l_paxos_store_state_latency,
+ l_paxos_share_state,
+ l_paxos_share_state_keys,
+ l_paxos_share_state_bytes,
+ l_paxos_new_pn,
+ l_paxos_new_pn_latency,
+ l_paxos_last,
+};
+
+
+// i am one state machine.
+/**
+ * This library is based on the Paxos algorithm, but varies in a few key ways:
+ * 1- Only a single new value is generated at a time, simplifying the recovery logic.
+ * 2- Nodes track "committed" values, and share them generously (and trustingly)
+ * 3- A 'leasing' mechanism is built-in, allowing nodes to determine when it is
+ * safe to "read" their copy of the last committed value.
+ *
+ * This provides a simple replication substrate that services can be built on top of.
+ * See PaxosService.h
+ */
+class Paxos {
+ /**
+ * @defgroup Paxos_h_class Paxos
+ * @{
+ */
+ /**
+ * The Monitor to which this Paxos class is associated with.
+ */
+ Monitor *mon;
+
+ /// perf counter for internal instrumentations
+ PerfCounters *logger;
+
+ void init_logger();
+
+ // my state machine info
+ const string paxos_name;
+
+ friend class Monitor;
+ friend class PaxosService;
+
+ list<std::string> extra_state_dirs;
+
+ // LEADER+PEON
+
+ // -- generic state --
+public:
+ /**
+ * @defgroup Paxos_h_states States on which the leader/peon may be.
+ * @{
+ */
+ enum {
+ /**
+ * Leader/Peon is in Paxos' Recovery state
+ */
+ STATE_RECOVERING,
+ /**
+ * Leader/Peon is idle, and the Peon may or may not have a valid lease.
+ */
+ STATE_ACTIVE,
+ /**
+ * Leader/Peon is updating to a new value.
+ */
+ STATE_UPDATING,
+ /*
+ * Leader proposing an old value
+ */
+ STATE_UPDATING_PREVIOUS,
+ /*
+ * Leader/Peon is writing a new commit. readable, but not
+ * writeable.
+ */
+ STATE_WRITING,
+ /*
+ * Leader/Peon is writing a new commit from a previous round.
+ */
+ STATE_WRITING_PREVIOUS,
+ // leader: refresh following a commit
+ STATE_REFRESH,
+ // Shutdown after WRITING or WRITING_PREVIOUS
+ STATE_SHUTDOWN
+ };
+
+ /**
+ * Obtain state name from constant value.
+ *
+ * @note This function will raise a fatal error if @p s is not
+ * a valid state value.
+ *
+ * @param s State value.
+ * @return The state's name.
+ */
+ static const string get_statename(int s) {
+ switch (s) {
+ case STATE_RECOVERING:
+ return "recovering";
+ case STATE_ACTIVE:
+ return "active";
+ case STATE_UPDATING:
+ return "updating";
+ case STATE_UPDATING_PREVIOUS:
+ return "updating-previous";
+ case STATE_WRITING:
+ return "writing";
+ case STATE_WRITING_PREVIOUS:
+ return "writing-previous";
+ case STATE_REFRESH:
+ return "refresh";
+ case STATE_SHUTDOWN:
+ return "shutdown";
+ default:
+ return "UNKNOWN";
+ }
+ }
+
+private:
+ /**
+ * The state we are in.
+ */
+ int state;
+ /**
+ * @}
+ */
+ int commits_started = 0;
+
+ Cond shutdown_cond;
+
+public:
+ /**
+ * Check if we are recovering.
+ *
+ * @return 'true' if we are on the Recovering state; 'false' otherwise.
+ */
+ bool is_recovering() const { return (state == STATE_RECOVERING); }
+ /**
+ * Check if we are active.
+ *
+ * @return 'true' if we are on the Active state; 'false' otherwise.
+ */
+ bool is_active() const { return state == STATE_ACTIVE; }
+ /**
+ * Check if we are updating.
+ *
+ * @return 'true' if we are on the Updating state; 'false' otherwise.
+ */
+ bool is_updating() const { return state == STATE_UPDATING; }
+
+ /**
+ * Check if we are updating/proposing a previous value from a
+ * previous quorum
+ */
+ bool is_updating_previous() const { return state == STATE_UPDATING_PREVIOUS; }
+
+ /// @return 'true' if we are writing an update to disk
+ bool is_writing() const { return state == STATE_WRITING; }
+
+ /// @return 'true' if we are writing an update-previous to disk
+ bool is_writing_previous() const { return state == STATE_WRITING_PREVIOUS; }
+
+ /// @return 'true' if we are refreshing an update just committed
+ bool is_refresh() const { return state == STATE_REFRESH; }
+
+ /// @return 'true' if we are in the process of shutting down
+ bool is_shutdown() const { return state == STATE_SHUTDOWN; }
+
+private:
+ /**
+ * @defgroup Paxos_h_recovery_vars Common recovery-related member variables
+ * @note These variables are common to both the Leader and the Peons.
+ * @{
+ */
+ /**
+ *
+ */
+ version_t first_committed;
+ /**
+ * Last Proposal Number
+ *
+ * @todo Expand description
+ */
+ version_t last_pn;
+ /**
+ * Last committed value's version.
+ *
+ * On both the Leader and the Peons, this is the last value's version that
+ * was accepted by a given quorum and thus committed, that this instance
+ * knows about.
+ *
+ * @note It may not be the last committed value's version throughout the
+ * system. If we are a Peon, we may have not been part of the quorum
+ * that accepted the value, and for this very same reason we may still
+ * be a (couple of) version(s) behind, until we learn about the most
+ * recent version. This should only happen if we are not active (i.e.,
+ * part of the quorum), which should not happen if we are up, running
+ * and able to communicate with others -- thus able to be part of the
+ * monmap and trigger new elections.
+ */
+ version_t last_committed;
+ /**
+ * Last committed value's time.
+ *
+ * When the commit finished.
+ */
+ utime_t last_commit_time;
+ /**
+ * The last Proposal Number we have accepted.
+ *
+ * On the Leader, it will be the Proposal Number picked by the Leader
+ * itself. On the Peon, however, it will be the proposal sent by the Leader
+ * and it will only be updated if its value is higher than the one
+ * already known by the Peon.
+ */
+ version_t accepted_pn;
+ /**
+ * The last_committed epoch of the leader at the time we accepted the last pn.
+ *
+ * This has NO SEMANTIC MEANING, and is there only for the debug output.
+ */
+ version_t accepted_pn_from;
+ /**
+ * Map holding the first committed version by each quorum member.
+ *
+ * The versions kept in this map are updated during the collect phase.
+ * When the Leader starts the collect phase, each Peon will reply with its
+ * first committed version, which will then be kept in this map.
+ */
+ map<int,version_t> peer_first_committed;
+ /**
+ * Map holding the last committed version by each quorum member.
+ *
+ * The versions kept in this map are updated during the collect phase.
+ * When the Leader starts the collect phase, each Peon will reply with its
+ * last committed version, which will then be kept in this map.
+ */
+ map<int,version_t> peer_last_committed;
+ /**
+ * @}
+ */
+
+ // active (phase 2)
+ /**
+ * @defgroup Paxos_h_active_vars Common active-related member variables
+ * @{
+ */
+ /**
+ * When does our read lease expires.
+ *
+ * Instead of performing a full commit each time a read is requested, we
+ * keep leases. Each lease will have an expiration date, which may or may
+ * not be extended.
+ */
+ utime_t lease_expire;
+ /**
+ * List of callbacks waiting for our state to change into STATE_ACTIVE.
+ */
+ list<Context*> waiting_for_active;
+ /**
+ * List of callbacks waiting for the chance to read a version from us.
+ *
+ * Each entry on the list may result from an attempt to read a version that
+ * wasn't available at the time, or an attempt made during a period during
+ * which we could not satisfy the read request. The first case happens if
+ * the requested version is greater than our last committed version. The
+ * second scenario may happen if we are recovering, or if we don't have a
+ * valid lease.
+ *
+ * The list will be woken up once we change to STATE_ACTIVE with an extended
+ * lease -- which can be achieved if we have everyone on the quorum on board
+ * with the latest proposal, or if we don't really care about the remaining
+ * uncommitted values --, or if we're on a quorum of one.
+ */
+ list<Context*> waiting_for_readable;
+ /**
+ * @}
+ */
+
+ // -- leader --
+ // recovery (paxos phase 1)
+ /**
+ * @defgroup Paxos_h_leader_recovery Leader-specific Recovery-related vars
+ * @{
+ */
+ /**
+ * Number of replies to the collect phase we've received so far.
+ *
+ * This variable is reset to 1 each time we start a collect phase; it is
+ * incremented each time we receive a reply to the collect message, and
+ * is used to determine whether or not we have received replies from the
+ * whole quorum.
+ */
+ unsigned num_last;
+ /**
+ * Uncommitted value's version.
+ *
+ * If we have, or end up knowing about, an uncommitted value, then its
+ * version will be kept in this variable.
+ *
+ * @note If this version equals @p last_committed+1 when we reach the final
+ * steps of recovery, then the algorithm will assume this is a value
+ * the Leader does not know about, and trustingly the Leader will
+ * propose this version's value.
+ */
+ version_t uncommitted_v;
+ /**
+ * Uncommitted value's Proposal Number.
+ *
+ * We use this variable to assess if the Leader should take into consideration
+ * an uncommitted value sent by a Peon. Given that the Peon will send back to
+ * the Leader the last Proposal Number it accepted, the Leader will be able
+ * to infer if this value is more recent than the one the Leader has, thus
+ * more relevant.
+ */
+ version_t uncommitted_pn;
+ /**
+ * Uncommitted Value.
+ *
+ * If the system fails in-between the accept replies from the Peons and the
+ * instruction to commit from the Leader, then we may end up with accepted
+ * but yet-uncommitted values. During the Leader's recovery, it will attempt
+ * to bring the whole system to the latest state, and that means committing
+ * past accepted but uncommitted values.
+ *
+ * This variable will hold an uncommitted value, which may originate either
+ * on the Leader, or learnt by the Leader from a Peon during the collect
+ * phase.
+ */
+ bufferlist uncommitted_value;
+ /**
+ * Used to specify when an on-going collect phase times out.
+ */
+ Context *collect_timeout_event;
+ /**
+ * @}
+ */
+
+ // active
+ /**
+ * @defgroup Paxos_h_leader_active Leader-specific Active-related vars
+ * @{
+ */
+ /**
+ * Set of participants (Leader & Peons) that have acked a lease extension.
+ *
+ * Each Peon that acknowledges a lease extension will have its place in this
+ * set, which will be used to account for all the acks from all the quorum
+ * members, guaranteeing that we trigger new elections if some don't ack in
+ * the expected timeframe.
+ */
+ set<int> acked_lease;
+ /**
+ * Callback responsible for extending the lease periodically.
+ */
+ Context *lease_renew_event;
+ /**
+ * Callback to trigger new elections once the time for acks is out.
+ */
+ Context *lease_ack_timeout_event;
+ /**
+ * @}
+ */
+ /**
+ * @defgroup Paxos_h_peon_active Peon-specific Active-related vars
+ * @{
+ */
+ /**
+ * Callback to trigger new elections when the Peon's lease times out.
+ *
+ * If the Peon's lease is extended, this callback will be reset (i.e.,
+ * we cancel the event and reschedule a new one with starting from the
+ * beginning).
+ */
+ Context *lease_timeout_event;
+ /**
+ * @}
+ */
+
+ // updating (paxos phase 2)
+ /**
+ * @defgroup Paxos_h_leader_updating Leader-specific Updating-related vars
+ * @{
+ */
+ /**
+ * New Value being proposed to the Peons.
+ *
+ * This bufferlist holds the value the Leader is proposing to the Peons, and
+ * that will be committed if the Peons do accept the proposal.
+ */
+ bufferlist new_value;
+ /**
+ * Set of participants (Leader & Peons) that accepted the new proposed value.
+ *
+ * This set is used to keep track of those who have accepted the proposed
+ * value, so the leader may know when to issue a commit (when a majority of
+ * participants has accepted the proposal), and when to extend the lease
+ * (when all the quorum members have accepted the proposal).
+ */
+ set<int> accepted;
+ /**
+ * Callback to trigger a new election if the proposal is not accepted by the
+ * full quorum within a given timeframe.
+ *
+ * If the full quorum does not accept the proposal, then it means that the
+ * Leader may no longer be recognized as the leader, or that the quorum has
+ * changed, and the value may have not reached all the participants. Thus,
+ * the leader must call new elections, and go through a recovery phase in
+ * order to propagate the new value throughout the system.
+ *
+ * This does not mean that we won't commit. We will commit as soon as we
+ * have a majority of acceptances. But if we do not have full acceptance
+ * from the quorum, then we cannot extend the lease, as some participants
+ * may not have the latest committed value.
+ */
+ Context *accept_timeout_event;
+
+ /**
+ * List of callbacks waiting for it to be possible to write again.
+ *
+ * @remarks It is not possible to write if we are not the Leader, or we are
+ * not on the active state, or if the lease has expired.
+ */
+ list<Context*> waiting_for_writeable;
+
+ /**
+ * Pending proposal transaction
+ *
+ * This is the transaction that is under construction and pending
+ * proposal. We will add operations to it until we decide it is
+ * time to start a paxos round.
+ */
+ MonitorDBStore::TransactionRef pending_proposal;
+
+ /**
+ * Finishers for pending transaction
+ *
+ * These are waiting for updates in the pending proposal/transaction
+ * to be committed.
+ */
+ list<Context*> pending_finishers;
+
+ /**
+ * Finishers for committing transaction
+ *
+ * When the pending_proposal is submitted, pending_finishers move to
+ * this list. When it commits, these finishers are notified.
+ */
+ list<Context*> committing_finishers;
+ /**
+ * This function re-triggers pending_ and committing_finishers
+ * safely, so as to maintain existing system invariants. In particular
+ * we maintain ordering by triggering committing before pending, and
+ * we clear out pending_finishers prior to any triggers so that
+ * we don't trigger asserts on them being empty. You should
+ * use it instead of sending -EAGAIN to them with finish_contexts.
+ */
+ void reset_pending_committing_finishers();
+
+ /**
+ * @defgroup Paxos_h_sync_warns Synchronization warnings
+ * @todo Describe these variables
+ * @{
+ */
+ utime_t last_clock_drift_warn;
+ int clock_drift_warned;
+ /**
+ * @}
+ */
+
+ /**
+ * Should be true if we have proposed to trim, or are in the middle of
+ * trimming; false otherwise.
+ */
+ bool trimming;
+
+ /**
+ * true if we want trigger_propose to *not* propose (yet)
+ */
+ bool plugged = false;
+
+ /**
+ * @defgroup Paxos_h_callbacks Callback classes.
+ * @{
+ */
+ /**
+ * Callback class responsible for handling a Collect Timeout.
+ */
+ class C_CollectTimeout;
+ /**
+ * Callback class responsible for handling an Accept Timeout.
+ */
+ class C_AcceptTimeout;
+ /**
+ * Callback class responsible for handling a Lease Ack Timeout.
+ */
+ class C_LeaseAckTimeout;
+
+ /**
+ * Callback class responsible for handling a Lease Timeout.
+ */
+ class C_LeaseTimeout;
+
+ /**
+ * Callback class responsible for handling a Lease Renew Timeout.
+ */
+ class C_LeaseRenew;
+
+ class C_Trimmed;
+ /**
+ *
+ */
+public:
+ class C_Proposal : public Context {
+ Context *proposer_context;
+ public:
+ bufferlist bl;
+ // for debug purposes. Will go away. Soon.
+ bool proposed;
+ utime_t proposal_time;
+
+ C_Proposal(Context *c, bufferlist& proposal_bl) :
+ proposer_context(c),
+ bl(proposal_bl),
+ proposed(false),
+ proposal_time(ceph_clock_now())
+ { }
+
+ void finish(int r) override {
+ if (proposer_context) {
+ proposer_context->complete(r);
+ proposer_context = NULL;
+ }
+ }
+ };
+ /**
+ * @}
+ */
+private:
+ /**
+ * @defgroup Paxos_h_election_triggered Steps triggered by an election.
+ *
+ * @note All these functions play a significant role in the Recovery Phase,
+ * which is triggered right after an election once someone becomes
+ * the Leader.
+ * @{
+ */
+ /**
+ * Create a new Proposal Number and propose it to the Peons.
+ *
+ * This function starts the Recovery Phase, which can be directly mapped
+ * onto the original Paxos' Prepare phase. Basically, we'll generate a
+ * Proposal Number, taking @p oldpn into consideration, and we will send
+ * it to a quorum, along with our first and last committed versions. By
+ * sending these information in a message to the quorum, we expect to
+ * obtain acceptances from a majority, allowing us to commit, or be
+ * informed of a higher Proposal Number known by one or more of the Peons
+ * in the quorum.
+ *
+ * @pre We are the Leader.
+ * @post Recovery Phase initiated by sending messages to the quorum.
+ *
+ * @param oldpn A proposal number taken as the highest known so far, that
+ * should be taken into consideration when generating a new
+ * Proposal Number for the Recovery Phase.
+ */
+ void collect(version_t oldpn);
+ /**
+ * Handle the reception of a collect message from the Leader and reply
+ * accordingly.
+ *
+ * Once a Peon receives a collect message from the Leader it will reply
+ * with its first and last committed versions, as well as information so
+ * the Leader may know if its Proposal Number was, or was not, accepted by
+ * the Peon. The Peon will accept the Leader's Proposal Number if it is
+ * higher than the Peon's currently accepted Proposal Number. The Peon may
+ * also inform the Leader of accepted but uncommitted values.
+ *
+ * @invariant The message is an operation of type OP_COLLECT.
+ * @pre We are a Peon.
+ * @post Replied to the Leader, accepting or not accepting its PN.
+ *
+ * @param collect The collect message sent by the Leader to the Peon.
+ */
+ void handle_collect(MonOpRequestRef op);
+ /**
+ * Handle a response from a Peon to the Leader's collect phase.
+ *
+ * The received message will state the Peon's last committed version, as
+ * well as its last proposal number. This will lead to one of the following
+ * scenarios: if the replied Proposal Number is equal to the one we proposed,
+ * then the Peon has accepted our proposal, and if all the Peons do accept
+ * our Proposal Number, then we are allowed to proceed with the commit;
+ * however, if a Peon replies with a higher Proposal Number, we assume he
+ * knows something we don't and the Leader will have to abort the current
+ * proposal in order to retry with the Proposal Number specified by the Peon.
+ * It may also occur that the Peon replied with a lower Proposal Number, in
+ * which case we assume it is a reply to an older value and we'll simply
+ * drop it.
+ * This function will also check if the Peon replied with an accepted but
+ * yet uncommitted value. In this case, if its version is higher than our
+ * last committed value by one, we assume that the Peon knows a value from a
+ * previous proposal that has never been committed, and we should try to
+ * commit that value by proposing it next. On the other hand, if that is
+ * not the case, we'll assume it is an old, uncommitted value, we do not
+ * care about and we'll consider the system active by extending the leases.
+ *
+ * @invariant The message is an operation of type OP_LAST.
+ * @pre We are the Leader.
+ * @post We initiate a commit, or we retry with a higher Proposal Number,
+ * or we drop the message.
+ * @post We move from STATE_RECOVERING to STATE_ACTIVE.
+ *
+ * @param last The message sent by the Peon to the Leader.
+ */
+ void handle_last(MonOpRequestRef op);
+ /**
+ * The Recovery Phase timed out, meaning that a significant part of the
+ * quorum does not believe we are the Leader, and we thus should trigger new
+ * elections.
+ *
+ * @pre We believe to be the Leader.
+ * @post Trigger new elections.
+ */
+ void collect_timeout();
+ /**
+ * @}
+ */
+
+ /**
+ * @defgroup Paxos_h_updating_funcs Functions used during the Updating State
+ *
+ * These functions may easily be mapped to the original Paxos Algorithm's
+ * phases.
+ *
+ * Taking into account the algorithm can be divided in 4 phases (Prepare,
+ * Promise, Accept Request and Accepted), we can easily map Paxos::begin to
+ * both the Prepare and Accept Request phases; the Paxos::handle_begin to
+ * the Promise phase; and the Paxos::handle_accept to the Accepted phase.
+ * @{
+ */
+ /**
+ * Start a new proposal with the intent of committing @p value.
+ *
+ * If we are alone on the system (i.e., a quorum of one), then we will
+ * simply commit the value, but if we are not alone, then we need to propose
+ * the value to the quorum.
+ *
+ * @pre We are the Leader
+ * @pre We are on STATE_ACTIVE
+ * @post We commit, if we are alone, or we send a message to each quorum
+ * member
+ * @post We are on STATE_ACTIVE, if we are alone, or on
+ * STATE_UPDATING otherwise
+ *
+ * @param value The value being proposed to the quorum
+ */
+ void begin(bufferlist& value);
+ /**
+ * Accept or decline (by ignoring) a proposal from the Leader.
+ *
+ * We will decline the proposal (by ignoring it) if we have promised to
+ * accept a higher numbered proposal. If that is not the case, we will
+ * accept it and accordingly reply to the Leader.
+ *
+ * @pre We are a Peon
+ * @pre We are on STATE_ACTIVE
+ * @post We are on STATE_UPDATING if we accept the Leader's proposal
+ * @post We send a reply message to the Leader if we accept its proposal
+ *
+ * @invariant The received message is an operation of type OP_BEGIN
+ *
+ * @param begin The message sent by the Leader to the Peon during the
+ * Paxos::begin function
+ *
+ */
+ void handle_begin(MonOpRequestRef op);
+ /**
+ * Handle an Accept message sent by a Peon.
+ *
+ * In order to commit, the Leader has to receive accepts from a majority of
+ * the quorum. If that does happen, then the Leader may proceed with the
+ * commit. However, the Leader needs the accepts from all the quorum members
+ * in order to extend the lease and move on to STATE_ACTIVE.
+ *
+ * This function handles these two situations, accounting for the amount of
+ * received accepts.
+ *
+ * @pre We are the Leader
+ * @pre We are on STATE_UPDATING
+ * @post We are on STATE_ACTIVE if we received accepts from the full quorum
+ * @post We extended the lease if we moved on to STATE_ACTIVE
+ * @post We are on STATE_UPDATING if we didn't received accepts from the
+ * full quorum
+ * @post We have committed if we received accepts from a majority
+ *
+ * @invariant The received message is an operation of type OP_ACCEPT
+ *
+ * @param accept The message sent by the Peons to the Leader during the
+ * Paxos::handle_begin function
+ */
+ void handle_accept(MonOpRequestRef op);
+ /**
+ * Trigger a fresh election.
+ *
+ * During Paxos::begin we set a Callback of type Paxos::C_AcceptTimeout in
+ * order to limit the amount of time we spend waiting for Accept replies.
+ * This callback will call Paxos::accept_timeout when it is fired.
+ *
+ * This is essential to the algorithm because there may be the chance that
+ * we are no longer the Leader (i.e., others don't believe in us) and we
+ * are getting ignored, or we dropped out of the quorum and haven't realised
+ * it. So, our only option is to trigger fresh elections.
+ *
+ * @pre We are the Leader
+ * @pre We are on STATE_UPDATING
+ * @post Triggered fresh elections
+ */
+ void accept_timeout();
+ /**
+ * @}
+ */
+
+
+ utime_t commit_start_stamp;
+ friend struct C_Committed;
+
+ /**
+ * Commit a value throughout the system.
+ *
+ * The Leader will cancel the current lease (as it was for the old value),
+ * and will store the committed value locally. It will then instruct every
+ * quorum member to do so as well.
+ *
+ * @pre We are the Leader
+ * @pre We are on STATE_UPDATING
+ * @pre A majority of quorum members accepted our proposal
+ * @post Value locally stored
+ * @post Quorum members instructed to commit the new value.
+ */
+ void commit_start();
+ void commit_finish(); ///< finish a commit after txn becomes durable
+ void abort_commit(); ///< Handle commit finish after shutdown started
+ /**
+ * Commit the new value to stable storage as being the latest available
+ * version.
+ *
+ * @pre We are a Peon
+ * @post The new value is locally stored
+ * @post Fire up the callbacks waiting on waiting_for_commit
+ *
+ * @invariant The received message is an operation of type OP_COMMIT
+ *
+ * @param commit The message sent by the Leader to the Peon during
+ * Paxos::commit
+ */
+ void handle_commit(MonOpRequestRef op);
+ /**
+ * Extend the system's lease.
+ *
+ * This means that the Leader considers that it should now safe to read from
+ * any node on the system, since every quorum member is now in possession of
+ * the latest version. Therefore, the Leader will send a message stating just
+ * this to each quorum member, and will impose a limited timeframe during
+ * which acks will be accepted. If there aren't as many acks as expected
+ * (i.e, if at least one quorum member does not ack the lease) during this
+ * timeframe, then we will force fresh elections.
+ *
+ * @pre We are the Leader
+ * @pre We are on STATE_ACTIVE
+ * @post A message extending the lease is sent to each quorum member
+ * @post A timeout callback is set to limit the amount of time we will wait
+ * for lease acks.
+ * @post A timer is set in order to renew the lease after a certain amount
+ * of time.
+ */
+ void extend_lease();
+ /**
+ * Update the lease on the Peon's side of things.
+ *
+ * Once a Peon receives a Lease message, it will update its lease_expire
+ * variable, reply to the Leader acknowledging the lease update and set a
+ * timeout callback to be fired upon the lease's expiration. Finally, the
+ * Peon will fire up all the callbacks waiting for it to become active,
+ * which it just did, and all those waiting for it to become readable,
+ * which should be true if the Peon's lease didn't expire in the mean time.
+ *
+ * @pre We are a Peon
+ * @post We update the lease accordingly
+ * @post A lease timeout callback is set
+ * @post Move to STATE_ACTIVE
+ * @post Fire up all the callbacks waiting for STATE_ACTIVE
+ * @post Fire up all the callbacks waiting for readable if we are readable
+ * @post Ack the lease to the Leader
+ *
+ * @invariant The received message is an operation of type OP_LEASE
+ *
+ * @param lease The message sent by the Leader to the Peon during the
+ * Paxos::extend_lease function
+ */
+ void handle_lease(MonOpRequestRef op);
+ /**
+ * Account for all the Lease Acks the Leader receives from the Peons.
+ *
+ * Once the Leader receives all the Lease Acks from the Peons, it will be
+ * able to cancel the Lease Ack timeout callback, thus avoiding calling
+ * fresh elections.
+ *
+ * @pre We are the Leader
+ * @post Cancel the Lease Ack timeout callback if we receive acks from all
+ * the quorum members
+ *
+ * @invariant The received message is an operation of type OP_LEASE_ACK
+ *
+ * @param ack The message sent by a Peon to the Leader during the
+ * Paxos::handle_lease function
+ */
+ void handle_lease_ack(MonOpRequestRef op);
+ /**
+ * Call fresh elections because at least one Peon didn't acked our lease.
+ *
+ * @pre We are the Leader
+ * @pre We are on STATE_ACTIVE
+ * @post Trigger fresh elections
+ */
+ void lease_ack_timeout();
+ /**
+ * Extend lease since we haven't had new committed values meanwhile.
+ *
+ * @pre We are the Leader
+ * @pre We are on STATE_ACTIVE
+ * @post Go through with Paxos::extend_lease
+ */
+ void lease_renew_timeout();
+ /**
+ * Call fresh elections because the Peon's lease expired without being
+ * renewed or receiving a fresh lease.
+ *
+ * This means that the Peon is no longer assumed as being in the quorum
+ * (or there is no Leader to speak of), so just trigger fresh elections
+ * to circumvent this issue.
+ *
+ * @pre We are a Peon
+ * @post Trigger fresh elections
+ */
+ void lease_timeout(); // on peon, if lease isn't extended
+
+ /// restart the lease timeout timer
+ void reset_lease_timeout();
+
+ /**
+ * Cancel all of Paxos' timeout/renew events.
+ */
+ void cancel_events();
+ /**
+ * Shutdown this Paxos machine
+ */
+ void shutdown();
+
+ /**
+ * Generate a new Proposal Number based on @p gt
+ *
+ * @todo Check what @p gt actually means and what its usage entails
+ * @param gt A hint for the geration of the Proposal Number
+ * @return A globally unique, monotonically increasing Proposal Number
+ */
+ version_t get_new_proposal_number(version_t gt=0);
+
+ /**
+ * @todo document sync function
+ */
+ void warn_on_future_time(utime_t t, entity_name_t from);
+
+ /**
+ * Begin proposing the pending_proposal.
+ */
+ void propose_pending();
+
+ /**
+ * refresh state from store
+ *
+ * Called when we have new state for the mon to consume. If we return false,
+ * abort (we triggered a bootstrap).
+ *
+ * @returns true on success, false if we are now bootstrapping
+ */
+ bool do_refresh();
+
+ void commit_proposal();
+ void finish_round();
+
+public:
+ /**
+ * @param m A monitor
+ * @param name A name for the paxos service. It serves as the naming space
+ * of the underlying persistent storage for this service.
+ */
+ Paxos(Monitor *m, const string &name)
+ : mon(m),
+ logger(NULL),
+ paxos_name(name),
+ state(STATE_RECOVERING),
+ first_committed(0),
+ last_pn(0),
+ last_committed(0),
+ accepted_pn(0),
+ accepted_pn_from(0),
+ num_last(0),
+ uncommitted_v(0), uncommitted_pn(0),
+ collect_timeout_event(0),
+ lease_renew_event(0),
+ lease_ack_timeout_event(0),
+ lease_timeout_event(0),
+ accept_timeout_event(0),
+ clock_drift_warned(0),
+ trimming(false) { }
+
+ ~Paxos() {
+ delete logger;
+ }
+
+ const string get_name() const {
+ return paxos_name;
+ }
+
+ void dispatch(MonOpRequestRef op);
+
+ void read_and_prepare_transactions(MonitorDBStore::TransactionRef tx,
+ version_t from, version_t last);
+
+ void init();
+
+ /**
+ * dump state info to a formatter
+ */
+ void dump_info(Formatter *f);
+
+ /**
+ * This function runs basic consistency checks. Importantly, if
+ * it is inconsistent and shouldn't be, it asserts out.
+ *
+ * @return True if consistent, false if not.
+ */
+ bool is_consistent();
+
+ void restart();
+ /**
+ * Initiate the Leader after it wins an election.
+ *
+ * Once an election is won, the Leader will be initiated and there are two
+ * possible outcomes of this method: the Leader directly jumps to the active
+ * state (STATE_ACTIVE) if it believes to be the only one in the quorum, or
+ * will start recovering (STATE_RECOVERING) by initiating the collect phase.
+ *
+ * @pre Our monitor is the Leader.
+ * @post We are either on STATE_ACTIVE if we're the only one in the quorum,
+ * or on STATE_RECOVERING otherwise.
+ */
+ void leader_init();
+ /**
+ * Initiate a Peon after it loses an election.
+ *
+ * If we are a Peon, then there must be a Leader and we are not alone in the
+ * quorum, thus automatically assume we are on STATE_RECOVERING, which means
+ * we will soon be enrolled into the Leader's collect phase.
+ *
+ * @pre There is a Leader, and it?s about to start the collect phase.
+ * @post We are on STATE_RECOVERING and will soon receive collect phase's
+ * messages.
+ */
+ void peon_init();
+
+ /**
+ * Include an incremental state of values, ranging from peer_first_committed
+ * to the last committed value, on the message m
+ *
+ * @param m A message
+ * @param peer_first_committed Lowest version to take into account
+ * @param peer_last_committed Highest version to take into account
+ */
+ void share_state(MMonPaxos *m, version_t peer_first_committed,
+ version_t peer_last_committed);
+ /**
+ * Store on disk a state that was shared with us
+ *
+ * Basically, we received a set of version. Or just one. It doesn't matter.
+ * What matters is that we have to stash it in the store. So, we will simply
+ * write every single bufferlist into their own versions on our side (i.e.,
+ * onto paxos-related keys), and then we will decode those same bufferlists
+ * we just wrote and apply the transactions they hold. We will also update
+ * our first and last committed values to point to the new values, if need
+ * be. All this is done tightly wrapped in a transaction to ensure we
+ * enjoy the atomicity guarantees given by our awesome k/v store.
+ *
+ * @param m A message
+ * @returns true if we stored something new; false otherwise
+ */
+ bool store_state(MMonPaxos *m);
+ void _sanity_check_store();
+
+ /**
+ * Helper function to decode a bufferlist into a transaction and append it
+ * to another transaction.
+ *
+ * This function is used during the Leader's commit and during the
+ * Paxos::store_state in order to apply the bufferlist's transaction onto
+ * the store.
+ *
+ * @param t The transaction to which we will append the operations
+ * @param bl A bufferlist containing an encoded transaction
+ */
+ static void decode_append_transaction(MonitorDBStore::TransactionRef t,
+ bufferlist& bl) {
+ auto vt(std::make_shared<MonitorDBStore::Transaction>());
+ auto it = bl.cbegin();
+ vt->decode(it);
+ t->append(vt);
+ }
+
+ /**
+ * @todo This appears to be used only by the OSDMonitor, and I would say
+ * its objective is to allow a third-party to have a "private"
+ * state dir. -JL
+ */
+ void add_extra_state_dir(string s) {
+ extra_state_dirs.push_back(s);
+ }
+
+ // -- service interface --
+ /**
+ * Add c to the list of callbacks waiting for us to become active.
+ *
+ * @param c A callback
+ */
+ void wait_for_active(MonOpRequestRef op, Context *c) {
+ if (op)
+ op->mark_event("paxos:wait_for_active");
+ waiting_for_active.push_back(c);
+ }
+ void wait_for_active(Context *c) {
+ MonOpRequestRef o;
+ wait_for_active(o, c);
+ }
+
+ /**
+ * Trim the Paxos state as much as we can.
+ */
+ void trim();
+
+ /**
+ * Check if we should trim.
+ *
+ * If trimming is disabled, we must take that into consideration and only
+ * return true if we are positively sure that we should trim soon.
+ *
+ * @returns true if we should trim; false otherwise.
+ */
+ bool should_trim() {
+ int available_versions = get_version() - get_first_committed();
+ int maximum_versions = g_conf()->paxos_min + g_conf()->paxos_trim_min;
+
+ if (trimming || (available_versions <= maximum_versions))
+ return false;
+
+ return true;
+ }
+
+ bool is_plugged() const {
+ return plugged;
+ }
+ void plug() {
+ ceph_assert(plugged == false);
+ plugged = true;
+ }
+ void unplug() {
+ ceph_assert(plugged == true);
+ plugged = false;
+ }
+
+ // read
+ /**
+ * @defgroup Paxos_h_read_funcs Read-related functions
+ * @{
+ */
+ /**
+ * Get latest committed version
+ *
+ * @return latest committed version
+ */
+ version_t get_version() { return last_committed; }
+ /**
+ * Get first committed version
+ *
+ * @return the first committed version
+ */
+ version_t get_first_committed() { return first_committed; }
+ /**
+ * Check if a given version is readable.
+ *
+ * A version may not be readable for a myriad of reasons:
+ * @li the version @e v is higher that the last committed version
+ * @li we are not the Leader nor a Peon (election may be on-going)
+ * @li we do not have a committed value yet
+ * @li we do not have a valid lease
+ *
+ * @param seen The version we want to check if it is readable.
+ * @return 'true' if the version is readable; 'false' otherwise.
+ */
+ bool is_readable(version_t seen=0);
+ /**
+ * Read version @e v and store its value in @e bl
+ *
+ * @param[in] v The version we want to read
+ * @param[out] bl The version's value
+ * @return 'true' if we successfully read the value; 'false' otherwise
+ */
+ bool read(version_t v, bufferlist &bl);
+ /**
+ * Read the latest committed version
+ *
+ * @param[out] bl The version's value
+ * @return the latest committed version if we successfully read the value;
+ * or 0 (zero) otherwise.
+ */
+ version_t read_current(bufferlist &bl);
+ /**
+ * Add onreadable to the list of callbacks waiting for us to become readable.
+ *
+ * @param onreadable A callback
+ */
+ void wait_for_readable(MonOpRequestRef op, Context *onreadable) {
+ ceph_assert(!is_readable());
+ if (op)
+ op->mark_event("paxos:wait_for_readable");
+ waiting_for_readable.push_back(onreadable);
+ }
+ void wait_for_readable(Context *onreadable) {
+ MonOpRequestRef o;
+ wait_for_readable(o, onreadable);
+ }
+ /**
+ * @}
+ */
+
+ /**
+ * Check if we have a valid lease.
+ *
+ * @returns true if the lease is still valid; false otherwise.
+ */
+ bool is_lease_valid();
+ // write
+ /**
+ * @defgroup Paxos_h_write_funcs Write-related functions
+ * @{
+ */
+ /**
+ * Check if we are writeable.
+ *
+ * We are writeable if we are alone (i.e., a quorum of one), or if we match
+ * all the following conditions:
+ * @li We are the Leader
+ * @li We are on STATE_ACTIVE
+ * @li We have a valid lease
+ *
+ * @return 'true' if we are writeable; 'false' otherwise.
+ */
+ bool is_writeable();
+ /**
+ * Add c to the list of callbacks waiting for us to become writeable.
+ *
+ * @param c A callback
+ */
+ void wait_for_writeable(MonOpRequestRef op, Context *c) {
+ ceph_assert(!is_writeable());
+ if (op)
+ op->mark_event("paxos:wait_for_writeable");
+ waiting_for_writeable.push_back(c);
+ }
+ void wait_for_writeable(Context *c) {
+ MonOpRequestRef o;
+ wait_for_writeable(o, c);
+ }
+
+ /**
+ * Get a transaction to submit operations to propose against
+ *
+ * Apply operations to this transaction. It will eventually be proposed
+ * to paxos.
+ */
+ MonitorDBStore::TransactionRef get_pending_transaction();
+
+ /**
+ * Queue a completion for the pending proposal
+ *
+ * This completion will get triggered when the pending proposal
+ * transaction commits.
+ */
+ void queue_pending_finisher(Context *onfinished);
+
+ /**
+ * (try to) trigger a proposal
+ *
+ * Tell paxos that it should submit the pending proposal. Note that if it
+ * is not active (e.g., because it is already in the midst of committing
+ * something) that will be deferred (e.g., until the current round finishes).
+ */
+ bool trigger_propose();
+ /**
+ * @}
+ */
+
+ /**
+ * @}
+ */
+ protected:
+ MonitorDBStore *get_store();
+};
+
+inline ostream& operator<<(ostream& out, Paxos::C_Proposal& p)
+{
+ string proposed = (p.proposed ? "proposed" : "unproposed");
+ out << " " << proposed
+ << " queued " << (ceph_clock_now() - p.proposal_time)
+ << " tx dump:\n";
+ auto t(std::make_shared<MonitorDBStore::Transaction>());
+ auto p_it = p.bl.cbegin();
+ t->decode(p_it);
+ JSONFormatter f(true);
+ t->dump(&f);
+ f.flush(out);
+ return out;
+}
+
+#endif
+