summaryrefslogtreecommitdiffstats
path: root/src/osd/PeeringState.h
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--src/osd/PeeringState.h2521
1 files changed, 2521 insertions, 0 deletions
diff --git a/src/osd/PeeringState.h b/src/osd/PeeringState.h
new file mode 100644
index 000000000..6901ab506
--- /dev/null
+++ b/src/osd/PeeringState.h
@@ -0,0 +1,2521 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <boost/statechart/custom_reaction.hpp>
+#include <boost/statechart/event.hpp>
+#include <boost/statechart/simple_state.hpp>
+#include <boost/statechart/state.hpp>
+#include <boost/statechart/state_machine.hpp>
+#include <boost/statechart/transition.hpp>
+#include <boost/statechart/event_base.hpp>
+#include <string>
+#include <atomic>
+
+#include "include/ceph_assert.h"
+#include "include/common_fwd.h"
+
+#include "PGLog.h"
+#include "PGStateUtils.h"
+#include "PGPeeringEvent.h"
+#include "osd_types.h"
+#include "osd_types_fmt.h"
+#include "os/ObjectStore.h"
+#include "OSDMap.h"
+#include "MissingLoc.h"
+#include "osd/osd_perf_counters.h"
+#include "common/ostream_temp.h"
+
+/// A PG's cached copy of its pool's identity and settings, together with
+/// the epoch of the OSDMap the copy was taken from.
+struct PGPool {
+  epoch_t cached_epoch;   ///< initialised from map->get_epoch()
+  int64_t id;
+  std::string name;
+
+  pg_pool_t info;
+  SnapContext snapc;   // the default pool snapc, ready to go.
+
+  /// Capture the pool description `info` as seen in `map`.
+  PGPool(OSDMapRef map, int64_t i, const pg_pool_t& info,
+         const std::string& name)
+    : cached_epoch(map->get_epoch()),
+      id(i),
+      name(name),
+      info(info),
+      // `info` below is the constructor argument, not the member
+      snapc(info.get_snap_context())
+  {}
+
+  void update(OSDMapRef map);
+
+  /// Interval used for read leases on this pool: the pool's
+  /// READ_LEASE_INTERVAL option when it is set, otherwise
+  /// osd_heartbeat_grace * osd_pool_default_read_lease_ratio from conf.
+  ceph::timespan get_readable_interval(ConfigProxy &conf) const {
+    double pool_override = 0;
+    if (!info.opts.get(pool_opts_t::READ_LEASE_INTERVAL, &pool_override)) {
+      // no per-pool value: derive one from the OSD-wide settings
+      auto grace = conf->osd_heartbeat_grace;
+      auto ratio = conf->osd_pool_default_read_lease_ratio;
+      return ceph::make_timespan(grace * ratio);
+    }
+    return ceph::make_timespan(pool_override);
+  }
+};
+
+/// {fmt} formatter for PGPool; renders "<id>/<name>(<info>)".
+template <>
+struct fmt::formatter<PGPool> {
+  /// No format-spec options are supported; the spec is left untouched.
+  template <typename ParseContext>
+  constexpr auto parse(ParseContext& ctx) { return ctx.begin(); }
+
+  /// Write the pool to ctx.out().  Declared const because {fmt} may invoke
+  /// format() through a const formatter object (required by newer releases).
+  template <typename FormatContext>
+  auto format(const PGPool& pool, FormatContext& ctx) const
+  {
+    return fmt::format_to(ctx.out(),
+                          "{}/{}({})",
+                          pool.id,
+                          pool.name,
+                          pool.info);
+  }
+};
+
+struct PeeringCtx;
+
+// [primary only] content recovery state
+/// Per-target-OSD queues of outgoing messages, keyed by OSD id.
+struct BufferedRecoveryMessages {
+#if defined(WITH_SEASTAR)
+  std::map<int, std::vector<MessageURef>> message_map;
+#else
+  std::map<int, std::vector<MessageRef>> message_map;
+#endif
+
+  BufferedRecoveryMessages() = default;
+  BufferedRecoveryMessages(PeeringCtx &ctx);
+
+  /// Take over every queue held by m.  For each target the messages from m
+  /// end up ahead of the ones already queued here; m's vectors are left
+  /// empty (its map keys remain).
+  void accept_buffered_messages(BufferedRecoveryMessages &m) {
+    for (auto &entry : m.message_map) {
+      auto &incoming = entry.second;
+      auto &mine = message_map[entry.first];
+      // put buffered messages in front
+      incoming.reserve(incoming.size() + mine.size());
+      incoming.insert(
+        incoming.end(),
+        std::make_move_iterator(mine.begin()),
+        std::make_move_iterator(mine.end()));
+      mine.clear();
+      mine.swap(incoming);
+    }
+  }
+
+  /// Queue m for delivery to osd `target`.
+  template <class MsgT> // MsgT = MessageRef for ceph-osd and MessageURef for crimson-osd
+  void send_osd_message(int target, MsgT&& m) {
+    auto &queue = message_map[target];
+    queue.emplace_back(std::forward<MsgT>(m));
+  }
+  void send_notify(int to, const pg_notify_t &n);
+  void send_query(int to, spg_t spgid, const pg_query_t &q);
+  void send_info(int to, spg_t to_spgid,
+                 epoch_t min_epoch, epoch_t cur_epoch,
+                 const pg_info_t &info,
+                 std::optional<pg_lease_t> lease = {},
+                 std::optional<pg_lease_ack_t> lease_ack = {});
+};
+
+/// Reference-counted record, for one peer OSD, of the bounds on the clock
+/// delta to that peer and of the highest up_from seen from it.  Every
+/// member function below takes `lock` for its whole body.
+struct HeartbeatStamps : public RefCountedObject {
+  mutable ceph::mutex lock = ceph::make_mutex("HeartbeatStamps::lock");
+
+  const int osd;
+
+  // we maintain an upper and lower bound on the delta between our local
+  // mono_clock time (minus the startup_time) to the peer OSD's mono_clock
+  // time (minus its startup_time).
+  //
+  // delta is (remote_clock_time - local_clock_time), so that
+  // local_time + delta -> peer_time, and peer_time - delta -> local_time.
+  //
+  // we have an upper and lower bound value on this delta, meaning the
+  // value of the remote clock is somewhere between [my_time + lb, my_time + ub]
+  //
+  // conversely, if we have a remote timestamp T, then that is
+  // [T - ub, T - lb] in terms of the local clock.  i.e., if you are
+  // subtracting the delta, then take care that you swap the role of the
+  // lb and ub values.
+
+  /// lower bound on peer clock - local clock
+  std::optional<ceph::signedspan> peer_clock_delta_lb;
+
+  /// upper bound on peer clock - local clock
+  std::optional<ceph::signedspan> peer_clock_delta_ub;
+
+  /// highest up_from we've seen from this rank
+  epoch_t up_from = 0;
+
+  /// Write "hbstamp(osd.N up_from E peer_clock_delta [lb,ub])" to out;
+  /// an unset bound is printed as nothing.
+  void print(std::ostream& out) const {
+    std::lock_guard l(lock);
+    out << "hbstamp(osd." << osd << " up_from " << up_from
+        << " peer_clock_delta [";
+    if (peer_clock_delta_lb) {
+      out << *peer_clock_delta_lb;
+    }
+    out << ",";
+    if (peer_clock_delta_ub) {
+      out << *peer_clock_delta_ub;
+    }
+    out << "])";
+  }
+
+  /// Fill *delta_ub with the bound to send along with a ping.
+  /// *delta_ub is left untouched when no lower bound is known yet.
+  void sent_ping(std::optional<ceph::signedspan> *delta_ub) {
+    std::lock_guard l(lock);
+    // the non-primaries need a lower bound on remote clock - local clock.  if
+    // we assume the transit for the last ping_reply was
+    // instantaneous, that would be (the negative of) our last
+    // peer_clock_delta_lb value.
+    if (peer_clock_delta_lb) {
+      *delta_ub = - *peer_clock_delta_lb;
+    }
+  }
+
+  /// Record a ping received from the peer.
+  ///
+  /// A ping whose this_up_from is older than the stored up_from is ignored
+  /// entirely; in that case *out_delta_ub is NOT written, so callers must
+  /// initialise it themselves.  Otherwise up_from is raised if needed, both
+  /// bounds are replaced, and *out_delta_ub receives -peer_clock_delta_lb.
+  void got_ping(epoch_t this_up_from,
+                ceph::signedspan now,
+                ceph::signedspan peer_send_stamp,
+                std::optional<ceph::signedspan> delta_ub,
+                ceph::signedspan *out_delta_ub) {
+    std::lock_guard l(lock);
+    if (this_up_from < up_from) {
+      return;
+    }
+    if (this_up_from > up_from) {
+      up_from = this_up_from;
+    }
+    peer_clock_delta_lb = peer_send_stamp - now;
+    peer_clock_delta_ub = delta_ub;
+    *out_delta_ub = - *peer_clock_delta_lb;
+  }
+
+  /// Record a ping reply: replace both bounds unconditionally.
+  void got_ping_reply(ceph::signedspan now,
+                      ceph::signedspan peer_send_stamp,
+                      std::optional<ceph::signedspan> delta_ub) {
+    std::lock_guard l(lock);
+    peer_clock_delta_lb = peer_send_stamp - now;
+    peer_clock_delta_ub = delta_ub;
+  }
+
+private:
+  FRIEND_MAKE_REF(HeartbeatStamps);
+  /// Private: instances are meant to be created through the make_ref friend.
+  HeartbeatStamps(int o)
+    : RefCountedObject(nullptr),   // null pointer constant spelled nullptr, not NULL
+      osd(o) {}
+};
+using HeartbeatStampsRef = ceph::ref_t<HeartbeatStamps>;
+
+/// Stream a HeartbeatStamps by delegating to its print() member.
+inline std::ostream& operator<<(std::ostream& out, const HeartbeatStamps& hb) {
+  const HeartbeatStamps &stamps = hb;
+  stamps.print(out);
+  return out;
+}
+
+
+/// Buffered outgoing messages plus an ObjectStore transaction and an
+/// optional HBHandle.  Move-only.
+struct PeeringCtx : BufferedRecoveryMessages {
+  ObjectStore::Transaction transaction;
+  HBHandle* handle = nullptr;
+
+  PeeringCtx() = default;
+
+  // movable ...
+  PeeringCtx(PeeringCtx &&) = default;
+  PeeringCtx &operator=(PeeringCtx &&) = default;
+
+  // ... but not copyable
+  PeeringCtx(const PeeringCtx &) = delete;
+  PeeringCtx &operator=(const PeeringCtx &) = delete;
+
+  /// Replace `transaction` with a freshly constructed one.
+  void reset_transaction() {
+    ObjectStore::Transaction fresh;
+    transaction = std::move(fresh);
+  }
+};
+
+/**
+ * Wraps PeeringCtx to hide the difference between buffering messages to
+ * be sent after flush or immediately.
+ */
+struct PeeringCtxWrapper {
+  utime_t start_time;
+  BufferedRecoveryMessages &msgs;          ///< where send_*() calls are queued
+  ObjectStore::Transaction &transaction;   ///< always the wrapped ctx's transaction
+  HBHandle * const handle = nullptr;
+
+  /// Messages go straight into the wrapped context (its
+  /// BufferedRecoveryMessages base).
+  PeeringCtxWrapper(PeeringCtx &wrapped)
+    : PeeringCtxWrapper(wrapped, wrapped) {}
+
+  /// Messages go into buf; transaction and handle still come from wrapped.
+  PeeringCtxWrapper(BufferedRecoveryMessages &buf, PeeringCtx &wrapped)
+    : msgs(buf),
+      transaction(wrapped.transaction),
+      handle(wrapped.handle) {}
+
+  PeeringCtxWrapper(PeeringCtxWrapper &&ctx) = default;
+
+  // The send_* members below only forward to `msgs`.
+
+  template <class MsgT> // MsgT = MessageRef for ceph-osd and MessageURef for crimson-osd
+  void send_osd_message(int target, MsgT&& m) {
+    BufferedRecoveryMessages &sink = msgs;
+    sink.send_osd_message(target, std::forward<MsgT>(m));
+  }
+  void send_notify(int to, const pg_notify_t &n) {
+    BufferedRecoveryMessages &sink = msgs;
+    sink.send_notify(to, n);
+  }
+  void send_query(int to, spg_t spgid, const pg_query_t &q) {
+    BufferedRecoveryMessages &sink = msgs;
+    sink.send_query(to, spgid, q);
+  }
+  void send_info(int to, spg_t to_spgid,
+                 epoch_t min_epoch, epoch_t cur_epoch,
+                 const pg_info_t &info,
+                 std::optional<pg_lease_t> lease = {},
+                 std::optional<pg_lease_ack_t> lease_ack = {}) {
+    BufferedRecoveryMessages &sink = msgs;
+    sink.send_info(to, to_spgid, min_epoch, cur_epoch, info,
+                   lease, lease_ack);
+  }
+};
+
+/* Encapsulates PG recovery process */
+class PeeringState : public MissingLoc::MappingInfo {
+public:
+ /// Abstract callback interface (all members pure virtual) through which
+ /// the peering state machine talks to its owner.  PeeringMachine holds a
+ /// pointer to one of these (`pl`).
+ struct PeeringListener : public EpochSource {
+ /// Prepare t with written information
+ virtual void prepare_write(
+ pg_info_t &info,
+ pg_info_t &last_written_info,
+ PastIntervals &past_intervals,
+ PGLog &pglog,
+ bool dirty_info,
+ bool dirty_big_info,
+ bool need_write_epoch,
+ ObjectStore::Transaction &t) = 0;
+
+ /// Notify that info/history changed (generally to update scrub registration)
+ virtual void on_info_history_change() = 0;
+
+ /// Notify PG that Primary/Replica status has changed (to update scrub registration)
+ virtual void on_primary_status_change(bool was_primary, bool now_primary) = 0;
+
+ /// Need to reschedule next scrub. Assuming no change in role
+ virtual void reschedule_scrub() = 0;
+
+ /// Notify that a scrub has been requested
+ virtual void scrub_requested(scrub_level_t scrub_level, scrub_type_t scrub_type) = 0;
+
+ /// Return current snap_trimq size
+ virtual uint64_t get_snap_trimq_size() const = 0;
+
+ /// Send cluster message to osd
+ /// (takes a MessageURef under WITH_SEASTAR, a MessageRef otherwise)
+ #if defined(WITH_SEASTAR)
+ virtual void send_cluster_message(
+ int osd, MessageURef m, epoch_t epoch, bool share_map_update=false) = 0;
+ #else
+ virtual void send_cluster_message(
+ int osd, MessageRef m, epoch_t epoch, bool share_map_update=false) = 0;
+ #endif
+ /// Send pg_created to mon
+ virtual void send_pg_created(pg_t pgid) = 0;
+
+ // ============ Clock / lease helpers ==================
+ virtual ceph::signedspan get_mnow() const = 0;
+ virtual HeartbeatStampsRef get_hb_stamps(int peer) = 0;
+ virtual void schedule_renew_lease(epoch_t plr, ceph::timespan delay) = 0;
+ virtual void queue_check_readable(epoch_t lpr, ceph::timespan delay) = 0;
+ virtual void recheck_readable() = 0;
+
+ virtual unsigned get_target_pg_log_entries() const = 0;
+
+ // ============ Flush state ==================
+ /**
+ * try_flush_or_schedule_async()
+ *
+ * If true, caller may assume all past operations on this pg
+ * have been flushed. Else, caller will receive an on_flushed()
+ * call once the flush has completed.
+ */
+ virtual bool try_flush_or_schedule_async() = 0;
+ /// Arranges for a commit on t to call on_flushed() once flushed.
+ virtual void start_flush_on_transaction(
+ ObjectStore::Transaction &t) = 0;
+ /// Notification that all outstanding flushes for interval have completed
+ virtual void on_flushed() = 0;
+
+ //============= Recovery ====================
+ /// Arrange for event to be queued after delay
+ virtual void schedule_event_after(
+ PGPeeringEventRef event,
+ float delay) = 0;
+ /**
+ * request_local_background_io_reservation
+ *
+ * Request reservation at priority with on_grant queued on grant
+ * and on_preempt on preempt
+ */
+ virtual void request_local_background_io_reservation(
+ unsigned priority,
+ PGPeeringEventURef on_grant,
+ PGPeeringEventURef on_preempt) = 0;
+ /// Modify pending local background reservation request priority
+ virtual void update_local_background_io_priority(
+ unsigned priority) = 0;
+ /// Cancel pending local background reservation request
+ virtual void cancel_local_background_io_reservation() = 0;
+
+ /**
+ * request_remote_recovery_reservation
+ *
+ * Request reservation at priority with on_grant queued on grant
+ * and on_preempt on preempt
+ */
+ virtual void request_remote_recovery_reservation(
+ unsigned priority,
+ PGPeeringEventURef on_grant,
+ PGPeeringEventURef on_preempt) = 0;
+ /// Cancel pending remote background reservation request
+ virtual void cancel_remote_recovery_reservation() = 0;
+
+ /// Arrange for on_commit to be queued upon commit of t
+ virtual void schedule_event_on_commit(
+ ObjectStore::Transaction &t,
+ PGPeeringEventRef on_commit) = 0;
+
+ //============================ HB =============================
+ /// Update hb set to peers
+ virtual void update_heartbeat_peers(std::set<int> peers) = 0;
+
+ /// Set targets being probed in this interval
+ virtual void set_probe_targets(const std::set<pg_shard_t> &probe_set) = 0;
+ /// Clear targets being probed in this interval
+ virtual void clear_probe_targets() = 0;
+
+ /// Queue for a pg_temp of wanted
+ virtual void queue_want_pg_temp(const std::vector<int> &wanted) = 0;
+ /// Clear queue for a pg_temp of wanted
+ virtual void clear_want_pg_temp() = 0;
+
+ /// Arrange for stats to be shipped to mon to be updated for this pg
+ virtual void publish_stats_to_osd() = 0;
+ /// Clear stats to be shipped to mon for this pg
+ virtual void clear_publish_stats() = 0;
+
+ /// Notification to check outstanding operation targets
+ virtual void check_recovery_sources(const OSDMapRef& newmap) = 0;
+ /// Notification to check outstanding blocklist
+ virtual void check_blocklisted_watchers() = 0;
+ /// Notification to clear state associated with primary
+ virtual void clear_primary_state() = 0;
+
+ // =================== Event notification ====================
+ virtual void on_pool_change() = 0;
+ virtual void on_role_change() = 0;
+ virtual void on_change(ObjectStore::Transaction &t) = 0;
+ virtual void on_activate(interval_set<snapid_t> to_trim) = 0;
+ virtual void on_activate_complete() = 0;
+ virtual void on_new_interval() = 0;
+ virtual Context *on_clean() = 0;
+ virtual void on_activate_committed() = 0;
+ virtual void on_active_exit() = 0;
+
+ // ====================== PG deletion =======================
+ /// Notification of removal complete, t must be populated to complete removal
+ virtual void on_removal(ObjectStore::Transaction &t) = 0;
+ /// Perform incremental removal work
+ virtual std::pair<ghobject_t, bool> do_delete_work(
+ ObjectStore::Transaction &t, ghobject_t _next) = 0;
+
+ // ======================= PG Merge =========================
+ virtual void clear_ready_to_merge() = 0;
+ virtual void set_not_ready_to_merge_target(pg_t pgid, pg_t src) = 0;
+ virtual void set_not_ready_to_merge_source(pg_t pgid) = 0;
+ virtual void set_ready_to_merge_target(eversion_t lu, epoch_t les, epoch_t lec) = 0;
+ virtual void set_ready_to_merge_source(eversion_t lu) = 0;
+
+ // ==================== Map notifications ===================
+ virtual void on_active_actmap() = 0;
+ virtual void on_active_advmap(const OSDMapRef &osdmap) = 0;
+ virtual epoch_t cluster_osdmap_trim_lower_bound() = 0;
+
+ // ============ recovery reservation notifications ==========
+ virtual void on_backfill_reserved() = 0;
+ virtual void on_backfill_canceled() = 0;
+ virtual void on_recovery_reserved() = 0;
+
+ // ================recovery space accounting ================
+ virtual bool try_reserve_recovery_space(
+ int64_t primary_num_bytes, int64_t local_num_bytes) = 0;
+ virtual void unreserve_recovery_space() = 0;
+
+ // ================== Peering log events ====================
+ /// Get handler for rolling forward/back log entries
+ virtual PGLog::LogEntryHandlerRef get_log_handler(
+ ObjectStore::Transaction &t) = 0;
+
+ // ============ On disk representation changes ==============
+ virtual void rebuild_missing_set_with_deletes(PGLog &pglog) = 0;
+
+ // ======================= Logging ==========================
+ virtual PerfCounters &get_peering_perf() = 0;
+ virtual PerfCounters &get_perf_logger() = 0;
+ virtual void log_state_enter(const char *state) = 0;
+ virtual void log_state_exit(
+ const char *state_name, utime_t enter_time,
+ uint64_t events, utime_t event_dur) = 0;
+ virtual void dump_recovery_info(ceph::Formatter *f) const = 0;
+
+ virtual OstreamTemp get_clog_info() = 0;
+ virtual OstreamTemp get_clog_error() = 0;
+ virtual OstreamTemp get_clog_debug() = 0;
+
+ virtual ~PeeringListener() {}
+ };
+
+ /// Event carrying the Formatter that a state query should write into.
+ struct QueryState : boost::statechart::event< QueryState > {
+   ceph::Formatter *f;
+   explicit QueryState(ceph::Formatter *f)
+     : f(f)
+   {}
+   /// Prints the fixed tag "Query".
+   void print(std::ostream *out) const {
+     std::ostream &os = *out;
+     os << "Query";
+   }
+ };
+
+ /// Event carrying the Formatter that an unfound-object query should write into.
+ struct QueryUnfound : boost::statechart::event< QueryUnfound > {
+   ceph::Formatter *f;
+   explicit QueryUnfound(ceph::Formatter *f)
+     : f(f)
+   {}
+   /// Prints the fixed tag "QueryUnfound".
+   void print(std::ostream *out) const {
+     std::ostream &os = *out;
+     os << "QueryUnfound";
+   }
+ };
+
+ /// Map-advance event: the new and previous OSDMap plus the up/acting
+ /// sets and primaries supplied by the caller.
+ struct AdvMap : boost::statechart::event< AdvMap > {
+   OSDMapRef osdmap;
+   OSDMapRef lastmap;
+   std::vector<int> newup, newacting;
+   int up_primary, acting_primary;
+   /// newup and newacting are only copied into the event, so they are taken
+   /// by const reference; existing callers passing non-const lvalues still
+   /// bind, and const vectors or temporaries are now accepted too.
+   AdvMap(
+     OSDMapRef osdmap, OSDMapRef lastmap,
+     const std::vector<int>& newup, int up_primary,
+     const std::vector<int>& newacting, int acting_primary):
+     osdmap(osdmap), lastmap(lastmap),
+     newup(newup),
+     newacting(newacting),
+     up_primary(up_primary),
+     acting_primary(acting_primary) {}
+   /// Prints the fixed tag "AdvMap".
+   void print(std::ostream *out) const {
+     *out << "AdvMap";
+   }
+ };
+
+ /// Payload-free event; prints as "ActMap".
+ struct ActMap : boost::statechart::event< ActMap > {
+   ActMap()
+     : boost::statechart::event< ActMap >()
+   {}
+   void print(std::ostream *out) const {
+     std::ostream &os = *out;
+     os << "ActMap";
+   }
+ };
+ /// Event carrying an activation epoch; prints as "Activate from <epoch>".
+ struct Activate : boost::statechart::event< Activate > {
+   epoch_t activation_epoch;
+   explicit Activate(epoch_t q)
+     : boost::statechart::event< Activate >(),
+       activation_epoch(q)
+   {}
+   void print(std::ostream *out) const {
+     std::ostream &os = *out;
+     os << "Activate from ";
+     os << activation_epoch;
+   }
+ };
+ /// Event carrying two epochs; prints as
+ /// "ActivateCommitted from <activation_epoch> processed at <epoch>".
+ struct ActivateCommitted : boost::statechart::event< ActivateCommitted > {
+   epoch_t epoch;
+   epoch_t activation_epoch;
+   explicit ActivateCommitted(epoch_t e, epoch_t ae)
+     : boost::statechart::event< ActivateCommitted >(),
+       epoch(e),
+       activation_epoch(ae)
+   {}
+   void print(std::ostream *out) const {
+     std::ostream &os = *out;
+     os << "ActivateCommitted from " << activation_epoch;
+     os << " processed at " << epoch;
+   }
+ };
+public:
+ /// Payload-free event; prints as "UnfoundBackfill".
+ struct UnfoundBackfill : boost::statechart::event<UnfoundBackfill> {
+   explicit UnfoundBackfill()
+   {}
+   void print(std::ostream *out) const {
+     std::ostream &os = *out;
+     os << "UnfoundBackfill";
+   }
+ };
+ /// Payload-free event; prints as "UnfoundRecovery".
+ struct UnfoundRecovery : boost::statechart::event<UnfoundRecovery> {
+   explicit UnfoundRecovery()
+   {}
+   void print(std::ostream *out) const {
+     std::ostream &os = *out;
+     os << "UnfoundRecovery";
+   }
+ };
+
+ /// Scrub request event.  The two bools given to the constructor are
+ /// converted to scrub_level_t / scrub_type_t.
+ struct RequestScrub : boost::statechart::event<RequestScrub> {
+   scrub_level_t deep;
+   scrub_type_t repair;
+   explicit RequestScrub(bool d, bool r)
+     : deep(static_cast<scrub_level_t>(d)),
+       repair(static_cast<scrub_type_t>(r))
+   {}
+   /// Prints "RequestScrub(deep" or "RequestScrub(shallow", followed by
+   /// " repair)" when a repair was requested and ")" otherwise.
+   void print(std::ostream *out) const {
+     const char *level = (deep == scrub_level_t::deep) ? "deep" : "shallow";
+     const char *tail = (repair == scrub_type_t::do_repair) ? " repair)" : ")";
+     *out << "RequestScrub(" << level << tail;
+   }
+ };
+
+ // Events declared through the TrivialEvent macro.  The macro is not
+ // defined in this header; NOTE(review): presumably each use expands to a
+ // payload-free statechart event type of the given name -- confirm against
+ // the macro's definition.  The resulting names are used in the states'
+ // reaction lists below.
+ TrivialEvent(Initialize)
+ TrivialEvent(GotInfo)
+ TrivialEvent(NeedUpThru)
+ TrivialEvent(Backfilled)
+ TrivialEvent(LocalBackfillReserved)
+ TrivialEvent(RejectTooFullRemoteReservation)
+ TrivialEvent(RequestBackfill)
+ TrivialEvent(RemoteRecoveryPreempted)
+ TrivialEvent(RemoteBackfillPreempted)
+ TrivialEvent(BackfillTooFull)
+ TrivialEvent(RecoveryTooFull)
+
+ TrivialEvent(MakePrimary)
+ TrivialEvent(MakeStray)
+ TrivialEvent(NeedActingChange)
+ TrivialEvent(IsIncomplete)
+ TrivialEvent(IsDown)
+
+ TrivialEvent(AllReplicasRecovered)
+ TrivialEvent(DoRecovery)
+ TrivialEvent(LocalRecoveryReserved)
+ TrivialEvent(AllRemotesReserved)
+ TrivialEvent(AllBackfillsReserved)
+ TrivialEvent(GoClean)
+
+ TrivialEvent(AllReplicasActivated)
+
+ TrivialEvent(IntervalFlush)
+
+ TrivialEvent(DeleteStart)
+ TrivialEvent(DeleteSome)
+
+ TrivialEvent(SetForceRecovery)
+ TrivialEvent(UnsetForceRecovery)
+ TrivialEvent(SetForceBackfill)
+ TrivialEvent(UnsetForceBackfill)
+
+ TrivialEvent(DeleteReserved)
+ TrivialEvent(DeleteInterrupted)
+
+ TrivialEvent(CheckReadable)
+
+ // Declarations only; the definitions are not in this header.
+ // NOTE(review): start_handle/end_handle presumably bracket the handling of
+ // one event with new_ctx as the active context, and the *_block_outgoing
+ // calls presumably control buffering of outgoing messages -- confirm
+ // against the implementation.
+ void start_handle(PeeringCtx *new_ctx);
+ void end_handle();
+ void begin_block_outgoing();
+ void end_block_outgoing();
+ void clear_blocked_outgoing();
+ private:
+
+ /* States */
+ struct Initial;
+ /// The Boost.Statechart machine driving peering; its initial state is
+ /// Initial.  It stores non-owning pointers back to the PeeringState, the
+ /// listener and logging helpers, plus per-state event counters.
+ class PeeringMachine : public boost::statechart::state_machine< PeeringMachine, Initial > {
+ public:
+   PeeringState *state;
+   PGStateHistory *state_history;
+   CephContext *cct;
+   spg_t spgid;
+   DoutPrefixProvider *dpp;
+   PeeringListener *pl;
+
+   utime_t event_time;
+   uint64_t event_count;
+
+   /// Reset event_time and event_count to zero.
+   void clear_event_counters() {
+     event_time = utime_t();
+     event_count = 0;
+   }
+
+   void log_enter(const char *state_name);
+   void log_exit(const char *state_name, utime_t duration);
+
+   PeeringMachine(
+     PeeringState *state, CephContext *cct,
+     spg_t spgid,
+     DoutPrefixProvider *dpp,
+     PeeringListener *pl,
+     PGStateHistory *state_history) :
+     state(state),
+     state_history(state_history),
+     cct(cct), spgid(spgid),
+     dpp(dpp), pl(pl),
+     event_count(0) {}
+
+   /* Accessor functions for state methods */
+   // Every accessor dereferences state->rctx, so each one checks it with
+   // ceph_assert first.
+
+   /// Transaction of the current recovery context.
+   ObjectStore::Transaction& get_cur_transaction() {
+     ceph_assert(state->rctx);
+     return state->rctx->transaction;
+   }
+
+   /// The current recovery context itself.
+   PeeringCtxWrapper &get_recovery_ctx() {
+     // ceph_assert rather than plain assert, matching the sibling accessors
+     ceph_assert(state->rctx);
+     return *(state->rctx);
+   }
+
+   /// Queue a notify to osd `to` on the current recovery context.
+   void send_notify(int to, const pg_notify_t &n) {
+     ceph_assert(state->rctx);
+     state->rctx->send_notify(to, n);
+   }
+   /// Queue a query to osd `to`; the target spg_t is built from this
+   /// machine's pgid and query.to.
+   void send_query(int to, const pg_query_t &query) {
+     // previously unchecked, unlike send_notify
+     ceph_assert(state->rctx);
+     state->rctx->send_query(
+       to,
+       spg_t(spgid.pgid, query.to),
+       query);
+   }
+ };
+ friend class PeeringMachine;
+
+ /* States */
+ // Initial
+ // Reset
+ // Start
+ // Started
+ // Primary
+ // WaitActingChange
+ // Peering
+ // GetInfo
+ // GetLog
+ // GetMissing
+ // WaitUpThru
+ // Incomplete
+ // Active
+ // Activating
+ // Clean
+ // Recovered
+ // Backfilling
+ // WaitRemoteBackfillReserved
+ // WaitLocalBackfillReserved
+ // NotBackfilling
+ // NotRecovering
+ // Recovering
+ // WaitRemoteRecoveryReserved
+ // WaitLocalRecoveryReserved
+ // ReplicaActive
+ // RepNotRecovering
+ // RepRecovering
+ // RepWaitBackfillReserved
+ // RepWaitRecoveryReserved
+ // Stray
+ // ToDelete
+ // WaitDeleteReserved
+ // Deleting
+ // Crashed
+
+ /// Top-level state of PeeringMachine.  Several states below list
+ /// transition< event_base, Crashed > as their last reaction, so it is
+ /// where events that no earlier reaction matched end up.
+ struct Crashed : boost::statechart::state< Crashed, PeeringMachine >, NamedState {
+ explicit Crashed(my_context ctx);
+ };
+
+ struct Reset;
+
+ /// Initial state of PeeringMachine.
+ struct Initial : boost::statechart::state< Initial, PeeringMachine >, NamedState {
+   explicit Initial(my_context ctx);
+   void exit();
+
+   // Initialize -> Reset; NullEvt is handled by a custom reaction; every
+   // other event -> Crashed.  Order matters: the catch-all stays last.
+   using reactions = boost::mpl::list<
+     boost::statechart::transition< Initialize, Reset >,
+     boost::statechart::custom_reaction< NullEvt >,
+     boost::statechart::transition< boost::statechart::event_base, Crashed >
+     >;
+
+   boost::statechart::result react(const MNotifyRec&);
+   boost::statechart::result react(const MInfoRec&);
+   boost::statechart::result react(const MLogRec&);
+   /// Generic overload: drop the event.
+   boost::statechart::result react(const boost::statechart::event_base&) {
+     return discard_event();
+   }
+ };
+
+ /// Top-level state entered from Initial on Initialize.
+ struct Reset : boost::statechart::state< Reset, PeeringMachine >, NamedState {
+   explicit Reset(my_context ctx);
+   void exit();
+
+   // Custom reactions first; anything left over -> Crashed.
+   using reactions = boost::mpl::list<
+     boost::statechart::custom_reaction< QueryState >,
+     boost::statechart::custom_reaction< QueryUnfound >,
+     boost::statechart::custom_reaction< AdvMap >,
+     boost::statechart::custom_reaction< ActMap >,
+     boost::statechart::custom_reaction< NullEvt >,
+     boost::statechart::custom_reaction< IntervalFlush >,
+     boost::statechart::transition< boost::statechart::event_base, Crashed >
+     >;
+
+   boost::statechart::result react(const QueryState& q);
+   boost::statechart::result react(const QueryUnfound& q);
+   boost::statechart::result react(const AdvMap&);
+   boost::statechart::result react(const ActMap&);
+   boost::statechart::result react(const IntervalFlush&);
+   /// Generic overload: drop the event.
+   boost::statechart::result react(const boost::statechart::event_base&) {
+     return discard_event();
+   }
+ };
+
+ struct Start;
+
+ /// Top-level state whose initial inner state is Start.
+ struct Started : boost::statechart::state< Started, PeeringMachine, Start >, NamedState {
+   explicit Started(my_context ctx);
+   void exit();
+
+   using reactions = boost::mpl::list<
+     boost::statechart::custom_reaction< QueryState >,
+     boost::statechart::custom_reaction< QueryUnfound >,
+     boost::statechart::custom_reaction< AdvMap >,
+     boost::statechart::custom_reaction< IntervalFlush >,
+     // ignored
+     boost::statechart::custom_reaction< NullEvt >,
+     boost::statechart::custom_reaction<SetForceRecovery>,
+     boost::statechart::custom_reaction<UnsetForceRecovery>,
+     boost::statechart::custom_reaction<SetForceBackfill>,
+     boost::statechart::custom_reaction<UnsetForceBackfill>,
+     boost::statechart::custom_reaction<RequestScrub>,
+     boost::statechart::custom_reaction<CheckReadable>,
+     // crash
+     boost::statechart::transition< boost::statechart::event_base, Crashed >
+     >;
+
+   boost::statechart::result react(const QueryState& q);
+   boost::statechart::result react(const QueryUnfound& q);
+   boost::statechart::result react(const AdvMap&);
+   boost::statechart::result react(const IntervalFlush&);
+   /// Generic overload, used for the "ignored" events above, which have no
+   /// dedicated overload here: drop the event.
+   boost::statechart::result react(const boost::statechart::event_base&) {
+     return discard_event();
+   }
+ };
+
+ struct Primary;
+ struct Stray;
+
+ /// Inner state of Started: MakePrimary -> Primary, MakeStray -> Stray.
+ struct Start : boost::statechart::state< Start, Started >, NamedState {
+   explicit Start(my_context ctx);
+   void exit();
+
+   using reactions = boost::mpl::list<
+     boost::statechart::transition< MakePrimary, Primary >,
+     boost::statechart::transition< MakeStray, Stray >
+     >;
+ };
+
+ struct Peering;
+ struct WaitActingChange;
+ struct Incomplete;
+ struct Down;
+
+ /// Inner state of Started whose initial inner state is Peering.
+ struct Primary : boost::statechart::state< Primary, Started, Peering >, NamedState {
+   explicit Primary(my_context ctx);
+   void exit();
+
+   using reactions = boost::mpl::list<
+     boost::statechart::custom_reaction< ActMap >,
+     boost::statechart::custom_reaction< MNotifyRec >,
+     boost::statechart::custom_reaction<SetForceRecovery>,
+     boost::statechart::custom_reaction<UnsetForceRecovery>,
+     boost::statechart::custom_reaction<SetForceBackfill>,
+     boost::statechart::custom_reaction<UnsetForceBackfill>,
+     boost::statechart::custom_reaction<RequestScrub>
+     >;
+
+   // one handler per entry in the reaction list
+   boost::statechart::result react(const ActMap&);
+   boost::statechart::result react(const MNotifyRec&);
+   boost::statechart::result react(const SetForceRecovery&);
+   boost::statechart::result react(const UnsetForceRecovery&);
+   boost::statechart::result react(const SetForceBackfill&);
+   boost::statechart::result react(const UnsetForceBackfill&);
+   boost::statechart::result react(const RequestScrub&);
+ };
+
+ /// Inner state of Primary.
+ struct WaitActingChange
+   : boost::statechart::state< WaitActingChange, Primary>,
+     NamedState {
+   using reactions = boost::mpl::list<
+     boost::statechart::custom_reaction< QueryState >,
+     boost::statechart::custom_reaction< QueryUnfound >,
+     boost::statechart::custom_reaction< AdvMap >,
+     boost::statechart::custom_reaction< MLogRec >,
+     boost::statechart::custom_reaction< MInfoRec >,
+     boost::statechart::custom_reaction< MNotifyRec >
+     >;
+
+   explicit WaitActingChange(my_context ctx);
+
+   // one handler per entry in the reaction list
+   boost::statechart::result react(const QueryState& q);
+   boost::statechart::result react(const QueryUnfound& q);
+   boost::statechart::result react(const AdvMap&);
+   boost::statechart::result react(const MLogRec&);
+   boost::statechart::result react(const MInfoRec&);
+   boost::statechart::result react(const MNotifyRec&);
+
+   void exit();
+ };
+
+ struct GetInfo;
+ struct Active;
+
+ /// Inner state of Primary whose initial inner state is GetInfo.
+ /// An Activate event moves the machine to Active.
+ struct Peering : boost::statechart::state< Peering, Primary, GetInfo >, NamedState {
+   PastIntervals::PriorSet prior_set;
+   bool history_les_bound;  //< need osd_find_best_info_ignore_history_les
+
+   explicit Peering(my_context ctx);
+   void exit();
+
+   using reactions = boost::mpl::list<
+     boost::statechart::custom_reaction< QueryState >,
+     boost::statechart::custom_reaction< QueryUnfound >,
+     boost::statechart::transition< Activate, Active >,
+     boost::statechart::custom_reaction< AdvMap >
+     >;
+
+   boost::statechart::result react(const QueryState& q);
+   boost::statechart::result react(const QueryUnfound& q);
+   boost::statechart::result react(const AdvMap &advmap);
+ };
+
+ struct WaitLocalRecoveryReserved;
+ struct Activating;
+ /// Inner state of Primary whose initial inner state is Activating.
+ /// Several events are given handlers here whose only effect is to
+ /// discard the event.
+ struct Active : boost::statechart::state< Active, Primary, Activating >, NamedState {
+   explicit Active(my_context ctx);
+   void exit();
+
+   const std::set<pg_shard_t> remote_shards_to_reserve_recovery;
+   const std::set<pg_shard_t> remote_shards_to_reserve_backfill;
+   bool all_replicas_activated;
+
+   using reactions = boost::mpl::list<
+     boost::statechart::custom_reaction< QueryState >,
+     boost::statechart::custom_reaction< QueryUnfound >,
+     boost::statechart::custom_reaction< ActMap >,
+     boost::statechart::custom_reaction< AdvMap >,
+     boost::statechart::custom_reaction< MInfoRec >,
+     boost::statechart::custom_reaction< MNotifyRec >,
+     boost::statechart::custom_reaction< MLogRec >,
+     boost::statechart::custom_reaction< MTrim >,
+     boost::statechart::custom_reaction< Backfilled >,
+     boost::statechart::custom_reaction< ActivateCommitted >,
+     boost::statechart::custom_reaction< AllReplicasActivated >,
+     boost::statechart::custom_reaction< DeferRecovery >,
+     boost::statechart::custom_reaction< DeferBackfill >,
+     boost::statechart::custom_reaction< UnfoundRecovery >,
+     boost::statechart::custom_reaction< UnfoundBackfill >,
+     boost::statechart::custom_reaction< RemoteReservationRevokedTooFull>,
+     boost::statechart::custom_reaction< RemoteReservationRevoked>,
+     boost::statechart::custom_reaction< DoRecovery>,
+     boost::statechart::custom_reaction< RenewLease>,
+     boost::statechart::custom_reaction< MLeaseAck>,
+     boost::statechart::custom_reaction< CheckReadable>
+     >;
+
+   // ---- handlers defined out of line ----
+   boost::statechart::result react(const QueryState& q);
+   boost::statechart::result react(const QueryUnfound& q);
+   boost::statechart::result react(const ActMap&);
+   boost::statechart::result react(const AdvMap&);
+   boost::statechart::result react(const MInfoRec& infoevt);
+   boost::statechart::result react(const MNotifyRec& notevt);
+   boost::statechart::result react(const MLogRec& logevt);
+   boost::statechart::result react(const MTrim& trimevt);
+   boost::statechart::result react(const ActivateCommitted&);
+   boost::statechart::result react(const AllReplicasActivated&);
+   boost::statechart::result react(const RenewLease&);
+   boost::statechart::result react(const MLeaseAck&);
+   boost::statechart::result react(const CheckReadable&);
+
+   // ---- events discarded at this level ----
+   boost::statechart::result react(const Backfilled&) {
+     return discard_event();
+   }
+   boost::statechart::result react(const DeferRecovery&) {
+     return discard_event();
+   }
+   boost::statechart::result react(const DeferBackfill&) {
+     return discard_event();
+   }
+   boost::statechart::result react(const UnfoundRecovery&) {
+     return discard_event();
+   }
+   boost::statechart::result react(const UnfoundBackfill&) {
+     return discard_event();
+   }
+   boost::statechart::result react(const RemoteReservationRevokedTooFull&) {
+     return discard_event();
+   }
+   boost::statechart::result react(const RemoteReservationRevoked&) {
+     return discard_event();
+   }
+   boost::statechart::result react(const DoRecovery&) {
+     return discard_event();
+   }
+
+   void all_activated_and_committed();
+ };
+
+ /// Inner state of Active.  DoRecovery moves to WaitLocalRecoveryReserved;
+ /// SetForceRecovery and SetForceBackfill reach the generic handler, which
+ /// discards them.
+ struct Clean : boost::statechart::state< Clean, Active >, NamedState {
+   using reactions = boost::mpl::list<
+     boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >,
+     boost::statechart::custom_reaction<SetForceRecovery>,
+     boost::statechart::custom_reaction<SetForceBackfill>
+     >;
+
+   explicit Clean(my_context ctx);
+   void exit();
+
+   /// Generic overload: drop the event.
+   boost::statechart::result react(const boost::statechart::event_base&) {
+     return discard_event();
+   }
+ };
+
+ /// Inner state of Active.  GoClean -> Clean,
+ /// DoRecovery -> WaitLocalRecoveryReserved.
+ struct Recovered : boost::statechart::state< Recovered, Active >, NamedState {
+   using reactions = boost::mpl::list<
+     boost::statechart::transition< GoClean, Clean >,
+     boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >,
+     boost::statechart::custom_reaction< AllReplicasActivated >
+     >;
+
+   explicit Recovered(my_context ctx);
+   void exit();
+
+   /// Post GoClean, then forward the AllReplicasActivated event.
+   boost::statechart::result react(const AllReplicasActivated&) {
+     GoClean go_clean;
+     post_event(go_clean);
+     return forward_event();
+   }
+ };
+
+ /// Inner state of Active.
+ struct Backfilling : boost::statechart::state< Backfilling, Active >, NamedState {
+   using reactions = boost::mpl::list<
+     boost::statechart::custom_reaction< Backfilled >,
+     boost::statechart::custom_reaction< DeferBackfill >,
+     boost::statechart::custom_reaction< UnfoundBackfill >,
+     boost::statechart::custom_reaction< RemoteReservationRejectedTooFull >,
+     boost::statechart::custom_reaction< RemoteReservationRevokedTooFull>,
+     boost::statechart::custom_reaction< RemoteReservationRevoked>
+     >;
+
+   explicit Backfilling(my_context ctx);
+   void exit();
+
+   /// Re-post the event as RemoteReservationRevokedTooFull and discard
+   /// the original.
+   boost::statechart::result react(const RemoteReservationRejectedTooFull&) {
+     // for compat with old peers
+     RemoteReservationRevokedTooFull revoked;
+     post_event(revoked);
+     return discard_event();
+   }
+   boost::statechart::result react(const Backfilled& evt);
+   boost::statechart::result react(const RemoteReservationRevokedTooFull& evt);
+   boost::statechart::result react(const RemoteReservationRevoked& evt);
+   boost::statechart::result react(const DeferBackfill& evt);
+   boost::statechart::result react(const UnfoundBackfill& evt);
+
+   void backfill_release_reservations();
+   void cancel_backfill();
+ };
+
+ /// Substate of Active; AllBackfillsReserved moves on to Backfilling.
+ struct WaitRemoteBackfillReserved
+   : boost::statechart::state< WaitRemoteBackfillReserved, Active >, NamedState {
+   using reactions = boost::mpl::list<
+     boost::statechart::custom_reaction< RemoteBackfillReserved >,
+     boost::statechart::custom_reaction< RemoteReservationRejectedTooFull >,
+     boost::statechart::custom_reaction< RemoteReservationRevoked >,
+     boost::statechart::transition< AllBackfillsReserved, Backfilling >
+   >;
+
+   /// iterator over a std::set<pg_shard_t>
+   std::set<pg_shard_t>::const_iterator backfill_osd_it;
+
+   explicit WaitRemoteBackfillReserved(my_context ctx);
+   void exit();
+   void retry();
+
+   boost::statechart::result react(const RemoteBackfillReserved&);
+   boost::statechart::result react(const RemoteReservationRejectedTooFull&);
+   boost::statechart::result react(const RemoteReservationRevoked&);
+ };
+
+ /// Substate of Active; LocalBackfillReserved moves on to
+ /// WaitRemoteBackfillReserved.
+ struct WaitLocalBackfillReserved
+   : boost::statechart::state< WaitLocalBackfillReserved, Active >, NamedState {
+   explicit WaitLocalBackfillReserved(my_context ctx);
+   void exit();
+
+   using reactions = boost::mpl::list<
+     boost::statechart::transition< LocalBackfillReserved, WaitRemoteBackfillReserved >,
+     boost::statechart::custom_reaction< RemoteBackfillReserved >
+   >;
+
+   /// Nothing to do in this state: the event is discarded.
+   boost::statechart::result react(const RemoteBackfillReserved&) { return discard_event(); }
+ };
+
+ /// Substate of Active; RequestBackfill moves on to WaitLocalBackfillReserved.
+ struct NotBackfilling : boost::statechart::state< NotBackfilling, Active >, NamedState {
+   explicit NotBackfilling(my_context ctx);
+   void exit();
+
+   using reactions = boost::mpl::list<
+     boost::statechart::custom_reaction< QueryUnfound >,
+     boost::statechart::transition< RequestBackfill, WaitLocalBackfillReserved >,
+     boost::statechart::custom_reaction< RemoteBackfillReserved >,
+     boost::statechart::custom_reaction< RemoteReservationRejectedTooFull >
+   >;
+
+   boost::statechart::result react(const QueryUnfound&);
+   boost::statechart::result react(const RemoteBackfillReserved&);
+   boost::statechart::result react(const RemoteReservationRejectedTooFull&);
+ };
+
+ /// Substate of Active; DoRecovery moves on to WaitLocalRecoveryReserved.
+ struct NotRecovering : boost::statechart::state< NotRecovering, Active >, NamedState {
+   explicit NotRecovering(my_context ctx);
+   void exit();
+
+   using reactions = boost::mpl::list<
+     boost::statechart::custom_reaction< QueryUnfound >,
+     boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >,
+     boost::statechart::custom_reaction< DeferRecovery >,
+     boost::statechart::custom_reaction< UnfoundRecovery >
+   >;
+
+   boost::statechart::result react(const QueryUnfound&);
+
+   // The two handlers below are deliberate no-ops: the event is discarded.
+   boost::statechart::result react(const DeferRecovery&) { return discard_event(); }
+   boost::statechart::result react(const UnfoundRecovery&) { return discard_event(); }
+ };
+
+ struct ToDelete;
+ struct RepNotRecovering;
+ /// Substate of Started whose initial inner state is RepNotRecovering.
+ struct ReplicaActive
+   : boost::statechart::state< ReplicaActive, Started, RepNotRecovering >, NamedState {
+   explicit ReplicaActive(my_context ctx);
+   void exit();
+
+   using reactions = boost::mpl::list<
+     boost::statechart::custom_reaction< QueryState >,
+     boost::statechart::custom_reaction< QueryUnfound >,
+     boost::statechart::custom_reaction< ActMap >,
+     boost::statechart::custom_reaction< MQuery >,
+     boost::statechart::custom_reaction< MInfoRec >,
+     boost::statechart::custom_reaction< MLogRec >,
+     boost::statechart::custom_reaction< MTrim >,
+     boost::statechart::custom_reaction< Activate >,
+     boost::statechart::custom_reaction< ActivateCommitted >,
+     boost::statechart::custom_reaction< DeferRecovery >,
+     boost::statechart::custom_reaction< DeferBackfill >,
+     boost::statechart::custom_reaction< UnfoundRecovery >,
+     boost::statechart::custom_reaction< UnfoundBackfill >,
+     boost::statechart::custom_reaction< RemoteBackfillPreempted >,
+     boost::statechart::custom_reaction< RemoteRecoveryPreempted >,
+     boost::statechart::custom_reaction< RecoveryDone >,
+     boost::statechart::transition< DeleteStart, ToDelete >,
+     boost::statechart::custom_reaction< MLease >
+   >;
+
+   // Handlers implemented out of line.
+   boost::statechart::result react(const QueryState&);
+   boost::statechart::result react(const QueryUnfound&);
+   boost::statechart::result react(const ActMap&);
+   boost::statechart::result react(const MQuery&);
+   boost::statechart::result react(const MInfoRec&);
+   boost::statechart::result react(const MLogRec&);
+   boost::statechart::result react(const MTrim&);
+   boost::statechart::result react(const Activate&);
+   boost::statechart::result react(const ActivateCommitted&);
+   boost::statechart::result react(const MLease&);
+
+   // Events that are simply discarded at this level.
+   boost::statechart::result react(const RecoveryDone&) { return discard_event(); }
+   boost::statechart::result react(const DeferRecovery&) { return discard_event(); }
+   boost::statechart::result react(const DeferBackfill&) { return discard_event(); }
+   boost::statechart::result react(const UnfoundRecovery&) { return discard_event(); }
+   boost::statechart::result react(const UnfoundBackfill&) { return discard_event(); }
+   boost::statechart::result react(const RemoteBackfillPreempted&) { return discard_event(); }
+   boost::statechart::result react(const RemoteRecoveryPreempted&) { return discard_event(); }
+ };
+
+ /// Substate of ReplicaActive. RecoveryDone, RemoteReservationRejectedTooFull
+ /// and RemoteReservationCanceled all lead back to RepNotRecovering.
+ struct RepRecovering : boost::statechart::state< RepRecovering, ReplicaActive >, NamedState {
+   explicit RepRecovering(my_context ctx);
+   void exit();
+
+   using reactions = boost::mpl::list<
+     boost::statechart::transition< RecoveryDone, RepNotRecovering >,
+     // kept for compatibility with old peers
+     boost::statechart::transition< RemoteReservationRejectedTooFull, RepNotRecovering >,
+     boost::statechart::transition< RemoteReservationCanceled, RepNotRecovering >,
+     boost::statechart::custom_reaction< BackfillTooFull >,
+     boost::statechart::custom_reaction< RemoteRecoveryPreempted >,
+     boost::statechart::custom_reaction< RemoteBackfillPreempted >
+   >;
+
+   boost::statechart::result react(const BackfillTooFull&);
+   boost::statechart::result react(const RemoteRecoveryPreempted&);
+   boost::statechart::result react(const RemoteBackfillPreempted&);
+ };
+
+ /// Substate of ReplicaActive; every listed event has a custom handler.
+ struct RepWaitBackfillReserved
+   : boost::statechart::state< RepWaitBackfillReserved, ReplicaActive >, NamedState {
+   explicit RepWaitBackfillReserved(my_context ctx);
+   void exit();
+
+   using reactions = boost::mpl::list<
+     boost::statechart::custom_reaction< RemoteBackfillReserved >,
+     boost::statechart::custom_reaction< RejectTooFullRemoteReservation >,
+     boost::statechart::custom_reaction< RemoteReservationRejectedTooFull >,
+     boost::statechart::custom_reaction< RemoteReservationCanceled >
+   >;
+
+   boost::statechart::result react(const RemoteBackfillReserved&);
+   boost::statechart::result react(const RejectTooFullRemoteReservation&);
+   boost::statechart::result react(const RemoteReservationRejectedTooFull&);
+   boost::statechart::result react(const RemoteReservationCanceled&);
+ };
+
+ /// Substate of ReplicaActive.
+ struct RepWaitRecoveryReserved
+   : boost::statechart::state< RepWaitRecoveryReserved, ReplicaActive >, NamedState {
+   explicit RepWaitRecoveryReserved(my_context ctx);
+   void exit();
+
+   using reactions = boost::mpl::list<
+     boost::statechart::custom_reaction< RemoteRecoveryReserved >,
+     // kept for compatibility with old peers
+     boost::statechart::custom_reaction< RemoteReservationRejectedTooFull >,
+     boost::statechart::custom_reaction< RemoteReservationCanceled >
+   >;
+
+   boost::statechart::result react(const RemoteRecoveryReserved&);
+   boost::statechart::result react(const RemoteReservationCanceled&);
+
+   /// Compat path for old peers: re-post the event as
+   /// RemoteReservationCanceled and discard the original.
+   boost::statechart::result react(const RemoteReservationRejectedTooFull&) {
+     post_event(RemoteReservationCanceled());
+     return discard_event();
+   }
+ };
+
+ /// Substate of ReplicaActive. Several events are declared as transitions
+ /// back into RepNotRecovering itself.
+ struct RepNotRecovering : boost::statechart::state< RepNotRecovering, ReplicaActive >, NamedState {
+   explicit RepNotRecovering(my_context ctx);
+   void exit();
+
+   using reactions = boost::mpl::list<
+     boost::statechart::custom_reaction< RequestRecoveryPrio >,
+     boost::statechart::custom_reaction< RequestBackfillPrio >,
+     boost::statechart::custom_reaction< RejectTooFullRemoteReservation >,
+     boost::statechart::transition< RemoteReservationRejectedTooFull, RepNotRecovering >,
+     boost::statechart::transition< RemoteReservationCanceled, RepNotRecovering >,
+     boost::statechart::custom_reaction< RemoteRecoveryReserved >,
+     boost::statechart::custom_reaction< RemoteBackfillReserved >,
+     // kept for compatibility with pre-reservation peers
+     boost::statechart::transition< RecoveryDone, RepNotRecovering >
+   >;
+
+   boost::statechart::result react(const RequestRecoveryPrio&);
+   boost::statechart::result react(const RequestBackfillPrio&);
+   boost::statechart::result react(const RejectTooFullRemoteReservation&);
+
+   // In both handlers below, my reservation completion raced with a RELEASE
+   // from the primary; the event is discarded.
+   boost::statechart::result react(const RemoteRecoveryReserved&) { return discard_event(); }
+   boost::statechart::result react(const RemoteBackfillReserved&) { return discard_event(); }
+ };
+
+ /// Substate of Active; every listed event has a custom handler.
+ struct Recovering : boost::statechart::state< Recovering, Active >, NamedState {
+   explicit Recovering(my_context ctx);
+   void exit();
+
+   using reactions = boost::mpl::list<
+     boost::statechart::custom_reaction< AllReplicasRecovered >,
+     boost::statechart::custom_reaction< DeferRecovery >,
+     boost::statechart::custom_reaction< UnfoundRecovery >,
+     boost::statechart::custom_reaction< RequestBackfill >
+   >;
+
+   /// @param cancel defaults to false
+   void release_reservations(bool cancel = false);
+
+   boost::statechart::result react(const AllReplicasRecovered&);
+   boost::statechart::result react(const DeferRecovery&);
+   boost::statechart::result react(const UnfoundRecovery&);
+   boost::statechart::result react(const RequestBackfill&);
+ };
+
+ /// Substate of Active; AllRemotesReserved moves on to Recovering.
+ struct WaitRemoteRecoveryReserved
+   : boost::statechart::state< WaitRemoteRecoveryReserved, Active >, NamedState {
+   using reactions = boost::mpl::list<
+     boost::statechart::custom_reaction< RemoteRecoveryReserved >,
+     boost::statechart::transition< AllRemotesReserved, Recovering >
+   >;
+
+   /// iterator over a std::set<pg_shard_t>
+   std::set<pg_shard_t>::const_iterator remote_recovery_reservation_it;
+
+   explicit WaitRemoteRecoveryReserved(my_context ctx);
+   void exit();
+
+   boost::statechart::result react(const RemoteRecoveryReserved&);
+ };
+
+ /// Substate of Active; LocalRecoveryReserved moves on to
+ /// WaitRemoteRecoveryReserved.
+ struct WaitLocalRecoveryReserved
+   : boost::statechart::state< WaitLocalRecoveryReserved, Active >, NamedState {
+   explicit WaitLocalRecoveryReserved(my_context ctx);
+   void exit();
+
+   using reactions = boost::mpl::list<
+     boost::statechart::transition< LocalRecoveryReserved, WaitRemoteRecoveryReserved >,
+     boost::statechart::custom_reaction< RecoveryTooFull >
+   >;
+
+   boost::statechart::result react(const RecoveryTooFull&);
+ };
+
+ /// Substate of Active that only declares plain transitions:
+ /// AllReplicasRecovered -> Recovered, DoRecovery -> WaitLocalRecoveryReserved,
+ /// RequestBackfill -> WaitLocalBackfillReserved.
+ struct Activating : boost::statechart::state< Activating, Active >, NamedState {
+   explicit Activating(my_context ctx);
+   void exit();
+
+   using reactions = boost::mpl::list<
+     boost::statechart::transition< AllReplicasRecovered, Recovered >,
+     boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >,
+     boost::statechart::transition< RequestBackfill, WaitLocalBackfillReserved >
+   >;
+ };
+
+ /// Substate of Started; DeleteStart moves on to ToDelete.
+ struct Stray : boost::statechart::state< Stray, Started >, NamedState {
+   using reactions = boost::mpl::list<
+     boost::statechart::custom_reaction< MQuery >,
+     boost::statechart::custom_reaction< MLogRec >,
+     boost::statechart::custom_reaction< MInfoRec >,
+     boost::statechart::custom_reaction< ActMap >,
+     boost::statechart::custom_reaction< RecoveryDone >,
+     boost::statechart::transition< DeleteStart, ToDelete >
+   >;
+
+   explicit Stray(my_context ctx);
+   void exit();
+
+   boost::statechart::result react(const MQuery&);
+   boost::statechart::result react(const MLogRec&);
+   boost::statechart::result react(const MInfoRec&);
+   boost::statechart::result react(const ActMap&);
+
+   /// RecoveryDone is discarded in this state.
+   boost::statechart::result react(const RecoveryDone&) { return discard_event(); }
+ };
+
+ struct WaitDeleteReserved;
+ /// Substate of Started whose initial inner state is WaitDeleteReserved.
+ struct ToDelete
+   : boost::statechart::state< ToDelete, Started, WaitDeleteReserved >, NamedState {
+   unsigned priority = 0;
+
+   using reactions = boost::mpl::list<
+     boost::statechart::custom_reaction< ActMap >,
+     boost::statechart::custom_reaction< ActivateCommitted >,
+     boost::statechart::custom_reaction< DeleteSome >
+   >;
+
+   explicit ToDelete(my_context ctx);
+   void exit();
+
+   boost::statechart::result react(const ActMap&);
+
+   /// Can happen if we were activated as a stray but not actually pulled
+   /// from prior to the pg going clean and sending a delete.
+   boost::statechart::result react(const ActivateCommitted&) { return discard_event(); }
+
+   /// Happens if we drop out of Deleting due to reprioritization etc.
+   boost::statechart::result react(const DeleteSome&) { return discard_event(); }
+ };
+
+ struct Deleting;
+ /// Substate of ToDelete; DeleteReserved moves on to Deleting.
+ struct WaitDeleteReserved
+   : boost::statechart::state< WaitDeleteReserved, ToDelete >, NamedState {
+   explicit WaitDeleteReserved(my_context ctx);
+   void exit();
+
+   using reactions = boost::mpl::list<
+     boost::statechart::transition< DeleteReserved, Deleting >
+   >;
+ };
+
+ /// Substate of ToDelete; DeleteInterrupted goes back to WaitDeleteReserved.
+ struct Deleting : boost::statechart::state< Deleting, ToDelete >, NamedState {
+   using reactions = boost::mpl::list<
+     boost::statechart::custom_reaction< DeleteSome >,
+     boost::statechart::transition< DeleteInterrupted, WaitDeleteReserved >
+   >;
+
+   ghobject_t next;
+
+   explicit Deleting(my_context ctx);
+   void exit();
+
+   boost::statechart::result react(const DeleteSome&);
+ };
+
+ struct GetLog;
+
+ /// Substate of Peering; GotInfo moves on to GetLog, IsDown to Down.
+ struct GetInfo : boost::statechart::state< GetInfo, Peering >, NamedState {
+   using reactions = boost::mpl::list<
+     boost::statechart::custom_reaction< QueryState >,
+     boost::statechart::custom_reaction< QueryUnfound >,
+     boost::statechart::transition< GotInfo, GetLog >,
+     boost::statechart::custom_reaction< MNotifyRec >,
+     boost::statechart::transition< IsDown, Down >
+   >;
+
+   std::set<pg_shard_t> peer_info_requested;
+
+   explicit GetInfo(my_context ctx);
+   void exit();
+   void get_infos();
+
+   boost::statechart::result react(const QueryState&);
+   boost::statechart::result react(const QueryUnfound&);
+   boost::statechart::result react(const MNotifyRec&);
+ };
+
+ /// Parameterless statechart event; GetLog registers a custom reaction for it.
+ struct GotLog : boost::statechart::event< GotLog > {
+ GotLog() : boost::statechart::event< GotLog >() {}
+ };
+
+ /// Substate of Peering; NeedActingChange moves on to WaitActingChange,
+ /// IsIncomplete to Incomplete.
+ struct GetLog : boost::statechart::state< GetLog, Peering >, NamedState {
+   using reactions = boost::mpl::list<
+     boost::statechart::custom_reaction< QueryState >,
+     boost::statechart::custom_reaction< QueryUnfound >,
+     boost::statechart::custom_reaction< MLogRec >,
+     boost::statechart::custom_reaction< GotLog >,
+     boost::statechart::custom_reaction< AdvMap >,
+     boost::statechart::transition< NeedActingChange, WaitActingChange >,
+     boost::statechart::transition< IsIncomplete, Incomplete >
+   >;
+
+   pg_shard_t auth_log_shard;
+   boost::intrusive_ptr<MOSDPGLog> msg;
+
+   explicit GetLog(my_context ctx);
+   void exit();
+
+   boost::statechart::result react(const QueryState&);
+   boost::statechart::result react(const QueryUnfound&);
+   boost::statechart::result react(const MLogRec&);
+   boost::statechart::result react(const GotLog&);
+   boost::statechart::result react(const AdvMap&);
+ };
+
+ struct WaitUpThru;
+
+ /// Substate of Peering; NeedUpThru moves on to WaitUpThru.
+ struct GetMissing : boost::statechart::state< GetMissing, Peering >, NamedState {
+   using reactions = boost::mpl::list<
+     boost::statechart::custom_reaction< QueryState >,
+     boost::statechart::custom_reaction< QueryUnfound >,
+     boost::statechart::custom_reaction< MLogRec >,
+     boost::statechart::transition< NeedUpThru, WaitUpThru >
+   >;
+
+   std::set<pg_shard_t> peer_missing_requested;
+
+   explicit GetMissing(my_context ctx);
+   void exit();
+
+   boost::statechart::result react(const QueryState&);
+   boost::statechart::result react(const QueryUnfound&);
+   boost::statechart::result react(const MLogRec&);
+ };
+
+ /// Substate of Peering; every listed event has a custom handler.
+ struct WaitUpThru : boost::statechart::state< WaitUpThru, Peering >, NamedState {
+   using reactions = boost::mpl::list<
+     boost::statechart::custom_reaction< QueryState >,
+     boost::statechart::custom_reaction< QueryUnfound >,
+     boost::statechart::custom_reaction< ActMap >,
+     boost::statechart::custom_reaction< MLogRec >
+   >;
+
+   explicit WaitUpThru(my_context ctx);
+   void exit();
+
+   boost::statechart::result react(const QueryState&);
+   boost::statechart::result react(const QueryUnfound&);
+   boost::statechart::result react(const ActMap&);
+   boost::statechart::result react(const MLogRec&);
+ };
+
+ /// Substate of Peering; every listed event has a custom handler.
+ struct Down : boost::statechart::state< Down, Peering >, NamedState {
+   using reactions = boost::mpl::list<
+     boost::statechart::custom_reaction< QueryState >,
+     boost::statechart::custom_reaction< QueryUnfound >,
+     boost::statechart::custom_reaction< MNotifyRec >
+   >;
+
+   explicit Down(my_context ctx);
+   void exit();
+
+   boost::statechart::result react(const QueryState&);
+   boost::statechart::result react(const QueryUnfound&);
+   boost::statechart::result react(const MNotifyRec&);
+ };
+
+ /// Substate of Peering; every listed event has a custom handler.
+ struct Incomplete : boost::statechart::state< Incomplete, Peering >, NamedState {
+   explicit Incomplete(my_context ctx);
+   void exit();
+
+   using reactions = boost::mpl::list<
+     boost::statechart::custom_reaction< AdvMap >,
+     boost::statechart::custom_reaction< MNotifyRec >,
+     boost::statechart::custom_reaction< QueryUnfound >,
+     boost::statechart::custom_reaction< QueryState >
+   >;
+
+   boost::statechart::result react(const AdvMap&);
+   boost::statechart::result react(const MNotifyRec&);
+   boost::statechart::result react(const QueryUnfound&);
+   boost::statechart::result react(const QueryState&);
+ };
+
+ PGStateHistory state_history;
+ CephContext* cct;
+ spg_t spgid;
+ DoutPrefixProvider *dpp;
+ PeeringListener *pl;
+
+ /// context passed in by state machine caller
+ PeeringCtx *orig_ctx;
+
+ /// populated if we are buffering messages pending a flush
+ std::optional<BufferedRecoveryMessages> messages_pending_flush;
+
+ /**
+ * populated between start_handle() and end_handle(), points into
+ * the message lists for messages_pending_flush while blocking messages
+ * or into orig_ctx otherwise
+ */
+ std::optional<PeeringCtxWrapper> rctx;
+
+ /**
+ * OSDMap state
+ */
+ OSDMapRef osdmap_ref; ///< Reference to current OSDMap
+ PGPool pool; ///< Current pool state
+ epoch_t last_persisted_osdmap = 0; ///< Last osdmap epoch persisted
+
+
+ /**
+ * Peering state information
+ */
+ int role = -1; ///< 0 = primary, 1 = replica, -1=none.
+ uint64_t state = 0; ///< PG_STATE_*
+
+ pg_shard_t primary; ///< id/shard of primary
+ pg_shard_t pg_whoami; ///< my id/shard
+ pg_shard_t up_primary; ///< id/shard of primary of up set
+ std::vector<int> up; ///< crush mapping without temp pgs
+ std::set<pg_shard_t> upset; ///< up in set form
+ std::vector<int> acting; ///< actual acting set for the current interval
+ std::set<pg_shard_t> actingset; ///< acting in set form
+
+ /// union of acting, recovery, and backfill targets
+ std::set<pg_shard_t> acting_recovery_backfill;
+
+ std::vector<HeartbeatStampsRef> hb_stamps;
+
+ ceph::signedspan readable_interval = ceph::signedspan::zero();
+
+ /// how long we can service reads in this interval
+ ceph::signedspan readable_until = ceph::signedspan::zero();
+
+ /// upper bound on any acting OSDs' readable_until in this interval
+ ceph::signedspan readable_until_ub = ceph::signedspan::zero();
+
+ /// upper bound from prior interval(s)
+ ceph::signedspan prior_readable_until_ub = ceph::signedspan::zero();
+
+ /// pg instances from prior interval(s) that may still be readable
+ std::set<int> prior_readable_down_osds;
+
+ /// [replica] upper bound we got from the primary (primary's clock)
+ ceph::signedspan readable_until_ub_from_primary = ceph::signedspan::zero();
+
+ /// [primary] last upper bound shared by primary to replicas
+ ceph::signedspan readable_until_ub_sent = ceph::signedspan::zero();
+
+ /// [primary] readable ub acked by acting set members
+ std::vector<ceph::signedspan> acting_readable_until_ub;
+
+ bool send_notify = false; ///< True if a notify needs to be sent to the primary
+
+ bool dirty_info = false; ///< small info structure on disk out of date
+ bool dirty_big_info = false; ///< big info structure on disk out of date
+
+ pg_info_t info; ///< current pg info
+ pg_info_t last_written_info; ///< last written info
+ PastIntervals past_intervals; ///< information about prior pg mappings
+ PGLog pg_log; ///< pg log
+
+ epoch_t last_peering_reset = 0; ///< epoch of last peering reset
+
+ /// last_update that has committed; ONLY DEFINED WHEN is_active()
+ eversion_t last_update_ondisk;
+ eversion_t last_complete_ondisk; ///< last_complete that has committed.
+ eversion_t last_update_applied; ///< last_update readable
+ /// last version to which rollback_info trimming has been applied
+ eversion_t last_rollback_info_trimmed_to_applied;
+
+ /// Counter to determine when pending flushes have completed
+ unsigned flushes_in_progress = 0;
+
+ /**
+ * Primary state
+ */
+ std::set<pg_shard_t> stray_set; ///< non-acting osds that have PG data.
+ std::map<pg_shard_t, pg_info_t> peer_info; ///< info from peers (stray or prior)
+ std::map<pg_shard_t, int64_t> peer_bytes; ///< Peer's num_bytes from peer_info
+ std::set<pg_shard_t> peer_purged; ///< peers purged
+ std::map<pg_shard_t, pg_missing_t> peer_missing; ///< peer missing sets
+ std::set<pg_shard_t> peer_log_requested; ///< shards whose logs I've requested
+ std::set<pg_shard_t> peer_missing_requested; ///< missing sets requested
+
+ /// features supported by all peers
+ uint64_t peer_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
+ /// features supported by acting set
+ uint64_t acting_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
+ /// features supported by up and acting
+ uint64_t upacting_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
+
+ /// most recently consumed osdmap's require_osd_version
+ ceph_release_t last_require_osd_release;
+
+ std::vector<int> want_acting; ///< non-empty while peering needs a new acting set
+
+ // acting_recovery_backfill contains shards that are acting,
+ // async recovery targets, or backfill targets.
+ // peer_last_complete_ondisk holds a last_complete_ondisk version per shard;
+ // calc_min_last_complete_ondisk() looks up every non-primary member of
+ // acting_recovery_backfill in it.
+ std::map<pg_shard_t,eversion_t> peer_last_complete_ondisk;
+
+ /// up: min over last_complete_ondisk, peer_last_complete_ondisk
+ eversion_t min_last_complete_ondisk;
+ /// point to which the log should be trimmed
+ eversion_t pg_trim_to;
+
+ std::set<int> blocked_by; ///< osds we are blocked by (for pg stats)
+
+ bool need_up_thru = false; ///< true if osdmap with updated up_thru needed
+
+ /// I deleted these strays; ignore racing PGInfo from them
+ // NOTE(review): the sentence above reads like a description of peer_purged,
+ // not of peer_activated -- confirm which member it belongs to.
+ std::set<pg_shard_t> peer_activated;
+
+ std::set<pg_shard_t> backfill_targets; ///< osds to be backfilled
+ std::set<pg_shard_t> async_recovery_targets; ///< osds to be async recovered
+
+ /// osds which might have objects on them which are unfound on the primary
+ std::set<pg_shard_t> might_have_unfound;
+
+ bool deleting = false; ///< true while removing, or while the OSD is shutting down
+ std::atomic<bool> deleted = {false}; ///< true once deletion complete
+
+ MissingLoc missing_loc; ///< information about missing objects
+
+ bool backfill_reserved = false;
+ bool backfill_reserving = false;
+
+ PeeringMachine machine;
+
+ /// Replace osdmap_ref with newmap; the by-value argument is moved into place.
+ void update_osdmap_ref(OSDMapRef newmap) {
+ osdmap_ref = std::move(newmap);
+ }
+
+ void update_heartbeat_peers();
+ void query_unfound(Formatter *f, std::string state);
+ bool proc_replica_info(
+ pg_shard_t from, const pg_info_t &oinfo, epoch_t send_epoch);
+ void remove_down_peer_info(const OSDMapRef &osdmap);
+ void check_recovery_sources(const OSDMapRef& map);
+ void set_last_peering_reset();
+ void check_full_transition(OSDMapRef lastmap, OSDMapRef osdmap);
+ bool should_restart_peering(
+ int newupprimary,
+ int newactingprimary,
+ const std::vector<int>& newup,
+ const std::vector<int>& newacting,
+ OSDMapRef lastmap,
+ OSDMapRef osdmap);
+ void start_peering_interval(
+ const OSDMapRef lastmap,
+ const std::vector<int>& newup, int up_primary,
+ const std::vector<int>& newacting, int acting_primary,
+ ObjectStore::Transaction &t);
+ void on_new_interval();
+ void clear_recovery_state();
+ void clear_primary_state();
+ void check_past_interval_bounds() const;
+ bool set_force_recovery(bool b);
+ bool set_force_backfill(bool b);
+
+ /// clip calculated priority to reasonable range
+ int clamp_recovery_priority(int prio, int pool_recovery_prio, int max);
+ /// get log recovery reservation priority
+ unsigned get_recovery_priority();
+ /// get backfill reservation priority
+ unsigned get_backfill_priority();
+ /// get priority for pg deletion
+ unsigned get_delete_priority();
+
+public:
+ /**
+ * recovery_msg_priority_t
+ *
+ * Defines priority values for use with recovery messages. The values are
+ * chosen to be reasonable for wpq during upgrade scenarios, but are
+ * actually translated into a class in PGRecoveryMsg::get_scheduler_class()
+ */
+ enum recovery_msg_priority_t : int {
+ FORCED = 20,
+ UNDERSIZED = 15,
+ DEGRADED = 10,
+ BEST_EFFORT = 5
+ };
+
+ /// get message priority for recovery messages
+ int get_recovery_op_priority() const {
+   const bool using_mclock = (cct->_conf->osd_op_queue == "mclock_scheduler");
+
+   if (!using_mclock) {
+     /* For WeightedPriorityQueue, we use pool or osd config settings to
+      * statically set the priority for recovery messages. This special
+      * handling should probably be removed after Reef */
+     int64_t pool_prio = 0;
+     pool.info.opts.get(pool_opts_t::RECOVERY_OP_PRIORITY, &pool_prio);
+     return pool_prio > 0 ? pool_prio : cct->_conf->osd_recovery_op_priority;
+   }
+
+   /* For mclock, we use special priority values which will be
+    * translated into op classes within PGRecoveryMsg::get_scheduler_class
+    */
+   if (is_forced_recovery_or_backfill()) {
+     return recovery_msg_priority_t::FORCED;
+   }
+   if (is_undersized()) {
+     return recovery_msg_priority_t::UNDERSIZED;
+   }
+   if (is_degraded()) {
+     return recovery_msg_priority_t::DEGRADED;
+   }
+   return recovery_msg_priority_t::BEST_EFFORT;
+ }
+
+private:
+ bool check_prior_readable_down_osds(const OSDMapRef& map);
+
+ bool adjust_need_up_thru(const OSDMapRef osdmap);
+ PastIntervals::PriorSet build_prior();
+
+ void reject_reservation();
+
+ // acting set
+ std::map<pg_shard_t, pg_info_t>::const_iterator find_best_info(
+ const std::map<pg_shard_t, pg_info_t> &infos,
+ bool restrict_to_up_acting,
+ bool *history_les_bound) const;
+
+ static void calc_ec_acting(
+ std::map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
+ unsigned size,
+ const std::vector<int> &acting,
+ const std::vector<int> &up,
+ const std::map<pg_shard_t, pg_info_t> &all_info,
+ bool restrict_to_up_acting,
+ std::vector<int> *want,
+ std::set<pg_shard_t> *backfill,
+ std::set<pg_shard_t> *acting_backfill,
+ std::ostream &ss);
+
+ static std::pair<std::map<pg_shard_t, pg_info_t>::const_iterator, eversion_t>
+ select_replicated_primary(
+ std::map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
+ uint64_t force_auth_primary_missing_objects,
+ const std::vector<int> &up,
+ pg_shard_t up_primary,
+ const std::map<pg_shard_t, pg_info_t> &all_info,
+ const OSDMapRef osdmap,
+ std::ostream &ss);
+
+ static void calc_replicated_acting(
+ std::map<pg_shard_t, pg_info_t>::const_iterator primary_shard,
+ eversion_t oldest_auth_log_entry,
+ unsigned size,
+ const std::vector<int> &acting,
+ const std::vector<int> &up,
+ pg_shard_t up_primary,
+ const std::map<pg_shard_t, pg_info_t> &all_info,
+ bool restrict_to_up_acting,
+ std::vector<int> *want,
+ std::set<pg_shard_t> *backfill,
+ std::set<pg_shard_t> *acting_backfill,
+ const OSDMapRef osdmap,
+ const PGPool& pool,
+ std::ostream &ss);
+ static void calc_replicated_acting_stretch(
+ std::map<pg_shard_t, pg_info_t>::const_iterator primary_shard,
+ eversion_t oldest_auth_log_entry,
+ unsigned size,
+ const std::vector<int> &acting,
+ const std::vector<int> &up,
+ pg_shard_t up_primary,
+ const std::map<pg_shard_t, pg_info_t> &all_info,
+ bool restrict_to_up_acting,
+ std::vector<int> *want,
+ std::set<pg_shard_t> *backfill,
+ std::set<pg_shard_t> *acting_backfill,
+ const OSDMapRef osdmap,
+ const PGPool& pool,
+ std::ostream &ss);
+
+ void choose_async_recovery_ec(
+ const std::map<pg_shard_t, pg_info_t> &all_info,
+ const pg_info_t &auth_info,
+ std::vector<int> *want,
+ std::set<pg_shard_t> *async_recovery,
+ const OSDMapRef osdmap) const;
+ void choose_async_recovery_replicated(
+ const std::map<pg_shard_t, pg_info_t> &all_info,
+ const pg_info_t &auth_info,
+ std::vector<int> *want,
+ std::set<pg_shard_t> *async_recovery,
+ const OSDMapRef osdmap) const;
+
+ bool recoverable(const std::vector<int> &want) const;
+ bool choose_acting(pg_shard_t &auth_log_shard,
+ bool restrict_to_up_acting,
+ bool *history_les_bound,
+ bool request_pg_temp_change_only = false);
+
+ bool search_for_missing(
+ const pg_info_t &oinfo, const pg_missing_t &omissing,
+ pg_shard_t fromosd,
+ PeeringCtxWrapper &rctx);
+ void build_might_have_unfound();
+ void log_weirdness();
+ void activate(
+ ObjectStore::Transaction& t,
+ epoch_t activation_epoch,
+ PeeringCtxWrapper &ctx);
+
+ void rewind_divergent_log(ObjectStore::Transaction& t, eversion_t newhead);
+ void merge_log(
+ ObjectStore::Transaction& t, pg_info_t &oinfo,
+ pg_log_t&& olog, pg_shard_t from);
+
+ void proc_primary_info(ObjectStore::Transaction &t, const pg_info_t &info);
+ void proc_master_log(ObjectStore::Transaction& t, pg_info_t &oinfo,
+ pg_log_t&& olog, pg_missing_t&& omissing,
+ pg_shard_t from);
+ void proc_replica_log(pg_info_t &oinfo, const pg_log_t &olog,
+ pg_missing_t&& omissing, pg_shard_t from);
+
+ /**
+  * Recompute min_last_complete_ondisk.
+  *
+  * The result is the minimum of our own last_complete_ondisk and the value
+  * recorded in peer_last_complete_ondisk for every non-primary shard of
+  * acting_recovery_backfill. If any such shard has no recorded value yet,
+  * min_last_complete_ondisk is left unchanged.
+  *
+  * Asserts that acting_recovery_backfill is not empty.
+  */
+ void calc_min_last_complete_ondisk() {
+   ceph_assert(!acting_recovery_backfill.empty());
+   eversion_t min = last_complete_ondisk;
+   for (const auto& shard : acting_recovery_backfill) {
+     if (shard == get_primary())
+       continue;
+     // one find() instead of count() followed by operator[]
+     const auto peer = peer_last_complete_ondisk.find(shard);
+     if (peer == peer_last_complete_ondisk.end())
+       return; // we don't have complete info
+     if (peer->second < min)
+       min = peer->second;
+   }
+   // storing an equal value is a no-op, so no equality check is needed
+   min_last_complete_ondisk = min;
+ }
+
+ void fulfill_info(
+ pg_shard_t from, const pg_query_t &query,
+ std::pair<pg_shard_t, pg_info_t> &notify_info);
+ void fulfill_log(
+ pg_shard_t from, const pg_query_t &query, epoch_t query_epoch);
+ void fulfill_query(const MQuery& q, PeeringCtxWrapper &rctx);
+
+ void try_mark_clean();
+
+ void update_blocked_by();
+ void update_calc_stats();
+
+ void add_log_entry(const pg_log_entry_t& e, bool applied);
+
+ void calc_trim_to();
+ void calc_trim_to_aggressive();
+
+public:
+ PeeringState(
+ CephContext *cct,
+ pg_shard_t pg_whoami,
+ spg_t spgid,
+ const PGPool &pool,
+ OSDMapRef curmap,
+ DoutPrefixProvider *dpp,
+ PeeringListener *pl);
+
+ /// Process evt: hand it to the peering state machine, bracketed by
+ /// start_handle(rctx) and end_handle().
+ void handle_event(const boost::statechart::event_base &evt,
+ PeeringCtx *rctx) {
+ start_handle(rctx);
+ machine.process_event(evt);
+ end_handle();
+ }
+
+ /// Process evt: same sequence as the event_base overload, but the event
+ /// handed to the state machine is obtained from evt->get_event().
+ void handle_event(PGPeeringEventRef evt,
+ PeeringCtx *rctx) {
+ start_handle(rctx);
+ machine.process_event(evt->get_event());
+ end_handle();
+ }
+
+ /// Init fresh instance of PG
+ void init(
+ int role,
+ const std::vector<int>& newup, int new_up_primary,
+ const std::vector<int>& newacting, int new_acting_primary,
+ const pg_history_t& history,
+ const PastIntervals& pi,
+ ObjectStore::Transaction &t);
+
+ /// Init pg instance from disk state
+ ///
+ /// Takes ownership of info_from_disk and past_intervals_from_disk, then
+ /// invokes pg_log_init with a reference to pg_log. The callable's return
+ /// value is passed back to the caller.
+ template <typename F>
+ auto init_from_disk_state(
+ pg_info_t &&info_from_disk,
+ PastIntervals &&past_intervals_from_disk,
+ F &&pg_log_init) {
+ info = std::move(info_from_disk);
+ // what was just loaded is by definition what was last written
+ last_written_info = info;
+ past_intervals = std::move(past_intervals_from_disk);
+ auto ret = pg_log_init(pg_log);
+ // runs after the log has been initialised by the callable
+ log_weirdness();
+ return ret;
+ }
+
+ /// Set initial primary/acting
+ void init_primary_up_acting(
+ const std::vector<int> &newup,
+ const std::vector<int> &newacting,
+ int new_up_primary,
+ int new_acting_primary);
+ void init_hb_stamps();
+
+ /// Set initial role
+ void set_role(int r) {
+ role = r;
+ }
+
+ /// Set predicates used for determining readable and recoverable (forwarded to missing_loc)
+ void set_backend_predicates(
+ IsPGReadablePredicate *is_readable,
+ IsPGRecoverablePredicate *is_recoverable) {
+ missing_loc.set_backend_predicates(is_readable, is_recoverable);
+ }
+
+ /// Send current pg_info to peers
+ void share_pg_info();
+
+ /// Get stats for child pgs
+ void start_split_stats(
+ const std::set<spg_t>& childpgs, std::vector<object_stat_sum_t> *out);
+
+ /// Update new child with stats
+ void finish_split_stats(
+ const object_stat_sum_t& stats, ObjectStore::Transaction &t);
+
+ /// Split state for child_pgid into *child
+ void split_into(
+ pg_t child_pgid, PeeringState *child, unsigned split_bits);
+
+ /// Merge state from sources
+ void merge_from(
+ std::map<spg_t,PeeringState *>& sources,
+ PeeringCtx &rctx,
+ unsigned split_bits,
+ const pg_merge_meta_t& last_pg_merge_meta);
+
+ /// Permit stray replicas to purge now unnecessary state
+ void purge_strays();
+
+ /**
+ * update_stats
+ *
+ * Mechanism for updating stats and/or history. Pass t to mark
+ * dirty and write out. f should return true if stats should be
+ * published to the osd.
+ */
+ void update_stats(
+ std::function<bool(pg_history_t &, pg_stat_t &)> f,
+ ObjectStore::Transaction *t = nullptr);
+
+ void update_stats_wo_resched(
+ std::function<void(pg_history_t &, pg_stat_t &)> f);
+
+ /**
+ * adjust_purged_snaps
+ *
+ * Mechanism for updating purged_snaps. Marks dirty_info, big_dirty_info.
+ */
+ void adjust_purged_snaps(
+ std::function<void(interval_set<snapid_t> &snaps)> f);
+
+ /// Updates info.hit_set to hset_history, does not dirty
+ void update_hset(const pg_hit_set_history_t &hset_history);
+
+ /// Get all pg_shards that need recovery
+ std::vector<pg_shard_t> get_replica_recovery_order() const;
+
+ /**
+ * update_history
+ *
+ * Merges new_history into info.history clearing past_intervals and
+ * dirtying as needed.
+ *
+ * Calls PeeringListener::on_info_history_change()
+ */
+ void update_history(const pg_history_t& new_history);
+
+ /**
+ * prepare_stats_for_publish
+ *
+ * Returns updated pg_stat_t if stats have changed since
+ * pg_stats_publish adding in unstable_stats.
+ *
+ * @param pg_stats_publish the latest pg_stat possessed by caller
+ * @param unstable_stats additional stats which should be included in the
+ * returned stats
+ * @return the up-to-date stats if they differ from the specified
+ * @c pg_stats_publish
+ */
+ std::optional<pg_stat_t> prepare_stats_for_publish(
+ const std::optional<pg_stat_t> &pg_stats_publish,
+ const object_stat_collection_t &unstable_stats);
+
+ /**
+ * Merge entries updating missing as necessary on all
+ * acting_recovery_backfill logs and missings (also missing_loc)
+ */
+ bool append_log_entries_update_missing(
+ const mempool::osd_pglog::list<pg_log_entry_t> &entries,
+ ObjectStore::Transaction &t,
+ std::optional<eversion_t> trim_to,
+ std::optional<eversion_t> roll_forward_to);
+
+ void append_log_with_trim_to_updated( // append_log() wrapper that calls update_trim_to() first
+ std::vector<pg_log_entry_t>&& log_entries,
+ eversion_t roll_forward_to,
+ ObjectStore::Transaction &t,
+ bool transaction_applied,
+ bool async) {
+ update_trim_to(); // refresh the trim boundary before pg_trim_to is passed below
+ append_log(std::move(log_entries), pg_trim_to, roll_forward_to,
+ min_last_complete_ondisk, t, transaction_applied, async);
+ }
+
+ /**
+ * Updates local log to reflect new write from primary.
+ */
+ void append_log(
+ std::vector<pg_log_entry_t>&& logv,
+ eversion_t trim_to,
+ eversion_t roll_forward_to,
+ eversion_t min_last_complete_ondisk,
+ ObjectStore::Transaction &t,
+ bool transaction_applied,
+ bool async);
+
+ /**
+ * retrieve the min last_backfill among backfill targets
+ */
+ hobject_t earliest_backfill() const;
+
+
+ /**
+ * Updates local log/missing to reflect new oob log update from primary
+ */
+ void merge_new_log_entries(
+ const mempool::osd_pglog::list<pg_log_entry_t> &entries,
+ ObjectStore::Transaction &t,
+ std::optional<eversion_t> trim_to,
+ std::optional<eversion_t> roll_forward_to);
+
+ /// Update missing set to reflect e (TODOSAM: not sure why this is needed)
+ void add_local_next_event(const pg_log_entry_t& e) {
+ pg_log.missing_add_next_entry(e);
+ }
+
+ /// Recompute the log trim boundary with the strategy the osdmap selects
+ void update_trim_to() {
+   // CEPH_OSDMAP_PGLOG_HARDLIMIT switches to the aggressive calculation.
+   if (get_osdmap()->test_flag(CEPH_OSDMAP_PGLOG_HARDLIMIT)) {
+     calc_trim_to_aggressive();
+   } else {
+     calc_trim_to();
+   }
+ }
+
+ /// Pre-process pending update on hoid represented by logv
+ void pre_submit_op(
+ const hobject_t &hoid,
+ const std::vector<pg_log_entry_t>& logv,
+ eversion_t at_version);
+
+ /// Signal that oid has been locally recovered to version v
+ void recover_got(
+ const hobject_t &oid, eversion_t v,
+ bool is_delete,
+ ObjectStore::Transaction &t);
+
+ /// Signal that oid has been recovered on peer to version
+ void on_peer_recover(
+ pg_shard_t peer,
+ const hobject_t &soid,
+ const eversion_t &version);
+
+ /// Notify that soid is being recovered on peer
+ void begin_peer_recover(
+ pg_shard_t peer,
+ const hobject_t soid); // NOTE(review): soid is taken by value, while neighbouring methods take const hobject_t& — confirm this is intended
+
+ /// Pull missing sets from all candidate peers
+ bool discover_all_missing(
+ BufferedRecoveryMessages &rctx);
+
+ /// Notify that hoid has been fully recovered: adds stat_diff to info.stats and informs missing_loc
+ void object_recovered(
+ const hobject_t &hoid,
+ const object_stat_sum_t &stat_diff) {
+ info.stats.stats.sum.add(stat_diff);
+ missing_loc.recovered(hoid);
+ }
+
+ /// Update info/stats to reflect backfill progress
+ void update_backfill_progress(
+ const hobject_t &updated_backfill,
+ const pg_stat_t &updated_stats,
+ bool preserve_local_num_bytes,
+ ObjectStore::Transaction &t);
+
+ /// Update info/stats to reflect completed backfill on hoid
+ void update_complete_backfill_object_stats(
+ const hobject_t &hoid,
+ const pg_stat_t &stats);
+
+ /// Update last_backfill for peer to new_last_backfill
+ void update_peer_last_backfill(
+ pg_shard_t peer,
+ const hobject_t &new_last_backfill);
+
+ /// Update info.stats with delta_stats for operation on soid
+ void apply_op_stats(
+ const hobject_t &soid,
+ const object_stat_sum_t &delta_stats);
+
+ /**
+ * force_object_missing
+ *
+ * Force oid on peer to be missing at version. If the object does not
+ * currently need recovery, either candidates if provided or the remainder
+ * of the acting set will be deemed to have the object.
+ */
+ void force_object_missing(
+ const pg_shard_t &peer,
+ const hobject_t &oid,
+ eversion_t version) {
+ force_object_missing(std::set<pg_shard_t>{peer}, oid, version); // single-peer convenience: wraps peer in a one-element set
+ }
+ void force_object_missing(
+ const std::set<pg_shard_t> &peer,
+ const hobject_t &oid,
+ eversion_t version);
+
+ /// Update state prior to backfilling soid on targets
+ void prepare_backfill_for_missing(
+ const hobject_t &soid,
+ const eversion_t &version,
+ const std::vector<pg_shard_t> &targets);
+
+ /// Set targets with the right version for revert (see recover_primary)
+ void set_revert_with_targets(
+ const hobject_t &soid,
+ const std::set<pg_shard_t> &good_peers);
+
+ /// Update lcod for fromosd
+ void update_peer_last_complete_ondisk(
+ pg_shard_t fromosd,
+ eversion_t lcod) {
+ peer_last_complete_ondisk[fromosd] = lcod;
+ }
+
+ /// Update lcod
+ void update_last_complete_ondisk(
+ eversion_t lcod) {
+ last_complete_ondisk = lcod;
+ }
+
+ /// Update state to reflect recovery up to version
+ void recovery_committed_to(eversion_t version);
+
+ /// Mark recovery complete
+ void local_recovery_complete() {
+ info.last_complete = info.last_update;
+ }
+
+ /// Update last_requested pointer to v
+ void set_last_requested(version_t v) {
+ pg_log.set_last_requested(v);
+ }
+
+ /// Write dirty state to t
+ void write_if_dirty(ObjectStore::Transaction& t);
+
+ /// Mark write completed to v with persisted lc
+ void complete_write(eversion_t v, eversion_t lc);
+
+ /// Update local write applied pointer
+ void local_write_applied(eversion_t v) {
+ last_update_applied = v;
+ }
+
+ /// Updates peering state with new map
+ void advance_map(
+ OSDMapRef osdmap, ///< [in] new osdmap
+ OSDMapRef lastmap, ///< [in] prev osdmap
+ std::vector<int>& newup, ///< [in] new up set
+ int up_primary, ///< [in] new up primary
+ std::vector<int>& newacting, ///< [in] new acting
+ int acting_primary, ///< [in] new acting primary
+ PeeringCtx &rctx ///< [out] recovery context
+ );
+
+ /// Activates most recently updated map
+ void activate_map(
+ PeeringCtx &rctx ///< [out] recovery context
+ );
+
+ /// Resets last_persisted_osdmap to 0 and marks both dirty_info and dirty_big_info
+ void reset_last_persisted() {
+ last_persisted_osdmap = 0;
+ dirty_info = true;
+ dirty_big_info = true;
+ }
+
+ /// Signal shutdown beginning (sets the flag reported by is_deleting())
+ void shutdown() {
+ deleting = true;
+ }
+
+ /// Signal shutdown complete (sets the flag reported by is_deleted())
+ void set_delete_complete() {
+ deleted = true;
+ }
+
+ /// Dirty info and write out
+ void force_write_state(ObjectStore::Transaction &t) {
+ dirty_info = true;
+ dirty_big_info = true;
+ write_if_dirty(t);
+ }
+
+ /// Get current interval's readable_until
+ ceph::signedspan get_readable_until() const {
+ return readable_until;
+ }
+
+ /// Get prior intervals' readable_until upper bound
+ ceph::signedspan get_prior_readable_until_ub() const {
+ return prior_readable_until_ub;
+ }
+
+ /// Get prior intervals' readable_until down OSDs of note
+ const std::set<int>& get_prior_readable_down_osds() const {
+ return prior_readable_down_osds;
+ }
+
+ /// Reset prior intervals' readable_until upper bound (e.g., bc it passed)
+ void clear_prior_readable_until_ub() {
+ prior_readable_until_ub = ceph::signedspan::zero();
+ prior_readable_down_osds.clear();
+ info.history.prior_readable_until_ub = ceph::signedspan::zero();
+ }
+
+ void renew_lease(ceph::signedspan now) { // moves readable_until_ub_sent (the bound get_lease() reports) to now + readable_interval
+ bool was_min = (readable_until_ub == readable_until); // sampled before the update below
+ readable_until_ub_sent = now + readable_interval;
+ if (was_min) { // only recalc when the two bounds were equal on entry
+ recalc_readable_until();
+ }
+ }
+
+ void send_lease();
+ void schedule_renew_lease();
+
+ pg_lease_t get_lease() { // builds a lease from the current readable_until, readable_until_ub_sent and readable_interval
+ return pg_lease_t(readable_until, readable_until_ub_sent, readable_interval);
+ }
+
+ void proc_lease(const pg_lease_t& l);
+ void proc_lease_ack(int from, const pg_lease_ack_t& la);
+ void proc_renew_lease();
+
+ pg_lease_ack_t get_lease_ack() { // builds an ack carrying readable_until_ub_from_primary
+ return pg_lease_ack_t(readable_until_ub_from_primary);
+ }
+
+ /// [primary] recalc readable_until[_ub] for the current interval
+ void recalc_readable_until();
+
+ //============================ const helpers ================================
+ const char *get_current_state() const {
+ return state_history.get_current_state();
+ }
+ epoch_t get_last_peering_reset() const {
+ return last_peering_reset;
+ }
+ eversion_t get_last_rollback_info_trimmed_to_applied() const {
+ return last_rollback_info_trimmed_to_applied;
+ }
+ /// Returns stable reference to internal pool structure
+ const PGPool &get_pgpool() const {
+ return pool;
+ }
+ /// Returns reference to current osdmap
+ const OSDMapRef &get_osdmap() const {
+ ceph_assert(osdmap_ref);
+ return osdmap_ref;
+ }
+ /// Returns epoch of current osdmap
+ epoch_t get_osdmap_epoch() const {
+ return get_osdmap()->get_epoch();
+ }
+
+ bool is_ec_pg() const override {
+ return pool.info.is_erasure();
+ }
+ int get_pg_size() const override {
+ return pool.info.size;
+ }
+ bool is_deleting() const {
+ return deleting;
+ }
+ bool is_deleted() const {
+ return deleted;
+ }
+ const std::set<pg_shard_t> &get_upset() const override {
+ return upset;
+ }
+ bool is_acting_recovery_backfill(pg_shard_t osd) const {
+ return acting_recovery_backfill.count(osd);
+ }
+ bool is_acting(pg_shard_t osd) const {
+ return has_shard(pool.info.is_erasure(), acting, osd);
+ }
+ bool is_up(pg_shard_t osd) const {
+ return has_shard(pool.info.is_erasure(), up, osd);
+ }
+ static bool has_shard(bool ec, const std::vector<int>& v, pg_shard_t osd) {
+   if (!ec) {
+     // Not erasure coded: position is irrelevant, any slot holding the osd id matches.
+     return std::find(v.begin(), v.end(), osd.osd) != v.end();
+   }
+   // Erasure coded: the osd must sit at the slot named by its shard id.
+   const auto slot = static_cast<unsigned>(osd.shard);
+   return slot < v.size() && v[slot] == osd.osd;
+ }
+ const PastIntervals& get_past_intervals() const {
+ return past_intervals;
+ }
+ /// acting osd that is not the primary
+ bool is_nonprimary() const {
+ return role >= 0 && pg_whoami != primary;
+ }
+ /// primary osd
+ bool is_primary() const {
+ return pg_whoami == primary;
+ }
+ bool pg_has_reset_since(epoch_t e) const {
+ return deleted || e < get_last_peering_reset();
+ }
+
+ int get_role() const {
+ return role;
+ }
+ const std::vector<int> &get_acting() const {
+ return acting;
+ }
+ const std::set<pg_shard_t> &get_actingset() const {
+ return actingset;
+ }
+ int get_acting_primary() const {
+ return primary.osd;
+ }
+ pg_shard_t get_primary() const {
+ return primary;
+ }
+ const std::vector<int> &get_up() const {
+ return up;
+ }
+ int get_up_primary() const {
+ return up_primary.osd;
+ }
+
+ bool is_backfill_target(pg_shard_t osd) const {
+ return backfill_targets.count(osd);
+ }
+ const std::set<pg_shard_t> &get_backfill_targets() const {
+ return backfill_targets;
+ }
+ bool is_async_recovery_target(pg_shard_t peer) const {
+ return async_recovery_targets.count(peer);
+ }
+ const std::set<pg_shard_t> &get_async_recovery_targets() const {
+ return async_recovery_targets;
+ }
+ const std::set<pg_shard_t> &get_acting_recovery_backfill() const {
+ return acting_recovery_backfill;
+ }
+
+ const PGLog &get_pg_log() const {
+ return pg_log;
+ }
+
+ bool state_test(uint64_t m) const { return (state & m) != 0; } // true if ANY bit of m is set in state
+ void state_set(uint64_t m) { state |= m; } // turns on every bit of m
+ void state_clear(uint64_t m) { state &= ~m; } // turns off every bit of m
+
+ bool is_complete() const { return info.last_complete == info.last_update; }
+ bool should_send_notify() const { return send_notify; }
+
+ uint64_t get_state() const { return state; }
+ bool is_active() const { return state_test(PG_STATE_ACTIVE); }
+ bool is_activating() const { return state_test(PG_STATE_ACTIVATING); }
+ bool is_peering() const { return state_test(PG_STATE_PEERING); }
+ bool is_down() const { return state_test(PG_STATE_DOWN); }
+ bool is_recovery_unfound() const {
+ return state_test(PG_STATE_RECOVERY_UNFOUND);
+ }
+ bool is_backfilling() const {
+ return state_test(PG_STATE_BACKFILLING);
+ }
+ bool is_backfill_unfound() const {
+ return state_test(PG_STATE_BACKFILL_UNFOUND);
+ }
+ bool is_incomplete() const { return state_test(PG_STATE_INCOMPLETE); }
+ bool is_clean() const { return state_test(PG_STATE_CLEAN); }
+ bool is_degraded() const { return state_test(PG_STATE_DEGRADED); }
+ bool is_undersized() const { return state_test(PG_STATE_UNDERSIZED); }
+ bool is_remapped() const { return state_test(PG_STATE_REMAPPED); }
+ /// True when either the PG_STATE_ACTIVE or the PG_STATE_PEERED bit is set
+ bool is_peered() const {
+   return state_test(PG_STATE_ACTIVE | PG_STATE_PEERED);
+ }
+ bool is_recovering() const { return state_test(PG_STATE_RECOVERING); }
+ bool is_premerge() const { return state_test(PG_STATE_PREMERGE); }
+ bool is_repair() const { return state_test(PG_STATE_REPAIR); }
+ bool is_empty() const { return info.last_update == eversion_t(0,0); }
+
+ bool get_need_up_thru() const {
+ return need_up_thru;
+ }
+
+ /// True when either the forced-recovery or the forced-backfill state bit is set
+ bool is_forced_recovery_or_backfill() const {
+   return state_test(PG_STATE_FORCED_RECOVERY | PG_STATE_FORCED_BACKFILL);
+ }
+
+ bool is_backfill_reserved() const {
+ return backfill_reserved;
+ }
+
+ bool is_backfill_reserving() const {
+ return backfill_reserving;
+ }
+
+ ceph_release_t get_last_require_osd_release() const {
+ return last_require_osd_release;
+ }
+
+ const pg_info_t &get_info() const {
+ return info;
+ }
+
+ const decltype(peer_info) &get_peer_info() const {
+ return peer_info;
+ }
+ const decltype(peer_missing) &get_peer_missing() const {
+ return peer_missing;
+ }
+ /// Returns the missing set for peer.
+ /// For pg_whoami this is the local set held by pg_log; for any other shard
+ /// an entry must already exist in peer_missing. That precondition is checked
+ /// with ceph_assert (as elsewhere in this header) so a missing entry fails
+ /// loudly instead of dereferencing peer_missing.end() in builds where plain
+ /// assert() is compiled out (NDEBUG).
+ const pg_missing_const_i &get_peer_missing(const pg_shard_t &peer) const {
+   if (peer == pg_whoami) {
+     return pg_log.get_missing();
+   }
+   const auto it = peer_missing.find(peer); // one lookup serves both check and access
+   ceph_assert(it != peer_missing.end());
+   return it->second;
+ }
+ /// Returns the pg_info_t recorded for peer.
+ /// peer must already be present in peer_info (see has_peer_info()). The
+ /// check uses ceph_assert (as elsewhere in this header) so a missing entry
+ /// fails loudly instead of dereferencing peer_info.end() in builds where
+ /// plain assert() is compiled out (NDEBUG).
+ const pg_info_t &get_peer_info(pg_shard_t peer) const {
+   const auto it = peer_info.find(peer); // one lookup serves both check and access
+   ceph_assert(it != peer_info.end());
+   return it->second;
+ }
+ bool has_peer_info(pg_shard_t peer) const {
+ return peer_info.count(peer);
+ }
+
+ bool needs_recovery() const;
+ bool needs_backfill() const;
+
+ /**
+ * Returns whether a particular object can be safely read on this replica
+ */
+ bool can_serve_replica_read(const hobject_t &hoid) {
+ ceph_assert(!is_primary());
+ return !pg_log.get_log().has_write_since(
+ hoid, get_min_last_complete_ondisk());
+ }
+
+ /**
+ * Returns whether the current acting set is able to go active
+ * and serve writes. It needs to satisfy min_size and any
+ * applicable stretch cluster constraints.
+ */
+ bool acting_set_writeable() {
+   // The last argument is a null pointer; spelled nullptr (not NULL) to match
+   // the rest of this header.
+   return (actingset.size() >= pool.info.min_size) &&
+     (pool.info.stretch_set_can_peer(acting, *get_osdmap(), nullptr));
+ }
+
+ /**
+ * Returns whether all peers which might have unfound objects have been
+ * queried or marked lost.
+ */
+ bool all_unfound_are_queried_or_lost(const OSDMapRef osdmap) const;
+ bool all_missing_unfound() const {
+   // False when nothing is missing locally; otherwise true only if every
+   // locally missing object is reported unfound by missing_loc.
+   const auto& local_missing = pg_log.get_missing();
+   if (!local_missing.have_missing())
+     return false;
+   const auto& items = local_missing.get_items();
+   return std::all_of(items.begin(), items.end(), [this](const auto& entry) {
+     return missing_loc.is_unfound(entry.first);
+   });
+ }
+
+ bool perform_deletes_during_peering() const { // true exactly when the current osdmap does NOT have CEPH_OSDMAP_RECOVERY_DELETES set
+ return !(get_osdmap()->test_flag(CEPH_OSDMAP_RECOVERY_DELETES));
+ }
+
+
+ bool have_unfound() const {
+ return missing_loc.have_unfound();
+ }
+ uint64_t get_num_unfound() const {
+ return missing_loc.num_unfound();
+ }
+
+ bool have_missing() const {
+ return pg_log.get_missing().num_missing() > 0;
+ }
+ unsigned int get_num_missing() const {
+ return pg_log.get_missing().num_missing();
+ }
+
+ const MissingLoc &get_missing_loc() const {
+ return missing_loc;
+ }
+
+ const MissingLoc::missing_by_count_t &get_missing_by_count() const {
+ return missing_loc.get_missing_by_count();
+ }
+
+ eversion_t get_min_last_complete_ondisk() const {
+ return min_last_complete_ondisk;
+ }
+
+ eversion_t get_pg_trim_to() const {
+ return pg_trim_to;
+ }
+
+ eversion_t get_last_update_applied() const {
+ return last_update_applied;
+ }
+
+ eversion_t get_last_update_ondisk() const {
+ return last_update_ondisk;
+ }
+
+ bool debug_has_dirty_state() const {
+ return dirty_info || dirty_big_info;
+ }
+
+ std::string get_pg_state_string() const {
+ return pg_state_string(state);
+ }
+
+ /// Dump representation of past_intervals to out
+ void print_past_intervals(std::ostream &out) const {
+ out << "[" << past_intervals.get_bounds()
+ << ")/" << past_intervals.size();
+ }
+
+ void dump_history(ceph::Formatter *f) const {
+ state_history.dump(f);
+ }
+
+ /// Dump formatted peering status
+ void dump_peering_state(ceph::Formatter *f);
+
+private:
+ /// Mask feature vector with feature set from new peer
+ void apply_peer_features(uint64_t f) { peer_features &= f; }
+
+ /// Reset feature vector to default
+ void reset_min_peer_features() {
+ peer_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
+ }
+public:
+ /// Get feature vector common to all known peers with this pg
+ uint64_t get_min_peer_features() const { return peer_features; }
+
+ /// Get feature vector common to acting set
+ uint64_t get_min_acting_features() const { return acting_features; }
+
+ /// Get feature vector common to up/acting set
+ uint64_t get_min_upacting_features() const { return upacting_features; }
+
+
+ // Flush control interface
+private:
+ /**
+ * Start additional flush (blocks needs_flush/activation until
+ * complete_flush is called once for each start_flush call as
+ * required by start_flush_on_transaction).
+ */
+ void start_flush(ObjectStore::Transaction &t) {
+ flushes_in_progress++;
+ pl->start_flush_on_transaction(t);
+ }
+public:
+ /// True if there are outstanding flushes
+ bool needs_flush() const {
+ return flushes_in_progress > 0;
+ }
+ /// Must be called once per start_flush
+ void complete_flush();
+
+ friend std::ostream &operator<<(std::ostream &out, const PeeringState &ps);
+};
+
+std::ostream &operator<<(std::ostream &out, const PeeringState &ps);