author     Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-21 11:54:28 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-21 11:54:28 +0000
commit     e6918187568dbd01842d8d1d2c808ce16a894239 (patch)
tree       64f88b554b444a49f656b6c656111a145cbbaa28 /src/crimson/osd
parent     Initial commit. (diff)
Adding upstream version 18.2.2.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/crimson/osd')
-rw-r--r--  src/crimson/osd/CMakeLists.txt | 72
-rw-r--r--  src/crimson/osd/acked_peers.h | 14
-rw-r--r--  src/crimson/osd/backfill_facades.h | 73
-rw-r--r--  src/crimson/osd/backfill_state.cc | 558
-rw-r--r--  src/crimson/osd/backfill_state.h | 382
-rw-r--r--  src/crimson/osd/ec_backend.cc | 37
-rw-r--r--  src/crimson/osd/ec_backend.h | 41
-rw-r--r--  src/crimson/osd/exceptions.h | 46
-rw-r--r--  src/crimson/osd/heartbeat.cc | 819
-rw-r--r--  src/crimson/osd/heartbeat.h | 461
-rw-r--r--  src/crimson/osd/lsan_suppressions.cc | 20
-rw-r--r--  src/crimson/osd/main.cc | 259
-rw-r--r--  src/crimson/osd/main_config_bootstrap_helpers.cc | 265
-rw-r--r--  src/crimson/osd/main_config_bootstrap_helpers.h | 99
-rw-r--r--  src/crimson/osd/objclass.cc | 584
-rw-r--r--  src/crimson/osd/object_context.cc | 85
-rw-r--r--  src/crimson/osd/object_context.h | 276
-rw-r--r--  src/crimson/osd/object_context_loader.cc | 232
-rw-r--r--  src/crimson/osd/object_context_loader.h | 87
-rw-r--r--  src/crimson/osd/ops_executer.cc | 1461
-rw-r--r--  src/crimson/osd/ops_executer.h | 629
-rw-r--r--  src/crimson/osd/osd.cc | 1357
-rw-r--r--  src/crimson/osd/osd.h | 251
-rw-r--r--  src/crimson/osd/osd_connection_priv.h | 27
-rw-r--r--  src/crimson/osd/osd_meta.cc | 98
-rw-r--r--  src/crimson/osd/osd_meta.h | 60
-rw-r--r--  src/crimson/osd/osd_operation.cc | 227
-rw-r--r--  src/crimson/osd/osd_operation.h | 281
-rw-r--r--  src/crimson/osd/osd_operation_external_tracking.h | 307
-rw-r--r--  src/crimson/osd/osd_operations/background_recovery.cc | 207
-rw-r--r--  src/crimson/osd/osd_operations/background_recovery.h | 144
-rw-r--r--  src/crimson/osd/osd_operations/client_request.cc | 388
-rw-r--r--  src/crimson/osd/osd_operations/client_request.h | 281
-rw-r--r--  src/crimson/osd/osd_operations/client_request_common.cc | 64
-rw-r--r--  src/crimson/osd/osd_operations/client_request_common.h | 20
-rw-r--r--  src/crimson/osd/osd_operations/common/pg_pipeline.h | 31
-rw-r--r--  src/crimson/osd/osd_operations/internal_client_request.cc | 130
-rw-r--r--  src/crimson/osd/osd_operations/internal_client_request.h | 68
-rw-r--r--  src/crimson/osd/osd_operations/logmissing_request.cc | 79
-rw-r--r--  src/crimson/osd/osd_operations/logmissing_request.h | 81
-rw-r--r--  src/crimson/osd/osd_operations/logmissing_request_reply.cc | 68
-rw-r--r--  src/crimson/osd/osd_operations/logmissing_request_reply.h | 79
-rw-r--r--  src/crimson/osd/osd_operations/osdop_params.h | 22
-rw-r--r--  src/crimson/osd/osd_operations/peering_event.cc | 190
-rw-r--r--  src/crimson/osd/osd_operations/peering_event.h | 207
-rw-r--r--  src/crimson/osd/osd_operations/pg_advance_map.cc | 130
-rw-r--r--  src/crimson/osd/osd_operations/pg_advance_map.h | 61
-rw-r--r--  src/crimson/osd/osd_operations/recovery_subrequest.cc | 46
-rw-r--r--  src/crimson/osd/osd_operations/recovery_subrequest.h | 81
-rw-r--r--  src/crimson/osd/osd_operations/replicated_request.cc | 80
-rw-r--r--  src/crimson/osd/osd_operations/replicated_request.h | 80
-rw-r--r--  src/crimson/osd/osd_operations/snaptrim_event.cc | 569
-rw-r--r--  src/crimson/osd/osd_operations/snaptrim_event.h | 210
-rw-r--r--  src/crimson/osd/osdmap_gate.cc | 86
-rw-r--r--  src/crimson/osd/osdmap_gate.h | 83
-rw-r--r--  src/crimson/osd/osdmap_service.h | 21
-rw-r--r--  src/crimson/osd/pg.cc | 1544
-rw-r--r--  src/crimson/osd/pg.h | 833
-rw-r--r--  src/crimson/osd/pg_activation_blocker.cc | 36
-rw-r--r--  src/crimson/osd/pg_activation_blocker.h | 35
-rw-r--r--  src/crimson/osd/pg_backend.cc | 1811
-rw-r--r--  src/crimson/osd/pg_backend.h | 448
-rw-r--r--  src/crimson/osd/pg_interval_interrupt_condition.cc | 43
-rw-r--r--  src/crimson/osd/pg_interval_interrupt_condition.h | 56
-rw-r--r--  src/crimson/osd/pg_map.cc | 102
-rw-r--r--  src/crimson/osd/pg_map.h | 201
-rw-r--r--  src/crimson/osd/pg_meta.cc | 110
-rw-r--r--  src/crimson/osd/pg_meta.h | 20
-rw-r--r--  src/crimson/osd/pg_recovery.cc | 569
-rw-r--r--  src/crimson/osd/pg_recovery.h | 118
-rw-r--r--  src/crimson/osd/pg_recovery_listener.h | 39
-rw-r--r--  src/crimson/osd/pg_shard_manager.cc | 108
-rw-r--r--  src/crimson/osd/pg_shard_manager.h | 390
-rw-r--r--  src/crimson/osd/recovery_backend.cc | 328
-rw-r--r--  src/crimson/osd/recovery_backend.h | 233
-rw-r--r--  src/crimson/osd/replicated_backend.cc | 174
-rw-r--r--  src/crimson/osd/replicated_backend.h | 61
-rw-r--r--  src/crimson/osd/replicated_recovery_backend.cc | 1182
-rw-r--r--  src/crimson/osd/replicated_recovery_backend.h | 169
-rw-r--r--  src/crimson/osd/scheduler/mclock_scheduler.cc | 165
-rw-r--r--  src/crimson/osd/scheduler/mclock_scheduler.h | 125
-rw-r--r--  src/crimson/osd/scheduler/scheduler.cc | 181
-rw-r--r--  src/crimson/osd/scheduler/scheduler.h | 82
-rw-r--r--  src/crimson/osd/shard_services.cc | 761
-rw-r--r--  src/crimson/osd/shard_services.h | 589
-rw-r--r--  src/crimson/osd/state.h | 130
-rw-r--r--  src/crimson/osd/stop_signal.h | 83
-rw-r--r--  src/crimson/osd/watch.cc | 354
-rw-r--r--  src/crimson/osd/watch.h | 256
89 files changed, 23950 insertions, 0 deletions
diff --git a/src/crimson/osd/CMakeLists.txt b/src/crimson/osd/CMakeLists.txt
new file mode 100644
index 000000000..f521e0244
--- /dev/null
+++ b/src/crimson/osd/CMakeLists.txt
@@ -0,0 +1,72 @@
+add_executable(crimson-osd
+ backfill_state.cc
+ ec_backend.cc
+ heartbeat.cc
+ lsan_suppressions.cc
+ main.cc
+ main_config_bootstrap_helpers.cc
+ osd.cc
+ osd_meta.cc
+ pg.cc
+ pg_backend.cc
+ pg_meta.cc
+ replicated_backend.cc
+ shard_services.cc
+ pg_shard_manager.cc
+ object_context.cc
+ object_context_loader.cc
+ ops_executer.cc
+ osd_operation.cc
+ osd_operations/client_request.cc
+ osd_operations/client_request_common.cc
+ osd_operations/internal_client_request.cc
+ osd_operations/peering_event.cc
+ osd_operations/pg_advance_map.cc
+ osd_operations/replicated_request.cc
+ osd_operations/logmissing_request.cc
+ osd_operations/logmissing_request_reply.cc
+ osd_operations/background_recovery.cc
+ osd_operations/recovery_subrequest.cc
+ osd_operations/snaptrim_event.cc
+ pg_recovery.cc
+ recovery_backend.cc
+ replicated_recovery_backend.cc
+ scheduler/scheduler.cc
+ scheduler/mclock_scheduler.cc
+ osdmap_gate.cc
+ pg_activation_blocker.cc
+ pg_map.cc
+ pg_interval_interrupt_condition.cc
+ objclass.cc
+ ${PROJECT_SOURCE_DIR}/src/objclass/class_api.cc
+ ${PROJECT_SOURCE_DIR}/src/osd/ClassHandler.cc
+ ${PROJECT_SOURCE_DIR}/src/osd/osd_op_util.cc
+ ${PROJECT_SOURCE_DIR}/src/osd/OSDCap.cc
+ ${PROJECT_SOURCE_DIR}/src/osd/PeeringState.cc
+ ${PROJECT_SOURCE_DIR}/src/osd/PGPeeringEvent.cc
+ ${PROJECT_SOURCE_DIR}/src/osd/PGStateUtils.cc
+ ${PROJECT_SOURCE_DIR}/src/osd/MissingLoc.cc
+ ${PROJECT_SOURCE_DIR}/src/osd/PGLog.cc
+ ${PROJECT_SOURCE_DIR}/src/osd/SnapMapper.cc
+ ${PROJECT_SOURCE_DIR}/src/osd/recovery_types.cc
+ ${PROJECT_SOURCE_DIR}/src/osd/osd_perf_counters.cc
+ watch.cc
+ )
+if(HAS_VTA)
+ set_source_files_properties(main.cc
+ PROPERTIES COMPILE_FLAGS -fno-var-tracking-assignments)
+endif()
+target_link_libraries(crimson-osd
+ crimson-admin
+ crimson-common
+ crimson-os
+ crimson
+ fmt::fmt
+ Boost::MPL
+ dmclock::dmclock)
+set_target_properties(crimson-osd PROPERTIES
+ POSITION_INDEPENDENT_CODE ${EXE_LINKER_USE_PIE})
+install(TARGETS crimson-osd DESTINATION bin)
+if(WITH_TESTS)
+ add_dependencies(tests crimson-osd)
+endif()
diff --git a/src/crimson/osd/acked_peers.h b/src/crimson/osd/acked_peers.h
new file mode 100644
index 000000000..b2f2562c0
--- /dev/null
+++ b/src/crimson/osd/acked_peers.h
@@ -0,0 +1,14 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <vector>
+
+namespace crimson::osd {
+ struct peer_shard_t {
+ pg_shard_t shard;
+ eversion_t last_complete_ondisk;
+ };
+ using acked_peers_t = std::vector<peer_shard_t>;
+}
diff --git a/src/crimson/osd/backfill_facades.h b/src/crimson/osd/backfill_facades.h
new file mode 100644
index 000000000..683dc6ea6
--- /dev/null
+++ b/src/crimson/osd/backfill_facades.h
@@ -0,0 +1,73 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "crimson/osd/backfill_state.h"
+#include "crimson/osd/pg.h"
+#include "osd/PeeringState.h"
+
+namespace crimson::osd {
+
+// PeeringFacade -- main implementation of the BackfillState::PeeringFacade
+// interface. We have the abstraction to decouple BackfillState from
+// PeeringState, and thus cut dependencies in unit testing. The second
+// implementation is BackfillFixture::PeeringFacade and sits in test_backfill.cc.
+struct PeeringFacade final : BackfillState::PeeringFacade {
+ PeeringState& peering_state;
+
+ hobject_t earliest_backfill() const override {
+ return peering_state.earliest_backfill();
+ }
+
+ const std::set<pg_shard_t>& get_backfill_targets() const override {
+ return peering_state.get_backfill_targets();
+ }
+
+ const hobject_t& get_peer_last_backfill(pg_shard_t peer) const override {
+ return peering_state.get_peer_info(peer).last_backfill;
+ }
+
+ const eversion_t& get_last_update() const override {
+ return peering_state.get_info().last_update;
+ }
+
+ const eversion_t& get_log_tail() const override {
+ return peering_state.get_info().log_tail;
+ }
+
+ void scan_log_after(eversion_t v, scan_log_func_t f) const override {
+ peering_state.get_pg_log().get_log().scan_log_after(v, std::move(f));
+ }
+
+ bool is_backfill_target(pg_shard_t peer) const override {
+ return peering_state.is_backfill_target(peer);
+ }
+ void update_complete_backfill_object_stats(const hobject_t &hoid,
+ const pg_stat_t &stats) override {
+ peering_state.update_complete_backfill_object_stats(hoid, stats);
+ }
+
+ bool is_backfilling() const override {
+ return peering_state.is_backfilling();
+ }
+
+ PeeringFacade(PeeringState& peering_state)
+ : peering_state(peering_state) {
+ }
+};
+
+// PGFacade -- a facade (in the GoF-defined meaning) simplifying the huge
+// interface of crimson's PG class. The motivation is to have an inventory
+// of behaviour that must be provided by a unit test's mock.
+struct PGFacade final : BackfillState::PGFacade {
+ PG& pg;
+
+ const eversion_t& get_projected_last_update() const override {
+ return pg.projected_last_update;
+ }
+
+ PGFacade(PG& pg) : pg(pg) {}
+};
+
+} // namespace crimson::osd
diff --git a/src/crimson/osd/backfill_state.cc b/src/crimson/osd/backfill_state.cc
new file mode 100644
index 000000000..46a270ffe
--- /dev/null
+++ b/src/crimson/osd/backfill_state.cc
@@ -0,0 +1,558 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <algorithm>
+#include <boost/type_index.hpp>
+#include <fmt/ranges.h>
+#include "common/hobject_fmt.h"
+#include "crimson/osd/backfill_state.h"
+#include "osd/osd_types_fmt.h"
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_osd);
+ }
+}
+
+namespace crimson::osd {
+
+BackfillState::BackfillState(
+ BackfillState::BackfillListener& backfill_listener,
+ std::unique_ptr<BackfillState::PeeringFacade> peering_state,
+ std::unique_ptr<BackfillState::PGFacade> pg)
+ : backfill_machine(*this,
+ backfill_listener,
+ std::move(peering_state),
+ std::move(pg)),
+ progress_tracker(
+ std::make_unique<BackfillState::ProgressTracker>(backfill_machine))
+{
+ logger().debug("{}:{}", __func__, __LINE__);
+ backfill_machine.initiate();
+}
+
+template <class S>
+BackfillState::StateHelper<S>::StateHelper()
+{
+ logger().debug("enter {}",
+ boost::typeindex::type_id<S>().pretty_name());
+}
+
+template <class S>
+BackfillState::StateHelper<S>::~StateHelper()
+{
+ logger().debug("exit {}",
+ boost::typeindex::type_id<S>().pretty_name());
+}
+
+BackfillState::~BackfillState() = default;
+
+BackfillState::BackfillMachine::BackfillMachine(
+ BackfillState& backfill_state,
+ BackfillState::BackfillListener& backfill_listener,
+ std::unique_ptr<BackfillState::PeeringFacade> peering_state,
+ std::unique_ptr<BackfillState::PGFacade> pg)
+ : backfill_state(backfill_state),
+ backfill_listener(backfill_listener),
+ peering_state(std::move(peering_state)),
+ pg(std::move(pg))
+{}
+
+BackfillState::BackfillMachine::~BackfillMachine() = default;
+
+BackfillState::Initial::Initial(my_context ctx)
+ : my_base(ctx)
+{
+ backfill_state().last_backfill_started = peering_state().earliest_backfill();
+ logger().debug("{}: bft={} from {}",
+ __func__, peering_state().get_backfill_targets(),
+ backfill_state().last_backfill_started);
+ for (const auto& bt : peering_state().get_backfill_targets()) {
+ logger().debug("{}: target shard {} from {}",
+ __func__, bt, peering_state().get_peer_last_backfill(bt));
+ }
+ ceph_assert(peering_state().get_backfill_targets().size());
+ ceph_assert(!backfill_state().last_backfill_started.is_max());
+}
+
+boost::statechart::result
+BackfillState::Initial::react(const BackfillState::Triggered& evt)
+{
+ logger().debug("{}: backfill triggered", __func__);
+ ceph_assert(backfill_state().last_backfill_started == \
+ peering_state().earliest_backfill());
+ ceph_assert(peering_state().is_backfilling());
+ // initialize BackfillIntervals
+ for (const auto& bt : peering_state().get_backfill_targets()) {
+ backfill_state().peer_backfill_info[bt].reset(
+ peering_state().get_peer_last_backfill(bt));
+ }
+ backfill_state().backfill_info.reset(backfill_state().last_backfill_started);
+ if (Enqueuing::all_enqueued(peering_state(),
+ backfill_state().backfill_info,
+ backfill_state().peer_backfill_info)) {
+ logger().debug("{}: switching to Done state", __func__);
+ return transit<BackfillState::Done>();
+ } else {
+ logger().debug("{}: switching to Enqueuing state", __func__);
+ return transit<BackfillState::Enqueuing>();
+ }
+}
+
+
+// -- Enqueuing
+void BackfillState::Enqueuing::maybe_update_range()
+{
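+  // bring the primary BackfillInterval up to the projected last_update by
+  // replaying pg log entries newer than its current version, instead of
+  // rescanning the objects.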
+ if (auto& primary_bi = backfill_state().backfill_info;
+ primary_bi.version >= pg().get_projected_last_update()) {
+ logger().info("{}: bi is current", __func__);
+ ceph_assert(primary_bi.version == pg().get_projected_last_update());
+ } else if (primary_bi.version >= peering_state().get_log_tail()) {
+#if 0
+ if (peering_state().get_pg_log().get_log().empty() &&
+ pg().get_projected_log().empty()) {
+ /* Because we don't move log_tail on split, the log might be
+ * empty even if log_tail != last_update. However, the only
+ * way to get here with an empty log is if log_tail is actually
+ * eversion_t(), because otherwise the entry which changed
+ * last_update since the last scan would have to be present.
+ */
+ ceph_assert(primary_bi.version == eversion_t());
+ return;
+ }
+#endif
+ logger().debug("{}: bi is old, ({}) can be updated with log to {}",
+ __func__,
+ primary_bi.version,
+ pg().get_projected_last_update());
+ logger().debug("{}: scanning pg log first", __func__);
+ peering_state().scan_log_after(primary_bi.version,
+ [&](const pg_log_entry_t& e) {
+ logger().debug("maybe_update_range(lambda): updating from version {}",
+ e.version);
+ if (e.soid >= primary_bi.begin && e.soid < primary_bi.end) {
+ if (e.is_update()) {
+ logger().debug("maybe_update_range(lambda): {} updated to ver {}",
+ e.soid, e.version);
+ primary_bi.objects.erase(e.soid);
+ primary_bi.objects.insert(std::make_pair(e.soid,
+ e.version));
+ } else if (e.is_delete()) {
+ logger().debug("maybe_update_range(lambda): {} removed",
+ e.soid);
+ primary_bi.objects.erase(e.soid);
+ }
+ }
+ });
+ primary_bi.version = pg().get_projected_last_update();
+ } else {
+ ceph_abort_msg(
+ "scan_range should have raised primary_bi.version past log_tail");
+ }
+}
+
+void BackfillState::Enqueuing::trim_backfill_infos()
+{
+ for (const auto& bt : peering_state().get_backfill_targets()) {
+ backfill_state().peer_backfill_info[bt].trim_to(
+ std::max(peering_state().get_peer_last_backfill(bt),
+ backfill_state().last_backfill_started));
+ }
+ backfill_state().backfill_info.trim_to(
+ backfill_state().last_backfill_started);
+}
+
+/* static */ bool BackfillState::Enqueuing::all_enqueued(
+ const PeeringFacade& peering_state,
+ const BackfillInterval& backfill_info,
+ const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info)
+{
+ const bool all_local_enqueued = \
+ backfill_info.extends_to_end() && backfill_info.empty();
+ const bool all_peer_enqueued = std::all_of(
+ std::begin(peer_backfill_info),
+ std::end(peer_backfill_info),
+ [] (const auto& kv) {
+ [[maybe_unused]] const auto& [ shard, peer_backfill_info ] = kv;
+ return peer_backfill_info.extends_to_end() && peer_backfill_info.empty();
+ });
+ return all_local_enqueued && all_peer_enqueued;
+}
+
+hobject_t BackfillState::Enqueuing::earliest_peer_backfill(
+ const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info) const
+{
+ hobject_t e = hobject_t::get_max();
+ for (const pg_shard_t& bt : peering_state().get_backfill_targets()) {
+ const auto iter = peer_backfill_info.find(bt);
+ ceph_assert(iter != peer_backfill_info.end());
+ e = std::min(e, iter->second.begin);
+ }
+ return e;
+}
+
+bool BackfillState::Enqueuing::should_rescan_replicas(
+ const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info,
+ const BackfillInterval& backfill_info) const
+{
+ const auto& targets = peering_state().get_backfill_targets();
+ return std::any_of(std::begin(targets), std::end(targets),
+ [&] (const auto& bt) {
+ return ReplicasScanning::replica_needs_scan(peer_backfill_info.at(bt),
+ backfill_info);
+ });
+}
+
+bool BackfillState::Enqueuing::should_rescan_primary(
+ const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info,
+ const BackfillInterval& backfill_info) const
+{
+ return backfill_info.begin <= earliest_peer_backfill(peer_backfill_info) &&
+ !backfill_info.extends_to_end();
+}
+
+void BackfillState::Enqueuing::trim_backfilled_object_from_intervals(
+ BackfillState::Enqueuing::result_t&& result,
+ hobject_t& last_backfill_started,
+ std::map<pg_shard_t, BackfillInterval>& peer_backfill_info)
+{
+ std::for_each(std::begin(result.pbi_targets), std::end(result.pbi_targets),
+ [&peer_backfill_info] (const auto& bt) {
+ peer_backfill_info.at(bt).pop_front();
+ });
+ last_backfill_started = std::move(result.new_last_backfill_started);
+}
+
+BackfillState::Enqueuing::result_t
+BackfillState::Enqueuing::remove_on_peers(const hobject_t& check)
+{
+ // set `new_last_backfill_started` to `check`
+ result_t result { {}, check };
+ for (const auto& bt : peering_state().get_backfill_targets()) {
+ const auto& pbi = backfill_state().peer_backfill_info.at(bt);
+ if (pbi.begin == check) {
+ result.pbi_targets.insert(bt);
+ const auto& version = pbi.objects.begin()->second;
+ backfill_state().progress_tracker->enqueue_drop(pbi.begin);
+ backfill_listener().enqueue_drop(bt, pbi.begin, version);
+ }
+ }
+ logger().debug("{}: BACKFILL removing {} from peers {}",
+ __func__, check, result.pbi_targets);
+ ceph_assert(!result.pbi_targets.empty());
+ return result;
+}
+
+BackfillState::Enqueuing::result_t
+BackfillState::Enqueuing::update_on_peers(const hobject_t& check)
+{
+ logger().debug("{}: check={}", __func__, check);
+ const auto& primary_bi = backfill_state().backfill_info;
+ result_t result { {}, primary_bi.begin };
+
+ for (const auto& bt : peering_state().get_backfill_targets()) {
+ const auto& peer_bi = backfill_state().peer_backfill_info.at(bt);
+
+ // Find all check peers that have the wrong version
+ if (const eversion_t& obj_v = primary_bi.objects.begin()->second;
+ check == primary_bi.begin && check == peer_bi.begin) {
+      if (peer_bi.objects.begin()->second != obj_v &&
+ backfill_state().progress_tracker->enqueue_push(primary_bi.begin)) {
+ backfill_listener().enqueue_push(primary_bi.begin, obj_v);
+ } else {
+ // it's fine, keep it! OR already recovering
+ }
+ result.pbi_targets.insert(bt);
+ } else {
+      // Only include peers whose backfill line we've caught up to;
+ // otherwise, they only appear to be missing this object
+ // because their peer_bi.begin > backfill_info.begin.
+ if (primary_bi.begin > peering_state().get_peer_last_backfill(bt) &&
+ backfill_state().progress_tracker->enqueue_push(primary_bi.begin)) {
+ backfill_listener().enqueue_push(primary_bi.begin, obj_v);
+ }
+ }
+ }
+ return result;
+}
+
+bool BackfillState::Enqueuing::all_emptied(
+ const BackfillInterval& local_backfill_info,
+ const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info) const
+{
+ const auto& targets = peering_state().get_backfill_targets();
+ const auto replicas_emptied =
+ std::all_of(std::begin(targets), std::end(targets),
+ [&] (const auto& bt) {
+ return peer_backfill_info.at(bt).empty();
+ });
+ return local_backfill_info.empty() && replicas_emptied;
+}
+
+BackfillState::Enqueuing::Enqueuing(my_context ctx)
+ : my_base(ctx)
+{
+ auto& primary_bi = backfill_state().backfill_info;
+
+ // update our local interval to cope with recent changes
+ primary_bi.begin = backfill_state().last_backfill_started;
+ if (primary_bi.version < peering_state().get_log_tail()) {
+ // it might be that the OSD is so flooded with modifying operations
+ // that backfill will be spinning here over and over. For the sake
+    // of performance and complexity we don't synchronize with the entire
+    // PG. Something similar can happen in the classical OSD.
+ logger().warn("{}: bi is old, rescanning of local backfill_info",
+ __func__);
+ post_event(RequestPrimaryScanning{});
+ return;
+ } else {
+ maybe_update_range();
+ }
+ trim_backfill_infos();
+
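+  // main enqueuing loop: walk the local and peer BackfillIntervals and
+  // dispatch drops/pushes through the BackfillListener until the budget
+  // runs out, a rescan is needed, or both sides are drained.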
+ while (!all_emptied(primary_bi, backfill_state().peer_backfill_info)) {
+ if (!backfill_listener().budget_available()) {
+ post_event(RequestWaiting{});
+ return;
+ } else if (should_rescan_replicas(backfill_state().peer_backfill_info,
+ primary_bi)) {
+ // Count simultaneous scans as a single op and let those complete
+ post_event(RequestReplicasScanning{});
+ return;
+ }
+ // Get object within set of peers to operate on and the set of targets
+ // for which that object applies.
+ if (const hobject_t check = \
+ earliest_peer_backfill(backfill_state().peer_backfill_info);
+ check < primary_bi.begin) {
+ // Don't increment ops here because deletions
+      // are cheap and, unlike real recovery_ops, not replied to,
+      // and we can't increment ops without requeueing ourselves
+ // for recovery.
+ auto result = remove_on_peers(check);
+ trim_backfilled_object_from_intervals(std::move(result),
+ backfill_state().last_backfill_started,
+ backfill_state().peer_backfill_info);
+ } else {
+ auto result = update_on_peers(check);
+ trim_backfilled_object_from_intervals(std::move(result),
+ backfill_state().last_backfill_started,
+ backfill_state().peer_backfill_info);
+ primary_bi.pop_front();
+ }
+ backfill_listener().maybe_flush();
+ }
+
+ if (should_rescan_primary(backfill_state().peer_backfill_info,
+ primary_bi)) {
+    // need to grab another chunk of the object namespace and restart
+ // the queueing.
+ logger().debug("{}: reached end for current local chunk",
+ __func__);
+ post_event(RequestPrimaryScanning{});
+ } else if (backfill_state().progress_tracker->tracked_objects_completed()) {
+ post_event(RequestDone{});
+ } else {
+ logger().debug("{}: reached end for both local and all peers "
+ "but still has in-flight operations", __func__);
+ post_event(RequestWaiting{});
+ }
+}
+
+// -- PrimaryScanning
+BackfillState::PrimaryScanning::PrimaryScanning(my_context ctx)
+ : my_base(ctx)
+{
+ backfill_state().backfill_info.version = peering_state().get_last_update();
+ backfill_listener().request_primary_scan(
+ backfill_state().backfill_info.begin);
+}
+
+boost::statechart::result
+BackfillState::PrimaryScanning::react(PrimaryScanned evt)
+{
+ logger().debug("{}", __func__);
+ backfill_state().backfill_info = std::move(evt.result);
+ return transit<Enqueuing>();
+}
+
+boost::statechart::result
+BackfillState::PrimaryScanning::react(ObjectPushed evt)
+{
+ logger().debug("PrimaryScanning::react() on ObjectPushed; evt.object={}",
+ evt.object);
+ backfill_state().progress_tracker->complete_to(evt.object, evt.stat);
+ return discard_event();
+}
+
+// -- ReplicasScanning
+bool BackfillState::ReplicasScanning::replica_needs_scan(
+ const BackfillInterval& replica_backfill_info,
+ const BackfillInterval& local_backfill_info)
+{
+ return replica_backfill_info.empty() && \
+ replica_backfill_info.begin <= local_backfill_info.begin && \
+ !replica_backfill_info.extends_to_end();
+}
+
+BackfillState::ReplicasScanning::ReplicasScanning(my_context ctx)
+ : my_base(ctx)
+{
+ for (const auto& bt : peering_state().get_backfill_targets()) {
+ if (const auto& pbi = backfill_state().peer_backfill_info.at(bt);
+ replica_needs_scan(pbi, backfill_state().backfill_info)) {
+ logger().debug("{}: scanning peer osd.{} from {}",
+ __func__, bt, pbi.end);
+ backfill_listener().request_replica_scan(bt, pbi.end, hobject_t{});
+
+ ceph_assert(waiting_on_backfill.find(bt) == \
+ waiting_on_backfill.end());
+ waiting_on_backfill.insert(bt);
+ }
+ }
+ ceph_assert(!waiting_on_backfill.empty());
+ // TODO: start_recovery_op(hobject_t::get_max()); // XXX: was pbi.end
+}
+
+#if 0
+BackfillState::ReplicasScanning::~ReplicasScanning()
+{
+ // TODO: finish_recovery_op(hobject_t::get_max());
+}
+#endif
+
+boost::statechart::result
+BackfillState::ReplicasScanning::react(ReplicaScanned evt)
+{
+ logger().debug("{}: got scan result from osd={}, result={}",
+ __func__, evt.from, evt.result);
+ // TODO: maybe we'll be able to move waiting_on_backfill from
+ // the machine to the state.
+ ceph_assert(peering_state().is_backfill_target(evt.from));
+ if (waiting_on_backfill.erase(evt.from)) {
+ backfill_state().peer_backfill_info[evt.from] = std::move(evt.result);
+ if (waiting_on_backfill.empty()) {
+ ceph_assert(backfill_state().peer_backfill_info.size() == \
+ peering_state().get_backfill_targets().size());
+ return transit<Enqueuing>();
+ }
+ } else {
+    // we canceled backfill for a while due to a too-full condition,
+    // and this is an extra response from a non-too-full peer
+ logger().debug("{}: canceled backfill (too full?)", __func__);
+ }
+ return discard_event();
+}
+
+boost::statechart::result
+BackfillState::ReplicasScanning::react(ObjectPushed evt)
+{
+ logger().debug("ReplicasScanning::react() on ObjectPushed; evt.object={}",
+ evt.object);
+ backfill_state().progress_tracker->complete_to(evt.object, evt.stat);
+ return discard_event();
+}
+
+
+// -- Waiting
+BackfillState::Waiting::Waiting(my_context ctx)
+ : my_base(ctx)
+{
+}
+
+boost::statechart::result
+BackfillState::Waiting::react(ObjectPushed evt)
+{
+ logger().debug("Waiting::react() on ObjectPushed; evt.object={}",
+ evt.object);
+ backfill_state().progress_tracker->complete_to(evt.object, evt.stat);
+ if (!Enqueuing::all_enqueued(peering_state(),
+ backfill_state().backfill_info,
+ backfill_state().peer_backfill_info)) {
+ return transit<Enqueuing>();
+ } else if (backfill_state().progress_tracker->tracked_objects_completed()) {
+ return transit<Done>();
+ } else {
+ // we still have something to wait on
+ logger().debug("Waiting::react() on ObjectPushed; still waiting");
+ return discard_event();
+ }
+}
+
+// -- Done
+BackfillState::Done::Done(my_context ctx)
+ : my_base(ctx)
+{
+ logger().info("{}: backfill is done", __func__);
+ backfill_listener().backfilled();
+}
+
+// -- Crashed
+BackfillState::Crashed::Crashed()
+{
+ ceph_abort_msg("{}: this should not happen");
+}
+
+// ProgressTracker is an intermediary between the BackfillListener and
+// BackfillMachine + its states. All requests to push or drop an object
+// are directed through it. The same happens with notifications about
+// completing given operations, which are generated by BackfillListener
+// and dispatched as e.g. ObjectPushed events.
+// This allows ProgressTracker to track the list of in-flight operations,
+// which is essential for deciding whether the entire machine
+// should switch from Waiting to Done or stay in Waiting.
+// ProgressTracker also coordinates .last_backfill_started and stats
+// updates.
+bool BackfillState::ProgressTracker::tracked_objects_completed() const
+{
+ return registry.empty();
+}
+
+bool BackfillState::ProgressTracker::enqueue_push(const hobject_t& obj)
+{
+ [[maybe_unused]] const auto [it, first_seen] = registry.try_emplace(
+ obj, registry_item_t{op_stage_t::enqueued_push, std::nullopt});
+ return first_seen;
+}
+
+void BackfillState::ProgressTracker::enqueue_drop(const hobject_t& obj)
+{
+ registry.try_emplace(
+ obj, registry_item_t{op_stage_t::enqueued_drop, pg_stat_t{}});
+}
+
+void BackfillState::ProgressTracker::complete_to(
+ const hobject_t& obj,
+ const pg_stat_t& stats)
+{
+ logger().debug("{}: obj={}",
+ __func__, obj);
+ if (auto completion_iter = registry.find(obj);
+ completion_iter != std::end(registry)) {
+ completion_iter->second = \
+ registry_item_t{ op_stage_t::completed_push, stats };
+ } else {
+ ceph_abort_msg("completing untracked object shall not happen");
+ }
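+  // pop completed entries from the front of the registry (in hobject_t
+  // order) and apply their stats, stopping at the first push that is
+  // still only enqueued.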
+ for (auto it = std::begin(registry);
+ it != std::end(registry) &&
+ it->second.stage != op_stage_t::enqueued_push;
+ it = registry.erase(it)) {
+ auto& [soid, item] = *it;
+ assert(item.stats);
+ peering_state().update_complete_backfill_object_stats(
+ soid,
+ *item.stats);
+ }
+ if (Enqueuing::all_enqueued(peering_state(),
+ backfill_state().backfill_info,
+ backfill_state().peer_backfill_info) &&
+ tracked_objects_completed()) {
+ backfill_state().last_backfill_started = hobject_t::get_max();
+ backfill_listener().update_peers_last_backfill(hobject_t::get_max());
+ } else {
+ backfill_listener().update_peers_last_backfill(obj);
+ }
+}
+
+} // namespace crimson::osd
diff --git a/src/crimson/osd/backfill_state.h b/src/crimson/osd/backfill_state.h
new file mode 100644
index 000000000..4bd2991fb
--- /dev/null
+++ b/src/crimson/osd/backfill_state.h
@@ -0,0 +1,382 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <optional>
+
+#include <boost/statechart/custom_reaction.hpp>
+#include <boost/statechart/event.hpp>
+#include <boost/statechart/event_base.hpp>
+#include <boost/statechart/simple_state.hpp>
+#include <boost/statechart/state.hpp>
+#include <boost/statechart/state_machine.hpp>
+#include <boost/statechart/transition.hpp>
+
+#include "osd/recovery_types.h"
+
+namespace crimson::osd {
+
+namespace sc = boost::statechart;
+
+struct BackfillState {
+ struct BackfillListener;
+ struct PeeringFacade;
+ struct PGFacade;
+
+  // events come first
+ struct PrimaryScanned : sc::event<PrimaryScanned> {
+ BackfillInterval result;
+ PrimaryScanned(BackfillInterval&& result)
+ : result(std::move(result)) {
+ }
+ };
+
+ struct ReplicaScanned : sc::event<ReplicaScanned> {
+ pg_shard_t from;
+ BackfillInterval result;
+ ReplicaScanned(pg_shard_t from, BackfillInterval&& result)
+ : from(std::move(from)),
+ result(std::move(result)) {
+ }
+ };
+
+ struct ObjectPushed : sc::event<ObjectPushed> {
+ // TODO: implement replica management; I don't want to follow
+ // current convention where the backend layer is responsible
+ // for tracking replicas.
+ hobject_t object;
+ pg_stat_t stat;
+ ObjectPushed(hobject_t object)
+ : object(std::move(object)) {
+ }
+ };
+
+ struct Triggered : sc::event<Triggered> {
+ };
+
+private:
+ // internal events
+ struct RequestPrimaryScanning : sc::event<RequestPrimaryScanning> {
+ };
+
+ struct RequestReplicasScanning : sc::event<RequestReplicasScanning> {
+ };
+
+ struct RequestWaiting : sc::event<RequestWaiting> {
+ };
+
+ struct RequestDone : sc::event<RequestDone> {
+ };
+
+ class ProgressTracker;
+
+public:
+
+ struct Initial;
+ struct Enqueuing;
+ struct PrimaryScanning;
+ struct ReplicasScanning;
+ struct Waiting;
+ struct Done;
+
+ struct BackfillMachine : sc::state_machine<BackfillMachine, Initial> {
+ BackfillMachine(BackfillState& backfill_state,
+ BackfillListener& backfill_listener,
+ std::unique_ptr<PeeringFacade> peering_state,
+ std::unique_ptr<PGFacade> pg);
+ ~BackfillMachine();
+ BackfillState& backfill_state;
+ BackfillListener& backfill_listener;
+ std::unique_ptr<PeeringFacade> peering_state;
+ std::unique_ptr<PGFacade> pg;
+ };
+
+private:
+ template <class S>
+ struct StateHelper {
+ StateHelper();
+ ~StateHelper();
+
+ BackfillState& backfill_state() {
+ return static_cast<S*>(this) \
+ ->template context<BackfillMachine>().backfill_state;
+ }
+ BackfillListener& backfill_listener() {
+ return static_cast<S*>(this) \
+ ->template context<BackfillMachine>().backfill_listener;
+ }
+ PeeringFacade& peering_state() {
+ return *static_cast<S*>(this) \
+ ->template context<BackfillMachine>().peering_state;
+ }
+ PGFacade& pg() {
+ return *static_cast<S*>(this)->template context<BackfillMachine>().pg;
+ }
+
+ const PeeringFacade& peering_state() const {
+ return *static_cast<const S*>(this) \
+ ->template context<BackfillMachine>().peering_state;
+ }
+ const BackfillState& backfill_state() const {
+ return static_cast<const S*>(this) \
+ ->template context<BackfillMachine>().backfill_state;
+ }
+ };
+
+public:
+
+ // states
+ struct Crashed : sc::simple_state<Crashed, BackfillMachine>,
+ StateHelper<Crashed> {
+ explicit Crashed();
+ };
+
+ struct Initial : sc::state<Initial, BackfillMachine>,
+ StateHelper<Initial> {
+ using reactions = boost::mpl::list<
+ sc::custom_reaction<Triggered>,
+ sc::transition<sc::event_base, Crashed>>;
+ explicit Initial(my_context);
+ // initialize after triggering backfill by on_activate_complete().
+ // transit to Enqueuing.
+ sc::result react(const Triggered&);
+ };
+
+ struct Enqueuing : sc::state<Enqueuing, BackfillMachine>,
+ StateHelper<Enqueuing> {
+ using reactions = boost::mpl::list<
+ sc::transition<RequestPrimaryScanning, PrimaryScanning>,
+ sc::transition<RequestReplicasScanning, ReplicasScanning>,
+ sc::transition<RequestWaiting, Waiting>,
+ sc::transition<RequestDone, Done>,
+ sc::transition<sc::event_base, Crashed>>;
+ explicit Enqueuing(my_context);
+
+ // indicate whether there is any remaining work to do when it comes
+ // to comparing the hobject_t namespace between primary and replicas.
+ // true doesn't necessarily mean backfill is done -- there could be
+ // in-flight pushes or drops which had been enqueued but aren't
+ // completed yet.
+ static bool all_enqueued(
+ const PeeringFacade& peering_state,
+ const BackfillInterval& backfill_info,
+ const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info);
+
+ private:
+ void maybe_update_range();
+ void trim_backfill_infos();
+
+ // these methods take BackfillIntervals instead of extracting them from
+ // the state to emphasize the relationships across the main loop.
+ bool all_emptied(
+ const BackfillInterval& local_backfill_info,
+ const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info) const;
+ hobject_t earliest_peer_backfill(
+ const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info) const;
+ bool should_rescan_replicas(
+ const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info,
+ const BackfillInterval& backfill_info) const;
+    // indicate whether a particular acting primary needs to be scanned again
+    // to process the next piece of the hobject_t namespace.
+    // the logic is analogous to replica_needs_scan(). See comments there.
+ bool should_rescan_primary(
+ const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info,
+ const BackfillInterval& backfill_info) const;
+
+    // the result_t is an intermediary between {remove,update}_on_peers() and
+ // updating BackfillIntervals in trim_backfilled_object_from_intervals.
+ // This step is important because it affects the main loop's condition,
+ // and thus deserves to be exposed instead of being called deeply from
+ // {remove,update}_on_peers().
+ struct [[nodiscard]] result_t {
+ std::set<pg_shard_t> pbi_targets;
+ hobject_t new_last_backfill_started;
+ };
+ void trim_backfilled_object_from_intervals(
+ result_t&&,
+ hobject_t& last_backfill_started,
+ std::map<pg_shard_t, BackfillInterval>& peer_backfill_info);
+ result_t remove_on_peers(const hobject_t& check);
+ result_t update_on_peers(const hobject_t& check);
+ };
+
+ struct PrimaryScanning : sc::state<PrimaryScanning, BackfillMachine>,
+ StateHelper<PrimaryScanning> {
+ using reactions = boost::mpl::list<
+ sc::custom_reaction<ObjectPushed>,
+ sc::custom_reaction<PrimaryScanned>,
+ sc::transition<sc::event_base, Crashed>>;
+ explicit PrimaryScanning(my_context);
+ sc::result react(ObjectPushed);
+ // collect scanning result and transit to Enqueuing.
+ sc::result react(PrimaryScanned);
+ };
+
+ struct ReplicasScanning : sc::state<ReplicasScanning, BackfillMachine>,
+ StateHelper<ReplicasScanning> {
+ using reactions = boost::mpl::list<
+ sc::custom_reaction<ObjectPushed>,
+ sc::custom_reaction<ReplicaScanned>,
+ sc::transition<sc::event_base, Crashed>>;
+ explicit ReplicasScanning(my_context);
+ // collect scanning result; if all results are collected, transition
+ // to Enqueuing will happen.
+ sc::result react(ObjectPushed);
+ sc::result react(ReplicaScanned);
+
+ // indicate whether a particular peer should be scanned to retrieve
+    // the BackfillInterval for a new range of the hobject_t namespace.
+    // true when bi.objects is exhausted, the replica bi's end is not MAX,
+    // and the primary bi's begin is further than the replica's one.
+ static bool replica_needs_scan(
+ const BackfillInterval& replica_backfill_info,
+ const BackfillInterval& local_backfill_info);
+
+ private:
+ std::set<pg_shard_t> waiting_on_backfill;
+ };
+
+ struct Waiting : sc::state<Waiting, BackfillMachine>,
+ StateHelper<Waiting> {
+ using reactions = boost::mpl::list<
+ sc::custom_reaction<ObjectPushed>,
+ sc::transition<sc::event_base, Crashed>>;
+ explicit Waiting(my_context);
+ sc::result react(ObjectPushed);
+ };
+
+ struct Done : sc::state<Done, BackfillMachine>,
+ StateHelper<Done> {
+ using reactions = boost::mpl::list<
+ sc::transition<sc::event_base, Crashed>>;
+ explicit Done(my_context);
+ };
+
+ BackfillState(BackfillListener& backfill_listener,
+ std::unique_ptr<PeeringFacade> peering_state,
+ std::unique_ptr<PGFacade> pg);
+ ~BackfillState();
+
+ void process_event(
+ boost::intrusive_ptr<const sc::event_base> evt) {
+ backfill_machine.process_event(*std::move(evt));
+ }
+
+ hobject_t get_last_backfill_started() const {
+ return last_backfill_started;
+ }
+private:
+ hobject_t last_backfill_started;
+ BackfillInterval backfill_info;
+ std::map<pg_shard_t, BackfillInterval> peer_backfill_info;
+ BackfillMachine backfill_machine;
+ std::unique_ptr<ProgressTracker> progress_tracker;
+};
+
+// BackfillListener -- an interface used by the backfill FSM to request
+// low-level services like issuing `MOSDPGPush` or `MOSDPGBackfillRemove`.
+// The goals behind the interface are: 1) unittestability; 2) possibility
+// to retrofit classical OSD with BackfillState. For the second reason we
+// never use `seastar::future` -- instead responses to the requests are
+// conveyed as events; see ObjectPushed as an example.
+struct BackfillState::BackfillListener {
+ virtual void request_replica_scan(
+ const pg_shard_t& target,
+ const hobject_t& begin,
+ const hobject_t& end) = 0;
+
+ virtual void request_primary_scan(
+ const hobject_t& begin) = 0;
+
+ virtual void enqueue_push(
+ const hobject_t& obj,
+ const eversion_t& v) = 0;
+
+ virtual void enqueue_drop(
+ const pg_shard_t& target,
+ const hobject_t& obj,
+ const eversion_t& v) = 0;
+
+ virtual void maybe_flush() = 0;
+
+ virtual void update_peers_last_backfill(
+ const hobject_t& new_last_backfill) = 0;
+
+ virtual bool budget_available() const = 0;
+
+ virtual void backfilled() = 0;
+
+ virtual ~BackfillListener() = default;
+};
+
+// PeeringFacade -- a facade (in the GoF-defined meaning) simplifying
+// the interface of PeeringState. The motivation is to have an inventory
+// of behaviour that must be provided by a unit test's mock.
+struct BackfillState::PeeringFacade {
+ virtual hobject_t earliest_backfill() const = 0;
+ virtual const std::set<pg_shard_t>& get_backfill_targets() const = 0;
+ virtual const hobject_t& get_peer_last_backfill(pg_shard_t peer) const = 0;
+ virtual const eversion_t& get_last_update() const = 0;
+ virtual const eversion_t& get_log_tail() const = 0;
+
+ // the performance impact of `std::function` has not been considered yet.
+ // If there is any proof (from e.g. profiling) about its significance, we
+ // can switch back to the template variant.
+ using scan_log_func_t = std::function<void(const pg_log_entry_t&)>;
+ virtual void scan_log_after(eversion_t, scan_log_func_t) const = 0;
+
+ virtual bool is_backfill_target(pg_shard_t peer) const = 0;
+ virtual void update_complete_backfill_object_stats(const hobject_t &hoid,
+ const pg_stat_t &stats) = 0;
+ virtual bool is_backfilling() const = 0;
+ virtual ~PeeringFacade() {}
+};
+
+// PGFacade -- a facade (in the GoF-defined meaning) simplifying the huge
+// interface of crimson's PG class. The motivation is to have an inventory
+// of behaviour that must be provided by a unit test's mock.
+struct BackfillState::PGFacade {
+ virtual const eversion_t& get_projected_last_update() const = 0;
+ virtual ~PGFacade() {}
+};
+
+class BackfillState::ProgressTracker {
+ // TODO: apply_stat,
+ enum class op_stage_t {
+ enqueued_push,
+ enqueued_drop,
+ completed_push,
+ };
+
+ struct registry_item_t {
+ op_stage_t stage;
+ std::optional<pg_stat_t> stats;
+ };
+
+ BackfillMachine& backfill_machine;
+ std::map<hobject_t, registry_item_t> registry;
+
+ BackfillState& backfill_state() {
+ return backfill_machine.backfill_state;
+ }
+ PeeringFacade& peering_state() {
+ return *backfill_machine.peering_state;
+ }
+ BackfillListener& backfill_listener() {
+ return backfill_machine.backfill_listener;
+ }
+
+public:
+ ProgressTracker(BackfillMachine& backfill_machine)
+ : backfill_machine(backfill_machine) {
+ }
+
+ bool tracked_objects_completed() const;
+
+ bool enqueue_push(const hobject_t&);
+ void enqueue_drop(const hobject_t&);
+ void complete_to(const hobject_t&, const pg_stat_t&);
+};
+
+} // namespace crimson::osd
diff --git a/src/crimson/osd/ec_backend.cc b/src/crimson/osd/ec_backend.cc
new file mode 100644
index 000000000..d555d6cdc
--- /dev/null
+++ b/src/crimson/osd/ec_backend.cc
@@ -0,0 +1,37 @@
+#include "ec_backend.h"
+
+#include "crimson/osd/shard_services.h"
+
+ECBackend::ECBackend(shard_id_t shard,
+ ECBackend::CollectionRef coll,
+ crimson::osd::ShardServices& shard_services,
+ const ec_profile_t&,
+ uint64_t,
+ DoutPrefixProvider &dpp)
+ : PGBackend{shard, coll, shard_services, dpp}
+{
+ // todo
+}
+
+ECBackend::ll_read_ierrorator::future<ceph::bufferlist>
+ECBackend::_read(const hobject_t& hoid,
+ const uint64_t off,
+ const uint64_t len,
+ const uint32_t flags)
+{
+ // todo
+ return seastar::make_ready_future<bufferlist>();
+}
+
+ECBackend::rep_op_fut_t
+ECBackend::_submit_transaction(std::set<pg_shard_t>&& pg_shards,
+ const hobject_t& hoid,
+ ceph::os::Transaction&& txn,
+ osd_op_params_t&& osd_op_p,
+ epoch_t min_epoch, epoch_t max_epoch,
+ std::vector<pg_log_entry_t>&& log_entries)
+{
+ // todo
+ return {seastar::now(),
+ seastar::make_ready_future<crimson::osd::acked_peers_t>()};
+}
diff --git a/src/crimson/osd/ec_backend.h b/src/crimson/osd/ec_backend.h
new file mode 100644
index 000000000..3dbcc4def
--- /dev/null
+++ b/src/crimson/osd/ec_backend.h
@@ -0,0 +1,41 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <boost/intrusive_ptr.hpp>
+#include <seastar/core/future.hh>
+#include "include/buffer_fwd.h"
+#include "osd/osd_types.h"
+#include "pg_backend.h"
+
+class ECBackend : public PGBackend
+{
+public:
+ ECBackend(shard_id_t shard,
+ CollectionRef coll,
+ crimson::osd::ShardServices& shard_services,
+ const ec_profile_t& ec_profile,
+ uint64_t stripe_width,
+ DoutPrefixProvider &dpp);
+ seastar::future<> stop() final {
+ return seastar::now();
+ }
+ void on_actingset_changed(bool same_primary) final {}
+private:
+ ll_read_ierrorator::future<ceph::bufferlist>
+ _read(const hobject_t& hoid, uint64_t off, uint64_t len, uint32_t flags) override;
+ rep_op_fut_t
+ _submit_transaction(std::set<pg_shard_t>&& pg_shards,
+ const hobject_t& hoid,
+ ceph::os::Transaction&& txn,
+ osd_op_params_t&& req,
+ epoch_t min_epoch, epoch_t max_epoch,
+ std::vector<pg_log_entry_t>&& log_entries) final;
+ CollectionRef coll;
+ crimson::os::FuturizedStore::Shard* store;
+ seastar::future<> request_committed(const osd_reqid_t& reqid,
+ const eversion_t& version) final {
+ return seastar::now();
+ }
+};
diff --git a/src/crimson/osd/exceptions.h b/src/crimson/osd/exceptions.h
new file mode 100644
index 000000000..2783ed252
--- /dev/null
+++ b/src/crimson/osd/exceptions.h
@@ -0,0 +1,46 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <exception>
+#include <system_error>
+
+#include "crimson/common/errorator.h"
+
+namespace crimson::osd {
+class error : private std::system_error {
+public:
+ error(const std::errc ec)
+ : system_error(std::make_error_code(ec)) {
+ }
+
+ using system_error::code;
+ using system_error::what;
+
+ friend error make_error(int ret);
+
+private:
+ error(const int ret) noexcept
+ : system_error(ret, std::system_category()) {
+ }
+};
+
+inline error make_error(const int ret) {
+ return error{ret};
+}
+
+struct object_not_found : public error {
+ object_not_found() : error(std::errc::no_such_file_or_directory) {}
+};
+
+struct invalid_argument : public error {
+ invalid_argument() : error(std::errc::invalid_argument) {}
+};
+
+// FIXME: error handling
+struct permission_denied : public error {
+ permission_denied() : error(std::errc::operation_not_permitted) {}
+};
+
+} // namespace crimson::osd
diff --git a/src/crimson/osd/heartbeat.cc b/src/crimson/osd/heartbeat.cc
new file mode 100644
index 000000000..266e56533
--- /dev/null
+++ b/src/crimson/osd/heartbeat.cc
@@ -0,0 +1,819 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "heartbeat.h"
+
+#include <boost/range/join.hpp>
+#include <fmt/chrono.h>
+#include <fmt/os.h>
+
+#include "messages/MOSDPing.h"
+#include "messages/MOSDFailure.h"
+
+#include "crimson/common/config_proxy.h"
+#include "crimson/common/formatter.h"
+#include "crimson/net/Connection.h"
+#include "crimson/net/Messenger.h"
+#include "crimson/osd/shard_services.h"
+#include "crimson/mon/MonClient.h"
+
+#include "osd/OSDMap.h"
+
+using std::set;
+using std::string;
+using crimson::common::local_conf;
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_osd);
+ }
+}
+
+Heartbeat::Heartbeat(osd_id_t whoami,
+ crimson::osd::ShardServices& service,
+ crimson::mon::Client& monc,
+ crimson::net::Messenger &front_msgr,
+ crimson::net::Messenger &back_msgr)
+ : whoami{whoami},
+ service{service},
+ monc{monc},
+ front_msgr{front_msgr},
+ back_msgr{back_msgr},
+ // do this in background
+ timer{[this] {
+ heartbeat_check();
+ (void)send_heartbeats();
+ }},
+ failing_peers{*this}
+{}
+
+seastar::future<> Heartbeat::start(entity_addrvec_t front_addrs,
+ entity_addrvec_t back_addrs)
+{
+ logger().info("heartbeat: start front_addrs={}, back_addrs={}",
+ front_addrs, back_addrs);
+ // i only care about the address, so any unused port would work
+ for (auto& addr : boost::join(front_addrs.v, back_addrs.v)) {
+ addr.set_port(0);
+ }
+
+ using crimson::net::SocketPolicy;
+ front_msgr.set_policy(entity_name_t::TYPE_OSD,
+ SocketPolicy::lossy_client(0));
+ back_msgr.set_policy(entity_name_t::TYPE_OSD,
+ SocketPolicy::lossy_client(0));
+ return seastar::when_all_succeed(start_messenger(front_msgr,
+ front_addrs),
+ start_messenger(back_msgr,
+ back_addrs))
+ .then_unpack([this] {
+ timer.arm_periodic(
+ std::chrono::seconds(local_conf()->osd_heartbeat_interval));
+ });
+}
+
+seastar::future<>
+Heartbeat::start_messenger(crimson::net::Messenger& msgr,
+ const entity_addrvec_t& addrs)
+{
+ return msgr.bind(addrs).safe_then([this, &msgr]() mutable {
+ return msgr.start({this});
+ }, crimson::net::Messenger::bind_ertr::all_same_way(
+ [addrs] (const std::error_code& e) {
+ logger().error("heartbeat messenger bind({}): {}", addrs, e);
+ ceph_abort();
+ }));
+}
+
+seastar::future<> Heartbeat::stop()
+{
+ logger().info("{}", __func__);
+ timer.cancel();
+ front_msgr.stop();
+ back_msgr.stop();
+ return gate.close().then([this] {
+ return seastar::when_all_succeed(front_msgr.shutdown(),
+ back_msgr.shutdown());
+ }).then_unpack([] {
+ return seastar::now();
+ });
+}
+
+const entity_addrvec_t& Heartbeat::get_front_addrs() const
+{
+ return front_msgr.get_myaddrs();
+}
+
+const entity_addrvec_t& Heartbeat::get_back_addrs() const
+{
+ return back_msgr.get_myaddrs();
+}
+
+crimson::net::Messenger& Heartbeat::get_front_msgr() const
+{
+ return front_msgr;
+}
+
+crimson::net::Messenger& Heartbeat::get_back_msgr() const
+{
+ return back_msgr;
+}
+
+void Heartbeat::add_peer(osd_id_t _peer, epoch_t epoch)
+{
+ assert(whoami != _peer);
+ auto [iter, added] = peers.try_emplace(_peer, *this, _peer);
+ auto& peer = iter->second;
+ peer.set_epoch_added(epoch);
+}
+
+Heartbeat::osds_t Heartbeat::remove_down_peers()
+{
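+  // erase peers that are no longer up in the current osdmap; return the
+  // still-up peers that were added in an older epoch so update_peers()
+  // can trim them if there are too many.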
+ osds_t old_osds; // osds not added in this epoch
+ for (auto i = peers.begin(); i != peers.end(); ) {
+ auto osdmap = service.get_map();
+ const auto& [osd, peer] = *i;
+ if (!osdmap->is_up(osd)) {
+ i = peers.erase(i);
+ } else {
+ if (peer.get_epoch_added() < osdmap->get_epoch()) {
+ old_osds.push_back(osd);
+ }
+ ++i;
+ }
+ }
+ return old_osds;
+}
+
+void Heartbeat::add_reporter_peers(int whoami)
+{
+ auto osdmap = service.get_map();
+ // include next and previous up osds to ensure we have a fully-connected set
+ set<int> want;
+ if (auto next = osdmap->get_next_up_osd_after(whoami); next >= 0) {
+ want.insert(next);
+ }
+ if (auto prev = osdmap->get_previous_up_osd_before(whoami); prev >= 0) {
+ want.insert(prev);
+ }
+  // make sure we have at least **min_down** osds coming from different
+  // subtrees at the configured level (e.g., hosts) for fast failure detection.
+ auto min_down = local_conf().get_val<uint64_t>("mon_osd_min_down_reporters");
+ auto subtree = local_conf().get_val<string>("mon_osd_reporter_subtree_level");
+ osdmap->get_random_up_osds_by_subtree(
+ whoami, subtree, min_down, want, &want);
+ auto epoch = osdmap->get_epoch();
+ for (int osd : want) {
+ add_peer(osd, epoch);
+ };
+}
+
+void Heartbeat::update_peers(int whoami)
+{
+ const auto min_peers = static_cast<size_t>(
+ local_conf().get_val<int64_t>("osd_heartbeat_min_peers"));
+ add_reporter_peers(whoami);
+ auto extra = remove_down_peers();
+ // too many?
+ for (auto& osd : extra) {
+ if (peers.size() <= min_peers) {
+ break;
+ }
+ remove_peer(osd);
+ }
+ // or too few?
+ auto osdmap = service.get_map();
+ auto epoch = osdmap->get_epoch();
+ for (auto next = osdmap->get_next_up_osd_after(whoami);
+ peers.size() < min_peers && next >= 0 && next != whoami;
+ next = osdmap->get_next_up_osd_after(next)) {
+ add_peer(next, epoch);
+ }
+}
+
+Heartbeat::osds_t Heartbeat::get_peers() const
+{
+ osds_t osds;
+ osds.reserve(peers.size());
+ for (auto& peer : peers) {
+ osds.push_back(peer.first);
+ }
+ return osds;
+}
+
+void Heartbeat::remove_peer(osd_id_t peer)
+{
+ assert(peers.count(peer) == 1);
+ peers.erase(peer);
+}
+
+std::optional<seastar::future<>>
+Heartbeat::ms_dispatch(crimson::net::ConnectionRef conn, MessageRef m)
+{
+ bool dispatched = true;
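+  // handle the message in the background under `gate` so stop() can wait
+  // for in-flight handlers; `dispatched` reports whether the message type
+  // was consumed here.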
+ gate.dispatch_in_background(__func__, *this, [this, conn, &m, &dispatched] {
+ switch (m->get_type()) {
+ case MSG_OSD_PING:
+ return handle_osd_ping(conn, boost::static_pointer_cast<MOSDPing>(m));
+ default:
+ dispatched = false;
+ return seastar::now();
+ }
+ });
+ return (dispatched ? std::make_optional(seastar::now()) : std::nullopt);
+}
+
+void Heartbeat::ms_handle_reset(crimson::net::ConnectionRef conn, bool is_replace)
+{
+ auto peer = conn->get_peer_id();
+ if (conn->get_peer_type() != entity_name_t::TYPE_OSD ||
+ peer == entity_name_t::NEW) {
+ return;
+ }
+ if (auto found = peers.find(peer);
+ found != peers.end()) {
+ found->second.handle_reset(conn, is_replace);
+ }
+}
+
+void Heartbeat::ms_handle_connect(
+ crimson::net::ConnectionRef conn,
+ seastar::shard_id prv_shard)
+{
+ ceph_assert_always(seastar::this_shard_id() == prv_shard);
+ auto peer = conn->get_peer_id();
+ if (conn->get_peer_type() != entity_name_t::TYPE_OSD ||
+ peer == entity_name_t::NEW) {
+ return;
+ }
+ if (auto found = peers.find(peer);
+ found != peers.end()) {
+ found->second.handle_connect(conn);
+ }
+}
+
+void Heartbeat::ms_handle_accept(
+ crimson::net::ConnectionRef conn,
+ seastar::shard_id prv_shard,
+ bool is_replace)
+{
+ ceph_assert_always(seastar::this_shard_id() == prv_shard);
+ auto peer = conn->get_peer_id();
+ if (conn->get_peer_type() != entity_name_t::TYPE_OSD ||
+ peer == entity_name_t::NEW) {
+ return;
+ }
+ if (auto found = peers.find(peer);
+ found != peers.end()) {
+ found->second.handle_accept(conn, is_replace);
+ }
+}
+
+seastar::future<> Heartbeat::handle_osd_ping(crimson::net::ConnectionRef conn,
+ Ref<MOSDPing> m)
+{
+ switch (m->op) {
+ case MOSDPing::PING:
+ return handle_ping(conn, m);
+ case MOSDPing::PING_REPLY:
+ return handle_reply(conn, m);
+ case MOSDPing::YOU_DIED:
+ return handle_you_died();
+ default:
+ return seastar::now();
+ }
+}
+
+seastar::future<> Heartbeat::handle_ping(crimson::net::ConnectionRef conn,
+ Ref<MOSDPing> m)
+{
+ auto min_message = static_cast<uint32_t>(
+ local_conf()->osd_heartbeat_min_size);
+ auto reply =
+ crimson::make_message<MOSDPing>(
+ m->fsid,
+ service.get_map()->get_epoch(),
+ MOSDPing::PING_REPLY,
+ m->ping_stamp,
+ m->mono_ping_stamp,
+ service.get_mnow(),
+ service.get_up_epoch(),
+ min_message);
+ return conn->send(std::move(reply)
+ ).then([this, m, conn] {
+ return maybe_share_osdmap(conn, m);
+ });
+}
+
+seastar::future<> Heartbeat::maybe_share_osdmap(
+ crimson::net::ConnectionRef conn,
+ Ref<MOSDPing> m)
+{
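+  // track the highest osdmap epoch the peer is known (or projected) to
+  // have, and send incremental maps if our local epoch is ahead of it.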
+ const osd_id_t from = m->get_source().num();
+ const epoch_t current_osdmap_epoch = service.get_map()->get_epoch();
+ auto found = peers.find(from);
+ if (found == peers.end()) {
+ return seastar::now();
+ }
+ auto& peer = found->second;
+
+ if (m->map_epoch > peer.get_projected_epoch()) {
+    logger().debug("{} updating peer {} session's projected_epoch "
+ "from {} to ping map epoch of {}",
+ __func__, from, peer.get_projected_epoch(),
+ m->map_epoch);
+ peer.set_projected_epoch(m->map_epoch);
+ }
+
+ if (current_osdmap_epoch <= peer.get_projected_epoch()) {
+ logger().debug("{} peer {} projected_epoch {} is already later "
+ "than our osdmap epoch of {}",
+ __func__ , from, peer.get_projected_epoch(),
+ current_osdmap_epoch);
+ return seastar::now();
+ }
+
+ const epoch_t send_from = peer.get_projected_epoch();
+ logger().debug("{} sending peer {} peer maps from projected epoch {} through "
+ "local osdmap epoch {}",
+ __func__,
+ from,
+ send_from,
+ current_osdmap_epoch);
+ peer.set_projected_epoch(current_osdmap_epoch);
+ return service.send_incremental_map_to_osd(from, send_from);
+}
+
+seastar::future<> Heartbeat::handle_reply(crimson::net::ConnectionRef conn,
+ Ref<MOSDPing> m)
+{
+ const osd_id_t from = m->get_source().num();
+ auto found = peers.find(from);
+ if (found == peers.end()) {
+ // stale reply
+ return seastar::now();
+ }
+ auto& peer = found->second;
+ return peer.handle_reply(conn, m
+ ).then([this, conn, m] {
+ return maybe_share_osdmap(conn, m);
+ });
+}
+
+seastar::future<> Heartbeat::handle_you_died()
+{
+ // TODO: ask for newer osdmap
+ return seastar::now();
+}
+
+void Heartbeat::heartbeat_check()
+{
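+  // collect peers whose last reply is overdue and queue them for failure
+  // reports via send_failures().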
+ failure_queue_t failure_queue;
+ const auto now = clock::now();
+ for (const auto& [osd, peer] : peers) {
+ auto failed_since = peer.failed_since(now);
+ if (!clock::is_zero(failed_since)) {
+ failure_queue.emplace(osd, failed_since);
+ }
+ }
+ if (!failure_queue.empty()) {
+    // send_failures can run in the background, because
+    // 1. after send_failures executes, the messages may not actually have
+    //    been sent yet, which would seem to risk that, when the osd shuts
+    //    down, the remaining part of the sending operation references OSD
+    //    and Heartbeat instances that are already deleted. However, that
+    //    remaining work holds no reference back to the OSD or Heartbeat
+    //    instances, so the above risk does not apply.
+    // 2. messages are sent in order; if later checks find the previously
+    //    "failed" peers to be healthy, the "still alive" messages would be
+    //    sent after the earlier "osd failure" messages, which is totally
+    //    safe.
+ (void)send_failures(std::move(failure_queue));
+ }
+}
+
+seastar::future<> Heartbeat::send_heartbeats()
+{
+ const auto mnow = service.get_mnow();
+ const auto now = clock::now();
+
+ std::vector<seastar::future<>> futures;
+ for (auto& [osd, peer] : peers) {
+ peer.send_heartbeat(now, mnow, futures);
+ }
+ return seastar::when_all_succeed(futures.begin(), futures.end());
+}
+
+seastar::future<> Heartbeat::send_failures(failure_queue_t&& failure_queue)
+{
+ std::vector<seastar::future<>> futures;
+ const auto now = clock::now();
+ for (auto [osd, failed_since] : failure_queue) {
+ failing_peers.add_pending(osd, failed_since, now, futures);
+ }
+
+ return seastar::when_all_succeed(futures.begin(), futures.end());
+}
+
+void Heartbeat::print(std::ostream& out) const
+{
+ out << "heartbeat";
+}
+
+Heartbeat::Connection::~Connection()
+{
+ if (conn) {
+ conn->mark_down();
+ }
+}
+
+bool Heartbeat::Connection::matches(crimson::net::ConnectionRef _conn) const
+{
+ return (conn && conn == _conn);
+}
+
+bool Heartbeat::Connection::accepted(
+ crimson::net::ConnectionRef accepted_conn,
+ bool is_replace)
+{
+ ceph_assert(accepted_conn);
+ ceph_assert(accepted_conn != conn);
+ if (accepted_conn->get_peer_addr() != listener.get_peer_addr(type)) {
+ return false;
+ }
+
+ if (is_replace) {
+ logger().info("Heartbeat::Connection::accepted(): "
+ "{} racing", *this);
+ racing_detected = true;
+ }
+ if (conn) {
+ // there is no assumption about the ordering of the reset and accept
+ // events for the 2 racing connections.
+ if (is_connected) {
+ logger().warn("Heartbeat::Connection::accepted(): "
+ "{} is accepted while connected, is_replace={}",
+ *this, is_replace);
+ conn->mark_down();
+ set_unconnected();
+ }
+ }
+ conn = accepted_conn;
+ set_connected();
+ return true;
+}
+
+void Heartbeat::Connection::reset(bool is_replace)
+{
+ if (is_replace) {
+ logger().info("Heartbeat::Connection::reset(): "
+ "{} racing, waiting for the replacing accept",
+ *this);
+ racing_detected = true;
+ }
+
+ if (is_connected) {
+ set_unconnected();
+ } else {
+ conn = nullptr;
+ }
+
+ if (is_replace) {
+ // waiting for the replacing accept event
+ } else if (!racing_detected || is_winner_side) {
+ connect();
+ } else { // racing_detected && !is_winner_side
+ logger().info("Heartbeat::Connection::reset(): "
+ "{} racing detected and lose, "
+ "waiting for peer connect me", *this);
+ }
+}
+
+seastar::future<> Heartbeat::Connection::send(MessageURef msg)
+{
+ assert(is_connected);
+ return conn->send(std::move(msg));
+}
+
+void Heartbeat::Connection::validate()
+{
+ assert(is_connected);
+ auto peer_addr = listener.get_peer_addr(type);
+ if (conn->get_peer_addr() != peer_addr) {
+ logger().info("Heartbeat::Connection::validate(): "
+ "{} has new address {} over {}, reset",
+ *this, peer_addr, conn->get_peer_addr());
+ conn->mark_down();
+ racing_detected = false;
+ reset();
+ }
+}
+
+void Heartbeat::Connection::retry()
+{
+ racing_detected = false;
+ if (!is_connected) {
+ if (conn) {
+ conn->mark_down();
+ reset();
+ } else {
+ connect();
+ }
+ }
+}
+
+void Heartbeat::Connection::set_connected()
+{
+ assert(conn);
+ assert(!is_connected);
+ ceph_assert(conn->is_connected());
+ is_connected = true;
+ listener.increase_connected();
+}
+
+void Heartbeat::Connection::set_unconnected()
+{
+ assert(conn);
+ assert(is_connected);
+ conn = nullptr;
+ is_connected = false;
+ listener.decrease_connected();
+}
+
+void Heartbeat::Connection::connect()
+{
+ assert(!conn);
+ auto addr = listener.get_peer_addr(type);
+ conn = msgr.connect(addr, entity_name_t(CEPH_ENTITY_TYPE_OSD, peer));
+ if (conn->is_connected()) {
+ set_connected();
+ }
+}
+
+Heartbeat::clock::time_point
+Heartbeat::Session::failed_since(Heartbeat::clock::time_point now) const
+{
+ if (do_health_screen(now) == health_state::UNHEALTHY) {
+ auto oldest_deadline = ping_history.begin()->second.deadline;
+ auto failed_since = std::min(last_rx_back, last_rx_front);
+ if (clock::is_zero(failed_since)) {
+ logger().error("Heartbeat::Session::failed_since(): no reply from osd.{} "
+ "ever on either front or back, first ping sent {} "
+ "(oldest deadline {})",
+ peer, first_tx, oldest_deadline);
+ failed_since = first_tx;
+ } else {
+ logger().error("Heartbeat::Session::failed_since(): no reply from osd.{} "
+ "since back {} front {} (oldest deadline {})",
+ peer, last_rx_back, last_rx_front, oldest_deadline);
+ }
+ return failed_since;
+ } else {
+ return clock::zero();
+ }
+}
+
+void Heartbeat::Session::set_inactive_history(clock::time_point now)
+{
+ assert(!connected);
+ if (ping_history.empty()) {
+ const utime_t sent_stamp{now};
+ const auto deadline =
+ now + std::chrono::seconds(local_conf()->osd_heartbeat_grace);
+ ping_history.emplace(sent_stamp, reply_t{deadline, 0});
+ } else { // the entry is already added
+ assert(ping_history.size() == 1);
+ }
+}
+
+Heartbeat::Peer::Peer(Heartbeat& heartbeat, osd_id_t peer)
+ : ConnectionListener(2), heartbeat{heartbeat}, peer{peer}, session{peer},
+ con_front(peer, heartbeat.whoami > peer, Connection::type_t::front,
+ heartbeat.front_msgr, *this),
+ con_back(peer, heartbeat.whoami > peer, Connection::type_t::back,
+ heartbeat.back_msgr, *this)
+{
+ logger().info("Heartbeat::Peer: osd.{} added", peer);
+}
+
+Heartbeat::Peer::~Peer()
+{
+ logger().info("Heartbeat::Peer: osd.{} removed", peer);
+}
+
+void Heartbeat::Peer::send_heartbeat(
+ clock::time_point now, ceph::signedspan mnow,
+ std::vector<seastar::future<>>& futures)
+{
+ session.set_tx(now);
+ if (session.is_started()) {
+ do_send_heartbeat(now, mnow, &futures);
+ for_each_conn([] (auto& conn) {
+ conn.validate();
+ });
+ } else {
+ // we should send MOSDPing but still cannot at this moment
+ if (pending_send) {
+ // we have already been pending for an entire heartbeat interval
+ logger().warn("Heartbeat::Peer::send_heartbeat(): "
+ "heartbeat to osd.{} is still pending...", peer);
+ for_each_conn([] (auto& conn) {
+ conn.retry();
+ });
+ } else {
+ logger().info("Heartbeat::Peer::send_heartbeat(): "
+ "heartbeat to osd.{} is pending send...", peer);
+ session.set_inactive_history(now);
+ pending_send = true;
+ }
+ }
+}
+
+void Heartbeat::Peer::handle_reset(
+ crimson::net::ConnectionRef conn, bool is_replace)
+{
+ int cnt = 0;
+ for_each_conn([&] (auto& _conn) {
+ if (_conn.matches(conn)) {
+ ++cnt;
+ _conn.reset(is_replace);
+ }
+ });
+
+ if (cnt == 0) {
+ logger().info("Heartbeat::Peer::handle_reset(): {} ignores conn, is_replace={} -- {}",
+ *this, is_replace, *conn);
+ } else if (cnt > 1) {
+ logger().error("Heartbeat::Peer::handle_reset(): {} handles conn {} times -- {}",
+ *this, cnt, *conn);
+ }
+}
+
+void Heartbeat::Peer::handle_connect(crimson::net::ConnectionRef conn)
+{
+ int cnt = 0;
+ for_each_conn([&] (auto& _conn) {
+ if (_conn.matches(conn)) {
+ ++cnt;
+ _conn.connected();
+ }
+ });
+
+ if (cnt == 0) {
+ logger().error("Heartbeat::Peer::handle_connect(): {} ignores conn -- {}",
+ *this, *conn);
+ conn->mark_down();
+ } else if (cnt > 1) {
+ logger().error("Heartbeat::Peer::handle_connect(): {} handles conn {} times -- {}",
+ *this, cnt, *conn);
+ }
+}
+
+void Heartbeat::Peer::handle_accept(crimson::net::ConnectionRef conn, bool is_replace)
+{
+ int cnt = 0;
+ for_each_conn([&] (auto& _conn) {
+ if (_conn.accepted(conn, is_replace)) {
+ ++cnt;
+ }
+ });
+
+ if (cnt == 0) {
+ logger().warn("Heartbeat::Peer::handle_accept(): {} ignores conn -- {}",
+ *this, *conn);
+ } else if (cnt > 1) {
+ logger().error("Heartbeat::Peer::handle_accept(): {} handles conn {} times -- {}",
+ *this, cnt, *conn);
+ }
+}
+
+seastar::future<> Heartbeat::Peer::handle_reply(
+ crimson::net::ConnectionRef conn, Ref<MOSDPing> m)
+{
+ if (!session.is_started()) {
+ // we haven't sent any ping yet
+ return seastar::now();
+ }
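+ // figure out whether this reply arrived over the front or the back
+ // connection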
+ type_t type;
+ if (con_front.matches(conn)) {
+ type = type_t::front;
+ } else if (con_back.matches(conn)) {
+ type = type_t::back;
+ } else {
+ return seastar::now();
+ }
+ const auto now = clock::now();
+ if (session.on_pong(m->ping_stamp, type, now)) {
+ if (session.do_health_screen(now) == Session::health_state::HEALTHY) {
+ return heartbeat.failing_peers.cancel_one(peer);
+ }
+ }
+ return seastar::now();
+}
+
+entity_addr_t Heartbeat::Peer::get_peer_addr(type_t type)
+{
+ const auto osdmap = heartbeat.service.get_map();
+ if (type == type_t::front) {
+ return osdmap->get_hb_front_addrs(peer).front();
+ } else {
+ return osdmap->get_hb_back_addrs(peer).front();
+ }
+}
+
+void Heartbeat::Peer::on_connected()
+{
+ logger().info("Heartbeat::Peer: osd.{} connected (send={})",
+ peer, pending_send);
+ session.on_connected();
+ if (pending_send) {
+ pending_send = false;
+ do_send_heartbeat(clock::now(), heartbeat.service.get_mnow(), nullptr);
+ }
+}
+
+void Heartbeat::Peer::on_disconnected()
+{
+ logger().info("Heartbeat::Peer: osd.{} disconnected", peer);
+ session.on_disconnected();
+}
+
+void Heartbeat::Peer::do_send_heartbeat(
+ Heartbeat::clock::time_point now,
+ ceph::signedspan mnow,
+ std::vector<seastar::future<>>* futures)
+{
+ const utime_t sent_stamp{now};
+ const auto deadline =
+ now + std::chrono::seconds(local_conf()->osd_heartbeat_grace);
+ session.on_ping(sent_stamp, deadline);
+ for_each_conn([&, this] (auto& conn) {
+ auto min_message = static_cast<uint32_t>(
+ local_conf()->osd_heartbeat_min_size);
+ auto ping = crimson::make_message<MOSDPing>(
+ heartbeat.monc.get_fsid(),
+ heartbeat.service.get_map()->get_epoch(),
+ MOSDPing::PING,
+ sent_stamp,
+ mnow,
+ mnow,
+ heartbeat.service.get_up_epoch(),
+ min_message);
+ if (futures) {
+ futures->push_back(conn.send(std::move(ping)));
+ }
+ });
+}
+
+bool Heartbeat::FailingPeers::add_pending(
+ osd_id_t peer,
+ clock::time_point failed_since,
+ clock::time_point now,
+ std::vector<seastar::future<>>& futures)
+{
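+ // report each failing peer to the monitor only once; further reports are
+ // suppressed until cancel_one() clears the pending entry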
+ if (failure_pending.count(peer)) {
+ return false;
+ }
+ auto failed_for = std::chrono::duration_cast<std::chrono::seconds>(
+ now - failed_since).count();
+ auto osdmap = heartbeat.service.get_map();
+ auto failure_report =
+ crimson::make_message<MOSDFailure>(heartbeat.monc.get_fsid(),
+ peer,
+ osdmap->get_addrs(peer),
+ static_cast<int>(failed_for),
+ osdmap->get_epoch());
+ failure_pending.emplace(peer, failure_info_t{failed_since,
+ osdmap->get_addrs(peer)});
+ futures.push_back(heartbeat.monc.send_message(std::move(failure_report)));
+ logger().info("{}: osd.{} failed for {}", __func__, peer, failed_for);
+ return true;
+}
+
+seastar::future<> Heartbeat::FailingPeers::cancel_one(osd_id_t peer)
+{
+ if (auto pending = failure_pending.find(peer);
+ pending != failure_pending.end()) {
+ auto fut = send_still_alive(peer, pending->second.addrs);
+ failure_pending.erase(peer);
+ return fut;
+ }
+ return seastar::now();
+}
+
+seastar::future<>
+Heartbeat::FailingPeers::send_still_alive(
+ osd_id_t osd, const entity_addrvec_t& addrs)
+{
+ auto still_alive = crimson::make_message<MOSDFailure>(
+ heartbeat.monc.get_fsid(),
+ osd,
+ addrs,
+ 0,
+ heartbeat.service.get_map()->get_epoch(),
+ MOSDFailure::FLAG_ALIVE);
+ logger().info("{}: osd.{}", __func__, osd);
+ return heartbeat.monc.send_message(std::move(still_alive));
+}
diff --git a/src/crimson/osd/heartbeat.h b/src/crimson/osd/heartbeat.h
new file mode 100644
index 000000000..f5da45118
--- /dev/null
+++ b/src/crimson/osd/heartbeat.h
@@ -0,0 +1,461 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <cstdint>
+#include <seastar/core/future.hh>
+#include "common/ceph_time.h"
+#include "crimson/common/gated.h"
+#include "crimson/net/Dispatcher.h"
+#include "crimson/net/Fwd.h"
+
+class MOSDPing;
+
+namespace crimson::osd {
+ class ShardServices;
+}
+
+namespace crimson::mon {
+ class Client;
+}
+
+template<typename Message> using Ref = boost::intrusive_ptr<Message>;
+
+class Heartbeat : public crimson::net::Dispatcher {
+public:
+ using osd_id_t = int;
+
+ Heartbeat(osd_id_t whoami,
+ crimson::osd::ShardServices& service,
+ crimson::mon::Client& monc,
+ crimson::net::Messenger &front_msgr,
+ crimson::net::Messenger &back_msgr);
+
+ seastar::future<> start(entity_addrvec_t front,
+ entity_addrvec_t back);
+ seastar::future<> stop();
+
+ using osds_t = std::vector<osd_id_t>;
+ void add_peer(osd_id_t peer, epoch_t epoch);
+ void update_peers(int whoami);
+ void remove_peer(osd_id_t peer);
+ osds_t get_peers() const;
+
+ const entity_addrvec_t& get_front_addrs() const;
+ const entity_addrvec_t& get_back_addrs() const;
+
+ crimson::net::Messenger &get_front_msgr() const;
+ crimson::net::Messenger &get_back_msgr() const;
+
+ // Dispatcher methods
+ std::optional<seastar::future<>> ms_dispatch(
+ crimson::net::ConnectionRef conn, MessageRef m) override;
+ void ms_handle_reset(crimson::net::ConnectionRef conn, bool is_replace) override;
+ void ms_handle_connect(crimson::net::ConnectionRef conn, seastar::shard_id) override;
+ void ms_handle_accept(crimson::net::ConnectionRef conn, seastar::shard_id, bool is_replace) override;
+
+ void print(std::ostream&) const;
+private:
+ seastar::future<> handle_osd_ping(crimson::net::ConnectionRef conn,
+ Ref<MOSDPing> m);
+ seastar::future<> handle_ping(crimson::net::ConnectionRef conn,
+ Ref<MOSDPing> m);
+ seastar::future<> handle_reply(crimson::net::ConnectionRef conn,
+ Ref<MOSDPing> m);
+ seastar::future<> handle_you_died();
+
+ /// remove down OSDs
+ /// @return peers not added in this epoch
+ osds_t remove_down_peers();
+ /// add enough reporters for fast failure detection
+ void add_reporter_peers(int whoami);
+
+ seastar::future<> start_messenger(crimson::net::Messenger& msgr,
+ const entity_addrvec_t& addrs);
+ seastar::future<> maybe_share_osdmap(crimson::net::ConnectionRef,
+ Ref<MOSDPing> m);
+private:
+ const osd_id_t whoami;
+ crimson::osd::ShardServices& service;
+ crimson::mon::Client& monc;
+ crimson::net::Messenger &front_msgr;
+ crimson::net::Messenger &back_msgr;
+
+ seastar::timer<seastar::lowres_clock> timer;
+ // use real_clock so it can be converted to utime_t
+ using clock = ceph::coarse_real_clock;
+
+ class ConnectionListener;
+ class Connection;
+ class Session;
+ class Peer;
+ using peers_map_t = std::map<osd_id_t, Peer>;
+ peers_map_t peers;
+
+ // osds which are considered failed
+ // osd_id => when was the last time that both front and back pings were acked
+ // or sent.
+ // used for calculating how long the OSD has been unresponsive
+ using failure_queue_t = std::map<osd_id_t, clock::time_point>;
+ seastar::future<> send_failures(failure_queue_t&& failure_queue);
+ seastar::future<> send_heartbeats();
+ void heartbeat_check();
+
+ // osds we've reported to the monitor as failed, but which are not marked
+ // down yet
+ crimson::common::Gated gate;
+
+ class FailingPeers {
+ public:
+ FailingPeers(Heartbeat& heartbeat) : heartbeat(heartbeat) {}
+ bool add_pending(osd_id_t peer,
+ clock::time_point failed_since,
+ clock::time_point now,
+ std::vector<seastar::future<>>& futures);
+ seastar::future<> cancel_one(osd_id_t peer);
+
+ private:
+ seastar::future<> send_still_alive(osd_id_t, const entity_addrvec_t&);
+
+ Heartbeat& heartbeat;
+
+ struct failure_info_t {
+ clock::time_point failed_since;
+ entity_addrvec_t addrs;
+ };
+ std::map<osd_id_t, failure_info_t> failure_pending;
+ } failing_peers;
+};
+
+inline std::ostream& operator<<(std::ostream& out, const Heartbeat& hb) {
+ hb.print(out);
+ return out;
+}
+
+/*
+ * Event-driven interface for Heartbeat::Peer to be notified when both hb_front
+ * and hb_back are connected, or when either connection is lost.
+ */
+class Heartbeat::ConnectionListener {
+ public:
+ ConnectionListener(size_t connections) : connections{connections} {}
+
+ void increase_connected() {
+ assert(connected < connections);
+ ++connected;
+ if (connected == connections) {
+ on_connected();
+ }
+ }
+ void decrease_connected() {
+ assert(connected > 0);
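+ // notify on_disconnected() only when dropping out of the fully-connected
+ // state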
+ if (connected == connections) {
+ on_disconnected();
+ }
+ --connected;
+ }
+ enum class type_t { front, back };
+ virtual entity_addr_t get_peer_addr(type_t) = 0;
+
+ protected:
+ virtual void on_connected() = 0;
+ virtual void on_disconnected() = 0;
+
+ private:
+ const size_t connections;
+ size_t connected = 0;
+};
+
+class Heartbeat::Connection {
+ public:
+ using type_t = ConnectionListener::type_t;
+ Connection(osd_id_t peer, bool is_winner_side, type_t type,
+ crimson::net::Messenger& msgr,
+ ConnectionListener& listener)
+ : peer{peer}, type{type},
+ msgr{msgr}, listener{listener},
+ is_winner_side{is_winner_side} {
+ connect();
+ }
+ Connection(const Connection&) = delete;
+ Connection(Connection&&) = delete;
+ Connection& operator=(const Connection&) = delete;
+ Connection& operator=(Connection&&) = delete;
+
+ ~Connection();
+
+ bool matches(crimson::net::ConnectionRef _conn) const;
+ void connected() {
+ set_connected();
+ }
+ bool accepted(crimson::net::ConnectionRef, bool is_replace);
+ void reset(bool is_replace=false);
+ seastar::future<> send(MessageURef msg);
+ void validate();
+ // retry connection if still pending
+ void retry();
+
+ private:
+ void set_connected();
+ void set_unconnected();
+ void connect();
+
+ const osd_id_t peer;
+ const type_t type;
+ crimson::net::Messenger& msgr;
+ ConnectionListener& listener;
+
+/*
+ * Resolve the following racing when both me and peer are trying to connect
+ * each other symmetrically, under SocketPolicy::lossy_client:
+ *
+ * OSD.A OSD.B
+ * - -
+ * |-[1]----> <----[2]-|
+ * \ /
+ * \ /
+ * delay.. X delay..
+ * / \
+ * |-[1]x> / \ <x[2]-|
+ * |<-[2]--- ---[1]->|
+ * |(reset#1) (reset#2)|
+ * |(reconnectB) (reconnectA)|
+ * |-[2]---> <---[1]-|
+ * delay.. delay..
+ * (remote close propagated)
+ * |-[2]x> <x[1]-|
+ * |(reset#2) (reset#1)|
+ * | ... ... |
+ * (dead loop!)
+ *
+ * Our solution is to remember whether such racing happened recently, and to
+ * establish the connection asymmetrically, only from the winner side whose
+ * osd-id is larger.
+ */
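+ // e.g. between osd.3 and osd.7, is_winner_side is true only on osd.7
+ // (whoami > peer), so after a detected race only osd.7 reconnects while
+ // osd.3 waits to be accepted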
+ const bool is_winner_side;
+ bool racing_detected = false;
+
+ crimson::net::ConnectionRef conn;
+ bool is_connected = false;
+
+ friend std::ostream& operator<<(std::ostream& os, const Connection& c) {
+ if (c.type == type_t::front) {
+ return os << "con_front(osd." << c.peer << ")";
+ } else {
+ return os << "con_back(osd." << c.peer << ")";
+ }
+ }
+};
+
+/*
+ * Track the ping history and ping replies (the pongs) within the same session;
+ * clean up the history once hb_front or hb_back loses its connection, and
+ * restart the session once both connections are established again.
+ *
+ * We cannot simply remove the entire Heartbeat::Peer once hb_front or hb_back
+ * loses connection, because we would end up with the following deadloop:
+ *
+ * OSD.A OSD.B
+ * - -
+ * hb_front reset <--(network)--- hb_front close
+ * | ^
+ * | |
+ * remove Peer B (dead loop!) remove Peer A
+ * | |
+ * V |
+ * hb_back close ----(network)---> hb_back reset
+ */
+class Heartbeat::Session {
+ public:
+ Session(osd_id_t peer) : peer{peer} {}
+
+ void set_epoch_added(epoch_t epoch_) { epoch = epoch_; }
+ epoch_t get_epoch_added() const { return epoch; }
+
+ void set_projected_epoch(epoch_t epoch_) { projected_epoch = epoch_; }
+ epoch_t get_projected_epoch() const { return projected_epoch; }
+
+ bool is_started() const { return connected; }
+ bool pinged() const {
+ if (clock::is_zero(first_tx)) {
+ // we can never receive a pong without having sent a ping first.
+ assert(clock::is_zero(last_rx_front) &&
+ clock::is_zero(last_rx_back));
+ return false;
+ } else {
+ return true;
+ }
+ }
+
+ enum class health_state {
+ UNKNOWN,
+ UNHEALTHY,
+ HEALTHY,
+ };
+ health_state do_health_screen(clock::time_point now) const {
+ if (!pinged()) {
+ // we are neither healthy nor unhealthy because we haven't sent anything yet
+ return health_state::UNKNOWN;
+ } else if (!ping_history.empty() && ping_history.begin()->second.deadline < now) {
+ return health_state::UNHEALTHY;
+ } else if (!clock::is_zero(last_rx_front) &&
+ !clock::is_zero(last_rx_back)) {
+ // only declare ourselves healthy once we have received the first
+ // replies from both the front and back connections
+ return health_state::HEALTHY;
+ } else {
+ return health_state::UNKNOWN;
+ }
+ }
+
+ clock::time_point failed_since(clock::time_point now) const;
+
+ void set_tx(clock::time_point now) {
+ if (!pinged()) {
+ first_tx = now;
+ }
+ last_tx = now;
+ }
+
+ void on_connected() {
+ assert(!connected);
+ connected = true;
+ ping_history.clear();
+ }
+
+ void on_ping(const utime_t& sent_stamp,
+ const clock::time_point& deadline) {
+ assert(connected);
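+ // each ping expects two acks: one over the front connection and one over
+ // the back connection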
+ [[maybe_unused]] auto [reply, added] =
+ ping_history.emplace(sent_stamp, reply_t{deadline, 2});
+ }
+
+ bool on_pong(const utime_t& ping_stamp,
+ Connection::type_t type,
+ clock::time_point now) {
+ assert(connected);
+ auto ping = ping_history.find(ping_stamp);
+ if (ping == ping_history.end()) {
+ // stale reply; its ping entry was already pruned from the history.
+ return false;
+ }
+ auto& unacked = ping->second.unacknowledged;
+ assert(unacked);
+ if (type == Connection::type_t::front) {
+ last_rx_front = now;
+ unacked--;
+ } else {
+ last_rx_back = now;
+ unacked--;
+ }
+ if (unacked == 0) {
+ ping_history.erase(ping_history.begin(), ++ping);
+ }
+ return true;
+ }
+
+ void on_disconnected() {
+ assert(connected);
+ connected = false;
+ if (!ping_history.empty()) {
+ // we lost our ping_history of the last session, but still need to keep
+ // the oldest deadline for unhealthy check.
+ auto oldest = ping_history.begin();
+ auto sent_stamp = oldest->first;
+ auto deadline = oldest->second.deadline;
+ ping_history.clear();
+ ping_history.emplace(sent_stamp, reply_t{deadline, 0});
+ }
+ }
+
+ // maintain an entry in ping_history for unhealthy check
+ void set_inactive_history(clock::time_point);
+
+ private:
+ const osd_id_t peer;
+ bool connected = false;
+ // time we sent our first ping request
+ clock::time_point first_tx;
+ // last time we sent a ping request
+ clock::time_point last_tx;
+ // last time we got a ping reply on the front side
+ clock::time_point last_rx_front;
+ // last time we got a ping reply on the back side
+ clock::time_point last_rx_back;
+ // the most recent epoch in which we wanted this peer
+ epoch_t epoch; // rename me to epoch_added
+ // epoch we expect the peer to be at once our sent incrementals are processed
+ epoch_t projected_epoch = 0;
+
+ struct reply_t {
+ clock::time_point deadline;
+ // one ack expected over the front conn, another over the back conn
+ uint8_t unacknowledged = 0;
+ };
+ // history of in-flight pings, arranged by the timestamp at which we sent them
+ std::map<utime_t, reply_t> ping_history;
+};
+
+class Heartbeat::Peer final : private Heartbeat::ConnectionListener {
+ public:
+ Peer(Heartbeat&, osd_id_t);
+ ~Peer();
+ Peer(Peer&&) = delete;
+ Peer(const Peer&) = delete;
+ Peer& operator=(Peer&&) = delete;
+ Peer& operator=(const Peer&) = delete;
+
+ // set/get the epoch at which the peer was added
+ void set_epoch_added(epoch_t epoch) { session.set_epoch_added(epoch); }
+ epoch_t get_epoch_added() const { return session.get_epoch_added(); }
+
+ void set_projected_epoch(epoch_t epoch) { session.set_projected_epoch(epoch); }
+ epoch_t get_projected_epoch() const { return session.get_projected_epoch(); }
+
+ // on failure, return the time_point since which the peer has been inactive;
+ // otherwise, return clock::zero()
+ clock::time_point failed_since(clock::time_point now) const {
+ return session.failed_since(now);
+ }
+ void send_heartbeat(
+ clock::time_point, ceph::signedspan, std::vector<seastar::future<>>&);
+ seastar::future<> handle_reply(crimson::net::ConnectionRef, Ref<MOSDPing>);
+
+ void handle_reset(crimson::net::ConnectionRef conn, bool is_replace);
+
+ void handle_connect(crimson::net::ConnectionRef conn);
+
+ void handle_accept(crimson::net::ConnectionRef conn, bool is_replace);
+
+ private:
+ entity_addr_t get_peer_addr(type_t type) override;
+ void on_connected() override;
+ void on_disconnected() override;
+ void do_send_heartbeat(
+ clock::time_point, ceph::signedspan, std::vector<seastar::future<>>*);
+
+ template <typename Func>
+ void for_each_conn(Func&& f) {
+ f(con_front);
+ f(con_back);
+ }
+
+ Heartbeat& heartbeat;
+ const osd_id_t peer;
+ Session session;
+ // whether we still need to send a heartbeat once the session is connected
+ bool pending_send = false;
+ Connection con_front;
+ Connection con_back;
+
+ friend std::ostream& operator<<(std::ostream& os, const Peer& p) {
+ return os << "peer(osd." << p.peer << ")";
+ }
+};
+
+#if FMT_VERSION >= 90000
+template <> struct fmt::formatter<Heartbeat> : fmt::ostream_formatter {};
+template <> struct fmt::formatter<Heartbeat::Connection> : fmt::ostream_formatter {};
+template <> struct fmt::formatter<Heartbeat::Peer> : fmt::ostream_formatter {};
+#endif
diff --git a/src/crimson/osd/lsan_suppressions.cc b/src/crimson/osd/lsan_suppressions.cc
new file mode 100644
index 000000000..53b7eb630
--- /dev/null
+++ b/src/crimson/osd/lsan_suppressions.cc
@@ -0,0 +1,20 @@
+#ifndef _NDEBUG
+// The callbacks we define here will be called from the sanitizer runtime, but
+// aren't referenced from the executable itself. We must ensure that those
+// callbacks are not sanitizer-instrumented, and that they aren't stripped by
+// the linker.
+#define SANITIZER_HOOK_ATTRIBUTE \
+ extern "C" \
+ __attribute__((no_sanitize("address", "thread", "undefined"))) \
+ __attribute__((visibility("default"))) \
+ __attribute__((used))
+
+static char kLSanDefaultSuppressions[] =
+ "leak:InitModule\n"
+ "leak:MallocExtension::Initialize\n"
+ "leak:MallocExtension::Register\n";
+
+SANITIZER_HOOK_ATTRIBUTE const char *__lsan_default_suppressions() {
+ return kLSanDefaultSuppressions;
+}
+#endif // ! _NDEBUG
diff --git a/src/crimson/osd/main.cc b/src/crimson/osd/main.cc
new file mode 100644
index 000000000..1e817415d
--- /dev/null
+++ b/src/crimson/osd/main.cc
@@ -0,0 +1,259 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <iostream>
+#include <fstream>
+#include <random>
+
+#include <seastar/core/app-template.hh>
+#include <seastar/core/print.hh>
+#include <seastar/core/prometheus.hh>
+#include <seastar/core/thread.hh>
+#include <seastar/http/httpd.hh>
+#include <seastar/net/inet_address.hh>
+#include <seastar/util/closeable.hh>
+#include <seastar/util/defer.hh>
+#include <seastar/util/std-compat.hh>
+
+#include "auth/KeyRing.h"
+#include "common/ceph_argparse.h"
+#include "common/config_tracker.h"
+#include "crimson/common/buffer_io.h"
+#include "crimson/common/config_proxy.h"
+#include "crimson/common/fatal_signal.h"
+#include "crimson/mon/MonClient.h"
+#include "crimson/net/Messenger.h"
+#include "crimson/osd/stop_signal.h"
+#include "crimson/osd/main_config_bootstrap_helpers.h"
+#include "global/pidfile.h"
+#include "osd.h"
+
+using namespace std::literals;
+namespace bpo = boost::program_options;
+using crimson::common::local_conf;
+using crimson::common::sharded_conf;
+using crimson::common::sharded_perf_coll;
+
+static seastar::logger& logger()
+{
+ return crimson::get_logger(ceph_subsys_osd);
+}
+
+seastar::future<> make_keyring()
+{
+ const auto path = local_conf().get_val<std::string>("keyring");
+ return seastar::file_exists(path).then([path](bool exists) {
+ KeyRing keyring;
+ EntityName name{local_conf()->name};
+ EntityAuth auth;
+ if (exists &&
+ keyring.load(nullptr, path) == 0 &&
+ keyring.get_auth(name, auth)) {
+ fmt::print(std::cerr, "already have key in keyring: {}\n", path);
+ return seastar::now();
+ } else {
+ CephContext temp_cct{};
+ auth.key.create(&temp_cct, CEPH_CRYPTO_AES);
+ keyring.add(name, auth);
+ bufferlist bl;
+ keyring.encode_plaintext(bl);
+ const auto permissions = (seastar::file_permissions::user_read |
+ seastar::file_permissions::user_write);
+ return crimson::write_file(std::move(bl), path, permissions);
+ }
+ }).handle_exception_type([path](const std::filesystem::filesystem_error& e) {
+ fmt::print(std::cerr, "FATAL: writing new keyring to {}: {}\n", path, e.what());
+ throw e;
+ });
+}
+
+static std::ofstream maybe_set_logger()
+{
+ std::ofstream log_file_stream;
+ if (auto log_file = local_conf()->log_file; !log_file.empty()) {
+ log_file_stream.open(log_file, std::ios::app | std::ios::out);
+ try {
+ seastar::throw_system_error_on(log_file_stream.fail());
+ } catch (const std::system_error& e) {
+ ceph_abort_msg(fmt::format("unable to open log file: {}", e.what()));
+ }
+ logger().set_ostream(log_file_stream);
+ }
+ return log_file_stream;
+}
+
+int main(int argc, const char* argv[])
+{
+ auto early_config_result = crimson::osd::get_early_config(argc, argv);
+ if (!early_config_result.has_value()) {
+ int r = early_config_result.error();
+ std::cerr << "do_early_config returned error: " << r << std::endl;
+ return r;
+ }
+ auto &early_config = early_config_result.value();
+
+ auto seastar_n_early_args = early_config.get_early_args();
+ auto config_proxy_args = early_config.get_ceph_args();
+
+ seastar::app_template::config app_cfg;
+ app_cfg.name = "Crimson";
+ app_cfg.auto_handle_sigint_sigterm = false;
+ seastar::app_template app(std::move(app_cfg));
+ app.add_options()
+ ("mkkey", "generate a new secret key. "
+ "This is normally used in combination with --mkfs")
+ ("mkfs", "create a [new] data directory")
+ ("debug", "enable debug output on all loggers")
+ ("trace", "enable trace output on all loggers")
+ ("osdspec-affinity", bpo::value<std::string>()->default_value(std::string{}),
+ "set affinity to an osdspec")
+ ("prometheus_port", bpo::value<uint16_t>()->default_value(0),
+ "Prometheus port. Set to zero to disable")
+ ("prometheus_address", bpo::value<std::string>()->default_value("0.0.0.0"),
+ "Prometheus listening address")
+ ("prometheus_prefix", bpo::value<std::string>()->default_value("osd"),
+ "Prometheus metrics prefix");
+
+ try {
+ return app.run(
+ seastar_n_early_args.size(),
+ const_cast<char**>(seastar_n_early_args.data()),
+ [&] {
+ auto& config = app.configuration();
+ return seastar::async([&] {
+ try {
+ FatalSignal fatal_signal;
+ seastar_apps_lib::stop_signal should_stop;
+ if (config.count("debug")) {
+ seastar::global_logger_registry().set_all_loggers_level(
+ seastar::log_level::debug
+ );
+ }
+ if (config.count("trace")) {
+ seastar::global_logger_registry().set_all_loggers_level(
+ seastar::log_level::trace
+ );
+ }
+ sharded_conf().start(
+ early_config.init_params.name, early_config.cluster_name).get();
+ local_conf().start().get();
+ auto stop_conf = seastar::deferred_stop(sharded_conf());
+ sharded_perf_coll().start().get();
+ auto stop_perf_coll = seastar::deferred_stop(sharded_perf_coll());
+ local_conf().parse_config_files(early_config.conf_file_list).get();
+ local_conf().parse_env().get();
+ local_conf().parse_argv(config_proxy_args).get();
+ auto log_file_stream = maybe_set_logger();
+ auto reset_logger = seastar::defer([] {
+ logger().set_ostream(std::cerr);
+ });
+ if (const auto ret = pidfile_write(local_conf()->pid_file);
+ ret == -EACCES || ret == -EAGAIN) {
+ ceph_abort_msg(
+ "likely there is another crimson-osd instance with the same id");
+ } else if (ret < 0) {
+ ceph_abort_msg(fmt::format("pidfile_write failed with {} {}",
+ ret, cpp_strerror(-ret)));
+ }
+ // just ignore SIGHUP, we don't reread settings. keep in mind that signals
+ // handled by seastar must be blocked for alien threads (see AlienStore).
+ seastar::engine().handle_signal(SIGHUP, [] {});
+
+ // start prometheus API server
+ seastar::httpd::http_server_control prom_server;
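+ // the deferred_stop below must outlive this if-block, so it is stashed in
+ // a std::any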
+ std::any stop_prometheus;
+ if (uint16_t prom_port = config["prometheus_port"].as<uint16_t>();
+ prom_port != 0) {
+ prom_server.start("prometheus").get();
+ stop_prometheus = seastar::make_shared(seastar::deferred_stop(prom_server));
+
+ seastar::prometheus::config prom_config;
+ prom_config.prefix = config["prometheus_prefix"].as<std::string>();
+ seastar::prometheus::start(prom_server, prom_config).get();
+ seastar::net::inet_address prom_addr(config["prometheus_address"].as<std::string>());
+ prom_server.listen(seastar::socket_address{prom_addr, prom_port})
+ .handle_exception([=] (auto ep) {
+ std::cerr << seastar::format("Could not start Prometheus API server on {}:{}: {}\n",
+ prom_addr, prom_port, ep);
+ return seastar::make_exception_future(ep);
+ }).get();
+ }
+
+ const int whoami = std::stoi(local_conf()->name.get_id());
+ const auto nonce = crimson::osd::get_nonce();
+ crimson::net::MessengerRef cluster_msgr, client_msgr;
+ crimson::net::MessengerRef hb_front_msgr, hb_back_msgr;
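+ // create the four OSD messengers (cluster, client, hb_front and hb_back),
+ // all sharing the same entity name and nonce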
+ for (auto [msgr, name] : {make_pair(std::ref(cluster_msgr), "cluster"s),
+ make_pair(std::ref(client_msgr), "client"s),
+ make_pair(std::ref(hb_front_msgr), "hb_front"s),
+ make_pair(std::ref(hb_back_msgr), "hb_back"s)}) {
+ msgr = crimson::net::Messenger::create(entity_name_t::OSD(whoami),
+ name,
+ nonce,
+ true);
+ }
+ auto store = crimson::os::FuturizedStore::create(
+ local_conf().get_val<std::string>("osd_objectstore"),
+ local_conf().get_val<std::string>("osd_data"),
+ local_conf().get_config_values());
+
+ crimson::osd::OSD osd(
+ whoami, nonce, std::ref(should_stop.abort_source()),
+ std::ref(*store), cluster_msgr, client_msgr,
+ hb_front_msgr, hb_back_msgr);
+
+ if (config.count("mkkey")) {
+ make_keyring().get();
+ }
+ if (local_conf()->no_mon_config) {
+ logger().info("bypassing the config fetch due to --no-mon-config");
+ } else {
+ crimson::osd::populate_config_from_mon().get();
+ }
+ if (config.count("mkfs")) {
+ auto osd_uuid = local_conf().get_val<uuid_d>("osd_uuid");
+ if (osd_uuid.is_zero()) {
+ // use a random osd uuid if not specified
+ osd_uuid.generate_random();
+ }
+ osd.mkfs(
+ *store,
+ whoami,
+ osd_uuid,
+ local_conf().get_val<uuid_d>("fsid"),
+ config["osdspec-affinity"].as<std::string>()).get();
+ }
+ if (config.count("mkkey") || config.count("mkfs")) {
+ return EXIT_SUCCESS;
+ } else {
+ osd.start().get();
+ }
+ logger().info("crimson startup completed");
+ should_stop.wait().get();
+ logger().info("crimson shutting down");
+ osd.stop().get();
+ // stop()s registered using defer() are called here
+ } catch (...) {
+ logger().error("startup failed: {}", std::current_exception());
+ return EXIT_FAILURE;
+ }
+ logger().info("crimson shutdown complete");
+ return EXIT_SUCCESS;
+ });
+ });
+ } catch (...) {
+ fmt::print(std::cerr, "FATAL: Exception during startup, aborting: {}\n", std::current_exception());
+ return EXIT_FAILURE;
+ }
+}
+
+/*
+ * Local Variables:
+ * compile-command: "make -j4 \
+ * -C ../../../build \
+ * crimson-osd"
+ * End:
+ */
diff --git a/src/crimson/osd/main_config_bootstrap_helpers.cc b/src/crimson/osd/main_config_bootstrap_helpers.cc
new file mode 100644
index 000000000..807fd1591
--- /dev/null
+++ b/src/crimson/osd/main_config_bootstrap_helpers.cc
@@ -0,0 +1,265 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "crimson/osd/main_config_bootstrap_helpers.h"
+
+#include <seastar/core/print.hh>
+#include <seastar/core/prometheus.hh>
+#include <seastar/core/thread.hh>
+#include <seastar/http/httpd.hh>
+#include <seastar/net/inet_address.hh>
+#include <seastar/util/closeable.hh>
+#include <seastar/util/defer.hh>
+#include <seastar/util/std-compat.hh>
+
+#include "common/ceph_argparse.h"
+#include "common/config_tracker.h"
+#include "crimson/common/buffer_io.h"
+#include "crimson/common/config_proxy.h"
+#include "crimson/common/fatal_signal.h"
+#include "crimson/mon/MonClient.h"
+#include "crimson/net/Messenger.h"
+#include "crimson/osd/main_config_bootstrap_helpers.h"
+
+using namespace std::literals;
+using crimson::common::local_conf;
+using crimson::common::sharded_conf;
+using crimson::common::sharded_perf_coll;
+
+static seastar::logger& logger()
+{
+ return crimson::get_logger(ceph_subsys_osd);
+}
+
+namespace crimson::osd {
+
+void usage(const char* prog)
+{
+ std::cout << "usage: " << prog << std::endl;
+ generic_server_usage();
+}
+
+
+seastar::future<> populate_config_from_mon()
+{
+ logger().info("populating config from monitor");
+ // we don't have any client identity before joining the cluster, so there is
+ // no need for a proper auth handler
+ class DummyAuthHandler : public crimson::common::AuthHandler {
+ public:
+ void handle_authentication(const EntityName& name,
+ const AuthCapsInfo& caps)
+ {}
+ };
+ return seastar::async([] {
+ auto auth_handler = std::make_unique<DummyAuthHandler>();
+ auto msgr = crimson::net::Messenger::create(entity_name_t::CLIENT(),
+ "temp_mon_client",
+ get_nonce(),
+ true);
+ crimson::mon::Client monc{*msgr, *auth_handler};
+ msgr->set_auth_client(&monc);
+ msgr->start({&monc}).get();
+ auto stop_msgr = seastar::defer([&] {
+ msgr->stop();
+ msgr->shutdown().get();
+ });
+ monc.start().handle_exception([] (auto ep) {
+ fmt::print(std::cerr, "FATAL: unable to connect to cluster: {}\n", ep);
+ return seastar::make_exception_future<>(ep);
+ }).get();
+ auto stop_monc = seastar::defer([&] {
+ monc.stop().get();
+ });
+ monc.sub_want("config", 0, 0);
+ monc.renew_subs().get();
+ // wait for monmap and config
+ monc.wait_for_config().get();
+ auto fsid = monc.get_fsid().to_string();
+ local_conf().set_val("fsid", fsid).get();
+ logger().debug("{}: got config from monitor, fsid {}", __func__, fsid);
+ });
+}
+
+static tl::expected<early_config_t, int>
+_get_early_config(int argc, const char *argv[])
+{
+ early_config_t ret;
+
+ // start with the full argument list and parse the early args out of it below
+ std::vector<const char *> early_args;
+ early_args.insert(
+ std::end(early_args),
+ argv, argv + argc);
+
+ ret.init_params = ceph_argparse_early_args(
+ early_args,
+ CEPH_ENTITY_TYPE_OSD,
+ &ret.cluster_name,
+ &ret.conf_file_list);
+
+ if (ceph_argparse_need_usage(early_args)) {
+ usage(argv[0]);
+ exit(0);
+ }
+
+ seastar::app_template::config app_cfg;
+ app_cfg.name = "Crimson-startup";
+ app_cfg.auto_handle_sigint_sigterm = false;
+ seastar::app_template app(std::move(app_cfg));
+ const char *bootstrap_args[] = { argv[0], "--smp", "1" };
+ int r = app.run(
+ sizeof(bootstrap_args) / sizeof(bootstrap_args[0]),
+ const_cast<char**>(bootstrap_args),
+ [argc, argv, &ret, &early_args] {
+ return seastar::async([argc, argv, &ret, &early_args] {
+ seastar::global_logger_registry().set_all_loggers_level(
+ seastar::log_level::debug);
+ sharded_conf().start(
+ ret.init_params.name, ret.cluster_name).get();
+ local_conf().start().get();
+ auto stop_conf = seastar::deferred_stop(sharded_conf());
+
+ sharded_perf_coll().start().get();
+ auto stop_perf_coll = seastar::deferred_stop(sharded_perf_coll());
+
+ local_conf().parse_env().get();
+ local_conf().parse_argv(early_args).get();
+ local_conf().parse_config_files(ret.conf_file_list).get();
+
+ if (local_conf()->no_mon_config) {
+ logger().info("bypassing the config fetch due to --no-mon-config");
+ } else {
+ populate_config_from_mon().get();
+ }
+
+ // get ceph configs
+ std::set_difference(
+ argv, argv + argc,
+ std::begin(early_args),
+ std::end(early_args),
+ std::back_inserter(ret.ceph_args));
+
+ ret.early_args.insert(
+ std::end(ret.early_args),
+ std::begin(early_args),
+ std::end(early_args));
+
+ if (auto found = std::find_if(
+ std::begin(early_args),
+ std::end(early_args),
+ [](auto* arg) { return "--smp"sv == arg; });
+ found == std::end(early_args)) {
+
+ // Set --smp based on crimson_seastar_smp config option
+ ret.early_args.emplace_back("--smp");
+
+ auto smp_config = local_conf().get_val<uint64_t>(
+ "crimson_seastar_smp");
+
+ ret.early_args.emplace_back(fmt::format("{}", smp_config));
+ logger().info("get_early_config: set --smp {}", smp_config);
+ }
+ return 0;
+ });
+ });
+ if (r < 0) {
+ return tl::unexpected(r);
+ }
+ return ret;
+}
+
+/* get_early_config handles obtaining config parameters required prior
+ * to reactor startup. Most deployment mechanisms (cephadm for one)
+ * rely on pulling configs from the monitor rather than shipping around
+ * config files, so this process needs to support pulling config options
+ * from the monitors.
+ *
+ * Of particular interest are config params related to the seastar
+ * reactor itself which can't be modified after the reactor has been
+ * started -- like the number of cores to use (smp::count). Contacting
+ * the monitors, however, requires a MonClient, which in turn needs a
+ * running reactor.
+ *
+ * Unfortunately, seastar doesn't clean up thread local state
+ * associated with seastar::smp task queues etc, so we can't
+ * start a reactor, stop it, and restart it in the same thread
+ * without an impractical amount of cleanup in seastar.
+ *
+ * More unfortunately, starting a reactor in a separate thread
+ * and then joining the thread still doesn't avoid all global state;
+ * I observed tasks from the previous reactor incarnation nevertheless
+ * continuing to run in the new one, resulting in a crash as they accessed
+ * freed memory.
+ *
+ * The approach taken here, therefore, is to actually fork, start a
+ * reactor in the child process, encode the resulting early_config_t,
+ * and send it back to the parent process.
+ */
+tl::expected<early_config_t, int>
+get_early_config(int argc, const char *argv[])
+{
+ int pipes[2];
+ int r = pipe2(pipes, 0);
+ if (r < 0) {
+ std::cerr << "get_early_config: failed to create pipes: "
+ << -errno << std::endl;
+ return tl::unexpected(-errno);
+ }
+
+ pid_t worker = fork();
+ if (worker < 0) {
+ close(pipes[0]);
+ close(pipes[1]);
+ std::cerr << "get_early_config: failed to fork: "
+ << -errno << std::endl;
+ return tl::unexpected(-errno);
+ } else if (worker == 0) { // child
+ close(pipes[0]);
+ auto ret = _get_early_config(argc, argv);
+ if (ret.has_value()) {
+ bufferlist bl;
+ ::encode(ret.value(), bl);
+ r = bl.write_fd(pipes[1]);
+ close(pipes[1]);
+ if (r < 0) {
+ std::cerr << "get_early_config: child failed to write_fd: "
+ << r << std::endl;
+ exit(-r);
+ } else {
+ exit(0);
+ }
+ } else {
+ std::cerr << "get_early_config: child failed: "
+ << -ret.error() << std::endl;
+ exit(-ret.error());
+ }
+ return tl::unexpected(-1);
+ } else { // parent
+ close(pipes[1]);
+
+ bufferlist bl;
+ early_config_t ret;
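+ // drain the child's encoded early_config_t from the pipe in 1 KiB chunks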
+ while ((r = bl.read_fd(pipes[0], 1024)) > 0);
+ close(pipes[0]);
+
+ // ignore the error; we'll propagate errors based on the read and decode
+ // results below
+ waitpid(worker, nullptr, 0);
+
+ if (r < 0) {
+ std::cerr << "get_early_config: parent failed to read from pipe: "
+ << r << std::endl;
+ return tl::unexpected(r);
+ }
+ try {
+ auto bliter = bl.cbegin();
+ ::decode(ret, bliter);
+ return ret;
+ } catch (...) {
+ std::cerr << "get_early_config: parent failed to decode" << std::endl;
+ return tl::unexpected(-EINVAL);
+ }
+ }
+}
+
+}
diff --git a/src/crimson/osd/main_config_bootstrap_helpers.h b/src/crimson/osd/main_config_bootstrap_helpers.h
new file mode 100644
index 000000000..7c6131d17
--- /dev/null
+++ b/src/crimson/osd/main_config_bootstrap_helpers.h
@@ -0,0 +1,99 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <iostream>
+#include <fstream>
+#include <random>
+
+#include <seastar/core/future.hh>
+
+#include "common/ceph_argparse.h"
+#include "include/expected.hpp"
+
+namespace crimson::osd {
+
+void usage(const char* prog);
+
+inline uint64_t get_nonce()
+{
+ if (auto pid = getpid(); pid == 1 || std::getenv("CEPH_USE_RANDOM_NONCE")) {
+ // we're running in a container; use a random number instead!
+ std::random_device rd;
+ std::default_random_engine rng{rd()};
+ return std::uniform_int_distribution<uint64_t>{}(rng);
+ } else {
+ return pid;
+ }
+}
+
+seastar::future<> populate_config_from_mon();
+
+struct early_config_t {
+ std::vector<std::string> early_args;
+ std::vector<std::string> ceph_args;
+
+ std::string cluster_name{"ceph"};
+ std::string conf_file_list;
+ CephInitParameters init_params{CEPH_ENTITY_TYPE_OSD};
+
+ /// the returned vector of pointers must not outlive the `in` vector
+ auto to_ptr_vector(const std::vector<std::string> &in) {
+ std::vector<const char *> ret;
+ ret.reserve(in.size());
+ std::transform(
+ std::begin(in), std::end(in),
+ std::back_inserter(ret),
+ [](const auto &str) { return str.c_str(); });
+ return ret;
+ }
+
+ std::vector<const char *> get_early_args() {
+ return to_ptr_vector(early_args);
+ }
+
+ std::vector<const char *> get_ceph_args() {
+ return to_ptr_vector(ceph_args);
+ }
+
+ void encode(ceph::buffer::list& bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(early_args, bl);
+ encode(ceph_args, bl);
+ encode(cluster_name, bl);
+ encode(conf_file_list, bl);
+ encode(init_params, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(ceph::buffer::list::const_iterator& bl) {
+ DECODE_START(1, bl);
+ decode(early_args, bl);
+ decode(ceph_args, bl);
+ decode(cluster_name, bl);
+ decode(conf_file_list, bl);
+ decode(init_params, bl);
+ DECODE_FINISH(bl);
+ }
+};
+
+/**
+ * get_early_config
+ *
+ * Compile initial configuration information from command line arguments,
+ * config files, and monitors.
+ *
+ * This implementation forks off a worker process to do this work and must
+ * therefore be called very early in main(). (See implementation for an
+ * explanation).
+ */
+tl::expected<early_config_t, int>
+get_early_config(int argc, const char *argv[]);
+
+}
+
+WRITE_CLASS_ENCODER(crimson::osd::early_config_t)
diff --git a/src/crimson/osd/objclass.cc b/src/crimson/osd/objclass.cc
new file mode 100644
index 000000000..4cc9d7336
--- /dev/null
+++ b/src/crimson/osd/objclass.cc
@@ -0,0 +1,584 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <cstdarg>
+#include <cstring>
+#include <boost/container/small_vector.hpp>
+#include "common/ceph_context.h"
+#include "common/ceph_releases.h"
+#include "common/config.h"
+#include "crimson/common/config_proxy.h"
+#include "common/debug.h"
+
+#include "crimson/osd/exceptions.h"
+#include "crimson/osd/ops_executer.h"
+#include "crimson/osd/pg_backend.h"
+
+#include "objclass/objclass.h"
+#include "osd/ClassHandler.h"
+
+#include "auth/Crypto.h"
+#include "common/armor.h"
+
+using std::map;
+using std::string;
+
+#define dout_context ClassHandler::get_instance().cct
+
+static constexpr int dout_subsys = ceph_subsys_objclass;
+
+static inline int execute_osd_op(cls_method_context_t hctx, OSDOp& op)
+{
+ // we can expect the memory under `ret` to still be valid after
+ // executing the osd op as we're running inside `seastar::thread`
+ // created for us by `seastar::async` in `::do_op_call()`.
+ int ret = 0;
+ using osd_op_errorator = crimson::osd::OpsExecuter::osd_op_errorator;
+ reinterpret_cast<crimson::osd::OpsExecuter*>(hctx)->execute_op(op)
+ .handle_error_interruptible(
+ osd_op_errorator::all_same_way([&ret] (const std::error_code& err) {
+ assert(err.value() > 0);
+ ret = -err.value();
+ return seastar::now();
+ })).get(); // we're blocking here which requires `seastar::thread`.
+ return ret;
+}
+
+int cls_call(cls_method_context_t hctx, const char *cls, const char *method,
+ char *indata, int datalen,
+ char **outdata, int *outdatalen)
+{
+// FIXME, HACK: this is for testing only. Let's use the dynamic linker to verify
+// our dependencies
+ return 0;
+}
+
+int cls_getxattr(cls_method_context_t hctx,
+ const char *name,
+ char **outdata,
+ int *outdatalen)
+{
+ return 0;
+}
+
+int cls_setxattr(cls_method_context_t hctx,
+ const char *name,
+ const char *value,
+ int val_len)
+{
+ return 0;
+}
+
+int cls_read(cls_method_context_t hctx,
+ int ofs, int len,
+ char **outdata,
+ int *outdatalen)
+{
+ return 0;
+}
+
+int cls_get_request_origin(cls_method_context_t hctx, entity_inst_t *origin)
+{
+ assert(origin);
+
+ try {
+ const auto& message = \
+ reinterpret_cast<crimson::osd::OpsExecuter*>(hctx)->get_message();
+ *origin = message.get_orig_source_inst();
+ return 0;
+ } catch (crimson::osd::error& e) {
+ return -e.code().value();
+ }
+}
+
+int cls_cxx_create(cls_method_context_t hctx, const bool exclusive)
+{
+ OSDOp op{CEPH_OSD_OP_CREATE};
+ op.op.flags = (exclusive ? CEPH_OSD_OP_FLAG_EXCL : 0);
+ return execute_osd_op(hctx, op);
+}
+
+int cls_cxx_remove(cls_method_context_t hctx)
+{
+ OSDOp op{CEPH_OSD_OP_DELETE};
+ return execute_osd_op(hctx, op);
+}
+
+int cls_cxx_stat(cls_method_context_t hctx, uint64_t *size, time_t *mtime)
+{
+ OSDOp op{CEPH_OSD_OP_STAT};
+ if (const auto ret = execute_osd_op(hctx, op); ret < 0) {
+ return ret;
+ }
+ utime_t ut;
+ uint64_t s;
+ try {
+ auto iter = op.outdata.cbegin();
+ decode(s, iter);
+ decode(ut, iter);
+ } catch (buffer::error& err) {
+ return -EIO;
+ }
+ if (size) {
+ *size = s;
+ }
+ if (mtime) {
+ *mtime = ut.sec();
+ }
+ return 0;
+}
+
+int cls_cxx_stat2(cls_method_context_t hctx,
+ uint64_t *size,
+ ceph::real_time *mtime)
+{
+ OSDOp op{CEPH_OSD_OP_STAT};
+ if (const int ret = execute_osd_op(hctx, op); ret < 0) {
+ return ret;
+ }
+ uint64_t dummy_size;
+ real_time dummy_mtime;
+ uint64_t& out_size = size ? *size : dummy_size;
+ real_time& out_mtime = mtime ? *mtime : dummy_mtime;
+ try {
+ auto iter = op.outdata.cbegin();
+ decode(out_size, iter);
+ decode(out_mtime, iter);
+ return 0;
+ } catch (buffer::error& err) {
+ return -EIO;
+ }
+}
+
+int cls_cxx_read2(cls_method_context_t hctx,
+ int ofs,
+ int len,
+ bufferlist *outbl,
+ uint32_t op_flags)
+{
+ OSDOp op{CEPH_OSD_OP_SYNC_READ};
+ op.op.extent.offset = ofs;
+ op.op.extent.length = len;
+ op.op.flags = op_flags;
+ if (const auto ret = execute_osd_op(hctx, op); ret < 0) {
+ return ret;
+ }
+ *outbl = std::move(op.outdata);
+ return outbl->length();
+}
+
+int cls_cxx_write2(cls_method_context_t hctx,
+ int ofs,
+ int len,
+ bufferlist *inbl,
+ uint32_t op_flags)
+{
+ OSDOp op{CEPH_OSD_OP_WRITE};
+ op.op.extent.offset = ofs;
+ op.op.extent.length = len;
+ op.op.flags = op_flags;
+ op.indata = *inbl;
+ return execute_osd_op(hctx, op);
+}
+
+int cls_cxx_write_full(cls_method_context_t hctx, bufferlist * const inbl)
+{
+ OSDOp op{CEPH_OSD_OP_WRITEFULL};
+ op.op.extent.offset = 0;
+ op.op.extent.length = inbl->length();
+ op.indata = *inbl;
+ return execute_osd_op(hctx, op);
+}
+
+int cls_cxx_replace(cls_method_context_t hctx,
+ int ofs,
+ int len,
+ bufferlist *inbl)
+{
+ {
+ OSDOp top{CEPH_OSD_OP_TRUNCATE};
+ top.op.extent.offset = 0;
+ top.op.extent.length = 0;
+ if (const auto ret = execute_osd_op(hctx, top); ret < 0) {
+ return ret;
+ }
+ }
+
+ {
+ OSDOp wop{CEPH_OSD_OP_WRITE};
+ wop.op.extent.offset = ofs;
+ wop.op.extent.length = len;
+ wop.indata = *inbl;
+ if (const auto ret = execute_osd_op(hctx, wop); ret < 0) {
+ return ret;
+ }
+ }
+ return 0;
+}
+
+int cls_cxx_truncate(cls_method_context_t hctx, int ofs)
+{
+ OSDOp op{CEPH_OSD_OP_TRUNCATE};
+ op.op.extent.offset = ofs;
+ op.op.extent.length = 0;
+ return execute_osd_op(hctx, op);
+}
+
+int cls_cxx_write_zero(cls_method_context_t hctx, int offset, int len)
+{
+ OSDOp op{CEPH_OSD_OP_ZERO};
+ op.op.extent.offset = offset;
+ op.op.extent.length = len;
+ return execute_osd_op(hctx, op);
+}
+
+int cls_cxx_getxattr(cls_method_context_t hctx,
+ const char *name,
+ bufferlist *outbl)
+{
+ OSDOp op{CEPH_OSD_OP_GETXATTR};
+ op.op.xattr.name_len = strlen(name);
+ op.indata.append(name, op.op.xattr.name_len);
+ if (const auto ret = execute_osd_op(hctx, op); ret < 0) {
+ return ret;
+ }
+ *outbl = std::move(op.outdata);
+ return outbl->length();
+}
+
+int cls_cxx_getxattrs(cls_method_context_t hctx,
+ map<string, bufferlist> *attrset)
+{
+ OSDOp op{CEPH_OSD_OP_GETXATTRS};
+ if (const int ret = execute_osd_op(hctx, op); ret < 0) {
+ return ret;
+ }
+ try {
+ auto iter = op.outdata.cbegin();
+ decode(*attrset, iter);
+ } catch (buffer::error& err) {
+ return -EIO;
+ }
+ return 0;
+}
+
+int cls_cxx_setxattr(cls_method_context_t hctx,
+ const char *name,
+ bufferlist *inbl)
+{
+ OSDOp op{CEPH_OSD_OP_SETXATTR};
+ op.op.xattr.name_len = std::strlen(name);
+ op.op.xattr.value_len = inbl->length();
+ op.indata.append(name, op.op.xattr.name_len);
+ op.indata.append(*inbl);
+ return execute_osd_op(hctx, op);
+}
+
+int cls_cxx_snap_revert(cls_method_context_t hctx, snapid_t snapid)
+{
+ OSDOp op{CEPH_OSD_OP_ROLLBACK};
+ op.op.snap.snapid = snapid;
+ return execute_osd_op(hctx, op);
+}
+
+int cls_cxx_map_get_all_vals(cls_method_context_t hctx,
+ map<string, bufferlist>* vals,
+ bool *more)
+{
+ return 0;
+}
+
+int cls_cxx_map_get_keys(cls_method_context_t hctx,
+ const std::string& start_obj,
+ const uint64_t max_to_get,
+ std::set<std::string>* const keys,
+ bool* const more)
+{
+ OSDOp op{CEPH_OSD_OP_OMAPGETKEYS};
+ encode(start_obj, op.indata);
+ encode(max_to_get, op.indata);
+ if (const auto ret = execute_osd_op(hctx, op); ret < 0) {
+ return ret;
+ }
+ try {
+ auto iter = op.outdata.cbegin();
+ decode(*keys, iter);
+ decode(*more, iter);
+ } catch (buffer::error&) {
+ return -EIO;
+ }
+ return keys->size();
+}
+
+int cls_cxx_map_get_vals(cls_method_context_t hctx,
+ const std::string& start_obj,
+ const std::string& filter_prefix,
+ const uint64_t max_to_get,
+ std::map<std::string, ceph::bufferlist> *vals,
+ bool* const more)
+{
+ OSDOp op{CEPH_OSD_OP_OMAPGETVALS};
+ encode(start_obj, op.indata);
+ encode(max_to_get, op.indata);
+ encode(filter_prefix, op.indata);
+ if (const auto ret = execute_osd_op(hctx, op); ret < 0) {
+ return ret;
+ }
+ try {
+ auto iter = op.outdata.cbegin();
+ decode(*vals, iter);
+ decode(*more, iter);
+ } catch (buffer::error&) {
+ return -EIO;
+ }
+ return vals->size();
+}
+
+int cls_cxx_map_get_vals_by_keys(cls_method_context_t hctx,
+ const std::set<std::string> &keys,
+ std::map<std::string, ceph::bufferlist> *vals)
+{
+ OSDOp op{CEPH_OSD_OP_OMAPGETVALSBYKEYS};
+ encode(keys, op.indata);
+ if (const auto ret = execute_osd_op(hctx, op); ret < 0) {
+ return ret;
+ }
+ try {
+ auto iter = op.outdata.cbegin();
+ decode(*vals, iter);
+ } catch (buffer::error&) {
+ return -EIO;
+ }
+ return 0;
+}
+
+int cls_cxx_map_read_header(cls_method_context_t hctx, bufferlist *outbl)
+{
+ OSDOp op{CEPH_OSD_OP_OMAPGETHEADER};
+ if (const auto ret = execute_osd_op(hctx, op); ret < 0) {
+ return ret;
+ }
+ *outbl = std::move(op.outdata);
+ return 0;
+}
+
+int cls_cxx_map_get_val(cls_method_context_t hctx,
+ const string &key,
+ bufferlist *outbl)
+{
+ OSDOp op{CEPH_OSD_OP_OMAPGETVALSBYKEYS};
+ {
+ std::set<std::string> k{key};
+ encode(k, op.indata);
+ }
+ if (const auto ret = execute_osd_op(hctx, op); ret < 0) {
+ return ret;
+ }
+ std::map<std::string, ceph::bufferlist> m;
+ try {
+ auto iter = op.outdata.cbegin();
+ decode(m, iter);
+ } catch (buffer::error&) {
+ return -EIO;
+ }
+ if (auto iter = std::begin(m); iter != std::end(m)) {
+ *outbl = std::move(iter->second);
+ return 0;
+ } else {
+ return -ENOENT;
+ }
+}
+
+int cls_cxx_map_set_val(cls_method_context_t hctx,
+ const string &key,
+ bufferlist *inbl)
+{
+ OSDOp op{CEPH_OSD_OP_OMAPSETVALS};
+ {
+ std::map<std::string, ceph::bufferlist> m;
+ m[key] = *inbl;
+ encode(m, op.indata);
+ }
+ return execute_osd_op(hctx, op);
+}
+
+int cls_cxx_map_set_vals(cls_method_context_t hctx,
+ const std::map<string, ceph::bufferlist> *map)
+{
+ OSDOp op{CEPH_OSD_OP_OMAPSETVALS};
+ encode(*map, op.indata);
+ return execute_osd_op(hctx, op);
+}
+
+int cls_cxx_map_clear(cls_method_context_t hctx)
+{
+ OSDOp op{CEPH_OSD_OP_OMAPCLEAR};
+ return execute_osd_op(hctx, op);
+}
+
+int cls_cxx_map_write_header(cls_method_context_t hctx, bufferlist *inbl)
+{
+ OSDOp op{CEPH_OSD_OP_OMAPSETHEADER};
+ op.indata = std::move(*inbl);
+ return execute_osd_op(hctx, op);
+}
+
+int cls_cxx_map_remove_range(cls_method_context_t hctx,
+ const std::string& key_begin,
+ const std::string& key_end)
+{
+ OSDOp op{CEPH_OSD_OP_OMAPRMKEYRANGE};
+ encode(key_begin, op.indata);
+ encode(key_end, op.indata);
+ return execute_osd_op(hctx, op);
+}
+
+int cls_cxx_map_remove_key(cls_method_context_t hctx, const string &key)
+{
+ OSDOp op{CEPH_OSD_OP_OMAPRMKEYS};
+ std::vector<string> to_rm{key};
+ encode(to_rm, op.indata);
+ return execute_osd_op(hctx, op);
+}
+
+int cls_cxx_list_watchers(cls_method_context_t hctx,
+ obj_list_watch_response_t *watchers)
+{
+ OSDOp op{CEPH_OSD_OP_LIST_WATCHERS};
+ if (const auto ret = execute_osd_op(hctx, op); ret < 0) {
+ return ret;
+ }
+
+ try {
+ auto iter = op.outdata.cbegin();
+ decode(*watchers, iter);
+ } catch (buffer::error&) {
+ return -EIO;
+ }
+ return 0;
+}
+
+uint64_t cls_current_version(cls_method_context_t hctx)
+{
+ auto* ox = reinterpret_cast<crimson::osd::OpsExecuter*>(hctx);
+ return ox->get_last_user_version();
+}
+
+
+int cls_current_subop_num(cls_method_context_t hctx)
+{
+ auto* ox = reinterpret_cast<crimson::osd::OpsExecuter*>(hctx);
+ // in contrast to the classical OSD, crimson doesn't count OP_CALL and
+ // OP_STAT, which seems fine given how the plugins we care about use
+ // this part of the API.
+ return ox->get_processed_rw_ops_num();
+}
+
+uint64_t cls_get_features(cls_method_context_t hctx)
+{
+ return 0;
+}
+
+uint64_t cls_get_client_features(cls_method_context_t hctx)
+{
+ try {
+ const auto& message = \
+ reinterpret_cast<crimson::osd::OpsExecuter*>(hctx)->get_message();
+ return message.get_features();
+ } catch (crimson::osd::error& e) {
+ return -e.code().value();
+ }
+}
+
+uint64_t cls_get_pool_stripe_width(cls_method_context_t hctx)
+{
+ auto* ox = reinterpret_cast<crimson::osd::OpsExecuter*>(hctx);
+ return ox->get_pool_stripe_width();
+}
+
+ceph_release_t cls_get_required_osd_release(cls_method_context_t hctx)
+{
+ // FIXME
+ return ceph_release_t::nautilus;
+}
+
+ceph_release_t cls_get_min_compatible_client(cls_method_context_t hctx)
+{
+ // FIXME
+ return ceph_release_t::nautilus;
+}
+
+const ConfigProxy& cls_get_config(cls_method_context_t hctx)
+{
+ return crimson::common::local_conf();
+}
+
+const object_info_t& cls_get_object_info(cls_method_context_t hctx)
+{
+ return reinterpret_cast<crimson::osd::OpsExecuter*>(hctx)->get_object_info();
+}
+
+int cls_get_snapset_seq(cls_method_context_t hctx, uint64_t *snap_seq)
+{
+ auto* ox = reinterpret_cast<crimson::osd::OpsExecuter*>(hctx);
+ auto obc = ox->get_obc();
+ if (!obc->obs.exists ||
+ (obc->obs.oi.is_whiteout() &&
+ obc->ssc->snapset.clones.empty())) {
+ return -ENOENT;
+ }
+ *snap_seq = obc->ssc->snapset.seq;
+ return 0;
+}
+
+int cls_cxx_chunk_write_and_set(cls_method_context_t hctx,
+ int ofs,
+ int len,
+ bufferlist *write_inbl,
+ uint32_t op_flags,
+ bufferlist *set_inbl,
+ int set_len)
+{
+ return 0;
+}
+
+int cls_get_manifest_ref_count(cls_method_context_t hctx, string fp_oid)
+{
+ return 0;
+}
+
+uint64_t cls_get_osd_min_alloc_size(cls_method_context_t hctx) {
+ // FIXME
+ return 4096;
+}
+
+int cls_cxx_gather(cls_method_context_t hctx, const std::set<std::string> &src_objs, const std::string& pool,
+ const char *cls, const char *method, bufferlist& inbl)
+{
+ return 0;
+}
+
+int cls_cxx_get_gathered_data(cls_method_context_t hctx, std::map<std::string, bufferlist> *results)
+{
+ return 0;
+}
+
+// although at first glance the implementation looks the same as in
+// the classical OSD, it's different because of how the dout macro expands.
+int cls_log(int level, const char *format, ...)
+{
+ size_t size = 256;
+ va_list ap;
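+ // format into a stack-backed buffer, doubling its size until vsnprintf's
+ // output fits or the MAX_SIZE cap is exceeded, then emit a single dout() line.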
+ while (1) {
+ boost::container::small_vector<char, 256> buf(size);
+ va_start(ap, format);
+ int n = vsnprintf(buf.data(), size, format, ap);
+ va_end(ap);
+#define MAX_SIZE 8196UL
+ if ((n > -1 && static_cast<size_t>(n) < size) || size > MAX_SIZE) {
+ dout(ceph::dout::need_dynamic(level)) << buf.data() << dendl;
+ return n;
+ }
+ size *= 2;
+ }
+}
diff --git a/src/crimson/osd/object_context.cc b/src/crimson/osd/object_context.cc
new file mode 100644
index 000000000..1ea701c22
--- /dev/null
+++ b/src/crimson/osd/object_context.cc
@@ -0,0 +1,85 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "crimson/osd/object_context.h"
+
+#include <fmt/ranges.h>
+
+#include "common/Formatter.h"
+#include "crimson/common/config_proxy.h"
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_osd);
+ }
+}
+
+namespace crimson::osd {
+
+ObjectContextRegistry::ObjectContextRegistry(crimson::common::ConfigProxy &conf)
+{
+ obc_lru.set_target_size(conf.get_val<uint64_t>("crimson_osd_obc_lru_size"));
+ conf.add_observer(this);
+}
+
+ObjectContextRegistry::~ObjectContextRegistry()
+{
+ // purge the cache to avoid leaks and complaints from LSan
+ obc_lru.set_target_size(0UL);
+}
+
+const char** ObjectContextRegistry::get_tracked_conf_keys() const
+{
+ static const char* KEYS[] = {
+ "crimson_osd_obc_lru_size",
+ nullptr
+ };
+ return KEYS;
+}
+
+void ObjectContextRegistry::handle_conf_change(
+ const crimson::common::ConfigProxy& conf,
+ const std::set <std::string> &changed)
+{
+ obc_lru.set_target_size(conf.get_val<uint64_t>("crimson_osd_obc_lru_size"));
+}
+
+std::optional<hobject_t> resolve_oid(
+ const SnapSet &ss,
+ const hobject_t &oid)
+{
+ logger().debug("{} oid.snap={},head snapset.seq={}",
+ __func__, oid.snap, ss.seq);
+ if (oid.snap > ss.seq) {
+ // Because oid.snap > ss.seq, we are trying to read from a snapshot
+ // taken after the most recent write to this object. Read from head.
+ return oid.get_head();
+ } else {
+ // which clone would it be?
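+ // clones are kept in ascending order, so lower_bound yields the oldest
+ // clone whose id is >= oid.snap -- the clone that would hold the object's
+ // state as of that snapshot, if any such clone exists.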
+ auto clone = std::lower_bound(
+ begin(ss.clones), end(ss.clones),
+ oid.snap);
+ if (clone == end(ss.clones)) {
+ // Doesn't exist, > last clone, < ss.seq
+ return std::nullopt;
+ }
+ auto citer = ss.clone_snaps.find(*clone);
+ // TODO: how do we want to handle this kind of logic error?
+ ceph_assert(citer != ss.clone_snaps.end());
+
+ if (std::find(
+ citer->second.begin(),
+ citer->second.end(),
+ oid.snap) == citer->second.end()) {
+ logger().debug("{} {} does not contain {} -- DNE",
+ __func__, ss.clone_snaps, oid.snap);
+ return std::nullopt;
+ } else {
+ auto soid = oid;
+ soid.snap = *clone;
+ return std::optional<hobject_t>(soid);
+ }
+ }
+}
+
+}
diff --git a/src/crimson/osd/object_context.h b/src/crimson/osd/object_context.h
new file mode 100644
index 000000000..8abf6d3f7
--- /dev/null
+++ b/src/crimson/osd/object_context.h
@@ -0,0 +1,276 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <map>
+#include <optional>
+#include <utility>
+#include <seastar/core/shared_future.hh>
+#include <seastar/core/shared_ptr.hh>
+
+#include "common/intrusive_lru.h"
+#include "osd/object_state.h"
+#include "crimson/common/exception.h"
+#include "crimson/common/tri_mutex.h"
+#include "crimson/osd/osd_operation.h"
+
+namespace ceph {
+ class Formatter;
+}
+
+namespace crimson::common {
+ class ConfigProxy;
+}
+
+namespace crimson::osd {
+
+class Watch;
+struct SnapSetContext;
+using SnapSetContextRef = boost::intrusive_ptr<SnapSetContext>;
+
+template <typename OBC>
+struct obc_to_hoid {
+ using type = hobject_t;
+ const type &operator()(const OBC &obc) {
+ return obc.obs.oi.soid;
+ }
+};
+
+struct SnapSetContext :
+ public boost::intrusive_ref_counter<SnapSetContext,
+ boost::thread_unsafe_counter>
+{
+ hobject_t oid;
+ SnapSet snapset;
+ bool exists = false;
+ /**
+ * exists
+ *
+ * Because ObjectContexts are cached, we need to be able to express the case
+ * where the object to which a cached ObjectContext refers does not exist.
+ * ObjectContexts for yet-to-be-created objects are initialized with exists=false.
+ * The ObjectContext for a deleted object will have exists set to false until it
+ * falls out of cache (or another write recreates the object).
+ */
+ explicit SnapSetContext(const hobject_t& o) :
+ oid(o), exists(false) {}
+};
+
+class ObjectContext : public ceph::common::intrusive_lru_base<
+ ceph::common::intrusive_lru_config<
+ hobject_t, ObjectContext, obc_to_hoid<ObjectContext>>>
+{
+public:
+ ObjectState obs;
+ SnapSetContextRef ssc;
+ // the watch/notify machinery stays away from the hot, frequently taken
+ // paths; std::map is used here mostly for developer convenience.
+ using watch_key_t = std::pair<uint64_t, entity_name_t>;
+ std::map<watch_key_t, seastar::shared_ptr<crimson::osd::Watch>> watchers;
+
+ ObjectContext(hobject_t hoid) : obs(std::move(hoid)) {}
+
+ const hobject_t &get_oid() const {
+ return obs.oi.soid;
+ }
+
+ bool is_head() const {
+ return get_oid().is_head();
+ }
+
+ hobject_t get_head_oid() const {
+ return get_oid().get_head();
+ }
+
+ const SnapSet &get_head_ss() const {
+ ceph_assert(is_head());
+ ceph_assert(ssc);
+ return ssc->snapset;
+ }
+
+ void set_head_state(ObjectState &&_obs, SnapSetContextRef &&_ssc) {
+ ceph_assert(is_head());
+ obs = std::move(_obs);
+ ssc = std::move(_ssc);
+ }
+
+ void set_clone_state(ObjectState &&_obs) {
+ ceph_assert(!is_head());
+ obs = std::move(_obs);
+ }
+
+ /// pass the provided exception to any waiting consumers of this ObjectContext
+ template<typename Exception>
+ void interrupt(Exception ex) {
+ lock.abort(std::move(ex));
+ if (recovery_read_marker) {
+ drop_recovery_read();
+ }
+ }
+
+private:
+ tri_mutex lock;
+ bool recovery_read_marker = false;
+
+ template <typename Lock, typename Func>
+ auto _with_lock(Lock&& lock, Func&& func) {
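+ // the Ref taken here keeps this ObjectContext alive for the whole
+ // lock -> func -> unlock chain, even if the caller drops its own
+ // reference in the meantime.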
+ Ref obc = this;
+ return lock.lock().then([&lock, func = std::forward<Func>(func), obc]() mutable {
+ return seastar::futurize_invoke(func).finally([&lock, obc] {
+ lock.unlock();
+ });
+ });
+ }
+
+ boost::intrusive::list_member_hook<> list_hook;
+ uint64_t list_link_cnt = 0;
+
+public:
+
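+ // list membership is reference counted: several in-flight operations may
+ // append_to() the same accessing list, but the obc is linked only on the
+ // first append and unlinked when the last remove_from() drops the count to 0.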
+ template <typename ListType>
+ void append_to(ListType& list) {
+ if (list_link_cnt++ == 0) {
+ list.push_back(*this);
+ }
+ }
+
+ template <typename ListType>
+ void remove_from(ListType&& list) {
+ assert(list_link_cnt > 0);
+ if (--list_link_cnt == 0) {
+ list.erase(std::decay_t<ListType>::s_iterator_to(*this));
+ }
+ }
+
+ using obc_accessing_option_t = boost::intrusive::member_hook<
+ ObjectContext,
+ boost::intrusive::list_member_hook<>,
+ &ObjectContext::list_hook>;
+
+ template<RWState::State Type, typename InterruptCond = void, typename Func>
+ auto with_lock(Func&& func) {
+ if constexpr (!std::is_void_v<InterruptCond>) {
+ auto wrapper = ::crimson::interruptible::interruptor<InterruptCond>::wrap_function(std::forward<Func>(func));
+ switch (Type) {
+ case RWState::RWWRITE:
+ return _with_lock(lock.for_write(), std::move(wrapper));
+ case RWState::RWREAD:
+ return _with_lock(lock.for_read(), std::move(wrapper));
+ case RWState::RWEXCL:
+ return _with_lock(lock.for_excl(), std::move(wrapper));
+ case RWState::RWNONE:
+ return seastar::futurize_invoke(std::move(wrapper));
+ default:
+ assert(0 == "noop");
+ }
+ } else {
+ switch (Type) {
+ case RWState::RWWRITE:
+ return _with_lock(lock.for_write(), std::forward<Func>(func));
+ case RWState::RWREAD:
+ return _with_lock(lock.for_read(), std::forward<Func>(func));
+ case RWState::RWEXCL:
+ return _with_lock(lock.for_excl(), std::forward<Func>(func));
+ case RWState::RWNONE:
+ return seastar::futurize_invoke(std::forward<Func>(func));
+ default:
+ assert(0 == "noop");
+ }
+ }
+ }
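+ // with_promoted_lock assumes a lock of the given Type is already held and
+ // promotes it to exclusive for the duration of func (RWNONE simply takes
+ // the exclusive lock outright).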
+ template<RWState::State Type, typename InterruptCond = void, typename Func>
+ auto with_promoted_lock(Func&& func) {
+ if constexpr (!std::is_void_v<InterruptCond>) {
+ auto wrapper = ::crimson::interruptible::interruptor<InterruptCond>::wrap_function(std::forward<Func>(func));
+ switch (Type) {
+ case RWState::RWWRITE:
+ return _with_lock(lock.excl_from_write(), std::move(wrapper));
+ case RWState::RWREAD:
+ return _with_lock(lock.excl_from_read(), std::move(wrapper));
+ case RWState::RWEXCL:
+ return _with_lock(lock.excl_from_excl(), std::move(wrapper));
+ case RWState::RWNONE:
+ return _with_lock(lock.for_excl(), std::move(wrapper));
+ default:
+ assert(0 == "noop");
+ }
+ } else {
+ switch (Type) {
+ case RWState::RWWRITE:
+ return _with_lock(lock.excl_from_write(), std::forward<Func>(func));
+ case RWState::RWREAD:
+ return _with_lock(lock.excl_from_read(), std::forward<Func>(func));
+ case RWState::RWEXCL:
+ return _with_lock(lock.excl_from_excl(), std::forward<Func>(func));
+ case RWState::RWNONE:
+ return _with_lock(lock.for_excl(), std::forward<Func>(func));
+ default:
+ assert(0 == "noop");
+ }
+ }
+ }
+
+ bool empty() const {
+ return !lock.is_acquired();
+ }
+ bool is_request_pending() const {
+ return lock.is_acquired();
+ }
+
+ bool get_recovery_read() {
+ if (lock.try_lock_for_read()) {
+ recovery_read_marker = true;
+ return true;
+ } else {
+ return false;
+ }
+ }
+ void wait_recovery_read() {
+ assert(lock.get_readers() > 0);
+ recovery_read_marker = true;
+ }
+ void drop_recovery_read() {
+ assert(recovery_read_marker);
+ recovery_read_marker = false;
+ }
+ bool maybe_get_excl() {
+ return lock.try_lock_for_excl();
+ }
+};
+using ObjectContextRef = ObjectContext::Ref;
+
+class ObjectContextRegistry : public md_config_obs_t {
+ ObjectContext::lru_t obc_lru;
+
+public:
+ ObjectContextRegistry(crimson::common::ConfigProxy &conf);
+ ~ObjectContextRegistry();
+
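+ // returns the cached ObjectContext for hoid, creating an empty one on a
+ // miss; the bool tells the caller whether the entry already existed in
+ // the cache.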
+ std::pair<ObjectContextRef, bool> get_cached_obc(const hobject_t &hoid) {
+ return obc_lru.get_or_create(hoid);
+ }
+ ObjectContextRef maybe_get_cached_obc(const hobject_t &hoid) {
+ return obc_lru.get(hoid);
+ }
+
+ void clear_range(const hobject_t &from,
+ const hobject_t &to) {
+ obc_lru.clear_range(from, to);
+ }
+
+ template <class F>
+ void for_each(F&& f) {
+ obc_lru.for_each(std::forward<F>(f));
+ }
+
+ const char** get_tracked_conf_keys() const final;
+ void handle_conf_change(const crimson::common::ConfigProxy& conf,
+ const std::set <std::string> &changed) final;
+};
+
+std::optional<hobject_t> resolve_oid(const SnapSet &ss,
+ const hobject_t &oid);
+
+} // namespace crimson::osd
diff --git a/src/crimson/osd/object_context_loader.cc b/src/crimson/osd/object_context_loader.cc
new file mode 100644
index 000000000..0a4d74c0d
--- /dev/null
+++ b/src/crimson/osd/object_context_loader.cc
@@ -0,0 +1,232 @@
+#include "crimson/osd/object_context_loader.h"
+#include "osd/osd_types_fmt.h"
+
+SET_SUBSYS(osd);
+
+namespace crimson::osd {
+
+using crimson::common::local_conf;
+
+ template<RWState::State State>
+ ObjectContextLoader::load_obc_iertr::future<>
+ ObjectContextLoader::with_head_obc(ObjectContextRef obc,
+ bool existed,
+ with_obc_func_t&& func)
+ {
+ LOG_PREFIX(ObjectContextLoader::with_head_obc);
+ DEBUGDPP("object {}", dpp, obc->get_oid());
+ assert(obc->is_head());
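+ // keep the obc on obc_set_accessing for the duration of the operation so
+ // that notify_on_change() can interrupt it if the acting set changes.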
+ obc->append_to(obc_set_accessing);
+ return obc->with_lock<State, IOInterruptCondition>(
+ [existed=existed, obc=obc, func=std::move(func), this] {
+ return get_or_load_obc<State>(obc, existed)
+ .safe_then_interruptible(
+ [func = std::move(func)](auto obc) {
+ return std::move(func)(std::move(obc));
+ });
+ }).finally([FNAME, this, obc=std::move(obc)] {
+ DEBUGDPP("released object {}", dpp, obc->get_oid());
+ obc->remove_from(obc_set_accessing);
+ });
+ }
+
+ template<RWState::State State>
+ ObjectContextLoader::load_obc_iertr::future<>
+ ObjectContextLoader::with_clone_obc(hobject_t oid,
+ with_obc_func_t&& func)
+ {
+ LOG_PREFIX(ObjectContextLoader::with_clone_obc);
+ assert(!oid.is_head());
+ return with_obc<RWState::RWREAD>(
+ oid.get_head(),
+ [FNAME, oid, func=std::move(func), this](auto head) mutable
+ -> load_obc_iertr::future<> {
+ if (!head->obs.exists) {
+ ERRORDPP("head doesn't exist for object {}", dpp, head->obs.oi.soid);
+ return load_obc_iertr::future<>{
+ crimson::ct_error::enoent::make()
+ };
+ }
+ return this->with_clone_obc_only<State>(std::move(head),
+ oid,
+ std::move(func));
+ });
+ }
+
+ template<RWState::State State>
+ ObjectContextLoader::load_obc_iertr::future<>
+ ObjectContextLoader::with_clone_obc_only(ObjectContextRef head,
+ hobject_t oid,
+ with_obc_func_t&& func)
+ {
+ LOG_PREFIX(ObjectContextLoader::with_clone_obc_only);
+ auto coid = resolve_oid(head->get_head_ss(), oid);
+ if (!coid) {
+ ERRORDPP("clone {} not found", dpp, oid);
+ return load_obc_iertr::future<>{
+ crimson::ct_error::enoent::make()
+ };
+ }
+ auto [clone, existed] = obc_registry.get_cached_obc(*coid);
+ return clone->template with_lock<State, IOInterruptCondition>(
+ [existed=existed, clone=std::move(clone),
+ func=std::move(func), head=std::move(head), this]()
+ -> load_obc_iertr::future<> {
+ auto loaded = get_or_load_obc<State>(clone, existed);
+ return loaded.safe_then_interruptible(
+ [func = std::move(func)](auto clone) {
+ return std::move(func)(std::move(clone));
+ });
+ });
+ }
+
+ template<RWState::State State>
+ ObjectContextLoader::load_obc_iertr::future<>
+ ObjectContextLoader::with_clone_obc_direct(
+ hobject_t oid,
+ with_both_obc_func_t&& func)
+ {
+ LOG_PREFIX(ObjectContextLoader::with_clone_obc_direct);
+ assert(!oid.is_head());
+ return with_obc<RWState::RWREAD>(
+ oid.get_head(),
+ [FNAME, oid, func=std::move(func), this](auto head) mutable
+ -> load_obc_iertr::future<> {
+ if (!head->obs.exists) {
+ ERRORDPP("head doesn't exist for object {}", dpp, head->obs.oi.soid);
+ return load_obc_iertr::future<>{
+ crimson::ct_error::enoent::make()
+ };
+ }
+#ifndef NDEBUG
+ auto &ss = head->get_head_ss();
+ auto cit = std::find(
+ std::begin(ss.clones), std::end(ss.clones), oid.snap);
+ assert(cit != std::end(ss.clones));
+#endif
+ auto [clone, existed] = obc_registry.get_cached_obc(oid);
+ return clone->template with_lock<State, IOInterruptCondition>(
+ [existed=existed, clone=std::move(clone),
+ func=std::move(func), head=std::move(head), this]()
+ -> load_obc_iertr::future<> {
+ auto loaded = get_or_load_obc<State>(clone, existed);
+ return loaded.safe_then_interruptible(
+ [func = std::move(func), head=std::move(head)](auto clone) {
+ return std::move(func)(std::move(head), std::move(clone));
+ });
+ });
+ });
+ }
+
+ template<RWState::State State>
+ ObjectContextLoader::load_obc_iertr::future<>
+ ObjectContextLoader::with_obc(hobject_t oid,
+ with_obc_func_t&& func)
+ {
+ if (oid.is_head()) {
+ auto [obc, existed] =
+ obc_registry.get_cached_obc(std::move(oid));
+ return with_head_obc<State>(std::move(obc),
+ existed,
+ std::move(func));
+ } else {
+ return with_clone_obc<State>(oid, std::move(func));
+ }
+ }
+
+ ObjectContextLoader::load_obc_iertr::future<ObjectContextRef>
+ ObjectContextLoader::load_obc(ObjectContextRef obc)
+ {
+ LOG_PREFIX(ObjectContextLoader::load_obc);
+ return backend.load_metadata(obc->get_oid())
+ .safe_then_interruptible(
+ [FNAME, this, obc=std::move(obc)](auto md)
+ -> load_obc_ertr::future<ObjectContextRef> {
+ const hobject_t& oid = md->os.oi.soid;
+ DEBUGDPP("loaded obs {} for {}", dpp, md->os.oi, oid);
+ if (oid.is_head()) {
+ if (!md->ssc) {
+ ERRORDPP("oid {} missing snapsetcontext", dpp, oid);
+ return crimson::ct_error::object_corrupted::make();
+ }
+ obc->set_head_state(std::move(md->os),
+ std::move(md->ssc));
+ } else {
+ obc->set_clone_state(std::move(md->os));
+ }
+ DEBUGDPP("returning obc {} for {}", dpp, obc->obs.oi, obc->obs.oi.soid);
+ return load_obc_ertr::make_ready_future<ObjectContextRef>(obc);
+ });
+ }
+
+ template<RWState::State State>
+ ObjectContextLoader::load_obc_iertr::future<ObjectContextRef>
+ ObjectContextLoader::get_or_load_obc(ObjectContextRef obc,
+ bool existed)
+ {
+ LOG_PREFIX(ObjectContextLoader::get_or_load_obc);
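+ // a cache hit means the obc already holds valid object state; on a miss
+ // the held lock is promoted to exclusive while load_obc() fills it from
+ // the backend.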
+ auto loaded =
+ load_obc_iertr::make_ready_future<ObjectContextRef>(obc);
+ if (existed) {
+ DEBUGDPP("cache hit on {}", dpp, obc->get_oid());
+ } else {
+ DEBUGDPP("cache miss on {}", dpp, obc->get_oid());
+ loaded =
+ obc->template with_promoted_lock<State, IOInterruptCondition>(
+ [obc, this] {
+ return load_obc(obc);
+ });
+ }
+ return loaded;
+ }
+
+ ObjectContextLoader::load_obc_iertr::future<>
+ ObjectContextLoader::reload_obc(ObjectContext& obc) const
+ {
+ LOG_PREFIX(ObjectContextLoader::reload_obc);
+ assert(obc.is_head());
+ return backend.load_metadata(obc.get_oid())
+ .safe_then_interruptible<false>(
+ [FNAME, this, &obc](auto md)-> load_obc_ertr::future<> {
+ DEBUGDPP("reloaded obs {} for {}", dpp, md->os.oi, obc.get_oid());
+ if (!md->ssc) {
+ ERRORDPP("oid {} missing snapsetcontext", dpp, obc.get_oid());
+ return crimson::ct_error::object_corrupted::make();
+ }
+ obc.set_head_state(std::move(md->os), std::move(md->ssc));
+ return load_obc_ertr::now();
+ });
+ }
+
+ void ObjectContextLoader::notify_on_change(bool is_primary)
+ {
+ LOG_PREFIX(ObjectContextLoader::notify_on_change);
+ DEBUGDPP("is_primary: {}", dpp, is_primary);
+ for (auto& obc : obc_set_accessing) {
+ DEBUGDPP("interrupting obc: {}", dpp, obc.get_oid());
+ obc.interrupt(::crimson::common::actingset_changed(is_primary));
+ }
+ }
+
+ // explicitly instantiate the used instantiations
+ template ObjectContextLoader::load_obc_iertr::future<>
+ ObjectContextLoader::with_obc<RWState::RWNONE>(hobject_t,
+ with_obc_func_t&&);
+
+ template ObjectContextLoader::load_obc_iertr::future<>
+ ObjectContextLoader::with_obc<RWState::RWREAD>(hobject_t,
+ with_obc_func_t&&);
+
+ template ObjectContextLoader::load_obc_iertr::future<>
+ ObjectContextLoader::with_obc<RWState::RWWRITE>(hobject_t,
+ with_obc_func_t&&);
+
+ template ObjectContextLoader::load_obc_iertr::future<>
+ ObjectContextLoader::with_obc<RWState::RWEXCL>(hobject_t,
+ with_obc_func_t&&);
+
+ template ObjectContextLoader::load_obc_iertr::future<>
+ ObjectContextLoader::with_clone_obc_direct<RWState::RWWRITE>(
+ hobject_t,
+ with_both_obc_func_t&&);
+}
diff --git a/src/crimson/osd/object_context_loader.h b/src/crimson/osd/object_context_loader.h
new file mode 100644
index 000000000..3ab7f6ad8
--- /dev/null
+++ b/src/crimson/osd/object_context_loader.h
@@ -0,0 +1,87 @@
+#pragma once
+
+#include <seastar/core/future.hh>
+#include "crimson/common/errorator.h"
+#include "crimson/osd/object_context.h"
+#include "crimson/osd/pg_backend.h"
+
+namespace crimson::osd {
+class ObjectContextLoader {
+public:
+ using obc_accessing_list_t = boost::intrusive::list<
+ ObjectContext,
+ ObjectContext::obc_accessing_option_t>;
+
+ ObjectContextLoader(
+ ObjectContextRegistry& _obc_services,
+ PGBackend& _backend,
+ DoutPrefixProvider& dpp)
+ : obc_registry{_obc_services},
+ backend{_backend},
+ dpp{dpp}
+ {}
+
+ using load_obc_ertr = crimson::errorator<
+ crimson::ct_error::enoent,
+ crimson::ct_error::object_corrupted>;
+ using load_obc_iertr =
+ ::crimson::interruptible::interruptible_errorator<
+ ::crimson::osd::IOInterruptCondition,
+ load_obc_ertr>;
+
+ using with_obc_func_t =
+ std::function<load_obc_iertr::future<> (ObjectContextRef)>;
+
+ using with_both_obc_func_t =
+ std::function<load_obc_iertr::future<> (ObjectContextRef, ObjectContextRef)>;
+
+ // Use this variant by default
+ template<RWState::State State>
+ load_obc_iertr::future<> with_obc(hobject_t oid,
+ with_obc_func_t&& func);
+
+ // Use this variant in the case where the head object
+ // obc is already locked and only the clone obc is needed.
+ // Avoid nesting with_head_obc() calls by using with_clone_obc()
+ // with an already locked head.
+ template<RWState::State State>
+ load_obc_iertr::future<> with_clone_obc_only(ObjectContextRef head,
+ hobject_t oid,
+ with_obc_func_t&& func);
+
+ // Use this variant in the case where both the head
+ // object *and* the matching clone object are being used
+ // in func.
+ template<RWState::State State>
+ load_obc_iertr::future<> with_clone_obc_direct(
+ hobject_t oid,
+ with_both_obc_func_t&& func);
+
+ load_obc_iertr::future<> reload_obc(ObjectContext& obc) const;
+
+ void notify_on_change(bool is_primary);
+
+private:
+ ObjectContextRegistry& obc_registry;
+ PGBackend& backend;
+ DoutPrefixProvider& dpp;
+ obc_accessing_list_t obc_set_accessing;
+
+ template<RWState::State State>
+ load_obc_iertr::future<> with_clone_obc(hobject_t oid,
+ with_obc_func_t&& func);
+
+ template<RWState::State State>
+ load_obc_iertr::future<> with_head_obc(ObjectContextRef obc,
+ bool existed,
+ with_obc_func_t&& func);
+
+ template<RWState::State State>
+ load_obc_iertr::future<ObjectContextRef>
+ get_or_load_obc(ObjectContextRef obc,
+ bool existed);
+
+ load_obc_iertr::future<ObjectContextRef>
+ load_obc(ObjectContextRef obc);
+};
+}
diff --git a/src/crimson/osd/ops_executer.cc b/src/crimson/osd/ops_executer.cc
new file mode 100644
index 000000000..040870203
--- /dev/null
+++ b/src/crimson/osd/ops_executer.cc
@@ -0,0 +1,1461 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "ops_executer.h"
+
+#include <boost/range/adaptor/filtered.hpp>
+#include <boost/range/adaptor/map.hpp>
+#include <boost/range/adaptor/transformed.hpp>
+#include <boost/range/algorithm_ext/push_back.hpp>
+#include <boost/range/algorithm/max_element.hpp>
+#include <boost/range/numeric.hpp>
+
+#include <fmt/format.h>
+#include <fmt/ostream.h>
+
+#include <seastar/core/thread.hh>
+
+#include "crimson/osd/exceptions.h"
+#include "crimson/osd/pg.h"
+#include "crimson/osd/watch.h"
+#include "osd/ClassHandler.h"
+#include "osd/SnapMapper.h"
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_osd);
+ }
+}
+
+namespace crimson::osd {
+
+OpsExecuter::call_ierrorator::future<> OpsExecuter::do_op_call(OSDOp& osd_op)
+{
+ std::string cname, mname;
+ ceph::bufferlist indata;
+ try {
+ auto bp = std::begin(osd_op.indata);
+ bp.copy(osd_op.op.cls.class_len, cname);
+ bp.copy(osd_op.op.cls.method_len, mname);
+ bp.copy(osd_op.op.cls.indata_len, indata);
+ } catch (buffer::error&) {
+ logger().warn("call unable to decode class + method + indata");
+ return crimson::ct_error::invarg::make();
+ }
+
+ // NOTE: opening a class can actually result in dlopen(), and thus
+ // blocking the entire reactor. Thankfully to ClassHandler's cache
+ // this is supposed to be extremely infrequent.
+ ClassHandler::ClassData* cls;
+ int r = ClassHandler::get_instance().open_class(cname, &cls);
+ if (r) {
+ logger().warn("class {} open got {}", cname, cpp_strerror(r));
+ if (r == -ENOENT) {
+ return crimson::ct_error::operation_not_supported::make();
+ } else if (r == -EPERM) {
+ // propagate permission errors
+ return crimson::ct_error::permission_denied::make();
+ }
+ return crimson::ct_error::input_output_error::make();
+ }
+
+ ClassHandler::ClassMethod* method = cls->get_method(mname);
+ if (!method) {
+ logger().warn("call method {}.{} does not exist", cname, mname);
+ return crimson::ct_error::operation_not_supported::make();
+ }
+
+ const auto flags = method->get_flags();
+ if (!obc->obs.exists && (flags & CLS_METHOD_WR) == 0) {
+ return crimson::ct_error::enoent::make();
+ }
+
+#if 0
+ if (flags & CLS_METHOD_WR) {
+ ctx->user_modify = true;
+ }
+#endif
+
+ logger().debug("calling method {}.{}, num_read={}, num_write={}",
+ cname, mname, num_read, num_write);
+ const auto prev_rd = num_read;
+ const auto prev_wr = num_write;
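+ // class methods are synchronous and may block, so execute them inside a
+ // seastar thread (interruptor::async) rather than directly on the reactor.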
+ return interruptor::async(
+ [this, method, indata=std::move(indata)]() mutable {
+ ceph::bufferlist outdata;
+ auto cls_context = reinterpret_cast<cls_method_context_t>(this);
+ const auto ret = method->exec(cls_context, indata, outdata);
+ return std::make_pair(ret, std::move(outdata));
+ }
+ ).then_interruptible(
+ [this, prev_rd, prev_wr, &osd_op, flags]
+ (auto outcome) -> call_errorator::future<> {
+ auto& [ret, outdata] = outcome;
+ osd_op.rval = ret;
+
+ logger().debug("do_op_call: method returned ret={}, outdata.length()={}"
+ " while num_read={}, num_write={}",
+ ret, outdata.length(), num_read, num_write);
+ if (num_read > prev_rd && !(flags & CLS_METHOD_RD)) {
+ logger().error("method tried to read object but is not marked RD");
+ osd_op.rval = -EIO;
+ return crimson::ct_error::input_output_error::make();
+ }
+ if (num_write > prev_wr && !(flags & CLS_METHOD_WR)) {
+ logger().error("method tried to update object but is not marked WR");
+ osd_op.rval = -EIO;
+ return crimson::ct_error::input_output_error::make();
+ }
+ // ceph-osd has this implemented in `PrimaryLogPG::execute_ctx`,
+ // grep for `ignore_out_data`.
+ using crimson::common::local_conf;
+ if (op_info.allows_returnvec() &&
+ op_info.may_write() &&
+ ret >= 0 &&
+ outdata.length() > local_conf()->osd_max_write_op_reply_len) {
+ // the justification for this limit is to avoid inflating the pg log;
+ // that is also why we don't worry about pure reads.
+ logger().error("outdata overflow due to .length()={}, limit={}",
+ outdata.length(),
+ local_conf()->osd_max_write_op_reply_len);
+ osd_op.rval = -EOVERFLOW;
+ return crimson::ct_error::value_too_large::make();
+ }
+ // for write calls we never return data except on errors or RETURNVEC.
+ // please refer to cls/cls_hello.cc for details.
+ if (!op_info.may_write() || op_info.allows_returnvec() || ret < 0) {
+ osd_op.op.extent.length = outdata.length();
+ osd_op.outdata.claim_append(outdata);
+ }
+ if (ret < 0) {
+ return crimson::stateful_ec{
+ std::error_code(-ret, std::generic_category()) };
+ } else {
+ return seastar::now();
+ }
+ }
+ );
+}
+
+static watch_info_t create_watch_info(const OSDOp& osd_op,
+ const OpsExecuter::ExecutableMessage& msg,
+ entity_addr_t peer_addr)
+{
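+ // a zero timeout in the request means "use the cluster-wide default"
+ // (osd_client_watch_timeout).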
+ using crimson::common::local_conf;
+ const uint32_t timeout =
+ osd_op.op.watch.timeout == 0 ? local_conf()->osd_client_watch_timeout
+ : osd_op.op.watch.timeout;
+ return {
+ osd_op.op.watch.cookie,
+ timeout,
+ peer_addr
+ };
+}
+
+OpsExecuter::watch_ierrorator::future<> OpsExecuter::do_op_watch_subop_watch(
+ OSDOp& osd_op,
+ ObjectState& os,
+ ceph::os::Transaction& txn)
+{
+ logger().debug("{}", __func__);
+ struct connect_ctx_t {
+ ObjectContext::watch_key_t key;
+ crimson::net::ConnectionRef conn;
+ watch_info_t info;
+
+ connect_ctx_t(
+ const OSDOp& osd_op,
+ const ExecutableMessage& msg,
+ crimson::net::ConnectionRef conn)
+ : key(osd_op.op.watch.cookie, msg.get_reqid().name),
+ conn(conn),
+ info(create_watch_info(osd_op, msg, conn->get_peer_addr())) {
+ }
+ };
+
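+ // with_effect_on_obc runs the first lambda right away to record the watch
+ // in the object's metadata (and the transaction), while the second lambda
+ // is deferred as an op effect that attaches the in-memory Watch to the
+ // client connection.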
+ return with_effect_on_obc(
+ connect_ctx_t{ osd_op, get_message(), conn },
+ [&](auto& ctx) {
+ const auto& entity = ctx.key.second;
+ auto [it, emplaced] =
+ os.oi.watchers.try_emplace(ctx.key, std::move(ctx.info));
+ if (emplaced) {
+ logger().info("registered new watch {} by {}", it->second, entity);
+ txn.nop();
+ } else {
+ logger().info("found existing watch {} by {}", it->second, entity);
+ }
+ return seastar::now();
+ },
+ [](auto&& ctx, ObjectContextRef obc, Ref<PG> pg) {
+ assert(pg);
+ auto [it, emplaced] = obc->watchers.try_emplace(ctx.key, nullptr);
+ if (emplaced) {
+ const auto& [cookie, entity] = ctx.key;
+ it->second = crimson::osd::Watch::create(
+ obc, ctx.info, entity, std::move(pg));
+ logger().info("op_effect: added new watcher: {}", ctx.key);
+ } else {
+ logger().info("op_effect: found existing watcher: {}", ctx.key);
+ }
+ return it->second->connect(std::move(ctx.conn), true /* will_ping */);
+ }
+ );
+}
+
+OpsExecuter::watch_ierrorator::future<> OpsExecuter::do_op_watch_subop_reconnect(
+ OSDOp& osd_op,
+ ObjectState& os,
+ ceph::os::Transaction& txn)
+{
+ const entity_name_t& entity = get_message().get_reqid().name;
+ const auto& cookie = osd_op.op.watch.cookie;
+ if (!os.oi.watchers.count(std::make_pair(cookie, entity))) {
+ return crimson::ct_error::not_connected::make();
+ } else {
+ logger().info("found existing watch by {}", entity);
+ return do_op_watch_subop_watch(osd_op, os, txn);
+ }
+}
+
+OpsExecuter::watch_ierrorator::future<> OpsExecuter::do_op_watch_subop_unwatch(
+ OSDOp& osd_op,
+ ObjectState& os,
+ ceph::os::Transaction& txn)
+{
+ logger().info("{}", __func__);
+
+ struct disconnect_ctx_t {
+ ObjectContext::watch_key_t key;
+ disconnect_ctx_t(const OSDOp& osd_op, const ExecutableMessage& msg)
+ : key(osd_op.op.watch.cookie, msg.get_reqid().name) {
+ }
+ };
+ return with_effect_on_obc(disconnect_ctx_t{ osd_op, get_message() },
+ [&] (auto& ctx) {
+ const auto& entity = ctx.key.second;
+ if (auto nh = os.oi.watchers.extract(ctx.key); !nh.empty()) {
+ logger().info("removed watch {} by {}", nh.mapped(), entity);
+ txn.nop();
+ } else {
+ logger().info("can't remove: no watch by {}", entity);
+ }
+ return seastar::now();
+ },
+ [] (auto&& ctx, ObjectContextRef obc, Ref<PG>) {
+ if (auto nh = obc->watchers.extract(ctx.key); !nh.empty()) {
+ return seastar::do_with(std::move(nh.mapped()),
+ [ctx](auto&& watcher) {
+ logger().info("op_effect: disconnect watcher {}", ctx.key);
+ return watcher->remove();
+ });
+ } else {
+ logger().info("op_effect: disconnect failed to find watcher {}", ctx.key);
+ return seastar::now();
+ }
+ });
+}
+
+OpsExecuter::watch_ierrorator::future<> OpsExecuter::do_op_watch_subop_ping(
+ OSDOp& osd_op,
+ ObjectState& os,
+ ceph::os::Transaction& txn)
+{
+ const entity_name_t& entity = get_message().get_reqid().name;
+ const auto& cookie = osd_op.op.watch.cookie;
+ const auto key = std::make_pair(cookie, entity);
+
+ // Note: WATCH with PING doesn't cause may_write() to return true,
+ // so if there is nothing else in the transaction, this is going
+ // to run do_osd_op_effects, but not write out a log entry
+ if (!os.oi.watchers.count(key)) {
+ return crimson::ct_error::not_connected::make();
+ }
+ auto it = obc->watchers.find(key);
+ if (it == std::end(obc->watchers) || !it->second->is_connected()) {
+ return crimson::ct_error::timed_out::make();
+ }
+ logger().info("found existing watch by {}", entity);
+ it->second->got_ping(ceph_clock_now());
+ return seastar::now();
+}
+
+OpsExecuter::watch_ierrorator::future<> OpsExecuter::do_op_watch(
+ OSDOp& osd_op,
+ ObjectState& os,
+ ceph::os::Transaction& txn)
+{
+ logger().debug("{}", __func__);
+ if (!os.exists) {
+ return crimson::ct_error::enoent::make();
+ }
+ switch (osd_op.op.watch.op) {
+ case CEPH_OSD_WATCH_OP_WATCH:
+ return do_op_watch_subop_watch(osd_op, os, txn);
+ case CEPH_OSD_WATCH_OP_RECONNECT:
+ return do_op_watch_subop_reconnect(osd_op, os, txn);
+ case CEPH_OSD_WATCH_OP_PING:
+ return do_op_watch_subop_ping(osd_op, os, txn);
+ case CEPH_OSD_WATCH_OP_UNWATCH:
+ return do_op_watch_subop_unwatch(osd_op, os, txn);
+ case CEPH_OSD_WATCH_OP_LEGACY_WATCH:
+ logger().warn("ignoring CEPH_OSD_WATCH_OP_LEGACY_WATCH");
+ return crimson::ct_error::invarg::make();
+ }
+ logger().warn("unrecognized WATCH subop: {}", osd_op.op.watch.op);
+ return crimson::ct_error::invarg::make();
+}
+
+static uint64_t get_next_notify_id(epoch_t e)
+{
+ // FIXME
+ static std::uint64_t next_notify_id = 0;
+ return (((uint64_t)e) << 32) | ((uint64_t)(next_notify_id++));
+}
+
+OpsExecuter::watch_ierrorator::future<> OpsExecuter::do_op_notify(
+ OSDOp& osd_op,
+ const ObjectState& os)
+{
+ logger().debug("{}, msg epoch: {}", __func__, get_message().get_map_epoch());
+
+ if (!os.exists) {
+ return crimson::ct_error::enoent::make();
+ }
+ struct notify_ctx_t {
+ crimson::net::ConnectionRef conn;
+ notify_info_t ninfo;
+ const uint64_t client_gid;
+ const epoch_t epoch;
+
+ notify_ctx_t(const ExecutableMessage& msg,
+ crimson::net::ConnectionRef conn)
+ : conn(conn),
+ client_gid(msg.get_reqid().name.num()),
+ epoch(msg.get_map_epoch()) {
+ }
+ };
+ return with_effect_on_obc(
+ notify_ctx_t{ get_message(), conn },
+ [&](auto& ctx) {
+ try {
+ auto bp = osd_op.indata.cbegin();
+ uint32_t ver; // obsolete
+ ceph::decode(ver, bp);
+ ceph::decode(ctx.ninfo.timeout, bp);
+ ceph::decode(ctx.ninfo.bl, bp);
+ } catch (const buffer::error&) {
+ ctx.ninfo.timeout = 0;
+ }
+ if (!ctx.ninfo.timeout) {
+ using crimson::common::local_conf;
+ ctx.ninfo.timeout = local_conf()->osd_default_notify_timeout;
+ }
+ ctx.ninfo.notify_id = get_next_notify_id(ctx.epoch);
+ ctx.ninfo.cookie = osd_op.op.notify.cookie;
+ // return our unique notify id to the client
+ ceph::encode(ctx.ninfo.notify_id, osd_op.outdata);
+ return seastar::now();
+ },
+ [](auto&& ctx, ObjectContextRef obc, Ref<PG>) {
+ auto alive_watchers = obc->watchers | boost::adaptors::map_values
+ | boost::adaptors::filtered(
+ [] (const auto& w) {
+ // FIXME: filter as for the `is_ping` in `Watch::start_notify`
+ return w->is_alive();
+ });
+ return crimson::osd::Notify::create_n_propagate(
+ std::begin(alive_watchers),
+ std::end(alive_watchers),
+ std::move(ctx.conn),
+ ctx.ninfo,
+ ctx.client_gid,
+ obc->obs.oi.user_version);
+ }
+ );
+}
+
+OpsExecuter::watch_ierrorator::future<> OpsExecuter::do_op_list_watchers(
+ OSDOp& osd_op,
+ const ObjectState& os)
+{
+ logger().debug("{}", __func__);
+
+ obj_list_watch_response_t response;
+ for (const auto& [key, info] : os.oi.watchers) {
+ logger().debug("{}: key cookie={}, entity={}",
+ __func__, key.first, key.second);
+ assert(key.first == info.cookie);
+ assert(key.second.is_client());
+ response.entries.emplace_back(watch_item_t{
+ key.second, info.cookie, info.timeout_seconds, info.addr});
+ }
+ response.encode(osd_op.outdata, get_message().get_features());
+ return watch_ierrorator::now();
+}
+
+OpsExecuter::watch_ierrorator::future<> OpsExecuter::do_op_notify_ack(
+ OSDOp& osd_op,
+ const ObjectState& os)
+{
+ logger().debug("{}", __func__);
+
+ struct notifyack_ctx_t {
+ const entity_name_t entity;
+ uint64_t watch_cookie;
+ uint64_t notify_id;
+ ceph::bufferlist reply_bl;
+
+ notifyack_ctx_t(const ExecutableMessage& msg)
+ : entity(msg.get_reqid().name) {
+ }
+ };
+ return with_effect_on_obc(notifyack_ctx_t{ get_message() },
+ [&] (auto& ctx) -> watch_errorator::future<> {
+ try {
+ auto bp = osd_op.indata.cbegin();
+ ceph::decode(ctx.notify_id, bp);
+ ceph::decode(ctx.watch_cookie, bp);
+ if (!bp.end()) {
+ ceph::decode(ctx.reply_bl, bp);
+ }
+ } catch (const buffer::error&) {
+ // here we behave differently than ceph-osd. For historical reasons,
+ // it falls back to using `osd_op.op.watch.cookie` as `ctx.notify_id`.
+ // crimson just returns EINVAL if the data cannot be decoded.
+ return crimson::ct_error::invarg::make();
+ }
+ return watch_errorator::now();
+ },
+ [] (auto&& ctx, ObjectContextRef obc, Ref<PG>) {
+ logger().info("notify_ack watch_cookie={}, notify_id={}",
+ ctx.watch_cookie, ctx.notify_id);
+ return seastar::do_for_each(obc->watchers,
+ [ctx=std::move(ctx)] (auto& kv) {
+ const auto& [key, watchp] = kv;
+ static_assert(
+ std::is_same_v<std::decay_t<decltype(watchp)>,
+ seastar::shared_ptr<crimson::osd::Watch>>);
+ auto& [cookie, entity] = key;
+ if (ctx.entity != entity) {
+ logger().debug("skipping watch {}; entity name {} != {}",
+ key, entity, ctx.entity);
+ return seastar::now();
+ }
+ if (ctx.watch_cookie != cookie) {
+ logger().debug("skipping watch {}; cookie {} != {}",
+ key, ctx.watch_cookie, cookie);
+ return seastar::now();
+ }
+ logger().info("acking notify on watch {}", key);
+ return watchp->notify_ack(ctx.notify_id, ctx.reply_bl);
+ });
+ });
+}
+
+// Defined here because there is a circular dependency between OpsExecuter and PG
+template <class Func>
+auto OpsExecuter::do_const_op(Func&& f) {
+ // TODO: pass backend as read-only
+ return std::forward<Func>(f)(pg->get_backend(), std::as_const(obc->obs));
+}
+
+// Defined here because there is a circular dependency between OpsExecuter and PG
+template <class Func>
+auto OpsExecuter::do_write_op(Func&& f, OpsExecuter::modified_by m) {
+ ++num_write;
+ if (!osd_op_params) {
+ osd_op_params.emplace();
+ fill_op_params_bump_pg_version();
+ }
+ user_modify = (m == modified_by::user);
+ return std::forward<Func>(f)(pg->get_backend(), obc->obs, txn);
+}
+OpsExecuter::call_errorator::future<> OpsExecuter::do_assert_ver(
+ OSDOp& osd_op,
+ const ObjectState& os)
+{
+ if (!osd_op.op.assert_ver.ver) {
+ return crimson::ct_error::invarg::make();
+ } else if (osd_op.op.assert_ver.ver < os.oi.user_version) {
+ return crimson::ct_error::erange::make();
+ } else if (osd_op.op.assert_ver.ver > os.oi.user_version) {
+ return crimson::ct_error::value_too_large::make();
+ }
+ return seastar::now();
+}
+
+OpsExecuter::list_snaps_iertr::future<> OpsExecuter::do_list_snaps(
+ OSDOp& osd_op,
+ const ObjectState& os,
+ const SnapSet& ss)
+{
+ obj_list_snap_response_t resp;
+ resp.clones.reserve(ss.clones.size() + 1);
+ for (auto &clone: ss.clones) {
+ clone_info ci;
+ ci.cloneid = clone;
+
+ {
+ auto p = ss.clone_snaps.find(clone);
+ if (p == ss.clone_snaps.end()) {
+ logger().error(
+ "OpsExecutor::do_list_snaps: {} has inconsistent "
+ "clone_snaps, missing clone {}",
+ os.oi.soid,
+ clone);
+ return crimson::ct_error::invarg::make();
+ }
+ ci.snaps.reserve(p->second.size());
+ ci.snaps.insert(ci.snaps.end(), p->second.rbegin(), p->second.rend());
+ }
+
+ {
+ auto p = ss.clone_overlap.find(clone);
+ if (p == ss.clone_overlap.end()) {
+ logger().error(
+ "OpsExecutor::do_list_snaps: {} has inconsistent "
+ "clone_overlap, missing clone {}",
+ os.oi.soid,
+ clone);
+ return crimson::ct_error::invarg::make();
+ }
+ ci.overlap.reserve(p->second.num_intervals());
+ ci.overlap.insert(ci.overlap.end(), p->second.begin(), p->second.end());
+ }
+
+ {
+ auto p = ss.clone_size.find(clone);
+ if (p == ss.clone_size.end()) {
+ logger().error(
+ "OpsExecutor::do_list_snaps: {} has inconsistent "
+ "clone_size, missing clone {}",
+ os.oi.soid,
+ clone);
+ return crimson::ct_error::invarg::make();
+ }
+ ci.size = p->second;
+ }
+ resp.clones.push_back(std::move(ci));
+ }
+
+ if (!os.oi.is_whiteout()) {
+ clone_info ci;
+ ci.cloneid = CEPH_NOSNAP;
+ ci.size = os.oi.size;
+ resp.clones.push_back(std::move(ci));
+ }
+ resp.seq = ss.seq;
+ logger().error(
+ "OpsExecutor::do_list_snaps: {}, resp.clones.size(): {}",
+ os.oi.soid,
+ resp.clones.size());
+ resp.encode(osd_op.outdata);
+ return read_ierrorator::now();
+}
+
+OpsExecuter::interruptible_errorated_future<OpsExecuter::osd_op_errorator>
+OpsExecuter::execute_op(OSDOp& osd_op)
+{
+ return do_execute_op(osd_op).handle_error_interruptible(
+ osd_op_errorator::all_same_way([&osd_op](auto e, auto&& e_raw)
+ -> OpsExecuter::osd_op_errorator::future<> {
+ // All ops except for CMPEXT should have rval set to -e.value(),
+ // CMPEXT sets rval itself and shouldn't be overridden.
+ if (e.value() != ct_error::cmp_fail_error_value) {
+ osd_op.rval = -e.value();
+ }
+ if ((osd_op.op.flags & CEPH_OSD_OP_FLAG_FAILOK) &&
+ e.value() != EAGAIN && e.value() != EINPROGRESS) {
+ return osd_op_errorator::now();
+ } else {
+ return std::move(e_raw);
+ }
+ }));
+}
+
+OpsExecuter::interruptible_errorated_future<OpsExecuter::osd_op_errorator>
+OpsExecuter::do_execute_op(OSDOp& osd_op)
+{
+ // TODO: dispatch via call table?
+ // TODO: we might want to find a way to unify both input and output
+ // of each op.
+ logger().debug(
+ "handling op {} on object {}",
+ ceph_osd_op_name(osd_op.op.op),
+ get_target());
+ switch (const ceph_osd_op& op = osd_op.op; op.op) {
+ case CEPH_OSD_OP_SYNC_READ:
+ [[fallthrough]];
+ case CEPH_OSD_OP_READ:
+ return do_read_op([this, &osd_op](auto& backend, const auto& os) {
+ return backend.read(os, osd_op, delta_stats);
+ });
+ case CEPH_OSD_OP_SPARSE_READ:
+ return do_read_op([this, &osd_op](auto& backend, const auto& os) {
+ return backend.sparse_read(os, osd_op, delta_stats);
+ });
+ case CEPH_OSD_OP_CHECKSUM:
+ return do_read_op([&osd_op](auto& backend, const auto& os) {
+ return backend.checksum(os, osd_op);
+ });
+ case CEPH_OSD_OP_CMPEXT:
+ return do_read_op([&osd_op](auto& backend, const auto& os) {
+ return backend.cmp_ext(os, osd_op);
+ });
+ case CEPH_OSD_OP_GETXATTR:
+ return do_read_op([this, &osd_op](auto& backend, const auto& os) {
+ return backend.getxattr(os, osd_op, delta_stats);
+ });
+ case CEPH_OSD_OP_GETXATTRS:
+ return do_read_op([this, &osd_op](auto& backend, const auto& os) {
+ return backend.get_xattrs(os, osd_op, delta_stats);
+ });
+ case CEPH_OSD_OP_CMPXATTR:
+ return do_read_op([this, &osd_op](auto& backend, const auto& os) {
+ return backend.cmp_xattr(os, osd_op, delta_stats);
+ });
+ case CEPH_OSD_OP_RMXATTR:
+ return do_write_op([&osd_op](auto& backend, auto& os, auto& txn) {
+ return backend.rm_xattr(os, osd_op, txn);
+ });
+ case CEPH_OSD_OP_CREATE:
+ return do_write_op([this, &osd_op](auto& backend, auto& os, auto& txn) {
+ return backend.create(os, osd_op, txn, delta_stats);
+ });
+ case CEPH_OSD_OP_WRITE:
+ return do_write_op([this, &osd_op](auto& backend, auto& os, auto& txn) {
+ return backend.write(os, osd_op, txn, *osd_op_params, delta_stats);
+ });
+ case CEPH_OSD_OP_WRITESAME:
+ return do_write_op([this, &osd_op](auto& backend, auto& os, auto& txn) {
+ return backend.write_same(os, osd_op, txn, *osd_op_params, delta_stats);
+ });
+ case CEPH_OSD_OP_WRITEFULL:
+ return do_write_op([this, &osd_op](auto& backend, auto& os, auto& txn) {
+ return backend.writefull(os, osd_op, txn, *osd_op_params, delta_stats);
+ });
+ case CEPH_OSD_OP_ROLLBACK:
+ return do_write_op([this, &head=obc,
+ &osd_op](auto& backend, auto& os, auto& txn) {
+ return backend.rollback(os, osd_op, txn, *osd_op_params, delta_stats,
+ head, pg->obc_loader);
+ });
+ case CEPH_OSD_OP_APPEND:
+ return do_write_op([this, &osd_op](auto& backend, auto& os, auto& txn) {
+ return backend.append(os, osd_op, txn, *osd_op_params, delta_stats);
+ });
+ case CEPH_OSD_OP_TRUNCATE:
+ return do_write_op([this, &osd_op](auto& backend, auto& os, auto& txn) {
+ // FIXME: rework needed. Move this out to do_write_op(), introduce
+ // do_write_op_no_user_modify()...
+ return backend.truncate(os, osd_op, txn, *osd_op_params, delta_stats);
+ });
+ case CEPH_OSD_OP_ZERO:
+ return do_write_op([this, &osd_op](auto& backend, auto& os, auto& txn) {
+ return backend.zero(os, osd_op, txn, *osd_op_params, delta_stats);
+ });
+ case CEPH_OSD_OP_SETALLOCHINT:
+ return do_write_op([this, &osd_op](auto& backend, auto& os, auto& txn) {
+ return backend.set_allochint(os, osd_op, txn, delta_stats);
+ });
+ case CEPH_OSD_OP_SETXATTR:
+ return do_write_op([this, &osd_op](auto& backend, auto& os, auto& txn) {
+ return backend.setxattr(os, osd_op, txn, delta_stats);
+ });
+ case CEPH_OSD_OP_DELETE:
+ {
+ bool whiteout = false;
+ if (!obc->ssc->snapset.clones.empty() ||
+ (snapc.snaps.size() && // there are snaps
+ snapc.snaps[0] > obc->ssc->snapset.seq)) { // existing obj is old
+ logger().debug("{} has or will have clones, will whiteout {}",
+ __func__, obc->obs.oi.soid);
+ whiteout = true;
+ }
+ return do_write_op([this, whiteout](auto& backend, auto& os, auto& txn) {
+ return backend.remove(os, txn, delta_stats, whiteout);
+ });
+ }
+ case CEPH_OSD_OP_CALL:
+ return this->do_op_call(osd_op);
+ case CEPH_OSD_OP_STAT:
+ // note: stat does not require RD
+ return do_const_op([this, &osd_op] (/* const */auto& backend, const auto& os) {
+ return backend.stat(os, osd_op, delta_stats);
+ });
+
+ case CEPH_OSD_OP_TMAPPUT:
+ return do_write_op([this, &osd_op](auto& backend, auto& os, auto& txn) {
+ return backend.tmapput(os, osd_op, txn, delta_stats, *osd_op_params);
+ });
+ case CEPH_OSD_OP_TMAPUP:
+ return do_write_op([this, &osd_op](auto& backend, auto& os, auto &txn) {
+ return backend.tmapup(os, osd_op, txn, delta_stats, *osd_op_params);
+ });
+ case CEPH_OSD_OP_TMAPGET:
+ return do_read_op([this, &osd_op](auto& backend, const auto& os) {
+ return backend.tmapget(os, osd_op, delta_stats);
+ });
+
+ // OMAP
+ case CEPH_OSD_OP_OMAPGETKEYS:
+ return do_read_op([this, &osd_op](auto& backend, const auto& os) {
+ return backend.omap_get_keys(os, osd_op, delta_stats);
+ });
+ case CEPH_OSD_OP_OMAPGETVALS:
+ return do_read_op([this, &osd_op](auto& backend, const auto& os) {
+ return backend.omap_get_vals(os, osd_op, delta_stats);
+ });
+ case CEPH_OSD_OP_OMAP_CMP:
+ return do_read_op([this, &osd_op](auto& backend, const auto& os) {
+ return backend.omap_cmp(os, osd_op, delta_stats);
+ });
+ case CEPH_OSD_OP_OMAPGETHEADER:
+ return do_read_op([this, &osd_op](auto& backend, const auto& os) {
+ return backend.omap_get_header(os, osd_op, delta_stats);
+ });
+ case CEPH_OSD_OP_OMAPGETVALSBYKEYS:
+ return do_read_op([this, &osd_op](auto& backend, const auto& os) {
+ return backend.omap_get_vals_by_keys(os, osd_op, delta_stats);
+ });
+ case CEPH_OSD_OP_OMAPSETVALS:
+#if 0
+ if (!pg.get_pgpool().info.supports_omap()) {
+ return crimson::ct_error::operation_not_supported::make();
+ }
+#endif
+ return do_write_op([this, &osd_op](auto& backend, auto& os, auto& txn) {
+ return backend.omap_set_vals(os, osd_op, txn, *osd_op_params, delta_stats);
+ });
+ case CEPH_OSD_OP_OMAPSETHEADER:
+#if 0
+ if (!pg.get_pgpool().info.supports_omap()) {
+ return crimson::ct_error::operation_not_supported::make();
+ }
+#endif
+ return do_write_op([this, &osd_op](auto& backend, auto& os, auto& txn) {
+ return backend.omap_set_header(os, osd_op, txn, *osd_op_params,
+ delta_stats);
+ });
+ case CEPH_OSD_OP_OMAPRMKEYRANGE:
+#if 0
+ if (!pg.get_pgpool().info.supports_omap()) {
+ return crimson::ct_error::operation_not_supported::make();
+ }
+#endif
+ return do_write_op([this, &osd_op](auto& backend, auto& os, auto& txn) {
+ return backend.omap_remove_range(os, osd_op, txn, delta_stats);
+ });
+ case CEPH_OSD_OP_OMAPRMKEYS:
+ /** TODO: Implement supports_omap()
+ if (!pg.get_pgpool().info.supports_omap()) {
+ return crimson::ct_error::operation_not_supported::make();
+ }*/
+ return do_write_op([&osd_op](auto& backend, auto& os, auto& txn) {
+ return backend.omap_remove_key(os, osd_op, txn);
+ });
+ case CEPH_OSD_OP_OMAPCLEAR:
+ return do_write_op([this, &osd_op](auto& backend, auto& os, auto& txn) {
+ return backend.omap_clear(os, osd_op, txn, *osd_op_params, delta_stats);
+ });
+
+ // watch/notify
+ case CEPH_OSD_OP_WATCH:
+ return do_write_op([this, &osd_op](auto& backend, auto& os, auto& txn) {
+ return do_op_watch(osd_op, os, txn);
+ }, modified_by::sys);
+ case CEPH_OSD_OP_LIST_WATCHERS:
+ return do_read_op([this, &osd_op](auto&, const auto& os) {
+ return do_op_list_watchers(osd_op, os);
+ });
+ case CEPH_OSD_OP_NOTIFY:
+ return do_read_op([this, &osd_op](auto&, const auto& os) {
+ return do_op_notify(osd_op, os);
+ });
+ case CEPH_OSD_OP_NOTIFY_ACK:
+ return do_read_op([this, &osd_op](auto&, const auto& os) {
+ return do_op_notify_ack(osd_op, os);
+ });
+ case CEPH_OSD_OP_ASSERT_VER:
+ return do_read_op([this, &osd_op](auto&, const auto& os) {
+ return do_assert_ver(osd_op, os);
+ });
+ case CEPH_OSD_OP_LIST_SNAPS:
+ return do_snapset_op([this, &osd_op](const auto &os, const auto &ss) {
+ return do_list_snaps(osd_op, os, ss);
+ });
+
+ default:
+ logger().warn("unknown op {}", ceph_osd_op_name(op.op));
+ throw std::runtime_error(
+ fmt::format("op '{}' not supported", ceph_osd_op_name(op.op)));
+ }
+}
+
+void OpsExecuter::fill_op_params_bump_pg_version()
+{
+ osd_op_params->req_id = msg->get_reqid();
+ osd_op_params->mtime = msg->get_mtime();
+ osd_op_params->at_version = pg->next_version();
+ osd_op_params->pg_trim_to = pg->get_pg_trim_to();
+ osd_op_params->min_last_complete_ondisk = pg->get_min_last_complete_ondisk();
+ osd_op_params->last_complete = pg->get_info().last_complete;
+}
+
+std::vector<pg_log_entry_t> OpsExecuter::prepare_transaction(
+ const std::vector<OSDOp>& ops)
+{
+ // let's ensure we don't need to inform SnapMapper about this particular
+ // entry.
+ assert(obc->obs.oi.soid.snap >= CEPH_MAXSNAP);
+ std::vector<pg_log_entry_t> log_entries;
+ log_entries.emplace_back(
+ obc->obs.exists ?
+ pg_log_entry_t::MODIFY : pg_log_entry_t::DELETE,
+ obc->obs.oi.soid,
+ osd_op_params->at_version,
+ obc->obs.oi.version,
+ osd_op_params->user_modify ? osd_op_params->at_version.version : 0,
+ osd_op_params->req_id,
+ osd_op_params->mtime,
+ op_info.allows_returnvec() && !ops.empty() ? ops.back().rval.code : 0);
+ if (op_info.allows_returnvec()) {
+ // also the per-op values are recorded in the pg log
+ log_entries.back().set_op_returns(ops);
+ logger().debug("{} op_returns: {}",
+ __func__, log_entries.back().op_returns);
+ }
+ log_entries.back().clean_regions = std::move(osd_op_params->clean_regions);
+ return log_entries;
+}
+
+OpsExecuter::interruptible_future<> OpsExecuter::snap_map_remove(
+ const hobject_t& soid,
+ SnapMapper& snap_mapper,
+ OSDriver& osdriver,
+ ceph::os::Transaction& txn)
+{
+ logger().debug("{}: soid {}", __func__, soid);
+ return interruptor::async([soid, &snap_mapper,
+ _t=osdriver.get_transaction(&txn)]() mutable {
+ const auto r = snap_mapper.remove_oid(soid, &_t);
+ if (r) {
+ logger().error("{}: remove_oid {} failed with {}",
+ __func__, soid, r);
+ }
+ // on removal, tolerate a missing key (possible prior corruption)
+ assert(r == 0 || r == -ENOENT);
+ });
+}
+
+OpsExecuter::interruptible_future<> OpsExecuter::snap_map_modify(
+ const hobject_t& soid,
+ const std::set<snapid_t>& snaps,
+ SnapMapper& snap_mapper,
+ OSDriver& osdriver,
+ ceph::os::Transaction& txn)
+{
+ logger().debug("{}: soid {}, snaps {}", __func__, soid, snaps);
+ return interruptor::async([soid, snaps, &snap_mapper,
+ _t=osdriver.get_transaction(&txn)]() mutable {
+ assert(std::size(snaps) > 0);
+ [[maybe_unused]] const auto r = snap_mapper.update_snaps(
+ soid, snaps, 0, &_t);
+ assert(r == 0);
+ });
+}
+
+OpsExecuter::interruptible_future<> OpsExecuter::snap_map_clone(
+ const hobject_t& soid,
+ const std::set<snapid_t>& snaps,
+ SnapMapper& snap_mapper,
+ OSDriver& osdriver,
+ ceph::os::Transaction& txn)
+{
+ logger().debug("{}: soid {}, snaps {}", __func__, soid, snaps);
+ return interruptor::async([soid, snaps, &snap_mapper,
+ _t=osdriver.get_transaction(&txn)]() mutable {
+ assert(std::size(snaps) > 0);
+ snap_mapper.add_oid(soid, snaps, &_t);
+ });
+}
+
+// Defined here because there is a circular dependency between OpsExecuter and PG
+uint32_t OpsExecuter::get_pool_stripe_width() const {
+ return pg->get_pgpool().info.get_stripe_width();
+}
+
+// Defined here because there is a circular dependency between OpsExecuter and PG
+version_t OpsExecuter::get_last_user_version() const
+{
+ return pg->get_last_user_version();
+}
+
+std::unique_ptr<OpsExecuter::CloningContext> OpsExecuter::execute_clone(
+ const SnapContext& snapc,
+ const ObjectState& initial_obs,
+ const SnapSet& initial_snapset,
+ PGBackend& backend,
+ ceph::os::Transaction& txn)
+{
+ const hobject_t& soid = initial_obs.oi.soid;
+ logger().debug("{} {} snapset={} snapc={}",
+ __func__, soid,
+ initial_snapset, snapc);
+
+ auto cloning_ctx = std::make_unique<CloningContext>();
+ cloning_ctx->new_snapset = initial_snapset;
+
+ // clone the object; the clone's snap field is set to the seq of the
+ // SnapContext at the time of its creation.
+ hobject_t coid = soid;
+ coid.snap = snapc.seq;
+
+ // existing snaps are stored in descending order in snapc,
+ // cloned_snaps vector will hold all the snaps stored until snapset.seq
+ const std::vector<snapid_t> cloned_snaps = [&] {
+ auto last = std::find_if(
+ std::begin(snapc.snaps), std::end(snapc.snaps),
+ [&](snapid_t snap_id) { return snap_id <= initial_snapset.seq; });
+ return std::vector<snapid_t>{std::begin(snapc.snaps), last};
+ }();
+
+ auto [snap_oi, clone_obc] = prepare_clone(coid);
+ // make clone
+ backend.clone(snap_oi, initial_obs, clone_obc->obs, txn);
+
+ delta_stats.num_objects++;
+ if (snap_oi.is_omap()) {
+ delta_stats.num_objects_omap++;
+ }
+ delta_stats.num_object_clones++;
+ // the new snapset will later replace the one held by the obc's ssc
+ // (see CloningContext::apply_to)
+ cloning_ctx->new_snapset.clones.push_back(coid.snap);
+ cloning_ctx->new_snapset.clone_size[coid.snap] = initial_obs.oi.size;
+ cloning_ctx->new_snapset.clone_snaps[coid.snap] = cloned_snaps;
+
+ // clone_overlap should contain an entry for each clone
+ // (an empty interval_set if there is no overlap)
+ auto &overlap = cloning_ctx->new_snapset.clone_overlap[coid.snap];
+ if (initial_obs.oi.size) {
+ overlap.insert(0, initial_obs.oi.size);
+ }
+
+ // log clone
+ logger().debug("cloning v {} to {} v {} snaps={} snapset={}",
+ initial_obs.oi.version, coid,
+ osd_op_params->at_version, cloned_snaps, cloning_ctx->new_snapset);
+
+ cloning_ctx->log_entry = {
+ pg_log_entry_t::CLONE,
+ coid,
+ snap_oi.version,
+ initial_obs.oi.version,
+ initial_obs.oi.user_version,
+ osd_reqid_t(),
+ initial_obs.oi.mtime, // will be replaced in `apply_to()`
+ 0
+ };
+ encode(cloned_snaps, cloning_ctx->log_entry.snaps);
+
+ // TODO: update most recent clone_overlap and usage stats
+ return cloning_ctx;
+}
+
+void OpsExecuter::CloningContext::apply_to(
+ std::vector<pg_log_entry_t>& log_entries,
+ ObjectContext& processed_obc) &&
+{
+ log_entry.mtime = processed_obc.obs.oi.mtime;
+ log_entries.emplace_back(std::move(log_entry));
+ processed_obc.ssc->snapset = std::move(new_snapset);
+}
+
+OpsExecuter::interruptible_future<std::vector<pg_log_entry_t>>
+OpsExecuter::flush_clone_metadata(
+ std::vector<pg_log_entry_t>&& log_entries,
+ SnapMapper& snap_mapper,
+ OSDriver& osdriver,
+ ceph::os::Transaction& txn)
+{
+ assert(!txn.empty());
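+ // if this op made a clone, fold its log entry and the updated snapset into
+ // the result and register the clone's snaps with the SnapMapper.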
+ auto maybe_snap_mapped = interruptor::now();
+ if (cloning_ctx) {
+ std::move(*cloning_ctx).apply_to(log_entries, *obc);
+ const auto& coid = log_entries.back().soid;
+ const auto& cloned_snaps = obc->ssc->snapset.clone_snaps[coid.snap];
+ maybe_snap_mapped = snap_map_clone(
+ coid,
+ std::set<snapid_t>{std::begin(cloned_snaps), std::end(cloned_snaps)},
+ snap_mapper,
+ osdriver,
+ txn);
+ }
+ if (snapc.seq > obc->ssc->snapset.seq) {
+ // update snapset with latest snap context
+ obc->ssc->snapset.seq = snapc.seq;
+ obc->ssc->snapset.snaps.clear();
+ }
+ logger().debug("{} done, initial snapset={}, new snapset={}",
+ __func__, obc->obs.oi.soid, obc->ssc->snapset);
+ return std::move(
+ maybe_snap_mapped
+ ).then_interruptible([log_entries=std::move(log_entries)]() mutable {
+ return interruptor::make_ready_future<std::vector<pg_log_entry_t>>(
+ std::move(log_entries));
+ });
+}
+
+// TODO: make this static
+std::pair<object_info_t, ObjectContextRef> OpsExecuter::prepare_clone(
+ const hobject_t& coid)
+{
+ object_info_t static_snap_oi(coid);
+ static_snap_oi.version = pg->next_version();
+ static_snap_oi.prior_version = obc->obs.oi.version;
+ static_snap_oi.copy_user_bits(obc->obs.oi);
+ if (static_snap_oi.is_whiteout()) {
+ // clone shouldn't be marked as whiteout
+ static_snap_oi.clear_flag(object_info_t::FLAG_WHITEOUT);
+ }
+
+ ObjectContextRef clone_obc;
+ if (pg->is_primary()) {
+ // lookup_or_create
+ auto [c_obc, existed] =
+ pg->obc_registry.get_cached_obc(std::move(coid));
+ assert(!existed);
+ c_obc->obs.oi = static_snap_oi;
+ c_obc->obs.exists = true;
+ c_obc->ssc = obc->ssc;
+ logger().debug("clone_obc: {}", c_obc->obs.oi);
+ clone_obc = std::move(c_obc);
+ }
+ return std::make_pair(std::move(static_snap_oi), std::move(clone_obc));
+}
+
+void OpsExecuter::apply_stats()
+{
+ pg->get_peering_state().apply_op_stats(get_target(), delta_stats);
+ pg->publish_stats_to_osd();
+}
+
+OpsExecuter::OpsExecuter(Ref<PG> pg,
+ ObjectContextRef _obc,
+ const OpInfo& op_info,
+ abstracted_msg_t&& msg,
+ crimson::net::ConnectionRef conn,
+ const SnapContext& _snapc)
+ : pg(std::move(pg)),
+ obc(std::move(_obc)),
+ op_info(op_info),
+ msg(std::move(msg)),
+ conn(conn),
+ snapc(_snapc)
+{
+ if (op_info.may_write() && should_clone(*obc, snapc)) {
+ do_write_op([this](auto& backend, auto& os, auto& txn) {
+ cloning_ctx = execute_clone(std::as_const(snapc),
+ std::as_const(obc->obs),
+ std::as_const(obc->ssc->snapset),
+ backend,
+ txn);
+ });
+ }
+}
+
+static inline std::unique_ptr<const PGLSFilter> get_pgls_filter(
+ const std::string& type,
+ bufferlist::const_iterator& iter)
+{
+ // storing non-const PGLSFilter for the sake of ::init()
+ std::unique_ptr<PGLSFilter> filter;
+ if (type.compare("plain") == 0) {
+ filter = std::make_unique<PGLSPlainFilter>();
+ } else {
+ std::size_t dot = type.find(".");
+ if (dot == type.npos || dot == 0 || dot == type.size() - 1) {
+ throw crimson::osd::invalid_argument{};
+ }
+
+ const std::string class_name = type.substr(0, dot);
+ const std::string filter_name = type.substr(dot + 1);
+ ClassHandler::ClassData *cls = nullptr;
+ int r = ClassHandler::get_instance().open_class(class_name, &cls);
+ if (r != 0) {
+ logger().warn("can't open class {}: {}", class_name, cpp_strerror(r));
+ if (r == -EPERM) {
+        // propagate permission error
+ throw crimson::osd::permission_denied{};
+ } else {
+ throw crimson::osd::invalid_argument{};
+ }
+ } else {
+ ceph_assert(cls);
+ }
+
+ ClassHandler::ClassFilter * const class_filter = cls->get_filter(filter_name);
+ if (class_filter == nullptr) {
+ logger().warn("can't find filter {} in class {}", filter_name, class_name);
+ throw crimson::osd::invalid_argument{};
+ }
+
+ filter.reset(class_filter->fn());
+ if (!filter) {
+ // Object classes are obliged to return us something, but let's
+ // give an error rather than asserting out.
+ logger().warn("buggy class {} failed to construct filter {}",
+ class_name, filter_name);
+ throw crimson::osd::invalid_argument{};
+ }
+ }
+
+ ceph_assert(filter);
+ int r = filter->init(iter);
+ if (r < 0) {
+ logger().warn("error initializing filter {}: {}", type, cpp_strerror(r));
+ throw crimson::osd::invalid_argument{};
+ }
+
+ // successfully constructed and initialized, return it.
+ return filter;
+}
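+
+// Illustrative note (not from the original sources): the `type` strings
+// accepted above are either "plain", which selects the built-in
+// PGLSPlainFilter, or a hypothetical "someclass.somefilter", which opens
+// object class "someclass" and asks it for the filter named "somefilter".
+// In both cases filter->init(iter) then consumes the filter-specific
+// arguments that follow in the op's input bufferlist.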
+
+static PG::interruptible_future<hobject_t> pgls_filter(
+ const PGLSFilter& filter,
+ const PGBackend& backend,
+ const hobject_t& sobj)
+{
+ if (const auto xattr = filter.get_xattr(); !xattr.empty()) {
+ logger().debug("pgls_filter: filter is interested in xattr={} for obj={}",
+ xattr, sobj);
+ return backend.getxattr(sobj, std::move(xattr)).safe_then_interruptible(
+ [&filter, sobj] (ceph::bufferlist val) {
+ logger().debug("pgls_filter: got xvalue for obj={}", sobj);
+
+ const bool filtered = filter.filter(sobj, val);
+ return seastar::make_ready_future<hobject_t>(filtered ? sobj : hobject_t{});
+ }, PGBackend::get_attr_errorator::all_same_way([&filter, sobj] {
+ logger().debug("pgls_filter: got error for obj={}", sobj);
+
+ if (filter.reject_empty_xattr()) {
+ return seastar::make_ready_future<hobject_t>();
+ }
+ ceph::bufferlist val;
+ const bool filtered = filter.filter(sobj, val);
+ return seastar::make_ready_future<hobject_t>(filtered ? sobj : hobject_t{});
+ }));
+ } else {
+ ceph::bufferlist empty_lvalue_bl;
+ const bool filtered = filter.filter(sobj, empty_lvalue_bl);
+ return seastar::make_ready_future<hobject_t>(filtered ? sobj : hobject_t{});
+ }
+}
+
+static PG::interruptible_future<ceph::bufferlist> do_pgnls_common(
+ const hobject_t& pg_start,
+ const hobject_t& pg_end,
+ const PGBackend& backend,
+ const hobject_t& lower_bound,
+ const std::string& nspace,
+ const uint64_t limit,
+ const PGLSFilter* const filter)
+{
+ if (!(lower_bound.is_min() ||
+ lower_bound.is_max() ||
+ (lower_bound >= pg_start && lower_bound < pg_end))) {
+ // this should only happen with a buggy client.
+ throw std::invalid_argument("outside of PG bounds");
+ }
+
+ return backend.list_objects(lower_bound, limit).then_interruptible(
+ [&backend, filter, nspace](auto&& ret)
+ -> PG::interruptible_future<std::tuple<std::vector<hobject_t>, hobject_t>> {
+ auto& [objects, next] = ret;
+ auto in_my_namespace = [&nspace](const hobject_t& obj) {
+ using crimson::common::local_conf;
+ if (obj.get_namespace() == local_conf()->osd_hit_set_namespace) {
+ return false;
+ } else if (nspace == librados::all_nspaces) {
+ return true;
+ } else {
+ return obj.get_namespace() == nspace;
+ }
+ };
+ auto to_pglsed = [&backend, filter] (const hobject_t& obj)
+ -> PG::interruptible_future<hobject_t> {
+      // this transformation looks costly. However, there is no reason
+      // to think PGLS* operations are critical to general performance.
+ //
+ // from tchaikov: "another way is to use seastar::map_reduce(),
+ // to 1) save the effort to filter the already filtered objects
+ // 2) avoid the space to keep the tuple<bool, object> even if
+ // the object is filtered out".
+ if (filter) {
+ return pgls_filter(*filter, backend, obj);
+ } else {
+ return seastar::make_ready_future<hobject_t>(obj);
+ }
+ };
+
+ auto range = objects | boost::adaptors::filtered(in_my_namespace)
+ | boost::adaptors::transformed(to_pglsed);
+ logger().debug("do_pgnls_common: finishing the 1st stage of pgls");
+ return seastar::when_all_succeed(std::begin(range),
+ std::end(range)).then(
+ [next=std::move(next)] (auto items) mutable {
+      // the sole purpose of this chaining is to pass `next` to the 2nd
+      // stage along with the items
+ logger().debug("do_pgnls_common: 1st done");
+ return seastar::make_ready_future<
+ std::tuple<std::vector<hobject_t>, hobject_t>>(
+ std::move(items), std::move(next));
+ });
+ }).then_interruptible(
+ [pg_end] (auto&& ret) {
+ auto& [items, next] = ret;
+ auto is_matched = [] (const auto& obj) {
+ return !obj.is_min();
+ };
+ auto to_entry = [] (const auto& obj) {
+ return librados::ListObjectImpl{
+ obj.get_namespace(), obj.oid.name, obj.get_key()
+ };
+ };
+
+ pg_nls_response_t response;
+ boost::push_back(response.entries, items | boost::adaptors::filtered(is_matched)
+ | boost::adaptors::transformed(to_entry));
+ response.handle = next.is_max() ? pg_end : next;
+ ceph::bufferlist out;
+ encode(response, out);
+ logger().debug("do_pgnls_common: response.entries.size()= {}",
+ response.entries.size());
+ return seastar::make_ready_future<ceph::bufferlist>(std::move(out));
+ });
+}
+
+static PG::interruptible_future<> do_pgnls(
+ const PG& pg,
+ const std::string& nspace,
+ OSDOp& osd_op)
+{
+ hobject_t lower_bound;
+ try {
+ ceph::decode(lower_bound, osd_op.indata);
+ } catch (const buffer::error&) {
+ throw std::invalid_argument("unable to decode PGNLS handle");
+ }
+ const auto pg_start = pg.get_pgid().pgid.get_hobj_start();
+ const auto pg_end = \
+ pg.get_pgid().pgid.get_hobj_end(pg.get_pgpool().info.get_pg_num());
+ return do_pgnls_common(pg_start,
+ pg_end,
+ pg.get_backend(),
+ lower_bound,
+ nspace,
+ osd_op.op.pgls.count,
+ nullptr /* no filter */)
+ .then_interruptible([&osd_op](bufferlist bl) {
+ osd_op.outdata = std::move(bl);
+ return seastar::now();
+ });
+}
+
+static PG::interruptible_future<> do_pgnls_filtered(
+ const PG& pg,
+ const std::string& nspace,
+ OSDOp& osd_op)
+{
+ std::string cname, mname, type;
+ auto bp = osd_op.indata.cbegin();
+ try {
+ ceph::decode(cname, bp);
+ ceph::decode(mname, bp);
+ ceph::decode(type, bp);
+ } catch (const buffer::error&) {
+ throw crimson::osd::invalid_argument{};
+ }
+
+ auto filter = get_pgls_filter(type, bp);
+
+ hobject_t lower_bound;
+ try {
+ lower_bound.decode(bp);
+ } catch (const buffer::error&) {
+ throw std::invalid_argument("unable to decode PGNLS_FILTER description");
+ }
+
+ logger().debug("{}: cname={}, mname={}, type={}, lower_bound={}, filter={}",
+ __func__, cname, mname, type, lower_bound,
+ static_cast<const void*>(filter.get()));
+ return seastar::do_with(std::move(filter),
+ [&, lower_bound=std::move(lower_bound)](auto&& filter) {
+ const auto pg_start = pg.get_pgid().pgid.get_hobj_start();
+ const auto pg_end = pg.get_pgid().pgid.get_hobj_end(pg.get_pgpool().info.get_pg_num());
+ return do_pgnls_common(pg_start,
+ pg_end,
+ pg.get_backend(),
+ lower_bound,
+ nspace,
+ osd_op.op.pgls.count,
+ filter.get())
+ .then_interruptible([&osd_op](bufferlist bl) {
+ osd_op.outdata = std::move(bl);
+ return seastar::now();
+ });
+ });
+}
+
+static PG::interruptible_future<ceph::bufferlist> do_pgls_common(
+ const hobject_t& pg_start,
+ const hobject_t& pg_end,
+ const PGBackend& backend,
+ const hobject_t& lower_bound,
+ const std::string& nspace,
+ const uint64_t limit,
+ const PGLSFilter* const filter)
+{
+ if (!(lower_bound.is_min() ||
+ lower_bound.is_max() ||
+ (lower_bound >= pg_start && lower_bound < pg_end))) {
+ // this should only happen with a buggy client.
+ throw std::invalid_argument("outside of PG bounds");
+ }
+
+ using entries_t = decltype(pg_ls_response_t::entries);
+ return backend.list_objects(lower_bound, limit).then_interruptible(
+ [&backend, filter, nspace](auto&& ret) {
+ auto& [objects, next] = ret;
+ return PG::interruptor::when_all(
+ PG::interruptor::map_reduce(std::move(objects),
+ [&backend, filter, nspace](const hobject_t& obj)
+ -> PG::interruptible_future<hobject_t>{
+ if (obj.get_namespace() == nspace) {
+ if (filter) {
+ return pgls_filter(*filter, backend, obj);
+ } else {
+ return seastar::make_ready_future<hobject_t>(obj);
+ }
+ } else {
+ return seastar::make_ready_future<hobject_t>();
+ }
+ },
+ entries_t{},
+ [](entries_t entries, hobject_t obj) {
+ if (!obj.is_min()) {
+ entries.emplace_back(obj.oid, obj.get_key());
+ }
+ return entries;
+ }),
+ seastar::make_ready_future<hobject_t>(next));
+ }).then_interruptible([pg_end](auto&& ret) {
+ auto entries = std::move(std::get<0>(ret).get0());
+ auto next = std::move(std::get<1>(ret).get0());
+ pg_ls_response_t response;
+ response.handle = next.is_max() ? pg_end : next;
+ response.entries = std::move(entries);
+ ceph::bufferlist out;
+ encode(response, out);
+ logger().debug("{}: response.entries.size()=",
+ __func__, response.entries.size());
+ return seastar::make_ready_future<ceph::bufferlist>(std::move(out));
+ });
+}
+
+static PG::interruptible_future<> do_pgls(
+ const PG& pg,
+ const std::string& nspace,
+ OSDOp& osd_op)
+{
+ hobject_t lower_bound;
+ auto bp = osd_op.indata.cbegin();
+ try {
+ lower_bound.decode(bp);
+ } catch (const buffer::error&) {
+ throw std::invalid_argument{"unable to decode PGLS handle"};
+ }
+ const auto pg_start = pg.get_pgid().pgid.get_hobj_start();
+ const auto pg_end =
+ pg.get_pgid().pgid.get_hobj_end(pg.get_pgpool().info.get_pg_num());
+ return do_pgls_common(pg_start,
+ pg_end,
+ pg.get_backend(),
+ lower_bound,
+ nspace,
+ osd_op.op.pgls.count,
+ nullptr /* no filter */)
+ .then_interruptible([&osd_op](bufferlist bl) {
+ osd_op.outdata = std::move(bl);
+ return seastar::now();
+ });
+}
+
+static PG::interruptible_future<> do_pgls_filtered(
+ const PG& pg,
+ const std::string& nspace,
+ OSDOp& osd_op)
+{
+ std::string cname, mname, type;
+ auto bp = osd_op.indata.cbegin();
+ try {
+ ceph::decode(cname, bp);
+ ceph::decode(mname, bp);
+ ceph::decode(type, bp);
+ } catch (const buffer::error&) {
+ throw crimson::osd::invalid_argument{};
+ }
+
+ auto filter = get_pgls_filter(type, bp);
+
+ hobject_t lower_bound;
+ try {
+ lower_bound.decode(bp);
+ } catch (const buffer::error&) {
+ throw std::invalid_argument("unable to decode PGLS_FILTER description");
+ }
+
+ logger().debug("{}: cname={}, mname={}, type={}, lower_bound={}, filter={}",
+ __func__, cname, mname, type, lower_bound,
+ static_cast<const void*>(filter.get()));
+ return seastar::do_with(std::move(filter),
+ [&, lower_bound=std::move(lower_bound)](auto&& filter) {
+ const auto pg_start = pg.get_pgid().pgid.get_hobj_start();
+ const auto pg_end = pg.get_pgid().pgid.get_hobj_end(pg.get_pgpool().info.get_pg_num());
+ return do_pgls_common(pg_start,
+ pg_end,
+ pg.get_backend(),
+ lower_bound,
+ nspace,
+ osd_op.op.pgls.count,
+ filter.get())
+ .then_interruptible([&osd_op](bufferlist bl) {
+ osd_op.outdata = std::move(bl);
+ return seastar::now();
+ });
+ });
+}
+
+PgOpsExecuter::interruptible_future<>
+PgOpsExecuter::execute_op(OSDOp& osd_op)
+{
+ logger().warn("handling op {}", ceph_osd_op_name(osd_op.op.op));
+ switch (const ceph_osd_op& op = osd_op.op; op.op) {
+ case CEPH_OSD_OP_PGLS:
+ return do_pgls(pg, nspace, osd_op);
+ case CEPH_OSD_OP_PGLS_FILTER:
+ return do_pgls_filtered(pg, nspace, osd_op);
+ case CEPH_OSD_OP_PGNLS:
+ return do_pgnls(pg, nspace, osd_op);
+ case CEPH_OSD_OP_PGNLS_FILTER:
+ return do_pgnls_filtered(pg, nspace, osd_op);
+ default:
+ logger().warn("unknown op {}", ceph_osd_op_name(op.op));
+ throw std::runtime_error(
+ fmt::format("op '{}' not supported", ceph_osd_op_name(op.op)));
+ }
+}
+
+} // namespace crimson::osd
diff --git a/src/crimson/osd/ops_executer.h b/src/crimson/osd/ops_executer.h
new file mode 100644
index 000000000..1230b1c5a
--- /dev/null
+++ b/src/crimson/osd/ops_executer.h
@@ -0,0 +1,629 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <memory>
+#include <type_traits>
+#include <utility>
+#include <boost/intrusive_ptr.hpp>
+#include <boost/smart_ptr/intrusive_ref_counter.hpp>
+#include <fmt/os.h>
+#include <seastar/core/chunked_fifo.hh>
+#include <seastar/core/future.hh>
+#include <seastar/core/shared_future.hh>
+#include <seastar/core/shared_ptr.hh>
+
+#include "common/dout.h"
+#include "common/map_cacher.hpp"
+#include "common/static_ptr.h"
+#include "messages/MOSDOp.h"
+#include "os/Transaction.h"
+#include "osd/osd_types.h"
+
+#include "crimson/common/errorator.h"
+#include "crimson/common/interruptible_future.h"
+#include "crimson/common/type_helpers.h"
+#include "crimson/osd/osd_operations/client_request.h"
+#include "crimson/osd/osd_operations/peering_event.h"
+#include "crimson/osd/pg_backend.h"
+#include "crimson/osd/pg_interval_interrupt_condition.h"
+#include "crimson/osd/shard_services.h"
+
+struct ObjectState;
+struct OSDOp;
+class OSDriver;
+class SnapMapper;
+
+namespace crimson::osd {
+class PG;
+
+// OpsExecuter -- a class for executing ops targeting a certain object.
+class OpsExecuter : public seastar::enable_lw_shared_from_this<OpsExecuter> {
+ friend class SnapTrimObjSubEvent;
+
+ using call_errorator = crimson::errorator<
+ crimson::stateful_ec,
+ crimson::ct_error::enoent,
+ crimson::ct_error::eexist,
+ crimson::ct_error::enospc,
+ crimson::ct_error::edquot,
+ crimson::ct_error::cmp_fail,
+ crimson::ct_error::eagain,
+ crimson::ct_error::invarg,
+ crimson::ct_error::erange,
+ crimson::ct_error::ecanceled,
+ crimson::ct_error::enametoolong,
+ crimson::ct_error::permission_denied,
+ crimson::ct_error::operation_not_supported,
+ crimson::ct_error::input_output_error,
+ crimson::ct_error::value_too_large,
+ crimson::ct_error::file_too_large>;
+ using read_errorator = PGBackend::read_errorator;
+ using write_ertr = PGBackend::write_ertr;
+ using get_attr_errorator = PGBackend::get_attr_errorator;
+ using watch_errorator = crimson::errorator<
+ crimson::ct_error::enoent,
+ crimson::ct_error::invarg,
+ crimson::ct_error::not_connected,
+ crimson::ct_error::timed_out>;
+
+ using call_ierrorator =
+ ::crimson::interruptible::interruptible_errorator<
+ IOInterruptCondition, call_errorator>;
+ using read_ierrorator =
+ ::crimson::interruptible::interruptible_errorator<
+ IOInterruptCondition, read_errorator>;
+ using write_iertr =
+ ::crimson::interruptible::interruptible_errorator<
+ IOInterruptCondition, write_ertr>;
+ using get_attr_ierrorator =
+ ::crimson::interruptible::interruptible_errorator<
+ IOInterruptCondition, get_attr_errorator>;
+ using watch_ierrorator =
+ ::crimson::interruptible::interruptible_errorator<
+ IOInterruptCondition, watch_errorator>;
+
+ template <typename Errorator, typename T = void>
+ using interruptible_errorated_future =
+ ::crimson::interruptible::interruptible_errorated_future<
+ IOInterruptCondition, Errorator, T>;
+ using interruptor =
+ ::crimson::interruptible::interruptor<IOInterruptCondition>;
+ template <typename T = void>
+ using interruptible_future =
+ ::crimson::interruptible::interruptible_future<
+ IOInterruptCondition, T>;
+
+public:
+ // ExecutableMessage -- an interface class to allow using OpsExecuter
+ // with other message types than just the `MOSDOp`. The type erasure
+ // happens in the ctor of `OpsExecuter`.
+ struct ExecutableMessage {
+ virtual osd_reqid_t get_reqid() const = 0;
+ virtual utime_t get_mtime() const = 0;
+ virtual epoch_t get_map_epoch() const = 0;
+ virtual entity_inst_t get_orig_source_inst() const = 0;
+ virtual uint64_t get_features() const = 0;
+ virtual bool has_flag(uint32_t flag) const = 0;
+ virtual entity_name_t get_source() const = 0;
+ };
+
+ template <class ImplT>
+ class ExecutableMessagePimpl final : ExecutableMessage {
+ const ImplT* pimpl;
+ // In crimson, conn is independently maintained outside Message.
+ const crimson::net::ConnectionRef conn;
+ public:
+ ExecutableMessagePimpl(const ImplT* pimpl,
+ const crimson::net::ConnectionRef conn)
+ : pimpl(pimpl), conn(conn) {
+ }
+
+ osd_reqid_t get_reqid() const final {
+ return pimpl->get_reqid();
+ }
+ bool has_flag(uint32_t flag) const final {
+ return pimpl->has_flag(flag);
+ }
+ utime_t get_mtime() const final {
+ return pimpl->get_mtime();
+ };
+ epoch_t get_map_epoch() const final {
+ return pimpl->get_map_epoch();
+ }
+ entity_inst_t get_orig_source_inst() const final {
+      // We can't get the original source address from the message
+      // since (in Crimson) the connection is maintained
+      // outside of the Message.
+ return entity_inst_t(get_source(), conn->get_peer_addr());
+ }
+ entity_name_t get_source() const final {
+ return pimpl->get_source();
+ }
+ uint64_t get_features() const final {
+ return pimpl->get_features();
+ }
+ };
+
+  // Because OpsExecuter is a pretty heavy-weight object, we want to ensure
+  // it's neither copied nor even moved by accident. Performance is the sole
+  // reason for prohibiting that.
+ OpsExecuter(OpsExecuter&&) = delete;
+ OpsExecuter(const OpsExecuter&) = delete;
+
+ using osd_op_errorator = crimson::compound_errorator_t<
+ call_errorator,
+ read_errorator,
+ write_ertr,
+ get_attr_errorator,
+ watch_errorator,
+ PGBackend::stat_errorator>;
+ using osd_op_ierrorator =
+ ::crimson::interruptible::interruptible_errorator<
+ IOInterruptCondition, osd_op_errorator>;
+
+ object_stat_sum_t delta_stats;
+private:
+  // an operation can be divided into two stages: the main one and the
+  // effect-exposing one. The former is performed immediately on call to
+  // `do_osd_op()` while the latter on `submit_changes()` – after
+  // successfully processing the main stages of all involved operations.
+  // When any stage fails, none of the scheduled effect-exposing stages
+  // will be executed.
+  // When an operation requires this division, some variant of
+  // `with_effect()` should be used.
+ struct effect_t {
+ // an effect can affect PG, i.e. create a watch timeout
+ virtual osd_op_errorator::future<> execute(Ref<PG> pg) = 0;
+ virtual ~effect_t() = default;
+ };
+
+ Ref<PG> pg; // for the sake of object class
+ ObjectContextRef obc;
+ const OpInfo& op_info;
+ using abstracted_msg_t =
+ ceph::static_ptr<ExecutableMessage,
+ sizeof(ExecutableMessagePimpl<void>)>;
+ abstracted_msg_t msg;
+ crimson::net::ConnectionRef conn;
+ std::optional<osd_op_params_t> osd_op_params;
+ bool user_modify = false;
+ ceph::os::Transaction txn;
+
+ size_t num_read = 0; ///< count read ops
+ size_t num_write = 0; ///< count update ops
+
+ SnapContext snapc; // writer snap context
+ struct CloningContext {
+ SnapSet new_snapset;
+ pg_log_entry_t log_entry;
+
+ void apply_to(
+ std::vector<pg_log_entry_t>& log_entries,
+ ObjectContext& processed_obc) &&;
+ };
+ std::unique_ptr<CloningContext> cloning_ctx;
+
+
+ /**
+ * execute_clone
+ *
+   * If snapc contains a snap which occurred logically after the last write
+   * seen by this object (see OpsExecuter::should_clone()), we first need to
+   * make a clone of the object in its current state. execute_clone primes
+   * txn with that clone operation and returns an
+   * OpsExecuter::CloningContext which will allow us to fill in the
+   * corresponding metadata and log_entries once the operations have been
+   * processed.
+ *
+ * Note that this strategy differs from classic, which instead performs this
+ * work at the end and reorders the transaction. See
+ * PrimaryLogPG::make_writeable
+ *
+   * @param snapc [in] snapc for this operation (from the client if the
+   * request provides one, from the pool otherwise)
+ * @param initial_obs [in] objectstate for the object at operation start
+ * @param initial_snapset [in] snapset for the object at operation start
+ * @param backend [in,out] interface for generating mutations
+ * @param txn [out] transaction for the operation
+ */
+ std::unique_ptr<CloningContext> execute_clone(
+ const SnapContext& snapc,
+ const ObjectState& initial_obs,
+ const SnapSet& initial_snapset,
+ PGBackend& backend,
+ ceph::os::Transaction& txn);
+
+
+ /**
+ * should_clone
+ *
+   * Predicate returning whether a user write with snap context snapc
+   * contains a snap which occurred after the most recent write
+   * on the object reflected in initial_obc.
+   *
+   * @param initial_obc [in] obc for object to be mutated
+   * @param snapc [in] snapc for this operation (from the client if the
+   * request provides one, from the pool otherwise)
+ */
+ static bool should_clone(
+ const ObjectContext& initial_obc,
+ const SnapContext& snapc) {
+ // clone?
+ return initial_obc.obs.exists // both nominally and...
+ && !initial_obc.obs.oi.is_whiteout() // ... logically exists
+ && snapc.snaps.size() // there are snaps
+ && snapc.snaps[0] > initial_obc.ssc->snapset.seq; // existing obj is old
+ }
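+
+  // Illustrative example (not part of the original sources): for an existing,
+  // non-whiteout object whose snapset.seq is 5, a write carrying
+  // snapc.snaps = {7, 5, 3} (newest first) makes should_clone() return true
+  // because snaps[0] == 7 > 5, so execute_clone() will preserve the pre-write
+  // state as a clone; with snapc.snaps = {5, 3} it returns false and no clone
+  // is made.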
+
+ interruptible_future<std::vector<pg_log_entry_t>> flush_clone_metadata(
+ std::vector<pg_log_entry_t>&& log_entries,
+ SnapMapper& snap_mapper,
+ OSDriver& osdriver,
+ ceph::os::Transaction& txn);
+
+ static interruptible_future<> snap_map_remove(
+ const hobject_t& soid,
+ SnapMapper& snap_mapper,
+ OSDriver& osdriver,
+ ceph::os::Transaction& txn);
+ static interruptible_future<> snap_map_modify(
+ const hobject_t& soid,
+ const std::set<snapid_t>& snaps,
+ SnapMapper& snap_mapper,
+ OSDriver& osdriver,
+ ceph::os::Transaction& txn);
+ static interruptible_future<> snap_map_clone(
+ const hobject_t& soid,
+ const std::set<snapid_t>& snaps,
+ SnapMapper& snap_mapper,
+ OSDriver& osdriver,
+ ceph::os::Transaction& txn);
+
+ // this gizmo could be wrapped in std::optional for the sake of lazy
+  // initialization. We don't need it for ops that don't have effects.
+ // TODO: verify the init overhead of chunked_fifo
+ seastar::chunked_fifo<std::unique_ptr<effect_t>> op_effects;
+
+ template <class Context, class MainFunc, class EffectFunc>
+ auto with_effect_on_obc(
+ Context&& ctx,
+ MainFunc&& main_func,
+ EffectFunc&& effect_func);
+
+ call_ierrorator::future<> do_op_call(OSDOp& osd_op);
+ watch_ierrorator::future<> do_op_watch(
+ OSDOp& osd_op,
+ ObjectState& os,
+ ceph::os::Transaction& txn);
+ watch_ierrorator::future<> do_op_watch_subop_watch(
+ OSDOp& osd_op,
+ ObjectState& os,
+ ceph::os::Transaction& txn);
+ watch_ierrorator::future<> do_op_watch_subop_reconnect(
+ OSDOp& osd_op,
+ ObjectState& os,
+ ceph::os::Transaction& txn);
+ watch_ierrorator::future<> do_op_watch_subop_unwatch(
+ OSDOp& osd_op,
+ ObjectState& os,
+ ceph::os::Transaction& txn);
+ watch_ierrorator::future<> do_op_watch_subop_ping(
+ OSDOp& osd_op,
+ ObjectState& os,
+ ceph::os::Transaction& txn);
+ watch_ierrorator::future<> do_op_list_watchers(
+ OSDOp& osd_op,
+ const ObjectState& os);
+ watch_ierrorator::future<> do_op_notify(
+ OSDOp& osd_op,
+ const ObjectState& os);
+ watch_ierrorator::future<> do_op_notify_ack(
+ OSDOp& osd_op,
+ const ObjectState& os);
+ call_errorator::future<> do_assert_ver(
+ OSDOp& osd_op,
+ const ObjectState& os);
+
+ using list_snaps_ertr = read_errorator::extend<
+ crimson::ct_error::invarg>;
+ using list_snaps_iertr = ::crimson::interruptible::interruptible_errorator<
+ ::crimson::osd::IOInterruptCondition,
+ list_snaps_ertr>;
+ list_snaps_iertr::future<> do_list_snaps(
+ OSDOp& osd_op,
+ const ObjectState& os,
+ const SnapSet& ss);
+
+ template <class Func>
+ auto do_const_op(Func&& f);
+
+ template <class Func>
+ auto do_read_op(Func&& f) {
+ ++num_read;
+ // TODO: pass backend as read-only
+ return do_const_op(std::forward<Func>(f));
+ }
+
+ template <class Func>
+ auto do_snapset_op(Func&& f) {
+ ++num_read;
+ return std::invoke(
+ std::forward<Func>(f),
+ std::as_const(obc->obs),
+ std::as_const(obc->ssc->snapset));
+ }
+
+ enum class modified_by {
+ user,
+ sys,
+ };
+
+ template <class Func>
+ auto do_write_op(Func&& f, modified_by m = modified_by::user);
+
+ decltype(auto) dont_do_legacy_op() {
+ return crimson::ct_error::operation_not_supported::make();
+ }
+
+ interruptible_errorated_future<osd_op_errorator>
+ do_execute_op(OSDOp& osd_op);
+
+ OpsExecuter(Ref<PG> pg,
+ ObjectContextRef obc,
+ const OpInfo& op_info,
+ abstracted_msg_t&& msg,
+ crimson::net::ConnectionRef conn,
+ const SnapContext& snapc);
+
+public:
+ template <class MsgT>
+ OpsExecuter(Ref<PG> pg,
+ ObjectContextRef obc,
+ const OpInfo& op_info,
+ const MsgT& msg,
+ crimson::net::ConnectionRef conn,
+ const SnapContext& snapc)
+ : OpsExecuter(
+ std::move(pg),
+ std::move(obc),
+ op_info,
+ abstracted_msg_t{
+ std::in_place_type_t<ExecutableMessagePimpl<MsgT>>{},
+ &msg,
+ conn},
+ conn,
+ snapc) {
+ }
+
+ template <class Func>
+ struct RollbackHelper;
+
+ template <class Func>
+ RollbackHelper<Func> create_rollbacker(Func&& func);
+
+ interruptible_errorated_future<osd_op_errorator>
+ execute_op(OSDOp& osd_op);
+
+ using rep_op_fut_tuple =
+ std::tuple<interruptible_future<>, osd_op_ierrorator::future<>>;
+ using rep_op_fut_t =
+ interruptible_future<rep_op_fut_tuple>;
+ template <typename MutFunc>
+ rep_op_fut_t flush_changes_n_do_ops_effects(
+ const std::vector<OSDOp>& ops,
+ SnapMapper& snap_mapper,
+ OSDriver& osdriver,
+ MutFunc&& mut_func) &&;
+ std::vector<pg_log_entry_t> prepare_transaction(
+ const std::vector<OSDOp>& ops);
+ void fill_op_params_bump_pg_version();
+
+ ObjectContextRef get_obc() const {
+ return obc;
+ }
+
+ const object_info_t &get_object_info() const {
+ return obc->obs.oi;
+ }
+ const hobject_t &get_target() const {
+ return get_object_info().soid;
+ }
+
+ const auto& get_message() const {
+ return *msg;
+ }
+
+ size_t get_processed_rw_ops_num() const {
+ return num_read + num_write;
+ }
+
+ uint32_t get_pool_stripe_width() const;
+
+ bool has_seen_write() const {
+ return num_write > 0;
+ }
+
+ object_stat_sum_t& get_stats(){
+ return delta_stats;
+ }
+
+ version_t get_last_user_version() const;
+
+ std::pair<object_info_t, ObjectContextRef> prepare_clone(
+ const hobject_t& coid);
+
+ void apply_stats();
+};
+
+template <class Context, class MainFunc, class EffectFunc>
+auto OpsExecuter::with_effect_on_obc(
+ Context&& ctx,
+ MainFunc&& main_func,
+ EffectFunc&& effect_func)
+{
+ using context_t = std::decay_t<Context>;
+  // the language offers implicit conversion to pointer-to-function for
+  // a lambda only when it's capture-less. We enforce this restriction due
+  // to the fact that `flush_changes()` std::moves many of the executer's parts.
+ using allowed_effect_func_t =
+ seastar::future<> (*)(context_t&&, ObjectContextRef, Ref<PG>);
+ static_assert(std::is_convertible_v<EffectFunc, allowed_effect_func_t>,
+ "with_effect function is not allowed to capture");
+ struct task_t final : effect_t {
+ context_t ctx;
+ EffectFunc effect_func;
+ ObjectContextRef obc;
+
+ task_t(Context&& ctx, EffectFunc&& effect_func, ObjectContextRef obc)
+ : ctx(std::move(ctx)),
+ effect_func(std::move(effect_func)),
+ obc(std::move(obc)) {
+ }
+ osd_op_errorator::future<> execute(Ref<PG> pg) final {
+ return std::move(effect_func)(std::move(ctx),
+ std::move(obc),
+ std::move(pg));
+ }
+ };
+ auto task =
+ std::make_unique<task_t>(std::move(ctx), std::move(effect_func), obc);
+ auto& ctx_ref = task->ctx;
+ op_effects.emplace_back(std::move(task));
+ return std::forward<MainFunc>(main_func)(ctx_ref);
+}
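+
+// A minimal usage sketch (illustrative only; the context type and bodies are
+// assumptions, not actual call sites): the main function fills the context
+// in-memory, while the capture-less effect function runs only after the
+// transaction has been processed successfully, e.g.:
+//   with_effect_on_obc(some_ctx_t{},
+//     [&] (auto& ctx) { /* record the result of the OSDOp in ctx */ },
+//     [] (auto&& ctx, ObjectContextRef obc, Ref<PG> pg) {
+//       /* expose the side effect, e.g. send a reply or notify */
+//       return seastar::now();
+//     });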
+
+template <typename MutFunc>
+OpsExecuter::rep_op_fut_t
+OpsExecuter::flush_changes_n_do_ops_effects(
+ const std::vector<OSDOp>& ops,
+ SnapMapper& snap_mapper,
+ OSDriver& osdriver,
+ MutFunc&& mut_func) &&
+{
+ const bool want_mutate = !txn.empty();
+ // osd_op_params are instantiated by every wr-like operation.
+ assert(osd_op_params || !want_mutate);
+ assert(obc);
+ rep_op_fut_t maybe_mutated =
+ interruptor::make_ready_future<rep_op_fut_tuple>(
+ seastar::now(),
+ interruptor::make_interruptible(osd_op_errorator::now()));
+ if (cloning_ctx) {
+ ceph_assert(want_mutate);
+ }
+ if (want_mutate) {
+ if (user_modify) {
+ osd_op_params->user_at_version = osd_op_params->at_version.version;
+ }
+ maybe_mutated = flush_clone_metadata(
+ prepare_transaction(ops),
+ snap_mapper,
+ osdriver,
+ txn
+ ).then_interruptible([mut_func=std::move(mut_func),
+ this](auto&& log_entries) mutable {
+ auto [submitted, all_completed] =
+ std::forward<MutFunc>(mut_func)(std::move(txn),
+ std::move(obc),
+ std::move(*osd_op_params),
+ std::move(log_entries));
+ return interruptor::make_ready_future<rep_op_fut_tuple>(
+ std::move(submitted),
+ osd_op_ierrorator::future<>(std::move(all_completed)));
+ });
+ }
+ apply_stats();
+
+ if (__builtin_expect(op_effects.empty(), true)) {
+ return maybe_mutated;
+ } else {
+ return maybe_mutated.then_unpack_interruptible(
+ // need extra ref pg due to apply_stats() which can be executed after
+ // informing snap mapper
+ [this, pg=this->pg](auto&& submitted, auto&& all_completed) mutable {
+ return interruptor::make_ready_future<rep_op_fut_tuple>(
+ std::move(submitted),
+ all_completed.safe_then_interruptible([this, pg=std::move(pg)] {
+ // let's do the cleaning of `op_effects` in destructor
+ return interruptor::do_for_each(op_effects,
+ [pg=std::move(pg)](auto& op_effect) {
+ return op_effect->execute(pg);
+ });
+ }));
+ });
+ }
+}
+
+template <class Func>
+struct OpsExecuter::RollbackHelper {
+ interruptible_future<> rollback_obc_if_modified(const std::error_code& e);
+ ObjectContextRef get_obc() const {
+ assert(ox);
+ return ox->obc;
+ }
+ seastar::lw_shared_ptr<OpsExecuter> ox;
+ Func func;
+};
+
+template <class Func>
+inline OpsExecuter::RollbackHelper<Func>
+OpsExecuter::create_rollbacker(Func&& func) {
+ return {shared_from_this(), std::forward<Func>(func)};
+}
+
+
+template <class Func>
+OpsExecuter::interruptible_future<>
+OpsExecuter::RollbackHelper<Func>::rollback_obc_if_modified(
+ const std::error_code& e)
+{
+ // Oops, an operation had failed. do_osd_ops() altogether with
+ // OpsExecuter already dropped the ObjectStore::Transaction if
+ // there was any. However, this is not enough to completely
+ // rollback as we gave OpsExecuter the very single copy of `obc`
+ // we maintain and we did it for both reading and writing.
+ // Now all modifications must be reverted.
+ //
+ // Let's just reload from the store. Evicting from the shared
+ // LRU would be tricky as next MOSDOp (the one at `get_obc`
+  // phase) could actually have already finished the lookup. Fortunately,
+ // this is supposed to live on cold paths, so performance is not
+ // a concern -- simplicity wins.
+ //
+ // The conditional's purpose is to efficiently handle hot errors
+ // which may appear as a result of e.g. CEPH_OSD_OP_CMPXATTR or
+ // CEPH_OSD_OP_OMAP_CMP. These are read-like ops and clients
+ // typically append them before any write. If OpsExecuter hasn't
+ // seen any modifying operation, `obc` is supposed to be kept
+ // unchanged.
+ assert(ox);
+ const auto need_rollback = ox->has_seen_write();
+ crimson::get_logger(ceph_subsys_osd).debug(
+ "{}: object {} got error {}, need_rollback={}",
+ __func__,
+ ox->obc->get_oid(),
+ e,
+ need_rollback);
+ return need_rollback ? func(*ox->obc) : interruptor::now();
+}
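+
+// Illustrative usage (hypothetical names, not from the original sources): the
+// owner of the executer typically creates the helper up front and invokes it
+// on error paths, e.g.:
+//   auto rollbacker = ox->create_rollbacker(
+//     [this] (auto& obc) { return reload_obc_from_store(obc); });
+//   ...
+//   return rollbacker.rollback_obc_if_modified(err);
+// If no write has been seen yet, the functor is skipped and the obc is kept
+// unchanged.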
+
+// PgOpsExecuter -- a class for executing ops targeting a certain PG.
+class PgOpsExecuter {
+ template <typename T = void>
+ using interruptible_future =
+ ::crimson::interruptible::interruptible_future<
+ IOInterruptCondition, T>;
+
+public:
+ PgOpsExecuter(const PG& pg, const MOSDOp& msg)
+ : pg(pg), nspace(msg.get_hobj().nspace) {
+ }
+
+ interruptible_future<> execute_op(OSDOp& osd_op);
+
+private:
+ const PG& pg;
+ const std::string& nspace;
+};
+
+} // namespace crimson::osd
diff --git a/src/crimson/osd/osd.cc b/src/crimson/osd/osd.cc
new file mode 100644
index 000000000..cfe4f54ab
--- /dev/null
+++ b/src/crimson/osd/osd.cc
@@ -0,0 +1,1357 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "osd.h"
+
+#include <sys/utsname.h>
+
+#include <boost/iterator/counting_iterator.hpp>
+#include <boost/range/join.hpp>
+#include <fmt/format.h>
+#include <fmt/os.h>
+#include <fmt/ostream.h>
+#include <seastar/core/timer.hh>
+
+#include "common/pick_address.h"
+#include "include/util.h"
+
+#include "messages/MCommand.h"
+#include "messages/MOSDBeacon.h"
+#include "messages/MOSDBoot.h"
+#include "messages/MOSDMap.h"
+#include "messages/MOSDMarkMeDown.h"
+#include "messages/MOSDOp.h"
+#include "messages/MOSDPeeringOp.h"
+#include "messages/MOSDPGCreate2.h"
+#include "messages/MOSDPGUpdateLogMissing.h"
+#include "messages/MOSDPGUpdateLogMissingReply.h"
+#include "messages/MOSDRepOpReply.h"
+#include "messages/MOSDScrub2.h"
+#include "messages/MPGStats.h"
+
+#include "os/Transaction.h"
+#include "osd/ClassHandler.h"
+#include "osd/OSDCap.h"
+#include "osd/PGPeeringEvent.h"
+#include "osd/PeeringState.h"
+
+#include "crimson/admin/osd_admin.h"
+#include "crimson/admin/pg_commands.h"
+#include "crimson/common/buffer_io.h"
+#include "crimson/common/exception.h"
+#include "crimson/mon/MonClient.h"
+#include "crimson/net/Connection.h"
+#include "crimson/net/Messenger.h"
+#include "crimson/os/futurized_collection.h"
+#include "crimson/os/futurized_store.h"
+#include "crimson/osd/heartbeat.h"
+#include "crimson/osd/osd_meta.h"
+#include "crimson/osd/pg.h"
+#include "crimson/osd/pg_backend.h"
+#include "crimson/osd/pg_meta.h"
+#include "crimson/osd/osd_operations/client_request.h"
+#include "crimson/osd/osd_operations/peering_event.h"
+#include "crimson/osd/osd_operations/pg_advance_map.h"
+#include "crimson/osd/osd_operations/recovery_subrequest.h"
+#include "crimson/osd/osd_operations/replicated_request.h"
+#include "crimson/osd/osd_operation_external_tracking.h"
+#include "crimson/crush/CrushLocation.h"
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_osd);
+ }
+ static constexpr int TICK_INTERVAL = 1;
+}
+
+using std::make_unique;
+using std::map;
+using std::pair;
+using std::string;
+using std::unique_ptr;
+using std::vector;
+
+using crimson::common::local_conf;
+using crimson::os::FuturizedStore;
+
+namespace crimson::osd {
+
+OSD::OSD(int id, uint32_t nonce,
+ seastar::abort_source& abort_source,
+ crimson::os::FuturizedStore& store,
+ crimson::net::MessengerRef cluster_msgr,
+ crimson::net::MessengerRef public_msgr,
+ crimson::net::MessengerRef hb_front_msgr,
+ crimson::net::MessengerRef hb_back_msgr)
+ : whoami{id},
+ nonce{nonce},
+ abort_source{abort_source},
+ // do this in background
+ beacon_timer{[this] { (void)send_beacon(); }},
+ cluster_msgr{cluster_msgr},
+ public_msgr{public_msgr},
+ hb_front_msgr{hb_front_msgr},
+ hb_back_msgr{hb_back_msgr},
+ monc{new crimson::mon::Client{*public_msgr, *this}},
+ mgrc{new crimson::mgr::Client{*public_msgr, *this}},
+ store{store},
+ pg_shard_manager{osd_singleton_state,
+ shard_services,
+ pg_to_shard_mappings},
+ // do this in background -- continuation rearms timer when complete
+ tick_timer{[this] {
+ std::ignore = update_heartbeat_peers(
+ ).then([this] {
+ update_stats();
+ tick_timer.arm(
+ std::chrono::seconds(TICK_INTERVAL));
+ });
+ }},
+ asok{seastar::make_lw_shared<crimson::admin::AdminSocket>()},
+ log_client(cluster_msgr.get(), LogClient::NO_FLAGS),
+ clog(log_client.create_channel())
+{
+ ceph_assert(seastar::this_shard_id() == PRIMARY_CORE);
+ for (auto msgr : {std::ref(cluster_msgr), std::ref(public_msgr),
+ std::ref(hb_front_msgr), std::ref(hb_back_msgr)}) {
+ msgr.get()->set_auth_server(monc.get());
+ msgr.get()->set_auth_client(monc.get());
+ }
+
+ if (local_conf()->osd_open_classes_on_start) {
+ const int r = ClassHandler::get_instance().open_all_classes();
+ if (r) {
+ logger().warn("{} warning: got an error loading one or more classes: {}",
+ __func__, cpp_strerror(r));
+ }
+ }
+ logger().info("{}: nonce is {}", __func__, nonce);
+ monc->set_log_client(&log_client);
+ clog->set_log_to_monitors(true);
+}
+
+OSD::~OSD() = default;
+
+namespace {
+// Initial features in new superblock.
+// Features here are also automatically upgraded
+CompatSet get_osd_initial_compat_set()
+{
+ CompatSet::FeatureSet ceph_osd_feature_compat;
+ CompatSet::FeatureSet ceph_osd_feature_ro_compat;
+ CompatSet::FeatureSet ceph_osd_feature_incompat;
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGINFO);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_OLOC);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEC);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BIGINFO);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HINTS);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGMETA);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_MISSING);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_FASTINFO);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_RECOVERY_DELETES);
+ return CompatSet(ceph_osd_feature_compat,
+ ceph_osd_feature_ro_compat,
+ ceph_osd_feature_incompat);
+}
+}
+
+seastar::future<> OSD::open_meta_coll()
+{
+ ceph_assert(seastar::this_shard_id() == PRIMARY_CORE);
+ return store.get_sharded_store().open_collection(
+ coll_t::meta()
+ ).then([this](auto ch) {
+ pg_shard_manager.init_meta_coll(ch, store.get_sharded_store());
+ return seastar::now();
+ });
+}
+
+seastar::future<OSDMeta> OSD::open_or_create_meta_coll(FuturizedStore &store)
+{
+ return store.get_sharded_store().open_collection(coll_t::meta()).then([&store](auto ch) {
+ if (!ch) {
+ return store.get_sharded_store().create_new_collection(
+ coll_t::meta()
+ ).then([&store](auto ch) {
+ return OSDMeta(ch, store.get_sharded_store());
+ });
+ } else {
+ return seastar::make_ready_future<OSDMeta>(ch, store.get_sharded_store());
+ }
+ });
+}
+
+seastar::future<> OSD::mkfs(
+ FuturizedStore &store,
+ unsigned whoami,
+ uuid_d osd_uuid,
+ uuid_d cluster_fsid,
+ std::string osdspec_affinity)
+{
+ return store.start().then([&store, osd_uuid] {
+ return store.mkfs(osd_uuid).handle_error(
+ crimson::stateful_ec::handle([] (const auto& ec) {
+ logger().error("error creating empty object store in {}: ({}) {}",
+ local_conf().get_val<std::string>("osd_data"),
+ ec.value(), ec.message());
+ std::exit(EXIT_FAILURE);
+ }));
+ }).then([&store] {
+ return store.mount().handle_error(
+ crimson::stateful_ec::handle([](const auto& ec) {
+ logger().error("error mounting object store in {}: ({}) {}",
+ local_conf().get_val<std::string>("osd_data"),
+ ec.value(), ec.message());
+ std::exit(EXIT_FAILURE);
+ }));
+ }).then([&store] {
+ return open_or_create_meta_coll(store);
+ }).then([&store, whoami, cluster_fsid](auto meta_coll) {
+ OSDSuperblock superblock;
+ superblock.cluster_fsid = cluster_fsid;
+ superblock.osd_fsid = store.get_fsid();
+ superblock.whoami = whoami;
+ superblock.compat_features = get_osd_initial_compat_set();
+ return _write_superblock(
+ store, std::move(meta_coll), std::move(superblock));
+ }).then([&store, cluster_fsid] {
+ return store.write_meta("ceph_fsid", cluster_fsid.to_string());
+ }).then([&store] {
+ return store.write_meta("magic", CEPH_OSD_ONDISK_MAGIC);
+ }).then([&store, whoami] {
+ return store.write_meta("whoami", std::to_string(whoami));
+ }).then([&store] {
+ return _write_key_meta(store);
+ }).then([&store, osdspec_affinity=std::move(osdspec_affinity)] {
+ return store.write_meta("osdspec_affinity", osdspec_affinity);
+ }).then([&store] {
+ return store.write_meta("ready", "ready");
+ }).then([&store, whoami, cluster_fsid] {
+ fmt::print("created object store {} for osd.{} fsid {}\n",
+ local_conf().get_val<std::string>("osd_data"),
+ whoami, cluster_fsid);
+ return store.umount();
+ }).then([&store] {
+ return store.stop();
+ });
+}
+
+seastar::future<> OSD::_write_superblock(
+ FuturizedStore &store,
+ OSDMeta meta_coll,
+ OSDSuperblock superblock)
+{
+ return seastar::do_with(
+ std::move(meta_coll),
+ std::move(superblock),
+ [&store](auto &meta_coll, auto &superblock) {
+ return meta_coll.load_superblock(
+ ).safe_then([&superblock](OSDSuperblock&& sb) {
+ if (sb.cluster_fsid != superblock.cluster_fsid) {
+ logger().error("provided cluster fsid {} != superblock's {}",
+ sb.cluster_fsid, superblock.cluster_fsid);
+ throw std::invalid_argument("mismatched fsid");
+ }
+ if (sb.whoami != superblock.whoami) {
+ logger().error("provided osd id {} != superblock's {}",
+ sb.whoami, superblock.whoami);
+ throw std::invalid_argument("mismatched osd id");
+ }
+ }).handle_error(
+ crimson::ct_error::enoent::handle([&store, &meta_coll, &superblock] {
+        // meta collection does not exist yet, create superblock
+ logger().info(
+ "{} writing superblock cluster_fsid {} osd_fsid {}",
+ "_write_superblock",
+ superblock.cluster_fsid,
+ superblock.osd_fsid);
+ ceph::os::Transaction t;
+ meta_coll.create(t);
+ meta_coll.store_superblock(t, superblock);
+ logger().debug("OSD::_write_superblock: do_transaction...");
+ return store.get_sharded_store().do_transaction(
+ meta_coll.collection(),
+ std::move(t));
+ }),
+ crimson::ct_error::assert_all("_write_superbock error")
+ );
+ });
+}
+
+// this `to_string` sits in the `crimson::osd` namespace, so we don't break
+// the language rule against overloading in `std::`.
+static std::string to_string(const seastar::temporary_buffer<char>& temp_buf)
+{
+ return {temp_buf.get(), temp_buf.size()};
+}
+
+seastar::future<> OSD::_write_key_meta(FuturizedStore &store)
+{
+
+ if (auto key = local_conf().get_val<std::string>("key"); !std::empty(key)) {
+ return store.write_meta("osd_key", key);
+ } else if (auto keyfile = local_conf().get_val<std::string>("keyfile");
+ !std::empty(keyfile)) {
+ return read_file(keyfile).then([&store](const auto& temp_buf) {
+ // it's on a truly cold path, so don't worry about memcpy.
+ return store.write_meta("osd_key", to_string(temp_buf));
+ }).handle_exception([keyfile] (auto ep) {
+ logger().error("_write_key_meta: failed to handle keyfile {}: {}",
+ keyfile, ep);
+ ceph_abort();
+ });
+ } else {
+ return seastar::now();
+ }
+}
+
+namespace {
+ entity_addrvec_t pick_addresses(int what) {
+ entity_addrvec_t addrs;
+ crimson::common::CephContext cct;
+ // we're interested solely in v2; crimson doesn't do v1
+ const auto flags = what | CEPH_PICK_ADDRESS_MSGR2;
+ if (int r = ::pick_addresses(&cct, flags, &addrs, -1); r < 0) {
+ throw std::runtime_error("failed to pick address");
+ }
+ for (auto addr : addrs.v) {
+ logger().info("picked address {}", addr);
+ }
+ return addrs;
+ }
+ std::pair<entity_addrvec_t, bool>
+ replace_unknown_addrs(entity_addrvec_t maybe_unknowns,
+ const entity_addrvec_t& knowns) {
+ bool changed = false;
+ auto maybe_replace = [&](entity_addr_t addr) {
+ if (!addr.is_blank_ip()) {
+ return addr;
+ }
+ for (auto& b : knowns.v) {
+ if (addr.get_family() == b.get_family()) {
+ auto a = b;
+ a.set_nonce(addr.get_nonce());
+ a.set_type(addr.get_type());
+ a.set_port(addr.get_port());
+ changed = true;
+ return a;
+ }
+ }
+ throw std::runtime_error("failed to replace unknown address");
+ };
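+    // Illustrative example (not part of the original code): a blank address
+    // like [v2:0.0.0.0:6802/4321] picks up the IP and address family of a
+    // matching known public address while keeping its own nonce, type and
+    // port, yielding e.g. [v2:10.0.0.5:6802/4321] and setting `changed`.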
+ entity_addrvec_t replaced;
+ std::transform(maybe_unknowns.v.begin(),
+ maybe_unknowns.v.end(),
+ std::back_inserter(replaced.v),
+ maybe_replace);
+ return {replaced, changed};
+ }
+}
+
+seastar::future<> OSD::start()
+{
+ logger().info("start");
+
+ startup_time = ceph::mono_clock::now();
+ ceph_assert(seastar::this_shard_id() == PRIMARY_CORE);
+ return store.start().then([this] {
+ return pg_to_shard_mappings.start(0, seastar::smp::count
+ ).then([this] {
+ return osd_singleton_state.start_single(
+ whoami, std::ref(*cluster_msgr), std::ref(*public_msgr),
+ std::ref(*monc), std::ref(*mgrc));
+ }).then([this] {
+ return osd_states.start();
+ }).then([this] {
+ ceph::mono_time startup_time = ceph::mono_clock::now();
+ return shard_services.start(
+ std::ref(osd_singleton_state),
+ std::ref(pg_to_shard_mappings),
+ whoami,
+ startup_time,
+ osd_singleton_state.local().perf,
+ osd_singleton_state.local().recoverystate_perf,
+ std::ref(store),
+ std::ref(osd_states));
+ });
+ }).then([this] {
+ heartbeat.reset(new Heartbeat{
+ whoami, get_shard_services(),
+ *monc, *hb_front_msgr, *hb_back_msgr});
+ return store.mount().handle_error(
+ crimson::stateful_ec::handle([] (const auto& ec) {
+ logger().error("error mounting object store in {}: ({}) {}",
+ local_conf().get_val<std::string>("osd_data"),
+ ec.value(), ec.message());
+ std::exit(EXIT_FAILURE);
+ }));
+ }).then([this] {
+ return open_meta_coll();
+ }).then([this] {
+ return pg_shard_manager.get_meta_coll().load_superblock(
+ ).handle_error(
+ crimson::ct_error::assert_all("open_meta_coll error")
+ );
+ }).then([this](OSDSuperblock&& sb) {
+ superblock = std::move(sb);
+ pg_shard_manager.set_superblock(superblock);
+ return pg_shard_manager.get_local_map(superblock.current_epoch);
+ }).then([this](OSDMapService::local_cached_map_t&& map) {
+ osdmap = make_local_shared_foreign(OSDMapService::local_cached_map_t(map));
+ return pg_shard_manager.update_map(std::move(map));
+ }).then([this] {
+ return shard_services.invoke_on_all([this](auto &local_service) {
+ local_service.local_state.osdmap_gate.got_map(osdmap->get_epoch());
+ });
+ }).then([this] {
+ bind_epoch = osdmap->get_epoch();
+ return pg_shard_manager.load_pgs(store);
+ }).then([this] {
+ uint64_t osd_required =
+ CEPH_FEATURE_UID |
+ CEPH_FEATURE_PGID64 |
+ CEPH_FEATURE_OSDENC;
+ using crimson::net::SocketPolicy;
+
+ public_msgr->set_default_policy(SocketPolicy::stateless_server(0));
+ public_msgr->set_policy(entity_name_t::TYPE_MON,
+ SocketPolicy::lossy_client(osd_required));
+ public_msgr->set_policy(entity_name_t::TYPE_MGR,
+ SocketPolicy::lossy_client(osd_required));
+ public_msgr->set_policy(entity_name_t::TYPE_OSD,
+ SocketPolicy::stateless_server(0));
+
+ cluster_msgr->set_default_policy(SocketPolicy::stateless_server(0));
+ cluster_msgr->set_policy(entity_name_t::TYPE_MON,
+ SocketPolicy::lossy_client(0));
+ cluster_msgr->set_policy(entity_name_t::TYPE_OSD,
+ SocketPolicy::lossless_peer(osd_required));
+ cluster_msgr->set_policy(entity_name_t::TYPE_CLIENT,
+ SocketPolicy::stateless_server(0));
+
+ crimson::net::dispatchers_t dispatchers{this, monc.get(), mgrc.get()};
+ return seastar::when_all_succeed(
+ cluster_msgr->bind(pick_addresses(CEPH_PICK_ADDRESS_CLUSTER))
+ .safe_then([this, dispatchers]() mutable {
+ return cluster_msgr->start(dispatchers);
+ }, crimson::net::Messenger::bind_ertr::all_same_way(
+ [] (const std::error_code& e) {
+ logger().error("cluster messenger bind(): {}", e);
+ ceph_abort();
+ })),
+ public_msgr->bind(pick_addresses(CEPH_PICK_ADDRESS_PUBLIC))
+ .safe_then([this, dispatchers]() mutable {
+ return public_msgr->start(dispatchers);
+ }, crimson::net::Messenger::bind_ertr::all_same_way(
+ [] (const std::error_code& e) {
+ logger().error("public messenger bind(): {}", e);
+ ceph_abort();
+ })));
+ }).then_unpack([this] {
+ return seastar::when_all_succeed(monc->start(),
+ mgrc->start());
+ }).then_unpack([this] {
+ return _add_me_to_crush();
+ }).then([this] {
+ monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0);
+ monc->sub_want("mgrmap", 0, 0);
+ monc->sub_want("osdmap", 0, 0);
+ return monc->renew_subs();
+ }).then([this] {
+ if (auto [addrs, changed] =
+ replace_unknown_addrs(cluster_msgr->get_myaddrs(),
+ public_msgr->get_myaddrs()); changed) {
+ logger().debug("replacing unkwnown addrs of cluster messenger");
+ cluster_msgr->set_myaddrs(addrs);
+ }
+ return heartbeat->start(pick_addresses(CEPH_PICK_ADDRESS_PUBLIC),
+ pick_addresses(CEPH_PICK_ADDRESS_CLUSTER));
+ }).then([this] {
+ // create the admin-socket server, and the objects that register
+ // to handle incoming commands
+ return start_asok_admin();
+ }).then([this] {
+ return log_client.set_fsid(monc->get_fsid());
+ }).then([this] {
+ return start_boot();
+ });
+}
+
+seastar::future<> OSD::start_boot()
+{
+ pg_shard_manager.set_preboot();
+ return monc->get_version("osdmap").then([this](auto&& ret) {
+ auto [newest, oldest] = ret;
+ return _preboot(oldest, newest);
+ });
+}
+
+seastar::future<> OSD::_preboot(version_t oldest, version_t newest)
+{
+ logger().info("osd.{}: _preboot", whoami);
+ if (osdmap->get_epoch() == 0) {
+ logger().info("waiting for initial osdmap");
+ } else if (osdmap->is_destroyed(whoami)) {
+ logger().warn("osdmap says I am destroyed");
+ // provide a small margin so we don't livelock seeing if we
+ // un-destroyed ourselves.
+ if (osdmap->get_epoch() > newest - 1) {
+ throw std::runtime_error("i am destroyed");
+ }
+ } else if (osdmap->is_noup(whoami)) {
+ logger().warn("osdmap NOUP flag is set, waiting for it to clear");
+ } else if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE)) {
+ logger().error("osdmap SORTBITWISE OSDMap flag is NOT set; please set it");
+ } else if (osdmap->require_osd_release < ceph_release_t::octopus) {
+ logger().error("osdmap require_osd_release < octopus; please upgrade to octopus");
+ } else if (false) {
+ // TODO: update mon if current fullness state is different from osdmap
+ } else if (version_t n = local_conf()->osd_map_message_max;
+ osdmap->get_epoch() >= oldest - 1 &&
+ osdmap->get_epoch() + n > newest) {
+ return _send_boot();
+ }
+ // get all the latest maps
+ if (osdmap->get_epoch() + 1 >= oldest) {
+ return get_shard_services().osdmap_subscribe(osdmap->get_epoch() + 1, false);
+ } else {
+ return get_shard_services().osdmap_subscribe(oldest - 1, true);
+ }
+}
+
+seastar::future<> OSD::_send_boot()
+{
+ pg_shard_manager.set_booting();
+
+ entity_addrvec_t public_addrs = public_msgr->get_myaddrs();
+ entity_addrvec_t cluster_addrs = cluster_msgr->get_myaddrs();
+ entity_addrvec_t hb_back_addrs = heartbeat->get_back_addrs();
+ entity_addrvec_t hb_front_addrs = heartbeat->get_front_addrs();
+ if (cluster_msgr->set_addr_unknowns(public_addrs)) {
+ cluster_addrs = cluster_msgr->get_myaddrs();
+ }
+ if (heartbeat->get_back_msgr().set_addr_unknowns(cluster_addrs)) {
+ hb_back_addrs = heartbeat->get_back_addrs();
+ }
+ if (heartbeat->get_front_msgr().set_addr_unknowns(public_addrs)) {
+ hb_front_addrs = heartbeat->get_front_addrs();
+ }
+ logger().info("hb_back_msgr: {}", hb_back_addrs);
+ logger().info("hb_front_msgr: {}", hb_front_addrs);
+ logger().info("cluster_msgr: {}", cluster_addrs);
+
+ auto m = crimson::make_message<MOSDBoot>(superblock,
+ osdmap->get_epoch(),
+ boot_epoch,
+ hb_back_addrs,
+ hb_front_addrs,
+ cluster_addrs,
+ CEPH_FEATURES_ALL);
+ collect_sys_info(&m->metadata, NULL);
+
+ // See OSDMonitor::preprocess_boot, prevents boot without allow_crimson
+ // OSDMap flag
+ m->metadata["osd_type"] = "crimson";
+ return monc->send_message(std::move(m));
+}
+
+seastar::future<> OSD::_add_me_to_crush()
+{
+ if (!local_conf().get_val<bool>("osd_crush_update_on_start")) {
+ return seastar::now();
+ }
+ auto get_weight = [this] {
+ if (auto w = local_conf().get_val<double>("osd_crush_initial_weight");
+ w >= 0) {
+ return seastar::make_ready_future<double>(w);
+ } else {
+ return store.stat().then([](auto st) {
+ auto total = st.total;
+ return seastar::make_ready_future<double>(
+ std::max(.00001,
+ double(total) / double(1ull << 40))); // TB
+ });
+ }
+ };
+ return get_weight().then([this](auto weight) {
+ const crimson::crush::CrushLocation loc;
+ return seastar::do_with(
+ std::move(loc),
+ [this, weight] (crimson::crush::CrushLocation& loc) {
+ return loc.init_on_startup().then([this, weight, &loc]() {
+ logger().info("crush location is {}", loc);
+ string cmd = fmt::format(R"({{
+ "prefix": "osd crush create-or-move",
+ "id": {},
+ "weight": {:.4f},
+ "args": [{}]
+ }})", whoami, weight, loc);
+ return monc->run_command(std::move(cmd), {});
+ });
+ });
+ }).then([](auto&& command_result) {
+ [[maybe_unused]] auto [code, message, out] = std::move(command_result);
+ if (code) {
+ logger().warn("fail to add to crush: {} ({})", message, code);
+ throw std::runtime_error("fail to add to crush");
+ } else {
+ logger().info("added to crush: {}", message);
+ }
+ return seastar::now();
+ });
+}
+
+seastar::future<> OSD::handle_command(
+ crimson::net::ConnectionRef conn,
+ Ref<MCommand> m)
+{
+ ceph_assert(seastar::this_shard_id() == PRIMARY_CORE);
+ return asok->handle_command(conn, std::move(m));
+}
+
+/*
+  The OSD's Admin Socket object created here has two servers (i.e., blocks of
+  commands to handle) registered to it:
+  - OSD-specific commands are handled by the OSD object;
+  - some common commands are registered to be handled directly by the
+    AdminSocket object itself.
+*/
+seastar::future<> OSD::start_asok_admin()
+{
+ auto asok_path = local_conf().get_val<std::string>("admin_socket");
+ using namespace crimson::admin;
+ return asok->start(asok_path).then([this] {
+ asok->register_admin_commands();
+ asok->register_command(make_asok_hook<OsdStatusHook>(std::as_const(*this)));
+ asok->register_command(make_asok_hook<SendBeaconHook>(*this));
+ asok->register_command(make_asok_hook<FlushPgStatsHook>(*this));
+ asok->register_command(
+ make_asok_hook<DumpPGStateHistory>(std::as_const(pg_shard_manager)));
+ asok->register_command(make_asok_hook<DumpMetricsHook>());
+ asok->register_command(make_asok_hook<DumpPerfCountersHook>());
+ asok->register_command(make_asok_hook<InjectDataErrorHook>(get_shard_services()));
+ asok->register_command(make_asok_hook<InjectMDataErrorHook>(get_shard_services()));
+ // PG commands
+ asok->register_command(make_asok_hook<pg::QueryCommand>(*this));
+ asok->register_command(make_asok_hook<pg::MarkUnfoundLostCommand>(*this));
+ // ops commands
+ asok->register_command(
+ make_asok_hook<DumpInFlightOpsHook>(
+ std::as_const(pg_shard_manager)));
+ asok->register_command(
+ make_asok_hook<DumpHistoricOpsHook>(
+ std::as_const(get_shard_services().get_registry())));
+ asok->register_command(
+ make_asok_hook<DumpSlowestHistoricOpsHook>(
+ std::as_const(get_shard_services().get_registry())));
+ asok->register_command(
+ make_asok_hook<DumpRecoveryReservationsHook>(get_shard_services()));
+ });
+}
+
+seastar::future<> OSD::stop()
+{
+ logger().info("stop");
+ beacon_timer.cancel();
+ tick_timer.cancel();
+ // see also OSD::shutdown()
+ return prepare_to_stop().then([this] {
+ return pg_shard_manager.set_stopping();
+ }).then([this] {
+ logger().debug("prepared to stop");
+ public_msgr->stop();
+ cluster_msgr->stop();
+ auto gate_close_fut = gate.close();
+ return asok->stop().then([this] {
+ return heartbeat->stop();
+ }).then([this] {
+ return pg_shard_manager.stop_registries();
+ }).then([this] {
+ return store.umount();
+ }).then([this] {
+ return store.stop();
+ }).then([this] {
+ return pg_shard_manager.stop_pgs();
+ }).then([this] {
+ return monc->stop();
+ }).then([this] {
+ return mgrc->stop();
+ }).then([this] {
+ return shard_services.stop();
+ }).then([this] {
+ return osd_states.stop();
+ }).then([this] {
+ return osd_singleton_state.stop();
+ }).then([this] {
+ return pg_to_shard_mappings.stop();
+ }).then([fut=std::move(gate_close_fut)]() mutable {
+ return std::move(fut);
+ }).then([this] {
+ return when_all_succeed(
+ public_msgr->shutdown(),
+ cluster_msgr->shutdown()).discard_result();
+ }).handle_exception([](auto ep) {
+ logger().error("error while stopping osd: {}", ep);
+ });
+ });
+}
+
+void OSD::dump_status(Formatter* f) const
+{
+ f->dump_stream("cluster_fsid") << superblock.cluster_fsid;
+ f->dump_stream("osd_fsid") << superblock.osd_fsid;
+ f->dump_unsigned("whoami", superblock.whoami);
+ f->dump_string("state", pg_shard_manager.get_osd_state_string());
+ f->dump_unsigned("oldest_map", superblock.oldest_map);
+ f->dump_unsigned("cluster_osdmap_trim_lower_bound",
+ superblock.cluster_osdmap_trim_lower_bound);
+ f->dump_unsigned("newest_map", superblock.newest_map);
+ f->dump_unsigned("num_pgs", pg_shard_manager.get_num_pgs());
+}
+
+void OSD::print(std::ostream& out) const
+{
+ out << "{osd." << superblock.whoami << " "
+ << superblock.osd_fsid << " [" << superblock.oldest_map
+ << "," << superblock.newest_map << "] "
+ << "tlb:" << superblock.cluster_osdmap_trim_lower_bound
+ << " pgs:" << pg_shard_manager.get_num_pgs()
+ << "}";
+}
+
+std::optional<seastar::future<>>
+OSD::ms_dispatch(crimson::net::ConnectionRef conn, MessageRef m)
+{
+ if (pg_shard_manager.is_stopping()) {
+ return seastar::now();
+ }
+ auto maybe_ret = do_ms_dispatch(conn, std::move(m));
+ if (!maybe_ret.has_value()) {
+ return std::nullopt;
+ }
+
+ gate.dispatch_in_background(
+ __func__, *this, [ret=std::move(maybe_ret.value())]() mutable {
+ return std::move(ret);
+ });
+ return seastar::now();
+}
+
+std::optional<seastar::future<>>
+OSD::do_ms_dispatch(
+ crimson::net::ConnectionRef conn,
+ MessageRef m)
+{
+ if (seastar::this_shard_id() != PRIMARY_CORE) {
+ switch (m->get_type()) {
+ case CEPH_MSG_OSD_MAP:
+ case MSG_COMMAND:
+ case MSG_OSD_MARK_ME_DOWN:
+ // FIXME: order is not guaranteed in this path
+ return conn.get_foreign(
+ ).then([this, m=std::move(m)](auto f_conn) {
+ return seastar::smp::submit_to(PRIMARY_CORE,
+ [f_conn=std::move(f_conn), m=std::move(m), this]() mutable {
+ auto conn = make_local_shared_foreign(std::move(f_conn));
+ auto ret = do_ms_dispatch(conn, std::move(m));
+ assert(ret.has_value());
+ return std::move(ret.value());
+ });
+ });
+ }
+ }
+
+ switch (m->get_type()) {
+ case CEPH_MSG_OSD_MAP:
+ return handle_osd_map(boost::static_pointer_cast<MOSDMap>(m));
+ case CEPH_MSG_OSD_OP:
+ return handle_osd_op(conn, boost::static_pointer_cast<MOSDOp>(m));
+ case MSG_OSD_PG_CREATE2:
+ return handle_pg_create(
+ conn, boost::static_pointer_cast<MOSDPGCreate2>(m));
+ case MSG_COMMAND:
+ return handle_command(conn, boost::static_pointer_cast<MCommand>(m));
+ case MSG_OSD_MARK_ME_DOWN:
+ return handle_mark_me_down(conn, boost::static_pointer_cast<MOSDMarkMeDown>(m));
+ case MSG_OSD_PG_PULL:
+ [[fallthrough]];
+ case MSG_OSD_PG_PUSH:
+ [[fallthrough]];
+ case MSG_OSD_PG_PUSH_REPLY:
+ [[fallthrough]];
+ case MSG_OSD_PG_RECOVERY_DELETE:
+ [[fallthrough]];
+ case MSG_OSD_PG_RECOVERY_DELETE_REPLY:
+ [[fallthrough]];
+ case MSG_OSD_PG_SCAN:
+ [[fallthrough]];
+ case MSG_OSD_PG_BACKFILL:
+ [[fallthrough]];
+ case MSG_OSD_PG_BACKFILL_REMOVE:
+ return handle_recovery_subreq(conn, boost::static_pointer_cast<MOSDFastDispatchOp>(m));
+ case MSG_OSD_PG_LEASE:
+ [[fallthrough]];
+ case MSG_OSD_PG_LEASE_ACK:
+ [[fallthrough]];
+ case MSG_OSD_PG_NOTIFY2:
+ [[fallthrough]];
+ case MSG_OSD_PG_INFO2:
+ [[fallthrough]];
+ case MSG_OSD_PG_QUERY2:
+ [[fallthrough]];
+ case MSG_OSD_BACKFILL_RESERVE:
+ [[fallthrough]];
+ case MSG_OSD_RECOVERY_RESERVE:
+ [[fallthrough]];
+ case MSG_OSD_PG_LOG:
+ return handle_peering_op(conn, boost::static_pointer_cast<MOSDPeeringOp>(m));
+ case MSG_OSD_REPOP:
+ return handle_rep_op(conn, boost::static_pointer_cast<MOSDRepOp>(m));
+ case MSG_OSD_REPOPREPLY:
+ return handle_rep_op_reply(conn, boost::static_pointer_cast<MOSDRepOpReply>(m));
+ case MSG_OSD_SCRUB2:
+ return handle_scrub(conn, boost::static_pointer_cast<MOSDScrub2>(m));
+ case MSG_OSD_PG_UPDATE_LOG_MISSING:
+ return handle_update_log_missing(conn, boost::static_pointer_cast<
+ MOSDPGUpdateLogMissing>(m));
+ case MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY:
+ return handle_update_log_missing_reply(conn, boost::static_pointer_cast<
+ MOSDPGUpdateLogMissingReply>(m));
+ default:
+ return std::nullopt;
+ }
+}
+
+void OSD::ms_handle_reset(crimson::net::ConnectionRef conn, bool is_replace)
+{
+ // TODO: cleanup the session attached to this connection
+ logger().warn("ms_handle_reset");
+}
+
+void OSD::ms_handle_remote_reset(crimson::net::ConnectionRef conn)
+{
+ logger().warn("ms_handle_remote_reset");
+}
+
+void OSD::handle_authentication(const EntityName& name,
+ const AuthCapsInfo& caps_info)
+{
+ // TODO: store the parsed cap and associate it with the connection
+ if (caps_info.allow_all) {
+ logger().debug("{} {} has all caps", __func__, name);
+ return;
+ }
+ if (caps_info.caps.length() > 0) {
+ auto p = caps_info.caps.cbegin();
+ string str;
+ try {
+ decode(str, p);
+ } catch (ceph::buffer::error& e) {
+ logger().warn("{} {} failed to decode caps string", __func__, name);
+ return;
+ }
+ OSDCap caps;
+ if (caps.parse(str)) {
+ logger().debug("{} {} has caps {}", __func__, name, str);
+ } else {
+ logger().warn("{} {} failed to parse caps {}", __func__, name, str);
+ }
+ }
+}
+
+void OSD::update_stats()
+{
+ osd_stat_seq++;
+ osd_stat.up_from = get_shard_services().get_up_epoch();
+ osd_stat.hb_peers = heartbeat->get_peers();
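+  // the stat seq packs the up epoch into the high 32 bits and the
+  // per-boot counter into the low 32 bits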
+ osd_stat.seq = (
+ static_cast<uint64_t>(get_shard_services().get_up_epoch()) << 32
+ ) | osd_stat_seq;
+ gate.dispatch_in_background("statfs", *this, [this] {
+ (void) store.stat().then([this](store_statfs_t&& st) {
+ osd_stat.statfs = st;
+ });
+ });
+}
+
+seastar::future<MessageURef> OSD::get_stats() const
+{
+ // MPGStats::had_map_for is not used since PGMonitor was removed
+ auto m = crimson::make_message<MPGStats>(monc->get_fsid(), osdmap->get_epoch());
+ m->osd_stat = osd_stat;
+ return pg_shard_manager.get_pg_stats(
+ ).then([m=std::move(m)](auto &&stats) mutable {
+ m->pg_stat = std::move(stats);
+ return seastar::make_ready_future<MessageURef>(std::move(m));
+ });
+}
+
+uint64_t OSD::send_pg_stats()
+{
+ // mgr client sends the report message in background
+ mgrc->report();
+ return osd_stat.seq;
+}
+
+seastar::future<> OSD::handle_osd_map(Ref<MOSDMap> m)
+{
+ /* Ensure that only one MOSDMap is processed at a time. Allowing concurrent
+ * processing may eventually be worthwhile, but such an implementation would
+ * need to ensure (among other things)
+ * 1. any particular map is only processed once
+ * 2. PGAdvanceMap operations are processed in order for each PG
+ * As map handling is not presently a bottleneck, we stick to this
+ * simpler invariant for now.
+ * See https://tracker.ceph.com/issues/59165
+ */
+ ceph_assert(seastar::this_shard_id() == PRIMARY_CORE);
+ return handle_osd_map_lock.lock().then([this, m] {
+ return _handle_osd_map(m);
+ }).finally([this] {
+ return handle_osd_map_lock.unlock();
+ });
+}
+
+seastar::future<> OSD::_handle_osd_map(Ref<MOSDMap> m)
+{
+ logger().info("handle_osd_map {}", *m);
+ if (m->fsid != superblock.cluster_fsid) {
+ logger().warn("fsid mismatched");
+ return seastar::now();
+ }
+ if (pg_shard_manager.is_initializing()) {
+ logger().warn("i am still initializing");
+ return seastar::now();
+ }
+
+ const auto first = m->get_first();
+ const auto last = m->get_last();
+ logger().info("handle_osd_map epochs [{}..{}], i have {}, src has [{}..{}]",
+ first, last, superblock.newest_map,
+ m->cluster_osdmap_trim_lower_bound, m->newest_map);
+ // make sure there is something new, here, before we bother flushing
+ // the queues and such
+ if (last <= superblock.newest_map) {
+ return seastar::now();
+ }
+ // missing some?
+ bool skip_maps = false;
+ epoch_t start = superblock.newest_map + 1;
+ if (first > start) {
+ logger().info("handle_osd_map message skips epochs {}..{}",
+ start, first - 1);
+ if (m->cluster_osdmap_trim_lower_bound <= start) {
+ return get_shard_services().osdmap_subscribe(start, false);
+ }
+ // always try to get the full range of maps--as many as we can. this
+ // 1- is good to have
+ // 2- is at present the only way to ensure that we get a *full* map as
+ // the first map!
+ if (m->cluster_osdmap_trim_lower_bound < first) {
+ return get_shard_services().osdmap_subscribe(
+ m->cluster_osdmap_trim_lower_bound - 1, true);
+ }
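+    // the mon has already trimmed the epochs we are missing; skip ahead
+    // to the first epoch carried by this message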
+ skip_maps = true;
+ start = first;
+ }
+
+ return seastar::do_with(ceph::os::Transaction{},
+ [=, this](auto& t) {
+ return pg_shard_manager.store_maps(t, start, m).then([=, this, &t] {
+ // even if this map isn't from a mon, we may have satisfied our subscription
+ monc->sub_got("osdmap", last);
+ if (!superblock.oldest_map || skip_maps) {
+ superblock.oldest_map = first;
+ }
+ superblock.newest_map = last;
+ superblock.current_epoch = last;
+
+ // note in the superblock that we were clean thru the prior epoch
+ if (boot_epoch && boot_epoch >= superblock.mounted) {
+ superblock.mounted = boot_epoch;
+ superblock.clean_thru = last;
+ }
+ pg_shard_manager.get_meta_coll().store_superblock(t, superblock);
+ pg_shard_manager.set_superblock(superblock);
+ logger().debug("OSD::handle_osd_map: do_transaction...");
+ return store.get_sharded_store().do_transaction(
+ pg_shard_manager.get_meta_coll().collection(),
+ std::move(t));
+ });
+ }).then([=, this] {
+ // TODO: write to superblock and commit the transaction
+ return committed_osd_maps(start, last, m);
+ });
+}
+
+seastar::future<> OSD::committed_osd_maps(
+ version_t first,
+ version_t last,
+ Ref<MOSDMap> m)
+{
+ ceph_assert(seastar::this_shard_id() == PRIMARY_CORE);
+ logger().info("osd.{}: committed_osd_maps({}, {})", whoami, first, last);
+ // advance through the new maps
+ return seastar::do_for_each(boost::make_counting_iterator(first),
+ boost::make_counting_iterator(last + 1),
+ [this](epoch_t cur) {
+ return pg_shard_manager.get_local_map(
+ cur
+ ).then([this](OSDMapService::local_cached_map_t&& o) {
+ osdmap = make_local_shared_foreign(OSDMapService::local_cached_map_t(o));
+ return pg_shard_manager.update_map(std::move(o));
+ }).then([this] {
+ if (get_shard_services().get_up_epoch() == 0 &&
+ osdmap->is_up(whoami) &&
+ osdmap->get_addrs(whoami) == public_msgr->get_myaddrs()) {
+ return pg_shard_manager.set_up_epoch(
+ osdmap->get_epoch()
+ ).then([this] {
+ if (!boot_epoch) {
+ boot_epoch = osdmap->get_epoch();
+ }
+ });
+ } else {
+ return seastar::now();
+ }
+ });
+ }).then([m, this] {
+ auto fut = seastar::now();
+ if (osdmap->is_up(whoami)) {
+ const auto up_from = osdmap->get_up_from(whoami);
+ logger().info("osd.{}: map e {} marked me up: up_from {}, bind_epoch {}, state {}",
+ whoami, osdmap->get_epoch(), up_from, bind_epoch,
+ pg_shard_manager.get_osd_state_string());
+ if (bind_epoch < up_from &&
+ osdmap->get_addrs(whoami) == public_msgr->get_myaddrs() &&
+ pg_shard_manager.is_booting()) {
+ logger().info("osd.{}: activating...", whoami);
+ fut = pg_shard_manager.set_active().then([this] {
+ beacon_timer.arm_periodic(
+ std::chrono::seconds(local_conf()->osd_beacon_report_interval));
+ // timer continuation rearms when complete
+ tick_timer.arm(
+ std::chrono::seconds(TICK_INTERVAL));
+ });
+ }
+ } else {
+ if (pg_shard_manager.is_prestop()) {
+ got_stop_ack();
+ return seastar::now();
+ }
+ }
+ return fut.then([this] {
+ return check_osdmap_features().then([this] {
+ // yay!
+ logger().info("osd.{}: committed_osd_maps: broadcasting osdmaps up"
+ " to {} epoch to pgs", whoami, osdmap->get_epoch());
+ return pg_shard_manager.broadcast_map_to_pgs(osdmap->get_epoch());
+ });
+ });
+ }).then([m, this] {
+ if (pg_shard_manager.is_active()) {
+ logger().info("osd.{}: now active", whoami);
+ if (!osdmap->exists(whoami) ||
+ osdmap->is_stop(whoami)) {
+ return shutdown();
+ }
+ if (should_restart()) {
+ return restart();
+ } else {
+ return seastar::now();
+ }
+ } else if (pg_shard_manager.is_preboot()) {
+ logger().info("osd.{}: now preboot", whoami);
+
+ if (m->get_source().is_mon()) {
+ return _preboot(
+ m->cluster_osdmap_trim_lower_bound, m->newest_map);
+ } else {
+ logger().info("osd.{}: start_boot", whoami);
+ return start_boot();
+ }
+ } else {
+ logger().info("osd.{}: now {}", whoami,
+ pg_shard_manager.get_osd_state_string());
+ // XXX
+ return seastar::now();
+ }
+ });
+}
+
+seastar::future<> OSD::handle_osd_op(
+ crimson::net::ConnectionRef conn,
+ Ref<MOSDOp> m)
+{
+ return pg_shard_manager.start_pg_operation<ClientRequest>(
+ get_shard_services(),
+ conn,
+ std::move(m)).second;
+}
+
+seastar::future<> OSD::handle_pg_create(
+ crimson::net::ConnectionRef conn,
+ Ref<MOSDPGCreate2> m)
+{
+ return seastar::do_for_each(m->pgs, [this, conn, m](auto& pg) {
+ auto& [pgid, when] = pg;
+ const auto &[created, created_stamp] = when;
+ auto q = m->pg_extra.find(pgid);
+ ceph_assert(q != m->pg_extra.end());
+ auto& [history, pi] = q->second;
+ logger().debug(
+ "{}: {} e{} @{} "
+ "history {} pi {}",
+ __func__, pgid, created, created_stamp,
+ history, pi);
+ if (!pi.empty() &&
+ m->epoch < pi.get_bounds().second) {
+ logger().error(
+ "got pg_create on {} epoch {} "
+ "unmatched past_intervals {} (history {})",
+ pgid, m->epoch,
+ pi, history);
+ return seastar::now();
+ } else {
+ return pg_shard_manager.start_pg_operation<RemotePeeringEvent>(
+ conn,
+ pg_shard_t(),
+ pgid,
+ m->epoch,
+ m->epoch,
+ NullEvt(),
+ true,
+ new PGCreateInfo(pgid, m->epoch, history, pi, true)).second;
+ }
+ });
+}
+
+seastar::future<> OSD::handle_update_log_missing(
+ crimson::net::ConnectionRef conn,
+ Ref<MOSDPGUpdateLogMissing> m)
+{
+ m->decode_payload();
+ return pg_shard_manager.start_pg_operation<LogMissingRequest>(
+ std::move(conn),
+ std::move(m)).second;
+}
+
+seastar::future<> OSD::handle_update_log_missing_reply(
+ crimson::net::ConnectionRef conn,
+ Ref<MOSDPGUpdateLogMissingReply> m)
+{
+ m->decode_payload();
+ return pg_shard_manager.start_pg_operation<LogMissingRequestReply>(
+ std::move(conn),
+ std::move(m)).second;
+}
+
+seastar::future<> OSD::handle_rep_op(
+ crimson::net::ConnectionRef conn,
+ Ref<MOSDRepOp> m)
+{
+ m->finish_decode();
+ return pg_shard_manager.start_pg_operation<RepRequest>(
+ std::move(conn),
+ std::move(m)).second;
+}
+
+seastar::future<> OSD::handle_rep_op_reply(
+ crimson::net::ConnectionRef conn,
+ Ref<MOSDRepOpReply> m)
+{
+ spg_t pgid = m->get_spg();
+ return pg_shard_manager.with_pg(
+ pgid,
+ [m=std::move(m)](auto &&pg) {
+ if (pg) {
+ m->finish_decode();
+ pg->handle_rep_op_reply(*m);
+ } else {
+ logger().warn("stale reply: {}", *m);
+ }
+ return seastar::now();
+ });
+}
+
+seastar::future<> OSD::handle_scrub(
+ crimson::net::ConnectionRef conn,
+ Ref<MOSDScrub2> m)
+{
+ if (m->fsid != superblock.cluster_fsid) {
+ logger().warn("fsid mismatched");
+ return seastar::now();
+ }
+ return seastar::parallel_for_each(std::move(m->scrub_pgs),
+ [m, conn, this](spg_t pgid) {
+ pg_shard_t from_shard{static_cast<int>(m->get_source().num()),
+ pgid.shard};
+ PeeringState::RequestScrub scrub_request{m->deep, m->repair};
+ return pg_shard_manager.start_pg_operation<RemotePeeringEvent>(
+ conn,
+ from_shard,
+ pgid,
+ PGPeeringEvent{m->epoch, m->epoch, scrub_request}).second;
+ });
+}
+
+seastar::future<> OSD::handle_mark_me_down(
+ crimson::net::ConnectionRef conn,
+ Ref<MOSDMarkMeDown> m)
+{
+ ceph_assert(seastar::this_shard_id() == PRIMARY_CORE);
+ if (pg_shard_manager.is_prestop()) {
+ got_stop_ack();
+ }
+ return seastar::now();
+}
+
+seastar::future<> OSD::handle_recovery_subreq(
+ crimson::net::ConnectionRef conn,
+ Ref<MOSDFastDispatchOp> m)
+{
+ return pg_shard_manager.start_pg_operation<RecoverySubRequest>(
+ conn, std::move(m)).second;
+}
+
+bool OSD::should_restart() const
+{
+ if (!osdmap->is_up(whoami)) {
+ logger().info("map e {} marked osd.{} down",
+ osdmap->get_epoch(), whoami);
+ return true;
+ } else if (osdmap->get_addrs(whoami) != public_msgr->get_myaddrs()) {
+ logger().error("map e {} had wrong client addr ({} != my {})",
+ osdmap->get_epoch(),
+ osdmap->get_addrs(whoami),
+ public_msgr->get_myaddrs());
+ return true;
+ } else if (osdmap->get_cluster_addrs(whoami) != cluster_msgr->get_myaddrs()) {
+ logger().error("map e {} had wrong cluster addr ({} != my {})",
+ osdmap->get_epoch(),
+ osdmap->get_cluster_addrs(whoami),
+ cluster_msgr->get_myaddrs());
+ return true;
+ } else {
+ return false;
+ }
+}
+
+seastar::future<> OSD::restart()
+{
+ beacon_timer.cancel();
+ tick_timer.cancel();
+ return pg_shard_manager.set_up_epoch(
+ 0
+ ).then([this] {
+ bind_epoch = osdmap->get_epoch();
+ // TODO: promote to shutdown if being marked down for multiple times
+ // rebind messengers
+ return start_boot();
+ });
+}
+
+seastar::future<> OSD::shutdown()
+{
+ logger().info("shutting down per osdmap");
+ abort_source.request_abort();
+ return seastar::now();
+}
+
+seastar::future<> OSD::send_beacon()
+{
+ if (!pg_shard_manager.is_active()) {
+ return seastar::now();
+ }
+ // FIXME: min lec should be calculated from pg_stat
+ // and should set m->pgs
+ epoch_t min_last_epoch_clean = osdmap->get_epoch();
+ auto m = crimson::make_message<MOSDBeacon>(osdmap->get_epoch(),
+ min_last_epoch_clean,
+ superblock.last_purged_snaps_scrub,
+ local_conf()->osd_beacon_report_interval);
+ return monc->send_message(std::move(m));
+}
+
+seastar::future<> OSD::update_heartbeat_peers()
+{
+ if (!pg_shard_manager.is_active()) {
+    return seastar::now();
+ }
+
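+  // rebuild the heartbeat peer set from the up and acting sets of every local PG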
+ pg_shard_manager.for_each_pgid([this](auto &pgid) {
+ vector<int> up, acting;
+ osdmap->pg_to_up_acting_osds(pgid.pgid,
+ &up, nullptr,
+ &acting, nullptr);
+ for (int osd : boost::join(up, acting)) {
+ if (osd == CRUSH_ITEM_NONE || osd == whoami) {
+ continue;
+ } else {
+ heartbeat->add_peer(osd, osdmap->get_epoch());
+ }
+ }
+ });
+ heartbeat->update_peers(whoami);
+ return seastar::now();
+}
+
+seastar::future<> OSD::handle_peering_op(
+ crimson::net::ConnectionRef conn,
+ Ref<MOSDPeeringOp> m)
+{
+ const int from = m->get_source().num();
+ logger().debug("handle_peering_op on {} from {}", m->get_spg(), from);
+ m->set_features(conn->get_features());
+ std::unique_ptr<PGPeeringEvent> evt(m->get_event());
+ return pg_shard_manager.start_pg_operation<RemotePeeringEvent>(
+ conn,
+ pg_shard_t{from, m->get_spg().shard},
+ m->get_spg(),
+ std::move(*evt)).second;
+}
+
+seastar::future<> OSD::check_osdmap_features()
+{
+ assert(seastar::this_shard_id() == PRIMARY_CORE);
+ return store.write_meta(
+ "require_osd_release",
+ stringify((int)osdmap->require_osd_release));
+}
+
+seastar::future<> OSD::prepare_to_stop()
+{
+ if (osdmap && osdmap->is_up(whoami)) {
+ pg_shard_manager.set_prestop();
+ const auto timeout =
+ std::chrono::duration_cast<std::chrono::milliseconds>(
+ std::chrono::duration<double>(
+ local_conf().get_val<double>("osd_mon_shutdown_timeout")));
+
+ return seastar::with_timeout(
+ seastar::timer<>::clock::now() + timeout,
+ monc->send_message(
+ crimson::make_message<MOSDMarkMeDown>(
+ monc->get_fsid(),
+ whoami,
+ osdmap->get_addrs(whoami),
+ osdmap->get_epoch(),
+ true)).then([this] {
+ return stop_acked.get_future();
+ })
+ ).handle_exception_type(
+ [](seastar::timed_out_error&) {
+ return seastar::now();
+ });
+ }
+ return seastar::now();
+}
+
+}
diff --git a/src/crimson/osd/osd.h b/src/crimson/osd/osd.h
new file mode 100644
index 000000000..10ff60d47
--- /dev/null
+++ b/src/crimson/osd/osd.h
@@ -0,0 +1,251 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <seastar/core/abort_source.hh>
+#include <seastar/core/future.hh>
+#include <seastar/core/shared_future.hh>
+#include <seastar/core/gate.hh>
+#include <seastar/core/shared_ptr.hh>
+#include <seastar/core/timer.hh>
+
+#include "crimson/common/logclient.h"
+#include "crimson/common/type_helpers.h"
+#include "crimson/common/auth_handler.h"
+#include "crimson/common/gated.h"
+#include "crimson/admin/admin_socket.h"
+#include "crimson/common/simple_lru.h"
+#include "crimson/mgr/client.h"
+#include "crimson/net/Dispatcher.h"
+#include "crimson/osd/osdmap_service.h"
+#include "crimson/osd/pg_shard_manager.h"
+#include "crimson/osd/osdmap_gate.h"
+#include "crimson/osd/pg_map.h"
+#include "crimson/osd/osd_operations/peering_event.h"
+#include "crimson/osd/state.h"
+
+#include "messages/MOSDOp.h"
+#include "osd/PeeringState.h"
+#include "osd/osd_types.h"
+#include "osd/osd_perf_counters.h"
+#include "osd/PGPeeringEvent.h"
+
+class MCommand;
+class MOSDMap;
+class MOSDRepOpReply;
+class MOSDRepOp;
+class MOSDScrub2;
+class OSDMeta;
+class Heartbeat;
+
+namespace ceph::os {
+ class Transaction;
+}
+
+namespace crimson::mon {
+ class Client;
+}
+
+namespace crimson::net {
+ class Messenger;
+}
+
+namespace crimson::os {
+ class FuturizedStore;
+}
+
+namespace crimson::osd {
+class PG;
+
+class OSD final : public crimson::net::Dispatcher,
+ private crimson::common::AuthHandler,
+ private crimson::mgr::WithStats {
+ const int whoami;
+ const uint32_t nonce;
+ seastar::abort_source& abort_source;
+ seastar::timer<seastar::lowres_clock> beacon_timer;
+ // talk with osd
+ crimson::net::MessengerRef cluster_msgr;
+ // talk with client/mon/mgr
+ crimson::net::MessengerRef public_msgr;
+
+ // HB Messengers
+ crimson::net::MessengerRef hb_front_msgr;
+ crimson::net::MessengerRef hb_back_msgr;
+
+ std::unique_ptr<crimson::mon::Client> monc;
+ std::unique_ptr<crimson::mgr::Client> mgrc;
+
+ // TODO: use a wrapper for ObjectStore
+ OSDMapService::cached_map_t osdmap;
+ crimson::os::FuturizedStore& store;
+
+ /// _first_ epoch we were marked up (after this process started)
+ epoch_t boot_epoch = 0;
+  /// epoch of our last bind to new ip:ports
+  epoch_t bind_epoch = 0;
+  /// epoch since which there have been no pending pg creates from the mon
+  epoch_t last_pg_create_epoch = 0;
+
+ ceph::mono_time startup_time;
+
+ seastar::shared_mutex handle_osd_map_lock;
+
+ OSDSuperblock superblock;
+
+ // Dispatcher methods
+ std::optional<seastar::future<>> ms_dispatch(crimson::net::ConnectionRef, MessageRef) final;
+ void ms_handle_reset(crimson::net::ConnectionRef conn, bool is_replace) final;
+ void ms_handle_remote_reset(crimson::net::ConnectionRef conn) final;
+
+ std::optional<seastar::future<>> do_ms_dispatch(crimson::net::ConnectionRef, MessageRef);
+
+ // mgr::WithStats methods
+ // pg statistics including osd ones
+ osd_stat_t osd_stat;
+ uint32_t osd_stat_seq = 0;
+ void update_stats();
+ seastar::future<MessageURef> get_stats() const final;
+
+ // AuthHandler methods
+ void handle_authentication(const EntityName& name,
+ const AuthCapsInfo& caps) final;
+
+ seastar::sharded<PGShardMapping> pg_to_shard_mappings;
+ seastar::sharded<OSDSingletonState> osd_singleton_state;
+ seastar::sharded<OSDState> osd_states;
+ seastar::sharded<ShardServices> shard_services;
+
+ crimson::osd::PGShardManager pg_shard_manager;
+
+ std::unique_ptr<Heartbeat> heartbeat;
+ seastar::timer<seastar::lowres_clock> tick_timer;
+
+ // admin-socket
+ seastar::lw_shared_ptr<crimson::admin::AdminSocket> asok;
+
+public:
+ OSD(int id, uint32_t nonce,
+ seastar::abort_source& abort_source,
+ crimson::os::FuturizedStore& store,
+ crimson::net::MessengerRef cluster_msgr,
+ crimson::net::MessengerRef client_msgr,
+ crimson::net::MessengerRef hb_front_msgr,
+ crimson::net::MessengerRef hb_back_msgr);
+ ~OSD() final;
+
+ auto &get_pg_shard_manager() {
+ return pg_shard_manager;
+ }
+
+ seastar::future<> open_meta_coll();
+ static seastar::future<OSDMeta> open_or_create_meta_coll(
+ crimson::os::FuturizedStore &store
+ );
+ static seastar::future<> mkfs(
+ crimson::os::FuturizedStore &store,
+ unsigned whoami,
+ uuid_d osd_uuid,
+ uuid_d cluster_fsid,
+ std::string osdspec_affinity);
+
+ seastar::future<> start();
+ seastar::future<> stop();
+
+ void dump_status(Formatter*) const;
+ void print(std::ostream&) const;
+
+ /// @return the seq id of the pg stats being sent
+ uint64_t send_pg_stats();
+
+ auto &get_shard_services() {
+ return shard_services.local();
+ }
+
+private:
+ static seastar::future<> _write_superblock(
+ crimson::os::FuturizedStore &store,
+ OSDMeta meta,
+ OSDSuperblock superblock);
+ static seastar::future<> _write_key_meta(
+ crimson::os::FuturizedStore &store
+ );
+ seastar::future<> start_boot();
+ seastar::future<> _preboot(version_t oldest_osdmap, version_t newest_osdmap);
+ seastar::future<> _send_boot();
+ seastar::future<> _add_me_to_crush();
+
+ seastar::future<> osdmap_subscribe(version_t epoch, bool force_request);
+
+ seastar::future<> start_asok_admin();
+
+ void write_superblock(ceph::os::Transaction& t);
+ seastar::future<> read_superblock();
+
+ seastar::future<> handle_osd_map(Ref<MOSDMap> m);
+ seastar::future<> _handle_osd_map(Ref<MOSDMap> m);
+ seastar::future<> handle_pg_create(crimson::net::ConnectionRef conn,
+ Ref<MOSDPGCreate2> m);
+ seastar::future<> handle_osd_op(crimson::net::ConnectionRef conn,
+ Ref<MOSDOp> m);
+ seastar::future<> handle_rep_op(crimson::net::ConnectionRef conn,
+ Ref<MOSDRepOp> m);
+ seastar::future<> handle_rep_op_reply(crimson::net::ConnectionRef conn,
+ Ref<MOSDRepOpReply> m);
+ seastar::future<> handle_peering_op(crimson::net::ConnectionRef conn,
+ Ref<MOSDPeeringOp> m);
+ seastar::future<> handle_recovery_subreq(crimson::net::ConnectionRef conn,
+ Ref<MOSDFastDispatchOp> m);
+ seastar::future<> handle_scrub(crimson::net::ConnectionRef conn,
+ Ref<MOSDScrub2> m);
+ seastar::future<> handle_mark_me_down(crimson::net::ConnectionRef conn,
+ Ref<MOSDMarkMeDown> m);
+
+ seastar::future<> committed_osd_maps(version_t first,
+ version_t last,
+ Ref<MOSDMap> m);
+
+ seastar::future<> check_osdmap_features();
+
+ seastar::future<> handle_command(crimson::net::ConnectionRef conn,
+ Ref<MCommand> m);
+ seastar::future<> handle_update_log_missing(crimson::net::ConnectionRef conn,
+ Ref<MOSDPGUpdateLogMissing> m);
+ seastar::future<> handle_update_log_missing_reply(
+ crimson::net::ConnectionRef conn,
+ Ref<MOSDPGUpdateLogMissingReply> m);
+
+private:
+ crimson::common::Gated gate;
+
+ seastar::promise<> stop_acked;
+ void got_stop_ack() {
+ stop_acked.set_value();
+ }
+ seastar::future<> prepare_to_stop();
+ bool should_restart() const;
+ seastar::future<> restart();
+ seastar::future<> shutdown();
+ seastar::future<> update_heartbeat_peers();
+ friend class PGAdvanceMap;
+
+public:
+ seastar::future<> send_beacon();
+
+private:
+ LogClient log_client;
+ LogChannelRef clog;
+};
+
+inline std::ostream& operator<<(std::ostream& out, const OSD& osd) {
+ osd.print(out);
+ return out;
+}
+
+}
+
+#if FMT_VERSION >= 90000
+template <> struct fmt::formatter<crimson::osd::OSD> : fmt::ostream_formatter {};
+#endif
diff --git a/src/crimson/osd/osd_connection_priv.h b/src/crimson/osd/osd_connection_priv.h
new file mode 100644
index 000000000..69edf94b8
--- /dev/null
+++ b/src/crimson/osd/osd_connection_priv.h
@@ -0,0 +1,27 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "crimson/net/Connection.h"
+#include "crimson/osd/osd_operation.h"
+#include "crimson/osd/osd_operations/client_request.h"
+#include "crimson/osd/osd_operations/peering_event.h"
+#include "crimson/osd/osd_operations/replicated_request.h"
+
+namespace crimson::osd {
+
+struct OSDConnectionPriv : public crimson::net::Connection::user_private_t {
+ ConnectionPipeline client_request_conn_pipeline;
+ ConnectionPipeline peering_request_conn_pipeline;
+ ConnectionPipeline replicated_request_conn_pipeline;
+};
+
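+// lazily attaches the per-connection pipelines on first access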
+static OSDConnectionPriv &get_osd_priv(crimson::net::Connection *conn) {
+ if (!conn->has_user_private()) {
+ conn->set_user_private(std::make_unique<OSDConnectionPriv>());
+ }
+ return static_cast<OSDConnectionPriv&>(conn->get_user_private());
+}
+
+}
diff --git a/src/crimson/osd/osd_meta.cc b/src/crimson/osd/osd_meta.cc
new file mode 100644
index 000000000..e40b2b246
--- /dev/null
+++ b/src/crimson/osd/osd_meta.cc
@@ -0,0 +1,98 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "osd_meta.h"
+
+#include <fmt/format.h>
+#include <fmt/ostream.h>
+
+#include "crimson/os/futurized_collection.h"
+#include "crimson/os/futurized_store.h"
+#include "os/Transaction.h"
+
+using std::string;
+using read_errorator = crimson::os::FuturizedStore::Shard::read_errorator;
+
+void OSDMeta::create(ceph::os::Transaction& t)
+{
+ t.create_collection(coll->get_cid(), 0);
+}
+
+void OSDMeta::store_map(ceph::os::Transaction& t,
+ epoch_t e, const bufferlist& m)
+{
+ t.write(coll->get_cid(), osdmap_oid(e), 0, m.length(), m);
+}
+
+seastar::future<bufferlist> OSDMeta::load_map(epoch_t e)
+{
+ return store.read(coll,
+ osdmap_oid(e), 0, 0,
+ CEPH_OSD_OP_FLAG_FADVISE_WILLNEED).handle_error(
+ read_errorator::all_same_way([e] {
+ ceph_abort_msg(fmt::format("{} read gave enoent on {}",
+ __func__, osdmap_oid(e)));
+ }));
+}
+
+void OSDMeta::store_superblock(ceph::os::Transaction& t,
+ const OSDSuperblock& superblock)
+{
+ bufferlist bl;
+ encode(superblock, bl);
+ t.write(coll->get_cid(), superblock_oid(), 0, bl.length(), bl);
+}
+
+OSDMeta::load_superblock_ret OSDMeta::load_superblock()
+{
+ return store.read(
+ coll, superblock_oid(), 0, 0
+ ).safe_then([] (bufferlist&& bl) {
+ auto p = bl.cbegin();
+ OSDSuperblock superblock;
+ decode(superblock, p);
+ return seastar::make_ready_future<OSDSuperblock>(std::move(superblock));
+ });
+}
+
+seastar::future<std::tuple<pg_pool_t,
+ std::string,
+ OSDMeta::ec_profile_t>>
+OSDMeta::load_final_pool_info(int64_t pool) {
+ return store.read(coll, final_pool_info_oid(pool),
+ 0, 0).safe_then([] (bufferlist&& bl) {
+ auto p = bl.cbegin();
+ pg_pool_t pi;
+ string name;
+ ec_profile_t ec_profile;
+ decode(pi, p);
+ decode(name, p);
+ decode(ec_profile, p);
+ return seastar::make_ready_future<std::tuple<pg_pool_t,
+ string,
+ ec_profile_t>>(
+ std::make_tuple(std::move(pi),
+ std::move(name),
+ std::move(ec_profile)));
+ },read_errorator::all_same_way([pool] {
+ throw std::runtime_error(fmt::format("read gave enoent on {}",
+ final_pool_info_oid(pool)));
+ }));
+}
+
+ghobject_t OSDMeta::osdmap_oid(epoch_t epoch)
+{
+ string name = fmt::format("osdmap.{}", epoch);
+ return ghobject_t(hobject_t(sobject_t(object_t(name), 0)));
+}
+
+ghobject_t OSDMeta::final_pool_info_oid(int64_t pool)
+{
+ string name = fmt::format("final_pool_{}", pool);
+ return ghobject_t(hobject_t(sobject_t(object_t(name), CEPH_NOSNAP)));
+}
+
+ghobject_t OSDMeta::superblock_oid()
+{
+ return ghobject_t(hobject_t(sobject_t(object_t("osd_superblock"), 0)));
+}
diff --git a/src/crimson/osd/osd_meta.h b/src/crimson/osd/osd_meta.h
new file mode 100644
index 000000000..652266d9e
--- /dev/null
+++ b/src/crimson/osd/osd_meta.h
@@ -0,0 +1,60 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <map>
+#include <string>
+#include <seastar/core/future.hh>
+#include "osd/osd_types.h"
+#include "crimson/os/futurized_collection.h"
+#include "crimson/os/futurized_store.h"
+
+namespace ceph::os {
+ class Transaction;
+}
+
+namespace crimson::os {
+ class FuturizedCollection;
+ class FuturizedStore;
+}
+
+/// metadata shared across PGs -- in other words,
+/// metadata not specific to any particular PG.
+class OSDMeta {
+ template<typename T> using Ref = boost::intrusive_ptr<T>;
+
+ crimson::os::FuturizedStore::Shard& store;
+ Ref<crimson::os::FuturizedCollection> coll;
+
+public:
+ OSDMeta(Ref<crimson::os::FuturizedCollection> coll,
+ crimson::os::FuturizedStore::Shard& store)
+ : store{store}, coll{coll}
+ {}
+
+ auto collection() {
+ return coll;
+ }
+ void create(ceph::os::Transaction& t);
+
+ void store_map(ceph::os::Transaction& t,
+ epoch_t e, const bufferlist& m);
+ seastar::future<bufferlist> load_map(epoch_t e);
+
+ void store_superblock(ceph::os::Transaction& t,
+ const OSDSuperblock& sb);
+
+ using load_superblock_ertr = crimson::os::FuturizedStore::Shard::read_errorator;
+ using load_superblock_ret = load_superblock_ertr::future<OSDSuperblock>;
+ load_superblock_ret load_superblock();
+
+ using ec_profile_t = std::map<std::string, std::string>;
+ seastar::future<std::tuple<pg_pool_t,
+ std::string,
+ ec_profile_t>> load_final_pool_info(int64_t pool);
+private:
+ static ghobject_t osdmap_oid(epoch_t epoch);
+ static ghobject_t final_pool_info_oid(int64_t pool);
+ static ghobject_t superblock_oid();
+};
diff --git a/src/crimson/osd/osd_operation.cc b/src/crimson/osd/osd_operation.cc
new file mode 100644
index 000000000..920fdc114
--- /dev/null
+++ b/src/crimson/osd/osd_operation.cc
@@ -0,0 +1,227 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "osd_operation.h"
+#include "common/Formatter.h"
+#include "crimson/common/log.h"
+#include "crimson/osd/osd_operations/client_request.h"
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_osd);
+ }
+}
+
+namespace crimson::osd {
+
+void OSDOperationRegistry::do_stop()
+{
+ logger().info("OSDOperationRegistry::{}", __func__);
+ // we need to decouple visiting the registry from destructing
+ // ops because of the auto-unlink feature of boost::intrusive.
+  // the list must not change while iterating because of constraints
+  // on iterator validity.
+ constexpr auto historic_reg_index =
+ static_cast<size_t>(OperationTypeCode::historic_client_request);
+ auto& historic_registry = get_registry<historic_reg_index>();
+ std::vector<ClientRequest::ICRef> to_ref_down;
+ std::transform(std::begin(historic_registry), std::end(historic_registry),
+ std::back_inserter(to_ref_down),
+ [] (const Operation& op) {
+ return ClientRequest::ICRef{
+ static_cast<const ClientRequest*>(&op),
+ /* add_ref= */ false
+ };
+ });
+ last_of_recents = std::end(historic_registry);
+  // to_ref_down goes out of scope here, dropping the references
+}
+
+OSDOperationRegistry::OSDOperationRegistry()
+ : OperationRegistryT(seastar::this_shard_id())
+{
+ constexpr auto historic_reg_index =
+ static_cast<size_t>(OperationTypeCode::historic_client_request);
+ auto& historic_registry = get_registry<historic_reg_index>();
+ last_of_recents = std::begin(historic_registry);
+}
+
+static auto get_duration(const ClientRequest& client_request)
+{
+ // TODO: consider enhancing `CompletionEvent` with computing duration
+  // once -- when it's entered.
+ return client_request.get_completed() - client_request.get_started();
+}
+
+void OSDOperationRegistry::put_historic(const ClientRequest& op)
+{
+ // unlink the op from the client request registry. this is a part of
+  // the re-link procedure. finally it will end up in the historic registry.
+ constexpr auto client_reg_index =
+ static_cast<size_t>(OperationTypeCode::client_request);
+ constexpr auto historic_reg_index =
+ static_cast<size_t>(OperationTypeCode::historic_client_request);
+ auto& client_registry = get_registry<client_reg_index>();
+ auto& historic_registry = get_registry<historic_reg_index>();
+ historic_registry.splice(std::end(historic_registry),
+ client_registry,
+ client_registry.iterator_to(op));
+ ClientRequest::ICRef(
+ &op, /* add_ref= */true
+ ).detach(); // yes, "leak" it for now!
+
+ // check whether the history size limit is not exceeded; if so, then
+ // purge the oldest op.
+ // NOTE: Operation uses the auto-unlink feature of boost::intrusive.
+ // NOTE: the cleaning happens in OSDOperationRegistry::do_stop()
+ using crimson::common::local_conf;
+ if (num_recent_ops >= local_conf()->osd_op_history_size) {
+ ++last_of_recents;
+ ++num_slow_ops;
+ } else {
+ ++num_recent_ops;
+ }
+ if (num_slow_ops > local_conf()->osd_op_history_slow_op_size) {
+ // we're interested in keeping slowest ops. if the slow op history
+ // is disabled, the list will have only one element, so the full-blown
+    // search will boil down to `.front()`.
+ const auto fastest_historic_iter = std::min_element(
+ std::cbegin(historic_registry), last_of_recents,
+ [] (const auto& lop, const auto& rop) {
+ const auto& lclient_request = static_cast<const ClientRequest&>(lop);
+ const auto& rclient_request = static_cast<const ClientRequest&>(rop);
+ return get_duration(lclient_request) < get_duration(rclient_request);
+ });
+ assert(fastest_historic_iter != std::end(historic_registry));
+ const auto& fastest_historic_op =
+ static_cast<const ClientRequest&>(*fastest_historic_iter);
+ historic_registry.erase(fastest_historic_iter);
+ // clear a previously "leaked" op
+ ClientRequest::ICRef(&fastest_historic_op, /* add_ref= */false);
+ --num_slow_ops;
+ }
+}
+
+size_t OSDOperationRegistry::dump_historic_client_requests(ceph::Formatter* f) const
+{
+ const auto& historic_client_registry =
+ get_registry<static_cast<size_t>(OperationTypeCode::historic_client_request)>(); //ClientRequest::type)>();
+ f->open_object_section("op_history");
+ f->dump_int("size", historic_client_registry.size());
+ // TODO: f->dump_int("duration", history_duration.load());
+ // the intrusive list is configured to not store the size
+ size_t ops_count = 0;
+ {
+ f->open_array_section("ops");
+ for (const auto& op : historic_client_registry) {
+ op.dump(f);
+ ++ops_count;
+ }
+ f->close_section();
+ }
+ f->close_section();
+ return ops_count;
+}
+
+size_t OSDOperationRegistry::dump_slowest_historic_client_requests(ceph::Formatter* f) const
+{
+ const auto& historic_client_registry =
+ get_registry<static_cast<size_t>(OperationTypeCode::historic_client_request)>(); //ClientRequest::type)>();
+ f->open_object_section("op_history");
+ f->dump_int("size", historic_client_registry.size());
+ // TODO: f->dump_int("duration", history_duration.load());
+ // the intrusive list is configured to not store the size
+ std::multimap<utime_t,
+ const ClientRequest*,
+ std::greater<utime_t>> sorted_slowest_ops;
+  // iterate over the entire registry, as a slow op could also be
+  // in the "recently added" part.
+ std::transform(std::begin(historic_client_registry),
+ std::end(historic_client_registry),
+ std::inserter(sorted_slowest_ops, std::end(sorted_slowest_ops)),
+ [] (const Operation& op) {
+ const auto& cop = static_cast<const ClientRequest&>(op);
+ return std::make_pair(get_duration(cop), &cop);
+ });
+ f->open_array_section("ops");
+ using crimson::common::local_conf;
+ size_t ops_count = 0;
+ for (auto it = std::begin(sorted_slowest_ops);
+ ops_count < local_conf()->osd_op_history_slow_op_size
+ && it != std::end(sorted_slowest_ops);
+ ++it, ++ops_count)
+ {
+ it->second->dump(f);
+ }
+ f->close_section();
+ return ops_count;
+}
+
+OperationThrottler::OperationThrottler(ConfigProxy &conf)
+ : scheduler(crimson::osd::scheduler::make_scheduler(conf))
+{
+ conf.add_observer(this);
+ update_from_config(conf);
+}
+
+void OperationThrottler::wake()
+{
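+  // admit queued items while there is capacity; max_in_progress == 0 means
+  // the throttle is unlimited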
+ while ((!max_in_progress || in_progress < max_in_progress) &&
+ !scheduler->empty()) {
+ auto item = scheduler->dequeue();
+ item.wake.set_value();
+ ++in_progress;
+ --pending;
+ }
+}
+
+void OperationThrottler::release_throttle()
+{
+ ceph_assert(in_progress > 0);
+ --in_progress;
+ wake();
+}
+
+seastar::future<> OperationThrottler::acquire_throttle(
+ crimson::osd::scheduler::params_t params)
+{
+ crimson::osd::scheduler::item_t item{params, seastar::promise<>()};
+ auto fut = item.wake.get_future();
+ scheduler->enqueue(std::move(item));
+ return fut;
+}
+
+void OperationThrottler::dump_detail(Formatter *f) const
+{
+ f->dump_unsigned("max_in_progress", max_in_progress);
+ f->dump_unsigned("in_progress", in_progress);
+ f->open_object_section("scheduler");
+ {
+ scheduler->dump(*f);
+ }
+ f->close_section();
+}
+
+void OperationThrottler::update_from_config(const ConfigProxy &conf)
+{
+ max_in_progress = conf.get_val<uint64_t>("crimson_osd_scheduler_concurrency");
+ wake();
+}
+
+const char** OperationThrottler::get_tracked_conf_keys() const
+{
+ static const char* KEYS[] = {
+ "crimson_osd_scheduler_concurrency",
+ NULL
+ };
+ return KEYS;
+}
+
+void OperationThrottler::handle_conf_change(
+ const ConfigProxy& conf,
+ const std::set<std::string> &changed)
+{
+ update_from_config(conf);
+}
+
+}
diff --git a/src/crimson/osd/osd_operation.h b/src/crimson/osd/osd_operation.h
new file mode 100644
index 000000000..8ef44ee9e
--- /dev/null
+++ b/src/crimson/osd/osd_operation.h
@@ -0,0 +1,281 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "crimson/common/operation.h"
+#include "crimson/osd/pg_interval_interrupt_condition.h"
+#include "crimson/osd/scheduler/scheduler.h"
+#include "osd/osd_types.h"
+
+namespace crimson::os::seastore {
+ template<class OpT>
+ class OperationProxyT;
+}
+
+namespace crimson::osd {
+
+/// Ordering stages for a class of operations ordered by PG.
+struct ConnectionPipeline {
+ struct AwaitActive : OrderedExclusivePhaseT<AwaitActive> {
+ static constexpr auto type_name =
+ "ConnectionPipeline::await_active";
+ } await_active;
+
+ struct AwaitMap : OrderedExclusivePhaseT<AwaitMap> {
+ static constexpr auto type_name =
+ "ConnectionPipeline::await_map";
+ } await_map;
+
+ struct GetPG : OrderedExclusivePhaseT<GetPG> {
+ static constexpr auto type_name =
+ "ConnectionPipeline::get_pg";
+ } get_pg;
+};
+
+enum class OperationTypeCode {
+ client_request = 0,
+ peering_event,
+ pg_advance_map,
+ pg_creation,
+ replicated_request,
+ background_recovery,
+ background_recovery_sub,
+ internal_client_request,
+ historic_client_request,
+ logmissing_request,
+ logmissing_request_reply,
+ snaptrim_event,
+ snaptrimobj_subevent,
+ last_op
+};
+
+static constexpr const char* const OP_NAMES[] = {
+ "client_request",
+ "peering_event",
+ "pg_advance_map",
+ "pg_creation",
+ "replicated_request",
+ "background_recovery",
+ "background_recovery_sub",
+ "internal_client_request",
+ "historic_client_request",
+ "logmissing_request",
+ "logmissing_request_reply",
+ "snaptrim_event",
+ "snaptrimobj_subevent",
+};
+
+// prevent the addition of OperationTypeCode-s with no matching OP_NAMES entry:
+static_assert(
+ (sizeof(OP_NAMES)/sizeof(OP_NAMES[0])) ==
+ static_cast<int>(OperationTypeCode::last_op));
+
+struct InterruptibleOperation : Operation {
+ template <typename ValuesT = void>
+ using interruptible_future =
+ ::crimson::interruptible::interruptible_future<
+ ::crimson::osd::IOInterruptCondition, ValuesT>;
+ using interruptor =
+ ::crimson::interruptible::interruptor<
+ ::crimson::osd::IOInterruptCondition>;
+};
+
+template <typename T>
+struct OperationT : InterruptibleOperation {
+ static constexpr const char *type_name = OP_NAMES[static_cast<int>(T::type)];
+ using IRef = boost::intrusive_ptr<T>;
+ using ICRef = boost::intrusive_ptr<const T>;
+
+ unsigned get_type() const final {
+ return static_cast<unsigned>(T::type);
+ }
+
+ const char *get_type_name() const final {
+ return T::type_name;
+ }
+
+ virtual ~OperationT() = default;
+
+private:
+ virtual void dump_detail(ceph::Formatter *f) const = 0;
+};
+
+template <class T>
+class TrackableOperationT : public OperationT<T> {
+ T* that() {
+ return static_cast<T*>(this);
+ }
+ const T* that() const {
+ return static_cast<const T*>(this);
+ }
+
+protected:
+ template<class EventT>
+ decltype(auto) get_event() {
+    // all our derived classes are supposed to define the list of tracking
+    // events accessible via `std::get`. This will usually boil down
+    // to an instance of `std::tuple`.
+ return std::get<EventT>(that()->tracking_events);
+ }
+
+ template<class EventT>
+ decltype(auto) get_event() const {
+ return std::get<EventT>(that()->tracking_events);
+ }
+
+ using OperationT<T>::OperationT;
+
+ struct StartEvent : TimeEvent<StartEvent> {};
+ struct CompletionEvent : TimeEvent<CompletionEvent> {};
+
+ template <class EventT, class... Args>
+ void track_event(Args&&... args) {
+    // the idea is to have a visitor-like interface that allows double
+ // dispatch (backend, blocker type)
+ get_event<EventT>().trigger(*that(), std::forward<Args>(args)...);
+ }
+
+ template <class BlockingEventT, class InterruptorT=void, class F>
+ auto with_blocking_event(F&& f) {
+ auto ret = std::forward<F>(f)(typename BlockingEventT::template Trigger<T>{
+ get_event<BlockingEventT>(), *that()
+ });
+ if constexpr (std::is_same_v<InterruptorT, void>) {
+ return ret;
+ } else {
+ using ret_t = decltype(ret);
+ return typename InterruptorT::template futurize_t<ret_t>{std::move(ret)};
+ }
+ }
+
+public:
+ static constexpr bool is_trackable = true;
+};
+
+template <class T>
+class PhasedOperationT : public TrackableOperationT<T> {
+ using base_t = TrackableOperationT<T>;
+
+ T* that() {
+ return static_cast<T*>(this);
+ }
+ const T* that() const {
+ return static_cast<const T*>(this);
+ }
+
+protected:
+ using TrackableOperationT<T>::TrackableOperationT;
+
+ template <class InterruptorT=void, class StageT>
+ auto enter_stage(StageT& stage) {
+ return this->template with_blocking_event<typename StageT::BlockingEvent,
+ InterruptorT>(
+ [&stage, this] (auto&& trigger) {
+        // storing the pipeline handle is delegated to derived classes so they
+        // can match the pipeline's lifetime with e.g. ConnectedSocket
+        // (important for ConnectionPipeline).
+ return that()->get_handle().template enter<T>(stage, std::move(trigger));
+ });
+ }
+
+ template <class OpT>
+ friend class crimson::os::seastore::OperationProxyT;
+
+  // PGShardManager::start_pg_operation needs access to enter_stage; we can
+  // make this more sophisticated later on
+ friend class PGShardManager;
+};
+
+/**
+ * Maintains a set of lists of all active ops.
+ */
+struct OSDOperationRegistry : OperationRegistryT<
+ static_cast<size_t>(OperationTypeCode::last_op)
+> {
+ OSDOperationRegistry();
+
+ void do_stop() override;
+
+ void put_historic(const class ClientRequest& op);
+
+ size_t dump_historic_client_requests(ceph::Formatter* f) const;
+ size_t dump_slowest_historic_client_requests(ceph::Formatter* f) const;
+
+private:
+ op_list::const_iterator last_of_recents;
+ size_t num_recent_ops = 0;
+ size_t num_slow_ops = 0;
+};
+/**
+ * Throttles set of currently running operations
+ *
+ * Very primitive currently, assumes all ops are equally
+ * expensive and simply limits the number that can be
+ * concurrently active.
+ */
+class OperationThrottler : public BlockerT<OperationThrottler>,
+ private md_config_obs_t {
+ friend BlockerT<OperationThrottler>;
+ static constexpr const char* type_name = "OperationThrottler";
+
+ template <typename OperationT, typename F>
+ auto with_throttle(
+ OperationT* op,
+ crimson::osd::scheduler::params_t params,
+ F &&f) {
+ if (!max_in_progress) return f();
+ return acquire_throttle(params)
+ .then(std::forward<F>(f))
+ .then([this](auto x) {
+ release_throttle();
+ return x;
+ });
+ }
+
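+  // re-runs f under the throttle for as long as it resolves to true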
+ template <typename OperationT, typename F>
+ seastar::future<> with_throttle_while(
+ OperationT* op,
+ crimson::osd::scheduler::params_t params,
+ F &&f) {
+ return with_throttle(op, params, f).then([this, params, op, f](bool cont) {
+ return cont ? with_throttle_while(op, params, f) : seastar::now();
+ });
+ }
+
+
+public:
+ OperationThrottler(ConfigProxy &conf);
+
+ const char** get_tracked_conf_keys() const final;
+ void handle_conf_change(const ConfigProxy& conf,
+ const std::set<std::string> &changed) final;
+ void update_from_config(const ConfigProxy &conf);
+
+ template <class OpT, class... Args>
+ seastar::future<> with_throttle_while(
+ BlockingEvent::Trigger<OpT>&& trigger,
+ Args&&... args) {
+ return trigger.maybe_record_blocking(
+ with_throttle_while(std::forward<Args>(args)...), *this);
+ }
+
+private:
+ void dump_detail(Formatter *f) const final;
+
+ crimson::osd::scheduler::SchedulerRef scheduler;
+
+ uint64_t max_in_progress = 0;
+ uint64_t in_progress = 0;
+
+ uint64_t pending = 0;
+
+ void wake();
+
+ seastar::future<> acquire_throttle(
+ crimson::osd::scheduler::params_t params);
+
+ void release_throttle();
+};
+
+}
diff --git a/src/crimson/osd/osd_operation_external_tracking.h b/src/crimson/osd/osd_operation_external_tracking.h
new file mode 100644
index 000000000..4b6dbf4b7
--- /dev/null
+++ b/src/crimson/osd/osd_operation_external_tracking.h
@@ -0,0 +1,307 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "crimson/osd/osd.h"
+#include "crimson/osd/osdmap_gate.h"
+#include "crimson/osd/osd_operations/background_recovery.h"
+#include "crimson/osd/osd_operations/client_request.h"
+#include "crimson/osd/osd_operations/peering_event.h"
+#include "crimson/osd/osd_operations/pg_advance_map.h"
+#include "crimson/osd/osd_operations/recovery_subrequest.h"
+#include "crimson/osd/osd_operations/replicated_request.h"
+#include "crimson/osd/osd_operations/snaptrim_event.h"
+#include "crimson/osd/pg_activation_blocker.h"
+#include "crimson/osd/pg_map.h"
+
+namespace crimson::osd {
+
+// Just the boilerplate currently; the handlers below are still empty.
+struct LttngBackend
+ : ClientRequest::StartEvent::Backend,
+ ConnectionPipeline::AwaitActive::BlockingEvent::Backend,
+ ConnectionPipeline::AwaitMap::BlockingEvent::Backend,
+ ConnectionPipeline::GetPG::BlockingEvent::Backend,
+ OSD_OSDMapGate::OSDMapBlocker::BlockingEvent::Backend,
+ PGMap::PGCreationBlockingEvent::Backend,
+ ClientRequest::PGPipeline::AwaitMap::BlockingEvent::Backend,
+ PG_OSDMapGate::OSDMapBlocker::BlockingEvent::Backend,
+ ClientRequest::PGPipeline::WaitForActive::BlockingEvent::Backend,
+ PGActivationBlocker::BlockingEvent::Backend,
+ ClientRequest::PGPipeline::RecoverMissing::BlockingEvent::Backend,
+ ClientRequest::PGPipeline::GetOBC::BlockingEvent::Backend,
+ ClientRequest::PGPipeline::Process::BlockingEvent::Backend,
+ ClientRequest::PGPipeline::WaitRepop::BlockingEvent::Backend,
+ ClientRequest::PGPipeline::WaitRepop::BlockingEvent::ExitBarrierEvent::Backend,
+ ClientRequest::PGPipeline::SendReply::BlockingEvent::Backend,
+ ClientRequest::CompletionEvent::Backend
+{
+ void handle(ClientRequest::StartEvent&,
+ const Operation&) override {}
+
+ void handle(ConnectionPipeline::AwaitActive::BlockingEvent& ev,
+ const Operation& op,
+ const ConnectionPipeline::AwaitActive& blocker) override {
+ }
+
+ void handle(ConnectionPipeline::AwaitMap::BlockingEvent& ev,
+ const Operation& op,
+ const ConnectionPipeline::AwaitMap& blocker) override {
+ }
+
+ void handle(OSD_OSDMapGate::OSDMapBlocker::BlockingEvent&,
+ const Operation&,
+ const OSD_OSDMapGate::OSDMapBlocker&) override {
+ }
+
+ void handle(ConnectionPipeline::GetPG::BlockingEvent& ev,
+ const Operation& op,
+ const ConnectionPipeline::GetPG& blocker) override {
+ }
+
+ void handle(PGMap::PGCreationBlockingEvent&,
+ const Operation&,
+ const PGMap::PGCreationBlocker&) override {
+ }
+
+ void handle(ClientRequest::PGPipeline::AwaitMap::BlockingEvent& ev,
+ const Operation& op,
+ const ClientRequest::PGPipeline::AwaitMap& blocker) override {
+ }
+
+ void handle(PG_OSDMapGate::OSDMapBlocker::BlockingEvent&,
+ const Operation&,
+ const PG_OSDMapGate::OSDMapBlocker&) override {
+ }
+
+ void handle(ClientRequest::PGPipeline::WaitForActive::BlockingEvent& ev,
+ const Operation& op,
+ const ClientRequest::PGPipeline::WaitForActive& blocker) override {
+ }
+
+ void handle(PGActivationBlocker::BlockingEvent& ev,
+ const Operation& op,
+ const PGActivationBlocker& blocker) override {
+ }
+
+ void handle(ClientRequest::PGPipeline::RecoverMissing::BlockingEvent& ev,
+ const Operation& op,
+ const ClientRequest::PGPipeline::RecoverMissing& blocker) override {
+ }
+
+ void handle(ClientRequest::PGPipeline::GetOBC::BlockingEvent& ev,
+ const Operation& op,
+ const ClientRequest::PGPipeline::GetOBC& blocker) override {
+ }
+
+ void handle(ClientRequest::PGPipeline::Process::BlockingEvent& ev,
+ const Operation& op,
+ const ClientRequest::PGPipeline::Process& blocker) override {
+ }
+
+ void handle(ClientRequest::PGPipeline::WaitRepop::BlockingEvent& ev,
+ const Operation& op,
+ const ClientRequest::PGPipeline::WaitRepop& blocker) override {
+ }
+
+ void handle(ClientRequest::PGPipeline::WaitRepop::BlockingEvent::ExitBarrierEvent& ev,
+ const Operation& op) override {
+ }
+
+ void handle(ClientRequest::PGPipeline::SendReply::BlockingEvent& ev,
+ const Operation& op,
+ const ClientRequest::PGPipeline::SendReply& blocker) override {
+ }
+
+ void handle(ClientRequest::CompletionEvent&,
+ const Operation&) override {}
+};
+
+struct HistoricBackend
+ : ClientRequest::StartEvent::Backend,
+ ConnectionPipeline::AwaitActive::BlockingEvent::Backend,
+ ConnectionPipeline::AwaitMap::BlockingEvent::Backend,
+ ConnectionPipeline::GetPG::BlockingEvent::Backend,
+ OSD_OSDMapGate::OSDMapBlocker::BlockingEvent::Backend,
+ PGMap::PGCreationBlockingEvent::Backend,
+ ClientRequest::PGPipeline::AwaitMap::BlockingEvent::Backend,
+ PG_OSDMapGate::OSDMapBlocker::BlockingEvent::Backend,
+ ClientRequest::PGPipeline::WaitForActive::BlockingEvent::Backend,
+ PGActivationBlocker::BlockingEvent::Backend,
+ ClientRequest::PGPipeline::RecoverMissing::BlockingEvent::Backend,
+ ClientRequest::PGPipeline::GetOBC::BlockingEvent::Backend,
+ ClientRequest::PGPipeline::Process::BlockingEvent::Backend,
+ ClientRequest::PGPipeline::WaitRepop::BlockingEvent::Backend,
+ ClientRequest::PGPipeline::WaitRepop::BlockingEvent::ExitBarrierEvent::Backend,
+ ClientRequest::PGPipeline::SendReply::BlockingEvent::Backend,
+ ClientRequest::CompletionEvent::Backend
+{
+ void handle(ClientRequest::StartEvent&,
+ const Operation&) override {}
+
+ void handle(ConnectionPipeline::AwaitActive::BlockingEvent& ev,
+ const Operation& op,
+ const ConnectionPipeline::AwaitActive& blocker) override {
+ }
+
+ void handle(ConnectionPipeline::AwaitMap::BlockingEvent& ev,
+ const Operation& op,
+ const ConnectionPipeline::AwaitMap& blocker) override {
+ }
+
+ void handle(OSD_OSDMapGate::OSDMapBlocker::BlockingEvent&,
+ const Operation&,
+ const OSD_OSDMapGate::OSDMapBlocker&) override {
+ }
+
+ void handle(ConnectionPipeline::GetPG::BlockingEvent& ev,
+ const Operation& op,
+ const ConnectionPipeline::GetPG& blocker) override {
+ }
+
+ void handle(PGMap::PGCreationBlockingEvent&,
+ const Operation&,
+ const PGMap::PGCreationBlocker&) override {
+ }
+
+ void handle(ClientRequest::PGPipeline::AwaitMap::BlockingEvent& ev,
+ const Operation& op,
+ const ClientRequest::PGPipeline::AwaitMap& blocker) override {
+ }
+
+ void handle(PG_OSDMapGate::OSDMapBlocker::BlockingEvent&,
+ const Operation&,
+ const PG_OSDMapGate::OSDMapBlocker&) override {
+ }
+
+ void handle(ClientRequest::PGPipeline::WaitForActive::BlockingEvent& ev,
+ const Operation& op,
+ const ClientRequest::PGPipeline::WaitForActive& blocker) override {
+ }
+
+ void handle(PGActivationBlocker::BlockingEvent& ev,
+ const Operation& op,
+ const PGActivationBlocker& blocker) override {
+ }
+
+ void handle(ClientRequest::PGPipeline::RecoverMissing::BlockingEvent& ev,
+ const Operation& op,
+ const ClientRequest::PGPipeline::RecoverMissing& blocker) override {
+ }
+
+ void handle(ClientRequest::PGPipeline::GetOBC::BlockingEvent& ev,
+ const Operation& op,
+ const ClientRequest::PGPipeline::GetOBC& blocker) override {
+ }
+
+ void handle(ClientRequest::PGPipeline::Process::BlockingEvent& ev,
+ const Operation& op,
+ const ClientRequest::PGPipeline::Process& blocker) override {
+ }
+
+ void handle(ClientRequest::PGPipeline::WaitRepop::BlockingEvent& ev,
+ const Operation& op,
+ const ClientRequest::PGPipeline::WaitRepop& blocker) override {
+ }
+
+ void handle(ClientRequest::PGPipeline::WaitRepop::BlockingEvent::ExitBarrierEvent& ev,
+ const Operation& op) override {
+ }
+
+ void handle(ClientRequest::PGPipeline::SendReply::BlockingEvent& ev,
+ const Operation& op,
+ const ClientRequest::PGPipeline::SendReply& blocker) override {
+ }
+
+ static const ClientRequest& to_client_request(const Operation& op) {
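+    // release builds use a plain static_cast; debug builds use dynamic_cast
+    // to catch a mismatched Operation type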
+#ifdef NDEBUG
+ return static_cast<const ClientRequest&>(op);
+#else
+ return dynamic_cast<const ClientRequest&>(op);
+#endif
+ }
+
+ void handle(ClientRequest::CompletionEvent&, const Operation& op) override {
+ if (crimson::common::local_conf()->osd_op_history_size) {
+ to_client_request(op).put_historic();
+ }
+ }
+};
+
+} // namespace crimson::osd
+
+namespace crimson {
+
+template <>
+struct EventBackendRegistry<osd::ClientRequest> {
+ static std::tuple<osd::LttngBackend, osd::HistoricBackend> get_backends() {
+ return { {}, {} };
+ }
+};
+
+template <>
+struct EventBackendRegistry<osd::RemotePeeringEvent> {
+ static std::tuple<> get_backends() {
+    return {/* no external backends */};
+ }
+};
+
+template <>
+struct EventBackendRegistry<osd::LocalPeeringEvent> {
+ static std::tuple<> get_backends() {
+    return {/* no external backends */};
+ }
+};
+
+template <>
+struct EventBackendRegistry<osd::RepRequest> {
+ static std::tuple<> get_backends() {
+    return {/* no external backends */};
+ }
+};
+
+
+template <>
+struct EventBackendRegistry<osd::LogMissingRequest> {
+ static std::tuple<> get_backends() {
+    return {/* no external backends */};
+ }
+};
+
+template <>
+struct EventBackendRegistry<osd::LogMissingRequestReply> {
+ static std::tuple<> get_backends() {
+    return {/* no external backends */};
+ }
+};
+
+template <>
+struct EventBackendRegistry<osd::RecoverySubRequest> {
+ static std::tuple<> get_backends() {
+    return {/* no external backends */};
+ }
+};
+
+template <>
+struct EventBackendRegistry<osd::BackfillRecovery> {
+ static std::tuple<> get_backends() {
+ return {};
+ }
+};
+
+template <>
+struct EventBackendRegistry<osd::PGAdvanceMap> {
+ static std::tuple<> get_backends() {
+ return {};
+ }
+};
+
+template <>
+struct EventBackendRegistry<osd::SnapTrimObjSubEvent> {
+ static std::tuple<> get_backends() {
+ return {};
+ }
+};
+
+} // namespace crimson
diff --git a/src/crimson/osd/osd_operations/background_recovery.cc b/src/crimson/osd/osd_operations/background_recovery.cc
new file mode 100644
index 000000000..953ec9595
--- /dev/null
+++ b/src/crimson/osd/osd_operations/background_recovery.cc
@@ -0,0 +1,207 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <seastar/core/future.hh>
+#include <seastar/core/sleep.hh>
+
+#include "messages/MOSDOp.h"
+
+#include "crimson/osd/pg.h"
+#include "crimson/osd/shard_services.h"
+#include "common/Formatter.h"
+#include "crimson/osd/osd_operation_external_tracking.h"
+#include "crimson/osd/osd_operations/background_recovery.h"
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_osd);
+ }
+}
+
+namespace crimson {
+ template <>
+ struct EventBackendRegistry<osd::UrgentRecovery> {
+ static std::tuple<> get_backends() {
+ return {};
+ }
+ };
+
+ template <>
+ struct EventBackendRegistry<osd::PglogBasedRecovery> {
+ static std::tuple<> get_backends() {
+ return {};
+ }
+ };
+}
+
+namespace crimson::osd {
+
+template <class T>
+BackgroundRecoveryT<T>::BackgroundRecoveryT(
+ Ref<PG> pg,
+ ShardServices &ss,
+ epoch_t epoch_started,
+ crimson::osd::scheduler::scheduler_class_t scheduler_class,
+ float delay)
+ : pg(pg),
+ epoch_started(epoch_started),
+ delay(delay),
+ ss(ss),
+ scheduler_class(scheduler_class)
+{}
+
+template <class T>
+void BackgroundRecoveryT<T>::print(std::ostream &lhs) const
+{
+ lhs << "BackgroundRecovery(" << pg->get_pgid() << ")";
+}
+
+template <class T>
+void BackgroundRecoveryT<T>::dump_detail(Formatter *f) const
+{
+ f->dump_stream("pgid") << pg->get_pgid();
+ f->open_object_section("recovery_detail");
+ {
+ // TODO pg->dump_recovery_state(f);
+ }
+ f->close_section();
+}
+
+template <class T>
+seastar::future<> BackgroundRecoveryT<T>::start()
+{
+ logger().debug("{}: start", *this);
+
+ typename T::IRef ref = static_cast<T*>(this);
+ auto maybe_delay = seastar::now();
+ if (delay) {
+ maybe_delay = seastar::sleep(
+ std::chrono::milliseconds(std::lround(delay * 1000)));
+ }
+ return maybe_delay.then([ref, this] {
+ return this->template with_blocking_event<OperationThrottler::BlockingEvent>(
+ [ref, this] (auto&& trigger) {
+ return ss.with_throttle_while(
+ std::move(trigger),
+ this, get_scheduler_params(), [this] {
+ return T::interruptor::with_interruption([this] {
+ return do_recovery();
+ }, [](std::exception_ptr) {
+ return seastar::make_ready_future<bool>(false);
+ }, pg);
+ }).handle_exception_type([ref, this](const std::system_error& err) {
+ if (err.code() == std::make_error_code(std::errc::interrupted)) {
+ logger().debug("{} recovery interruped: {}", *pg, err.what());
+ return seastar::now();
+ }
+ return seastar::make_exception_future<>(err);
+ });
+ });
+ });
+}
+
+UrgentRecovery::UrgentRecovery(
+ const hobject_t& soid,
+ const eversion_t& need,
+ Ref<PG> pg,
+ ShardServices& ss,
+ epoch_t epoch_started)
+ : BackgroundRecoveryT{pg, ss, epoch_started,
+ crimson::osd::scheduler::scheduler_class_t::immediate},
+ soid{soid}, need(need)
+{
+}
+
+UrgentRecovery::interruptible_future<bool>
+UrgentRecovery::do_recovery()
+{
+ logger().debug("{}: {}", __func__, *this);
+ if (!pg->has_reset_since(epoch_started)) {
+ return with_blocking_event<RecoveryBackend::RecoveryBlockingEvent,
+ interruptor>([this] (auto&& trigger) {
+ return pg->get_recovery_handler()->recover_missing(trigger, soid, need);
+ }).then_interruptible([] {
+ return seastar::make_ready_future<bool>(false);
+ });
+ }
+ return seastar::make_ready_future<bool>(false);
+}
+
+void UrgentRecovery::print(std::ostream &lhs) const
+{
+ lhs << "UrgentRecovery(" << pg->get_pgid() << ", "
+ << soid << ", v" << need << ", epoch_started: "
+ << epoch_started << ")";
+}
+
+void UrgentRecovery::dump_detail(Formatter *f) const
+{
+ f->dump_stream("pgid") << pg->get_pgid();
+ f->open_object_section("recovery_detail");
+ {
+ f->dump_stream("oid") << soid;
+ f->dump_stream("version") << need;
+ }
+ f->close_section();
+}
+
+PglogBasedRecovery::PglogBasedRecovery(
+ Ref<PG> pg,
+ ShardServices &ss,
+ const epoch_t epoch_started,
+ float delay)
+ : BackgroundRecoveryT(
+ std::move(pg),
+ ss,
+ epoch_started,
+ crimson::osd::scheduler::scheduler_class_t::background_recovery,
+ delay)
+{}
+
+PglogBasedRecovery::interruptible_future<bool>
+PglogBasedRecovery::do_recovery()
+{
+ if (pg->has_reset_since(epoch_started)) {
+ return seastar::make_ready_future<bool>(false);
+ }
+ return with_blocking_event<RecoveryBackend::RecoveryBlockingEvent,
+ interruptor>([this] (auto&& trigger) {
+ return pg->get_recovery_handler()->start_recovery_ops(
+ trigger,
+ crimson::common::local_conf()->osd_recovery_max_single_start);
+ });
+}
+
+PGPeeringPipeline &BackfillRecovery::peering_pp(PG &pg)
+{
+ return pg.peering_request_pg_pipeline;
+}
+
+BackfillRecovery::interruptible_future<bool>
+BackfillRecovery::do_recovery()
+{
+ logger().debug("{}", __func__);
+
+ if (pg->has_reset_since(epoch_started)) {
+ logger().debug("{}: pg got reset since epoch_started={}",
+ __func__, epoch_started);
+ return seastar::make_ready_future<bool>(false);
+ }
+ // TODO: limits
+ return enter_stage<interruptor>(
+ // process_event() of our boost::statechart machine is non-reentrant.
+    // With the backfill_pipeline we protect it from a second entry coming
+    // from the implementation of BackfillListener.
+    // Additionally, this stage serves to synchronize with PeeringEvent.
+ peering_pp(*pg).process
+ ).then_interruptible([this] {
+ pg->get_recovery_handler()->dispatch_backfill_event(std::move(evt));
+ return seastar::make_ready_future<bool>(false);
+ });
+}
+
+template class BackgroundRecoveryT<UrgentRecovery>;
+template class BackgroundRecoveryT<PglogBasedRecovery>;
+template class BackgroundRecoveryT<BackfillRecovery>;
+
+} // namespace crimson::osd
diff --git a/src/crimson/osd/osd_operations/background_recovery.h b/src/crimson/osd/osd_operations/background_recovery.h
new file mode 100644
index 000000000..17f2cd57a
--- /dev/null
+++ b/src/crimson/osd/osd_operations/background_recovery.h
@@ -0,0 +1,144 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <boost/statechart/event_base.hpp>
+
+#include "crimson/net/Connection.h"
+#include "crimson/osd/osd_operation.h"
+#include "crimson/osd/recovery_backend.h"
+#include "crimson/common/type_helpers.h"
+#include "crimson/osd/osd_operations/peering_event.h"
+#include "crimson/osd/pg.h"
+
+namespace crimson::osd {
+class PG;
+class ShardServices;
+
+template <class T>
+class BackgroundRecoveryT : public PhasedOperationT<T> {
+public:
+ static constexpr OperationTypeCode type = OperationTypeCode::background_recovery;
+
+ BackgroundRecoveryT(
+ Ref<PG> pg,
+ ShardServices &ss,
+ epoch_t epoch_started,
+ crimson::osd::scheduler::scheduler_class_t scheduler_class, float delay = 0);
+
+ virtual void print(std::ostream &) const;
+ seastar::future<> start();
+
+protected:
+ Ref<PG> pg;
+ const epoch_t epoch_started;
+ float delay = 0;
+
+private:
+ virtual void dump_detail(Formatter *f) const;
+ crimson::osd::scheduler::params_t get_scheduler_params() const {
+ return {
+ 1, // cost
+ 0, // owner
+ scheduler_class
+ };
+ }
+ using do_recovery_ret_t = typename PhasedOperationT<T>::template interruptible_future<bool>;
+ virtual do_recovery_ret_t do_recovery() = 0;
+ ShardServices &ss;
+ const crimson::osd::scheduler::scheduler_class_t scheduler_class;
+};
+
+/// represent a recovery initiated for serving a client request
+///
+/// unlike @c PglogBasedRecovery and @c BackfillRecovery,
+/// @c UrgentRecovery is not throttled by the scheduler, and it
+/// utilizes @c RecoveryBackend directly to recover the unreadable
+/// object.
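+///
+/// Illustrative usage (cf. CommonClientRequest::do_recover_missing()):
+///   shard_services.start_operation<UrgentRecovery>(
+///     soid, ver, pg, shard_services, pg->get_osdmap_epoch());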
+class UrgentRecovery final : public BackgroundRecoveryT<UrgentRecovery> {
+public:
+ UrgentRecovery(
+ const hobject_t& soid,
+ const eversion_t& need,
+ Ref<PG> pg,
+ ShardServices& ss,
+ epoch_t epoch_started);
+ void print(std::ostream&) const final;
+
+ std::tuple<
+ OperationThrottler::BlockingEvent,
+ RecoveryBackend::RecoveryBlockingEvent
+ > tracking_events;
+
+private:
+ void dump_detail(Formatter* f) const final;
+ interruptible_future<bool> do_recovery() override;
+ const hobject_t soid;
+ const eversion_t need;
+};
+
+class PglogBasedRecovery final : public BackgroundRecoveryT<PglogBasedRecovery> {
+public:
+ PglogBasedRecovery(
+ Ref<PG> pg,
+ ShardServices &ss,
+ epoch_t epoch_started,
+ float delay = 0);
+
+ std::tuple<
+ OperationThrottler::BlockingEvent,
+ RecoveryBackend::RecoveryBlockingEvent
+ > tracking_events;
+
+private:
+ interruptible_future<bool> do_recovery() override;
+};
+
+class BackfillRecovery final : public BackgroundRecoveryT<BackfillRecovery> {
+public:
+
+ template <class EventT>
+ BackfillRecovery(
+ Ref<PG> pg,
+ ShardServices &ss,
+ epoch_t epoch_started,
+ const EventT& evt);
+
+ PipelineHandle& get_handle() { return handle; }
+
+ std::tuple<
+ OperationThrottler::BlockingEvent,
+ PGPeeringPipeline::Process::BlockingEvent
+ > tracking_events;
+
+private:
+ boost::intrusive_ptr<const boost::statechart::event_base> evt;
+ PipelineHandle handle;
+
+ static PGPeeringPipeline &peering_pp(PG &pg);
+ interruptible_future<bool> do_recovery() override;
+};
+
+template <class EventT>
+BackfillRecovery::BackfillRecovery(
+ Ref<PG> pg,
+ ShardServices &ss,
+ const epoch_t epoch_started,
+ const EventT& evt)
+ : BackgroundRecoveryT(
+ std::move(pg),
+ ss,
+ epoch_started,
+ crimson::osd::scheduler::scheduler_class_t::background_best_effort),
+ evt(evt.intrusive_from_this())
+{}
+
+}
+
+#if FMT_VERSION >= 90000
+template <> struct fmt::formatter<crimson::osd::BackfillRecovery> : fmt::ostream_formatter {};
+template <> struct fmt::formatter<crimson::osd::PglogBasedRecovery> : fmt::ostream_formatter {};
+template <> struct fmt::formatter<crimson::osd::UrgentRecovery> : fmt::ostream_formatter {};
+template <class T> struct fmt::formatter<crimson::osd::BackgroundRecoveryT<T>> : fmt::ostream_formatter {};
+#endif
diff --git a/src/crimson/osd/osd_operations/client_request.cc b/src/crimson/osd/osd_operations/client_request.cc
new file mode 100644
index 000000000..9374fbde2
--- /dev/null
+++ b/src/crimson/osd/osd_operations/client_request.cc
@@ -0,0 +1,388 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab expandtab
+
+#include "messages/MOSDOp.h"
+#include "messages/MOSDOpReply.h"
+
+#include "crimson/common/exception.h"
+#include "crimson/osd/pg.h"
+#include "crimson/osd/osd.h"
+#include "common/Formatter.h"
+#include "crimson/osd/osd_operation_external_tracking.h"
+#include "crimson/osd/osd_operations/client_request.h"
+#include "crimson/osd/osd_connection_priv.h"
+#include "osd/object_state_fmt.h"
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_osd);
+ }
+}
+
+namespace crimson::osd {
+
+
+void ClientRequest::Orderer::requeue(
+ ShardServices &shard_services, Ref<PG> pg)
+{
+ for (auto &req: list) {
+ logger().debug("{}: {} requeueing {}", __func__, *pg, req);
+ req.reset_instance_handle();
+ std::ignore = req.with_pg_int(shard_services, pg);
+ }
+}
+
+void ClientRequest::Orderer::clear_and_cancel()
+{
+ for (auto i = list.begin(); i != list.end(); ) {
+ logger().debug(
+ "ClientRequest::Orderer::clear_and_cancel: {}",
+ *i);
+ i->complete_request();
+ remove_request(*(i++));
+ }
+}
+
+void ClientRequest::complete_request()
+{
+ track_event<CompletionEvent>();
+ on_complete.set_value();
+}
+
+ClientRequest::ClientRequest(
+ ShardServices &shard_services, crimson::net::ConnectionRef conn,
+ Ref<MOSDOp> &&m)
+ : put_historic_shard_services(&shard_services),
+ conn(std::move(conn)),
+ m(std::move(m)),
+ instance_handle(new instance_handle_t)
+{}
+
+ClientRequest::~ClientRequest()
+{
+ logger().debug("{}: destroying", *this);
+}
+
+void ClientRequest::print(std::ostream &lhs) const
+{
+ lhs << "m=[" << *m << "]";
+}
+
+void ClientRequest::dump_detail(Formatter *f) const
+{
+ logger().debug("{}: dumping", *this);
+ std::apply([f] (auto... event) {
+ (..., event.dump(f));
+ }, tracking_events);
+}
+
+ConnectionPipeline &ClientRequest::get_connection_pipeline()
+{
+ return get_osd_priv(conn.get()).client_request_conn_pipeline;
+}
+
+ClientRequest::PGPipeline &ClientRequest::client_pp(PG &pg)
+{
+ return pg.request_pg_pipeline;
+}
+
+bool ClientRequest::is_pg_op() const
+{
+ return std::any_of(
+ begin(m->ops), end(m->ops),
+ [](auto& op) { return ceph_osd_op_type_pg(op.op.op); });
+}
+
+seastar::future<> ClientRequest::with_pg_int(
+ ShardServices &shard_services, Ref<PG> pgref)
+{
+ epoch_t same_interval_since = pgref->get_interval_start_epoch();
+ logger().debug("{} same_interval_since: {}", *this, same_interval_since);
+ if (m->finish_decode()) {
+ m->clear_payload();
+ }
+ const auto this_instance_id = instance_id++;
+ OperationRef opref{this};
+ auto instance_handle = get_instance_handle();
+ auto &ihref = *instance_handle;
+ return interruptor::with_interruption(
+ [this, pgref, this_instance_id, &ihref, &shard_services]() mutable {
+ PG &pg = *pgref;
+ if (pg.can_discard_op(*m)) {
+ return shard_services.send_incremental_map(
+ std::ref(*conn), m->get_map_epoch()
+ ).then([this, this_instance_id, pgref] {
+ logger().debug("{}.{}: discarding", *this, this_instance_id);
+ pgref->client_request_orderer.remove_request(*this);
+ complete_request();
+ return interruptor::now();
+ });
+ }
+ return ihref.enter_stage<interruptor>(client_pp(pg).await_map, *this
+ ).then_interruptible([this, this_instance_id, &pg, &ihref] {
+ logger().debug("{}.{}: after await_map stage", *this, this_instance_id);
+ return ihref.enter_blocker(
+ *this, pg.osdmap_gate, &decltype(pg.osdmap_gate)::wait_for_map,
+ m->get_min_epoch(), nullptr);
+ }).then_interruptible([this, this_instance_id, &pg, &ihref](auto map) {
+ logger().debug("{}.{}: after wait_for_map", *this, this_instance_id);
+ return ihref.enter_stage<interruptor>(client_pp(pg).wait_for_active, *this);
+ }).then_interruptible([this, this_instance_id, &pg, &ihref]() {
+ logger().debug(
+ "{}.{}: after wait_for_active stage", *this, this_instance_id);
+ return ihref.enter_blocker(
+ *this,
+ pg.wait_for_active_blocker,
+ &decltype(pg.wait_for_active_blocker)::wait);
+ }).then_interruptible([this, pgref, this_instance_id, &ihref]() mutable
+ -> interruptible_future<> {
+ logger().debug(
+ "{}.{}: after wait_for_active", *this, this_instance_id);
+ if (is_pg_op()) {
+ return process_pg_op(pgref);
+ } else {
+ return process_op(ihref, pgref);
+ }
+ }).then_interruptible([this, this_instance_id, pgref] {
+ logger().debug("{}.{}: after process*", *this, this_instance_id);
+ pgref->client_request_orderer.remove_request(*this);
+ complete_request();
+ });
+ }, [this, this_instance_id, pgref](std::exception_ptr eptr) {
+ // TODO: better debug output
+ logger().debug("{}.{}: interrupted {}", *this, this_instance_id, eptr);
+ }, pgref).finally(
+ [opref=std::move(opref), pgref=std::move(pgref),
+ instance_handle=std::move(instance_handle), &ihref] {
+ ihref.handle.exit();
+ });
+}
+
+seastar::future<> ClientRequest::with_pg(
+ ShardServices &shard_services, Ref<PG> pgref)
+{
+ put_historic_shard_services = &shard_services;
+ pgref->client_request_orderer.add_request(*this);
+ auto ret = on_complete.get_future();
+ std::ignore = with_pg_int(
+ shard_services, std::move(pgref)
+ );
+ return ret;
+}
+
+ClientRequest::interruptible_future<>
+ClientRequest::process_pg_op(
+ Ref<PG> &pg)
+{
+ return pg->do_pg_ops(
+ m
+ ).then_interruptible([this, pg=std::move(pg)](MURef<MOSDOpReply> reply) {
+ return conn->send(std::move(reply));
+ });
+}
+
+auto ClientRequest::reply_op_error(const Ref<PG>& pg, int err)
+{
+ logger().debug("{}: replying with error {}", *this, err);
+ auto reply = crimson::make_message<MOSDOpReply>(
+ m.get(), err, pg->get_osdmap_epoch(),
+ m->get_flags() & (CEPH_OSD_FLAG_ACK|CEPH_OSD_FLAG_ONDISK),
+ !m->has_flag(CEPH_OSD_FLAG_RETURNVEC));
+ reply->set_reply_versions(eversion_t(), 0);
+ reply->set_op_returns(std::vector<pg_log_op_return_item_t>{});
+ return conn->send(std::move(reply));
+}
+
+ClientRequest::interruptible_future<>
+ClientRequest::process_op(instance_handle_t &ihref, Ref<PG> &pg)
+{
+ return ihref.enter_stage<interruptor>(
+ client_pp(*pg).recover_missing,
+ *this
+ ).then_interruptible(
+ [this, pg]() mutable {
+ if (pg->is_primary()) {
+ return do_recover_missing(pg, m->get_hobj());
+ } else {
+ logger().debug("process_op: Skipping do_recover_missing"
+ "on non primary pg");
+ return interruptor::now();
+ }
+ }).then_interruptible([this, pg, &ihref]() mutable {
+ return pg->already_complete(m->get_reqid()).then_interruptible(
+ [this, pg, &ihref](auto completed) mutable
+ -> PG::load_obc_iertr::future<> {
+ if (completed) {
+ auto reply = crimson::make_message<MOSDOpReply>(
+ m.get(), completed->err, pg->get_osdmap_epoch(),
+ CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK, false);
+ reply->set_reply_versions(completed->version, completed->user_version);
+ return conn->send(std::move(reply));
+ } else {
+ return ihref.enter_stage<interruptor>(client_pp(*pg).get_obc, *this
+ ).then_interruptible(
+ [this, pg, &ihref]() mutable -> PG::load_obc_iertr::future<> {
+ logger().debug("{}: in get_obc stage", *this);
+ op_info.set_from_op(&*m, *pg->get_osdmap());
+ return pg->with_locked_obc(
+ m->get_hobj(), op_info,
+ [this, pg, &ihref](auto obc) mutable {
+ logger().debug("{}: got obc {}", *this, obc->obs);
+ return ihref.enter_stage<interruptor>(
+ client_pp(*pg).process, *this
+ ).then_interruptible([this, pg, obc, &ihref]() mutable {
+ return do_process(ihref, pg, obc);
+ });
+ });
+ });
+ }
+ });
+ }).handle_error_interruptible(
+ PG::load_obc_ertr::all_same_way([this, pg=std::move(pg)](const auto &code) {
+ logger().error("ClientRequest saw error code {}", code);
+ assert(code.value() > 0);
+ return reply_op_error(pg, -code.value());
+ }));
+}
+
+ClientRequest::interruptible_future<>
+ClientRequest::do_process(
+ instance_handle_t &ihref,
+ Ref<PG>& pg, crimson::osd::ObjectContextRef obc)
+{
+ if (m->has_flag(CEPH_OSD_FLAG_PARALLELEXEC)) {
+ return reply_op_error(pg, -EINVAL);
+ }
+ const pg_pool_t pool = pg->get_pgpool().info;
+ if (pool.has_flag(pg_pool_t::FLAG_EIO)) {
+ // drop op on the floor; the client will handle returning EIO
+ if (m->has_flag(CEPH_OSD_FLAG_SUPPORTSPOOLEIO)) {
+ logger().debug("discarding op due to pool EIO flag");
+ return seastar::now();
+ } else {
+ logger().debug("replying EIO due to pool EIO flag");
+ return reply_op_error(pg, -EIO);
+ }
+ }
+ if (m->get_oid().name.size()
+ > crimson::common::local_conf()->osd_max_object_name_len) {
+ return reply_op_error(pg, -ENAMETOOLONG);
+ } else if (m->get_hobj().get_key().size()
+ > crimson::common::local_conf()->osd_max_object_name_len) {
+ return reply_op_error(pg, -ENAMETOOLONG);
+ } else if (m->get_hobj().nspace.size()
+ > crimson::common::local_conf()->osd_max_object_namespace_len) {
+ return reply_op_error(pg, -ENAMETOOLONG);
+ } else if (m->get_hobj().oid.name.empty()) {
+ return reply_op_error(pg, -EINVAL);
+ } else if (pg->get_osdmap()->is_blocklisted(conn->get_peer_addr())) {
+ logger().info("{} is blocklisted", conn->get_peer_addr());
+ return reply_op_error(pg, -EBLOCKLISTED);
+ }
+
+ if (!obc->obs.exists && !op_info.may_write()) {
+ return reply_op_error(pg, -ENOENT);
+ }
+
+ SnapContext snapc = get_snapc(pg,obc);
+
+ if ((m->has_flag(CEPH_OSD_FLAG_ORDERSNAP)) &&
+ snapc.seq < obc->ssc->snapset.seq) {
+ logger().debug("{} ORDERSNAP flag set and snapc seq {}",
+ " < snapset seq {} on {}",
+ __func__, snapc.seq, obc->ssc->snapset.seq,
+ obc->obs.oi.soid);
+ return reply_op_error(pg, -EOLDSNAPC);
+ }
+
+ if (!pg->is_primary()) {
+ // primary can handle both normal ops and balanced reads
+ if (is_misdirected(*pg)) {
+ logger().trace("do_process: dropping misdirected op");
+ return seastar::now();
+ } else if (const hobject_t& hoid = m->get_hobj();
+ !pg->get_peering_state().can_serve_replica_read(hoid)) {
+ logger().debug("{}: unstable write on replica, "
+ "bouncing to primary",
+ __func__);
+ return reply_op_error(pg, -EAGAIN);
+ } else {
+ logger().debug("{}: serving replica read on oid {}",
+ __func__, m->get_hobj());
+ }
+ }
+ return pg->do_osd_ops(m, conn, obc, op_info, snapc).safe_then_unpack_interruptible(
+ [this, pg, &ihref](auto submitted, auto all_completed) mutable {
+ return submitted.then_interruptible([this, pg, &ihref] {
+ return ihref.enter_stage<interruptor>(client_pp(*pg).wait_repop, *this);
+ }).then_interruptible(
+ [this, pg, all_completed=std::move(all_completed), &ihref]() mutable {
+ return all_completed.safe_then_interruptible(
+ [this, pg, &ihref](MURef<MOSDOpReply> reply) {
+ return ihref.enter_stage<interruptor>(client_pp(*pg).send_reply, *this
+ ).then_interruptible(
+ [this, reply=std::move(reply)]() mutable {
+ logger().debug("{}: sending response", *this);
+ return conn->send(std::move(reply));
+ });
+ }, crimson::ct_error::eagain::handle([this, pg, &ihref]() mutable {
+ return process_op(ihref, pg);
+ }));
+ });
+ }, crimson::ct_error::eagain::handle([this, pg, &ihref]() mutable {
+ return process_op(ihref, pg);
+ }));
+}
+
+bool ClientRequest::is_misdirected(const PG& pg) const
+{
+  // take a closer look at the op flags
+ if (const int flags = m->get_flags();
+ flags & CEPH_OSD_FLAG_BALANCE_READS ||
+ flags & CEPH_OSD_FLAG_LOCALIZE_READS) {
+ if (!op_info.may_read()) {
+      // no read found, so it can't be a balanced read
+ return true;
+ }
+ if (op_info.may_write() || op_info.may_cache()) {
+      // write op, but I am not the primary
+ return true;
+ }
+ // balanced reads; any replica will do
+ return false;
+ }
+ // neither balanced nor localize reads
+ return true;
+}
+
+void ClientRequest::put_historic() const
+{
+ ceph_assert_always(put_historic_shard_services);
+ put_historic_shard_services->get_registry().put_historic(*this);
+}
+
+const SnapContext ClientRequest::get_snapc(
+ Ref<PG>& pg,
+ crimson::osd::ObjectContextRef obc) const
+{
+ SnapContext snapc;
+ if (op_info.may_write() || op_info.may_cache()) {
+ // snap
+ if (pg->get_pgpool().info.is_pool_snaps_mode()) {
+ // use pool's snapc
+ snapc = pg->get_pgpool().snapc;
+ logger().debug("{} using pool's snapc snaps={}",
+ __func__, snapc.snaps);
+
+ } else {
+ // client specified snapc
+ snapc.seq = m->get_snap_seq();
+ snapc.snaps = m->get_snaps();
+ logger().debug("{} client specified snapc seq={} snaps={}",
+ __func__, snapc.seq, snapc.snaps);
+ }
+ }
+ return snapc;
+}
+
+}
diff --git a/src/crimson/osd/osd_operations/client_request.h b/src/crimson/osd/osd_operations/client_request.h
new file mode 100644
index 000000000..b2dce1e87
--- /dev/null
+++ b/src/crimson/osd/osd_operations/client_request.h
@@ -0,0 +1,281 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <seastar/core/future.hh>
+
+#include <boost/intrusive/list.hpp>
+#include <boost/intrusive_ptr.hpp>
+
+#include "osd/osd_op_util.h"
+#include "crimson/net/Connection.h"
+#include "crimson/osd/object_context.h"
+#include "crimson/osd/osdmap_gate.h"
+#include "crimson/osd/osd_operation.h"
+#include "crimson/osd/osd_operations/client_request_common.h"
+#include "crimson/osd/osd_operations/common/pg_pipeline.h"
+#include "crimson/osd/pg_activation_blocker.h"
+#include "crimson/osd/pg_map.h"
+#include "crimson/common/type_helpers.h"
+#include "crimson/common/utility.h"
+#include "messages/MOSDOp.h"
+
+namespace crimson::osd {
+class PG;
+class OSD;
+class ShardServices;
+
+class ClientRequest final : public PhasedOperationT<ClientRequest>,
+ private CommonClientRequest {
+ // Initially set to primary core, updated to pg core after move,
+ // used by put_historic
+ ShardServices *put_historic_shard_services = nullptr;
+
+ crimson::net::ConnectionRef conn;
+ // must be after conn due to ConnectionPipeline's life-time
+ Ref<MOSDOp> m;
+ OpInfo op_info;
+ seastar::promise<> on_complete;
+ unsigned instance_id = 0;
+
+public:
+ class PGPipeline : public CommonPGPipeline {
+ public:
+ struct AwaitMap : OrderedExclusivePhaseT<AwaitMap> {
+ static constexpr auto type_name = "ClientRequest::PGPipeline::await_map";
+ } await_map;
+ struct WaitRepop : OrderedConcurrentPhaseT<WaitRepop> {
+ static constexpr auto type_name = "ClientRequest::PGPipeline::wait_repop";
+ } wait_repop;
+ struct SendReply : OrderedExclusivePhaseT<SendReply> {
+ static constexpr auto type_name = "ClientRequest::PGPipeline::send_reply";
+ } send_reply;
+ friend class ClientRequest;
+ friend class LttngBackend;
+ friend class HistoricBackend;
+    friend class RepRequest;
+ friend class LogMissingRequest;
+ friend class LogMissingRequestReply;
+ };
+
+ /**
+ * instance_handle_t
+ *
+ * Client request is, at present, the only Operation which can be requeued.
+ * This is, mostly, fine. However, reusing the PipelineHandle or
+ * BlockingEvent structures before proving that the prior instance has stopped
+ * can create hangs or crashes due to violations of the BlockerT and
+ * PipelineHandle invariants.
+ *
+ * To solve this, we create an instance_handle_t which contains the events
+ * for the portion of execution that can be rerun as well as the
+ * PipelineHandle. ClientRequest::with_pg_int grabs a reference to the current
+ * instance_handle_t and releases its PipelineHandle in the finally block.
+ * On requeue, we create a new instance_handle_t with a fresh PipelineHandle
+   * and events tuple and use it for the next invocation of
+ * with_pg_int.
+ */
+ std::tuple<
+ StartEvent,
+ ConnectionPipeline::AwaitActive::BlockingEvent,
+ ConnectionPipeline::AwaitMap::BlockingEvent,
+ OSD_OSDMapGate::OSDMapBlocker::BlockingEvent,
+ ConnectionPipeline::GetPG::BlockingEvent,
+ PGMap::PGCreationBlockingEvent,
+ CompletionEvent
+ > tracking_events;
+
+ class instance_handle_t : public boost::intrusive_ref_counter<
+ instance_handle_t, boost::thread_unsafe_counter> {
+ public:
+ // intrusive_ptr because seastar::lw_shared_ptr includes a cpu debug check
+ // that we will fail since the core on which we allocate the request may not
+ // be the core on which we perform with_pg_int. This is harmless, since we
+ // don't leave any references on the source core, so we just bypass it by using
+ // intrusive_ptr instead.
+ using ref_t = boost::intrusive_ptr<instance_handle_t>;
+ PipelineHandle handle;
+
+ std::tuple<
+ PGPipeline::AwaitMap::BlockingEvent,
+ PG_OSDMapGate::OSDMapBlocker::BlockingEvent,
+ PGPipeline::WaitForActive::BlockingEvent,
+ PGActivationBlocker::BlockingEvent,
+ PGPipeline::RecoverMissing::BlockingEvent,
+ PGPipeline::GetOBC::BlockingEvent,
+ PGPipeline::Process::BlockingEvent,
+ PGPipeline::WaitRepop::BlockingEvent,
+ PGPipeline::SendReply::BlockingEvent,
+ CompletionEvent
+ > pg_tracking_events;
+
+ template <typename BlockingEventT, typename InterruptorT=void, typename F>
+ auto with_blocking_event(F &&f, ClientRequest &op) {
+ auto ret = std::forward<F>(f)(
+ typename BlockingEventT::template Trigger<ClientRequest>{
+ std::get<BlockingEventT>(pg_tracking_events), op
+ });
+ if constexpr (std::is_same_v<InterruptorT, void>) {
+ return ret;
+ } else {
+ using ret_t = decltype(ret);
+ return typename InterruptorT::template futurize_t<ret_t>{std::move(ret)};
+ }
+ }
+
+ template <typename InterruptorT=void, typename StageT>
+ auto enter_stage(StageT &stage, ClientRequest &op) {
+ return this->template with_blocking_event<
+ typename StageT::BlockingEvent,
+ InterruptorT>(
+ [&stage, this](auto &&trigger) {
+ return handle.template enter<ClientRequest>(
+ stage, std::move(trigger));
+ }, op);
+ }
+
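+  // Illustrative call site (cf. ClientRequest::with_pg_int()):
+  //   ihref.enter_blocker(
+  //     *this, pg.osdmap_gate, &decltype(pg.osdmap_gate)::wait_for_map,
+  //     m->get_min_epoch(), nullptr);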
+ template <
+ typename InterruptorT=void, typename BlockingObj, typename Method,
+ typename... Args>
+ auto enter_blocker(
+ ClientRequest &op, BlockingObj &obj, Method method, Args&&... args) {
+ return this->template with_blocking_event<
+ typename BlockingObj::Blocker::BlockingEvent,
+ InterruptorT>(
+ [&obj, method,
+ args=std::forward_as_tuple(std::move(args)...)](auto &&trigger) mutable {
+ return apply_method_to_tuple(
+ obj, method,
+ std::tuple_cat(
+ std::forward_as_tuple(std::move(trigger)),
+ std::move(args))
+ );
+ }, op);
+ }
+ };
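+
+  // Illustrative requeue flow (cf. Orderer::requeue() in client_request.cc):
+  //   req.reset_instance_handle();           // fresh PipelineHandle + events
+  //   std::ignore = req.with_pg_int(ss, pg); // re-enter the PG pipeline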
+ instance_handle_t::ref_t instance_handle;
+ void reset_instance_handle() {
+ instance_handle = new instance_handle_t;
+ }
+ auto get_instance_handle() { return instance_handle; }
+
+ using ordering_hook_t = boost::intrusive::list_member_hook<>;
+ ordering_hook_t ordering_hook;
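+  // Orderer keeps the per-PG list of in-flight ClientRequests so that they
+  // can be requeued (see Orderer::requeue()) or cancelled as a group
+  // (see clear_and_cancel()) while preserving their submission order.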
+ class Orderer {
+ using list_t = boost::intrusive::list<
+ ClientRequest,
+ boost::intrusive::member_hook<
+ ClientRequest,
+ typename ClientRequest::ordering_hook_t,
+ &ClientRequest::ordering_hook>
+ >;
+ list_t list;
+
+ public:
+ void add_request(ClientRequest &request) {
+ assert(!request.ordering_hook.is_linked());
+ intrusive_ptr_add_ref(&request);
+ list.push_back(request);
+ }
+ void remove_request(ClientRequest &request) {
+ assert(request.ordering_hook.is_linked());
+ list.erase(list_t::s_iterator_to(request));
+ intrusive_ptr_release(&request);
+ }
+ void requeue(ShardServices &shard_services, Ref<PG> pg);
+ void clear_and_cancel();
+ };
+ void complete_request();
+
+ static constexpr OperationTypeCode type = OperationTypeCode::client_request;
+
+ ClientRequest(
+ ShardServices &shard_services,
+ crimson::net::ConnectionRef, Ref<MOSDOp> &&m);
+ ~ClientRequest();
+
+ void print(std::ostream &) const final;
+ void dump_detail(Formatter *f) const final;
+
+ static constexpr bool can_create() { return false; }
+ spg_t get_pgid() const {
+ return m->get_spg();
+ }
+ PipelineHandle &get_handle() { return instance_handle->handle; }
+ epoch_t get_epoch() const { return m->get_min_epoch(); }
+
+ ConnectionPipeline &get_connection_pipeline();
+ seastar::future<crimson::net::ConnectionFRef> prepare_remote_submission() {
+ assert(conn);
+ return conn.get_foreign(
+ ).then([this](auto f_conn) {
+ conn.reset();
+ return f_conn;
+ });
+ }
+ void finish_remote_submission(crimson::net::ConnectionFRef _conn) {
+ assert(!conn);
+ conn = make_local_shared_foreign(std::move(_conn));
+ }
+
+ seastar::future<> with_pg_int(
+ ShardServices &shard_services, Ref<PG> pg);
+
+public:
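+  // Registers this request with the PG's Orderer and kicks off with_pg_int();
+  // completion is signalled via on_complete (see client_request.cc).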
+ seastar::future<> with_pg(
+ ShardServices &shard_services, Ref<PG> pgref);
+
+private:
+ template <typename FuncT>
+ interruptible_future<> with_sequencer(FuncT&& func);
+ auto reply_op_error(const Ref<PG>& pg, int err);
+
+ interruptible_future<> do_process(
+ instance_handle_t &ihref,
+ Ref<PG>& pg,
+ crimson::osd::ObjectContextRef obc);
+ ::crimson::interruptible::interruptible_future<
+ ::crimson::osd::IOInterruptCondition> process_pg_op(
+ Ref<PG> &pg);
+ ::crimson::interruptible::interruptible_future<
+ ::crimson::osd::IOInterruptCondition> process_op(
+ instance_handle_t &ihref,
+ Ref<PG> &pg);
+ bool is_pg_op() const;
+
+ PGPipeline &client_pp(PG &pg);
+
+ template <typename Errorator>
+ using interruptible_errorator =
+ ::crimson::interruptible::interruptible_errorator<
+ ::crimson::osd::IOInterruptCondition,
+ Errorator>;
+
+ bool is_misdirected(const PG& pg) const;
+
+ const SnapContext get_snapc(
+ Ref<PG>& pg,
+ crimson::osd::ObjectContextRef obc) const;
+
+public:
+
+ friend class LttngBackend;
+ friend class HistoricBackend;
+
+ auto get_started() const {
+ return get_event<StartEvent>().get_timestamp();
+ };
+
+ auto get_completed() const {
+ return get_event<CompletionEvent>().get_timestamp();
+ };
+
+ void put_historic() const;
+};
+
+}
+
+#if FMT_VERSION >= 90000
+template <> struct fmt::formatter<crimson::osd::ClientRequest> : fmt::ostream_formatter {};
+#endif
diff --git a/src/crimson/osd/osd_operations/client_request_common.cc b/src/crimson/osd/osd_operations/client_request_common.cc
new file mode 100644
index 000000000..cfd22c774
--- /dev/null
+++ b/src/crimson/osd/osd_operations/client_request_common.cc
@@ -0,0 +1,64 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab expandtab
+
+#include "crimson/osd/osd_operations/client_request_common.h"
+#include "crimson/osd/pg.h"
+#include "crimson/osd/osd_operations/background_recovery.h"
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_osd);
+ }
+}
+
+namespace crimson::osd {
+
+typename InterruptibleOperation::template interruptible_future<>
+CommonClientRequest::do_recover_missing(
+ Ref<PG>& pg, const hobject_t& soid)
+{
+ eversion_t ver;
+ assert(pg->is_primary());
+ logger().debug("{} check for recovery, {}", __func__, soid);
+ if (!pg->is_unreadable_object(soid, &ver) &&
+ !pg->is_degraded_or_backfilling_object(soid)) {
+ return seastar::now();
+ }
+ logger().debug("{} need to wait for recovery, {}", __func__, soid);
+ if (pg->get_recovery_backend()->is_recovering(soid)) {
+ return pg->get_recovery_backend()->get_recovering(soid).wait_for_recovered();
+ } else {
+ auto [op, fut] =
+ pg->get_shard_services().start_operation<UrgentRecovery>(
+ soid, ver, pg, pg->get_shard_services(), pg->get_osdmap_epoch());
+ return std::move(fut);
+ }
+}
+
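+// Decides whether an interrupted operation should be aborted or restarted.
+// Illustrative call site (cf. InternalClientRequest::start()): the interruption
+// handler maps this verdict to seastar::stop_iteration::yes/no.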
+bool CommonClientRequest::should_abort_request(
+ const Operation& op,
+ std::exception_ptr eptr)
+{
+ if (*eptr.__cxa_exception_type() ==
+ typeid(::crimson::common::actingset_changed)) {
+ try {
+ std::rethrow_exception(eptr);
+ } catch(::crimson::common::actingset_changed& e) {
+ if (e.is_primary()) {
+ logger().debug("{} {} operation restart, acting set changed", __func__, op);
+ return false;
+ } else {
+ logger().debug("{} {} operation abort, up primary changed", __func__, op);
+ return true;
+ }
+ }
+ } else {
+ assert(*eptr.__cxa_exception_type() ==
+ typeid(crimson::common::system_shutdown_exception));
+ crimson::get_logger(ceph_subsys_osd).debug(
+ "{} {} operation skipped, system shutdown", __func__, op);
+ return true;
+ }
+}
+
+} // namespace crimson::osd
diff --git a/src/crimson/osd/osd_operations/client_request_common.h b/src/crimson/osd/osd_operations/client_request_common.h
new file mode 100644
index 000000000..6a8a78966
--- /dev/null
+++ b/src/crimson/osd/osd_operations/client_request_common.h
@@ -0,0 +1,20 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "crimson/common/operation.h"
+#include "crimson/common/type_helpers.h"
+#include "crimson/osd/osd_operation.h"
+
+namespace crimson::osd {
+
+struct CommonClientRequest {
+ static InterruptibleOperation::template interruptible_future<>
+ do_recover_missing(Ref<PG>& pg, const hobject_t& soid);
+
+ static bool should_abort_request(
+ const crimson::Operation& op, std::exception_ptr eptr);
+};
+
+} // namespace crimson::osd
diff --git a/src/crimson/osd/osd_operations/common/pg_pipeline.h b/src/crimson/osd/osd_operations/common/pg_pipeline.h
new file mode 100644
index 000000000..58fa07b8b
--- /dev/null
+++ b/src/crimson/osd/osd_operations/common/pg_pipeline.h
@@ -0,0 +1,31 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "osd/osd_op_util.h"
+#include "crimson/osd/osd_operation.h"
+
+namespace crimson::osd {
+
+class CommonPGPipeline {
+protected:
+ friend class InternalClientRequest;
+ friend class SnapTrimEvent;
+ friend class SnapTrimObjSubEvent;
+
+ struct WaitForActive : OrderedExclusivePhaseT<WaitForActive> {
+ static constexpr auto type_name = "CommonPGPipeline:::wait_for_active";
+ } wait_for_active;
+ struct RecoverMissing : OrderedExclusivePhaseT<RecoverMissing> {
+ static constexpr auto type_name = "CommonPGPipeline::recover_missing";
+ } recover_missing;
+ struct GetOBC : OrderedExclusivePhaseT<GetOBC> {
+ static constexpr auto type_name = "CommonPGPipeline::get_obc";
+ } get_obc;
+ struct Process : OrderedExclusivePhaseT<Process> {
+ static constexpr auto type_name = "CommonPGPipeline::process";
+ } process;
+};
+
+} // namespace crimson::osd
diff --git a/src/crimson/osd/osd_operations/internal_client_request.cc b/src/crimson/osd/osd_operations/internal_client_request.cc
new file mode 100644
index 000000000..1e9b842b2
--- /dev/null
+++ b/src/crimson/osd/osd_operations/internal_client_request.cc
@@ -0,0 +1,130 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab expandtab
+
+#include <seastar/core/future.hh>
+
+#include "crimson/osd/osd_operations/internal_client_request.h"
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_osd);
+ }
+}
+
+namespace crimson {
+ template <>
+ struct EventBackendRegistry<osd::InternalClientRequest> {
+ static std::tuple<> get_backends() {
+ return {};
+ }
+ };
+}
+
+
+namespace crimson::osd {
+
+InternalClientRequest::InternalClientRequest(Ref<PG> pg)
+ : pg(std::move(pg))
+{
+ assert(bool(this->pg));
+ assert(this->pg->is_primary());
+}
+
+InternalClientRequest::~InternalClientRequest()
+{
+ logger().debug("{}: destroying", *this);
+}
+
+void InternalClientRequest::print(std::ostream &) const
+{
+}
+
+void InternalClientRequest::dump_detail(Formatter *f) const
+{
+}
+
+CommonPGPipeline& InternalClientRequest::client_pp()
+{
+ return pg->request_pg_pipeline;
+}
+
+seastar::future<> InternalClientRequest::start()
+{
+ track_event<StartEvent>();
+ return crimson::common::handle_system_shutdown([this] {
+ return seastar::repeat([this] {
+ logger().debug("{}: in repeat", *this);
+ return interruptor::with_interruption([this]() mutable {
+ return enter_stage<interruptor>(
+ client_pp().wait_for_active
+ ).then_interruptible([this] {
+ return with_blocking_event<PGActivationBlocker::BlockingEvent,
+ interruptor>([this] (auto&& trigger) {
+ return pg->wait_for_active_blocker.wait(std::move(trigger));
+ });
+ }).then_interruptible([this] {
+ return enter_stage<interruptor>(
+ client_pp().recover_missing);
+ }).then_interruptible([this] {
+ return do_recover_missing(pg, get_target_oid());
+ }).then_interruptible([this] {
+ return enter_stage<interruptor>(
+ client_pp().get_obc);
+ }).then_interruptible([this] () -> PG::load_obc_iertr::future<> {
+ logger().debug("{}: getting obc lock", *this);
+ return seastar::do_with(create_osd_ops(),
+ [this](auto& osd_ops) mutable {
+ logger().debug("InternalClientRequest: got {} OSDOps to execute",
+ std::size(osd_ops));
+ [[maybe_unused]] const int ret = op_info.set_from_op(
+ std::as_const(osd_ops), pg->get_pgid().pgid, *pg->get_osdmap());
+ assert(ret == 0);
+ return pg->with_locked_obc(get_target_oid(), op_info,
+ [&osd_ops, this](auto obc) {
+ return enter_stage<interruptor>(client_pp().process
+ ).then_interruptible(
+ [obc=std::move(obc), &osd_ops, this] {
+ return pg->do_osd_ops(
+ std::move(obc),
+ osd_ops,
+ std::as_const(op_info),
+ get_do_osd_ops_params(),
+ [] {
+ return PG::do_osd_ops_iertr::now();
+ },
+ [] (const std::error_code& e) {
+ return PG::do_osd_ops_iertr::now();
+ }
+ ).safe_then_unpack_interruptible(
+ [](auto submitted, auto all_completed) {
+ return all_completed.handle_error_interruptible(
+ crimson::ct_error::eagain::handle([] {
+ return seastar::now();
+ }));
+ }, crimson::ct_error::eagain::handle([] {
+ return interruptor::now();
+ })
+ );
+ });
+ });
+ });
+ }).handle_error_interruptible(PG::load_obc_ertr::all_same_way([] {
+ return seastar::now();
+ })).then_interruptible([] {
+ return seastar::stop_iteration::yes;
+ });
+ }, [this](std::exception_ptr eptr) {
+ if (should_abort_request(*this, std::move(eptr))) {
+ return seastar::stop_iteration::yes;
+ } else {
+ return seastar::stop_iteration::no;
+ }
+ }, pg);
+ }).then([this] {
+ track_event<CompletionEvent>();
+ });
+ });
+}
+
+} // namespace crimson::osd
+
diff --git a/src/crimson/osd/osd_operations/internal_client_request.h b/src/crimson/osd/osd_operations/internal_client_request.h
new file mode 100644
index 000000000..8eed12e05
--- /dev/null
+++ b/src/crimson/osd/osd_operations/internal_client_request.h
@@ -0,0 +1,68 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "crimson/common/type_helpers.h"
+#include "crimson/osd/osd_operation.h"
+#include "crimson/osd/osd_operations/client_request_common.h"
+#include "crimson/osd/osd_operations/common/pg_pipeline.h"
+#include "crimson/osd/pg.h"
+#include "crimson/osd/pg_activation_blocker.h"
+
+namespace crimson::osd {
+
+class InternalClientRequest : public PhasedOperationT<InternalClientRequest>,
+ private CommonClientRequest {
+public:
+ explicit InternalClientRequest(Ref<PG> pg);
+ ~InternalClientRequest();
+
+  // imposed by `ShardServices::start_operation<T>(...)`.
+ seastar::future<> start();
+
+protected:
+ virtual const hobject_t& get_target_oid() const = 0;
+ virtual PG::do_osd_ops_params_t get_do_osd_ops_params() const = 0;
+ virtual std::vector<OSDOp> create_osd_ops() = 0;
+
+ const PG& get_pg() const {
+ return *pg;
+ }
+
+private:
+ friend OperationT<InternalClientRequest>;
+
+ static constexpr OperationTypeCode type =
+ OperationTypeCode::internal_client_request;
+
+ void print(std::ostream &) const final;
+ void dump_detail(Formatter *f) const final;
+
+ CommonPGPipeline& client_pp();
+
+ seastar::future<> do_process();
+
+ Ref<PG> pg;
+ OpInfo op_info;
+ PipelineHandle handle;
+
+public:
+ PipelineHandle& get_handle() { return handle; }
+
+ std::tuple<
+ StartEvent,
+ CommonPGPipeline::WaitForActive::BlockingEvent,
+ PGActivationBlocker::BlockingEvent,
+ CommonPGPipeline::RecoverMissing::BlockingEvent,
+ CommonPGPipeline::GetOBC::BlockingEvent,
+ CommonPGPipeline::Process::BlockingEvent,
+ CompletionEvent
+ > tracking_events;
+};
+
+} // namespace crimson::osd
+
+#if FMT_VERSION >= 90000
+template <> struct fmt::formatter<crimson::osd::InternalClientRequest> : fmt::ostream_formatter {};
+#endif
diff --git a/src/crimson/osd/osd_operations/logmissing_request.cc b/src/crimson/osd/osd_operations/logmissing_request.cc
new file mode 100644
index 000000000..739b46406
--- /dev/null
+++ b/src/crimson/osd/osd_operations/logmissing_request.cc
@@ -0,0 +1,79 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "logmissing_request.h"
+
+#include "common/Formatter.h"
+
+#include "crimson/osd/osd.h"
+#include "crimson/osd/osd_connection_priv.h"
+#include "crimson/osd/osd_operation_external_tracking.h"
+#include "crimson/osd/pg.h"
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_osd);
+ }
+}
+
+namespace crimson::osd {
+
+LogMissingRequest::LogMissingRequest(crimson::net::ConnectionRef&& conn,
+ Ref<MOSDPGUpdateLogMissing> &&req)
+ : conn{std::move(conn)},
+ req{std::move(req)}
+{}
+
+void LogMissingRequest::print(std::ostream& os) const
+{
+ os << "LogMissingRequest("
+ << "from=" << req->from
+ << " req=" << *req
+ << ")";
+}
+
+void LogMissingRequest::dump_detail(Formatter *f) const
+{
+ f->open_object_section("LogMissingRequest");
+ f->dump_stream("req_tid") << req->get_tid();
+ f->dump_stream("pgid") << req->get_spg();
+ f->dump_unsigned("map_epoch", req->get_map_epoch());
+ f->dump_unsigned("min_epoch", req->get_min_epoch());
+ f->dump_stream("entries") << req->entries;
+ f->dump_stream("from") << req->from;
+ f->close_section();
+}
+
+ConnectionPipeline &LogMissingRequest::get_connection_pipeline()
+{
+ return get_osd_priv(conn.get()).replicated_request_conn_pipeline;
+}
+
+ClientRequest::PGPipeline &LogMissingRequest::client_pp(PG &pg)
+{
+ return pg.request_pg_pipeline;
+}
+
+seastar::future<> LogMissingRequest::with_pg(
+ ShardServices &shard_services, Ref<PG> pg)
+{
+ logger().debug("{}: LogMissingRequest::with_pg", *this);
+
+ IRef ref = this;
+ return interruptor::with_interruption([this, pg] {
+ logger().debug("{}: pg present", *this);
+ return this->template enter_stage<interruptor>(client_pp(*pg).await_map
+ ).then_interruptible([this, pg] {
+ return this->template with_blocking_event<
+ PG_OSDMapGate::OSDMapBlocker::BlockingEvent
+ >([this, pg](auto &&trigger) {
+ return pg->osdmap_gate.wait_for_map(
+ std::move(trigger), req->min_epoch);
+ });
+ }).then_interruptible([this, pg](auto) {
+ return pg->do_update_log_missing(req, conn);
+ });
+ }, [ref](std::exception_ptr) { return seastar::now(); }, pg);
+}
+
+}
diff --git a/src/crimson/osd/osd_operations/logmissing_request.h b/src/crimson/osd/osd_operations/logmissing_request.h
new file mode 100644
index 000000000..71d0816fd
--- /dev/null
+++ b/src/crimson/osd/osd_operations/logmissing_request.h
@@ -0,0 +1,81 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "crimson/net/Connection.h"
+#include "crimson/osd/osdmap_gate.h"
+#include "crimson/osd/osd_operation.h"
+#include "crimson/osd/osd_operations/client_request.h"
+#include "crimson/osd/pg_map.h"
+#include "crimson/common/type_helpers.h"
+#include "messages/MOSDPGUpdateLogMissing.h"
+
+namespace ceph {
+ class Formatter;
+}
+
+namespace crimson::osd {
+
+class ShardServices;
+
+class OSD;
+class PG;
+
+class LogMissingRequest final : public PhasedOperationT<LogMissingRequest> {
+public:
+ static constexpr OperationTypeCode type = OperationTypeCode::logmissing_request;
+ LogMissingRequest(crimson::net::ConnectionRef&&, Ref<MOSDPGUpdateLogMissing>&&);
+
+ void print(std::ostream &) const final;
+ void dump_detail(ceph::Formatter* f) const final;
+
+ static constexpr bool can_create() { return false; }
+ spg_t get_pgid() const {
+ return req->get_spg();
+ }
+ PipelineHandle &get_handle() { return handle; }
+ epoch_t get_epoch() const { return req->get_min_epoch(); }
+
+ ConnectionPipeline &get_connection_pipeline();
+ seastar::future<crimson::net::ConnectionFRef> prepare_remote_submission() {
+ assert(conn);
+ return conn.get_foreign(
+ ).then([this](auto f_conn) {
+ conn.reset();
+ return f_conn;
+ });
+ }
+ void finish_remote_submission(crimson::net::ConnectionFRef _conn) {
+ assert(!conn);
+ conn = make_local_shared_foreign(std::move(_conn));
+ }
+
+ seastar::future<> with_pg(
+ ShardServices &shard_services, Ref<PG> pg);
+
+ std::tuple<
+ StartEvent,
+ ConnectionPipeline::AwaitActive::BlockingEvent,
+ ConnectionPipeline::AwaitMap::BlockingEvent,
+ ConnectionPipeline::GetPG::BlockingEvent,
+ ClientRequest::PGPipeline::AwaitMap::BlockingEvent,
+ PG_OSDMapGate::OSDMapBlocker::BlockingEvent,
+ PGMap::PGCreationBlockingEvent,
+ OSD_OSDMapGate::OSDMapBlocker::BlockingEvent
+ > tracking_events;
+
+private:
+ ClientRequest::PGPipeline &client_pp(PG &pg);
+
+ crimson::net::ConnectionRef conn;
+  // must be after `conn` to ensure the ConnectionPipeline stays alive
+ PipelineHandle handle;
+ Ref<MOSDPGUpdateLogMissing> req;
+};
+
+}
+
+#if FMT_VERSION >= 90000
+template <> struct fmt::formatter<crimson::osd::LogMissingRequest> : fmt::ostream_formatter {};
+#endif
diff --git a/src/crimson/osd/osd_operations/logmissing_request_reply.cc b/src/crimson/osd/osd_operations/logmissing_request_reply.cc
new file mode 100644
index 000000000..b4bf2938e
--- /dev/null
+++ b/src/crimson/osd/osd_operations/logmissing_request_reply.cc
@@ -0,0 +1,68 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "logmissing_request_reply.h"
+
+#include "common/Formatter.h"
+
+#include "crimson/osd/osd.h"
+#include "crimson/osd/osd_connection_priv.h"
+#include "crimson/osd/osd_operation_external_tracking.h"
+#include "crimson/osd/pg.h"
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_osd);
+ }
+}
+
+namespace crimson::osd {
+
+LogMissingRequestReply::LogMissingRequestReply(
+ crimson::net::ConnectionRef&& conn,
+ Ref<MOSDPGUpdateLogMissingReply> &&req)
+ : conn{std::move(conn)},
+ req{std::move(req)}
+{}
+
+void LogMissingRequestReply::print(std::ostream& os) const
+{
+ os << "LogMissingRequestReply("
+ << "from=" << req->from
+ << " req=" << *req
+ << ")";
+}
+
+void LogMissingRequestReply::dump_detail(Formatter *f) const
+{
+ f->open_object_section("LogMissingRequestReply");
+ f->dump_stream("rep_tid") << req->get_tid();
+ f->dump_stream("pgid") << req->get_spg();
+ f->dump_unsigned("map_epoch", req->get_map_epoch());
+ f->dump_unsigned("min_epoch", req->get_min_epoch());
+ f->dump_stream("from") << req->from;
+ f->close_section();
+}
+
+ConnectionPipeline &LogMissingRequestReply::get_connection_pipeline()
+{
+ return get_osd_priv(conn.get()).replicated_request_conn_pipeline;
+}
+
+ClientRequest::PGPipeline &LogMissingRequestReply::client_pp(PG &pg)
+{
+ return pg.request_pg_pipeline;
+}
+
+seastar::future<> LogMissingRequestReply::with_pg(
+ ShardServices &shard_services, Ref<PG> pg)
+{
+ logger().debug("{}: LogMissingRequestReply::with_pg", *this);
+
+ IRef ref = this;
+ return interruptor::with_interruption([this, pg] {
+ return pg->do_update_log_missing_reply(std::move(req));
+ }, [ref](std::exception_ptr) { return seastar::now(); }, pg);
+}
+
+}
diff --git a/src/crimson/osd/osd_operations/logmissing_request_reply.h b/src/crimson/osd/osd_operations/logmissing_request_reply.h
new file mode 100644
index 000000000..c89131fec
--- /dev/null
+++ b/src/crimson/osd/osd_operations/logmissing_request_reply.h
@@ -0,0 +1,79 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "crimson/net/Connection.h"
+#include "crimson/osd/osdmap_gate.h"
+#include "crimson/osd/osd_operation.h"
+#include "crimson/osd/osd_operations/client_request.h"
+#include "crimson/osd/pg_map.h"
+#include "crimson/common/type_helpers.h"
+#include "messages/MOSDPGUpdateLogMissingReply.h"
+
+namespace ceph {
+ class Formatter;
+}
+
+namespace crimson::osd {
+
+class ShardServices;
+
+class OSD;
+class PG;
+
+class LogMissingRequestReply final : public PhasedOperationT<LogMissingRequestReply> {
+public:
+ static constexpr OperationTypeCode type = OperationTypeCode::logmissing_request_reply;
+ LogMissingRequestReply(crimson::net::ConnectionRef&&, Ref<MOSDPGUpdateLogMissingReply>&&);
+
+ void print(std::ostream &) const final;
+ void dump_detail(ceph::Formatter* f) const final;
+
+ static constexpr bool can_create() { return false; }
+ spg_t get_pgid() const {
+ return req->get_spg();
+ }
+ PipelineHandle &get_handle() { return handle; }
+ epoch_t get_epoch() const { return req->get_min_epoch(); }
+
+ ConnectionPipeline &get_connection_pipeline();
+ seastar::future<crimson::net::ConnectionFRef> prepare_remote_submission() {
+ assert(conn);
+ return conn.get_foreign(
+ ).then([this](auto f_conn) {
+ conn.reset();
+ return f_conn;
+ });
+ }
+ void finish_remote_submission(crimson::net::ConnectionFRef _conn) {
+ assert(!conn);
+ conn = make_local_shared_foreign(std::move(_conn));
+ }
+
+ seastar::future<> with_pg(
+ ShardServices &shard_services, Ref<PG> pg);
+
+ std::tuple<
+ StartEvent,
+ ConnectionPipeline::AwaitActive::BlockingEvent,
+ ConnectionPipeline::AwaitMap::BlockingEvent,
+ ConnectionPipeline::GetPG::BlockingEvent,
+ PGMap::PGCreationBlockingEvent,
+ OSD_OSDMapGate::OSDMapBlocker::BlockingEvent
+ > tracking_events;
+
+private:
+ ClientRequest::PGPipeline &client_pp(PG &pg);
+
+ crimson::net::ConnectionRef conn;
+  // must be after `conn` to ensure the ConnectionPipeline stays alive
+ PipelineHandle handle;
+ Ref<MOSDPGUpdateLogMissingReply> req;
+};
+
+}
+
+#if FMT_VERSION >= 90000
+template <> struct fmt::formatter<crimson::osd::LogMissingRequestReply> : fmt::ostream_formatter {};
+#endif
diff --git a/src/crimson/osd/osd_operations/osdop_params.h b/src/crimson/osd/osd_operations/osdop_params.h
new file mode 100644
index 000000000..c7b81e765
--- /dev/null
+++ b/src/crimson/osd/osd_operations/osdop_params.h
@@ -0,0 +1,22 @@
+#pragma once
+
+#include "messages/MOSDOp.h"
+#include "osd/osd_types.h"
+#include "crimson/common/type_helpers.h"
+
+// The fields in this struct are parameters that may be needed at multiple
+// levels of processing. They are enclosed in this struct to avoid passing
+// each of them as a separate method parameter.
+struct osd_op_params_t {
+ osd_reqid_t req_id;
+ utime_t mtime;
+ eversion_t at_version;
+ eversion_t pg_trim_to;
+ eversion_t min_last_complete_ondisk;
+ eversion_t last_complete;
+ version_t user_at_version = 0;
+ bool user_modify = false;
+ ObjectCleanRegions clean_regions;
+
+ osd_op_params_t() = default;
+};
diff --git a/src/crimson/osd/osd_operations/peering_event.cc b/src/crimson/osd/osd_operations/peering_event.cc
new file mode 100644
index 000000000..ea4662bd0
--- /dev/null
+++ b/src/crimson/osd/osd_operations/peering_event.cc
@@ -0,0 +1,190 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <seastar/core/future.hh>
+#include <seastar/core/sleep.hh>
+
+#include "messages/MOSDPGLog.h"
+
+#include "common/Formatter.h"
+#include "crimson/osd/pg.h"
+#include "crimson/osd/osd.h"
+#include "crimson/osd/osd_operation_external_tracking.h"
+#include "crimson/osd/osd_operations/peering_event.h"
+#include "crimson/osd/osd_connection_priv.h"
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_osd);
+ }
+}
+
+namespace crimson::osd {
+
+template <class T>
+void PeeringEvent<T>::print(std::ostream &lhs) const
+{
+ lhs << "PeeringEvent("
+ << "from=" << from
+ << " pgid=" << pgid
+ << " sent=" << evt.get_epoch_sent()
+ << " requested=" << evt.get_epoch_requested()
+ << " evt=" << evt.get_desc()
+ << ")";
+}
+
+template <class T>
+void PeeringEvent<T>::dump_detail(Formatter *f) const
+{
+ f->open_object_section("PeeringEvent");
+ f->dump_stream("from") << from;
+ f->dump_stream("pgid") << pgid;
+ f->dump_int("sent", evt.get_epoch_sent());
+ f->dump_int("requested", evt.get_epoch_requested());
+ f->dump_string("evt", evt.get_desc());
+ f->open_array_section("events");
+ {
+ std::apply([f](auto&... events) {
+ (..., events.dump(f));
+ }, static_cast<const T*>(this)->tracking_events);
+ }
+ f->close_section();
+ f->close_section();
+}
+
+
+template <class T>
+PGPeeringPipeline &PeeringEvent<T>::peering_pp(PG &pg)
+{
+ return pg.peering_request_pg_pipeline;
+}
+
+template <class T>
+seastar::future<> PeeringEvent<T>::with_pg(
+ ShardServices &shard_services, Ref<PG> pg)
+{
+ if (!pg) {
+ logger().warn("{}: pg absent, did not create", *this);
+ on_pg_absent(shard_services);
+ that()->get_handle().exit();
+ return complete_rctx_no_pg(shard_services);
+ }
+
+ using interruptor = typename T::interruptor;
+ return interruptor::with_interruption([this, pg, &shard_services] {
+ logger().debug("{}: pg present", *this);
+ return this->template enter_stage<interruptor>(peering_pp(*pg).await_map
+ ).then_interruptible([this, pg] {
+ return this->template with_blocking_event<
+ PG_OSDMapGate::OSDMapBlocker::BlockingEvent
+ >([this, pg](auto &&trigger) {
+ return pg->osdmap_gate.wait_for_map(
+ std::move(trigger), evt.get_epoch_sent());
+ });
+ }).then_interruptible([this, pg](auto) {
+ return this->template enter_stage<interruptor>(peering_pp(*pg).process);
+ }).then_interruptible([this, pg, &shard_services] {
+ return pg->do_peering_event(evt, ctx
+ ).then_interruptible([this, pg, &shard_services] {
+ that()->get_handle().exit();
+ return complete_rctx(shard_services, pg);
+ });
+ }).then_interruptible([pg, &shard_services]()
+ -> typename T::template interruptible_future<> {
+ if (!pg->get_need_up_thru()) {
+ return seastar::now();
+ }
+ return shard_services.send_alive(pg->get_same_interval_since());
+ }).then_interruptible([&shard_services] {
+ return shard_services.send_pg_temp();
+ });
+ }, [this](std::exception_ptr ep) {
+ logger().debug("{}: interrupted with {}", *this, ep);
+ }, pg);
+}
+
+template <class T>
+void PeeringEvent<T>::on_pg_absent(ShardServices &)
+{
+ logger().debug("{}: pg absent, dropping", *this);
+}
+
+template <class T>
+typename PeeringEvent<T>::template interruptible_future<>
+PeeringEvent<T>::complete_rctx(ShardServices &shard_services, Ref<PG> pg)
+{
+ logger().debug("{}: submitting ctx", *this);
+ return shard_services.dispatch_context(
+ pg->get_collection_ref(),
+ std::move(ctx));
+}
+
+ConnectionPipeline &RemotePeeringEvent::get_connection_pipeline()
+{
+ return get_osd_priv(conn.get()).peering_request_conn_pipeline;
+}
+
+void RemotePeeringEvent::on_pg_absent(ShardServices &shard_services)
+{
+ if (auto& e = get_event().get_event();
+ e.dynamic_type() == MQuery::static_type()) {
+ const auto map_epoch =
+ shard_services.get_map()->get_epoch();
+ const auto& q = static_cast<const MQuery&>(e);
+ const pg_info_t empty{spg_t{pgid.pgid, q.query.to}};
+ if (q.query.type == q.query.LOG ||
+ q.query.type == q.query.FULLLOG) {
+ auto m = crimson::make_message<MOSDPGLog>(q.query.from, q.query.to,
+ map_epoch, empty,
+ q.query.epoch_sent);
+ ctx.send_osd_message(q.from.osd, std::move(m));
+ } else {
+ ctx.send_notify(q.from.osd, {q.query.from, q.query.to,
+ q.query.epoch_sent,
+ map_epoch, empty,
+ PastIntervals{}});
+ }
+ }
+}
+
+RemotePeeringEvent::interruptible_future<> RemotePeeringEvent::complete_rctx(
+ ShardServices &shard_services,
+ Ref<PG> pg)
+{
+ if (pg) {
+ return PeeringEvent::complete_rctx(shard_services, pg);
+ } else {
+ return shard_services.dispatch_context_messages(std::move(ctx));
+ }
+}
+
+seastar::future<> RemotePeeringEvent::complete_rctx_no_pg(
+ ShardServices &shard_services)
+{
+ return shard_services.dispatch_context_messages(std::move(ctx));
+}
+
+seastar::future<> LocalPeeringEvent::start()
+{
+ logger().debug("{}: start", *this);
+
+ IRef ref = this;
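+  // optionally defer delivery of the event; `delay` is given in seconds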
+ auto maybe_delay = seastar::now();
+ if (delay) {
+ maybe_delay = seastar::sleep(
+ std::chrono::milliseconds(std::lround(delay * 1000)));
+ }
+ return maybe_delay.then([this] {
+ return with_pg(pg->get_shard_services(), pg);
+ }).finally([ref=std::move(ref)] {
+ logger().debug("{}: complete", *ref);
+ });
+}
+
+
+LocalPeeringEvent::~LocalPeeringEvent() {}
+
+template class PeeringEvent<RemotePeeringEvent>;
+template class PeeringEvent<LocalPeeringEvent>;
+
+}
diff --git a/src/crimson/osd/osd_operations/peering_event.h b/src/crimson/osd/osd_operations/peering_event.h
new file mode 100644
index 000000000..e94caead1
--- /dev/null
+++ b/src/crimson/osd/osd_operations/peering_event.h
@@ -0,0 +1,207 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <iostream>
+#include <seastar/core/future.hh>
+
+#include "crimson/osd/osdmap_gate.h"
+#include "crimson/osd/osd_operation.h"
+#include "osd/osd_types.h"
+#include "osd/PGPeeringEvent.h"
+#include "osd/PeeringState.h"
+
+namespace ceph {
+ class Formatter;
+}
+
+namespace crimson::osd {
+
+class OSD;
+class ShardServices;
+class PG;
+class BackfillRecovery;
+
+ class PGPeeringPipeline {
+ struct AwaitMap : OrderedExclusivePhaseT<AwaitMap> {
+ static constexpr auto type_name = "PeeringEvent::PGPipeline::await_map";
+ } await_map;
+ struct Process : OrderedExclusivePhaseT<Process> {
+ static constexpr auto type_name = "PeeringEvent::PGPipeline::process";
+ } process;
+ template <class T>
+ friend class PeeringEvent;
+ friend class LocalPeeringEvent;
+ friend class RemotePeeringEvent;
+ friend class PGAdvanceMap;
+ friend class BackfillRecovery;
+ };
+
+template <class T>
+class PeeringEvent : public PhasedOperationT<T> {
+ T* that() {
+ return static_cast<T*>(this);
+ }
+ const T* that() const {
+ return static_cast<const T*>(this);
+ }
+
+public:
+ static constexpr OperationTypeCode type = OperationTypeCode::peering_event;
+
+protected:
+ PGPeeringPipeline &peering_pp(PG &pg);
+
+ PeeringCtx ctx;
+ pg_shard_t from;
+ spg_t pgid;
+ float delay = 0;
+ PGPeeringEvent evt;
+
+ const pg_shard_t get_from() const {
+ return from;
+ }
+
+ const spg_t get_pgid() const {
+ return pgid;
+ }
+
+ const PGPeeringEvent &get_event() const {
+ return evt;
+ }
+
+ virtual void on_pg_absent(ShardServices &);
+
+ virtual typename PeeringEvent::template interruptible_future<>
+ complete_rctx(ShardServices &, Ref<PG>);
+
+ virtual seastar::future<> complete_rctx_no_pg(
+ ShardServices &shard_services
+ ) { return seastar::now();}
+
+public:
+ template <typename... Args>
+ PeeringEvent(
+ const pg_shard_t &from, const spg_t &pgid,
+ Args&&... args) :
+ from(from),
+ pgid(pgid),
+ evt(std::forward<Args>(args)...)
+ {}
+ template <typename... Args>
+ PeeringEvent(
+ const pg_shard_t &from, const spg_t &pgid,
+ float delay, Args&&... args) :
+ from(from),
+ pgid(pgid),
+ delay(delay),
+ evt(std::forward<Args>(args)...)
+ {}
+
+ void print(std::ostream &) const final;
+ void dump_detail(ceph::Formatter* f) const final;
+ seastar::future<> with_pg(
+ ShardServices &shard_services, Ref<PG> pg);
+};
+
+class RemotePeeringEvent : public PeeringEvent<RemotePeeringEvent> {
+protected:
+ crimson::net::ConnectionRef conn;
+  // must be after conn due to the ConnectionPipeline's lifetime
+ PipelineHandle handle;
+
+ void on_pg_absent(ShardServices &) final;
+ PeeringEvent::interruptible_future<> complete_rctx(
+ ShardServices &shard_services,
+ Ref<PG> pg) override;
+ seastar::future<> complete_rctx_no_pg(
+ ShardServices &shard_services
+ ) override;
+
+public:
+ class OSDPipeline {
+ struct AwaitActive : OrderedExclusivePhaseT<AwaitActive> {
+ static constexpr auto type_name =
+ "PeeringRequest::OSDPipeline::await_active";
+ } await_active;
+ friend class RemotePeeringEvent;
+ };
+
+ template <typename... Args>
+ RemotePeeringEvent(crimson::net::ConnectionRef conn, Args&&... args) :
+ PeeringEvent(std::forward<Args>(args)...),
+ conn(conn)
+ {}
+
+ std::tuple<
+ StartEvent,
+ ConnectionPipeline::AwaitActive::BlockingEvent,
+ ConnectionPipeline::AwaitMap::BlockingEvent,
+ OSD_OSDMapGate::OSDMapBlocker::BlockingEvent,
+ ConnectionPipeline::GetPG::BlockingEvent,
+ PGMap::PGCreationBlockingEvent,
+ PGPeeringPipeline::AwaitMap::BlockingEvent,
+ PG_OSDMapGate::OSDMapBlocker::BlockingEvent,
+ PGPeeringPipeline::Process::BlockingEvent,
+ OSDPipeline::AwaitActive::BlockingEvent,
+ CompletionEvent
+ > tracking_events;
+
+ static constexpr bool can_create() { return true; }
+ auto get_create_info() { return std::move(evt.create_info); }
+ spg_t get_pgid() const {
+ return pgid;
+ }
+ PipelineHandle &get_handle() { return handle; }
+ epoch_t get_epoch() const { return evt.get_epoch_sent(); }
+
+ ConnectionPipeline &get_connection_pipeline();
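+  // release the connection as a foreign pointer so the operation can be
+  // resubmitted on another shard; finish_remote_submission() re-wraps it
+  // as a local reference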
+ seastar::future<crimson::net::ConnectionFRef> prepare_remote_submission() {
+ assert(conn);
+ return conn.get_foreign(
+ ).then([this](auto f_conn) {
+ conn.reset();
+ return f_conn;
+ });
+ }
+ void finish_remote_submission(crimson::net::ConnectionFRef _conn) {
+ assert(!conn);
+ conn = make_local_shared_foreign(std::move(_conn));
+ }
+};
+
+class LocalPeeringEvent final : public PeeringEvent<LocalPeeringEvent> {
+protected:
+ Ref<PG> pg;
+ PipelineHandle handle;
+
+public:
+ template <typename... Args>
+ LocalPeeringEvent(Ref<PG> pg, Args&&... args) :
+ PeeringEvent(std::forward<Args>(args)...),
+ pg(pg)
+ {}
+
+ seastar::future<> start();
+ virtual ~LocalPeeringEvent();
+
+ PipelineHandle &get_handle() { return handle; }
+
+ std::tuple<
+ StartEvent,
+ PGPeeringPipeline::AwaitMap::BlockingEvent,
+ PG_OSDMapGate::OSDMapBlocker::BlockingEvent,
+ PGPeeringPipeline::Process::BlockingEvent,
+ CompletionEvent
+ > tracking_events;
+};
+
+
+}
+
+#if FMT_VERSION >= 90000
+template <> struct fmt::formatter<crimson::osd::LocalPeeringEvent> : fmt::ostream_formatter {};
+template <> struct fmt::formatter<crimson::osd::RemotePeeringEvent> : fmt::ostream_formatter {};
+template <class T> struct fmt::formatter<crimson::osd::PeeringEvent<T>> : fmt::ostream_formatter {};
+#endif
diff --git a/src/crimson/osd/osd_operations/pg_advance_map.cc b/src/crimson/osd/osd_operations/pg_advance_map.cc
new file mode 100644
index 000000000..3706af810
--- /dev/null
+++ b/src/crimson/osd/osd_operations/pg_advance_map.cc
@@ -0,0 +1,130 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <seastar/core/future.hh>
+
+#include "include/types.h"
+#include "common/Formatter.h"
+#include "crimson/osd/pg.h"
+#include "crimson/osd/osdmap_service.h"
+#include "crimson/osd/shard_services.h"
+#include "crimson/osd/osd_operations/pg_advance_map.h"
+#include "crimson/osd/osd_operation_external_tracking.h"
+#include "osd/PeeringState.h"
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_osd);
+ }
+}
+
+namespace crimson::osd {
+
+PGAdvanceMap::PGAdvanceMap(
+ ShardServices &shard_services, Ref<PG> pg, epoch_t to,
+ PeeringCtx &&rctx, bool do_init)
+ : shard_services(shard_services), pg(pg), to(to),
+ rctx(std::move(rctx)), do_init(do_init)
+{
+ logger().debug("{}: created", *this);
+}
+
+PGAdvanceMap::~PGAdvanceMap() {}
+
+void PGAdvanceMap::print(std::ostream &lhs) const
+{
+ lhs << "PGAdvanceMap("
+ << "pg=" << pg->get_pgid()
+ << " from=" << (from ? *from : -1)
+ << " to=" << to;
+ if (do_init) {
+ lhs << " do_init";
+ }
+ lhs << ")";
+}
+
+void PGAdvanceMap::dump_detail(Formatter *f) const
+{
+ f->open_object_section("PGAdvanceMap");
+ f->dump_stream("pgid") << pg->get_pgid();
+ if (from) {
+ f->dump_int("from", *from);
+ }
+ f->dump_int("to", to);
+ f->dump_bool("do_init", do_init);
+ f->close_section();
+}
+
+PGPeeringPipeline &PGAdvanceMap::peering_pp(PG &pg)
+{
+ return pg.peering_request_pg_pipeline;
+}
+
+seastar::future<> PGAdvanceMap::start()
+{
+ using cached_map_t = OSDMapService::cached_map_t;
+
+ logger().debug("{}: start", *this);
+
+ IRef ref = this;
+ return enter_stage<>(
+ peering_pp(*pg).process
+ ).then([this] {
+ /*
+ * PGAdvanceMap is scheduled at pg creation and when
+ * broadcasting new osdmaps to pgs. We are not able to serialize
+ * between the two different PGAdvanceMap callers since a new pg
+     * will get advanced to the latest osdmap at its creation.
+     * As a result, we may need to adjust the PGAdvanceMap operation's
+     * 'from' epoch.
+ * See: https://tracker.ceph.com/issues/61744
+ */
+ from = pg->get_osdmap_epoch();
+ auto fut = seastar::now();
+ if (do_init) {
+ fut = pg->handle_initialize(rctx
+ ).then([this] {
+ return pg->handle_activate_map(rctx);
+ });
+ }
+ return fut.then([this] {
+ ceph_assert(std::cmp_less_equal(*from, to));
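+      // step through every epoch in (*from, to], fetching and applying one
+      // osdmap at a time so the PG observes each interval change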
+ return seastar::do_for_each(
+ boost::make_counting_iterator(*from + 1),
+ boost::make_counting_iterator(to + 1),
+ [this](epoch_t next_epoch) {
+ logger().debug("{}: start: getting map {}",
+ *this, next_epoch);
+ return shard_services.get_map(next_epoch).then(
+ [this] (cached_map_t&& next_map) {
+ logger().debug("{}: advancing map to {}",
+ *this, next_map->get_epoch());
+ return pg->handle_advance_map(next_map, rctx);
+ });
+ }).then([this] {
+ return pg->handle_activate_map(rctx).then([this] {
+ logger().debug("{}: map activated", *this);
+ if (do_init) {
+ shard_services.pg_created(pg->get_pgid(), pg);
+ logger().info("PGAdvanceMap::start new pg {}", *pg);
+ }
+ return seastar::when_all_succeed(
+ pg->get_need_up_thru()
+ ? shard_services.send_alive(
+ pg->get_same_interval_since())
+ : seastar::now(),
+ shard_services.dispatch_context(
+ pg->get_collection_ref(),
+ std::move(rctx)));
+ });
+ }).then_unpack([this] {
+ logger().debug("{}: sending pg temp", *this);
+ return shard_services.send_pg_temp();
+ });
+ });
+ }).then([this, ref=std::move(ref)] {
+ logger().debug("{}: complete", *this);
+ });
+}
+
+}
diff --git a/src/crimson/osd/osd_operations/pg_advance_map.h b/src/crimson/osd/osd_operations/pg_advance_map.h
new file mode 100644
index 000000000..b712cc12e
--- /dev/null
+++ b/src/crimson/osd/osd_operations/pg_advance_map.h
@@ -0,0 +1,61 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <iostream>
+#include <seastar/core/future.hh>
+
+#include "crimson/osd/osd_operation.h"
+#include "crimson/osd/osd_operations/peering_event.h"
+#include "osd/osd_types.h"
+#include "crimson/common/type_helpers.h"
+
+namespace ceph {
+ class Formatter;
+}
+
+namespace crimson::osd {
+
+class ShardServices;
+class PG;
+
+class PGAdvanceMap : public PhasedOperationT<PGAdvanceMap> {
+public:
+ static constexpr OperationTypeCode type = OperationTypeCode::pg_advance_map;
+
+protected:
+ ShardServices &shard_services;
+ Ref<PG> pg;
+ PipelineHandle handle;
+
+ std::optional<epoch_t> from;
+ epoch_t to;
+
+ PeeringCtx rctx;
+ const bool do_init;
+
+public:
+ PGAdvanceMap(
+ ShardServices &shard_services, Ref<PG> pg, epoch_t to,
+ PeeringCtx &&rctx, bool do_init);
+ ~PGAdvanceMap();
+
+ void print(std::ostream &) const final;
+ void dump_detail(ceph::Formatter *f) const final;
+ seastar::future<> start();
+ PipelineHandle &get_handle() { return handle; }
+
+ std::tuple<
+ PGPeeringPipeline::Process::BlockingEvent
+ > tracking_events;
+
+private:
+ PGPeeringPipeline &peering_pp(PG &pg);
+};
+
+}
+
+#if FMT_VERSION >= 90000
+template <> struct fmt::formatter<crimson::osd::PGAdvanceMap> : fmt::ostream_formatter {};
+#endif
diff --git a/src/crimson/osd/osd_operations/recovery_subrequest.cc b/src/crimson/osd/osd_operations/recovery_subrequest.cc
new file mode 100644
index 000000000..68655b8da
--- /dev/null
+++ b/src/crimson/osd/osd_operations/recovery_subrequest.cc
@@ -0,0 +1,46 @@
+#include <fmt/format.h>
+#include <fmt/ostream.h>
+
+#include "crimson/osd/osd_operations/recovery_subrequest.h"
+#include "crimson/osd/pg.h"
+#include "crimson/osd/osd_connection_priv.h"
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_osd);
+ }
+}
+
+namespace crimson {
+ template <>
+ struct EventBackendRegistry<osd::RecoverySubRequest> {
+ static std::tuple<> get_backends() {
+ return {};
+ }
+ };
+}
+
+namespace crimson::osd {
+
+seastar::future<> RecoverySubRequest::with_pg(
+ ShardServices &shard_services, Ref<PG> pgref)
+{
+ logger().debug("{}: {}", "RecoverySubRequest::with_pg", *this);
+
+ track_event<StartEvent>();
+ IRef opref = this;
+ return interruptor::with_interruption([this, pgref] {
+ return pgref->get_recovery_backend()->handle_recovery_op(m, conn);
+ }, [](std::exception_ptr) {
+ return seastar::now();
+ }, pgref).finally([this, opref, pgref] {
+ track_event<CompletionEvent>();
+ });
+}
+
+ConnectionPipeline &RecoverySubRequest::get_connection_pipeline()
+{
+ return get_osd_priv(conn.get()).peering_request_conn_pipeline;
+}
+
+}
diff --git a/src/crimson/osd/osd_operations/recovery_subrequest.h b/src/crimson/osd/osd_operations/recovery_subrequest.h
new file mode 100644
index 000000000..07c7c95b5
--- /dev/null
+++ b/src/crimson/osd/osd_operations/recovery_subrequest.h
@@ -0,0 +1,81 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "osd/osd_op_util.h"
+#include "crimson/net/Connection.h"
+#include "crimson/osd/osd_operation.h"
+#include "crimson/osd/pg.h"
+#include "crimson/common/type_helpers.h"
+#include "messages/MOSDFastDispatchOp.h"
+
+namespace crimson::osd {
+
+class PG;
+
+class RecoverySubRequest final : public PhasedOperationT<RecoverySubRequest> {
+public:
+ static constexpr OperationTypeCode type =
+ OperationTypeCode::background_recovery_sub;
+
+ RecoverySubRequest(
+ crimson::net::ConnectionRef conn,
+ Ref<MOSDFastDispatchOp>&& m)
+ : conn(conn), m(m) {}
+
+ void print(std::ostream& out) const final
+ {
+ out << *m;
+ }
+
+ void dump_detail(Formatter *f) const final
+ {
+ }
+
+ static constexpr bool can_create() { return false; }
+ spg_t get_pgid() const {
+ return m->get_spg();
+ }
+ PipelineHandle &get_handle() { return handle; }
+ epoch_t get_epoch() const { return m->get_min_epoch(); }
+
+ ConnectionPipeline &get_connection_pipeline();
+ seastar::future<crimson::net::ConnectionFRef> prepare_remote_submission() {
+ assert(conn);
+ return conn.get_foreign(
+ ).then([this](auto f_conn) {
+ conn.reset();
+ return f_conn;
+ });
+ }
+ void finish_remote_submission(crimson::net::ConnectionFRef _conn) {
+ assert(!conn);
+ conn = make_local_shared_foreign(std::move(_conn));
+ }
+
+ seastar::future<> with_pg(
+ ShardServices &shard_services, Ref<PG> pg);
+
+ std::tuple<
+ StartEvent,
+ ConnectionPipeline::AwaitActive::BlockingEvent,
+ ConnectionPipeline::AwaitMap::BlockingEvent,
+ ConnectionPipeline::GetPG::BlockingEvent,
+ PGMap::PGCreationBlockingEvent,
+ OSD_OSDMapGate::OSDMapBlocker::BlockingEvent,
+ CompletionEvent
+ > tracking_events;
+
+private:
+ crimson::net::ConnectionRef conn;
+  // must be after `conn` to ensure the ConnectionPipeline stays alive
+ PipelineHandle handle;
+ Ref<MOSDFastDispatchOp> m;
+};
+
+}
+
+#if FMT_VERSION >= 90000
+template <> struct fmt::formatter<crimson::osd::RecoverySubRequest> : fmt::ostream_formatter {};
+#endif
diff --git a/src/crimson/osd/osd_operations/replicated_request.cc b/src/crimson/osd/osd_operations/replicated_request.cc
new file mode 100644
index 000000000..09217575c
--- /dev/null
+++ b/src/crimson/osd/osd_operations/replicated_request.cc
@@ -0,0 +1,80 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "replicated_request.h"
+
+#include "common/Formatter.h"
+
+#include "crimson/osd/osd.h"
+#include "crimson/osd/osd_connection_priv.h"
+#include "crimson/osd/osd_operation_external_tracking.h"
+#include "crimson/osd/pg.h"
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_osd);
+ }
+}
+
+namespace crimson::osd {
+
+RepRequest::RepRequest(crimson::net::ConnectionRef&& conn,
+ Ref<MOSDRepOp> &&req)
+ : conn{std::move(conn)},
+ req{std::move(req)}
+{}
+
+void RepRequest::print(std::ostream& os) const
+{
+ os << "RepRequest("
+ << "from=" << req->from
+ << " req=" << *req
+ << ")";
+}
+
+void RepRequest::dump_detail(Formatter *f) const
+{
+ f->open_object_section("RepRequest");
+ f->dump_stream("reqid") << req->reqid;
+ f->dump_stream("pgid") << req->get_spg();
+ f->dump_unsigned("map_epoch", req->get_map_epoch());
+ f->dump_unsigned("min_epoch", req->get_min_epoch());
+ f->dump_stream("oid") << req->poid;
+ f->dump_stream("from") << req->from;
+ f->close_section();
+}
+
+ConnectionPipeline &RepRequest::get_connection_pipeline()
+{
+ return get_osd_priv(conn.get()).replicated_request_conn_pipeline;
+}
+
+ClientRequest::PGPipeline &RepRequest::client_pp(PG &pg)
+{
+ return pg.request_pg_pipeline;
+}
+
+seastar::future<> RepRequest::with_pg(
+ ShardServices &shard_services, Ref<PG> pg)
+{
+ logger().debug("{}: RepRequest::with_pg", *this);
+ IRef ref = this;
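+  // gate on the osdmap until the request's min_epoch is available, then hand
+  // the replicated op to the PG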
+ return interruptor::with_interruption([this, pg] {
+ logger().debug("{}: pg present", *this);
+ return this->template enter_stage<interruptor>(client_pp(*pg).await_map
+ ).then_interruptible([this, pg] {
+ return this->template with_blocking_event<
+ PG_OSDMapGate::OSDMapBlocker::BlockingEvent
+ >([this, pg](auto &&trigger) {
+ return pg->osdmap_gate.wait_for_map(
+ std::move(trigger), req->min_epoch);
+ });
+ }).then_interruptible([this, pg] (auto) {
+ return pg->handle_rep_op(req);
+ });
+ }, [ref](std::exception_ptr) {
+ return seastar::now();
+ }, pg);
+}
+
+}
diff --git a/src/crimson/osd/osd_operations/replicated_request.h b/src/crimson/osd/osd_operations/replicated_request.h
new file mode 100644
index 000000000..c742888d9
--- /dev/null
+++ b/src/crimson/osd/osd_operations/replicated_request.h
@@ -0,0 +1,80 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "crimson/net/Connection.h"
+#include "crimson/osd/osdmap_gate.h"
+#include "crimson/osd/osd_operation.h"
+#include "crimson/osd/pg_map.h"
+#include "crimson/osd/osd_operations/client_request.h"
+#include "crimson/common/type_helpers.h"
+#include "messages/MOSDRepOp.h"
+
+namespace ceph {
+ class Formatter;
+}
+
+namespace crimson::osd {
+
+class ShardServices;
+
+class OSD;
+class PG;
+
+class RepRequest final : public PhasedOperationT<RepRequest> {
+public:
+ static constexpr OperationTypeCode type = OperationTypeCode::replicated_request;
+ RepRequest(crimson::net::ConnectionRef&&, Ref<MOSDRepOp>&&);
+
+ void print(std::ostream &) const final;
+ void dump_detail(ceph::Formatter* f) const final;
+
+ static constexpr bool can_create() { return false; }
+ spg_t get_pgid() const {
+ return req->get_spg();
+ }
+ PipelineHandle &get_handle() { return handle; }
+ epoch_t get_epoch() const { return req->get_min_epoch(); }
+
+ ConnectionPipeline &get_connection_pipeline();
+ seastar::future<crimson::net::ConnectionFRef> prepare_remote_submission() {
+ assert(conn);
+ return conn.get_foreign(
+ ).then([this](auto f_conn) {
+ conn.reset();
+ return f_conn;
+ });
+ }
+ void finish_remote_submission(crimson::net::ConnectionFRef _conn) {
+ assert(!conn);
+ conn = make_local_shared_foreign(std::move(_conn));
+ }
+
+ seastar::future<> with_pg(
+ ShardServices &shard_services, Ref<PG> pg);
+
+ std::tuple<
+ StartEvent,
+ ConnectionPipeline::AwaitActive::BlockingEvent,
+ ConnectionPipeline::AwaitMap::BlockingEvent,
+ ConnectionPipeline::GetPG::BlockingEvent,
+ ClientRequest::PGPipeline::AwaitMap::BlockingEvent,
+ PG_OSDMapGate::OSDMapBlocker::BlockingEvent,
+ PGMap::PGCreationBlockingEvent,
+ OSD_OSDMapGate::OSDMapBlocker::BlockingEvent
+ > tracking_events;
+
+private:
+ ClientRequest::PGPipeline &client_pp(PG &pg);
+
+ crimson::net::ConnectionRef conn;
+ PipelineHandle handle;
+ Ref<MOSDRepOp> req;
+};
+
+}
+
+#if FMT_VERSION >= 90000
+template <> struct fmt::formatter<crimson::osd::RepRequest> : fmt::ostream_formatter {};
+#endif
diff --git a/src/crimson/osd/osd_operations/snaptrim_event.cc b/src/crimson/osd/osd_operations/snaptrim_event.cc
new file mode 100644
index 000000000..e4a1b04df
--- /dev/null
+++ b/src/crimson/osd/osd_operations/snaptrim_event.cc
@@ -0,0 +1,569 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "crimson/osd/osd_operations/snaptrim_event.h"
+#include "crimson/osd/ops_executer.h"
+#include "crimson/osd/pg.h"
+#include <seastar/core/sleep.hh>
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_osd);
+ }
+}
+
+namespace crimson {
+ template <>
+ struct EventBackendRegistry<osd::SnapTrimEvent> {
+ static std::tuple<> get_backends() {
+ return {};
+ }
+ };
+
+ template <>
+ struct EventBackendRegistry<osd::SnapTrimObjSubEvent> {
+ static std::tuple<> get_backends() {
+ return {};
+ }
+ };
+}
+
+namespace crimson::osd {
+
+PG::interruptible_future<>
+PG::SnapTrimMutex::lock(SnapTrimEvent &st_event) noexcept
+{
+ return st_event.enter_stage<interruptor>(wait_pg
+ ).then_interruptible([this] {
+ return mutex.lock();
+ });
+}
+
+void SnapTrimEvent::SubOpBlocker::dump_detail(Formatter *f) const
+{
+ f->open_array_section("dependent_operations");
+ {
+ for (const auto &kv : subops) {
+ f->dump_unsigned("op_id", kv.first);
+ }
+ }
+ f->close_section();
+}
+
+template <class... Args>
+void SnapTrimEvent::SubOpBlocker::emplace_back(Args&&... args)
+{
+ subops.emplace_back(std::forward<Args>(args)...);
+};
+
+SnapTrimEvent::remove_or_update_iertr::future<>
+SnapTrimEvent::SubOpBlocker::wait_completion()
+{
+ return interruptor::do_for_each(subops, [](auto&& kv) {
+ return std::move(kv.second);
+ });
+}
+
+void SnapTrimEvent::print(std::ostream &lhs) const
+{
+ lhs << "SnapTrimEvent("
+ << "pgid=" << pg->get_pgid()
+ << " snapid=" << snapid
+ << " needs_pause=" << needs_pause
+ << ")";
+}
+
+void SnapTrimEvent::dump_detail(Formatter *f) const
+{
+ f->open_object_section("SnapTrimEvent");
+ f->dump_stream("pgid") << pg->get_pgid();
+ f->close_section();
+}
+
+SnapTrimEvent::snap_trim_ertr::future<seastar::stop_iteration>
+SnapTrimEvent::start()
+{
+ logger().debug("{}: {}", *this, __func__);
+ return with_pg(
+ pg->get_shard_services(), pg
+ ).finally([ref=IRef{this}, this] {
+ logger().debug("{}: complete", *ref);
+ return handle.complete();
+ });
+}
+
+CommonPGPipeline& SnapTrimEvent::client_pp()
+{
+ return pg->request_pg_pipeline;
+}
+
+SnapTrimEvent::snap_trim_ertr::future<seastar::stop_iteration>
+SnapTrimEvent::with_pg(
+ ShardServices &shard_services, Ref<PG> _pg)
+{
+ return interruptor::with_interruption([&shard_services, this] {
+ return enter_stage<interruptor>(
+ client_pp().wait_for_active
+ ).then_interruptible([this] {
+ return with_blocking_event<PGActivationBlocker::BlockingEvent,
+ interruptor>([this] (auto&& trigger) {
+ return pg->wait_for_active_blocker.wait(std::move(trigger));
+ });
+ }).then_interruptible([this] {
+ return enter_stage<interruptor>(
+ client_pp().recover_missing);
+ }).then_interruptible([] {
+ //return do_recover_missing(pg, get_target_oid());
+ return seastar::now();
+ }).then_interruptible([this] {
+ return enter_stage<interruptor>(
+ client_pp().get_obc);
+ }).then_interruptible([this] {
+ return pg->snaptrim_mutex.lock(*this);
+ }).then_interruptible([this] {
+ return enter_stage<interruptor>(
+ client_pp().process);
+ }).then_interruptible([&shard_services, this] {
+ return interruptor::async([this] {
+ std::vector<hobject_t> to_trim;
+ using crimson::common::local_conf;
+ const auto max =
+ local_conf().get_val<uint64_t>("osd_pg_max_concurrent_snap_trims");
+ // we need to look for at least 1 snaptrim, otherwise we'll misinterpret
+ // the ENOENT below and erase snapid.
+ int r = snap_mapper.get_next_objects_to_trim(
+ snapid,
+ max,
+ &to_trim);
+ if (r == -ENOENT) {
+ to_trim.clear(); // paranoia
+ return to_trim;
+ } else if (r != 0) {
+ logger().error("{}: get_next_objects_to_trim returned {}",
+ *this, cpp_strerror(r));
+ ceph_abort_msg("get_next_objects_to_trim returned an invalid code");
+ } else {
+ assert(!to_trim.empty());
+ }
+ logger().debug("{}: async almost done line {}", *this, __LINE__);
+ return to_trim;
+ }).then_interruptible([&shard_services, this] (const auto& to_trim) {
+ if (to_trim.empty()) {
+ // the legit ENOENT -> done
+ logger().debug("{}: to_trim is empty! Stopping iteration", *this);
+ pg->snaptrim_mutex.unlock();
+ return snap_trim_iertr::make_ready_future<seastar::stop_iteration>(
+ seastar::stop_iteration::yes);
+ }
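+        // spawn one SnapTrimObjSubEvent per object and record each in
+        // subop_blocker so wait_subop can await their completion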
+ return [&shard_services, this](const auto &to_trim) {
+ for (const auto& object : to_trim) {
+ logger().debug("{}: trimming {}", *this, object);
+ auto [op, fut] = shard_services.start_operation_may_interrupt<
+ interruptor, SnapTrimObjSubEvent>(
+ pg,
+ object,
+ snapid);
+ subop_blocker.emplace_back(
+ op->get_id(),
+ std::move(fut)
+ );
+ }
+ return interruptor::now();
+ }(to_trim).then_interruptible([this] {
+ return enter_stage<interruptor>(wait_subop);
+ }).then_interruptible([this] {
+ logger().debug("{}: awaiting completion", *this);
+ return subop_blocker.wait_completion();
+ }).finally([this] {
+ pg->snaptrim_mutex.unlock();
+ }).safe_then_interruptible([this] {
+ if (!needs_pause) {
+ return interruptor::now();
+ }
+      // let operators know we're waiting
+ return enter_stage<interruptor>(
+ wait_trim_timer
+ ).then_interruptible([this] {
+ using crimson::common::local_conf;
+ const auto time_to_sleep =
+ local_conf().template get_val<double>("osd_snap_trim_sleep");
+ logger().debug("{}: time_to_sleep {}", *this, time_to_sleep);
+ // TODO: this logic should be more sophisticated and distinguish
+ // between SSDs, HDDs and the hybrid case
+ return seastar::sleep(
+ std::chrono::milliseconds(std::lround(time_to_sleep * 1000)));
+ });
+ }).safe_then_interruptible([this] {
+ logger().debug("{}: all completed", *this);
+ return snap_trim_iertr::make_ready_future<seastar::stop_iteration>(
+ seastar::stop_iteration::no);
+ });
+ });
+ });
+ }, [this](std::exception_ptr eptr) -> snap_trim_ertr::future<seastar::stop_iteration> {
+ logger().debug("{}: interrupted {}", *this, eptr);
+ return crimson::ct_error::eagain::make();
+ }, pg);
+}
+
+
+CommonPGPipeline& SnapTrimObjSubEvent::client_pp()
+{
+ return pg->request_pg_pipeline;
+}
+
+SnapTrimObjSubEvent::remove_or_update_iertr::future<>
+SnapTrimObjSubEvent::start()
+{
+ logger().debug("{}: start", *this);
+ return with_pg(
+ pg->get_shard_services(), pg
+ ).finally([ref=IRef{this}, this] {
+ logger().debug("{}: complete", *ref);
+ return handle.complete();
+ });
+}
+
+SnapTrimObjSubEvent::remove_or_update_iertr::future<>
+SnapTrimObjSubEvent::remove_clone(
+ ObjectContextRef obc,
+ ObjectContextRef head_obc,
+ ceph::os::Transaction& txn,
+ std::vector<pg_log_entry_t>& log_entries
+) {
+ const auto p = std::find(
+ head_obc->ssc->snapset.clones.begin(),
+ head_obc->ssc->snapset.clones.end(),
+ coid.snap);
+ if (p == head_obc->ssc->snapset.clones.end()) {
+ logger().error("{}: Snap {} not in clones",
+ *this, coid.snap);
+ return crimson::ct_error::enoent::make();
+ }
+ assert(p != head_obc->ssc->snapset.clones.end());
+ snapid_t last = coid.snap;
+ delta_stats.num_bytes -= head_obc->ssc->snapset.get_clone_bytes(last);
+
+ if (p != head_obc->ssc->snapset.clones.begin()) {
+ // not the oldest... merge overlap into next older clone
+ std::vector<snapid_t>::iterator n = p - 1;
+ hobject_t prev_coid = coid;
+ prev_coid.snap = *n;
+
+ // does the classical OSD really need is_present_clone(prev_coid)?
+ delta_stats.num_bytes -= head_obc->ssc->snapset.get_clone_bytes(*n);
+ head_obc->ssc->snapset.clone_overlap[*n].intersection_of(
+ head_obc->ssc->snapset.clone_overlap[*p]);
+ delta_stats.num_bytes += head_obc->ssc->snapset.get_clone_bytes(*n);
+ }
+ delta_stats.num_objects--;
+ if (obc->obs.oi.is_dirty()) {
+ delta_stats.num_objects_dirty--;
+ }
+ if (obc->obs.oi.is_omap()) {
+ delta_stats.num_objects_omap--;
+ }
+ if (obc->obs.oi.is_whiteout()) {
+ logger().debug("{}: trimming whiteout on {}",
+ *this, coid);
+ delta_stats.num_whiteouts--;
+ }
+ delta_stats.num_object_clones--;
+
+ obc->obs.exists = false;
+ head_obc->ssc->snapset.clones.erase(p);
+ head_obc->ssc->snapset.clone_overlap.erase(last);
+ head_obc->ssc->snapset.clone_size.erase(last);
+ head_obc->ssc->snapset.clone_snaps.erase(last);
+
+ log_entries.emplace_back(
+ pg_log_entry_t{
+ pg_log_entry_t::DELETE,
+ coid,
+ osd_op_p.at_version,
+ obc->obs.oi.version,
+ 0,
+ osd_reqid_t(),
+ obc->obs.oi.mtime, // will be replaced in `apply_to()`
+ 0}
+ );
+ txn.remove(
+ pg->get_collection_ref()->get_cid(),
+ ghobject_t{coid, ghobject_t::NO_GEN, shard_id_t::NO_SHARD});
+ obc->obs.oi = object_info_t(coid);
+ return OpsExecuter::snap_map_remove(coid, pg->snap_mapper, pg->osdriver, txn);
+}
+
+void SnapTrimObjSubEvent::remove_head_whiteout(
+ ObjectContextRef obc,
+ ObjectContextRef head_obc,
+ ceph::os::Transaction& txn,
+ std::vector<pg_log_entry_t>& log_entries
+) {
+ // NOTE: this arguably constitutes minor interference with the
+ // tiering agent if this is a cache tier since a snap trim event
+ // is effectively evicting a whiteout we might otherwise want to
+ // keep around.
+ const auto head_oid = coid.get_head();
+ logger().info("{}: {} removing {}",
+ *this, coid, head_oid);
+ log_entries.emplace_back(
+ pg_log_entry_t{
+ pg_log_entry_t::DELETE,
+ head_oid,
+ osd_op_p.at_version,
+ head_obc->obs.oi.version,
+ 0,
+ osd_reqid_t(),
+ obc->obs.oi.mtime, // will be replaced in `apply_to()`
+ 0}
+ );
+ logger().info("{}: remove snap head", *this);
+ object_info_t& oi = head_obc->obs.oi;
+ delta_stats.num_objects--;
+ if (oi.is_dirty()) {
+ delta_stats.num_objects_dirty--;
+ }
+ if (oi.is_omap()) {
+ delta_stats.num_objects_omap--;
+ }
+ if (oi.is_whiteout()) {
+ logger().debug("{}: trimming whiteout on {}",
+ *this, oi.soid);
+ delta_stats.num_whiteouts--;
+ }
+ head_obc->obs.exists = false;
+ head_obc->obs.oi = object_info_t(head_oid);
+ txn.remove(pg->get_collection_ref()->get_cid(),
+ ghobject_t{head_oid, ghobject_t::NO_GEN, shard_id_t::NO_SHARD});
+}
+
+SnapTrimObjSubEvent::interruptible_future<>
+SnapTrimObjSubEvent::adjust_snaps(
+ ObjectContextRef obc,
+ ObjectContextRef head_obc,
+ const std::set<snapid_t>& new_snaps,
+ ceph::os::Transaction& txn,
+ std::vector<pg_log_entry_t>& log_entries
+) {
+ head_obc->ssc->snapset.clone_snaps[coid.snap] =
+ std::vector<snapid_t>(new_snaps.rbegin(), new_snaps.rend());
+
+ // we still do a 'modify' event on this object just to trigger a
+ // snapmapper.update ... :(
+ obc->obs.oi.prior_version = obc->obs.oi.version;
+ obc->obs.oi.version = osd_op_p.at_version;
+ ceph::bufferlist bl;
+ encode(obc->obs.oi,
+ bl,
+ pg->get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
+ txn.setattr(
+ pg->get_collection_ref()->get_cid(),
+ ghobject_t{coid, ghobject_t::NO_GEN, shard_id_t::NO_SHARD},
+ OI_ATTR,
+ bl);
+ log_entries.emplace_back(
+ pg_log_entry_t{
+ pg_log_entry_t::MODIFY,
+ coid,
+ obc->obs.oi.version,
+ obc->obs.oi.prior_version,
+ 0,
+ osd_reqid_t(),
+ obc->obs.oi.mtime,
+ 0}
+ );
+ return OpsExecuter::snap_map_modify(
+ coid, new_snaps, pg->snap_mapper, pg->osdriver, txn);
+}
+
+void SnapTrimObjSubEvent::update_head(
+ ObjectContextRef obc,
+ ObjectContextRef head_obc,
+ ceph::os::Transaction& txn,
+ std::vector<pg_log_entry_t>& log_entries
+) {
+ const auto head_oid = coid.get_head();
+ logger().info("{}: writing updated snapset on {}, snapset is {}",
+ *this, head_oid, head_obc->ssc->snapset);
+ log_entries.emplace_back(
+ pg_log_entry_t{
+ pg_log_entry_t::MODIFY,
+ head_oid,
+ osd_op_p.at_version,
+ head_obc->obs.oi.version,
+ 0,
+ osd_reqid_t(),
+ obc->obs.oi.mtime,
+ 0}
+ );
+
+ head_obc->obs.oi.prior_version = head_obc->obs.oi.version;
+ head_obc->obs.oi.version = osd_op_p.at_version;
+
+ std::map<std::string, ceph::bufferlist, std::less<>> attrs;
+ ceph::bufferlist bl;
+ encode(head_obc->ssc->snapset, bl);
+ attrs[SS_ATTR] = std::move(bl);
+
+ bl.clear();
+ head_obc->obs.oi.encode_no_oid(bl,
+ pg->get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
+ attrs[OI_ATTR] = std::move(bl);
+ txn.setattrs(
+ pg->get_collection_ref()->get_cid(),
+ ghobject_t{head_oid, ghobject_t::NO_GEN, shard_id_t::NO_SHARD},
+ attrs);
+}
+
+SnapTrimObjSubEvent::remove_or_update_iertr::future<
+ SnapTrimObjSubEvent::remove_or_update_ret_t>
+SnapTrimObjSubEvent::remove_or_update(
+ ObjectContextRef obc,
+ ObjectContextRef head_obc)
+{
+ auto citer = head_obc->ssc->snapset.clone_snaps.find(coid.snap);
+ if (citer == head_obc->ssc->snapset.clone_snaps.end()) {
+ logger().error("{}: No clone_snaps in snapset {} for object {}",
+ *this, head_obc->ssc->snapset, coid);
+ return crimson::ct_error::enoent::make();
+ }
+ const auto& old_snaps = citer->second;
+ if (old_snaps.empty()) {
+ logger().error("{}: no object info snaps for object {}",
+ *this, coid);
+ return crimson::ct_error::enoent::make();
+ }
+ if (head_obc->ssc->snapset.seq == 0) {
+ logger().error("{}: no snapset.seq for object {}",
+ *this, coid);
+ return crimson::ct_error::enoent::make();
+ }
+ const OSDMapRef& osdmap = pg->get_osdmap();
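+  // recompute the clone's snap list: keep only snaps that are neither being
+  // trimmed now nor already queued for removal in the osdmap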
+ std::set<snapid_t> new_snaps;
+ for (const auto& old_snap : old_snaps) {
+ if (!osdmap->in_removed_snaps_queue(pg->get_info().pgid.pgid.pool(),
+ old_snap)
+ && old_snap != snap_to_trim) {
+ new_snaps.insert(old_snap);
+ }
+ }
+
+ return seastar::do_with(ceph::os::Transaction{}, [=, this](auto &txn) {
+ std::vector<pg_log_entry_t> log_entries{};
+
+ int64_t num_objects_before_trim = delta_stats.num_objects;
+ osd_op_p.at_version = pg->next_version();
+ auto ret = remove_or_update_iertr::now();
+ if (new_snaps.empty()) {
+ // remove clone from snapset
+ logger().info("{}: {} snaps {} -> {} ... deleting",
+ *this, coid, old_snaps, new_snaps);
+ ret = remove_clone(obc, head_obc, txn, log_entries);
+ } else {
+ // save adjusted snaps for this object
+ logger().info("{}: {} snaps {} -> {}",
+ *this, coid, old_snaps, new_snaps);
+ ret = adjust_snaps(obc, head_obc, new_snaps, txn, log_entries);
+ }
+ return std::move(ret).safe_then_interruptible(
+ [&txn, obc, num_objects_before_trim, log_entries=std::move(log_entries), head_obc=std::move(head_obc), this]() mutable {
+ osd_op_p.at_version = pg->next_version();
+
+ // save head snapset
+ logger().debug("{}: {} new snapset {} on {}",
+ *this, coid, head_obc->ssc->snapset, head_obc->obs.oi);
+ if (head_obc->ssc->snapset.clones.empty() && head_obc->obs.oi.is_whiteout()) {
+ remove_head_whiteout(obc, head_obc, txn, log_entries);
+ } else {
+ update_head(obc, head_obc, txn, log_entries);
+ }
+ // Stats reporting - Set number of objects trimmed
+ if (num_objects_before_trim > delta_stats.num_objects) {
+ //int64_t num_objects_trimmed =
+ // num_objects_before_trim - delta_stats.num_objects;
+ //add_objects_trimmed_count(num_objects_trimmed);
+ }
+ }).safe_then_interruptible(
+ [&txn, log_entries=std::move(log_entries)] () mutable {
+ return remove_or_update_iertr::make_ready_future<remove_or_update_ret_t>(
+ std::make_pair(std::move(txn), std::move(log_entries)));
+ });
+ });
+}
+
+SnapTrimObjSubEvent::remove_or_update_iertr::future<>
+SnapTrimObjSubEvent::with_pg(
+ ShardServices &shard_services, Ref<PG> _pg)
+{
+ return enter_stage<interruptor>(
+ client_pp().wait_for_active
+ ).then_interruptible([this] {
+ return with_blocking_event<PGActivationBlocker::BlockingEvent,
+ interruptor>([this] (auto&& trigger) {
+ return pg->wait_for_active_blocker.wait(std::move(trigger));
+ });
+ }).then_interruptible([this] {
+ return enter_stage<interruptor>(
+ client_pp().recover_missing);
+ }).then_interruptible([] {
+ //return do_recover_missing(pg, get_target_oid());
+ return seastar::now();
+ }).then_interruptible([this] {
+ return enter_stage<interruptor>(
+ client_pp().get_obc);
+ }).then_interruptible([this] {
+ logger().debug("{}: getting obc for {}", *this, coid);
+ // end of commonality
+    // with_clone_obc_direct locks both the clone's and the head's obcs
+ return pg->obc_loader.with_clone_obc_direct<RWState::RWWRITE>(
+ coid,
+ [this](auto head_obc, auto clone_obc) {
+ logger().debug("{}: got clone_obc={}", *this, clone_obc->get_oid());
+ return enter_stage<interruptor>(
+ client_pp().process
+ ).then_interruptible(
+ [this,clone_obc=std::move(clone_obc), head_obc=std::move(head_obc)]() mutable {
+ logger().debug("{}: processing clone_obc={}", *this, clone_obc->get_oid());
+ return remove_or_update(
+ clone_obc, head_obc
+ ).safe_then_unpack_interruptible([clone_obc, this]
+ (auto&& txn, auto&& log_entries) mutable {
+ auto [submitted, all_completed] = pg->submit_transaction(
+ std::move(clone_obc),
+ std::move(txn),
+ std::move(osd_op_p),
+ std::move(log_entries));
+ return submitted.then_interruptible(
+ [all_completed=std::move(all_completed), this] () mutable {
+ return enter_stage<interruptor>(
+ wait_repop
+ ).then_interruptible([all_completed=std::move(all_completed)] () mutable {
+ return std::move(all_completed);
+ });
+ });
+ });
+ });
+ }).handle_error_interruptible(
+ remove_or_update_iertr::pass_further{},
+ crimson::ct_error::assert_all{"unexpected error in SnapTrimObjSubEvent"}
+ );
+ });
+}
+
+void SnapTrimObjSubEvent::print(std::ostream &lhs) const
+{
+ lhs << "SnapTrimObjSubEvent("
+ << "coid=" << coid
+ << " snapid=" << snap_to_trim
+ << ")";
+}
+
+void SnapTrimObjSubEvent::dump_detail(Formatter *f) const
+{
+ f->open_object_section("SnapTrimObjSubEvent");
+ f->dump_stream("coid") << coid;
+ f->close_section();
+}
+
+} // namespace crimson::osd
diff --git a/src/crimson/osd/osd_operations/snaptrim_event.h b/src/crimson/osd/osd_operations/snaptrim_event.h
new file mode 100644
index 000000000..a3a970a04
--- /dev/null
+++ b/src/crimson/osd/osd_operations/snaptrim_event.h
@@ -0,0 +1,210 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <iostream>
+#include <seastar/core/future.hh>
+
+#include "crimson/osd/osdmap_gate.h"
+#include "crimson/osd/osd_operation.h"
+#include "crimson/osd/osd_operations/common/pg_pipeline.h"
+#include "crimson/osd/pg.h"
+#include "crimson/osd/pg_activation_blocker.h"
+#include "osd/osd_types.h"
+#include "osd/PGPeeringEvent.h"
+#include "osd/PeeringState.h"
+
+namespace ceph {
+ class Formatter;
+}
+
+class SnapMapper;
+
+namespace crimson::osd {
+
+class OSD;
+class ShardServices;
+
+// trim up to `max` objects for snapshot `snapid`
+class SnapTrimEvent final : public PhasedOperationT<SnapTrimEvent> {
+public:
+ using remove_or_update_ertr =
+ crimson::errorator<crimson::ct_error::enoent>;
+ using remove_or_update_iertr =
+ crimson::interruptible::interruptible_errorator<
+ IOInterruptCondition, remove_or_update_ertr>;
+ using snap_trim_ertr = remove_or_update_ertr::extend<
+ crimson::ct_error::eagain>;
+ using snap_trim_iertr = remove_or_update_iertr::extend<
+ crimson::ct_error::eagain>;
+
+ static constexpr OperationTypeCode type = OperationTypeCode::snaptrim_event;
+
+ SnapTrimEvent(Ref<PG> pg,
+ SnapMapper& snap_mapper,
+ const snapid_t snapid,
+ const bool needs_pause)
+ : pg(std::move(pg)),
+ snap_mapper(snap_mapper),
+ snapid(snapid),
+ needs_pause(needs_pause) {}
+
+ void print(std::ostream &) const final;
+ void dump_detail(ceph::Formatter* f) const final;
+ snap_trim_ertr::future<seastar::stop_iteration> start();
+ snap_trim_ertr::future<seastar::stop_iteration> with_pg(
+ ShardServices &shard_services, Ref<PG> pg);
+
+private:
+ CommonPGPipeline& client_pp();
+
+  // based on 998cb8c141bb89aafae298a9d5e130fbd78fe5f2
+ struct SubOpBlocker : crimson::BlockerT<SubOpBlocker> {
+ static constexpr const char* type_name = "CompoundOpBlocker";
+
+ using id_done_t = std::pair<crimson::Operation::id_t,
+ remove_or_update_iertr::future<>>;
+
+ void dump_detail(Formatter *f) const final;
+
+ template <class... Args>
+ void emplace_back(Args&&... args);
+
+ remove_or_update_iertr::future<> wait_completion();
+ private:
+ std::vector<id_done_t> subops;
+ } subop_blocker;
+
+ // we don't need to synchronize with other instances of SnapTrimEvent;
+ // it's here for the sake of op tracking.
+ struct WaitSubop : OrderedConcurrentPhaseT<WaitSubop> {
+ static constexpr auto type_name = "SnapTrimEvent::wait_subop";
+ } wait_subop;
+
+  // an instantiator can instruct us to enter this stage and then
+  // wait on the future to implement throttling. It is implemented
+  // this way for the sake of op tracking.
+ struct WaitTrimTimer : OrderedExclusivePhaseT<WaitTrimTimer> {
+ static constexpr auto type_name = "SnapTrimEvent::wait_trim_timer";
+ } wait_trim_timer;
+
+ PipelineHandle handle;
+ Ref<PG> pg;
+ SnapMapper& snap_mapper;
+ const snapid_t snapid;
+ const bool needs_pause;
+
+public:
+ PipelineHandle& get_handle() { return handle; }
+
+ std::tuple<
+ StartEvent,
+ CommonPGPipeline::WaitForActive::BlockingEvent,
+ PGActivationBlocker::BlockingEvent,
+ CommonPGPipeline::RecoverMissing::BlockingEvent,
+ CommonPGPipeline::GetOBC::BlockingEvent,
+ CommonPGPipeline::Process::BlockingEvent,
+ WaitSubop::BlockingEvent,
+ PG::SnapTrimMutex::WaitPG::BlockingEvent,
+ WaitTrimTimer::BlockingEvent,
+ CompletionEvent
+ > tracking_events;
+
+ friend class PG::SnapTrimMutex;
+};
+
+// removes a single object. a SnapTrimEvent can create multiple subrequests.
+// the division of labour is needed because of the restriction that an Op
+// cannot revisit a pipeline stage it has already seen.
+class SnapTrimObjSubEvent : public PhasedOperationT<SnapTrimObjSubEvent> {
+public:
+ using remove_or_update_ertr =
+ crimson::errorator<crimson::ct_error::enoent>;
+ using remove_or_update_iertr =
+ crimson::interruptible::interruptible_errorator<
+ IOInterruptCondition, remove_or_update_ertr>;
+
+ static constexpr OperationTypeCode type =
+ OperationTypeCode::snaptrimobj_subevent;
+
+ SnapTrimObjSubEvent(
+ Ref<PG> pg,
+ const hobject_t& coid,
+ snapid_t snap_to_trim)
+ : pg(std::move(pg)),
+ coid(coid),
+ snap_to_trim(snap_to_trim) {
+ }
+
+ void print(std::ostream &) const final;
+ void dump_detail(ceph::Formatter* f) const final;
+ remove_or_update_iertr::future<> start();
+ remove_or_update_iertr::future<> with_pg(
+ ShardServices &shard_services, Ref<PG> pg);
+
+ CommonPGPipeline& client_pp();
+
+private:
+ object_stat_sum_t delta_stats;
+
+ remove_or_update_iertr::future<> remove_clone(
+ ObjectContextRef obc,
+ ObjectContextRef head_obc,
+ ceph::os::Transaction& txn,
+ std::vector<pg_log_entry_t>& log_entries);
+ void remove_head_whiteout(
+ ObjectContextRef obc,
+ ObjectContextRef head_obc,
+ ceph::os::Transaction& txn,
+ std::vector<pg_log_entry_t>& log_entries);
+ interruptible_future<> adjust_snaps(
+ ObjectContextRef obc,
+ ObjectContextRef head_obc,
+ const std::set<snapid_t>& new_snaps,
+ ceph::os::Transaction& txn,
+ std::vector<pg_log_entry_t>& log_entries);
+ void update_head(
+ ObjectContextRef obc,
+ ObjectContextRef head_obc,
+ ceph::os::Transaction& txn,
+ std::vector<pg_log_entry_t>& log_entries);
+
+ using remove_or_update_ret_t =
+ std::pair<ceph::os::Transaction, std::vector<pg_log_entry_t>>;
+ remove_or_update_iertr::future<remove_or_update_ret_t>
+ remove_or_update(ObjectContextRef obc, ObjectContextRef head_obc);
+
+ // we don't need to synchronize with other instances started by
+ // SnapTrimEvent; it's here for the sake of op tracking.
+ struct WaitRepop : OrderedConcurrentPhaseT<WaitRepop> {
+ static constexpr auto type_name = "SnapTrimObjSubEvent::wait_repop";
+ } wait_repop;
+
+ Ref<PG> pg;
+ PipelineHandle handle;
+ osd_op_params_t osd_op_p;
+ const hobject_t coid;
+ const snapid_t snap_to_trim;
+
+public:
+ PipelineHandle& get_handle() { return handle; }
+
+ std::tuple<
+ StartEvent,
+ CommonPGPipeline::WaitForActive::BlockingEvent,
+ PGActivationBlocker::BlockingEvent,
+ CommonPGPipeline::RecoverMissing::BlockingEvent,
+ CommonPGPipeline::GetOBC::BlockingEvent,
+ CommonPGPipeline::Process::BlockingEvent,
+ WaitRepop::BlockingEvent,
+ CompletionEvent
+ > tracking_events;
+};
+
+} // namespace crimson::osd
+
+#if FMT_VERSION >= 90000
+template <> struct fmt::formatter<crimson::osd::SnapTrimEvent> : fmt::ostream_formatter {};
+template <> struct fmt::formatter<crimson::osd::SnapTrimObjSubEvent> : fmt::ostream_formatter {};
+#endif
diff --git a/src/crimson/osd/osdmap_gate.cc b/src/crimson/osd/osdmap_gate.cc
new file mode 100644
index 000000000..171ec436d
--- /dev/null
+++ b/src/crimson/osd/osdmap_gate.cc
@@ -0,0 +1,86 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "crimson/common/exception.h"
+#include "crimson/osd/osdmap_gate.h"
+#include "crimson/osd/shard_services.h"
+#include "common/Formatter.h"
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_osd);
+ }
+}
+
+namespace crimson::osd {
+
+template <OSDMapGateType OSDMapGateTypeV>
+void OSDMapGate<OSDMapGateTypeV>::OSDMapBlocker::dump_detail(Formatter *f) const
+{
+ f->open_object_section("OSDMapGate");
+ f->dump_int("epoch", epoch);
+ f->close_section();
+}
+
+template <OSDMapGateType OSDMapGateTypeV>
+seastar::future<epoch_t> OSDMapGate<OSDMapGateTypeV>::wait_for_map(
+ typename OSDMapBlocker::BlockingEvent::TriggerI&& trigger,
+ epoch_t epoch,
+ ShardServices *shard_services)
+{
+ if (__builtin_expect(stopping, false)) {
+ return seastar::make_exception_future<epoch_t>(
+ crimson::common::system_shutdown_exception());
+ }
+ if (current >= epoch) {
+ return seastar::make_ready_future<epoch_t>(current);
+ } else {
+ logger().info("evt epoch is {}, i have {}, will wait", epoch, current);
+ auto &blocker = waiting_peering.emplace(
+ epoch, std::make_pair(blocker_type, epoch)).first->second;
+ auto fut = blocker.promise.get_shared_future();
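+    // with access to ShardServices we can also ask for the missing map via
+    // osdmap_subscribe; otherwise just park the waiter on the blocker's promise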
+ if (shard_services) {
+ return trigger.maybe_record_blocking(
+ shard_services->osdmap_subscribe(current, true).then(
+ [fut=std::move(fut)]() mutable {
+ return std::move(fut);
+ }),
+ blocker);
+ } else {
+ return trigger.maybe_record_blocking(std::move(fut), blocker);
+ }
+ }
+}
+
+template <OSDMapGateType OSDMapGateTypeV>
+void OSDMapGate<OSDMapGateTypeV>::got_map(epoch_t epoch) {
+ if (epoch == 0) {
+ return;
+ }
+ ceph_assert(epoch > current);
+ current = epoch;
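+  // wake every waiter whose requested epoch is now covered by the new map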
+ auto first = waiting_peering.begin();
+ auto last = waiting_peering.upper_bound(epoch);
+ std::for_each(first, last, [epoch](auto& blocked_requests) {
+ blocked_requests.second.promise.set_value(epoch);
+ });
+ waiting_peering.erase(first, last);
+}
+
+template <OSDMapGateType OSDMapGateTypeV>
+seastar::future<> OSDMapGate<OSDMapGateTypeV>::stop() {
+ logger().info("osdmap::stop");
+ stopping = true;
+ auto first = waiting_peering.begin();
+ auto last = waiting_peering.end();
+ std::for_each(first, last, [](auto& blocked_requests) {
+ blocked_requests.second.promise.set_exception(
+ crimson::common::system_shutdown_exception());
+ });
+ return seastar::now();
+}
+
+template class OSDMapGate<OSDMapGateType::PG>;
+template class OSDMapGate<OSDMapGateType::OSD>;
+
+} // namespace crimson::osd
diff --git a/src/crimson/osd/osdmap_gate.h b/src/crimson/osd/osdmap_gate.h
new file mode 100644
index 000000000..d76c4b82f
--- /dev/null
+++ b/src/crimson/osd/osdmap_gate.h
@@ -0,0 +1,83 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <functional>
+#include <map>
+#include <optional>
+
+#include <seastar/core/future.hh>
+#include <seastar/core/shared_future.hh>
+
+#include "include/types.h"
+#include "crimson/osd/osd_operation.h"
+
+namespace ceph {
+ class Formatter;
+}
+
+namespace crimson::osd {
+
+class ShardServices;
+
+enum class OSDMapGateType {
+ OSD,
+ PG,
+};
+
+template <OSDMapGateType OSDMapGateTypeV>
+class OSDMapGate {
+public:
+ struct OSDMapBlocker : BlockerT<OSDMapBlocker> {
+ const char * type_name;
+ epoch_t epoch;
+
+ OSDMapBlocker(std::pair<const char *, epoch_t> args)
+ : type_name(args.first), epoch(args.second) {}
+
+ OSDMapBlocker(const OSDMapBlocker &) = delete;
+ OSDMapBlocker(OSDMapBlocker &&) = delete;
+ OSDMapBlocker &operator=(const OSDMapBlocker &) = delete;
+ OSDMapBlocker &operator=(OSDMapBlocker &&) = delete;
+
+ seastar::shared_promise<epoch_t> promise;
+
+ void dump_detail(Formatter *f) const final;
+ };
+ using Blocker = OSDMapBlocker;
+
+private:
+ // order the promises in ascending order of the waited osdmap epoch,
+ // so we can access all the waiters expecting a map whose epoch is less
+ // than or equal to a given epoch
+ using waiting_peering_t = std::map<epoch_t,
+ OSDMapBlocker>;
+ const char *blocker_type;
+ waiting_peering_t waiting_peering;
+ epoch_t current = 0;
+ bool stopping = false;
+public:
+ OSDMapGate(const char *blocker_type)
+ : blocker_type(blocker_type) {}
+
+ /**
+ * wait_for_map
+ *
+   * Wait for an osdmap whose epoch is greater than or equal to the given
+   * epoch. If shard_services is non-null, request the map if not present.
+ */
+ seastar::future<epoch_t>
+ wait_for_map(
+ typename OSDMapBlocker::BlockingEvent::TriggerI&& trigger,
+ epoch_t epoch,
+ ShardServices *shard_services=nullptr
+ );
+ void got_map(epoch_t epoch);
+ seastar::future<> stop();
+};
+
+using OSD_OSDMapGate = OSDMapGate<OSDMapGateType::OSD>;
+using PG_OSDMapGate = OSDMapGate<OSDMapGateType::PG>;
+
+}
diff --git a/src/crimson/osd/osdmap_service.h b/src/crimson/osd/osdmap_service.h
new file mode 100644
index 000000000..017303536
--- /dev/null
+++ b/src/crimson/osd/osdmap_service.h
@@ -0,0 +1,21 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "include/types.h"
+#include "osd/OSDMap.h"
+
+class OSDMap;
+
+class OSDMapService {
+public:
+ using cached_map_t = OSDMapRef;
+ using local_cached_map_t = LocalOSDMapRef;
+
+ virtual ~OSDMapService() = default;
+ virtual seastar::future<cached_map_t> get_map(epoch_t e) = 0;
+ /// get the latest map
+ virtual cached_map_t get_map() const = 0;
+ virtual epoch_t get_up_epoch() const = 0;
+};
diff --git a/src/crimson/osd/pg.cc b/src/crimson/osd/pg.cc
new file mode 100644
index 000000000..7cf3b158c
--- /dev/null
+++ b/src/crimson/osd/pg.cc
@@ -0,0 +1,1544 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab expandtab
+
+#include "pg.h"
+
+#include <functional>
+
+#include <boost/range/adaptor/filtered.hpp>
+#include <boost/range/adaptor/map.hpp>
+#include <boost/range/adaptor/transformed.hpp>
+#include <boost/range/algorithm/copy.hpp>
+#include <boost/range/algorithm/max_element.hpp>
+#include <boost/range/numeric.hpp>
+#include <fmt/format.h>
+#include <fmt/ostream.h>
+
+#include "common/hobject_fmt.h"
+
+#include "messages/MOSDOp.h"
+#include "messages/MOSDOpReply.h"
+#include "messages/MOSDRepOp.h"
+#include "messages/MOSDRepOpReply.h"
+
+#include "osd/OSDMap.h"
+#include "osd/osd_types_fmt.h"
+
+#include "os/Transaction.h"
+
+#include "crimson/common/exception.h"
+#include "crimson/net/Connection.h"
+#include "crimson/net/Messenger.h"
+#include "crimson/os/cyanstore/cyan_store.h"
+#include "crimson/os/futurized_collection.h"
+#include "crimson/osd/exceptions.h"
+#include "crimson/osd/pg_meta.h"
+#include "crimson/osd/pg_backend.h"
+#include "crimson/osd/ops_executer.h"
+#include "crimson/osd/osd_operations/osdop_params.h"
+#include "crimson/osd/osd_operations/peering_event.h"
+#include "crimson/osd/osd_operations/background_recovery.h"
+#include "crimson/osd/osd_operations/snaptrim_event.h"
+#include "crimson/osd/pg_recovery.h"
+#include "crimson/osd/replicated_recovery_backend.h"
+#include "crimson/osd/watch.h"
+
+using std::ostream;
+using std::set;
+using std::string;
+using std::vector;
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_osd);
+ }
+}
+
+namespace std::chrono {
+std::ostream& operator<<(std::ostream& out, const signedspan& d)
+{
+ auto s = std::chrono::duration_cast<std::chrono::seconds>(d).count();
+ auto ns = std::abs((d % 1s).count());
+ fmt::print(out, "{}{}s", s, ns ? fmt::format(".{:0>9}", ns) : "");
+ return out;
+}
+}
+
+template <typename T>
+struct fmt::formatter<std::optional<T>> : fmt::formatter<T> {
+ template <typename FormatContext>
+ auto format(const std::optional<T>& v, FormatContext& ctx) const {
+ if (v.has_value()) {
+ return fmt::formatter<T>::format(*v, ctx);
+ }
+ return fmt::format_to(ctx.out(), "<null>");
+ }
+};
+
+namespace crimson::osd {
+
+using crimson::common::local_conf;
+
+class RecoverablePredicate : public IsPGRecoverablePredicate {
+public:
+ bool operator()(const set<pg_shard_t> &have) const override {
+ return !have.empty();
+ }
+};
+
+class ReadablePredicate: public IsPGReadablePredicate {
+ pg_shard_t whoami;
+public:
+ explicit ReadablePredicate(pg_shard_t whoami) : whoami(whoami) {}
+ bool operator()(const set<pg_shard_t> &have) const override {
+ return have.count(whoami);
+ }
+};
+
+PG::PG(
+ spg_t pgid,
+ pg_shard_t pg_shard,
+ crimson::os::CollectionRef coll_ref,
+ pg_pool_t&& pool,
+ std::string&& name,
+ cached_map_t osdmap,
+ ShardServices &shard_services,
+ ec_profile_t profile)
+ : pgid{pgid},
+ pg_whoami{pg_shard},
+ coll_ref{coll_ref},
+ pgmeta_oid{pgid.make_pgmeta_oid()},
+ osdmap_gate("PG::osdmap_gate"),
+ shard_services{shard_services},
+ backend(
+ PGBackend::create(
+ pgid.pgid,
+ pg_shard,
+ pool,
+ coll_ref,
+ shard_services,
+ profile,
+ *this)),
+ recovery_backend(
+ std::make_unique<ReplicatedRecoveryBackend>(
+ *this, shard_services, coll_ref, backend.get())),
+ recovery_handler(
+ std::make_unique<PGRecovery>(this)),
+ peering_state(
+ shard_services.get_cct(),
+ pg_shard,
+ pgid,
+ PGPool(
+ osdmap,
+ pgid.pool(),
+ pool,
+ name),
+ osdmap,
+ this,
+ this),
+ obc_registry{
+ local_conf()},
+ obc_loader{
+ obc_registry,
+ *backend.get(),
+ *this},
+ osdriver(
+ &shard_services.get_store(),
+ coll_ref,
+ pgid.make_pgmeta_oid()),
+ snap_mapper(
+ this->shard_services.get_cct(),
+ &osdriver,
+ pgid.ps(),
+ pgid.get_split_bits(pool.get_pg_num()),
+ pgid.pool(),
+ pgid.shard),
+ wait_for_active_blocker(this)
+{
+ peering_state.set_backend_predicates(
+ new ReadablePredicate(pg_whoami),
+ new RecoverablePredicate());
+ osdmap_gate.got_map(osdmap->get_epoch());
+}
+
+PG::~PG() {}
+
+void PG::check_blocklisted_watchers()
+{
+ logger().debug("{}", __func__);
+ obc_registry.for_each([this](ObjectContextRef obc) {
+ assert(obc);
+ for (const auto& [key, watch] : obc->watchers) {
+ assert(watch->get_pg() == this);
+ const auto& ea = watch->get_peer_addr();
+ logger().debug("watch: Found {} cookie {}. Checking entity_add_t {}",
+ watch->get_entity(), watch->get_cookie(), ea);
+ if (get_osdmap()->is_blocklisted(ea)) {
+ logger().info("watch: Found blocklisted watcher for {}", ea);
+ watch->do_watch_timeout();
+ }
+ }
+ });
+}
+
+bool PG::try_flush_or_schedule_async() {
+ logger().debug("PG::try_flush_or_schedule_async: flush ...");
+ (void)shard_services.get_store().flush(
+ coll_ref
+ ).then(
+ [this, epoch=get_osdmap_epoch()]() {
+ return shard_services.start_operation<LocalPeeringEvent>(
+ this,
+ pg_whoami,
+ pgid,
+ epoch,
+ epoch,
+ PeeringState::IntervalFlush());
+ });
+ return false;
+}
+
+void PG::publish_stats_to_osd()
+{
+ if (!is_primary())
+ return;
+ if (auto new_pg_stats = peering_state.prepare_stats_for_publish(
+ pg_stats,
+ object_stat_collection_t());
+ new_pg_stats.has_value()) {
+ pg_stats = std::move(new_pg_stats);
+ }
+}
+
+void PG::clear_publish_stats()
+{
+ pg_stats.reset();
+}
+
+pg_stat_t PG::get_stats() const
+{
+ return pg_stats.value_or(pg_stat_t{});
+}
+
+void PG::queue_check_readable(epoch_t last_peering_reset, ceph::timespan delay)
+{
+ // handle the peering event in the background
+ logger().debug(
+ "{}: PG::queue_check_readable lpr: {}, delay: {}",
+ *this, last_peering_reset, delay);
+ check_readable_timer.cancel();
+ check_readable_timer.set_callback([last_peering_reset, this] {
+ logger().debug(
+ "{}: PG::queue_check_readable callback lpr: {}",
+ *this, last_peering_reset);
+ (void) shard_services.start_operation<LocalPeeringEvent>(
+ this,
+ pg_whoami,
+ pgid,
+ last_peering_reset,
+ last_peering_reset,
+ PeeringState::CheckReadable{});
+ });
+ check_readable_timer.arm(
+ std::chrono::duration_cast<seastar::lowres_clock::duration>(delay));
+}
+
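+// Re-evaluate PG_STATE_WAIT and PG_STATE_LAGGY against the current monotonic
+// clock: either re-arm the check-readable timer or clear the state bits and
+// republish stats once the readable bounds have been reached.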
+void PG::recheck_readable()
+{
+ bool changed = false;
+ const auto mnow = shard_services.get_mnow();
+ if (peering_state.state_test(PG_STATE_WAIT)) {
+ auto prior_readable_until_ub = peering_state.get_prior_readable_until_ub();
+ if (mnow < prior_readable_until_ub) {
+ logger().info(
+ "{}: {} will wait (mnow {} < prior_readable_until_ub {})",
+ *this, __func__, mnow, prior_readable_until_ub);
+ queue_check_readable(
+ peering_state.get_last_peering_reset(),
+ prior_readable_until_ub - mnow);
+ } else {
+ logger().info(
+ "{}:{} no longer wait (mnow {} >= prior_readable_until_ub {})",
+ *this, __func__, mnow, prior_readable_until_ub);
+ peering_state.state_clear(PG_STATE_WAIT);
+ peering_state.clear_prior_readable_until_ub();
+ changed = true;
+ }
+ }
+ if (peering_state.state_test(PG_STATE_LAGGY)) {
+ auto readable_until = peering_state.get_readable_until();
+ if (readable_until == readable_until.zero()) {
+ logger().info(
+ "{}:{} still laggy (mnow {}, readable_until zero)",
+ *this, __func__, mnow);
+ } else if (mnow >= readable_until) {
+ logger().info(
+ "{}:{} still laggy (mnow {} >= readable_until {})",
+ *this, __func__, mnow, readable_until);
+ } else {
+ logger().info(
+ "{}:{} no longer laggy (mnow {} < readable_until {})",
+ *this, __func__, mnow, readable_until);
+ peering_state.state_clear(PG_STATE_LAGGY);
+ changed = true;
+ }
+ }
+ if (changed) {
+ publish_stats_to_osd();
+ if (!peering_state.state_test(PG_STATE_WAIT) &&
+ !peering_state.state_test(PG_STATE_LAGGY)) {
+ // TODO: requeue ops waiting for readable
+ }
+ }
+}
+
+unsigned PG::get_target_pg_log_entries() const
+{
+ const unsigned local_num_pgs = shard_services.get_num_local_pgs();
+ const unsigned local_target =
+ local_conf().get_val<uint64_t>("osd_target_pg_log_entries_per_osd") /
+ seastar::smp::count;
+ const unsigned min_pg_log_entries =
+ local_conf().get_val<uint64_t>("osd_min_pg_log_entries");
+ if (local_num_pgs > 0 && local_target > 0) {
+ // target an even spread of our budgeted log entries across all
+ // PGs. note that while we only get to control the entry count
+ // for primary PGs, we'll normally be responsible for a mix of
+ // primary and replica PGs (for the same pool(s) even), so this
+ // will work out.
+ const unsigned max_pg_log_entries =
+ local_conf().get_val<uint64_t>("osd_max_pg_log_entries");
+ return std::clamp(local_target / local_num_pgs,
+ min_pg_log_entries,
+ max_pg_log_entries);
+ } else {
+ // fall back to a per-pg value.
+ return min_pg_log_entries;
+ }
+}
+
+void PG::on_removal(ceph::os::Transaction &t) {
+ t.register_on_commit(
+ new LambdaContext(
+ [this](int r) {
+ ceph_assert(r == 0);
+ (void)shard_services.start_operation<LocalPeeringEvent>(
+ this, pg_whoami, pgid, float(0.001), get_osdmap_epoch(),
+ get_osdmap_epoch(), PeeringState::DeleteSome());
+ }));
+}
+
+void PG::on_activate(interval_set<snapid_t> snaps)
+{
+ logger().debug("{}: {} snaps={}", *this, __func__, snaps);
+ snap_trimq = std::move(snaps);
+ projected_last_update = peering_state.get_info().last_update;
+}
+
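+// Called when activation completes: unblock operations waiting for the PG to
+// become active and queue the follow-up peering event (recovery, backfill, or
+// AllReplicasRecovered) that matches the peering state.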
+void PG::on_activate_complete()
+{
+ wait_for_active_blocker.unblock();
+
+ if (peering_state.needs_recovery()) {
+ logger().info("{}: requesting recovery",
+ __func__);
+ (void) shard_services.start_operation<LocalPeeringEvent>(
+ this,
+ pg_whoami,
+ pgid,
+ float(0.001),
+ get_osdmap_epoch(),
+ get_osdmap_epoch(),
+ PeeringState::DoRecovery{});
+ } else if (peering_state.needs_backfill()) {
+ logger().info("{}: requesting backfill",
+ __func__);
+ (void) shard_services.start_operation<LocalPeeringEvent>(
+ this,
+ pg_whoami,
+ pgid,
+ float(0.001),
+ get_osdmap_epoch(),
+ get_osdmap_epoch(),
+ PeeringState::RequestBackfill{});
+ } else {
+ logger().debug("{}: no need to recover or backfill, AllReplicasRecovered",
+ " for pg: {}", __func__, pgid);
+ (void) shard_services.start_operation<LocalPeeringEvent>(
+ this,
+ pg_whoami,
+ pgid,
+ float(0.001),
+ get_osdmap_epoch(),
+ get_osdmap_epoch(),
+ PeeringState::AllReplicasRecovered{});
+ }
+ publish_stats_to_osd();
+}
+
+void PG::prepare_write(pg_info_t &info,
+ pg_info_t &last_written_info,
+ PastIntervals &past_intervals,
+ PGLog &pglog,
+ bool dirty_info,
+ bool dirty_big_info,
+ bool need_write_epoch,
+ ceph::os::Transaction &t)
+{
+ std::map<string,bufferlist> km;
+ std::string key_to_remove;
+ if (dirty_big_info || dirty_info) {
+ int ret = prepare_info_keymap(
+ shard_services.get_cct(),
+ &km,
+ &key_to_remove,
+ get_osdmap_epoch(),
+ info,
+ last_written_info,
+ past_intervals,
+ dirty_big_info,
+ need_write_epoch,
+ true,
+ nullptr,
+ this);
+ ceph_assert(ret == 0);
+ }
+ pglog.write_log_and_missing(
+ t, &km, coll_ref->get_cid(), pgmeta_oid,
+ peering_state.get_pgpool().info.require_rollback());
+ if (!km.empty()) {
+ t.omap_setkeys(coll_ref->get_cid(), pgmeta_oid, km);
+ }
+ if (!key_to_remove.empty()) {
+ t.omap_rmkey(coll_ref->get_cid(), pgmeta_oid, key_to_remove);
+ }
+}
+
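+// One step of PG deletion: remove up to osd_target_transaction_size objects
+// from the collection and requeue DeleteSome for the rest; once the listing
+// comes back empty, drop the pgmeta object and the collection itself and
+// deregister the PG from the shard services.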
+std::pair<ghobject_t, bool>
+PG::do_delete_work(ceph::os::Transaction &t, ghobject_t _next)
+{
+ logger().info("removing pg {}", pgid);
+ auto fut = interruptor::make_interruptible(
+ shard_services.get_store().list_objects(
+ coll_ref,
+ _next,
+ ghobject_t::get_max(),
+ local_conf()->osd_target_transaction_size));
+
+ auto [objs_to_rm, next] = fut.get();
+ if (objs_to_rm.empty()) {
+ logger().info("all objs removed, removing coll for {}", pgid);
+ t.remove(coll_ref->get_cid(), pgmeta_oid);
+ t.remove_collection(coll_ref->get_cid());
+ (void) shard_services.get_store().do_transaction(
+ coll_ref, std::move(t)).then([this] {
+ return shard_services.remove_pg(pgid);
+ });
+ return {next, false};
+ } else {
+ for (auto &obj : objs_to_rm) {
+ if (obj == pgmeta_oid) {
+ continue;
+ }
+ logger().trace("pg {}, removing obj {}", pgid, obj);
+ t.remove(coll_ref->get_cid(), obj);
+ }
+ t.register_on_commit(
+ new LambdaContext([this](int r) {
+ ceph_assert(r == 0);
+ logger().trace("triggering more pg delete {}", pgid);
+ (void) shard_services.start_operation<LocalPeeringEvent>(
+ this,
+ pg_whoami,
+ pgid,
+ float(0.001),
+ get_osdmap_epoch(),
+ get_osdmap_epoch(),
+ PeeringState::DeleteSome{});
+ }));
+ return {next, true};
+ }
+}
+
+Context *PG::on_clean()
+{
+ // Not needed yet (will be needed for IO unblocking)
+ return nullptr;
+}
+
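+// Drive snap trimming from the active map: pop one snap at a time off
+// snap_trimq and run a SnapTrimEvent for it, stopping on ENOENT (marking
+// SNAPTRIM_ERROR) and retrying the same snap on EAGAIN.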
+void PG::on_active_actmap()
+{
+ logger().debug("{}: {} snap_trimq={}", *this, __func__, snap_trimq);
+ peering_state.state_clear(PG_STATE_SNAPTRIM_ERROR);
+ // loops until snap_trimq is empty or SNAPTRIM_ERROR.
+ std::ignore = seastar::do_until(
+ [this] { return snap_trimq.empty()
+ || peering_state.state_test(PG_STATE_SNAPTRIM_ERROR);
+ },
+ [this] {
+ peering_state.state_set(PG_STATE_SNAPTRIM);
+ publish_stats_to_osd();
+ const auto to_trim = snap_trimq.range_start();
+ snap_trimq.erase(to_trim);
+ const auto needs_pause = !snap_trimq.empty();
+ return seastar::repeat([to_trim, needs_pause, this] {
+ logger().debug("{}: going to start SnapTrimEvent, to_trim={}",
+ *this, to_trim);
+ return shard_services.start_operation<SnapTrimEvent>(
+ this,
+ snap_mapper,
+ to_trim,
+ needs_pause
+ ).second.handle_error(
+ crimson::ct_error::enoent::handle([this] {
+            logger().error("{}: saw ENOENT, trimming stopped", *this);
+ peering_state.state_set(PG_STATE_SNAPTRIM_ERROR);
+ publish_stats_to_osd();
+ return seastar::make_ready_future<seastar::stop_iteration>(
+ seastar::stop_iteration::yes);
+ }), crimson::ct_error::eagain::handle([this] {
+            logger().info("{}: saw EAGAIN, trimming restarted", *this);
+ return seastar::make_ready_future<seastar::stop_iteration>(
+ seastar::stop_iteration::no);
+ })
+ );
+ }).then([this, trimmed=to_trim] {
+ logger().debug("{}: trimmed snap={}", *this, trimmed);
+ });
+ }).finally([this] {
+ logger().debug("{}: PG::on_active_actmap() finished trimming",
+ *this);
+ peering_state.state_clear(PG_STATE_SNAPTRIM);
+ peering_state.state_clear(PG_STATE_SNAPTRIM_ERROR);
+ publish_stats_to_osd();
+ });
+}
+
+void PG::on_active_advmap(const OSDMapRef &osdmap)
+{
+ const auto new_removed_snaps = osdmap->get_new_removed_snaps();
+ if (auto it = new_removed_snaps.find(get_pgid().pool());
+ it != new_removed_snaps.end()) {
+ bool bad = false;
+ for (auto j : it->second) {
+ if (snap_trimq.intersects(j.first, j.second)) {
+ decltype(snap_trimq) added, overlap;
+ added.insert(j.first, j.second);
+ overlap.intersection_of(snap_trimq, added);
+ logger().error("{}: {} removed_snaps already contains {}",
+ *this, __func__, overlap);
+ bad = true;
+ snap_trimq.union_of(added);
+ } else {
+ snap_trimq.insert(j.first, j.second);
+ }
+ }
+ logger().info("{}: {} new removed snaps {}, snap_trimq now{}",
+ *this, __func__, it->second, snap_trimq);
+ assert(!bad || !local_conf().get_val<bool>("osd_debug_verify_cached_snaps"));
+ }
+}
+
+void PG::scrub_requested(scrub_level_t scrub_level, scrub_type_t scrub_type)
+{
+ // TODO: should update the stats upon finishing the scrub
+ peering_state.update_stats([scrub_level, this](auto& history, auto& stats) {
+ const utime_t now = ceph_clock_now();
+ history.last_scrub = peering_state.get_info().last_update;
+ history.last_scrub_stamp = now;
+ history.last_clean_scrub_stamp = now;
+ if (scrub_level == scrub_level_t::deep) {
+ history.last_deep_scrub = history.last_scrub;
+ history.last_deep_scrub_stamp = now;
+ }
+ // yes, please publish the stats
+ return true;
+ });
+}
+
+void PG::log_state_enter(const char *state) {
+ logger().info("Entering state: {}", state);
+}
+
+void PG::log_state_exit(
+ const char *state_name, utime_t enter_time,
+ uint64_t events, utime_t event_dur) {
+ logger().info(
+ "Exiting state: {}, entered at {}, {} spent on {} events",
+ state_name,
+ enter_time,
+ event_dur,
+ events);
+}
+
+ceph::signedspan PG::get_mnow() const
+{
+ return shard_services.get_mnow();
+}
+
+HeartbeatStampsRef PG::get_hb_stamps(int peer)
+{
+ return shard_services.get_hb_stamps(peer);
+}
+
+void PG::schedule_renew_lease(epoch_t last_peering_reset, ceph::timespan delay)
+{
+ // handle the peering event in the background
+ renew_lease_timer.cancel();
+ renew_lease_timer.set_callback([last_peering_reset, this] {
+ (void) shard_services.start_operation<LocalPeeringEvent>(
+ this,
+ pg_whoami,
+ pgid,
+ last_peering_reset,
+ last_peering_reset,
+ RenewLease{});
+ });
+ renew_lease_timer.arm(
+ std::chrono::duration_cast<seastar::lowres_clock::duration>(delay));
+}
+
+
+void PG::init(
+ int role,
+ const vector<int>& newup, int new_up_primary,
+ const vector<int>& newacting, int new_acting_primary,
+ const pg_history_t& history,
+ const PastIntervals& pi,
+ ObjectStore::Transaction &t)
+{
+ peering_state.init(
+ role, newup, new_up_primary, newacting,
+ new_acting_primary, history, pi, t);
+}
+
+seastar::future<> PG::read_state(crimson::os::FuturizedStore::Shard* store)
+{
+ if (__builtin_expect(stopping, false)) {
+ return seastar::make_exception_future<>(
+ crimson::common::system_shutdown_exception());
+ }
+
+ return seastar::do_with(PGMeta(*store, pgid), [] (auto& pg_meta) {
+ return pg_meta.load();
+ }).then([this, store](auto&& ret) {
+ auto [pg_info, past_intervals] = std::move(ret);
+ return peering_state.init_from_disk_state(
+ std::move(pg_info),
+ std::move(past_intervals),
+ [this, store] (PGLog &pglog) {
+ return pglog.read_log_and_missing_crimson(
+ *store,
+ coll_ref,
+ peering_state.get_info(),
+ pgmeta_oid);
+ });
+ }).then([this]() {
+ int primary, up_primary;
+ vector<int> acting, up;
+ peering_state.get_osdmap()->pg_to_up_acting_osds(
+ pgid.pgid, &up, &up_primary, &acting, &primary);
+ peering_state.init_primary_up_acting(
+ up,
+ acting,
+ up_primary,
+ primary);
+ int rr = OSDMap::calc_pg_role(pg_whoami, acting);
+ peering_state.set_role(rr);
+
+ epoch_t epoch = get_osdmap_epoch();
+ (void) shard_services.start_operation<LocalPeeringEvent>(
+ this,
+ pg_whoami,
+ pgid,
+ epoch,
+ epoch,
+ PeeringState::Initialize());
+
+ return seastar::now();
+ });
+}
+
+PG::interruptible_future<> PG::do_peering_event(
+ PGPeeringEvent& evt, PeeringCtx &rctx)
+{
+ if (peering_state.pg_has_reset_since(evt.get_epoch_requested()) ||
+ peering_state.pg_has_reset_since(evt.get_epoch_sent())) {
+ logger().debug("{} ignoring {} -- pg has reset", __func__, evt.get_desc());
+ return interruptor::now();
+ } else {
+ logger().debug("{} handling {} for pg: {}", __func__, evt.get_desc(), pgid);
+ // all peering event handling needs to be run in a dedicated seastar::thread,
+ // so that event processing can involve I/O reqs freely, for example: PG::on_removal,
+ // PG::on_new_interval
+ return interruptor::async([this, &evt, &rctx] {
+ peering_state.handle_event(
+ evt.get_event(),
+ &rctx);
+ peering_state.write_if_dirty(rctx.transaction);
+ });
+ }
+}
+
+seastar::future<> PG::handle_advance_map(
+ cached_map_t next_map, PeeringCtx &rctx)
+{
+ return seastar::async([this, next_map=std::move(next_map), &rctx] {
+ vector<int> newup, newacting;
+ int up_primary, acting_primary;
+ next_map->pg_to_up_acting_osds(
+ pgid.pgid,
+ &newup, &up_primary,
+ &newacting, &acting_primary);
+ peering_state.advance_map(
+ next_map,
+ peering_state.get_osdmap(),
+ newup,
+ up_primary,
+ newacting,
+ acting_primary,
+ rctx);
+ osdmap_gate.got_map(next_map->get_epoch());
+ });
+}
+
+seastar::future<> PG::handle_activate_map(PeeringCtx &rctx)
+{
+ return seastar::async([this, &rctx] {
+ peering_state.activate_map(rctx);
+ });
+}
+
+seastar::future<> PG::handle_initialize(PeeringCtx &rctx)
+{
+ return seastar::async([this, &rctx] {
+ peering_state.handle_event(PeeringState::Initialize{}, &rctx);
+ });
+}
+
+
+void PG::print(ostream& out) const
+{
+ out << peering_state << " ";
+}
+
+void PG::dump_primary(Formatter* f)
+{
+ peering_state.dump_peering_state(f);
+
+ f->open_array_section("recovery_state");
+ PeeringState::QueryState q(f);
+ peering_state.handle_event(q, 0);
+ f->close_section();
+
+ // TODO: snap_trimq
+ // TODO: scrubber state
+ // TODO: agent state
+}
+
+std::ostream& operator<<(std::ostream& os, const PG& pg)
+{
+ os << " pg_epoch " << pg.get_osdmap_epoch() << " ";
+ pg.print(os);
+ return os;
+}
+
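+// Append the log entries for a client write and hand the transaction to the
+// backend. Returns a pair of futures: the first resolves once the repop has
+// been submitted, the second once all replicas have acknowledged, at which
+// point peer last_complete_ondisk values are updated and the write completed.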
+std::tuple<PG::interruptible_future<>,
+ PG::interruptible_future<>>
+PG::submit_transaction(
+ ObjectContextRef&& obc,
+ ceph::os::Transaction&& txn,
+ osd_op_params_t&& osd_op_p,
+ std::vector<pg_log_entry_t>&& log_entries)
+{
+ if (__builtin_expect(stopping, false)) {
+ return {seastar::make_exception_future<>(
+ crimson::common::system_shutdown_exception()),
+ seastar::now()};
+ }
+
+ epoch_t map_epoch = get_osdmap_epoch();
+ ceph_assert(!has_reset_since(osd_op_p.at_version.epoch));
+
+ peering_state.pre_submit_op(obc->obs.oi.soid, log_entries, osd_op_p.at_version);
+ peering_state.append_log_with_trim_to_updated(std::move(log_entries), osd_op_p.at_version,
+ txn, true, false);
+
+ auto [submitted, all_completed] = backend->mutate_object(
+ peering_state.get_acting_recovery_backfill(),
+ std::move(obc),
+ std::move(txn),
+ std::move(osd_op_p),
+ peering_state.get_last_peering_reset(),
+ map_epoch,
+ std::move(log_entries));
+ return std::make_tuple(std::move(submitted), all_completed.then_interruptible(
+ [this, last_complete=peering_state.get_info().last_complete,
+ at_version=osd_op_p.at_version](auto acked) {
+ for (const auto& peer : acked) {
+ peering_state.update_peer_last_complete_ondisk(
+ peer.shard, peer.last_complete_ondisk);
+ }
+ peering_state.complete_write(at_version, last_complete);
+ return seastar::now();
+ }));
+}
+
+PG::interruptible_future<> PG::repair_object(
+ const hobject_t& oid,
+ eversion_t& v)
+{
+ // see also PrimaryLogPG::rep_repair_primary_object()
+ assert(is_primary());
+ logger().debug("{}: {} peers osd.{}", __func__, oid, get_acting_recovery_backfill());
+ // Add object to PG's missing set if it isn't there already
+ assert(!get_local_missing().is_missing(oid));
+ peering_state.force_object_missing(pg_whoami, oid, v);
+ auto [op, fut] = get_shard_services().start_operation<UrgentRecovery>(
+ oid, v, this, get_shard_services(), get_osdmap_epoch());
+ return std::move(fut);
+}
+
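+// Common execution path for client and internal ops: run every OSDOp through
+// the OpsExecuter, reject writes to full pools (unless FULL_FORCE or an MDS
+// client), then flush the accumulated changes into a transaction and submit
+// it. object_corrupted errors roll back the obc, mark the object missing for
+// repair and surface EAGAIN so the caller can retry; other errors roll back
+// and are forwarded to failure_func.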
+template <class Ret, class SuccessFunc, class FailureFunc>
+PG::do_osd_ops_iertr::future<PG::pg_rep_op_fut_t<Ret>>
+PG::do_osd_ops_execute(
+ seastar::lw_shared_ptr<OpsExecuter> ox,
+ std::vector<OSDOp>& ops,
+ SuccessFunc&& success_func,
+ FailureFunc&& failure_func)
+{
+ assert(ox);
+ auto rollbacker = ox->create_rollbacker([this] (auto& obc) {
+ return obc_loader.reload_obc(obc).handle_error_interruptible(
+ load_obc_ertr::assert_all{"can't live with object state messed up"});
+ });
+ auto failure_func_ptr = seastar::make_lw_shared(std::move(failure_func));
+ return interruptor::do_for_each(ops, [ox](OSDOp& osd_op) {
+ logger().debug(
+ "do_osd_ops_execute: object {} - handling op {}",
+ ox->get_target(),
+ ceph_osd_op_name(osd_op.op.op));
+ return ox->execute_op(osd_op);
+ }).safe_then_interruptible([this, ox, &ops] {
+ logger().debug(
+ "do_osd_ops_execute: object {} all operations successful",
+ ox->get_target());
+ // check for full
+ if ((ox->delta_stats.num_bytes > 0 ||
+ ox->delta_stats.num_objects > 0) &&
+ get_pgpool().info.has_flag(pg_pool_t::FLAG_FULL)) {
+ const auto& m = ox->get_message();
+ if (m.get_reqid().name.is_mds() || // FIXME: ignore MDS for now
+ m.has_flag(CEPH_OSD_FLAG_FULL_FORCE)) {
+ logger().info(" full, but proceeding due to FULL_FORCE or MDS");
+ } else if (m.has_flag(CEPH_OSD_FLAG_FULL_TRY)) {
+ // they tried, they failed.
+ logger().info(" full, replying to FULL_TRY op");
+ if (get_pgpool().info.has_flag(pg_pool_t::FLAG_FULL_QUOTA))
+ return interruptor::make_ready_future<OpsExecuter::rep_op_fut_tuple>(
+ seastar::now(),
+ OpsExecuter::osd_op_ierrorator::future<>(
+ crimson::ct_error::edquot::make()));
+ else
+ return interruptor::make_ready_future<OpsExecuter::rep_op_fut_tuple>(
+ seastar::now(),
+ OpsExecuter::osd_op_ierrorator::future<>(
+ crimson::ct_error::enospc::make()));
+ } else {
+ // drop request
+ logger().info(" full, dropping request (bad client)");
+ return interruptor::make_ready_future<OpsExecuter::rep_op_fut_tuple>(
+ seastar::now(),
+ OpsExecuter::osd_op_ierrorator::future<>(
+ crimson::ct_error::eagain::make()));
+ }
+ }
+ return std::move(*ox).flush_changes_n_do_ops_effects(
+ ops,
+ snap_mapper,
+ osdriver,
+ [this] (auto&& txn,
+ auto&& obc,
+ auto&& osd_op_p,
+ auto&& log_entries) {
+ logger().debug(
+ "do_osd_ops_execute: object {} submitting txn",
+ obc->get_oid());
+ return submit_transaction(
+ std::move(obc),
+ std::move(txn),
+ std::move(osd_op_p),
+ std::move(log_entries));
+ });
+ }).safe_then_unpack_interruptible(
+ [success_func=std::move(success_func), rollbacker, this, failure_func_ptr]
+ (auto submitted_fut, auto all_completed_fut) mutable {
+ return PG::do_osd_ops_iertr::make_ready_future<pg_rep_op_fut_t<Ret>>(
+ std::move(submitted_fut),
+ all_completed_fut.safe_then_interruptible_tuple(
+ std::move(success_func),
+ crimson::ct_error::object_corrupted::handle(
+ [rollbacker, this] (const std::error_code& e) mutable {
+              // this is a path for EIO. it's special because we want to fix the object
+ // and try again. that is, the layer above `PG::do_osd_ops` is supposed to
+ // restart the execution.
+ return rollbacker.rollback_obc_if_modified(e).then_interruptible(
+ [obc=rollbacker.get_obc(), this] {
+ return repair_object(obc->obs.oi.soid,
+ obc->obs.oi.version).then_interruptible([] {
+ return do_osd_ops_iertr::future<Ret>{crimson::ct_error::eagain::make()};
+ });
+ });
+ }), OpsExecuter::osd_op_errorator::all_same_way(
+ [rollbacker, failure_func_ptr]
+ (const std::error_code& e) mutable {
+ return rollbacker.rollback_obc_if_modified(e).then_interruptible(
+ [e, failure_func_ptr] {
+ return (*failure_func_ptr)(e);
+ });
+ })
+ )
+ );
+ }, OpsExecuter::osd_op_errorator::all_same_way(
+ [rollbacker, failure_func_ptr]
+ (const std::error_code& e) mutable {
+ return PG::do_osd_ops_iertr::make_ready_future<pg_rep_op_fut_t<Ret>>(
+ seastar::now(),
+ e.value() == ENOENT ? (*failure_func_ptr)(e) :
+ rollbacker.rollback_obc_if_modified(e).then_interruptible(
+ [e, failure_func_ptr] {
+ return (*failure_func_ptr)(e);
+ }));
+ }));
+}
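+
+// Record an error log entry for a failed write and replicate it to every
+// acting/backfill peer via MOSDPGUpdateLogMissing, tracking the outstanding
+// acks in log_entry_update_waiting_on before persisting the entry locally.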
+seastar::future<> PG::submit_error_log(
+ Ref<MOSDOp> m,
+ const OpInfo &op_info,
+ ObjectContextRef obc,
+ const std::error_code e,
+ ceph_tid_t rep_tid,
+ eversion_t &version)
+{
+ const osd_reqid_t &reqid = m->get_reqid();
+ mempool::osd_pglog::list<pg_log_entry_t> log_entries;
+ log_entries.push_back(pg_log_entry_t(pg_log_entry_t::ERROR,
+ obc->obs.oi.soid,
+ next_version(),
+ eversion_t(), 0,
+ reqid, utime_t(),
+ -e.value()));
+ if (op_info.allows_returnvec()) {
+ log_entries.back().set_op_returns(m->ops);
+ }
+ ceph_assert(is_primary());
+ if (!log_entries.empty()) {
+ ceph_assert(log_entries.rbegin()->version >= projected_last_update);
+ version = projected_last_update = log_entries.rbegin()->version;
+ }
+ ceph::os::Transaction t;
+ peering_state.merge_new_log_entries(
+ log_entries, t, peering_state.get_pg_trim_to(),
+ peering_state.get_min_last_complete_ondisk());
+
+ set<pg_shard_t> waiting_on;
+ for (auto &i : get_acting_recovery_backfill()) {
+ pg_shard_t peer(i);
+ if (peer == pg_whoami) continue;
+ ceph_assert(peering_state.get_peer_missing().count(peer));
+ ceph_assert(peering_state.has_peer_info(peer));
+ auto log_m = crimson::make_message<MOSDPGUpdateLogMissing>(
+ log_entries,
+ spg_t(peering_state.get_info().pgid.pgid, i.shard),
+ pg_whoami.shard,
+ get_osdmap_epoch(),
+ get_last_peering_reset(),
+ rep_tid,
+ peering_state.get_pg_trim_to(),
+ peering_state.get_min_last_complete_ondisk());
+ send_cluster_message(peer.osd, std::move(log_m), get_osdmap_epoch());
+ waiting_on.insert(peer);
+ }
+ waiting_on.insert(pg_whoami);
+ log_entry_update_waiting_on.insert(
+ std::make_pair(rep_tid, log_update_t{std::move(waiting_on)}));
+ return shard_services.get_store().do_transaction(
+ get_collection_ref(), std::move(t))
+ .then([this] {
+ peering_state.update_trim_to();
+ return seastar::now();
+ });
+}
+
+PG::do_osd_ops_iertr::future<PG::pg_rep_op_fut_t<MURef<MOSDOpReply>>>
+PG::do_osd_ops(
+ Ref<MOSDOp> m,
+ crimson::net::ConnectionRef conn,
+ ObjectContextRef obc,
+ const OpInfo &op_info,
+ const SnapContext& snapc)
+{
+ if (__builtin_expect(stopping, false)) {
+ throw crimson::common::system_shutdown_exception();
+ }
+ return do_osd_ops_execute<MURef<MOSDOpReply>>(
+ seastar::make_lw_shared<OpsExecuter>(
+ Ref<PG>{this}, obc, op_info, *m, conn, snapc),
+ m->ops,
+ [this, m, obc, may_write = op_info.may_write(),
+ may_read = op_info.may_read(), rvec = op_info.allows_returnvec()] {
+ // TODO: should stop at the first op which returns a negative retval,
+ // cmpext uses it for returning the index of first unmatched byte
+ int result = m->ops.empty() ? 0 : m->ops.back().rval.code;
+ if (may_read && result >= 0) {
+ for (auto &osdop : m->ops) {
+ if (osdop.rval < 0 && !(osdop.op.flags & CEPH_OSD_OP_FLAG_FAILOK)) {
+ result = osdop.rval.code;
+ break;
+ }
+ }
+ } else if (result > 0 && may_write && !rvec) {
+ result = 0;
+ } else if (result < 0 && (m->ops.empty() ?
+ 0 : m->ops.back().op.flags & CEPH_OSD_OP_FLAG_FAILOK)) {
+ result = 0;
+ }
+ auto reply = crimson::make_message<MOSDOpReply>(m.get(),
+ result,
+ get_osdmap_epoch(),
+ 0,
+ false);
+ reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
+ logger().debug(
+ "do_osd_ops: {} - object {} sending reply",
+ *m,
+ m->get_hobj());
+ if (obc->obs.exists) {
+ reply->set_reply_versions(peering_state.get_info().last_update,
+ obc->obs.oi.user_version);
+ } else {
+ reply->set_reply_versions(peering_state.get_info().last_update,
+ peering_state.get_info().last_user_version);
+ }
+ return do_osd_ops_iertr::make_ready_future<MURef<MOSDOpReply>>(
+ std::move(reply));
+ },
+ [m, &op_info, obc, this] (const std::error_code& e) {
+ return seastar::do_with(eversion_t(), [m, &op_info, obc, e, this](auto &version) {
+ auto fut = seastar::now();
+ epoch_t epoch = get_osdmap_epoch();
+ ceph_tid_t rep_tid = shard_services.get_tid();
+ auto last_complete = peering_state.get_info().last_complete;
+ if (op_info.may_write()) {
+ fut = submit_error_log(m, op_info, obc, e, rep_tid, version);
+ }
+ return fut.then([m, e, epoch, &op_info, rep_tid, &version, last_complete, this] {
+ auto log_reply = [m, e, this] {
+ auto reply = crimson::make_message<MOSDOpReply>(
+ m.get(), -e.value(), get_osdmap_epoch(), 0, false);
+ if (m->ops.empty() ? 0 :
+ m->ops.back().op.flags & CEPH_OSD_OP_FLAG_FAILOK) {
+ reply->set_result(0);
+ }
+ // For all ops except for CMPEXT, the correct error value is encoded
+ // in e.value(). For CMPEXT, osdop.rval has the actual error value.
+ if (e.value() == ct_error::cmp_fail_error_value) {
+ assert(!m->ops.empty());
+ for (auto &osdop : m->ops) {
+ if (osdop.rval < 0) {
+ reply->set_result(osdop.rval);
+ break;
+ }
+ }
+ }
+ reply->set_enoent_reply_versions(
+ peering_state.get_info().last_update,
+ peering_state.get_info().last_user_version);
+ reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
+ return do_osd_ops_iertr::make_ready_future<MURef<MOSDOpReply>>(
+ std::move(reply));
+ };
+
+ if (!peering_state.pg_has_reset_since(epoch) && op_info.may_write()) {
+ auto it = log_entry_update_waiting_on.find(rep_tid);
+ ceph_assert(it != log_entry_update_waiting_on.end());
+ auto it2 = it->second.waiting_on.find(pg_whoami);
+ ceph_assert(it2 != it->second.waiting_on.end());
+ it->second.waiting_on.erase(it2);
+
+ if (it->second.waiting_on.empty()) {
+ log_entry_update_waiting_on.erase(it);
+ if (version != eversion_t()) {
+ peering_state.complete_write(version, last_complete);
+ }
+ return log_reply();
+ } else {
+ return it->second.all_committed.get_shared_future()
+ .then([this, &version, last_complete, log_reply = std::move(log_reply)] {
+ if (version != eversion_t()) {
+ peering_state.complete_write(version, last_complete);
+ }
+ return log_reply();
+ });
+ }
+ } else {
+ return log_reply();
+ }
+ });
+ });
+ });
+}
+
+PG::do_osd_ops_iertr::future<PG::pg_rep_op_fut_t<>>
+PG::do_osd_ops(
+ ObjectContextRef obc,
+ std::vector<OSDOp>& ops,
+ const OpInfo &op_info,
+ const do_osd_ops_params_t &&msg_params,
+ do_osd_ops_success_func_t success_func,
+ do_osd_ops_failure_func_t failure_func)
+{
+ // This overload is generally used for internal client requests,
+  // so an empty SnapContext is used.
+ return seastar::do_with(
+ std::move(msg_params),
+ [=, this, &ops, &op_info](auto &msg_params) {
+ return do_osd_ops_execute<void>(
+ seastar::make_lw_shared<OpsExecuter>(
+ Ref<PG>{this},
+ std::move(obc),
+ op_info,
+ msg_params,
+ msg_params.get_connection(),
+ SnapContext{}
+ ),
+ ops,
+ std::move(success_func),
+ std::move(failure_func));
+ });
+}
+
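+// Execute PG-scoped (non-object) operations through PgOpsExecuter and build
+// the MOSDOpReply from their output.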
+PG::interruptible_future<MURef<MOSDOpReply>> PG::do_pg_ops(Ref<MOSDOp> m)
+{
+ if (__builtin_expect(stopping, false)) {
+ throw crimson::common::system_shutdown_exception();
+ }
+
+ auto ox = std::make_unique<PgOpsExecuter>(std::as_const(*this),
+ std::as_const(*m));
+ return interruptor::do_for_each(m->ops, [ox = ox.get()](OSDOp& osd_op) {
+ logger().debug("will be handling pg op {}", ceph_osd_op_name(osd_op.op.op));
+ return ox->execute_op(osd_op);
+ }).then_interruptible([m, this, ox = std::move(ox)] {
+ auto reply = crimson::make_message<MOSDOpReply>(m.get(), 0, get_osdmap_epoch(),
+ CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK,
+ false);
+ reply->claim_op_out_data(m->ops);
+ reply->set_reply_versions(peering_state.get_info().last_update,
+ peering_state.get_info().last_user_version);
+ return seastar::make_ready_future<MURef<MOSDOpReply>>(std::move(reply));
+ }).handle_exception_type_interruptible([=, this](const crimson::osd::error& e) {
+ auto reply = crimson::make_message<MOSDOpReply>(
+ m.get(), -e.code().value(), get_osdmap_epoch(), 0, false);
+ reply->set_enoent_reply_versions(peering_state.get_info().last_update,
+ peering_state.get_info().last_user_version);
+ return seastar::make_ready_future<MURef<MOSDOpReply>>(std::move(reply));
+ });
+}
+
+hobject_t PG::get_oid(const hobject_t& hobj)
+{
+ return hobj.snap == CEPH_SNAPDIR ? hobj.get_head() : hobj;
+}
+
+RWState::State PG::get_lock_type(const OpInfo &op_info)
+{
+ if (op_info.rwordered() && op_info.may_read()) {
+ return RWState::RWEXCL;
+ } else if (op_info.rwordered()) {
+ return RWState::RWWRITE;
+ } else {
+ ceph_assert(op_info.may_read());
+ return RWState::RWREAD;
+ }
+}
+
+void PG::check_blocklisted_obc_watchers(
+ ObjectContextRef &obc)
+{
+ if (obc->watchers.empty()) {
+ for (auto &[src, winfo] : obc->obs.oi.watchers) {
+ auto watch = crimson::osd::Watch::create(
+ obc, winfo, src.second, this);
+ watch->disconnect();
+ auto [it, emplaced] = obc->watchers.emplace(src, std::move(watch));
+ assert(emplaced);
+ logger().debug("added watch for obj {}, client {}",
+ obc->get_oid(), src.second);
+ }
+ }
+}
+
+PG::load_obc_iertr::future<>
+PG::with_locked_obc(const hobject_t &hobj,
+ const OpInfo &op_info,
+ with_obc_func_t &&f)
+{
+ if (__builtin_expect(stopping, false)) {
+ throw crimson::common::system_shutdown_exception();
+ }
+ const hobject_t oid = get_oid(hobj);
+ auto wrapper = [f=std::move(f), this](auto obc) {
+ check_blocklisted_obc_watchers(obc);
+ return f(obc);
+ };
+ switch (get_lock_type(op_info)) {
+ case RWState::RWREAD:
+ return obc_loader.with_obc<RWState::RWREAD>(oid, std::move(wrapper));
+ case RWState::RWWRITE:
+ return obc_loader.with_obc<RWState::RWWRITE>(oid, std::move(wrapper));
+ case RWState::RWEXCL:
+ return obc_loader.with_obc<RWState::RWEXCL>(oid, std::move(wrapper));
+ default:
+ ceph_abort();
+ };
+}
+
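+// Replica-side handling of a replicated write: decode the transaction and the
+// log entries from the primary, apply them locally, then acknowledge with our
+// last_complete_ondisk.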
+PG::interruptible_future<> PG::handle_rep_op(Ref<MOSDRepOp> req)
+{
+ if (__builtin_expect(stopping, false)) {
+ return seastar::make_exception_future<>(
+ crimson::common::system_shutdown_exception());
+ }
+
+ logger().debug("{}: {}", __func__, *req);
+ if (can_discard_replica_op(*req)) {
+ return seastar::now();
+ }
+
+ ceph::os::Transaction txn;
+ auto encoded_txn = req->get_data().cbegin();
+ decode(txn, encoded_txn);
+ auto p = req->logbl.cbegin();
+ std::vector<pg_log_entry_t> log_entries;
+ decode(log_entries, p);
+ log_operation(std::move(log_entries),
+ req->pg_trim_to,
+ req->version,
+ req->min_last_complete_ondisk,
+ !txn.empty(),
+ txn,
+ false);
+ logger().debug("PG::handle_rep_op: do_transaction...");
+ return interruptor::make_interruptible(shard_services.get_store().do_transaction(
+ coll_ref, std::move(txn))).then_interruptible(
+ [req, lcod=peering_state.get_info().last_complete, this] {
+ peering_state.update_last_complete_ondisk(lcod);
+ const auto map_epoch = get_osdmap_epoch();
+ auto reply = crimson::make_message<MOSDRepOpReply>(
+ req.get(), pg_whoami, 0,
+ map_epoch, req->get_min_epoch(), CEPH_OSD_FLAG_ONDISK);
+ reply->set_last_complete_ondisk(lcod);
+ return shard_services.send_to_osd(req->from.osd, std::move(reply), map_epoch);
+ });
+}
+
+void PG::log_operation(
+ std::vector<pg_log_entry_t>&& logv,
+ const eversion_t &trim_to,
+ const eversion_t &roll_forward_to,
+ const eversion_t &min_last_complete_ondisk,
+ bool transaction_applied,
+ ObjectStore::Transaction &txn,
+ bool async) {
+ logger().debug("{}", __func__);
+ if (is_primary()) {
+ ceph_assert(trim_to <= peering_state.get_last_update_ondisk());
+ }
+ /* TODO: when we add snap mapper and projected log support,
+ * we'll likely want to update them here.
+ *
+ * See src/osd/PrimaryLogPG.h:log_operation for how classic
+ * handles these cases.
+ */
+#if 0
+ if (transaction_applied) {
+ //TODO:
+ //update_snap_map(logv, t);
+ }
+ auto last = logv.rbegin();
+ if (is_primary() && last != logv.rend()) {
+ projected_log.skip_can_rollback_to_to_head();
+ projected_log.trim(cct, last->version, nullptr, nullptr, nullptr);
+ }
+#endif
+ if (!is_primary()) { // && !is_ec_pg()
+ replica_clear_repop_obc(logv);
+ }
+ peering_state.append_log(std::move(logv),
+ trim_to,
+ roll_forward_to,
+ min_last_complete_ondisk,
+ txn,
+ !txn.empty(),
+ false);
+}
+
+void PG::replica_clear_repop_obc(
+ const std::vector<pg_log_entry_t> &logv) {
+ logger().debug("{} clearing {} entries", __func__, logv.size());
+ for (auto &&e: logv) {
+ logger().debug(" {} get_object_boundary(from): {} "
+ " head version(to): {}",
+ e.soid,
+ e.soid.get_object_boundary(),
+ e.soid.get_head());
+ /* Have to blast all clones, they share a snapset */
+ obc_registry.clear_range(
+ e.soid.get_object_boundary(), e.soid.get_head());
+ }
+}
+
+void PG::handle_rep_op_reply(const MOSDRepOpReply& m)
+{
+ if (!can_discard_replica_op(m)) {
+ backend->got_rep_op_reply(m);
+ }
+}
+
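+// Replica-side handling of error-log replication (MOSDPGUpdateLogMissing):
+// append the entries to the log/missing set, persist them, and ack the
+// primary unless the PG has reset since the message was sent.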
+PG::interruptible_future<> PG::do_update_log_missing(
+ Ref<MOSDPGUpdateLogMissing> m,
+ crimson::net::ConnectionRef conn)
+{
+ if (__builtin_expect(stopping, false)) {
+ return seastar::make_exception_future<>(
+ crimson::common::system_shutdown_exception());
+ }
+
+ ceph_assert(m->get_type() == MSG_OSD_PG_UPDATE_LOG_MISSING);
+ ObjectStore::Transaction t;
+ std::optional<eversion_t> op_trim_to, op_roll_forward_to;
+ if (m->pg_trim_to != eversion_t())
+ op_trim_to = m->pg_trim_to;
+ if (m->pg_roll_forward_to != eversion_t())
+ op_roll_forward_to = m->pg_roll_forward_to;
+ logger().debug("op_trim_to = {}, op_roll_forward_to = {}",
+ op_trim_to, op_roll_forward_to);
+
+ peering_state.append_log_entries_update_missing(
+ m->entries, t, op_trim_to, op_roll_forward_to);
+
+ return interruptor::make_interruptible(shard_services.get_store().do_transaction(
+ coll_ref, std::move(t))).then_interruptible(
+ [m, conn, lcod=peering_state.get_info().last_complete, this] {
+ if (!peering_state.pg_has_reset_since(m->get_epoch())) {
+ peering_state.update_last_complete_ondisk(lcod);
+ auto reply =
+ crimson::make_message<MOSDPGUpdateLogMissingReply>(
+ spg_t(peering_state.get_info().pgid.pgid, get_primary().shard),
+ pg_whoami.shard,
+ m->get_epoch(),
+ m->min_epoch,
+ m->get_tid(),
+ lcod);
+ reply->set_priority(CEPH_MSG_PRIO_HIGH);
+ return conn->send(std::move(reply));
+ }
+ return seastar::now();
+ });
+}
+
+
+PG::interruptible_future<> PG::do_update_log_missing_reply(
+ Ref<MOSDPGUpdateLogMissingReply> m)
+{
+ logger().debug("{}: got reply from {}", __func__, m->get_from());
+
+ auto it = log_entry_update_waiting_on.find(m->get_tid());
+ if (it != log_entry_update_waiting_on.end()) {
+ if (it->second.waiting_on.count(m->get_from())) {
+ it->second.waiting_on.erase(m->get_from());
+ if (m->last_complete_ondisk != eversion_t()) {
+ peering_state.update_peer_last_complete_ondisk(
+ m->get_from(), m->last_complete_ondisk);
+ }
+ } else {
+ logger().error("{} : {} got reply {} from shard we are not waiting for ",
+ __func__, peering_state.get_info().pgid, *m, m->get_from());
+ }
+
+ if (it->second.waiting_on.empty()) {
+ it->second.all_committed.set_value();
+ it->second.all_committed = {};
+ log_entry_update_waiting_on.erase(it);
+ }
+ } else {
+ logger().error("{} : {} got reply {} on unknown tid {}",
+ __func__, peering_state.get_info().pgid, *m, m->get_tid());
+ }
+ return seastar::now();
+}
+
+bool PG::old_peering_msg(
+ const epoch_t reply_epoch,
+ const epoch_t query_epoch) const
+{
+ if (const epoch_t lpr = peering_state.get_last_peering_reset();
+ lpr > reply_epoch || lpr > query_epoch) {
+ logger().debug("{}: pg changed {} lpr {}, reply_epoch {}, query_epoch {}",
+ __func__, get_info().history, lpr, reply_epoch, query_epoch);
+ return true;
+ }
+ return false;
+}
+
+bool PG::can_discard_replica_op(const Message& m, epoch_t m_map_epoch) const
+{
+ // if a repop is replied after a replica goes down in a new osdmap, and
+ // before the pg advances to this new osdmap, the repop replies before this
+ // repop can be discarded by that replica OSD, because the primary resets the
+ // connection to it when handling the new osdmap marking it down, and also
+  // resets the messenger session when the replica reconnects. to avoid the
+ // out-of-order replies, the messages from that replica should be discarded.
+ const auto osdmap = peering_state.get_osdmap();
+ const int from_osd = m.get_source().num();
+ if (osdmap->is_down(from_osd)) {
+ return true;
+ }
+ // Mostly, this overlaps with the old_peering_msg
+ // condition. An important exception is pushes
+ // sent by replicas not in the acting set, since
+ // if such a replica goes down it does not cause
+ // a new interval.
+ if (osdmap->get_down_at(from_osd) >= m_map_epoch) {
+ return true;
+ }
+ // same pg?
+ // if pg changes *at all*, we reset and repeer!
+ return old_peering_msg(m_map_epoch, m_map_epoch);
+}
+
+seastar::future<> PG::stop()
+{
+ logger().info("PG {} {}", pgid, __func__);
+ stopping = true;
+ cancel_local_background_io_reservation();
+ cancel_remote_recovery_reservation();
+ check_readable_timer.cancel();
+ renew_lease_timer.cancel();
+ return osdmap_gate.stop().then([this] {
+ return wait_for_active_blocker.stop();
+ }).then([this] {
+ return recovery_handler->stop();
+ }).then([this] {
+ return recovery_backend->stop();
+ }).then([this] {
+ return backend->stop();
+ });
+}
+
+void PG::on_change(ceph::os::Transaction &t) {
+ logger().debug("{} {}:", *this, __func__);
+ context_registry_on_change();
+ obc_loader.notify_on_change(is_primary());
+ recovery_backend->on_peering_interval_change(t);
+ backend->on_actingset_changed(is_primary());
+ wait_for_active_blocker.unblock();
+ if (is_primary()) {
+ logger().debug("{} {}: requeueing", *this, __func__);
+ client_request_orderer.requeue(shard_services, this);
+ } else {
+ logger().debug("{} {}: dropping requests", *this, __func__);
+ client_request_orderer.clear_and_cancel();
+ }
+}
+
+void PG::context_registry_on_change() {
+ obc_registry.for_each([](ObjectContextRef obc) {
+ assert(obc);
+ for (auto j = obc->watchers.begin();
+ j != obc->watchers.end();
+ j = obc->watchers.erase(j)) {
+ j->second->discard_state();
+ }
+ });
+}
+
+bool PG::can_discard_op(const MOSDOp& m) const {
+ if (m.get_map_epoch() <
+ peering_state.get_info().history.same_primary_since) {
+ logger().debug("{} changed after {} dropping {} ",
+ __func__ , m.get_map_epoch(), m);
+ return true;
+ }
+
+ if ((m.get_flags() & (CEPH_OSD_FLAG_BALANCE_READS |
+ CEPH_OSD_FLAG_LOCALIZE_READS))
+ && !is_primary()
+ && (m.get_map_epoch() <
+ peering_state.get_info().history.same_interval_since))
+ {
+ // Note: the Objecter will resend on interval change without the primary
+ // changing if it actually sent to a replica. If the primary hasn't
+ // changed since the send epoch, we got it, and we're primary, it won't
+ // have resent even if the interval did change as it sent it to the primary
+ // (us).
+ return true;
+ }
+ return __builtin_expect(m.get_map_epoch()
+ < peering_state.get_info().history.same_primary_since, false);
+}
+
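+// An object is degraded or backfilling if it is missing locally, missing on
+// one of the acting/recovery/backfill peers, or still ahead of a backfill
+// target's progress while being recovered.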
+bool PG::is_degraded_or_backfilling_object(const hobject_t& soid) const {
+ /* The conditions below may clear (on_local_recover, before we queue
+ * the transaction) before we actually requeue the degraded waiters
+ * in on_global_recover after the transaction completes.
+ */
+ if (peering_state.get_pg_log().get_missing().get_items().count(soid))
+ return true;
+ ceph_assert(!get_acting_recovery_backfill().empty());
+ for (auto& peer : get_acting_recovery_backfill()) {
+ if (peer == get_primary()) continue;
+ auto peer_missing_entry = peering_state.get_peer_missing().find(peer);
+ // If an object is missing on an async_recovery_target, return false.
+ // This will not block the op and the object is async recovered later.
+ if (peer_missing_entry != peering_state.get_peer_missing().end() &&
+ peer_missing_entry->second.get_items().count(soid)) {
+ return true;
+ }
+ // Object is degraded if after last_backfill AND
+ // we are backfilling it
+ if (is_backfill_target(peer) &&
+ peering_state.get_peer_info(peer).last_backfill <= soid &&
+ recovery_handler->backfill_state &&
+ recovery_handler->backfill_state->get_last_backfill_started() >= soid &&
+ recovery_backend->is_recovering(soid)) {
+ return true;
+ }
+ }
+ return false;
+}
+
+PG::interruptible_future<std::optional<PG::complete_op_t>>
+PG::already_complete(const osd_reqid_t& reqid)
+{
+ eversion_t version;
+ version_t user_version;
+ int ret;
+ std::vector<pg_log_op_return_item_t> op_returns;
+
+ if (peering_state.get_pg_log().get_log().get_request(
+ reqid, &version, &user_version, &ret, &op_returns)) {
+ complete_op_t dupinfo{
+ user_version,
+ version,
+ ret};
+ return backend->request_committed(reqid, version).then([dupinfo] {
+ return seastar::make_ready_future<std::optional<complete_op_t>>(dupinfo);
+ });
+ } else {
+ return seastar::make_ready_future<std::optional<complete_op_t>>(std::nullopt);
+ }
+}
+
+}
diff --git a/src/crimson/osd/pg.h b/src/crimson/osd/pg.h
new file mode 100644
index 000000000..d96db2e20
--- /dev/null
+++ b/src/crimson/osd/pg.h
@@ -0,0 +1,833 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab expandtab
+
+#pragma once
+
+#include <memory>
+#include <optional>
+#include <boost/smart_ptr/intrusive_ref_counter.hpp>
+#include <seastar/core/future.hh>
+#include <seastar/core/shared_future.hh>
+
+#include "common/dout.h"
+#include "include/interval_set.h"
+#include "crimson/net/Fwd.h"
+#include "messages/MOSDRepOpReply.h"
+#include "messages/MOSDOpReply.h"
+#include "os/Transaction.h"
+#include "osd/osd_types.h"
+#include "osd/osd_types_fmt.h"
+#include "crimson/osd/object_context.h"
+#include "osd/PeeringState.h"
+#include "osd/SnapMapper.h"
+
+#include "crimson/common/interruptible_future.h"
+#include "crimson/common/type_helpers.h"
+#include "crimson/os/futurized_collection.h"
+#include "crimson/osd/backfill_state.h"
+#include "crimson/osd/pg_interval_interrupt_condition.h"
+#include "crimson/osd/ops_executer.h"
+#include "crimson/osd/osd_operations/client_request.h"
+#include "crimson/osd/osd_operations/logmissing_request.h"
+#include "crimson/osd/osd_operations/logmissing_request_reply.h"
+#include "crimson/osd/osd_operations/peering_event.h"
+#include "crimson/osd/osd_operations/replicated_request.h"
+#include "crimson/osd/shard_services.h"
+#include "crimson/osd/osdmap_gate.h"
+#include "crimson/osd/pg_activation_blocker.h"
+#include "crimson/osd/pg_recovery.h"
+#include "crimson/osd/pg_recovery_listener.h"
+#include "crimson/osd/recovery_backend.h"
+#include "crimson/osd/object_context_loader.h"
+
+class MQuery;
+class OSDMap;
+class PGBackend;
+class PGPeeringEvent;
+class osd_op_params_t;
+
+namespace recovery {
+ class Context;
+}
+
+namespace crimson::net {
+ class Messenger;
+}
+
+namespace crimson::os {
+ class FuturizedStore;
+}
+
+namespace crimson::osd {
+class OpsExecuter;
+class BackfillRecovery;
+class SnapTrimEvent;
+
+class PG : public boost::intrusive_ref_counter<
+ PG,
+ boost::thread_unsafe_counter>,
+ public PGRecoveryListener,
+ PeeringState::PeeringListener,
+ DoutPrefixProvider
+{
+ using ec_profile_t = std::map<std::string,std::string>;
+ using cached_map_t = OSDMapService::cached_map_t;
+
+ ClientRequest::PGPipeline request_pg_pipeline;
+ PGPeeringPipeline peering_request_pg_pipeline;
+
+ ClientRequest::Orderer client_request_orderer;
+
+ spg_t pgid;
+ pg_shard_t pg_whoami;
+ crimson::os::CollectionRef coll_ref;
+ ghobject_t pgmeta_oid;
+
+ seastar::timer<seastar::lowres_clock> check_readable_timer;
+ seastar::timer<seastar::lowres_clock> renew_lease_timer;
+
+public:
+ template <typename T = void>
+ using interruptible_future =
+ ::crimson::interruptible::interruptible_future<
+ ::crimson::osd::IOInterruptCondition, T>;
+
+ PG(spg_t pgid,
+ pg_shard_t pg_shard,
+ crimson::os::CollectionRef coll_ref,
+ pg_pool_t&& pool,
+ std::string&& name,
+ cached_map_t osdmap,
+ ShardServices &shard_services,
+ ec_profile_t profile);
+
+ ~PG();
+
+ const pg_shard_t& get_pg_whoami() const final {
+ return pg_whoami;
+ }
+
+ const spg_t& get_pgid() const final {
+ return pgid;
+ }
+
+ PGBackend& get_backend() {
+ return *backend;
+ }
+ const PGBackend& get_backend() const {
+ return *backend;
+ }
+ // EpochSource
+ epoch_t get_osdmap_epoch() const final {
+ return peering_state.get_osdmap_epoch();
+ }
+
+ eversion_t get_pg_trim_to() const {
+ return peering_state.get_pg_trim_to();
+ }
+
+ eversion_t get_min_last_complete_ondisk() const {
+ return peering_state.get_min_last_complete_ondisk();
+ }
+
+ const pg_info_t& get_info() const final {
+ return peering_state.get_info();
+ }
+
+ // DoutPrefixProvider
+ std::ostream& gen_prefix(std::ostream& out) const final {
+ return out << *this;
+ }
+ crimson::common::CephContext *get_cct() const final {
+ return shard_services.get_cct();
+ }
+ unsigned get_subsys() const final {
+ return ceph_subsys_osd;
+ }
+
+ crimson::os::CollectionRef get_collection_ref() {
+ return coll_ref;
+ }
+
+ // PeeringListener
+ void prepare_write(
+ pg_info_t &info,
+ pg_info_t &last_written_info,
+ PastIntervals &past_intervals,
+ PGLog &pglog,
+ bool dirty_info,
+ bool dirty_big_info,
+ bool need_write_epoch,
+ ceph::os::Transaction &t) final;
+
+ void on_info_history_change() final {
+ // Not needed yet -- mainly for scrub scheduling
+ }
+
+ /// Notify PG that Primary/Replica status has changed (to update scrub registration)
+ void on_primary_status_change(bool was_primary, bool now_primary) final {
+ }
+
+ /// Need to reschedule next scrub. Assuming no change in role
+ void reschedule_scrub() final {
+ }
+
+ void scrub_requested(scrub_level_t scrub_level, scrub_type_t scrub_type) final;
+
+ uint64_t get_snap_trimq_size() const final {
+ return std::size(snap_trimq);
+ }
+
+ void send_cluster_message(
+ int osd, MessageURef m,
+ epoch_t epoch, bool share_map_update=false) final {
+ (void)shard_services.send_to_osd(osd, std::move(m), epoch);
+ }
+
+ void send_pg_created(pg_t pgid) final {
+ (void)shard_services.send_pg_created(pgid);
+ }
+
+ bool try_flush_or_schedule_async() final;
+
+ void start_flush_on_transaction(
+ ceph::os::Transaction &t) final {
+ t.register_on_commit(
+ new LambdaContext([this](int r){
+ peering_state.complete_flush();
+ }));
+ }
+
+ void on_flushed() final {
+ // will be needed for unblocking IO operations/peering
+ }
+
+ template <typename T>
+ void start_peering_event_operation(T &&evt, float delay = 0) {
+ (void) shard_services.start_operation<LocalPeeringEvent>(
+ this,
+ pg_whoami,
+ pgid,
+ delay,
+ std::forward<T>(evt));
+ }
+
+ void schedule_event_after(
+ PGPeeringEventRef event,
+ float delay) final {
+ start_peering_event_operation(std::move(*event), delay);
+ }
+ std::vector<pg_shard_t> get_replica_recovery_order() const final {
+ return peering_state.get_replica_recovery_order();
+ }
+ void request_local_background_io_reservation(
+ unsigned priority,
+ PGPeeringEventURef on_grant,
+ PGPeeringEventURef on_preempt) final {
+ // TODO -- we probably want to add a mechanism for blocking on this
+ // after handling the peering event
+ std::ignore = shard_services.local_request_reservation(
+ pgid,
+ on_grant ? make_lambda_context([this, on_grant=std::move(on_grant)] (int) {
+ start_peering_event_operation(std::move(*on_grant));
+ }) : nullptr,
+ priority,
+ on_preempt ? make_lambda_context(
+ [this, on_preempt=std::move(on_preempt)] (int) {
+ start_peering_event_operation(std::move(*on_preempt));
+ }) : nullptr);
+ }
+
+ void update_local_background_io_priority(
+ unsigned priority) final {
+ // TODO -- we probably want to add a mechanism for blocking on this
+ // after handling the peering event
+ std::ignore = shard_services.local_update_priority(
+ pgid,
+ priority);
+ }
+
+ void cancel_local_background_io_reservation() final {
+ // TODO -- we probably want to add a mechanism for blocking on this
+ // after handling the peering event
+ std::ignore = shard_services.local_cancel_reservation(
+ pgid);
+ }
+
+ void request_remote_recovery_reservation(
+ unsigned priority,
+ PGPeeringEventURef on_grant,
+ PGPeeringEventURef on_preempt) final {
+ // TODO -- we probably want to add a mechanism for blocking on this
+ // after handling the peering event
+ std::ignore = shard_services.remote_request_reservation(
+ pgid,
+ on_grant ? make_lambda_context([this, on_grant=std::move(on_grant)] (int) {
+ start_peering_event_operation(std::move(*on_grant));
+ }) : nullptr,
+ priority,
+ on_preempt ? make_lambda_context(
+ [this, on_preempt=std::move(on_preempt)] (int) {
+ start_peering_event_operation(std::move(*on_preempt));
+ }) : nullptr);
+ }
+
+ void cancel_remote_recovery_reservation() final {
+ // TODO -- we probably want to add a mechanism for blocking on this
+ // after handling the peering event
+ std::ignore = shard_services.remote_cancel_reservation(
+ pgid);
+ }
+
+ void schedule_event_on_commit(
+ ceph::os::Transaction &t,
+ PGPeeringEventRef on_commit) final {
+ t.register_on_commit(
+ make_lambda_context(
+ [this, on_commit=std::move(on_commit)](int) {
+ start_peering_event_operation(std::move(*on_commit));
+ }));
+ }
+
+ void update_heartbeat_peers(std::set<int> peers) final {
+ // Not needed yet
+ }
+ void set_probe_targets(const std::set<pg_shard_t> &probe_set) final {
+ // Not needed yet
+ }
+ void clear_probe_targets() final {
+ // Not needed yet
+ }
+ void queue_want_pg_temp(const std::vector<int> &wanted) final {
+ // TODO -- we probably want to add a mechanism for blocking on this
+ // after handling the peering event
+ std::ignore = shard_services.queue_want_pg_temp(pgid.pgid, wanted);
+ }
+ void clear_want_pg_temp() final {
+ // TODO -- we probably want to add a mechanism for blocking on this
+ // after handling the peering event
+ std::ignore = shard_services.remove_want_pg_temp(pgid.pgid);
+ }
+ void check_recovery_sources(const OSDMapRef& newmap) final {
+ // Not needed yet
+ }
+ void check_blocklisted_watchers() final;
+ void clear_primary_state() final {
+ // Not needed yet
+ }
+
+ void queue_check_readable(epoch_t last_peering_reset,
+ ceph::timespan delay) final;
+ void recheck_readable() final;
+
+ unsigned get_target_pg_log_entries() const final;
+
+ void on_pool_change() final {
+ // Not needed yet
+ }
+ void on_role_change() final {
+ // Not needed yet
+ }
+ void on_change(ceph::os::Transaction &t) final;
+ void on_activate(interval_set<snapid_t> to_trim) final;
+ void on_activate_complete() final;
+ void on_new_interval() final {
+ // Not needed yet
+ }
+ Context *on_clean() final;
+ void on_activate_committed() final {
+ // Not needed yet (will be needed for IO unblocking)
+ }
+ void on_active_exit() final {
+ // Not needed yet
+ }
+
+ void on_removal(ceph::os::Transaction &t) final;
+
+ std::pair<ghobject_t, bool>
+ do_delete_work(ceph::os::Transaction &t, ghobject_t _next) final;
+
+ // merge/split not ready
+ void clear_ready_to_merge() final {}
+ void set_not_ready_to_merge_target(pg_t pgid, pg_t src) final {}
+ void set_not_ready_to_merge_source(pg_t pgid) final {}
+ void set_ready_to_merge_target(eversion_t lu, epoch_t les, epoch_t lec) final {}
+ void set_ready_to_merge_source(eversion_t lu) final {}
+
+ void on_active_actmap() final;
+ void on_active_advmap(const OSDMapRef &osdmap) final;
+
+ epoch_t cluster_osdmap_trim_lower_bound() final {
+ // TODO
+ return 0;
+ }
+
+ void on_backfill_reserved() final {
+ recovery_handler->on_backfill_reserved();
+ }
+ void on_backfill_canceled() final {
+ ceph_assert(0 == "Not implemented");
+ }
+
+ void on_recovery_reserved() final {
+ recovery_handler->start_pglogbased_recovery();
+ }
+
+
+ bool try_reserve_recovery_space(
+ int64_t primary_num_bytes, int64_t local_num_bytes) final {
+ // TODO
+ return true;
+ }
+ void unreserve_recovery_space() final {}
+
+ struct PGLogEntryHandler : public PGLog::LogEntryHandler {
+ PG *pg;
+ ceph::os::Transaction *t;
+ PGLogEntryHandler(PG *pg, ceph::os::Transaction *t) : pg(pg), t(t) {}
+
+ // LogEntryHandler
+ void remove(const hobject_t &hoid) override {
+ // TODO
+ }
+ void try_stash(const hobject_t &hoid, version_t v) override {
+ // TODO
+ }
+ void rollback(const pg_log_entry_t &entry) override {
+ // TODO
+ }
+ void rollforward(const pg_log_entry_t &entry) override {
+ // TODO
+ }
+ void trim(const pg_log_entry_t &entry) override {
+ // TODO
+ }
+ };
+ PGLog::LogEntryHandlerRef get_log_handler(
+ ceph::os::Transaction &t) final {
+ return std::make_unique<PG::PGLogEntryHandler>(this, &t);
+ }
+
+ void rebuild_missing_set_with_deletes(PGLog &pglog) final {
+ pglog.rebuild_missing_set_with_deletes_crimson(
+ shard_services.get_store(),
+ coll_ref,
+ peering_state.get_info()).get();
+ }
+
+ PerfCounters &get_peering_perf() final {
+ return shard_services.get_recoverystate_perf_logger();
+ }
+ PerfCounters &get_perf_logger() final {
+ return shard_services.get_perf_logger();
+ }
+
+ void log_state_enter(const char *state) final;
+ void log_state_exit(
+ const char *state_name, utime_t enter_time,
+ uint64_t events, utime_t event_dur) final;
+
+ void dump_recovery_info(Formatter *f) const final {
+ }
+
+ OstreamTemp get_clog_info() final {
+    // not needed yet: stub, to be replaced once this is wired up to monc
+ return OstreamTemp(CLOG_INFO, nullptr);
+ }
+ OstreamTemp get_clog_debug() final {
+    // not needed yet: stub, to be replaced once this is wired up to monc
+ return OstreamTemp(CLOG_DEBUG, nullptr);
+ }
+ OstreamTemp get_clog_error() final {
+    // not needed yet: stub, to be replaced once this is wired up to monc
+ return OstreamTemp(CLOG_ERROR, nullptr);
+ }
+
+ ceph::signedspan get_mnow() const final;
+ HeartbeatStampsRef get_hb_stamps(int peer) final;
+ void schedule_renew_lease(epoch_t plr, ceph::timespan delay) final;
+
+
+ // Utility
+ bool is_primary() const final {
+ return peering_state.is_primary();
+ }
+ bool is_nonprimary() const {
+ return peering_state.is_nonprimary();
+ }
+ bool is_peered() const final {
+ return peering_state.is_peered();
+ }
+ bool is_recovering() const final {
+ return peering_state.is_recovering();
+ }
+ bool is_backfilling() const final {
+ return peering_state.is_backfilling();
+ }
+ uint64_t get_last_user_version() const {
+ return get_info().last_user_version;
+ }
+ bool get_need_up_thru() const {
+ return peering_state.get_need_up_thru();
+ }
+ epoch_t get_same_interval_since() const {
+ return get_info().history.same_interval_since;
+ }
+
+ const auto& get_pgpool() const {
+ return peering_state.get_pgpool();
+ }
+ pg_shard_t get_primary() const {
+ return peering_state.get_primary();
+ }
+
+ /// initialize created PG
+ void init(
+ int role,
+ const std::vector<int>& up,
+ int up_primary,
+ const std::vector<int>& acting,
+ int acting_primary,
+ const pg_history_t& history,
+ const PastIntervals& pim,
+ ceph::os::Transaction &t);
+
+ seastar::future<> read_state(crimson::os::FuturizedStore::Shard* store);
+
+ interruptible_future<> do_peering_event(
+ PGPeeringEvent& evt, PeeringCtx &rctx);
+
+ seastar::future<> handle_advance_map(cached_map_t next_map, PeeringCtx &rctx);
+ seastar::future<> handle_activate_map(PeeringCtx &rctx);
+ seastar::future<> handle_initialize(PeeringCtx &rctx);
+
+ static hobject_t get_oid(const hobject_t& hobj);
+ static RWState::State get_lock_type(const OpInfo &op_info);
+
+ using load_obc_ertr = crimson::errorator<
+ crimson::ct_error::enoent,
+ crimson::ct_error::object_corrupted>;
+ using load_obc_iertr =
+ ::crimson::interruptible::interruptible_errorator<
+ ::crimson::osd::IOInterruptCondition,
+ load_obc_ertr>;
+ using interruptor = ::crimson::interruptible::interruptor<
+ ::crimson::osd::IOInterruptCondition>;
+
+public:
+ using with_obc_func_t =
+ std::function<load_obc_iertr::future<> (ObjectContextRef)>;
+
+ load_obc_iertr::future<> with_locked_obc(
+ const hobject_t &hobj,
+ const OpInfo &op_info,
+ with_obc_func_t&& f);
+
+ interruptible_future<> handle_rep_op(Ref<MOSDRepOp> m);
+ void log_operation(
+ std::vector<pg_log_entry_t>&& logv,
+ const eversion_t &trim_to,
+ const eversion_t &roll_forward_to,
+ const eversion_t &min_last_complete_ondisk,
+ bool transaction_applied,
+ ObjectStore::Transaction &txn,
+ bool async = false);
+ void replica_clear_repop_obc(
+ const std::vector<pg_log_entry_t> &logv);
+ void handle_rep_op_reply(const MOSDRepOpReply& m);
+ interruptible_future<> do_update_log_missing(
+ Ref<MOSDPGUpdateLogMissing> m,
+ crimson::net::ConnectionRef conn);
+ interruptible_future<> do_update_log_missing_reply(
+ Ref<MOSDPGUpdateLogMissingReply> m);
+
+
+ void print(std::ostream& os) const;
+ void dump_primary(Formatter*);
+ seastar::future<> submit_error_log(
+ Ref<MOSDOp> m,
+ const OpInfo &op_info,
+ ObjectContextRef obc,
+ const std::error_code e,
+ ceph_tid_t rep_tid,
+ eversion_t &version);
+
+private:
+
+ struct SnapTrimMutex {
+ struct WaitPG : OrderedConcurrentPhaseT<WaitPG> {
+ static constexpr auto type_name = "SnapTrimEvent::wait_pg";
+ } wait_pg;
+ seastar::shared_mutex mutex;
+
+ interruptible_future<> lock(SnapTrimEvent &st_event) noexcept;
+
+ void unlock() noexcept {
+ mutex.unlock();
+ }
+ } snaptrim_mutex;
+
+ using do_osd_ops_ertr = crimson::errorator<
+ crimson::ct_error::eagain>;
+ using do_osd_ops_iertr =
+ ::crimson::interruptible::interruptible_errorator<
+ ::crimson::osd::IOInterruptCondition,
+ ::crimson::errorator<crimson::ct_error::eagain>>;
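+  // Result of a replicated op: the first future resolves when the request has
+  // been submitted to the backend, the second when all replicas have
+  // completed it.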
+ template <typename Ret = void>
+ using pg_rep_op_fut_t =
+ std::tuple<interruptible_future<>,
+ do_osd_ops_iertr::future<Ret>>;
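+ // pg_rep_op_fut_t pairs two futures: the first appears to track the
+ // submission/replication of the transaction, while the second yields the
+ // final result of the osd ops (e.g. the MOSDOpReply below); see
+ // submit_transaction() and the do_osd_ops() overloads for how each is used.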
+ do_osd_ops_iertr::future<pg_rep_op_fut_t<MURef<MOSDOpReply>>> do_osd_ops(
+ Ref<MOSDOp> m,
+ crimson::net::ConnectionRef conn,
+ ObjectContextRef obc,
+ const OpInfo &op_info,
+ const SnapContext& snapc);
+ using do_osd_ops_success_func_t =
+ std::function<do_osd_ops_iertr::future<>()>;
+ using do_osd_ops_failure_func_t =
+ std::function<do_osd_ops_iertr::future<>(const std::error_code&)>;
+ struct do_osd_ops_params_t;
+ do_osd_ops_iertr::future<pg_rep_op_fut_t<>> do_osd_ops(
+ ObjectContextRef obc,
+ std::vector<OSDOp>& ops,
+ const OpInfo &op_info,
+ const do_osd_ops_params_t &&params,
+ do_osd_ops_success_func_t success_func,
+ do_osd_ops_failure_func_t failure_func);
+ template <class Ret, class SuccessFunc, class FailureFunc>
+ do_osd_ops_iertr::future<pg_rep_op_fut_t<Ret>> do_osd_ops_execute(
+ seastar::lw_shared_ptr<OpsExecuter> ox,
+ std::vector<OSDOp>& ops,
+ SuccessFunc&& success_func,
+ FailureFunc&& failure_func);
+ interruptible_future<MURef<MOSDOpReply>> do_pg_ops(Ref<MOSDOp> m);
+ std::tuple<interruptible_future<>, interruptible_future<>>
+ submit_transaction(
+ ObjectContextRef&& obc,
+ ceph::os::Transaction&& txn,
+ osd_op_params_t&& oop,
+ std::vector<pg_log_entry_t>&& log_entries);
+ interruptible_future<> repair_object(
+ const hobject_t& oid,
+ eversion_t& v);
+ void check_blocklisted_obc_watchers(ObjectContextRef &obc);
+
+private:
+ PG_OSDMapGate osdmap_gate;
+ ShardServices &shard_services;
+
+
+public:
+ cached_map_t get_osdmap() { return peering_state.get_osdmap(); }
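+ // next_version() stamps the next projected version with the current osdmap
+ // epoch and a monotonically increasing per-PG counter.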
+ eversion_t next_version() {
+ return eversion_t(get_osdmap_epoch(),
+ ++projected_last_update.version);
+ }
+ ShardServices& get_shard_services() final {
+ return shard_services;
+ }
+ seastar::future<> stop();
+private:
+ std::unique_ptr<PGBackend> backend;
+ std::unique_ptr<RecoveryBackend> recovery_backend;
+ std::unique_ptr<PGRecovery> recovery_handler;
+
+ PeeringState peering_state;
+ eversion_t projected_last_update;
+
+public:
+ ObjectContextRegistry obc_registry;
+ ObjectContextLoader obc_loader;
+
+private:
+ OSDriver osdriver;
+ SnapMapper snap_mapper;
+
+public:
+ // PeeringListener
+ void publish_stats_to_osd() final;
+ void clear_publish_stats() final;
+ pg_stat_t get_stats() const;
+private:
+ std::optional<pg_stat_t> pg_stats;
+
+public:
+ RecoveryBackend* get_recovery_backend() final {
+ return recovery_backend.get();
+ }
+ PGRecovery* get_recovery_handler() final {
+ return recovery_handler.get();
+ }
+ PeeringState& get_peering_state() final {
+ return peering_state;
+ }
+ bool has_reset_since(epoch_t epoch) const final {
+ return peering_state.pg_has_reset_since(epoch);
+ }
+
+ const pg_missing_tracker_t& get_local_missing() const {
+ return peering_state.get_pg_log().get_missing();
+ }
+ epoch_t get_last_peering_reset() const final {
+ return peering_state.get_last_peering_reset();
+ }
+ const std::set<pg_shard_t> &get_acting_recovery_backfill() const {
+ return peering_state.get_acting_recovery_backfill();
+ }
+ bool is_backfill_target(pg_shard_t osd) const {
+ return peering_state.is_backfill_target(osd);
+ }
+ void begin_peer_recover(pg_shard_t peer, const hobject_t oid) {
+ peering_state.begin_peer_recover(peer, oid);
+ }
+ uint64_t min_peer_features() const {
+ return peering_state.get_min_peer_features();
+ }
+ const std::map<hobject_t, std::set<pg_shard_t>>&
+ get_missing_loc_shards() const {
+ return peering_state.get_missing_loc().get_missing_locs();
+ }
+ const std::map<pg_shard_t, pg_missing_t> &get_shard_missing() const {
+ return peering_state.get_peer_missing();
+ }
+ epoch_t get_interval_start_epoch() const {
+ return get_info().history.same_interval_since;
+ }
+ const pg_missing_const_i* get_shard_missing(pg_shard_t shard) const {
+ if (shard == pg_whoami)
+ return &get_local_missing();
+ else {
+ auto it = peering_state.get_peer_missing().find(shard);
+ if (it == peering_state.get_peer_missing().end())
+ return nullptr;
+ else
+ return &it->second;
+ }
+ }
+
+ struct complete_op_t {
+ const version_t user_version;
+ const eversion_t version;
+ const int err;
+ };
+ interruptible_future<std::optional<complete_op_t>>
+ already_complete(const osd_reqid_t& reqid);
+ int get_recovery_op_priority() const {
+ int64_t pri = 0;
+ get_pgpool().info.opts.get(pool_opts_t::RECOVERY_OP_PRIORITY, &pri);
+ return pri > 0 ? pri : crimson::common::local_conf()->osd_recovery_op_priority;
+ }
+ seastar::future<> mark_unfound_lost(int) {
+ // TODO: see PrimaryLogPG::mark_all_unfound_lost()
+ return seastar::now();
+ }
+
+ bool old_peering_msg(epoch_t reply_epoch, epoch_t query_epoch) const;
+
+ template <typename MsgType>
+ bool can_discard_replica_op(const MsgType& m) const {
+ return can_discard_replica_op(m, m.map_epoch);
+ }
+
+private:
+ // instead of seastar::gate, we use a boolean flag to indicate
+ // whether the system is shutting down, as we don't need to track
+ // continuations here.
+ bool stopping = false;
+
+ PGActivationBlocker wait_for_active_blocker;
+
+ friend std::ostream& operator<<(std::ostream&, const PG& pg);
+ friend class ClientRequest;
+ friend struct CommonClientRequest;
+ friend class PGAdvanceMap;
+ template <class T>
+ friend class PeeringEvent;
+ friend class RepRequest;
+ friend class LogMissingRequest;
+ friend class LogMissingRequestReply;
+ friend class BackfillRecovery;
+ friend struct PGFacade;
+ friend class InternalClientRequest;
+ friend class WatchTimeoutRequest;
+ friend class SnapTrimEvent;
+ friend class SnapTrimObjSubEvent;
+private:
+ seastar::future<bool> find_unfound() {
+ return seastar::make_ready_future<bool>(true);
+ }
+
+ bool can_discard_replica_op(const Message& m, epoch_t m_map_epoch) const;
+ bool can_discard_op(const MOSDOp& m) const;
+ void context_registry_on_change();
+ bool is_missing_object(const hobject_t& soid) const {
+ return peering_state.get_pg_log().get_missing().get_items().count(soid);
+ }
+ bool is_unreadable_object(const hobject_t &oid,
+ eversion_t* v = 0) const final {
+ return is_missing_object(oid) ||
+ !peering_state.get_missing_loc().readable_with_acting(
+ oid, get_actingset(), v);
+ }
+ bool is_degraded_or_backfilling_object(const hobject_t& soid) const;
+ const std::set<pg_shard_t> &get_actingset() const {
+ return peering_state.get_actingset();
+ }
+
+private:
+ friend class IOInterruptCondition;
+ struct log_update_t {
+ std::set<pg_shard_t> waiting_on;
+ seastar::shared_promise<> all_committed;
+ };
+
+ std::map<ceph_tid_t, log_update_t> log_entry_update_waiting_on;
+ // snap trimming
+ interval_set<snapid_t> snap_trimq;
+};
+
+struct PG::do_osd_ops_params_t {
+ crimson::net::ConnectionRef &get_connection() const {
+ return conn;
+ }
+ osd_reqid_t get_reqid() const {
+ return reqid;
+ }
+ utime_t get_mtime() const {
+ return mtime;
+ };
+ epoch_t get_map_epoch() const {
+ return map_epoch;
+ }
+ entity_inst_t get_orig_source_inst() const {
+ return orig_source_inst;
+ }
+ uint64_t get_features() const {
+ return features;
+ }
+ // Only used by InternalClientRequest, no op flags
+ bool has_flag(uint32_t flag) const {
+ return false;
+ }
+
+ // Only used by ExecutableMessagePimpl
+ entity_name_t get_source() const {
+ return orig_source_inst.name;
+ }
+
+ crimson::net::ConnectionRef &conn;
+ osd_reqid_t reqid;
+ utime_t mtime;
+ epoch_t map_epoch;
+ entity_inst_t orig_source_inst;
+ uint64_t features;
+};
+
+std::ostream& operator<<(std::ostream&, const PG& pg);
+
+}
+
+#if FMT_VERSION >= 90000
+template <> struct fmt::formatter<crimson::osd::PG> : fmt::ostream_formatter {};
+#endif
diff --git a/src/crimson/osd/pg_activation_blocker.cc b/src/crimson/osd/pg_activation_blocker.cc
new file mode 100644
index 000000000..48ffe3f84
--- /dev/null
+++ b/src/crimson/osd/pg_activation_blocker.cc
@@ -0,0 +1,36 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab expandtab
+
+#include "crimson/osd/pg.h"
+#include "crimson/osd/pg_activation_blocker.h"
+
+namespace crimson::osd {
+
+void PGActivationBlocker::dump_detail(Formatter *f) const
+{
+ f->dump_stream("pgid") << pg->get_pgid();
+}
+
+void PGActivationBlocker::unblock()
+{
+ p.set_value();
+ p = {};
+}
+
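+// If the PG is already active there is nothing to wait for; otherwise the
+// operation is recorded as blocked on this blocker until unblock() fulfills
+// the shared promise.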
+seastar::future<>
+PGActivationBlocker::wait(PGActivationBlocker::BlockingEvent::TriggerI&& trigger)
+{
+ if (pg->get_peering_state().is_active()) {
+ return seastar::now();
+ } else {
+ return trigger.maybe_record_blocking(p.get_shared_future(), *this);
+ }
+}
+
+seastar::future<> PGActivationBlocker::stop()
+{
+ p.set_exception(crimson::common::system_shutdown_exception());
+ return seastar::now();
+}
+
+} // namespace crimson::osd
diff --git a/src/crimson/osd/pg_activation_blocker.h b/src/crimson/osd/pg_activation_blocker.h
new file mode 100644
index 000000000..fff8219d1
--- /dev/null
+++ b/src/crimson/osd/pg_activation_blocker.h
@@ -0,0 +1,35 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab expandtab
+
+#pragma once
+
+#include <seastar/core/future.hh>
+#include <seastar/core/shared_future.hh>
+
+#include "crimson/common/operation.h"
+#include "crimson/osd/osd_operation.h"
+
+namespace crimson::osd {
+
+class PG;
+
+class PGActivationBlocker : public crimson::BlockerT<PGActivationBlocker> {
+ PG *pg;
+
+ const spg_t pgid;
+ seastar::shared_promise<> p;
+
+protected:
+ void dump_detail(Formatter *f) const;
+
+public:
+ static constexpr const char *type_name = "PGActivationBlocker";
+ using Blocker = PGActivationBlocker;
+
+ PGActivationBlocker(PG *pg) : pg(pg) {}
+ void unblock();
+ seastar::future<> wait(PGActivationBlocker::BlockingEvent::TriggerI&&);
+ seastar::future<> stop();
+};
+
+} // namespace crimson::osd
diff --git a/src/crimson/osd/pg_backend.cc b/src/crimson/osd/pg_backend.cc
new file mode 100644
index 000000000..02acb9a55
--- /dev/null
+++ b/src/crimson/osd/pg_backend.cc
@@ -0,0 +1,1811 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "pg_backend.h"
+
+#include <charconv>
+#include <optional>
+#include <boost/range/adaptor/filtered.hpp>
+#include <boost/range/adaptor/transformed.hpp>
+#include <boost/range/algorithm/copy.hpp>
+#include <fmt/format.h>
+#include <fmt/ostream.h>
+#include <seastar/core/print.hh>
+
+#include "messages/MOSDOp.h"
+#include "os/Transaction.h"
+#include "common/Checksummer.h"
+#include "common/Clock.h"
+
+#include "crimson/common/exception.h"
+#include "crimson/common/tmap_helpers.h"
+#include "crimson/os/futurized_collection.h"
+#include "crimson/os/futurized_store.h"
+#include "crimson/osd/osd_operation.h"
+#include "crimson/osd/object_context_loader.h"
+#include "replicated_backend.h"
+#include "replicated_recovery_backend.h"
+#include "ec_backend.h"
+#include "exceptions.h"
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_osd);
+ }
+}
+
+using std::runtime_error;
+using std::string;
+using std::string_view;
+using crimson::common::local_conf;
+
+std::unique_ptr<PGBackend>
+PGBackend::create(pg_t pgid,
+ const pg_shard_t pg_shard,
+ const pg_pool_t& pool,
+ crimson::os::CollectionRef coll,
+ crimson::osd::ShardServices& shard_services,
+ const ec_profile_t& ec_profile,
+ DoutPrefixProvider &dpp)
+{
+ switch (pool.type) {
+ case pg_pool_t::TYPE_REPLICATED:
+ return std::make_unique<ReplicatedBackend>(pgid, pg_shard,
+ coll, shard_services,
+ dpp);
+ case pg_pool_t::TYPE_ERASURE:
+ return std::make_unique<ECBackend>(pg_shard.shard, coll, shard_services,
+ std::move(ec_profile),
+ pool.stripe_width,
+ dpp);
+ default:
+ throw runtime_error(seastar::format("unsupported pool type '{}'",
+ pool.type));
+ }
+}
+
+PGBackend::PGBackend(shard_id_t shard,
+ CollectionRef coll,
+ crimson::osd::ShardServices &shard_services,
+ DoutPrefixProvider &dpp)
+ : shard{shard},
+ coll{coll},
+ shard_services{shard_services},
+ dpp{dpp},
+ store{&shard_services.get_store()}
+{}
+
+PGBackend::load_metadata_iertr::future
+ <PGBackend::loaded_object_md_t::ref>
+PGBackend::load_metadata(const hobject_t& oid)
+{
+ return interruptor::make_interruptible(store->get_attrs(
+ coll,
+ ghobject_t{oid, ghobject_t::NO_GEN, shard})).safe_then_interruptible(
+ [oid](auto &&attrs) -> load_metadata_ertr::future<loaded_object_md_t::ref>{
+ loaded_object_md_t::ref ret(new loaded_object_md_t());
+ if (auto oiiter = attrs.find(OI_ATTR); oiiter != attrs.end()) {
+ bufferlist bl = std::move(oiiter->second);
+ try {
+ ret->os = ObjectState(
+ object_info_t(bl, oid),
+ true);
+ } catch (const buffer::error&) {
+ logger().warn("unable to decode ObjectState");
+ throw crimson::osd::invalid_argument();
+ }
+ } else {
+ logger().error(
+ "load_metadata: object {} present but missing object info",
+ oid);
+ return crimson::ct_error::object_corrupted::make();
+ }
+
+ if (oid.is_head()) {
+ // Return object_corrupted when the object exists but the
+ // SnapSet is either not found or empty.
+ bool object_corrupted = true;
+ if (auto ssiter = attrs.find(SS_ATTR); ssiter != attrs.end()) {
+ object_corrupted = false;
+ bufferlist bl = std::move(ssiter->second);
+ if (bl.length()) {
+ ret->ssc = new crimson::osd::SnapSetContext(oid.get_snapdir());
+ try {
+ ret->ssc->snapset = SnapSet(bl);
+ ret->ssc->exists = true;
+ logger().debug(
+ "load_metadata: object {} and snapset {} present",
+ oid, ret->ssc->snapset);
+ } catch (const buffer::error&) {
+ logger().warn("unable to decode SnapSet");
+ throw crimson::osd::invalid_argument();
+ }
+ } else {
+ object_corrupted = true;
+ }
+ }
+ if (object_corrupted) {
+ logger().error(
+ "load_metadata: object {} present but missing snapset",
+ oid);
+ return crimson::ct_error::object_corrupted::make();
+ }
+ }
+
+ return load_metadata_ertr::make_ready_future<loaded_object_md_t::ref>(
+ std::move(ret));
+ }, crimson::ct_error::enoent::handle([oid] {
+ logger().debug(
+ "load_metadata: object {} doesn't exist, returning empty metadata",
+ oid);
+ return load_metadata_ertr::make_ready_future<loaded_object_md_t::ref>(
+ new loaded_object_md_t{
+ ObjectState(
+ object_info_t(oid),
+ false),
+ oid.is_head() ? (new crimson::osd::SnapSetContext(oid)) : nullptr
+ });
+ }));
+}
+
+PGBackend::rep_op_fut_t
+PGBackend::mutate_object(
+ std::set<pg_shard_t> pg_shards,
+ crimson::osd::ObjectContextRef &&obc,
+ ceph::os::Transaction&& txn,
+ osd_op_params_t&& osd_op_p,
+ epoch_t min_epoch,
+ epoch_t map_epoch,
+ std::vector<pg_log_entry_t>&& log_entries)
+{
+ logger().trace("mutate_object: num_ops={}", txn.get_num_ops());
+ if (obc->obs.exists) {
+#if 0
+ obc->obs.oi.version = ctx->at_version;
+ obc->obs.oi.prior_version = ctx->obs->oi.version;
+#endif
+
+ obc->obs.oi.prior_version = obc->obs.oi.version;
+ obc->obs.oi.version = osd_op_p.at_version;
+ if (osd_op_p.user_at_version > obc->obs.oi.user_version)
+ obc->obs.oi.user_version = osd_op_p.user_at_version;
+ obc->obs.oi.last_reqid = osd_op_p.req_id;
+ obc->obs.oi.mtime = osd_op_p.mtime;
+ obc->obs.oi.local_mtime = ceph_clock_now();
+
+ // object_info_t
+ {
+ ceph::bufferlist osv;
+ obc->obs.oi.encode_no_oid(osv, CEPH_FEATURES_ALL);
+ // TODO: get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
+ txn.setattr(coll->get_cid(), ghobject_t{obc->obs.oi.soid}, OI_ATTR, osv);
+ }
+
+ // snapset
+ if (obc->obs.oi.soid.snap == CEPH_NOSNAP) {
+ logger().debug("final snapset {} in {}",
+ obc->ssc->snapset, obc->obs.oi.soid);
+ ceph::bufferlist bss;
+ encode(obc->ssc->snapset, bss);
+ txn.setattr(coll->get_cid(), ghobject_t{obc->obs.oi.soid}, SS_ATTR, bss);
+ obc->ssc->exists = true;
+ } else {
+ logger().debug("no snapset (this is a clone)");
+ }
+ } else {
+ // reset cached ObjectState without enforcing eviction
+ obc->obs.oi = object_info_t(obc->obs.oi.soid);
+ }
+ return _submit_transaction(
+ std::move(pg_shards), obc->obs.oi.soid, std::move(txn),
+ std::move(osd_op_p), min_epoch, map_epoch, std::move(log_entries));
+}
+
+static inline bool _read_verify_data(
+ const object_info_t& oi,
+ const ceph::bufferlist& data)
+{
+ if (oi.is_data_digest() && oi.size == data.length()) {
+ // whole object? can we verify the checksum?
+ if (auto crc = data.crc32c(-1); crc != oi.data_digest) {
+ logger().error("full-object read crc {} != expected {} on {}",
+ crc, oi.data_digest, oi.soid);
+ // todo: mark soid missing, perform recovery, and retry
+ return false;
+ }
+ }
+ return true;
+}
+
+PGBackend::read_ierrorator::future<>
+PGBackend::read(const ObjectState& os, OSDOp& osd_op,
+ object_stat_sum_t& delta_stats)
+{
+ const auto& oi = os.oi;
+ const ceph_osd_op& op = osd_op.op;
+ const uint64_t offset = op.extent.offset;
+ uint64_t length = op.extent.length;
+ logger().trace("read: {} {}~{}", oi.soid, offset, length);
+
+ if (!os.exists || os.oi.is_whiteout()) {
+ logger().debug("{}: {} DNE", __func__, os.oi.soid);
+ return crimson::ct_error::enoent::make();
+ }
+ // are we beyond truncate_size?
+ size_t size = oi.size;
+ if ((op.extent.truncate_seq > oi.truncate_seq) &&
+ (op.extent.truncate_size < offset + length) &&
+ (op.extent.truncate_size < size)) {
+ size = op.extent.truncate_size;
+ }
+ if (offset >= size) {
+ // the read was trimmed to zero length; there is nothing to do
+ return read_errorator::now();
+ }
+ if (!length) {
+ // read the whole object if length is 0
+ length = size;
+ }
+ return _read(oi.soid, offset, length, op.flags).safe_then_interruptible_tuple(
+ [&delta_stats, &oi, &osd_op](auto&& bl) -> read_errorator::future<> {
+ if (!_read_verify_data(oi, bl)) {
+ // crc mismatches
+ return crimson::ct_error::object_corrupted::make();
+ }
+ logger().debug("read: data length: {}", bl.length());
+ osd_op.op.extent.length = bl.length();
+ osd_op.rval = 0;
+ delta_stats.num_rd++;
+ delta_stats.num_rd_kb += shift_round_up(bl.length(), 10);
+ osd_op.outdata = std::move(bl);
+ return read_errorator::now();
+ }, crimson::ct_error::input_output_error::handle([] {
+ return read_errorator::future<>{crimson::ct_error::object_corrupted::make()};
+ }),
+ read_errorator::pass_further{});
+}
+
+PGBackend::read_ierrorator::future<>
+PGBackend::sparse_read(const ObjectState& os, OSDOp& osd_op,
+ object_stat_sum_t& delta_stats)
+{
+ if (!os.exists || os.oi.is_whiteout()) {
+ logger().debug("{}: {} DNE", __func__, os.oi.soid);
+ return crimson::ct_error::enoent::make();
+ }
+
+ const auto& op = osd_op.op;
+ /* clients (particularly cephfs) may send truncate operations out of order
+ * w.r.t. reads. op.extent.truncate_seq and op.extent.truncate_size allow
+ * the OSD to determine whether the client-submitted read needs to be
+ * adjusted to compensate for a truncate the OSD hasn't seen yet.
+ */
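+ // Illustrative (hypothetical) numbers: with oi.size = 100, a read of
+ // offset = 0 and length = 4096 carrying truncate_size = 50 and a newer
+ // truncate_seq is clamped below to adjusted_length = 50.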
+ uint64_t adjusted_size = os.oi.size;
+ const uint64_t offset = op.extent.offset;
+ uint64_t adjusted_length = op.extent.length;
+ if ((os.oi.truncate_seq < op.extent.truncate_seq) &&
+ (op.extent.offset + op.extent.length > op.extent.truncate_size) &&
+ (adjusted_size > op.extent.truncate_size)) {
+ adjusted_size = op.extent.truncate_size;
+ }
+ if (offset > adjusted_size) {
+ adjusted_length = 0;
+ } else if (offset + adjusted_length > adjusted_size) {
+ adjusted_length = adjusted_size - offset;
+ }
+ logger().trace("sparse_read: {} {}~{}",
+ os.oi.soid, op.extent.offset, op.extent.length);
+ return interruptor::make_interruptible(store->fiemap(coll, ghobject_t{os.oi.soid},
+ offset, adjusted_length)).safe_then_interruptible(
+ [&delta_stats, &os, &osd_op, this](auto&& m) {
+ return seastar::do_with(interval_set<uint64_t>{std::move(m)},
+ [&delta_stats, &os, &osd_op, this](auto&& extents) {
+ return interruptor::make_interruptible(store->readv(coll, ghobject_t{os.oi.soid},
+ extents, osd_op.op.flags)).safe_then_interruptible_tuple(
+ [&delta_stats, &os, &osd_op, &extents](auto&& bl) -> read_errorator::future<> {
+ if (_read_verify_data(os.oi, bl)) {
+ osd_op.op.extent.length = bl.length();
+ // re-encode since it might be modified
+ ceph::encode(extents, osd_op.outdata);
+ encode_destructively(bl, osd_op.outdata);
+ logger().trace("sparse_read got {} bytes from object {}",
+ osd_op.op.extent.length, os.oi.soid);
+ delta_stats.num_rd++;
+ delta_stats.num_rd_kb += shift_round_up(osd_op.op.extent.length, 10);
+ return read_errorator::make_ready_future<>();
+ } else {
+ // crc mismatches
+ return crimson::ct_error::object_corrupted::make();
+ }
+ }, crimson::ct_error::input_output_error::handle([] {
+ return read_errorator::future<>{crimson::ct_error::object_corrupted::make()};
+ }),
+ read_errorator::pass_further{});
+ });
+ });
+}
+
+namespace {
+
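+ // Decodes the client-provided init value from init_value_bl, computes the
+ // CSum checksum of `buf` chunk by chunk, and encodes the chunk count
+ // followed by the raw checksum values into `result`.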
+ template<class CSum>
+ PGBackend::checksum_errorator::future<>
+ do_checksum(ceph::bufferlist& init_value_bl,
+ size_t chunk_size,
+ const ceph::bufferlist& buf,
+ ceph::bufferlist& result)
+ {
+ typename CSum::init_value_t init_value;
+ auto init_value_p = init_value_bl.cbegin();
+ try {
+ decode(init_value, init_value_p);
+ // chop off the consumed part
+ init_value_bl.splice(0, init_value_p.get_off());
+ } catch (const ceph::buffer::end_of_buffer&) {
+ logger().warn("{}: init value not provided", __func__);
+ return crimson::ct_error::invarg::make();
+ }
+ const uint32_t chunk_count = buf.length() / chunk_size;
+ ceph::bufferptr csum_data{
+ ceph::buffer::create(sizeof(typename CSum::value_t) * chunk_count)};
+ Checksummer::calculate<CSum>(
+ init_value, chunk_size, 0, buf.length(), buf, &csum_data);
+ encode(chunk_count, result);
+ result.append(std::move(csum_data));
+ return PGBackend::checksum_errorator::now();
+ }
+}
+
+PGBackend::checksum_ierrorator::future<>
+PGBackend::checksum(const ObjectState& os, OSDOp& osd_op)
+{
+ // sanity-check and normalize the arguments
+ auto& checksum = osd_op.op.checksum;
+ if (checksum.offset == 0 && checksum.length == 0) {
+ // zeroed offset+length implies checksum whole object
+ checksum.length = os.oi.size;
+ } else if (checksum.offset >= os.oi.size) {
+ // read size was trimmed to zero, do nothing,
+ // see PGBackend::read()
+ return checksum_errorator::now();
+ }
+ if (checksum.chunk_size > 0) {
+ if (checksum.length == 0) {
+ logger().warn("{}: length required when chunk size provided", __func__);
+ return crimson::ct_error::invarg::make();
+ }
+ if (checksum.length % checksum.chunk_size != 0) {
+ logger().warn("{}: length not aligned to chunk size", __func__);
+ return crimson::ct_error::invarg::make();
+ }
+ } else {
+ checksum.chunk_size = checksum.length;
+ }
+ if (checksum.length == 0) {
+ uint32_t count = 0;
+ encode(count, osd_op.outdata);
+ return checksum_errorator::now();
+ }
+
+ // read the chunk to be checksum'ed
+ return _read(os.oi.soid, checksum.offset, checksum.length, osd_op.op.flags)
+ .safe_then_interruptible(
+ [&osd_op](auto&& read_bl) mutable -> checksum_errorator::future<> {
+ auto& checksum = osd_op.op.checksum;
+ if (read_bl.length() != checksum.length) {
+ logger().warn("checksum: bytes read {} != {}",
+ read_bl.length(), checksum.length);
+ return crimson::ct_error::invarg::make();
+ }
+ // calculate its checksum and put the result in outdata
+ switch (checksum.type) {
+ case CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH32:
+ return do_checksum<Checksummer::xxhash32>(osd_op.indata,
+ checksum.chunk_size,
+ read_bl,
+ osd_op.outdata);
+ case CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH64:
+ return do_checksum<Checksummer::xxhash64>(osd_op.indata,
+ checksum.chunk_size,
+ read_bl,
+ osd_op.outdata);
+ case CEPH_OSD_CHECKSUM_OP_TYPE_CRC32C:
+ return do_checksum<Checksummer::crc32c>(osd_op.indata,
+ checksum.chunk_size,
+ read_bl,
+ osd_op.outdata);
+ default:
+ logger().warn("checksum: unknown crc type ({})",
+ static_cast<uint32_t>(checksum.type));
+ return crimson::ct_error::invarg::make();
+ }
+ });
+}
+
+PGBackend::cmp_ext_ierrorator::future<>
+PGBackend::cmp_ext(const ObjectState& os, OSDOp& osd_op)
+{
+ const ceph_osd_op& op = osd_op.op;
+ uint64_t obj_size = os.oi.size;
+ if (os.oi.truncate_seq < op.extent.truncate_seq &&
+ op.extent.offset + op.extent.length > op.extent.truncate_size) {
+ obj_size = op.extent.truncate_size;
+ }
+ uint64_t ext_len;
+ if (op.extent.offset >= obj_size) {
+ ext_len = 0;
+ } else if (op.extent.offset + op.extent.length > obj_size) {
+ ext_len = obj_size - op.extent.offset;
+ } else {
+ ext_len = op.extent.length;
+ }
+ auto read_ext = ll_read_ierrorator::make_ready_future<ceph::bufferlist>();
+ if (ext_len == 0) {
+ logger().debug("{}: zero length extent", __func__);
+ } else if (!os.exists || os.oi.is_whiteout()) {
+ logger().debug("{}: {} DNE", __func__, os.oi.soid);
+ } else {
+ read_ext = _read(os.oi.soid, op.extent.offset, ext_len, 0);
+ }
+ return read_ext.safe_then_interruptible([&osd_op](auto&& read_bl)
+ -> cmp_ext_errorator::future<> {
+ for (unsigned index = 0; index < osd_op.indata.length(); index++) {
+ char byte_in_op = osd_op.indata[index];
+ char byte_from_disk = (index < read_bl.length() ? read_bl[index] : 0);
+ if (byte_in_op != byte_from_disk) {
+ logger().debug("cmp_ext: mismatch at {}", index);
+ // Unlike other ops, we set osd_op.rval here and return a different
+ // error code via ct_error::cmp_fail.
+ osd_op.rval = -MAX_ERRNO - index;
+ return crimson::ct_error::cmp_fail::make();
+ }
+ }
+ osd_op.rval = 0;
+ return cmp_ext_errorator::make_ready_future<>();
+ });
+}
+
+PGBackend::stat_ierrorator::future<>
+PGBackend::stat(
+ const ObjectState& os,
+ OSDOp& osd_op,
+ object_stat_sum_t& delta_stats)
+{
+ if (os.exists/* TODO: && !os.is_whiteout() */) {
+ logger().debug("stat os.oi.size={}, os.oi.mtime={}", os.oi.size, os.oi.mtime);
+ encode(os.oi.size, osd_op.outdata);
+ encode(os.oi.mtime, osd_op.outdata);
+ } else {
+ logger().debug("stat object does not exist");
+ return crimson::ct_error::enoent::make();
+ }
+ delta_stats.num_rd++;
+ return stat_errorator::now();
+}
+
+PGBackend::write_iertr::future<> PGBackend::_writefull(
+ ObjectState& os,
+ off_t truncate_size,
+ const bufferlist& bl,
+ ceph::os::Transaction& txn,
+ osd_op_params_t& osd_op_params,
+ object_stat_sum_t& delta_stats,
+ unsigned flags)
+{
+ const bool existing = maybe_create_new_object(os, txn, delta_stats);
+ if (existing && bl.length() < os.oi.size) {
+
+ txn.truncate(coll->get_cid(), ghobject_t{os.oi.soid}, bl.length());
+ truncate_update_size_and_usage(delta_stats, os.oi, truncate_size);
+
+ osd_op_params.clean_regions.mark_data_region_dirty(
+ bl.length(),
+ os.oi.size - bl.length());
+ }
+ if (bl.length()) {
+ txn.write(
+ coll->get_cid(), ghobject_t{os.oi.soid}, 0, bl.length(),
+ bl, flags);
+ update_size_and_usage(
+ delta_stats, os.oi, 0,
+ bl.length(), true);
+ osd_op_params.clean_regions.mark_data_region_dirty(
+ 0,
+ std::max((uint64_t)bl.length(), os.oi.size));
+ }
+ return seastar::now();
+}
+
+PGBackend::write_iertr::future<> PGBackend::_truncate(
+ ObjectState& os,
+ ceph::os::Transaction& txn,
+ osd_op_params_t& osd_op_params,
+ object_stat_sum_t& delta_stats,
+ size_t offset,
+ size_t truncate_size,
+ uint32_t truncate_seq)
+{
+ if (truncate_seq) {
+ assert(offset == truncate_size);
+ if (truncate_seq <= os.oi.truncate_seq) {
+ logger().debug("{} truncate seq {} <= current {}, no-op",
+ __func__, truncate_seq, os.oi.truncate_seq);
+ return write_ertr::make_ready_future<>();
+ } else {
+ logger().debug("{} truncate seq {} > current {}, truncating",
+ __func__, truncate_seq, os.oi.truncate_seq);
+ os.oi.truncate_seq = truncate_seq;
+ os.oi.truncate_size = truncate_size;
+ }
+ }
+ maybe_create_new_object(os, txn, delta_stats);
+ if (os.oi.size != offset) {
+ txn.truncate(
+ coll->get_cid(),
+ ghobject_t{os.oi.soid}, offset);
+ if (os.oi.size > offset) {
+ // TODO: modified_ranges.union_of(trim);
+ osd_op_params.clean_regions.mark_data_region_dirty(
+ offset,
+ os.oi.size - offset);
+ } else {
+ // os.oi.size < offset
+ osd_op_params.clean_regions.mark_data_region_dirty(
+ os.oi.size,
+ offset - os.oi.size);
+ }
+ truncate_update_size_and_usage(delta_stats, os.oi, offset);
+ os.oi.clear_data_digest();
+ }
+ delta_stats.num_wr++;
+ return write_ertr::now();
+}
+
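+// Returns true if the object already existed (clearing a whiteout flag if
+// one was set); returns false if a brand-new object had to be touched into
+// the transaction.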
+bool PGBackend::maybe_create_new_object(
+ ObjectState& os,
+ ceph::os::Transaction& txn,
+ object_stat_sum_t& delta_stats)
+{
+ if (!os.exists) {
+ ceph_assert(!os.oi.is_whiteout());
+ os.exists = true;
+ os.oi.new_object();
+
+ txn.touch(coll->get_cid(), ghobject_t{os.oi.soid});
+ delta_stats.num_objects++;
+ return false;
+ } else if (os.oi.is_whiteout()) {
+ os.oi.clear_flag(object_info_t::FLAG_WHITEOUT);
+ delta_stats.num_whiteouts--;
+ }
+ return true;
+}
+
+void PGBackend::update_size_and_usage(object_stat_sum_t& delta_stats,
+ object_info_t& oi, uint64_t offset,
+ uint64_t length, bool write_full)
+{
+ if (write_full ||
+ (offset + length > oi.size && length)) {
+ uint64_t new_size = offset + length;
+ delta_stats.num_bytes -= oi.size;
+ delta_stats.num_bytes += new_size;
+ oi.size = new_size;
+ }
+ delta_stats.num_wr++;
+ delta_stats.num_wr_kb += shift_round_up(length, 10);
+}
+
+void PGBackend::truncate_update_size_and_usage(object_stat_sum_t& delta_stats,
+ object_info_t& oi,
+ uint64_t truncate_size)
+{
+ if (oi.size != truncate_size) {
+ delta_stats.num_bytes -= oi.size;
+ delta_stats.num_bytes += truncate_size;
+ oi.size = truncate_size;
+ }
+}
+
+static bool is_offset_and_length_valid(
+ const std::uint64_t offset,
+ const std::uint64_t length)
+{
+ if (const std::uint64_t max = local_conf()->osd_max_object_size;
+ offset >= max || length > max || offset + length > max) {
+ logger().debug("{} osd_max_object_size: {}, offset: {}, len: {}; "
+ "Hard limit of object size is 4GB",
+ __func__, max, offset, length);
+ return false;
+ } else {
+ return true;
+ }
+}
+
+PGBackend::interruptible_future<> PGBackend::set_allochint(
+ ObjectState& os,
+ const OSDOp& osd_op,
+ ceph::os::Transaction& txn,
+ object_stat_sum_t& delta_stats)
+{
+ maybe_create_new_object(os, txn, delta_stats);
+
+ os.oi.expected_object_size = osd_op.op.alloc_hint.expected_object_size;
+ os.oi.expected_write_size = osd_op.op.alloc_hint.expected_write_size;
+ os.oi.alloc_hint_flags = osd_op.op.alloc_hint.flags;
+ txn.set_alloc_hint(coll->get_cid(),
+ ghobject_t{os.oi.soid},
+ os.oi.expected_object_size,
+ os.oi.expected_write_size,
+ os.oi.alloc_hint_flags);
+ return seastar::now();
+}
+
+PGBackend::write_iertr::future<> PGBackend::write(
+ ObjectState& os,
+ const OSDOp& osd_op,
+ ceph::os::Transaction& txn,
+ osd_op_params_t& osd_op_params,
+ object_stat_sum_t& delta_stats)
+{
+ const ceph_osd_op& op = osd_op.op;
+ uint64_t offset = op.extent.offset;
+ uint64_t length = op.extent.length;
+ bufferlist buf = osd_op.indata;
+ if (op.extent.length != osd_op.indata.length()) {
+ return crimson::ct_error::invarg::make();
+ }
+
+ if (!is_offset_and_length_valid(op.extent.offset, op.extent.length)) {
+ return crimson::ct_error::file_too_large::make();
+ }
+
+ if (auto seq = os.oi.truncate_seq;
+ seq != 0 && op.extent.truncate_seq < seq) {
+ // old write, arrived after trimtrunc
+ if (offset + length > os.oi.size) {
+ // no-op
+ if (offset > os.oi.size) {
+ length = 0;
+ buf.clear();
+ } else {
+ // truncate
+ auto len = os.oi.size - offset;
+ buf.splice(len, length);
+ length = len;
+ }
+ }
+ } else if (op.extent.truncate_seq > seq) {
+ // write arrives before trimtrunc
+ if (os.exists && !os.oi.is_whiteout()) {
+ txn.truncate(coll->get_cid(),
+ ghobject_t{os.oi.soid}, op.extent.truncate_size);
+ if (op.extent.truncate_size != os.oi.size) {
+ os.oi.size = length;
+ if (op.extent.truncate_size > os.oi.size) {
+ osd_op_params.clean_regions.mark_data_region_dirty(os.oi.size,
+ op.extent.truncate_size - os.oi.size);
+ } else {
+ osd_op_params.clean_regions.mark_data_region_dirty(op.extent.truncate_size,
+ os.oi.size - op.extent.truncate_size);
+ }
+ }
+ truncate_update_size_and_usage(delta_stats, os.oi, op.extent.truncate_size);
+ }
+ os.oi.truncate_seq = op.extent.truncate_seq;
+ os.oi.truncate_size = op.extent.truncate_size;
+ }
+ maybe_create_new_object(os, txn, delta_stats);
+ if (length == 0) {
+ if (offset > os.oi.size) {
+ txn.truncate(coll->get_cid(), ghobject_t{os.oi.soid}, op.extent.offset);
+ truncate_update_size_and_usage(delta_stats, os.oi, op.extent.offset);
+ } else {
+ txn.nop();
+ }
+ } else {
+ txn.write(coll->get_cid(), ghobject_t{os.oi.soid},
+ offset, length, std::move(buf), op.flags);
+ update_size_and_usage(delta_stats, os.oi, offset, length);
+ }
+ osd_op_params.clean_regions.mark_data_region_dirty(op.extent.offset,
+ op.extent.length);
+
+ return seastar::now();
+}
+
+PGBackend::interruptible_future<> PGBackend::write_same(
+ ObjectState& os,
+ const OSDOp& osd_op,
+ ceph::os::Transaction& txn,
+ osd_op_params_t& osd_op_params,
+ object_stat_sum_t& delta_stats)
+{
+ const ceph_osd_op& op = osd_op.op;
+ const uint64_t len = op.writesame.length;
+ if (len == 0) {
+ return seastar::now();
+ }
+ if (op.writesame.data_length == 0 ||
+ len % op.writesame.data_length != 0 ||
+ op.writesame.data_length != osd_op.indata.length()) {
+ throw crimson::osd::invalid_argument();
+ }
+ ceph::bufferlist repeated_indata;
+ for (uint64_t size = 0; size < len; size += op.writesame.data_length) {
+ repeated_indata.append(osd_op.indata);
+ }
+ maybe_create_new_object(os, txn, delta_stats);
+ txn.write(coll->get_cid(), ghobject_t{os.oi.soid},
+ op.writesame.offset, len,
+ std::move(repeated_indata), op.flags);
+ update_size_and_usage(delta_stats, os.oi, op.writesame.offset, len);
+ osd_op_params.clean_regions.mark_data_region_dirty(op.writesame.offset, len);
+ return seastar::now();
+}
+
+PGBackend::write_iertr::future<> PGBackend::writefull(
+ ObjectState& os,
+ const OSDOp& osd_op,
+ ceph::os::Transaction& txn,
+ osd_op_params_t& osd_op_params,
+ object_stat_sum_t& delta_stats)
+{
+ const ceph_osd_op& op = osd_op.op;
+ if (op.extent.length != osd_op.indata.length()) {
+ return crimson::ct_error::invarg::make();
+ }
+ if (!is_offset_and_length_valid(op.extent.offset, op.extent.length)) {
+ return crimson::ct_error::file_too_large::make();
+ }
+
+ return _writefull(
+ os,
+ op.extent.truncate_size,
+ osd_op.indata,
+ txn,
+ osd_op_params,
+ delta_stats,
+ op.flags);
+}
+
+PGBackend::rollback_iertr::future<> PGBackend::rollback(
+ ObjectState& os,
+ const OSDOp& osd_op,
+ ceph::os::Transaction& txn,
+ osd_op_params_t& osd_op_params,
+ object_stat_sum_t& delta_stats,
+ crimson::osd::ObjectContextRef head,
+ crimson::osd::ObjectContextLoader& obc_loader)
+{
+ const ceph_osd_op& op = osd_op.op;
+ snapid_t snapid = (uint64_t)op.snap.snapid;
+ assert(os.oi.soid.is_head());
+ logger().debug("{} deleting {} and rolling back to old snap {}",
+ __func__, os.oi.soid, snapid);
+ hobject_t target_coid = os.oi.soid;
+ target_coid.snap = snapid;
+ return obc_loader.with_clone_obc_only<RWState::RWWRITE>(
+ head, target_coid,
+ [this, &os, &txn, &delta_stats, &osd_op_params]
+ (auto resolved_obc) {
+ if (resolved_obc->obs.oi.soid.is_head()) {
+ // no-op: The resolved oid returned the head object
+ logger().debug("PGBackend::rollback: loaded head_obc: {}"
+ " do nothing",
+ resolved_obc->obs.oi.soid);
+ return rollback_iertr::now();
+ }
+ /* TODO: https://tracker.ceph.com/issues/59114 This implementation will not
+ * behave correctly for a rados operation consisting of a mutation followed
+ * by a rollback to a snapshot since the last mutation of the object.
+ * The correct behavior would be for the rollback to undo the mutation
+ * earlier in the operation by resolving to the clone created at the start
+ * of the operation (see resolve_oid).
+ * Instead, it will select HEAD leaving that mutation intact since the SnapSet won't
+ * yet contain that clone. This behavior exists in classic as well.
+ */
+ logger().debug("PGBackend::rollback: loaded clone_obc: {}",
+ resolved_obc->obs.oi.soid);
+ // 1) Delete current head
+ if (os.exists) {
+ txn.remove(coll->get_cid(), ghobject_t{os.oi.soid,
+ ghobject_t::NO_GEN, shard});
+ }
+ // 2) Clone correct snapshot into head
+ txn.clone(coll->get_cid(), ghobject_t{resolved_obc->obs.oi.soid},
+ ghobject_t{os.oi.soid});
+ // Copy clone obc.os.oi to os.oi
+ os.oi.clear_flag(object_info_t::FLAG_WHITEOUT);
+ os.oi.copy_user_bits(resolved_obc->obs.oi);
+ delta_stats.num_bytes -= os.oi.size;
+ delta_stats.num_bytes += resolved_obc->obs.oi.size;
+ osd_op_params.clean_regions.mark_data_region_dirty(0,
+ std::max(os.oi.size, resolved_obc->obs.oi.size));
+ osd_op_params.clean_regions.mark_omap_dirty();
+ // TODO: 3) Calculate clone_overlaps by following overlaps
+ // forward from rollback snapshot
+ // https://tracker.ceph.com/issues/58263
+ return rollback_iertr::now();
+ }).safe_then_interruptible([] {
+ logger().debug("PGBackend::rollback succeeded");
+ return rollback_iertr::now();
+ },
+ // there's no such snapshot, or there's no object.
+ // if there's no snapshot, we delete the object;
+ // otherwise, do nothing.
+ crimson::ct_error::enoent::handle(
+ [this, &os, &snapid, &txn, &delta_stats] {
+ logger().debug("PGBackend::rollback: deleting head on {}"
+ " with snap_id of {}"
+ " because got ENOENT|whiteout on obc lookup",
+ os.oi.soid, snapid);
+ return remove(os, txn, delta_stats, false);
+ }),
+ rollback_ertr::pass_further{},
+ crimson::ct_error::assert_all{"unexpected error in rollback"}
+ );
+}
+
+PGBackend::append_ierrorator::future<> PGBackend::append(
+ ObjectState& os,
+ OSDOp& osd_op,
+ ceph::os::Transaction& txn,
+ osd_op_params_t& osd_op_params,
+ object_stat_sum_t& delta_stats)
+{
+ const ceph_osd_op& op = osd_op.op;
+ if (op.extent.length != osd_op.indata.length()) {
+ return crimson::ct_error::invarg::make();
+ }
+ maybe_create_new_object(os, txn, delta_stats);
+ if (op.extent.length) {
+ txn.write(coll->get_cid(), ghobject_t{os.oi.soid},
+ os.oi.size /* offset */, op.extent.length,
+ std::move(osd_op.indata), op.flags);
+ update_size_and_usage(delta_stats, os.oi, os.oi.size,
+ op.extent.length);
+ osd_op_params.clean_regions.mark_data_region_dirty(os.oi.size,
+ op.extent.length);
+ }
+ return seastar::now();
+}
+
+PGBackend::write_iertr::future<> PGBackend::truncate(
+ ObjectState& os,
+ const OSDOp& osd_op,
+ ceph::os::Transaction& txn,
+ osd_op_params_t& osd_op_params,
+ object_stat_sum_t& delta_stats)
+{
+ if (!os.exists || os.oi.is_whiteout()) {
+ logger().debug("{} object dne, truncate is a no-op", __func__);
+ return write_ertr::now();
+ }
+ const ceph_osd_op& op = osd_op.op;
+ if (!is_offset_and_length_valid(op.extent.offset, op.extent.length)) {
+ return crimson::ct_error::file_too_large::make();
+ }
+ return _truncate(
+ os, txn, osd_op_params, delta_stats,
+ op.extent.offset, op.extent.truncate_size, op.extent.truncate_seq);
+}
+
+PGBackend::write_iertr::future<> PGBackend::zero(
+ ObjectState& os,
+ const OSDOp& osd_op,
+ ceph::os::Transaction& txn,
+ osd_op_params_t& osd_op_params,
+ object_stat_sum_t& delta_stats)
+{
+ if (!os.exists || os.oi.is_whiteout()) {
+ logger().debug("{} object dne, zero is a no-op", __func__);
+ return write_ertr::now();
+ }
+ const ceph_osd_op& op = osd_op.op;
+ if (!is_offset_and_length_valid(op.extent.offset, op.extent.length)) {
+ return crimson::ct_error::file_too_large::make();
+ }
+
+ if (op.extent.offset >= os.oi.size || op.extent.length == 0) {
+ return write_iertr::now(); // noop
+ }
+
+ if (op.extent.offset + op.extent.length >= os.oi.size) {
+ return _truncate(
+ os, txn, osd_op_params, delta_stats,
+ op.extent.offset, op.extent.truncate_size, op.extent.truncate_seq);
+ }
+
+ txn.zero(coll->get_cid(),
+ ghobject_t{os.oi.soid},
+ op.extent.offset,
+ op.extent.length);
+ // TODO: modified_ranges.union_of(zeroed);
+ osd_op_params.clean_regions.mark_data_region_dirty(op.extent.offset,
+ op.extent.length);
+ delta_stats.num_wr++;
+ os.oi.clear_data_digest();
+ return write_ertr::now();
+}
+
+PGBackend::create_iertr::future<> PGBackend::create(
+ ObjectState& os,
+ const OSDOp& osd_op,
+ ceph::os::Transaction& txn,
+ object_stat_sum_t& delta_stats)
+{
+ if (os.exists && !os.oi.is_whiteout() &&
+ (osd_op.op.flags & CEPH_OSD_OP_FLAG_EXCL)) {
+ // this is an exclusive create
+ return crimson::ct_error::eexist::make();
+ }
+
+ if (osd_op.indata.length()) {
+ // handle the legacy. `category` is no longer implemented.
+ try {
+ auto p = osd_op.indata.cbegin();
+ std::string category;
+ decode(category, p);
+ } catch (buffer::error&) {
+ return crimson::ct_error::invarg::make();
+ }
+ }
+ maybe_create_new_object(os, txn, delta_stats);
+ txn.create(coll->get_cid(),
+ ghobject_t{os.oi.soid, ghobject_t::NO_GEN, shard});
+ return seastar::now();
+}
+
+PGBackend::interruptible_future<>
+PGBackend::remove(ObjectState& os, ceph::os::Transaction& txn)
+{
+ // todo: snapset
+ txn.remove(coll->get_cid(),
+ ghobject_t{os.oi.soid, ghobject_t::NO_GEN, shard});
+ os.oi.size = 0;
+ os.oi.new_object();
+ os.exists = false;
+ // todo: update watchers
+ if (os.oi.is_whiteout()) {
+ os.oi.clear_flag(object_info_t::FLAG_WHITEOUT);
+ }
+ return seastar::now();
+}
+
+PGBackend::remove_iertr::future<>
+PGBackend::remove(ObjectState& os, ceph::os::Transaction& txn,
+ object_stat_sum_t& delta_stats, bool whiteout)
+{
+ if (!os.exists) {
+ logger().debug("{} {} does not exist", __func__, os.oi.soid);
+ return crimson::ct_error::enoent::make();
+ }
+ if (whiteout && os.oi.is_whiteout()) {
+ logger().debug("{} whiteout set on {}", __func__, os.oi.soid);
+ return seastar::now();
+ }
+ txn.remove(coll->get_cid(),
+ ghobject_t{os.oi.soid, ghobject_t::NO_GEN, shard});
+ delta_stats.num_bytes -= os.oi.size;
+ os.oi.size = 0;
+ os.oi.new_object();
+
+ // todo: clone_overlap
+ if (whiteout) {
+ logger().debug("{} setting whiteout on {}", __func__, os.oi.soid);
+ os.oi.set_flag(object_info_t::FLAG_WHITEOUT);
+ delta_stats.num_whiteouts++;
+ txn.create(coll->get_cid(),
+ ghobject_t{os.oi.soid, ghobject_t::NO_GEN, shard});
+ return seastar::now();
+ }
+ // todo: update watchers
+ if (os.oi.is_whiteout()) {
+ os.oi.clear_flag(object_info_t::FLAG_WHITEOUT);
+ delta_stats.num_whiteouts--;
+ }
+ delta_stats.num_objects--;
+ os.exists = false;
+ return seastar::now();
+}
+
+PGBackend::interruptible_future<std::tuple<std::vector<hobject_t>, hobject_t>>
+PGBackend::list_objects(const hobject_t& start, uint64_t limit) const
+{
+ auto gstart = start.is_min() ? ghobject_t{} : ghobject_t{start, 0, shard};
+ return interruptor::make_interruptible(store->list_objects(coll,
+ gstart,
+ ghobject_t::get_max(),
+ limit))
+ .then_interruptible([](auto ret) {
+ auto& [gobjects, next] = ret;
+ std::vector<hobject_t> objects;
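+ // keep only user-visible objects: drop the pgmeta object, temp objects
+ // and any ghobject carrying a generation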
+ boost::copy(gobjects |
+ boost::adaptors::filtered([](const ghobject_t& o) {
+ if (o.is_pgmeta()) {
+ return false;
+ } else if (o.hobj.is_temp()) {
+ return false;
+ } else {
+ return o.is_no_gen();
+ }
+ }) |
+ boost::adaptors::transformed([](const ghobject_t& o) {
+ return o.hobj;
+ }),
+ std::back_inserter(objects));
+ return seastar::make_ready_future<std::tuple<std::vector<hobject_t>, hobject_t>>(
+ std::make_tuple(objects, next.hobj));
+ });
+}
+
+PGBackend::setxattr_ierrorator::future<> PGBackend::setxattr(
+ ObjectState& os,
+ const OSDOp& osd_op,
+ ceph::os::Transaction& txn,
+ object_stat_sum_t& delta_stats)
+{
+ if (local_conf()->osd_max_attr_size > 0 &&
+ osd_op.op.xattr.value_len > local_conf()->osd_max_attr_size) {
+ return crimson::ct_error::file_too_large::make();
+ }
+
+ const auto max_name_len = std::min<uint64_t>(
+ store->get_max_attr_name_length(), local_conf()->osd_max_attr_name_len);
+ if (osd_op.op.xattr.name_len > max_name_len) {
+ return crimson::ct_error::enametoolong::make();
+ }
+
+ maybe_create_new_object(os, txn, delta_stats);
+
+ std::string name{"_"};
+ ceph::bufferlist val;
+ {
+ auto bp = osd_op.indata.cbegin();
+ bp.copy(osd_op.op.xattr.name_len, name);
+ bp.copy(osd_op.op.xattr.value_len, val);
+ }
+ logger().debug("setxattr on obj={} for attr={}", os.oi.soid, name);
+ txn.setattr(coll->get_cid(), ghobject_t{os.oi.soid}, name, val);
+ delta_stats.num_wr++;
+ return seastar::now();
+}
+
+PGBackend::get_attr_ierrorator::future<> PGBackend::getxattr(
+ const ObjectState& os,
+ OSDOp& osd_op,
+ object_stat_sum_t& delta_stats) const
+{
+ std::string name;
+ ceph::bufferlist val;
+ {
+ auto bp = osd_op.indata.cbegin();
+ std::string aname;
+ bp.copy(osd_op.op.xattr.name_len, aname);
+ name = "_" + aname;
+ }
+ logger().debug("getxattr on obj={} for attr={}", os.oi.soid, name);
+ return getxattr(os.oi.soid, std::move(name)).safe_then_interruptible(
+ [&delta_stats, &osd_op] (ceph::bufferlist&& val) {
+ osd_op.outdata = std::move(val);
+ osd_op.op.xattr.value_len = osd_op.outdata.length();
+ delta_stats.num_rd++;
+ delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10);
+ return get_attr_errorator::now();
+ });
+}
+
+PGBackend::get_attr_ierrorator::future<ceph::bufferlist>
+PGBackend::getxattr(
+ const hobject_t& soid,
+ std::string_view key) const
+{
+ return store->get_attr(coll, ghobject_t{soid}, key);
+}
+
+PGBackend::get_attr_ierrorator::future<ceph::bufferlist>
+PGBackend::getxattr(
+ const hobject_t& soid,
+ std::string&& key) const
+{
+ return seastar::do_with(key, [this, &soid](auto &key) {
+ return store->get_attr(coll, ghobject_t{soid}, key);
+ });
+}
+
+PGBackend::get_attr_ierrorator::future<> PGBackend::get_xattrs(
+ const ObjectState& os,
+ OSDOp& osd_op,
+ object_stat_sum_t& delta_stats) const
+{
+ return store->get_attrs(coll, ghobject_t{os.oi.soid}).safe_then(
+ [&delta_stats, &osd_op](auto&& attrs) {
+ std::vector<std::pair<std::string, bufferlist>> user_xattrs;
+ ceph::bufferlist bl;
+ for (auto& [key, val] : attrs) {
+ if (key.size() > 1 && key[0] == '_') {
+ bl.append(std::move(val));
+ user_xattrs.emplace_back(key.substr(1), std::move(bl));
+ }
+ }
+ ceph::encode(user_xattrs, osd_op.outdata);
+ delta_stats.num_rd++;
+ delta_stats.num_rd_kb += shift_round_up(bl.length(), 10);
+ return get_attr_errorator::now();
+ });
+}
+
+namespace {
+
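+// Evaluates the requested CEPH_OSD_CMPXATTR_OP_* comparison: returns a
+// nonzero value when it holds, 0 when it does not, and -EINVAL for an
+// unknown comparison op.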
+template<typename U, typename V>
+int do_cmp_xattr(int op, const U& lhs, const V& rhs)
+{
+ switch (op) {
+ case CEPH_OSD_CMPXATTR_OP_EQ:
+ return lhs == rhs;
+ case CEPH_OSD_CMPXATTR_OP_NE:
+ return lhs != rhs;
+ case CEPH_OSD_CMPXATTR_OP_GT:
+ return lhs > rhs;
+ case CEPH_OSD_CMPXATTR_OP_GTE:
+ return lhs >= rhs;
+ case CEPH_OSD_CMPXATTR_OP_LT:
+ return lhs < rhs;
+ case CEPH_OSD_CMPXATTR_OP_LTE:
+ return lhs <= rhs;
+ default:
+ return -EINVAL;
+ }
+}
+
+} // anonymous namespace
+
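+// Parses the stored xattr value as a decimal uint64 (an absent or empty
+// value compares as 0) and applies the requested comparison against lhs.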
+static int do_xattr_cmp_u64(int op, uint64_t lhs, bufferlist& rhs_xattr)
+{
+ uint64_t rhs;
+
+ if (rhs_xattr.length() > 0) {
+ const char* first = rhs_xattr.c_str();
+ if (auto [p, ec] = std::from_chars(first, first + rhs_xattr.length(), rhs);
+ ec != std::errc()) {
+ return -EINVAL;
+ }
+ } else {
+ rhs = 0;
+ }
+ logger().debug("do_xattr_cmp_u64 '{}' vs '{}' op {}", lhs, rhs, op);
+ return do_cmp_xattr(op, lhs, rhs);
+}
+
+PGBackend::cmp_xattr_ierrorator::future<> PGBackend::cmp_xattr(
+ const ObjectState& os,
+ OSDOp& osd_op,
+ object_stat_sum_t& delta_stats) const
+{
+ std::string name{"_"};
+ auto bp = osd_op.indata.cbegin();
+ bp.copy(osd_op.op.xattr.name_len, name);
+
+ logger().debug("cmpxattr on obj={} for attr={}", os.oi.soid, name);
+ return getxattr(os.oi.soid, std::move(name)).safe_then_interruptible(
+ [&delta_stats, &osd_op] (auto &&xattr) -> cmp_xattr_ierrorator::future<> {
+ delta_stats.num_rd++;
+ delta_stats.num_rd_kb += shift_round_up(osd_op.op.xattr.value_len, 10);
+
+ int result = 0;
+ auto bp = osd_op.indata.cbegin();
+ bp += osd_op.op.xattr.name_len;
+
+ switch (osd_op.op.xattr.cmp_mode) {
+ case CEPH_OSD_CMPXATTR_MODE_STRING:
+ {
+ string lhs;
+ bp.copy(osd_op.op.xattr.value_len, lhs);
+ string_view rhs(xattr.c_str(), xattr.length());
+ result = do_cmp_xattr(osd_op.op.xattr.cmp_op, lhs, rhs);
+ logger().debug("cmpxattr lhs={}, rhs={}", lhs, rhs);
+ }
+ break;
+ case CEPH_OSD_CMPXATTR_MODE_U64:
+ {
+ uint64_t lhs;
+ try {
+ decode(lhs, bp);
+ } catch (ceph::buffer::error& e) {
+ logger().info("cmp_xattr: buffer error exception");
+ result = -EINVAL;
+ break;
+ }
+ result = do_xattr_cmp_u64(osd_op.op.xattr.cmp_op, lhs, xattr);
+ }
+ break;
+ default:
+ logger().info("bad cmp mode {}", osd_op.op.xattr.cmp_mode);
+ result = -EINVAL;
+ }
+ if (result == 0) {
+ logger().info("cmp_xattr: comparison returned false");
+ return crimson::ct_error::ecanceled::make();
+ } else if (result == -EINVAL) {
+ return crimson::ct_error::invarg::make();
+ } else {
+ osd_op.rval = 1;
+ return cmp_xattr_ierrorator::now();
+ }
+ }).handle_error_interruptible(
+ crimson::ct_error::enodata::handle([&delta_stats, &osd_op] ()
+ ->cmp_xattr_errorator::future<> {
+ delta_stats.num_rd++;
+ delta_stats.num_rd_kb += shift_round_up(osd_op.op.xattr.value_len, 10);
+ return crimson::ct_error::ecanceled::make();
+ }),
+ cmp_xattr_errorator::pass_further{}
+ );
+}
+
+PGBackend::rm_xattr_iertr::future<>
+PGBackend::rm_xattr(
+ ObjectState& os,
+ const OSDOp& osd_op,
+ ceph::os::Transaction& txn)
+{
+ if (!os.exists || os.oi.is_whiteout()) {
+ logger().debug("{}: {} DNE", __func__, os.oi.soid);
+ return crimson::ct_error::enoent::make();
+ }
+ auto bp = osd_op.indata.cbegin();
+ string attr_name{"_"};
+ bp.copy(osd_op.op.xattr.name_len, attr_name);
+ txn.rmattr(coll->get_cid(), ghobject_t{os.oi.soid}, attr_name);
+ return rm_xattr_iertr::now();
+}
+
+void PGBackend::clone(
+ /* const */object_info_t& snap_oi,
+ const ObjectState& os,
+ const ObjectState& d_os,
+ ceph::os::Transaction& txn)
+{
+ // See OpsExecuter::execute_clone documentation
+ txn.clone(coll->get_cid(), ghobject_t{os.oi.soid}, ghobject_t{d_os.oi.soid});
+ {
+ ceph::bufferlist bv;
+ snap_oi.encode_no_oid(bv, CEPH_FEATURES_ALL);
+ txn.setattr(coll->get_cid(), ghobject_t{d_os.oi.soid}, OI_ATTR, bv);
+ }
+ txn.rmattr(coll->get_cid(), ghobject_t{d_os.oi.soid}, SS_ATTR);
+}
+
+using get_omap_ertr =
+ crimson::os::FuturizedStore::Shard::read_errorator::extend<
+ crimson::ct_error::enodata>;
+using get_omap_iertr =
+ ::crimson::interruptible::interruptible_errorator<
+ ::crimson::osd::IOInterruptCondition,
+ get_omap_ertr>;
+static
+get_omap_iertr::future<
+ crimson::os::FuturizedStore::Shard::omap_values_t>
+maybe_get_omap_vals_by_keys(
+ crimson::os::FuturizedStore::Shard* store,
+ const crimson::os::CollectionRef& coll,
+ const object_info_t& oi,
+ const std::set<std::string>& keys_to_get)
+{
+ if (oi.is_omap()) {
+ return store->omap_get_values(coll, ghobject_t{oi.soid}, keys_to_get);
+ } else {
+ return crimson::ct_error::enodata::make();
+ }
+}
+
+static
+get_omap_iertr::future<
+ std::tuple<bool, crimson::os::FuturizedStore::Shard::omap_values_t>>
+maybe_get_omap_vals(
+ crimson::os::FuturizedStore::Shard* store,
+ const crimson::os::CollectionRef& coll,
+ const object_info_t& oi,
+ const std::string& start_after)
+{
+ if (oi.is_omap()) {
+ return store->omap_get_values(coll, ghobject_t{oi.soid}, start_after);
+ } else {
+ return crimson::ct_error::enodata::make();
+ }
+}
+
+PGBackend::ll_read_ierrorator::future<ceph::bufferlist>
+PGBackend::omap_get_header(
+ const crimson::os::CollectionRef& c,
+ const ghobject_t& oid) const
+{
+ return store->omap_get_header(c, oid)
+ .handle_error(
+ crimson::ct_error::enodata::handle([] {
+ return seastar::make_ready_future<bufferlist>();
+ }),
+ ll_read_errorator::pass_further{}
+ );
+}
+
+PGBackend::ll_read_ierrorator::future<>
+PGBackend::omap_get_header(
+ const ObjectState& os,
+ OSDOp& osd_op,
+ object_stat_sum_t& delta_stats) const
+{
+ if (os.oi.is_omap()) {
+ return omap_get_header(coll, ghobject_t{os.oi.soid}).safe_then_interruptible(
+ [&delta_stats, &osd_op] (ceph::bufferlist&& header) {
+ osd_op.outdata = std::move(header);
+ delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10);
+ delta_stats.num_rd++;
+ return seastar::now();
+ });
+ } else {
+ // no omap? return empty data but not ENOENT. This is important for
+ // the case when the object is being created due to may_write().
+ return seastar::now();
+ }
+}
+
+PGBackend::ll_read_ierrorator::future<>
+PGBackend::omap_get_keys(
+ const ObjectState& os,
+ OSDOp& osd_op,
+ object_stat_sum_t& delta_stats) const
+{
+ if (!os.exists || os.oi.is_whiteout()) {
+ logger().debug("{}: object does not exist: {}", __func__, os.oi.soid);
+ return crimson::ct_error::enoent::make();
+ }
+ std::string start_after;
+ uint64_t max_return;
+ try {
+ auto p = osd_op.indata.cbegin();
+ decode(start_after, p);
+ decode(max_return, p);
+ } catch (buffer::error&) {
+ throw crimson::osd::invalid_argument{};
+ }
+ max_return =
+ std::min(max_return, local_conf()->osd_max_omap_entries_per_request);
+
+
+ // TODO: truly chunk the reading
+ return maybe_get_omap_vals(store, coll, os.oi, start_after).safe_then_interruptible(
+ [=,&delta_stats, &osd_op](auto ret) {
+ ceph::bufferlist result;
+ bool truncated = false;
+ uint32_t num = 0;
+ for (auto &[key, val] : std::get<1>(ret)) {
+ if (num >= max_return ||
+ result.length() >= local_conf()->osd_max_omap_bytes_per_request) {
+ truncated = true;
+ break;
+ }
+ encode(key, result);
+ ++num;
+ }
+ encode(num, osd_op.outdata);
+ osd_op.outdata.claim_append(result);
+ encode(truncated, osd_op.outdata);
+ delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10);
+ delta_stats.num_rd++;
+ return seastar::now();
+ }).handle_error_interruptible(
+ crimson::ct_error::enodata::handle([&osd_op] {
+ uint32_t num = 0;
+ bool truncated = false;
+ encode(num, osd_op.outdata);
+ encode(truncated, osd_op.outdata);
+ osd_op.rval = 0;
+ return seastar::now();
+ }),
+ ll_read_errorator::pass_further{}
+ );
+}
+static
+PGBackend::omap_cmp_ertr::future<> do_omap_val_cmp(
+ std::map<std::string, bufferlist, std::less<>> out,
+ std::map<std::string, std::pair<bufferlist, int>> assertions)
+{
+ bufferlist empty;
+ for (const auto &[akey, avalue] : assertions) {
+ const auto [abl, aflag] = avalue;
+ auto out_entry = out.find(akey);
+ bufferlist &bl = (out_entry != out.end()) ? out_entry->second : empty;
+ switch (aflag) {
+ case CEPH_OSD_CMPXATTR_OP_EQ:
+ if (!(bl == abl)) {
+ return crimson::ct_error::ecanceled::make();
+ }
+ break;
+ case CEPH_OSD_CMPXATTR_OP_LT:
+ if (!(bl < abl)) {
+ return crimson::ct_error::ecanceled::make();
+ }
+ break;
+ case CEPH_OSD_CMPXATTR_OP_GT:
+ if (!(bl > abl)) {
+ return crimson::ct_error::ecanceled::make();
+ }
+ break;
+ default:
+ return crimson::ct_error::invarg::make();
+ }
+ }
+ return PGBackend::omap_cmp_ertr::now();
+}
+PGBackend::omap_cmp_iertr::future<>
+PGBackend::omap_cmp(
+ const ObjectState& os,
+ OSDOp& osd_op,
+ object_stat_sum_t& delta_stats) const
+{
+ if (!os.exists || os.oi.is_whiteout()) {
+ logger().debug("{}: object does not exist: {}", __func__, os.oi.soid);
+ return crimson::ct_error::enoent::make();
+ }
+
+ auto bp = osd_op.indata.cbegin();
+ std::map<std::string, std::pair<bufferlist, int> > assertions;
+ try {
+ decode(assertions, bp);
+ } catch (buffer::error&) {
+ return crimson::ct_error::invarg::make();
+ }
+
+ delta_stats.num_rd++;
+ if (os.oi.is_omap()) {
+ std::set<std::string> to_get;
+ for (auto &i: assertions) {
+ to_get.insert(i.first);
+ }
+ return store->omap_get_values(coll, ghobject_t{os.oi.soid}, to_get)
+ .safe_then([=, &osd_op] (auto&& out) -> omap_cmp_iertr::future<> {
+ osd_op.rval = 0;
+ return do_omap_val_cmp(out, assertions);
+ });
+ } else {
+ return crimson::ct_error::ecanceled::make();
+ }
+}
+PGBackend::ll_read_ierrorator::future<>
+PGBackend::omap_get_vals(
+ const ObjectState& os,
+ OSDOp& osd_op,
+ object_stat_sum_t& delta_stats) const
+{
+ if (!os.exists || os.oi.is_whiteout()) {
+ logger().debug("{}: object does not exist: {}", __func__, os.oi.soid);
+ return crimson::ct_error::enoent::make();
+ }
+ std::string start_after;
+ uint64_t max_return;
+ std::string filter_prefix;
+ try {
+ auto p = osd_op.indata.cbegin();
+ decode(start_after, p);
+ decode(max_return, p);
+ decode(filter_prefix, p);
+ } catch (buffer::error&) {
+ throw crimson::osd::invalid_argument{};
+ }
+
+ max_return =
+ std::min(max_return, local_conf()->osd_max_omap_entries_per_request);
+ delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10);
+ delta_stats.num_rd++;
+
+ // TODO: truly chunk the reading
+ return maybe_get_omap_vals(store, coll, os.oi, start_after)
+ .safe_then_interruptible(
+ [=, &osd_op] (auto&& ret) {
+ auto [done, vals] = std::move(ret);
+ assert(done);
+ ceph::bufferlist result;
+ bool truncated = false;
+ uint32_t num = 0;
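+ // begin at filter_prefix when it sorts after start_after; otherwise begin
+ // at the first returned key (the loop below stops at the first key that
+ // no longer matches filter_prefix)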
+ auto iter = filter_prefix > start_after ? vals.lower_bound(filter_prefix)
+ : std::begin(vals);
+ for (; iter != std::end(vals); ++iter) {
+ const auto& [key, value] = *iter;
+ if (key.substr(0, filter_prefix.size()) != filter_prefix) {
+ break;
+ } else if (num >= max_return ||
+ result.length() >= local_conf()->osd_max_omap_bytes_per_request) {
+ truncated = true;
+ break;
+ }
+ encode(key, result);
+ encode(value, result);
+ ++num;
+ }
+ encode(num, osd_op.outdata);
+ osd_op.outdata.claim_append(result);
+ encode(truncated, osd_op.outdata);
+ return ll_read_errorator::now();
+ }).handle_error_interruptible(
+ crimson::ct_error::enodata::handle([&osd_op] {
+ encode(uint32_t{0} /* num */, osd_op.outdata);
+ encode(bool{false} /* truncated */, osd_op.outdata);
+ osd_op.rval = 0;
+ return ll_read_errorator::now();
+ }),
+ ll_read_errorator::pass_further{}
+ );
+}
+
+PGBackend::ll_read_ierrorator::future<>
+PGBackend::omap_get_vals_by_keys(
+ const ObjectState& os,
+ OSDOp& osd_op,
+ object_stat_sum_t& delta_stats) const
+{
+ if (!os.exists || os.oi.is_whiteout()) {
+ logger().debug("{}: object does not exist: {}", __func__, os.oi.soid);
+ return crimson::ct_error::enoent::make();
+ }
+
+ std::set<std::string> keys_to_get;
+ try {
+ auto p = osd_op.indata.cbegin();
+ decode(keys_to_get, p);
+ } catch (buffer::error&) {
+ throw crimson::osd::invalid_argument();
+ }
+ delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10);
+ delta_stats.num_rd++;
+ return maybe_get_omap_vals_by_keys(store, coll, os.oi, keys_to_get)
+ .safe_then_interruptible(
+ [&osd_op] (crimson::os::FuturizedStore::Shard::omap_values_t&& vals) {
+ encode(vals, osd_op.outdata);
+ return ll_read_errorator::now();
+ }).handle_error_interruptible(
+ crimson::ct_error::enodata::handle([&osd_op] {
+ uint32_t num = 0;
+ encode(num, osd_op.outdata);
+ osd_op.rval = 0;
+ return ll_read_errorator::now();
+ }),
+ ll_read_errorator::pass_further{}
+ );
+}
+
+PGBackend::interruptible_future<>
+PGBackend::omap_set_vals(
+ ObjectState& os,
+ const OSDOp& osd_op,
+ ceph::os::Transaction& txn,
+ osd_op_params_t& osd_op_params,
+ object_stat_sum_t& delta_stats)
+{
+ maybe_create_new_object(os, txn, delta_stats);
+
+ ceph::bufferlist to_set_bl;
+ try {
+ auto p = osd_op.indata.cbegin();
+ decode_str_str_map_to_bl(p, &to_set_bl);
+ } catch (buffer::error&) {
+ throw crimson::osd::invalid_argument{};
+ }
+
+ txn.omap_setkeys(coll->get_cid(), ghobject_t{os.oi.soid}, to_set_bl);
+ osd_op_params.clean_regions.mark_omap_dirty();
+ delta_stats.num_wr++;
+ delta_stats.num_wr_kb += shift_round_up(to_set_bl.length(), 10);
+ os.oi.set_flag(object_info_t::FLAG_OMAP);
+ os.oi.clear_omap_digest();
+ return seastar::now();
+}
+
+PGBackend::interruptible_future<>
+PGBackend::omap_set_header(
+ ObjectState& os,
+ const OSDOp& osd_op,
+ ceph::os::Transaction& txn,
+ osd_op_params_t& osd_op_params,
+ object_stat_sum_t& delta_stats)
+{
+ maybe_create_new_object(os, txn, delta_stats);
+ txn.omap_setheader(coll->get_cid(), ghobject_t{os.oi.soid}, osd_op.indata);
+ osd_op_params.clean_regions.mark_omap_dirty();
+ delta_stats.num_wr++;
+ os.oi.set_flag(object_info_t::FLAG_OMAP);
+ os.oi.clear_omap_digest();
+ return seastar::now();
+}
+
+PGBackend::interruptible_future<> PGBackend::omap_remove_range(
+ ObjectState& os,
+ const OSDOp& osd_op,
+ ceph::os::Transaction& txn,
+ object_stat_sum_t& delta_stats)
+{
+ std::string key_begin, key_end;
+ try {
+ auto p = osd_op.indata.cbegin();
+ decode(key_begin, p);
+ decode(key_end, p);
+ } catch (buffer::error& e) {
+ throw crimson::osd::invalid_argument{};
+ }
+ txn.omap_rmkeyrange(coll->get_cid(), ghobject_t{os.oi.soid}, key_begin, key_end);
+ delta_stats.num_wr++;
+ os.oi.clear_omap_digest();
+ return seastar::now();
+}
+
+PGBackend::interruptible_future<> PGBackend::omap_remove_key(
+ ObjectState& os,
+ const OSDOp& osd_op,
+ ceph::os::Transaction& txn)
+{
+ ceph::bufferlist to_rm_bl;
+ try {
+ auto p = osd_op.indata.cbegin();
+ decode_str_set_to_bl(p, &to_rm_bl);
+ } catch (buffer::error& e) {
+ throw crimson::osd::invalid_argument{};
+ }
+ txn.omap_rmkeys(coll->get_cid(), ghobject_t{os.oi.soid}, to_rm_bl);
+ // TODO:
+ // ctx->clean_regions.mark_omap_dirty();
+ // ctx->delta_stats.num_wr++;
+ os.oi.clear_omap_digest();
+ return seastar::now();
+}
+
+PGBackend::omap_clear_iertr::future<>
+PGBackend::omap_clear(
+ ObjectState& os,
+ OSDOp& osd_op,
+ ceph::os::Transaction& txn,
+ osd_op_params_t& osd_op_params,
+ object_stat_sum_t& delta_stats)
+{
+ if (!os.exists || os.oi.is_whiteout()) {
+ logger().debug("{}: object does not exist: {}", os.oi.soid);
+ return crimson::ct_error::enoent::make();
+ }
+ if (!os.oi.is_omap()) {
+ return omap_clear_ertr::now();
+ }
+ txn.omap_clear(coll->get_cid(), ghobject_t{os.oi.soid});
+ osd_op_params.clean_regions.mark_omap_dirty();
+ delta_stats.num_wr++;
+ os.oi.clear_omap_digest();
+ os.oi.clear_flag(object_info_t::FLAG_OMAP);
+ return omap_clear_ertr::now();
+}
+
+PGBackend::interruptible_future<struct stat>
+PGBackend::stat(
+ CollectionRef c,
+ const ghobject_t& oid) const
+{
+ return store->stat(c, oid);
+}
+
+PGBackend::read_errorator::future<std::map<uint64_t, uint64_t>>
+PGBackend::fiemap(
+ CollectionRef c,
+ const ghobject_t& oid,
+ uint64_t off,
+ uint64_t len)
+{
+ return store->fiemap(c, oid, off, len);
+}
+
+PGBackend::write_iertr::future<> PGBackend::tmapput(
+ ObjectState& os,
+ const OSDOp& osd_op,
+ ceph::os::Transaction& txn,
+ object_stat_sum_t& delta_stats,
+ osd_op_params_t& osd_op_params)
+{
+ logger().debug("PGBackend::tmapput: {}", os.oi.soid);
+ auto ret = crimson::common::do_tmap_put(osd_op.indata.cbegin());
+ if (!ret.has_value()) {
+ logger().debug("PGBackend::tmapup: {}, ret={}", os.oi.soid, ret.error());
+ ceph_assert(ret.error() == -EINVAL);
+ return crimson::ct_error::invarg::make();
+ } else {
+ auto bl = std::move(ret.value());
+ return _writefull(
+ os,
+ bl.length(),
+ std::move(bl),
+ txn,
+ osd_op_params,
+ delta_stats,
+ 0);
+ }
+}
+
+PGBackend::tmapup_iertr::future<> PGBackend::tmapup(
+ ObjectState& os,
+ const OSDOp& osd_op,
+ ceph::os::Transaction& txn,
+ object_stat_sum_t& delta_stats,
+ osd_op_params_t& osd_op_params)
+{
+ logger().debug("PGBackend::tmapup: {}", os.oi.soid);
+ return PGBackend::write_iertr::now(
+ ).si_then([this, &os] {
+ return _read(os.oi.soid, 0, os.oi.size, 0);
+ }).handle_error_interruptible(
+ crimson::ct_error::enoent::handle([](auto &) {
+ return seastar::make_ready_future<bufferlist>();
+ }),
+ PGBackend::write_iertr::pass_further{},
+ crimson::ct_error::assert_all{"read error in mutate_object_contents"}
+ ).si_then([this, &os, &osd_op, &txn,
+ &delta_stats, &osd_op_params]
+ (auto &&bl) mutable -> PGBackend::tmapup_iertr::future<> {
+ auto result = crimson::common::do_tmap_up(
+ osd_op.indata.cbegin(),
+ std::move(bl));
+ if (!result.has_value()) {
+ int ret = result.error();
+ logger().debug("PGBackend::tmapup: {}, ret={}", os.oi.soid, ret);
+ switch (ret) {
+ case -EEXIST:
+ return crimson::ct_error::eexist::make();
+ case -ENOENT:
+ return crimson::ct_error::enoent::make();
+ case -EINVAL:
+ return crimson::ct_error::invarg::make();
+ default:
+ ceph_assert(0 == "impossible error");
+ return crimson::ct_error::invarg::make();
+ }
+ }
+
+ logger().debug(
+ "PGBackend::tmapup: {}, result.value.length()={}, ret=0",
+ os.oi.soid, result.value().length());
+ return _writefull(
+ os,
+ result.value().length(),
+ result.value(),
+ txn,
+ osd_op_params,
+ delta_stats,
+ 0);
+ });
+}
+
+PGBackend::read_ierrorator::future<> PGBackend::tmapget(
+ const ObjectState& os,
+ OSDOp& osd_op,
+ object_stat_sum_t& delta_stats)
+{
+ logger().debug("PGBackend::tmapget: {}", os.oi.soid);
+ const auto& oi = os.oi;
+ logger().debug("PGBackend::tmapget: read {} 0~{}", oi.soid, oi.size);
+ if (!os.exists || os.oi.is_whiteout()) {
+ logger().debug("PGBackend::tmapget: {} DNE", os.oi.soid);
+ return crimson::ct_error::enoent::make();
+ }
+
+ return _read(oi.soid, 0, oi.size, 0).safe_then_interruptible_tuple(
+ [&delta_stats, &osd_op](auto&& bl) -> read_errorator::future<> {
+ logger().debug("PGBackend::tmapget: data length: {}", bl.length());
+ osd_op.op.extent.length = bl.length();
+ osd_op.rval = 0;
+ delta_stats.num_rd++;
+ delta_stats.num_rd_kb += shift_round_up(bl.length(), 10);
+ osd_op.outdata = std::move(bl);
+ return read_errorator::now();
+ }, crimson::ct_error::input_output_error::handle([] {
+ return read_errorator::future<>{crimson::ct_error::object_corrupted::make()};
+ }),
+ read_errorator::pass_further{});
+}
+
diff --git a/src/crimson/osd/pg_backend.h b/src/crimson/osd/pg_backend.h
new file mode 100644
index 000000000..fbad37d4c
--- /dev/null
+++ b/src/crimson/osd/pg_backend.h
@@ -0,0 +1,448 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <map>
+#include <memory>
+#include <string>
+#include <boost/container/flat_set.hpp>
+
+#include "include/rados.h"
+
+#include "crimson/os/futurized_store.h"
+#include "crimson/os/futurized_collection.h"
+#include "crimson/osd/acked_peers.h"
+#include "crimson/common/shared_lru.h"
+#include "messages/MOSDOp.h"
+#include "messages/MOSDOpReply.h"
+#include "os/Transaction.h"
+#include "osd/osd_types.h"
+#include "crimson/osd/object_context.h"
+#include "crimson/osd/osd_operation.h"
+#include "crimson/osd/osd_operations/osdop_params.h"
+
+struct hobject_t;
+
+namespace ceph::os {
+ class Transaction;
+}
+
+namespace crimson::osd {
+ class ShardServices;
+ class PG;
+ class ObjectContextLoader;
+}
+
+class PGBackend
+{
+protected:
+ using CollectionRef = crimson::os::CollectionRef;
+ using ec_profile_t = std::map<std::string, std::string>;
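+ // Error-handling convention (a reader's note): *_ertr aliases are plain
+ // crimson::errorators enumerating the possible errors, while the matching
+ // *_iertr aliases wrap them in interruptible_errorator so the returned
+ // futures can also be interrupted on PG interval change or OSD shutdown.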
+ // low-level read errorator
+ using ll_read_errorator = crimson::os::FuturizedStore::Shard::read_errorator;
+ using ll_read_ierrorator =
+ ::crimson::interruptible::interruptible_errorator<
+ ::crimson::osd::IOInterruptCondition,
+ ll_read_errorator>;
+
+public:
+ using load_metadata_ertr = crimson::errorator<
+ crimson::ct_error::object_corrupted>;
+ using load_metadata_iertr =
+ ::crimson::interruptible::interruptible_errorator<
+ ::crimson::osd::IOInterruptCondition,
+ load_metadata_ertr>;
+ using interruptor =
+ ::crimson::interruptible::interruptor<
+ ::crimson::osd::IOInterruptCondition>;
+ template <typename T = void>
+ using interruptible_future =
+ ::crimson::interruptible::interruptible_future<
+ ::crimson::osd::IOInterruptCondition, T>;
+ using rep_op_fut_t =
+ std::tuple<interruptible_future<>,
+ interruptible_future<crimson::osd::acked_peers_t>>;
+ PGBackend(shard_id_t shard, CollectionRef coll,
+ crimson::osd::ShardServices &shard_services,
+ DoutPrefixProvider &dpp);
+ virtual ~PGBackend() = default;
+ static std::unique_ptr<PGBackend> create(pg_t pgid,
+ const pg_shard_t pg_shard,
+ const pg_pool_t& pool,
+ crimson::os::CollectionRef coll,
+ crimson::osd::ShardServices& shard_services,
+ const ec_profile_t& ec_profile,
+ DoutPrefixProvider &dpp);
+ using attrs_t =
+ std::map<std::string, ceph::bufferptr, std::less<>>;
+ using read_errorator = ll_read_errorator::extend<
+ crimson::ct_error::object_corrupted>;
+ using read_ierrorator =
+ ::crimson::interruptible::interruptible_errorator<
+ ::crimson::osd::IOInterruptCondition,
+ read_errorator>;
+ read_ierrorator::future<> read(
+ const ObjectState& os,
+ OSDOp& osd_op,
+ object_stat_sum_t& delta_stats);
+ read_ierrorator::future<> sparse_read(
+ const ObjectState& os,
+ OSDOp& osd_op,
+ object_stat_sum_t& delta_stats);
+ using checksum_errorator = ll_read_errorator::extend<
+ crimson::ct_error::object_corrupted,
+ crimson::ct_error::invarg>;
+ using checksum_ierrorator =
+ ::crimson::interruptible::interruptible_errorator<
+ ::crimson::osd::IOInterruptCondition,
+ checksum_errorator>;
+ checksum_ierrorator::future<> checksum(
+ const ObjectState& os,
+ OSDOp& osd_op);
+ using cmp_ext_errorator = ll_read_errorator::extend<
+ crimson::ct_error::invarg,
+ crimson::ct_error::cmp_fail>;
+ using cmp_ext_ierrorator =
+ ::crimson::interruptible::interruptible_errorator<
+ ::crimson::osd::IOInterruptCondition,
+ cmp_ext_errorator>;
+ cmp_ext_ierrorator::future<> cmp_ext(
+ const ObjectState& os,
+ OSDOp& osd_op);
+ using stat_errorator = crimson::errorator<crimson::ct_error::enoent>;
+ using stat_ierrorator =
+ ::crimson::interruptible::interruptible_errorator<
+ ::crimson::osd::IOInterruptCondition,
+ stat_errorator>;
+ stat_ierrorator::future<> stat(
+ const ObjectState& os,
+ OSDOp& osd_op,
+ object_stat_sum_t& delta_stats);
+
+ // TODO: switch the entire write family to errorator.
+ using write_ertr = crimson::errorator<
+ crimson::ct_error::file_too_large,
+ crimson::ct_error::invarg>;
+ using write_iertr =
+ ::crimson::interruptible::interruptible_errorator<
+ ::crimson::osd::IOInterruptCondition,
+ write_ertr>;
+ using create_ertr = crimson::errorator<
+ crimson::ct_error::invarg,
+ crimson::ct_error::eexist>;
+ using create_iertr =
+ ::crimson::interruptible::interruptible_errorator<
+ ::crimson::osd::IOInterruptCondition,
+ create_ertr>;
+ create_iertr::future<> create(
+ ObjectState& os,
+ const OSDOp& osd_op,
+ ceph::os::Transaction& trans,
+ object_stat_sum_t& delta_stats);
+ using remove_ertr = crimson::errorator<
+ crimson::ct_error::enoent>;
+ using remove_iertr =
+ ::crimson::interruptible::interruptible_errorator<
+ ::crimson::osd::IOInterruptCondition,
+ remove_ertr>;
+ remove_iertr::future<> remove(
+ ObjectState& os,
+ ceph::os::Transaction& txn,
+ object_stat_sum_t& delta_stats,
+ bool whiteout);
+ interruptible_future<> remove(
+ ObjectState& os,
+ ceph::os::Transaction& txn);
+ interruptible_future<> set_allochint(
+ ObjectState& os,
+ const OSDOp& osd_op,
+ ceph::os::Transaction& trans,
+ object_stat_sum_t& delta_stats);
+ write_iertr::future<> write(
+ ObjectState& os,
+ const OSDOp& osd_op,
+ ceph::os::Transaction& trans,
+ osd_op_params_t& osd_op_params,
+ object_stat_sum_t& delta_stats);
+ interruptible_future<> write_same(
+ ObjectState& os,
+ const OSDOp& osd_op,
+ ceph::os::Transaction& trans,
+ osd_op_params_t& osd_op_params,
+ object_stat_sum_t& delta_stats);
+ write_iertr::future<> writefull(
+ ObjectState& os,
+ const OSDOp& osd_op,
+ ceph::os::Transaction& trans,
+ osd_op_params_t& osd_op_params,
+ object_stat_sum_t& delta_stats);
+ using append_errorator = crimson::errorator<
+ crimson::ct_error::invarg>;
+ using append_ierrorator =
+ ::crimson::interruptible::interruptible_errorator<
+ ::crimson::osd::IOInterruptCondition,
+ append_errorator>;
+ append_ierrorator::future<> append(
+ ObjectState& os,
+ OSDOp& osd_op,
+ ceph::os::Transaction& trans,
+ osd_op_params_t& osd_op_params,
+ object_stat_sum_t& delta_stats);
+ using rollback_ertr = crimson::errorator<
+ crimson::ct_error::enoent>;
+ using rollback_iertr =
+ ::crimson::interruptible::interruptible_errorator<
+ ::crimson::osd::IOInterruptCondition,
+ rollback_ertr>;
+ rollback_iertr::future<> rollback(
+ ObjectState& os,
+ const OSDOp& osd_op,
+ ceph::os::Transaction& txn,
+ osd_op_params_t& osd_op_params,
+ object_stat_sum_t& delta_stats,
+ crimson::osd::ObjectContextRef head,
+ crimson::osd::ObjectContextLoader& obc_loader);
+ write_iertr::future<> truncate(
+ ObjectState& os,
+ const OSDOp& osd_op,
+ ceph::os::Transaction& trans,
+ osd_op_params_t& osd_op_params,
+ object_stat_sum_t& delta_stats);
+ write_iertr::future<> zero(
+ ObjectState& os,
+ const OSDOp& osd_op,
+ ceph::os::Transaction& trans,
+ osd_op_params_t& osd_op_params,
+ object_stat_sum_t& delta_stats);
+ rep_op_fut_t mutate_object(
+ std::set<pg_shard_t> pg_shards,
+ crimson::osd::ObjectContextRef &&obc,
+ ceph::os::Transaction&& txn,
+ osd_op_params_t&& osd_op_p,
+ epoch_t min_epoch,
+ epoch_t map_epoch,
+ std::vector<pg_log_entry_t>&& log_entries);
+ interruptible_future<std::tuple<std::vector<hobject_t>, hobject_t>> list_objects(
+ const hobject_t& start,
+ uint64_t limit) const;
+ using setxattr_errorator = crimson::errorator<
+ crimson::ct_error::file_too_large,
+ crimson::ct_error::enametoolong>;
+ using setxattr_ierrorator =
+ ::crimson::interruptible::interruptible_errorator<
+ ::crimson::osd::IOInterruptCondition,
+ setxattr_errorator>;
+ setxattr_ierrorator::future<> setxattr(
+ ObjectState& os,
+ const OSDOp& osd_op,
+ ceph::os::Transaction& trans,
+ object_stat_sum_t& delta_stats);
+ using get_attr_errorator = crimson::os::FuturizedStore::Shard::get_attr_errorator;
+ using get_attr_ierrorator =
+ ::crimson::interruptible::interruptible_errorator<
+ ::crimson::osd::IOInterruptCondition,
+ get_attr_errorator>;
+ get_attr_ierrorator::future<> getxattr(
+ const ObjectState& os,
+ OSDOp& osd_op,
+ object_stat_sum_t& delta_stats) const;
+ get_attr_ierrorator::future<ceph::bufferlist> getxattr(
+ const hobject_t& soid,
+ std::string_view key) const;
+ get_attr_ierrorator::future<ceph::bufferlist> getxattr(
+ const hobject_t& soid,
+ std::string&& key) const;
+ get_attr_ierrorator::future<> get_xattrs(
+ const ObjectState& os,
+ OSDOp& osd_op,
+ object_stat_sum_t& delta_stats) const;
+ using cmp_xattr_errorator = get_attr_errorator::extend<
+ crimson::ct_error::ecanceled,
+ crimson::ct_error::invarg>;
+ using cmp_xattr_ierrorator =
+ ::crimson::interruptible::interruptible_errorator<
+ ::crimson::osd::IOInterruptCondition,
+ cmp_xattr_errorator>;
+ cmp_xattr_ierrorator::future<> cmp_xattr(
+ const ObjectState& os,
+ OSDOp& osd_op,
+ object_stat_sum_t& delta_stats) const;
+ using rm_xattr_ertr = crimson::errorator<crimson::ct_error::enoent>;
+ using rm_xattr_iertr =
+ ::crimson::interruptible::interruptible_errorator<
+ ::crimson::osd::IOInterruptCondition,
+ rm_xattr_ertr>;
+ rm_xattr_iertr::future<> rm_xattr(
+ ObjectState& os,
+ const OSDOp& osd_op,
+ ceph::os::Transaction& trans);
+ void clone(
+ /* const */object_info_t& snap_oi,
+ const ObjectState& os,
+ const ObjectState& d_os,
+ ceph::os::Transaction& trans);
+ interruptible_future<struct stat> stat(
+ CollectionRef c,
+ const ghobject_t& oid) const;
+ read_errorator::future<std::map<uint64_t, uint64_t>> fiemap(
+ CollectionRef c,
+ const ghobject_t& oid,
+ uint64_t off,
+ uint64_t len);
+
+ write_iertr::future<> tmapput(
+ ObjectState& os,
+ const OSDOp& osd_op,
+ ceph::os::Transaction& trans,
+ object_stat_sum_t& delta_stats,
+ osd_op_params_t& osd_op_params);
+
+ using tmapup_ertr = write_ertr::extend<
+ crimson::ct_error::enoent,
+ crimson::ct_error::eexist>;
+ using tmapup_iertr = ::crimson::interruptible::interruptible_errorator<
+ ::crimson::osd::IOInterruptCondition,
+ tmapup_ertr>;
+ tmapup_iertr::future<> tmapup(
+ ObjectState& os,
+ const OSDOp& osd_op,
+ ceph::os::Transaction& trans,
+ object_stat_sum_t& delta_stats,
+ osd_op_params_t& osd_op_params);
+
+ read_ierrorator::future<> tmapget(
+ const ObjectState& os,
+ OSDOp& osd_op,
+ object_stat_sum_t& delta_stats);
+
+ // OMAP
+ ll_read_ierrorator::future<> omap_get_keys(
+ const ObjectState& os,
+ OSDOp& osd_op,
+ object_stat_sum_t& delta_stats) const;
+ using omap_cmp_ertr =
+ crimson::os::FuturizedStore::Shard::read_errorator::extend<
+ crimson::ct_error::ecanceled,
+ crimson::ct_error::invarg>;
+ using omap_cmp_iertr =
+ ::crimson::interruptible::interruptible_errorator<
+ ::crimson::osd::IOInterruptCondition,
+ omap_cmp_ertr>;
+ omap_cmp_iertr::future<> omap_cmp(
+ const ObjectState& os,
+ OSDOp& osd_op,
+ object_stat_sum_t& delta_stats) const;
+ ll_read_ierrorator::future<> omap_get_vals(
+ const ObjectState& os,
+ OSDOp& osd_op,
+ object_stat_sum_t& delta_stats) const;
+ ll_read_ierrorator::future<> omap_get_vals_by_keys(
+ const ObjectState& os,
+ OSDOp& osd_op,
+ object_stat_sum_t& delta_stats) const;
+ interruptible_future<> omap_set_vals(
+ ObjectState& os,
+ const OSDOp& osd_op,
+ ceph::os::Transaction& trans,
+ osd_op_params_t& osd_op_params,
+ object_stat_sum_t& delta_stats);
+ ll_read_ierrorator::future<ceph::bufferlist> omap_get_header(
+ const crimson::os::CollectionRef& c,
+ const ghobject_t& oid) const;
+ ll_read_ierrorator::future<> omap_get_header(
+ const ObjectState& os,
+ OSDOp& osd_op,
+ object_stat_sum_t& delta_stats) const;
+ interruptible_future<> omap_set_header(
+ ObjectState& os,
+ const OSDOp& osd_op,
+ ceph::os::Transaction& trans,
+ osd_op_params_t& osd_op_params,
+ object_stat_sum_t& delta_stats);
+ interruptible_future<> omap_remove_range(
+ ObjectState& os,
+ const OSDOp& osd_op,
+ ceph::os::Transaction& trans,
+ object_stat_sum_t& delta_stats);
+ interruptible_future<> omap_remove_key(
+ ObjectState& os,
+ const OSDOp& osd_op,
+ ceph::os::Transaction& trans);
+ using omap_clear_ertr = crimson::errorator<crimson::ct_error::enoent>;
+ using omap_clear_iertr =
+ ::crimson::interruptible::interruptible_errorator<
+ ::crimson::osd::IOInterruptCondition,
+ omap_clear_ertr>;
+ omap_clear_iertr::future<> omap_clear(
+ ObjectState& os,
+ OSDOp& osd_op,
+ ceph::os::Transaction& trans,
+ osd_op_params_t& osd_op_params,
+ object_stat_sum_t& delta_stats);
+
+ virtual void got_rep_op_reply(const MOSDRepOpReply&) {}
+ virtual seastar::future<> stop() = 0;
+ virtual void on_actingset_changed(bool same_primary) = 0;
+protected:
+ const shard_id_t shard;
+ CollectionRef coll;
+ crimson::osd::ShardServices &shard_services;
+ DoutPrefixProvider &dpp; ///< provides log prefix context
+ crimson::os::FuturizedStore::Shard* store;
+ virtual seastar::future<> request_committed(
+ const osd_reqid_t& reqid,
+ const eversion_t& at_version) = 0;
+public:
+ struct loaded_object_md_t {
+ ObjectState os;
+ crimson::osd::SnapSetContextRef ssc;
+ using ref = std::unique_ptr<loaded_object_md_t>;
+ };
+ load_metadata_iertr::future<loaded_object_md_t::ref>
+ load_metadata(
+ const hobject_t &oid);
+
+private:
+ virtual ll_read_ierrorator::future<ceph::bufferlist> _read(
+ const hobject_t& hoid,
+ size_t offset,
+ size_t length,
+ uint32_t flags) = 0;
+ write_iertr::future<> _writefull(
+ ObjectState& os,
+ off_t truncate_size,
+ const bufferlist& bl,
+ ceph::os::Transaction& txn,
+ osd_op_params_t& osd_op_params,
+ object_stat_sum_t& delta_stats,
+ unsigned flags);
+ write_iertr::future<> _truncate(
+ ObjectState& os,
+ ceph::os::Transaction& txn,
+ osd_op_params_t& osd_op_params,
+ object_stat_sum_t& delta_stats,
+ size_t offset,
+ size_t truncate_size,
+ uint32_t truncate_seq);
+
+ bool maybe_create_new_object(ObjectState& os,
+ ceph::os::Transaction& txn,
+ object_stat_sum_t& delta_stats);
+ void update_size_and_usage(object_stat_sum_t& delta_stats,
+ object_info_t& oi, uint64_t offset,
+ uint64_t length, bool write_full = false);
+ void truncate_update_size_and_usage(
+ object_stat_sum_t& delta_stats,
+ object_info_t& oi,
+ uint64_t truncate_size);
+ virtual rep_op_fut_t
+ _submit_transaction(std::set<pg_shard_t>&& pg_shards,
+ const hobject_t& hoid,
+ ceph::os::Transaction&& txn,
+ osd_op_params_t&& osd_op_p,
+ epoch_t min_epoch, epoch_t max_epoch,
+ std::vector<pg_log_entry_t>&& log_entries) = 0;
+ friend class ReplicatedRecoveryBackend;
+ friend class ::crimson::osd::PG;
+};
diff --git a/src/crimson/osd/pg_interval_interrupt_condition.cc b/src/crimson/osd/pg_interval_interrupt_condition.cc
new file mode 100644
index 000000000..36243b825
--- /dev/null
+++ b/src/crimson/osd/pg_interval_interrupt_condition.cc
@@ -0,0 +1,43 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "pg_interval_interrupt_condition.h"
+#include "pg.h"
+
+#include "crimson/common/log.h"
+
+SET_SUBSYS(osd);
+
+namespace crimson::osd {
+
+IOInterruptCondition::IOInterruptCondition(Ref<PG>& pg)
+ : pg(pg), e(pg->get_osdmap_epoch()) {}
+
+IOInterruptCondition::~IOInterruptCondition() {
+ // defined out of line for the sake of forward declaring PG (which is a
+ // derivative of intrusive_ref_counter<...>)
+}
+
+bool IOInterruptCondition::new_interval_created() {
+ LOG_PREFIX(IOInterruptCondition::new_interval_created);
+ const epoch_t interval_start = pg->get_interval_start_epoch();
+ bool ret = e < interval_start;
+ if (ret) {
+ DEBUGDPP("stored interval e{} < interval_start e{}", *pg, e, interval_start);
+ }
+ return ret;
+}
+
+bool IOInterruptCondition::is_stopping() {
+ LOG_PREFIX(IOInterruptCondition::is_stopping);
+ if (pg->stopping) {
+ DEBUGDPP("pg stopping", *pg);
+ }
+ return pg->stopping;
+}
+
+bool IOInterruptCondition::is_primary() {
+ return pg->is_primary();
+}
+
+} // namespace crimson::osd
diff --git a/src/crimson/osd/pg_interval_interrupt_condition.h b/src/crimson/osd/pg_interval_interrupt_condition.h
new file mode 100644
index 000000000..a3a0a1edb
--- /dev/null
+++ b/src/crimson/osd/pg_interval_interrupt_condition.h
@@ -0,0 +1,56 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab expandtab
+
+#pragma once
+
+#include "include/types.h"
+#include "crimson/common/errorator.h"
+#include "crimson/common/exception.h"
+#include "crimson/common/type_helpers.h"
+
+namespace crimson::osd {
+
+class PG;
+
+class IOInterruptCondition {
+public:
+ IOInterruptCondition(Ref<PG>& pg);
+ ~IOInterruptCondition();
+
+ bool new_interval_created();
+
+ bool is_stopping();
+
+ bool is_primary();
+
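+ // Consulted by the interruption machinery before resuming a continuation:
+ // returns an exceptional future if the PG has entered a new interval or the
+ // OSD is stopping, and std::nullopt to let the continuation run.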
+ template <typename Fut>
+ std::optional<Fut> may_interrupt() {
+ if (new_interval_created()) {
+ return seastar::futurize<Fut>::make_exception_future(
+ ::crimson::common::actingset_changed(is_primary()));
+ }
+ if (is_stopping()) {
+ return seastar::futurize<Fut>::make_exception_future(
+ ::crimson::common::system_shutdown_exception());
+ }
+ return std::optional<Fut>();
+ }
+
+ template <typename T>
+ static constexpr bool is_interruption_v =
+ std::is_same_v<T, ::crimson::common::actingset_changed>
+ || std::is_same_v<T, ::crimson::common::system_shutdown_exception>;
+
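+ // compares the in-flight exception's dynamic type without rethrowing it;
+ // relies on the non-standard std::exception_ptr::__cxa_exception_type()
+ // extension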
+ static bool is_interruption(std::exception_ptr& eptr) {
+ return (*eptr.__cxa_exception_type() ==
+ typeid(::crimson::common::actingset_changed) ||
+ *eptr.__cxa_exception_type() ==
+ typeid(::crimson::common::system_shutdown_exception));
+ }
+
+private:
+ Ref<PG> pg;
+ epoch_t e;
+};
+
+} // namespace crimson::osd
diff --git a/src/crimson/osd/pg_map.cc b/src/crimson/osd/pg_map.cc
new file mode 100644
index 000000000..193781250
--- /dev/null
+++ b/src/crimson/osd/pg_map.cc
@@ -0,0 +1,102 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "crimson/osd/pg_map.h"
+
+#include "crimson/osd/pg.h"
+#include "common/Formatter.h"
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_osd);
+ }
+}
+
+using std::make_pair;
+
+namespace crimson::osd {
+
+PGMap::PGCreationState::PGCreationState(spg_t pgid) : pgid(pgid) {}
+PGMap::PGCreationState::~PGCreationState() {}
+
+void PGMap::PGCreationState::dump_detail(Formatter *f) const
+{
+ f->dump_stream("pgid") << pgid;
+ f->dump_bool("creating", creating);
+}
+
+PGMap::wait_for_pg_ret
+PGMap::wait_for_pg(PGCreationBlockingEvent::TriggerI&& trigger, spg_t pgid)
+{
+ if (auto pg = get_pg(pgid)) {
+ return make_pair(
+ wait_for_pg_fut(wait_for_pg_ertr::ready_future_marker{}, pg),
+ true);
+ } else {
+ auto &state = pgs_creating.emplace(pgid, pgid).first->second;
+ return make_pair(
+ wait_for_pg_fut(
+ trigger.maybe_record_blocking(state.promise.get_shared_future(), state)
+ ), state.creating);
+ }
+}
+
+void PGMap::remove_pg(spg_t pgid) {
+ ceph_assert(pgs.erase(pgid) == 1);
+}
+
+Ref<PG> PGMap::get_pg(spg_t pgid)
+{
+ if (auto pg = pgs.find(pgid); pg != pgs.end()) {
+ return pg->second;
+ } else {
+ return nullptr;
+ }
+}
+
+void PGMap::set_creating(spg_t pgid)
+{
+ logger().debug("Creating {}", pgid);
+ ceph_assert(pgs.count(pgid) == 0);
+ auto pg = pgs_creating.find(pgid);
+ ceph_assert(pg != pgs_creating.end());
+ ceph_assert(pg->second.creating == false);
+ pg->second.creating = true;
+}
+
+void PGMap::pg_created(spg_t pgid, Ref<PG> pg)
+{
+ logger().debug("Created {}", pgid);
+ ceph_assert(!pgs.count(pgid));
+ pgs.emplace(pgid, pg);
+
+ auto creating_iter = pgs_creating.find(pgid);
+ ceph_assert(creating_iter != pgs_creating.end());
+ auto promise = std::move(creating_iter->second.promise);
+ pgs_creating.erase(creating_iter);
+ promise.set_value(pg);
+}
+
+void PGMap::pg_loaded(spg_t pgid, Ref<PG> pg)
+{
+ ceph_assert(!pgs.count(pgid));
+ pgs.emplace(pgid, pg);
+}
+
+void PGMap::pg_creation_canceled(spg_t pgid)
+{
+ logger().debug("PGMap::pg_creation_canceled: {}", pgid);
+ ceph_assert(!pgs.count(pgid));
+
+ auto creating_iter = pgs_creating.find(pgid);
+ ceph_assert(creating_iter != pgs_creating.end());
+ auto promise = std::move(creating_iter->second.promise);
+ pgs_creating.erase(creating_iter);
+ promise.set_exception(
+ crimson::ct_error::ecanceled::exception_ptr()
+ );
+}
+
+PGMap::~PGMap() {}
+
+}
diff --git a/src/crimson/osd/pg_map.h b/src/crimson/osd/pg_map.h
new file mode 100644
index 000000000..3269de434
--- /dev/null
+++ b/src/crimson/osd/pg_map.h
@@ -0,0 +1,201 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <map>
+#include <algorithm>
+
+#include <seastar/core/future.hh>
+#include <seastar/core/shared_future.hh>
+
+#include "include/types.h"
+#include "crimson/common/type_helpers.h"
+#include "crimson/common/smp_helpers.h"
+#include "crimson/osd/osd_operation.h"
+#include "osd/osd_types.h"
+
+namespace crimson::osd {
+class PG;
+
+/**
+ * PGShardMapping
+ *
+ * Maintains a mapping from spg_t to the core containing that PG. Internally, each
+ * core has a local copy of the mapping to enable core-local lookups. Updates
+ * are proxied to core 0 and then propagated back out to all other cores -- see maybe_create_pg.
+ */
+class PGShardMapping : public seastar::peering_sharded_service<PGShardMapping> {
+public:
+ /// Returns mapping if present, NULL_CORE otherwise
+ core_id_t get_pg_mapping(spg_t pgid) {
+ auto iter = pg_to_core.find(pgid);
+ ceph_assert_always(iter == pg_to_core.end() || iter->second != NULL_CORE);
+ return iter == pg_to_core.end() ? NULL_CORE : iter->second;
+ }
+
+ /// Returns mapping for pgid, creates new one if it doesn't already exist
+ seastar::future<core_id_t> maybe_create_pg(
+ spg_t pgid,
+ core_id_t core = NULL_CORE) {
+ auto find_iter = pg_to_core.find(pgid);
+ if (find_iter != pg_to_core.end()) {
+ ceph_assert_always(find_iter->second != NULL_CORE);
+ if (core != NULL_CORE) {
+ ceph_assert_always(find_iter->second == core);
+ }
+ return seastar::make_ready_future<core_id_t>(find_iter->second);
+ } else {
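+ // not mapped yet: proxy to shard 0, which owns the authoritative
+ // per-core PG counters, picks the least-loaded core (unless one was
+ // requested) and fans the new mapping out to every other shard before
+ // we re-read our local copy below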
+ return container().invoke_on(0,[pgid, core]
+ (auto &primary_mapping) {
+ auto [insert_iter, inserted] = primary_mapping.pg_to_core.emplace(pgid, core);
+ ceph_assert_always(inserted);
+ ceph_assert_always(primary_mapping.core_to_num_pgs.size() > 0);
+ std::map<core_id_t, unsigned>::iterator core_iter;
+ if (core == NULL_CORE) {
+ core_iter = std::min_element(
+ primary_mapping.core_to_num_pgs.begin(),
+ primary_mapping.core_to_num_pgs.end(),
+ [](const auto &left, const auto &right) {
+ return left.second < right.second;
+ });
+ } else {
+ core_iter = primary_mapping.core_to_num_pgs.find(core);
+ }
+ ceph_assert_always(primary_mapping.core_to_num_pgs.end() != core_iter);
+ insert_iter->second = core_iter->first;
+ core_iter->second++;
+ return primary_mapping.container().invoke_on_others(
+ [pgid = insert_iter->first, core = insert_iter->second]
+ (auto &other_mapping) {
+ ceph_assert_always(core != NULL_CORE);
+ auto [insert_iter, inserted] = other_mapping.pg_to_core.emplace(pgid, core);
+ ceph_assert_always(inserted);
+ });
+ }).then([this, pgid] {
+ auto find_iter = pg_to_core.find(pgid);
+ return seastar::make_ready_future<core_id_t>(find_iter->second);
+ });
+ }
+ }
+
+ /// Remove pgid
+ seastar::future<> remove_pg(spg_t pgid) {
+ return container().invoke_on(0, [pgid](auto &primary_mapping) {
+ auto iter = primary_mapping.pg_to_core.find(pgid);
+ ceph_assert_always(iter != primary_mapping.pg_to_core.end());
+ ceph_assert_always(iter->second != NULL_CORE);
+ auto count_iter = primary_mapping.core_to_num_pgs.find(iter->second);
+ ceph_assert_always(count_iter != primary_mapping.core_to_num_pgs.end());
+ ceph_assert_always(count_iter->second > 0);
+ --(count_iter->second);
+ primary_mapping.pg_to_core.erase(iter);
+ return primary_mapping.container().invoke_on_others(
+ [pgid](auto &other_mapping) {
+ auto iter = other_mapping.pg_to_core.find(pgid);
+ ceph_assert_always(iter != other_mapping.pg_to_core.end());
+ ceph_assert_always(iter->second != NULL_CORE);
+ other_mapping.pg_to_core.erase(iter);
+ });
+ });
+ }
+
+ size_t get_num_pgs() const { return pg_to_core.size(); }
+
+ /// Map to cores in [min_core_mapping, core_mapping_limit)
+ PGShardMapping(core_id_t min_core_mapping, core_id_t core_mapping_limit) {
+ ceph_assert_always(min_core_mapping < core_mapping_limit);
+ for (auto i = min_core_mapping; i != core_mapping_limit; ++i) {
+ core_to_num_pgs.emplace(i, 0);
+ }
+ }
+
+ template <typename F>
+ void for_each_pgid(F &&f) const {
+ for (const auto &i: pg_to_core) {
+ std::invoke(f, i.first);
+ }
+ }
+
+private:
+ std::map<core_id_t, unsigned> core_to_num_pgs;
+ std::map<spg_t, core_id_t> pg_to_core;
+};
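+
+// Illustrative usage only (instantiation normally happens in the OSD startup
+// path; the seastar::sharded<> wiring and core bounds shown here are
+// assumptions, not part of this header):
+//   seastar::sharded<PGShardMapping> mapping;
+//   co_await mapping.start(/*min_core_mapping=*/1, seastar::smp::count);
+//   core_id_t core = co_await mapping.local().maybe_create_pg(pgid);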
+
+/**
+ * PGMap
+ *
+ * Maps spg_t to PG instance within a shard. Handles dealing with waiting
+ * on pg creation.
+ */
+class PGMap {
+ struct PGCreationState : BlockerT<PGCreationState> {
+ static constexpr const char * type_name = "PGCreation";
+
+ void dump_detail(Formatter *f) const final;
+
+ spg_t pgid;
+ seastar::shared_promise<Ref<PG>> promise;
+ bool creating = false;
+ PGCreationState(spg_t pgid);
+
+ PGCreationState(const PGCreationState &) = delete;
+ PGCreationState(PGCreationState &&) = delete;
+ PGCreationState &operator=(const PGCreationState &) = delete;
+ PGCreationState &operator=(PGCreationState &&) = delete;
+
+ ~PGCreationState();
+ };
+
+ std::map<spg_t, PGCreationState> pgs_creating;
+ using pgs_t = std::map<spg_t, Ref<PG>>;
+ pgs_t pgs;
+
+public:
+ using PGCreationBlocker = PGCreationState;
+ using PGCreationBlockingEvent = PGCreationBlocker::BlockingEvent;
+ /**
+ * Get future for pg with a bool indicating whether it's already being
+ * created.
+ */
+ using wait_for_pg_ertr = crimson::errorator<
+ crimson::ct_error::ecanceled>;
+ using wait_for_pg_fut = wait_for_pg_ertr::future<Ref<PG>>;
+ using wait_for_pg_ret = std::pair<wait_for_pg_fut, bool>;
+ wait_for_pg_ret wait_for_pg(PGCreationBlockingEvent::TriggerI&&, spg_t pgid);
+
+ /**
+ * get PG in non-blocking manner
+ */
+ Ref<PG> get_pg(spg_t pgid);
+
+ /**
+ * Set creating
+ */
+ void set_creating(spg_t pgid);
+
+ /**
+ * Set newly created pg
+ */
+ void pg_created(spg_t pgid, Ref<PG> pg);
+
+ /**
+ * Add newly loaded pg
+ */
+ void pg_loaded(spg_t pgid, Ref<PG> pg);
+
+ /**
+ * Cancel pending creation of pgid.
+ */
+ void pg_creation_canceled(spg_t pgid);
+
+ void remove_pg(spg_t pgid);
+
+ pgs_t& get_pgs() { return pgs; }
+ const pgs_t& get_pgs() const { return pgs; }
+ auto get_pg_count() const { return pgs.size(); }
+ PGMap() = default;
+ ~PGMap();
+};
+
+}
diff --git a/src/crimson/osd/pg_meta.cc b/src/crimson/osd/pg_meta.cc
new file mode 100644
index 000000000..288ee52a0
--- /dev/null
+++ b/src/crimson/osd/pg_meta.cc
@@ -0,0 +1,110 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "pg_meta.h"
+
+#include <string_view>
+
+#include "crimson/os/futurized_collection.h"
+#include "crimson/os/futurized_store.h"
+
+using std::string;
+using std::string_view;
+// prefix pgmeta_oid keys with _ so that PGLog::read_log_and_missing() can
+// easily skip them
+using crimson::os::FuturizedStore;
+
+PGMeta::PGMeta(FuturizedStore::Shard& store, spg_t pgid)
+ : store{store},
+ pgid{pgid}
+{}
+
+namespace {
+ template<typename T>
+ std::optional<T> find_value(const FuturizedStore::Shard::omap_values_t& values,
+ string_view key)
+ {
+ auto found = values.find(key);
+ if (found == values.end()) {
+ return {};
+ }
+ auto p = found->second.cbegin();
+ T value;
+ decode(value, p);
+ return std::make_optional(std::move(value));
+ }
+}
+
+seastar::future<epoch_t> PGMeta::get_epoch()
+{
+ return store.open_collection(coll_t{pgid}).then([this](auto ch) {
+ return store.omap_get_values(ch,
+ pgid.make_pgmeta_oid(),
+ {string{infover_key},
+ string{epoch_key}}).safe_then(
+ [](auto&& values) {
+ {
+ // sanity check
+ auto infover = find_value<__u8>(values, infover_key);
+ assert(infover);
+ if (*infover < 10) {
+ throw std::runtime_error("incompatible pg meta");
+ }
+ }
+ {
+ auto epoch = find_value<epoch_t>(values, epoch_key);
+ assert(epoch);
+ return seastar::make_ready_future<epoch_t>(*epoch);
+ }
+ },
+ FuturizedStore::Shard::read_errorator::assert_all{
+ "PGMeta::get_epoch: unable to read pgmeta"
+ });
+ });
+}
+
+seastar::future<std::tuple<pg_info_t, PastIntervals>> PGMeta::load()
+{
+ return store.open_collection(coll_t{pgid}).then([this](auto ch) {
+ return store.omap_get_values(ch,
+ pgid.make_pgmeta_oid(),
+ {string{infover_key},
+ string{info_key},
+ string{biginfo_key},
+ string{fastinfo_key}});
+ }).safe_then([](auto&& values) {
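+ // pgmeta omap layout (informational): an on-disk format version under
+ // infover_key, pg_info_t under info_key, PastIntervals plus purged_snaps
+ // under biginfo_key, and an optional pg_fast_info_t overlay under
+ // fastinfo_key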
+ {
+ // sanity check
+ auto infover = find_value<__u8>(values, infover_key);
+ assert(infover);
+ if (*infover < 10) {
+ throw std::runtime_error("incompatible pg meta");
+ }
+ }
+ pg_info_t info;
+ {
+ auto found = find_value<pg_info_t>(values, info_key);
+ assert(found);
+ info = *std::move(found);
+ }
+ PastIntervals past_intervals;
+ {
+ using biginfo_t = std::pair<PastIntervals, decltype(info.purged_snaps)>;
+ auto big_info = find_value<biginfo_t>(values, biginfo_key);
+ assert(big_info);
+ past_intervals = std::move(big_info->first);
+ info.purged_snaps = std::move(big_info->second);
+ }
+ {
+ auto fast_info = find_value<pg_fast_info_t>(values, fastinfo_key);
+ if (fast_info) {
+ fast_info->try_apply_to(&info);
+ }
+ }
+ return seastar::make_ready_future<std::tuple<pg_info_t, PastIntervals>>(
+ std::make_tuple(std::move(info), std::move(past_intervals)));
+ },
+ FuturizedStore::Shard::read_errorator::assert_all{
+ "PGMeta::load: unable to read pgmeta"
+ });
+}
diff --git a/src/crimson/osd/pg_meta.h b/src/crimson/osd/pg_meta.h
new file mode 100644
index 000000000..21c2bb373
--- /dev/null
+++ b/src/crimson/osd/pg_meta.h
@@ -0,0 +1,20 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <tuple>
+#include <seastar/core/future.hh>
+#include "osd/osd_types.h"
+#include "crimson/os/futurized_store.h"
+
+/// PG related metadata
+class PGMeta
+{
+ crimson::os::FuturizedStore::Shard& store;
+ const spg_t pgid;
+public:
+ PGMeta(crimson::os::FuturizedStore::Shard& store, spg_t pgid);
+ seastar::future<epoch_t> get_epoch();
+ seastar::future<std::tuple<pg_info_t, PastIntervals>> load();
+};
diff --git a/src/crimson/osd/pg_recovery.cc b/src/crimson/osd/pg_recovery.cc
new file mode 100644
index 000000000..09b45779e
--- /dev/null
+++ b/src/crimson/osd/pg_recovery.cc
@@ -0,0 +1,569 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <fmt/format.h>
+#include <fmt/ostream.h>
+#include <fmt/ranges.h>
+
+#include "crimson/common/type_helpers.h"
+#include "crimson/osd/backfill_facades.h"
+#include "crimson/osd/osd_operations/background_recovery.h"
+#include "crimson/osd/osd_operations/peering_event.h"
+#include "crimson/osd/pg.h"
+#include "crimson/osd/pg_backend.h"
+#include "crimson/osd/pg_recovery.h"
+
+#include "osd/osd_types.h"
+#include "osd/PeeringState.h"
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_osd);
+ }
+}
+
+using std::map;
+using std::set;
+
+void PGRecovery::start_pglogbased_recovery()
+{
+ using PglogBasedRecovery = crimson::osd::PglogBasedRecovery;
+ (void) pg->get_shard_services().start_operation<PglogBasedRecovery>(
+ static_cast<crimson::osd::PG*>(pg),
+ pg->get_shard_services(),
+ pg->get_osdmap_epoch(),
+ float(0.001));
+}
+
+PGRecovery::interruptible_future<bool>
+PGRecovery::start_recovery_ops(
+ RecoveryBackend::RecoveryBlockingEvent::TriggerI& trigger,
+ size_t max_to_start)
+{
+ assert(pg->is_primary());
+ assert(pg->is_peered());
+ assert(pg->is_recovering());
+ // in ceph-osd the do_recovery() path handles both the pg log-based
+ // recovery and the backfill, albeit they are separated at the layer
+ // of PeeringState. In crimson-osd backfill has been cut from it, so
+ // and do_recovery() is actually solely for pg log-based recovery.
+ // At the time of writing it's considered to move it to FSM and fix
+ // the naming as well.
+ assert(!pg->is_backfilling());
+ assert(!pg->get_peering_state().is_deleting());
+
+ std::vector<interruptible_future<>> started;
+ started.reserve(max_to_start);
+ max_to_start -= start_primary_recovery_ops(trigger, max_to_start, &started);
+ if (max_to_start > 0) {
+ max_to_start -= start_replica_recovery_ops(trigger, max_to_start, &started);
+ }
+ using interruptor =
+ crimson::interruptible::interruptor<crimson::osd::IOInterruptCondition>;
+ return interruptor::parallel_for_each(started,
+ [] (auto&& ifut) {
+ return std::move(ifut);
+ }).then_interruptible([this] {
+ bool done = !pg->get_peering_state().needs_recovery();
+ if (done) {
+ logger().debug("start_recovery_ops: AllReplicasRecovered for pg: {}",
+ pg->get_pgid());
+ using LocalPeeringEvent = crimson::osd::LocalPeeringEvent;
+ if (!pg->get_peering_state().needs_backfill()) {
+ logger().debug("start_recovery_ops: AllReplicasRecovered for pg: {}",
+ pg->get_pgid());
+ (void) pg->get_shard_services().start_operation<LocalPeeringEvent>(
+ static_cast<crimson::osd::PG*>(pg),
+ pg->get_pg_whoami(),
+ pg->get_pgid(),
+ pg->get_osdmap_epoch(),
+ pg->get_osdmap_epoch(),
+ PeeringState::AllReplicasRecovered{});
+ } else {
+ logger().debug("start_recovery_ops: RequestBackfill for pg: {}",
+ pg->get_pgid());
+ (void) pg->get_shard_services().start_operation<LocalPeeringEvent>(
+ static_cast<crimson::osd::PG*>(pg),
+ pg->get_pg_whoami(),
+ pg->get_pgid(),
+ pg->get_osdmap_epoch(),
+ pg->get_osdmap_epoch(),
+ PeeringState::RequestBackfill{});
+ }
+ }
+ return seastar::make_ready_future<bool>(!done);
+ });
+}
+
+size_t PGRecovery::start_primary_recovery_ops(
+ RecoveryBackend::RecoveryBlockingEvent::TriggerI& trigger,
+ size_t max_to_start,
+ std::vector<PGRecovery::interruptible_future<>> *out)
+{
+ if (!pg->is_recovering()) {
+ return 0;
+ }
+
+ if (!pg->get_peering_state().have_missing()) {
+ pg->get_peering_state().local_recovery_complete();
+ return 0;
+ }
+
+ const auto &missing = pg->get_peering_state().get_pg_log().get_missing();
+
+ logger().info("{} recovering {} in pg {}, missing {}", __func__,
+ pg->get_recovery_backend()->total_recovering(),
+ *static_cast<crimson::osd::PG*>(pg),
+ missing);
+
+ unsigned started = 0;
+ int skipped = 0;
+
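+ // walk the primary's missing set in version order, resuming after the
+ // last version for which recovery was already requested, and start at
+ // most max_to_start recoveries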
+ map<version_t, hobject_t>::const_iterator p =
+ missing.get_rmissing().lower_bound(pg->get_peering_state().get_pg_log().get_log().last_requested);
+ while (started < max_to_start && p != missing.get_rmissing().end()) {
+ // TODO: chain futures here to enable yielding to scheduler?
+ hobject_t soid;
+ version_t v = p->first;
+
+ auto it_objects = pg->get_peering_state().get_pg_log().get_log().objects.find(p->second);
+ if (it_objects != pg->get_peering_state().get_pg_log().get_log().objects.end()) {
+ // look at log!
+ pg_log_entry_t *latest = it_objects->second;
+ assert(latest->is_update() || latest->is_delete());
+ soid = latest->soid;
+ } else {
+ soid = p->second;
+ }
+ const pg_missing_item& item = missing.get_items().find(p->second)->second;
+ ++p;
+
+ hobject_t head = soid.get_head();
+
+ logger().info(
+ "{} {} item.need {} {} {} {} {}",
+ __func__,
+ soid,
+ item.need,
+ missing.is_missing(soid) ? " (missing)":"",
+ missing.is_missing(head) ? " (missing head)":"",
+ pg->get_recovery_backend()->is_recovering(soid) ? " (recovering)":"",
+ pg->get_recovery_backend()->is_recovering(head) ? " (recovering head)":"");
+
+ // TODO: handle lost/unfound
+ if (pg->get_recovery_backend()->is_recovering(soid)) {
+ auto& recovery_waiter = pg->get_recovery_backend()->get_recovering(soid);
+ out->emplace_back(recovery_waiter.wait_for_recovered(trigger));
+ ++started;
+ } else if (pg->get_recovery_backend()->is_recovering(head)) {
+ ++skipped;
+ } else {
+ out->emplace_back(recover_missing(trigger, soid, item.need));
+ ++started;
+ }
+
+ if (!skipped)
+ pg->get_peering_state().set_last_requested(v);
+ }
+
+ logger().info("{} started {} skipped {}", __func__, started, skipped);
+
+ return started;
+}
+
+size_t PGRecovery::start_replica_recovery_ops(
+ RecoveryBackend::RecoveryBlockingEvent::TriggerI& trigger,
+ size_t max_to_start,
+ std::vector<PGRecovery::interruptible_future<>> *out)
+{
+ if (!pg->is_recovering()) {
+ return 0;
+ }
+ uint64_t started = 0;
+
+ assert(!pg->get_peering_state().get_acting_recovery_backfill().empty());
+
+ auto recovery_order = get_replica_recovery_order();
+ for (auto &peer : recovery_order) {
+ assert(peer != pg->get_peering_state().get_primary());
+ const auto& pm = pg->get_peering_state().get_peer_missing(peer);
+
+ logger().debug("{}: peer osd.{} missing {} objects", __func__,
+ peer, pm.num_missing());
+ logger().trace("{}: peer osd.{} missing {}", __func__,
+ peer, pm.get_items());
+
+ // recover oldest first
+ for (auto p = pm.get_rmissing().begin();
+ p != pm.get_rmissing().end() && started < max_to_start;
+ ++p) {
+ const auto &soid = p->second;
+
+ if (pg->get_peering_state().get_missing_loc().is_unfound(soid)) {
+ logger().debug("{}: object {} still unfound", __func__, soid);
+ continue;
+ }
+
+ const pg_info_t &pi = pg->get_peering_state().get_peer_info(peer);
+ if (soid > pi.last_backfill) {
+ if (!pg->get_recovery_backend()->is_recovering(soid)) {
+ logger().error(
+ "{}: object {} in missing set for backfill (last_backfill {})"
+ " but not in recovering",
+ __func__,
+ soid,
+ pi.last_backfill);
+ ceph_abort();
+ }
+ continue;
+ }
+
+ if (pg->get_recovery_backend()->is_recovering(soid)) {
+ logger().debug("{}: already recovering object {}", __func__, soid);
+ auto& recovery_waiter = pg->get_recovery_backend()->get_recovering(soid);
+ out->emplace_back(recovery_waiter.wait_for_recovered(trigger));
+ started++;
+ continue;
+ }
+
+ if (pg->get_peering_state().get_missing_loc().is_deleted(soid)) {
+ logger().debug("{}: soid {} is a delete, removing", __func__, soid);
+ map<hobject_t,pg_missing_item>::const_iterator r =
+ pm.get_items().find(soid);
+ started++;
+ out->emplace_back(
+ prep_object_replica_deletes(trigger, soid, r->second.need));
+ continue;
+ }
+
+ if (soid.is_snap() &&
+ pg->get_peering_state().get_pg_log().get_missing().is_missing(
+ soid.get_head())) {
+ logger().debug("{}: head {} still missing on primary", __func__,
+ soid.get_head());
+ continue;
+ }
+
+ if (pg->get_peering_state().get_pg_log().get_missing().is_missing(soid)) {
+ logger().debug("{}: soid {} still missing on primary", __func__, soid);
+ continue;
+ }
+
+ logger().debug("{}: recover_object_replicas({})", __func__,soid);
+ map<hobject_t,pg_missing_item>::const_iterator r = pm.get_items().find(
+ soid);
+ started++;
+ out->emplace_back(
+ prep_object_replica_pushes(trigger, soid, r->second.need));
+ }
+ }
+
+ return started;
+}
+
+PGRecovery::interruptible_future<>
+PGRecovery::recover_missing(
+ RecoveryBackend::RecoveryBlockingEvent::TriggerI& trigger,
+ const hobject_t &soid, eversion_t need)
+{
+ if (pg->get_peering_state().get_missing_loc().is_deleted(soid)) {
+ return pg->get_recovery_backend()->add_recovering(soid).wait_track_blocking(
+ trigger,
+ pg->get_recovery_backend()->recover_delete(soid, need));
+ } else {
+ return pg->get_recovery_backend()->add_recovering(soid).wait_track_blocking(
+ trigger,
+ pg->get_recovery_backend()->recover_object(soid, need)
+ .handle_exception_interruptible(
+ [=, this, soid = std::move(soid)] (auto e) {
+ on_failed_recover({ pg->get_pg_whoami() }, soid, need);
+ return seastar::make_ready_future<>();
+ })
+ );
+ }
+}
+
+RecoveryBackend::interruptible_future<> PGRecovery::prep_object_replica_deletes(
+ RecoveryBackend::RecoveryBlockingEvent::TriggerI& trigger,
+ const hobject_t& soid,
+ eversion_t need)
+{
+ return pg->get_recovery_backend()->add_recovering(soid).wait_track_blocking(
+ trigger,
+ pg->get_recovery_backend()->push_delete(soid, need).then_interruptible(
+ [=, this] {
+ object_stat_sum_t stat_diff;
+ stat_diff.num_objects_recovered = 1;
+ on_global_recover(soid, stat_diff, true);
+ return seastar::make_ready_future<>();
+ })
+ );
+}
+
+RecoveryBackend::interruptible_future<> PGRecovery::prep_object_replica_pushes(
+ RecoveryBackend::RecoveryBlockingEvent::TriggerI& trigger,
+ const hobject_t& soid,
+ eversion_t need)
+{
+ return pg->get_recovery_backend()->add_recovering(soid).wait_track_blocking(
+ trigger,
+ pg->get_recovery_backend()->recover_object(soid, need)
+ .handle_exception_interruptible(
+ [=, this, soid = std::move(soid)] (auto e) {
+ on_failed_recover({ pg->get_pg_whoami() }, soid, need);
+ return seastar::make_ready_future<>();
+ })
+ );
+}
+
+void PGRecovery::on_local_recover(
+ const hobject_t& soid,
+ const ObjectRecoveryInfo& recovery_info,
+ const bool is_delete,
+ ceph::os::Transaction& t)
+{
+ if (const auto &log = pg->get_peering_state().get_pg_log();
+ !is_delete &&
+ log.get_missing().is_missing(recovery_info.soid) &&
+ log.get_missing().get_items().find(recovery_info.soid)->second.need > recovery_info.version) {
+ assert(pg->is_primary());
+ if (const auto* latest = log.get_log().objects.find(recovery_info.soid)->second;
+ latest->op == pg_log_entry_t::LOST_REVERT) {
+ ceph_abort("mark_unfound_lost (LOST_REVERT) is not implemented yet");
+ }
+ }
+ pg->get_peering_state().recover_got(soid,
+ recovery_info.version, is_delete, t);
+
+ if (pg->is_primary()) {
+ if (!is_delete) {
+ auto& obc = pg->get_recovery_backend()->get_recovering(soid).obc; //TODO: move to pg backend?
+ obc->obs.exists = true;
+ obc->obs.oi = recovery_info.oi;
+ }
+ if (!pg->is_unreadable_object(soid)) {
+ pg->get_recovery_backend()->get_recovering(soid).set_readable();
+ }
+ pg->publish_stats_to_osd();
+ }
+}
+
+void PGRecovery::on_global_recover (
+ const hobject_t& soid,
+ const object_stat_sum_t& stat_diff,
+ const bool is_delete)
+{
+ logger().info("{} {}", __func__, soid);
+ pg->get_peering_state().object_recovered(soid, stat_diff);
+ pg->publish_stats_to_osd();
+ auto& recovery_waiter = pg->get_recovery_backend()->get_recovering(soid);
+ if (!is_delete)
+ recovery_waiter.obc->drop_recovery_read();
+ recovery_waiter.set_recovered();
+ pg->get_recovery_backend()->remove_recovering(soid);
+}
+
+void PGRecovery::on_failed_recover(
+ const set<pg_shard_t>& from,
+ const hobject_t& soid,
+ const eversion_t& v)
+{
+ for (auto pg_shard : from) {
+ if (pg_shard != pg->get_pg_whoami()) {
+ pg->get_peering_state().force_object_missing(pg_shard, soid, v);
+ }
+ }
+}
+
+void PGRecovery::on_peer_recover(
+ pg_shard_t peer,
+ const hobject_t &oid,
+ const ObjectRecoveryInfo &recovery_info)
+{
+ crimson::get_logger(ceph_subsys_osd).debug(
+ "{}: {}, {} on {}", __func__, oid,
+ recovery_info.version, peer);
+ pg->get_peering_state().on_peer_recover(peer, oid, recovery_info.version);
+}
+
+void PGRecovery::_committed_pushed_object(epoch_t epoch,
+ eversion_t last_complete)
+{
+ if (!pg->has_reset_since(epoch)) {
+ pg->get_peering_state().recovery_committed_to(last_complete);
+ } else {
+ crimson::get_logger(ceph_subsys_osd).debug(
+ "{} pg has changed, not touching last_complete_ondisk",
+ __func__);
+ }
+}
+
+template <class EventT>
+void PGRecovery::start_backfill_recovery(const EventT& evt)
+{
+ using BackfillRecovery = crimson::osd::BackfillRecovery;
+ std::ignore = pg->get_shard_services().start_operation<BackfillRecovery>(
+ static_cast<crimson::osd::PG*>(pg),
+ pg->get_shard_services(),
+ pg->get_osdmap_epoch(),
+ evt);
+}
+
+void PGRecovery::request_replica_scan(
+ const pg_shard_t& target,
+ const hobject_t& begin,
+ const hobject_t& end)
+{
+ logger().debug("{}: target.osd={}", __func__, target.osd);
+ auto msg = crimson::make_message<MOSDPGScan>(
+ MOSDPGScan::OP_SCAN_GET_DIGEST,
+ pg->get_pg_whoami(),
+ pg->get_osdmap_epoch(),
+ pg->get_last_peering_reset(),
+ spg_t(pg->get_pgid().pgid, target.shard),
+ begin,
+ end);
+ std::ignore = pg->get_shard_services().send_to_osd(
+ target.osd,
+ std::move(msg),
+ pg->get_osdmap_epoch());
+}
+
+void PGRecovery::request_primary_scan(
+ const hobject_t& begin)
+{
+ logger().debug("{}", __func__);
+ using crimson::common::local_conf;
+ std::ignore = pg->get_recovery_backend()->scan_for_backfill(
+ begin,
+ local_conf()->osd_backfill_scan_min,
+ local_conf()->osd_backfill_scan_max
+ ).then_interruptible([this] (BackfillInterval bi) {
+ logger().debug("request_primary_scan:{}", __func__);
+ using BackfillState = crimson::osd::BackfillState;
+ start_backfill_recovery(BackfillState::PrimaryScanned{ std::move(bi) });
+ });
+}
+
+void PGRecovery::enqueue_push(
+ const hobject_t& obj,
+ const eversion_t& v)
+{
+ logger().debug("{}: obj={} v={}",
+ __func__, obj, v);
+ pg->get_recovery_backend()->add_recovering(obj);
+ std::ignore = pg->get_recovery_backend()->recover_object(obj, v).\
+ handle_exception_interruptible([] (auto) {
+ ceph_abort_msg("got exception on backfill's push");
+ return seastar::make_ready_future<>();
+ }).then_interruptible([this, obj] {
+ logger().debug("enqueue_push:{}", __func__);
+ using BackfillState = crimson::osd::BackfillState;
+ start_backfill_recovery(BackfillState::ObjectPushed(std::move(obj)));
+ });
+}
+
+void PGRecovery::enqueue_drop(
+ const pg_shard_t& target,
+ const hobject_t& obj,
+ const eversion_t& v)
+{
+ // allocate a pair if target is seen for the first time
+ auto& req = backfill_drop_requests[target];
+ if (!req) {
+ req = crimson::make_message<MOSDPGBackfillRemove>(
+ spg_t(pg->get_pgid().pgid, target.shard), pg->get_osdmap_epoch());
+ }
+ req->ls.emplace_back(obj, v);
+}
+
+void PGRecovery::maybe_flush()
+{
+ for (auto& [target, req] : backfill_drop_requests) {
+ std::ignore = pg->get_shard_services().send_to_osd(
+ target.osd,
+ std::move(req),
+ pg->get_osdmap_epoch());
+ }
+ backfill_drop_requests.clear();
+}
+
+void PGRecovery::update_peers_last_backfill(
+ const hobject_t& new_last_backfill)
+{
+ logger().debug("{}: new_last_backfill={}",
+ __func__, new_last_backfill);
+ // If new_last_backfill == MAX, then we will send OP_BACKFILL_FINISH to
+ // all the backfill targets. Otherwise, we will move last_backfill up on
+ // those targets that need it and send OP_BACKFILL_PROGRESS to them.
+ for (const auto& bt : pg->get_peering_state().get_backfill_targets()) {
+ if (const pg_info_t& pinfo = pg->get_peering_state().get_peer_info(bt);
+ new_last_backfill > pinfo.last_backfill) {
+ pg->get_peering_state().update_peer_last_backfill(bt, new_last_backfill);
+ auto m = crimson::make_message<MOSDPGBackfill>(
+ pinfo.last_backfill.is_max() ? MOSDPGBackfill::OP_BACKFILL_FINISH
+ : MOSDPGBackfill::OP_BACKFILL_PROGRESS,
+ pg->get_osdmap_epoch(),
+ pg->get_last_peering_reset(),
+ spg_t(pg->get_pgid().pgid, bt.shard));
+ // Use default priority here, must match sub_op priority
+ // TODO: if pinfo.last_backfill.is_max(), then
+ // start_recovery_op(hobject_t::get_max());
+ m->last_backfill = pinfo.last_backfill;
+ m->stats = pinfo.stats;
+ std::ignore = pg->get_shard_services().send_to_osd(
+ bt.osd, std::move(m), pg->get_osdmap_epoch());
+ logger().info("{}: peer {} num_objects now {} / {}",
+ __func__,
+ bt,
+ pinfo.stats.stats.sum.num_objects,
+ pg->get_info().stats.stats.sum.num_objects);
+ }
+ }
+}
+
+bool PGRecovery::budget_available() const
+{
+ // TODO: the limits!
+ return true;
+}
+
+void PGRecovery::backfilled()
+{
+ using LocalPeeringEvent = crimson::osd::LocalPeeringEvent;
+ std::ignore = pg->get_shard_services().start_operation<LocalPeeringEvent>(
+ static_cast<crimson::osd::PG*>(pg),
+ pg->get_pg_whoami(),
+ pg->get_pgid(),
+ pg->get_osdmap_epoch(),
+ pg->get_osdmap_epoch(),
+ PeeringState::Backfilled{});
+}
+
+void PGRecovery::dispatch_backfill_event(
+ boost::intrusive_ptr<const boost::statechart::event_base> evt)
+{
+ logger().debug("{}", __func__);
+ backfill_state->process_event(evt);
+}
+
+void PGRecovery::on_backfill_reserved()
+{
+ logger().debug("{}", __func__);
+ // PIMPL and dependency injection for the sake of unit testability.
+ // The extra indirection is not a performance concern here.
+ using BackfillState = crimson::osd::BackfillState;
+ backfill_state = std::make_unique<BackfillState>(
+ *this,
+ std::make_unique<crimson::osd::PeeringFacade>(pg->get_peering_state()),
+ std::make_unique<crimson::osd::PGFacade>(
+ *static_cast<crimson::osd::PG*>(pg)));
+ // yes, it's **not** backfilling yet. The PG_STATE_BACKFILLING
+ // will be set after on_backfill_reserved() returns.
+ // Backfill needs to take this into consideration when scheduling
+ // events -- they must be mutually exclusive with PeeringEvent
+ // instances. Otherwise the execution might begin without having
+ // the state updated.
+ ceph_assert(!pg->get_peering_state().is_backfilling());
+ start_backfill_recovery(BackfillState::Triggered{});
+}
diff --git a/src/crimson/osd/pg_recovery.h b/src/crimson/osd/pg_recovery.h
new file mode 100644
index 000000000..719d0ad2d
--- /dev/null
+++ b/src/crimson/osd/pg_recovery.h
@@ -0,0 +1,118 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <seastar/core/future.hh>
+
+#include "crimson/osd/backfill_state.h"
+#include "crimson/osd/pg_interval_interrupt_condition.h"
+#include "crimson/osd/osd_operation.h"
+#include "crimson/osd/pg_recovery_listener.h"
+#include "crimson/osd/scheduler/scheduler.h"
+#include "crimson/osd/shard_services.h"
+#include "crimson/osd/recovery_backend.h"
+
+#include "osd/object_state.h"
+
+namespace crimson::osd {
+class UrgentRecovery;
+}
+
+class MOSDPGBackfillRemove;
+class PGBackend;
+
+class PGRecovery : public crimson::osd::BackfillState::BackfillListener {
+public:
+ template <typename T = void>
+ using interruptible_future = RecoveryBackend::interruptible_future<T>;
+ PGRecovery(PGRecoveryListener* pg) : pg(pg) {}
+ virtual ~PGRecovery() {}
+ void start_pglogbased_recovery();
+
+ interruptible_future<bool> start_recovery_ops(
+ RecoveryBackend::RecoveryBlockingEvent::TriggerI&,
+ size_t max_to_start);
+ void on_backfill_reserved();
+ void dispatch_backfill_event(
+ boost::intrusive_ptr<const boost::statechart::event_base> evt);
+
+ seastar::future<> stop() { return seastar::now(); }
+private:
+ PGRecoveryListener* pg;
+ size_t start_primary_recovery_ops(
+ RecoveryBackend::RecoveryBlockingEvent::TriggerI&,
+ size_t max_to_start,
+ std::vector<interruptible_future<>> *out);
+ size_t start_replica_recovery_ops(
+ RecoveryBackend::RecoveryBlockingEvent::TriggerI&,
+ size_t max_to_start,
+ std::vector<interruptible_future<>> *out);
+
+ std::vector<pg_shard_t> get_replica_recovery_order() const {
+ return pg->get_replica_recovery_order();
+ }
+ RecoveryBackend::interruptible_future<> recover_missing(
+ RecoveryBackend::RecoveryBlockingEvent::TriggerI&,
+ const hobject_t &soid, eversion_t need);
+ RecoveryBackend::interruptible_future<> prep_object_replica_deletes(
+ RecoveryBackend::RecoveryBlockingEvent::TriggerI& trigger,
+ const hobject_t& soid,
+ eversion_t need);
+ RecoveryBackend::interruptible_future<> prep_object_replica_pushes(
+ RecoveryBackend::RecoveryBlockingEvent::TriggerI& trigger,
+ const hobject_t& soid,
+ eversion_t need);
+
+ void on_local_recover(
+ const hobject_t& soid,
+ const ObjectRecoveryInfo& recovery_info,
+ bool is_delete,
+ ceph::os::Transaction& t);
+  void on_global_recover(
+ const hobject_t& soid,
+ const object_stat_sum_t& stat_diff,
+ bool is_delete);
+ void on_failed_recover(
+ const std::set<pg_shard_t>& from,
+ const hobject_t& soid,
+ const eversion_t& v);
+ void on_peer_recover(
+ pg_shard_t peer,
+ const hobject_t &oid,
+ const ObjectRecoveryInfo &recovery_info);
+ void _committed_pushed_object(epoch_t epoch,
+ eversion_t last_complete);
+ friend class ReplicatedRecoveryBackend;
+ friend class crimson::osd::UrgentRecovery;
+
+ // backfill begin
+ std::unique_ptr<crimson::osd::BackfillState> backfill_state;
+ std::map<pg_shard_t,
+ MURef<MOSDPGBackfillRemove>> backfill_drop_requests;
+
+ template <class EventT>
+ void start_backfill_recovery(
+ const EventT& evt);
+ void request_replica_scan(
+ const pg_shard_t& target,
+ const hobject_t& begin,
+ const hobject_t& end) final;
+ void request_primary_scan(
+ const hobject_t& begin) final;
+ void enqueue_push(
+ const hobject_t& obj,
+ const eversion_t& v) final;
+ void enqueue_drop(
+ const pg_shard_t& target,
+ const hobject_t& obj,
+ const eversion_t& v) final;
+ void maybe_flush() final;
+ void update_peers_last_backfill(
+ const hobject_t& new_last_backfill) final;
+ bool budget_available() const final;
+ void backfilled() final;
+ friend crimson::osd::BackfillState::PGFacade;
+ friend crimson::osd::PG;
+ // backfill end
+};
diff --git a/src/crimson/osd/pg_recovery_listener.h b/src/crimson/osd/pg_recovery_listener.h
new file mode 100644
index 000000000..c922b9956
--- /dev/null
+++ b/src/crimson/osd/pg_recovery_listener.h
@@ -0,0 +1,39 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <seastar/core/future.hh>
+
+#include "common/hobject.h"
+#include "include/types.h"
+#include "osd/osd_types.h"
+
+namespace crimson::osd {
+ class ShardServices;
+};
+
+class RecoveryBackend;
+class PGRecovery;
+
+class PGRecoveryListener {
+public:
+ virtual crimson::osd::ShardServices& get_shard_services() = 0;
+ virtual PGRecovery* get_recovery_handler() = 0;
+ virtual epoch_t get_osdmap_epoch() const = 0;
+ virtual bool is_primary() const = 0;
+ virtual bool is_peered() const = 0;
+ virtual bool is_recovering() const = 0;
+ virtual bool is_backfilling() const = 0;
+ virtual PeeringState& get_peering_state() = 0;
+ virtual const pg_shard_t& get_pg_whoami() const = 0;
+ virtual const spg_t& get_pgid() const = 0;
+ virtual RecoveryBackend* get_recovery_backend() = 0;
+ virtual bool is_unreadable_object(const hobject_t&, eversion_t* v = 0) const = 0;
+ virtual bool has_reset_since(epoch_t) const = 0;
+ virtual std::vector<pg_shard_t> get_replica_recovery_order() const = 0;
+ virtual epoch_t get_last_peering_reset() const = 0;
+  virtual const pg_info_t& get_info() const = 0;
+ virtual seastar::future<> stop() = 0;
+ virtual void publish_stats_to_osd() = 0;
+};
diff --git a/src/crimson/osd/pg_shard_manager.cc b/src/crimson/osd/pg_shard_manager.cc
new file mode 100644
index 000000000..6061c856b
--- /dev/null
+++ b/src/crimson/osd/pg_shard_manager.cc
@@ -0,0 +1,108 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "crimson/osd/pg_shard_manager.h"
+#include "crimson/osd/pg.h"
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_osd);
+ }
+}
+
+namespace crimson::osd {
+
+seastar::future<> PGShardManager::load_pgs(crimson::os::FuturizedStore& store)
+{
+ ceph_assert(seastar::this_shard_id() == PRIMARY_CORE);
+ return store.list_collections(
+ ).then([this](auto colls_cores) {
+ return seastar::parallel_for_each(
+ colls_cores,
+ [this](auto coll_core) {
+ auto[coll, shard_core] = coll_core;
+ spg_t pgid;
+ if (coll.is_pg(&pgid)) {
+ return get_pg_to_shard_mapping().maybe_create_pg(
+ pgid, shard_core
+ ).then([this, pgid] (auto core) {
+ return this->template with_remote_shard_state(
+ core,
+ [pgid](
+ PerShardState &per_shard_state,
+ ShardServices &shard_services) {
+ return shard_services.load_pg(
+ pgid
+ ).then([pgid, &per_shard_state](auto &&pg) {
+ logger().info("load_pgs: loaded {}", pgid);
+ per_shard_state.pg_map.pg_loaded(pgid, std::move(pg));
+ return seastar::now();
+ });
+ });
+ });
+ } else if (coll.is_temp(&pgid)) {
+ logger().warn(
+ "found temp collection on crimson osd, should be impossible: {}",
+ coll);
+ ceph_assert(0 == "temp collection on crimson osd, should be impossible");
+ return seastar::now();
+ } else {
+ logger().warn("ignoring unrecognized collection: {}", coll);
+ return seastar::now();
+ }
+ });
+ });
+}
+
+seastar::future<> PGShardManager::stop_pgs()
+{
+ ceph_assert(seastar::this_shard_id() == PRIMARY_CORE);
+ return shard_services.invoke_on_all([](auto &local_service) {
+ return local_service.local_state.stop_pgs();
+ });
+}
+
+seastar::future<std::map<pg_t, pg_stat_t>>
+PGShardManager::get_pg_stats() const
+{
+ ceph_assert(seastar::this_shard_id() == PRIMARY_CORE);
+ return shard_services.map_reduce0(
+ [](auto &local) {
+ return local.local_state.get_pg_stats();
+ },
+ std::map<pg_t, pg_stat_t>(),
+ [](auto &&left, auto &&right) {
+ left.merge(std::move(right));
+ return std::move(left);
+ });
+}
+
+seastar::future<> PGShardManager::broadcast_map_to_pgs(epoch_t epoch)
+{
+ ceph_assert(seastar::this_shard_id() == PRIMARY_CORE);
+ return shard_services.invoke_on_all([epoch](auto &local_service) {
+ return local_service.local_state.broadcast_map_to_pgs(
+ local_service, epoch
+ );
+ }).then([this, epoch] {
+ logger().debug("PGShardManager::broadcast_map_to_pgs "
+ "broadcasted up to {}",
+ epoch);
+ return shard_services.invoke_on_all([epoch](auto &local_service) {
+ local_service.local_state.osdmap_gate.got_map(epoch);
+ return seastar::now();
+ });
+ });
+}
+
+seastar::future<> PGShardManager::set_up_epoch(epoch_t e) {
+ ceph_assert(seastar::this_shard_id() == PRIMARY_CORE);
+ return shard_services.invoke_on_all(
+ seastar::smp_submit_to_options{},
+ [e](auto &local_service) {
+ local_service.local_state.set_up_epoch(e);
+ return seastar::now();
+ });
+}
+
+}
diff --git a/src/crimson/osd/pg_shard_manager.h b/src/crimson/osd/pg_shard_manager.h
new file mode 100644
index 000000000..2f3a3015d
--- /dev/null
+++ b/src/crimson/osd/pg_shard_manager.h
@@ -0,0 +1,390 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <seastar/core/future.hh>
+#include <seastar/core/shared_future.hh>
+#include <seastar/core/sharded.hh>
+
+#include "crimson/osd/shard_services.h"
+#include "crimson/osd/pg_map.h"
+
+namespace crimson::os {
+ class FuturizedStore;
+}
+
+namespace crimson::osd {
+/**
+ * PGShardManager
+ *
+ * Manages all state required to partition PGs over seastar reactors
+ * as well as state required to route messages to pgs. Mediates access to
+ * shared resources required by PGs (objectstore, messenger, monclient,
+ * etc)
+ */
+class PGShardManager {
+ seastar::sharded<OSDSingletonState> &osd_singleton_state;
+ seastar::sharded<ShardServices> &shard_services;
+ seastar::sharded<PGShardMapping> &pg_to_shard_mapping;
+
+#define FORWARD_CONST(FROM_METHOD, TO_METHOD, TARGET) \
+ template <typename... Args> \
+ auto FROM_METHOD(Args&&... args) const { \
+ return TARGET.TO_METHOD(std::forward<Args>(args)...); \
+ }
+
+#define FORWARD(FROM_METHOD, TO_METHOD, TARGET) \
+ template <typename... Args> \
+ auto FROM_METHOD(Args&&... args) { \
+ return TARGET.TO_METHOD(std::forward<Args>(args)...); \
+ }
+
+#define FORWARD_TO_OSD_SINGLETON(METHOD) \
+ FORWARD(METHOD, METHOD, get_osd_singleton_state())
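+// For reference, FORWARD_TO_OSD_SINGLETON(send_pg_created) expands to roughly:
+//   template <typename... Args>
+//   auto send_pg_created(Args&&... args) {
+//     return get_osd_singleton_state().send_pg_created(std::forward<Args>(args)...);
+//   }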
+
+public:
+ using cached_map_t = OSDMapService::cached_map_t;
+ using local_cached_map_t = OSDMapService::local_cached_map_t;
+
+ PGShardManager(
+ seastar::sharded<OSDSingletonState> &osd_singleton_state,
+ seastar::sharded<ShardServices> &shard_services,
+ seastar::sharded<PGShardMapping> &pg_to_shard_mapping)
+ : osd_singleton_state(osd_singleton_state),
+ shard_services(shard_services),
+ pg_to_shard_mapping(pg_to_shard_mapping) {}
+
+ auto &get_osd_singleton_state() {
+ ceph_assert(seastar::this_shard_id() == PRIMARY_CORE);
+ return osd_singleton_state.local();
+ }
+ auto &get_osd_singleton_state() const {
+ ceph_assert(seastar::this_shard_id() == PRIMARY_CORE);
+ return osd_singleton_state.local();
+ }
+ auto &get_shard_services() {
+ return shard_services.local();
+ }
+ auto &get_shard_services() const {
+ return shard_services.local();
+ }
+ auto &get_local_state() { return get_shard_services().local_state; }
+ auto &get_local_state() const { return get_shard_services().local_state; }
+ auto &get_pg_to_shard_mapping() { return pg_to_shard_mapping.local(); }
+ auto &get_pg_to_shard_mapping() const { return pg_to_shard_mapping.local(); }
+
+ seastar::future<> update_map(local_cached_map_t &&map) {
+ get_osd_singleton_state().update_map(
+ make_local_shared_foreign(local_cached_map_t(map))
+ );
+ /* We need each core to get its own foreign_ptr<local_cached_map_t>.
+ * foreign_ptr can't be cheaply copied, so we make one for each core
+ * up front. */
+ return seastar::do_with(
+ std::vector<seastar::foreign_ptr<local_cached_map_t>>(),
+ [this, map](auto &fmaps) {
+ fmaps.resize(seastar::smp::count);
+ for (auto &i: fmaps) {
+ i = seastar::foreign_ptr(map);
+ }
+ return shard_services.invoke_on_all(
+ [&fmaps](auto &local) mutable {
+ local.local_state.update_map(
+ make_local_shared_foreign(
+ std::move(fmaps[seastar::this_shard_id()])
+ ));
+ });
+ });
+ }
+
+ seastar::future<> stop_registries() {
+ return shard_services.invoke_on_all([](auto &local) {
+ return local.local_state.stop_registry();
+ });
+ }
+
+ FORWARD_TO_OSD_SINGLETON(send_pg_created)
+
+ // osd state forwards
+ FORWARD(is_active, is_active, get_shard_services().local_state.osd_state)
+ FORWARD(is_preboot, is_preboot, get_shard_services().local_state.osd_state)
+ FORWARD(is_booting, is_booting, get_shard_services().local_state.osd_state)
+ FORWARD(is_stopping, is_stopping, get_shard_services().local_state.osd_state)
+ FORWARD(is_prestop, is_prestop, get_shard_services().local_state.osd_state)
+ FORWARD(is_initializing, is_initializing, get_shard_services().local_state.osd_state)
+ FORWARD(set_prestop, set_prestop, get_shard_services().local_state.osd_state)
+ FORWARD(set_preboot, set_preboot, get_shard_services().local_state.osd_state)
+ FORWARD(set_booting, set_booting, get_shard_services().local_state.osd_state)
+ FORWARD(set_stopping, set_stopping, get_shard_services().local_state.osd_state)
+ FORWARD(set_active, set_active, get_shard_services().local_state.osd_state)
+ FORWARD(when_active, when_active, get_shard_services().local_state.osd_state)
+ FORWARD_CONST(get_osd_state_string, to_string, get_shard_services().local_state.osd_state)
+
+ FORWARD(got_map, got_map, get_shard_services().local_state.osdmap_gate)
+ FORWARD(wait_for_map, wait_for_map, get_shard_services().local_state.osdmap_gate)
+
+ // Metacoll
+ FORWARD_TO_OSD_SINGLETON(init_meta_coll)
+ FORWARD_TO_OSD_SINGLETON(get_meta_coll)
+
+ FORWARD_TO_OSD_SINGLETON(set_superblock)
+
+ // Core OSDMap methods
+ FORWARD_TO_OSD_SINGLETON(get_local_map)
+ FORWARD_TO_OSD_SINGLETON(load_map_bl)
+ FORWARD_TO_OSD_SINGLETON(load_map_bls)
+ FORWARD_TO_OSD_SINGLETON(store_maps)
+
+ seastar::future<> set_up_epoch(epoch_t e);
+
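+  /**
+   * with_remote_shard_state
+   *
+   * Invokes f(PerShardState&, ShardServices&) on the shard identified by
+   * core and returns the resulting future.
+   */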
+ template <typename F>
+ auto with_remote_shard_state(core_id_t core, F &&f) {
+ return shard_services.invoke_on(
+ core, [f=std::move(f)](auto &target_shard_services) mutable {
+ return std::invoke(
+ std::move(f), target_shard_services.local_state,
+ target_shard_services);
+ });
+ }
+
+ template <typename T, typename F>
+ auto with_remote_shard_state_and_op(
+ core_id_t core,
+ typename T::IRef &&op,
+ F &&f) {
+ if (seastar::this_shard_id() == core) {
+ auto &target_shard_services = shard_services.local();
+ return std::invoke(
+ std::move(f),
+ target_shard_services.local_state,
+ target_shard_services,
+ std::move(op));
+ }
+ return op->prepare_remote_submission(
+ ).then([op=std::move(op), f=std::move(f), this, core
+ ](auto f_conn) mutable {
+ return shard_services.invoke_on(
+ core,
+ [f=std::move(f), op=std::move(op), f_conn=std::move(f_conn)
+ ](auto &target_shard_services) mutable {
+ op->finish_remote_submission(std::move(f_conn));
+ return std::invoke(
+ std::move(f),
+ target_shard_services.local_state,
+ target_shard_services,
+ std::move(op));
+ });
+ });
+ }
+
+ /// Runs opref on the appropriate core, creating the pg as necessary.
+ template <typename T>
+ seastar::future<> run_with_pg_maybe_create(
+ typename T::IRef op
+ ) {
+ ceph_assert(op->use_count() == 1);
+ auto &logger = crimson::get_logger(ceph_subsys_osd);
+ static_assert(T::can_create());
+ logger.debug("{}: can_create", *op);
+
+ get_local_state().registry.remove_from_registry(*op);
+ return get_pg_to_shard_mapping().maybe_create_pg(
+ op->get_pgid()
+ ).then([this, op = std::move(op)](auto core) mutable {
+ return this->template with_remote_shard_state_and_op<T>(
+ core, std::move(op),
+ [](PerShardState &per_shard_state,
+ ShardServices &shard_services,
+ typename T::IRef op) {
+ per_shard_state.registry.add_to_registry(*op);
+ auto &logger = crimson::get_logger(ceph_subsys_osd);
+ auto &opref = *op;
+ return opref.template with_blocking_event<
+ PGMap::PGCreationBlockingEvent
+ >([&shard_services, &opref](
+ auto &&trigger) {
+ return shard_services.get_or_create_pg(
+ std::move(trigger),
+ opref.get_pgid(),
+ std::move(opref.get_create_info())
+ );
+ }).safe_then([&logger, &shard_services, &opref](Ref<PG> pgref) {
+ logger.debug("{}: have_pg", opref);
+ return opref.with_pg(shard_services, pgref);
+ }).handle_error(
+ crimson::ct_error::ecanceled::handle([&logger, &opref](auto) {
+ logger.debug("{}: pg creation canceled, dropping", opref);
+ return seastar::now();
+ })
+ ).then([op=std::move(op)] {});
+ });
+ });
+ }
+
+  /// Runs opref on the appropriate core, waiting for the pg as necessary.
+ template <typename T>
+ seastar::future<> run_with_pg_maybe_wait(
+ typename T::IRef op
+ ) {
+ ceph_assert(op->use_count() == 1);
+ auto &logger = crimson::get_logger(ceph_subsys_osd);
+ static_assert(!T::can_create());
+ logger.debug("{}: !can_create", *op);
+
+ get_local_state().registry.remove_from_registry(*op);
+ return get_pg_to_shard_mapping().maybe_create_pg(
+ op->get_pgid()
+ ).then([this, op = std::move(op)](auto core) mutable {
+ return this->template with_remote_shard_state_and_op<T>(
+ core, std::move(op),
+ [](PerShardState &per_shard_state,
+ ShardServices &shard_services,
+ typename T::IRef op) {
+ per_shard_state.registry.add_to_registry(*op);
+ auto &logger = crimson::get_logger(ceph_subsys_osd);
+ auto &opref = *op;
+ return opref.template with_blocking_event<
+ PGMap::PGCreationBlockingEvent
+ >([&shard_services, &opref](
+ auto &&trigger) {
+ return shard_services.wait_for_pg(
+ std::move(trigger), opref.get_pgid());
+ }).safe_then([&logger, &shard_services, &opref](Ref<PG> pgref) {
+ logger.debug("{}: have_pg", opref);
+ return opref.with_pg(shard_services, pgref);
+ }).handle_error(
+ crimson::ct_error::ecanceled::handle([&logger, &opref](auto) {
+ logger.debug("{}: pg creation canceled, dropping", opref);
+ return seastar::now();
+ })
+ ).then([op=std::move(op)] {});
+ });
+ });
+ }
+
+ seastar::future<> load_pgs(crimson::os::FuturizedStore& store);
+ seastar::future<> stop_pgs();
+
+ seastar::future<std::map<pg_t, pg_stat_t>> get_pg_stats() const;
+
+ /**
+   * invoke_on_each_shard_seq
+   *
+   * Invokes f (e.g. a ShardServices method) on each shard sequentially.
+ */
+ template <typename F, typename... Args>
+ seastar::future<> invoke_on_each_shard_seq(
+ F &&f) const {
+ return sharded_map_seq(
+ shard_services,
+ [f=std::forward<F>(f)](const ShardServices &shard_services) mutable {
+ return std::invoke(
+ f,
+ shard_services);
+ });
+ }
+
+ /**
+ * for_each_pg
+ *
+ * Invokes f on each pg sequentially. Caller may rely on f not being
+ * invoked concurrently on multiple cores.
+ */
+ template <typename F>
+ seastar::future<> for_each_pg(F &&f) const {
+ return invoke_on_each_shard_seq(
+ [f=std::move(f)](const auto &local_service) mutable {
+ for (auto &pg: local_service.local_state.pg_map.get_pgs()) {
+ std::apply(f, pg);
+ }
+ return seastar::now();
+ });
+ }
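+  // Illustrative use (hypothetical caller), dumping every pg held by this OSD:
+  //   pg_shard_manager.for_each_pg([](auto &pgid, auto &pg) {
+  //     fmt::print("pg {}\n", pgid);
+  //   });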
+
+ /**
+ * for_each_pgid
+ *
+   * Synchronously invokes f on each pgid
+ */
+ template <typename F>
+ void for_each_pgid(F &&f) const {
+ return get_pg_to_shard_mapping().for_each_pgid(
+ std::forward<F>(f));
+ }
+
+ auto get_num_pgs() const {
+ return get_pg_to_shard_mapping().get_num_pgs();
+ }
+
+ seastar::future<> broadcast_map_to_pgs(epoch_t epoch);
+
+ template <typename F>
+ auto with_pg(spg_t pgid, F &&f) {
+ core_id_t core = get_pg_to_shard_mapping().get_pg_mapping(pgid);
+ return with_remote_shard_state(
+ core,
+ [pgid, f=std::move(f)](auto &local_state, auto &local_service) mutable {
+ return std::invoke(
+ std::move(f),
+ local_state.pg_map.get_pg(pgid));
+ });
+ }
+
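+  /**
+   * start_pg_operation
+   *
+   * Creates an operation of type T and drives it through the connection
+   * pipeline: await_active (wait for the OSD to become active), await_map
+   * (wait for the op's epoch via the osdmap gate), then get_pg (route the
+   * op to its PG's core, creating or waiting for the PG depending on
+   * T::can_create()). Returns {operation id, future for its completion}.
+   */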
+ template <typename T, typename... Args>
+ auto start_pg_operation(Args&&... args) {
+ auto op = get_local_state().registry.create_operation<T>(
+ std::forward<Args>(args)...);
+ auto &logger = crimson::get_logger(ceph_subsys_osd);
+ logger.debug("{}: starting {}", *op, __func__);
+
+ auto &opref = *op;
+ auto id = op->get_id();
+ if constexpr (T::is_trackable) {
+ op->template track_event<typename T::StartEvent>();
+ }
+ auto fut = opref.template enter_stage<>(
+ opref.get_connection_pipeline().await_active
+ ).then([this, &opref, &logger] {
+ logger.debug("{}: start_pg_operation in await_active stage", opref);
+ return get_shard_services().local_state.osd_state.when_active();
+ }).then([&logger, &opref] {
+ logger.debug("{}: start_pg_operation active, entering await_map", opref);
+ return opref.template enter_stage<>(
+ opref.get_connection_pipeline().await_map);
+ }).then([this, &logger, &opref] {
+ logger.debug("{}: start_pg_operation await_map stage", opref);
+ using OSDMapBlockingEvent =
+ OSD_OSDMapGate::OSDMapBlocker::BlockingEvent;
+ return opref.template with_blocking_event<OSDMapBlockingEvent>(
+ [this, &opref](auto &&trigger) {
+ std::ignore = this;
+ return get_shard_services().local_state.osdmap_gate.wait_for_map(
+ std::move(trigger),
+ opref.get_epoch(),
+ &get_shard_services());
+ });
+ }).then([&logger, &opref](auto epoch) {
+ logger.debug("{}: got map {}, entering get_pg", opref, epoch);
+ return opref.template enter_stage<>(
+ opref.get_connection_pipeline().get_pg);
+ }).then([this, &logger, &opref, op=std::move(op)]() mutable {
+ logger.debug("{}: in get_pg core {}", opref, seastar::this_shard_id());
+ logger.debug("{}: in get_pg", opref);
+ if constexpr (T::can_create()) {
+ logger.debug("{}: can_create", opref);
+ return run_with_pg_maybe_create<T>(std::move(op));
+ } else {
+ logger.debug("{}: !can_create", opref);
+ return run_with_pg_maybe_wait<T>(std::move(op));
+ }
+ });
+ return std::make_pair(id, std::move(fut));
+ }
+
+#undef FORWARD
+#undef FORWARD_CONST
+#undef FORWARD_TO_OSD_SINGLETON
+};
+
+}
diff --git a/src/crimson/osd/recovery_backend.cc b/src/crimson/osd/recovery_backend.cc
new file mode 100644
index 000000000..b5394bfdc
--- /dev/null
+++ b/src/crimson/osd/recovery_backend.cc
@@ -0,0 +1,328 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <fmt/format.h>
+
+#include "crimson/common/exception.h"
+#include "crimson/osd/recovery_backend.h"
+#include "crimson/osd/pg.h"
+#include "crimson/osd/pg_backend.h"
+#include "crimson/osd/osd_operations/background_recovery.h"
+
+#include "messages/MOSDFastDispatchOp.h"
+#include "osd/osd_types.h"
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_osd);
+ }
+}
+
+hobject_t RecoveryBackend::get_temp_recovery_object(
+ const hobject_t& target,
+ eversion_t version) const
+{
+ hobject_t hoid =
+ target.make_temp_hobject(fmt::format("temp_recovering_{}_{}_{}_{}",
+ pg.get_info().pgid,
+ version,
+ pg.get_info().history.same_interval_since,
+ target.snap));
+ logger().debug("{} {}", __func__, hoid);
+ return hoid;
+}
+
+void RecoveryBackend::clean_up(ceph::os::Transaction& t,
+ std::string_view why)
+{
+ for (auto& soid : temp_contents) {
+ t.remove(pg.get_collection_ref()->get_cid(),
+ ghobject_t(soid, ghobject_t::NO_GEN, pg.get_pg_whoami().shard));
+ }
+ temp_contents.clear();
+
+ for (auto& [soid, recovery_waiter] : recovering) {
+ if ((recovery_waiter->pull_info
+ && recovery_waiter->pull_info->is_complete())
+ || (!recovery_waiter->pull_info
+ && recovery_waiter->obc && recovery_waiter->obc->obs.exists)) {
+ recovery_waiter->obc->interrupt(
+ ::crimson::common::actingset_changed(
+ pg.is_primary()));
+ recovery_waiter->interrupt(why);
+ }
+ }
+ recovering.clear();
+}
+
+void RecoveryBackend::WaitForObjectRecovery::stop() {
+ readable.set_exception(
+ crimson::common::system_shutdown_exception());
+ recovered.set_exception(
+ crimson::common::system_shutdown_exception());
+ pulled.set_exception(
+ crimson::common::system_shutdown_exception());
+ for (auto& [pg_shard, pr] : pushes) {
+ pr.set_exception(
+ crimson::common::system_shutdown_exception());
+ }
+}
+
+void RecoveryBackend::handle_backfill_finish(
+ MOSDPGBackfill& m,
+ crimson::net::ConnectionRef conn)
+{
+ logger().debug("{}", __func__);
+ ceph_assert(!pg.is_primary());
+ ceph_assert(crimson::common::local_conf()->osd_kill_backfill_at != 1);
+ auto reply = crimson::make_message<MOSDPGBackfill>(
+ MOSDPGBackfill::OP_BACKFILL_FINISH_ACK,
+ pg.get_osdmap_epoch(),
+ m.query_epoch,
+ spg_t(pg.get_pgid().pgid, pg.get_primary().shard));
+ reply->set_priority(pg.get_recovery_op_priority());
+ std::ignore = conn->send(std::move(reply));
+ shard_services.start_operation<crimson::osd::LocalPeeringEvent>(
+ static_cast<crimson::osd::PG*>(&pg),
+ pg.get_pg_whoami(),
+ pg.get_pgid(),
+ pg.get_osdmap_epoch(),
+ pg.get_osdmap_epoch(),
+ RecoveryDone{});
+}
+
+RecoveryBackend::interruptible_future<>
+RecoveryBackend::handle_backfill_progress(
+ MOSDPGBackfill& m)
+{
+ logger().debug("{}", __func__);
+ ceph_assert(!pg.is_primary());
+ ceph_assert(crimson::common::local_conf()->osd_kill_backfill_at != 2);
+
+ ObjectStore::Transaction t;
+ pg.get_peering_state().update_backfill_progress(
+ m.last_backfill,
+ m.stats,
+ m.op == MOSDPGBackfill::OP_BACKFILL_PROGRESS,
+ t);
+ logger().debug("RecoveryBackend::handle_backfill_progress: do_transaction...");
+ return shard_services.get_store().do_transaction(
+ pg.get_collection_ref(), std::move(t)).or_terminate();
+}
+
+RecoveryBackend::interruptible_future<>
+RecoveryBackend::handle_backfill_finish_ack(
+ MOSDPGBackfill& m)
+{
+ logger().debug("{}", __func__);
+ ceph_assert(pg.is_primary());
+ ceph_assert(crimson::common::local_conf()->osd_kill_backfill_at != 3);
+ // TODO:
+ // finish_recovery_op(hobject_t::get_max());
+ return seastar::now();
+}
+
+RecoveryBackend::interruptible_future<>
+RecoveryBackend::handle_backfill(
+ MOSDPGBackfill& m,
+ crimson::net::ConnectionRef conn)
+{
+ logger().debug("{}", __func__);
+ if (pg.old_peering_msg(m.map_epoch, m.query_epoch)) {
+ logger().debug("{}: discarding {}", __func__, m);
+ return seastar::now();
+ }
+ switch (m.op) {
+ case MOSDPGBackfill::OP_BACKFILL_FINISH:
+ handle_backfill_finish(m, conn);
+ [[fallthrough]];
+ case MOSDPGBackfill::OP_BACKFILL_PROGRESS:
+ return handle_backfill_progress(m);
+ case MOSDPGBackfill::OP_BACKFILL_FINISH_ACK:
+ return handle_backfill_finish_ack(m);
+ default:
+ ceph_assert("unknown op type for pg backfill");
+ return seastar::now();
+ }
+}
+
+RecoveryBackend::interruptible_future<>
+RecoveryBackend::handle_backfill_remove(
+ MOSDPGBackfillRemove& m)
+{
+ logger().debug("{} m.ls={}", __func__, m.ls);
+ assert(m.get_type() == MSG_OSD_PG_BACKFILL_REMOVE);
+ if (pg.can_discard_replica_op(m)) {
+ logger().debug("{}: discarding {}", __func__, m);
+ return seastar::now();
+ }
+ ObjectStore::Transaction t;
+ for ([[maybe_unused]] const auto& [soid, ver] : m.ls) {
+ // TODO: the reserved space management. PG::try_reserve_recovery_space().
+ t.remove(pg.get_collection_ref()->get_cid(),
+ ghobject_t(soid, ghobject_t::NO_GEN, pg.get_pg_whoami().shard));
+ }
+ logger().debug("RecoveryBackend::handle_backfill_remove: do_transaction...");
+ return shard_services.get_store().do_transaction(
+ pg.get_collection_ref(), std::move(t)).or_terminate();
+}
+
+RecoveryBackend::interruptible_future<BackfillInterval>
+RecoveryBackend::scan_for_backfill(
+ const hobject_t& start,
+ [[maybe_unused]] const std::int64_t min,
+ const std::int64_t max)
+{
+ logger().debug("{} starting from {}", __func__, start);
+ auto version_map = seastar::make_lw_shared<std::map<hobject_t, eversion_t>>();
+ return backend->list_objects(start, max).then_interruptible(
+ [this, start, version_map] (auto&& ret) {
+ auto&& [objects, next] = std::move(ret);
+ return seastar::do_with(
+ std::move(objects),
+ [this, version_map](auto &objects) {
+ return interruptor::parallel_for_each(objects,
+ [this, version_map] (const hobject_t& object)
+ -> interruptible_future<> {
+ crimson::osd::ObjectContextRef obc;
+ if (pg.is_primary()) {
+ obc = pg.obc_registry.maybe_get_cached_obc(object);
+ }
+ if (obc) {
+ if (obc->obs.exists) {
+ logger().debug("scan_for_backfill found (primary): {} {}",
+ object, obc->obs.oi.version);
+ version_map->emplace(object, obc->obs.oi.version);
+ } else {
+ // if the object does not exist here, it must have been removed
+ // between the collection_list_partial and here. This can happen
+ // for the first item in the range, which is usually last_backfill.
+ }
+ return seastar::now();
+ } else {
+ return backend->load_metadata(object).safe_then_interruptible(
+ [version_map, object] (auto md) {
+ if (md->os.exists) {
+ logger().debug("scan_for_backfill found: {} {}",
+ object, md->os.oi.version);
+ version_map->emplace(object, md->os.oi.version);
+ }
+ return seastar::now();
+ }, PGBackend::load_metadata_ertr::assert_all{});
+ }
+ });
+ }).then_interruptible([version_map, start=std::move(start), next=std::move(next), this] {
+ BackfillInterval bi;
+ bi.begin = std::move(start);
+ bi.end = std::move(next);
+ bi.version = pg.get_info().last_update;
+ bi.objects = std::move(*version_map);
+ logger().debug("{} BackfillInterval filled, leaving",
+ "scan_for_backfill");
+ return seastar::make_ready_future<BackfillInterval>(std::move(bi));
+ });
+ });
+}
+
+RecoveryBackend::interruptible_future<>
+RecoveryBackend::handle_scan_get_digest(
+ MOSDPGScan& m,
+ crimson::net::ConnectionRef conn)
+{
+ logger().debug("{}", __func__);
+ if (false /* FIXME: check for backfill too full */) {
+ std::ignore = shard_services.start_operation<crimson::osd::LocalPeeringEvent>(
+ // TODO: abstract start_background_recovery
+ static_cast<crimson::osd::PG*>(&pg),
+ pg.get_pg_whoami(),
+ pg.get_pgid(),
+ pg.get_osdmap_epoch(),
+ pg.get_osdmap_epoch(),
+ PeeringState::BackfillTooFull());
+ return seastar::now();
+ }
+ return scan_for_backfill(
+ std::move(m.begin),
+ crimson::common::local_conf().get_val<std::int64_t>("osd_backfill_scan_min"),
+ crimson::common::local_conf().get_val<std::int64_t>("osd_backfill_scan_max")
+ ).then_interruptible(
+ [this, query_epoch=m.query_epoch, conn
+ ](auto backfill_interval) {
+ auto reply = crimson::make_message<MOSDPGScan>(
+ MOSDPGScan::OP_SCAN_DIGEST,
+ pg.get_pg_whoami(),
+ pg.get_osdmap_epoch(),
+ query_epoch,
+ spg_t(pg.get_info().pgid.pgid, pg.get_primary().shard),
+ backfill_interval.begin,
+ backfill_interval.end);
+ encode(backfill_interval.objects, reply->get_data());
+ return conn->send(std::move(reply));
+ });
+}
+
+RecoveryBackend::interruptible_future<>
+RecoveryBackend::handle_scan_digest(
+ MOSDPGScan& m)
+{
+ logger().debug("{}", __func__);
+ // Check that from is in backfill_targets vector
+ ceph_assert(pg.is_backfill_target(m.from));
+
+ BackfillInterval bi;
+ bi.begin = m.begin;
+ bi.end = m.end;
+ {
+ auto p = m.get_data().cbegin();
+ // take care to preserve ordering!
+ bi.clear_objects();
+ ::decode_noclear(bi.objects, p);
+ }
+ shard_services.start_operation<crimson::osd::BackfillRecovery>(
+ static_cast<crimson::osd::PG*>(&pg),
+ shard_services,
+ pg.get_osdmap_epoch(),
+ crimson::osd::BackfillState::ReplicaScanned{ m.from, std::move(bi) });
+ return seastar::now();
+}
+
+RecoveryBackend::interruptible_future<>
+RecoveryBackend::handle_scan(
+ MOSDPGScan& m,
+ crimson::net::ConnectionRef conn)
+{
+ logger().debug("{}", __func__);
+ if (pg.old_peering_msg(m.map_epoch, m.query_epoch)) {
+ logger().debug("{}: discarding {}", __func__, m);
+ return seastar::now();
+ }
+ switch (m.op) {
+ case MOSDPGScan::OP_SCAN_GET_DIGEST:
+ return handle_scan_get_digest(m, conn);
+ case MOSDPGScan::OP_SCAN_DIGEST:
+ return handle_scan_digest(m);
+ default:
+ // FIXME: move to errorator
+ ceph_assert("unknown op type for pg scan");
+ return seastar::now();
+ }
+}
+
+RecoveryBackend::interruptible_future<>
+RecoveryBackend::handle_recovery_op(
+ Ref<MOSDFastDispatchOp> m,
+ crimson::net::ConnectionRef conn)
+{
+ switch (m->get_header().type) {
+ case MSG_OSD_PG_BACKFILL:
+ return handle_backfill(*boost::static_pointer_cast<MOSDPGBackfill>(m), conn);
+ case MSG_OSD_PG_BACKFILL_REMOVE:
+ return handle_backfill_remove(*boost::static_pointer_cast<MOSDPGBackfillRemove>(m));
+ case MSG_OSD_PG_SCAN:
+ return handle_scan(*boost::static_pointer_cast<MOSDPGScan>(m), conn);
+ default:
+ return seastar::make_exception_future<>(
+ std::invalid_argument(fmt::format("invalid request type: {}",
+ m->get_header().type)));
+ }
+}
diff --git a/src/crimson/osd/recovery_backend.h b/src/crimson/osd/recovery_backend.h
new file mode 100644
index 000000000..65e9bb01f
--- /dev/null
+++ b/src/crimson/osd/recovery_backend.h
@@ -0,0 +1,233 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <seastar/core/future.hh>
+
+#include "crimson/common/type_helpers.h"
+#include "crimson/os/futurized_store.h"
+#include "crimson/os/futurized_collection.h"
+#include "crimson/osd/pg_interval_interrupt_condition.h"
+#include "crimson/osd/object_context.h"
+#include "crimson/osd/shard_services.h"
+
+#include "messages/MOSDPGBackfill.h"
+#include "messages/MOSDPGBackfillRemove.h"
+#include "messages/MOSDPGScan.h"
+#include "osd/recovery_types.h"
+#include "osd/osd_types.h"
+
+namespace crimson::osd {
+ class PG;
+}
+
+class PGBackend;
+
+class RecoveryBackend {
+public:
+ class WaitForObjectRecovery;
+public:
+ template <typename T = void>
+ using interruptible_future =
+ ::crimson::interruptible::interruptible_future<
+ ::crimson::osd::IOInterruptCondition, T>;
+ using interruptor =
+ ::crimson::interruptible::interruptor<
+ ::crimson::osd::IOInterruptCondition>;
+ RecoveryBackend(crimson::osd::PG& pg,
+ crimson::osd::ShardServices& shard_services,
+ crimson::os::CollectionRef coll,
+ PGBackend* backend)
+ : pg{pg},
+ shard_services{shard_services},
+ store{&shard_services.get_store()},
+ coll{coll},
+ backend{backend} {}
+ virtual ~RecoveryBackend() {}
+ WaitForObjectRecovery& add_recovering(const hobject_t& soid) {
+ auto [it, added] = recovering.emplace(soid, new WaitForObjectRecovery{});
+ assert(added);
+ return *(it->second);
+ }
+ WaitForObjectRecovery& get_recovering(const hobject_t& soid) {
+ assert(is_recovering(soid));
+ return *(recovering.at(soid));
+ }
+ void remove_recovering(const hobject_t& soid) {
+ recovering.erase(soid);
+ }
+ bool is_recovering(const hobject_t& soid) const {
+ return recovering.count(soid) != 0;
+ }
+ uint64_t total_recovering() const {
+ return recovering.size();
+ }
+
+ virtual interruptible_future<> handle_recovery_op(
+ Ref<MOSDFastDispatchOp> m,
+ crimson::net::ConnectionRef conn);
+
+ virtual interruptible_future<> recover_object(
+ const hobject_t& soid,
+ eversion_t need) = 0;
+ virtual interruptible_future<> recover_delete(
+ const hobject_t& soid,
+ eversion_t need) = 0;
+ virtual interruptible_future<> push_delete(
+ const hobject_t& soid,
+ eversion_t need) = 0;
+
+ interruptible_future<BackfillInterval> scan_for_backfill(
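+  /**
+   * scan_for_backfill
+   *
+   * Lists up to max objects starting at from and returns a BackfillInterval
+   * mapping each existing object to its version (min is currently unused).
+   */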
+ const hobject_t& from,
+ std::int64_t min,
+ std::int64_t max);
+
+ void on_peering_interval_change(ceph::os::Transaction& t) {
+ clean_up(t, "new peering interval");
+ }
+
+ seastar::future<> stop() {
+ for (auto& [soid, recovery_waiter] : recovering) {
+ recovery_waiter->stop();
+ }
+ return on_stop();
+ }
+protected:
+ crimson::osd::PG& pg;
+ crimson::osd::ShardServices& shard_services;
+ crimson::os::FuturizedStore::Shard* store;
+ crimson::os::CollectionRef coll;
+ PGBackend* backend;
+
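+  // pull_info_t tracks an in-flight pull from a single source shard;
+  // push_info_t tracks an in-flight push to a single recovery/backfill target.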
+ struct pull_info_t {
+ pg_shard_t from;
+ hobject_t soid;
+ ObjectRecoveryProgress recovery_progress;
+ ObjectRecoveryInfo recovery_info;
+ crimson::osd::ObjectContextRef head_ctx;
+ crimson::osd::ObjectContextRef obc;
+ object_stat_sum_t stat;
+ bool is_complete() const {
+ return recovery_progress.is_complete(recovery_info);
+ }
+ };
+
+ struct push_info_t {
+ ObjectRecoveryProgress recovery_progress;
+ ObjectRecoveryInfo recovery_info;
+ crimson::osd::ObjectContextRef obc;
+ object_stat_sum_t stat;
+ };
+
+public:
+ class WaitForObjectRecovery :
+ public boost::intrusive_ref_counter<
+ WaitForObjectRecovery, boost::thread_unsafe_counter>,
+ public crimson::BlockerT<WaitForObjectRecovery> {
+ seastar::shared_promise<> readable, recovered, pulled;
+ std::map<pg_shard_t, seastar::shared_promise<>> pushes;
+ public:
+ static constexpr const char* type_name = "WaitForObjectRecovery";
+
+ crimson::osd::ObjectContextRef obc;
+ std::optional<pull_info_t> pull_info;
+ std::map<pg_shard_t, push_info_t> pushing;
+
+ seastar::future<> wait_for_readable() {
+ return readable.get_shared_future();
+ }
+ seastar::future<> wait_for_pushes(pg_shard_t shard) {
+ return pushes[shard].get_shared_future();
+ }
+ seastar::future<> wait_for_recovered() {
+ return recovered.get_shared_future();
+ }
+ template <typename T, typename F>
+ auto wait_track_blocking(T &trigger, F &&fut) {
+ WaitForObjectRecoveryRef ref = this;
+ return track_blocking(
+ trigger,
+ std::forward<F>(fut)
+ ).finally([ref] {});
+ }
+ template <typename T>
+ seastar::future<> wait_for_recovered(T &trigger) {
+ WaitForObjectRecoveryRef ref = this;
+ return wait_track_blocking(trigger, recovered.get_shared_future());
+ }
+ seastar::future<> wait_for_pull() {
+ return pulled.get_shared_future();
+ }
+ void set_readable() {
+ readable.set_value();
+ }
+ void set_recovered() {
+ recovered.set_value();
+ }
+ void set_pushed(pg_shard_t shard) {
+ pushes[shard].set_value();
+ }
+ void set_pulled() {
+ pulled.set_value();
+ }
+ void set_push_failed(pg_shard_t shard, std::exception_ptr e) {
+ pushes.at(shard).set_exception(e);
+ }
+ void interrupt(std::string_view why) {
+ readable.set_exception(std::system_error(
+ std::make_error_code(std::errc::interrupted), why.data()));
+ recovered.set_exception(std::system_error(
+ std::make_error_code(std::errc::interrupted), why.data()));
+ pulled.set_exception(std::system_error(
+ std::make_error_code(std::errc::interrupted), why.data()));
+ for (auto& [pg_shard, pr] : pushes) {
+ pr.set_exception(std::system_error(
+ std::make_error_code(std::errc::interrupted), why.data()));
+ }
+ }
+ void stop();
+ void dump_detail(Formatter* f) const {
+ }
+ };
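+  // Typical flow (illustrative): the recovery path registers a waiter with
+  // add_recovering(soid) and blocks on wait_for_pull()/wait_for_pushes()/
+  // wait_for_recovered(); the message handlers complete it via set_pulled()/
+  // set_pushed()/set_recovered(), or interrupt() it on interval change.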
+ using RecoveryBlockingEvent =
+ crimson::AggregateBlockingEvent<WaitForObjectRecovery::BlockingEvent>;
+ using WaitForObjectRecoveryRef = boost::intrusive_ptr<WaitForObjectRecovery>;
+protected:
+ std::map<hobject_t, WaitForObjectRecoveryRef> recovering;
+ hobject_t get_temp_recovery_object(
+ const hobject_t& target,
+ eversion_t version) const;
+
+ boost::container::flat_set<hobject_t> temp_contents;
+
+ void add_temp_obj(const hobject_t &oid) {
+ temp_contents.insert(oid);
+ }
+ void clear_temp_obj(const hobject_t &oid) {
+ temp_contents.erase(oid);
+ }
+ void clean_up(ceph::os::Transaction& t, std::string_view why);
+ virtual seastar::future<> on_stop() = 0;
+private:
+ void handle_backfill_finish(
+ MOSDPGBackfill& m,
+ crimson::net::ConnectionRef conn);
+ interruptible_future<> handle_backfill_progress(
+ MOSDPGBackfill& m);
+ interruptible_future<> handle_backfill_finish_ack(
+ MOSDPGBackfill& m);
+ interruptible_future<> handle_backfill(
+ MOSDPGBackfill& m,
+ crimson::net::ConnectionRef conn);
+
+ interruptible_future<> handle_scan_get_digest(
+ MOSDPGScan& m,
+ crimson::net::ConnectionRef conn);
+ interruptible_future<> handle_scan_digest(
+ MOSDPGScan& m);
+ interruptible_future<> handle_scan(
+ MOSDPGScan& m,
+ crimson::net::ConnectionRef conn);
+ interruptible_future<> handle_backfill_remove(MOSDPGBackfillRemove& m);
+};
diff --git a/src/crimson/osd/replicated_backend.cc b/src/crimson/osd/replicated_backend.cc
new file mode 100644
index 000000000..0ff4ad573
--- /dev/null
+++ b/src/crimson/osd/replicated_backend.cc
@@ -0,0 +1,174 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "replicated_backend.h"
+
+#include "messages/MOSDRepOpReply.h"
+
+#include "crimson/common/exception.h"
+#include "crimson/common/log.h"
+#include "crimson/os/futurized_store.h"
+#include "crimson/osd/shard_services.h"
+#include "osd/PeeringState.h"
+
+SET_SUBSYS(osd);
+
+ReplicatedBackend::ReplicatedBackend(pg_t pgid,
+ pg_shard_t whoami,
+ ReplicatedBackend::CollectionRef coll,
+ crimson::osd::ShardServices& shard_services,
+ DoutPrefixProvider &dpp)
+ : PGBackend{whoami.shard, coll, shard_services, dpp},
+ pgid{pgid},
+ whoami{whoami}
+{}
+
+ReplicatedBackend::ll_read_ierrorator::future<ceph::bufferlist>
+ReplicatedBackend::_read(const hobject_t& hoid,
+ const uint64_t off,
+ const uint64_t len,
+ const uint32_t flags)
+{
+ return store->read(coll, ghobject_t{hoid}, off, len, flags);
+}
+
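+// _submit_transaction: applies the transaction locally and, in parallel,
+// replicates it to every other shard in pg_shards via MOSDRepOp. The returned
+// pair holds a future for the message sends and a future that resolves once
+// the local apply and all peer acks (tracked in pending_on_t) have completed.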
+ReplicatedBackend::rep_op_fut_t
+ReplicatedBackend::_submit_transaction(std::set<pg_shard_t>&& pg_shards,
+ const hobject_t& hoid,
+ ceph::os::Transaction&& txn,
+ osd_op_params_t&& osd_op_p,
+ epoch_t min_epoch, epoch_t map_epoch,
+ std::vector<pg_log_entry_t>&& log_entries)
+{
+ LOG_PREFIX(ReplicatedBackend::_submit_transaction);
+
+ const ceph_tid_t tid = shard_services.get_tid();
+ auto pending_txn =
+ pending_trans.try_emplace(tid, pg_shards.size(), osd_op_p.at_version).first;
+ bufferlist encoded_txn;
+ encode(txn, encoded_txn);
+
+ DEBUGDPP("object {}", dpp, hoid);
+ auto all_completed = interruptor::make_interruptible(
+ shard_services.get_store().do_transaction(coll, std::move(txn))
+ ).then_interruptible([FNAME, this,
+ peers=pending_txn->second.weak_from_this()] {
+ if (!peers) {
+ // for now, only actingset_changed can cause peers
+ // to be nullptr
+ ERRORDPP("peers is null, this should be impossible", dpp);
+ assert(0 == "impossible");
+ }
+ if (--peers->pending == 0) {
+ peers->all_committed.set_value();
+ peers->all_committed = {};
+ return seastar::now();
+ }
+ return peers->all_committed.get_shared_future();
+ }).then_interruptible([pending_txn, this] {
+ auto acked_peers = std::move(pending_txn->second.acked_peers);
+ pending_trans.erase(pending_txn);
+ return seastar::make_ready_future<crimson::osd::acked_peers_t>(std::move(acked_peers));
+ });
+
+ auto sends = std::make_unique<std::vector<seastar::future<>>>();
+ for (auto pg_shard : pg_shards) {
+ if (pg_shard != whoami) {
+ auto m = crimson::make_message<MOSDRepOp>(
+ osd_op_p.req_id,
+ whoami,
+ spg_t{pgid, pg_shard.shard},
+ hoid,
+ CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK,
+ map_epoch,
+ min_epoch,
+ tid,
+ osd_op_p.at_version);
+ m->set_data(encoded_txn);
+ pending_txn->second.acked_peers.push_back({pg_shard, eversion_t{}});
+ encode(log_entries, m->logbl);
+ m->pg_trim_to = osd_op_p.pg_trim_to;
+ m->min_last_complete_ondisk = osd_op_p.min_last_complete_ondisk;
+ m->set_rollback_to(osd_op_p.at_version);
+ // TODO: set more stuff. e.g., pg_states
+ sends->emplace_back(shard_services.send_to_osd(pg_shard.osd, std::move(m), map_epoch));
+ }
+ }
+ auto sends_complete = seastar::when_all_succeed(
+ sends->begin(), sends->end()
+ ).finally([sends=std::move(sends)] {});
+ return {std::move(sends_complete), std::move(all_completed)};
+}
+
+void ReplicatedBackend::on_actingset_changed(bool same_primary)
+{
+ crimson::common::actingset_changed e_actingset_changed{same_primary};
+ for (auto& [tid, pending_txn] : pending_trans) {
+ pending_txn.all_committed.set_exception(e_actingset_changed);
+ }
+ pending_trans.clear();
+}
+
+void ReplicatedBackend::got_rep_op_reply(const MOSDRepOpReply& reply)
+{
+ LOG_PREFIX(ReplicatedBackend::got_rep_op_reply);
+ auto found = pending_trans.find(reply.get_tid());
+ if (found == pending_trans.end()) {
+ WARNDPP("cannot find rep op for message {}", dpp, reply);
+ return;
+ }
+ auto& peers = found->second;
+ for (auto& peer : peers.acked_peers) {
+ if (peer.shard == reply.from) {
+ peer.last_complete_ondisk = reply.get_last_complete_ondisk();
+ if (--peers.pending == 0) {
+ peers.all_committed.set_value();
+ peers.all_committed = {};
+ }
+ return;
+ }
+ }
+}
+
+seastar::future<> ReplicatedBackend::stop()
+{
+ LOG_PREFIX(ReplicatedBackend::stop);
+ INFODPP("cid {}", coll->get_cid());
+ for (auto& [tid, pending_on] : pending_trans) {
+ pending_on.all_committed.set_exception(
+ crimson::common::system_shutdown_exception());
+ }
+ pending_trans.clear();
+ return seastar::now();
+}
+
+seastar::future<>
+ReplicatedBackend::request_committed(const osd_reqid_t& reqid,
+ const eversion_t& at_version)
+{
+ if (std::empty(pending_trans)) {
+ return seastar::now();
+ }
+ auto iter = pending_trans.begin();
+ auto& pending_txn = iter->second;
+ if (pending_txn.at_version > at_version) {
+ return seastar::now();
+ }
+ for (; iter->second.at_version < at_version; ++iter);
+  // For now, the previous client_request with the same reqid
+  // cannot have finished, as that would mean a later client_request
+  // finished before an earlier one.
+  //
+  // The following line should arguably be "assert(pending_txn.at_version == at_version)",
+  // as there can be only one transaction at any time in pending_trans due to
+  // PG::request_pg_pipeline. However, we may well improve the parallelism here
+  // in the future, which would allow multiple client requests in flight, so the
+  // restriction is loosened to the assert below.
+ assert(iter != pending_trans.end() && iter->second.at_version == at_version);
+ if (iter->second.pending) {
+ return iter->second.all_committed.get_shared_future();
+ } else {
+ return seastar::now();
+ }
+}
diff --git a/src/crimson/osd/replicated_backend.h b/src/crimson/osd/replicated_backend.h
new file mode 100644
index 000000000..f789a35ea
--- /dev/null
+++ b/src/crimson/osd/replicated_backend.h
@@ -0,0 +1,61 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <boost/intrusive_ptr.hpp>
+#include <seastar/core/future.hh>
+#include <seastar/core/weak_ptr.hh>
+#include "include/buffer_fwd.h"
+#include "osd/osd_types.h"
+
+#include "acked_peers.h"
+#include "pg_backend.h"
+
+namespace crimson::osd {
+ class ShardServices;
+}
+
+class ReplicatedBackend : public PGBackend
+{
+public:
+ ReplicatedBackend(pg_t pgid, pg_shard_t whoami,
+ CollectionRef coll,
+ crimson::osd::ShardServices& shard_services,
+ DoutPrefixProvider &dpp);
+ void got_rep_op_reply(const MOSDRepOpReply& reply) final;
+ seastar::future<> stop() final;
+ void on_actingset_changed(bool same_primary) final;
+private:
+ ll_read_ierrorator::future<ceph::bufferlist>
+ _read(const hobject_t& hoid, uint64_t off,
+ uint64_t len, uint32_t flags) override;
+ rep_op_fut_t _submit_transaction(std::set<pg_shard_t>&& pg_shards,
+ const hobject_t& hoid,
+ ceph::os::Transaction&& txn,
+ osd_op_params_t&& osd_op_p,
+ epoch_t min_epoch, epoch_t max_epoch,
+ std::vector<pg_log_entry_t>&& log_entries) final;
+ const pg_t pgid;
+ const pg_shard_t whoami;
+ class pending_on_t : public seastar::weakly_referencable<pending_on_t> {
+ public:
+ pending_on_t(size_t pending, const eversion_t& at_version)
+ : pending{static_cast<unsigned>(pending)}, at_version(at_version)
+ {}
+ unsigned pending;
+    // The order of pending_txns' at_version must be the same as that of their
+    // corresponding ceph_tid_t, as we rely on this condition for checking
+    // whether a client request is already completed. To put it another
+    // way, a client request's at_version must be updated synchronously with
+    // its ceph_tid_t.
+ const eversion_t at_version;
+ crimson::osd::acked_peers_t acked_peers;
+ seastar::shared_promise<> all_committed;
+ };
+ using pending_transactions_t = std::map<ceph_tid_t, pending_on_t>;
+ pending_transactions_t pending_trans;
+
+ seastar::future<> request_committed(
+ const osd_reqid_t& reqid, const eversion_t& at_version) final;
+};
diff --git a/src/crimson/osd/replicated_recovery_backend.cc b/src/crimson/osd/replicated_recovery_backend.cc
new file mode 100644
index 000000000..bd301cc2b
--- /dev/null
+++ b/src/crimson/osd/replicated_recovery_backend.cc
@@ -0,0 +1,1182 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab expandtab
+
+#include <fmt/format.h>
+#include <fmt/ostream.h>
+#include <seastar/core/future.hh>
+#include <seastar/core/do_with.hh>
+
+#include "crimson/osd/pg.h"
+#include "crimson/osd/pg_backend.h"
+#include "osd/osd_types_fmt.h"
+#include "replicated_recovery_backend.h"
+#include "msg/Message.h"
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_osd);
+ }
+}
+
+using std::less;
+using std::map;
+using std::string;
+
+RecoveryBackend::interruptible_future<>
+ReplicatedRecoveryBackend::recover_object(
+ const hobject_t& soid,
+ eversion_t need)
+{
+ logger().debug("{}: {}, {}", __func__, soid, need);
+ // always add_recovering(soid) before recover_object(soid)
+ assert(is_recovering(soid));
+ // start tracking the recovery of soid
+ return maybe_pull_missing_obj(soid, need).then_interruptible([this, soid, need] {
+ logger().debug("recover_object: loading obc: {}", soid);
+ return pg.obc_loader.with_obc<RWState::RWREAD>(soid,
+ [this, soid, need](auto obc) {
+ logger().debug("recover_object: loaded obc: {}", obc->obs.oi.soid);
+ auto& recovery_waiter = get_recovering(soid);
+ recovery_waiter.obc = obc;
+ recovery_waiter.obc->wait_recovery_read();
+ return maybe_push_shards(soid, need);
+ }).handle_error_interruptible(
+ crimson::osd::PG::load_obc_ertr::all_same_way([soid](auto& code) {
+ // TODO: may need eio handling?
+ logger().error("recover_object saw error code {}, ignoring object {}",
+ code, soid);
+ }));
+ });
+}
+
+RecoveryBackend::interruptible_future<>
+ReplicatedRecoveryBackend::maybe_push_shards(
+ const hobject_t& soid,
+ eversion_t need)
+{
+ return seastar::do_with(
+ get_shards_to_push(soid),
+ [this, need, soid](auto &shards) {
+ return interruptor::parallel_for_each(
+ shards,
+ [this, need, soid](auto shard) {
+ return prep_push(soid, need, shard).then_interruptible([this, soid, shard](auto push) {
+ auto msg = crimson::make_message<MOSDPGPush>();
+ msg->from = pg.get_pg_whoami();
+ msg->pgid = pg.get_pgid();
+ msg->map_epoch = pg.get_osdmap_epoch();
+ msg->min_epoch = pg.get_last_peering_reset();
+ msg->pushes.push_back(std::move(push));
+ msg->set_priority(pg.get_recovery_op_priority());
+ return interruptor::make_interruptible(
+ shard_services.send_to_osd(shard.osd,
+ std::move(msg),
+ pg.get_osdmap_epoch()))
+ .then_interruptible(
+ [this, soid, shard] {
+ return get_recovering(soid).wait_for_pushes(shard);
+ });
+ });
+ });
+ }).then_interruptible([this, soid] {
+ auto &recovery = get_recovering(soid);
+ if (auto push_info = recovery.pushing.begin();
+ push_info != recovery.pushing.end()) {
+ pg.get_recovery_handler()->on_global_recover(soid,
+ push_info->second.stat,
+ false);
+ } else if (recovery.pull_info) {
+ // no push happened (empty get_shards_to_push()) but pull actually did
+ pg.get_recovery_handler()->on_global_recover(soid,
+ recovery.pull_info->stat,
+ false);
+ } else {
+ // no pulls, no pushes
+ }
+ return seastar::make_ready_future<>();
+ }).handle_exception_interruptible([this, soid](auto e) {
+ auto &recovery = get_recovering(soid);
+ if (recovery.obc) {
+ recovery.obc->drop_recovery_read();
+ }
+ recovering.erase(soid);
+ return seastar::make_exception_future<>(e);
+ });
+}
+
+RecoveryBackend::interruptible_future<>
+ReplicatedRecoveryBackend::maybe_pull_missing_obj(
+ const hobject_t& soid,
+ eversion_t need)
+{
+ pg_missing_tracker_t local_missing = pg.get_local_missing();
+ if (!local_missing.is_missing(soid)) {
+ return seastar::make_ready_future<>();
+ }
+ PullOp pull_op;
+ auto& recovery_waiter = get_recovering(soid);
+ recovery_waiter.pull_info =
+ std::make_optional<RecoveryBackend::pull_info_t>();
+ auto& pull_info = *recovery_waiter.pull_info;
+ prepare_pull(pull_op, pull_info, soid, need);
+ auto msg = crimson::make_message<MOSDPGPull>();
+ msg->from = pg.get_pg_whoami();
+ msg->set_priority(pg.get_recovery_op_priority());
+ msg->pgid = pg.get_pgid();
+ msg->map_epoch = pg.get_osdmap_epoch();
+ msg->min_epoch = pg.get_last_peering_reset();
+ msg->set_pulls({std::move(pull_op)});
+ return interruptor::make_interruptible(
+ shard_services.send_to_osd(
+ pull_info.from.osd,
+ std::move(msg),
+ pg.get_osdmap_epoch()
+ )).then_interruptible([&recovery_waiter] {
+ return recovery_waiter.wait_for_pull();
+ });
+}
+
+RecoveryBackend::interruptible_future<>
+ReplicatedRecoveryBackend::push_delete(
+ const hobject_t& soid,
+ eversion_t need)
+{
+ logger().debug("{}: {}, {}", __func__, soid, need);
+ epoch_t min_epoch = pg.get_last_peering_reset();
+
+ assert(pg.get_acting_recovery_backfill().size() > 0);
+ return interruptor::parallel_for_each(pg.get_acting_recovery_backfill(),
+ [this, soid, need, min_epoch](pg_shard_t shard)
+ -> interruptible_future<> {
+ if (shard == pg.get_pg_whoami())
+ return seastar::make_ready_future<>();
+ auto iter = pg.get_shard_missing().find(shard);
+ if (iter == pg.get_shard_missing().end())
+ return seastar::make_ready_future<>();
+ if (iter->second.is_missing(soid)) {
+ logger().debug("push_delete: will remove {} from {}", soid, shard);
+ pg.begin_peer_recover(shard, soid);
+ spg_t target_pg(pg.get_info().pgid.pgid, shard.shard);
+ auto msg = crimson::make_message<MOSDPGRecoveryDelete>(
+ pg.get_pg_whoami(), target_pg, pg.get_osdmap_epoch(), min_epoch);
+ msg->set_priority(pg.get_recovery_op_priority());
+ msg->objects.push_back(std::make_pair(soid, need));
+ return interruptor::make_interruptible(
+ shard_services.send_to_osd(shard.osd, std::move(msg),
+ pg.get_osdmap_epoch())).then_interruptible(
+ [this, soid, shard] {
+ return get_recovering(soid).wait_for_pushes(shard);
+ });
+ }
+ return seastar::make_ready_future<>();
+ });
+}
+
+RecoveryBackend::interruptible_future<>
+ReplicatedRecoveryBackend::handle_recovery_delete(
+ Ref<MOSDPGRecoveryDelete> m)
+{
+ logger().debug("{}: {}", __func__, *m);
+
+ auto& p = m->objects.front(); //TODO: only one delete per message for now.
+ return local_recover_delete(p.first, p.second, pg.get_osdmap_epoch())
+ .then_interruptible(
+ [this, m] {
+ auto reply = crimson::make_message<MOSDPGRecoveryDeleteReply>();
+ reply->from = pg.get_pg_whoami();
+ reply->set_priority(m->get_priority());
+ reply->pgid = spg_t(pg.get_info().pgid.pgid, m->from.shard);
+ reply->map_epoch = m->map_epoch;
+ reply->min_epoch = m->min_epoch;
+ reply->objects = m->objects;
+ return shard_services.send_to_osd(m->from.osd, std::move(reply), pg.get_osdmap_epoch());
+ });
+}
+
+RecoveryBackend::interruptible_future<>
+ReplicatedRecoveryBackend::on_local_recover_persist(
+ const hobject_t& soid,
+ const ObjectRecoveryInfo& _recovery_info,
+ bool is_delete,
+ epoch_t epoch_frozen)
+{
+ logger().debug("{}", __func__);
+ ceph::os::Transaction t;
+ pg.get_recovery_handler()->on_local_recover(soid, _recovery_info, is_delete, t);
+ logger().debug("ReplicatedRecoveryBackend::on_local_recover_persist: do_transaction...");
+ return interruptor::make_interruptible(
+ shard_services.get_store().do_transaction(coll, std::move(t)))
+ .then_interruptible(
+ [this, epoch_frozen, last_complete = pg.get_info().last_complete] {
+ pg.get_recovery_handler()->_committed_pushed_object(epoch_frozen, last_complete);
+ return seastar::make_ready_future<>();
+ });
+}
+
+RecoveryBackend::interruptible_future<>
+ReplicatedRecoveryBackend::local_recover_delete(
+ const hobject_t& soid,
+ eversion_t need,
+ epoch_t epoch_to_freeze)
+{
+ logger().debug("{}: {}, {}", __func__, soid, need);
+ return backend->load_metadata(soid).safe_then_interruptible([this]
+ (auto lomt) -> interruptible_future<> {
+ if (lomt->os.exists) {
+ return seastar::do_with(ceph::os::Transaction(),
+ [this, lomt = std::move(lomt)](auto& txn) {
+ return backend->remove(lomt->os, txn).then_interruptible(
+ [this, &txn]() mutable {
+ logger().debug("ReplicatedRecoveryBackend::local_recover_delete: do_transaction...");
+ return shard_services.get_store().do_transaction(coll,
+ std::move(txn));
+ });
+ });
+ }
+ return seastar::make_ready_future<>();
+ }).safe_then_interruptible([this, soid, epoch_to_freeze, need] {
+ ObjectRecoveryInfo recovery_info;
+ recovery_info.soid = soid;
+ recovery_info.version = need;
+ return on_local_recover_persist(soid, recovery_info,
+ true, epoch_to_freeze);
+ }, PGBackend::load_metadata_ertr::all_same_way(
+ [this, soid, epoch_to_freeze, need] (auto e) {
+ ObjectRecoveryInfo recovery_info;
+ recovery_info.soid = soid;
+ recovery_info.version = need;
+ return on_local_recover_persist(soid, recovery_info,
+ true, epoch_to_freeze);
+ })
+ );
+}
+
+RecoveryBackend::interruptible_future<>
+ReplicatedRecoveryBackend::recover_delete(
+ const hobject_t &soid, eversion_t need)
+{
+ logger().debug("{}: {}, {}", __func__, soid, need);
+
+ epoch_t cur_epoch = pg.get_osdmap_epoch();
+ return seastar::do_with(object_stat_sum_t(),
+ [this, soid, need, cur_epoch](auto& stat_diff) {
+ return local_recover_delete(soid, need, cur_epoch).then_interruptible(
+ [this, &stat_diff, cur_epoch, soid, need]()
+ -> interruptible_future<> {
+ if (!pg.has_reset_since(cur_epoch)) {
+ bool object_missing = false;
+ for (const auto& shard : pg.get_acting_recovery_backfill()) {
+ if (shard == pg.get_pg_whoami())
+ continue;
+ if (pg.get_shard_missing(shard)->is_missing(soid)) {
+	      logger().debug("recover_delete: soid {} needs to be deleted from replica {}",
+ soid, shard);
+ object_missing = true;
+ break;
+ }
+ }
+
+ if (!object_missing) {
+ stat_diff.num_objects_recovered = 1;
+ return seastar::make_ready_future<>();
+ } else {
+ return push_delete(soid, need);
+ }
+ }
+ return seastar::make_ready_future<>();
+ }).then_interruptible([this, soid, &stat_diff] {
+ pg.get_recovery_handler()->on_global_recover(soid, stat_diff, true);
+ return seastar::make_ready_future<>();
+ });
+ });
+}
+
+RecoveryBackend::interruptible_future<PushOp>
+ReplicatedRecoveryBackend::prep_push(
+ const hobject_t& soid,
+ eversion_t need,
+ pg_shard_t pg_shard)
+{
+ logger().debug("{}: {}, {}", __func__, soid, need);
+
+ auto& recovery_waiter = get_recovering(soid);
+ auto& obc = recovery_waiter.obc;
+ interval_set<uint64_t> data_subset;
+ if (obc->obs.oi.size) {
+ data_subset.insert(0, obc->obs.oi.size);
+ }
+ const auto& missing = pg.get_shard_missing().find(pg_shard)->second;
+ const auto it = missing.get_items().find(soid);
+ assert(it != missing.get_items().end());
+ data_subset.intersection_of(it->second.clean_regions.get_dirty_regions());
+ logger().debug("prep_push: {} data_subset {} to {}",
+ soid, data_subset, pg_shard);
+
+ auto& push_info = recovery_waiter.pushing[pg_shard];
+ pg.begin_peer_recover(pg_shard, soid);
+ const auto pmissing_iter = pg.get_shard_missing().find(pg_shard);
+ const auto missing_iter = pmissing_iter->second.get_items().find(soid);
+ assert(missing_iter != pmissing_iter->second.get_items().end());
+
+ push_info.obc = obc;
+ push_info.recovery_info.size = obc->obs.oi.size;
+ push_info.recovery_info.copy_subset = data_subset;
+ push_info.recovery_info.soid = soid;
+ push_info.recovery_info.oi = obc->obs.oi;
+ push_info.recovery_info.version = obc->obs.oi.version;
+ push_info.recovery_info.object_exist =
+ missing_iter->second.clean_regions.object_is_exist();
+ push_info.recovery_progress.omap_complete =
+ !missing_iter->second.clean_regions.omap_is_dirty();
+
+ return build_push_op(push_info.recovery_info,
+ push_info.recovery_progress,
+ &push_info.stat).then_interruptible(
+ [this, soid, pg_shard](auto push_op) {
+ auto& recovery_waiter = get_recovering(soid);
+ auto& push_info = recovery_waiter.pushing[pg_shard];
+ push_info.recovery_progress = push_op.after_progress;
+ return push_op;
+ });
+}
+
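+// Fill a PullOp (and the matching pull_info) for fetching soid from one of
+// the shards known to hold it; copy_subset is narrowed to the locally dirty
+// regions.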
+void ReplicatedRecoveryBackend::prepare_pull(PullOp& pull_op,
+ pull_info_t& pull_info,
+ const hobject_t& soid,
+ eversion_t need) {
+ logger().debug("{}: {}, {}", __func__, soid, need);
+
+ pg_missing_tracker_t local_missing = pg.get_local_missing();
+ const auto missing_iter = local_missing.get_items().find(soid);
+ auto m = pg.get_missing_loc_shards();
+ pg_shard_t fromshard = *(m[soid].begin());
+
+ //TODO: skipped snap objects case for now
+ pull_op.recovery_info.copy_subset.insert(0, (uint64_t) -1);
+ pull_op.recovery_info.copy_subset.intersection_of(
+ missing_iter->second.clean_regions.get_dirty_regions());
+ pull_op.recovery_info.size = ((uint64_t) -1);
+ pull_op.recovery_info.object_exist =
+ missing_iter->second.clean_regions.object_is_exist();
+ pull_op.recovery_info.soid = soid;
+ pull_op.soid = soid;
+ pull_op.recovery_progress.data_complete = false;
+ pull_op.recovery_progress.omap_complete =
+ !missing_iter->second.clean_regions.omap_is_dirty();
+ pull_op.recovery_progress.data_recovered_to = 0;
+ pull_op.recovery_progress.first = true;
+
+ pull_info.from = fromshard;
+ pull_info.soid = soid;
+ pull_info.recovery_info = pull_op.recovery_info;
+ pull_info.recovery_progress = pull_op.recovery_progress;
+}
+
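+// Assemble one push chunk: read the object metadata on the first chunk, then
+// omap entries and object data up to the osd_recovery_max_chunk budget, and
+// record the progress needed to resume from this point in the next chunk.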
+RecoveryBackend::interruptible_future<PushOp>
+ReplicatedRecoveryBackend::build_push_op(
+ const ObjectRecoveryInfo& recovery_info,
+ const ObjectRecoveryProgress& progress,
+ object_stat_sum_t* stat)
+{
+ logger().debug("{} {} @{}",
+ __func__, recovery_info.soid, recovery_info.version);
+ return seastar::do_with(ObjectRecoveryProgress(progress),
+ uint64_t(crimson::common::local_conf()
+ ->osd_recovery_max_chunk),
+ recovery_info.version,
+ PushOp(),
+ [this, &recovery_info, &progress, stat]
+ (auto& new_progress, auto& available, auto& v, auto& push_op) {
+ return read_metadata_for_push_op(recovery_info.soid,
+ progress, new_progress,
+ v, &push_op
+ ).then_interruptible([&](eversion_t local_ver) mutable {
+ // If requestor didn't know the version, use ours
+ if (v == eversion_t()) {
+ v = local_ver;
+ } else if (v != local_ver) {
+ logger().error("build_push_op: {} push {} v{} failed because local copy is {}",
+ pg.get_pgid(), recovery_info.soid, recovery_info.version, local_ver);
+ // TODO: bail out
+ }
+ return read_omap_for_push_op(recovery_info.soid,
+ progress,
+ new_progress,
+ available, &push_op);
+ }).then_interruptible([this, &recovery_info, &progress,
+ &available, &push_op]() mutable {
+ logger().debug("build_push_op: available: {}, copy_subset: {}",
+ available, recovery_info.copy_subset);
+ return read_object_for_push_op(recovery_info.soid,
+ recovery_info.copy_subset,
+ progress.data_recovered_to,
+ available, &push_op);
+ }).then_interruptible([&recovery_info, &v, &progress,
+ &new_progress, stat, &push_op]
+ (uint64_t recovered_to) mutable {
+ new_progress.data_recovered_to = recovered_to;
+ if (new_progress.is_complete(recovery_info)) {
+ new_progress.data_complete = true;
+ if (stat)
+ stat->num_objects_recovered++;
+ } else if (progress.first && progress.omap_complete) {
+        // If the omap is unchanged we would normally skip it, but it still
+        // has to be recovered when recovery cannot complete in a single pass
+ new_progress.omap_complete = false;
+ }
+ if (stat) {
+ stat->num_keys_recovered += push_op.omap_entries.size();
+ stat->num_bytes_recovered += push_op.data.length();
+ }
+ push_op.version = v;
+ push_op.soid = recovery_info.soid;
+ push_op.recovery_info = recovery_info;
+ push_op.after_progress = new_progress;
+ push_op.before_progress = progress;
+ logger().debug("build_push_op: push_op version:"
+ " {}, push_op data length: {}",
+ push_op.version, push_op.data.length());
+ return seastar::make_ready_future<PushOp>(std::move(push_op));
+ });
+ });
+}
+
+RecoveryBackend::interruptible_future<eversion_t>
+ReplicatedRecoveryBackend::read_metadata_for_push_op(
+ const hobject_t& oid,
+ const ObjectRecoveryProgress& progress,
+ ObjectRecoveryProgress& new_progress,
+ eversion_t ver,
+ PushOp* push_op)
+{
+ logger().debug("{}, {}", __func__, oid);
+ if (!progress.first) {
+ return seastar::make_ready_future<eversion_t>(ver);
+ }
+ return interruptor::make_interruptible(interruptor::when_all_succeed(
+ backend->omap_get_header(coll, ghobject_t(oid)).handle_error_interruptible<false>(
+ crimson::os::FuturizedStore::Shard::read_errorator::all_same_way(
+ [oid] (const std::error_code& e) {
+ logger().debug("read_metadata_for_push_op, error {} when getting omap header: {}", e, oid);
+ return seastar::make_ready_future<bufferlist>();
+ })),
+ interruptor::make_interruptible(store->get_attrs(coll, ghobject_t(oid)))
+ .handle_error_interruptible<false>(
+ crimson::os::FuturizedStore::Shard::get_attrs_ertr::all_same_way(
+ [oid] (const std::error_code& e) {
+ logger().debug("read_metadata_for_push_op, error {} when getting attrs: {}", e, oid);
+ return seastar::make_ready_future<crimson::os::FuturizedStore::Shard::attrs_t>();
+ }))
+ )).then_unpack_interruptible([&new_progress, push_op](auto bl, auto attrs) {
+ if (bl.length() == 0) {
+      logger().warn("read_metadata_for_push_op: failed to read omap header");
+    } else if (attrs.empty()) {
+      logger().error("read_metadata_for_push_op: failed to read attrs");
+ return eversion_t{};
+ }
+ push_op->omap_header.claim_append(std::move(bl));
+ for (auto&& [key, val] : attrs) {
+ push_op->attrset.emplace(std::move(key), std::move(val));
+ }
+ logger().debug("read_metadata_for_push_op: {}", push_op->attrset[OI_ATTR]);
+ object_info_t oi;
+ oi.decode_no_oid(push_op->attrset[OI_ATTR]);
+ new_progress.first = false;
+ return oi.version;
+ });
+}
+
+RecoveryBackend::interruptible_future<uint64_t>
+ReplicatedRecoveryBackend::read_object_for_push_op(
+ const hobject_t& oid,
+ const interval_set<uint64_t>& copy_subset,
+ uint64_t offset,
+ uint64_t max_len,
+ PushOp* push_op)
+{
+ if (max_len == 0 || copy_subset.empty()) {
+ push_op->data_included.clear();
+ return seastar::make_ready_future<uint64_t>(offset);
+ }
+ // 1. get the extents in the interested range
+ return interruptor::make_interruptible(backend->fiemap(coll, ghobject_t{oid},
+ 0, copy_subset.range_end())).safe_then_interruptible(
+ [=, this](auto&& fiemap_included) mutable {
+ interval_set<uint64_t> extents;
+ try {
+ extents.intersection_of(copy_subset, std::move(fiemap_included));
+ } catch (std::exception &) {
+ // if fiemap() fails, we will read nothing, as the intersection of
+ // copy_subset and an empty interval_set would be empty anyway
+ extents.clear();
+ }
+ // 2. we can read up to "max_len" bytes from "offset", so truncate the
+ // extents down to this quota. no need to return the number of consumed
+ // bytes, as this is the last consumer of this quota
+ push_op->data_included.span_of(extents, offset, max_len);
+ // 3. read the truncated extents
+ // TODO: check if the returned extents are pruned
+ return interruptor::make_interruptible(store->readv(coll, ghobject_t{oid},
+ push_op->data_included, 0));
+ }).safe_then_interruptible([push_op, range_end=copy_subset.range_end()](auto &&bl) {
+ push_op->data.claim_append(std::move(bl));
+ uint64_t recovered_to = 0;
+ if (push_op->data_included.empty()) {
+ // zero filled section, skip to end!
+ recovered_to = range_end;
+ } else {
+ // note down the progress, we will start from there next time
+ recovered_to = push_op->data_included.range_end();
+ }
+ return seastar::make_ready_future<uint64_t>(recovered_to);
+ }, PGBackend::read_errorator::all_same_way([](auto e) {
+    logger().debug("read_object_for_push_op: read exception");
+ return seastar::make_exception_future<uint64_t>(e);
+ }));
+}
+
+static std::optional<std::string> nullopt_if_empty(const std::string& s)
+{
+ return s.empty() ? std::nullopt : std::make_optional(s);
+}
+
+static bool is_too_many_entries_per_chunk(const PushOp* push_op)
+{
+ const uint64_t entries_per_chunk =
+ crimson::common::local_conf()->osd_recovery_max_omap_entries_per_chunk;
+ if (!entries_per_chunk) {
+ // the limit is disabled
+ return false;
+ }
+ return push_op->omap_entries.size() >= entries_per_chunk;
+}
+
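+// Copy omap entries into push_op, resuming after new_progress.omap_recovered_to
+// and stopping once the remaining byte budget (max_len) or the per-chunk entry
+// limit is exhausted, or the whole omap has been read.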
+RecoveryBackend::interruptible_future<>
+ReplicatedRecoveryBackend::read_omap_for_push_op(
+ const hobject_t& oid,
+ const ObjectRecoveryProgress& progress,
+ ObjectRecoveryProgress& new_progress,
+ uint64_t& max_len,
+ PushOp* push_op)
+{
+ if (progress.omap_complete) {
+ return seastar::make_ready_future<>();
+ }
+ return seastar::repeat([&new_progress, &max_len, push_op, &oid, this] {
+ return shard_services.get_store().omap_get_values(
+ coll, ghobject_t{oid}, nullopt_if_empty(new_progress.omap_recovered_to)
+ ).safe_then([&new_progress, &max_len, push_op](const auto& ret) {
+ const auto& [done, kvs] = ret;
+ bool stop = done;
+ // assuming "values.empty() only if done" holds here!
+ for (const auto& [key, value] : kvs) {
+ if (is_too_many_entries_per_chunk(push_op)) {
+ stop = true;
+ break;
+ }
+ if (const uint64_t entry_size = key.size() + value.length();
+ entry_size > max_len) {
+ stop = true;
+ break;
+ } else {
+ max_len -= std::min(max_len, entry_size);
+ }
+ push_op->omap_entries.emplace(key, value);
+ }
+ if (!push_op->omap_entries.empty()) {
+ // we iterate in order
+ new_progress.omap_recovered_to = std::rbegin(push_op->omap_entries)->first;
+ }
+ if (done) {
+ new_progress.omap_complete = true;
+ }
+ return seastar::make_ready_future<seastar::stop_iteration>(
+ stop ? seastar::stop_iteration::yes : seastar::stop_iteration::no
+ );
+ }, crimson::os::FuturizedStore::Shard::read_errorator::assert_all{});
+ });
+}
+
+std::vector<pg_shard_t>
+ReplicatedRecoveryBackend::get_shards_to_push(const hobject_t& soid) const
+{
+ std::vector<pg_shard_t> shards;
+ assert(pg.get_acting_recovery_backfill().size() > 0);
+ for (const auto& peer : pg.get_acting_recovery_backfill()) {
+ if (peer == pg.get_pg_whoami())
+ continue;
+ auto shard_missing =
+ pg.get_shard_missing().find(peer);
+ assert(shard_missing != pg.get_shard_missing().end());
+ if (shard_missing->second.is_missing(soid)) {
+ shards.push_back(shard_missing->first);
+ }
+ }
+ return shards;
+}
+
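+// Replica side of a pull request: for each object asked for, clamp the
+// recovery range to the on-disk size (on the first chunk) and reply with a
+// MOSDPGPush carrying the requested chunk.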
+RecoveryBackend::interruptible_future<>
+ReplicatedRecoveryBackend::handle_pull(Ref<MOSDPGPull> m)
+{
+ logger().debug("{}: {}", __func__, *m);
+ if (pg.can_discard_replica_op(*m)) {
+ logger().debug("{}: discarding {}", __func__, *m);
+ return seastar::now();
+ }
+ return seastar::do_with(m->take_pulls(), [this, from=m->from](auto& pulls) {
+ return interruptor::parallel_for_each(pulls,
+ [this, from](auto& pull_op) {
+ const hobject_t& soid = pull_op.soid;
+ logger().debug("handle_pull: {}", soid);
+ return backend->stat(coll, ghobject_t(soid)).then_interruptible(
+ [this, &pull_op](auto st) {
+ ObjectRecoveryInfo &recovery_info = pull_op.recovery_info;
+ ObjectRecoveryProgress &progress = pull_op.recovery_progress;
+ if (progress.first && recovery_info.size == ((uint64_t) -1)) {
+ // Adjust size and copy_subset
+ recovery_info.size = st.st_size;
+ if (st.st_size) {
+ interval_set<uint64_t> object_range;
+ object_range.insert(0, st.st_size);
+ recovery_info.copy_subset.intersection_of(object_range);
+ } else {
+ recovery_info.copy_subset.clear();
+ }
+ assert(recovery_info.clone_subset.empty());
+ }
+ return build_push_op(recovery_info, progress, 0);
+ }).then_interruptible([this, from](auto push_op) {
+ auto msg = crimson::make_message<MOSDPGPush>();
+ msg->from = pg.get_pg_whoami();
+ msg->pgid = pg.get_pgid();
+ msg->map_epoch = pg.get_osdmap_epoch();
+ msg->min_epoch = pg.get_last_peering_reset();
+ msg->set_priority(pg.get_recovery_op_priority());
+ msg->pushes.push_back(std::move(push_op));
+ return shard_services.send_to_osd(from.osd, std::move(msg),
+ pg.get_osdmap_epoch());
+ });
+ });
+ });
+}
+
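+// Primary side: apply one pushed chunk to the local target object. Fills
+// *response with the follow-up pull if more chunks are needed and resolves to
+// true once the object has been fully pulled.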
+RecoveryBackend::interruptible_future<bool>
+ReplicatedRecoveryBackend::_handle_pull_response(
+ pg_shard_t from,
+ PushOp& push_op,
+ PullOp* response,
+ ceph::os::Transaction* t)
+{
+ logger().debug("handle_pull_response {} {} data.size() is {} data_included: {}",
+ push_op.recovery_info, push_op.after_progress,
+ push_op.data.length(), push_op.data_included);
+
+ const hobject_t &hoid = push_op.soid;
+ auto& recovery_waiter = get_recovering(hoid);
+ auto& pull_info = *recovery_waiter.pull_info;
+ if (pull_info.recovery_info.size == (uint64_t(-1))) {
+ pull_info.recovery_info.size = push_op.recovery_info.size;
+ pull_info.recovery_info.copy_subset.intersection_of(
+ push_op.recovery_info.copy_subset);
+ }
+
+ // If primary doesn't have object info and didn't know version
+ if (pull_info.recovery_info.version == eversion_t())
+ pull_info.recovery_info.version = push_op.version;
+
+ auto prepare_waiter = interruptor::make_interruptible(
+ seastar::make_ready_future<>());
+ if (pull_info.recovery_progress.first) {
+ prepare_waiter = pg.obc_loader.with_obc<RWState::RWNONE>(
+ pull_info.recovery_info.soid,
+ [&pull_info, &recovery_waiter, &push_op](auto obc) {
+ pull_info.obc = obc;
+ recovery_waiter.obc = obc;
+ obc->obs.oi.decode_no_oid(push_op.attrset.at(OI_ATTR), push_op.soid);
+ pull_info.recovery_info.oi = obc->obs.oi;
+ return crimson::osd::PG::load_obc_ertr::now();
+ }).handle_error_interruptible(crimson::ct_error::assert_all{});
+  }
+ return prepare_waiter.then_interruptible(
+ [this, &pull_info, &push_op, t, response]() mutable {
+ const bool first = pull_info.recovery_progress.first;
+ pull_info.recovery_progress = push_op.after_progress;
+ logger().debug("new recovery_info {}, new progress {}",
+ pull_info.recovery_info, pull_info.recovery_progress);
+ interval_set<uint64_t> data_zeros;
+ {
+ uint64_t offset = push_op.before_progress.data_recovered_to;
+ uint64_t length = (push_op.after_progress.data_recovered_to -
+ push_op.before_progress.data_recovered_to);
+ if (length) {
+ data_zeros.insert(offset, length);
+ }
+ }
+ auto [usable_intervals, data] =
+ trim_pushed_data(pull_info.recovery_info.copy_subset,
+ push_op.data_included, push_op.data);
+ bool complete = pull_info.is_complete();
+ bool clear_omap = !push_op.before_progress.omap_complete;
+ return submit_push_data(pull_info.recovery_info,
+ first, complete, clear_omap,
+ std::move(data_zeros), std::move(usable_intervals),
+ std::move(data), std::move(push_op.omap_header),
+ push_op.attrset, std::move(push_op.omap_entries), t)
+ .then_interruptible(
+ [this, response, &pull_info, &push_op, complete,
+ t, bytes_recovered=data.length()] {
+ pull_info.stat.num_keys_recovered += push_op.omap_entries.size();
+ pull_info.stat.num_bytes_recovered += bytes_recovered;
+
+ if (complete) {
+ pull_info.stat.num_objects_recovered++;
+ pg.get_recovery_handler()->on_local_recover(
+ push_op.soid, get_recovering(push_op.soid).pull_info->recovery_info,
+ false, *t);
+ return true;
+ } else {
+ response->soid = push_op.soid;
+ response->recovery_info = pull_info.recovery_info;
+ response->recovery_progress = pull_info.recovery_progress;
+ return false;
+ }
+ });
+ });
+}
+
+RecoveryBackend::interruptible_future<>
+ReplicatedRecoveryBackend::handle_pull_response(
+ Ref<MOSDPGPush> m)
+{
+ if (pg.can_discard_replica_op(*m)) {
+ logger().debug("{}: discarding {}", __func__, *m);
+ return seastar::now();
+ }
+ const PushOp& push_op = m->pushes[0]; //TODO: only one push per message for now.
+ if (push_op.version == eversion_t()) {
+ // replica doesn't have it!
+ pg.get_recovery_handler()->on_failed_recover({ m->from }, push_op.soid,
+ get_recovering(push_op.soid).pull_info->recovery_info.version);
+ return seastar::make_exception_future<>(
+ std::runtime_error(fmt::format(
+ "Error on pushing side {} when pulling obj {}",
+ m->from, push_op.soid)));
+ }
+
+ logger().debug("{}: {}", __func__, *m);
+ return seastar::do_with(PullOp(), [this, m](auto& response) {
+ return seastar::do_with(ceph::os::Transaction(), m.get(),
+ [this, &response](auto& t, auto& m) {
+ pg_shard_t from = m->from;
+ PushOp& push_op = m->pushes[0]; // only one push per message for now
+ return _handle_pull_response(from, push_op, &response, &t
+ ).then_interruptible(
+ [this, &t](bool complete) {
+ epoch_t epoch_frozen = pg.get_osdmap_epoch();
+ logger().debug("ReplicatedRecoveryBackend::handle_pull_response: do_transaction...");
+ return shard_services.get_store().do_transaction(coll, std::move(t))
+ .then([this, epoch_frozen, complete,
+ last_complete = pg.get_info().last_complete] {
+ pg.get_recovery_handler()->_committed_pushed_object(epoch_frozen, last_complete);
+ return seastar::make_ready_future<bool>(complete);
+ });
+ });
+ }).then_interruptible([this, m, &response](bool complete) {
+ if (complete) {
+ auto& push_op = m->pushes[0];
+ get_recovering(push_op.soid).set_pulled();
+ return seastar::make_ready_future<>();
+ } else {
+ auto reply = crimson::make_message<MOSDPGPull>();
+ reply->from = pg.get_pg_whoami();
+ reply->set_priority(m->get_priority());
+ reply->pgid = pg.get_info().pgid;
+ reply->map_epoch = m->map_epoch;
+ reply->min_epoch = m->min_epoch;
+ reply->set_pulls({std::move(response)});
+ return shard_services.send_to_osd(m->from.osd, std::move(reply), pg.get_osdmap_epoch());
+ }
+ });
+ });
+}
+
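+// Replica side: apply one pushed chunk; on the final chunk, let the recovery
+// handler record the object as locally recovered.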
+RecoveryBackend::interruptible_future<>
+ReplicatedRecoveryBackend::_handle_push(
+ pg_shard_t from,
+ PushOp &push_op,
+ PushReplyOp *response,
+ ceph::os::Transaction *t)
+{
+ logger().debug("{}", __func__);
+
+ bool first = push_op.before_progress.first;
+ interval_set<uint64_t> data_zeros;
+ {
+ uint64_t offset = push_op.before_progress.data_recovered_to;
+ uint64_t length = (push_op.after_progress.data_recovered_to -
+ push_op.before_progress.data_recovered_to);
+ if (length) {
+ data_zeros.insert(offset, length);
+ }
+ }
+ bool complete = (push_op.after_progress.data_complete &&
+ push_op.after_progress.omap_complete);
+ bool clear_omap = !push_op.before_progress.omap_complete;
+ response->soid = push_op.recovery_info.soid;
+
+ return submit_push_data(push_op.recovery_info, first, complete, clear_omap,
+ std::move(data_zeros),
+ std::move(push_op.data_included),
+ std::move(push_op.data),
+ std::move(push_op.omap_header),
+ push_op.attrset,
+ std::move(push_op.omap_entries), t)
+ .then_interruptible(
+ [this, complete, &push_op, t] {
+ if (complete) {
+ pg.get_recovery_handler()->on_local_recover(
+ push_op.recovery_info.soid, push_op.recovery_info,
+ false, *t);
+ }
+ });
+}
+
+RecoveryBackend::interruptible_future<>
+ReplicatedRecoveryBackend::handle_push(
+ Ref<MOSDPGPush> m)
+{
+ if (pg.can_discard_replica_op(*m)) {
+ logger().debug("{}: discarding {}", __func__, *m);
+ return seastar::now();
+ }
+ if (pg.is_primary()) {
+ return handle_pull_response(m);
+ }
+
+ logger().debug("{}: {}", __func__, *m);
+ return seastar::do_with(PushReplyOp(), [this, m](auto& response) {
+ PushOp& push_op = m->pushes[0]; // TODO: only one push per message for now
+ return seastar::do_with(ceph::os::Transaction(),
+ [this, m, &push_op, &response](auto& t) {
+ return _handle_push(m->from, push_op, &response, &t).then_interruptible(
+ [this, &t] {
+ epoch_t epoch_frozen = pg.get_osdmap_epoch();
+ logger().debug("ReplicatedRecoveryBackend::handle_push: do_transaction...");
+ return interruptor::make_interruptible(
+ shard_services.get_store().do_transaction(coll, std::move(t))).then_interruptible(
+ [this, epoch_frozen, last_complete = pg.get_info().last_complete] {
+ //TODO: this should be grouped with pg.on_local_recover somehow.
+ pg.get_recovery_handler()->_committed_pushed_object(epoch_frozen, last_complete);
+ });
+ });
+ }).then_interruptible([this, m, &response]() mutable {
+ auto reply = crimson::make_message<MOSDPGPushReply>();
+ reply->from = pg.get_pg_whoami();
+ reply->set_priority(m->get_priority());
+ reply->pgid = pg.get_info().pgid;
+ reply->map_epoch = m->map_epoch;
+ reply->min_epoch = m->min_epoch;
+ std::vector<PushReplyOp> replies = { std::move(response) };
+ reply->replies.swap(replies);
+ return shard_services.send_to_osd(m->from.osd,
+ std::move(reply), pg.get_osdmap_epoch());
+ });
+ });
+}
+
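+// Primary side: a replica acknowledged a pushed chunk. Either build the next
+// chunk for that peer, or, when the push is complete or has failed, mark the
+// peer as pushed.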
+RecoveryBackend::interruptible_future<std::optional<PushOp>>
+ReplicatedRecoveryBackend::_handle_push_reply(
+ pg_shard_t peer,
+ const PushReplyOp &op)
+{
+ const hobject_t& soid = op.soid;
+ logger().debug("{}, soid {}, from {}", __func__, soid, peer);
+ auto recovering_iter = recovering.find(soid);
+ if (recovering_iter == recovering.end()
+ || !recovering_iter->second->pushing.count(peer)) {
+ logger().debug("huh, i wasn't pushing {} to osd.{}", soid, peer);
+ return seastar::make_ready_future<std::optional<PushOp>>();
+ } else {
+ auto& push_info = recovering_iter->second->pushing[peer];
+ bool error = push_info.recovery_progress.error;
+ if (!push_info.recovery_progress.data_complete && !error) {
+ return build_push_op(push_info.recovery_info, push_info.recovery_progress,
+ &push_info.stat
+ ).then_interruptible([&push_info] (auto push_op) {
+ push_info.recovery_progress = push_op.after_progress;
+ return seastar::make_ready_future<std::optional<PushOp>>(
+ std::move(push_op));
+ }).handle_exception_interruptible(
+ [recovering_iter, &push_info, peer] (auto e) {
+ push_info.recovery_progress.error = true;
+ recovering_iter->second->set_push_failed(peer, e);
+ return seastar::make_ready_future<std::optional<PushOp>>();
+ });
+ }
+ if (!error) {
+ pg.get_recovery_handler()->on_peer_recover(peer,
+ soid,
+ push_info.recovery_info);
+ }
+ recovering_iter->second->set_pushed(peer);
+ return seastar::make_ready_future<std::optional<PushOp>>();
+ }
+}
+
+RecoveryBackend::interruptible_future<>
+ReplicatedRecoveryBackend::handle_push_reply(
+ Ref<MOSDPGPushReply> m)
+{
+ logger().debug("{}: {}", __func__, *m);
+ auto from = m->from;
+ auto& push_reply = m->replies[0]; //TODO: only one reply per message
+
+ return _handle_push_reply(from, push_reply).then_interruptible(
+ [this, from](std::optional<PushOp> push_op) {
+ if (push_op) {
+ auto msg = crimson::make_message<MOSDPGPush>();
+ msg->from = pg.get_pg_whoami();
+ msg->pgid = pg.get_pgid();
+ msg->map_epoch = pg.get_osdmap_epoch();
+ msg->min_epoch = pg.get_last_peering_reset();
+ msg->set_priority(pg.get_recovery_op_priority());
+ msg->pushes.push_back(std::move(*push_op));
+ return shard_services.send_to_osd(from.osd,
+ std::move(msg),
+ pg.get_osdmap_epoch());
+ } else {
+ return seastar::make_ready_future<>();
+ }
+ });
+}
+
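+// Drop any received extents (and the corresponding bytes) that fall outside
+// copy_subset, returning the usable intervals together with their data.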
+std::pair<interval_set<uint64_t>,
+ bufferlist>
+ReplicatedRecoveryBackend::trim_pushed_data(
+ const interval_set<uint64_t> &copy_subset,
+ const interval_set<uint64_t> &intervals_received,
+ ceph::bufferlist data_received)
+{
+ logger().debug("{}", __func__);
+  // if everything received lies within what we want, keep it all as-is
+ if (intervals_received.subset_of(copy_subset)) {
+ return {intervals_received, data_received};
+ }
+ // only collect the extents included by copy_subset and intervals_received
+ interval_set<uint64_t> intervals_usable;
+ bufferlist data_usable;
+ intervals_usable.intersection_of(copy_subset, intervals_received);
+ uint64_t have_off = 0;
+ for (auto [have_start, have_len] : intervals_received) {
+ interval_set<uint64_t> want;
+ want.insert(have_start, have_len);
+ want.intersection_of(copy_subset);
+ for (auto [want_start, want_len] : want) {
+ bufferlist sub;
+ uint64_t data_off = have_off + (want_start - have_start);
+ sub.substr_of(data_received, data_off, want_len);
+ data_usable.claim_append(sub);
+ }
+ have_off += have_len;
+ }
+ return {intervals_usable, data_usable};
+}
+
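+// Choose, and on the first chunk prepare, the object the pushed data is
+// written into: the final object when the push completes in a single chunk,
+// otherwise a temp recovery object that is later moved into place.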
+RecoveryBackend::interruptible_future<hobject_t>
+ReplicatedRecoveryBackend::prep_push_target(
+ const ObjectRecoveryInfo& recovery_info,
+ bool first,
+ bool complete,
+ bool clear_omap,
+ ObjectStore::Transaction* t,
+ const map<string, bufferlist, less<>>& attrs,
+ bufferlist&& omap_header)
+{
+ if (!first) {
+ return seastar::make_ready_future<hobject_t>(
+ get_temp_recovery_object(recovery_info.soid,
+ recovery_info.version));
+ }
+
+ ghobject_t target_oid;
+ if (complete) {
+ // overwrite the original object
+ target_oid = ghobject_t(recovery_info.soid);
+ } else {
+ target_oid = ghobject_t(get_temp_recovery_object(recovery_info.soid,
+ recovery_info.version));
+ logger().debug("{}: Adding oid {} in the temp collection",
+ __func__, target_oid);
+ add_temp_obj(target_oid.hobj);
+ }
+ // create a new object
+ if (!complete || !recovery_info.object_exist) {
+ t->remove(coll->get_cid(), target_oid);
+ t->touch(coll->get_cid(), target_oid);
+ object_info_t oi;
+ oi.decode_no_oid(attrs.at(OI_ATTR));
+ t->set_alloc_hint(coll->get_cid(), target_oid,
+ oi.expected_object_size,
+ oi.expected_write_size,
+ oi.alloc_hint_flags);
+ }
+ if (complete) {
+    // when overwriting the original object, drop its xattrs now; updated
+    // ones are set later in submit_push_data()
+    t->rmattrs(coll->get_cid(), target_oid);
+    // if the omap needs updating, clear the previous content first
+ if (clear_omap) {
+ t->omap_clear(coll->get_cid(), target_oid);
+ }
+ }
+ t->truncate(coll->get_cid(), target_oid, recovery_info.size);
+ if (omap_header.length()) {
+ t->omap_setheader(coll->get_cid(), target_oid, omap_header);
+ }
+ if (complete || !recovery_info.object_exist) {
+ return seastar::make_ready_future<hobject_t>(target_oid.hobj);
+ }
+  // when recovering into a new (temp) object, clone the overlapping
+  // content from the existing local object
+ return interruptor::make_interruptible(store->stat(coll, ghobject_t(recovery_info.soid)))
+ .then_interruptible(
+ [this, &recovery_info, t, target_oid] (auto st) {
+ // TODO: pg num bytes counting
+ uint64_t local_size = std::min(recovery_info.size, (uint64_t)st.st_size);
+ interval_set<uint64_t> local_intervals_included, local_intervals_excluded;
+ if (local_size) {
+ local_intervals_included.insert(0, local_size);
+ local_intervals_excluded.intersection_of(local_intervals_included, recovery_info.copy_subset);
+ local_intervals_included.subtract(local_intervals_excluded);
+ }
+ for (auto [off, len] : local_intervals_included) {
+ logger().debug(" clone_range {} {}~{}",
+ recovery_info.soid, off, len);
+ t->clone_range(coll->get_cid(), ghobject_t(recovery_info.soid),
+ target_oid, off, len, off);
+ }
+ return seastar::make_ready_future<hobject_t>(target_oid.hobj);
+ });
+}
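+
+// Write one pushed chunk into the target chosen by prep_push_target(): punch
+// zeros for dirty ranges the pusher had no data for, write the data extents,
+// set omap entries and xattrs, and on the final chunk move the temp object
+// into place and clone any clone_subset overlaps.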
+RecoveryBackend::interruptible_future<>
+ReplicatedRecoveryBackend::submit_push_data(
+ const ObjectRecoveryInfo &recovery_info,
+ bool first,
+ bool complete,
+ bool clear_omap,
+ interval_set<uint64_t>&& data_zeros,
+ interval_set<uint64_t>&& intervals_included,
+ bufferlist&& data_included,
+ bufferlist&& omap_header,
+ const map<string, bufferlist, less<>> &attrs,
+ map<string, bufferlist>&& omap_entries,
+ ObjectStore::Transaction *t)
+{
+ logger().debug("{}", __func__);
+ return prep_push_target(recovery_info, first, complete,
+ clear_omap, t, attrs,
+ std::move(omap_header)).then_interruptible(
+ [this,
+ &recovery_info, t,
+ first, complete,
+ data_zeros=std::move(data_zeros),
+ intervals_included=std::move(intervals_included),
+ data_included=std::move(data_included),
+ omap_entries=std::move(omap_entries),
+ &attrs](auto target_oid) mutable {
+
+ uint32_t fadvise_flags = CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL;
+    // Punch zeros into ranges that fiemap reported as holes but which are
+    // marked dirty in copy_subset
+ if (!data_zeros.empty()) {
+ data_zeros.intersection_of(recovery_info.copy_subset);
+ assert(intervals_included.subset_of(data_zeros));
+ data_zeros.subtract(intervals_included);
+
+ logger().debug("submit_push_data recovering object {} copy_subset: {} "
+ "intervals_included: {} data_zeros: {}",
+ recovery_info.soid, recovery_info.copy_subset,
+ intervals_included, data_zeros);
+
+ for (auto [start, len] : data_zeros) {
+ t->zero(coll->get_cid(), ghobject_t(target_oid), start, len);
+ }
+ }
+ uint64_t off = 0;
+ for (auto [start, len] : intervals_included) {
+ bufferlist bit;
+ bit.substr_of(data_included, off, len);
+ t->write(coll->get_cid(), ghobject_t(target_oid),
+ start, len, bit, fadvise_flags);
+ off += len;
+ }
+
+ if (!omap_entries.empty())
+ t->omap_setkeys(coll->get_cid(), ghobject_t(target_oid), omap_entries);
+ if (!attrs.empty())
+ t->setattrs(coll->get_cid(), ghobject_t(target_oid), attrs);
+
+ if (complete) {
+ if (!first) {
+ logger().debug("submit_push_data: Removing oid {} from the temp collection",
+ target_oid);
+ clear_temp_obj(target_oid);
+ t->remove(coll->get_cid(), ghobject_t(recovery_info.soid));
+ t->collection_move_rename(coll->get_cid(), ghobject_t(target_oid),
+ coll->get_cid(), ghobject_t(recovery_info.soid));
+ }
+ submit_push_complete(recovery_info, t);
+ }
+ logger().debug("submit_push_data: done");
+ return seastar::make_ready_future<>();
+ });
+}
+
+void ReplicatedRecoveryBackend::submit_push_complete(
+ const ObjectRecoveryInfo &recovery_info,
+ ObjectStore::Transaction *t)
+{
+ for (const auto& [oid, extents] : recovery_info.clone_subset) {
+ for (const auto& [off, len] : extents) {
+ logger().debug(" clone_range {} {}~{}", oid, off, len);
+ t->clone_range(coll->get_cid(), ghobject_t(oid), ghobject_t(recovery_info.soid),
+ off, len, off);
+ }
+ }
+}
+
+RecoveryBackend::interruptible_future<>
+ReplicatedRecoveryBackend::handle_recovery_delete_reply(
+ Ref<MOSDPGRecoveryDeleteReply> m)
+{
+ auto& p = m->objects.front();
+ hobject_t soid = p.first;
+ ObjectRecoveryInfo recovery_info;
+ recovery_info.version = p.second;
+ pg.get_recovery_handler()->on_peer_recover(m->from, soid, recovery_info);
+ get_recovering(soid).set_pushed(m->from);
+ return seastar::now();
+}
+
+RecoveryBackend::interruptible_future<>
+ReplicatedRecoveryBackend::handle_recovery_op(
+ Ref<MOSDFastDispatchOp> m,
+ crimson::net::ConnectionRef conn)
+{
+ switch (m->get_header().type) {
+ case MSG_OSD_PG_PULL:
+ return handle_pull(boost::static_pointer_cast<MOSDPGPull>(m));
+ case MSG_OSD_PG_PUSH:
+ return handle_push(boost::static_pointer_cast<MOSDPGPush>(m));
+ case MSG_OSD_PG_PUSH_REPLY:
+ return handle_push_reply(
+ boost::static_pointer_cast<MOSDPGPushReply>(m));
+ case MSG_OSD_PG_RECOVERY_DELETE:
+ return handle_recovery_delete(
+ boost::static_pointer_cast<MOSDPGRecoveryDelete>(m));
+ case MSG_OSD_PG_RECOVERY_DELETE_REPLY:
+ return handle_recovery_delete_reply(
+ boost::static_pointer_cast<MOSDPGRecoveryDeleteReply>(m));
+ default:
+ // delegate to parent class for handling backend-agnostic recovery ops.
+ return RecoveryBackend::handle_recovery_op(std::move(m), conn);
+ }
+}
+
diff --git a/src/crimson/osd/replicated_recovery_backend.h b/src/crimson/osd/replicated_recovery_backend.h
new file mode 100644
index 000000000..b023b7417
--- /dev/null
+++ b/src/crimson/osd/replicated_recovery_backend.h
@@ -0,0 +1,169 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "crimson/common/interruptible_future.h"
+#include "crimson/osd/pg_interval_interrupt_condition.h"
+#include "crimson/osd/recovery_backend.h"
+
+#include "messages/MOSDPGPull.h"
+#include "messages/MOSDPGPush.h"
+#include "messages/MOSDPGPushReply.h"
+#include "messages/MOSDPGRecoveryDelete.h"
+#include "messages/MOSDPGRecoveryDeleteReply.h"
+#include "os/ObjectStore.h"
+
+class ReplicatedRecoveryBackend : public RecoveryBackend {
+public:
+ ReplicatedRecoveryBackend(crimson::osd::PG& pg,
+ crimson::osd::ShardServices& shard_services,
+ crimson::os::CollectionRef coll,
+ PGBackend* backend)
+ : RecoveryBackend(pg, shard_services, coll, backend)
+ {}
+ interruptible_future<> handle_recovery_op(
+ Ref<MOSDFastDispatchOp> m,
+ crimson::net::ConnectionRef conn) final;
+
+ interruptible_future<> recover_object(
+ const hobject_t& soid,
+ eversion_t need) final;
+ interruptible_future<> recover_delete(
+ const hobject_t& soid,
+ eversion_t need) final;
+ interruptible_future<> push_delete(
+ const hobject_t& soid,
+ eversion_t need) final;
+protected:
+ interruptible_future<> handle_pull(
+ Ref<MOSDPGPull> m);
+ interruptible_future<> handle_pull_response(
+ Ref<MOSDPGPush> m);
+ interruptible_future<> handle_push(
+ Ref<MOSDPGPush> m);
+ interruptible_future<> handle_push_reply(
+ Ref<MOSDPGPushReply> m);
+ interruptible_future<> handle_recovery_delete(
+ Ref<MOSDPGRecoveryDelete> m);
+ interruptible_future<> handle_recovery_delete_reply(
+ Ref<MOSDPGRecoveryDeleteReply> m);
+ interruptible_future<PushOp> prep_push(
+ const hobject_t& soid,
+ eversion_t need,
+ pg_shard_t pg_shard);
+ void prepare_pull(
+ PullOp& pull_op,
+ pull_info_t& pull_info,
+ const hobject_t& soid,
+ eversion_t need);
+ std::vector<pg_shard_t> get_shards_to_push(
+ const hobject_t& soid) const;
+ interruptible_future<PushOp> build_push_op(
+ const ObjectRecoveryInfo& recovery_info,
+ const ObjectRecoveryProgress& progress,
+ object_stat_sum_t* stat);
+  /// @returns true if this push op is the last one needed to
+  /// recover @c push_op.soid
+ interruptible_future<bool> _handle_pull_response(
+ pg_shard_t from,
+ PushOp& push_op,
+ PullOp* response,
+ ceph::os::Transaction* t);
+ std::pair<interval_set<uint64_t>, ceph::bufferlist> trim_pushed_data(
+ const interval_set<uint64_t> &copy_subset,
+ const interval_set<uint64_t> &intervals_received,
+ ceph::bufferlist data_received);
+ interruptible_future<> submit_push_data(
+ const ObjectRecoveryInfo &recovery_info,
+ bool first,
+ bool complete,
+ bool clear_omap,
+ interval_set<uint64_t>&& data_zeros,
+ interval_set<uint64_t>&& intervals_included,
+ ceph::bufferlist&& data_included,
+ ceph::bufferlist&& omap_header,
+ const std::map<std::string, bufferlist, std::less<>> &attrs,
+ std::map<std::string, bufferlist>&& omap_entries,
+ ceph::os::Transaction *t);
+ void submit_push_complete(
+ const ObjectRecoveryInfo &recovery_info,
+ ObjectStore::Transaction *t);
+ interruptible_future<> _handle_push(
+ pg_shard_t from,
+ PushOp& push_op,
+ PushReplyOp *response,
+ ceph::os::Transaction *t);
+ interruptible_future<std::optional<PushOp>> _handle_push_reply(
+ pg_shard_t peer,
+ const PushReplyOp &op);
+ interruptible_future<> on_local_recover_persist(
+ const hobject_t& soid,
+ const ObjectRecoveryInfo& _recovery_info,
+ bool is_delete,
+ epoch_t epoch_to_freeze);
+ interruptible_future<> local_recover_delete(
+ const hobject_t& soid,
+ eversion_t need,
+ epoch_t epoch_frozen);
+ seastar::future<> on_stop() final {
+ return seastar::now();
+ }
+private:
+ /// pull missing object from peer
+ interruptible_future<> maybe_pull_missing_obj(
+ const hobject_t& soid,
+ eversion_t need);
+
+ /// load object context for recovery if it is not ready yet
+ using load_obc_ertr = crimson::errorator<
+ crimson::ct_error::object_corrupted>;
+ using load_obc_iertr =
+ ::crimson::interruptible::interruptible_errorator<
+ ::crimson::osd::IOInterruptCondition,
+ load_obc_ertr>;
+
+ interruptible_future<> maybe_push_shards(
+ const hobject_t& soid,
+ eversion_t need);
+
+  /// read the metadata (omap header and xattrs) attached to the given
+  /// object; it is expected to be relatively small.
+ ///
+ /// @return @c oi.version
+ interruptible_future<eversion_t> read_metadata_for_push_op(
+ const hobject_t& oid,
+ const ObjectRecoveryProgress& progress,
+ ObjectRecoveryProgress& new_progress,
+ eversion_t ver,
+ PushOp* push_op);
+ /// read the remaining extents of object to be recovered and fill push_op
+ /// with them
+ ///
+ /// @param oid object being recovered
+ /// @param copy_subset extents we want
+ /// @param offset the offset in object from where we should read
+ /// @return the new offset
+ interruptible_future<uint64_t> read_object_for_push_op(
+ const hobject_t& oid,
+ const interval_set<uint64_t>& copy_subset,
+ uint64_t offset,
+ uint64_t max_len,
+ PushOp* push_op);
+ interruptible_future<> read_omap_for_push_op(
+ const hobject_t& oid,
+ const ObjectRecoveryProgress& progress,
+ ObjectRecoveryProgress& new_progress,
+ uint64_t& max_len,
+ PushOp* push_op);
+ interruptible_future<hobject_t> prep_push_target(
+ const ObjectRecoveryInfo &recovery_info,
+ bool first,
+ bool complete,
+ bool clear_omap,
+ ObjectStore::Transaction* t,
+ const std::map<std::string, bufferlist, std::less<>> &attrs,
+ bufferlist&& omap_header);
+ using interruptor = crimson::interruptible::interruptor<
+ crimson::osd::IOInterruptCondition>;
+};
diff --git a/src/crimson/osd/scheduler/mclock_scheduler.cc b/src/crimson/osd/scheduler/mclock_scheduler.cc
new file mode 100644
index 000000000..006e4816c
--- /dev/null
+++ b/src/crimson/osd/scheduler/mclock_scheduler.cc
@@ -0,0 +1,165 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 Red Hat Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#include <memory>
+#include <functional>
+
+#include "crimson/osd/scheduler/mclock_scheduler.h"
+#include "common/dout.h"
+
+namespace dmc = crimson::dmclock;
+using namespace std::placeholders;
+
+#define dout_context cct
+#define dout_subsys ceph_subsys_osd
+#undef dout_prefix
+#define dout_prefix *_dout
+
+
+namespace crimson::osd::scheduler {
+
+mClockScheduler::mClockScheduler(ConfigProxy &conf) :
+ scheduler(
+ std::bind(&mClockScheduler::ClientRegistry::get_info,
+ &client_registry,
+ _1),
+ dmc::AtLimit::Allow,
+ conf.get_val<double>("osd_mclock_scheduler_anticipation_timeout"))
+{
+ conf.add_observer(this);
+ client_registry.update_from_config(conf);
+}
+
+void mClockScheduler::ClientRegistry::update_from_config(const ConfigProxy &conf)
+{
+ default_external_client_info.update(
+ conf.get_val<double>("osd_mclock_scheduler_client_res"),
+ conf.get_val<uint64_t>("osd_mclock_scheduler_client_wgt"),
+ conf.get_val<double>("osd_mclock_scheduler_client_lim"));
+
+ internal_client_infos[
+ static_cast<size_t>(scheduler_class_t::background_recovery)].update(
+ conf.get_val<double>("osd_mclock_scheduler_background_recovery_res"),
+ conf.get_val<uint64_t>("osd_mclock_scheduler_background_recovery_wgt"),
+ conf.get_val<double>("osd_mclock_scheduler_background_recovery_lim"));
+
+ internal_client_infos[
+ static_cast<size_t>(scheduler_class_t::background_best_effort)].update(
+ conf.get_val<double>("osd_mclock_scheduler_background_best_effort_res"),
+ conf.get_val<uint64_t>("osd_mclock_scheduler_background_best_effort_wgt"),
+ conf.get_val<double>("osd_mclock_scheduler_background_best_effort_lim"));
+}
+
+const dmc::ClientInfo *mClockScheduler::ClientRegistry::get_external_client(
+ const client_profile_id_t &client) const
+{
+ auto ret = external_client_infos.find(client);
+ if (ret == external_client_infos.end())
+ return &default_external_client_info;
+ else
+ return &(ret->second);
+}
+
+const dmc::ClientInfo *mClockScheduler::ClientRegistry::get_info(
+ const scheduler_id_t &id) const {
+ switch (id.class_id) {
+ case scheduler_class_t::immediate:
+ ceph_assert(0 == "Cannot schedule immediate");
+ return (dmc::ClientInfo*)nullptr;
+ case scheduler_class_t::repop:
+ case scheduler_class_t::client:
+ return get_external_client(id.client_profile_id);
+ default:
+ ceph_assert(static_cast<size_t>(id.class_id) < internal_client_infos.size());
+ return &internal_client_infos[static_cast<size_t>(id.class_id)];
+ }
+}
+
+void mClockScheduler::dump(ceph::Formatter &f) const
+{
+}
+
+void mClockScheduler::enqueue(item_t&& item)
+{
+ auto id = get_scheduler_id(item);
+ auto cost = item.params.cost;
+
+ if (scheduler_class_t::immediate == item.params.klass) {
+ immediate.push_front(std::move(item));
+ } else {
+ scheduler.add_request(
+ std::move(item),
+ id,
+ cost);
+ }
+}
+
+void mClockScheduler::enqueue_front(item_t&& item)
+{
+ immediate.push_back(std::move(item));
+ // TODO: item may not be immediate, update mclock machinery to permit
+ // putting the item back in the queue
+}
+
+item_t mClockScheduler::dequeue()
+{
+ if (!immediate.empty()) {
+ auto ret = std::move(immediate.back());
+ immediate.pop_back();
+ return ret;
+ } else {
+ mclock_queue_t::PullReq result = scheduler.pull_request();
+ if (result.is_future()) {
+ ceph_assert(
+ 0 == "Not implemented, user would have to be able to be woken up");
+ return std::move(*(item_t*)nullptr);
+ } else if (result.is_none()) {
+ ceph_assert(
+ 0 == "Impossible, must have checked empty() first");
+ return std::move(*(item_t*)nullptr);
+ } else {
+ ceph_assert(result.is_retn());
+
+ auto &retn = result.get_retn();
+ return std::move(*retn.request);
+ }
+ }
+}
+
+const char** mClockScheduler::get_tracked_conf_keys() const
+{
+ static const char* KEYS[] = {
+ "osd_mclock_scheduler_client_res",
+ "osd_mclock_scheduler_client_wgt",
+ "osd_mclock_scheduler_client_lim",
+ "osd_mclock_scheduler_background_recovery_res",
+ "osd_mclock_scheduler_background_recovery_wgt",
+ "osd_mclock_scheduler_background_recovery_lim",
+ "osd_mclock_scheduler_background_best_effort_res",
+ "osd_mclock_scheduler_background_best_effort_wgt",
+ "osd_mclock_scheduler_background_best_effort_lim",
+ NULL
+ };
+ return KEYS;
+}
+
+void mClockScheduler::handle_conf_change(
+ const ConfigProxy& conf,
+ const std::set<std::string> &changed)
+{
+ client_registry.update_from_config(conf);
+}
+
+}
diff --git a/src/crimson/osd/scheduler/mclock_scheduler.h b/src/crimson/osd/scheduler/mclock_scheduler.h
new file mode 100644
index 000000000..153fc758b
--- /dev/null
+++ b/src/crimson/osd/scheduler/mclock_scheduler.h
@@ -0,0 +1,125 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 Red Hat Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#pragma once
+
+#include <ostream>
+#include <map>
+#include <vector>
+
+#include "boost/variant.hpp"
+
+#include "dmclock/src/dmclock_server.h"
+
+#include "crimson/osd/scheduler/scheduler.h"
+#include "common/config.h"
+#include "common/ceph_context.h"
+
+
+namespace crimson::osd::scheduler {
+
+using client_id_t = uint64_t;
+using profile_id_t = uint64_t;
+
+struct client_profile_id_t {
+ client_id_t client_id;
+ profile_id_t profile_id;
+ auto operator<=>(const client_profile_id_t&) const = default;
+};
+
+
+struct scheduler_id_t {
+ scheduler_class_t class_id;
+ client_profile_id_t client_profile_id;
+ auto operator<=>(const scheduler_id_t&) const = default;
+};
+
+/**
+ * Scheduler implementation based on mclock.
+ *
+ * TODO: explain configs
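+ *
+ * As a rough summary (see ClientRegistry::update_from_config()): each
+ * scheduler class is parameterized by a dmclock reservation/weight/limit
+ * triple taken from the corresponding osd_mclock_scheduler_*_res/_wgt/_lim
+ * options.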
+ */
+class mClockScheduler : public Scheduler, md_config_obs_t {
+
+ class ClientRegistry {
+ std::array<
+ crimson::dmclock::ClientInfo,
+ static_cast<size_t>(scheduler_class_t::client)
+ > internal_client_infos = {
+ // Placeholder, gets replaced with configured values
+ crimson::dmclock::ClientInfo(1, 1, 1),
+ crimson::dmclock::ClientInfo(1, 1, 1)
+ };
+
+ crimson::dmclock::ClientInfo default_external_client_info = {1, 1, 1};
+ std::map<client_profile_id_t,
+ crimson::dmclock::ClientInfo> external_client_infos;
+ const crimson::dmclock::ClientInfo *get_external_client(
+ const client_profile_id_t &client) const;
+ public:
+ void update_from_config(const ConfigProxy &conf);
+ const crimson::dmclock::ClientInfo *get_info(
+ const scheduler_id_t &id) const;
+ } client_registry;
+
+ using mclock_queue_t = crimson::dmclock::PullPriorityQueue<
+ scheduler_id_t,
+ item_t,
+ true,
+ true,
+ 2>;
+ mclock_queue_t scheduler;
+ std::list<item_t> immediate;
+
+ static scheduler_id_t get_scheduler_id(const item_t &item) {
+ return scheduler_id_t{
+ item.params.klass,
+ client_profile_id_t{
+ item.params.owner,
+ 0
+ }
+ };
+ }
+
+public:
+ mClockScheduler(ConfigProxy &conf);
+
+  // Enqueue op at the back of the regular queue
+ void enqueue(item_t &&item) final;
+
+  // Enqueue the op at the front of the regular queue
+ void enqueue_front(item_t &&item) final;
+
+  // Return an op to be dispatched
+ item_t dequeue() final;
+
+  // Returns true iff the queue is empty
+ bool empty() const final {
+ return immediate.empty() && scheduler.empty();
+ }
+
+ // Formatted output of the queue
+ void dump(ceph::Formatter &f) const final;
+
+ void print(std::ostream &ostream) const final {
+ ostream << "mClockScheduler";
+ }
+
+ const char** get_tracked_conf_keys() const final;
+ void handle_conf_change(const ConfigProxy& conf,
+ const std::set<std::string> &changed) final;
+};
+
+}
diff --git a/src/crimson/osd/scheduler/scheduler.cc b/src/crimson/osd/scheduler/scheduler.cc
new file mode 100644
index 000000000..c85cb388e
--- /dev/null
+++ b/src/crimson/osd/scheduler/scheduler.cc
@@ -0,0 +1,181 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2019 Red Hat Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <ostream>
+
+#include <seastar/core/print.hh>
+
+#include "crimson/osd/scheduler/scheduler.h"
+#include "crimson/osd/scheduler/mclock_scheduler.h"
+#include "common/WeightedPriorityQueue.h"
+
+namespace crimson::osd::scheduler {
+
+std::ostream &operator<<(std::ostream &lhs, const scheduler_class_t &c)
+{
+ switch (c) {
+ case scheduler_class_t::background_best_effort:
+ return lhs << "background_best_effort";
+ case scheduler_class_t::background_recovery:
+ return lhs << "background_recovery";
+ case scheduler_class_t::client:
+ return lhs << "client";
+ case scheduler_class_t::repop:
+ return lhs << "repop";
+ case scheduler_class_t::immediate:
+ return lhs << "immediate";
+ default:
+ return lhs;
+ }
+}
+
+/**
+ * Implements Scheduler in terms of OpQueue
+ *
+ * Templated on the queue type to avoid dynamic dispatch; T should implement
+ * OpQueue<item_t, client_t>. This adapter is mainly responsible for the
+ * boilerplate priority cutoff/strict-queue handling needed by OpQueue based
+ * implementations.
+ */
+template <typename T>
+class ClassedOpQueueScheduler final : public Scheduler {
+ const scheduler_class_t cutoff;
+ T queue;
+
+ using priority_t = uint64_t;
+ std::array<
+ priority_t,
+ static_cast<size_t>(scheduler_class_t::immediate)
+ > priority_map = {
+ // Placeholder, gets replaced with configured values
+ 0, 0, 0
+ };
+
+ static scheduler_class_t get_io_prio_cut(ConfigProxy &conf) {
+ if (conf.get_val<std::string>("osd_op_queue_cut_off") == "debug_random") {
+ srand(time(NULL));
+ return (rand() % 2 < 1) ?
+ scheduler_class_t::repop : scheduler_class_t::immediate;
+ } else if (conf.get_val<std::string>("osd_op_queue_cut_off") == "high") {
+ return scheduler_class_t::immediate;
+ } else {
+ return scheduler_class_t::repop;
+ }
+ }
+
+ bool use_strict(scheduler_class_t kl) const {
+ return static_cast<uint8_t>(kl) >= static_cast<uint8_t>(cutoff);
+ }
+
+ priority_t get_priority(scheduler_class_t kl) const {
+ ceph_assert(static_cast<size_t>(kl) <
+ static_cast<size_t>(scheduler_class_t::immediate));
+ return priority_map[static_cast<size_t>(kl)];
+ }
+
+public:
+ template <typename... Args>
+ ClassedOpQueueScheduler(ConfigProxy &conf, Args&&... args) :
+ cutoff(get_io_prio_cut(conf)),
+ queue(std::forward<Args>(args)...)
+ {
+ priority_map[
+ static_cast<size_t>(scheduler_class_t::background_best_effort)
+ ] = conf.get_val<uint64_t>("osd_scrub_priority");
+ priority_map[
+ static_cast<size_t>(scheduler_class_t::background_recovery)
+ ] = conf.get_val<uint64_t>("osd_recovery_op_priority");
+ priority_map[
+ static_cast<size_t>(scheduler_class_t::client)
+ ] = conf.get_val<uint64_t>("osd_client_op_priority");
+ priority_map[
+ static_cast<size_t>(scheduler_class_t::repop)
+ ] = conf.get_val<uint64_t>("osd_client_op_priority");
+ }
+
+ void enqueue(item_t &&item) final {
+ if (use_strict(item.params.klass))
+ queue.enqueue_strict(
+ item.params.owner, get_priority(item.params.klass), std::move(item));
+ else
+ queue.enqueue(
+ item.params.owner, get_priority(item.params.klass),
+ item.params.cost, std::move(item));
+ }
+
+ void enqueue_front(item_t &&item) final {
+ if (use_strict(item.params.klass))
+ queue.enqueue_strict_front(
+ item.params.owner, get_priority(item.params.klass), std::move(item));
+ else
+ queue.enqueue_front(
+ item.params.owner, get_priority(item.params.klass),
+ item.params.cost, std::move(item));
+ }
+
+ bool empty() const final {
+ return queue.empty();
+ }
+
+ item_t dequeue() final {
+ return queue.dequeue();
+ }
+
+ void dump(ceph::Formatter &f) const final {
+ return queue.dump(&f);
+ }
+
+ void print(std::ostream &out) const final {
+ out << "ClassedOpQueueScheduler(queue=";
+ queue.print(out);
+ out << ", cutoff=" << cutoff << ")";
+ }
+
+ ~ClassedOpQueueScheduler() final {};
+};
+
+SchedulerRef make_scheduler(ConfigProxy &conf)
+{
+ const std::string _type = conf.get_val<std::string>("osd_op_queue");
+ const std::string *type = &_type;
+ if (*type == "debug_random") {
+ static const std::string index_lookup[] = { "mclock_scheduler",
+ "wpq" };
+ srand(time(NULL));
+ unsigned which = rand() % (sizeof(index_lookup) / sizeof(index_lookup[0]));
+ type = &index_lookup[which];
+ }
+
+  if (*type == "wpq") {
+ // default is 'wpq'
+ return std::make_unique<
+ ClassedOpQueueScheduler<WeightedPriorityQueue<item_t, client_t>>>(
+ conf,
+ conf.get_val<uint64_t>("osd_op_pq_max_tokens_per_priority"),
+ conf->osd_op_pq_min_cost
+ );
+ } else if (*type == "mclock_scheduler") {
+ return std::make_unique<mClockScheduler>(conf);
+ } else {
+ ceph_assert("Invalid choice of wq" == 0);
+ return std::unique_ptr<mClockScheduler>();
+ }
+}
+
+std::ostream &operator<<(std::ostream &lhs, const Scheduler &rhs) {
+ rhs.print(lhs);
+ return lhs;
+}
+
+}
diff --git a/src/crimson/osd/scheduler/scheduler.h b/src/crimson/osd/scheduler/scheduler.h
new file mode 100644
index 000000000..a014991ab
--- /dev/null
+++ b/src/crimson/osd/scheduler/scheduler.h
@@ -0,0 +1,82 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2019 Red Hat Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include <seastar/core/future.hh>
+#include <ostream>
+
+#include "crimson/common/config_proxy.h"
+
+namespace crimson::osd::scheduler {
+
+enum class scheduler_class_t : uint8_t {
+ background_best_effort = 0,
+ background_recovery,
+ client,
+ repop,
+ immediate,
+};
+
+std::ostream &operator<<(std::ostream &, const scheduler_class_t &);
+
+using client_t = uint64_t;
+using cost_t = uint64_t;
+
+struct params_t {
+ cost_t cost = 1;
+ client_t owner;
+ scheduler_class_t klass;
+};
+
+struct item_t {
+ params_t params;
+ seastar::promise<> wake;
+};
+
+/**
+ * Base interface for classes responsible for choosing
+ * op processing order in the OSD.
+ */
+class Scheduler {
+public:
+ // Enqueue op for scheduling
+ virtual void enqueue(item_t &&item) = 0;
+
+ // Enqueue op for processing as though it were enqueued prior
+ // to other items already scheduled.
+ virtual void enqueue_front(item_t &&item) = 0;
+
+ // Returns true iff there are no ops scheduled
+ virtual bool empty() const = 0;
+
+ // Return next op to be processed
+ virtual item_t dequeue() = 0;
+
+ // Dump formatted representation for the queue
+ virtual void dump(ceph::Formatter &f) const = 0;
+
+ // Print human readable brief description with relevant parameters
+ virtual void print(std::ostream &out) const = 0;
+
+ // Destructor
+ virtual ~Scheduler() {};
+};
+
+std::ostream &operator<<(std::ostream &lhs, const Scheduler &);
+using SchedulerRef = std::unique_ptr<Scheduler>;
+
+SchedulerRef make_scheduler(ConfigProxy &);
+
+}
diff --git a/src/crimson/osd/shard_services.cc b/src/crimson/osd/shard_services.cc
new file mode 100644
index 000000000..a6431305d
--- /dev/null
+++ b/src/crimson/osd/shard_services.cc
@@ -0,0 +1,761 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <boost/smart_ptr/make_local_shared.hpp>
+
+#include "crimson/osd/shard_services.h"
+
+#include "messages/MOSDAlive.h"
+#include "messages/MOSDMap.h"
+#include "messages/MOSDPGCreated.h"
+#include "messages/MOSDPGTemp.h"
+
+#include "osd/osd_perf_counters.h"
+#include "osd/PeeringState.h"
+#include "crimson/common/config_proxy.h"
+#include "crimson/mgr/client.h"
+#include "crimson/mon/MonClient.h"
+#include "crimson/net/Messenger.h"
+#include "crimson/net/Connection.h"
+#include "crimson/os/cyanstore/cyan_store.h"
+#include "crimson/osd/osdmap_service.h"
+#include "crimson/osd/osd_operations/pg_advance_map.h"
+#include "crimson/osd/pg.h"
+#include "crimson/osd/pg_meta.h"
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_osd);
+ }
+}
+
+using std::vector;
+
+namespace crimson::osd {
+
+PerShardState::PerShardState(
+ int whoami,
+ ceph::mono_time startup_time,
+ PerfCounters *perf,
+ PerfCounters *recoverystate_perf,
+ crimson::os::FuturizedStore &store,
+ OSDState &osd_state)
+ : whoami(whoami),
+ store(store.get_sharded_store()),
+ osd_state(osd_state),
+ osdmap_gate("PerShardState::osdmap_gate"),
+ perf(perf), recoverystate_perf(recoverystate_perf),
+ throttler(crimson::common::local_conf()),
+ next_tid(
+ static_cast<ceph_tid_t>(seastar::this_shard_id()) <<
+ (std::numeric_limits<ceph_tid_t>::digits - 8)),
+ startup_time(startup_time)
+{}
+
+seastar::future<> PerShardState::dump_ops_in_flight(Formatter *f) const
+{
+ registry.for_each_op([f](const auto &op) {
+ op.dump(f);
+ });
+ return seastar::now();
+}
+
+seastar::future<> PerShardState::stop_pgs()
+{
+ assert_core();
+ return seastar::parallel_for_each(
+ pg_map.get_pgs(),
+ [](auto& p) {
+ return p.second->stop();
+ });
+}
+
+std::map<pg_t, pg_stat_t> PerShardState::get_pg_stats() const
+{
+ assert_core();
+ std::map<pg_t, pg_stat_t> ret;
+ for (auto [pgid, pg] : pg_map.get_pgs()) {
+ if (pg->is_primary()) {
+ auto stats = pg->get_stats();
+ // todo: update reported_epoch,reported_seq,last_fresh
+ stats.reported_epoch = osdmap->get_epoch();
+ ret.emplace(pgid.pgid, std::move(stats));
+ }
+ }
+ return ret;
+}
+
+seastar::future<> PerShardState::broadcast_map_to_pgs(
+ ShardServices &shard_services,
+ epoch_t epoch)
+{
+ assert_core();
+ auto &pgs = pg_map.get_pgs();
+ return seastar::parallel_for_each(
+ pgs.begin(), pgs.end(),
+ [=, &shard_services](auto& pg) {
+ return shard_services.start_operation<PGAdvanceMap>(
+ shard_services,
+ pg.second, epoch,
+ PeeringCtx{}, false).second;
+ });
+}
+
+Ref<PG> PerShardState::get_pg(spg_t pgid)
+{
+ assert_core();
+ return pg_map.get_pg(pgid);
+}
+
+HeartbeatStampsRef PerShardState::get_hb_stamps(int peer)
+{
+ assert_core();
+ auto [stamps, added] = heartbeat_stamps.try_emplace(peer);
+ if (added) {
+ stamps->second = ceph::make_ref<HeartbeatStamps>(peer);
+ }
+ return stamps->second;
+}
+
+OSDSingletonState::OSDSingletonState(
+ int whoami,
+ crimson::net::Messenger &cluster_msgr,
+ crimson::net::Messenger &public_msgr,
+ crimson::mon::Client &monc,
+ crimson::mgr::Client &mgrc)
+ : whoami(whoami),
+ cluster_msgr(cluster_msgr),
+ public_msgr(public_msgr),
+ monc(monc),
+ mgrc(mgrc),
+ local_reserver(
+ &cct,
+ &finisher,
+ crimson::common::local_conf()->osd_max_backfills,
+ crimson::common::local_conf()->osd_min_recovery_priority),
+ remote_reserver(
+ &cct,
+ &finisher,
+ crimson::common::local_conf()->osd_max_backfills,
+ crimson::common::local_conf()->osd_min_recovery_priority),
+ snap_reserver(
+ &cct,
+ &finisher,
+ crimson::common::local_conf()->osd_max_trimming_pgs)
+{
+ crimson::common::local_conf().add_observer(this);
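+  // seed the cache so that epoch 0 resolves to an empty OSDMap instead of a
+  // cache miss.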
+ osdmaps[0] = boost::make_local_shared<OSDMap>();
+
+ perf = build_osd_logger(&cct);
+ cct.get_perfcounters_collection()->add(perf);
+
+ recoverystate_perf = build_recoverystate_perf(&cct);
+ cct.get_perfcounters_collection()->add(recoverystate_perf);
+}
+
+seastar::future<> OSDSingletonState::send_to_osd(
+ int peer, MessageURef m, epoch_t from_epoch)
+{
+ if (osdmap->is_down(peer)) {
+ logger().info("{}: osd.{} is_down", __func__, peer);
+ return seastar::now();
+ } else if (osdmap->get_info(peer).up_from > from_epoch) {
+ logger().info("{}: osd.{} {} > {}", __func__, peer,
+ osdmap->get_info(peer).up_from, from_epoch);
+ return seastar::now();
+ } else {
+ auto conn = cluster_msgr.connect(
+ osdmap->get_cluster_addrs(peer).front(), CEPH_ENTITY_TYPE_OSD);
+ return conn->send(std::move(m));
+ }
+}
+
+seastar::future<> OSDSingletonState::osdmap_subscribe(
+ version_t epoch, bool force_request)
+{
+ logger().info("{}({})", __func__, epoch);
+ if (monc.sub_want_increment("osdmap", epoch, CEPH_SUBSCRIBE_ONETIME) ||
+ force_request) {
+ return monc.renew_subs();
+ } else {
+ return seastar::now();
+ }
+}
+
+void OSDSingletonState::queue_want_pg_temp(
+ pg_t pgid,
+ const vector<int>& want,
+ bool forced)
+{
+ auto p = pg_temp_pending.find(pgid);
+ if (p == pg_temp_pending.end() ||
+ p->second.acting != want ||
+ forced) {
+ pg_temp_wanted[pgid] = {want, forced};
+ }
+}
+
+void OSDSingletonState::remove_want_pg_temp(pg_t pgid)
+{
+ pg_temp_wanted.erase(pgid);
+ pg_temp_pending.erase(pgid);
+}
+
+void OSDSingletonState::requeue_pg_temp()
+{
+ unsigned old_wanted = pg_temp_wanted.size();
+ unsigned old_pending = pg_temp_pending.size();
+ pg_temp_wanted.merge(pg_temp_pending);
+ pg_temp_pending.clear();
+ logger().debug(
+ "{}: {} + {} -> {}",
+ __func__ ,
+ old_wanted,
+ old_pending,
+ pg_temp_wanted.size());
+}
+
+seastar::future<> OSDSingletonState::send_pg_temp()
+{
+ if (pg_temp_wanted.empty())
+ return seastar::now();
+ logger().debug("{}: {}", __func__, pg_temp_wanted);
+ MURef<MOSDPGTemp> ms[2] = {nullptr, nullptr};
+ for (auto& [pgid, pg_temp] : pg_temp_wanted) {
+ auto& m = ms[pg_temp.forced];
+ if (!m) {
+ m = crimson::make_message<MOSDPGTemp>(osdmap->get_epoch());
+ m->forced = pg_temp.forced;
+ }
+ m->pg_temp.emplace(pgid, pg_temp.acting);
+ }
+ pg_temp_pending.merge(pg_temp_wanted);
+ pg_temp_wanted.clear();
+ return seastar::parallel_for_each(std::begin(ms), std::end(ms),
+ [this](auto& m) {
+ if (m) {
+ return monc.send_message(std::move(m));
+ } else {
+ return seastar::now();
+ }
+ });
+}
+
+std::ostream& operator<<(
+ std::ostream& out,
+ const OSDSingletonState::pg_temp_t& pg_temp)
+{
+ out << pg_temp.acting;
+ if (pg_temp.forced) {
+ out << " (forced)";
+ }
+ return out;
+}
+
+seastar::future<> OSDSingletonState::send_pg_created(pg_t pgid)
+{
+ logger().debug(__func__);
+ auto o = get_osdmap();
+ ceph_assert(o->require_osd_release >= ceph_release_t::luminous);
+ pg_created.insert(pgid);
+ return monc.send_message(crimson::make_message<MOSDPGCreated>(pgid));
+}
+
+seastar::future<> OSDSingletonState::send_pg_created()
+{
+ logger().debug(__func__);
+ auto o = get_osdmap();
+ ceph_assert(o->require_osd_release >= ceph_release_t::luminous);
+ return seastar::parallel_for_each(pg_created,
+ [this](auto &pgid) {
+ return monc.send_message(crimson::make_message<MOSDPGCreated>(pgid));
+ });
+}
+
+void OSDSingletonState::prune_pg_created()
+{
+ logger().debug(__func__);
+ auto o = get_osdmap();
+ auto i = pg_created.begin();
+ while (i != pg_created.end()) {
+ auto p = o->get_pg_pool(i->pool());
+ if (!p || !p->has_flag(pg_pool_t::FLAG_CREATING)) {
+ logger().debug("{} pruning {}", __func__, *i);
+ i = pg_created.erase(i);
+ } else {
+ logger().debug(" keeping {}", __func__, *i);
+ ++i;
+ }
+ }
+}
+
+seastar::future<> OSDSingletonState::send_alive(const epoch_t want)
+{
+ logger().info(
+ "{} want={} up_thru_wanted={}",
+ __func__,
+ want,
+ up_thru_wanted);
+
+ if (want > up_thru_wanted) {
+ up_thru_wanted = want;
+ } else {
+ logger().debug("{} want={} <= up_thru_wanted={}; skipping",
+ __func__, want, up_thru_wanted);
+ return seastar::now();
+ }
+ if (!osdmap->exists(whoami)) {
+ logger().warn("{} DNE", __func__);
+ return seastar::now();
+  } else if (const epoch_t up_thru = osdmap->get_up_thru(whoami);
+ up_thru_wanted > up_thru) {
+ logger().debug("{} up_thru_wanted={} up_thru={}", __func__, want, up_thru);
+ return monc.send_message(
+ crimson::make_message<MOSDAlive>(osdmap->get_epoch(), want));
+ } else {
+ logger().debug("{} {} <= {}", __func__, want, osdmap->get_up_thru(whoami));
+ return seastar::now();
+ }
+}
+
+const char** OSDSingletonState::get_tracked_conf_keys() const
+{
+ static const char* KEYS[] = {
+ "osd_max_backfills",
+ "osd_min_recovery_priority",
+ "osd_max_trimming_pgs",
+ nullptr
+ };
+ return KEYS;
+}
+
+void OSDSingletonState::handle_conf_change(
+ const ConfigProxy& conf,
+ const std::set <std::string> &changed)
+{
+ if (changed.count("osd_max_backfills")) {
+ local_reserver.set_max(conf->osd_max_backfills);
+ remote_reserver.set_max(conf->osd_max_backfills);
+ }
+ if (changed.count("osd_min_recovery_priority")) {
+ local_reserver.set_min_priority(conf->osd_min_recovery_priority);
+ remote_reserver.set_min_priority(conf->osd_min_recovery_priority);
+ }
+ if (changed.count("osd_max_trimming_pgs")) {
+ snap_reserver.set_max(conf->osd_max_trimming_pgs);
+ }
+}
+
+seastar::future<OSDSingletonState::local_cached_map_t>
+OSDSingletonState::get_local_map(epoch_t e)
+{
+ // TODO: use LRU cache for managing osdmap, fallback to disk if we have to
+ if (auto found = osdmaps.find(e); found) {
+ logger().debug("{} osdmap.{} found in cache", __func__, e);
+ return seastar::make_ready_future<local_cached_map_t>(std::move(found));
+ } else {
+ logger().debug("{} loading osdmap.{} from disk", __func__, e);
+ return load_map(e).then([e, this](std::unique_ptr<OSDMap> osdmap) {
+ return seastar::make_ready_future<local_cached_map_t>(
+ osdmaps.insert(e, std::move(osdmap)));
+ });
+ }
+}
+
+void OSDSingletonState::store_map_bl(
+ ceph::os::Transaction& t,
+ epoch_t e, bufferlist&& bl)
+{
+ meta_coll->store_map(t, e, bl);
+ map_bl_cache.insert(e, std::move(bl));
+}
+
+seastar::future<bufferlist> OSDSingletonState::load_map_bl(
+ epoch_t e)
+{
+ if (std::optional<bufferlist> found = map_bl_cache.find(e); found) {
+ logger().debug("{} osdmap.{} found in cache", __func__, e);
+ return seastar::make_ready_future<bufferlist>(*found);
+ } else {
+ logger().debug("{} loading osdmap.{} from disk", __func__, e);
+ return meta_coll->load_map(e);
+ }
+}
+
+seastar::future<std::map<epoch_t, bufferlist>> OSDSingletonState::load_map_bls(
+ epoch_t first,
+ epoch_t last)
+{
+ logger().debug("{} loading maps [{},{}]",
+ __func__, first, last);
+ ceph_assert(first <= last);
+ return seastar::map_reduce(boost::make_counting_iterator<epoch_t>(first),
+ boost::make_counting_iterator<epoch_t>(last + 1),
+ [this](epoch_t e) {
+ return load_map_bl(e).then([e](auto&& bl) {
+ return seastar::make_ready_future<std::pair<epoch_t, bufferlist>>(
+ std::make_pair(e, std::move(bl)));
+ });
+ },
+ std::map<epoch_t, bufferlist>{},
+ [](auto&& bls, auto&& epoch_bl) {
+ bls.emplace(std::move(epoch_bl));
+ return std::move(bls);
+ });
+}
+
+seastar::future<std::unique_ptr<OSDMap>> OSDSingletonState::load_map(epoch_t e)
+{
+ auto o = std::make_unique<OSDMap>();
+ logger().info("{} osdmap.{}", __func__, e);
+ if (e == 0) {
+ return seastar::make_ready_future<std::unique_ptr<OSDMap>>(std::move(o));
+ }
+ return load_map_bl(e).then([o=std::move(o)](bufferlist bl) mutable {
+ o->decode(bl);
+ return seastar::make_ready_future<std::unique_ptr<OSDMap>>(std::move(o));
+ });
+}
+
+seastar::future<> OSDSingletonState::store_maps(ceph::os::Transaction& t,
+ epoch_t start, Ref<MOSDMap> m)
+{
+ return seastar::do_for_each(
+ boost::make_counting_iterator(start),
+ boost::make_counting_iterator(m->get_last() + 1),
+ [&t, m, this](epoch_t e) {
+ if (auto p = m->maps.find(e); p != m->maps.end()) {
+ auto o = std::make_unique<OSDMap>();
+ o->decode(p->second);
+ logger().info("store_maps storing osdmap.{}", e);
+      store_map_bl(t, e, std::move(p->second));
+ osdmaps.insert(e, std::move(o));
+ return seastar::now();
+ } else if (auto p = m->incremental_maps.find(e);
+ p != m->incremental_maps.end()) {
+ logger().info("store_maps found osdmap.{} incremental map, "
+ "loading osdmap.{}", e, e - 1);
+ ceph_assert(std::cmp_greater(e, 0u));
+ return load_map(e - 1).then([e, bl=p->second, &t, this](auto o) {
+ OSDMap::Incremental inc;
+ auto i = bl.cbegin();
+ inc.decode(i);
+ o->apply_incremental(inc);
+ bufferlist fbl;
+ o->encode(fbl, inc.encode_features | CEPH_FEATURE_RESERVED);
+ logger().info("store_maps storing osdmap.{}", o->get_epoch());
+ store_map_bl(t, e, std::move(fbl));
+ osdmaps.insert(e, std::move(o));
+ return seastar::now();
+ });
+ } else {
+ logger().error("MOSDMap lied about what maps it had?");
+ return seastar::now();
+ }
+ });
+}
+
+seastar::future<Ref<PG>> ShardServices::make_pg(
+ OSDMapService::cached_map_t create_map,
+ spg_t pgid,
+ bool do_create)
+{
+ using ec_profile_t = std::map<std::string, std::string>;
+ auto get_pool_info_for_pg = [create_map, pgid, this] {
+ if (create_map->have_pg_pool(pgid.pool())) {
+ pg_pool_t pi = *create_map->get_pg_pool(pgid.pool());
+ std::string name = create_map->get_pool_name(pgid.pool());
+ ec_profile_t ec_profile;
+ if (pi.is_erasure()) {
+ ec_profile = create_map->get_erasure_code_profile(
+ pi.erasure_code_profile);
+ }
+ return seastar::make_ready_future<
+ std::tuple<pg_pool_t,std::string, ec_profile_t>
+ >(std::make_tuple(
+ std::move(pi),
+ std::move(name),
+ std::move(ec_profile)));
+ } else {
+ // pool was deleted; grab final pg_pool_t off disk.
+ return get_pool_info(pgid.pool());
+ }
+ };
+ auto get_collection = [pgid, do_create, this] {
+ const coll_t cid{pgid};
+ if (do_create) {
+ return get_store().create_new_collection(cid);
+ } else {
+ return get_store().open_collection(cid);
+ }
+ };
+ return seastar::when_all(
+ std::move(get_pool_info_for_pg),
+ std::move(get_collection)
+ ).then([pgid, create_map, this](auto &&ret) {
+ auto [pool, name, ec_profile] = std::move(std::get<0>(ret).get0());
+ auto coll = std::move(std::get<1>(ret).get0());
+ return seastar::make_ready_future<Ref<PG>>(
+ new PG{
+ pgid,
+ pg_shard_t{local_state.whoami, pgid.shard},
+ std::move(coll),
+ std::move(pool),
+ std::move(name),
+ create_map,
+ *this,
+ ec_profile});
+ });
+}
+
+seastar::future<Ref<PG>> ShardServices::handle_pg_create_info(
+ std::unique_ptr<PGCreateInfo> info) {
+ return seastar::do_with(
+ std::move(info),
+ [this](auto &info)
+ -> seastar::future<Ref<PG>> {
+ return get_map(info->epoch).then(
+ [&info, this](cached_map_t startmap)
+ -> seastar::future<std::tuple<Ref<PG>, cached_map_t>> {
+ const spg_t &pgid = info->pgid;
+ if (info->by_mon) {
+ int64_t pool_id = pgid.pgid.pool();
+ const pg_pool_t *pool = get_map()->get_pg_pool(pool_id);
+ if (!pool) {
+ logger().debug(
+ "{} ignoring pgid {}, pool dne",
+ __func__,
+ pgid);
+ local_state.pg_map.pg_creation_canceled(pgid);
+ return seastar::make_ready_future<
+ std::tuple<Ref<PG>, OSDMapService::cached_map_t>
+ >(std::make_tuple(Ref<PG>(), startmap));
+ } else if (!pool->is_crimson()) {
+ logger().debug(
+ "{} ignoring pgid {}, pool lacks crimson flag",
+ __func__,
+ pgid);
+ local_state.pg_map.pg_creation_canceled(pgid);
+ return seastar::make_ready_future<
+ std::tuple<Ref<PG>, OSDMapService::cached_map_t>
+ >(std::make_tuple(Ref<PG>(), startmap));
+ }
+ ceph_assert(get_map()->require_osd_release >=
+ ceph_release_t::octopus);
+ if (!pool->has_flag(pg_pool_t::FLAG_CREATING)) {
+ // this ensures we do not process old creating messages after the
+            // pool's initial pgs have been created (and pgs are subsequently
+ // allowed to split or merge).
+ logger().debug(
+ "{} dropping {} create, pool does not have CREATING flag set",
+ __func__,
+ pgid);
+ local_state.pg_map.pg_creation_canceled(pgid);
+ return seastar::make_ready_future<
+ std::tuple<Ref<PG>, OSDMapService::cached_map_t>
+ >(std::make_tuple(Ref<PG>(), startmap));
+ }
+ }
+ return make_pg(
+ startmap, pgid, true
+ ).then([startmap=std::move(startmap)](auto pg) mutable {
+ return seastar::make_ready_future<
+ std::tuple<Ref<PG>, OSDMapService::cached_map_t>
+ >(std::make_tuple(std::move(pg), std::move(startmap)));
+ });
+ }).then([this, &info](auto &&ret)
+ ->seastar::future<Ref<PG>> {
+ auto [pg, startmap] = std::move(ret);
+ if (!pg)
+ return seastar::make_ready_future<Ref<PG>>(Ref<PG>());
+ const pg_pool_t* pp = startmap->get_pg_pool(info->pgid.pool());
+
+ int up_primary, acting_primary;
+ vector<int> up, acting;
+ startmap->pg_to_up_acting_osds(
+ info->pgid.pgid, &up, &up_primary, &acting, &acting_primary);
+
+ int role = startmap->calc_pg_role(
+ pg_shard_t(local_state.whoami, info->pgid.shard),
+ acting);
+
+ PeeringCtx rctx;
+ create_pg_collection(
+ rctx.transaction,
+ info->pgid,
+ info->pgid.get_split_bits(pp->get_pg_num()));
+ init_pg_ondisk(
+ rctx.transaction,
+ info->pgid,
+ pp);
+
+ pg->init(
+ role,
+ up,
+ up_primary,
+ acting,
+ acting_primary,
+ info->history,
+ info->past_intervals,
+ rctx.transaction);
+
+ return start_operation<PGAdvanceMap>(
+ *this, pg, get_map()->get_epoch(), std::move(rctx), true
+ ).second.then([pg=pg] {
+ return seastar::make_ready_future<Ref<PG>>(pg);
+ });
+ });
+ });
+}
+
+
+ShardServices::get_or_create_pg_ret
+ShardServices::get_or_create_pg(
+ PGMap::PGCreationBlockingEvent::TriggerI&& trigger,
+ spg_t pgid,
+ std::unique_ptr<PGCreateInfo> info)
+{
+ if (info) {
+ auto [fut, creating] = local_state.pg_map.wait_for_pg(
+ std::move(trigger), pgid);
+ if (!creating) {
+ local_state.pg_map.set_creating(pgid);
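+      // deliberately fire-and-forget: creation completes in the background
+      // and the future obtained above from pg_map.wait_for_pg() resolves
+      // once the PG has been registered in the pg_map.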
+ (void)handle_pg_create_info(
+ std::move(info));
+ }
+ return std::move(fut);
+ } else {
+ return get_or_create_pg_ret(
+ get_or_create_pg_ertr::ready_future_marker{},
+ local_state.pg_map.get_pg(pgid));
+ }
+}
+
+ShardServices::wait_for_pg_ret
+ShardServices::wait_for_pg(
+ PGMap::PGCreationBlockingEvent::TriggerI&& trigger, spg_t pgid)
+{
+ return local_state.pg_map.wait_for_pg(std::move(trigger), pgid).first;
+}
+
+seastar::future<Ref<PG>> ShardServices::load_pg(spg_t pgid)
+
+{
+ logger().debug("{}: {}", __func__, pgid);
+
+ return seastar::do_with(PGMeta(get_store(), pgid), [](auto& pg_meta) {
+ return pg_meta.get_epoch();
+ }).then([this](epoch_t e) {
+ return get_map(e);
+ }).then([pgid, this](auto&& create_map) {
+ return make_pg(std::move(create_map), pgid, false);
+ }).then([this](Ref<PG> pg) {
+ return pg->read_state(&get_store()).then([pg] {
+ return seastar::make_ready_future<Ref<PG>>(std::move(pg));
+ });
+ }).handle_exception([pgid](auto ep) {
+ logger().info("pg {} saw exception on load {}", pgid, ep);
+ ceph_abort("Could not load pg" == 0);
+ return seastar::make_exception_future<Ref<PG>>(ep);
+ });
+}
+
+seastar::future<> ShardServices::dispatch_context_transaction(
+ crimson::os::CollectionRef col, PeeringCtx &ctx) {
+ if (ctx.transaction.empty()) {
+ logger().debug("ShardServices::dispatch_context_transaction: empty transaction");
+ return seastar::now();
+ }
+
+ logger().debug("ShardServices::dispatch_context_transaction: do_transaction ...");
+ auto ret = get_store().do_transaction(
+ col,
+ std::move(ctx.transaction));
+ ctx.reset_transaction();
+ return ret;
+}
+
+seastar::future<> ShardServices::dispatch_context_messages(
+ BufferedRecoveryMessages &&ctx)
+{
+ auto ret = seastar::parallel_for_each(std::move(ctx.message_map),
+ [this](auto& osd_messages) {
+ auto& [peer, messages] = osd_messages;
+ logger().debug("dispatch_context_messages sending messages to {}", peer);
+ return seastar::parallel_for_each(
+ std::move(messages), [=, peer=peer, this](auto& m) {
+ return send_to_osd(peer, std::move(m), local_state.osdmap->get_epoch());
+ });
+ });
+ ctx.message_map.clear();
+ return ret;
+}
+
+seastar::future<> ShardServices::dispatch_context(
+ crimson::os::CollectionRef col,
+ PeeringCtx &&ctx)
+{
+ ceph_assert(col || ctx.transaction.empty());
+ return seastar::when_all_succeed(
+ dispatch_context_messages(
+ BufferedRecoveryMessages{ctx}),
+ col ? dispatch_context_transaction(col, ctx) : seastar::now()
+ ).then_unpack([] {
+ return seastar::now();
+ });
+}
+
+seastar::future<> OSDSingletonState::send_incremental_map(
+ crimson::net::Connection &conn,
+ epoch_t first)
+{
+ logger().info("{}: first osdmap: {} "
+ "superblock's oldest map: {}",
+ __func__, first, superblock.oldest_map);
+ if (first >= superblock.oldest_map) {
+ return load_map_bls(
+ first, superblock.newest_map
+ ).then([this, &conn, first](auto&& bls) {
+ auto m = crimson::make_message<MOSDMap>(
+ monc.get_fsid(),
+ osdmap->get_encoding_features());
+ m->cluster_osdmap_trim_lower_bound = first;
+ m->newest_map = superblock.newest_map;
+ m->maps = std::move(bls);
+ return conn.send(std::move(m));
+ });
+ } else {
+ return load_map_bl(osdmap->get_epoch()
+ ).then([this, &conn](auto&& bl) mutable {
+ auto m = crimson::make_message<MOSDMap>(
+ monc.get_fsid(),
+ osdmap->get_encoding_features());
+ /* TODO: once we support the tracking of superblock's
+ * cluster_osdmap_trim_lower_bound, the MOSDMap should
+ * be populated with this value instead of the oldest_map.
+ * See: OSD::handle_osd_map for how classic updates the
+ * cluster's trim lower bound.
+ */
+ m->cluster_osdmap_trim_lower_bound = superblock.oldest_map;
+ m->newest_map = superblock.newest_map;
+ m->maps.emplace(osdmap->get_epoch(), std::move(bl));
+ return conn.send(std::move(m));
+ });
+ }
+}
+
+seastar::future<> OSDSingletonState::send_incremental_map_to_osd(
+ int osd,
+ epoch_t first)
+{
+ if (osdmap->is_down(osd)) {
+ logger().info("{}: osd.{} is_down", __func__, osd);
+ return seastar::now();
+ } else {
+ auto conn = cluster_msgr.connect(
+ osdmap->get_cluster_addrs(osd).front(), CEPH_ENTITY_TYPE_OSD);
+ return send_incremental_map(*conn, first);
+ }
+}
+
+};
diff --git a/src/crimson/osd/shard_services.h b/src/crimson/osd/shard_services.h
new file mode 100644
index 000000000..9b7553e7b
--- /dev/null
+++ b/src/crimson/osd/shard_services.h
@@ -0,0 +1,589 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <memory>
+
+#include <boost/intrusive_ptr.hpp>
+#include <seastar/core/future.hh>
+
+#include "include/common_fwd.h"
+#include "osd_operation.h"
+#include "msg/MessageRef.h"
+#include "crimson/common/exception.h"
+#include "crimson/common/shared_lru.h"
+#include "crimson/os/futurized_collection.h"
+#include "osd/PeeringState.h"
+#include "crimson/osd/osdmap_service.h"
+#include "crimson/osd/osdmap_gate.h"
+#include "crimson/osd/osd_meta.h"
+#include "crimson/osd/object_context.h"
+#include "crimson/osd/pg_map.h"
+#include "crimson/osd/state.h"
+#include "common/AsyncReserver.h"
+
+namespace crimson::net {
+ class Messenger;
+}
+
+namespace crimson::mgr {
+ class Client;
+}
+
+namespace crimson::mon {
+ class Client;
+}
+
+namespace crimson::os {
+ class FuturizedStore;
+}
+
+class OSDMap;
+class PeeringCtx;
+class BufferedRecoveryMessages;
+
+namespace crimson::osd {
+
+class PGShardManager;
+
+/**
+ * PerShardState
+ *
+ * Per-shard state holding instances local to each shard.
+ */
+class PerShardState {
+ friend class ShardServices;
+ friend class PGShardManager;
+ friend class OSD;
+ using cached_map_t = OSDMapService::cached_map_t;
+ using local_cached_map_t = OSDMapService::local_cached_map_t;
+
+ const core_id_t core = seastar::this_shard_id();
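+  // assert_core() guards members that must only be accessed from the owning
+  // shard; it fires if a method is invoked from the wrong reactor.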
+#define assert_core() ceph_assert(seastar::this_shard_id() == core);
+
+ const int whoami;
+ crimson::os::FuturizedStore::Shard &store;
+ crimson::common::CephContext cct;
+
+ OSDState &osd_state;
+ OSD_OSDMapGate osdmap_gate;
+
+ PerfCounters *perf = nullptr;
+ PerfCounters *recoverystate_perf = nullptr;
+
+ // Op Management
+ OSDOperationRegistry registry;
+ OperationThrottler throttler;
+
+ seastar::future<> dump_ops_in_flight(Formatter *f) const;
+
+ epoch_t up_epoch = 0;
+ OSDMapService::cached_map_t osdmap;
+ const auto &get_osdmap() const {
+ assert_core();
+ return osdmap;
+ }
+ void update_map(OSDMapService::cached_map_t new_osdmap) {
+ assert_core();
+ osdmap = std::move(new_osdmap);
+ }
+ void set_up_epoch(epoch_t epoch) {
+ assert_core();
+ up_epoch = epoch;
+ }
+
+  // prevent creating new osd operations while the system is shutting down.
+  // this is necessary because a new operation could otherwise be created
+  // after all ongoing operations have been interrupted, and then wait on a
+  // fresh and never-resolving future, in which case the shutdown may never
+  // succeed.
+ bool stopping = false;
+ seastar::future<> stop_registry() {
+ assert_core();
+ crimson::get_logger(ceph_subsys_osd).info("PerShardState::{}", __func__);
+ stopping = true;
+ return registry.stop();
+ }
+
+ // PGMap state
+ PGMap pg_map;
+
+ seastar::future<> stop_pgs();
+ std::map<pg_t, pg_stat_t> get_pg_stats() const;
+ seastar::future<> broadcast_map_to_pgs(
+ ShardServices &shard_services,
+ epoch_t epoch);
+
+ Ref<PG> get_pg(spg_t pgid);
+ template <typename F>
+ void for_each_pg(F &&f) const {
+ assert_core();
+ for (auto &pg : pg_map.get_pgs()) {
+ std::invoke(f, pg.first, pg.second);
+ }
+ }
+
+ template <typename T, typename... Args>
+ auto start_operation(Args&&... args) {
+ assert_core();
+ if (__builtin_expect(stopping, false)) {
+ throw crimson::common::system_shutdown_exception();
+ }
+ auto op = registry.create_operation<T>(std::forward<Args>(args)...);
+ crimson::get_logger(ceph_subsys_osd).info(
+ "PerShardState::{}, {}", __func__, *op);
+ auto fut = seastar::yield().then([op] {
+ return op->start().finally([op /* by copy */] {
+        // keep the op alive for long enough: it must survive not only the
+        // scheduling stages (i.e. the `then()` calls) but also the actual
+        // execution (i.e. when the passed lambdas are run).
+ });
+ });
+ return std::make_pair(std::move(op), std::move(fut));
+ }
+
+ template <typename InterruptorT, typename T, typename... Args>
+ auto start_operation_may_interrupt(Args&&... args) {
+ assert_core();
+ if (__builtin_expect(stopping, false)) {
+ throw crimson::common::system_shutdown_exception();
+ }
+ auto op = registry.create_operation<T>(std::forward<Args>(args)...);
+ crimson::get_logger(ceph_subsys_osd).info(
+ "PerShardState::{}, {}", __func__, *op);
+ auto fut = InterruptorT::make_interruptible(
+ seastar::yield()
+ ).then_interruptible([op] {
+ return op->start().finally([op /* by copy */] {
+        // keep the op alive for long enough: it must survive not only the
+        // scheduling stages (i.e. the `then()` calls) but also the actual
+        // execution (i.e. when the passed lambdas are run).
+ });
+ });
+ return std::make_pair(std::move(op), std::move(fut));
+ }
+
+  // tids for ops we issue, prefixed with the core id to ensure uniqueness
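+  // (the shard id occupies the top 8 bits of the tid; the remaining bits are
+  // a per-shard counter, see the next_tid initializer in shard_services.cc)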
+ ceph_tid_t next_tid;
+ ceph_tid_t get_tid() {
+ assert_core();
+ return next_tid++;
+ }
+
+ HeartbeatStampsRef get_hb_stamps(int peer);
+ std::map<int, HeartbeatStampsRef> heartbeat_stamps;
+
+ // Time state
+ const ceph::mono_time startup_time;
+ ceph::signedspan get_mnow() const {
+ assert_core();
+ return ceph::mono_clock::now() - startup_time;
+ }
+
+public:
+ PerShardState(
+ int whoami,
+ ceph::mono_time startup_time,
+ PerfCounters *perf,
+ PerfCounters *recoverystate_perf,
+ crimson::os::FuturizedStore &store,
+ OSDState& osd_state);
+};
+
+/**
+ * OSDSingletonState
+ *
+ * OSD-wide singleton holding instances that need to be accessible
+ * from all PGs.
+ */
+class OSDSingletonState : public md_config_obs_t {
+ friend class ShardServices;
+ friend class PGShardManager;
+ friend class OSD;
+ using cached_map_t = OSDMapService::cached_map_t;
+ using local_cached_map_t = OSDMapService::local_cached_map_t;
+
+public:
+ OSDSingletonState(
+ int whoami,
+ crimson::net::Messenger &cluster_msgr,
+ crimson::net::Messenger &public_msgr,
+ crimson::mon::Client &monc,
+ crimson::mgr::Client &mgrc);
+
+private:
+ const int whoami;
+
+ crimson::common::CephContext cct;
+ PerfCounters *perf = nullptr;
+ PerfCounters *recoverystate_perf = nullptr;
+
+ SharedLRU<epoch_t, OSDMap> osdmaps;
+ SimpleLRU<epoch_t, bufferlist, false> map_bl_cache;
+
+ cached_map_t osdmap;
+ cached_map_t &get_osdmap() { return osdmap; }
+ void update_map(cached_map_t new_osdmap) {
+ osdmap = std::move(new_osdmap);
+ }
+
+ crimson::net::Messenger &cluster_msgr;
+ crimson::net::Messenger &public_msgr;
+
+ seastar::future<> send_to_osd(int peer, MessageURef m, epoch_t from_epoch);
+
+ crimson::mon::Client &monc;
+ seastar::future<> osdmap_subscribe(version_t epoch, bool force_request);
+
+ crimson::mgr::Client &mgrc;
+
+ std::unique_ptr<OSDMeta> meta_coll;
+ template <typename... Args>
+ void init_meta_coll(Args&&... args) {
+ meta_coll = std::make_unique<OSDMeta>(std::forward<Args>(args)...);
+ }
+ OSDMeta &get_meta_coll() {
+ assert(meta_coll);
+ return *meta_coll;
+ }
+
+ OSDSuperblock superblock;
+ void set_superblock(OSDSuperblock _superblock) {
+ superblock = std::move(_superblock);
+ }
+
+ seastar::future<> send_incremental_map(
+ crimson::net::Connection &conn,
+ epoch_t first);
+
+ seastar::future<> send_incremental_map_to_osd(int osd, epoch_t first);
+
+ auto get_pool_info(int64_t poolid) {
+ return get_meta_coll().load_final_pool_info(poolid);
+ }
+
+ // global pg temp state
+ struct pg_temp_t {
+ std::vector<int> acting;
+ bool forced = false;
+ };
+ std::map<pg_t, pg_temp_t> pg_temp_wanted;
+ std::map<pg_t, pg_temp_t> pg_temp_pending;
+ friend std::ostream& operator<<(std::ostream&, const pg_temp_t&);
+
+ void queue_want_pg_temp(pg_t pgid, const std::vector<int>& want,
+ bool forced = false);
+ void remove_want_pg_temp(pg_t pgid);
+ void requeue_pg_temp();
+ seastar::future<> send_pg_temp();
+
+ std::set<pg_t> pg_created;
+ seastar::future<> send_pg_created(pg_t pgid);
+ seastar::future<> send_pg_created();
+ void prune_pg_created();
+
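+  // AsyncReserver expects a Finisher on which to queue its callbacks; under
+  // seastar we can simply complete them inline on the calling reactor.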
+ struct DirectFinisher {
+ void queue(Context *c) {
+ c->complete(0);
+ }
+ } finisher;
+ AsyncReserver<spg_t, DirectFinisher> local_reserver;
+ AsyncReserver<spg_t, DirectFinisher> remote_reserver;
+ AsyncReserver<spg_t, DirectFinisher> snap_reserver;
+
+ epoch_t up_thru_wanted = 0;
+ seastar::future<> send_alive(epoch_t want);
+
+ const char** get_tracked_conf_keys() const final;
+ void handle_conf_change(
+ const ConfigProxy& conf,
+ const std::set <std::string> &changed) final;
+
+ seastar::future<local_cached_map_t> get_local_map(epoch_t e);
+ seastar::future<std::unique_ptr<OSDMap>> load_map(epoch_t e);
+ seastar::future<bufferlist> load_map_bl(epoch_t e);
+ seastar::future<std::map<epoch_t, bufferlist>>
+ load_map_bls(epoch_t first, epoch_t last);
+ void store_map_bl(ceph::os::Transaction& t,
+ epoch_t e, bufferlist&& bl);
+ seastar::future<> store_maps(ceph::os::Transaction& t,
+ epoch_t start, Ref<MOSDMap> m);
+};
+
+/**
+ * Represents services available to each PG
+ */
+class ShardServices : public OSDMapService {
+ friend class PGShardManager;
+ friend class OSD;
+ using cached_map_t = OSDMapService::cached_map_t;
+ using local_cached_map_t = OSDMapService::local_cached_map_t;
+
+ PerShardState local_state;
+ seastar::sharded<OSDSingletonState> &osd_singleton_state;
+ PGShardMapping& pg_to_shard_mapping;
+
+ template <typename F, typename... Args>
+ auto with_singleton(F &&f, Args&&... args) {
+ return osd_singleton_state.invoke_on(
+ PRIMARY_CORE,
+ std::forward<F>(f),
+ std::forward<Args>(args)...
+ );
+ }
+
+#define FORWARD_CONST(FROM_METHOD, TO_METHOD, TARGET) \
+ template <typename... Args> \
+ auto FROM_METHOD(Args&&... args) const { \
+ return TARGET.TO_METHOD(std::forward<Args>(args)...); \
+ }
+
+#define FORWARD(FROM_METHOD, TO_METHOD, TARGET) \
+ template <typename... Args> \
+ auto FROM_METHOD(Args&&... args) { \
+ return TARGET.TO_METHOD(std::forward<Args>(args)...); \
+ }
+
+#define FORWARD_TO_LOCAL(METHOD) FORWARD(METHOD, METHOD, local_state)
+#define FORWARD_TO_LOCAL_CONST(METHOD) FORWARD_CONST( \
+  METHOD, METHOD, local_state)
+
+#define FORWARD_TO_OSD_SINGLETON_TARGET(METHOD, TARGET) \
+ template <typename... Args> \
+ auto METHOD(Args&&... args) { \
+ return with_singleton( \
+ [](auto &local_state, auto&&... args) { \
+ return local_state.TARGET( \
+ std::forward<decltype(args)>(args)...); \
+ }, std::forward<Args>(args)...); \
+ }
+#define FORWARD_TO_OSD_SINGLETON(METHOD) \
+ FORWARD_TO_OSD_SINGLETON_TARGET(METHOD, METHOD)
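+// For example, FORWARD_TO_OSD_SINGLETON(send_alive) generates a send_alive()
+// member template that forwards its arguments through with_singleton() to
+// OSDSingletonState::send_alive() executed on PRIMARY_CORE.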
+
+public:
+ template <typename... PSSArgs>
+ ShardServices(
+ seastar::sharded<OSDSingletonState> &osd_singleton_state,
+ PGShardMapping& pg_to_shard_mapping,
+ PSSArgs&&... args)
+ : local_state(std::forward<PSSArgs>(args)...),
+ osd_singleton_state(osd_singleton_state),
+ pg_to_shard_mapping(pg_to_shard_mapping) {}
+
+ FORWARD_TO_OSD_SINGLETON(send_to_osd)
+
+ crimson::os::FuturizedStore::Shard &get_store() {
+ return local_state.store;
+ }
+
+ auto remove_pg(spg_t pgid) {
+ local_state.pg_map.remove_pg(pgid);
+ return pg_to_shard_mapping.remove_pg(pgid);
+ }
+
+ crimson::common::CephContext *get_cct() {
+ return &(local_state.cct);
+ }
+
+ template <typename T, typename... Args>
+ auto start_operation(Args&&... args) {
+ return local_state.start_operation<T>(std::forward<Args>(args)...);
+ }
+
+ template <typename InterruptorT, typename T, typename... Args>
+ auto start_operation_may_interrupt(Args&&... args) {
+ return local_state.start_operation_may_interrupt<
+ InterruptorT, T>(std::forward<Args>(args)...);
+ }
+
+ auto &get_registry() { return local_state.registry; }
+
+ // Loggers
+ PerfCounters &get_recoverystate_perf_logger() {
+ return *local_state.recoverystate_perf;
+ }
+ PerfCounters &get_perf_logger() {
+ return *local_state.perf;
+ }
+
+ // Diagnostics
+ FORWARD_TO_LOCAL_CONST(dump_ops_in_flight);
+
+ // Local PG Management
+ seastar::future<Ref<PG>> make_pg(
+ cached_map_t create_map,
+ spg_t pgid,
+ bool do_create);
+ seastar::future<Ref<PG>> handle_pg_create_info(
+ std::unique_ptr<PGCreateInfo> info);
+
+ using get_or_create_pg_ertr = PGMap::wait_for_pg_ertr;
+ using get_or_create_pg_ret = get_or_create_pg_ertr::future<Ref<PG>>;
+ get_or_create_pg_ret get_or_create_pg(
+ PGMap::PGCreationBlockingEvent::TriggerI&&,
+ spg_t pgid,
+ std::unique_ptr<PGCreateInfo> info);
+
+ using wait_for_pg_ertr = PGMap::wait_for_pg_ertr;
+ using wait_for_pg_ret = wait_for_pg_ertr::future<Ref<PG>>;
+ wait_for_pg_ret wait_for_pg(
+ PGMap::PGCreationBlockingEvent::TriggerI&&, spg_t pgid);
+ seastar::future<Ref<PG>> load_pg(spg_t pgid);
+
+ /// Dispatch and reset ctx transaction
+ seastar::future<> dispatch_context_transaction(
+ crimson::os::CollectionRef col, PeeringCtx &ctx);
+
+ /// Dispatch and reset ctx messages
+ seastar::future<> dispatch_context_messages(
+ BufferedRecoveryMessages &&ctx);
+
+ /// Dispatch ctx and dispose of context
+ seastar::future<> dispatch_context(
+ crimson::os::CollectionRef col,
+ PeeringCtx &&ctx);
+
+ /// Dispatch ctx and dispose of ctx, transaction must be empty
+ seastar::future<> dispatch_context(
+ PeeringCtx &&ctx) {
+ return dispatch_context({}, std::move(ctx));
+ }
+
+ /// Return per-core tid
+ ceph_tid_t get_tid() { return local_state.get_tid(); }
+
+  /// Return the number of PGs on this core
+ unsigned get_num_local_pgs() const {
+ return local_state.pg_map.get_pg_count();
+ }
+
+ // OSDMapService
+ cached_map_t get_map() const final { return local_state.get_osdmap(); }
+ epoch_t get_up_epoch() const final { return local_state.up_epoch; }
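+  // maps are cached on PRIMARY_CORE: get_local_map() runs there and the
+  // result travels back as a foreign_ptr, which is then re-wrapped so this
+  // shard can keep a local shared reference.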
+ seastar::future<cached_map_t> get_map(epoch_t e) final {
+ return with_singleton(
+ [](auto &sstate, epoch_t e) {
+ return sstate.get_local_map(
+ e
+ ).then([](auto lmap) {
+ return seastar::foreign_ptr<local_cached_map_t>(lmap);
+ });
+ }, e).then([](auto fmap) {
+ return make_local_shared_foreign(std::move(fmap));
+ });
+ }
+
+ FORWARD_TO_OSD_SINGLETON(get_pool_info)
+ FORWARD(with_throttle_while, with_throttle_while, local_state.throttler)
+
+ FORWARD_TO_OSD_SINGLETON(send_incremental_map)
+ FORWARD_TO_OSD_SINGLETON(send_incremental_map_to_osd)
+
+ FORWARD_TO_OSD_SINGLETON(osdmap_subscribe)
+ FORWARD_TO_OSD_SINGLETON(queue_want_pg_temp)
+ FORWARD_TO_OSD_SINGLETON(remove_want_pg_temp)
+ FORWARD_TO_OSD_SINGLETON(requeue_pg_temp)
+ FORWARD_TO_OSD_SINGLETON(send_pg_created)
+ FORWARD_TO_OSD_SINGLETON(send_alive)
+ FORWARD_TO_OSD_SINGLETON(send_pg_temp)
+ FORWARD_TO_LOCAL_CONST(get_mnow)
+ FORWARD_TO_LOCAL(get_hb_stamps)
+
+ FORWARD(pg_created, pg_created, local_state.pg_map)
+
+ FORWARD_TO_OSD_SINGLETON_TARGET(
+ local_update_priority,
+ local_reserver.update_priority)
+ FORWARD_TO_OSD_SINGLETON_TARGET(
+ local_cancel_reservation,
+ local_reserver.cancel_reservation)
+ FORWARD_TO_OSD_SINGLETON_TARGET(
+ local_dump_reservations,
+ local_reserver.dump)
+ FORWARD_TO_OSD_SINGLETON_TARGET(
+ remote_cancel_reservation,
+ remote_reserver.cancel_reservation)
+ FORWARD_TO_OSD_SINGLETON_TARGET(
+ remote_dump_reservations,
+ remote_reserver.dump)
+ FORWARD_TO_OSD_SINGLETON_TARGET(
+ snap_cancel_reservation,
+ snap_reserver.cancel_reservation)
+ FORWARD_TO_OSD_SINGLETON_TARGET(
+ snap_dump_reservations,
+ snap_reserver.dump)
+
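+  // the reservers live on PRIMARY_CORE, so a Context handed to them would be
+  // completed there; this wrapper re-submits the completion back to the
+  // shard that created it.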
+ Context *invoke_context_on_core(core_id_t core, Context *c) {
+ if (!c) return nullptr;
+ return new LambdaContext([core, c](int code) {
+ std::ignore = seastar::smp::submit_to(
+ core,
+ [c, code] {
+ c->complete(code);
+ });
+ });
+ }
+ seastar::future<> local_request_reservation(
+ spg_t item,
+ Context *on_reserved,
+ unsigned prio,
+ Context *on_preempt) {
+ return with_singleton(
+ [item, prio](OSDSingletonState &singleton,
+ Context *wrapped_on_reserved, Context *wrapped_on_preempt) {
+ return singleton.local_reserver.request_reservation(
+ item,
+ wrapped_on_reserved,
+ prio,
+ wrapped_on_preempt);
+ },
+ invoke_context_on_core(seastar::this_shard_id(), on_reserved),
+ invoke_context_on_core(seastar::this_shard_id(), on_preempt));
+ }
+ seastar::future<> remote_request_reservation(
+ spg_t item,
+ Context *on_reserved,
+ unsigned prio,
+ Context *on_preempt) {
+ return with_singleton(
+ [item, prio](OSDSingletonState &singleton,
+ Context *wrapped_on_reserved, Context *wrapped_on_preempt) {
+ return singleton.remote_reserver.request_reservation(
+ item,
+ wrapped_on_reserved,
+ prio,
+ wrapped_on_preempt);
+ },
+ invoke_context_on_core(seastar::this_shard_id(), on_reserved),
+ invoke_context_on_core(seastar::this_shard_id(), on_preempt));
+ }
+ seastar::future<> snap_request_reservation(
+ spg_t item,
+ Context *on_reserved,
+ unsigned prio) {
+ return with_singleton(
+ [item, prio](OSDSingletonState &singleton,
+ Context *wrapped_on_reserved) {
+ return singleton.snap_reserver.request_reservation(
+ item,
+ wrapped_on_reserved,
+ prio);
+ },
+ invoke_context_on_core(seastar::this_shard_id(), on_reserved));
+ }
+
+#undef FORWARD_CONST
+#undef FORWARD
+#undef FORWARD_TO_OSD_SINGLETON
+#undef FORWARD_TO_LOCAL
+#undef FORWARD_TO_LOCAL_CONST
+};
+
+}
+
+#if FMT_VERSION >= 90000
+template <> struct fmt::formatter<crimson::osd::OSDSingletonState::pg_temp_t> : fmt::ostream_formatter {};
+#endif
diff --git a/src/crimson/osd/state.h b/src/crimson/osd/state.h
new file mode 100644
index 000000000..f0676a4ec
--- /dev/null
+++ b/src/crimson/osd/state.h
@@ -0,0 +1,130 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <string_view>
+#include <ostream>
+
+#include <seastar/core/shared_future.hh>
+
+class OSDMap;
+
+namespace crimson::osd {
+
+// seastar::sharded puts start_single on core 0
+constexpr core_id_t PRIMARY_CORE = 0;
+
+/**
+ * OSDState
+ *
+ * Maintains state representing the OSD's progress from booting through
+ * shutdown.
+ *
+ * Shards other than PRIMARY_CORE may use their local instance to check
+ * on ACTIVE and STOPPING. All other methods are restricted to
+ * PRIMARY_CORE (such methods start with an assert to this effect).
+ */
+class OSDState : public seastar::peering_sharded_service<OSDState> {
+
+ enum class State {
+ INITIALIZING,
+ PREBOOT,
+ BOOTING,
+ ACTIVE,
+ PRESTOP,
+ STOPPING,
+ WAITING_FOR_HEALTHY,
+ };
+
+ State state = State::INITIALIZING;
+ mutable seastar::shared_promise<> wait_for_active;
+
+ /// Sets local instance state to active, called from set_active
+ void _set_active() {
+ state = State::ACTIVE;
+ wait_for_active.set_value();
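+    // replace the fulfilled promise so that later waiters get a fresh future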
+ wait_for_active = {};
+ }
+ /// Sets local instance state to stopping, called from set_stopping
+ void _set_stopping() {
+ state = State::STOPPING;
+ wait_for_active.set_exception(crimson::common::system_shutdown_exception{});
+ wait_for_active = {};
+ }
+public:
+ bool is_initializing() const {
+ ceph_assert(seastar::this_shard_id() == PRIMARY_CORE);
+ return state == State::INITIALIZING;
+ }
+ bool is_preboot() const {
+ ceph_assert(seastar::this_shard_id() == PRIMARY_CORE);
+ return state == State::PREBOOT;
+ }
+ bool is_booting() const {
+ ceph_assert(seastar::this_shard_id() == PRIMARY_CORE);
+ return state == State::BOOTING;
+ }
+ bool is_active() const {
+ return state == State::ACTIVE;
+ }
+ seastar::future<> when_active() const {
+ return is_active() ? seastar::now()
+ : wait_for_active.get_shared_future();
+ };
+ bool is_prestop() const {
+ ceph_assert(seastar::this_shard_id() == PRIMARY_CORE);
+ return state == State::PRESTOP;
+ }
+ bool is_stopping() const {
+ return state == State::STOPPING;
+ }
+ bool is_waiting_for_healthy() const {
+ ceph_assert(seastar::this_shard_id() == PRIMARY_CORE);
+ return state == State::WAITING_FOR_HEALTHY;
+ }
+ void set_preboot() {
+ ceph_assert(seastar::this_shard_id() == PRIMARY_CORE);
+ state = State::PREBOOT;
+ }
+ void set_booting() {
+ ceph_assert(seastar::this_shard_id() == PRIMARY_CORE);
+ state = State::BOOTING;
+ }
+ /// Sets all shards to active
+ seastar::future<> set_active() {
+ ceph_assert(seastar::this_shard_id() == PRIMARY_CORE);
+ return container().invoke_on_all([](auto& osd_state) {
+ osd_state._set_active();
+ });
+ }
+ void set_prestop() {
+ ceph_assert(seastar::this_shard_id() == PRIMARY_CORE);
+ state = State::PRESTOP;
+ }
+ /// Sets all shards to stopping
+ seastar::future<> set_stopping() {
+ ceph_assert(seastar::this_shard_id() == PRIMARY_CORE);
+ return container().invoke_on_all([](auto& osd_state) {
+ osd_state._set_stopping();
+ });
+ }
+ std::string_view to_string() const {
+ switch (state) {
+ case State::INITIALIZING: return "initializing";
+ case State::PREBOOT: return "preboot";
+ case State::BOOTING: return "booting";
+ case State::ACTIVE: return "active";
+ case State::PRESTOP: return "prestop";
+ case State::STOPPING: return "stopping";
+ case State::WAITING_FOR_HEALTHY: return "waiting_for_healthy";
+ default: return "???";
+ }
+ }
+};
+
+inline std::ostream&
+operator<<(std::ostream& os, const OSDState& s) {
+ return os << s.to_string();
+}
+}
diff --git a/src/crimson/osd/stop_signal.h b/src/crimson/osd/stop_signal.h
new file mode 100644
index 000000000..951f8d4b7
--- /dev/null
+++ b/src/crimson/osd/stop_signal.h
@@ -0,0 +1,83 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*
+ * Copyright (C) 2020 Cloudius Systems, Ltd.
+ */
+
+#pragma once
+
+#include <seastar/core/abort_source.hh>
+#include <seastar/core/reactor.hh>
+#include <seastar/core/condition-variable.hh>
+
+/// Seastar apps lib namespace
+
+namespace seastar_apps_lib {
+
+
+/// \brief Futurized SIGINT/SIGTERM signals handler class
+///
+/// Seastar-style helper class that allows easy waiting for SIGINT/SIGTERM signals
+/// from your app.
+///
+/// Example:
+/// \code
+/// #include <seastar/apps/lib/stop_signal.hh>
+/// ...
+/// int main() {
+/// ...
+/// seastar::thread th([] {
+/// seastar_apps_lib::stop_signal stop_signal;
+/// <some code>
+/// stop_signal.wait().get(); // this will wait till we receive SIGINT or SIGTERM signal
+/// });
+/// \endcode
+class stop_signal {
+ seastar::condition_variable _cond;
+ seastar::abort_source _abort_source;
+
+private:
+ void on_signal() {
+ if (stopping()) {
+ return;
+ }
+ _abort_source.request_abort();
+ _cond.broadcast();
+ }
+public:
+ stop_signal() {
+ seastar::engine().handle_signal(SIGINT, [this] { on_signal(); });
+ seastar::engine().handle_signal(SIGTERM, [this] { on_signal(); });
+ }
+ ~stop_signal() {
+ // There's no way to unregister a handler yet, so register a no-op handler instead.
+ seastar::engine().handle_signal(SIGINT, [] {});
+ seastar::engine().handle_signal(SIGTERM, [] {});
+ }
+ seastar::future<> wait() {
+ return _cond.wait([this] { return _abort_source.abort_requested(); });
+ }
+ bool stopping() const {
+ return _abort_source.abort_requested();
+ }
+ auto& abort_source() {
+ return _abort_source;
+ }
+};
+}
diff --git a/src/crimson/osd/watch.cc b/src/crimson/osd/watch.cc
new file mode 100644
index 000000000..4573333c3
--- /dev/null
+++ b/src/crimson/osd/watch.cc
@@ -0,0 +1,354 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <algorithm>
+
+#include <boost/range/adaptor/transformed.hpp>
+#include <boost/range/algorithm_ext/insert.hpp>
+
+#include "crimson/osd/watch.h"
+#include "crimson/osd/osd_operations/internal_client_request.h"
+
+#include "messages/MWatchNotify.h"
+
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_osd);
+ }
+}
+
+namespace crimson::osd {
+
+// a watcher can remove itself if it has not heard from its client for a
+// period of time. in that case we also need to drop it from the persisted
+// `ObjectState` instance; this operation somewhat resembles the `_UNWATCH`
+// subop.
+class WatchTimeoutRequest final : public InternalClientRequest {
+public:
+ WatchTimeoutRequest(WatchRef watch, Ref<PG> pg)
+ : InternalClientRequest(std::move(pg)),
+ watch(std::move(watch)) {
+ }
+
+ const hobject_t& get_target_oid() const final;
+ PG::do_osd_ops_params_t get_do_osd_ops_params() const final;
+ std::vector<OSDOp> create_osd_ops() final;
+
+private:
+ WatchRef watch;
+};
+
+const hobject_t& WatchTimeoutRequest::get_target_oid() const
+{
+ assert(watch->obc);
+ return watch->obc->get_oid();
+}
+
+PG::do_osd_ops_params_t
+WatchTimeoutRequest::get_do_osd_ops_params() const
+{
+ osd_reqid_t reqid;
+ reqid.name = watch->entity_name;
+ PG::do_osd_ops_params_t params{
+ watch->conn,
+ reqid,
+ ceph_clock_now(),
+ get_pg().get_osdmap_epoch(),
+ entity_inst_t{ watch->entity_name, watch->winfo.addr },
+ 0
+ };
+ logger().debug("{}: params.reqid={}", __func__, params.reqid);
+ return params;
+}
+
+std::vector<OSDOp> WatchTimeoutRequest::create_osd_ops()
+{
+ logger().debug("{}", __func__);
+ assert(watch);
+ OSDOp osd_op;
+ osd_op.op.op = CEPH_OSD_OP_WATCH;
+ osd_op.op.flags = 0;
+ osd_op.op.watch.op = CEPH_OSD_WATCH_OP_UNWATCH;
+ osd_op.op.watch.cookie = watch->winfo.cookie;
+ return std::vector{std::move(osd_op)};
+}
+
+Watch::~Watch()
+{
+ logger().debug("{} gid={} cookie={}", __func__, get_watcher_gid(), get_cookie());
+}
+
+seastar::future<> Watch::connect(crimson::net::ConnectionRef conn, bool)
+{
+ if (this->conn == conn) {
+ logger().debug("conn={} already connected", conn);
+ return seastar::now();
+ }
+ timeout_timer.cancel();
+ timeout_timer.arm(std::chrono::seconds{winfo.timeout_seconds});
+ this->conn = std::move(conn);
+ return seastar::now();
+}
+
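+// the caller is expected to have already reset `conn`; here we only (re)arm
+// the timeout timer so the watch is eventually removed if the client never
+// reconnects.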
+void Watch::disconnect()
+{
+ ceph_assert(!conn);
+ timeout_timer.cancel();
+ timeout_timer.arm(std::chrono::seconds{winfo.timeout_seconds});
+}
+
+seastar::future<> Watch::send_notify_msg(NotifyRef notify)
+{
+ logger().info("{} for notify(id={})", __func__, notify->ninfo.notify_id);
+ return conn->send(crimson::make_message<MWatchNotify>(
+ winfo.cookie,
+ notify->user_version,
+ notify->ninfo.notify_id,
+ CEPH_WATCH_EVENT_NOTIFY,
+ notify->ninfo.bl,
+ notify->client_gid));
+}
+
+seastar::future<> Watch::start_notify(NotifyRef notify)
+{
+ logger().debug("{} gid={} cookie={} starting notify(id={})",
+ __func__, get_watcher_gid(), get_cookie(),
+ notify->ninfo.notify_id);
+ auto [ it, emplaced ] = in_progress_notifies.emplace(std::move(notify));
+ ceph_assert(emplaced);
+ ceph_assert(is_alive());
+ return is_connected() ? send_notify_msg(*it) : seastar::now();
+}
+
+seastar::future<> Watch::notify_ack(
+ const uint64_t notify_id,
+ const ceph::bufferlist& reply_bl)
+{
+ logger().debug("{} gid={} cookie={} notify_id={}",
+ __func__, get_watcher_gid(), get_cookie(), notify_id);
+ const auto it = in_progress_notifies.find(notify_id);
+ if (it == std::end(in_progress_notifies)) {
+ logger().error("{} notify_id={} not found on the in-progess list."
+ " Supressing but this should not happen.",
+ __func__, notify_id);
+ return seastar::now();
+ }
+ auto notify = *it;
+ logger().debug("Watch::notify_ack gid={} cookie={} found notify(id={})",
+ get_watcher_gid(),
+ get_cookie(),
+ notify->get_id());
+  // let's ensure we're extending the life-time until the end of this method
+ static_assert(std::is_same_v<decltype(notify), NotifyRef>);
+ in_progress_notifies.erase(it);
+ return notify->complete_watcher(shared_from_this(), reply_bl);
+}
+
+seastar::future<> Watch::send_disconnect_msg()
+{
+ if (!is_connected()) {
+ return seastar::now();
+ }
+ ceph::bufferlist empty;
+ return conn->send(crimson::make_message<MWatchNotify>(
+ winfo.cookie,
+ 0,
+ 0,
+ CEPH_WATCH_EVENT_DISCONNECT,
+ empty));
+}
+
+void Watch::discard_state()
+{
+ logger().debug("{} gid={} cookie={}", __func__, get_watcher_gid(), get_cookie());
+ ceph_assert(obc);
+ in_progress_notifies.clear();
+ timeout_timer.cancel();
+}
+
+void Watch::got_ping(utime_t)
+{
+ if (is_connected()) {
+ // using cancel() + arm() as rearm() has no overload for time delta.
+ timeout_timer.cancel();
+ timeout_timer.arm(std::chrono::seconds{winfo.timeout_seconds});
+ }
+}
+
+seastar::future<> Watch::remove()
+{
+ logger().debug("{} gid={} cookie={}", __func__, get_watcher_gid(), get_cookie());
+ // in contrast to ceph-osd crimson sends CEPH_WATCH_EVENT_DISCONNECT directly
+ // from the timeout handler and _after_ CEPH_WATCH_EVENT_NOTIFY_COMPLETE.
+  // this simplifies the Watch::remove() interface as callers are no longer
+  // obliged to decide whether EVENT_DISCONNECT needs to be sent or not -- it
+ // becomes an implementation detail of Watch.
+ return seastar::do_for_each(in_progress_notifies,
+ [this_shared=shared_from_this()] (auto notify) {
+ logger().debug("Watch::remove gid={} cookie={} notify(id={})",
+ this_shared->get_watcher_gid(),
+ this_shared->get_cookie(),
+ notify->ninfo.notify_id);
+ return notify->remove_watcher(this_shared);
+ }).then([this] {
+ discard_state();
+ return seastar::now();
+ });
+}
+
+void Watch::cancel_notify(const uint64_t notify_id)
+{
+ logger().debug("{} gid={} cookie={} notify(id={})",
+ __func__, get_watcher_gid(), get_cookie(),
+ notify_id);
+ const auto it = in_progress_notifies.find(notify_id);
+ assert(it != std::end(in_progress_notifies));
+ in_progress_notifies.erase(it);
+}
+
+void Watch::do_watch_timeout()
+{
+ assert(pg);
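+  // the unwatch request runs detached; the disconnect notification is only
+  // chained after it so the client learns that its watch is gone.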
+ auto [op, fut] = pg->get_shard_services().start_operation<WatchTimeoutRequest>(
+ shared_from_this(), pg);
+ std::ignore = std::move(fut).then([op=std::move(op), this] {
+ return send_disconnect_msg();
+ });
+}
+
+bool notify_reply_t::operator<(const notify_reply_t& rhs) const
+{
+ // comparing std::pairs to emphasize our legacy. ceph-osd stores
+ // notify_replies as std::multimap<std::pair<gid, cookie>, bl>.
+  // unfortunately, what seems to be an implementation detail got
+ // exposed as part of our public API (the `reply_buffer` parameter
+ // of the `rados_notify` family).
+ const auto lhsp = std::make_pair(watcher_gid, watcher_cookie);
+ const auto rhsp = std::make_pair(rhs.watcher_gid, rhs.watcher_cookie);
+ return lhsp < rhsp;
+}
+
+std::ostream &operator<<(std::ostream &out, const notify_reply_t &rhs)
+{
+ out << "notify_reply_t{watcher_gid=" << rhs.watcher_gid
+ << ", watcher_cookie=" << rhs.watcher_cookie << "}";
+ return out;
+}
+
+Notify::Notify(crimson::net::ConnectionRef conn,
+ const notify_info_t& ninfo,
+ const uint64_t client_gid,
+ const uint64_t user_version)
+ : ninfo(ninfo),
+ conn(std::move(conn)),
+ client_gid(client_gid),
+ user_version(user_version)
+{}
+
+Notify::~Notify()
+{
+ logger().debug("{} for notify(id={})", __func__, ninfo.notify_id);
+}
+
+seastar::future<> Notify::remove_watcher(WatchRef watch)
+{
+ logger().debug("{} for notify(id={})", __func__, ninfo.notify_id);
+
+ if (discarded || complete) {
+ logger().debug("{} for notify(id={}) discarded/complete already"
+ " discarded: {} complete: {}", __func__,
+                   ninfo.notify_id, discarded, complete);
+ return seastar::now();
+ }
+ [[maybe_unused]] const auto num_removed = watchers.erase(watch);
+ assert(num_removed > 0);
+ if (watchers.empty()) {
+ complete = true;
+ [[maybe_unused]] bool was_armed = timeout_timer.cancel();
+ assert(was_armed);
+ return send_completion();
+ } else {
+ return seastar::now();
+ }
+}
+
+
+seastar::future<> Notify::complete_watcher(
+ WatchRef watch,
+ const ceph::bufferlist& reply_bl)
+{
+ logger().debug("{} for notify(id={})", __func__, ninfo.notify_id);
+
+ if (discarded || complete) {
+ logger().debug("{} for notify(id={}) discarded/complete already"
+ " discarded: {} complete: {}", __func__,
+                   ninfo.notify_id, discarded, complete);
+ return seastar::now();
+ }
+ notify_replies.emplace(notify_reply_t{
+ watch->get_watcher_gid(),
+ watch->get_cookie(),
+ reply_bl});
+ return remove_watcher(std::move(watch));
+}
+
+seastar::future<> Notify::send_completion(
+ std::set<WatchRef> timedout_watchers)
+{
+ logger().info("{} -- {} in progress watchers, timedout watchers {}",
+ __func__, watchers.size(), timedout_watchers.size());
+ logger().debug("{} sending notify replies: {}", __func__, notify_replies);
+
+ ceph::bufferlist empty;
+ auto reply = crimson::make_message<MWatchNotify>(
+ ninfo.cookie,
+ user_version,
+ ninfo.notify_id,
+ CEPH_WATCH_EVENT_NOTIFY_COMPLETE,
+ empty,
+ client_gid);
+ ceph::bufferlist reply_bl;
+ {
+ std::vector<std::pair<uint64_t,uint64_t>> missed;
+ missed.reserve(std::size(timedout_watchers));
+ boost::insert(
+ missed, std::begin(missed),
+ timedout_watchers | boost::adaptors::transformed([] (auto w) {
+ return std::make_pair(w->get_watcher_gid(), w->get_cookie());
+ }));
+ ceph::encode(notify_replies, reply_bl);
+ ceph::encode(missed, reply_bl);
+ }
+ reply->set_data(std::move(reply_bl));
+ if (!timedout_watchers.empty()) {
+ reply->return_code = -ETIMEDOUT;
+ }
+ return conn->send(std::move(reply));
+}
+
+void Notify::do_notify_timeout()
+{
+ logger().debug("{} complete={}", __func__, complete);
+ if (complete) {
+ return;
+ }
+  // it might be that `this` is kept alive only by the references the watchers
+  // hold, which are dropped by `cancel_notify()` below. to avoid
+  // use-after-free we bump up the ref counter with `guard_ptr`.
+ [[maybe_unused]] auto guard_ptr = shared_from_this();
+ for (auto& watcher : watchers) {
+ logger().debug("canceling watcher cookie={} gid={} use_count={}",
+ watcher->get_cookie(),
+ watcher->get_watcher_gid(),
+ watcher->use_count());
+ watcher->cancel_notify(ninfo.notify_id);
+ }
+ std::ignore = send_completion(std::move(watchers));
+ watchers.clear();
+}
+
+} // namespace crimson::osd
+
+#if FMT_VERSION >= 90000
+template <> struct fmt::formatter<crimson::osd::WatchTimeoutRequest> : fmt::ostream_formatter {};
+#endif
diff --git a/src/crimson/osd/watch.h b/src/crimson/osd/watch.h
new file mode 100644
index 000000000..b3982141d
--- /dev/null
+++ b/src/crimson/osd/watch.h
@@ -0,0 +1,256 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <iterator>
+#include <map>
+#include <set>
+
+#include <seastar/core/shared_ptr.hh>
+
+#include "crimson/net/Connection.h"
+#include "crimson/osd/object_context.h"
+#include "crimson/osd/pg.h"
+#include "include/denc.h"
+
+namespace crimson::osd {
+
+class Notify;
+using NotifyRef = seastar::shared_ptr<Notify>;
+
+// NOTE: the inheritance from enable_shared_from_this really needs to be
+// public. Otherwise `shared_from_this()` will abort. According to
+// cppreference.com:
+//
+// "The constructors of std::shared_ptr detect the presence
+// of an unambiguous and accessible (ie. public inheritance
+// is mandatory) (since C++17) enable_shared_from_this base".
+//
+// I expect the `seastar::shared_ptr` shares this behaviour.
+class Watch : public seastar::enable_shared_from_this<Watch> {
+  // this is a private tag for the public constructor that turns it into a
+  // de facto private one. The motivation behind the hack is that make_shared
+  // is used by create().
+ struct private_ctag_t{};
+
+ std::set<NotifyRef, std::less<>> in_progress_notifies;
+ crimson::net::ConnectionRef conn;
+ crimson::osd::ObjectContextRef obc;
+
+ watch_info_t winfo;
+ entity_name_t entity_name;
+ Ref<PG> pg;
+
+ seastar::timer<seastar::lowres_clock> timeout_timer;
+
+ seastar::future<> start_notify(NotifyRef);
+ seastar::future<> send_notify_msg(NotifyRef);
+ seastar::future<> send_disconnect_msg();
+
+ friend Notify;
+ friend class WatchTimeoutRequest;
+
+public:
+ Watch(private_ctag_t,
+ crimson::osd::ObjectContextRef obc,
+ const watch_info_t& winfo,
+ const entity_name_t& entity_name,
+ Ref<PG> pg)
+ : obc(std::move(obc)),
+ winfo(winfo),
+ entity_name(entity_name),
+ pg(std::move(pg)),
+ timeout_timer([this] {
+ return do_watch_timeout();
+ }) {
+ assert(this->pg);
+ }
+ ~Watch();
+
+ seastar::future<> connect(crimson::net::ConnectionRef, bool);
+ void disconnect();
+ bool is_alive() const {
+ return true;
+ }
+ bool is_connected() const {
+ return static_cast<bool>(conn);
+ }
+ void got_ping(utime_t);
+
+ void discard_state();
+
+ seastar::future<> remove();
+
+ /// Call when notify_ack received on notify_id
+ seastar::future<> notify_ack(
+ uint64_t notify_id, ///< [in] id of acked notify
+ const ceph::bufferlist& reply_bl); ///< [in] notify reply buffer
+
+ template <class... Args>
+ static seastar::shared_ptr<Watch> create(Args&&... args) {
+ return seastar::make_shared<Watch>(private_ctag_t{},
+ std::forward<Args>(args)...);
+ };
+
+ uint64_t get_watcher_gid() const {
+ return entity_name.num();
+ }
+ auto get_pg() const {
+ return pg;
+ }
+ auto& get_entity() const {
+ return entity_name;
+ }
+ auto& get_cookie() const {
+ return winfo.cookie;
+ }
+ auto& get_peer_addr() const {
+ return winfo.addr;
+ }
+ void cancel_notify(const uint64_t notify_id);
+ void do_watch_timeout();
+};
+
+using WatchRef = seastar::shared_ptr<Watch>;
+
+struct notify_reply_t {
+ uint64_t watcher_gid;
+ uint64_t watcher_cookie;
+ ceph::bufferlist bl;
+
+ bool operator<(const notify_reply_t& rhs) const;
+ DENC(notify_reply_t, v, p) {
+ // there is no versioning / preamble
+ denc(v.watcher_gid, p);
+ denc(v.watcher_cookie, p);
+ denc(v.bl, p);
+ }
+};
+std::ostream &operator<<(std::ostream &out, const notify_reply_t &rhs);
+
+class Notify : public seastar::enable_shared_from_this<Notify> {
+ std::set<WatchRef> watchers;
+ const notify_info_t ninfo;
+ crimson::net::ConnectionRef conn;
+ const uint64_t client_gid;
+ const uint64_t user_version;
+ bool complete{false};
+ bool discarded{false};
+ seastar::timer<seastar::lowres_clock> timeout_timer{
+ [this] { do_notify_timeout(); }
+ };
+
+ ~Notify();
+
+ /// (gid,cookie) -> reply_bl for everyone who acked the notify
+ std::multiset<notify_reply_t> notify_replies;
+
+ uint64_t get_id() const { return ninfo.notify_id; }
+
+ /// Sends notify completion if watchers.empty() or timeout
+ seastar::future<> send_completion(
+ std::set<WatchRef> timedout_watchers = {});
+
+ /// Called on Notify timeout
+ void do_notify_timeout();
+
+ Notify(crimson::net::ConnectionRef conn,
+ const notify_info_t& ninfo,
+ const uint64_t client_gid,
+ const uint64_t user_version);
+ template <class WatchIteratorT>
+ Notify(WatchIteratorT begin,
+ WatchIteratorT end,
+ crimson::net::ConnectionRef conn,
+ const notify_info_t& ninfo,
+ const uint64_t client_gid,
+ const uint64_t user_version);
+  // this is a private tag for the public constructor that turns it into a
+  // de facto private one. The motivation behind the hack is that make_shared
+  // is used by the create_n_propagate factory.
+ struct private_ctag_t{};
+
+ using ptr_t = seastar::shared_ptr<Notify>;
+ friend bool operator<(const ptr_t& lhs, const ptr_t& rhs) {
+ assert(lhs);
+ assert(rhs);
+ return lhs->get_id() < rhs->get_id();
+ }
+ friend bool operator<(const ptr_t& ptr, const uint64_t id) {
+ assert(ptr);
+ return ptr->get_id() < id;
+ }
+ friend bool operator<(const uint64_t id, const ptr_t& ptr) {
+ assert(ptr);
+ return id < ptr->get_id();
+ }
+
+ friend Watch;
+
+public:
+ template <class... Args>
+ Notify(private_ctag_t, Args&&... args) : Notify(std::forward<Args>(args)...) {
+ }
+
+ template <class WatchIteratorT, class... Args>
+ static seastar::future<> create_n_propagate(
+ WatchIteratorT begin,
+ WatchIteratorT end,
+ Args&&... args);
+
+ seastar::future<> remove_watcher(WatchRef watch);
+ seastar::future<> complete_watcher(WatchRef watch,
+ const ceph::bufferlist& reply_bl);
+};
+
+
+template <class WatchIteratorT>
+Notify::Notify(WatchIteratorT begin,
+ WatchIteratorT end,
+ crimson::net::ConnectionRef conn,
+ const notify_info_t& ninfo,
+ const uint64_t client_gid,
+ const uint64_t user_version)
+ : watchers(begin, end),
+ ninfo(ninfo),
+ conn(std::move(conn)),
+ client_gid(client_gid),
+ user_version(user_version) {
+ assert(!std::empty(watchers));
+ if (ninfo.timeout) {
+ timeout_timer.arm(std::chrono::seconds{ninfo.timeout});
+ }
+}
+
+template <class WatchIteratorT, class... Args>
+seastar::future<> Notify::create_n_propagate(
+ WatchIteratorT begin,
+ WatchIteratorT end,
+ Args&&... args)
+{
+ static_assert(
+ std::is_same_v<typename std::iterator_traits<WatchIteratorT>::value_type,
+ crimson::osd::WatchRef>);
+ if (begin == end) {
+ auto notify = seastar::make_shared<Notify>(
+ private_ctag_t{},
+ std::forward<Args>(args)...);
+ return notify->send_completion();
+ } else {
+ auto notify = seastar::make_shared<Notify>(
+ private_ctag_t{},
+ begin, end,
+ std::forward<Args>(args)...);
+ return seastar::do_for_each(begin, end, [=] (auto& watchref) {
+ return watchref->start_notify(notify);
+ });
+ }
+}
+
+} // namespace crimson::osd
+
+WRITE_CLASS_DENC(crimson::osd::notify_reply_t)
+
+#if FMT_VERSION >= 90000
+template <> struct fmt::formatter<crimson::osd::notify_reply_t> : fmt::ostream_formatter {};
+#endif