summaryrefslogtreecommitdiffstats
path: root/src/crimson/osd/pg_recovery.cc
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--src/crimson/osd/pg_recovery.cc569
1 files changed, 569 insertions, 0 deletions
diff --git a/src/crimson/osd/pg_recovery.cc b/src/crimson/osd/pg_recovery.cc
new file mode 100644
index 000000000..09b45779e
--- /dev/null
+++ b/src/crimson/osd/pg_recovery.cc
@@ -0,0 +1,569 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <fmt/format.h>
+#include <fmt/ostream.h>
+#include <fmt/ranges.h>
+
+#include "crimson/common/type_helpers.h"
+#include "crimson/osd/backfill_facades.h"
+#include "crimson/osd/osd_operations/background_recovery.h"
+#include "crimson/osd/osd_operations/peering_event.h"
+#include "crimson/osd/pg.h"
+#include "crimson/osd/pg_backend.h"
+#include "crimson/osd/pg_recovery.h"
+
+#include "osd/osd_types.h"
+#include "osd/PeeringState.h"
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_osd);
+ }
+}
+
+using std::map;
+using std::set;
+
+void PGRecovery::start_pglogbased_recovery()
+{
+ using PglogBasedRecovery = crimson::osd::PglogBasedRecovery;
+ (void) pg->get_shard_services().start_operation<PglogBasedRecovery>(
+ static_cast<crimson::osd::PG*>(pg),
+ pg->get_shard_services(),
+ pg->get_osdmap_epoch(),
+ float(0.001));
+}
+
+PGRecovery::interruptible_future<bool>
+PGRecovery::start_recovery_ops(
+ RecoveryBackend::RecoveryBlockingEvent::TriggerI& trigger,
+ size_t max_to_start)
+{
+ assert(pg->is_primary());
+ assert(pg->is_peered());
+ assert(pg->is_recovering());
+ // in ceph-osd the do_recovery() path handles both the pg log-based
+ // recovery and the backfill, albeit they are separated at the layer
+ // of PeeringState. In crimson-osd backfill has been cut from it, so
+ // and do_recovery() is actually solely for pg log-based recovery.
+ // At the time of writing it's considered to move it to FSM and fix
+ // the naming as well.
+ assert(!pg->is_backfilling());
+ assert(!pg->get_peering_state().is_deleting());
+
+ std::vector<interruptible_future<>> started;
+ started.reserve(max_to_start);
+ max_to_start -= start_primary_recovery_ops(trigger, max_to_start, &started);
+ if (max_to_start > 0) {
+ max_to_start -= start_replica_recovery_ops(trigger, max_to_start, &started);
+ }
+ using interruptor =
+ crimson::interruptible::interruptor<crimson::osd::IOInterruptCondition>;
+ return interruptor::parallel_for_each(started,
+ [] (auto&& ifut) {
+ return std::move(ifut);
+ }).then_interruptible([this] {
+ bool done = !pg->get_peering_state().needs_recovery();
+ if (done) {
+ logger().debug("start_recovery_ops: AllReplicasRecovered for pg: {}",
+ pg->get_pgid());
+ using LocalPeeringEvent = crimson::osd::LocalPeeringEvent;
+ if (!pg->get_peering_state().needs_backfill()) {
+ logger().debug("start_recovery_ops: AllReplicasRecovered for pg: {}",
+ pg->get_pgid());
+ (void) pg->get_shard_services().start_operation<LocalPeeringEvent>(
+ static_cast<crimson::osd::PG*>(pg),
+ pg->get_pg_whoami(),
+ pg->get_pgid(),
+ pg->get_osdmap_epoch(),
+ pg->get_osdmap_epoch(),
+ PeeringState::AllReplicasRecovered{});
+ } else {
+ logger().debug("start_recovery_ops: RequestBackfill for pg: {}",
+ pg->get_pgid());
+ (void) pg->get_shard_services().start_operation<LocalPeeringEvent>(
+ static_cast<crimson::osd::PG*>(pg),
+ pg->get_pg_whoami(),
+ pg->get_pgid(),
+ pg->get_osdmap_epoch(),
+ pg->get_osdmap_epoch(),
+ PeeringState::RequestBackfill{});
+ }
+ }
+ return seastar::make_ready_future<bool>(!done);
+ });
+}
+
+size_t PGRecovery::start_primary_recovery_ops(
+ RecoveryBackend::RecoveryBlockingEvent::TriggerI& trigger,
+ size_t max_to_start,
+ std::vector<PGRecovery::interruptible_future<>> *out)
+{
+ if (!pg->is_recovering()) {
+ return 0;
+ }
+
+ if (!pg->get_peering_state().have_missing()) {
+ pg->get_peering_state().local_recovery_complete();
+ return 0;
+ }
+
+ const auto &missing = pg->get_peering_state().get_pg_log().get_missing();
+
+ logger().info("{} recovering {} in pg {}, missing {}", __func__,
+ pg->get_recovery_backend()->total_recovering(),
+ *static_cast<crimson::osd::PG*>(pg),
+ missing);
+
+ unsigned started = 0;
+ int skipped = 0;
+
+ map<version_t, hobject_t>::const_iterator p =
+ missing.get_rmissing().lower_bound(pg->get_peering_state().get_pg_log().get_log().last_requested);
+ while (started < max_to_start && p != missing.get_rmissing().end()) {
+ // TODO: chain futures here to enable yielding to scheduler?
+ hobject_t soid;
+ version_t v = p->first;
+
+ auto it_objects = pg->get_peering_state().get_pg_log().get_log().objects.find(p->second);
+ if (it_objects != pg->get_peering_state().get_pg_log().get_log().objects.end()) {
+ // look at log!
+ pg_log_entry_t *latest = it_objects->second;
+ assert(latest->is_update() || latest->is_delete());
+ soid = latest->soid;
+ } else {
+ soid = p->second;
+ }
+ const pg_missing_item& item = missing.get_items().find(p->second)->second;
+ ++p;
+
+ hobject_t head = soid.get_head();
+
+ logger().info(
+ "{} {} item.need {} {} {} {} {}",
+ __func__,
+ soid,
+ item.need,
+ missing.is_missing(soid) ? " (missing)":"",
+ missing.is_missing(head) ? " (missing head)":"",
+ pg->get_recovery_backend()->is_recovering(soid) ? " (recovering)":"",
+ pg->get_recovery_backend()->is_recovering(head) ? " (recovering head)":"");
+
+ // TODO: handle lost/unfound
+ if (pg->get_recovery_backend()->is_recovering(soid)) {
+ auto& recovery_waiter = pg->get_recovery_backend()->get_recovering(soid);
+ out->emplace_back(recovery_waiter.wait_for_recovered(trigger));
+ ++started;
+ } else if (pg->get_recovery_backend()->is_recovering(head)) {
+ ++skipped;
+ } else {
+ out->emplace_back(recover_missing(trigger, soid, item.need));
+ ++started;
+ }
+
+ if (!skipped)
+ pg->get_peering_state().set_last_requested(v);
+ }
+
+ logger().info("{} started {} skipped {}", __func__, started, skipped);
+
+ return started;
+}
+
+size_t PGRecovery::start_replica_recovery_ops(
+ RecoveryBackend::RecoveryBlockingEvent::TriggerI& trigger,
+ size_t max_to_start,
+ std::vector<PGRecovery::interruptible_future<>> *out)
+{
+ if (!pg->is_recovering()) {
+ return 0;
+ }
+ uint64_t started = 0;
+
+ assert(!pg->get_peering_state().get_acting_recovery_backfill().empty());
+
+ auto recovery_order = get_replica_recovery_order();
+ for (auto &peer : recovery_order) {
+ assert(peer != pg->get_peering_state().get_primary());
+ const auto& pm = pg->get_peering_state().get_peer_missing(peer);
+
+ logger().debug("{}: peer osd.{} missing {} objects", __func__,
+ peer, pm.num_missing());
+ logger().trace("{}: peer osd.{} missing {}", __func__,
+ peer, pm.get_items());
+
+ // recover oldest first
+ for (auto p = pm.get_rmissing().begin();
+ p != pm.get_rmissing().end() && started < max_to_start;
+ ++p) {
+ const auto &soid = p->second;
+
+ if (pg->get_peering_state().get_missing_loc().is_unfound(soid)) {
+ logger().debug("{}: object {} still unfound", __func__, soid);
+ continue;
+ }
+
+ const pg_info_t &pi = pg->get_peering_state().get_peer_info(peer);
+ if (soid > pi.last_backfill) {
+ if (!pg->get_recovery_backend()->is_recovering(soid)) {
+ logger().error(
+ "{}: object {} in missing set for backfill (last_backfill {})"
+ " but not in recovering",
+ __func__,
+ soid,
+ pi.last_backfill);
+ ceph_abort();
+ }
+ continue;
+ }
+
+ if (pg->get_recovery_backend()->is_recovering(soid)) {
+ logger().debug("{}: already recovering object {}", __func__, soid);
+ auto& recovery_waiter = pg->get_recovery_backend()->get_recovering(soid);
+ out->emplace_back(recovery_waiter.wait_for_recovered(trigger));
+ started++;
+ continue;
+ }
+
+ if (pg->get_peering_state().get_missing_loc().is_deleted(soid)) {
+ logger().debug("{}: soid {} is a delete, removing", __func__, soid);
+ map<hobject_t,pg_missing_item>::const_iterator r =
+ pm.get_items().find(soid);
+ started++;
+ out->emplace_back(
+ prep_object_replica_deletes(trigger, soid, r->second.need));
+ continue;
+ }
+
+ if (soid.is_snap() &&
+ pg->get_peering_state().get_pg_log().get_missing().is_missing(
+ soid.get_head())) {
+ logger().debug("{}: head {} still missing on primary", __func__,
+ soid.get_head());
+ continue;
+ }
+
+ if (pg->get_peering_state().get_pg_log().get_missing().is_missing(soid)) {
+ logger().debug("{}: soid {} still missing on primary", __func__, soid);
+ continue;
+ }
+
+ logger().debug("{}: recover_object_replicas({})", __func__,soid);
+ map<hobject_t,pg_missing_item>::const_iterator r = pm.get_items().find(
+ soid);
+ started++;
+ out->emplace_back(
+ prep_object_replica_pushes(trigger, soid, r->second.need));
+ }
+ }
+
+ return started;
+}
+
+PGRecovery::interruptible_future<>
+PGRecovery::recover_missing(
+ RecoveryBackend::RecoveryBlockingEvent::TriggerI& trigger,
+ const hobject_t &soid, eversion_t need)
+{
+ if (pg->get_peering_state().get_missing_loc().is_deleted(soid)) {
+ return pg->get_recovery_backend()->add_recovering(soid).wait_track_blocking(
+ trigger,
+ pg->get_recovery_backend()->recover_delete(soid, need));
+ } else {
+ return pg->get_recovery_backend()->add_recovering(soid).wait_track_blocking(
+ trigger,
+ pg->get_recovery_backend()->recover_object(soid, need)
+ .handle_exception_interruptible(
+ [=, this, soid = std::move(soid)] (auto e) {
+ on_failed_recover({ pg->get_pg_whoami() }, soid, need);
+ return seastar::make_ready_future<>();
+ })
+ );
+ }
+}
+
+RecoveryBackend::interruptible_future<> PGRecovery::prep_object_replica_deletes(
+ RecoveryBackend::RecoveryBlockingEvent::TriggerI& trigger,
+ const hobject_t& soid,
+ eversion_t need)
+{
+ return pg->get_recovery_backend()->add_recovering(soid).wait_track_blocking(
+ trigger,
+ pg->get_recovery_backend()->push_delete(soid, need).then_interruptible(
+ [=, this] {
+ object_stat_sum_t stat_diff;
+ stat_diff.num_objects_recovered = 1;
+ on_global_recover(soid, stat_diff, true);
+ return seastar::make_ready_future<>();
+ })
+ );
+}
+
+RecoveryBackend::interruptible_future<> PGRecovery::prep_object_replica_pushes(
+ RecoveryBackend::RecoveryBlockingEvent::TriggerI& trigger,
+ const hobject_t& soid,
+ eversion_t need)
+{
+ return pg->get_recovery_backend()->add_recovering(soid).wait_track_blocking(
+ trigger,
+ pg->get_recovery_backend()->recover_object(soid, need)
+ .handle_exception_interruptible(
+ [=, this, soid = std::move(soid)] (auto e) {
+ on_failed_recover({ pg->get_pg_whoami() }, soid, need);
+ return seastar::make_ready_future<>();
+ })
+ );
+}
+
+void PGRecovery::on_local_recover(
+ const hobject_t& soid,
+ const ObjectRecoveryInfo& recovery_info,
+ const bool is_delete,
+ ceph::os::Transaction& t)
+{
+ if (const auto &log = pg->get_peering_state().get_pg_log();
+ !is_delete &&
+ log.get_missing().is_missing(recovery_info.soid) &&
+ log.get_missing().get_items().find(recovery_info.soid)->second.need > recovery_info.version) {
+ assert(pg->is_primary());
+ if (const auto* latest = log.get_log().objects.find(recovery_info.soid)->second;
+ latest->op == pg_log_entry_t::LOST_REVERT) {
+ ceph_abort("mark_unfound_lost (LOST_REVERT) is not implemented yet");
+ }
+ }
+ pg->get_peering_state().recover_got(soid,
+ recovery_info.version, is_delete, t);
+
+ if (pg->is_primary()) {
+ if (!is_delete) {
+ auto& obc = pg->get_recovery_backend()->get_recovering(soid).obc; //TODO: move to pg backend?
+ obc->obs.exists = true;
+ obc->obs.oi = recovery_info.oi;
+ }
+ if (!pg->is_unreadable_object(soid)) {
+ pg->get_recovery_backend()->get_recovering(soid).set_readable();
+ }
+ pg->publish_stats_to_osd();
+ }
+}
+
+void PGRecovery::on_global_recover (
+ const hobject_t& soid,
+ const object_stat_sum_t& stat_diff,
+ const bool is_delete)
+{
+ logger().info("{} {}", __func__, soid);
+ pg->get_peering_state().object_recovered(soid, stat_diff);
+ pg->publish_stats_to_osd();
+ auto& recovery_waiter = pg->get_recovery_backend()->get_recovering(soid);
+ if (!is_delete)
+ recovery_waiter.obc->drop_recovery_read();
+ recovery_waiter.set_recovered();
+ pg->get_recovery_backend()->remove_recovering(soid);
+}
+
+void PGRecovery::on_failed_recover(
+ const set<pg_shard_t>& from,
+ const hobject_t& soid,
+ const eversion_t& v)
+{
+ for (auto pg_shard : from) {
+ if (pg_shard != pg->get_pg_whoami()) {
+ pg->get_peering_state().force_object_missing(pg_shard, soid, v);
+ }
+ }
+}
+
+void PGRecovery::on_peer_recover(
+ pg_shard_t peer,
+ const hobject_t &oid,
+ const ObjectRecoveryInfo &recovery_info)
+{
+ crimson::get_logger(ceph_subsys_osd).debug(
+ "{}: {}, {} on {}", __func__, oid,
+ recovery_info.version, peer);
+ pg->get_peering_state().on_peer_recover(peer, oid, recovery_info.version);
+}
+
+void PGRecovery::_committed_pushed_object(epoch_t epoch,
+ eversion_t last_complete)
+{
+ if (!pg->has_reset_since(epoch)) {
+ pg->get_peering_state().recovery_committed_to(last_complete);
+ } else {
+ crimson::get_logger(ceph_subsys_osd).debug(
+ "{} pg has changed, not touching last_complete_ondisk",
+ __func__);
+ }
+}
+
+template <class EventT>
+void PGRecovery::start_backfill_recovery(const EventT& evt)
+{
+ using BackfillRecovery = crimson::osd::BackfillRecovery;
+ std::ignore = pg->get_shard_services().start_operation<BackfillRecovery>(
+ static_cast<crimson::osd::PG*>(pg),
+ pg->get_shard_services(),
+ pg->get_osdmap_epoch(),
+ evt);
+}
+
+void PGRecovery::request_replica_scan(
+ const pg_shard_t& target,
+ const hobject_t& begin,
+ const hobject_t& end)
+{
+ logger().debug("{}: target.osd={}", __func__, target.osd);
+ auto msg = crimson::make_message<MOSDPGScan>(
+ MOSDPGScan::OP_SCAN_GET_DIGEST,
+ pg->get_pg_whoami(),
+ pg->get_osdmap_epoch(),
+ pg->get_last_peering_reset(),
+ spg_t(pg->get_pgid().pgid, target.shard),
+ begin,
+ end);
+ std::ignore = pg->get_shard_services().send_to_osd(
+ target.osd,
+ std::move(msg),
+ pg->get_osdmap_epoch());
+}
+
+void PGRecovery::request_primary_scan(
+ const hobject_t& begin)
+{
+ logger().debug("{}", __func__);
+ using crimson::common::local_conf;
+ std::ignore = pg->get_recovery_backend()->scan_for_backfill(
+ begin,
+ local_conf()->osd_backfill_scan_min,
+ local_conf()->osd_backfill_scan_max
+ ).then_interruptible([this] (BackfillInterval bi) {
+ logger().debug("request_primary_scan:{}", __func__);
+ using BackfillState = crimson::osd::BackfillState;
+ start_backfill_recovery(BackfillState::PrimaryScanned{ std::move(bi) });
+ });
+}
+
+void PGRecovery::enqueue_push(
+ const hobject_t& obj,
+ const eversion_t& v)
+{
+ logger().debug("{}: obj={} v={}",
+ __func__, obj, v);
+ pg->get_recovery_backend()->add_recovering(obj);
+ std::ignore = pg->get_recovery_backend()->recover_object(obj, v).\
+ handle_exception_interruptible([] (auto) {
+ ceph_abort_msg("got exception on backfill's push");
+ return seastar::make_ready_future<>();
+ }).then_interruptible([this, obj] {
+ logger().debug("enqueue_push:{}", __func__);
+ using BackfillState = crimson::osd::BackfillState;
+ start_backfill_recovery(BackfillState::ObjectPushed(std::move(obj)));
+ });
+}
+
+void PGRecovery::enqueue_drop(
+ const pg_shard_t& target,
+ const hobject_t& obj,
+ const eversion_t& v)
+{
+ // allocate a pair if target is seen for the first time
+ auto& req = backfill_drop_requests[target];
+ if (!req) {
+ req = crimson::make_message<MOSDPGBackfillRemove>(
+ spg_t(pg->get_pgid().pgid, target.shard), pg->get_osdmap_epoch());
+ }
+ req->ls.emplace_back(obj, v);
+}
+
+void PGRecovery::maybe_flush()
+{
+ for (auto& [target, req] : backfill_drop_requests) {
+ std::ignore = pg->get_shard_services().send_to_osd(
+ target.osd,
+ std::move(req),
+ pg->get_osdmap_epoch());
+ }
+ backfill_drop_requests.clear();
+}
+
+void PGRecovery::update_peers_last_backfill(
+ const hobject_t& new_last_backfill)
+{
+ logger().debug("{}: new_last_backfill={}",
+ __func__, new_last_backfill);
+ // If new_last_backfill == MAX, then we will send OP_BACKFILL_FINISH to
+ // all the backfill targets. Otherwise, we will move last_backfill up on
+ // those targets need it and send OP_BACKFILL_PROGRESS to them.
+ for (const auto& bt : pg->get_peering_state().get_backfill_targets()) {
+ if (const pg_info_t& pinfo = pg->get_peering_state().get_peer_info(bt);
+ new_last_backfill > pinfo.last_backfill) {
+ pg->get_peering_state().update_peer_last_backfill(bt, new_last_backfill);
+ auto m = crimson::make_message<MOSDPGBackfill>(
+ pinfo.last_backfill.is_max() ? MOSDPGBackfill::OP_BACKFILL_FINISH
+ : MOSDPGBackfill::OP_BACKFILL_PROGRESS,
+ pg->get_osdmap_epoch(),
+ pg->get_last_peering_reset(),
+ spg_t(pg->get_pgid().pgid, bt.shard));
+ // Use default priority here, must match sub_op priority
+ // TODO: if pinfo.last_backfill.is_max(), then
+ // start_recovery_op(hobject_t::get_max());
+ m->last_backfill = pinfo.last_backfill;
+ m->stats = pinfo.stats;
+ std::ignore = pg->get_shard_services().send_to_osd(
+ bt.osd, std::move(m), pg->get_osdmap_epoch());
+ logger().info("{}: peer {} num_objects now {} / {}",
+ __func__,
+ bt,
+ pinfo.stats.stats.sum.num_objects,
+ pg->get_info().stats.stats.sum.num_objects);
+ }
+ }
+}
+
+bool PGRecovery::budget_available() const
+{
+ // TODO: the limits!
+ return true;
+}
+
+void PGRecovery::backfilled()
+{
+ using LocalPeeringEvent = crimson::osd::LocalPeeringEvent;
+ std::ignore = pg->get_shard_services().start_operation<LocalPeeringEvent>(
+ static_cast<crimson::osd::PG*>(pg),
+ pg->get_pg_whoami(),
+ pg->get_pgid(),
+ pg->get_osdmap_epoch(),
+ pg->get_osdmap_epoch(),
+ PeeringState::Backfilled{});
+}
+
+void PGRecovery::dispatch_backfill_event(
+ boost::intrusive_ptr<const boost::statechart::event_base> evt)
+{
+ logger().debug("{}", __func__);
+ backfill_state->process_event(evt);
+}
+
+void PGRecovery::on_backfill_reserved()
+{
+ logger().debug("{}", __func__);
+ // PIMP and depedency injection for the sake unittestability.
+ // I'm not afraid about the performance here.
+ using BackfillState = crimson::osd::BackfillState;
+ backfill_state = std::make_unique<BackfillState>(
+ *this,
+ std::make_unique<crimson::osd::PeeringFacade>(pg->get_peering_state()),
+ std::make_unique<crimson::osd::PGFacade>(
+ *static_cast<crimson::osd::PG*>(pg)));
+ // yes, it's **not** backfilling yet. The PG_STATE_BACKFILLING
+ // will be set after on_backfill_reserved() returns.
+ // Backfill needs to take this into consideration when scheduling
+ // events -- they must be mutually exclusive with PeeringEvent
+ // instances. Otherwise the execution might begin without having
+ // the state updated.
+ ceph_assert(!pg->get_peering_state().is_backfilling());
+ start_backfill_recovery(BackfillState::Triggered{});
+}