Diffstat (limited to 'src/osd/scrubber/scrub_backend.cc')
-rw-r--r--  src/osd/scrubber/scrub_backend.cc  1954
1 files changed, 1954 insertions, 0 deletions
diff --git a/src/osd/scrubber/scrub_backend.cc b/src/osd/scrubber/scrub_backend.cc
new file mode 100644
index 000000000..e25c5b99d
--- /dev/null
+++ b/src/osd/scrubber/scrub_backend.cc
@@ -0,0 +1,1954 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=2 sw=2 smarttab
+
+#include "./scrub_backend.h"
+
+#include <algorithm>
+
+#include <fmt/ranges.h>
+
+#include "common/debug.h"
+
+#include "include/utime_fmt.h"
+#include "messages/MOSDRepScrubMap.h"
+#include "osd/ECUtil.h"
+#include "osd/OSD.h"
+#include "osd/PG.h"
+#include "osd/PrimaryLogPG.h"
+#include "osd/osd_types_fmt.h"
+
+#include "pg_scrubber.h"
+
+using std::set;
+using std::stringstream;
+using std::vector;
+using namespace Scrub;
+using namespace std::chrono;
+using namespace std::chrono_literals;
+using namespace std::literals;
+
+#define dout_context (m_scrubber.get_pg_cct())
+#define dout_subsys ceph_subsys_osd
+#undef dout_prefix
+
+#define dout_prefix ScrubBackend::logger_prefix(_dout, this)
+
+std::ostream& ScrubBackend::logger_prefix(std::ostream* out,
+ const ScrubBackend* t)
+{
+ return t->m_scrubber.gen_prefix(*out) << " b.e.: ";
+}
+
+// ////////////////////////////////////////////////////////////////////////// //
+
+// for a Primary
+ScrubBackend::ScrubBackend(ScrubBeListener& scrubber,
+ PgScrubBeListener& pg,
+ pg_shard_t i_am,
+ bool repair,
+ scrub_level_t shallow_or_deep,
+ const std::set<pg_shard_t>& acting)
+ : m_scrubber{scrubber}
+ , m_pg{pg}
+ , m_pg_whoami{i_am}
+ , m_repair{repair}
+ , m_depth{shallow_or_deep}
+ , m_pg_id{scrubber.get_pgid()}
+ , m_pool{m_pg.get_pgpool()}
+ , m_incomplete_clones_allowed{m_pool.info.allow_incomplete_clones()}
+ , m_conf{m_scrubber.get_pg_cct()->_conf}
+ , clog{m_scrubber.get_logger()}
+{
+ m_formatted_id = m_pg_id.calc_name_sring();
+
+ m_acting_but_me.reserve(acting.size());
+ std::copy_if(acting.begin(),
+ acting.end(),
+ std::back_inserter(m_acting_but_me),
+ [i_am](const pg_shard_t& shard) { return shard != i_am; });
+
+ m_is_replicated = m_pool.info.is_replicated();
+ m_mode_desc =
+ (m_repair ? "repair"sv
+ : (m_depth == scrub_level_t::deep ? "deep-scrub"sv : "scrub"sv));
+}
+
+// for a Replica
+ScrubBackend::ScrubBackend(ScrubBeListener& scrubber,
+ PgScrubBeListener& pg,
+ pg_shard_t i_am,
+ bool repair,
+ scrub_level_t shallow_or_deep)
+ : m_scrubber{scrubber}
+ , m_pg{pg}
+ , m_pg_whoami{i_am}
+ , m_repair{repair}
+ , m_depth{shallow_or_deep}
+ , m_pg_id{scrubber.get_pgid()}
+ , m_pool{m_pg.get_pgpool()}
+ , m_conf{m_scrubber.get_pg_cct()->_conf}
+ , clog{m_scrubber.get_logger()}
+{
+ m_formatted_id = m_pg_id.calc_name_sring();
+ m_is_replicated = m_pool.info.is_replicated();
+ m_mode_desc =
+ (m_repair ? "repair"sv
+ : (m_depth == scrub_level_t::deep ? "deep-scrub"sv : "scrub"sv));
+}
+
+uint64_t ScrubBackend::logical_to_ondisk_size(uint64_t logical_size) const
+{
+ return m_pg.logical_to_ondisk_size(logical_size);
+}
+
+void ScrubBackend::update_repair_status(bool should_repair)
+{
+ dout(15) << __func__
+ << ": repair state set to :" << (should_repair ? "true" : "false")
+ << dendl;
+ m_repair = should_repair;
+ m_mode_desc =
+ (m_repair ? "repair"sv
+ : (m_depth == scrub_level_t::deep ? "deep-scrub"sv : "scrub"sv));
+}
+
+void ScrubBackend::new_chunk()
+{
+ dout(15) << __func__ << dendl;
+ this_chunk.emplace(m_pg_whoami);
+}
+
+ScrubMap& ScrubBackend::get_primary_scrubmap()
+{
+ return this_chunk->received_maps[m_pg_whoami];
+}
+
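+// Collect the union of all object ids appearing in any of the received maps
+// (the Primary's own map included) into this_chunk->authoritative_set.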
+void ScrubBackend::merge_to_authoritative_set()
+{
+ dout(15) << __func__ << dendl;
+ ceph_assert(m_scrubber.is_primary());
+ ceph_assert(this_chunk->authoritative_set.empty() &&
+ "the scrubber-backend should be empty");
+
+ if (g_conf()->subsys.should_gather<ceph_subsys_osd, 15>()) {
+ for (const auto& rpl : m_acting_but_me) {
+ dout(15) << fmt::format("{}: replica {} has {} items",
+ __func__,
+ rpl,
+ this_chunk->received_maps[rpl].objects.size())
+ << dendl;
+ }
+ }
+
+ // Construct the authoritative set of objects
+ for (const auto& map : this_chunk->received_maps) {
+ std::transform(map.second.objects.begin(),
+ map.second.objects.end(),
+ std::inserter(this_chunk->authoritative_set,
+ this_chunk->authoritative_set.end()),
+ [](const auto& i) { return i.first; });
+ }
+}
+
+ScrubMap& ScrubBackend::my_map()
+{
+ return this_chunk->received_maps[m_pg_whoami];
+}
+
+void ScrubBackend::decode_received_map(pg_shard_t from,
+ const MOSDRepScrubMap& msg)
+{
+ auto p = const_cast<bufferlist&>(msg.get_data()).cbegin();
+ this_chunk->received_maps[from].decode(p, m_pool.id);
+
+ dout(15) << __func__ << ": decoded map from : " << from
+ << ": versions: " << this_chunk->received_maps[from].valid_through
+ << " / " << msg.get_map_epoch() << dendl;
+}
+
+
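+// The replica-side counterpart of the Primary's meta-map handling in
+// scrub_compare_maps(): merge the replica's chunk map into the (empty)
+// cleaned-meta map, keep only the complete clone groups (clean_meta_map()),
+// and return the snap-mapper fixes detected by scan_snaps().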
+std::vector<snap_mapper_fix_t> ScrubBackend::replica_clean_meta(
+ ScrubMap& repl_map,
+ bool max_reached,
+ const hobject_t& start,
+ SnapMapReaderI& snaps_getter)
+{
+ dout(15) << __func__ << ": REPL META # " << m_cleaned_meta_map.objects.size()
+ << " objects" << dendl;
+ ceph_assert(!m_cleaned_meta_map.objects.size());
+ m_cleaned_meta_map.clear_from(start); // RRR how can this be required?
+ m_cleaned_meta_map.insert(repl_map);
+ auto for_meta_scrub = clean_meta_map(m_cleaned_meta_map, max_reached);
+ return scan_snaps(for_meta_scrub, snaps_getter);
+}
+
+
+// /////////////////////////////////////////////////////////////////////////////
+//
+// comparing the maps
+//
+// /////////////////////////////////////////////////////////////////////////////
+
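+// Primary only. Called once all the replicas' maps for this chunk have been
+// received: merge our own map into m_cleaned_meta_map, build the
+// authoritative object set, collect omap statistics, select the
+// authoritative copies (update_authoritative() / compare_smaps()), scrub the
+// snapshot metadata of the complete clone groups, and return the
+// inconsistent objects found, together with any required snap-mapper fixes.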
+objs_fix_list_t ScrubBackend::scrub_compare_maps(
+ bool max_reached,
+ SnapMapReaderI& snaps_getter)
+{
+ dout(10) << __func__ << " has maps, analyzing" << dendl;
+ ceph_assert(m_scrubber.is_primary());
+
+ // construct authoritative scrub map for type-specific scrubbing
+
+ m_cleaned_meta_map.insert(my_map());
+ merge_to_authoritative_set();
+
+ // collect some omap statistics into m_omap_stats
+ omap_checks();
+
+ update_authoritative();
+ auto for_meta_scrub = clean_meta_map(m_cleaned_meta_map, max_reached);
+
+ // ok, do the pg-type specific scrubbing
+
+ // (Validates consistency of the object info and snap sets)
+ scrub_snapshot_metadata(for_meta_scrub);
+
+ return objs_fix_list_t{std::move(this_chunk->m_inconsistent_objs),
+ scan_snaps(for_meta_scrub, snaps_getter)};
+}
+
+void ScrubBackend::omap_checks()
+{
+ const bool needs_omap_check = std::any_of(
+ this_chunk->received_maps.begin(),
+ this_chunk->received_maps.end(),
+ [](const auto& m) -> bool {
+ return m.second.has_large_omap_object_errors || m.second.has_omap_keys;
+ });
+
+ if (!needs_omap_check) {
+ return; // Nothing to do
+ }
+
+ stringstream wss;
+
+ // Iterate through objects and update omap stats
+ for (const auto& ho : this_chunk->authoritative_set) {
+
+ for (const auto& [srd, smap] : this_chunk->received_maps) {
+ if (srd != m_pg_whoami) {
+ // Only set omap stats for the primary
+ continue;
+ }
+
+ auto it = smap.objects.find(ho);
+ if (it == smap.objects.end()) {
+ continue;
+ }
+
+ const ScrubMap::object& smap_obj = it->second;
+ m_omap_stats.omap_bytes += smap_obj.object_omap_bytes;
+ m_omap_stats.omap_keys += smap_obj.object_omap_keys;
+ if (smap_obj.large_omap_object_found) {
+ auto osdmap = m_scrubber.get_osdmap();
+ pg_t pg;
+ osdmap->map_to_pg(ho.pool, ho.oid.name, ho.get_key(), ho.nspace, &pg);
+ pg_t mpg = osdmap->raw_pg_to_pg(pg);
+ m_omap_stats.large_omap_objects++;
+ wss << "Large omap object found. Object: " << ho << " PG: " << pg
+ << " (" << mpg << ")"
+ << " Key count: " << smap_obj.large_omap_object_key_count
+ << " Size (bytes): " << smap_obj.large_omap_object_value_size
+ << '\n';
+ break;
+ }
+ }
+ }
+
+ if (!wss.str().empty()) {
+ dout(5) << __func__ << ": " << wss.str() << dendl;
+ clog.warn(wss);
+ }
+}
+
+/*
+ * update_authoritative() updates:
+ *
+ * - m_auth_peers: adds obj-> list of pairs < scrub-map, shard>
+ *
+ * - m_cleaned_meta_map: replaces [obj] entry with:
+ * the relevant object in the scrub-map of the "selected" (back-most) peer
+ */
+void ScrubBackend::update_authoritative()
+{
+ dout(10) << __func__ << dendl;
+
+ if (m_acting_but_me.empty()) {
+ return;
+ }
+
+ compare_smaps(); // note: might cluster-log errors
+
+ // update the session-wide m_auth_peers with the list of good
+ // peers for each object (i.e. the ones that are in this_chunks's auth list)
+ for (auto& [obj, peers] : this_chunk->authoritative) {
+
+ auth_peers_t good_peers;
+
+ for (auto& peer : peers) {
+ good_peers.emplace_back(this_chunk->received_maps[peer].objects[obj],
+ peer);
+ }
+
+ m_auth_peers.emplace(obj, std::move(good_peers));
+ }
+
+ for (const auto& [obj, peers] : this_chunk->authoritative) {
+ m_cleaned_meta_map.objects.erase(obj);
+ m_cleaned_meta_map.objects.insert(
+ *(this_chunk->received_maps[peers.back()].objects.find(obj)));
+ }
+}
+
+int ScrubBackend::scrub_process_inconsistent()
+{
+ dout(20) << fmt::format("{}: {} (m_repair:{}) good peers tbl #: {}",
+ __func__,
+ m_mode_desc,
+ m_repair,
+ m_auth_peers.size())
+ << dendl;
+
+ ceph_assert(!m_auth_peers.empty());
+  // the authoritative-peers table only holds objects that are missing or
+  // inconsistent.
+
+ // some tests expect an error message that does not contain the __func__ and
+ // PG:
+ auto err_msg = fmt::format("{} {} {} missing, {} inconsistent objects",
+ m_formatted_id,
+ m_mode_desc,
+ m_missing.size(),
+ m_inconsistent.size());
+
+ dout(4) << err_msg << dendl;
+ clog.error() << err_msg;
+
+ ceph_assert(m_repair);
+ int fixed_cnt{0};
+
+ for (const auto& [hobj, shrd_list] : m_auth_peers) {
+
+ auto missing_entry = m_missing.find(hobj);
+
+ if (missing_entry != m_missing.end()) {
+ repair_object(hobj, shrd_list, missing_entry->second);
+ fixed_cnt += missing_entry->second.size();
+ }
+
+ if (m_inconsistent.count(hobj)) {
+ repair_object(hobj, shrd_list, m_inconsistent[hobj]);
+ fixed_cnt += m_inconsistent[hobj].size();
+ }
+ }
+ return fixed_cnt;
+}
+
+void ScrubBackend::repair_object(const hobject_t& soid,
+ const auth_peers_t& ok_peers,
+ const set<pg_shard_t>& bad_peers)
+{
+ if (g_conf()->subsys.should_gather<ceph_subsys_osd, 20>()) {
+ // log the good peers
+ set<pg_shard_t> ok_shards; // the shards from the ok_peers list
+ for (const auto& peer : ok_peers) {
+ ok_shards.insert(peer.second);
+ }
+ dout(10) << fmt::format(
+ "repair_object {} bad_peers osd.{{{}}}, ok_peers osd.{{{}}}",
+ soid,
+ bad_peers,
+ ok_shards)
+ << dendl;
+ }
+
+ const ScrubMap::object& po = ok_peers.back().first;
+
+ object_info_t oi;
+ try {
+ bufferlist bv;
+ if (po.attrs.count(OI_ATTR)) {
+ bv.push_back(po.attrs.find(OI_ATTR)->second);
+ }
+ auto bliter = bv.cbegin();
+ decode(oi, bliter);
+ } catch (...) {
+ dout(0) << __func__
+ << ": Need version of replica, bad object_info_t: " << soid
+ << dendl;
+ ceph_abort();
+ }
+
+ if (bad_peers.count(m_pg.get_primary())) {
+ // We should only be scrubbing if the PG is clean.
+ ceph_assert(!m_pg.is_waiting_for_unreadable_object());
+ dout(10) << __func__ << ": primary = " << m_pg.get_primary() << dendl;
+ }
+
+ // No need to pass ok_peers, they must not be missing the object, so
+ // force_object_missing will add them to missing_loc anyway
+ m_pg.force_object_missing(ScrubberPasskey{}, bad_peers, soid, oi.version);
+}
+
+
+// /////////////////////////////////////////////////////////////////////////////
+//
+// components formerly of PGBackend::be_compare_scrubmaps()
+//
+// /////////////////////////////////////////////////////////////////////////////
+
+using usable_t = shard_as_auth_t::usable_t;
+
+
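+// the number of digests (data + omap) recorded in the object info. Used to
+// break version ties when selecting the authoritative copy: all else being
+// equal, prefer the shard whose object info carries more digests.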
+static inline int dcount(const object_info_t& oi)
+{
+ return (oi.is_data_digest() ? 1 : 0) + (oi.is_omap_digest() ? 1 : 0);
+}
+
+auth_selection_t ScrubBackend::select_auth_object(const hobject_t& ho,
+ stringstream& errstream)
+{
+ // Create a list of shards (with the Primary first, so that it will be
+ // auth-copy, all other things being equal)
+
+ /// \todo: consider sorting the candidate shards by the conditions for
+ /// selecting best auth source below. Then - stopping on the first one
+ /// that is auth eligible.
+ /// This creates an issue with 'digest_match' that should be handled.
+ std::list<pg_shard_t> shards;
+ for (const auto& [srd, smap] : this_chunk->received_maps) {
+ if (srd != m_pg_whoami) {
+ shards.push_back(srd);
+ }
+ }
+ shards.push_front(m_pg_whoami);
+
+ auth_selection_t ret_auth;
+ ret_auth.auth = this_chunk->received_maps.end();
+ eversion_t auth_version;
+
+ for (auto& l : shards) {
+
+ auto shard_ret = possible_auth_shard(ho, l, ret_auth.shard_map);
+
+ // digest_match will only be true if computed digests are the same
+ if (auth_version != eversion_t() &&
+ ret_auth.auth->second.objects[ho].digest_present &&
+ shard_ret.digest.has_value() &&
+ ret_auth.auth->second.objects[ho].digest != *shard_ret.digest) {
+
+ ret_auth.digest_match = false;
+ dout(10) << fmt::format(
+ "{}: digest_match = false, {} data_digest 0x{:x} != "
+ "data_digest 0x{:x}",
+ __func__,
+ ho,
+ ret_auth.auth->second.objects[ho].digest,
+ *shard_ret.digest)
+ << dendl;
+ }
+
+ dout(20)
+ << fmt::format("{}: {} shard {} got:{:D}", __func__, ho, l, shard_ret)
+ << dendl;
+
+ if (shard_ret.possible_auth == shard_as_auth_t::usable_t::not_usable) {
+
+ // Don't use this particular shard due to previous errors
+      // XXX: For now we can't pick one shard for repair and use another
+      // shard's object info or snapset
+
+ ceph_assert(shard_ret.error_text.length());
+ errstream << m_pg_id.pgid << " shard " << l << " soid " << ho << " : "
+ << shard_ret.error_text << "\n";
+
+ } else if (shard_ret.possible_auth ==
+ shard_as_auth_t::usable_t::not_found) {
+
+ // do not emit the returned error message to the log
+ dout(15) << fmt::format("{}: {} not found on shard {}", __func__, ho, l)
+ << dendl;
+ } else {
+
+ dout(30) << fmt::format("{}: consider using {} srv: {} oi soid: {}",
+ __func__,
+ l,
+ shard_ret.oi.version,
+ shard_ret.oi.soid)
+ << dendl;
+
+ // consider using this shard as authoritative. Is it more recent?
+
+ if (auth_version == eversion_t() || shard_ret.oi.version > auth_version ||
+ (shard_ret.oi.version == auth_version &&
+ dcount(shard_ret.oi) > dcount(ret_auth.auth_oi))) {
+
+ dout(20) << fmt::format("{}: using {} moved auth oi {:p} <-> {:p}",
+ __func__,
+ l,
+ (void*)&ret_auth.auth_oi,
+ (void*)&shard_ret.oi)
+ << dendl;
+
+ ret_auth.auth = shard_ret.auth_iter;
+ ret_auth.auth_shard = ret_auth.auth->first;
+ ret_auth.auth_oi = shard_ret.oi;
+ auth_version = shard_ret.oi.version;
+ ret_auth.is_auth_available = true;
+ }
+ }
+ }
+
+ dout(10) << fmt::format("{}: selecting osd {} for obj {} with oi {}",
+ __func__,
+ ret_auth.auth_shard,
+ ho,
+ ret_auth.auth_oi)
+ << dendl;
+
+ return ret_auth;
+}
+
+using set_sinfo_err_t = void (shard_info_wrapper::*)();
+
+inline static const char* sep(bool& prev_err)
+{
+ if (prev_err) {
+ return ", ";
+ } else {
+ prev_err = true;
+ return "";
+ }
+}
+
+// retval: should we continue with the tests
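+// If 'pred' holds, the error is recorded by invoking 'sete' on the shard
+// info, 'msg' is appended to 'errstream' (with a separator when needed), and
+// 'continue_on_err' is returned. Otherwise - returns true ("continue").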
+static inline bool dup_error_cond(bool& prev_err,
+ bool continue_on_err,
+ bool pred,
+ shard_info_wrapper& si,
+ set_sinfo_err_t sete,
+ std::string_view msg,
+ stringstream& errstream)
+{
+ if (pred) {
+ (si.*sete)();
+ errstream << sep(prev_err) << msg;
+ return continue_on_err;
+ }
+ return true;
+}
+
+/**
+ * calls a shard_info_wrapper function, but only if the error predicate is
+ * true.
+ * Returns a copy of the error status.
+ */
+static inline bool test_error_cond(bool error_pred,
+ shard_info_wrapper& si,
+ set_sinfo_err_t sete)
+{
+ if (error_pred) {
+ (si.*sete)();
+ }
+ return error_pred;
+}
+
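+// Examine one shard as a candidate for holding the authoritative copy of
+// 'obj'. Returns 'not_found' if that shard's map does not contain the
+// object; 'not_usable' (with an explanatory text) if read/stat errors,
+// missing or corrupt attributes, or a size mismatch against the object info
+// disqualify it; otherwise - the decoded object info, an iterator into the
+// received maps, and the data digest (if present).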
+shard_as_auth_t ScrubBackend::possible_auth_shard(const hobject_t& obj,
+ const pg_shard_t& srd,
+ shard_info_map_t& shard_map)
+{
+  // Notes on the data used here:
+  // - the received maps (originally passed in as 'this_chunk->maps') are
+  //   accessed as this_chunk->received_maps;
+  // - 'auth_oi' (an uninitialized out-parameter at the original call site)
+  //   is now created here and returned;
+  // - 'shard_map' is the one created in select_auth_object(), and is used
+  //   to access the per-shard 'shard_info'.
+
+ const auto j = this_chunk->received_maps.find(srd);
+ const auto& j_shard = j->first;
+ const auto& j_smap = j->second;
+ auto i = j_smap.objects.find(obj);
+ if (i == j_smap.objects.end()) {
+ return shard_as_auth_t{};
+ }
+ const auto& smap_obj = i->second;
+
+ auto& shard_info = shard_map[j_shard];
+ if (j_shard == m_pg_whoami) {
+ shard_info.primary = true;
+ }
+
+ stringstream errstream; // for this shard
+
+ bool err{false};
+ dup_error_cond(err,
+ true,
+ smap_obj.read_error,
+ shard_info,
+ &shard_info_wrapper::set_read_error,
+ "candidate had a read error"sv,
+ errstream);
+ dup_error_cond(err,
+ true,
+ smap_obj.ec_hash_mismatch,
+ shard_info,
+ &shard_info_wrapper::set_ec_hash_mismatch,
+ "candidate had an ec hash mismatch"sv,
+ errstream);
+ dup_error_cond(err,
+ true,
+ smap_obj.ec_size_mismatch,
+ shard_info,
+ &shard_info_wrapper::set_ec_size_mismatch,
+ "candidate had an ec size mismatch"sv,
+ errstream);
+
+ if (!dup_error_cond(err,
+ false,
+ smap_obj.stat_error,
+ shard_info,
+ &shard_info_wrapper::set_stat_error,
+ "candidate had a stat error"sv,
+ errstream)) {
+ // With stat_error no further checking
+ // We don't need to also see a missing_object_info_attr
+ return shard_as_auth_t{errstream.str()};
+ }
+
+ // We won't pick an auth copy if the snapset is missing or won't decode.
+ ceph_assert(!obj.is_snapdir());
+
+ if (obj.is_head()) {
+ auto k = smap_obj.attrs.find(SS_ATTR);
+ if (dup_error_cond(err,
+ false,
+ (k == smap_obj.attrs.end()),
+ shard_info,
+ &shard_info_wrapper::set_snapset_missing,
+ "candidate had a missing snapset key"sv,
+ errstream)) {
+ bufferlist ss_bl;
+ SnapSet snapset;
+ ss_bl.push_back(k->second);
+ try {
+ auto bliter = ss_bl.cbegin();
+ decode(snapset, bliter);
+ } catch (...) {
+ // invalid snapset, probably corrupt
+ dup_error_cond(err,
+ false,
+ true,
+ shard_info,
+ &shard_info_wrapper::set_snapset_corrupted,
+ "candidate had a corrupt snapset"sv,
+ errstream);
+ }
+ } else {
+ // debug@dev only
+ dout(30) << fmt::format(
+ "{} missing snap addr: {:p} shard_info: {:p} er: {:x}",
+ __func__,
+ (void*)&smap_obj,
+ (void*)&shard_info,
+ shard_info.errors)
+ << dendl;
+ }
+ }
+
+ if (!m_is_replicated) {
+ auto k = smap_obj.attrs.find(ECUtil::get_hinfo_key());
+ if (dup_error_cond(err,
+ false,
+ (k == smap_obj.attrs.end()),
+ shard_info,
+ &shard_info_wrapper::set_hinfo_missing,
+ "candidate had a missing hinfo key"sv,
+ errstream)) {
+ bufferlist hk_bl;
+ ECUtil::HashInfo hi;
+ hk_bl.push_back(k->second);
+ try {
+ auto bliter = hk_bl.cbegin();
+ decode(hi, bliter);
+ } catch (...) {
+ dup_error_cond(err,
+ false,
+ true,
+ shard_info,
+ &shard_info_wrapper::set_hinfo_corrupted,
+ "candidate had a corrupt hinfo"sv,
+ errstream);
+ }
+ }
+ }
+
+ object_info_t oi;
+
+ {
+ auto k = smap_obj.attrs.find(OI_ATTR);
+ if (!dup_error_cond(err,
+ false,
+ (k == smap_obj.attrs.end()),
+ shard_info,
+ &shard_info_wrapper::set_info_missing,
+ "candidate had a missing info key"sv,
+ errstream)) {
+ // no object info on object, probably corrupt
+ return shard_as_auth_t{errstream.str()};
+ }
+
+ bufferlist bl;
+ bl.push_back(k->second);
+ try {
+ auto bliter = bl.cbegin();
+ decode(oi, bliter);
+ } catch (...) {
+ // invalid object info, probably corrupt
+ if (!dup_error_cond(err,
+ false,
+ true,
+ shard_info,
+ &shard_info_wrapper::set_info_corrupted,
+ "candidate had a corrupt info"sv,
+ errstream)) {
+ return shard_as_auth_t{errstream.str()};
+ }
+ }
+ }
+
+ // This is automatically corrected in repair_oinfo_oid()
+ ceph_assert(oi.soid == obj);
+
+ if (test_error_cond(smap_obj.size != logical_to_ondisk_size(oi.size),
+ shard_info,
+ &shard_info_wrapper::set_obj_size_info_mismatch)) {
+
+ errstream << sep(err) << "candidate size " << smap_obj.size << " info size "
+ << logical_to_ondisk_size(oi.size) << " mismatch";
+ }
+
+ std::optional<uint32_t> digest;
+ if (smap_obj.digest_present) {
+ digest = smap_obj.digest;
+ }
+
+ if (shard_info.errors) {
+ ceph_assert(err);
+ return shard_as_auth_t{errstream.str(), digest};
+ }
+
+ ceph_assert(!err);
+ // note that the error text is made available to the caller, even
+ // for a successful shard selection
+ return shard_as_auth_t{oi, j, errstream.str(), digest};
+}
+
+// re-implementation of PGBackend::be_compare_scrubmaps()
+void ScrubBackend::compare_smaps()
+{
+ dout(10) << __func__
+ << ": authoritative-set #: " << this_chunk->authoritative_set.size()
+ << dendl;
+
+ std::for_each(this_chunk->authoritative_set.begin(),
+ this_chunk->authoritative_set.end(),
+ [this](const auto& ho) {
+ if (auto maybe_clust_err = compare_obj_in_maps(ho);
+ maybe_clust_err) {
+ clog.error() << *maybe_clust_err;
+ }
+ });
+}
+
+std::optional<std::string> ScrubBackend::compare_obj_in_maps(
+ const hobject_t& ho)
+{
+ // clear per-object data:
+ this_chunk->cur_inconsistent.clear();
+ this_chunk->cur_missing.clear();
+ this_chunk->fix_digest = false;
+
+ stringstream candidates_errors;
+ auto auth_res = select_auth_object(ho, candidates_errors);
+ if (candidates_errors.str().size()) {
+ // a collection of shard-specific errors detected while
+ // finding the best shard to serve as authoritative
+ clog.error() << candidates_errors.str();
+ }
+
+ inconsistent_obj_wrapper object_error{ho};
+ if (!auth_res.is_auth_available) {
+ // no auth selected
+ object_error.set_version(0);
+ object_error.set_auth_missing(ho,
+ this_chunk->received_maps,
+ auth_res.shard_map,
+ this_chunk->m_error_counts.shallow_errors,
+ this_chunk->m_error_counts.deep_errors,
+ m_pg_whoami);
+
+ if (object_error.has_deep_errors()) {
+ this_chunk->m_error_counts.deep_errors++;
+ } else if (object_error.has_shallow_errors()) {
+ this_chunk->m_error_counts.shallow_errors++;
+ }
+
+ this_chunk->m_inconsistent_objs.push_back(std::move(object_error));
+ return fmt::format("{} soid {} : failed to pick suitable object info\n",
+ m_scrubber.get_pgid().pgid,
+ ho);
+ }
+
+ stringstream errstream;
+ auto& auth = auth_res.auth;
+
+ // an auth source was selected
+
+ object_error.set_version(auth_res.auth_oi.user_version);
+ ScrubMap::object& auth_object = auth->second.objects[ho];
+ ceph_assert(!this_chunk->fix_digest);
+
+ auto [auths, objerrs] =
+ match_in_shards(ho, auth_res, object_error, errstream);
+
+ auto opt_ers =
+ for_empty_auth_list(std::move(auths),
+ std::move(objerrs),
+ auth,
+ ho,
+ errstream);
+
+ if (opt_ers.has_value()) {
+
+ // At this point auth_list is populated, so we add the object error
+ // shards as inconsistent.
+ inconsistents(ho,
+ auth_object,
+ auth_res.auth_oi,
+ std::move(*opt_ers),
+ errstream);
+ } else {
+
+ // both the auth & errs containers are empty
+ errstream << m_pg_id << " soid " << ho << " : empty auth list\n";
+ }
+
+ if (object_error.has_deep_errors()) {
+ this_chunk->m_error_counts.deep_errors++;
+ } else if (object_error.has_shallow_errors()) {
+ this_chunk->m_error_counts.shallow_errors++;
+ }
+
+ if (object_error.errors || object_error.union_shards.errors) {
+ this_chunk->m_inconsistent_objs.push_back(std::move(object_error));
+ }
+
+ if (errstream.str().empty()) {
+ return std::nullopt;
+ } else {
+ return errstream.str();
+ }
+}
+
+
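+// Handle the case where match_in_shards() produced an empty auth list: if
+// some shards reported object errors, promote one of them (preferring the
+// selected auth shard) into the auth list, so that repair still has a source
+// to work from. If both lists are empty - log the failure and return nullopt.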
+std::optional<ScrubBackend::auth_and_obj_errs_t>
+ScrubBackend::for_empty_auth_list(std::list<pg_shard_t>&& auths,
+ std::set<pg_shard_t>&& obj_errors,
+ shard_to_scrubmap_t::iterator auth,
+ const hobject_t& ho,
+ stringstream& errstream)
+{
+ if (auths.empty()) {
+ if (obj_errors.empty()) {
+ errstream << m_pg_id << " soid " << ho
+ << " : failed to pick suitable auth object\n";
+ return std::nullopt;
+ }
+ // Object errors exist and nothing in auth_list
+ // Prefer the auth shard, otherwise take first from list.
+ pg_shard_t shard;
+ if (obj_errors.count(auth->first)) {
+ shard = auth->first;
+ } else {
+ shard = *(obj_errors.begin());
+ }
+
+ auths.push_back(shard);
+ obj_errors.erase(shard);
+ }
+
+ return ScrubBackend::auth_and_obj_errs_t{std::move(auths),
+ std::move(obj_errors)};
+}
+
+
+/// \todo replace the errstream with a member of this_chunk (preferably a
+/// fmt buffer), so that it can be used directly in should_fix_digest()
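+//
+// Record the results of comparing one object across all shards: remember the
+// per-object missing/inconsistent shard sets, register the authoritative
+// shards when there is something to repair, queue a digest fix if
+// match_in_shards() flagged one, and - for replicated pools with no other
+// errors - decide (via should_fix_digest()) whether the recorded data/omap
+// digests themselves should be updated.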
+void ScrubBackend::inconsistents(const hobject_t& ho,
+ ScrubMap::object& auth_object,
+ object_info_t& auth_oi,
+ auth_and_obj_errs_t&& auth_n_errs,
+ stringstream& errstream)
+{
+ auto& object_errors = auth_n_errs.object_errors;
+ auto& auth_list = auth_n_errs.auth_list;
+
+ this_chunk->cur_inconsistent.insert(object_errors.begin(),
+ object_errors.end()); // merge?
+
+ dout(15) << fmt::format(
+ "{}: object errors #: {} auth list #: {} cur_missing #: {} "
+ "cur_incon #: {}",
+ __func__,
+ object_errors.size(),
+ auth_list.size(),
+ this_chunk->cur_missing.size(),
+ this_chunk->cur_inconsistent.size())
+ << dendl;
+
+
+ if (!this_chunk->cur_missing.empty()) {
+ m_missing[ho] = this_chunk->cur_missing;
+ }
+ if (!this_chunk->cur_inconsistent.empty()) {
+ m_inconsistent[ho] = this_chunk->cur_inconsistent;
+ }
+
+ if (this_chunk->fix_digest) {
+
+ ceph_assert(auth_object.digest_present);
+ std::optional<uint32_t> data_digest{auth_object.digest};
+
+ std::optional<uint32_t> omap_digest;
+ if (auth_object.omap_digest_present) {
+ omap_digest = auth_object.omap_digest;
+ }
+ this_chunk->missing_digest.push_back(
+ make_pair(ho, make_pair(data_digest, omap_digest)));
+ }
+
+ if (!this_chunk->cur_inconsistent.empty() ||
+ !this_chunk->cur_missing.empty()) {
+
+ this_chunk->authoritative[ho] = auth_list;
+
+ } else if (!this_chunk->fix_digest && m_is_replicated) {
+
+ auto is_to_fix =
+ should_fix_digest(ho, auth_object, auth_oi, m_repair, errstream);
+
+ switch (is_to_fix) {
+
+ case digest_fixing_t::no:
+ break;
+
+ case digest_fixing_t::if_aged: {
+ utime_t age = this_chunk->started - auth_oi.local_mtime;
+
+ // \todo find out 'age_limit' only once
+ const auto age_limit = m_conf->osd_deep_scrub_update_digest_min_age;
+
+ if (age <= age_limit) {
+ dout(20) << __func__ << ": missing digest but age (" << age
+ << ") < conf (" << age_limit << ") on " << ho << dendl;
+ break;
+ }
+ }
+
+ [[fallthrough]];
+
+ case digest_fixing_t::force:
+
+ std::optional<uint32_t> data_digest;
+ if (auth_object.digest_present) {
+ data_digest = auth_object.digest;
+ dout(20) << __func__ << ": will update data digest on " << ho
+ << dendl;
+ }
+
+ std::optional<uint32_t> omap_digest;
+ if (auth_object.omap_digest_present) {
+ omap_digest = auth_object.omap_digest;
+ dout(20) << __func__ << ": will update omap digest on " << ho
+ << dendl;
+ }
+ this_chunk->missing_digest.push_back(
+ make_pair(ho, make_pair(data_digest, omap_digest)));
+ break;
+ }
+ }
+}
+
+/// \todo consider changing to use format() and to return the strings
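+// Decide whether the recorded digests in the authoritative object info should
+// be updated: 'if_aged' when a digest was computed but never recorded (only
+// update if the object is old enough), 'force' when repairing and a recorded
+// digest disagrees with the on-disk one, 'no' otherwise.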
+ScrubBackend::digest_fixing_t ScrubBackend::should_fix_digest(
+ const hobject_t& ho,
+ const ScrubMap::object& auth_object,
+ const object_info_t& auth_oi,
+ bool repair_flag,
+ stringstream& errstream)
+{
+ digest_fixing_t update{digest_fixing_t::no};
+
+ if (auth_object.digest_present && !auth_oi.is_data_digest()) {
+ dout(15) << __func__ << " missing data digest on " << ho << dendl;
+ update = digest_fixing_t::if_aged;
+ }
+
+ if (auth_object.omap_digest_present && !auth_oi.is_omap_digest()) {
+ dout(15) << __func__ << " missing omap digest on " << ho << dendl;
+ update = digest_fixing_t::if_aged;
+ }
+
+ // recorded digest != actual digest?
+ if (auth_oi.is_data_digest() && auth_object.digest_present &&
+ auth_oi.data_digest != auth_object.digest) {
+ errstream << m_pg_id << " recorded data digest 0x" << std::hex
+ << auth_oi.data_digest << " != on disk 0x" << auth_object.digest
+ << std::dec << " on " << auth_oi.soid << "\n";
+ if (repair_flag)
+ update = digest_fixing_t::force;
+ }
+
+ if (auth_oi.is_omap_digest() && auth_object.omap_digest_present &&
+ auth_oi.omap_digest != auth_object.omap_digest) {
+ errstream << m_pg_id << " recorded omap digest 0x" << std::hex
+ << auth_oi.omap_digest << " != on disk 0x"
+ << auth_object.omap_digest << std::dec << " on " << auth_oi.soid
+ << "\n";
+ if (repair_flag)
+ update = digest_fixing_t::force;
+ }
+
+ return update;
+}
+
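+// Compare each shard's copy of 'ho' against the selected authoritative copy.
+// Shards that match are returned in the auth list; shards with discrepancies
+// that cannot be attributed to a specific shard go into the object-errors
+// set; shards with per-shard errors are marked inconsistent, and shards that
+// do not hold the object at all are marked missing. Also detects the case
+// where all replicas agree with each other but not with the object info, and
+// flags it for a digest fix.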
+ScrubBackend::auth_and_obj_errs_t ScrubBackend::match_in_shards(
+ const hobject_t& ho,
+ auth_selection_t& auth_sel,
+ inconsistent_obj_wrapper& obj_result,
+ stringstream& errstream)
+{
+ std::list<pg_shard_t> auth_list; // out "param" to
+ std::set<pg_shard_t> object_errors; // be returned
+
+ for (auto& [srd, smap] : this_chunk->received_maps) {
+
+ if (srd == auth_sel.auth_shard) {
+ auth_sel.shard_map[auth_sel.auth_shard].selected_oi = true;
+ }
+
+ if (smap.objects.count(ho)) {
+
+ // the scrub-map has our object
+ auth_sel.shard_map[srd].set_object(smap.objects[ho]);
+
+ // Compare
+ stringstream ss;
+ const auto& auth_object = auth_sel.auth->second.objects[ho];
+ const bool discrep_found = compare_obj_details(auth_sel.auth_shard,
+ auth_object,
+ auth_sel.auth_oi,
+ smap.objects[ho],
+ auth_sel.shard_map[srd],
+ obj_result,
+ ss,
+ ho.has_snapset());
+
+ dout(20) << fmt::format(
+ "{}: {}{} <{}:{}> shards: {} {} {}", __func__,
+ (m_repair ? "repair " : ""),
+ (m_is_replicated ? "replicated " : ""), srd,
+ (srd == auth_sel.auth_shard ? "auth" : "-"),
+ auth_sel.shard_map.size(),
+ (auth_sel.digest_match ? " digest_match " : " "),
+ (auth_sel.shard_map[srd].only_data_digest_mismatch_info()
+ ? "'info mismatch info'"
+ : ""))
+ << dendl;
+ if (discrep_found) {
+ dout(10) << fmt::format(
+ "{}: <{}> auth:{} ({}/{}) vs {} ({}/{}) {}", __func__, ho,
+ auth_sel.auth_shard, auth_object.omap_digest_present,
+ auth_object.omap_digest, srd,
+ smap.objects[ho].omap_digest_present ? true : false,
+ smap.objects[ho].omap_digest, ss.str())
+ << dendl;
+ }
+
+      // If all replicas match each other but do not match the object_info,
+      // we can repair it by using the missing_digest mechanism
+ if (m_repair && m_is_replicated && (srd == auth_sel.auth_shard) &&
+ auth_sel.shard_map.size() > 1 && auth_sel.digest_match &&
+ auth_sel.shard_map[srd].only_data_digest_mismatch_info() &&
+ auth_object.digest_present) {
+ // Set in missing_digests
+ this_chunk->fix_digest = true;
+ // Clear the error
+ auth_sel.shard_map[srd].clear_data_digest_mismatch_info();
+ errstream << m_pg_id << " soid " << ho
+ << " : repairing object info data_digest"
+ << "\n";
+ }
+
+ // Some errors might have already been set in select_auth_object()
+ if (auth_sel.shard_map[srd].errors != 0) {
+
+ this_chunk->cur_inconsistent.insert(srd);
+ if (auth_sel.shard_map[srd].has_deep_errors()) {
+ this_chunk->m_error_counts.deep_errors++;
+ } else {
+ this_chunk->m_error_counts.shallow_errors++;
+ }
+
+ if (discrep_found) {
+ // Only true if compare_obj_details() found errors and put something
+ // in ss
+ errstream << m_pg_id << " shard " << srd << " soid " << ho << " : "
+ << ss.str() << "\n";
+ }
+
+ } else if (discrep_found) {
+
+ // Track possible shards to use as authoritative, if needed
+
+ // There are errors, without identifying the shard
+ object_errors.insert(srd);
+ errstream << m_pg_id << " soid " << ho << " : " << ss.str() << "\n";
+
+ } else {
+
+        // XXX: The auth shard may end up here even though we don't know
+        // that it holds the "correct" data.
+ auth_list.push_back(srd);
+ }
+
+ } else {
+
+ this_chunk->cur_missing.insert(srd);
+ auth_sel.shard_map[srd].set_missing();
+ auth_sel.shard_map[srd].primary = (srd == m_pg_whoami);
+
+ // Can't have any other errors if there is no information available
+ this_chunk->m_error_counts.shallow_errors++;
+ errstream << m_pg_id << " shard " << srd << " " << ho << " : missing\n";
+ }
+ obj_result.add_shard(srd, auth_sel.shard_map[srd]);
+
+ dout(20) << __func__ << ": (debug) soid " << ho << " : " << errstream.str()
+ << dendl;
+ }
+
+ dout(15) << fmt::format("{}: auth_list: {} #: {}; obj-errs#: {}",
+ __func__,
+ auth_list,
+ auth_list.size(),
+ object_errors.size())
+ << dendl;
+ return {auth_list, object_errors};
+}
+
+// == PGBackend::be_compare_scrub_objects()
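+// Compare a candidate shard's copy of an object against the authoritative
+// one: digests, object-info and snapset attributes, hash-info (EC pools),
+// sizes, and the remaining xattrs. Per-shard findings are set on
+// 'shard_result', object-wide ones on 'obj_result'. Returns true - and
+// appends a description to 'errstream' - if a discrepancy was found.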
+bool ScrubBackend::compare_obj_details(pg_shard_t auth_shard,
+ const ScrubMap::object& auth,
+ const object_info_t& auth_oi,
+ const ScrubMap::object& candidate,
+ shard_info_wrapper& shard_result,
+ inconsistent_obj_wrapper& obj_result,
+ stringstream& errstream,
+ bool has_snapset)
+{
+ fmt::memory_buffer out;
+ bool error{false};
+
+ // ------------------------------------------------------------------------
+
+ if (auth.digest_present && candidate.digest_present &&
+ auth.digest != candidate.digest) {
+ fmt::format_to(std::back_inserter(out),
+ "data_digest {:#x} != data_digest {:#x} from shard {}",
+ candidate.digest,
+ auth.digest,
+ auth_shard);
+ error = true;
+ obj_result.set_data_digest_mismatch();
+ }
+
+ if (auth.omap_digest_present && candidate.omap_digest_present &&
+ auth.omap_digest != candidate.omap_digest) {
+ fmt::format_to(std::back_inserter(out),
+ "{}omap_digest {:#x} != omap_digest {:#x} from shard {}",
+ sep(error),
+ candidate.omap_digest,
+ auth.omap_digest,
+ auth_shard);
+ obj_result.set_omap_digest_mismatch();
+ }
+
+ // for replicated:
+ if (m_is_replicated) {
+ if (auth_oi.is_data_digest() && candidate.digest_present &&
+ auth_oi.data_digest != candidate.digest) {
+ fmt::format_to(std::back_inserter(out),
+ "{}data_digest {:#x} != data_digest {:#x} from auth oi {}",
+ sep(error),
+ candidate.digest,
+ auth_oi.data_digest,
+ auth_oi);
+ shard_result.set_data_digest_mismatch_info();
+ }
+
+ // for replicated:
+ if (auth_oi.is_omap_digest() && candidate.omap_digest_present &&
+ auth_oi.omap_digest != candidate.omap_digest) {
+ fmt::format_to(std::back_inserter(out),
+ "{}omap_digest {:#x} != omap_digest {:#x} from auth oi {}",
+ sep(error),
+ candidate.omap_digest,
+ auth_oi.omap_digest,
+ auth_oi);
+ shard_result.set_omap_digest_mismatch_info();
+ }
+ }
+
+ // ------------------------------------------------------------------------
+
+ if (candidate.stat_error) {
+ if (error) {
+ errstream << fmt::to_string(out);
+ }
+ return error;
+ }
+
+ // ------------------------------------------------------------------------
+
+ if (!shard_result.has_info_missing() && !shard_result.has_info_corrupted()) {
+
+ auto can_attr = candidate.attrs.find(OI_ATTR);
+ ceph_assert(can_attr != candidate.attrs.end());
+ bufferlist can_bl;
+ can_bl.push_back(can_attr->second);
+
+ auto auth_attr = auth.attrs.find(OI_ATTR);
+ ceph_assert(auth_attr != auth.attrs.end());
+ bufferlist auth_bl;
+ auth_bl.push_back(auth_attr->second);
+
+ if (!can_bl.contents_equal(auth_bl)) {
+ fmt::format_to(std::back_inserter(out),
+ "{}object info inconsistent ",
+ sep(error));
+ obj_result.set_object_info_inconsistency();
+ }
+ }
+
+ if (has_snapset) {
+ if (!shard_result.has_snapset_missing() &&
+ !shard_result.has_snapset_corrupted()) {
+
+ auto can_attr = candidate.attrs.find(SS_ATTR);
+ ceph_assert(can_attr != candidate.attrs.end());
+ bufferlist can_bl;
+ can_bl.push_back(can_attr->second);
+
+ auto auth_attr = auth.attrs.find(SS_ATTR);
+ ceph_assert(auth_attr != auth.attrs.end());
+ bufferlist auth_bl;
+ auth_bl.push_back(auth_attr->second);
+
+ if (!can_bl.contents_equal(auth_bl)) {
+ fmt::format_to(std::back_inserter(out),
+ "{}snapset inconsistent ",
+ sep(error));
+ obj_result.set_snapset_inconsistency();
+ }
+ }
+ }
+
+ // ------------------------------------------------------------------------
+
+ if (!m_is_replicated) {
+ if (!shard_result.has_hinfo_missing() &&
+ !shard_result.has_hinfo_corrupted()) {
+
+ auto can_hi = candidate.attrs.find(ECUtil::get_hinfo_key());
+ ceph_assert(can_hi != candidate.attrs.end());
+ bufferlist can_bl;
+ can_bl.push_back(can_hi->second);
+
+ auto auth_hi = auth.attrs.find(ECUtil::get_hinfo_key());
+ ceph_assert(auth_hi != auth.attrs.end());
+ bufferlist auth_bl;
+ auth_bl.push_back(auth_hi->second);
+
+ if (!can_bl.contents_equal(auth_bl)) {
+ fmt::format_to(std::back_inserter(out),
+ "{}hinfo inconsistent ",
+ sep(error));
+ obj_result.set_hinfo_inconsistency();
+ }
+ }
+ }
+
+ // ------------------------------------------------------------------------
+
+ // sizes:
+
+ uint64_t oi_size = logical_to_ondisk_size(auth_oi.size);
+ if (oi_size != candidate.size) {
+ fmt::format_to(std::back_inserter(out),
+ "{}size {} != size {} from auth oi {}",
+ sep(error),
+ candidate.size,
+ oi_size,
+ auth_oi);
+ shard_result.set_size_mismatch_info();
+ }
+
+ if (auth.size != candidate.size) {
+ fmt::format_to(std::back_inserter(out),
+ "{}size {} != size {} from shard {}",
+ sep(error),
+ candidate.size,
+ auth.size,
+ auth_shard);
+ obj_result.set_size_mismatch();
+ }
+
+ // If the replica is too large and we didn't already count it for this object
+
+ if (candidate.size > m_conf->osd_max_object_size &&
+ !obj_result.has_size_too_large()) {
+
+ fmt::format_to(std::back_inserter(out),
+ "{}size {} > {} is too large",
+ sep(error),
+ candidate.size,
+ m_conf->osd_max_object_size);
+ obj_result.set_size_too_large();
+ }
+
+ // ------------------------------------------------------------------------
+
+ // comparing the attributes:
+
+ for (const auto& [k, v] : auth.attrs) {
+ if (k == OI_ATTR || k[0] != '_') {
+ // We check system keys separately
+ continue;
+ }
+
+ auto cand = candidate.attrs.find(k);
+ if (cand == candidate.attrs.end()) {
+ fmt::format_to(std::back_inserter(out),
+ "{}attr name mismatch '{}'",
+ sep(error),
+ k);
+ obj_result.set_attr_name_mismatch();
+ } else if (cand->second.cmp(v)) {
+ fmt::format_to(std::back_inserter(out),
+ "{}attr value mismatch '{}'",
+ sep(error),
+ k);
+ obj_result.set_attr_value_mismatch();
+ }
+ }
+
+ for (const auto& [k, v] : candidate.attrs) {
+ if (k == OI_ATTR || k[0] != '_') {
+ // We check system keys separately
+ continue;
+ }
+
+ auto in_auth = auth.attrs.find(k);
+ if (in_auth == auth.attrs.end()) {
+ fmt::format_to(std::back_inserter(out),
+ "{}attr name mismatch '{}'",
+ sep(error),
+ k);
+ obj_result.set_attr_name_mismatch();
+ }
+ }
+
+ if (error) {
+ errstream << fmt::to_string(out);
+ }
+ return error;
+}
+
+static inline bool doing_clones(
+ const std::optional<SnapSet>& snapset,
+ const vector<snapid_t>::reverse_iterator& curclone)
+{
+ return snapset && curclone != snapset->clones.rend();
+}
+
+// /////////////////////////////////////////////////////////////////////////////
+//
+// final checking & fixing - scrub_snapshot_metadata()
+//
+// /////////////////////////////////////////////////////////////////////////////
+
+/*
+ * Validate consistency of the object info and snap sets.
+ *
+ * We are sort of comparing 2 lists. The main loop is on objmap.objects. But
+ * the comparison of the objects is against multiple snapset.clones. There are
+ * multiple clone lists and in between lists we expect head.
+ *
+ * Example
+ *
+ * objects expected
+ * ======= =======
+ * obj1 snap 1 head, unexpected obj1 snap 1
+ * obj2 head head, match
+ * [SnapSet clones 6 4 2 1]
+ * obj2 snap 7 obj2 snap 6, unexpected obj2 snap 7
+ * obj2 snap 6 obj2 snap 6, match
+ * obj2 snap 4 obj2 snap 4, match
+ * obj3 head obj2 snap 2 (expected), obj2 snap 1 (expected), match
+ * [Snapset clones 3 1]
+ * obj3 snap 3 obj3 snap 3 match
+ * obj3 snap 1 obj3 snap 1 match
+ * obj4 head head, match
+ * [Snapset clones 4]
+ * EOL obj4 snap 4, (expected)
+ */
+void ScrubBackend::scrub_snapshot_metadata(ScrubMap& map)
+{
+ dout(10) << __func__ << " num stat obj "
+ << m_pg.get_pg_info(ScrubberPasskey{}).stats.stats.sum.num_objects
+ << dendl;
+
+ std::optional<snapid_t> all_clones; // Unspecified snapid_t or std::nullopt
+
+ // traverse in reverse order.
+ std::optional<hobject_t> head;
+ std::optional<SnapSet> snapset; // If initialized so will head (above)
+ vector<snapid_t>::reverse_iterator
+ curclone; // Defined only if snapset initialized
+ int missing = 0;
+ inconsistent_snapset_wrapper soid_error, head_error;
+ int soid_error_count = 0;
+
+ for (auto p = map.objects.rbegin(); p != map.objects.rend(); ++p) {
+
+ const hobject_t& soid = p->first;
+ ceph_assert(!soid.is_snapdir());
+ soid_error = inconsistent_snapset_wrapper{soid};
+ object_stat_sum_t stat;
+
+ stat.num_objects++;
+
+ if (soid.nspace == m_conf->osd_hit_set_namespace)
+ stat.num_objects_hit_set_archive++;
+
+ if (soid.is_snap()) {
+ // it's a clone
+ stat.num_object_clones++;
+ }
+
+ // basic checks.
+ std::optional<object_info_t> oi;
+ if (!p->second.attrs.count(OI_ATTR)) {
+ oi = std::nullopt;
+ clog.error() << m_mode_desc << " " << m_pg_id << " " << soid
+ << " : no '" << OI_ATTR << "' attr";
+ this_chunk->m_error_counts.shallow_errors++;
+ soid_error.set_info_missing();
+ } else {
+ bufferlist bv;
+ bv.push_back(p->second.attrs[OI_ATTR]);
+ try {
+ oi = object_info_t(bv);
+ } catch (ceph::buffer::error& e) {
+ oi = std::nullopt;
+ clog.error() << m_mode_desc << " " << m_pg_id << " " << soid
+ << " : can't decode '" << OI_ATTR << "' attr "
+ << e.what();
+ this_chunk->m_error_counts.shallow_errors++;
+ soid_error.set_info_corrupted();
+ soid_error.set_info_missing(); // Not available too
+ }
+ }
+
+ if (oi) {
+ if (logical_to_ondisk_size(oi->size) != p->second.size) {
+ clog.error() << m_mode_desc << " " << m_pg_id << " " << soid
+ << " : on disk size (" << p->second.size
+ << ") does not match object info size (" << oi->size
+ << ") adjusted for ondisk to ("
+ << logical_to_ondisk_size(oi->size) << ")";
+ soid_error.set_size_mismatch();
+ this_chunk->m_error_counts.shallow_errors++;
+ }
+
+ dout(20) << m_mode_desc << " " << soid << " " << *oi << dendl;
+
+ // A clone num_bytes will be added later when we have snapset
+ if (!soid.is_snap()) {
+ stat.num_bytes += oi->size;
+ }
+ if (soid.nspace == m_conf->osd_hit_set_namespace)
+ stat.num_bytes_hit_set_archive += oi->size;
+
+ if (oi->is_dirty())
+ ++stat.num_objects_dirty;
+ if (oi->is_whiteout())
+ ++stat.num_whiteouts;
+ if (oi->is_omap())
+ ++stat.num_objects_omap;
+ if (oi->is_cache_pinned())
+ ++stat.num_objects_pinned;
+ if (oi->has_manifest())
+ ++stat.num_objects_manifest;
+ }
+
+ // Check for any problems while processing clones
+ if (doing_clones(snapset, curclone)) {
+ std::optional<snapid_t> target;
+ // Expecting an object with snap for current head
+ if (soid.has_snapset() || soid.get_head() != head->get_head()) {
+
+ dout(10) << __func__ << " " << m_mode_desc << " " << m_pg_id
+ << " new object " << soid << " while processing " << *head
+ << dendl;
+
+ target = all_clones;
+ } else {
+ ceph_assert(soid.is_snap());
+ target = soid.snap;
+ }
+
+ // Log any clones we were expecting to be there up to target
+ // This will set missing, but will be a no-op if snap.soid == *curclone.
+ missing +=
+ process_clones_to(head, snapset, target, &curclone, head_error);
+ }
+
+ bool expected;
+ // Check doing_clones() again in case we ran process_clones_to()
+ if (doing_clones(snapset, curclone)) {
+ // A head would have processed all clones above
+ // or all greater than *curclone.
+ ceph_assert(soid.is_snap() && *curclone <= soid.snap);
+
+ // After processing above clone snap should match the expected curclone
+ expected = (*curclone == soid.snap);
+ } else {
+ // If we aren't doing clones any longer, then expecting head
+ expected = soid.has_snapset();
+ }
+ if (!expected) {
+ // If we couldn't read the head's snapset, just ignore clones
+ if (head && !snapset) {
+ clog.error() << m_mode_desc << " " << m_pg_id
+ << " " << soid
+ << " : clone ignored due to missing snapset";
+ } else {
+ clog.error() << m_mode_desc << " " << m_pg_id << " " << soid
+ << " : is an unexpected clone";
+ }
+ this_chunk->m_error_counts.shallow_errors++;
+ soid_error.set_headless();
+ this_chunk->m_inconsistent_objs.push_back(std::move(soid_error));
+ ++soid_error_count;
+ if (head && soid.get_head() == head->get_head())
+ head_error.set_clone(soid.snap);
+ continue;
+ }
+
+ // new snapset?
+ if (soid.has_snapset()) {
+
+ if (missing) {
+ log_missing(missing, head, __func__);
+ }
+
+ // Save previous head error information
+ if (head && (head_error.errors || soid_error_count)) {
+ this_chunk->m_inconsistent_objs.push_back(std::move(head_error));
+ }
+
+ // Set this as a new head object
+ head = soid;
+ missing = 0;
+ head_error = soid_error;
+ soid_error_count = 0;
+
+ dout(20) << __func__ << " " << m_mode_desc << " new head " << head
+ << dendl;
+
+ if (p->second.attrs.count(SS_ATTR) == 0) {
+ clog.error() << m_mode_desc << " " << m_pg_id << " " << soid
+ << " : no '" << SS_ATTR << "' attr";
+ this_chunk->m_error_counts.shallow_errors++;
+ snapset = std::nullopt;
+ head_error.set_snapset_missing();
+ } else {
+ bufferlist bl;
+ bl.push_back(p->second.attrs[SS_ATTR]);
+ auto blp = bl.cbegin();
+ try {
+ snapset = SnapSet(); // Initialize optional<> before decoding into it
+ decode(*snapset, blp);
+ head_error.ss_bl.push_back(p->second.attrs[SS_ATTR]);
+ } catch (ceph::buffer::error& e) {
+ snapset = std::nullopt;
+ clog.error() << m_mode_desc << " " << m_pg_id << " " << soid
+ << " : can't decode '" << SS_ATTR << "' attr "
+ << e.what();
+ this_chunk->m_error_counts.shallow_errors++;
+ head_error.set_snapset_corrupted();
+ }
+ }
+
+ if (snapset) {
+ // what will be next?
+ curclone = snapset->clones.rbegin();
+
+ if (!snapset->clones.empty()) {
+ dout(20) << " snapset " << *snapset << dendl;
+ if (snapset->seq == 0) {
+ clog.error() << m_mode_desc << " " << m_pg_id << " " << soid
+ << " : snaps.seq not set";
+ this_chunk->m_error_counts.shallow_errors++;
+ head_error.set_snapset_error();
+ }
+ }
+ }
+ } else {
+ ceph_assert(soid.is_snap());
+ ceph_assert(head);
+ ceph_assert(snapset);
+ ceph_assert(soid.snap == *curclone);
+
+ dout(20) << __func__ << " " << m_mode_desc << " matched clone " << soid
+ << dendl;
+
+ if (snapset->clone_size.count(soid.snap) == 0) {
+ clog.error() << m_mode_desc << " " << m_pg_id << " " << soid
+ << " : is missing in clone_size";
+ this_chunk->m_error_counts.shallow_errors++;
+ soid_error.set_size_mismatch();
+ } else {
+ if (oi && oi->size != snapset->clone_size[soid.snap]) {
+ clog.error() << m_mode_desc << " " << m_pg_id << " " << soid
+ << " : size " << oi->size << " != clone_size "
+ << snapset->clone_size[*curclone];
+ this_chunk->m_error_counts.shallow_errors++;
+ soid_error.set_size_mismatch();
+ }
+
+ if (snapset->clone_overlap.count(soid.snap) == 0) {
+ clog.error() << m_mode_desc << " " << m_pg_id << " " << soid
+ << " : is missing in clone_overlap";
+ this_chunk->m_error_counts.shallow_errors++;
+ soid_error.set_size_mismatch();
+ } else {
+ // This checking is based on get_clone_bytes(). The first 2 asserts
+ // can't happen because we know we have a clone_size and
+ // a clone_overlap. Now we check that the interval_set won't
+ // cause the last assert.
+ uint64_t size = snapset->clone_size.find(soid.snap)->second;
+ const interval_set<uint64_t>& overlap =
+ snapset->clone_overlap.find(soid.snap)->second;
+ bool bad_interval_set = false;
+ for (interval_set<uint64_t>::const_iterator i = overlap.begin();
+ i != overlap.end();
+ ++i) {
+ if (size < i.get_len()) {
+ bad_interval_set = true;
+ break;
+ }
+ size -= i.get_len();
+ }
+
+ if (bad_interval_set) {
+ clog.error() << m_mode_desc << " " << m_pg_id << " " << soid
+ << " : bad interval_set in clone_overlap";
+ this_chunk->m_error_counts.shallow_errors++;
+ soid_error.set_size_mismatch();
+ } else {
+ stat.num_bytes += snapset->get_clone_bytes(soid.snap);
+ }
+ }
+ }
+
+ // what's next?
+ ++curclone;
+ if (soid_error.errors) {
+ this_chunk->m_inconsistent_objs.push_back(std::move(soid_error));
+ ++soid_error_count;
+ }
+ }
+ m_scrubber.add_to_stats(stat);
+ }
+
+ if (doing_clones(snapset, curclone)) {
+ dout(10) << __func__ << " " << m_mode_desc << " " << m_pg_id
+ << " No more objects while processing " << *head << dendl;
+
+ missing +=
+ process_clones_to(head, snapset, all_clones, &curclone, head_error);
+ }
+
+ // There could be missing found by the test above or even
+ // before dropping out of the loop for the last head.
+
+ if (missing) {
+ log_missing(missing, head, __func__);
+ }
+ if (head && (head_error.errors || soid_error_count)) {
+ this_chunk->m_inconsistent_objs.push_back(std::move(head_error));
+ }
+
+ // fix data/omap digests
+ m_scrubber.submit_digest_fixes(this_chunk->missing_digest);
+
+ dout(10) << __func__ << " (" << m_mode_desc << ") finish" << dendl;
+}
+
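+// Walk the (descending) clone list from *curclone down to 'target' (or
+// through the whole list when 'target' is nullopt), counting clones that the
+// SnapSet says should exist but were not seen in the scrub map. Unless
+// incomplete clones are allowed for this pool (cache tiers), each such clone
+// is logged as an error and recorded on 'e'. Returns the number of missing
+// clones.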
+int ScrubBackend::process_clones_to(
+ const std::optional<hobject_t>& head,
+ const std::optional<SnapSet>& snapset,
+ std::optional<snapid_t> target,
+ vector<snapid_t>::reverse_iterator* curclone,
+ inconsistent_snapset_wrapper& e)
+{
+ ceph_assert(head);
+ ceph_assert(snapset);
+ int missing_count = 0;
+
+ // NOTE: clones are in descending order, thus **curclone > target test here
+ hobject_t next_clone(*head);
+ while (doing_clones(snapset, *curclone) &&
+ (!target || **curclone > *target)) {
+
+ ++missing_count;
+ // it is okay to be missing one or more clones in a cache tier.
+ // skip higher-numbered clones in the list.
+ if (!m_incomplete_clones_allowed) {
+ next_clone.snap = **curclone;
+ clog.error() << m_mode_desc << " " << m_pg_id << " " << *head
+ << " : expected clone " << next_clone << " " << m_missing
+ << " missing";
+ this_chunk->m_error_counts.shallow_errors++;
+ e.set_clone_missing(next_clone.snap);
+ }
+ // Clones are descending
+ ++(*curclone);
+ }
+ return missing_count;
+}
+
+void ScrubBackend::log_missing(int missing,
+ const std::optional<hobject_t>& head,
+ const char* logged_func_name)
+{
+ ceph_assert(head);
+ if (m_incomplete_clones_allowed) {
+ dout(20) << logged_func_name << " " << m_mode_desc << " " << m_pg_id << " "
+ << *head << " skipped " << missing << " clone(s) in cache tier"
+ << dendl;
+ } else {
+ clog.info() << m_mode_desc << " " << m_pg_id << " " << *head << " : "
+ << missing << " missing clone(s)";
+ }
+}
+
+
+// ////////////////////////////////////////////////////////////////////////////////
+
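+// Walk the chunk's map in reverse order, so that each head object is seen
+// before its clones. The head's SnapSet is remembered, and each clone is then
+// checked against the SnapMapper (scan_object_snaps()). The returned list
+// holds the DB fixes required to bring the mapper in line with the on-disk
+// snapsets.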
+std::vector<snap_mapper_fix_t> ScrubBackend::scan_snaps(
+ ScrubMap& smap,
+ SnapMapReaderI& snaps_getter)
+{
+ std::vector<snap_mapper_fix_t> out_orders;
+ hobject_t head;
+ SnapSet snapset;
+
+ // Test qa/standalone/scrub/osd-scrub-snaps.sh greps for the strings
+ // in this function
+ dout(15) << "_scan_snaps starts" << dendl;
+
+ for (auto i = smap.objects.rbegin(); i != smap.objects.rend(); ++i) {
+
+ const hobject_t& hoid = i->first;
+ ScrubMap::object& o = i->second;
+
+ dout(20) << __func__ << " " << hoid << dendl;
+
+ ceph_assert(!hoid.is_snapdir());
+
+ if (hoid.is_head()) {
+ // parse the SnapSet
+ bufferlist bl;
+ if (o.attrs.find(SS_ATTR) == o.attrs.end()) {
+ // no snaps for this head
+ continue;
+ }
+ bl.push_back(o.attrs[SS_ATTR]);
+ auto p = bl.cbegin();
+ try {
+ decode(snapset, p);
+ } catch (...) {
+ dout(20) << fmt::format("{}: failed to decode the snapset ({})",
+ __func__,
+ hoid)
+ << dendl;
+ continue;
+ }
+ head = hoid.get_head();
+ continue;
+ }
+
+    /// \todo document why 'head' is guaranteed to be initialized at this point
+
+ if (hoid.snap < CEPH_MAXSNAP) {
+
+ if (hoid.get_head() != head) {
+ derr << __func__ << " no head for " << hoid << " (have " << head << ")"
+ << dendl;
+ continue;
+ }
+
+ // the 'hoid' is a clone hoid at this point. The 'snapset' below was taken
+ // from the corresponding head hoid.
+ auto maybe_fix_order = scan_object_snaps(hoid, snapset, snaps_getter);
+ if (maybe_fix_order) {
+ out_orders.push_back(std::move(*maybe_fix_order));
+ }
+ }
+ }
+
+ dout(15) << __func__ << " " << out_orders.size() << " fix orders" << dendl;
+ return out_orders;
+}
+
+std::optional<snap_mapper_fix_t> ScrubBackend::scan_object_snaps(
+ const hobject_t& hoid,
+ const SnapSet& snapset,
+ SnapMapReaderI& snaps_getter)
+{
+ using result_t = Scrub::SnapMapReaderI::result_t;
+ dout(15) << fmt::format("{}: obj:{} snapset:{}", __func__, hoid, snapset)
+ << dendl;
+
+ auto p = snapset.clone_snaps.find(hoid.snap);
+ if (p == snapset.clone_snaps.end()) {
+ derr << __func__ << " no clone_snaps for " << hoid << " in " << snapset
+ << dendl;
+ return std::nullopt;
+ }
+ set<snapid_t> obj_snaps{p->second.begin(), p->second.end()};
+
+ // clang-format off
+
+ // validate both that the mapper contains the correct snaps for the object
+ // and that it is internally consistent.
+ // possible outcomes:
+ //
+ // Error scenarios:
+ // - SnapMapper index of object snaps does not match that stored in head
+ // object snapset attribute:
+ // we should delete the snapmapper entry and re-add it.
+ // - no mapping found for the object's snaps:
+ // we should add the missing mapper entries.
+ // - the snapmapper set for this object is internally inconsistent (e.g.
+ // the OBJ_ entries do not match the SNA_ entries). We remove
+ // whatever entries are there, and redo the DB content for this object.
+ //
+ // And
+ // There is the "happy path": cur_snaps == obj_snaps. Nothing to do there.
+
+ // clang-format on
+
+ auto cur_snaps = snaps_getter.get_snaps_check_consistency(hoid);
+ if (!cur_snaps) {
+ switch (auto e = cur_snaps.error(); e.code) {
+ case result_t::code_t::backend_error:
+ derr << __func__ << ": get_snaps returned "
+ << cpp_strerror(e.backend_error) << " for " << hoid << dendl;
+ ceph_abort();
+ case result_t::code_t::not_found:
+ dout(10) << __func__ << ": no snaps for " << hoid << ". Adding."
+ << dendl;
+ return snap_mapper_fix_t{snap_mapper_op_t::add, hoid, obj_snaps, {}};
+ case result_t::code_t::inconsistent:
+ dout(10) << __func__ << ": inconsistent snapmapper data for " << hoid
+ << ". Recreating." << dendl;
+ return snap_mapper_fix_t{
+ snap_mapper_op_t::overwrite, hoid, obj_snaps, {}};
+ default:
+ dout(10) << __func__ << ": error (" << cpp_strerror(e.backend_error)
+ << ") fetching snapmapper data for " << hoid << ". Recreating."
+ << dendl;
+ return snap_mapper_fix_t{
+ snap_mapper_op_t::overwrite, hoid, obj_snaps, {}};
+ }
+ __builtin_unreachable();
+ }
+
+ if (*cur_snaps == obj_snaps) {
+ dout(20) << fmt::format(
+ "{}: {}: snapset match SnapMapper's ({})", __func__, hoid,
+ obj_snaps)
+ << dendl;
+ return std::nullopt;
+ }
+
+ // add this object to the list of snapsets that needs fixing. Note
+ // that we also collect the existing (bogus) list, for logging purposes
+ dout(20) << fmt::format(
+ "{}: obj {}: was: {} updating to: {}", __func__, hoid,
+ *cur_snaps, obj_snaps)
+ << dendl;
+ return snap_mapper_fix_t{
+ snap_mapper_op_t::update, hoid, obj_snaps, *cur_snaps};
+}
+
+/*
+ * Building a map of objects suitable for snapshot validation.
+ *
+ * We are moving all "full" clone sets, i.e. the head and (preceding it, as
+ * snapshots precede the head entry) the clone entries, into 'for_meta_scrub'.
+ * That collection, not containing partial items, will be scrubbed by
+ * scrub_snapshot_metadata().
+ *
+ * What's left in m_cleaned_meta_map is the leftover partial items that need to
+ * be completed before they can be processed.
+ */
+ScrubMap ScrubBackend::clean_meta_map(ScrubMap& cleaned, bool max_reached)
+{
+ ScrubMap for_meta_scrub;
+
+ if (max_reached || cleaned.objects.empty()) {
+ cleaned.swap(for_meta_scrub);
+ } else {
+ auto iter = cleaned.objects.end();
+ --iter; // not empty, see 'if' clause
+ auto begin = cleaned.objects.begin();
+ if (iter->first.has_snapset()) {
+ ++iter;
+ } else {
+ while (iter != begin) {
+ auto next = iter--;
+ if (next->first.get_head() != iter->first.get_head()) {
+ ++iter;
+ break;
+ }
+ }
+ }
+ for_meta_scrub.objects.insert(begin, iter);
+ cleaned.objects.erase(begin, iter);
+ }
+
+ return for_meta_scrub;
+}