From e6918187568dbd01842d8d1d2c808ce16a894239 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sun, 21 Apr 2024 13:54:28 +0200 Subject: Adding upstream version 18.2.2. Signed-off-by: Daniel Baumann --- src/osd/osd_types.cc | 7373 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 7373 insertions(+) create mode 100644 src/osd/osd_types.cc (limited to 'src/osd/osd_types.cc') diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc new file mode 100644 index 000000000..14694de19 --- /dev/null +++ b/src/osd/osd_types.cc @@ -0,0 +1,7373 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2011 New Dream Network + * Copyright (C) 2013,2014 Cloudwatt + * + * Author: Loic Dachary + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include + +#include "include/ceph_features.h" +#include "include/encoding.h" +#include "include/stringify.h" +extern "C" { +#include "crush/hash.h" +} + +#include "common/Formatter.h" +#include "common/StackStringStream.h" +#include "include/utime_fmt.h" +#include "OSDMap.h" +#include "osd_types.h" +#include "osd_types_fmt.h" +#include "os/Transaction.h" + +using std::list; +using std::make_pair; +using std::map; +using std::ostream; +using std::pair; +using std::set; +using std::shared_ptr; +using std::string; +using std::unique_ptr; +using std::vector; + +using ceph::bufferlist; +using ceph::decode; +using ceph::decode_nohead; +using ceph::encode; +using ceph::encode_nohead; +using ceph::Formatter; +using ceph::make_timespan; +using ceph::JSONFormatter; + +using namespace std::literals; + +const char *ceph_osd_flag_name(unsigned flag) +{ + switch (flag) { + case CEPH_OSD_FLAG_ACK: return "ack"; + case CEPH_OSD_FLAG_ONNVRAM: return "onnvram"; + case CEPH_OSD_FLAG_ONDISK: return "ondisk"; + case CEPH_OSD_FLAG_RETRY: return "retry"; + case CEPH_OSD_FLAG_READ: return "read"; + case CEPH_OSD_FLAG_WRITE: return "write"; + case CEPH_OSD_FLAG_ORDERSNAP: return "ordersnap"; + case CEPH_OSD_FLAG_PEERSTAT_OLD: return "peerstat_old"; + case CEPH_OSD_FLAG_BALANCE_READS: return "balance_reads"; + case CEPH_OSD_FLAG_PARALLELEXEC: return "parallelexec"; + case CEPH_OSD_FLAG_PGOP: return "pgop"; + case CEPH_OSD_FLAG_EXEC: return "exec"; + case CEPH_OSD_FLAG_EXEC_PUBLIC: return "exec_public"; + case CEPH_OSD_FLAG_LOCALIZE_READS: return "localize_reads"; + case CEPH_OSD_FLAG_RWORDERED: return "rwordered"; + case CEPH_OSD_FLAG_IGNORE_CACHE: return "ignore_cache"; + case CEPH_OSD_FLAG_SKIPRWLOCKS: return "skiprwlocks"; + case CEPH_OSD_FLAG_IGNORE_OVERLAY: return "ignore_overlay"; + case CEPH_OSD_FLAG_FLUSH: return "flush"; + case CEPH_OSD_FLAG_MAP_SNAP_CLONE: return 
"map_snap_clone"; + case CEPH_OSD_FLAG_ENFORCE_SNAPC: return "enforce_snapc"; + case CEPH_OSD_FLAG_REDIRECTED: return "redirected"; + case CEPH_OSD_FLAG_KNOWN_REDIR: return "known_if_redirected"; + case CEPH_OSD_FLAG_FULL_TRY: return "full_try"; + case CEPH_OSD_FLAG_FULL_FORCE: return "full_force"; + case CEPH_OSD_FLAG_IGNORE_REDIRECT: return "ignore_redirect"; + case CEPH_OSD_FLAG_RETURNVEC: return "returnvec"; + case CEPH_OSD_FLAG_SUPPORTSPOOLEIO: return "supports_pool_eio"; + default: return "???"; + } +} + +string ceph_osd_flag_string(unsigned flags) +{ + string s; + for (unsigned i=0; i<32; ++i) { + if (flags & (1u<dump_string("alert", s); + } + } +} + +// -- osd_reqid_t -- +void osd_reqid_t::dump(Formatter *f) const +{ + f->dump_stream("name") << name; + f->dump_int("inc", inc); + f->dump_unsigned("tid", tid); +} + +void osd_reqid_t::generate_test_instances(list& o) +{ + o.push_back(new osd_reqid_t); + o.push_back(new osd_reqid_t(entity_name_t::CLIENT(123), 1, 45678)); +} + +// -- object_locator_t -- + +void object_locator_t::encode(ceph::buffer::list& bl) const +{ + // verify that nobody's corrupted the locator + ceph_assert(hash == -1 || key.empty()); + __u8 encode_compat = 3; + ENCODE_START(6, encode_compat, bl); + encode(pool, bl); + int32_t preferred = -1; // tell old code there is no preferred osd (-1). 
+ encode(preferred, bl); + encode(key, bl); + encode(nspace, bl); + encode(hash, bl); + if (hash != -1) + encode_compat = std::max(encode_compat, 6); // need to interpret the hash + ENCODE_FINISH_NEW_COMPAT(bl, encode_compat); +} + +void object_locator_t::decode(ceph::buffer::list::const_iterator& p) +{ + DECODE_START_LEGACY_COMPAT_LEN(6, 3, 3, p); + if (struct_v < 2) { + int32_t op; + decode(op, p); + pool = op; + int16_t pref; + decode(pref, p); + } else { + decode(pool, p); + int32_t preferred; + decode(preferred, p); + } + decode(key, p); + if (struct_v >= 5) + decode(nspace, p); + if (struct_v >= 6) + decode(hash, p); + else + hash = -1; + DECODE_FINISH(p); + // verify that nobody's corrupted the locator + ceph_assert(hash == -1 || key.empty()); +} + +void object_locator_t::dump(Formatter *f) const +{ + f->dump_int("pool", pool); + f->dump_string("key", key); + f->dump_string("namespace", nspace); + f->dump_int("hash", hash); +} + +void object_locator_t::generate_test_instances(list& o) +{ + o.push_back(new object_locator_t); + o.push_back(new object_locator_t(123)); + o.push_back(new object_locator_t(123, 876)); + o.push_back(new object_locator_t(1, "n2")); + o.push_back(new object_locator_t(1234, "", "key")); + o.push_back(new object_locator_t(12, "n1", "key2")); +} + +// -- request_redirect_t -- +void request_redirect_t::encode(ceph::buffer::list& bl) const +{ + ENCODE_START(1, 1, bl); + encode(redirect_locator, bl); + encode(redirect_object, bl); + // legacy of the removed osd_instructions member + encode((uint32_t)0, bl); + ENCODE_FINISH(bl); +} + +void request_redirect_t::decode(ceph::buffer::list::const_iterator& bl) +{ + DECODE_START(1, bl); + uint32_t legacy_osd_instructions_len; + decode(redirect_locator, bl); + decode(redirect_object, bl); + decode(legacy_osd_instructions_len, bl); + if (legacy_osd_instructions_len) { + bl += legacy_osd_instructions_len; + } + DECODE_FINISH(bl); +} + +void request_redirect_t::dump(Formatter *f) const +{ + 
f->dump_string("object", redirect_object); + f->open_object_section("locator"); + redirect_locator.dump(f); + f->close_section(); // locator +} + +void request_redirect_t::generate_test_instances(list& o) +{ + object_locator_t loc(1, "redir_obj"); + o.push_back(new request_redirect_t()); + o.push_back(new request_redirect_t(loc, 0)); + o.push_back(new request_redirect_t(loc, "redir_obj")); + o.push_back(new request_redirect_t(loc)); +} + +void objectstore_perf_stat_t::dump(Formatter *f) const +{ + // *_ms values just for compatibility. + f->dump_float("commit_latency_ms", os_commit_latency_ns / 1000000.0); + f->dump_float("apply_latency_ms", os_apply_latency_ns / 1000000.0); + f->dump_unsigned("commit_latency_ns", os_commit_latency_ns); + f->dump_unsigned("apply_latency_ns", os_apply_latency_ns); +} + +void objectstore_perf_stat_t::encode(ceph::buffer::list &bl, uint64_t features) const +{ + uint8_t target_v = 2; + if (!HAVE_FEATURE(features, OS_PERF_STAT_NS)) { + target_v = 1; + } + ENCODE_START(target_v, target_v, bl); + if (target_v >= 2) { + encode(os_commit_latency_ns, bl); + encode(os_apply_latency_ns, bl); + } else { + constexpr auto NS_PER_MS = std::chrono::nanoseconds(1ms).count(); + uint32_t commit_latency_ms = os_commit_latency_ns / NS_PER_MS; + uint32_t apply_latency_ms = os_apply_latency_ns / NS_PER_MS; + encode(commit_latency_ms, bl); // for compatibility with older monitor. + encode(apply_latency_ms, bl); // for compatibility with older monitor. 
+ } + ENCODE_FINISH(bl); +} + +void objectstore_perf_stat_t::decode(ceph::buffer::list::const_iterator &bl) +{ + DECODE_START(2, bl); + if (struct_v >= 2) { + decode(os_commit_latency_ns, bl); + decode(os_apply_latency_ns, bl); + } else { + uint32_t commit_latency_ms; + uint32_t apply_latency_ms; + decode(commit_latency_ms, bl); + decode(apply_latency_ms, bl); + constexpr auto NS_PER_MS = std::chrono::nanoseconds(1ms).count(); + os_commit_latency_ns = commit_latency_ms * NS_PER_MS; + os_apply_latency_ns = apply_latency_ms * NS_PER_MS; + } + DECODE_FINISH(bl); +} + +void objectstore_perf_stat_t::generate_test_instances(std::list& o) +{ + o.push_back(new objectstore_perf_stat_t()); + o.push_back(new objectstore_perf_stat_t()); + o.back()->os_commit_latency_ns = 20000000; + o.back()->os_apply_latency_ns = 30000000; +} + +// -- osd_stat_t -- +void osd_stat_t::dump(Formatter *f, bool with_net) const +{ + f->dump_unsigned("up_from", up_from); + f->dump_unsigned("seq", seq); + f->dump_unsigned("num_pgs", num_pgs); + f->dump_unsigned("num_osds", num_osds); + f->dump_unsigned("num_per_pool_osds", num_per_pool_osds); + f->dump_unsigned("num_per_pool_omap_osds", num_per_pool_omap_osds); + + /// dump legacy stats fields to ensure backward compatibility. 
+ f->dump_unsigned("kb", statfs.kb()); + f->dump_unsigned("kb_used", statfs.kb_used_raw()); + f->dump_unsigned("kb_used_data", statfs.kb_used_data()); + f->dump_unsigned("kb_used_omap", statfs.kb_used_omap()); + f->dump_unsigned("kb_used_meta", statfs.kb_used_internal_metadata()); + f->dump_unsigned("kb_avail", statfs.kb_avail()); + //////////////////// + + f->open_object_section("statfs"); + statfs.dump(f); + f->close_section(); + f->open_array_section("hb_peers"); + for (auto p : hb_peers) + f->dump_int("osd", p); + f->close_section(); + f->dump_int("snap_trim_queue_len", snap_trim_queue_len); + f->dump_int("num_snap_trimming", num_snap_trimming); + f->dump_int("num_shards_repaired", num_shards_repaired); + f->open_object_section("op_queue_age_hist"); + op_queue_age_hist.dump(f); + f->close_section(); + f->open_object_section("perf_stat"); + os_perf_stat.dump(f); + f->close_section(); + f->open_array_section("alerts"); + ::dump(f, os_alerts); + f->close_section(); + if (with_net) { + dump_ping_time(f); + } +} + +void osd_stat_t::dump_ping_time(Formatter *f) const +{ + f->open_array_section("network_ping_times"); + for (auto &i : hb_pingtime) { + f->open_object_section("entry"); + f->dump_int("osd", i.first); + const time_t lu(i.second.last_update); + char buffer[26]; + string lustr(ctime_r(&lu, buffer)); + lustr.pop_back(); // Remove trailing \n + f->dump_string("last update", lustr); + f->open_array_section("interfaces"); + f->open_object_section("interface"); + f->dump_string("interface", "back"); + f->open_object_section("average"); + f->dump_float("1min", i.second.back_pingtime[0]/1000.0); + f->dump_float("5min", i.second.back_pingtime[1]/1000.0); + f->dump_float("15min", i.second.back_pingtime[2]/1000.0); + f->close_section(); // average + f->open_object_section("min"); + f->dump_float("1min", i.second.back_min[0]/1000.0); + f->dump_float("5min", i.second.back_min[1]/1000.0); + f->dump_float("15min", i.second.back_min[2]/1000.0); + f->close_section(); // min 
+ f->open_object_section("max"); + f->dump_float("1min", i.second.back_max[0]/1000.0); + f->dump_float("5min", i.second.back_max[1]/1000.0); + f->dump_float("15min", i.second.back_max[2]/1000.0); + f->close_section(); // max + f->dump_float("last", i.second.back_last/1000.0); + f->close_section(); // interface + + if (i.second.front_pingtime[0] != 0) { + f->open_object_section("interface"); + f->dump_string("interface", "front"); + f->open_object_section("average"); + f->dump_float("1min", i.second.front_pingtime[0]/1000.0); + f->dump_float("5min", i.second.front_pingtime[1]/1000.0); + f->dump_float("15min", i.second.front_pingtime[2]/1000.0); + f->close_section(); // average + f->open_object_section("min"); + f->dump_float("1min", i.second.front_min[0]/1000.0); + f->dump_float("5min", i.second.front_min[1]/1000.0); + f->dump_float("15min", i.second.front_min[2]/1000.0); + f->close_section(); // min + f->open_object_section("max"); + f->dump_float("1min", i.second.front_max[0]/1000.0); + f->dump_float("5min", i.second.front_max[1]/1000.0); + f->dump_float("15min", i.second.front_max[2]/1000.0); + f->close_section(); // max + f->dump_float("last", i.second.front_last/1000.0); + f->close_section(); // interface + } + f->close_section(); // interfaces + f->close_section(); // entry + } + f->close_section(); // network_ping_time +} + +void osd_stat_t::encode(ceph::buffer::list &bl, uint64_t features) const +{ + ENCODE_START(14, 2, bl); + + //////// for compatibility //////// + int64_t kb = statfs.kb(); + int64_t kb_used = statfs.kb_used_raw(); + int64_t kb_avail = statfs.kb_avail(); + encode(kb, bl); + encode(kb_used, bl); + encode(kb_avail, bl); + /////////////////////////////////// + + encode(snap_trim_queue_len, bl); + encode(num_snap_trimming, bl); + encode(hb_peers, bl); + encode((uint32_t)0, bl); + encode(op_queue_age_hist, bl); + encode(os_perf_stat, bl, features); + encode(up_from, bl); + encode(seq, bl); + encode(num_pgs, bl); + + //////// for compatibility 
//////// + int64_t kb_used_data = statfs.kb_used_data(); + int64_t kb_used_omap = statfs.kb_used_omap(); + int64_t kb_used_meta = statfs.kb_used_internal_metadata(); + encode(kb_used_data, bl); + encode(kb_used_omap, bl); + encode(kb_used_meta, bl); + encode(statfs, bl); + /////////////////////////////////// + encode(os_alerts, bl); + encode(num_shards_repaired, bl); + encode(num_osds, bl); + encode(num_per_pool_osds, bl); + encode(num_per_pool_omap_osds, bl); + + // hb_pingtime map + encode((int)hb_pingtime.size(), bl); + for (auto i : hb_pingtime) { + encode(i.first, bl); // osd + encode(i.second.last_update, bl); + encode(i.second.back_pingtime[0], bl); + encode(i.second.back_pingtime[1], bl); + encode(i.second.back_pingtime[2], bl); + encode(i.second.back_min[0], bl); + encode(i.second.back_min[1], bl); + encode(i.second.back_min[2], bl); + encode(i.second.back_max[0], bl); + encode(i.second.back_max[1], bl); + encode(i.second.back_max[2], bl); + encode(i.second.back_last, bl); + encode(i.second.front_pingtime[0], bl); + encode(i.second.front_pingtime[1], bl); + encode(i.second.front_pingtime[2], bl); + encode(i.second.front_min[0], bl); + encode(i.second.front_min[1], bl); + encode(i.second.front_min[2], bl); + encode(i.second.front_max[0], bl); + encode(i.second.front_max[1], bl); + encode(i.second.front_max[2], bl); + encode(i.second.front_last, bl); + } + ENCODE_FINISH(bl); +} + +void osd_stat_t::decode(ceph::buffer::list::const_iterator &bl) +{ + int64_t kb, kb_used,kb_avail; + int64_t kb_used_data, kb_used_omap, kb_used_meta; + DECODE_START_LEGACY_COMPAT_LEN(14, 2, 2, bl); + decode(kb, bl); + decode(kb_used, bl); + decode(kb_avail, bl); + decode(snap_trim_queue_len, bl); + decode(num_snap_trimming, bl); + decode(hb_peers, bl); + vector num_hb_out; + decode(num_hb_out, bl); + if (struct_v >= 3) + decode(op_queue_age_hist, bl); + if (struct_v >= 4) + decode(os_perf_stat, bl); + if (struct_v >= 6) { + decode(up_from, bl); + decode(seq, bl); + } + if 
(struct_v >= 7) { + decode(num_pgs, bl); + } + if (struct_v >= 8) { + decode(kb_used_data, bl); + decode(kb_used_omap, bl); + decode(kb_used_meta, bl); + } else { + kb_used_data = kb_used; + kb_used_omap = 0; + kb_used_meta = 0; + } + if (struct_v >= 9) { + decode(statfs, bl); + } else { + statfs.reset(); + statfs.total = kb << 10; + statfs.available = kb_avail << 10; + // actually it's totally unexpected to have ststfs.total < statfs.available + // here but unfortunately legacy generate_test_instances produced such a + // case hence inserting some handling rather than assert + statfs.internally_reserved = + statfs.total > statfs.available ? statfs.total - statfs.available : 0; + kb_used <<= 10; + if ((int64_t)statfs.internally_reserved > kb_used) { + statfs.internally_reserved -= kb_used; + } else { + statfs.internally_reserved = 0; + } + statfs.allocated = kb_used_data << 10; + statfs.omap_allocated = kb_used_omap << 10; + statfs.internal_metadata = kb_used_meta << 10; + } + if (struct_v >= 10) { + decode(os_alerts, bl); + } else { + os_alerts.clear(); + } + if (struct_v >= 11) { + decode(num_shards_repaired, bl); + } else { + num_shards_repaired = 0; + } + if (struct_v >= 12) { + decode(num_osds, bl); + decode(num_per_pool_osds, bl); + } else { + num_osds = 0; + num_per_pool_osds = 0; + } + if (struct_v >= 13) { + decode(num_per_pool_omap_osds, bl); + } else { + num_per_pool_omap_osds = 0; + } + hb_pingtime.clear(); + if (struct_v >= 14) { + int count; + decode(count, bl); + for (int i = 0 ; i < count ; i++) { + int osd; + decode(osd, bl); + struct Interfaces ifs; + decode(ifs.last_update, bl); + decode(ifs.back_pingtime[0],bl); + decode(ifs.back_pingtime[1], bl); + decode(ifs.back_pingtime[2], bl); + decode(ifs.back_min[0],bl); + decode(ifs.back_min[1], bl); + decode(ifs.back_min[2], bl); + decode(ifs.back_max[0],bl); + decode(ifs.back_max[1], bl); + decode(ifs.back_max[2], bl); + decode(ifs.back_last, bl); + decode(ifs.front_pingtime[0], bl); + 
decode(ifs.front_pingtime[1], bl); + decode(ifs.front_pingtime[2], bl); + decode(ifs.front_min[0], bl); + decode(ifs.front_min[1], bl); + decode(ifs.front_min[2], bl); + decode(ifs.front_max[0], bl); + decode(ifs.front_max[1], bl); + decode(ifs.front_max[2], bl); + decode(ifs.front_last, bl); + hb_pingtime[osd] = ifs; + } + } + DECODE_FINISH(bl); +} + +void osd_stat_t::generate_test_instances(std::list& o) +{ + o.push_back(new osd_stat_t); + + o.push_back(new osd_stat_t); + list ll; + store_statfs_t::generate_test_instances(ll); + o.back()->statfs = *ll.back(); + o.back()->hb_peers.push_back(7); + o.back()->snap_trim_queue_len = 8; + o.back()->num_snap_trimming = 99; + o.back()->num_shards_repaired = 101; + o.back()->os_alerts[0].emplace( + "some alert", "some alert details"); + o.back()->os_alerts[1].emplace( + "some alert2", "some alert2 details"); + struct Interfaces gen_interfaces = { + 123456789, { 1000, 900, 800 }, { 990, 890, 790 }, { 1010, 910, 810 }, 1001, + { 1100, 1000, 900 }, { 1090, 990, 890 }, { 1110, 1010, 910 }, 1101 }; + o.back()->hb_pingtime[20] = gen_interfaces; + gen_interfaces = { + 987654321, { 100, 200, 300 }, { 90, 190, 290 }, { 110, 210, 310 }, 101 }; + o.back()->hb_pingtime[30] = gen_interfaces; +} + +// -- pg_t -- + +int pg_t::print(char *o, int maxlen) const +{ + return snprintf(o, maxlen, "%llu.%x", (unsigned long long)pool(), ps()); +} + +bool pg_t::parse(const char *s) +{ + uint64_t ppool; + uint32_t pseed; + int r = sscanf(s, "%llu.%x", (long long unsigned *)&ppool, &pseed); + if (r < 2) + return false; + m_pool = ppool; + m_seed = pseed; + return true; +} + +bool spg_t::parse(const char *s) +{ + shard = shard_id_t::NO_SHARD; + uint64_t ppool; + uint32_t pseed; + uint32_t pshard; + int r = sscanf(s, "%llu.%x", (long long unsigned *)&ppool, &pseed); + if (r < 2) + return false; + pgid.set_pool(ppool); + pgid.set_ps(pseed); + + const char *p = strchr(s, 's'); + if (p) { + r = sscanf(p, "s%u", &pshard); + if (r == 1) { + shard = 
shard_id_t(pshard); + } else { + return false; + } + } + return true; +} + +char *spg_t::calc_name(char *buf, const char *suffix_backwords) const +{ + while (*suffix_backwords) + *--buf = *suffix_backwords++; + + if (!is_no_shard()) { + buf = ritoa((uint8_t)shard.id, buf); + *--buf = 's'; + } + + return pgid.calc_name(buf, ""); +} + +std::string spg_t::calc_name_sring() const +{ + char buf[spg_t::calc_name_buf_size]; + buf[spg_t::calc_name_buf_size - 1] = '\0'; + return string{calc_name(buf + spg_t::calc_name_buf_size - 1, "")}; +} + +ostream& operator<<(ostream& out, const spg_t &pg) +{ + char buf[spg_t::calc_name_buf_size]; + buf[spg_t::calc_name_buf_size - 1] = '\0'; + out << pg.calc_name(buf + spg_t::calc_name_buf_size - 1, ""); + return out; +} + +pg_t pg_t::get_ancestor(unsigned old_pg_num) const +{ + int old_bits = cbits(old_pg_num); + int old_mask = (1 << old_bits) - 1; + pg_t ret = *this; + ret.m_seed = ceph_stable_mod(m_seed, old_pg_num, old_mask); + return ret; +} + +bool pg_t::is_split(unsigned old_pg_num, unsigned new_pg_num, set *children) const +{ + //ceph_assert(m_seed < old_pg_num); + if (m_seed >= old_pg_num) { + // degenerate case + return false; + } + if (new_pg_num <= old_pg_num) + return false; + + bool split = false; + if (true) { + unsigned old_bits = cbits(old_pg_num); + unsigned old_mask = (1 << old_bits) - 1; + for (unsigned n = 1; ; n++) { + unsigned next_bit = (n << (old_bits-1)); + unsigned s = next_bit | m_seed; + + if (s < old_pg_num || s == m_seed) + continue; + if (s >= new_pg_num) + break; + if ((unsigned)ceph_stable_mod(s, old_pg_num, old_mask) == m_seed) { + split = true; + if (children) + children->insert(pg_t(s, m_pool)); + } + } + } + if (false) { + // brute force + int old_bits = cbits(old_pg_num); + int old_mask = (1 << old_bits) - 1; + for (unsigned x = old_pg_num; x < new_pg_num; ++x) { + unsigned o = ceph_stable_mod(x, old_pg_num, old_mask); + if (o == m_seed) { + split = true; + children->insert(pg_t(x, m_pool)); + } + 
} + } + return split; +} + +unsigned pg_t::get_split_bits(unsigned pg_num) const { + if (pg_num == 1) + return 0; + ceph_assert(pg_num > 1); + + // Find unique p such that pg_num \in [2^(p-1), 2^p) + unsigned p = cbits(pg_num); + ceph_assert(p); // silence coverity #751330 + + if ((m_seed % (1<<(p-1))) < (pg_num % (1<<(p-1)))) + return p; + else + return p - 1; +} + +bool pg_t::is_merge_source( + unsigned old_pg_num, + unsigned new_pg_num, + pg_t *parent) const +{ + if (m_seed < old_pg_num && + m_seed >= new_pg_num) { + if (parent) { + pg_t t = *this; + while (t.m_seed >= new_pg_num) { + t = t.get_parent(); + } + *parent = t; + } + return true; + } + return false; +} + +pg_t pg_t::get_parent() const +{ + unsigned bits = cbits(m_seed); + ceph_assert(bits); + pg_t retval = *this; + retval.m_seed &= ~((~0)<<(bits - 1)); + return retval; +} + +hobject_t pg_t::get_hobj_start() const +{ + return hobject_t(object_t(), string(), 0, m_seed, m_pool, + string()); +} + +hobject_t pg_t::get_hobj_end(unsigned pg_num) const +{ + // note: this assumes a bitwise sort; with the legacy nibblewise + // sort a PG did not always cover a single contiguous range of the + // (bit-reversed) hash range. 
+ unsigned bits = get_split_bits(pg_num); + uint64_t rev_start = hobject_t::_reverse_bits(m_seed); + uint64_t rev_end = (rev_start | (0xffffffff >> bits)) + 1; + if (rev_end >= 0x100000000) { + ceph_assert(rev_end == 0x100000000); + return hobject_t::get_max(); + } else { + return hobject_t(object_t(), string(), CEPH_NOSNAP, + hobject_t::_reverse_bits(rev_end), m_pool, + string()); + } +} + +void pg_t::dump(Formatter *f) const +{ + f->dump_unsigned("pool", m_pool); + f->dump_unsigned("seed", m_seed); +} + +void pg_t::generate_test_instances(list& o) +{ + o.push_back(new pg_t); + o.push_back(new pg_t(1, 2)); + o.push_back(new pg_t(13123, 3)); + o.push_back(new pg_t(131223, 4)); +} + +char *pg_t::calc_name(char *buf, const char *suffix_backwords) const +{ + while (*suffix_backwords) + *--buf = *suffix_backwords++; + + buf = ritoa(m_seed, buf); + + *--buf = '.'; + + return ritoa(m_pool, buf); +} + +ostream& operator<<(ostream& out, const pg_t &pg) +{ + char buf[pg_t::calc_name_buf_size]; + buf[pg_t::calc_name_buf_size - 1] = '\0'; + out << pg.calc_name(buf + pg_t::calc_name_buf_size - 1, ""); + return out; +} + + +// -- coll_t -- + +void coll_t::calc_str() +{ + switch (type) { + case TYPE_META: + strcpy(_str_buff, "meta"); + _str = _str_buff; + break; + case TYPE_PG: + _str_buff[spg_t::calc_name_buf_size - 1] = '\0'; + _str = pgid.calc_name(_str_buff + spg_t::calc_name_buf_size - 1, "daeh_"); + break; + case TYPE_PG_TEMP: + _str_buff[spg_t::calc_name_buf_size - 1] = '\0'; + _str = pgid.calc_name(_str_buff + spg_t::calc_name_buf_size - 1, "PMET_"); + break; + default: + ceph_abort_msg("unknown collection type"); + } +} + +bool coll_t::parse(const std::string& s) +{ + if (s == "meta") { + type = TYPE_META; + pgid = spg_t(); + removal_seq = 0; + calc_str(); + ceph_assert(s == _str); + return true; + } + if (s.find("_head") == s.length() - 5 && + pgid.parse(s.substr(0, s.length() - 5))) { + type = TYPE_PG; + removal_seq = 0; + calc_str(); + ceph_assert(s == _str); + 
return true; + } + if (s.find("_TEMP") == s.length() - 5 && + pgid.parse(s.substr(0, s.length() - 5))) { + type = TYPE_PG_TEMP; + removal_seq = 0; + calc_str(); + ceph_assert(s == _str); + return true; + } + return false; +} + +void coll_t::encode(ceph::buffer::list& bl) const +{ + using ceph::encode; + // when changing this, remember to update encoded_size() too. + if (is_temp()) { + // can't express this as v2... + __u8 struct_v = 3; + encode(struct_v, bl); + encode(to_str(), bl); + } else { + __u8 struct_v = 2; + encode(struct_v, bl); + encode((__u8)type, bl); + encode(pgid, bl); + snapid_t snap = CEPH_NOSNAP; + encode(snap, bl); + } +} + +size_t coll_t::encoded_size() const +{ + size_t r = sizeof(__u8); + if (is_temp()) { + // v3 + r += sizeof(__u32); + if (_str) { + r += strlen(_str); + } + } else { + // v2 + // 1. type + r += sizeof(__u8); + // 2. pgid + // - encoding header + r += sizeof(ceph_le32) + 2 * sizeof(__u8); + // - pg_t + r += sizeof(__u8) + sizeof(uint64_t) + 2 * sizeof(uint32_t); + // - shard_id_t + r += sizeof(int8_t); + // 3. 
snapid_t + r += sizeof(uint64_t); + } + + return r; +} + +void coll_t::decode(ceph::buffer::list::const_iterator& bl) +{ + using ceph::decode; + __u8 struct_v; + decode(struct_v, bl); + switch (struct_v) { + case 1: + { + snapid_t snap; + decode(pgid, bl); + decode(snap, bl); + + // infer the type + if (pgid == spg_t() && snap == 0) { + type = TYPE_META; + } else { + type = TYPE_PG; + } + removal_seq = 0; + } + break; + + case 2: + { + __u8 _type; + snapid_t snap; + decode(_type, bl); + decode(pgid, bl); + decode(snap, bl); + type = (type_t)_type; + removal_seq = 0; + } + break; + + case 3: + { + string str; + decode(str, bl); + bool ok = parse(str); + if (!ok) + throw std::domain_error(std::string("unable to parse pg ") + str); + } + break; + + default: + { + CachedStackStringStream css; + *css << "coll_t::decode(): don't know how to decode version " + << struct_v; + throw std::domain_error(css->str()); + } + } +} + +void coll_t::dump(Formatter *f) const +{ + f->dump_unsigned("type_id", (unsigned)type); + if (type != TYPE_META) + f->dump_stream("pgid") << pgid; + f->dump_string("name", to_str()); +} + +void coll_t::generate_test_instances(list& o) +{ + o.push_back(new coll_t()); + o.push_back(new coll_t(spg_t(pg_t(1, 0), shard_id_t::NO_SHARD))); + o.push_back(new coll_t(o.back()->get_temp())); + o.push_back(new coll_t(spg_t(pg_t(3, 2), shard_id_t(12)))); + o.push_back(new coll_t(o.back()->get_temp())); + o.push_back(new coll_t()); +} + +// --- + +std::string pg_vector_string(const vector &a) +{ + CachedStackStringStream css; + *css << "["; + for (auto i = a.cbegin(); i != a.cend(); ++i) { + if (i != a.begin()) + *css << ","; + if (*i != CRUSH_ITEM_NONE) + *css << *i; + else + *css << "NONE"; + } + *css << "]"; + return css->str(); +} + +std::string pg_state_string(uint64_t state) +{ + CachedStackStringStream css; + if (state & PG_STATE_STALE) + *css << "stale+"; + if (state & PG_STATE_CREATING) + *css << "creating+"; + if (state & PG_STATE_ACTIVE) + *css << 
"active+"; + if (state & PG_STATE_ACTIVATING) + *css << "activating+"; + if (state & PG_STATE_CLEAN) + *css << "clean+"; + if (state & PG_STATE_RECOVERY_WAIT) + *css << "recovery_wait+"; + if (state & PG_STATE_RECOVERY_TOOFULL) + *css << "recovery_toofull+"; + if (state & PG_STATE_RECOVERING) + *css << "recovering+"; + if (state & PG_STATE_FORCED_RECOVERY) + *css << "forced_recovery+"; + if (state & PG_STATE_DOWN) + *css << "down+"; + if (state & PG_STATE_RECOVERY_UNFOUND) + *css << "recovery_unfound+"; + if (state & PG_STATE_BACKFILL_UNFOUND) + *css << "backfill_unfound+"; + if (state & PG_STATE_UNDERSIZED) + *css << "undersized+"; + if (state & PG_STATE_DEGRADED) + *css << "degraded+"; + if (state & PG_STATE_REMAPPED) + *css << "remapped+"; + if (state & PG_STATE_PREMERGE) + *css << "premerge+"; + if (state & PG_STATE_SCRUBBING) + *css << "scrubbing+"; + if (state & PG_STATE_DEEP_SCRUB) + *css << "deep+"; + if (state & PG_STATE_INCONSISTENT) + *css << "inconsistent+"; + if (state & PG_STATE_PEERING) + *css << "peering+"; + if (state & PG_STATE_REPAIR) + *css << "repair+"; + if (state & PG_STATE_BACKFILL_WAIT) + *css << "backfill_wait+"; + if (state & PG_STATE_BACKFILLING) + *css << "backfilling+"; + if (state & PG_STATE_FORCED_BACKFILL) + *css << "forced_backfill+"; + if (state & PG_STATE_BACKFILL_TOOFULL) + *css << "backfill_toofull+"; + if (state & PG_STATE_INCOMPLETE) + *css << "incomplete+"; + if (state & PG_STATE_PEERED) + *css << "peered+"; + if (state & PG_STATE_SNAPTRIM) + *css << "snaptrim+"; + if (state & PG_STATE_SNAPTRIM_WAIT) + *css << "snaptrim_wait+"; + if (state & PG_STATE_SNAPTRIM_ERROR) + *css << "snaptrim_error+"; + if (state & PG_STATE_FAILED_REPAIR) + *css << "failed_repair+"; + if (state & PG_STATE_LAGGY) + *css << "laggy+"; + if (state & PG_STATE_WAIT) + *css << "wait+"; + auto ret = css->str(); + if (ret.length() > 0) + ret.resize(ret.length() - 1); + else + ret = "unknown"; + return ret; +} + +std::optional pg_string_state(const 
std::string& state) +{ + std::optional type; + if (state == "active") + type = PG_STATE_ACTIVE; + else if (state == "clean") + type = PG_STATE_CLEAN; + else if (state == "down") + type = PG_STATE_DOWN; + else if (state == "recovery_unfound") + type = PG_STATE_RECOVERY_UNFOUND; + else if (state == "backfill_unfound") + type = PG_STATE_BACKFILL_UNFOUND; + else if (state == "premerge") + type = PG_STATE_PREMERGE; + else if (state == "scrubbing") + type = PG_STATE_SCRUBBING; + else if (state == "degraded") + type = PG_STATE_DEGRADED; + else if (state == "inconsistent") + type = PG_STATE_INCONSISTENT; + else if (state == "peering") + type = PG_STATE_PEERING; + else if (state == "repair") + type = PG_STATE_REPAIR; + else if (state == "recovering") + type = PG_STATE_RECOVERING; + else if (state == "forced_recovery") + type = PG_STATE_FORCED_RECOVERY; + else if (state == "backfill_wait") + type = PG_STATE_BACKFILL_WAIT; + else if (state == "incomplete") + type = PG_STATE_INCOMPLETE; + else if (state == "stale") + type = PG_STATE_STALE; + else if (state == "remapped") + type = PG_STATE_REMAPPED; + else if (state == "deep") + type = PG_STATE_DEEP_SCRUB; + else if (state == "backfilling") + type = PG_STATE_BACKFILLING; + else if (state == "forced_backfill") + type = PG_STATE_FORCED_BACKFILL; + else if (state == "backfill_toofull") + type = PG_STATE_BACKFILL_TOOFULL; + else if (state == "recovery_wait") + type = PG_STATE_RECOVERY_WAIT; + else if (state == "recovery_toofull") + type = PG_STATE_RECOVERY_TOOFULL; + else if (state == "undersized") + type = PG_STATE_UNDERSIZED; + else if (state == "activating") + type = PG_STATE_ACTIVATING; + else if (state == "peered") + type = PG_STATE_PEERED; + else if (state == "snaptrim") + type = PG_STATE_SNAPTRIM; + else if (state == "snaptrim_wait") + type = PG_STATE_SNAPTRIM_WAIT; + else if (state == "snaptrim_error") + type = PG_STATE_SNAPTRIM_ERROR; + else if (state == "creating") + type = PG_STATE_CREATING; + else if (state == 
"failed_repair") + type = PG_STATE_FAILED_REPAIR; + else if (state == "laggy") + type = PG_STATE_LAGGY; + else if (state == "wait") + type = PG_STATE_WAIT; + else if (state == "unknown") + type = 0; + else + type = std::nullopt; + return type; +} + +// -- eversion_t -- +string eversion_t::get_key_name() const +{ + std::string key(32, ' '); + get_key_name(&key[0]); + key.resize(31); // remove the null terminator + return key; +} + +// -- pool_snap_info_t -- +void pool_snap_info_t::dump(Formatter *f) const +{ + f->dump_unsigned("snapid", snapid); + f->dump_stream("stamp") << stamp; + f->dump_string("name", name); +} + +void pool_snap_info_t::encode(ceph::buffer::list& bl, uint64_t features) const +{ + using ceph::encode; + if ((features & CEPH_FEATURE_PGPOOL3) == 0) { + __u8 struct_v = 1; + encode(struct_v, bl); + encode(snapid, bl); + encode(stamp, bl); + encode(name, bl); + return; + } + ENCODE_START(2, 2, bl); + encode(snapid, bl); + encode(stamp, bl); + encode(name, bl); + ENCODE_FINISH(bl); +} + +void pool_snap_info_t::decode(ceph::buffer::list::const_iterator& bl) +{ + DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl); + decode(snapid, bl); + decode(stamp, bl); + decode(name, bl); + DECODE_FINISH(bl); +} + +void pool_snap_info_t::generate_test_instances(list& o) +{ + o.push_back(new pool_snap_info_t); + o.push_back(new pool_snap_info_t); + o.back()->snapid = 1; + o.back()->stamp = utime_t(1, 2); + o.back()->name = "foo"; +} + +// -- pool_opts_t -- + +// The order of items in the list is important, therefore, +// you should always add to the end of the list when adding new options. 
+ +typedef std::map opt_mapping_t; +static opt_mapping_t opt_mapping = boost::assign::map_list_of + ("scrub_min_interval", pool_opts_t::opt_desc_t( + pool_opts_t::SCRUB_MIN_INTERVAL, pool_opts_t::DOUBLE)) + ("scrub_max_interval", pool_opts_t::opt_desc_t( + pool_opts_t::SCRUB_MAX_INTERVAL, pool_opts_t::DOUBLE)) + ("deep_scrub_interval", pool_opts_t::opt_desc_t( + pool_opts_t::DEEP_SCRUB_INTERVAL, pool_opts_t::DOUBLE)) + ("recovery_priority", pool_opts_t::opt_desc_t( + pool_opts_t::RECOVERY_PRIORITY, pool_opts_t::INT)) + ("recovery_op_priority", pool_opts_t::opt_desc_t( + pool_opts_t::RECOVERY_OP_PRIORITY, pool_opts_t::INT)) + ("scrub_priority", pool_opts_t::opt_desc_t( + pool_opts_t::SCRUB_PRIORITY, pool_opts_t::INT)) + ("compression_mode", pool_opts_t::opt_desc_t( + pool_opts_t::COMPRESSION_MODE, pool_opts_t::STR)) + ("compression_algorithm", pool_opts_t::opt_desc_t( + pool_opts_t::COMPRESSION_ALGORITHM, pool_opts_t::STR)) + ("compression_required_ratio", pool_opts_t::opt_desc_t( + pool_opts_t::COMPRESSION_REQUIRED_RATIO, pool_opts_t::DOUBLE)) + ("compression_max_blob_size", pool_opts_t::opt_desc_t( + pool_opts_t::COMPRESSION_MAX_BLOB_SIZE, pool_opts_t::INT)) + ("compression_min_blob_size", pool_opts_t::opt_desc_t( + pool_opts_t::COMPRESSION_MIN_BLOB_SIZE, pool_opts_t::INT)) + ("csum_type", pool_opts_t::opt_desc_t( + pool_opts_t::CSUM_TYPE, pool_opts_t::INT)) + ("csum_max_block", pool_opts_t::opt_desc_t( + pool_opts_t::CSUM_MAX_BLOCK, pool_opts_t::INT)) + ("csum_min_block", pool_opts_t::opt_desc_t( + pool_opts_t::CSUM_MIN_BLOCK, pool_opts_t::INT)) + ("fingerprint_algorithm", pool_opts_t::opt_desc_t( + pool_opts_t::FINGERPRINT_ALGORITHM, pool_opts_t::STR)) + ("pg_num_min", pool_opts_t::opt_desc_t( + pool_opts_t::PG_NUM_MIN, pool_opts_t::INT)) + ("target_size_bytes", pool_opts_t::opt_desc_t( + pool_opts_t::TARGET_SIZE_BYTES, pool_opts_t::INT)) + ("target_size_ratio", pool_opts_t::opt_desc_t( + pool_opts_t::TARGET_SIZE_RATIO, pool_opts_t::DOUBLE)) + 
("pg_autoscale_bias", pool_opts_t::opt_desc_t( + pool_opts_t::PG_AUTOSCALE_BIAS, pool_opts_t::DOUBLE)) + ("read_lease_interval", pool_opts_t::opt_desc_t( + pool_opts_t::READ_LEASE_INTERVAL, pool_opts_t::DOUBLE)) + ("dedup_tier", pool_opts_t::opt_desc_t( + pool_opts_t::DEDUP_TIER, pool_opts_t::INT)) + ("dedup_chunk_algorithm", pool_opts_t::opt_desc_t( + pool_opts_t::DEDUP_CHUNK_ALGORITHM, pool_opts_t::STR)) + ("dedup_cdc_chunk_size", pool_opts_t::opt_desc_t( + pool_opts_t::DEDUP_CDC_CHUNK_SIZE, pool_opts_t::INT)) + ("pg_num_max", pool_opts_t::opt_desc_t( + pool_opts_t::PG_NUM_MAX, pool_opts_t::INT)); + +bool pool_opts_t::is_opt_name(const std::string& name) +{ + return opt_mapping.count(name); +} + +pool_opts_t::opt_desc_t pool_opts_t::get_opt_desc(const std::string& name) +{ + auto i = opt_mapping.find(name); + ceph_assert(i != opt_mapping.end()); + return i->second; +} + +bool pool_opts_t::is_set(pool_opts_t::key_t key) const +{ + return opts.count(key); +} + +const pool_opts_t::value_t& pool_opts_t::get(pool_opts_t::key_t key) const +{ + auto i = opts.find(key); + ceph_assert(i != opts.end()); + return i->second; +} + +bool pool_opts_t::unset(pool_opts_t::key_t key) { + return opts.erase(key) > 0; +} + +class pool_opts_dumper_t : public boost::static_visitor<> { +public: + pool_opts_dumper_t(const std::string& name_, Formatter* f_) : + name(name_.c_str()), f(f_) {} + + void operator()(std::string s) const { + f->dump_string(name, s); + } + void operator()(int64_t i) const { + f->dump_int(name, i); + } + void operator()(double d) const { + f->dump_float(name, d); + } + +private: + const char* name; + Formatter* f; +}; + +void pool_opts_t::dump(const std::string& name, Formatter* f) const +{ + const opt_desc_t& desc = get_opt_desc(name); + auto i = opts.find(desc.key); + if (i == opts.end()) { + return; + } + boost::apply_visitor(pool_opts_dumper_t(name, f), i->second); +} + +void pool_opts_t::dump(Formatter* f) const +{ + for (auto i = opt_mapping.cbegin(); i != 
opt_mapping.cend(); ++i) { + const std::string& name = i->first; + const opt_desc_t& desc = i->second; + auto j = opts.find(desc.key); + if (j == opts.end()) { + continue; + } + boost::apply_visitor(pool_opts_dumper_t(name, f), j->second); + } +} + +class pool_opts_encoder_t : public boost::static_visitor<> { +public: + explicit pool_opts_encoder_t(ceph::buffer::list& bl_, uint64_t features) + : bl(bl_), + features(features) {} + + void operator()(const std::string &s) const { + encode(static_cast(pool_opts_t::STR), bl); + encode(s, bl); + } + void operator()(int64_t i) const { + encode(static_cast(pool_opts_t::INT), bl); + if (HAVE_FEATURE(features, SERVER_NAUTILUS)) { + encode(i, bl); + } else { + encode(static_cast(i), bl); + } + } + void operator()(double d) const { + encode(static_cast(pool_opts_t::DOUBLE), bl); + encode(d, bl); + } + +private: + ceph::buffer::list& bl; + uint64_t features; +}; + +void pool_opts_t::encode(ceph::buffer::list& bl, uint64_t features) const +{ + unsigned v = 2; + if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) { + v = 1; + } + ENCODE_START(v, 1, bl); + uint32_t n = static_cast(opts.size()); + encode(n, bl); + for (auto i = opts.cbegin(); i != opts.cend(); ++i) { + encode(static_cast(i->first), bl); + boost::apply_visitor(pool_opts_encoder_t(bl, features), i->second); + } + ENCODE_FINISH(bl); +} + +void pool_opts_t::decode(ceph::buffer::list::const_iterator& bl) +{ + DECODE_START(1, bl); + __u32 n; + decode(n, bl); + opts.clear(); + while (n--) { + int32_t k, t; + decode(k, bl); + decode(t, bl); + if (t == STR) { + std::string s; + decode(s, bl); + opts[static_cast(k)] = s; + } else if (t == INT) { + int64_t i; + if (struct_v >= 2) { + decode(i, bl); + } else { + int ii; + decode(ii, bl); + i = ii; + } + opts[static_cast(k)] = i; + } else if (t == DOUBLE) { + double d; + decode(d, bl); + opts[static_cast(k)] = d; + } else { + ceph_assert(!"invalid type"); + } + } + DECODE_FINISH(bl); +} + +ostream& operator<<(ostream& out, const 
pool_opts_t& opts) +{ + for (auto i = opt_mapping.begin(); i != opt_mapping.end(); ++i) { + const std::string& name = i->first; + const pool_opts_t::opt_desc_t& desc = i->second; + auto j = opts.opts.find(desc.key); + if (j == opts.opts.end()) { + continue; + } + out << " " << name << " " << j->second; + } + return out; +} + +// -- pg_pool_t -- + +const char *pg_pool_t::APPLICATION_NAME_CEPHFS("cephfs"); +const char *pg_pool_t::APPLICATION_NAME_RBD("rbd"); +const char *pg_pool_t::APPLICATION_NAME_RGW("rgw"); + +void pg_pool_t::dump(Formatter *f) const +{ + f->dump_stream("create_time") << get_create_time(); + f->dump_unsigned("flags", get_flags()); + f->dump_string("flags_names", get_flags_string()); + f->dump_int("type", get_type()); + f->dump_int("size", get_size()); + f->dump_int("min_size", get_min_size()); + f->dump_int("crush_rule", get_crush_rule()); + f->dump_int("peering_crush_bucket_count", peering_crush_bucket_count); + f->dump_int("peering_crush_bucket_target", peering_crush_bucket_target); + f->dump_int("peering_crush_bucket_barrier", peering_crush_bucket_barrier); + f->dump_int("peering_crush_bucket_mandatory_member", peering_crush_mandatory_member); + f->dump_int("object_hash", get_object_hash()); + f->dump_string("pg_autoscale_mode", + get_pg_autoscale_mode_name(pg_autoscale_mode)); + f->dump_unsigned("pg_num", get_pg_num()); + f->dump_unsigned("pg_placement_num", get_pgp_num()); + f->dump_unsigned("pg_placement_num_target", get_pgp_num_target()); + f->dump_unsigned("pg_num_target", get_pg_num_target()); + f->dump_unsigned("pg_num_pending", get_pg_num_pending()); + f->dump_object("last_pg_merge_meta", last_pg_merge_meta); + f->dump_stream("last_change") << get_last_change(); + f->dump_stream("last_force_op_resend") << get_last_force_op_resend(); + f->dump_stream("last_force_op_resend_prenautilus") + << get_last_force_op_resend_prenautilus(); + f->dump_stream("last_force_op_resend_preluminous") + << get_last_force_op_resend_preluminous(); + 
f->dump_unsigned("auid", get_auid()); + f->dump_string("snap_mode", is_pool_snaps_mode() ? "pool" : "selfmanaged"); + f->dump_unsigned("snap_seq", get_snap_seq()); + f->dump_unsigned("snap_epoch", get_snap_epoch()); + f->open_array_section("pool_snaps"); + for (auto p = snaps.cbegin(); p != snaps.cend(); ++p) { + f->open_object_section("pool_snap_info"); + p->second.dump(f); + f->close_section(); + } + f->close_section(); + f->dump_stream("removed_snaps") << removed_snaps; + f->dump_unsigned("quota_max_bytes", quota_max_bytes); + f->dump_unsigned("quota_max_objects", quota_max_objects); + f->open_array_section("tiers"); + for (auto p = tiers.cbegin(); p != tiers.cend(); ++p) + f->dump_unsigned("pool_id", *p); + f->close_section(); + f->dump_int("tier_of", tier_of); + f->dump_int("read_tier", read_tier); + f->dump_int("write_tier", write_tier); + f->dump_string("cache_mode", get_cache_mode_name()); + f->dump_unsigned("target_max_bytes", target_max_bytes); + f->dump_unsigned("target_max_objects", target_max_objects); + f->dump_unsigned("cache_target_dirty_ratio_micro", + cache_target_dirty_ratio_micro); + f->dump_unsigned("cache_target_dirty_high_ratio_micro", + cache_target_dirty_high_ratio_micro); + f->dump_unsigned("cache_target_full_ratio_micro", + cache_target_full_ratio_micro); + f->dump_unsigned("cache_min_flush_age", cache_min_flush_age); + f->dump_unsigned("cache_min_evict_age", cache_min_evict_age); + f->dump_string("erasure_code_profile", erasure_code_profile); + f->open_object_section("hit_set_params"); + hit_set_params.dump(f); + f->close_section(); // hit_set_params + f->dump_unsigned("hit_set_period", hit_set_period); + f->dump_unsigned("hit_set_count", hit_set_count); + f->dump_bool("use_gmt_hitset", use_gmt_hitset); + f->dump_unsigned("min_read_recency_for_promote", min_read_recency_for_promote); + f->dump_unsigned("min_write_recency_for_promote", min_write_recency_for_promote); + f->dump_unsigned("hit_set_grade_decay_rate", 
hit_set_grade_decay_rate); + f->dump_unsigned("hit_set_search_last_n", hit_set_search_last_n); + f->open_array_section("grade_table"); + for (unsigned i = 0; i < hit_set_count; ++i) + f->dump_unsigned("value", get_grade(i)); + f->close_section(); + f->dump_unsigned("stripe_width", get_stripe_width()); + f->dump_unsigned("expected_num_objects", expected_num_objects); + f->dump_bool("fast_read", fast_read); + f->open_object_section("options"); + opts.dump(f); + f->close_section(); // options + f->open_object_section("application_metadata"); + for (auto &app_pair : application_metadata) { + f->open_object_section(app_pair.first.c_str()); + for (auto &kv_pair : app_pair.second) { + f->dump_string(kv_pair.first.c_str(), kv_pair.second); + } + f->close_section(); // application + } + f->close_section(); // application_metadata +} + +void pg_pool_t::convert_to_pg_shards(const vector &from, set* to) const { + for (size_t i = 0; i < from.size(); ++i) { + if (from[i] != CRUSH_ITEM_NONE) { + to->insert( + pg_shard_t( + from[i], + is_erasure() ? 
shard_id_t(i) : shard_id_t::NO_SHARD)); + } + } +} + +void pg_pool_t::calc_pg_masks() +{ + pg_num_mask = (1 << cbits(pg_num-1)) - 1; + pgp_num_mask = (1 << cbits(pgp_num-1)) - 1; +} + +unsigned pg_pool_t::get_pg_num_divisor(pg_t pgid) const +{ + if (pg_num == pg_num_mask + 1) + return pg_num; // power-of-2 split + unsigned mask = pg_num_mask >> 1; + if ((pgid.ps() & mask) < (pg_num & mask)) + return pg_num_mask + 1; // smaller bin size (already split) + else + return (pg_num_mask + 1) >> 1; // bigger bin (not yet split) +} + +bool pg_pool_t::is_pending_merge(pg_t pgid, bool *target) const +{ + if (pg_num_pending >= pg_num) { + return false; + } + if (pgid.ps() >= pg_num_pending && pgid.ps() < pg_num) { + if (target) { + *target = false; + } + return true; + } + for (unsigned ps = pg_num_pending; ps < pg_num; ++ps) { + if (pg_t(ps, pgid.pool()).get_parent() == pgid) { + if (target) { + *target = true; + } + return true; + } + } + return false; +} + +/* + * we have two snap modes: + * - pool snaps + * - snap existence/non-existence defined by snaps[] and snap_seq + * - user managed snaps + * - existence tracked by librados user + */ +bool pg_pool_t::is_pool_snaps_mode() const +{ + return has_flag(FLAG_POOL_SNAPS); +} + +bool pg_pool_t::is_unmanaged_snaps_mode() const +{ + return has_flag(FLAG_SELFMANAGED_SNAPS); +} + +bool pg_pool_t::is_removed_snap(snapid_t s) const +{ + if (is_pool_snaps_mode()) + return s <= get_snap_seq() && snaps.count(s) == 0; + else + return removed_snaps.contains(s); +} + +snapid_t pg_pool_t::snap_exists(std::string_view s) const +{ + for (auto p = snaps.cbegin(); p != snaps.cend(); ++p) + if (p->second.name == s) + return p->second.snapid; + return 0; +} + +void pg_pool_t::add_snap(const char *n, utime_t stamp) +{ + ceph_assert(!is_unmanaged_snaps_mode()); + flags |= FLAG_POOL_SNAPS; + snapid_t s = get_snap_seq() + 1; + snap_seq = s; + snaps[s].snapid = s; + snaps[s].name = n; + snaps[s].stamp = stamp; +} + +uint64_t 
pg_pool_t::add_unmanaged_snap(bool preoctopus_compat) +{ + ceph_assert(!is_pool_snaps_mode()); + if (snap_seq == 0) { + if (preoctopus_compat) { + // kludge for pre-mimic tracking of pool vs selfmanaged snaps. after + // mimic this field is not decoded but our flag is set; pre-mimic, we + // have a non-empty removed_snaps to signifiy a non-pool-snaps pool. + removed_snaps.insert(snapid_t(1)); + } + snap_seq = 1; + } + flags |= FLAG_SELFMANAGED_SNAPS; + snap_seq = snap_seq + 1; + return snap_seq; +} + +void pg_pool_t::remove_snap(snapid_t s) +{ + ceph_assert(snaps.count(s)); + snaps.erase(s); + snap_seq = snap_seq + 1; +} + +void pg_pool_t::remove_unmanaged_snap(snapid_t s, bool preoctopus_compat) +{ + ceph_assert(is_unmanaged_snaps_mode()); + ++snap_seq; + if (preoctopus_compat) { + removed_snaps.insert(s); + // try to add in the new seq, just to try to keep the interval_set contiguous + if (!removed_snaps.contains(get_snap_seq())) { + removed_snaps.insert(get_snap_seq()); + } + } +} + +SnapContext pg_pool_t::get_snap_context() const +{ + vector s(snaps.size()); + unsigned i = 0; + for (auto p = snaps.crbegin(); p != snaps.crend(); ++p) + s[i++] = p->first; + return SnapContext(get_snap_seq(), s); +} + +uint32_t pg_pool_t::hash_key(const string& key, const string& ns) const +{ + if (ns.empty()) + return ceph_str_hash(object_hash, key.data(), key.length()); + int nsl = ns.length(); + int len = key.length() + nsl + 1; + char buf[len]; + memcpy(&buf[0], ns.data(), nsl); + buf[nsl] = '\037'; + memcpy(&buf[nsl+1], key.data(), key.length()); + return ceph_str_hash(object_hash, &buf[0], len); +} + +uint32_t pg_pool_t::raw_hash_to_pg(uint32_t v) const +{ + return ceph_stable_mod(v, pg_num, pg_num_mask); +} + +/* + * map a raw pg (with full precision ps) into an actual pg, for storage + */ +pg_t pg_pool_t::raw_pg_to_pg(pg_t pg) const +{ + pg.set_ps(ceph_stable_mod(pg.ps(), pg_num, pg_num_mask)); + return pg; +} + +/* + * map raw pg (full precision ps) into a placement seed. 
include + * pool id in that value so that different pools don't use the same + * seeds. + */ +ps_t pg_pool_t::raw_pg_to_pps(pg_t pg) const +{ + if (flags & FLAG_HASHPSPOOL) { + // Hash the pool id so that pool PGs do not overlap. + return + crush_hash32_2(CRUSH_HASH_RJENKINS1, + ceph_stable_mod(pg.ps(), pgp_num, pgp_num_mask), + pg.pool()); + } else { + // Legacy behavior; add ps and pool together. This is not a great + // idea because the PGs from each pool will essentially overlap on + // top of each other: 0.5 == 1.4 == 2.3 == ... + return + ceph_stable_mod(pg.ps(), pgp_num, pgp_num_mask) + + pg.pool(); + } +} + +uint32_t pg_pool_t::get_random_pg_position(pg_t pg, uint32_t seed) const +{ + uint32_t r = crush_hash32_2(CRUSH_HASH_RJENKINS1, seed, 123); + if (pg_num == pg_num_mask + 1) { + r &= ~pg_num_mask; + } else { + unsigned smaller_mask = pg_num_mask >> 1; + if ((pg.ps() & smaller_mask) < (pg_num & smaller_mask)) { + r &= ~pg_num_mask; + } else { + r &= ~smaller_mask; + } + } + r |= pg.ps(); + return r; +} + +void pg_pool_t::encode(ceph::buffer::list& bl, uint64_t features) const +{ + using ceph::encode; + if ((features & CEPH_FEATURE_PGPOOL3) == 0) { + // this encoding matches the old struct ceph_pg_pool + __u8 struct_v = 2; + encode(struct_v, bl); + encode(type, bl); + encode(size, bl); + encode(crush_rule, bl); + encode(object_hash, bl); + encode(pg_num, bl); + encode(pgp_num, bl); + __u32 lpg_num = 0, lpgp_num = 0; // tell old code that there are no localized pgs. 
+ encode(lpg_num, bl); + encode(lpgp_num, bl); + encode(last_change, bl); + encode(snap_seq, bl); + encode(snap_epoch, bl); + + __u32 n = snaps.size(); + encode(n, bl); + n = removed_snaps.num_intervals(); + encode(n, bl); + + encode(auid, bl); + + encode_nohead(snaps, bl, features); + encode_nohead(removed_snaps, bl); + return; + } + + if ((features & CEPH_FEATURE_OSDENC) == 0) { + __u8 struct_v = 4; + encode(struct_v, bl); + encode(type, bl); + encode(size, bl); + encode(crush_rule, bl); + encode(object_hash, bl); + encode(pg_num, bl); + encode(pgp_num, bl); + __u32 lpg_num = 0, lpgp_num = 0; // tell old code that there are no localized pgs. + encode(lpg_num, bl); + encode(lpgp_num, bl); + encode(last_change, bl); + encode(snap_seq, bl); + encode(snap_epoch, bl); + encode(snaps, bl, features); + encode(removed_snaps, bl); + encode(auid, bl); + encode(flags, bl); + encode((uint32_t)0, bl); // crash_replay_interval + return; + } + + if ((features & CEPH_FEATURE_OSD_POOLRESEND) == 0) { + // we simply added last_force_op_resend here, which is a fully + // backward compatible change. however, encoding the same map + // differently between monitors triggers scrub noise (even though + // they are decodable without the feature), so let's be pendantic + // about it. + ENCODE_START(14, 5, bl); + encode(type, bl); + encode(size, bl); + encode(crush_rule, bl); + encode(object_hash, bl); + encode(pg_num, bl); + encode(pgp_num, bl); + __u32 lpg_num = 0, lpgp_num = 0; // tell old code that there are no localized pgs. 
+ encode(lpg_num, bl); + encode(lpgp_num, bl); + encode(last_change, bl); + encode(snap_seq, bl); + encode(snap_epoch, bl); + encode(snaps, bl, features); + encode(removed_snaps, bl); + encode(auid, bl); + encode(flags, bl); + encode((uint32_t)0, bl); // crash_replay_interval + encode(min_size, bl); + encode(quota_max_bytes, bl); + encode(quota_max_objects, bl); + encode(tiers, bl); + encode(tier_of, bl); + __u8 c = cache_mode; + encode(c, bl); + encode(read_tier, bl); + encode(write_tier, bl); + encode(properties, bl); + encode(hit_set_params, bl); + encode(hit_set_period, bl); + encode(hit_set_count, bl); + encode(stripe_width, bl); + encode(target_max_bytes, bl); + encode(target_max_objects, bl); + encode(cache_target_dirty_ratio_micro, bl); + encode(cache_target_full_ratio_micro, bl); + encode(cache_min_flush_age, bl); + encode(cache_min_evict_age, bl); + encode(erasure_code_profile, bl); + ENCODE_FINISH(bl); + return; + } + + uint8_t v = 30; + // NOTE: any new encoding dependencies must be reflected by + // SIGNIFICANT_FEATURES + if (!(features & CEPH_FEATURE_NEW_OSDOP_ENCODING)) { + // this was the first post-hammer thing we added; if it's missing, encode + // like hammer. + v = 21; + } else if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) { + v = 24; + } else if (!HAVE_FEATURE(features, SERVER_MIMIC)) { + v = 26; + } else if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) { + v = 27; + } else if (!is_stretch_pool()) { + v = 29; + } + + ENCODE_START(v, 5, bl); + encode(type, bl); + encode(size, bl); + encode(crush_rule, bl); + encode(object_hash, bl); + encode(pg_num, bl); + encode(pgp_num, bl); + __u32 lpg_num = 0, lpgp_num = 0; // tell old code that there are no localized pgs. 
+ encode(lpg_num, bl); + encode(lpgp_num, bl); + encode(last_change, bl); + encode(snap_seq, bl); + encode(snap_epoch, bl); + encode(snaps, bl, features); + encode(removed_snaps, bl); + encode(auid, bl); + if (v >= 27) { + encode(flags, bl); + } else { + auto tmp = flags; + tmp &= ~(FLAG_SELFMANAGED_SNAPS | FLAG_POOL_SNAPS | FLAG_CREATING); + encode(tmp, bl); + } + encode((uint32_t)0, bl); // crash_replay_interval + encode(min_size, bl); + encode(quota_max_bytes, bl); + encode(quota_max_objects, bl); + encode(tiers, bl); + encode(tier_of, bl); + __u8 c = cache_mode; + encode(c, bl); + encode(read_tier, bl); + encode(write_tier, bl); + encode(properties, bl); + encode(hit_set_params, bl); + encode(hit_set_period, bl); + encode(hit_set_count, bl); + encode(stripe_width, bl); + encode(target_max_bytes, bl); + encode(target_max_objects, bl); + encode(cache_target_dirty_ratio_micro, bl); + encode(cache_target_full_ratio_micro, bl); + encode(cache_min_flush_age, bl); + encode(cache_min_evict_age, bl); + encode(erasure_code_profile, bl); + encode(last_force_op_resend_preluminous, bl); + encode(min_read_recency_for_promote, bl); + encode(expected_num_objects, bl); + if (v >= 19) { + encode(cache_target_dirty_high_ratio_micro, bl); + } + if (v >= 20) { + encode(min_write_recency_for_promote, bl); + } + if (v >= 21) { + encode(use_gmt_hitset, bl); + } + if (v >= 22) { + encode(fast_read, bl); + } + if (v >= 23) { + encode(hit_set_grade_decay_rate, bl); + encode(hit_set_search_last_n, bl); + } + if (v >= 24) { + encode(opts, bl, features); + } + if (v >= 25) { + encode(last_force_op_resend_prenautilus, bl); + } + if (v >= 26) { + encode(application_metadata, bl); + } + if (v >= 27) { + encode(create_time, bl); + } + if (v >= 28) { + encode(pg_num_target, bl); + encode(pgp_num_target, bl); + encode(pg_num_pending, bl); + encode((epoch_t)0, bl); // pg_num_dec_last_epoch_started from 14.1.[01] + encode((epoch_t)0, bl); // pg_num_dec_last_epoch_clean from 14.1.[01] + 
encode(last_force_op_resend, bl); + encode(pg_autoscale_mode, bl); + } + if (v >= 29) { + encode(last_pg_merge_meta, bl); + } + if (v >= 30) { + encode(peering_crush_bucket_count, bl); + encode(peering_crush_bucket_target, bl); + encode(peering_crush_bucket_barrier, bl); + encode(peering_crush_mandatory_member, bl); + } + ENCODE_FINISH(bl); +} + +void pg_pool_t::decode(ceph::buffer::list::const_iterator& bl) +{ + DECODE_START_LEGACY_COMPAT_LEN(30, 5, 5, bl); + decode(type, bl); + decode(size, bl); + decode(crush_rule, bl); + decode(object_hash, bl); + decode(pg_num, bl); + decode(pgp_num, bl); + { + __u32 lpg_num, lpgp_num; + decode(lpg_num, bl); + decode(lpgp_num, bl); + } + decode(last_change, bl); + decode(snap_seq, bl); + decode(snap_epoch, bl); + + if (struct_v >= 3) { + decode(snaps, bl); + decode(removed_snaps, bl); + decode(auid, bl); + } else { + __u32 n, m; + decode(n, bl); + decode(m, bl); + decode(auid, bl); + decode_nohead(n, snaps, bl); + decode_nohead(m, removed_snaps, bl); + } + + if (struct_v >= 4) { + decode(flags, bl); + uint32_t crash_replay_interval; + decode(crash_replay_interval, bl); + } else { + flags = 0; + } + // upgrade path for selfmanaged vs pool snaps + if (snap_seq > 0 && (flags & (FLAG_SELFMANAGED_SNAPS|FLAG_POOL_SNAPS)) == 0) { + if (!removed_snaps.empty()) { + flags |= FLAG_SELFMANAGED_SNAPS; + } else { + flags |= FLAG_POOL_SNAPS; + } + } + if (struct_v >= 7) { + decode(min_size, bl); + } else { + min_size = size - size/2; + } + if (struct_v >= 8) { + decode(quota_max_bytes, bl); + decode(quota_max_objects, bl); + } + if (struct_v >= 9) { + decode(tiers, bl); + decode(tier_of, bl); + __u8 v; + decode(v, bl); + cache_mode = (cache_mode_t)v; + decode(read_tier, bl); + decode(write_tier, bl); + } + if (struct_v >= 10) { + decode(properties, bl); + } + if (struct_v >= 11) { + decode(hit_set_params, bl); + decode(hit_set_period, bl); + decode(hit_set_count, bl); + } else { + pg_pool_t def; + hit_set_period = def.hit_set_period; + 
hit_set_count = def.hit_set_count; + } + if (struct_v >= 12) { + decode(stripe_width, bl); + } else { + set_stripe_width(0); + } + if (struct_v >= 13) { + decode(target_max_bytes, bl); + decode(target_max_objects, bl); + decode(cache_target_dirty_ratio_micro, bl); + decode(cache_target_full_ratio_micro, bl); + decode(cache_min_flush_age, bl); + decode(cache_min_evict_age, bl); + } else { + target_max_bytes = 0; + target_max_objects = 0; + cache_target_dirty_ratio_micro = 0; + cache_target_full_ratio_micro = 0; + cache_min_flush_age = 0; + cache_min_evict_age = 0; + } + if (struct_v >= 14) { + decode(erasure_code_profile, bl); + } + if (struct_v >= 15) { + decode(last_force_op_resend_preluminous, bl); + } else { + last_force_op_resend_preluminous = 0; + } + if (struct_v >= 16) { + decode(min_read_recency_for_promote, bl); + } else { + min_read_recency_for_promote = 1; + } + if (struct_v >= 17) { + decode(expected_num_objects, bl); + } else { + expected_num_objects = 0; + } + if (struct_v >= 19) { + decode(cache_target_dirty_high_ratio_micro, bl); + } else { + cache_target_dirty_high_ratio_micro = cache_target_dirty_ratio_micro; + } + if (struct_v >= 20) { + decode(min_write_recency_for_promote, bl); + } else { + min_write_recency_for_promote = 1; + } + if (struct_v >= 21) { + decode(use_gmt_hitset, bl); + } else { + use_gmt_hitset = false; + } + if (struct_v >= 22) { + decode(fast_read, bl); + } else { + fast_read = false; + } + if (struct_v >= 23) { + decode(hit_set_grade_decay_rate, bl); + decode(hit_set_search_last_n, bl); + } else { + hit_set_grade_decay_rate = 0; + hit_set_search_last_n = 1; + } + if (struct_v >= 24) { + decode(opts, bl); + } + if (struct_v >= 25) { + decode(last_force_op_resend_prenautilus, bl); + } else { + last_force_op_resend_prenautilus = last_force_op_resend_preluminous; + } + if (struct_v >= 26) { + decode(application_metadata, bl); + } + if (struct_v >= 27) { + decode(create_time, bl); + } + if (struct_v >= 28) { + decode(pg_num_target, 
bl); + decode(pgp_num_target, bl); + decode(pg_num_pending, bl); + epoch_t old_merge_last_epoch_clean, old_merge_last_epoch_started; + decode(old_merge_last_epoch_started, bl); + decode(old_merge_last_epoch_clean, bl); + decode(last_force_op_resend, bl); + decode(pg_autoscale_mode, bl); + if (struct_v >= 29) { + decode(last_pg_merge_meta, bl); + } else { + last_pg_merge_meta.last_epoch_clean = old_merge_last_epoch_clean; + last_pg_merge_meta.last_epoch_started = old_merge_last_epoch_started; + } + } else { + pg_num_target = pg_num; + pgp_num_target = pgp_num; + pg_num_pending = pg_num; + last_force_op_resend = last_force_op_resend_prenautilus; + pg_autoscale_mode = pg_autoscale_mode_t::WARN; // default to warn on upgrade + } + if (struct_v >= 30) { + decode(peering_crush_bucket_count, bl); + decode(peering_crush_bucket_target, bl); + decode(peering_crush_bucket_barrier, bl); + decode(peering_crush_mandatory_member, bl); + } + DECODE_FINISH(bl); + calc_pg_masks(); + calc_grade_table(); +} + +bool pg_pool_t::stretch_set_can_peer(const set& want, const OSDMap& osdmap, + std::ostream * out) const +{ + if (!is_stretch_pool()) return true; + const uint32_t barrier_id = peering_crush_bucket_barrier; + const uint32_t barrier_count = peering_crush_bucket_count; + set ancestors; + const shared_ptr& crush = osdmap.crush; + for (int osdid : want) { + int ancestor = crush->get_parent_of_type(osdid, barrier_id, + crush_rule); + ancestors.insert(ancestor); + } + if (ancestors.size() < barrier_count) { + if (out) { + *out << __func__ << ": not enough crush buckets with OSDs in want set " + << want; + } + return false; + } else if (peering_crush_mandatory_member != CRUSH_ITEM_NONE && + !ancestors.count(peering_crush_mandatory_member)) { + if (out) { + *out << __func__ << ": missing mandatory crush bucket member " + << peering_crush_mandatory_member; + } + return false; + } + return true; +} + +void pg_pool_t::generate_test_instances(list& o) +{ + pg_pool_t a; + o.push_back(new 
pg_pool_t(a)); + + a.create_time = utime_t(4,5); + a.type = TYPE_REPLICATED; + a.size = 2; + a.crush_rule = 3; + a.object_hash = 4; + a.pg_num = 6; + a.pgp_num = 4; + a.pgp_num_target = 4; + a.pg_num_target = 5; + a.pg_num_pending = 5; + a.last_pg_merge_meta.last_epoch_started = 2; + a.last_pg_merge_meta.last_epoch_clean = 2; + a.last_change = 9; + a.last_force_op_resend = 123823; + a.last_force_op_resend_preluminous = 123824; + a.snap_seq = 10; + a.snap_epoch = 11; + a.flags = FLAG_POOL_SNAPS; + a.auid = 12; + a.quota_max_bytes = 473; + a.quota_max_objects = 474; + o.push_back(new pg_pool_t(a)); + + a.snaps[3].name = "asdf"; + a.snaps[3].snapid = 3; + a.snaps[3].stamp = utime_t(123, 4); + a.snaps[6].name = "qwer"; + a.snaps[6].snapid = 6; + a.snaps[6].stamp = utime_t(23423, 4); + o.push_back(new pg_pool_t(a)); + + a.flags = FLAG_SELFMANAGED_SNAPS; + a.snaps.clear(); + a.removed_snaps.insert(2); + a.quota_max_bytes = 2473; + a.quota_max_objects = 4374; + a.tiers.insert(0); + a.tiers.insert(1); + a.tier_of = 2; + a.cache_mode = CACHEMODE_WRITEBACK; + a.read_tier = 1; + a.write_tier = 1; + a.hit_set_params = HitSet::Params(new BloomHitSet::Params); + a.hit_set_period = 3600; + a.hit_set_count = 8; + a.min_read_recency_for_promote = 1; + a.min_write_recency_for_promote = 1; + a.hit_set_grade_decay_rate = 50; + a.hit_set_search_last_n = 1; + a.calc_grade_table(); + a.set_stripe_width(12345); + a.target_max_bytes = 1238132132; + a.target_max_objects = 1232132; + a.cache_target_dirty_ratio_micro = 187232; + a.cache_target_dirty_high_ratio_micro = 309856; + a.cache_target_full_ratio_micro = 987222; + a.cache_min_flush_age = 231; + a.cache_min_evict_age = 2321; + a.erasure_code_profile = "profile in osdmap"; + a.expected_num_objects = 123456; + a.fast_read = false; + a.application_metadata = {{"rbd", {{"key", "value"}}}}; + o.push_back(new pg_pool_t(a)); +} + +ostream& operator<<(ostream& out, const pg_pool_t& p) +{ + out << p.get_type_name(); + if (p.get_type_name() == 
"erasure") { + out << " profile " << p.erasure_code_profile; + } + out << " size " << p.get_size() + << " min_size " << p.get_min_size() + << " crush_rule " << p.get_crush_rule() + << " object_hash " << p.get_object_hash_name() + << " pg_num " << p.get_pg_num() + << " pgp_num " << p.get_pgp_num(); + if (p.get_pg_num_target() != p.get_pg_num()) { + out << " pg_num_target " << p.get_pg_num_target(); + } + if (p.get_pgp_num_target() != p.get_pgp_num()) { + out << " pgp_num_target " << p.get_pgp_num_target(); + } + if (p.get_pg_num_pending() != p.get_pg_num()) { + out << " pg_num_pending " << p.get_pg_num_pending(); + } + if (p.pg_autoscale_mode != pg_pool_t::pg_autoscale_mode_t::UNKNOWN) { + out << " autoscale_mode " << p.get_pg_autoscale_mode_name(p.pg_autoscale_mode); + } + out << " last_change " << p.get_last_change(); + if (p.get_last_force_op_resend() || + p.get_last_force_op_resend_prenautilus() || + p.get_last_force_op_resend_preluminous()) + out << " lfor " << p.get_last_force_op_resend() << "/" + << p.get_last_force_op_resend_prenautilus() << "/" + << p.get_last_force_op_resend_preluminous(); + if (p.get_auid()) + out << " owner " << p.get_auid(); + if (p.flags) + out << " flags " << p.get_flags_string(); + if (p.quota_max_bytes) + out << " max_bytes " << p.quota_max_bytes; + if (p.quota_max_objects) + out << " max_objects " << p.quota_max_objects; + if (!p.tiers.empty()) + out << " tiers " << p.tiers; + if (p.is_tier()) + out << " tier_of " << p.tier_of; + if (p.has_read_tier()) + out << " read_tier " << p.read_tier; + if (p.has_write_tier()) + out << " write_tier " << p.write_tier; + if (p.cache_mode) + out << " cache_mode " << p.get_cache_mode_name(); + if (p.target_max_bytes) + out << " target_bytes " << p.target_max_bytes; + if (p.target_max_objects) + out << " target_objects " << p.target_max_objects; + if (p.hit_set_params.get_type() != HitSet::TYPE_NONE) { + out << " hit_set " << p.hit_set_params + << " " << p.hit_set_period << "s" + << " x" << 
p.hit_set_count << " decay_rate "
+        << p.hit_set_grade_decay_rate
+        << " search_last_n " << p.hit_set_search_last_n;
+  }
+  if (p.min_read_recency_for_promote)
+    out << " min_read_recency_for_promote " << p.min_read_recency_for_promote;
+  if (p.min_write_recency_for_promote)
+    out << " min_write_recency_for_promote " << p.min_write_recency_for_promote;
+  out << " stripe_width " << p.get_stripe_width();
+  if (p.expected_num_objects)
+    out << " expected_num_objects " << p.expected_num_objects;
+  if (p.fast_read)
+    out << " fast_read " << p.fast_read;
+  out << p.opts;
+  if (!p.application_metadata.empty()) {
+    out << " application ";
+    for (auto it = p.application_metadata.begin();
+         it != p.application_metadata.end(); ++it) {
+      if (it != p.application_metadata.begin())
+        out << ",";
+      out << it->first;
+    }
+  }
+  return out;
+}
+
+
+// -- object_stat_sum_t --
+// Write every counter of this sum to the formatter as an integer field.
+void object_stat_sum_t::dump(Formatter *f) const
+{
+  f->dump_int("num_bytes", num_bytes);
+  f->dump_int("num_objects", num_objects);
+  f->dump_int("num_object_clones", num_object_clones);
+  f->dump_int("num_object_copies", num_object_copies);
+  f->dump_int("num_objects_missing_on_primary", num_objects_missing_on_primary);
+  f->dump_int("num_objects_missing", num_objects_missing);
+  f->dump_int("num_objects_degraded", num_objects_degraded);
+  f->dump_int("num_objects_misplaced", num_objects_misplaced);
+  f->dump_int("num_objects_unfound", num_objects_unfound);
+  f->dump_int("num_objects_dirty", num_objects_dirty);
+  f->dump_int("num_whiteouts", num_whiteouts);
+  f->dump_int("num_read", num_rd);
+  f->dump_int("num_read_kb", num_rd_kb);
+  f->dump_int("num_write", num_wr);
+  f->dump_int("num_write_kb", num_wr_kb);
+  f->dump_int("num_scrub_errors", num_scrub_errors);
+  f->dump_int("num_shallow_scrub_errors", num_shallow_scrub_errors);
+  f->dump_int("num_deep_scrub_errors", num_deep_scrub_errors);
+  f->dump_int("num_objects_recovered", num_objects_recovered);
+  f->dump_int("num_bytes_recovered", num_bytes_recovered);
+  f->dump_int("num_keys_recovered", num_keys_recovered);
+  f->dump_int("num_objects_omap", num_objects_omap);
+  f->dump_int("num_objects_hit_set_archive", num_objects_hit_set_archive);
+  f->dump_int("num_bytes_hit_set_archive", num_bytes_hit_set_archive);
+  f->dump_int("num_flush", num_flush);
+  f->dump_int("num_flush_kb", num_flush_kb);
+  f->dump_int("num_evict", num_evict);
+  f->dump_int("num_evict_kb", num_evict_kb);
+  f->dump_int("num_promote", num_promote);
+  f->dump_int("num_flush_mode_high", num_flush_mode_high);
+  f->dump_int("num_flush_mode_low", num_flush_mode_low);
+  f->dump_int("num_evict_mode_some", num_evict_mode_some);
+  f->dump_int("num_evict_mode_full", num_evict_mode_full);
+  f->dump_int("num_objects_pinned", num_objects_pinned);
+  f->dump_int("num_legacy_snapsets", num_legacy_snapsets);
+  f->dump_int("num_large_omap_objects", num_large_omap_objects);
+  f->dump_int("num_objects_manifest", num_objects_manifest);
+  f->dump_int("num_omap_bytes", num_omap_bytes);
+  f->dump_int("num_omap_keys", num_omap_keys);
+  f->dump_int("num_objects_repaired", num_objects_repaired);
+}
+// Encode as struct version 20 (compat 14): raw struct bytes on little-endian builds, field by field otherwise.
+void object_stat_sum_t::encode(ceph::buffer::list& bl) const
+{
+  ENCODE_START(20, 14, bl);
+#if defined(CEPH_LITTLE_ENDIAN)
+  bl.append((char *)(&num_bytes), sizeof(object_stat_sum_t));  // appends sizeof(*this) bytes starting at num_bytes; presumably num_bytes is the first member and the layout matches the #else order -- TODO confirm
+#else
+  encode(num_bytes, bl);
+  encode(num_objects, bl);
+  encode(num_object_clones, bl);
+  encode(num_object_copies, bl);
+  encode(num_objects_missing_on_primary, bl);
+  encode(num_objects_degraded, bl);
+  encode(num_objects_unfound, bl);
+  encode(num_rd, bl);
+  encode(num_rd_kb, bl);
+  encode(num_wr, bl);
+  encode(num_wr_kb, bl);
+  encode(num_scrub_errors, bl);
+  encode(num_objects_recovered, bl);
+  encode(num_bytes_recovered, bl);
+  encode(num_keys_recovered, bl);
+  encode(num_shallow_scrub_errors, bl);
+  encode(num_deep_scrub_errors, bl);
+  encode(num_objects_dirty, bl);
+  encode(num_whiteouts, bl);
+  encode(num_objects_omap, bl);
+  encode(num_objects_hit_set_archive, bl);
+  encode(num_objects_misplaced, bl);
+  encode(num_bytes_hit_set_archive, bl);
+  encode(num_flush, bl);
+  encode(num_flush_kb, bl);
+  encode(num_evict, bl);
+  encode(num_evict_kb, bl);
+  encode(num_promote, bl);
+  encode(num_flush_mode_high, bl);
+  encode(num_flush_mode_low, bl);
+  encode(num_evict_mode_some, bl);
+  encode(num_evict_mode_full, bl);
+  encode(num_objects_pinned, bl);
+  encode(num_objects_missing, bl);
+  encode(num_legacy_snapsets, bl);
+  encode(num_large_omap_objects, bl);
+  encode(num_objects_manifest, bl);
+  encode(num_omap_bytes, bl);
+  encode(num_omap_keys, bl);
+  encode(num_objects_repaired, bl);
+#endif
+  ENCODE_FINISH(bl);
+}
+// Decode a sum; the field order below mirrors the field-by-field branch of encode().
+void object_stat_sum_t::decode(ceph::buffer::list::const_iterator& bl)
+{
+  bool decode_finish = false;  // set once the raw-copy fast path has consumed the payload
+  static const int STAT_SUM_DECODE_VERSION = 20;
+  DECODE_START(STAT_SUM_DECODE_VERSION, bl);
+#if defined(CEPH_LITTLE_ENDIAN)
+  if (struct_v == STAT_SUM_DECODE_VERSION) {  // fast path only when the sender used exactly this version
+    bl.copy(sizeof(object_stat_sum_t), (char*)(&num_bytes));
+    decode_finish = true;
+  }
+#endif
+  if (!decode_finish) {
+    decode(num_bytes, bl);
+    decode(num_objects, bl);
+    decode(num_object_clones, bl);
+    decode(num_object_copies, bl);
+    decode(num_objects_missing_on_primary, bl);
+    decode(num_objects_degraded, bl);
+    decode(num_objects_unfound, bl);
+    decode(num_rd, bl);
+    decode(num_rd_kb, bl);
+    decode(num_wr, bl);
+    decode(num_wr_kb, bl);
+    decode(num_scrub_errors, bl);
+    decode(num_objects_recovered, bl);
+    decode(num_bytes_recovered, bl);
+    decode(num_keys_recovered, bl);
+    decode(num_shallow_scrub_errors, bl);
+    decode(num_deep_scrub_errors, bl);
+    decode(num_objects_dirty, bl);
+    decode(num_whiteouts, bl);
+    decode(num_objects_omap, bl);
+    decode(num_objects_hit_set_archive, bl);
+    decode(num_objects_misplaced, bl);
+    decode(num_bytes_hit_set_archive, bl);
+    decode(num_flush, bl);
+    decode(num_flush_kb, bl);
+    decode(num_evict, bl);
+    decode(num_evict_kb, bl);
+    decode(num_promote, bl);
+    decode(num_flush_mode_high, bl);
+    decode(num_flush_mode_low, bl);
+    decode(num_evict_mode_some, bl);
+    decode(num_evict_mode_full, bl);
+    decode(num_objects_pinned, bl);
+    decode(num_objects_missing, bl);
+    if (struct_v >= 16) {
+      decode(num_legacy_snapsets, bl);
+    } else {
+      num_legacy_snapsets = num_object_clones;  // upper bound
+    }
+    if (struct_v >= 17) {
+      decode(num_large_omap_objects, bl);
+    }
+    if (struct_v >= 18) {
+      decode(num_objects_manifest, bl);
+    }
+    if (struct_v >= 19) {
+      decode(num_omap_bytes, bl);
+      decode(num_omap_keys, bl);
+    }
+    if (struct_v >= 20) {
+      decode(num_objects_repaired, bl);
+    }
+  }
+  DECODE_FINISH(bl);
+}
+// Append one heap-allocated instance with assorted non-default counters to o.
+void object_stat_sum_t::generate_test_instances(list& o)  // NOTE(review): template arguments look stripped in this copy of the patch
+{
+  object_stat_sum_t a;
+
+  a.num_bytes = 1;
+  a.num_objects = 3;
+  a.num_object_clones = 4;
+  a.num_object_copies = 5;
+  a.num_objects_missing_on_primary = 6;
+  a.num_objects_missing = 123;
+  a.num_objects_degraded = 7;
+  a.num_objects_unfound = 8;
+  a.num_rd = 9; a.num_rd_kb = 10;
+  a.num_wr = 11; a.num_wr_kb = 12;
+  a.num_objects_recovered = 14;
+  a.num_bytes_recovered = 15;
+  a.num_keys_recovered = 16;
+  a.num_deep_scrub_errors = 17;
+  a.num_shallow_scrub_errors = 18;
+  a.num_scrub_errors = a.num_deep_scrub_errors + a.num_shallow_scrub_errors;
+  a.num_objects_dirty = 21;
+  a.num_whiteouts = 22;
+  a.num_objects_misplaced = 1232;
+  a.num_objects_hit_set_archive = 2;
+  a.num_bytes_hit_set_archive = 27;
+  a.num_flush = 5;
+  a.num_flush_kb = 6;
+  a.num_evict = 7;
+  a.num_evict_kb = 8;
+  a.num_promote = 9;
+  a.num_flush_mode_high = 0;
+  a.num_flush_mode_low = 1;
+  a.num_evict_mode_some = 1;
+  a.num_evict_mode_full = 0;
+  a.num_objects_pinned = 20;
+  a.num_large_omap_objects = 5;
+  a.num_objects_manifest = 2;
+  a.num_omap_bytes = 20000;
+  a.num_omap_keys = 200;
+  a.num_objects_repaired = 300;
+  o.push_back(new object_stat_sum_t(a));
+}
+// Add each counter of o to the matching counter of this object.
+void object_stat_sum_t::add(const object_stat_sum_t& o)
+{
+  num_bytes += o.num_bytes;
+  num_objects += o.num_objects;
+  num_object_clones += o.num_object_clones;
+  num_object_copies += o.num_object_copies;
+  num_objects_missing_on_primary += o.num_objects_missing_on_primary;
+  num_objects_missing += o.num_objects_missing;
+  num_objects_degraded += o.num_objects_degraded;
+  num_objects_misplaced += o.num_objects_misplaced;
+  num_rd += o.num_rd;
+  num_rd_kb += o.num_rd_kb;
+  num_wr += o.num_wr;
+  num_wr_kb += o.num_wr_kb;
+  num_objects_unfound += o.num_objects_unfound;
+  num_scrub_errors += o.num_scrub_errors;
+  num_shallow_scrub_errors += o.num_shallow_scrub_errors;
+  num_deep_scrub_errors += o.num_deep_scrub_errors;
+  num_objects_recovered += o.num_objects_recovered;
+  num_bytes_recovered += o.num_bytes_recovered;
+  num_keys_recovered += o.num_keys_recovered;
+  num_objects_dirty += o.num_objects_dirty;
+  num_whiteouts += o.num_whiteouts;
+  num_objects_omap += o.num_objects_omap;
+  num_objects_hit_set_archive += o.num_objects_hit_set_archive;
+  num_bytes_hit_set_archive += o.num_bytes_hit_set_archive;
+  num_flush += o.num_flush;
+  num_flush_kb += o.num_flush_kb;
+  num_evict += o.num_evict;
+  num_evict_kb += o.num_evict_kb;
+  num_promote += o.num_promote;
+  num_flush_mode_high += o.num_flush_mode_high;
+  num_flush_mode_low += o.num_flush_mode_low;
+  num_evict_mode_some += o.num_evict_mode_some;
+  num_evict_mode_full += o.num_evict_mode_full;
+  num_objects_pinned += o.num_objects_pinned;
+  num_legacy_snapsets += o.num_legacy_snapsets;
+  num_large_omap_objects += o.num_large_omap_objects;
+  num_objects_manifest += o.num_objects_manifest;
+  num_omap_bytes += o.num_omap_bytes;
+  num_omap_keys += o.num_omap_keys;
+  num_objects_repaired += o.num_objects_repaired;
+}
+// Subtract each counter of o from the matching counter of this object (the inverse of add()).
+void object_stat_sum_t::sub(const object_stat_sum_t& o)
+{
+  num_bytes -= o.num_bytes;
+  num_objects -= o.num_objects;
+  num_object_clones -= o.num_object_clones;
+  num_object_copies -= o.num_object_copies;
+  num_objects_missing_on_primary -= o.num_objects_missing_on_primary;
+  num_objects_missing -= o.num_objects_missing;
+  num_objects_degraded -= o.num_objects_degraded;
+  num_objects_misplaced -= o.num_objects_misplaced;
+  num_rd -= o.num_rd;
+  num_rd_kb -= o.num_rd_kb;
+  num_wr -= o.num_wr;
+  num_wr_kb -= o.num_wr_kb;
+  num_objects_unfound -= o.num_objects_unfound;
+  num_scrub_errors -= o.num_scrub_errors;
+  num_shallow_scrub_errors -= o.num_shallow_scrub_errors;
+  num_deep_scrub_errors -= o.num_deep_scrub_errors;
+  num_objects_recovered -= o.num_objects_recovered;
+  num_bytes_recovered -= o.num_bytes_recovered;
+  num_keys_recovered -= o.num_keys_recovered;
+  num_objects_dirty -= o.num_objects_dirty;
+  num_whiteouts -= o.num_whiteouts;
+  num_objects_omap -= o.num_objects_omap;
+  num_objects_hit_set_archive -= o.num_objects_hit_set_archive;
+  num_bytes_hit_set_archive -= o.num_bytes_hit_set_archive;
+  num_flush -= o.num_flush;
+  num_flush_kb -= o.num_flush_kb;
+  num_evict -= o.num_evict;
+  num_evict_kb -= o.num_evict_kb;
+  num_promote -= o.num_promote;
+  num_flush_mode_high -= o.num_flush_mode_high;
+  num_flush_mode_low -= o.num_flush_mode_low;
+  num_evict_mode_some -= o.num_evict_mode_some;
+  num_evict_mode_full -= o.num_evict_mode_full;
+  num_objects_pinned -= o.num_objects_pinned;
+  num_legacy_snapsets -= o.num_legacy_snapsets;
+  num_large_omap_objects -= o.num_large_omap_objects;
+  num_objects_manifest -= o.num_objects_manifest;
+  num_omap_bytes -= o.num_omap_bytes;
+  num_omap_keys -= o.num_omap_keys;
+  num_objects_repaired -= o.num_objects_repaired;
+}
+// Equality: true only when every counter listed below matches.
+bool operator==(const object_stat_sum_t& l, const object_stat_sum_t& r)
+{
+  return
+    l.num_bytes == r.num_bytes &&
+    l.num_objects == r.num_objects &&
+    l.num_object_clones == r.num_object_clones &&
+    l.num_object_copies == r.num_object_copies &&
+    l.num_objects_missing_on_primary == r.num_objects_missing_on_primary &&
+    l.num_objects_missing == r.num_objects_missing &&
+    l.num_objects_degraded == r.num_objects_degraded &&
+    l.num_objects_misplaced == r.num_objects_misplaced &&
+    l.num_objects_unfound == r.num_objects_unfound &&
+    l.num_rd == r.num_rd &&
+    l.num_rd_kb == r.num_rd_kb &&
+    l.num_wr == r.num_wr &&
+    l.num_wr_kb == r.num_wr_kb &&
+    l.num_scrub_errors == r.num_scrub_errors &&
+    l.num_shallow_scrub_errors == r.num_shallow_scrub_errors &&
+    l.num_deep_scrub_errors == r.num_deep_scrub_errors &&
+    l.num_objects_recovered == r.num_objects_recovered &&
+    l.num_bytes_recovered == r.num_bytes_recovered &&
+    l.num_keys_recovered == r.num_keys_recovered &&
+    l.num_objects_dirty == r.num_objects_dirty &&
+    l.num_whiteouts == r.num_whiteouts &&
+    l.num_objects_omap == r.num_objects_omap &&
+    l.num_objects_hit_set_archive == r.num_objects_hit_set_archive &&
+    l.num_bytes_hit_set_archive == r.num_bytes_hit_set_archive &&
+    l.num_flush == r.num_flush &&
+    l.num_flush_kb == r.num_flush_kb &&
+    l.num_evict == r.num_evict &&
+    l.num_evict_kb == r.num_evict_kb &&
+    l.num_promote == r.num_promote &&
+    l.num_flush_mode_high == r.num_flush_mode_high &&
+    l.num_flush_mode_low == r.num_flush_mode_low &&
+    l.num_evict_mode_some == r.num_evict_mode_some &&
+    l.num_evict_mode_full == r.num_evict_mode_full &&
+    l.num_objects_pinned == r.num_objects_pinned &&
+    l.num_legacy_snapsets == r.num_legacy_snapsets &&
+    l.num_large_omap_objects == r.num_large_omap_objects &&
+    l.num_objects_manifest == r.num_objects_manifest &&
+    l.num_omap_bytes == r.num_omap_bytes &&
+    l.num_omap_keys == r.num_omap_keys &&
+    l.num_objects_repaired == r.num_objects_repaired;
+}
+
+// -- object_stat_collection_t --
+// Dump the contained sum inside a "stat_sum" object section.
+void object_stat_collection_t::dump(Formatter *f) const
+{
+  f->open_object_section("stat_sum");
+  sum.dump(f);
+  f->close_section();
+}
+// Encode the sum followed by a zero 32-bit value.
+void object_stat_collection_t::encode(ceph::buffer::list& bl) const
+{
+  ENCODE_START(2, 2, bl);
+  encode(sum, bl);
+  encode((__u32)0, bl);  // presumably the element count of an empty map: decode() reads and discards a map at this position
+  ENCODE_FINISH(bl);
+}
+// Decode the sum, then read and discard the map that follows it.
+void object_stat_collection_t::decode(ceph::buffer::list::const_iterator& bl)
+{
+  DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
+  decode(sum, bl);
+  {
+    map cat_sum;  // scoped temporary, dropped right after decoding; NOTE(review): template arguments look stripped in this copy
+    decode(cat_sum, bl);
+  }
+  DECODE_FINISH(bl);
+}
+
+void
object_stat_collection_t::generate_test_instances(list& o)  // appends a default instance, then one more after each sum from object_stat_sum_t::generate_test_instances is added in
+{
+  object_stat_collection_t a;
+  o.push_back(new object_stat_collection_t(a));
+  list l;  // NOTE(review): template arguments look stripped in this copy of the patch
+  object_stat_sum_t::generate_test_instances(l);
+  for (auto p = l.begin(); p != l.end(); ++p) {
+    a.add(**p);  // a accumulates, so each pushed copy includes all sums seen so far
+    o.push_back(new object_stat_collection_t(a));
+  }
+}
+
+
+// -- pg_stat_t --
+// With primary set: true iff osd is acting_primary. Otherwise: true iff osd appears anywhere in acting.
+bool pg_stat_t::is_acting_osd(int32_t osd, bool primary) const
+{
+  if (primary && osd == acting_primary) {
+    return true;
+  } else if (!primary) {
+    for(auto it = acting.cbegin(); it != acting.cend(); ++it)
+    {
+      if (*it == osd)
+        return true;
+    }
+  }
+  return false;
+}
+// Dump the full PG stat record: scalar fields, the nested stats, then the up/acting/shard/blocked_by/purged_snaps arrays.
+void pg_stat_t::dump(Formatter *f) const
+{
+  f->dump_stream("version") << version;
+  f->dump_unsigned("reported_seq", reported_seq);
+  f->dump_unsigned("reported_epoch", reported_epoch);
+  f->dump_string("state", pg_state_string(state));
+  f->dump_stream("last_fresh") << last_fresh;
+  f->dump_stream("last_change") << last_change;
+  f->dump_stream("last_active") << last_active;
+  f->dump_stream("last_peered") << last_peered;
+  f->dump_stream("last_clean") << last_clean;
+  f->dump_stream("last_became_active") << last_became_active;
+  f->dump_stream("last_became_peered") << last_became_peered;
+  f->dump_stream("last_unstale") << last_unstale;
+  f->dump_stream("last_undegraded") << last_undegraded;
+  f->dump_stream("last_fullsized") << last_fullsized;
+  f->dump_unsigned("mapping_epoch", mapping_epoch);
+  f->dump_stream("log_start") << log_start;
+  f->dump_stream("ondisk_log_start") << ondisk_log_start;
+  f->dump_unsigned("created", created);
+  f->dump_unsigned("last_epoch_clean", last_epoch_clean);
+  f->dump_stream("parent") << parent;
+  f->dump_unsigned("parent_split_bits", parent_split_bits);
+  f->dump_stream("last_scrub") << last_scrub;
+  f->dump_stream("last_scrub_stamp") << last_scrub_stamp;
+  f->dump_stream("last_deep_scrub") << last_deep_scrub;
+  f->dump_stream("last_deep_scrub_stamp") << last_deep_scrub_stamp;
+  f->dump_stream("last_clean_scrub_stamp") << last_clean_scrub_stamp;
+  f->dump_int("objects_scrubbed", objects_scrubbed);
+  f->dump_int("log_size", log_size);
+  f->dump_int("log_dups_size", log_dups_size);
+  f->dump_int("ondisk_log_size", ondisk_log_size);
+  f->dump_bool("stats_invalid", stats_invalid);
+  f->dump_bool("dirty_stats_invalid", dirty_stats_invalid);
+  f->dump_bool("omap_stats_invalid", omap_stats_invalid);
+  f->dump_bool("hitset_stats_invalid", hitset_stats_invalid);
+  f->dump_bool("hitset_bytes_stats_invalid", hitset_bytes_stats_invalid);
+  f->dump_bool("pin_stats_invalid", pin_stats_invalid);
+  f->dump_bool("manifest_stats_invalid", manifest_stats_invalid);
+  f->dump_unsigned("snaptrimq_len", snaptrimq_len);
+  f->dump_int("last_scrub_duration", last_scrub_duration);
+  f->dump_string("scrub_schedule", dump_scrub_schedule());
+  f->dump_float("scrub_duration", scrub_duration);
+  f->dump_int("objects_trimmed", objects_trimmed);
+  f->dump_float("snaptrim_duration", snaptrim_duration);
+  stats.dump(f);  // emitted directly at this level, with no section opened here around it
+  f->open_array_section("up");
+  for (auto p = up.cbegin(); p != up.cend(); ++p)
+    f->dump_int("osd", *p);
+  f->close_section();
+  f->open_array_section("acting");
+  for (auto p = acting.cbegin(); p != acting.cend(); ++p)
+    f->dump_int("osd", *p);
+  f->close_section();
+  f->open_array_section("avail_no_missing");
+  for (auto p = avail_no_missing.cbegin(); p != avail_no_missing.cend(); ++p)
+    f->dump_stream("shard") << *p;
+  f->close_section();
+  f->open_array_section("object_location_counts");
+  for (auto p = object_location_counts.cbegin(); p != object_location_counts.cend(); ++p) {
+    f->open_object_section("entry");
+    f->dump_stream("shards") << p->first;
+    f->dump_int("objects", p->second);
+    f->close_section();
+  }
+  f->close_section();
+  f->open_array_section("blocked_by");
+  for (auto p = blocked_by.cbegin(); p != blocked_by.cend(); ++p)
+    f->dump_int("osd", *p);
+  f->close_section();
+  f->dump_int("up_primary", up_primary);
+  f->dump_int("acting_primary", acting_primary);
+  f->open_array_section("purged_snaps");
+  for (auto i = purged_snaps.begin(); i != purged_snaps.end(); ++i) {
+    f->open_object_section("interval");
+    f->dump_stream("start") << i.get_start();
+    f->dump_stream("length") << i.get_len();
+    f->close_section();
+  }
+  f->close_section();
+}
+// Short form of dump(): only state, the up and acting arrays, and the two primaries.
+void pg_stat_t::dump_brief(Formatter *f) const
+{
+  f->dump_string("state", pg_state_string(state));
+  f->open_array_section("up");
+  for (auto p = up.cbegin(); p != up.cend(); ++p)
+    f->dump_int("osd", *p);
+  f->close_section();
+  f->open_array_section("acting");
+  for (auto p = acting.cbegin(); p != acting.cend(); ++p)
+    f->dump_int("osd", *p);
+  f->close_section();
+  f->dump_int("up_primary", up_primary);
+  f->dump_int("acting_primary", acting_primary);
+}
+// Render scrub_sched_status as text: an active scrub is reported first, otherwise the schedule state is described.
+std::string pg_stat_t::dump_scrub_schedule() const
+{
+  if (scrub_sched_status.m_is_active) {
+    // are we blocked (in fact, stuck) on some locked object?
+    if (scrub_sched_status.m_sched_status == pg_scrub_sched_status_t::blocked) {
+      return fmt::format(
+        "Blocked! locked objects (for {}s)",
+        scrub_sched_status.m_duration_seconds);
+    } else {
+      return fmt::format(
+        "{}scrubbing for {}s",
+        ((scrub_sched_status.m_is_deep == scrub_level_t::deep) ? "deep " : ""),
+        scrub_sched_status.m_duration_seconds);
+    }
+  }
+  switch (scrub_sched_status.m_sched_status) {  // reached only when no scrub is active
+    case pg_scrub_sched_status_t::unknown:
+      // no reported scrub schedule yet
+      return "--"s;
+    case pg_scrub_sched_status_t::not_queued:
+      return "no scrub is scheduled"s;
+    case pg_scrub_sched_status_t::scheduled:
+      return fmt::format(
+        "{} {}scrub scheduled @ {}",
+        (scrub_sched_status.m_is_periodic ? "periodic" : "user requested"),
+        ((scrub_sched_status.m_is_deep == scrub_level_t::deep) ? "deep " : ""),
+        scrub_sched_status.m_scheduled_at);
+    case pg_scrub_sched_status_t::queued:
+      return fmt::format(
+        "queued for {}scrub",
+        ((scrub_sched_status.m_is_deep == scrub_level_t::deep) ? "deep " : ""));
+    default:
+      // a bug!
+      return "SCRUB STATE MISMATCH!"s;
+  }
+}
+// Equality over all six scrub-status fields.
+bool operator==(const pg_scrubbing_status_t& l, const pg_scrubbing_status_t& r)
+{
+  return
+    l.m_sched_status == r.m_sched_status &&
+    l.m_scheduled_at == r.m_scheduled_at &&
+    l.m_duration_seconds == r.m_duration_seconds &&
+    l.m_is_active == r.m_is_active &&
+    l.m_is_deep == r.m_is_deep &&
+    l.m_is_periodic == r.m_is_periodic;
+}
+// Encode as struct version 29 (compat 22); the field order is the wire format and must match decode().
+void pg_stat_t::encode(ceph::buffer::list &bl) const
+{
+  ENCODE_START(29, 22, bl);
+  encode(version, bl);
+  encode(reported_seq, bl);
+  encode(reported_epoch, bl);
+  encode((__u32)state, bl);   // for older peers
+  encode(log_start, bl);
+  encode(ondisk_log_start, bl);
+  encode(created, bl);
+  encode(last_epoch_clean, bl);
+  encode(parent, bl);
+  encode(parent_split_bits, bl);
+  encode(last_scrub, bl);
+  encode(last_scrub_stamp, bl);
+  encode(stats, bl);
+  encode(log_size, bl);
+  encode(ondisk_log_size, bl);
+  encode(up, bl);
+  encode(acting, bl);
+  encode(last_fresh, bl);
+  encode(last_change, bl);
+  encode(last_active, bl);
+  encode(last_clean, bl);
+  encode(last_unstale, bl);
+  encode(mapping_epoch, bl);
+  encode(last_deep_scrub, bl);
+  encode(last_deep_scrub_stamp, bl);
+  encode(stats_invalid, bl);
+  encode(last_clean_scrub_stamp, bl);
+  encode(last_became_active, bl);
+  encode(dirty_stats_invalid, bl);
+  encode(up_primary, bl);
+  encode(acting_primary, bl);
+  encode(omap_stats_invalid, bl);
+  encode(hitset_stats_invalid, bl);
+  encode(blocked_by, bl);
+  encode(last_undegraded, bl);
+  encode(last_fullsized, bl);
+  encode(hitset_bytes_stats_invalid, bl);
+  encode(last_peered, bl);
+  encode(last_became_peered, bl);
+  encode(pin_stats_invalid, bl);
+  encode(snaptrimq_len, bl);
+  __u32 top_state = (state >> 32);  // upper 32 bits of state; the lower 32 were written above
+  encode(top_state, bl);
+  encode(purged_snaps, bl);
+  encode(manifest_stats_invalid, bl);
+  encode(avail_no_missing, bl);
+  encode(object_location_counts, bl);
+  encode(last_scrub_duration, bl);
+  encode(scrub_sched_status.m_scheduled_at, bl);
+
encode(scrub_sched_status.m_duration_seconds, bl); + encode((__u16)scrub_sched_status.m_sched_status, bl); + encode(scrub_sched_status.m_is_active, bl); + encode((scrub_sched_status.m_is_deep==scrub_level_t::deep), bl); + encode(scrub_sched_status.m_is_periodic, bl); + encode(objects_scrubbed, bl); + encode(scrub_duration, bl); + encode(objects_trimmed, bl); + encode(snaptrim_duration, bl); + encode(log_dups_size, bl); + + ENCODE_FINISH(bl); +} + +void pg_stat_t::decode(ceph::buffer::list::const_iterator &bl) +{ + bool tmp; + uint32_t old_state; + DECODE_START(29, bl); + decode(version, bl); + decode(reported_seq, bl); + decode(reported_epoch, bl); + decode(old_state, bl); + decode(log_start, bl); + decode(ondisk_log_start, bl); + decode(created, bl); + decode(last_epoch_clean, bl); + decode(parent, bl); + decode(parent_split_bits, bl); + decode(last_scrub, bl); + decode(last_scrub_stamp, bl); + decode(stats, bl); + decode(log_size, bl); + decode(ondisk_log_size, bl); + decode(up, bl); + decode(acting, bl); + decode(last_fresh, bl); + decode(last_change, bl); + decode(last_active, bl); + decode(last_clean, bl); + decode(last_unstale, bl); + decode(mapping_epoch, bl); + decode(last_deep_scrub, bl); + decode(last_deep_scrub_stamp, bl); + decode(tmp, bl); + stats_invalid = tmp; + decode(last_clean_scrub_stamp, bl); + decode(last_became_active, bl); + decode(tmp, bl); + dirty_stats_invalid = tmp; + decode(up_primary, bl); + decode(acting_primary, bl); + decode(tmp, bl); + omap_stats_invalid = tmp; + decode(tmp, bl); + hitset_stats_invalid = tmp; + decode(blocked_by, bl); + decode(last_undegraded, bl); + decode(last_fullsized, bl); + decode(tmp, bl); + hitset_bytes_stats_invalid = tmp; + decode(last_peered, bl); + decode(last_became_peered, bl); + decode(tmp, bl); + pin_stats_invalid = tmp; + if (struct_v >= 23) { + decode(snaptrimq_len, bl); + if (struct_v >= 24) { + __u32 top_state; + decode(top_state, bl); + state = (uint64_t)old_state | ((uint64_t)top_state << 32); 
+ decode(purged_snaps, bl); + } else { + state = old_state; + } + if (struct_v >= 25) { + decode(tmp, bl); + manifest_stats_invalid = tmp; + } else { + manifest_stats_invalid = true; + } + if (struct_v >= 26) { + decode(avail_no_missing, bl); + decode(object_location_counts, bl); + } + if (struct_v >= 27) { + decode(last_scrub_duration, bl); + decode(scrub_sched_status.m_scheduled_at, bl); + decode(scrub_sched_status.m_duration_seconds, bl); + __u16 scrub_sched_as_u16; + decode(scrub_sched_as_u16, bl); + scrub_sched_status.m_sched_status = (pg_scrub_sched_status_t)(scrub_sched_as_u16); + decode(tmp, bl); + scrub_sched_status.m_is_active = tmp; + decode(tmp, bl); + scrub_sched_status.m_is_deep = tmp ? scrub_level_t::deep : scrub_level_t::shallow; + decode(tmp, bl); + scrub_sched_status.m_is_periodic = tmp; + decode(objects_scrubbed, bl); + } + if (struct_v >= 28) { + decode(scrub_duration, bl); + decode(objects_trimmed, bl); + decode(snaptrim_duration, bl); + } + if (struct_v >= 29) { + decode(log_dups_size, bl); + } + } + DECODE_FINISH(bl); +} + +void pg_stat_t::generate_test_instances(list& o) +{ + pg_stat_t a; + o.push_back(new pg_stat_t(a)); + + a.version = eversion_t(1, 3); + a.reported_epoch = 1; + a.reported_seq = 2; + a.state = 123; + a.mapping_epoch = 998; + a.last_fresh = utime_t(1002, 1); + a.last_change = utime_t(1002, 2); + a.last_active = utime_t(1002, 3); + a.last_clean = utime_t(1002, 4); + a.last_unstale = utime_t(1002, 5); + a.last_undegraded = utime_t(1002, 7); + a.last_fullsized = utime_t(1002, 8); + a.log_start = eversion_t(1, 4); + a.ondisk_log_start = eversion_t(1, 5); + a.created = 6; + a.last_epoch_clean = 7; + a.parent = pg_t(1, 2); + a.parent_split_bits = 12; + a.last_scrub = eversion_t(9, 10); + a.last_scrub_stamp = utime_t(11, 12); + a.last_deep_scrub = eversion_t(13, 14); + a.last_deep_scrub_stamp = utime_t(15, 16); + a.last_clean_scrub_stamp = utime_t(17, 18); + a.last_scrub_duration = 3617; + a.scrub_duration = 0.003; + 
a.snaptrimq_len = 1048576;
+  a.objects_scrubbed = 0;
+  a.objects_trimmed = 0;
+  a.snaptrim_duration = 0.123;
+  list l;  // NOTE(review): template arguments look stripped in this copy of the patch
+  object_stat_collection_t::generate_test_instances(l);
+  a.stats = *l.back();  // reuse the last generated collection as the nested stats
+  a.log_size = 99;
+  a.ondisk_log_size = 88;
+  a.up.push_back(123);
+  a.up_primary = 123;
+  a.acting.push_back(456);
+  a.avail_no_missing.push_back(pg_shard_t(456, shard_id_t::NO_SHARD));
+  set sset = { pg_shard_t(0), pg_shard_t(1) };
+  a.object_location_counts.insert(make_pair(sset, 10));
+  sset.insert(pg_shard_t(2));
+  a.object_location_counts.insert(make_pair(sset, 5));
+  a.acting_primary = 456;
+  o.push_back(new pg_stat_t(a));
+
+  a.up.push_back(124);
+  a.up_primary = 124;
+  a.acting.push_back(124);
+  a.acting_primary = 124;
+  a.blocked_by.push_back(155);
+  a.blocked_by.push_back(156);
+  o.push_back(new pg_stat_t(a));
+}
+// Equality over the pg_stat_t fields listed below.
+bool operator==(const pg_stat_t& l, const pg_stat_t& r)
+{
+  return
+    l.version == r.version &&
+    l.reported_seq == r.reported_seq &&
+    l.reported_epoch == r.reported_epoch &&
+    l.state == r.state &&
+    l.last_fresh == r.last_fresh &&
+    l.last_change == r.last_change &&
+    l.last_active == r.last_active &&
+    l.last_peered == r.last_peered &&
+    l.last_clean == r.last_clean &&
+    l.last_unstale == r.last_unstale &&
+    l.last_undegraded == r.last_undegraded &&
+    l.last_fullsized == r.last_fullsized &&
+    l.log_start == r.log_start &&
+    l.ondisk_log_start == r.ondisk_log_start &&
+    l.created == r.created &&
+    l.last_epoch_clean == r.last_epoch_clean &&
+    l.parent == r.parent &&
+    l.parent_split_bits == r.parent_split_bits &&
+    l.last_scrub == r.last_scrub &&
+    l.last_deep_scrub == r.last_deep_scrub &&
+    l.last_scrub_stamp == r.last_scrub_stamp &&
+    l.last_deep_scrub_stamp == r.last_deep_scrub_stamp &&
+    l.last_clean_scrub_stamp == r.last_clean_scrub_stamp &&
+    l.stats == r.stats &&
+    l.stats_invalid == r.stats_invalid &&
+    l.log_size == r.log_size &&
+    l.log_dups_size == r.log_dups_size &&
+    l.ondisk_log_size == r.ondisk_log_size &&
+    l.up == r.up &&
+    l.acting == r.acting &&
+    l.avail_no_missing == r.avail_no_missing &&
+    l.object_location_counts == r.object_location_counts &&
+    l.mapping_epoch == r.mapping_epoch &&
+    l.blocked_by == r.blocked_by &&
+    l.last_became_active == r.last_became_active &&
+    l.last_became_peered == r.last_became_peered &&
+    l.dirty_stats_invalid == r.dirty_stats_invalid &&
+    l.omap_stats_invalid == r.omap_stats_invalid &&
+    l.hitset_stats_invalid == r.hitset_stats_invalid &&
+    l.hitset_bytes_stats_invalid == r.hitset_bytes_stats_invalid &&
+    l.up_primary == r.up_primary &&
+    l.acting_primary == r.acting_primary &&
+    l.pin_stats_invalid == r.pin_stats_invalid &&
+    l.manifest_stats_invalid == r.manifest_stats_invalid &&
+    l.purged_snaps == r.purged_snaps &&
+    l.snaptrimq_len == r.snaptrimq_len &&
+    l.last_scrub_duration == r.last_scrub_duration &&
+    l.scrub_sched_status == r.scrub_sched_status &&
+    l.objects_scrubbed == r.objects_scrubbed &&
+    l.scrub_duration == r.scrub_duration &&
+    l.objects_trimmed == r.objects_trimmed &&
+    l.snaptrim_duration == r.snaptrim_duration;
+}
+
+// -- store_statfs_t --
+// Equality over all ten statfs counters.
+bool store_statfs_t::operator==(const store_statfs_t& other) const
+{
+  return total == other.total
+    && available == other.available
+    && allocated == other.allocated
+    && internally_reserved == other.internally_reserved
+    && data_stored == other.data_stored
+    && data_compressed == other.data_compressed
+    && data_compressed_allocated == other.data_compressed_allocated
+    && data_compressed_original == other.data_compressed_original
+    && omap_allocated == other.omap_allocated
+    && internal_metadata == other.internal_metadata;
+}
+// Dump every statfs counter as an integer field.
+void store_statfs_t::dump(Formatter *f) const
+{
+  f->dump_int("total", total);
+  f->dump_int("available", available);
+  f->dump_int("internally_reserved", internally_reserved);
+  f->dump_int("allocated", allocated);
+  f->dump_int("data_stored", data_stored);
+  f->dump_int("data_compressed", data_compressed);
+  f->dump_int("data_compressed_allocated", data_compressed_allocated);
+  f->dump_int("data_compressed_original", data_compressed_original);
+  f->dump_int("omap_allocated", omap_allocated);
+  f->dump_int("internal_metadata", internal_metadata);
+}
+// Print a one-line hexadecimal summary of the statfs counters.
+ostream& operator<<(ostream& out, const store_statfs_t &s)
+{
+  out << std::hex
+      << "store_statfs(0x" << s.available
+      << "/0x" << s.internally_reserved
+      << "/0x" << s.total
+      << ", data 0x" << s.data_stored
+      << "/0x" << s.allocated
+      << ", compress 0x" << s.data_compressed
+      << "/0x" << s.data_compressed_allocated
+      << "/0x" << s.data_compressed_original
+      << ", omap 0x" << s.omap_allocated
+      << ", meta 0x" << s.internal_metadata
+      << std::dec  // always leaves the stream in decimal, whatever base the caller had set before
+      << ")";
+  return out;
+}
+// Append a default instance and one with every counter set to o.
+void store_statfs_t::generate_test_instances(list& o)
+{
+  store_statfs_t a;
+  o.push_back(new store_statfs_t(a));
+  a.total = 234;
+  a.available = 123;
+  a.internally_reserved = 33;
+  a.allocated = 32;
+  a.data_stored = 44;
+  a.data_compressed = 21;
+  a.data_compressed_allocated = 12;
+  a.data_compressed_original = 13;
+  a.omap_allocated = 14;
+  a.internal_metadata = 15;
+  o.push_back(new store_statfs_t(a));
+}
+
+// -- pool_stat_t --
+// Dump the object stats, the store stats in a "store_stats" section, then the scalar counters.
+void pool_stat_t::dump(Formatter *f) const
+{
+  stats.dump(f);
+  f->open_object_section("store_stats");
+  store_stats.dump(f);
+  f->close_section();
+  f->dump_int("log_size", log_size);
+  f->dump_int("ondisk_log_size", ondisk_log_size);
+  f->dump_int("up", up);
+  f->dump_int("acting", acting);
+  f->dump_int("num_store_stats", num_store_stats);
+}
+// Encode at version 7 (compat 5), or in a bare version-4 layout when the peer lacks CEPH_FEATURE_OSDENC.
+void pool_stat_t::encode(ceph::buffer::list &bl, uint64_t features) const
+{
+  using ceph::encode;
+  if ((features & CEPH_FEATURE_OSDENC) == 0) {
+    __u8 v = 4;  // one-byte version, written without ENCODE_START/ENCODE_FINISH framing
+    encode(v, bl);
+    encode(stats, bl);
+    encode(log_size, bl);
+    encode(ondisk_log_size, bl);
+    return;
+  }
+
+  ENCODE_START(7, 5, bl);
+  encode(stats, bl);
+  encode(log_size, bl);
+  encode(ondisk_log_size, bl);
+  encode(up, bl);
+  encode(acting, bl);
+  encode(store_stats, bl);
+  encode(num_store_stats, bl);
+  ENCODE_FINISH(bl);
+}
+
+void
pool_stat_t::decode(ceph::buffer::list::const_iterator &bl) +{ + DECODE_START_LEGACY_COMPAT_LEN(7, 5, 5, bl); + if (struct_v >= 4) { + decode(stats, bl); + decode(log_size, bl); + decode(ondisk_log_size, bl); + if (struct_v >= 6) { + decode(up, bl); + decode(acting, bl); + } else { + up = 0; + acting = 0; + } + if (struct_v >= 7) { + decode(store_stats, bl); + decode(num_store_stats, bl); + } else { + store_stats.reset(); + num_store_stats = 0; + } + + } else { + decode(stats.sum.num_bytes, bl); + uint64_t num_kb; + decode(num_kb, bl); + decode(stats.sum.num_objects, bl); + decode(stats.sum.num_object_clones, bl); + decode(stats.sum.num_object_copies, bl); + decode(stats.sum.num_objects_missing_on_primary, bl); + decode(stats.sum.num_objects_degraded, bl); + decode(log_size, bl); + decode(ondisk_log_size, bl); + if (struct_v >= 2) { + decode(stats.sum.num_rd, bl); + decode(stats.sum.num_rd_kb, bl); + decode(stats.sum.num_wr, bl); + decode(stats.sum.num_wr_kb, bl); + } + if (struct_v >= 3) { + decode(stats.sum.num_objects_unfound, bl); + } + } + DECODE_FINISH(bl); +} + +void pool_stat_t::generate_test_instances(list& o) +{ + pool_stat_t a; + o.push_back(new pool_stat_t(a)); + + list l; + object_stat_collection_t::generate_test_instances(l); + list ll; + store_statfs_t::generate_test_instances(ll); + a.stats = *l.back(); + a.store_stats = *ll.back(); + a.log_size = 123; + a.ondisk_log_size = 456; + a.acting = 3; + a.up = 4; + a.num_store_stats = 1; + o.push_back(new pool_stat_t(a)); +} + + +// -- pg_history_t -- + +void pg_history_t::encode(ceph::buffer::list &bl) const +{ + ENCODE_START(10, 4, bl); + encode(epoch_created, bl); + encode(last_epoch_started, bl); + encode(last_epoch_clean, bl); + encode(last_epoch_split, bl); + encode(same_interval_since, bl); + encode(same_up_since, bl); + encode(same_primary_since, bl); + encode(last_scrub, bl); + encode(last_scrub_stamp, bl); + encode(last_deep_scrub, bl); + encode(last_deep_scrub_stamp, bl); + 
encode(last_clean_scrub_stamp, bl); + encode(last_epoch_marked_full, bl); + encode(last_interval_started, bl); + encode(last_interval_clean, bl); + encode(epoch_pool_created, bl); + encode(prior_readable_until_ub, bl); + ENCODE_FINISH(bl); +} + +void pg_history_t::decode(ceph::buffer::list::const_iterator &bl) +{ + DECODE_START_LEGACY_COMPAT_LEN(10, 4, 4, bl); + decode(epoch_created, bl); + decode(last_epoch_started, bl); + if (struct_v >= 3) + decode(last_epoch_clean, bl); + else + last_epoch_clean = last_epoch_started; // careful, it's a lie! + decode(last_epoch_split, bl); + decode(same_interval_since, bl); + decode(same_up_since, bl); + decode(same_primary_since, bl); + if (struct_v >= 2) { + decode(last_scrub, bl); + decode(last_scrub_stamp, bl); + } + if (struct_v >= 5) { + decode(last_deep_scrub, bl); + decode(last_deep_scrub_stamp, bl); + } + if (struct_v >= 6) { + decode(last_clean_scrub_stamp, bl); + } + if (struct_v >= 7) { + decode(last_epoch_marked_full, bl); + } + if (struct_v >= 8) { + decode(last_interval_started, bl); + decode(last_interval_clean, bl); + } else { + if (last_epoch_started >= same_interval_since) { + last_interval_started = same_interval_since; + } else { + last_interval_started = last_epoch_started; // best guess + } + if (last_epoch_clean >= same_interval_since) { + last_interval_clean = same_interval_since; + } else { + last_interval_clean = last_epoch_clean; // best guess + } + } + if (struct_v >= 9) { + decode(epoch_pool_created, bl); + } else { + epoch_pool_created = epoch_created; + } + if (struct_v >= 10) { + decode(prior_readable_until_ub, bl); + } + DECODE_FINISH(bl); +} + +void pg_history_t::dump(Formatter *f) const +{ + f->dump_int("epoch_created", epoch_created); + f->dump_int("epoch_pool_created", epoch_pool_created); + f->dump_int("last_epoch_started", last_epoch_started); + f->dump_int("last_interval_started", last_interval_started); + f->dump_int("last_epoch_clean", last_epoch_clean); + 
f->dump_int("last_interval_clean", last_interval_clean); + f->dump_int("last_epoch_split", last_epoch_split); + f->dump_int("last_epoch_marked_full", last_epoch_marked_full); + f->dump_int("same_up_since", same_up_since); + f->dump_int("same_interval_since", same_interval_since); + f->dump_int("same_primary_since", same_primary_since); + f->dump_stream("last_scrub") << last_scrub; + f->dump_stream("last_scrub_stamp") << last_scrub_stamp; + f->dump_stream("last_deep_scrub") << last_deep_scrub; + f->dump_stream("last_deep_scrub_stamp") << last_deep_scrub_stamp; + f->dump_stream("last_clean_scrub_stamp") << last_clean_scrub_stamp; + // report the timespan as a floating-point count of seconds + f->dump_float( + "prior_readable_until_ub", + std::chrono::duration<double>(prior_readable_until_ub).count()); +} + +// Test fixtures: a default-constructed history and one with its fields populated. +void pg_history_t::generate_test_instances(list<pg_history_t*>& o) +{ + o.push_back(new pg_history_t); + o.push_back(new pg_history_t); + o.back()->epoch_created = 1; + o.back()->epoch_pool_created = 1; + o.back()->last_epoch_started = 2; + o.back()->last_interval_started = 2; + o.back()->last_epoch_clean = 3; + o.back()->last_interval_clean = 2; + o.back()->last_epoch_split = 4; + o.back()->prior_readable_until_ub = make_timespan(3.1415); + o.back()->same_up_since = 5; + o.back()->same_interval_since = 6; + o.back()->same_primary_since = 7; + o.back()->last_scrub = eversion_t(8, 9); + o.back()->last_scrub_stamp = utime_t(10, 11); + o.back()->last_deep_scrub = eversion_t(12, 13); + o.back()->last_deep_scrub_stamp = utime_t(14, 15); + o.back()->last_clean_scrub_stamp = utime_t(16, 17); + o.back()->last_epoch_marked_full = 18; +} + + +// -- pg_info_t -- + +// Encode as struct version 32 (compat 26). A default hobject_t is written where the old +// nibblewise last_backfill used to live so the field layout stays unchanged. +void pg_info_t::encode(ceph::buffer::list &bl) const +{ + ENCODE_START(32, 26, bl); + encode(pgid.pgid, bl); + encode(last_update, bl); + encode(last_complete, bl); + encode(log_tail, bl); + encode(hobject_t(), bl); // old (nibblewise) last_backfill + encode(stats, bl); + history.encode(bl); + encode(purged_snaps, bl); + encode(last_epoch_started, bl); + encode(last_user_version, bl); + 
encode(hit_set, bl); + encode(pgid.shard, bl); + encode(last_backfill, bl); + encode(true, bl); // was last_backfill_bitwise + encode(last_interval_started, bl); + ENCODE_FINISH(bl); +} + +// Decode the layout written by pg_info_t::encode. The legacy last_backfill and +// last_backfill_bitwise fields are read and discarded; encodings older than v32 +// have no last_interval_started, so it falls back to last_epoch_started. +void pg_info_t::decode(ceph::buffer::list::const_iterator &bl) +{ + DECODE_START(32, bl); + decode(pgid.pgid, bl); + decode(last_update, bl); + decode(last_complete, bl); + decode(log_tail, bl); + { + hobject_t old_last_backfill; + decode(old_last_backfill, bl); + } + decode(stats, bl); + history.decode(bl); + decode(purged_snaps, bl); + decode(last_epoch_started, bl); + decode(last_user_version, bl); + decode(hit_set, bl); + decode(pgid.shard, bl); + decode(last_backfill, bl); + { + bool last_backfill_bitwise; + decode(last_backfill_bitwise, bl); + // note: we may see a false value here since the default value for + // the member was false, so it often didn't get set to true until + // peering progressed. + } + if (struct_v >= 32) { + decode(last_interval_started, bl); + } else { + last_interval_started = last_epoch_started; + } + DECODE_FINISH(bl); +} + +// -- pg_info_t -- + +// Dump the info to a Formatter: versions, purged snap intervals, history, stats, +// state flags and hit set history. +void pg_info_t::dump(Formatter *f) const +{ + f->dump_stream("pgid") << pgid; + f->dump_stream("last_update") << last_update; + f->dump_stream("last_complete") << last_complete; + f->dump_stream("log_tail") << log_tail; + f->dump_int("last_user_version", last_user_version); + f->dump_stream("last_backfill") << last_backfill; + f->open_array_section("purged_snaps"); + // one {start, length} object per purged interval + for (auto i = purged_snaps.begin(); + i != purged_snaps.end(); + ++i) { + f->open_object_section("purged_snap_interval"); + f->dump_stream("start") << i.get_start(); + f->dump_stream("length") << i.get_len(); + f->close_section(); + } + f->close_section(); + f->open_object_section("history"); + history.dump(f); + f->close_section(); + f->open_object_section("stats"); + stats.dump(f); + f->close_section(); + + f->dump_int("empty", is_empty()); + f->dump_int("dne", dne()); + f->dump_int("incomplete", is_incomplete()); + 
f->dump_int("last_epoch_started", last_epoch_started); + + f->open_object_section("hit_set_history"); + hit_set.dump(f); + f->close_section(); +} + +// Test fixtures: a default-constructed info and one populated from the last test +// instances of pg_history_t, pg_stat_t and pg_hit_set_history_t. +void pg_info_t::generate_test_instances(list<pg_info_t*>& o) +{ + o.push_back(new pg_info_t); + o.push_back(new pg_info_t); + list<pg_history_t*> h; + pg_history_t::generate_test_instances(h); + o.back()->history = *h.back(); + o.back()->pgid = spg_t(pg_t(1, 2), shard_id_t::NO_SHARD); + o.back()->last_update = eversion_t(3, 4); + o.back()->last_complete = eversion_t(5, 6); + o.back()->last_user_version = 2; + o.back()->log_tail = eversion_t(7, 8); + o.back()->last_backfill = hobject_t(object_t("objname"), "key", 123, 456, -1, ""); + { + list<pg_stat_t*> s; + pg_stat_t::generate_test_instances(s); + o.back()->stats = *s.back(); + } + { + list<pg_hit_set_history_t*> s; + pg_hit_set_history_t::generate_test_instances(s); + o.back()->hit_set = *s.back(); + } +} + +// -- pg_notify_t -- +void pg_notify_t::encode(ceph::buffer::list &bl) const +{ + ENCODE_START(3, 2, bl); + encode(query_epoch, bl); + encode(epoch_sent, bl); + encode(info, bl); + encode(to, bl); + encode(from, bl); + encode(past_intervals, bl); + ENCODE_FINISH(bl); +} + +void pg_notify_t::decode(ceph::buffer::list::const_iterator &bl) +{ + DECODE_START(3, bl); + decode(query_epoch, bl); + decode(epoch_sent, bl); + decode(info, bl); + decode(to, bl); + decode(from, bl); + // past_intervals was added to the encoding in v3 + if (struct_v >= 3) { + decode(past_intervals, bl); + } + DECODE_FINISH(bl); +} + +void pg_notify_t::dump(Formatter *f) const +{ + f->dump_int("from", from); + f->dump_int("to", to); + f->dump_unsigned("query_epoch", query_epoch); + f->dump_unsigned("epoch_sent", epoch_sent); + { + f->open_object_section("info"); + info.dump(f); + f->close_section(); + } + f->dump_object("past_intervals", past_intervals); +} + +void pg_notify_t::generate_test_instances(list<pg_notify_t*>& o) +{ + o.push_back(new pg_notify_t(shard_id_t(3), shard_id_t::NO_SHARD, 1, 1, + pg_info_t(), PastIntervals())); + o.push_back(new pg_notify_t(shard_id_t(0), shard_id_t(0), 3, 10, + pg_info_t(), 
PastIntervals())); +} + +// Stream a pg_notify_t as "(query:<epoch> sent:<epoch> <info>[ <from>-><to>] <past_intervals>)". +// The from->to pair is printed only when from or to differs from shard_id_t::NO_SHARD. +ostream &operator<<(ostream &lhs, const pg_notify_t &notify) +{ + lhs << "(query:" << notify.query_epoch + << " sent:" << notify.epoch_sent + << " " << notify.info; + if (notify.from != shard_id_t::NO_SHARD || + notify.to != shard_id_t::NO_SHARD) + lhs << " " << (unsigned)notify.from + << "->" << (unsigned)notify.to; + lhs << " " << notify.past_intervals; + return lhs << ")"; +} + +// -- pg_interval_t -- + +void PastIntervals::pg_interval_t::encode(ceph::buffer::list& bl) const +{ + ENCODE_START(4, 2, bl); + encode(first, bl); + encode(last, bl); + encode(up, bl); + encode(acting, bl); + encode(maybe_went_rw, bl); + encode(primary, bl); + encode(up_primary, bl); + ENCODE_FINISH(bl); +} + +void PastIntervals::pg_interval_t::decode(ceph::buffer::list::const_iterator& bl) +{ + DECODE_START_LEGACY_COMPAT_LEN(4, 2, 2, bl); + decode(first, bl); + decode(last, bl); + decode(up, bl); + decode(acting, bl); + decode(maybe_went_rw, bl); + if (struct_v >= 3) { + decode(primary, bl); + } else { + // pre-v3 encodings carry no primary: use the first acting osd, if there is one + if (acting.size()) + primary = acting[0]; + } + if (struct_v >= 4) { + decode(up_primary, bl); + } else { + // pre-v4 encodings carry no up_primary: use the first up osd, if there is one + if (up.size()) + up_primary = up[0]; + } + DECODE_FINISH(bl); +} + +void PastIntervals::pg_interval_t::dump(Formatter *f) const +{ + f->dump_unsigned("first", first); + f->dump_unsigned("last", last); + f->dump_int("maybe_went_rw", maybe_went_rw ? 
1 : 0); + f->open_array_section("up"); + for (auto p = up.cbegin(); p != up.cend(); ++p) + f->dump_int("osd", *p); + f->close_section(); + f->open_array_section("acting"); + for (auto p = acting.cbegin(); p != acting.cend(); ++p) + f->dump_int("osd", *p); + f->close_section(); + f->dump_int("primary", primary); + f->dump_int("up_primary", up_primary); +} + +void PastIntervals::pg_interval_t::generate_test_instances(list& o) +{ + o.push_back(new pg_interval_t); + o.push_back(new pg_interval_t); + o.back()->up.push_back(1); + o.back()->acting.push_back(2); + o.back()->acting.push_back(3); + o.back()->first = 4; + o.back()->last = 5; + o.back()->maybe_went_rw = true; +} + +WRITE_CLASS_ENCODER(PastIntervals::pg_interval_t) + + +/** + * pi_compact_rep + * + * PastIntervals only needs to be able to answer two questions: + * 1) Where should the primary look for unfound objects? + * 2) List a set of subsets of the OSDs such that contacting at least + * one from each subset guarantees we speak to at least one witness + * of any completed write. + * + * Crucially, 2) does not require keeping *all* past intervals. Certainly, + * we don't need to keep any where maybe_went_rw would be false. We also + * needn't keep two intervals where the actingset in one is a subset + * of the other (only need to keep the smaller of the two sets). In order + * to accurately trim the set of intervals as last_epoch_started changes + * without rebuilding the set from scratch, we'll retain the larger set + * if it in an older interval. 
+ */ +struct compact_interval_t { + epoch_t first; + epoch_t last; + set acting; + bool supersedes(const compact_interval_t &other) { + for (auto &&i: acting) { + if (!other.acting.count(i)) + return false; + } + return true; + } + void dump(Formatter *f) const { + f->open_object_section("compact_interval_t"); + f->dump_stream("first") << first; + f->dump_stream("last") << last; + f->dump_stream("acting") << acting; + f->close_section(); + } + void encode(ceph::buffer::list &bl) const { + ENCODE_START(1, 1, bl); + encode(first, bl); + encode(last, bl); + encode(acting, bl); + ENCODE_FINISH(bl); + } + void decode(ceph::buffer::list::const_iterator &bl) { + DECODE_START(1, bl); + decode(first, bl); + decode(last, bl); + decode(acting, bl); + DECODE_FINISH(bl); + } + static void generate_test_instances(list & o) { + /* Not going to be used, we'll generate pi_compact_rep directly */ + } +}; +ostream &operator<<(ostream &o, const compact_interval_t &rhs) +{ + return o << "([" << rhs.first << "," << rhs.last + << "] acting " << rhs.acting << ")"; +} +WRITE_CLASS_ENCODER(compact_interval_t) + +class pi_compact_rep : public PastIntervals::interval_rep { + epoch_t first = 0; + epoch_t last = 0; // inclusive + set all_participants; + list intervals; + pi_compact_rep( + bool ec_pool, + std::list &&intervals) { + for (auto &&i: intervals) + add_interval(ec_pool, i); + } +public: + pi_compact_rep() = default; + pi_compact_rep(const pi_compact_rep &) = default; + pi_compact_rep(pi_compact_rep &&) = default; + pi_compact_rep &operator=(const pi_compact_rep &) = default; + pi_compact_rep &operator=(pi_compact_rep &&) = default; + + size_t size() const override { return intervals.size(); } + bool empty() const override { + return first > last || (first == 0 && last == 0); + } + void clear() override { + *this = pi_compact_rep(); + } + pair get_bounds() const override { + return make_pair(first, last + 1); + } + void adjust_start_backwards(epoch_t last_epoch_clean) override { + 
first = last_epoch_clean; + } + + set get_all_participants( + bool ec_pool) const override { + return all_participants; + } + void add_interval( + bool ec_pool, const PastIntervals::pg_interval_t &interval) override { + if (first == 0) + first = interval.first; + ceph_assert(interval.last > last); + last = interval.last; + set acting; + for (unsigned i = 0; i < interval.acting.size(); ++i) { + if (interval.acting[i] == CRUSH_ITEM_NONE) + continue; + acting.insert( + pg_shard_t( + interval.acting[i], + ec_pool ? shard_id_t(i) : shard_id_t::NO_SHARD)); + } + all_participants.insert(acting.begin(), acting.end()); + if (!interval.maybe_went_rw) + return; + intervals.push_back( + compact_interval_t{interval.first, interval.last, acting}); + auto plast = intervals.end(); + --plast; + for (auto cur = intervals.begin(); cur != plast; ) { + if (plast->supersedes(*cur)) { + intervals.erase(cur++); + } else { + ++cur; + } + } + } + unique_ptr clone() const override { + return unique_ptr(new pi_compact_rep(*this)); + } + ostream &print(ostream &out) const override { + return out << "([" << first << "," << last + << "] all_participants=" << all_participants + << " intervals=" << intervals << ")"; + } + void encode(ceph::buffer::list &bl) const override { + ENCODE_START(1, 1, bl); + encode(first, bl); + encode(last, bl); + encode(all_participants, bl); + encode(intervals, bl); + ENCODE_FINISH(bl); + } + void decode(ceph::buffer::list::const_iterator &bl) override { + DECODE_START(1, bl); + decode(first, bl); + decode(last, bl); + decode(all_participants, bl); + decode(intervals, bl); + DECODE_FINISH(bl); + } + void dump(Formatter *f) const override { + f->open_object_section("PastIntervals::compact_rep"); + f->dump_stream("first") << first; + f->dump_stream("last") << last; + f->open_array_section("all_participants"); + for (auto& i : all_participants) { + f->dump_object("pg_shard", i); + } + f->close_section(); + f->open_array_section("intervals"); + for (auto &&i: intervals) { 
+ i.dump(f); + } + f->close_section(); + f->close_section(); + } + static void generate_test_instances(list &o) { + using ival = PastIntervals::pg_interval_t; + using ivallst = std::list; + o.push_back( + new pi_compact_rep( + true, ivallst + { ival{{0, 1, 2}, {0, 1, 2}, 10, 20, true, 0, 0} + , ival{{ 1, 2}, { 1, 2}, 21, 30, true, 1, 1} + , ival{{ 2}, { 2}, 31, 35, false, 2, 2} + , ival{{0, 2}, {0, 2}, 36, 50, true, 0, 0} + })); + o.push_back( + new pi_compact_rep( + false, ivallst + { ival{{0, 1, 2}, {0, 1, 2}, 10, 20, true, 0, 0} + , ival{{ 1, 2}, { 1, 2}, 21, 30, true, 1, 1} + , ival{{ 2}, { 2}, 31, 35, false, 2, 2} + , ival{{0, 2}, {0, 2}, 36, 50, true, 0, 0} + })); + o.push_back( + new pi_compact_rep( + true, ivallst + { ival{{2, 1, 0}, {2, 1, 0}, 10, 20, true, 1, 1} + , ival{{ 0, 2}, { 0, 2}, 21, 30, true, 0, 0} + , ival{{ 0, 2}, {2, 0}, 31, 35, true, 2, 2} + , ival{{ 0, 2}, { 0, 2}, 36, 50, true, 0, 0} + })); + } + void iterate_mayberw_back_to( + epoch_t les, + std::function &)> &&f) const override { + for (auto i = intervals.rbegin(); i != intervals.rend(); ++i) { + if (i->last < les) + break; + f(i->first, i->acting); + } + } + virtual ~pi_compact_rep() override {} +}; +WRITE_CLASS_ENCODER(pi_compact_rep) + +PastIntervals::PastIntervals() +{ + past_intervals.reset(new pi_compact_rep); +} + +PastIntervals::PastIntervals(const PastIntervals &rhs) + : past_intervals(rhs.past_intervals ? 
+ rhs.past_intervals->clone() : + nullptr) {} + +PastIntervals &PastIntervals::operator=(const PastIntervals &rhs) +{ + PastIntervals other(rhs); + swap(other); + return *this; +} + +ostream& operator<<(ostream& out, const PastIntervals &i) +{ + if (i.past_intervals) { + return i.past_intervals->print(out); + } else { + return out << "(empty)"; + } +} + +ostream& operator<<(ostream& out, const PastIntervals::PriorSet &i) +{ + return out << "PriorSet(" + << "ec_pool: " << i.ec_pool + << ", probe: " << i.probe + << ", down: " << i.down + << ", blocked_by: " << i.blocked_by + << ", pg_down: " << i.pg_down + << ")"; +} + +void PastIntervals::decode(ceph::buffer::list::const_iterator &bl) +{ + DECODE_START(1, bl); + __u8 type = 0; + decode(type, bl); + switch (type) { + case 0: + break; + case 1: + ceph_abort_msg("pi_simple_rep support removed post-luminous"); + break; + case 2: + past_intervals.reset(new pi_compact_rep); + past_intervals->decode(bl); + break; + } + DECODE_FINISH(bl); +} + +void PastIntervals::generate_test_instances(list &o) +{ + { + list compact; + pi_compact_rep::generate_test_instances(compact); + for (auto &&i: compact) { + // takes ownership of contents + o.push_back(new PastIntervals(i)); + } + } + return; +} + +bool PastIntervals::is_new_interval( + int old_acting_primary, + int new_acting_primary, + const vector &old_acting, + const vector &new_acting, + int old_up_primary, + int new_up_primary, + const vector &old_up, + const vector &new_up, + int old_size, + int new_size, + int old_min_size, + int new_min_size, + unsigned old_pg_num, + unsigned new_pg_num, + unsigned old_pg_num_pending, + unsigned new_pg_num_pending, + bool old_sort_bitwise, + bool new_sort_bitwise, + bool old_recovery_deletes, + bool new_recovery_deletes, + uint32_t old_crush_count, + uint32_t new_crush_count, + uint32_t old_crush_target, + uint32_t new_crush_target, + uint32_t old_crush_barrier, + uint32_t new_crush_barrier, + int32_t old_crush_member, + int32_t 
new_crush_member, + pg_t pgid) { + return old_acting_primary != new_acting_primary || + new_acting != old_acting || + old_up_primary != new_up_primary || + new_up != old_up || + old_min_size != new_min_size || + old_size != new_size || + pgid.is_split(old_pg_num, new_pg_num, 0) || + // (is or was) pre-merge source + pgid.is_merge_source(old_pg_num_pending, new_pg_num_pending, 0) || + pgid.is_merge_source(new_pg_num_pending, old_pg_num_pending, 0) || + // merge source + pgid.is_merge_source(old_pg_num, new_pg_num, 0) || + // (is or was) pre-merge target + pgid.is_merge_target(old_pg_num_pending, new_pg_num_pending) || + pgid.is_merge_target(new_pg_num_pending, old_pg_num_pending) || + // merge target + pgid.is_merge_target(old_pg_num, new_pg_num) || + old_sort_bitwise != new_sort_bitwise || + old_recovery_deletes != new_recovery_deletes || + old_crush_count != new_crush_count || + old_crush_target != new_crush_target || + old_crush_barrier != new_crush_barrier || + old_crush_member != new_crush_member; +} + +bool PastIntervals::is_new_interval( + int old_acting_primary, + int new_acting_primary, + const vector &old_acting, + const vector &new_acting, + int old_up_primary, + int new_up_primary, + const vector &old_up, + const vector &new_up, + const OSDMap *osdmap, + const OSDMap *lastmap, + pg_t pgid) +{ + const pg_pool_t *plast = lastmap->get_pg_pool(pgid.pool()); + if (!plast) { + return false; // after pool is deleted there are no more interval changes + } + const pg_pool_t *pi = osdmap->get_pg_pool(pgid.pool()); + if (!pi) { + return true; // pool was deleted this epoch -> (final!) 
interval change + } + return + is_new_interval(old_acting_primary, + new_acting_primary, + old_acting, + new_acting, + old_up_primary, + new_up_primary, + old_up, + new_up, + plast->size, + pi->size, + plast->min_size, + pi->min_size, + plast->get_pg_num(), + pi->get_pg_num(), + plast->get_pg_num_pending(), + pi->get_pg_num_pending(), + lastmap->test_flag(CEPH_OSDMAP_SORTBITWISE), + osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE), + lastmap->test_flag(CEPH_OSDMAP_RECOVERY_DELETES), + osdmap->test_flag(CEPH_OSDMAP_RECOVERY_DELETES), + plast->peering_crush_bucket_count, pi->peering_crush_bucket_count, + plast->peering_crush_bucket_target, pi->peering_crush_bucket_target, + plast->peering_crush_bucket_barrier, pi->peering_crush_bucket_barrier, + plast->peering_crush_mandatory_member, pi->peering_crush_mandatory_member, + pgid); +} + +bool PastIntervals::check_new_interval( + int old_acting_primary, + int new_acting_primary, + const vector &old_acting, + const vector &new_acting, + int old_up_primary, + int new_up_primary, + const vector &old_up, + const vector &new_up, + epoch_t same_interval_since, + epoch_t last_epoch_clean, + const OSDMap *osdmap, + const OSDMap *lastmap, + pg_t pgid, + const IsPGRecoverablePredicate &could_have_gone_active, + PastIntervals *past_intervals, + std::ostream *out) +{ + /* + * We have to be careful to gracefully deal with situations like + * so. Say we have a power outage or something that takes out both + * OSDs, but the monitor doesn't mark them down in the same epoch. + * The history may look like + * + * 1: A B + * 2: B + * 3: let's say B dies for good, too (say, from the power spike) + * 4: A + * + * which makes it look like B may have applied updates to the PG + * that we need in order to proceed. This sucks... + * + * To minimize the risk of this happening, we CANNOT go active if + * _any_ OSDs in the prior set are down until we send an MOSDAlive + * to the monitor such that the OSDMap sets osd_up_thru to an epoch. 
+ * Then, we have something like + * + * 1: A B + * 2: B up_thru[B]=0 + * 3: + * 4: A + * + * -> we can ignore B, bc it couldn't have gone active (up_thru still 0). + * + * or, + * + * 1: A B + * 2: B up_thru[B]=0 + * 3: B up_thru[B]=2 + * 4: + * 5: A + * + * -> we must wait for B, bc it was alive through 2, and could have + * written to the pg. + * + * If B is really dead, then an administrator will need to manually + * intervene by marking the OSD as "lost." + */ + + // remember past interval + // NOTE: a change in the up set primary triggers an interval + // change, even though the interval members in the pg_interval_t + // do not change. + ceph_assert(past_intervals); + ceph_assert(past_intervals->past_intervals); + if (is_new_interval( + old_acting_primary, + new_acting_primary, + old_acting, + new_acting, + old_up_primary, + new_up_primary, + old_up, + new_up, + osdmap, + lastmap, + pgid)) { + pg_interval_t i; + i.first = same_interval_since; + i.last = osdmap->get_epoch() - 1; + ceph_assert(i.first <= i.last); + i.acting = old_acting; + i.up = old_up; + i.primary = old_acting_primary; + i.up_primary = old_up_primary; + + unsigned num_acting = 0; + for (auto p = i.acting.cbegin(); p != i.acting.cend(); ++p) + if (*p != CRUSH_ITEM_NONE) + ++num_acting; + + ceph_assert(lastmap->get_pools().count(pgid.pool())); + const pg_pool_t& old_pg_pool = lastmap->get_pools().find(pgid.pool())->second; + set old_acting_shards; + old_pg_pool.convert_to_pg_shards(old_acting, &old_acting_shards); + + if (num_acting && + i.primary != -1 && + num_acting >= old_pg_pool.min_size && + (!old_pg_pool.is_stretch_pool() || + old_pg_pool.stretch_set_can_peer(old_acting, *lastmap, out)) && + could_have_gone_active(old_acting_shards)) { + if (out) + *out << __func__ << " " << i + << " up_thru " << lastmap->get_up_thru(i.primary) + << " up_from " << lastmap->get_up_from(i.primary) + << " last_epoch_clean " << last_epoch_clean; + if (lastmap->get_up_thru(i.primary) >= i.first && + 
lastmap->get_up_from(i.primary) <= i.first) { + i.maybe_went_rw = true; + if (out) + *out << " " << i + << " : primary up " << lastmap->get_up_from(i.primary) + << "-" << lastmap->get_up_thru(i.primary) + << " includes interval" + << std::endl; + } else if (last_epoch_clean >= i.first && + last_epoch_clean <= i.last) { + // If the last_epoch_clean is included in this interval, then + // the pg must have been rw (for recovery to have completed). + // This is important because we won't know the _real_ + // first_epoch because we stop at last_epoch_clean, and we + // don't want the oldest interval to randomly have + // maybe_went_rw false depending on the relative up_thru vs + // last_epoch_clean timing. + i.maybe_went_rw = true; + if (out) + *out << " " << i + << " : includes last_epoch_clean " << last_epoch_clean + << " and presumed to have been rw" + << std::endl; + } else { + i.maybe_went_rw = false; + if (out) + *out << " " << i + << " : primary up " << lastmap->get_up_from(i.primary) + << "-" << lastmap->get_up_thru(i.primary) + << " does not include interval" + << std::endl; + } + } else { + i.maybe_went_rw = false; + if (out) + *out << __func__ << " " << i << " : acting set is too small" << std::endl; + } + past_intervals->past_intervals->add_interval(old_pg_pool.is_erasure(), i); + return true; + } else { + return false; + } +} + +// true if the given map affects the prior set +bool PastIntervals::PriorSet::affected_by_map( + const OSDMap &osdmap, + const DoutPrefixProvider *dpp) const +{ + for (auto p = probe.begin(); p != probe.end(); ++p) { + int o = p->osd; + + // did someone in the prior set go down? + if (osdmap.is_down(o) && down.count(o) == 0) { + ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " now down" << dendl; + return true; + } + + // did a down osd in cur get (re)marked as lost? + auto r = blocked_by.find(o); + if (r != blocked_by.end()) { + if (!osdmap.exists(o)) { + ldpp_dout(dpp, 10) << "affected_by_map osd." 
<< o << " no longer exists" << dendl; + return true; + } + if (osdmap.get_info(o).lost_at != r->second) { + ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " (re)marked as lost" << dendl; + return true; + } + } + } + + // did someone in the prior down set go up? + for (auto p = down.cbegin(); p != down.cend(); ++p) { + int o = *p; + + if (osdmap.is_up(o)) { + ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " now up" << dendl; + return true; + } + + // did someone in the prior set get lost or destroyed? + if (!osdmap.exists(o)) { + ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " no longer exists" << dendl; + return true; + } + // did a down osd in down get (re)marked as lost? + auto r = blocked_by.find(o); + if (r != blocked_by.end()) { + if (osdmap.get_info(o).lost_at != r->second) { + ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " (re)marked as lost" << dendl; + return true; + } + } + } + + return false; +} + +ostream& operator<<(ostream& out, const PastIntervals::pg_interval_t& i) +{ + out << "interval(" << i.first << "-" << i.last + << " up " << i.up << "(" << i.up_primary << ")" + << " acting " << i.acting << "(" << i.primary << ")"; + if (i.maybe_went_rw) + out << " maybe_went_rw"; + out << ")"; + return out; +} + + + +// -- pg_query_t -- + +void pg_query_t::encode(ceph::buffer::list &bl, uint64_t features) const { + ENCODE_START(3, 3, bl); + encode(type, bl); + encode(since, bl); + history.encode(bl); + encode(epoch_sent, bl); + encode(to, bl); + encode(from, bl); + ENCODE_FINISH(bl); +} + +void pg_query_t::decode(ceph::buffer::list::const_iterator &bl) { + DECODE_START(3, bl); + decode(type, bl); + decode(since, bl); + history.decode(bl); + decode(epoch_sent, bl); + decode(to, bl); + decode(from, bl); + DECODE_FINISH(bl); +} + +void pg_query_t::dump(Formatter *f) const +{ + f->dump_int("from", from); + f->dump_int("to", to); + f->dump_string("type", get_type_name()); + f->dump_stream("since") << since; + f->dump_stream("epoch_sent") 
<< epoch_sent; + f->open_object_section("history"); + history.dump(f); + f->close_section(); +} +void pg_query_t::generate_test_instances(list& o) +{ + o.push_back(new pg_query_t()); + list h; + pg_history_t::generate_test_instances(h); + o.push_back(new pg_query_t(pg_query_t::INFO, shard_id_t(1), shard_id_t(2), *h.back(), 4)); + o.push_back(new pg_query_t(pg_query_t::MISSING, shard_id_t(2), shard_id_t(3), *h.back(), 4)); + o.push_back(new pg_query_t(pg_query_t::LOG, shard_id_t(0), shard_id_t(0), + eversion_t(4, 5), *h.back(), 4)); + o.push_back(new pg_query_t(pg_query_t::FULLLOG, + shard_id_t::NO_SHARD, shard_id_t::NO_SHARD, + *h.back(), 5)); +} + +// -- pg_lease_t -- + +void pg_lease_t::encode(bufferlist& bl) const +{ + ENCODE_START(1, 1, bl); + encode(readable_until, bl); + encode(readable_until_ub, bl); + encode(interval, bl); + ENCODE_FINISH(bl); +} + +void pg_lease_t::decode(bufferlist::const_iterator& p) +{ + DECODE_START(1, p); + decode(readable_until, p); + decode(readable_until_ub, p); + decode(interval, p); + DECODE_FINISH(p); +} + +void pg_lease_t::dump(Formatter *f) const +{ + f->dump_stream("readable_until") << readable_until; + f->dump_stream("readable_until_ub") << readable_until_ub; + f->dump_stream("interval") << interval; +} + +void pg_lease_t::generate_test_instances(std::list& o) +{ + o.push_back(new pg_lease_t()); + o.push_back(new pg_lease_t()); + o.back()->readable_until = make_timespan(1.5); + o.back()->readable_until_ub = make_timespan(3.4); + o.back()->interval = make_timespan(1.0); +} + +// -- pg_lease_ack_t -- + +void pg_lease_ack_t::encode(bufferlist& bl) const +{ + ENCODE_START(1, 1, bl); + encode(readable_until_ub, bl); + ENCODE_FINISH(bl); +} + +void pg_lease_ack_t::decode(bufferlist::const_iterator& p) +{ + DECODE_START(1, p); + decode(readable_until_ub, p); + DECODE_FINISH(p); +} + +void pg_lease_ack_t::dump(Formatter *f) const +{ + f->dump_stream("readable_until_ub") << readable_until_ub; +} + +void 
pg_lease_ack_t::generate_test_instances(std::list& o) +{ + o.push_back(new pg_lease_ack_t()); + o.push_back(new pg_lease_ack_t()); + o.back()->readable_until_ub = make_timespan(3.4); +} + + +// -- ObjectModDesc -- +void ObjectModDesc::visit(Visitor *visitor) const +{ + auto bp = bl.cbegin(); + try { + while (!bp.end()) { + DECODE_START(max_required_version, bp); + uint8_t code; + decode(code, bp); + switch (code) { + case APPEND: { + uint64_t size; + decode(size, bp); + visitor->append(size); + break; + } + case SETATTRS: { + map > attrs; + decode(attrs, bp); + visitor->setattrs(attrs); + break; + } + case DELETE: { + version_t old_version; + decode(old_version, bp); + visitor->rmobject(old_version); + break; + } + case CREATE: { + visitor->create(); + break; + } + case UPDATE_SNAPS: { + set snaps; + decode(snaps, bp); + visitor->update_snaps(snaps); + break; + } + case TRY_DELETE: { + version_t old_version; + decode(old_version, bp); + visitor->try_rmobject(old_version); + break; + } + case ROLLBACK_EXTENTS: { + vector > extents; + version_t gen; + decode(gen, bp); + decode(extents, bp); + visitor->rollback_extents(gen,extents); + break; + } + default: + ceph_abort_msg("Invalid rollback code"); + } + DECODE_FINISH(bp); + } + } catch (...) 
{ + ceph_abort_msg("Invalid encoding"); + } +} + +struct DumpVisitor : public ObjectModDesc::Visitor { + Formatter *f; + explicit DumpVisitor(Formatter *f) : f(f) {} + void append(uint64_t old_size) override { + f->open_object_section("op"); + f->dump_string("code", "APPEND"); + f->dump_unsigned("old_size", old_size); + f->close_section(); + } + void setattrs(map > &attrs) override { + f->open_object_section("op"); + f->dump_string("code", "SETATTRS"); + f->open_array_section("attrs"); + for (auto i = attrs.begin(); i != attrs.end(); ++i) { + f->dump_string("attr_name", i->first); + } + f->close_section(); + f->close_section(); + } + void rmobject(version_t old_version) override { + f->open_object_section("op"); + f->dump_string("code", "RMOBJECT"); + f->dump_unsigned("old_version", old_version); + f->close_section(); + } + void try_rmobject(version_t old_version) override { + f->open_object_section("op"); + f->dump_string("code", "TRY_RMOBJECT"); + f->dump_unsigned("old_version", old_version); + f->close_section(); + } + void create() override { + f->open_object_section("op"); + f->dump_string("code", "CREATE"); + f->close_section(); + } + void update_snaps(const set &snaps) override { + f->open_object_section("op"); + f->dump_string("code", "UPDATE_SNAPS"); + f->dump_stream("snaps") << snaps; + f->close_section(); + } + void rollback_extents( + version_t gen, + const vector > &extents) override { + f->open_object_section("op"); + f->dump_string("code", "ROLLBACK_EXTENTS"); + f->dump_unsigned("gen", gen); + f->dump_stream("snaps") << extents; + f->close_section(); + } +}; + +void ObjectModDesc::dump(Formatter *f) const +{ + f->open_object_section("object_mod_desc"); + f->dump_bool("can_local_rollback", can_local_rollback); + f->dump_bool("rollback_info_completed", rollback_info_completed); + { + f->open_array_section("ops"); + DumpVisitor vis(f); + visit(&vis); + f->close_section(); + } + f->close_section(); +} + +void ObjectModDesc::generate_test_instances(list& 
o) +{ + map > attrs; + attrs[OI_ATTR]; + attrs[SS_ATTR]; + attrs["asdf"]; + o.push_back(new ObjectModDesc()); + o.back()->append(100); + o.back()->setattrs(attrs); + o.push_back(new ObjectModDesc()); + o.back()->rmobject(1001); + o.push_back(new ObjectModDesc()); + o.back()->create(); + o.back()->setattrs(attrs); + o.push_back(new ObjectModDesc()); + o.back()->create(); + o.back()->setattrs(attrs); + o.back()->mark_unrollbackable(); + o.back()->append(1000); +} + +void ObjectModDesc::encode(ceph::buffer::list &_bl) const +{ + ENCODE_START(max_required_version, max_required_version, _bl); + encode(can_local_rollback, _bl); + encode(rollback_info_completed, _bl); + encode(bl, _bl); + ENCODE_FINISH(_bl); +} +void ObjectModDesc::decode(ceph::buffer::list::const_iterator &_bl) +{ + DECODE_START(2, _bl); + max_required_version = struct_v; + decode(can_local_rollback, _bl); + decode(rollback_info_completed, _bl); + decode(bl, _bl); + // ensure bl does not pin a larger ceph::buffer in memory + bl.rebuild(); + bl.reassign_to_mempool(mempool::mempool_osd_pglog); + DECODE_FINISH(_bl); +} + +std::atomic ObjectCleanRegions::max_num_intervals = {10}; + +void ObjectCleanRegions::set_max_num_intervals(uint32_t num) +{ + max_num_intervals = num; +} + +void ObjectCleanRegions::trim() +{ + while(clean_offsets.num_intervals() > max_num_intervals) { + typename interval_set::iterator shortest_interval = clean_offsets.begin(); + if (shortest_interval == clean_offsets.end()) + break; + for (typename interval_set::iterator it = clean_offsets.begin(); + it != clean_offsets.end(); + ++it) { + if (it.get_len() < shortest_interval.get_len()) + shortest_interval = it; + } + clean_offsets.erase(shortest_interval); + } +} + +void ObjectCleanRegions::merge(const ObjectCleanRegions &other) +{ + clean_offsets.intersection_of(other.clean_offsets); + clean_omap = clean_omap && other.clean_omap; + trim(); +} + +void ObjectCleanRegions::mark_data_region_dirty(uint64_t offset, uint64_t len) +{ + 
interval_set clean_region; + clean_region.insert(0, (uint64_t)-1); + clean_region.erase(offset, len); + clean_offsets.intersection_of(clean_region); + trim(); +} + +bool ObjectCleanRegions::is_clean_region(uint64_t offset, uint64_t len) const +{ + return clean_offsets.contains(offset, len); +} + +void ObjectCleanRegions::mark_omap_dirty() +{ + clean_omap = false; +} + +void ObjectCleanRegions::mark_object_new() +{ + new_object = true; +} + +void ObjectCleanRegions::mark_fully_dirty() +{ + mark_data_region_dirty(0, (uint64_t)-1); + mark_omap_dirty(); + mark_object_new(); +} + +interval_set ObjectCleanRegions::get_dirty_regions() const +{ + interval_set dirty_region; + dirty_region.insert(0, (uint64_t)-1); + dirty_region.subtract(clean_offsets); + return dirty_region; +} + +bool ObjectCleanRegions::omap_is_dirty() const +{ + return !clean_omap; +} + +bool ObjectCleanRegions::object_is_exist() const +{ + return !new_object; +} + +void ObjectCleanRegions::encode(bufferlist &bl) const +{ + ENCODE_START(1, 1, bl); + using ceph::encode; + encode(clean_offsets, bl); + encode(clean_omap, bl); + encode(new_object, bl); + ENCODE_FINISH(bl); +} + +void ObjectCleanRegions::decode(bufferlist::const_iterator &bl) +{ + DECODE_START(1, bl); + using ceph::decode; + decode(clean_offsets, bl); + decode(clean_omap, bl); + decode(new_object, bl); + DECODE_FINISH(bl); +} + +void ObjectCleanRegions::dump(Formatter *f) const +{ + f->open_object_section("object_clean_regions"); + f->dump_stream("clean_offsets") << clean_offsets; + f->dump_bool("clean_omap", clean_omap); + f->dump_bool("new_object", new_object); + f->close_section(); +} + +void ObjectCleanRegions::generate_test_instances(list& o) +{ + o.push_back(new ObjectCleanRegions()); + o.push_back(new ObjectCleanRegions()); + o.back()->mark_data_region_dirty(4096, 40960); + o.back()->mark_omap_dirty(); + o.back()->mark_object_new(); +} + +ostream& operator<<(ostream& out, const ObjectCleanRegions& ocr) +{ + return out << 
"clean_offsets: " << ocr.clean_offsets + << ", clean_omap: " << ocr.clean_omap + << ", new_object: " << ocr.new_object; +} + +// -- pg_log_entry_t -- + +string pg_log_entry_t::get_key_name() const +{ + return version.get_key_name(); +} + +void pg_log_entry_t::encode_with_checksum(ceph::buffer::list& bl) const +{ + using ceph::encode; + ceph::buffer::list ebl(sizeof(*this)*2); + this->encode(ebl); + __u32 crc = ebl.crc32c(0); + encode(ebl, bl); + encode(crc, bl); +} + +void pg_log_entry_t::decode_with_checksum(ceph::buffer::list::const_iterator& p) +{ + using ceph::decode; + ceph::buffer::list bl; + decode(bl, p); + __u32 crc; + decode(crc, p); + if (crc != bl.crc32c(0)) + throw ceph::buffer::malformed_input("bad checksum on pg_log_entry_t"); + auto q = bl.cbegin(); + this->decode(q); +} + +void pg_log_entry_t::encode(ceph::buffer::list &bl) const +{ + ENCODE_START(14, 4, bl); + encode(op, bl); + encode(soid, bl); + encode(version, bl); + + /** + * Added with reverting_to: + * Previous code used prior_version to encode + * what we now call reverting_to. This will + * allow older code to decode reverting_to + * into prior_version as expected. 
+ */ + if (op == LOST_REVERT) + encode(reverting_to, bl); + else + encode(prior_version, bl); + + encode(reqid, bl); + encode(mtime, bl); + if (op == LOST_REVERT) + encode(prior_version, bl); + encode(snaps, bl); + encode(user_version, bl); + encode(mod_desc, bl); + encode(extra_reqids, bl); + if (op == ERROR) + encode(return_code, bl); + if (!extra_reqids.empty()) + encode(extra_reqid_return_codes, bl); + encode(clean_regions, bl); + if (op != ERROR) + encode(return_code, bl); + encode(op_returns, bl); + ENCODE_FINISH(bl); +} + +void pg_log_entry_t::decode(ceph::buffer::list::const_iterator &bl) +{ + DECODE_START_LEGACY_COMPAT_LEN(14, 4, 4, bl); + decode(op, bl); + if (struct_v < 2) { + sobject_t old_soid; + decode(old_soid, bl); + soid.oid = old_soid.oid; + soid.snap = old_soid.snap; + invalid_hash = true; + } else { + decode(soid, bl); + } + if (struct_v < 3) + invalid_hash = true; + decode(version, bl); + + if (struct_v >= 6 && op == LOST_REVERT) + decode(reverting_to, bl); + else + decode(prior_version, bl); + + decode(reqid, bl); + + decode(mtime, bl); + if (struct_v < 5) + invalid_pool = true; + + if (op == LOST_REVERT) { + if (struct_v >= 6) { + decode(prior_version, bl); + } else { + reverting_to = prior_version; + } + } + if (struct_v >= 7 || // for v >= 7, this is for all ops. + op == CLONE) { // for v < 7, it's only present for CLONE. 
+ decode(snaps, bl); + // ensure snaps does not pin a larger ceph::buffer in memory + snaps.rebuild(); + snaps.reassign_to_mempool(mempool::mempool_osd_pglog); + } + + if (struct_v >= 8) + decode(user_version, bl); + else + user_version = version.version; + + if (struct_v >= 9) + decode(mod_desc, bl); + else + mod_desc.mark_unrollbackable(); + if (struct_v >= 10) + decode(extra_reqids, bl); + if (struct_v >= 11 && op == ERROR) + decode(return_code, bl); + if (struct_v >= 12 && !extra_reqids.empty()) + decode(extra_reqid_return_codes, bl); + if (struct_v >= 13) + decode(clean_regions, bl); + else + clean_regions.mark_fully_dirty(); + if (struct_v >= 14) { + if (op != ERROR) { + decode(return_code, bl); + } + decode(op_returns, bl); + } + DECODE_FINISH(bl); +} + +void pg_log_entry_t::dump(Formatter *f) const +{ + f->dump_string("op", get_op_name()); + f->dump_stream("object") << soid; + f->dump_stream("version") << version; + f->dump_stream("prior_version") << prior_version; + f->dump_stream("reqid") << reqid; + f->open_array_section("extra_reqids"); + uint32_t idx = 0; + for (auto p = extra_reqids.begin(); + p != extra_reqids.end(); + ++idx, ++p) { + f->open_object_section("extra_reqid"); + f->dump_stream("reqid") << p->first; + f->dump_stream("user_version") << p->second; + auto it = extra_reqid_return_codes.find(idx); + if (it != extra_reqid_return_codes.end()) { + f->dump_int("return_code", it->second); + } + f->close_section(); + } + f->close_section(); + f->dump_stream("mtime") << mtime; + f->dump_int("return_code", return_code); + if (!op_returns.empty()) { + f->open_array_section("op_returns"); + for (auto& i : op_returns) { + f->dump_object("op", i); + } + f->close_section(); + } + if (snaps.length() > 0) { + vector v; + ceph::buffer::list c = snaps; + auto p = c.cbegin(); + try { + using ceph::decode; + decode(v, p); + } catch (...) 
{ + v.clear(); + } + f->open_object_section("snaps"); + for (auto p = v.begin(); p != v.end(); ++p) + f->dump_unsigned("snap", *p); + f->close_section(); + } + { + f->open_object_section("mod_desc"); + mod_desc.dump(f); + f->close_section(); + } + { + f->open_object_section("clean_regions"); + clean_regions.dump(f); + f->close_section(); + } +} + +void pg_log_entry_t::generate_test_instances(list& o) +{ + o.push_back(new pg_log_entry_t()); + hobject_t oid(object_t("objname"), "key", 123, 456, 0, ""); + o.push_back(new pg_log_entry_t(MODIFY, oid, eversion_t(1,2), eversion_t(3,4), + 1, osd_reqid_t(entity_name_t::CLIENT(777), 8, 999), + utime_t(8,9), 0)); + o.push_back(new pg_log_entry_t(ERROR, oid, eversion_t(1,2), eversion_t(3,4), + 1, osd_reqid_t(entity_name_t::CLIENT(777), 8, 999), + utime_t(8,9), -ENOENT)); +} + +ostream& operator<<(ostream& out, const pg_log_entry_t& e) +{ + out << e.version << " (" << e.prior_version << ") " + << std::left << std::setw(8) << e.get_op_name() << ' ' + << e.soid << " by " << e.reqid << " " << e.mtime + << " " << e.return_code; + if (!e.op_returns.empty()) { + out << " " << e.op_returns; + } + if (e.snaps.length()) { + vector snaps; + ceph::buffer::list c = e.snaps; + auto p = c.cbegin(); + try { + decode(snaps, p); + } catch (...) 
{ + snaps.clear(); + } + out << " snaps " << snaps; + } + out << " ObjectCleanRegions " << e.clean_regions; + return out; +} + +// -- pg_log_dup_t -- + +std::string pg_log_dup_t::get_key_name() const +{ + static const char prefix[] = "dup_"; + std::string key(36, ' '); + memcpy(&key[0], prefix, 4); + version.get_key_name(&key[4]); + key.resize(35); // remove the null terminator + return key; +} + +void pg_log_dup_t::encode(ceph::buffer::list &bl) const +{ + ENCODE_START(2, 1, bl); + encode(reqid, bl); + encode(version, bl); + encode(user_version, bl); + encode(return_code, bl); + encode(op_returns, bl); + ENCODE_FINISH(bl); +} + +void pg_log_dup_t::decode(ceph::buffer::list::const_iterator &bl) +{ + DECODE_START(2, bl); + decode(reqid, bl); + decode(version, bl); + decode(user_version, bl); + decode(return_code, bl); + if (struct_v >= 2) { + decode(op_returns, bl); + } + DECODE_FINISH(bl); +} + +void pg_log_dup_t::dump(Formatter *f) const +{ + f->dump_stream("reqid") << reqid; + f->dump_stream("version") << version; + f->dump_stream("user_version") << user_version; + f->dump_stream("return_code") << return_code; + if (!op_returns.empty()) { + f->open_array_section("op_returns"); + for (auto& i : op_returns) { + f->dump_object("op", i); + } + f->close_section(); + } +} + +void pg_log_dup_t::generate_test_instances(list& o) +{ + o.push_back(new pg_log_dup_t()); + o.push_back(new pg_log_dup_t(eversion_t(1,2), + 1, + osd_reqid_t(entity_name_t::CLIENT(777), 8, 999), + 0)); + o.push_back(new pg_log_dup_t(eversion_t(1,2), + 2, + osd_reqid_t(entity_name_t::CLIENT(777), 8, 999), + -ENOENT)); +} + + +std::ostream& operator<<(std::ostream& out, const pg_log_dup_t& e) { + out << "log_dup(reqid=" << e.reqid << + " v=" << e.version << " uv=" << e.user_version << + " rc=" << e.return_code; + if (!e.op_returns.empty()) { + out << " " << e.op_returns; + } + return out << ")"; +} + + +// -- pg_log_t -- + +// out: pg_log_t that only has entries that apply to import_pgid using curmap 
+// reject: Entries rejected from "in" are in the reject.log. Other fields not set. +void pg_log_t::filter_log(spg_t import_pgid, const OSDMap &curmap, + const string &hit_set_namespace, const pg_log_t &in, + pg_log_t &out, pg_log_t &reject) +{ + out = in; + out.log.clear(); + reject.log.clear(); + + for (auto i = in.log.cbegin(); i != in.log.cend(); ++i) { + + // Reject pg log entries for temporary objects + if (i->soid.is_temp()) { + reject.log.push_back(*i); + continue; + } + + if (i->soid.nspace != hit_set_namespace) { + object_t oid = i->soid.oid; + object_locator_t loc(i->soid); + pg_t raw_pgid = curmap.object_locator_to_pg(oid, loc); + pg_t pgid = curmap.raw_pg_to_pg(raw_pgid); + + if (import_pgid.pgid == pgid) { + out.log.push_back(*i); + } else { + reject.log.push_back(*i); + } + } else { + out.log.push_back(*i); + } + } +} + +void pg_log_t::encode(ceph::buffer::list& bl) const +{ + ENCODE_START(7, 3, bl); + encode(head, bl); + encode(tail, bl); + encode(log, bl); + encode(can_rollback_to, bl); + encode(rollback_info_trimmed_to, bl); + encode(dups, bl); + ENCODE_FINISH(bl); +} + +void pg_log_t::decode(ceph::buffer::list::const_iterator &bl, int64_t pool) +{ + DECODE_START_LEGACY_COMPAT_LEN(7, 3, 3, bl); + decode(head, bl); + decode(tail, bl); + if (struct_v < 2) { + bool backlog; + decode(backlog, bl); + } + decode(log, bl); + if (struct_v >= 5) + decode(can_rollback_to, bl); + + if (struct_v >= 6) + decode(rollback_info_trimmed_to, bl); + else + rollback_info_trimmed_to = tail; + + if (struct_v >= 7) + decode(dups, bl); + + DECODE_FINISH(bl); + + // handle hobject_t format change + if (struct_v < 4) { + for (auto i = log.begin(); i != log.end(); ++i) { + if (!i->soid.is_max() && i->soid.pool == -1) + i->soid.pool = pool; + } + } +} + +void pg_log_t::dump(Formatter *f) const +{ + f->dump_stream("head") << head; + f->dump_stream("tail") << tail; + f->open_array_section("log"); + for (auto p = log.cbegin(); p != log.cend(); ++p) { + 
f->open_object_section("entry"); + p->dump(f); + f->close_section(); + } + f->close_section(); + f->open_array_section("dups"); + for (const auto& entry : dups) { + f->open_object_section("entry"); + entry.dump(f); + f->close_section(); + } + f->close_section(); +} + +void pg_log_t::generate_test_instances(list& o) +{ + o.push_back(new pg_log_t); + + // this is nonsensical: + o.push_back(new pg_log_t); + o.back()->head = eversion_t(1,2); + o.back()->tail = eversion_t(3,4); + list e; + pg_log_entry_t::generate_test_instances(e); + for (auto p = e.begin(); p != e.end(); ++p) + o.back()->log.push_back(**p); +} + +static void _handle_dups(CephContext* cct, pg_log_t &target, const pg_log_t &other, unsigned maxdups) +{ + auto earliest_dup_version = + target.head.version < maxdups ? 0u : target.head.version - maxdups + 1; + lgeneric_subdout(cct, osd, 20) << __func__ << " earliest_dup_version " + << earliest_dup_version << dendl; + + for (auto d = other.dups.cbegin(); d != other.dups.cend(); ++d) { + if (d->version.version >= earliest_dup_version) { + lgeneric_subdout(cct, osd, 20) + << "copy_up_to/copy_after copy dup version " + << d->version << dendl; + target.dups.push_back(pg_log_dup_t(*d)); + } + } + + for (auto i = other.log.cbegin(); i != other.log.cend(); ++i) { + ceph_assert(i->version > other.tail); + if (i->version > target.tail) + break; + if (i->version.version >= earliest_dup_version) { + lgeneric_subdout(cct, osd, 20) + << "copy_up_to/copy_after copy dup from log version " + << i->version << dendl; + target.dups.push_back(pg_log_dup_t(*i)); + } + } +} + + +void pg_log_t::copy_after(CephContext* cct, const pg_log_t &other, eversion_t v) +{ + can_rollback_to = other.can_rollback_to; + head = other.head; + tail = other.tail; + lgeneric_subdout(cct, osd, 20) << __func__ << " v " << v + << " dups.size()=" << dups.size() + << " other.dups.size()=" << other.dups.size() << dendl; + for (auto i = other.log.crbegin(); i != other.log.crend(); ++i) { + 
ceph_assert(i->version > other.tail); + if (i->version <= v) { + // make tail accurate. + tail = i->version; + break; + } + lgeneric_subdout(cct, osd, 20) << __func__ << " copy log version " << i->version << dendl; + log.push_front(*i); + } + _handle_dups(cct, *this, other, cct->_conf->osd_pg_log_dups_tracked); + lgeneric_subdout(cct, osd, 20) << __func__ << " END v " << v + << " dups.size()=" << dups.size() + << " other.dups.size()=" << other.dups.size() << dendl; +} + +void pg_log_t::copy_up_to(CephContext* cct, const pg_log_t &other, int max) +{ + can_rollback_to = other.can_rollback_to; + int n = 0; + head = other.head; + tail = other.tail; + lgeneric_subdout(cct, osd, 20) << __func__ << " max " << max + << " dups.size()=" << dups.size() + << " other.dups.size()=" << other.dups.size() << dendl; + for (auto i = other.log.crbegin(); i != other.log.crend(); ++i) { + ceph_assert(i->version > other.tail); + if (n++ >= max) { + tail = i->version; + break; + } + lgeneric_subdout(cct, osd, 20) << __func__ << " copy log version " << i->version << dendl; + log.push_front(*i); + } + _handle_dups(cct, *this, other, cct->_conf->osd_pg_log_dups_tracked); + lgeneric_subdout(cct, osd, 20) << __func__ << " END max " << max + << " dups.size()=" << dups.size() + << " other.dups.size()=" << other.dups.size() << dendl; +} + +ostream& pg_log_t::print(ostream& out) const +{ + out << *this << std::endl; + for (auto p = log.cbegin(); p != log.cend(); ++p) + out << *p << std::endl; + for (const auto& entry : dups) { + out << " dup entry: " << entry << std::endl; + } + return out; +} + +// -- pg_missing_t -- + +ostream& operator<<(ostream& out, const pg_missing_item& i) +{ + out << i.need; + if (i.have != eversion_t()) + out << "(" << i.have << ")"; + out << " flags = " << i.flag_str() + << " " << i.clean_regions; + return out; +} + +// -- object_copy_cursor_t -- + +void object_copy_cursor_t::encode(ceph::buffer::list& bl) const +{ + ENCODE_START(1, 1, bl); + encode(attr_complete, bl); + 
encode(data_offset, bl); + encode(data_complete, bl); + encode(omap_offset, bl); + encode(omap_complete, bl); + ENCODE_FINISH(bl); +} + +void object_copy_cursor_t::decode(ceph::buffer::list::const_iterator &bl) +{ + DECODE_START(1, bl); + decode(attr_complete, bl); + decode(data_offset, bl); + decode(data_complete, bl); + decode(omap_offset, bl); + decode(omap_complete, bl); + DECODE_FINISH(bl); +} + +void object_copy_cursor_t::dump(Formatter *f) const +{ + f->dump_unsigned("attr_complete", (int)attr_complete); + f->dump_unsigned("data_offset", data_offset); + f->dump_unsigned("data_complete", (int)data_complete); + f->dump_string("omap_offset", omap_offset); + f->dump_unsigned("omap_complete", (int)omap_complete); +} + +void object_copy_cursor_t::generate_test_instances(list& o) +{ + o.push_back(new object_copy_cursor_t); + o.push_back(new object_copy_cursor_t); + o.back()->attr_complete = true; + o.back()->data_offset = 123; + o.push_back(new object_copy_cursor_t); + o.back()->attr_complete = true; + o.back()->data_complete = true; + o.back()->omap_offset = "foo"; + o.push_back(new object_copy_cursor_t); + o.back()->attr_complete = true; + o.back()->data_complete = true; + o.back()->omap_complete = true; +} + +// -- object_copy_data_t -- + +void object_copy_data_t::encode(ceph::buffer::list& bl, uint64_t features) const +{ + ENCODE_START(8, 5, bl); + encode(size, bl); + encode(mtime, bl); + encode(attrs, bl); + encode(data, bl); + encode(omap_data, bl); + encode(cursor, bl); + encode(omap_header, bl); + encode(snaps, bl); + encode(snap_seq, bl); + encode(flags, bl); + encode(data_digest, bl); + encode(omap_digest, bl); + encode(reqids, bl); + encode(truncate_seq, bl); + encode(truncate_size, bl); + encode(reqid_return_codes, bl); + ENCODE_FINISH(bl); +} + +void object_copy_data_t::decode(ceph::buffer::list::const_iterator& bl) +{ + DECODE_START(8, bl); + if (struct_v < 5) { + // old + decode(size, bl); + decode(mtime, bl); + { + string category; + 
decode(category, bl); // no longer used + } + decode(attrs, bl); + decode(data, bl); + { + map omap; + decode(omap, bl); + omap_data.clear(); + if (!omap.empty()) { + using ceph::encode; + encode(omap, omap_data); + } + } + decode(cursor, bl); + if (struct_v >= 2) + decode(omap_header, bl); + if (struct_v >= 3) { + decode(snaps, bl); + decode(snap_seq, bl); + } else { + snaps.clear(); + snap_seq = 0; + } + if (struct_v >= 4) { + decode(flags, bl); + decode(data_digest, bl); + decode(omap_digest, bl); + } + } else { + // current + decode(size, bl); + decode(mtime, bl); + decode(attrs, bl); + decode(data, bl); + decode(omap_data, bl); + decode(cursor, bl); + decode(omap_header, bl); + decode(snaps, bl); + decode(snap_seq, bl); + if (struct_v >= 4) { + decode(flags, bl); + decode(data_digest, bl); + decode(omap_digest, bl); + } + if (struct_v >= 6) { + decode(reqids, bl); + } + if (struct_v >= 7) { + decode(truncate_seq, bl); + decode(truncate_size, bl); + } + if (struct_v >= 8) { + decode(reqid_return_codes, bl); + } + } + DECODE_FINISH(bl); +} + +void object_copy_data_t::generate_test_instances(list& o) +{ + o.push_back(new object_copy_data_t()); + + list cursors; + object_copy_cursor_t::generate_test_instances(cursors); + auto ci = cursors.begin(); + o.back()->cursor = **(ci++); + + o.push_back(new object_copy_data_t()); + o.back()->cursor = **(ci++); + + o.push_back(new object_copy_data_t()); + o.back()->size = 1234; + o.back()->mtime.set_from_double(1234); + ceph::buffer::ptr bp("there", 5); + ceph::buffer::list bl; + bl.push_back(bp); + o.back()->attrs["hello"] = bl; + ceph::buffer::ptr bp2("not", 3); + ceph::buffer::list bl2; + bl2.push_back(bp2); + map omap; + omap["why"] = bl2; + using ceph::encode; + encode(omap, o.back()->omap_data); + ceph::buffer::ptr databp("iamsomedatatocontain", 20); + o.back()->data.push_back(databp); + o.back()->omap_header.append("this is an omap header"); + o.back()->snaps.push_back(123); + 
o.back()->reqids.push_back(make_pair(osd_reqid_t(), version_t())); +} + +void object_copy_data_t::dump(Formatter *f) const +{ + f->open_object_section("cursor"); + cursor.dump(f); + f->close_section(); // cursor + f->dump_int("size", size); + f->dump_stream("mtime") << mtime; + /* we should really print out the attrs here, but ceph::buffer::list + const-correctness prevents that */ + f->dump_int("attrs_size", attrs.size()); + f->dump_int("flags", flags); + f->dump_unsigned("data_digest", data_digest); + f->dump_unsigned("omap_digest", omap_digest); + f->dump_int("omap_data_length", omap_data.length()); + f->dump_int("omap_header_length", omap_header.length()); + f->dump_int("data_length", data.length()); + f->open_array_section("snaps"); + for (auto p = snaps.cbegin(); p != snaps.cend(); ++p) + f->dump_unsigned("snap", *p); + f->close_section(); + f->open_array_section("reqids"); + uint32_t idx = 0; + for (auto p = reqids.begin(); + p != reqids.end(); + ++idx, ++p) { + f->open_object_section("extra_reqid"); + f->dump_stream("reqid") << p->first; + f->dump_stream("user_version") << p->second; + auto it = reqid_return_codes.find(idx); + if (it != reqid_return_codes.end()) { + f->dump_int("return_code", it->second); + } + f->close_section(); + } + f->close_section(); +} + +// -- pg_create_t -- + +void pg_create_t::encode(ceph::buffer::list &bl) const +{ + ENCODE_START(1, 1, bl); + encode(created, bl); + encode(parent, bl); + encode(split_bits, bl); + ENCODE_FINISH(bl); +} + +void pg_create_t::decode(ceph::buffer::list::const_iterator &bl) +{ + DECODE_START(1, bl); + decode(created, bl); + decode(parent, bl); + decode(split_bits, bl); + DECODE_FINISH(bl); +} + +void pg_create_t::dump(Formatter *f) const +{ + f->dump_unsigned("created", created); + f->dump_stream("parent") << parent; + f->dump_int("split_bits", split_bits); +} + +void pg_create_t::generate_test_instances(list& o) +{ + o.push_back(new pg_create_t); + o.push_back(new pg_create_t(1, pg_t(3, 4), 2)); +} + + 
+// -- pg_hit_set_info_t -- + +void pg_hit_set_info_t::encode(ceph::buffer::list& bl) const +{ + ENCODE_START(2, 1, bl); + encode(begin, bl); + encode(end, bl); + encode(version, bl); + encode(using_gmt, bl); + ENCODE_FINISH(bl); +} + +void pg_hit_set_info_t::decode(ceph::buffer::list::const_iterator& p) +{ + DECODE_START(2, p); + decode(begin, p); + decode(end, p); + decode(version, p); + if (struct_v >= 2) { + decode(using_gmt, p); + } else { + using_gmt = false; + } + DECODE_FINISH(p); +} + +void pg_hit_set_info_t::dump(Formatter *f) const +{ + f->dump_stream("begin") << begin; + f->dump_stream("end") << end; + f->dump_stream("version") << version; + f->dump_stream("using_gmt") << using_gmt; +} + +void pg_hit_set_info_t::generate_test_instances(list& ls) +{ + ls.push_back(new pg_hit_set_info_t); + ls.push_back(new pg_hit_set_info_t); + ls.back()->begin = utime_t(1, 2); + ls.back()->end = utime_t(3, 4); +} + + +// -- pg_hit_set_history_t -- + +void pg_hit_set_history_t::encode(ceph::buffer::list& bl) const +{ + ENCODE_START(1, 1, bl); + encode(current_last_update, bl); + { + utime_t dummy_stamp; + encode(dummy_stamp, bl); + } + { + pg_hit_set_info_t dummy_info; + encode(dummy_info, bl); + } + encode(history, bl); + ENCODE_FINISH(bl); +} + +void pg_hit_set_history_t::decode(ceph::buffer::list::const_iterator& p) +{ + DECODE_START(1, p); + decode(current_last_update, p); + { + utime_t dummy_stamp; + decode(dummy_stamp, p); + } + { + pg_hit_set_info_t dummy_info; + decode(dummy_info, p); + } + decode(history, p); + DECODE_FINISH(p); +} + +void pg_hit_set_history_t::dump(Formatter *f) const +{ + f->dump_stream("current_last_update") << current_last_update; + f->open_array_section("history"); + for (auto p = history.cbegin(); p != history.cend(); ++p) { + f->open_object_section("info"); + p->dump(f); + f->close_section(); + } + f->close_section(); +} + +void pg_hit_set_history_t::generate_test_instances(list& ls) +{ + ls.push_back(new pg_hit_set_history_t); + 
ls.push_back(new pg_hit_set_history_t); + ls.back()->current_last_update = eversion_t(1, 2); + ls.back()->history.push_back(pg_hit_set_info_t()); +} + +// -- OSDSuperblock -- + +void OSDSuperblock::encode(ceph::buffer::list &bl) const +{ + ENCODE_START(10, 5, bl); + encode(cluster_fsid, bl); + encode(whoami, bl); + encode(current_epoch, bl); + encode(oldest_map, bl); + encode(newest_map, bl); + encode(weight, bl); + compat_features.encode(bl); + encode(clean_thru, bl); + encode(mounted, bl); + encode(osd_fsid, bl); + encode((epoch_t)0, bl); // epoch_t last_epoch_marked_full + encode((uint32_t)0, bl); // map pool_last_epoch_marked_full + encode(purged_snaps_last, bl); + encode(last_purged_snaps_scrub, bl); + encode(cluster_osdmap_trim_lower_bound, bl); + ENCODE_FINISH(bl); +} + +void OSDSuperblock::decode(ceph::buffer::list::const_iterator &bl) +{ + DECODE_START_LEGACY_COMPAT_LEN(10, 5, 5, bl); + if (struct_v < 3) { + string magic; + decode(magic, bl); + } + decode(cluster_fsid, bl); + decode(whoami, bl); + decode(current_epoch, bl); + decode(oldest_map, bl); + decode(newest_map, bl); + decode(weight, bl); + if (struct_v >= 2) { + compat_features.decode(bl); + } else { //upgrade it! 
+ compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE); + } + decode(clean_thru, bl); + decode(mounted, bl); + if (struct_v >= 4) + decode(osd_fsid, bl); + if (struct_v >= 6) { + epoch_t last_map_marked_full; + decode(last_map_marked_full, bl); + } + if (struct_v >= 7) { + map pool_last_map_marked_full; + decode(pool_last_map_marked_full, bl); + } + if (struct_v >= 9) { + decode(purged_snaps_last, bl); + decode(last_purged_snaps_scrub, bl); + } else { + purged_snaps_last = 0; + } + if (struct_v >= 10) { + decode(cluster_osdmap_trim_lower_bound, bl); + } else { + cluster_osdmap_trim_lower_bound = 0; + } + DECODE_FINISH(bl); +} + +void OSDSuperblock::dump(Formatter *f) const +{ + f->dump_stream("cluster_fsid") << cluster_fsid; + f->dump_stream("osd_fsid") << osd_fsid; + f->dump_int("whoami", whoami); + f->dump_int("current_epoch", current_epoch); + f->dump_int("oldest_map", oldest_map); + f->dump_int("newest_map", newest_map); + f->dump_float("weight", weight); + f->open_object_section("compat"); + compat_features.dump(f); + f->close_section(); + f->dump_int("clean_thru", clean_thru); + f->dump_int("last_epoch_mounted", mounted); + f->dump_unsigned("purged_snaps_last", purged_snaps_last); + f->dump_stream("last_purged_snaps_scrub") << last_purged_snaps_scrub; + f->dump_int("cluster_osdmap_trim_lower_bound", + cluster_osdmap_trim_lower_bound); +} + +void OSDSuperblock::generate_test_instances(list& o) +{ + OSDSuperblock z; + o.push_back(new OSDSuperblock(z)); + z.cluster_fsid.parse("01010101-0101-0101-0101-010101010101"); + z.osd_fsid.parse("02020202-0202-0202-0202-020202020202"); + z.whoami = 3; + z.current_epoch = 4; + z.oldest_map = 5; + z.newest_map = 9; + z.mounted = 8; + z.clean_thru = 7; + o.push_back(new OSDSuperblock(z)); + o.push_back(new OSDSuperblock(z)); +} + +// -- SnapSet -- + +void SnapSet::encode(ceph::buffer::list& bl) const +{ + ENCODE_START(3, 2, bl); + encode(seq, bl); + encode(true, bl); // head_exists + encode(snaps, bl); + 
encode(clones, bl); + encode(clone_overlap, bl); + encode(clone_size, bl); + encode(clone_snaps, bl); + ENCODE_FINISH(bl); +} + +void SnapSet::decode(ceph::buffer::list::const_iterator& bl) +{ + DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl); + decode(seq, bl); + bl += 1u; // skip legacy head_exists (always true) + decode(snaps, bl); + decode(clones, bl); + decode(clone_overlap, bl); + decode(clone_size, bl); + if (struct_v >= 3) { + decode(clone_snaps, bl); + } else { + clone_snaps.clear(); + } + DECODE_FINISH(bl); +} + +void SnapSet::dump(Formatter *f) const +{ + f->dump_unsigned("seq", seq); + f->open_array_section("clones"); + for (auto p = clones.cbegin(); p != clones.cend(); ++p) { + f->open_object_section("clone"); + f->dump_unsigned("snap", *p); + auto cs = clone_size.find(*p); + if (cs != clone_size.end()) + f->dump_unsigned("size", cs->second); + else + f->dump_string("size", "????"); + auto co = clone_overlap.find(*p); + if (co != clone_overlap.end()) + f->dump_stream("overlap") << co->second; + else + f->dump_stream("overlap") << "????"; + auto q = clone_snaps.find(*p); + if (q != clone_snaps.end()) { + f->open_array_section("snaps"); + for (auto s : q->second) { + f->dump_unsigned("snap", s); + } + f->close_section(); + } + f->close_section(); + } + f->close_section(); +} + +void SnapSet::generate_test_instances(list& o) +{ + o.push_back(new SnapSet); + o.push_back(new SnapSet); + o.back()->seq = 123; + o.back()->snaps.push_back(123); + o.back()->snaps.push_back(12); + o.push_back(new SnapSet); + o.back()->seq = 123; + o.back()->snaps.push_back(123); + o.back()->snaps.push_back(12); + o.back()->clones.push_back(12); + o.back()->clone_size[12] = 12345; + o.back()->clone_overlap[12]; + o.back()->clone_snaps[12] = {12, 10, 8}; +} + +ostream& operator<<(ostream& out, const SnapSet& cs) +{ + return out << cs.seq << "=" << cs.snaps << ":" + << cs.clone_snaps; +} + +void SnapSet::from_snap_set(const librados::snap_set_t& ss, bool legacy) +{ + // NOTE: our 
reconstruction of snaps (and the snapc) is not strictly + // correct: it will not include snaps that still logically exist + // but for which there was no clone that is defined. For all + // practical purposes this doesn't matter, since we only use that + // information to clone on the OSD, and we have already moved + // forward past that part of the object history. + + seq = ss.seq; + set _snaps; + set _clones; + for (auto p = ss.clones.cbegin(); p != ss.clones.cend(); ++p) { + if (p->cloneid != librados::SNAP_HEAD) { + _clones.insert(p->cloneid); + _snaps.insert(p->snaps.begin(), p->snaps.end()); + clone_size[p->cloneid] = p->size; + clone_overlap[p->cloneid]; // the entry must exist, even if it's empty. + for (auto q = p->overlap.cbegin(); q != p->overlap.cend(); ++q) + clone_overlap[p->cloneid].insert(q->first, q->second); + if (!legacy) { + // p->snaps is ascending; clone_snaps is descending + vector& v = clone_snaps[p->cloneid]; + for (auto q = p->snaps.rbegin(); q != p->snaps.rend(); ++q) { + v.push_back(*q); + } + } + } + } + + // ascending + clones.clear(); + clones.reserve(_clones.size()); + for (auto p = _clones.begin(); p != _clones.end(); ++p) + clones.push_back(*p); + + // descending + snaps.clear(); + snaps.reserve(_snaps.size()); + for (auto p = _snaps.rbegin(); + p != _snaps.rend(); ++p) + snaps.push_back(*p); +} + +uint64_t SnapSet::get_clone_bytes(snapid_t clone) const +{ + ceph_assert(clone_size.count(clone)); + uint64_t size = clone_size.find(clone)->second; + ceph_assert(clone_overlap.count(clone)); + const interval_set &overlap = clone_overlap.find(clone)->second; + ceph_assert(size >= (uint64_t)overlap.size()); + return size - overlap.size(); +} + +void SnapSet::filter(const pg_pool_t &pinfo) +{ + vector oldsnaps; + oldsnaps.swap(snaps); + for (auto i = oldsnaps.cbegin(); i != oldsnaps.cend(); ++i) { + if (!pinfo.is_removed_snap(*i)) + snaps.push_back(*i); + } +} + +SnapSet SnapSet::get_filtered(const pg_pool_t &pinfo) const +{ + SnapSet ss 
= *this; + ss.filter(pinfo); + return ss; +} + +// -- watch_info_t -- + +void watch_info_t::encode(ceph::buffer::list& bl, uint64_t features) const +{ + ENCODE_START(4, 3, bl); + encode(cookie, bl); + encode(timeout_seconds, bl); + encode(addr, bl, features); + ENCODE_FINISH(bl); +} + +void watch_info_t::decode(ceph::buffer::list::const_iterator& bl) +{ + DECODE_START_LEGACY_COMPAT_LEN(4, 3, 3, bl); + decode(cookie, bl); + if (struct_v < 2) { + uint64_t ver; + decode(ver, bl); + } + decode(timeout_seconds, bl); + if (struct_v >= 4) { + decode(addr, bl); + } + DECODE_FINISH(bl); +} + +void watch_info_t::dump(Formatter *f) const +{ + f->dump_unsigned("cookie", cookie); + f->dump_unsigned("timeout_seconds", timeout_seconds); + f->open_object_section("addr"); + addr.dump(f); + f->close_section(); +} + +void watch_info_t::generate_test_instances(list& o) +{ + o.push_back(new watch_info_t); + o.push_back(new watch_info_t); + o.back()->cookie = 123; + o.back()->timeout_seconds = 99; + entity_addr_t ea; + ea.set_type(entity_addr_t::TYPE_LEGACY); + ea.set_nonce(1); + ea.set_family(AF_INET); + ea.set_in4_quad(0, 127); + ea.set_in4_quad(1, 0); + ea.set_in4_quad(2, 1); + ea.set_in4_quad(3, 2); + ea.set_port(2); + o.back()->addr = ea; +} + +// -- chunk_info_t -- + +void chunk_info_t::encode(ceph::buffer::list& bl) const +{ + ENCODE_START(1, 1, bl); + encode(offset, bl); + encode(length, bl); + encode(oid, bl); + __u32 _flags = flags; + encode(_flags, bl); + ENCODE_FINISH(bl); +} + +void chunk_info_t::decode(ceph::buffer::list::const_iterator& bl) +{ + DECODE_START(1, bl); + decode(offset, bl); + decode(length, bl); + decode(oid, bl); + __u32 _flags; + decode(_flags, bl); + flags = (cflag_t)_flags; + DECODE_FINISH(bl); +} + +void chunk_info_t::dump(Formatter *f) const +{ + f->dump_unsigned("length", length); + f->open_object_section("oid"); + oid.dump(f); + f->close_section(); + f->dump_unsigned("flags", flags); +} + + +bool chunk_info_t::operator==(const chunk_info_t& cit) 
const +{ + if (has_fingerprint()) { + if (oid.oid.name == cit.oid.oid.name) { + return true; + } + } else { + if (offset == cit.offset && length == cit.length && + oid.oid.name == cit.oid.oid.name) { + return true; + } + + } + return false; +} + +bool operator==(const std::pair & l, + const std::pair & r) +{ + return l.first == r.first && + l.second == r.second; +} + +ostream& operator<<(ostream& out, const chunk_info_t& ci) +{ + return out << "(len: " << ci.length << " oid: " << ci.oid + << " offset: " << ci.offset + << " flags: " << ci.get_flag_string(ci.flags) << ")"; +} + +// -- object_manifest_t -- + +std::ostream& operator<<(std::ostream& out, const object_ref_delta_t & ci) +{ + return out << ci.ref_delta << std::endl; +} + +/* Accumulate into refs the reference changes for every chunk in this manifest's chunk_map, comparing each chunk against the chunk at the same offset in _g and _l (either pointer may be null, in which case it never matches). */ void object_manifest_t::calc_refs_to_inc_on_set( + const object_manifest_t* _g, + const object_manifest_t* _l, + object_ref_delta_t &refs) const +{ + /* avoid incrementing the same reference on adjacent clones */ + auto iter = chunk_map.begin(); + auto find_chunk = [](decltype(iter) &i, const object_manifest_t* cur) + -> bool { + if (cur) { + auto c = cur->chunk_map.find(i->first); + if (c != cur->chunk_map.end() && c->second == i->second) { + return true; + + } + } + return false; + }; + + /* If the same chunk exists on either _g or _l, do not increment + * the reference; if it exists on both, decrement it + * + * head: [0, 2) ccc, [6, 2) bbb, [8, 2) ccc + * 20: [0, 2) aaa, <- set_chunk + * 30: [0, 2) abc, [6, 2) bbb, [8, 2) ccc + * --> increment the reference + * + * head: [0, 2) ccc, [6, 2) bbb, [8, 2) ccc + * 20: [0, 2) ccc, <- set_chunk + * 30: [0, 2) abc, [6, 2) bbb, [8, 2) ccc + * --> do not need to increment + * + * head: [0, 2) ccc, [6, 2) bbb, [8, 2) ccc + * 20: [0, 2) ccc, <- set_chunk + * 30: [0, 2) ccc, [6, 2) bbb, [8, 2) ccc + * --> decrement the reference of ccc + * + */ + for (; iter != chunk_map.end(); ++iter) { + auto found_g = find_chunk(iter, _g); + auto found_l = find_chunk(iter, _l); + if (!found_g && !found_l) { + refs.inc_ref(iter->second.oid); 
+ } else if (found_g && found_l) { + refs.dec_ref(iter->second.oid); + } + } +} + +/* For every chunk that clean_regions reports as not clean, drop its reference into refs, unless _l is non-null and holds an equal chunk at the same offset. */ void object_manifest_t::calc_refs_to_drop_on_modify( + const object_manifest_t* _l, + const ObjectCleanRegions& clean_regions, + object_ref_delta_t &refs) const +{ + for (auto &p : chunk_map) { + if (!clean_regions.is_clean_region(p.first, p.second.length)) { + // has previous snapshot + if (_l) { + /* + * Let's assume that there is a manifest snapshotted object which has three chunks + * head: [0, 2) aaa, [6, 2) bbb, [8, 2) ccc + * 20: [0, 2) aaa, [6, 2) bbb, [8, 2) ccc + * + * If we modify [6, 2) at head, we shouldn't decrement bbb's refcount because + * 20 has the reference for bbb. Therefore, we only drop the reference if two chunks + * (head: [6, 2) and 20: [6, 2)) are different. + * + */ + auto c = _l->chunk_map.find(p.first); + if (c != _l->chunk_map.end()) { + if (p.second == c->second) { + continue; + } + } + refs.dec_ref(p.second.oid); + } else { + // decrement the reference of the updated chunks if the manifest object has no snapshot + refs.dec_ref(p.second.oid); + } + } + } +} + +void object_manifest_t::calc_refs_to_drop_on_removal( + const object_manifest_t* _g, + const object_manifest_t* _l, + object_ref_delta_t &refs) const +{ + /* At a high level, the rule is that consecutive clones with the same reference + * at the same offset share a reference. As such, removing *this may result + * in removing references in two cases: + * 1) *this has a reference which it shares with neither _g nor _l + * 2) _g and _l have a reference which they share with each other but not + * *this. + * + * For a particular offset, both 1 and 2 can happen. + * + * Notably, this means that to evaluate the reference change from removing + * the object with *this, we only need to look at the two adjacent clones. + */ + + // Paper over possibly missing _g or _l -- a null pointer is semantically the same + // as an empty chunk_map + static const object_manifest_t empty; + const object_manifest_t &g = _g ? 
*_g : empty; + const object_manifest_t &l = _l ? *_l : empty; + + auto giter = g.chunk_map.begin(); + auto iter = chunk_map.begin(); + auto liter = l.chunk_map.begin(); + + // Translate iter, map pair to the current offset, end() -> max + auto get_offset = [](decltype(iter) &i, const object_manifest_t &manifest) + -> uint64_t { + return i == manifest.chunk_map.end() ? + std::numeric_limits::max() : i->first; + }; + + /* If current matches the offset at iter, returns the chunk at *iter + * and increments iter. Otherwise, returns nullptr. + * + * current will always be derived from the min of *giter, *iter, and + * *liter on each cycle, so the result will be that each loop iteration + * will pick up all chunks at the offset being considered, each offset + * will be considered once, and all offsets will be considered. + */ + auto get_chunk = []( + uint64_t current, decltype(iter) &i, const object_manifest_t &manifest) + -> const chunk_info_t * { + if (i == manifest.chunk_map.end() || current != i->first) { + return nullptr; + } else { + return &(i++)->second; + } + }; + + while (giter != g.chunk_map.end() || + iter != chunk_map.end() || + liter != l.chunk_map.end()) { + auto current = std::min( + std::min(get_offset(giter, g), get_offset(iter, *this)), + get_offset(liter, l)); + + auto gchunk = get_chunk(current, giter, g); + auto chunk = get_chunk(current, iter, *this); + auto lchunk = get_chunk(current, liter, l); + + if (gchunk && lchunk && *gchunk == *lchunk && + (!chunk || *gchunk != *chunk)) { + // case 2 from above: l and g match, chunk does not + refs.dec_ref(gchunk->oid); + } + + if (chunk && + (!gchunk || chunk->oid != gchunk->oid) && + (!lchunk || chunk->oid != lchunk->oid)) { + // case 1 from above: *this matches neither + refs.dec_ref(chunk->oid); + } + } +} + +void object_manifest_t::encode(ceph::buffer::list& bl) const +{ + ENCODE_START(1, 1, bl); + encode(type, bl); + switch (type) { + case TYPE_NONE: break; + case TYPE_REDIRECT: + 
encode(redirect_target, bl); + break; + case TYPE_CHUNKED: + encode(chunk_map, bl); + break; + default: + ceph_abort(); + } + ENCODE_FINISH(bl); +} + +void object_manifest_t::decode(ceph::buffer::list::const_iterator& bl) +{ + DECODE_START(1, bl); + decode(type, bl); + switch (type) { + case TYPE_NONE: break; + case TYPE_REDIRECT: + decode(redirect_target, bl); + break; + case TYPE_CHUNKED: + decode(chunk_map, bl); + break; + default: + ceph_abort(); + } + DECODE_FINISH(bl); +} + +void object_manifest_t::dump(Formatter *f) const +{ + f->dump_unsigned("type", type); + if (type == TYPE_REDIRECT) { + f->open_object_section("redirect_target"); + redirect_target.dump(f); + f->close_section(); + } else if (type == TYPE_CHUNKED) { + f->open_array_section("chunk_map"); + for (auto& p : chunk_map) { + f->open_object_section("chunk"); + f->dump_unsigned("offset", p.first); + p.second.dump(f); + f->close_section(); + } + f->close_section(); + } +} + +void object_manifest_t::generate_test_instances(list& o) +{ + o.push_back(new object_manifest_t()); + o.back()->type = TYPE_REDIRECT; +} + +ostream& operator<<(ostream& out, const object_manifest_t& om) +{ + out << "manifest(" << om.get_type_name(); + if (om.is_redirect()) { + out << " " << om.redirect_target; + } else if (om.is_chunked()) { + out << " " << om.chunk_map; + } + out << ")"; + return out; +} + +// -- object_info_t -- + +void object_info_t::copy_user_bits(const object_info_t& other) +{ + // these bits are copied from head->clone. 
+ size = other.size; + mtime = other.mtime; + local_mtime = other.local_mtime; + last_reqid = other.last_reqid; + truncate_seq = other.truncate_seq; + truncate_size = other.truncate_size; + flags = other.flags; + user_version = other.user_version; + data_digest = other.data_digest; + omap_digest = other.omap_digest; +} + +void object_info_t::encode(ceph::buffer::list& bl, uint64_t features) const +{ + object_locator_t myoloc(soid); + map old_watchers; + for (auto i = watchers.cbegin(); i != watchers.cend(); ++i) { + old_watchers.insert(make_pair(i->first.second, i->second)); + } + ENCODE_START(17, 8, bl); + encode(soid, bl); + encode(myoloc, bl); //Retained for compatibility + encode((__u32)0, bl); // was category, no longer used + encode(version, bl); + encode(prior_version, bl); + encode(last_reqid, bl); + encode(size, bl); + encode(mtime, bl); + if (soid.snap == CEPH_NOSNAP) + encode(osd_reqid_t(), bl); // used to be wrlock_by + else + encode((uint32_t)0, bl); // was legacy_snaps + encode(truncate_seq, bl); + encode(truncate_size, bl); + encode(is_lost(), bl); + encode(old_watchers, bl, features); + /* shenanigans to avoid breaking backwards compatibility in the disk format. + * When we can, switch this out for simply putting the version_t on disk. 
*/ + eversion_t user_eversion(0, user_version); + encode(user_eversion, bl); + encode(test_flag(FLAG_USES_TMAP), bl); + encode(watchers, bl, features); + __u32 _flags = flags; + encode(_flags, bl); + encode(local_mtime, bl); + encode(data_digest, bl); + encode(omap_digest, bl); + encode(expected_object_size, bl); + encode(expected_write_size, bl); + encode(alloc_hint_flags, bl); + if (has_manifest()) { + encode(manifest, bl); + } + ENCODE_FINISH(bl); +} + +void object_info_t::decode(ceph::buffer::list::const_iterator& bl) +{ + object_locator_t myoloc; + DECODE_START_LEGACY_COMPAT_LEN(17, 8, 8, bl); + map old_watchers; + decode(soid, bl); + decode(myoloc, bl); + { + string category; + decode(category, bl); // no longer used + } + decode(version, bl); + decode(prior_version, bl); + decode(last_reqid, bl); + decode(size, bl); + decode(mtime, bl); + if (soid.snap == CEPH_NOSNAP) { + osd_reqid_t wrlock_by; + decode(wrlock_by, bl); + } else { + vector legacy_snaps; + decode(legacy_snaps, bl); + } + decode(truncate_seq, bl); + decode(truncate_size, bl); + + // if this is struct_v >= 13, we will overwrite this + // below since this field is just here for backwards + // compatibility + __u8 lo; + decode(lo, bl); + flags = (flag_t)lo; + + decode(old_watchers, bl); + eversion_t user_eversion; + decode(user_eversion, bl); + user_version = user_eversion.version; + + if (struct_v >= 9) { + bool uses_tmap = false; + decode(uses_tmap, bl); + if (uses_tmap) + set_flag(FLAG_USES_TMAP); + } else { + set_flag(FLAG_USES_TMAP); + } + if (struct_v < 10) + soid.pool = myoloc.pool; + if (struct_v >= 11) { + decode(watchers, bl); + } else { + for (auto i = old_watchers.begin(); i != old_watchers.end(); ++i) { + watchers.insert( + make_pair( + make_pair(i->second.cookie, i->first), i->second)); + } + } + if (struct_v >= 13) { + __u32 _flags; + decode(_flags, bl); + flags = (flag_t)_flags; + } + if (struct_v >= 14) { + decode(local_mtime, bl); + } else { + local_mtime = utime_t(); + } + if 
(struct_v >= 15) { + decode(data_digest, bl); + decode(omap_digest, bl); + } else { + data_digest = omap_digest = -1; + clear_flag(FLAG_DATA_DIGEST); + clear_flag(FLAG_OMAP_DIGEST); + } + if (struct_v >= 16) { + decode(expected_object_size, bl); + decode(expected_write_size, bl); + decode(alloc_hint_flags, bl); + } else { + expected_object_size = 0; + expected_write_size = 0; + alloc_hint_flags = 0; + } + if (struct_v >= 17) { + if (has_manifest()) { + decode(manifest, bl); + } + } + DECODE_FINISH(bl); +} + +void object_info_t::dump(Formatter *f) const +{ + f->open_object_section("oid"); + soid.dump(f); + f->close_section(); + f->dump_stream("version") << version; + f->dump_stream("prior_version") << prior_version; + f->dump_stream("last_reqid") << last_reqid; + f->dump_unsigned("user_version", user_version); + f->dump_unsigned("size", size); + f->dump_stream("mtime") << mtime; + f->dump_stream("local_mtime") << local_mtime; + f->dump_unsigned("lost", (int)is_lost()); + vector sv = get_flag_vector(flags); + f->open_array_section("flags"); + for (const auto& str: sv) { + f->dump_string("flags", str); + } + f->close_section(); + f->dump_unsigned("truncate_seq", truncate_seq); + f->dump_unsigned("truncate_size", truncate_size); + f->dump_format("data_digest", "0x%08x", data_digest); + f->dump_format("omap_digest", "0x%08x", omap_digest); + f->dump_unsigned("expected_object_size", expected_object_size); + f->dump_unsigned("expected_write_size", expected_write_size); + f->dump_unsigned("alloc_hint_flags", alloc_hint_flags); + f->dump_object("manifest", manifest); + f->open_object_section("watchers"); + for (auto p = watchers.cbegin(); p != watchers.cend(); ++p) { + CachedStackStringStream css; + *css << p->first.second; + f->open_object_section(css->strv()); + p->second.dump(f); + f->close_section(); + } + f->close_section(); +} + +void object_info_t::generate_test_instances(list& o) +{ + o.push_back(new object_info_t()); + + // fixme +} + + +ostream& 
operator<<(ostream& out, const object_info_t& oi) +{ + out << oi.soid << "(" << oi.version + << " " << oi.last_reqid; + if (oi.flags) + out << " " << oi.get_flag_string(); + out << " s " << oi.size; + out << " uv " << oi.user_version; + if (oi.is_data_digest()) + out << " dd " << std::hex << oi.data_digest << std::dec; + if (oi.is_omap_digest()) + out << " od " << std::hex << oi.omap_digest << std::dec; + out << " alloc_hint [" << oi.expected_object_size + << " " << oi.expected_write_size + << " " << oi.alloc_hint_flags << "]"; + if (oi.has_manifest()) + out << " " << oi.manifest; + out << ")"; + return out; +} + +// -- ObjectRecovery -- +void ObjectRecoveryProgress::encode(ceph::buffer::list &bl) const +{ + ENCODE_START(1, 1, bl); + encode(first, bl); + encode(data_complete, bl); + encode(data_recovered_to, bl); + encode(omap_recovered_to, bl); + encode(omap_complete, bl); + ENCODE_FINISH(bl); +} + +void ObjectRecoveryProgress::decode(ceph::buffer::list::const_iterator &bl) +{ + DECODE_START(1, bl); + decode(first, bl); + decode(data_complete, bl); + decode(data_recovered_to, bl); + decode(omap_recovered_to, bl); + decode(omap_complete, bl); + DECODE_FINISH(bl); +} + +ostream &operator<<(ostream &out, const ObjectRecoveryProgress &prog) +{ + return prog.print(out); +} + +void ObjectRecoveryProgress::generate_test_instances( + list& o) +{ + o.push_back(new ObjectRecoveryProgress); + o.back()->first = false; + o.back()->data_complete = true; + o.back()->omap_complete = true; + o.back()->data_recovered_to = 100; + + o.push_back(new ObjectRecoveryProgress); + o.back()->first = true; + o.back()->data_complete = false; + o.back()->omap_complete = false; + o.back()->data_recovered_to = 0; +} + +ostream &ObjectRecoveryProgress::print(ostream &out) const +{ + return out << "ObjectRecoveryProgress(" + << ( first ? "" : "!" ) << "first, " + << "data_recovered_to:" << data_recovered_to + << ", data_complete:" << ( data_complete ? 
"true" : "false" ) + << ", omap_recovered_to:" << omap_recovered_to + << ", omap_complete:" << ( omap_complete ? "true" : "false" ) + << ", error:" << ( error ? "true" : "false" ) + << ")"; +} + +void ObjectRecoveryProgress::dump(Formatter *f) const +{ + f->dump_int("first?", first); + f->dump_int("data_complete?", data_complete); + f->dump_unsigned("data_recovered_to", data_recovered_to); + f->dump_int("omap_complete?", omap_complete); + f->dump_string("omap_recovered_to", omap_recovered_to); +} + +void ObjectRecoveryInfo::encode(ceph::buffer::list &bl, uint64_t features) const +{ + ENCODE_START(3, 1, bl); + encode(soid, bl); + encode(version, bl); + encode(size, bl); + encode(oi, bl, features); + encode(ss, bl); + encode(copy_subset, bl); + encode(clone_subset, bl); + encode(object_exist, bl); + ENCODE_FINISH(bl); +} + +void ObjectRecoveryInfo::decode(ceph::buffer::list::const_iterator &bl, + int64_t pool) +{ + DECODE_START(3, bl); + decode(soid, bl); + decode(version, bl); + decode(size, bl); + decode(oi, bl); + decode(ss, bl); + decode(copy_subset, bl); + decode(clone_subset, bl); + if (struct_v > 2) + decode(object_exist, bl); + else + object_exist = false; + DECODE_FINISH(bl); + if (struct_v < 2) { + if (!soid.is_max() && soid.pool == -1) + soid.pool = pool; + map> tmp; + tmp.swap(clone_subset); + for (auto i = tmp.begin(); i != tmp.end(); ++i) { + hobject_t first(i->first); + if (!first.is_max() && first.pool == -1) + first.pool = pool; + clone_subset[first].swap(i->second); + } + } +} + +void ObjectRecoveryInfo::generate_test_instances( + list& o) +{ + o.push_back(new ObjectRecoveryInfo); + o.back()->soid = hobject_t(sobject_t("key", CEPH_NOSNAP)); + o.back()->version = eversion_t(0,0); + o.back()->size = 100; + o.back()->object_exist = false; +} + + +void ObjectRecoveryInfo::dump(Formatter *f) const +{ + f->dump_stream("object") << soid; + f->dump_stream("at_version") << version; + f->dump_stream("size") << size; + { + 
f->open_object_section("object_info"); + oi.dump(f); + f->close_section(); + } + { + f->open_object_section("snapset"); + ss.dump(f); + f->close_section(); + } + f->dump_stream("copy_subset") << copy_subset; + f->dump_stream("clone_subset") << clone_subset; + f->dump_stream("object_exist") << object_exist; +} + +ostream& operator<<(ostream& out, const ObjectRecoveryInfo &inf) +{ + return inf.print(out); +} + +ostream &ObjectRecoveryInfo::print(ostream &out) const +{ + return out << "ObjectRecoveryInfo(" + << soid << "@" << version + << ", size: " << size + << ", copy_subset: " << copy_subset + << ", clone_subset: " << clone_subset + << ", snapset: " << ss + << ", object_exist: " << object_exist + << ")"; +} + +// -- PushReplyOp -- +void PushReplyOp::generate_test_instances(list &o) +{ + o.push_back(new PushReplyOp); + o.push_back(new PushReplyOp); + o.back()->soid = hobject_t(sobject_t("asdf", 2)); + o.push_back(new PushReplyOp); + o.back()->soid = hobject_t(sobject_t("asdf", CEPH_NOSNAP)); +} + +void PushReplyOp::encode(ceph::buffer::list &bl) const +{ + ENCODE_START(1, 1, bl); + encode(soid, bl); + ENCODE_FINISH(bl); +} + +void PushReplyOp::decode(ceph::buffer::list::const_iterator &bl) +{ + DECODE_START(1, bl); + decode(soid, bl); + DECODE_FINISH(bl); +} + +void PushReplyOp::dump(Formatter *f) const +{ + f->dump_stream("soid") << soid; +} + +ostream &PushReplyOp::print(ostream &out) const +{ + return out + << "PushReplyOp(" << soid + << ")"; +} + +ostream& operator<<(ostream& out, const PushReplyOp &op) +{ + return op.print(out); +} + +uint64_t PushReplyOp::cost(CephContext *cct) const +{ + if (cct->_conf->osd_op_queue == "mclock_scheduler") { + /* In general, we really never want to throttle PushReplyOp messages. 
+ * As long as the object is smaller than osd_recovery_max_chunk (8M at + * time of writing this comment, so this is basically always true), + * processing the PushReplyOp does not cost any further IO and simply + * permits the object once more to be written to. + * + * In the unlikely event that the object is larger than + * osd_recovery_max_chunk (again, 8M at the moment, so never for common + * configurations of rbd and virtually never for cephfs and rgw), + * we *still* want to push out the next portion immediately so that we can + * release the object for IO. + * + * The throttling for this operation on the primary occurs at the point + * where we queue the PGRecoveryContext which calls into recover_missing + * and recover_backfill to initiate pushes. + * See OSD::queue_recovery_context. + */ + return 1; + } else { + /* We retain this legacy behavior for WeightedPriorityQueue. It seems to + * require very large costs for several messages in order to do any + * meaningful amount of throttling. This branch should be removed after + * Reef. 
+ */ + return cct->_conf->osd_push_per_object_cost + + cct->_conf->osd_recovery_max_chunk; + } +} + +// -- PullOp -- +void PullOp::generate_test_instances(list &o) +{ + o.push_back(new PullOp); + o.push_back(new PullOp); + o.back()->soid = hobject_t(sobject_t("asdf", 2)); + o.back()->recovery_info.version = eversion_t(3, 10); + o.push_back(new PullOp); + o.back()->soid = hobject_t(sobject_t("asdf", CEPH_NOSNAP)); + o.back()->recovery_info.version = eversion_t(0, 0); +} + +void PullOp::encode(ceph::buffer::list &bl, uint64_t features) const +{ + ENCODE_START(1, 1, bl); + encode(soid, bl); + encode(recovery_info, bl, features); + encode(recovery_progress, bl); + ENCODE_FINISH(bl); +} + +void PullOp::decode(ceph::buffer::list::const_iterator &bl) +{ + DECODE_START(1, bl); + decode(soid, bl); + decode(recovery_info, bl); + decode(recovery_progress, bl); + DECODE_FINISH(bl); +} + +void PullOp::dump(Formatter *f) const +{ + f->dump_stream("soid") << soid; + { + f->open_object_section("recovery_info"); + recovery_info.dump(f); + f->close_section(); + } + { + f->open_object_section("recovery_progress"); + recovery_progress.dump(f); + f->close_section(); + } +} + +ostream &PullOp::print(ostream &out) const +{ + return out + << "PullOp(" << soid + << ", recovery_info: " << recovery_info + << ", recovery_progress: " << recovery_progress + << ")"; +} + +ostream& operator<<(ostream& out, const PullOp &op) +{ + return op.print(out); +} + +uint64_t PullOp::cost(CephContext *cct) const +{ + if (cct->_conf->osd_op_queue == "mclock_scheduler") { + return std::clamp( + recovery_progress.estimate_remaining_data_to_recover(recovery_info), + 1, + cct->_conf->osd_recovery_max_chunk); + } else { + /* We retain this legacy behavior for WeightedPriorityQueue. It seems to + * require very large costs for several messages in order to do any + * meaningful amount of throttling. This branch should be removed after + * Reef. 
+ */ + return cct->_conf->osd_push_per_object_cost + + cct->_conf->osd_recovery_max_chunk; + } +} + +// -- PushOp -- +void PushOp::generate_test_instances(list &o) +{ + o.push_back(new PushOp); + o.push_back(new PushOp); + o.back()->soid = hobject_t(sobject_t("asdf", 2)); + o.back()->version = eversion_t(3, 10); + o.push_back(new PushOp); + o.back()->soid = hobject_t(sobject_t("asdf", CEPH_NOSNAP)); + o.back()->version = eversion_t(0, 0); +} + +void PushOp::encode(ceph::buffer::list &bl, uint64_t features) const +{ + ENCODE_START(1, 1, bl); + encode(soid, bl); + encode(version, bl); + encode(data, bl); + encode(data_included, bl); + encode(omap_header, bl); + encode(omap_entries, bl); + encode(attrset, bl); + encode(recovery_info, bl, features); + encode(after_progress, bl); + encode(before_progress, bl); + ENCODE_FINISH(bl); +} + +void PushOp::decode(ceph::buffer::list::const_iterator &bl) +{ + DECODE_START(1, bl); + decode(soid, bl); + decode(version, bl); + decode(data, bl); + decode(data_included, bl); + decode(omap_header, bl); + decode(omap_entries, bl); + decode(attrset, bl); + decode(recovery_info, bl); + decode(after_progress, bl); + decode(before_progress, bl); + DECODE_FINISH(bl); +} + +void PushOp::dump(Formatter *f) const +{ + f->dump_stream("soid") << soid; + f->dump_stream("version") << version; + f->dump_int("data_len", data.length()); + f->dump_stream("data_included") << data_included; + f->dump_int("omap_header_len", omap_header.length()); + f->dump_int("omap_entries_len", omap_entries.size()); + f->dump_int("attrset_len", attrset.size()); + { + f->open_object_section("recovery_info"); + recovery_info.dump(f); + f->close_section(); + } + { + f->open_object_section("after_progress"); + after_progress.dump(f); + f->close_section(); + } + { + f->open_object_section("before_progress"); + before_progress.dump(f); + f->close_section(); + } +} + +ostream &PushOp::print(ostream &out) const +{ + return out + << "PushOp(" << soid + << ", version: " << 
version + << ", data_included: " << data_included + << ", data_size: " << data.length() + << ", omap_header_size: " << omap_header.length() + << ", omap_entries_size: " << omap_entries.size() + << ", attrset_size: " << attrset.size() + << ", recovery_info: " << recovery_info + << ", after_progress: " << after_progress + << ", before_progress: " << before_progress + << ")"; +} + +ostream& operator<<(ostream& out, const PushOp &op) +{ + return op.print(out); +} + +uint64_t PushOp::cost(CephContext *cct) const +{ + uint64_t cost = data_included.size(); + for (auto i = omap_entries.cbegin(); i != omap_entries.cend(); ++i) { + cost += i->second.length(); + } + cost += cct->_conf->osd_push_per_object_cost; + return cost; +} + +// -- ScrubMap -- + +void ScrubMap::merge_incr(const ScrubMap &l) +{ + ceph_assert(valid_through == l.incr_since); + valid_through = l.valid_through; + + for (auto p = l.objects.cbegin(); p != l.objects.cend(); ++p){ + if (p->second.negative) { + auto q = objects.find(p->first); + if (q != objects.end()) { + objects.erase(q); + } + } else { + objects[p->first] = p->second; + } + } +} + +void ScrubMap::encode(ceph::buffer::list& bl) const +{ + ENCODE_START(3, 2, bl); + encode(objects, bl); + encode((__u32)0, bl); // used to be attrs; now deprecated + ceph::buffer::list old_logbl; // not used + encode(old_logbl, bl); + encode(valid_through, bl); + encode(incr_since, bl); + ENCODE_FINISH(bl); +} + +void ScrubMap::decode(ceph::buffer::list::const_iterator& bl, int64_t pool) +{ + DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl); + decode(objects, bl); + { + map attrs; // deprecated + decode(attrs, bl); + } + ceph::buffer::list old_logbl; // not used + decode(old_logbl, bl); + decode(valid_through, bl); + decode(incr_since, bl); + DECODE_FINISH(bl); + + // handle hobject_t upgrade + if (struct_v < 3) { + map tmp; + tmp.swap(objects); + for (auto i = tmp.begin(); i != tmp.end(); ++i) { + hobject_t first(i->first); + if (!first.is_max() && first.pool == -1) + 
first.pool = pool; + objects[first] = i->second; + } + } +} + +void ScrubMap::dump(Formatter *f) const +{ + f->dump_stream("valid_through") << valid_through; + f->dump_stream("incremental_since") << incr_since; + f->open_array_section("objects"); + for (auto p = objects.cbegin(); p != objects.cend(); ++p) { + f->open_object_section("object"); + f->dump_string("name", p->first.oid.name); + f->dump_unsigned("hash", p->first.get_hash()); + f->dump_string("key", p->first.get_key()); + f->dump_int("snapid", p->first.snap); + p->second.dump(f); + f->close_section(); + } + f->close_section(); +} + +void ScrubMap::generate_test_instances(list& o) +{ + o.push_back(new ScrubMap); + o.push_back(new ScrubMap); + o.back()->valid_through = eversion_t(1, 2); + o.back()->incr_since = eversion_t(3, 4); + list obj; + object::generate_test_instances(obj); + o.back()->objects[hobject_t(object_t("foo"), "fookey", 123, 456, 0, "")] = *obj.back(); + obj.pop_back(); + o.back()->objects[hobject_t(object_t("bar"), string(), 123, 456, 0, "")] = *obj.back(); +} + +// -- ScrubMap::object -- + +void ScrubMap::object::encode(ceph::buffer::list& bl) const +{ + bool compat_read_error = read_error || ec_hash_mismatch || ec_size_mismatch; + ENCODE_START(10, 7, bl); + encode(size, bl); + encode(negative, bl); + encode(attrs, bl); + encode(digest, bl); + encode(digest_present, bl); + encode((uint32_t)0, bl); // obsolete nlinks + encode((uint32_t)0, bl); // snapcolls + encode(omap_digest, bl); + encode(omap_digest_present, bl); + encode(compat_read_error, bl); + encode(stat_error, bl); + encode(read_error, bl); + encode(ec_hash_mismatch, bl); + encode(ec_size_mismatch, bl); + encode(large_omap_object_found, bl); + encode(large_omap_object_key_count, bl); + encode(large_omap_object_value_size, bl); + encode(object_omap_bytes, bl); + encode(object_omap_keys, bl); + ENCODE_FINISH(bl); +} + +void ScrubMap::object::decode(ceph::buffer::list::const_iterator& bl) +{ + DECODE_START(10, bl); + decode(size, bl); 
+ bool tmp, compat_read_error = false; + decode(tmp, bl); + negative = tmp; + decode(attrs, bl); + decode(digest, bl); + decode(tmp, bl); + digest_present = tmp; + { + uint32_t nlinks; + decode(nlinks, bl); + set snapcolls; + decode(snapcolls, bl); + } + decode(omap_digest, bl); + decode(tmp, bl); + omap_digest_present = tmp; + decode(compat_read_error, bl); + decode(tmp, bl); + stat_error = tmp; + if (struct_v >= 8) { + decode(tmp, bl); + read_error = tmp; + decode(tmp, bl); + ec_hash_mismatch = tmp; + decode(tmp, bl); + ec_size_mismatch = tmp; + } + // If older encoder found a read_error, set read_error + if (compat_read_error && !read_error && !ec_hash_mismatch && !ec_size_mismatch) + read_error = true; + if (struct_v >= 9) { + decode(tmp, bl); + large_omap_object_found = tmp; + decode(large_omap_object_key_count, bl); + decode(large_omap_object_value_size, bl); + } + if (struct_v >= 10) { + decode(object_omap_bytes, bl); + decode(object_omap_keys, bl); + } + DECODE_FINISH(bl); +} + +void ScrubMap::object::dump(Formatter *f) const +{ + f->dump_int("size", size); + f->dump_int("negative", negative); + f->open_array_section("attrs"); + for (auto p = attrs.cbegin(); p != attrs.cend(); ++p) { + f->open_object_section("attr"); + f->dump_string("name", p->first); + f->dump_int("length", p->second.length()); + f->close_section(); + } + f->close_section(); +} + +void ScrubMap::object::generate_test_instances(list& o) +{ + o.push_back(new object); + o.push_back(new object); + o.back()->negative = true; + o.push_back(new object); + o.back()->size = 123; + o.back()->attrs["foo"] = ceph::buffer::copy("foo", 3); + o.back()->attrs["bar"] = ceph::buffer::copy("barval", 6); +} + +// -- OSDOp -- + +ostream& operator<<(ostream& out, const OSDOp& op) +{ + out << ceph_osd_op_name(op.op.op); + if (ceph_osd_op_type_data(op.op.op)) { + // data extent + switch (op.op.op) { + case CEPH_OSD_OP_ASSERT_VER: + out << " v" << op.op.assert_ver.ver; + break; + case CEPH_OSD_OP_TRUNCATE: + out 
<< " " << op.op.extent.offset; + break; + case CEPH_OSD_OP_MASKTRUNC: + case CEPH_OSD_OP_TRIMTRUNC: + out << " " << op.op.extent.truncate_seq << "@" + << (int64_t)op.op.extent.truncate_size; + break; + case CEPH_OSD_OP_ROLLBACK: + out << " " << snapid_t(op.op.snap.snapid); + break; + case CEPH_OSD_OP_WATCH: + out << " " << ceph_osd_watch_op_name(op.op.watch.op) + << " cookie " << op.op.watch.cookie; + if (op.op.watch.gen) + out << " gen " << op.op.watch.gen; + break; + case CEPH_OSD_OP_NOTIFY: + out << " cookie " << op.op.notify.cookie; + break; + case CEPH_OSD_OP_COPY_GET: + out << " max " << op.op.copy_get.max; + break; + case CEPH_OSD_OP_COPY_FROM: + out << " ver " << op.op.copy_from.src_version; + break; + case CEPH_OSD_OP_SETALLOCHINT: + out << " object_size " << op.op.alloc_hint.expected_object_size + << " write_size " << op.op.alloc_hint.expected_write_size; + break; + case CEPH_OSD_OP_READ: + case CEPH_OSD_OP_SPARSE_READ: + case CEPH_OSD_OP_SYNC_READ: + case CEPH_OSD_OP_WRITE: + case CEPH_OSD_OP_WRITEFULL: + case CEPH_OSD_OP_ZERO: + case CEPH_OSD_OP_APPEND: + case CEPH_OSD_OP_MAPEXT: + case CEPH_OSD_OP_CMPEXT: + out << " " << op.op.extent.offset << "~" << op.op.extent.length; + if (op.op.extent.truncate_seq) + out << " [" << op.op.extent.truncate_seq << "@" + << (int64_t)op.op.extent.truncate_size << "]"; + if (op.op.flags) + out << " [" << ceph_osd_op_flag_string(op.op.flags) << "]"; + default: + // don't show any arg info + break; + } + } else if (ceph_osd_op_type_attr(op.op.op)) { + // xattr name + if (op.op.xattr.name_len && op.indata.length()) { + out << " "; + op.indata.write(0, op.op.xattr.name_len, out); + } + if (op.op.xattr.value_len) + out << " (" << op.op.xattr.value_len << ")"; + if (op.op.op == CEPH_OSD_OP_CMPXATTR) + out << " op " << (int)op.op.xattr.cmp_op + << " mode " << (int)op.op.xattr.cmp_mode; + } else if (ceph_osd_op_type_exec(op.op.op)) { + // class.method + if (op.op.cls.class_len && op.indata.length()) { + out << " "; + 
op.indata.write(0, op.op.cls.class_len, out); + out << "."; + op.indata.write(op.op.cls.class_len, op.op.cls.method_len, out); + } + } else if (ceph_osd_op_type_pg(op.op.op)) { + switch (op.op.op) { + case CEPH_OSD_OP_PGLS: + case CEPH_OSD_OP_PGLS_FILTER: + case CEPH_OSD_OP_PGNLS: + case CEPH_OSD_OP_PGNLS_FILTER: + out << " start_epoch " << op.op.pgls.start_epoch; + break; + case CEPH_OSD_OP_PG_HITSET_LS: + break; + case CEPH_OSD_OP_PG_HITSET_GET: + out << " " << utime_t(op.op.hit_set_get.stamp); + break; + case CEPH_OSD_OP_SCRUBLS: + break; + } + } + if (op.indata.length()) { + out << " in=" << op.indata.length() << "b"; + } + if (op.outdata.length()) { + out << " out=" << op.outdata.length() << "b"; + } + return out; +} + + +void OSDOp::split_osd_op_vector_out_data(vector& ops, ceph::buffer::list& in) +{ + auto datap = in.begin(); + for (unsigned i = 0; i < ops.size(); i++) { + if (ops[i].op.payload_len) { + datap.copy(ops[i].op.payload_len, ops[i].outdata); + } + } +} + +void OSDOp::merge_osd_op_vector_out_data(vector& ops, ceph::buffer::list& out) +{ + for (unsigned i = 0; i < ops.size(); i++) { + ops[i].op.payload_len = ops[i].outdata.length(); + if (ops[i].outdata.length()) { + out.append(ops[i].outdata); + } + } +} + +int prepare_info_keymap( + CephContext* cct, + map *km, + string *key_to_remove, + epoch_t epoch, + pg_info_t &info, + pg_info_t &last_written_info, + PastIntervals &past_intervals, + bool dirty_big_info, + bool dirty_epoch, + bool try_fast_info, + PerfCounters *logger, + DoutPrefixProvider *dpp) +{ + if (dirty_epoch) { + encode(epoch, (*km)[string(epoch_key)]); + } + + if (logger) + logger->inc(l_osd_pg_info); + + // try to do info efficiently? 
+ if (!dirty_big_info && try_fast_info && + info.last_update > last_written_info.last_update) { + pg_fast_info_t fast; + fast.populate_from(info); + bool did = fast.try_apply_to(&last_written_info); + ceph_assert(did); // we verified last_update increased above + if (info == last_written_info) { + encode(fast, (*km)[string(fastinfo_key)]); + if (logger) + logger->inc(l_osd_pg_fastinfo); + return 0; + } + if (dpp) { + ldpp_dout(dpp, 30) << __func__ << " fastinfo failed, info:\n"; + { + JSONFormatter jf(true); + jf.dump_object("info", info); + jf.flush(*_dout); + } + { + *_dout << "\nlast_written_info:\n"; + JSONFormatter jf(true); + jf.dump_object("last_written_info", last_written_info); + jf.flush(*_dout); + } + *_dout << dendl; + } + } else if (info.last_update <= last_written_info.last_update) { + // clean up any potentially stale fastinfo key resulting from last_update + // not moving forwards (e.g., a backwards jump during peering) + *key_to_remove = fastinfo_key; + } + + last_written_info = info; + + // info. store purged_snaps separately. 
+ interval_set purged_snaps; + purged_snaps.swap(info.purged_snaps); + encode(info, (*km)[string(info_key)]); + purged_snaps.swap(info.purged_snaps); + + if (dirty_big_info) { + // potentially big stuff + bufferlist& bigbl = (*km)[string(biginfo_key)]; + encode(past_intervals, bigbl); + encode(info.purged_snaps, bigbl); + //dout(20) << "write_info bigbl " << bigbl.length() << dendl; + if (logger) + logger->inc(l_osd_pg_biginfo); + } + + return 0; +} + +void create_pg_collection( + ceph::os::Transaction& t, spg_t pgid, int bits) +{ + coll_t coll(pgid); + t.create_collection(coll, bits); +} + +void init_pg_ondisk( + ceph::os::Transaction& t, + spg_t pgid, + const pg_pool_t *pool) +{ + coll_t coll(pgid); + if (pool) { + // Give a hint to the PG collection + bufferlist hint; + uint32_t pg_num = pool->get_pg_num(); + uint64_t expected_num_objects_pg = pool->expected_num_objects / pg_num; + encode(pg_num, hint); + encode(expected_num_objects_pg, hint); + uint32_t hint_type = ceph::os::Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS; + t.collection_hint(coll, hint_type, hint); + } + + ghobject_t pgmeta_oid(pgid.make_pgmeta_oid()); + t.touch(coll, pgmeta_oid); + map values; + __u8 struct_v = pg_latest_struct_v; + encode(struct_v, values[string(infover_key)]); + t.omap_setkeys(coll, pgmeta_oid, values); +} + +PGLSFilter::PGLSFilter() : cct(nullptr) +{ +} + +PGLSFilter::~PGLSFilter() +{ +} + +int PGLSPlainFilter::init(ceph::bufferlist::const_iterator ¶ms) +{ + try { + decode(xattr, params); + decode(val, params); + } catch (ceph::buffer::error &e) { + return -EINVAL; + } + return 0; +} + +bool PGLSPlainFilter::filter(const hobject_t& obj, + const ceph::bufferlist& xattr_data) const +{ + return xattr_data.contents_equal(val.c_str(), val.size()); +} -- cgit v1.2.3