From e6918187568dbd01842d8d1d2c808ce16a894239 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sun, 21 Apr 2024 13:54:28 +0200 Subject: Adding upstream version 18.2.2. Signed-off-by: Daniel Baumann --- src/rgw/driver/rados/rgw_log_backing.cc | 708 ++++++++++++++++++++++++++++++++ 1 file changed, 708 insertions(+) create mode 100644 src/rgw/driver/rados/rgw_log_backing.cc (limited to 'src/rgw/driver/rados/rgw_log_backing.cc') diff --git a/src/rgw/driver/rados/rgw_log_backing.cc b/src/rgw/driver/rados/rgw_log_backing.cc new file mode 100644 index 000000000..7c9dafe7e --- /dev/null +++ b/src/rgw/driver/rados/rgw_log_backing.cc @@ -0,0 +1,708 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include "cls/log/cls_log_client.h" +#include "cls/version/cls_version_client.h" + +#include "rgw_log_backing.h" +#include "rgw_tools.h" +#include "cls_fifo_legacy.h" + +using namespace std::chrono_literals; +namespace cb = ceph::buffer; + +static constexpr auto dout_subsys = ceph_subsys_rgw; + +enum class shard_check { dne, omap, fifo, corrupt }; +inline std::ostream& operator <<(std::ostream& m, const shard_check& t) { + switch (t) { + case shard_check::dne: + return m << "shard_check::dne"; + case shard_check::omap: + return m << "shard_check::omap"; + case shard_check::fifo: + return m << "shard_check::fifo"; + case shard_check::corrupt: + return m << "shard_check::corrupt"; + } + + return m << "shard_check::UNKNOWN=" << static_cast(t); +} + +namespace { +/// Return the shard type, and a bool to see whether it has entries. +shard_check +probe_shard(const DoutPrefixProvider *dpp, librados::IoCtx& ioctx, const std::string& oid, + bool& fifo_unsupported, optional_yield y) +{ + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " probing oid=" << oid + << dendl; + if (!fifo_unsupported) { + std::unique_ptr fifo; + auto r = rgw::cls::fifo::FIFO::open(dpp, ioctx, oid, + &fifo, y, + std::nullopt, true); + switch (r) { + case 0: + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << ": oid=" << oid << " is FIFO" + << dendl; + return shard_check::fifo; + + case -ENODATA: + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << ": oid=" << oid << " is empty and therefore OMAP" + << dendl; + return shard_check::omap; + + case -ENOENT: + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << ": oid=" << oid << " does not exist" + << dendl; + return shard_check::dne; + + case -EPERM: + ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << ": FIFO is unsupported, marking." + << dendl; + fifo_unsupported = true; + return shard_check::omap; + + default: + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << ": error probing: r=" << r + << ", oid=" << oid << dendl; + return shard_check::corrupt; + } + } else { + // Since FIFO is unsupported, OMAP is the only alternative + return shard_check::omap; + } +} + +tl::expected +handle_dne(const DoutPrefixProvider *dpp, librados::IoCtx& ioctx, + log_type def, + std::string oid, + bool fifo_unsupported, + optional_yield y) +{ + if (def == log_type::fifo) { + if (fifo_unsupported) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " WARNING: FIFO set as default but not supported by OSD. " + << "Falling back to OMAP." << dendl; + return log_type::omap; + } + std::unique_ptr fifo; + auto r = rgw::cls::fifo::FIFO::create(dpp, ioctx, oid, + &fifo, y, + std::nullopt); + if (r < 0) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " error creating FIFO: r=" << r + << ", oid=" << oid << dendl; + return tl::unexpected(bs::error_code(-r, bs::system_category())); + } + } + return def; +} +} + +tl::expected +log_backing_type(const DoutPrefixProvider *dpp, + librados::IoCtx& ioctx, + log_type def, + int shards, + const fu2::unique_function& get_oid, + optional_yield y) +{ + auto check = shard_check::dne; + bool fifo_unsupported = false; + for (int i = 0; i < shards; ++i) { + auto c = probe_shard(dpp, ioctx, get_oid(i), fifo_unsupported, y); + if (c == shard_check::corrupt) + return tl::unexpected(bs::error_code(EIO, bs::system_category())); + if (c == shard_check::dne) continue; + if (check == shard_check::dne) { + check = c; + continue; + } + + if (check != c) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " clashing types: check=" << check + << ", c=" << c << dendl; + return tl::unexpected(bs::error_code(EIO, bs::system_category())); + } + } + if (check == shard_check::corrupt) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " should be unreachable!" << dendl; + return tl::unexpected(bs::error_code(EIO, bs::system_category())); + } + + if (check == shard_check::dne) + return handle_dne(dpp, ioctx, + def, + get_oid(0), + fifo_unsupported, + y); + + return (check == shard_check::fifo ? log_type::fifo : log_type::omap); +} + +bs::error_code log_remove(const DoutPrefixProvider *dpp, + librados::IoCtx& ioctx, + int shards, + const fu2::unique_function& get_oid, + bool leave_zero, + optional_yield y) +{ + bs::error_code ec; + for (int i = 0; i < shards; ++i) { + auto oid = get_oid(i); + rados::cls::fifo::info info; + uint32_t part_header_size = 0, part_entry_overhead = 0; + + auto r = rgw::cls::fifo::get_meta(dpp, ioctx, oid, std::nullopt, &info, + &part_header_size, &part_entry_overhead, + 0, y, true); + if (r == -ENOENT) continue; + if (r == 0 && info.head_part_num > -1) { + for (auto j = info.tail_part_num; j <= info.head_part_num; ++j) { + librados::ObjectWriteOperation op; + op.remove(); + auto part_oid = info.part_oid(j); + auto subr = rgw_rados_operate(dpp, ioctx, part_oid, &op, null_yield); + if (subr < 0 && subr != -ENOENT) { + if (!ec) + ec = bs::error_code(-subr, bs::system_category()); + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << ": failed removing FIFO part: part_oid=" << part_oid + << ", subr=" << subr << dendl; + } + } + } + if (r < 0 && r != -ENODATA) { + if (!ec) + ec = bs::error_code(-r, bs::system_category()); + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << ": failed checking FIFO part: oid=" << oid + << ", r=" << r << dendl; + } + librados::ObjectWriteOperation op; + if (i == 0 && leave_zero) { + // Leave shard 0 in existence, but remove contents and + // omap. cls_lock stores things in the xattrs. And sync needs to + // rendezvous with locks on generation 0 shard 0. + op.omap_set_header({}); + op.omap_clear(); + op.truncate(0); + } else { + op.remove(); + } + r = rgw_rados_operate(dpp, ioctx, oid, &op, null_yield); + if (r < 0 && r != -ENOENT) { + if (!ec) + ec = bs::error_code(-r, bs::system_category()); + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << ": failed removing shard: oid=" << oid + << ", r=" << r << dendl; + } + } + return ec; +} + +logback_generations::~logback_generations() { + if (watchcookie > 0) { + auto cct = static_cast(ioctx.cct()); + auto r = ioctx.unwatch2(watchcookie); + if (r < 0) { + lderr(cct) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << ": failed unwatching oid=" << oid + << ", r=" << r << dendl; + } + } +} + +bs::error_code logback_generations::setup(const DoutPrefixProvider *dpp, + log_type def, + optional_yield y) noexcept +{ + try { + // First, read. + auto cct = static_cast(ioctx.cct()); + auto res = read(dpp, y); + if (!res && res.error() != bs::errc::no_such_file_or_directory) { + return res.error(); + } + if (res) { + std::unique_lock lock(m); + std::tie(entries_, version) = std::move(*res); + } else { + // Are we the first? Then create generation 0 and the generations + // metadata. + librados::ObjectWriteOperation op; + auto type = log_backing_type(dpp, ioctx, def, shards, + [this](int shard) { + return this->get_oid(0, shard); + }, y); + if (!type) + return type.error(); + + logback_generation l; + l.type = *type; + + std::unique_lock lock(m); + version.ver = 1; + static constexpr auto TAG_LEN = 24; + version.tag.clear(); + append_rand_alpha(cct, version.tag, version.tag, TAG_LEN); + op.create(true); + cls_version_set(op, version); + cb::list bl; + entries_.emplace(0, std::move(l)); + encode(entries_, bl); + lock.unlock(); + + op.write_full(bl); + auto r = rgw_rados_operate(dpp, ioctx, oid, &op, y); + if (r < 0 && r != -EEXIST) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << ": failed writing oid=" << oid + << ", r=" << r << dendl; + bs::system_error(-r, bs::system_category()); + } + // Did someone race us? Then re-read. + if (r != 0) { + res = read(dpp, y); + if (!res) + return res.error(); + if (res->first.empty()) + return bs::error_code(EIO, bs::system_category()); + auto l = res->first.begin()->second; + // In the unlikely event that someone raced us, created + // generation zero, incremented, then erased generation zero, + // don't leave generation zero lying around. + if (l.gen_id != 0) { + auto ec = log_remove(dpp, ioctx, shards, + [this](int shard) { + return this->get_oid(0, shard); + }, true, y); + if (ec) return ec; + } + std::unique_lock lock(m); + std::tie(entries_, version) = std::move(*res); + } + } + // Pass all non-empty generations to the handler + std::unique_lock lock(m); + auto i = lowest_nomempty(entries_); + entries_t e; + std::copy(i, entries_.cend(), + std::inserter(e, e.end())); + m.unlock(); + auto ec = watch(); + if (ec) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << ": failed to re-establish watch, unsafe to continue: oid=" + << oid << ", ec=" << ec.message() << dendl; + } + return handle_init(std::move(e)); + } catch (const std::bad_alloc&) { + return bs::error_code(ENOMEM, bs::system_category()); + } +} + +bs::error_code logback_generations::update(const DoutPrefixProvider *dpp, optional_yield y) noexcept +{ + try { + auto res = read(dpp, y); + if (!res) { + return res.error(); + } + + std::unique_lock l(m); + auto& [es, v] = *res; + if (v == version) { + // Nothing to do! + return {}; + } + + // Check consistency and prepare update + if (es.empty()) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << ": INCONSISTENCY! Read empty update." << dendl; + return bs::error_code(EFAULT, bs::system_category()); + } + auto cur_lowest = lowest_nomempty(entries_); + // Straight up can't happen + assert(cur_lowest != entries_.cend()); + auto new_lowest = lowest_nomempty(es); + if (new_lowest == es.cend()) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << ": INCONSISTENCY! Read update with no active head." << dendl; + return bs::error_code(EFAULT, bs::system_category()); + } + if (new_lowest->first < cur_lowest->first) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << ": INCONSISTENCY! Tail moved wrong way." << dendl; + return bs::error_code(EFAULT, bs::system_category()); + } + + std::optional highest_empty; + if (new_lowest->first > cur_lowest->first && new_lowest != es.begin()) { + --new_lowest; + highest_empty = new_lowest->first; + } + + entries_t new_entries; + + if ((es.end() - 1)->first < (entries_.end() - 1)->first) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << ": INCONSISTENCY! Head moved wrong way." << dendl; + return bs::error_code(EFAULT, bs::system_category()); + } + + if ((es.end() - 1)->first > (entries_.end() - 1)->first) { + auto ei = es.lower_bound((entries_.end() - 1)->first + 1); + std::copy(ei, es.end(), std::inserter(new_entries, new_entries.end())); + } + + // Everything checks out! + + version = v; + entries_ = es; + l.unlock(); + + if (highest_empty) { + auto ec = handle_empty_to(*highest_empty); + if (ec) return ec; + } + + if (!new_entries.empty()) { + auto ec = handle_new_gens(std::move(new_entries)); + if (ec) return ec; + } + } catch (const std::bad_alloc&) { + return bs::error_code(ENOMEM, bs::system_category()); + } + return {}; +} + +auto logback_generations::read(const DoutPrefixProvider *dpp, optional_yield y) noexcept -> + tl::expected, bs::error_code> +{ + try { + librados::ObjectReadOperation op; + std::unique_lock l(m); + cls_version_check(op, version, VER_COND_GE); + l.unlock(); + obj_version v2; + cls_version_read(op, &v2); + cb::list bl; + op.read(0, 0, &bl, nullptr); + auto r = rgw_rados_operate(dpp, ioctx, oid, &op, nullptr, y); + if (r < 0) { + if (r == -ENOENT) { + ldpp_dout(dpp, 5) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << ": oid=" << oid + << " not found" << dendl; + } else { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << ": failed reading oid=" << oid + << ", r=" << r << dendl; + } + return tl::unexpected(bs::error_code(-r, bs::system_category())); + } + auto bi = bl.cbegin(); + entries_t e; + try { + decode(e, bi); + } catch (const cb::error& err) { + return tl::unexpected(err.code()); + } + return std::pair{ std::move(e), std::move(v2) }; + } catch (const std::bad_alloc&) { + return tl::unexpected(bs::error_code(ENOMEM, bs::system_category())); + } +} + +bs::error_code logback_generations::write(const DoutPrefixProvider *dpp, entries_t&& e, + std::unique_lock&& l_, + optional_yield y) noexcept +{ + auto l = std::move(l_); + ceph_assert(l.mutex() == &m && + l.owns_lock()); + try { + librados::ObjectWriteOperation op; + cls_version_check(op, version, VER_COND_GE); + cb::list bl; + encode(e, bl); + op.write_full(bl); + cls_version_inc(op); + auto r = rgw_rados_operate(dpp, ioctx, oid, &op, y); + if (r == 0) { + entries_ = std::move(e); + version.inc(); + return {}; + } + l.unlock(); + if (r < 0 && r != -ECANCELED) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << ": failed reading oid=" << oid + << ", r=" << r << dendl; + return { -r, bs::system_category() }; + } + if (r == -ECANCELED) { + auto ec = update(dpp, y); + if (ec) { + return ec; + } else { + return { ECANCELED, bs::system_category() }; + } + } + } catch (const std::bad_alloc&) { + return { ENOMEM, bs::system_category() }; + } + return {}; +} + + +bs::error_code logback_generations::watch() noexcept { + try { + auto cct = static_cast(ioctx.cct()); + auto r = ioctx.watch2(oid, &watchcookie, this); + if (r < 0) { + lderr(cct) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << ": failed to set watch oid=" << oid + << ", r=" << r << dendl; + return { -r, bs::system_category() }; + } + } catch (const std::bad_alloc&) { + return bs::error_code(ENOMEM, bs::system_category()); + } + return {}; +} + +bs::error_code logback_generations::new_backing(const DoutPrefixProvider *dpp, + log_type type, + optional_yield y) noexcept { + static constexpr auto max_tries = 10; + try { + auto ec = update(dpp, y); + if (ec) return ec; + auto tries = 0; + entries_t new_entries; + do { + std::unique_lock l(m); + auto last = entries_.end() - 1; + if (last->second.type == type) { + // Nothing to be done + return {}; + } + auto newgenid = last->first + 1; + logback_generation newgen; + newgen.gen_id = newgenid; + newgen.type = type; + new_entries.emplace(newgenid, newgen); + auto es = entries_; + es.emplace(newgenid, std::move(newgen)); + ec = write(dpp, std::move(es), std::move(l), y); + ++tries; + } while (ec == bs::errc::operation_canceled && + tries < max_tries); + if (tries >= max_tries) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << ": exhausted retry attempts." << dendl; + return ec; + } + + if (ec) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << ": write failed with ec=" << ec.message() << dendl; + return ec; + } + + cb::list bl, rbl; + + auto r = rgw_rados_notify(dpp, ioctx, oid, bl, 10'000, &rbl, y); + if (r < 0) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << ": notify failed with r=" << r << dendl; + return { -r, bs::system_category() }; + } + ec = handle_new_gens(new_entries); + } catch (const std::bad_alloc&) { + return bs::error_code(ENOMEM, bs::system_category()); + } + return {}; +} + +bs::error_code logback_generations::empty_to(const DoutPrefixProvider *dpp, + uint64_t gen_id, + optional_yield y) noexcept { + static constexpr auto max_tries = 10; + try { + auto ec = update(dpp, y); + if (ec) return ec; + auto tries = 0; + uint64_t newtail = 0; + do { + std::unique_lock l(m); + { + auto last = entries_.end() - 1; + if (gen_id >= last->first) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << ": Attempt to trim beyond the possible." << dendl; + return bs::error_code(EINVAL, bs::system_category()); + } + } + auto es = entries_; + auto ei = es.upper_bound(gen_id); + if (ei == es.begin()) { + // Nothing to be done. + return {}; + } + for (auto i = es.begin(); i < ei; ++i) { + newtail = i->first; + i->second.pruned = ceph::real_clock::now(); + } + ec = write(dpp, std::move(es), std::move(l), y); + ++tries; + } while (ec == bs::errc::operation_canceled && + tries < max_tries); + if (tries >= max_tries) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << ": exhausted retry attempts." << dendl; + return ec; + } + + if (ec) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << ": write failed with ec=" << ec.message() << dendl; + return ec; + } + + cb::list bl, rbl; + + auto r = rgw_rados_notify(dpp, ioctx, oid, bl, 10'000, &rbl, y); + if (r < 0) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << ": notify failed with r=" << r << dendl; + return { -r, bs::system_category() }; + } + ec = handle_empty_to(newtail); + } catch (const std::bad_alloc&) { + return bs::error_code(ENOMEM, bs::system_category()); + } + return {}; +} + +bs::error_code logback_generations::remove_empty(const DoutPrefixProvider *dpp, optional_yield y) noexcept { + static constexpr auto max_tries = 10; + try { + auto ec = update(dpp, y); + if (ec) return ec; + auto tries = 0; + entries_t new_entries; + std::unique_lock l(m); + ceph_assert(!entries_.empty()); + { + auto i = lowest_nomempty(entries_); + if (i == entries_.begin()) { + return {}; + } + } + entries_t es; + auto now = ceph::real_clock::now(); + l.unlock(); + do { + std::copy_if(entries_.cbegin(), entries_.cend(), + std::inserter(es, es.end()), + [now](const auto& e) { + if (!e.second.pruned) + return false; + + auto pruned = *e.second.pruned; + return (now - pruned) >= 1h; + }); + auto es2 = entries_; + for (const auto& [gen_id, e] : es) { + ceph_assert(e.pruned); + auto ec = log_remove(dpp, ioctx, shards, + [this, gen_id = gen_id](int shard) { + return this->get_oid(gen_id, shard); + }, (gen_id == 0), y); + if (ec) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << ": Error pruning: gen_id=" << gen_id + << " ec=" << ec.message() << dendl; + } + if (auto i = es2.find(gen_id); i != es2.end()) { + es2.erase(i); + } + } + l.lock(); + es.clear(); + ec = write(dpp, std::move(es2), std::move(l), y); + ++tries; + } while (ec == bs::errc::operation_canceled && + tries < max_tries); + if (tries >= max_tries) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << ": exhausted retry attempts." << dendl; + return ec; + } + + if (ec) { + ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << ": write failed with ec=" << ec.message() << dendl; + return ec; + } + } catch (const std::bad_alloc&) { + return bs::error_code(ENOMEM, bs::system_category()); + } + return {}; +} + +void logback_generations::handle_notify(uint64_t notify_id, + uint64_t cookie, + uint64_t notifier_id, + bufferlist& bl) +{ + auto cct = static_cast(ioctx.cct()); + const DoutPrefix dp(cct, dout_subsys, "logback generations handle_notify: "); + if (notifier_id != my_id) { + auto ec = update(&dp, null_yield); + if (ec) { + lderr(cct) + << __PRETTY_FUNCTION__ << ":" << __LINE__ + << ": update failed, no one to report to and no safe way to continue." + << dendl; + abort(); + } + } + cb::list rbl; + ioctx.notify_ack(oid, notify_id, watchcookie, rbl); +} + +void logback_generations::handle_error(uint64_t cookie, int err) { + auto cct = static_cast(ioctx.cct()); + auto r = ioctx.unwatch2(watchcookie); + if (r < 0) { + lderr(cct) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << ": failed to set unwatch oid=" << oid + << ", r=" << r << dendl; + } + + auto ec = watch(); + if (ec) { + lderr(cct) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << ": failed to re-establish watch, unsafe to continue: oid=" + << oid << ", ec=" << ec.message() << dendl; + } +} -- cgit v1.2.3