author     Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-21 11:54:28 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-21 11:54:28 +0000
commit     e6918187568dbd01842d8d1d2c808ce16a894239 (patch)
tree       64f88b554b444a49f656b6c656111a145cbbaa28 /src/rgw/driver/rados
parent     Initial commit. (diff)
Adding upstream version 18.2.2. (upstream/18.2.2)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/rgw/driver/rados')
-rw-r--r--  src/rgw/driver/rados/cls_fifo_legacy.cc  2539
-rw-r--r--  src/rgw/driver/rados/cls_fifo_legacy.h  334
-rw-r--r--  src/rgw/driver/rados/config/impl.cc  129
-rw-r--r--  src/rgw/driver/rados/config/impl.h  139
-rw-r--r--  src/rgw/driver/rados/config/period.cc  230
-rw-r--r--  src/rgw/driver/rados/config/period_config.cc  55
-rw-r--r--  src/rgw/driver/rados/config/realm.cc  364
-rw-r--r--  src/rgw/driver/rados/config/store.cc  52
-rw-r--r--  src/rgw/driver/rados/config/store.h  182
-rw-r--r--  src/rgw/driver/rados/config/zone.cc  312
-rw-r--r--  src/rgw/driver/rados/config/zonegroup.cc  315
-rw-r--r--  src/rgw/driver/rados/rgw_bucket.cc  3316
-rw-r--r--  src/rgw/driver/rados/rgw_bucket.h  766
-rw-r--r--  src/rgw/driver/rados/rgw_bucket_sync.cc  1018
-rw-r--r--  src/rgw/driver/rados/rgw_bucket_sync.h  416
-rw-r--r--  src/rgw/driver/rados/rgw_cr_rados.cc  1165
-rw-r--r--  src/rgw/driver/rados/rgw_cr_rados.h  1647
-rw-r--r--  src/rgw/driver/rados/rgw_cr_tools.cc  292
-rw-r--r--  src/rgw/driver/rados/rgw_cr_tools.h  85
-rw-r--r--  src/rgw/driver/rados/rgw_d3n_datacache.cc  369
-rw-r--r--  src/rgw/driver/rados/rgw_d3n_datacache.h  259
-rw-r--r--  src/rgw/driver/rados/rgw_data_sync.cc  6762
-rw-r--r--  src/rgw/driver/rados/rgw_data_sync.h  868
-rw-r--r--  src/rgw/driver/rados/rgw_datalog.cc  1090
-rw-r--r--  src/rgw/driver/rados/rgw_datalog.h  394
-rw-r--r--  src/rgw/driver/rados/rgw_datalog_notify.cc  76
-rw-r--r--  src/rgw/driver/rados/rgw_datalog_notify.h  31
-rw-r--r--  src/rgw/driver/rados/rgw_etag_verifier.cc  191
-rw-r--r--  src/rgw/driver/rados/rgw_etag_verifier.h  90
-rw-r--r--  src/rgw/driver/rados/rgw_gc.cc  811
-rw-r--r--  src/rgw/driver/rados/rgw_gc.h  82
-rw-r--r--  src/rgw/driver/rados/rgw_gc_log.cc  55
-rw-r--r--  src/rgw/driver/rados/rgw_lc_tier.cc  1310
-rw-r--r--  src/rgw/driver/rados/rgw_lc_tier.h  51
-rw-r--r--  src/rgw/driver/rados/rgw_log_backing.cc  708
-rw-r--r--  src/rgw/driver/rados/rgw_log_backing.h  394
-rw-r--r--  src/rgw/driver/rados/rgw_metadata.cc  233
-rw-r--r--  src/rgw/driver/rados/rgw_metadata.h  298
-rw-r--r--  src/rgw/driver/rados/rgw_notify.cc  1023
-rw-r--r--  src/rgw/driver/rados/rgw_notify.h  121
-rw-r--r--  src/rgw/driver/rados/rgw_obj_manifest.cc  409
-rw-r--r--  src/rgw/driver/rados/rgw_obj_manifest.h  622
-rw-r--r--  src/rgw/driver/rados/rgw_object_expirer_core.cc  442
-rw-r--r--  src/rgw/driver/rados/rgw_object_expirer_core.h  146
-rw-r--r--  src/rgw/driver/rados/rgw_otp.cc  211
-rw-r--r--  src/rgw/driver/rados/rgw_otp.h  110
-rw-r--r--  src/rgw/driver/rados/rgw_period.cc  324
-rw-r--r--  src/rgw/driver/rados/rgw_pubsub_push.cc  460
-rw-r--r--  src/rgw/driver/rados/rgw_pubsub_push.h  47
-rw-r--r--  src/rgw/driver/rados/rgw_putobj_processor.cc  761
-rw-r--r--  src/rgw/driver/rados/rgw_putobj_processor.h  282
-rw-r--r--  src/rgw/driver/rados/rgw_rados.cc  10076
-rw-r--r--  src/rgw/driver/rados/rgw_rados.h  1661
-rw-r--r--  src/rgw/driver/rados/rgw_reshard.cc  1419
-rw-r--r--  src/rgw/driver/rados/rgw_reshard.h  274
-rw-r--r--  src/rgw/driver/rados/rgw_rest_bucket.cc  413
-rw-r--r--  src/rgw/driver/rados/rgw_rest_bucket.h  36
-rw-r--r--  src/rgw/driver/rados/rgw_rest_log.cc  1268
-rw-r--r--  src/rgw/driver/rados/rgw_rest_log.h  337
-rw-r--r--  src/rgw/driver/rados/rgw_rest_pubsub.h  38
-rw-r--r--  src/rgw/driver/rados/rgw_rest_realm.cc  376
-rw-r--r--  src/rgw/driver/rados/rgw_rest_realm.h  16
-rw-r--r--  src/rgw/driver/rados/rgw_rest_user.cc  1137
-rw-r--r--  src/rgw/driver/rados/rgw_rest_user.h  36
-rw-r--r--  src/rgw/driver/rados/rgw_sal_rados.cc  3846
-rw-r--r--  src/rgw/driver/rados/rgw_sal_rados.h  978
-rw-r--r--  src/rgw/driver/rados/rgw_service.cc  476
-rw-r--r--  src/rgw/driver/rados/rgw_service.h  215
-rw-r--r--  src/rgw/driver/rados/rgw_sync.cc  2568
-rw-r--r--  src/rgw/driver/rados/rgw_sync.h  547
-rw-r--r--  src/rgw/driver/rados/rgw_sync_counters.cc  28
-rw-r--r--  src/rgw/driver/rados/rgw_sync_counters.h  25
-rw-r--r--  src/rgw/driver/rados/rgw_sync_error_repo.cc  205
-rw-r--r--  src/rgw/driver/rados/rgw_sync_error_repo.h  59
-rw-r--r--  src/rgw/driver/rados/rgw_sync_module.cc  87
-rw-r--r--  src/rgw/driver/rados/rgw_sync_module.h  203
-rw-r--r--  src/rgw/driver/rados/rgw_sync_module_aws.cc  1823
-rw-r--r--  src/rgw/driver/rados/rgw_sync_module_aws.h  108
-rw-r--r--  src/rgw/driver/rados/rgw_sync_module_es.cc  962
-rw-r--r--  src/rgw/driver/rados/rgw_sync_module_es.h  59
-rw-r--r--  src/rgw/driver/rados/rgw_sync_module_es_rest.cc  428
-rw-r--r--  src/rgw/driver/rados/rgw_sync_module_es_rest.h  18
-rw-r--r--  src/rgw/driver/rados/rgw_sync_module_log.cc  76
-rw-r--r--  src/rgw/driver/rados/rgw_sync_module_log.h  15
-rw-r--r--  src/rgw/driver/rados/rgw_sync_trace.cc  290
-rw-r--r--  src/rgw/driver/rados/rgw_sync_trace.h  141
-rw-r--r--  src/rgw/driver/rados/rgw_tools.cc  437
-rw-r--r--  src/rgw/driver/rados/rgw_tools.h  276
-rw-r--r--  src/rgw/driver/rados/rgw_trim_bilog.cc  1445
-rw-r--r--  src/rgw/driver/rados/rgw_trim_bilog.h  121
-rw-r--r--  src/rgw/driver/rados/rgw_trim_datalog.cc  252
-rw-r--r--  src/rgw/driver/rados/rgw_trim_datalog.h  28
-rw-r--r--  src/rgw/driver/rados/rgw_trim_mdlog.cc  795
-rw-r--r--  src/rgw/driver/rados/rgw_trim_mdlog.h  25
-rw-r--r--  src/rgw/driver/rados/rgw_user.cc  2776
-rw-r--r--  src/rgw/driver/rados/rgw_user.h  885
-rw-r--r--  src/rgw/driver/rados/rgw_zone.cc  1288
-rw-r--r--  src/rgw/driver/rados/rgw_zone.h  943
98 files changed, 71901 insertions, 0 deletions
diff --git a/src/rgw/driver/rados/cls_fifo_legacy.cc b/src/rgw/driver/rados/cls_fifo_legacy.cc
new file mode 100644
index 000000000..f5bb485fa
--- /dev/null
+++ b/src/rgw/driver/rados/cls_fifo_legacy.cc
@@ -0,0 +1,2539 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2020 Red Hat <contact@redhat.com>
+ * Author: Adam C. Emerson
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <algorithm>
+#include <cstdint>
+#include <numeric>
+#include <optional>
+#include <string_view>
+
+#include <fmt/format.h>
+
+#include "include/rados/librados.hpp"
+
+#include "include/buffer.h"
+
+#include "common/async/yield_context.h"
+#include "common/random_string.h"
+
+#include "cls/fifo/cls_fifo_types.h"
+#include "cls/fifo/cls_fifo_ops.h"
+
+#include "cls_fifo_legacy.h"
+
+namespace rgw::cls::fifo {
+namespace cb = ceph::buffer;
+namespace fifo = rados::cls::fifo;
+
+using ceph::from_error_code;
+
+inline constexpr auto MAX_RACE_RETRIES = 10;
+
+void create_meta(lr::ObjectWriteOperation* op,
+ std::string_view id,
+ std::optional<fifo::objv> objv,
+ std::optional<std::string_view> oid_prefix,
+ bool exclusive,
+ std::uint64_t max_part_size,
+ std::uint64_t max_entry_size)
+{
+ fifo::op::create_meta cm;
+
+ cm.id = id;
+ cm.version = objv;
+ cm.oid_prefix = oid_prefix;
+ cm.max_part_size = max_part_size;
+ cm.max_entry_size = max_entry_size;
+ cm.exclusive = exclusive;
+
+ cb::list in;
+ encode(cm, in);
+ op->exec(fifo::op::CLASS, fifo::op::CREATE_META, in);
+}
+
+int get_meta(const DoutPrefixProvider *dpp, lr::IoCtx& ioctx, const std::string& oid,
+ std::optional<fifo::objv> objv, fifo::info* info,
+ std::uint32_t* part_header_size,
+ std::uint32_t* part_entry_overhead,
+ uint64_t tid, optional_yield y,
+ bool probe)
+{
+ lr::ObjectReadOperation op;
+ fifo::op::get_meta gm;
+ gm.version = objv;
+ cb::list in;
+ encode(gm, in);
+ cb::list bl;
+
+ op.exec(fifo::op::CLASS, fifo::op::GET_META, in,
+ &bl, nullptr);
+ auto r = rgw_rados_operate(dpp, ioctx, oid, &op, nullptr, y);
+ if (r >= 0) try {
+ fifo::op::get_meta_reply reply;
+ auto iter = bl.cbegin();
+ decode(reply, iter);
+ if (info) *info = std::move(reply.info);
+ if (part_header_size) *part_header_size = reply.part_header_size;
+ if (part_entry_overhead)
+ *part_entry_overhead = reply.part_entry_overhead;
+ } catch (const cb::error& err) {
+ ldpp_dout(dpp, -1)
+ << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " decode failed: " << err.what()
+ << " tid=" << tid << dendl;
+ r = from_error_code(err.code());
+ } else if (!(probe && (r == -ENOENT || r == -ENODATA))) {
+ ldpp_dout(dpp, -1)
+ << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " fifo::op::GET_META failed r=" << r << " tid=" << tid
+ << dendl;
+ }
+ return r;
+};
+
+namespace {
+void update_meta(lr::ObjectWriteOperation* op, const fifo::objv& objv,
+ const fifo::update& update)
+{
+ fifo::op::update_meta um;
+
+ um.version = objv;
+ um.tail_part_num = update.tail_part_num();
+ um.head_part_num = update.head_part_num();
+ um.min_push_part_num = update.min_push_part_num();
+ um.max_push_part_num = update.max_push_part_num();
+ um.journal_entries_add = std::move(update).journal_entries_add();
+ um.journal_entries_rm = std::move(update).journal_entries_rm();
+
+ cb::list in;
+ encode(um, in);
+ op->exec(fifo::op::CLASS, fifo::op::UPDATE_META, in);
+}
+
+void part_init(lr::ObjectWriteOperation* op, fifo::data_params params)
+{
+ fifo::op::init_part ip;
+
+ ip.params = params;
+
+ cb::list in;
+ encode(ip, in);
+ op->exec(fifo::op::CLASS, fifo::op::INIT_PART, in);
+}
+
+int push_part(const DoutPrefixProvider *dpp, lr::IoCtx& ioctx, const std::string& oid,
+ std::deque<cb::list> data_bufs, std::uint64_t tid,
+ optional_yield y)
+{
+ lr::ObjectWriteOperation op;
+ fifo::op::push_part pp;
+
+ op.assert_exists();
+
+ pp.data_bufs = data_bufs;
+ pp.total_len = 0;
+
+ for (const auto& bl : data_bufs)
+ pp.total_len += bl.length();
+
+ cb::list in;
+ encode(pp, in);
+ auto retval = 0;
+ op.exec(fifo::op::CLASS, fifo::op::PUSH_PART, in, nullptr, &retval);
+ auto r = rgw_rados_operate(dpp, ioctx, oid, &op, y, lr::OPERATION_RETURNVEC);
+ if (r < 0) {
+ ldpp_dout(dpp, -1)
+ << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " fifo::op::PUSH_PART failed r=" << r
+ << " tid=" << tid << dendl;
+ return r;
+ }
+ if (retval < 0) {
+ ldpp_dout(dpp, -1)
+ << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " error handling response retval=" << retval
+ << " tid=" << tid << dendl;
+ }
+ return retval;
+}
+
+void push_part(lr::IoCtx& ioctx, const std::string& oid,
+ std::deque<cb::list> data_bufs, std::uint64_t tid,
+ lr::AioCompletion* c)
+{
+ lr::ObjectWriteOperation op;
+ fifo::op::push_part pp;
+
+ pp.data_bufs = data_bufs;
+ pp.total_len = 0;
+
+ for (const auto& bl : data_bufs)
+ pp.total_len += bl.length();
+
+ cb::list in;
+ encode(pp, in);
+ op.exec(fifo::op::CLASS, fifo::op::PUSH_PART, in);
+ auto r = ioctx.aio_operate(oid, c, &op, lr::OPERATION_RETURNVEC);
+ ceph_assert(r >= 0);
+}
+
+void trim_part(lr::ObjectWriteOperation* op,
+ std::uint64_t ofs, bool exclusive)
+{
+ fifo::op::trim_part tp;
+
+ tp.ofs = ofs;
+ tp.exclusive = exclusive;
+
+ cb::list in;
+ encode(tp, in);
+ op->exec(fifo::op::CLASS, fifo::op::TRIM_PART, in);
+}
+
+int list_part(const DoutPrefixProvider *dpp, lr::IoCtx& ioctx, const std::string& oid,
+ std::uint64_t ofs, std::uint64_t max_entries,
+ std::vector<fifo::part_list_entry>* entries,
+ bool* more, bool* full_part,
+ std::uint64_t tid, optional_yield y)
+{
+ lr::ObjectReadOperation op;
+ fifo::op::list_part lp;
+
+ lp.ofs = ofs;
+ lp.max_entries = max_entries;
+
+ cb::list in;
+ encode(lp, in);
+ cb::list bl;
+ op.exec(fifo::op::CLASS, fifo::op::LIST_PART, in, &bl, nullptr);
+ auto r = rgw_rados_operate(dpp, ioctx, oid, &op, nullptr, y);
+ if (r >= 0) try {
+ fifo::op::list_part_reply reply;
+ auto iter = bl.cbegin();
+ decode(reply, iter);
+ if (entries) *entries = std::move(reply.entries);
+ if (more) *more = reply.more;
+ if (full_part) *full_part = reply.full_part;
+ } catch (const cb::error& err) {
+ ldpp_dout(dpp, -1)
+ << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " decode failed: " << err.what()
+ << " tid=" << tid << dendl;
+ r = from_error_code(err.code());
+ } else if (r != -ENOENT) {
+ ldpp_dout(dpp, -1)
+ << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " fifo::op::LIST_PART failed r=" << r << " tid=" << tid
+ << dendl;
+ }
+ return r;
+}
+
+struct list_entry_completion : public lr::ObjectOperationCompletion {
+ CephContext* cct;
+ int* r_out;
+ std::vector<fifo::part_list_entry>* entries;
+ bool* more;
+ bool* full_part;
+ std::uint64_t tid;
+
+ list_entry_completion(CephContext* cct, int* r_out, std::vector<fifo::part_list_entry>* entries,
+ bool* more, bool* full_part, std::uint64_t tid)
+ : cct(cct), r_out(r_out), entries(entries), more(more),
+ full_part(full_part), tid(tid) {}
+ virtual ~list_entry_completion() = default;
+ void handle_completion(int r, bufferlist& bl) override {
+ if (r >= 0) try {
+ fifo::op::list_part_reply reply;
+ auto iter = bl.cbegin();
+ decode(reply, iter);
+ if (entries) *entries = std::move(reply.entries);
+ if (more) *more = reply.more;
+ if (full_part) *full_part = reply.full_part;
+ } catch (const cb::error& err) {
+ lderr(cct)
+ << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " decode failed: " << err.what()
+ << " tid=" << tid << dendl;
+ r = from_error_code(err.code());
+ } else if (r < 0) {
+ lderr(cct)
+ << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " fifo::op::LIST_PART failed r=" << r << " tid=" << tid
+ << dendl;
+ }
+ if (r_out) *r_out = r;
+ }
+};
+
+lr::ObjectReadOperation list_part(CephContext* cct,
+ std::uint64_t ofs,
+ std::uint64_t max_entries,
+ int* r_out,
+ std::vector<fifo::part_list_entry>* entries,
+ bool* more, bool* full_part,
+ std::uint64_t tid)
+{
+ lr::ObjectReadOperation op;
+ fifo::op::list_part lp;
+
+ lp.ofs = ofs;
+ lp.max_entries = max_entries;
+
+ cb::list in;
+ encode(lp, in);
+ op.exec(fifo::op::CLASS, fifo::op::LIST_PART, in,
+ new list_entry_completion(cct, r_out, entries, more, full_part,
+ tid));
+ return op;
+}
+
+int get_part_info(const DoutPrefixProvider *dpp, lr::IoCtx& ioctx, const std::string& oid,
+ fifo::part_header* header,
+ std::uint64_t tid, optional_yield y)
+{
+ lr::ObjectReadOperation op;
+ fifo::op::get_part_info gpi;
+
+ cb::list in;
+ cb::list bl;
+ encode(gpi, in);
+ op.exec(fifo::op::CLASS, fifo::op::GET_PART_INFO, in, &bl, nullptr);
+ auto r = rgw_rados_operate(dpp, ioctx, oid, &op, nullptr, y);
+ if (r >= 0) try {
+ fifo::op::get_part_info_reply reply;
+ auto iter = bl.cbegin();
+ decode(reply, iter);
+ if (header) *header = std::move(reply.header);
+ } catch (const cb::error& err) {
+ ldpp_dout(dpp, -1)
+ << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " decode failed: " << err.what()
+ << " tid=" << tid << dendl;
+ r = from_error_code(err.code());
+ } else {
+ ldpp_dout(dpp, -1)
+ << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " fifo::op::GET_PART_INFO failed r=" << r << " tid=" << tid
+ << dendl;
+ }
+ return r;
+}
+
+struct partinfo_completion : public lr::ObjectOperationCompletion {
+ CephContext* cct;
+ int* rp;
+ fifo::part_header* h;
+ std::uint64_t tid;
+ partinfo_completion(CephContext* cct, int* rp, fifo::part_header* h,
+ std::uint64_t tid) :
+ cct(cct), rp(rp), h(h), tid(tid) {
+ }
+ virtual ~partinfo_completion() = default;
+ void handle_completion(int r, bufferlist& bl) override {
+ if (r >= 0) try {
+ fifo::op::get_part_info_reply reply;
+ auto iter = bl.cbegin();
+ decode(reply, iter);
+ if (h) *h = std::move(reply.header);
+ } catch (const cb::error& err) {
+ r = from_error_code(err.code());
+ lderr(cct) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " decode failed: " << err.what()
+ << " tid=" << tid << dendl;
+ } else {
+ lderr(cct) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " fifo::op::GET_PART_INFO failed r=" << r << " tid=" << tid
+ << dendl;
+ }
+ if (rp) {
+ *rp = r;
+ }
+ }
+};
+
+lr::ObjectReadOperation get_part_info(CephContext* cct,
+ fifo::part_header* header,
+ std::uint64_t tid, int* r = 0)
+{
+ lr::ObjectReadOperation op;
+ fifo::op::get_part_info gpi;
+
+ cb::list in;
+ cb::list bl;
+ encode(gpi, in);
+ op.exec(fifo::op::CLASS, fifo::op::GET_PART_INFO, in,
+ new partinfo_completion(cct, r, header, tid));
+ return op;
+}
+}
+
+std::optional<marker> FIFO::to_marker(std::string_view s)
+{
+ marker m;
+ if (s.empty()) {
+ m.num = info.tail_part_num;
+ m.ofs = 0;
+ return m;
+ }
+
+ auto pos = s.find(':');
+ if (pos == s.npos) {
+ return std::nullopt;
+ }
+
+ auto num = s.substr(0, pos);
+ auto ofs = s.substr(pos + 1);
+
+ auto n = ceph::parse<decltype(m.num)>(num);
+ if (!n) {
+ return std::nullopt;
+ }
+ m.num = *n;
+ auto o = ceph::parse<decltype(m.ofs)>(ofs);
+ if (!o) {
+ return std::nullopt;
+ }
+ m.ofs = *o;
+ return m;
+}
+
+int FIFO::apply_update(const DoutPrefixProvider *dpp,
+ fifo::info* info,
+ const fifo::objv& objv,
+ const fifo::update& update,
+ std::uint64_t tid)
+{
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " entering: tid=" << tid << dendl;
+ std::unique_lock l(m);
+ if (objv != info->version) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " version mismatch, canceling: tid=" << tid << dendl;
+ return -ECANCELED;
+ }
+
+ info->apply_update(update);
+ return {};
+}
+
+int FIFO::_update_meta(const DoutPrefixProvider *dpp, const fifo::update& update,
+ fifo::objv version, bool* pcanceled,
+ std::uint64_t tid, optional_yield y)
+{
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " entering: tid=" << tid << dendl;
+ lr::ObjectWriteOperation op;
+ bool canceled = false;
+ update_meta(&op, version, update);
+ auto r = rgw_rados_operate(dpp, ioctx, oid, &op, y);
+ if (r >= 0 || r == -ECANCELED) {
+ canceled = (r == -ECANCELED);
+ if (!canceled) {
+ r = apply_update(dpp, &info, version, update, tid);
+ if (r < 0) canceled = true;
+ }
+ if (canceled) {
+ r = read_meta(dpp, tid, y);
+ canceled = r < 0 ? false : true;
+ }
+ }
+ if (pcanceled) *pcanceled = canceled;
+ if (canceled) {
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " canceled: tid=" << tid << dendl;
+ }
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " returning error: r=" << r << " tid=" << tid << dendl;
+ }
+ return r;
+}
+
+struct Updater : public Completion<Updater> {
+ FIFO* fifo;
+ fifo::update update;
+ fifo::objv version;
+ bool reread = false;
+ bool* pcanceled = nullptr;
+ std::uint64_t tid;
+ Updater(const DoutPrefixProvider *dpp, FIFO* fifo, lr::AioCompletion* super,
+ const fifo::update& update, fifo::objv version,
+ bool* pcanceled, std::uint64_t tid)
+ : Completion(dpp, super), fifo(fifo), update(update), version(version),
+ pcanceled(pcanceled) {}
+
+ void handle(const DoutPrefixProvider *dpp, Ptr&& p, int r) {
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " entering: tid=" << tid << dendl;
+ if (reread)
+ handle_reread(dpp, std::move(p), r);
+ else
+ handle_update(dpp, std::move(p), r);
+ }
+
+ void handle_update(const DoutPrefixProvider *dpp, Ptr&& p, int r) {
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " handling async update_meta: tid="
+ << tid << dendl;
+ if (r < 0 && r != -ECANCELED) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " update failed: r=" << r << " tid=" << tid << dendl;
+ complete(std::move(p), r);
+ return;
+ }
+ bool canceled = (r == -ECANCELED);
+ if (!canceled) {
+ int r = fifo->apply_update(dpp, &fifo->info, version, update, tid);
+ if (r < 0) {
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " update failed, marking canceled: r=" << r
+ << " tid=" << tid << dendl;
+ canceled = true;
+ }
+ }
+ if (canceled) {
+ reread = true;
+ fifo->read_meta(dpp, tid, call(std::move(p)));
+ return;
+ }
+ if (pcanceled)
+ *pcanceled = false;
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " completing: tid=" << tid << dendl;
+ complete(std::move(p), 0);
+ }
+
+ void handle_reread(const DoutPrefixProvider *dpp, Ptr&& p, int r) {
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " handling async read_meta: tid="
+ << tid << dendl;
+ if (r < 0 && pcanceled) {
+ *pcanceled = false;
+ } else if (r >= 0 && pcanceled) {
+ *pcanceled = true;
+ }
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " failed dispatching read_meta: r=" << r << " tid="
+ << tid << dendl;
+ } else {
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " completing: tid=" << tid << dendl;
+ }
+ complete(std::move(p), r);
+ }
+};
+
+void FIFO::_update_meta(const DoutPrefixProvider *dpp, const fifo::update& update,
+ fifo::objv version, bool* pcanceled,
+ std::uint64_t tid, lr::AioCompletion* c)
+{
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " entering: tid=" << tid << dendl;
+ lr::ObjectWriteOperation op;
+ update_meta(&op, info.version, update);
+ auto updater = std::make_unique<Updater>(dpp, this, c, update, version, pcanceled,
+ tid);
+ auto r = ioctx.aio_operate(oid, Updater::call(std::move(updater)), &op);
+ assert(r >= 0);
+}
+
+int FIFO::create_part(const DoutPrefixProvider *dpp, int64_t part_num, std::uint64_t tid,
+ optional_yield y)
+{
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " entering: tid=" << tid << dendl;
+ lr::ObjectWriteOperation op;
+ op.create(false); /* We don't need exclusivity, part_init ensures
+ we're creating from the same journal entry. */
+ std::unique_lock l(m);
+ part_init(&op, info.params);
+ auto oid = info.part_oid(part_num);
+ l.unlock();
+ auto r = rgw_rados_operate(dpp, ioctx, oid, &op, y);
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " part_init failed: r=" << r << " tid="
+ << tid << dendl;
+ }
+ return r;
+}
+
+int FIFO::remove_part(const DoutPrefixProvider *dpp, int64_t part_num, std::uint64_t tid,
+ optional_yield y)
+{
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " entering: tid=" << tid << dendl;
+ lr::ObjectWriteOperation op;
+ op.remove();
+ std::unique_lock l(m);
+ auto oid = info.part_oid(part_num);
+ l.unlock();
+ auto r = rgw_rados_operate(dpp, ioctx, oid, &op, y);
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " remove failed: r=" << r << " tid="
+ << tid << dendl;
+ }
+ return r;
+}
+
+int FIFO::process_journal(const DoutPrefixProvider *dpp, std::uint64_t tid, optional_yield y)
+{
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " entering: tid=" << tid << dendl;
+ std::vector<fifo::journal_entry> processed;
+
+ std::unique_lock l(m);
+ auto tmpjournal = info.journal;
+ auto new_tail = info.tail_part_num;
+ auto new_head = info.head_part_num;
+ auto new_max = info.max_push_part_num;
+ l.unlock();
+
+ int r = 0;
+ for (auto& entry : tmpjournal) {
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " processing entry: entry=" << entry << " tid=" << tid
+ << dendl;
+ switch (entry.op) {
+ using enum fifo::journal_entry::Op;
+ case create:
+ r = create_part(dpp, entry.part_num, tid, y);
+ if (entry.part_num > new_max) {
+ new_max = entry.part_num;
+ }
+ break;
+ case set_head:
+ r = 0;
+ if (entry.part_num > new_head) {
+ new_head = entry.part_num;
+ }
+ break;
+ case remove:
+ r = remove_part(dpp, entry.part_num, tid, y);
+ if (r == -ENOENT) r = 0;
+ if (entry.part_num >= new_tail) {
+ new_tail = entry.part_num + 1;
+ }
+ break;
+ default:
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " unknown journaled op: entry=" << entry << " tid="
+ << tid << dendl;
+ return -EIO;
+ }
+
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " processing entry failed: entry=" << entry
+ << " r=" << r << " tid=" << tid << dendl;
+ return -r;
+ }
+
+ processed.push_back(std::move(entry));
+ }
+
+ // Postprocess
+ bool canceled = true;
+
+ for (auto i = 0; canceled && i < MAX_RACE_RETRIES; ++i) {
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " postprocessing: i=" << i << " tid=" << tid << dendl;
+
+ std::optional<int64_t> tail_part_num;
+ std::optional<int64_t> head_part_num;
+ std::optional<int64_t> max_part_num;
+
+ std::unique_lock l(m);
+ auto objv = info.version;
+ if (new_tail > tail_part_num) tail_part_num = new_tail;
+ if (new_head > info.head_part_num) head_part_num = new_head;
+ if (new_max > info.max_push_part_num) max_part_num = new_max;
+ l.unlock();
+
+ if (processed.empty() &&
+ !tail_part_num &&
+ !max_part_num) {
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " nothing to update any more: i=" << i << " tid="
+ << tid << dendl;
+ canceled = false;
+ break;
+ }
+ auto u = fifo::update().tail_part_num(tail_part_num)
+ .head_part_num(head_part_num).max_push_part_num(max_part_num)
+ .journal_entries_rm(processed);
+ r = _update_meta(dpp, u, objv, &canceled, tid, y);
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " _update_meta failed: update=" << u
+ << " r=" << r << " tid=" << tid << dendl;
+ break;
+ }
+
+ if (canceled) {
+ std::vector<fifo::journal_entry> new_processed;
+ std::unique_lock l(m);
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " update canceled, retrying: i=" << i << " tid="
+ << tid << dendl;
+ for (auto& e : processed) {
+ if (info.journal.contains(e)) {
+ new_processed.push_back(e);
+ }
+ }
+ processed = std::move(new_processed);
+ }
+ }
+ if (r == 0 && canceled) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " canceled too many times, giving up: tid=" << tid << dendl;
+ r = -ECANCELED;
+ }
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " failed, r=: " << r << " tid=" << tid << dendl;
+ }
+ return r;
+}
+
+int FIFO::_prepare_new_part(const DoutPrefixProvider *dpp,
+ std::int64_t new_part_num, bool is_head,
+ std::uint64_t tid, optional_yield y)
+{
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " entering: tid=" << tid << dendl;
+ std::unique_lock l(m);
+ using enum fifo::journal_entry::Op;
+ std::vector<fifo::journal_entry> jentries{{ create, new_part_num }};
+ if (info.journal.contains({create, new_part_num}) &&
+ (!is_head || info.journal.contains({set_head, new_part_num}))) {
+ l.unlock();
+ ldpp_dout(dpp, 5) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " new part journaled, but not processed: tid="
+ << tid << dendl;
+ auto r = process_journal(dpp, tid, y);
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " process_journal failed: r=" << r << " tid=" << tid << dendl;
+ }
+ return r;
+ }
+ auto version = info.version;
+
+ if (is_head) {
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " needs new head: tid=" << tid << dendl;
+ jentries.push_back({ set_head, new_part_num });
+ }
+ l.unlock();
+
+ int r = 0;
+ bool canceled = true;
+ for (auto i = 0; canceled && i < MAX_RACE_RETRIES; ++i) {
+ canceled = false;
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " updating metadata: i=" << i << " tid=" << tid << dendl;
+ auto u = fifo::update{}.journal_entries_add(jentries);
+ r = _update_meta(dpp, u, version, &canceled, tid, y);
+ if (r >= 0 && canceled) {
+ std::unique_lock l(m);
+ version = info.version;
+ auto found = (info.journal.contains({create, new_part_num}) ||
+ info.journal.contains({set_head, new_part_num}));
+ if ((info.max_push_part_num >= new_part_num &&
+ info.head_part_num >= new_part_num)) {
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " raced, but journaled and processed: i=" << i
+ << " tid=" << tid << dendl;
+ return 0;
+ }
+ if (found) {
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " raced, journaled but not processed: i=" << i
+ << " tid=" << tid << dendl;
+ canceled = false;
+ }
+ l.unlock();
+ }
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " _update_meta failed: update=" << u << " r=" << r
+ << " tid=" << tid << dendl;
+ return r;
+ }
+ }
+ if (canceled) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " canceled too many times, giving up: tid=" << tid << dendl;
+ return -ECANCELED;
+ }
+ r = process_journal(dpp, tid, y);
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " process_journal failed: r=" << r << " tid=" << tid << dendl;
+ }
+ return r;
+}
+
+int FIFO::_prepare_new_head(const DoutPrefixProvider *dpp,
+ std::int64_t new_head_part_num,
+ std::uint64_t tid, optional_yield y)
+{
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " entering: tid=" << tid << dendl;
+ std::unique_lock l(m);
+ auto max_push_part_num = info.max_push_part_num;
+ auto version = info.version;
+ l.unlock();
+
+ int r = 0;
+ if (max_push_part_num < new_head_part_num) {
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " need new part: tid=" << tid << dendl;
+ r = _prepare_new_part(dpp, new_head_part_num, true, tid, y);
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " _prepare_new_part failed: r=" << r
+ << " tid=" << tid << dendl;
+ return r;
+ }
+ std::unique_lock l(m);
+ if (info.max_push_part_num < new_head_part_num) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " inconsistency, push part less than head part: "
+ << " tid=" << tid << dendl;
+ return -EIO;
+ }
+ l.unlock();
+ return 0;
+ }
+
+ using enum fifo::journal_entry::Op;
+ fifo::journal_entry jentry;
+ jentry.op = set_head;
+ jentry.part_num = new_head_part_num;
+
+ r = 0;
+ bool canceled = true;
+ for (auto i = 0; canceled && i < MAX_RACE_RETRIES; ++i) {
+ canceled = false;
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " updating metadata: i=" << i << " tid=" << tid << dendl;
+ auto u = fifo::update{}.journal_entries_add({{ jentry }});
+ r = _update_meta(dpp, u, version, &canceled, tid, y);
+ if (r >= 0 && canceled) {
+ std::unique_lock l(m);
+ auto found = (info.journal.contains({create, new_head_part_num}) ||
+ info.journal.contains({set_head, new_head_part_num}));
+ version = info.version;
+ if ((info.head_part_num >= new_head_part_num)) {
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " raced, but journaled and processed: i=" << i
+ << " tid=" << tid << dendl;
+ return 0;
+ }
+ if (found) {
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " raced, journaled but not processed: i=" << i
+ << " tid=" << tid << dendl;
+ canceled = false;
+ }
+ l.unlock();
+ }
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " _update_meta failed: update=" << u << " r=" << r
+ << " tid=" << tid << dendl;
+ return r;
+ }
+ }
+ if (canceled) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " canceled too many times, giving up: tid=" << tid << dendl;
+ return -ECANCELED;
+ }
+ r = process_journal(dpp, tid, y);
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " process_journal failed: r=" << r << " tid=" << tid << dendl;
+ }
+ return r;
+}
+
+struct NewPartPreparer : public Completion<NewPartPreparer> {
+ FIFO* f;
+ std::vector<fifo::journal_entry> jentries;
+ int i = 0;
+ std::int64_t new_part_num;
+ bool canceled = false;
+ uint64_t tid;
+
+ NewPartPreparer(const DoutPrefixProvider *dpp, FIFO* f, lr::AioCompletion* super,
+ std::vector<fifo::journal_entry> jentries,
+ std::int64_t new_part_num,
+ std::uint64_t tid)
+ : Completion(dpp, super), f(f), jentries(std::move(jentries)),
+ new_part_num(new_part_num), tid(tid) {}
+
+ void handle(const DoutPrefixProvider *dpp, Ptr&& p, int r) {
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " entering: tid=" << tid << dendl;
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " _update_meta failed: r=" << r
+ << " tid=" << tid << dendl;
+ complete(std::move(p), r);
+ return;
+ }
+
+ if (canceled) {
+ using enum fifo::journal_entry::Op;
+ std::unique_lock l(f->m);
+ auto found = (f->info.journal.contains({create, new_part_num}) ||
+ f->info.journal.contains({set_head, new_part_num}));
+ auto max_push_part_num = f->info.max_push_part_num;
+ auto head_part_num = f->info.head_part_num;
+ auto version = f->info.version;
+ l.unlock();
+ if ((max_push_part_num >= new_part_num &&
+ head_part_num >= new_part_num)) {
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " raced, but journaled and processed: i=" << i
+ << " tid=" << tid << dendl;
+ complete(std::move(p), 0);
+ return;
+ }
+ if (i >= MAX_RACE_RETRIES) {
+ complete(std::move(p), -ECANCELED);
+ return;
+ }
+ if (!found) {
+ ++i;
+ f->_update_meta(dpp, fifo::update{}
+ .journal_entries_add(jentries),
+ version, &canceled, tid, call(std::move(p)));
+ return;
+ } else {
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " raced, journaled but not processed: i=" << i
+ << " tid=" << tid << dendl;
+ canceled = false;
+ }
+ // Fall through. We still need to process the journal.
+ }
+ f->process_journal(dpp, tid, super());
+ return;
+ }
+};
+
+void FIFO::_prepare_new_part(const DoutPrefixProvider *dpp, std::int64_t new_part_num,
+ bool is_head, std::uint64_t tid, lr::AioCompletion* c)
+{
+ std::unique_lock l(m);
+ using enum fifo::journal_entry::Op;
+ std::vector<fifo::journal_entry> jentries{{create, new_part_num}};
+ if (info.journal.contains({create, new_part_num}) &&
+ (!is_head || info.journal.contains({set_head, new_part_num}))) {
+ l.unlock();
+ ldpp_dout(dpp, 5) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " new part journaled, but not processed: tid="
+ << tid << dendl;
+ process_journal(dpp, tid, c);
+ return;
+ }
+ auto version = info.version;
+
+ if (is_head) {
+ jentries.push_back({ set_head, new_part_num });
+ }
+ l.unlock();
+
+ auto n = std::make_unique<NewPartPreparer>(dpp, this, c, jentries,
+ new_part_num, tid);
+ auto np = n.get();
+ _update_meta(dpp, fifo::update{}.journal_entries_add(jentries), version,
+ &np->canceled, tid, NewPartPreparer::call(std::move(n)));
+}
+
+struct NewHeadPreparer : public Completion<NewHeadPreparer> {
+ FIFO* f;
+ int i = 0;
+ bool newpart;
+ std::int64_t new_head_part_num;
+ bool canceled = false;
+ std::uint64_t tid;
+
+ NewHeadPreparer(const DoutPrefixProvider *dpp, FIFO* f, lr::AioCompletion* super,
+ bool newpart, std::int64_t new_head_part_num,
+ std::uint64_t tid)
+ : Completion(dpp, super), f(f), newpart(newpart),
+ new_head_part_num(new_head_part_num), tid(tid) {}
+
+ void handle(const DoutPrefixProvider *dpp, Ptr&& p, int r) {
+ if (newpart)
+ handle_newpart(std::move(p), r);
+ else
+ handle_update(dpp, std::move(p), r);
+ }
+
+ void handle_newpart(Ptr&& p, int r) {
+ if (r < 0) {
+ lderr(f->cct) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " _prepare_new_part failed: r=" << r
+ << " tid=" << tid << dendl;
+ complete(std::move(p), r);
+ return;
+ }
+ std::unique_lock l(f->m);
+ if (f->info.max_push_part_num < new_head_part_num) {
+ l.unlock();
+ lderr(f->cct) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " _prepare_new_part failed: r=" << r
+ << " tid=" << tid << dendl;
+ complete(std::move(p), -EIO);
+ } else {
+ l.unlock();
+ complete(std::move(p), 0);
+ }
+ }
+
+ void handle_update(const DoutPrefixProvider *dpp, Ptr&& p, int r) {
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " entering: tid=" << tid << dendl;
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " _update_meta failed: r=" << r
+ << " tid=" << tid << dendl;
+ complete(std::move(p), r);
+ return;
+ }
+
+ if (canceled) {
+ using enum fifo::journal_entry::Op;
+ std::unique_lock l(f->m);
+ auto found = (f->info.journal.contains({create, new_head_part_num }) ||
+ f->info.journal.contains({set_head, new_head_part_num }));
+ auto head_part_num = f->info.head_part_num;
+ auto version = f->info.version;
+
+ l.unlock();
+ if ((head_part_num >= new_head_part_num)) {
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " raced, but journaled and processed: i=" << i
+ << " tid=" << tid << dendl;
+ complete(std::move(p), 0);
+ return;
+ }
+ if (i >= MAX_RACE_RETRIES) {
+ complete(std::move(p), -ECANCELED);
+ return;
+ }
+ if (!found) {
+ ++i;
+ fifo::journal_entry jentry;
+ jentry.op = set_head;
+ jentry.part_num = new_head_part_num;
+ f->_update_meta(dpp, fifo::update{}
+ .journal_entries_add({{jentry}}),
+ version, &canceled, tid, call(std::move(p)));
+ return;
+ } else {
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " raced, journaled but not processed: i=" << i
+ << " tid=" << tid << dendl;
+ canceled = false;
+ }
+ // Fall through. We still need to process the journal.
+ }
+ f->process_journal(dpp, tid, super());
+ return;
+ }
+};
+
+void FIFO::_prepare_new_head(const DoutPrefixProvider *dpp, std::int64_t new_head_part_num,
+ std::uint64_t tid, lr::AioCompletion* c)
+{
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " entering: tid=" << tid << dendl;
+ std::unique_lock l(m);
+ auto max_push_part_num = info.max_push_part_num;
+ auto version = info.version;
+ l.unlock();
+
+ if (max_push_part_num < new_head_part_num) {
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " need new part: tid=" << tid << dendl;
+ auto n = std::make_unique<NewHeadPreparer>(dpp, this, c, true, new_head_part_num,
+ tid);
+ _prepare_new_part(dpp, new_head_part_num, true, tid,
+ NewHeadPreparer::call(std::move(n)));
+ } else {
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " updating head: tid=" << tid << dendl;
+ auto n = std::make_unique<NewHeadPreparer>(dpp, this, c, false, new_head_part_num,
+ tid);
+ auto np = n.get();
+ using enum fifo::journal_entry::Op;
+ fifo::journal_entry jentry;
+ jentry.op = set_head;
+ jentry.part_num = new_head_part_num;
+ _update_meta(dpp, fifo::update{}.journal_entries_add({{jentry}}), version,
+ &np->canceled, tid, NewHeadPreparer::call(std::move(n)));
+ }
+}
+
+int FIFO::push_entries(const DoutPrefixProvider *dpp, const std::deque<cb::list>& data_bufs,
+ std::uint64_t tid, optional_yield y)
+{
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " entering: tid=" << tid << dendl;
+ std::unique_lock l(m);
+ auto head_part_num = info.head_part_num;
+ const auto part_oid = info.part_oid(head_part_num);
+ l.unlock();
+
+ auto r = push_part(dpp, ioctx, part_oid, data_bufs, tid, y);
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " push_part failed: r=" << r << " tid=" << tid << dendl;
+ }
+ return r;
+}
+
+void FIFO::push_entries(const std::deque<cb::list>& data_bufs,
+ std::uint64_t tid, lr::AioCompletion* c)
+{
+ std::unique_lock l(m);
+ auto head_part_num = info.head_part_num;
+ const auto part_oid = info.part_oid(head_part_num);
+ l.unlock();
+
+ push_part(ioctx, part_oid, data_bufs, tid, c);
+}
+
+int FIFO::trim_part(const DoutPrefixProvider *dpp, int64_t part_num, uint64_t ofs,
+ bool exclusive, std::uint64_t tid,
+ optional_yield y)
+{
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " entering: tid=" << tid << dendl;
+ lr::ObjectWriteOperation op;
+ std::unique_lock l(m);
+ const auto part_oid = info.part_oid(part_num);
+ l.unlock();
+ rgw::cls::fifo::trim_part(&op, ofs, exclusive);
+ auto r = rgw_rados_operate(dpp, ioctx, part_oid, &op, y);
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " trim_part failed: r=" << r << " tid=" << tid << dendl;
+ }
+ return 0;
+}
+
+void FIFO::trim_part(const DoutPrefixProvider *dpp, int64_t part_num, uint64_t ofs,
+ bool exclusive, std::uint64_t tid,
+ lr::AioCompletion* c)
+{
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " entering: tid=" << tid << dendl;
+ lr::ObjectWriteOperation op;
+ std::unique_lock l(m);
+ const auto part_oid = info.part_oid(part_num);
+ l.unlock();
+ rgw::cls::fifo::trim_part(&op, ofs, exclusive);
+ auto r = ioctx.aio_operate(part_oid, c, &op);
+ ceph_assert(r >= 0);
+}
+
+int FIFO::open(const DoutPrefixProvider *dpp, lr::IoCtx ioctx, std::string oid, std::unique_ptr<FIFO>* fifo,
+ optional_yield y, std::optional<fifo::objv> objv,
+ bool probe)
+{
+ ldpp_dout(dpp, 20)
+ << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " entering" << dendl;
+ fifo::info info;
+ std::uint32_t size;
+ std::uint32_t over;
+ int r = get_meta(dpp, ioctx, std::move(oid), objv, &info, &size, &over, 0, y,
+ probe);
+ if (r < 0) {
+ if (!(probe && (r == -ENOENT || r == -ENODATA))) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " get_meta failed: r=" << r << dendl;
+ }
+ return r;
+ }
+ std::unique_ptr<FIFO> f(new FIFO(std::move(ioctx), oid));
+ f->info = info;
+ f->part_header_size = size;
+ f->part_entry_overhead = over;
+ // If there are journal entries, process them, in case
+ // someone crashed mid-transaction.
+ if (!info.journal.empty()) {
+ ldpp_dout(dpp, 20)
+ << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " processing leftover journal" << dendl;
+ r = f->process_journal(dpp, 0, y);
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " process_journal failed: r=" << r << dendl;
+ return r;
+ }
+ }
+ *fifo = std::move(f);
+ return 0;
+}
+
+int FIFO::create(const DoutPrefixProvider *dpp, lr::IoCtx ioctx, std::string oid, std::unique_ptr<FIFO>* fifo,
+ optional_yield y, std::optional<fifo::objv> objv,
+ std::optional<std::string_view> oid_prefix,
+ bool exclusive, std::uint64_t max_part_size,
+ std::uint64_t max_entry_size)
+{
+ ldpp_dout(dpp, 20)
+ << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " entering" << dendl;
+ lr::ObjectWriteOperation op;
+ create_meta(&op, oid, objv, oid_prefix, exclusive, max_part_size,
+ max_entry_size);
+ auto r = rgw_rados_operate(dpp, ioctx, oid, &op, y);
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " create_meta failed: r=" << r << dendl;
+ return r;
+ }
+ r = open(dpp, std::move(ioctx), std::move(oid), fifo, y, objv);
+ return r;
+}
+
+int FIFO::read_meta(const DoutPrefixProvider *dpp, std::uint64_t tid, optional_yield y) {
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " entering: tid=" << tid << dendl;
+ fifo::info _info;
+ std::uint32_t _phs;
+ std::uint32_t _peo;
+
+ auto r = get_meta(dpp, ioctx, oid, std::nullopt, &_info, &_phs, &_peo, tid, y);
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " get_meta failed: r=" << r << " tid=" << tid << dendl;
+ return r;
+ }
+ std::unique_lock l(m);
+ // We have a newer version already!
+ if (_info.version.same_or_later(this->info.version)) {
+ info = std::move(_info);
+ part_header_size = _phs;
+ part_entry_overhead = _peo;
+ }
+ return 0;
+}
+
+int FIFO::read_meta(const DoutPrefixProvider *dpp, optional_yield y) {
+ std::unique_lock l(m);
+ auto tid = ++next_tid;
+ l.unlock();
+ return read_meta(dpp, tid, y);
+}
+
+struct Reader : public Completion<Reader> {
+ FIFO* fifo;
+ cb::list bl;
+ std::uint64_t tid;
+ Reader(const DoutPrefixProvider *dpp, FIFO* fifo, lr::AioCompletion* super, std::uint64_t tid)
+ : Completion(dpp, super), fifo(fifo), tid(tid) {}
+
+ void handle(const DoutPrefixProvider *dpp, Ptr&& p, int r) {
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " entering: tid=" << tid << dendl;
+ if (r >= 0) try {
+ fifo::op::get_meta_reply reply;
+ auto iter = bl.cbegin();
+ decode(reply, iter);
+ std::unique_lock l(fifo->m);
+ if (reply.info.version.same_or_later(fifo->info.version)) {
+ fifo->info = std::move(reply.info);
+ fifo->part_header_size = reply.part_header_size;
+ fifo->part_entry_overhead = reply.part_entry_overhead;
+ }
+ } catch (const cb::error& err) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " failed to decode response err=" << err.what()
+ << " tid=" << tid << dendl;
+ r = from_error_code(err.code());
+ } else {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " read_meta failed r=" << r
+ << " tid=" << tid << dendl;
+ }
+ complete(std::move(p), r);
+ }
+};
+
+void FIFO::read_meta(const DoutPrefixProvider *dpp, std::uint64_t tid, lr::AioCompletion* c)
+{
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " entering: tid=" << tid << dendl;
+ lr::ObjectReadOperation op;
+ fifo::op::get_meta gm;
+ cb::list in;
+ encode(gm, in);
+ auto reader = std::make_unique<Reader>(dpp, this, c, tid);
+ auto rp = reader.get();
+ auto r = ioctx.aio_exec(oid, Reader::call(std::move(reader)), fifo::op::CLASS,
+ fifo::op::GET_META, in, &rp->bl);
+ assert(r >= 0);
+}
+
+const fifo::info& FIFO::meta() const {
+ return info;
+}
+
+std::pair<std::uint32_t, std::uint32_t> FIFO::get_part_layout_info() const {
+ return {part_header_size, part_entry_overhead};
+}
+
+int FIFO::push(const DoutPrefixProvider *dpp, const cb::list& bl, optional_yield y) {
+ return push(dpp, std::vector{ bl }, y);
+}
+
+void FIFO::push(const DoutPrefixProvider *dpp, const cb::list& bl, lr::AioCompletion* c) {
+ push(dpp, std::vector{ bl }, c);
+}
+
+int FIFO::push(const DoutPrefixProvider *dpp, const std::vector<cb::list>& data_bufs, optional_yield y)
+{
+ std::unique_lock l(m);
+ auto tid = ++next_tid;
+ auto max_entry_size = info.params.max_entry_size;
+ auto need_new_head = info.need_new_head();
+ auto head_part_num = info.head_part_num;
+ l.unlock();
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " entering: tid=" << tid << dendl;
+ if (data_bufs.empty()) {
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " empty push, returning success tid=" << tid << dendl;
+ return 0;
+ }
+
+ // Validate sizes
+ for (const auto& bl : data_bufs) {
+ if (bl.length() > max_entry_size) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " entry bigger than max_entry_size tid=" << tid << dendl;
+ return -E2BIG;
+ }
+ }
+
+ int r = 0;
+ if (need_new_head) {
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " need new head tid=" << tid << dendl;
+ r = _prepare_new_head(dpp, head_part_num + 1, tid, y);
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " _prepare_new_head failed: r=" << r
+ << " tid=" << tid << dendl;
+ return r;
+ }
+ }
+
+ std::deque<cb::list> remaining(data_bufs.begin(), data_bufs.end());
+ std::deque<cb::list> batch;
+
+ uint64_t batch_len = 0;
+ auto retries = 0;
+ bool canceled = true;
+ while ((!remaining.empty() || !batch.empty()) &&
+ (retries <= MAX_RACE_RETRIES)) {
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " preparing push: remaining=" << remaining.size()
+ << " batch=" << batch.size() << " retries=" << retries
+ << " tid=" << tid << dendl;
+ std::unique_lock l(m);
+ head_part_num = info.head_part_num;
+ auto max_part_size = info.params.max_part_size;
+ auto overhead = part_entry_overhead;
+ l.unlock();
+
+ while (!remaining.empty() &&
+ (remaining.front().length() + batch_len <= max_part_size)) {
+ /* We can send entries with data_len up to max_entry_size,
+ however, we want to also account the overhead when
+ dealing with multiple entries. Previous check doesn't
+ account for overhead on purpose. */
+ batch_len += remaining.front().length() + overhead;
+ batch.push_back(std::move(remaining.front()));
+ remaining.pop_front();
+ }
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " prepared push: remaining=" << remaining.size()
+ << " batch=" << batch.size() << " retries=" << retries
+ << " batch_len=" << batch_len
+ << " tid=" << tid << dendl;
+
+ auto r = push_entries(dpp, batch, tid, y);
+ if (r == -ERANGE) {
+ canceled = true;
+ ++retries;
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " need new head tid=" << tid << dendl;
+ r = _prepare_new_head(dpp, head_part_num + 1, tid, y);
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " prepare_new_head failed: r=" << r
+ << " tid=" << tid << dendl;
+ return r;
+ }
+ r = 0;
+ continue;
+ }
+ if (r == -ENOENT) {
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " racing client trimmed part, rereading metadata "
+ << "tid=" << tid << dendl;
+ canceled = true;
+ ++retries;
+ r = read_meta(dpp, y);
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " read_meta failed: r=" << r
+ << " tid=" << tid << dendl;
+ return r;
+ }
+ r = 0;
+ continue;
+ }
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " push_entries failed: r=" << r
+ << " tid=" << tid << dendl;
+ return r;
+ }
+ // Made forward progress!
+ canceled = false;
+ retries = 0;
+ batch_len = 0;
+ if (r == ssize(batch)) {
+ batch.clear();
+ } else {
+ batch.erase(batch.begin(), batch.begin() + r);
+ for (const auto& b : batch) {
+ batch_len += b.length() + part_entry_overhead;
+ }
+ }
+ }
+ if (canceled) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " canceled too many times, giving up: tid=" << tid << dendl;
+ return -ECANCELED;
+ }
+ return 0;
+}
+
+struct Pusher : public Completion<Pusher> {
+ FIFO* f;
+ std::deque<cb::list> remaining;
+ std::deque<cb::list> batch;
+ int i = 0;
+ std::int64_t head_part_num;
+ std::uint64_t tid;
+ enum { pushing, new_heading, meta_reading } state = pushing;
+
+ void prep_then_push(const DoutPrefixProvider *dpp, Ptr&& p, const unsigned successes) {
+ std::unique_lock l(f->m);
+ auto max_part_size = f->info.params.max_part_size;
+ auto part_entry_overhead = f->part_entry_overhead;
+ head_part_num = f->info.head_part_num;
+ l.unlock();
+
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " preparing push: remaining=" << remaining.size()
+ << " batch=" << batch.size() << " i=" << i
+ << " tid=" << tid << dendl;
+
+ uint64_t batch_len = 0;
+ if (successes > 0) {
+ if (successes == batch.size()) {
+ batch.clear();
+ } else {
+ batch.erase(batch.begin(), batch.begin() + successes);
+ for (const auto& b : batch) {
+ batch_len += b.length() + part_entry_overhead;
+ }
+ }
+ }
+
+ if (batch.empty() && remaining.empty()) {
+ complete(std::move(p), 0);
+ return;
+ }
+
+ while (!remaining.empty() &&
+ (remaining.front().length() + batch_len <= max_part_size)) {
+
+ /* We can send entries with data_len up to max_entry_size,
+ however, we want to also account the overhead when
+ dealing with multiple entries. Previous check doesn't
+ account for overhead on purpose. */
+ batch_len += remaining.front().length() + part_entry_overhead;
+ batch.push_back(std::move(remaining.front()));
+ remaining.pop_front();
+ }
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " prepared push: remaining=" << remaining.size()
+ << " batch=" << batch.size() << " i=" << i
+ << " batch_len=" << batch_len
+ << " tid=" << tid << dendl;
+ push(std::move(p));
+ }
+
+ void push(Ptr&& p) {
+ f->push_entries(batch, tid, call(std::move(p)));
+ }
+
+ void new_head(const DoutPrefixProvider *dpp, Ptr&& p) {
+ state = new_heading;
+ f->_prepare_new_head(dpp, head_part_num + 1, tid, call(std::move(p)));
+ }
+
+ void read_meta(const DoutPrefixProvider *dpp, Ptr&& p) {
+ ++i;
+ state = meta_reading;
+ f->read_meta(dpp, tid, call(std::move(p)));
+ }
+
+ void handle(const DoutPrefixProvider *dpp, Ptr&& p, int r) {
+ switch (state) {
+ case pushing:
+ if (r == -ERANGE) {
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " need new head tid=" << tid << dendl;
+ new_head(dpp, std::move(p));
+ return;
+ }
+ if (r == -ENOENT) {
+ if (i > MAX_RACE_RETRIES) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " racing client deleted part, but we're out"
+ << " of retries: tid=" << tid << dendl;
+ complete(std::move(p), r);
+ }
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " racing client deleted part: tid=" << tid << dendl;
+ read_meta(dpp, std::move(p));
+ return;
+ }
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " push_entries failed: r=" << r
+ << " tid=" << tid << dendl;
+ complete(std::move(p), r);
+ return;
+ }
+ i = 0; // We've made forward progress, so reset the race counter!
+ prep_then_push(dpp, std::move(p), r);
+ break;
+
+ case new_heading:
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " prepare_new_head failed: r=" << r
+ << " tid=" << tid << dendl;
+ complete(std::move(p), r);
+ return;
+ }
+ state = pushing;
+ handle_new_head(dpp, std::move(p), r);
+ break;
+
+ case meta_reading:
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " read_meta failed: r=" << r
+ << " tid=" << tid << dendl;
+ complete(std::move(p), r);
+ return;
+ }
+ state = pushing;
+ prep_then_push(dpp, std::move(p), r);
+ break;
+ }
+ }
+
+ void handle_new_head(const DoutPrefixProvider *dpp, Ptr&& p, int r) {
+ if (r == -ECANCELED) {
+ if (p->i == MAX_RACE_RETRIES) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " canceled too many times, giving up: tid=" << tid << dendl;
+ complete(std::move(p), -ECANCELED);
+ return;
+ }
+ ++p->i;
+ } else if (r) {
+ complete(std::move(p), r);
+ return;
+ }
+
+ if (p->batch.empty()) {
+ prep_then_push(dpp, std::move(p), 0);
+ return;
+ } else {
+ push(std::move(p));
+ return;
+ }
+ }
+
+ Pusher(const DoutPrefixProvider *dpp, FIFO* f, std::deque<cb::list>&& remaining,
+ std::int64_t head_part_num, std::uint64_t tid,
+ lr::AioCompletion* super)
+ : Completion(dpp, super), f(f), remaining(std::move(remaining)),
+ head_part_num(head_part_num), tid(tid) {}
+};
+
+void FIFO::push(const DoutPrefixProvider *dpp, const std::vector<cb::list>& data_bufs,
+ lr::AioCompletion* c)
+{
+ std::unique_lock l(m);
+ auto tid = ++next_tid;
+ auto max_entry_size = info.params.max_entry_size;
+ auto need_new_head = info.need_new_head();
+ auto head_part_num = info.head_part_num;
+ l.unlock();
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " entering: tid=" << tid << dendl;
+ auto p = std::make_unique<Pusher>(dpp, this, std::deque<cb::list>(data_bufs.begin(), data_bufs.end()),
+ head_part_num, tid, c);
+ // Validate sizes
+ for (const auto& bl : data_bufs) {
+ if (bl.length() > max_entry_size) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " entry bigger than max_entry_size tid=" << tid << dendl;
+ Pusher::complete(std::move(p), -E2BIG);
+ return;
+ }
+ }
+
+ if (data_bufs.empty() ) {
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " empty push, returning success tid=" << tid << dendl;
+ Pusher::complete(std::move(p), 0);
+ return;
+ }
+
+ if (need_new_head) {
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " need new head tid=" << tid << dendl;
+ p->new_head(dpp, std::move(p));
+ } else {
+ p->prep_then_push(dpp, std::move(p), 0);
+ }
+}
+
+int FIFO::list(const DoutPrefixProvider *dpp, int max_entries,
+ std::optional<std::string_view> markstr,
+ std::vector<list_entry>* presult, bool* pmore,
+ optional_yield y)
+{
+ std::unique_lock l(m);
+ auto tid = ++next_tid;
+ std::int64_t part_num = info.tail_part_num;
+ l.unlock();
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " entering: tid=" << tid << dendl;
+ std::uint64_t ofs = 0;
+ if (markstr) {
+ auto marker = to_marker(*markstr);
+ if (!marker) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " invalid marker string: " << markstr
+ << " tid= "<< tid << dendl;
+ return -EINVAL;
+ }
+ part_num = marker->num;
+ ofs = marker->ofs;
+ }
+
+ std::vector<list_entry> result;
+ result.reserve(max_entries);
+ bool more = false;
+
+ std::vector<fifo::part_list_entry> entries;
+ int r = 0;
+ while (max_entries > 0) {
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " max_entries=" << max_entries << " tid=" << tid << dendl;
+ bool part_more = false;
+ bool part_full = false;
+
+ std::unique_lock l(m);
+ auto part_oid = info.part_oid(part_num);
+ l.unlock();
+
+ r = list_part(dpp, ioctx, part_oid, ofs, max_entries, &entries,
+ &part_more, &part_full, tid, y);
+ if (r == -ENOENT) {
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " missing part, rereading metadata"
+ << " tid= "<< tid << dendl;
+ r = read_meta(dpp, tid, y);
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " read_meta failed: r=" << r
+ << " tid= "<< tid << dendl;
+ return r;
+ }
+ if (part_num < info.tail_part_num) {
+ /* raced with trim? restart */
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " raced with trim, restarting: tid=" << tid << dendl;
+ max_entries += result.size();
+ result.clear();
+ std::unique_lock l(m);
+ part_num = info.tail_part_num;
+ l.unlock();
+ ofs = 0;
+ continue;
+ }
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " assuming part was not written yet, so end of data: "
+ << "tid=" << tid << dendl;
+ more = false;
+ r = 0;
+ break;
+ }
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " list_entries failed: r=" << r
+ << " tid= "<< tid << dendl;
+ return r;
+ }
+ more = part_full || part_more;
+ for (auto& entry : entries) {
+ list_entry e;
+ e.data = std::move(entry.data);
+ e.marker = marker{part_num, entry.ofs}.to_string();
+ e.mtime = entry.mtime;
+ result.push_back(std::move(e));
+ --max_entries;
+ if (max_entries == 0)
+ break;
+ }
+ entries.clear();
+
+ if (!part_full) {
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " head part is not full, so we can assume we're done: "
+ << "tid=" << tid << dendl;
+ break;
+ }
+ if (!part_more) {
+ ++part_num;
+ ofs = 0;
+ }
+ }
+ if (presult)
+ *presult = std::move(result);
+ if (pmore)
+ *pmore = more;
+ return 0;
+}
+
+int FIFO::trim(const DoutPrefixProvider *dpp, std::string_view markstr, bool exclusive, optional_yield y)
+{
+ bool overshoot = false;
+ auto marker = to_marker(markstr);
+ if (!marker) {
+ return -EINVAL;
+ }
+ auto part_num = marker->num;
+ auto ofs = marker->ofs;
+ std::unique_lock l(m);
+ auto tid = ++next_tid;
+ auto hn = info.head_part_num;
+ const auto max_part_size = info.params.max_part_size;
+ if (part_num > hn) {
+ l.unlock();
+ auto r = read_meta(dpp, tid, y);
+ if (r < 0) {
+ return r;
+ }
+ l.lock();
+ auto hn = info.head_part_num;
+ if (part_num > hn) {
+ overshoot = true;
+ part_num = hn;
+ ofs = max_part_size;
+ }
+ }
+ if (part_num < info.tail_part_num) {
+ return -ENODATA;
+ }
+ auto pn = info.tail_part_num;
+ l.unlock();
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " entering: tid=" << tid << dendl;
+
+ int r = 0;
+ while (pn < part_num) {
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " pn=" << pn << " tid=" << tid << dendl;
+    r = trim_part(dpp, pn, max_part_size, false, tid, y);
+    if (r < 0 && r != -ENOENT) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " trim_part failed: r=" << r
+ << " tid= "<< tid << dendl;
+ return r;
+ }
+ ++pn;
+ }
+ r = trim_part(dpp, part_num, ofs, exclusive, tid, y);
+ if (r < 0 && r != -ENOENT) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " trim_part failed: r=" << r
+ << " tid= "<< tid << dendl;
+ return r;
+ }
+
+ l.lock();
+ auto tail_part_num = info.tail_part_num;
+ auto objv = info.version;
+ l.unlock();
+ bool canceled = tail_part_num < part_num;
+ int retries = 0;
+ while ((tail_part_num < part_num) &&
+ canceled &&
+ (retries <= MAX_RACE_RETRIES)) {
+ r = _update_meta(dpp, fifo::update{}.tail_part_num(part_num), objv, &canceled,
+ tid, y);
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " _update_meta failed: r=" << r
+ << " tid= "<< tid << dendl;
+ return r;
+ }
+ if (canceled) {
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " canceled: retries=" << retries
+ << " tid=" << tid << dendl;
+ l.lock();
+ tail_part_num = info.tail_part_num;
+ objv = info.version;
+ l.unlock();
+ ++retries;
+ }
+ }
+ if (canceled) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " canceled too many times, giving up: tid=" << tid << dendl;
+ return -EIO;
+ }
+ return overshoot ? -ENODATA : 0;
+}
+
+struct Trimmer : public Completion<Trimmer> {
+ FIFO* fifo;
+ std::int64_t part_num;
+ std::uint64_t ofs;
+ std::int64_t pn;
+ bool exclusive;
+ std::uint64_t tid;
+ bool update = false;
+ bool reread = false;
+ bool canceled = false;
+ bool overshoot = false;
+ int retries = 0;
+
+ Trimmer(const DoutPrefixProvider *dpp, FIFO* fifo, std::int64_t part_num, std::uint64_t ofs, std::int64_t pn,
+ bool exclusive, lr::AioCompletion* super, std::uint64_t tid)
+ : Completion(dpp, super), fifo(fifo), part_num(part_num), ofs(ofs), pn(pn),
+ exclusive(exclusive), tid(tid) {}
+
+ void handle(const DoutPrefixProvider *dpp, Ptr&& p, int r) {
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " entering: tid=" << tid << dendl;
+
+ if (reread) {
+ reread = false;
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " read_meta failed: r="
+ << r << " tid=" << tid << dendl;
+ complete(std::move(p), r);
+ return;
+ }
+ std::unique_lock l(fifo->m);
+ auto hn = fifo->info.head_part_num;
+ const auto max_part_size = fifo->info.params.max_part_size;
+ const auto tail_part_num = fifo->info.tail_part_num;
+ l.unlock();
+ if (part_num > hn) {
+ part_num = hn;
+ ofs = max_part_size;
+ overshoot = true;
+ }
+ if (part_num < tail_part_num) {
+ complete(std::move(p), -ENODATA);
+ return;
+ }
+ pn = tail_part_num;
+ if (pn < part_num) {
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " pn=" << pn << " tid=" << tid << dendl;
+ fifo->trim_part(dpp, pn++, max_part_size, false, tid,
+ call(std::move(p)));
+ } else {
+ update = true;
+ canceled = tail_part_num < part_num;
+ fifo->trim_part(dpp, part_num, ofs, exclusive, tid, call(std::move(p)));
+ }
+ return;
+ }
+
+ if (r == -ENOENT) {
+ r = 0;
+ }
+
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << (update ? " update_meta " : " trim ") << "failed: r="
+ << r << " tid=" << tid << dendl;
+ complete(std::move(p), r);
+ return;
+ }
+
+ if (!update) {
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " handling preceding trim callback: tid=" << tid << dendl;
+ retries = 0;
+ if (pn < part_num) {
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " pn=" << pn << " tid=" << tid << dendl;
+ std::unique_lock l(fifo->m);
+ const auto max_part_size = fifo->info.params.max_part_size;
+ l.unlock();
+ fifo->trim_part(dpp, pn++, max_part_size, false, tid,
+ call(std::move(p)));
+ return;
+ }
+
+ std::unique_lock l(fifo->m);
+ const auto tail_part_num = fifo->info.tail_part_num;
+ l.unlock();
+ update = true;
+ canceled = tail_part_num < part_num;
+ fifo->trim_part(dpp, part_num, ofs, exclusive, tid, call(std::move(p)));
+ return;
+ }
+
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " handling update-needed callback: tid=" << tid << dendl;
+ std::unique_lock l(fifo->m);
+ auto tail_part_num = fifo->info.tail_part_num;
+ auto objv = fifo->info.version;
+ l.unlock();
+ if ((tail_part_num < part_num) &&
+ canceled) {
+ if (retries > MAX_RACE_RETRIES) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " canceled too many times, giving up: tid=" << tid << dendl;
+ complete(std::move(p), -EIO);
+ return;
+ }
+ ++retries;
+ fifo->_update_meta(dpp, fifo::update{}
+ .tail_part_num(part_num), objv, &canceled,
+ tid, call(std::move(p)));
+ } else {
+ complete(std::move(p), overshoot ? -ENODATA : 0);
+ }
+ }
+};
+
+void FIFO::trim(const DoutPrefixProvider *dpp, std::string_view markstr, bool exclusive,
+ lr::AioCompletion* c) {
+ auto marker = to_marker(markstr);
+ auto realmark = marker.value_or(::rgw::cls::fifo::marker{});
+ std::unique_lock l(m);
+ const auto hn = info.head_part_num;
+ const auto max_part_size = info.params.max_part_size;
+ const auto pn = info.tail_part_num;
+ const auto part_oid = info.part_oid(pn);
+ auto tid = ++next_tid;
+ l.unlock();
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " entering: tid=" << tid << dendl;
+ auto trimmer = std::make_unique<Trimmer>(dpp, this, realmark.num, realmark.ofs,
+ pn, exclusive, c, tid);
+ if (!marker) {
+ Trimmer::complete(std::move(trimmer), -EINVAL);
+ return;
+ }
+ ++trimmer->pn;
+ auto ofs = marker->ofs;
+ if (marker->num > hn) {
+ trimmer->reread = true;
+ read_meta(dpp, tid, Trimmer::call(std::move(trimmer)));
+ return;
+ }
+ if (pn < marker->num) {
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " pn=" << pn << " tid=" << tid << dendl;
+ ofs = max_part_size;
+ } else {
+ trimmer->update = true;
+ }
+ trim_part(dpp, pn, ofs, exclusive, tid, Trimmer::call(std::move(trimmer)));
+}
+
+int FIFO::get_part_info(const DoutPrefixProvider *dpp, int64_t part_num,
+ fifo::part_header* header,
+ optional_yield y)
+{
+ std::unique_lock l(m);
+ const auto part_oid = info.part_oid(part_num);
+ auto tid = ++next_tid;
+ l.unlock();
+ auto r = rgw::cls::fifo::get_part_info(dpp, ioctx, part_oid, header, tid, y);
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " get_part_info failed: r="
+ << r << " tid=" << tid << dendl;
+ }
+ return r;
+}
+
+void FIFO::get_part_info(int64_t part_num,
+ fifo::part_header* header,
+ lr::AioCompletion* c)
+{
+ std::unique_lock l(m);
+ const auto part_oid = info.part_oid(part_num);
+ auto tid = ++next_tid;
+ l.unlock();
+ auto op = rgw::cls::fifo::get_part_info(cct, header, tid);
+ auto r = ioctx.aio_operate(part_oid, c, &op, nullptr);
+ ceph_assert(r >= 0);
+}
+
+struct InfoGetter : Completion<InfoGetter> {
+ FIFO* fifo;
+ fifo::part_header header;
+ fu2::function<void(int r, fifo::part_header&&)> f;
+ std::uint64_t tid;
+ bool headerread = false;
+
+ InfoGetter(const DoutPrefixProvider *dpp, FIFO* fifo, fu2::function<void(int r, fifo::part_header&&)> f,
+ std::uint64_t tid, lr::AioCompletion* super)
+ : Completion(dpp, super), fifo(fifo), f(std::move(f)), tid(tid) {}
+ void handle(const DoutPrefixProvider *dpp, Ptr&& p, int r) {
+ if (!headerread) {
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " read_meta failed: r="
+ << r << " tid=" << tid << dendl;
+ if (f)
+ f(r, {});
+ complete(std::move(p), r);
+ return;
+ }
+
+ auto info = fifo->meta();
+ auto hpn = info.head_part_num;
+ if (hpn < 0) {
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " no head, returning empty partinfo r="
+ << r << " tid=" << tid << dendl;
+ if (f)
+ f(0, {});
+ complete(std::move(p), r);
+ return;
+ }
+ headerread = true;
+ auto op = rgw::cls::fifo::get_part_info(fifo->cct, &header, tid);
+ std::unique_lock l(fifo->m);
+ auto oid = fifo->info.part_oid(hpn);
+ l.unlock();
+ r = fifo->ioctx.aio_operate(oid, call(std::move(p)), &op,
+ nullptr);
+ ceph_assert(r >= 0);
+ return;
+ }
+
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " get_part_info failed: r="
+ << r << " tid=" << tid << dendl;
+ }
+
+ if (f)
+ f(r, std::move(header));
+ complete(std::move(p), r);
+ return;
+ }
+};
+
+void FIFO::get_head_info(const DoutPrefixProvider *dpp, fu2::unique_function<void(int r,
+ fifo::part_header&&)> f,
+ lr::AioCompletion* c)
+{
+ std::unique_lock l(m);
+ auto tid = ++next_tid;
+ l.unlock();
+ auto ig = std::make_unique<InfoGetter>(dpp, this, std::move(f), tid, c);
+ read_meta(dpp, tid, InfoGetter::call(std::move(ig)));
+}
+
+struct JournalProcessor : public Completion<JournalProcessor> {
+private:
+ FIFO* const fifo;
+
+ std::vector<fifo::journal_entry> processed;
+ decltype(fifo->info.journal) journal;
+ decltype(journal)::iterator iter;
+ std::int64_t new_tail;
+ std::int64_t new_head;
+ std::int64_t new_max;
+ int race_retries = 0;
+ bool first_pp = true;
+ bool canceled = false;
+ std::uint64_t tid;
+
+ enum {
+ entry_callback,
+ pp_callback,
+ } state;
+
+ void create_part(const DoutPrefixProvider *dpp, Ptr&& p, int64_t part_num) {
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " entering: tid=" << tid << dendl;
+ state = entry_callback;
+ lr::ObjectWriteOperation op;
+ op.create(false); /* We don't need exclusivity, part_init ensures
+ we're creating from the same journal entry. */
+ std::unique_lock l(fifo->m);
+ part_init(&op, fifo->info.params);
+ auto oid = fifo->info.part_oid(part_num);
+ l.unlock();
+ auto r = fifo->ioctx.aio_operate(oid, call(std::move(p)), &op);
+ ceph_assert(r >= 0);
+ return;
+ }
+
+ void remove_part(const DoutPrefixProvider *dpp, Ptr&& p, int64_t part_num) {
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " entering: tid=" << tid << dendl;
+ state = entry_callback;
+ lr::ObjectWriteOperation op;
+ op.remove();
+ std::unique_lock l(fifo->m);
+ auto oid = fifo->info.part_oid(part_num);
+ l.unlock();
+ auto r = fifo->ioctx.aio_operate(oid, call(std::move(p)), &op);
+ ceph_assert(r >= 0);
+ return;
+ }
+
+ void finish_je(const DoutPrefixProvider *dpp, Ptr&& p, int r,
+ const fifo::journal_entry& entry) {
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " entering: tid=" << tid << dendl;
+
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " finishing entry: entry=" << entry
+ << " tid=" << tid << dendl;
+
+ using enum fifo::journal_entry::Op;
+ if (entry.op == remove && r == -ENOENT)
+ r = 0;
+
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " processing entry failed: entry=" << entry
+ << " r=" << r << " tid=" << tid << dendl;
+ complete(std::move(p), r);
+ return;
+ } else {
+ switch (entry.op) {
+ case unknown:
+ case set_head:
+ // Can't happen. Filtered out in process.
+ complete(std::move(p), -EIO);
+ return;
+
+ case create:
+ if (entry.part_num > new_max) {
+ new_max = entry.part_num;
+ }
+ break;
+ case remove:
+ if (entry.part_num >= new_tail) {
+ new_tail = entry.part_num + 1;
+ }
+ break;
+ }
+ processed.push_back(entry);
+ }
+ ++iter;
+ process(dpp, std::move(p));
+ }
+
+ void postprocess(const DoutPrefixProvider *dpp, Ptr&& p) {
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " entering: tid=" << tid << dendl;
+ if (processed.empty()) {
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " nothing to update any more: race_retries="
+ << race_retries << " tid=" << tid << dendl;
+ complete(std::move(p), 0);
+ return;
+ }
+ pp_run(dpp, std::move(p), 0, false);
+ }
+
+public:
+
+ JournalProcessor(const DoutPrefixProvider *dpp, FIFO* fifo, std::uint64_t tid, lr::AioCompletion* super)
+ : Completion(dpp, super), fifo(fifo), tid(tid) {
+ std::unique_lock l(fifo->m);
+ journal = fifo->info.journal;
+ iter = journal.begin();
+ new_tail = fifo->info.tail_part_num;
+ new_head = fifo->info.head_part_num;
+ new_max = fifo->info.max_push_part_num;
+ }
+
+ void pp_run(const DoutPrefixProvider *dpp, Ptr&& p, int r, bool canceled) {
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " entering: tid=" << tid << dendl;
+ std::optional<int64_t> tail_part_num;
+ std::optional<int64_t> head_part_num;
+ std::optional<int64_t> max_part_num;
+
+    if (r < 0) {
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+			 << " failed: r=" << r << " tid=" << tid << dendl;
+      complete(std::move(p), r);
+      return;
+    }
+
+
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " postprocessing: race_retries="
+ << race_retries << " tid=" << tid << dendl;
+
+ if (!first_pp && r == 0 && !canceled) {
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " nothing to update any more: race_retries="
+ << race_retries << " tid=" << tid << dendl;
+ complete(std::move(p), 0);
+ return;
+ }
+
+ first_pp = false;
+
+ if (canceled) {
+ if (race_retries >= MAX_RACE_RETRIES) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " canceled too many times, giving up: tid="
+ << tid << dendl;
+ complete(std::move(p), -ECANCELED);
+ return;
+ }
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " update canceled, retrying: race_retries="
+ << race_retries << " tid=" << tid << dendl;
+
+ ++race_retries;
+
+ std::vector<fifo::journal_entry> new_processed;
+ std::unique_lock l(fifo->m);
+ for (auto& e : processed) {
+ if (fifo->info.journal.contains(e)) {
+ new_processed.push_back(e);
+ }
+ }
+ processed = std::move(new_processed);
+ }
+
+ std::unique_lock l(fifo->m);
+ auto objv = fifo->info.version;
+ if (new_tail > fifo->info.tail_part_num) {
+ tail_part_num = new_tail;
+ }
+
+ if (new_head > fifo->info.head_part_num) {
+ head_part_num = new_head;
+ }
+
+ if (new_max > fifo->info.max_push_part_num) {
+ max_part_num = new_max;
+ }
+ l.unlock();
+
+ if (processed.empty() &&
+ !tail_part_num &&
+ !max_part_num) {
+ /* nothing to update anymore */
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " nothing to update any more: race_retries="
+ << race_retries << " tid=" << tid << dendl;
+ complete(std::move(p), 0);
+ return;
+ }
+ state = pp_callback;
+ fifo->_update_meta(dpp, fifo::update{}
+ .tail_part_num(tail_part_num)
+ .head_part_num(head_part_num)
+ .max_push_part_num(max_part_num)
+ .journal_entries_rm(processed),
+ objv, &this->canceled, tid, call(std::move(p)));
+ return;
+ }
+
+ JournalProcessor(const JournalProcessor&) = delete;
+ JournalProcessor& operator =(const JournalProcessor&) = delete;
+ JournalProcessor(JournalProcessor&&) = delete;
+ JournalProcessor& operator =(JournalProcessor&&) = delete;
+
+ void process(const DoutPrefixProvider *dpp, Ptr&& p) {
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " entering: tid=" << tid << dendl;
+ while (iter != journal.end()) {
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " processing entry: entry=" << *iter
+ << " tid=" << tid << dendl;
+ const auto entry = *iter;
+ switch (entry.op) {
+ using enum fifo::journal_entry::Op;
+ case create:
+ create_part(dpp, std::move(p), entry.part_num);
+ return;
+ case set_head:
+ if (entry.part_num > new_head) {
+ new_head = entry.part_num;
+ }
+ processed.push_back(entry);
+ ++iter;
+ continue;
+ case remove:
+ remove_part(dpp, std::move(p), entry.part_num);
+ return;
+ default:
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " unknown journaled op: entry=" << entry << " tid="
+ << tid << dendl;
+ complete(std::move(p), -EIO);
+ return;
+ }
+ }
+ postprocess(dpp, std::move(p));
+ return;
+ }
+
+ void handle(const DoutPrefixProvider *dpp, Ptr&& p, int r) {
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " entering: tid=" << tid << dendl;
+ switch (state) {
+ case entry_callback:
+ finish_je(dpp, std::move(p), r, *iter);
+ return;
+ case pp_callback:
+ auto c = canceled;
+ canceled = false;
+ pp_run(dpp, std::move(p), r, c);
+ return;
+ }
+
+ abort();
+ }
+
+};
+
+void FIFO::process_journal(const DoutPrefixProvider *dpp, std::uint64_t tid, lr::AioCompletion* c) {
+ auto p = std::make_unique<JournalProcessor>(dpp, this, tid, c);
+ p->process(dpp, std::move(p));
+}
+
+struct Lister : Completion<Lister> {
+ FIFO* f;
+ std::vector<list_entry> result;
+ bool more = false;
+ std::int64_t part_num;
+ std::uint64_t ofs;
+ int max_entries;
+ int r_out = 0;
+ std::vector<fifo::part_list_entry> entries;
+ bool part_more = false;
+ bool part_full = false;
+ std::vector<list_entry>* entries_out;
+ bool* more_out;
+ std::uint64_t tid;
+
+ bool read = false;
+
+ void complete(Ptr&& p, int r) {
+ if (r >= 0) {
+ if (more_out) *more_out = more;
+ if (entries_out) *entries_out = std::move(result);
+ }
+ Completion::complete(std::move(p), r);
+ }
+
+public:
+ Lister(const DoutPrefixProvider *dpp, FIFO* f, std::int64_t part_num, std::uint64_t ofs, int max_entries,
+ std::vector<list_entry>* entries_out, bool* more_out,
+ std::uint64_t tid, lr::AioCompletion* super)
+ : Completion(dpp, super), f(f), part_num(part_num), ofs(ofs), max_entries(max_entries),
+ entries_out(entries_out), more_out(more_out), tid(tid) {
+ result.reserve(max_entries);
+ }
+
+ Lister(const Lister&) = delete;
+ Lister& operator =(const Lister&) = delete;
+ Lister(Lister&&) = delete;
+ Lister& operator =(Lister&&) = delete;
+
+ void handle(const DoutPrefixProvider *dpp, Ptr&& p, int r) {
+ if (read)
+ handle_read(std::move(p), r);
+ else
+ handle_list(dpp, std::move(p), r);
+ }
+
+ void list(Ptr&& p) {
+ if (max_entries > 0) {
+ part_more = false;
+ part_full = false;
+ entries.clear();
+
+ std::unique_lock l(f->m);
+ auto part_oid = f->info.part_oid(part_num);
+ l.unlock();
+
+ read = false;
+ auto op = list_part(f->cct, ofs, max_entries, &r_out,
+ &entries, &part_more, &part_full, tid);
+ f->ioctx.aio_operate(part_oid, call(std::move(p)), &op, nullptr);
+ } else {
+ complete(std::move(p), 0);
+ }
+ }
+
+ void handle_read(Ptr&& p, int r) {
+ read = false;
+ if (r >= 0) r = r_out;
+ r_out = 0;
+
+ if (r < 0) {
+ complete(std::move(p), r);
+ return;
+ }
+
+ if (part_num < f->info.tail_part_num) {
+ /* raced with trim? restart */
+ max_entries += result.size();
+ result.clear();
+ part_num = f->info.tail_part_num;
+ ofs = 0;
+ list(std::move(p));
+ return;
+ }
+ /* assuming part was not written yet, so end of data */
+ more = false;
+ complete(std::move(p), 0);
+ return;
+ }
+
+ void handle_list(const DoutPrefixProvider *dpp, Ptr&& p, int r) {
+ if (r >= 0) r = r_out;
+ r_out = 0;
+ if (r == -ENOENT) {
+ read = true;
+ f->read_meta(dpp, tid, call(std::move(p)));
+ return;
+ }
+ if (r < 0) {
+ complete(std::move(p), r);
+ return;
+ }
+
+ more = part_full || part_more;
+ for (auto& entry : entries) {
+ list_entry e;
+ e.data = std::move(entry.data);
+ e.marker = marker{part_num, entry.ofs}.to_string();
+ e.mtime = entry.mtime;
+ result.push_back(std::move(e));
+ }
+ max_entries -= entries.size();
+ entries.clear();
+ if (max_entries > 0 && part_more) {
+ list(std::move(p));
+ return;
+ }
+
+ if (!part_full) { /* head part is not full */
+ complete(std::move(p), 0);
+ return;
+ }
+ ++part_num;
+ ofs = 0;
+ list(std::move(p));
+ }
+};
+
+void FIFO::list(const DoutPrefixProvider *dpp, int max_entries,
+ std::optional<std::string_view> markstr,
+ std::vector<list_entry>* out,
+ bool* more,
+ lr::AioCompletion* c) {
+ std::unique_lock l(m);
+ auto tid = ++next_tid;
+ std::int64_t part_num = info.tail_part_num;
+ l.unlock();
+ std::uint64_t ofs = 0;
+ std::optional<::rgw::cls::fifo::marker> marker;
+
+ if (markstr) {
+ marker = to_marker(*markstr);
+ if (marker) {
+ part_num = marker->num;
+ ofs = marker->ofs;
+ }
+ }
+
+ auto ls = std::make_unique<Lister>(dpp, this, part_num, ofs, max_entries, out,
+ more, tid, c);
+ if (markstr && !marker) {
+ auto l = ls.get();
+ l->complete(std::move(ls), -EINVAL);
+ } else {
+ ls->list(std::move(ls));
+ }
+}
+}
diff --git a/src/rgw/driver/rados/cls_fifo_legacy.h b/src/rgw/driver/rados/cls_fifo_legacy.h
new file mode 100644
index 000000000..b0a68157e
--- /dev/null
+++ b/src/rgw/driver/rados/cls_fifo_legacy.h
@@ -0,0 +1,334 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2020 Red Hat <contact@redhat.com>
+ * Author: Adam C. Emerson
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include <cstdint>
+#include <deque>
+#include <map>
+#include <memory>
+#include <mutex>
+#include <optional>
+#include <string_view>
+#include <vector>
+
+#include <fmt/format.h>
+
+#include "include/rados/librados.hpp"
+#include "include/buffer.h"
+#include "include/function2.hpp"
+
+#include "common/async/yield_context.h"
+
+#include "cls/fifo/cls_fifo_types.h"
+#include "cls/fifo/cls_fifo_ops.h"
+
+#include "librados/AioCompletionImpl.h"
+
+#include "rgw_tools.h"
+
+namespace rgw::cls::fifo {
+namespace cb = ceph::buffer;
+namespace fifo = rados::cls::fifo;
+namespace lr = librados;
+
+inline constexpr std::uint64_t default_max_part_size = 4 * 1024 * 1024;
+inline constexpr std::uint64_t default_max_entry_size = 32 * 1024;
+
+void create_meta(lr::ObjectWriteOperation* op, std::string_view id,
+ std::optional<fifo::objv> objv,
+ std::optional<std::string_view> oid_prefix,
+ bool exclusive = false,
+ std::uint64_t max_part_size = default_max_part_size,
+ std::uint64_t max_entry_size = default_max_entry_size);
+int get_meta(const DoutPrefixProvider *dpp, lr::IoCtx& ioctx, const std::string& oid,
+ std::optional<fifo::objv> objv, fifo::info* info,
+ std::uint32_t* part_header_size,
+ std::uint32_t* part_entry_overhead,
+ std::uint64_t tid, optional_yield y,
+ bool probe = false);
+struct marker {
+ std::int64_t num = 0;
+ std::uint64_t ofs = 0;
+
+ marker() = default;
+ marker(std::int64_t num, std::uint64_t ofs) : num(num), ofs(ofs) {}
+ static marker max() {
+ return { std::numeric_limits<decltype(num)>::max(),
+ std::numeric_limits<decltype(ofs)>::max() };
+ }
+
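+  /// Renders two zero-padded 20-digit decimal fields, e.g. marker{2, 1024}
+  /// yields "00000000000000000002:00000000000000001024".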
+ std::string to_string() {
+ return fmt::format("{:0>20}:{:0>20}", num, ofs);
+ }
+};
+
+struct list_entry {
+ cb::list data;
+ std::string marker;
+ ceph::real_time mtime;
+};
+
+using part_info = fifo::part_header;
+
+/// This is an implementation of FIFO using librados to facilitate
+/// backports. Please see /src/neorados/cls/fifo.h for full
+/// information.
+///
+/// This library uses optional_yield. Please see
+/// /src/common/async/yield_context.h. In summary, optional_yield
+/// contains either a spawn::yield_context (in which case the current
+/// coroutine is suspended until completion) or null_yield (in which
+/// case the current thread is blocked until completion.)
+///
+/// Please see the librados documentation for information on
+/// AioCompletion and IoCtx.
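+///
+/// A minimal blocking-style usage sketch (illustrative only; `dpp`,
+/// `ioctx` and the object name are assumed to exist in the caller):
+///
+///   std::unique_ptr<rgw::cls::fifo::FIFO> f;
+///   int r = rgw::cls::fifo::FIFO::create(dpp, ioctx, "example.fifo", &f,
+///                                        null_yield);
+///   if (r >= 0) {
+///     ceph::buffer::list bl;
+///     bl.append("payload");
+///     r = f->push(dpp, bl, null_yield);
+///   }
+///   std::vector<rgw::cls::fifo::list_entry> entries;
+///   bool more = false;
+///   if (r >= 0)
+///     r = f->list(dpp, 100, std::nullopt, &entries, &more, null_yield);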
+
+class FIFO {
+ friend struct Reader;
+ friend struct Updater;
+ friend struct Trimmer;
+ friend struct InfoGetter;
+ friend struct Pusher;
+ friend struct NewPartPreparer;
+ friend struct NewHeadPreparer;
+ friend struct JournalProcessor;
+ friend struct Lister;
+
+ mutable lr::IoCtx ioctx;
+ CephContext* cct = static_cast<CephContext*>(ioctx.cct());
+ const std::string oid;
+ std::mutex m;
+ std::uint64_t next_tid = 0;
+
+ fifo::info info;
+
+ std::uint32_t part_header_size = 0xdeadbeef;
+ std::uint32_t part_entry_overhead = 0xdeadbeef;
+
+ std::optional<marker> to_marker(std::string_view s);
+
+ FIFO(lr::IoCtx&& ioc,
+ std::string oid)
+ : ioctx(std::move(ioc)), oid(oid) {}
+
+ int apply_update(const DoutPrefixProvider *dpp,
+ fifo::info* info,
+ const fifo::objv& objv,
+ const fifo::update& update,
+ std::uint64_t tid);
+ int _update_meta(const DoutPrefixProvider *dpp, const fifo::update& update,
+ fifo::objv version, bool* pcanceled,
+ std::uint64_t tid, optional_yield y);
+ void _update_meta(const DoutPrefixProvider *dpp, const fifo::update& update,
+ fifo::objv version, bool* pcanceled,
+ std::uint64_t tid, lr::AioCompletion* c);
+ int create_part(const DoutPrefixProvider *dpp, int64_t part_num, std::uint64_t tid,
+ optional_yield y);
+ int remove_part(const DoutPrefixProvider *dpp, int64_t part_num, std::uint64_t tid,
+ optional_yield y);
+ int process_journal(const DoutPrefixProvider *dpp, std::uint64_t tid, optional_yield y);
+ void process_journal(const DoutPrefixProvider *dpp, std::uint64_t tid, lr::AioCompletion* c);
+ int _prepare_new_part(const DoutPrefixProvider *dpp, std::int64_t new_part_num, bool is_head, std::uint64_t tid, optional_yield y);
+ void _prepare_new_part(const DoutPrefixProvider *dpp, std::int64_t new_part_num, bool is_head, std::uint64_t tid, lr::AioCompletion* c);
+ int _prepare_new_head(const DoutPrefixProvider *dpp, std::int64_t new_head_part_num,
+ std::uint64_t tid, optional_yield y);
+ void _prepare_new_head(const DoutPrefixProvider *dpp, std::int64_t new_head_part_num, std::uint64_t tid, lr::AioCompletion* c);
+ int push_entries(const DoutPrefixProvider *dpp, const std::deque<cb::list>& data_bufs,
+ std::uint64_t tid, optional_yield y);
+ void push_entries(const std::deque<cb::list>& data_bufs,
+ std::uint64_t tid, lr::AioCompletion* c);
+ int trim_part(const DoutPrefixProvider *dpp, int64_t part_num, uint64_t ofs,
+ bool exclusive, std::uint64_t tid, optional_yield y);
+ void trim_part(const DoutPrefixProvider *dpp, int64_t part_num, uint64_t ofs,
+ bool exclusive, std::uint64_t tid, lr::AioCompletion* c);
+
+ /// Force refresh of metadata, yielding/blocking style
+ int read_meta(const DoutPrefixProvider *dpp, std::uint64_t tid, optional_yield y);
+ /// Force refresh of metadata, with a librados Completion
+ void read_meta(const DoutPrefixProvider *dpp, std::uint64_t tid, lr::AioCompletion* c);
+
+public:
+
+ FIFO(const FIFO&) = delete;
+ FIFO& operator =(const FIFO&) = delete;
+ FIFO(FIFO&&) = delete;
+ FIFO& operator =(FIFO&&) = delete;
+
+ /// Open an existing FIFO.
+ static int open(const DoutPrefixProvider *dpp, lr::IoCtx ioctx, //< IO Context
+ std::string oid, //< OID for metadata object
+ std::unique_ptr<FIFO>* fifo, //< OUT: Pointer to FIFO object
+ optional_yield y, //< Optional yield context
+ /// Operation will fail if FIFO is not at this version
+ std::optional<fifo::objv> objv = std::nullopt,
+ /// Probing for existence, don't print errors if we
+ /// can't find it.
+ bool probe = false);
+ /// Create a new or open an existing FIFO.
+ static int create(const DoutPrefixProvider *dpp, lr::IoCtx ioctx, //< IO Context
+ std::string oid, //< OID for metadata object
+ std::unique_ptr<FIFO>* fifo, //< OUT: Pointer to FIFO object
+ optional_yield y, //< Optional yield context
+ /// Operation will fail if the FIFO exists and is
+ /// not of this version.
+ std::optional<fifo::objv> objv = std::nullopt,
+ /// Prefix for all objects
+ std::optional<std::string_view> oid_prefix = std::nullopt,
+ /// Fail if the FIFO already exists
+ bool exclusive = false,
+ /// Maximum allowed size of parts
+ std::uint64_t max_part_size = default_max_part_size,
+ /// Maximum allowed size of entries
+ std::uint64_t max_entry_size = default_max_entry_size);
+
+ /// Force refresh of metadata, yielding/blocking style
+ int read_meta(const DoutPrefixProvider *dpp, optional_yield y);
+ /// Get currently known metadata
+ const fifo::info& meta() const;
+ /// Get partition header and entry overhead size
+ std::pair<std::uint32_t, std::uint32_t> get_part_layout_info() const;
+ /// Push an entry to the FIFO
+ int push(const DoutPrefixProvider *dpp,
+ const cb::list& bl, //< Entry to push
+ optional_yield y //< Optional yield
+ );
+ /// Push an entry to the FIFO
+ void push(const DoutPrefixProvider *dpp, const cb::list& bl, //< Entry to push
+ lr::AioCompletion* c //< Async Completion
+ );
+ /// Push entries to the FIFO
+ int push(const DoutPrefixProvider *dpp,
+ const std::vector<cb::list>& data_bufs, //< Entries to push
+ optional_yield y //< Optional yield
+ );
+ /// Push entries to the FIFO
+ void push(const DoutPrefixProvider *dpp, const std::vector<cb::list>& data_bufs, //< Entries to push
+ lr::AioCompletion* c //< Async Completion
+ );
+ /// List entries
+ int list(const DoutPrefixProvider *dpp,
+ int max_entries, //< Maximum entries to list
+ /// Point after which to begin listing. Start at tail if null
+ std::optional<std::string_view> markstr,
+ std::vector<list_entry>* out, //< OUT: entries
+ /// OUT: True if more entries in FIFO beyond the last returned
+ bool* more,
+ optional_yield y //< Optional yield
+ );
+ void list(const DoutPrefixProvider *dpp,
+ int max_entries, //< Maximum entries to list
+ /// Point after which to begin listing. Start at tail if null
+ std::optional<std::string_view> markstr,
+ std::vector<list_entry>* out, //< OUT: entries
+ /// OUT: True if more entries in FIFO beyond the last returned
+ bool* more,
+ lr::AioCompletion* c //< Async Completion
+ );
+ /// Trim entries, coroutine/block style
+ int trim(const DoutPrefixProvider *dpp,
+ std::string_view markstr, //< Position to which to trim, inclusive
+ bool exclusive, //< If true, do not trim the target entry
+ //< itself, just all those before it.
+ optional_yield y //< Optional yield
+ );
+ /// Trim entries, librados AioCompletion style
+ void trim(const DoutPrefixProvider *dpp,
+ std::string_view markstr, //< Position to which to trim, inclusive
+ bool exclusive, //< If true, do not trim the target entry
+ //< itself, just all those before it.
+ lr::AioCompletion* c //< librados AIO Completion
+ );
+ /// Get part info
+ int get_part_info(const DoutPrefixProvider *dpp, int64_t part_num, /// Part number
+ fifo::part_header* header, //< OUT: Information
+ optional_yield y //< Optional yield
+ );
+ /// Get part info
+ void get_part_info(int64_t part_num, //< Part number
+ fifo::part_header* header, //< OUT: Information
+ lr::AioCompletion* c //< AIO Completion
+ );
+ /// A convenience method to fetch the part information for the FIFO
+ /// head, using librados::AioCompletion, since
+  /// librados::AioCompletions compose poorly.
+ void get_head_info(const DoutPrefixProvider *dpp, fu2::unique_function< //< Function to receive info
+ void(int r, fifo::part_header&&)>,
+ lr::AioCompletion* c //< AIO Completion
+ );
+};
+
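+/// CRTP helper that chains librados AioCompletions while holding a
+/// reference to the caller's "super" completion. A subclass implements
+/// handle(); each asynchronous step passes call() as its completion, and
+/// the chain ends with complete(), which signals the super completion.
+/// A minimal illustrative sketch (not itself part of this API):
+///
+///   struct Example : Completion<Example> {
+///     Example(const DoutPrefixProvider* dpp, lr::AioCompletion* super)
+///       : Completion(dpp, super) {}
+///     void handle(const DoutPrefixProvider* dpp, Ptr&& p, int r) {
+///       // chain another op: ioctx.aio_operate(oid, call(std::move(p)), &op);
+///       // or finish and complete the super completion:
+///       complete(std::move(p), r);
+///     }
+///   };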
+template<typename T>
+struct Completion {
+private:
+ const DoutPrefixProvider *_dpp;
+ lr::AioCompletion* _cur = nullptr;
+ lr::AioCompletion* _super;
+public:
+
+ using Ptr = std::unique_ptr<T>;
+
+ lr::AioCompletion* cur() const {
+ return _cur;
+ }
+ lr::AioCompletion* super() const {
+ return _super;
+ }
+
+ Completion(const DoutPrefixProvider *dpp, lr::AioCompletion* super) : _dpp(dpp), _super(super) {
+ super->pc->get();
+ }
+
+ ~Completion() {
+ if (_super) {
+ _super->pc->put();
+ }
+ if (_cur)
+ _cur->release();
+ _super = nullptr;
+ _cur = nullptr;
+ }
+
+ // The only times that aio_operate can return an error are:
+ // 1. The completion contains a null pointer. This should just
+ // crash, and in our case it does.
+ // 2. An attempt is made to write to a snapshot. RGW doesn't use
+ // snapshots, so we don't care.
+ //
+ // So we will just assert that initiating an Aio operation succeeds
+ // and not worry about recovering.
+ static lr::AioCompletion* call(Ptr&& p) {
+ p->_cur = lr::Rados::aio_create_completion(static_cast<void*>(p.get()),
+ &cb);
+ auto c = p->_cur;
+ p.release();
+ return c;
+ }
+ static void complete(Ptr&& p, int r) {
+ auto c = p->_super;
+ p->_super = nullptr;
+ rgw_complete_aio_completion(c, r);
+ }
+
+ static void cb(lr::completion_t, void* arg) {
+ auto t = static_cast<T*>(arg);
+ auto r = t->_cur->get_return_value();
+ t->_cur->release();
+ t->_cur = nullptr;
+ t->handle(t->_dpp, Ptr(t), r);
+ }
+};
+
+}
diff --git a/src/rgw/driver/rados/config/impl.cc b/src/rgw/driver/rados/config/impl.cc
new file mode 100644
index 000000000..f1b2befad
--- /dev/null
+++ b/src/rgw/driver/rados/config/impl.cc
@@ -0,0 +1,129 @@
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2022 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "impl.h"
+
+#include "common/async/yield_context.h"
+#include "common/errno.h"
+#include "rgw_string.h"
+#include "rgw_zone.h"
+
+namespace rgw::rados {
+
+// default pool names
+constexpr std::string_view default_zone_root_pool = "rgw.root";
+constexpr std::string_view default_zonegroup_root_pool = "rgw.root";
+constexpr std::string_view default_realm_root_pool = "rgw.root";
+constexpr std::string_view default_period_root_pool = "rgw.root";
+
+static rgw_pool default_pool(std::string_view name,
+ std::string_view default_name)
+{
+ return std::string{name_or_default(name, default_name)};
+}
+
+ConfigImpl::ConfigImpl(const ceph::common::ConfigProxy& conf)
+ : realm_pool(default_pool(conf->rgw_realm_root_pool,
+ default_realm_root_pool)),
+ period_pool(default_pool(conf->rgw_period_root_pool,
+ default_period_root_pool)),
+ zonegroup_pool(default_pool(conf->rgw_zonegroup_root_pool,
+ default_zonegroup_root_pool)),
+ zone_pool(default_pool(conf->rgw_zone_root_pool,
+ default_zone_root_pool))
+{
+}
+
+int ConfigImpl::read(const DoutPrefixProvider* dpp, optional_yield y,
+ const rgw_pool& pool, const std::string& oid,
+ bufferlist& bl, RGWObjVersionTracker* objv)
+{
+ librados::IoCtx ioctx;
+ int r = rgw_init_ioctx(dpp, &rados, pool, ioctx, true, false);
+ if (r < 0) {
+ return r;
+ }
+ librados::ObjectReadOperation op;
+ if (objv) {
+ objv->prepare_op_for_read(&op);
+ }
+ op.read(0, 0, &bl, nullptr);
+ return rgw_rados_operate(dpp, ioctx, oid, &op, nullptr, y);
+}
+
+int ConfigImpl::write(const DoutPrefixProvider* dpp, optional_yield y,
+ const rgw_pool& pool, const std::string& oid,
+ Create create, const bufferlist& bl,
+ RGWObjVersionTracker* objv)
+{
+ librados::IoCtx ioctx;
+ int r = rgw_init_ioctx(dpp, &rados, pool, ioctx, true, false);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::ObjectWriteOperation op;
+ switch (create) {
+ case Create::MustNotExist: op.create(true); break;
+ case Create::MayExist: op.create(false); break;
+ case Create::MustExist: op.assert_exists(); break;
+ }
+ if (objv) {
+ objv->prepare_op_for_write(&op);
+ }
+ op.write_full(bl);
+
+ r = rgw_rados_operate(dpp, ioctx, oid, &op, y);
+ if (r >= 0 && objv) {
+ objv->apply_write();
+ }
+ return r;
+}
+
+int ConfigImpl::remove(const DoutPrefixProvider* dpp, optional_yield y,
+ const rgw_pool& pool, const std::string& oid,
+ RGWObjVersionTracker* objv)
+{
+ librados::IoCtx ioctx;
+ int r = rgw_init_ioctx(dpp, &rados, pool, ioctx, true, false);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::ObjectWriteOperation op;
+ if (objv) {
+ objv->prepare_op_for_write(&op);
+ }
+ op.remove();
+
+ r = rgw_rados_operate(dpp, ioctx, oid, &op, y);
+ if (r >= 0 && objv) {
+ objv->apply_write();
+ }
+ return r;
+}
+
+int ConfigImpl::notify(const DoutPrefixProvider* dpp, optional_yield y,
+ const rgw_pool& pool, const std::string& oid,
+ bufferlist& bl, uint64_t timeout_ms)
+{
+ librados::IoCtx ioctx;
+ int r = rgw_init_ioctx(dpp, &rados, pool, ioctx, true, false);
+ if (r < 0) {
+ return r;
+ }
+ return rgw_rados_notify(dpp, ioctx, oid, bl, timeout_ms, nullptr, y);
+}
+
+} // namespace rgw::rados
diff --git a/src/rgw/driver/rados/config/impl.h b/src/rgw/driver/rados/config/impl.h
new file mode 100644
index 000000000..3aed451f9
--- /dev/null
+++ b/src/rgw/driver/rados/config/impl.h
@@ -0,0 +1,139 @@
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2022 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include "include/rados/librados.hpp"
+#include "common/dout.h"
+#include "rgw_basic_types.h"
+#include "rgw_tools.h"
+#include "rgw_sal_config.h"
+
+namespace rgw::rados {
+
+// write options that control object creation
+enum class Create {
+ MustNotExist, // fail with EEXIST if the object already exists
+ MayExist, // create if the object didn't exist, overwrite if it did
+ MustExist, // fail with ENOENT if the object doesn't exist
+};
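+// In impl.cc these map onto librados ObjectWriteOperation calls:
+// MustNotExist -> create(true), MayExist -> create(false),
+// MustExist -> assert_exists().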
+
+struct ConfigImpl {
+ librados::Rados rados;
+
+ const rgw_pool realm_pool;
+ const rgw_pool period_pool;
+ const rgw_pool zonegroup_pool;
+ const rgw_pool zone_pool;
+
+ ConfigImpl(const ceph::common::ConfigProxy& conf);
+
+ int read(const DoutPrefixProvider* dpp, optional_yield y,
+ const rgw_pool& pool, const std::string& oid,
+ bufferlist& bl, RGWObjVersionTracker* objv);
+
+ template <typename T>
+ int read(const DoutPrefixProvider* dpp, optional_yield y,
+ const rgw_pool& pool, const std::string& oid,
+ T& data, RGWObjVersionTracker* objv)
+ {
+ bufferlist bl;
+ int r = read(dpp, y, pool, oid, bl, objv);
+ if (r < 0) {
+ return r;
+ }
+ try {
+ auto p = bl.cbegin();
+ decode(data, p);
+ } catch (const buffer::error& err) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to decode obj from "
+ << pool << ":" << oid << dendl;
+ return -EIO;
+ }
+ return 0;
+ }
+
+ int write(const DoutPrefixProvider* dpp, optional_yield y,
+ const rgw_pool& pool, const std::string& oid, Create create,
+ const bufferlist& bl, RGWObjVersionTracker* objv);
+
+ template <typename T>
+ int write(const DoutPrefixProvider* dpp, optional_yield y,
+ const rgw_pool& pool, const std::string& oid, Create create,
+ const T& data, RGWObjVersionTracker* objv)
+ {
+ bufferlist bl;
+ encode(data, bl);
+
+ return write(dpp, y, pool, oid, create, bl, objv);
+ }
+
+ int remove(const DoutPrefixProvider* dpp, optional_yield y,
+ const rgw_pool& pool, const std::string& oid,
+ RGWObjVersionTracker* objv);
+
+ int list(const DoutPrefixProvider* dpp, optional_yield y,
+ const rgw_pool& pool, const std::string& marker,
+ std::regular_invocable<std::string> auto filter,
+ std::span<std::string> entries,
+ sal::ListResult<std::string>& result)
+ {
+ librados::IoCtx ioctx;
+ int r = rgw_init_ioctx(dpp, &rados, pool, ioctx, true, false);
+ if (r < 0) {
+ return r;
+ }
+ librados::ObjectCursor oc;
+ if (!oc.from_str(marker)) {
+ ldpp_dout(dpp, 10) << "failed to parse cursor: " << marker << dendl;
+ return -EINVAL;
+ }
+ std::size_t count = 0;
+ try {
+ auto iter = ioctx.nobjects_begin(oc);
+ const auto end = ioctx.nobjects_end();
+ for (; count < entries.size() && iter != end; ++iter) {
+ std::string entry = filter(iter->get_oid());
+ if (!entry.empty()) {
+ entries[count++] = std::move(entry);
+ }
+ }
+ if (iter == end) {
+ result.next.clear();
+ } else {
+ result.next = iter.get_cursor().to_str();
+ }
+ } catch (const std::exception& e) {
+ ldpp_dout(dpp, 10) << "NObjectIterator exception " << e.what() << dendl;
+ return -EIO;
+ }
+ result.entries = entries.first(count);
+ return 0;
+ }
+
+ int notify(const DoutPrefixProvider* dpp, optional_yield y,
+ const rgw_pool& pool, const std::string& oid,
+ bufferlist& bl, uint64_t timeout_ms);
+};
+
+inline std::string_view name_or_default(std::string_view name,
+ std::string_view default_name)
+{
+ if (!name.empty()) {
+ return name;
+ }
+ return default_name;
+}
+
+} // namespace rgw::rados
diff --git a/src/rgw/driver/rados/config/period.cc b/src/rgw/driver/rados/config/period.cc
new file mode 100644
index 000000000..bc3fa27e7
--- /dev/null
+++ b/src/rgw/driver/rados/config/period.cc
@@ -0,0 +1,230 @@
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2022 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "common/dout.h"
+#include "common/errno.h"
+#include "rgw_zone.h"
+#include "driver/rados/config/store.h"
+
+#include "impl.h"
+
+namespace rgw::rados {
+
+// period oids
+constexpr std::string_view period_info_oid_prefix = "periods.";
+constexpr std::string_view period_latest_epoch_info_oid = ".latest_epoch";
+constexpr std::string_view period_staging_suffix = ":staging";
+
+static std::string period_oid(std::string_view period_id, uint32_t epoch)
+{
+ // omit the epoch for the staging period
+ if (period_id.ends_with(period_staging_suffix)) {
+ return string_cat_reserve(period_info_oid_prefix, period_id);
+ }
+ return fmt::format("{}{}.{}", period_info_oid_prefix, period_id, epoch);
+}
+
+static std::string latest_epoch_oid(const ceph::common::ConfigProxy& conf,
+ std::string_view period_id)
+{
+ return string_cat_reserve(
+ period_info_oid_prefix, period_id,
+ name_or_default(conf->rgw_period_latest_epoch_info_oid,
+ period_latest_epoch_info_oid));
+}
+
+static int read_latest_epoch(const DoutPrefixProvider* dpp, optional_yield y,
+ ConfigImpl* impl, std::string_view period_id,
+ uint32_t& epoch, RGWObjVersionTracker* objv)
+{
+ const auto& pool = impl->period_pool;
+ const auto latest_oid = latest_epoch_oid(dpp->get_cct()->_conf, period_id);
+ RGWPeriodLatestEpochInfo latest;
+ int r = impl->read(dpp, y, pool, latest_oid, latest, objv);
+ if (r >= 0) {
+ epoch = latest.epoch;
+ }
+ return r;
+}
+
+static int write_latest_epoch(const DoutPrefixProvider* dpp, optional_yield y,
+ ConfigImpl* impl, bool exclusive,
+ std::string_view period_id, uint32_t epoch,
+ RGWObjVersionTracker* objv)
+{
+ const auto& pool = impl->period_pool;
+ const auto latest_oid = latest_epoch_oid(dpp->get_cct()->_conf, period_id);
+ const auto create = exclusive ? Create::MustNotExist : Create::MayExist;
+ RGWPeriodLatestEpochInfo latest{epoch};
+ return impl->write(dpp, y, pool, latest_oid, create, latest, objv);
+}
+
+static int delete_latest_epoch(const DoutPrefixProvider* dpp, optional_yield y,
+ ConfigImpl* impl, std::string_view period_id,
+ RGWObjVersionTracker* objv)
+{
+ const auto& pool = impl->period_pool;
+ const auto latest_oid = latest_epoch_oid(dpp->get_cct()->_conf, period_id);
+ return impl->remove(dpp, y, pool, latest_oid, objv);
+}
+
+static int update_latest_epoch(const DoutPrefixProvider* dpp, optional_yield y,
+ ConfigImpl* impl, std::string_view period_id,
+ uint32_t epoch)
+{
+ static constexpr int MAX_RETRIES = 20;
+
+ for (int i = 0; i < MAX_RETRIES; i++) {
+ uint32_t existing_epoch = 0;
+ RGWObjVersionTracker objv;
+ bool exclusive = false;
+
+ // read existing epoch
+ int r = read_latest_epoch(dpp, y, impl, period_id, existing_epoch, &objv);
+ if (r == -ENOENT) {
+ // use an exclusive create to set the epoch atomically
+ exclusive = true;
+ objv.generate_new_write_ver(dpp->get_cct());
+ ldpp_dout(dpp, 20) << "creating initial latest_epoch=" << epoch
+ << " for period=" << period_id << dendl;
+ } else if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to read latest_epoch" << dendl;
+ return r;
+ } else if (epoch <= existing_epoch) {
+ r = -EEXIST; // fail with EEXIST if epoch is not newer
+ ldpp_dout(dpp, 10) << "found existing latest_epoch " << existing_epoch
+ << " >= given epoch " << epoch << ", returning r=" << r << dendl;
+ return r;
+ } else {
+ ldpp_dout(dpp, 20) << "updating latest_epoch from " << existing_epoch
+ << " -> " << epoch << " on period=" << period_id << dendl;
+ }
+
+ r = write_latest_epoch(dpp, y, impl, exclusive, period_id, epoch, &objv);
+ if (r == -EEXIST) {
+ continue; // exclusive create raced with another update, retry
+ } else if (r == -ECANCELED) {
+ continue; // write raced with a conflicting version, retry
+ }
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to write latest_epoch" << dendl;
+ return r;
+ }
+ return 0; // return success
+ }
+
+ return -ECANCELED; // fail after max retries
+}
+
+int RadosConfigStore::create_period(const DoutPrefixProvider* dpp,
+ optional_yield y, bool exclusive,
+ const RGWPeriod& info)
+{
+ if (info.get_id().empty()) {
+ ldpp_dout(dpp, 0) << "period cannot have an empty id" << dendl;
+ return -EINVAL;
+ }
+ if (info.get_epoch() == 0) {
+ ldpp_dout(dpp, 0) << "period cannot have an empty epoch" << dendl;
+ return -EINVAL;
+ }
+ const auto& pool = impl->period_pool;
+ const auto info_oid = period_oid(info.get_id(), info.get_epoch());
+ const auto create = exclusive ? Create::MustNotExist : Create::MayExist;
+ RGWObjVersionTracker objv;
+ objv.generate_new_write_ver(dpp->get_cct());
+ int r = impl->write(dpp, y, pool, info_oid, create, info, &objv);
+ if (r < 0) {
+ return r;
+ }
+
+ (void) update_latest_epoch(dpp, y, impl.get(), info.get_id(), info.get_epoch());
+ return 0;
+}
+
+int RadosConfigStore::read_period(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string_view period_id,
+ std::optional<uint32_t> epoch,
+ RGWPeriod& info)
+{
+ int r = 0;
+ if (!epoch) {
+ epoch = 0;
+ r = read_latest_epoch(dpp, y, impl.get(), period_id, *epoch, nullptr);
+ if (r < 0) {
+ return r;
+ }
+ }
+
+ const auto& pool = impl->period_pool;
+ const auto info_oid = period_oid(period_id, *epoch);
+ return impl->read(dpp, y, pool, info_oid, info, nullptr);
+}
+
+int RadosConfigStore::delete_period(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string_view period_id)
+{
+ const auto& pool = impl->period_pool;
+
+ // read the latest_epoch
+ uint32_t latest_epoch = 0;
+ RGWObjVersionTracker latest_objv;
+ int r = read_latest_epoch(dpp, y, impl.get(), period_id,
+ latest_epoch, &latest_objv);
+ if (r < 0 && r != -ENOENT) { // just delete epoch=0 on ENOENT
+ ldpp_dout(dpp, 0) << "failed to read latest epoch for period "
+ << period_id << ": " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ for (uint32_t epoch = 0; epoch <= latest_epoch; epoch++) {
+ const auto info_oid = period_oid(period_id, epoch);
+ r = impl->remove(dpp, y, pool, info_oid, nullptr);
+ if (r < 0 && r != -ENOENT) { // ignore ENOENT
+ ldpp_dout(dpp, 0) << "failed to delete period " << info_oid
+ << ": " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ }
+
+ return delete_latest_epoch(dpp, y, impl.get(), period_id, &latest_objv);
+}
+
+int RadosConfigStore::list_period_ids(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ const std::string& marker,
+ std::span<std::string> entries,
+ sal::ListResult<std::string>& result)
+{
+ const auto& pool = impl->period_pool;
+ constexpr auto prefix = [] (std::string oid) -> std::string {
+ if (!oid.starts_with(period_info_oid_prefix)) {
+ return {};
+ }
+ if (!oid.ends_with(period_latest_epoch_info_oid)) {
+ return {};
+ }
+ // trim the prefix and suffix
+ const std::size_t count = oid.size() -
+ period_info_oid_prefix.size() -
+ period_latest_epoch_info_oid.size();
+ return oid.substr(period_info_oid_prefix.size(), count);
+ };
+
+ return impl->list(dpp, y, pool, marker, prefix, entries, result);
+}
+
+} // namespace rgw::rados
diff --git a/src/rgw/driver/rados/config/period_config.cc b/src/rgw/driver/rados/config/period_config.cc
new file mode 100644
index 000000000..ec984ebdc
--- /dev/null
+++ b/src/rgw/driver/rados/config/period_config.cc
@@ -0,0 +1,55 @@
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2022 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "rgw_zone.h"
+#include "driver/rados/config/store.h"
+
+#include "impl.h"
+
+namespace rgw::rados {
+
+// period config oids
+constexpr std::string_view period_config_prefix = "period_config.";
+constexpr std::string_view period_config_realm_default = "default";
+
+std::string period_config_oid(std::string_view realm_id)
+{
+ if (realm_id.empty()) {
+ realm_id = period_config_realm_default;
+ }
+ return string_cat_reserve(period_config_prefix, realm_id);
+}
+
+int RadosConfigStore::read_period_config(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string_view realm_id,
+ RGWPeriodConfig& info)
+{
+ const auto& pool = impl->period_pool;
+ const auto oid = period_config_oid(realm_id);
+ return impl->read(dpp, y, pool, oid, info, nullptr);
+}
+
+int RadosConfigStore::write_period_config(const DoutPrefixProvider* dpp,
+ optional_yield y, bool exclusive,
+ std::string_view realm_id,
+ const RGWPeriodConfig& info)
+{
+ const auto& pool = impl->period_pool;
+ const auto oid = period_config_oid(realm_id);
+ const auto create = exclusive ? Create::MustNotExist : Create::MayExist;
+ return impl->write(dpp, y, pool, oid, create, info, nullptr);
+}
+
+} // namespace rgw::rados
diff --git a/src/rgw/driver/rados/config/realm.cc b/src/rgw/driver/rados/config/realm.cc
new file mode 100644
index 000000000..331e0ffd2
--- /dev/null
+++ b/src/rgw/driver/rados/config/realm.cc
@@ -0,0 +1,364 @@
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2022 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "common/dout.h"
+#include "common/errno.h"
+#include "rgw_realm_watcher.h"
+#include "rgw_zone.h"
+#include "driver/rados/config/store.h"
+
+#include "impl.h"
+
+namespace rgw::rados {
+
+// realm oids
+constexpr std::string_view realm_names_oid_prefix = "realms_names.";
+constexpr std::string_view realm_info_oid_prefix = "realms.";
+constexpr std::string_view realm_control_oid_suffix = ".control";
+constexpr std::string_view default_realm_info_oid = "default.realm";
+
+static std::string realm_info_oid(std::string_view realm_id)
+{
+ return string_cat_reserve(realm_info_oid_prefix, realm_id);
+}
+static std::string realm_name_oid(std::string_view realm_id)
+{
+ return string_cat_reserve(realm_names_oid_prefix, realm_id);
+}
+static std::string realm_control_oid(std::string_view realm_id)
+{
+ return string_cat_reserve(realm_info_oid_prefix, realm_id,
+ realm_control_oid_suffix);
+}
+static std::string default_realm_oid(const ceph::common::ConfigProxy& conf)
+{
+ return std::string{name_or_default(conf->rgw_default_realm_info_oid,
+ default_realm_info_oid)};
+}
+
+
+int RadosConfigStore::write_default_realm_id(const DoutPrefixProvider* dpp,
+ optional_yield y, bool exclusive,
+ std::string_view realm_id)
+{
+ const auto& pool = impl->realm_pool;
+ const auto oid = default_realm_oid(dpp->get_cct()->_conf);
+ const auto create = exclusive ? Create::MustNotExist : Create::MayExist;
+
+ RGWDefaultSystemMetaObjInfo default_info;
+ default_info.default_id = realm_id;
+
+ return impl->write(dpp, y, pool, oid, create, default_info, nullptr);
+}
+
+int RadosConfigStore::read_default_realm_id(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string& realm_id)
+{
+ const auto& pool = impl->realm_pool;
+ const auto oid = default_realm_oid(dpp->get_cct()->_conf);
+
+ RGWDefaultSystemMetaObjInfo default_info;
+ int r = impl->read(dpp, y, pool, oid, default_info, nullptr);
+ if (r >= 0) {
+ realm_id = default_info.default_id;
+ }
+ return r;
+}
+
+int RadosConfigStore::delete_default_realm_id(const DoutPrefixProvider* dpp,
+ optional_yield y)
+{
+ const auto& pool = impl->realm_pool;
+ const auto oid = default_realm_oid(dpp->get_cct()->_conf);
+
+ return impl->remove(dpp, y, pool, oid, nullptr);
+}
+
+
+class RadosRealmWriter : public sal::RealmWriter {
+ ConfigImpl* impl;
+ RGWObjVersionTracker objv;
+ std::string realm_id;
+ std::string realm_name;
+ public:
+ RadosRealmWriter(ConfigImpl* impl, RGWObjVersionTracker objv,
+ std::string_view realm_id, std::string_view realm_name)
+ : impl(impl), objv(std::move(objv)),
+ realm_id(realm_id), realm_name(realm_name)
+ {
+ }
+
+ int write(const DoutPrefixProvider* dpp, optional_yield y,
+ const RGWRealm& info) override
+ {
+ if (realm_id != info.get_id() || realm_name != info.get_name()) {
+ return -EINVAL; // can't modify realm id or name directly
+ }
+
+ const auto& pool = impl->realm_pool;
+ const auto info_oid = realm_info_oid(info.get_id());
+ return impl->write(dpp, y, pool, info_oid, Create::MustExist, info, &objv);
+ }
+
+ int rename(const DoutPrefixProvider* dpp, optional_yield y,
+ RGWRealm& info, std::string_view new_name) override
+ {
+ if (realm_id != info.get_id() || realm_name != info.get_name()) {
+ return -EINVAL; // can't modify realm id or name directly
+ }
+ if (new_name.empty()) {
+ ldpp_dout(dpp, 0) << "realm cannot have an empty name" << dendl;
+ return -EINVAL;
+ }
+
+ const auto& pool = impl->realm_pool;
+ const auto name = RGWNameToId{info.get_id()};
+ const auto info_oid = realm_info_oid(info.get_id());
+ const auto old_oid = realm_name_oid(info.get_name());
+ const auto new_oid = realm_name_oid(new_name);
+
+ // link the new name
+ RGWObjVersionTracker new_objv;
+ new_objv.generate_new_write_ver(dpp->get_cct());
+ int r = impl->write(dpp, y, pool, new_oid, Create::MustNotExist,
+ name, &new_objv);
+ if (r < 0) {
+ return r;
+ }
+
+ // write the info with updated name
+ info.set_name(std::string{new_name});
+ r = impl->write(dpp, y, pool, info_oid, Create::MustExist, info, &objv);
+ if (r < 0) {
+ // on failure, unlink the new name
+ (void) impl->remove(dpp, y, pool, new_oid, &new_objv);
+ return r;
+ }
+
+ // unlink the old name
+ (void) impl->remove(dpp, y, pool, old_oid, nullptr);
+
+ realm_name = new_name;
+ return 0;
+ }
+
+ int remove(const DoutPrefixProvider* dpp, optional_yield y) override
+ {
+ const auto& pool = impl->realm_pool;
+ const auto info_oid = realm_info_oid(realm_id);
+ int r = impl->remove(dpp, y, pool, info_oid, &objv);
+ if (r < 0) {
+ return r;
+ }
+ const auto name_oid = realm_name_oid(realm_name);
+ (void) impl->remove(dpp, y, pool, name_oid, nullptr);
+ const auto control_oid = realm_control_oid(realm_id);
+ (void) impl->remove(dpp, y, pool, control_oid, nullptr);
+ return 0;
+ }
+}; // RadosRealmWriter
+
+
+int RadosConfigStore::create_realm(const DoutPrefixProvider* dpp,
+ optional_yield y, bool exclusive,
+ const RGWRealm& info,
+ std::unique_ptr<sal::RealmWriter>* writer)
+{
+ if (info.get_id().empty()) {
+ ldpp_dout(dpp, 0) << "realm cannot have an empty id" << dendl;
+ return -EINVAL;
+ }
+ if (info.get_name().empty()) {
+ ldpp_dout(dpp, 0) << "realm cannot have an empty name" << dendl;
+ return -EINVAL;
+ }
+
+ const auto& pool = impl->realm_pool;
+ const auto create = exclusive ? Create::MustNotExist : Create::MayExist;
+
+ // write the realm info
+ const auto info_oid = realm_info_oid(info.get_id());
+ RGWObjVersionTracker objv;
+ objv.generate_new_write_ver(dpp->get_cct());
+
+ int r = impl->write(dpp, y, pool, info_oid, create, info, &objv);
+ if (r < 0) {
+ return r;
+ }
+
+ // write the realm name
+ const auto name_oid = realm_name_oid(info.get_name());
+ const auto name = RGWNameToId{info.get_id()};
+ RGWObjVersionTracker name_objv;
+ name_objv.generate_new_write_ver(dpp->get_cct());
+
+ r = impl->write(dpp, y, pool, name_oid, create, name, &name_objv);
+ if (r < 0) {
+ (void) impl->remove(dpp, y, pool, info_oid, &objv);
+ return r;
+ }
+
+ // create control object for watch/notify
+ const auto control_oid = realm_control_oid(info.get_id());
+ bufferlist empty_bl;
+ r = impl->write(dpp, y, pool, control_oid, Create::MayExist,
+ empty_bl, nullptr);
+ if (r < 0) {
+ (void) impl->remove(dpp, y, pool, name_oid, &name_objv);
+ (void) impl->remove(dpp, y, pool, info_oid, &objv);
+ return r;
+ }
+
+ if (writer) {
+ *writer = std::make_unique<RadosRealmWriter>(
+ impl.get(), std::move(objv), info.get_id(), info.get_name());
+ }
+ return 0;
+}
+
+int RadosConfigStore::read_realm_by_id(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string_view realm_id,
+ RGWRealm& info,
+ std::unique_ptr<sal::RealmWriter>* writer)
+{
+ const auto& pool = impl->realm_pool;
+ const auto info_oid = realm_info_oid(realm_id);
+ RGWObjVersionTracker objv;
+ int r = impl->read(dpp, y, pool, info_oid, info, &objv);
+ if (r < 0) {
+ return r;
+ }
+
+ if (writer) {
+ *writer = std::make_unique<RadosRealmWriter>(
+ impl.get(), std::move(objv), info.get_id(), info.get_name());
+ }
+ return 0;
+}
+
+int RadosConfigStore::read_realm_by_name(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string_view realm_name,
+ RGWRealm& info,
+ std::unique_ptr<sal::RealmWriter>* writer)
+{
+ const auto& pool = impl->realm_pool;
+
+ // look up realm id by name
+ RGWNameToId name;
+ const auto name_oid = realm_name_oid(realm_name);
+ int r = impl->read(dpp, y, pool, name_oid, name, nullptr);
+ if (r < 0) {
+ return r;
+ }
+
+ const auto info_oid = realm_info_oid(name.obj_id);
+ RGWObjVersionTracker objv;
+ r = impl->read(dpp, y, pool, info_oid, info, &objv);
+ if (r < 0) {
+ return r;
+ }
+
+ if (writer) {
+ *writer = std::make_unique<RadosRealmWriter>(
+ impl.get(), std::move(objv), info.get_id(), info.get_name());
+ }
+ return 0;
+}
+
+int RadosConfigStore::read_default_realm(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ RGWRealm& info,
+ std::unique_ptr<sal::RealmWriter>* writer)
+{
+ const auto& pool = impl->realm_pool;
+
+ // read default realm id
+ RGWDefaultSystemMetaObjInfo default_info;
+ const auto default_oid = default_realm_oid(dpp->get_cct()->_conf);
+ int r = impl->read(dpp, y, pool, default_oid, default_info, nullptr);
+ if (r < 0) {
+ return r;
+ }
+
+ const auto info_oid = realm_info_oid(default_info.default_id);
+ RGWObjVersionTracker objv;
+ r = impl->read(dpp, y, pool, info_oid, info, &objv);
+ if (r < 0) {
+ return r;
+ }
+
+ if (writer) {
+ *writer = std::make_unique<RadosRealmWriter>(
+ impl.get(), std::move(objv), info.get_id(), info.get_name());
+ }
+ return 0;
+}
+
+int RadosConfigStore::read_realm_id(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string_view realm_name,
+ std::string& realm_id)
+{
+ const auto& pool = impl->realm_pool;
+ RGWNameToId name;
+
+ // look up realm id by name
+ const auto name_oid = realm_name_oid(realm_name);
+ int r = impl->read(dpp, y, pool, name_oid, name, nullptr);
+ if (r < 0) {
+ return r;
+ }
+ realm_id = std::move(name.obj_id);
+ return 0;
+}
+
+int RadosConfigStore::realm_notify_new_period(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ const RGWPeriod& period)
+{
+ const auto& pool = impl->realm_pool;
+ const auto control_oid = realm_control_oid(period.get_realm());
+
+ bufferlist bl;
+ using ceph::encode;
+ // push the period to dependent zonegroups/zones
+ encode(RGWRealmNotify::ZonesNeedPeriod, bl);
+ encode(period, bl);
+ // reload the gateway with the new period
+ encode(RGWRealmNotify::Reload, bl);
+
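+  // both messages travel in one notify payload and are decoded in order by
+  // realm watchers; a timeout_ms of 0 leaves the notify timeout at the
+  // librados default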
+ constexpr uint64_t timeout_ms = 0;
+ return impl->notify(dpp, y, pool, control_oid, bl, timeout_ms);
+}
+
+int RadosConfigStore::list_realm_names(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ const std::string& marker,
+ std::span<std::string> entries,
+ sal::ListResult<std::string>& result)
+{
+ const auto& pool = impl->realm_pool;
+ constexpr auto prefix = [] (std::string oid) -> std::string {
+ if (!oid.starts_with(realm_names_oid_prefix)) {
+ return {};
+ }
+ return oid.substr(realm_names_oid_prefix.size());
+ };
+ return impl->list(dpp, y, pool, marker, prefix, entries, result);
+}
+
+} // namespace rgw::rados
diff --git a/src/rgw/driver/rados/config/store.cc b/src/rgw/driver/rados/config/store.cc
new file mode 100644
index 000000000..ec2b034a8
--- /dev/null
+++ b/src/rgw/driver/rados/config/store.cc
@@ -0,0 +1,52 @@
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2022 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "include/rados/librados.hpp"
+#include "common/errno.h"
+#include "impl.h"
+#include "store.h"
+
+namespace rgw::rados {
+
+RadosConfigStore::RadosConfigStore(std::unique_ptr<ConfigImpl> impl)
+ : impl(std::move(impl))
+{
+}
+
+RadosConfigStore::~RadosConfigStore() = default;
+
+
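+// Typical call site (sketch only, names are illustrative): callers keep the
+// result as a std::unique_ptr and treat nullptr as a fatal configuration
+// error, e.g.
+//   auto cfgstore = rgw::rados::create_config_store(dpp);
+//   if (!cfgstore) { /* abort startup */ }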
+auto create_config_store(const DoutPrefixProvider* dpp)
+ -> std::unique_ptr<RadosConfigStore>
+{
+ auto impl = std::make_unique<ConfigImpl>(dpp->get_cct()->_conf);
+
+ // initialize a Rados client
+ int r = impl->rados.init_with_context(dpp->get_cct());
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << "Rados client initialization failed with "
+ << cpp_strerror(-r) << dendl;
+ return nullptr;
+ }
+ r = impl->rados.connect();
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << "Rados client connection failed with "
+ << cpp_strerror(-r) << dendl;
+ return nullptr;
+ }
+
+ return std::make_unique<RadosConfigStore>(std::move(impl));
+}
+
+} // namespace rgw::rados
diff --git a/src/rgw/driver/rados/config/store.h b/src/rgw/driver/rados/config/store.h
new file mode 100644
index 000000000..1b93a803d
--- /dev/null
+++ b/src/rgw/driver/rados/config/store.h
@@ -0,0 +1,182 @@
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2022 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include <list>
+#include <memory>
+#include <string>
+#include "rgw_common.h"
+#include "rgw_sal_config.h"
+
+class DoutPrefixProvider;
+class optional_yield;
+
+namespace rgw::rados {
+
+struct ConfigImpl;
+
+class RadosConfigStore : public sal::ConfigStore {
+ public:
+ explicit RadosConfigStore(std::unique_ptr<ConfigImpl> impl);
+ virtual ~RadosConfigStore() override;
+
+ // Realm
+ virtual int write_default_realm_id(const DoutPrefixProvider* dpp,
+ optional_yield y, bool exclusive,
+ std::string_view realm_id) override;
+ virtual int read_default_realm_id(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string& realm_id) override;
+ virtual int delete_default_realm_id(const DoutPrefixProvider* dpp,
+ optional_yield y) override;
+
+ virtual int create_realm(const DoutPrefixProvider* dpp,
+ optional_yield y, bool exclusive,
+ const RGWRealm& info,
+ std::unique_ptr<sal::RealmWriter>* writer) override;
+ virtual int read_realm_by_id(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string_view realm_id,
+ RGWRealm& info,
+ std::unique_ptr<sal::RealmWriter>* writer) override;
+ virtual int read_realm_by_name(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string_view realm_name,
+ RGWRealm& info,
+ std::unique_ptr<sal::RealmWriter>* writer) override;
+ virtual int read_default_realm(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ RGWRealm& info,
+ std::unique_ptr<sal::RealmWriter>* writer) override;
+ virtual int read_realm_id(const DoutPrefixProvider* dpp,
+ optional_yield y, std::string_view realm_name,
+ std::string& realm_id) override;
+ virtual int realm_notify_new_period(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ const RGWPeriod& period) override;
+ virtual int list_realm_names(const DoutPrefixProvider* dpp,
+ optional_yield y, const std::string& marker,
+ std::span<std::string> entries,
+ sal::ListResult<std::string>& result) override;
+
+ // Period
+ virtual int create_period(const DoutPrefixProvider* dpp,
+ optional_yield y, bool exclusive,
+ const RGWPeriod& info) override;
+ virtual int read_period(const DoutPrefixProvider* dpp,
+ optional_yield y, std::string_view period_id,
+ std::optional<uint32_t> epoch, RGWPeriod& info) override;
+ virtual int delete_period(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string_view period_id) override;
+ virtual int list_period_ids(const DoutPrefixProvider* dpp,
+ optional_yield y, const std::string& marker,
+ std::span<std::string> entries,
+ sal::ListResult<std::string>& result) override;
+
+ // ZoneGroup
+ virtual int write_default_zonegroup_id(const DoutPrefixProvider* dpp,
+ optional_yield y, bool exclusive,
+ std::string_view realm_id,
+ std::string_view zonegroup_id) override;
+ virtual int read_default_zonegroup_id(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string_view realm_id,
+ std::string& zonegroup_id) override;
+ virtual int delete_default_zonegroup_id(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string_view realm_id) override;
+
+ virtual int create_zonegroup(const DoutPrefixProvider* dpp,
+ optional_yield y, bool exclusive,
+ const RGWZoneGroup& info,
+ std::unique_ptr<sal::ZoneGroupWriter>* writer) override;
+ virtual int read_zonegroup_by_id(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string_view zonegroup_id,
+ RGWZoneGroup& info,
+ std::unique_ptr<sal::ZoneGroupWriter>* writer) override;
+ virtual int read_zonegroup_by_name(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string_view zonegroup_name,
+ RGWZoneGroup& info,
+ std::unique_ptr<sal::ZoneGroupWriter>* writer) override;
+ virtual int read_default_zonegroup(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string_view realm_id,
+ RGWZoneGroup& info,
+ std::unique_ptr<sal::ZoneGroupWriter>* writer) override;
+ virtual int list_zonegroup_names(const DoutPrefixProvider* dpp,
+ optional_yield y, const std::string& marker,
+ std::span<std::string> entries,
+ sal::ListResult<std::string>& result) override;
+
+ // Zone
+ virtual int write_default_zone_id(const DoutPrefixProvider* dpp,
+ optional_yield y, bool exclusive,
+ std::string_view realm_id,
+ std::string_view zone_id) override;
+ virtual int read_default_zone_id(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string_view realm_id,
+ std::string& zone_id) override;
+ virtual int delete_default_zone_id(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string_view realm_id) override;
+
+ virtual int create_zone(const DoutPrefixProvider* dpp,
+ optional_yield y, bool exclusive,
+ const RGWZoneParams& info,
+ std::unique_ptr<sal::ZoneWriter>* writer) override;
+ virtual int read_zone_by_id(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string_view zone_id,
+ RGWZoneParams& info,
+ std::unique_ptr<sal::ZoneWriter>* writer) override;
+ virtual int read_zone_by_name(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string_view zone_name,
+ RGWZoneParams& info,
+ std::unique_ptr<sal::ZoneWriter>* writer) override;
+ virtual int read_default_zone(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string_view realm_id,
+ RGWZoneParams& info,
+ std::unique_ptr<sal::ZoneWriter>* writer) override;
+ virtual int list_zone_names(const DoutPrefixProvider* dpp,
+ optional_yield y, const std::string& marker,
+ std::span<std::string> entries,
+ sal::ListResult<std::string>& result) override;
+
+ // PeriodConfig
+ virtual int read_period_config(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string_view realm_id,
+ RGWPeriodConfig& info) override;
+ virtual int write_period_config(const DoutPrefixProvider* dpp,
+ optional_yield y, bool exclusive,
+ std::string_view realm_id,
+ const RGWPeriodConfig& info) override;
+
+ private:
+ std::unique_ptr<ConfigImpl> impl;
+}; // RadosConfigStore
+
+
+/// RadosConfigStore factory function
+auto create_config_store(const DoutPrefixProvider* dpp)
+ -> std::unique_ptr<RadosConfigStore>;
+
+} // namespace rgw::rados
diff --git a/src/rgw/driver/rados/config/zone.cc b/src/rgw/driver/rados/config/zone.cc
new file mode 100644
index 000000000..e06c1606c
--- /dev/null
+++ b/src/rgw/driver/rados/config/zone.cc
@@ -0,0 +1,312 @@
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2022 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "common/dout.h"
+#include "common/errno.h"
+#include "rgw_zone.h"
+#include "driver/rados/config/store.h"
+
+#include "impl.h"
+
+namespace rgw::rados {
+
+// zone oids
+constexpr std::string_view zone_info_oid_prefix = "zone_info.";
+constexpr std::string_view zone_names_oid_prefix = "zone_names.";
+
+std::string zone_info_oid(std::string_view zone_id)
+{
+ return string_cat_reserve(zone_info_oid_prefix, zone_id);
+}
+std::string zone_name_oid(std::string_view zone_id)
+{
+ return string_cat_reserve(zone_names_oid_prefix, zone_id);
+}
+std::string default_zone_oid(const ceph::common::ConfigProxy& conf,
+ std::string_view realm_id)
+{
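+  // with the stock value of rgw_default_zone_info_oid ("default.zone"),
+  // this yields e.g. "default.zone.<realm_id>"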
+ return fmt::format("{}.{}", conf->rgw_default_zone_info_oid, realm_id);
+}
+
+
+int RadosConfigStore::write_default_zone_id(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ bool exclusive,
+ std::string_view realm_id,
+ std::string_view zone_id)
+{
+ const auto& pool = impl->zone_pool;
+ const auto default_oid = default_zone_oid(dpp->get_cct()->_conf, realm_id);
+ const auto create = exclusive ? Create::MustNotExist : Create::MayExist;
+
+ RGWDefaultSystemMetaObjInfo default_info;
+ default_info.default_id = zone_id;
+
+ return impl->write(dpp, y, pool, default_oid, create, default_info, nullptr);
+}
+
+int RadosConfigStore::read_default_zone_id(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string_view realm_id,
+ std::string& zone_id)
+{
+ const auto& pool = impl->zone_pool;
+ const auto default_oid = default_zone_oid(dpp->get_cct()->_conf, realm_id);
+
+ RGWDefaultSystemMetaObjInfo default_info;
+ int r = impl->read(dpp, y, pool, default_oid, default_info, nullptr);
+ if (r >= 0) {
+ zone_id = default_info.default_id;
+ }
+ return r;
+}
+
+int RadosConfigStore::delete_default_zone_id(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string_view realm_id)
+{
+ const auto& pool = impl->zone_pool;
+ const auto default_oid = default_zone_oid(dpp->get_cct()->_conf, realm_id);
+
+ return impl->remove(dpp, y, pool, default_oid, nullptr);
+}
+
+
+class RadosZoneWriter : public sal::ZoneWriter {
+ ConfigImpl* impl;
+ RGWObjVersionTracker objv;
+ std::string zone_id;
+ std::string zone_name;
+ public:
+ RadosZoneWriter(ConfigImpl* impl, RGWObjVersionTracker objv,
+ std::string_view zone_id, std::string_view zone_name)
+ : impl(impl), objv(std::move(objv)),
+ zone_id(zone_id), zone_name(zone_name)
+ {
+ }
+
+ int write(const DoutPrefixProvider* dpp, optional_yield y,
+ const RGWZoneParams& info) override
+ {
+ if (zone_id != info.get_id() || zone_name != info.get_name()) {
+ return -EINVAL; // can't modify zone id or name directly
+ }
+
+ const auto& pool = impl->zone_pool;
+ const auto info_oid = zone_info_oid(info.get_id());
+ return impl->write(dpp, y, pool, info_oid, Create::MustExist, info, &objv);
+ }
+
+ int rename(const DoutPrefixProvider* dpp, optional_yield y,
+ RGWZoneParams& info, std::string_view new_name) override
+ {
+ if (zone_id != info.get_id() || zone_name != info.get_name()) {
+ return -EINVAL; // can't modify zone id or name directly
+ }
+ if (new_name.empty()) {
+ ldpp_dout(dpp, 0) << "zone cannot have an empty name" << dendl;
+ return -EINVAL;
+ }
+
+ const auto& pool = impl->zone_pool;
+ const auto name = RGWNameToId{info.get_id()};
+ const auto info_oid = zone_info_oid(info.get_id());
+ const auto old_oid = zone_name_oid(info.get_name());
+ const auto new_oid = zone_name_oid(new_name);
+
+ // link the new name
+ RGWObjVersionTracker new_objv;
+ new_objv.generate_new_write_ver(dpp->get_cct());
+ int r = impl->write(dpp, y, pool, new_oid, Create::MustNotExist,
+ name, &new_objv);
+ if (r < 0) {
+ return r;
+ }
+
+ // write the info with updated name
+ info.set_name(std::string{new_name});
+ r = impl->write(dpp, y, pool, info_oid, Create::MustExist, info, &objv);
+ if (r < 0) {
+ // on failure, unlink the new name
+ (void) impl->remove(dpp, y, pool, new_oid, &new_objv);
+ return r;
+ }
+
+ // unlink the old name
+ (void) impl->remove(dpp, y, pool, old_oid, nullptr);
+
+ zone_name = new_name;
+ return 0;
+ }
+
+ int remove(const DoutPrefixProvider* dpp, optional_yield y) override
+ {
+ const auto& pool = impl->zone_pool;
+ const auto info_oid = zone_info_oid(zone_id);
+ int r = impl->remove(dpp, y, pool, info_oid, &objv);
+ if (r < 0) {
+ return r;
+ }
+ const auto name_oid = zone_name_oid(zone_name);
+ (void) impl->remove(dpp, y, pool, name_oid, nullptr);
+ return 0;
+ }
+}; // RadosZoneWriter
+
+
+int RadosConfigStore::create_zone(const DoutPrefixProvider* dpp,
+ optional_yield y, bool exclusive,
+ const RGWZoneParams& info,
+ std::unique_ptr<sal::ZoneWriter>* writer)
+{
+ if (info.get_id().empty()) {
+ ldpp_dout(dpp, 0) << "zone cannot have an empty id" << dendl;
+ return -EINVAL;
+ }
+ if (info.get_name().empty()) {
+ ldpp_dout(dpp, 0) << "zone cannot have an empty name" << dendl;
+ return -EINVAL;
+ }
+
+ const auto& pool = impl->zone_pool;
+ const auto create = exclusive ? Create::MustNotExist : Create::MayExist;
+
+ // write the zone info
+ const auto info_oid = zone_info_oid(info.get_id());
+ RGWObjVersionTracker objv;
+ objv.generate_new_write_ver(dpp->get_cct());
+
+ int r = impl->write(dpp, y, pool, info_oid, create, info, &objv);
+ if (r < 0) {
+ return r;
+ }
+
+ // write the zone name
+ const auto name_oid = zone_name_oid(info.get_name());
+ const auto name = RGWNameToId{info.get_id()};
+ RGWObjVersionTracker name_objv;
+ name_objv.generate_new_write_ver(dpp->get_cct());
+
+ r = impl->write(dpp, y, pool, name_oid, create, name, &name_objv);
+ if (r < 0) {
+ (void) impl->remove(dpp, y, pool, info_oid, &objv);
+ return r;
+ }
+
+ if (writer) {
+ *writer = std::make_unique<RadosZoneWriter>(
+ impl.get(), std::move(objv), info.get_id(), info.get_name());
+ }
+ return 0;
+}
+
+int RadosConfigStore::read_zone_by_id(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string_view zone_id,
+ RGWZoneParams& info,
+ std::unique_ptr<sal::ZoneWriter>* writer)
+{
+ const auto& pool = impl->zone_pool;
+ const auto info_oid = zone_info_oid(zone_id);
+ RGWObjVersionTracker objv;
+
+ int r = impl->read(dpp, y, pool, info_oid, info, &objv);
+ if (r < 0) {
+ return r;
+ }
+
+ if (writer) {
+ *writer = std::make_unique<RadosZoneWriter>(
+ impl.get(), std::move(objv), info.get_id(), info.get_name());
+ }
+ return 0;
+}
+
+int RadosConfigStore::read_zone_by_name(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string_view zone_name,
+ RGWZoneParams& info,
+ std::unique_ptr<sal::ZoneWriter>* writer)
+{
+ const auto& pool = impl->zone_pool;
+
+ // look up zone id by name
+ const auto name_oid = zone_name_oid(zone_name);
+ RGWNameToId name;
+ int r = impl->read(dpp, y, pool, name_oid, name, nullptr);
+ if (r < 0) {
+ return r;
+ }
+
+ const auto info_oid = zone_info_oid(name.obj_id);
+ RGWObjVersionTracker objv;
+ r = impl->read(dpp, y, pool, info_oid, info, &objv);
+ if (r < 0) {
+ return r;
+ }
+
+ if (writer) {
+ *writer = std::make_unique<RadosZoneWriter>(
+ impl.get(), std::move(objv), info.get_id(), info.get_name());
+ }
+ return 0;
+}
+
+int RadosConfigStore::read_default_zone(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string_view realm_id,
+ RGWZoneParams& info,
+ std::unique_ptr<sal::ZoneWriter>* writer)
+{
+ const auto& pool = impl->zone_pool;
+
+ // read default zone id
+ const auto default_oid = default_zone_oid(dpp->get_cct()->_conf, realm_id);
+ RGWDefaultSystemMetaObjInfo default_info;
+ int r = impl->read(dpp, y, pool, default_oid, default_info, nullptr);
+ if (r < 0) {
+ return r;
+ }
+
+ const auto info_oid = zone_info_oid(default_info.default_id);
+ RGWObjVersionTracker objv;
+ r = impl->read(dpp, y, pool, info_oid, info, &objv);
+ if (r < 0) {
+ return r;
+ }
+
+ if (writer) {
+ *writer = std::make_unique<RadosZoneWriter>(
+ impl.get(), std::move(objv), info.get_id(), info.get_name());
+ }
+ return 0;
+}
+
+int RadosConfigStore::list_zone_names(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ const std::string& marker,
+ std::span<std::string> entries,
+ sal::ListResult<std::string>& result)
+{
+ const auto& pool = impl->zone_pool;
+ constexpr auto prefix = [] (std::string oid) -> std::string {
+ if (!oid.starts_with(zone_names_oid_prefix)) {
+ return {};
+ }
+ return oid.substr(zone_names_oid_prefix.size());
+ };
+ return impl->list(dpp, y, pool, marker, prefix, entries, result);
+}
+
+} // namespace rgw::rados
diff --git a/src/rgw/driver/rados/config/zonegroup.cc b/src/rgw/driver/rados/config/zonegroup.cc
new file mode 100644
index 000000000..1766a68ce
--- /dev/null
+++ b/src/rgw/driver/rados/config/zonegroup.cc
@@ -0,0 +1,315 @@
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2022 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "common/dout.h"
+#include "common/errno.h"
+#include "rgw_zone.h"
+#include "driver/rados/config/store.h"
+
+#include "impl.h"
+
+namespace rgw::rados {
+
+// zonegroup oids
+constexpr std::string_view zonegroup_names_oid_prefix = "zonegroups_names.";
+constexpr std::string_view zonegroup_info_oid_prefix = "zonegroup_info.";
+constexpr std::string_view default_zonegroup_info_oid = "default.zonegroup";
+
+static std::string zonegroup_info_oid(std::string_view zonegroup_id)
+{
+ return string_cat_reserve(zonegroup_info_oid_prefix, zonegroup_id);
+}
+static std::string zonegroup_name_oid(std::string_view zonegroup_id)
+{
+ return string_cat_reserve(zonegroup_names_oid_prefix, zonegroup_id);
+}
+static std::string default_zonegroup_oid(const ceph::common::ConfigProxy& conf,
+ std::string_view realm_id)
+{
+ const auto prefix = name_or_default(conf->rgw_default_zonegroup_info_oid,
+ default_zonegroup_info_oid);
+ return fmt::format("{}.{}", prefix, realm_id);
+}
+
+
+int RadosConfigStore::write_default_zonegroup_id(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ bool exclusive,
+ std::string_view realm_id,
+ std::string_view zonegroup_id)
+{
+ const auto& pool = impl->zonegroup_pool;
+ const auto oid = default_zonegroup_oid(dpp->get_cct()->_conf, realm_id);
+ const auto create = exclusive ? Create::MustNotExist : Create::MayExist;
+
+ RGWDefaultSystemMetaObjInfo default_info;
+ default_info.default_id = zonegroup_id;
+
+ return impl->write(dpp, y, pool, oid, create, default_info, nullptr);
+}
+
+int RadosConfigStore::read_default_zonegroup_id(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string_view realm_id,
+ std::string& zonegroup_id)
+{
+ const auto& pool = impl->zonegroup_pool;
+ const auto oid = default_zonegroup_oid(dpp->get_cct()->_conf, realm_id);
+
+ RGWDefaultSystemMetaObjInfo default_info;
+ int r = impl->read(dpp, y, pool, oid, default_info, nullptr);
+ if (r >= 0) {
+ zonegroup_id = default_info.default_id;
+ }
+ return r;
+}
+
+int RadosConfigStore::delete_default_zonegroup_id(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string_view realm_id)
+{
+ const auto& pool = impl->zonegroup_pool;
+ const auto oid = default_zonegroup_oid(dpp->get_cct()->_conf, realm_id);
+ return impl->remove(dpp, y, pool, oid, nullptr);
+}
+
+
+class RadosZoneGroupWriter : public sal::ZoneGroupWriter {
+ ConfigImpl* impl;
+ RGWObjVersionTracker objv;
+ std::string zonegroup_id;
+ std::string zonegroup_name;
+ public:
+ RadosZoneGroupWriter(ConfigImpl* impl, RGWObjVersionTracker objv,
+ std::string_view zonegroup_id,
+ std::string_view zonegroup_name)
+ : impl(impl), objv(std::move(objv)),
+ zonegroup_id(zonegroup_id), zonegroup_name(zonegroup_name)
+ {
+ }
+
+ int write(const DoutPrefixProvider* dpp, optional_yield y,
+ const RGWZoneGroup& info) override
+ {
+ if (zonegroup_id != info.get_id() || zonegroup_name != info.get_name()) {
+ return -EINVAL; // can't modify zonegroup id or name directly
+ }
+
+ const auto& pool = impl->zonegroup_pool;
+ const auto info_oid = zonegroup_info_oid(info.get_id());
+ return impl->write(dpp, y, pool, info_oid, Create::MustExist, info, &objv);
+ }
+
+ int rename(const DoutPrefixProvider* dpp, optional_yield y,
+ RGWZoneGroup& info, std::string_view new_name) override
+ {
+ if (zonegroup_id != info.get_id() || zonegroup_name != info.get_name()) {
+ return -EINVAL; // can't modify zonegroup id or name directly
+ }
+ if (new_name.empty()) {
+ ldpp_dout(dpp, 0) << "zonegroup cannot have an empty name" << dendl;
+ return -EINVAL;
+ }
+
+ const auto& pool = impl->zonegroup_pool;
+ const auto name = RGWNameToId{info.get_id()};
+ const auto info_oid = zonegroup_info_oid(info.get_id());
+ const auto old_oid = zonegroup_name_oid(info.get_name());
+ const auto new_oid = zonegroup_name_oid(new_name);
+
+ // link the new name
+ RGWObjVersionTracker new_objv;
+ new_objv.generate_new_write_ver(dpp->get_cct());
+ int r = impl->write(dpp, y, pool, new_oid, Create::MustNotExist,
+ name, &new_objv);
+ if (r < 0) {
+ return r;
+ }
+
+ // write the info with updated name
+ info.set_name(std::string{new_name});
+ r = impl->write(dpp, y, pool, info_oid, Create::MustExist, info, &objv);
+ if (r < 0) {
+ // on failure, unlink the new name
+ (void) impl->remove(dpp, y, pool, new_oid, &new_objv);
+ return r;
+ }
+
+ // unlink the old name
+ (void) impl->remove(dpp, y, pool, old_oid, nullptr);
+
+ zonegroup_name = new_name;
+ return 0;
+ }
+
+ int remove(const DoutPrefixProvider* dpp, optional_yield y) override
+ {
+ const auto& pool = impl->zonegroup_pool;
+ const auto info_oid = zonegroup_info_oid(zonegroup_id);
+ int r = impl->remove(dpp, y, pool, info_oid, &objv);
+ if (r < 0) {
+ return r;
+ }
+ const auto name_oid = zonegroup_name_oid(zonegroup_name);
+ (void) impl->remove(dpp, y, pool, name_oid, nullptr);
+ return 0;
+ }
+}; // RadosZoneGroupWriter
+
+
+int RadosConfigStore::create_zonegroup(const DoutPrefixProvider* dpp,
+ optional_yield y, bool exclusive,
+ const RGWZoneGroup& info,
+ std::unique_ptr<sal::ZoneGroupWriter>* writer)
+{
+ if (info.get_id().empty()) {
+ ldpp_dout(dpp, 0) << "zonegroup cannot have an empty id" << dendl;
+ return -EINVAL;
+ }
+ if (info.get_name().empty()) {
+ ldpp_dout(dpp, 0) << "zonegroup cannot have an empty name" << dendl;
+ return -EINVAL;
+ }
+
+ const auto& pool = impl->zonegroup_pool;
+ const auto create = exclusive ? Create::MustNotExist : Create::MayExist;
+
+ // write the zonegroup info
+ const auto info_oid = zonegroup_info_oid(info.get_id());
+ RGWObjVersionTracker objv;
+ objv.generate_new_write_ver(dpp->get_cct());
+
+ int r = impl->write(dpp, y, pool, info_oid, create, info, &objv);
+ if (r < 0) {
+ return r;
+ }
+
+ // write the zonegroup name
+ const auto name_oid = zonegroup_name_oid(info.get_name());
+ const auto name = RGWNameToId{info.get_id()};
+ RGWObjVersionTracker name_objv;
+ name_objv.generate_new_write_ver(dpp->get_cct());
+
+ r = impl->write(dpp, y, pool, name_oid, create, name, &name_objv);
+ if (r < 0) {
+ (void) impl->remove(dpp, y, pool, info_oid, &objv);
+ return r;
+ }
+
+ if (writer) {
+ *writer = std::make_unique<RadosZoneGroupWriter>(
+ impl.get(), std::move(objv), info.get_id(), info.get_name());
+ }
+ return 0;
+}
+
+int RadosConfigStore::read_zonegroup_by_id(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string_view zonegroup_id,
+ RGWZoneGroup& info,
+ std::unique_ptr<sal::ZoneGroupWriter>* writer)
+{
+ const auto& pool = impl->zonegroup_pool;
+ const auto info_oid = zonegroup_info_oid(zonegroup_id);
+ RGWObjVersionTracker objv;
+
+ int r = impl->read(dpp, y, pool, info_oid, info, &objv);
+ if (r < 0) {
+ return r;
+ }
+
+ if (writer) {
+ *writer = std::make_unique<RadosZoneGroupWriter>(
+ impl.get(), std::move(objv), info.get_id(), info.get_name());
+ }
+ return 0;
+}
+
+int RadosConfigStore::read_zonegroup_by_name(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string_view zonegroup_name,
+ RGWZoneGroup& info,
+ std::unique_ptr<sal::ZoneGroupWriter>* writer)
+{
+ const auto& pool = impl->zonegroup_pool;
+
+ // look up zonegroup id by name
+ RGWNameToId name;
+ const auto name_oid = zonegroup_name_oid(zonegroup_name);
+ int r = impl->read(dpp, y, pool, name_oid, name, nullptr);
+ if (r < 0) {
+ return r;
+ }
+
+ const auto info_oid = zonegroup_info_oid(name.obj_id);
+ RGWObjVersionTracker objv;
+ r = impl->read(dpp, y, pool, info_oid, info, &objv);
+ if (r < 0) {
+ return r;
+ }
+
+ if (writer) {
+ *writer = std::make_unique<RadosZoneGroupWriter>(
+ impl.get(), std::move(objv), info.get_id(), info.get_name());
+ }
+ return 0;
+}
+
+int RadosConfigStore::read_default_zonegroup(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string_view realm_id,
+ RGWZoneGroup& info,
+ std::unique_ptr<sal::ZoneGroupWriter>* writer)
+{
+ const auto& pool = impl->zonegroup_pool;
+
+ // read default zonegroup id
+ RGWDefaultSystemMetaObjInfo default_info;
+ const auto default_oid = default_zonegroup_oid(dpp->get_cct()->_conf, realm_id);
+ int r = impl->read(dpp, y, pool, default_oid, default_info, nullptr);
+ if (r < 0) {
+ return r;
+ }
+
+ const auto info_oid = zonegroup_info_oid(default_info.default_id);
+ RGWObjVersionTracker objv;
+ r = impl->read(dpp, y, pool, info_oid, info, &objv);
+ if (r < 0) {
+ return r;
+ }
+
+ if (writer) {
+ *writer = std::make_unique<RadosZoneGroupWriter>(
+ impl.get(), std::move(objv), info.get_id(), info.get_name());
+ }
+ return 0;
+}
+
+int RadosConfigStore::list_zonegroup_names(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ const std::string& marker,
+ std::span<std::string> entries,
+ sal::ListResult<std::string>& result)
+{
+ const auto& pool = impl->zonegroup_pool;
+ constexpr auto prefix = [] (std::string oid) -> std::string {
+ if (!oid.starts_with(zonegroup_names_oid_prefix)) {
+ return {};
+ }
+ return oid.substr(zonegroup_names_oid_prefix.size());
+ };
+ return impl->list(dpp, y, pool, marker, prefix, entries, result);
+}
+
+} // namespace rgw::rados
diff --git a/src/rgw/driver/rados/rgw_bucket.cc b/src/rgw/driver/rados/rgw_bucket.cc
new file mode 100644
index 000000000..32cd1ccf9
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_bucket.cc
@@ -0,0 +1,3316 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "rgw_acl_s3.h"
+#include "rgw_tag_s3.h"
+
+#include "rgw_bucket.h"
+#include "rgw_op.h"
+#include "rgw_bucket_sync.h"
+
+#include "services/svc_zone.h"
+#include "services/svc_bucket.h"
+#include "services/svc_user.h"
+
+#include "rgw_reshard.h"
+
+// stolen from src/cls/version/cls_version.cc
+#define VERSION_ATTR "ceph.objclass.version"
+
+#include "cls/user/cls_user_types.h"
+
+#include "rgw_sal_rados.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+// seconds for timeout during RGWBucket::check_object_index
+constexpr uint64_t BUCKET_TAG_QUICK_TIMEOUT = 30;
+
+using namespace std;
+
+// these values are copied from cls/rgw/cls_rgw.cc
+static const string BI_OLH_ENTRY_NS_START = "\x80" "1001_";
+static const string BI_INSTANCE_ENTRY_NS_START = "\x80" "1000_";
+
+// number of characters that we should allow to be buffered by the formatter
+// before flushing (used by index check methods with dump_keys=true)
+static constexpr int FORMATTER_LEN_FLUSH_THRESHOLD = 4 * 1024 * 1024;
+
+// default number of entries to list with each bucket listing call
+// (use marker to bridge between calls)
+static constexpr size_t listing_max_entries = 1000;
+
+/*
+ * The tenant_name is always returned on purpose. May be empty, of course.
+ */
+static void parse_bucket(const string& bucket,
+ string *tenant_name,
+ string *bucket_name,
+ string *bucket_instance = nullptr /* optional */)
+{
+ /*
+ * expected format: [tenant/]bucket:bucket_instance
+ */
+ int pos = bucket.find('/');
+ if (pos >= 0) {
+ *tenant_name = bucket.substr(0, pos);
+ } else {
+ tenant_name->clear();
+ }
+ string bn = bucket.substr(pos + 1);
+ pos = bn.find (':');
+ if (pos < 0) {
+ *bucket_name = std::move(bn);
+ return;
+ }
+ *bucket_name = bn.substr(0, pos);
+ if (bucket_instance) {
+ *bucket_instance = bn.substr(pos + 1);
+ }
+
+ /*
+ * deal with the possible tenant:bucket:bucket_instance case
+ */
+  if (tenant_name->empty() && bucket_instance) {
+ pos = bucket_instance->find(':');
+ if (pos >= 0) {
+ *tenant_name = *bucket_name;
+ *bucket_name = bucket_instance->substr(0, pos);
+ *bucket_instance = bucket_instance->substr(pos + 1);
+ }
+ }
+}
+
+static void dump_mulipart_index_results(list<rgw_obj_index_key>& objs_to_unlink,
+ Formatter *f)
+{
+ for (const auto& o : objs_to_unlink) {
+ f->dump_string("object", o.name);
+ }
+}
+
+void check_bad_user_bucket_mapping(rgw::sal::Driver* driver, rgw::sal::User& user,
+ bool fix,
+ optional_yield y,
+ const DoutPrefixProvider *dpp)
+{
+ rgw::sal::BucketList user_buckets;
+ string marker;
+
+ CephContext *cct = driver->ctx();
+
+ size_t max_entries = cct->_conf->rgw_list_buckets_max_chunk;
+
+ do {
+ int ret = user.list_buckets(dpp, marker, string(), max_entries, false, user_buckets, y);
+ if (ret < 0) {
+ ldout(driver->ctx(), 0) << "failed to read user buckets: "
+ << cpp_strerror(-ret) << dendl;
+ return;
+ }
+
+ map<string, std::unique_ptr<rgw::sal::Bucket>>& buckets = user_buckets.get_buckets();
+ for (auto i = buckets.begin();
+ i != buckets.end();
+ ++i) {
+ marker = i->first;
+
+ auto& bucket = i->second;
+
+ std::unique_ptr<rgw::sal::Bucket> actual_bucket;
+ int r = driver->get_bucket(dpp, &user, user.get_tenant(), bucket->get_name(), &actual_bucket, y);
+ if (r < 0) {
+ ldout(driver->ctx(), 0) << "could not get bucket info for bucket=" << bucket << dendl;
+ continue;
+ }
+
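+      // the entry listed under the user should mirror the bucket's current
+      // tenant/name/marker/id; when they drift, chown() below re-links the
+      // user->bucket entry using the bucket's current metadata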
+ if (actual_bucket->get_name().compare(bucket->get_name()) != 0 ||
+ actual_bucket->get_tenant().compare(bucket->get_tenant()) != 0 ||
+ actual_bucket->get_marker().compare(bucket->get_marker()) != 0 ||
+ actual_bucket->get_bucket_id().compare(bucket->get_bucket_id()) != 0) {
+ cout << "bucket info mismatch: expected " << actual_bucket << " got " << bucket << std::endl;
+ if (fix) {
+ cout << "fixing" << std::endl;
+ r = actual_bucket->chown(dpp, user, y);
+ if (r < 0) {
+ cerr << "failed to fix bucket: " << cpp_strerror(-r) << std::endl;
+ }
+ }
+ }
+ }
+ } while (user_buckets.is_truncated());
+}
+
+// returns true if entry is in the empty namespace. note: function
+// type conforms to type RGWBucketListNameFilter
+bool rgw_bucket_object_check_filter(const std::string& oid)
+{
+ const static std::string empty_ns;
+ rgw_obj_key key; // thrown away but needed for parsing
+ return rgw_obj_key::oid_to_key_in_ns(oid, &key, empty_ns);
+}
+
+int rgw_remove_object(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver, rgw::sal::Bucket* bucket, rgw_obj_key& key)
+{
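+  // plain (non-versioned) objects are stored under the reserved instance id
+  // "null"; normalize an empty instance so the delete targets that specific
+  // version instead of laying down a delete marker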
+ if (key.instance.empty()) {
+ key.instance = "null";
+ }
+
+ std::unique_ptr<rgw::sal::Object> object = bucket->get_object(key);
+
+ return object->delete_object(dpp, null_yield);
+}
+
+static void set_err_msg(std::string *sink, std::string msg)
+{
+ if (sink && !msg.empty())
+ *sink = msg;
+}
+
+int RGWBucket::init(rgw::sal::Driver* _driver, RGWBucketAdminOpState& op_state,
+ optional_yield y, const DoutPrefixProvider *dpp, std::string *err_msg)
+{
+ if (!_driver) {
+ set_err_msg(err_msg, "no storage!");
+ return -EINVAL;
+ }
+
+ driver = _driver;
+
+ std::string bucket_name = op_state.get_bucket_name();
+
+ if (bucket_name.empty() && op_state.get_user_id().empty())
+ return -EINVAL;
+
+ user = driver->get_user(op_state.get_user_id());
+ std::string tenant = user->get_tenant();
+
+ // split possible tenant/name
+ auto pos = bucket_name.find('/');
+ if (pos != string::npos) {
+ tenant = bucket_name.substr(0, pos);
+ bucket_name = bucket_name.substr(pos + 1);
+ }
+
+ int r = driver->get_bucket(dpp, user.get(), tenant, bucket_name, &bucket, y);
+ if (r < 0) {
+ set_err_msg(err_msg, "failed to fetch bucket info for bucket=" + bucket_name);
+ return r;
+ }
+
+ op_state.set_bucket(bucket->clone());
+
+ if (!rgw::sal::User::empty(user.get())) {
+ r = user->load_user(dpp, y);
+ if (r < 0) {
+ set_err_msg(err_msg, "failed to fetch user info");
+ return r;
+ }
+ }
+
+ op_state.display_name = user->get_display_name();
+
+ clear_failure();
+ return 0;
+}
+
+bool rgw_find_bucket_by_id(const DoutPrefixProvider *dpp, CephContext *cct, rgw::sal::Driver* driver,
+ const string& marker, const string& bucket_id, rgw_bucket* bucket_out)
+{
+ void *handle = NULL;
+ bool truncated = false;
+ string s;
+
+ int ret = driver->meta_list_keys_init(dpp, "bucket.instance", marker, &handle);
+ if (ret < 0) {
+ cerr << "ERROR: can't get key: " << cpp_strerror(-ret) << std::endl;
+ driver->meta_list_keys_complete(handle);
+    return false; // on error, report the bucket as not found
+ }
+ do {
+ list<string> keys;
+ ret = driver->meta_list_keys_next(dpp, handle, 1000, keys, &truncated);
+ if (ret < 0) {
+ cerr << "ERROR: lists_keys_next(): " << cpp_strerror(-ret) << std::endl;
+ driver->meta_list_keys_complete(handle);
+ return -ret;
+ }
+ for (list<string>::iterator iter = keys.begin(); iter != keys.end(); ++iter) {
+ s = *iter;
+ ret = rgw_bucket_parse_bucket_key(cct, s, bucket_out, nullptr);
+ if (ret < 0) {
+ continue;
+ }
+ if (bucket_id == bucket_out->bucket_id) {
+ driver->meta_list_keys_complete(handle);
+ return true;
+ }
+ }
+ } while (truncated);
+ driver->meta_list_keys_complete(handle);
+ return false;
+}
+
+int RGWBucket::chown(RGWBucketAdminOpState& op_state, const string& marker,
+ optional_yield y, const DoutPrefixProvider *dpp, std::string *err_msg)
+{
+ /* User passed in by rgw_admin is the new user; get the current user and set it in
+ * the bucket */
+ std::unique_ptr<rgw::sal::User> old_user = driver->get_user(bucket->get_info().owner);
+ bucket->set_owner(old_user.get());
+
+ return rgw_chown_bucket_and_objects(driver, bucket.get(), user.get(), marker, err_msg, dpp, y);
+}
+
+int RGWBucket::set_quota(RGWBucketAdminOpState& op_state, const DoutPrefixProvider *dpp, std::string *err_msg)
+{
+ bucket = op_state.get_bucket()->clone();
+
+ bucket->get_info().quota = op_state.quota;
+ int r = bucket->put_info(dpp, false, real_time());
+ if (r < 0) {
+ set_err_msg(err_msg, "ERROR: failed writing bucket instance info: " + cpp_strerror(-r));
+ return r;
+ }
+ return r;
+}
+
+int RGWBucket::remove_object(const DoutPrefixProvider *dpp, RGWBucketAdminOpState& op_state, std::string *err_msg)
+{
+ std::string object_name = op_state.get_object_name();
+
+ rgw_obj_key key(object_name);
+
+ bucket = op_state.get_bucket()->clone();
+
+ int ret = rgw_remove_object(dpp, driver, bucket.get(), key);
+ if (ret < 0) {
+ set_err_msg(err_msg, "unable to remove object" + cpp_strerror(-ret));
+ return ret;
+ }
+
+ return 0;
+}
+
+static void dump_bucket_index(const vector<rgw_bucket_dir_entry>& objs, Formatter *f)
+{
+ for (auto iter = objs.begin(); iter != objs.end(); ++iter) {
+ f->dump_string("object", iter->key.name);
+ }
+}
+
+static void dump_bucket_usage(map<RGWObjCategory, RGWStorageStats>& stats, Formatter *formatter)
+{
+ map<RGWObjCategory, RGWStorageStats>::iterator iter;
+
+ formatter->open_object_section("usage");
+ for (iter = stats.begin(); iter != stats.end(); ++iter) {
+ RGWStorageStats& s = iter->second;
+ formatter->open_object_section(to_string(iter->first));
+ s.dump(formatter);
+ formatter->close_section();
+ }
+ formatter->close_section();
+}
+
+static void dump_index_check(map<RGWObjCategory, RGWStorageStats> existing_stats,
+ map<RGWObjCategory, RGWStorageStats> calculated_stats,
+ Formatter *formatter)
+{
+ formatter->open_object_section("check_result");
+ formatter->open_object_section("existing_header");
+ dump_bucket_usage(existing_stats, formatter);
+ formatter->close_section();
+ formatter->open_object_section("calculated_header");
+ dump_bucket_usage(calculated_stats, formatter);
+ formatter->close_section();
+ formatter->close_section();
+}
+
+int RGWBucket::check_bad_index_multipart(RGWBucketAdminOpState& op_state,
+ RGWFormatterFlusher& flusher,
+ const DoutPrefixProvider *dpp,
+ std::string *err_msg)
+{
+ const bool fix_index = op_state.will_fix_index();
+
+ bucket = op_state.get_bucket()->clone();
+
+ rgw::sal::Bucket::ListParams params;
+ params.list_versions = true;
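+  // restrict the scan to the multipart namespace, which holds the per-upload
+  // ".meta" object and the uploaded parts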
+ params.ns = RGW_OBJ_NS_MULTIPART;
+
+ std::map<std::string, bool> meta_objs;
+ std::map<rgw_obj_index_key, std::string> all_objs;
+ bool is_truncated;
+ do {
+ rgw::sal::Bucket::ListResults results;
+ int r = bucket->list(dpp, params, listing_max_entries, results, null_yield);
+ if (r < 0) {
+ set_err_msg(err_msg, "failed to list objects in bucket=" + bucket->get_name() +
+ " err=" + cpp_strerror(-r));
+
+ return r;
+ }
+ is_truncated = results.is_truncated;
+
+ for (const auto& o : results.objs) {
+ rgw_obj_index_key key = o.key;
+ rgw_obj obj(bucket->get_key(), key);
+ std::string oid = obj.get_oid();
+
+ int pos = oid.find_last_of('.');
+ if (pos < 0) {
+ /* obj has no suffix */
+ all_objs[key] = oid;
+ } else {
+ /* obj has suffix */
+ std::string name = oid.substr(0, pos);
+ std::string suffix = oid.substr(pos + 1);
+
+ if (suffix.compare("meta") == 0) {
+ meta_objs[name] = true;
+ } else {
+ all_objs[key] = name;
+ }
+ }
+ }
+ } while (is_truncated);
+
+ std::list<rgw_obj_index_key> objs_to_unlink;
+ Formatter *f = flusher.get_formatter();
+
+ f->open_array_section("invalid_multipart_entries");
+
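+  // a part entry whose base name has no surviving ".meta" object belongs to
+  // an upload that no longer exists; queue it to be unlinked from the index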
+ for (const auto& o : all_objs) {
+ const std::string& name = o.second;
+ if (meta_objs.find(name) == meta_objs.end()) {
+ objs_to_unlink.push_back(o.first);
+ }
+
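+    // process the queue in batches once it grows past listing_max_entries so
+    // memory use stays bounded on very large buckets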
+ if (objs_to_unlink.size() > listing_max_entries) {
+ if (fix_index) {
+ // note: under rados this removes directly from rados index objects
+ int r = bucket->remove_objs_from_index(dpp, objs_to_unlink);
+ if (r < 0) {
+ set_err_msg(err_msg, "ERROR: remove_obj_from_index() returned error: " +
+ cpp_strerror(-r));
+ return r;
+ }
+ }
+
+ dump_mulipart_index_results(objs_to_unlink, f);
+ flusher.flush();
+ objs_to_unlink.clear();
+ }
+ }
+
+ if (fix_index) {
+ // note: under rados this removes directly from rados index objects
+ int r = bucket->remove_objs_from_index(dpp, objs_to_unlink);
+ if (r < 0) {
+ set_err_msg(err_msg, "ERROR: remove_obj_from_index() returned error: " +
+ cpp_strerror(-r));
+
+ return r;
+ }
+ }
+
+ dump_mulipart_index_results(objs_to_unlink, f);
+ f->close_section();
+ flusher.flush();
+
+ return 0;
+}
+
+int RGWBucket::check_object_index(const DoutPrefixProvider *dpp,
+ RGWBucketAdminOpState& op_state,
+ RGWFormatterFlusher& flusher,
+ optional_yield y,
+ std::string *err_msg)
+{
+
+ bool fix_index = op_state.will_fix_index();
+
+ if (!fix_index) {
+ set_err_msg(err_msg, "check-objects flag requires fix index enabled");
+ return -EINVAL;
+ }
+
+ // use a quicker/shorter tag timeout during this process
+ bucket->set_tag_timeout(dpp, BUCKET_TAG_QUICK_TIMEOUT);
+
+ rgw::sal::Bucket::ListResults results;
+ results.is_truncated = true;
+
+ Formatter *formatter = flusher.get_formatter();
+ formatter->open_object_section("objects");
+
+ while (results.is_truncated) {
+ rgw::sal::Bucket::ListParams params;
+ params.marker = results.next_marker;
+ params.force_check_filter = rgw_bucket_object_check_filter;
+
+ int r = bucket->list(dpp, params, listing_max_entries, results, y);
+
+ if (r == -ENOENT) {
+ break;
+ } else if (r < 0) {
+ set_err_msg(err_msg, "ERROR: failed operation r=" + cpp_strerror(-r));
+ }
+
+ dump_bucket_index(results.objs, formatter);
+ flusher.flush();
+ }
+
+ formatter->close_section();
+
+ // restore normal tag timeout for bucket
+ bucket->set_tag_timeout(dpp, 0);
+
+ return 0;
+}
+
+/**
+ * Loops over all olh entries in a bucket shard and finds ones with
+ * exists=false and pending_removal=true. If the pending log is empty on
+ * these entries, they were left behind after the last remaining version of
+ * an object was deleted or after an incomplete upload. This was known to
+ * happen historically due to concurrency conflicts among requests referencing
+ * the same object key. If op_state.fix_index is true, we continue where the
+ * request left off by calling RGWRados::clear_olh. If the pending log is not
+ * empty, we attempt to apply it.
+ */
+static int check_index_olh(rgw::sal::RadosStore* const rados_store,
+ rgw::sal::Bucket* const bucket,
+ const DoutPrefixProvider *dpp,
+ RGWBucketAdminOpState& op_state,
+ RGWFormatterFlusher& flusher,
+ const int shard,
+ uint64_t* const count_out,
+ optional_yield y)
+{
+ string marker = BI_OLH_ENTRY_NS_START;
+ bool is_truncated = true;
+ list<rgw_cls_bi_entry> entries;
+
+ RGWObjectCtx obj_ctx(rados_store);
+ RGWRados* store = rados_store->getRados();
+ RGWRados::BucketShard bs(store);
+
+ int ret = bs.init(dpp, bucket->get_info(), bucket->get_info().layout.current_index, shard);
+ if (ret < 0) {
+ ldpp_dout(dpp, -1) << "ERROR bs.init(bucket=" << bucket << "): " << cpp_strerror(-ret) << dendl;
+ return ret;
+ }
+
+ *count_out = 0;
+ do {
+ entries.clear();
+ ret = store->bi_list(bs, "", marker, -1, &entries, &is_truncated);
+ if (ret < 0) {
+ ldpp_dout(dpp, -1) << "ERROR bi_list(): " << cpp_strerror(-ret) << dendl;
+ break;
+ }
+ list<rgw_cls_bi_entry>::iterator iter;
+ for (iter = entries.begin(); iter != entries.end(); ++iter) {
+ rgw_cls_bi_entry& entry = *iter;
+ marker = entry.idx;
+ if (entry.type != BIIndexType::OLH) {
+ is_truncated = false;
+ break;
+ }
+ rgw_bucket_olh_entry olh_entry;
+ auto iiter = entry.data.cbegin();
+ try {
+ decode(olh_entry, iiter);
+ } catch (buffer::error& err) {
+ ldpp_dout(dpp, -1) << "ERROR failed to decode olh entry for key: " << entry.idx << dendl;
+ continue;
+ }
+ if (olh_entry.exists || !olh_entry.pending_removal) {
+ continue;
+ }
+ if (op_state.will_fix_index()) {
+ rgw_obj obj(bucket->get_key(), olh_entry.key.name);
+ if (olh_entry.pending_log.empty()) {
+ ret = store->clear_olh(dpp, obj_ctx, obj, bucket->get_info(), olh_entry.tag, olh_entry.epoch, y);
+ if (ret < 0) {
+ ldpp_dout(dpp, -1) << "ERROR failed to clear olh for: " << olh_entry.key.name << " clear_olh(): " << cpp_strerror(-ret) << dendl;
+ continue;
+ }
+ } else {
+ std::unique_ptr<rgw::sal::Object> object = bucket->get_object({olh_entry.key.name});
+ RGWObjState *state;
+ ret = object->get_obj_state(dpp, &state, y, false);
+ if (ret < 0) {
+ ldpp_dout(dpp, -1) << "ERROR failed to get state for: " << olh_entry.key.name << " get_obj_state(): " << cpp_strerror(-ret) << dendl;
+ continue;
+ }
+ ret = store->update_olh(dpp, obj_ctx, state, bucket->get_info(), obj);
+ if (ret < 0) {
+ ldpp_dout(dpp, -1) << "ERROR failed to update olh for: " << olh_entry.key.name << " update_olh(): " << cpp_strerror(-ret) << dendl;
+ continue;
+ }
+ }
+ }
+ if (op_state.dump_keys) {
+ flusher.get_formatter()->dump_string("", olh_entry.key.name);
+ if (flusher.get_formatter()->get_len() > FORMATTER_LEN_FLUSH_THRESHOLD) {
+ flusher.flush();
+ }
+ }
+ *count_out += 1;
+ }
+ } while (is_truncated);
+ flusher.flush();
+ return 0;
+}
+
+
+/**
+ * Spawns separate coroutines to check each bucket shard for leftover
+ * olh entries (and remove them if op_state.fix_index is true).
+ */
+int RGWBucket::check_index_olh(rgw::sal::RadosStore* const rados_store,
+ const DoutPrefixProvider *dpp,
+ RGWBucketAdminOpState& op_state,
+ RGWFormatterFlusher& flusher)
+{
+ const RGWBucketInfo& bucket_info = get_bucket_info();
+ if ((bucket_info.versioning_status() & BUCKET_VERSIONED) == 0) {
+ ldpp_dout(dpp, 0) << "WARNING: this command is only applicable to versioned buckets" << dendl;
+ return 0;
+ }
+
+ Formatter* formatter = flusher.get_formatter();
+ if (op_state.dump_keys) {
+ formatter->open_array_section("");
+ }
+
+ const int max_shards = rgw::num_shards(bucket_info.layout.current_index);
+ std::string verb = op_state.will_fix_index() ? "removed" : "found";
+ uint64_t count_out = 0;
+
+ boost::asio::io_context context;
+ int next_shard = 0;
+
+ const int max_aio = std::max(1, op_state.get_max_aio());
+
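+  // all of these coroutines are resumed by the single thread running
+  // context.run() below, so shards are claimed through the shared next_shard
+  // counter without any locking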
+ for (int i=0; i<max_aio; i++) {
+ spawn::spawn(context, [&](yield_context yield) {
+ while (true) {
+ int shard = next_shard;
+ next_shard += 1;
+ if (shard >= max_shards) {
+ return;
+ }
+ optional_yield y(context, yield);
+ uint64_t shard_count;
+ int r = ::check_index_olh(rados_store, &*bucket, dpp, op_state, flusher, shard, &shard_count, y);
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << "NOTICE: error processing shard " << shard <<
+ " check_index_olh(): " << r << dendl;
+ }
+ count_out += shard_count;
+ if (!op_state.hide_progress) {
+ ldpp_dout(dpp, 1) << "NOTICE: finished shard " << shard << " (" << shard_count <<
+ " entries " << verb << ")" << dendl;
+ }
+ }
+ });
+ }
+ try {
+ context.run();
+ } catch (const std::system_error& e) {
+ return -e.code().value();
+ }
+ if (!op_state.hide_progress) {
+ ldpp_dout(dpp, 1) << "NOTICE: finished all shards (" << count_out <<
+ " entries " << verb << ")" << dendl;
+ }
+ if (op_state.dump_keys) {
+ formatter->close_section();
+ flusher.flush();
+ }
+ return 0;
+}
+
+/**
+ * Indicates whether a versioned bucket instance entry is listable in the
+ * index. It does this by looping over all plain entries with prefix equal to
+ * the key name, and checking whether any have an instance ID matching the one
+ * on the specified key. The existence of an instance entry without a matching
+ * plain entry indicates that the object was uploaded successfully, but the
+ * request exited prior to linking the object into the index (via the creation
+ * of a plain entry).
+ */
+static int is_versioned_instance_listable(const DoutPrefixProvider *dpp,
+ RGWRados::BucketShard& bs,
+ const cls_rgw_obj_key& key,
+ bool& listable,
+ optional_yield y)
+{
+ const std::string empty_delim;
+ cls_rgw_obj_key marker;
+ rgw_cls_list_ret result;
+ listable = false;
+
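+  // list plain index entries whose names start with key.name and look for
+  // one whose key (name + instance) matches exactly; finding it means this
+  // version is linked into the listing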
+ do {
+ librados::ObjectReadOperation op;
+ cls_rgw_bucket_list_op(op, marker, key.name, empty_delim, 1000,
+ true, &result);
+ bufferlist ibl;
+ int r = bs.bucket_obj.operate(dpp, &op, &ibl, y);
+ if (r < 0) {
+ return r;
+ }
+
+ for (auto const& entry : result.dir.m) {
+ if (entry.second.key == key) {
+ listable = true;
+ return 0;
+ }
+ marker = entry.second.key;
+ }
+ } while (result.is_truncated);
+ return 0;
+}
+
+/**
+ * Loops over all instance entries in a bucket shard and finds ones with
+ * versioned_epoch=0 and an mtime that is earlier than op_state.min_age
+ * relative to the current time. These entries represent objects that were
+ * uploaded successfully but were not successfully linked into the object
+ * index. As an extra precaution, we also verify that these entries are indeed
+ * non listable (have no corresponding plain entry in the index). We can assume
+ * that clients received an error response for the associated upload requests
+ * since the bucket index linking transaction did not complete. Therefore, if
+ * op_state.fix_index is true, we remove the object that is associated with the
+ * instance entry.
+ */
+static int check_index_unlinked(rgw::sal::RadosStore* const rados_store,
+ rgw::sal::Bucket* const bucket,
+ const DoutPrefixProvider *dpp,
+ RGWBucketAdminOpState& op_state,
+ RGWFormatterFlusher& flusher,
+ const int shard,
+ uint64_t* const count_out,
+ optional_yield y)
+{
+ string marker = BI_INSTANCE_ENTRY_NS_START;
+ bool is_truncated = true;
+ list<rgw_cls_bi_entry> entries;
+
+ RGWObjectCtx obj_ctx(rados_store);
+ RGWRados* store = rados_store->getRados();
+ RGWRados::BucketShard bs(store);
+
+ int ret = bs.init(dpp, bucket->get_info(), bucket->get_info().layout.current_index, shard);
+ if (ret < 0) {
+ ldpp_dout(dpp, -1) << "ERROR bs.init(bucket=" << bucket << "): " << cpp_strerror(-ret) << dendl;
+ return ret;
+ }
+
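+  // instance entries newer than min_age are skipped below, since their
+  // uploads may still be in flight and simply have not linked a plain entry
+  // yet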
+ ceph::real_clock::time_point now = ceph::real_clock::now();
+ ceph::real_clock::time_point not_after = now - op_state.min_age;
+
+ *count_out = 0;
+ do {
+ entries.clear();
+ ret = store->bi_list(bs, "", marker, -1, &entries, &is_truncated);
+ if (ret < 0) {
+ ldpp_dout(dpp, -1) << "ERROR bi_list(): " << cpp_strerror(-ret) << dendl;
+ break;
+ }
+ list<rgw_cls_bi_entry>::iterator iter;
+ for (iter = entries.begin(); iter != entries.end(); ++iter) {
+ rgw_cls_bi_entry& entry = *iter;
+ marker = entry.idx;
+ if (entry.type != BIIndexType::Instance) {
+ is_truncated = false;
+ break;
+ }
+ rgw_bucket_dir_entry dir_entry;
+ auto iiter = entry.data.cbegin();
+ try {
+ decode(dir_entry, iiter);
+ } catch (buffer::error& err) {
+ ldpp_dout(dpp, -1) << "ERROR failed to decode instance entry for key: " <<
+ entry.idx << dendl;
+ continue;
+ }
+ if (dir_entry.versioned_epoch != 0 || dir_entry.meta.mtime > not_after) {
+ continue;
+ }
+ bool listable;
+ ret = is_versioned_instance_listable(dpp, bs, dir_entry.key, listable, y);
+ if (ret < 0) {
+ ldpp_dout(dpp, -1) << "ERROR is_versioned_instance_listable(key='" <<
+ dir_entry.key << "'): " << cpp_strerror(-ret) << dendl;
+ continue;
+ }
+ if (listable) {
+ continue;
+ }
+ if (op_state.will_fix_index()) {
+ rgw_obj_key key(dir_entry.key.name, dir_entry.key.instance);
+ ret = rgw_remove_object(dpp, rados_store, bucket, key);
+ if (ret < 0) {
+ ldpp_dout(dpp, -1) << "ERROR rgw_remove_obj(key='" <<
+ dir_entry.key << "'): " << cpp_strerror(-ret) << dendl;
+ continue;
+ }
+ }
+ if (op_state.dump_keys) {
+ Formatter* const formatter = flusher.get_formatter();
+ formatter->open_object_section("object_instance");
+ formatter->dump_string("name", dir_entry.key.name);
+ formatter->dump_string("instance", dir_entry.key.instance);
+ formatter->close_section();
+ if (formatter->get_len() > FORMATTER_LEN_FLUSH_THRESHOLD) {
+ flusher.flush();
+ }
+ }
+ *count_out += 1;
+ }
+ } while (is_truncated);
+ flusher.flush();
+ return 0;
+}
+
+/**
+ * Spawns separate coroutines to check each bucket shard for unlinked
+ * instance entries (and remove them if op_state.fix_index is true).
+ */
+int RGWBucket::check_index_unlinked(rgw::sal::RadosStore* const rados_store,
+ const DoutPrefixProvider *dpp,
+ RGWBucketAdminOpState& op_state,
+ RGWFormatterFlusher& flusher)
+{
+ const RGWBucketInfo& bucket_info = get_bucket_info();
+ if ((bucket_info.versioning_status() & BUCKET_VERSIONED) == 0) {
+ ldpp_dout(dpp, 0) << "WARNING: this command is only applicable to versioned buckets" << dendl;
+ return 0;
+ }
+
+ Formatter* formatter = flusher.get_formatter();
+ if (op_state.dump_keys) {
+ formatter->open_array_section("");
+ }
+
+ const int max_shards = rgw::num_shards(bucket_info.layout.current_index);
+ std::string verb = op_state.will_fix_index() ? "removed" : "found";
+ uint64_t count_out = 0;
+
+ int max_aio = std::max(1, op_state.get_max_aio());
+ int next_shard = 0;
+ boost::asio::io_context context;
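+  // as in check_index_olh() above, the coroutines cooperatively claim shards
+  // via next_shard on a single-threaded io_context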
+ for (int i=0; i<max_aio; i++) {
+ spawn::spawn(context, [&](yield_context yield) {
+ while (true) {
+ int shard = next_shard;
+ next_shard += 1;
+ if (shard >= max_shards) {
+ return;
+ }
+ uint64_t shard_count;
+ optional_yield y {context, yield};
+ int r = ::check_index_unlinked(rados_store, &*bucket, dpp, op_state, flusher, shard, &shard_count, y);
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << "ERROR: error processing shard " << shard <<
+ " check_index_unlinked(): " << r << dendl;
+ }
+ count_out += shard_count;
+ if (!op_state.hide_progress) {
+ ldpp_dout(dpp, 1) << "NOTICE: finished shard " << shard << " (" << shard_count <<
+ " entries " << verb << ")" << dendl;
+ }
+ }
+ });
+ }
+ try {
+ context.run();
+ } catch (const std::system_error& e) {
+ return -e.code().value();
+ }
+
+ if (!op_state.hide_progress) {
+ ldpp_dout(dpp, 1) << "NOTICE: finished all shards (" << count_out <<
+ " entries " << verb << ")" << dendl;
+ }
+ if (op_state.dump_keys) {
+ formatter->close_section();
+ flusher.flush();
+ }
+ return 0;
+}
+
+int RGWBucket::check_index(const DoutPrefixProvider *dpp,
+ RGWBucketAdminOpState& op_state,
+ map<RGWObjCategory, RGWStorageStats>& existing_stats,
+ map<RGWObjCategory, RGWStorageStats>& calculated_stats,
+ std::string *err_msg)
+{
+ bool fix_index = op_state.will_fix_index();
+
+ int r = bucket->check_index(dpp, existing_stats, calculated_stats);
+ if (r < 0) {
+ set_err_msg(err_msg, "failed to check index error=" + cpp_strerror(-r));
+ return r;
+ }
+
+ if (fix_index) {
+ r = bucket->rebuild_index(dpp);
+ if (r < 0) {
+ set_err_msg(err_msg, "failed to rebuild index err=" + cpp_strerror(-r));
+ return r;
+ }
+ }
+
+ return 0;
+}
+
+int RGWBucket::sync(RGWBucketAdminOpState& op_state, const DoutPrefixProvider *dpp, std::string *err_msg)
+{
+ if (!driver->is_meta_master()) {
+ set_err_msg(err_msg, "ERROR: failed to update bucket sync: only allowed on meta master zone");
+ return -EINVAL;
+ }
+ bool sync = op_state.will_sync_bucket();
+ if (sync) {
+ bucket->get_info().flags &= ~BUCKET_DATASYNC_DISABLED;
+ } else {
+ bucket->get_info().flags |= BUCKET_DATASYNC_DISABLED;
+ }
+
+ // when writing this metadata, RGWSI_BucketIndex_RADOS::handle_overwrite()
+ // will write the corresponding datalog and bilog entries
+ int r = bucket->put_info(dpp, false, real_time());
+ if (r < 0) {
+ set_err_msg(err_msg, "ERROR: failed writing bucket instance info:" + cpp_strerror(-r));
+ return r;
+ }
+
+ return 0;
+}
+
+
+int RGWBucket::policy_bl_to_stream(bufferlist& bl, ostream& o)
+{
+ RGWAccessControlPolicy_S3 policy(g_ceph_context);
+ int ret = decode_bl(bl, policy);
+ if (ret < 0) {
+ ldout(driver->ctx(),0) << "failed to decode RGWAccessControlPolicy" << dendl;
+ }
+ policy.to_xml(o);
+ return 0;
+}
+
+int rgw_object_get_attr(const DoutPrefixProvider *dpp,
+ rgw::sal::Driver* driver, rgw::sal::Object* obj,
+ const char* attr_name, bufferlist& out_bl, optional_yield y)
+{
+ std::unique_ptr<rgw::sal::Object::ReadOp> rop = obj->get_read_op();
+
+ return rop->get_attr(dpp, attr_name, out_bl, y);
+}
+
+int RGWBucket::get_policy(RGWBucketAdminOpState& op_state, RGWAccessControlPolicy& policy, optional_yield y, const DoutPrefixProvider *dpp)
+{
+ int ret;
+ std::string object_name = op_state.get_object_name();
+
+ bucket = op_state.get_bucket()->clone();
+
+ if (!object_name.empty()) {
+ bufferlist bl;
+ std::unique_ptr<rgw::sal::Object> obj = bucket->get_object(rgw_obj_key(object_name));
+
+ ret = rgw_object_get_attr(dpp, driver, obj.get(), RGW_ATTR_ACL, bl, y);
+ if (ret < 0){
+ return ret;
+ }
+
+ ret = decode_bl(bl, policy);
+ if (ret < 0) {
+ ldout(driver->ctx(),0) << "failed to decode RGWAccessControlPolicy" << dendl;
+ }
+ return ret;
+ }
+
+ map<string, bufferlist>::iterator aiter = bucket->get_attrs().find(RGW_ATTR_ACL);
+ if (aiter == bucket->get_attrs().end()) {
+ return -ENOENT;
+ }
+
+ ret = decode_bl(aiter->second, policy);
+ if (ret < 0) {
+ ldout(driver->ctx(),0) << "failed to decode RGWAccessControlPolicy" << dendl;
+ }
+
+ return ret;
+}
+
+
+int RGWBucketAdminOp::get_policy(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state,
+ RGWAccessControlPolicy& policy, const DoutPrefixProvider *dpp)
+{
+ RGWBucket bucket;
+
+ int ret = bucket.init(driver, op_state, null_yield, dpp);
+ if (ret < 0)
+ return ret;
+
+ ret = bucket.get_policy(op_state, policy, null_yield, dpp);
+ if (ret < 0)
+ return ret;
+
+ return 0;
+}
+
+/* Wrappers to facilitate RESTful interface */
+
+
+int RGWBucketAdminOp::get_policy(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state,
+ RGWFormatterFlusher& flusher, const DoutPrefixProvider *dpp)
+{
+ RGWAccessControlPolicy policy(driver->ctx());
+
+ int ret = get_policy(driver, op_state, policy, dpp);
+ if (ret < 0)
+ return ret;
+
+ Formatter *formatter = flusher.get_formatter();
+
+ flusher.start(0);
+
+ formatter->open_object_section("policy");
+ policy.dump(formatter);
+ formatter->close_section();
+
+ flusher.flush();
+
+ return 0;
+}
+
+int RGWBucketAdminOp::dump_s3_policy(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state,
+ ostream& os, const DoutPrefixProvider *dpp)
+{
+ RGWAccessControlPolicy_S3 policy(driver->ctx());
+
+ int ret = get_policy(driver, op_state, policy, dpp);
+ if (ret < 0)
+ return ret;
+
+ policy.to_xml(os);
+
+ return 0;
+}
+
+int RGWBucketAdminOp::unlink(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state, const DoutPrefixProvider *dpp)
+{
+ RGWBucket bucket;
+
+ int ret = bucket.init(driver, op_state, null_yield, dpp);
+ if (ret < 0)
+ return ret;
+
+ return static_cast<rgw::sal::RadosStore*>(driver)->ctl()->bucket->unlink_bucket(op_state.get_user_id(), op_state.get_bucket()->get_info().bucket, null_yield, dpp, true);
+}
+
+int RGWBucketAdminOp::link(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state, const DoutPrefixProvider *dpp, string *err)
+{
+ if (!op_state.is_user_op()) {
+ set_err_msg(err, "empty user id");
+ return -EINVAL;
+ }
+
+ RGWBucket bucket;
+ int ret = bucket.init(driver, op_state, null_yield, dpp, err);
+ if (ret < 0)
+ return ret;
+
+ string bucket_id = op_state.get_bucket_id();
+ std::string display_name = op_state.get_user_display_name();
+ std::unique_ptr<rgw::sal::Bucket> loc_bucket;
+ std::unique_ptr<rgw::sal::Bucket> old_bucket;
+
+ loc_bucket = op_state.get_bucket()->clone();
+
+ if (!bucket_id.empty() && bucket_id != loc_bucket->get_bucket_id()) {
+ set_err_msg(err,
+ "specified bucket id does not match " + loc_bucket->get_bucket_id());
+ return -EINVAL;
+ }
+
+ old_bucket = loc_bucket->clone();
+
+ loc_bucket->get_key().tenant = op_state.get_user_id().tenant;
+
+ if (!op_state.new_bucket_name.empty()) {
+ auto pos = op_state.new_bucket_name.find('/');
+ if (pos != string::npos) {
+ loc_bucket->get_key().tenant = op_state.new_bucket_name.substr(0, pos);
+ loc_bucket->get_key().name = op_state.new_bucket_name.substr(pos + 1);
+ } else {
+ loc_bucket->get_key().name = op_state.new_bucket_name;
+ }
+ }
+
+ RGWObjVersionTracker objv_tracker;
+ RGWObjVersionTracker old_version = loc_bucket->get_info().objv_tracker;
+
+ map<string, bufferlist>::iterator aiter = loc_bucket->get_attrs().find(RGW_ATTR_ACL);
+ if (aiter == loc_bucket->get_attrs().end()) {
+ // should never happen; only pre-argonaut buckets lacked this.
+ ldpp_dout(dpp, 0) << "WARNING: can't bucket link because no acl on bucket=" << old_bucket << dendl;
+ set_err_msg(err,
+ "While crossing the Anavros you have displeased the goddess Hera."
+ " You must sacrifice your ancient bucket " + loc_bucket->get_bucket_id());
+ return -EINVAL;
+ }
+ bufferlist& aclbl = aiter->second;
+ RGWAccessControlPolicy policy;
+ ACLOwner owner;
+ try {
+ auto iter = aclbl.cbegin();
+ decode(policy, iter);
+ owner = policy.get_owner();
+ } catch (buffer::error& e) {
+ set_err_msg(err, "couldn't decode policy");
+ return -EIO;
+ }
+
+ int r = static_cast<rgw::sal::RadosStore*>(driver)->ctl()->bucket->unlink_bucket(owner.get_id(), old_bucket->get_info().bucket, null_yield, dpp, false);
+ if (r < 0) {
+ set_err_msg(err, "could not unlink policy from user " + owner.get_id().to_str());
+ return r;
+ }
+
+ // now update the user for the bucket...
+ if (display_name.empty()) {
+ ldpp_dout(dpp, 0) << "WARNING: user " << op_state.get_user_id() << " has no display name set" << dendl;
+ }
+
+ RGWAccessControlPolicy policy_instance;
+ policy_instance.create_default(op_state.get_user_id(), display_name);
+ owner = policy_instance.get_owner();
+
+ aclbl.clear();
+ policy_instance.encode(aclbl);
+
+ bool exclusive = false;
+ loc_bucket->get_info().owner = op_state.get_user_id();
+ if (*loc_bucket != *old_bucket) {
+ loc_bucket->get_info().bucket = loc_bucket->get_key();
+ loc_bucket->get_info().objv_tracker.version_for_read()->ver = 0;
+ exclusive = true;
+ }
+
+ r = loc_bucket->put_info(dpp, exclusive, ceph::real_time());
+ if (r < 0) {
+ set_err_msg(err, "ERROR: failed writing bucket instance info: " + cpp_strerror(-r));
+ return r;
+ }
+
+ /* link to user */
+ RGWBucketEntryPoint ep;
+ ep.bucket = loc_bucket->get_info().bucket;
+ ep.owner = op_state.get_user_id();
+ ep.creation_time = loc_bucket->get_info().creation_time;
+ ep.linked = true;
+ rgw::sal::Attrs ep_attrs;
+ rgw_ep_info ep_data{ep, ep_attrs};
+
+ r = static_cast<rgw::sal::RadosStore*>(driver)->ctl()->bucket->link_bucket(op_state.get_user_id(), loc_bucket->get_info().bucket, loc_bucket->get_info().creation_time, null_yield, dpp, true, &ep_data);
+ if (r < 0) {
+ set_err_msg(err, "failed to relink bucket");
+ return r;
+ }
+
+ if (*loc_bucket != *old_bucket) {
+ // like RGWRados::delete_bucket -- excepting no bucket_index work.
+ r = static_cast<rgw::sal::RadosStore*>(driver)->ctl()->bucket->remove_bucket_entrypoint_info(
+ old_bucket->get_key(), null_yield, dpp,
+ RGWBucketCtl::Bucket::RemoveParams()
+ .set_objv_tracker(&ep_data.ep_objv));
+ if (r < 0) {
+ set_err_msg(err, "failed to unlink old bucket " + old_bucket->get_tenant() + "/" + old_bucket->get_name());
+ return r;
+ }
+ r = static_cast<rgw::sal::RadosStore*>(driver)->ctl()->bucket->remove_bucket_instance_info(
+ old_bucket->get_key(), old_bucket->get_info(),
+ null_yield, dpp,
+ RGWBucketCtl::BucketInstance::RemoveParams()
+ .set_objv_tracker(&ep_data.ep_objv));
+ if (r < 0) {
+ set_err_msg(err, "failed to unlink old bucket " + old_bucket->get_tenant() + "/" + old_bucket->get_name());
+ return r;
+ }
+ }
+
+ return 0;
+}
+
+int RGWBucketAdminOp::chown(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state, const string& marker, const DoutPrefixProvider *dpp, string *err)
+{
+ RGWBucket bucket;
+
+ int ret = bucket.init(driver, op_state, null_yield, dpp, err);
+ if (ret < 0)
+ return ret;
+
+ return bucket.chown(op_state, marker, null_yield, dpp, err);
+
+}
+
+int RGWBucketAdminOp::check_index_olh(rgw::sal::RadosStore* store, RGWBucketAdminOpState& op_state,
+ RGWFormatterFlusher& flusher, const DoutPrefixProvider *dpp)
+{
+ RGWBucket bucket;
+ int ret = bucket.init(store, op_state, null_yield, dpp);
+ if (ret < 0) {
+ ldpp_dout(dpp, -1) << "bucket.init(): " << ret << dendl;
+ return ret;
+ }
+ flusher.start(0);
+ ret = bucket.check_index_olh(store, dpp, op_state, flusher);
+ if (ret < 0) {
+ ldpp_dout(dpp, -1) << "check_index_olh(): " << ret << dendl;
+ return ret;
+ }
+ flusher.flush();
+ return 0;
+}
+
+int RGWBucketAdminOp::check_index_unlinked(rgw::sal::RadosStore* store,
+ RGWBucketAdminOpState& op_state,
+ RGWFormatterFlusher& flusher,
+ const DoutPrefixProvider *dpp)
+{
+ flusher.start(0);
+ RGWBucket bucket;
+ int ret = bucket.init(store, op_state, null_yield, dpp);
+ if (ret < 0) {
+ ldpp_dout(dpp, -1) << "bucket.init(): " << ret << dendl;
+ return ret;
+ }
+ ret = bucket.check_index_unlinked(store, dpp, op_state, flusher);
+ if (ret < 0) {
+ ldpp_dout(dpp, -1) << "check_index_unlinked(): " << ret << dendl;
+ return ret;
+ }
+ flusher.flush();
+ return 0;
+}
+
+int RGWBucketAdminOp::check_index(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state,
+ RGWFormatterFlusher& flusher, optional_yield y, const DoutPrefixProvider *dpp)
+{
+ int ret;
+ map<RGWObjCategory, RGWStorageStats> existing_stats;
+ map<RGWObjCategory, RGWStorageStats> calculated_stats;
+
+
+ RGWBucket bucket;
+
+ ret = bucket.init(driver, op_state, null_yield, dpp);
+ if (ret < 0)
+ return ret;
+
+ Formatter *formatter = flusher.get_formatter();
+ flusher.start(0);
+ formatter->open_object_section("bucket_check");
+
+ ret = bucket.check_bad_index_multipart(op_state, flusher, dpp);
+ if (ret < 0)
+ return ret;
+
+ if (op_state.will_check_objects()) {
+ ret = bucket.check_object_index(dpp, op_state, flusher, y);
+ if (ret < 0)
+ return ret;
+ }
+
+ ret = bucket.check_index(dpp, op_state, existing_stats, calculated_stats);
+ if (ret < 0)
+ return ret;
+
+ dump_index_check(existing_stats, calculated_stats, formatter);
+
+ formatter->close_section();
+ flusher.flush();
+
+ return 0;
+}
+
+int RGWBucketAdminOp::remove_bucket(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state,
+ optional_yield y, const DoutPrefixProvider *dpp,
+ bool bypass_gc, bool keep_index_consistent)
+{
+ std::unique_ptr<rgw::sal::Bucket> bucket;
+ std::unique_ptr<rgw::sal::User> user = driver->get_user(op_state.get_user_id());
+
+ int ret = driver->get_bucket(dpp, user.get(), user->get_tenant(), op_state.get_bucket_name(),
+ &bucket, y);
+ if (ret < 0)
+ return ret;
+
+ if (bypass_gc)
+ ret = bucket->remove_bucket_bypass_gc(op_state.get_max_aio(), keep_index_consistent, y, dpp);
+ else
+ ret = bucket->remove_bucket(dpp, op_state.will_delete_children(),
+ false, nullptr, y);
+
+ return ret;
+}
+
+int RGWBucketAdminOp::remove_object(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state, const DoutPrefixProvider *dpp)
+{
+ RGWBucket bucket;
+
+ int ret = bucket.init(driver, op_state, null_yield, dpp);
+ if (ret < 0)
+ return ret;
+
+ return bucket.remove_object(dpp, op_state);
+}
+
+int RGWBucketAdminOp::sync_bucket(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state, const DoutPrefixProvider *dpp, string *err_msg)
+{
+ RGWBucket bucket;
+ int ret = bucket.init(driver, op_state, null_yield, dpp, err_msg);
+ if (ret < 0)
+ {
+ return ret;
+ }
+ return bucket.sync(op_state, dpp, err_msg);
+}
+
+static int bucket_stats(rgw::sal::Driver* driver,
+ const std::string& tenant_name,
+ const std::string& bucket_name,
+ Formatter *formatter,
+ const DoutPrefixProvider *dpp)
+{
+ std::unique_ptr<rgw::sal::Bucket> bucket;
+ map<RGWObjCategory, RGWStorageStats> stats;
+
+ int ret = driver->get_bucket(dpp, nullptr, tenant_name, bucket_name, &bucket, null_yield);
+ if (ret < 0) {
+ return ret;
+ }
+
+ const RGWBucketInfo& bucket_info = bucket->get_info();
+
+ const auto& index = bucket->get_info().get_current_index();
+ if (is_layout_indexless(index)) {
+ cerr << "error, indexless buckets do not maintain stats; bucket=" <<
+ bucket->get_name() << std::endl;
+ return -EINVAL;
+ }
+
+ std::string bucket_ver, master_ver;
+ std::string max_marker;
+ ret = bucket->read_stats(dpp, index, RGW_NO_SHARD, &bucket_ver, &master_ver, stats, &max_marker);
+ if (ret < 0) {
+ cerr << "error getting bucket stats bucket=" << bucket->get_name() << " ret=" << ret << std::endl;
+ return ret;
+ }
+
+ utime_t ut(bucket->get_modification_time());
+ utime_t ctime_ut(bucket->get_creation_time());
+
+ formatter->open_object_section("stats");
+ formatter->dump_string("bucket", bucket->get_name());
+ formatter->dump_int("num_shards",
+ bucket->get_info().layout.current_index.layout.normal.num_shards);
+ formatter->dump_string("tenant", bucket->get_tenant());
+ formatter->dump_string("zonegroup", bucket->get_info().zonegroup);
+ formatter->dump_string("placement_rule", bucket->get_info().placement_rule.to_str());
+ ::encode_json("explicit_placement", bucket->get_key().explicit_placement, formatter);
+ formatter->dump_string("id", bucket->get_bucket_id());
+ formatter->dump_string("marker", bucket->get_marker());
+ formatter->dump_stream("index_type") << bucket->get_info().layout.current_index.layout.type;
+ formatter->dump_bool("versioned", bucket_info.versioned());
+ formatter->dump_bool("versioning_enabled", bucket_info.versioning_enabled());
+ formatter->dump_bool("object_lock_enabled", bucket_info.obj_lock_enabled());
+ formatter->dump_bool("mfa_enabled", bucket_info.mfa_enabled());
+ ::encode_json("owner", bucket->get_info().owner, formatter);
+ formatter->dump_string("ver", bucket_ver);
+ formatter->dump_string("master_ver", master_ver);
+ ut.gmtime(formatter->dump_stream("mtime"));
+ ctime_ut.gmtime(formatter->dump_stream("creation_time"));
+ formatter->dump_string("max_marker", max_marker);
+ dump_bucket_usage(stats, formatter);
+ encode_json("bucket_quota", bucket->get_info().quota, formatter);
+
+ // bucket tags
+ auto iter = bucket->get_attrs().find(RGW_ATTR_TAGS);
+ if (iter != bucket->get_attrs().end()) {
+ RGWObjTagSet_S3 tagset;
+ bufferlist::const_iterator piter{&iter->second};
+ try {
+ tagset.decode(piter);
+ tagset.dump(formatter);
+ } catch (buffer::error& err) {
+ cerr << "ERROR: caught buffer:error, couldn't decode TagSet" << std::endl;
+ }
+ }
+
+ // TODO: bucket CORS
+ // TODO: bucket LC
+ formatter->close_section();
+
+ return 0;
+}
+
+int RGWBucketAdminOp::limit_check(rgw::sal::Driver* driver,
+ RGWBucketAdminOpState& op_state,
+ const std::list<std::string>& user_ids,
+ RGWFormatterFlusher& flusher, optional_yield y,
+ const DoutPrefixProvider *dpp,
+ bool warnings_only)
+{
+ int ret = 0;
+ const size_t max_entries =
+ driver->ctx()->_conf->rgw_list_buckets_max_chunk;
+
+ const size_t safe_max_objs_per_shard =
+ driver->ctx()->_conf->rgw_safe_max_objects_per_shard;
+
+ uint16_t shard_warn_pct =
+ driver->ctx()->_conf->rgw_shard_warning_threshold;
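+ // a misconfigured threshold above 100% falls back to a 90% warning level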
+ if (shard_warn_pct > 100)
+ shard_warn_pct = 90;
+
+ Formatter *formatter = flusher.get_formatter();
+ flusher.start(0);
+
+ formatter->open_array_section("users");
+
+ for (const auto& user_id : user_ids) {
+
+ formatter->open_object_section("user");
+ formatter->dump_string("user_id", user_id);
+ formatter->open_array_section("buckets");
+
+ string marker;
+ rgw::sal::BucketList buckets;
+ do {
+ std::unique_ptr<rgw::sal::User> user = driver->get_user(rgw_user(user_id));
+
+ ret = user->list_buckets(dpp, marker, string(), max_entries, false, buckets, y);
+
+ if (ret < 0)
+ return ret;
+
+ map<string, std::unique_ptr<rgw::sal::Bucket>>& m_buckets = buckets.get_buckets();
+
+ for (const auto& iter : m_buckets) {
+ auto& bucket = iter.second;
+ uint64_t num_objects = 0;
+
+ marker = bucket->get_name(); /* Casey's location for marker update,
+ * as we may now not reach the end of
+ * the loop body */
+
+ ret = bucket->load_bucket(dpp, y);
+ if (ret < 0)
+ continue;
+
+ const auto& index = bucket->get_info().get_current_index();
+ if (is_layout_indexless(index)) {
+ continue; // indexless buckets don't have stats
+ }
+
+ /* need stats for num_entries */
+ string bucket_ver, master_ver;
+ std::map<RGWObjCategory, RGWStorageStats> stats;
+ ret = bucket->read_stats(dpp, index, RGW_NO_SHARD, &bucket_ver, &master_ver, stats, nullptr);
+
+ if (ret < 0)
+ continue;
+
+ for (const auto& s : stats) {
+ num_objects += s.second.num_objects;
+ }
+
+ const uint32_t num_shards = rgw::num_shards(index.layout.normal);
+ uint64_t objs_per_shard =
+ (num_shards) ? num_objects/num_shards : num_objects;
+ {
+ bool warn;
+ stringstream ss;
+ uint64_t fill_pct = objs_per_shard * 100 / safe_max_objs_per_shard;
+ if (fill_pct > 100) {
+ ss << "OVER " << fill_pct << "%";
+ warn = true;
+ } else if (fill_pct >= shard_warn_pct) {
+ ss << "WARN " << fill_pct << "%";
+ warn = true;
+ } else {
+ ss << "OK";
+ warn = false;
+ }
+
+ if (warn || !warnings_only) {
+ formatter->open_object_section("bucket");
+ formatter->dump_string("bucket", bucket->get_name());
+ formatter->dump_string("tenant", bucket->get_tenant());
+ formatter->dump_int("num_objects", num_objects);
+ formatter->dump_int("num_shards", num_shards);
+ formatter->dump_int("objects_per_shard", objs_per_shard);
+ formatter->dump_string("fill_status", ss.str());
+ formatter->close_section();
+ }
+ }
+ }
+ formatter->flush(cout);
+ } while (buckets.is_truncated()); /* foreach: bucket */
+
+ formatter->close_section();
+ formatter->close_section();
+ formatter->flush(cout);
+
+ } /* foreach: user_id */
+
+ formatter->close_section();
+ formatter->flush(cout);
+
+ return ret;
+} /* RGWBucketAdminOp::limit_check */
+
+int RGWBucketAdminOp::info(rgw::sal::Driver* driver,
+ RGWBucketAdminOpState& op_state,
+ RGWFormatterFlusher& flusher,
+ optional_yield y,
+ const DoutPrefixProvider *dpp)
+{
+ RGWBucket bucket;
+ int ret = 0;
+ const std::string& bucket_name = op_state.get_bucket_name();
+ if (!bucket_name.empty()) {
+ ret = bucket.init(driver, op_state, y, dpp);
+ if (-ENOENT == ret)
+ return -ERR_NO_SUCH_BUCKET;
+ else if (ret < 0)
+ return ret;
+ }
+
+ Formatter *formatter = flusher.get_formatter();
+ flusher.start(0);
+
+ CephContext *cct = driver->ctx();
+
+ const size_t max_entries = cct->_conf->rgw_list_buckets_max_chunk;
+
+ const bool show_stats = op_state.will_fetch_stats();
+ const rgw_user& user_id = op_state.get_user_id();
+ if (op_state.is_user_op()) {
+ formatter->open_array_section("buckets");
+
+ rgw::sal::BucketList buckets;
+ std::unique_ptr<rgw::sal::User> user = driver->get_user(op_state.get_user_id());
+ std::string marker;
+ const std::string empty_end_marker;
+ constexpr bool no_need_stats = false; // set need_stats to false
+
+ do {
+ ret = user->list_buckets(dpp, marker, empty_end_marker, max_entries,
+ no_need_stats, buckets, y);
+ if (ret < 0) {
+ return ret;
+ }
+
+ const std::string* marker_cursor = nullptr;
+ map<string, std::unique_ptr<rgw::sal::Bucket>>& m = buckets.get_buckets();
+
+ for (const auto& i : m) {
+ const std::string& obj_name = i.first;
+ if (!bucket_name.empty() && bucket_name != obj_name) {
+ continue;
+ }
+
+ if (show_stats) {
+ bucket_stats(driver, user_id.tenant, obj_name, formatter, dpp);
+ } else {
+ formatter->dump_string("bucket", obj_name);
+ }
+
+ marker_cursor = &obj_name;
+ } // for loop
+ if (marker_cursor) {
+ marker = *marker_cursor;
+ }
+
+ flusher.flush();
+ } while (buckets.is_truncated());
+
+ formatter->close_section();
+ } else if (!bucket_name.empty()) {
+ ret = bucket_stats(driver, user_id.tenant, bucket_name, formatter, dpp);
+ if (ret < 0) {
+ return ret;
+ }
+ } else {
+ void *handle = nullptr;
+ bool truncated = true;
+
+ formatter->open_array_section("buckets");
+ ret = driver->meta_list_keys_init(dpp, "bucket", string(), &handle);
+ while (ret == 0 && truncated) {
+ std::list<std::string> buckets;
+ constexpr int max_keys = 1000;
+ ret = driver->meta_list_keys_next(dpp, handle, max_keys, buckets,
+ &truncated);
+ for (auto& bucket_name : buckets) {
+ if (show_stats) {
+ bucket_stats(driver, user_id.tenant, bucket_name, formatter, dpp);
+ } else {
+ formatter->dump_string("bucket", bucket_name);
+ }
+ }
+ }
+ driver->meta_list_keys_complete(handle);
+
+ formatter->close_section();
+ }
+
+ flusher.flush();
+
+ return 0;
+}
+
+int RGWBucketAdminOp::set_quota(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state, const DoutPrefixProvider *dpp)
+{
+ RGWBucket bucket;
+
+ int ret = bucket.init(driver, op_state, null_yield, dpp);
+ if (ret < 0)
+ return ret;
+ return bucket.set_quota(op_state, dpp);
+}
+
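+ // split a "tenant/bucket" key into its (tenant, bucket) parts; when no '/'
+ // is present the tenant half is returned empty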
+inline auto split_tenant(const std::string& bucket_name){
+ auto p = bucket_name.find('/');
+ if(p != std::string::npos) {
+ return std::make_pair(bucket_name.substr(0,p), bucket_name.substr(p+1));
+ }
+ return std::make_pair(std::string(), bucket_name);
+}
+
+using bucket_instance_ls = std::vector<RGWBucketInfo>;
+void get_stale_instances(rgw::sal::Driver* driver, const std::string& bucket_name,
+ const vector<std::string>& lst,
+ bucket_instance_ls& stale_instances,
+ const DoutPrefixProvider *dpp)
+{
+
+ bucket_instance_ls other_instances;
+ // first iterate over the entries and pick up the buckets whose reshard
+ // status is DONE; these are guaranteed to be stale
+ for (const auto& bucket_instance : lst){
+ RGWBucketInfo binfo;
+ std::unique_ptr<rgw::sal::Bucket> bucket;
+ rgw_bucket rbucket;
+ rgw_bucket_parse_bucket_key(driver->ctx(), bucket_instance, &rbucket, nullptr);
+ int r = driver->get_bucket(dpp, nullptr, rbucket, &bucket, null_yield);
+ if (r < 0){
+ // this can only happen if the instance is deleted while we're processing it
+ ldpp_dout(dpp, -1) << "Bucket instance is invalid: " << bucket_instance
+ << ": " << cpp_strerror(-r) << dendl;
+ continue;
+ }
+ binfo = bucket->get_info();
+ if (binfo.reshard_status == cls_rgw_reshard_status::DONE)
+ stale_instances.emplace_back(std::move(binfo));
+ else {
+ other_instances.emplace_back(std::move(binfo));
+ }
+ }
+
+ // Read the cur bucket info, if the bucket doesn't exist we can simply return
+ // all the instances
+ auto [tenant, bname] = split_tenant(bucket_name);
+ RGWBucketInfo cur_bucket_info;
+ std::unique_ptr<rgw::sal::Bucket> cur_bucket;
+ int r = driver->get_bucket(dpp, nullptr, tenant, bname, &cur_bucket, null_yield);
+ if (r < 0) {
+ if (r == -ENOENT) {
+ // bucket doesn't exist, everything is stale then
+ stale_instances.insert(std::end(stale_instances),
+ std::make_move_iterator(other_instances.begin()),
+ std::make_move_iterator(other_instances.end()));
+ } else {
+ // all bets are off if we can't read the bucket; just return the instances
+ // we already know are stale
+ ldpp_dout(dpp, -1) << "error: reading bucket info for bucket: "
+ << bname << ": " << cpp_strerror(-r) << dendl;
+ }
+ return;
+ }
+
+ // Don't process further in this round if bucket is resharding
+ cur_bucket_info = cur_bucket->get_info();
+ if (cur_bucket_info.reshard_status == cls_rgw_reshard_status::IN_PROGRESS)
+ return;
+
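+ // drop instances that refer to the current bucket instance or to the
+ // in-flight reshard target; those are not stale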
+ other_instances.erase(std::remove_if(other_instances.begin(), other_instances.end(),
+ [&cur_bucket_info](const RGWBucketInfo& b){
+ return (b.bucket.bucket_id == cur_bucket_info.bucket.bucket_id ||
+ b.bucket.bucket_id == cur_bucket_info.new_bucket_instance_id);
+ }),
+ other_instances.end());
+
+ // check if there are still instances left
+ if (other_instances.empty()) {
+ return;
+ }
+
+ // Now we have a bucket with instances whose reshard status is NONE; this
+ // usually happens when the reshard process couldn't complete. Lock down the
+ // bucket and walk through these instances to make sure no one else
+ // interferes with them
+ {
+ RGWBucketReshardLock reshard_lock(static_cast<rgw::sal::RadosStore*>(driver), cur_bucket->get_info(), true);
+ r = reshard_lock.lock(dpp);
+ if (r < 0) {
+ // most likely the bucket is under reshard; return the instances we already
+ // know are stale
+ ldpp_dout(dpp, 5) << __func__
+ << ": failed to take reshard lock; reshard likely underway" << dendl;
+ return;
+ }
+ auto sg = make_scope_guard([&reshard_lock](){ reshard_lock.unlock();} );
+ // this should be fast enough that we may not need to renew the lock or
+ // check its exit status; should we re-read the instance values here?
+ stale_instances.insert(std::end(stale_instances),
+ std::make_move_iterator(other_instances.begin()),
+ std::make_move_iterator(other_instances.end()));
+ }
+
+ return;
+}
+
+static int process_stale_instances(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state,
+ RGWFormatterFlusher& flusher,
+ const DoutPrefixProvider *dpp,
+ std::function<void(const bucket_instance_ls&,
+ Formatter *,
+ rgw::sal::Driver*)> process_f)
+{
+ std::string marker;
+ void *handle;
+ Formatter *formatter = flusher.get_formatter();
+ static constexpr auto default_max_keys = 1000;
+
+ int ret = driver->meta_list_keys_init(dpp, "bucket.instance", marker, &handle);
+ if (ret < 0) {
+ cerr << "ERROR: can't get key: " << cpp_strerror(-ret) << std::endl;
+ return ret;
+ }
+
+ bool truncated;
+
+ formatter->open_array_section("keys");
+ auto g = make_scope_guard([&driver, &handle, &formatter]() {
+ driver->meta_list_keys_complete(handle);
+ formatter->close_section(); // keys
+ formatter->flush(cout);
+ });
+
+ do {
+ list<std::string> keys;
+
+ ret = driver->meta_list_keys_next(dpp, handle, default_max_keys, keys, &truncated);
+ if (ret < 0 && ret != -ENOENT) {
+ cerr << "ERROR: lists_keys_next(): " << cpp_strerror(-ret) << std::endl;
+ return ret;
+ }
+ if (ret != -ENOENT) {
+ // partition the listed keys by bucket, since the listing is unsorted;
+ // this minimizes the number of reads of bucket_info
+ std::unordered_map<std::string, std::vector<std::string>> bucket_instance_map;
+ for (auto &key: keys) {
+ auto pos = key.find(':');
+ if(pos != std::string::npos)
+ bucket_instance_map[key.substr(0,pos)].emplace_back(std::move(key));
+ }
+ for (const auto& kv: bucket_instance_map) {
+ bucket_instance_ls stale_lst;
+ get_stale_instances(driver, kv.first, kv.second, stale_lst, dpp);
+ process_f(stale_lst, formatter, driver);
+ }
+ }
+ } while (truncated);
+
+ return 0;
+}
+
+int RGWBucketAdminOp::list_stale_instances(rgw::sal::Driver* driver,
+ RGWBucketAdminOpState& op_state,
+ RGWFormatterFlusher& flusher,
+ const DoutPrefixProvider *dpp)
+{
+ auto process_f = [](const bucket_instance_ls& lst,
+ Formatter *formatter,
+ rgw::sal::Driver*){
+ for (const auto& binfo: lst)
+ formatter->dump_string("key", binfo.bucket.get_key());
+ };
+ return process_stale_instances(driver, op_state, flusher, dpp, process_f);
+}
+
+
+int RGWBucketAdminOp::clear_stale_instances(rgw::sal::Driver* driver,
+ RGWBucketAdminOpState& op_state,
+ RGWFormatterFlusher& flusher,
+ const DoutPrefixProvider *dpp)
+{
+ auto process_f = [dpp](const bucket_instance_ls& lst,
+ Formatter *formatter,
+ rgw::sal::Driver* driver){
+ for (const auto &binfo: lst) {
+ std::unique_ptr<rgw::sal::Bucket> bucket;
+ driver->get_bucket(nullptr, binfo, &bucket);
+ int ret = bucket->purge_instance(dpp);
+ if (ret == 0){
+ auto md_key = "bucket.instance:" + binfo.bucket.get_key();
+ ret = driver->meta_remove(dpp, md_key, null_yield);
+ }
+ formatter->open_object_section("delete_status");
+ formatter->dump_string("bucket_instance", binfo.bucket.get_key());
+ formatter->dump_int("status", -ret);
+ formatter->close_section();
+ }
+ };
+
+ return process_stale_instances(driver, op_state, flusher, dpp, process_f);
+}
+
+static int fix_single_bucket_lc(rgw::sal::Driver* driver,
+ const std::string& tenant_name,
+ const std::string& bucket_name,
+ const DoutPrefixProvider *dpp)
+{
+ std::unique_ptr<rgw::sal::Bucket> bucket;
+ int ret = driver->get_bucket(dpp, nullptr, tenant_name, bucket_name, &bucket, null_yield);
+ if (ret < 0) {
+ // TODO: Should we handle the case where the bucket could've been removed between
+ // listing and fetching?
+ return ret;
+ }
+
+ return rgw::lc::fix_lc_shard_entry(dpp, driver, driver->get_rgwlc()->get_lc(), bucket.get());
+}
+
+static void format_lc_status(Formatter* formatter,
+ const std::string& tenant_name,
+ const std::string& bucket_name,
+ int status)
+{
+ formatter->open_object_section("bucket_entry");
+ std::string entry = tenant_name.empty() ? bucket_name : tenant_name + "/" + bucket_name;
+ formatter->dump_string("bucket", entry);
+ formatter->dump_int("status", status);
+ formatter->close_section(); // bucket_entry
+}
+
+static void process_single_lc_entry(rgw::sal::Driver* driver,
+ Formatter *formatter,
+ const std::string& tenant_name,
+ const std::string& bucket_name,
+ const DoutPrefixProvider *dpp)
+{
+ int ret = fix_single_bucket_lc(driver, tenant_name, bucket_name, dpp);
+ format_lc_status(formatter, tenant_name, bucket_name, -ret);
+}
+
+int RGWBucketAdminOp::fix_lc_shards(rgw::sal::Driver* driver,
+ RGWBucketAdminOpState& op_state,
+ RGWFormatterFlusher& flusher,
+ const DoutPrefixProvider *dpp)
+{
+ std::string marker;
+ void *handle;
+ Formatter *formatter = flusher.get_formatter();
+ static constexpr auto default_max_keys = 1000;
+
+ bool truncated;
+ if (const std::string& bucket_name = op_state.get_bucket_name();
+ ! bucket_name.empty()) {
+ const rgw_user user_id = op_state.get_user_id();
+ process_single_lc_entry(driver, formatter, user_id.tenant, bucket_name, dpp);
+ formatter->flush(cout);
+ } else {
+ int ret = driver->meta_list_keys_init(dpp, "bucket", marker, &handle);
+ if (ret < 0) {
+ std::cerr << "ERROR: can't get key: " << cpp_strerror(-ret) << std::endl;
+ return ret;
+ }
+
+ {
+ formatter->open_array_section("lc_fix_status");
+ auto sg = make_scope_guard([&driver, &handle, &formatter](){
+ driver->meta_list_keys_complete(handle);
+ formatter->close_section(); // lc_fix_status
+ formatter->flush(cout);
+ });
+ do {
+ list<std::string> keys;
+ ret = driver->meta_list_keys_next(dpp, handle, default_max_keys, keys, &truncated);
+ if (ret < 0 && ret != -ENOENT) {
+ std::cerr << "ERROR: lists_keys_next(): " << cpp_strerror(-ret) << std::endl;
+ return ret;
+ }
+ if (ret != -ENOENT) {
+ for (const auto &key:keys) {
+ auto [tenant_name, bucket_name] = split_tenant(key);
+ process_single_lc_entry(driver, formatter, tenant_name, bucket_name, dpp);
+ }
+ }
+ formatter->flush(cout); // regularly flush every 1k entries
+ } while (truncated);
+ }
+
+ }
+ return 0;
+}
+
+static bool has_object_expired(const DoutPrefixProvider *dpp,
+ rgw::sal::Driver* driver,
+ rgw::sal::Bucket* bucket,
+ const rgw_obj_key& key, utime_t& delete_at)
+{
+ std::unique_ptr<rgw::sal::Object> obj = bucket->get_object(key);
+ bufferlist delete_at_bl;
+
+ int ret = rgw_object_get_attr(dpp, driver, obj.get(), RGW_ATTR_DELETE_AT, delete_at_bl, null_yield);
+ if (ret < 0) {
+ return false; // no delete at attr, proceed
+ }
+
+ ret = decode_bl(delete_at_bl, delete_at);
+ if (ret < 0) {
+ return false; // failed to parse
+ }
+
+ if (delete_at <= ceph_clock_now() && !delete_at.is_zero()) {
+ return true;
+ }
+
+ return false;
+}
+
+static int fix_bucket_obj_expiry(const DoutPrefixProvider *dpp,
+ rgw::sal::Driver* driver,
+ rgw::sal::Bucket* bucket,
+ RGWFormatterFlusher& flusher, bool dry_run)
+{
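+ // the marker keeps the original instance id while bucket_id changes on
+ // reshard, so equality means the bucket was never resharded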
+ if (bucket->get_key().bucket_id == bucket->get_key().marker) {
+ ldpp_dout(dpp, -1) << "Not a resharded bucket skipping" << dendl;
+ return 0; // not a resharded bucket, move along
+ }
+
+ Formatter *formatter = flusher.get_formatter();
+ formatter->open_array_section("expired_deletion_status");
+ auto sg = make_scope_guard([&formatter] {
+ formatter->close_section();
+ formatter->flush(std::cout);
+ });
+
+ rgw::sal::Bucket::ListParams params;
+ rgw::sal::Bucket::ListResults results;
+
+ params.list_versions = bucket->versioned();
+ params.allow_unordered = true;
+
+ do {
+ int ret = bucket->list(dpp, params, listing_max_entries, results, null_yield);
+ if (ret < 0) {
+ ldpp_dout(dpp, -1) << "ERROR failed to list objects in the bucket" << dendl;
+ return ret;
+ }
+ for (const auto& obj : results.objs) {
+ rgw_obj_key key(obj.key);
+ utime_t delete_at;
+ if (has_object_expired(dpp, driver, bucket, key, delete_at)) {
+ formatter->open_object_section("object_status");
+ formatter->dump_string("object", key.name);
+ formatter->dump_stream("delete_at") << delete_at;
+
+ if (!dry_run) {
+ ret = rgw_remove_object(dpp, driver, bucket, key);
+ formatter->dump_int("status", ret);
+ }
+
+ formatter->close_section(); // object_status
+ }
+ }
+ formatter->flush(cout); // regularly flush every 1k entries
+ } while (results.is_truncated);
+
+ return 0;
+}
+
+int RGWBucketAdminOp::fix_obj_expiry(rgw::sal::Driver* driver,
+ RGWBucketAdminOpState& op_state,
+ RGWFormatterFlusher& flusher,
+ const DoutPrefixProvider *dpp, bool dry_run)
+{
+ RGWBucket admin_bucket;
+ int ret = admin_bucket.init(driver, op_state, null_yield, dpp);
+ if (ret < 0) {
+ ldpp_dout(dpp, -1) << "failed to initialize bucket" << dendl;
+ return ret;
+ }
+ std::unique_ptr<rgw::sal::Bucket> bucket;
+ ret = driver->get_bucket(nullptr, admin_bucket.get_bucket_info(), &bucket);
+ if (ret < 0) {
+ return ret;
+ }
+
+ return fix_bucket_obj_expiry(dpp, driver, bucket.get(), flusher, dry_run);
+}
+
+void RGWBucketCompleteInfo::dump(Formatter *f) const {
+ encode_json("bucket_info", info, f);
+ encode_json("attrs", attrs, f);
+}
+
+void RGWBucketCompleteInfo::decode_json(JSONObj *obj) {
+ JSONDecoder::decode_json("bucket_info", info, obj);
+ JSONDecoder::decode_json("attrs", attrs, obj);
+}
+
+class RGWBucketMetadataHandler : public RGWBucketMetadataHandlerBase {
+public:
+ struct Svc {
+ RGWSI_Bucket *bucket{nullptr};
+ } svc;
+
+ struct Ctl {
+ RGWBucketCtl *bucket{nullptr};
+ } ctl;
+
+ RGWBucketMetadataHandler() {}
+
+ void init(RGWSI_Bucket *bucket_svc,
+ RGWBucketCtl *bucket_ctl) override {
+ base_init(bucket_svc->ctx(),
+ bucket_svc->get_ep_be_handler().get());
+ svc.bucket = bucket_svc;
+ ctl.bucket = bucket_ctl;
+ }
+
+ string get_type() override { return "bucket"; }
+
+ RGWMetadataObject *get_meta_obj(JSONObj *jo, const obj_version& objv, const ceph::real_time& mtime) override {
+ RGWBucketEntryPoint be;
+
+ try {
+ decode_json_obj(be, jo);
+ } catch (JSONDecoder::err& e) {
+ return nullptr;
+ }
+
+ return new RGWBucketEntryMetadataObject(be, objv, mtime);
+ }
+
+ int do_get(RGWSI_MetaBackend_Handler::Op *op, string& entry, RGWMetadataObject **obj, optional_yield y, const DoutPrefixProvider *dpp) override {
+ RGWObjVersionTracker ot;
+ RGWBucketEntryPoint be;
+
+ real_time mtime;
+ map<string, bufferlist> attrs;
+
+ RGWSI_Bucket_EP_Ctx ctx(op->ctx());
+
+ int ret = svc.bucket->read_bucket_entrypoint_info(ctx, entry, &be, &ot, &mtime, &attrs, y, dpp);
+ if (ret < 0)
+ return ret;
+
+ RGWBucketEntryMetadataObject *mdo = new RGWBucketEntryMetadataObject(be, ot.read_version, mtime, std::move(attrs));
+
+ *obj = mdo;
+
+ return 0;
+ }
+
+ int do_put(RGWSI_MetaBackend_Handler::Op *op, string& entry,
+ RGWMetadataObject *obj,
+ RGWObjVersionTracker& objv_tracker,
+ optional_yield y,
+ const DoutPrefixProvider *dpp,
+ RGWMDLogSyncType type, bool from_remote_zone) override;
+
+ int do_remove(RGWSI_MetaBackend_Handler::Op *op, string& entry, RGWObjVersionTracker& objv_tracker,
+ optional_yield y, const DoutPrefixProvider *dpp) override {
+ RGWBucketEntryPoint be;
+
+ real_time orig_mtime;
+
+ RGWSI_Bucket_EP_Ctx ctx(op->ctx());
+
+ int ret = svc.bucket->read_bucket_entrypoint_info(ctx, entry, &be, &objv_tracker, &orig_mtime, nullptr, y, dpp);
+ if (ret < 0)
+ return ret;
+
+ /*
+ * We're unlinking the bucket but we don't want to update the entrypoint here - we're removing
+ * it immediately and don't want to invalidate our cached objv_version or the bucket obj removal
+ * will incorrectly fail.
+ */
+ ret = ctl.bucket->unlink_bucket(be.owner, be.bucket, y, dpp, false);
+ if (ret < 0) {
+ ldpp_dout(dpp, -1) << "could not unlink bucket=" << entry << " owner=" << be.owner << dendl;
+ }
+
+ ret = svc.bucket->remove_bucket_entrypoint_info(ctx, entry, &objv_tracker, y, dpp);
+ if (ret < 0) {
+ ldpp_dout(dpp, -1) << "could not delete bucket=" << entry << dendl;
+ }
+ /* idempotent */
+ return 0;
+ }
+
+ int call(std::function<int(RGWSI_Bucket_EP_Ctx& ctx)> f) {
+ return call(nullopt, f);
+ }
+
+ int call(std::optional<RGWSI_MetaBackend_CtxParams> bectx_params,
+ std::function<int(RGWSI_Bucket_EP_Ctx& ctx)> f) {
+ return be_handler->call(bectx_params, [&](RGWSI_MetaBackend_Handler::Op *op) {
+ RGWSI_Bucket_EP_Ctx ctx(op->ctx());
+ return f(ctx);
+ });
+ }
+};
+
+class RGWMetadataHandlerPut_Bucket : public RGWMetadataHandlerPut_SObj
+{
+ RGWBucketMetadataHandler *bhandler;
+ RGWBucketEntryMetadataObject *obj;
+public:
+ RGWMetadataHandlerPut_Bucket(RGWBucketMetadataHandler *_handler,
+ RGWSI_MetaBackend_Handler::Op *op, string& entry,
+ RGWMetadataObject *_obj, RGWObjVersionTracker& objv_tracker,
+ optional_yield y,
+ RGWMDLogSyncType type, bool from_remote_zone) : RGWMetadataHandlerPut_SObj(_handler, op, entry, obj, objv_tracker, y, type, from_remote_zone),
+ bhandler(_handler) {
+ obj = static_cast<RGWBucketEntryMetadataObject *>(_obj);
+ }
+ ~RGWMetadataHandlerPut_Bucket() {}
+
+ void encode_obj(bufferlist *bl) override {
+ obj->get_ep().encode(*bl);
+ }
+
+ int put_checked(const DoutPrefixProvider *dpp) override;
+ int put_post(const DoutPrefixProvider *dpp) override;
+};
+
+int RGWBucketMetadataHandler::do_put(RGWSI_MetaBackend_Handler::Op *op, string& entry,
+ RGWMetadataObject *obj,
+ RGWObjVersionTracker& objv_tracker,
+ optional_yield y,
+ const DoutPrefixProvider *dpp,
+ RGWMDLogSyncType type, bool from_remote_zone)
+{
+ RGWMetadataHandlerPut_Bucket put_op(this, op, entry, obj, objv_tracker, y, type, from_remote_zone);
+ return do_put_operate(&put_op, dpp);
+}
+
+int RGWMetadataHandlerPut_Bucket::put_checked(const DoutPrefixProvider *dpp)
+{
+ RGWBucketEntryMetadataObject *orig_obj = static_cast<RGWBucketEntryMetadataObject *>(old_obj);
+
+ if (orig_obj) {
+ obj->set_pattrs(&orig_obj->get_attrs());
+ }
+
+ auto& be = obj->get_ep();
+ auto mtime = obj->get_mtime();
+ auto pattrs = obj->get_pattrs();
+
+ RGWSI_Bucket_EP_Ctx ctx(op->ctx());
+
+ return bhandler->svc.bucket->store_bucket_entrypoint_info(ctx, entry,
+ be,
+ false,
+ mtime,
+ pattrs,
+ &objv_tracker,
+ y,
+ dpp);
+}
+
+int RGWMetadataHandlerPut_Bucket::put_post(const DoutPrefixProvider *dpp)
+{
+ auto& be = obj->get_ep();
+
+ int ret;
+
+ /* link bucket */
+ if (be.linked) {
+ ret = bhandler->ctl.bucket->link_bucket(be.owner, be.bucket, be.creation_time, y, dpp, false);
+ } else {
+ ret = bhandler->ctl.bucket->unlink_bucket(be.owner, be.bucket, y, dpp, false);
+ }
+
+ return ret;
+}
+
+static void get_md5_digest(const RGWBucketEntryPoint *be, string& md5_digest) {
+
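+ // serialize the entrypoint as JSON and hash it; the digest is later used to
+ // build a unique name for the renamed ("-deleted-") bucket on the archive zone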
+ char md5[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 1];
+ unsigned char m[CEPH_CRYPTO_MD5_DIGESTSIZE];
+ bufferlist bl;
+
+ Formatter *f = new JSONFormatter(false);
+ be->dump(f);
+ f->flush(bl);
+
+ MD5 hash;
+ // Allow use of MD5 digest in FIPS mode for non-cryptographic purposes
+ hash.SetFlags(EVP_MD_CTX_FLAG_NON_FIPS_ALLOW);
+ hash.Update((const unsigned char *)bl.c_str(), bl.length());
+ hash.Final(m);
+
+ buf_to_hex(m, CEPH_CRYPTO_MD5_DIGESTSIZE, md5);
+
+ delete f;
+
+ md5_digest = md5;
+}
+
+#define ARCHIVE_META_ATTR RGW_ATTR_PREFIX "zone.archive.info"
+
+struct archive_meta_info {
+ rgw_bucket orig_bucket;
+
+ bool from_attrs(CephContext *cct, map<string, bufferlist>& attrs) {
+ auto iter = attrs.find(ARCHIVE_META_ATTR);
+ if (iter == attrs.end()) {
+ return false;
+ }
+
+ auto bliter = iter->second.cbegin();
+ try {
+ decode(bliter);
+ } catch (buffer::error& err) {
+ ldout(cct, 0) << "ERROR: failed to decode archive meta info" << dendl;
+ return false;
+ }
+
+ return true;
+ }
+
+ void store_in_attrs(map<string, bufferlist>& attrs) const {
+ encode(attrs[ARCHIVE_META_ATTR]);
+ }
+
+ void encode(bufferlist& bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(orig_bucket, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::const_iterator& bl) {
+ DECODE_START(1, bl);
+ decode(orig_bucket, bl);
+ DECODE_FINISH(bl);
+ }
+};
+WRITE_CLASS_ENCODER(archive_meta_info)
+
+class RGWArchiveBucketMetadataHandler : public RGWBucketMetadataHandler {
+public:
+ RGWArchiveBucketMetadataHandler() {}
+
+ int do_remove(RGWSI_MetaBackend_Handler::Op *op, string& entry, RGWObjVersionTracker& objv_tracker,
+ optional_yield y, const DoutPrefixProvider *dpp) override {
+ auto cct = svc.bucket->ctx();
+
+ RGWSI_Bucket_EP_Ctx ctx(op->ctx());
+
+ ldpp_dout(dpp, 5) << "SKIP: bucket removal is not allowed on archive zone: bucket:" << entry << " ... proceeding to rename" << dendl;
+
+ string tenant_name, bucket_name;
+ parse_bucket(entry, &tenant_name, &bucket_name);
+ rgw_bucket entry_bucket;
+ entry_bucket.tenant = tenant_name;
+ entry_bucket.name = bucket_name;
+
+ real_time mtime;
+
+ /* read original entrypoint */
+
+ RGWBucketEntryPoint be;
+ map<string, bufferlist> attrs;
+ int ret = svc.bucket->read_bucket_entrypoint_info(ctx, entry, &be, &objv_tracker, &mtime, &attrs, y, dpp);
+ if (ret < 0) {
+ return ret;
+ }
+
+ string bi_meta_name = RGWSI_Bucket::get_bi_meta_key(be.bucket);
+
+ /* read original bucket instance info */
+
+ map<string, bufferlist> attrs_m;
+ ceph::real_time orig_mtime;
+ RGWBucketInfo old_bi;
+
+ ret = ctl.bucket->read_bucket_instance_info(be.bucket, &old_bi, y, dpp, RGWBucketCtl::BucketInstance::GetParams()
+ .set_mtime(&orig_mtime)
+ .set_attrs(&attrs_m));
+ if (ret < 0) {
+ return ret;
+ }
+
+ archive_meta_info ami;
+
+ if (!ami.from_attrs(svc.bucket->ctx(), attrs_m)) {
+ ami.orig_bucket = old_bi.bucket;
+ ami.store_in_attrs(attrs_m);
+ }
+
+ /* generate a new bucket instance. We could have avoided this if we could just point a new
+ * bucket entrypoint to the old bucket instance; however, due to a limitation in the way
+ * we index buckets under the user, the bucket entrypoint and bucket instance of the same
+ * bucket need to have the same name, so we need to copy the old bucket instance into
+ * a new entry with the new name
+ */
+
+ string new_bucket_name;
+
+ RGWBucketInfo new_bi = old_bi;
+ RGWBucketEntryPoint new_be = be;
+
+ string md5_digest;
+
+ get_md5_digest(&new_be, md5_digest);
+ new_bucket_name = ami.orig_bucket.name + "-deleted-" + md5_digest;
+
+ new_bi.bucket.name = new_bucket_name;
+ new_bi.objv_tracker.clear();
+
+ new_be.bucket.name = new_bucket_name;
+
+ ret = ctl.bucket->store_bucket_instance_info(new_be.bucket, new_bi, y, dpp, RGWBucketCtl::BucketInstance::PutParams()
+ .set_exclusive(false)
+ .set_mtime(orig_mtime)
+ .set_attrs(&attrs_m)
+ .set_orig_info(&old_bi));
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to put new bucket instance info for bucket=" << new_bi.bucket << " ret=" << ret << dendl;
+ return ret;
+ }
+
+ /* store a new entrypoint */
+
+ RGWObjVersionTracker ot;
+ ot.generate_new_write_ver(cct);
+
+ ret = svc.bucket->store_bucket_entrypoint_info(ctx, RGWSI_Bucket::get_entrypoint_meta_key(new_be.bucket),
+ new_be, true, mtime, &attrs, nullptr, y, dpp);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to put new bucket entrypoint for bucket=" << new_be.bucket << " ret=" << ret << dendl;
+ return ret;
+ }
+
+ /* link new bucket */
+
+ ret = ctl.bucket->link_bucket(new_be.owner, new_be.bucket, new_be.creation_time, y, dpp, false);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to link new bucket for bucket=" << new_be.bucket << " ret=" << ret << dendl;
+ return ret;
+ }
+
+ /* clean up old stuff */
+
+ ret = ctl.bucket->unlink_bucket(be.owner, entry_bucket, y, dpp, false);
+ if (ret < 0) {
+ ldpp_dout(dpp, -1) << "could not unlink bucket=" << entry << " owner=" << be.owner << dendl;
+ }
+
+ // if (ret == -ECANCELED) it means that there was a race here, and someone
+ // wrote to the bucket entrypoint just before we removed it. The question is
+ // whether it was a newly created bucket entrypoint ... in which case we
+ // should ignore the error and move forward, or whether it is a higher version
+ // of the same bucket instance ... in which case we should retry
+ ret = svc.bucket->remove_bucket_entrypoint_info(ctx,
+ RGWSI_Bucket::get_entrypoint_meta_key(be.bucket),
+ &objv_tracker,
+ y,
+ dpp);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to put new bucket entrypoint for bucket=" << new_be.bucket << " ret=" << ret << dendl;
+ return ret;
+ }
+
+ ret = ctl.bucket->remove_bucket_instance_info(be.bucket, old_bi, y, dpp);
+ if (ret < 0) {
+ ldpp_dout(dpp, -1) << "could not delete bucket=" << entry << dendl;
+ }
+
+
+ /* idempotent */
+
+ return 0;
+ }
+
+ int do_put(RGWSI_MetaBackend_Handler::Op *op, string& entry,
+ RGWMetadataObject *obj,
+ RGWObjVersionTracker& objv_tracker,
+ optional_yield y, const DoutPrefixProvider *dpp,
+ RGWMDLogSyncType type, bool from_remote_zone) override {
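+ // entries renamed by the archive zone's do_remove() carry a "-deleted-"
+ // suffix; remove any existing local copy first so the incoming put can
+ // recreate it cleanly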
+ if (entry.find("-deleted-") != string::npos) {
+ RGWObjVersionTracker ot;
+ RGWMetadataObject *robj;
+ int ret = do_get(op, entry, &robj, y, dpp);
+ if (ret != -ENOENT) {
+ if (ret < 0) {
+ return ret;
+ }
+ ot.read_version = robj->get_version();
+ delete robj;
+
+ ret = do_remove(op, entry, ot, y, dpp);
+ if (ret < 0) {
+ return ret;
+ }
+ }
+ }
+
+ return RGWBucketMetadataHandler::do_put(op, entry, obj,
+ objv_tracker, y, dpp, type, from_remote_zone);
+ }
+
+};
+
+class RGWBucketInstanceMetadataHandler : public RGWBucketInstanceMetadataHandlerBase {
+ int read_bucket_instance_entry(RGWSI_Bucket_BI_Ctx& ctx,
+ const string& entry,
+ RGWBucketCompleteInfo *bi,
+ ceph::real_time *pmtime,
+ optional_yield y,
+ const DoutPrefixProvider *dpp) {
+ return svc.bucket->read_bucket_instance_info(ctx,
+ entry,
+ &bi->info,
+ pmtime, &bi->attrs,
+ y,
+ dpp);
+ }
+
+public:
+ struct Svc {
+ RGWSI_Zone *zone{nullptr};
+ RGWSI_Bucket *bucket{nullptr};
+ RGWSI_BucketIndex *bi{nullptr};
+ } svc;
+
+ rgw::sal::Driver* driver;
+
+ RGWBucketInstanceMetadataHandler(rgw::sal::Driver* driver)
+ : driver(driver) {}
+
+ void init(RGWSI_Zone *zone_svc,
+ RGWSI_Bucket *bucket_svc,
+ RGWSI_BucketIndex *bi_svc) override {
+ base_init(bucket_svc->ctx(),
+ bucket_svc->get_bi_be_handler().get());
+ svc.zone = zone_svc;
+ svc.bucket = bucket_svc;
+ svc.bi = bi_svc;
+ }
+
+ string get_type() override { return "bucket.instance"; }
+
+ RGWMetadataObject *get_meta_obj(JSONObj *jo, const obj_version& objv, const ceph::real_time& mtime) override {
+ RGWBucketCompleteInfo bci;
+
+ try {
+ decode_json_obj(bci, jo);
+ } catch (JSONDecoder::err& e) {
+ return nullptr;
+ }
+
+ return new RGWBucketInstanceMetadataObject(bci, objv, mtime);
+ }
+
+ int do_get(RGWSI_MetaBackend_Handler::Op *op, string& entry, RGWMetadataObject **obj, optional_yield y, const DoutPrefixProvider *dpp) override {
+ RGWBucketCompleteInfo bci;
+ real_time mtime;
+
+ RGWSI_Bucket_BI_Ctx ctx(op->ctx());
+
+ int ret = svc.bucket->read_bucket_instance_info(ctx, entry, &bci.info, &mtime, &bci.attrs, y, dpp);
+ if (ret < 0)
+ return ret;
+
+ RGWBucketInstanceMetadataObject *mdo = new RGWBucketInstanceMetadataObject(bci, bci.info.objv_tracker.read_version, mtime);
+
+ *obj = mdo;
+
+ return 0;
+ }
+
+ int do_put(RGWSI_MetaBackend_Handler::Op *op, string& entry,
+ RGWMetadataObject *_obj, RGWObjVersionTracker& objv_tracker,
+ optional_yield y, const DoutPrefixProvider *dpp,
+ RGWMDLogSyncType sync_type, bool from_remote_zone) override;
+
+ int do_remove(RGWSI_MetaBackend_Handler::Op *op, string& entry, RGWObjVersionTracker& objv_tracker,
+ optional_yield y, const DoutPrefixProvider *dpp) override {
+ RGWBucketCompleteInfo bci;
+
+ RGWSI_Bucket_BI_Ctx ctx(op->ctx());
+
+ int ret = read_bucket_instance_entry(ctx, entry, &bci, nullptr, y, dpp);
+ if (ret < 0 && ret != -ENOENT)
+ return ret;
+
+ return svc.bucket->remove_bucket_instance_info(ctx, entry, bci.info, &bci.info.objv_tracker, y, dpp);
+ }
+
+ int call(std::function<int(RGWSI_Bucket_BI_Ctx& ctx)> f) {
+ return call(nullopt, f);
+ }
+
+ int call(std::optional<RGWSI_MetaBackend_CtxParams> bectx_params,
+ std::function<int(RGWSI_Bucket_BI_Ctx& ctx)> f) {
+ return be_handler->call(bectx_params, [&](RGWSI_MetaBackend_Handler::Op *op) {
+ RGWSI_Bucket_BI_Ctx ctx(op->ctx());
+ return f(ctx);
+ });
+ }
+};
+
+class RGWMetadataHandlerPut_BucketInstance : public RGWMetadataHandlerPut_SObj
+{
+ CephContext *cct;
+ RGWBucketInstanceMetadataHandler *bihandler;
+ RGWBucketInstanceMetadataObject *obj;
+public:
+ RGWMetadataHandlerPut_BucketInstance(CephContext *_cct,
+ RGWBucketInstanceMetadataHandler *_handler,
+ RGWSI_MetaBackend_Handler::Op *_op, string& entry,
+ RGWMetadataObject *_obj, RGWObjVersionTracker& objv_tracker,
+ optional_yield y,
+ RGWMDLogSyncType type, bool from_remote_zone) : RGWMetadataHandlerPut_SObj(_handler, _op, entry, _obj, objv_tracker, y, type, from_remote_zone),
+ cct(_cct), bihandler(_handler) {
+ obj = static_cast<RGWBucketInstanceMetadataObject *>(_obj);
+
+ auto& bci = obj->get_bci();
+ obj->set_pattrs(&bci.attrs);
+ }
+
+ void encode_obj(bufferlist *bl) override {
+ obj->get_bucket_info().encode(*bl);
+ }
+
+ int put_check(const DoutPrefixProvider *dpp) override;
+ int put_checked(const DoutPrefixProvider *dpp) override;
+ int put_post(const DoutPrefixProvider *dpp) override;
+};
+
+int RGWBucketInstanceMetadataHandler::do_put(RGWSI_MetaBackend_Handler::Op *op,
+ string& entry,
+ RGWMetadataObject *obj,
+ RGWObjVersionTracker& objv_tracker,
+ optional_yield y,
+ const DoutPrefixProvider *dpp,
+ RGWMDLogSyncType type, bool from_remote_zone)
+{
+ RGWMetadataHandlerPut_BucketInstance put_op(svc.bucket->ctx(), this, op, entry, obj,
+ objv_tracker, y, type, from_remote_zone);
+ return do_put_operate(&put_op, dpp);
+}
+
+void init_default_bucket_layout(CephContext *cct, rgw::BucketLayout& layout,
+ const RGWZone& zone,
+ std::optional<uint32_t> shards,
+ std::optional<rgw::BucketIndexType> type) {
+ layout.current_index.gen = 0;
+ layout.current_index.layout.normal.hash_type = rgw::BucketHashType::Mod;
+
+ layout.current_index.layout.type =
+ type.value_or(rgw::BucketIndexType::Normal);
+
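+ // shard count precedence: an explicit request wins, then the
+ // rgw_override_bucket_index_max_shards config option, then the zone default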
+ if (shards) {
+ layout.current_index.layout.normal.num_shards = *shards;
+ } else if (cct->_conf->rgw_override_bucket_index_max_shards > 0) {
+ layout.current_index.layout.normal.num_shards =
+ cct->_conf->rgw_override_bucket_index_max_shards;
+ } else {
+ layout.current_index.layout.normal.num_shards =
+ zone.bucket_index_max_shards;
+ }
+
+ if (layout.current_index.layout.type == rgw::BucketIndexType::Normal) {
+ layout.logs.push_back(log_layout_from_index(0, layout.current_index));
+ }
+}
+
+int RGWMetadataHandlerPut_BucketInstance::put_check(const DoutPrefixProvider *dpp)
+{
+ int ret;
+
+ RGWBucketCompleteInfo& bci = obj->get_bci();
+
+ RGWBucketInstanceMetadataObject *orig_obj = static_cast<RGWBucketInstanceMetadataObject *>(old_obj);
+
+ RGWBucketCompleteInfo *old_bci = (orig_obj ? &orig_obj->get_bci() : nullptr);
+
+ const bool exists = (!!orig_obj);
+
+ if (from_remote_zone) {
+ // don't sync bucket layout changes
+ if (!exists) {
+ // replace peer's layout with default-constructed, then apply our defaults
+ bci.info.layout = rgw::BucketLayout{};
+ init_default_bucket_layout(cct, bci.info.layout,
+ bihandler->svc.zone->get_zone(),
+ std::nullopt, std::nullopt);
+ } else {
+ bci.info.layout = old_bci->info.layout;
+ }
+ }
+
+ if (!exists || old_bci->info.bucket.bucket_id != bci.info.bucket.bucket_id) {
+ /* a new bucket, we need to select a new bucket placement for it */
+ string tenant_name;
+ string bucket_name;
+ string bucket_instance;
+ parse_bucket(entry, &tenant_name, &bucket_name, &bucket_instance);
+
+ RGWZonePlacementInfo rule_info;
+ bci.info.bucket.name = bucket_name;
+ bci.info.bucket.bucket_id = bucket_instance;
+ bci.info.bucket.tenant = tenant_name;
+ // if the sync module never writes data, don't require the zone to specify all placement targets
+ if (bihandler->svc.zone->sync_module_supports_writes()) {
+ ret = bihandler->svc.zone->select_bucket_location_by_rule(dpp, bci.info.placement_rule, &rule_info, y);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: select_bucket_placement() returned " << ret << dendl;
+ return ret;
+ }
+ }
+ bci.info.layout.current_index.layout.type = rule_info.index_type;
+ } else {
+ /* always keep bucket versioning enabled on archive zone */
+ if (bihandler->driver->get_zone()->get_tier_type() == "archive") {
+ bci.info.flags = (bci.info.flags & ~BUCKET_VERSIONS_SUSPENDED) | BUCKET_VERSIONED;
+ }
+ /* existing bucket, keep its placement */
+ bci.info.bucket.explicit_placement = old_bci->info.bucket.explicit_placement;
+ bci.info.placement_rule = old_bci->info.placement_rule;
+ }
+
+ /* record the read version (if any), store the new version */
+ bci.info.objv_tracker.read_version = objv_tracker.read_version;
+ bci.info.objv_tracker.write_version = objv_tracker.write_version;
+
+ return 0;
+}
+
+int RGWMetadataHandlerPut_BucketInstance::put_checked(const DoutPrefixProvider *dpp)
+{
+ RGWBucketInstanceMetadataObject *orig_obj = static_cast<RGWBucketInstanceMetadataObject *>(old_obj);
+
+ RGWBucketInfo *orig_info = (orig_obj ? &orig_obj->get_bucket_info() : nullptr);
+
+ auto& info = obj->get_bucket_info();
+ auto mtime = obj->get_mtime();
+ auto pattrs = obj->get_pattrs();
+
+ RGWSI_Bucket_BI_Ctx ctx(op->ctx());
+
+ return bihandler->svc.bucket->store_bucket_instance_info(ctx,
+ entry,
+ info,
+ orig_info,
+ false,
+ mtime,
+ pattrs,
+ y,
+ dpp);
+}
+
+int RGWMetadataHandlerPut_BucketInstance::put_post(const DoutPrefixProvider *dpp)
+{
+ RGWBucketCompleteInfo& bci = obj->get_bci();
+
+ objv_tracker = bci.info.objv_tracker;
+
+ int ret = bihandler->svc.bi->init_index(dpp, bci.info, bci.info.layout.current_index);
+ if (ret < 0) {
+ return ret;
+ }
+
+ /* update lifecycle policy */
+ {
+ std::unique_ptr<rgw::sal::Bucket> bucket;
+ ret = bihandler->driver->get_bucket(nullptr, bci.info, &bucket);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << __func__ << " failed to get_bucket(...) for "
+ << bci.info.bucket.name
+ << dendl;
+ return ret;
+ }
+
+ auto lc = bihandler->driver->get_rgwlc();
+
+ auto lc_it = bci.attrs.find(RGW_ATTR_LC);
+ if (lc_it != bci.attrs.end()) {
+ ldpp_dout(dpp, 20) << "set lc config for " << bci.info.bucket.name << dendl;
+ ret = lc->set_bucket_config(bucket.get(), bci.attrs, nullptr);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << __func__ << " failed to set lc config for "
+ << bci.info.bucket.name
+ << dendl;
+ return ret;
+ }
+
+ } else {
+ ldpp_dout(dpp, 20) << "remove lc config for " << bci.info.bucket.name << dendl;
+ ret = lc->remove_bucket_config(bucket.get(), bci.attrs, false /* cannot merge attrs */);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << __func__ << " failed to remove lc config for "
+ << bci.info.bucket.name
+ << dendl;
+ return ret;
+ }
+ }
+ } /* update lc */
+
+ return STATUS_APPLIED;
+}
+
+class RGWArchiveBucketInstanceMetadataHandler : public RGWBucketInstanceMetadataHandler {
+public:
+ RGWArchiveBucketInstanceMetadataHandler(rgw::sal::Driver* driver)
+ : RGWBucketInstanceMetadataHandler(driver) {}
+
+ // N.B. replication of the lifecycle policy relies on logic in RGWBucketInstanceMetadataHandler::do_put(...); override with caution
+
+ int do_remove(RGWSI_MetaBackend_Handler::Op *op, string& entry, RGWObjVersionTracker& objv_tracker, optional_yield y, const DoutPrefixProvider *dpp) override {
+ ldpp_dout(dpp, 0) << "SKIP: bucket instance removal is not allowed on archive zone: bucket.instance:" << entry << dendl;
+ return 0;
+ }
+};
+
+RGWBucketCtl::RGWBucketCtl(RGWSI_Zone *zone_svc,
+ RGWSI_Bucket *bucket_svc,
+ RGWSI_Bucket_Sync *bucket_sync_svc,
+ RGWSI_BucketIndex *bi_svc,
+ RGWSI_User* user_svc)
+ : cct(zone_svc->ctx())
+{
+ svc.zone = zone_svc;
+ svc.bucket = bucket_svc;
+ svc.bucket_sync = bucket_sync_svc;
+ svc.bi = bi_svc;
+ svc.user = user_svc;
+}
+
+void RGWBucketCtl::init(RGWUserCtl *user_ctl,
+ RGWBucketMetadataHandler *_bm_handler,
+ RGWBucketInstanceMetadataHandler *_bmi_handler,
+ RGWDataChangesLog *datalog,
+ const DoutPrefixProvider *dpp)
+{
+ ctl.user = user_ctl;
+
+ bm_handler = _bm_handler;
+ bmi_handler = _bmi_handler;
+
+ bucket_be_handler = bm_handler->get_be_handler();
+ bi_be_handler = bmi_handler->get_be_handler();
+
+ datalog->set_bucket_filter(
+ [this](const rgw_bucket& bucket, optional_yield y, const DoutPrefixProvider *dpp) {
+ return bucket_exports_data(bucket, y, dpp);
+ });
+}
+
+int RGWBucketCtl::call(std::function<int(RGWSI_Bucket_X_Ctx& ctx)> f) {
+ return bm_handler->call([&](RGWSI_Bucket_EP_Ctx& ep_ctx) {
+ return bmi_handler->call([&](RGWSI_Bucket_BI_Ctx& bi_ctx) {
+ RGWSI_Bucket_X_Ctx ctx{ep_ctx, bi_ctx};
+ return f(ctx);
+ });
+ });
+}
+
+int RGWBucketCtl::read_bucket_entrypoint_info(const rgw_bucket& bucket,
+ RGWBucketEntryPoint *info,
+ optional_yield y, const DoutPrefixProvider *dpp,
+ const Bucket::GetParams& params)
+{
+ return bm_handler->call(params.bectx_params, [&](RGWSI_Bucket_EP_Ctx& ctx) {
+ return svc.bucket->read_bucket_entrypoint_info(ctx,
+ RGWSI_Bucket::get_entrypoint_meta_key(bucket),
+ info,
+ params.objv_tracker,
+ params.mtime,
+ params.attrs,
+ y,
+ dpp,
+ params.cache_info,
+ params.refresh_version);
+ });
+}
+
+int RGWBucketCtl::store_bucket_entrypoint_info(const rgw_bucket& bucket,
+ RGWBucketEntryPoint& info,
+ optional_yield y,
+ const DoutPrefixProvider *dpp,
+ const Bucket::PutParams& params)
+{
+ return bm_handler->call([&](RGWSI_Bucket_EP_Ctx& ctx) {
+ return svc.bucket->store_bucket_entrypoint_info(ctx,
+ RGWSI_Bucket::get_entrypoint_meta_key(bucket),
+ info,
+ params.exclusive,
+ params.mtime,
+ params.attrs,
+ params.objv_tracker,
+ y,
+ dpp);
+ });
+}
+
+int RGWBucketCtl::remove_bucket_entrypoint_info(const rgw_bucket& bucket,
+ optional_yield y,
+ const DoutPrefixProvider *dpp,
+ const Bucket::RemoveParams& params)
+{
+ return bm_handler->call([&](RGWSI_Bucket_EP_Ctx& ctx) {
+ return svc.bucket->remove_bucket_entrypoint_info(ctx,
+ RGWSI_Bucket::get_entrypoint_meta_key(bucket),
+ params.objv_tracker,
+ y,
+ dpp);
+ });
+}
+
+int RGWBucketCtl::read_bucket_instance_info(const rgw_bucket& bucket,
+ RGWBucketInfo *info,
+ optional_yield y,
+ const DoutPrefixProvider *dpp,
+ const BucketInstance::GetParams& params)
+{
+ int ret = bmi_handler->call(params.bectx_params, [&](RGWSI_Bucket_BI_Ctx& ctx) {
+ return svc.bucket->read_bucket_instance_info(ctx,
+ RGWSI_Bucket::get_bi_meta_key(bucket),
+ info,
+ params.mtime,
+ params.attrs,
+ y,
+ dpp,
+ params.cache_info,
+ params.refresh_version);
+ });
+
+ if (ret < 0) {
+ return ret;
+ }
+
+ if (params.objv_tracker) {
+ *params.objv_tracker = info->objv_tracker;
+ }
+
+ return 0;
+}
+
+int RGWBucketCtl::read_bucket_info(const rgw_bucket& bucket,
+ RGWBucketInfo *info,
+ optional_yield y,
+ const DoutPrefixProvider *dpp,
+ const BucketInstance::GetParams& params,
+ RGWObjVersionTracker *ep_objv_tracker)
+{
+ const rgw_bucket *b = &bucket;
+
+ std::optional<RGWBucketEntryPoint> ep;
+
+ if (b->bucket_id.empty()) {
+ ep.emplace();
+
+ int r = read_bucket_entrypoint_info(*b, &(*ep), y, dpp, RGWBucketCtl::Bucket::GetParams()
+ .set_bectx_params(params.bectx_params)
+ .set_objv_tracker(ep_objv_tracker));
+ if (r < 0) {
+ return r;
+ }
+
+ b = &ep->bucket;
+ }
+
+ int ret = bmi_handler->call(params.bectx_params, [&](RGWSI_Bucket_BI_Ctx& ctx) {
+ return svc.bucket->read_bucket_instance_info(ctx,
+ RGWSI_Bucket::get_bi_meta_key(*b),
+ info,
+ params.mtime,
+ params.attrs,
+ y, dpp,
+ params.cache_info,
+ params.refresh_version);
+ });
+
+ if (ret < 0) {
+ return ret;
+ }
+
+ if (params.objv_tracker) {
+ *params.objv_tracker = info->objv_tracker;
+ }
+
+ return 0;
+}
+
+int RGWBucketCtl::do_store_bucket_instance_info(RGWSI_Bucket_BI_Ctx& ctx,
+ const rgw_bucket& bucket,
+ RGWBucketInfo& info,
+ optional_yield y,
+ const DoutPrefixProvider *dpp,
+ const BucketInstance::PutParams& params)
+{
+ if (params.objv_tracker) {
+ info.objv_tracker = *params.objv_tracker;
+ }
+
+ return svc.bucket->store_bucket_instance_info(ctx,
+ RGWSI_Bucket::get_bi_meta_key(bucket),
+ info,
+ params.orig_info,
+ params.exclusive,
+ params.mtime,
+ params.attrs,
+ y,
+ dpp);
+}
+
+int RGWBucketCtl::store_bucket_instance_info(const rgw_bucket& bucket,
+ RGWBucketInfo& info,
+ optional_yield y,
+ const DoutPrefixProvider *dpp,
+ const BucketInstance::PutParams& params)
+{
+ return bmi_handler->call([&](RGWSI_Bucket_BI_Ctx& ctx) {
+ return do_store_bucket_instance_info(ctx, bucket, info, y, dpp, params);
+ });
+}
+
+int RGWBucketCtl::remove_bucket_instance_info(const rgw_bucket& bucket,
+ RGWBucketInfo& info,
+ optional_yield y,
+ const DoutPrefixProvider *dpp,
+ const BucketInstance::RemoveParams& params)
+{
+ if (params.objv_tracker) {
+ info.objv_tracker = *params.objv_tracker;
+ }
+
+ return bmi_handler->call([&](RGWSI_Bucket_BI_Ctx& ctx) {
+ return svc.bucket->remove_bucket_instance_info(ctx,
+ RGWSI_Bucket::get_bi_meta_key(bucket),
+ info,
+ &info.objv_tracker,
+ y,
+ dpp);
+ });
+}
+
+int RGWBucketCtl::do_store_linked_bucket_info(RGWSI_Bucket_X_Ctx& ctx,
+ RGWBucketInfo& info,
+ RGWBucketInfo *orig_info,
+ bool exclusive, real_time mtime,
+ obj_version *pep_objv,
+ map<string, bufferlist> *pattrs,
+ bool create_entry_point,
+ optional_yield y, const DoutPrefixProvider *dpp)
+{
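+ /* write the entrypoint ("head") object when the bucket info has no separate
+ * instance object (old-format bucket) or when the caller explicitly asks for
+ * an entry point to be created */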
+ bool create_head = !info.has_instance_obj || create_entry_point;
+
+ int ret = svc.bucket->store_bucket_instance_info(ctx.bi,
+ RGWSI_Bucket::get_bi_meta_key(info.bucket),
+ info,
+ orig_info,
+ exclusive,
+ mtime, pattrs,
+ y, dpp);
+ if (ret < 0) {
+ return ret;
+ }
+
+ if (!create_head)
+ return 0; /* done! */
+
+ RGWBucketEntryPoint entry_point;
+ entry_point.bucket = info.bucket;
+ entry_point.owner = info.owner;
+ entry_point.creation_time = info.creation_time;
+ entry_point.linked = true;
+ RGWObjVersionTracker ot;
+ if (pep_objv && !pep_objv->tag.empty()) {
+ ot.write_version = *pep_objv;
+ } else {
+ ot.generate_new_write_ver(cct);
+ if (pep_objv) {
+ *pep_objv = ot.write_version;
+ }
+ }
+ ret = svc.bucket->store_bucket_entrypoint_info(ctx.ep,
+ RGWSI_Bucket::get_entrypoint_meta_key(info.bucket),
+ entry_point,
+ exclusive,
+ mtime,
+ pattrs,
+ &ot,
+ y,
+ dpp);
+ if (ret < 0)
+ return ret;
+
+ return 0;
+}
+int RGWBucketCtl::convert_old_bucket_info(RGWSI_Bucket_X_Ctx& ctx,
+ const rgw_bucket& bucket,
+ optional_yield y,
+ const DoutPrefixProvider *dpp)
+{
+ RGWBucketEntryPoint entry_point;
+ real_time ep_mtime;
+ RGWObjVersionTracker ot;
+ map<string, bufferlist> attrs;
+ RGWBucketInfo info;
+ auto cct = svc.bucket->ctx();
+
+ ldpp_dout(dpp, 10) << "RGWRados::convert_old_bucket_info(): bucket=" << bucket << dendl;
+
+ int ret = svc.bucket->read_bucket_entrypoint_info(ctx.ep,
+ RGWSI_Bucket::get_entrypoint_meta_key(bucket),
+ &entry_point, &ot, &ep_mtime, &attrs, y, dpp);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: get_bucket_entrypoint_info() returned " << ret << " bucket=" << bucket << dendl;
+ return ret;
+ }
+
+ if (!entry_point.has_bucket_info) {
+ /* already converted! */
+ return 0;
+ }
+
+ info = entry_point.old_bucket_info;
+
+ ot.generate_new_write_ver(cct);
+
+ ret = do_store_linked_bucket_info(ctx, info, nullptr, false, ep_mtime, &ot.write_version, &attrs, true, y, dpp);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to put_linked_bucket_info(): " << ret << dendl;
+ return ret;
+ }
+
+ return 0;
+}
+
+int RGWBucketCtl::set_bucket_instance_attrs(RGWBucketInfo& bucket_info,
+ map<string, bufferlist>& attrs,
+ RGWObjVersionTracker *objv_tracker,
+ optional_yield y,
+ const DoutPrefixProvider *dpp)
+{
+ return call([&](RGWSI_Bucket_X_Ctx& ctx) {
+ rgw_bucket& bucket = bucket_info.bucket;
+
+ if (!bucket_info.has_instance_obj) {
+ /* an old bucket object, need to convert it */
+ int ret = convert_old_bucket_info(ctx, bucket, y, dpp);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed converting old bucket info: " << ret << dendl;
+ return ret;
+ }
+ }
+
+ return do_store_bucket_instance_info(ctx.bi,
+ bucket,
+ bucket_info,
+ y,
+ dpp,
+ BucketInstance::PutParams().set_attrs(&attrs)
+ .set_objv_tracker(objv_tracker)
+ .set_orig_info(&bucket_info));
+ });
+}
+
+
+int RGWBucketCtl::link_bucket(const rgw_user& user_id,
+ const rgw_bucket& bucket,
+ ceph::real_time creation_time,
+ optional_yield y,
+ const DoutPrefixProvider *dpp,
+ bool update_entrypoint,
+ rgw_ep_info *pinfo)
+{
+ return bm_handler->call([&](RGWSI_Bucket_EP_Ctx& ctx) {
+ return do_link_bucket(ctx, user_id, bucket, creation_time,
+ update_entrypoint, pinfo, y, dpp);
+ });
+}
+
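+/* link a bucket to a user: add it to the user's bucket directory and,
+ * optionally, rewrite the entrypoint so that it records the new owner and is
+ * marked as linked; on error, do_unlink_bucket() is called to clean up */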
+int RGWBucketCtl::do_link_bucket(RGWSI_Bucket_EP_Ctx& ctx,
+ const rgw_user& user_id,
+ const rgw_bucket& bucket,
+ ceph::real_time creation_time,
+ bool update_entrypoint,
+ rgw_ep_info *pinfo,
+ optional_yield y,
+ const DoutPrefixProvider *dpp)
+{
+ int ret;
+
+ RGWBucketEntryPoint ep;
+ RGWObjVersionTracker ot;
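+ /* when the caller already read the entrypoint (pinfo), reuse its version
+ * tracker so the store below is conditional on that read version */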
+ RGWObjVersionTracker& rot = (pinfo) ? pinfo->ep_objv : ot;
+ map<string, bufferlist> attrs, *pattrs = nullptr;
+ string meta_key;
+
+ if (update_entrypoint) {
+ meta_key = RGWSI_Bucket::get_entrypoint_meta_key(bucket);
+ if (pinfo) {
+ ep = pinfo->ep;
+ pattrs = &pinfo->attrs;
+ } else {
+ ret = svc.bucket->read_bucket_entrypoint_info(ctx,
+ meta_key,
+ &ep, &rot,
+ nullptr, &attrs,
+ y, dpp);
+ if (ret < 0 && ret != -ENOENT) {
+ ldpp_dout(dpp, 0) << "ERROR: read_bucket_entrypoint_info() returned: "
+ << cpp_strerror(-ret) << dendl;
+ }
+ pattrs = &attrs;
+ }
+ }
+
+ ret = svc.user->add_bucket(dpp, user_id, bucket, creation_time, y);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: error adding bucket to user directory:"
+ << " user=" << user_id
+ << " bucket=" << bucket
+ << " err=" << cpp_strerror(-ret)
+ << dendl;
+ goto done_err;
+ }
+
+ if (!update_entrypoint)
+ return 0;
+
+ ep.linked = true;
+ ep.owner = user_id;
+ ep.bucket = bucket;
+ ret = svc.bucket->store_bucket_entrypoint_info(
+ ctx, meta_key, ep, false, real_time(), pattrs, &rot, y, dpp);
+ if (ret < 0)
+ goto done_err;
+
+ return 0;
+
+done_err:
+ int r = do_unlink_bucket(ctx, user_id, bucket, true, y, dpp);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed unlinking bucket on error cleanup: "
+ << cpp_strerror(-r) << dendl;
+ }
+ return ret;
+}
+
+int RGWBucketCtl::unlink_bucket(const rgw_user& user_id, const rgw_bucket& bucket, optional_yield y, const DoutPrefixProvider *dpp, bool update_entrypoint)
+{
+ return bm_handler->call([&](RGWSI_Bucket_EP_Ctx& ctx) {
+ return do_unlink_bucket(ctx, user_id, bucket, update_entrypoint, y, dpp);
+ });
+}
+
+int RGWBucketCtl::do_unlink_bucket(RGWSI_Bucket_EP_Ctx& ctx,
+ const rgw_user& user_id,
+ const rgw_bucket& bucket,
+ bool update_entrypoint,
+ optional_yield y,
+ const DoutPrefixProvider *dpp)
+{
+ int ret = svc.user->remove_bucket(dpp, user_id, bucket, y);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: error removing bucket from directory: "
+ << cpp_strerror(-ret)<< dendl;
+ }
+
+ if (!update_entrypoint)
+ return 0;
+
+ RGWBucketEntryPoint ep;
+ RGWObjVersionTracker ot;
+ map<string, bufferlist> attrs;
+ string meta_key = RGWSI_Bucket::get_entrypoint_meta_key(bucket);
+ ret = svc.bucket->read_bucket_entrypoint_info(ctx, meta_key, &ep, &ot, nullptr, &attrs, y, dpp);
+ if (ret == -ENOENT)
+ return 0;
+ if (ret < 0)
+ return ret;
+
+ if (!ep.linked)
+ return 0;
+
+ if (ep.owner != user_id) {
+ ldpp_dout(dpp, 0) << "bucket entry point user mismatch, can't unlink bucket: " << ep.owner << " != " << user_id << dendl;
+ return -EINVAL;
+ }
+
+ ep.linked = false;
+ return svc.bucket->store_bucket_entrypoint_info(ctx, meta_key, ep, false, real_time(), &attrs, &ot, y, dpp);
+}
+
+int RGWBucketCtl::read_bucket_stats(const rgw_bucket& bucket,
+ RGWBucketEnt *result,
+ optional_yield y,
+ const DoutPrefixProvider *dpp)
+{
+ return call([&](RGWSI_Bucket_X_Ctx& ctx) {
+ return svc.bucket->read_bucket_stats(ctx, bucket, result, y, dpp);
+ });
+}
+
+int RGWBucketCtl::read_buckets_stats(map<string, RGWBucketEnt>& m,
+ optional_yield y, const DoutPrefixProvider *dpp)
+{
+ return call([&](RGWSI_Bucket_X_Ctx& ctx) {
+ return svc.bucket->read_buckets_stats(ctx, m, y, dpp);
+ });
+}
+
+int RGWBucketCtl::sync_user_stats(const DoutPrefixProvider *dpp,
+ const rgw_user& user_id,
+ const RGWBucketInfo& bucket_info,
+ optional_yield y,
+ RGWBucketEnt* pent)
+{
+ RGWBucketEnt ent;
+ if (!pent) {
+ pent = &ent;
+ }
+ int r = svc.bi->read_stats(dpp, bucket_info, pent, y);
+ if (r < 0) {
+ ldpp_dout(dpp, 20) << __func__ << "(): failed to read bucket stats (r=" << r << ")" << dendl;
+ return r;
+ }
+
+ return svc.user->flush_bucket_stats(dpp, user_id, *pent, y);
+}
+
+int RGWBucketCtl::get_sync_policy_handler(std::optional<rgw_zone_id> zone,
+ std::optional<rgw_bucket> bucket,
+ RGWBucketSyncPolicyHandlerRef *phandler,
+ optional_yield y,
+ const DoutPrefixProvider *dpp)
+{
+ int r = call([&](RGWSI_Bucket_X_Ctx& ctx) {
+ return svc.bucket_sync->get_policy_handler(ctx, zone, bucket, phandler, y, dpp);
+ });
+ if (r < 0) {
+ ldpp_dout(dpp, 20) << __func__ << "(): failed to get policy handler for bucket=" << bucket << " (r=" << r << ")" << dendl;
+ return r;
+ }
+ return 0;
+}
+
+int RGWBucketCtl::bucket_exports_data(const rgw_bucket& bucket,
+ optional_yield y,
+ const DoutPrefixProvider *dpp)
+{
+
+ RGWBucketSyncPolicyHandlerRef handler;
+
+ int r = get_sync_policy_handler(std::nullopt, bucket, &handler, y, dpp);
+ if (r < 0) {
+ return r;
+ }
+
+ return handler->bucket_exports_data();
+}
+
+int RGWBucketCtl::bucket_imports_data(const rgw_bucket& bucket,
+ optional_yield y, const DoutPrefixProvider *dpp)
+{
+
+ RGWBucketSyncPolicyHandlerRef handler;
+
+ int r = get_sync_policy_handler(std::nullopt, bucket, &handler, y, dpp);
+ if (r < 0) {
+ return r;
+ }
+
+ return handler->bucket_imports_data();
+}
+
+RGWBucketMetadataHandlerBase* RGWBucketMetaHandlerAllocator::alloc()
+{
+ return new RGWBucketMetadataHandler();
+}
+
+RGWBucketInstanceMetadataHandlerBase* RGWBucketInstanceMetaHandlerAllocator::alloc(rgw::sal::Driver* driver)
+{
+ return new RGWBucketInstanceMetadataHandler(driver);
+}
+
+RGWBucketMetadataHandlerBase* RGWArchiveBucketMetaHandlerAllocator::alloc()
+{
+ return new RGWArchiveBucketMetadataHandler();
+}
+
+RGWBucketInstanceMetadataHandlerBase* RGWArchiveBucketInstanceMetaHandlerAllocator::alloc(rgw::sal::Driver* driver)
+{
+ return new RGWArchiveBucketInstanceMetadataHandler(driver);
+}
+
+
+void RGWBucketEntryPoint::generate_test_instances(list<RGWBucketEntryPoint*>& o)
+{
+ RGWBucketEntryPoint *bp = new RGWBucketEntryPoint();
+ init_bucket(&bp->bucket, "tenant", "bucket", "pool", ".index.pool", "marker", "10");
+ bp->owner = "owner";
+ bp->creation_time = ceph::real_clock::from_ceph_timespec({ceph_le32(2), ceph_le32(3)});
+
+ o.push_back(bp);
+ o.push_back(new RGWBucketEntryPoint);
+}
+
+void RGWBucketEntryPoint::dump(Formatter *f) const
+{
+ encode_json("bucket", bucket, f);
+ encode_json("owner", owner, f);
+ utime_t ut(creation_time);
+ encode_json("creation_time", ut, f);
+ encode_json("linked", linked, f);
+ encode_json("has_bucket_info", has_bucket_info, f);
+ if (has_bucket_info) {
+ encode_json("old_bucket_info", old_bucket_info, f);
+ }
+}
+
+void RGWBucketEntryPoint::decode_json(JSONObj *obj) {
+ JSONDecoder::decode_json("bucket", bucket, obj);
+ JSONDecoder::decode_json("owner", owner, obj);
+ utime_t ut;
+ JSONDecoder::decode_json("creation_time", ut, obj);
+ creation_time = ut.to_real_time();
+ JSONDecoder::decode_json("linked", linked, obj);
+ JSONDecoder::decode_json("has_bucket_info", has_bucket_info, obj);
+ if (has_bucket_info) {
+ JSONDecoder::decode_json("old_bucket_info", old_bucket_info, obj);
+ }
+}
+
diff --git a/src/rgw/driver/rados/rgw_bucket.h b/src/rgw/driver/rados/rgw_bucket.h
new file mode 100644
index 000000000..c13e737ce
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_bucket.h
@@ -0,0 +1,766 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include <string>
+#include <memory>
+#include <variant>
+
+#include <boost/container/flat_map.hpp>
+#include <boost/container/flat_set.hpp>
+
+#include "include/types.h"
+#include "rgw_common.h"
+#include "rgw_tools.h"
+#include "rgw_metadata.h"
+#include "rgw/rgw_bucket.h"
+
+#include "rgw_string.h"
+#include "rgw_sal.h"
+
+#include "common/Formatter.h"
+#include "common/lru_map.h"
+#include "common/ceph_time.h"
+
+#include "rgw_formats.h"
+
+#include "services/svc_bucket_types.h"
+#include "services/svc_bucket_sync.h"
+
+// define as static when RGWBucket implementation completes
+extern void rgw_get_buckets_obj(const rgw_user& user_id, std::string& buckets_obj_id);
+
+class RGWSI_Meta;
+class RGWBucketMetadataHandler;
+class RGWBucketInstanceMetadataHandler;
+class RGWUserCtl;
+class RGWBucketCtl;
+class RGWZone;
+struct RGWZoneParams;
+
+// this is used as a filter to RGWRados::cls_bucket_list_ordered; it
+// conforms to the type RGWBucketListNameFilter
+extern bool rgw_bucket_object_check_filter(const std::string& oid);
+
+void init_default_bucket_layout(CephContext *cct, rgw::BucketLayout& layout,
+ const RGWZone& zone,
+ std::optional<uint32_t> shards,
+ std::optional<rgw::BucketIndexType> type);
+
+struct RGWBucketCompleteInfo {
+ RGWBucketInfo info;
+ std::map<std::string, bufferlist> attrs;
+
+ void dump(Formatter *f) const;
+ void decode_json(JSONObj *obj);
+};
+
+class RGWBucketEntryMetadataObject : public RGWMetadataObject {
+ RGWBucketEntryPoint ep;
+ std::map<std::string, bufferlist> attrs;
+public:
+ RGWBucketEntryMetadataObject(RGWBucketEntryPoint& _ep, const obj_version& v, real_time m) : ep(_ep) {
+ objv = v;
+ mtime = m;
+ set_pattrs(&attrs);
+ }
+ RGWBucketEntryMetadataObject(RGWBucketEntryPoint& _ep, const obj_version& v, real_time m, std::map<std::string, bufferlist>&& _attrs) :
+ ep(_ep), attrs(std::move(_attrs)) {
+ objv = v;
+ mtime = m;
+ set_pattrs(&attrs);
+ }
+
+ void dump(Formatter *f) const override {
+ ep.dump(f);
+ }
+
+ RGWBucketEntryPoint& get_ep() {
+ return ep;
+ }
+
+ std::map<std::string, bufferlist>& get_attrs() {
+ return attrs;
+ }
+};
+
+class RGWBucketInstanceMetadataObject : public RGWMetadataObject {
+ RGWBucketCompleteInfo info;
+public:
+ RGWBucketInstanceMetadataObject() {}
+ RGWBucketInstanceMetadataObject(RGWBucketCompleteInfo& i, const obj_version& v, real_time m) : info(i) {
+ objv = v;
+ mtime = m;
+ }
+
+ void dump(Formatter *f) const override {
+ info.dump(f);
+ }
+
+ void decode_json(JSONObj *obj) {
+ info.decode_json(obj);
+ }
+
+ RGWBucketCompleteInfo& get_bci() {
+ return info;
+ }
+ RGWBucketInfo& get_bucket_info() {
+ return info.info;
+ }
+};
+
+/**
+ * store a list of the user's buckets, with associated functions.
+ */
+class RGWUserBuckets {
+ std::map<std::string, RGWBucketEnt> buckets;
+
+public:
+ RGWUserBuckets() = default;
+ RGWUserBuckets(RGWUserBuckets&&) = default;
+
+ RGWUserBuckets& operator=(const RGWUserBuckets&) = default;
+
+ void encode(bufferlist& bl) const {
+ using ceph::encode;
+ encode(buckets, bl);
+ }
+ void decode(bufferlist::const_iterator& bl) {
+ using ceph::decode;
+ decode(buckets, bl);
+ }
+ /**
+ * Check if the user owns a bucket by the given name.
+ */
+ bool owns(std::string& name) {
+ std::map<std::string, RGWBucketEnt>::iterator iter;
+ iter = buckets.find(name);
+ return (iter != buckets.end());
+ }
+
+ /**
+ * Add a (created) bucket to the user's bucket list.
+ */
+ void add(const RGWBucketEnt& bucket) {
+ buckets[bucket.bucket.name] = bucket;
+ }
+
+ /**
+ * Remove a bucket from the user's list by name.
+ */
+ void remove(const std::string& name) {
+ std::map<std::string, RGWBucketEnt>::iterator iter;
+ iter = buckets.find(name);
+ if (iter != buckets.end()) {
+ buckets.erase(iter);
+ }
+ }
+
+ /**
+ * Get the user's buckets as a map.
+ */
+ std::map<std::string, RGWBucketEnt>& get_buckets() { return buckets; }
+
+ /**
+ * Cleanup data structure
+ */
+ void clear() { buckets.clear(); }
+
+ size_t count() { return buckets.size(); }
+};
+WRITE_CLASS_ENCODER(RGWUserBuckets)
+
+class RGWBucketMetadataHandlerBase : public RGWMetadataHandler_GenericMetaBE {
+public:
+ virtual ~RGWBucketMetadataHandlerBase() {}
+ virtual void init(RGWSI_Bucket *bucket_svc,
+ RGWBucketCtl *bucket_ctl) = 0;
+
+};
+
+class RGWBucketInstanceMetadataHandlerBase : public RGWMetadataHandler_GenericMetaBE {
+public:
+ virtual ~RGWBucketInstanceMetadataHandlerBase() {}
+ virtual void init(RGWSI_Zone *zone_svc,
+ RGWSI_Bucket *bucket_svc,
+ RGWSI_BucketIndex *bi_svc) = 0;
+};
+
+class RGWBucketMetaHandlerAllocator {
+public:
+ static RGWBucketMetadataHandlerBase *alloc();
+};
+
+class RGWBucketInstanceMetaHandlerAllocator {
+public:
+ static RGWBucketInstanceMetadataHandlerBase *alloc(rgw::sal::Driver* driver);
+};
+
+class RGWArchiveBucketMetaHandlerAllocator {
+public:
+ static RGWBucketMetadataHandlerBase *alloc();
+};
+
+class RGWArchiveBucketInstanceMetaHandlerAllocator {
+public:
+ static RGWBucketInstanceMetadataHandlerBase *alloc(rgw::sal::Driver* driver);
+};
+
+extern int rgw_remove_object(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver, rgw::sal::Bucket* bucket, rgw_obj_key& key);
+
+extern int rgw_object_get_attr(rgw::sal::Driver* driver, rgw::sal::Object* obj,
+ const char* attr_name, bufferlist& out_bl,
+ optional_yield y);
+
+extern void check_bad_user_bucket_mapping(rgw::sal::Driver* driver, rgw::sal::User& user, bool fix, optional_yield y, const DoutPrefixProvider *dpp);
+
+struct RGWBucketAdminOpState {
+ rgw_user uid;
+ std::string display_name;
+ std::string bucket_name;
+ std::string bucket_id;
+ std::string object_name;
+ std::string new_bucket_name;
+
+ bool list_buckets;
+ bool stat_buckets;
+ bool check_objects;
+ bool fix_index;
+ bool delete_child_objects;
+ bool bucket_stored;
+ bool sync_bucket;
+ bool dump_keys;
+ bool hide_progress;
+ int max_aio = 0;
+ ceph::timespan min_age = std::chrono::hours::zero();
+
+ std::unique_ptr<rgw::sal::Bucket> bucket;
+
+ RGWQuotaInfo quota;
+ RGWRateLimitInfo ratelimit_info;
+
+ void set_fetch_stats(bool value) { stat_buckets = value; }
+ void set_check_objects(bool value) { check_objects = value; }
+ void set_fix_index(bool value) { fix_index = value; }
+ void set_delete_children(bool value) { delete_child_objects = value; }
+ void set_hide_progress(bool value) { hide_progress = value; }
+ void set_dump_keys(bool value) { dump_keys = value; }
+
+ void set_max_aio(int value) { max_aio = value; }
+ void set_min_age(ceph::timespan value) { min_age = value; }
+
+ void set_user_id(const rgw_user& user_id) {
+ if (!user_id.empty())
+ uid = user_id;
+ }
+ void set_tenant(const std::string& tenant_str) {
+ uid.tenant = tenant_str;
+ }
+ void set_bucket_name(const std::string& bucket_str) {
+ bucket_name = bucket_str;
+ }
+ void set_object(std::string& object_str) {
+ object_name = object_str;
+ }
+ void set_new_bucket_name(std::string& new_bucket_str) {
+ new_bucket_name = new_bucket_str;
+ }
+ void set_quota(RGWQuotaInfo& value) {
+ quota = value;
+ }
+ void set_bucket_ratelimit(RGWRateLimitInfo& value) {
+ ratelimit_info = value;
+ }
+
+
+ void set_sync_bucket(bool value) { sync_bucket = value; }
+
+ rgw_user& get_user_id() { return uid; }
+ std::string& get_user_display_name() { return display_name; }
+ std::string& get_bucket_name() { return bucket_name; }
+ std::string& get_object_name() { return object_name; }
+ std::string& get_tenant() { return uid.tenant; }
+
+ rgw::sal::Bucket* get_bucket() { return bucket.get(); }
+ void set_bucket(std::unique_ptr<rgw::sal::Bucket> _bucket) {
+ bucket = std::move(_bucket);
+ bucket_stored = true;
+ }
+
+ void set_bucket_id(const std::string& bi) {
+ bucket_id = bi;
+ }
+ const std::string& get_bucket_id() { return bucket_id; }
+
+ bool will_fetch_stats() { return stat_buckets; }
+ bool will_fix_index() { return fix_index; }
+ bool will_delete_children() { return delete_child_objects; }
+ bool will_check_objects() { return check_objects; }
+ bool is_user_op() { return !uid.empty(); }
+ bool is_system_op() { return uid.empty(); }
+ bool has_bucket_stored() { return bucket_stored; }
+ int get_max_aio() { return max_aio; }
+ bool will_sync_bucket() { return sync_bucket; }
+
+ RGWBucketAdminOpState() : list_buckets(false), stat_buckets(false), check_objects(false),
+ fix_index(false), delete_child_objects(false),
+ bucket_stored(false), sync_bucket(true),
+ dump_keys(false), hide_progress(false) {}
+};
+
+
+/*
+ * A simple wrapper class for administrative bucket operations
+ */
+class RGWBucket {
+ RGWUserBuckets buckets;
+ rgw::sal::Driver* driver;
+ RGWAccessHandle handle;
+
+ std::unique_ptr<rgw::sal::Bucket> bucket;
+ std::unique_ptr<rgw::sal::User> user;
+
+ bool failure;
+
+ RGWObjVersionTracker ep_objv; // entrypoint object version
+
+public:
+ RGWBucket() : driver(NULL), handle(NULL), failure(false) {}
+ int init(rgw::sal::Driver* storage, RGWBucketAdminOpState& op_state, optional_yield y,
+ const DoutPrefixProvider *dpp, std::string *err_msg = NULL);
+
+ int check_bad_index_multipart(RGWBucketAdminOpState& op_state,
+ RGWFormatterFlusher& flusher,
+ const DoutPrefixProvider *dpp, std::string *err_msg = NULL);
+
+ int check_object_index(const DoutPrefixProvider *dpp,
+ RGWBucketAdminOpState& op_state,
+ RGWFormatterFlusher& flusher,
+ optional_yield y,
+ std::string *err_msg = NULL);
+ int check_index_olh(rgw::sal::RadosStore* rados_store, const DoutPrefixProvider *dpp, RGWBucketAdminOpState& op_state,
+ RGWFormatterFlusher& flusher);
+ int check_index_unlinked(rgw::sal::RadosStore* rados_store, const DoutPrefixProvider *dpp, RGWBucketAdminOpState& op_state,
+ RGWFormatterFlusher& flusher);
+
+ int check_index(const DoutPrefixProvider *dpp,
+ RGWBucketAdminOpState& op_state,
+ std::map<RGWObjCategory, RGWStorageStats>& existing_stats,
+ std::map<RGWObjCategory, RGWStorageStats>& calculated_stats,
+ std::string *err_msg = NULL);
+
+ int chown(RGWBucketAdminOpState& op_state, const std::string& marker,
+ optional_yield y, const DoutPrefixProvider *dpp, std::string *err_msg = NULL);
+ int set_quota(RGWBucketAdminOpState& op_state, const DoutPrefixProvider *dpp, std::string *err_msg = NULL);
+
+ int remove_object(const DoutPrefixProvider *dpp, RGWBucketAdminOpState& op_state, std::string *err_msg = NULL);
+ int policy_bl_to_stream(bufferlist& bl, std::ostream& o);
+ int get_policy(RGWBucketAdminOpState& op_state, RGWAccessControlPolicy& policy, optional_yield y, const DoutPrefixProvider *dpp);
+ int sync(RGWBucketAdminOpState& op_state, const DoutPrefixProvider *dpp, std::string *err_msg = NULL);
+
+ void clear_failure() { failure = false; }
+
+ const RGWBucketInfo& get_bucket_info() const { return bucket->get_info(); }
+};
+
+class RGWBucketAdminOp {
+public:
+ static int get_policy(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state,
+ RGWFormatterFlusher& flusher, const DoutPrefixProvider *dpp);
+ static int get_policy(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state,
+ RGWAccessControlPolicy& policy, const DoutPrefixProvider *dpp);
+ static int dump_s3_policy(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state,
+ std::ostream& os, const DoutPrefixProvider *dpp);
+
+ static int unlink(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state, const DoutPrefixProvider *dpp);
+ static int link(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state, const DoutPrefixProvider *dpp, std::string *err_msg = NULL);
+ static int chown(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state, const std::string& marker, const DoutPrefixProvider *dpp, std::string *err_msg = NULL);
+
+ static int check_index(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state,
+ RGWFormatterFlusher& flusher, optional_yield y, const DoutPrefixProvider *dpp);
+ static int check_index_olh(rgw::sal::RadosStore* driver, RGWBucketAdminOpState& op_state,
+ RGWFormatterFlusher& flusher, const DoutPrefixProvider *dpp);
+ static int check_index_unlinked(rgw::sal::RadosStore* driver, RGWBucketAdminOpState& op_state,
+ RGWFormatterFlusher& flusher, const DoutPrefixProvider *dpp);
+
+ static int remove_bucket(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state, optional_yield y,
+ const DoutPrefixProvider *dpp, bool bypass_gc = false, bool keep_index_consistent = true);
+ static int remove_object(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state, const DoutPrefixProvider *dpp);
+ static int info(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state, RGWFormatterFlusher& flusher, optional_yield y, const DoutPrefixProvider *dpp);
+ static int limit_check(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state,
+ const std::list<std::string>& user_ids,
+ RGWFormatterFlusher& flusher, optional_yield y,
+ const DoutPrefixProvider *dpp,
+ bool warnings_only = false);
+ static int set_quota(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state, const DoutPrefixProvider *dpp);
+
+ static int list_stale_instances(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state,
+ RGWFormatterFlusher& flusher, const DoutPrefixProvider *dpp);
+
+ static int clear_stale_instances(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state,
+ RGWFormatterFlusher& flusher, const DoutPrefixProvider *dpp);
+ static int fix_lc_shards(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state,
+ RGWFormatterFlusher& flusher, const DoutPrefixProvider *dpp);
+ static int fix_obj_expiry(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state,
+ RGWFormatterFlusher& flusher, const DoutPrefixProvider *dpp, bool dry_run = false);
+
+ static int sync_bucket(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state, const DoutPrefixProvider *dpp, std::string *err_msg = NULL);
+};
+
+struct rgw_ep_info {
+ RGWBucketEntryPoint &ep;
+ std::map<std::string, buffer::list>& attrs;
+ RGWObjVersionTracker ep_objv;
+ rgw_ep_info(RGWBucketEntryPoint &ep, std::map<std::string, bufferlist>& attrs)
+ : ep(ep), attrs(attrs) {}
+};
+
+class RGWBucketCtl {
+ CephContext *cct;
+
+ struct Svc {
+ RGWSI_Zone *zone{nullptr};
+ RGWSI_Bucket *bucket{nullptr};
+ RGWSI_Bucket_Sync *bucket_sync{nullptr};
+ RGWSI_BucketIndex *bi{nullptr};
+ RGWSI_User* user = nullptr;
+ } svc;
+
+ struct Ctl {
+ RGWUserCtl *user{nullptr};
+ } ctl;
+
+ RGWBucketMetadataHandler *bm_handler;
+ RGWBucketInstanceMetadataHandler *bmi_handler;
+
+ RGWSI_Bucket_BE_Handler bucket_be_handler; /* bucket backend handler */
+ RGWSI_BucketInstance_BE_Handler bi_be_handler; /* bucket instance backend handler */
+
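+ /* run f with a combined entrypoint + bucket-instance backend context */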
+ int call(std::function<int(RGWSI_Bucket_X_Ctx& ctx)> f);
+
+public:
+ RGWBucketCtl(RGWSI_Zone *zone_svc,
+ RGWSI_Bucket *bucket_svc,
+ RGWSI_Bucket_Sync *bucket_sync_svc,
+ RGWSI_BucketIndex *bi_svc,
+ RGWSI_User* user_svc);
+
+ void init(RGWUserCtl *user_ctl,
+ RGWBucketMetadataHandler *_bm_handler,
+ RGWBucketInstanceMetadataHandler *_bmi_handler,
+ RGWDataChangesLog *datalog,
+ const DoutPrefixProvider *dpp);
+
+ struct Bucket {
+ struct GetParams {
+ RGWObjVersionTracker *objv_tracker{nullptr};
+ real_time *mtime{nullptr};
+ std::map<std::string, bufferlist> *attrs{nullptr};
+ rgw_cache_entry_info *cache_info{nullptr};
+ boost::optional<obj_version> refresh_version;
+ std::optional<RGWSI_MetaBackend_CtxParams> bectx_params;
+
+ GetParams() {}
+
+ GetParams& set_objv_tracker(RGWObjVersionTracker *_objv_tracker) {
+ objv_tracker = _objv_tracker;
+ return *this;
+ }
+
+ GetParams& set_mtime(ceph::real_time *_mtime) {
+ mtime = _mtime;
+ return *this;
+ }
+
+ GetParams& set_attrs(std::map<std::string, bufferlist> *_attrs) {
+ attrs = _attrs;
+ return *this;
+ }
+
+ GetParams& set_cache_info(rgw_cache_entry_info *_cache_info) {
+ cache_info = _cache_info;
+ return *this;
+ }
+
+ GetParams& set_refresh_version(const obj_version& _refresh_version) {
+ refresh_version = _refresh_version;
+ return *this;
+ }
+
+ GetParams& set_bectx_params(std::optional<RGWSI_MetaBackend_CtxParams> _bectx_params) {
+ bectx_params = _bectx_params;
+ return *this;
+ }
+ };
+
+ struct PutParams {
+ RGWObjVersionTracker *objv_tracker{nullptr};
+ ceph::real_time mtime;
+ bool exclusive{false};
+ std::map<std::string, bufferlist> *attrs{nullptr};
+
+ PutParams() {}
+
+ PutParams& set_objv_tracker(RGWObjVersionTracker *_objv_tracker) {
+ objv_tracker = _objv_tracker;
+ return *this;
+ }
+
+ PutParams& set_mtime(const ceph::real_time& _mtime) {
+ mtime = _mtime;
+ return *this;
+ }
+
+ PutParams& set_exclusive(bool _exclusive) {
+ exclusive = _exclusive;
+ return *this;
+ }
+
+ PutParams& set_attrs(std::map<std::string, bufferlist> *_attrs) {
+ attrs = _attrs;
+ return *this;
+ }
+ };
+
+ struct RemoveParams {
+ RGWObjVersionTracker *objv_tracker{nullptr};
+
+ RemoveParams() {}
+
+ RemoveParams& set_objv_tracker(RGWObjVersionTracker *_objv_tracker) {
+ objv_tracker = _objv_tracker;
+ return *this;
+ }
+ };
+ };
+
+ struct BucketInstance {
+ struct GetParams {
+ real_time *mtime{nullptr};
+ std::map<std::string, bufferlist> *attrs{nullptr};
+ rgw_cache_entry_info *cache_info{nullptr};
+ boost::optional<obj_version> refresh_version;
+ RGWObjVersionTracker *objv_tracker{nullptr};
+ std::optional<RGWSI_MetaBackend_CtxParams> bectx_params;
+
+ GetParams() {}
+
+ GetParams& set_mtime(ceph::real_time *_mtime) {
+ mtime = _mtime;
+ return *this;
+ }
+
+ GetParams& set_attrs(std::map<std::string, bufferlist> *_attrs) {
+ attrs = _attrs;
+ return *this;
+ }
+
+ GetParams& set_cache_info(rgw_cache_entry_info *_cache_info) {
+ cache_info = _cache_info;
+ return *this;
+ }
+
+ GetParams& set_refresh_version(const obj_version& _refresh_version) {
+ refresh_version = _refresh_version;
+ return *this;
+ }
+
+ GetParams& set_objv_tracker(RGWObjVersionTracker *_objv_tracker) {
+ objv_tracker = _objv_tracker;
+ return *this;
+ }
+
+ GetParams& set_bectx_params(std::optional<RGWSI_MetaBackend_CtxParams> _bectx_params) {
+ bectx_params = _bectx_params;
+ return *this;
+ }
+ };
+
+ struct PutParams {
+ std::optional<RGWBucketInfo *> orig_info; /* nullopt: orig_info was not fetched,
+ nullptr: orig_info was not found (new bucket instance) */
+ ceph::real_time mtime;
+ bool exclusive{false};
+ std::map<std::string, bufferlist> *attrs{nullptr};
+ RGWObjVersionTracker *objv_tracker{nullptr};
+
+ PutParams() {}
+
+ PutParams& set_orig_info(RGWBucketInfo *pinfo) {
+ orig_info = pinfo;
+ return *this;
+ }
+
+ PutParams& set_mtime(const ceph::real_time& _mtime) {
+ mtime = _mtime;
+ return *this;
+ }
+
+ PutParams& set_exclusive(bool _exclusive) {
+ exclusive = _exclusive;
+ return *this;
+ }
+
+ PutParams& set_attrs(std::map<std::string, bufferlist> *_attrs) {
+ attrs = _attrs;
+ return *this;
+ }
+
+ PutParams& set_objv_tracker(RGWObjVersionTracker *_objv_tracker) {
+ objv_tracker = _objv_tracker;
+ return *this;
+ }
+ };
+
+ struct RemoveParams {
+ RGWObjVersionTracker *objv_tracker{nullptr};
+
+ RemoveParams() {}
+
+ RemoveParams& set_objv_tracker(RGWObjVersionTracker *_objv_tracker) {
+ objv_tracker = _objv_tracker;
+ return *this;
+ }
+ };
+ };
+
+ /* bucket entrypoint */
+ int read_bucket_entrypoint_info(const rgw_bucket& bucket,
+ RGWBucketEntryPoint *info,
+ optional_yield y,
+ const DoutPrefixProvider *dpp,
+ const Bucket::GetParams& params = {});
+ int store_bucket_entrypoint_info(const rgw_bucket& bucket,
+ RGWBucketEntryPoint& info,
+ optional_yield y,
+ const DoutPrefixProvider *dpp,
+ const Bucket::PutParams& params = {});
+ int remove_bucket_entrypoint_info(const rgw_bucket& bucket,
+ optional_yield y,
+ const DoutPrefixProvider *dpp,
+ const Bucket::RemoveParams& params = {});
+
+ /* bucket instance */
+ int read_bucket_instance_info(const rgw_bucket& bucket,
+ RGWBucketInfo *info,
+ optional_yield y,
+ const DoutPrefixProvider *dpp,
+ const BucketInstance::GetParams& params = {});
+ int store_bucket_instance_info(const rgw_bucket& bucket,
+ RGWBucketInfo& info,
+ optional_yield y,
+ const DoutPrefixProvider *dpp,
+ const BucketInstance::PutParams& params = {});
+ int remove_bucket_instance_info(const rgw_bucket& bucket,
+ RGWBucketInfo& info,
+ optional_yield y,
+ const DoutPrefixProvider *dpp,
+ const BucketInstance::RemoveParams& params = {});
+
+ /*
+ * bucket_id may or may not be provided
+ *
+ * ep_objv_tracker might not be populated even if provided. Will only be set if entrypoint is read
+ * (that is: if bucket_id is empty).
+ */
+ int read_bucket_info(const rgw_bucket& bucket,
+ RGWBucketInfo *info,
+ optional_yield y,
+ const DoutPrefixProvider *dpp,
+ const BucketInstance::GetParams& params = {},
+ RGWObjVersionTracker *ep_objv_tracker = nullptr);
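+ /* usage sketch (hypothetical caller): read by bucket name only, capturing
+ * the entrypoint version read along the way:
+ *
+ * RGWObjVersionTracker ep_objv;
+ * RGWBucketInfo info;
+ * int r = bucket_ctl->read_bucket_info(b, &info, y, dpp,
+ * RGWBucketCtl::BucketInstance::GetParams(),
+ * &ep_objv);
+ */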
+
+
+ int set_bucket_instance_attrs(RGWBucketInfo& bucket_info,
+ std::map<std::string, bufferlist>& attrs,
+ RGWObjVersionTracker *objv_tracker,
+ optional_yield y,
+ const DoutPrefixProvider *dpp);
+
+ /* user/bucket */
+ int link_bucket(const rgw_user& user_id,
+ const rgw_bucket& bucket,
+ ceph::real_time creation_time,
+ optional_yield y,
+ const DoutPrefixProvider *dpp,
+ bool update_entrypoint = true,
+ rgw_ep_info *pinfo = nullptr);
+
+ int unlink_bucket(const rgw_user& user_id,
+ const rgw_bucket& bucket,
+ optional_yield y,
+ const DoutPrefixProvider *dpp,
+ bool update_entrypoint = true);
+
+ int read_buckets_stats(std::map<std::string, RGWBucketEnt>& m,
+ optional_yield y,
+ const DoutPrefixProvider *dpp);
+
+ int read_bucket_stats(const rgw_bucket& bucket,
+ RGWBucketEnt *result,
+ optional_yield y,
+ const DoutPrefixProvider *dpp);
+
+ /* quota related */
+ int sync_user_stats(const DoutPrefixProvider *dpp,
+ const rgw_user& user_id, const RGWBucketInfo& bucket_info,
+ optional_yield y,
+ RGWBucketEnt* pent);
+
+ /* bucket sync */
+ int get_sync_policy_handler(std::optional<rgw_zone_id> zone,
+ std::optional<rgw_bucket> bucket,
+ RGWBucketSyncPolicyHandlerRef *phandler,
+ optional_yield y,
+ const DoutPrefixProvider *dpp);
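+ /* return whether the bucket exports/imports data to/from other zones
+ * (boolean result as a non-negative int), or a negative error code */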
+ int bucket_exports_data(const rgw_bucket& bucket,
+ optional_yield y,
+ const DoutPrefixProvider *dpp);
+ int bucket_imports_data(const rgw_bucket& bucket,
+ optional_yield y,
+ const DoutPrefixProvider *dpp);
+
+private:
+ int convert_old_bucket_info(RGWSI_Bucket_X_Ctx& ctx,
+ const rgw_bucket& bucket,
+ optional_yield y,
+ const DoutPrefixProvider *dpp);
+
+ int do_store_bucket_instance_info(RGWSI_Bucket_BI_Ctx& ctx,
+ const rgw_bucket& bucket,
+ RGWBucketInfo& info,
+ optional_yield y,
+ const DoutPrefixProvider *dpp,
+ const BucketInstance::PutParams& params);
+
+ int do_store_linked_bucket_info(RGWSI_Bucket_X_Ctx& ctx,
+ RGWBucketInfo& info,
+ RGWBucketInfo *orig_info,
+ bool exclusive, real_time mtime,
+ obj_version *pep_objv,
+ std::map<std::string, bufferlist> *pattrs,
+ bool create_entry_point,
+ optional_yield y,
+ const DoutPrefixProvider *dpp);
+
+ int do_link_bucket(RGWSI_Bucket_EP_Ctx& ctx,
+ const rgw_user& user,
+ const rgw_bucket& bucket,
+ ceph::real_time creation_time,
+ bool update_entrypoint,
+ rgw_ep_info *pinfo,
+ optional_yield y,
+ const DoutPrefixProvider *dpp);
+
+ int do_unlink_bucket(RGWSI_Bucket_EP_Ctx& ctx,
+ const rgw_user& user_id,
+ const rgw_bucket& bucket,
+ bool update_entrypoint,
+ optional_yield y,
+ const DoutPrefixProvider *dpp);
+
+};
+
+bool rgw_find_bucket_by_id(const DoutPrefixProvider *dpp, CephContext *cct, rgw::sal::Driver* driver, const std::string& marker,
+ const std::string& bucket_id, rgw_bucket* bucket_out);
diff --git a/src/rgw/driver/rados/rgw_bucket_sync.cc b/src/rgw/driver/rados/rgw_bucket_sync.cc
new file mode 100644
index 000000000..6ff76c16a
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_bucket_sync.cc
@@ -0,0 +1,1018 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "rgw_common.h"
+#include "rgw_bucket_sync.h"
+#include "rgw_data_sync.h"
+#include "rgw_zone.h"
+
+#include "services/svc_zone.h"
+#include "services/svc_bucket_sync.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+ostream& operator<<(ostream& os, const rgw_sync_bucket_entity& e) {
+ os << "{b=" << rgw_sync_bucket_entities::bucket_key(e.bucket) << ",z=" << e.zone.value_or(rgw_zone_id()) << ",az=" << (int)e.all_zones << "}";
+ return os;
+}
+
+ostream& operator<<(ostream& os, const rgw_sync_bucket_pipe& pipe) {
+ os << "{s=" << pipe.source << ",d=" << pipe.dest << "}";
+ return os;
+}
+
+ostream& operator<<(ostream& os, const rgw_sync_bucket_entities& e) {
+ os << "{b=" << rgw_sync_bucket_entities::bucket_key(e.bucket) << ",z=" << e.zones.value_or(std::set<rgw_zone_id>()) << "}";
+ return os;
+}
+
+ostream& operator<<(ostream& os, const rgw_sync_bucket_pipes& pipe) {
+ os << "{id=" << pipe.id << ",s=" << pipe.source << ",d=" << pipe.dest << "}";
+ return os;
+}
+
+static std::vector<rgw_sync_bucket_pipe> filter_relevant_pipes(const std::vector<rgw_sync_bucket_pipes>& pipes,
+ const rgw_zone_id& source_zone,
+ const rgw_zone_id& dest_zone)
+{
+ std::vector<rgw_sync_bucket_pipe> relevant_pipes;
+ for (auto& p : pipes) {
+ if (p.source.match_zone(source_zone) &&
+ p.dest.match_zone(dest_zone)) {
+ for (auto pipe : p.expand()) {
+ pipe.source.apply_zone(source_zone);
+ pipe.dest.apply_zone(dest_zone);
+ relevant_pipes.push_back(pipe);
+ }
+ }
+ }
+
+ return relevant_pipes;
+}
+
+static bool is_wildcard_bucket(const rgw_bucket& bucket)
+{
+ return bucket.name.empty();
+}
+
+void rgw_sync_group_pipe_map::dump(ceph::Formatter *f) const
+{
+ encode_json("zone", zone.id, f);
+ encode_json("buckets", rgw_sync_bucket_entities::bucket_key(bucket), f);
+ encode_json("sources", sources, f);
+ encode_json("dests", dests, f);
+}
+
+
+template <typename CB1, typename CB2>
+void rgw_sync_group_pipe_map::try_add_to_pipe_map(const rgw_zone_id& source_zone,
+ const rgw_zone_id& dest_zone,
+ const std::vector<rgw_sync_bucket_pipes>& pipes,
+ zb_pipe_map_t *pipe_map,
+ CB1 filter_cb,
+ CB2 call_filter_cb)
+{
+ if (!filter_cb(source_zone, nullopt, dest_zone, nullopt)) {
+ return;
+ }
+ auto relevant_pipes = filter_relevant_pipes(pipes, source_zone, dest_zone);
+
+ for (auto& pipe : relevant_pipes) {
+ rgw_sync_bucket_entity zb;
+ if (!call_filter_cb(pipe, &zb)) {
+ continue;
+ }
+ pipe_map->insert(make_pair(zb, pipe));
+ }
+}
+
+template <typename CB>
+void rgw_sync_group_pipe_map::try_add_source(const rgw_zone_id& source_zone,
+ const rgw_zone_id& dest_zone,
+ const std::vector<rgw_sync_bucket_pipes>& pipes,
+ CB filter_cb)
+{
+ return try_add_to_pipe_map(source_zone, dest_zone, pipes,
+ &sources,
+ filter_cb,
+ [&](const rgw_sync_bucket_pipe& pipe, rgw_sync_bucket_entity *zb) {
+ *zb = rgw_sync_bucket_entity{source_zone, pipe.source.get_bucket()};
+ return filter_cb(source_zone, zb->bucket, dest_zone, pipe.dest.get_bucket());
+ });
+}
+
+template <typename CB>
+void rgw_sync_group_pipe_map::try_add_dest(const rgw_zone_id& source_zone,
+ const rgw_zone_id& dest_zone,
+ const std::vector<rgw_sync_bucket_pipes>& pipes,
+ CB filter_cb)
+{
+ return try_add_to_pipe_map(source_zone, dest_zone, pipes,
+ &dests,
+ filter_cb,
+ [&](const rgw_sync_bucket_pipe& pipe, rgw_sync_bucket_entity *zb) {
+ *zb = rgw_sync_bucket_entity{dest_zone, pipe.dest.get_bucket()};
+ return filter_cb(source_zone, pipe.source.get_bucket(), dest_zone, zb->bucket);
+ });
+}
+
+using zb_pipe_map_t = rgw_sync_group_pipe_map::zb_pipe_map_t;
+
+pair<zb_pipe_map_t::const_iterator, zb_pipe_map_t::const_iterator> rgw_sync_group_pipe_map::find_pipes(const zb_pipe_map_t& m,
+ const rgw_zone_id& zone,
+ std::optional<rgw_bucket> b) const
+{
+ if (!b) {
+ return m.equal_range(rgw_sync_bucket_entity{zone, rgw_bucket()});
+ }
+
+ auto zb = rgw_sync_bucket_entity{zone, *b};
+
+ auto range = m.equal_range(zb);
+ if (range.first == range.second &&
+ !is_wildcard_bucket(*b)) {
+ /* couldn't find the specific bucket, try to find by wildcard */
+ zb.bucket = rgw_bucket();
+ range = m.equal_range(zb);
+ }
+
+ return range;
+}
+
+
+template <typename CB>
+void rgw_sync_group_pipe_map::init(const DoutPrefixProvider *dpp,
+ CephContext *cct,
+ const rgw_zone_id& _zone,
+ std::optional<rgw_bucket> _bucket,
+ const rgw_sync_policy_group& group,
+ rgw_sync_data_flow_group *_default_flow,
+ std::set<rgw_zone_id> *_pall_zones,
+ CB filter_cb) {
+ zone = _zone;
+ bucket = _bucket;
+ default_flow = _default_flow;
+ pall_zones = _pall_zones;
+
+ rgw_sync_bucket_entity zb(zone, bucket);
+
+ status = group.status;
+
+ std::vector<rgw_sync_bucket_pipes> zone_pipes;
+
+ string bucket_key = (bucket ? bucket->get_key() : "*");
+
+ /* only look at pipes that touch the specific zone and bucket */
+ for (auto& pipe : group.pipes) {
+ if (pipe.contains_zone_bucket(zone, bucket)) {
+ ldpp_dout(dpp, 20) << __func__ << "(): pipe_map (zone=" << zone << " bucket=" << bucket_key << "): adding potential pipe: " << pipe << dendl;
+ zone_pipes.push_back(pipe);
+ }
+ }
+
+ const rgw_sync_data_flow_group *pflow;
+
+ if (!group.data_flow.empty()) {
+ pflow = &group.data_flow;
+ } else {
+ if (!default_flow) {
+ return;
+ }
+ pflow = default_flow;
+ }
+
+ auto& flow = *pflow;
+
+ pall_zones->insert(zone);
+
+ /* symmetrical */
+ for (auto& symmetrical_group : flow.symmetrical) {
+ if (symmetrical_group.zones.find(zone) != symmetrical_group.zones.end()) {
+ for (auto& z : symmetrical_group.zones) {
+ if (z != zone) {
+ pall_zones->insert(z);
+ try_add_source(z, zone, zone_pipes, filter_cb);
+ try_add_dest(zone, z, zone_pipes, filter_cb);
+ }
+ }
+ }
+ }
+
+ /* directional */
+ for (auto& rule : flow.directional) {
+ if (rule.source_zone == zone) {
+ pall_zones->insert(rule.dest_zone);
+ try_add_dest(zone, rule.dest_zone, zone_pipes, filter_cb);
+ } else if (rule.dest_zone == zone) {
+ pall_zones->insert(rule.source_zone);
+ try_add_source(rule.source_zone, zone, zone_pipes, filter_cb);
+ }
+ }
+}
+
+/*
+ * find all relevant pipes in our zone that match {dest_bucket} <- {source_zone, source_bucket}
+ */
+vector<rgw_sync_bucket_pipe> rgw_sync_group_pipe_map::find_source_pipes(const rgw_zone_id& source_zone,
+ std::optional<rgw_bucket> source_bucket,
+ std::optional<rgw_bucket> dest_bucket) const {
+ vector<rgw_sync_bucket_pipe> result;
+
+ auto range = find_pipes(sources, source_zone, source_bucket);
+
+ for (auto iter = range.first; iter != range.second; ++iter) {
+ auto pipe = iter->second;
+ if (pipe.dest.match_bucket(dest_bucket)) {
+ result.push_back(pipe);
+ }
+ }
+ return result;
+}
+
+/*
+ * find all relevant pipes in other zones that pull from a specific
+ * source bucket in our zone {source_bucket} -> {dest_zone, dest_bucket}
+ */
+vector<rgw_sync_bucket_pipe> rgw_sync_group_pipe_map::find_dest_pipes(std::optional<rgw_bucket> source_bucket,
+ const rgw_zone_id& dest_zone,
+ std::optional<rgw_bucket> dest_bucket) const {
+ vector<rgw_sync_bucket_pipe> result;
+
+ auto range = find_pipes(dests, dest_zone, dest_bucket);
+
+ for (auto iter = range.first; iter != range.second; ++iter) {
+ auto pipe = iter->second;
+ if (pipe.source.match_bucket(source_bucket)) {
+ result.push_back(pipe);
+ }
+ }
+
+ return result;
+}
+
+/*
+ * find all relevant pipes from {source_zone, source_bucket} -> {dest_zone, dest_bucket}
+ */
+vector<rgw_sync_bucket_pipe> rgw_sync_group_pipe_map::find_pipes(const rgw_zone_id& source_zone,
+ std::optional<rgw_bucket> source_bucket,
+ const rgw_zone_id& dest_zone,
+ std::optional<rgw_bucket> dest_bucket) const {
+ if (dest_zone == zone) {
+ return find_source_pipes(source_zone, source_bucket, dest_bucket);
+ }
+
+ if (source_zone == zone) {
+ return find_dest_pipes(source_bucket, dest_zone, dest_bucket);
+ }
+
+ return vector<rgw_sync_bucket_pipe>();
+}
+
+void RGWBucketSyncFlowManager::pipe_rules::insert(const rgw_sync_bucket_pipe& pipe)
+{
+ pipes.push_back(pipe);
+
+ auto ppipe = &pipes.back();
+ auto prefix = ppipe->params.source.filter.prefix.value_or(string());
+
+ prefix_refs.insert(make_pair(prefix, ppipe));
+
+ for (auto& t : ppipe->params.source.filter.tags) {
+ string tag = t.key + "=" + t.value;
+ auto titer = tag_refs.find(tag);
+ if (titer != tag_refs.end() &&
+ ppipe->params.priority > titer->second->params.priority) {
+ titer->second = ppipe;
+ } else {
+ tag_refs[tag] = ppipe;
+ }
+ }
+}
+
+bool RGWBucketSyncFlowManager::pipe_rules::find_basic_info_without_tags(const rgw_obj_key& key,
+ std::optional<rgw_user> *user,
+ std::optional<rgw_user> *acl_translation_owner,
+ std::optional<string> *storage_class,
+ rgw_sync_pipe_params::Mode *mode,
+ bool *need_more_info) const
+{
+ std::optional<string> owner;
+
+ *need_more_info = false;
+
+ if (prefix_refs.empty()) {
+ return false;
+ }
+
+ auto end = prefix_refs.upper_bound(key.name);
+ auto iter = end;
+ if (iter != prefix_refs.begin()) {
+ --iter;
+ }
+ if (iter == prefix_refs.end()) {
+ return false;
+ }
+
+ if (iter != prefix_refs.begin()) {
+ iter = prefix_refs.find(iter->first); /* prefix_refs is multimap, find first element
+ holding that key */
+ }
+
+ std::vector<decltype(iter)> iters;
+
+ std::optional<int> priority;
+
+ for (; iter != end; ++iter) {
+ auto& prefix = iter->first;
+ if (!boost::starts_with(key.name, prefix)) {
+ continue;
+ }
+
+ auto& rule_params = iter->second->params;
+ auto& filter = rule_params.source.filter;
+
+ if (rule_params.priority > priority) {
+ priority = rule_params.priority;
+
+ if (!filter.has_tags()) {
+ iters.clear();
+ }
+ iters.push_back(iter);
+
+ *need_more_info = filter.has_tags(); /* if highest priority filter has tags, then
+ we can't be sure if it would be used.
+ We need to first read the info from the source object */
+ }
+ }
+
+ if (iters.empty()) {
+ return false;
+ }
+
+ std::optional<rgw_user> _user;
+ std::optional<rgw_sync_pipe_acl_translation> _acl_translation;
+ std::optional<string> _storage_class;
+ rgw_sync_pipe_params::Mode _mode{rgw_sync_pipe_params::Mode::MODE_SYSTEM};
+
+ // make sure all params are the same by saving the first one
+ // encountered and comparing all subsequent to it
+ bool first_iter = true;
+ for (auto& iter : iters) {
+ const rgw_sync_pipe_params& rule_params = iter->second->params;
+ if (first_iter) {
+ _user = rule_params.user;
+ _acl_translation = rule_params.dest.acl_translation;
+ _storage_class = rule_params.dest.storage_class;
+ _mode = rule_params.mode;
+ first_iter = false;
+ } else {
+ // note: three of these == operators are comparing std::optional
+ // against std::optional; as one would expect they are equal a)
+ // if both do not contain values or b) if both do and those
+ // contained values are the same
+ const bool conflict =
+ !(_user == rule_params.user &&
+ _acl_translation == rule_params.dest.acl_translation &&
+ _storage_class == rule_params.dest.storage_class &&
+ _mode == rule_params.mode);
+ if (conflict) {
+ *need_more_info = true;
+ return false;
+ }
+ }
+ }
+
+ *user = _user;
+ if (_acl_translation) {
+ *acl_translation_owner = _acl_translation->owner;
+ }
+ *storage_class = _storage_class;
+ *mode = _mode;
+
+ return true;
+}
+
+bool RGWBucketSyncFlowManager::pipe_rules::find_obj_params(const rgw_obj_key& key,
+ const RGWObjTags::tag_map_t& tags,
+ rgw_sync_pipe_params *params) const
+{
+ if (prefix_refs.empty()) {
+ return false;
+ }
+
+ auto iter = prefix_refs.upper_bound(key.name);
+ if (iter != prefix_refs.begin()) {
+ --iter;
+ }
+ if (iter == prefix_refs.end()) {
+ return false;
+ }
+
+ auto end = prefix_refs.upper_bound(key.name);
+ auto max = end;
+
+ std::optional<int> priority;
+
+ for (; iter != end; ++iter) {
+ /* NOTE: this is not the most efficient way to do it,
+ * a trie data structure would be better
+ */
+ auto& prefix = iter->first;
+ if (!boost::starts_with(key.name, prefix)) {
+ continue;
+ }
+
+ auto& rule_params = iter->second->params;
+ auto& filter = rule_params.source.filter;
+
+ if (!filter.check_tags(tags)) {
+ continue;
+ }
+
+ if (rule_params.priority > priority) {
+ priority = rule_params.priority;
+ max = iter;
+ }
+ }
+
+ if (max == end) {
+ return false;
+ }
+
+ *params = max->second->params;
+ return true;
+}
+
+/*
+ * return either the current prefix for s, or the next one if s is not within a prefix
+ */
+
+RGWBucketSyncFlowManager::pipe_rules::prefix_map_t::const_iterator RGWBucketSyncFlowManager::pipe_rules::prefix_search(const std::string& s) const
+{
+ if (prefix_refs.empty()) {
+ return prefix_refs.end();
+ }
+ auto next = prefix_refs.upper_bound(s);
+ auto iter = next;
+ if (iter != prefix_refs.begin()) {
+ --iter;
+ }
+ if (!boost::starts_with(s, iter->first)) {
+ return next;
+ }
+
+ return iter;
+}
+
+void RGWBucketSyncFlowManager::pipe_set::insert(const rgw_sync_bucket_pipe& pipe) {
+ /* Ensure this pipe doesn't match with any disabled pipes */
+ for (auto p: disabled_pipe_map) {
+ if (p.second.source.match(pipe.source) && p.second.dest.match(pipe.dest)) {
+ return;
+ }
+ }
+ pipe_map.insert(make_pair(pipe.id, pipe));
+
+ auto& rules_ref = rules[endpoints_pair(pipe)];
+
+ if (!rules_ref) {
+ rules_ref = make_shared<RGWBucketSyncFlowManager::pipe_rules>();
+ }
+
+ rules_ref->insert(pipe);
+
+ pipe_handler h(rules_ref, pipe);
+
+ handlers.insert(h);
+}
+
+void RGWBucketSyncFlowManager::pipe_set::remove_all() {
+ pipe_map.clear();
+ disabled_pipe_map.clear();
+ rules.clear();
+ handlers.clear();
+}
+
+void RGWBucketSyncFlowManager::pipe_set::disable(const rgw_sync_bucket_pipe& pipe) {
+ /* This pipe is disabled. Add it to disabled pipes & remove any
+ * matching pipes already inserted
+ */
+ disabled_pipe_map.insert(make_pair(pipe.id, pipe));
+ for (auto iter_p = pipe_map.begin(); iter_p != pipe_map.end(); ) {
+ auto p = iter_p++;
+ if (p->second.source.match(pipe.source) && p->second.dest.match(pipe.dest)) {
+ auto& rules_ref = rules[endpoints_pair(p->second)];
+ if (rules_ref) {
+ pipe_handler h(rules_ref, p->second);
+ handlers.erase(h);
+ }
+ rules.erase(endpoints_pair(p->second));
+ pipe_map.erase(p);
+ }
+ }
+}
+
+void RGWBucketSyncFlowManager::pipe_set::dump(ceph::Formatter *f) const
+{
+ encode_json("pipes", pipe_map, f);
+}
+
+bool RGWBucketSyncFlowManager::allowed_data_flow(const rgw_zone_id& source_zone,
+ std::optional<rgw_bucket> source_bucket,
+ const rgw_zone_id& dest_zone,
+ std::optional<rgw_bucket> dest_bucket,
+ bool check_activated) const
+{
+ bool found = false;
+ bool found_activated = false;
+
+ for (auto m : flow_groups) {
+ auto& fm = m.second;
+ auto pipes = fm.find_pipes(source_zone, source_bucket,
+ dest_zone, dest_bucket);
+
+ bool is_found = !pipes.empty();
+
+ if (is_found) {
+ switch (fm.status) {
+ case rgw_sync_policy_group::Status::FORBIDDEN:
+ return false;
+ case rgw_sync_policy_group::Status::ENABLED:
+ found = true;
+ found_activated = true;
+ break;
+ case rgw_sync_policy_group::Status::ALLOWED:
+ found = true;
+ break;
+ default:
+ break; /* unknown -- ignore */
+ }
+ }
+ }
+
+ if (check_activated && found_activated) {
+ return true;
+ }
+
+ return found;
+}
+
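+/* build the per-group pipe maps for this zone/bucket from the sync policy;
+ * a parent (zonegroup-level) flow manager, when present, supplies the default
+ * data flow and is consulted to filter out flows it does not allow */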
+void RGWBucketSyncFlowManager::init(const DoutPrefixProvider *dpp, const rgw_sync_policy_info& sync_policy) {
+ std::optional<rgw_sync_data_flow_group> default_flow;
+ if (parent) {
+ default_flow.emplace();
+ default_flow->init_default(parent->all_zones);
+ }
+
+ for (auto& item : sync_policy.groups) {
+ auto& group = item.second;
+ auto& flow_group_map = flow_groups[group.id];
+
+ flow_group_map.init(dpp, cct, zone_id, bucket, group,
+ (default_flow ? &(*default_flow) : nullptr),
+ &all_zones,
+ [&](const rgw_zone_id& source_zone,
+ std::optional<rgw_bucket> source_bucket,
+ const rgw_zone_id& dest_zone,
+ std::optional<rgw_bucket> dest_bucket) {
+ if (!parent) {
+ return true;
+ }
+ return parent->allowed_data_flow(source_zone,
+ source_bucket,
+ dest_zone,
+ dest_bucket,
+ false); /* just check that it's not disabled */
+ });
+ }
+}
+
+/*
+* These are the semantics to be followed while resolving the policy
+* conflicts -
+*
+* ==================================================
+* zonegroup          bucket             Result
+* ==================================================
+* enabled            enabled            enabled
+*                    allowed            enabled
+*                    forbidden          disabled
+* allowed            enabled            enabled
+*                    allowed            disabled
+*                    forbidden          disabled
+* forbidden          enabled            disabled
+*                    allowed            disabled
+*                    forbidden          disabled
+*
+* In case multiple group policies are reflected for any sync pair
+* (<source-zone,source-bucket>, <dest-zone,dest-bucket>), the following
+* rules are applied, in order:
+* 1) If even one policy status is FORBIDDEN, the sync is disabled.
+* 2) At least one policy must be ENABLED for the sync to be allowed.
+*
+*/
+void RGWBucketSyncFlowManager::reflect(const DoutPrefixProvider *dpp,
+ std::optional<rgw_bucket> effective_bucket,
+ RGWBucketSyncFlowManager::pipe_set *source_pipes,
+ RGWBucketSyncFlowManager::pipe_set *dest_pipes,
+ bool only_enabled) const
+
+{
+ string effective_bucket_key;
+ bool is_forbidden = false;
+ if (effective_bucket) {
+ effective_bucket_key = effective_bucket->get_key();
+ }
+ if (parent) {
+ parent->reflect(dpp, effective_bucket, source_pipes, dest_pipes, only_enabled);
+ }
+
+ for (auto& item : flow_groups) {
+ auto& flow_group_map = item.second;
+ is_forbidden = false;
+
+ if (flow_group_map.status == rgw_sync_policy_group::Status::FORBIDDEN) {
+ /* FORBIDDEN takes precedence over all the other rules.
+ * Remove any other pipes which may allow access.
+ */
+ is_forbidden = true;
+ } else if (flow_group_map.status != rgw_sync_policy_group::Status::ENABLED &&
+ (only_enabled || flow_group_map.status != rgw_sync_policy_group::Status::ALLOWED)) {
+ /* only return enabled groups */
+ continue;
+ }
+
+ for (auto& entry : flow_group_map.sources) {
+ rgw_sync_bucket_pipe pipe = entry.second;
+ if (!pipe.dest.match_bucket(effective_bucket)) {
+ continue;
+ }
+
+ pipe.source.apply_bucket(effective_bucket);
+ pipe.dest.apply_bucket(effective_bucket);
+
+ if (is_forbidden) {
+ ldpp_dout(dpp, 20) << __func__ << "(): flow manager (bucket=" << effective_bucket_key << "): removing source pipe: " << pipe << dendl;
+ source_pipes->disable(pipe);
+ } else {
+ ldpp_dout(dpp, 20) << __func__ << "(): flow manager (bucket=" << effective_bucket_key << "): adding source pipe: " << pipe << dendl;
+ source_pipes->insert(pipe);
+ }
+ }
+
+ for (auto& entry : flow_group_map.dests) {
+ rgw_sync_bucket_pipe pipe = entry.second;
+
+ if (!pipe.source.match_bucket(effective_bucket)) {
+ continue;
+ }
+
+ pipe.source.apply_bucket(effective_bucket);
+ pipe.dest.apply_bucket(effective_bucket);
+
+ if (is_forbidden) {
+ ldpp_dout(dpp, 20) << __func__ << "(): flow manager (bucket=" << effective_bucket_key << "): removing dest pipe: " << pipe << dendl;
+ dest_pipes->disable(pipe);
+ } else {
+ ldpp_dout(dpp, 20) << __func__ << "(): flow manager (bucket=" << effective_bucket_key << "): adding dest pipe: " << pipe << dendl;
+ dest_pipes->insert(pipe);
+ }
+ }
+ }
+}
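+
+/* A minimal sketch of the precedence table above (illustrative only; the
+ * helper below is not part of this change and its name is hypothetical, but
+ * the decision it encodes matches rules 1 and 2):
+ *
+ *   using Status = rgw_sync_policy_group::Status;
+ *
+ *   static bool sync_enabled(Status zonegroup, Status bucket) {
+ *     if (zonegroup == Status::FORBIDDEN || bucket == Status::FORBIDDEN) {
+ *       return false;                    // rule 1: any FORBIDDEN disables sync
+ *     }
+ *     return zonegroup == Status::ENABLED ||
+ *            bucket == Status::ENABLED;  // rule 2: need at least one ENABLED
+ *   }
+ */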
+
+
+RGWBucketSyncFlowManager::RGWBucketSyncFlowManager(CephContext *_cct,
+ const rgw_zone_id& _zone_id,
+ std::optional<rgw_bucket> _bucket,
+ const RGWBucketSyncFlowManager *_parent) : cct(_cct),
+ zone_id(_zone_id),
+ bucket(_bucket),
+ parent(_parent) {}
+
+
+void RGWSyncPolicyCompat::convert_old_sync_config(RGWSI_Zone *zone_svc,
+ RGWSI_SyncModules *sync_modules_svc,
+ rgw_sync_policy_info *ppolicy)
+{
+ bool found = false;
+
+ rgw_sync_policy_info policy;
+
+ auto& group = policy.groups["default"];
+ auto& zonegroup = zone_svc->get_zonegroup();
+
+ for (const auto& ziter1 : zonegroup.zones) {
+ auto& id1 = ziter1.first;
+ const RGWZone& z1 = ziter1.second;
+
+ for (const auto& ziter2 : zonegroup.zones) {
+ auto& id2 = ziter2.first;
+ const RGWZone& z2 = ziter2.second;
+
+ if (id1 == id2) {
+ continue;
+ }
+
+ if (z1.syncs_from(z2.name)) {
+ found = true;
+ rgw_sync_directional_rule *rule;
+ group.data_flow.find_or_create_directional(id2,
+ id1,
+ &rule);
+ }
+ }
+ }
+
+ if (!found) { /* nothing syncs */
+ return;
+ }
+
+ rgw_sync_bucket_pipes pipes;
+ pipes.id = "all";
+ pipes.source.all_zones = true;
+ pipes.dest.all_zones = true;
+
+ group.pipes.emplace_back(std::move(pipes));
+
+
+ group.status = rgw_sync_policy_group::Status::ENABLED;
+
+ *ppolicy = std::move(policy);
+}
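+
+/* For illustration: a zonegroup with two zones "a" and "b" that sync from each
+ * other converts, roughly, into a single-group policy of this shape (rendered
+ * informally; the exact serialization and field names are an assumption):
+ *
+ *   group "default":
+ *     data_flow.directional = [ {source_zone: "a", dest_zone: "b"},
+ *                               {source_zone: "b", dest_zone: "a"} ]
+ *     pipes  = [ { id: "all", source.all_zones: true, dest.all_zones: true } ]
+ *     status = ENABLED
+ */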
+
+RGWBucketSyncPolicyHandler::RGWBucketSyncPolicyHandler(RGWSI_Zone *_zone_svc,
+ RGWSI_SyncModules *sync_modules_svc,
+ RGWSI_Bucket_Sync *_bucket_sync_svc,
+ std::optional<rgw_zone_id> effective_zone) : zone_svc(_zone_svc) ,
+ bucket_sync_svc(_bucket_sync_svc) {
+ zone_id = effective_zone.value_or(zone_svc->zone_id());
+ flow_mgr.reset(new RGWBucketSyncFlowManager(zone_svc->ctx(),
+ zone_id,
+ nullopt,
+ nullptr));
+ sync_policy = zone_svc->get_zonegroup().sync_policy;
+
+ if (sync_policy.empty()) {
+ RGWSyncPolicyCompat::convert_old_sync_config(zone_svc, sync_modules_svc, &sync_policy);
+ legacy_config = true;
+ }
+}
+
+RGWBucketSyncPolicyHandler::RGWBucketSyncPolicyHandler(const RGWBucketSyncPolicyHandler *_parent,
+ const RGWBucketInfo& _bucket_info,
+ map<string, bufferlist>&& _bucket_attrs) : parent(_parent),
+ bucket_info(_bucket_info),
+ bucket_attrs(std::move(_bucket_attrs)) {
+ if (_bucket_info.sync_policy) {
+ sync_policy = *_bucket_info.sync_policy;
+
+ for (auto& entry : sync_policy.groups) {
+ for (auto& pipe : entry.second.pipes) {
+ if (pipe.params.mode == rgw_sync_pipe_params::MODE_USER &&
+ pipe.params.user.empty()) {
+ pipe.params.user = _bucket_info.owner;
+ }
+ }
+ }
+ }
+ legacy_config = parent->legacy_config;
+ bucket = _bucket_info.bucket;
+ zone_svc = parent->zone_svc;
+ bucket_sync_svc = parent->bucket_sync_svc;
+ flow_mgr.reset(new RGWBucketSyncFlowManager(zone_svc->ctx(),
+ parent->zone_id,
+ _bucket_info.bucket,
+ parent->flow_mgr.get()));
+}
+
+RGWBucketSyncPolicyHandler::RGWBucketSyncPolicyHandler(const RGWBucketSyncPolicyHandler *_parent,
+ const rgw_bucket& _bucket,
+ std::optional<rgw_sync_policy_info> _sync_policy) : parent(_parent) {
+ if (_sync_policy) {
+ sync_policy = *_sync_policy;
+ }
+ legacy_config = parent->legacy_config;
+ bucket = _bucket;
+ zone_svc = parent->zone_svc;
+ bucket_sync_svc = parent->bucket_sync_svc;
+ flow_mgr.reset(new RGWBucketSyncFlowManager(zone_svc->ctx(),
+ parent->zone_id,
+ _bucket,
+ parent->flow_mgr.get()));
+}
+
+RGWBucketSyncPolicyHandler *RGWBucketSyncPolicyHandler::alloc_child(const RGWBucketInfo& bucket_info,
+ map<string, bufferlist>&& bucket_attrs) const
+{
+ return new RGWBucketSyncPolicyHandler(this, bucket_info, std::move(bucket_attrs));
+}
+
+RGWBucketSyncPolicyHandler *RGWBucketSyncPolicyHandler::alloc_child(const rgw_bucket& bucket,
+ std::optional<rgw_sync_policy_info> sync_policy) const
+{
+ return new RGWBucketSyncPolicyHandler(this, bucket, sync_policy);
+}
+
+int RGWBucketSyncPolicyHandler::init(const DoutPrefixProvider *dpp, optional_yield y)
+{
+ int r = bucket_sync_svc->get_bucket_sync_hints(dpp, bucket.value_or(rgw_bucket()),
+ &source_hints,
+ &target_hints,
+ y);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to initialize bucket sync policy handler: get_bucket_sync_hints() on bucket="
+ << bucket << " returned r=" << r << dendl;
+ return r;
+ }
+
+ flow_mgr->init(dpp, sync_policy);
+
+ reflect(dpp, &source_pipes,
+ &target_pipes,
+ &sources,
+ &targets,
+ &source_zones,
+ &target_zones,
+ true);
+
+ return 0;
+}
+
+void RGWBucketSyncPolicyHandler::reflect(const DoutPrefixProvider *dpp, RGWBucketSyncFlowManager::pipe_set *psource_pipes,
+ RGWBucketSyncFlowManager::pipe_set *ptarget_pipes,
+ map<rgw_zone_id, RGWBucketSyncFlowManager::pipe_set> *psources,
+ map<rgw_zone_id, RGWBucketSyncFlowManager::pipe_set> *ptargets,
+ std::set<rgw_zone_id> *psource_zones,
+ std::set<rgw_zone_id> *ptarget_zones,
+ bool only_enabled) const
+{
+ RGWBucketSyncFlowManager::pipe_set _source_pipes;
+ RGWBucketSyncFlowManager::pipe_set _target_pipes;
+ map<rgw_zone_id, RGWBucketSyncFlowManager::pipe_set> _sources;
+ map<rgw_zone_id, RGWBucketSyncFlowManager::pipe_set> _targets;
+ std::set<rgw_zone_id> _source_zones;
+ std::set<rgw_zone_id> _target_zones;
+
+ flow_mgr->reflect(dpp, bucket, &_source_pipes, &_target_pipes, only_enabled);
+
+ for (auto& entry : _source_pipes.pipe_map) {
+ auto& pipe = entry.second;
+ if (!pipe.source.zone) {
+ continue;
+ }
+ _source_zones.insert(*pipe.source.zone);
+ _sources[*pipe.source.zone].insert(pipe);
+ }
+
+ for (auto& entry : _target_pipes.pipe_map) {
+ auto& pipe = entry.second;
+ if (!pipe.dest.zone) {
+ continue;
+ }
+ _target_zones.insert(*pipe.dest.zone);
+ _targets[*pipe.dest.zone].insert(pipe);
+ }
+
+ if (psource_pipes) {
+ *psource_pipes = std::move(_source_pipes);
+ }
+ if (ptarget_pipes) {
+ *ptarget_pipes = std::move(_target_pipes);
+ }
+ if (psources) {
+ *psources = std::move(_sources);
+ }
+ if (ptargets) {
+ *ptargets = std::move(_targets);
+ }
+ if (psource_zones) {
+ *psource_zones = std::move(_source_zones);
+ }
+ if (ptarget_zones) {
+ *ptarget_zones = std::move(_target_zones);
+ }
+}
+
+multimap<rgw_zone_id, rgw_sync_bucket_pipe> RGWBucketSyncPolicyHandler::get_all_sources() const
+{
+ multimap<rgw_zone_id, rgw_sync_bucket_pipe> m;
+
+ for (auto& source_entry : sources) {
+ auto& zone_id = source_entry.first;
+
+ auto& pipes = source_entry.second.pipe_map;
+
+ for (auto& entry : pipes) {
+ auto& pipe = entry.second;
+ m.insert(make_pair(zone_id, pipe));
+ }
+ }
+
+ for (auto& pipe : resolved_sources) {
+ if (!pipe.source.zone) {
+ continue;
+ }
+
+ m.insert(make_pair(*pipe.source.zone, pipe));
+ }
+
+ return m;
+}
+
+multimap<rgw_zone_id, rgw_sync_bucket_pipe> RGWBucketSyncPolicyHandler::get_all_dests() const
+{
+ multimap<rgw_zone_id, rgw_sync_bucket_pipe> m;
+
+ for (auto& dest_entry : targets) {
+ auto& zone_id = dest_entry.first;
+
+ auto& pipes = dest_entry.second.pipe_map;
+
+ for (auto& entry : pipes) {
+ auto& pipe = entry.second;
+ m.insert(make_pair(zone_id, pipe));
+ }
+ }
+
+ for (auto& pipe : resolved_dests) {
+ if (!pipe.dest.zone) {
+ continue;
+ }
+
+ m.insert(make_pair(*pipe.dest.zone, pipe));
+ }
+
+ return m;
+}
+
+multimap<rgw_zone_id, rgw_sync_bucket_pipe> RGWBucketSyncPolicyHandler::get_all_dests_in_zone(const rgw_zone_id& zone_id) const
+{
+ multimap<rgw_zone_id, rgw_sync_bucket_pipe> m;
+
+ auto iter = targets.find(zone_id);
+ if (iter != targets.end()) {
+ auto& pipes = iter->second.pipe_map;
+
+ for (auto& entry : pipes) {
+ auto& pipe = entry.second;
+ m.insert(make_pair(zone_id, pipe));
+ }
+ }
+
+ for (auto& pipe : resolved_dests) {
+ if (!pipe.dest.zone ||
+ *pipe.dest.zone != zone_id) {
+ continue;
+ }
+
+ m.insert(make_pair(*pipe.dest.zone, pipe));
+ }
+
+ return m;
+}
+
+void RGWBucketSyncPolicyHandler::get_pipes(std::set<rgw_sync_bucket_pipe> *_sources, std::set<rgw_sync_bucket_pipe> *_targets,
+ std::optional<rgw_sync_bucket_entity> filter_peer) { /* return raw pipes */
+ for (auto& entry : source_pipes.pipe_map) {
+ auto& source_pipe = entry.second;
+ if (!filter_peer ||
+ source_pipe.source.match(*filter_peer)) {
+ _sources->insert(source_pipe);
+ }
+ }
+
+ for (auto& entry : target_pipes.pipe_map) {
+ auto& target_pipe = entry.second;
+ if (!filter_peer ||
+ target_pipe.dest.match(*filter_peer)) {
+ _targets->insert(target_pipe);
+ }
+ }
+}
+
+bool RGWBucketSyncPolicyHandler::bucket_exports_data() const
+{
+ if (!bucket) {
+ return false;
+ }
+
+ if (!zone_svc->sync_module_exports_data()) {
+ return false;
+ }
+
+ if (bucket_is_sync_source()) {
+ return true;
+ }
+
+ return (zone_svc->need_to_log_data() &&
+ bucket_info->datasync_flag_enabled());
+}
+
+bool RGWBucketSyncPolicyHandler::bucket_imports_data() const
+{
+ return bucket_is_sync_target();
+}
+
diff --git a/src/rgw/driver/rados/rgw_bucket_sync.h b/src/rgw/driver/rados/rgw_bucket_sync.h
new file mode 100644
index 000000000..d425ecf17
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_bucket_sync.h
@@ -0,0 +1,416 @@
+
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2018 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include "rgw_common.h"
+#include "rgw_sync_policy.h"
+
+class RGWSI_Zone;
+class RGWSI_SyncModules;
+class RGWSI_Bucket_Sync;
+
+struct rgw_sync_group_pipe_map;
+struct rgw_sync_bucket_pipes;
+struct rgw_sync_policy_info;
+
+struct rgw_sync_group_pipe_map {
+ rgw_zone_id zone;
+ std::optional<rgw_bucket> bucket;
+
+ rgw_sync_policy_group::Status status{rgw_sync_policy_group::Status::UNKNOWN};
+
+ using zb_pipe_map_t = std::multimap<rgw_sync_bucket_entity, rgw_sync_bucket_pipe>;
+
+ zb_pipe_map_t sources; /* all the pipes where zone is pulling from */
+ zb_pipe_map_t dests; /* all the pipes that pull from zone */
+
+ std::set<rgw_zone_id> *pall_zones{nullptr};
+ rgw_sync_data_flow_group *default_flow{nullptr}; /* flow to use if policy doesn't define it,
+ used in the case of bucket sync policy, not at the
+ zonegroup level */
+
+ void dump(ceph::Formatter *f) const;
+
+ template <typename CB1, typename CB2>
+ void try_add_to_pipe_map(const rgw_zone_id& source_zone,
+ const rgw_zone_id& dest_zone,
+ const std::vector<rgw_sync_bucket_pipes>& pipes,
+ zb_pipe_map_t *pipe_map,
+ CB1 filter_cb,
+ CB2 call_filter_cb);
+
+ template <typename CB>
+ void try_add_source(const rgw_zone_id& source_zone,
+ const rgw_zone_id& dest_zone,
+ const std::vector<rgw_sync_bucket_pipes>& pipes,
+ CB filter_cb);
+
+ template <typename CB>
+ void try_add_dest(const rgw_zone_id& source_zone,
+ const rgw_zone_id& dest_zone,
+ const std::vector<rgw_sync_bucket_pipes>& pipes,
+ CB filter_cb);
+
+ std::pair<zb_pipe_map_t::const_iterator, zb_pipe_map_t::const_iterator> find_pipes(const zb_pipe_map_t& m,
+ const rgw_zone_id& zone,
+ std::optional<rgw_bucket> b) const;
+
+ template <typename CB>
+ void init(const DoutPrefixProvider *dpp, CephContext *cct,
+ const rgw_zone_id& _zone,
+ std::optional<rgw_bucket> _bucket,
+ const rgw_sync_policy_group& group,
+ rgw_sync_data_flow_group *_default_flow,
+ std::set<rgw_zone_id> *_pall_zones,
+ CB filter_cb);
+
+ /*
+ * find all relevant pipes in our zone that match {dest_bucket} <- {source_zone, source_bucket}
+ */
+ std::vector<rgw_sync_bucket_pipe> find_source_pipes(const rgw_zone_id& source_zone,
+ std::optional<rgw_bucket> source_bucket,
+ std::optional<rgw_bucket> dest_bucket) const;
+
+ /*
+ * find all relevant pipes in other zones that pull from a specific
+ * source bucket in our zone {source_bucket} -> {dest_zone, dest_bucket}
+ */
+ std::vector<rgw_sync_bucket_pipe> find_dest_pipes(std::optional<rgw_bucket> source_bucket,
+ const rgw_zone_id& dest_zone,
+ std::optional<rgw_bucket> dest_bucket) const;
+
+ /*
+ * find all relevant pipes from {source_zone, source_bucket} -> {dest_zone, dest_bucket}
+ */
+ std::vector<rgw_sync_bucket_pipe> find_pipes(const rgw_zone_id& source_zone,
+ std::optional<rgw_bucket> source_bucket,
+ const rgw_zone_id& dest_zone,
+ std::optional<rgw_bucket> dest_bucket) const;
+};
+
+class RGWSyncPolicyCompat {
+public:
+ static void convert_old_sync_config(RGWSI_Zone *zone_svc,
+ RGWSI_SyncModules *sync_modules_svc,
+ rgw_sync_policy_info *ppolicy);
+};
+
+class RGWBucketSyncFlowManager {
+ friend class RGWBucketSyncPolicyHandler;
+public:
+ struct endpoints_pair {
+ rgw_sync_bucket_entity source;
+ rgw_sync_bucket_entity dest;
+
+ endpoints_pair() {}
+ endpoints_pair(const rgw_sync_bucket_pipe& pipe) {
+ source = pipe.source;
+ dest = pipe.dest;
+ }
+
+ bool operator<(const endpoints_pair& e) const {
+ if (source < e.source) {
+ return true;
+ }
+ if (e.source < source) {
+ return false;
+ }
+ return (dest < e.dest);
+ }
+ };
+
+ /*
+ * pipe_rules: deals with a set of pipes that share a common endpoints_pair
+ */
+ class pipe_rules {
+ std::list<rgw_sync_bucket_pipe> pipes;
+
+ public:
+ using prefix_map_t = std::multimap<std::string, rgw_sync_bucket_pipe *>;
+
+ std::map<std::string, rgw_sync_bucket_pipe *> tag_refs;
+ prefix_map_t prefix_refs;
+
+ void insert(const rgw_sync_bucket_pipe& pipe);
+
+ bool find_basic_info_without_tags(const rgw_obj_key& key,
+ std::optional<rgw_user> *user,
+ std::optional<rgw_user> *acl_translation,
+ std::optional<std::string> *storage_class,
+ rgw_sync_pipe_params::Mode *mode,
+ bool *need_more_info) const;
+ bool find_obj_params(const rgw_obj_key& key,
+ const RGWObjTags::tag_map_t& tags,
+ rgw_sync_pipe_params *params) const;
+
+ void scan_prefixes(std::vector<std::string> *prefixes) const;
+
+ prefix_map_t::const_iterator prefix_begin() const {
+ return prefix_refs.begin();
+ }
+ prefix_map_t::const_iterator prefix_search(const std::string& s) const;
+ prefix_map_t::const_iterator prefix_end() const {
+ return prefix_refs.end();
+ }
+ };
+
+ using pipe_rules_ref = std::shared_ptr<pipe_rules>;
+
+ /*
+ * pipe_handler: extends endpoints_pair to point at the corresponding rules handler
+ */
+ struct pipe_handler : public endpoints_pair {
+ pipe_rules_ref rules;
+
+ pipe_handler() {}
+ pipe_handler(pipe_rules_ref& _rules,
+ const rgw_sync_bucket_pipe& _pipe) : endpoints_pair(_pipe),
+ rules(_rules) {}
+ bool specific() const {
+ return source.specific() && dest.specific();
+ }
+
+ bool find_basic_info_without_tags(const rgw_obj_key& key,
+ std::optional<rgw_user> *user,
+ std::optional<rgw_user> *acl_translation,
+ std::optional<std::string> *storage_class,
+ rgw_sync_pipe_params::Mode *mode,
+ bool *need_more_info) const {
+ if (!rules) {
+ return false;
+ }
+ return rules->find_basic_info_without_tags(key, user, acl_translation, storage_class, mode, need_more_info);
+ }
+
+ bool find_obj_params(const rgw_obj_key& key,
+ const RGWObjTags::tag_map_t& tags,
+ rgw_sync_pipe_params *params) const {
+ if (!rules) {
+ return false;
+ }
+ return rules->find_obj_params(key, tags, params);
+ }
+ };
+
+ struct pipe_set {
+ std::map<endpoints_pair, pipe_rules_ref> rules;
+ std::multimap<std::string, rgw_sync_bucket_pipe> pipe_map;
+ std::multimap<std::string, rgw_sync_bucket_pipe> disabled_pipe_map;
+
+ std::set<pipe_handler> handlers;
+
+ using iterator = std::set<pipe_handler>::iterator;
+
+ void clear() {
+ rules.clear();
+ pipe_map.clear();
+ disabled_pipe_map.clear();
+ handlers.clear();
+ }
+
+ void insert(const rgw_sync_bucket_pipe& pipe);
+ void remove_all();
+ void disable(const rgw_sync_bucket_pipe& pipe);
+
+ iterator begin() const {
+ return handlers.begin();
+ }
+
+ iterator end() const {
+ return handlers.end();
+ }
+
+ void dump(ceph::Formatter *f) const;
+ };
+
+private:
+
+ CephContext *cct;
+
+ rgw_zone_id zone_id;
+ std::optional<rgw_bucket> bucket;
+
+ const RGWBucketSyncFlowManager *parent{nullptr};
+
+ std::map<std::string, rgw_sync_group_pipe_map> flow_groups;
+
+ std::set<rgw_zone_id> all_zones;
+
+ bool allowed_data_flow(const rgw_zone_id& source_zone,
+ std::optional<rgw_bucket> source_bucket,
+ const rgw_zone_id& dest_zone,
+ std::optional<rgw_bucket> dest_bucket,
+ bool check_activated) const;
+
+ /*
+ * find all the matching flows in a flow map for a specific bucket
+ */
+ void update_flow_maps(const rgw_sync_bucket_pipes& pipe);
+
+ void init(const DoutPrefixProvider *dpp, const rgw_sync_policy_info& sync_policy);
+
+public:
+
+ RGWBucketSyncFlowManager(CephContext *_cct,
+ const rgw_zone_id& _zone_id,
+ std::optional<rgw_bucket> _bucket,
+ const RGWBucketSyncFlowManager *_parent);
+
+ void reflect(const DoutPrefixProvider *dpp, std::optional<rgw_bucket> effective_bucket,
+ pipe_set *flow_by_source,
+ pipe_set *flow_by_dest,
+ bool only_enabled) const;
+
+};
+
+static inline std::ostream& operator<<(std::ostream& os, const RGWBucketSyncFlowManager::endpoints_pair& e) {
+ os << e.dest << " -> " << e.source;
+ return os;
+}
+
+class RGWBucketSyncPolicyHandler {
+ bool legacy_config{false};
+ const RGWBucketSyncPolicyHandler *parent{nullptr};
+ RGWSI_Zone *zone_svc;
+ RGWSI_Bucket_Sync *bucket_sync_svc;
+ rgw_zone_id zone_id;
+ std::optional<RGWBucketInfo> bucket_info;
+ std::optional<std::map<std::string, bufferlist> > bucket_attrs;
+ std::optional<rgw_bucket> bucket;
+ std::unique_ptr<RGWBucketSyncFlowManager> flow_mgr;
+ rgw_sync_policy_info sync_policy;
+
+ RGWBucketSyncFlowManager::pipe_set source_pipes;
+ RGWBucketSyncFlowManager::pipe_set target_pipes;
+
+ std::map<rgw_zone_id, RGWBucketSyncFlowManager::pipe_set> sources; /* source pipes by source zone id */
+ std::map<rgw_zone_id, RGWBucketSyncFlowManager::pipe_set> targets; /* target pipes by target zone id */
+
+ std::set<rgw_zone_id> source_zones;
+ std::set<rgw_zone_id> target_zones;
+
+ std::set<rgw_bucket> source_hints;
+ std::set<rgw_bucket> target_hints;
+ std::set<rgw_sync_bucket_pipe> resolved_sources;
+ std::set<rgw_sync_bucket_pipe> resolved_dests;
+
+
+ bool bucket_is_sync_source() const {
+ return !targets.empty() || !resolved_dests.empty();
+ }
+
+ bool bucket_is_sync_target() const {
+ return !sources.empty() || !resolved_sources.empty();
+ }
+
+ RGWBucketSyncPolicyHandler(const RGWBucketSyncPolicyHandler *_parent,
+ const RGWBucketInfo& _bucket_info,
+ std::map<std::string, bufferlist>&& _bucket_attrs);
+
+ RGWBucketSyncPolicyHandler(const RGWBucketSyncPolicyHandler *_parent,
+ const rgw_bucket& _bucket,
+ std::optional<rgw_sync_policy_info> _sync_policy);
+public:
+ RGWBucketSyncPolicyHandler(RGWSI_Zone *_zone_svc,
+ RGWSI_SyncModules *sync_modules_svc,
+ RGWSI_Bucket_Sync *bucket_sync_svc,
+ std::optional<rgw_zone_id> effective_zone = std::nullopt);
+
+ RGWBucketSyncPolicyHandler *alloc_child(const RGWBucketInfo& bucket_info,
+ std::map<std::string, bufferlist>&& bucket_attrs) const;
+ RGWBucketSyncPolicyHandler *alloc_child(const rgw_bucket& bucket,
+ std::optional<rgw_sync_policy_info> sync_policy) const;
+
+ int init(const DoutPrefixProvider *dpp, optional_yield y);
+
+ void reflect(const DoutPrefixProvider *dpp, RGWBucketSyncFlowManager::pipe_set *psource_pipes,
+ RGWBucketSyncFlowManager::pipe_set *ptarget_pipes,
+ std::map<rgw_zone_id, RGWBucketSyncFlowManager::pipe_set> *psources,
+ std::map<rgw_zone_id, RGWBucketSyncFlowManager::pipe_set> *ptargets,
+ std::set<rgw_zone_id> *psource_zones,
+ std::set<rgw_zone_id> *ptarget_zones,
+ bool only_enabled) const;
+
+ void set_resolved_hints(std::set<rgw_sync_bucket_pipe>&& _resolved_sources,
+ std::set<rgw_sync_bucket_pipe>&& _resolved_dests) {
+ resolved_sources = std::move(_resolved_sources);
+ resolved_dests = std::move(_resolved_dests);
+ }
+
+ const std::set<rgw_sync_bucket_pipe>& get_resolved_source_hints() {
+ return resolved_sources;
+ }
+
+ const std::set<rgw_sync_bucket_pipe>& get_resolved_dest_hints() {
+ return resolved_dests;
+ }
+
+ const std::set<rgw_zone_id>& get_source_zones() const {
+ return source_zones;
+ }
+
+ const std::set<rgw_zone_id>& get_target_zones() const {
+ return target_zones;
+ }
+
+ const std::map<rgw_zone_id, RGWBucketSyncFlowManager::pipe_set>& get_sources() {
+ return sources;
+ }
+
+ std::multimap<rgw_zone_id, rgw_sync_bucket_pipe> get_all_sources() const;
+ std::multimap<rgw_zone_id, rgw_sync_bucket_pipe> get_all_dests() const;
+ std::multimap<rgw_zone_id, rgw_sync_bucket_pipe> get_all_dests_in_zone(const rgw_zone_id& zone_id) const;
+
+ const std::map<rgw_zone_id, RGWBucketSyncFlowManager::pipe_set>& get_targets() {
+ return targets;
+ }
+
+ const std::optional<RGWBucketInfo>& get_bucket_info() const {
+ return bucket_info;
+ }
+
+ const std::optional<std::map<std::string, bufferlist> >& get_bucket_attrs() const {
+ return bucket_attrs;
+ }
+
+ void get_pipes(RGWBucketSyncFlowManager::pipe_set **_sources, RGWBucketSyncFlowManager::pipe_set **_targets) { /* return raw pipes (with zone name) */
+ *_sources = &source_pipes;
+ *_targets = &target_pipes;
+ }
+ void get_pipes(std::set<rgw_sync_bucket_pipe> *sources, std::set<rgw_sync_bucket_pipe> *targets,
+ std::optional<rgw_sync_bucket_entity> filter_peer);
+
+ const std::set<rgw_bucket>& get_source_hints() const {
+ return source_hints;
+ }
+
+ const std::set<rgw_bucket>& get_target_hints() const {
+ return target_hints;
+ }
+
+ bool bucket_exports_data() const;
+ bool bucket_imports_data() const;
+
+ const rgw_sync_policy_info& get_sync_policy() const {
+ return sync_policy;
+ }
+
+ bool is_legacy_config() const {
+ return legacy_config;
+ }
+};
+
diff --git a/src/rgw/driver/rados/rgw_cr_rados.cc b/src/rgw/driver/rados/rgw_cr_rados.cc
new file mode 100644
index 000000000..d8e0ecba6
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_cr_rados.cc
@@ -0,0 +1,1165 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "include/compat.h"
+#include "rgw_sal.h"
+#include "rgw_zone.h"
+#include "rgw_coroutine.h"
+#include "rgw_cr_rados.h"
+#include "rgw_sync_counters.h"
+#include "rgw_bucket.h"
+#include "rgw_datalog_notify.h"
+#include "rgw_cr_rest.h"
+#include "rgw_rest_conn.h"
+#include "rgw_rados.h"
+
+#include "services/svc_zone.h"
+#include "services/svc_zone_utils.h"
+#include "services/svc_sys_obj.h"
+#include "services/svc_cls.h"
+
+#include "cls/lock/cls_lock_client.h"
+#include "cls/rgw/cls_rgw_client.h"
+
+#include <boost/asio/yield.hpp>
+#include <boost/container/flat_set.hpp>
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+bool RGWAsyncRadosProcessor::RGWWQ::_enqueue(RGWAsyncRadosRequest *req) {
+ if (processor->is_going_down()) {
+ return false;
+ }
+ req->get();
+ processor->m_req_queue.push_back(req);
+ dout(20) << "enqueued request req=" << hex << req << dec << dendl;
+ _dump_queue();
+ return true;
+}
+
+bool RGWAsyncRadosProcessor::RGWWQ::_empty() {
+ return processor->m_req_queue.empty();
+}
+
+RGWAsyncRadosRequest *RGWAsyncRadosProcessor::RGWWQ::_dequeue() {
+ if (processor->m_req_queue.empty())
+ return NULL;
+ RGWAsyncRadosRequest *req = processor->m_req_queue.front();
+ processor->m_req_queue.pop_front();
+ dout(20) << "dequeued request req=" << hex << req << dec << dendl;
+ _dump_queue();
+ return req;
+}
+
+void RGWAsyncRadosProcessor::RGWWQ::_process(RGWAsyncRadosRequest *req, ThreadPool::TPHandle& handle) {
+ processor->handle_request(this, req);
+ processor->req_throttle.put(1);
+}
+
+void RGWAsyncRadosProcessor::RGWWQ::_dump_queue() {
+ if (!g_conf()->subsys.should_gather<ceph_subsys_rgw, 20>()) {
+ return;
+ }
+ deque<RGWAsyncRadosRequest *>::iterator iter;
+ if (processor->m_req_queue.empty()) {
+ dout(20) << "RGWWQ: empty" << dendl;
+ return;
+ }
+ dout(20) << "RGWWQ:" << dendl;
+ for (iter = processor->m_req_queue.begin(); iter != processor->m_req_queue.end(); ++iter) {
+ dout(20) << "req: " << hex << *iter << dec << dendl;
+ }
+}
+
+RGWAsyncRadosProcessor::RGWAsyncRadosProcessor(CephContext *_cct, int num_threads)
+ : cct(_cct), m_tp(cct, "RGWAsyncRadosProcessor::m_tp", "rados_async", num_threads),
+ req_throttle(_cct, "rgw_async_rados_ops", num_threads * 2),
+ req_wq(this,
+ ceph::make_timespan(g_conf()->rgw_op_thread_timeout),
+ ceph::make_timespan(g_conf()->rgw_op_thread_suicide_timeout),
+ &m_tp) {
+}
+
+void RGWAsyncRadosProcessor::start() {
+ m_tp.start();
+}
+
+void RGWAsyncRadosProcessor::stop() {
+ going_down = true;
+ m_tp.drain(&req_wq);
+ m_tp.stop();
+ for (auto iter = m_req_queue.begin(); iter != m_req_queue.end(); ++iter) {
+ (*iter)->put();
+ }
+}
+
+void RGWAsyncRadosProcessor::handle_request(const DoutPrefixProvider *dpp, RGWAsyncRadosRequest *req) {
+ req->send_request(dpp);
+ req->put();
+}
+
+void RGWAsyncRadosProcessor::queue(RGWAsyncRadosRequest *req) {
+ req_throttle.get(1);
+ req_wq.queue(req);
+}
+
+int RGWAsyncGetSystemObj::_send_request(const DoutPrefixProvider *dpp)
+{
+ map<string, bufferlist> *pattrs = want_attrs ? &attrs : nullptr;
+
+ auto sysobj = svc_sysobj->get_obj(obj);
+ return sysobj.rop()
+ .set_objv_tracker(&objv_tracker)
+ .set_attrs(pattrs)
+ .set_raw_attrs(raw_attrs)
+ .read(dpp, &bl, null_yield);
+}
+
+RGWAsyncGetSystemObj::RGWAsyncGetSystemObj(const DoutPrefixProvider *_dpp, RGWCoroutine *caller, RGWAioCompletionNotifier *cn, RGWSI_SysObj *_svc,
+ RGWObjVersionTracker *_objv_tracker, const rgw_raw_obj& _obj,
+ bool want_attrs, bool raw_attrs)
+ : RGWAsyncRadosRequest(caller, cn), dpp(_dpp), svc_sysobj(_svc),
+ obj(_obj), want_attrs(want_attrs), raw_attrs(raw_attrs)
+{
+ if (_objv_tracker) {
+ objv_tracker = *_objv_tracker;
+ }
+}
+
+int RGWSimpleRadosReadAttrsCR::send_request(const DoutPrefixProvider *dpp)
+{
+ int r = store->getRados()->get_raw_obj_ref(dpp, obj, &ref);
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << "ERROR: failed to get ref for (" << obj << ") ret="
+ << r << dendl;
+ return r;
+ }
+
+ set_status() << "sending request";
+
+ librados::ObjectReadOperation op;
+ if (objv_tracker) {
+ objv_tracker->prepare_op_for_read(&op);
+ }
+
+ if (raw_attrs && pattrs) {
+ op.getxattrs(pattrs, nullptr);
+ } else {
+ op.getxattrs(&unfiltered_attrs, nullptr);
+ }
+
+ cn = stack->create_completion_notifier();
+ return ref.pool.ioctx().aio_operate(ref.obj.oid, cn->completion(), &op,
+ nullptr);
+}
+
+int RGWSimpleRadosReadAttrsCR::request_complete()
+{
+ int ret = cn->completion()->get_return_value();
+ set_status() << "request complete; ret=" << ret;
+ if (!raw_attrs && pattrs) {
+ rgw_filter_attrset(unfiltered_attrs, RGW_ATTR_PREFIX, pattrs);
+ }
+ return ret;
+}
+
+int RGWAsyncPutSystemObj::_send_request(const DoutPrefixProvider *dpp)
+{
+ auto sysobj = svc->get_obj(obj);
+ return sysobj.wop()
+ .set_objv_tracker(&objv_tracker)
+ .set_exclusive(exclusive)
+ .write_data(dpp, bl, null_yield);
+}
+
+RGWAsyncPutSystemObj::RGWAsyncPutSystemObj(const DoutPrefixProvider *_dpp,
+ RGWCoroutine *caller,
+ RGWAioCompletionNotifier *cn,
+ RGWSI_SysObj *_svc,
+ RGWObjVersionTracker *_objv_tracker, const rgw_raw_obj& _obj,
+ bool _exclusive, bufferlist _bl)
+ : RGWAsyncRadosRequest(caller, cn), dpp(_dpp), svc(_svc),
+ obj(_obj), exclusive(_exclusive), bl(std::move(_bl))
+{
+ if (_objv_tracker) {
+ objv_tracker = *_objv_tracker;
+ }
+}
+
+int RGWAsyncPutSystemObjAttrs::_send_request(const DoutPrefixProvider *dpp)
+{
+ auto sysobj = svc->get_obj(obj);
+ return sysobj.wop()
+ .set_objv_tracker(&objv_tracker)
+ .set_exclusive(exclusive)
+ .set_attrs(attrs)
+ .write_attrs(dpp, null_yield);
+}
+
+RGWAsyncPutSystemObjAttrs::RGWAsyncPutSystemObjAttrs(const DoutPrefixProvider *_dpp, RGWCoroutine *caller, RGWAioCompletionNotifier *cn,
+ RGWSI_SysObj *_svc,
+ RGWObjVersionTracker *_objv_tracker, const rgw_raw_obj& _obj,
+ map<string, bufferlist> _attrs, bool exclusive)
+ : RGWAsyncRadosRequest(caller, cn), dpp(_dpp), svc(_svc),
+ obj(_obj), attrs(std::move(_attrs)), exclusive(exclusive)
+{
+ if (_objv_tracker) {
+ objv_tracker = *_objv_tracker;
+ }
+}
+
+
+RGWOmapAppend::RGWOmapAppend(RGWAsyncRadosProcessor *_async_rados, rgw::sal::RadosStore* _store, const rgw_raw_obj& _obj,
+ uint64_t _window_size)
+ : RGWConsumerCR<string>(_store->ctx()), async_rados(_async_rados),
+ store(_store), obj(_obj), going_down(false), num_pending_entries(0), window_size(_window_size), total_entries(0)
+{
+}
+
+int RGWAsyncLockSystemObj::_send_request(const DoutPrefixProvider *dpp)
+{
+ rgw_rados_ref ref;
+ int r = store->getRados()->get_raw_obj_ref(dpp, obj, &ref);
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << "ERROR: failed to get ref for (" << obj << ") ret=" << r << dendl;
+ return r;
+ }
+
+ rados::cls::lock::Lock l(lock_name);
+ utime_t duration(duration_secs, 0);
+ l.set_duration(duration);
+ l.set_cookie(cookie);
+ l.set_may_renew(true);
+
+ return l.lock_exclusive(&ref.pool.ioctx(), ref.obj.oid);
+}
+
+RGWAsyncLockSystemObj::RGWAsyncLockSystemObj(RGWCoroutine *caller, RGWAioCompletionNotifier *cn, rgw::sal::RadosStore* _store,
+ RGWObjVersionTracker *_objv_tracker, const rgw_raw_obj& _obj,
+ const string& _name, const string& _cookie, uint32_t _duration_secs) : RGWAsyncRadosRequest(caller, cn), store(_store),
+ obj(_obj),
+ lock_name(_name),
+ cookie(_cookie),
+ duration_secs(_duration_secs)
+{
+}
+
+int RGWAsyncUnlockSystemObj::_send_request(const DoutPrefixProvider *dpp)
+{
+ rgw_rados_ref ref;
+ int r = store->getRados()->get_raw_obj_ref(dpp, obj, &ref);
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << "ERROR: failed to get ref for (" << obj << ") ret=" << r << dendl;
+ return r;
+ }
+
+ rados::cls::lock::Lock l(lock_name);
+
+ l.set_cookie(cookie);
+
+ return l.unlock(&ref.pool.ioctx(), ref.obj.oid);
+}
+
+RGWAsyncUnlockSystemObj::RGWAsyncUnlockSystemObj(RGWCoroutine *caller, RGWAioCompletionNotifier *cn, rgw::sal::RadosStore* _store,
+ RGWObjVersionTracker *_objv_tracker, const rgw_raw_obj& _obj,
+ const string& _name, const string& _cookie) : RGWAsyncRadosRequest(caller, cn), store(_store),
+ obj(_obj),
+ lock_name(_name), cookie(_cookie)
+{
+}
+
+RGWRadosSetOmapKeysCR::RGWRadosSetOmapKeysCR(rgw::sal::RadosStore* _store,
+ const rgw_raw_obj& _obj,
+ map<string, bufferlist>& _entries) : RGWSimpleCoroutine(_store->ctx()),
+ store(_store),
+ entries(_entries),
+ obj(_obj), cn(NULL)
+{
+ stringstream& s = set_description();
+ s << "set omap keys dest=" << obj << " keys=[";
+ for (auto i = entries.begin(); i != entries.end(); ++i) {
+ if (i != entries.begin()) {
+ s << ", ";
+ }
+ s << i->first;
+ }
+ s << "]";
+}
+
+int RGWRadosSetOmapKeysCR::send_request(const DoutPrefixProvider *dpp)
+{
+ int r = store->getRados()->get_raw_obj_ref(dpp, obj, &ref);
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << "ERROR: failed to get ref for (" << obj << ") ret=" << r << dendl;
+ return r;
+ }
+
+ set_status() << "sending request";
+
+ librados::ObjectWriteOperation op;
+ op.omap_set(entries);
+
+ cn = stack->create_completion_notifier();
+ return ref.pool.ioctx().aio_operate(ref.obj.oid, cn->completion(), &op);
+}
+
+int RGWRadosSetOmapKeysCR::request_complete()
+{
+ int r = cn->completion()->get_return_value();
+
+ set_status() << "request complete; ret=" << r;
+
+ return r;
+}
+
+RGWRadosGetOmapKeysCR::RGWRadosGetOmapKeysCR(rgw::sal::RadosStore* _store,
+ const rgw_raw_obj& _obj,
+ const string& _marker,
+ int _max_entries,
+ ResultPtr _result)
+ : RGWSimpleCoroutine(_store->ctx()), store(_store), obj(_obj),
+ marker(_marker), max_entries(_max_entries),
+ result(std::move(_result))
+{
+ ceph_assert(result); // must be allocated
+ set_description() << "get omap keys dest=" << obj << " marker=" << marker;
+}
+
+int RGWRadosGetOmapKeysCR::send_request(const DoutPrefixProvider *dpp) {
+ int r = store->getRados()->get_raw_obj_ref(dpp, obj, &result->ref);
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << "ERROR: failed to get ref for (" << obj << ") ret=" << r << dendl;
+ return r;
+ }
+
+ set_status() << "send request";
+
+ librados::ObjectReadOperation op;
+ op.omap_get_keys2(marker, max_entries, &result->entries, &result->more, nullptr);
+
+ cn = stack->create_completion_notifier(result);
+ return result->ref.pool.ioctx().aio_operate(result->ref.obj.oid, cn->completion(), &op, NULL);
+}
+
+int RGWRadosGetOmapKeysCR::request_complete()
+{
+ int r = cn->completion()->get_return_value();
+
+ set_status() << "request complete; ret=" << r;
+
+ return r;
+}
+
+RGWRadosGetOmapValsCR::RGWRadosGetOmapValsCR(rgw::sal::RadosStore* _store,
+ const rgw_raw_obj& _obj,
+ const string& _marker,
+ int _max_entries,
+ ResultPtr _result)
+ : RGWSimpleCoroutine(_store->ctx()), store(_store), obj(_obj),
+ marker(_marker), max_entries(_max_entries),
+ result(std::move(_result))
+{
+ ceph_assert(result); // must be allocated
+ set_description() << "get omap vals dest=" << obj << " marker=" << marker;
+}
+
+int RGWRadosGetOmapValsCR::send_request(const DoutPrefixProvider *dpp) {
+ int r = store->getRados()->get_raw_obj_ref(dpp, obj, &result->ref);
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << "ERROR: failed to get ref for (" << obj << ") ret=" << r << dendl;
+ return r;
+ }
+
+ set_status() << "send request";
+
+ librados::ObjectReadOperation op;
+ op.omap_get_vals2(marker, max_entries, &result->entries, &result->more, nullptr);
+
+ cn = stack->create_completion_notifier(result);
+ return result->ref.pool.ioctx().aio_operate(result->ref.obj.oid, cn->completion(), &op, NULL);
+}
+
+int RGWRadosGetOmapValsCR::request_complete()
+{
+ int r = cn->completion()->get_return_value();
+
+ set_status() << "request complete; ret=" << r;
+
+ return r;
+}
+
+RGWRadosRemoveOmapKeysCR::RGWRadosRemoveOmapKeysCR(rgw::sal::RadosStore* _store,
+ const rgw_raw_obj& _obj,
+ const set<string>& _keys) : RGWSimpleCoroutine(_store->ctx()),
+ store(_store),
+ keys(_keys),
+ obj(_obj), cn(NULL)
+{
+ set_description() << "remove omap keys dest=" << obj << " keys=" << keys;
+}
+
+int RGWRadosRemoveOmapKeysCR::send_request(const DoutPrefixProvider *dpp) {
+ int r = store->getRados()->get_raw_obj_ref(dpp, obj, &ref);
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << "ERROR: failed to get ref for (" << obj << ") ret=" << r << dendl;
+ return r;
+ }
+
+ set_status() << "send request";
+
+ librados::ObjectWriteOperation op;
+ op.omap_rm_keys(keys);
+
+ cn = stack->create_completion_notifier();
+ return ref.pool.ioctx().aio_operate(ref.obj.oid, cn->completion(), &op);
+}
+
+int RGWRadosRemoveOmapKeysCR::request_complete()
+{
+ int r = cn->completion()->get_return_value();
+
+ set_status() << "request complete; ret=" << r;
+
+ return r;
+}
+
+RGWRadosRemoveCR::RGWRadosRemoveCR(rgw::sal::RadosStore* store, const rgw_raw_obj& obj,
+ RGWObjVersionTracker* objv_tracker)
+ : RGWSimpleCoroutine(store->ctx()),
+ store(store), obj(obj), objv_tracker(objv_tracker)
+{
+ set_description() << "remove dest=" << obj;
+}
+
+int RGWRadosRemoveCR::send_request(const DoutPrefixProvider *dpp)
+{
+ auto rados = store->getRados()->get_rados_handle();
+ int r = rados->ioctx_create(obj.pool.name.c_str(), ioctx);
+ if (r < 0) {
+ lderr(cct) << "ERROR: failed to open pool (" << obj.pool.name << ") ret=" << r << dendl;
+ return r;
+ }
+ ioctx.locator_set_key(obj.loc);
+
+ set_status() << "send request";
+
+ librados::ObjectWriteOperation op;
+ if (objv_tracker) {
+ objv_tracker->prepare_op_for_write(&op);
+ }
+ op.remove();
+
+ cn = stack->create_completion_notifier();
+ return ioctx.aio_operate(obj.oid, cn->completion(), &op);
+}
+
+int RGWRadosRemoveCR::request_complete()
+{
+ int r = cn->completion()->get_return_value();
+
+ set_status() << "request complete; ret=" << r;
+
+ return r;
+}
+
+RGWRadosRemoveOidCR::RGWRadosRemoveOidCR(rgw::sal::RadosStore* store,
+ librados::IoCtx&& ioctx,
+ std::string_view oid,
+ RGWObjVersionTracker* objv_tracker)
+ : RGWSimpleCoroutine(store->ctx()), ioctx(std::move(ioctx)),
+ oid(std::string(oid)), objv_tracker(objv_tracker)
+{
+ set_description() << "remove dest=" << oid;
+}
+
+RGWRadosRemoveOidCR::RGWRadosRemoveOidCR(rgw::sal::RadosStore* store,
+ RGWSI_RADOS::Obj& obj,
+ RGWObjVersionTracker* objv_tracker)
+ : RGWSimpleCoroutine(store->ctx()),
+ ioctx(librados::IoCtx(obj.get_ref().pool.ioctx())),
+ oid(obj.get_ref().obj.oid),
+ objv_tracker(objv_tracker)
+{
+ set_description() << "remove dest=" << oid;
+}
+
+RGWRadosRemoveOidCR::RGWRadosRemoveOidCR(rgw::sal::RadosStore* store,
+ RGWSI_RADOS::Obj&& obj,
+ RGWObjVersionTracker* objv_tracker)
+ : RGWSimpleCoroutine(store->ctx()),
+ ioctx(std::move(obj.get_ref().pool.ioctx())),
+ oid(std::move(obj.get_ref().obj.oid)),
+ objv_tracker(objv_tracker)
+{
+ set_description() << "remove dest=" << oid;
+}
+
+int RGWRadosRemoveOidCR::send_request(const DoutPrefixProvider *dpp)
+{
+ librados::ObjectWriteOperation op;
+ if (objv_tracker) {
+ objv_tracker->prepare_op_for_write(&op);
+ }
+ op.remove();
+
+ cn = stack->create_completion_notifier();
+ return ioctx.aio_operate(oid, cn->completion(), &op);
+}
+
+int RGWRadosRemoveOidCR::request_complete()
+{
+ int r = cn->completion()->get_return_value();
+
+ set_status() << "request complete; ret=" << r;
+
+ return r;
+}
+
+RGWSimpleRadosLockCR::RGWSimpleRadosLockCR(RGWAsyncRadosProcessor *_async_rados, rgw::sal::RadosStore* _store,
+ const rgw_raw_obj& _obj,
+ const string& _lock_name,
+ const string& _cookie,
+ uint32_t _duration) : RGWSimpleCoroutine(_store->ctx()),
+ async_rados(_async_rados),
+ store(_store),
+ lock_name(_lock_name),
+ cookie(_cookie),
+ duration(_duration),
+ obj(_obj),
+ req(NULL)
+{
+ set_description() << "rados lock dest=" << obj << " lock=" << lock_name << " cookie=" << cookie << " duration=" << duration;
+}
+
+void RGWSimpleRadosLockCR::request_cleanup()
+{
+ if (req) {
+ req->finish();
+ req = NULL;
+ }
+}
+
+int RGWSimpleRadosLockCR::send_request(const DoutPrefixProvider *dpp)
+{
+ set_status() << "sending request";
+ req = new RGWAsyncLockSystemObj(this, stack->create_completion_notifier(),
+ store, NULL, obj, lock_name, cookie, duration);
+ async_rados->queue(req);
+ return 0;
+}
+
+int RGWSimpleRadosLockCR::request_complete()
+{
+ set_status() << "request complete; ret=" << req->get_ret_status();
+ return req->get_ret_status();
+}
+
+RGWSimpleRadosUnlockCR::RGWSimpleRadosUnlockCR(RGWAsyncRadosProcessor *_async_rados, rgw::sal::RadosStore* _store,
+ const rgw_raw_obj& _obj,
+ const string& _lock_name,
+ const string& _cookie) : RGWSimpleCoroutine(_store->ctx()),
+ async_rados(_async_rados),
+ store(_store),
+ lock_name(_lock_name),
+ cookie(_cookie),
+ obj(_obj),
+ req(NULL)
+{
+ set_description() << "rados unlock dest=" << obj << " lock=" << lock_name << " cookie=" << cookie;
+}
+
+void RGWSimpleRadosUnlockCR::request_cleanup()
+{
+ if (req) {
+ req->finish();
+ req = NULL;
+ }
+}
+
+int RGWSimpleRadosUnlockCR::send_request(const DoutPrefixProvider *dpp)
+{
+ set_status() << "sending request";
+
+ req = new RGWAsyncUnlockSystemObj(this, stack->create_completion_notifier(),
+ store, NULL, obj, lock_name, cookie);
+ async_rados->queue(req);
+ return 0;
+}
+
+int RGWSimpleRadosUnlockCR::request_complete()
+{
+ set_status() << "request complete; ret=" << req->get_ret_status();
+ return req->get_ret_status();
+}
+
+int RGWOmapAppend::operate(const DoutPrefixProvider *dpp) {
+ reenter(this) {
+ for (;;) {
+ if (!has_product() && going_down) {
+ set_status() << "going down";
+ break;
+ }
+ set_status() << "waiting for product";
+ yield wait_for_product();
+ yield {
+ string entry;
+ while (consume(&entry)) {
+ set_status() << "adding entry: " << entry;
+ entries[entry] = bufferlist();
+ if (entries.size() >= window_size) {
+ break;
+ }
+ }
+ if (entries.size() >= window_size || going_down) {
+ set_status() << "flushing to omap";
+ call(new RGWRadosSetOmapKeysCR(store, obj, entries));
+ entries.clear();
+ }
+ }
+ if (get_ret_status() < 0) {
+ ldout(cct, 0) << "ERROR: failed to store entries in omap" << dendl;
+ return set_state(RGWCoroutine_Error);
+ }
+ }
+ /* done with coroutine */
+ return set_state(RGWCoroutine_Done);
+ }
+ return 0;
+}
+
+void RGWOmapAppend::flush_pending() {
+ receive(pending_entries);
+ num_pending_entries = 0;
+}
+
+bool RGWOmapAppend::append(const string& s) {
+ if (is_done()) {
+ return false;
+ }
+ ++total_entries;
+ pending_entries.push_back(s);
+ if (++num_pending_entries >= (int)window_size) {
+ flush_pending();
+ }
+ return true;
+}
+
+bool RGWOmapAppend::finish() {
+ going_down = true;
+ flush_pending();
+ set_sleeping(false);
+ return (!is_done());
+}
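+
+/* Rough usage sketch for RGWOmapAppend (illustrative only; the variable names
+ * are hypothetical). A caller spawns the appender as a concurrent coroutine,
+ * feeds it keys, and finally drains it:
+ *
+ *   RGWOmapAppend *appender =
+ *       new RGWOmapAppend(async_rados, store, log_obj, window_size);
+ *   appender->get();
+ *   spawn(appender, false);        // run alongside the producer coroutine
+ *   ...
+ *   appender->append(entry_key);   // batched; flushed every window_size keys
+ *   ...
+ *   appender->finish();            // flush remaining keys and stop
+ *   drain_all();                   // wait for the spawned appender to complete
+ */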
+
+int RGWAsyncGetBucketInstanceInfo::_send_request(const DoutPrefixProvider *dpp)
+{
+ int r;
+ if (!bucket.bucket_id.empty()) {
+ r = store->getRados()->get_bucket_instance_info(bucket, bucket_info, nullptr, &attrs, null_yield, dpp);
+ } else {
+ r = store->ctl()->bucket->read_bucket_info(bucket, &bucket_info, null_yield, dpp,
+ RGWBucketCtl::BucketInstance::GetParams().set_attrs(&attrs));
+ }
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to get bucket instance info for "
+ << bucket << dendl;
+ return r;
+ }
+
+ return 0;
+}
+
+int RGWAsyncPutBucketInstanceInfo::_send_request(const DoutPrefixProvider *dpp)
+{
+ auto r = store->getRados()->put_bucket_instance_info(bucket_info, exclusive,
+ mtime, attrs, dpp, null_yield);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to put bucket instance info for "
+ << bucket_info.bucket << dendl;
+ return r;
+ }
+
+ return 0;
+}
+
+RGWRadosBILogTrimCR::RGWRadosBILogTrimCR(
+ const DoutPrefixProvider *dpp,
+ rgw::sal::RadosStore* store,
+ const RGWBucketInfo& bucket_info,
+ int shard_id,
+ const rgw::bucket_index_layout_generation& generation,
+ const std::string& start_marker,
+ const std::string& end_marker)
+ : RGWSimpleCoroutine(store->ctx()), bucket_info(bucket_info),
+ shard_id(shard_id), generation(generation), bs(store->getRados()),
+ start_marker(BucketIndexShardsManager::get_shard_marker(start_marker)),
+ end_marker(BucketIndexShardsManager::get_shard_marker(end_marker))
+{
+}
+
+int RGWRadosBILogTrimCR::send_request(const DoutPrefixProvider *dpp)
+{
+ int r = bs.init(dpp, bucket_info, generation, shard_id);
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << "ERROR: bucket shard init failed ret=" << r << dendl;
+ return r;
+ }
+
+ bufferlist in;
+ cls_rgw_bi_log_trim_op call;
+ call.start_marker = std::move(start_marker);
+ call.end_marker = std::move(end_marker);
+ encode(call, in);
+
+ librados::ObjectWriteOperation op;
+ op.exec(RGW_CLASS, RGW_BI_LOG_TRIM, in);
+
+ cn = stack->create_completion_notifier();
+ return bs.bucket_obj.aio_operate(cn->completion(), &op);
+}
+
+int RGWRadosBILogTrimCR::request_complete()
+{
+ int r = cn->completion()->get_return_value();
+ set_status() << "request complete; ret=" << r;
+ return r;
+}
+
+int RGWAsyncFetchRemoteObj::_send_request(const DoutPrefixProvider *dpp)
+{
+ RGWObjectCtx obj_ctx(store);
+
+ char buf[16];
+ snprintf(buf, sizeof(buf), ".%lld", (long long)store->getRados()->instance_id());
+ rgw::sal::Attrs attrs;
+
+ rgw_obj src_obj(src_bucket, key);
+
+ rgw::sal::RadosBucket dest_bucket(store, dest_bucket_info);
+ rgw::sal::RadosObject dest_obj(store, dest_key.value_or(key), &dest_bucket);
+
+ std::string etag;
+
+ std::optional<uint64_t> bytes_transferred;
+ int r = store->getRados()->fetch_remote_obj(obj_ctx,
+ user_id.value_or(rgw_user()),
+ NULL, /* req_info */
+ source_zone,
+ dest_obj.get_obj(),
+ src_obj,
+ dest_bucket_info, /* dest */
+ nullptr, /* source */
+ dest_placement_rule,
+ nullptr, /* real_time* src_mtime, */
+ NULL, /* real_time* mtime, */
+ NULL, /* const real_time* mod_ptr, */
+ NULL, /* const real_time* unmod_ptr, */
+ false, /* high precision time */
+ NULL, /* const char *if_match, */
+ NULL, /* const char *if_nomatch, */
+ RGWRados::ATTRSMOD_NONE,
+ copy_if_newer,
+ attrs,
+ RGWObjCategory::Main,
+ versioned_epoch,
+ real_time(), /* delete_at */
+ NULL, /* string *ptag, */
+ &etag, /* string *petag, */
+ NULL, /* void (*progress_cb)(off_t, void *), */
+ NULL, /* void *progress_data*); */
+ dpp,
+ filter.get(),
+ source_trace_entry,
+ &zones_trace,
+ &bytes_transferred);
+
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "store->fetch_remote_obj() returned r=" << r << dendl;
+ if (counters) {
+ counters->inc(sync_counters::l_fetch_err, 1);
+ }
+ } else {
+ // r >= 0
+ if (bytes_transferred) {
+ // send notification that object was successfully synced
+ std::string user_id = "rgw sync";
+ std::string req_id = "0";
+
+ RGWObjTags obj_tags;
+ auto iter = attrs.find(RGW_ATTR_TAGS);
+ if (iter != attrs.end()) {
+ try {
+ auto it = iter->second.cbegin();
+ obj_tags.decode(it);
+ } catch (buffer::error &err) {
+ ldpp_dout(dpp, 1) << "ERROR: " << __func__ << ": caught buffer::error couldn't decode TagSet " << dendl;
+ }
+ }
+
+ // NOTE: we create a mutable copy of bucket.get_tenant as the get_notification function expects a std::string&, not const
+ std::string tenant(dest_bucket.get_tenant());
+
+ std::unique_ptr<rgw::sal::Notification> notify
+ = store->get_notification(dpp, &dest_obj, nullptr, rgw::notify::ObjectSyncedCreate,
+ &dest_bucket, user_id,
+ tenant,
+ req_id, null_yield);
+
+ auto notify_res = static_cast<rgw::sal::RadosNotification*>(notify.get())->get_reservation();
+ int ret = rgw::notify::publish_reserve(dpp, rgw::notify::ObjectSyncedCreate, notify_res, &obj_tags);
+ if (ret < 0) {
+ ldpp_dout(dpp, 1) << "ERROR: reserving notification failed, with error: " << ret << dendl;
+ // no need to return, the sync already happened
+ } else {
+ ret = rgw::notify::publish_commit(&dest_obj, *bytes_transferred, ceph::real_clock::now(), etag, dest_obj.get_instance(), rgw::notify::ObjectSyncedCreate, notify_res, dpp);
+ if (ret < 0) {
+ ldpp_dout(dpp, 1) << "ERROR: publishing notification failed, with error: " << ret << dendl;
+ }
+ }
+ }
+
+ if (counters) {
+ if (bytes_transferred) {
+ counters->inc(sync_counters::l_fetch, *bytes_transferred);
+ } else {
+ counters->inc(sync_counters::l_fetch_not_modified);
+ }
+ }
+ }
+ return r;
+}
+
+int RGWAsyncStatRemoteObj::_send_request(const DoutPrefixProvider *dpp)
+{
+ RGWObjectCtx obj_ctx(store);
+
+ string user_id;
+ char buf[16];
+ snprintf(buf, sizeof(buf), ".%lld", (long long)store->getRados()->instance_id());
+
+
+ rgw_obj src_obj(src_bucket, key);
+
+ int r = store->getRados()->stat_remote_obj(dpp,
+ obj_ctx,
+ rgw_user(user_id),
+ nullptr, /* req_info */
+ source_zone,
+ src_obj,
+ nullptr, /* source */
+ pmtime, /* real_time* src_mtime, */
+ psize, /* uint64_t * */
+ nullptr, /* const real_time* mod_ptr, */
+ nullptr, /* const real_time* unmod_ptr, */
+ true, /* high precision time */
+ nullptr, /* const char *if_match, */
+ nullptr, /* const char *if_nomatch, */
+ pattrs,
+ pheaders,
+ nullptr,
+ nullptr, /* string *ptag, */
+ petag); /* string *petag, */
+
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "store->stat_remote_obj() returned r=" << r << dendl;
+ }
+ return r;
+}
+
+
+int RGWAsyncRemoveObj::_send_request(const DoutPrefixProvider *dpp)
+{
+ ldpp_dout(dpp, 0) << __func__ << "(): deleting obj=" << obj << dendl;
+
+ obj->set_atomic();
+
+ RGWObjState *state;
+
+ int ret = obj->get_obj_state(dpp, &state, null_yield);
+ if (ret < 0) {
+ ldpp_dout(dpp, 20) << __func__ << "(): get_obj_state() obj=" << obj << " returned ret=" << ret << dendl;
+ return ret;
+ }
+
+ /* has there been any racing object write? */
+ if (del_if_older && (state->mtime > timestamp)) {
+ ldpp_dout(dpp, 20) << __func__ << "(): skipping object removal obj=" << obj << " (obj mtime=" << state->mtime << ", request timestamp=" << timestamp << ")" << dendl;
+ return 0;
+ }
+
+ RGWAccessControlPolicy policy;
+
+ /* decode policy */
+ map<string, bufferlist>::iterator iter = state->attrset.find(RGW_ATTR_ACL);
+ if (iter != state->attrset.end()) {
+ auto bliter = iter->second.cbegin();
+ try {
+ policy.decode(bliter);
+ } catch (buffer::error& err) {
+ ldpp_dout(dpp, 0) << "ERROR: could not decode policy, caught buffer::error" << dendl;
+ return -EIO;
+ }
+ }
+
+ std::unique_ptr<rgw::sal::Object::DeleteOp> del_op = obj->get_delete_op();
+
+ del_op->params.bucket_owner = bucket->get_info().owner;
+ del_op->params.obj_owner = policy.get_owner();
+ if (del_if_older) {
+ del_op->params.unmod_since = timestamp;
+ }
+ if (versioned) {
+ del_op->params.versioning_status = BUCKET_VERSIONED;
+ }
+ del_op->params.olh_epoch = versioned_epoch;
+ del_op->params.marker_version_id = marker_version_id;
+ del_op->params.obj_owner.set_id(rgw_user(owner));
+ del_op->params.obj_owner.set_name(owner_display_name);
+ del_op->params.mtime = timestamp;
+ del_op->params.high_precision_time = true;
+ del_op->params.zones_trace = &zones_trace;
+
+ ret = del_op->delete_obj(dpp, null_yield);
+ if (ret < 0) {
+ ldpp_dout(dpp, 20) << __func__ << "(): delete_obj() obj=" << obj << " returned ret=" << ret << dendl;
+ }
+ return ret;
+}
+
+int RGWContinuousLeaseCR::operate(const DoutPrefixProvider *dpp)
+{
+ if (aborted) {
+ caller->set_sleeping(false);
+ return set_cr_done();
+ }
+ reenter(this) {
+ last_renew_try_time = ceph::coarse_mono_clock::now();
+ while (!going_down) {
+ current_time = ceph::coarse_mono_clock::now();
+ yield call(new RGWSimpleRadosLockCR(async_rados, store, obj, lock_name, cookie, interval));
+ if (latency) {
+ latency->add_latency(ceph::coarse_mono_clock::now() - current_time);
+ }
+ current_time = ceph::coarse_mono_clock::now();
+ if (current_time - last_renew_try_time > interval_tolerance) {
+ // renewal should happen between 50%-90% of interval
+ ldout(store->ctx(), 1) << *this << ": WARNING: did not renew lock " << obj << ":" << lock_name << " within 90% of interval. " <<
+ (current_time - last_renew_try_time) << " > " << interval_tolerance << dendl;
+ }
+ last_renew_try_time = current_time;
+
+ caller->set_sleeping(false); /* will only be relevant when we return, that's why we can do it early */
+ if (retcode < 0) {
+ set_locked(false);
+ ldout(store->ctx(), 20) << *this << ": couldn't lock " << obj << ":" << lock_name << ": retcode=" << retcode << dendl;
+ return set_state(RGWCoroutine_Error, retcode);
+ }
+ ldout(store->ctx(), 20) << *this << ": successfully locked " << obj << ":" << lock_name << dendl;
+ set_locked(true);
+ yield wait(utime_t(interval / 2, 0));
+ }
+ set_locked(false); /* moot at this point anyway */
+ current_time = ceph::coarse_mono_clock::now();
+ yield call(new RGWSimpleRadosUnlockCR(async_rados, store, obj, lock_name, cookie));
+ if (latency) {
+ latency->add_latency(ceph::coarse_mono_clock::now() - current_time);
+ }
+ return set_state(RGWCoroutine_Done);
+ }
+ return 0;
+}
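+
+/* Worked example of the timing above (assuming interval_tolerance is roughly
+ * 90% of the lease interval, as the warning text implies): with interval=120s
+ * the lock is re-taken about every 60s, and a renewal attempt that starts more
+ * than ~108s after the previous one triggers the warning, since the lease may
+ * already have expired.
+ */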
+
+RGWRadosTimelogAddCR::RGWRadosTimelogAddCR(const DoutPrefixProvider *_dpp, rgw::sal::RadosStore* _store, const string& _oid,
+ const cls_log_entry& entry) : RGWSimpleCoroutine(_store->ctx()),
+ dpp(_dpp),
+ store(_store),
+ oid(_oid), cn(NULL)
+{
+ stringstream& s = set_description();
+ s << "timelog add entry oid=" << oid << " entry={id=" << entry.id << ", section=" << entry.section << ", name=" << entry.name << "}";
+ entries.push_back(entry);
+}
+
+int RGWRadosTimelogAddCR::send_request(const DoutPrefixProvider *dpp)
+{
+ set_status() << "sending request";
+
+ cn = stack->create_completion_notifier();
+ return store->svc()->cls->timelog.add(dpp, oid, entries, cn->completion(), true, null_yield);
+}
+
+int RGWRadosTimelogAddCR::request_complete()
+{
+ int r = cn->completion()->get_return_value();
+
+ set_status() << "request complete; ret=" << r;
+
+ return r;
+}
+
+RGWRadosTimelogTrimCR::RGWRadosTimelogTrimCR(const DoutPrefixProvider *dpp,
+ rgw::sal::RadosStore* store,
+ const std::string& oid,
+ const real_time& start_time,
+ const real_time& end_time,
+ const std::string& from_marker,
+ const std::string& to_marker)
+ : RGWSimpleCoroutine(store->ctx()), dpp(dpp), store(store), oid(oid),
+ start_time(start_time), end_time(end_time),
+ from_marker(from_marker), to_marker(to_marker)
+{
+ set_description() << "timelog trim oid=" << oid
+ << " start_time=" << start_time << " end_time=" << end_time
+ << " from_marker=" << from_marker << " to_marker=" << to_marker;
+}
+
+int RGWRadosTimelogTrimCR::send_request(const DoutPrefixProvider *dpp)
+{
+ set_status() << "sending request";
+
+ cn = stack->create_completion_notifier();
+ return store->svc()->cls->timelog.trim(dpp, oid, start_time, end_time, from_marker,
+ to_marker, cn->completion(),
+ null_yield);
+}
+
+int RGWRadosTimelogTrimCR::request_complete()
+{
+ int r = cn->completion()->get_return_value();
+
+ set_status() << "request complete; ret=" << r;
+
+ return r;
+}
+
+
+RGWSyncLogTrimCR::RGWSyncLogTrimCR(const DoutPrefixProvider *dpp,
+ rgw::sal::RadosStore* store, const std::string& oid,
+ const std::string& to_marker,
+ std::string *last_trim_marker)
+ : RGWRadosTimelogTrimCR(dpp, store, oid, real_time{}, real_time{},
+ std::string{}, to_marker),
+ cct(store->ctx()), last_trim_marker(last_trim_marker)
+{
+}
+
+int RGWSyncLogTrimCR::request_complete()
+{
+ int r = RGWRadosTimelogTrimCR::request_complete();
+ if (r != -ENODATA) {
+ return r;
+ }
+ // nothing left to trim, update last_trim_marker
+ if (*last_trim_marker < to_marker && to_marker != max_marker) {
+ *last_trim_marker = to_marker;
+ }
+ return 0;
+}
+
+
+int RGWAsyncStatObj::_send_request(const DoutPrefixProvider *dpp)
+{
+ rgw_raw_obj raw_obj;
+ store->getRados()->obj_to_raw(bucket_info.placement_rule, obj, &raw_obj);
+ return store->getRados()->raw_obj_stat(dpp, raw_obj, psize, pmtime, pepoch,
+ nullptr, nullptr, objv_tracker, null_yield);
+}
+
+RGWStatObjCR::RGWStatObjCR(const DoutPrefixProvider *dpp,
+ RGWAsyncRadosProcessor *async_rados, rgw::sal::RadosStore* store,
+ const RGWBucketInfo& _bucket_info, const rgw_obj& obj, uint64_t *psize,
+ real_time* pmtime, uint64_t *pepoch,
+ RGWObjVersionTracker *objv_tracker)
+ : RGWSimpleCoroutine(store->ctx()), dpp(dpp), store(store), async_rados(async_rados),
+ bucket_info(_bucket_info), obj(obj), psize(psize), pmtime(pmtime), pepoch(pepoch),
+ objv_tracker(objv_tracker)
+{
+}
+
+void RGWStatObjCR::request_cleanup()
+{
+ if (req) {
+ req->finish();
+ req = NULL;
+ }
+}
+
+int RGWStatObjCR::send_request(const DoutPrefixProvider *dpp)
+{
+ req = new RGWAsyncStatObj(dpp, this, stack->create_completion_notifier(),
+ store, bucket_info, obj, psize, pmtime, pepoch, objv_tracker);
+ async_rados->queue(req);
+ return 0;
+}
+
+int RGWStatObjCR::request_complete()
+{
+ return req->get_ret_status();
+}
+
+RGWRadosNotifyCR::RGWRadosNotifyCR(rgw::sal::RadosStore* store, const rgw_raw_obj& obj,
+ bufferlist& request, uint64_t timeout_ms,
+ bufferlist *response)
+ : RGWSimpleCoroutine(store->ctx()), store(store), obj(obj),
+ request(request), timeout_ms(timeout_ms), response(response)
+{
+ set_description() << "notify dest=" << obj;
+}
+
+int RGWRadosNotifyCR::send_request(const DoutPrefixProvider *dpp)
+{
+ int r = store->getRados()->get_raw_obj_ref(dpp, obj, &ref);
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << "ERROR: failed to get ref for (" << obj << ") ret=" << r << dendl;
+ return r;
+ }
+
+ set_status() << "sending request";
+
+ cn = stack->create_completion_notifier();
+ return ref.pool.ioctx().aio_notify(ref.obj.oid, cn->completion(), request,
+ timeout_ms, response);
+}
+
+int RGWRadosNotifyCR::request_complete()
+{
+ int r = cn->completion()->get_return_value();
+
+ set_status() << "request complete; ret=" << r;
+
+ return r;
+}
+
+
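+// Post a data-changed notification to a peer zone: try the newer "notify2"
+// format first and, if the peer replies with -ERR_METHOD_NOT_ALLOWED, fall
+// back to re-encoding the shard map in the legacy v1 "notify" format.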
+int RGWDataPostNotifyCR::operate(const DoutPrefixProvider* dpp)
+{
+ reenter(this) {
+ using PostNotify2 = RGWPostRESTResourceCR<bc::flat_map<int, bc::flat_set<rgw_data_notify_entry>>, int>;
+ yield {
+ rgw_http_param_pair pairs[] = { { "type", "data" },
+ { "notify2", NULL },
+ { "source-zone", source_zone },
+ { NULL, NULL } };
+ call(new PostNotify2(store->ctx(), conn, &http_manager, "/admin/log", pairs, shards, nullptr));
+ }
+ if (retcode == -ERR_METHOD_NOT_ALLOWED) {
+ using PostNotify1 = RGWPostRESTResourceCR<rgw_data_notify_v1_encoder, int>;
+ yield {
+ rgw_http_param_pair pairs[] = { { "type", "data" },
+ { "notify", NULL },
+ { "source-zone", source_zone },
+ { NULL, NULL } };
+ auto encoder = rgw_data_notify_v1_encoder{shards};
+ call(new PostNotify1(store->ctx(), conn, &http_manager, "/admin/log", pairs, encoder, nullptr));
+ }
+ }
+ if (retcode < 0) {
+ return set_cr_error(retcode);
+ }
+ return set_cr_done();
+ }
+ return 0;
+}
diff --git a/src/rgw/driver/rados/rgw_cr_rados.h b/src/rgw/driver/rados/rgw_cr_rados.h
new file mode 100644
index 000000000..7bda18878
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_cr_rados.h
@@ -0,0 +1,1647 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include <boost/intrusive_ptr.hpp>
+#include "include/ceph_assert.h"
+#include "rgw_coroutine.h"
+#include "rgw_sal.h"
+#include "rgw_sal_rados.h"
+#include "common/WorkQueue.h"
+#include "common/Throttle.h"
+
+#include <atomic>
+#include "common/ceph_time.h"
+
+#include "services/svc_sys_obj.h"
+#include "services/svc_bucket.h"
+
+struct rgw_http_param_pair;
+class RGWRESTConn;
+
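+// Base class for blocking librados work that must not run on a coroutine
+// thread. _send_request() executes on an RGWAsyncRadosProcessor worker; the
+// completion notifier then wakes the owning coroutine stack.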
+class RGWAsyncRadosRequest : public RefCountedObject {
+ RGWCoroutine *caller;
+ RGWAioCompletionNotifier *notifier;
+
+ int retcode;
+
+ ceph::mutex lock = ceph::make_mutex("RGWAsyncRadosRequest::lock");
+
+protected:
+ virtual int _send_request(const DoutPrefixProvider *dpp) = 0;
+public:
+ RGWAsyncRadosRequest(RGWCoroutine *_caller, RGWAioCompletionNotifier *_cn)
+ : caller(_caller), notifier(_cn), retcode(0) {
+ }
+ ~RGWAsyncRadosRequest() override {
+ if (notifier) {
+ notifier->put();
+ }
+ }
+
+ void send_request(const DoutPrefixProvider *dpp) {
+ get();
+ retcode = _send_request(dpp);
+ {
+ std::lock_guard l{lock};
+ if (notifier) {
+ notifier->cb(); // drops its own ref
+ notifier = nullptr;
+ }
+ }
+ put();
+ }
+
+ int get_ret_status() { return retcode; }
+
+ void finish() {
+ {
+ std::lock_guard l{lock};
+ if (notifier) {
+ // we won't call notifier->cb() to drop its ref, so drop it here
+ notifier->put();
+ notifier = nullptr;
+ }
+ }
+ put();
+ }
+};
+
+
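+// Thread pool plus work queue that executes queued RGWAsyncRadosRequest
+// instances on behalf of coroutines.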
+class RGWAsyncRadosProcessor {
+ std::deque<RGWAsyncRadosRequest *> m_req_queue;
+ std::atomic<bool> going_down = { false };
+protected:
+ CephContext *cct;
+ ThreadPool m_tp;
+ Throttle req_throttle;
+
+ struct RGWWQ : public DoutPrefixProvider, public ThreadPool::WorkQueue<RGWAsyncRadosRequest> {
+ RGWAsyncRadosProcessor *processor;
+ RGWWQ(RGWAsyncRadosProcessor *p,
+ ceph::timespan timeout, ceph::timespan suicide_timeout,
+ ThreadPool *tp)
+ : ThreadPool::WorkQueue<RGWAsyncRadosRequest>("RGWWQ", timeout, suicide_timeout, tp), processor(p) {}
+
+ bool _enqueue(RGWAsyncRadosRequest *req) override;
+ void _dequeue(RGWAsyncRadosRequest *req) override {
+ ceph_abort();
+ }
+ bool _empty() override;
+ RGWAsyncRadosRequest *_dequeue() override;
+ using ThreadPool::WorkQueue<RGWAsyncRadosRequest>::_process;
+ void _process(RGWAsyncRadosRequest *req, ThreadPool::TPHandle& handle) override;
+ void _dump_queue();
+ void _clear() override {
+ ceph_assert(processor->m_req_queue.empty());
+ }
+
+ CephContext *get_cct() const { return processor->cct; }
+ unsigned get_subsys() const { return ceph_subsys_rgw; }
+ std::ostream& gen_prefix(std::ostream& out) const { return out << "rgw async rados processor: ";}
+
+ } req_wq;
+
+public:
+ RGWAsyncRadosProcessor(CephContext *_cct, int num_threads);
+ ~RGWAsyncRadosProcessor() {}
+ void start();
+ void stop();
+ void handle_request(const DoutPrefixProvider *dpp, RGWAsyncRadosRequest *req);
+ void queue(RGWAsyncRadosRequest *req);
+
+ bool is_going_down() {
+ return going_down;
+ }
+
+};
+
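+// Generic coroutine wrapper for a write-only (no result) request described by
+// a parameter struct P. The Request::_send_request() body is supplied as a
+// template specialization per P (see rgw_cr_tools.cc).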
+template <class P>
+class RGWSimpleWriteOnlyAsyncCR : public RGWSimpleCoroutine {
+ RGWAsyncRadosProcessor *async_rados;
+ rgw::sal::RadosStore* store;
+
+ P params;
+ const DoutPrefixProvider *dpp;
+
+ class Request : public RGWAsyncRadosRequest {
+ rgw::sal::RadosStore* store;
+ P params;
+ const DoutPrefixProvider *dpp;
+ protected:
+ int _send_request(const DoutPrefixProvider *dpp) override;
+ public:
+ Request(RGWCoroutine *caller,
+ RGWAioCompletionNotifier *cn,
+ rgw::sal::RadosStore* store,
+ const P& _params,
+ const DoutPrefixProvider *dpp) : RGWAsyncRadosRequest(caller, cn),
+ store(store),
+ params(_params),
+ dpp(dpp) {}
+ } *req{nullptr};
+
+ public:
+ RGWSimpleWriteOnlyAsyncCR(RGWAsyncRadosProcessor *_async_rados,
+ rgw::sal::RadosStore* _store,
+ const P& _params,
+ const DoutPrefixProvider *_dpp) : RGWSimpleCoroutine(_store->ctx()),
+ async_rados(_async_rados),
+ store(_store),
+ params(_params),
+ dpp(_dpp) {}
+
+ ~RGWSimpleWriteOnlyAsyncCR() override {
+ request_cleanup();
+ }
+ void request_cleanup() override {
+ if (req) {
+ req->finish();
+ req = NULL;
+ }
+ }
+
+ int send_request(const DoutPrefixProvider *dpp) override {
+ req = new Request(this,
+ stack->create_completion_notifier(),
+ store,
+ params,
+ dpp);
+
+ async_rados->queue(req);
+ return 0;
+ }
+ int request_complete() override {
+ return req->get_ret_status();
+ }
+};
+
+
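+// Like RGWSimpleWriteOnlyAsyncCR, but also returns a result of type R through
+// a shared_ptr filled in by the per-P _send_request() specialization.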
+template <class P, class R>
+class RGWSimpleAsyncCR : public RGWSimpleCoroutine {
+ RGWAsyncRadosProcessor *async_rados;
+ rgw::sal::RadosStore* store;
+
+ P params;
+ std::shared_ptr<R> result;
+ const DoutPrefixProvider *dpp;
+
+ class Request : public RGWAsyncRadosRequest {
+ rgw::sal::RadosStore* store;
+ P params;
+ std::shared_ptr<R> result;
+ const DoutPrefixProvider *dpp;
+ protected:
+ int _send_request(const DoutPrefixProvider *dpp) override;
+ public:
+ Request(const DoutPrefixProvider *dpp,
+ RGWCoroutine *caller,
+ RGWAioCompletionNotifier *cn,
+ rgw::sal::RadosStore* _store,
+ const P& _params,
+ std::shared_ptr<R>& _result,
+ const DoutPrefixProvider *_dpp) : RGWAsyncRadosRequest(caller, cn),
+ store(_store),
+ params(_params),
+ result(_result),
+ dpp(_dpp) {}
+ } *req{nullptr};
+
+ public:
+ RGWSimpleAsyncCR(RGWAsyncRadosProcessor *_async_rados,
+ rgw::sal::RadosStore* _store,
+ const P& _params,
+ std::shared_ptr<R>& _result,
+ const DoutPrefixProvider *_dpp) : RGWSimpleCoroutine(_store->ctx()),
+ async_rados(_async_rados),
+ store(_store),
+ params(_params),
+ result(_result),
+ dpp(_dpp) {}
+
+ ~RGWSimpleAsyncCR() override {
+ request_cleanup();
+ }
+ void request_cleanup() override {
+ if (req) {
+ req->finish();
+ req = NULL;
+ }
+ }
+
+ int send_request(const DoutPrefixProvider *dpp) override {
+ req = new Request(dpp,
+ this,
+ stack->create_completion_notifier(),
+ store,
+ params,
+ result,
+ dpp);
+
+ async_rados->queue(req);
+ return 0;
+ }
+ int request_complete() override {
+ return req->get_ret_status();
+ }
+};
+
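+// Runs an arbitrary Action::operate() callback on the async rados thread pool
+// and completes with its return value.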
+class RGWGenericAsyncCR : public RGWSimpleCoroutine {
+ RGWAsyncRadosProcessor *async_rados;
+ rgw::sal::RadosStore* store;
+
+
+public:
+ class Action {
+ public:
+ virtual ~Action() {}
+ virtual int operate() = 0;
+ };
+
+private:
+ std::shared_ptr<Action> action;
+
+ class Request : public RGWAsyncRadosRequest {
+ std::shared_ptr<Action> action;
+ protected:
+ int _send_request(const DoutPrefixProvider *dpp) override {
+ if (!action) {
+ return 0;
+ }
+ return action->operate();
+ }
+ public:
+ Request(const DoutPrefixProvider *dpp,
+ RGWCoroutine *caller,
+ RGWAioCompletionNotifier *cn,
+ std::shared_ptr<Action>& _action) : RGWAsyncRadosRequest(caller, cn),
+ action(_action) {}
+ } *req{nullptr};
+
+ public:
+ RGWGenericAsyncCR(CephContext *_cct,
+ RGWAsyncRadosProcessor *_async_rados,
+ std::shared_ptr<Action>& _action) : RGWSimpleCoroutine(_cct),
+ async_rados(_async_rados),
+ action(_action) {}
+ template<typename T>
+ RGWGenericAsyncCR(CephContext *_cct,
+ RGWAsyncRadosProcessor *_async_rados,
+ std::shared_ptr<T>& _action) : RGWSimpleCoroutine(_cct),
+ async_rados(_async_rados),
+ action(std::static_pointer_cast<Action>(_action)) {}
+
+ ~RGWGenericAsyncCR() override {
+ request_cleanup();
+ }
+ void request_cleanup() override {
+ if (req) {
+ req->finish();
+ req = NULL;
+ }
+ }
+
+ int send_request(const DoutPrefixProvider *dpp) override {
+ req = new Request(dpp, this,
+ stack->create_completion_notifier(),
+ action);
+
+ async_rados->queue(req);
+ return 0;
+ }
+ int request_complete() override {
+ return req->get_ret_status();
+ }
+};
+
+
+class RGWAsyncGetSystemObj : public RGWAsyncRadosRequest {
+ const DoutPrefixProvider *dpp;
+ RGWSI_SysObj* svc_sysobj;
+ rgw_raw_obj obj;
+ const bool want_attrs;
+ const bool raw_attrs;
+protected:
+ int _send_request(const DoutPrefixProvider *dpp) override;
+public:
+ RGWAsyncGetSystemObj(const DoutPrefixProvider *dpp,
+ RGWCoroutine *caller, RGWAioCompletionNotifier *cn, RGWSI_SysObj *_svc,
+ RGWObjVersionTracker *_objv_tracker, const rgw_raw_obj& _obj,
+ bool want_attrs, bool raw_attrs);
+
+ bufferlist bl;
+ std::map<std::string, bufferlist> attrs;
+ RGWObjVersionTracker objv_tracker;
+};
+
+class RGWAsyncPutSystemObj : public RGWAsyncRadosRequest {
+ const DoutPrefixProvider *dpp;
+ RGWSI_SysObj *svc;
+ rgw_raw_obj obj;
+ bool exclusive;
+ bufferlist bl;
+
+protected:
+ int _send_request(const DoutPrefixProvider *dpp) override;
+public:
+ RGWAsyncPutSystemObj(const DoutPrefixProvider *dpp, RGWCoroutine *caller,
+ RGWAioCompletionNotifier *cn, RGWSI_SysObj *_svc,
+ RGWObjVersionTracker *_objv_tracker, const rgw_raw_obj& _obj,
+ bool _exclusive, bufferlist _bl);
+
+ RGWObjVersionTracker objv_tracker;
+};
+
+class RGWAsyncPutSystemObjAttrs : public RGWAsyncRadosRequest {
+ const DoutPrefixProvider *dpp;
+ RGWSI_SysObj *svc;
+ rgw_raw_obj obj;
+ std::map<std::string, bufferlist> attrs;
+ bool exclusive;
+
+protected:
+ int _send_request(const DoutPrefixProvider *dpp) override;
+public:
+ RGWAsyncPutSystemObjAttrs(const DoutPrefixProvider *dpp, RGWCoroutine *caller, RGWAioCompletionNotifier *cn, RGWSI_SysObj *_svc,
+ RGWObjVersionTracker *_objv_tracker, const rgw_raw_obj& _obj,
+ std::map<std::string, bufferlist> _attrs, bool exclusive);
+
+ RGWObjVersionTracker objv_tracker;
+};
+
+class RGWAsyncLockSystemObj : public RGWAsyncRadosRequest {
+ rgw::sal::RadosStore* store;
+ rgw_raw_obj obj;
+ std::string lock_name;
+ std::string cookie;
+ uint32_t duration_secs;
+
+protected:
+ int _send_request(const DoutPrefixProvider *dpp) override;
+public:
+ RGWAsyncLockSystemObj(RGWCoroutine *caller, RGWAioCompletionNotifier *cn, rgw::sal::RadosStore* _store,
+ RGWObjVersionTracker *_objv_tracker, const rgw_raw_obj& _obj,
+ const std::string& _name, const std::string& _cookie, uint32_t _duration_secs);
+};
+
+class RGWAsyncUnlockSystemObj : public RGWAsyncRadosRequest {
+ rgw::sal::RadosStore* store;
+ rgw_raw_obj obj;
+ std::string lock_name;
+ std::string cookie;
+
+protected:
+ int _send_request(const DoutPrefixProvider *dpp) override;
+public:
+ RGWAsyncUnlockSystemObj(RGWCoroutine *caller, RGWAioCompletionNotifier *cn, rgw::sal::RadosStore* _store,
+ RGWObjVersionTracker *_objv_tracker, const rgw_raw_obj& _obj,
+ const std::string& _name, const std::string& _cookie);
+};
+
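+// Read a raw object and decode it into *result (T must provide decode()).
+// Illustrative use from within another coroutine, assuming a decodable type
+// such as rgw_data_sync_info (sketch, not taken from this file):
+//   call(new RGWSimpleRadosReadCR<rgw_data_sync_info>(dpp, store, obj, &info));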
+template <class T>
+class RGWSimpleRadosReadCR : public RGWSimpleCoroutine {
+ const DoutPrefixProvider* dpp;
+ rgw::sal::RadosStore* store;
+ rgw_raw_obj obj;
+ T* result;
+ /// on ENOENT, call handle_data() with an empty object instead of failing
+ const bool empty_on_enoent;
+ RGWObjVersionTracker* objv_tracker;
+
+ T val;
+ rgw_rados_ref ref;
+ ceph::buffer::list bl;
+ boost::intrusive_ptr<RGWAioCompletionNotifier> cn;
+
+public:
+ RGWSimpleRadosReadCR(const DoutPrefixProvider* dpp,
+ rgw::sal::RadosStore* store,
+ const rgw_raw_obj& obj,
+ T* result, bool empty_on_enoent = true,
+ RGWObjVersionTracker* objv_tracker = nullptr)
+ : RGWSimpleCoroutine(store->ctx()), dpp(dpp), store(store),
+ obj(obj), result(result), empty_on_enoent(empty_on_enoent),
+ objv_tracker(objv_tracker) {
+ if (!result) {
+ result = &val;
+ }
+ }
+
+ int send_request(const DoutPrefixProvider *dpp) {
+ int r = store->getRados()->get_raw_obj_ref(dpp, obj, &ref);
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << "ERROR: failed to get ref for (" << obj << ") ret="
+ << r << dendl;
+ return r;
+ }
+
+ set_status() << "sending request";
+
+ librados::ObjectReadOperation op;
+ if (objv_tracker) {
+ objv_tracker->prepare_op_for_read(&op);
+ }
+
+ op.read(0, -1, &bl, nullptr);
+
+ cn = stack->create_completion_notifier();
+ return ref.pool.ioctx().aio_operate(ref.obj.oid, cn->completion(), &op,
+ nullptr);
+ }
+
+ int request_complete() {
+ int ret = cn->completion()->get_return_value();
+ set_status() << "request complete; ret=" << ret;
+
+ if (ret == -ENOENT && empty_on_enoent) {
+ *result = T();
+ } else {
+ if (ret < 0) {
+ return ret;
+ }
+ try {
+ auto iter = bl.cbegin();
+ if (iter.end()) {
+ // allow successful reads with empty buffers. ReadSyncStatus coroutines
+ // depend on this to be able to read without locking, because the
+ // cls lock from InitSyncStatus will create an empty object if it didn't
+ // exist
+ *result = T();
+ } else {
+ decode(*result, iter);
+ }
+ } catch (buffer::error& err) {
+ return -EIO;
+ }
+ }
+
+ return handle_data(*result);
+ }
+
+ virtual int handle_data(T& data) {
+ return 0;
+ }
+};
+
+class RGWSimpleRadosReadAttrsCR : public RGWSimpleCoroutine {
+ const DoutPrefixProvider* dpp;
+ rgw::sal::RadosStore* const store;
+
+ const rgw_raw_obj obj;
+ std::map<std::string, bufferlist>* const pattrs;
+ const bool raw_attrs;
+ RGWObjVersionTracker* const objv_tracker;
+
+ rgw_rados_ref ref;
+ std::map<std::string, bufferlist> unfiltered_attrs;
+ boost::intrusive_ptr<RGWAioCompletionNotifier> cn;
+
+public:
+ RGWSimpleRadosReadAttrsCR(const DoutPrefixProvider* dpp,
+ rgw::sal::RadosStore* store,
+ rgw_raw_obj obj,
+ std::map<std::string, bufferlist>* pattrs,
+ bool raw_attrs,
+ RGWObjVersionTracker* objv_tracker = nullptr)
+ : RGWSimpleCoroutine(store->ctx()), dpp(dpp), store(store),
+ obj(std::move(obj)), pattrs(pattrs), raw_attrs(raw_attrs),
+ objv_tracker(objv_tracker) {}
+
+ int send_request(const DoutPrefixProvider *dpp) override;
+ int request_complete() override;
+};
+
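+// Encode `data` at construction time and write it with write_full(), with
+// optional exclusive create and object-version tracking.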
+template <class T>
+class RGWSimpleRadosWriteCR : public RGWSimpleCoroutine {
+ const DoutPrefixProvider* dpp;
+ rgw::sal::RadosStore* const store;
+ rgw_raw_obj obj;
+ RGWObjVersionTracker* objv_tracker;
+ bool exclusive;
+
+ bufferlist bl;
+ rgw_rados_ref ref;
+ std::map<std::string, bufferlist> unfiltered_attrs;
+ boost::intrusive_ptr<RGWAioCompletionNotifier> cn;
+
+
+public:
+ RGWSimpleRadosWriteCR(const DoutPrefixProvider* dpp,
+ rgw::sal::RadosStore* const store,
+ rgw_raw_obj obj, const T& data,
+ RGWObjVersionTracker* objv_tracker = nullptr,
+ bool exclusive = false)
+ : RGWSimpleCoroutine(store->ctx()), dpp(dpp), store(store),
+ obj(std::move(obj)), objv_tracker(objv_tracker), exclusive(exclusive) {
+ encode(data, bl);
+ }
+
+ int send_request(const DoutPrefixProvider *dpp) override {
+ int r = store->getRados()->get_raw_obj_ref(dpp, obj, &ref);
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << "ERROR: failed to get ref for (" << obj << ") ret="
+ << r << dendl;
+ return r;
+ }
+
+ set_status() << "sending request";
+
+ librados::ObjectWriteOperation op;
+ if (exclusive) {
+ op.create(true);
+ }
+ if (objv_tracker) {
+ objv_tracker->prepare_op_for_write(&op);
+ }
+ op.write_full(bl);
+
+ cn = stack->create_completion_notifier();
+ return ref.pool.ioctx().aio_operate(ref.obj.oid, cn->completion(), &op);
+ }
+
+ int request_complete() override {
+ int ret = cn->completion()->get_return_value();
+ set_status() << "request complete; ret=" << ret;
+ if (ret >= 0 && objv_tracker) {
+ objv_tracker->apply_write();
+ }
+ return ret;
+ }
+};
+
+class RGWSimpleRadosWriteAttrsCR : public RGWSimpleCoroutine {
+ const DoutPrefixProvider* dpp;
+ rgw::sal::RadosStore* const store;
+ RGWObjVersionTracker* objv_tracker;
+ rgw_raw_obj obj;
+ std::map<std::string, bufferlist> attrs;
+ bool exclusive;
+
+ rgw_rados_ref ref;
+ boost::intrusive_ptr<RGWAioCompletionNotifier> cn;
+
+
+public:
+ RGWSimpleRadosWriteAttrsCR(const DoutPrefixProvider* dpp,
+ rgw::sal::RadosStore* const store,
+ rgw_raw_obj obj,
+ std::map<std::string, bufferlist> attrs,
+ RGWObjVersionTracker* objv_tracker = nullptr,
+ bool exclusive = false)
+ : RGWSimpleCoroutine(store->ctx()), dpp(dpp),
+ store(store), objv_tracker(objv_tracker),
+ obj(std::move(obj)), attrs(std::move(attrs)),
+ exclusive(exclusive) {}
+
+ int send_request(const DoutPrefixProvider *dpp) override {
+ int r = store->getRados()->get_raw_obj_ref(dpp, obj, &ref);
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << "ERROR: failed to get ref for (" << obj << ") ret="
+ << r << dendl;
+ return r;
+ }
+
+ set_status() << "sending request";
+
+ librados::ObjectWriteOperation op;
+ if (exclusive) {
+ op.create(true);
+ }
+ if (objv_tracker) {
+ objv_tracker->prepare_op_for_write(&op);
+ }
+
+ for (const auto& [name, bl] : attrs) {
+ if (!bl.length())
+ continue;
+ op.setxattr(name.c_str(), bl);
+ }
+
+ cn = stack->create_completion_notifier();
+ if (!op.size()) {
+ cn->cb();
+ return 0;
+ }
+
+ return ref.pool.ioctx().aio_operate(ref.obj.oid, cn->completion(), &op);
+ }
+
+ int request_complete() override {
+ int ret = cn->completion()->get_return_value();
+ set_status() << "request complete; ret=" << ret;
+ if (ret >= 0 && objv_tracker) {
+ objv_tracker->apply_write();
+ }
+ return ret;
+ }
+};
+
+class RGWRadosSetOmapKeysCR : public RGWSimpleCoroutine {
+ rgw::sal::RadosStore* store;
+ std::map<std::string, bufferlist> entries;
+
+ rgw_rados_ref ref;
+
+ rgw_raw_obj obj;
+
+ boost::intrusive_ptr<RGWAioCompletionNotifier> cn;
+
+public:
+ RGWRadosSetOmapKeysCR(rgw::sal::RadosStore* _store,
+ const rgw_raw_obj& _obj,
+ std::map<std::string, bufferlist>& _entries);
+
+ int send_request(const DoutPrefixProvider *dpp) override;
+ int request_complete() override;
+};
+
+class RGWRadosGetOmapKeysCR : public RGWSimpleCoroutine {
+ public:
+ struct Result {
+ rgw_rados_ref ref;
+ std::set<std::string> entries;
+ bool more = false;
+ };
+ using ResultPtr = std::shared_ptr<Result>;
+
+ RGWRadosGetOmapKeysCR(rgw::sal::RadosStore* _store, const rgw_raw_obj& _obj,
+ const std::string& _marker, int _max_entries,
+ ResultPtr result);
+
+ int send_request(const DoutPrefixProvider *dpp) override;
+ int request_complete() override;
+
+ private:
+ rgw::sal::RadosStore* store;
+ rgw_raw_obj obj;
+ std::string marker;
+ int max_entries;
+ ResultPtr result;
+ boost::intrusive_ptr<RGWAioCompletionNotifier> cn;
+};
+
+class RGWRadosGetOmapValsCR : public RGWSimpleCoroutine {
+ public:
+ struct Result {
+ rgw_rados_ref ref;
+ std::map<std::string, bufferlist> entries;
+ bool more = false;
+ };
+ using ResultPtr = std::shared_ptr<Result>;
+
+ RGWRadosGetOmapValsCR(rgw::sal::RadosStore* _store, const rgw_raw_obj& _obj,
+ const std::string& _marker, int _max_entries,
+ ResultPtr result);
+
+ int send_request(const DoutPrefixProvider *dpp) override;
+ int request_complete() override;
+
+ private:
+ rgw::sal::RadosStore* store;
+ rgw_raw_obj obj;
+ std::string marker;
+ int max_entries;
+ ResultPtr result;
+ boost::intrusive_ptr<RGWAioCompletionNotifier> cn;
+};
+
+class RGWRadosRemoveOmapKeysCR : public RGWSimpleCoroutine {
+ rgw::sal::RadosStore* store;
+
+ rgw_rados_ref ref;
+
+ std::set<std::string> keys;
+
+ rgw_raw_obj obj;
+
+ boost::intrusive_ptr<RGWAioCompletionNotifier> cn;
+
+public:
+ RGWRadosRemoveOmapKeysCR(rgw::sal::RadosStore* _store,
+ const rgw_raw_obj& _obj,
+ const std::set<std::string>& _keys);
+
+ int send_request(const DoutPrefixProvider *dpp) override;
+
+ int request_complete() override;
+};
+
+class RGWRadosRemoveCR : public RGWSimpleCoroutine {
+ rgw::sal::RadosStore* store;
+ librados::IoCtx ioctx;
+ const rgw_raw_obj obj;
+ RGWObjVersionTracker* objv_tracker;
+ boost::intrusive_ptr<RGWAioCompletionNotifier> cn;
+
+public:
+ RGWRadosRemoveCR(rgw::sal::RadosStore* store, const rgw_raw_obj& obj,
+ RGWObjVersionTracker* objv_tracker = nullptr);
+
+ int send_request(const DoutPrefixProvider *dpp) override;
+ int request_complete() override;
+};
+
+class RGWRadosRemoveOidCR : public RGWSimpleCoroutine {
+ librados::IoCtx ioctx;
+ const std::string oid;
+ RGWObjVersionTracker* objv_tracker;
+ boost::intrusive_ptr<RGWAioCompletionNotifier> cn;
+
+public:
+ RGWRadosRemoveOidCR(rgw::sal::RadosStore* store,
+ librados::IoCtx&& ioctx, std::string_view oid,
+ RGWObjVersionTracker* objv_tracker = nullptr);
+
+ RGWRadosRemoveOidCR(rgw::sal::RadosStore* store,
+ RGWSI_RADOS::Obj& obj,
+ RGWObjVersionTracker* objv_tracker = nullptr);
+
+ RGWRadosRemoveOidCR(rgw::sal::RadosStore* store,
+ RGWSI_RADOS::Obj&& obj,
+ RGWObjVersionTracker* objv_tracker = nullptr);
+
+ int send_request(const DoutPrefixProvider *dpp) override;
+ int request_complete() override;
+};
+
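+// Take a timed exclusive cls lock on a raw object via the async processor;
+// gen_random_cookie() produces a random cookie identifying the lock holder.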
+class RGWSimpleRadosLockCR : public RGWSimpleCoroutine {
+ RGWAsyncRadosProcessor *async_rados;
+ rgw::sal::RadosStore* store;
+ std::string lock_name;
+ std::string cookie;
+ uint32_t duration;
+
+ rgw_raw_obj obj;
+
+ RGWAsyncLockSystemObj *req;
+
+public:
+ RGWSimpleRadosLockCR(RGWAsyncRadosProcessor *_async_rados, rgw::sal::RadosStore* _store,
+ const rgw_raw_obj& _obj,
+ const std::string& _lock_name,
+ const std::string& _cookie,
+ uint32_t _duration);
+ ~RGWSimpleRadosLockCR() override {
+ request_cleanup();
+ }
+ void request_cleanup() override;
+
+ int send_request(const DoutPrefixProvider *dpp) override;
+ int request_complete() override;
+
+ static std::string gen_random_cookie(CephContext* cct) {
+ static constexpr std::size_t COOKIE_LEN = 16;
+ char buf[COOKIE_LEN + 1];
+ gen_rand_alphanumeric(cct, buf, sizeof(buf) - 1);
+ return buf;
+ }
+};
+
+class RGWSimpleRadosUnlockCR : public RGWSimpleCoroutine {
+ RGWAsyncRadosProcessor *async_rados;
+ rgw::sal::RadosStore* store;
+ std::string lock_name;
+ std::string cookie;
+
+ rgw_raw_obj obj;
+
+ RGWAsyncUnlockSystemObj *req;
+
+public:
+ RGWSimpleRadosUnlockCR(RGWAsyncRadosProcessor *_async_rados, rgw::sal::RadosStore* _store,
+ const rgw_raw_obj& _obj,
+ const std::string& _lock_name,
+ const std::string& _cookie);
+ ~RGWSimpleRadosUnlockCR() override {
+ request_cleanup();
+ }
+ void request_cleanup() override;
+
+ int send_request(const DoutPrefixProvider *dpp) override;
+ int request_complete() override;
+};
+
+#define OMAP_APPEND_MAX_ENTRIES_DEFAULT 100
+
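+// Consumer coroutine that buffers appended strings and flushes them to the
+// object's omap in batches sized by `window_size`.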
+class RGWOmapAppend : public RGWConsumerCR<std::string> {
+ RGWAsyncRadosProcessor *async_rados;
+ rgw::sal::RadosStore* store;
+
+ rgw_raw_obj obj;
+
+ bool going_down;
+
+ int num_pending_entries;
+ std::list<std::string> pending_entries;
+
+ std::map<std::string, bufferlist> entries;
+
+ uint64_t window_size;
+ uint64_t total_entries;
+public:
+ RGWOmapAppend(RGWAsyncRadosProcessor *_async_rados, rgw::sal::RadosStore* _store,
+ const rgw_raw_obj& _obj,
+ uint64_t _window_size = OMAP_APPEND_MAX_ENTRIES_DEFAULT);
+ int operate(const DoutPrefixProvider *dpp) override;
+ void flush_pending();
+ bool append(const std::string& s);
+ bool finish();
+
+ uint64_t get_total_entries() {
+ return total_entries;
+ }
+
+ const rgw_raw_obj& get_obj() {
+ return obj;
+ }
+};
+
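+// Spawns one RGWOmapAppend coroutine per shard (objects named
+// "<oid_prefix>.<shard_id>") and routes appended entries by shard id.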
+class RGWShardedOmapCRManager {
+ RGWAsyncRadosProcessor *async_rados;
+ rgw::sal::RadosStore* store;
+ RGWCoroutine *op;
+
+ int num_shards;
+
+ std::vector<RGWOmapAppend *> shards;
+public:
+ RGWShardedOmapCRManager(RGWAsyncRadosProcessor *_async_rados, rgw::sal::RadosStore* _store, RGWCoroutine *_op, int _num_shards, const rgw_pool& pool, const std::string& oid_prefix)
+ : async_rados(_async_rados),
+ store(_store), op(_op), num_shards(_num_shards) {
+ shards.reserve(num_shards);
+ for (int i = 0; i < num_shards; ++i) {
+ char buf[oid_prefix.size() + 16];
+ snprintf(buf, sizeof(buf), "%s.%d", oid_prefix.c_str(), i);
+ RGWOmapAppend *shard = new RGWOmapAppend(async_rados, store, rgw_raw_obj(pool, buf));
+ shard->get();
+ shards.push_back(shard);
+ op->spawn(shard, false);
+ }
+ }
+
+ ~RGWShardedOmapCRManager() {
+ for (auto shard : shards) {
+ shard->put();
+ }
+ }
+
+ bool append(const std::string& entry, int shard_id) {
+ return shards[shard_id]->append(entry);
+ }
+ bool finish() {
+ bool success = true;
+ for (auto& append_op : shards) {
+ success &= (append_op->finish() && (!append_op->is_error()));
+ }
+ return success;
+ }
+
+ uint64_t get_total_entries(int shard_id) {
+ return shards[shard_id]->get_total_entries();
+ }
+};
+
+class RGWAsyncGetBucketInstanceInfo : public RGWAsyncRadosRequest {
+ rgw::sal::RadosStore* store;
+ rgw_bucket bucket;
+ const DoutPrefixProvider *dpp;
+
+protected:
+ int _send_request(const DoutPrefixProvider *dpp) override;
+public:
+ RGWAsyncGetBucketInstanceInfo(RGWCoroutine *caller, RGWAioCompletionNotifier *cn,
+ rgw::sal::RadosStore* _store, const rgw_bucket& bucket,
+ const DoutPrefixProvider *dpp)
+ : RGWAsyncRadosRequest(caller, cn), store(_store), bucket(bucket), dpp(dpp) {}
+
+ RGWBucketInfo bucket_info;
+ std::map<std::string, bufferlist> attrs;
+};
+
+class RGWAsyncPutBucketInstanceInfo : public RGWAsyncRadosRequest {
+ rgw::sal::RadosStore* store;
+ RGWBucketInfo& bucket_info;
+ bool exclusive;
+ real_time mtime;
+ std::map<std::string, ceph::bufferlist>* attrs;
+ const DoutPrefixProvider *dpp;
+
+protected:
+ int _send_request(const DoutPrefixProvider *dpp) override;
+public:
+ RGWAsyncPutBucketInstanceInfo(RGWCoroutine* caller,
+ RGWAioCompletionNotifier* cn,
+ rgw::sal::RadosStore* store,
+ RGWBucketInfo& bucket_info,
+ bool exclusive,
+ real_time mtime,
+ std::map<std::string, ceph::bufferlist>* attrs,
+ const DoutPrefixProvider* dpp)
+ : RGWAsyncRadosRequest(caller, cn), store(store), bucket_info(bucket_info),
+ exclusive(exclusive), mtime(mtime), attrs(attrs), dpp(dpp) {}
+};
+
+class RGWGetBucketInstanceInfoCR : public RGWSimpleCoroutine {
+ RGWAsyncRadosProcessor *async_rados;
+ rgw::sal::RadosStore* store;
+ rgw_bucket bucket;
+ RGWBucketInfo *bucket_info;
+ std::map<std::string, bufferlist> *pattrs;
+ const DoutPrefixProvider *dpp;
+
+ RGWAsyncGetBucketInstanceInfo *req{nullptr};
+
+public:
+ // rgw_bucket constructor
+ RGWGetBucketInstanceInfoCR(RGWAsyncRadosProcessor *_async_rados, rgw::sal::RadosStore* _store,
+ const rgw_bucket& _bucket, RGWBucketInfo *_bucket_info,
+ std::map<std::string, bufferlist> *_pattrs, const DoutPrefixProvider *dpp)
+ : RGWSimpleCoroutine(_store->ctx()), async_rados(_async_rados), store(_store),
+ bucket(_bucket), bucket_info(_bucket_info), pattrs(_pattrs), dpp(dpp) {}
+ ~RGWGetBucketInstanceInfoCR() override {
+ request_cleanup();
+ }
+ void request_cleanup() override {
+ if (req) {
+ req->finish();
+ req = NULL;
+ }
+ }
+
+ int send_request(const DoutPrefixProvider *dpp) override {
+ req = new RGWAsyncGetBucketInstanceInfo(this, stack->create_completion_notifier(), store, bucket, dpp);
+ async_rados->queue(req);
+ return 0;
+ }
+ int request_complete() override {
+ if (bucket_info) {
+ *bucket_info = std::move(req->bucket_info);
+ }
+ if (pattrs) {
+ *pattrs = std::move(req->attrs);
+ }
+ return req->get_ret_status();
+ }
+};
+
+class RGWPutBucketInstanceInfoCR : public RGWSimpleCoroutine {
+ RGWAsyncRadosProcessor *async_rados;
+ rgw::sal::RadosStore* store;
+ RGWBucketInfo& bucket_info;
+ bool exclusive;
+ real_time mtime;
+ std::map<std::string, ceph::bufferlist>* attrs;
+ const DoutPrefixProvider *dpp;
+
+ RGWAsyncPutBucketInstanceInfo* req = nullptr;
+
+public:
+ // rgw_bucket constructor
+ RGWPutBucketInstanceInfoCR(RGWAsyncRadosProcessor *async_rados,
+ rgw::sal::RadosStore* store,
+ RGWBucketInfo& bucket_info,
+ bool exclusive,
+ real_time mtime,
+ std::map<std::string, ceph::bufferlist>* attrs,
+ const DoutPrefixProvider *dpp)
+ : RGWSimpleCoroutine(store->ctx()), async_rados(async_rados), store(store),
+ bucket_info(bucket_info), exclusive(exclusive),
+ mtime(mtime), attrs(attrs), dpp(dpp) {}
+ ~RGWPutBucketInstanceInfoCR() override {
+ request_cleanup();
+ }
+ void request_cleanup() override {
+ if (req) {
+ req->finish();
+ req = nullptr;
+ }
+ }
+
+ int send_request(const DoutPrefixProvider *dpp) override {
+ req = new RGWAsyncPutBucketInstanceInfo(this,
+ stack->create_completion_notifier(),
+ store, bucket_info, exclusive,
+ mtime, attrs, dpp);
+ async_rados->queue(req);
+ return 0;
+ }
+ int request_complete() override {
+ return req->get_ret_status();
+ }
+};
+
+class RGWRadosBILogTrimCR : public RGWSimpleCoroutine {
+ const RGWBucketInfo& bucket_info;
+ int shard_id;
+ const rgw::bucket_index_layout_generation generation;
+ RGWRados::BucketShard bs;
+ std::string start_marker;
+ std::string end_marker;
+ boost::intrusive_ptr<RGWAioCompletionNotifier> cn;
+ public:
+ RGWRadosBILogTrimCR(const DoutPrefixProvider *dpp,
+ rgw::sal::RadosStore* store, const RGWBucketInfo& bucket_info,
+ int shard_id,
+ const rgw::bucket_index_layout_generation& generation,
+ const std::string& start_marker,
+ const std::string& end_marker);
+
+ int send_request(const DoutPrefixProvider *dpp) override;
+ int request_complete() override;
+};
+
+class RGWAsyncFetchRemoteObj : public RGWAsyncRadosRequest {
+ rgw::sal::RadosStore* store;
+ rgw_zone_id source_zone;
+
+ std::optional<rgw_user> user_id;
+
+ rgw_bucket src_bucket;
+ std::optional<rgw_placement_rule> dest_placement_rule;
+ RGWBucketInfo dest_bucket_info;
+
+ rgw_obj_key key;
+ std::optional<rgw_obj_key> dest_key;
+ std::optional<uint64_t> versioned_epoch;
+
+ real_time src_mtime;
+
+ bool copy_if_newer;
+ std::shared_ptr<RGWFetchObjFilter> filter;
+ rgw_zone_set_entry source_trace_entry;
+ rgw_zone_set zones_trace;
+ PerfCounters* counters;
+ const DoutPrefixProvider *dpp;
+
+protected:
+ int _send_request(const DoutPrefixProvider *dpp) override;
+public:
+ RGWAsyncFetchRemoteObj(RGWCoroutine *caller, RGWAioCompletionNotifier *cn, rgw::sal::RadosStore* _store,
+ const rgw_zone_id& _source_zone,
+ std::optional<rgw_user>& _user_id,
+ const rgw_bucket& _src_bucket,
+ std::optional<rgw_placement_rule> _dest_placement_rule,
+ const RGWBucketInfo& _dest_bucket_info,
+ const rgw_obj_key& _key,
+ const std::optional<rgw_obj_key>& _dest_key,
+ std::optional<uint64_t> _versioned_epoch,
+ bool _if_newer,
+ std::shared_ptr<RGWFetchObjFilter> _filter,
+ const rgw_zone_set_entry& source_trace_entry,
+ rgw_zone_set *_zones_trace,
+ PerfCounters* counters, const DoutPrefixProvider *dpp)
+ : RGWAsyncRadosRequest(caller, cn), store(_store),
+ source_zone(_source_zone),
+ user_id(_user_id),
+ src_bucket(_src_bucket),
+ dest_placement_rule(_dest_placement_rule),
+ dest_bucket_info(_dest_bucket_info),
+ key(_key),
+ dest_key(_dest_key),
+ versioned_epoch(_versioned_epoch),
+ copy_if_newer(_if_newer),
+ filter(_filter),
+ source_trace_entry(source_trace_entry),
+ counters(counters),
+ dpp(dpp)
+ {
+ if (_zones_trace) {
+ zones_trace = *_zones_trace;
+ }
+ }
+};
+
+class RGWFetchRemoteObjCR : public RGWSimpleCoroutine {
+ CephContext *cct;
+ RGWAsyncRadosProcessor *async_rados;
+ rgw::sal::RadosStore* store;
+ rgw_zone_id source_zone;
+
+ std::optional<rgw_user> user_id;
+
+ rgw_bucket src_bucket;
+ std::optional<rgw_placement_rule> dest_placement_rule;
+ RGWBucketInfo dest_bucket_info;
+
+ rgw_obj_key key;
+ std::optional<rgw_obj_key> dest_key;
+ std::optional<uint64_t> versioned_epoch;
+
+ real_time src_mtime;
+
+ bool copy_if_newer;
+
+ std::shared_ptr<RGWFetchObjFilter> filter;
+
+ RGWAsyncFetchRemoteObj *req;
+ const rgw_zone_set_entry& source_trace_entry;
+ rgw_zone_set *zones_trace;
+ PerfCounters* counters;
+ const DoutPrefixProvider *dpp;
+
+public:
+ RGWFetchRemoteObjCR(RGWAsyncRadosProcessor *_async_rados, rgw::sal::RadosStore* _store,
+ const rgw_zone_id& _source_zone,
+ std::optional<rgw_user> _user_id,
+ const rgw_bucket& _src_bucket,
+ std::optional<rgw_placement_rule> _dest_placement_rule,
+ const RGWBucketInfo& _dest_bucket_info,
+ const rgw_obj_key& _key,
+ const std::optional<rgw_obj_key>& _dest_key,
+ std::optional<uint64_t> _versioned_epoch,
+ bool _if_newer,
+ std::shared_ptr<RGWFetchObjFilter> _filter,
+ const rgw_zone_set_entry& source_trace_entry,
+ rgw_zone_set *_zones_trace,
+ PerfCounters* counters, const DoutPrefixProvider *dpp)
+ : RGWSimpleCoroutine(_store->ctx()), cct(_store->ctx()),
+ async_rados(_async_rados), store(_store),
+ source_zone(_source_zone),
+ user_id(_user_id),
+ src_bucket(_src_bucket),
+ dest_placement_rule(_dest_placement_rule),
+ dest_bucket_info(_dest_bucket_info),
+ key(_key),
+ dest_key(_dest_key),
+ versioned_epoch(_versioned_epoch),
+ copy_if_newer(_if_newer),
+ filter(_filter),
+ req(NULL),
+ source_trace_entry(source_trace_entry),
+ zones_trace(_zones_trace), counters(counters), dpp(dpp) {}
+
+
+ ~RGWFetchRemoteObjCR() override {
+ request_cleanup();
+ }
+
+ void request_cleanup() override {
+ if (req) {
+ req->finish();
+ req = NULL;
+ }
+ }
+
+ int send_request(const DoutPrefixProvider *dpp) override {
+ req = new RGWAsyncFetchRemoteObj(this, stack->create_completion_notifier(), store,
+ source_zone, user_id, src_bucket, dest_placement_rule, dest_bucket_info,
+ key, dest_key, versioned_epoch, copy_if_newer, filter,
+ source_trace_entry, zones_trace, counters, dpp);
+ async_rados->queue(req);
+ return 0;
+ }
+
+ int request_complete() override {
+ return req->get_ret_status();
+ }
+};
+
+class RGWAsyncStatRemoteObj : public RGWAsyncRadosRequest {
+ rgw::sal::RadosStore* store;
+ rgw_zone_id source_zone;
+
+ rgw_bucket src_bucket;
+ rgw_obj_key key;
+
+ ceph::real_time *pmtime;
+ uint64_t *psize;
+ std::string *petag;
+ std::map<std::string, bufferlist> *pattrs;
+ std::map<std::string, std::string> *pheaders;
+
+protected:
+ int _send_request(const DoutPrefixProvider *dpp) override;
+public:
+ RGWAsyncStatRemoteObj(RGWCoroutine *caller, RGWAioCompletionNotifier *cn, rgw::sal::RadosStore* _store,
+ const rgw_zone_id& _source_zone,
+ rgw_bucket& _src_bucket,
+ const rgw_obj_key& _key,
+ ceph::real_time *_pmtime,
+ uint64_t *_psize,
+ std::string *_petag,
+ std::map<std::string, bufferlist> *_pattrs,
+ std::map<std::string, std::string> *_pheaders) : RGWAsyncRadosRequest(caller, cn), store(_store),
+ source_zone(_source_zone),
+ src_bucket(_src_bucket),
+ key(_key),
+ pmtime(_pmtime),
+ psize(_psize),
+ petag(_petag),
+ pattrs(_pattrs),
+ pheaders(_pheaders) {}
+};
+
+class RGWStatRemoteObjCR : public RGWSimpleCoroutine {
+ CephContext *cct;
+ RGWAsyncRadosProcessor *async_rados;
+ rgw::sal::RadosStore* store;
+ rgw_zone_id source_zone;
+
+ rgw_bucket src_bucket;
+ rgw_obj_key key;
+
+ ceph::real_time *pmtime;
+ uint64_t *psize;
+ std::string *petag;
+ std::map<std::string, bufferlist> *pattrs;
+ std::map<std::string, std::string> *pheaders;
+
+ RGWAsyncStatRemoteObj *req;
+
+public:
+ RGWStatRemoteObjCR(RGWAsyncRadosProcessor *_async_rados, rgw::sal::RadosStore* _store,
+ const rgw_zone_id& _source_zone,
+ rgw_bucket& _src_bucket,
+ const rgw_obj_key& _key,
+ ceph::real_time *_pmtime,
+ uint64_t *_psize,
+ std::string *_petag,
+ std::map<std::string, bufferlist> *_pattrs,
+ std::map<std::string, std::string> *_pheaders) : RGWSimpleCoroutine(_store->ctx()), cct(_store->ctx()),
+ async_rados(_async_rados), store(_store),
+ source_zone(_source_zone),
+ src_bucket(_src_bucket),
+ key(_key),
+ pmtime(_pmtime),
+ psize(_psize),
+ petag(_petag),
+ pattrs(_pattrs),
+ pheaders(_pheaders),
+ req(NULL) {}
+
+
+ ~RGWStatRemoteObjCR() override {
+ request_cleanup();
+ }
+
+ void request_cleanup() override {
+ if (req) {
+ req->finish();
+ req = NULL;
+ }
+ }
+
+ int send_request(const DoutPrefixProvider *dpp) override {
+ req = new RGWAsyncStatRemoteObj(this, stack->create_completion_notifier(), store, source_zone,
+ src_bucket, key, pmtime, psize, petag, pattrs, pheaders);
+ async_rados->queue(req);
+ return 0;
+ }
+
+ int request_complete() override {
+ return req->get_ret_status();
+ }
+};
+
+class RGWAsyncRemoveObj : public RGWAsyncRadosRequest {
+ const DoutPrefixProvider *dpp;
+ rgw::sal::RadosStore* store;
+ rgw_zone_id source_zone;
+
+ std::unique_ptr<rgw::sal::Bucket> bucket;
+ std::unique_ptr<rgw::sal::Object> obj;
+
+ std::string owner;
+ std::string owner_display_name;
+ bool versioned;
+ uint64_t versioned_epoch;
+ std::string marker_version_id;
+
+ bool del_if_older;
+ ceph::real_time timestamp;
+ rgw_zone_set zones_trace;
+
+protected:
+ int _send_request(const DoutPrefixProvider *dpp) override;
+public:
+ RGWAsyncRemoveObj(const DoutPrefixProvider *_dpp, RGWCoroutine *caller, RGWAioCompletionNotifier *cn,
+ rgw::sal::RadosStore* _store,
+ const rgw_zone_id& _source_zone,
+ RGWBucketInfo& _bucket_info,
+ const rgw_obj_key& _key,
+ const std::string& _owner,
+ const std::string& _owner_display_name,
+ bool _versioned,
+ uint64_t _versioned_epoch,
+ bool _delete_marker,
+ bool _if_older,
+ real_time& _timestamp,
+ rgw_zone_set* _zones_trace) : RGWAsyncRadosRequest(caller, cn), dpp(_dpp), store(_store),
+ source_zone(_source_zone),
+ owner(_owner),
+ owner_display_name(_owner_display_name),
+ versioned(_versioned),
+ versioned_epoch(_versioned_epoch),
+ del_if_older(_if_older),
+ timestamp(_timestamp) {
+ if (_delete_marker) {
+ marker_version_id = _key.instance;
+ }
+
+ if (_zones_trace) {
+ zones_trace = *_zones_trace;
+ }
+ store->get_bucket(nullptr, _bucket_info, &bucket);
+ obj = bucket->get_object(_key);
+ }
+};
+
+class RGWRemoveObjCR : public RGWSimpleCoroutine {
+ const DoutPrefixProvider *dpp;
+ CephContext *cct;
+ RGWAsyncRadosProcessor *async_rados;
+ rgw::sal::RadosStore* store;
+ rgw_zone_id source_zone;
+
+ RGWBucketInfo bucket_info;
+
+ rgw_obj_key key;
+ bool versioned;
+ uint64_t versioned_epoch;
+ bool delete_marker;
+ std::string owner;
+ std::string owner_display_name;
+
+ bool del_if_older;
+ real_time timestamp;
+
+ RGWAsyncRemoveObj *req;
+
+ rgw_zone_set *zones_trace;
+
+public:
+ RGWRemoveObjCR(const DoutPrefixProvider *_dpp, RGWAsyncRadosProcessor *_async_rados, rgw::sal::RadosStore* _store,
+ const rgw_zone_id& _source_zone,
+ RGWBucketInfo& _bucket_info,
+ const rgw_obj_key& _key,
+ bool _versioned,
+ uint64_t _versioned_epoch,
+ std::string *_owner,
+ std::string *_owner_display_name,
+ bool _delete_marker,
+ real_time *_timestamp,
+ rgw_zone_set *_zones_trace) : RGWSimpleCoroutine(_store->ctx()), dpp(_dpp), cct(_store->ctx()),
+ async_rados(_async_rados), store(_store),
+ source_zone(_source_zone),
+ bucket_info(_bucket_info),
+ key(_key),
+ versioned(_versioned),
+ versioned_epoch(_versioned_epoch),
+ delete_marker(_delete_marker), req(NULL), zones_trace(_zones_trace) {
+ del_if_older = (_timestamp != NULL);
+ if (_timestamp) {
+ timestamp = *_timestamp;
+ }
+
+ if (_owner) {
+ owner = *_owner;
+ }
+
+ if (_owner_display_name) {
+ owner_display_name = *_owner_display_name;
+ }
+ }
+ ~RGWRemoveObjCR() override {
+ request_cleanup();
+ }
+
+ void request_cleanup() override {
+ if (req) {
+ req->finish();
+ req = NULL;
+ }
+ }
+
+ int send_request(const DoutPrefixProvider *dpp) override {
+ req = new RGWAsyncRemoveObj(dpp, this, stack->create_completion_notifier(), store, source_zone, bucket_info,
+ key, owner, owner_display_name, versioned, versioned_epoch,
+ delete_marker, del_if_older, timestamp, zones_trace);
+ async_rados->queue(req);
+ return 0;
+ }
+
+ int request_complete() override {
+ return req->get_ret_status();
+ }
+};
+
+/// \brief Collect average latency
+///
+/// Used in data sync to back off on concurrency when latency of lock
+/// operations rises.
+///
+/// \warning This class is not thread safe. We do not use a mutex
+/// because all coroutines spawned by RGWDataSyncCR share a single thread.
+class LatencyMonitor {
+ ceph::timespan total;
+ std::uint64_t count = 0;
+
+public:
+
+ LatencyMonitor() = default;
+ void add_latency(ceph::timespan latency) {
+ total += latency;
+ ++count;
+ }
+
+ ceph::timespan avg_latency() {
+ using namespace std::literals;
+ return count == 0 ? 0s : total / count;
+ }
+};
+
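+// Keeps renewing a timed cls lock on `obj` (duration `interval` seconds)
+// until go_down() is called; is_locked() returns false once no renewal has
+// been attempted within that interval.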
+class RGWContinuousLeaseCR : public RGWCoroutine {
+ RGWAsyncRadosProcessor* async_rados;
+ rgw::sal::RadosStore* store;
+
+ const rgw_raw_obj obj;
+
+ const std::string lock_name;
+ const std::string cookie{RGWSimpleRadosLockCR::gen_random_cookie(cct)};
+
+ int interval;
+ bool going_down{false};
+ bool locked{false};
+
+ const ceph::timespan interval_tolerance;
+ const ceph::timespan ts_interval;
+
+ RGWCoroutine* caller;
+
+ bool aborted{false};
+
+ ceph::coarse_mono_time last_renew_try_time;
+ ceph::coarse_mono_time current_time;
+
+ LatencyMonitor* latency;
+
+public:
+ RGWContinuousLeaseCR(RGWAsyncRadosProcessor* async_rados,
+ rgw::sal::RadosStore* _store,
+ rgw_raw_obj obj, std::string lock_name,
+ int interval, RGWCoroutine* caller,
+ LatencyMonitor* const latency)
+ : RGWCoroutine(_store->ctx()), async_rados(async_rados), store(_store),
+ obj(std::move(obj)), lock_name(std::move(lock_name)),
+ interval(interval), interval_tolerance(ceph::make_timespan(9*interval/10)),
+ ts_interval(ceph::make_timespan(interval)), caller(caller), latency(latency)
+ {}
+
+ virtual ~RGWContinuousLeaseCR() override;
+
+ int operate(const DoutPrefixProvider *dpp) override;
+
+ bool is_locked() const {
+ if (ceph::coarse_mono_clock::now() - last_renew_try_time > ts_interval) {
+ return false;
+ }
+ return locked;
+ }
+
+ void set_locked(bool status) {
+ locked = status;
+ }
+
+ void go_down() {
+ going_down = true;
+ wakeup();
+ }
+
+ void abort() {
+ aborted = true;
+ }
+};
+
+class RGWRadosTimelogAddCR : public RGWSimpleCoroutine {
+ const DoutPrefixProvider *dpp;
+ rgw::sal::RadosStore* store;
+ std::list<cls_log_entry> entries;
+
+ std::string oid;
+
+ boost::intrusive_ptr<RGWAioCompletionNotifier> cn;
+
+public:
+ RGWRadosTimelogAddCR(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* _store, const std::string& _oid,
+ const cls_log_entry& entry);
+
+ int send_request(const DoutPrefixProvider *dpp) override;
+ int request_complete() override;
+};
+
+class RGWRadosTimelogTrimCR : public RGWSimpleCoroutine {
+ const DoutPrefixProvider *dpp;
+ rgw::sal::RadosStore* store;
+ boost::intrusive_ptr<RGWAioCompletionNotifier> cn;
+ protected:
+ std::string oid;
+ real_time start_time;
+ real_time end_time;
+ std::string from_marker;
+ std::string to_marker;
+
+ public:
+ RGWRadosTimelogTrimCR(const DoutPrefixProvider *dpp,
+ rgw::sal::RadosStore* store, const std::string& oid,
+ const real_time& start_time, const real_time& end_time,
+ const std::string& from_marker,
+ const std::string& to_marker);
+
+ int send_request(const DoutPrefixProvider *dpp) override;
+ int request_complete() override;
+};
+
+// wrapper to update last_trim_marker on success
+class RGWSyncLogTrimCR : public RGWRadosTimelogTrimCR {
+ CephContext *cct;
+ std::string *last_trim_marker;
+ public:
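+  // Sentinel passed as to_marker to request trimming the whole log;
+  // request_complete() deliberately does not record it in last_trim_marker.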
+ static constexpr const char* max_marker = "99999999";
+
+ RGWSyncLogTrimCR(const DoutPrefixProvider *dpp,
+ rgw::sal::RadosStore* store, const std::string& oid,
+ const std::string& to_marker, std::string *last_trim_marker);
+ int request_complete() override;
+};
+
+class RGWAsyncStatObj : public RGWAsyncRadosRequest {
+ const DoutPrefixProvider *dpp;
+ rgw::sal::RadosStore* store;
+ RGWBucketInfo bucket_info;
+ rgw_obj obj;
+ uint64_t *psize;
+ real_time *pmtime;
+ uint64_t *pepoch;
+ RGWObjVersionTracker *objv_tracker;
+protected:
+ int _send_request(const DoutPrefixProvider *dpp) override;
+public:
+ RGWAsyncStatObj(const DoutPrefixProvider *dpp, RGWCoroutine *caller, RGWAioCompletionNotifier *cn, rgw::sal::RadosStore* store,
+ const RGWBucketInfo& _bucket_info, const rgw_obj& obj, uint64_t *psize = nullptr,
+ real_time *pmtime = nullptr, uint64_t *pepoch = nullptr,
+ RGWObjVersionTracker *objv_tracker = nullptr)
+    : RGWAsyncRadosRequest(caller, cn), dpp(dpp), store(store), bucket_info(_bucket_info),
+      obj(obj), psize(psize), pmtime(pmtime), pepoch(pepoch), objv_tracker(objv_tracker) {}
+};
+
+class RGWStatObjCR : public RGWSimpleCoroutine {
+ const DoutPrefixProvider *dpp;
+ rgw::sal::RadosStore* store;
+ RGWAsyncRadosProcessor *async_rados;
+ RGWBucketInfo bucket_info;
+ rgw_obj obj;
+ uint64_t *psize;
+ real_time *pmtime;
+ uint64_t *pepoch;
+ RGWObjVersionTracker *objv_tracker;
+ RGWAsyncStatObj *req = nullptr;
+ public:
+ RGWStatObjCR(const DoutPrefixProvider *dpp, RGWAsyncRadosProcessor *async_rados, rgw::sal::RadosStore* store,
+ const RGWBucketInfo& _bucket_info, const rgw_obj& obj, uint64_t *psize = nullptr,
+ real_time* pmtime = nullptr, uint64_t *pepoch = nullptr,
+ RGWObjVersionTracker *objv_tracker = nullptr);
+ ~RGWStatObjCR() override {
+ request_cleanup();
+ }
+ void request_cleanup() override;
+
+ int send_request(const DoutPrefixProvider *dpp) override;
+ int request_complete() override;
+};
+
+/// coroutine wrapper for IoCtx::aio_notify()
+class RGWRadosNotifyCR : public RGWSimpleCoroutine {
+ rgw::sal::RadosStore* const store;
+ const rgw_raw_obj obj;
+ bufferlist request;
+ const uint64_t timeout_ms;
+ bufferlist *response;
+ rgw_rados_ref ref;
+ boost::intrusive_ptr<RGWAioCompletionNotifier> cn;
+
+public:
+ RGWRadosNotifyCR(rgw::sal::RadosStore* store, const rgw_raw_obj& obj,
+ bufferlist& request, uint64_t timeout_ms,
+ bufferlist *response);
+
+ int send_request(const DoutPrefixProvider *dpp) override;
+ int request_complete() override;
+};
+
+class RGWDataPostNotifyCR : public RGWCoroutine {
+ RGWRados *store;
+ RGWHTTPManager& http_manager;
+ bc::flat_map<int, bc::flat_set<rgw_data_notify_entry> >& shards;
+ const char *source_zone;
+ RGWRESTConn *conn;
+
+public:
+ RGWDataPostNotifyCR(RGWRados *_store, RGWHTTPManager& _http_manager, bc::flat_map<int,
+ bc::flat_set<rgw_data_notify_entry> >& _shards, const char *_zone, RGWRESTConn *_conn)
+ : RGWCoroutine(_store->ctx()), store(_store), http_manager(_http_manager),
+ shards(_shards), source_zone(_zone), conn(_conn) {}
+
+ int operate(const DoutPrefixProvider* dpp) override;
+};
+
diff --git a/src/rgw/driver/rados/rgw_cr_tools.cc b/src/rgw/driver/rados/rgw_cr_tools.cc
new file mode 100644
index 000000000..94665a35a
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_cr_tools.cc
@@ -0,0 +1,292 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "common/errno.h"
+
+#include "rgw_cr_tools.h"
+#include "rgw_bucket.h"
+#include "rgw_user.h"
+#include "rgw_op.h"
+#include "rgw_acl_s3.h"
+#include "rgw_zone.h"
+
+#include "services/svc_zone.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
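+// _send_request() bodies for the parameter/result coroutine aliases declared
+// in rgw_cr_tools.h; each specialization runs synchronously on an
+// RGWAsyncRadosProcessor worker thread.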
+template<>
+int RGWUserCreateCR::Request::_send_request(const DoutPrefixProvider *dpp)
+{
+ CephContext *cct = store->ctx();
+
+ const int32_t default_max_buckets =
+ cct->_conf.get_val<int64_t>("rgw_user_max_buckets");
+
+ RGWUserAdminOpState op_state(store);
+
+ auto& user = params.user;
+
+ op_state.set_user_id(user);
+ op_state.set_display_name(params.display_name);
+ op_state.set_user_email(params.email);
+ op_state.set_caps(params.caps);
+ op_state.set_access_key(params.access_key);
+ op_state.set_secret_key(params.secret_key);
+
+ if (!params.key_type.empty()) {
+ int32_t key_type = KEY_TYPE_S3;
+ if (params.key_type == "swift") {
+ key_type = KEY_TYPE_SWIFT;
+ }
+
+ op_state.set_key_type(key_type);
+ }
+
+ op_state.set_max_buckets(params.max_buckets.value_or(default_max_buckets));
+ op_state.set_suspension(params.suspended);
+ op_state.set_system(params.system);
+ op_state.set_exclusive(params.exclusive);
+
+ if (params.generate_key) {
+ op_state.set_generate_key();
+ }
+
+
+ if (params.apply_quota) {
+ RGWQuota quota;
+
+ if (cct->_conf->rgw_bucket_default_quota_max_objects >= 0) {
+ quota.bucket_quota.max_objects = cct->_conf->rgw_bucket_default_quota_max_objects;
+ quota.bucket_quota.enabled = true;
+ }
+
+ if (cct->_conf->rgw_bucket_default_quota_max_size >= 0) {
+ quota.bucket_quota.max_size = cct->_conf->rgw_bucket_default_quota_max_size;
+ quota.bucket_quota.enabled = true;
+ }
+
+ if (cct->_conf->rgw_user_default_quota_max_objects >= 0) {
+ quota.user_quota.max_objects = cct->_conf->rgw_user_default_quota_max_objects;
+ quota.user_quota.enabled = true;
+ }
+
+ if (cct->_conf->rgw_user_default_quota_max_size >= 0) {
+ quota.user_quota.max_size = cct->_conf->rgw_user_default_quota_max_size;
+ quota.user_quota.enabled = true;
+ }
+
+ if (quota.bucket_quota.enabled) {
+ op_state.set_bucket_quota(quota.bucket_quota);
+ }
+
+ if (quota.user_quota.enabled) {
+ op_state.set_user_quota(quota.user_quota);
+ }
+ }
+
+ RGWNullFlusher flusher;
+ return RGWUserAdminOp_User::create(dpp, store, op_state, flusher, null_yield);
+}
+
+template<>
+int RGWGetUserInfoCR::Request::_send_request(const DoutPrefixProvider *dpp)
+{
+ return store->ctl()->user->get_info_by_uid(dpp, params.user, result.get(), null_yield);
+}
+
+template<>
+int RGWGetBucketInfoCR::Request::_send_request(const DoutPrefixProvider *dpp)
+{
+ return store->get_bucket(dpp, nullptr, params.tenant, params.bucket_name, &result->bucket, null_yield);
+}
+
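+// Create a bucket in the local zone (the "Local" in the name suggests no
+// forwarding to the metadata master): validate the placement rule, detect an
+// existing bucket owned by another user, create the bucket instance, and link
+// it to the owner, unlinking again if the link step fails for a new bucket.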
+template<>
+int RGWBucketCreateLocalCR::Request::_send_request(const DoutPrefixProvider *dpp)
+{
+ CephContext *cct = store->ctx();
+ auto& zone_svc = store->svc()->zone;
+
+ const auto& user_info = params.user_info.get();
+ const auto& user = user_info->user_id;
+ const auto& bucket_name = params.bucket_name;
+ auto& placement_rule = params.placement_rule;
+
+ if (!placement_rule.empty() &&
+ !zone_svc->get_zone_params().valid_placement(placement_rule)) {
+ ldpp_dout(dpp, 0) << "placement target (" << placement_rule << ")"
+ << " doesn't exist in the placement targets of zonegroup"
+ << " (" << zone_svc->get_zonegroup().api_name << ")" << dendl;
+ return -ERR_INVALID_LOCATION_CONSTRAINT;
+ }
+
+  /* make sure we read the bucket info; it hasn't been read yet for this
+   * specific request */
+ RGWBucketInfo bucket_info;
+ map<string, bufferlist> bucket_attrs;
+
+ int ret = store->getRados()->get_bucket_info(store->svc(), user.tenant, bucket_name,
+ bucket_info, nullptr, null_yield, dpp, &bucket_attrs);
+ if (ret < 0 && ret != -ENOENT)
+ return ret;
+ bool bucket_exists = (ret != -ENOENT);
+
+ RGWAccessControlPolicy old_policy(cct);
+ ACLOwner bucket_owner;
+ bucket_owner.set_id(user);
+ bucket_owner.set_name(user_info->display_name);
+ if (bucket_exists) {
+ ret = rgw_op_get_bucket_policy_from_attr(dpp, cct, store, bucket_info,
+ bucket_attrs, &old_policy, null_yield);
+ if (ret >= 0) {
+ if (old_policy.get_owner().get_id().compare(user) != 0) {
+ return -EEXIST;
+ }
+ }
+ }
+
+ RGWBucketInfo master_info;
+ rgw_bucket *pmaster_bucket = nullptr;
+ uint32_t *pmaster_num_shards = nullptr;
+ real_time creation_time;
+
+ string zonegroup_id = zone_svc->get_zonegroup().get_id();
+
+ if (bucket_exists) {
+ rgw_placement_rule selected_placement_rule;
+ rgw_bucket bucket;
+ bucket.tenant = user.tenant;
+ bucket.name = bucket_name;
+ ret = zone_svc->select_bucket_placement(dpp, *user_info, zonegroup_id,
+ placement_rule,
+ &selected_placement_rule, nullptr, null_yield);
+ if (selected_placement_rule != bucket_info.placement_rule) {
+ ldpp_dout(dpp, 0) << "bucket already exists on a different placement rule: "
+ << " selected_rule= " << selected_placement_rule
+ << " existing_rule= " << bucket_info.placement_rule << dendl;
+ return -EEXIST;
+ }
+ }
+
+ /* Encode special metadata first as we're using std::map::emplace under
+ * the hood. This method will add the new items only if the map doesn't
+ * contain such keys yet. */
+ RGWAccessControlPolicy_S3 policy(cct);
+ policy.create_canned(bucket_owner, bucket_owner, string()); /* default private policy */
+ bufferlist aclbl;
+ policy.encode(aclbl);
+ map<string, buffer::list> attrs;
+ attrs.emplace(std::move(RGW_ATTR_ACL), std::move(aclbl));
+
+ RGWQuotaInfo quota_info;
+ const RGWQuotaInfo * pquota_info = nullptr;
+
+ rgw_bucket bucket;
+ bucket.tenant = user.tenant;
+ bucket.name = bucket_name;
+
+ RGWBucketInfo info;
+ obj_version ep_objv;
+
+ ret = store->getRados()->create_bucket(*user_info, bucket, zonegroup_id,
+ placement_rule, bucket_info.swift_ver_location,
+ pquota_info, attrs,
+ info, nullptr, &ep_objv, creation_time,
+ pmaster_bucket, pmaster_num_shards, null_yield, dpp, true);
+
+
+ if (ret && ret != -EEXIST)
+ return ret;
+
+ bool existed = (ret == -EEXIST);
+
+ if (existed) {
+ if (info.owner != user) {
+ ldpp_dout(dpp, 20) << "NOTICE: bucket already exists under a different user (bucket=" << bucket << " user=" << user << " bucket_owner=" << info.owner << dendl;
+ return -EEXIST;
+ }
+ bucket = info.bucket;
+ }
+
+ ret = store->ctl()->bucket->link_bucket(user, bucket, info.creation_time, null_yield, dpp, false);
+ if (ret && !existed && ret != -EEXIST) {
+ /* if it exists (or previously existed), don't remove it! */
+ int r = store->ctl()->bucket->unlink_bucket(user, bucket, null_yield, dpp);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "WARNING: failed to unlink bucket: ret=" << r << dendl;
+ }
+ } else if (ret == -EEXIST || (ret == 0 && existed)) {
+ ret = -ERR_BUCKET_EXISTS;
+ }
+
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: bucket creation (bucket=" << bucket << ") return ret=" << ret << dendl;
+ }
+
+ return ret;
+}
+
+template<>
+int RGWObjectSimplePutCR::Request::_send_request(const DoutPrefixProvider *dpp)
+{
+ RGWDataAccess::ObjectRef obj;
+
+ CephContext *cct = store->ctx();
+
+ int ret = params.bucket->get_object(params.key, &obj);
+ if (ret < 0) {
+ lderr(cct) << "ERROR: failed to get object: " << cpp_strerror(-ret) << dendl;
+ return -ret;
+ }
+
+ if (params.user_data) {
+ obj->set_user_data(*params.user_data);
+ }
+
+ ret = obj->put(params.data, params.attrs, dpp, null_yield);
+ if (ret < 0) {
+ ldpp_dout(dpp, -1) << "ERROR: put object returned error: " << cpp_strerror(-ret) << dendl;
+ }
+
+ return 0;
+}
+
+template<>
+int RGWBucketLifecycleConfigCR::Request::_send_request(const DoutPrefixProvider *dpp)
+{
+ CephContext *cct = store->ctx();
+
+ RGWLC *lc = store->getRados()->get_lc();
+ if (!lc) {
+ lderr(cct) << "ERROR: lifecycle object is not initialized!" << dendl;
+ return -EIO;
+ }
+
+ int ret = lc->set_bucket_config(params.bucket,
+ params.bucket_attrs,
+ &params.config);
+ if (ret < 0) {
+ lderr(cct) << "ERROR: failed to set lifecycle on bucke: " << cpp_strerror(-ret) << dendl;
+ return -ret;
+ }
+
+ return 0;
+}
+
+template<>
+int RGWBucketGetSyncPolicyHandlerCR::Request::_send_request(const DoutPrefixProvider *dpp)
+{
+ int r = store->ctl()->bucket->get_sync_policy_handler(params.zone,
+ params.bucket,
+ &result->policy_handler,
+ null_yield,
+ dpp);
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << "ERROR: " << __func__ << "(): get_sync_policy_handler() returned " << r << dendl;
+ return r;
+ }
+
+ return 0;
+}
diff --git a/src/rgw/driver/rados/rgw_cr_tools.h b/src/rgw/driver/rados/rgw_cr_tools.h
new file mode 100644
index 000000000..4cd97aa82
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_cr_tools.h
@@ -0,0 +1,85 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include "rgw_cr_rados.h"
+#include "rgw_tools.h"
+#include "rgw_lc.h"
+
+#include "services/svc_bucket_sync.h"
+
+struct rgw_user_create_params {
+ rgw_user user;
+ std::string display_name;
+ std::string email;
+ std::string access_key;
+ std::string secret_key;
+ std::string key_type; /* "swift" or "s3" */
+ std::string caps;
+
+ bool generate_key{true};
+ bool suspended{false};
+ std::optional<int32_t> max_buckets;
+ bool system{false};
+ bool exclusive{false};
+ bool apply_quota{true};
+};
+
+using RGWUserCreateCR = RGWSimpleWriteOnlyAsyncCR<rgw_user_create_params>;
+
+struct rgw_get_user_info_params {
+ rgw_user user;
+};
+
+using RGWGetUserInfoCR = RGWSimpleAsyncCR<rgw_get_user_info_params, RGWUserInfo>;
+
+struct rgw_get_bucket_info_params {
+ std::string tenant;
+ std::string bucket_name;
+};
+
+struct rgw_get_bucket_info_result {
+ std::unique_ptr<rgw::sal::Bucket> bucket;
+};
+
+using RGWGetBucketInfoCR = RGWSimpleAsyncCR<rgw_get_bucket_info_params, rgw_get_bucket_info_result>;
+
+struct rgw_bucket_create_local_params {
+ std::shared_ptr<RGWUserInfo> user_info;
+ std::string bucket_name;
+ rgw_placement_rule placement_rule;
+};
+
+using RGWBucketCreateLocalCR = RGWSimpleWriteOnlyAsyncCR<rgw_bucket_create_local_params>;
+
+struct rgw_object_simple_put_params {
+ RGWDataAccess::BucketRef bucket;
+ rgw_obj_key key;
+ bufferlist data;
+ std::map<std::string, bufferlist> attrs;
+ std::optional<std::string> user_data;
+};
+
+using RGWObjectSimplePutCR = RGWSimpleWriteOnlyAsyncCR<rgw_object_simple_put_params>;
+
+
+struct rgw_bucket_lifecycle_config_params {
+ rgw::sal::Bucket* bucket;
+ rgw::sal::Attrs bucket_attrs;
+ RGWLifecycleConfiguration config;
+};
+
+using RGWBucketLifecycleConfigCR = RGWSimpleWriteOnlyAsyncCR<rgw_bucket_lifecycle_config_params>;
+
+struct rgw_bucket_get_sync_policy_params {
+ std::optional<rgw_zone_id> zone;
+ std::optional<rgw_bucket> bucket;
+};
+
+struct rgw_bucket_get_sync_policy_result {
+ RGWBucketSyncPolicyHandlerRef policy_handler;
+};
+
+using RGWBucketGetSyncPolicyHandlerCR = RGWSimpleAsyncCR<rgw_bucket_get_sync_policy_params, rgw_bucket_get_sync_policy_result>;
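+
+// Typical call pattern (a sketch only; the exact constructor arguments are
+// those of the RGWSimpleWriteOnlyAsyncCR/RGWSimpleAsyncCR templates in
+// rgw_cr_rados.h): fill in the params struct, then from inside another
+// RGWCoroutine do `yield call(new RGWGetBucketInfoCR(...))` and check retcode,
+// as the data-sync coroutines do with their other helper CRs.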
+
diff --git a/src/rgw/driver/rados/rgw_d3n_datacache.cc b/src/rgw/driver/rados/rgw_d3n_datacache.cc
new file mode 100644
index 000000000..f1bf731ae
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_d3n_datacache.cc
@@ -0,0 +1,369 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "rgw_d3n_datacache.h"
+#include "rgw_rest_client.h"
+#include "rgw_auth_s3.h"
+#include "rgw_op.h"
+#include "rgw_common.h"
+#include "rgw_auth_s3.h"
+#include "rgw_op.h"
+#include "rgw_crypt_sanitize.h"
+#if defined(__linux__)
+#include <features.h>
+#endif
+
+#if __has_include(<filesystem>)
+#include <filesystem>
+namespace efs = std::filesystem;
+#else
+#include <experimental/filesystem>
+namespace efs = std::experimental::filesystem;
+#endif
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+int D3nCacheAioWriteRequest::d3n_libaio_prepare_write_op(bufferlist& bl, unsigned int len, string oid, string cache_location)
+{
+ std::string location = cache_location + url_encode(oid, true);
+ int r = 0;
+
+ lsubdout(g_ceph_context, rgw_datacache, 20) << "D3nDataCache: " << __func__ << "(): Write To Cache, location=" << location << dendl;
+ cb = new struct aiocb;
+ mode_t mode = S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH;
+ memset(cb, 0, sizeof(struct aiocb));
+ r = fd = ::open(location.c_str(), O_WRONLY | O_CREAT | O_TRUNC, mode);
+ if (fd < 0) {
+    ldout(cct, 0) << "ERROR: D3nCacheAioWriteRequest::d3n_libaio_prepare_write_op: open file failed, errno=" << errno << ", location='" << location.c_str() << "'" << dendl;
+ goto done;
+ }
+ if (g_conf()->rgw_d3n_l1_fadvise != POSIX_FADV_NORMAL)
+ posix_fadvise(fd, 0, 0, g_conf()->rgw_d3n_l1_fadvise);
+ cb->aio_fildes = fd;
+
+ data = malloc(len);
+ if (!data) {
+    ldout(cct, 0) << "ERROR: D3nCacheAioWriteRequest::d3n_libaio_prepare_write_op: memory allocation failed" << dendl;
+ goto close_file;
+ }
+ cb->aio_buf = data;
+ memcpy((void*)data, bl.c_str(), len);
+ cb->aio_nbytes = len;
+ goto done;
+
+close_file:
+ ::close(fd);
+done:
+ return r;
+}
+
+D3nDataCache::D3nDataCache()
+  : cct(nullptr), io_type(_io_type::ASYNC_IO), free_data_cache_size(0), outstanding_write_size(0), head(nullptr), tail(nullptr)
+{
+ lsubdout(g_ceph_context, rgw_datacache, 5) << "D3nDataCache: " << __func__ << "()" << dendl;
+}
+
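+// One-time setup: size the cache from rgw_d3n_l1_datacache_size, prepare the
+// persistent cache directory (optionally evicting its contents on start),
+// select the configured eviction policy, and initialize libaio where
+// available.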
+void D3nDataCache::init(CephContext *_cct) {
+ cct = _cct;
+ free_data_cache_size = cct->_conf->rgw_d3n_l1_datacache_size;
+ head = nullptr;
+ tail = nullptr;
+ cache_location = cct->_conf->rgw_d3n_l1_datacache_persistent_path;
+ if(cache_location.back() != '/') {
+ cache_location += "/";
+ }
+ try {
+ if (efs::exists(cache_location)) {
+ // d3n: evict the cache storage directory
+ if (g_conf()->rgw_d3n_l1_evict_cache_on_start) {
+ lsubdout(g_ceph_context, rgw, 5) << "D3nDataCache: init: evicting the persistent storage directory on start" << dendl;
+ for (auto& p : efs::directory_iterator(cache_location)) {
+ efs::remove_all(p.path());
+ }
+ }
+ } else {
+ // create the cache storage directory
+ lsubdout(g_ceph_context, rgw, 5) << "D3nDataCache: init: creating the persistent storage directory on start" << dendl;
+ efs::create_directories(cache_location);
+ }
+ } catch (const efs::filesystem_error& e) {
+ lderr(g_ceph_context) << "D3nDataCache: init: ERROR initializing the cache storage directory '" << cache_location <<
+ "' : " << e.what() << dendl;
+ }
+
+ auto conf_eviction_policy = cct->_conf.get_val<std::string>("rgw_d3n_l1_eviction_policy");
+ ceph_assert(conf_eviction_policy == "lru" || conf_eviction_policy == "random");
+ if (conf_eviction_policy == "lru")
+ eviction_policy = _eviction_policy::LRU;
+ if (conf_eviction_policy == "random")
+ eviction_policy = _eviction_policy::RANDOM;
+
+#if defined(HAVE_LIBAIO) && defined(__GLIBC__)
+ // libaio setup
+ struct aioinit ainit{0};
+ ainit.aio_threads = cct->_conf.get_val<int64_t>("rgw_d3n_libaio_aio_threads");
+ ainit.aio_num = cct->_conf.get_val<int64_t>("rgw_d3n_libaio_aio_num");
+ ainit.aio_idle_time = 10;
+ aio_init(&ainit);
+#endif
+}
+
+int D3nDataCache::d3n_io_write(bufferlist& bl, unsigned int len, std::string oid)
+{
+ std::string location = cache_location + url_encode(oid, true);
+
+ lsubdout(g_ceph_context, rgw_datacache, 20) << "D3nDataCache: " << __func__ << "(): location=" << location << dendl;
+ FILE *cache_file = nullptr;
+ int r = 0;
+ size_t nbytes = 0;
+
+ cache_file = fopen(location.c_str(), "w+");
+ if (cache_file == nullptr) {
+    ldout(cct, 0) << "ERROR: D3nDataCache::d3n_io_write: fopen() failed, errno=" << errno << dendl;
+ return -errno;
+ }
+
+ nbytes = fwrite(bl.c_str(), 1, len, cache_file);
+ if (nbytes != len) {
+    ldout(cct, 0) << "ERROR: D3nDataCache::d3n_io_write: fwrite failed: nbytes!=len, nbytes=" << nbytes << ", len=" << len << dendl;
+    fclose(cache_file);
+    return -EIO;
+ }
+
+ r = fclose(cache_file);
+ if (r != 0) {
+    ldout(cct, 0) << "ERROR: D3nDataCache::d3n_io_write: fclose() failed, errno=" << errno << dendl;
+ return -errno;
+ }
+
+  { // update cache_map entries for new chunk in cache
+    const std::lock_guard l(d3n_cache_lock);
+    D3nChunkDataInfo* chunk_info = new D3nChunkDataInfo;
+ chunk_info->oid = oid;
+ chunk_info->set_ctx(cct);
+ chunk_info->size = len;
+ d3n_cache_map.insert(pair<string, D3nChunkDataInfo*>(oid, chunk_info));
+ }
+
+ return r;
+}
+
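+// SIGEV_THREAD completion callback for the libaio write: recover the request
+// from sival_ptr and hand it back to the owning D3nDataCache instance.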
+void d3n_libaio_write_cb(sigval sigval)
+{
+ lsubdout(g_ceph_context, rgw_datacache, 30) << "D3nDataCache: " << __func__ << "()" << dendl;
+ D3nCacheAioWriteRequest* c = static_cast<D3nCacheAioWriteRequest*>(sigval.sival_ptr);
+ c->priv_data->d3n_libaio_write_completion_cb(c);
+}
+
+
+void D3nDataCache::d3n_libaio_write_completion_cb(D3nCacheAioWriteRequest* c)
+{
+ D3nChunkDataInfo* chunk_info{nullptr};
+
+ ldout(cct, 5) << "D3nDataCache: " << __func__ << "(): oid=" << c->oid << dendl;
+
+ { // update cache_map entries for new chunk in cache
+ const std::lock_guard l(d3n_cache_lock);
+ d3n_outstanding_write_list.erase(c->oid);
+ chunk_info = new D3nChunkDataInfo;
+ chunk_info->oid = c->oid;
+ chunk_info->set_ctx(cct);
+ chunk_info->size = c->cb->aio_nbytes;
+ d3n_cache_map.insert(pair<string, D3nChunkDataInfo*>(c->oid, chunk_info));
+ }
+
+ { // update free size
+ const std::lock_guard l(d3n_eviction_lock);
+ free_data_cache_size -= c->cb->aio_nbytes;
+ outstanding_write_size -= c->cb->aio_nbytes;
+ lru_insert_head(chunk_info);
+ }
+ delete c;
+ c = nullptr;
+}
+
+int D3nDataCache::d3n_libaio_create_write_request(bufferlist& bl, unsigned int len, std::string oid)
+{
+ lsubdout(g_ceph_context, rgw_datacache, 30) << "D3nDataCache: " << __func__ << "(): Write To Cache, oid=" << oid << ", len=" << len << dendl;
+ struct D3nCacheAioWriteRequest* wr = new struct D3nCacheAioWriteRequest(cct);
+ int r=0;
+ if ((r = wr->d3n_libaio_prepare_write_op(bl, len, oid, cache_location)) < 0) {
+ ldout(cct, 0) << "ERROR: D3nDataCache: " << __func__ << "() prepare libaio write op r=" << r << dendl;
+ goto done;
+ }
+ wr->cb->aio_sigevent.sigev_notify = SIGEV_THREAD;
+ wr->cb->aio_sigevent.sigev_notify_function = d3n_libaio_write_cb;
+ wr->cb->aio_sigevent.sigev_notify_attributes = nullptr;
+ wr->cb->aio_sigevent.sigev_value.sival_ptr = (void*)wr;
+ wr->oid = oid;
+ wr->priv_data = this;
+
+ if ((r = ::aio_write(wr->cb)) != 0) {
+ ldout(cct, 0) << "ERROR: D3nDataCache: " << __func__ << "() aio_write r=" << r << dendl;
+ goto error;
+ }
+ return 0;
+
+error:
+ delete wr;
+done:
+ return r;
+}
+
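+// Cache-admission path: skip oids that are already cached or have a write in
+// flight, evict (LRU or random) until the chunk fits, then queue an async
+// libaio write and account for the outstanding bytes.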
+void D3nDataCache::put(bufferlist& bl, unsigned int len, std::string& oid)
+{
+ size_t sr = 0;
+ uint64_t freed_size = 0, _free_data_cache_size = 0, _outstanding_write_size = 0;
+
+ ldout(cct, 10) << "D3nDataCache::" << __func__ << "(): oid=" << oid << ", len=" << len << dendl;
+ {
+ const std::lock_guard l(d3n_cache_lock);
+ std::unordered_map<string, D3nChunkDataInfo*>::iterator iter = d3n_cache_map.find(oid);
+ if (iter != d3n_cache_map.end()) {
+ ldout(cct, 10) << "D3nDataCache::" << __func__ << "(): data already cached, no rewrite" << dendl;
+ return;
+ }
+ auto it = d3n_outstanding_write_list.find(oid);
+ if (it != d3n_outstanding_write_list.end()) {
+ ldout(cct, 10) << "D3nDataCache: NOTE: data put in cache already issued, no rewrite" << dendl;
+ return;
+ }
+ d3n_outstanding_write_list.insert(oid);
+ }
+ {
+ const std::lock_guard l(d3n_eviction_lock);
+ _free_data_cache_size = free_data_cache_size;
+ _outstanding_write_size = outstanding_write_size;
+ }
+ ldout(cct, 20) << "D3nDataCache: Before eviction _free_data_cache_size:" << _free_data_cache_size << ", _outstanding_write_size:" << _outstanding_write_size << ", freed_size:" << freed_size << dendl;
+ while (len > (_free_data_cache_size - _outstanding_write_size + freed_size)) {
+ ldout(cct, 20) << "D3nDataCache: enter eviction" << dendl;
+ if (eviction_policy == _eviction_policy::LRU) {
+ sr = lru_eviction();
+ } else if (eviction_policy == _eviction_policy::RANDOM) {
+ sr = random_eviction();
+ } else {
+ ldout(cct, 0) << "D3nDataCache: Warning: unknown cache eviction policy, defaulting to lru eviction" << dendl;
+ sr = lru_eviction();
+ }
+ if (sr == 0) {
+ ldout(cct, 2) << "D3nDataCache: Warning: eviction was not able to free disk space, not writing to cache" << dendl;
+ d3n_outstanding_write_list.erase(oid);
+ return;
+ }
+ ldout(cct, 20) << "D3nDataCache: completed eviction of " << sr << " bytes" << dendl;
+ freed_size += sr;
+ }
+ int r = 0;
+ r = d3n_libaio_create_write_request(bl, len, oid);
+ if (r < 0) {
+ const std::lock_guard l(d3n_cache_lock);
+ d3n_outstanding_write_list.erase(oid);
+    ldout(cct, 1) << "D3nDataCache: d3n_libaio_create_write_request failed, r=" << r << dendl;
+ return;
+ }
+
+ const std::lock_guard l(d3n_eviction_lock);
+ free_data_cache_size += freed_size;
+ outstanding_write_size += len;
+}
+
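+// Cache lookup: a chunk counts as a hit only if it is in the map and its
+// backing file exists with the expected length; hits are promoted to the LRU
+// head, stale entries are dropped from the map.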
+bool D3nDataCache::get(const string& oid, const off_t len)
+{
+ const std::lock_guard l(d3n_cache_lock);
+ bool exist = false;
+ string location = cache_location + url_encode(oid, true);
+
+ lsubdout(g_ceph_context, rgw_datacache, 20) << "D3nDataCache: " << __func__ << "(): location=" << location << dendl;
+ std::unordered_map<string, D3nChunkDataInfo*>::iterator iter = d3n_cache_map.find(oid);
+ if (!(iter == d3n_cache_map.end())) {
+    // check whether the chunk's backing file exists in the cache with the expected size
+ struct D3nChunkDataInfo* chdo = iter->second;
+ struct stat st;
+ int r = stat(location.c_str(), &st);
+    if (r != -1 && st.st_size == len) { // file exists and contains the required data range length
+ exist = true;
+      // LRU: promote the chunk to the head of the list
+ const std::lock_guard l(d3n_eviction_lock);
+ lru_remove(chdo);
+ lru_insert_head(chdo);
+ } else {
+ d3n_cache_map.erase(oid);
+ const std::lock_guard l(d3n_eviction_lock);
+ lru_remove(chdo);
+ delete chdo;
+ exist = false;
+ }
+ }
+ return exist;
+}
+
+size_t D3nDataCache::random_eviction()
+{
+ lsubdout(g_ceph_context, rgw_datacache, 20) << "D3nDataCache: " << __func__ << "()" << dendl;
+ int n_entries = 0;
+ int random_index = 0;
+ size_t freed_size = 0;
+ D3nChunkDataInfo* del_entry;
+ string del_oid, location;
+ {
+ const std::lock_guard l(d3n_cache_lock);
+ n_entries = d3n_cache_map.size();
+ if (n_entries <= 0) {
+      return 0; // nothing to evict
+ }
+ srand (time(NULL));
+ random_index = ceph::util::generate_random_number<int>(0, n_entries-1);
+ std::unordered_map<string, D3nChunkDataInfo*>::iterator iter = d3n_cache_map.begin();
+ std::advance(iter, random_index);
+ del_oid = iter->first;
+ del_entry = iter->second;
+ ldout(cct, 20) << "D3nDataCache: random_eviction: index:" << random_index << ", free size: " << del_entry->size << dendl;
+ freed_size = del_entry->size;
+ delete del_entry;
+ del_entry = nullptr;
+ d3n_cache_map.erase(del_oid); // oid
+ }
+
+ location = cache_location + url_encode(del_oid, true);
+ ::remove(location.c_str());
+ return freed_size;
+}
+
+size_t D3nDataCache::lru_eviction()
+{
+ lsubdout(g_ceph_context, rgw_datacache, 20) << "D3nDataCache: " << __func__ << "()" << dendl;
+ int n_entries = 0;
+ size_t freed_size = 0;
+ D3nChunkDataInfo* del_entry;
+ string del_oid, location;
+
+ {
+ const std::lock_guard l(d3n_eviction_lock);
+ del_entry = tail;
+ if (del_entry == nullptr) {
+ ldout(cct, 2) << "D3nDataCache: lru_eviction: del_entry=null_ptr" << dendl;
+ return 0;
+ }
+ lru_remove(del_entry);
+ }
+
+ {
+ const std::lock_guard l(d3n_cache_lock);
+ n_entries = d3n_cache_map.size();
+ if (n_entries <= 0) {
+ ldout(cct, 2) << "D3nDataCache: lru_eviction: cache_map.size<=0" << dendl;
+      return 0; // nothing to evict
+ }
+ del_oid = del_entry->oid;
+ ldout(cct, 20) << "D3nDataCache: lru_eviction: oid to remove: " << del_oid << dendl;
+ d3n_cache_map.erase(del_oid); // oid
+ }
+ freed_size = del_entry->size;
+ delete del_entry;
+ location = cache_location + url_encode(del_oid, true);
+ ::remove(location.c_str());
+ return freed_size;
+}
diff --git a/src/rgw/driver/rados/rgw_d3n_datacache.h b/src/rgw/driver/rados/rgw_d3n_datacache.h
new file mode 100644
index 000000000..feaa3f2b7
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_d3n_datacache.h
@@ -0,0 +1,259 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include "rgw_rados.h"
+#include <curl/curl.h>
+
+#include "rgw_common.h"
+
+#include <unistd.h>
+#include <signal.h>
+#include "include/Context.h"
+#include "include/lru.h"
+#include "rgw_d3n_cacherequest.h"
+
+
+/*D3nDataCache*/
+struct D3nDataCache;
+
+
+struct D3nChunkDataInfo : public LRUObject {
+ CephContext *cct;
+ uint64_t size;
+ time_t access_time;
+ std::string address;
+ std::string oid;
+ bool complete;
+ struct D3nChunkDataInfo* lru_prev;
+ struct D3nChunkDataInfo* lru_next;
+
+ D3nChunkDataInfo(): size(0) {}
+
+ void set_ctx(CephContext *_cct) {
+ cct = _cct;
+ }
+
+ void dump(Formatter *f) const;
+ static void generate_test_instances(std::list<D3nChunkDataInfo*>& o);
+};
+
+struct D3nCacheAioWriteRequest {
+ std::string oid;
+ void *data;
+ int fd;
+ struct aiocb *cb;
+ D3nDataCache *priv_data;
+ CephContext *cct;
+
+ D3nCacheAioWriteRequest(CephContext *_cct) : cct(_cct) {}
+ int d3n_libaio_prepare_write_op(bufferlist& bl, unsigned int len, std::string oid, std::string cache_location);
+
+ ~D3nCacheAioWriteRequest() {
+ ::close(fd);
+ cb->aio_buf = nullptr;
+ free(data);
+ data = nullptr;
+ delete(cb);
+ }
+};
+
+struct D3nDataCache {
+
+private:
+ std::unordered_map<std::string, D3nChunkDataInfo*> d3n_cache_map;
+ std::set<std::string> d3n_outstanding_write_list;
+ std::mutex d3n_cache_lock;
+ std::mutex d3n_eviction_lock;
+
+ CephContext *cct;
+ enum class _io_type {
+ SYNC_IO = 1,
+ ASYNC_IO = 2,
+ SEND_FILE = 3
+ } io_type;
+ enum class _eviction_policy {
+ LRU=0, RANDOM=1
+ } eviction_policy;
+
+ struct sigaction action;
+ uint64_t free_data_cache_size = 0;
+ uint64_t outstanding_write_size = 0;
+ struct D3nChunkDataInfo* head;
+ struct D3nChunkDataInfo* tail;
+
+private:
+ void add_io();
+
+public:
+ D3nDataCache();
+ ~D3nDataCache() {
+ while (lru_eviction() > 0);
+ }
+
+ std::string cache_location;
+
+ bool get(const std::string& oid, const off_t len);
+ void put(bufferlist& bl, unsigned int len, std::string& obj_key);
+ int d3n_io_write(bufferlist& bl, unsigned int len, std::string oid);
+ int d3n_libaio_create_write_request(bufferlist& bl, unsigned int len, std::string oid);
+ void d3n_libaio_write_completion_cb(D3nCacheAioWriteRequest* c);
+ size_t random_eviction();
+ size_t lru_eviction();
+
+ void init(CephContext *_cct);
+
+ void lru_insert_head(struct D3nChunkDataInfo* o) {
+ lsubdout(g_ceph_context, rgw_datacache, 30) << "D3nDataCache: " << __func__ << "()" << dendl;
+ o->lru_next = head;
+ o->lru_prev = nullptr;
+ if (head) {
+ head->lru_prev = o;
+ } else {
+ tail = o;
+ }
+ head = o;
+ }
+
+ void lru_insert_tail(struct D3nChunkDataInfo* o) {
+ lsubdout(g_ceph_context, rgw_datacache, 30) << "D3nDataCache: " << __func__ << "()" << dendl;
+ o->lru_next = nullptr;
+ o->lru_prev = tail;
+ if (tail) {
+ tail->lru_next = o;
+ } else {
+ head = o;
+ }
+ tail = o;
+ }
+
+ void lru_remove(struct D3nChunkDataInfo* o) {
+ lsubdout(g_ceph_context, rgw_datacache, 30) << "D3nDataCache: " << __func__ << "()" << dendl;
+ if (o->lru_next)
+ o->lru_next->lru_prev = o->lru_prev;
+ else
+ tail = o->lru_prev;
+ if (o->lru_prev)
+ o->lru_prev->lru_next = o->lru_next;
+ else
+ head = o->lru_next;
+ o->lru_next = o->lru_prev = nullptr;
+ }
+};
+
+
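+// Mixin over the RADOS store type T that intercepts object reads:
+// get_obj_iterate_cb() serves eligible tail reads from the local D3N cache,
+// schedules cache writes for misses, and bypasses the cache for compressed,
+// encrypted or non-zero-offset reads.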
+template <class T>
+class D3nRGWDataCache : public T {
+
+public:
+ D3nRGWDataCache() {}
+
+ int init_rados() override {
+ int ret;
+ ret = T::init_rados();
+ if (ret < 0)
+ return ret;
+
+ return 0;
+ }
+
+ int get_obj_iterate_cb(const DoutPrefixProvider *dpp, const rgw_raw_obj& read_obj, off_t obj_ofs,
+ off_t read_ofs, off_t len, bool is_head_obj,
+ RGWObjState *astate, void *arg) override;
+};
+
+template<typename T>
+int D3nRGWDataCache<T>::get_obj_iterate_cb(const DoutPrefixProvider *dpp, const rgw_raw_obj& read_obj, off_t obj_ofs,
+ off_t read_ofs, off_t len, bool is_head_obj,
+ RGWObjState *astate, void *arg) {
+ lsubdout(g_ceph_context, rgw_datacache, 30) << "D3nDataCache::" << __func__ << "(): is head object : " << is_head_obj << dendl;
+ librados::ObjectReadOperation op;
+ struct get_obj_data* d = static_cast<struct get_obj_data*>(arg);
+ std::string oid, key;
+
+ if (is_head_obj) {
+ // only when reading from the head object do we need to do the atomic test
+ int r = T::append_atomic_test(dpp, astate, op);
+ if (r < 0)
+ return r;
+
+ if (astate &&
+ obj_ofs < astate->data.length()) {
+ unsigned chunk_len = std::min((uint64_t)astate->data.length() - obj_ofs, (uint64_t)len);
+
+ r = d->client_cb->handle_data(astate->data, obj_ofs, chunk_len);
+ if (r < 0)
+ return r;
+
+ len -= chunk_len;
+ d->offset += chunk_len;
+ read_ofs += chunk_len;
+ obj_ofs += chunk_len;
+ if (!len)
+ return 0;
+ }
+
+ auto obj = d->rgwrados->svc.rados->obj(read_obj);
+ r = obj.open(dpp);
+ if (r < 0) {
+ lsubdout(g_ceph_context, rgw, 4) << "failed to open rados context for " << read_obj << dendl;
+ return r;
+ }
+
+ ldpp_dout(dpp, 20) << "D3nDataCache::" << __func__ << "(): oid=" << read_obj.oid << " obj-ofs=" << obj_ofs << " read_ofs=" << read_ofs << " len=" << len << dendl;
+ op.read(read_ofs, len, nullptr, nullptr);
+
+ const uint64_t cost = len;
+ const uint64_t id = obj_ofs; // use logical object offset for sorting replies
+
+ auto completed = d->aio->get(obj, rgw::Aio::librados_op(std::move(op), d->yield), cost, id);
+ return d->flush(std::move(completed));
+ } else {
+ ldpp_dout(dpp, 20) << "D3nDataCache::" << __func__ << "(): oid=" << read_obj.oid << ", is_head_obj=" << is_head_obj << ", obj-ofs=" << obj_ofs << ", read_ofs=" << read_ofs << ", len=" << len << dendl;
+ int r;
+
+ op.read(read_ofs, len, nullptr, nullptr);
+
+ const uint64_t cost = len;
+ const uint64_t id = obj_ofs; // use logical object offset for sorting replies
+ oid = read_obj.oid;
+
+ auto obj = d->rgwrados->svc.rados->obj(read_obj);
+ r = obj.open(dpp);
+ if (r < 0) {
+ lsubdout(g_ceph_context, rgw, 0) << "D3nDataCache: Error: failed to open rados context for " << read_obj << ", r=" << r << dendl;
+ return r;
+ }
+
+ const bool is_compressed = (astate->attrset.find(RGW_ATTR_COMPRESSION) != astate->attrset.end());
+ const bool is_encrypted = (astate->attrset.find(RGW_ATTR_CRYPT_MODE) != astate->attrset.end());
+ if (read_ofs != 0 || astate->size != astate->accounted_size || is_compressed || is_encrypted) {
+ d->d3n_bypass_cache_write = true;
+ lsubdout(g_ceph_context, rgw, 5) << "D3nDataCache: " << __func__ << "(): Note - bypassing datacache: oid=" << read_obj.oid << ", read_ofs!=0 = " << read_ofs << ", size=" << astate->size << " != accounted_size=" << astate->accounted_size << ", is_compressed=" << is_compressed << ", is_encrypted=" << is_encrypted << dendl;
+ auto completed = d->aio->get(obj, rgw::Aio::librados_op(std::move(op), d->yield), cost, id);
+ r = d->flush(std::move(completed));
+ return r;
+ }
+
+ if (d->rgwrados->d3n_data_cache->get(oid, len)) {
+ // Read From Cache
+ ldpp_dout(dpp, 20) << "D3nDataCache: " << __func__ << "(): READ FROM CACHE: oid=" << read_obj.oid << ", obj-ofs=" << obj_ofs << ", read_ofs=" << read_ofs << ", len=" << len << dendl;
+ auto completed = d->aio->get(obj, rgw::Aio::d3n_cache_op(dpp, d->yield, read_ofs, len, d->rgwrados->d3n_data_cache->cache_location), cost, id);
+ r = d->flush(std::move(completed));
+ if (r < 0) {
+ lsubdout(g_ceph_context, rgw, 0) << "D3nDataCache: " << __func__ << "(): Error: failed to drain/flush, r= " << r << dendl;
+ }
+ return r;
+ } else {
+ // Write To Cache
+ ldpp_dout(dpp, 20) << "D3nDataCache: " << __func__ << "(): WRITE TO CACHE: oid=" << read_obj.oid << ", obj-ofs=" << obj_ofs << ", read_ofs=" << read_ofs << " len=" << len << dendl;
+ auto completed = d->aio->get(obj, rgw::Aio::librados_op(std::move(op), d->yield), cost, id);
+ return d->flush(std::move(completed));
+ }
+ }
+ lsubdout(g_ceph_context, rgw, 1) << "D3nDataCache: " << __func__ << "(): Warning: Check head object cache handling flow, oid=" << read_obj.oid << dendl;
+
+ return 0;
+}
+
diff --git a/src/rgw/driver/rados/rgw_data_sync.cc b/src/rgw/driver/rados/rgw_data_sync.cc
new file mode 100644
index 000000000..a5730e51d
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_data_sync.cc
@@ -0,0 +1,6762 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "common/ceph_json.h"
+#include "common/RefCountedObj.h"
+#include "common/WorkQueue.h"
+#include "common/Throttle.h"
+#include "common/errno.h"
+
+#include "rgw_common.h"
+#include "rgw_zone.h"
+#include "rgw_sync.h"
+#include "rgw_data_sync.h"
+#include "rgw_rest_conn.h"
+#include "rgw_cr_rados.h"
+#include "rgw_cr_rest.h"
+#include "rgw_cr_tools.h"
+#include "rgw_http_client.h"
+#include "rgw_bucket.h"
+#include "rgw_bucket_sync.h"
+#include "rgw_bucket_sync_cache.h"
+#include "rgw_datalog.h"
+#include "rgw_metadata.h"
+#include "rgw_sync_counters.h"
+#include "rgw_sync_error_repo.h"
+#include "rgw_sync_module.h"
+#include "rgw_sal.h"
+
+#include "cls/lock/cls_lock_client.h"
+#include "cls/rgw/cls_rgw_client.h"
+
+#include "services/svc_zone.h"
+#include "services/svc_sync_modules.h"
+
+#include "include/common_fwd.h"
+#include "include/random.h"
+
+#include <boost/asio/yield.hpp>
+#include <string_view>
+
+#define dout_subsys ceph_subsys_rgw
+
+#undef dout_prefix
+#define dout_prefix (*_dout << "data sync: ")
+
+using namespace std;
+
+static const string datalog_sync_status_oid_prefix = "datalog.sync-status";
+static const string datalog_sync_status_shard_prefix = "datalog.sync-status.shard";
+static const string datalog_sync_full_sync_index_prefix = "data.full-sync.index";
+static const string bucket_full_status_oid_prefix = "bucket.full-sync-status";
+static const string bucket_status_oid_prefix = "bucket.sync-status";
+static const string object_status_oid_prefix = "bucket.sync-status";
+
+void rgw_datalog_info::decode_json(JSONObj *obj) {
+ JSONDecoder::decode_json("num_objects", num_shards, obj);
+}
+
+void rgw_datalog_entry::decode_json(JSONObj *obj) {
+ JSONDecoder::decode_json("key", key, obj);
+ utime_t ut;
+ JSONDecoder::decode_json("timestamp", ut, obj);
+ timestamp = ut.to_real_time();
+}
+
+void rgw_datalog_shard_data::decode_json(JSONObj *obj) {
+ JSONDecoder::decode_json("marker", marker, obj);
+ JSONDecoder::decode_json("truncated", truncated, obj);
+ JSONDecoder::decode_json("entries", entries, obj);
+}
+
+// print a bucket shard with [gen]
+std::string to_string(const rgw_bucket_shard& bs, std::optional<uint64_t> gen)
+{
+ constexpr auto digits10 = std::numeric_limits<uint64_t>::digits10;
+ constexpr auto reserve = 2 + digits10; // [value]
+ auto str = bs.get_key('/', ':', ':', reserve);
+ str.append(1, '[');
+ str.append(std::to_string(gen.value_or(0)));
+ str.append(1, ']');
+ return str;
+}
+
+class RGWReadDataSyncStatusMarkersCR : public RGWShardCollectCR {
+ static constexpr int MAX_CONCURRENT_SHARDS = 16;
+
+ RGWDataSyncCtx *sc;
+ RGWDataSyncEnv *env;
+ const int num_shards;
+  int shard_id{0};
+
+ map<uint32_t, rgw_data_sync_marker>& markers;
+ std::vector<RGWObjVersionTracker>& objvs;
+
+ int handle_result(int r) override {
+ if (r == -ENOENT) { // ENOENT is not a fatal error
+ return 0;
+ }
+ if (r < 0) {
+ ldout(cct, 4) << "failed to read data sync status: "
+ << cpp_strerror(r) << dendl;
+ }
+ return r;
+ }
+ public:
+ RGWReadDataSyncStatusMarkersCR(RGWDataSyncCtx *sc, int num_shards,
+ map<uint32_t, rgw_data_sync_marker>& markers,
+ std::vector<RGWObjVersionTracker>& objvs)
+ : RGWShardCollectCR(sc->cct, MAX_CONCURRENT_SHARDS),
+ sc(sc), env(sc->env), num_shards(num_shards), markers(markers), objvs(objvs)
+ {}
+ bool spawn_next() override;
+};
+
+bool RGWReadDataSyncStatusMarkersCR::spawn_next()
+{
+ if (shard_id >= num_shards) {
+ return false;
+ }
+ using CR = RGWSimpleRadosReadCR<rgw_data_sync_marker>;
+ spawn(new CR(env->dpp, env->driver,
+ rgw_raw_obj(env->svc->zone->get_zone_params().log_pool, RGWDataSyncStatusManager::shard_obj_name(sc->source_zone, shard_id)),
+ &markers[shard_id], true, &objvs[shard_id]),
+ false);
+ shard_id++;
+ return true;
+}
+
+class RGWReadDataSyncRecoveringShardsCR : public RGWShardCollectCR {
+ static constexpr int MAX_CONCURRENT_SHARDS = 16;
+
+ RGWDataSyncCtx *sc;
+ RGWDataSyncEnv *env;
+
+ uint64_t max_entries;
+ int num_shards;
+ int shard_id{0};
+
+ string marker;
+ std::vector<RGWRadosGetOmapKeysCR::ResultPtr>& omapkeys;
+
+ int handle_result(int r) override {
+ if (r == -ENOENT) { // ENOENT is not a fatal error
+ return 0;
+ }
+ if (r < 0) {
+ ldout(cct, 4) << "failed to list recovering data sync: "
+ << cpp_strerror(r) << dendl;
+ }
+ return r;
+ }
+ public:
+ RGWReadDataSyncRecoveringShardsCR(RGWDataSyncCtx *sc, uint64_t _max_entries, int _num_shards,
+ std::vector<RGWRadosGetOmapKeysCR::ResultPtr>& omapkeys)
+ : RGWShardCollectCR(sc->cct, MAX_CONCURRENT_SHARDS), sc(sc), env(sc->env),
+ max_entries(_max_entries), num_shards(_num_shards), omapkeys(omapkeys)
+ {}
+ bool spawn_next() override;
+};
+
+bool RGWReadDataSyncRecoveringShardsCR::spawn_next()
+{
+ if (shard_id >= num_shards)
+ return false;
+
+ string error_oid = RGWDataSyncStatusManager::shard_obj_name(sc->source_zone, shard_id) + ".retry";
+ auto& shard_keys = omapkeys[shard_id];
+ shard_keys = std::make_shared<RGWRadosGetOmapKeysCR::Result>();
+ spawn(new RGWRadosGetOmapKeysCR(env->driver, rgw_raw_obj(env->svc->zone->get_zone_params().log_pool, error_oid),
+ marker, max_entries, shard_keys), false);
+
+ ++shard_id;
+ return true;
+}
+
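+// Reads the overall data sync status: first the rgw_data_sync_info object
+// (failing on ENOENT), then the per-shard rgw_data_sync_marker objects via
+// RGWReadDataSyncStatusMarkersCR.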
+class RGWReadDataSyncStatusCoroutine : public RGWCoroutine {
+ RGWDataSyncCtx *sc;
+ RGWDataSyncEnv *sync_env;
+ rgw_data_sync_status *sync_status;
+ RGWObjVersionTracker* objv_tracker;
+ std::vector<RGWObjVersionTracker>& objvs;
+
+public:
+ RGWReadDataSyncStatusCoroutine(RGWDataSyncCtx *_sc,
+ rgw_data_sync_status *_status,
+ RGWObjVersionTracker* objv_tracker,
+ std::vector<RGWObjVersionTracker>& objvs)
+ : RGWCoroutine(_sc->cct), sc(_sc), sync_env(sc->env), sync_status(_status),
+ objv_tracker(objv_tracker), objvs(objvs)
+ {}
+ int operate(const DoutPrefixProvider *dpp) override;
+};
+
+int RGWReadDataSyncStatusCoroutine::operate(const DoutPrefixProvider *dpp)
+{
+ reenter(this) {
+ // read sync info
+ using ReadInfoCR = RGWSimpleRadosReadCR<rgw_data_sync_info>;
+ yield {
+ bool empty_on_enoent = false; // fail on ENOENT
+ call(new ReadInfoCR(dpp, sync_env->driver,
+ rgw_raw_obj(sync_env->svc->zone->get_zone_params().log_pool, RGWDataSyncStatusManager::sync_status_oid(sc->source_zone)),
+ &sync_status->sync_info, empty_on_enoent, objv_tracker));
+ }
+ if (retcode < 0) {
+ ldpp_dout(dpp, 4) << "failed to read sync status info with "
+ << cpp_strerror(retcode) << dendl;
+ return set_cr_error(retcode);
+ }
+ // read shard markers
+ objvs.resize(sync_status->sync_info.num_shards);
+ using ReadMarkersCR = RGWReadDataSyncStatusMarkersCR;
+ yield call(new ReadMarkersCR(sc, sync_status->sync_info.num_shards,
+ sync_status->sync_markers, objvs));
+ if (retcode < 0) {
+ ldpp_dout(dpp, 4) << "failed to read sync status markers with "
+ << cpp_strerror(retcode) << dendl;
+ return set_cr_error(retcode);
+ }
+ return set_cr_done();
+ }
+ return 0;
+}
+
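+// Fetches a single remote datalog shard's info over REST
+// (GET /admin/log/?type=data&id=<shard>&info) into RGWDataChangesLogInfo.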
+class RGWReadRemoteDataLogShardInfoCR : public RGWCoroutine {
+ RGWDataSyncCtx *sc;
+ RGWDataSyncEnv *sync_env;
+
+ RGWRESTReadResource *http_op;
+
+ int shard_id;
+ RGWDataChangesLogInfo *shard_info;
+
+public:
+ RGWReadRemoteDataLogShardInfoCR(RGWDataSyncCtx *_sc,
+ int _shard_id, RGWDataChangesLogInfo *_shard_info) : RGWCoroutine(_sc->cct),
+ sc(_sc),
+ sync_env(_sc->env),
+ http_op(NULL),
+ shard_id(_shard_id),
+ shard_info(_shard_info) {
+ }
+
+ ~RGWReadRemoteDataLogShardInfoCR() override {
+ if (http_op) {
+ http_op->put();
+ }
+ }
+
+ int operate(const DoutPrefixProvider *dpp) override {
+ reenter(this) {
+ yield {
+ char buf[16];
+ snprintf(buf, sizeof(buf), "%d", shard_id);
+ rgw_http_param_pair pairs[] = { { "type" , "data" },
+ { "id", buf },
+ { "info" , NULL },
+ { NULL, NULL } };
+
+ string p = "/admin/log/";
+
+ http_op = new RGWRESTReadResource(sc->conn, p, pairs, NULL, sync_env->http_manager);
+
+ init_new_io(http_op);
+
+ int ret = http_op->aio_read(dpp);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to read from " << p << dendl;
+ log_error() << "failed to send http operation: " << http_op->to_str() << " ret=" << ret << std::endl;
+ return set_cr_error(ret);
+ }
+
+ return io_block(0);
+ }
+ yield {
+ int ret = http_op->wait(shard_info, null_yield);
+ if (ret < 0) {
+ return set_cr_error(ret);
+ }
+ return set_cr_done();
+ }
+ }
+ return 0;
+ }
+};
+
+struct read_remote_data_log_response {
+ string marker;
+ bool truncated;
+ vector<rgw_data_change_log_entry> entries;
+
+ read_remote_data_log_response() : truncated(false) {}
+
+ void decode_json(JSONObj *obj) {
+ JSONDecoder::decode_json("marker", marker, obj);
+ JSONDecoder::decode_json("truncated", truncated, obj);
+ JSONDecoder::decode_json("entries", entries, obj);
+ };
+};
+
+class RGWReadRemoteDataLogShardCR : public RGWCoroutine {
+ RGWDataSyncCtx *sc;
+ RGWDataSyncEnv *sync_env;
+
+ RGWRESTReadResource *http_op = nullptr;
+
+ int shard_id;
+ const std::string& marker;
+ string *pnext_marker;
+ vector<rgw_data_change_log_entry> *entries;
+ bool *truncated;
+
+ read_remote_data_log_response response;
+ std::optional<TOPNSPC::common::PerfGuard> timer;
+
+public:
+ RGWReadRemoteDataLogShardCR(RGWDataSyncCtx *_sc, int _shard_id,
+ const std::string& marker, string *pnext_marker,
+ vector<rgw_data_change_log_entry> *_entries,
+ bool *_truncated)
+ : RGWCoroutine(_sc->cct), sc(_sc), sync_env(_sc->env),
+ shard_id(_shard_id), marker(marker), pnext_marker(pnext_marker),
+ entries(_entries), truncated(_truncated) {
+ }
+ ~RGWReadRemoteDataLogShardCR() override {
+ if (http_op) {
+ http_op->put();
+ }
+ }
+
+ int operate(const DoutPrefixProvider *dpp) override {
+ reenter(this) {
+ yield {
+ char buf[16];
+ snprintf(buf, sizeof(buf), "%d", shard_id);
+ rgw_http_param_pair pairs[] = { { "type" , "data" },
+ { "id", buf },
+ { "marker", marker.c_str() },
+ { "extra-info", "true" },
+ { NULL, NULL } };
+
+ string p = "/admin/log/";
+
+ http_op = new RGWRESTReadResource(sc->conn, p, pairs, NULL, sync_env->http_manager);
+
+ init_new_io(http_op);
+
+ if (sync_env->counters) {
+ timer.emplace(sync_env->counters, sync_counters::l_poll);
+ }
+ int ret = http_op->aio_read(dpp);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to read from " << p << dendl;
+ log_error() << "failed to send http operation: " << http_op->to_str() << " ret=" << ret << std::endl;
+ if (sync_env->counters) {
+ sync_env->counters->inc(sync_counters::l_poll_err);
+ }
+ return set_cr_error(ret);
+ }
+
+ return io_block(0);
+ }
+ yield {
+ timer.reset();
+ int ret = http_op->wait(&response, null_yield);
+ if (ret < 0) {
+ if (sync_env->counters && ret != -ENOENT) {
+ sync_env->counters->inc(sync_counters::l_poll_err);
+ }
+ return set_cr_error(ret);
+ }
+ entries->clear();
+ entries->swap(response.entries);
+ *pnext_marker = response.marker;
+ *truncated = response.truncated;
+ return set_cr_done();
+ }
+ }
+ return 0;
+ }
+};
+
+class RGWReadRemoteDataLogInfoCR : public RGWShardCollectCR {
+ RGWDataSyncCtx *sc;
+ RGWDataSyncEnv *sync_env;
+
+ int num_shards;
+ map<int, RGWDataChangesLogInfo> *datalog_info;
+
+ int shard_id;
+#define READ_DATALOG_MAX_CONCURRENT 10
+
+ int handle_result(int r) override {
+ if (r == -ENOENT) { // ENOENT is not a fatal error
+ return 0;
+ }
+ if (r < 0) {
+ ldout(cct, 4) << "failed to fetch remote datalog info: "
+ << cpp_strerror(r) << dendl;
+ }
+ return r;
+ }
+public:
+ RGWReadRemoteDataLogInfoCR(RGWDataSyncCtx *_sc,
+ int _num_shards,
+ map<int, RGWDataChangesLogInfo> *_datalog_info) : RGWShardCollectCR(_sc->cct, READ_DATALOG_MAX_CONCURRENT),
+ sc(_sc), sync_env(_sc->env), num_shards(_num_shards),
+ datalog_info(_datalog_info), shard_id(0) {}
+ bool spawn_next() override;
+};
+
+bool RGWReadRemoteDataLogInfoCR::spawn_next() {
+ if (shard_id >= num_shards) {
+ return false;
+ }
+ spawn(new RGWReadRemoteDataLogShardInfoCR(sc, shard_id, &(*datalog_info)[shard_id]), false);
+ shard_id++;
+ return true;
+}
+
+class RGWListRemoteDataLogShardCR : public RGWSimpleCoroutine {
+ RGWDataSyncCtx *sc;
+ RGWDataSyncEnv *sync_env;
+ RGWRESTReadResource *http_op;
+
+ int shard_id;
+ string marker;
+ uint32_t max_entries;
+ rgw_datalog_shard_data *result;
+
+public:
+ RGWListRemoteDataLogShardCR(RGWDataSyncCtx *sc, int _shard_id,
+ const string& _marker, uint32_t _max_entries,
+ rgw_datalog_shard_data *_result)
+ : RGWSimpleCoroutine(sc->cct), sc(sc), sync_env(sc->env), http_op(NULL),
+ shard_id(_shard_id), marker(_marker), max_entries(_max_entries), result(_result) {}
+
+ int send_request(const DoutPrefixProvider *dpp) override {
+ RGWRESTConn *conn = sc->conn;
+
+ char buf[32];
+ snprintf(buf, sizeof(buf), "%d", shard_id);
+
+ char max_entries_buf[32];
+ snprintf(max_entries_buf, sizeof(max_entries_buf), "%d", (int)max_entries);
+
+ const char *marker_key = (marker.empty() ? "" : "marker");
+
+ rgw_http_param_pair pairs[] = { { "type", "data" },
+ { "id", buf },
+ { "max-entries", max_entries_buf },
+ { marker_key, marker.c_str() },
+ { NULL, NULL } };
+
+ string p = "/admin/log/";
+
+ http_op = new RGWRESTReadResource(conn, p, pairs, NULL, sync_env->http_manager);
+ init_new_io(http_op);
+
+ int ret = http_op->aio_read(dpp);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to read from " << p << dendl;
+ log_error() << "failed to send http operation: " << http_op->to_str() << " ret=" << ret << std::endl;
+ http_op->put();
+ return ret;
+ }
+
+ return 0;
+ }
+
+ int request_complete() override {
+ int ret = http_op->wait(result, null_yield);
+ http_op->put();
+ if (ret < 0 && ret != -ENOENT) {
+ ldpp_dout(sync_env->dpp, 0) << "ERROR: failed to list remote datalog shard, ret=" << ret << dendl;
+ return ret;
+ }
+ return 0;
+ }
+};
+
+class RGWListRemoteDataLogCR : public RGWShardCollectCR {
+ RGWDataSyncCtx *sc;
+ RGWDataSyncEnv *sync_env;
+
+ map<int, string> shards;
+ int max_entries_per_shard;
+ map<int, rgw_datalog_shard_data> *result;
+
+ map<int, string>::iterator iter;
+#define READ_DATALOG_MAX_CONCURRENT 10
+
+ int handle_result(int r) override {
+ if (r == -ENOENT) { // ENOENT is not a fatal error
+ return 0;
+ }
+ if (r < 0) {
+ ldout(cct, 4) << "failed to list remote datalog: "
+ << cpp_strerror(r) << dendl;
+ }
+ return r;
+ }
+public:
+ RGWListRemoteDataLogCR(RGWDataSyncCtx *_sc,
+ map<int, string>& _shards,
+ int _max_entries_per_shard,
+ map<int, rgw_datalog_shard_data> *_result) : RGWShardCollectCR(_sc->cct, READ_DATALOG_MAX_CONCURRENT),
+ sc(_sc), sync_env(_sc->env), max_entries_per_shard(_max_entries_per_shard),
+ result(_result) {
+ shards.swap(_shards);
+ iter = shards.begin();
+ }
+ bool spawn_next() override;
+};
+
+bool RGWListRemoteDataLogCR::spawn_next() {
+ if (iter == shards.end()) {
+ return false;
+ }
+
+ spawn(new RGWListRemoteDataLogShardCR(sc, iter->first, iter->second, max_entries_per_shard, &(*result)[iter->first]), false);
+ ++iter;
+ return true;
+}
+
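+// Initializes the data sync status under the caller-provided continuous lease:
+// writes the sync_info object, queries every remote datalog shard for its
+// current position, writes a per-shard marker object, and finally advances the
+// state to StateBuildingFullSyncMaps.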
+class RGWInitDataSyncStatusCoroutine : public RGWCoroutine {
+ static constexpr auto lock_name{ "sync_lock"sv };
+ RGWDataSyncCtx* const sc;
+ RGWDataSyncEnv* const sync_env{ sc->env };
+ const uint32_t num_shards;
+ rgw_data_sync_status* const status;
+ RGWSyncTraceNodeRef tn;
+ boost::intrusive_ptr<RGWContinuousLeaseCR> lease_cr;
+ RGWObjVersionTracker& objv_tracker;
+ std::vector<RGWObjVersionTracker>& objvs;
+
+ const rgw_pool& pool{ sync_env->svc->zone->get_zone_params().log_pool };
+ const string sync_status_oid{
+ RGWDataSyncStatusManager::sync_status_oid(sc->source_zone) };
+
+ map<int, RGWDataChangesLogInfo> shards_info;
+
+
+public:
+ RGWInitDataSyncStatusCoroutine(
+ RGWDataSyncCtx* _sc, uint32_t num_shards, uint64_t instance_id,
+ const RGWSyncTraceNodeRef& tn_parent, rgw_data_sync_status* status,
+ boost::intrusive_ptr<RGWContinuousLeaseCR> lease_cr,
+ RGWObjVersionTracker& objv_tracker,
+ std::vector<RGWObjVersionTracker>& objvs)
+ : RGWCoroutine(_sc->cct), sc(_sc), num_shards(num_shards), status(status),
+ tn(sync_env->sync_tracer->add_node(tn_parent, "init_data_sync_status")),
+ lease_cr(std::move(lease_cr)), objv_tracker(objv_tracker), objvs(objvs) {
+ status->sync_info.instance_id = instance_id;
+ }
+
+ static auto continuous_lease_cr(RGWDataSyncCtx* const sc,
+ RGWCoroutine* const caller) {
+ auto lock_duration = sc->cct->_conf->rgw_sync_lease_period;
+ return new RGWContinuousLeaseCR(
+ sc->env->async_rados, sc->env->driver,
+ { sc->env->svc->zone->get_zone_params().log_pool,
+ RGWDataSyncStatusManager::sync_status_oid(sc->source_zone) },
+ string(lock_name), lock_duration, caller, &sc->lcc);
+ }
+
+ int operate(const DoutPrefixProvider *dpp) override {
+ int ret;
+ reenter(this) {
+ if (!lease_cr->is_locked()) {
+ drain_all();
+ return set_cr_error(-ECANCELED);
+ }
+
+ using WriteInfoCR = RGWSimpleRadosWriteCR<rgw_data_sync_info>;
+ yield call(new WriteInfoCR(dpp, sync_env->driver,
+ rgw_raw_obj{pool, sync_status_oid},
+ status->sync_info, &objv_tracker));
+ if (retcode < 0) {
+ tn->log(0, SSTR("ERROR: failed to write sync status info with " << retcode));
+ return set_cr_error(retcode);
+ }
+
+ // In the original code we reacquired the lock. Since
+ // RGWSimpleRadosWriteCR doesn't appear to touch the attributes
+ // and cls_version works across it, this should be unnecessary.
+ // Putting a note here just in case. If we see ECANCELED where
+ // we expect EBUSY, we can revisit this.
+
+ /* fetch current position in logs */
+ yield {
+ RGWRESTConn *conn = sync_env->svc->zone->get_zone_conn(sc->source_zone);
+ if (!conn) {
+ tn->log(0, SSTR("ERROR: connection to zone " << sc->source_zone << " does not exist!"));
+ return set_cr_error(-EIO);
+ }
+ for (uint32_t i = 0; i < num_shards; i++) {
+ spawn(new RGWReadRemoteDataLogShardInfoCR(sc, i, &shards_info[i]), true);
+ }
+ }
+ while (collect(&ret, NULL)) {
+ if (ret < 0) {
+ tn->log(0, SSTR("ERROR: failed to read remote data log shards"));
+ return set_state(RGWCoroutine_Error);
+ }
+ yield;
+ }
+ yield {
+ objvs.resize(num_shards);
+ for (uint32_t i = 0; i < num_shards; i++) {
+ RGWDataChangesLogInfo& info = shards_info[i];
+ auto& marker = status->sync_markers[i];
+ marker.next_step_marker = info.marker;
+ marker.timestamp = info.last_update;
+ const auto& oid = RGWDataSyncStatusManager::shard_obj_name(sc->source_zone, i);
+ auto& objv = objvs[i];
+ objv.generate_new_write_ver(cct);
+ using WriteMarkerCR = RGWSimpleRadosWriteCR<rgw_data_sync_marker>;
+ spawn(new WriteMarkerCR(dpp, sync_env->driver,
+ rgw_raw_obj{pool, oid}, marker, &objv), true);
+ }
+ }
+ while (collect(&ret, NULL)) {
+ if (ret < 0) {
+ tn->log(0, SSTR("ERROR: failed to write data sync status markers"));
+ return set_state(RGWCoroutine_Error);
+ }
+ yield;
+ }
+
+ status->sync_info.state = rgw_data_sync_info::StateBuildingFullSyncMaps;
+ yield call(new WriteInfoCR(dpp, sync_env->driver,
+ rgw_raw_obj{pool, sync_status_oid},
+ status->sync_info, &objv_tracker));
+ if (retcode < 0) {
+ tn->log(0, SSTR("ERROR: failed to write sync status info with " << retcode));
+ return set_cr_error(retcode);
+ }
+ return set_cr_done();
+ }
+ return 0;
+ }
+};
+
+RGWRemoteDataLog::RGWRemoteDataLog(const DoutPrefixProvider *dpp,
+ rgw::sal::RadosStore* driver,
+ RGWAsyncRadosProcessor *async_rados)
+ : RGWCoroutinesManager(driver->ctx(), driver->getRados()->get_cr_registry()),
+ dpp(dpp), driver(driver),
+ cct(driver->ctx()), cr_registry(driver->getRados()->get_cr_registry()),
+ async_rados(async_rados),
+ http_manager(driver->ctx(), completion_mgr),
+ data_sync_cr(NULL),
+ initialized(false)
+{
+}
+
+int RGWRemoteDataLog::read_log_info(const DoutPrefixProvider *dpp, rgw_datalog_info *log_info)
+{
+ rgw_http_param_pair pairs[] = { { "type", "data" },
+ { NULL, NULL } };
+
+ int ret = sc.conn->get_json_resource(dpp, "/admin/log", pairs, null_yield, *log_info);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to fetch datalog info" << dendl;
+ return ret;
+ }
+
+ ldpp_dout(dpp, 20) << "remote datalog, num_shards=" << log_info->num_shards << dendl;
+
+ return 0;
+}
+
+int RGWRemoteDataLog::read_source_log_shards_info(const DoutPrefixProvider *dpp, map<int, RGWDataChangesLogInfo> *shards_info)
+{
+ rgw_datalog_info log_info;
+ int ret = read_log_info(dpp, &log_info);
+ if (ret < 0) {
+ return ret;
+ }
+
+ return run(dpp, new RGWReadRemoteDataLogInfoCR(&sc, log_info.num_shards, shards_info));
+}
+
+int RGWRemoteDataLog::read_source_log_shards_next(const DoutPrefixProvider *dpp, map<int, string> shard_markers, map<int, rgw_datalog_shard_data> *result)
+{
+ return run(dpp, new RGWListRemoteDataLogCR(&sc, shard_markers, 1, result));
+}
+
+int RGWRemoteDataLog::init(const rgw_zone_id& _source_zone, RGWRESTConn *_conn, RGWSyncErrorLogger *_error_logger,
+ RGWSyncTraceManager *_sync_tracer, RGWSyncModuleInstanceRef& _sync_module,
+ PerfCounters* counters)
+{
+ sync_env.init(dpp, cct, driver, driver->svc(), async_rados, &http_manager, _error_logger,
+ _sync_tracer, _sync_module, counters);
+ sc.init(&sync_env, _conn, _source_zone);
+
+ if (initialized) {
+ return 0;
+ }
+
+ int ret = http_manager.start();
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "failed in http_manager.start() ret=" << ret << dendl;
+ return ret;
+ }
+
+ tn = sync_env.sync_tracer->add_node(sync_env.sync_tracer->root_node, "data");
+
+ initialized = true;
+
+ return 0;
+}
+
+void RGWRemoteDataLog::finish()
+{
+ stop();
+}
+
+int RGWRemoteDataLog::read_sync_status(const DoutPrefixProvider *dpp, rgw_data_sync_status *sync_status)
+{
+ // cannot run concurrently with run_sync(), so run in a separate manager
+ RGWObjVersionTracker objv;
+ std::vector<RGWObjVersionTracker> shard_objvs;
+ RGWCoroutinesManager crs(cct, cr_registry);
+ RGWHTTPManager http_manager(cct, crs.get_completion_mgr());
+ int ret = http_manager.start();
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "failed in http_manager.start() ret=" << ret << dendl;
+ return ret;
+ }
+ RGWDataSyncEnv sync_env_local = sync_env;
+ sync_env_local.http_manager = &http_manager;
+
+ RGWDataSyncCtx sc_local = sc;
+ sc_local.env = &sync_env_local;
+
+ ret = crs.run(dpp, new RGWReadDataSyncStatusCoroutine(&sc_local, sync_status,
+ &objv, shard_objvs));
+ http_manager.stop();
+ return ret;
+}
+
+int RGWRemoteDataLog::read_recovering_shards(const DoutPrefixProvider *dpp, const int num_shards, set<int>& recovering_shards)
+{
+ // cannot run concurrently with run_sync(), so run in a separate manager
+ RGWCoroutinesManager crs(cct, cr_registry);
+ RGWHTTPManager http_manager(cct, crs.get_completion_mgr());
+ int ret = http_manager.start();
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "failed in http_manager.start() ret=" << ret << dendl;
+ return ret;
+ }
+ RGWDataSyncEnv sync_env_local = sync_env;
+ sync_env_local.http_manager = &http_manager;
+
+ RGWDataSyncCtx sc_local = sc;
+ sc_local.env = &sync_env_local;
+
+ std::vector<RGWRadosGetOmapKeysCR::ResultPtr> omapkeys;
+ omapkeys.resize(num_shards);
+ uint64_t max_entries{1};
+
+ ret = crs.run(dpp, new RGWReadDataSyncRecoveringShardsCR(&sc_local, max_entries, num_shards, omapkeys));
+ http_manager.stop();
+
+ if (ret == 0) {
+ for (int i = 0; i < num_shards; i++) {
+ if (omapkeys[i]->entries.size() != 0) {
+ recovering_shards.insert(i);
+ }
+ }
+ }
+
+ return ret;
+}
+
+namespace RGWRDL {
+class DataSyncInitCR : public RGWCoroutine {
+ RGWDataSyncCtx* const sc;
+ const uint32_t num_shards;
+ uint64_t instance_id;
+ const RGWSyncTraceNodeRef& tn;
+ rgw_data_sync_status* const sync_status;
+ std::vector<RGWObjVersionTracker>& objvs;
+
+ boost::intrusive_ptr<RGWContinuousLeaseCR> lease_cr;
+
+ RGWObjVersionTracker objv_tracker;
+
+public:
+
+ DataSyncInitCR(RGWDataSyncCtx* sc, uint32_t num_shards, uint64_t instance_id,
+ const RGWSyncTraceNodeRef& tn,
+ rgw_data_sync_status* sync_status,
+ std::vector<RGWObjVersionTracker>& objvs)
+ : RGWCoroutine(sc->cct), sc(sc), num_shards(num_shards),
+ instance_id(instance_id), tn(tn),
+ sync_status(sync_status), objvs(objvs) {}
+
+ ~DataSyncInitCR() override {
+ if (lease_cr) {
+ lease_cr->abort();
+ }
+ }
+
+ int operate(const DoutPrefixProvider *dpp) override {
+ reenter(this) {
+ lease_cr.reset(
+ RGWInitDataSyncStatusCoroutine::continuous_lease_cr(sc, this));
+
+ yield spawn(lease_cr.get(), false);
+ while (!lease_cr->is_locked()) {
+ if (lease_cr->is_done()) {
+ tn->log(5, "ERROR: failed to take data sync status lease");
+ set_status("lease lock failed, early abort");
+ drain_all();
+ return set_cr_error(lease_cr->get_ret_status());
+ }
+ tn->log(5, "waiting on data sync status lease");
+ yield set_sleeping(true);
+ }
+ tn->log(5, "acquired data sync status lease");
+ objv_tracker.generate_new_write_ver(sc->cct);
+ yield call(new RGWInitDataSyncStatusCoroutine(sc, num_shards, instance_id,
+ tn, sync_status, lease_cr,
+ objv_tracker, objvs));
+ lease_cr->go_down();
+ lease_cr.reset();
+ drain_all();
+ if (retcode < 0) {
+        return set_cr_error(retcode);
+ }
+ return set_cr_done();
+ }
+ return 0;
+ }
+};
+}
+
+int RGWRemoteDataLog::init_sync_status(const DoutPrefixProvider *dpp, int num_shards)
+{
+ rgw_data_sync_status sync_status;
+ std::vector<RGWObjVersionTracker> objvs;
+ sync_status.sync_info.num_shards = num_shards;
+
+ RGWCoroutinesManager crs(cct, cr_registry);
+ RGWHTTPManager http_manager(cct, crs.get_completion_mgr());
+ int ret = http_manager.start();
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "failed in http_manager.start() ret=" << ret << dendl;
+ return ret;
+ }
+ RGWDataSyncEnv sync_env_local = sync_env;
+ sync_env_local.http_manager = &http_manager;
+ auto instance_id = ceph::util::generate_random_number<uint64_t>();
+ RGWDataSyncCtx sc_local = sc;
+ sc_local.env = &sync_env_local;
+ ret = crs.run(dpp, new RGWRDL::DataSyncInitCR(&sc_local, num_shards,
+ instance_id, tn, &sync_status, objvs));
+ http_manager.stop();
+ return ret;
+}
+
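+// Name of the per-shard full-sync index object,
+// e.g. "data.full-sync.index.<source-zone-id>.<shard-id>".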
+static string full_data_sync_index_shard_oid(const rgw_zone_id& source_zone, int shard_id)
+{
+ char buf[datalog_sync_full_sync_index_prefix.size() + 1 + source_zone.id.size() + 1 + 16];
+ snprintf(buf, sizeof(buf), "%s.%s.%d", datalog_sync_full_sync_index_prefix.c_str(), source_zone.id.c_str(), shard_id);
+ return string(buf);
+}
+
+struct read_metadata_list {
+ string marker;
+ bool truncated;
+ list<string> keys;
+ int count;
+
+ read_metadata_list() : truncated(false), count(0) {}
+
+ void decode_json(JSONObj *obj) {
+ JSONDecoder::decode_json("marker", marker, obj);
+ JSONDecoder::decode_json("truncated", truncated, obj);
+ JSONDecoder::decode_json("keys", keys, obj);
+ JSONDecoder::decode_json("count", count, obj);
+ }
+};
+
+struct bucket_instance_meta_info {
+ string key;
+ obj_version ver;
+ utime_t mtime;
+ RGWBucketInstanceMetadataObject data;
+
+ bucket_instance_meta_info() {}
+
+ void decode_json(JSONObj *obj) {
+ JSONDecoder::decode_json("key", key, obj);
+ JSONDecoder::decode_json("ver", ver, obj);
+ JSONDecoder::decode_json("mtime", mtime, obj);
+ JSONDecoder::decode_json("data", data, obj);
+ }
+};
+
+class RGWReadRemoteBucketIndexLogInfoCR : public RGWCoroutine {
+ RGWDataSyncCtx *sc;
+ RGWDataSyncEnv *sync_env;
+ const string instance_key;
+
+ rgw_bucket_index_marker_info *info;
+
+public:
+ RGWReadRemoteBucketIndexLogInfoCR(RGWDataSyncCtx *_sc,
+ const rgw_bucket& bucket,
+ rgw_bucket_index_marker_info *_info)
+ : RGWCoroutine(_sc->cct), sc(_sc), sync_env(_sc->env),
+ instance_key(bucket.get_key()), info(_info) {}
+
+ int operate(const DoutPrefixProvider *dpp) override {
+ reenter(this) {
+ yield {
+ rgw_http_param_pair pairs[] = { { "type" , "bucket-index" },
+ { "bucket-instance", instance_key.c_str() },
+ { "info" , NULL },
+ { NULL, NULL } };
+
+ string p = "/admin/log/";
+ call(new RGWReadRESTResourceCR<rgw_bucket_index_marker_info>(sync_env->cct, sc->conn, sync_env->http_manager, p, pairs, info));
+ }
+ if (retcode < 0) {
+ return set_cr_error(retcode);
+ }
+
+ return set_cr_done();
+ }
+ return 0;
+ }
+};
+
+
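+// Builds the full-sync index: pages through the remote bucket.instance
+// metadata listing, appends one entry per bucket to the sharded omap index
+// (via RGWShardedOmapCRManager), then records each shard's total_entries in
+// its data sync marker.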
+class RGWListBucketIndexesCR : public RGWCoroutine {
+ RGWDataSyncCtx *sc;
+ RGWDataSyncEnv *sync_env = sc->env;
+
+ rgw::sal::RadosStore* driver = sync_env->driver;
+
+ rgw_data_sync_status *sync_status;
+ std::vector<RGWObjVersionTracker>& objvs;
+
+ int req_ret = 0;
+ int ret = 0;
+
+ list<string>::iterator iter;
+
+ unique_ptr<RGWShardedOmapCRManager> entries_index;
+ string oid_prefix =
+ datalog_sync_full_sync_index_prefix + "." + sc->source_zone.id;
+
+ string path = "/admin/metadata/bucket.instance";
+ bucket_instance_meta_info meta_info;
+ string key;
+
+ bool failed = false;
+ bool truncated = false;
+ read_metadata_list result;
+
+public:
+ RGWListBucketIndexesCR(RGWDataSyncCtx* sc,
+ rgw_data_sync_status* sync_status, std::vector<RGWObjVersionTracker>& objvs)
+ : RGWCoroutine(sc->cct), sc(sc), sync_status(sync_status), objvs(objvs) {}
+ ~RGWListBucketIndexesCR() override { }
+
+ int operate(const DoutPrefixProvider *dpp) override {
+ reenter(this) {
+ entries_index = std::make_unique<RGWShardedOmapCRManager>(
+ sync_env->async_rados, driver, this,
+ cct->_conf->rgw_data_log_num_shards,
+ sync_env->svc->zone->get_zone_params().log_pool,
+ oid_prefix);
+ yield; // yield so OmapAppendCRs can start
+
+ do {
+ yield {
+ string entrypoint = "/admin/metadata/bucket.instance"s;
+
+ rgw_http_param_pair pairs[] = {{"max-entries", "1000"},
+ {"marker", result.marker.c_str()},
+ {NULL, NULL}};
+
+ call(new RGWReadRESTResourceCR<read_metadata_list>(
+ sync_env->cct, sc->conn, sync_env->http_manager,
+ entrypoint, pairs, &result));
+ }
+ if (retcode < 0) {
+ ldpp_dout(dpp, 0)
+ << "ERROR: failed to fetch metadata for section bucket.instance"
+ << dendl;
+ return set_cr_error(retcode);
+ }
+
+ for (iter = result.keys.begin(); iter != result.keys.end(); ++iter) {
+ ldpp_dout(dpp, 20) << "list metadata: section=bucket.instance key="
+ << *iter << dendl;
+ key = *iter;
+
+ yield {
+ rgw_http_param_pair pairs[] = {{"key", key.c_str()},
+ {NULL, NULL}};
+
+ call(new RGWReadRESTResourceCR<bucket_instance_meta_info>(
+ sync_env->cct, sc->conn, sync_env->http_manager, path, pairs,
+ &meta_info));
+ }
+ if (retcode < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to fetch metadata for key: "
+ << key << dendl;
+ return set_cr_error(retcode);
+ }
+ // Now that bucket full sync is bucket-wide instead of
+ // per-shard, we only need to register a single shard of
+ // each bucket to guarantee that sync will see everything
+ // that happened before data full sync starts. This also
+ // means we don't have to care about the bucket's current
+ // shard count.
+ yield entries_index->append(
+ fmt::format("{}:{}", key, 0),
+ sync_env->svc->datalog_rados->get_log_shard_id(
+ meta_info.data.get_bucket_info().bucket, 0));
+ }
+ truncated = result.truncated;
+ } while (truncated);
+
+ yield {
+ if (!entries_index->finish()) {
+ failed = true;
+ }
+ }
+ if (!failed) {
+ for (auto iter = sync_status->sync_markers.begin();
+ iter != sync_status->sync_markers.end();
+ ++iter) {
+ int shard_id = (int)iter->first;
+ rgw_data_sync_marker& marker = iter->second;
+ marker.total_entries = entries_index->get_total_entries(shard_id);
+ spawn(new RGWSimpleRadosWriteCR<rgw_data_sync_marker>(
+ dpp, sync_env->driver,
+ rgw_raw_obj(sync_env->svc->zone->get_zone_params().log_pool,
+ RGWDataSyncStatusManager::shard_obj_name(
+ sc->source_zone, shard_id)),
+ marker, &objvs[shard_id]),
+ true);
+ }
+ } else {
+ yield call(sync_env->error_logger->log_error_cr(
+ dpp, sc->conn->get_remote_id(), "data.init", "",
+ EIO, string("failed to build bucket instances map")));
+ }
+ while (collect(&ret, NULL)) {
+ if (ret < 0) {
+ yield call(sync_env->error_logger->log_error_cr(
+ dpp, sc->conn->get_remote_id(), "data.init", "",
+                  -ret, string("failed to store sync status: ") +
+ cpp_strerror(-ret)));
+ req_ret = ret;
+ }
+ yield;
+ }
+ drain_all();
+ if (req_ret < 0) {
+ yield return set_cr_error(req_ret);
+ }
+ yield return set_cr_done();
+ }
+ return 0;
+ }
+};
+
+#define DATA_SYNC_UPDATE_MARKER_WINDOW 1
+
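+// Tracks the sync position within one datalog shard: store_marker() persists
+// the rgw_data_sync_marker to the shard's status object via
+// RGWSimpleRadosWriteCR, guarded by the shard's RGWObjVersionTracker.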
+class RGWDataSyncShardMarkerTrack : public RGWSyncShardMarkerTrack<string, string> {
+ RGWDataSyncCtx *sc;
+ RGWDataSyncEnv *sync_env;
+ string marker_oid;
+ rgw_data_sync_marker sync_marker;
+ RGWSyncTraceNodeRef tn;
+ RGWObjVersionTracker& objv;
+
+public:
+ RGWDataSyncShardMarkerTrack(RGWDataSyncCtx *_sc,
+ const string& _marker_oid,
+ const rgw_data_sync_marker& _marker,
+ RGWSyncTraceNodeRef& _tn, RGWObjVersionTracker& objv) : RGWSyncShardMarkerTrack(DATA_SYNC_UPDATE_MARKER_WINDOW),
+ sc(_sc), sync_env(_sc->env),
+ marker_oid(_marker_oid),
+ sync_marker(_marker),
+ tn(_tn), objv(objv) {}
+
+ RGWCoroutine* store_marker(const string& new_marker, uint64_t index_pos, const real_time& timestamp) override {
+ sync_marker.marker = new_marker;
+ sync_marker.pos = index_pos;
+ sync_marker.timestamp = timestamp;
+
+ tn->log(20, SSTR("updating marker marker_oid=" << marker_oid << " marker=" << new_marker));
+
+ return new RGWSimpleRadosWriteCR<rgw_data_sync_marker>(sync_env->dpp, sync_env->driver,
+ rgw_raw_obj(sync_env->svc->zone->get_zone_params().log_pool, marker_oid),
+ sync_marker, &objv);
+ }
+
+ RGWOrderCallCR *allocate_order_control_cr() override {
+ return new RGWLastCallerWinsCR(sync_env->cct);
+ }
+};
+
+// ostream wrappers to print buckets without copying strings
+struct bucket_str {
+ const rgw_bucket& b;
+ explicit bucket_str(const rgw_bucket& b) : b(b) {}
+};
+std::ostream& operator<<(std::ostream& out, const bucket_str& rhs) {
+ auto& b = rhs.b;
+ if (!b.tenant.empty()) {
+ out << b.tenant << '/';
+ }
+ out << b.name;
+ if (!b.bucket_id.empty()) {
+ out << ':' << b.bucket_id;
+ }
+ return out;
+}
+
+struct bucket_str_noinstance {
+ const rgw_bucket& b;
+ explicit bucket_str_noinstance(const rgw_bucket& b) : b(b) {}
+};
+std::ostream& operator<<(std::ostream& out, const bucket_str_noinstance& rhs) {
+ auto& b = rhs.b;
+ if (!b.tenant.empty()) {
+ out << b.tenant << '/';
+ }
+ out << b.name;
+ return out;
+}
+
+struct bucket_shard_str {
+ const rgw_bucket_shard& bs;
+ explicit bucket_shard_str(const rgw_bucket_shard& bs) : bs(bs) {}
+};
+std::ostream& operator<<(std::ostream& out, const bucket_shard_str& rhs) {
+ auto& bs = rhs.bs;
+ out << bucket_str{bs.bucket};
+ if (bs.shard_id >= 0) {
+ out << ':' << bs.shard_id;
+ }
+ return out;
+}
+#if FMT_VERSION >= 90000
+template <> struct fmt::formatter<bucket_shard_str> : fmt::ostream_formatter {};
+#endif
+
+struct all_bucket_info {
+ RGWBucketInfo bucket_info;
+ map<string, bufferlist> attrs;
+};
+
+struct rgw_sync_pipe_info_entity
+{
+private:
+ RGWBucketInfo bucket_info;
+ map<string, bufferlist> bucket_attrs;
+ bool _has_bucket_info{false};
+
+public:
+ rgw_zone_id zone;
+
+ rgw_sync_pipe_info_entity() {}
+ rgw_sync_pipe_info_entity(const rgw_sync_bucket_entity& e,
+ std::optional<all_bucket_info>& binfo) {
+ if (e.zone) {
+ zone = *e.zone;
+ }
+ if (!e.bucket) {
+ return;
+ }
+ if (!binfo ||
+ binfo->bucket_info.bucket != *e.bucket) {
+ bucket_info.bucket = *e.bucket;
+ } else {
+ set_bucket_info(*binfo);
+ }
+ }
+
+ void update_empty_bucket_info(const std::map<rgw_bucket, all_bucket_info>& buckets_info) {
+ if (_has_bucket_info) {
+ return;
+ }
+ if (bucket_info.bucket.name.empty()) {
+ return;
+ }
+
+ auto iter = buckets_info.find(bucket_info.bucket);
+ if (iter == buckets_info.end()) {
+ return;
+ }
+
+ set_bucket_info(iter->second);
+ }
+
+ bool has_bucket_info() const {
+ return _has_bucket_info;
+ }
+
+ void set_bucket_info(const all_bucket_info& all_info) {
+ bucket_info = all_info.bucket_info;
+ bucket_attrs = all_info.attrs;
+ _has_bucket_info = true;
+ }
+
+ const RGWBucketInfo& get_bucket_info() const {
+ return bucket_info;
+ }
+
+ const rgw_bucket& get_bucket() const {
+ return bucket_info.bucket;
+ }
+
+ bool operator<(const rgw_sync_pipe_info_entity& e) const {
+ if (zone < e.zone) {
+ return false;
+ }
+ if (zone > e.zone) {
+ return true;
+ }
+ return (bucket_info.bucket < e.bucket_info.bucket);
+ }
+};
+
+std::ostream& operator<<(std::ostream& out, const rgw_sync_pipe_info_entity& e) {
+ auto& bucket = e.get_bucket_info().bucket;
+
+ out << e.zone << ":" << bucket.get_key();
+ return out;
+}
+
+struct rgw_sync_pipe_handler_info {
+ RGWBucketSyncFlowManager::pipe_handler handler;
+ rgw_sync_pipe_info_entity source;
+ rgw_sync_pipe_info_entity target;
+
+ rgw_sync_pipe_handler_info() {}
+ rgw_sync_pipe_handler_info(const RGWBucketSyncFlowManager::pipe_handler& _handler,
+ std::optional<all_bucket_info> source_bucket_info,
+ std::optional<all_bucket_info> target_bucket_info) : handler(_handler),
+ source(handler.source, source_bucket_info),
+ target(handler.dest, target_bucket_info) {
+ }
+
+ bool operator<(const rgw_sync_pipe_handler_info& p) const {
+ if (source < p.source) {
+ return true;
+ }
+ if (p.source < source) {
+ return false;
+ }
+ return (target < p.target);
+ }
+
+ void update_empty_bucket_info(const std::map<rgw_bucket, all_bucket_info>& buckets_info) {
+ source.update_empty_bucket_info(buckets_info);
+ target.update_empty_bucket_info(buckets_info);
+ }
+};
+
+std::ostream& operator<<(std::ostream& out, const rgw_sync_pipe_handler_info& p) {
+ out << p.source << ">" << p.target;
+ return out;
+}
+
+struct rgw_sync_pipe_info_set {
+ std::set<rgw_sync_pipe_handler_info> handlers;
+
+ using iterator = std::set<rgw_sync_pipe_handler_info>::iterator;
+
+ void clear() {
+ handlers.clear();
+ }
+
+ void insert(const RGWBucketSyncFlowManager::pipe_handler& handler,
+ std::optional<all_bucket_info>& source_bucket_info,
+ std::optional<all_bucket_info>& target_bucket_info) {
+ rgw_sync_pipe_handler_info p(handler, source_bucket_info, target_bucket_info);
+ handlers.insert(p);
+ }
+
+ iterator begin() {
+ return handlers.begin();
+ }
+
+ iterator end() {
+ return handlers.end();
+ }
+
+ size_t size() const {
+ return handlers.size();
+ }
+
+ bool empty() const {
+ return handlers.empty();
+ }
+
+ void update_empty_bucket_info(const std::map<rgw_bucket, all_bucket_info>& buckets_info) {
+ if (buckets_info.empty()) {
+ return;
+ }
+
+ std::set<rgw_sync_pipe_handler_info> p;
+
+ for (auto pipe : handlers) {
+ pipe.update_empty_bucket_info(buckets_info);
+ p.insert(pipe);
+ }
+
+ handlers = std::move(p);
+ }
+};
+
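+// Runs bucket sync for a source bucket shard across all matching sync
+// pipes, tracking per-shard progress.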
+class RGWRunBucketSourcesSyncCR : public RGWCoroutine {
+ RGWDataSyncCtx *sc;
+ RGWDataSyncEnv *sync_env;
+ boost::intrusive_ptr<const RGWContinuousLeaseCR> lease_cr;
+
+ rgw_sync_pipe_info_set pipes;
+ rgw_sync_pipe_info_set::iterator siter;
+
+ rgw_bucket_sync_pair_info sync_pair;
+
+ RGWSyncTraceNodeRef tn;
+ ceph::real_time* progress;
+ std::vector<ceph::real_time> shard_progress;
+ std::vector<ceph::real_time>::iterator cur_shard_progress;
+
+ RGWRESTConn *conn{nullptr};
+ rgw_zone_id last_zone;
+
+ std::optional<uint64_t> gen;
+ rgw_bucket_index_marker_info marker_info;
+ BucketIndexShardsManager marker_mgr;
+
+public:
+ RGWRunBucketSourcesSyncCR(RGWDataSyncCtx *_sc,
+ boost::intrusive_ptr<const RGWContinuousLeaseCR> lease_cr,
+ const rgw_bucket_shard& source_bs,
+ const RGWSyncTraceNodeRef& _tn_parent,
+ std::optional<uint64_t> gen,
+ ceph::real_time* progress);
+
+ int operate(const DoutPrefixProvider *dpp) override;
+};
+
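+// Syncs a single datalog entry (one bucket shard obligation). Concurrent
+// obligations for the same shard are merged via the cached bucket-shard
+// state; failures are written to the error repo for retry.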
+class RGWDataSyncSingleEntryCR : public RGWCoroutine {
+ RGWDataSyncCtx *sc;
+ RGWDataSyncEnv *sync_env;
+ rgw::bucket_sync::Handle state; // cached bucket-shard state
+ rgw_data_sync_obligation obligation; // input obligation
+ std::optional<rgw_data_sync_obligation> complete; // obligation to complete
+ uint32_t obligation_counter = 0;
+ RGWDataSyncShardMarkerTrack *marker_tracker;
+ rgw_raw_obj error_repo;
+ boost::intrusive_ptr<const RGWContinuousLeaseCR> lease_cr;
+ RGWSyncTraceNodeRef tn;
+
+ ceph::real_time progress;
+ int sync_status = 0;
+public:
+ RGWDataSyncSingleEntryCR(RGWDataSyncCtx *_sc, rgw::bucket_sync::Handle state,
+ rgw_data_sync_obligation _obligation,
+ RGWDataSyncShardMarkerTrack *_marker_tracker,
+ const rgw_raw_obj& error_repo,
+ boost::intrusive_ptr<const RGWContinuousLeaseCR> lease_cr,
+ const RGWSyncTraceNodeRef& _tn_parent)
+ : RGWCoroutine(_sc->cct), sc(_sc), sync_env(_sc->env),
+ state(std::move(state)), obligation(std::move(_obligation)),
+ marker_tracker(_marker_tracker), error_repo(error_repo),
+ lease_cr(std::move(lease_cr)) {
+ set_description() << "data sync single entry (source_zone=" << sc->source_zone << ") " << obligation;
+ tn = sync_env->sync_tracer->add_node(_tn_parent, "entry", to_string(obligation.bs, obligation.gen));
+ }
+
+ int operate(const DoutPrefixProvider *dpp) override {
+ reenter(this) {
+ if (state->obligation) {
+ // this is already syncing in another DataSyncSingleEntryCR
+ if (state->obligation->timestamp < obligation.timestamp) {
+ // cancel existing obligation and overwrite it
+ tn->log(10, SSTR("canceling existing obligation " << *state->obligation));
+ complete = std::move(*state->obligation);
+ *state->obligation = std::move(obligation);
+ state->counter++;
+ } else {
+ // cancel new obligation
+ tn->log(10, SSTR("canceling new obligation " << obligation));
+ complete = std::move(obligation);
+ }
+ } else {
+ // start syncing a new obligation
+ state->obligation = obligation;
+ obligation_counter = state->counter;
+ state->counter++;
+
+ // loop until the latest obligation is satisfied, because other callers
+ // may update the obligation while we're syncing
+ while ((state->obligation->timestamp == ceph::real_time() ||
+ state->progress_timestamp < state->obligation->timestamp) &&
+ obligation_counter != state->counter) {
+ obligation_counter = state->counter;
+ progress = ceph::real_time{};
+
+ ldout(cct, 4) << "starting sync on " << bucket_shard_str{state->key.first}
+ << ' ' << *state->obligation << " progress timestamp " << state->progress_timestamp
+ << " progress " << progress << dendl;
+ yield call(new RGWRunBucketSourcesSyncCR(sc, lease_cr,
+ state->key.first, tn,
+ state->obligation->gen,
+ &progress));
+ if (retcode < 0) {
+ break;
+ }
+ state->progress_timestamp = std::max(progress, state->progress_timestamp);
+ }
+ // any new obligations will process themselves
+ complete = std::move(*state->obligation);
+ state->obligation.reset();
+
+ tn->log(10, SSTR("sync finished on " << bucket_shard_str{state->key.first}
+ << " progress=" << progress << ' ' << complete << " r=" << retcode));
+ }
+ sync_status = retcode;
+
+ if (sync_status == -ENOENT) {
+ // this was added when 'tenant/' was added to datalog entries, because
+ // preexisting tenant buckets could never sync and would stay in the
+ // error_repo forever
+ tn->log(0, SSTR("WARNING: skipping data log entry for missing bucket " << complete->bs));
+ sync_status = 0;
+ }
+
+ if (sync_status < 0) {
+ // write actual sync failures for 'radosgw-admin sync error list'
+ if (sync_status != -EBUSY && sync_status != -EAGAIN) {
+ yield call(sync_env->error_logger->log_error_cr(dpp, sc->conn->get_remote_id(), "data",
+ to_string(complete->bs, complete->gen),
+ -sync_status, string("failed to sync bucket instance: ") + cpp_strerror(-sync_status)));
+ if (retcode < 0) {
+ tn->log(0, SSTR("ERROR: failed to log sync failure: retcode=" << retcode));
+ }
+ }
+ if (complete->timestamp != ceph::real_time{}) {
+ tn->log(10, SSTR("writing " << *complete << " to error repo for retry"));
+ yield call(rgw::error_repo::write_cr(sync_env->driver->svc()->rados, error_repo,
+ rgw::error_repo::encode_key(complete->bs, complete->gen),
+ complete->timestamp));
+ if (retcode < 0) {
+ tn->log(0, SSTR("ERROR: failed to log sync failure in error repo: retcode=" << retcode));
+ }
+ }
+ } else if (complete->retry) {
+ yield call(rgw::error_repo::remove_cr(sync_env->driver->svc()->rados, error_repo,
+ rgw::error_repo::encode_key(complete->bs, complete->gen),
+ complete->timestamp));
+ if (retcode < 0) {
+ tn->log(0, SSTR("ERROR: failed to remove omap key from error repo ("
+ << error_repo << " retcode=" << retcode));
+ }
+ }
+      /* FIXME: what to do in case of error */
+ if (marker_tracker && !complete->marker.empty()) {
+ /* update marker */
+ yield call(marker_tracker->finish(complete->marker));
+ if (retcode < 0) {
+ return set_cr_error(retcode);
+ }
+ }
+ if (sync_status == 0) {
+ sync_status = retcode;
+ }
+ if (sync_status < 0) {
+ return set_cr_error(sync_status);
+ }
+ return set_cr_done();
+ }
+ return 0;
+ }
+};
+
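+// Return the error ("retry") repo object of the datalog shard that owns the
+// given bucket shard.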
+rgw_raw_obj datalog_oid_for_error_repo(RGWDataSyncCtx *sc, rgw::sal::RadosStore* driver,
+ rgw_pool& pool, rgw_bucket_shard& bs) {
+ int datalog_shard = driver->svc()->datalog_rados->choose_oid(bs);
+ string oid = RGWDataSyncStatusManager::shard_obj_name(sc->source_zone, datalog_shard);
+ return rgw_raw_obj(pool, oid + ".retry");
+}
+
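+// Expands a full-sync obligation from the error repo: writes a retry entry
+// for every shard of every bilog generation of the bucket, then removes the
+// original obligation once all writes succeed.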
+class RGWDataIncrementalSyncFullObligationCR: public RGWCoroutine {
+ RGWDataSyncCtx *sc;
+ RGWDataSyncEnv *sync_env;
+ rgw_bucket_shard source_bs;
+ rgw_raw_obj error_repo;
+ std::string error_marker;
+ ceph::real_time timestamp;
+ RGWSyncTraceNodeRef tn;
+ rgw_bucket_index_marker_info remote_info;
+ rgw_pool pool;
+ uint32_t sid;
+ rgw_bucket_shard bs;
+ std::vector<store_gen_shards>::const_iterator each;
+
+public:
+ RGWDataIncrementalSyncFullObligationCR(RGWDataSyncCtx *_sc, rgw_bucket_shard& _source_bs,
+ const rgw_raw_obj& error_repo, const std::string& _error_marker,
+ ceph::real_time& _timestamp, RGWSyncTraceNodeRef& _tn)
+ : RGWCoroutine(_sc->cct), sc(_sc), sync_env(_sc->env), source_bs(_source_bs),
+ error_repo(error_repo), error_marker(_error_marker), timestamp(_timestamp),
+ tn(sync_env->sync_tracer->add_node(_tn, "error_repo", SSTR(bucket_shard_str(source_bs))))
+ {}
+
+ int operate(const DoutPrefixProvider *dpp) override {
+ reenter(this) {
+ yield call(new RGWReadRemoteBucketIndexLogInfoCR(sc, source_bs.bucket, &remote_info));
+ if (retcode < 0) {
+ return set_cr_error(retcode);
+ }
+
+ each = remote_info.generations.cbegin();
+ for (; each != remote_info.generations.cend(); each++) {
+ for (sid = 0; sid < each->num_shards; sid++) {
+ bs.bucket = source_bs.bucket;
+ bs.shard_id = sid;
+ pool = sync_env->svc->zone->get_zone_params().log_pool;
+ error_repo = datalog_oid_for_error_repo(sc, sync_env->driver, pool, source_bs);
+ tn->log(10, SSTR("writing shard_id " << sid << " of gen " << each->gen << " to error repo for retry"));
+ yield_spawn_window(rgw::error_repo::write_cr(sync_env->driver->svc()->rados, error_repo,
+ rgw::error_repo::encode_key(bs, each->gen),
+ timestamp), sc->lcc.adj_concurrency(cct->_conf->rgw_data_sync_spawn_window),
+ [&](uint64_t stack_id, int ret) {
+ if (ret < 0) {
+ retcode = ret;
+ }
+ return 0;
+ });
+ }
+ }
+ drain_all_cb([&](uint64_t stack_id, int ret) {
+ if (ret < 0) {
+ tn->log(10, SSTR("writing to error repo returned error: " << ret));
+ }
+ return ret;
+ });
+
+ // once everything succeeds, remove the full sync obligation from the error repo
+ yield call(rgw::error_repo::remove_cr(sync_env->driver->svc()->rados, error_repo,
+ error_marker, timestamp));
+ return set_cr_done();
+ }
+ return 0;
+ }
+};
+
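+// Build an RGWDataSyncSingleEntryCR for the given bucket shard, using the
+// cached bucket-shard state.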
+RGWCoroutine* data_sync_single_entry(RGWDataSyncCtx *sc, const rgw_bucket_shard& src,
+ std::optional<uint64_t> gen,
+ const std::string marker,
+ ceph::real_time timestamp,
+ boost::intrusive_ptr<const RGWContinuousLeaseCR> lease_cr,
+ boost::intrusive_ptr<rgw::bucket_sync::Cache> bucket_shard_cache,
+ RGWDataSyncShardMarkerTrack* marker_tracker,
+ rgw_raw_obj error_repo,
+ RGWSyncTraceNodeRef& tn,
+ bool retry) {
+ auto state = bucket_shard_cache->get(src, gen);
+ auto obligation = rgw_data_sync_obligation{src, gen, marker, timestamp, retry};
+ return new RGWDataSyncSingleEntryCR(sc, std::move(state), std::move(obligation),
+ &*marker_tracker, error_repo,
+ lease_cr.get(), tn);
+}
+
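+// Look up the sync-status timestamp of the datalog shard that owns the
+// given bucket shard.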
+static ceph::real_time timestamp_for_bucket_shard(rgw::sal::RadosStore* driver,
+ const rgw_data_sync_status& sync_status,
+ const rgw_bucket_shard& bs) {
+ int datalog_shard = driver->svc()->datalog_rados->choose_oid(bs);
+ auto status = sync_status.sync_markers.find(datalog_shard);
+ if (status == sync_status.sync_markers.end()) {
+ return ceph::real_clock::zero();
+ }
+ return status->second.timestamp;
+}
+
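+// Full-sync handling of one datalog entry: reads the remote bucket's bilog
+// generations and syncs every shard of every generation, writing failures
+// to the error repo for retry.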
+class RGWDataFullSyncSingleEntryCR : public RGWCoroutine {
+ RGWDataSyncCtx *sc;
+ RGWDataSyncEnv *sync_env;
+ rgw_pool pool;
+ rgw_bucket_shard source_bs;
+ const std::string key;
+ rgw_data_sync_status sync_status;
+ rgw_raw_obj error_repo;
+ ceph::real_time timestamp;
+ boost::intrusive_ptr<const RGWContinuousLeaseCR> lease_cr;
+ boost::intrusive_ptr<rgw::bucket_sync::Cache> bucket_shard_cache;
+ RGWDataSyncShardMarkerTrack* marker_tracker;
+ RGWSyncTraceNodeRef tn;
+ rgw_bucket_index_marker_info remote_info;
+ uint32_t sid;
+ std::vector<store_gen_shards>::iterator each;
+ uint64_t i{0};
+ RGWCoroutine* shard_cr = nullptr;
+ bool first_shard = true;
+ bool error_inject;
+
+public:
+ RGWDataFullSyncSingleEntryCR(RGWDataSyncCtx *_sc, const rgw_pool& _pool, const rgw_bucket_shard& _source_bs,
+ const std::string& _key, const rgw_data_sync_status& sync_status, const rgw_raw_obj& _error_repo,
+ ceph::real_time _timestamp, boost::intrusive_ptr<const RGWContinuousLeaseCR> _lease_cr,
+ boost::intrusive_ptr<rgw::bucket_sync::Cache> _bucket_shard_cache,
+ RGWDataSyncShardMarkerTrack* _marker_tracker,
+ RGWSyncTraceNodeRef& _tn)
+ : RGWCoroutine(_sc->cct), sc(_sc), sync_env(_sc->env), pool(_pool), source_bs(_source_bs), key(_key),
+ error_repo(_error_repo), timestamp(_timestamp), lease_cr(std::move(_lease_cr)),
+ bucket_shard_cache(_bucket_shard_cache), marker_tracker(_marker_tracker), tn(_tn) {
+ error_inject = (sync_env->cct->_conf->rgw_sync_data_full_inject_err_probability > 0);
+ }
+
+
+ int operate(const DoutPrefixProvider *dpp) override {
+ reenter(this) {
+ if (error_inject &&
+ rand() % 10000 < cct->_conf->rgw_sync_data_full_inject_err_probability * 10000.0) {
+ tn->log(0, SSTR("injecting read bilog info error on key=" << key));
+ retcode = -ENOENT;
+ } else {
+ tn->log(0, SSTR("read bilog info key=" << key));
+ yield call(new RGWReadRemoteBucketIndexLogInfoCR(sc, source_bs.bucket, &remote_info));
+ }
+
+ if (retcode < 0) {
+ tn->log(10, SSTR("full sync: failed to read remote bucket info. Writing "
+ << source_bs.shard_id << " to error repo for retry"));
+ yield call(rgw::error_repo::write_cr(sync_env->driver->svc()->rados, error_repo,
+ rgw::error_repo::encode_key(source_bs, std::nullopt),
+ timestamp));
+ if (retcode < 0) {
+ tn->log(0, SSTR("ERROR: failed to log " << source_bs.shard_id << " in error repo: retcode=" << retcode));
+ }
+ yield call(marker_tracker->finish(key));
+ return set_cr_error(retcode);
+ }
+
+      // wait to sync the first shard of the oldest generation, then sync all other shards.
+      // if any of the operations fails at any point, write it to the error repo for later retry.
+
+ each = remote_info.generations.begin();
+ for (; each != remote_info.generations.end(); each++) {
+ for (sid = 0; sid < each->num_shards; sid++) {
+ source_bs.shard_id = sid;
+ // use the error repo and sync status timestamp from the datalog shard corresponding to source_bs
+ error_repo = datalog_oid_for_error_repo(sc, sync_env->driver, pool, source_bs);
+ timestamp = timestamp_for_bucket_shard(sync_env->driver, sync_status, source_bs);
+ if (retcode < 0) {
+ tn->log(10, SSTR("Write " << source_bs.shard_id << " to error repo for retry"));
+ yield_spawn_window(rgw::error_repo::write_cr(sync_env->driver->svc()->rados, error_repo,
+ rgw::error_repo::encode_key(source_bs, each->gen),
+ timestamp), sc->lcc.adj_concurrency(cct->_conf->rgw_data_sync_spawn_window), std::nullopt);
+ } else {
+ shard_cr = data_sync_single_entry(sc, source_bs, each->gen, key, timestamp,
+ lease_cr, bucket_shard_cache, nullptr, error_repo, tn, false);
+ tn->log(10, SSTR("full sync: syncing shard_id " << sid << " of gen " << each->gen));
+ if (first_shard) {
+ yield call(shard_cr);
+ first_shard = false;
+ } else {
+ yield_spawn_window(shard_cr, sc->lcc.adj_concurrency(cct->_conf->rgw_data_sync_spawn_window),
+ [&](uint64_t stack_id, int ret) {
+ if (ret < 0) {
+ retcode = ret;
+ }
+ return retcode;
+ });
+ }
+ }
+ }
+ drain_all_cb([&](uint64_t stack_id, int ret) {
+ if (ret < 0) {
+ retcode = ret;
+ }
+ return retcode;
+ });
+ }
+
+ yield call(marker_tracker->finish(key));
+ if (retcode < 0) {
+ return set_cr_error(retcode);
+ }
+
+ return set_cr_done();
+ }
+ return 0;
+ }
+};
+
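+// State shared by the full and incremental per-shard data sync coroutines.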
+class RGWDataBaseSyncShardCR : public RGWCoroutine {
+protected:
+ RGWDataSyncCtx *const sc;
+ const rgw_pool& pool;
+ const uint32_t shard_id;
+ rgw_data_sync_marker& sync_marker;
+ RGWSyncTraceNodeRef tn;
+ const string& status_oid;
+ const rgw_raw_obj& error_repo;
+ boost::intrusive_ptr<const RGWContinuousLeaseCR> lease_cr;
+ const rgw_data_sync_status& sync_status;
+ RGWObjVersionTracker& objv;
+ boost::intrusive_ptr<rgw::bucket_sync::Cache> bucket_shard_cache;
+
+ std::optional<RGWDataSyncShardMarkerTrack> marker_tracker;
+ RGWRadosGetOmapValsCR::ResultPtr omapvals;
+ rgw_bucket_shard source_bs;
+
+ int parse_bucket_key(const std::string& key, rgw_bucket_shard& bs) const {
+ int ret = rgw_bucket_parse_bucket_key(sc->env->cct, key,
+ &bs.bucket, &bs.shard_id);
+    // when num_shards is 0, shard_id is parsed as -1 because the bucket
+    // instance string has no shard_id delimiter; interpret it as shard 0.
+ if (ret == 0) {
+ if (bs.shard_id < 0) {
+ bs.shard_id = 0;
+ }
+ }
+ return ret;
+ }
+
+ RGWDataBaseSyncShardCR(
+ RGWDataSyncCtx *const _sc, const rgw_pool& pool, const uint32_t shard_id,
+ rgw_data_sync_marker& sync_marker, RGWSyncTraceNodeRef tn,
+ const string& status_oid, const rgw_raw_obj& error_repo,
+ boost::intrusive_ptr<const RGWContinuousLeaseCR> lease_cr,
+ const rgw_data_sync_status& sync_status,
+ RGWObjVersionTracker& objv,
+ const boost::intrusive_ptr<rgw::bucket_sync::Cache>& bucket_shard_cache)
+ : RGWCoroutine(_sc->cct), sc(_sc), pool(pool), shard_id(shard_id),
+ sync_marker(sync_marker), tn(tn), status_oid(status_oid),
+ error_repo(error_repo), lease_cr(std::move(lease_cr)),
+ sync_status(sync_status), objv(objv),
+ bucket_shard_cache(bucket_shard_cache) {}
+};
+
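+// Full sync of one datalog shard: lists the full-sync index object and
+// spawns per-bucket sync, then transitions the shard marker to incremental.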
+class RGWDataFullSyncShardCR : public RGWDataBaseSyncShardCR {
+ static constexpr auto OMAP_GET_MAX_ENTRIES = 100;
+
+ string oid;
+ uint64_t total_entries = 0;
+ ceph::real_time entry_timestamp;
+ std::map<std::string, bufferlist> entries;
+ std::map<std::string, bufferlist>::iterator iter;
+ string error_marker;
+
+public:
+
+ RGWDataFullSyncShardCR(
+ RGWDataSyncCtx *const sc, const rgw_pool& pool, const uint32_t shard_id,
+ rgw_data_sync_marker& sync_marker, RGWSyncTraceNodeRef tn,
+ const string& status_oid, const rgw_raw_obj& error_repo,
+ boost::intrusive_ptr<const RGWContinuousLeaseCR> lease_cr,
+ const rgw_data_sync_status& sync_status, RGWObjVersionTracker& objv,
+ const boost::intrusive_ptr<rgw::bucket_sync::Cache>& bucket_shard_cache)
+ : RGWDataBaseSyncShardCR(sc, pool, shard_id, sync_marker, tn,
+ status_oid, error_repo, std::move(lease_cr),
+ sync_status, objv, bucket_shard_cache) {}
+
+ int operate(const DoutPrefixProvider *dpp) override {
+ reenter(this) {
+ tn->log(10, "start full sync");
+ oid = full_data_sync_index_shard_oid(sc->source_zone, shard_id);
+ marker_tracker.emplace(sc, status_oid, sync_marker, tn, objv);
+ total_entries = sync_marker.pos;
+ entry_timestamp = sync_marker.timestamp; // time when full sync started
+ do {
+ if (!lease_cr->is_locked()) {
+ drain_all();
+ tn->log(1, "lease is lost, abort");
+ return set_cr_error(-ECANCELED);
+ }
+ omapvals = std::make_shared<RGWRadosGetOmapValsCR::Result>();
+ yield call(new RGWRadosGetOmapValsCR(sc->env->driver,
+ rgw_raw_obj(pool, oid),
+ sync_marker.marker,
+ OMAP_GET_MAX_ENTRIES, omapvals));
+ if (retcode < 0) {
+ drain_all();
+ return set_cr_error(retcode);
+ }
+ entries = std::move(omapvals->entries);
+ if (entries.size() > 0) {
+ tn->set_flag(RGW_SNS_FLAG_ACTIVE); /* actually have entries to sync */
+ }
+ tn->log(20, SSTR("retrieved " << entries.size() << " entries to sync"));
+ iter = entries.begin();
+ for (; iter != entries.end(); ++iter) {
+ retcode = parse_bucket_key(iter->first, source_bs);
+ if (retcode < 0) {
+ tn->log(1, SSTR("failed to parse bucket shard: " << iter->first));
+ marker_tracker->try_update_high_marker(iter->first, 0,
+ entry_timestamp);
+ continue;
+ }
+ tn->log(20, SSTR("full sync: " << iter->first));
+ total_entries++;
+ if (!marker_tracker->start(iter->first, total_entries,
+ entry_timestamp)) {
+ tn->log(0, SSTR("ERROR: cannot start syncing " << iter->first
+ << ". Duplicate entry?"));
+ } else {
+            tn->log(10, SSTR("timestamp for " << iter->first << " is: " << entry_timestamp));
+ yield_spawn_window(new RGWDataFullSyncSingleEntryCR(
+ sc, pool, source_bs, iter->first, sync_status,
+ error_repo, entry_timestamp, lease_cr,
+ bucket_shard_cache, &*marker_tracker, tn),
+ sc->lcc.adj_concurrency(cct->_conf->rgw_data_sync_spawn_window),
+ std::nullopt);
+ }
+ sync_marker.marker = iter->first;
+ }
+ } while (omapvals->more);
+ omapvals.reset();
+
+ drain_all();
+
+ tn->unset_flag(RGW_SNS_FLAG_ACTIVE);
+
+ /* update marker to reflect we're done with full sync */
+ sync_marker.state = rgw_data_sync_marker::IncrementalSync;
+ sync_marker.marker = sync_marker.next_step_marker;
+ sync_marker.next_step_marker.clear();
+ yield call(new RGWSimpleRadosWriteCR<rgw_data_sync_marker>(
+ sc->env->dpp, sc->env->driver,
+ rgw_raw_obj(pool, status_oid), sync_marker, &objv));
+ if (retcode < 0) {
+ tn->log(0, SSTR("ERROR: failed to set sync marker: retcode=" << retcode));
+ return set_cr_error(retcode);
+ }
+
+ // clean up full sync index, ignoring errors
+ yield call(new RGWRadosRemoveCR(sc->env->driver, {pool, oid}));
+
+ // transition to incremental sync
+ return set_cr_done();
+ }
+ return 0;
+ }
+};
+
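+// Incremental sync of one datalog shard: handles out-of-band update
+// notifications, retries entries from the error repo, and tails the remote
+// datalog from the stored marker.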
+class RGWDataIncSyncShardCR : public RGWDataBaseSyncShardCR {
+ static constexpr int max_error_entries = 10;
+ static constexpr uint32_t retry_backoff_secs = 60;
+
+ ceph::mutex& inc_lock;
+ bc::flat_set<rgw_data_notify_entry>& modified_shards;
+
+ bc::flat_set<rgw_data_notify_entry> current_modified;
+ decltype(current_modified)::iterator modified_iter;
+
+ ceph::coarse_real_time error_retry_time;
+ string error_marker;
+ std::map<std::string, bufferlist> error_entries;
+ decltype(error_entries)::iterator iter;
+ ceph::real_time entry_timestamp;
+ std::optional<uint64_t> gen;
+
+ string next_marker;
+ vector<rgw_data_change_log_entry> log_entries;
+ decltype(log_entries)::iterator log_iter;
+ bool truncated = false;
+ int cbret = 0;
+
+ utime_t get_idle_interval() const {
+ ceph::timespan interval = std::chrono::seconds(cct->_conf->rgw_data_sync_poll_interval);
+ if (!ceph::coarse_real_clock::is_zero(error_retry_time)) {
+ auto now = ceph::coarse_real_clock::now();
+ if (error_retry_time > now) {
+ auto d = error_retry_time - now;
+ if (interval > d) {
+ interval = d;
+ }
+ }
+ }
+ // convert timespan -> time_point -> utime_t
+ return utime_t(ceph::coarse_real_clock::zero() + interval);
+ }
+
+
+public:
+
+ RGWDataIncSyncShardCR(
+ RGWDataSyncCtx *const sc, const rgw_pool& pool, const uint32_t shard_id,
+ rgw_data_sync_marker& sync_marker, RGWSyncTraceNodeRef tn,
+ const string& status_oid, const rgw_raw_obj& error_repo,
+ boost::intrusive_ptr<const RGWContinuousLeaseCR> lease_cr,
+ const rgw_data_sync_status& sync_status, RGWObjVersionTracker& objv,
+ const boost::intrusive_ptr<rgw::bucket_sync::Cache>& bucket_shard_cache,
+ ceph::mutex& inc_lock,
+ bc::flat_set<rgw_data_notify_entry>& modified_shards)
+ : RGWDataBaseSyncShardCR(sc, pool, shard_id, sync_marker, tn,
+ status_oid, error_repo, std::move(lease_cr),
+ sync_status, objv, bucket_shard_cache),
+ inc_lock(inc_lock), modified_shards(modified_shards) {}
+
+ int operate(const DoutPrefixProvider *dpp) override {
+ reenter(this) {
+ tn->log(10, "start incremental sync");
+ marker_tracker.emplace(sc, status_oid, sync_marker, tn, objv);
+ do {
+ if (!lease_cr->is_locked()) {
+ drain_all();
+ tn->log(1, "lease is lost, abort");
+ return set_cr_error(-ECANCELED);
+ }
+ {
+ current_modified.clear();
+ std::unique_lock il(inc_lock);
+ current_modified.swap(modified_shards);
+ il.unlock();
+ }
+
+ if (current_modified.size() > 0) {
+ tn->set_flag(RGW_SNS_FLAG_ACTIVE); /* actually have entries to sync */
+ }
+ /* process out of band updates */
+ for (modified_iter = current_modified.begin();
+ modified_iter != current_modified.end();
+ ++modified_iter) {
+ if (!lease_cr->is_locked()) {
+ drain_all();
+ yield call(marker_tracker->flush());
+ if (retcode < 0) {
+ tn->log(0, SSTR("ERROR: data sync marker_tracker.flush() returned retcode=" << retcode));
+ return set_cr_error(retcode);
+ }
+ return set_cr_error(-ECANCELED);
+ }
+ retcode = parse_bucket_key(modified_iter->key, source_bs);
+ if (retcode < 0) {
+ tn->log(1, SSTR("failed to parse bucket shard: "
+ << modified_iter->key));
+ continue;
+ }
+ tn->log(20, SSTR("received async update notification: "
+ << modified_iter->key));
+ spawn(data_sync_single_entry(sc, source_bs, modified_iter->gen, {},
+ ceph::real_time{}, lease_cr,
+ bucket_shard_cache, &*marker_tracker,
+ error_repo, tn, false), false);
+ }
+
+ if (error_retry_time <= ceph::coarse_real_clock::now()) {
+ /* process bucket shards that previously failed */
+ omapvals = std::make_shared<RGWRadosGetOmapValsCR::Result>();
+ yield call(new RGWRadosGetOmapValsCR(sc->env->driver, error_repo,
+ error_marker, max_error_entries,
+ omapvals));
+ error_entries = std::move(omapvals->entries);
+ tn->log(20, SSTR("read error repo, got " << error_entries.size()
+ << " entries"));
+ iter = error_entries.begin();
+ for (; iter != error_entries.end(); ++iter) {
+ if (!lease_cr->is_locked()) {
+ drain_all();
+ yield call(marker_tracker->flush());
+ if (retcode < 0) {
+ tn->log(0, SSTR("ERROR: data sync marker_tracker.flush() returned retcode=" << retcode));
+ return set_cr_error(retcode);
+ }
+ return set_cr_error(-ECANCELED);
+ }
+ error_marker = iter->first;
+ entry_timestamp = rgw::error_repo::decode_value(iter->second);
+ retcode = rgw::error_repo::decode_key(iter->first, source_bs, gen);
+ if (retcode == -EINVAL) {
+ // backward compatibility for string keys that don't encode a gen
+ retcode = parse_bucket_key(error_marker, source_bs);
+ }
+ if (retcode < 0) {
+ tn->log(1, SSTR("failed to parse bucket shard: " << error_marker));
+ spawn(rgw::error_repo::remove_cr(sc->env->driver->svc()->rados,
+ error_repo, error_marker,
+ entry_timestamp),
+ false);
+ continue;
+ }
+ tn->log(10, SSTR("gen is " << gen));
+ if (!gen) {
+ // write all full sync obligations for the bucket to error repo
+ spawn(new RGWDataIncrementalSyncFullObligationCR(sc, source_bs,
+ error_repo, error_marker, entry_timestamp, tn), false);
+ } else {
+ tn->log(20, SSTR("handle error entry key="
+ << to_string(source_bs, gen)
+ << " timestamp=" << entry_timestamp));
+ spawn(data_sync_single_entry(sc, source_bs, gen, "",
+ entry_timestamp, lease_cr,
+ bucket_shard_cache, &*marker_tracker,
+ error_repo, tn, true), false);
+ }
+ }
+ if (!omapvals->more) {
+ error_retry_time = ceph::coarse_real_clock::now() +
+ make_timespan(retry_backoff_secs);
+ error_marker.clear();
+ }
+ }
+ omapvals.reset();
+
+ tn->log(20, SSTR("shard_id=" << shard_id << " sync_marker="
+ << sync_marker.marker));
+ yield call(new RGWReadRemoteDataLogShardCR(sc, shard_id,
+ sync_marker.marker,
+ &next_marker, &log_entries,
+ &truncated));
+ if (retcode < 0 && retcode != -ENOENT) {
+ tn->log(0, SSTR("ERROR: failed to read remote data log info: ret="
+ << retcode));
+ drain_all();
+ return set_cr_error(retcode);
+ }
+
+ if (log_entries.size() > 0) {
+ tn->set_flag(RGW_SNS_FLAG_ACTIVE); /* actually have entries to sync */
+ }
+
+ for (log_iter = log_entries.begin();
+ log_iter != log_entries.end();
+ ++log_iter) {
+ if (!lease_cr->is_locked()) {
+ drain_all();
+ yield call(marker_tracker->flush());
+ if (retcode < 0) {
+ tn->log(0, SSTR("ERROR: data sync marker_tracker.flush() returned retcode=" << retcode));
+ return set_cr_error(retcode);
+ }
+ return set_cr_error(-ECANCELED);
+ }
+
+ tn->log(20, SSTR("shard_id=" << shard_id << " log_entry: " << log_iter->log_id << ":" << log_iter->log_timestamp << ":" << log_iter->entry.key));
+ retcode = parse_bucket_key(log_iter->entry.key, source_bs);
+ if (retcode < 0) {
+ tn->log(1, SSTR("failed to parse bucket shard: "
+ << log_iter->entry.key));
+ marker_tracker->try_update_high_marker(log_iter->log_id, 0,
+ log_iter->log_timestamp);
+ continue;
+ }
+ if (!marker_tracker->start(log_iter->log_id, 0,
+ log_iter->log_timestamp)) {
+ tn->log(0, SSTR("ERROR: cannot start syncing " << log_iter->log_id
+ << ". Duplicate entry?"));
+ } else {
+            tn->log(1, SSTR("incremental sync on " << log_iter->entry.key << " shard: " << shard_id << " on gen " << log_iter->entry.gen));
+ yield_spawn_window(data_sync_single_entry(sc, source_bs, log_iter->entry.gen, log_iter->log_id,
+ log_iter->log_timestamp, lease_cr,bucket_shard_cache,
+ &*marker_tracker, error_repo, tn, false),
+ sc->lcc.adj_concurrency(cct->_conf->rgw_data_sync_spawn_window),
+ [&](uint64_t stack_id, int ret) {
+ if (ret < 0) {
+ tn->log(10, SSTR("data_sync_single_entry returned error: " << ret));
+ cbret = ret;
+ }
+ return 0;
+ });
+ }
+ }
+        if (cbret < 0) {
+ retcode = cbret;
+ drain_all();
+ return set_cr_error(retcode);
+ }
+
+ tn->log(20, SSTR("shard_id=" << shard_id <<
+ " sync_marker="<< sync_marker.marker
+ << " next_marker=" << next_marker
+ << " truncated=" << truncated));
+ if (!next_marker.empty()) {
+ sync_marker.marker = next_marker;
+ } else if (!log_entries.empty()) {
+ sync_marker.marker = log_entries.back().log_id;
+ }
+ if (!truncated) {
+ // we reached the end, wait a while before checking for more
+ tn->unset_flag(RGW_SNS_FLAG_ACTIVE);
+ yield wait(get_idle_interval());
+ }
+ } while (true);
+ }
+ return 0;
+ }
+};
+
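+// Per-shard driver: takes the shard lease, rereads the shard marker, and
+// dispatches to full or incremental sync based on the marker state.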
+class RGWDataSyncShardCR : public RGWCoroutine {
+ RGWDataSyncCtx *const sc;
+ const rgw_pool pool;
+ const uint32_t shard_id;
+ rgw_data_sync_marker& sync_marker;
+ rgw_data_sync_status sync_status;
+ const RGWSyncTraceNodeRef tn;
+ RGWObjVersionTracker& objv;
+ bool *reset_backoff;
+
+ ceph::mutex inc_lock = ceph::make_mutex("RGWDataSyncShardCR::inc_lock");
+ ceph::condition_variable inc_cond;
+
+ RGWDataSyncEnv *const sync_env{ sc->env };
+
+ const string status_oid{ RGWDataSyncStatusManager::shard_obj_name(
+ sc->source_zone, shard_id) };
+ const rgw_raw_obj error_repo{ pool, status_oid + ".retry" };
+
+ // target number of entries to cache before recycling idle ones
+ static constexpr size_t target_cache_size = 256;
+ boost::intrusive_ptr<rgw::bucket_sync::Cache> bucket_shard_cache {
+ rgw::bucket_sync::Cache::create(target_cache_size) };
+
+ boost::intrusive_ptr<RGWContinuousLeaseCR> lease_cr;
+ boost::intrusive_ptr<RGWCoroutinesStack> lease_stack;
+
+ bc::flat_set<rgw_data_notify_entry> modified_shards;
+
+public:
+ RGWDataSyncShardCR(RGWDataSyncCtx* const _sc, const rgw_pool& pool,
+ const uint32_t shard_id, rgw_data_sync_marker& marker,
+ const rgw_data_sync_status& sync_status,
+ RGWSyncTraceNodeRef& tn, RGWObjVersionTracker& objv, bool *reset_backoff)
+ : RGWCoroutine(_sc->cct), sc(_sc), pool(pool), shard_id(shard_id),
+ sync_marker(marker), sync_status(sync_status), tn(tn),
+ objv(objv), reset_backoff(reset_backoff) {
+ set_description() << "data sync shard source_zone=" << sc->source_zone
+ << " shard_id=" << shard_id;
+ }
+
+ ~RGWDataSyncShardCR() override {
+ if (lease_cr) {
+ lease_cr->abort();
+ }
+ }
+
+ void append_modified_shards(bc::flat_set<rgw_data_notify_entry>& entries) {
+ std::lock_guard l{inc_lock};
+ modified_shards.insert(entries.begin(), entries.end());
+ }
+
+ int operate(const DoutPrefixProvider *dpp) override {
+ reenter(this) {
+ yield init_lease_cr();
+ while (!lease_cr->is_locked()) {
+ if (lease_cr->is_done()) {
+ tn->log(5, "failed to take lease");
+ set_status("lease lock failed, early abort");
+ drain_all();
+ return set_cr_error(lease_cr->get_ret_status());
+ }
+ set_sleeping(true);
+ yield;
+ }
+ *reset_backoff = true;
+ tn->log(10, "took lease");
+      /* Reread data sync status to fetch the latest marker and objv */
+ objv.clear();
+ yield call(new RGWSimpleRadosReadCR<rgw_data_sync_marker>(sync_env->dpp, sync_env->driver,
+ rgw_raw_obj(pool, status_oid),
+ &sync_marker, true, &objv));
+ if (retcode < 0) {
+ lease_cr->go_down();
+ drain_all();
+ return set_cr_error(retcode);
+ }
+
+ while (true) {
+ if (sync_marker.state == rgw_data_sync_marker::FullSync) {
+ yield call(new RGWDataFullSyncShardCR(sc, pool, shard_id,
+ sync_marker, tn,
+ status_oid, error_repo,
+ lease_cr, sync_status,
+ objv, bucket_shard_cache));
+ if (retcode < 0) {
+ if (retcode != -EBUSY) {
+ tn->log(10, SSTR("full sync failed (retcode=" << retcode << ")"));
+ }
+ lease_cr->go_down();
+ drain_all();
+ return set_cr_error(retcode);
+ }
+ } else if (sync_marker.state == rgw_data_sync_marker::IncrementalSync) {
+ yield call(new RGWDataIncSyncShardCR(sc, pool, shard_id,
+ sync_marker, tn,
+ status_oid, error_repo,
+ lease_cr, sync_status,
+ objv, bucket_shard_cache,
+ inc_lock, modified_shards));
+ if (retcode < 0) {
+ if (retcode != -EBUSY) {
+ tn->log(10, SSTR("incremental sync failed (retcode=" << retcode
+ << ")"));
+ }
+ lease_cr->go_down();
+ drain_all();
+ return set_cr_error(retcode);
+ }
+ } else {
+ lease_cr->go_down();
+ drain_all();
+ return set_cr_error(-EIO);
+ }
+ }
+ }
+ return 0;
+ }
+
+ void init_lease_cr() {
+ set_status("acquiring sync lock");
+ uint32_t lock_duration = cct->_conf->rgw_sync_lease_period;
+ string lock_name = "sync_lock";
+ if (lease_cr) {
+ lease_cr->abort();
+ }
+ auto driver = sync_env->driver;
+ lease_cr.reset(new RGWContinuousLeaseCR(sync_env->async_rados, driver,
+ rgw_raw_obj(pool, status_oid),
+ lock_name, lock_duration, this,
+ &sc->lcc));
+ lease_stack.reset(spawn(lease_cr.get(), false));
+ }
+};
+
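+// Backoff wrapper that recreates RGWDataSyncShardCR after errors.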
+class RGWDataSyncShardControlCR : public RGWBackoffControlCR {
+ RGWDataSyncCtx *sc;
+ RGWDataSyncEnv *sync_env;
+
+ rgw_pool pool;
+
+ uint32_t shard_id;
+ rgw_data_sync_marker sync_marker;
+ rgw_data_sync_status sync_status;
+
+ RGWSyncTraceNodeRef tn;
+ RGWObjVersionTracker& objv;
+public:
+ RGWDataSyncShardControlCR(RGWDataSyncCtx *_sc, const rgw_pool& _pool,
+ uint32_t _shard_id, rgw_data_sync_marker& _marker,
+ const rgw_data_sync_status& sync_status,
+ RGWObjVersionTracker& objv,
+ RGWSyncTraceNodeRef& _tn_parent)
+ : RGWBackoffControlCR(_sc->cct, false),
+ sc(_sc), sync_env(_sc->env),
+ pool(_pool),
+ shard_id(_shard_id),
+ sync_marker(_marker), objv(objv) {
+ tn = sync_env->sync_tracer->add_node(_tn_parent, "shard", std::to_string(shard_id));
+ }
+
+ RGWCoroutine *alloc_cr() override {
+ return new RGWDataSyncShardCR(sc, pool, shard_id, sync_marker, sync_status, tn, objv, backoff_ptr());
+ }
+
+ RGWCoroutine *alloc_finisher_cr() override {
+ return new RGWSimpleRadosReadCR<rgw_data_sync_marker>(sync_env->dpp, sync_env->driver,
+ rgw_raw_obj(sync_env->svc->zone->get_zone_params().log_pool, RGWDataSyncStatusManager::shard_obj_name(sc->source_zone, shard_id)),
+ &sync_marker, true, &objv);
+ }
+
+ void append_modified_shards(bc::flat_set<rgw_data_notify_entry>& keys) {
+ std::lock_guard l{cr_lock()};
+
+ RGWDataSyncShardCR *cr = static_cast<RGWDataSyncShardCR *>(get_cr());
+ if (!cr) {
+ return;
+ }
+
+ cr->append_modified_shards(keys);
+ }
+};
+
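+// Top-level data sync coroutine: initializes sync status, builds the full
+// sync maps, then spawns one shard control coroutine per datalog shard.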
+class RGWDataSyncCR : public RGWCoroutine {
+ RGWDataSyncCtx *sc;
+ RGWDataSyncEnv *sync_env;
+ uint32_t num_shards;
+
+ rgw_data_sync_status sync_status;
+ std::vector<RGWObjVersionTracker> objvs;
+
+ ceph::mutex shard_crs_lock =
+ ceph::make_mutex("RGWDataSyncCR::shard_crs_lock");
+ map<int, RGWDataSyncShardControlCR *> shard_crs;
+
+ bool *reset_backoff;
+
+ RGWSyncTraceNodeRef tn;
+
+ RGWDataSyncModule *data_sync_module{nullptr};
+
+ boost::intrusive_ptr<RGWContinuousLeaseCR> init_lease;
+ boost::intrusive_ptr<RGWCoroutinesStack> lease_stack;
+
+ RGWObjVersionTracker obj_version;
+public:
+ RGWDataSyncCR(RGWDataSyncCtx *_sc, uint32_t _num_shards, RGWSyncTraceNodeRef& _tn, bool *_reset_backoff) : RGWCoroutine(_sc->cct),
+ sc(_sc), sync_env(_sc->env),
+ num_shards(_num_shards),
+ reset_backoff(_reset_backoff), tn(_tn) {
+
+ }
+
+ ~RGWDataSyncCR() override {
+ for (auto iter : shard_crs) {
+ iter.second->put();
+ }
+ if (init_lease) {
+ init_lease->abort();
+ }
+ }
+
+ int operate(const DoutPrefixProvider *dpp) override {
+ reenter(this) {
+
+ /* read sync status */
+ yield call(new RGWReadDataSyncStatusCoroutine(sc, &sync_status,
+ &obj_version, objvs));
+
+ data_sync_module = sync_env->sync_module->get_data_handler();
+
+ if (retcode < 0 && retcode != -ENOENT) {
+ tn->log(0, SSTR("ERROR: failed to fetch sync status, retcode=" << retcode));
+ return set_cr_error(retcode);
+ }
+
+ if ((rgw_data_sync_info::SyncState)sync_status.sync_info.state !=
+ rgw_data_sync_info::StateSync) {
+ init_lease.reset(
+ RGWInitDataSyncStatusCoroutine::continuous_lease_cr(sc, this));
+ yield lease_stack.reset(spawn(init_lease.get(), false));
+
+ while (!init_lease->is_locked()) {
+ if (init_lease->is_done()) {
+ tn->log(5, "ERROR: failed to take data sync status lease");
+ set_status("lease lock failed, early abort");
+ drain_all();
+ return set_cr_error(init_lease->get_ret_status());
+ }
+ tn->log(5, "waiting on data sync status lease");
+ yield set_sleeping(true);
+ }
+ tn->log(5, "acquired data sync status lease");
+
+ // Reread sync status now that we've acquired the lock!
+ obj_version.clear();
+ yield call(new RGWReadDataSyncStatusCoroutine(sc, &sync_status, &obj_version, objvs));
+ if (retcode < 0) {
+ tn->log(0, SSTR("ERROR: failed to fetch sync status, retcode=" << retcode));
+ return set_cr_error(retcode);
+ }
+ }
+
+ /* state: init status */
+ if ((rgw_data_sync_info::SyncState)sync_status.sync_info.state == rgw_data_sync_info::StateInit) {
+ tn->log(20, SSTR("init"));
+ sync_status.sync_info.num_shards = num_shards;
+ uint64_t instance_id;
+ instance_id = ceph::util::generate_random_number<uint64_t>();
+ yield call(new RGWInitDataSyncStatusCoroutine(sc, num_shards, instance_id, tn,
+ &sync_status, init_lease, obj_version, objvs));
+ if (retcode < 0) {
+ tn->log(0, SSTR("ERROR: failed to init sync, retcode=" << retcode));
+ init_lease->go_down();
+ drain_all();
+ return set_cr_error(retcode);
+ }
+ // sets state = StateBuildingFullSyncMaps
+
+ *reset_backoff = true;
+ }
+
+ data_sync_module->init(sc, sync_status.sync_info.instance_id);
+
+ if ((rgw_data_sync_info::SyncState)sync_status.sync_info.state == rgw_data_sync_info::StateBuildingFullSyncMaps) {
+ tn->log(10, SSTR("building full sync maps"));
+ /* call sync module init here */
+ sync_status.sync_info.num_shards = num_shards;
+ yield call(data_sync_module->init_sync(dpp, sc));
+ if (retcode < 0) {
+ tn->log(0, SSTR("ERROR: sync module init_sync() failed, retcode=" << retcode));
+ return set_cr_error(retcode);
+ }
+
+ if (!init_lease->is_locked()) {
+ init_lease->go_down();
+ drain_all();
+ return set_cr_error(-ECANCELED);
+ }
+ /* state: building full sync maps */
+ yield call(new RGWListBucketIndexesCR(sc, &sync_status, objvs));
+ if (retcode < 0) {
+ tn->log(0, SSTR("ERROR: failed to build full sync maps, retcode=" << retcode));
+ return set_cr_error(retcode);
+ }
+ sync_status.sync_info.state = rgw_data_sync_info::StateSync;
+
+ if (!init_lease->is_locked()) {
+ init_lease->go_down();
+ drain_all();
+ return set_cr_error(-ECANCELED);
+ }
+ /* update new state */
+ yield call(set_sync_info_cr());
+ if (retcode < 0) {
+ tn->log(0, SSTR("ERROR: failed to write sync status, retcode=" << retcode));
+ return set_cr_error(retcode);
+ }
+
+ *reset_backoff = true;
+ }
+
+ yield call(data_sync_module->start_sync(dpp, sc));
+ if (retcode < 0) {
+ tn->log(0, SSTR("ERROR: failed to start sync, retcode=" << retcode));
+ return set_cr_error(retcode);
+ }
+
+ if ((rgw_data_sync_info::SyncState)sync_status.sync_info.state == rgw_data_sync_info::StateSync) {
+ if (init_lease) {
+ init_lease->go_down();
+ drain_all();
+ init_lease.reset();
+ lease_stack.reset();
+ }
+ yield {
+ tn->log(10, SSTR("spawning " << num_shards << " shards sync"));
+ for (map<uint32_t, rgw_data_sync_marker>::iterator iter = sync_status.sync_markers.begin();
+ iter != sync_status.sync_markers.end(); ++iter) {
+ RGWDataSyncShardControlCR *cr = new RGWDataSyncShardControlCR(sc, sync_env->svc->zone->get_zone_params().log_pool,
+ iter->first, iter->second, sync_status, objvs[iter->first], tn);
+ cr->get();
+ shard_crs_lock.lock();
+ shard_crs[iter->first] = cr;
+ shard_crs_lock.unlock();
+ spawn(cr, true);
+ }
+ }
+ }
+
+ return set_cr_done();
+ }
+ return 0;
+ }
+
+ RGWCoroutine *set_sync_info_cr() {
+ return new RGWSimpleRadosWriteCR<rgw_data_sync_info>(sync_env->dpp, sync_env->driver,
+ rgw_raw_obj(sync_env->svc->zone->get_zone_params().log_pool, RGWDataSyncStatusManager::sync_status_oid(sc->source_zone)),
+ sync_status.sync_info, &obj_version);
+ }
+
+ void wakeup(int shard_id, bc::flat_set<rgw_data_notify_entry>& entries) {
+ std::lock_guard l{shard_crs_lock};
+ map<int, RGWDataSyncShardControlCR *>::iterator iter = shard_crs.find(shard_id);
+ if (iter == shard_crs.end()) {
+ return;
+ }
+ iter->second->append_modified_shards(entries);
+ iter->second->wakeup();
+ }
+};
+
+class RGWDefaultDataSyncModule : public RGWDataSyncModule {
+public:
+ RGWDefaultDataSyncModule() {}
+
+ RGWCoroutine *sync_object(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc,
+ rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key,
+ std::optional<uint64_t> versioned_epoch,
+ const rgw_zone_set_entry& source_trace_entry,
+ rgw_zone_set *zones_trace) override;
+ RGWCoroutine *remove_object(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key, real_time& mtime, bool versioned, uint64_t versioned_epoch, rgw_zone_set *zones_trace) override;
+ RGWCoroutine *create_delete_marker(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key, real_time& mtime,
+ rgw_bucket_entry_owner& owner, bool versioned, uint64_t versioned_epoch, rgw_zone_set *zones_trace) override;
+};
+
+class RGWDefaultSyncModuleInstance : public RGWSyncModuleInstance {
+ RGWDefaultDataSyncModule data_handler;
+public:
+ RGWDefaultSyncModuleInstance() {}
+ RGWDataSyncModule *get_data_handler() override {
+ return &data_handler;
+ }
+ bool supports_user_writes() override {
+ return true;
+ }
+};
+
+int RGWDefaultSyncModule::create_instance(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& config, RGWSyncModuleInstanceRef *instance)
+{
+ instance->reset(new RGWDefaultSyncModuleInstance());
+ return 0;
+}
+
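+// Loads user info, identity and ACLs for user-mode sync pipes so that
+// bucket and object permissions can be verified before fetching objects.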
+class RGWUserPermHandler {
+ friend struct Init;
+ friend class Bucket;
+
+ RGWDataSyncEnv *sync_env;
+ rgw_user uid;
+
+ struct _info {
+ RGWUserInfo user_info;
+ rgw::IAM::Environment env;
+ std::unique_ptr<rgw::auth::Identity> identity;
+ RGWAccessControlPolicy user_acl;
+ };
+
+ std::shared_ptr<_info> info;
+
+ struct Init;
+
+ std::shared_ptr<Init> init_action;
+
+ struct Init : public RGWGenericAsyncCR::Action {
+ RGWDataSyncEnv *sync_env;
+
+ rgw_user uid;
+ std::shared_ptr<RGWUserPermHandler::_info> info;
+
+ int ret{0};
+
+ Init(RGWUserPermHandler *handler) : sync_env(handler->sync_env),
+ uid(handler->uid),
+ info(handler->info) {}
+ int operate() override {
+ auto user_ctl = sync_env->driver->getRados()->ctl.user;
+
+ ret = user_ctl->get_info_by_uid(sync_env->dpp, uid, &info->user_info, null_yield);
+ if (ret < 0) {
+ return ret;
+ }
+
+ info->identity = rgw::auth::transform_old_authinfo(sync_env->cct,
+ uid,
+ RGW_PERM_FULL_CONTROL,
+ false, /* system_request? */
+ TYPE_RGW);
+
+ map<string, bufferlist> uattrs;
+
+ ret = user_ctl->get_attrs_by_uid(sync_env->dpp, uid, &uattrs, null_yield);
+ if (ret == 0) {
+ ret = RGWUserPermHandler::policy_from_attrs(sync_env->cct, uattrs, &info->user_acl);
+ }
+ if (ret == -ENOENT) {
+ info->user_acl.create_default(uid, info->user_info.display_name);
+ }
+
+ return 0;
+ }
+ };
+
+public:
+ RGWUserPermHandler(RGWDataSyncEnv *_sync_env,
+ const rgw_user& _uid) : sync_env(_sync_env),
+ uid(_uid) {}
+
+ RGWCoroutine *init_cr() {
+ info = make_shared<_info>();
+ init_action = make_shared<Init>(this);
+
+ return new RGWGenericAsyncCR(sync_env->cct,
+ sync_env->async_rados,
+ init_action);
+ }
+
+ class Bucket {
+ RGWDataSyncEnv *sync_env;
+ std::shared_ptr<_info> info;
+ RGWAccessControlPolicy bucket_acl;
+ std::optional<perm_state> ps;
+ public:
+ Bucket() {}
+
+ int init(RGWUserPermHandler *handler,
+ const RGWBucketInfo& bucket_info,
+ const map<string, bufferlist>& bucket_attrs);
+
+ bool verify_bucket_permission(int perm);
+ bool verify_object_permission(const map<string, bufferlist>& obj_attrs,
+ int perm);
+ };
+
+ static int policy_from_attrs(CephContext *cct,
+ const map<string, bufferlist>& attrs,
+ RGWAccessControlPolicy *acl) {
+ acl->set_ctx(cct);
+
+ auto aiter = attrs.find(RGW_ATTR_ACL);
+ if (aiter == attrs.end()) {
+ return -ENOENT;
+ }
+ auto iter = aiter->second.begin();
+ try {
+ acl->decode(iter);
+ } catch (buffer::error& err) {
+ ldout(cct, 0) << "ERROR: " << __func__ << "(): could not decode policy, caught buffer::error" << dendl;
+ return -EIO;
+ }
+
+ return 0;
+ }
+
+ int init_bucket(const RGWBucketInfo& bucket_info,
+ const map<string, bufferlist>& bucket_attrs,
+ Bucket *bs) {
+ return bs->init(this, bucket_info, bucket_attrs);
+ }
+};
+
+int RGWUserPermHandler::Bucket::init(RGWUserPermHandler *handler,
+ const RGWBucketInfo& bucket_info,
+ const map<string, bufferlist>& bucket_attrs)
+{
+ sync_env = handler->sync_env;
+ info = handler->info;
+
+ int r = RGWUserPermHandler::policy_from_attrs(sync_env->cct, bucket_attrs, &bucket_acl);
+ if (r < 0) {
+ return r;
+ }
+
+ ps.emplace(sync_env->cct,
+ info->env,
+ info->identity.get(),
+ bucket_info,
+ info->identity->get_perm_mask(),
+ false, /* defer to bucket acls */
+ nullptr, /* referer */
+ false); /* request_payer */
+
+ return 0;
+}
+
+bool RGWUserPermHandler::Bucket::verify_bucket_permission(int perm)
+{
+ return verify_bucket_permission_no_policy(sync_env->dpp,
+ &(*ps),
+ &info->user_acl,
+ &bucket_acl,
+ perm);
+}
+
+bool RGWUserPermHandler::Bucket::verify_object_permission(const map<string, bufferlist>& obj_attrs,
+ int perm)
+{
+ RGWAccessControlPolicy obj_acl;
+
+ int r = policy_from_attrs(sync_env->cct, obj_attrs, &obj_acl);
+  if (r < 0) {
+    // failed to read or decode the object ACL: deny rather than letting the
+    // negative errno implicitly convert to true
+    return false;
+  }
+
+ return verify_bucket_permission_no_policy(sync_env->dpp,
+ &(*ps),
+ &bucket_acl,
+ &obj_acl,
+ perm);
+}
+
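+// Fetch filter that applies sync pipe params to a fetched object: ACL
+// translation, destination storage class, and user-mode permission checks.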
+class RGWFetchObjFilter_Sync : public RGWFetchObjFilter_Default {
+ rgw_bucket_sync_pipe sync_pipe;
+
+ std::shared_ptr<RGWUserPermHandler::Bucket> bucket_perms;
+ std::optional<rgw_sync_pipe_dest_params> verify_dest_params;
+
+ std::optional<ceph::real_time> mtime;
+ std::optional<string> etag;
+ std::optional<uint64_t> obj_size;
+
+ std::unique_ptr<rgw::auth::Identity> identity;
+
+ std::shared_ptr<bool> need_retry;
+
+public:
+ RGWFetchObjFilter_Sync(rgw_bucket_sync_pipe& _sync_pipe,
+ std::shared_ptr<RGWUserPermHandler::Bucket>& _bucket_perms,
+ std::optional<rgw_sync_pipe_dest_params>&& _verify_dest_params,
+ std::shared_ptr<bool>& _need_retry) : sync_pipe(_sync_pipe),
+ bucket_perms(_bucket_perms),
+ verify_dest_params(std::move(_verify_dest_params)),
+ need_retry(_need_retry) {
+ *need_retry = false;
+ }
+
+ int filter(CephContext *cct,
+ const rgw_obj_key& source_key,
+ const RGWBucketInfo& dest_bucket_info,
+ std::optional<rgw_placement_rule> dest_placement_rule,
+ const map<string, bufferlist>& obj_attrs,
+ std::optional<rgw_user> *poverride_owner,
+ const rgw_placement_rule **prule) override;
+};
+
+int RGWFetchObjFilter_Sync::filter(CephContext *cct,
+ const rgw_obj_key& source_key,
+ const RGWBucketInfo& dest_bucket_info,
+ std::optional<rgw_placement_rule> dest_placement_rule,
+ const map<string, bufferlist>& obj_attrs,
+ std::optional<rgw_user> *poverride_owner,
+ const rgw_placement_rule **prule)
+{
+ int abort_err = -ERR_PRECONDITION_FAILED;
+
+ rgw_sync_pipe_params params;
+
+ RGWObjTags obj_tags;
+
+ auto iter = obj_attrs.find(RGW_ATTR_TAGS);
+ if (iter != obj_attrs.end()) {
+ try {
+ auto it = iter->second.cbegin();
+ obj_tags.decode(it);
+ } catch (buffer::error &err) {
+ ldout(cct, 0) << "ERROR: " << __func__ << ": caught buffer::error couldn't decode TagSet " << dendl;
+ }
+ }
+
+ if (!sync_pipe.info.handler.find_obj_params(source_key,
+ obj_tags.get_tags(),
+ &params)) {
+ return abort_err;
+ }
+
+ if (verify_dest_params &&
+ !(*verify_dest_params == params.dest)) {
+ /* raced! original dest params were different, will need to retry */
+ ldout(cct, 0) << "WARNING: " << __func__ << ": pipe dest params are different than original params, must have raced with object rewrite, retrying" << dendl;
+ *need_retry = true;
+ return -ECANCELED;
+ }
+
+ std::optional<std::map<string, bufferlist> > new_attrs;
+
+ if (params.dest.acl_translation) {
+ rgw_user& acl_translation_owner = params.dest.acl_translation->owner;
+ if (!acl_translation_owner.empty()) {
+ if (params.mode == rgw_sync_pipe_params::MODE_USER &&
+ acl_translation_owner != dest_bucket_info.owner) {
+ ldout(cct, 0) << "ERROR: " << __func__ << ": acl translation was requested, but user (" << acl_translation_owner
+ << ") is not dest bucket owner (" << dest_bucket_info.owner << ")" << dendl;
+ return -EPERM;
+ }
+ *poverride_owner = acl_translation_owner;
+ }
+ }
+ if (params.mode == rgw_sync_pipe_params::MODE_USER) {
+ if (!bucket_perms->verify_object_permission(obj_attrs, RGW_PERM_READ)) {
+ ldout(cct, 0) << "ERROR: " << __func__ << ": permission check failed: user not allowed to fetch object" << dendl;
+ return -EPERM;
+ }
+ }
+
+ if (!dest_placement_rule &&
+ params.dest.storage_class) {
+ dest_rule.storage_class = *params.dest.storage_class;
+ dest_rule.inherit_from(dest_bucket_info.placement_rule);
+ dest_placement_rule = dest_rule;
+ *prule = &dest_rule;
+ }
+
+ return RGWFetchObjFilter_Default::filter(cct,
+ source_key,
+ dest_bucket_info,
+ dest_placement_rule,
+ obj_attrs,
+ poverride_owner,
+ prule);
+}
+
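+// Fetches a single object through the sync pipe, resolving pipe params and
+// user permissions first; retries when params race with an object rewrite.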
+class RGWObjFetchCR : public RGWCoroutine {
+ RGWDataSyncCtx *sc;
+ RGWDataSyncEnv *sync_env;
+ rgw_bucket_sync_pipe& sync_pipe;
+ rgw_obj_key& key;
+ std::optional<rgw_obj_key> dest_key;
+ std::optional<uint64_t> versioned_epoch;
+ const rgw_zone_set_entry& source_trace_entry;
+ rgw_zone_set *zones_trace;
+
+ bool need_more_info{false};
+ bool check_change{false};
+
+ ceph::real_time src_mtime;
+ uint64_t src_size;
+ string src_etag;
+ map<string, bufferlist> src_attrs;
+ map<string, string> src_headers;
+
+ std::optional<rgw_user> param_user;
+ rgw_sync_pipe_params::Mode param_mode;
+
+ std::optional<RGWUserPermHandler> user_perms;
+ std::shared_ptr<RGWUserPermHandler::Bucket> source_bucket_perms;
+ RGWUserPermHandler::Bucket dest_bucket_perms;
+
+ std::optional<rgw_sync_pipe_dest_params> dest_params;
+
+ int try_num{0};
+ std::shared_ptr<bool> need_retry;
+public:
+ RGWObjFetchCR(RGWDataSyncCtx *_sc,
+ rgw_bucket_sync_pipe& _sync_pipe,
+ rgw_obj_key& _key,
+ std::optional<rgw_obj_key> _dest_key,
+ std::optional<uint64_t> _versioned_epoch,
+ const rgw_zone_set_entry& source_trace_entry,
+ rgw_zone_set *_zones_trace) : RGWCoroutine(_sc->cct),
+ sc(_sc), sync_env(_sc->env),
+ sync_pipe(_sync_pipe),
+ key(_key),
+ dest_key(_dest_key),
+ versioned_epoch(_versioned_epoch),
+ source_trace_entry(source_trace_entry),
+ zones_trace(_zones_trace) {
+ }
+
+
+ int operate(const DoutPrefixProvider *dpp) override {
+ reenter(this) {
+
+#define MAX_RACE_RETRIES_OBJ_FETCH 10
+ for (try_num = 0; try_num < MAX_RACE_RETRIES_OBJ_FETCH; ++try_num) {
+
+ {
+ std::optional<rgw_user> param_acl_translation;
+ std::optional<string> param_storage_class;
+
+ if (!sync_pipe.info.handler.find_basic_info_without_tags(key,
+ &param_user,
+ &param_acl_translation,
+ &param_storage_class,
+ &param_mode,
+ &need_more_info)) {
+ if (!need_more_info) {
+ return set_cr_error(-ERR_PRECONDITION_FAILED);
+ }
+ }
+ }
+
+ if (need_more_info) {
+ ldout(cct, 20) << "Could not determine exact policy rule for obj=" << key << ", will read source object attributes" << dendl;
+ /*
+           * we need to fetch info about the source object so that we can determine
+           * the correct policy configuration. This can happen when there are multiple
+           * policy rules and some depend on object tagging */
+ yield call(new RGWStatRemoteObjCR(sync_env->async_rados,
+ sync_env->driver,
+ sc->source_zone,
+ sync_pipe.info.source_bs.bucket,
+ key,
+ &src_mtime,
+ &src_size,
+ &src_etag,
+ &src_attrs,
+ &src_headers));
+ if (retcode < 0) {
+ return set_cr_error(retcode);
+ }
+
+ RGWObjTags obj_tags;
+
+ auto iter = src_attrs.find(RGW_ATTR_TAGS);
+ if (iter != src_attrs.end()) {
+ try {
+ auto it = iter->second.cbegin();
+ obj_tags.decode(it);
+ } catch (buffer::error &err) {
+ ldout(cct, 0) << "ERROR: " << __func__ << ": caught buffer::error couldn't decode TagSet " << dendl;
+ }
+ }
+
+ rgw_sync_pipe_params params;
+ if (!sync_pipe.info.handler.find_obj_params(key,
+ obj_tags.get_tags(),
+ &params)) {
+ return set_cr_error(-ERR_PRECONDITION_FAILED);
+ }
+
+ param_user = params.user;
+ param_mode = params.mode;
+
+ dest_params = params.dest;
+ }
+
+ if (param_mode == rgw_sync_pipe_params::MODE_USER) {
+ if (!param_user) {
+ ldout(cct, 20) << "ERROR: " << __func__ << ": user level sync but user param not set" << dendl;
+ return set_cr_error(-EPERM);
+ }
+ user_perms.emplace(sync_env, *param_user);
+
+ yield call(user_perms->init_cr());
+ if (retcode < 0) {
+ ldout(cct, 20) << "ERROR: " << __func__ << ": failed to init user perms manager for uid=" << *param_user << dendl;
+ return set_cr_error(retcode);
+ }
+
+ /* verify that user is allowed to write at the target bucket */
+ int r = user_perms->init_bucket(sync_pipe.dest_bucket_info,
+ sync_pipe.dest_bucket_attrs,
+ &dest_bucket_perms);
+ if (r < 0) {
+ ldout(cct, 20) << "ERROR: " << __func__ << ": failed to init bucket perms manager for uid=" << *param_user << " bucket=" << sync_pipe.source_bucket_info.bucket.get_key() << dendl;
+ return set_cr_error(retcode);
+ }
+
+ if (!dest_bucket_perms.verify_bucket_permission(RGW_PERM_WRITE)) {
+ ldout(cct, 0) << "ERROR: " << __func__ << ": permission check failed: user not allowed to write into bucket (bucket=" << sync_pipe.info.dest_bucket.get_key() << ")" << dendl;
+ return -EPERM;
+ }
+
+ /* init source bucket permission structure */
+ source_bucket_perms = make_shared<RGWUserPermHandler::Bucket>();
+ r = user_perms->init_bucket(sync_pipe.source_bucket_info,
+ sync_pipe.source_bucket_attrs,
+ source_bucket_perms.get());
+ if (r < 0) {
+ ldout(cct, 20) << "ERROR: " << __func__ << ": failed to init bucket perms manager for uid=" << *param_user << " bucket=" << sync_pipe.source_bucket_info.bucket.get_key() << dendl;
+            return set_cr_error(r);
+ }
+ }
+
+ yield {
+ if (!need_retry) {
+ need_retry = make_shared<bool>();
+ }
+ auto filter = make_shared<RGWFetchObjFilter_Sync>(sync_pipe,
+ source_bucket_perms,
+ std::move(dest_params),
+ need_retry);
+
+ call(new RGWFetchRemoteObjCR(sync_env->async_rados, sync_env->driver, sc->source_zone,
+ nullopt,
+ sync_pipe.info.source_bs.bucket,
+ std::nullopt, sync_pipe.dest_bucket_info,
+ key, dest_key, versioned_epoch,
+ true,
+ std::static_pointer_cast<RGWFetchObjFilter>(filter),
+ source_trace_entry, zones_trace,
+ sync_env->counters, dpp));
+ }
+ if (retcode < 0) {
+ if (*need_retry) {
+ continue;
+ }
+ return set_cr_error(retcode);
+ }
+
+ return set_cr_done();
+ }
+
+ ldout(cct, 0) << "ERROR: " << __func__ << ": Too many retries trying to fetch object, possibly a bug: bucket=" << sync_pipe.source_bucket_info.bucket.get_key() << " key=" << key << dendl;
+
+ return set_cr_error(-EIO);
+ }
+ return 0;
+ }
+};
+
+RGWCoroutine *RGWDefaultDataSyncModule::sync_object(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc,
+ rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key,
+ std::optional<uint64_t> versioned_epoch,
+ const rgw_zone_set_entry& source_trace_entry,
+ rgw_zone_set *zones_trace)
+{
+ return new RGWObjFetchCR(sc, sync_pipe, key, std::nullopt, versioned_epoch,
+ source_trace_entry, zones_trace);
+}
+
+RGWCoroutine *RGWDefaultDataSyncModule::remove_object(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key,
+ real_time& mtime, bool versioned, uint64_t versioned_epoch, rgw_zone_set *zones_trace)
+{
+ auto sync_env = sc->env;
+ return new RGWRemoveObjCR(sync_env->dpp, sync_env->async_rados, sync_env->driver, sc->source_zone,
+ sync_pipe.dest_bucket_info, key, versioned, versioned_epoch,
+ NULL, NULL, false, &mtime, zones_trace);
+}
+
+RGWCoroutine *RGWDefaultDataSyncModule::create_delete_marker(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key, real_time& mtime,
+ rgw_bucket_entry_owner& owner, bool versioned, uint64_t versioned_epoch, rgw_zone_set *zones_trace)
+{
+ auto sync_env = sc->env;
+ return new RGWRemoveObjCR(sync_env->dpp, sync_env->async_rados, sync_env->driver, sc->source_zone,
+ sync_pipe.dest_bucket_info, key, versioned, versioned_epoch,
+ &owner.id, &owner.display_name, true, &mtime, zones_trace);
+}
+
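+// The archive data sync module keeps every replicated version on the archive
+// zone: sync_object force-enables versioning on the destination bucket,
+// remove_object is a no-op, and deletions only produce delete markers.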
+class RGWArchiveDataSyncModule : public RGWDefaultDataSyncModule {
+public:
+ RGWArchiveDataSyncModule() {}
+
+ RGWCoroutine *sync_object(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc,
+ rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key,
+ std::optional<uint64_t> versioned_epoch,
+ const rgw_zone_set_entry& source_trace_entry,
+ rgw_zone_set *zones_trace) override;
+ RGWCoroutine *remove_object(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key, real_time& mtime, bool versioned, uint64_t versioned_epoch, rgw_zone_set *zones_trace) override;
+ RGWCoroutine *create_delete_marker(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key, real_time& mtime,
+ rgw_bucket_entry_owner& owner, bool versioned, uint64_t versioned_epoch, rgw_zone_set *zones_trace) override;
+};
+
+class RGWArchiveSyncModuleInstance : public RGWDefaultSyncModuleInstance {
+ RGWArchiveDataSyncModule data_handler;
+public:
+ RGWArchiveSyncModuleInstance() {}
+ RGWDataSyncModule *get_data_handler() override {
+ return &data_handler;
+ }
+ RGWMetadataHandler *alloc_bucket_meta_handler() override {
+ return RGWArchiveBucketMetaHandlerAllocator::alloc();
+ }
+ RGWBucketInstanceMetadataHandlerBase *alloc_bucket_instance_meta_handler(rgw::sal::Driver* driver) override {
+ return RGWArchiveBucketInstanceMetaHandlerAllocator::alloc(driver);
+ }
+};
+
+int RGWArchiveSyncModule::create_instance(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& config, RGWSyncModuleInstanceRef *instance)
+{
+ instance->reset(new RGWArchiveSyncModuleInstance());
+ return 0;
+}
+
+RGWCoroutine *RGWArchiveDataSyncModule::sync_object(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc,
+ rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key,
+ std::optional<uint64_t> versioned_epoch,
+ const rgw_zone_set_entry& source_trace_entry,
+ rgw_zone_set *zones_trace)
+{
+ auto sync_env = sc->env;
+ ldout(sc->cct, 5) << "SYNC_ARCHIVE: sync_object: b=" << sync_pipe.info.source_bs.bucket << " k=" << key << " versioned_epoch=" << versioned_epoch.value_or(0) << dendl;
+ if (!sync_pipe.dest_bucket_info.versioned() ||
+ (sync_pipe.dest_bucket_info.flags & BUCKET_VERSIONS_SUSPENDED)) {
+ ldout(sc->cct, 0) << "SYNC_ARCHIVE: sync_object: enabling object versioning for archive bucket" << dendl;
+ sync_pipe.dest_bucket_info.flags = (sync_pipe.dest_bucket_info.flags & ~BUCKET_VERSIONS_SUSPENDED) | BUCKET_VERSIONED;
+ int op_ret = sync_env->driver->getRados()->put_bucket_instance_info(sync_pipe.dest_bucket_info, false, real_time(), NULL, sync_env->dpp, null_yield);
+ if (op_ret < 0) {
+ ldpp_dout(sync_env->dpp, 0) << "SYNC_ARCHIVE: sync_object: error versioning archive bucket" << dendl;
+ return NULL;
+ }
+ }
+
+ std::optional<rgw_obj_key> dest_key;
+
+ if (versioned_epoch.value_or(0) == 0) { /* force version if not set */
+ versioned_epoch = 0;
+ dest_key = key;
+ }
+
+ if (key.instance.empty()) {
+ dest_key = key;
+ sync_env->driver->getRados()->gen_rand_obj_instance_name(&(*dest_key));
+ }
+
+ return new RGWObjFetchCR(sc, sync_pipe, key, dest_key, versioned_epoch,
+ source_trace_entry, zones_trace);
+}
+
+RGWCoroutine *RGWArchiveDataSyncModule::remove_object(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key,
+ real_time& mtime, bool versioned, uint64_t versioned_epoch, rgw_zone_set *zones_trace)
+{
+ ldout(sc->cct, 0) << "SYNC_ARCHIVE: remove_object: b=" << sync_pipe.info.source_bs.bucket << " k=" << key << " versioned_epoch=" << versioned_epoch << dendl;
+ return NULL;
+}
+
+RGWCoroutine *RGWArchiveDataSyncModule::create_delete_marker(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key, real_time& mtime,
+ rgw_bucket_entry_owner& owner, bool versioned, uint64_t versioned_epoch, rgw_zone_set *zones_trace)
+{
+ ldout(sc->cct, 0) << "SYNC_ARCHIVE: create_delete_marker: b=" << sync_pipe.info.source_bs.bucket << " k=" << key << " mtime=" << mtime
+ << " versioned=" << versioned << " versioned_epoch=" << versioned_epoch << dendl;
+ auto sync_env = sc->env;
+ return new RGWRemoveObjCR(sync_env->dpp, sync_env->async_rados, sync_env->driver, sc->source_zone,
+ sync_pipe.dest_bucket_info, key, versioned, versioned_epoch,
+ &owner.id, &owner.display_name, true, &mtime, zones_trace);
+}
+
+class RGWDataSyncControlCR : public RGWBackoffControlCR
+{
+ RGWDataSyncCtx *sc;
+ RGWDataSyncEnv *sync_env;
+ uint32_t num_shards;
+
+ RGWSyncTraceNodeRef tn;
+
+ static constexpr bool exit_on_error = false; // retry on all errors
+public:
+ RGWDataSyncControlCR(RGWDataSyncCtx *_sc, uint32_t _num_shards,
+ RGWSyncTraceNodeRef& _tn_parent) : RGWBackoffControlCR(_sc->cct, exit_on_error),
+ sc(_sc), sync_env(_sc->env), num_shards(_num_shards) {
+ tn = sync_env->sync_tracer->add_node(_tn_parent, "sync");
+ }
+
+ RGWCoroutine *alloc_cr() override {
+ return new RGWDataSyncCR(sc, num_shards, tn, backoff_ptr());
+ }
+
+ void wakeup(int shard_id, bc::flat_set<rgw_data_notify_entry>& entries) {
+ ceph::mutex& m = cr_lock();
+
+ m.lock();
+ RGWDataSyncCR *cr = static_cast<RGWDataSyncCR *>(get_cr());
+ if (!cr) {
+ m.unlock();
+ return;
+ }
+
+ cr->get();
+ m.unlock();
+
+ cr->wakeup(shard_id, entries);
+
+ cr->put();
+ }
+};
+
+void RGWRemoteDataLog::wakeup(int shard_id, bc::flat_set<rgw_data_notify_entry>& entries) {
+ std::shared_lock rl{lock};
+ if (!data_sync_cr) {
+ return;
+ }
+ data_sync_cr->wakeup(shard_id, entries);
+}
+
+int RGWRemoteDataLog::run_sync(const DoutPrefixProvider *dpp, int num_shards)
+{
+ lock.lock();
+ data_sync_cr = new RGWDataSyncControlCR(&sc, num_shards, tn);
+ data_sync_cr->get(); // run() will drop a ref, so take another
+ lock.unlock();
+
+ int r = run(dpp, data_sync_cr);
+
+ lock.lock();
+ data_sync_cr->put();
+ data_sync_cr = NULL;
+ lock.unlock();
+
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to run sync" << dendl;
+ return r;
+ }
+ return 0;
+}
+
+CephContext *RGWDataSyncStatusManager::get_cct() const
+{
+ return driver->ctx();
+}
+
+int RGWDataSyncStatusManager::init(const DoutPrefixProvider *dpp)
+{
+ RGWZone *zone_def;
+
+ if (!(zone_def = driver->svc()->zone->find_zone(source_zone))) {
+ ldpp_dout(this, 0) << "ERROR: failed to find zone config info for zone=" << source_zone << dendl;
+ return -EIO;
+ }
+
+ if (!driver->svc()->sync_modules->get_manager()->supports_data_export(zone_def->tier_type)) {
+ return -ENOTSUP;
+ }
+
+ const RGWZoneParams& zone_params = driver->svc()->zone->get_zone_params();
+
+ if (sync_module == nullptr) {
+ sync_module = driver->get_sync_module();
+ }
+
+ conn = driver->svc()->zone->get_zone_conn(source_zone);
+ if (!conn) {
+ ldpp_dout(this, 0) << "connection object to zone " << source_zone << " does not exist" << dendl;
+ return -EINVAL;
+ }
+
+ error_logger = new RGWSyncErrorLogger(driver, RGW_SYNC_ERROR_LOG_SHARD_PREFIX, ERROR_LOGGER_SHARDS);
+
+ int r = source_log.init(source_zone, conn, error_logger, driver->getRados()->get_sync_tracer(),
+ sync_module, counters);
+ if (r < 0) {
+ ldpp_dout(this, 0) << "ERROR: failed to init remote log, r=" << r << dendl;
+ finalize();
+ return r;
+ }
+
+ rgw_datalog_info datalog_info;
+ r = source_log.read_log_info(dpp, &datalog_info);
+ if (r < 0) {
+ ldpp_dout(this, 5) << "ERROR: master.read_log_info() returned r=" << r << dendl;
+ finalize();
+ return r;
+ }
+
+ num_shards = datalog_info.num_shards;
+
+ for (int i = 0; i < num_shards; i++) {
+ shard_objs[i] = rgw_raw_obj(zone_params.log_pool, shard_obj_name(source_zone, i));
+ }
+
+ return 0;
+}
+
+void RGWDataSyncStatusManager::finalize()
+{
+ delete error_logger;
+ error_logger = nullptr;
+}
+
+unsigned RGWDataSyncStatusManager::get_subsys() const
+{
+ return dout_subsys;
+}
+
+std::ostream& RGWDataSyncStatusManager::gen_prefix(std::ostream& out) const
+{
+ auto zone = std::string_view{source_zone.id};
+ return out << "data sync zone:" << zone.substr(0, 8) << ' ';
+}
+
+string RGWDataSyncStatusManager::sync_status_oid(const rgw_zone_id& source_zone)
+{
+ char buf[datalog_sync_status_oid_prefix.size() + source_zone.id.size() + 16];
+ snprintf(buf, sizeof(buf), "%s.%s", datalog_sync_status_oid_prefix.c_str(), source_zone.id.c_str());
+
+ return string(buf);
+}
+
+string RGWDataSyncStatusManager::shard_obj_name(const rgw_zone_id& source_zone, int shard_id)
+{
+ char buf[datalog_sync_status_shard_prefix.size() + source_zone.id.size() + 16];
+ snprintf(buf, sizeof(buf), "%s.%s.%d", datalog_sync_status_shard_prefix.c_str(), source_zone.id.c_str(), shard_id);
+
+ return string(buf);
+}
+
+class RGWInitBucketShardSyncStatusCoroutine : public RGWCoroutine {
+ RGWDataSyncCtx *sc;
+ RGWDataSyncEnv *sync_env;
+
+ const rgw_bucket_sync_pair_info& sync_pair;
+ const string sync_status_oid;
+
+ rgw_bucket_shard_sync_info& status;
+ RGWObjVersionTracker& objv_tracker;
+ const BucketIndexShardsManager& marker_mgr;
+ bool exclusive;
+public:
+ RGWInitBucketShardSyncStatusCoroutine(RGWDataSyncCtx *_sc,
+ const rgw_bucket_sync_pair_info& _sync_pair,
+ rgw_bucket_shard_sync_info& _status,
+ uint64_t gen,
+ const BucketIndexShardsManager& _marker_mgr,
+ RGWObjVersionTracker& objv_tracker,
+ bool exclusive)
+ : RGWCoroutine(_sc->cct), sc(_sc), sync_env(_sc->env),
+ sync_pair(_sync_pair),
+ sync_status_oid(RGWBucketPipeSyncStatusManager::inc_status_oid(sc->source_zone, _sync_pair, gen)),
+ status(_status), objv_tracker(objv_tracker), marker_mgr(_marker_mgr), exclusive(exclusive)
+ {}
+
+ int operate(const DoutPrefixProvider *dpp) override {
+ reenter(this) {
+ yield {
+ rgw_raw_obj obj(sync_env->svc->zone->get_zone_params().log_pool, sync_status_oid);
+
+ // whether or not a full sync is done, incremental sync will follow, so initialize its marker and state here
+ if (sync_env->sync_module->should_full_sync()) {
+ const auto max_marker = marker_mgr.get(sync_pair.source_bs.shard_id, "");
+ status.inc_marker.position = max_marker;
+ }
+ status.inc_marker.timestamp = ceph::real_clock::now();
+ status.state = rgw_bucket_shard_sync_info::StateIncrementalSync;
+
+ map<string, bufferlist> attrs;
+ status.encode_all_attrs(attrs);
+ call(new RGWSimpleRadosWriteAttrsCR(dpp, sync_env->driver,
+ obj, attrs, &objv_tracker, exclusive));
+ }
+
+ if (retcode < 0) {
+ ldout(cct, 20) << "ERROR: init marker position failed. error: " << retcode << dendl;
+ return set_cr_error(retcode);
+ }
+ ldout(cct, 20) << "init marker position: " << status.inc_marker.position <<
+ ". written to shard status object: " << sync_status_oid << dendl;
+ return set_cr_done();
+ }
+ return 0;
+ }
+};
+
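+// per-shard sync status is stored as xattrs on the shard status object; on
+// decode we fall back to the old unprefixed attribute names for compatibility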
+#define BUCKET_SYNC_ATTR_PREFIX RGW_ATTR_PREFIX "bucket-sync."
+
+template <class T>
+static bool decode_attr(CephContext *cct, map<string, bufferlist>& attrs, const string& attr_name, T *val)
+{
+ map<string, bufferlist>::iterator iter = attrs.find(attr_name);
+ if (iter == attrs.end()) {
+ *val = T();
+ return false;
+ }
+
+ auto biter = iter->second.cbegin();
+ try {
+ decode(*val, biter);
+ } catch (buffer::error& err) {
+ ldout(cct, 0) << "ERROR: failed to decode attribute: " << attr_name << dendl;
+ return false;
+ }
+ return true;
+}
+
+void rgw_bucket_shard_sync_info::decode_from_attrs(CephContext *cct, map<string, bufferlist>& attrs)
+{
+ if (!decode_attr(cct, attrs, BUCKET_SYNC_ATTR_PREFIX "state", &state)) {
+ decode_attr(cct, attrs, "state", &state);
+ }
+ if (!decode_attr(cct, attrs, BUCKET_SYNC_ATTR_PREFIX "inc_marker", &inc_marker)) {
+ decode_attr(cct, attrs, "inc_marker", &inc_marker);
+ }
+}
+
+void rgw_bucket_shard_sync_info::encode_all_attrs(map<string, bufferlist>& attrs)
+{
+ encode_state_attr(attrs);
+ inc_marker.encode_attr(attrs);
+}
+
+void rgw_bucket_shard_sync_info::encode_state_attr(map<string, bufferlist>& attrs)
+{
+ using ceph::encode;
+ encode(state, attrs[BUCKET_SYNC_ATTR_PREFIX "state"]);
+}
+
+void rgw_bucket_shard_full_sync_marker::encode_attr(map<string, bufferlist>& attrs)
+{
+ using ceph::encode;
+ encode(*this, attrs[BUCKET_SYNC_ATTR_PREFIX "full_marker"]);
+}
+
+void rgw_bucket_shard_inc_sync_marker::encode_attr(map<string, bufferlist>& attrs)
+{
+ using ceph::encode;
+ encode(*this, attrs[BUCKET_SYNC_ATTR_PREFIX "inc_marker"]);
+}
+
+class RGWReadBucketPipeSyncStatusCoroutine : public RGWCoroutine {
+ RGWDataSyncCtx *sc;
+ RGWDataSyncEnv *sync_env;
+ string oid;
+ rgw_bucket_shard_sync_info *status;
+ RGWObjVersionTracker* objv_tracker;
+ map<string, bufferlist> attrs;
+public:
+ RGWReadBucketPipeSyncStatusCoroutine(RGWDataSyncCtx *_sc,
+ const rgw_bucket_sync_pair_info& sync_pair,
+ rgw_bucket_shard_sync_info *_status,
+ RGWObjVersionTracker* objv_tracker,
+ uint64_t gen)
+ : RGWCoroutine(_sc->cct), sc(_sc), sync_env(_sc->env),
+ oid(RGWBucketPipeSyncStatusManager::inc_status_oid(sc->source_zone, sync_pair, gen)),
+ status(_status), objv_tracker(objv_tracker)
+ {}
+ int operate(const DoutPrefixProvider *dpp) override;
+};
+
+int RGWReadBucketPipeSyncStatusCoroutine::operate(const DoutPrefixProvider *dpp)
+{
+ reenter(this) {
+ yield call(new RGWSimpleRadosReadAttrsCR(dpp, sync_env->driver,
+ rgw_raw_obj(sync_env->svc->zone->get_zone_params().log_pool, oid),
+ &attrs, true, objv_tracker));
+ if (retcode == -ENOENT) {
+ *status = rgw_bucket_shard_sync_info();
+ return set_cr_done();
+ }
+ if (retcode < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to call fetch bucket shard info oid=" << oid << " ret=" << retcode << dendl;
+ return set_cr_error(retcode);
+ }
+ status->decode_from_attrs(sync_env->cct, attrs);
+ return set_cr_done();
+ }
+ return 0;
+}
+
+// wrap ReadSyncStatus and set a flag if it's not in incremental
+class CheckBucketShardStatusIsIncremental : public RGWReadBucketPipeSyncStatusCoroutine {
+ bool* result;
+ rgw_bucket_shard_sync_info status;
+ public:
+ CheckBucketShardStatusIsIncremental(RGWDataSyncCtx* sc,
+ const rgw_bucket_sync_pair_info& sync_pair,
+ bool* result)
+ : RGWReadBucketPipeSyncStatusCoroutine(sc, sync_pair, &status, nullptr, 0 /*no gen in compat mode*/),
+ result(result)
+ {}
+
+ int operate(const DoutPrefixProvider *dpp) override {
+ int r = RGWReadBucketPipeSyncStatusCoroutine::operate(dpp);
+ if (state == RGWCoroutine_Done &&
+ status.state != rgw_bucket_shard_sync_info::StateIncrementalSync) {
+ *result = false;
+ }
+ return r;
+ }
+};
+
+class CheckAllBucketShardStatusIsIncremental : public RGWShardCollectCR {
+ // start with 1 shard, and only spawn more if we detect an existing shard.
+ // this makes the backward compatibility check far less expensive in the
+ // general case where no shards exist
+ static constexpr int initial_concurrent_shards = 1;
+ static constexpr int max_concurrent_shards = 16;
+
+ RGWDataSyncCtx* sc;
+ rgw_bucket_sync_pair_info sync_pair;
+ const int num_shards;
+ bool* result;
+ int shard = 0;
+ public:
+ CheckAllBucketShardStatusIsIncremental(RGWDataSyncCtx* sc,
+ const rgw_bucket_sync_pair_info& sync_pair,
+ int num_shards, bool* result)
+ : RGWShardCollectCR(sc->cct, initial_concurrent_shards),
+ sc(sc), sync_pair(sync_pair), num_shards(num_shards), result(result)
+ {}
+
+ bool spawn_next() override {
+ // stop spawning if we saw any errors or non-incremental shards
+ if (shard >= num_shards || status < 0 || !*result) {
+ return false;
+ }
+ sync_pair.source_bs.shard_id = shard++;
+ spawn(new CheckBucketShardStatusIsIncremental(sc, sync_pair, result), false);
+ return true;
+ }
+
+ private:
+ int handle_result(int r) override {
+ if (r < 0) {
+ ldout(cct, 4) << "failed to read bucket shard status: "
+ << cpp_strerror(r) << dendl;
+ } else if (shard == 0) {
+ // enable concurrency once the first shard succeeds
+ max_concurrent = max_concurrent_shards;
+ }
+ return r;
+ }
+};
+
+// wrap RGWInitBucketShardSyncStatusCoroutine with local storage for 'status'
+// and 'objv'
+class InitBucketShardStatusCR : public RGWCoroutine {
+ RGWDataSyncCtx* sc;
+ rgw_bucket_sync_pair_info pair;
+ rgw_bucket_shard_sync_info status;
+ RGWObjVersionTracker objv;
+ const uint64_t gen;
+ const BucketIndexShardsManager& marker_mgr;
+
+ public:
+ InitBucketShardStatusCR(RGWDataSyncCtx* sc,
+ const rgw_bucket_sync_pair_info& pair,
+ uint64_t gen,
+ const BucketIndexShardsManager& marker_mgr)
+ : RGWCoroutine(sc->cct), sc(sc), pair(pair), gen(gen), marker_mgr(marker_mgr)
+ {}
+ int operate(const DoutPrefixProvider *dpp) {
+ reenter(this) {
+ // non exclusive create with empty status
+ objv.generate_new_write_ver(cct);
+ yield call(new RGWInitBucketShardSyncStatusCoroutine(sc, pair, status, gen, marker_mgr, objv, false));
+ if (retcode < 0) {
+ return set_cr_error(retcode);
+ }
+ return set_cr_done();
+ }
+ return 0;
+ }
+};
+
+class InitBucketShardStatusCollectCR : public RGWShardCollectCR {
+ static constexpr int max_concurrent_shards = 16;
+ RGWDataSyncCtx* sc;
+ rgw_bucket_sync_pair_info sync_pair;
+ const uint64_t gen;
+ const BucketIndexShardsManager& marker_mgr;
+
+ const int num_shards;
+ int shard = 0;
+
+ int handle_result(int r) override {
+ if (r < 0) {
+ ldout(cct, 4) << "failed to init bucket shard status: "
+ << cpp_strerror(r) << dendl;
+ }
+ return r;
+ }
+ public:
+ InitBucketShardStatusCollectCR(RGWDataSyncCtx* sc,
+ const rgw_bucket_sync_pair_info& sync_pair,
+ uint64_t gen,
+ const BucketIndexShardsManager& marker_mgr,
+ int num_shards)
+ : RGWShardCollectCR(sc->cct, max_concurrent_shards),
+ sc(sc), sync_pair(sync_pair), gen(gen), marker_mgr(marker_mgr), num_shards(num_shards)
+ {}
+
+ bool spawn_next() override {
+ if (shard >= num_shards || status < 0) { // stop spawning on any errors
+ return false;
+ }
+ sync_pair.source_bs.shard_id = shard++;
+ spawn(new InitBucketShardStatusCR(sc, sync_pair, gen, marker_mgr), false);
+ return true;
+ }
+};
+
+class RemoveBucketShardStatusCR : public RGWCoroutine {
+ RGWDataSyncCtx* const sc;
+ RGWDataSyncEnv* const sync_env;
+
+ rgw_bucket_sync_pair_info sync_pair;
+ rgw_raw_obj obj;
+ RGWObjVersionTracker objv;
+
+public:
+ RemoveBucketShardStatusCR(RGWDataSyncCtx* sc,
+ const rgw_bucket_sync_pair_info& sync_pair, uint64_t gen)
+ : RGWCoroutine(sc->cct), sc(sc), sync_env(sc->env),
+ sync_pair(sync_pair),
+ obj(sync_env->svc->zone->get_zone_params().log_pool,
+ RGWBucketPipeSyncStatusManager::inc_status_oid(sc->source_zone, sync_pair, gen))
+ {}
+
+ int operate(const DoutPrefixProvider *dpp) override {
+ reenter(this) {
+ yield call(new RGWRadosRemoveCR(sync_env->driver, obj, &objv));
+ if (retcode < 0 && retcode != -ENOENT) {
+ ldout(cct, 20) << "ERROR: failed to remove bucket shard status for: " << sync_pair <<
+ ". with error: " << retcode << dendl;
+ return set_cr_error(retcode);
+ }
+ ldout(cct, 20) << "removed bucket shard status object: " << obj.oid << dendl;
+ return set_cr_done();
+ }
+ return 0;
+ }
+};
+
+class RemoveBucketShardStatusCollectCR : public RGWShardCollectCR {
+ static constexpr int max_concurrent_shards = 16;
+ RGWDataSyncCtx* const sc;
+ RGWDataSyncEnv* const sync_env;
+ rgw_bucket_sync_pair_info sync_pair;
+ const uint64_t gen;
+
+ const int num_shards;
+ int shard = 0;
+
+ int handle_result(int r) override {
+ if (r < 0) {
+ ldout(cct, 4) << "failed to remove bucket shard status object: "
+ << cpp_strerror(r) << dendl;
+ }
+ return r;
+ }
+ public:
+ RemoveBucketShardStatusCollectCR(RGWDataSyncCtx* sc,
+ const rgw_bucket_sync_pair_info& sync_pair,
+ uint64_t gen,
+ int num_shards)
+ : RGWShardCollectCR(sc->cct, max_concurrent_shards),
+ sc(sc), sync_env(sc->env), sync_pair(sync_pair), gen(gen), num_shards(num_shards)
+ {}
+
+ bool spawn_next() override {
+ if (shard >= num_shards) {
+ return false;
+ }
+ sync_pair.source_bs.shard_id = shard++;
+ spawn(new RemoveBucketShardStatusCR(sc, sync_pair, gen), false);
+ return true;
+ }
+};
+
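+// decide whether the bucket can resume incremental sync (gen=0 compatibility
+// check) or needs a full sync, initialize per-shard status, and write the
+// bucket-wide sync status object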
+class InitBucketFullSyncStatusCR : public RGWCoroutine {
+ RGWDataSyncCtx *sc;
+ RGWDataSyncEnv *sync_env;
+
+ const rgw_bucket_sync_pair_info& sync_pair;
+ const rgw_raw_obj& status_obj;
+ rgw_bucket_sync_status& status;
+ RGWObjVersionTracker& objv;
+ const RGWBucketInfo& source_info;
+ const bool check_compat;
+
+ const rgw_bucket_index_marker_info& info;
+ BucketIndexShardsManager marker_mgr;
+
+ bool all_incremental = true;
+ bool no_zero = false;
+
+public:
+ InitBucketFullSyncStatusCR(RGWDataSyncCtx* sc,
+ const rgw_bucket_sync_pair_info& sync_pair,
+ const rgw_raw_obj& status_obj,
+ rgw_bucket_sync_status& status,
+ RGWObjVersionTracker& objv,
+ const RGWBucketInfo& source_info,
+ bool check_compat,
+ const rgw_bucket_index_marker_info& info)
+ : RGWCoroutine(sc->cct), sc(sc), sync_env(sc->env),
+ sync_pair(sync_pair), status_obj(status_obj),
+ status(status), objv(objv), source_info(source_info),
+ check_compat(check_compat), info(info)
+ {}
+
+ int operate(const DoutPrefixProvider *dpp) override {
+ reenter(this) {
+ retcode = marker_mgr.from_string(info.max_marker, -1);
+ if (retcode < 0) {
+ lderr(cct) << "failed to parse bilog shard markers: "
+ << cpp_strerror(retcode) << dendl;
+ return set_cr_error(retcode);
+ }
+
+ status.state = BucketSyncState::Init;
+
+ if (info.oldest_gen == 0) {
+ if (check_compat) {
+ // use shard count from our log gen=0
+ // try to convert existing per-shard incremental status for backward compatibility
+ if (source_info.layout.logs.empty() ||
+ source_info.layout.logs.front().gen > 0) {
+ ldpp_dout(dpp, 20) << "no generation zero when checking compatibility" << dendl;
+ no_zero = true;
+ } else if (auto& log = source_info.layout.logs.front();
+ log.layout.type != rgw::BucketLogType::InIndex) {
+ ldpp_dout(dpp, 20) << "unrecognized log layout type when checking compatibility " << log.layout.type << dendl;
+ no_zero = true;
+ }
+ if (!no_zero) {
+ yield {
+ const int num_shards0 = rgw::num_shards(
+ source_info.layout.logs.front().layout.in_index.layout);
+ call(new CheckAllBucketShardStatusIsIncremental(sc, sync_pair,
+ num_shards0,
+ &all_incremental));
+ }
+ if (retcode < 0) {
+ return set_cr_error(retcode);
+ }
+ if (all_incremental) {
+ // we can use existing status and resume incremental sync
+ status.state = BucketSyncState::Incremental;
+ }
+ } else {
+ all_incremental = false;
+ }
+ }
+ }
+
+ if (status.state != BucketSyncState::Incremental) {
+ // initialize all shard sync status. this will populate the log marker
+ // positions where incremental sync will resume after full sync
+ yield {
+ const int num_shards = marker_mgr.get().size();
+ call(new InitBucketShardStatusCollectCR(sc, sync_pair, info.latest_gen, marker_mgr, num_shards));
+ }
+ if (retcode < 0) {
+ ldout(cct, 20) << "failed to init bucket shard status: "
+ << cpp_strerror(retcode) << dendl;
+ return set_cr_error(retcode);
+ }
+
+ if (sync_env->sync_module->should_full_sync()) {
+ status.state = BucketSyncState::Full;
+ } else {
+ status.state = BucketSyncState::Incremental;
+ }
+ }
+
+ status.shards_done_with_gen.resize(marker_mgr.get().size());
+ status.incremental_gen = info.latest_gen;
+
+ ldout(cct, 20) << "writing bucket sync status during init. state=" << status.state << ". marker=" << status.full.position << dendl;
+
+ // write bucket sync status
+ using CR = RGWSimpleRadosWriteCR<rgw_bucket_sync_status>;
+ yield call(new CR(dpp, sync_env->driver,
+ status_obj, status, &objv, false));
+ if (retcode < 0) {
+ ldout(cct, 20) << "failed to write bucket shard status: "
+ << cpp_strerror(retcode) << dendl;
+ return set_cr_error(retcode);
+ }
+ return set_cr_done();
+ }
+ return 0;
+ }
+};
+
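+// recovering bucket shards are listed from the omap keys of this datalog
+// shard's ".retry" error repo object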
+#define OMAP_READ_MAX_ENTRIES 10
+class RGWReadRecoveringBucketShardsCoroutine : public RGWCoroutine {
+ RGWDataSyncCtx *sc;
+ RGWDataSyncEnv *sync_env;
+ rgw::sal::RadosStore* driver;
+
+ const int shard_id;
+ int max_entries;
+
+ set<string>& recovering_buckets;
+ string marker;
+ string error_oid;
+
+ RGWRadosGetOmapKeysCR::ResultPtr omapkeys;
+ set<string> error_entries;
+ int max_omap_entries;
+ int count;
+
+public:
+ RGWReadRecoveringBucketShardsCoroutine(RGWDataSyncCtx *_sc, const int _shard_id,
+ set<string>& _recovering_buckets, const int _max_entries)
+ : RGWCoroutine(_sc->cct), sc(_sc), sync_env(_sc->env),
+ driver(sync_env->driver), shard_id(_shard_id), max_entries(_max_entries),
+ recovering_buckets(_recovering_buckets), max_omap_entries(OMAP_READ_MAX_ENTRIES)
+ {
+ error_oid = RGWDataSyncStatusManager::shard_obj_name(sc->source_zone, shard_id) + ".retry";
+ }
+
+ int operate(const DoutPrefixProvider *dpp) override;
+};
+
+int RGWReadRecoveringBucketShardsCoroutine::operate(const DoutPrefixProvider *dpp)
+{
+ reenter(this){
+ //read recovering bucket shards
+ count = 0;
+ do {
+ omapkeys = std::make_shared<RGWRadosGetOmapKeysCR::Result>();
+ yield call(new RGWRadosGetOmapKeysCR(driver, rgw_raw_obj(sync_env->svc->zone->get_zone_params().log_pool, error_oid),
+ marker, max_omap_entries, omapkeys));
+
+ if (retcode == -ENOENT) {
+ break;
+ }
+
+ if (retcode < 0) {
+ ldpp_dout(dpp, 0) << "failed to read recovering bucket shards with "
+ << cpp_strerror(retcode) << dendl;
+ return set_cr_error(retcode);
+ }
+
+ error_entries = std::move(omapkeys->entries);
+ if (error_entries.empty()) {
+ break;
+ }
+
+ count += error_entries.size();
+ marker = *error_entries.rbegin();
+ for (const std::string& key : error_entries) {
+ rgw_bucket_shard bs;
+ std::optional<uint64_t> gen;
+ if (int r = rgw::error_repo::decode_key(key, bs, gen); r < 0) {
+ // insert the key as-is
+ recovering_buckets.insert(std::move(key));
+ } else if (gen) {
+ recovering_buckets.insert(fmt::format("{}[{}]", bucket_shard_str{bs}, *gen));
+ } else {
+ recovering_buckets.insert(fmt::format("{}[full]", bucket_shard_str{bs}));
+ }
+ }
+ } while (omapkeys->more && count < max_entries);
+
+ return set_cr_done();
+ }
+
+ return 0;
+}
+
+class RGWReadPendingBucketShardsCoroutine : public RGWCoroutine {
+ RGWDataSyncCtx *sc;
+ RGWDataSyncEnv *sync_env;
+ rgw::sal::RadosStore* driver;
+
+ const int shard_id;
+ int max_entries;
+
+ set<string>& pending_buckets;
+ string marker;
+ string status_oid;
+
+ rgw_data_sync_marker* sync_marker;
+ int count;
+
+ std::string next_marker;
+ vector<rgw_data_change_log_entry> log_entries;
+ bool truncated;
+
+public:
+ RGWReadPendingBucketShardsCoroutine(RGWDataSyncCtx *_sc, const int _shard_id,
+ set<string>& _pending_buckets,
+ rgw_data_sync_marker* _sync_marker, const int _max_entries)
+ : RGWCoroutine(_sc->cct), sc(_sc), sync_env(_sc->env),
+ driver(sync_env->driver), shard_id(_shard_id), max_entries(_max_entries),
+ pending_buckets(_pending_buckets), sync_marker(_sync_marker)
+ {
+ status_oid = RGWDataSyncStatusManager::shard_obj_name(sc->source_zone, shard_id);
+ }
+
+ int operate(const DoutPrefixProvider *dpp) override;
+};
+
+int RGWReadPendingBucketShardsCoroutine::operate(const DoutPrefixProvider *dpp)
+{
+ reenter(this){
+ //read sync status marker
+ using CR = RGWSimpleRadosReadCR<rgw_data_sync_marker>;
+ yield call(new CR(dpp, sync_env->driver,
+ rgw_raw_obj(sync_env->svc->zone->get_zone_params().log_pool, status_oid),
+ sync_marker));
+ if (retcode < 0) {
+ ldpp_dout(dpp, 0) << "failed to read sync status marker with "
+ << cpp_strerror(retcode) << dendl;
+ return set_cr_error(retcode);
+ }
+
+ //read pending bucket shards
+ marker = sync_marker->marker;
+ count = 0;
+ do{
+ yield call(new RGWReadRemoteDataLogShardCR(sc, shard_id, marker,
+ &next_marker, &log_entries, &truncated));
+
+ if (retcode == -ENOENT) {
+ break;
+ }
+
+ if (retcode < 0) {
+ ldpp_dout(dpp, 0) << "failed to read remote data log info with "
+ << cpp_strerror(retcode) << dendl;
+ return set_cr_error(retcode);
+ }
+
+ if (log_entries.empty()) {
+ break;
+ }
+
+ count += log_entries.size();
+ for (const auto& entry : log_entries) {
+ pending_buckets.insert(entry.entry.key);
+ }
+ }while(truncated && count < max_entries);
+
+ return set_cr_done();
+ }
+
+ return 0;
+}
+
+int RGWRemoteDataLog::read_shard_status(const DoutPrefixProvider *dpp, int shard_id, set<string>& pending_buckets, set<string>& recovering_buckets, rgw_data_sync_marker *sync_marker, const int max_entries)
+{
+ // cannot run concurrently with run_sync(), so run in a separate manager
+ RGWCoroutinesManager crs(driver->ctx(), driver->getRados()->get_cr_registry());
+ RGWHTTPManager http_manager(driver->ctx(), crs.get_completion_mgr());
+ int ret = http_manager.start();
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "failed in http_manager.start() ret=" << ret << dendl;
+ return ret;
+ }
+ RGWDataSyncEnv sync_env_local = sync_env;
+ sync_env_local.http_manager = &http_manager;
+ RGWDataSyncCtx sc_local = sc;
+ sc_local.env = &sync_env_local;
+ list<RGWCoroutinesStack *> stacks;
+ RGWCoroutinesStack* recovering_stack = new RGWCoroutinesStack(driver->ctx(), &crs);
+ recovering_stack->call(new RGWReadRecoveringBucketShardsCoroutine(&sc_local, shard_id, recovering_buckets, max_entries));
+ stacks.push_back(recovering_stack);
+ RGWCoroutinesStack* pending_stack = new RGWCoroutinesStack(driver->ctx(), &crs);
+ pending_stack->call(new RGWReadPendingBucketShardsCoroutine(&sc_local, shard_id, pending_buckets, sync_marker, max_entries));
+ stacks.push_back(pending_stack);
+ ret = crs.run(dpp, stacks);
+ http_manager.stop();
+ return ret;
+}
+
+CephContext *RGWBucketPipeSyncStatusManager::get_cct() const
+{
+ return driver->ctx();
+}
+
+void rgw_bucket_entry_owner::decode_json(JSONObj *obj)
+{
+ JSONDecoder::decode_json("ID", id, obj);
+ JSONDecoder::decode_json("DisplayName", display_name, obj);
+}
+
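+// mirrors one entry of the remote bucket's versioned listing (see the
+// "versions"/"objs-container" REST call below), including rgwx extension fields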
+struct bucket_list_entry {
+ bool delete_marker;
+ rgw_obj_key key;
+ bool is_latest;
+ real_time mtime;
+ string etag;
+ uint64_t size;
+ string storage_class;
+ rgw_bucket_entry_owner owner;
+ uint64_t versioned_epoch;
+ string rgw_tag;
+
+ bucket_list_entry() : delete_marker(false), is_latest(false), size(0), versioned_epoch(0) {}
+
+ void decode_json(JSONObj *obj) {
+ JSONDecoder::decode_json("IsDeleteMarker", delete_marker, obj);
+ JSONDecoder::decode_json("Key", key.name, obj);
+ JSONDecoder::decode_json("VersionId", key.instance, obj);
+ JSONDecoder::decode_json("IsLatest", is_latest, obj);
+ string mtime_str;
+ JSONDecoder::decode_json("RgwxMtime", mtime_str, obj);
+
+ struct tm t;
+ uint32_t nsec;
+ if (parse_iso8601(mtime_str.c_str(), &t, &nsec)) {
+ ceph_timespec ts;
+ ts.tv_sec = (uint64_t)internal_timegm(&t);
+ ts.tv_nsec = nsec;
+ mtime = real_clock::from_ceph_timespec(ts);
+ }
+ JSONDecoder::decode_json("ETag", etag, obj);
+ JSONDecoder::decode_json("Size", size, obj);
+ JSONDecoder::decode_json("StorageClass", storage_class, obj);
+ JSONDecoder::decode_json("Owner", owner, obj);
+ JSONDecoder::decode_json("VersionedEpoch", versioned_epoch, obj);
+ JSONDecoder::decode_json("RgwxTag", rgw_tag, obj);
+ if (key.instance == "null" && !versioned_epoch) {
+ key.instance.clear();
+ }
+ }
+
+ RGWModifyOp get_modify_op() const {
+ if (delete_marker) {
+ return CLS_RGW_OP_LINK_OLH_DM;
+ } else if (!key.instance.empty() && key.instance != "null") {
+ return CLS_RGW_OP_LINK_OLH;
+ } else {
+ return CLS_RGW_OP_ADD;
+ }
+ }
+};
+
+struct bucket_list_result {
+ string name;
+ string prefix;
+ string key_marker;
+ string version_id_marker;
+ int max_keys;
+ bool is_truncated;
+ list<bucket_list_entry> entries;
+
+ bucket_list_result() : max_keys(0), is_truncated(false) {}
+
+ void decode_json(JSONObj *obj) {
+ JSONDecoder::decode_json("Name", name, obj);
+ JSONDecoder::decode_json("Prefix", prefix, obj);
+ JSONDecoder::decode_json("KeyMarker", key_marker, obj);
+ JSONDecoder::decode_json("VersionIdMarker", version_id_marker, obj);
+ JSONDecoder::decode_json("MaxKeys", max_keys, obj);
+ JSONDecoder::decode_json("IsTruncated", is_truncated, obj);
+ JSONDecoder::decode_json("Entries", entries, obj);
+ }
+};
+
+class RGWListRemoteBucketCR: public RGWCoroutine {
+ RGWDataSyncCtx *sc;
+ RGWDataSyncEnv *sync_env;
+ const rgw_bucket_shard& bs;
+ rgw_obj_key marker_position;
+
+ bucket_list_result *result;
+
+public:
+ RGWListRemoteBucketCR(RGWDataSyncCtx *_sc, const rgw_bucket_shard& bs,
+ rgw_obj_key& _marker_position, bucket_list_result *_result)
+ : RGWCoroutine(_sc->cct), sc(_sc), sync_env(_sc->env), bs(bs),
+ marker_position(_marker_position), result(_result) {}
+
+ int operate(const DoutPrefixProvider *dpp) override {
+ reenter(this) {
+ yield {
+ rgw_http_param_pair pairs[] = { { "versions" , NULL },
+ { "format" , "json" },
+ { "objs-container" , "true" },
+ { "key-marker" , marker_position.name.c_str() },
+ { "version-id-marker" , marker_position.instance.c_str() },
+ { NULL, NULL } };
+ string p = string("/") + bs.bucket.get_key(':', 0);
+ call(new RGWReadRESTResourceCR<bucket_list_result>(sync_env->cct, sc->conn, sync_env->http_manager, p, pairs, result));
+ }
+ if (retcode < 0) {
+ return set_cr_error(retcode);
+ }
+ return set_cr_done();
+ }
+ return 0;
+ }
+};
+
+struct next_bilog_result {
+ uint64_t generation = 0;
+ int num_shards = 0;
+
+ void decode_json(JSONObj *obj) {
+ JSONDecoder::decode_json("generation", generation, obj);
+ JSONDecoder::decode_json("num_shards", num_shards, obj);
+ }
+};
+
+struct bilog_list_result {
+ list<rgw_bi_log_entry> entries;
+ bool truncated{false};
+ std::optional<next_bilog_result> next_log;
+
+ void decode_json(JSONObj *obj) {
+ JSONDecoder::decode_json("entries", entries, obj);
+ JSONDecoder::decode_json("truncated", truncated, obj);
+ JSONDecoder::decode_json("next_log", next_log, obj);
+ }
+};
+
+class RGWListBucketIndexLogCR: public RGWCoroutine {
+ RGWDataSyncCtx *sc;
+ RGWDataSyncEnv *sync_env;
+ const string instance_key;
+ string marker;
+
+ bilog_list_result *result;
+ std::optional<PerfGuard> timer;
+ uint64_t generation;
+ std::string gen_str = std::to_string(generation);
+ uint32_t format_ver{1};
+
+public:
+ RGWListBucketIndexLogCR(RGWDataSyncCtx *_sc, const rgw_bucket_shard& bs, string& _marker,
+ uint64_t _generation, bilog_list_result *_result)
+ : RGWCoroutine(_sc->cct), sc(_sc), sync_env(_sc->env),
+ instance_key(bs.get_key()), marker(_marker), result(_result), generation(_generation) {}
+
+ int operate(const DoutPrefixProvider *dpp) override {
+ reenter(this) {
+ if (sync_env->counters) {
+ timer.emplace(sync_env->counters, sync_counters::l_poll);
+ }
+ yield {
+ rgw_http_param_pair pairs[] = { { "bucket-instance", instance_key.c_str() },
+ { "format" , "json" },
+ { "marker" , marker.c_str() },
+ { "type", "bucket-index" },
+ { "generation", gen_str.c_str() },
+ { "format-ver", "2"},
+ { NULL, NULL } };
+
+ call(new RGWReadRESTResourceCR<bilog_list_result>(sync_env->cct, sc->conn, sync_env->http_manager,
+ "/admin/log", pairs, result));
+ }
+ timer.reset();
+ if (retcode < 0) {
+ if (sync_env->counters) {
+ sync_env->counters->inc(sync_counters::l_poll_err);
+ }
+ return set_cr_error(retcode);
+ }
+ return set_cr_done();
+ }
+ return 0;
+ }
+};
+
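+// marker trackers persist the sync marker roughly once per window of completed
+// entries rather than after every object, limiting writes to the status object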
+#define BUCKET_SYNC_UPDATE_MARKER_WINDOW 10
+
+class RGWBucketFullSyncMarkerTrack : public RGWSyncShardMarkerTrack<rgw_obj_key, rgw_obj_key> {
+ RGWDataSyncCtx *sc;
+ RGWDataSyncEnv *sync_env;
+
+ const rgw_raw_obj& status_obj;
+ rgw_bucket_sync_status& sync_status;
+ RGWSyncTraceNodeRef tn;
+ RGWObjVersionTracker& objv_tracker;
+
+public:
+ RGWBucketFullSyncMarkerTrack(RGWDataSyncCtx *_sc,
+ const rgw_raw_obj& status_obj,
+ rgw_bucket_sync_status& sync_status,
+ RGWSyncTraceNodeRef tn,
+ RGWObjVersionTracker& objv_tracker)
+ : RGWSyncShardMarkerTrack(BUCKET_SYNC_UPDATE_MARKER_WINDOW),
+ sc(_sc), sync_env(_sc->env), status_obj(status_obj),
+ sync_status(sync_status), tn(std::move(tn)), objv_tracker(objv_tracker)
+ {}
+
+
+ RGWCoroutine *store_marker(const rgw_obj_key& new_marker, uint64_t index_pos, const real_time& timestamp) override {
+ sync_status.full.position = new_marker;
+ sync_status.full.count = index_pos;
+
+ tn->log(20, SSTR("updating marker oid=" << status_obj.oid << " marker=" << new_marker));
+ return new RGWSimpleRadosWriteCR<rgw_bucket_sync_status>(
+ sync_env->dpp, sync_env->driver,
+ status_obj, sync_status, &objv_tracker);
+ }
+
+ RGWOrderCallCR *allocate_order_control_cr() override {
+ return new RGWLastCallerWinsCR(sync_env->cct);
+ }
+};
+
+// write the incremental sync status and update 'stable_timestamp' on success
+class RGWWriteBucketShardIncSyncStatus : public RGWCoroutine {
+ RGWDataSyncEnv *sync_env;
+ rgw_raw_obj obj;
+ rgw_bucket_shard_inc_sync_marker sync_marker;
+ ceph::real_time* stable_timestamp;
+ RGWObjVersionTracker& objv_tracker;
+ std::map<std::string, bufferlist> attrs;
+ public:
+ RGWWriteBucketShardIncSyncStatus(RGWDataSyncEnv *sync_env,
+ const rgw_raw_obj& obj,
+ const rgw_bucket_shard_inc_sync_marker& sync_marker,
+ ceph::real_time* stable_timestamp,
+ RGWObjVersionTracker& objv_tracker)
+ : RGWCoroutine(sync_env->cct), sync_env(sync_env), obj(obj),
+ sync_marker(sync_marker), stable_timestamp(stable_timestamp),
+ objv_tracker(objv_tracker)
+ {}
+ int operate(const DoutPrefixProvider *dpp) {
+ reenter(this) {
+ sync_marker.encode_attr(attrs);
+
+ yield call(new RGWSimpleRadosWriteAttrsCR(sync_env->dpp, sync_env->driver,
+ obj, attrs, &objv_tracker));
+ if (retcode < 0) {
+ return set_cr_error(retcode);
+ }
+ if (stable_timestamp) {
+ *stable_timestamp = sync_marker.timestamp;
+ }
+ return set_cr_done();
+ }
+ return 0;
+ }
+};
+
+class RGWBucketIncSyncShardMarkerTrack : public RGWSyncShardMarkerTrack<string, rgw_obj_key> {
+ RGWDataSyncCtx *sc;
+ RGWDataSyncEnv *sync_env;
+
+ rgw_raw_obj obj;
+ rgw_bucket_shard_inc_sync_marker sync_marker;
+
+ map<rgw_obj_key, string> key_to_marker;
+
+ struct operation {
+ rgw_obj_key key;
+ bool is_olh;
+ };
+ map<string, operation> marker_to_op;
+ std::set<std::string> pending_olh; // object names with pending olh operations
+
+ RGWSyncTraceNodeRef tn;
+ RGWObjVersionTracker& objv_tracker;
+ ceph::real_time* stable_timestamp;
+
+ void handle_finish(const string& marker) override {
+ auto iter = marker_to_op.find(marker);
+ if (iter == marker_to_op.end()) {
+ return;
+ }
+ auto& op = iter->second;
+ key_to_marker.erase(op.key);
+ reset_need_retry(op.key);
+ if (op.is_olh) {
+ pending_olh.erase(op.key.name);
+ }
+ marker_to_op.erase(iter);
+ }
+
+public:
+ RGWBucketIncSyncShardMarkerTrack(RGWDataSyncCtx *_sc,
+ const string& _marker_oid,
+ const rgw_bucket_shard_inc_sync_marker& _marker,
+ RGWSyncTraceNodeRef tn,
+ RGWObjVersionTracker& objv_tracker,
+ ceph::real_time* stable_timestamp)
+ : RGWSyncShardMarkerTrack(BUCKET_SYNC_UPDATE_MARKER_WINDOW),
+ sc(_sc), sync_env(_sc->env),
+ obj(sync_env->svc->zone->get_zone_params().log_pool, _marker_oid),
+ sync_marker(_marker), tn(std::move(tn)), objv_tracker(objv_tracker),
+ stable_timestamp(stable_timestamp)
+ {}
+
+ const rgw_raw_obj& get_obj() const { return obj; }
+
+ RGWCoroutine* store_marker(const string& new_marker, uint64_t index_pos, const real_time& timestamp) override {
+ sync_marker.position = new_marker;
+ sync_marker.timestamp = timestamp;
+
+ tn->log(20, SSTR("updating marker marker_oid=" << obj.oid << " marker=" << new_marker << " timestamp=" << timestamp));
+ return new RGWWriteBucketShardIncSyncStatus(sync_env, obj, sync_marker,
+ stable_timestamp, objv_tracker);
+ }
+
+ /*
+ * Create an index from key -> <op, marker> and from marker -> key. This
+ * ensures there is only one in-flight entry per key, which is needed during
+ * incremental data sync: we don't want to run multiple concurrent sync
+ * operations for the same key within a bucket shard, nor concurrent
+ * operations on the same key with different ops.
+ */
+ bool index_key_to_marker(const rgw_obj_key& key, const string& marker, bool is_olh) {
+ auto result = key_to_marker.emplace(key, marker);
+ if (!result.second) { // exists
+ set_need_retry(key);
+ return false;
+ }
+ marker_to_op[marker] = operation{key, is_olh};
+ if (is_olh) {
+ // prevent other olh ops from starting on this object name
+ pending_olh.insert(key.name);
+ }
+ return true;
+ }
+
+ bool can_do_op(const rgw_obj_key& key, bool is_olh) {
+ // serialize olh ops on the same object name
+ if (is_olh && pending_olh.count(key.name)) {
+ tn->log(20, SSTR("sync of " << key << " waiting for pending olh op"));
+ return false;
+ }
+ return (key_to_marker.find(key) == key_to_marker.end());
+ }
+
+ RGWOrderCallCR *allocate_order_control_cr() override {
+ return new RGWLastCallerWinsCR(sync_env->cct);
+ }
+};
+
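+// object-level errors treated as non-fatal: the failure is not written to the
+// error repo and the shard marker still advances past the entry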
+static bool ignore_sync_error(int err) {
+ switch (err) {
+ case -ENOENT:
+ case -EPERM:
+ return true;
+ default:
+ break;
+ }
+ return false;
+}
+
+template <class T, class K>
+class RGWBucketSyncSingleEntryCR : public RGWCoroutine {
+ RGWDataSyncCtx *sc;
+ RGWDataSyncEnv *sync_env;
+
+ rgw_bucket_sync_pipe& sync_pipe;
+ rgw_bucket_shard& bs;
+
+ rgw_obj_key key;
+ bool versioned;
+ std::optional<uint64_t> versioned_epoch;
+ rgw_bucket_entry_owner owner;
+ real_time timestamp;
+ RGWModifyOp op;
+ RGWPendingState op_state;
+
+ T entry_marker;
+ RGWSyncShardMarkerTrack<T, K> *marker_tracker;
+
+ int sync_status;
+
+ stringstream error_ss;
+
+ bool error_injection;
+
+ RGWDataSyncModule *data_sync_module;
+
+ rgw_zone_set_entry source_trace_entry;
+ rgw_zone_set zones_trace;
+
+ RGWSyncTraceNodeRef tn;
+ std::string zone_name;
+
+public:
+ RGWBucketSyncSingleEntryCR(RGWDataSyncCtx *_sc,
+ rgw_bucket_sync_pipe& _sync_pipe,
+ const rgw_obj_key& _key, bool _versioned,
+ std::optional<uint64_t> _versioned_epoch,
+ real_time& _timestamp,
+ const rgw_bucket_entry_owner& _owner,
+ RGWModifyOp _op, RGWPendingState _op_state,
+ const T& _entry_marker, RGWSyncShardMarkerTrack<T, K> *_marker_tracker, rgw_zone_set& _zones_trace,
+ RGWSyncTraceNodeRef& _tn_parent) : RGWCoroutine(_sc->cct),
+ sc(_sc), sync_env(_sc->env),
+ sync_pipe(_sync_pipe), bs(_sync_pipe.info.source_bs),
+ key(_key), versioned(_versioned), versioned_epoch(_versioned_epoch),
+ owner(_owner),
+ timestamp(_timestamp), op(_op),
+ op_state(_op_state),
+ entry_marker(_entry_marker),
+ marker_tracker(_marker_tracker),
+ sync_status(0){
+ stringstream ss;
+ ss << bucket_shard_str{bs} << "/" << key << "[" << versioned_epoch.value_or(0) << "]";
+ set_description() << "bucket sync single entry (source_zone=" << sc->source_zone << ") b=" << ss.str() << " log_entry=" << entry_marker << " op=" << (int)op << " op_state=" << (int)op_state;
+ set_status("init");
+
+ tn = sync_env->sync_tracer->add_node(_tn_parent, "entry", SSTR(key));
+
+ tn->log(20, SSTR("bucket sync single entry (source_zone=" << sc->source_zone << ") b=" << ss.str() << " log_entry=" << entry_marker << " op=" << (int)op << " op_state=" << (int)op_state));
+ error_injection = (sync_env->cct->_conf->rgw_sync_data_inject_err_probability > 0);
+
+ data_sync_module = sync_env->sync_module->get_data_handler();
+
+ source_trace_entry.zone = sc->source_zone.id;
+ source_trace_entry.location_key = _sync_pipe.info.source_bs.bucket.get_key();
+
+ zones_trace = _zones_trace;
+ zones_trace.insert(sync_env->svc->zone->get_zone().id, _sync_pipe.info.dest_bucket.get_key());
+
+ if (sc->env->ostr) {
+ RGWZone* z;
+ if ((z = sc->env->driver->svc()->zone->find_zone(sc->source_zone))) {
+ zone_name = z->name;
+ }
+ }
+ }
+
+ int operate(const DoutPrefixProvider *dpp) override {
+ reenter(this) {
+ /* skip entries that are not complete */
+ if (op_state != CLS_RGW_STATE_COMPLETE) {
+ goto done;
+ }
+ tn->set_flag(RGW_SNS_FLAG_ACTIVE);
+ do {
+ yield {
+ marker_tracker->reset_need_retry(key);
+ if (key.name.empty()) {
+ /* shouldn't happen */
+ set_status("skipping empty entry");
+ tn->log(0, "entry with empty obj name, skipping");
+ goto done;
+ }
+ if (error_injection &&
+ rand() % 10000 < cct->_conf->rgw_sync_data_inject_err_probability * 10000.0) {
+ tn->log(0, SSTR(": injecting data sync error on key=" << key.name));
+ retcode = -EIO;
+ } else if (op == CLS_RGW_OP_ADD ||
+ op == CLS_RGW_OP_LINK_OLH) {
+ set_status("syncing obj");
+ tn->log(5, SSTR("bucket sync: sync obj: " << sc->source_zone << "/" << bs.bucket << "/" << key << "[" << versioned_epoch.value_or(0) << "]"));
+ if (versioned_epoch) {
+ pretty_print(sc->env, "Syncing object s3://{}/{} version {} in sync from zone {}\n",
+ bs.bucket.name, key, *versioned_epoch, zone_name);
+ } else {
+ pretty_print(sc->env, "Syncing object s3://{}/{} in sync from zone {}\n",
+ bs.bucket.name, key, zone_name);
+ }
+ call(data_sync_module->sync_object(dpp, sc, sync_pipe, key, versioned_epoch,
+ source_trace_entry, &zones_trace));
+ } else if (op == CLS_RGW_OP_DEL || op == CLS_RGW_OP_UNLINK_INSTANCE) {
+ set_status("removing obj");
+ if (versioned_epoch) {
+ pretty_print(sc->env, "Deleting object s3://{}/{} version {} in sync from zone {}\n",
+ bs.bucket.name, key, *versioned_epoch, zone_name);
+ } else {
+ pretty_print(sc->env, "Deleting object s3://{}/{} in sync from zone {}\n",
+ bs.bucket.name, key, zone_name);
+ }
+ if (op == CLS_RGW_OP_UNLINK_INSTANCE) {
+ versioned = true;
+ }
+ tn->log(10, SSTR("removing obj: " << sc->source_zone << "/" << bs.bucket << "/" << key << "[" << versioned_epoch.value_or(0) << "]"));
+ call(data_sync_module->remove_object(dpp, sc, sync_pipe, key, timestamp, versioned, versioned_epoch.value_or(0), &zones_trace));
+ // our copy of the object is more recent, continue as if it succeeded
+ } else if (op == CLS_RGW_OP_LINK_OLH_DM) {
+ set_status("creating delete marker");
+ tn->log(10, SSTR("creating delete marker: obj: " << sc->source_zone << "/" << bs.bucket << "/" << key << "[" << versioned_epoch.value_or(0) << "]"));
+ call(data_sync_module->create_delete_marker(dpp, sc, sync_pipe, key, timestamp, owner, versioned, versioned_epoch.value_or(0), &zones_trace));
+ }
+ tn->set_resource_name(SSTR(bucket_str_noinstance(bs.bucket) << "/" << key));
+ }
+ if (retcode == -ERR_PRECONDITION_FAILED) {
+ pretty_print(sc->env, "Skipping object s3://{}/{} in sync from zone {}\n",
+ bs.bucket.name, key, zone_name);
+ set_status("Skipping object sync: precondition failed (object contains newer change or policy doesn't allow sync)");
+ tn->log(0, "Skipping object sync: precondition failed (object contains newer change or policy doesn't allow sync)");
+ retcode = 0;
+ }
+ } while (marker_tracker->need_retry(key));
+ {
+ tn->unset_flag(RGW_SNS_FLAG_ACTIVE);
+ if (retcode >= 0) {
+ tn->log(10, "success");
+ } else {
+ tn->log(10, SSTR("failed, retcode=" << retcode << " (" << cpp_strerror(-retcode) << ")"));
+ }
+ }
+
+ if (retcode < 0 && retcode != -ENOENT) {
+ set_status() << "failed to sync obj; retcode=" << retcode;
+ tn->log(0, SSTR("ERROR: failed to sync object: "
+ << bucket_shard_str{bs} << "/" << key.name));
+ if (!ignore_sync_error(retcode)) {
+ error_ss << bucket_shard_str{bs} << "/" << key.name;
+ sync_status = retcode;
+ }
+ }
+ if (!error_ss.str().empty()) {
+ yield call(sync_env->error_logger->log_error_cr(dpp, sc->conn->get_remote_id(), "data", error_ss.str(), -retcode, string("failed to sync object") + cpp_strerror(-sync_status)));
+ }
+done:
+ if (sync_status == 0) {
+ /* update marker */
+ set_status() << "calling marker_tracker->finish(" << entry_marker << ")";
+ yield call(marker_tracker->finish(entry_marker));
+ sync_status = retcode;
+ }
+ if (sync_status < 0) {
+ return set_cr_error(sync_status);
+ }
+ return set_cr_done();
+ }
+ return 0;
+ }
+};
+
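+// full sync: list the remote bucket over REST and spawn a
+// RGWBucketSyncSingleEntryCR per listed entry, tracking progress through the
+// full-sync marker tracker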
+class RGWBucketFullSyncCR : public RGWCoroutine {
+ RGWDataSyncCtx *sc;
+ RGWDataSyncEnv *sync_env;
+ rgw_bucket_sync_pipe& sync_pipe;
+ rgw_bucket_sync_status& sync_status;
+ rgw_bucket_shard& bs;
+ boost::intrusive_ptr<const RGWContinuousLeaseCR> lease_cr;
+ bucket_list_result list_result;
+ list<bucket_list_entry>::iterator entries_iter;
+ rgw_obj_key list_marker;
+ bucket_list_entry *entry{nullptr};
+
+ int total_entries{0};
+
+ int sync_result{0};
+
+ const rgw_raw_obj& status_obj;
+ RGWObjVersionTracker& objv;
+
+ rgw_zone_set zones_trace;
+
+ RGWSyncTraceNodeRef tn;
+ RGWBucketFullSyncMarkerTrack marker_tracker;
+
+ struct _prefix_handler {
+ RGWBucketSyncFlowManager::pipe_rules_ref rules;
+ RGWBucketSyncFlowManager::pipe_rules::prefix_map_t::const_iterator iter;
+ std::optional<string> cur_prefix;
+
+ void set_rules(RGWBucketSyncFlowManager::pipe_rules_ref& _rules) {
+ rules = _rules;
+ }
+
+ bool revalidate_marker(rgw_obj_key *marker) {
+ if (cur_prefix &&
+ boost::starts_with(marker->name, *cur_prefix)) {
+ return true;
+ }
+ if (!rules) {
+ return false;
+ }
+ iter = rules->prefix_search(marker->name);
+ if (iter == rules->prefix_end()) {
+ return false;
+ }
+ cur_prefix = iter->first;
+ marker->name = *cur_prefix;
+ marker->instance.clear();
+ return true;
+ }
+
+ bool check_key_handled(const rgw_obj_key& key) {
+ if (!rules) {
+ return false;
+ }
+ if (cur_prefix &&
+ boost::starts_with(key.name, *cur_prefix)) {
+ return true;
+ }
+ iter = rules->prefix_search(key.name);
+ if (iter == rules->prefix_end()) {
+ return false;
+ }
+ cur_prefix = iter->first;
+ return boost::starts_with(key.name, iter->first);
+ }
+ } prefix_handler;
+
+public:
+ RGWBucketFullSyncCR(RGWDataSyncCtx *_sc,
+ rgw_bucket_sync_pipe& _sync_pipe,
+ const rgw_raw_obj& status_obj,
+ boost::intrusive_ptr<const RGWContinuousLeaseCR> lease_cr,
+ rgw_bucket_sync_status& sync_status,
+ RGWSyncTraceNodeRef tn_parent,
+ RGWObjVersionTracker& objv_tracker)
+ : RGWCoroutine(_sc->cct), sc(_sc), sync_env(_sc->env),
+ sync_pipe(_sync_pipe), sync_status(sync_status),
+ bs(_sync_pipe.info.source_bs),
+ lease_cr(std::move(lease_cr)), status_obj(status_obj), objv(objv_tracker),
+ tn(sync_env->sync_tracer->add_node(tn_parent, "full_sync",
+ SSTR(bucket_shard_str{bs}))),
+ marker_tracker(sc, status_obj, sync_status, tn, objv_tracker)
+ {
+ zones_trace.insert(sc->source_zone.id, sync_pipe.info.dest_bucket.get_key());
+ prefix_handler.set_rules(sync_pipe.get_rules());
+ }
+
+ int operate(const DoutPrefixProvider *dpp) override;
+};
+
+int RGWBucketFullSyncCR::operate(const DoutPrefixProvider *dpp)
+{
+ reenter(this) {
+ list_marker = sync_status.full.position;
+
+ total_entries = sync_status.full.count;
+ do {
+ if (lease_cr && !lease_cr->is_locked()) {
+ tn->log(1, "no lease or lease is lost, abort");
+ drain_all();
+ yield call(marker_tracker.flush());
+ if (retcode < 0) {
+ tn->log(0, SSTR("ERROR: bucket full sync marker_tracker.flush() returned retcode=" << retcode));
+ return set_cr_error(retcode);
+ }
+ return set_cr_error(-ECANCELED);
+ }
+ set_status("listing remote bucket");
+ tn->log(20, "listing bucket for full sync");
+
+ if (!prefix_handler.revalidate_marker(&list_marker)) {
+ set_status() << "finished iterating over all available prefixes: last marker=" << list_marker;
+ tn->log(20, SSTR("finished iterating over all available prefixes: last marker=" << list_marker));
+ break;
+ }
+
+ yield call(new RGWListRemoteBucketCR(sc, bs, list_marker, &list_result));
+ if (retcode < 0 && retcode != -ENOENT) {
+ set_status("failed bucket listing, going down");
+ drain_all();
+ yield spawn(marker_tracker.flush(), true);
+ return set_cr_error(retcode);
+ }
+ if (list_result.entries.size() > 0) {
+ tn->set_flag(RGW_SNS_FLAG_ACTIVE); /* actually have entries to sync */
+ }
+ entries_iter = list_result.entries.begin();
+ for (; entries_iter != list_result.entries.end(); ++entries_iter) {
+ if (lease_cr && !lease_cr->is_locked()) {
+ drain_all();
+ yield call(marker_tracker.flush());
+ tn->log(1, "no lease or lease is lost, abort");
+ if (retcode < 0) {
+ tn->log(0, SSTR("ERROR: bucket full sync marker_tracker.flush() returned retcode=" << retcode));
+ return set_cr_error(retcode);
+ }
+ return set_cr_error(-ECANCELED);
+ }
+ tn->log(20, SSTR("[full sync] syncing object: "
+ << bucket_shard_str{bs} << "/" << entries_iter->key));
+ entry = &(*entries_iter);
+ list_marker = entries_iter->key;
+ if (!prefix_handler.check_key_handled(entries_iter->key)) {
+ set_status() << "skipping entry due to policy rules: " << entries_iter->key;
+ tn->log(20, SSTR("skipping entry due to policy rules: " << entries_iter->key));
+ continue;
+ }
+ total_entries++;
+ if (!marker_tracker.start(entry->key, total_entries, real_time())) {
+ tn->log(0, SSTR("ERROR: cannot start syncing " << entry->key << ". Duplicate entry?"));
+ } else {
+ using SyncCR = RGWBucketSyncSingleEntryCR<rgw_obj_key, rgw_obj_key>;
+ yield spawn(new SyncCR(sc, sync_pipe, entry->key,
+ false, /* versioned, only matters for object removal */
+ entry->versioned_epoch, entry->mtime,
+ entry->owner, entry->get_modify_op(), CLS_RGW_STATE_COMPLETE,
+ entry->key, &marker_tracker, zones_trace, tn),
+ false);
+ }
+ drain_with_cb(sc->lcc.adj_concurrency(cct->_conf->rgw_bucket_sync_spawn_window),
+ [&](uint64_t stack_id, int ret) {
+ if (ret < 0) {
+ tn->log(10, "a sync operation returned error");
+ sync_result = ret;
+ }
+ return 0;
+ });
+ }
+ } while (list_result.is_truncated && sync_result == 0);
+ set_status("done iterating over all objects");
+
+ /* wait for all operations to complete */
+ drain_all_cb([&](uint64_t stack_id, int ret) {
+ if (ret < 0) {
+ tn->log(10, "a sync operation returned error");
+ sync_result = ret;
+ }
+ return 0;
+ });
+ tn->unset_flag(RGW_SNS_FLAG_ACTIVE);
+ if (lease_cr && !lease_cr->is_locked()) {
+ tn->log(1, "no lease or lease is lost, abort");
+ yield call(marker_tracker.flush());
+ if (retcode < 0) {
+ tn->log(0, SSTR("ERROR: bucket full sync marker_tracker.flush() returned retcode=" << retcode));
+ return set_cr_error(retcode);
+ }
+ return set_cr_error(-ECANCELED);
+ }
+ yield call(marker_tracker.flush());
+ if (retcode < 0) {
+ tn->log(0, SSTR("ERROR: bucket full sync marker_tracker.flush() returned retcode=" << retcode));
+ return set_cr_error(retcode);
+ }
+ /* update sync state to incremental */
+ if (sync_result == 0) {
+ sync_status.state = BucketSyncState::Incremental;
+ tn->log(5, SSTR("set bucket state=" << sync_status.state));
+ yield call(new RGWSimpleRadosWriteCR<rgw_bucket_sync_status>(
+ dpp, sync_env->driver, status_obj, sync_status, &objv));
+ tn->log(5, SSTR("bucket status objv=" << objv));
+ } else {
+ tn->log(10, SSTR("backing out with sync_status=" << sync_result));
+ }
+ if (retcode < 0 && sync_result == 0) { /* actually tried to set incremental state and failed */
+ tn->log(0, SSTR("ERROR: failed to set sync state on bucket "
+ << bucket_shard_str{bs} << " retcode=" << retcode));
+ return set_cr_error(retcode);
+ }
+ if (sync_result < 0) {
+ return set_cr_error(sync_result);
+ }
+ return set_cr_done();
+ }
+ return 0;
+}
+
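+// bilog operations that modify an object's OLH (object logical head) and
+// therefore carry a versioned epoch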
+static bool has_olh_epoch(RGWModifyOp op) {
+ return op == CLS_RGW_OP_LINK_OLH || op == CLS_RGW_OP_UNLINK_INSTANCE;
+}
+
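+// mark this shard as done with the current bilog generation; once every shard
+// is done, advance the bucket status to the next generation. Racing writers
+// are handled by retrying on -ECANCELED.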
+class RGWBucketShardIsDoneCR : public RGWCoroutine {
+ RGWDataSyncCtx *sc;
+ RGWDataSyncEnv *sync_env;
+ rgw_bucket_sync_status bucket_status;
+ const rgw_raw_obj& bucket_status_obj;
+ const int shard_id;
+ RGWObjVersionTracker objv_tracker;
+ const next_bilog_result& next_log;
+ const uint64_t generation;
+
+public:
+ RGWBucketShardIsDoneCR(RGWDataSyncCtx *_sc, const rgw_raw_obj& _bucket_status_obj,
+ int _shard_id, const next_bilog_result& _next_log, const uint64_t _gen)
+ : RGWCoroutine(_sc->cct), sc(_sc), sync_env(_sc->env),
+ bucket_status_obj(_bucket_status_obj),
+ shard_id(_shard_id), next_log(_next_log), generation(_gen) {}
+
+ int operate(const DoutPrefixProvider* dpp) override
+ {
+ reenter(this) {
+ do {
+ // read bucket sync status
+ objv_tracker.clear();
+ using ReadCR = RGWSimpleRadosReadCR<rgw_bucket_sync_status>;
+ yield call(new ReadCR(dpp, sync_env->driver,
+ bucket_status_obj, &bucket_status, false, &objv_tracker));
+ if (retcode < 0) {
+ ldpp_dout(dpp, 20) << "failed to read bucket shard status: "
+ << cpp_strerror(retcode) << dendl;
+ return set_cr_error(retcode);
+ }
+
+ if (bucket_status.state != BucketSyncState::Incremental) {
+ // exit with success to avoid stale shard being
+ // retried in error repo if we lost a race
+ ldpp_dout(dpp, 20) << "RGWBucketShardIsDoneCR found sync state = " << bucket_status.state << dendl;
+ return set_cr_done();
+ }
+
+ if (bucket_status.incremental_gen != generation) {
+ // exit with success to avoid stale shard being
+ // retried in error repo if we lost a race
+ ldpp_dout(dpp, 20) << "RGWBucketShardIsDoneCR expected gen: " << generation
+ << ", got: " << bucket_status.incremental_gen << dendl;
+ return set_cr_done();
+ }
+
+ yield {
+ // update bucket_status after a shard is done with current gen
+ auto& done = bucket_status.shards_done_with_gen;
+ done[shard_id] = true;
+
+ // increment gen if all shards are already done with current gen
+ if (std::all_of(done.begin(), done.end(),
+ [] (const bool done){return done; } )) {
+ bucket_status.incremental_gen = next_log.generation;
+ done.clear();
+ done.resize(next_log.num_shards, false);
+ }
+ ldpp_dout(dpp, 20) << "bucket status incremental gen is " << bucket_status.incremental_gen << dendl;
+ using WriteCR = RGWSimpleRadosWriteCR<rgw_bucket_sync_status>;
+ call(new WriteCR(dpp, sync_env->driver,
+ bucket_status_obj, bucket_status, &objv_tracker, false));
+ }
+ if (retcode < 0 && retcode != -ECANCELED) {
+ ldpp_dout(dpp, 20) << "failed to write bucket sync status: " << cpp_strerror(retcode) << dendl;
+ return set_cr_error(retcode);
+ } else if (retcode >= 0) {
+ return set_cr_done();
+ }
+ } while (retcode == -ECANCELED);
+ }
+ return 0;
+ }
+};
+
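+// incrementally sync one bucket shard by replaying its bucket index log
+// entries through RGWBucketSyncSingleEntryCR, tracking progress with a marker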
+class RGWBucketShardIncrementalSyncCR : public RGWCoroutine {
+ RGWDataSyncCtx *sc;
+ RGWDataSyncEnv *sync_env;
+ rgw_bucket_sync_pipe& sync_pipe;
+ RGWBucketSyncFlowManager::pipe_rules_ref rules;
+ rgw_bucket_shard& bs;
+ const rgw_raw_obj& bucket_status_obj;
+ boost::intrusive_ptr<const RGWContinuousLeaseCR> lease_cr;
+ bilog_list_result extended_result;
+ list<rgw_bi_log_entry> list_result;
+ int next_num_shards;
+ uint64_t next_gen;
+ bool truncated;
+
+ list<rgw_bi_log_entry>::iterator entries_iter, entries_end;
+ map<pair<string, string>, pair<real_time, RGWModifyOp> > squash_map;
+ rgw_bucket_shard_sync_info& sync_info;
+ uint64_t generation;
+ rgw_obj_key key;
+ rgw_bi_log_entry *entry{nullptr};
+ bool updated_status{false};
+ rgw_zone_id zone_id;
+ string target_location_key;
+
+ string cur_id;
+
+ int sync_status{0};
+ bool syncstopped{false};
+
+ RGWSyncTraceNodeRef tn;
+ RGWBucketIncSyncShardMarkerTrack marker_tracker;
+
+public:
+ RGWBucketShardIncrementalSyncCR(RGWDataSyncCtx *_sc,
+ rgw_bucket_sync_pipe& _sync_pipe,
+ const std::string& shard_status_oid,
+ const rgw_raw_obj& _bucket_status_obj,
+ boost::intrusive_ptr<const RGWContinuousLeaseCR> lease_cr,
+ rgw_bucket_shard_sync_info& sync_info,
+ uint64_t generation,
+ RGWSyncTraceNodeRef& _tn_parent,
+ RGWObjVersionTracker& objv_tracker,
+ ceph::real_time* stable_timestamp)
+ : RGWCoroutine(_sc->cct), sc(_sc), sync_env(_sc->env),
+ sync_pipe(_sync_pipe), bs(_sync_pipe.info.source_bs),
+ bucket_status_obj(_bucket_status_obj), lease_cr(std::move(lease_cr)),
+ sync_info(sync_info), generation(generation), zone_id(sync_env->svc->zone->get_zone().id),
+ tn(sync_env->sync_tracer->add_node(_tn_parent, "inc_sync",
+ SSTR(bucket_shard_str{bs}))),
+ marker_tracker(sc, shard_status_oid, sync_info.inc_marker, tn,
+ objv_tracker, stable_timestamp)
+ {
+ set_description() << "bucket shard incremental sync bucket="
+ << bucket_shard_str{bs};
+ set_status("init");
+ rules = sync_pipe.get_rules();
+ target_location_key = sync_pipe.info.dest_bucket.get_key();
+ }
+
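+ // check the pipe's prefix rules to decide whether this object key is
+ // handled (and should be synced) by this sync pipe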
+ bool check_key_handled(const rgw_obj_key& key) {
+ if (!rules) {
+ return false;
+ }
+ auto iter = rules->prefix_search(key.name);
+ if (iter == rules->prefix_end()) {
+ return false;
+ }
+ return boost::starts_with(key.name, iter->first);
+ }
+
+ int operate(const DoutPrefixProvider *dpp) override;
+};
+
+int RGWBucketShardIncrementalSyncCR::operate(const DoutPrefixProvider *dpp)
+{
+ int ret;
+ reenter(this) {
+ do {
+ if (lease_cr && !lease_cr->is_locked()) {
+ tn->log(1, "no lease or lease is lost, abort");
+ drain_all();
+ yield call(marker_tracker.flush());
+ if (retcode < 0) {
+ tn->log(0, SSTR("ERROR: incremental sync marker_tracker.flush() returned retcode=" << retcode));
+ return set_cr_error(retcode);
+ }
+ return set_cr_error(-ECANCELED);
+ }
+ tn->log(20, SSTR("listing bilog for incremental sync; position=" << sync_info.inc_marker.position));
+ set_status() << "listing bilog; position=" << sync_info.inc_marker.position;
+ yield call(new RGWListBucketIndexLogCR(sc, bs, sync_info.inc_marker.position, generation, &extended_result));
+ if (retcode < 0 && retcode != -ENOENT) {
+ /* wait for all operations to complete */
+ drain_all();
+ yield spawn(marker_tracker.flush(), true);
+ return set_cr_error(retcode);
+ }
+ list_result = std::move(extended_result.entries);
+ truncated = extended_result.truncated;
+ if (extended_result.next_log) {
+ next_gen = extended_result.next_log->generation;
+ next_num_shards = extended_result.next_log->num_shards;
+ }
+
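+ // first pass over the bilog entries: build a squash map that keeps only the
+ // newest completed, non-redundant op per (object, instance), so older
+ // entries for the same key can be skipped in the second pass below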
+ squash_map.clear();
+ entries_iter = list_result.begin();
+ entries_end = list_result.end();
+ for (; entries_iter != entries_end; ++entries_iter) {
+ auto e = *entries_iter;
+ if (e.op == RGWModifyOp::CLS_RGW_OP_SYNCSTOP) {
+ ldpp_dout(dpp, 20) << "syncstop at: " << e.timestamp << ". marker: " << e.id << dendl;
+ syncstopped = true;
+ entries_end = std::next(entries_iter); // stop after this entry
+ break;
+ }
+ if (e.op == RGWModifyOp::CLS_RGW_OP_RESYNC) {
+ ldpp_dout(dpp, 20) << "syncstart at: " << e.timestamp << ". marker: " << e.id << dendl;
+ continue;
+ }
+ if (e.op == CLS_RGW_OP_CANCEL) {
+ continue;
+ }
+ if (e.state != CLS_RGW_STATE_COMPLETE) {
+ continue;
+ }
+ if (e.zones_trace.exists(zone_id.id, target_location_key)) {
+ continue;
+ }
+ auto& squash_entry = squash_map[make_pair(e.object, e.instance)];
+ // don't squash over olh entries - we need to apply their olh_epoch
+ if (has_olh_epoch(squash_entry.second) && !has_olh_epoch(e.op)) {
+ continue;
+ }
+ if (squash_entry.first <= e.timestamp) {
+ squash_entry = make_pair<>(e.timestamp, e.op);
+ }
+ }
+
+ entries_iter = list_result.begin();
+ for (; entries_iter != entries_end; ++entries_iter) {
+ if (lease_cr && !lease_cr->is_locked()) {
+ tn->log(1, "no lease or lease is lost, abort");
+ drain_all();
+ yield call(marker_tracker.flush());
+ if (retcode < 0) {
+ tn->log(0, SSTR("ERROR: incremental sync marker_tracker.flush() returned retcode=" << retcode));
+ return set_cr_error(retcode);
+ }
+ return set_cr_error(-ECANCELED);
+ }
+ entry = &(*entries_iter);
+ {
+ ssize_t p = entry->id.find('#'); /* entries might have explicit shard info in them, e.g., 6#00000000004.94.3 */
+ if (p < 0) {
+ cur_id = entry->id;
+ } else {
+ cur_id = entry->id.substr(p + 1);
+ }
+ }
+ sync_info.inc_marker.position = cur_id;
+
+ if (entry->op == RGWModifyOp::CLS_RGW_OP_SYNCSTOP || entry->op == RGWModifyOp::CLS_RGW_OP_RESYNC) {
+ ldpp_dout(dpp, 20) << "detected syncstop or resync on " << entries_iter->timestamp << ", skipping entry" << dendl;
+ marker_tracker.try_update_high_marker(cur_id, 0, entry->timestamp);
+ continue;
+ }
+
+ if (!key.set(rgw_obj_index_key{entry->object, entry->instance})) {
+ set_status() << "parse_raw_oid() on " << entry->object << " returned false, skipping entry";
+ tn->log(20, SSTR("parse_raw_oid() on " << entry->object << " returned false, skipping entry"));
+ marker_tracker.try_update_high_marker(cur_id, 0, entry->timestamp);
+ continue;
+ }
+
+ tn->log(20, SSTR("parsed entry: id=" << cur_id << " iter->object=" << entry->object << " iter->instance=" << entry->instance << " name=" << key.name << " instance=" << key.instance << " ns=" << key.ns));
+
+ if (!key.ns.empty()) {
+ set_status() << "skipping entry in namespace: " << entry->object;
+ tn->log(20, SSTR("skipping entry in namespace: " << entry->object));
+ marker_tracker.try_update_high_marker(cur_id, 0, entry->timestamp);
+ continue;
+ }
+
+ if (!check_key_handled(key)) {
+ set_status() << "skipping entry due to policy rules: " << entry->object;
+ tn->log(20, SSTR("skipping entry due to policy rules: " << entry->object));
+ marker_tracker.try_update_high_marker(cur_id, 0, entry->timestamp);
+ continue;
+ }
+
+ set_status() << "got entry.id=" << cur_id << " key=" << key << " op=" << (int)entry->op;
+ if (entry->op == CLS_RGW_OP_CANCEL) {
+ set_status() << "canceled operation, skipping";
+ tn->log(20, SSTR("skipping object: "
+ << bucket_shard_str{bs} << "/" << key << ": canceled operation"));
+ marker_tracker.try_update_high_marker(cur_id, 0, entry->timestamp);
+ continue;
+ }
+ if (entry->state != CLS_RGW_STATE_COMPLETE) {
+ set_status() << "non-complete operation, skipping";
+ tn->log(20, SSTR("skipping object: "
+ << bucket_shard_str{bs} << "/" << key << ": non-complete operation"));
+ marker_tracker.try_update_high_marker(cur_id, 0, entry->timestamp);
+ continue;
+ }
+ if (entry->zones_trace.exists(zone_id.id, target_location_key)) {
+ set_status() << "redundant operation, skipping";
+ tn->log(20, SSTR("skipping object: "
+ <<bucket_shard_str{bs} <<"/"<<key<<": redundant operation"));
+ marker_tracker.try_update_high_marker(cur_id, 0, entry->timestamp);
+ continue;
+ }
+ if (make_pair<>(entry->timestamp, entry->op) != squash_map[make_pair(entry->object, entry->instance)]) {
+ set_status() << "squashed operation, skipping";
+ tn->log(20, SSTR("skipping object: "
+ << bucket_shard_str{bs} << "/" << key << ": squashed operation"));
+ marker_tracker.try_update_high_marker(cur_id, 0, entry->timestamp);
+ continue;
+ }
+ tn->set_flag(RGW_SNS_FLAG_ACTIVE);
+ tn->log(20, SSTR("syncing object: "
+ << bucket_shard_str{bs} << "/" << key));
+ updated_status = false;
+ while (!marker_tracker.can_do_op(key, has_olh_epoch(entry->op))) {
+ if (!updated_status) {
+ set_status() << "can't do op, conflicting inflight operation";
+ updated_status = true;
+ }
+ tn->log(5, SSTR("can't do op on key=" << key << " need to wait for conflicting operation to complete"));
+ yield wait_for_child();
+ bool again = true;
+ while (again) {
+ again = collect(&ret, nullptr);
+ if (ret < 0) {
+ tn->log(0, SSTR("ERROR: a child operation returned error (ret=" << ret << ")"));
+ sync_status = ret;
+ /* we have reported this error */
+ }
+ }
+ if (sync_status != 0)
+ break;
+ }
+ if (sync_status != 0) {
+ /* get error, stop */
+ break;
+ }
+ if (!marker_tracker.index_key_to_marker(key, cur_id, has_olh_epoch(entry->op))) {
+ set_status() << "can't do op, sync already in progress for object";
+ tn->log(20, SSTR("skipping sync of entry: " << cur_id << ":" << key << " sync already in progress for object"));
+ marker_tracker.try_update_high_marker(cur_id, 0, entry->timestamp);
+ continue;
+ }
+ // yield {
+ set_status() << "start object sync";
+ if (!marker_tracker.start(cur_id, 0, entry->timestamp)) {
+ tn->log(0, SSTR("ERROR: cannot start syncing " << cur_id << ". Duplicate entry?"));
+ } else {
+ std::optional<uint64_t> versioned_epoch;
+ rgw_bucket_entry_owner owner(entry->owner, entry->owner_display_name);
+ if (entry->ver.pool < 0) {
+ versioned_epoch = entry->ver.epoch;
+ }
+ tn->log(20, SSTR("entry->timestamp=" << entry->timestamp));
+ using SyncCR = RGWBucketSyncSingleEntryCR<string, rgw_obj_key>;
+ spawn(new SyncCR(sc, sync_pipe, key,
+ entry->is_versioned(), versioned_epoch,
+ entry->timestamp, owner, entry->op, entry->state,
+ cur_id, &marker_tracker, entry->zones_trace, tn),
+ false);
+ }
+ // }
+ drain_with_cb(sc->lcc.adj_concurrency(cct->_conf->rgw_bucket_sync_spawn_window),
+ [&](uint64_t stack_id, int ret) {
+ if (ret < 0) {
+ tn->log(10, "a sync operation returned error");
+ sync_status = ret;
+ }
+ return 0;
+ });
+ }
+
+ } while (!list_result.empty() && sync_status == 0 && !syncstopped);
+
+ drain_all_cb([&](uint64_t stack_id, int ret) {
+ if (ret < 0) {
+ tn->log(10, "a sync operation returned error");
+ sync_status = ret;
+ }
+ return 0;
+ });
+ tn->unset_flag(RGW_SNS_FLAG_ACTIVE);
+
+ if (syncstopped) {
+ // transition to StateStopped in RGWSyncBucketShardCR. if sync is
+ // still disabled, we'll delete the sync status object. otherwise we'll
+ // restart full sync to catch any changes that happened while sync was
+ // disabled
+ sync_info.state = rgw_bucket_shard_sync_info::StateStopped;
+ return set_cr_done();
+ }
+
+ yield call(marker_tracker.flush());
+ if (retcode < 0) {
+ tn->log(0, SSTR("ERROR: incremental sync marker_tracker.flush() returned retcode=" << retcode));
+ return set_cr_error(retcode);
+ }
+ if (sync_status < 0) {
+ tn->log(10, SSTR("backing out with sync_status=" << sync_status));
+ return set_cr_error(sync_status);
+ }
+
+ if (!truncated && extended_result.next_log) {
+ yield call(new RGWBucketShardIsDoneCR(sc, bucket_status_obj, bs.shard_id, *extended_result.next_log, generation));
+ if (retcode < 0) {
+ ldout(cct, 20) << "failed to update bucket sync status: "
+ << cpp_strerror(retcode) << dendl;
+ return set_cr_error(retcode);
+ }
+ yield {
+ // delete the shard status object
+ auto status_obj = sync_env->svc->rados->obj(marker_tracker.get_obj());
+ retcode = status_obj.open(dpp);
+ if (retcode < 0) {
+ return set_cr_error(retcode);
+ }
+ call(new RGWRadosRemoveOidCR(sync_env->driver, std::move(status_obj)));
+ if (retcode < 0) {
+ ldpp_dout(dpp, 20) << "failed to remove shard status object: " << cpp_strerror(retcode) << dendl;
+ return set_cr_error(retcode);
+ }
+ }
+ }
+
+ return set_cr_done();
+ }
+ return 0;
+}
+
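+// resolve the set of sync pipes (peers) for a target bucket and/or a source
+// zone/bucket, filling in bucket info for any pipes that are missing it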
+class RGWGetBucketPeersCR : public RGWCoroutine {
+ RGWDataSyncEnv *sync_env;
+
+ std::optional<rgw_bucket> target_bucket;
+ std::optional<rgw_zone_id> source_zone;
+ std::optional<rgw_bucket> source_bucket;
+
+ rgw_sync_pipe_info_set *pipes;
+ map<rgw_bucket, all_bucket_info> buckets_info;
+ map<rgw_bucket, all_bucket_info>::iterator siiter;
+ std::optional<all_bucket_info> target_bucket_info;
+ std::optional<all_bucket_info> source_bucket_info;
+
+ rgw_sync_pipe_info_set::iterator siter;
+
+ std::shared_ptr<rgw_bucket_get_sync_policy_result> source_policy;
+ std::shared_ptr<rgw_bucket_get_sync_policy_result> target_policy;
+
+ RGWSyncTraceNodeRef tn;
+
+ using pipe_const_iter = map<rgw_zone_id, RGWBucketSyncFlowManager::pipe_set>::const_iterator;
+
+ static pair<pipe_const_iter, pipe_const_iter> get_pipe_iters(const map<rgw_zone_id, RGWBucketSyncFlowManager::pipe_set>& m, std::optional<rgw_zone_id> zone) {
+ if (!zone) {
+ return { m.begin(), m.end() };
+ }
+
+ auto b = m.find(*zone);
+ if (b == m.end()) {
+ return { b, b };
+ }
+ return { b, std::next(b) };
+ }
+
+ void filter_sources(std::optional<rgw_zone_id> source_zone,
+ std::optional<rgw_bucket> source_bucket,
+ const map<rgw_zone_id, RGWBucketSyncFlowManager::pipe_set>& all_sources,
+ rgw_sync_pipe_info_set *result) {
+ ldpp_dout(sync_env->dpp, 20) << __func__ << ": source_zone=" << source_zone.value_or(rgw_zone_id("*")).id
+ << " source_bucket=" << source_bucket.value_or(rgw_bucket())
+ << " all_sources.size()=" << all_sources.size() << dendl;
+ auto iters = get_pipe_iters(all_sources, source_zone);
+ for (auto i = iters.first; i != iters.second; ++i) {
+ for (auto& handler : i->second) {
+ if (!handler.specific()) {
+ ldpp_dout(sync_env->dpp, 20) << __func__ << ": pipe_handler=" << handler << ": skipping" << dendl;
+ continue;
+ }
+ if (source_bucket &&
+ !source_bucket->match(*handler.source.bucket)) {
+ continue;
+ }
+ ldpp_dout(sync_env->dpp, 20) << __func__ << ": pipe_handler=" << handler << ": adding" << dendl;
+ result->insert(handler, source_bucket_info, target_bucket_info);
+ }
+ }
+ }
+
+ void filter_targets(std::optional<rgw_zone_id> target_zone,
+ std::optional<rgw_bucket> target_bucket,
+ const map<rgw_zone_id, RGWBucketSyncFlowManager::pipe_set>& all_targets,
+ rgw_sync_pipe_info_set *result) {
+ ldpp_dout(sync_env->dpp, 20) << __func__ << ": target_zone=" << target_zone.value_or(rgw_zone_id("*")).id
+ << " target_bucket=" << target_bucket.value_or(rgw_bucket())
+ << " all_targets.size()=" << all_targets.size() << dendl;
+ auto iters = get_pipe_iters(all_targets, target_zone);
+ for (auto i = iters.first; i != iters.second; ++i) {
+ for (auto& handler : i->second) {
+ if (target_bucket &&
+ handler.dest.bucket &&
+ !target_bucket->match(*handler.dest.bucket)) {
+ ldpp_dout(sync_env->dpp, 20) << __func__ << ": pipe_handler=" << handler << ": skipping" << dendl;
+ continue;
+ }
+ ldpp_dout(sync_env->dpp, 20) << __func__ << ": pipe_handler=" << handler << ": adding" << dendl;
+ result->insert(handler, source_bucket_info, target_bucket_info);
+ }
+ }
+ }
+
+ void update_from_target_bucket_policy();
+ void update_from_source_bucket_policy();
+
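+ // async action that looks up bucket sync hint targets for the given source
+ // bucket via the bucket_sync service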
+ struct GetHintTargets : public RGWGenericAsyncCR::Action {
+ RGWDataSyncEnv *sync_env;
+ rgw_bucket source_bucket;
+ std::set<rgw_bucket> targets;
+
+ GetHintTargets(RGWDataSyncEnv *_sync_env,
+ const rgw_bucket& _source_bucket) : sync_env(_sync_env),
+ source_bucket(_source_bucket) {}
+ int operate() override {
+ int r = sync_env->svc->bucket_sync->get_bucket_sync_hints(sync_env->dpp,
+ source_bucket,
+ nullptr,
+ &targets,
+ null_yield);
+ if (r < 0) {
+ ldpp_dout(sync_env->dpp, 0) << "ERROR: " << __func__ << "(): failed to fetch bucket sync hints for bucket=" << source_bucket << dendl;
+ return r;
+ }
+
+ return 0;
+ }
+ };
+
+ std::shared_ptr<GetHintTargets> get_hint_targets_action;
+ std::set<rgw_bucket>::iterator hiter;
+
+public:
+ RGWGetBucketPeersCR(RGWDataSyncEnv *_sync_env,
+ std::optional<rgw_bucket> _target_bucket,
+ std::optional<rgw_zone_id> _source_zone,
+ std::optional<rgw_bucket> _source_bucket,
+ rgw_sync_pipe_info_set *_pipes,
+ const RGWSyncTraceNodeRef& _tn_parent)
+ : RGWCoroutine(_sync_env->cct),
+ sync_env(_sync_env),
+ target_bucket(_target_bucket),
+ source_zone(_source_zone),
+ source_bucket(_source_bucket),
+ pipes(_pipes),
+ tn(sync_env->sync_tracer->add_node(_tn_parent, "get_bucket_peers",
+ SSTR( "target=" << target_bucket.value_or(rgw_bucket())
+ << ":source=" << target_bucket.value_or(rgw_bucket())
+ << ":source_zone=" << source_zone.value_or(rgw_zone_id("*")).id))) {
+ }
+
+ int operate(const DoutPrefixProvider *dpp) override;
+};
+
+std::ostream& operator<<(std::ostream& out, std::optional<rgw_bucket_shard>& bs) {
+ if (!bs) {
+ out << "*";
+ } else {
+ out << *bs;
+ }
+ return out;
+}
+
+static RGWCoroutine* sync_bucket_shard_cr(RGWDataSyncCtx* sc,
+ boost::intrusive_ptr<const RGWContinuousLeaseCR> lease,
+ const rgw_bucket_sync_pair_info& sync_pair,
+ std::optional<uint64_t> gen,
+ const RGWSyncTraceNodeRef& tn,
+ ceph::real_time* progress);
+
+RGWRunBucketSourcesSyncCR::RGWRunBucketSourcesSyncCR(RGWDataSyncCtx *_sc,
+ boost::intrusive_ptr<const RGWContinuousLeaseCR> lease_cr,
+ const rgw_bucket_shard& source_bs,
+ const RGWSyncTraceNodeRef& _tn_parent,
+ std::optional<uint64_t> gen,
+ ceph::real_time* progress)
+ : RGWCoroutine(_sc->env->cct), sc(_sc), sync_env(_sc->env),
+ lease_cr(std::move(lease_cr)),
+ tn(sync_env->sync_tracer->add_node(
+ _tn_parent, "bucket_sync_sources",
+ SSTR( "source=" << source_bs << ":source_zone=" << sc->source_zone))),
+ progress(progress),
+ gen(gen)
+{
+ sync_pair.source_bs = source_bs;
+}
+
+int RGWRunBucketSourcesSyncCR::operate(const DoutPrefixProvider *dpp)
+{
+ reenter(this) {
+ yield call(new RGWGetBucketPeersCR(sync_env, std::nullopt, sc->source_zone,
+ sync_pair.source_bs.bucket, &pipes, tn));
+ if (retcode < 0 && retcode != -ENOENT) {
+ tn->log(0, SSTR("ERROR: failed to read sync status for bucket. error: " << retcode));
+ return set_cr_error(retcode);
+ }
+
+ ldpp_dout(dpp, 20) << __func__ << "(): requested source_bs=" << sync_pair.source_bs << dendl;
+
+ if (pipes.empty()) {
+ ldpp_dout(dpp, 20) << __func__ << "(): no relevant sync pipes found" << dendl;
+ return set_cr_done();
+ }
+
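+ // track a progress timestamp per pipe; the overall progress reported is
+ // the minimum across all pipes (see below)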
+ shard_progress.resize(pipes.size());
+ cur_shard_progress = shard_progress.begin();
+
+ for (siter = pipes.begin(); siter != pipes.end(); ++siter, ++cur_shard_progress) {
+ ldpp_dout(dpp, 20) << __func__ << "(): sync pipe=" << *siter << dendl;
+
+ sync_pair.dest_bucket = siter->target.get_bucket();
+ sync_pair.handler = siter->handler;
+
+ ldpp_dout(dpp, 20) << __func__ << "(): sync_pair=" << sync_pair << dendl;
+
+ yield_spawn_window(sync_bucket_shard_cr(sc, lease_cr, sync_pair,
+ gen, tn, &*cur_shard_progress),
+ sc->lcc.adj_concurrency(cct->_conf->rgw_bucket_sync_spawn_window),
+ [&](uint64_t stack_id, int ret) {
+ if (ret < 0) {
+ tn->log(10, SSTR("ERROR: a sync operation returned error: " << ret));
+ }
+ return ret;
+ });
+ }
+ drain_all_cb([&](uint64_t stack_id, int ret) {
+ if (ret < 0) {
+ tn->log(10, SSTR("a sync operation returned error: " << ret));
+ }
+ return ret;
+ });
+ if (progress) {
+ *progress = *std::min_element(shard_progress.begin(), shard_progress.end());
+ }
+ return set_cr_done();
+ }
+
+ return 0;
+}
+
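+// read the local bucket instance info; on -ENOENT, pull it in through
+// metadata sync from the master zone and retry the read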
+class RGWSyncGetBucketInfoCR : public RGWCoroutine {
+ RGWDataSyncEnv *sync_env;
+ rgw_bucket bucket;
+ RGWBucketInfo *pbucket_info;
+ map<string, bufferlist> *pattrs;
+ RGWMetaSyncEnv meta_sync_env;
+
+ RGWSyncTraceNodeRef tn;
+
+public:
+ RGWSyncGetBucketInfoCR(RGWDataSyncEnv *_sync_env,
+ const rgw_bucket& _bucket,
+ RGWBucketInfo *_pbucket_info,
+ map<string, bufferlist> *_pattrs,
+ const RGWSyncTraceNodeRef& _tn_parent)
+ : RGWCoroutine(_sync_env->cct),
+ sync_env(_sync_env),
+ bucket(_bucket),
+ pbucket_info(_pbucket_info),
+ pattrs(_pattrs),
+ tn(sync_env->sync_tracer->add_node(_tn_parent, "get_bucket_info",
+ SSTR(bucket))) {
+ }
+
+ int operate(const DoutPrefixProvider *dpp) override;
+};
+
+int RGWSyncGetBucketInfoCR::operate(const DoutPrefixProvider *dpp)
+{
+ reenter(this) {
+ yield call(new RGWGetBucketInstanceInfoCR(sync_env->async_rados, sync_env->driver, bucket, pbucket_info, pattrs, dpp));
+ if (retcode == -ENOENT) {
+ /* bucket instance info has not been synced in yet, fetch it now */
+ yield {
+ tn->log(10, SSTR("no local info for bucket:" << ": fetching metadata"));
+ string raw_key = string("bucket.instance:") + bucket.get_key();
+
+ meta_sync_env.init(dpp, cct, sync_env->driver, sync_env->svc->zone->get_master_conn(), sync_env->async_rados,
+ sync_env->http_manager, sync_env->error_logger, sync_env->sync_tracer);
+
+ call(new RGWMetaSyncSingleEntryCR(&meta_sync_env, raw_key,
+ string() /* no marker */,
+ MDLOG_STATUS_COMPLETE,
+ NULL /* no marker tracker */,
+ tn));
+ }
+ if (retcode < 0) {
+ tn->log(0, SSTR("ERROR: failed to fetch bucket instance info for " << bucket_str{bucket}));
+ return set_cr_error(retcode);
+ }
+
+ yield call(new RGWGetBucketInstanceInfoCR(sync_env->async_rados, sync_env->driver, bucket, pbucket_info, pattrs, dpp));
+ }
+ if (retcode < 0) {
+ tn->log(0, SSTR("ERROR: failed to retrieve bucket info for bucket=" << bucket_str{bucket}));
+ return set_cr_error(retcode);
+ }
+
+ return set_cr_done();
+ }
+
+ return 0;
+}
+
+void RGWGetBucketPeersCR::update_from_target_bucket_policy()
+{
+ if (!target_policy ||
+ !target_policy->policy_handler ||
+ !pipes) {
+ return;
+ }
+
+ auto handler = target_policy->policy_handler.get();
+
+ filter_sources(source_zone,
+ source_bucket,
+ handler->get_sources(),
+ pipes);
+
+ for (siter = pipes->begin(); siter != pipes->end(); ++siter) {
+ if (!siter->source.has_bucket_info()) {
+ buckets_info.emplace(siter->source.get_bucket(), all_bucket_info());
+ }
+ if (!siter->target.has_bucket_info()) {
+ buckets_info.emplace(siter->target.get_bucket(), all_bucket_info());
+ }
+ }
+}
+
+void RGWGetBucketPeersCR::update_from_source_bucket_policy()
+{
+ if (!source_policy ||
+ !source_policy->policy_handler ||
+ !pipes) {
+ return;
+ }
+
+ auto handler = source_policy->policy_handler.get();
+
+ filter_targets(sync_env->svc->zone->get_zone().id,
+ target_bucket,
+ handler->get_targets(),
+ pipes);
+
+ for (siter = pipes->begin(); siter != pipes->end(); ++siter) {
+ if (!siter->source.has_bucket_info()) {
+ buckets_info.emplace(siter->source.get_bucket(), all_bucket_info());
+ }
+ if (!siter->target.has_bucket_info()) {
+ buckets_info.emplace(siter->target.get_bucket(), all_bucket_info());
+ }
+ }
+}
+
+
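+// fetch the bucket's sync policy handler; if the bucket instance is not
+// known locally yet, trigger metadata sync of it and retry once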
+class RGWSyncGetBucketSyncPolicyHandlerCR : public RGWCoroutine {
+ RGWDataSyncEnv *sync_env;
+ rgw_bucket bucket;
+ rgw_bucket_get_sync_policy_params get_policy_params;
+
+ std::shared_ptr<rgw_bucket_get_sync_policy_result> policy;
+
+ RGWSyncTraceNodeRef tn;
+
+ int i;
+
+public:
+ RGWSyncGetBucketSyncPolicyHandlerCR(RGWDataSyncEnv *_sync_env,
+ std::optional<rgw_zone_id> zone,
+ const rgw_bucket& _bucket,
+ std::shared_ptr<rgw_bucket_get_sync_policy_result>& _policy,
+ const RGWSyncTraceNodeRef& _tn_parent)
+ : RGWCoroutine(_sync_env->cct),
+ sync_env(_sync_env),
+ bucket(_bucket),
+ policy(_policy),
+ tn(sync_env->sync_tracer->add_node(_tn_parent, "get_sync_policy_handler",
+ SSTR(bucket))) {
+ get_policy_params.zone = zone;
+ get_policy_params.bucket = bucket;
+ }
+
+ int operate(const DoutPrefixProvider *dpp) override {
+ reenter(this) {
+ for (i = 0; i < 2; ++i) {
+ yield call(new RGWBucketGetSyncPolicyHandlerCR(sync_env->async_rados,
+ sync_env->driver,
+ get_policy_params,
+ policy,
+ dpp));
+ if (retcode < 0 &&
+ retcode != -ENOENT) {
+ return set_cr_error(retcode);
+ }
+
+ if (retcode == 0) {
+ return set_cr_done();
+ }
+
+ /* the bucket instance was not found; try to fetch the bucket
+ * instance info, which can trigger metadata sync of the bucket
+ * instance
+ */
+ yield call(new RGWSyncGetBucketInfoCR(sync_env,
+ bucket,
+ nullptr,
+ nullptr,
+ tn));
+ if (retcode < 0) {
+ return set_cr_error(retcode);
+ }
+ }
+ }
+
+ return 0;
+ }
+};
+
+
+int RGWGetBucketPeersCR::operate(const DoutPrefixProvider *dpp)
+{
+ reenter(this) {
+ if (pipes) {
+ pipes->clear();
+ }
+ if (target_bucket) {
+ target_policy = make_shared<rgw_bucket_get_sync_policy_result>();
+ yield call(new RGWSyncGetBucketSyncPolicyHandlerCR(sync_env,
+ nullopt,
+ *target_bucket,
+ target_policy,
+ tn));
+ if (retcode < 0 &&
+ retcode != -ENOENT) {
+ return set_cr_error(retcode);
+ }
+
+ update_from_target_bucket_policy();
+ }
+
+ if (source_bucket && source_zone) {
+ source_policy = make_shared<rgw_bucket_get_sync_policy_result>();
+ yield call(new RGWSyncGetBucketSyncPolicyHandlerCR(sync_env,
+ source_zone,
+ *source_bucket,
+ source_policy,
+ tn));
+ if (retcode < 0 &&
+ retcode != -ENOENT) {
+ return set_cr_error(retcode);
+ }
+
+ if (source_policy->policy_handler) {
+ auto& opt_bucket_info = source_policy->policy_handler->get_bucket_info();
+ auto& opt_attrs = source_policy->policy_handler->get_bucket_attrs();
+ if (opt_bucket_info && opt_attrs) {
+ source_bucket_info.emplace();
+ source_bucket_info->bucket_info = *opt_bucket_info;
+ source_bucket_info->attrs = *opt_attrs;
+ }
+ }
+
+ if (!target_bucket) {
+ get_hint_targets_action = make_shared<GetHintTargets>(sync_env, *source_bucket);
+
+ yield call(new RGWGenericAsyncCR(cct, sync_env->async_rados,
+ get_hint_targets_action));
+ if (retcode < 0) {
+ return set_cr_error(retcode);
+ }
+
+ /* hints might have incomplete bucket ids,
+ * in which case we need to figure out the current
+ * bucket_id
+ */
+ for (hiter = get_hint_targets_action->targets.begin();
+ hiter != get_hint_targets_action->targets.end();
+ ++hiter) {
+ ldpp_dout(dpp, 20) << "Got sync hint for bucket=" << *source_bucket << ": " << hiter->get_key() << dendl;
+
+ target_policy = make_shared<rgw_bucket_get_sync_policy_result>();
+ yield call(new RGWSyncGetBucketSyncPolicyHandlerCR(sync_env,
+ nullopt,
+ *hiter,
+ target_policy,
+ tn));
+ if (retcode < 0 &&
+ retcode != -ENOENT) {
+ return set_cr_error(retcode);
+ }
+ update_from_target_bucket_policy();
+ }
+ }
+ }
+
+ update_from_source_bucket_policy();
+
+ for (siiter = buckets_info.begin(); siiter != buckets_info.end(); ++siiter) {
+ if (siiter->second.bucket_info.bucket.name.empty()) {
+ yield call(new RGWSyncGetBucketInfoCR(sync_env, siiter->first,
+ &siiter->second.bucket_info,
+ &siiter->second.attrs,
+ tn));
+ }
+ }
+
+ if (pipes) {
+ pipes->update_empty_bucket_info(buckets_info);
+ }
+
+ return set_cr_done();
+ }
+
+ return 0;
+}
+
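+// run incremental sync for a single bucket shard within the current generation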
+class RGWSyncBucketShardCR : public RGWCoroutine {
+ RGWDataSyncCtx *sc;
+ RGWDataSyncEnv *sync_env;
+ boost::intrusive_ptr<const RGWContinuousLeaseCR> lease_cr;
+ rgw_bucket_sync_pair_info sync_pair;
+ rgw_bucket_sync_pipe& sync_pipe;
+ bool& bucket_stopped;
+ uint64_t generation;
+ ceph::real_time* progress;
+
+ const std::string shard_status_oid;
+ const rgw_raw_obj bucket_status_obj;
+ rgw_bucket_shard_sync_info sync_status;
+ RGWObjVersionTracker objv_tracker;
+
+ RGWSyncTraceNodeRef tn;
+
+public:
+ RGWSyncBucketShardCR(RGWDataSyncCtx *_sc,
+ boost::intrusive_ptr<const RGWContinuousLeaseCR> lease_cr,
+ const rgw_bucket_sync_pair_info& _sync_pair,
+ rgw_bucket_sync_pipe& sync_pipe,
+ bool& bucket_stopped,
+ uint64_t generation,
+ const RGWSyncTraceNodeRef& tn,
+ ceph::real_time* progress)
+ : RGWCoroutine(_sc->cct), sc(_sc), sync_env(_sc->env),
+ lease_cr(std::move(lease_cr)), sync_pair(_sync_pair),
+ sync_pipe(sync_pipe), bucket_stopped(bucket_stopped), generation(generation), progress(progress),
+ shard_status_oid(RGWBucketPipeSyncStatusManager::inc_status_oid(sc->source_zone, sync_pair, generation)),
+ bucket_status_obj(sc->env->svc->zone->get_zone_params().log_pool,
+ RGWBucketPipeSyncStatusManager::full_status_oid(sc->source_zone,
+ sync_pair.source_bs.bucket,
+ sync_pair.dest_bucket)),
+ tn(tn) {
+ }
+
+ int operate(const DoutPrefixProvider *dpp) override;
+};
+
+int RGWSyncBucketShardCR::operate(const DoutPrefixProvider *dpp)
+{
+ reenter(this) {
+ objv_tracker.clear();
+ yield call(new RGWReadBucketPipeSyncStatusCoroutine(sc, sync_pair, &sync_status, &objv_tracker, generation));
+ if (retcode < 0 && retcode != -ENOENT) {
+ tn->log(0, SSTR("ERROR: failed to read sync status for bucket. error: " << retcode));
+ return set_cr_error(retcode);
+ }
+
+ tn->log(20, SSTR("sync status for source bucket shard: " << sync_status.state));
+ sync_status.state = rgw_bucket_shard_sync_info::StateIncrementalSync;
+ if (progress) {
+ *progress = sync_status.inc_marker.timestamp;
+ }
+
+ yield call(new RGWBucketShardIncrementalSyncCR(sc, sync_pipe,
+ shard_status_oid, bucket_status_obj, lease_cr,
+ sync_status, generation, tn,
+ objv_tracker, progress));
+ if (retcode < 0) {
+ tn->log(5, SSTR("incremental sync on bucket failed, retcode=" << retcode));
+ return set_cr_error(retcode);
+ }
+
+ if (sync_status.state == rgw_bucket_shard_sync_info::StateStopped) {
+ tn->log(20, SSTR("syncstopped indication for source bucket shard"));
+ bucket_stopped = true;
+ }
+
+ return set_cr_done();
+ }
+
+ return 0;
+}
+
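+// top-level per-bucket-shard sync: manages the bucket-wide sync status object,
+// handles stop/start transitions and full sync init, then runs incremental sync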
+class RGWSyncBucketCR : public RGWCoroutine {
+ RGWDataSyncCtx *sc;
+ RGWDataSyncEnv *env;
+ boost::intrusive_ptr<const RGWContinuousLeaseCR> data_lease_cr;
+ boost::intrusive_ptr<RGWContinuousLeaseCR> bucket_lease_cr;
+ rgw_bucket_sync_pair_info sync_pair;
+ rgw_bucket_sync_pipe sync_pipe;
+ std::optional<uint64_t> gen;
+ ceph::real_time* progress;
+
+ const std::string lock_name = "bucket sync";
+ const uint32_t lock_duration;
+ const rgw_raw_obj status_obj;
+ rgw_bucket_sync_status bucket_status;
+ bool bucket_stopped = false;
+ RGWObjVersionTracker objv;
+ bool init_check_compat = false;
+ rgw_bucket_index_marker_info info;
+ rgw_raw_obj error_repo;
+ rgw_bucket_shard source_bs;
+ rgw_pool pool;
+ uint64_t current_gen = 0;
+
+ RGWSyncTraceNodeRef tn;
+
+public:
+ RGWSyncBucketCR(RGWDataSyncCtx *_sc,
+ boost::intrusive_ptr<const RGWContinuousLeaseCR> lease_cr,
+ const rgw_bucket_sync_pair_info& _sync_pair,
+ std::optional<uint64_t> gen,
+ const RGWSyncTraceNodeRef& _tn_parent,
+ ceph::real_time* progress)
+ : RGWCoroutine(_sc->cct), sc(_sc), env(_sc->env),
+ data_lease_cr(std::move(lease_cr)), sync_pair(_sync_pair),
+ gen(gen), progress(progress),
+ lock_duration(cct->_conf->rgw_sync_lease_period),
+ status_obj(env->svc->zone->get_zone_params().log_pool,
+ RGWBucketPipeSyncStatusManager::full_status_oid(sc->source_zone,
+ sync_pair.source_bs.bucket,
+ sync_pair.dest_bucket)),
+ tn(env->sync_tracer->add_node(_tn_parent, "bucket",
+ SSTR(bucket_str{_sync_pair.dest_bucket} << "<-" << bucket_shard_str{_sync_pair.source_bs} ))) {
+ }
+
+ int operate(const DoutPrefixProvider *dpp) override;
+};
+
+static RGWCoroutine* sync_bucket_shard_cr(RGWDataSyncCtx* sc,
+ boost::intrusive_ptr<const RGWContinuousLeaseCR> lease,
+ const rgw_bucket_sync_pair_info& sync_pair,
+ std::optional<uint64_t> gen,
+ const RGWSyncTraceNodeRef& tn,
+ ceph::real_time* progress)
+{
+ return new RGWSyncBucketCR(sc, std::move(lease), sync_pair,
+ gen, tn, progress);
+}
+
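+// tell the bucket-wide lease coroutine to shut down, drain spawned stacks,
+// and drop the reference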
+#define RELEASE_LOCK(cr) \
+ if (cr) {cr->go_down(); drain_all(); cr.reset();}
+
+int RGWSyncBucketCR::operate(const DoutPrefixProvider *dpp)
+{
+ reenter(this) {
+ // read source/destination bucket info
+ yield call(new RGWSyncGetBucketInfoCR(env, sync_pair.source_bs.bucket, &sync_pipe.source_bucket_info,
+ &sync_pipe.source_bucket_attrs, tn));
+ if (retcode < 0) {
+ tn->log(0, SSTR("ERROR: failed to retrieve bucket info for bucket=" << bucket_str{sync_pair.source_bs.bucket}));
+ return set_cr_error(retcode);
+ }
+
+ yield call(new RGWSyncGetBucketInfoCR(env, sync_pair.dest_bucket, &sync_pipe.dest_bucket_info,
+ &sync_pipe.dest_bucket_attrs, tn));
+ if (retcode < 0) {
+ tn->log(0, SSTR("ERROR: failed to retrieve bucket info for bucket=" << bucket_str{sync_pair.source_bs.bucket}));
+ return set_cr_error(retcode);
+ }
+
+ sync_pipe.info = sync_pair;
+
+ // read bucket sync status
+ using ReadCR = RGWSimpleRadosReadCR<rgw_bucket_sync_status>;
+ using WriteCR = RGWSimpleRadosWriteCR<rgw_bucket_sync_status>;
+
+ objv.clear();
+ yield call(new ReadCR(dpp, env->driver,
+ status_obj, &bucket_status, false, &objv));
+ if (retcode == -ENOENT) {
+ // if the full sync status object didn't exist yet, run the backward
+ // compatibility logic in InitBucketFullSyncStatusCR below. if it did
+ // exist, a `bucket sync init` probably requested its re-initialization,
+ // and shouldn't try to resume incremental sync
+ init_check_compat = true;
+
+ // use exclusive create to set state=Init
+ objv.generate_new_write_ver(cct);
+ yield call(new WriteCR(dpp, env->driver, status_obj, bucket_status, &objv, true));
+ tn->log(20, "bucket status object does not exist, create a new one");
+ if (retcode == -EEXIST) {
+ // raced with another create, read its status
+ tn->log(20, "raced with another create, read its status");
+ objv.clear();
+ yield call(new ReadCR(dpp, env->driver,
+ status_obj, &bucket_status, false, &objv));
+ }
+ }
+ if (retcode < 0) {
+ tn->log(20, SSTR("ERROR: failed to read bucket status object. error: " << retcode));
+ return set_cr_error(retcode);
+ }
+
+ do {
+ tn->log(20, SSTR("sync status for source bucket: " << bucket_status.state <<
+ ". lease is: " << (bucket_lease_cr ? "taken" : "not taken") << ". stop indications is: " << bucket_stopped));
+
+ if (bucket_status.state != BucketSyncState::Incremental ||
+ bucket_stopped) {
+
+ if (!bucket_lease_cr) {
+ bucket_lease_cr.reset(new RGWContinuousLeaseCR(env->async_rados, env->driver, status_obj,
+ lock_name, lock_duration, this, &sc->lcc));
+ yield spawn(bucket_lease_cr.get(), false);
+ while (!bucket_lease_cr->is_locked()) {
+ if (bucket_lease_cr->is_done()) {
+ tn->log(5, "failed to take lease");
+ set_status("lease lock failed, early abort");
+ drain_all();
+ return set_cr_error(bucket_lease_cr->get_ret_status());
+ }
+ tn->log(5, "waiting on bucket lease");
+ yield set_sleeping(true);
+ }
+ }
+
+ // if state is Init or Stopped, we query the remote RGW for its state
+ yield call(new RGWReadRemoteBucketIndexLogInfoCR(sc, sync_pair.source_bs.bucket, &info));
+ if (retcode < 0) {
+ RELEASE_LOCK(bucket_lease_cr);
+ return set_cr_error(retcode);
+ }
+ if (info.syncstopped) {
+ // remote indicates stopped state
+ tn->log(20, "remote bilog indicates that sync was stopped");
+
+ // if state was incremental, remove all per-shard status objects
+ if (bucket_status.state == BucketSyncState::Incremental) {
+ yield {
+ const auto num_shards = bucket_status.shards_done_with_gen.size();
+ const auto gen = bucket_status.incremental_gen;
+ call(new RemoveBucketShardStatusCollectCR(sc, sync_pair, gen, num_shards));
+ }
+ }
+
+ // check if local state is "stopped"
+ objv.clear();
+ yield call(new ReadCR(dpp, env->driver,
+ status_obj, &bucket_status, false, &objv));
+ if (retcode < 0) {
+ tn->log(20, SSTR("ERROR: failed to read status before writing 'stopped'. error: " << retcode));
+ RELEASE_LOCK(bucket_lease_cr);
+ return set_cr_error(retcode);
+ }
+ if (bucket_status.state != BucketSyncState::Stopped) {
+ // make sure that state is changed to stopped locally
+ bucket_status.state = BucketSyncState::Stopped;
+ yield call(new WriteCR(dpp, env->driver, status_obj, bucket_status,
+ &objv, false));
+ if (retcode < 0) {
+ tn->log(20, SSTR("ERROR: failed to write 'stopped' status. error: " << retcode));
+ RELEASE_LOCK(bucket_lease_cr);
+ return set_cr_error(retcode);
+ }
+ }
+ RELEASE_LOCK(bucket_lease_cr);
+ return set_cr_done();
+ }
+ if (bucket_stopped) {
+ tn->log(20, SSTR("ERROR: switched from 'stop' to 'start' sync. while state is: " << bucket_status.state));
+ bucket_stopped = false;
+ bucket_status.state = BucketSyncState::Init;
+ }
+ }
+
+ if (bucket_status.state != BucketSyncState::Incremental) {
+ // if the state wasn't Incremental, take a bucket-wide lease to prevent
+ // different shards from duplicating the init and full sync
+ if (!bucket_lease_cr) {
+ bucket_lease_cr.reset(new RGWContinuousLeaseCR(env->async_rados, env->driver, status_obj,
+ lock_name, lock_duration, this, &sc->lcc));
+ yield spawn(bucket_lease_cr.get(), false);
+ while (!bucket_lease_cr->is_locked()) {
+ if (bucket_lease_cr->is_done()) {
+ tn->log(5, "failed to take lease");
+ set_status("lease lock failed, early abort");
+ drain_all();
+ return set_cr_error(bucket_lease_cr->get_ret_status());
+ }
+ tn->log(5, "waiting on bucket lease");
+ yield set_sleeping(true);
+ }
+ }
+
+ // reread the status after acquiring the lock
+ objv.clear();
+ yield call(new ReadCR(dpp, env->driver, status_obj,
+ &bucket_status, false, &objv));
+ if (retcode < 0) {
+ RELEASE_LOCK(bucket_lease_cr);
+ tn->log(20, SSTR("ERROR: reading the status after acquiring the lock failed. error: " << retcode));
+ return set_cr_error(retcode);
+ }
+ tn->log(20, SSTR("status after acquiring the lock is: " << bucket_status.state));
+
+ yield call(new InitBucketFullSyncStatusCR(sc, sync_pair, status_obj,
+ bucket_status, objv,
+ sync_pipe.source_bucket_info,
+ init_check_compat, info));
+
+ if (retcode < 0) {
+ tn->log(20, SSTR("ERROR: init full sync failed. error: " << retcode));
+ RELEASE_LOCK(bucket_lease_cr);
+ return set_cr_error(retcode);
+ }
+ }
+
+ assert(bucket_status.state == BucketSyncState::Incremental ||
+ bucket_status.state == BucketSyncState::Full);
+
+ if (bucket_status.state == BucketSyncState::Full) {
+ assert(bucket_lease_cr);
+ yield call(new RGWBucketFullSyncCR(sc, sync_pipe, status_obj,
+ bucket_lease_cr, bucket_status,
+ tn, objv));
+ if (retcode < 0) {
+ tn->log(20, SSTR("ERROR: full sync failed. error: " << retcode));
+ RELEASE_LOCK(bucket_lease_cr);
+ return set_cr_error(retcode);
+ }
+ }
+
+ if (bucket_status.state == BucketSyncState::Incremental) {
+ // lease not required for incremental sync
+ RELEASE_LOCK(bucket_lease_cr);
+
+ assert(sync_pair.source_bs.shard_id >= 0);
+ // if a specific gen was requested, compare that to the sync status
+ if (gen) {
+ current_gen = bucket_status.incremental_gen;
+ source_bs = sync_pair.source_bs;
+ if (*gen > current_gen) {
+ /* If the data log entry for the previous generation is missing, that
+ * shard may never be marked complete and the sync can get stuck. To
+ * avoid this, add this (shard_id, gen) to the error repo to force a
+ * sync and mark that shard as completed.
+ */
+ pool = sc->env->svc->zone->get_zone_params().log_pool;
+ if ((static_cast<std::size_t>(source_bs.shard_id) < bucket_status.shards_done_with_gen.size()) &&
+ !bucket_status.shards_done_with_gen[source_bs.shard_id]) {
+ // use the error repo and sync status timestamp from the datalog shard corresponding to source_bs
+ error_repo = datalog_oid_for_error_repo(sc, sc->env->driver,
+ pool, source_bs);
+ yield call(rgw::error_repo::write_cr(sc->env->driver->svc()->rados, error_repo,
+ rgw::error_repo::encode_key(source_bs, current_gen),
+ ceph::real_clock::zero()));
+ if (retcode < 0) {
+ tn->log(0, SSTR("ERROR: failed to log prev gen entry (bucket=" << source_bs.bucket << ", shard_id=" << source_bs.shard_id << ", gen=" << current_gen << " in error repo: retcode=" << retcode));
+ } else {
+ tn->log(20, SSTR("logged prev gen entry (bucket=" << source_bs.bucket << ", shard_id=" << source_bs.shard_id << ", gen=" << current_gen << " in error repo: retcode=" << retcode));
+ }
+ }
+ retcode = -EAGAIN;
+ tn->log(10, SSTR("ERROR: requested sync of future generation "
+ << *gen << " > " << current_gen
+ << ", returning " << retcode << " for later retry"));
+ return set_cr_error(retcode);
+ } else if (*gen < current_gen) {
+ tn->log(10, SSTR("WARNING: requested sync of past generation "
+ << *gen << " < " << current_gen
+ << ", returning success"));
+ return set_cr_done();
+ }
+ }
+
+ if (static_cast<std::size_t>(sync_pair.source_bs.shard_id) >= bucket_status.shards_done_with_gen.size()) {
+ tn->log(1, SSTR("bucket shard " << sync_pair.source_bs << " index out of bounds"));
+ return set_cr_done(); // return success so we don't retry
+ }
+ if (bucket_status.shards_done_with_gen[sync_pair.source_bs.shard_id]) {
+ tn->log(10, SSTR("bucket shard " << sync_pair.source_bs << " of gen " <<
+ gen << " already synced."));
+ return set_cr_done();
+ }
+
+ yield call(new RGWSyncBucketShardCR(sc, data_lease_cr, sync_pair,
+ sync_pipe, bucket_stopped,
+ bucket_status.incremental_gen, tn, progress));
+ if (retcode < 0) {
+ tn->log(20, SSTR("ERROR: incremental sync failed. error: " << retcode));
+ return set_cr_error(retcode);
+ }
+ }
+ // loop back to previous states unless incremental sync returns normally
+ } while (bucket_status.state != BucketSyncState::Incremental || bucket_stopped);
+
+ return set_cr_done();
+ }
+
+ return 0;
+}
+
+int RGWBucketPipeSyncStatusManager::do_init(const DoutPrefixProvider *dpp,
+ std::ostream* ostr)
+{
+ int ret = http_manager.start();
+ if (ret < 0) {
+ ldpp_dout(this, 0) << "failed in http_manager.start() ret=" << ret << dendl;
+ return ret;
+ }
+
+ sync_module.reset(new RGWDefaultSyncModuleInstance());
+ auto async_rados = driver->svc()->rados->get_async_processor();
+
+ sync_env.init(this, driver->ctx(), driver,
+ driver->svc(), async_rados, &http_manager,
+ error_logger.get(), driver->getRados()->get_sync_tracer(),
+ sync_module, nullptr);
+
+ sync_env.ostr = ostr;
+
+ rgw_sync_pipe_info_set pipes;
+
+ ret = cr_mgr.run(dpp, new RGWGetBucketPeersCR(&sync_env,
+ dest_bucket,
+ source_zone,
+ source_bucket,
+ &pipes,
+ sync_env.sync_tracer->root_node));
+ if (ret < 0) {
+ ldpp_dout(this, 0) << "failed to get bucket source peers info: (ret=" << ret << "): " << cpp_strerror(-ret) << dendl;
+ return ret;
+ }
+
+ if (pipes.empty()) {
+ ldpp_dout(this, 0) << "No peers. This is not a valid multisite configuration." << dendl;
+ return -EINVAL;
+ }
+
+ for (auto& pipe : pipes) {
+ auto& szone = pipe.source.zone;
+
+ auto conn = driver->svc()->zone->get_zone_conn(szone);
+ if (!conn) {
+ ldpp_dout(this, 0) << "connection object to zone " << szone << " does not exist" << dendl;
+ return -EINVAL;
+ }
+
+ RGWZone* z;
+ if (!(z = driver->svc()->zone->find_zone(szone))) {
+ ldpp_dout(this, 0) << "zone " << szone << " does not exist" << dendl;
+ return -EINVAL;
+ }
+ sources.emplace_back(&sync_env, szone, conn,
+ pipe.source.get_bucket_info(),
+ pipe.target.get_bucket(),
+ pipe.handler, z->name);
+ }
+
+ return 0;
+}
+
+int RGWBucketPipeSyncStatusManager::remote_info(const DoutPrefixProvider *dpp,
+ source& s,
+ uint64_t* oldest_gen,
+ uint64_t* latest_gen,
+ uint64_t* num_shards)
+{
+ rgw_bucket_index_marker_info remote_info;
+ BucketIndexShardsManager remote_markers;
+ auto r = rgw_read_remote_bilog_info(dpp, s.sc.conn, s.info.bucket,
+ remote_info, remote_markers,
+ null_yield);
+
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " rgw_read_remote_bilog_info: r="
+ << r << dendl;
+ return r;
+ }
+ if (oldest_gen)
+ *oldest_gen = remote_info.oldest_gen;
+
+ if (latest_gen)
+ *latest_gen = remote_info.latest_gen;
+
+ if (num_shards)
+ *num_shards = remote_markers.get().size();
+
+ return 0;
+}
+
+tl::expected<std::unique_ptr<RGWBucketPipeSyncStatusManager>, int>
+RGWBucketPipeSyncStatusManager::construct(
+ const DoutPrefixProvider* dpp,
+ rgw::sal::RadosStore* driver,
+ std::optional<rgw_zone_id> source_zone,
+ std::optional<rgw_bucket> source_bucket,
+ const rgw_bucket& dest_bucket,
+ std::ostream* ostr)
+{
+ std::unique_ptr<RGWBucketPipeSyncStatusManager> self{
+ new RGWBucketPipeSyncStatusManager(driver, source_zone, source_bucket,
+ dest_bucket)};
+ auto r = self->do_init(dpp, ostr);
+ if (r < 0) {
+ return tl::unexpected(r);
+ }
+ return self;
+}
+
+int RGWBucketPipeSyncStatusManager::init_sync_status(
+ const DoutPrefixProvider *dpp)
+{
+ // Just running one at a time saves us from buildup/teardown and in
+ // practice we only do one zone at a time.
+ for (auto& source : sources) {
+ list<RGWCoroutinesStack*> stacks;
+ RGWCoroutinesStack *stack = new RGWCoroutinesStack(driver->ctx(), &cr_mgr);
+ pretty_print(source.sc.env, "Initializing sync state of bucket {} with zone {}.\n",
+ source.info.bucket.name, source.zone_name);
+ stack->call(new RGWSimpleRadosWriteCR<rgw_bucket_sync_status>(
+ dpp, source.sc.env->driver,
+ {sync_env.svc->zone->get_zone_params().log_pool,
+ full_status_oid(source.sc.source_zone,
+ source.info.bucket,
+ source.dest)},
+ rgw_bucket_sync_status{}));
+ stacks.push_back(stack);
+ auto r = cr_mgr.run(dpp, stacks);
+ if (r < 0) {
+ pretty_print(source.sc.env,
+ "Initialization of sync state for bucket {} with zone {} "
+ "failed with error {}\n",
+ source.info.bucket.name, source.zone_name, cpp_strerror(r));
+ }
+ }
+ return 0;
+}
+
+tl::expected<std::map<int, rgw_bucket_shard_sync_info>, int>
+RGWBucketPipeSyncStatusManager::read_sync_status(
+ const DoutPrefixProvider *dpp)
+{
+ std::map<int, rgw_bucket_shard_sync_info> sync_status;
+ list<RGWCoroutinesStack *> stacks;
+
+ auto sz = sources.begin();
+
+ if (source_zone) {
+ sz = std::find_if(sources.begin(), sources.end(),
+ [this](const source& s) {
+ return s.sc.source_zone == *source_zone;
+ }
+ );
+ if (sz == sources.end()) {
+ ldpp_dout(this, 0) << "ERROR: failed to find source zone: "
+ << *source_zone << dendl;
+ return tl::unexpected(-ENOENT);
+ }
+ } else {
+ ldpp_dout(this, 5) << "No source zone specified; cannot read sync status without one"
+ << dendl;
+ return tl::unexpected(-ENOENT);
+ }
+ uint64_t num_shards, latest_gen;
+ auto ret = remote_info(dpp, *sz, nullptr, &latest_gen, &num_shards);
+ if (ret < 0) {
+ ldpp_dout(this, 5) << "Unable to get remote info: "
+ << ret << dendl;
+ return tl::unexpected(ret);
+ }
+ auto stack = new RGWCoroutinesStack(driver->ctx(), &cr_mgr);
+ std::vector<rgw_bucket_sync_pair_info> pairs(num_shards);
+ for (auto shard = 0u; shard < num_shards; ++shard) {
+ auto& pair = pairs[shard];
+ pair.source_bs.bucket = sz->info.bucket;
+ pair.dest_bucket = sz->dest;
+ pair.source_bs.shard_id = shard;
+ stack->call(new RGWReadBucketPipeSyncStatusCoroutine(
+ &sz->sc, pair, &sync_status[shard],
+ nullptr, latest_gen));
+ }
+
+ stacks.push_back(stack);
+
+ ret = cr_mgr.run(dpp, stacks);
+ if (ret < 0) {
+ ldpp_dout(this, 0) << "ERROR: failed to read sync status for "
+ << bucket_str{dest_bucket} << dendl;
+ return tl::unexpected(ret);
+ }
+
+ return sync_status;
+}
+
+namespace rgw::bucket_sync_run {
+// Retry-loop over calls to sync_bucket_shard_cr
+class ShardCR : public RGWCoroutine {
+ static constexpr auto allowed_retries = 10u;
+
+ RGWDataSyncCtx& sc;
+ const rgw_bucket_sync_pair_info& pair;
+ const uint64_t gen;
+ unsigned retries = 0;
+
+ ceph::real_time prev_progress;
+ ceph::real_time progress;
+
+public:
+
+ ShardCR(RGWDataSyncCtx& sc, const rgw_bucket_sync_pair_info& pair,
+ const uint64_t gen)
+ : RGWCoroutine(sc.cct), sc(sc), pair(pair), gen(gen) {}
+
+ int operate(const DoutPrefixProvider *dpp) override {
+ reenter(this) {
+ // Since all errors (except ECANCELED) are considered retryable,
+ // retry other errors so long as we're making progress.
+ for (retries = 0u, retcode = -EDOM;
+ (retries < allowed_retries) && (retcode != 0);
+ ++retries) {
+ ldpp_dout(dpp, 5) << "ShardCR: syncing bucket shard on: "
+ << "zone=" << sc.source_zone
+ << ", bucket=" << pair.source_bs.bucket.name
+ << ", shard=" << pair.source_bs.shard_id
+ << ", gen=" << gen
+ << dendl;
+ yield call(sync_bucket_shard_cr(&sc, nullptr, pair, gen,
+ sc.env->sync_tracer->root_node,
+ &progress));
+
+ if (retcode == -ECANCELED) {
+ ldpp_dout(dpp, -1) << "ERROR: Got -ECANCELED for "
+ << pair.source_bs << dendl;
+ drain_all();
+ return set_cr_error(retcode);
+ } else if (retcode < 0) {
+ ldpp_dout(dpp, 5) << "WARNING: Got error, retcode=" << retcode << " for "
+ << pair.source_bs << "on retry "
+ << retries + 1 << " of " << allowed_retries
+ << " allowed" << dendl;
+ // Reset the retry counter if we made any progress
+ if (progress != prev_progress) {
+ retries = 0;
+ }
+ prev_progress = progress;
+ }
+ }
+ if (retcode < 0) {
+ ldpp_dout(dpp, -1) << "ERROR: Exhausted retries for "
+ << pair.source_bs << " retcode="
+ << retcode << dendl;
+ drain_all();
+ return set_cr_error(retcode);
+ }
+
+ drain_all();
+ return set_cr_done();
+ }
+ return 0;
+ }
+};
+
+// Loop over calls to ShardCR with limited concurrency
+class GenCR : public RGWShardCollectCR {
+ static constexpr auto MAX_CONCURRENT_SHARDS = 64;
+
+ RGWDataSyncCtx& sc;
+ const uint64_t gen;
+
+ std::vector<rgw_bucket_sync_pair_info> pairs;
+ decltype(pairs)::const_iterator iter;
+
+public:
+ GenCR(RGWDataSyncCtx& sc, const rgw_bucket& source, const rgw_bucket& dest,
+ const uint64_t gen, const uint64_t shards,
+ const RGWBucketSyncFlowManager::pipe_handler& handler)
+ : RGWShardCollectCR(sc.cct, MAX_CONCURRENT_SHARDS),
+ sc(sc), gen(gen) {
+ pairs.resize(shards);
+ for (auto shard = 0u; shard < shards; ++shard) {
+ auto& pair = pairs[shard];
+ pair.handler = handler;
+ pair.source_bs.bucket = source;
+ pair.dest_bucket = dest;
+ pair.source_bs.shard_id = shard;
+ }
+ iter = pairs.cbegin();
+ assert(pairs.size() == shards);
+ }
+
+ virtual bool spawn_next() override {
+ if (iter == pairs.cend()) {
+ return false;
+ }
+ spawn(new ShardCR(sc, *iter, gen), false);
+ ++iter;
+ return true;
+ }
+
+ int handle_result(int r) override {
+ if (r < 0) {
+ ldpp_dout(sc.env->dpp, 4) << "ERROR: Error syncing shard: "
+ << cpp_strerror(r) << dendl;
+ }
+ return r;
+ }
+};
+
+// Read sync status, loop over calls to GenCR
+class SourceCR : public RGWCoroutine {
+ RGWDataSyncCtx& sc;
+ const RGWBucketInfo& info;
+ const rgw_bucket& dest;
+ const RGWBucketSyncFlowManager::pipe_handler& handler;
+ const rgw_raw_obj status_obj{
+ sc.env->svc->zone->get_zone_params().log_pool,
+ RGWBucketPipeSyncStatusManager::full_status_oid(sc.source_zone, info.bucket,
+ dest)};
+
+ BucketSyncState state = BucketSyncState::Incremental;
+ uint64_t gen = 0;
+ uint64_t num_shards = 0;
+ rgw_bucket_sync_status status;
+ std::string zone_name;
+
+public:
+
+ SourceCR(RGWDataSyncCtx& sc, const RGWBucketInfo& info,
+ const rgw_bucket& dest,
+ const RGWBucketSyncFlowManager::pipe_handler& handler,
+ const std::string& zone_name)
+ : RGWCoroutine(sc.cct), sc(sc), info(info), dest(dest), handler(handler),
+ zone_name(zone_name) {}
+
+ int operate(const DoutPrefixProvider *dpp) override {
+ reenter(this) {
+ // Get the source's status. In incremental sync, this gives us
+ // the generation and shard count that is next needed to be run.
+ yield call(new RGWSimpleRadosReadCR<rgw_bucket_sync_status>(
+ dpp, sc.env->driver, status_obj, &status));
+ if (retcode < 0) {
+ ldpp_dout(dpp, -1) << "ERROR: Unable to fetch status for zone="
+ << sc.source_zone << " retcode="
+ << retcode << dendl;
+ drain_all();
+ return set_cr_error(retcode);
+ }
+
+ if (status.state == BucketSyncState::Stopped) {
+ // Nothing to do.
+ pretty_print(sc.env, "Sync of bucket {} from source zone {} is in state Stopped. "
+ "Nothing to do.\n", dest.name, zone_name);
+ ldpp_dout(dpp, 5) << "SourceCR: Bucket is in state Stopped, returning."
+ << dendl;
+ drain_all();
+ return set_cr_done();
+ }
+
+ do {
+ state = status.state;
+ gen = status.incremental_gen;
+ num_shards = status.shards_done_with_gen.size();
+
+ ldpp_dout(dpp, 5) << "SourceCR: "
+ << "state=" << state
+ << ", gen=" << gen
+ << ", num_shards=" << num_shards
+ << dendl;
+
+ // Special case to handle full sync. Since full sync no longer
+ // uses shards and has no generations, we sync shard zero,
+ // though use the current generation so a following
+ // incremental sync can carry on.
+ if (state != BucketSyncState::Incremental) {
+ pretty_print(sc.env, "Beginning full sync of bucket {} from source zone {}.\n",
+ dest.name, zone_name);
+ ldpp_dout(dpp, 5) << "SourceCR: Calling GenCR with "
+ << "gen=" << gen
+ << ", num_shards=" << 1
+ << dendl;
+ yield call(new GenCR(sc, info.bucket, dest, gen, 1, handler));
+ } else {
+ pretty_print(sc.env, "Beginning incremental sync of bucket {}, generation {} from source zone {}.\n",
+ dest.name, gen, zone_name);
+ ldpp_dout(dpp, 5) << "SourceCR: Calling GenCR with "
+ << "gen=" << gen
+ << ", num_shards=" << num_shards
+ << dendl;
+ yield call(new GenCR(sc, info.bucket, dest, gen, num_shards,
+ handler));
+ }
+ if (retcode < 0) {
+ ldpp_dout(dpp, -1) << "ERROR: Giving up syncing from "
+ << sc.source_zone << " retcode="
+ << retcode << dendl;
+ drain_all();
+ return set_cr_error(retcode);
+ }
+
+ pretty_print(sc.env, "Completed.\n");
+
+ yield call(new RGWSimpleRadosReadCR<rgw_bucket_sync_status>(
+ dpp, sc.env->driver, status_obj, &status));
+ if (retcode < 0) {
+ ldpp_dout(dpp, -1) << "ERROR: Unable to fetch status for zone="
+ << sc.source_zone << " retcode="
+ << retcode << dendl;
+ drain_all();
+ return set_cr_error(retcode);
+ }
+ // Repeat until we have done an incremental run and the
+ // generation remains unchanged.
+ ldpp_dout(dpp, 5) << "SourceCR: "
+ << "state=" << state
+ << ", gen=" << gen
+ << ", num_shards=" << num_shards
+ << ", status.state=" << status.state
+ << ", status.incremental_gen=" << status.incremental_gen
+ << ", status.shards_done_with_gen.size()=" << status.shards_done_with_gen.size()
+ << dendl;
+ } while (state != BucketSyncState::Incremental ||
+ gen != status.incremental_gen);
+ drain_all();
+ return set_cr_done();
+ }
+ return 0;
+ }
+};
+} // namespace rgw::bucket_sync_run
+
+int RGWBucketPipeSyncStatusManager::run(const DoutPrefixProvider *dpp)
+{
+ list<RGWCoroutinesStack *> stacks;
+ for (auto& source : sources) {
+ auto stack = new RGWCoroutinesStack(driver->ctx(), &cr_mgr);
+ stack->call(new rgw::bucket_sync_run::SourceCR(
+ source.sc, source.info, source.dest, source.handler,
+ source.zone_name));
+ stacks.push_back(stack);
+ }
+ auto ret = cr_mgr.run(dpp, stacks);
+ if (ret < 0) {
+ ldpp_dout(this, 0) << "ERROR: Sync unsuccessful on bucket "
+ << bucket_str{dest_bucket} << dendl;
+ }
+ return ret;
+}
+
+unsigned RGWBucketPipeSyncStatusManager::get_subsys() const
+{
+ return dout_subsys;
+}
+
+std::ostream& RGWBucketPipeSyncStatusManager::gen_prefix(std::ostream& out) const
+{
+ auto zone = std::string_view{source_zone.value_or(rgw_zone_id("*")).id};
+ return out << "bucket sync zone:" << zone.substr(0, 8)
+ << " bucket:" << dest_bucket << ' ';
+}
+
+string RGWBucketPipeSyncStatusManager::full_status_oid(const rgw_zone_id& source_zone,
+ const rgw_bucket& source_bucket,
+ const rgw_bucket& dest_bucket)
+{
+ if (source_bucket == dest_bucket) {
+ return bucket_full_status_oid_prefix + "." + source_zone.id + ":"
+ + dest_bucket.get_key();
+ } else {
+ return bucket_full_status_oid_prefix + "." + source_zone.id + ":"
+ + dest_bucket.get_key() + ":" + source_bucket.get_key();
+ }
+}
+
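+// gen 0 yields no suffix, so gen-0 status objects keep the legacy
+// (pre-generation) oid format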
+inline std::string generation_token(uint64_t gen) {
+ return (gen == 0) ? "" : (":" + std::to_string(gen));
+}
+
+string RGWBucketPipeSyncStatusManager::inc_status_oid(const rgw_zone_id& source_zone,
+ const rgw_bucket_sync_pair_info& sync_pair,
+ uint64_t gen)
+{
+ if (sync_pair.source_bs.bucket == sync_pair.dest_bucket) {
+ return bucket_status_oid_prefix + "." + source_zone.id + ":" + sync_pair.source_bs.get_key() +
+ generation_token(gen);
+ } else {
+ return bucket_status_oid_prefix + "." + source_zone.id + ":" + sync_pair.dest_bucket.get_key() + ":" + sync_pair.source_bs.get_key() +
+ generation_token(gen);
+ }
+}
+
+string RGWBucketPipeSyncStatusManager::obj_status_oid(const rgw_bucket_sync_pipe& sync_pipe,
+ const rgw_zone_id& source_zone,
+ const rgw_obj& obj)
+{
+ string prefix = object_status_oid_prefix + "." + source_zone.id + ":" + obj.bucket.get_key();
+ if (sync_pipe.source_bucket_info.bucket !=
+ sync_pipe.dest_bucket_info.bucket) {
+ prefix += string("/") + sync_pipe.dest_bucket_info.bucket.get_key();
+ }
+ return prefix + ":" + obj.key.name + ":" + obj.key.instance;
+}
+
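+// query the remote zone's /admin/log endpoint for bucket index log info and
+// parse the per-shard max markers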
+int rgw_read_remote_bilog_info(const DoutPrefixProvider *dpp,
+ RGWRESTConn* conn,
+ const rgw_bucket& bucket,
+ rgw_bucket_index_marker_info& info,
+ BucketIndexShardsManager& markers,
+ optional_yield y)
+{
+ const auto instance_key = bucket.get_key();
+ const rgw_http_param_pair params[] = {
+ { "type" , "bucket-index" },
+ { "bucket-instance", instance_key.c_str() },
+ { "info" , nullptr },
+ { nullptr, nullptr }
+ };
+ int r = conn->get_json_resource(dpp, "/admin/log/", params, y, info);
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << "failed to fetch remote log markers: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ // parse shard markers
+ r = markers.from_string(info.max_marker, -1);
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << "failed to decode remote log markers" << dendl;
+ return r;
+ }
+ return 0;
+}
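+
+// Illustrative sketch of the request issued above (exact encoding is handled
+// by RGWRESTConn::get_json_resource()):
+//   GET /admin/log/?type=bucket-index&bucket-instance=<bucket_key>&info
+// The JSON response decodes into rgw_bucket_index_marker_info, and
+// info.max_marker is then split into per-shard markers.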
+
+class RGWCollectBucketSyncStatusCR : public RGWShardCollectCR {
+ static constexpr int max_concurrent_shards = 16;
+ rgw::sal::RadosStore* const driver;
+ RGWDataSyncCtx *const sc;
+ RGWDataSyncEnv *const env;
+ const uint64_t gen;
+
+ rgw_bucket_sync_pair_info sync_pair;
+ using Vector = std::vector<rgw_bucket_shard_sync_info>;
+ Vector::iterator i, end;
+
+ int handle_result(int r) override {
+ if (r == -ENOENT) { // ENOENT is not a fatal error
+ return 0;
+ }
+ if (r < 0) {
+ ldout(cct, 4) << "failed to read bucket shard sync status: "
+ << cpp_strerror(r) << dendl;
+ }
+ return r;
+ }
+ public:
+ RGWCollectBucketSyncStatusCR(rgw::sal::RadosStore* driver, RGWDataSyncCtx *sc,
+ const rgw_bucket_sync_pair_info& sync_pair,
+ uint64_t gen,
+ Vector *status)
+ : RGWShardCollectCR(sc->cct, max_concurrent_shards),
+ driver(driver), sc(sc), env(sc->env), gen(gen), sync_pair(sync_pair),
+ i(status->begin()), end(status->end())
+ {}
+
+ bool spawn_next() override {
+ if (i == end) {
+ return false;
+ }
+ spawn(new RGWReadBucketPipeSyncStatusCoroutine(sc, sync_pair, &*i, nullptr, gen), false);
+ ++i;
+ ++sync_pair.source_bs.shard_id;
+ return true;
+ }
+};
+
+int rgw_read_bucket_full_sync_status(const DoutPrefixProvider *dpp,
+ rgw::sal::RadosStore *driver,
+ const rgw_sync_bucket_pipe& pipe,
+ rgw_bucket_sync_status *status,
+ optional_yield y)
+{
+ auto get_oid = RGWBucketPipeSyncStatusManager::full_status_oid;
+ const rgw_raw_obj obj{driver->svc()->zone->get_zone_params().log_pool,
+ get_oid(*pipe.source.zone, *pipe.source.bucket, *pipe.dest.bucket)};
+
+ auto svc = driver->svc()->sysobj;
+ auto sysobj = svc->get_obj(obj);
+ bufferlist bl;
+ int ret = sysobj.rop().read(dpp, &bl, y);
+ if (ret < 0)
+ return ret;
+
+ try {
+ auto iter = bl.cbegin();
+ using ceph::decode;
+ rgw_bucket_sync_status result;
+ decode(result, iter);
+ *status = result;
+ return 0;
+ } catch (const buffer::error& err) {
+ lderr(svc->ctx()) << "error decoding " << obj << ": " << err.what() << dendl;
+ return -EIO;
+ }
+}
+
+int rgw_read_bucket_inc_sync_status(const DoutPrefixProvider *dpp,
+ rgw::sal::RadosStore *driver,
+ const rgw_sync_bucket_pipe& pipe,
+ uint64_t gen,
+ std::vector<rgw_bucket_shard_sync_info> *status)
+{
+ if (!pipe.source.zone ||
+ !pipe.source.bucket ||
+ !pipe.dest.zone ||
+ !pipe.dest.bucket) {
+ return -EINVAL;
+ }
+
+ rgw_bucket_sync_pair_info sync_pair;
+ sync_pair.source_bs.bucket = *pipe.source.bucket;
+ sync_pair.source_bs.shard_id = 0;
+ sync_pair.dest_bucket = *pipe.dest.bucket;
+
+ RGWDataSyncEnv env;
+ RGWSyncModuleInstanceRef module; // null sync module
+ env.init(dpp, driver->ctx(), driver, driver->svc(), driver->svc()->rados->get_async_processor(),
+ nullptr, nullptr, nullptr, module, nullptr);
+
+ RGWDataSyncCtx sc;
+ sc.init(&env, nullptr, *pipe.source.zone);
+
+ RGWCoroutinesManager crs(driver->ctx(), driver->getRados()->get_cr_registry());
+ return crs.run(dpp, new RGWCollectBucketSyncStatusCR(driver, &sc,
+ sync_pair,
+ gen,
+ status));
+}
+
+void rgw_data_sync_info::generate_test_instances(list<rgw_data_sync_info*>& o)
+{
+ auto info = new rgw_data_sync_info;
+ info->state = rgw_data_sync_info::StateBuildingFullSyncMaps;
+ info->num_shards = 8;
+ o.push_back(info);
+ o.push_back(new rgw_data_sync_info);
+}
+
+void rgw_data_sync_marker::generate_test_instances(list<rgw_data_sync_marker*>& o)
+{
+ auto marker = new rgw_data_sync_marker;
+ marker->state = rgw_data_sync_marker::IncrementalSync;
+ marker->marker = "01234";
+ marker->pos = 5;
+ o.push_back(marker);
+ o.push_back(new rgw_data_sync_marker);
+}
+
+void rgw_data_sync_status::generate_test_instances(list<rgw_data_sync_status*>& o)
+{
+ o.push_back(new rgw_data_sync_status);
+}
+
+void rgw_bucket_shard_full_sync_marker::dump(Formatter *f) const
+{
+ encode_json("position", position, f);
+ encode_json("count", count, f);
+}
+
+void rgw_bucket_shard_inc_sync_marker::decode_json(JSONObj *obj)
+{
+ JSONDecoder::decode_json("position", position, obj);
+ JSONDecoder::decode_json("timestamp", timestamp, obj);
+}
+
+void rgw_bucket_shard_inc_sync_marker::dump(Formatter *f) const
+{
+ encode_json("position", position, f);
+ encode_json("timestamp", timestamp, f);
+}
+
+void rgw_bucket_shard_sync_info::decode_json(JSONObj *obj)
+{
+ std::string s;
+ JSONDecoder::decode_json("status", s, obj);
+ if (s == "full-sync") {
+ state = StateFullSync;
+ } else if (s == "incremental-sync") {
+ state = StateIncrementalSync;
+ } else if (s == "stopped") {
+ state = StateStopped;
+ } else {
+ state = StateInit;
+ }
+ JSONDecoder::decode_json("inc_marker", inc_marker, obj);
+}
+
+void rgw_bucket_shard_full_sync_marker::decode_json(JSONObj *obj)
+{
+ JSONDecoder::decode_json("position", position, obj);
+ JSONDecoder::decode_json("count", count, obj);
+}
+
+void rgw_bucket_shard_sync_info::dump(Formatter *f) const
+{
+ const char *s{nullptr};
+ switch ((SyncState)state) {
+ case StateInit:
+ s = "init";
+ break;
+ case StateFullSync:
+ s = "full-sync";
+ break;
+ case StateIncrementalSync:
+ s = "incremental-sync";
+ break;
+ case StateStopped:
+ s = "stopped";
+ break;
+ default:
+ s = "unknown";
+ break;
+ }
+ encode_json("status", s, f);
+ encode_json("inc_marker", inc_marker, f);
+}
+
+void rgw_bucket_full_sync_status::decode_json(JSONObj *obj)
+{
+ JSONDecoder::decode_json("position", position, obj);
+ JSONDecoder::decode_json("count", count, obj);
+}
+
+void rgw_bucket_full_sync_status::dump(Formatter *f) const
+{
+ encode_json("position", position, f);
+ encode_json("count", count, f);
+}
+
+void encode_json(const char *name, BucketSyncState state, Formatter *f)
+{
+ switch (state) {
+ case BucketSyncState::Init:
+ encode_json(name, "init", f);
+ break;
+ case BucketSyncState::Full:
+ encode_json(name, "full-sync", f);
+ break;
+ case BucketSyncState::Incremental:
+ encode_json(name, "incremental-sync", f);
+ break;
+ case BucketSyncState::Stopped:
+ encode_json(name, "stopped", f);
+ break;
+ default:
+ encode_json(name, "unknown", f);
+ break;
+ }
+}
+
+void decode_json_obj(BucketSyncState& state, JSONObj *obj)
+{
+ std::string s;
+ decode_json_obj(s, obj);
+ if (s == "full-sync") {
+ state = BucketSyncState::Full;
+ } else if (s == "incremental-sync") {
+ state = BucketSyncState::Incremental;
+ } else if (s == "stopped") {
+ state = BucketSyncState::Stopped;
+ } else {
+ state = BucketSyncState::Init;
+ }
+}
+
+void rgw_bucket_sync_status::decode_json(JSONObj *obj)
+{
+ JSONDecoder::decode_json("state", state, obj);
+ JSONDecoder::decode_json("full", full, obj);
+ JSONDecoder::decode_json("incremental_gen", incremental_gen, obj);
+}
+
+void rgw_bucket_sync_status::dump(Formatter *f) const
+{
+ encode_json("state", state, f);
+ encode_json("full", full, f);
+ encode_json("incremental_gen", incremental_gen, f);
+}
+
+
+void bilog_status_v2::dump(Formatter *f) const
+{
+ encode_json("sync_status", sync_status, f);
+ encode_json("inc_status", inc_status, f);
+}
+
+void bilog_status_v2::decode_json(JSONObj *obj)
+{
+ JSONDecoder::decode_json("sync_status", sync_status, obj);
+ JSONDecoder::decode_json("inc_status", inc_status, obj);
+}
diff --git a/src/rgw/driver/rados/rgw_data_sync.h b/src/rgw/driver/rados/rgw_data_sync.h
new file mode 100644
index 000000000..b9a39343f
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_data_sync.h
@@ -0,0 +1,868 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include <fmt/format.h>
+#include <fmt/ostream.h>
+
+#include "include/encoding.h"
+
+#include "common/ceph_json.h"
+#include "common/likely.h"
+
+#include "rgw_coroutine.h"
+#include "rgw_cr_rados.h"
+#include "rgw_http_client.h"
+#include "rgw_sal_rados.h"
+
+#include "rgw_datalog.h"
+#include "rgw_sync.h"
+#include "rgw_sync_module.h"
+#include "rgw_sync_trace.h"
+#include "rgw_sync_policy.h"
+
+#include "rgw_bucket_sync.h"
+
+// represents an obligation to sync an entry up to a given time
+struct rgw_data_sync_obligation {
+ rgw_bucket_shard bs;
+ std::optional<uint64_t> gen;
+ std::string marker;
+ ceph::real_time timestamp;
+ bool retry = false;
+};
+
+inline std::ostream& operator<<(std::ostream& out, const rgw_data_sync_obligation& o) {
+ out << "key=" << o.bs;
+ if (o.gen) {
+ out << '[' << *o.gen << ']';
+ }
+ if (!o.marker.empty()) {
+ out << " marker=" << o.marker;
+ }
+ if (o.timestamp != ceph::real_time{}) {
+ out << " timestamp=" << o.timestamp;
+ }
+ if (o.retry) {
+ out << " retry";
+ }
+ return out;
+}
+
+class JSONObj;
+struct rgw_sync_bucket_pipe;
+
+struct rgw_bucket_sync_pair_info {
+ RGWBucketSyncFlowManager::pipe_handler handler; /* responsible for sync filters */
+ rgw_bucket_shard source_bs;
+ rgw_bucket dest_bucket;
+};
+
+inline std::ostream& operator<<(std::ostream& out, const rgw_bucket_sync_pair_info& p) {
+ if (p.source_bs.bucket == p.dest_bucket) {
+ return out << p.source_bs;
+ }
+ return out << p.source_bs << "->" << p.dest_bucket;
+}
+
+struct rgw_bucket_sync_pipe {
+ rgw_bucket_sync_pair_info info;
+ RGWBucketInfo source_bucket_info;
+ std::map<std::string, bufferlist> source_bucket_attrs;
+ RGWBucketInfo dest_bucket_info;
+ std::map<std::string, bufferlist> dest_bucket_attrs;
+
+ RGWBucketSyncFlowManager::pipe_rules_ref& get_rules() {
+ return info.handler.rules;
+ }
+};
+
+inline std::ostream& operator<<(std::ostream& out, const rgw_bucket_sync_pipe& p) {
+ return out << p.info;
+}
+
+struct rgw_datalog_info {
+ uint32_t num_shards;
+
+ rgw_datalog_info() : num_shards(0) {}
+
+ void decode_json(JSONObj *obj);
+};
+
+struct rgw_data_sync_info {
+ enum SyncState {
+ StateInit = 0,
+ StateBuildingFullSyncMaps = 1,
+ StateSync = 2,
+ };
+
+ uint16_t state;
+ uint32_t num_shards;
+
+ uint64_t instance_id{0};
+
+ void encode(bufferlist& bl) const {
+ ENCODE_START(2, 1, bl);
+ encode(state, bl);
+ encode(num_shards, bl);
+ encode(instance_id, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::const_iterator& bl) {
+ DECODE_START(2, bl);
+ decode(state, bl);
+ decode(num_shards, bl);
+ if (struct_v >= 2) {
+ decode(instance_id, bl);
+ }
+ DECODE_FINISH(bl);
+ }
+
+ void dump(Formatter *f) const {
+ std::string s;
+ switch ((SyncState)state) {
+ case StateInit:
+ s = "init";
+ break;
+ case StateBuildingFullSyncMaps:
+ s = "building-full-sync-maps";
+ break;
+ case StateSync:
+ s = "sync";
+ break;
+ default:
+ s = "unknown";
+ break;
+ }
+ encode_json("status", s, f);
+ encode_json("num_shards", num_shards, f);
+ encode_json("instance_id", instance_id, f);
+ }
+ void decode_json(JSONObj *obj) {
+ std::string s;
+ JSONDecoder::decode_json("status", s, obj);
+ if (s == "building-full-sync-maps") {
+ state = StateBuildingFullSyncMaps;
+ } else if (s == "sync") {
+ state = StateSync;
+ } else {
+ state = StateInit;
+ }
+ JSONDecoder::decode_json("num_shards", num_shards, obj);
+ JSONDecoder::decode_json("instance_id", instance_id, obj);
+ }
+ static void generate_test_instances(std::list<rgw_data_sync_info*>& o);
+
+ rgw_data_sync_info() : state((int)StateInit), num_shards(0) {}
+};
+WRITE_CLASS_ENCODER(rgw_data_sync_info)
+
+struct rgw_data_sync_marker {
+ enum SyncState {
+ FullSync = 0,
+ IncrementalSync = 1,
+ };
+ uint16_t state;
+ std::string marker;
+ std::string next_step_marker;
+ uint64_t total_entries;
+ uint64_t pos;
+ real_time timestamp;
+
+ rgw_data_sync_marker() : state(FullSync), total_entries(0), pos(0) {}
+
+ void encode(bufferlist& bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(state, bl);
+ encode(marker, bl);
+ encode(next_step_marker, bl);
+ encode(total_entries, bl);
+ encode(pos, bl);
+ encode(timestamp, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::const_iterator& bl) {
+ DECODE_START(1, bl);
+ decode(state, bl);
+ decode(marker, bl);
+ decode(next_step_marker, bl);
+ decode(total_entries, bl);
+ decode(pos, bl);
+ decode(timestamp, bl);
+ DECODE_FINISH(bl);
+ }
+
+ void dump(Formatter *f) const {
+ const char *s{nullptr};
+ switch ((SyncState)state) {
+ case FullSync:
+ s = "full-sync";
+ break;
+ case IncrementalSync:
+ s = "incremental-sync";
+ break;
+ default:
+ s = "unknown";
+ break;
+ }
+ encode_json("status", s, f);
+ encode_json("marker", marker, f);
+ encode_json("next_step_marker", next_step_marker, f);
+ encode_json("total_entries", total_entries, f);
+ encode_json("pos", pos, f);
+ encode_json("timestamp", utime_t(timestamp), f);
+ }
+ void decode_json(JSONObj *obj) {
+ std::string s;
+ JSONDecoder::decode_json("status", s, obj);
+ if (s == "full-sync") {
+ state = FullSync;
+ } else if (s == "incremental-sync") {
+ state = IncrementalSync;
+ }
+ JSONDecoder::decode_json("marker", marker, obj);
+ JSONDecoder::decode_json("next_step_marker", next_step_marker, obj);
+ JSONDecoder::decode_json("total_entries", total_entries, obj);
+ JSONDecoder::decode_json("pos", pos, obj);
+ utime_t t;
+ JSONDecoder::decode_json("timestamp", t, obj);
+ timestamp = t.to_real_time();
+ }
+ static void generate_test_instances(std::list<rgw_data_sync_marker*>& o);
+};
+WRITE_CLASS_ENCODER(rgw_data_sync_marker)
+
+struct rgw_data_sync_status {
+ rgw_data_sync_info sync_info;
+ std::map<uint32_t, rgw_data_sync_marker> sync_markers;
+
+ rgw_data_sync_status() {}
+
+ void encode(bufferlist& bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(sync_info, bl);
+ /* sync markers are encoded separately */
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::const_iterator& bl) {
+ DECODE_START(1, bl);
+ decode(sync_info, bl);
+ /* sync markers are decoded separately */
+ DECODE_FINISH(bl);
+ }
+
+ void dump(Formatter *f) const {
+ encode_json("info", sync_info, f);
+ encode_json("markers", sync_markers, f);
+ }
+ void decode_json(JSONObj *obj) {
+ JSONDecoder::decode_json("info", sync_info, obj);
+ JSONDecoder::decode_json("markers", sync_markers, obj);
+ }
+ static void generate_test_instances(std::list<rgw_data_sync_status*>& o);
+};
+WRITE_CLASS_ENCODER(rgw_data_sync_status)
+
+struct rgw_datalog_entry {
+ std::string key;
+ ceph::real_time timestamp;
+
+ void decode_json(JSONObj *obj);
+};
+
+struct rgw_datalog_shard_data {
+ std::string marker;
+ bool truncated;
+ std::vector<rgw_datalog_entry> entries;
+
+ void decode_json(JSONObj *obj);
+};
+
+class RGWAsyncRadosProcessor;
+class RGWDataSyncControlCR;
+
+struct rgw_bucket_entry_owner {
+ std::string id;
+ std::string display_name;
+
+ rgw_bucket_entry_owner() {}
+ rgw_bucket_entry_owner(const std::string& _id, const std::string& _display_name) : id(_id), display_name(_display_name) {}
+
+ void decode_json(JSONObj *obj);
+};
+
+class RGWSyncErrorLogger;
+class RGWRESTConn;
+class RGWServices;
+
+struct RGWDataSyncEnv {
+ const DoutPrefixProvider *dpp{nullptr};
+ CephContext *cct{nullptr};
+ rgw::sal::RadosStore* driver{nullptr};
+ RGWServices *svc{nullptr};
+ RGWAsyncRadosProcessor *async_rados{nullptr};
+ RGWHTTPManager *http_manager{nullptr};
+ RGWSyncErrorLogger *error_logger{nullptr};
+ RGWSyncTraceManager *sync_tracer{nullptr};
+ RGWSyncModuleInstanceRef sync_module{nullptr};
+ PerfCounters* counters{nullptr};
+
+ RGWDataSyncEnv() {}
+
+ void init(const DoutPrefixProvider *_dpp, CephContext *_cct, rgw::sal::RadosStore* _driver, RGWServices *_svc,
+ RGWAsyncRadosProcessor *_async_rados, RGWHTTPManager *_http_manager,
+ RGWSyncErrorLogger *_error_logger, RGWSyncTraceManager *_sync_tracer,
+ RGWSyncModuleInstanceRef& _sync_module,
+ PerfCounters* _counters) {
+ dpp = _dpp;
+ cct = _cct;
+ driver = _driver;
+ svc = _svc;
+ async_rados = _async_rados;
+ http_manager = _http_manager;
+ error_logger = _error_logger;
+ sync_tracer = _sync_tracer;
+ sync_module = _sync_module;
+ counters = _counters;
+ }
+
+ std::string shard_obj_name(int shard_id);
+ std::string status_oid();
+
+ std::ostream* ostr{nullptr}; // For pretty printing progress
+};
+
+// pretty ostream output for `radosgw-admin bucket sync run`
+#if FMT_VERSION >= 90000
+template<typename ...T>
+void pretty_print(const RGWDataSyncEnv* env, fmt::format_string<T...> fmt, T&& ...t) {
+#else
+template<typename S, typename ...T>
+void pretty_print(const RGWDataSyncEnv* env, const S& fmt, T&& ...t) {
+#endif
+ if (unlikely(!!env->ostr)) {
+ fmt::print(*env->ostr, fmt, std::forward<T>(t)...);
+ env->ostr->flush();
+ }
+}
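+
+// Illustrative usage sketch (not part of the original code); the helper name
+// and message are made up. pretty_print() writes only when env->ostr has been
+// set, e.g. for `radosgw-admin bucket sync run`.
+inline void example_report_progress(const RGWDataSyncEnv* env,
+                                    uint64_t objects_synced) {
+  pretty_print(env, "synced {} objects so far\n", objects_synced);
+}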
+
+/// \brief Adjust concurrency based on latency
+///
+/// Keep a running average of operation latency and scale concurrency
+/// down when latency rises.
+class LatencyConcurrencyControl : public LatencyMonitor {
+ static constexpr auto dout_subsys = ceph_subsys_rgw;
+ ceph::coarse_mono_time last_warning;
+public:
+ CephContext* cct;
+
+ LatencyConcurrencyControl(CephContext* cct)
+ : cct(cct) {}
+
+ /// \brief Lower concurrency when latency rises
+ ///
+ /// Since we have multiple spawn windows (data sync overall and
+ /// bucket), accept a number of concurrent operations to spawn and,
+ /// if latency is high, cut it in half. If latency is really high,
+ /// cut it to 1.
+ int64_t adj_concurrency(int64_t concurrency) {
+ using namespace std::literals;
+ auto threshold = (cct->_conf->rgw_sync_lease_period * 1s) / 12;
+
+ if (avg_latency() >= 2 * threshold) [[unlikely]] {
+ auto now = ceph::coarse_mono_clock::now();
+ if (now - last_warning > 5min) {
+ ldout(cct, -1)
+ << "WARNING: The OSD cluster is overloaded and struggling to "
+ << "complete ops. You need more capacity to serve this level "
+ << "of demand." << dendl;
+ last_warning = now;
+ }
+ return 1;
+ } else if (avg_latency() >= threshold) [[unlikely]] {
+ return concurrency / 2;
+ } else [[likely]] {
+ return concurrency;
+ }
+ }
+};
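+
+// Worked example for LatencyConcurrencyControl::adj_concurrency() above
+// (illustrative; assumes the default rgw_sync_lease_period of 120s, giving a
+// threshold of 120s / 12 = 10s):
+//   avg_latency() <  10s -> adj_concurrency(n) == n       (unchanged)
+//   avg_latency() >= 10s -> adj_concurrency(n) == n / 2
+//   avg_latency() >= 20s -> adj_concurrency(n) == 1, with a warning logged at
+//                           most once every 5 minutes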
+
+struct RGWDataSyncCtx {
+ RGWDataSyncEnv *env{nullptr};
+ CephContext *cct{nullptr};
+
+ RGWRESTConn *conn{nullptr};
+ rgw_zone_id source_zone;
+
+ LatencyConcurrencyControl lcc{nullptr};
+
+ RGWDataSyncCtx() = default;
+
+ RGWDataSyncCtx(RGWDataSyncEnv* env,
+ RGWRESTConn* conn,
+ const rgw_zone_id& source_zone)
+ : env(env), cct(env->cct), conn(conn), source_zone(source_zone), lcc(cct) {}
+
+ void init(RGWDataSyncEnv *_env,
+ RGWRESTConn *_conn,
+ const rgw_zone_id& _source_zone) {
+ cct = _env->cct;
+ env = _env;
+ conn = _conn;
+ source_zone = _source_zone;
+ lcc.cct = cct;
+ }
+};
+
+class RGWRados;
+
+class RGWRemoteDataLog : public RGWCoroutinesManager {
+ const DoutPrefixProvider *dpp;
+ rgw::sal::RadosStore* driver;
+ CephContext *cct;
+ RGWCoroutinesManagerRegistry *cr_registry;
+ RGWAsyncRadosProcessor *async_rados;
+ RGWHTTPManager http_manager;
+
+ RGWDataSyncEnv sync_env;
+ RGWDataSyncCtx sc;
+
+ ceph::shared_mutex lock = ceph::make_shared_mutex("RGWRemoteDataLog::lock");
+ RGWDataSyncControlCR *data_sync_cr;
+
+ RGWSyncTraceNodeRef tn;
+
+ bool initialized;
+
+public:
+ RGWRemoteDataLog(const DoutPrefixProvider *dpp,
+ rgw::sal::RadosStore* _store,
+ RGWAsyncRadosProcessor *async_rados);
+ int init(const rgw_zone_id& _source_zone, RGWRESTConn *_conn, RGWSyncErrorLogger *_error_logger,
+ RGWSyncTraceManager *_sync_tracer, RGWSyncModuleInstanceRef& module,
+ PerfCounters* _counters);
+ void finish();
+
+ int read_log_info(const DoutPrefixProvider *dpp, rgw_datalog_info *log_info);
+ int read_source_log_shards_info(const DoutPrefixProvider *dpp, std::map<int, RGWDataChangesLogInfo> *shards_info);
+ int read_source_log_shards_next(const DoutPrefixProvider *dpp, std::map<int, std::string> shard_markers, std::map<int, rgw_datalog_shard_data> *result);
+ int read_sync_status(const DoutPrefixProvider *dpp, rgw_data_sync_status *sync_status);
+ int read_recovering_shards(const DoutPrefixProvider *dpp, const int num_shards, std::set<int>& recovering_shards);
+ int read_shard_status(const DoutPrefixProvider *dpp, int shard_id, std::set<std::string>& lagging_buckets,std::set<std::string>& recovering_buckets, rgw_data_sync_marker* sync_marker, const int max_entries);
+ int init_sync_status(const DoutPrefixProvider *dpp, int num_shards);
+ int run_sync(const DoutPrefixProvider *dpp, int num_shards);
+
+ void wakeup(int shard_id, bc::flat_set<rgw_data_notify_entry>& entries);
+};
+
+class RGWDataSyncStatusManager : public DoutPrefixProvider {
+ rgw::sal::RadosStore* driver;
+
+ rgw_zone_id source_zone;
+ RGWRESTConn *conn;
+ RGWSyncErrorLogger *error_logger;
+ RGWSyncModuleInstanceRef sync_module;
+ PerfCounters* counters;
+
+ RGWRemoteDataLog source_log;
+
+ std::string source_status_oid;
+ std::string source_shard_status_oid_prefix;
+
+ std::map<int, rgw_raw_obj> shard_objs;
+
+ int num_shards;
+
+public:
+ RGWDataSyncStatusManager(rgw::sal::RadosStore* _driver, RGWAsyncRadosProcessor *async_rados,
+ const rgw_zone_id& _source_zone, PerfCounters* counters)
+ : driver(_driver), source_zone(_source_zone), conn(NULL), error_logger(NULL),
+ sync_module(nullptr), counters(counters),
+ source_log(this, driver, async_rados), num_shards(0) {}
+ RGWDataSyncStatusManager(rgw::sal::RadosStore* _driver, RGWAsyncRadosProcessor *async_rados,
+ const rgw_zone_id& _source_zone, PerfCounters* counters,
+ const RGWSyncModuleInstanceRef& _sync_module)
+ : driver(_driver), source_zone(_source_zone), conn(NULL), error_logger(NULL),
+ sync_module(_sync_module), counters(counters),
+ source_log(this, driver, async_rados), num_shards(0) {}
+ ~RGWDataSyncStatusManager() {
+ finalize();
+ }
+ int init(const DoutPrefixProvider *dpp);
+ void finalize();
+
+ static std::string shard_obj_name(const rgw_zone_id& source_zone, int shard_id);
+ static std::string sync_status_oid(const rgw_zone_id& source_zone);
+
+ int read_sync_status(const DoutPrefixProvider *dpp, rgw_data_sync_status *sync_status) {
+ return source_log.read_sync_status(dpp, sync_status);
+ }
+
+ int read_recovering_shards(const DoutPrefixProvider *dpp, const int num_shards, std::set<int>& recovering_shards) {
+ return source_log.read_recovering_shards(dpp, num_shards, recovering_shards);
+ }
+
+ int read_shard_status(const DoutPrefixProvider *dpp, int shard_id, std::set<std::string>& lagging_buckets, std::set<std::string>& recovering_buckets, rgw_data_sync_marker *sync_marker, const int max_entries) {
+ return source_log.read_shard_status(dpp, shard_id, lagging_buckets, recovering_buckets,sync_marker, max_entries);
+ }
+ int init_sync_status(const DoutPrefixProvider *dpp) { return source_log.init_sync_status(dpp, num_shards); }
+
+ int read_log_info(const DoutPrefixProvider *dpp, rgw_datalog_info *log_info) {
+ return source_log.read_log_info(dpp, log_info);
+ }
+ int read_source_log_shards_info(const DoutPrefixProvider *dpp, std::map<int, RGWDataChangesLogInfo> *shards_info) {
+ return source_log.read_source_log_shards_info(dpp, shards_info);
+ }
+ int read_source_log_shards_next(const DoutPrefixProvider *dpp, std::map<int, std::string> shard_markers, std::map<int, rgw_datalog_shard_data> *result) {
+ return source_log.read_source_log_shards_next(dpp, shard_markers, result);
+ }
+
+ int run(const DoutPrefixProvider *dpp) { return source_log.run_sync(dpp, num_shards); }
+
+ void wakeup(int shard_id, bc::flat_set<rgw_data_notify_entry>& entries) { return source_log.wakeup(shard_id, entries); }
+
+ void stop() {
+ source_log.finish();
+ }
+
+ // implements DoutPrefixProvider
+ CephContext *get_cct() const override;
+ unsigned get_subsys() const override;
+ std::ostream& gen_prefix(std::ostream& out) const override;
+};
+
+class RGWBucketPipeSyncStatusManager;
+class RGWBucketSyncCR;
+
+struct rgw_bucket_shard_full_sync_marker {
+ rgw_obj_key position;
+ uint64_t count;
+
+ rgw_bucket_shard_full_sync_marker() : count(0) {}
+
+ void encode_attr(std::map<std::string, bufferlist>& attrs);
+
+ void encode(bufferlist& bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(position, bl);
+ encode(count, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::const_iterator& bl) {
+ DECODE_START(1, bl);
+ decode(position, bl);
+ decode(count, bl);
+ DECODE_FINISH(bl);
+ }
+
+ void dump(Formatter *f) const;
+ void decode_json(JSONObj *obj);
+};
+WRITE_CLASS_ENCODER(rgw_bucket_shard_full_sync_marker)
+
+struct rgw_bucket_shard_inc_sync_marker {
+ std::string position;
+ ceph::real_time timestamp;
+
+ void encode_attr(std::map<std::string, bufferlist>& attrs);
+
+ void encode(bufferlist& bl) const {
+ ENCODE_START(2, 1, bl);
+ encode(position, bl);
+ encode(timestamp, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::const_iterator& bl) {
+ DECODE_START(2, bl);
+ decode(position, bl);
+ if (struct_v >= 2) {
+ decode(timestamp, bl);
+ }
+ DECODE_FINISH(bl);
+ }
+
+ void dump(Formatter *f) const;
+ void decode_json(JSONObj *obj);
+};
+WRITE_CLASS_ENCODER(rgw_bucket_shard_inc_sync_marker)
+
+struct rgw_bucket_shard_sync_info {
+ enum SyncState {
+ StateInit = 0,
+ StateFullSync = 1,
+ StateIncrementalSync = 2,
+ StateStopped = 3,
+ };
+
+ uint16_t state;
+ rgw_bucket_shard_inc_sync_marker inc_marker;
+
+ void decode_from_attrs(CephContext *cct, std::map<std::string, bufferlist>& attrs);
+ void encode_all_attrs(std::map<std::string, bufferlist>& attrs);
+ void encode_state_attr(std::map<std::string, bufferlist>& attrs);
+
+ void encode(bufferlist& bl) const {
+ ENCODE_START(2, 1, bl);
+ encode(state, bl);
+ encode(inc_marker, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::const_iterator& bl) {
+ DECODE_START(2, bl);
+ decode(state, bl);
+ if (struct_v <= 1) {
+ rgw_bucket_shard_full_sync_marker full_marker;
+ decode(full_marker, bl);
+ }
+ decode(inc_marker, bl);
+ DECODE_FINISH(bl);
+ }
+
+ void dump(Formatter *f) const;
+ void decode_json(JSONObj *obj);
+
+ rgw_bucket_shard_sync_info() : state((int)StateInit) {}
+
+};
+WRITE_CLASS_ENCODER(rgw_bucket_shard_sync_info)
+
+struct rgw_bucket_full_sync_status {
+ rgw_obj_key position;
+ uint64_t count = 0;
+
+ void encode(bufferlist& bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(position, bl);
+ encode(count, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::const_iterator& bl) {
+ DECODE_START(1, bl);
+ decode(position, bl);
+ decode(count, bl);
+ DECODE_FINISH(bl);
+ }
+
+ void dump(Formatter *f) const;
+ void decode_json(JSONObj *obj);
+};
+WRITE_CLASS_ENCODER(rgw_bucket_full_sync_status)
+
+enum class BucketSyncState : uint8_t {
+ Init = 0,
+ Full,
+ Incremental,
+ Stopped,
+};
+inline std::ostream& operator<<(std::ostream& out, const BucketSyncState& s) {
+ switch (s) {
+ case BucketSyncState::Init: out << "init"; break;
+ case BucketSyncState::Full: out << "full"; break;
+ case BucketSyncState::Incremental: out << "incremental"; break;
+ case BucketSyncState::Stopped: out << "stopped"; break;
+ }
+ return out;
+}
+
+void encode_json(const char *name, BucketSyncState state, Formatter *f);
+void decode_json_obj(BucketSyncState& state, JSONObj *obj);
+
+struct rgw_bucket_sync_status {
+ BucketSyncState state = BucketSyncState::Init;
+ rgw_bucket_full_sync_status full;
+ uint64_t incremental_gen = 0;
+ std::vector<bool> shards_done_with_gen;
+
+ void encode(bufferlist& bl) const {
+ ENCODE_START(2, 1, bl);
+ encode(state, bl);
+ encode(full, bl);
+ encode(incremental_gen, bl);
+ encode(shards_done_with_gen, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::const_iterator& bl) {
+ DECODE_START(2, bl);
+ decode(state, bl);
+ decode(full, bl);
+ if (struct_v > 1) {
+ decode(incremental_gen, bl);
+ decode(shards_done_with_gen, bl);
+ }
+ DECODE_FINISH(bl);
+ }
+
+ void dump(Formatter *f) const;
+ void decode_json(JSONObj *obj);
+};
+WRITE_CLASS_ENCODER(rgw_bucket_sync_status)
+
+struct bilog_status_v2 {
+ rgw_bucket_sync_status sync_status;
+ std::vector<rgw_bucket_shard_sync_info> inc_status;
+
+ void dump(Formatter *f) const;
+ void decode_json(JSONObj *obj);
+};
+
+struct store_gen_shards {
+ uint64_t gen = 0;
+ uint32_t num_shards = 0;
+
+ void dump(Formatter *f) const {
+ encode_json("gen", gen, f);
+ encode_json("num_shards", num_shards, f);
+ }
+
+ void decode_json(JSONObj *obj) {
+ JSONDecoder::decode_json("gen", gen, obj);
+ JSONDecoder::decode_json("num_shards", num_shards, obj);
+ }
+};
+
+struct rgw_bucket_index_marker_info {
+ std::string bucket_ver;
+ std::string master_ver;
+ std::string max_marker;
+ bool syncstopped{false};
+ uint64_t oldest_gen = 0;
+ uint64_t latest_gen = 0;
+ std::vector<store_gen_shards> generations;
+
+ void decode_json(JSONObj *obj) {
+ JSONDecoder::decode_json("bucket_ver", bucket_ver, obj);
+ JSONDecoder::decode_json("master_ver", master_ver, obj);
+ JSONDecoder::decode_json("max_marker", max_marker, obj);
+ JSONDecoder::decode_json("syncstopped", syncstopped, obj);
+ JSONDecoder::decode_json("oldest_gen", oldest_gen, obj);
+ JSONDecoder::decode_json("latest_gen", latest_gen, obj);
+ JSONDecoder::decode_json("generations", generations, obj);
+ }
+};
+
+
+class BucketIndexShardsManager;
+
+int rgw_read_remote_bilog_info(const DoutPrefixProvider *dpp,
+ RGWRESTConn* conn,
+ const rgw_bucket& bucket,
+ rgw_bucket_index_marker_info& info,
+ BucketIndexShardsManager& markers,
+ optional_yield y);
+
+class RGWBucketPipeSyncStatusManager : public DoutPrefixProvider {
+ rgw::sal::RadosStore* driver;
+
+ RGWDataSyncEnv sync_env;
+
+ RGWCoroutinesManager cr_mgr{driver->ctx(),
+ driver->getRados()->get_cr_registry()};
+
+ RGWHTTPManager http_manager{driver->ctx(), cr_mgr.get_completion_mgr()};
+
+ std::optional<rgw_zone_id> source_zone;
+ std::optional<rgw_bucket> source_bucket;
+
+ std::unique_ptr<RGWSyncErrorLogger> error_logger =
+ std::make_unique<RGWSyncErrorLogger>(driver, RGW_SYNC_ERROR_LOG_SHARD_PREFIX,
+ ERROR_LOGGER_SHARDS);
+ RGWSyncModuleInstanceRef sync_module;
+
+ rgw_bucket dest_bucket;
+
+ struct source {
+ RGWDataSyncCtx sc;
+ RGWBucketInfo info;
+ rgw_bucket dest;
+ RGWBucketSyncFlowManager::pipe_handler handler;
+ std::string zone_name;
+
+ source(RGWDataSyncEnv* env, const rgw_zone_id& zone, RGWRESTConn* conn,
+ const RGWBucketInfo& info, const rgw_bucket& dest,
+ const RGWBucketSyncFlowManager::pipe_handler& handler,
+ const std::string& zone_name)
+ : sc(env, conn, zone), info(info), dest(dest), handler(handler),
+ zone_name(zone_name) {}
+ };
+ std::vector<source> sources;
+
+ int do_init(const DoutPrefixProvider *dpp, std::ostream* ostr);
+ RGWBucketPipeSyncStatusManager(rgw::sal::RadosStore* driver,
+ std::optional<rgw_zone_id> source_zone,
+ std::optional<rgw_bucket> source_bucket,
+ const rgw_bucket& dest_bucket)
+ : driver(driver), source_zone(source_zone), source_bucket(source_bucket),
+ dest_bucket(dest_bucket) {}
+
+ int remote_info(const DoutPrefixProvider *dpp, source& s,
+ uint64_t* oldest_gen, uint64_t* latest_gen,
+ uint64_t* num_shards);
+public:
+ static tl::expected<std::unique_ptr<RGWBucketPipeSyncStatusManager>, int>
+ construct(const DoutPrefixProvider* dpp, rgw::sal::RadosStore* driver,
+ std::optional<rgw_zone_id> source_zone,
+ std::optional<rgw_bucket> source_bucket,
+ const rgw_bucket& dest_bucket, std::ostream *ostream);
+ ~RGWBucketPipeSyncStatusManager() = default;
+
+
+ static std::string full_status_oid(const rgw_zone_id& source_zone,
+ const rgw_bucket& source_bucket,
+ const rgw_bucket& dest_bucket);
+ static std::string inc_status_oid(const rgw_zone_id& source_zone,
+ const rgw_bucket_sync_pair_info& bs,
+ uint64_t gen);
+ // specific source obj sync status, can be used by sync modules
+ static std::string obj_status_oid(const rgw_bucket_sync_pipe& sync_pipe,
+ const rgw_zone_id& source_zone,
+ const rgw_obj& obj);
+
+ // implements DoutPrefixProvider
+ CephContext *get_cct() const override;
+ unsigned get_subsys() const override;
+ std::ostream& gen_prefix(std::ostream& out) const override;
+
+ int init_sync_status(const DoutPrefixProvider *dpp);
+ tl::expected<std::map<int, rgw_bucket_shard_sync_info>, int> read_sync_status(
+ const DoutPrefixProvider *dpp);
+ int run(const DoutPrefixProvider *dpp);
+};
+
+/// read the full sync status with respect to a source bucket
+int rgw_read_bucket_full_sync_status(const DoutPrefixProvider *dpp,
+ rgw::sal::RadosStore *driver,
+ const rgw_sync_bucket_pipe& pipe,
+ rgw_bucket_sync_status *status,
+ optional_yield y);
+
+/// read the incremental sync status of all bucket shards from the given source zone
+int rgw_read_bucket_inc_sync_status(const DoutPrefixProvider *dpp,
+ rgw::sal::RadosStore *driver,
+ const rgw_sync_bucket_pipe& pipe,
+ uint64_t gen,
+ std::vector<rgw_bucket_shard_sync_info> *status);
+
+class RGWDefaultSyncModule : public RGWSyncModule {
+public:
+ RGWDefaultSyncModule() {}
+ bool supports_writes() override { return true; }
+ bool supports_data_export() override { return true; }
+ int create_instance(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& config, RGWSyncModuleInstanceRef *instance) override;
+};
+
+class RGWArchiveSyncModule : public RGWDefaultSyncModule {
+public:
+ RGWArchiveSyncModule() {}
+ bool supports_writes() override { return true; }
+ bool supports_data_export() override { return false; }
+ int create_instance(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& config, RGWSyncModuleInstanceRef *instance) override;
+};
diff --git a/src/rgw/driver/rados/rgw_datalog.cc b/src/rgw/driver/rados/rgw_datalog.cc
new file mode 100644
index 000000000..7ca37abf6
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_datalog.cc
@@ -0,0 +1,1090 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include <vector>
+
+#include "common/async/yield_context.h"
+#include "common/debug.h"
+#include "common/containers.h"
+#include "common/errno.h"
+#include "common/error_code.h"
+
+#include "common/async/blocked_completion.h"
+#include "common/async/librados_completion.h"
+
+#include "cls/fifo/cls_fifo_types.h"
+#include "cls/log/cls_log_client.h"
+
+#include "cls_fifo_legacy.h"
+#include "rgw_bucket_layout.h"
+#include "rgw_datalog.h"
+#include "rgw_log_backing.h"
+#include "rgw_tools.h"
+
+#define dout_context g_ceph_context
+static constexpr auto dout_subsys = ceph_subsys_rgw;
+
+namespace bs = boost::system;
+namespace lr = librados;
+
+using ceph::containers::tiny_vector;
+
+void rgw_data_change::dump(ceph::Formatter *f) const
+{
+ std::string type;
+ switch (entity_type) {
+ case ENTITY_TYPE_BUCKET:
+ type = "bucket";
+ break;
+ default:
+ type = "unknown";
+ }
+ encode_json("entity_type", type, f);
+ encode_json("key", key, f);
+ utime_t ut(timestamp);
+ encode_json("timestamp", ut, f);
+ encode_json("gen", gen, f);
+}
+
+void rgw_data_change::decode_json(JSONObj *obj) {
+ std::string s;
+ JSONDecoder::decode_json("entity_type", s, obj);
+ if (s == "bucket") {
+ entity_type = ENTITY_TYPE_BUCKET;
+ } else {
+ entity_type = ENTITY_TYPE_UNKNOWN;
+ }
+ JSONDecoder::decode_json("key", key, obj);
+ utime_t ut;
+ JSONDecoder::decode_json("timestamp", ut, obj);
+ timestamp = ut.to_real_time();
+ JSONDecoder::decode_json("gen", gen, obj);
+}
+
+void rgw_data_change_log_entry::dump(Formatter *f) const
+{
+ encode_json("log_id", log_id, f);
+ utime_t ut(log_timestamp);
+ encode_json("log_timestamp", ut, f);
+ encode_json("entry", entry, f);
+}
+
+void rgw_data_change_log_entry::decode_json(JSONObj *obj) {
+ JSONDecoder::decode_json("log_id", log_id, obj);
+ utime_t ut;
+ JSONDecoder::decode_json("log_timestamp", ut, obj);
+ log_timestamp = ut.to_real_time();
+ JSONDecoder::decode_json("entry", entry, obj);
+}
+
+void rgw_data_notify_entry::dump(Formatter *f) const
+{
+ encode_json("key", key, f);
+ encode_json("gen", gen, f);
+}
+
+void rgw_data_notify_entry::decode_json(JSONObj *obj) {
+ JSONDecoder::decode_json("key", key, obj);
+ JSONDecoder::decode_json("gen", gen, obj);
+}
+
+class RGWDataChangesOmap final : public RGWDataChangesBE {
+ using centries = std::list<cls_log_entry>;
+ std::vector<std::string> oids;
+
+public:
+ RGWDataChangesOmap(lr::IoCtx& ioctx,
+ RGWDataChangesLog& datalog,
+ uint64_t gen_id,
+ int num_shards)
+ : RGWDataChangesBE(ioctx, datalog, gen_id) {
+ oids.reserve(num_shards);
+ for (auto i = 0; i < num_shards; ++i) {
+ oids.push_back(get_oid(i));
+ }
+ }
+ ~RGWDataChangesOmap() override = default;
+
+ void prepare(ceph::real_time ut, const std::string& key,
+ ceph::buffer::list&& entry, entries& out) override {
+ if (!std::holds_alternative<centries>(out)) {
+ ceph_assert(std::visit([](const auto& v) { return std::empty(v); }, out));
+ out = centries();
+ }
+
+ cls_log_entry e;
+ cls_log_add_prepare_entry(e, utime_t(ut), {}, key, entry);
+ std::get<centries>(out).push_back(std::move(e));
+ }
+ int push(const DoutPrefixProvider *dpp, int index, entries&& items, optional_yield y) override {
+ lr::ObjectWriteOperation op;
+ cls_log_add(op, std::get<centries>(items), true);
+ auto r = rgw_rados_operate(dpp, ioctx, oids[index], &op, y);
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__
+ << ": failed to push to " << oids[index] << cpp_strerror(-r)
+ << dendl;
+ }
+ return r;
+ }
+ int push(const DoutPrefixProvider *dpp, int index, ceph::real_time now,
+ const std::string& key, ceph::buffer::list&& bl,
+ optional_yield y) override {
+ lr::ObjectWriteOperation op;
+ cls_log_add(op, utime_t(now), {}, key, bl);
+ auto r = rgw_rados_operate(dpp, ioctx, oids[index], &op, y);
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__
+ << ": failed to push to " << oids[index]
+ << cpp_strerror(-r) << dendl;
+ }
+ return r;
+ }
+ int list(const DoutPrefixProvider *dpp, int index, int max_entries,
+ std::vector<rgw_data_change_log_entry>& entries,
+ std::optional<std::string_view> marker,
+ std::string* out_marker, bool* truncated,
+ optional_yield y) override {
+ std::list<cls_log_entry> log_entries;
+ lr::ObjectReadOperation op;
+ cls_log_list(op, {}, {}, std::string(marker.value_or("")),
+ max_entries, log_entries, out_marker, truncated);
+ auto r = rgw_rados_operate(dpp, ioctx, oids[index], &op, nullptr, y);
+ if (r == -ENOENT) {
+ *truncated = false;
+ return 0;
+ }
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__
+ << ": failed to list " << oids[index]
+ << cpp_strerror(-r) << dendl;
+ return r;
+ }
+ for (auto iter = log_entries.begin(); iter != log_entries.end(); ++iter) {
+ rgw_data_change_log_entry log_entry;
+ log_entry.log_id = iter->id;
+ auto rt = iter->timestamp.to_real_time();
+ log_entry.log_timestamp = rt;
+ auto liter = iter->data.cbegin();
+ try {
+ decode(log_entry.entry, liter);
+ } catch (ceph::buffer::error& err) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__
+ << ": failed to decode data changes log entry: "
+ << err.what() << dendl;
+ return -EIO;
+ }
+ entries.push_back(log_entry);
+ }
+ return 0;
+ }
+ int get_info(const DoutPrefixProvider *dpp, int index,
+ RGWDataChangesLogInfo *info, optional_yield y) override {
+ cls_log_header header;
+ lr::ObjectReadOperation op;
+ cls_log_info(op, &header);
+ auto r = rgw_rados_operate(dpp, ioctx, oids[index], &op, nullptr, y);
+ if (r == -ENOENT) r = 0;
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__
+ << ": failed to get info from " << oids[index]
+ << cpp_strerror(-r) << dendl;
+ } else {
+ info->marker = header.max_marker;
+ info->last_update = header.max_time.to_real_time();
+ }
+ return r;
+ }
+ int trim(const DoutPrefixProvider *dpp, int index, std::string_view marker,
+ optional_yield y) override {
+ lr::ObjectWriteOperation op;
+ cls_log_trim(op, {}, {}, {}, std::string(marker));
+ auto r = rgw_rados_operate(dpp, ioctx, oids[index], &op, y);
+ if (r == -ENOENT) r = -ENODATA;
+ if (r < 0 && r != -ENODATA) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__
+                         << ": failed to trim " << oids[index]
+ << cpp_strerror(-r) << dendl;
+ }
+ return r;
+ }
+ int trim(const DoutPrefixProvider *dpp, int index, std::string_view marker,
+ lr::AioCompletion* c) override {
+ lr::ObjectWriteOperation op;
+ cls_log_trim(op, {}, {}, {}, std::string(marker));
+ auto r = ioctx.aio_operate(oids[index], c, &op, 0);
+ if (r == -ENOENT) r = -ENODATA;
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__
+                         << ": failed to trim " << oids[index]
+ << cpp_strerror(-r) << dendl;
+ }
+ return r;
+ }
+ std::string_view max_marker() const override {
+ return "99999999";
+ }
+ int is_empty(const DoutPrefixProvider *dpp, optional_yield y) override {
+ for (auto shard = 0u; shard < oids.size(); ++shard) {
+ std::list<cls_log_entry> log_entries;
+ lr::ObjectReadOperation op;
+ std::string out_marker;
+ bool truncated;
+ cls_log_list(op, {}, {}, {}, 1, log_entries, &out_marker, &truncated);
+ auto r = rgw_rados_operate(dpp, ioctx, oids[shard], &op, nullptr, y);
+ if (r == -ENOENT) {
+ continue;
+ }
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__
+ << ": failed to list " << oids[shard]
+ << cpp_strerror(-r) << dendl;
+ return r;
+ }
+ if (!log_entries.empty()) {
+ return 0;
+ }
+ }
+ return 1;
+ }
+};
+
+class RGWDataChangesFIFO final : public RGWDataChangesBE {
+ using centries = std::vector<ceph::buffer::list>;
+ tiny_vector<LazyFIFO> fifos;
+
+public:
+ RGWDataChangesFIFO(lr::IoCtx& ioctx,
+ RGWDataChangesLog& datalog,
+ uint64_t gen_id, int shards)
+ : RGWDataChangesBE(ioctx, datalog, gen_id),
+ fifos(shards, [&ioctx, this](std::size_t i, auto emplacer) {
+ emplacer.emplace(ioctx, get_oid(i));
+ }) {}
+ ~RGWDataChangesFIFO() override = default;
+ void prepare(ceph::real_time, const std::string&,
+ ceph::buffer::list&& entry, entries& out) override {
+ if (!std::holds_alternative<centries>(out)) {
+ ceph_assert(std::visit([](auto& v) { return std::empty(v); }, out));
+ out = centries();
+ }
+ std::get<centries>(out).push_back(std::move(entry));
+ }
+ int push(const DoutPrefixProvider *dpp, int index, entries&& items,
+ optional_yield y) override {
+ auto r = fifos[index].push(dpp, std::get<centries>(items), y);
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__
+ << ": unable to push to FIFO: " << get_oid(index)
+ << ": " << cpp_strerror(-r) << dendl;
+ }
+ return r;
+ }
+ int push(const DoutPrefixProvider *dpp, int index, ceph::real_time,
+ const std::string&, ceph::buffer::list&& bl,
+ optional_yield y) override {
+ auto r = fifos[index].push(dpp, std::move(bl), y);
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__
+ << ": unable to push to FIFO: " << get_oid(index)
+ << ": " << cpp_strerror(-r) << dendl;
+ }
+ return r;
+ }
+ int list(const DoutPrefixProvider *dpp, int index, int max_entries,
+ std::vector<rgw_data_change_log_entry>& entries,
+ std::optional<std::string_view> marker, std::string* out_marker,
+ bool* truncated, optional_yield y) override {
+ std::vector<rgw::cls::fifo::list_entry> log_entries;
+ bool more = false;
+ auto r = fifos[index].list(dpp, max_entries, marker, &log_entries, &more,
+ y);
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__
+ << ": unable to list FIFO: " << get_oid(index)
+ << ": " << cpp_strerror(-r) << dendl;
+ return r;
+ }
+ for (const auto& entry : log_entries) {
+ rgw_data_change_log_entry log_entry;
+ log_entry.log_id = entry.marker;
+ log_entry.log_timestamp = entry.mtime;
+ auto liter = entry.data.cbegin();
+ try {
+ decode(log_entry.entry, liter);
+ } catch (const buffer::error& err) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__
+ << ": failed to decode data changes log entry: "
+ << err.what() << dendl;
+ return -EIO;
+ }
+ entries.push_back(std::move(log_entry));
+ }
+ if (truncated)
+ *truncated = more;
+ if (out_marker && !log_entries.empty()) {
+ *out_marker = log_entries.back().marker;
+ }
+ return 0;
+ }
+ int get_info(const DoutPrefixProvider *dpp, int index,
+ RGWDataChangesLogInfo *info, optional_yield y) override {
+ auto& fifo = fifos[index];
+ auto r = fifo.read_meta(dpp, y);
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__
+ << ": unable to get FIFO metadata: " << get_oid(index)
+ << ": " << cpp_strerror(-r) << dendl;
+ return r;
+ }
+ rados::cls::fifo::info m;
+ fifo.meta(dpp, m, y);
+ auto p = m.head_part_num;
+ if (p < 0) {
+ info->marker = "";
+ info->last_update = ceph::real_clock::zero();
+ return 0;
+ }
+ rgw::cls::fifo::part_info h;
+ r = fifo.get_part_info(dpp, p, &h, y);
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__
+ << ": unable to get part info: " << get_oid(index) << "/" << p
+ << ": " << cpp_strerror(-r) << dendl;
+ return r;
+ }
+ info->marker = rgw::cls::fifo::marker{p, h.last_ofs}.to_string();
+ info->last_update = h.max_time;
+ return 0;
+ }
+ int trim(const DoutPrefixProvider *dpp, int index, std::string_view marker,
+ optional_yield y) override {
+ auto r = fifos[index].trim(dpp, marker, false, y);
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__
+ << ": unable to trim FIFO: " << get_oid(index)
+ << ": " << cpp_strerror(-r) << dendl;
+ }
+ return r;
+ }
+ int trim(const DoutPrefixProvider *dpp, int index, std::string_view marker,
+ librados::AioCompletion* c) override {
+ int r = 0;
+ if (marker == rgw::cls::fifo::marker(0, 0).to_string()) {
+ rgw_complete_aio_completion(c, -ENODATA);
+ } else {
+ // This null_yield is used for lazily opening FIFOs.
+ //
+      // It shouldn't exist, but it can't be eliminated
+      // since the caller is an RGWCoroutine in the data sync code.
+ //
+ // It can be eliminated after Reef when we can get rid of
+ // AioCompletion entirely.
+ fifos[index].trim(dpp, marker, false, c, null_yield);
+ }
+ return r;
+ }
+ std::string_view max_marker() const override {
+ static const std::string mm =
+ rgw::cls::fifo::marker::max().to_string();
+ return std::string_view(mm);
+ }
+ int is_empty(const DoutPrefixProvider *dpp, optional_yield y) override {
+ std::vector<rgw::cls::fifo::list_entry> log_entries;
+ bool more = false;
+ for (auto shard = 0u; shard < fifos.size(); ++shard) {
+ auto r = fifos[shard].list(dpp, 1, {}, &log_entries, &more, y);
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__
+ << ": unable to list FIFO: " << get_oid(shard)
+ << ": " << cpp_strerror(-r) << dendl;
+ return r;
+ }
+ if (!log_entries.empty()) {
+ return 0;
+ }
+ }
+ return 1;
+ }
+};
+
+RGWDataChangesLog::RGWDataChangesLog(CephContext* cct)
+ : cct(cct),
+ num_shards(cct->_conf->rgw_data_log_num_shards),
+ prefix(get_prefix()),
+ changes(cct->_conf->rgw_data_log_changes_size) {}
+
+bs::error_code DataLogBackends::handle_init(entries_t e) noexcept {
+ std::unique_lock l(m);
+
+ for (const auto& [gen_id, gen] : e) {
+ if (gen.pruned) {
+ lderr(datalog.cct)
+ << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << ": ERROR: given empty generation: gen_id=" << gen_id << dendl;
+ }
+ if (count(gen_id) != 0) {
+ lderr(datalog.cct)
+ << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << ": ERROR: generation already exists: gen_id=" << gen_id << dendl;
+ }
+ try {
+ switch (gen.type) {
+ case log_type::omap:
+ emplace(gen_id, new RGWDataChangesOmap(ioctx, datalog, gen_id, shards));
+ break;
+ case log_type::fifo:
+ emplace(gen_id, new RGWDataChangesFIFO(ioctx, datalog, gen_id, shards));
+ break;
+ default:
+ lderr(datalog.cct)
+ << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << ": IMPOSSIBLE: invalid log type: gen_id=" << gen_id
+            << ", type=" << gen.type << dendl;
+ return bs::error_code(EFAULT, bs::system_category());
+ }
+ } catch (const bs::system_error& err) {
+ lderr(datalog.cct)
+ << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << ": error setting up backend: gen_id=" << gen_id
+ << ", err=" << err.what() << dendl;
+ return err.code();
+ }
+ }
+ return {};
+}
+bs::error_code DataLogBackends::handle_new_gens(entries_t e) noexcept {
+ return handle_init(std::move(e));
+}
+bs::error_code DataLogBackends::handle_empty_to(uint64_t new_tail) noexcept {
+ std::unique_lock l(m);
+ auto i = cbegin();
+ if (i->first < new_tail) {
+ return {};
+ }
+ if (new_tail >= (cend() - 1)->first) {
+ lderr(datalog.cct)
+ << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << ": ERROR: attempt to trim head: new_tail=" << new_tail << dendl;
+ return bs::error_code(EFAULT, bs::system_category());
+ }
+ erase(i, upper_bound(new_tail));
+ return {};
+}
+
+
+int RGWDataChangesLog::start(const DoutPrefixProvider *dpp, const RGWZone* _zone,
+ const RGWZoneParams& zoneparams,
+ librados::Rados* lr)
+{
+ zone = _zone;
+ ceph_assert(zone);
+ auto defbacking = to_log_type(
+ cct->_conf.get_val<std::string>("rgw_default_data_log_backing"));
+ // Should be guaranteed by `set_enum_allowed`
+ ceph_assert(defbacking);
+ auto log_pool = zoneparams.log_pool;
+ auto r = rgw_init_ioctx(dpp, lr, log_pool, ioctx, true, false);
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__
+                       << ": Failed to initialize ioctx, r=" << r
+ << ", pool=" << log_pool << dendl;
+ return -r;
+ }
+
+ // This null_yield is in startup code, so it doesn't matter that much.
+ auto besr = logback_generations::init<DataLogBackends>(
+ dpp, ioctx, metadata_log_oid(), [this](uint64_t gen_id, int shard) {
+ return get_oid(gen_id, shard);
+ },
+ num_shards, *defbacking, null_yield, *this);
+
+
+ if (!besr) {
+ lderr(cct) << __PRETTY_FUNCTION__
+ << ": Error initializing backends: "
+ << besr.error().message() << dendl;
+ return ceph::from_error_code(besr.error());
+ }
+
+ bes = std::move(*besr);
+ renew_thread = make_named_thread("rgw_dt_lg_renew",
+ &RGWDataChangesLog::renew_run, this);
+ return 0;
+}
+
+int RGWDataChangesLog::choose_oid(const rgw_bucket_shard& bs) {
+ const auto& name = bs.bucket.name;
+ auto shard_shift = (bs.shard_id > 0 ? bs.shard_id : 0);
+ auto r = (ceph_str_hash_linux(name.data(), name.size()) +
+ shard_shift) % num_shards;
+ return static_cast<int>(r);
+}
+
+int RGWDataChangesLog::renew_entries(const DoutPrefixProvider *dpp)
+{
+ if (!zone->log_data)
+ return 0;
+
+ /* we can't keep the bucket name as part of the cls_log_entry, and we need
+ * it later, so we keep two lists under the map */
+ bc::flat_map<int, std::pair<std::vector<BucketGen>,
+ RGWDataChangesBE::entries>> m;
+
+ std::unique_lock l(lock);
+ decltype(cur_cycle) entries;
+ entries.swap(cur_cycle);
+ l.unlock();
+
+ auto ut = real_clock::now();
+ auto be = bes->head();
+ for (const auto& [bs, gen] : entries) {
+ auto index = choose_oid(bs);
+
+ rgw_data_change change;
+ bufferlist bl;
+ change.entity_type = ENTITY_TYPE_BUCKET;
+ change.key = bs.get_key();
+ change.timestamp = ut;
+ change.gen = gen;
+ encode(change, bl);
+
+ m[index].first.push_back({bs, gen});
+ be->prepare(ut, change.key, std::move(bl), m[index].second);
+ }
+
+ for (auto& [index, p] : m) {
+ auto& [buckets, entries] = p;
+
+ auto now = real_clock::now();
+
+ // This null_yield can stay (for now) as we're in our own thread.
+ auto ret = be->push(dpp, index, std::move(entries), null_yield);
+ if (ret < 0) {
+      /* no special handling is needed for failed cases here,
+       * as this is just an optimization. */
+      ldpp_dout(dpp, -1) << "ERROR: be->push() returned " << ret << dendl;
+ return ret;
+ }
+
+ auto expiration = now;
+ expiration += ceph::make_timespan(cct->_conf->rgw_data_log_window);
+ for (auto& [bs, gen] : buckets) {
+ update_renewed(bs, gen, expiration);
+ }
+ }
+
+ return 0;
+}
+
+auto RGWDataChangesLog::_get_change(const rgw_bucket_shard& bs,
+ uint64_t gen)
+ -> ChangeStatusPtr
+{
+ ceph_assert(ceph_mutex_is_locked(lock));
+ ChangeStatusPtr status;
+ if (!changes.find({bs, gen}, status)) {
+ status = std::make_shared<ChangeStatus>();
+ changes.add({bs, gen}, status);
+ }
+ return status;
+}
+
+void RGWDataChangesLog::register_renew(const rgw_bucket_shard& bs,
+ const rgw::bucket_log_layout_generation& gen)
+{
+ std::scoped_lock l{lock};
+ cur_cycle.insert({bs, gen.gen});
+}
+
+void RGWDataChangesLog::update_renewed(const rgw_bucket_shard& bs,
+ uint64_t gen,
+ real_time expiration)
+{
+ std::unique_lock l{lock};
+ auto status = _get_change(bs, gen);
+ l.unlock();
+
+  ldout(cct, 20) << "RGWDataChangesLog::update_renewed() bucket_name="
+ << bs.bucket.name << " shard_id=" << bs.shard_id
+ << " expiration=" << expiration << dendl;
+
+ std::unique_lock sl(status->lock);
+ status->cur_expiration = expiration;
+}
+
+int RGWDataChangesLog::get_log_shard_id(rgw_bucket& bucket, int shard_id) {
+ rgw_bucket_shard bs(bucket, shard_id);
+ return choose_oid(bs);
+}
+
+bool RGWDataChangesLog::filter_bucket(const DoutPrefixProvider *dpp,
+ const rgw_bucket& bucket,
+ optional_yield y) const
+{
+ if (!bucket_filter) {
+ return true;
+ }
+
+ return bucket_filter(bucket, y, dpp);
+}
+
+std::string RGWDataChangesLog::get_oid(uint64_t gen_id, int i) const {
+ return (gen_id > 0 ?
+ fmt::format("{}@G{}.{}", prefix, gen_id, i) :
+ fmt::format("{}.{}", prefix, i));
+}
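+
+// Illustrative examples of the object names produced above (assuming the
+// configured datalog prefix resolves to "data_log"):
+//   get_oid(0, 5) -> "data_log.5"
+//   get_oid(3, 5) -> "data_log@G3.5"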
+
+int RGWDataChangesLog::add_entry(const DoutPrefixProvider *dpp,
+ const RGWBucketInfo& bucket_info,
+ const rgw::bucket_log_layout_generation& gen,
+ int shard_id, optional_yield y)
+{
+ auto& bucket = bucket_info.bucket;
+
+ if (!filter_bucket(dpp, bucket, y)) {
+ return 0;
+ }
+
+ if (observer) {
+ observer->on_bucket_changed(bucket.get_key());
+ }
+
+ rgw_bucket_shard bs(bucket, shard_id);
+
+ int index = choose_oid(bs);
+
+ mark_modified(index, bs, gen.gen);
+
+ std::unique_lock l(lock);
+
+ auto status = _get_change(bs, gen.gen);
+ l.unlock();
+
+ auto now = real_clock::now();
+
+ std::unique_lock sl(status->lock);
+
+ ldpp_dout(dpp, 20) << "RGWDataChangesLog::add_entry() bucket.name=" << bucket.name
+ << " shard_id=" << shard_id << " now=" << now
+ << " cur_expiration=" << status->cur_expiration << dendl;
+
+ if (now < status->cur_expiration) {
+ /* no need to send, recently completed */
+ sl.unlock();
+ register_renew(bs, gen);
+ return 0;
+ }
+
+ RefCountedCond* cond;
+
+ if (status->pending) {
+ cond = status->cond;
+
+ ceph_assert(cond);
+
+ status->cond->get();
+ sl.unlock();
+
+ int ret = cond->wait();
+ cond->put();
+ if (!ret) {
+ register_renew(bs, gen);
+ }
+ return ret;
+ }
+
+ status->cond = new RefCountedCond;
+ status->pending = true;
+
+ ceph::real_time expiration;
+
+ int ret;
+
+ do {
+ status->cur_sent = now;
+
+ expiration = now;
+ expiration += ceph::make_timespan(cct->_conf->rgw_data_log_window);
+
+ sl.unlock();
+
+ ceph::buffer::list bl;
+ rgw_data_change change;
+ change.entity_type = ENTITY_TYPE_BUCKET;
+ change.key = bs.get_key();
+ change.timestamp = now;
+ change.gen = gen.gen;
+ encode(change, bl);
+
+ ldpp_dout(dpp, 20) << "RGWDataChangesLog::add_entry() sending update with now=" << now << " cur_expiration=" << expiration << dendl;
+
+ auto be = bes->head();
+ ret = be->push(dpp, index, now, change.key, std::move(bl), y);
+
+ now = real_clock::now();
+
+ sl.lock();
+
+ } while (!ret && real_clock::now() > expiration);
+
+ cond = status->cond;
+
+ status->pending = false;
+  /* based on when the operation started, not when it completed */
+ status->cur_expiration = status->cur_sent;
+ status->cur_expiration += make_timespan(cct->_conf->rgw_data_log_window);
+ status->cond = nullptr;
+ sl.unlock();
+
+ cond->done(ret);
+ cond->put();
+
+ return ret;
+}
+
+int DataLogBackends::list(const DoutPrefixProvider *dpp, int shard, int max_entries,
+ std::vector<rgw_data_change_log_entry>& entries,
+ std::string_view marker, std::string* out_marker,
+ bool* truncated, optional_yield y)
+{
+ const auto [start_id, start_cursor] = cursorgen(marker);
+ auto gen_id = start_id;
+ std::string out_cursor;
+ while (max_entries > 0) {
+ std::vector<rgw_data_change_log_entry> gentries;
+ std::unique_lock l(m);
+ auto i = lower_bound(gen_id);
+ if (i == end()) return 0;
+ auto be = i->second;
+ l.unlock();
+ gen_id = be->gen_id;
+ auto r = be->list(dpp, shard, max_entries, gentries,
+ gen_id == start_id ? start_cursor : std::string{},
+ &out_cursor, truncated, y);
+ if (r < 0)
+ return r;
+
+ if (out_marker && !out_cursor.empty()) {
+ *out_marker = gencursor(gen_id, out_cursor);
+ }
+ for (auto& g : gentries) {
+ g.log_id = gencursor(gen_id, g.log_id);
+ }
+ if (int s = gentries.size(); s < 0 || s > max_entries)
+ max_entries = 0;
+ else
+ max_entries -= gentries.size();
+
+ std::move(gentries.begin(), gentries.end(),
+ std::back_inserter(entries));
+ ++gen_id;
+ }
+ return 0;
+}
+
+int RGWDataChangesLog::list_entries(const DoutPrefixProvider *dpp, int shard, int max_entries,
+ std::vector<rgw_data_change_log_entry>& entries,
+ std::string_view marker,
+ std::string* out_marker, bool* truncated,
+ optional_yield y)
+{
+ assert(shard < num_shards);
+ return bes->list(dpp, shard, max_entries, entries, marker, out_marker,
+ truncated, y);
+}
+
+int RGWDataChangesLog::list_entries(const DoutPrefixProvider *dpp, int max_entries,
+ std::vector<rgw_data_change_log_entry>& entries,
+ LogMarker& marker, bool *ptruncated,
+ optional_yield y)
+{
+ bool truncated;
+ entries.clear();
+ for (; marker.shard < num_shards && int(entries.size()) < max_entries;
+ marker.shard++, marker.marker.clear()) {
+ int ret = list_entries(dpp, marker.shard, max_entries - entries.size(),
+ entries, marker.marker, NULL, &truncated, y);
+ if (ret == -ENOENT) {
+ continue;
+ }
+ if (ret < 0) {
+ return ret;
+ }
+ if (!truncated) {
+ *ptruncated = false;
+ return 0;
+ }
+ }
+ *ptruncated = (marker.shard < num_shards);
+ return 0;
+}
+
+int RGWDataChangesLog::get_info(const DoutPrefixProvider *dpp, int shard_id,
+ RGWDataChangesLogInfo *info, optional_yield y)
+{
+ assert(shard_id < num_shards);
+ auto be = bes->head();
+ auto r = be->get_info(dpp, shard_id, info, y);
+ if (!info->marker.empty()) {
+ info->marker = gencursor(be->gen_id, info->marker);
+ }
+ return r;
+}
+
+int DataLogBackends::trim_entries(const DoutPrefixProvider *dpp, int shard_id,
+ std::string_view marker, optional_yield y)
+{
+ auto [target_gen, cursor] = cursorgen(marker);
+ std::unique_lock l(m);
+ const auto head_gen = (end() - 1)->second->gen_id;
+ const auto tail_gen = begin()->first;
+ if (target_gen < tail_gen) return 0;
+ auto r = 0;
+ for (auto be = lower_bound(0)->second;
+ be->gen_id <= target_gen && be->gen_id <= head_gen && r >= 0;
+ be = upper_bound(be->gen_id)->second) {
+ l.unlock();
+ auto c = be->gen_id == target_gen ? cursor : be->max_marker();
+ r = be->trim(dpp, shard_id, c, y);
+ if (r == -ENOENT)
+ r = -ENODATA;
+ if (r == -ENODATA && be->gen_id < target_gen)
+ r = 0;
+ if (be->gen_id == target_gen)
+ break;
+ l.lock();
+ };
+ return r;
+}
+
+int RGWDataChangesLog::trim_entries(const DoutPrefixProvider *dpp, int shard_id,
+ std::string_view marker, optional_yield y)
+{
+ assert(shard_id < num_shards);
+ return bes->trim_entries(dpp, shard_id, marker, y);
+}
+
+class GenTrim : public rgw::cls::fifo::Completion<GenTrim> {
+public:
+ DataLogBackends* const bes;
+ const int shard_id;
+ const uint64_t target_gen;
+ const std::string cursor;
+ const uint64_t head_gen;
+ const uint64_t tail_gen;
+ boost::intrusive_ptr<RGWDataChangesBE> be;
+
+ GenTrim(const DoutPrefixProvider *dpp, DataLogBackends* bes, int shard_id, uint64_t target_gen,
+ std::string cursor, uint64_t head_gen, uint64_t tail_gen,
+ boost::intrusive_ptr<RGWDataChangesBE> be,
+ lr::AioCompletion* super)
+ : Completion(dpp, super), bes(bes), shard_id(shard_id), target_gen(target_gen),
+ cursor(std::move(cursor)), head_gen(head_gen), tail_gen(tail_gen),
+ be(std::move(be)) {}
+
+ void handle(const DoutPrefixProvider *dpp, Ptr&& p, int r) {
+ auto gen_id = be->gen_id;
+ be.reset();
+ if (r == -ENOENT)
+ r = -ENODATA;
+ if (r == -ENODATA && gen_id < target_gen)
+ r = 0;
+ if (r < 0) {
+ complete(std::move(p), r);
+ return;
+ }
+
+ {
+ std::unique_lock l(bes->m);
+ auto i = bes->upper_bound(gen_id);
+ if (i == bes->end() || i->first > target_gen || i->first > head_gen) {
+ l.unlock();
+ complete(std::move(p), -ENODATA);
+ return;
+ }
+ be = i->second;
+ }
+ auto c = be->gen_id == target_gen ? cursor : be->max_marker();
+ be->trim(dpp, shard_id, c, call(std::move(p)));
+ }
+};
+
+void DataLogBackends::trim_entries(const DoutPrefixProvider *dpp, int shard_id, std::string_view marker,
+ librados::AioCompletion* c)
+{
+ auto [target_gen, cursor] = cursorgen(marker);
+ std::unique_lock l(m);
+ const auto head_gen = (end() - 1)->second->gen_id;
+ const auto tail_gen = begin()->first;
+ if (target_gen < tail_gen) {
+ l.unlock();
+ rgw_complete_aio_completion(c, -ENODATA);
+ return;
+ }
+ auto be = begin()->second;
+ l.unlock();
+ auto gt = std::make_unique<GenTrim>(dpp, this, shard_id, target_gen,
+ std::string(cursor), head_gen, tail_gen,
+ be, c);
+
+ auto cc = be->gen_id == target_gen ? cursor : be->max_marker();
+ be->trim(dpp, shard_id, cc, GenTrim::call(std::move(gt)));
+}
+
+int DataLogBackends::trim_generations(const DoutPrefixProvider *dpp,
+ std::optional<uint64_t>& through,
+ optional_yield y) {
+ if (size() != 1) {
+ std::vector<mapped_type> candidates;
+ {
+ std::scoped_lock l(m);
+ auto e = cend() - 1;
+ for (auto i = cbegin(); i < e; ++i) {
+ candidates.push_back(i->second);
+ }
+ }
+
+ std::optional<uint64_t> highest;
+ for (auto& be : candidates) {
+ auto r = be->is_empty(dpp, y);
+ if (r < 0) {
+ return r;
+ } else if (r == 1) {
+ highest = be->gen_id;
+ } else {
+ break;
+ }
+ }
+
+ through = highest;
+ if (!highest) {
+ return 0;
+ }
+ auto ec = empty_to(dpp, *highest, y);
+ if (ec) {
+ return ceph::from_error_code(ec);
+ }
+ }
+
+ return ceph::from_error_code(remove_empty(dpp, y));
+}
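+
+// Illustrative note (not part of the upstream change): trim_generations() only
+// considers generations strictly older than the head. A generation is a pruning
+// candidate while is_empty() reports 1 for it and for every generation before
+// it; the first non-empty generation stops the scan, and `through` reports the
+// highest generation found empty (or stays unset when nothing could be pruned).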
+
+
+int RGWDataChangesLog::trim_entries(const DoutPrefixProvider *dpp, int shard_id, std::string_view marker,
+ librados::AioCompletion* c)
+{
+ assert(shard_id < num_shards);
+ bes->trim_entries(dpp, shard_id, marker, c);
+ return 0;
+}
+
+bool RGWDataChangesLog::going_down() const
+{
+ return down_flag;
+}
+
+RGWDataChangesLog::~RGWDataChangesLog() {
+ down_flag = true;
+ if (renew_thread.joinable()) {
+ renew_stop();
+ renew_thread.join();
+ }
+}
+
+void RGWDataChangesLog::renew_run() noexcept {
+ static constexpr auto runs_per_prune = 150;
+ auto run = 0;
+ for (;;) {
+ const DoutPrefix dp(cct, dout_subsys, "rgw data changes log: ");
+ ldpp_dout(&dp, 2) << "RGWDataChangesLog::ChangesRenewThread: start" << dendl;
+ int r = renew_entries(&dp);
+ if (r < 0) {
+ ldpp_dout(&dp, 0) << "ERROR: RGWDataChangesLog::renew_entries returned error r=" << r << dendl;
+ }
+
+ if (going_down())
+ break;
+
+ if (run == runs_per_prune) {
+ std::optional<uint64_t> through;
+ ldpp_dout(&dp, 2) << "RGWDataChangesLog::ChangesRenewThread: pruning old generations" << dendl;
+ // This null_yield can stay, for now, as it's in its own thread.
+      r = trim_generations(&dp, through, null_yield);
+ if (r < 0) {
+ derr << "RGWDataChangesLog::ChangesRenewThread: failed pruning r="
+ << r << dendl;
+ } else if (through) {
+ ldpp_dout(&dp, 2) << "RGWDataChangesLog::ChangesRenewThread: pruned generations "
+ << "through " << *through << "." << dendl;
+ } else {
+ ldpp_dout(&dp, 2) << "RGWDataChangesLog::ChangesRenewThread: nothing to prune."
+ << dendl;
+ }
+ run = 0;
+ } else {
+ ++run;
+ }
+
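+    // Sleep for 3/4 of the data log window so entries are renewed before
+    // their expiration (cur_expiration is one full window past cur_sent).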
+ int interval = cct->_conf->rgw_data_log_window * 3 / 4;
+ std::unique_lock locker{renew_lock};
+ renew_cond.wait_for(locker, std::chrono::seconds(interval));
+ }
+}
+
+void RGWDataChangesLog::renew_stop()
+{
+ std::lock_guard l{renew_lock};
+ renew_cond.notify_all();
+}
+
+void RGWDataChangesLog::mark_modified(int shard_id, const rgw_bucket_shard& bs, uint64_t gen)
+{
+ if (!cct->_conf->rgw_data_notify_interval_msec) {
+ return;
+ }
+
+ auto key = bs.get_key();
+ {
+ std::shared_lock rl{modified_lock}; // read lock to check for existence
+ auto shard = modified_shards.find(shard_id);
+ if (shard != modified_shards.end() && shard->second.count({key, gen})) {
+ return;
+ }
+ }
+
+ std::unique_lock wl{modified_lock}; // write lock for insertion
+ modified_shards[shard_id].insert(rgw_data_notify_entry{key, gen});
+}
+
+std::string RGWDataChangesLog::max_marker() const {
+ return gencursor(std::numeric_limits<uint64_t>::max(),
+ "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~");
+}
+
+int RGWDataChangesLog::change_format(const DoutPrefixProvider *dpp, log_type type, optional_yield y) {
+ return ceph::from_error_code(bes->new_backing(dpp, type, y));
+}
+
+int RGWDataChangesLog::trim_generations(const DoutPrefixProvider *dpp,
+ std::optional<uint64_t>& through,
+ optional_yield y) {
+ return bes->trim_generations(dpp, through, y);
+}
+
+void RGWDataChangesLogInfo::dump(Formatter *f) const
+{
+ encode_json("marker", marker, f);
+ utime_t ut(last_update);
+ encode_json("last_update", ut, f);
+}
+
+void RGWDataChangesLogInfo::decode_json(JSONObj *obj)
+{
+ JSONDecoder::decode_json("marker", marker, obj);
+ utime_t ut;
+ JSONDecoder::decode_json("last_update", ut, obj);
+ last_update = ut.to_real_time();
+}
+
+
diff --git a/src/rgw/driver/rados/rgw_datalog.h b/src/rgw/driver/rados/rgw_datalog.h
new file mode 100644
index 000000000..174cf86de
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_datalog.h
@@ -0,0 +1,394 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include <cstdint>
+#include <list>
+#include <memory>
+#include <string>
+#include <string_view>
+#include <variant>
+#include <vector>
+
+#include <boost/container/flat_map.hpp>
+#include <boost/container/flat_set.hpp>
+#include <boost/smart_ptr/intrusive_ptr.hpp>
+#include <boost/smart_ptr/intrusive_ref_counter.hpp>
+
+#include <fmt/format.h>
+
+#include "common/async/yield_context.h"
+#include "include/buffer.h"
+#include "include/encoding.h"
+#include "include/function2.hpp"
+
+#include "include/rados/librados.hpp"
+
+#include "common/ceph_context.h"
+#include "common/ceph_json.h"
+#include "common/ceph_time.h"
+#include "common/Formatter.h"
+#include "common/lru_map.h"
+#include "common/RefCountedObj.h"
+
+#include "cls/log/cls_log_types.h"
+
+#include "rgw_basic_types.h"
+#include "rgw_log_backing.h"
+#include "rgw_sync_policy.h"
+#include "rgw_zone.h"
+#include "rgw_trim_bilog.h"
+
+namespace bc = boost::container;
+
+enum DataLogEntityType {
+ ENTITY_TYPE_UNKNOWN = 0,
+ ENTITY_TYPE_BUCKET = 1,
+};
+
+struct rgw_data_change {
+ DataLogEntityType entity_type;
+ std::string key;
+ ceph::real_time timestamp;
+ uint64_t gen = 0;
+
+ void encode(ceph::buffer::list& bl) const {
+ // require decoders to recognize v2 when gen>0
+ const uint8_t compat = (gen == 0) ? 1 : 2;
+ ENCODE_START(2, compat, bl);
+ auto t = std::uint8_t(entity_type);
+ encode(t, bl);
+ encode(key, bl);
+ encode(timestamp, bl);
+ encode(gen, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::const_iterator& bl) {
+ DECODE_START(2, bl);
+ std::uint8_t t;
+ decode(t, bl);
+ entity_type = DataLogEntityType(t);
+ decode(key, bl);
+ decode(timestamp, bl);
+ if (struct_v < 2) {
+ gen = 0;
+ } else {
+ decode(gen, bl);
+ }
+ DECODE_FINISH(bl);
+ }
+
+ void dump(ceph::Formatter* f) const;
+ void decode_json(JSONObj* obj);
+};
+WRITE_CLASS_ENCODER(rgw_data_change)
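+
+// Illustrative sketch (not part of the upstream change): the compat byte above
+// lets old decoders keep reading entries whose gen is 0, while entries with a
+// non-zero generation require a v2-aware decoder. A rough round trip:
+//
+//   rgw_data_change in, out;
+//   in.entity_type = ENTITY_TYPE_BUCKET;
+//   in.key = "example-bucket-shard-key";   // hypothetical key
+//   in.gen = 0;                            // encoded with compat = 1
+//   ceph::buffer::list bl;
+//   encode(in, bl);
+//   auto p = bl.cbegin();
+//   decode(out, p);                        // out.gen == 0, same key/type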
+
+struct rgw_data_change_log_entry {
+ std::string log_id;
+ ceph::real_time log_timestamp;
+ rgw_data_change entry;
+
+ void encode(ceph::buffer::list& bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(log_id, bl);
+ encode(log_timestamp, bl);
+ encode(entry, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(ceph::buffer::list::const_iterator& bl) {
+ DECODE_START(1, bl);
+ decode(log_id, bl);
+ decode(log_timestamp, bl);
+ decode(entry, bl);
+ DECODE_FINISH(bl);
+ }
+
+ void dump(ceph::Formatter* f) const;
+ void decode_json(JSONObj* obj);
+};
+WRITE_CLASS_ENCODER(rgw_data_change_log_entry)
+
+struct RGWDataChangesLogInfo {
+ std::string marker;
+ ceph::real_time last_update;
+
+ void dump(ceph::Formatter* f) const;
+ void decode_json(JSONObj* obj);
+};
+
+struct RGWDataChangesLogMarker {
+ int shard = 0;
+ std::string marker;
+
+ RGWDataChangesLogMarker() = default;
+};
+
+class RGWDataChangesLog;
+
+struct rgw_data_notify_entry {
+ std::string key;
+ uint64_t gen = 0;
+
+ void dump(ceph::Formatter* f) const;
+ void decode_json(JSONObj* obj);
+
+ rgw_data_notify_entry& operator=(const rgw_data_notify_entry&) = default;
+
+ bool operator <(const rgw_data_notify_entry& d) const {
+ if (key < d.key) {
+ return true;
+ }
+ if (d.key < key) {
+ return false;
+ }
+ return gen < d.gen;
+ }
+ friend std::ostream& operator <<(std::ostream& m,
+ const rgw_data_notify_entry& e) {
+ return m << "[key: " << e.key << ", gen: " << e.gen << "]";
+ }
+};
+
+class RGWDataChangesBE;
+
+class DataLogBackends final
+ : public logback_generations,
+ private bc::flat_map<uint64_t, boost::intrusive_ptr<RGWDataChangesBE>> {
+ friend class logback_generations;
+ friend class GenTrim;
+
+ std::mutex m;
+ RGWDataChangesLog& datalog;
+
+ DataLogBackends(librados::IoCtx& ioctx,
+ std::string oid,
+ fu2::unique_function<std::string(
+ uint64_t, int) const>&& get_oid,
+ int shards, RGWDataChangesLog& datalog) noexcept
+ : logback_generations(ioctx, oid, std::move(get_oid),
+ shards), datalog(datalog) {}
+public:
+
+ boost::intrusive_ptr<RGWDataChangesBE> head() {
+ std::unique_lock l(m);
+ auto i = end();
+ --i;
+ return i->second;
+ }
+ int list(const DoutPrefixProvider *dpp, int shard, int max_entries,
+ std::vector<rgw_data_change_log_entry>& entries,
+ std::string_view marker, std::string* out_marker, bool* truncated,
+ optional_yield y);
+ int trim_entries(const DoutPrefixProvider *dpp, int shard_id,
+ std::string_view marker, optional_yield y);
+ void trim_entries(const DoutPrefixProvider *dpp, int shard_id, std::string_view marker,
+ librados::AioCompletion* c);
+ void set_zero(RGWDataChangesBE* be) {
+ emplace(0, be);
+ }
+
+ bs::error_code handle_init(entries_t e) noexcept override;
+ bs::error_code handle_new_gens(entries_t e) noexcept override;
+ bs::error_code handle_empty_to(uint64_t new_tail) noexcept override;
+
+ int trim_generations(const DoutPrefixProvider *dpp,
+ std::optional<uint64_t>& through,
+ optional_yield y);
+};
+
+struct BucketGen {
+ rgw_bucket_shard shard;
+ uint64_t gen;
+
+ BucketGen(const rgw_bucket_shard& shard, uint64_t gen)
+ : shard(shard), gen(gen) {}
+
+ BucketGen(rgw_bucket_shard&& shard, uint64_t gen)
+ : shard(std::move(shard)), gen(gen) {}
+
+ BucketGen(const BucketGen&) = default;
+ BucketGen(BucketGen&&) = default;
+ BucketGen& operator =(const BucketGen&) = default;
+ BucketGen& operator =(BucketGen&&) = default;
+
+ ~BucketGen() = default;
+};
+
+inline bool operator ==(const BucketGen& l, const BucketGen& r) {
+ return (l.shard == r.shard) && (l.gen == r.gen);
+}
+
+inline bool operator <(const BucketGen& l, const BucketGen& r) {
+ if (l.shard < r.shard) {
+ return true;
+ } else if (l.shard == r.shard) {
+ return l.gen < r.gen;
+ } else {
+ return false;
+ }
+}
+
+class RGWDataChangesLog {
+ friend DataLogBackends;
+ CephContext *cct;
+ librados::IoCtx ioctx;
+ rgw::BucketChangeObserver *observer = nullptr;
+ const RGWZone* zone;
+ std::unique_ptr<DataLogBackends> bes;
+
+ const int num_shards;
+ std::string get_prefix() {
+ auto prefix = cct->_conf->rgw_data_log_obj_prefix;
+ return prefix.empty() ? prefix : "data_log";
+ }
+ std::string metadata_log_oid() {
+ return get_prefix() + "generations_metadata";
+ }
+ std::string prefix;
+
+ ceph::mutex lock = ceph::make_mutex("RGWDataChangesLog::lock");
+ ceph::shared_mutex modified_lock =
+ ceph::make_shared_mutex("RGWDataChangesLog::modified_lock");
+ bc::flat_map<int, bc::flat_set<rgw_data_notify_entry>> modified_shards;
+
+ std::atomic<bool> down_flag = { false };
+
+ struct ChangeStatus {
+ std::shared_ptr<const rgw_sync_policy_info> sync_policy;
+ ceph::real_time cur_expiration;
+ ceph::real_time cur_sent;
+ bool pending = false;
+ RefCountedCond* cond = nullptr;
+ ceph::mutex lock = ceph::make_mutex("RGWDataChangesLog::ChangeStatus");
+ };
+
+ using ChangeStatusPtr = std::shared_ptr<ChangeStatus>;
+
+ lru_map<BucketGen, ChangeStatusPtr> changes;
+
+ bc::flat_set<BucketGen> cur_cycle;
+
+ ChangeStatusPtr _get_change(const rgw_bucket_shard& bs, uint64_t gen);
+ void register_renew(const rgw_bucket_shard& bs,
+ const rgw::bucket_log_layout_generation& gen);
+ void update_renewed(const rgw_bucket_shard& bs,
+ uint64_t gen,
+ ceph::real_time expiration);
+
+ ceph::mutex renew_lock = ceph::make_mutex("ChangesRenewThread::lock");
+ ceph::condition_variable renew_cond;
+ void renew_run() noexcept;
+ void renew_stop();
+ std::thread renew_thread;
+
+ std::function<bool(const rgw_bucket& bucket, optional_yield y, const DoutPrefixProvider *dpp)> bucket_filter;
+ bool going_down() const;
+ bool filter_bucket(const DoutPrefixProvider *dpp, const rgw_bucket& bucket, optional_yield y) const;
+ int renew_entries(const DoutPrefixProvider *dpp);
+
+public:
+
+ RGWDataChangesLog(CephContext* cct);
+ ~RGWDataChangesLog();
+
+ int start(const DoutPrefixProvider *dpp, const RGWZone* _zone, const RGWZoneParams& zoneparams,
+ librados::Rados* lr);
+ int choose_oid(const rgw_bucket_shard& bs);
+ int add_entry(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info,
+ const rgw::bucket_log_layout_generation& gen, int shard_id,
+ optional_yield y);
+ int get_log_shard_id(rgw_bucket& bucket, int shard_id);
+ int list_entries(const DoutPrefixProvider *dpp, int shard, int max_entries,
+ std::vector<rgw_data_change_log_entry>& entries,
+ std::string_view marker, std::string* out_marker,
+ bool* truncated, optional_yield y);
+ int trim_entries(const DoutPrefixProvider *dpp, int shard_id,
+ std::string_view marker, optional_yield y);
+ int trim_entries(const DoutPrefixProvider *dpp, int shard_id, std::string_view marker,
+ librados::AioCompletion* c); // :(
+ int get_info(const DoutPrefixProvider *dpp, int shard_id,
+ RGWDataChangesLogInfo *info, optional_yield y);
+
+ using LogMarker = RGWDataChangesLogMarker;
+
+ int list_entries(const DoutPrefixProvider *dpp, int max_entries,
+ std::vector<rgw_data_change_log_entry>& entries,
+ LogMarker& marker, bool* ptruncated,
+ optional_yield y);
+
+ void mark_modified(int shard_id, const rgw_bucket_shard& bs, uint64_t gen);
+ auto read_clear_modified() {
+ std::unique_lock wl{modified_lock};
+ decltype(modified_shards) modified;
+ modified.swap(modified_shards);
+ modified_shards.clear();
+ return modified;
+ }
+
+ void set_observer(rgw::BucketChangeObserver *observer) {
+ this->observer = observer;
+ }
+
+ void set_bucket_filter(decltype(bucket_filter)&& f) {
+ bucket_filter = std::move(f);
+ }
+ // a marker that compares greater than any other
+ std::string max_marker() const;
+ std::string get_oid(uint64_t gen_id, int shard_id) const;
+
+
+ int change_format(const DoutPrefixProvider *dpp, log_type type, optional_yield y);
+ int trim_generations(const DoutPrefixProvider *dpp,
+ std::optional<uint64_t>& through,
+ optional_yield y);
+};
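+
+// Illustrative usage sketch (not part of the upstream change); dpp, zone,
+// zoneparams, rados, bucket_info, log_layout_gen, shard, marker and y are all
+// assumed to be provided by the caller:
+//
+//   RGWDataChangesLog datalog(cct);
+//   int r = datalog.start(dpp, &zone, zoneparams, rados);
+//   // record a change for one bucket index shard
+//   r = datalog.add_entry(dpp, bucket_info, log_layout_gen, shard, y);
+//   // consumers poll shards and trim what they have processed
+//   std::vector<rgw_data_change_log_entry> entries;
+//   bool truncated = false;
+//   r = datalog.list_entries(dpp, shard, 100, entries, marker, &marker,
+//                            &truncated, y);
+//   r = datalog.trim_entries(dpp, shard, marker, y);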
+
+class RGWDataChangesBE : public boost::intrusive_ref_counter<RGWDataChangesBE> {
+protected:
+ librados::IoCtx& ioctx;
+ CephContext* const cct;
+ RGWDataChangesLog& datalog;
+
+ std::string get_oid(int shard_id) {
+ return datalog.get_oid(gen_id, shard_id);
+ }
+public:
+ using entries = std::variant<std::list<cls_log_entry>,
+ std::vector<ceph::buffer::list>>;
+
+ const uint64_t gen_id;
+
+ RGWDataChangesBE(librados::IoCtx& ioctx,
+ RGWDataChangesLog& datalog,
+ uint64_t gen_id)
+ : ioctx(ioctx), cct(static_cast<CephContext*>(ioctx.cct())),
+ datalog(datalog), gen_id(gen_id) {}
+ virtual ~RGWDataChangesBE() = default;
+
+ virtual void prepare(ceph::real_time now,
+ const std::string& key,
+ ceph::buffer::list&& entry,
+ entries& out) = 0;
+ virtual int push(const DoutPrefixProvider *dpp, int index, entries&& items,
+ optional_yield y) = 0;
+ virtual int push(const DoutPrefixProvider *dpp, int index, ceph::real_time now,
+ const std::string& key, ceph::buffer::list&& bl,
+ optional_yield y) = 0;
+ virtual int list(const DoutPrefixProvider *dpp, int shard, int max_entries,
+ std::vector<rgw_data_change_log_entry>& entries,
+ std::optional<std::string_view> marker,
+ std::string* out_marker, bool* truncated,
+ optional_yield y) = 0;
+ virtual int get_info(const DoutPrefixProvider *dpp, int index,
+ RGWDataChangesLogInfo *info, optional_yield y) = 0;
+ virtual int trim(const DoutPrefixProvider *dpp, int index,
+ std::string_view marker, optional_yield y) = 0;
+ virtual int trim(const DoutPrefixProvider *dpp, int index,
+ std::string_view marker, librados::AioCompletion* c) = 0;
+ virtual std::string_view max_marker() const = 0;
+ // 1 on empty, 0 on non-empty, negative on error.
+ virtual int is_empty(const DoutPrefixProvider *dpp, optional_yield y) = 0;
+};
diff --git a/src/rgw/driver/rados/rgw_datalog_notify.cc b/src/rgw/driver/rados/rgw_datalog_notify.cc
new file mode 100644
index 000000000..12cdc532f
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_datalog_notify.cc
@@ -0,0 +1,76 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "rgw_datalog_notify.h"
+#include "rgw_datalog.h"
+
+// custom encoding for v1 notify API
+struct EntryEncoderV1 {
+ const rgw_data_notify_entry& entry;
+};
+struct SetEncoderV1 {
+ const bc::flat_set<rgw_data_notify_entry>& entries;
+};
+
+// encode rgw_data_notify_entry as string
+void encode_json(const char *name, const EntryEncoderV1& e, Formatter *f)
+{
+ f->dump_string(name, e.entry.key); // encode the key only
+}
+// encode set<rgw_data_notify_entry> as set<string>
+void encode_json(const char *name, const SetEncoderV1& e, Formatter *f)
+{
+ f->open_array_section(name);
+ for (auto& entry : e.entries) {
+ encode_json("obj", EntryEncoderV1{entry}, f);
+ }
+ f->close_section();
+}
+// encode map<int, set<rgw_data_notify_entry>> as map<int, set<string>>
+void encode_json(const char *name, const rgw_data_notify_v1_encoder& e, Formatter *f)
+{
+ f->open_array_section(name);
+ for (auto& [key, val] : e.shards) {
+ f->open_object_section("entry");
+ encode_json("key", key, f);
+ encode_json("val", SetEncoderV1{val}, f);
+ f->close_section();
+ }
+ f->close_section();
+}
+
+struct EntryDecoderV1 {
+ rgw_data_notify_entry& entry;
+};
+struct SetDecoderV1 {
+ bc::flat_set<rgw_data_notify_entry>& entries;
+};
+
+// decode string into rgw_data_notify_entry
+void decode_json_obj(EntryDecoderV1& d, JSONObj *obj)
+{
+ decode_json_obj(d.entry.key, obj);
+ d.entry.gen = 0;
+}
+// decode set<string> into set<rgw_data_notify_entry>
+void decode_json_obj(SetDecoderV1& d, JSONObj *obj)
+{
+ for (JSONObjIter o = obj->find_first(); !o.end(); ++o) {
+ rgw_data_notify_entry val;
+ auto decoder = EntryDecoderV1{val};
+ decode_json_obj(decoder, *o);
+ d.entries.insert(std::move(val));
+ }
+}
+// decode map<int, set<string>> into map<int, set<rgw_data_notify_entry>>
+void decode_json_obj(rgw_data_notify_v1_decoder& d, JSONObj *obj)
+{
+ for (JSONObjIter o = obj->find_first(); !o.end(); ++o) {
+ int shard_id = 0;
+ JSONDecoder::decode_json("key", shard_id, *o);
+ bc::flat_set<rgw_data_notify_entry> val;
+ SetDecoderV1 decoder{val};
+ JSONDecoder::decode_json("val", decoder, *o);
+ d.shards[shard_id] = std::move(val);
+ }
+}
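+
+// Illustrative note (not part of the upstream change): with these encoders the
+// v1 notify payload carries only bucket-shard keys, never generations. A map
+// such as { 3 -> { {key="b:0", gen=5} } } (hypothetical values) is encoded
+// roughly as
+//
+//   [ { "key": 3, "val": [ "b:0" ] } ]
+//
+// and decoding the same JSON yields entries whose gen is reset to 0.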
diff --git a/src/rgw/driver/rados/rgw_datalog_notify.h b/src/rgw/driver/rados/rgw_datalog_notify.h
new file mode 100644
index 000000000..4cd1b3c11
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_datalog_notify.h
@@ -0,0 +1,31 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include <boost/container/flat_map.hpp>
+#include <boost/container/flat_set.hpp>
+
+#include "rgw_datalog.h"
+
+namespace bc = boost::container;
+
+namespace ceph { class Formatter; }
+class JSONObj;
+
+class RGWCoroutine;
+class RGWHTTPManager;
+class RGWRESTConn;
+
+struct rgw_data_notify_entry;
+
+// json encoder and decoder for notify v1 API
+struct rgw_data_notify_v1_encoder {
+ const bc::flat_map<int, bc::flat_set<rgw_data_notify_entry>>& shards;
+};
+void encode_json(const char *name, const rgw_data_notify_v1_encoder& e,
+ ceph::Formatter *f);
+struct rgw_data_notify_v1_decoder {
+ bc::flat_map<int, bc::flat_set<rgw_data_notify_entry>>& shards;
+};
+void decode_json_obj(rgw_data_notify_v1_decoder& d, JSONObj *obj);
diff --git a/src/rgw/driver/rados/rgw_etag_verifier.cc b/src/rgw/driver/rados/rgw_etag_verifier.cc
new file mode 100644
index 000000000..52f7c7948
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_etag_verifier.cc
@@ -0,0 +1,191 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "rgw_etag_verifier.h"
+#include "rgw_obj_manifest.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+namespace rgw::putobj {
+
+int create_etag_verifier(const DoutPrefixProvider *dpp,
+ CephContext* cct, rgw::sal::DataProcessor* filter,
+ const bufferlist& manifest_bl,
+ const std::optional<RGWCompressionInfo>& compression,
+ etag_verifier_ptr& verifier)
+{
+ RGWObjManifest manifest;
+
+ try {
+ auto miter = manifest_bl.cbegin();
+ decode(manifest, miter);
+ } catch (buffer::error& err) {
+ ldpp_dout(dpp, 0) << "ERROR: couldn't decode manifest" << dendl;
+ return -EIO;
+ }
+
+ RGWObjManifestRule rule;
+ bool found = manifest.get_rule(0, &rule);
+ if (!found) {
+ ldpp_dout(dpp, -1) << "ERROR: manifest->get_rule() could not find rule" << dendl;
+ return -EIO;
+ }
+
+ if (rule.start_part_num == 0) {
+ /* Atomic object */
+ verifier.emplace<ETagVerifier_Atomic>(cct, filter);
+ return 0;
+ }
+
+ uint64_t cur_part_ofs = UINT64_MAX;
+ std::vector<uint64_t> part_ofs;
+
+ /*
+   * We must store the offset of each part to calculate the ETag for each
+   * MPU part. These part ETags then become the input for the MPU object
+   * ETag.
+ */
+ for (auto mi = manifest.obj_begin(dpp); mi != manifest.obj_end(dpp); ++mi) {
+ if (cur_part_ofs == mi.get_part_ofs())
+ continue;
+ cur_part_ofs = mi.get_part_ofs();
+ ldpp_dout(dpp, 20) << "MPU Part offset:" << cur_part_ofs << dendl;
+ part_ofs.push_back(cur_part_ofs);
+ }
+
+ if (compression) {
+ // if the source object was compressed, the manifest is storing
+ // compressed part offsets. transform the compressed offsets back to
+ // their original offsets by finding the first block of each part
+ const auto& blocks = compression->blocks;
+ auto block = blocks.begin();
+ for (auto& ofs : part_ofs) {
+ // find the compression_block with new_ofs == ofs
+ constexpr auto less = [] (const compression_block& block, uint64_t ofs) {
+ return block.new_ofs < ofs;
+ };
+ block = std::lower_bound(block, blocks.end(), ofs, less);
+ if (block == blocks.end() || block->new_ofs != ofs) {
+ ldpp_dout(dpp, 4) << "no match for compressed offset " << ofs
+ << ", disabling etag verification" << dendl;
+ return -EIO;
+ }
+ ofs = block->old_ofs;
+ ldpp_dout(dpp, 20) << "MPU Part uncompressed offset:" << ofs << dendl;
+ }
+ }
+
+ verifier.emplace<ETagVerifier_MPU>(cct, std::move(part_ofs), filter);
+ return 0;
+}
+
+int ETagVerifier_Atomic::process(bufferlist&& in, uint64_t logical_offset)
+{
+ bufferlist out;
+ if (in.length() > 0)
+ hash.Update((const unsigned char *)in.c_str(), in.length());
+
+ return Pipe::process(std::move(in), logical_offset);
+}
+
+void ETagVerifier_Atomic::calculate_etag()
+{
+ unsigned char m[CEPH_CRYPTO_MD5_DIGESTSIZE];
+ char calc_md5[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 1];
+
+ /* Return early if ETag has already been calculated */
+ if (!calculated_etag.empty())
+ return;
+
+ hash.Final(m);
+ buf_to_hex(m, CEPH_CRYPTO_MD5_DIGESTSIZE, calc_md5);
+ calculated_etag = calc_md5;
+ ldout(cct, 20) << "Single part object: " << " etag:" << calculated_etag
+ << dendl;
+}
+
+void ETagVerifier_MPU::process_end_of_MPU_part()
+{
+ unsigned char m[CEPH_CRYPTO_MD5_DIGESTSIZE];
+ char calc_md5_part[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 1];
+ std::string calculated_etag_part;
+
+ hash.Final(m);
+ mpu_etag_hash.Update((const unsigned char *)m, sizeof(m));
+ hash.Restart();
+
+ if (cct->_conf->subsys.should_gather(dout_subsys, 20)) {
+ buf_to_hex(m, CEPH_CRYPTO_MD5_DIGESTSIZE, calc_md5_part);
+ calculated_etag_part = calc_md5_part;
+ ldout(cct, 20) << "Part etag: " << calculated_etag_part << dendl;
+ }
+
+ cur_part_index++;
+ next_part_index++;
+}
+
+int ETagVerifier_MPU::process(bufferlist&& in, uint64_t logical_offset)
+{
+ uint64_t bl_end = in.length() + logical_offset;
+
+ /* Handle the last MPU part */
+ if (size_t(next_part_index) == part_ofs.size()) {
+ hash.Update((const unsigned char *)in.c_str(), in.length());
+ goto done;
+ }
+
+ /* Incoming bufferlist spans two MPU parts. Calculate separate ETags */
+ if (bl_end > part_ofs[next_part_index]) {
+
+ uint64_t part_one_len = part_ofs[next_part_index] - logical_offset;
+ hash.Update((const unsigned char *)in.c_str(), part_one_len);
+ process_end_of_MPU_part();
+
+ hash.Update((const unsigned char *)in.c_str() + part_one_len,
+ bl_end - part_ofs[cur_part_index]);
+ /*
+ * If we've moved to the last part of the MPU, avoid usage of
+     * part_ofs[next_part_index] as it would lead to out-of-range access.
+ */
+ if (size_t(next_part_index) == part_ofs.size())
+ goto done;
+ } else {
+ hash.Update((const unsigned char *)in.c_str(), in.length());
+ }
+
+ /* Update the MPU Etag if the current part has ended */
+ if (logical_offset + in.length() + 1 == part_ofs[next_part_index])
+ process_end_of_MPU_part();
+
+done:
+ return Pipe::process(std::move(in), logical_offset);
+}
+
+void ETagVerifier_MPU::calculate_etag()
+{
+ const uint32_t parts = part_ofs.size();
+ constexpr auto digits10 = std::numeric_limits<uint32_t>::digits10;
+ constexpr auto extra = 2 + digits10; // add "-%u\0" at the end
+
+ unsigned char m[CEPH_CRYPTO_MD5_DIGESTSIZE], mpu_m[CEPH_CRYPTO_MD5_DIGESTSIZE];
+ char final_etag_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + extra];
+
+ /* Return early if ETag has already been calculated */
+ if (!calculated_etag.empty())
+ return;
+
+ hash.Final(m);
+ mpu_etag_hash.Update((const unsigned char *)m, sizeof(m));
+
+ /* Refer RGWCompleteMultipart::execute() for ETag calculation for MPU object */
+ mpu_etag_hash.Final(mpu_m);
+ buf_to_hex(mpu_m, CEPH_CRYPTO_MD5_DIGESTSIZE, final_etag_str);
+ snprintf(&final_etag_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2],
+ sizeof(final_etag_str) - CEPH_CRYPTO_MD5_DIGESTSIZE * 2,
+ "-%u", parts);
+
+ calculated_etag = final_etag_str;
+ ldout(cct, 20) << "MPU calculated ETag:" << calculated_etag << dendl;
+}
+
+} // namespace rgw::putobj
diff --git a/src/rgw/driver/rados/rgw_etag_verifier.h b/src/rgw/driver/rados/rgw_etag_verifier.h
new file mode 100644
index 000000000..18a4f5a3f
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_etag_verifier.h
@@ -0,0 +1,90 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * The RGW ETag Verifier is an RGW filter which enables objects copied via
+ * multisite sync to be verified against their ETag from the source, i.e. the
+ * MD5 checksum of the object is computed at the destination and verified to
+ * be identical to the ETag stored in the object HEAD at the source cluster.
+ *
+ * For MPU objects, a different filter named RGWMultipartEtagFilter is applied,
+ * which re-computes the ETag using RGWObjManifest. It computes the ETag using
+ * the same algorithm used at the source cluster, i.e. the MD5 sum of the
+ * individual ETags of the MPU parts.
+ */
+
+#pragma once
+
+#include "rgw_putobj.h"
+#include "rgw_op.h"
+#include "common/static_ptr.h"
+
+namespace rgw::putobj {
+
+class ETagVerifier : public rgw::putobj::Pipe
+{
+protected:
+ CephContext* cct;
+ MD5 hash;
+ std::string calculated_etag;
+
+public:
+ ETagVerifier(CephContext* cct_, rgw::sal::DataProcessor *next)
+ : Pipe(next), cct(cct_) {
+ // Allow use of MD5 digest in FIPS mode for non-cryptographic purposes
+ hash.SetFlags(EVP_MD_CTX_FLAG_NON_FIPS_ALLOW);
+ }
+
+ virtual void calculate_etag() = 0;
+ std::string get_calculated_etag() { return calculated_etag;}
+
+}; /* ETagVerifier */
+
+class ETagVerifier_Atomic : public ETagVerifier
+{
+public:
+ ETagVerifier_Atomic(CephContext* cct_, rgw::sal::DataProcessor *next)
+ : ETagVerifier(cct_, next) {}
+
+ int process(bufferlist&& data, uint64_t logical_offset) override;
+ void calculate_etag() override;
+
+}; /* ETagVerifier_Atomic */
+
+class ETagVerifier_MPU : public ETagVerifier
+{
+ std::vector<uint64_t> part_ofs;
+ uint64_t cur_part_index{0}, next_part_index{1};
+ MD5 mpu_etag_hash;
+
+ void process_end_of_MPU_part();
+
+public:
+ ETagVerifier_MPU(CephContext* cct,
+ std::vector<uint64_t> part_ofs,
+ rgw::sal::DataProcessor *next)
+ : ETagVerifier(cct, next),
+ part_ofs(std::move(part_ofs))
+ {
+ // Allow use of MD5 digest in FIPS mode for non-cryptographic purposes
+ hash.SetFlags(EVP_MD_CTX_FLAG_NON_FIPS_ALLOW);
+ }
+
+ int process(bufferlist&& data, uint64_t logical_offset) override;
+ void calculate_etag() override;
+
+}; /* ETagVerifier_MPU */
+
+constexpr auto max_etag_verifier_size = std::max(
+ sizeof(ETagVerifier_Atomic),
+ sizeof(ETagVerifier_MPU)
+ );
+using etag_verifier_ptr = ceph::static_ptr<ETagVerifier, max_etag_verifier_size>;
+
+int create_etag_verifier(const DoutPrefixProvider *dpp,
+ CephContext* cct, rgw::sal::DataProcessor* next,
+ const bufferlist& manifest_bl,
+ const std::optional<RGWCompressionInfo>& compression,
+ etag_verifier_ptr& verifier);
+
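+// Illustrative usage sketch (not part of the upstream change); dpp, cct,
+// next_processor, manifest_bl, compression and source_etag are assumed to be
+// provided by the caller:
+//
+//   rgw::putobj::etag_verifier_ptr verifier;
+//   int r = create_etag_verifier(dpp, cct, next_processor, manifest_bl,
+//                                compression, verifier);
+//   if (r == 0) {
+//     // feed object data through verifier->process(...), then:
+//     verifier->calculate_etag();
+//     bool match = (verifier->get_calculated_etag() == source_etag);
+//   }
+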
+} // namespace rgw::putobj
diff --git a/src/rgw/driver/rados/rgw_gc.cc b/src/rgw/driver/rados/rgw_gc.cc
new file mode 100644
index 000000000..bd16bde1b
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_gc.cc
@@ -0,0 +1,811 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "rgw_gc.h"
+
+#include "rgw_tools.h"
+#include "include/scope_guard.h"
+#include "include/rados/librados.hpp"
+#include "cls/rgw/cls_rgw_client.h"
+#include "cls/rgw_gc/cls_rgw_gc_client.h"
+#include "cls/refcount/cls_refcount_client.h"
+#include "cls/version/cls_version_client.h"
+#include "rgw_perf_counters.h"
+#include "cls/lock/cls_lock_client.h"
+#include "include/random.h"
+#include "rgw_gc_log.h"
+
+#include <list> // XXX
+#include <sstream>
+#include "xxhash.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+using namespace librados;
+
+static string gc_oid_prefix = "gc";
+static string gc_index_lock_name = "gc_process";
+
+void RGWGC::initialize(CephContext *_cct, RGWRados *_store) {
+ cct = _cct;
+ store = _store;
+
+ max_objs = min(static_cast<int>(cct->_conf->rgw_gc_max_objs), rgw_shards_max());
+
+ obj_names = new string[max_objs];
+
+ for (int i = 0; i < max_objs; i++) {
+ obj_names[i] = gc_oid_prefix;
+ char buf[32];
+ snprintf(buf, 32, ".%d", i);
+ obj_names[i].append(buf);
+
+ auto it = transitioned_objects_cache.begin() + i;
+ transitioned_objects_cache.insert(it, false);
+
+ //version = 0 -> not ready for transition
+ //version = 1 -> marked ready for transition
+ librados::ObjectWriteOperation op;
+ op.create(false);
+ const uint64_t queue_size = cct->_conf->rgw_gc_max_queue_size, num_deferred_entries = cct->_conf->rgw_gc_max_deferred;
+ gc_log_init2(op, queue_size, num_deferred_entries);
+ store->gc_operate(this, obj_names[i], &op);
+ }
+}
+
+void RGWGC::finalize()
+{
+ delete[] obj_names;
+}
+
+int RGWGC::tag_index(const string& tag)
+{
+ return rgw_shards_mod(XXH64(tag.c_str(), tag.size(), seed), max_objs);
+}
+
+std::tuple<int, std::optional<cls_rgw_obj_chain>> RGWGC::send_split_chain(const cls_rgw_obj_chain& chain, const std::string& tag)
+{
+ ldpp_dout(this, 20) << "RGWGC::send_split_chain - tag is: " << tag << dendl;
+
+ if (cct->_conf->rgw_max_chunk_size) {
+ cls_rgw_obj_chain broken_chain;
+ ldpp_dout(this, 20) << "RGWGC::send_split_chain - rgw_max_chunk_size is: " << cct->_conf->rgw_max_chunk_size << dendl;
+
+ for (auto it = chain.objs.begin(); it != chain.objs.end(); it++) {
+ ldpp_dout(this, 20) << "RGWGC::send_split_chain - adding obj with name: " << it->key << dendl;
+ broken_chain.objs.emplace_back(*it);
+ cls_rgw_gc_obj_info info;
+ info.tag = tag;
+ info.chain = broken_chain;
+ cls_rgw_gc_set_entry_op op;
+ op.info = info;
+ size_t total_encoded_size = op.estimate_encoded_size();
+ ldpp_dout(this, 20) << "RGWGC::send_split_chain - total_encoded_size is: " << total_encoded_size << dendl;
+
+      if (total_encoded_size > cct->_conf->rgw_max_chunk_size) { // don't add to chain, and send to gc
+ broken_chain.objs.pop_back();
+ --it;
+        ldpp_dout(this, 20) << "RGWGC::send_split_chain - over max chunk size, don't add to broken chain and send chain" << dendl;
+ auto ret = send_chain(broken_chain, tag);
+ if (ret < 0) {
+ broken_chain.objs.insert(broken_chain.objs.end(), it, chain.objs.end()); // add all the remainder objs to the list to be deleted inline
+ ldpp_dout(this, 0) << "RGWGC::send_split_chain - send chain returned error: " << ret << dendl;
+ return {ret, {broken_chain}};
+ }
+ broken_chain.objs.clear();
+ }
+ }
+ if (!broken_chain.objs.empty()) { //when the chain is smaller than or equal to rgw_max_chunk_size
+ ldpp_dout(this, 20) << "RGWGC::send_split_chain - sending leftover objects" << dendl;
+ auto ret = send_chain(broken_chain, tag);
+ if (ret < 0) {
+ ldpp_dout(this, 0) << "RGWGC::send_split_chain - send chain returned error: " << ret << dendl;
+ return {ret, {broken_chain}};
+ }
+ }
+ } else {
+ auto ret = send_chain(chain, tag);
+ if (ret < 0) {
+ ldpp_dout(this, 0) << "RGWGC::send_split_chain - send chain returned error: " << ret << dendl;
+ return {ret, {std::move(chain)}};
+ }
+ }
+ return {0, {}};
+}
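+
+// Illustrative note (not part of the upstream change): the split above grows a
+// candidate chain one object at a time and, whenever the estimated encoded
+// size of the pending cls_rgw_gc_set_entry_op would exceed rgw_max_chunk_size,
+// flushes what has accumulated so far via send_chain() and starts a new
+// sub-chain; on failure the unsent remainder is handed back to the caller for
+// inline deletion.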
+
+int RGWGC::send_chain(const cls_rgw_obj_chain& chain, const string& tag)
+{
+ ObjectWriteOperation op;
+ cls_rgw_gc_obj_info info;
+ info.chain = chain;
+ info.tag = tag;
+ gc_log_enqueue2(op, cct->_conf->rgw_gc_obj_min_wait, info);
+
+ int i = tag_index(tag);
+
+  ldpp_dout(this, 20) << "RGWGC::send_chain - on object name: " << obj_names[i] << ", tag is: " << tag << dendl;
+
+ auto ret = store->gc_operate(this, obj_names[i], &op);
+ if (ret != -ECANCELED && ret != -EPERM) {
+ return ret;
+ }
+ ObjectWriteOperation set_entry_op;
+ cls_rgw_gc_set_entry(set_entry_op, cct->_conf->rgw_gc_obj_min_wait, info);
+ return store->gc_operate(this, obj_names[i], &set_entry_op);
+}
+
+struct defer_chain_state {
+ librados::AioCompletion* completion = nullptr;
+ // TODO: hold a reference on the state in RGWGC to avoid use-after-free if
+ // RGWGC destructs before this completion fires
+ RGWGC* gc = nullptr;
+ cls_rgw_gc_obj_info info;
+
+ ~defer_chain_state() {
+ if (completion) {
+ completion->release();
+ }
+ }
+};
+
+static void async_defer_callback(librados::completion_t, void* arg)
+{
+ std::unique_ptr<defer_chain_state> state{static_cast<defer_chain_state*>(arg)};
+ if (state->completion->get_return_value() == -ECANCELED) {
+ state->gc->on_defer_canceled(state->info);
+ }
+}
+
+void RGWGC::on_defer_canceled(const cls_rgw_gc_obj_info& info)
+{
+ const std::string& tag = info.tag;
+ const int i = tag_index(tag);
+
+ // ECANCELED from cls_version_check() tells us that we've transitioned
+ transitioned_objects_cache[i] = true;
+
+ ObjectWriteOperation op;
+ cls_rgw_gc_queue_defer_entry(op, cct->_conf->rgw_gc_obj_min_wait, info);
+ cls_rgw_gc_remove(op, {tag});
+
+ auto c = librados::Rados::aio_create_completion(nullptr, nullptr);
+ store->gc_aio_operate(obj_names[i], c, &op);
+ c->release();
+}
+
+int RGWGC::async_defer_chain(const string& tag, const cls_rgw_obj_chain& chain)
+{
+ const int i = tag_index(tag);
+ cls_rgw_gc_obj_info info;
+ info.chain = chain;
+ info.tag = tag;
+
+ // if we've transitioned this shard object, we can rely on the cls_rgw_gc queue
+ if (transitioned_objects_cache[i]) {
+ ObjectWriteOperation op;
+ cls_rgw_gc_queue_defer_entry(op, cct->_conf->rgw_gc_obj_min_wait, info);
+
+ // this tag may still be present in omap, so remove it once the cls_rgw_gc
+ // enqueue succeeds
+ cls_rgw_gc_remove(op, {tag});
+
+ auto c = librados::Rados::aio_create_completion(nullptr, nullptr);
+ int ret = store->gc_aio_operate(obj_names[i], c, &op);
+ c->release();
+ return ret;
+ }
+
+ // if we haven't seen the transition yet, write the defer to omap with cls_rgw
+ ObjectWriteOperation op;
+
+ // assert that we haven't initialized cls_rgw_gc queue. this prevents us
+ // from writing new entries to omap after the transition
+ gc_log_defer1(op, cct->_conf->rgw_gc_obj_min_wait, info);
+
+ // prepare a callback to detect the transition via ECANCELED from cls_version_check()
+ auto state = std::make_unique<defer_chain_state>();
+ state->gc = this;
+ state->info.chain = chain;
+ state->info.tag = tag;
+ state->completion = librados::Rados::aio_create_completion(
+ state.get(), async_defer_callback);
+
+ int ret = store->gc_aio_operate(obj_names[i], state->completion, &op);
+ if (ret == 0) {
+ state.release(); // release ownership until async_defer_callback()
+ }
+ return ret;
+}
+
+int RGWGC::remove(int index, const std::vector<string>& tags, AioCompletion **pc)
+{
+ ObjectWriteOperation op;
+ cls_rgw_gc_remove(op, tags);
+
+ auto c = librados::Rados::aio_create_completion(nullptr, nullptr);
+ int ret = store->gc_aio_operate(obj_names[index], c, &op);
+ if (ret < 0) {
+ c->release();
+ } else {
+ *pc = c;
+ }
+ return ret;
+}
+
+int RGWGC::remove(int index, int num_entries)
+{
+ ObjectWriteOperation op;
+ cls_rgw_gc_queue_remove_entries(op, num_entries);
+
+ return store->gc_operate(this, obj_names[index], &op);
+}
+
+int RGWGC::list(int *index, string& marker, uint32_t max, bool expired_only, std::list<cls_rgw_gc_obj_info>& result, bool *truncated, bool& processing_queue)
+{
+ result.clear();
+ string next_marker;
+ bool check_queue = false;
+
+ for (; *index < max_objs && result.size() < max; (*index)++, marker.clear(), check_queue = false) {
+ std::list<cls_rgw_gc_obj_info> entries, queue_entries;
+ int ret = 0;
+
+    // processing_queue is set to true by the previous iteration if the queue was being processed and probably has more elements in it.
+ if (! transitioned_objects_cache[*index] && ! check_queue && ! processing_queue) {
+ ret = cls_rgw_gc_list(store->gc_pool_ctx, obj_names[*index], marker, max - result.size(), expired_only, entries, truncated, next_marker);
+ if (ret != -ENOENT && ret < 0) {
+ return ret;
+ }
+ obj_version objv;
+ cls_version_read(store->gc_pool_ctx, obj_names[*index], &objv);
+ if (ret == -ENOENT || entries.size() == 0) {
+ if (objv.ver == 0) {
+ continue;
+ } else {
+ if (! expired_only) {
+ transitioned_objects_cache[*index] = true;
+ marker.clear();
+ } else {
+ std::list<cls_rgw_gc_obj_info> non_expired_entries;
+ ret = cls_rgw_gc_list(store->gc_pool_ctx, obj_names[*index], marker, 1, false, non_expired_entries, truncated, next_marker);
+ if (non_expired_entries.size() == 0) {
+ transitioned_objects_cache[*index] = true;
+ marker.clear();
+ }
+ }
+ }
+ }
+ if ((objv.ver == 1) && (entries.size() < max - result.size())) {
+ check_queue = true;
+ marker.clear();
+ }
+ }
+ if (transitioned_objects_cache[*index] || check_queue || processing_queue) {
+ processing_queue = false;
+ ret = cls_rgw_gc_queue_list_entries(store->gc_pool_ctx, obj_names[*index], marker, (max - result.size()) - entries.size(), expired_only, queue_entries, truncated, next_marker);
+ if (ret < 0) {
+ return ret;
+ }
+ }
+ if (entries.size() == 0 && queue_entries.size() == 0)
+ continue;
+
+ std::list<cls_rgw_gc_obj_info>::iterator iter;
+ for (iter = entries.begin(); iter != entries.end(); ++iter) {
+ result.push_back(*iter);
+ }
+
+ for (iter = queue_entries.begin(); iter != queue_entries.end(); ++iter) {
+ result.push_back(*iter);
+ }
+
+ marker = next_marker;
+
+ if (*index == max_objs - 1) {
+ if (queue_entries.size() > 0 && *truncated) {
+ processing_queue = true;
+ } else {
+ processing_queue = false;
+ }
+ /* we cut short here, truncated will hold the correct value */
+ return 0;
+ }
+
+ if (result.size() == max) {
+ if (queue_entries.size() > 0 && *truncated) {
+ processing_queue = true;
+ } else {
+ processing_queue = false;
+ *index += 1; //move to next gc object
+ }
+
+      /* close approximation: the next objects might not hold anything, in
+       * which case truncated should have been false, but we can find that
+       * out on the next iteration
+ */
+ *truncated = true;
+ return 0;
+ }
+ }
+ *truncated = false;
+ processing_queue = false;
+
+ return 0;
+}
+
+class RGWGCIOManager {
+ const DoutPrefixProvider* dpp;
+ CephContext *cct;
+ RGWGC *gc;
+
+ struct IO {
+ enum Type {
+ UnknownIO = 0,
+ TailIO = 1,
+ IndexIO = 2,
+ } type{UnknownIO};
+ librados::AioCompletion *c{nullptr};
+ string oid;
+ int index{-1};
+ string tag;
+ };
+
+ deque<IO> ios;
+ vector<std::vector<string> > remove_tags;
+ /* tracks the number of remaining shadow objects for a given tag in order to
+ * only remove the tag once all shadow objects have themselves been removed
+ */
+ vector<map<string, size_t> > tag_io_size;
+
+#define MAX_AIO_DEFAULT 10
+ size_t max_aio{MAX_AIO_DEFAULT};
+
+public:
+ RGWGCIOManager(const DoutPrefixProvider* _dpp, CephContext *_cct, RGWGC *_gc) : dpp(_dpp),
+ cct(_cct),
+ gc(_gc) {
+ max_aio = cct->_conf->rgw_gc_max_concurrent_io;
+ remove_tags.resize(min(static_cast<int>(cct->_conf->rgw_gc_max_objs), rgw_shards_max()));
+ tag_io_size.resize(min(static_cast<int>(cct->_conf->rgw_gc_max_objs), rgw_shards_max()));
+ }
+
+ ~RGWGCIOManager() {
+ for (auto io : ios) {
+ io.c->release();
+ }
+ }
+
+ int schedule_io(IoCtx *ioctx, const string& oid, ObjectWriteOperation *op,
+ int index, const string& tag) {
+ while (ios.size() > max_aio) {
+ if (gc->going_down()) {
+ return 0;
+ }
+ auto ret = handle_next_completion();
+ //Return error if we are using queue, else ignore it
+ if (gc->transitioned_objects_cache[index] && ret < 0) {
+ return ret;
+ }
+ }
+
+ auto c = librados::Rados::aio_create_completion(nullptr, nullptr);
+ int ret = ioctx->aio_operate(oid, c, op);
+ if (ret < 0) {
+ return ret;
+ }
+ ios.push_back(IO{IO::TailIO, c, oid, index, tag});
+
+ return 0;
+ }
+
+ int handle_next_completion() {
+ ceph_assert(!ios.empty());
+ IO& io = ios.front();
+ io.c->wait_for_complete();
+ int ret = io.c->get_return_value();
+ io.c->release();
+
+ if (ret == -ENOENT) {
+ ret = 0;
+ }
+
+ if (io.type == IO::IndexIO && ! gc->transitioned_objects_cache[io.index]) {
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "WARNING: gc cleanup of tags on gc shard index=" <<
+ io.index << " returned error, ret=" << ret << dendl;
+ }
+ goto done;
+ }
+
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "WARNING: gc could not remove oid=" << io.oid <<
+ ", ret=" << ret << dendl;
+ goto done;
+ }
+
+ if (! gc->transitioned_objects_cache[io.index]) {
+ schedule_tag_removal(io.index, io.tag);
+ }
+
+ done:
+ ios.pop_front();
+ return ret;
+ }
+
+ /* This is a request to schedule a tag removal. It will be called once when
+ * there are no shadow objects. But it will also be called for every shadow
+ * object when there are any. Since we do not want the tag to be removed
+ * until all shadow objects have been successfully removed, the scheduling
+ * will not happen until the shadow object count goes down to zero
+ */
+ void schedule_tag_removal(int index, string tag) {
+ auto& ts = tag_io_size[index];
+ auto ts_it = ts.find(tag);
+ if (ts_it != ts.end()) {
+ auto& size = ts_it->second;
+ --size;
+      // wait for all shadow object deletes to return
+ if (size != 0)
+ return;
+
+ ts.erase(ts_it);
+ }
+
+ auto& rt = remove_tags[index];
+
+ rt.push_back(tag);
+ if (rt.size() >= (size_t)cct->_conf->rgw_gc_max_trim_chunk) {
+ flush_remove_tags(index, rt);
+ }
+ }
+
+ void add_tag_io_size(int index, string tag, size_t size) {
+ auto& ts = tag_io_size[index];
+ ts.emplace(tag, size);
+ }
+
+ int drain_ios() {
+ int ret_val = 0;
+ while (!ios.empty()) {
+ if (gc->going_down()) {
+ return -EAGAIN;
+ }
+ auto ret = handle_next_completion();
+ if (ret < 0) {
+ ret_val = ret;
+ }
+ }
+ return ret_val;
+ }
+
+ void drain() {
+ drain_ios();
+ flush_remove_tags();
+ /* the tags draining might have generated more ios, drain those too */
+ drain_ios();
+ }
+
+ void flush_remove_tags(int index, vector<string>& rt) {
+ IO index_io;
+ index_io.type = IO::IndexIO;
+ index_io.index = index;
+
+ ldpp_dout(dpp, 20) << __func__ <<
+ " removing entries from gc log shard index=" << index << ", size=" <<
+ rt.size() << ", entries=" << rt << dendl;
+
+ auto rt_guard = make_scope_guard(
+ [&]
+ {
+ rt.clear();
+ }
+ );
+
+ int ret = gc->remove(index, rt, &index_io.c);
+ if (ret < 0) {
+ /* we already cleared list of tags, this prevents us from
+ * ballooning in case of a persistent problem
+ */
+ ldpp_dout(dpp, 0) << "WARNING: failed to remove tags on gc shard index=" <<
+ index << " ret=" << ret << dendl;
+ return;
+ }
+ if (perfcounter) {
+ /* log the count of tags retired for rate estimation */
+ perfcounter->inc(l_rgw_gc_retire, rt.size());
+ }
+ ios.push_back(index_io);
+ }
+
+ void flush_remove_tags() {
+ int index = 0;
+ for (auto& rt : remove_tags) {
+ if (! gc->transitioned_objects_cache[index]) {
+ flush_remove_tags(index, rt);
+ }
+ ++index;
+ }
+ }
+
+ int remove_queue_entries(int index, int num_entries) {
+ int ret = gc->remove(index, num_entries);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to remove queue entries on index=" <<
+ index << " ret=" << ret << dendl;
+ return ret;
+ }
+ if (perfcounter) {
+ /* log the count of tags retired for rate estimation */
+ perfcounter->inc(l_rgw_gc_retire, num_entries);
+ }
+ return 0;
+ }
+}; // class RGWGCIOManager
+
+int RGWGC::process(int index, int max_secs, bool expired_only,
+ RGWGCIOManager& io_manager)
+{
+ ldpp_dout(this, 20) << "RGWGC::process entered with GC index_shard=" <<
+ index << ", max_secs=" << max_secs << ", expired_only=" <<
+ expired_only << dendl;
+
+ rados::cls::lock::Lock l(gc_index_lock_name);
+ utime_t end = ceph_clock_now();
+
+ /* max_secs should be greater than zero. We don't want a zero max_secs
+ * to be translated as no timeout, since we'd then need to break the
+ * lock and that would require a manual intervention. In this case
+ * we can just wait it out. */
+ if (max_secs <= 0)
+ return -EAGAIN;
+
+ end += max_secs;
+ utime_t time(max_secs, 0);
+ l.set_duration(time);
+
+ int ret = l.lock_exclusive(&store->gc_pool_ctx, obj_names[index]);
+ if (ret == -EBUSY) { /* already locked by another gc processor */
+ ldpp_dout(this, 10) << "RGWGC::process failed to acquire lock on " <<
+ obj_names[index] << dendl;
+ return 0;
+ }
+ if (ret < 0)
+ return ret;
+
+ string marker;
+ string next_marker;
+ bool truncated;
+ IoCtx *ctx = new IoCtx;
+ do {
+ int max = 100;
+ std::list<cls_rgw_gc_obj_info> entries;
+
+ int ret = 0;
+
+ if (! transitioned_objects_cache[index]) {
+ ret = cls_rgw_gc_list(store->gc_pool_ctx, obj_names[index], marker, max, expired_only, entries, &truncated, next_marker);
+ ldpp_dout(this, 20) <<
+      "RGWGC::process cls_rgw_gc_list returned with return value:" << ret <<
+ ", entries.size=" << entries.size() << ", truncated=" << truncated <<
+ ", next_marker='" << next_marker << "'" << dendl;
+ obj_version objv;
+ cls_version_read(store->gc_pool_ctx, obj_names[index], &objv);
+ if ((objv.ver == 1) && entries.size() == 0) {
+ std::list<cls_rgw_gc_obj_info> non_expired_entries;
+ ret = cls_rgw_gc_list(store->gc_pool_ctx, obj_names[index], marker, 1, false, non_expired_entries, &truncated, next_marker);
+ if (non_expired_entries.size() == 0) {
+ transitioned_objects_cache[index] = true;
+ marker.clear();
+ ldpp_dout(this, 20) << "RGWGC::process cls_rgw_gc_list returned NO non expired entries, so setting cache entry to TRUE" << dendl;
+ } else {
+ ret = 0;
+ goto done;
+ }
+ }
+ if ((objv.ver == 0) && (ret == -ENOENT || entries.size() == 0)) {
+ ret = 0;
+ goto done;
+ }
+ }
+
+ if (transitioned_objects_cache[index]) {
+ ret = cls_rgw_gc_queue_list_entries(store->gc_pool_ctx, obj_names[index], marker, max, expired_only, entries, &truncated, next_marker);
+ ldpp_dout(this, 20) <<
+ "RGWGC::process cls_rgw_gc_queue_list_entries returned with return value:" << ret <<
+ ", entries.size=" << entries.size() << ", truncated=" << truncated <<
+ ", next_marker='" << next_marker << "'" << dendl;
+ if (entries.size() == 0) {
+ ret = 0;
+ goto done;
+ }
+ }
+
+ if (ret < 0)
+ goto done;
+
+ marker = next_marker;
+
+ string last_pool;
+ std::list<cls_rgw_gc_obj_info>::iterator iter;
+ for (iter = entries.begin(); iter != entries.end(); ++iter) {
+ cls_rgw_gc_obj_info& info = *iter;
+
+ ldpp_dout(this, 20) << "RGWGC::process iterating over entry tag='" <<
+ info.tag << "', time=" << info.time << ", chain.objs.size()=" <<
+ info.chain.objs.size() << dendl;
+
+ std::list<cls_rgw_obj>::iterator liter;
+ cls_rgw_obj_chain& chain = info.chain;
+
+ utime_t now = ceph_clock_now();
+ if (now >= end) {
+ goto done;
+ }
+ if (! transitioned_objects_cache[index]) {
+ if (chain.objs.empty()) {
+ io_manager.schedule_tag_removal(index, info.tag);
+ } else {
+ io_manager.add_tag_io_size(index, info.tag, chain.objs.size());
+ }
+ }
+ if (! chain.objs.empty()) {
+ for (liter = chain.objs.begin(); liter != chain.objs.end(); ++liter) {
+ cls_rgw_obj& obj = *liter;
+
+ if (obj.pool != last_pool) {
+ delete ctx;
+ ctx = new IoCtx;
+ ret = rgw_init_ioctx(this, store->get_rados_handle(), obj.pool, *ctx);
+ if (ret < 0) {
+ if (transitioned_objects_cache[index]) {
+ goto done;
+ }
+ last_pool = "";
+ ldpp_dout(this, 0) << "ERROR: failed to create ioctx pool=" <<
+ obj.pool << dendl;
+ continue;
+ }
+ last_pool = obj.pool;
+ }
+
+ ctx->locator_set_key(obj.loc);
+
+ const string& oid = obj.key.name; /* just stored raw oid there */
+
+ ldpp_dout(this, 5) << "RGWGC::process removing " << obj.pool <<
+ ":" << obj.key.name << dendl;
+ ObjectWriteOperation op;
+ cls_refcount_put(op, info.tag, true);
+
+ ret = io_manager.schedule_io(ctx, oid, &op, index, info.tag);
+ if (ret < 0) {
+ ldpp_dout(this, 0) <<
+ "WARNING: failed to schedule deletion for oid=" << oid << dendl;
+ if (transitioned_objects_cache[index]) {
+ //If deleting oid failed for any of them, we will not delete queue entries
+ goto done;
+ }
+ }
+ if (going_down()) {
+ // leave early, even if tag isn't removed, it's ok since it
+ // will be picked up next time around
+ goto done;
+ }
+ } // chains loop
+ } // else -- chains not empty
+ } // entries loop
+ if (transitioned_objects_cache[index] && entries.size() > 0) {
+ ret = io_manager.drain_ios();
+ if (ret < 0) {
+ goto done;
+ }
+ //Remove the entries from the queue
+ ldpp_dout(this, 5) << "RGWGC::process removing entries, marker: " << marker << dendl;
+ ret = io_manager.remove_queue_entries(index, entries.size());
+ if (ret < 0) {
+ ldpp_dout(this, 0) <<
+ "WARNING: failed to remove queue entries" << dendl;
+ goto done;
+ }
+ }
+ } while (truncated);
+
+done:
+ /* we don't drain here, because if we're going down we don't want to
+ * hold the system if backend is unresponsive
+ */
+ l.unlock(&store->gc_pool_ctx, obj_names[index]);
+ delete ctx;
+
+ return 0;
+}
+
+int RGWGC::process(bool expired_only)
+{
+ int max_secs = cct->_conf->rgw_gc_processor_max_time;
+
+ const int start = ceph::util::generate_random_number(0, max_objs - 1);
+
+ RGWGCIOManager io_manager(this, store->ctx(), this);
+
+ for (int i = 0; i < max_objs; i++) {
+ int index = (i + start) % max_objs;
+ int ret = process(index, max_secs, expired_only, io_manager);
+ if (ret < 0)
+ return ret;
+ }
+ if (!going_down()) {
+ io_manager.drain();
+ }
+
+ return 0;
+}
+
+bool RGWGC::going_down()
+{
+ return down_flag;
+}
+
+void RGWGC::start_processor()
+{
+ worker = new GCWorker(this, cct, this);
+ worker->create("rgw_gc");
+}
+
+void RGWGC::stop_processor()
+{
+ down_flag = true;
+ if (worker) {
+ worker->stop();
+ worker->join();
+ }
+ delete worker;
+ worker = NULL;
+}
+
+unsigned RGWGC::get_subsys() const
+{
+ return dout_subsys;
+}
+
+std::ostream& RGWGC::gen_prefix(std::ostream& out) const
+{
+ return out << "garbage collection: ";
+}
+
+void *RGWGC::GCWorker::entry() {
+ do {
+ utime_t start = ceph_clock_now();
+ ldpp_dout(dpp, 2) << "garbage collection: start" << dendl;
+ int r = gc->process(true);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: garbage collection process() returned error r=" << r << dendl;
+ }
+ ldpp_dout(dpp, 2) << "garbage collection: stop" << dendl;
+
+ if (gc->going_down())
+ break;
+
+ utime_t end = ceph_clock_now();
+ end -= start;
+ int secs = cct->_conf->rgw_gc_processor_period;
+
+ if (secs <= end.sec())
+ continue; // next round
+
+ secs -= end.sec();
+
+ std::unique_lock locker{lock};
+ cond.wait_for(locker, std::chrono::seconds(secs));
+ } while (!gc->going_down());
+
+ return NULL;
+}
+
+void RGWGC::GCWorker::stop()
+{
+ std::lock_guard l{lock};
+ cond.notify_all();
+}
diff --git a/src/rgw/driver/rados/rgw_gc.h b/src/rgw/driver/rados/rgw_gc.h
new file mode 100644
index 000000000..f3df64099
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_gc.h
@@ -0,0 +1,82 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include "include/types.h"
+#include "include/rados/librados.hpp"
+#include "common/ceph_mutex.h"
+#include "common/Cond.h"
+#include "common/Thread.h"
+#include "rgw_common.h"
+#include "rgw_sal.h"
+#include "rgw_rados.h"
+#include "cls/rgw/cls_rgw_types.h"
+
+#include <atomic>
+
+class RGWGCIOManager;
+
+class RGWGC : public DoutPrefixProvider {
+ CephContext *cct;
+ RGWRados *store;
+ int max_objs;
+ std::string *obj_names;
+ std::atomic<bool> down_flag = { false };
+
+ static constexpr uint64_t seed = 8675309;
+
+ int tag_index(const std::string& tag);
+ int send_chain(const cls_rgw_obj_chain& chain, const std::string& tag);
+
+ class GCWorker : public Thread {
+ const DoutPrefixProvider *dpp;
+ CephContext *cct;
+ RGWGC *gc;
+ ceph::mutex lock = ceph::make_mutex("GCWorker");
+ ceph::condition_variable cond;
+
+ public:
+ GCWorker(const DoutPrefixProvider *_dpp, CephContext *_cct, RGWGC *_gc) : dpp(_dpp), cct(_cct), gc(_gc) {}
+ void *entry() override;
+ void stop();
+ };
+
+ GCWorker *worker;
+public:
+ RGWGC() : cct(NULL), store(NULL), max_objs(0), obj_names(NULL), worker(NULL) {}
+ ~RGWGC() {
+ stop_processor();
+ finalize();
+ }
+ std::vector<bool> transitioned_objects_cache;
+ std::tuple<int, std::optional<cls_rgw_obj_chain>> send_split_chain(const cls_rgw_obj_chain& chain, const std::string& tag);
+
+ // asynchronously defer garbage collection on an object that's still being read
+ int async_defer_chain(const std::string& tag, const cls_rgw_obj_chain& info);
+
+ // callback for when async_defer_chain() fails with ECANCELED
+ void on_defer_canceled(const cls_rgw_gc_obj_info& info);
+
+ int remove(int index, const std::vector<std::string>& tags, librados::AioCompletion **pc);
+ int remove(int index, int num_entries);
+
+ void initialize(CephContext *_cct, RGWRados *_store);
+ void finalize();
+
+ int list(int *index, std::string& marker, uint32_t max, bool expired_only, std::list<cls_rgw_gc_obj_info>& result, bool *truncated, bool& processing_queue);
+ void list_init(int *index) { *index = 0; }
+ int process(int index, int process_max_secs, bool expired_only,
+ RGWGCIOManager& io_manager);
+ int process(bool expired_only);
+
+ bool going_down();
+ void start_processor();
+ void stop_processor();
+
+ CephContext *get_cct() const override { return store->ctx(); }
+ unsigned get_subsys() const;
+
+ std::ostream& gen_prefix(std::ostream& out) const;
+
+};
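+
+/* A minimal usage sketch (illustrative only; the store pointer and call sites
+ * below are assumptions, not part of this header): the owner constructs the
+ * collector, wires it to the RADOS store, and starts the background worker;
+ * teardown happens via the destructor, which calls stop_processor() and
+ * finalize().
+ *
+ * RGWGC *gc = new RGWGC();
+ * gc->initialize(cct, store); // set up per-shard gc objects
+ * gc->start_processor(); // spawn the "rgw_gc" worker thread
+ * ...
+ * delete gc; // stops the worker and finalizes
+ */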
diff --git a/src/rgw/driver/rados/rgw_gc_log.cc b/src/rgw/driver/rados/rgw_gc_log.cc
new file mode 100644
index 000000000..ad819eddc
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_gc_log.cc
@@ -0,0 +1,55 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "rgw_gc_log.h"
+
+#include "cls/rgw/cls_rgw_client.h"
+#include "cls/rgw_gc/cls_rgw_gc_client.h"
+#include "cls/version/cls_version_client.h"
+
+
+void gc_log_init2(librados::ObjectWriteOperation& op,
+ uint64_t max_size, uint64_t max_deferred)
+{
+ obj_version objv; // objv.ver = 0
+ cls_version_check(op, objv, VER_COND_EQ);
+ cls_rgw_gc_queue_init(op, max_size, max_deferred);
+ objv.ver = 1;
+ cls_version_set(op, objv);
+}
+
+void gc_log_enqueue1(librados::ObjectWriteOperation& op,
+ uint32_t expiration, cls_rgw_gc_obj_info& info)
+{
+ obj_version objv; // objv.ver = 0
+ cls_version_check(op, objv, VER_COND_EQ);
+ cls_rgw_gc_set_entry(op, expiration, info);
+}
+
+void gc_log_enqueue2(librados::ObjectWriteOperation& op,
+ uint32_t expiration, const cls_rgw_gc_obj_info& info)
+{
+ obj_version objv;
+ objv.ver = 1;
+ cls_version_check(op, objv, VER_COND_EQ);
+ cls_rgw_gc_queue_enqueue(op, expiration, info);
+}
+
+void gc_log_defer1(librados::ObjectWriteOperation& op,
+ uint32_t expiration, const cls_rgw_gc_obj_info& info)
+{
+ obj_version objv; // objv.ver = 0
+ cls_version_check(op, objv, VER_COND_EQ);
+ cls_rgw_gc_defer_entry(op, expiration, info.tag);
+}
+
+void gc_log_defer2(librados::ObjectWriteOperation& op,
+ uint32_t expiration, const cls_rgw_gc_obj_info& info)
+{
+ obj_version objv;
+ objv.ver = 1;
+ cls_version_check(op, objv, VER_COND_EQ);
+ cls_rgw_gc_queue_defer_entry(op, expiration, info);
+ // TODO: conditional on whether omap is known to be empty
+ cls_rgw_gc_remove(op, {info.tag});
+}
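+
+/* Illustrative sketch of how a caller might drive these helpers (the ioctx,
+ * oid and queue sizing below are assumptions, not defined in this file).
+ * The cls_version guard makes each op apply only against the matching log
+ * format: version 0 objects hold the legacy omap-based log, version 1
+ * objects hold the cls_rgw_gc queue initialized by gc_log_init2().
+ *
+ * librados::ObjectWriteOperation op;
+ * gc_log_init2(op, 64 * 1024 * 1024, 128); // upgrade an empty shard to a queue
+ * int r = ioctx.operate(gc_oid, &op); // -ECANCELED if the version guard fails
+ *
+ * librados::ObjectWriteOperation enq;
+ * gc_log_enqueue2(enq, 3600, info); // queue-format enqueue, guarded on ver==1
+ * r = ioctx.operate(gc_oid, &enq);
+ */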
diff --git a/src/rgw/driver/rados/rgw_lc_tier.cc b/src/rgw/driver/rados/rgw_lc_tier.cc
new file mode 100644
index 000000000..c52acef65
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_lc_tier.cc
@@ -0,0 +1,1310 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include <string.h>
+#include <iostream>
+#include <map>
+
+#include "common/Formatter.h"
+#include "common/errno.h"
+#include "rgw_lc.h"
+#include "rgw_lc_tier.h"
+#include "rgw_string.h"
+#include "rgw_zone.h"
+#include "rgw_common.h"
+#include "rgw_rest.h"
+#include "svc_zone.h"
+
+#include <boost/algorithm/string/split.hpp>
+#include <boost/algorithm/string.hpp>
+#include <boost/algorithm/string/predicate.hpp>
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+struct rgw_lc_multipart_part_info {
+ int part_num{0};
+ uint64_t ofs{0};
+ uint64_t size{0};
+ std::string etag;
+};
+
+struct rgw_lc_obj_properties {
+ ceph::real_time mtime;
+ std::string etag;
+ uint64_t versioned_epoch{0};
+ std::map<std::string, RGWTierACLMapping>& target_acl_mappings;
+ std::string target_storage_class;
+
+ rgw_lc_obj_properties(ceph::real_time _mtime, std::string _etag,
+ uint64_t _versioned_epoch, std::map<std::string,
+ RGWTierACLMapping>& _t_acl_mappings,
+ std::string _t_storage_class) :
+ mtime(_mtime), etag(_etag),
+ versioned_epoch(_versioned_epoch),
+ target_acl_mappings(_t_acl_mappings),
+ target_storage_class(_t_storage_class) {}
+};
+
+struct rgw_lc_multipart_upload_info {
+ std::string upload_id;
+ uint64_t obj_size;
+ ceph::real_time mtime;
+ std::string etag;
+
+ void encode(bufferlist& bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(upload_id, bl);
+ encode(obj_size, bl);
+ encode(mtime, bl);
+ encode(etag, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::const_iterator& bl) {
+ DECODE_START(1, bl);
+ decode(upload_id, bl);
+ decode(obj_size, bl);
+ decode(mtime, bl);
+ decode(etag, bl);
+ DECODE_FINISH(bl);
+ }
+};
+WRITE_CLASS_ENCODER(rgw_lc_multipart_upload_info)
+
+static inline string get_key_instance(const rgw_obj_key& key)
+{
+ if (!key.instance.empty() &&
+ !key.have_null_instance()) {
+ return "-" + key.instance;
+ }
+ return "";
+}
+
+static inline string get_key_oid(const rgw_obj_key& key)
+{
+ string oid = key.name;
+ if (!key.instance.empty() &&
+ !key.have_null_instance()) {
+ oid += string("-") + key.instance;
+ }
+ return oid;
+}
+
+static inline string obj_to_aws_path(const rgw_obj& obj)
+{
+ string path = obj.bucket.name + "/" + get_key_oid(obj.key);
+ return path;
+}
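+
+/* For example (illustrative values): get_key_oid() appends "-<instance>" for
+ * non-null versioned keys, so an rgw_obj with bucket "photos" and key
+ * {"cat.jpg", "v2x9"} maps to the remote resource path "photos/cat.jpg-v2x9";
+ * with no instance it is simply "photos/cat.jpg". */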
+
+static int read_upload_status(const DoutPrefixProvider *dpp, rgw::sal::Driver *driver,
+ const rgw_raw_obj *status_obj, rgw_lc_multipart_upload_info *status)
+{
+ int ret = 0;
+ rgw::sal::RadosStore *rados = dynamic_cast<rgw::sal::RadosStore*>(driver);
+
+ if (!rados) {
+ ldpp_dout(dpp, 0) << "ERROR: Not a RadosStore. Cannot be transitioned to cloud." << dendl;
+ return -1;
+ }
+
+ auto& pool = status_obj->pool;
+ const auto oid = status_obj->oid;
+ auto sysobj = rados->svc()->sysobj;
+ bufferlist bl;
+
+ ret = rgw_get_system_obj(sysobj, pool, oid, bl, nullptr, nullptr,
+ null_yield, dpp);
+
+ if (ret < 0) {
+ return ret;
+ }
+
+ if (bl.length() > 0) {
+ try {
+ auto p = bl.cbegin();
+ status->decode(p);
+ } catch (buffer::error& e) {
+ ldpp_dout(dpp, 10) << "failed to decode status obj: "
+ << e.what() << dendl;
+ return -EIO;
+ }
+ } else {
+ return -EIO;
+ }
+
+ return 0;
+}
+
+static int put_upload_status(const DoutPrefixProvider *dpp, rgw::sal::Driver *driver,
+ const rgw_raw_obj *status_obj, rgw_lc_multipart_upload_info *status)
+{
+ int ret = 0;
+ rgw::sal::RadosStore *rados = dynamic_cast<rgw::sal::RadosStore*>(driver);
+
+ if (!rados) {
+ ldpp_dout(dpp, 0) << "ERROR: Not a RadosStore. Cannot be transitioned to cloud." << dendl;
+ return -1;
+ }
+
+ auto& pool = status_obj->pool;
+ const auto oid = status_obj->oid;
+ auto sysobj = rados->svc()->sysobj;
+ bufferlist bl;
+ status->encode(bl);
+
+ ret = rgw_put_system_obj(dpp, sysobj, pool, oid, bl, true, nullptr,
+ real_time{}, null_yield);
+
+ return ret;
+}
+
+static int delete_upload_status(const DoutPrefixProvider *dpp, rgw::sal::Driver *driver,
+ const rgw_raw_obj *status_obj)
+{
+ int ret = 0;
+ rgw::sal::RadosStore *rados = dynamic_cast<rgw::sal::RadosStore*>(driver);
+
+ if (!rados) {
+ ldpp_dout(dpp, 0) << "ERROR: Not a RadosStore. Cannot be transitioned to cloud." << dendl;
+ return -1;
+ }
+
+ auto& pool = status_obj->pool;
+ const auto oid = status_obj->oid;
+ auto sysobj = rados->svc()->sysobj;
+
+ ret = rgw_delete_system_obj(dpp, sysobj, pool, oid, nullptr, null_yield);
+
+ return ret;
+}
+
+static std::set<string> keep_headers = { "CONTENT_TYPE",
+ "CONTENT_ENCODING",
+ "CONTENT_DISPOSITION",
+ "CONTENT_LANGUAGE" };
+
+/*
+ * mapping between rgw object attrs and output http fields
+ *
+ static const struct rgw_http_attr base_rgw_to_http_attrs[] = {
+ { RGW_ATTR_CONTENT_LANG, "Content-Language" },
+ { RGW_ATTR_EXPIRES, "Expires" },
+ { RGW_ATTR_CACHE_CONTROL, "Cache-Control" },
+ { RGW_ATTR_CONTENT_DISP, "Content-Disposition" },
+ { RGW_ATTR_CONTENT_ENC, "Content-Encoding" },
+ { RGW_ATTR_USER_MANIFEST, "X-Object-Manifest" },
+ { RGW_ATTR_X_ROBOTS_TAG , "X-Robots-Tag" },
+ { RGW_ATTR_STORAGE_CLASS , "X-Amz-Storage-Class" },
+// RGW_ATTR_AMZ_WEBSITE_REDIRECT_LOCATION header depends on access mode:
+// S3 endpoint: x-amz-website-redirect-location
+// S3Website endpoint: Location
+{ RGW_ATTR_AMZ_WEBSITE_REDIRECT_LOCATION, "x-amz-website-redirect-location" },
+}; */
+
+static void init_headers(map<string, bufferlist>& attrs,
+ map<string, string>& headers)
+{
+ for (auto& kv : attrs) {
+ const char * name = kv.first.c_str();
+ const auto aiter = rgw_to_http_attrs.find(name);
+
+ if (aiter != std::end(rgw_to_http_attrs)) {
+ headers[aiter->second] = rgw_bl_str(kv.second);
+ } else if (strncmp(name, RGW_ATTR_META_PREFIX,
+ sizeof(RGW_ATTR_META_PREFIX)-1) == 0) {
+ name += sizeof(RGW_ATTR_META_PREFIX) - 1;
+ string sname(name);
+ string name_prefix = RGW_ATTR_META_PREFIX;
+ char full_name_buf[name_prefix.size() + sname.size() + 1];
+ snprintf(full_name_buf, sizeof(full_name_buf), "%.*s%.*s",
+ static_cast<int>(name_prefix.length()),
+ name_prefix.data(),
+ static_cast<int>(sname.length()),
+ sname.data());
+ headers[full_name_buf] = rgw_bl_str(kv.second);
+ } else if (strcmp(name,RGW_ATTR_CONTENT_TYPE) == 0) {
+ headers["CONTENT_TYPE"] = rgw_bl_str(kv.second);
+ }
+ }
+}
+
+/* Read object or just head from remote endpoint. For now initializes only headers,
+ * but can be extended to fetch etag, mtime etc if needed.
+ */
+static int cloud_tier_get_object(RGWLCCloudTierCtx& tier_ctx, bool head,
+ std::map<std::string, std::string>& headers) {
+ RGWRESTConn::get_obj_params req_params;
+ std::string target_obj_name;
+ int ret = 0;
+ rgw_lc_obj_properties obj_properties(tier_ctx.o.meta.mtime, tier_ctx.o.meta.etag,
+ tier_ctx.o.versioned_epoch, tier_ctx.acl_mappings,
+ tier_ctx.target_storage_class);
+ std::string etag;
+ RGWRESTStreamRWRequest *in_req;
+
+ rgw_bucket dest_bucket;
+ dest_bucket.name = tier_ctx.target_bucket_name;
+ target_obj_name = tier_ctx.bucket_info.bucket.name + "/" +
+ tier_ctx.obj->get_name();
+ if (!tier_ctx.o.is_current()) {
+ target_obj_name += get_key_instance(tier_ctx.obj->get_key());
+ }
+
+ rgw_obj dest_obj(dest_bucket, rgw_obj_key(target_obj_name));
+
+ /* init input connection */
+ req_params.get_op = !head;
+ req_params.prepend_metadata = true;
+ req_params.rgwx_stat = true;
+ req_params.sync_manifest = true;
+ req_params.skip_decrypt = true;
+
+ ret = tier_ctx.conn.get_obj(tier_ctx.dpp, dest_obj, req_params, true /* send */, &in_req);
+ if (ret < 0) {
+ ldpp_dout(tier_ctx.dpp, 0) << "ERROR: " << __func__ << "(): conn.get_obj() returned ret=" << ret << dendl;
+ return ret;
+ }
+
+ /* fetch headers */
+ ret = tier_ctx.conn.complete_request(in_req, nullptr, nullptr, nullptr, nullptr, &headers, null_yield);
+ if (ret < 0 && ret != -ENOENT) {
+ ldpp_dout(tier_ctx.dpp, 20) << "ERROR: " << __func__ << "(): conn.complete_request() returned ret=" << ret << dendl;
+ return ret;
+ }
+ return 0;
+}
+
+static bool is_already_tiered(const DoutPrefixProvider *dpp,
+ std::map<std::string, std::string>& headers,
+ ceph::real_time& mtime) {
+ char buf[32];
+ map<string, string> attrs = headers;
+
+ for (const auto& a : attrs) {
+ ldpp_dout(dpp, 20) << "GetCrf attr[" << a.first << "] = " << a.second <<dendl;
+ }
+ utime_t ut(mtime);
+ snprintf(buf, sizeof(buf), "%lld.%09lld",
+ (long long)ut.sec(),
+ (long long)ut.nsec());
+
+ string s = attrs["X_AMZ_META_RGWX_SOURCE_MTIME"];
+
+ if (s.empty())
+ s = attrs["x_amz_meta_rgwx_source_mtime"];
+
+ ldpp_dout(dpp, 20) << "is_already_tiered attrs[X_AMZ_META_RGWX_SOURCE_MTIME] = " << s <<dendl;
+ ldpp_dout(dpp, 20) << "is_already_tiered mtime buf = " << buf <<dendl;
+
+ if (!s.empty() && !strcmp(s.c_str(), buf)) {
+ return true;
+ }
+ return false;
+}
+
+/* Read object locally & also initialize dest rest obj based on read attrs */
+class RGWLCStreamRead
+{
+ CephContext *cct;
+ const DoutPrefixProvider *dpp;
+ std::map<std::string, bufferlist> attrs;
+ uint64_t obj_size;
+ rgw::sal::Object *obj;
+ const real_time &mtime;
+
+ bool multipart{false};
+ uint64_t m_part_size{0};
+ off_t m_part_off{0};
+ off_t m_part_end{0};
+
+ std::unique_ptr<rgw::sal::Object::ReadOp> read_op;
+ off_t ofs{0};
+ off_t end{0};
+ rgw_rest_obj rest_obj;
+
+ int retcode{0};
+
+ public:
+ RGWLCStreamRead(CephContext *_cct, const DoutPrefixProvider *_dpp,
+ rgw::sal::Object *_obj, const real_time &_mtime) :
+ cct(_cct), dpp(_dpp), obj(_obj), mtime(_mtime),
+ read_op(obj->get_read_op()) {}
+
+ ~RGWLCStreamRead() {};
+ int set_range(off_t _ofs, off_t _end);
+ int get_range(off_t &_ofs, off_t &_end);
+ rgw_rest_obj& get_rest_obj();
+ void set_multipart(uint64_t part_size, off_t part_off, off_t part_end);
+ int init();
+ int init_rest_obj();
+ int read(off_t ofs, off_t end, RGWGetDataCB *out_cb);
+};
+
+/* Send PUT op to remote endpoint */
+class RGWLCCloudStreamPut
+{
+ const DoutPrefixProvider *dpp;
+ rgw_lc_obj_properties obj_properties;
+ RGWRESTConn& conn;
+ const rgw_obj& dest_obj;
+ std::string etag;
+ RGWRESTStreamS3PutObj *out_req{nullptr};
+
+ struct multipart_info {
+ bool is_multipart{false};
+ std::string upload_id;
+ int part_num{0};
+ uint64_t part_size{0};
+ } multipart;
+
+ int retcode;
+
+ public:
+ RGWLCCloudStreamPut(const DoutPrefixProvider *_dpp,
+ const rgw_lc_obj_properties& _obj_properties,
+ RGWRESTConn& _conn,
+ const rgw_obj& _dest_obj) :
+ dpp(_dpp), obj_properties(_obj_properties), conn(_conn), dest_obj(_dest_obj) {
+ }
+ int init();
+ static bool keep_attr(const std::string& h);
+ static void init_send_attrs(const DoutPrefixProvider *dpp, const rgw_rest_obj& rest_obj,
+ const rgw_lc_obj_properties& obj_properties,
+ std::map<std::string, std::string>& attrs);
+ void send_ready(const DoutPrefixProvider *dpp, const rgw_rest_obj& rest_obj);
+ void handle_headers(const std::map<std::string, std::string>& headers);
+ bool get_etag(std::string *petag);
+ void set_multipart(const std::string& upload_id, int part_num, uint64_t part_size);
+ int send();
+ RGWGetDataCB *get_cb();
+ int complete_request();
+};
+
+int RGWLCStreamRead::set_range(off_t _ofs, off_t _end) {
+ ofs = _ofs;
+ end = _end;
+
+ return 0;
+}
+
+int RGWLCStreamRead::get_range(off_t &_ofs, off_t &_end) {
+ _ofs = ofs;
+ _end = end;
+
+ return 0;
+}
+
+rgw_rest_obj& RGWLCStreamRead::get_rest_obj() {
+ return rest_obj;
+}
+
+void RGWLCStreamRead::set_multipart(uint64_t part_size, off_t part_off, off_t part_end) {
+ multipart = true;
+ m_part_size = part_size;
+ m_part_off = part_off;
+ m_part_end = part_end;
+}
+
+int RGWLCStreamRead::init() {
+ optional_yield y = null_yield;
+ real_time read_mtime;
+
+ read_op->params.lastmod = &read_mtime;
+
+ int ret = read_op->prepare(y, dpp);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: fail to prepare read_op, ret = " << ret << dendl;
+ return ret;
+ }
+
+ if (read_mtime != mtime) {
+ /* raced */
+ return -ECANCELED;
+ }
+
+ attrs = obj->get_attrs();
+ obj_size = obj->get_obj_size();
+
+ ret = init_rest_obj();
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: fail to initialize rest_obj, ret = " << ret << dendl;
+ return ret;
+ }
+
+ if (!multipart) {
+ set_range(0, obj_size - 1);
+ } else {
+ set_range(m_part_off, m_part_end);
+ }
+ return 0;
+}
+
+int RGWLCStreamRead::init_rest_obj() {
+ /* Initialize rgw_rest_obj.
+ * Reference: do_decode_rest_obj
+ * Check how to copy headers content */
+ rest_obj.init(obj->get_key());
+
+ if (!multipart) {
+ rest_obj.content_len = obj_size;
+ } else {
+ rest_obj.content_len = m_part_size;
+ }
+
+ /* For multipart, attrs are sent as part of InitMultipartCR itself */
+ if (multipart) {
+ return 0;
+ }
+
+ /*
+ * XXX: verify whether this is the right way to copy attrs into the rest obj
+ */
+ init_headers(attrs, rest_obj.attrs);
+
+ rest_obj.acls.set_ctx(cct);
+ const auto aiter = attrs.find(RGW_ATTR_ACL);
+ if (aiter != attrs.end()) {
+ bufferlist& bl = aiter->second;
+ auto bliter = bl.cbegin();
+ try {
+ rest_obj.acls.decode(bliter);
+ } catch (buffer::error& err) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to decode policy off attrs" << dendl;
+ return -EIO;
+ }
+ } else {
+ ldpp_dout(dpp, 0) << "WARNING: acl attrs not provided" << dendl;
+ }
+ return 0;
+}
+
+int RGWLCStreamRead::read(off_t ofs, off_t end, RGWGetDataCB *out_cb) {
+ int ret = read_op->iterate(dpp, ofs, end, out_cb, null_yield);
+ return ret;
+}
+
+int RGWLCCloudStreamPut::init() {
+ /* init output connection */
+ if (multipart.is_multipart) {
+ char buf[32];
+ snprintf(buf, sizeof(buf), "%d", multipart.part_num);
+ rgw_http_param_pair params[] = { { "uploadId", multipart.upload_id.c_str() },
+ { "partNumber", buf },
+ { nullptr, nullptr } };
+ conn.put_obj_send_init(dest_obj, params, &out_req);
+ } else {
+ conn.put_obj_send_init(dest_obj, nullptr, &out_req);
+ }
+
+ return 0;
+}
+
+bool RGWLCCloudStreamPut::keep_attr(const string& h) {
+ return (keep_headers.find(h) != keep_headers.end());
+}
+
+void RGWLCCloudStreamPut::init_send_attrs(const DoutPrefixProvider *dpp,
+ const rgw_rest_obj& rest_obj,
+ const rgw_lc_obj_properties& obj_properties,
+ std::map<string, string>& attrs) {
+
+ map<string, RGWTierACLMapping>& acl_mappings(obj_properties.target_acl_mappings);
+ const std::string& target_storage_class = obj_properties.target_storage_class;
+
+ attrs.clear();
+
+ for (auto& hi : rest_obj.attrs) {
+ if (keep_attr(hi.first)) {
+ attrs.insert(hi);
+ } else {
+ std::string s1 = boost::algorithm::to_lower_copy(hi.first);
+ const char* k = std::strstr(s1.c_str(), "x-amz");
+ if (k) {
+ attrs[k] = hi.second;
+ }
+ }
+ }
+
+ const auto acl = rest_obj.acls.get_acl();
+
+ map<int, vector<string> > access_map;
+
+ if (!acl_mappings.empty()) {
+ for (auto& grant : acl.get_grant_map()) {
+ auto& orig_grantee = grant.first;
+ auto& perm = grant.second;
+
+ string grantee;
+
+ const auto& am = acl_mappings;
+
+ const auto iter = am.find(orig_grantee);
+ if (iter == am.end()) {
+ ldpp_dout(dpp, 20) << "acl_mappings: Could not find " << orig_grantee << " .. ignoring" << dendl;
+ continue;
+ }
+
+ grantee = iter->second.dest_id;
+
+ string type;
+
+ switch (iter->second.type) {
+ case ACL_TYPE_CANON_USER:
+ type = "id";
+ break;
+ case ACL_TYPE_EMAIL_USER:
+ type = "emailAddress";
+ break;
+ case ACL_TYPE_GROUP:
+ type = "uri";
+ break;
+ default:
+ continue;
+ }
+
+ string tv = type + "=" + grantee;
+
+ int flags = perm.get_permission().get_permissions();
+ if ((flags & RGW_PERM_FULL_CONTROL) == RGW_PERM_FULL_CONTROL) {
+ access_map[flags].push_back(tv);
+ continue;
+ }
+
+ for (int i = 1; i <= RGW_PERM_WRITE_ACP; i <<= 1) {
+ if (flags & i) {
+ access_map[i].push_back(tv);
+ }
+ }
+ }
+ }
+
+ for (const auto& aiter : access_map) {
+ int grant_type = aiter.first;
+
+ string header_str("x-amz-grant-");
+
+ switch (grant_type) {
+ case RGW_PERM_READ:
+ header_str.append("read");
+ break;
+ case RGW_PERM_WRITE:
+ header_str.append("write");
+ break;
+ case RGW_PERM_READ_ACP:
+ header_str.append("read-acp");
+ break;
+ case RGW_PERM_WRITE_ACP:
+ header_str.append("write-acp");
+ break;
+ case RGW_PERM_FULL_CONTROL:
+ header_str.append("full-control");
+ break;
+ }
+
+ string s;
+
+ for (const auto& viter : aiter.second) {
+ if (!s.empty()) {
+ s.append(", ");
+ }
+ s.append(viter);
+ }
+
+ ldpp_dout(dpp, 20) << "acl_mappings: set acl: " << header_str << "=" << s << dendl;
+
+ attrs[header_str] = s;
+ }
+
+ /* Copy target storage class */
+ if (!target_storage_class.empty()) {
+ attrs["x-amz-storage-class"] = target_storage_class;
+ } else {
+ attrs["x-amz-storage-class"] = "STANDARD";
+ }
+
+ /* New attribute to specify its transitioned from RGW */
+ attrs["x-amz-meta-rgwx-source"] = "rgw";
+ attrs["x-rgw-cloud"] = "true";
+ attrs["x-rgw-cloud-keep-attrs"] = "true";
+
+ char buf[32];
+ snprintf(buf, sizeof(buf), "%llu", (long long)obj_properties.versioned_epoch);
+ attrs["x-amz-meta-rgwx-versioned-epoch"] = buf;
+
+ utime_t ut(obj_properties.mtime);
+ snprintf(buf, sizeof(buf), "%lld.%09lld",
+ (long long)ut.sec(),
+ (long long)ut.nsec());
+
+ attrs["x-amz-meta-rgwx-source-mtime"] = buf;
+ attrs["x-amz-meta-rgwx-source-etag"] = obj_properties.etag;
+ attrs["x-amz-meta-rgwx-source-key"] = rest_obj.key.name;
+ if (!rest_obj.key.instance.empty()) {
+ attrs["x-amz-meta-rgwx-source-version-id"] = rest_obj.key.instance;
+ }
+ for (const auto& a : attrs) {
+ ldpp_dout(dpp, 30) << "init_send_attrs attr[" << a.first << "] = " << a.second <<dendl;
+ }
+}
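+
+/* With default settings this typically produces headers along the lines of
+ * (values illustrative): "x-amz-storage-class: STANDARD",
+ * "x-amz-meta-rgwx-source: rgw", "x-amz-meta-rgwx-source-mtime: <sec>.<nsec>",
+ * "x-amz-meta-rgwx-source-etag: <etag>", "x-amz-meta-rgwx-source-key: <key>",
+ * plus one "x-amz-grant-*" header per mapped permission when acl_mappings
+ * are configured. */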
+
+void RGWLCCloudStreamPut::send_ready(const DoutPrefixProvider *dpp, const rgw_rest_obj& rest_obj) {
+ auto r = static_cast<RGWRESTStreamS3PutObj *>(out_req);
+
+ std::map<std::string, std::string> new_attrs;
+ if (!multipart.is_multipart) {
+ init_send_attrs(dpp, rest_obj, obj_properties, new_attrs);
+ }
+
+ r->set_send_length(rest_obj.content_len);
+
+ RGWAccessControlPolicy policy;
+
+ r->send_ready(dpp, conn.get_key(), new_attrs, policy);
+}
+
+void RGWLCCloudStreamPut::handle_headers(const map<string, string>& headers) {
+ for (const auto& h : headers) {
+ if (h.first == "ETAG") {
+ etag = h.second;
+ }
+ }
+}
+
+bool RGWLCCloudStreamPut::get_etag(string *petag) {
+ if (etag.empty()) {
+ return false;
+ }
+ *petag = etag;
+ return true;
+}
+
+void RGWLCCloudStreamPut::set_multipart(const string& upload_id, int part_num, uint64_t part_size) {
+ multipart.is_multipart = true;
+ multipart.upload_id = upload_id;
+ multipart.part_num = part_num;
+ multipart.part_size = part_size;
+}
+
+int RGWLCCloudStreamPut::send() {
+ int ret = RGWHTTP::send(out_req);
+ return ret;
+}
+
+RGWGetDataCB *RGWLCCloudStreamPut::get_cb() {
+ return out_req->get_out_cb();
+}
+
+int RGWLCCloudStreamPut::complete_request() {
+ int ret = conn.complete_request(out_req, etag, &obj_properties.mtime, null_yield);
+ return ret;
+}
+
+/* Read local copy and write to Cloud endpoint */
+static int cloud_tier_transfer_object(const DoutPrefixProvider* dpp,
+ RGWLCStreamRead* readf, RGWLCCloudStreamPut* writef) {
+ std::string url;
+ bufferlist bl;
+ bool sent_attrs{false};
+ int ret{0};
+ off_t ofs;
+ off_t end;
+
+ ret = readf->init();
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: fail to initialize in_crf, ret = " << ret << dendl;
+ return ret;
+ }
+ readf->get_range(ofs, end);
+ rgw_rest_obj& rest_obj = readf->get_rest_obj();
+ if (!sent_attrs) {
+ ret = writef->init();
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: fail to initialize out_crf, ret = " << ret << dendl;
+ return ret;
+ }
+
+ writef->send_ready(dpp, rest_obj);
+ ret = writef->send();
+ if (ret < 0) {
+ return ret;
+ }
+ sent_attrs = true;
+ }
+
+ ret = readf->read(ofs, end, writef->get_cb());
+
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: fail to read from in_crf, ret = " << ret << dendl;
+ return ret;
+ }
+
+ ret = writef->complete_request();
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: fail to complete request, ret = " << ret << dendl;
+ return ret;
+ }
+
+ return 0;
+}
+
+static int cloud_tier_plain_transfer(RGWLCCloudTierCtx& tier_ctx) {
+ int ret;
+
+ rgw_lc_obj_properties obj_properties(tier_ctx.o.meta.mtime, tier_ctx.o.meta.etag,
+ tier_ctx.o.versioned_epoch, tier_ctx.acl_mappings,
+ tier_ctx.target_storage_class);
+ std::string target_obj_name;
+
+ rgw_bucket dest_bucket;
+ dest_bucket.name = tier_ctx.target_bucket_name;
+
+ target_obj_name = tier_ctx.bucket_info.bucket.name + "/" +
+ tier_ctx.obj->get_name();
+ if (!tier_ctx.o.is_current()) {
+ target_obj_name += get_key_instance(tier_ctx.obj->get_key());
+ }
+
+ rgw_obj dest_obj(dest_bucket, rgw_obj_key(target_obj_name));
+
+ tier_ctx.obj->set_atomic();
+
+ /* Prepare Read from source */
+ /* TODO: Define readf, writef as stack variables. For some reason,
+ * when used as stack variables (esp., readf), the transition seems to
+ * be taking lot of time eventually erroring out at times.
+ */
+ std::shared_ptr<RGWLCStreamRead> readf;
+ readf.reset(new RGWLCStreamRead(tier_ctx.cct, tier_ctx.dpp,
+ tier_ctx.obj, tier_ctx.o.meta.mtime));
+
+ std::shared_ptr<RGWLCCloudStreamPut> writef;
+ writef.reset(new RGWLCCloudStreamPut(tier_ctx.dpp, obj_properties, tier_ctx.conn,
+ dest_obj));
+
+ /* actual Read & Write */
+ ret = cloud_tier_transfer_object(tier_ctx.dpp, readf.get(), writef.get());
+
+ return ret;
+}
+
+static int cloud_tier_send_multipart_part(RGWLCCloudTierCtx& tier_ctx,
+ const std::string& upload_id,
+ const rgw_lc_multipart_part_info& part_info,
+ std::string *petag) {
+ int ret;
+
+ rgw_lc_obj_properties obj_properties(tier_ctx.o.meta.mtime, tier_ctx.o.meta.etag,
+ tier_ctx.o.versioned_epoch, tier_ctx.acl_mappings,
+ tier_ctx.target_storage_class);
+ std::string target_obj_name;
+ off_t end;
+
+ rgw_bucket dest_bucket;
+ dest_bucket.name = tier_ctx.target_bucket_name;
+
+ target_obj_name = tier_ctx.bucket_info.bucket.name + "/" +
+ tier_ctx.obj->get_name();
+ if (!tier_ctx.o.is_current()) {
+ target_obj_name += get_key_instance(tier_ctx.obj->get_key());
+ }
+
+ rgw_obj dest_obj(dest_bucket, rgw_obj_key(target_obj_name));
+
+ tier_ctx.obj->set_atomic();
+
+ /* TODO: Define readf, writef as stack variables. For some reason,
+ * when used as stack variables (esp., readf), the transition seems to
+ * be taking lot of time eventually erroring out at times. */
+ std::shared_ptr<RGWLCStreamRead> readf;
+ readf.reset(new RGWLCStreamRead(tier_ctx.cct, tier_ctx.dpp,
+ tier_ctx.obj, tier_ctx.o.meta.mtime));
+
+ std::shared_ptr<RGWLCCloudStreamPut> writef;
+ writef.reset(new RGWLCCloudStreamPut(tier_ctx.dpp, obj_properties, tier_ctx.conn,
+ dest_obj));
+
+ /* Prepare Read from source */
+ end = part_info.ofs + part_info.size - 1;
+ readf->set_multipart(part_info.size, part_info.ofs, end);
+
+ /* Prepare write */
+ writef->set_multipart(upload_id, part_info.part_num, part_info.size);
+
+ /* actual Read & Write */
+ ret = cloud_tier_transfer_object(tier_ctx.dpp, readf.get(), writef.get());
+ if (ret < 0) {
+ return ret;
+ }
+
+ if (!(writef->get_etag(petag))) {
+ ldpp_dout(tier_ctx.dpp, 0) << "ERROR: failed to get etag from PUT request" << dendl;
+ return -EIO;
+ }
+
+ return 0;
+}
+
+static int cloud_tier_abort_multipart(const DoutPrefixProvider *dpp,
+ RGWRESTConn& dest_conn, const rgw_obj& dest_obj,
+ const std::string& upload_id) {
+ int ret;
+ bufferlist out_bl;
+ bufferlist bl;
+ rgw_http_param_pair params[] = { { "uploadId", upload_id.c_str() }, {nullptr, nullptr} };
+
+ string resource = obj_to_aws_path(dest_obj);
+ ret = dest_conn.send_resource(dpp, "DELETE", resource, params, nullptr,
+ out_bl, &bl, nullptr, null_yield);
+
+
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to abort multipart upload for dest object=" << dest_obj << " (ret=" << ret << ")" << dendl;
+ return ret;
+ }
+
+ return 0;
+}
+
+static int cloud_tier_init_multipart(const DoutPrefixProvider *dpp,
+ RGWRESTConn& dest_conn, const rgw_obj& dest_obj,
+ uint64_t obj_size, std::map<std::string, std::string>& attrs,
+ std::string& upload_id) {
+ bufferlist out_bl;
+ bufferlist bl;
+
+ struct InitMultipartResult {
+ std::string bucket;
+ std::string key;
+ std::string upload_id;
+
+ void decode_xml(XMLObj *obj) {
+ RGWXMLDecoder::decode_xml("Bucket", bucket, obj);
+ RGWXMLDecoder::decode_xml("Key", key, obj);
+ RGWXMLDecoder::decode_xml("UploadId", upload_id, obj);
+ }
+ } result;
+
+ int ret;
+ rgw_http_param_pair params[] = { { "uploads", nullptr }, {nullptr, nullptr} };
+
+ string resource = obj_to_aws_path(dest_obj);
+
+ ret = dest_conn.send_resource(dpp, "POST", resource, params, &attrs,
+ out_bl, &bl, nullptr, null_yield);
+
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to initialize multipart upload for dest object=" << dest_obj << dendl;
+ return ret;
+ }
+ /*
+ * If any of the following fails we cannot abort the upload, as we cannot
+ * extract the upload id. If one of these fails, it is very likely the
+ * least of our problems.
+ */
+ RGWXMLDecoder::XMLParser parser;
+ if (!parser.init()) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to initialize xml parser for parsing multipart init response from server" << dendl;
+ return -EIO;
+ }
+
+ if (!parser.parse(out_bl.c_str(), out_bl.length(), 1)) {
+ string str(out_bl.c_str(), out_bl.length());
+ ldpp_dout(dpp, 5) << "ERROR: failed to parse xml initmultipart: " << str << dendl;
+ return -EIO;
+ }
+
+ try {
+ RGWXMLDecoder::decode_xml("InitiateMultipartUploadResult", result, &parser, true);
+ } catch (RGWXMLDecoder::err& err) {
+ string str(out_bl.c_str(), out_bl.length());
+ ldpp_dout(dpp, 5) << "ERROR: unexpected xml: " << str << dendl;
+ return -EIO;
+ }
+
+ ldpp_dout(dpp, 20) << "init multipart result: bucket=" << result.bucket << " key=" << result.key << " upload_id=" << result.upload_id << dendl;
+
+ upload_id = result.upload_id;
+
+ return 0;
+}
+
+static int cloud_tier_complete_multipart(const DoutPrefixProvider *dpp,
+ RGWRESTConn& dest_conn, const rgw_obj& dest_obj,
+ std::string& upload_id,
+ const std::map<int, rgw_lc_multipart_part_info>& parts) {
+ rgw_http_param_pair params[] = { { "uploadId", upload_id.c_str() }, {nullptr, nullptr} };
+
+ stringstream ss;
+ XMLFormatter formatter;
+ int ret;
+
+ bufferlist bl, out_bl;
+ string resource = obj_to_aws_path(dest_obj);
+
+ struct CompleteMultipartReq {
+ std::map<int, rgw_lc_multipart_part_info> parts;
+
+ explicit CompleteMultipartReq(const std::map<int, rgw_lc_multipart_part_info>& _parts) : parts(_parts) {}
+
+ void dump_xml(Formatter *f) const {
+ for (const auto& p : parts) {
+ f->open_object_section("Part");
+ encode_xml("PartNumber", p.first, f);
+ encode_xml("ETag", p.second.etag, f);
+ f->close_section();
+ };
+ }
+ } req_enc(parts);
+
+ struct CompleteMultipartResult {
+ std::string location;
+ std::string bucket;
+ std::string key;
+ std::string etag;
+
+ void decode_xml(XMLObj *obj) {
+ RGWXMLDecoder::decode_xml("Location", bucket, obj);
+ RGWXMLDecoder::decode_xml("Bucket", bucket, obj);
+ RGWXMLDecoder::decode_xml("Key", key, obj);
+ RGWXMLDecoder::decode_xml("ETag", etag, obj);
+ }
+ } result;
+
+ encode_xml("CompleteMultipartUpload", req_enc, &formatter);
+
+ formatter.flush(ss);
+ bl.append(ss.str());
+
+ ret = dest_conn.send_resource(dpp, "POST", resource, params, nullptr,
+ out_bl, &bl, nullptr, null_yield);
+
+
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to complete multipart upload for dest object=" << dest_obj << dendl;
+ return ret;
+ }
+ /*
+ * If any of the following fails we cannot confirm the result of the
+ * completed upload, as we cannot parse the response. If one of these
+ * fails, it is very likely the least of our problems.
+ */
+ RGWXMLDecoder::XMLParser parser;
+ if (!parser.init()) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to initialize xml parser for parsing multipart init response from server" << dendl;
+ return -EIO;
+ }
+
+ if (!parser.parse(out_bl.c_str(), out_bl.length(), 1)) {
+ string str(out_bl.c_str(), out_bl.length());
+ ldpp_dout(dpp, 5) << "ERROR: failed to parse xml Completemultipart: " << str << dendl;
+ return -EIO;
+ }
+
+ try {
+ RGWXMLDecoder::decode_xml("CompleteMultipartUploadResult", result, &parser, true);
+ } catch (RGWXMLDecoder::err& err) {
+ string str(out_bl.c_str(), out_bl.length());
+ ldpp_dout(dpp, 5) << "ERROR: unexpected xml: " << str << dendl;
+ return -EIO;
+ }
+
+ ldpp_dout(dpp, 20) << "complete multipart result: location=" << result.location << " bucket=" << result.bucket << " key=" << result.key << " etag=" << result.etag << dendl;
+
+ return ret;
+}
+
+static int cloud_tier_abort_multipart_upload(RGWLCCloudTierCtx& tier_ctx,
+ const rgw_obj& dest_obj, const rgw_raw_obj& status_obj,
+ const std::string& upload_id) {
+ int ret;
+
+ ret = cloud_tier_abort_multipart(tier_ctx.dpp, tier_ctx.conn, dest_obj, upload_id);
+
+ if (ret < 0) {
+ ldpp_dout(tier_ctx.dpp, 0) << "ERROR: failed to abort multipart upload dest obj=" << dest_obj << " upload_id=" << upload_id << " ret=" << ret << dendl;
+ /* ignore error, best effort */
+ }
+ /* remove status obj */
+ ret = delete_upload_status(tier_ctx.dpp, tier_ctx.driver, &status_obj);
+ if (ret < 0) {
+ ldpp_dout(tier_ctx.dpp, 0) << "ERROR: failed to remove sync status obj obj=" << status_obj << " ret=" << ret << dendl;
+ // ignore error, best effort
+ }
+ return 0;
+}
+
+static int cloud_tier_multipart_transfer(RGWLCCloudTierCtx& tier_ctx) {
+ rgw_obj src_obj;
+ rgw_obj dest_obj;
+
+ uint64_t obj_size;
+ std::string src_etag;
+ rgw_rest_obj rest_obj;
+
+ rgw_lc_multipart_upload_info status;
+
+ std::map<std::string, std::string> new_attrs;
+
+ rgw_raw_obj status_obj;
+
+ RGWBucketInfo b;
+ std::string target_obj_name;
+ rgw_bucket target_bucket;
+
+ int ret;
+
+ rgw_lc_obj_properties obj_properties(tier_ctx.o.meta.mtime, tier_ctx.o.meta.etag,
+ tier_ctx.o.versioned_epoch, tier_ctx.acl_mappings,
+ tier_ctx.target_storage_class);
+
+ uint32_t part_size{0};
+ uint32_t num_parts{0};
+
+ int cur_part{0};
+ uint64_t cur_ofs{0};
+ std::map<int, rgw_lc_multipart_part_info> parts;
+
+ obj_size = tier_ctx.o.meta.size;
+
+ target_bucket.name = tier_ctx.target_bucket_name;
+
+ target_obj_name = tier_ctx.bucket_info.bucket.name + "/" +
+ tier_ctx.obj->get_name();
+ if (!tier_ctx.o.is_current()) {
+ target_obj_name += get_key_instance(tier_ctx.obj->get_key());
+ }
+ dest_obj.init(target_bucket, target_obj_name);
+
+ rgw_pool pool = static_cast<rgw::sal::RadosStore*>(tier_ctx.driver)->svc()->zone->get_zone_params().log_pool;
+ status_obj = rgw_raw_obj(pool, "lc_multipart_" + tier_ctx.obj->get_oid());
+
+ ret = read_upload_status(tier_ctx.dpp, tier_ctx.driver, &status_obj, &status);
+
+ if (ret < 0 && ret != -ENOENT) {
+ ldpp_dout(tier_ctx.dpp, 0) << "ERROR: failed to read sync status of object " << src_obj << " ret=" << ret << dendl;
+ return ret;
+ }
+
+ if (ret >= 0) {
+ // check here that mtime and size did not change
+ if (status.mtime != obj_properties.mtime || status.obj_size != obj_size ||
+ status.etag != obj_properties.etag) {
+ cloud_tier_abort_multipart_upload(tier_ctx, dest_obj, status_obj, status.upload_id);
+ ret = -ENOENT;
+ }
+ }
+
+ if (ret == -ENOENT) {
+ RGWLCStreamRead readf(tier_ctx.cct, tier_ctx.dpp, tier_ctx.obj, tier_ctx.o.meta.mtime);
+
+ readf.init();
+
+ rest_obj = readf.get_rest_obj();
+
+ RGWLCCloudStreamPut::init_send_attrs(tier_ctx.dpp, rest_obj, obj_properties, new_attrs);
+
+ ret = cloud_tier_init_multipart(tier_ctx.dpp, tier_ctx.conn, dest_obj, obj_size, new_attrs, status.upload_id);
+ if (ret < 0) {
+ return ret;
+ }
+
+ status.obj_size = obj_size;
+ status.mtime = obj_properties.mtime;
+ status.etag = obj_properties.etag;
+
+ ret = put_upload_status(tier_ctx.dpp, tier_ctx.driver, &status_obj, &status);
+
+ if (ret < 0) {
+ ldpp_dout(tier_ctx.dpp, 0) << "ERROR: failed to driver multipart upload state, ret=" << ret << dendl;
+ // continue with upload anyway
+ }
+
+#define MULTIPART_MAX_PARTS 10000
+ uint64_t min_part_size = obj_size / MULTIPART_MAX_PARTS;
+ uint64_t min_conf_size = tier_ctx.multipart_min_part_size;
+
+ if (min_conf_size < MULTIPART_MIN_POSSIBLE_PART_SIZE) {
+ min_conf_size = MULTIPART_MIN_POSSIBLE_PART_SIZE;
+ }
+
+ part_size = std::max(min_conf_size, min_part_size);
+ num_parts = (obj_size + part_size - 1) / part_size;
+ cur_part = 1;
+ cur_ofs = 0;
+ }
+
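+ // Worked example for a fresh upload (the -ENOENT path above; numbers are
+ // illustrative): assuming a 32 MiB minimum part size and a 1 GiB object,
+ // min_part_size = obj_size / 10000 is well below the configured minimum,
+ // so part_size = 32 MiB and num_parts = ceil(1 GiB / 32 MiB) = 32. In the
+ // general case the final part may be shorter than part_size.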
+ for (; (uint32_t)cur_part <= num_parts; ++cur_part) {
+ ldpp_dout(tier_ctx.dpp, 20) << "cur_part = "<< cur_part << ", info.ofs = " << cur_ofs << ", info.size = " << part_size << ", obj size = " << obj_size<< ", num_parts:" << num_parts << dendl;
+ rgw_lc_multipart_part_info& cur_part_info = parts[cur_part];
+ cur_part_info.part_num = cur_part;
+ cur_part_info.ofs = cur_ofs;
+ cur_part_info.size = std::min((uint64_t)part_size, obj_size - cur_ofs);
+
+ cur_ofs += cur_part_info.size;
+
+ ret = cloud_tier_send_multipart_part(tier_ctx,
+ status.upload_id,
+ cur_part_info,
+ &cur_part_info.etag);
+
+ if (ret < 0) {
+ ldpp_dout(tier_ctx.dpp, 0) << "ERROR: failed to send multipart part of obj=" << tier_ctx.obj << ", sync via multipart upload, upload_id=" << status.upload_id << " part number " << cur_part << " (error: " << cpp_strerror(-ret) << ")" << dendl;
+ cloud_tier_abort_multipart_upload(tier_ctx, dest_obj, status_obj, status.upload_id);
+ return ret;
+ }
+
+ }
+
+ ret = cloud_tier_complete_multipart(tier_ctx.dpp, tier_ctx.conn, dest_obj, status.upload_id, parts);
+ if (ret < 0) {
+ ldpp_dout(tier_ctx.dpp, 0) << "ERROR: failed to complete multipart upload of obj=" << tier_ctx.obj << " (error: " << cpp_strerror(-ret) << ")" << dendl;
+ cloud_tier_abort_multipart_upload(tier_ctx, dest_obj, status_obj, status.upload_id);
+ return ret;
+ }
+
+ /* remove status obj */
+ ret = delete_upload_status(tier_ctx.dpp, tier_ctx.driver, &status_obj);
+ if (ret < 0) {
+ ldpp_dout(tier_ctx.dpp, 0) << "ERROR: failed to abort multipart upload obj=" << tier_ctx.obj << " upload_id=" << status.upload_id << " part number " << cur_part << " (" << cpp_strerror(-ret) << ")" << dendl;
+ // ignore error, best effort
+ }
+ return 0;
+}
+
+/* Check if object has already been transitioned */
+static int cloud_tier_check_object(RGWLCCloudTierCtx& tier_ctx, bool& already_tiered) {
+ int ret;
+ std::map<std::string, std::string> headers;
+
+ /* Fetch Head object */
+ ret = cloud_tier_get_object(tier_ctx, true, headers);
+
+ if (ret < 0) {
+ ldpp_dout(tier_ctx.dpp, 0) << "ERROR: failed to fetch HEAD from cloud for obj=" << tier_ctx.obj << " , ret = " << ret << dendl;
+ return ret;
+ }
+
+ already_tiered = is_already_tiered(tier_ctx.dpp, headers, tier_ctx.o.meta.mtime);
+
+ if (already_tiered) {
+ ldpp_dout(tier_ctx.dpp, 20) << "is_already_tiered true" << dendl;
+ } else {
+ ldpp_dout(tier_ctx.dpp, 20) << "is_already_tiered false..going with out_crf writing" << dendl;
+ }
+
+ return ret;
+}
+
+static int cloud_tier_create_bucket(RGWLCCloudTierCtx& tier_ctx) {
+ bufferlist out_bl;
+ int ret = 0;
+ pair<string, string> key(tier_ctx.storage_class, tier_ctx.target_bucket_name);
+ struct CreateBucketResult {
+ std::string code;
+
+ void decode_xml(XMLObj *obj) {
+ RGWXMLDecoder::decode_xml("Code", code, obj);
+ }
+ } result;
+
+ ldpp_dout(tier_ctx.dpp, 30) << "Cloud_tier_ctx: creating bucket:" << tier_ctx.target_bucket_name << dendl;
+ bufferlist bl;
+ string resource = tier_ctx.target_bucket_name;
+
+ ret = tier_ctx.conn.send_resource(tier_ctx.dpp, "PUT", resource, nullptr, nullptr,
+ out_bl, &bl, nullptr, null_yield);
+
+ if (ret < 0 ) {
+ ldpp_dout(tier_ctx.dpp, 0) << "create target bucket : " << tier_ctx.target_bucket_name << " returned ret:" << ret << dendl;
+ }
+ if (out_bl.length() > 0) {
+ RGWXMLDecoder::XMLParser parser;
+ if (!parser.init()) {
+ ldpp_dout(tier_ctx.dpp, 0) << "ERROR: failed to initialize xml parser for parsing create_bucket response from server" << dendl;
+ return -EIO;
+ }
+
+ if (!parser.parse(out_bl.c_str(), out_bl.length(), 1)) {
+ string str(out_bl.c_str(), out_bl.length());
+ ldpp_dout(tier_ctx.dpp, 5) << "ERROR: failed to parse xml createbucket: " << str << dendl;
+ return -EIO;
+ }
+
+ try {
+ RGWXMLDecoder::decode_xml("Error", result, &parser, true);
+ } catch (RGWXMLDecoder::err& err) {
+ string str(out_bl.c_str(), out_bl.length());
+ ldpp_dout(tier_ctx.dpp, 5) << "ERROR: unexpected xml: " << str << dendl;
+ return -EIO;
+ }
+
+ if (result.code != "BucketAlreadyOwnedByYou" && result.code != "BucketAlreadyExists") {
+ ldpp_dout(tier_ctx.dpp, 0) << "ERROR: Creating target bucket failed with error: " << result.code << dendl;
+ return -EIO;
+ }
+ }
+
+ return 0;
+}
+
+int rgw_cloud_tier_transfer_object(RGWLCCloudTierCtx& tier_ctx, std::set<std::string>& cloud_targets) {
+ int ret = 0;
+
+ // check if target_path is already created
+ std::set<std::string>::iterator it;
+
+ it = cloud_targets.find(tier_ctx.target_bucket_name);
+ tier_ctx.target_bucket_created = (it != cloud_targets.end());
+
+ /* If run first time attempt to create the target bucket */
+ if (!tier_ctx.target_bucket_created) {
+ ret = cloud_tier_create_bucket(tier_ctx);
+
+ if (ret < 0) {
+ ldpp_dout(tier_ctx.dpp, 0) << "ERROR: failed to create target bucket on the cloud endpoint ret=" << ret << dendl;
+ return ret;
+ }
+ tier_ctx.target_bucket_created = true;
+ cloud_targets.insert(tier_ctx.target_bucket_name);
+ }
+
+ /* Since multiple zones may try to transition the same object to the cloud,
+ * verify whether the object has already been transitioned. Since this is
+ * just a best-effort check, do not bail out on errors.
+ */
+ bool already_tiered = false;
+ ret = cloud_tier_check_object(tier_ctx, already_tiered);
+
+ if (ret < 0) {
+ ldpp_dout(tier_ctx.dpp, 0) << "ERROR: failed to check object on the cloud endpoint ret=" << ret << dendl;
+ }
+
+ if (already_tiered) {
+ ldpp_dout(tier_ctx.dpp, 20) << "Object (" << tier_ctx.o.key << ") is already tiered" << dendl;
+ return 0;
+ }
+
+ uint64_t size = tier_ctx.o.meta.size;
+ uint64_t multipart_sync_threshold = tier_ctx.multipart_sync_threshold;
+
+ if (multipart_sync_threshold < MULTIPART_MIN_POSSIBLE_PART_SIZE) {
+ multipart_sync_threshold = MULTIPART_MIN_POSSIBLE_PART_SIZE;
+ }
+
+ if (size < multipart_sync_threshold) {
+ ret = cloud_tier_plain_transfer(tier_ctx);
+ } else {
+ tier_ctx.is_multipart_upload = true;
+ ret = cloud_tier_multipart_transfer(tier_ctx);
+ }
+
+ if (ret < 0) {
+ ldpp_dout(tier_ctx.dpp, 0) << "ERROR: failed to transition object ret=" << ret << dendl;
+ }
+
+ return ret;
+}
diff --git a/src/rgw/driver/rados/rgw_lc_tier.h b/src/rgw/driver/rados/rgw_lc_tier.h
new file mode 100644
index 000000000..729c4c304
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_lc_tier.h
@@ -0,0 +1,51 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include "rgw_lc.h"
+#include "rgw_rest_conn.h"
+#include "rgw_rados.h"
+#include "rgw_zone.h"
+#include "rgw_sal_rados.h"
+#include "rgw_cr_rest.h"
+
+#define DEFAULT_MULTIPART_SYNC_PART_SIZE (32 * 1024 * 1024)
+#define MULTIPART_MIN_POSSIBLE_PART_SIZE (5 * 1024 * 1024)
+
+struct RGWLCCloudTierCtx {
+ CephContext *cct;
+ const DoutPrefixProvider *dpp;
+
+ /* Source */
+ rgw_bucket_dir_entry& o;
+ rgw::sal::Driver *driver;
+ RGWBucketInfo& bucket_info;
+ std::string storage_class;
+
+ rgw::sal::Object *obj;
+
+ /* Remote */
+ RGWRESTConn& conn;
+ std::string target_bucket_name;
+ std::string target_storage_class;
+
+ std::map<std::string, RGWTierACLMapping> acl_mappings;
+ uint64_t multipart_min_part_size;
+ uint64_t multipart_sync_threshold;
+
+ bool is_multipart_upload{false};
+ bool target_bucket_created{true};
+
+ RGWLCCloudTierCtx(CephContext* _cct, const DoutPrefixProvider *_dpp,
+ rgw_bucket_dir_entry& _o, rgw::sal::Driver *_driver,
+ RGWBucketInfo &_binfo, rgw::sal::Object *_obj,
+ RGWRESTConn& _conn, std::string& _bucket,
+ std::string& _storage_class) :
+ cct(_cct), dpp(_dpp), o(_o), driver(_driver), bucket_info(_binfo),
+ obj(_obj), conn(_conn), target_bucket_name(_bucket),
+ target_storage_class(_storage_class) {}
+};
+
+/* Transition object to cloud endpoint */
+int rgw_cloud_tier_transfer_object(RGWLCCloudTierCtx& tier_ctx, std::set<std::string>& cloud_targets);
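+
+/* Minimal calling sketch (names and config values here are assumptions used
+ * for illustration, not provided by this header). The lifecycle code builds
+ * a context around the source entry and the remote connection, sets the
+ * multipart thresholds, and lets the transfer routine choose between a plain
+ * and a multipart upload:
+ *
+ * RGWLCCloudTierCtx tier_ctx(cct, dpp, o, driver, bucket_info, obj,
+ * conn, target_bucket_name, target_storage_class);
+ * tier_ctx.acl_mappings = tier_config.acl_mappings; // hypothetical source
+ * tier_ctx.multipart_min_part_size = DEFAULT_MULTIPART_SYNC_PART_SIZE;
+ * tier_ctx.multipart_sync_threshold = DEFAULT_MULTIPART_SYNC_PART_SIZE;
+ * std::set<std::string> cloud_targets; // caches target buckets already created
+ * int r = rgw_cloud_tier_transfer_object(tier_ctx, cloud_targets);
+ */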
diff --git a/src/rgw/driver/rados/rgw_log_backing.cc b/src/rgw/driver/rados/rgw_log_backing.cc
new file mode 100644
index 000000000..7c9dafe7e
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_log_backing.cc
@@ -0,0 +1,708 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "cls/log/cls_log_client.h"
+#include "cls/version/cls_version_client.h"
+
+#include "rgw_log_backing.h"
+#include "rgw_tools.h"
+#include "cls_fifo_legacy.h"
+
+using namespace std::chrono_literals;
+namespace cb = ceph::buffer;
+
+static constexpr auto dout_subsys = ceph_subsys_rgw;
+
+enum class shard_check { dne, omap, fifo, corrupt };
+inline std::ostream& operator <<(std::ostream& m, const shard_check& t) {
+ switch (t) {
+ case shard_check::dne:
+ return m << "shard_check::dne";
+ case shard_check::omap:
+ return m << "shard_check::omap";
+ case shard_check::fifo:
+ return m << "shard_check::fifo";
+ case shard_check::corrupt:
+ return m << "shard_check::corrupt";
+ }
+
+ return m << "shard_check::UNKNOWN=" << static_cast<uint32_t>(t);
+}
+
+namespace {
+/// Return the shard type, and a bool to see whether it has entries.
+shard_check
+probe_shard(const DoutPrefixProvider *dpp, librados::IoCtx& ioctx, const std::string& oid,
+ bool& fifo_unsupported, optional_yield y)
+{
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " probing oid=" << oid
+ << dendl;
+ if (!fifo_unsupported) {
+ std::unique_ptr<rgw::cls::fifo::FIFO> fifo;
+ auto r = rgw::cls::fifo::FIFO::open(dpp, ioctx, oid,
+ &fifo, y,
+ std::nullopt, true);
+ switch (r) {
+ case 0:
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << ": oid=" << oid << " is FIFO"
+ << dendl;
+ return shard_check::fifo;
+
+ case -ENODATA:
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << ": oid=" << oid << " is empty and therefore OMAP"
+ << dendl;
+ return shard_check::omap;
+
+ case -ENOENT:
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << ": oid=" << oid << " does not exist"
+ << dendl;
+ return shard_check::dne;
+
+ case -EPERM:
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << ": FIFO is unsupported, marking."
+ << dendl;
+ fifo_unsupported = true;
+ return shard_check::omap;
+
+ default:
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << ": error probing: r=" << r
+ << ", oid=" << oid << dendl;
+ return shard_check::corrupt;
+ }
+ } else {
+ // Since FIFO is unsupported, OMAP is the only alternative
+ return shard_check::omap;
+ }
+}
+
+tl::expected<log_type, bs::error_code>
+handle_dne(const DoutPrefixProvider *dpp, librados::IoCtx& ioctx,
+ log_type def,
+ std::string oid,
+ bool fifo_unsupported,
+ optional_yield y)
+{
+ if (def == log_type::fifo) {
+ if (fifo_unsupported) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " WARNING: FIFO set as default but not supported by OSD. "
+ << "Falling back to OMAP." << dendl;
+ return log_type::omap;
+ }
+ std::unique_ptr<rgw::cls::fifo::FIFO> fifo;
+ auto r = rgw::cls::fifo::FIFO::create(dpp, ioctx, oid,
+ &fifo, y,
+ std::nullopt);
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " error creating FIFO: r=" << r
+ << ", oid=" << oid << dendl;
+ return tl::unexpected(bs::error_code(-r, bs::system_category()));
+ }
+ }
+ return def;
+}
+}
+
+tl::expected<log_type, bs::error_code>
+log_backing_type(const DoutPrefixProvider *dpp,
+ librados::IoCtx& ioctx,
+ log_type def,
+ int shards,
+ const fu2::unique_function<std::string(int) const>& get_oid,
+ optional_yield y)
+{
+ auto check = shard_check::dne;
+ bool fifo_unsupported = false;
+ for (int i = 0; i < shards; ++i) {
+ auto c = probe_shard(dpp, ioctx, get_oid(i), fifo_unsupported, y);
+ if (c == shard_check::corrupt)
+ return tl::unexpected(bs::error_code(EIO, bs::system_category()));
+ if (c == shard_check::dne) continue;
+ if (check == shard_check::dne) {
+ check = c;
+ continue;
+ }
+
+ if (check != c) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " clashing types: check=" << check
+ << ", c=" << c << dendl;
+ return tl::unexpected(bs::error_code(EIO, bs::system_category()));
+ }
+ }
+ if (check == shard_check::corrupt) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " should be unreachable!" << dendl;
+ return tl::unexpected(bs::error_code(EIO, bs::system_category()));
+ }
+
+ if (check == shard_check::dne)
+ return handle_dne(dpp, ioctx,
+ def,
+ get_oid(0),
+ fifo_unsupported,
+ y);
+
+ return (check == shard_check::fifo ? log_type::fifo : log_type::omap);
+}
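+
+// Summary of the decision above: every existing shard must report the same
+// backing; a mix of FIFO and OMAP shards (or a probe error) yields EIO. A
+// shard that exists but has no FIFO metadata (-ENODATA) is treated as OMAP,
+// and -EPERM from the FIFO probe marks FIFO as unsupported by the OSDs so all
+// remaining shards fall back to OMAP. Only when no shard exists at all does
+// handle_dne() apply the default, eagerly creating a FIFO when the default is
+// log_type::fifo.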
+
+bs::error_code log_remove(const DoutPrefixProvider *dpp,
+ librados::IoCtx& ioctx,
+ int shards,
+ const fu2::unique_function<std::string(int) const>& get_oid,
+ bool leave_zero,
+ optional_yield y)
+{
+ bs::error_code ec;
+ for (int i = 0; i < shards; ++i) {
+ auto oid = get_oid(i);
+ rados::cls::fifo::info info;
+ uint32_t part_header_size = 0, part_entry_overhead = 0;
+
+ auto r = rgw::cls::fifo::get_meta(dpp, ioctx, oid, std::nullopt, &info,
+ &part_header_size, &part_entry_overhead,
+ 0, y, true);
+ if (r == -ENOENT) continue;
+ if (r == 0 && info.head_part_num > -1) {
+ for (auto j = info.tail_part_num; j <= info.head_part_num; ++j) {
+ librados::ObjectWriteOperation op;
+ op.remove();
+ auto part_oid = info.part_oid(j);
+ auto subr = rgw_rados_operate(dpp, ioctx, part_oid, &op, null_yield);
+ if (subr < 0 && subr != -ENOENT) {
+ if (!ec)
+ ec = bs::error_code(-subr, bs::system_category());
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << ": failed removing FIFO part: part_oid=" << part_oid
+ << ", subr=" << subr << dendl;
+ }
+ }
+ }
+ if (r < 0 && r != -ENODATA) {
+ if (!ec)
+ ec = bs::error_code(-r, bs::system_category());
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << ": failed checking FIFO part: oid=" << oid
+ << ", r=" << r << dendl;
+ }
+ librados::ObjectWriteOperation op;
+ if (i == 0 && leave_zero) {
+ // Leave shard 0 in existence, but remove contents and
+ // omap. cls_lock stores things in the xattrs. And sync needs to
+ // rendezvous with locks on generation 0 shard 0.
+ op.omap_set_header({});
+ op.omap_clear();
+ op.truncate(0);
+ } else {
+ op.remove();
+ }
+ r = rgw_rados_operate(dpp, ioctx, oid, &op, null_yield);
+ if (r < 0 && r != -ENOENT) {
+ if (!ec)
+ ec = bs::error_code(-r, bs::system_category());
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << ": failed removing shard: oid=" << oid
+ << ", r=" << r << dendl;
+ }
+ }
+ return ec;
+}
+
+logback_generations::~logback_generations() {
+ if (watchcookie > 0) {
+ auto cct = static_cast<CephContext*>(ioctx.cct());
+ auto r = ioctx.unwatch2(watchcookie);
+ if (r < 0) {
+ lderr(cct) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << ": failed unwatching oid=" << oid
+ << ", r=" << r << dendl;
+ }
+ }
+}
+
+bs::error_code logback_generations::setup(const DoutPrefixProvider *dpp,
+ log_type def,
+ optional_yield y) noexcept
+{
+ try {
+ // First, read.
+ auto cct = static_cast<CephContext*>(ioctx.cct());
+ auto res = read(dpp, y);
+ if (!res && res.error() != bs::errc::no_such_file_or_directory) {
+ return res.error();
+ }
+ if (res) {
+ std::unique_lock lock(m);
+ std::tie(entries_, version) = std::move(*res);
+ } else {
+ // Are we the first? Then create generation 0 and the generations
+ // metadata.
+ librados::ObjectWriteOperation op;
+ auto type = log_backing_type(dpp, ioctx, def, shards,
+ [this](int shard) {
+ return this->get_oid(0, shard);
+ }, y);
+ if (!type)
+ return type.error();
+
+ logback_generation l;
+ l.type = *type;
+
+ std::unique_lock lock(m);
+ version.ver = 1;
+ static constexpr auto TAG_LEN = 24;
+ version.tag.clear();
+ append_rand_alpha(cct, version.tag, version.tag, TAG_LEN);
+ op.create(true);
+ cls_version_set(op, version);
+ cb::list bl;
+ entries_.emplace(0, std::move(l));
+ encode(entries_, bl);
+ lock.unlock();
+
+ op.write_full(bl);
+ auto r = rgw_rados_operate(dpp, ioctx, oid, &op, y);
+ if (r < 0 && r != -EEXIST) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << ": failed writing oid=" << oid
+ << ", r=" << r << dendl;
+ return bs::error_code(-r, bs::system_category());
+ }
+ // Did someone race us? Then re-read.
+ if (r != 0) {
+ res = read(dpp, y);
+ if (!res)
+ return res.error();
+ if (res->first.empty())
+ return bs::error_code(EIO, bs::system_category());
+ auto l = res->first.begin()->second;
+ // In the unlikely event that someone raced us, created
+ // generation zero, incremented, then erased generation zero,
+ // don't leave generation zero lying around.
+ if (l.gen_id != 0) {
+ auto ec = log_remove(dpp, ioctx, shards,
+ [this](int shard) {
+ return this->get_oid(0, shard);
+ }, true, y);
+ if (ec) return ec;
+ }
+ std::unique_lock lock(m);
+ std::tie(entries_, version) = std::move(*res);
+ }
+ }
+ // Pass all non-empty generations to the handler
+ std::unique_lock lock(m);
+ auto i = lowest_nomempty(entries_);
+ entries_t e;
+ std::copy(i, entries_.cend(),
+ std::inserter(e, e.end()));
+ m.unlock();
+ auto ec = watch();
+ if (ec) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << ": failed to re-establish watch, unsafe to continue: oid="
+ << oid << ", ec=" << ec.message() << dendl;
+ }
+ return handle_init(std::move(e));
+ } catch (const std::bad_alloc&) {
+ return bs::error_code(ENOMEM, bs::system_category());
+ }
+}
+
+bs::error_code logback_generations::update(const DoutPrefixProvider *dpp, optional_yield y) noexcept
+{
+ try {
+ auto res = read(dpp, y);
+ if (!res) {
+ return res.error();
+ }
+
+ std::unique_lock l(m);
+ auto& [es, v] = *res;
+ if (v == version) {
+ // Nothing to do!
+ return {};
+ }
+
+ // Check consistency and prepare update
+ if (es.empty()) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << ": INCONSISTENCY! Read empty update." << dendl;
+ return bs::error_code(EFAULT, bs::system_category());
+ }
+ auto cur_lowest = lowest_nomempty(entries_);
+ // Straight up can't happen
+ assert(cur_lowest != entries_.cend());
+ auto new_lowest = lowest_nomempty(es);
+ if (new_lowest == es.cend()) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << ": INCONSISTENCY! Read update with no active head." << dendl;
+ return bs::error_code(EFAULT, bs::system_category());
+ }
+ if (new_lowest->first < cur_lowest->first) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << ": INCONSISTENCY! Tail moved wrong way." << dendl;
+ return bs::error_code(EFAULT, bs::system_category());
+ }
+
+ std::optional<uint64_t> highest_empty;
+ if (new_lowest->first > cur_lowest->first && new_lowest != es.begin()) {
+ --new_lowest;
+ highest_empty = new_lowest->first;
+ }
+
+ entries_t new_entries;
+
+ if ((es.end() - 1)->first < (entries_.end() - 1)->first) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << ": INCONSISTENCY! Head moved wrong way." << dendl;
+ return bs::error_code(EFAULT, bs::system_category());
+ }
+
+ if ((es.end() - 1)->first > (entries_.end() - 1)->first) {
+ auto ei = es.lower_bound((entries_.end() - 1)->first + 1);
+ std::copy(ei, es.end(), std::inserter(new_entries, new_entries.end()));
+ }
+
+ // Everything checks out!
+
+ version = v;
+ entries_ = es;
+ l.unlock();
+
+ if (highest_empty) {
+ auto ec = handle_empty_to(*highest_empty);
+ if (ec) return ec;
+ }
+
+ if (!new_entries.empty()) {
+ auto ec = handle_new_gens(std::move(new_entries));
+ if (ec) return ec;
+ }
+ } catch (const std::bad_alloc&) {
+ return bs::error_code(ENOMEM, bs::system_category());
+ }
+ return {};
+}
+
+auto logback_generations::read(const DoutPrefixProvider *dpp, optional_yield y) noexcept ->
+ tl::expected<std::pair<entries_t, obj_version>, bs::error_code>
+{
+ try {
+ librados::ObjectReadOperation op;
+ std::unique_lock l(m);
+ cls_version_check(op, version, VER_COND_GE);
+ l.unlock();
+ obj_version v2;
+ cls_version_read(op, &v2);
+ cb::list bl;
+ op.read(0, 0, &bl, nullptr);
+ auto r = rgw_rados_operate(dpp, ioctx, oid, &op, nullptr, y);
+ if (r < 0) {
+ if (r == -ENOENT) {
+ ldpp_dout(dpp, 5) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << ": oid=" << oid
+ << " not found" << dendl;
+ } else {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << ": failed reading oid=" << oid
+ << ", r=" << r << dendl;
+ }
+ return tl::unexpected(bs::error_code(-r, bs::system_category()));
+ }
+ auto bi = bl.cbegin();
+ entries_t e;
+ try {
+ decode(e, bi);
+ } catch (const cb::error& err) {
+ return tl::unexpected(err.code());
+ }
+ return std::pair{ std::move(e), std::move(v2) };
+ } catch (const std::bad_alloc&) {
+ return tl::unexpected(bs::error_code(ENOMEM, bs::system_category()));
+ }
+}
+
+bs::error_code logback_generations::write(const DoutPrefixProvider *dpp, entries_t&& e,
+ std::unique_lock<std::mutex>&& l_,
+ optional_yield y) noexcept
+{
+ auto l = std::move(l_);
+ ceph_assert(l.mutex() == &m &&
+ l.owns_lock());
+ try {
+ librados::ObjectWriteOperation op;
+ cls_version_check(op, version, VER_COND_GE);
+ cb::list bl;
+ encode(e, bl);
+ op.write_full(bl);
+ cls_version_inc(op);
+ auto r = rgw_rados_operate(dpp, ioctx, oid, &op, y);
+ if (r == 0) {
+ entries_ = std::move(e);
+ version.inc();
+ return {};
+ }
+ l.unlock();
+ if (r < 0 && r != -ECANCELED) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << ": failed reading oid=" << oid
+ << ", r=" << r << dendl;
+ return { -r, bs::system_category() };
+ }
+ if (r == -ECANCELED) {
+ auto ec = update(dpp, y);
+ if (ec) {
+ return ec;
+ } else {
+ return { ECANCELED, bs::system_category() };
+ }
+ }
+ } catch (const std::bad_alloc&) {
+ return { ENOMEM, bs::system_category() };
+ }
+ return {};
+}
+
+
+bs::error_code logback_generations::watch() noexcept {
+ try {
+ auto cct = static_cast<CephContext*>(ioctx.cct());
+ auto r = ioctx.watch2(oid, &watchcookie, this);
+ if (r < 0) {
+ lderr(cct) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << ": failed to set watch oid=" << oid
+ << ", r=" << r << dendl;
+ return { -r, bs::system_category() };
+ }
+ } catch (const std::bad_alloc&) {
+ return bs::error_code(ENOMEM, bs::system_category());
+ }
+ return {};
+}
+
+bs::error_code logback_generations::new_backing(const DoutPrefixProvider *dpp,
+ log_type type,
+ optional_yield y) noexcept {
+ static constexpr auto max_tries = 10;
+ try {
+ auto ec = update(dpp, y);
+ if (ec) return ec;
+ auto tries = 0;
+ entries_t new_entries;
+ do {
+ std::unique_lock l(m);
+ auto last = entries_.end() - 1;
+ if (last->second.type == type) {
+ // Nothing to be done
+ return {};
+ }
+ auto newgenid = last->first + 1;
+ logback_generation newgen;
+ newgen.gen_id = newgenid;
+ newgen.type = type;
+ new_entries.emplace(newgenid, newgen);
+ auto es = entries_;
+ es.emplace(newgenid, std::move(newgen));
+ ec = write(dpp, std::move(es), std::move(l), y);
+ ++tries;
+ } while (ec == bs::errc::operation_canceled &&
+ tries < max_tries);
+ if (tries >= max_tries) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << ": exhausted retry attempts." << dendl;
+ return ec;
+ }
+
+ if (ec) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << ": write failed with ec=" << ec.message() << dendl;
+ return ec;
+ }
+
+ cb::list bl, rbl;
+
+ auto r = rgw_rados_notify(dpp, ioctx, oid, bl, 10'000, &rbl, y);
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << ": notify failed with r=" << r << dendl;
+ return { -r, bs::system_category() };
+ }
+ ec = handle_new_gens(new_entries);
+ } catch (const std::bad_alloc&) {
+ return bs::error_code(ENOMEM, bs::system_category());
+ }
+ return {};
+}
+
+bs::error_code logback_generations::empty_to(const DoutPrefixProvider *dpp,
+ uint64_t gen_id,
+ optional_yield y) noexcept {
+ static constexpr auto max_tries = 10;
+ try {
+ auto ec = update(dpp, y);
+ if (ec) return ec;
+ auto tries = 0;
+ uint64_t newtail = 0;
+ do {
+ std::unique_lock l(m);
+ {
+ auto last = entries_.end() - 1;
+ if (gen_id >= last->first) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << ": Attempt to trim beyond the possible." << dendl;
+ return bs::error_code(EINVAL, bs::system_category());
+ }
+ }
+ auto es = entries_;
+ auto ei = es.upper_bound(gen_id);
+ if (ei == es.begin()) {
+ // Nothing to be done.
+ return {};
+ }
+ for (auto i = es.begin(); i < ei; ++i) {
+ newtail = i->first;
+ i->second.pruned = ceph::real_clock::now();
+ }
+ ec = write(dpp, std::move(es), std::move(l), y);
+ ++tries;
+ } while (ec == bs::errc::operation_canceled &&
+ tries < max_tries);
+ if (tries >= max_tries) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << ": exhausted retry attempts." << dendl;
+ return ec;
+ }
+
+ if (ec) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << ": write failed with ec=" << ec.message() << dendl;
+ return ec;
+ }
+
+ cb::list bl, rbl;
+
+ auto r = rgw_rados_notify(dpp, ioctx, oid, bl, 10'000, &rbl, y);
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << ": notify failed with r=" << r << dendl;
+ return { -r, bs::system_category() };
+ }
+ ec = handle_empty_to(newtail);
+ } catch (const std::bad_alloc&) {
+ return bs::error_code(ENOMEM, bs::system_category());
+ }
+ return {};
+}
+
+bs::error_code logback_generations::remove_empty(const DoutPrefixProvider *dpp, optional_yield y) noexcept {
+ static constexpr auto max_tries = 10;
+ try {
+ auto ec = update(dpp, y);
+ if (ec) return ec;
+ auto tries = 0;
+ entries_t new_entries;
+ std::unique_lock l(m);
+ ceph_assert(!entries_.empty());
+ {
+ auto i = lowest_nomempty(entries_);
+ if (i == entries_.begin()) {
+ return {};
+ }
+ }
+ entries_t es;
+ auto now = ceph::real_clock::now();
+ l.unlock();
+ do {
+ std::copy_if(entries_.cbegin(), entries_.cend(),
+ std::inserter(es, es.end()),
+ [now](const auto& e) {
+ if (!e.second.pruned)
+ return false;
+
+ auto pruned = *e.second.pruned;
+ return (now - pruned) >= 1h;
+ });
+ auto es2 = entries_;
+ for (const auto& [gen_id, e] : es) {
+ ceph_assert(e.pruned);
+ auto ec = log_remove(dpp, ioctx, shards,
+ [this, gen_id = gen_id](int shard) {
+ return this->get_oid(gen_id, shard);
+ }, (gen_id == 0), y);
+ if (ec) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << ": Error pruning: gen_id=" << gen_id
+ << " ec=" << ec.message() << dendl;
+ }
+ if (auto i = es2.find(gen_id); i != es2.end()) {
+ es2.erase(i);
+ }
+ }
+ l.lock();
+ es.clear();
+ ec = write(dpp, std::move(es2), std::move(l), y);
+ ++tries;
+ } while (ec == bs::errc::operation_canceled &&
+ tries < max_tries);
+ if (tries >= max_tries) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << ": exhausted retry attempts." << dendl;
+ return ec;
+ }
+
+ if (ec) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << ": write failed with ec=" << ec.message() << dendl;
+ return ec;
+ }
+ } catch (const std::bad_alloc&) {
+ return bs::error_code(ENOMEM, bs::system_category());
+ }
+ return {};
+}
+
+void logback_generations::handle_notify(uint64_t notify_id,
+ uint64_t cookie,
+ uint64_t notifier_id,
+ bufferlist& bl)
+{
+ auto cct = static_cast<CephContext*>(ioctx.cct());
+ const DoutPrefix dp(cct, dout_subsys, "logback generations handle_notify: ");
+ if (notifier_id != my_id) {
+ auto ec = update(&dp, null_yield);
+ if (ec) {
+ lderr(cct)
+ << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << ": update failed, no one to report to and no safe way to continue."
+ << dendl;
+ abort();
+ }
+ }
+ cb::list rbl;
+ ioctx.notify_ack(oid, notify_id, watchcookie, rbl);
+}
+
+void logback_generations::handle_error(uint64_t cookie, int err) {
+ auto cct = static_cast<CephContext*>(ioctx.cct());
+ auto r = ioctx.unwatch2(watchcookie);
+ if (r < 0) {
+ lderr(cct) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << ": failed to set unwatch oid=" << oid
+ << ", r=" << r << dendl;
+ }
+
+ auto ec = watch();
+ if (ec) {
+ lderr(cct) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << ": failed to re-establish watch, unsafe to continue: oid="
+ << oid << ", ec=" << ec.message() << dendl;
+ }
+}
diff --git a/src/rgw/driver/rados/rgw_log_backing.h b/src/rgw/driver/rados/rgw_log_backing.h
new file mode 100644
index 000000000..3dfdb8ee4
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_log_backing.h
@@ -0,0 +1,394 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include <optional>
+#include <iostream>
+#include <string>
+#include <string_view>
+
+#include <strings.h>
+
+#include <boost/container/flat_map.hpp>
+#include <boost/system/error_code.hpp>
+
+#include <fmt/format.h>
+
+#include "include/rados/librados.hpp"
+#include "include/encoding.h"
+#include "include/expected.hpp"
+#include "include/function2.hpp"
+
+#include "cls/version/cls_version_types.h"
+
+#include "common/async/yield_context.h"
+#include "common/Formatter.h"
+#include "common/strtol.h"
+
+namespace bc = boost::container;
+namespace bs = boost::system;
+
+#include "cls_fifo_legacy.h"
+
+/// Type of log backing, stored in the mark used in the quick check,
+/// and passed to checking functions.
+enum class log_type {
+ omap = 0,
+ fifo = 1
+};
+
+inline void encode(const log_type& type, ceph::buffer::list& bl) {
+ auto t = static_cast<uint8_t>(type);
+ encode(t, bl);
+}
+
+inline void decode(log_type& type, bufferlist::const_iterator& bl) {
+ uint8_t t;
+ decode(t, bl);
+ type = static_cast<log_type>(t);
+}
+
+inline std::optional<log_type> to_log_type(std::string_view s) {
+ if (strncasecmp(s.data(), "omap", s.length()) == 0) {
+ return log_type::omap;
+ } else if (strncasecmp(s.data(), "fifo", s.length()) == 0) {
+ return log_type::fifo;
+ } else {
+ return std::nullopt;
+ }
+}
+inline std::ostream& operator <<(std::ostream& m, const log_type& t) {
+ switch (t) {
+ case log_type::omap:
+ return m << "log_type::omap";
+ case log_type::fifo:
+ return m << "log_type::fifo";
+ }
+
+ return m << "log_type::UNKNOWN=" << static_cast<uint32_t>(t);
+}
+
+/// Look over the shards in a log and determine the type.
+tl::expected<log_type, bs::error_code>
+log_backing_type(const DoutPrefixProvider *dpp,
+ librados::IoCtx& ioctx,
+ log_type def,
+ int shards, //< Total number of shards
+ /// A function taking a shard number and
+ /// returning an oid.
+ const fu2::unique_function<std::string(int) const>& get_oid,
+ optional_yield y);
+
+/// Remove all log shards and associated parts of fifos.
+bs::error_code log_remove(const DoutPrefixProvider *dpp,
+			  librados::IoCtx& ioctx,
+ int shards, //< Total number of shards
+ /// A function taking a shard number and
+ /// returning an oid.
+ const fu2::unique_function<std::string(int) const>& get_oid,
+ bool leave_zero,
+ optional_yield y);
+
+
+struct logback_generation {
+ uint64_t gen_id = 0;
+ log_type type;
+ std::optional<ceph::real_time> pruned;
+
+ void encode(ceph::buffer::list& bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(gen_id, bl);
+ encode(type, bl);
+ encode(pruned, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::const_iterator& bl) {
+ DECODE_START(1, bl);
+ decode(gen_id, bl);
+ decode(type, bl);
+ decode(pruned, bl);
+ DECODE_FINISH(bl);
+ }
+};
+WRITE_CLASS_ENCODER(logback_generation)
+inline std::ostream& operator <<(std::ostream& m, const logback_generation& g) {
+ return m << "[" << g.gen_id << "," << g.type << ","
+ << (g.pruned ? "PRUNED" : "NOT PRUNED") << "]";
+}
+
+class logback_generations : public librados::WatchCtx2 {
+public:
+ using entries_t = bc::flat_map<uint64_t, logback_generation>;
+
+protected:
+ librados::IoCtx& ioctx;
+ logback_generations(librados::IoCtx& ioctx,
+ std::string oid,
+ fu2::unique_function<std::string(
+ uint64_t, int) const>&& get_oid,
+ int shards) noexcept
+ : ioctx(ioctx), oid(oid), get_oid(std::move(get_oid)),
+ shards(shards) {}
+
+ uint64_t my_id = ioctx.get_instance_id();
+
+private:
+ const std::string oid;
+ const fu2::unique_function<std::string(uint64_t, int) const> get_oid;
+
+protected:
+ const int shards;
+
+private:
+
+ uint64_t watchcookie = 0;
+
+ obj_version version;
+ std::mutex m;
+ entries_t entries_;
+
+ tl::expected<std::pair<entries_t, obj_version>, bs::error_code>
+ read(const DoutPrefixProvider *dpp, optional_yield y) noexcept;
+ bs::error_code write(const DoutPrefixProvider *dpp, entries_t&& e, std::unique_lock<std::mutex>&& l_,
+ optional_yield y) noexcept;
+ bs::error_code setup(const DoutPrefixProvider *dpp, log_type def, optional_yield y) noexcept;
+
+ bs::error_code watch() noexcept;
+
+ auto lowest_nomempty(const entries_t& es) {
+ return std::find_if(es.begin(), es.end(),
+ [](const auto& e) {
+ return !e.second.pruned;
+ });
+ }
+
+public:
+
+ /// For the use of watch/notify.
+
+ void handle_notify(uint64_t notify_id,
+ uint64_t cookie,
+ uint64_t notifier_id,
+ bufferlist& bl) override final;
+
+ void handle_error(uint64_t cookie, int err) override final;
+
+ /// Public interface
+
+ virtual ~logback_generations();
+
+ template<typename T, typename... Args>
+ static tl::expected<std::unique_ptr<T>, bs::error_code>
+ init(const DoutPrefixProvider *dpp, librados::IoCtx& ioctx_, std::string oid_,
+ fu2::unique_function<std::string(uint64_t, int) const>&& get_oid_,
+ int shards_, log_type def, optional_yield y,
+ Args&& ...args) noexcept {
+ try {
+ T* lgp = new T(ioctx_, std::move(oid_),
+ std::move(get_oid_),
+ shards_, std::forward<Args>(args)...);
+ std::unique_ptr<T> lg(lgp);
+ lgp = nullptr;
+ auto ec = lg->setup(dpp, def, y);
+ if (ec)
+ return tl::unexpected(ec);
+ // Obnoxiousness for C++ Compiler in Bionic Beaver
+ return tl::expected<std::unique_ptr<T>, bs::error_code>(std::move(lg));
+ } catch (const std::bad_alloc&) {
+ return tl::unexpected(bs::error_code(ENOMEM, bs::system_category()));
+ }
+ }
+
+ bs::error_code update(const DoutPrefixProvider *dpp, optional_yield y) noexcept;
+
+ entries_t entries() const {
+ return entries_;
+ }
+
+ bs::error_code new_backing(const DoutPrefixProvider *dpp, log_type type, optional_yield y) noexcept;
+
+ bs::error_code empty_to(const DoutPrefixProvider *dpp, uint64_t gen_id, optional_yield y) noexcept;
+
+ bs::error_code remove_empty(const DoutPrefixProvider *dpp, optional_yield y) noexcept;
+
+ // Callbacks, to be defined by descendant.
+
+ /// Handle initialization on startup
+ ///
+ /// @param e All non-empty generations
+ virtual bs::error_code handle_init(entries_t e) noexcept = 0;
+
+ /// Handle new generations.
+ ///
+ /// @param e Map of generations added since last update
+ virtual bs::error_code handle_new_gens(entries_t e) noexcept = 0;
+
+ /// Handle generations being marked empty
+ ///
+ /// @param new_tail Lowest non-empty generation
+ virtual bs::error_code handle_empty_to(uint64_t new_tail) noexcept = 0;
+};
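+
+// Illustrative sketch (not part of the upstream patch): a concrete backend is
+// expected to derive from logback_generations, implement the three handle_*()
+// callbacks, and be constructed through the init<> factory above. Assuming a
+// hypothetical derived class MyBackends whose constructor forwards to the
+// protected base constructor, usage would look roughly like:
+//
+//   auto lg = logback_generations::init<MyBackends>(
+//       dpp, ioctx, "some.generations.oid",
+//       [](uint64_t gen, int shard) {
+//         return fmt::format("some.log.{}.{}", gen, shard);
+//       },
+//       num_shards, log_type::fifo, y);
+//   if (!lg) { /* handle lg.error() */ }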
+
+inline std::string gencursor(uint64_t gen_id, std::string_view cursor) {
+ return (gen_id > 0 ?
+ fmt::format("G{:0>20}@{}", gen_id, cursor) :
+ std::string(cursor));
+}
+
+inline std::pair<uint64_t, std::string_view>
+cursorgen(std::string_view cursor_) {
+ if (cursor_.empty()) {
+ return { 0, "" };
+ }
+ std::string_view cursor = cursor_;
+ if (cursor[0] != 'G') {
+ return { 0, cursor };
+ }
+ cursor.remove_prefix(1);
+ auto gen_id = ceph::consume<uint64_t>(cursor);
+ if (!gen_id || cursor[0] != '@') {
+ return { 0, cursor_ };
+ }
+ cursor.remove_prefix(1);
+ return { *gen_id, cursor };
+}
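+
+// Illustrative examples (not part of the upstream patch): gencursor() and
+// cursorgen() round-trip a per-generation cursor. Generation 0 passes the
+// marker through unchanged; later generations get a zero-padded prefix:
+//
+//   gencursor(3, "marker123") == "G00000000000000000003@marker123"
+//   cursorgen("G00000000000000000003@marker123") == { 3, "marker123" }
+//   cursorgen("marker123") == { 0, "marker123" }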
+
+class LazyFIFO {
+ librados::IoCtx& ioctx;
+ std::string oid;
+ std::mutex m;
+ std::unique_ptr<rgw::cls::fifo::FIFO> fifo;
+
+ int lazy_init(const DoutPrefixProvider *dpp, optional_yield y) {
+ std::unique_lock l(m);
+ if (fifo) return 0;
+ auto r = rgw::cls::fifo::FIFO::create(dpp, ioctx, oid, &fifo, y);
+ if (r) {
+ fifo.reset();
+ }
+ return r;
+ }
+
+public:
+
+ LazyFIFO(librados::IoCtx& ioctx, std::string oid)
+ : ioctx(ioctx), oid(std::move(oid)) {}
+
+ int read_meta(const DoutPrefixProvider *dpp, optional_yield y) {
+ auto r = lazy_init(dpp, y);
+ if (r < 0) return r;
+ return fifo->read_meta(dpp, y);
+ }
+
+ int meta(const DoutPrefixProvider *dpp, rados::cls::fifo::info& info, optional_yield y) {
+ auto r = lazy_init(dpp, y);
+ if (r < 0) return r;
+ info = fifo->meta();
+ return 0;
+ }
+
+ int get_part_layout_info(const DoutPrefixProvider *dpp,
+ std::uint32_t& part_header_size,
+ std::uint32_t& part_entry_overhead,
+ optional_yield y) {
+ auto r = lazy_init(dpp, y);
+ if (r < 0) return r;
+ std::tie(part_header_size, part_entry_overhead)
+ = fifo->get_part_layout_info();
+ return 0;
+ }
+
+ int push(const DoutPrefixProvider *dpp,
+ const ceph::buffer::list& bl,
+ optional_yield y) {
+ auto r = lazy_init(dpp, y);
+ if (r < 0) return r;
+ return fifo->push(dpp, bl, y);
+ }
+
+ int push(const DoutPrefixProvider *dpp,
+ ceph::buffer::list& bl,
+ librados::AioCompletion* c,
+ optional_yield y) {
+ auto r = lazy_init(dpp, y);
+ if (r < 0) return r;
+ fifo->push(dpp, bl, c);
+ return 0;
+ }
+
+ int push(const DoutPrefixProvider *dpp,
+ const std::vector<ceph::buffer::list>& data_bufs,
+ optional_yield y) {
+ auto r = lazy_init(dpp, y);
+ if (r < 0) return r;
+ return fifo->push(dpp, data_bufs, y);
+ }
+
+ int push(const DoutPrefixProvider *dpp,
+ const std::vector<ceph::buffer::list>& data_bufs,
+ librados::AioCompletion* c,
+ optional_yield y) {
+ auto r = lazy_init(dpp, y);
+ if (r < 0) return r;
+ fifo->push(dpp, data_bufs, c);
+ return 0;
+ }
+
+ int list(const DoutPrefixProvider *dpp,
+ int max_entries, std::optional<std::string_view> markstr,
+ std::vector<rgw::cls::fifo::list_entry>* out,
+ bool* more, optional_yield y) {
+ auto r = lazy_init(dpp, y);
+ if (r < 0) return r;
+ return fifo->list(dpp, max_entries, markstr, out, more, y);
+ }
+
+ int list(const DoutPrefixProvider *dpp, int max_entries, std::optional<std::string_view> markstr,
+ std::vector<rgw::cls::fifo::list_entry>* out, bool* more,
+ librados::AioCompletion* c, optional_yield y) {
+ auto r = lazy_init(dpp, y);
+ if (r < 0) return r;
+ fifo->list(dpp, max_entries, markstr, out, more, c);
+ return 0;
+ }
+
+ int trim(const DoutPrefixProvider *dpp, std::string_view markstr, bool exclusive, optional_yield y) {
+ auto r = lazy_init(dpp, y);
+ if (r < 0) return r;
+ return fifo->trim(dpp, markstr, exclusive, y);
+ }
+
+ int trim(const DoutPrefixProvider *dpp, std::string_view markstr, bool exclusive, librados::AioCompletion* c,
+ optional_yield y) {
+ auto r = lazy_init(dpp, y);
+ if (r < 0) return r;
+ fifo->trim(dpp, markstr, exclusive, c);
+ return 0;
+ }
+
+ int get_part_info(const DoutPrefixProvider *dpp, int64_t part_num, rados::cls::fifo::part_header* header,
+ optional_yield y) {
+ auto r = lazy_init(dpp, y);
+ if (r < 0) return r;
+ return fifo->get_part_info(dpp, part_num, header, y);
+ }
+
+ int get_part_info(const DoutPrefixProvider *dpp, int64_t part_num, rados::cls::fifo::part_header* header,
+ librados::AioCompletion* c, optional_yield y) {
+ auto r = lazy_init(dpp, y);
+ if (r < 0) return r;
+ fifo->get_part_info(part_num, header, c);
+ return 0;
+ }
+
+ int get_head_info(const DoutPrefixProvider *dpp, fu2::unique_function<
+ void(int r, rados::cls::fifo::part_header&&)>&& f,
+ librados::AioCompletion* c,
+ optional_yield y) {
+ auto r = lazy_init(dpp, y);
+ if (r < 0) return r;
+ fifo->get_head_info(dpp, std::move(f), c);
+ return 0;
+ }
+};
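+
+// Illustrative sketch (not part of the upstream patch): LazyFIFO defers
+// FIFO::create() until the first operation on it, so constructing one never
+// touches RADOS. Assuming an open IoCtx and a hypothetical oid:
+//
+//   LazyFIFO log(ioctx, "example.fifo.0");   // no I/O here
+//   ceph::buffer::list bl;
+//   encode(some_entry, bl);
+//   int r = log.push(dpp, bl, y);            // lazy_init() + push happen here
+//   if (r < 0) { /* handle error */ }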
diff --git a/src/rgw/driver/rados/rgw_metadata.cc b/src/rgw/driver/rados/rgw_metadata.cc
new file mode 100644
index 000000000..e3e49316e
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_metadata.cc
@@ -0,0 +1,233 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "rgw_metadata.h"
+
+#include "rgw_zone.h"
+#include "rgw_mdlog.h"
+
+#include "services/svc_zone.h"
+#include "services/svc_cls.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+const std::string RGWMetadataLogHistory::oid = "meta.history";
+
+struct obj_version;
+
+void rgw_shard_name(const string& prefix, unsigned max_shards, const string& key, string& name, int *shard_id)
+{
+ uint32_t val = ceph_str_hash_linux(key.c_str(), key.size());
+ char buf[16];
+ if (shard_id) {
+ *shard_id = val % max_shards;
+ }
+ snprintf(buf, sizeof(buf), "%u", (unsigned)(val % max_shards));
+ name = prefix + buf;
+}
+
+void rgw_shard_name(const string& prefix, unsigned max_shards, const string& section, const string& key, string& name)
+{
+ uint32_t val = ceph_str_hash_linux(key.c_str(), key.size());
+ val ^= ceph_str_hash_linux(section.c_str(), section.size());
+ char buf[16];
+ snprintf(buf, sizeof(buf), "%u", (unsigned)(val % max_shards));
+ name = prefix + buf;
+}
+
+void rgw_shard_name(const string& prefix, unsigned shard_id, string& name)
+{
+ char buf[16];
+ snprintf(buf, sizeof(buf), "%u", shard_id);
+ name = prefix + buf;
+}
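+
+// Illustrative note (not part of the upstream patch): all three overloads
+// build the shard oid as "<prefix><shard-number>". The shard number is either
+// given explicitly or derived from ceph_str_hash_linux() of the key (XORed
+// with the section hash in the section+key overload) modulo max_shards; e.g.
+// a hypothetical prefix "mdlog." and a key hashing to shard 7 yield "mdlog.7".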
+
+int RGWMetadataLog::add_entry(const DoutPrefixProvider *dpp, const string& hash_key, const string& section, const string& key, bufferlist& bl) {
+ if (!svc.zone->need_to_log_metadata())
+ return 0;
+
+ string oid;
+ int shard_id;
+
+ rgw_shard_name(prefix, cct->_conf->rgw_md_log_max_shards, hash_key, oid, &shard_id);
+ mark_modified(shard_id);
+ real_time now = real_clock::now();
+ return svc.cls->timelog.add(dpp, oid, now, section, key, bl, null_yield);
+}
+
+int RGWMetadataLog::get_shard_id(const string& hash_key, int *shard_id)
+{
+ string oid;
+
+ rgw_shard_name(prefix, cct->_conf->rgw_md_log_max_shards, hash_key, oid, shard_id);
+ return 0;
+}
+
+int RGWMetadataLog::store_entries_in_shard(const DoutPrefixProvider *dpp, list<cls_log_entry>& entries, int shard_id, librados::AioCompletion *completion)
+{
+ string oid;
+
+ mark_modified(shard_id);
+ rgw_shard_name(prefix, shard_id, oid);
+ return svc.cls->timelog.add(dpp, oid, entries, completion, false, null_yield);
+}
+
+void RGWMetadataLog::init_list_entries(int shard_id, const real_time& from_time, const real_time& end_time,
+ const string& marker, void **handle)
+{
+ LogListCtx *ctx = new LogListCtx();
+
+ ctx->cur_shard = shard_id;
+ ctx->from_time = from_time;
+ ctx->end_time = end_time;
+ ctx->marker = marker;
+
+ get_shard_oid(ctx->cur_shard, ctx->cur_oid);
+
+ *handle = (void *)ctx;
+}
+
+void RGWMetadataLog::complete_list_entries(void *handle) {
+ LogListCtx *ctx = static_cast<LogListCtx *>(handle);
+ delete ctx;
+}
+
+int RGWMetadataLog::list_entries(const DoutPrefixProvider *dpp, void *handle,
+ int max_entries,
+ list<cls_log_entry>& entries,
+ string *last_marker,
+ bool *truncated) {
+ LogListCtx *ctx = static_cast<LogListCtx *>(handle);
+
+ if (!max_entries) {
+ *truncated = false;
+ return 0;
+ }
+
+ std::string next_marker;
+ int ret = svc.cls->timelog.list(dpp, ctx->cur_oid, ctx->from_time, ctx->end_time,
+ max_entries, entries, ctx->marker,
+ &next_marker, truncated, null_yield);
+ if ((ret < 0) && (ret != -ENOENT))
+ return ret;
+
+ ctx->marker = std::move(next_marker);
+ if (last_marker) {
+ *last_marker = ctx->marker;
+ }
+
+ if (ret == -ENOENT)
+ *truncated = false;
+
+ return 0;
+}
+
+int RGWMetadataLog::get_info(const DoutPrefixProvider *dpp, int shard_id, RGWMetadataLogInfo *info)
+{
+ string oid;
+ get_shard_oid(shard_id, oid);
+
+ cls_log_header header;
+
+ int ret = svc.cls->timelog.info(dpp, oid, &header, null_yield);
+ if ((ret < 0) && (ret != -ENOENT))
+ return ret;
+
+ info->marker = header.max_marker;
+ info->last_update = header.max_time.to_real_time();
+
+ return 0;
+}
+
+static void _mdlog_info_completion(librados::completion_t cb, void *arg)
+{
+ auto infoc = static_cast<RGWMetadataLogInfoCompletion *>(arg);
+ infoc->finish(cb);
+ infoc->put(); // drop the ref from get_info_async()
+}
+
+RGWMetadataLogInfoCompletion::RGWMetadataLogInfoCompletion(info_callback_t cb)
+ : completion(librados::Rados::aio_create_completion((void *)this,
+ _mdlog_info_completion)),
+ callback(cb)
+{
+}
+
+RGWMetadataLogInfoCompletion::~RGWMetadataLogInfoCompletion()
+{
+ completion->release();
+}
+
+int RGWMetadataLog::get_info_async(const DoutPrefixProvider *dpp, int shard_id, RGWMetadataLogInfoCompletion *completion)
+{
+ string oid;
+ get_shard_oid(shard_id, oid);
+
+ completion->get(); // hold a ref until the completion fires
+
+ return svc.cls->timelog.info_async(dpp, completion->get_io_obj(), oid,
+ &completion->get_header(),
+ completion->get_completion());
+}
+
+int RGWMetadataLog::trim(const DoutPrefixProvider *dpp, int shard_id, const real_time& from_time, const real_time& end_time,
+ const string& start_marker, const string& end_marker)
+{
+ string oid;
+ get_shard_oid(shard_id, oid);
+
+ return svc.cls->timelog.trim(dpp, oid, from_time, end_time, start_marker,
+ end_marker, nullptr, null_yield);
+}
+
+int RGWMetadataLog::lock_exclusive(const DoutPrefixProvider *dpp, int shard_id, timespan duration, string& zone_id, string& owner_id) {
+ string oid;
+ get_shard_oid(shard_id, oid);
+
+ return svc.cls->lock.lock_exclusive(dpp, svc.zone->get_zone_params().log_pool, oid, duration, zone_id, owner_id);
+}
+
+int RGWMetadataLog::unlock(const DoutPrefixProvider *dpp, int shard_id, string& zone_id, string& owner_id) {
+ string oid;
+ get_shard_oid(shard_id, oid);
+
+ return svc.cls->lock.unlock(dpp, svc.zone->get_zone_params().log_pool, oid, zone_id, owner_id);
+}
+
+void RGWMetadataLog::mark_modified(int shard_id)
+{
+ lock.get_read();
+ if (modified_shards.find(shard_id) != modified_shards.end()) {
+ lock.unlock();
+ return;
+ }
+ lock.unlock();
+
+ std::unique_lock wl{lock};
+ modified_shards.insert(shard_id);
+}
+
+void RGWMetadataLog::read_clear_modified(set<int> &modified)
+{
+ std::unique_lock wl{lock};
+ modified.swap(modified_shards);
+ modified_shards.clear();
+}
+
+void RGWMetadataLogInfo::dump(Formatter *f) const
+{
+ encode_json("marker", marker, f);
+ utime_t ut(last_update);
+ encode_json("last_update", ut, f);
+}
+
+void RGWMetadataLogInfo::decode_json(JSONObj *obj)
+{
+ JSONDecoder::decode_json("marker", marker, obj);
+ utime_t ut;
+ JSONDecoder::decode_json("last_update", ut, obj);
+ last_update = ut.to_real_time();
+}
+
diff --git a/src/rgw/driver/rados/rgw_metadata.h b/src/rgw/driver/rados/rgw_metadata.h
new file mode 100644
index 000000000..c83db7c40
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_metadata.h
@@ -0,0 +1,298 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include <string>
+#include <utility>
+#include <boost/optional.hpp>
+
+#include "include/types.h"
+#include "rgw_common.h"
+#include "rgw_period_history.h"
+#include "rgw_mdlog_types.h"
+#include "cls/version/cls_version_types.h"
+#include "cls/log/cls_log_types.h"
+#include "common/RefCountedObj.h"
+#include "common/ceph_time.h"
+#include "services/svc_meta_be.h"
+#include "rgw_sal_fwd.h"
+
+
+class RGWCoroutine;
+class JSONObj;
+struct RGWObjVersionTracker;
+
+struct obj_version;
+
+
+class RGWMetadataObject {
+protected:
+ obj_version objv;
+ ceph::real_time mtime;
+ std::map<std::string, bufferlist> *pattrs{nullptr};
+
+public:
+ RGWMetadataObject() {}
+ RGWMetadataObject(const obj_version& v,
+ real_time m) : objv(v), mtime(m) {}
+ virtual ~RGWMetadataObject() {}
+ obj_version& get_version();
+ real_time& get_mtime() { return mtime; }
+ void set_pattrs(std::map<std::string, bufferlist> *_pattrs) {
+ pattrs = _pattrs;
+ }
+ std::map<std::string, bufferlist> *get_pattrs() {
+ return pattrs;
+ }
+
+ virtual void dump(Formatter *f) const {}
+};
+
+class RGWMetadataManager;
+
+class RGWMetadataHandler {
+ friend class RGWMetadataManager;
+
+protected:
+ CephContext *cct;
+
+public:
+ RGWMetadataHandler() {}
+ virtual ~RGWMetadataHandler();
+ virtual std::string get_type() = 0;
+
+ void base_init(CephContext *_cct) {
+ cct = _cct;
+ }
+
+ virtual RGWMetadataObject *get_meta_obj(JSONObj *jo, const obj_version& objv, const ceph::real_time& mtime) = 0;
+
+ virtual int get(std::string& entry, RGWMetadataObject **obj, optional_yield, const DoutPrefixProvider *dpp) = 0;
+ virtual int put(std::string& entry,
+ RGWMetadataObject *obj,
+ RGWObjVersionTracker& objv_tracker,
+ optional_yield,
+ const DoutPrefixProvider *dpp,
+ RGWMDLogSyncType type,
+ bool from_remote_zone) = 0;
+ virtual int remove(std::string& entry, RGWObjVersionTracker& objv_tracker, optional_yield, const DoutPrefixProvider *dpp) = 0;
+
+ virtual int mutate(const std::string& entry,
+ const ceph::real_time& mtime,
+ RGWObjVersionTracker *objv_tracker,
+ optional_yield y,
+ const DoutPrefixProvider *dpp,
+ RGWMDLogStatus op_type,
+ std::function<int()> f) = 0;
+
+ virtual int list_keys_init(const DoutPrefixProvider *dpp, const std::string& marker, void **phandle) = 0;
+ virtual int list_keys_next(const DoutPrefixProvider *dpp, void *handle, int max, std::list<std::string>& keys, bool *truncated) = 0;
+ virtual void list_keys_complete(void *handle) = 0;
+
+ virtual std::string get_marker(void *handle) = 0;
+
+ virtual int get_shard_id(const std::string& entry, int *shard_id) {
+ *shard_id = 0;
+ return 0;
+ }
+ virtual int attach(RGWMetadataManager *manager);
+};
+
+class RGWMetadataHandler_GenericMetaBE : public RGWMetadataHandler {
+ friend class RGWSI_MetaBackend;
+ friend class RGWMetadataManager;
+ friend class Put;
+
+public:
+ class Put;
+
+protected:
+ RGWSI_MetaBackend_Handler *be_handler;
+
+ virtual int do_get(RGWSI_MetaBackend_Handler::Op *op, std::string& entry, RGWMetadataObject **obj, optional_yield y, const DoutPrefixProvider *dpp) = 0;
+ virtual int do_put(RGWSI_MetaBackend_Handler::Op *op, std::string& entry, RGWMetadataObject *obj,
+ RGWObjVersionTracker& objv_tracker, optional_yield y,
+ const DoutPrefixProvider *dpp, RGWMDLogSyncType type,
+ bool from_remote_zone) = 0;
+ virtual int do_put_operate(Put *put_op, const DoutPrefixProvider *dpp);
+ virtual int do_remove(RGWSI_MetaBackend_Handler::Op *op, std::string& entry, RGWObjVersionTracker& objv_tracker, optional_yield y, const DoutPrefixProvider *dpp) = 0;
+
+public:
+ RGWMetadataHandler_GenericMetaBE() {}
+
+ void base_init(CephContext *_cct,
+ RGWSI_MetaBackend_Handler *_be_handler) {
+ RGWMetadataHandler::base_init(_cct);
+ be_handler = _be_handler;
+ }
+
+ RGWSI_MetaBackend_Handler *get_be_handler() {
+ return be_handler;
+ }
+
+ class Put {
+ protected:
+ RGWMetadataHandler_GenericMetaBE *handler;
+ RGWSI_MetaBackend_Handler::Op *op;
+ std::string& entry;
+ RGWMetadataObject *obj;
+ RGWObjVersionTracker& objv_tracker;
+ RGWMDLogSyncType apply_type;
+ optional_yield y;
+ bool from_remote_zone{false};
+
+ int get(RGWMetadataObject **obj, const DoutPrefixProvider *dpp) {
+ return handler->do_get(op, entry, obj, y, dpp);
+ }
+ public:
+ Put(RGWMetadataHandler_GenericMetaBE *_handler, RGWSI_MetaBackend_Handler::Op *_op,
+ std::string& _entry, RGWMetadataObject *_obj,
+ RGWObjVersionTracker& _objv_tracker, optional_yield _y,
+ RGWMDLogSyncType _type, bool from_remote_zone);
+
+ virtual ~Put() {}
+
+ virtual int put_pre(const DoutPrefixProvider *dpp) {
+ return 0;
+ }
+ virtual int put(const DoutPrefixProvider *dpp) {
+ return 0;
+ }
+ virtual int put_post(const DoutPrefixProvider *dpp) {
+ return 0;
+ }
+ virtual int finalize() {
+ return 0;
+ }
+ };
+
+ int get(std::string& entry, RGWMetadataObject **obj, optional_yield, const DoutPrefixProvider *dpp) override;
+ int put(std::string& entry, RGWMetadataObject *obj, RGWObjVersionTracker& objv_tracker, optional_yield, const DoutPrefixProvider *dpp, RGWMDLogSyncType type, bool from_remote_zone) override;
+ int remove(std::string& entry, RGWObjVersionTracker& objv_tracker, optional_yield, const DoutPrefixProvider *dpp) override;
+
+ int mutate(const std::string& entry,
+ const ceph::real_time& mtime,
+ RGWObjVersionTracker *objv_tracker,
+ optional_yield y,
+ const DoutPrefixProvider *dpp,
+ RGWMDLogStatus op_type,
+ std::function<int()> f) override;
+
+ int get_shard_id(const std::string& entry, int *shard_id) override;
+
+ int list_keys_init(const DoutPrefixProvider *dpp, const std::string& marker, void **phandle) override;
+ int list_keys_next(const DoutPrefixProvider *dpp, void *handle, int max, std::list<std::string>& keys, bool *truncated) override;
+ void list_keys_complete(void *handle) override;
+
+ std::string get_marker(void *handle) override;
+
+ /**
+ * Compare an incoming versus on-disk tag/version+mtime combo against
+ * the sync mode to see if the new one should replace the on-disk one.
+ *
+ * @return true if the update should proceed, false otherwise.
+ */
+ static bool check_versions(bool exists,
+ const obj_version& ondisk, const real_time& ondisk_time,
+ const obj_version& incoming, const real_time& incoming_time,
+ RGWMDLogSyncType sync_mode) {
+ switch (sync_mode) {
+ case APPLY_UPDATES:
+ if ((ondisk.tag != incoming.tag) ||
+ (ondisk.ver >= incoming.ver))
+ return false;
+ break;
+ case APPLY_NEWER:
+ if (ondisk_time >= incoming_time)
+ return false;
+ break;
+ case APPLY_EXCLUSIVE:
+ if (exists)
+ return false;
+ break;
+ case APPLY_ALWAYS: //deliberate fall-thru -- we always apply!
+ default: break;
+ }
+ return true;
+ }
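+
+  // Illustrative examples (not part of the upstream patch): under
+  // APPLY_UPDATES an incoming update is rejected when the tags differ or the
+  // on-disk version is not strictly older (e.g. on-disk ver 5 vs incoming
+  // ver 5 is rejected); APPLY_NEWER only compares mtimes; APPLY_EXCLUSIVE
+  // rejects whenever the entry already exists; APPLY_ALWAYS always applies.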
+};
+
+class RGWMetadataTopHandler;
+
+class RGWMetadataManager {
+ friend class RGWMetadataHandler;
+
+ CephContext *cct;
+ RGWSI_Meta *meta_svc;
+ std::map<std::string, RGWMetadataHandler *> handlers;
+ std::unique_ptr<RGWMetadataTopHandler> md_top_handler;
+
+ int find_handler(const std::string& metadata_key, RGWMetadataHandler **handler, std::string& entry);
+ int register_handler(RGWMetadataHandler *handler);
+
+public:
+ RGWMetadataManager(RGWSI_Meta *_meta_svc);
+ ~RGWMetadataManager();
+
+ RGWMetadataHandler *get_handler(const std::string& type);
+
+ int get(std::string& metadata_key, Formatter *f, optional_yield y, const DoutPrefixProvider *dpp);
+ int put(std::string& metadata_key, bufferlist& bl, optional_yield y,
+ const DoutPrefixProvider *dpp,
+ RGWMDLogSyncType sync_mode,
+ bool from_remote_zone,
+ obj_version *existing_version = NULL);
+ int remove(std::string& metadata_key, optional_yield y, const DoutPrefixProvider *dpp);
+
+ int mutate(const std::string& metadata_key,
+ const ceph::real_time& mtime,
+ RGWObjVersionTracker *objv_tracker,
+ optional_yield y,
+ const DoutPrefixProvider *dpp,
+ RGWMDLogStatus op_type,
+ std::function<int()> f);
+
+ int list_keys_init(const DoutPrefixProvider *dpp, const std::string& section, void **phandle);
+ int list_keys_init(const DoutPrefixProvider *dpp, const std::string& section, const std::string& marker, void **phandle);
+ int list_keys_next(const DoutPrefixProvider *dpp, void *handle, int max, std::list<std::string>& keys, bool *truncated);
+ void list_keys_complete(void *handle);
+
+ std::string get_marker(void *handle);
+
+ void dump_log_entry(cls_log_entry& entry, Formatter *f);
+
+ void get_sections(std::list<std::string>& sections);
+
+ void parse_metadata_key(const std::string& metadata_key, std::string& type, std::string& entry);
+
+ int get_shard_id(const std::string& section, const std::string& key, int *shard_id);
+};
+
+class RGWMetadataHandlerPut_SObj : public RGWMetadataHandler_GenericMetaBE::Put
+{
+protected:
+ std::unique_ptr<RGWMetadataObject> oo;
+ RGWMetadataObject *old_obj{nullptr};
+ bool exists{false};
+
+public:
+ RGWMetadataHandlerPut_SObj(RGWMetadataHandler_GenericMetaBE *handler, RGWSI_MetaBackend_Handler::Op *op,
+ std::string& entry, RGWMetadataObject *obj, RGWObjVersionTracker& objv_tracker,
+ optional_yield y,
+ RGWMDLogSyncType type, bool from_remote_zone);
+ ~RGWMetadataHandlerPut_SObj();
+
+ int put_pre(const DoutPrefixProvider *dpp) override;
+ int put(const DoutPrefixProvider *dpp) override;
+ virtual int put_check(const DoutPrefixProvider *dpp) {
+ return 0;
+ }
+ virtual int put_checked(const DoutPrefixProvider *dpp);
+ virtual void encode_obj(bufferlist *bl) {}
+};
+
+void rgw_shard_name(const std::string& prefix, unsigned max_shards, const std::string& key, std::string& name, int *shard_id);
+void rgw_shard_name(const std::string& prefix, unsigned max_shards, const std::string& section, const std::string& key, std::string& name);
+void rgw_shard_name(const std::string& prefix, unsigned shard_id, std::string& name);
+
diff --git a/src/rgw/driver/rados/rgw_notify.cc b/src/rgw/driver/rados/rgw_notify.cc
new file mode 100644
index 000000000..b1835016e
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_notify.cc
@@ -0,0 +1,1023 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "rgw_notify.h"
+#include "cls/2pc_queue/cls_2pc_queue_client.h"
+#include "cls/lock/cls_lock_client.h"
+#include <memory>
+#include <boost/algorithm/hex.hpp>
+#include <boost/context/protected_fixedsize_stack.hpp>
+#include <spawn/spawn.hpp>
+#include "rgw_sal_rados.h"
+#include "rgw_pubsub.h"
+#include "rgw_pubsub_push.h"
+#include "rgw_perf_counters.h"
+#include "common/dout.h"
+#include <chrono>
+
+#define dout_subsys ceph_subsys_rgw
+
+namespace rgw::notify {
+
+struct event_entry_t {
+ rgw_pubsub_s3_event event;
+ std::string push_endpoint;
+ std::string push_endpoint_args;
+ std::string arn_topic;
+
+ void encode(bufferlist& bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(event, bl);
+ encode(push_endpoint, bl);
+ encode(push_endpoint_args, bl);
+ encode(arn_topic, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::const_iterator& bl) {
+ DECODE_START(1, bl);
+ decode(event, bl);
+ decode(push_endpoint, bl);
+ decode(push_endpoint_args, bl);
+ decode(arn_topic, bl);
+ DECODE_FINISH(bl);
+ }
+};
+WRITE_CLASS_ENCODER(event_entry_t)
+
+using queues_t = std::set<std::string>;
+
+// use mmap/mprotect to allocate 128k coroutine stacks
+auto make_stack_allocator() {
+ return boost::context::protected_fixedsize_stack{128*1024};
+}
+
+const std::string Q_LIST_OBJECT_NAME = "queues_list_object";
+
+class Manager : public DoutPrefixProvider {
+ const size_t max_queue_size;
+ const uint32_t queues_update_period_ms;
+ const uint32_t queues_update_retry_ms;
+ const uint32_t queue_idle_sleep_us;
+ const utime_t failover_time;
+ CephContext* const cct;
+ static constexpr auto COOKIE_LEN = 16;
+ const std::string lock_cookie;
+ boost::asio::io_context io_context;
+ boost::asio::executor_work_guard<boost::asio::io_context::executor_type> work_guard;
+ const uint32_t worker_count;
+ std::vector<std::thread> workers;
+ const uint32_t stale_reservations_period_s;
+ const uint32_t reservations_cleanup_period_s;
+public:
+ librados::IoCtx& rados_ioctx;
+private:
+
+ CephContext *get_cct() const override { return cct; }
+ unsigned get_subsys() const override { return dout_subsys; }
+ std::ostream& gen_prefix(std::ostream& out) const override { return out << "rgw notify: "; }
+
+ // read the list of queues from the queue list object
+ int read_queue_list(queues_t& queues, optional_yield y) {
+ constexpr auto max_chunk = 1024U;
+ std::string start_after;
+ bool more = true;
+ int rval;
+ while (more) {
+ librados::ObjectReadOperation op;
+ queues_t queues_chunk;
+ op.omap_get_keys2(start_after, max_chunk, &queues_chunk, &more, &rval);
+ const auto ret = rgw_rados_operate(this, rados_ioctx, Q_LIST_OBJECT_NAME, &op, nullptr, y);
+ if (ret == -ENOENT) {
+ // queue list object was not created - nothing to do
+ return 0;
+ }
+ if (ret < 0) {
+ // TODO: do we need to check on rval as well as ret?
+ ldpp_dout(this, 1) << "ERROR: failed to read queue list. error: " << ret << dendl;
+ return ret;
+ }
+ queues.merge(queues_chunk);
+ }
+ return 0;
+ }
+
+ // set m1 to be the minimum between m1 and m2
+ static int set_min_marker(std::string& m1, const std::string m2) {
+ cls_queue_marker mr1;
+ cls_queue_marker mr2;
+ if (mr1.from_str(m1.c_str()) < 0 || mr2.from_str(m2.c_str()) < 0) {
+ return -EINVAL;
+ }
+ if (mr2.gen <= mr1.gen && mr2.offset < mr1.offset) {
+ m1 = m2;
+ }
+ return 0;
+ }
+
+ using Clock = ceph::coarse_mono_clock;
+ using Executor = boost::asio::io_context::executor_type;
+ using Timer = boost::asio::basic_waitable_timer<Clock,
+ boost::asio::wait_traits<Clock>, Executor>;
+
+ class tokens_waiter {
+ const std::chrono::hours infinite_duration;
+ size_t pending_tokens;
+ Timer timer;
+
+ struct token {
+ tokens_waiter& waiter;
+ token(tokens_waiter& _waiter) : waiter(_waiter) {
+ ++waiter.pending_tokens;
+ }
+
+ ~token() {
+ --waiter.pending_tokens;
+ if (waiter.pending_tokens == 0) {
+ waiter.timer.cancel();
+ }
+ }
+ };
+
+ public:
+
+ tokens_waiter(boost::asio::io_context& io_context) :
+ infinite_duration(1000),
+ pending_tokens(0),
+ timer(io_context) {}
+
+ void async_wait(yield_context yield) {
+ if (pending_tokens == 0) {
+ return;
+ }
+ timer.expires_from_now(infinite_duration);
+ boost::system::error_code ec;
+ timer.async_wait(yield[ec]);
+ ceph_assert(ec == boost::system::errc::operation_canceled);
+ }
+
+ token make_token() {
+ return token(*this);
+ }
+ };
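+
+  // Illustrative note (not part of the upstream patch): tokens_waiter acts as
+  // a join point for the per-entry coroutines spawned below. Each coroutine
+  // holds a token for its lifetime; async_wait() suspends on an effectively
+  // infinite timer, and the destructor of the last outstanding token cancels
+  // that timer, waking the waiter once all entries have been handled.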
+
+ // processing of a specific entry
+  // return whether processing was successful (true) or not (false)
+ bool process_entry(const cls_queue_entry& entry, yield_context yield) {
+ event_entry_t event_entry;
+ auto iter = entry.data.cbegin();
+ try {
+ decode(event_entry, iter);
+ } catch (buffer::error& err) {
+ ldpp_dout(this, 5) << "WARNING: failed to decode entry. error: " << err.what() << dendl;
+ return false;
+ }
+ try {
+ // TODO move endpoint creation to queue level
+ const auto push_endpoint = RGWPubSubEndpoint::create(event_entry.push_endpoint, event_entry.arn_topic,
+ RGWHTTPArgs(event_entry.push_endpoint_args, this),
+ cct);
+ ldpp_dout(this, 20) << "INFO: push endpoint created: " << event_entry.push_endpoint <<
+ " for entry: " << entry.marker << dendl;
+ const auto ret = push_endpoint->send_to_completion_async(cct, event_entry.event, optional_yield(io_context, yield));
+ if (ret < 0) {
+ ldpp_dout(this, 5) << "WARNING: push entry: " << entry.marker << " to endpoint: " << event_entry.push_endpoint
+ << " failed. error: " << ret << " (will retry)" << dendl;
+ return false;
+ } else {
+ ldpp_dout(this, 20) << "INFO: push entry: " << entry.marker << " to endpoint: " << event_entry.push_endpoint
+ << " ok" << dendl;
+ if (perfcounter) perfcounter->inc(l_rgw_pubsub_push_ok);
+ return true;
+ }
+ } catch (const RGWPubSubEndpoint::configuration_error& e) {
+ ldpp_dout(this, 5) << "WARNING: failed to create push endpoint: "
+ << event_entry.push_endpoint << " for entry: " << entry.marker << ". error: " << e.what() << " (will retry) " << dendl;
+ return false;
+ }
+ }
+
+ // clean stale reservation from queue
+ void cleanup_queue(const std::string& queue_name, yield_context yield) {
+ while (true) {
+ ldpp_dout(this, 20) << "INFO: trying to perform stale reservation cleanup for queue: " << queue_name << dendl;
+ const auto now = ceph::coarse_real_time::clock::now();
+ const auto stale_time = now - std::chrono::seconds(stale_reservations_period_s);
+ librados::ObjectWriteOperation op;
+ op.assert_exists();
+ rados::cls::lock::assert_locked(&op, queue_name+"_lock",
+ ClsLockType::EXCLUSIVE,
+ lock_cookie,
+ "" /*no tag*/);
+ cls_2pc_queue_expire_reservations(op, stale_time);
+ // check ownership and do reservation cleanup in one batch
+ auto ret = rgw_rados_operate(this, rados_ioctx, queue_name, &op, optional_yield(io_context, yield));
+ if (ret == -ENOENT) {
+ // queue was deleted
+ ldpp_dout(this, 5) << "INFO: queue: "
+ << queue_name << ". was removed. cleanup will stop" << dendl;
+ return;
+ }
+ if (ret == -EBUSY) {
+ ldpp_dout(this, 5) << "WARNING: queue: " << queue_name << " ownership moved to another daemon. processing will stop" << dendl;
+ return;
+ }
+ if (ret < 0) {
+ ldpp_dout(this, 5) << "WARNING: failed to cleanup stale reservation from queue and/or lock queue: " << queue_name
+ << ". error: " << ret << dendl;
+ }
+ Timer timer(io_context);
+ timer.expires_from_now(std::chrono::seconds(reservations_cleanup_period_s));
+ boost::system::error_code ec;
+ timer.async_wait(yield[ec]);
+ }
+ }
+
+ // processing of a specific queue
+ void process_queue(const std::string& queue_name, yield_context yield) {
+ constexpr auto max_elements = 1024;
+ auto is_idle = false;
+ const std::string start_marker;
+
+    // start the cleanup coroutine for the queue
+ spawn::spawn(io_context, [this, queue_name](yield_context yield) {
+ cleanup_queue(queue_name, yield);
+ }, make_stack_allocator());
+
+ while (true) {
+ // if queue was empty the last time, sleep for idle timeout
+ if (is_idle) {
+ Timer timer(io_context);
+ timer.expires_from_now(std::chrono::microseconds(queue_idle_sleep_us));
+ boost::system::error_code ec;
+ timer.async_wait(yield[ec]);
+ }
+
+ // get list of entries in the queue
+ is_idle = true;
+ bool truncated = false;
+ std::string end_marker;
+ std::vector<cls_queue_entry> entries;
+ auto total_entries = 0U;
+ {
+ librados::ObjectReadOperation op;
+ op.assert_exists();
+ bufferlist obl;
+ int rval;
+ rados::cls::lock::assert_locked(&op, queue_name+"_lock",
+ ClsLockType::EXCLUSIVE,
+ lock_cookie,
+ "" /*no tag*/);
+ cls_2pc_queue_list_entries(op, start_marker, max_elements, &obl, &rval);
+ // check ownership and list entries in one batch
+ auto ret = rgw_rados_operate(this, rados_ioctx, queue_name, &op, nullptr, optional_yield(io_context, yield));
+ if (ret == -ENOENT) {
+ // queue was deleted
+ ldpp_dout(this, 5) << "INFO: queue: "
+ << queue_name << ". was removed. processing will stop" << dendl;
+ return;
+ }
+ if (ret == -EBUSY) {
+ ldpp_dout(this, 5) << "WARNING: queue: " << queue_name << " ownership moved to another daemon. processing will stop" << dendl;
+ return;
+ }
+ if (ret < 0) {
+ ldpp_dout(this, 5) << "WARNING: failed to get list of entries in queue and/or lock queue: "
+ << queue_name << ". error: " << ret << " (will retry)" << dendl;
+ continue;
+ }
+ ret = cls_2pc_queue_list_entries_result(obl, entries, &truncated, end_marker);
+ if (ret < 0) {
+ ldpp_dout(this, 5) << "WARNING: failed to parse list of entries in queue: "
+ << queue_name << ". error: " << ret << " (will retry)" << dendl;
+ continue;
+ }
+ }
+ total_entries = entries.size();
+ if (total_entries == 0) {
+ // nothing in the queue
+ continue;
+ }
+ // log when queue is not idle
+ ldpp_dout(this, 20) << "INFO: found: " << total_entries << " entries in: " << queue_name <<
+ ". end marker is: " << end_marker << dendl;
+
+ is_idle = false;
+ auto has_error = false;
+ auto remove_entries = false;
+ auto entry_idx = 1U;
+ tokens_waiter waiter(io_context);
+ for (auto& entry : entries) {
+ if (has_error) {
+ // bail out on first error
+ break;
+ }
+ // TODO pass entry pointer instead of by-value
+ spawn::spawn(yield, [this, &queue_name, entry_idx, total_entries, &end_marker, &remove_entries, &has_error, &waiter, entry](yield_context yield) {
+ const auto token = waiter.make_token();
+ if (process_entry(entry, yield)) {
+ ldpp_dout(this, 20) << "INFO: processing of entry: " <<
+ entry.marker << " (" << entry_idx << "/" << total_entries << ") from: " << queue_name << " ok" << dendl;
+ remove_entries = true;
+ } else {
+ if (set_min_marker(end_marker, entry.marker) < 0) {
+            ldpp_dout(this, 1) << "ERROR: cannot determine minimum between malformed markers: " << end_marker << ", " << entry.marker << dendl;
+ } else {
+ ldpp_dout(this, 20) << "INFO: new end marker for removal: " << end_marker << " from: " << queue_name << dendl;
+ }
+ has_error = true;
+ ldpp_dout(this, 20) << "INFO: processing of entry: " <<
+ entry.marker << " (" << entry_idx << "/" << total_entries << ") from: " << queue_name << " failed" << dendl;
+ }
+ }, make_stack_allocator());
+ ++entry_idx;
+ }
+
+ // wait for all pending work to finish
+ waiter.async_wait(yield);
+
+ // delete all published entries from queue
+ if (remove_entries) {
+ librados::ObjectWriteOperation op;
+ op.assert_exists();
+ rados::cls::lock::assert_locked(&op, queue_name+"_lock",
+ ClsLockType::EXCLUSIVE,
+ lock_cookie,
+ "" /*no tag*/);
+ cls_2pc_queue_remove_entries(op, end_marker);
+        // check ownership and delete entries in one batch
+ const auto ret = rgw_rados_operate(this, rados_ioctx, queue_name, &op, optional_yield(io_context, yield));
+ if (ret == -ENOENT) {
+ // queue was deleted
+ ldpp_dout(this, 5) << "INFO: queue: "
+ << queue_name << ". was removed. processing will stop" << dendl;
+ return;
+ }
+ if (ret == -EBUSY) {
+ ldpp_dout(this, 5) << "WARNING: queue: " << queue_name << " ownership moved to another daemon. processing will stop" << dendl;
+ return;
+ }
+ if (ret < 0) {
+ ldpp_dout(this, 1) << "ERROR: failed to remove entries and/or lock queue up to: " << end_marker << " from queue: "
+ << queue_name << ". error: " << ret << dendl;
+ } else {
+ ldpp_dout(this, 20) << "INFO: removed entries up to: " << end_marker << " from queue: "
+ << queue_name << dendl;
+ }
+ }
+ }
+ }
+
+  // list of owned queues
+ using owned_queues_t = std::unordered_set<std::string>;
+
+ // process all queues
+  // find which of the queues are owned by this daemon and process them
+ void process_queues(yield_context yield) {
+ auto has_error = false;
+ owned_queues_t owned_queues;
+
+    // add randomness to the duration between queue checks
+ // to make sure that different daemons are not synced
+ std::random_device seed;
+ std::mt19937 rnd_gen(seed());
+ const auto min_jitter = 100; // ms
+ const auto max_jitter = 500; // ms
+ std::uniform_int_distribution<> duration_jitter(min_jitter, max_jitter);
+
+ std::vector<std::string> queue_gc;
+ std::mutex queue_gc_lock;
+ while (true) {
+ Timer timer(io_context);
+ const auto duration = (has_error ?
+ std::chrono::milliseconds(queues_update_retry_ms) : std::chrono::milliseconds(queues_update_period_ms)) +
+ std::chrono::milliseconds(duration_jitter(rnd_gen));
+ timer.expires_from_now(duration);
+ const auto tp = ceph::coarse_real_time::clock::to_time_t(ceph::coarse_real_time::clock::now() + duration);
+ ldpp_dout(this, 20) << "INFO: next queues processing will happen at: " << std::ctime(&tp) << dendl;
+ boost::system::error_code ec;
+ timer.async_wait(yield[ec]);
+
+ queues_t queues;
+ auto ret = read_queue_list(queues, optional_yield(io_context, yield));
+ if (ret < 0) {
+ has_error = true;
+ continue;
+ }
+
+ for (const auto& queue_name : queues) {
+ // try to lock the queue to check if it is owned by this rgw
+        // or if ownership needs to be taken
+ librados::ObjectWriteOperation op;
+ op.assert_exists();
+ rados::cls::lock::lock(&op, queue_name+"_lock",
+ ClsLockType::EXCLUSIVE,
+ lock_cookie,
+ "" /*no tag*/,
+ "" /*no description*/,
+ failover_time,
+ LOCK_FLAG_MAY_RENEW);
+
+ ret = rgw_rados_operate(this, rados_ioctx, queue_name, &op, optional_yield(io_context, yield));
+ if (ret == -EBUSY) {
+ // lock is already taken by another RGW
+ ldpp_dout(this, 20) << "INFO: queue: " << queue_name << " owned (locked) by another daemon" << dendl;
+ // if queue was owned by this RGW, processing should be stopped, queue would be deleted from list afterwards
+ continue;
+ }
+ if (ret == -ENOENT) {
+ // queue is deleted - processing will stop the next time we try to read from the queue
+ ldpp_dout(this, 10) << "INFO: queue: " << queue_name << " should not be locked - already deleted" << dendl;
+ continue;
+ }
+ if (ret < 0) {
+ // failed to lock for another reason, continue to process other queues
+ ldpp_dout(this, 1) << "ERROR: failed to lock queue: " << queue_name << ". error: " << ret << dendl;
+ has_error = true;
+ continue;
+ }
+ // add queue to list of owned queues
+ if (owned_queues.insert(queue_name).second) {
+ ldpp_dout(this, 10) << "INFO: queue: " << queue_name << " now owned (locked) by this daemon" << dendl;
+ // start processing this queue
+ spawn::spawn(io_context, [this, &queue_gc, &queue_gc_lock, queue_name](yield_context yield) {
+ process_queue(queue_name, yield);
+            // if queue processing ended, it means that the queue was removed or is no longer owned
+ // mark it for deletion
+ std::lock_guard lock_guard(queue_gc_lock);
+ queue_gc.push_back(queue_name);
+ ldpp_dout(this, 10) << "INFO: queue: " << queue_name << " marked for removal" << dendl;
+ }, make_stack_allocator());
+ } else {
+ ldpp_dout(this, 20) << "INFO: queue: " << queue_name << " ownership (lock) renewed" << dendl;
+ }
+ }
+      // erase all queues that were deleted
+ {
+ std::lock_guard lock_guard(queue_gc_lock);
+ std::for_each(queue_gc.begin(), queue_gc.end(), [this, &owned_queues](const std::string& queue_name) {
+ owned_queues.erase(queue_name);
+ ldpp_dout(this, 20) << "INFO: queue: " << queue_name << " removed" << dendl;
+ });
+ queue_gc.clear();
+ }
+ }
+ }
+
+public:
+
+ ~Manager() {
+ work_guard.reset();
+ io_context.stop();
+ std::for_each(workers.begin(), workers.end(), [] (auto& worker) { worker.join(); });
+ }
+
+ // ctor: start all threads
+ Manager(CephContext* _cct, uint32_t _max_queue_size, uint32_t _queues_update_period_ms,
+ uint32_t _queues_update_retry_ms, uint32_t _queue_idle_sleep_us, u_int32_t failover_time_ms,
+ uint32_t _stale_reservations_period_s, uint32_t _reservations_cleanup_period_s,
+ uint32_t _worker_count, rgw::sal::RadosStore* store) :
+ max_queue_size(_max_queue_size),
+ queues_update_period_ms(_queues_update_period_ms),
+ queues_update_retry_ms(_queues_update_retry_ms),
+ queue_idle_sleep_us(_queue_idle_sleep_us),
+ failover_time(std::chrono::milliseconds(failover_time_ms)),
+ cct(_cct),
+ lock_cookie(gen_rand_alphanumeric(cct, COOKIE_LEN)),
+ work_guard(boost::asio::make_work_guard(io_context)),
+ worker_count(_worker_count),
+ stale_reservations_period_s(_stale_reservations_period_s),
+ reservations_cleanup_period_s(_reservations_cleanup_period_s),
+ rados_ioctx(store->getRados()->get_notif_pool_ctx())
+ {
+ spawn::spawn(io_context, [this] (yield_context yield) {
+ process_queues(yield);
+ }, make_stack_allocator());
+
+ // start the worker threads to do the actual queue processing
+ const std::string WORKER_THREAD_NAME = "notif-worker";
+ for (auto worker_id = 0U; worker_id < worker_count; ++worker_id) {
+ workers.emplace_back([this]() {
+ try {
+ io_context.run();
+ } catch (const std::exception& err) {
+ ldpp_dout(this, 10) << "Notification worker failed with error: " << err.what() << dendl;
+ throw(err);
+ }
+ });
+ const auto rc = ceph_pthread_setname(workers.back().native_handle(),
+ (WORKER_THREAD_NAME+std::to_string(worker_id)).c_str());
+ ceph_assert(rc == 0);
+ }
+ ldpp_dout(this, 10) << "Started notification manager with: " << worker_count << " workers" << dendl;
+ }
+
+ int add_persistent_topic(const std::string& topic_name, optional_yield y) {
+ if (topic_name == Q_LIST_OBJECT_NAME) {
+ ldpp_dout(this, 1) << "ERROR: topic name cannot be: " << Q_LIST_OBJECT_NAME << " (conflict with queue list object name)" << dendl;
+ return -EINVAL;
+ }
+ librados::ObjectWriteOperation op;
+ op.create(true);
+ cls_2pc_queue_init(op, topic_name, max_queue_size);
+ auto ret = rgw_rados_operate(this, rados_ioctx, topic_name, &op, y);
+ if (ret == -EEXIST) {
+ // queue already exists - nothing to do
+ ldpp_dout(this, 20) << "INFO: queue for topic: " << topic_name << " already exists. nothing to do" << dendl;
+ return 0;
+ }
+ if (ret < 0) {
+ // failed to create queue
+ ldpp_dout(this, 1) << "ERROR: failed to create queue for topic: " << topic_name << ". error: " << ret << dendl;
+ return ret;
+ }
+
+ bufferlist empty_bl;
+ std::map<std::string, bufferlist> new_topic{{topic_name, empty_bl}};
+ op.omap_set(new_topic);
+ ret = rgw_rados_operate(this, rados_ioctx, Q_LIST_OBJECT_NAME, &op, y);
+ if (ret < 0) {
+ ldpp_dout(this, 1) << "ERROR: failed to add queue: " << topic_name << " to queue list. error: " << ret << dendl;
+ return ret;
+ }
+ ldpp_dout(this, 20) << "INFO: queue: " << topic_name << " added to queue list" << dendl;
+ return 0;
+ }
+};
+
+// singleton manager
+// note that the manager itself is not a singleton, and multiple instances may co-exist
+// TODO make the pointer atomic in allocation and deallocation to avoid race conditions
+static Manager* s_manager = nullptr;
+
+constexpr size_t MAX_QUEUE_SIZE = 128*1000*1000; // 128MB
+constexpr uint32_t Q_LIST_UPDATE_MSEC = 1000*30;     // check queue list every 30 seconds
+constexpr uint32_t Q_LIST_RETRY_MSEC = 1000; // retry every second if queue list update failed
+constexpr uint32_t IDLE_TIMEOUT_USEC = 100*1000; // idle sleep 100ms
+constexpr uint32_t FAILOVER_TIME_MSEC = 3*Q_LIST_UPDATE_MSEC; // FAILOVER TIME 3x renew time
+constexpr uint32_t WORKER_COUNT = 1; // 1 worker thread
+constexpr uint32_t STALE_RESERVATIONS_PERIOD_S = 120; // cleanup reservations that are more than 2 minutes old
+constexpr uint32_t RESERVATIONS_CLEANUP_PERIOD_S = 30; // reservation cleanup every 30 seconds
+
+bool init(CephContext* cct, rgw::sal::RadosStore* store, const DoutPrefixProvider *dpp) {
+ if (s_manager) {
+ return false;
+ }
+ // TODO: take conf from CephContext
+ s_manager = new Manager(cct, MAX_QUEUE_SIZE,
+ Q_LIST_UPDATE_MSEC, Q_LIST_RETRY_MSEC,
+ IDLE_TIMEOUT_USEC, FAILOVER_TIME_MSEC,
+ STALE_RESERVATIONS_PERIOD_S, RESERVATIONS_CLEANUP_PERIOD_S,
+ WORKER_COUNT,
+ store);
+ return true;
+}
+
+void shutdown() {
+ delete s_manager;
+ s_manager = nullptr;
+}
+
+int add_persistent_topic(const std::string& topic_name, optional_yield y) {
+ if (!s_manager) {
+ return -EAGAIN;
+ }
+ return s_manager->add_persistent_topic(topic_name, y);
+}
+
+int remove_persistent_topic(const DoutPrefixProvider* dpp, librados::IoCtx& rados_ioctx, const std::string& topic_name, optional_yield y) {
+ librados::ObjectWriteOperation op;
+ op.remove();
+ auto ret = rgw_rados_operate(dpp, rados_ioctx, topic_name, &op, y);
+ if (ret == -ENOENT) {
+ // queue already removed - nothing to do
+ ldpp_dout(dpp, 20) << "INFO: queue for topic: " << topic_name << " already removed. nothing to do" << dendl;
+ return 0;
+ }
+ if (ret < 0) {
+ // failed to remove queue
+ ldpp_dout(dpp, 1) << "ERROR: failed to remove queue for topic: " << topic_name << ". error: " << ret << dendl;
+ return ret;
+ }
+
+ std::set<std::string> topic_to_remove{{topic_name}};
+ op.omap_rm_keys(topic_to_remove);
+ ret = rgw_rados_operate(dpp, rados_ioctx, Q_LIST_OBJECT_NAME, &op, y);
+ if (ret < 0) {
+ ldpp_dout(dpp, 1) << "ERROR: failed to remove queue: " << topic_name << " from queue list. error: " << ret << dendl;
+ return ret;
+ }
+ ldpp_dout(dpp, 20) << "INFO: queue: " << topic_name << " removed from queue list" << dendl;
+ return 0;
+}
+
+int remove_persistent_topic(const std::string& topic_name, optional_yield y) {
+ if (!s_manager) {
+ return -EAGAIN;
+ }
+ return remove_persistent_topic(s_manager, s_manager->rados_ioctx, topic_name, y);
+}
+
+rgw::sal::Object* get_object_with_atttributes(
+ const reservation_t& res, rgw::sal::Object* obj) {
+ // in case of copy obj, the tags and metadata are taken from source
+ const auto src_obj = res.src_object ? res.src_object : obj;
+ if (src_obj->get_attrs().empty()) {
+ if (!src_obj->get_bucket()) {
+ src_obj->set_bucket(res.bucket);
+ }
+ const auto ret = src_obj->get_obj_attrs(res.yield, res.dpp);
+ if (ret < 0) {
+ ldpp_dout(res.dpp, 20) << "failed to get attributes from object: " <<
+ src_obj->get_key() << ". ret = " << ret << dendl;
+ return nullptr;
+ }
+ }
+ return src_obj;
+}
+
+static inline void filter_amz_meta(meta_map_t& dest, const meta_map_t& src) {
+ std::copy_if(src.cbegin(), src.cend(),
+ std::inserter(dest, dest.end()),
+ [](const auto& m) {
+ return (boost::algorithm::starts_with(m.first, RGW_AMZ_META_PREFIX));
+ });
+}
+
+
+static inline void metadata_from_attributes(
+ reservation_t& res, rgw::sal::Object* obj) {
+ auto& metadata = res.x_meta_map;
+ const auto src_obj = get_object_with_atttributes(res, obj);
+ if (!src_obj) {
+ return;
+ }
+ res.metadata_fetched_from_attributes = true;
+ for (auto& attr : src_obj->get_attrs()) {
+ if (boost::algorithm::starts_with(attr.first, RGW_ATTR_META_PREFIX)) {
+ std::string_view key(attr.first);
+ key.remove_prefix(sizeof(RGW_ATTR_PREFIX)-1);
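+      // e.g. "user.rgw.x-amz-meta-color" becomes "x-amz-meta-color"
+      // (assuming the usual "user.rgw." value of RGW_ATTR_PREFIX)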
+ // we want to pass a null terminated version
+ // of the bufferlist, hence "to_str().c_str()"
+ metadata.emplace(key, attr.second.to_str().c_str());
+ }
+ }
+}
+
+static inline void tags_from_attributes(
+ const reservation_t& res, rgw::sal::Object* obj, KeyMultiValueMap& tags) {
+ const auto src_obj = get_object_with_atttributes(res, obj);
+ if (!src_obj) {
+ return;
+ }
+ const auto& attrs = src_obj->get_attrs();
+ const auto attr_iter = attrs.find(RGW_ATTR_TAGS);
+ if (attr_iter != attrs.end()) {
+ auto bliter = attr_iter->second.cbegin();
+ RGWObjTags obj_tags;
+ try {
+ ::decode(obj_tags, bliter);
+ } catch(buffer::error&) {
+ // not able to decode tags
+ return;
+ }
+ tags = std::move(obj_tags.get_tags());
+ }
+}
+
+// populate event from request
+static inline void populate_event(reservation_t& res,
+ rgw::sal::Object* obj,
+ uint64_t size,
+ const ceph::real_time& mtime,
+ const std::string& etag,
+ const std::string& version,
+ EventType event_type,
+ rgw_pubsub_s3_event& event) {
+ event.eventTime = mtime;
+ event.eventName = to_event_string(event_type);
+ event.userIdentity = res.user_id; // user that triggered the change
+ event.x_amz_request_id = res.req_id; // request ID of the original change
+ event.x_amz_id_2 = res.store->getRados()->host_id; // RGW on which the change was made
+ // configurationId is filled from notification configuration
+ event.bucket_name = res.bucket->get_name();
+ event.bucket_ownerIdentity = res.bucket->get_owner() ?
+ res.bucket->get_owner()->get_id().id : res.bucket->get_info().owner.id;
+ const auto region = res.store->get_zone()->get_zonegroup().get_api_name();
+ rgw::ARN bucket_arn(res.bucket->get_key());
+ bucket_arn.region = region;
+ event.bucket_arn = to_string(bucket_arn);
+ event.object_key = res.object_name ? *res.object_name : obj->get_name();
+ event.object_size = size;
+ event.object_etag = etag;
+ event.object_versionId = version;
+ event.awsRegion = region;
+  // use the timestamp as a per-key sequencer id (hex encoded)
+ const utime_t ts(real_clock::now());
+ boost::algorithm::hex((const char*)&ts, (const char*)&ts + sizeof(utime_t),
+ std::back_inserter(event.object_sequencer));
+ set_event_id(event.id, etag, ts);
+ event.bucket_id = res.bucket->get_bucket_id();
+ // pass meta data
+ if (!res.metadata_fetched_from_attributes) {
+ // either no metadata exist or no metadata filter was used
+ metadata_from_attributes(res, obj);
+ }
+ event.x_meta_map = res.x_meta_map;
+ // pass tags
+ if (!res.tagset ||
+ (*res.tagset).get_tags().empty()) {
+ // try to fetch the tags from the attributes
+ tags_from_attributes(res, obj, event.tags);
+ } else {
+ event.tags = (*res.tagset).get_tags();
+ }
+ // opaque data will be filled from topic configuration
+}
+
+static inline bool notification_match(reservation_t& res,
+ const rgw_pubsub_topic_filter& filter,
+ EventType event,
+ const RGWObjTags* req_tags) {
+ if (!match(filter.events, event)) {
+ return false;
+ }
+ const auto obj = res.object;
+ if (!match(filter.s3_filter.key_filter,
+ res.object_name ? *res.object_name : obj->get_name())) {
+ return false;
+ }
+
+ if (!filter.s3_filter.metadata_filter.kv.empty()) {
+ // metadata filter exists
+ if (res.s) {
+ filter_amz_meta(res.x_meta_map, res.s->info.x_meta_map);
+ }
+ metadata_from_attributes(res, obj);
+ if (!match(filter.s3_filter.metadata_filter, res.x_meta_map)) {
+ return false;
+ }
+ }
+
+ if (!filter.s3_filter.tag_filter.kv.empty()) {
+ // tag filter exists
+ if (req_tags) {
+ // tags in the request
+ if (!match(filter.s3_filter.tag_filter, req_tags->get_tags())) {
+ return false;
+ }
+ } else if (res.tagset && !(*res.tagset).get_tags().empty()) {
+ // tags were cached in req_state
+ if (!match(filter.s3_filter.tag_filter, (*res.tagset).get_tags())) {
+ return false;
+ }
+ } else {
+ // try to fetch tags from the attributes
+ KeyMultiValueMap tags;
+ tags_from_attributes(res, obj, tags);
+ if (!match(filter.s3_filter.tag_filter, tags)) {
+ return false;
+ }
+ }
+ }
+
+ return true;
+}
+
+int publish_reserve(const DoutPrefixProvider* dpp,
+                    EventType event_type,
+                    reservation_t& res,
+                    const RGWObjTags* req_tags)
+{
+ const RGWPubSub ps(res.store, res.user_tenant);
+ const RGWPubSub::Bucket ps_bucket(ps, res.bucket);
+ rgw_pubsub_bucket_topics bucket_topics;
+ auto rc = ps_bucket.get_topics(res.dpp, bucket_topics, res.yield);
+ if (rc < 0) {
+ // failed to fetch bucket topics
+ return rc;
+ }
+ for (const auto& bucket_topic : bucket_topics.topics) {
+ const rgw_pubsub_topic_filter& topic_filter = bucket_topic.second;
+ const rgw_pubsub_topic& topic_cfg = topic_filter.topic;
+ if (!notification_match(res, topic_filter, event_type, req_tags)) {
+ // notification does not apply to req_state
+ continue;
+ }
+ ldpp_dout(res.dpp, 20) << "INFO: notification: '" << topic_filter.s3_id <<
+ "' on topic: '" << topic_cfg.dest.arn_topic <<
+ "' and bucket: '" << res.bucket->get_name() <<
+ "' (unique topic: '" << topic_cfg.name <<
+ "') apply to event of type: '" << to_string(event_type) << "'" << dendl;
+
+ cls_2pc_reservation::id_t res_id;
+ if (topic_cfg.dest.persistent) {
+ // TODO: take default reservation size from conf
+ constexpr auto DEFAULT_RESERVATION = 4*1024U; // 4K
+ res.size = DEFAULT_RESERVATION;
+ librados::ObjectWriteOperation op;
+ bufferlist obl;
+ int rval;
+ const auto& queue_name = topic_cfg.dest.arn_topic;
+ cls_2pc_queue_reserve(op, res.size, 1, &obl, &rval);
+ auto ret = rgw_rados_operate(
+ res.dpp, res.store->getRados()->get_notif_pool_ctx(),
+ queue_name, &op, res.yield, librados::OPERATION_RETURNVEC);
+ if (ret < 0) {
+ ldpp_dout(res.dpp, 1) <<
+ "ERROR: failed to reserve notification on queue: "
+ << queue_name << ". error: " << ret << dendl;
+ // if no space is left in queue we ask client to slow down
+ return (ret == -ENOSPC) ? -ERR_RATE_LIMITED : ret;
+ }
+ ret = cls_2pc_queue_reserve_result(obl, res_id);
+ if (ret < 0) {
+ ldpp_dout(res.dpp, 1) << "ERROR: failed to parse reservation id. error: " << ret << dendl;
+ return ret;
+ }
+ }
+ res.topics.emplace_back(topic_filter.s3_id, topic_cfg, res_id);
+ }
+ return 0;
+}
+
+int publish_commit(rgw::sal::Object* obj,
+ uint64_t size,
+ const ceph::real_time& mtime,
+ const std::string& etag,
+ const std::string& version,
+ EventType event_type,
+ reservation_t& res,
+ const DoutPrefixProvider* dpp)
+{
+ for (auto& topic : res.topics) {
+ if (topic.cfg.dest.persistent &&
+ topic.res_id == cls_2pc_reservation::NO_ID) {
+ // nothing to commit or already committed/aborted
+ continue;
+ }
+ event_entry_t event_entry;
+ populate_event(res, obj, size, mtime, etag, version, event_type, event_entry.event);
+ event_entry.event.configurationId = topic.configurationId;
+ event_entry.event.opaque_data = topic.cfg.opaque_data;
+ if (topic.cfg.dest.persistent) {
+ event_entry.push_endpoint = std::move(topic.cfg.dest.push_endpoint);
+ event_entry.push_endpoint_args =
+ std::move(topic.cfg.dest.push_endpoint_args);
+ event_entry.arn_topic = topic.cfg.dest.arn_topic;
+ bufferlist bl;
+ encode(event_entry, bl);
+ const auto& queue_name = topic.cfg.dest.arn_topic;
+ if (bl.length() > res.size) {
+ // try to make a larger reservation, fail only if this is not possible
+ ldpp_dout(dpp, 5) << "WARNING: committed size: " << bl.length()
+ << " exceeded reserved size: " << res.size
+ <<
+ " . trying to make a larger reservation on queue:" << queue_name
+ << dendl;
+ // first cancel the existing reservation
+ librados::ObjectWriteOperation op;
+ cls_2pc_queue_abort(op, topic.res_id);
+ auto ret = rgw_rados_operate(
+ dpp, res.store->getRados()->get_notif_pool_ctx(),
+ topic.cfg.dest.arn_topic, &op,
+ res.yield);
+ if (ret < 0) {
+ ldpp_dout(dpp, 1) << "ERROR: failed to abort reservation: "
+ << topic.res_id <<
+ " when trying to make a larger reservation on queue: " << queue_name
+ << ". error: " << ret << dendl;
+ return ret;
+ }
+ // now try to make a bigger one
+ buffer::list obl;
+ int rval;
+ cls_2pc_queue_reserve(op, bl.length(), 1, &obl, &rval);
+ ret = rgw_rados_operate(
+ dpp, res.store->getRados()->get_notif_pool_ctx(),
+ queue_name, &op, res.yield, librados::OPERATION_RETURNVEC);
+ if (ret < 0) {
+ ldpp_dout(dpp, 1) << "ERROR: failed to reserve extra space on queue: "
+ << queue_name
+ << ". error: " << ret << dendl;
+ return (ret == -ENOSPC) ? -ERR_RATE_LIMITED : ret;
+ }
+ ret = cls_2pc_queue_reserve_result(obl, topic.res_id);
+ if (ret < 0) {
+ ldpp_dout(dpp, 1) << "ERROR: failed to parse reservation id for "
+ "extra space. error: " << ret << dendl;
+ return ret;
+ }
+ }
+ std::vector<buffer::list> bl_data_vec{std::move(bl)};
+ librados::ObjectWriteOperation op;
+ cls_2pc_queue_commit(op, bl_data_vec, topic.res_id);
+ const auto ret = rgw_rados_operate(
+ dpp, res.store->getRados()->get_notif_pool_ctx(),
+ queue_name, &op, res.yield);
+ topic.res_id = cls_2pc_reservation::NO_ID;
+ if (ret < 0) {
+ ldpp_dout(dpp, 1) << "ERROR: failed to commit reservation to queue: "
+ << queue_name << ". error: " << ret
+ << dendl;
+ return ret;
+ }
+ } else {
+ try {
+ // TODO add endpoint LRU cache
+ const auto push_endpoint = RGWPubSubEndpoint::create(
+ topic.cfg.dest.push_endpoint,
+ topic.cfg.dest.arn_topic,
+ RGWHTTPArgs(topic.cfg.dest.push_endpoint_args, dpp),
+ dpp->get_cct());
+ ldpp_dout(res.dpp, 20) << "INFO: push endpoint created: "
+ << topic.cfg.dest.push_endpoint << dendl;
+ const auto ret = push_endpoint->send_to_completion_async(
+ dpp->get_cct(), event_entry.event, res.yield);
+ if (ret < 0) {
+ ldpp_dout(dpp, 1) << "ERROR: push to endpoint "
+ << topic.cfg.dest.push_endpoint
+ << " failed. error: " << ret << dendl;
+ if (perfcounter) perfcounter->inc(l_rgw_pubsub_push_failed);
+ return ret;
+ }
+ if (perfcounter) perfcounter->inc(l_rgw_pubsub_push_ok);
+ } catch (const RGWPubSubEndpoint::configuration_error& e) {
+ ldpp_dout(dpp, 1) << "ERROR: failed to create push endpoint: "
+ << topic.cfg.dest.push_endpoint << ". error: " << e.what() << dendl;
+ if (perfcounter) perfcounter->inc(l_rgw_pubsub_push_failed);
+ return -EINVAL;
+ }
+ }
+ }
+ return 0;
+}
+
+int publish_abort(reservation_t& res) {
+ for (auto& topic : res.topics) {
+ if (!topic.cfg.dest.persistent ||
+ topic.res_id == cls_2pc_reservation::NO_ID) {
+ // nothing to abort or already committed/aborted
+ continue;
+ }
+ const auto& queue_name = topic.cfg.dest.arn_topic;
+ librados::ObjectWriteOperation op;
+ cls_2pc_queue_abort(op, topic.res_id);
+ const auto ret = rgw_rados_operate(
+ res.dpp, res.store->getRados()->get_notif_pool_ctx(),
+ queue_name, &op, res.yield);
+ if (ret < 0) {
+ ldpp_dout(res.dpp, 1) << "ERROR: failed to abort reservation: "
+ << topic.res_id <<
+ " from queue: " << queue_name << ". error: " << ret << dendl;
+ return ret;
+ }
+ topic.res_id = cls_2pc_reservation::NO_ID;
+ }
+ return 0;
+}
+
+reservation_t::reservation_t(const DoutPrefixProvider* _dpp,
+ rgw::sal::RadosStore* _store,
+ const req_state* _s,
+ rgw::sal::Object* _object,
+ rgw::sal::Object* _src_object,
+ const std::string* _object_name,
+ optional_yield y) :
+ dpp(_s), store(_store), s(_s), size(0) /* XXX */,
+ object(_object), src_object(_src_object), bucket(_s->bucket.get()),
+ object_name(_object_name),
+ tagset(_s->tagset),
+ metadata_fetched_from_attributes(false),
+ user_id(_s->user->get_id().id),
+ user_tenant(_s->user->get_id().tenant),
+ req_id(_s->req_id),
+ yield(y)
+{
+ filter_amz_meta(x_meta_map, _s->info.x_meta_map);
+}
+
+reservation_t::reservation_t(const DoutPrefixProvider* _dpp,
+ rgw::sal::RadosStore* _store,
+ rgw::sal::Object* _object,
+ rgw::sal::Object* _src_object,
+ rgw::sal::Bucket* _bucket,
+ const std::string& _user_id,
+ const std::string& _user_tenant,
+ const std::string& _req_id,
+ optional_yield y) :
+ dpp(_dpp), store(_store), s(nullptr), size(0) /* XXX */,
+ object(_object), src_object(_src_object), bucket(_bucket),
+ object_name(nullptr),
+ metadata_fetched_from_attributes(false),
+ user_id(_user_id),
+ user_tenant(_user_tenant),
+ req_id(_req_id),
+ yield(y)
+{}
+
+reservation_t::~reservation_t() {
+ publish_abort(*this);
+}
+
+} // namespace rgw::notify
diff --git a/src/rgw/driver/rados/rgw_notify.h b/src/rgw/driver/rados/rgw_notify.h
new file mode 100644
index 000000000..9269611e4
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_notify.h
@@ -0,0 +1,121 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include <string>
+#include "common/ceph_time.h"
+#include "include/common_fwd.h"
+#include "rgw_notify_event_type.h"
+#include "common/async/yield_context.h"
+#include "cls/2pc_queue/cls_2pc_queue_types.h"
+#include "rgw_pubsub.h"
+
+// forward declarations
+namespace rgw::sal {
+ class RadosStore;
+ class RGWObject;
+}
+
+class RGWRados;
+struct rgw_obj_key;
+
+namespace rgw::notify {
+
+// initialize the notification manager
+// the notification manager dequeues the 2-phase-commit queues
+// and sends the notifications to the endpoints
+bool init(CephContext* cct, rgw::sal::RadosStore* store, const DoutPrefixProvider *dpp);
+
+// shutdown the notification manager
+void shutdown();
+
+// create a persistent delivery queue for a topic (endpoint)
+// this operation also adds the topic name to the list of all topics shared by all RGWs
+int add_persistent_topic(const std::string& topic_name, optional_yield y);
+
+// remove the persistent delivery queue for a topic (endpoint)
+// this operation also removes the topic name from the list of all topics shared by all RGWs
+int remove_persistent_topic(const std::string& topic_name, optional_yield y);
+
+// same as the above, except that the caller provides the IoCtx; the overload above uses rgw::notify::Manager::rados_ioctx
+int remove_persistent_topic(const DoutPrefixProvider* dpp, librados::IoCtx& rados_ioctx, const std::string& topic_name, optional_yield y);
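+
+// Typical lifecycle (illustrative sketch; cct, store, dpp and y stand for the
+// caller's own context, store, prefix provider and yield token):
+//   rgw::notify::init(cct, store, dpp);                    // once, at startup
+//   rgw::notify::add_persistent_topic("my-topic", y);      // when a persistent topic is created
+//   ...
+//   rgw::notify::remove_persistent_topic("my-topic", y);   // when the topic is deleted
+//   rgw::notify::shutdown();                               // once, at teardown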
+
+// struct holding reservation information
+// populated in the publish_reserve call
+// then used to commit or abort the reservation
+struct reservation_t {
+ struct topic_t {
+ topic_t(const std::string& _configurationId, const rgw_pubsub_topic& _cfg,
+ cls_2pc_reservation::id_t _res_id) :
+ configurationId(_configurationId), cfg(_cfg), res_id(_res_id) {}
+
+ const std::string configurationId;
+ const rgw_pubsub_topic cfg;
+ // res_id is reset after topic is committed/aborted
+ cls_2pc_reservation::id_t res_id;
+ };
+
+ const DoutPrefixProvider* const dpp;
+ std::vector<topic_t> topics;
+ rgw::sal::RadosStore* const store;
+ const req_state* const s;
+ size_t size;
+ rgw::sal::Object* const object;
+ rgw::sal::Object* const src_object; // may differ from object
+ rgw::sal::Bucket* const bucket;
+ const std::string* const object_name;
+ boost::optional<const RGWObjTags&> tagset;
+ meta_map_t x_meta_map; // metadata cached by value
+ bool metadata_fetched_from_attributes;
+ const std::string user_id;
+ const std::string user_tenant;
+ const std::string req_id;
+ optional_yield yield;
+
+ /* ctor for rgw_op callers */
+ reservation_t(const DoutPrefixProvider* _dpp,
+ rgw::sal::RadosStore* _store,
+ const req_state* _s,
+ rgw::sal::Object* _object,
+ rgw::sal::Object* _src_object,
+ const std::string* _object_name,
+ optional_yield y);
+
+ /* ctor for non-request caller (e.g., lifecycle) */
+ reservation_t(const DoutPrefixProvider* _dpp,
+ rgw::sal::RadosStore* _store,
+ rgw::sal::Object* _object,
+ rgw::sal::Object* _src_object,
+ rgw::sal::Bucket* _bucket,
+ const std::string& _user_id,
+ const std::string& _user_tenant,
+ const std::string& _req_id,
+ optional_yield y);
+
+  // the dtor guards against resource leaks by aborting the reservation
+  // if it was not already committed or aborted
+ ~reservation_t();
+};
+
+// create a reservation on the 2-phase-commit queue
+int publish_reserve(const DoutPrefixProvider *dpp,
+                    EventType event_type,
+                    reservation_t& reservation,
+                    const RGWObjTags* req_tags);
+
+// commit the reservation to the queue
+int publish_commit(rgw::sal::Object* obj,
+ uint64_t size,
+ const ceph::real_time& mtime,
+ const std::string& etag,
+ const std::string& version,
+ EventType event_type,
+ reservation_t& reservation,
+ const DoutPrefixProvider *dpp);
+
+// cancel the reservation
+int publish_abort(reservation_t& reservation);
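+
+// Illustrative call sequence, as implied by the declarations above (the
+// req_state 's', object 'obj', sizes and the event type here are only
+// examples standing in for the caller's own values):
+//   reservation_t res(dpp, store, s, obj, nullptr, nullptr, y);
+//   int ret = publish_reserve(dpp, rgw::notify::ObjectCreatedPut, res, nullptr);
+//   if (ret < 0) { /* e.g. -ERR_RATE_LIMITED when a persistent queue is full */ }
+//   // ... perform the actual object write ...
+//   ret = publish_commit(obj, size, mtime, etag, version,
+//                        rgw::notify::ObjectCreatedPut, res, dpp);
+//   // if the operation fails before the commit, ~reservation_t() aborts the
+//   // reservation automatically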
+
+} // namespace rgw::notify
+
diff --git a/src/rgw/driver/rados/rgw_obj_manifest.cc b/src/rgw/driver/rados/rgw_obj_manifest.cc
new file mode 100644
index 000000000..92ade8120
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_obj_manifest.cc
@@ -0,0 +1,409 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "rgw_obj_manifest.h"
+
+#include "services/svc_zone.h"
+#include "rgw_rados.h"
+#include "rgw_bucket.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+int RGWObjManifest::generator::create_next(uint64_t ofs)
+{
+ if (ofs < last_ofs) /* only going forward */
+ return -EINVAL;
+
+ uint64_t max_head_size = manifest->get_max_head_size();
+
+ if (ofs < max_head_size) {
+ manifest->set_head_size(ofs);
+ }
+
+ if (ofs >= max_head_size) {
+ manifest->set_head_size(max_head_size);
+ cur_stripe = (ofs - max_head_size) / rule.stripe_max_size;
+ cur_stripe_size = rule.stripe_max_size;
+
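+    // the head object occupies stripe 0 of the first part, so when a head
+    // exists the first tail stripe starts at index 1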
+ if (cur_part_id == 0 && max_head_size > 0) {
+ cur_stripe++;
+ }
+ }
+
+ last_ofs = ofs;
+ manifest->set_obj_size(ofs);
+
+ manifest->get_implicit_location(cur_part_id, cur_stripe, ofs, NULL, &cur_obj);
+
+ return 0;
+}
+
+int RGWObjManifest::append(const DoutPrefixProvider *dpp, RGWObjManifest& m, const RGWZoneGroup& zonegroup,
+ const RGWZoneParams& zone_params)
+{
+ if (explicit_objs || m.explicit_objs) {
+ return append_explicit(dpp, m, zonegroup, zone_params);
+ }
+
+ if (rules.empty()) {
+ *this = m;
+ return 0;
+ }
+
+ string override_prefix;
+
+ if (prefix.empty()) {
+ prefix = m.prefix;
+ }
+
+ if (prefix != m.prefix) {
+ override_prefix = m.prefix;
+ }
+
+ map<uint64_t, RGWObjManifestRule>::iterator miter = m.rules.begin();
+ if (miter == m.rules.end()) {
+ return append_explicit(dpp, m, zonegroup, zone_params);
+ }
+
+ for (; miter != m.rules.end(); ++miter) {
+ map<uint64_t, RGWObjManifestRule>::reverse_iterator last_rule = rules.rbegin();
+
+ RGWObjManifestRule& rule = last_rule->second;
+
+ if (rule.part_size == 0) {
+ rule.part_size = obj_size - rule.start_ofs;
+ }
+
+ RGWObjManifestRule& next_rule = miter->second;
+ if (!next_rule.part_size) {
+ next_rule.part_size = m.obj_size - next_rule.start_ofs;
+ }
+
+ string rule_prefix = prefix;
+ if (!rule.override_prefix.empty()) {
+ rule_prefix = rule.override_prefix;
+ }
+
+ string next_rule_prefix = m.prefix;
+ if (!next_rule.override_prefix.empty()) {
+ next_rule_prefix = next_rule.override_prefix;
+ }
+
+ if (rule.part_size != next_rule.part_size ||
+ rule.stripe_max_size != next_rule.stripe_max_size ||
+ rule_prefix != next_rule_prefix) {
+ if (next_rule_prefix != prefix) {
+ append_rules(m, miter, &next_rule_prefix);
+ } else {
+ append_rules(m, miter, NULL);
+ }
+ break;
+ }
+
+ uint64_t expected_part_num = rule.start_part_num + 1;
+ if (rule.part_size > 0) {
+ expected_part_num = rule.start_part_num + (obj_size + next_rule.start_ofs - rule.start_ofs) / rule.part_size;
+ }
+
+ if (expected_part_num != next_rule.start_part_num) {
+ append_rules(m, miter, NULL);
+ break;
+ }
+ }
+
+ set_obj_size(obj_size + m.obj_size);
+
+ return 0;
+}
+
+void RGWObjManifest::append_rules(RGWObjManifest& m, map<uint64_t, RGWObjManifestRule>::iterator& miter,
+ string *override_prefix)
+{
+ for (; miter != m.rules.end(); ++miter) {
+ RGWObjManifestRule rule = miter->second;
+ rule.start_ofs += obj_size;
+ if (override_prefix)
+ rule.override_prefix = *override_prefix;
+ rules[rule.start_ofs] = rule;
+ }
+}
+
+void RGWObjManifest::convert_to_explicit(const DoutPrefixProvider *dpp, const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params)
+{
+ if (explicit_objs) {
+ return;
+ }
+ obj_iterator iter = obj_begin(dpp);
+
+ while (iter != obj_end(dpp)) {
+ RGWObjManifestPart& part = objs[iter.get_stripe_ofs()];
+ const rgw_obj_select& os = iter.get_location();
+ const rgw_raw_obj& raw_loc = os.get_raw_obj(zonegroup, zone_params);
+ part.loc_ofs = 0;
+
+ uint64_t ofs = iter.get_stripe_ofs();
+
+ if (ofs == 0) {
+ part.loc = obj;
+ } else {
+ RGWSI_Tier_RADOS::raw_obj_to_obj(tail_placement.bucket, raw_loc, &part.loc);
+ }
+ ++iter;
+ uint64_t next_ofs = iter.get_stripe_ofs();
+
+ part.size = next_ofs - ofs;
+ }
+
+ explicit_objs = true;
+ rules.clear();
+ prefix.clear();
+}
+
+int RGWObjManifest::append_explicit(const DoutPrefixProvider *dpp, RGWObjManifest& m, const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params)
+{
+ if (!explicit_objs) {
+ convert_to_explicit(dpp, zonegroup, zone_params);
+ }
+ if (!m.explicit_objs) {
+ m.convert_to_explicit(dpp, zonegroup, zone_params);
+ }
+ map<uint64_t, RGWObjManifestPart>::iterator iter;
+ uint64_t base = obj_size;
+ for (iter = m.objs.begin(); iter != m.objs.end(); ++iter) {
+ RGWObjManifestPart& part = iter->second;
+ objs[base + iter->first] = part;
+ }
+ obj_size += m.obj_size;
+
+ return 0;
+}
+
+bool RGWObjManifest::get_rule(uint64_t ofs, RGWObjManifestRule *rule)
+{
+ if (rules.empty()) {
+ return false;
+ }
+
+ map<uint64_t, RGWObjManifestRule>::iterator iter = rules.upper_bound(ofs);
+ if (iter != rules.begin()) {
+ --iter;
+ }
+
+ *rule = iter->second;
+
+ return true;
+}
+
+int RGWObjManifest::generator::create_begin(CephContext *cct, RGWObjManifest *_m,
+ const rgw_placement_rule& head_placement_rule,
+ const rgw_placement_rule *tail_placement_rule,
+ const rgw_bucket& _b, const rgw_obj& _obj)
+{
+ manifest = _m;
+
+ if (!tail_placement_rule) {
+ manifest->set_tail_placement(head_placement_rule, _b);
+ } else {
+ rgw_placement_rule new_tail_rule = *tail_placement_rule;
+ new_tail_rule.inherit_from(head_placement_rule);
+ manifest->set_tail_placement(new_tail_rule, _b);
+ }
+
+ manifest->set_head(head_placement_rule, _obj, 0);
+ last_ofs = 0;
+
+ if (manifest->get_prefix().empty()) {
+ char buf[33];
+ gen_rand_alphanumeric(cct, buf, sizeof(buf) - 1);
+
+ string oid_prefix = ".";
+ oid_prefix.append(buf);
+ oid_prefix.append("_");
+
+ manifest->set_prefix(oid_prefix);
+ }
+
+ bool found = manifest->get_rule(0, &rule);
+ if (!found) {
+ derr << "ERROR: manifest->get_rule() could not find rule" << dendl;
+ return -EIO;
+ }
+
+ uint64_t head_size = manifest->get_head_size();
+
+ if (head_size > 0) {
+ cur_stripe_size = head_size;
+ } else {
+ cur_stripe_size = rule.stripe_max_size;
+ }
+
+ cur_part_id = rule.start_part_num;
+
+ manifest->get_implicit_location(cur_part_id, cur_stripe, 0, NULL, &cur_obj);
+
+  // Normal object that was not generated through a copy operation
+ manifest->set_tail_instance(_obj.key.instance);
+
+ return 0;
+}
+
+void RGWObjManifestPart::generate_test_instances(std::list<RGWObjManifestPart*>& o)
+{
+ o.push_back(new RGWObjManifestPart);
+
+ RGWObjManifestPart *p = new RGWObjManifestPart;
+ rgw_bucket b;
+ init_bucket(&b, "tenant", "bucket", ".pool", ".index_pool", "marker_", "12");
+
+ p->loc = rgw_obj(b, "object");
+ p->loc_ofs = 512 * 1024;
+ p->size = 128 * 1024;
+ o.push_back(p);
+}
+
+void RGWObjManifest::generate_test_instances(std::list<RGWObjManifest*>& o)
+{
+ RGWObjManifest *m = new RGWObjManifest;
+ map<uint64_t, RGWObjManifestPart> objs;
+ uint64_t total_size = 0;
+ for (int i = 0; i<10; i++) {
+ RGWObjManifestPart p;
+ rgw_bucket b;
+ init_bucket(&b, "tenant", "bucket", ".pool", ".index_pool", "marker_", "12");
+ p.loc = rgw_obj(b, "object");
+ p.loc_ofs = 0;
+ p.size = 512 * 1024;
+ total_size += p.size;
+ objs[total_size] = p;
+ }
+ m->set_explicit(total_size, objs);
+ o.push_back(m);
+ o.push_back(new RGWObjManifest);
+}
+
+void RGWObjManifestPart::dump(Formatter *f) const
+{
+ f->open_object_section("loc");
+ loc.dump(f);
+ f->close_section();
+ f->dump_unsigned("loc_ofs", loc_ofs);
+ f->dump_unsigned("size", size);
+}
+
+void RGWObjManifest::obj_iterator::dump(Formatter *f) const
+{
+ f->dump_unsigned("part_ofs", part_ofs);
+ f->dump_unsigned("stripe_ofs", stripe_ofs);
+ f->dump_unsigned("ofs", ofs);
+ f->dump_unsigned("stripe_size", stripe_size);
+ f->dump_int("cur_part_id", cur_part_id);
+ f->dump_int("cur_stripe", cur_stripe);
+ f->dump_string("cur_override_prefix", cur_override_prefix);
+ f->dump_object("location", location);
+}
+
+void RGWObjManifest::dump(Formatter *f) const
+{
+ map<uint64_t, RGWObjManifestPart>::const_iterator iter = objs.begin();
+ f->open_array_section("objs");
+ for (; iter != objs.end(); ++iter) {
+ f->dump_unsigned("ofs", iter->first);
+ f->open_object_section("part");
+ iter->second.dump(f);
+ f->close_section();
+ }
+ f->close_section();
+ f->dump_unsigned("obj_size", obj_size);
+ ::encode_json("explicit_objs", explicit_objs, f);
+ ::encode_json("head_size", head_size, f);
+ ::encode_json("max_head_size", max_head_size, f);
+ ::encode_json("prefix", prefix, f);
+ ::encode_json("rules", rules, f);
+ ::encode_json("tail_instance", tail_instance, f);
+ ::encode_json("tail_placement", tail_placement, f);
+ ::encode_json("tier_type", tier_type, f);
+
+ if (tier_type == "cloud-s3") {
+ ::encode_json("tier_config", tier_config, f);
+ }
+
+  // nullptr is passed into the iterators since there
+  // is no cct and we aren't doing anything with these
+  // iterators that would write to the log
+ f->dump_object("begin_iter", obj_begin(nullptr));
+ f->dump_object("end_iter", obj_end(nullptr));
+}
+
+void RGWObjManifestRule::dump(Formatter *f) const
+{
+ encode_json("start_part_num", start_part_num, f);
+ encode_json("start_ofs", start_ofs, f);
+ encode_json("part_size", part_size, f);
+ encode_json("stripe_max_size", stripe_max_size, f);
+ encode_json("override_prefix", override_prefix, f);
+}
+
+void rgw_obj_select::dump(Formatter *f) const
+{
+ f->dump_string("placement_rule", placement_rule.to_str());
+ f->dump_object("obj", obj);
+ f->dump_object("raw_obj", raw_obj);
+ f->dump_bool("is_raw", is_raw);
+}
+
+void RGWObjTier::dump(Formatter *f) const
+{
+ encode_json("name", name, f);
+ encode_json("tier_placement", tier_placement, f);
+ encode_json("is_multipart_upload", is_multipart_upload, f);
+}
+
+// returns true on success, false on failure
+static bool rgw_get_obj_data_pool(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params,
+ const rgw_placement_rule& head_placement_rule,
+ const rgw_obj& obj, rgw_pool *pool)
+{
+ if (!zone_params.get_head_data_pool(head_placement_rule, obj, pool)) {
+ RGWZonePlacementInfo placement;
+ if (!zone_params.get_placement(zonegroup.default_placement.name, &placement)) {
+ return false;
+ }
+
+ if (!obj.in_extra_data) {
+ *pool = placement.get_data_pool(zonegroup.default_placement.storage_class);
+ } else {
+ *pool = placement.get_data_extra_pool();
+ }
+ }
+
+ return true;
+}
+
+static bool rgw_obj_to_raw(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params,
+ const rgw_placement_rule& head_placement_rule,
+ const rgw_obj& obj, rgw_raw_obj *raw_obj)
+{
+ get_obj_bucket_and_oid_loc(obj, raw_obj->oid, raw_obj->loc);
+
+ return rgw_get_obj_data_pool(zonegroup, zone_params, head_placement_rule, obj, &raw_obj->pool);
+}
+
+rgw_raw_obj rgw_obj_select::get_raw_obj(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params) const
+{
+ if (!is_raw) {
+ rgw_raw_obj r;
+ rgw_obj_to_raw(zonegroup, zone_params, placement_rule, obj, &r);
+ return r;
+ }
+ return raw_obj;
+}
+
+// returns true on success, false on failure
+bool RGWRados::get_obj_data_pool(const rgw_placement_rule& placement_rule, const rgw_obj& obj, rgw_pool *pool)
+{
+ return rgw_get_obj_data_pool(svc.zone->get_zonegroup(), svc.zone->get_zone_params(), placement_rule, obj, pool);
+}
+
diff --git a/src/rgw/driver/rados/rgw_obj_manifest.h b/src/rgw/driver/rados/rgw_obj_manifest.h
new file mode 100644
index 000000000..6984184aa
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_obj_manifest.h
@@ -0,0 +1,622 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2019 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+/* N.B., this header defines fundamental serialized types. Do not
+ * introduce changes or include files which can only be compiled in
+ * radosgw or OSD contexts (e.g., rgw_sal.h, rgw_common.h)
+ */
+
+#pragma once
+
+#include "rgw_zone_types.h"
+#include "rgw_bucket_types.h"
+#include "rgw_obj_types.h"
+#include "rgw_placement_types.h"
+
+#include "common/dout.h"
+#include "common/Formatter.h"
+
+class RGWSI_Zone;
+struct RGWZoneGroup;
+struct RGWZoneParams;
+class RGWRados;
+
+namespace rgw { namespace sal {
+ class RadosStore;
+} };
+
+class rgw_obj_select {
+ rgw_placement_rule placement_rule;
+ rgw_obj obj;
+ rgw_raw_obj raw_obj;
+ bool is_raw;
+
+public:
+ rgw_obj_select() : is_raw(false) {}
+ explicit rgw_obj_select(const rgw_obj& _obj) : obj(_obj), is_raw(false) {}
+ explicit rgw_obj_select(const rgw_raw_obj& _raw_obj) : raw_obj(_raw_obj), is_raw(true) {}
+ rgw_obj_select(const rgw_obj_select& rhs) {
+ placement_rule = rhs.placement_rule;
+ is_raw = rhs.is_raw;
+ if (is_raw) {
+ raw_obj = rhs.raw_obj;
+ } else {
+ obj = rhs.obj;
+ }
+ }
+
+ rgw_raw_obj get_raw_obj(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params) const;
+ rgw_raw_obj get_raw_obj(RGWRados* store) const;
+
+ rgw_obj_select& operator=(const rgw_obj& rhs) {
+ obj = rhs;
+ is_raw = false;
+ return *this;
+ }
+
+ rgw_obj_select& operator=(const rgw_raw_obj& rhs) {
+ raw_obj = rhs;
+ is_raw = true;
+ return *this;
+ }
+
+ void set_placement_rule(const rgw_placement_rule& rule) {
+ placement_rule = rule;
+ }
+ void dump(Formatter *f) const;
+};
+
+struct RGWObjManifestPart {
+ rgw_obj loc; /* the object where the data is located */
+ uint64_t loc_ofs; /* the offset at that object where the data is located */
+ uint64_t size; /* the part size */
+
+ RGWObjManifestPart() : loc_ofs(0), size(0) {}
+
+ void encode(bufferlist& bl) const {
+ ENCODE_START(2, 2, bl);
+ encode(loc, bl);
+ encode(loc_ofs, bl);
+ encode(size, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::const_iterator& bl) {
+ DECODE_START_LEGACY_COMPAT_LEN_32(2, 2, 2, bl);
+ decode(loc, bl);
+ decode(loc_ofs, bl);
+ decode(size, bl);
+ DECODE_FINISH(bl);
+ }
+
+ void dump(Formatter *f) const;
+ static void generate_test_instances(std::list<RGWObjManifestPart*>& o);
+};
+WRITE_CLASS_ENCODER(RGWObjManifestPart)
+
+/*
+ The manifest defines a set of rules for structuring the object parts.
+ There are a few terms to note:
+ - head: the head part of the object, which is the part that contains
+ the first chunk of data. An object might not have a head (as in the
+ case of multipart-part objects).
+ - stripe: data portion of a single rgw object that resides on a single
+ rados object.
+ - part: a collection of stripes that make a contiguous part of an
+    object. A regular object will only have one part (although it might have
+ many stripes), a multipart object might have many parts. Each part
+ has a fixed stripe size, although the last stripe of a part might
+ be smaller than that. Consecutive parts may be merged if their stripe
+ value is the same.
+*/
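+
+/*
+  Illustrative example (values are made up): a 10 MiB object written with a
+  4 MiB head and 4 MiB stripes under a single trivial rule would be laid out as
+    head object  : bytes [0, 4 MiB)       -- stripe 0
+    tail stripe 1: bytes [4 MiB, 8 MiB)
+    tail stripe 2: bytes [8 MiB, 10 MiB)  -- the last stripe may be shorter
+  all belonging to a single part.
+*/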
+
+struct RGWObjManifestRule {
+ uint32_t start_part_num;
+ uint64_t start_ofs;
+ uint64_t part_size; /* each part size, 0 if there's no part size, meaning it's unlimited */
+ uint64_t stripe_max_size; /* underlying obj max size */
+ std::string override_prefix;
+
+ RGWObjManifestRule() : start_part_num(0), start_ofs(0), part_size(0), stripe_max_size(0) {}
+ RGWObjManifestRule(uint32_t _start_part_num, uint64_t _start_ofs, uint64_t _part_size, uint64_t _stripe_max_size) :
+ start_part_num(_start_part_num), start_ofs(_start_ofs), part_size(_part_size), stripe_max_size(_stripe_max_size) {}
+
+ void encode(bufferlist& bl) const {
+ ENCODE_START(2, 1, bl);
+ encode(start_part_num, bl);
+ encode(start_ofs, bl);
+ encode(part_size, bl);
+ encode(stripe_max_size, bl);
+ encode(override_prefix, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::const_iterator& bl) {
+ DECODE_START(2, bl);
+ decode(start_part_num, bl);
+ decode(start_ofs, bl);
+ decode(part_size, bl);
+ decode(stripe_max_size, bl);
+ if (struct_v >= 2)
+ decode(override_prefix, bl);
+ DECODE_FINISH(bl);
+ }
+ void dump(Formatter *f) const;
+};
+WRITE_CLASS_ENCODER(RGWObjManifestRule)
+
+struct RGWObjTier {
+ std::string name;
+ RGWZoneGroupPlacementTier tier_placement;
+ bool is_multipart_upload{false};
+
+ RGWObjTier(): name("none") {}
+
+ void encode(bufferlist& bl) const {
+ ENCODE_START(2, 2, bl);
+ encode(name, bl);
+ encode(tier_placement, bl);
+ encode(is_multipart_upload, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::const_iterator& bl) {
+ DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
+ decode(name, bl);
+ decode(tier_placement, bl);
+ decode(is_multipart_upload, bl);
+ DECODE_FINISH(bl);
+ }
+ void dump(Formatter *f) const;
+};
+WRITE_CLASS_ENCODER(RGWObjTier)
+
+class RGWObjManifest {
+protected:
+ bool explicit_objs{false}; /* really old manifest? */
+ std::map<uint64_t, RGWObjManifestPart> objs;
+
+ uint64_t obj_size{0};
+
+ rgw_obj obj;
+ uint64_t head_size{0};
+ rgw_placement_rule head_placement_rule;
+
+ uint64_t max_head_size{0};
+ std::string prefix;
+ rgw_bucket_placement tail_placement; /* might be different than the original bucket,
+ as object might have been copied across pools */
+ std::map<uint64_t, RGWObjManifestRule> rules;
+
+ std::string tail_instance; /* tail object's instance */
+
+ std::string tier_type;
+ RGWObjTier tier_config;
+
+ void convert_to_explicit(const DoutPrefixProvider *dpp, const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params);
+ int append_explicit(const DoutPrefixProvider *dpp, RGWObjManifest& m, const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params);
+ void append_rules(RGWObjManifest& m, std::map<uint64_t, RGWObjManifestRule>::iterator& iter, std::string *override_prefix);
+
+public:
+
+ RGWObjManifest() = default;
+ RGWObjManifest(const RGWObjManifest& rhs) {
+ *this = rhs;
+ }
+ RGWObjManifest& operator=(const RGWObjManifest& rhs) {
+ explicit_objs = rhs.explicit_objs;
+ objs = rhs.objs;
+ obj_size = rhs.obj_size;
+    obj = rhs.obj;
+    head_size = rhs.head_size;
+    head_placement_rule = rhs.head_placement_rule;
+    max_head_size = rhs.max_head_size;
+ prefix = rhs.prefix;
+ tail_placement = rhs.tail_placement;
+ rules = rhs.rules;
+ tail_instance = rhs.tail_instance;
+ tier_type = rhs.tier_type;
+ tier_config = rhs.tier_config;
+ return *this;
+ }
+
+ std::map<uint64_t, RGWObjManifestPart>& get_explicit_objs() {
+ return objs;
+ }
+
+
+ void set_explicit(uint64_t _size, std::map<uint64_t, RGWObjManifestPart>& _objs) {
+ explicit_objs = true;
+ objs.swap(_objs);
+ set_obj_size(_size);
+ }
+
+ void get_implicit_location(uint64_t cur_part_id, uint64_t cur_stripe, uint64_t ofs,
+ std::string *override_prefix, rgw_obj_select *location) const;
+
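+  // single rule for a regular object: the head holds (at most) the first
+  // tail_ofs bytes, the tail is striped into stripe_max_size sized chunks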
+ void set_trivial_rule(uint64_t tail_ofs, uint64_t stripe_max_size) {
+ RGWObjManifestRule rule(0, tail_ofs, 0, stripe_max_size);
+ rules[0] = rule;
+ max_head_size = tail_ofs;
+ }
+
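+  // rule for a multipart part: no head data, stripes of stripe_max_size,
+  // part numbering starting at part_num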
+ void set_multipart_part_rule(uint64_t stripe_max_size, uint64_t part_num) {
+ RGWObjManifestRule rule(0, 0, 0, stripe_max_size);
+ rule.start_part_num = part_num;
+ rules[0] = rule;
+ max_head_size = 0;
+ }
+
+ void encode(bufferlist& bl) const {
+ ENCODE_START(8, 6, bl);
+ encode(obj_size, bl);
+ encode(objs, bl);
+ encode(explicit_objs, bl);
+ encode(obj, bl);
+ encode(head_size, bl);
+ encode(max_head_size, bl);
+ encode(prefix, bl);
+ encode(rules, bl);
+ bool encode_tail_bucket = !(tail_placement.bucket == obj.bucket);
+ encode(encode_tail_bucket, bl);
+ if (encode_tail_bucket) {
+ encode(tail_placement.bucket, bl);
+ }
+ bool encode_tail_instance = (tail_instance != obj.key.instance);
+ encode(encode_tail_instance, bl);
+ if (encode_tail_instance) {
+ encode(tail_instance, bl);
+ }
+ encode(head_placement_rule, bl);
+ encode(tail_placement.placement_rule, bl);
+ encode(tier_type, bl);
+ encode(tier_config, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::const_iterator& bl) {
+ DECODE_START_LEGACY_COMPAT_LEN_32(7, 2, 2, bl);
+ decode(obj_size, bl);
+ decode(objs, bl);
+ if (struct_v >= 3) {
+ decode(explicit_objs, bl);
+ decode(obj, bl);
+ decode(head_size, bl);
+ decode(max_head_size, bl);
+ decode(prefix, bl);
+ decode(rules, bl);
+ } else {
+ explicit_objs = true;
+ if (!objs.empty()) {
+ std::map<uint64_t, RGWObjManifestPart>::iterator iter = objs.begin();
+ obj = iter->second.loc;
+ head_size = iter->second.size;
+ max_head_size = head_size;
+ }
+ }
+
+ if (explicit_objs && head_size > 0 && !objs.empty()) {
+ /* patch up manifest due to issue 16435:
+ * the first object in the explicit objs list might not be the one we need to access, use the
+ * head object instead if set. This would happen if we had an old object that was created
+ * when the explicit objs manifest was around, and it got copied.
+ */
+ rgw_obj& obj_0 = objs[0].loc;
+ if (!obj_0.get_oid().empty() && obj_0.key.ns.empty()) {
+ objs[0].loc = obj;
+ objs[0].size = head_size;
+ }
+ }
+
+ if (struct_v >= 4) {
+ if (struct_v < 6) {
+ decode(tail_placement.bucket, bl);
+ } else {
+ bool need_to_decode;
+ decode(need_to_decode, bl);
+ if (need_to_decode) {
+ decode(tail_placement.bucket, bl);
+ } else {
+ tail_placement.bucket = obj.bucket;
+ }
+ }
+ }
+
+ if (struct_v >= 5) {
+ if (struct_v < 6) {
+ decode(tail_instance, bl);
+ } else {
+ bool need_to_decode;
+ decode(need_to_decode, bl);
+ if (need_to_decode) {
+ decode(tail_instance, bl);
+ } else {
+ tail_instance = obj.key.instance;
+ }
+ }
+ } else { // old object created before 'tail_instance' field added to manifest
+ tail_instance = obj.key.instance;
+ }
+
+ if (struct_v >= 7) {
+ decode(head_placement_rule, bl);
+ decode(tail_placement.placement_rule, bl);
+ }
+
+ if (struct_v >= 8) {
+ decode(tier_type, bl);
+ decode(tier_config, bl);
+ }
+
+ DECODE_FINISH(bl);
+ }
+
+ void dump(Formatter *f) const;
+ static void generate_test_instances(std::list<RGWObjManifest*>& o);
+
+ int append(const DoutPrefixProvider *dpp, RGWObjManifest& m, const RGWZoneGroup& zonegroup,
+ const RGWZoneParams& zone_params);
+
+ bool get_rule(uint64_t ofs, RGWObjManifestRule *rule);
+
+ bool empty() const {
+ if (explicit_objs)
+ return objs.empty();
+ return rules.empty();
+ }
+
+ bool has_explicit_objs() const {
+ return explicit_objs;
+ }
+
+ bool has_tail() const {
+ if (explicit_objs) {
+ if (objs.size() == 1) {
+ auto iter = objs.begin();
+ const rgw_obj& o = iter->second.loc;
+ return !(obj == o);
+ }
+ return (objs.size() >= 2);
+ }
+ return (obj_size > head_size);
+ }
+
+ void set_head(const rgw_placement_rule& placement_rule, const rgw_obj& _o, uint64_t _s) {
+ head_placement_rule = placement_rule;
+ obj = _o;
+ head_size = _s;
+
+ if (explicit_objs && head_size > 0) {
+ objs[0].loc = obj;
+ objs[0].size = head_size;
+ }
+ }
+
+ const rgw_obj& get_obj() const {
+ return obj;
+ }
+
+ void set_tail_placement(const rgw_placement_rule& placement_rule, const rgw_bucket& _b) {
+ tail_placement.placement_rule = placement_rule;
+ tail_placement.bucket = _b;
+ }
+
+ const rgw_bucket_placement& get_tail_placement() const {
+ return tail_placement;
+ }
+
+ const rgw_placement_rule& get_head_placement_rule() const {
+ return head_placement_rule;
+ }
+
+ void set_prefix(const std::string& _p) {
+ prefix = _p;
+ }
+
+ const std::string& get_prefix() const {
+ return prefix;
+ }
+
+ void set_tail_instance(const std::string& _ti) {
+ tail_instance = _ti;
+ }
+
+ const std::string& get_tail_instance() const {
+ return tail_instance;
+ }
+
+ void set_head_size(uint64_t _s) {
+ head_size = _s;
+ }
+
+ void set_obj_size(uint64_t s) {
+ obj_size = s;
+ }
+
+ uint64_t get_obj_size() const {
+ return obj_size;
+ }
+
+ uint64_t get_head_size() const {
+ return head_size;
+ }
+
+ uint64_t get_max_head_size() const {
+ return max_head_size;
+ }
+
+ const std::string& get_tier_type() {
+ return tier_type;
+ }
+
+ inline void set_tier_type(std::string value) {
+ /* Only "cloud-s3" tier-type is supported for now */
+ if (value == "cloud-s3") {
+ tier_type = value;
+ }
+ }
+
+ inline void set_tier_config(RGWObjTier t) {
+ /* Set only if tier_type set to "cloud-s3" */
+ if (tier_type != "cloud-s3")
+ return;
+
+ tier_config.name = t.name;
+ tier_config.tier_placement = t.tier_placement;
+ tier_config.is_multipart_upload = t.is_multipart_upload;
+ }
+
+  inline void get_tier_config(RGWObjTier* t) {
+ if (tier_type != "cloud-s3")
+ return;
+
+ t->name = tier_config.name;
+ t->tier_placement = tier_config.tier_placement;
+ t->is_multipart_upload = tier_config.is_multipart_upload;
+ }
+
+ class obj_iterator {
+ const DoutPrefixProvider *dpp;
+ const RGWObjManifest *manifest = nullptr;
+ uint64_t part_ofs = 0; /* where current part starts */
+ uint64_t stripe_ofs = 0; /* where current stripe starts */
+ uint64_t ofs = 0; /* current position within the object */
+    uint64_t stripe_size = 0;     /* current stripe size */
+
+ int cur_part_id = 0;
+ int cur_stripe = 0;
+ std::string cur_override_prefix;
+
+ rgw_obj_select location;
+
+ std::map<uint64_t, RGWObjManifestRule>::const_iterator rule_iter;
+ std::map<uint64_t, RGWObjManifestRule>::const_iterator next_rule_iter;
+ std::map<uint64_t, RGWObjManifestPart>::const_iterator explicit_iter;
+
+ void update_explicit_pos();
+
+ public:
+ obj_iterator() = default;
+ explicit obj_iterator(const DoutPrefixProvider *_dpp, const RGWObjManifest *_m)
+ : obj_iterator(_dpp, _m, 0)
+ {}
+ obj_iterator(const DoutPrefixProvider *_dpp, const RGWObjManifest *_m, uint64_t _ofs) : dpp(_dpp), manifest(_m) {
+ seek(_ofs);
+ }
+ void seek(uint64_t ofs);
+
+ void operator++();
+ bool operator==(const obj_iterator& rhs) const {
+ return (ofs == rhs.ofs);
+ }
+ bool operator!=(const obj_iterator& rhs) const {
+ return (ofs != rhs.ofs);
+ }
+ const rgw_obj_select& get_location() {
+ return location;
+ }
+
+ /* where current part starts */
+ uint64_t get_part_ofs() const {
+ return part_ofs;
+ }
+
+ /* start of current stripe */
+ uint64_t get_stripe_ofs() {
+ if (manifest->explicit_objs) {
+ return explicit_iter->first;
+ }
+ return stripe_ofs;
+ }
+
+ /* current ofs relative to start of rgw object */
+ uint64_t get_ofs() const {
+ return ofs;
+ }
+
+ int get_cur_part_id() const {
+ return cur_part_id;
+ }
+
+ /* stripe number */
+ int get_cur_stripe() const {
+ return cur_stripe;
+ }
+
+ /* current stripe size */
+ uint64_t get_stripe_size() {
+ if (manifest->explicit_objs) {
+ return explicit_iter->second.size;
+ }
+ return stripe_size;
+ }
+
+ /* offset where data starts within current stripe */
+ uint64_t location_ofs() {
+ if (manifest->explicit_objs) {
+ return explicit_iter->second.loc_ofs;
+ }
+ return 0; /* all stripes start at zero offset */
+ }
+
+ void update_location();
+
+ void dump(Formatter *f) const;
+ }; // class obj_iterator
+
+ obj_iterator obj_begin(const DoutPrefixProvider *dpp) const { return obj_iterator{dpp, this}; }
+ obj_iterator obj_end(const DoutPrefixProvider *dpp) const { return obj_iterator{dpp, this, obj_size}; }
+ obj_iterator obj_find(const DoutPrefixProvider *dpp, uint64_t ofs) const {
+ return obj_iterator{dpp, this, std::min(ofs, obj_size)};
+ }
+
+ /*
+   * simple object generator, using a single-rule manifest.
+ */
+ class generator {
+ RGWObjManifest *manifest;
+ uint64_t last_ofs;
+ uint64_t cur_part_ofs;
+ int cur_part_id;
+ int cur_stripe;
+ uint64_t cur_stripe_size;
+ std::string cur_oid;
+
+ std::string oid_prefix;
+
+ rgw_obj_select cur_obj;
+
+ RGWObjManifestRule rule;
+
+ public:
+ generator() : manifest(NULL), last_ofs(0), cur_part_ofs(0), cur_part_id(0),
+ cur_stripe(0), cur_stripe_size(0) {}
+ int create_begin(CephContext *cct, RGWObjManifest *manifest,
+ const rgw_placement_rule& head_placement_rule,
+ const rgw_placement_rule *tail_placement_rule,
+ const rgw_bucket& bucket,
+ const rgw_obj& obj);
+
+ int create_next(uint64_t ofs);
+
+ rgw_raw_obj get_cur_obj(RGWZoneGroup& zonegroup, RGWZoneParams& zone_params) { return cur_obj.get_raw_obj(zonegroup, zone_params); }
+ rgw_raw_obj get_cur_obj(RGWRados* store) const { return cur_obj.get_raw_obj(store); }
+
+ /* total max size of current stripe (including head obj) */
+ uint64_t cur_stripe_max_size() const {
+ return cur_stripe_size;
+ }
+ };
+};
+WRITE_CLASS_ENCODER(RGWObjManifest)
diff --git a/src/rgw/driver/rados/rgw_object_expirer_core.cc b/src/rgw/driver/rados/rgw_object_expirer_core.cc
new file mode 100644
index 000000000..ec1bf3fb6
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_object_expirer_core.cc
@@ -0,0 +1,442 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include <errno.h>
+#include <iostream>
+#include <sstream>
+#include <string>
+
+
+#include "auth/Crypto.h"
+
+#include "common/armor.h"
+#include "common/ceph_json.h"
+#include "common/config.h"
+#include "common/ceph_argparse.h"
+#include "common/Formatter.h"
+#include "common/errno.h"
+
+#include "global/global_init.h"
+
+#include "include/utime.h"
+#include "include/str_list.h"
+
+#include "rgw_user.h"
+#include "rgw_bucket.h"
+#include "rgw_acl.h"
+#include "rgw_acl_s3.h"
+#include "rgw_log.h"
+#include "rgw_formats.h"
+#include "rgw_usage.h"
+#include "rgw_object_expirer_core.h"
+#include "rgw_zone.h"
+#include "rgw_sal_rados.h"
+
+#include "services/svc_rados.h"
+#include "services/svc_zone.h"
+#include "services/svc_sys_obj.h"
+#include "services/svc_bi_rados.h"
+
+#include "cls/lock/cls_lock_client.h"
+#include "cls/timeindex/cls_timeindex_client.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+static string objexp_lock_name = "gc_process";
+
+static string objexp_hint_get_shardname(int shard_num)
+{
+ char buf[64];
+ snprintf(buf, sizeof(buf), "obj_delete_at_hint.%010u", (unsigned)shard_num);
+ return buf;
+}
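+// e.g. shard 3 yields the RADOS object name "obj_delete_at_hint.0000000003"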
+
+static int objexp_key_shard(const rgw_obj_index_key& key, int num_shards)
+{
+ string obj_key = key.name + key.instance;
+ return RGWSI_BucketIndex_RADOS::bucket_shard_index(obj_key, num_shards);
+}
+
+static string objexp_hint_get_keyext(const string& tenant_name,
+ const string& bucket_name,
+ const string& bucket_id,
+ const rgw_obj_key& obj_key) {
+ return tenant_name + (tenant_name.empty() ? "" : ":") + bucket_name + ":" + bucket_id +
+ ":" + obj_key.name + ":" + obj_key.instance;
+}
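+// e.g. "acme:photos:<bucket_id>:img.jpg:<instance>" for a tenanted bucket, or
+// "photos:<bucket_id>:img.jpg:" when there is no tenant and no version instance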
+
+static void objexp_get_shard(int shard_num,
+ string *shard)
+{
+ *shard = objexp_hint_get_shardname(shard_num);
+}
+
+static int objexp_hint_parse(const DoutPrefixProvider *dpp, CephContext *cct, cls_timeindex_entry &ti_entry,
+ objexp_hint_entry *hint_entry)
+{
+ try {
+ auto iter = ti_entry.value.cbegin();
+ decode(*hint_entry, iter);
+ } catch (buffer::error& err) {
+ ldpp_dout(dpp, 0) << "ERROR: couldn't decode avail_pools" << dendl;
+ }
+
+ return 0;
+}
+
+int RGWObjExpStore::objexp_hint_add(const DoutPrefixProvider *dpp,
+ const ceph::real_time& delete_at,
+ const string& tenant_name,
+ const string& bucket_name,
+ const string& bucket_id,
+ const rgw_obj_index_key& obj_key)
+{
+ const string keyext = objexp_hint_get_keyext(tenant_name, bucket_name,
+ bucket_id, obj_key);
+ objexp_hint_entry he = {
+ .tenant = tenant_name,
+ .bucket_name = bucket_name,
+ .bucket_id = bucket_id,
+ .obj_key = obj_key,
+ .exp_time = delete_at };
+ bufferlist hebl;
+ encode(he, hebl);
+ librados::ObjectWriteOperation op;
+ cls_timeindex_add(op, utime_t(delete_at), keyext, hebl);
+
+ string shard_name = objexp_hint_get_shardname(objexp_key_shard(obj_key, cct->_conf->rgw_objexp_hints_num_shards));
+ auto obj = rados_svc->obj(rgw_raw_obj(driver->svc()->zone->get_zone_params().log_pool, shard_name));
+ int r = obj.open(dpp);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: " << __func__ << "(): failed to open obj=" << obj << " (r=" << r << ")" << dendl;
+ return r;
+ }
+ return obj.operate(dpp, &op, null_yield);
+}
+
+int RGWObjExpStore::objexp_hint_list(const DoutPrefixProvider *dpp,
+ const string& oid,
+ const ceph::real_time& start_time,
+ const ceph::real_time& end_time,
+ const int max_entries,
+ const string& marker,
+ list<cls_timeindex_entry>& entries, /* out */
+ string *out_marker, /* out */
+ bool *truncated) /* out */
+{
+ librados::ObjectReadOperation op;
+ cls_timeindex_list(op, utime_t(start_time), utime_t(end_time), marker, max_entries, entries,
+ out_marker, truncated);
+
+ auto obj = rados_svc->obj(rgw_raw_obj(driver->svc()->zone->get_zone_params().log_pool, oid));
+ int r = obj.open(dpp);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: " << __func__ << "(): failed to open obj=" << obj << " (r=" << r << ")" << dendl;
+ return r;
+ }
+ bufferlist obl;
+ int ret = obj.operate(dpp, &op, &obl, null_yield);
+
+ if ((ret < 0 ) && (ret != -ENOENT)) {
+ return ret;
+ }
+
+ if ((ret == -ENOENT) && truncated) {
+ *truncated = false;
+ }
+
+ return 0;
+}
+
+static int cls_timeindex_trim_repeat(const DoutPrefixProvider *dpp,
+ rgw_rados_ref ref,
+ const string& oid,
+ const utime_t& from_time,
+ const utime_t& to_time,
+ const string& from_marker,
+ const string& to_marker)
+{
+ bool done = false;
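+  // cls_timeindex_trim removes a bounded number of entries per call, so
+  // repeat until it reports -ENODATA (nothing left in the range)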
+ do {
+ librados::ObjectWriteOperation op;
+ cls_timeindex_trim(op, from_time, to_time, from_marker, to_marker);
+ int r = rgw_rados_operate(dpp, ref.pool.ioctx(), oid, &op, null_yield);
+ if (r == -ENODATA)
+ done = true;
+ else if (r < 0)
+ return r;
+ } while (!done);
+
+ return 0;
+}
+
+int RGWObjExpStore::objexp_hint_trim(const DoutPrefixProvider *dpp,
+ const string& oid,
+ const ceph::real_time& start_time,
+ const ceph::real_time& end_time,
+ const string& from_marker,
+ const string& to_marker)
+{
+ auto obj = rados_svc->obj(rgw_raw_obj(driver->svc()->zone->get_zone_params().log_pool, oid));
+ int r = obj.open(dpp);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: " << __func__ << "(): failed to open obj=" << obj << " (r=" << r << ")" << dendl;
+ return r;
+ }
+ auto& ref = obj.get_ref();
+ int ret = cls_timeindex_trim_repeat(dpp, ref, oid, utime_t(start_time), utime_t(end_time),
+ from_marker, to_marker);
+ if ((ret < 0 ) && (ret != -ENOENT)) {
+ return ret;
+ }
+
+ return 0;
+}
+
+int RGWObjectExpirer::garbage_single_object(const DoutPrefixProvider *dpp, objexp_hint_entry& hint)
+{
+ RGWBucketInfo bucket_info;
+ std::unique_ptr<rgw::sal::Bucket> bucket;
+
+ int ret = driver->get_bucket(dpp, nullptr, rgw_bucket(hint.tenant, hint.bucket_name, hint.bucket_id), &bucket, null_yield);
+ if (-ENOENT == ret) {
+ ldpp_dout(dpp, 15) << "NOTICE: cannot find bucket = " \
+ << hint.bucket_name << ". The object must be already removed" << dendl;
+ return -ERR_PRECONDITION_FAILED;
+ } else if (ret < 0) {
+ ldpp_dout(dpp, 1) << "ERROR: could not init bucket = " \
+ << hint.bucket_name << "due to ret = " << ret << dendl;
+ return ret;
+ }
+
+ rgw_obj_key key = hint.obj_key;
+ if (key.instance.empty()) {
+ key.instance = "null";
+ }
+
+ std::unique_ptr<rgw::sal::Object> obj = bucket->get_object(key);
+ obj->set_atomic();
+ ret = obj->delete_object(dpp, null_yield);
+
+ return ret;
+}
+
+void RGWObjectExpirer::garbage_chunk(const DoutPrefixProvider *dpp,
+ list<cls_timeindex_entry>& entries, /* in */
+ bool& need_trim) /* out */
+{
+ need_trim = false;
+
+ for (list<cls_timeindex_entry>::iterator iter = entries.begin();
+ iter != entries.end();
+ ++iter)
+ {
+ objexp_hint_entry hint;
+ ldpp_dout(dpp, 15) << "got removal hint for: " << iter->key_ts.sec() \
+ << " - " << iter->key_ext << dendl;
+
+ int ret = objexp_hint_parse(dpp, driver->ctx(), *iter, &hint);
+ if (ret < 0) {
+ ldpp_dout(dpp, 1) << "cannot parse removal hint for " << hint.obj_key << dendl;
+ continue;
+ }
+
+ /* PRECOND_FAILED simply means that our hint is not valid.
+ * We can silently ignore that and move forward. */
+ ret = garbage_single_object(dpp, hint);
+ if (ret == -ERR_PRECONDITION_FAILED) {
+ ldpp_dout(dpp, 15) << "not actual hint for object: " << hint.obj_key << dendl;
+ } else if (ret < 0) {
+ ldpp_dout(dpp, 1) << "cannot remove expired object: " << hint.obj_key << dendl;
+ }
+
+ need_trim = true;
+ }
+
+ return;
+}
+
+void RGWObjectExpirer::trim_chunk(const DoutPrefixProvider *dpp,
+ const string& shard,
+ const utime_t& from,
+ const utime_t& to,
+ const string& from_marker,
+ const string& to_marker)
+{
+ ldpp_dout(dpp, 20) << "trying to trim removal hints to=" << to
+ << ", to_marker=" << to_marker << dendl;
+
+ real_time rt_from = from.to_real_time();
+ real_time rt_to = to.to_real_time();
+
+ int ret = exp_store.objexp_hint_trim(dpp, shard, rt_from, rt_to,
+ from_marker, to_marker);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR during trim: " << ret << dendl;
+ }
+
+ return;
+}
+
+bool RGWObjectExpirer::process_single_shard(const DoutPrefixProvider *dpp,
+ const string& shard,
+ const utime_t& last_run,
+ const utime_t& round_start)
+{
+ string marker;
+ string out_marker;
+ bool truncated = false;
+ bool done = true;
+
+ CephContext *cct = driver->ctx();
+ int num_entries = cct->_conf->rgw_objexp_chunk_size;
+
+ int max_secs = cct->_conf->rgw_objexp_gc_interval;
+ utime_t end = ceph_clock_now();
+ end += max_secs;
+
+ rados::cls::lock::Lock l(objexp_lock_name);
+
+ utime_t time(max_secs, 0);
+ l.set_duration(time);
+
+ int ret = l.lock_exclusive(&static_cast<rgw::sal::RadosStore*>(driver)->getRados()->objexp_pool_ctx, shard);
+ if (ret == -EBUSY) { /* already locked by another processor */
+ ldpp_dout(dpp, 5) << __func__ << "(): failed to acquire lock on " << shard << dendl;
+ return false;
+ }
+
+ do {
+ real_time rt_last = last_run.to_real_time();
+ real_time rt_start = round_start.to_real_time();
+
+ list<cls_timeindex_entry> entries;
+ ret = exp_store.objexp_hint_list(dpp, shard, rt_last, rt_start,
+ num_entries, marker, entries,
+ &out_marker, &truncated);
+ if (ret < 0) {
+ ldpp_dout(dpp, 10) << "cannot get removal hints from shard: " << shard
+ << dendl;
+ continue;
+ }
+
+ bool need_trim;
+ garbage_chunk(dpp, entries, need_trim);
+
+ if (need_trim) {
+ trim_chunk(dpp, shard, last_run, round_start, marker, out_marker);
+ }
+
+ utime_t now = ceph_clock_now();
+ if (now >= end) {
+ done = false;
+ break;
+ }
+
+ marker = out_marker;
+ } while (truncated);
+
+ l.unlock(&static_cast<rgw::sal::RadosStore*>(driver)->getRados()->objexp_pool_ctx, shard);
+ return done;
+}
+
+/* Returns true if all shards have been processed successfully. */
+bool RGWObjectExpirer::inspect_all_shards(const DoutPrefixProvider *dpp,
+ const utime_t& last_run,
+ const utime_t& round_start)
+{
+ CephContext * const cct = driver->ctx();
+ int num_shards = cct->_conf->rgw_objexp_hints_num_shards;
+ bool all_done = true;
+
+ for (int i = 0; i < num_shards; i++) {
+ string shard;
+ objexp_get_shard(i, &shard);
+
+ ldpp_dout(dpp, 20) << "processing shard = " << shard << dendl;
+
+ if (! process_single_shard(dpp, shard, last_run, round_start)) {
+ all_done = false;
+ }
+ }
+
+ return all_done;
+}
+
+bool RGWObjectExpirer::going_down()
+{
+ return down_flag;
+}
+
+void RGWObjectExpirer::start_processor()
+{
+ worker = new OEWorker(driver->ctx(), this);
+ worker->create("rgw_obj_expirer");
+}
+
+void RGWObjectExpirer::stop_processor()
+{
+ down_flag = true;
+ if (worker) {
+ worker->stop();
+ worker->join();
+ }
+ delete worker;
+ worker = NULL;
+}
+
+void *RGWObjectExpirer::OEWorker::entry() {
+ utime_t last_run;
+ do {
+ utime_t start = ceph_clock_now();
+ ldpp_dout(this, 2) << "object expiration: start" << dendl;
+ if (oe->inspect_all_shards(this, last_run, start)) {
+ /* All shards have been processed properly. Next time we can start
+ * from this moment. */
+ last_run = start;
+ }
+ ldpp_dout(this, 2) << "object expiration: stop" << dendl;
+
+
+ if (oe->going_down())
+ break;
+
+ utime_t end = ceph_clock_now();
+ end -= start;
+ int secs = cct->_conf->rgw_objexp_gc_interval;
+
+ if (secs <= end.sec())
+ continue; // next round
+
+ secs -= end.sec();
+
+ std::unique_lock l{lock};
+ cond.wait_for(l, std::chrono::seconds(secs));
+ } while (!oe->going_down());
+
+ return NULL;
+}
+
+void RGWObjectExpirer::OEWorker::stop()
+{
+ std::lock_guard l{lock};
+ cond.notify_all();
+}
+
+CephContext *RGWObjectExpirer::OEWorker::get_cct() const
+{
+ return cct;
+}
+
+unsigned RGWObjectExpirer::OEWorker::get_subsys() const
+{
+ return dout_subsys;
+}
+
+std::ostream& RGWObjectExpirer::OEWorker::gen_prefix(std::ostream& out) const
+{
+ return out << "rgw object expirer Worker thread: ";
+}
diff --git a/src/rgw/driver/rados/rgw_object_expirer_core.h b/src/rgw/driver/rados/rgw_object_expirer_core.h
new file mode 100644
index 000000000..be63815c1
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_object_expirer_core.h
@@ -0,0 +1,146 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include <atomic>
+#include <string>
+#include <cerrno>
+#include <sstream>
+#include <iostream>
+
+#include "auth/Crypto.h"
+
+#include "common/armor.h"
+#include "common/ceph_json.h"
+#include "common/config.h"
+#include "common/ceph_argparse.h"
+#include "common/Formatter.h"
+#include "common/errno.h"
+
+#include "common/ceph_mutex.h"
+#include "common/Cond.h"
+#include "common/Thread.h"
+
+#include "global/global_init.h"
+
+#include "include/common_fwd.h"
+#include "include/utime.h"
+#include "include/str_list.h"
+
+#include "rgw_sal_rados.h"
+
+class RGWSI_RADOS;
+class RGWSI_Zone;
+class RGWBucketInfo;
+class cls_timeindex_entry;
+
+class RGWObjExpStore {
+ CephContext *cct;
+ RGWSI_RADOS *rados_svc;
+ rgw::sal::RadosStore* driver;
+public:
+ RGWObjExpStore(CephContext *_cct, RGWSI_RADOS *_rados_svc, rgw::sal::RadosStore* _driver) : cct(_cct),
+ rados_svc(_rados_svc),
+ driver(_driver) {}
+
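+  /* add a removal hint for an object that is due to expire at 'delete_at' */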
+ int objexp_hint_add(const DoutPrefixProvider *dpp,
+ const ceph::real_time& delete_at,
+ const std::string& tenant_name,
+ const std::string& bucket_name,
+ const std::string& bucket_id,
+ const rgw_obj_index_key& obj_key);
+
+ int objexp_hint_list(const DoutPrefixProvider *dpp,
+ const std::string& oid,
+ const ceph::real_time& start_time,
+ const ceph::real_time& end_time,
+ const int max_entries,
+ const std::string& marker,
+ std::list<cls_timeindex_entry>& entries, /* out */
+ std::string *out_marker, /* out */
+ bool *truncated); /* out */
+
+ int objexp_hint_trim(const DoutPrefixProvider *dpp,
+ const std::string& oid,
+ const ceph::real_time& start_time,
+ const ceph::real_time& end_time,
+ const std::string& from_marker,
+ const std::string& to_marker);
+};
+
+class RGWObjectExpirer {
+protected:
+ rgw::sal::Driver* driver;
+ RGWObjExpStore exp_store;
+
+ class OEWorker : public Thread, public DoutPrefixProvider {
+ CephContext *cct;
+ RGWObjectExpirer *oe;
+ ceph::mutex lock = ceph::make_mutex("OEWorker");
+ ceph::condition_variable cond;
+
+ public:
+ OEWorker(CephContext * const cct,
+ RGWObjectExpirer * const oe)
+ : cct(cct),
+ oe(oe) {
+ }
+
+ void *entry() override;
+ void stop();
+
+ CephContext *get_cct() const override;
+ unsigned get_subsys() const override;
+ std::ostream& gen_prefix(std::ostream& out) const override;
+ };
+
+ OEWorker *worker{nullptr};
+ std::atomic<bool> down_flag = { false };
+
+public:
+ explicit RGWObjectExpirer(rgw::sal::Driver* _driver)
+ : driver(_driver),
+ exp_store(_driver->ctx(), static_cast<rgw::sal::RadosStore*>(driver)->svc()->rados, static_cast<rgw::sal::RadosStore*>(driver)),
+ worker(NULL) {
+ }
+ ~RGWObjectExpirer() {
+ stop_processor();
+ }
+
+ int hint_add(const DoutPrefixProvider *dpp,
+ const ceph::real_time& delete_at,
+ const std::string& tenant_name,
+ const std::string& bucket_name,
+ const std::string& bucket_id,
+ const rgw_obj_index_key& obj_key) {
+ return exp_store.objexp_hint_add(dpp, delete_at, tenant_name, bucket_name,
+ bucket_id, obj_key);
+ }
+
+ int garbage_single_object(const DoutPrefixProvider *dpp, objexp_hint_entry& hint);
+
+ void garbage_chunk(const DoutPrefixProvider *dpp,
+ std::list<cls_timeindex_entry>& entries, /* in */
+ bool& need_trim); /* out */
+
+ void trim_chunk(const DoutPrefixProvider *dpp,
+ const std::string& shard,
+ const utime_t& from,
+ const utime_t& to,
+ const std::string& from_marker,
+ const std::string& to_marker);
+
+ bool process_single_shard(const DoutPrefixProvider *dpp,
+ const std::string& shard,
+ const utime_t& last_run,
+ const utime_t& round_start);
+
+ bool inspect_all_shards(const DoutPrefixProvider *dpp,
+ const utime_t& last_run,
+ const utime_t& round_start);
+
+ bool going_down();
+ void start_processor();
+ void stop_processor();
+};
diff --git a/src/rgw/driver/rados/rgw_otp.cc b/src/rgw/driver/rados/rgw_otp.cc
new file mode 100644
index 000000000..07cc14f11
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_otp.cc
@@ -0,0 +1,211 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include <errno.h>
+
+#include <string>
+#include <map>
+#include <boost/algorithm/string.hpp>
+
+#include "common/errno.h"
+#include "common/Formatter.h"
+#include "common/ceph_json.h"
+#include "rgw_otp.h"
+#include "rgw_zone.h"
+#include "rgw_metadata.h"
+
+#include "include/types.h"
+
+#include "rgw_common.h"
+#include "rgw_tools.h"
+
+#include "services/svc_zone.h"
+#include "services/svc_meta.h"
+#include "services/svc_meta_be.h"
+#include "services/svc_meta_be_otp.h"
+#include "services/svc_otp.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+
+class RGWOTPMetadataHandler;
+
+class RGWOTPMetadataObject : public RGWMetadataObject {
+ friend class RGWOTPMetadataHandler;
+
+ otp_devices_list_t devices;
+public:
+ RGWOTPMetadataObject() {}
+ RGWOTPMetadataObject(otp_devices_list_t&& _devices, const obj_version& v, const real_time m) {
+ devices = std::move(_devices);
+ objv = v;
+ mtime = m;
+ }
+
+ void dump(Formatter *f) const override {
+ encode_json("devices", devices, f);
+ }
+
+ otp_devices_list_t& get_devs() {
+ return devices;
+ }
+};
+
+
+class RGWOTPMetadataHandler : public RGWOTPMetadataHandlerBase {
+ friend class RGWOTPCtl;
+
+ struct Svc {
+ RGWSI_Zone *zone;
+ RGWSI_MetaBackend *meta_be;
+ RGWSI_OTP *otp;
+ } svc;
+
+ int init(RGWSI_Zone *zone,
+ RGWSI_MetaBackend *_meta_be,
+ RGWSI_OTP *_otp) {
+ base_init(zone->ctx(), _otp->get_be_handler().get());
+ svc.zone = zone;
+ svc.meta_be = _meta_be;
+ svc.otp = _otp;
+ return 0;
+ }
+
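+  /* run the given function inside a metadata-backend op, handing it an OTP
+   * backend context */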
+ int call(std::function<int(RGWSI_OTP_BE_Ctx& ctx)> f) {
+ return be_handler->call([&](RGWSI_MetaBackend_Handler::Op *op) {
+ RGWSI_OTP_BE_Ctx ctx(op->ctx());
+ return f(ctx);
+ });
+ }
+
+ RGWMetadataObject *get_meta_obj(JSONObj *jo, const obj_version& objv, const ceph::real_time& mtime) override {
+ otp_devices_list_t devices;
+ try {
+ JSONDecoder::decode_json("devices", devices, jo);
+ } catch (JSONDecoder::err& e) {
+ return nullptr;
+ }
+
+ return new RGWOTPMetadataObject(std::move(devices), objv, mtime);
+ }
+
+ int do_get(RGWSI_MetaBackend_Handler::Op *op, string& entry, RGWMetadataObject **obj, optional_yield y, const DoutPrefixProvider *dpp) override {
+ RGWObjVersionTracker objv_tracker;
+
+ std::unique_ptr<RGWOTPMetadataObject> mdo(new RGWOTPMetadataObject);
+
+
+ RGWSI_OTP_BE_Ctx be_ctx(op->ctx());
+
+ int ret = svc.otp->read_all(be_ctx,
+ entry,
+ &mdo->get_devs(),
+ &mdo->get_mtime(),
+ &objv_tracker,
+ y,
+ dpp);
+ if (ret < 0) {
+ return ret;
+ }
+
+ mdo->objv = objv_tracker.read_version;
+
+ *obj = mdo.release();
+
+ return 0;
+ }
+
+ int do_put(RGWSI_MetaBackend_Handler::Op *op, string& entry,
+ RGWMetadataObject *_obj, RGWObjVersionTracker& objv_tracker,
+ optional_yield y,
+ const DoutPrefixProvider *dpp,
+ RGWMDLogSyncType type, bool from_remote_zone) override {
+ RGWOTPMetadataObject *obj = static_cast<RGWOTPMetadataObject *>(_obj);
+
+ RGWSI_OTP_BE_Ctx be_ctx(op->ctx());
+
+ int ret = svc.otp->store_all(dpp, be_ctx,
+ entry,
+ obj->devices,
+ obj->mtime,
+ &objv_tracker,
+ y);
+ if (ret < 0) {
+ return ret;
+ }
+
+ return STATUS_APPLIED;
+ }
+
+ int do_remove(RGWSI_MetaBackend_Handler::Op *op, string& entry, RGWObjVersionTracker& objv_tracker,
+ optional_yield y, const DoutPrefixProvider *dpp) override {
+ RGWSI_MBOTP_RemoveParams params;
+
+ RGWSI_OTP_BE_Ctx be_ctx(op->ctx());
+
+ return svc.otp->remove_all(dpp, be_ctx,
+ entry,
+ &objv_tracker,
+ y);
+ }
+
+public:
+ RGWOTPMetadataHandler() {}
+
+ string get_type() override { return "otp"; }
+};
+
+
+RGWOTPCtl::RGWOTPCtl(RGWSI_Zone *zone_svc,
+ RGWSI_OTP *otp_svc)
+{
+ svc.zone = zone_svc;
+ svc.otp = otp_svc;
+}
+
+
+void RGWOTPCtl::init(RGWOTPMetadataHandler *_meta_handler)
+{
+ meta_handler = _meta_handler;
+ be_handler = meta_handler->get_be_handler();
+}
+
+int RGWOTPCtl::read_all(const rgw_user& uid,
+ RGWOTPInfo *info,
+ optional_yield y,
+ const DoutPrefixProvider *dpp,
+ const GetParams& params)
+{
+ info->uid = uid;
+ return meta_handler->call([&](RGWSI_OTP_BE_Ctx& ctx) {
+ return svc.otp->read_all(ctx, uid, &info->devices, params.mtime, params.objv_tracker, y, dpp);
+ });
+}
+
+int RGWOTPCtl::store_all(const DoutPrefixProvider *dpp,
+ const RGWOTPInfo& info,
+ optional_yield y,
+ const PutParams& params)
+{
+ return meta_handler->call([&](RGWSI_OTP_BE_Ctx& ctx) {
+ return svc.otp->store_all(dpp, ctx, info.uid, info.devices, params.mtime, params.objv_tracker, y);
+ });
+}
+
+int RGWOTPCtl::remove_all(const DoutPrefixProvider *dpp,
+ const rgw_user& uid,
+ optional_yield y,
+ const RemoveParams& params)
+{
+ return meta_handler->call([&](RGWSI_OTP_BE_Ctx& ctx) {
+ return svc.otp->remove_all(dpp, ctx, uid, params.objv_tracker, y);
+ });
+}
+
+
+RGWMetadataHandler *RGWOTPMetaHandlerAllocator::alloc()
+{
+ return new RGWOTPMetadataHandler();
+}
diff --git a/src/rgw/driver/rados/rgw_otp.h b/src/rgw/driver/rados/rgw_otp.h
new file mode 100644
index 000000000..885e8abb8
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_otp.h
@@ -0,0 +1,110 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include "rgw_sal_fwd.h"
+#include "cls/otp/cls_otp_types.h"
+#include "services/svc_meta_be_otp.h"
+
+#include "rgw_basic_types.h"
+#include "rgw_metadata.h"
+
+
+class RGWObjVersionTracker;
+class RGWMetadataHandler;
+class RGWOTPMetadataHandler;
+class RGWSI_Zone;
+class RGWSI_OTP;
+class RGWSI_MetaBackend;
+
+class RGWOTPMetadataHandlerBase : public RGWMetadataHandler_GenericMetaBE {
+public:
+ virtual ~RGWOTPMetadataHandlerBase() {}
+ virtual int init(RGWSI_Zone *zone,
+ RGWSI_MetaBackend *_meta_be,
+ RGWSI_OTP *_otp) = 0;
+};
+
+class RGWOTPMetaHandlerAllocator {
+public:
+ static RGWMetadataHandler *alloc();
+};
+
+struct RGWOTPInfo {
+ rgw_user uid;
+ otp_devices_list_t devices;
+};
+
+
+class RGWOTPCtl
+{
+ struct Svc {
+ RGWSI_Zone *zone{nullptr};
+ RGWSI_OTP *otp{nullptr};
+ } svc;
+
+ RGWOTPMetadataHandler *meta_handler;
+ RGWSI_MetaBackend_Handler *be_handler;
+
+public:
+ RGWOTPCtl(RGWSI_Zone *zone_svc,
+ RGWSI_OTP *otp_svc);
+
+ void init(RGWOTPMetadataHandler *_meta_handler);
+
+ struct GetParams {
+ RGWObjVersionTracker *objv_tracker{nullptr};
+ ceph::real_time *mtime{nullptr};
+
+ GetParams() {}
+
+ GetParams& set_objv_tracker(RGWObjVersionTracker *_objv_tracker) {
+ objv_tracker = _objv_tracker;
+ return *this;
+ }
+
+ GetParams& set_mtime(ceph::real_time *_mtime) {
+ mtime = _mtime;
+ return *this;
+ }
+ };
+
+ struct PutParams {
+ RGWObjVersionTracker *objv_tracker{nullptr};
+ ceph::real_time mtime;
+
+ PutParams() {}
+
+ PutParams& set_objv_tracker(RGWObjVersionTracker *_objv_tracker) {
+ objv_tracker = _objv_tracker;
+ return *this;
+ }
+
+ PutParams& set_mtime(const ceph::real_time& _mtime) {
+ mtime = _mtime;
+ return *this;
+ }
+ };
+
+ struct RemoveParams {
+ RGWObjVersionTracker *objv_tracker{nullptr};
+
+ RemoveParams() {}
+
+ RemoveParams& set_objv_tracker(RGWObjVersionTracker *_objv_tracker) {
+ objv_tracker = _objv_tracker;
+ return *this;
+ }
+ };
+
+ int read_all(const rgw_user& uid, RGWOTPInfo *info, optional_yield y,
+ const DoutPrefixProvider *dpp,
+ const GetParams& params = {});
+ int store_all(const DoutPrefixProvider *dpp,
+ const RGWOTPInfo& info, optional_yield y,
+ const PutParams& params = {});
+ int remove_all(const DoutPrefixProvider *dpp,
+ const rgw_user& user, optional_yield y,
+ const RemoveParams& params = {});
+};
diff --git a/src/rgw/driver/rados/rgw_period.cc b/src/rgw/driver/rados/rgw_period.cc
new file mode 100644
index 000000000..61602b354
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_period.cc
@@ -0,0 +1,324 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "rgw_sync.h"
+
+#include "services/svc_zone.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+using namespace rgw_zone_defaults;
+
+int RGWPeriod::get_zonegroup(RGWZoneGroup& zonegroup,
+ const string& zonegroup_id) const
+{
+ map<string, RGWZoneGroup>::const_iterator iter;
+ if (!zonegroup_id.empty()) {
+ iter = period_map.zonegroups.find(zonegroup_id);
+ } else {
+ iter = period_map.zonegroups.find("default");
+ }
+ if (iter != period_map.zonegroups.end()) {
+ zonegroup = iter->second;
+ return 0;
+ }
+
+ return -ENOENT;
+}
+
+int RGWPeriod::get_latest_epoch(const DoutPrefixProvider *dpp, epoch_t& latest_epoch, optional_yield y)
+{
+ RGWPeriodLatestEpochInfo info;
+
+ int ret = read_latest_epoch(dpp, info, y);
+ if (ret < 0) {
+ return ret;
+ }
+
+ latest_epoch = info.epoch;
+
+ return 0;
+}
+
+int RGWPeriod::delete_obj(const DoutPrefixProvider *dpp, optional_yield y)
+{
+ rgw_pool pool(get_pool(cct));
+
+ // delete the object for each period epoch
+ for (epoch_t e = 1; e <= epoch; e++) {
+ RGWPeriod p{get_id(), e};
+ rgw_raw_obj oid{pool, p.get_period_oid()};
+ auto sysobj = sysobj_svc->get_obj(oid);
+ int ret = sysobj.wop().remove(dpp, y);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "WARNING: failed to delete period object " << oid
+ << ": " << cpp_strerror(-ret) << dendl;
+ }
+ }
+
+ // delete the .latest_epoch object
+ rgw_raw_obj oid{pool, get_period_oid_prefix() + get_latest_epoch_oid()};
+ auto sysobj = sysobj_svc->get_obj(oid);
+ int ret = sysobj.wop().remove(dpp, y);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "WARNING: failed to delete period object " << oid
+ << ": " << cpp_strerror(-ret) << dendl;
+ }
+ return ret;
+}
+
+int RGWPeriod::add_zonegroup(const DoutPrefixProvider *dpp, const RGWZoneGroup& zonegroup, optional_yield y)
+{
+ if (zonegroup.realm_id != realm_id) {
+ return 0;
+ }
+ int ret = period_map.update(zonegroup, cct);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: updating period map: " << cpp_strerror(-ret) << dendl;
+ return ret;
+ }
+
+ return store_info(dpp, false, y);
+}
+
+int RGWPeriod::update(const DoutPrefixProvider *dpp, optional_yield y)
+{
+ auto zone_svc = sysobj_svc->get_zone_svc();
+ ldpp_dout(dpp, 20) << __func__ << " realm " << realm_id << " period " << get_id() << dendl;
+ list<string> zonegroups;
+ int ret = zone_svc->list_zonegroups(dpp, zonegroups);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to list zonegroups: " << cpp_strerror(-ret) << dendl;
+ return ret;
+ }
+
+ // clear zone short ids of removed zones. period_map.update() will add the
+ // remaining zones back
+ period_map.short_zone_ids.clear();
+
+ for (auto& iter : zonegroups) {
+ RGWZoneGroup zg(string(), iter);
+ ret = zg.init(dpp, cct, sysobj_svc, y);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "WARNING: zg.init() failed: " << cpp_strerror(-ret) << dendl;
+ continue;
+ }
+
+ if (zg.realm_id != realm_id) {
+ ldpp_dout(dpp, 20) << "skipping zonegroup " << zg.get_name() << " zone realm id " << zg.realm_id << ", not on our realm " << realm_id << dendl;
+ continue;
+ }
+
+ if (zg.master_zone.empty()) {
+ ldpp_dout(dpp, 0) << "ERROR: zonegroup " << zg.get_name() << " should have a master zone " << dendl;
+ return -EINVAL;
+ }
+
+ if (zg.zones.find(zg.master_zone) == zg.zones.end()) {
+ ldpp_dout(dpp, 0) << "ERROR: zonegroup " << zg.get_name()
+                        << " has a non-existent master zone" << dendl;
+ return -EINVAL;
+ }
+
+ if (zg.is_master_zonegroup()) {
+ master_zonegroup = zg.get_id();
+ master_zone = zg.master_zone;
+ }
+
+ int ret = period_map.update(zg, cct);
+ if (ret < 0) {
+ return ret;
+ }
+ }
+
+ ret = period_config.read(dpp, sysobj_svc, realm_id, y);
+ if (ret < 0 && ret != -ENOENT) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to read period config: "
+ << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+ return 0;
+}
+
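+/* Start a new staging period: the current period id becomes the predecessor,
+ * the period map is cleared and the realm epoch is bumped. */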
+void RGWPeriod::fork()
+{
+ ldout(cct, 20) << __func__ << " realm " << realm_id << " period " << id << dendl;
+ predecessor_uuid = id;
+ id = get_staging_id(realm_id);
+ period_map.reset();
+ realm_epoch++;
+}
+
+static int read_sync_status(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver, rgw_meta_sync_status *sync_status)
+{
+ rgw::sal::RadosStore* rados_store = static_cast<rgw::sal::RadosStore*>(driver);
+ // initialize a sync status manager to read the status
+ RGWMetaSyncStatusManager mgr(rados_store, rados_store->svc()->rados->get_async_processor());
+ int r = mgr.init(dpp);
+ if (r < 0) {
+ return r;
+ }
+ r = mgr.read_sync_status(dpp, sync_status);
+ mgr.stop();
+ return r;
+}
+
+int RGWPeriod::update_sync_status(const DoutPrefixProvider *dpp,
+ rgw::sal::Driver* driver, /* for now */
+ const RGWPeriod &current_period,
+ std::ostream& error_stream,
+ bool force_if_stale)
+{
+ rgw_meta_sync_status status;
+ int r = read_sync_status(dpp, driver, &status);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "period failed to read sync status: "
+ << cpp_strerror(-r) << dendl;
+ return r;
+ }
+
+ std::vector<std::string> markers;
+
+ const auto current_epoch = current_period.get_realm_epoch();
+ if (current_epoch != status.sync_info.realm_epoch) {
+ // no sync status markers for the current period
+ ceph_assert(current_epoch > status.sync_info.realm_epoch);
+ const int behind = current_epoch - status.sync_info.realm_epoch;
+ if (!force_if_stale && current_epoch > 1) {
+ error_stream << "ERROR: This zone is " << behind << " period(s) behind "
+ "the current master zone in metadata sync. If this zone is promoted "
+ "to master, any metadata changes during that time are likely to "
+ "be lost.\n"
+ "Waiting for this zone to catch up on metadata sync (see "
+ "'radosgw-admin sync status') is recommended.\n"
+ "To promote this zone to master anyway, add the flag "
+ "--yes-i-really-mean-it." << std::endl;
+ return -EINVAL;
+ }
+ // empty sync status markers - other zones will skip this period during
+ // incremental metadata sync
+ markers.resize(status.sync_info.num_shards);
+ } else {
+ markers.reserve(status.sync_info.num_shards);
+ for (auto& i : status.sync_markers) {
+ auto& marker = i.second;
+ // filter out markers from other periods
+ if (marker.realm_epoch != current_epoch) {
+ marker.marker.clear();
+ }
+ markers.emplace_back(std::move(marker.marker));
+ }
+ }
+
+ std::swap(sync_status, markers);
+ return 0;
+}
+
+int RGWPeriod::commit(const DoutPrefixProvider *dpp,
+ rgw::sal::Driver* driver,
+ RGWRealm& realm, const RGWPeriod& current_period,
+ std::ostream& error_stream, optional_yield y,
+ bool force_if_stale)
+{
+ auto zone_svc = sysobj_svc->get_zone_svc();
+ ldpp_dout(dpp, 20) << __func__ << " realm " << realm.get_id() << " period " << current_period.get_id() << dendl;
+ // gateway must be in the master zone to commit
+ if (master_zone != zone_svc->get_zone_params().get_id()) {
+ error_stream << "Cannot commit period on zone "
+ << zone_svc->get_zone_params().get_id() << ", it must be sent to "
+ "the period's master zone " << master_zone << '.' << std::endl;
+ return -EINVAL;
+ }
+ // period predecessor must match current period
+ if (predecessor_uuid != current_period.get_id()) {
+ error_stream << "Period predecessor " << predecessor_uuid
+ << " does not match current period " << current_period.get_id()
+ << ". Use 'period pull' to get the latest period from the master, "
+ "reapply your changes, and try again." << std::endl;
+ return -EINVAL;
+ }
+ // realm epoch must be 1 greater than current period
+ if (realm_epoch != current_period.get_realm_epoch() + 1) {
+ error_stream << "Period's realm epoch " << realm_epoch
+ << " does not come directly after current realm epoch "
+ << current_period.get_realm_epoch() << ". Use 'realm pull' to get the "
+ "latest realm and period from the master zone, reapply your changes, "
+ "and try again." << std::endl;
+ return -EINVAL;
+ }
+ // did the master zone change?
+ if (master_zone != current_period.get_master_zone()) {
+ // store the current metadata sync status in the period
+ int r = update_sync_status(dpp, driver, current_period, error_stream, force_if_stale);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "failed to update metadata sync status: "
+ << cpp_strerror(-r) << dendl;
+ return r;
+ }
+ // create an object with a new period id
+ r = create(dpp, y, true);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "failed to create new period: " << cpp_strerror(-r) << dendl;
+ return r;
+ }
+ // set as current period
+ r = realm.set_current_period(dpp, *this, y);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "failed to update realm's current period: "
+ << cpp_strerror(-r) << dendl;
+ return r;
+ }
+ ldpp_dout(dpp, 4) << "Promoted to master zone and committed new period "
+ << id << dendl;
+ realm.notify_new_period(dpp, *this, y);
+ return 0;
+ }
+ // period must be based on current epoch
+ if (epoch != current_period.get_epoch()) {
+ error_stream << "Period epoch " << epoch << " does not match "
+ "predecessor epoch " << current_period.get_epoch()
+ << ". Use 'period pull' to get the latest epoch from the master zone, "
+ "reapply your changes, and try again." << std::endl;
+ return -EINVAL;
+ }
+ // set period as next epoch
+ set_id(current_period.get_id());
+ set_epoch(current_period.get_epoch() + 1);
+ set_predecessor(current_period.get_predecessor());
+ realm_epoch = current_period.get_realm_epoch();
+ // write the period to rados
+ int r = store_info(dpp, false, y);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "failed to store period: " << cpp_strerror(-r) << dendl;
+ return r;
+ }
+ // set as latest epoch
+ r = update_latest_epoch(dpp, epoch, y);
+ if (r == -EEXIST) {
+ // already have this epoch (or a more recent one)
+ return 0;
+ }
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "failed to set latest epoch: " << cpp_strerror(-r) << dendl;
+ return r;
+ }
+ r = reflect(dpp, y);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "failed to update local objects: " << cpp_strerror(-r) << dendl;
+ return r;
+ }
+ ldpp_dout(dpp, 4) << "Committed new epoch " << epoch
+ << " for period " << id << dendl;
+ realm.notify_new_period(dpp, *this, y);
+ return 0;
+}
+
+void RGWPeriod::generate_test_instances(list<RGWPeriod*> &o)
+{
+ RGWPeriod *z = new RGWPeriod;
+ o.push_back(z);
+ o.push_back(new RGWPeriod);
+}
+
+
diff --git a/src/rgw/driver/rados/rgw_pubsub_push.cc b/src/rgw/driver/rados/rgw_pubsub_push.cc
new file mode 100644
index 000000000..bdb24ce9a
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_pubsub_push.cc
@@ -0,0 +1,460 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "rgw_pubsub_push.h"
+#include <string>
+#include <sstream>
+#include <algorithm>
+#include "include/buffer_fwd.h"
+#include "common/Formatter.h"
+#include "common/iso_8601.h"
+#include "common/async/completion.h"
+#include "rgw_common.h"
+#include "rgw_data_sync.h"
+#include "rgw_pubsub.h"
+#include "acconfig.h"
+#ifdef WITH_RADOSGW_AMQP_ENDPOINT
+#include "rgw_amqp.h"
+#endif
+#ifdef WITH_RADOSGW_KAFKA_ENDPOINT
+#include "rgw_kafka.h"
+#endif
+#include <boost/asio/yield.hpp>
+#include <boost/algorithm/string.hpp>
+#include <functional>
+#include "rgw_perf_counters.h"
+
+using namespace rgw;
+
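+// serialize the event into a JSON document whose top-level key is the event
+// type's plural name, holding an array containing the single event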
+template<typename EventType>
+std::string json_format_pubsub_event(const EventType& event) {
+ std::stringstream ss;
+ JSONFormatter f(false);
+ {
+ Formatter::ObjectSection s(f, EventType::json_type_plural);
+ {
+ Formatter::ArraySection s(f, EventType::json_type_plural);
+ encode_json("", event, &f);
+ }
+ }
+ f.flush(ss);
+ return ss.str();
+}
+
+bool get_bool(const RGWHTTPArgs& args, const std::string& name, bool default_value) {
+ bool value;
+ bool exists;
+ if (args.get_bool(name.c_str(), &value, &exists) == -EINVAL) {
+ throw RGWPubSubEndpoint::configuration_error("invalid boolean value for " + name);
+ }
+ if (!exists) {
+ return default_value;
+ }
+ return value;
+}
+
+class RGWPubSubHTTPEndpoint : public RGWPubSubEndpoint {
+private:
+ const std::string endpoint;
+ typedef unsigned ack_level_t;
+ ack_level_t ack_level; // TODO: not used for now
+ const bool verify_ssl;
+ const bool cloudevents;
+ static const ack_level_t ACK_LEVEL_ANY = 0;
+ static const ack_level_t ACK_LEVEL_NON_ERROR = 1;
+
+public:
+ RGWPubSubHTTPEndpoint(const std::string& _endpoint, const RGWHTTPArgs& args) :
+ endpoint(_endpoint), verify_ssl(get_bool(args, "verify-ssl", true)), cloudevents(get_bool(args, "cloudevents", false))
+ {
+ bool exists;
+ const auto& str_ack_level = args.get("http-ack-level", &exists);
+ if (!exists || str_ack_level == "any") {
+ // "any" is default
+ ack_level = ACK_LEVEL_ANY;
+ } else if (str_ack_level == "non-error") {
+ ack_level = ACK_LEVEL_NON_ERROR;
+ } else {
+ ack_level = std::atoi(str_ack_level.c_str());
+ if (ack_level < 100 || ack_level >= 600) {
+ throw configuration_error("HTTP/S: invalid http-ack-level: " + str_ack_level);
+ }
+ }
+ }
+
+ int send_to_completion_async(CephContext* cct, const rgw_pubsub_s3_event& event, optional_yield y) override {
+ bufferlist read_bl;
+ RGWPostHTTPData request(cct, "POST", endpoint, &read_bl, verify_ssl);
+ const auto post_data = json_format_pubsub_event(event);
+ if (cloudevents) {
+ // following: https://github.com/cloudevents/spec/blob/v1.0.1/http-protocol-binding.md
+ // using "Binary Content Mode"
+ request.append_header("ce-specversion", "1.0");
+ request.append_header("ce-type", "com.amazonaws." + event.eventName);
+ request.append_header("ce-time", to_iso_8601(event.eventTime));
+ // default output of iso8601 is also RFC3339 compatible
+ request.append_header("ce-id", event.x_amz_request_id + "." + event.x_amz_id_2);
+ request.append_header("ce-source", event.eventSource + "." + event.awsRegion + "." + event.bucket_name);
+ request.append_header("ce-subject", event.object_key);
+ }
+ request.set_post_data(post_data);
+ request.set_send_length(post_data.length());
+ request.append_header("Content-Type", "application/json");
+ if (perfcounter) perfcounter->inc(l_rgw_pubsub_push_pending);
+ const auto rc = RGWHTTP::process(&request, y);
+ if (perfcounter) perfcounter->dec(l_rgw_pubsub_push_pending);
+ // TODO: use read_bl to process return code and handle according to ack level
+ return rc;
+ }
+
+ std::string to_str() const override {
+ std::string str("HTTP/S Endpoint");
+ str += "\nURI: " + endpoint;
+ str += (verify_ssl ? "\nverify SSL" : "\ndon't verify SSL");
+ return str;
+ }
+};
+
+#ifdef WITH_RADOSGW_AMQP_ENDPOINT
+class RGWPubSubAMQPEndpoint : public RGWPubSubEndpoint {
+private:
+ enum class ack_level_t {
+ None,
+ Broker,
+ Routable
+ };
+ CephContext* const cct;
+ const std::string endpoint;
+ const std::string topic;
+ const std::string exchange;
+ ack_level_t ack_level;
+ amqp::connection_id_t conn_id;
+
+ bool get_verify_ssl(const RGWHTTPArgs& args) {
+ bool exists;
+ auto str_verify_ssl = args.get("verify-ssl", &exists);
+ if (!exists) {
+ // verify server certificate by default
+ return true;
+ }
+ boost::algorithm::to_lower(str_verify_ssl);
+ if (str_verify_ssl == "true") {
+ return true;
+ }
+ if (str_verify_ssl == "false") {
+ return false;
+ }
+ throw configuration_error("'verify-ssl' must be true/false, not: " + str_verify_ssl);
+ }
+
+ std::string get_exchange(const RGWHTTPArgs& args) {
+ bool exists;
+ const auto exchange = args.get("amqp-exchange", &exists);
+ if (!exists) {
+ throw configuration_error("AMQP: missing amqp-exchange");
+ }
+ return exchange;
+ }
+
+ ack_level_t get_ack_level(const RGWHTTPArgs& args) {
+ bool exists;
+ const auto& str_ack_level = args.get("amqp-ack-level", &exists);
+ if (!exists || str_ack_level == "broker") {
+ // "broker" is default
+ return ack_level_t::Broker;
+ }
+ if (str_ack_level == "none") {
+ return ack_level_t::None;
+ }
+ if (str_ack_level == "routable") {
+ return ack_level_t::Routable;
+ }
+ throw configuration_error("AMQP: invalid amqp-ack-level: " + str_ack_level);
+ }
+
+public:
+ RGWPubSubAMQPEndpoint(const std::string& _endpoint,
+ const std::string& _topic,
+ const RGWHTTPArgs& args,
+ CephContext* _cct) :
+ cct(_cct),
+ endpoint(_endpoint),
+ topic(_topic),
+ exchange(get_exchange(args)),
+ ack_level(get_ack_level(args)) {
+ if (!amqp::connect(conn_id, endpoint, exchange, (ack_level == ack_level_t::Broker), get_verify_ssl(args), args.get_optional("ca-location"))) {
+ throw configuration_error("AMQP: failed to create connection to: " + endpoint);
+ }
+ }
+
+  // this allows waiting until "finish()" is called from a different thread
+  // waiting either blocks the waiting thread or yields, depending on
+  // compilation flag support and on whether the optional_yield is set
+ class Waiter {
+ using Signature = void(boost::system::error_code);
+ using Completion = ceph::async::Completion<Signature>;
+ std::unique_ptr<Completion> completion = nullptr;
+ int ret;
+
+ mutable std::atomic<bool> done = false;
+ mutable std::mutex lock;
+ mutable std::condition_variable cond;
+
+ template <typename ExecutionContext, typename CompletionToken>
+ auto async_wait(ExecutionContext& ctx, CompletionToken&& token) {
+ boost::asio::async_completion<CompletionToken, Signature> init(token);
+ auto& handler = init.completion_handler;
+ {
+ std::unique_lock l{lock};
+ completion = Completion::create(ctx.get_executor(), std::move(handler));
+ }
+ return init.result.get();
+ }
+
+ public:
+ int wait(optional_yield y) {
+ if (done) {
+ return ret;
+ }
+ if (y) {
+ auto& io_ctx = y.get_io_context();
+ auto& yield_ctx = y.get_yield_context();
+ boost::system::error_code ec;
+ async_wait(io_ctx, yield_ctx[ec]);
+ return -ec.value();
+ }
+ std::unique_lock l(lock);
+ cond.wait(l, [this]{return (done==true);});
+ return ret;
+ }
+
+ void finish(int r) {
+ std::unique_lock l{lock};
+ ret = r;
+ done = true;
+ if (completion) {
+ boost::system::error_code ec(-ret, boost::system::system_category());
+ Completion::post(std::move(completion), ec);
+ } else {
+ cond.notify_all();
+ }
+ }
+ };
+
+ int send_to_completion_async(CephContext* cct, const rgw_pubsub_s3_event& event, optional_yield y) override {
+ if (ack_level == ack_level_t::None) {
+ return amqp::publish(conn_id, topic, json_format_pubsub_event(event));
+ } else {
+ // TODO: currently broker and routable are the same - this will require different flags but the same mechanism
+ // note: dynamic allocation of Waiter is needed when this is invoked from a beast coroutine
+ auto w = std::unique_ptr<Waiter>(new Waiter);
+ const auto rc = amqp::publish_with_confirm(conn_id,
+ topic,
+ json_format_pubsub_event(event),
+ std::bind(&Waiter::finish, w.get(), std::placeholders::_1));
+ if (rc < 0) {
+        // failed to publish, do not wait for a reply
+ return rc;
+ }
+ return w->wait(y);
+ }
+ }
+
+ std::string to_str() const override {
+ std::string str("AMQP(0.9.1) Endpoint");
+ str += "\nURI: " + endpoint;
+ str += "\nTopic: " + topic;
+ str += "\nExchange: " + exchange;
+ return str;
+ }
+};
+
+static const std::string AMQP_0_9_1("0-9-1");
+static const std::string AMQP_1_0("1-0");
+static const std::string AMQP_SCHEMA("amqp");
+#endif // ifdef WITH_RADOSGW_AMQP_ENDPOINT
+
+#ifdef WITH_RADOSGW_KAFKA_ENDPOINT
+class RGWPubSubKafkaEndpoint : public RGWPubSubEndpoint {
+private:
+ enum class ack_level_t {
+ None,
+ Broker,
+ };
+ CephContext* const cct;
+ const std::string topic;
+ const ack_level_t ack_level;
+ std::string conn_name;
+
+
+ ack_level_t get_ack_level(const RGWHTTPArgs& args) {
+ bool exists;
+ const auto& str_ack_level = args.get("kafka-ack-level", &exists);
+ if (!exists || str_ack_level == "broker") {
+ // "broker" is default
+ return ack_level_t::Broker;
+ }
+ if (str_ack_level == "none") {
+ return ack_level_t::None;
+ }
+ throw configuration_error("Kafka: invalid kafka-ack-level: " + str_ack_level);
+ }
+
+public:
+ RGWPubSubKafkaEndpoint(const std::string& _endpoint,
+ const std::string& _topic,
+ const RGWHTTPArgs& args,
+ CephContext* _cct) :
+ cct(_cct),
+ topic(_topic),
+ ack_level(get_ack_level(args)) {
+ if (!kafka::connect(conn_name, _endpoint, get_bool(args, "use-ssl", false), get_bool(args, "verify-ssl", true),
+ args.get_optional("ca-location"), args.get_optional("mechanism"))) {
+ throw configuration_error("Kafka: failed to create connection to: " + _endpoint);
+ }
+ }
+
+  // this allows waiting until "finish()" is called from a different thread
+  // waiting either blocks the waiting thread or yields, depending on
+  // compilation flag support and on whether the optional_yield is set
+ class Waiter {
+ using Signature = void(boost::system::error_code);
+ using Completion = ceph::async::Completion<Signature>;
+ std::unique_ptr<Completion> completion = nullptr;
+ int ret;
+
+ mutable std::atomic<bool> done = false;
+ mutable std::mutex lock;
+ mutable std::condition_variable cond;
+
+ template <typename ExecutionContext, typename CompletionToken>
+ auto async_wait(ExecutionContext& ctx, CompletionToken&& token) {
+ boost::asio::async_completion<CompletionToken, Signature> init(token);
+ auto& handler = init.completion_handler;
+ {
+ std::unique_lock l{lock};
+ completion = Completion::create(ctx.get_executor(), std::move(handler));
+ }
+ return init.result.get();
+ }
+
+ public:
+ int wait(optional_yield y) {
+ if (done) {
+ return ret;
+ }
+ if (y) {
+ auto& io_ctx = y.get_io_context();
+ auto& yield_ctx = y.get_yield_context();
+ boost::system::error_code ec;
+ async_wait(io_ctx, yield_ctx[ec]);
+ return -ec.value();
+ }
+ std::unique_lock l(lock);
+ cond.wait(l, [this]{return (done==true);});
+ return ret;
+ }
+
+ void finish(int r) {
+ std::unique_lock l{lock};
+ ret = r;
+ done = true;
+ if (completion) {
+ boost::system::error_code ec(-ret, boost::system::system_category());
+ Completion::post(std::move(completion), ec);
+ } else {
+ cond.notify_all();
+ }
+ }
+ };
+
+ int send_to_completion_async(CephContext* cct, const rgw_pubsub_s3_event& event, optional_yield y) override {
+ if (ack_level == ack_level_t::None) {
+ return kafka::publish(conn_name, topic, json_format_pubsub_event(event));
+ } else {
+ // note: dynamic allocation of Waiter is needed when this is invoked from a beast coroutine
+ auto w = std::unique_ptr<Waiter>(new Waiter);
+ const auto rc = kafka::publish_with_confirm(conn_name,
+ topic,
+ json_format_pubsub_event(event),
+ std::bind(&Waiter::finish, w.get(), std::placeholders::_1));
+ if (rc < 0) {
+        // failed to publish, do not wait for a reply
+ return rc;
+ }
+ return w->wait(y);
+ }
+ }
+
+ std::string to_str() const override {
+ std::string str("Kafka Endpoint");
+ str += "\nBroker: " + conn_name;
+ str += "\nTopic: " + topic;
+ return str;
+ }
+};
+
+static const std::string KAFKA_SCHEMA("kafka");
+#endif // ifdef WITH_RADOSGW_KAFKA_ENDPOINT
+
+static const std::string WEBHOOK_SCHEMA("webhook");
+static const std::string UNKNOWN_SCHEMA("unknown");
+static const std::string NO_SCHEMA("");
+
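+// map the part of the endpoint URI before the first ':' to one of the known
+// endpoint schemas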
+const std::string& get_schema(const std::string& endpoint) {
+ if (endpoint.empty()) {
+ return NO_SCHEMA;
+ }
+ const auto pos = endpoint.find(':');
+ if (pos == std::string::npos) {
+ return UNKNOWN_SCHEMA;
+ }
+ const auto& schema = endpoint.substr(0,pos);
+ if (schema == "http" || schema == "https") {
+ return WEBHOOK_SCHEMA;
+#ifdef WITH_RADOSGW_AMQP_ENDPOINT
+ } else if (schema == "amqp" || schema == "amqps") {
+ return AMQP_SCHEMA;
+#endif
+#ifdef WITH_RADOSGW_KAFKA_ENDPOINT
+ } else if (schema == "kafka") {
+ return KAFKA_SCHEMA;
+#endif
+ }
+ return UNKNOWN_SCHEMA;
+}
+
+RGWPubSubEndpoint::Ptr RGWPubSubEndpoint::create(const std::string& endpoint,
+ const std::string& topic,
+ const RGWHTTPArgs& args,
+ CephContext* cct) {
+ const auto& schema = get_schema(endpoint);
+ if (schema == WEBHOOK_SCHEMA) {
+ return Ptr(new RGWPubSubHTTPEndpoint(endpoint, args));
+#ifdef WITH_RADOSGW_AMQP_ENDPOINT
+ } else if (schema == AMQP_SCHEMA) {
+ bool exists;
+ std::string version = args.get("amqp-version", &exists);
+ if (!exists) {
+ version = AMQP_0_9_1;
+ }
+ if (version == AMQP_0_9_1) {
+ return Ptr(new RGWPubSubAMQPEndpoint(endpoint, topic, args, cct));
+ } else if (version == AMQP_1_0) {
+ throw configuration_error("AMQP: v1.0 not supported");
+ return nullptr;
+ } else {
+ throw configuration_error("AMQP: unknown version: " + version);
+ return nullptr;
+ }
+#endif
+#ifdef WITH_RADOSGW_KAFKA_ENDPOINT
+ } else if (schema == KAFKA_SCHEMA) {
+ return Ptr(new RGWPubSubKafkaEndpoint(endpoint, topic, args, cct));
+#endif
+ }
+
+ throw configuration_error("unknown schema in: " + endpoint);
+ return nullptr;
+}
+
diff --git a/src/rgw/driver/rados/rgw_pubsub_push.h b/src/rgw/driver/rados/rgw_pubsub_push.h
new file mode 100644
index 000000000..17905937c
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_pubsub_push.h
@@ -0,0 +1,47 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+#pragma once
+
+#include <string>
+#include <memory>
+#include <stdexcept>
+#include "include/buffer_fwd.h"
+#include "include/common_fwd.h"
+#include "common/async/yield_context.h"
+
+// TODO the env should be used as a template parameter to differentiate the source that triggers the pushes
+class RGWDataSyncEnv;
+class RGWHTTPArgs;
+struct rgw_pubsub_s3_event;
+
+// endpoint base class; all endpoint types should derive from it
+class RGWPubSubEndpoint {
+public:
+ RGWPubSubEndpoint() = default;
+ // endpoint should not be copied
+ RGWPubSubEndpoint(const RGWPubSubEndpoint&) = delete;
+ const RGWPubSubEndpoint& operator=(const RGWPubSubEndpoint&) = delete;
+
+ typedef std::unique_ptr<RGWPubSubEndpoint> Ptr;
+
+ // factory method for the actual notification endpoint
+ // derived class specific arguments are passed in http args format
+ // may throw a configuration_error if creation fails
+ static Ptr create(const std::string& endpoint, const std::string& topic, const RGWHTTPArgs& args, CephContext *cct=nullptr);
+
+  // this method is used to send an S3-compliant notification and wait for completion
+  // in an async manner via a coroutine when invoked in the frontend environment
+ virtual int send_to_completion_async(CephContext* cct, const rgw_pubsub_s3_event& event, optional_yield y) = 0;
+
+ // present as string
+ virtual std::string to_str() const { return ""; }
+
+ virtual ~RGWPubSubEndpoint() = default;
+
+ // exception object for configuration error
+ struct configuration_error : public std::logic_error {
+ configuration_error(const std::string& what_arg) :
+ std::logic_error("pubsub endpoint configuration error: " + what_arg) {}
+ };
+};
+
diff --git a/src/rgw/driver/rados/rgw_putobj_processor.cc b/src/rgw/driver/rados/rgw_putobj_processor.cc
new file mode 100644
index 000000000..e453db5a9
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_putobj_processor.cc
@@ -0,0 +1,761 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2018 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "include/rados/librados.hpp"
+#include "rgw_aio.h"
+#include "rgw_putobj_processor.h"
+#include "rgw_multi.h"
+#include "rgw_compression.h"
+#include "services/svc_sys_obj.h"
+#include "services/svc_zone.h"
+#include "rgw_sal_rados.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+namespace rgw::putobj {
+
+/*
+ * For cloud-tiered objects, update the object manifest with the
+ * cloudtier config info read from the attrs.
+ * Since these attrs are used internally only for replication, do not store them
+ * in the head object.
+ */
+void read_cloudtier_info_from_attrs(rgw::sal::Attrs& attrs, RGWObjCategory& category,
+ RGWObjManifest& manifest) {
+ auto attr_iter = attrs.find(RGW_ATTR_CLOUD_TIER_TYPE);
+ if (attr_iter != attrs.end()) {
+ auto i = attr_iter->second;
+ string m = i.to_str();
+
+ if (m == "cloud-s3") {
+ category = RGWObjCategory::CloudTiered;
+ manifest.set_tier_type("cloud-s3");
+
+ auto config_iter = attrs.find(RGW_ATTR_CLOUD_TIER_CONFIG);
+ if (config_iter != attrs.end()) {
+ auto i = config_iter->second.cbegin();
+ RGWObjTier tier_config;
+
+ try {
+ using ceph::decode;
+ decode(tier_config, i);
+ manifest.set_tier_config(tier_config);
+ attrs.erase(config_iter);
+ } catch (buffer::error& err) {
+ }
+ }
+ }
+ attrs.erase(attr_iter);
+ }
+}
+
+int HeadObjectProcessor::process(bufferlist&& data, uint64_t logical_offset)
+{
+ const bool flush = (data.length() == 0);
+
+ // capture the first chunk for special handling
+ if (data_offset < head_chunk_size || data_offset == 0) {
+ if (flush) {
+ // flush partial chunk
+ return process_first_chunk(std::move(head_data), &processor);
+ }
+
+ auto remaining = head_chunk_size - data_offset;
+ auto count = std::min<uint64_t>(data.length(), remaining);
+ data.splice(0, count, &head_data);
+ data_offset += count;
+
+ if (data_offset == head_chunk_size) {
+ // process the first complete chunk
+ ceph_assert(head_data.length() == head_chunk_size);
+ int r = process_first_chunk(std::move(head_data), &processor);
+ if (r < 0) {
+ return r;
+ }
+ }
+ if (data.length() == 0) { // avoid flushing stripe processor
+ return 0;
+ }
+ }
+ ceph_assert(processor); // process_first_chunk() must initialize
+
+ // send everything else through the processor
+ auto write_offset = data_offset;
+ data_offset += data.length();
+ return processor->process(std::move(data), write_offset);
+}
+
+
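+// record the raw objects that were written successfully and return the first
+// error code encountered, if any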
+static int process_completed(const AioResultList& completed, RawObjSet *written)
+{
+ std::optional<int> error;
+ for (auto& r : completed) {
+ if (r.result >= 0) {
+ written->insert(r.obj.get_ref().obj);
+ } else if (!error) { // record first error code
+ error = r.result;
+ }
+ }
+ return error.value_or(0);
+}
+
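+// mark already-compressed object data as incompressible in the allocation hint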
+void RadosWriter::add_write_hint(librados::ObjectWriteOperation& op) {
+ const RGWObjStateManifest *sm = obj_ctx.get_state(head_obj);
+ const bool compressed = sm->state.compressed;
+ uint32_t alloc_hint_flags = 0;
+ if (compressed) {
+ alloc_hint_flags |= librados::ALLOC_HINT_FLAG_INCOMPRESSIBLE;
+ }
+
+ op.set_alloc_hint2(0, 0, alloc_hint_flags);
+}
+
+int RadosWriter::set_stripe_obj(const rgw_raw_obj& raw_obj)
+{
+ stripe_obj = store->svc.rados->obj(raw_obj);
+ return stripe_obj.open(dpp);
+}
+
+int RadosWriter::process(bufferlist&& bl, uint64_t offset)
+{
+ bufferlist data = std::move(bl);
+ const uint64_t cost = data.length();
+ if (cost == 0) { // no empty writes, use aio directly for creates
+ return 0;
+ }
+ librados::ObjectWriteOperation op;
+ add_write_hint(op);
+ if (offset == 0) {
+ op.write_full(data);
+ } else {
+ op.write(offset, data);
+ }
+ constexpr uint64_t id = 0; // unused
+ auto c = aio->get(stripe_obj, Aio::librados_op(std::move(op), y), cost, id);
+ return process_completed(c, &written);
+}
+
+int RadosWriter::write_exclusive(const bufferlist& data)
+{
+ const uint64_t cost = data.length();
+
+ librados::ObjectWriteOperation op;
+ op.create(true); // exclusive create
+ add_write_hint(op);
+ op.write_full(data);
+
+ constexpr uint64_t id = 0; // unused
+ auto c = aio->get(stripe_obj, Aio::librados_op(std::move(op), y), cost, id);
+ auto d = aio->drain();
+ c.splice(c.end(), d);
+ return process_completed(c, &written);
+}
+
+int RadosWriter::drain()
+{
+ return process_completed(aio->drain(), &written);
+}
+
+RadosWriter::~RadosWriter()
+{
+ // wait on any outstanding aio completions
+ process_completed(aio->drain(), &written);
+
+ bool need_to_remove_head = false;
+ std::optional<rgw_raw_obj> raw_head;
+ if (!head_obj.empty()) {
+ raw_head.emplace();
+ store->obj_to_raw(bucket_info.placement_rule, head_obj, &*raw_head);
+ }
+
+ /**
+   * We should delete the object in the "multipart" namespace to avoid a race condition.
+   * Such a race condition is caused by the fact that the multipart object is the gatekeeper of a multipart
+   * upload: when it is deleted, a second upload could start with the same suffix ("2/"), and therefore objects
+   * written by the second upload may be deleted by the first upload.
+   * Details are described in #11749.
+   *
+   * The above comment still stands, but instead of searching for a specific object in the multipart
+   * namespace, we just make sure that we remove the object that is marked as the head object after
+   * we remove all the other raw objects. Note that we use a different call to remove the head object,
+ * as this one needs to go via the bucket index prepare/complete 2-phase commit scheme.
+ */
+ for (const auto& obj : written) {
+ if (raw_head && obj == *raw_head) {
+ ldpp_dout(dpp, 5) << "NOTE: we should not process the head object (" << obj << ") here" << dendl;
+ need_to_remove_head = true;
+ continue;
+ }
+
+ int r = store->delete_raw_obj(dpp, obj);
+ if (r < 0 && r != -ENOENT) {
+ ldpp_dout(dpp, 0) << "WARNING: failed to remove obj (" << obj << "), leaked" << dendl;
+ }
+ }
+
+ if (need_to_remove_head) {
+ std::string version_id;
+ ldpp_dout(dpp, 5) << "NOTE: we are going to process the head obj (" << *raw_head << ")" << dendl;
+ int r = store->delete_obj(dpp, obj_ctx, bucket_info, head_obj, 0, 0);
+ if (r < 0 && r != -ENOENT) {
+ ldpp_dout(dpp, 0) << "WARNING: failed to remove obj (" << *raw_head << "), leaked" << dendl;
+ }
+ }
+}
+
+
+// advance to the next stripe
+int ManifestObjectProcessor::next(uint64_t offset, uint64_t *pstripe_size)
+{
+ // advance the manifest
+ int r = manifest_gen.create_next(offset);
+ if (r < 0) {
+ return r;
+ }
+
+ rgw_raw_obj stripe_obj = manifest_gen.get_cur_obj(store);
+
+ uint64_t chunk_size = 0;
+ r = store->get_max_chunk_size(stripe_obj.pool, &chunk_size, dpp);
+ if (r < 0) {
+ return r;
+ }
+ r = writer.set_stripe_obj(stripe_obj);
+ if (r < 0) {
+ return r;
+ }
+
+ chunk = ChunkProcessor(&writer, chunk_size);
+ *pstripe_size = manifest_gen.cur_stripe_max_size();
+ return 0;
+}
+
+
+
+int AtomicObjectProcessor::process_first_chunk(bufferlist&& data,
+ DataProcessor **processor)
+{
+ first_chunk = std::move(data);
+ *processor = &stripe;
+ return 0;
+}
+
+int AtomicObjectProcessor::prepare(optional_yield y)
+{
+ uint64_t max_head_chunk_size;
+ uint64_t head_max_size;
+ uint64_t chunk_size = 0;
+ uint64_t alignment;
+ rgw_pool head_pool;
+
+ if (!store->get_obj_data_pool(bucket_info.placement_rule, head_obj, &head_pool)) {
+ return -EIO;
+ }
+
+ int r = store->get_max_chunk_size(head_pool, &max_head_chunk_size, dpp, &alignment);
+ if (r < 0) {
+ return r;
+ }
+
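+  /* if the tail placement resolves to a different pool than the head, keep no
+   * data in the head object (head_max_size = 0) */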
+ bool same_pool = true;
+ if (bucket_info.placement_rule != tail_placement_rule) {
+ rgw_pool tail_pool;
+ if (!store->get_obj_data_pool(tail_placement_rule, head_obj, &tail_pool)) {
+ return -EIO;
+ }
+
+ if (tail_pool != head_pool) {
+ same_pool = false;
+
+ r = store->get_max_chunk_size(tail_pool, &chunk_size, dpp);
+ if (r < 0) {
+ return r;
+ }
+
+ head_max_size = 0;
+ }
+ }
+
+ if (same_pool) {
+ RGWZonePlacementInfo placement_info;
+ if (!store->svc.zone->get_zone_params().get_placement(bucket_info.placement_rule.name, &placement_info) || placement_info.inline_data) {
+ head_max_size = max_head_chunk_size;
+ } else {
+ head_max_size = 0;
+ }
+ chunk_size = max_head_chunk_size;
+ }
+
+ uint64_t stripe_size;
+ const uint64_t default_stripe_size = store->ctx()->_conf->rgw_obj_stripe_size;
+
+ store->get_max_aligned_size(default_stripe_size, alignment, &stripe_size);
+
+ manifest.set_trivial_rule(head_max_size, stripe_size);
+
+ r = manifest_gen.create_begin(store->ctx(), &manifest,
+ bucket_info.placement_rule,
+ &tail_placement_rule,
+ head_obj.bucket, head_obj);
+ if (r < 0) {
+ return r;
+ }
+
+ rgw_raw_obj stripe_obj = manifest_gen.get_cur_obj(store);
+
+ r = writer.set_stripe_obj(stripe_obj);
+ if (r < 0) {
+ return r;
+ }
+
+ set_head_chunk_size(head_max_size);
+ // initialize the processors
+ chunk = ChunkProcessor(&writer, chunk_size);
+ stripe = StripeProcessor(&chunk, this, head_max_size);
+ return 0;
+}
+
+int AtomicObjectProcessor::complete(size_t accounted_size,
+ const std::string& etag,
+ ceph::real_time *mtime,
+ ceph::real_time set_mtime,
+ rgw::sal::Attrs& attrs,
+ ceph::real_time delete_at,
+ const char *if_match,
+ const char *if_nomatch,
+ const std::string *user_data,
+ rgw_zone_set *zones_trace,
+ bool *pcanceled, optional_yield y)
+{
+ int r = writer.drain();
+ if (r < 0) {
+ return r;
+ }
+ const uint64_t actual_size = get_actual_size();
+ r = manifest_gen.create_next(actual_size);
+ if (r < 0) {
+ return r;
+ }
+
+ obj_ctx.set_atomic(head_obj);
+
+ RGWRados::Object op_target(store, bucket_info, obj_ctx, head_obj);
+
+ /* some object types shouldn't be versioned, e.g., multipart parts */
+ op_target.set_versioning_disabled(!bucket_info.versioning_enabled());
+
+ RGWRados::Object::Write obj_op(&op_target);
+ obj_op.meta.data = &first_chunk;
+ obj_op.meta.manifest = &manifest;
+ obj_op.meta.ptag = &unique_tag; /* use req_id as operation tag */
+ obj_op.meta.if_match = if_match;
+ obj_op.meta.if_nomatch = if_nomatch;
+ obj_op.meta.mtime = mtime;
+ obj_op.meta.set_mtime = set_mtime;
+ obj_op.meta.owner = owner;
+ obj_op.meta.flags = PUT_OBJ_CREATE;
+ obj_op.meta.olh_epoch = olh_epoch;
+ obj_op.meta.delete_at = delete_at;
+ obj_op.meta.user_data = user_data;
+ obj_op.meta.zones_trace = zones_trace;
+ obj_op.meta.modify_tail = true;
+
+ read_cloudtier_info_from_attrs(attrs, obj_op.meta.category, manifest);
+
+ r = obj_op.write_meta(dpp, actual_size, accounted_size, attrs, y);
+ if (r < 0) {
+ if (r == -ETIMEDOUT) {
+      // The head object write may eventually succeed, so clear the set of objects for
+      // deletion. If it never succeeds, we'll orphan any tail objects as if we'd crashed
+      // before that write.
+ writer.clear_written();
+ }
+ return r;
+ }
+ if (!obj_op.meta.canceled) {
+ // on success, clear the set of objects for deletion
+ writer.clear_written();
+ }
+ if (pcanceled) {
+ *pcanceled = obj_op.meta.canceled;
+ }
+ return 0;
+}
+
+
+int MultipartObjectProcessor::process_first_chunk(bufferlist&& data,
+ DataProcessor **processor)
+{
+ // write the first chunk of the head object as part of an exclusive create,
+ // then drain to wait for the result in case of EEXIST
+ int r = writer.write_exclusive(data);
+ if (r == -EEXIST) {
+ // randomize the oid prefix and reprepare the head/manifest
+ std::string oid_rand = gen_rand_alphanumeric(store->ctx(), 32);
+
+ mp.init(target_obj.key.name, upload_id, oid_rand);
+ manifest.set_prefix(target_obj.key.name + "." + oid_rand);
+
+ r = prepare_head();
+ if (r < 0) {
+ return r;
+ }
+ // resubmit the write op on the new head object
+ r = writer.write_exclusive(data);
+ }
+ if (r < 0) {
+ return r;
+ }
+ *processor = &stripe;
+ return 0;
+}
+
+int MultipartObjectProcessor::prepare_head()
+{
+ const uint64_t default_stripe_size = store->ctx()->_conf->rgw_obj_stripe_size;
+ uint64_t chunk_size;
+ uint64_t stripe_size;
+ uint64_t alignment;
+
+ int r = store->get_max_chunk_size(tail_placement_rule, target_obj, &chunk_size, dpp, &alignment);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: unexpected: get_max_chunk_size(): placement_rule=" << tail_placement_rule.to_str() << " obj=" << target_obj << " returned r=" << r << dendl;
+ return r;
+ }
+ store->get_max_aligned_size(default_stripe_size, alignment, &stripe_size);
+
+ manifest.set_multipart_part_rule(stripe_size, part_num);
+
+ r = manifest_gen.create_begin(store->ctx(), &manifest,
+ bucket_info.placement_rule,
+ &tail_placement_rule,
+ target_obj.bucket, target_obj);
+ if (r < 0) {
+ return r;
+ }
+
+ rgw_raw_obj stripe_obj = manifest_gen.get_cur_obj(store);
+ RGWSI_Tier_RADOS::raw_obj_to_obj(head_obj.bucket, stripe_obj, &head_obj);
+ head_obj.index_hash_source = target_obj.key.name;
+
+ r = writer.set_stripe_obj(stripe_obj);
+ if (r < 0) {
+ return r;
+ }
+ stripe_size = manifest_gen.cur_stripe_max_size();
+ set_head_chunk_size(stripe_size);
+
+ chunk = ChunkProcessor(&writer, chunk_size);
+ stripe = StripeProcessor(&chunk, this, stripe_size);
+ return 0;
+}
+
+int MultipartObjectProcessor::prepare(optional_yield y)
+{
+ manifest.set_prefix(target_obj.key.name + "." + upload_id);
+
+ return prepare_head();
+}
+
+int MultipartObjectProcessor::complete(size_t accounted_size,
+ const std::string& etag,
+ ceph::real_time *mtime,
+ ceph::real_time set_mtime,
+ std::map<std::string, bufferlist>& attrs,
+ ceph::real_time delete_at,
+ const char *if_match,
+ const char *if_nomatch,
+ const std::string *user_data,
+ rgw_zone_set *zones_trace,
+ bool *pcanceled, optional_yield y)
+{
+ int r = writer.drain();
+ if (r < 0) {
+ return r;
+ }
+ const uint64_t actual_size = get_actual_size();
+ r = manifest_gen.create_next(actual_size);
+ if (r < 0) {
+ return r;
+ }
+
+ RGWRados::Object op_target(store, bucket_info, obj_ctx, head_obj);
+ op_target.set_versioning_disabled(true);
+ op_target.set_meta_placement_rule(&tail_placement_rule);
+
+ RGWRados::Object::Write obj_op(&op_target);
+ obj_op.meta.set_mtime = set_mtime;
+ obj_op.meta.mtime = mtime;
+ obj_op.meta.owner = owner;
+ obj_op.meta.delete_at = delete_at;
+ obj_op.meta.zones_trace = zones_trace;
+ obj_op.meta.modify_tail = true;
+
+ r = obj_op.write_meta(dpp, actual_size, accounted_size, attrs, y);
+ if (r < 0)
+ return r;
+
+ RGWUploadPartInfo info;
+ string p = "part.";
+ bool sorted_omap = is_v2_upload_id(upload_id);
+
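+  /* v2 upload ids get zero-padded part numbers so the omap keys sort
+   * numerically */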
+ if (sorted_omap) {
+ char buf[32];
+ snprintf(buf, sizeof(buf), "%08d", part_num);
+ p.append(buf);
+ } else {
+ p.append(part_num_str);
+ }
+ info.num = part_num;
+ info.etag = etag;
+ info.size = actual_size;
+ info.accounted_size = accounted_size;
+ info.modified = real_clock::now();
+ info.manifest = manifest;
+
+ bool compressed;
+ r = rgw_compression_info_from_attrset(attrs, compressed, info.cs_info);
+ if (r < 0) {
+ ldpp_dout(dpp, 1) << "cannot get compression info" << dendl;
+ return r;
+ }
+
+ rgw_obj meta_obj;
+ meta_obj.init_ns(bucket_info.bucket, mp.get_meta(), RGW_OBJ_NS_MULTIPART);
+ meta_obj.set_in_extra_data(true);
+
+ rgw_raw_obj meta_raw_obj;
+ store->obj_to_raw(bucket_info.placement_rule, meta_obj, &meta_raw_obj);
+
+ rgw_rados_ref meta_obj_ref;
+ r = store->get_raw_obj_ref(dpp, meta_raw_obj, &meta_obj_ref);
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << "ERROR: failed to get obj ref of meta obj with ret=" << r << dendl;
+ return r;
+ }
+
+ librados::ObjectWriteOperation op;
+ cls_rgw_mp_upload_part_info_update(op, p, info);
+ r = rgw_rados_operate(dpp, meta_obj_ref.pool.ioctx(), meta_obj_ref.obj.oid, &op, y);
+ ldpp_dout(dpp, 20) << "Update meta: " << meta_obj_ref.obj.oid << " part " << p << " prefix " << info.manifest.get_prefix() << " return " << r << dendl;
+
+ if (r == -EOPNOTSUPP) {
+ // New CLS call to update part info is not yet supported. Fall back to the old handling.
+ bufferlist bl;
+ encode(info, bl);
+
+ map<string, bufferlist> m;
+ m[p] = bl;
+
+ op = librados::ObjectWriteOperation{};
+ op.assert_exists(); // detect races with abort
+ op.omap_set(m);
+ r = rgw_rados_operate(dpp, meta_obj_ref.pool.ioctx(), meta_obj_ref.obj.oid, &op, y);
+ }
+ if (r < 0) {
+ return r == -ENOENT ? -ERR_NO_SUCH_UPLOAD : r;
+ }
+
+ if (!obj_op.meta.canceled) {
+ // on success, clear the set of objects for deletion
+ writer.clear_written();
+ }
+ if (pcanceled) {
+ *pcanceled = obj_op.meta.canceled;
+ }
+ return 0;
+}
+
+int AppendObjectProcessor::process_first_chunk(bufferlist &&data, rgw::sal::DataProcessor **processor)
+{
+ int r = writer.write_exclusive(data);
+ if (r < 0) {
+ return r;
+ }
+ *processor = &stripe;
+ return 0;
+}
+
+int AppendObjectProcessor::prepare(optional_yield y)
+{
+ RGWObjState *astate;
+ int r = store->get_obj_state(dpp, &obj_ctx, bucket_info, head_obj,
+ &astate, &cur_manifest, y);
+ if (r < 0) {
+ return r;
+ }
+ cur_size = astate->size;
+ *cur_accounted_size = astate->accounted_size;
+ if (!astate->exists) {
+ if (position != 0) {
+ ldpp_dout(dpp, 5) << "ERROR: Append position should be zero" << dendl;
+ return -ERR_POSITION_NOT_EQUAL_TO_LENGTH;
+ } else {
+ cur_part_num = 1;
+ //set the prefix
+ char buf[33];
+ gen_rand_alphanumeric(store->ctx(), buf, sizeof(buf) - 1);
+ string oid_prefix = head_obj.key.name;
+ oid_prefix.append(".");
+ oid_prefix.append(buf);
+ oid_prefix.append("_");
+ manifest.set_prefix(oid_prefix);
+ }
+ } else {
+    // check whether the object is appendable
+ map<string, bufferlist>::iterator iter = astate->attrset.find(RGW_ATTR_APPEND_PART_NUM);
+ if (iter == astate->attrset.end()) {
+ ldpp_dout(dpp, 5) << "ERROR: The object is not appendable" << dendl;
+ return -ERR_OBJECT_NOT_APPENDABLE;
+ }
+ if (position != *cur_accounted_size) {
+ ldpp_dout(dpp, 5) << "ERROR: Append position should be equal to the obj size" << dendl;
+ return -ERR_POSITION_NOT_EQUAL_TO_LENGTH;
+ }
+ try {
+ using ceph::decode;
+ decode(cur_part_num, iter->second);
+ } catch (buffer::error& err) {
+ ldpp_dout(dpp, 5) << "ERROR: failed to decode part num" << dendl;
+ return -EIO;
+ }
+ cur_part_num++;
+ //get the current obj etag
+ iter = astate->attrset.find(RGW_ATTR_ETAG);
+ if (iter != astate->attrset.end()) {
+ string s = rgw_string_unquote(iter->second.c_str());
+ size_t pos = s.find("-");
+ cur_etag = s.substr(0, pos);
+ }
+
+ iter = astate->attrset.find(RGW_ATTR_STORAGE_CLASS);
+ if (iter != astate->attrset.end()) {
+ tail_placement_rule.storage_class = iter->second.to_str();
+ } else {
+ tail_placement_rule.storage_class = RGW_STORAGE_CLASS_STANDARD;
+ }
+ manifest.set_prefix(cur_manifest->get_prefix());
+ astate->keep_tail = true;
+ }
+ manifest.set_multipart_part_rule(store->ctx()->_conf->rgw_obj_stripe_size, cur_part_num);
+
+ r = manifest_gen.create_begin(store->ctx(), &manifest, bucket_info.placement_rule, &tail_placement_rule, head_obj.bucket, head_obj);
+ if (r < 0) {
+ return r;
+ }
+ rgw_raw_obj stripe_obj = manifest_gen.get_cur_obj(store);
+
+ uint64_t chunk_size = 0;
+ r = store->get_max_chunk_size(stripe_obj.pool, &chunk_size, dpp);
+ if (r < 0) {
+ return r;
+ }
+ r = writer.set_stripe_obj(std::move(stripe_obj));
+ if (r < 0) {
+ return r;
+ }
+
+ uint64_t stripe_size = manifest_gen.cur_stripe_max_size();
+
+ uint64_t max_head_size = std::min(chunk_size, stripe_size);
+ set_head_chunk_size(max_head_size);
+
+ // initialize the processors
+ chunk = ChunkProcessor(&writer, chunk_size);
+ stripe = StripeProcessor(&chunk, this, stripe_size);
+
+ return 0;
+}
+
+int AppendObjectProcessor::complete(size_t accounted_size, const string &etag, ceph::real_time *mtime,
+ ceph::real_time set_mtime, rgw::sal::Attrs& attrs,
+ ceph::real_time delete_at, const char *if_match, const char *if_nomatch,
+ const string *user_data, rgw_zone_set *zones_trace, bool *pcanceled,
+ optional_yield y)
+{
+ int r = writer.drain();
+ if (r < 0)
+ return r;
+ const uint64_t actual_size = get_actual_size();
+ r = manifest_gen.create_next(actual_size);
+ if (r < 0) {
+ return r;
+ }
+ obj_ctx.set_atomic(head_obj);
+ RGWRados::Object op_target(store, bucket_info, obj_ctx, head_obj);
+ //For Append obj, disable versioning
+ op_target.set_versioning_disabled(true);
+ RGWRados::Object::Write obj_op(&op_target);
+ if (cur_manifest) {
+ cur_manifest->append(dpp, manifest, store->svc.zone->get_zonegroup(), store->svc.zone->get_zone_params());
+ obj_op.meta.manifest = cur_manifest;
+ } else {
+ obj_op.meta.manifest = &manifest;
+ }
+ obj_op.meta.ptag = &unique_tag; /* use req_id as operation tag */
+ obj_op.meta.mtime = mtime;
+ obj_op.meta.set_mtime = set_mtime;
+ obj_op.meta.owner = owner;
+ obj_op.meta.flags = PUT_OBJ_CREATE;
+ obj_op.meta.delete_at = delete_at;
+ obj_op.meta.user_data = user_data;
+ obj_op.meta.zones_trace = zones_trace;
+ obj_op.meta.modify_tail = true;
+ obj_op.meta.appendable = true;
+ //Add the append part number
+ bufferlist cur_part_num_bl;
+ using ceph::encode;
+ encode(cur_part_num, cur_part_num_bl);
+ attrs[RGW_ATTR_APPEND_PART_NUM] = cur_part_num_bl;
+  // calculate the composite etag: MD5 over the previous and new binary digests, suffixed with "-<part_num>"
+ if (!cur_etag.empty()) {
+ MD5 hash;
+ // Allow use of MD5 digest in FIPS mode for non-cryptographic purposes
+ hash.SetFlags(EVP_MD_CTX_FLAG_NON_FIPS_ALLOW);
+ char petag[CEPH_CRYPTO_MD5_DIGESTSIZE];
+ char final_etag[CEPH_CRYPTO_MD5_DIGESTSIZE];
+ char final_etag_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 16];
+ hex_to_buf(cur_etag.c_str(), petag, CEPH_CRYPTO_MD5_DIGESTSIZE);
+ hash.Update((const unsigned char *)petag, sizeof(petag));
+ hex_to_buf(etag.c_str(), petag, CEPH_CRYPTO_MD5_DIGESTSIZE);
+ hash.Update((const unsigned char *)petag, sizeof(petag));
+ hash.Final((unsigned char *)final_etag);
+ buf_to_hex((unsigned char *)final_etag, sizeof(final_etag), final_etag_str);
+ snprintf(&final_etag_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2], sizeof(final_etag_str) - CEPH_CRYPTO_MD5_DIGESTSIZE * 2,
+ "-%lld", (long long)cur_part_num);
+ bufferlist etag_bl;
+ etag_bl.append(final_etag_str, strlen(final_etag_str) + 1);
+ attrs[RGW_ATTR_ETAG] = etag_bl;
+ }
+ r = obj_op.write_meta(dpp, actual_size + cur_size,
+ accounted_size + *cur_accounted_size,
+ attrs, y);
+ if (r < 0) {
+ return r;
+ }
+ if (!obj_op.meta.canceled) {
+ // on success, clear the set of objects for deletion
+ writer.clear_written();
+ }
+ if (pcanceled) {
+ *pcanceled = obj_op.meta.canceled;
+ }
+ *cur_accounted_size += accounted_size;
+
+ return 0;
+}
+
+} // namespace rgw::putobj
diff --git a/src/rgw/driver/rados/rgw_putobj_processor.h b/src/rgw/driver/rados/rgw_putobj_processor.h
new file mode 100644
index 000000000..fa9200f32
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_putobj_processor.h
@@ -0,0 +1,282 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2018 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include <optional>
+
+#include "rgw_putobj.h"
+#include "services/svc_rados.h"
+#include "services/svc_tier_rados.h"
+#include "rgw_sal.h"
+#include "rgw_obj_manifest.h"
+
+namespace rgw {
+
+namespace sal {
+ class RadosStore;
+}
+
+class Aio;
+
+namespace putobj {
+
+// an object processor with special handling for the first chunk of the head.
+// the virtual process_first_chunk() function returns a processor to handle the
+// rest of the object
+class HeadObjectProcessor : public rgw::sal::ObjectProcessor {
+ uint64_t head_chunk_size;
+ // buffer to capture the first chunk of the head object
+ bufferlist head_data;
+ // initialized after process_first_chunk() to process everything else
+ rgw::sal::DataProcessor *processor = nullptr;
+ uint64_t data_offset = 0; // maximum offset of data written (ie compressed)
+ protected:
+ uint64_t get_actual_size() const { return data_offset; }
+
+ // process the first chunk of data and return a processor for the rest
+ virtual int process_first_chunk(bufferlist&& data,
+ rgw::sal::DataProcessor **processor) = 0;
+ public:
+ HeadObjectProcessor(uint64_t head_chunk_size)
+ : head_chunk_size(head_chunk_size)
+ {}
+
+ void set_head_chunk_size(uint64_t size) { head_chunk_size = size; }
+
+ // cache first chunk for process_first_chunk(), then forward everything else
+ // to the returned processor
+ int process(bufferlist&& data, uint64_t logical_offset) final override;
+};
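+
+// Rough usage sketch (illustrative only; `derived`, `head_size` and the buffers
+// are hypothetical): a derived class implements process_first_chunk(), and the
+// caller feeds data through process() in logical-offset order, typically ending
+// with an empty buffer to flush:
+//
+//   derived.set_head_chunk_size(head_size);
+//   derived.process(std::move(first_bl), 0);    // buffered until head_chunk_size
+//   derived.process(std::move(next_bl), off);   // forwarded to the returned processor
+//   derived.process({}, total_size);            // end-of-data flush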
+
+using RawObjSet = std::set<rgw_raw_obj>;
+
+// a data sink that writes to rados objects and deletes them on cancelation
+class RadosWriter : public rgw::sal::DataProcessor {
+ Aio *const aio;
+ RGWRados *const store;
+ const RGWBucketInfo& bucket_info;
+ RGWObjectCtx& obj_ctx;
+ const rgw_obj head_obj;
+ RGWSI_RADOS::Obj stripe_obj; // current stripe object
+ RawObjSet written; // set of written objects for deletion
+ const DoutPrefixProvider *dpp;
+ optional_yield y;
+
+ public:
+ RadosWriter(Aio *aio, RGWRados *store,
+ const RGWBucketInfo& bucket_info,
+ RGWObjectCtx& obj_ctx, const rgw_obj& _head_obj,
+ const DoutPrefixProvider *dpp, optional_yield y)
+ : aio(aio), store(store), bucket_info(bucket_info),
+ obj_ctx(obj_ctx), head_obj(_head_obj), dpp(dpp), y(y)
+ {}
+ ~RadosWriter();
+
+ // add alloc hint to osd
+ void add_write_hint(librados::ObjectWriteOperation& op);
+
+ // change the current stripe object
+ int set_stripe_obj(const rgw_raw_obj& obj);
+
+ // write the data at the given offset of the current stripe object
+ int process(bufferlist&& data, uint64_t stripe_offset) override;
+
+ // write the data as an exclusive create and wait for it to complete
+ int write_exclusive(const bufferlist& data);
+
+ int drain();
+
+ // when the operation completes successfully, clear the set of written objects
+ // so they aren't deleted on destruction
+ void clear_written() { written.clear(); }
+
+};
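+
+// Illustrative flow (a sketch against the interface above; `raw_obj` and `data`
+// are hypothetical): write a stripe, drain in-flight IO, and clear the written
+// set on success so the destructor does not delete the objects:
+//
+//   RadosWriter w(aio, store, bucket_info, obj_ctx, head_obj, dpp, y);
+//   w.set_stripe_obj(raw_obj);
+//   w.process(std::move(data), 0);   // write at stripe offset 0
+//   w.drain();                       // wait for outstanding aio completions
+//   w.clear_written();               // keep the tail objects on success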
+
+
+// a rados object processor that stripes according to RGWObjManifest
+class ManifestObjectProcessor : public HeadObjectProcessor,
+ public StripeGenerator {
+ protected:
+ RGWRados* const store;
+ RGWBucketInfo& bucket_info;
+ rgw_placement_rule tail_placement_rule;
+ rgw_user owner;
+ RGWObjectCtx& obj_ctx;
+ rgw_obj head_obj;
+
+ RadosWriter writer;
+ RGWObjManifest manifest;
+ RGWObjManifest::generator manifest_gen;
+ ChunkProcessor chunk;
+ StripeProcessor stripe;
+ const DoutPrefixProvider *dpp;
+
+ // implements StripeGenerator
+ int next(uint64_t offset, uint64_t *stripe_size) override;
+
+ public:
+ ManifestObjectProcessor(Aio *aio, RGWRados* store,
+ RGWBucketInfo& bucket_info,
+ const rgw_placement_rule *ptail_placement_rule,
+ const rgw_user& owner, RGWObjectCtx& _obj_ctx,
+ const rgw_obj& _head_obj,
+ const DoutPrefixProvider* dpp, optional_yield y)
+ : HeadObjectProcessor(0),
+ store(store), bucket_info(bucket_info),
+ owner(owner),
+ obj_ctx(_obj_ctx), head_obj(_head_obj),
+ writer(aio, store, bucket_info, obj_ctx, head_obj, dpp, y),
+ chunk(&writer, 0), stripe(&chunk, this, 0), dpp(dpp) {
+ if (ptail_placement_rule) {
+ tail_placement_rule = *ptail_placement_rule;
+ }
+ }
+
+ void set_owner(const rgw_user& _owner) {
+ owner = _owner;
+ }
+
+ void set_tail_placement(const rgw_placement_rule& tpr) {
+ tail_placement_rule = tpr;
+ }
+ void set_tail_placement(const rgw_placement_rule&& tpr) {
+ tail_placement_rule = tpr;
+ }
+
+};
+
+
+// a processor that completes with an atomic write to the head object as part of
+// a bucket index transaction
+class AtomicObjectProcessor : public ManifestObjectProcessor {
+ const std::optional<uint64_t> olh_epoch;
+ const std::string unique_tag;
+ bufferlist first_chunk; // written with the head in complete()
+
+ int process_first_chunk(bufferlist&& data, rgw::sal::DataProcessor **processor) override;
+ public:
+ AtomicObjectProcessor(Aio *aio, RGWRados* store,
+ RGWBucketInfo& bucket_info,
+ const rgw_placement_rule *ptail_placement_rule,
+ const rgw_user& owner,
+ RGWObjectCtx& obj_ctx, const rgw_obj& _head_obj,
+ std::optional<uint64_t> olh_epoch,
+ const std::string& unique_tag,
+ const DoutPrefixProvider *dpp, optional_yield y)
+ : ManifestObjectProcessor(aio, store, bucket_info, ptail_placement_rule,
+ owner, obj_ctx, _head_obj, dpp, y),
+ olh_epoch(olh_epoch), unique_tag(unique_tag)
+ {}
+
+ // prepare a trivial manifest
+ int prepare(optional_yield y) override;
+ // write the head object atomically in a bucket index transaction
+ int complete(size_t accounted_size, const std::string& etag,
+ ceph::real_time *mtime, ceph::real_time set_mtime,
+ std::map<std::string, bufferlist>& attrs,
+ ceph::real_time delete_at,
+ const char *if_match, const char *if_nomatch,
+ const std::string *user_data,
+ rgw_zone_set *zones_trace, bool *canceled,
+ optional_yield y) override;
+
+};
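+
+// Typical call sequence (a sketch, error handling omitted; variable names are
+// hypothetical): prepare() builds the manifest, data flows through the inherited
+// process(), and complete() performs the atomic head write inside a bucket index
+// transaction:
+//
+//   AtomicObjectProcessor p(aio, store, info, nullptr, owner, octx, obj,
+//                           std::nullopt, unique_tag, dpp, y);
+//   p.prepare(y);
+//   p.process(std::move(bl), 0);     // repeat for each chunk, in offset order
+//   p.process({}, size);             // flush
+//   p.complete(size, etag, &mtime, set_mtime, attrs, delete_at,
+//              nullptr, nullptr, nullptr, nullptr, &canceled, y);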
+
+
+// a processor for multipart parts, which don't require atomic completion. the
+// part's head is written with an exclusive create to detect racing uploads of
+// the same part/upload id, which are restarted with a random oid prefix
+class MultipartObjectProcessor : public ManifestObjectProcessor {
+ const rgw_obj target_obj; // target multipart object
+ const std::string upload_id;
+ const int part_num;
+ const std::string part_num_str;
+ RGWMPObj mp;
+
+ // write the first chunk and wait on aio->drain() for its completion.
+ // on EEXIST, retry with random prefix
+ int process_first_chunk(bufferlist&& data, rgw::sal::DataProcessor **processor) override;
+ // prepare the head stripe and manifest
+ int prepare_head();
+ public:
+ MultipartObjectProcessor(Aio *aio, RGWRados* store,
+ RGWBucketInfo& bucket_info,
+ const rgw_placement_rule *ptail_placement_rule,
+ const rgw_user& owner, RGWObjectCtx& obj_ctx,
+ const rgw_obj& _head_obj,
+ const std::string& upload_id, uint64_t part_num,
+ const std::string& part_num_str,
+ const DoutPrefixProvider *dpp, optional_yield y)
+ : ManifestObjectProcessor(aio, store, bucket_info, ptail_placement_rule,
+ owner, obj_ctx, _head_obj, dpp, y),
+ target_obj(head_obj), upload_id(upload_id),
+ part_num(part_num), part_num_str(part_num_str),
+ mp(head_obj.key.name, upload_id)
+ {}
+
+ // prepare a multipart manifest
+ int prepare(optional_yield y) override;
+ // write the head object attributes in a bucket index transaction, then
+ // register the completed part with the multipart meta object
+ int complete(size_t accounted_size, const std::string& etag,
+ ceph::real_time *mtime, ceph::real_time set_mtime,
+ std::map<std::string, bufferlist>& attrs,
+ ceph::real_time delete_at,
+ const char *if_match, const char *if_nomatch,
+ const std::string *user_data,
+ rgw_zone_set *zones_trace, bool *canceled,
+ optional_yield y) override;
+
+};
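+
+// Note (derived from the implementation in rgw_putobj_processor.cc): complete()
+// registers the part on the upload's meta object via
+// cls_rgw_mp_upload_part_info_update, and falls back to a plain omap_set of the
+// encoded RGWUploadPartInfo under "part.<num>" when the OSD returns -EOPNOTSUPP.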
+
+ class AppendObjectProcessor : public ManifestObjectProcessor {
+ uint64_t cur_part_num;
+ uint64_t position;
+ uint64_t cur_size;
+ uint64_t *cur_accounted_size;
+ std::string cur_etag;
+ const std::string unique_tag;
+
+ RGWObjManifest *cur_manifest;
+
+ int process_first_chunk(bufferlist&& data, rgw::sal::DataProcessor **processor) override;
+
+ public:
+ AppendObjectProcessor(Aio *aio, RGWRados* store,
+ RGWBucketInfo& bucket_info,
+ const rgw_placement_rule *ptail_placement_rule,
+ const rgw_user& owner, RGWObjectCtx& obj_ctx,
+ const rgw_obj& _head_obj,
+ const std::string& unique_tag, uint64_t position,
+ uint64_t *cur_accounted_size,
+ const DoutPrefixProvider *dpp, optional_yield y)
+ : ManifestObjectProcessor(aio, store, bucket_info, ptail_placement_rule,
+ owner, obj_ctx, _head_obj, dpp, y),
+ position(position), cur_size(0), cur_accounted_size(cur_accounted_size),
+ unique_tag(unique_tag), cur_manifest(nullptr)
+ {}
+ int prepare(optional_yield y) override;
+ int complete(size_t accounted_size, const std::string& etag,
+ ceph::real_time *mtime, ceph::real_time set_mtime,
+ std::map<std::string, bufferlist>& attrs, ceph::real_time delete_at,
+ const char *if_match, const char *if_nomatch, const std::string *user_data,
+ rgw_zone_set *zones_trace, bool *canceled,
+ optional_yield y) override;
+ };
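+ // Note (derived from prepare()/complete() above): appends are accepted only when
+ // `position` equals the object's current accounted size, the running append part
+ // number is carried in the RGW_ATTR_APPEND_PART_NUM xattr, and the resulting etag
+ // is an MD5 over the previous and new digests suffixed with "-<part_num>".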
+
+} // namespace putobj
+} // namespace rgw
+
diff --git a/src/rgw/driver/rados/rgw_rados.cc b/src/rgw/driver/rados/rgw_rados.cc
new file mode 100644
index 000000000..10018d4a6
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_rados.cc
@@ -0,0 +1,10076 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "include/compat.h"
+#include <errno.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sstream>
+
+#include <boost/algorithm/string.hpp>
+#include <string_view>
+
+#include <boost/container/flat_set.hpp>
+#include <boost/format.hpp>
+#include <boost/optional.hpp>
+#include <boost/utility/in_place_factory.hpp>
+
+#include "common/ceph_json.h"
+
+#include "common/errno.h"
+#include "common/Formatter.h"
+#include "common/Throttle.h"
+#include "common/BackTrace.h"
+
+#include "rgw_sal.h"
+#include "rgw_zone.h"
+#include "rgw_cache.h"
+#include "rgw_acl.h"
+#include "rgw_acl_s3.h" /* for dumping s3policy in debug log */
+#include "rgw_aio_throttle.h"
+#include "driver/rados/rgw_bucket.h"
+#include "rgw_rest_conn.h"
+#include "rgw_cr_rados.h"
+#include "rgw_cr_rest.h"
+#include "rgw_datalog.h"
+#include "rgw_putobj_processor.h"
+
+#include "cls/rgw/cls_rgw_ops.h"
+#include "cls/rgw/cls_rgw_client.h"
+#include "cls/rgw/cls_rgw_const.h"
+#include "cls/refcount/cls_refcount_client.h"
+#include "cls/version/cls_version_client.h"
+#include "osd/osd_types.h"
+
+#include "rgw_tools.h"
+#include "rgw_coroutine.h"
+#include "rgw_compression.h"
+#include "rgw_crypt.h"
+#include "rgw_etag_verifier.h"
+#include "rgw_worker.h"
+#include "rgw_notify.h"
+#include "rgw_http_errors.h"
+
+#undef fork // fails to compile RGWPeriod::fork() below
+
+#include "common/Clock.h"
+
+#include <string>
+#include <iostream>
+#include <vector>
+#include <atomic>
+#include <list>
+#include <map>
+#include "include/random.h"
+
+#include "rgw_gc.h"
+#include "rgw_lc.h"
+
+#include "rgw_object_expirer_core.h"
+#include "rgw_sync.h"
+#include "rgw_sync_counters.h"
+#include "rgw_sync_trace.h"
+#include "rgw_trim_datalog.h"
+#include "rgw_trim_mdlog.h"
+#include "rgw_data_sync.h"
+#include "rgw_realm_watcher.h"
+#include "rgw_reshard.h"
+#include "rgw_cr_rados.h"
+
+#include "services/svc_zone.h"
+#include "services/svc_zone_utils.h"
+#include "services/svc_quota.h"
+#include "services/svc_sync_modules.h"
+#include "services/svc_sys_obj.h"
+#include "services/svc_sys_obj_cache.h"
+#include "services/svc_bucket.h"
+#include "services/svc_mdlog.h"
+
+#include "compressor/Compressor.h"
+
+#include "rgw_d3n_datacache.h"
+
+#ifdef WITH_LTTNG
+#define TRACEPOINT_DEFINE
+#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
+#include "tracing/rgw_rados.h"
+#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
+#undef TRACEPOINT_DEFINE
+#else
+#define tracepoint(...)
+#endif
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+using namespace librados;
+
+#define ldout_bitx(_bitx, _dpp, _level) if(_bitx) { ldpp_dout(_dpp, 0) << "BITX: "
+#define ldout_bitx_c(_bitx, _ctx, _level) if(_bitx) { ldout(_ctx, 0) << "BITX: "
+#define dendl_bitx dendl ; }
+
+static string shadow_ns = "shadow";
+static string default_bucket_index_pool_suffix = "rgw.buckets.index";
+static string default_storage_extra_pool_suffix = "rgw.buckets.non-ec";
+
+static RGWObjCategory main_category = RGWObjCategory::Main;
+#define RGW_USAGE_OBJ_PREFIX "usage."
+
+rgw_raw_obj rgw_obj_select::get_raw_obj(RGWRados* store) const
+{
+ if (!is_raw) {
+ rgw_raw_obj r;
+ store->obj_to_raw(placement_rule, obj, &r);
+ return r;
+ }
+ return raw_obj;
+}
+
+void RGWObjVersionTracker::prepare_op_for_read(ObjectReadOperation* op)
+{
+ obj_version* check_objv = version_for_check();
+
+ if (check_objv) {
+ cls_version_check(*op, *check_objv, VER_COND_EQ);
+ }
+
+ cls_version_read(*op, &read_version);
+}
+
+void RGWObjVersionTracker::prepare_op_for_write(ObjectWriteOperation *op)
+{
+ obj_version* check_objv = version_for_check();
+ obj_version* modify_version = version_for_write();
+
+ if (check_objv) {
+ cls_version_check(*op, *check_objv, VER_COND_EQ);
+ }
+
+ if (modify_version) {
+ cls_version_set(*op, *modify_version);
+ } else {
+ cls_version_inc(*op);
+ }
+}
+
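+// Fold a successful write back into read_version so the next operation can
+// recheck it: if the op only asked cls to increment (no explicit write version)
+// but a version was checked, bump the cached value locally; otherwise adopt the
+// version that was written.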
+void RGWObjVersionTracker::apply_write()
+{
+ const bool checked = (read_version.ver != 0);
+ const bool incremented = (write_version.ver == 0);
+
+ if (checked && incremented) {
+ // apply cls_version_inc() so our next operation can recheck it
+ ++read_version.ver;
+ } else {
+ read_version = write_version;
+ }
+ write_version = obj_version();
+}
+
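+// Return the cached state for obj, creating it if missing. The lookup takes the
+// shared lock; creation re-acquires the lock exclusively.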
+RGWObjStateManifest *RGWObjectCtx::get_state(const rgw_obj& obj) {
+ RGWObjStateManifest *result;
+ typename std::map<rgw_obj, RGWObjStateManifest>::iterator iter;
+ lock.lock_shared();
+ assert (!obj.empty());
+ iter = objs_state.find(obj);
+ if (iter != objs_state.end()) {
+ result = &iter->second;
+ lock.unlock_shared();
+ } else {
+ lock.unlock_shared();
+ lock.lock();
+ result = &objs_state[obj];
+ lock.unlock();
+ }
+ return result;
+}
+
+void RGWObjectCtx::set_compressed(const rgw_obj& obj) {
+ std::unique_lock wl{lock};
+ assert (!obj.empty());
+ objs_state[obj].state.compressed = true;
+}
+
+void RGWObjectCtx::set_atomic(const rgw_obj& obj) {
+ std::unique_lock wl{lock};
+ assert (!obj.empty());
+ objs_state[obj].state.is_atomic = true;
+}
+void RGWObjectCtx::set_prefetch_data(const rgw_obj& obj) {
+ std::unique_lock wl{lock};
+ assert (!obj.empty());
+ objs_state[obj].state.prefetch_data = true;
+}
+
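+// Drop the cached state for obj. If the atomic or prefetch flags were set, a
+// fresh entry is created that preserves them (along with the compressed flag),
+// since callers rely on those hints surviving invalidation.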
+void RGWObjectCtx::invalidate(const rgw_obj& obj) {
+ std::unique_lock wl{lock};
+ auto iter = objs_state.find(obj);
+ if (iter == objs_state.end()) {
+ return;
+ }
+ bool is_atomic = iter->second.state.is_atomic;
+ bool prefetch_data = iter->second.state.prefetch_data;
+ bool compressed = iter->second.state.compressed;
+
+ objs_state.erase(iter);
+
+ if (is_atomic || prefetch_data) {
+ auto& sm = objs_state[obj];
+ sm.state.is_atomic = is_atomic;
+ sm.state.prefetch_data = prefetch_data;
+ sm.state.compressed = compressed;
+ }
+}
+
+class RGWMetaNotifierManager : public RGWCoroutinesManager {
+ RGWRados* store;
+ RGWHTTPManager http_manager;
+
+public:
+ RGWMetaNotifierManager(RGWRados *_driver) : RGWCoroutinesManager(_driver->ctx(), _driver->get_cr_registry()), store(_driver),
+ http_manager(store->ctx(), completion_mgr) {
+ http_manager.start();
+ }
+
+ int notify_all(const DoutPrefixProvider *dpp, map<rgw_zone_id, RGWRESTConn *>& conn_map, set<int>& shards) {
+ rgw_http_param_pair pairs[] = { { "type", "metadata" },
+ { "notify", NULL },
+ { NULL, NULL } };
+
+ list<RGWCoroutinesStack *> stacks;
+ for (auto iter = conn_map.begin(); iter != conn_map.end(); ++iter) {
+ RGWRESTConn *conn = iter->second;
+ RGWCoroutinesStack *stack = new RGWCoroutinesStack(store->ctx(), this);
+ stack->call(new RGWPostRESTResourceCR<set<int>, int>(store->ctx(), conn, &http_manager, "/admin/log", pairs, shards, NULL));
+
+ stacks.push_back(stack);
+ }
+ return run(dpp, stacks);
+ }
+};
+
+class RGWDataNotifierManager : public RGWCoroutinesManager {
+ RGWRados* store;
+ RGWHTTPManager http_manager;
+
+public:
+ RGWDataNotifierManager(RGWRados *_driver) : RGWCoroutinesManager(_driver->ctx(), _driver->get_cr_registry()), store(_driver),
+ http_manager(store->ctx(), completion_mgr) {
+ http_manager.start();
+ }
+
+ int notify_all(const DoutPrefixProvider *dpp, map<rgw_zone_id, RGWRESTConn *>& conn_map,
+ bc::flat_map<int, bc::flat_set<rgw_data_notify_entry> >& shards) {
+
+ list<RGWCoroutinesStack *> stacks;
+ const char *source_zone = store->svc.zone->get_zone_params().get_id().c_str();
+ for (auto iter = conn_map.begin(); iter != conn_map.end(); ++iter) {
+ RGWRESTConn *conn = iter->second;
+ RGWCoroutinesStack *stack = new RGWCoroutinesStack(store->ctx(), this);
+ stack->call(new RGWDataPostNotifyCR(store, http_manager, shards, source_zone, conn));
+ stacks.push_back(stack);
+ }
+
+ return run(dpp, stacks);
+ }
+};
+
+/* class RGWRadosThread */
+
+void RGWRadosThread::start()
+{
+ worker = new Worker(cct, this);
+ worker->create(thread_name.c_str());
+}
+
+void RGWRadosThread::stop()
+{
+ down_flag = true;
+ stop_process();
+ if (worker) {
+ worker->signal();
+ worker->join();
+ }
+ delete worker;
+ worker = NULL;
+}
+
+void *RGWRadosThread::Worker::entry() {
+ uint64_t msec = processor->interval_msec();
+ auto interval = std::chrono::milliseconds(msec);
+
+ do {
+ auto start = ceph::real_clock::now();
+ int r = processor->process(this);
+ if (r < 0) {
+ ldpp_dout(this, 0) << "ERROR: processor->process() returned error r=" << r << dendl;
+ }
+
+ if (processor->going_down())
+ break;
+
+ auto end = ceph::real_clock::now() - start;
+
+ uint64_t cur_msec = processor->interval_msec();
+ if (cur_msec != msec) { /* was it reconfigured? */
+ msec = cur_msec;
+ interval = std::chrono::milliseconds(msec);
+ }
+
+ if (cur_msec > 0) {
+ if (interval <= end)
+ continue; // next round
+
+ auto wait_time = interval - end;
+ wait_interval(wait_time);
+ } else {
+ wait();
+ }
+ } while (!processor->going_down());
+
+ return NULL;
+}
+
+class RGWMetaNotifier : public RGWRadosThread {
+ RGWMetaNotifierManager notify_mgr;
+ RGWMetadataLog *const log;
+
+ uint64_t interval_msec() override {
+ return cct->_conf->rgw_md_notify_interval_msec;
+ }
+ void stop_process() override {
+ notify_mgr.stop();
+ }
+public:
+ RGWMetaNotifier(RGWRados *_driver, RGWMetadataLog* log)
+ : RGWRadosThread(_driver, "meta-notifier"), notify_mgr(_driver), log(log) {}
+
+ int process(const DoutPrefixProvider *dpp) override;
+};
+
+int RGWMetaNotifier::process(const DoutPrefixProvider *dpp)
+{
+ set<int> shards;
+
+ log->read_clear_modified(shards);
+
+ if (shards.empty()) {
+ return 0;
+ }
+
+ for (set<int>::iterator iter = shards.begin(); iter != shards.end(); ++iter) {
+ ldpp_dout(dpp, 20) << __func__ << "(): notifying mdlog change, shard_id=" << *iter << dendl;
+ }
+
+ notify_mgr.notify_all(dpp, store->svc.zone->get_zone_conn_map(), shards);
+
+ return 0;
+}
+
+class RGWDataNotifier : public RGWRadosThread {
+ RGWDataNotifierManager notify_mgr;
+ bc::flat_set<rgw_data_notify_entry> entry;
+
+ uint64_t interval_msec() override {
+ return cct->_conf.get_val<int64_t>("rgw_data_notify_interval_msec");
+ }
+ void stop_process() override {
+ notify_mgr.stop();
+ }
+public:
+ RGWDataNotifier(RGWRados *_driver) : RGWRadosThread(_driver, "data-notifier"), notify_mgr(_driver) {}
+
+ int process(const DoutPrefixProvider *dpp) override;
+};
+
+int RGWDataNotifier::process(const DoutPrefixProvider *dpp)
+{
+ auto data_log = store->svc.datalog_rados;
+ if (!data_log) {
+ return 0;
+ }
+
+ auto shards = data_log->read_clear_modified();
+
+ if (shards.empty()) {
+ return 0;
+ }
+
+ for (const auto& [shard_id, entries] : shards) {
+ for (const auto& entry : entries) {
+ ldpp_dout(dpp, 20) << __func__ << "(): notifying datalog change, shard_id="
+ << shard_id << ":" << entry.gen << ":" << entry.key << dendl;
+ }
+ }
+
+ notify_mgr.notify_all(dpp, store->svc.zone->get_zone_data_notify_to_map(), shards);
+
+ return 0;
+}
+
+class RGWSyncProcessorThread : public RGWRadosThread {
+public:
+ RGWSyncProcessorThread(RGWRados *_driver, const string& thread_name = "radosgw") : RGWRadosThread(_driver, thread_name) {}
+ RGWSyncProcessorThread(RGWRados *_driver) : RGWRadosThread(_driver) {}
+ ~RGWSyncProcessorThread() override {}
+ int init(const DoutPrefixProvider *dpp) override = 0 ;
+ int process(const DoutPrefixProvider *dpp) override = 0;
+};
+
+class RGWMetaSyncProcessorThread : public RGWSyncProcessorThread
+{
+ RGWMetaSyncStatusManager sync;
+
+ uint64_t interval_msec() override {
+ return 0; /* no interval associated, it'll run once until stopped */
+ }
+ void stop_process() override {
+ sync.stop();
+ }
+public:
+ RGWMetaSyncProcessorThread(rgw::sal::RadosStore* _driver, RGWAsyncRadosProcessor *async_rados)
+ : RGWSyncProcessorThread(_driver->getRados(), "meta-sync"), sync(_driver, async_rados) {}
+
+ void wakeup_sync_shards(set<int>& shard_ids) {
+ for (set<int>::iterator iter = shard_ids.begin(); iter != shard_ids.end(); ++iter) {
+ sync.wakeup(*iter);
+ }
+ }
+ RGWMetaSyncStatusManager* get_manager() { return &sync; }
+
+ int init(const DoutPrefixProvider *dpp) override {
+ int ret = sync.init(dpp);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: sync.init() returned " << ret << dendl;
+ return ret;
+ }
+ return 0;
+ }
+
+ int process(const DoutPrefixProvider *dpp) override {
+ sync.run(dpp, null_yield);
+ return 0;
+ }
+};
+
+class RGWDataSyncProcessorThread : public RGWSyncProcessorThread
+{
+ PerfCountersRef counters;
+ RGWDataSyncStatusManager sync;
+ bool initialized;
+
+ uint64_t interval_msec() override {
+ if (initialized) {
+ return 0; /* no interval associated, it'll run once until stopped */
+ } else {
+#define DATA_SYNC_INIT_WAIT_SEC 20
+ return DATA_SYNC_INIT_WAIT_SEC * 1000;
+ }
+ }
+ void stop_process() override {
+ sync.stop();
+ }
+public:
+ RGWDataSyncProcessorThread(rgw::sal::RadosStore* _driver, RGWAsyncRadosProcessor *async_rados,
+ const RGWZone* source_zone)
+ : RGWSyncProcessorThread(_driver->getRados(), "data-sync"),
+ counters(sync_counters::build(store->ctx(), std::string("data-sync-from-") + source_zone->name)),
+ sync(_driver, async_rados, source_zone->id, counters.get()),
+ initialized(false) {}
+
+ void wakeup_sync_shards(bc::flat_map<int, bc::flat_set<rgw_data_notify_entry> >& entries) {
+ for (bc::flat_map<int, bc::flat_set<rgw_data_notify_entry> >::iterator iter = entries.begin(); iter != entries.end(); ++iter) {
+ sync.wakeup(iter->first, iter->second);
+ }
+ }
+
+ RGWDataSyncStatusManager* get_manager() { return &sync; }
+
+ int init(const DoutPrefixProvider *dpp) override {
+ return 0;
+ }
+
+ int process(const DoutPrefixProvider *dpp) override {
+ while (!initialized) {
+ if (going_down()) {
+ return 0;
+ }
+ int ret = sync.init(dpp);
+ if (ret >= 0) {
+ initialized = true;
+ break;
+ }
+ /* we'll be back! */
+ return 0;
+ }
+ sync.run(dpp);
+ return 0;
+ }
+};
+
+class RGWSyncLogTrimThread : public RGWSyncProcessorThread, DoutPrefixProvider
+{
+ RGWCoroutinesManager crs;
+ rgw::sal::RadosStore* store;
+ rgw::BucketTrimManager *bucket_trim;
+ RGWHTTPManager http;
+ const utime_t trim_interval;
+
+ uint64_t interval_msec() override { return 0; }
+ void stop_process() override { crs.stop(); }
+public:
+ RGWSyncLogTrimThread(rgw::sal::RadosStore* store, rgw::BucketTrimManager *bucket_trim,
+ int interval)
+ : RGWSyncProcessorThread(store->getRados(), "sync-log-trim"),
+ crs(store->ctx(), store->getRados()->get_cr_registry()), store(store),
+ bucket_trim(bucket_trim),
+ http(store->ctx(), crs.get_completion_mgr()),
+ trim_interval(interval, 0)
+ {}
+
+ int init(const DoutPrefixProvider *dpp) override {
+ return http.start();
+ }
+ int process(const DoutPrefixProvider *dpp) override {
+ list<RGWCoroutinesStack*> stacks;
+ auto metatrimcr = create_meta_log_trim_cr(this, static_cast<rgw::sal::RadosStore*>(store), &http,
+ cct->_conf->rgw_md_log_max_shards,
+ trim_interval);
+ if (!metatrimcr) {
+ ldpp_dout(dpp, -1) << "Bailing out of trim thread!" << dendl;
+ return -EINVAL;
+ }
+ auto meta = new RGWCoroutinesStack(store->ctx(), &crs);
+ meta->call(metatrimcr);
+
+ stacks.push_back(meta);
+
+ if (store->svc()->zone->sync_module_exports_data()) {
+ auto data = new RGWCoroutinesStack(store->ctx(), &crs);
+ data->call(create_data_log_trim_cr(dpp, static_cast<rgw::sal::RadosStore*>(store), &http,
+ cct->_conf->rgw_data_log_num_shards,
+ trim_interval));
+ stacks.push_back(data);
+
+ auto bucket = new RGWCoroutinesStack(store->ctx(), &crs);
+ bucket->call(bucket_trim->create_bucket_trim_cr(&http));
+ stacks.push_back(bucket);
+ }
+
+ crs.run(dpp, stacks);
+ return 0;
+ }
+
+ // implements DoutPrefixProvider
+ CephContext *get_cct() const override { return store->ctx(); }
+ unsigned get_subsys() const override
+ {
+ return dout_subsys;
+ }
+
+ std::ostream& gen_prefix(std::ostream& out) const override
+ {
+ return out << "sync log trim: ";
+ }
+
+};
+
+void RGWRados::wakeup_meta_sync_shards(set<int>& shard_ids)
+{
+ std::lock_guard l{meta_sync_thread_lock};
+ if (meta_sync_processor_thread) {
+ meta_sync_processor_thread->wakeup_sync_shards(shard_ids);
+ }
+}
+
+void RGWRados::wakeup_data_sync_shards(const DoutPrefixProvider *dpp, const rgw_zone_id& source_zone, bc::flat_map<int, bc::flat_set<rgw_data_notify_entry> >& entries)
+{
+ ldpp_dout(dpp, 20) << __func__ << ": source_zone=" << source_zone << ", entries=" << entries << dendl;
+ for (bc::flat_map<int, bc::flat_set<rgw_data_notify_entry> >::iterator iter = entries.begin(); iter != entries.end(); ++iter) {
+ ldpp_dout(dpp, 20) << __func__ << "(): updated shard=" << iter->first << dendl;
+ bc::flat_set<rgw_data_notify_entry>& entries = iter->second;
+ for (const auto& [key, gen] : entries) {
+ ldpp_dout(dpp, 20) << __func__ << ": source_zone=" << source_zone << ", key=" << key
+ << ", gen=" << gen << dendl;
+ }
+ }
+
+ std::lock_guard l{data_sync_thread_lock};
+ auto iter = data_sync_processor_threads.find(source_zone);
+ if (iter == data_sync_processor_threads.end()) {
+ ldpp_dout(dpp, 10) << __func__ << ": couldn't find sync thread for zone " << source_zone << ", skipping async data sync processing" << dendl;
+ return;
+ }
+
+ RGWDataSyncProcessorThread *thread = iter->second;
+ ceph_assert(thread);
+ thread->wakeup_sync_shards(entries);
+}
+
+RGWMetaSyncStatusManager* RGWRados::get_meta_sync_manager()
+{
+ std::lock_guard l{meta_sync_thread_lock};
+ if (meta_sync_processor_thread) {
+ return meta_sync_processor_thread->get_manager();
+ }
+ return nullptr;
+}
+
+RGWDataSyncStatusManager* RGWRados::get_data_sync_manager(const rgw_zone_id& source_zone)
+{
+ std::lock_guard l{data_sync_thread_lock};
+ auto thread = data_sync_processor_threads.find(source_zone);
+ if (thread == data_sync_processor_threads.end()) {
+ return nullptr;
+ }
+ return thread->second->get_manager();
+}
+
+int RGWRados::get_required_alignment(const DoutPrefixProvider *dpp, const rgw_pool& pool, uint64_t *alignment)
+{
+ IoCtx ioctx;
+ int r = open_pool_ctx(dpp, pool, ioctx, false, true);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: open_pool_ctx() returned " << r << dendl;
+ return r;
+ }
+
+ bool req;
+ r = ioctx.pool_requires_alignment2(&req);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: ioctx.pool_requires_alignment2() returned "
+ << r << dendl;
+ return r;
+ }
+
+ if (!req) {
+ *alignment = 0;
+ return 0;
+ }
+
+ uint64_t align;
+ r = ioctx.pool_required_alignment2(&align);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: ioctx.pool_required_alignment2() returned "
+ << r << dendl;
+ return r;
+ }
+ if (align != 0) {
+ ldpp_dout(dpp, 20) << "required alignment=" << align << dendl;
+ }
+ *alignment = align;
+ return 0;
+}
+
+void RGWRados::get_max_aligned_size(uint64_t size, uint64_t alignment, uint64_t *max_size)
+{
+ if (alignment == 0) {
+ *max_size = size;
+ return;
+ }
+
+ if (size <= alignment) {
+ *max_size = alignment;
+ return;
+ }
+
+ *max_size = size - (size % alignment);
+}
+
+int RGWRados::get_max_chunk_size(const rgw_pool& pool, uint64_t *max_chunk_size, const DoutPrefixProvider *dpp, uint64_t *palignment)
+{
+ uint64_t alignment;
+ int r = get_required_alignment(dpp, pool, &alignment);
+ if (r < 0) {
+ return r;
+ }
+
+ if (palignment) {
+ *palignment = alignment;
+ }
+
+ uint64_t config_chunk_size = cct->_conf->rgw_max_chunk_size;
+
+ get_max_aligned_size(config_chunk_size, alignment, max_chunk_size);
+
+ ldpp_dout(dpp, 20) << "max_chunk_size=" << *max_chunk_size << dendl;
+
+ return 0;
+}
+
+int RGWRados::get_max_chunk_size(const rgw_placement_rule& placement_rule, const rgw_obj& obj,
+ uint64_t *max_chunk_size, const DoutPrefixProvider *dpp, uint64_t *palignment)
+{
+ rgw_pool pool;
+ if (!get_obj_data_pool(placement_rule, obj, &pool)) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to get data pool for object " << obj << dendl;
+ return -EIO;
+ }
+ return get_max_chunk_size(pool, max_chunk_size, dpp, palignment);
+}
+
+void add_datalog_entry(const DoutPrefixProvider* dpp,
+ RGWDataChangesLog* datalog,
+ const RGWBucketInfo& bucket_info,
+ uint32_t shard_id, optional_yield y)
+{
+ const auto& logs = bucket_info.layout.logs;
+ if (logs.empty()) {
+ return;
+ }
+ int r = datalog->add_entry(dpp, bucket_info, logs.back(), shard_id, y);
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << "ERROR: failed writing data log" << dendl;
+ } // datalog error is not fatal
+}
+
+class RGWIndexCompletionManager;
+
+struct complete_op_data {
+ ceph::mutex lock = ceph::make_mutex("complete_op_data");
+ AioCompletion *rados_completion{nullptr};
+ int manager_shard_id{-1};
+ RGWIndexCompletionManager *manager{nullptr};
+ rgw_obj obj;
+ RGWModifyOp op;
+ string tag;
+ rgw_bucket_entry_ver ver;
+ cls_rgw_obj_key key;
+ rgw_bucket_dir_entry_meta dir_meta;
+ list<cls_rgw_obj_key> remove_objs;
+ bool log_op;
+ uint16_t bilog_op;
+ rgw_zone_set zones_trace;
+
+ bool stopped{false};
+
+ void stop() {
+ std::lock_guard l{lock};
+ stopped = true;
+ }
+};
+
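+// Collects asynchronous bucket-index completion callbacks. Completions that fail
+// with -ERR_BUSY_RESHARDING are moved onto retry_completions and replayed by the
+// retry thread under guard_reshard() (see process() and handle_completion()).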
+class RGWIndexCompletionManager {
+ RGWRados* const store;
+ const uint32_t num_shards;
+ ceph::containers::tiny_vector<ceph::mutex> locks;
+ std::vector<set<complete_op_data*>> completions;
+ std::vector<complete_op_data*> retry_completions;
+
+ std::condition_variable cond;
+ std::mutex retry_completions_lock;
+ bool _stop{false};
+ std::thread retry_thread;
+
+ // used to distribute the completions and the locks they use across
+ // their respective vectors; it will get incremented and can wrap
+ // around back to 0 without issue
+ std::atomic<uint32_t> cur_shard {0};
+
+ void process();
+
+ void add_completion(complete_op_data *completion);
+
+ void stop() {
+ if (retry_thread.joinable()) {
+ _stop = true;
+ cond.notify_all();
+ retry_thread.join();
+ }
+
+ for (uint32_t i = 0; i < num_shards; ++i) {
+ std::lock_guard l{locks[i]};
+ for (auto c : completions[i]) {
+ c->stop();
+ }
+ }
+ completions.clear();
+ }
+
+ uint32_t next_shard() {
+ return cur_shard++ % num_shards;
+ }
+
+public:
+ RGWIndexCompletionManager(RGWRados *_driver) :
+ store(_driver),
+ num_shards(store->ctx()->_conf->rgw_thread_pool_size),
+ locks{ceph::make_lock_container<ceph::mutex>(
+ num_shards,
+ [](const size_t i) {
+ return ceph::make_mutex("RGWIndexCompletionManager::lock::" +
+ std::to_string(i));
+ })},
+ completions(num_shards),
+ retry_thread(&RGWIndexCompletionManager::process, this)
+ {}
+
+ ~RGWIndexCompletionManager() {
+ stop();
+ }
+
+ void create_completion(const rgw_obj& obj,
+ RGWModifyOp op, string& tag,
+ rgw_bucket_entry_ver& ver,
+ const cls_rgw_obj_key& key,
+ rgw_bucket_dir_entry_meta& dir_meta,
+ list<cls_rgw_obj_key> *remove_objs, bool log_op,
+ uint16_t bilog_op,
+ rgw_zone_set *zones_trace,
+ complete_op_data **result);
+
+ bool handle_completion(completion_t cb, complete_op_data *arg);
+
+ CephContext* ctx() {
+ return store->ctx();
+ }
+};
+
+static void obj_complete_cb(completion_t cb, void *arg)
+{
+ complete_op_data *completion = reinterpret_cast<complete_op_data*>(arg);
+ completion->lock.lock();
+ if (completion->stopped) {
+ completion->lock.unlock(); /* can drop lock, no one else is referencing us */
+ delete completion;
+ return;
+ }
+ bool need_delete = completion->manager->handle_completion(cb, completion);
+ completion->lock.unlock();
+ if (need_delete) {
+ delete completion;
+ }
+}
+
+void RGWIndexCompletionManager::process()
+{
+ DoutPrefix dpp(store->ctx(), dout_subsys, "rgw index completion thread: ");
+ while(!_stop) {
+ std::vector<complete_op_data*> comps;
+
+ {
+ std::unique_lock l{retry_completions_lock};
+ cond.wait(l, [this](){return _stop || !retry_completions.empty();});
+ if (_stop) {
+ return;
+ }
+ retry_completions.swap(comps);
+ }
+
+ for (auto c : comps) {
+ std::unique_ptr<complete_op_data> up{c};
+
+ ldpp_dout(&dpp, 20) << __func__ << "(): handling completion for key=" << c->key << dendl;
+
+ RGWRados::BucketShard bs(store);
+ RGWBucketInfo bucket_info;
+
+ int r = bs.init(c->obj.bucket, c->obj, &bucket_info, &dpp);
+ if (r < 0) {
+ ldpp_dout(&dpp, 0) << "ERROR: " << __func__ << "(): failed to initialize BucketShard, obj=" << c->obj << " r=" << r << dendl;
+ /* not much to do */
+ continue;
+ }
+
+ r = store->guard_reshard(&dpp, &bs, c->obj, bucket_info,
+ [&](RGWRados::BucketShard *bs) -> int {
+ const bool bitx = ctx()->_conf->rgw_bucket_index_transaction_instrumentation;
+ ldout_bitx(bitx, &dpp, 10) <<
+ "ENTERING " << __func__ << ": bucket-shard=" << bs <<
+ " obj=" << c->obj << " tag=" << c->tag <<
+ " op=" << c->op << ", remove_objs=" << c->remove_objs << dendl_bitx;
+ ldout_bitx(bitx, &dpp, 25) <<
+ "BACKTRACE: " << __func__ << ": " << ClibBackTrace(1) << dendl_bitx;
+
+ librados::ObjectWriteOperation o;
+ o.assert_exists();
+ cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING);
+ cls_rgw_bucket_complete_op(o, c->op, c->tag, c->ver, c->key, c->dir_meta, &c->remove_objs,
+ c->log_op, c->bilog_op, &c->zones_trace);
+ int ret = bs->bucket_obj.operate(&dpp, &o, null_yield);
+ ldout_bitx(bitx, &dpp, 10) <<
+				"EXITING " << __func__ << ": ret=" << ret << dendl_bitx;
+ return ret;
+ });
+ if (r < 0) {
+ ldpp_dout(&dpp, 0) << "ERROR: " << __func__ << "(): bucket index completion failed, obj=" << c->obj << " r=" << r << dendl;
+ /* ignoring error, can't do anything about it */
+ continue;
+ }
+
+ // This null_yield can stay, for now, since we're in our own thread
+ add_datalog_entry(&dpp, store->svc.datalog_rados, bucket_info,
+ bs.shard_id, null_yield);
+ }
+ }
+}
+
+void RGWIndexCompletionManager::create_completion(const rgw_obj& obj,
+ RGWModifyOp op, string& tag,
+ rgw_bucket_entry_ver& ver,
+ const cls_rgw_obj_key& key,
+ rgw_bucket_dir_entry_meta& dir_meta,
+ list<cls_rgw_obj_key> *remove_objs, bool log_op,
+ uint16_t bilog_op,
+ rgw_zone_set *zones_trace,
+ complete_op_data **result)
+{
+ complete_op_data *entry = new complete_op_data;
+
+ int shard_id = next_shard();
+
+ entry->manager_shard_id = shard_id;
+ entry->manager = this;
+ entry->obj = obj;
+ entry->op = op;
+ entry->tag = tag;
+ entry->ver = ver;
+ entry->key = key;
+ entry->dir_meta = dir_meta;
+ entry->log_op = log_op;
+ entry->bilog_op = bilog_op;
+
+ if (remove_objs) {
+ for (auto iter = remove_objs->begin(); iter != remove_objs->end(); ++iter) {
+ entry->remove_objs.push_back(*iter);
+ }
+ }
+
+ if (zones_trace) {
+ entry->zones_trace = *zones_trace;
+ } else {
+ entry->zones_trace.insert(store->svc.zone->get_zone().id, obj.bucket.get_key());
+ }
+
+ *result = entry;
+
+ entry->rados_completion = librados::Rados::aio_create_completion(entry, obj_complete_cb);
+
+ std::lock_guard l{locks[shard_id]};
+ const auto ok = completions[shard_id].insert(entry).second;
+ ceph_assert(ok);
+}
+
+void RGWIndexCompletionManager::add_completion(complete_op_data *completion) {
+ {
+ std::lock_guard l{retry_completions_lock};
+ retry_completions.push_back(completion);
+ }
+ cond.notify_all();
+}
+
+bool RGWIndexCompletionManager::handle_completion(completion_t cb, complete_op_data *arg)
+{
+ int shard_id = arg->manager_shard_id;
+ {
+ std::lock_guard l{locks[shard_id]};
+
+ auto& comps = completions[shard_id];
+
+ auto iter = comps.find(arg);
+ if (iter == comps.end()) {
+ ldout(arg->manager->ctx(), 0) << __func__ << "(): cannot find completion for obj=" << arg->key << dendl;
+ return true;
+ }
+
+ comps.erase(iter);
+ }
+
+ int r = rados_aio_get_return_value(cb);
+ if (r != -ERR_BUSY_RESHARDING) {
+ ldout(arg->manager->ctx(), 20) << __func__ << "(): completion " <<
+ (r == 0 ? "ok" : "failed with " + to_string(r)) <<
+ " for obj=" << arg->key << dendl;
+ return true;
+ }
+ add_completion(arg);
+ ldout(arg->manager->ctx(), 20) << __func__ << "(): async completion added for obj=" << arg->key << dendl;
+ return false;
+}
+
+void RGWRados::finalize()
+{
+ /* Before joining any sync threads, drain outstanding requests &
+ * mark the async_processor as going_down() */
+ if (svc.rados) {
+ svc.rados->stop_processor();
+ }
+
+ if (run_sync_thread) {
+ std::lock_guard l{meta_sync_thread_lock};
+ meta_sync_processor_thread->stop();
+
+ std::lock_guard dl{data_sync_thread_lock};
+ for (auto iter : data_sync_processor_threads) {
+ RGWDataSyncProcessorThread *thread = iter.second;
+ thread->stop();
+ }
+ if (sync_log_trimmer) {
+ sync_log_trimmer->stop();
+ }
+ }
+ if (run_sync_thread) {
+ delete meta_sync_processor_thread;
+ meta_sync_processor_thread = NULL;
+ std::lock_guard dl{data_sync_thread_lock};
+ for (auto iter : data_sync_processor_threads) {
+ RGWDataSyncProcessorThread *thread = iter.second;
+ delete thread;
+ }
+ data_sync_processor_threads.clear();
+ delete sync_log_trimmer;
+ sync_log_trimmer = nullptr;
+ bucket_trim = boost::none;
+ }
+ if (meta_notifier) {
+ meta_notifier->stop();
+ delete meta_notifier;
+ }
+ if (data_notifier) {
+ data_notifier->stop();
+ delete data_notifier;
+ }
+ delete sync_tracer;
+
+ delete lc;
+ lc = NULL;
+
+ delete gc;
+ gc = NULL;
+
+ delete obj_expirer;
+ obj_expirer = NULL;
+
+ RGWQuotaHandler::free_handler(quota_handler);
+ if (cr_registry) {
+ cr_registry->put();
+ }
+
+ svc.shutdown();
+
+ delete binfo_cache;
+ delete obj_tombstone_cache;
+ if (d3n_data_cache)
+ delete d3n_data_cache;
+
+ if (reshard_wait.get()) {
+ reshard_wait->stop();
+ reshard_wait.reset();
+ }
+
+ if (run_reshard_thread) {
+ reshard->stop_processor();
+ }
+ delete reshard;
+ delete index_completion_manager;
+
+ rgw::notify::shutdown();
+}
+
+/**
+ * Initialize the RADOS instance and prepare to do other ops
+ * Returns 0 on success, -ERR# on failure.
+ */
+int RGWRados::init_rados()
+{
+ int ret = 0;
+
+ ret = rados.init_with_context(cct);
+ if (ret < 0) {
+ return ret;
+ }
+ ret = rados.connect();
+ if (ret < 0) {
+ return ret;
+ }
+
+ auto crs = std::unique_ptr<RGWCoroutinesManagerRegistry>{
+ new RGWCoroutinesManagerRegistry(cct)};
+ ret = crs->hook_to_admin_command("cr dump");
+ if (ret < 0) {
+ return ret;
+ }
+
+ cr_registry = crs.release();
+
+ if (use_datacache) {
+ d3n_data_cache = new D3nDataCache();
+ d3n_data_cache->init(cct);
+ }
+
+ return ret;
+}
+
+int RGWRados::register_to_service_map(const DoutPrefixProvider *dpp, const string& daemon_type, const map<string, string>& meta)
+{
+ string name = cct->_conf->name.get_id();
+ if (name.compare(0, 4, "rgw.") == 0) {
+ name = name.substr(4);
+ }
+ map<string,string> metadata = meta;
+ metadata["num_handles"] = "1"s;
+ metadata["zonegroup_id"] = svc.zone->get_zonegroup().get_id();
+ metadata["zonegroup_name"] = svc.zone->get_zonegroup().get_name();
+ metadata["zone_name"] = svc.zone->zone_name();
+ metadata["zone_id"] = svc.zone->zone_id().id;
+ metadata["realm_name"] = svc.zone->get_realm().get_name();
+ metadata["realm_id"] = svc.zone->get_realm().get_id();
+ metadata["id"] = name;
+ int ret = rados.service_daemon_register(
+ daemon_type,
+ stringify(rados.get_instance_id()),
+ metadata);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: service_daemon_register() returned ret=" << ret << ": " << cpp_strerror(-ret) << dendl;
+ return ret;
+ }
+
+ return 0;
+}
+
+int RGWRados::update_service_map(const DoutPrefixProvider *dpp, std::map<std::string, std::string>&& status)
+{
+ int ret = rados.service_daemon_update_status(move(status));
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: service_daemon_update_status() returned ret=" << ret << ": " << cpp_strerror(-ret) << dendl;
+ return ret;
+ }
+
+ return 0;
+}
+
+/**
+ * Complete initialization once services are up: open the control pools and
+ * start the background (GC, LC, sync, reshard, notify) machinery.
+ * Returns 0 on success, -ERR# on failure.
+ */
+int RGWRados::init_complete(const DoutPrefixProvider *dpp)
+{
+ int ret;
+
+ /*
+ * create sync module instance even if we don't run sync thread, might need it for radosgw-admin
+ */
+ sync_module = svc.sync_modules->get_sync_module();
+
+ ret = open_root_pool_ctx(dpp);
+ if (ret < 0)
+ return ret;
+
+ ret = open_gc_pool_ctx(dpp);
+ if (ret < 0)
+ return ret;
+
+ ret = open_lc_pool_ctx(dpp);
+ if (ret < 0)
+ return ret;
+
+ ret = open_objexp_pool_ctx(dpp);
+ if (ret < 0)
+ return ret;
+
+ ret = open_reshard_pool_ctx(dpp);
+ if (ret < 0)
+ return ret;
+
+ ret = open_notif_pool_ctx(dpp);
+ if (ret < 0)
+ return ret;
+
+ pools_initialized = true;
+
+ if (use_gc) {
+ gc = new RGWGC();
+ gc->initialize(cct, this);
+ } else {
+ ldpp_dout(dpp, 5) << "note: GC not initialized" << dendl;
+ }
+
+ obj_expirer = new RGWObjectExpirer(this->driver);
+
+ if (use_gc_thread && use_gc) {
+ gc->start_processor();
+ obj_expirer->start_processor();
+ }
+
+ auto& current_period = svc.zone->get_current_period();
+ auto& zonegroup = svc.zone->get_zonegroup();
+ auto& zone_params = svc.zone->get_zone_params();
+ auto& zone = svc.zone->get_zone();
+
+  /* no point in running the sync thread if we don't have a master zone configured
+     or there is no rest_master_conn */
+ if (!svc.zone->need_to_sync()) {
+ run_sync_thread = false;
+ }
+
+ if (svc.zone->is_meta_master()) {
+ auto md_log = svc.mdlog->get_log(current_period.get_id());
+ meta_notifier = new RGWMetaNotifier(this, md_log);
+ meta_notifier->start();
+ }
+
+ /* init it anyway, might run sync through radosgw-admin explicitly */
+ sync_tracer = new RGWSyncTraceManager(cct, cct->_conf->rgw_sync_trace_history_size);
+ sync_tracer->init(this);
+ ret = sync_tracer->hook_to_admin_command();
+ if (ret < 0) {
+ return ret;
+ }
+
+ if (run_sync_thread) {
+ for (const auto &pt: zonegroup.placement_targets) {
+ if (zone_params.placement_pools.find(pt.second.name)
+ == zone_params.placement_pools.end()){
+ ldpp_dout(dpp, 0) << "WARNING: This zone does not contain the placement target "
+ << pt.second.name << " present in zonegroup" << dendl;
+ }
+ }
+ auto async_processor = svc.rados->get_async_processor();
+ std::lock_guard l{meta_sync_thread_lock};
+ meta_sync_processor_thread = new RGWMetaSyncProcessorThread(this->driver, async_processor);
+ ret = meta_sync_processor_thread->init(dpp);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to initialize meta sync thread" << dendl;
+ return ret;
+ }
+ meta_sync_processor_thread->start();
+
+ // configure the bucket trim manager
+ rgw::BucketTrimConfig config;
+ rgw::configure_bucket_trim(cct, config);
+
+ bucket_trim.emplace(this->driver, config);
+ ret = bucket_trim->init();
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to start bucket trim manager" << dendl;
+ return ret;
+ }
+ svc.datalog_rados->set_observer(&*bucket_trim);
+
+ std::lock_guard dl{data_sync_thread_lock};
+ for (auto source_zone : svc.zone->get_data_sync_source_zones()) {
+ ldpp_dout(dpp, 5) << "starting data sync thread for zone " << source_zone->name << dendl;
+ auto *thread = new RGWDataSyncProcessorThread(this->driver, svc.rados->get_async_processor(), source_zone);
+ ret = thread->init(dpp);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to initialize data sync thread" << dendl;
+ return ret;
+ }
+ thread->start();
+ data_sync_processor_threads[rgw_zone_id(source_zone->id)] = thread;
+ }
+ auto interval = cct->_conf->rgw_sync_log_trim_interval;
+ if (interval > 0) {
+ sync_log_trimmer = new RGWSyncLogTrimThread(this->driver, &*bucket_trim, interval);
+ ret = sync_log_trimmer->init(dpp);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to initialize sync log trim thread" << dendl;
+ return ret;
+ }
+ sync_log_trimmer->start();
+ }
+ }
+ if (cct->_conf->rgw_data_notify_interval_msec) {
+ data_notifier = new RGWDataNotifier(this);
+ data_notifier->start();
+ }
+
+ binfo_cache = new RGWChainedCacheImpl<bucket_info_entry>;
+ binfo_cache->init(svc.cache);
+
+ lc = new RGWLC();
+ lc->initialize(cct, this->driver);
+
+ if (use_lc_thread)
+ lc->start_processor();
+
+ quota_handler = RGWQuotaHandler::generate_handler(dpp, this->driver, quota_threads);
+
+ bucket_index_max_shards = (cct->_conf->rgw_override_bucket_index_max_shards ? cct->_conf->rgw_override_bucket_index_max_shards :
+ zone.bucket_index_max_shards);
+ if (bucket_index_max_shards > get_max_bucket_shards()) {
+ bucket_index_max_shards = get_max_bucket_shards();
+ ldpp_dout(dpp, 1) << __func__ << " bucket index max shards is too large, reset to value: "
+ << get_max_bucket_shards() << dendl;
+ }
+ ldpp_dout(dpp, 20) << __func__ << " bucket index max shards: " << bucket_index_max_shards << dendl;
+
+ bool need_tombstone_cache = !svc.zone->get_zone_data_notify_to_map().empty(); /* have zones syncing from us */
+
+ if (need_tombstone_cache) {
+ obj_tombstone_cache = new tombstone_cache_t(cct->_conf->rgw_obj_tombstone_cache_size);
+ }
+
+ reshard_wait = std::make_shared<RGWReshardWait>();
+
+ reshard = new RGWReshard(this->driver);
+
+ // disable reshard thread based on zone/zonegroup support
+ run_reshard_thread = run_reshard_thread && svc.zone->can_reshard();
+
+ if (run_reshard_thread) {
+ reshard->start_processor();
+ }
+
+ index_completion_manager = new RGWIndexCompletionManager(this);
+ ret = rgw::notify::init(cct, driver, dpp);
+ if (ret < 0 ) {
+ ldpp_dout(dpp, 1) << "ERROR: failed to initialize notification manager" << dendl;
+ }
+
+ return ret;
+}
+
+int RGWRados::init_svc(bool raw, const DoutPrefixProvider *dpp)
+{
+ if (raw) {
+ return svc.init_raw(cct, use_cache, null_yield, dpp);
+ }
+
+ return svc.init(cct, use_cache, run_sync_thread, null_yield, dpp);
+}
+
+int RGWRados::init_ctl(const DoutPrefixProvider *dpp)
+{
+ return ctl.init(&svc, driver, dpp);
+}
+
+/**
+ * Begin initialization: init the services and ctls, then the RADOS instance.
+ * Returns 0 on success, -ERR# on failure.
+ */
+int RGWRados::init_begin(const DoutPrefixProvider *dpp)
+{
+ int ret = init_svc(false, dpp);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to init services (ret=" << cpp_strerror(-ret) << ")" << dendl;
+ return ret;
+ }
+
+ ret = init_ctl(dpp);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to init ctls (ret=" << cpp_strerror(-ret) << ")" << dendl;
+ return ret;
+ }
+
+ host_id = svc.zone_utils->gen_host_id();
+
+ return init_rados();
+}
+
+/**
+ * Open the pool used as root for this gateway
+ * Returns: 0 on success, -ERR# otherwise.
+ */
+int RGWRados::open_root_pool_ctx(const DoutPrefixProvider *dpp)
+{
+ return rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().domain_root, root_pool_ctx, true, true);
+}
+
+int RGWRados::open_gc_pool_ctx(const DoutPrefixProvider *dpp)
+{
+ return rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().gc_pool, gc_pool_ctx, true, true);
+}
+
+int RGWRados::open_lc_pool_ctx(const DoutPrefixProvider *dpp)
+{
+ return rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().lc_pool, lc_pool_ctx, true, true);
+}
+
+int RGWRados::open_objexp_pool_ctx(const DoutPrefixProvider *dpp)
+{
+ return rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().log_pool, objexp_pool_ctx, true, true);
+}
+
+int RGWRados::open_reshard_pool_ctx(const DoutPrefixProvider *dpp)
+{
+ return rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().reshard_pool, reshard_pool_ctx, true, true);
+}
+
+int RGWRados::open_notif_pool_ctx(const DoutPrefixProvider *dpp)
+{
+ return rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().notif_pool, notif_pool_ctx, true, true);
+}
+
+int RGWRados::open_pool_ctx(const DoutPrefixProvider *dpp, const rgw_pool& pool, librados::IoCtx& io_ctx,
+ bool mostly_omap, bool bulk)
+{
+ constexpr bool create = true; // create the pool if it doesn't exist
+ return rgw_init_ioctx(dpp, get_rados_handle(), pool, io_ctx, create, mostly_omap, bulk);
+}
+
+/**** logs ****/
+
+struct log_list_state {
+ string prefix;
+ librados::IoCtx io_ctx;
+ librados::NObjectIterator obit;
+};
+
+int RGWRados::log_list_init(const DoutPrefixProvider *dpp, const string& prefix, RGWAccessHandle *handle)
+{
+ log_list_state *state = new log_list_state;
+ int r = rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().log_pool, state->io_ctx);
+ if (r < 0) {
+ delete state;
+ return r;
+ }
+ try {
+ state->prefix = prefix;
+ state->obit = state->io_ctx.nobjects_begin();
+ *handle = (RGWAccessHandle)state;
+ return 0;
+ } catch (const std::system_error& e) {
+ r = -e.code().value();
+ ldpp_dout(dpp, 10) << "nobjects_begin threw " << e.what()
+ << ", returning " << r << dendl;
+ return r;
+ } catch (const std::exception& e) {
+ ldpp_dout(dpp, 10) << "nobjects_begin threw " << e.what()
+ << ", returning -5" << dendl;
+ return -EIO;
+ }
+}
+
+int RGWRados::log_list_next(RGWAccessHandle handle, string *name)
+{
+ log_list_state *state = static_cast<log_list_state *>(handle);
+ while (true) {
+ if (state->obit == state->io_ctx.nobjects_end()) {
+ delete state;
+ return -ENOENT;
+ }
+ if (state->prefix.length() &&
+ state->obit->get_oid().find(state->prefix) != 0) {
+ state->obit++;
+ continue;
+ }
+ *name = state->obit->get_oid();
+ state->obit++;
+ break;
+ }
+ return 0;
+}
+
+int RGWRados::log_remove(const DoutPrefixProvider *dpp, const string& name)
+{
+ librados::IoCtx io_ctx;
+ int r = rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().log_pool, io_ctx);
+ if (r < 0)
+ return r;
+ return io_ctx.remove(name);
+}
+
+struct log_show_state {
+ librados::IoCtx io_ctx;
+ bufferlist bl;
+ bufferlist::const_iterator p;
+ string name;
+ uint64_t pos;
+ bool eof;
+ log_show_state() : pos(0), eof(false) {}
+};
+
+int RGWRados::log_show_init(const DoutPrefixProvider *dpp, const string& name, RGWAccessHandle *handle)
+{
+ log_show_state *state = new log_show_state;
+ int r = rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().log_pool, state->io_ctx);
+ if (r < 0) {
+ delete state;
+ return r;
+ }
+ state->name = name;
+ *handle = (RGWAccessHandle)state;
+ return 0;
+}
+
+int RGWRados::log_show_next(const DoutPrefixProvider *dpp, RGWAccessHandle handle, rgw_log_entry *entry)
+{
+ log_show_state *state = static_cast<log_show_state *>(handle);
+ off_t off = state->p.get_off();
+
+ ldpp_dout(dpp, 10) << "log_show_next pos " << state->pos << " bl " << state->bl.length()
+ << " off " << off
+ << " eof " << (int)state->eof
+ << dendl;
+ // read some?
+ unsigned chunk = 1024*1024;
+ if ((state->bl.length() - off) < chunk/2 && !state->eof) {
+ bufferlist more;
+ int r = state->io_ctx.read(state->name, more, chunk, state->pos);
+ if (r < 0)
+ return r;
+ state->pos += r;
+ bufferlist old;
+ try {
+ old.substr_of(state->bl, off, state->bl.length() - off);
+ } catch (buffer::error& err) {
+ return -EINVAL;
+ }
+ state->bl = std::move(old);
+ state->bl.claim_append(more);
+ state->p = state->bl.cbegin();
+ if ((unsigned)r < chunk)
+ state->eof = true;
+ ldpp_dout(dpp, 10) << " read " << r << dendl;
+ }
+
+ if (state->p.end())
+ return 0; // end of file
+ try {
+ decode(*entry, state->p);
+ }
+ catch (const buffer::error &e) {
+ return -EINVAL;
+ }
+ return 1;
+}
+
+/**
+ * usage_log_hash: get usage log key hash, based on name and index
+ *
+ * Get the usage object name. Since a user may have more than one
+ * object holding that info (multiple shards), the index selects the
+ * shard. Once the index exceeds the maximum number of shards it
+ * wraps.
+ * If name is not set, results for all users are returned and the
+ * index wraps only after the total number of shards.
+ *
+ * @param cct [in] ceph context
+ * @param name [in] user name
+ * @param hash [out] hash value
+ * @param index [in] shard index number
+ */
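+/* Illustrative reading of the code below: with a non-empty name the shard is
+ * (index % rgw_usage_max_user_shards + hash(name)) % rgw_usage_max_shards;
+ * with an empty name it is simply index % rgw_usage_max_shards. The resulting
+ * oid is the usage object prefix followed by that shard number. */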
+static void usage_log_hash(CephContext *cct, const string& name, string& hash, uint32_t index)
+{
+ uint32_t val = index;
+
+ if (!name.empty()) {
+ int max_user_shards = cct->_conf->rgw_usage_max_user_shards;
+ val %= max_user_shards;
+ val += ceph_str_hash_linux(name.c_str(), name.size());
+ }
+ char buf[17];
+ int max_shards = cct->_conf->rgw_usage_max_shards;
+ snprintf(buf, sizeof(buf), RGW_USAGE_OBJ_PREFIX "%u", (unsigned)(val % max_shards));
+ hash = buf;
+}
+
+int RGWRados::log_usage(const DoutPrefixProvider *dpp, map<rgw_user_bucket, RGWUsageBatch>& usage_info)
+{
+ uint32_t index = 0;
+
+ map<string, rgw_usage_log_info> log_objs;
+
+ string hash;
+ string last_user;
+
+ /* restructure usage map, zone by object hash */
+ map<rgw_user_bucket, RGWUsageBatch>::iterator iter;
+ for (iter = usage_info.begin(); iter != usage_info.end(); ++iter) {
+ const rgw_user_bucket& ub = iter->first;
+ RGWUsageBatch& info = iter->second;
+
+ if (ub.user.empty()) {
+ ldpp_dout(dpp, 0) << "WARNING: RGWRados::log_usage(): user name empty (bucket=" << ub.bucket << "), skipping" << dendl;
+ continue;
+ }
+
+ if (ub.user != last_user) {
+ /* the index *should* be random, but in most cases the maximum
+ number of user shards will not exceed 1, so rather than waste
+ extra cycles we just increment it */
+ usage_log_hash(cct, ub.user, hash, index++);
+ }
+ last_user = ub.user;
+ vector<rgw_usage_log_entry>& v = log_objs[hash].entries;
+
+ for (auto miter = info.m.begin(); miter != info.m.end(); ++miter) {
+ v.push_back(miter->second);
+ }
+ }
+
+ map<string, rgw_usage_log_info>::iterator liter;
+
+ for (liter = log_objs.begin(); liter != log_objs.end(); ++liter) {
+ int r = cls_obj_usage_log_add(dpp, liter->first, liter->second);
+ if (r < 0)
+ return r;
+ }
+ return 0;
+}
+
+int RGWRados::read_usage(const DoutPrefixProvider *dpp, const rgw_user& user, const string& bucket_name, uint64_t start_epoch, uint64_t end_epoch,
+ uint32_t max_entries, bool *is_truncated, RGWUsageIter& usage_iter, map<rgw_user_bucket,
+ rgw_usage_log_entry>& usage)
+{
+ uint32_t num = max_entries;
+ string hash, first_hash;
+ string user_str = user.to_str();
+ usage_log_hash(cct, user_str, first_hash, 0);
+
+ if (usage_iter.index) {
+ usage_log_hash(cct, user_str, hash, usage_iter.index);
+ } else {
+ hash = first_hash;
+ }
+
+ usage.clear();
+
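+ /* walk the usage shards starting at first_hash; each pass advances the
+ * shard index (wrapping via usage_log_hash) and we stop once max_entries
+ * have been collected, a read comes back truncated, or we wrap back
+ * around to first_hash */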
+ do {
+ map<rgw_user_bucket, rgw_usage_log_entry> ret_usage;
+ map<rgw_user_bucket, rgw_usage_log_entry>::iterator iter;
+
+ int ret = cls_obj_usage_log_read(dpp, hash, user_str, bucket_name, start_epoch, end_epoch, num,
+ usage_iter.read_iter, ret_usage, is_truncated);
+ if (ret == -ENOENT)
+ goto next;
+
+ if (ret < 0)
+ return ret;
+
+ num -= ret_usage.size();
+
+ for (iter = ret_usage.begin(); iter != ret_usage.end(); ++iter) {
+ usage[iter->first].aggregate(iter->second);
+ }
+
+next:
+ if (!*is_truncated) {
+ usage_iter.read_iter.clear();
+ usage_log_hash(cct, user_str, hash, ++usage_iter.index);
+ }
+ } while (num && !*is_truncated && hash != first_hash);
+ return 0;
+}
+
+int RGWRados::trim_usage(const DoutPrefixProvider *dpp, const rgw_user& user, const string& bucket_name, uint64_t start_epoch, uint64_t end_epoch)
+{
+ uint32_t index = 0;
+ string hash, first_hash;
+ string user_str = user.to_str();
+ usage_log_hash(cct, user_str, first_hash, index);
+
+ hash = first_hash;
+ do {
+ int ret = cls_obj_usage_log_trim(dpp, hash, user_str, bucket_name, start_epoch, end_epoch);
+
+ if (ret < 0 && ret != -ENOENT)
+ return ret;
+
+ usage_log_hash(cct, user_str, hash, ++index);
+ } while (hash != first_hash);
+
+ return 0;
+}
+
+
+int RGWRados::clear_usage(const DoutPrefixProvider *dpp)
+{
+ auto max_shards = cct->_conf->rgw_usage_max_shards;
+ int ret=0;
+ for (unsigned i=0; i < max_shards; i++){
+ string oid = RGW_USAGE_OBJ_PREFIX + to_string(i);
+ ret = cls_obj_usage_log_clear(dpp, oid);
+ if (ret < 0){
+ ldpp_dout(dpp,0) << "usage clear on oid="<< oid << "failed with ret=" << ret << dendl;
+ return ret;
+ }
+ }
+ return ret;
+}
+
+int RGWRados::decode_policy(const DoutPrefixProvider *dpp,
+ ceph::buffer::list& bl,
+ ACLOwner *owner)
+{
+ auto i = bl.cbegin();
+ RGWAccessControlPolicy policy(cct);
+ try {
+ policy.decode_owner(i);
+ } catch (buffer::error& err) {
+ ldpp_dout(dpp, 0) << "ERROR: could not decode policy, caught buffer::error" << dendl;
+ return -EIO;
+ }
+ *owner = policy.get_owner();
+ return 0;
+}
+
+int RGWRados::Bucket::update_bucket_id(const string& new_bucket_id, const DoutPrefixProvider *dpp)
+{
+ rgw_bucket bucket = bucket_info.bucket;
+ bucket.update_bucket_id(new_bucket_id);
+
+ bucket_info.objv_tracker.clear();
+ int ret = store->get_bucket_instance_info(bucket, bucket_info, nullptr, nullptr, null_yield, dpp);
+ if (ret < 0) {
+ return ret;
+ }
+
+ return 0;
+}
+
+
+/**
+ * Get ordered listing of the objects in a bucket.
+ *
+ * max_p: maximum number of results to return
+ * bucket: bucket to list contents of
+ * prefix: only return results that match this prefix
+ * delim: results whose names contain this delimiter past the prefix
+ * are not returned individually; instead, the portion of their name
+ * up to and including the delimiter is inserted in common_prefixes
+ * with a "true" mark.
+ * marker: if filled in, begin the listing with this object.
+ * end_marker: if filled in, end the listing with this object.
+ * result: the objects are put in here.
+ * common_prefixes: if delim is filled in, any matching prefixes are
+ * placed here.
+ * is_truncated: set to true if there are more matching objects than
+ * were returned (i.e. the listing stopped at max).
+ */
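+/* Illustrative example (assumed S3-style semantics per the doc above): with
+ * prefix "photos/" and delim "/", an entry named "photos/2021/a.jpg" is not
+ * returned in result; instead common_prefixes gets "photos/2021/" = true. */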
+int RGWRados::Bucket::List::list_objects_ordered(
+ const DoutPrefixProvider *dpp,
+ int64_t max_p,
+ std::vector<rgw_bucket_dir_entry> *result,
+ std::map<std::string, bool> *common_prefixes,
+ bool *is_truncated,
+ optional_yield y)
+{
+ RGWRados *store = target->get_store();
+ CephContext *cct = store->ctx();
+ int shard_id = target->get_shard_id();
+ const auto& current_index = target->get_bucket_info().layout.current_index;
+
+ int count = 0;
+ bool truncated = true;
+ bool cls_filtered = false;
+ const int64_t max = // protect against memory issues and negative vals
+ std::min(bucket_list_objects_absolute_max, std::max(int64_t(0), max_p));
+ int read_ahead = std::max(cct->_conf->rgw_list_bucket_min_readahead, max);
+
+ result->clear();
+
+ // use a local marker; either the marker will have a previous entry
+ // or it will be empty; either way it's OK to copy
+ rgw_obj_key marker_obj(params.marker.name,
+ params.marker.instance,
+ params.ns.empty() ? params.marker.ns : params.ns);
+ rgw_obj_index_key cur_marker;
+ marker_obj.get_index_key(&cur_marker);
+
+ rgw_obj_key end_marker_obj(params.end_marker.name,
+ params.end_marker.instance,
+ params.ns.empty() ? params.end_marker.ns : params.ns);
+ rgw_obj_index_key cur_end_marker;
+ end_marker_obj.get_index_key(&cur_end_marker);
+ const bool cur_end_marker_valid = !params.end_marker.empty();
+
+ rgw_obj_key prefix_obj(params.prefix);
+ prefix_obj.set_ns(params.ns);
+ std::string cur_prefix = prefix_obj.get_index_key_name();
+ std::string after_delim_s; /* needed in !params.delim.empty() AND later */
+
+ if (!params.delim.empty()) {
+ after_delim_s = cls_rgw_after_delim(params.delim);
+ /* if marker points at a common prefix, fast forward it into its
+ * upper bound string */
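+ /* e.g. with an empty prefix and delim "/", a marker of "photos/2021/x.jpg"
+ * is advanced past the common prefix "photos/" to its upper-bound string
+ * (as produced by cls_rgw_after_delim), so listing resumes after every key
+ * under that prefix */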
+ int delim_pos = cur_marker.name.find(params.delim, cur_prefix.size());
+ if (delim_pos >= 0) {
+ string s = cur_marker.name.substr(0, delim_pos);
+ s.append(after_delim_s);
+ cur_marker = s;
+ }
+ }
+
+ // we'll stop after this many attempts as long as we can return at
+ // least one entry; if we still have no entries we'll keep going
+ // beyond this number of attempts
+ constexpr uint16_t SOFT_MAX_ATTEMPTS = 8;
+
+ rgw_obj_index_key prev_marker;
+ for (uint16_t attempt = 1; /* empty */; ++attempt) {
+ ldpp_dout(dpp, 20) << __func__ <<
+ ": starting attempt " << attempt << dendl;
+
+ if (attempt > 1 && !(prev_marker < cur_marker)) {
+ // we've failed to make forward progress
+ ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
+ " marker failed to make forward progress; attempt=" << attempt <<
+ ", prev_marker=" << prev_marker <<
+ ", cur_marker=" << cur_marker << dendl;
+ break;
+ }
+ prev_marker = cur_marker;
+
+ ent_map_t ent_map;
+ ent_map.reserve(read_ahead);
+ int r = store->cls_bucket_list_ordered(dpp,
+ target->get_bucket_info(),
+ current_index,
+ shard_id,
+ cur_marker,
+ cur_prefix,
+ params.delim,
+ read_ahead + 1 - count,
+ params.list_versions,
+ attempt,
+ ent_map,
+ &truncated,
+ &cls_filtered,
+ &cur_marker,
+ y,
+ params.force_check_filter);
+ if (r < 0) {
+ return r;
+ }
+
+ for (auto eiter = ent_map.begin(); eiter != ent_map.end(); ++eiter) {
+ rgw_bucket_dir_entry& entry = eiter->second;
+ rgw_obj_index_key index_key = entry.key;
+ rgw_obj_key obj(index_key);
+
+ ldpp_dout(dpp, 20) << __func__ <<
+ ": considering entry " << entry.key << dendl;
+
+ /* note that parse_raw_oid() here will not set the correct
+ * object's instance, as rgw_obj_index_key encodes that
+ * separately. We don't need to set the instance because it's
+ * not needed for the checks here and we end up using the raw
+ * entry for the return vector
+ */
+ bool valid = rgw_obj_key::parse_raw_oid(index_key.name, &obj);
+ if (!valid) {
+ ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
+ " could not parse object name: " << obj.name << dendl;
+ continue;
+ }
+
+ bool matched_ns = (obj.ns == params.ns);
+ if (!params.list_versions && !entry.is_visible()) {
+ ldpp_dout(dpp, 10) << __func__ <<
+ ": skipping not visible entry \"" << entry.key << "\"" << dendl;
+ continue;
+ }
+
+ if (params.enforce_ns && !matched_ns) {
+ if (!params.ns.empty()) {
+ /* we've iterated past the namespace we're searching -- done now */
+ truncated = false;
+ ldpp_dout(dpp, 10) << __func__ <<
+ ": finished due to getting past requested namespace \"" <<
+ params.ns << "\"" << dendl;
+ goto done;
+ }
+
+ /* we're skipping past namespaced objects */
+ ldpp_dout(dpp, 20) << __func__ <<
+ ": skipping past namespaced objects, including \"" << entry.key <<
+ "\"" << dendl;
+ continue;
+ }
+
+ if (cur_end_marker_valid && cur_end_marker <= index_key) {
+ truncated = false;
+ ldpp_dout(dpp, 10) << __func__ <<
+ ": finished due to gitting end marker of \"" << cur_end_marker <<
+ "\" with \"" << entry.key << "\"" << dendl;
+ goto done;
+ }
+
+ if (count < max) {
+ params.marker = index_key;
+ next_marker = index_key;
+ }
+
+ if (params.access_list_filter &&
+ ! params.access_list_filter->filter(obj.name, index_key.name)) {
+ ldpp_dout(dpp, 20) << __func__ <<
+ ": skipping past namespaced objects, including \"" << entry.key <<
+ "\"" << dendl;
+ continue;
+ }
+
+ if (params.prefix.size() &&
+ 0 != obj.name.compare(0, params.prefix.size(), params.prefix)) {
+ ldpp_dout(dpp, 20) << __func__ <<
+ ": skipping object \"" << entry.key <<
+ "\" that doesn't match prefix \"" << params.prefix << "\"" << dendl;
+ continue;
+ }
+
+ if (!params.delim.empty()) {
+ const int delim_pos = obj.name.find(params.delim, params.prefix.size());
+ if (delim_pos >= 0) {
+ // run either the code where delimiter filtering is done a)
+ // in the OSD/CLS or b) here.
+ if (cls_filtered) {
+ // NOTE: this branch is for newer versions of the OSD that do the
+ // filtering on the CLS side; they should only find one delimiter,
+ // and only at the end, if they find any after the prefix
+ if (delim_pos !=
+ int(obj.name.length() - params.delim.length())) {
+ ldpp_dout(dpp, 0) << "WARNING: " << __func__ <<
+ " found delimiter in place other than the end of "
+ "the prefix; obj.name=" << obj.name <<
+ ", prefix=" << params.prefix << dendl;
+ }
+ if (common_prefixes) {
+ if (count >= max) {
+ truncated = true;
+ ldpp_dout(dpp, 10) << __func__ <<
+ ": stopping early with common prefix \"" << entry.key <<
+ "\" because requested number (" << max <<
+ ") reached (cls filtered)" << dendl;
+ goto done;
+ }
+
+ (*common_prefixes)[obj.name] = true;
+ count++;
+ }
+
+ ldpp_dout(dpp, 20) << __func__ <<
+ ": finished entry with common prefix \"" << entry.key <<
+ "\" so continuing loop (cls filtered)" << dendl;
+ continue;
+ } else {
+ // NOTE: this condition is for older versions of the OSD
+ // that do not filter on the CLS side, so the following code
+ // must do the filtering; once we reach version 16 of ceph,
+ // this code can be removed along with the conditional that
+ // can lead this way
+
+ /* extract key -with trailing delimiter- for CommonPrefix */
+ string prefix_key =
+ obj.name.substr(0, delim_pos + params.delim.length());
+
+ if (common_prefixes &&
+ common_prefixes->find(prefix_key) == common_prefixes->end()) {
+ if (count >= max) {
+ truncated = true;
+ ldpp_dout(dpp, 10) << __func__ <<
+ ": stopping early with common prefix \"" << entry.key <<
+ "\" because requested number (" << max <<
+ ") reached (not cls filtered)" << dendl;
+ goto done;
+ }
+ next_marker = prefix_key;
+ (*common_prefixes)[prefix_key] = true;
+
+ count++;
+ }
+
+ ldpp_dout(dpp, 20) << __func__ <<
+ ": finished entry with common prefix \"" << entry.key <<
+ "\" so continuing loop (not cls filtered)" << dendl;
+ continue;
+ } // if we're running an older OSD version
+ } // if a delimiter was found after prefix
+ } // if a delimiter was passed in
+
+ if (count >= max) {
+ truncated = true;
+ ldpp_dout(dpp, 10) << __func__ <<
+ ": stopping early with entry \"" << entry.key <<
+ "\" because requested number (" << max <<
+ ") reached" << dendl;
+ goto done;
+ }
+
+ ldpp_dout(dpp, 20) << __func__ <<
+ ": adding entry " << entry.key << " to result" << dendl;
+
+ result->emplace_back(std::move(entry));
+ count++;
+ } // eiter for loop
+
+ // NOTE: the following conditional is needed by older versions of
+ // the OSD that don't do delimiter filtering on the CLS side; once
+ // we reach version 16 of ceph, the following conditional and the
+ // code within can be removed
+ if (!cls_filtered && !params.delim.empty()) {
+ int marker_delim_pos =
+ cur_marker.name.find(params.delim, cur_prefix.size());
+ if (marker_delim_pos >= 0) {
+ std::string skip_after_delim =
+ cur_marker.name.substr(0, marker_delim_pos);
+ skip_after_delim.append(after_delim_s);
+
+ ldpp_dout(dpp, 20) << __func__ <<
+ ": skip_after_delim=" << skip_after_delim << dendl;
+
+ if (skip_after_delim > cur_marker.name) {
+ cur_marker = skip_after_delim;
+ ldpp_dout(dpp, 20) << __func__ <<
+ ": setting cur_marker=" << cur_marker.name <<
+ "[" << cur_marker.instance << "]" << dendl;
+ }
+ }
+ } // if older osd didn't do delimiter filtering
+
+ ldpp_dout(dpp, 10) << __func__ <<
+ ": end of outer loop, truncated=" << truncated <<
+ ", count=" << count << ", attempt=" << attempt << dendl;
+
+ if (!truncated || count >= (max + 1) / 2) {
+ // if we finished listing, or if we're returning at least half the
+ // requested entries, that's enough; S3 and swift protocols allow
+ // returning fewer than max entries
+ ldpp_dout(dpp, 10) << __func__ <<
+ ": exiting attempt loop because we reached end (" << truncated <<
+ ") or we're returning half the requested entries (" << count <<
+ " of " << max << ")" << dendl;
+ break;
+ } else if (attempt > SOFT_MAX_ATTEMPTS && count >= 1) {
+ // if we've made at least 8 attempts and we have some, but very
+ // few, results, return with what we have
+ ldpp_dout(dpp, 10) << __func__ <<
+ ": exiting attempt loop because we made " << attempt <<
+ " attempts and we're returning " << count << " entries" << dendl;
+ break;
+ }
+ } // for (uint16_t attempt...
+
+done:
+
+ if (is_truncated) {
+ *is_truncated = truncated;
+ }
+
+ return 0;
+} // list_objects_ordered
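+/* Illustrative caller-side sketch (the same pattern appears in
+ * on_last_entry_in_listing() further down in this file):
+ *
+ *   RGWRados::Bucket target(store, bucket_info);
+ *   RGWRados::Bucket::List list_op(&target);
+ *   list_op.params.prefix = "photos/";
+ *   list_op.params.delim = "/";
+ *   std::vector<rgw_bucket_dir_entry> entries;
+ *   std::map<std::string, bool> prefixes;
+ *   bool truncated = false;
+ *   int r = list_op.list_objects(dpp, 1000, &entries, &prefixes, &truncated, y);
+ */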
+
+
+/**
+ * Get listing of the objects in a bucket and allow the results to be out
+ * of order.
+ *
+ * Even though there are key differences with the ordered counterpart,
+ * the parameters are the same to maintain some compatibility.
+ *
+ * max: maximum number of results to return
+ * bucket: bucket to list contents of
+ * prefix: only return results that match this prefix
+ * delim: should not be set; if it is, an error should already have been indicated
+ * marker: if filled in, begin the listing with this object.
+ * end_marker: if filled in, end the listing with this object.
+ * result: the objects are put in here.
+ * common_prefixes: this is never filled with an unordered list; the param
+ * is maintained for compatibility
+ * is_truncated: set to true if there are more matching objects than were
+ * returned (i.e. the listing stopped at max).
+ */
+int RGWRados::Bucket::List::list_objects_unordered(const DoutPrefixProvider *dpp,
+ int64_t max_p,
+ std::vector<rgw_bucket_dir_entry>* result,
+ std::map<std::string, bool>* common_prefixes,
+ bool* is_truncated,
+ optional_yield y)
+{
+ RGWRados *store = target->get_store();
+ int shard_id = target->get_shard_id();
+ const auto& current_index = target->get_bucket_info().layout.current_index;
+
+ int count = 0;
+ bool truncated = true;
+
+ const int64_t max = // protect against memory issues and negative vals
+ std::min(bucket_list_objects_absolute_max, std::max(int64_t(0), max_p));
+
+ // read a few extra in each call to cls_bucket_list_unordered in
+ // case some are filtered out due to namespace matching, versioning,
+ // filtering, etc.
+ const int64_t max_read_ahead = 100;
+ const uint32_t read_ahead = uint32_t(max + std::min(max, max_read_ahead));
+
+ result->clear();
+
+ // use a local marker; either the marker will have a previous entry
+ // or it will be empty; either way it's OK to copy
+ rgw_obj_key marker_obj(params.marker.name,
+ params.marker.instance,
+ params.ns.empty() ? params.marker.ns : params.ns);
+ rgw_obj_index_key cur_marker;
+ marker_obj.get_index_key(&cur_marker);
+
+ rgw_obj_key end_marker_obj(params.end_marker.name,
+ params.end_marker.instance,
+ params.ns.empty() ? params.end_marker.ns : params.ns);
+ rgw_obj_index_key cur_end_marker;
+ end_marker_obj.get_index_key(&cur_end_marker);
+ const bool cur_end_marker_valid = !params.end_marker.empty();
+
+ rgw_obj_key prefix_obj(params.prefix);
+ prefix_obj.set_ns(params.ns);
+ std::string cur_prefix = prefix_obj.get_index_key_name();
+
+ while (truncated && count <= max) {
+ std::vector<rgw_bucket_dir_entry> ent_list;
+ ent_list.reserve(read_ahead);
+
+ int r = store->cls_bucket_list_unordered(dpp,
+ target->get_bucket_info(),
+ current_index,
+ shard_id,
+ cur_marker,
+ cur_prefix,
+ read_ahead,
+ params.list_versions,
+ ent_list,
+ &truncated,
+ &cur_marker,
+ y);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
+ " cls_bucket_list_unordered returned " << r << " for " <<
+ target->get_bucket_info().bucket << dendl;
+ return r;
+ }
+
+ // NB: while regions of ent_list will be sorted, we have no
+ // guarantee that all items will be sorted since they can cross
+ // shard boundaries
+
+ for (auto& entry : ent_list) {
+ rgw_obj_index_key index_key = entry.key;
+ rgw_obj_key obj(index_key);
+
+ if (count < max) {
+ params.marker.set(index_key);
+ next_marker.set(index_key);
+ }
+
+ /* note that parse_raw_oid() here will not set the correct
+ * object's instance, as rgw_obj_index_key encodes that
+ * separately. We don't need to set the instance because it's
+ * not needed for the checks here and we end up using the raw
+ * entry for the return vector
+ */
+ bool valid = rgw_obj_key::parse_raw_oid(index_key.name, &obj);
+ if (!valid) {
+ ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
+ " could not parse object name: " << obj.name << dendl;
+ continue;
+ }
+
+ if (!params.list_versions && !entry.is_visible()) {
+ ldpp_dout(dpp, 20) << __func__ <<
+ ": skippping \"" << index_key <<
+ "\" because not listing versions and entry not visibile" << dendl;
+ continue;
+ }
+
+ if (params.enforce_ns && obj.ns != params.ns) {
+ ldpp_dout(dpp, 20) << __func__ <<
+ ": skippping \"" << index_key <<
+ "\" because namespace does not match" << dendl;
+ continue;
+ }
+
+ if (cur_end_marker_valid && cur_end_marker <= index_key) {
+ // we're not guaranteed items will come in order, so we have
+ // to loop through all
+ ldpp_dout(dpp, 20) << __func__ <<
+ ": skippping \"" << index_key <<
+ "\" because after end_marker" << dendl;
+ continue;
+ }
+
+ if (params.access_list_filter &&
+ !params.access_list_filter->filter(obj.name, index_key.name)) {
+ ldpp_dout(dpp, 20) << __func__ <<
+ ": skippping \"" << index_key <<
+ "\" because doesn't match filter" << dendl;
+ continue;
+ }
+
+ if (params.prefix.size() &&
+ (0 != obj.name.compare(0, params.prefix.size(), params.prefix))) {
+ ldpp_dout(dpp, 20) << __func__ <<
+ ": skippping \"" << index_key <<
+ "\" because doesn't match prefix" << dendl;
+ continue;
+ }
+
+ if (count >= max) {
+ truncated = true;
+ goto done;
+ }
+
+ result->emplace_back(std::move(entry));
+ count++;
+ } // for (auto& entry : ent_list)
+ } // while (truncated && count <= max)
+
+done:
+
+ if (is_truncated) {
+ *is_truncated = truncated;
+ }
+
+ return 0;
+} // list_objects_unordered
+
+
+/**
+ * create a rados pool and its associated meta info
+ * returns 0 on success, -ERR# otherwise.
+ */
+int RGWRados::create_pool(const DoutPrefixProvider *dpp, const rgw_pool& pool)
+{
+ librados::IoCtx io_ctx;
+ constexpr bool create = true;
+ return rgw_init_ioctx(dpp, get_rados_handle(), pool, io_ctx, create);
+}
+
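+// The generated id has the form "<zone id>.<instance id>.<next bucket id>",
+// i.e. three dot-separated components; create_bucket() below uses it for both
+// bucket.marker and bucket.bucket_id when no master bucket is supplied.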
+void RGWRados::create_bucket_id(string *bucket_id)
+{
+ uint64_t iid = instance_id();
+ uint64_t bid = next_bucket_id();
+ char buf[svc.zone->get_zone_params().get_id().size() + 48];
+ snprintf(buf, sizeof(buf), "%s.%" PRIu64 ".%" PRIu64,
+ svc.zone->get_zone_params().get_id().c_str(), iid, bid);
+ *bucket_id = buf;
+}
+
+int RGWRados::create_bucket(const RGWUserInfo& owner, rgw_bucket& bucket,
+ const string& zonegroup_id,
+ const rgw_placement_rule& placement_rule,
+ const string& swift_ver_location,
+ const RGWQuotaInfo * pquota_info,
+ map<std::string, bufferlist>& attrs,
+ RGWBucketInfo& info,
+ obj_version *pobjv,
+ obj_version *pep_objv,
+ real_time creation_time,
+ rgw_bucket *pmaster_bucket,
+ uint32_t *pmaster_num_shards,
+ optional_yield y,
+ const DoutPrefixProvider *dpp,
+ bool exclusive)
+{
+#define MAX_CREATE_RETRIES 20 /* need to bound retries */
+ rgw_placement_rule selected_placement_rule;
+ RGWZonePlacementInfo rule_info;
+
+ for (int i = 0; i < MAX_CREATE_RETRIES; i++) {
+ int ret = 0;
+ ret = svc.zone->select_bucket_placement(dpp, owner, zonegroup_id, placement_rule,
+ &selected_placement_rule, &rule_info, y);
+ if (ret < 0)
+ return ret;
+
+ if (!pmaster_bucket) {
+ create_bucket_id(&bucket.marker);
+ bucket.bucket_id = bucket.marker;
+ } else {
+ bucket.marker = pmaster_bucket->marker;
+ bucket.bucket_id = pmaster_bucket->bucket_id;
+ }
+
+ RGWObjVersionTracker& objv_tracker = info.objv_tracker;
+
+ objv_tracker.read_version.clear();
+
+ if (pobjv) {
+ objv_tracker.write_version = *pobjv;
+ } else {
+ objv_tracker.generate_new_write_ver(cct);
+ }
+
+ info.bucket = bucket;
+ info.owner = owner.user_id;
+ info.zonegroup = zonegroup_id;
+ info.placement_rule = selected_placement_rule;
+ info.swift_ver_location = swift_ver_location;
+ info.swift_versioning = (!swift_ver_location.empty());
+
+ init_default_bucket_layout(cct, info.layout, svc.zone->get_zone(),
+ pmaster_num_shards ?
+ std::optional{*pmaster_num_shards} :
+ std::nullopt,
+ rule_info.index_type);
+
+ info.requester_pays = false;
+ if (real_clock::is_zero(creation_time)) {
+ info.creation_time = ceph::real_clock::now();
+ } else {
+ info.creation_time = creation_time;
+ }
+ if (pquota_info) {
+ info.quota = *pquota_info;
+ }
+
+ int r = svc.bi->init_index(dpp, info, info.layout.current_index);
+ if (r < 0) {
+ return r;
+ }
+
+ ret = put_linked_bucket_info(info, exclusive, ceph::real_time(), pep_objv, &attrs, true, dpp, y);
+ if (ret == -ECANCELED) {
+ ret = -EEXIST;
+ }
+ if (ret == -EEXIST) {
+ /* we need to reread the info and return it, caller will have a use for it */
+ RGWBucketInfo orig_info;
+ r = get_bucket_info(&svc, bucket.tenant, bucket.name, orig_info, NULL, null_yield, NULL);
+ if (r < 0) {
+ if (r == -ENOENT) {
+ continue;
+ }
+ ldpp_dout(dpp, 0) << "get_bucket_info returned " << r << dendl;
+ return r;
+ }
+
+ /* only remove it if it's a different bucket instance */
+ if (orig_info.bucket.bucket_id != bucket.bucket_id) {
+ int r = svc.bi->clean_index(dpp, info, info.layout.current_index);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "WARNING: could not remove bucket index (r=" << r << ")" << dendl;
+ }
+ r = ctl.bucket->remove_bucket_instance_info(info.bucket, info, null_yield, dpp);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "WARNING: " << __func__ << "(): failed to remove bucket instance info: bucket instance=" << info.bucket.get_key() << ": r=" << r << dendl;
+ /* continue anyway */
+ }
+ }
+
+ info = std::move(orig_info);
+ /* ret == -EEXIST here */
+ }
+ return ret;
+ }
+
+ /* this is highly unlikely */
+ ldpp_dout(dpp, 0) << "ERROR: could not create bucket, continuously raced with bucket creation and removal" << dendl;
+ return -ENOENT;
+}
+
+bool RGWRados::obj_to_raw(const rgw_placement_rule& placement_rule, const rgw_obj& obj, rgw_raw_obj *raw_obj)
+{
+ get_obj_bucket_and_oid_loc(obj, raw_obj->oid, raw_obj->loc);
+
+ return get_obj_data_pool(placement_rule, obj, &raw_obj->pool);
+}
+
+std::string RGWRados::get_cluster_fsid(const DoutPrefixProvider *dpp, optional_yield y)
+{
+ return svc.rados->cluster_fsid();
+}
+
+int RGWRados::get_obj_head_ioctx(const DoutPrefixProvider *dpp,
+ const RGWBucketInfo& bucket_info,
+ const rgw_obj& obj,
+ librados::IoCtx *ioctx)
+{
+ std::string oid, key;
+ get_obj_bucket_and_oid_loc(obj, oid, key);
+
+ rgw_pool pool;
+ if (!get_obj_data_pool(bucket_info.placement_rule, obj, &pool)) {
+ ldpp_dout(dpp, 0) << "ERROR: cannot get data pool for obj=" << obj <<
+ ", probably misconfiguration" << dendl;
+ return -EIO;
+ }
+
+ int r = open_pool_ctx(dpp, pool, *ioctx, false, true);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: unable to open data-pool=" << pool.to_str() <<
+ " for obj=" << obj << " with error-code=" << r << dendl;
+ return r;
+ }
+
+ ioctx->locator_set_key(key);
+
+ return 0;
+}
+
+int RGWRados::get_obj_head_ref(const DoutPrefixProvider *dpp,
+ const rgw_placement_rule& target_placement_rule,
+ const rgw_obj& obj,
+ rgw_rados_ref *ref)
+{
+ get_obj_bucket_and_oid_loc(obj, ref->obj.oid, ref->obj.loc);
+
+ rgw_pool pool;
+ if (!get_obj_data_pool(target_placement_rule, obj, &pool)) {
+ ldpp_dout(dpp, 0) << "ERROR: cannot get data pool for obj=" << obj << ", probably misconfiguration" << dendl;
+ return -EIO;
+ }
+
+ ref->pool = svc.rados->pool(pool);
+
+ int r = ref->pool.open(dpp, RGWSI_RADOS::OpenParams()
+ .set_mostly_omap(false));
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed opening data pool (pool=" << pool << "); r=" << r << dendl;
+ return r;
+ }
+
+ ref->pool.ioctx().locator_set_key(ref->obj.loc);
+
+ return 0;
+}
+
+int RGWRados::get_obj_head_ref(const DoutPrefixProvider *dpp,
+ const RGWBucketInfo& bucket_info,
+ const rgw_obj& obj,
+ rgw_rados_ref *ref)
+{
+ return get_obj_head_ref(dpp, bucket_info.placement_rule, obj, ref);
+}
+
+int RGWRados::get_raw_obj_ref(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, rgw_rados_ref *ref)
+{
+ ref->obj = obj;
+
+ if (ref->obj.oid.empty()) {
+ ref->obj.oid = obj.pool.to_str();
+ ref->obj.pool = svc.zone->get_zone_params().domain_root;
+ }
+ ref->pool = svc.rados->pool(obj.pool);
+ int r = ref->pool.open(dpp, RGWSI_RADOS::OpenParams()
+ .set_mostly_omap(false));
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed opening pool (pool=" << obj.pool << "); r=" << r << dendl;
+ return r;
+ }
+
+ ref->pool.ioctx().locator_set_key(ref->obj.loc);
+
+ return 0;
+}
+
+int RGWRados::get_system_obj_ref(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, rgw_rados_ref *ref)
+{
+ return get_raw_obj_ref(dpp, obj, ref);
+}
+
+/*
+ * fixes an issue where head objects were supposed to have a locator created, but ended
+ * up without one
+ */
+int RGWRados::fix_head_obj_locator(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, bool copy_obj, bool remove_bad, rgw_obj_key& key)
+{
+ const rgw_bucket& bucket = bucket_info.bucket;
+ string oid;
+ string locator;
+
+ rgw_obj obj(bucket, key);
+
+ get_obj_bucket_and_oid_loc(obj, oid, locator);
+
+ if (locator.empty()) {
+ ldpp_dout(dpp, 20) << "object does not have a locator, nothing to fix" << dendl;
+ return 0;
+ }
+
+ librados::IoCtx ioctx;
+
+ int ret = get_obj_head_ioctx(dpp, bucket_info, obj, &ioctx);
+ if (ret < 0) {
+ cerr << "ERROR: get_obj_head_ioctx() returned ret=" << ret << std::endl;
+ return ret;
+ }
+ ioctx.locator_set_key(string()); /* override locator for this object, use empty locator */
+
+ uint64_t size;
+ bufferlist data;
+
+ struct timespec mtime_ts;
+ map<string, bufferlist> attrs;
+ librados::ObjectReadOperation op;
+ op.getxattrs(&attrs, NULL);
+ op.stat2(&size, &mtime_ts, NULL);
+#define HEAD_SIZE (512 * 1024)
+ op.read(0, HEAD_SIZE, &data, NULL);
+
+ ret = rgw_rados_operate(dpp, ioctx, oid, &op, &data, null_yield);
+ if (ret < 0) {
+ ldpp_dout(dpp, -1) << "ERROR: rgw_rados_operate(oid=" << oid << ") returned ret=" << ret << dendl;
+ return ret;
+ }
+
+ if (size > HEAD_SIZE) {
+ ldpp_dout(dpp, -1) << "ERROR: returned object size (" << size << ") > HEAD_SIZE (" << HEAD_SIZE << ")" << dendl;
+ return -EIO;
+ }
+
+ if (size != data.length()) {
+ ldpp_dout(dpp, -1) << "ERROR: returned object size (" << size << ") != data.length() (" << data.length() << ")" << dendl;
+ return -EIO;
+ }
+
+ if (copy_obj) {
+ librados::ObjectWriteOperation wop;
+
+ wop.mtime2(&mtime_ts);
+
+ map<string, bufferlist>::iterator iter;
+ for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
+ wop.setxattr(iter->first.c_str(), iter->second);
+ }
+
+ wop.write(0, data);
+
+ ioctx.locator_set_key(locator);
+ rgw_rados_operate(dpp, ioctx, oid, &wop, null_yield);
+ }
+
+ if (remove_bad) {
+ ioctx.locator_set_key(string());
+
+ ret = ioctx.remove(oid);
+ if (ret < 0) {
+ ldpp_dout(dpp, -1) << "ERROR: failed to remove original bad object" << dendl;
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+int RGWRados::move_rados_obj(const DoutPrefixProvider *dpp,
+ librados::IoCtx& src_ioctx,
+ const string& src_oid, const string& src_locator,
+ librados::IoCtx& dst_ioctx,
+ const string& dst_oid, const string& dst_locator)
+{
+
+#define COPY_BUF_SIZE (4 * 1024 * 1024)
+ bool done = false;
+ uint64_t chunk_size = COPY_BUF_SIZE;
+ uint64_t ofs = 0;
+ int ret = 0;
+ real_time mtime;
+ struct timespec mtime_ts;
+ uint64_t size;
+
+ if (src_oid == dst_oid && src_locator == dst_locator) {
+ return 0;
+ }
+
+ src_ioctx.locator_set_key(src_locator);
+ dst_ioctx.locator_set_key(dst_locator);
+
+ do {
+ bufferlist data;
+ ObjectReadOperation rop;
+ ObjectWriteOperation wop;
+
+ if (ofs == 0) {
+ rop.stat2(&size, &mtime_ts, NULL);
+ mtime = real_clock::from_timespec(mtime_ts);
+ }
+ rop.read(ofs, chunk_size, &data, NULL);
+ ret = rgw_rados_operate(dpp, src_ioctx, src_oid, &rop, &data, null_yield);
+ if (ret < 0) {
+ goto done_err;
+ }
+
+ if (data.length() == 0) {
+ break;
+ }
+
+ if (ofs == 0) {
+ wop.create(true); /* make it exclusive */
+ wop.mtime2(&mtime_ts);
+ mtime = real_clock::from_timespec(mtime_ts);
+ }
+ wop.write(ofs, data);
+ ret = rgw_rados_operate(dpp, dst_ioctx, dst_oid, &wop, null_yield);
+ if (ret < 0) {
+ goto done_err;
+ }
+ ofs += data.length();
+ done = data.length() != chunk_size;
+ } while (!done);
+
+ if (ofs != size) {
+ ldpp_dout(dpp, -1) << "ERROR: " << __func__ << ": copying " << src_oid << " -> " << dst_oid
+ << ": expected " << size << " bytes to copy, ended up with " << ofs << dendl;
+ ret = -EIO;
+ goto done_err;
+ }
+
+ src_ioctx.remove(src_oid);
+
+ return 0;
+
+done_err:
+ // TODO: clean up dst_oid if we created it
+ ldpp_dout(dpp, -1) << "ERROR: failed to copy " << src_oid << " -> " << dst_oid << dendl;
+ return ret;
+}
+
+/*
+ * fixes an issue where tail objects were supposed to be written under a specific
+ * locator, but ended up under a bad one
+ */
+int RGWRados::fix_tail_obj_locator(const DoutPrefixProvider *dpp,
+ RGWBucketInfo& bucket_info, rgw_obj_key& key,
+ bool fix, bool *need_fix, optional_yield y)
+{
+ const rgw_bucket& bucket = bucket_info.bucket;
+ rgw_obj obj(bucket, key);
+
+ if (need_fix) {
+ *need_fix = false;
+ }
+
+ rgw_rados_ref ref;
+ int r = get_obj_head_ref(dpp, bucket_info, obj, &ref);
+ if (r < 0) {
+ return r;
+ }
+
+ RGWObjState *astate = nullptr;
+ RGWObjManifest* manifest = nullptr;
+ RGWObjectCtx rctx(this->driver);
+ r = get_obj_state(dpp, &rctx, bucket_info, obj, &astate, &manifest, false, y);
+ if (r < 0)
+ return r;
+
+ if (manifest) {
+ RGWObjManifest::obj_iterator miter;
+ for (miter = manifest->obj_begin(dpp); miter != manifest->obj_end(dpp); ++miter) {
+ rgw_raw_obj raw_loc = miter.get_location().get_raw_obj(this);
+ rgw_obj loc;
+ string oid;
+ string locator;
+
+ RGWSI_Tier_RADOS::raw_obj_to_obj(manifest->get_tail_placement().bucket, raw_loc, &loc);
+
+ if (loc.key.ns.empty()) {
+ /* continue, we're only interested in tail objects */
+ continue;
+ }
+
+ auto& ioctx = ref.pool.ioctx();
+
+ get_obj_bucket_and_oid_loc(loc, oid, locator);
+ ref.pool.ioctx().locator_set_key(locator);
+
+ ldpp_dout(dpp, 20) << __func__ << ": key=" << key << " oid=" << oid << " locator=" << locator << dendl;
+
+ r = ioctx.stat(oid, NULL, NULL);
+ if (r != -ENOENT) {
+ continue;
+ }
+
+ string bad_loc;
+ prepend_bucket_marker(bucket, loc.key.name, bad_loc);
+
+ /* create a new ioctx with the bad locator */
+ librados::IoCtx src_ioctx;
+ src_ioctx.dup(ioctx);
+ src_ioctx.locator_set_key(bad_loc);
+
+ r = src_ioctx.stat(oid, NULL, NULL);
+ if (r != 0) {
+ /* cannot find a broken part */
+ continue;
+ }
+ ldpp_dout(dpp, 20) << __func__ << ": found bad object part: " << loc << dendl;
+ if (need_fix) {
+ *need_fix = true;
+ }
+ if (fix) {
+ r = move_rados_obj(dpp, src_ioctx, oid, bad_loc, ioctx, oid, locator);
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << "ERROR: copy_rados_obj() on oid=" << oid << " returned r=" << r << dendl;
+ }
+ }
+ }
+ }
+
+ return 0;
+}
+
+int RGWRados::BucketShard::init(const rgw_bucket& _bucket,
+ const rgw_obj& obj,
+ RGWBucketInfo* bucket_info_out,
+ const DoutPrefixProvider *dpp)
+{
+ bucket = _bucket;
+
+ RGWBucketInfo bucket_info;
+ RGWBucketInfo* bucket_info_p =
+ bucket_info_out ? bucket_info_out : &bucket_info;
+
+ int ret = store->get_bucket_instance_info(bucket, *bucket_info_p, NULL, NULL, null_yield, dpp);
+ if (ret < 0) {
+ return ret;
+ }
+
+ string oid;
+
+ ret = store->svc.bi_rados->open_bucket_index_shard(dpp, *bucket_info_p, obj.get_hash_object(), &bucket_obj, &shard_id);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl;
+ return ret;
+ }
+ ldpp_dout(dpp, 20) << " bucket index object: " << bucket_obj.get_raw_obj() << dendl;
+
+ return 0;
+}
+
+int RGWRados::BucketShard::init(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info,
+ const rgw_obj& obj)
+{
+ bucket = bucket_info.bucket;
+
+ int ret = store->svc.bi_rados->open_bucket_index_shard(dpp, bucket_info,
+ obj.get_hash_object(),
+ &bucket_obj,
+ &shard_id);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl;
+ return ret;
+ }
+ ldpp_dout(dpp, 20) << " bucket index object: " << bucket_obj << dendl;
+
+ return 0;
+}
+
+int RGWRados::BucketShard::init(const DoutPrefixProvider *dpp,
+ const RGWBucketInfo& bucket_info,
+ const rgw::bucket_index_layout_generation& index,
+ int sid)
+{
+ bucket = bucket_info.bucket;
+ shard_id = sid;
+
+ int ret = store->svc.bi_rados->open_bucket_index_shard(dpp, bucket_info, index,
+ shard_id, &bucket_obj);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl;
+ return ret;
+ }
+ ldpp_dout(dpp, 20) << " bucket index object: " << bucket_obj << dendl;
+
+ return 0;
+}
+
+
+/* Execute @handler on last item in bucket listing for bucket specified
+ * in @bucket_info. @obj_prefix and @obj_delim narrow down the listing
+ * to objects matching these criteria. */
+int RGWRados::on_last_entry_in_listing(const DoutPrefixProvider *dpp,
+ RGWBucketInfo& bucket_info,
+ const std::string& obj_prefix,
+ const std::string& obj_delim,
+ std::function<int(const rgw_bucket_dir_entry&)> handler)
+{
+ RGWRados::Bucket target(this, bucket_info);
+ RGWRados::Bucket::List list_op(&target);
+
+ list_op.params.prefix = obj_prefix;
+ list_op.params.delim = obj_delim;
+
+ ldpp_dout(dpp, 20) << "iterating listing for bucket=" << bucket_info.bucket.name
+ << ", obj_prefix=" << obj_prefix
+ << ", obj_delim=" << obj_delim
+ << dendl;
+
+ bool is_truncated = false;
+
+ boost::optional<rgw_bucket_dir_entry> last_entry;
+ /* We need to rewind to the last object in a listing. */
+ do {
+ /* List bucket entries in chunks. */
+ static constexpr int MAX_LIST_OBJS = 100;
+ std::vector<rgw_bucket_dir_entry> entries(MAX_LIST_OBJS);
+
+ int ret = list_op.list_objects(dpp, MAX_LIST_OBJS, &entries, nullptr,
+ &is_truncated, null_yield);
+ if (ret < 0) {
+ return ret;
+ } else if (!entries.empty()) {
+ last_entry = entries.back();
+ }
+ } while (is_truncated);
+
+ if (last_entry) {
+ return handler(*last_entry);
+ }
+
+ /* Empty listing - no items we can run handler on. */
+ return 0;
+}
+
+bool RGWRados::swift_versioning_enabled(const RGWBucketInfo& bucket_info) const
+{
+ return bucket_info.has_swift_versioning() &&
+ bucket_info.swift_ver_location.size();
+}
+
+int RGWRados::swift_versioning_copy(RGWObjectCtx& obj_ctx,
+ const rgw_user& user,
+ RGWBucketInfo& bucket_info,
+ const rgw_obj& obj,
+ const DoutPrefixProvider *dpp,
+ optional_yield y)
+{
+ if (! swift_versioning_enabled(bucket_info)) {
+ return 0;
+ }
+
+ obj_ctx.set_atomic(obj);
+
+ RGWObjState * state = nullptr;
+ RGWObjManifest *manifest = nullptr;
+ int r = get_obj_state(dpp, &obj_ctx, bucket_info, obj, &state, &manifest, false, y);
+ if (r < 0) {
+ return r;
+ }
+
+ if (!state->exists) {
+ return 0;
+ }
+
+ const string& src_name = obj.get_oid();
+ char buf[src_name.size() + 32];
+ struct timespec ts = ceph::real_clock::to_timespec(state->mtime);
+ snprintf(buf, sizeof(buf), "%03x%s/%lld.%06ld", (int)src_name.size(),
+ src_name.c_str(), (long long)ts.tv_sec, ts.tv_nsec / 1000);
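+ /* the archived copy's name is "<3-hex-digit source-name length><source name>/
+ * <mtime seconds>.<microseconds>"; swift_versioning_restore() below searches
+ * with the same "%03x%s" prefix to find the most recent copy */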
+
+ RGWBucketInfo dest_bucket_info;
+
+ r = get_bucket_info(&svc, bucket_info.bucket.tenant, bucket_info.swift_ver_location, dest_bucket_info, NULL, null_yield, NULL);
+ if (r < 0) {
+ ldpp_dout(dpp, 10) << "failed to read dest bucket info: r=" << r << dendl;
+ if (r == -ENOENT) {
+ return -ERR_PRECONDITION_FAILED;
+ }
+ return r;
+ }
+
+ if (dest_bucket_info.owner != bucket_info.owner) {
+ return -ERR_PRECONDITION_FAILED;
+ }
+
+ rgw_obj dest_obj(dest_bucket_info.bucket, buf);
+
+ if (dest_bucket_info.versioning_enabled()){
+ gen_rand_obj_instance_name(&dest_obj);
+ }
+
+ obj_ctx.set_atomic(dest_obj);
+
+ rgw_zone_id no_zone;
+
+ r = copy_obj(obj_ctx,
+ user,
+ NULL, /* req_info *info */
+ no_zone,
+ dest_obj,
+ obj,
+ dest_bucket_info,
+ bucket_info,
+ bucket_info.placement_rule,
+ NULL, /* time_t *src_mtime */
+ NULL, /* time_t *mtime */
+ NULL, /* const time_t *mod_ptr */
+ NULL, /* const time_t *unmod_ptr */
+ false, /* bool high_precision_time */
+ NULL, /* const char *if_match */
+ NULL, /* const char *if_nomatch */
+ RGWRados::ATTRSMOD_NONE,
+ true, /* bool copy_if_newer */
+ state->attrset,
+ RGWObjCategory::Main,
+ 0, /* uint64_t olh_epoch */
+ real_time(), /* time_t delete_at */
+ NULL, /* string *version_id */
+ NULL, /* string *ptag */
+ NULL, /* string *petag */
+ NULL, /* void (*progress_cb)(off_t, void *) */
+ NULL, /* void *progress_data */
+ dpp,
+ null_yield);
+ if (r == -ECANCELED || r == -ENOENT) {
+ /* Has already been overwritten, meaning another rgw process already
+ * copied it out */
+ return 0;
+ }
+
+ return r;
+}
+
+int RGWRados::swift_versioning_restore(RGWObjectCtx& obj_ctx,
+ const rgw_user& user,
+ RGWBucketInfo& bucket_info,
+ rgw_obj& obj,
+ bool& restored,
+ const DoutPrefixProvider *dpp)
+{
+ if (! swift_versioning_enabled(bucket_info)) {
+ return 0;
+ }
+
+ /* Bucket info of the bucket that stores previous versions of our object. */
+ RGWBucketInfo archive_binfo;
+
+ int ret = get_bucket_info(&svc, bucket_info.bucket.tenant,
+ bucket_info.swift_ver_location,
+ archive_binfo, nullptr, null_yield, nullptr);
+ if (ret < 0) {
+ return ret;
+ }
+
+ /* Abort the operation if the bucket storing our archive belongs to someone
+ * else. This is a limitation in comparison to Swift as we aren't taking ACLs
+ * into consideration. For now we can live with that.
+ *
+ * TODO: delegate this check to an upper layer and compare with ACLs. */
+ if (bucket_info.owner != archive_binfo.owner) {
+ return -EPERM;
+ }
+
+ /* This code will be executed on latest version of the object. */
+ const auto handler = [&](const rgw_bucket_dir_entry& entry) -> int {
+ rgw_zone_id no_zone;
+
+ /* We don't support object versioning of Swift API on those buckets that
+ * are already versioned using the S3 mechanism. This affects also bucket
+ * storing archived objects. Otherwise the delete operation would create
+ * a deletion marker. */
+ if (archive_binfo.versioned()) {
+ restored = false;
+ return -ERR_PRECONDITION_FAILED;
+ }
+
+ /* We are requesting ATTRSMOD_NONE, so the attrs argument is
+ * irrelevant and may safely be left empty. */
+ std::map<std::string, ceph::bufferlist> no_attrs;
+
+ rgw_obj archive_obj(archive_binfo.bucket, entry.key);
+
+ if (bucket_info.versioning_enabled()){
+ gen_rand_obj_instance_name(&obj);
+ }
+
+ obj_ctx.set_atomic(archive_obj);
+ obj_ctx.set_atomic(obj);
+
+ int ret = copy_obj(obj_ctx,
+ user,
+ nullptr, /* req_info *info */
+ no_zone,
+ obj, /* dest obj */
+ archive_obj, /* src obj */
+ bucket_info, /* dest bucket info */
+ archive_binfo, /* src bucket info */
+ bucket_info.placement_rule, /* placement_rule */
+ nullptr, /* time_t *src_mtime */
+ nullptr, /* time_t *mtime */
+ nullptr, /* const time_t *mod_ptr */
+ nullptr, /* const time_t *unmod_ptr */
+ false, /* bool high_precision_time */
+ nullptr, /* const char *if_match */
+ nullptr, /* const char *if_nomatch */
+ RGWRados::ATTRSMOD_NONE,
+ true, /* bool copy_if_newer */
+ no_attrs,
+ RGWObjCategory::Main,
+ 0, /* uint64_t olh_epoch */
+ real_time(), /* time_t delete_at */
+ nullptr, /* string *version_id */
+ nullptr, /* string *ptag */
+ nullptr, /* string *petag */
+ nullptr, /* void (*progress_cb)(off_t, void *) */
+ nullptr, /* void *progress_data */
+ dpp,
+ null_yield);
+ if (ret == -ECANCELED || ret == -ENOENT) {
+ /* Has already been overwritten, meaning another rgw process already
+ * copied it out */
+ return 0;
+ } else if (ret < 0) {
+ return ret;
+ } else {
+ restored = true;
+ }
+
+ /* Need to remove the archived copy. */
+ ret = delete_obj(dpp, obj_ctx, archive_binfo, archive_obj,
+ archive_binfo.versioning_status());
+
+ return ret;
+ };
+
+ const std::string& obj_name = obj.get_oid();
+ const auto prefix = boost::str(boost::format("%03x%s") % obj_name.size()
+ % obj_name);
+
+ return on_last_entry_in_listing(dpp, archive_binfo, prefix, std::string(),
+ handler);
+}
+
+int RGWRados::Object::Write::_do_write_meta(const DoutPrefixProvider *dpp,
+ uint64_t size, uint64_t accounted_size,
+ map<string, bufferlist>& attrs,
+ bool assume_noent, bool modify_tail,
+ void *_index_op, optional_yield y)
+{
+ RGWRados::Bucket::UpdateIndex *index_op = static_cast<RGWRados::Bucket::UpdateIndex *>(_index_op);
+ RGWRados *store = target->get_store();
+
+ ObjectWriteOperation op;
+#ifdef WITH_LTTNG
+ const req_state* s = get_req_state();
+ string req_id;
+ if (!s) {
+ // fake req_id
+ req_id = store->svc.zone_utils->unique_id(store->driver->get_new_req_id());
+ } else {
+ req_id = s->req_id;
+ }
+#endif
+
+ RGWObjState *state;
+ RGWObjManifest *manifest = nullptr;
+ int r = target->get_state(dpp, &state, &manifest, false, y, assume_noent);
+ if (r < 0)
+ return r;
+
+ rgw_obj& obj = target->get_obj();
+
+ if (obj.get_oid().empty()) {
+ ldpp_dout(dpp, 0) << "ERROR: " << __func__ << "(): cannot write object with empty name" << dendl;
+ return -EIO;
+ }
+
+ rgw_rados_ref ref;
+ r = store->get_obj_head_ref(dpp, target->get_meta_placement_rule(), obj, &ref);
+ if (r < 0)
+ return r;
+
+ bool is_olh = state->is_olh;
+
+ bool reset_obj = (meta.flags & PUT_OBJ_CREATE) != 0;
+
+ const string *ptag = meta.ptag;
+ if (!ptag && !index_op->get_optag()->empty()) {
+ ptag = index_op->get_optag();
+ }
+ r = target->prepare_atomic_modification(dpp, op, reset_obj, ptag, meta.if_match, meta.if_nomatch, false, modify_tail, y);
+ if (r < 0)
+ return r;
+
+ if (real_clock::is_zero(meta.set_mtime)) {
+ meta.set_mtime = real_clock::now();
+ }
+
+ if (target->get_bucket_info().obj_lock_enabled() && target->get_bucket_info().obj_lock.has_rule() && meta.flags == PUT_OBJ_CREATE) {
+ auto iter = attrs.find(RGW_ATTR_OBJECT_RETENTION);
+ if (iter == attrs.end()) {
+ real_time lock_until_date = target->get_bucket_info().obj_lock.get_lock_until_date(meta.set_mtime);
+ string mode = target->get_bucket_info().obj_lock.get_mode();
+ RGWObjectRetention obj_retention(mode, lock_until_date);
+ bufferlist bl;
+ obj_retention.encode(bl);
+ op.setxattr(RGW_ATTR_OBJECT_RETENTION, bl);
+ }
+ }
+
+ if (state->is_olh) {
+ op.setxattr(RGW_ATTR_OLH_ID_TAG, state->olh_tag);
+ }
+
+ struct timespec mtime_ts = real_clock::to_timespec(meta.set_mtime);
+ op.mtime2(&mtime_ts);
+
+ if (meta.data) {
+ /* if we want to overwrite the data, we also want to overwrite the
+ xattrs, so just remove the object */
+ op.write_full(*meta.data);
+ if (state->compressed) {
+ uint32_t alloc_hint_flags = librados::ALLOC_HINT_FLAG_INCOMPRESSIBLE;
+ op.set_alloc_hint2(0, 0, alloc_hint_flags);
+ }
+ }
+
+ string etag;
+ string content_type;
+ bufferlist acl_bl;
+ string storage_class;
+
+ map<string, bufferlist>::iterator iter;
+ if (meta.rmattrs) {
+ for (iter = meta.rmattrs->begin(); iter != meta.rmattrs->end(); ++iter) {
+ const string& name = iter->first;
+ op.rmxattr(name.c_str());
+ }
+ }
+
+ if (meta.manifest) {
+ storage_class = meta.manifest->get_tail_placement().placement_rule.storage_class;
+
+ /* remove existing manifest attr */
+ iter = attrs.find(RGW_ATTR_MANIFEST);
+ if (iter != attrs.end())
+ attrs.erase(iter);
+
+ bufferlist bl;
+ encode(*meta.manifest, bl);
+ op.setxattr(RGW_ATTR_MANIFEST, bl);
+ }
+
+ for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
+ const string& name = iter->first;
+ bufferlist& bl = iter->second;
+
+ if (!bl.length())
+ continue;
+
+ op.setxattr(name.c_str(), bl);
+
+ if (name.compare(RGW_ATTR_ETAG) == 0) {
+ etag = rgw_bl_str(bl);
+ } else if (name.compare(RGW_ATTR_CONTENT_TYPE) == 0) {
+ content_type = rgw_bl_str(bl);
+ } else if (name.compare(RGW_ATTR_ACL) == 0) {
+ acl_bl = bl;
+ }
+ }
+ if (attrs.find(RGW_ATTR_PG_VER) == attrs.end()) {
+ cls_rgw_obj_store_pg_ver(op, RGW_ATTR_PG_VER);
+ }
+
+ if (attrs.find(RGW_ATTR_SOURCE_ZONE) == attrs.end()) {
+ bufferlist bl;
+ encode(store->svc.zone->get_zone_short_id(), bl);
+ op.setxattr(RGW_ATTR_SOURCE_ZONE, bl);
+ }
+
+ if (!storage_class.empty()) {
+ bufferlist bl;
+ bl.append(storage_class);
+ op.setxattr(RGW_ATTR_STORAGE_CLASS, bl);
+ }
+
+ if (!op.size())
+ return 0;
+
+ uint64_t epoch;
+ int64_t poolid;
+ bool orig_exists;
+ uint64_t orig_size;
+
+ if (!reset_obj) { // Multipart upload; it has an immutable head.
+ orig_exists = false;
+ orig_size = 0;
+ } else {
+ orig_exists = state->exists;
+ orig_size = state->accounted_size;
+ }
+
+ bool versioned_target = (meta.olh_epoch && *meta.olh_epoch > 0) ||
+ !obj.key.instance.empty();
+
+ bool versioned_op = (target->versioning_enabled() || is_olh || versioned_target);
+
+ if (versioned_op) {
+ index_op->set_bilog_flags(RGW_BILOG_FLAG_VERSIONED_OP);
+ }
+
+ if (!index_op->is_prepared()) {
+ tracepoint(rgw_rados, prepare_enter, req_id.c_str());
+ r = index_op->prepare(dpp, CLS_RGW_OP_ADD, &state->write_tag, y);
+ tracepoint(rgw_rados, prepare_exit, req_id.c_str());
+ if (r < 0)
+ return r;
+ }
+
+ auto& ioctx = ref.pool.ioctx();
+
+ tracepoint(rgw_rados, operate_enter, req_id.c_str());
+ r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
+ tracepoint(rgw_rados, operate_exit, req_id.c_str());
+ if (r < 0) { /* we can expect to get -ECANCELED if the object was replaced under us,
+ or -ENOENT if it was removed, or -EEXIST if it did not exist
+ before and now it does */
+ if (r == -EEXIST && assume_noent) {
+ target->invalidate_state();
+ return r;
+ }
+ goto done_cancel;
+ }
+
+ epoch = ioctx.get_last_version();
+ poolid = ioctx.get_id();
+
+ r = target->complete_atomic_modification(dpp);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: complete_atomic_modification returned r=" << r << dendl;
+ }
+
+ tracepoint(rgw_rados, complete_enter, req_id.c_str());
+ r = index_op->complete(dpp, poolid, epoch, size, accounted_size,
+ meta.set_mtime, etag, content_type,
+ storage_class, &acl_bl,
+ meta.category, meta.remove_objs, y,
+ meta.user_data, meta.appendable);
+ tracepoint(rgw_rados, complete_exit, req_id.c_str());
+ if (r < 0)
+ goto done_cancel;
+
+ if (meta.mtime) {
+ *meta.mtime = meta.set_mtime;
+ }
+
+ /* note that index_op was using state so we couldn't invalidate it earlier */
+ target->invalidate_state();
+ state = NULL;
+
+ if (versioned_op && meta.olh_epoch) {
+ r = store->set_olh(dpp, target->get_ctx(), target->get_bucket_info(), obj, false, NULL, *meta.olh_epoch, real_time(), false, y, meta.zones_trace);
+ if (r < 0) {
+ return r;
+ }
+ }
+
+ if (!real_clock::is_zero(meta.delete_at)) {
+ rgw_obj_index_key obj_key;
+ obj.key.get_index_key(&obj_key);
+
+ r = store->obj_expirer->hint_add(dpp, meta.delete_at, obj.bucket.tenant, obj.bucket.name,
+ obj.bucket.bucket_id, obj_key);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: objexp_hint_add() returned r=" << r << ", object will not get removed" << dendl;
+ /* ignoring error, nothing we can do at this point */
+ }
+ }
+ meta.canceled = false;
+
+ /* update quota cache */
+ if (meta.completeMultipart){
+ store->quota_handler->update_stats(meta.owner, obj.bucket, (orig_exists ? 0 : 1),
+ 0, orig_size);
+ }
+ else {
+ store->quota_handler->update_stats(meta.owner, obj.bucket, (orig_exists ? 0 : 1),
+ accounted_size, orig_size);
+ }
+ return 0;
+
+done_cancel:
+ int ret = index_op->cancel(dpp, meta.remove_objs, y);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: index_op.cancel() returned ret=" << ret << dendl;
+ }
+
+ meta.canceled = true;
+
+ /* we lost a race. There are a few possibilities:
+ * - an existing object was rewritten (ECANCELED)
+ * - a non-existing object was created (EEXIST)
+ * - the object was removed (ENOENT)
+ * we should treat it as a success
+ */
+ if (meta.if_match == NULL && meta.if_nomatch == NULL) {
+ if (r == -ECANCELED || r == -ENOENT || r == -EEXIST) {
+ r = 0;
+ }
+ } else {
+ if (meta.if_match != NULL) {
+ // only overwrite existing object
+ if (strcmp(meta.if_match, "*") == 0) {
+ if (r == -ENOENT) {
+ r = -ERR_PRECONDITION_FAILED;
+ } else if (r == -ECANCELED) {
+ r = 0;
+ }
+ }
+ }
+
+ if (meta.if_nomatch != NULL) {
+ // only create a new object
+ if (strcmp(meta.if_nomatch, "*") == 0) {
+ if (r == -EEXIST) {
+ r = -ERR_PRECONDITION_FAILED;
+ } else if (r == -ENOENT) {
+ r = 0;
+ }
+ }
+ }
+ }
+
+ return r;
+}
+
+int RGWRados::Object::Write::write_meta(const DoutPrefixProvider *dpp, uint64_t size, uint64_t accounted_size,
+ map<string, bufferlist>& attrs, optional_yield y)
+{
+ RGWBucketInfo& bucket_info = target->get_bucket_info();
+
+ RGWRados::Bucket bop(target->get_store(), bucket_info);
+ RGWRados::Bucket::UpdateIndex index_op(&bop, target->get_obj());
+ index_op.set_zones_trace(meta.zones_trace);
+
+ bool assume_noent = (meta.if_match == NULL && meta.if_nomatch == NULL);
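+ // fast path: with no ETag preconditions, first try the write assuming the
+ // object does not exist yet; if that loses a race (-EEXIST) retry below
+ // without the assumption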
+ int r;
+ if (assume_noent) {
+ r = _do_write_meta(dpp, size, accounted_size, attrs, assume_noent, meta.modify_tail, (void *)&index_op, y);
+ if (r == -EEXIST) {
+ assume_noent = false;
+ }
+ }
+ if (!assume_noent) {
+ r = _do_write_meta(dpp, size, accounted_size, attrs, assume_noent, meta.modify_tail, (void *)&index_op, y);
+ }
+ return r;
+}
+
+class RGWRadosPutObj : public RGWHTTPStreamRWRequest::ReceiveCB
+{
+ const DoutPrefixProvider *dpp;
+ CephContext* cct;
+ rgw_obj obj;
+ rgw::sal::DataProcessor *filter;
+ boost::optional<RGWPutObj_Compress>& compressor;
+ bool try_etag_verify;
+ rgw::putobj::etag_verifier_ptr etag_verifier;
+ boost::optional<rgw::putobj::ChunkProcessor> buffering;
+ CompressorRef& plugin;
+ rgw::sal::ObjectProcessor *processor;
+ void (*progress_cb)(off_t, void *);
+ void *progress_data;
+ bufferlist extra_data_bl, manifest_bl;
+ std::optional<RGWCompressionInfo> compression_info;
+ uint64_t extra_data_left{0};
+ bool need_to_process_attrs{true};
+ uint64_t data_len{0};
+ map<string, bufferlist> src_attrs;
+ uint64_t ofs{0};
+ uint64_t lofs{0}; /* logical ofs */
+ std::function<int(map<string, bufferlist>&)> attrs_handler;
+
+public:
+ RGWRadosPutObj(const DoutPrefixProvider *dpp,
+ CephContext* cct,
+ CompressorRef& plugin,
+ boost::optional<RGWPutObj_Compress>& compressor,
+ rgw::sal::ObjectProcessor *p,
+ void (*_progress_cb)(off_t, void *),
+ void *_progress_data,
+ std::function<int(map<string, bufferlist>&)> _attrs_handler) :
+ dpp(dpp),
+ cct(cct),
+ filter(p),
+ compressor(compressor),
+ try_etag_verify(cct->_conf->rgw_sync_obj_etag_verify),
+ plugin(plugin),
+ processor(p),
+ progress_cb(_progress_cb),
+ progress_data(_progress_data),
+ attrs_handler(_attrs_handler) {}
+
+
+ int process_attrs(void) {
+ bool encrypted = false;
+ if (extra_data_bl.length()) {
+ JSONParser jp;
+ if (!jp.parse(extra_data_bl.c_str(), extra_data_bl.length())) {
+ ldpp_dout(dpp, 0) << "failed to parse response extra data. len=" << extra_data_bl.length() << " data=" << extra_data_bl.c_str() << dendl;
+ return -EIO;
+ }
+
+ JSONDecoder::decode_json("attrs", src_attrs, &jp);
+
+ encrypted = src_attrs.count(RGW_ATTR_CRYPT_MODE);
+ if (encrypted) {
+ // we won't have access to the decrypted data for checksumming
+ try_etag_verify = false;
+ }
+
+ // if the object is both compressed and encrypted, it was transferred
+ // in its encrypted+compressed form. we need to preserve the original
+ // RGW_ATTR_COMPRESSION instead of falling back to default compression
+ // settings
+ auto iter = src_attrs.find(RGW_ATTR_COMPRESSION);
+ if (iter != src_attrs.end() && !encrypted) {
+ const bufferlist bl = std::move(iter->second);
+ src_attrs.erase(iter); // don't preserve source compression info
+
+ if (try_etag_verify) {
+ // if we're trying to verify etags, we need to convert compressed
+ // ranges in the manifest back into logical multipart part offsets
+ RGWCompressionInfo info;
+ bool compressed = false;
+ int r = rgw_compression_info_from_attr(bl, compressed, info);
+ if (r < 0) {
+ ldpp_dout(dpp, 4) << "failed to decode compression info, "
+ "disabling etag verification" << dendl;
+ try_etag_verify = false;
+ } else if (compressed) {
+ compression_info = std::move(info);
+ }
+ }
+ }
+
+ /* We need the manifest to recompute the ETag for verification */
+ iter = src_attrs.find(RGW_ATTR_MANIFEST);
+ if (iter != src_attrs.end()) {
+ manifest_bl = std::move(iter->second);
+ src_attrs.erase(iter);
+
+ // if the source object was encrypted, preserve the part lengths from
+ // the original object's manifest in RGW_ATTR_CRYPT_PARTS. if the object
+ // was already replicated and has the RGW_ATTR_CRYPT_PARTS attr, preserve it
+ if (src_attrs.count(RGW_ATTR_CRYPT_MODE) &&
+ !src_attrs.count(RGW_ATTR_CRYPT_PARTS)) {
+ std::vector<size_t> parts_len;
+ int r = RGWGetObj_BlockDecrypt::read_manifest_parts(dpp, manifest_bl,
+ parts_len);
+ if (r < 0) {
+ ldpp_dout(dpp, 4) << "failed to read part lengths from the manifest" << dendl;
+ } else {
+ // store the encoded part lengths in RGW_ATTR_CRYPT_PARTS
+ bufferlist parts_bl;
+ encode(parts_len, parts_bl);
+ src_attrs[RGW_ATTR_CRYPT_PARTS] = std::move(parts_bl);
+ }
+ }
+ }
+
+ // filter out olh attributes
+ iter = src_attrs.lower_bound(RGW_ATTR_OLH_PREFIX);
+ while (iter != src_attrs.end()) {
+ if (!boost::algorithm::starts_with(iter->first, RGW_ATTR_OLH_PREFIX)) {
+ break;
+ }
+ iter = src_attrs.erase(iter);
+ }
+ }
+
+ int ret = attrs_handler(src_attrs);
+ if (ret < 0) {
+ return ret;
+ }
+
+ // do not compress if object is encrypted
+ if (plugin && !encrypted) {
+ compressor = boost::in_place(cct, plugin, filter);
+ // add a filter that buffers data so we don't try to compress tiny blocks.
+ // libcurl reads in 16k at a time, and we need at least 64k to get a good
+ // compression ratio
+ constexpr unsigned buffer_size = 512 * 1024;
+ buffering = boost::in_place(&*compressor, buffer_size);
+ filter = &*buffering;
+ }
+
+ if (try_etag_verify) {
+ ret = rgw::putobj::create_etag_verifier(dpp, cct, filter, manifest_bl,
+ compression_info,
+ etag_verifier);
+ if (ret < 0) {
+        ldpp_dout(dpp, 4) << "failed to initialize etag verifier, "
+            "disabling etag verification" << dendl;
+ } else {
+ filter = etag_verifier.get();
+ }
+ }
+
+ need_to_process_attrs = false;
+
+ return 0;
+ }
+
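+  // the first extra_data_len bytes of the stream carry the json-encoded
+  // attrs; splice them into extra_data_bl and run process_attrs() once they
+  // have all arrived (or immediately if there are none), then feed the
+  // remaining payload through the filter chain
</grinsert_placeholder_never_emitted>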
+ int handle_data(bufferlist& bl, bool *pause) override {
+ if (progress_cb) {
+ progress_cb(data_len, progress_data);
+ }
+ if (extra_data_left) {
+ uint64_t extra_len = bl.length();
+ if (extra_len > extra_data_left)
+ extra_len = extra_data_left;
+
+ bufferlist extra;
+ bl.splice(0, extra_len, &extra);
+ extra_data_bl.append(extra);
+
+ extra_data_left -= extra_len;
+ if (extra_data_left == 0) {
+ int res = process_attrs();
+ if (res < 0)
+ return res;
+ }
+ ofs += extra_len;
+ if (bl.length() == 0) {
+ return 0;
+ }
+ }
+ if (need_to_process_attrs) {
+ /* need to call process_attrs() even if we don't get any attrs,
+ * need it to call attrs_handler().
+ */
+ int res = process_attrs();
+ if (res < 0) {
+ return res;
+ }
+ }
+
+ ceph_assert(uint64_t(ofs) >= extra_data_len);
+
+ uint64_t size = bl.length();
+ ofs += size;
+
+ const uint64_t lofs = data_len;
+ data_len += size;
+
+ return filter->process(std::move(bl), lofs);
+ }
+
+ int flush() {
+ return filter->process({}, data_len);
+ }
+
+ bufferlist& get_extra_data() { return extra_data_bl; }
+
+ map<string, bufferlist>& get_attrs() { return src_attrs; }
+
+ void set_extra_data_len(uint64_t len) override {
+ extra_data_left = len;
+ RGWHTTPStreamRWRequest::ReceiveCB::set_extra_data_len(len);
+ }
+
+ uint64_t get_data_len() {
+ return data_len;
+ }
+
+ std::string get_verifier_etag() {
+ if (etag_verifier) {
+ etag_verifier->calculate_etag();
+ return etag_verifier->get_calculated_etag();
+ } else {
+ return "";
+ }
+ }
+};
+
+/*
+ * prepare attrset depending on attrs_mod.
+ */
+static void set_copy_attrs(map<string, bufferlist>& src_attrs,
+ map<string, bufferlist>& attrs,
+ RGWRados::AttrsMod attrs_mod)
+{
+ switch (attrs_mod) {
+ case RGWRados::ATTRSMOD_NONE:
+ attrs = src_attrs;
+ break;
+ case RGWRados::ATTRSMOD_REPLACE:
+ if (!attrs[RGW_ATTR_ETAG].length()) {
+ attrs[RGW_ATTR_ETAG] = src_attrs[RGW_ATTR_ETAG];
+ }
+ if (!attrs[RGW_ATTR_TAIL_TAG].length()) {
+ auto ttiter = src_attrs.find(RGW_ATTR_TAIL_TAG);
+ if (ttiter != src_attrs.end()) {
+ attrs[RGW_ATTR_TAIL_TAG] = src_attrs[RGW_ATTR_TAIL_TAG];
+ }
+ }
+ break;
+ case RGWRados::ATTRSMOD_MERGE:
+ for (map<string, bufferlist>::iterator it = src_attrs.begin(); it != src_attrs.end(); ++it) {
+ if (attrs.find(it->first) == attrs.end()) {
+ attrs[it->first] = it->second;
+ }
+ }
+ break;
+ }
+}
+
+int RGWRados::rewrite_obj(RGWBucketInfo& dest_bucket_info, const rgw_obj& obj, const DoutPrefixProvider *dpp, optional_yield y)
+{
+ RGWObjectCtx rctx(this->driver);
+ rgw::sal::Attrs attrset;
+ uint64_t obj_size;
+ ceph::real_time mtime;
+ RGWRados::Object op_target(this, dest_bucket_info, rctx, obj);
+ RGWRados::Object::Read read_op(&op_target);
+
+ read_op.params.attrs = &attrset;
+ read_op.params.obj_size = &obj_size;
+ read_op.params.lastmod = &mtime;
+
+ int ret = read_op.prepare(y, dpp);
+ if (ret < 0)
+ return ret;
+
+ attrset.erase(RGW_ATTR_ID_TAG);
+ attrset.erase(RGW_ATTR_TAIL_TAG);
+ attrset.erase(RGW_ATTR_STORAGE_CLASS);
+
+ return copy_obj_data(rctx, dest_bucket_info, dest_bucket_info.placement_rule,
+ read_op, obj_size - 1, obj, NULL, mtime,
+ attrset, 0, real_time(), NULL, dpp, y);
+}
+
+int RGWRados::reindex_obj(const RGWBucketInfo& bucket_info,
+ const rgw_obj& obj,
+ const DoutPrefixProvider* dpp,
+ optional_yield y)
+{
+ if (bucket_info.versioned()) {
+ ldpp_dout(dpp, 10) << "WARNING: " << __func__ <<
+ ": cannot process versioned bucket \"" <<
+ bucket_info.bucket.get_key() << "\"" <<
+ dendl;
+ return -ENOTSUP;
+ }
+
+ Bucket target(this, bucket_info);
+ RGWRados::Bucket::UpdateIndex update_idx(&target, obj);
+ const std::string* no_write_tag = nullptr;
+
+ int ret = update_idx.prepare(dpp, RGWModifyOp::CLS_RGW_OP_ADD, no_write_tag, y);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
+ ": update index prepare for \"" << obj << "\" returned: " <<
+ cpp_strerror(-ret) << dendl;
+ return ret;
+ }
+
+ return 0;
+}
+
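+// orders two object versions for copy-if-newer decisions: compare mtimes
+// (truncated to whole seconds unless both sides are high precision), then
+// fall back to zone_short_id and pg_ver as tie breakers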
+struct obj_time_weight {
+ real_time mtime;
+ uint32_t zone_short_id;
+ uint64_t pg_ver;
+ bool high_precision;
+
+ obj_time_weight() : zone_short_id(0), pg_ver(0), high_precision(false) {}
+
+ bool compare_low_precision(const obj_time_weight& rhs) {
+ struct timespec l = ceph::real_clock::to_timespec(mtime);
+ struct timespec r = ceph::real_clock::to_timespec(rhs.mtime);
+ l.tv_nsec = 0;
+ r.tv_nsec = 0;
+ if (l > r) {
+ return false;
+ }
+ if (l < r) {
+ return true;
+ }
+ if (!zone_short_id || !rhs.zone_short_id) {
+      /* don't compare zone ids if one wasn't provided */
+ return false;
+ }
+ if (zone_short_id != rhs.zone_short_id) {
+ return (zone_short_id < rhs.zone_short_id);
+ }
+ return (pg_ver < rhs.pg_ver);
+
+ }
+
+ bool operator<(const obj_time_weight& rhs) {
+ if (!high_precision || !rhs.high_precision) {
+ return compare_low_precision(rhs);
+ }
+ if (mtime > rhs.mtime) {
+ return false;
+ }
+ if (mtime < rhs.mtime) {
+ return true;
+ }
+ if (!zone_short_id || !rhs.zone_short_id) {
+      /* don't compare zone ids if one wasn't provided */
+ return false;
+ }
+ if (zone_short_id != rhs.zone_short_id) {
+ return (zone_short_id < rhs.zone_short_id);
+ }
+ return (pg_ver < rhs.pg_ver);
+ }
+
+ void init(const real_time& _mtime, uint32_t _short_id, uint64_t _pg_ver) {
+ mtime = _mtime;
+ zone_short_id = _short_id;
+ pg_ver = _pg_ver;
+ }
+
+ void init(RGWObjState *state) {
+ mtime = state->mtime;
+ zone_short_id = state->zone_short_id;
+ pg_ver = state->pg_ver;
+ }
+};
+
+inline ostream& operator<<(ostream& out, const obj_time_weight &o) {
+ out << o.mtime;
+
+ if (o.zone_short_id != 0 || o.pg_ver != 0) {
+ out << "[zid=" << o.zone_short_id << ", pgv=" << o.pg_ver << "]";
+ }
+
+ return out;
+}
+
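+// ReceiveCB that captures only the prepended extra data (the json-encoded
+// attrs) from a streamed GET and discards the object payload; used by
+// stat_remote_obj() below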
+class RGWGetExtraDataCB : public RGWHTTPStreamRWRequest::ReceiveCB {
+ bufferlist extra_data;
+public:
+ RGWGetExtraDataCB() {}
+ int handle_data(bufferlist& bl, bool *pause) override {
+ int bl_len = (int)bl.length();
+ if (extra_data.length() < extra_data_len) {
+ off_t max = extra_data_len - extra_data.length();
+ if (max > bl_len) {
+ max = bl_len;
+ }
+ bl.splice(0, max, &extra_data);
+ }
+ return bl_len;
+ }
+
+ bufferlist& get_extra_data() {
+ return extra_data;
+ }
+};
+
+int RGWRados::stat_remote_obj(const DoutPrefixProvider *dpp,
+ RGWObjectCtx& obj_ctx,
+ const rgw_user& user_id,
+ req_info *info,
+ const rgw_zone_id& source_zone,
+ const rgw_obj& src_obj,
+ const RGWBucketInfo *src_bucket_info,
+ real_time *src_mtime,
+ uint64_t *psize,
+ const real_time *mod_ptr,
+ const real_time *unmod_ptr,
+ bool high_precision_time,
+ const char *if_match,
+ const char *if_nomatch,
+ map<string, bufferlist> *pattrs,
+ map<string, string> *pheaders,
+ string *version_id,
+ string *ptag,
+ string *petag)
+{
+ /* source is in a different zonegroup, copy from there */
+
+ RGWRESTStreamRWRequest *in_stream_req;
+ string tag;
+ map<string, bufferlist> src_attrs;
+ append_rand_alpha(cct, tag, tag, 32);
+ obj_time_weight set_mtime_weight;
+ set_mtime_weight.high_precision = high_precision_time;
+
+ RGWRESTConn *conn;
+ if (source_zone.empty()) {
+ if (!src_bucket_info || src_bucket_info->zonegroup.empty()) {
+ /* source is in the master zonegroup */
+ conn = svc.zone->get_master_conn();
+ } else {
+ auto& zonegroup_conn_map = svc.zone->get_zonegroup_conn_map();
+ map<string, RGWRESTConn *>::iterator iter = zonegroup_conn_map.find(src_bucket_info->zonegroup);
+ if (iter == zonegroup_conn_map.end()) {
+        ldpp_dout(dpp, 0) << "could not find zonegroup connection to zonegroup: " << src_bucket_info->zonegroup << dendl;
+ return -ENOENT;
+ }
+ conn = iter->second;
+ }
+ } else {
+ auto& zone_conn_map = svc.zone->get_zone_conn_map();
+ auto iter = zone_conn_map.find(source_zone);
+ if (iter == zone_conn_map.end()) {
+ ldpp_dout(dpp, 0) << "could not find zone connection to zone: " << source_zone << dendl;
+ return -ENOENT;
+ }
+ conn = iter->second;
+ }
+
+ RGWGetExtraDataCB cb;
+ map<string, string> req_headers;
+ real_time set_mtime;
+
+ const real_time *pmod = mod_ptr;
+
+ obj_time_weight dest_mtime_weight;
+
+ constexpr bool prepend_meta = true;
+ constexpr bool get_op = true;
+ constexpr bool rgwx_stat = true;
+ constexpr bool sync_manifest = true;
+ constexpr bool skip_decrypt = true;
+ constexpr bool sync_cloudtiered = true;
+ int ret = conn->get_obj(dpp, user_id, info, src_obj, pmod, unmod_ptr,
+ dest_mtime_weight.zone_short_id, dest_mtime_weight.pg_ver,
+ prepend_meta, get_op, rgwx_stat,
+ sync_manifest, skip_decrypt, nullptr, sync_cloudtiered,
+ true, &cb, &in_stream_req);
+ if (ret < 0) {
+ return ret;
+ }
+
+ ret = conn->complete_request(in_stream_req, nullptr, &set_mtime, psize,
+ nullptr, pheaders, null_yield);
+ if (ret < 0) {
+ return ret;
+ }
+
+ bufferlist& extra_data_bl = cb.get_extra_data();
+ if (extra_data_bl.length()) {
+ JSONParser jp;
+ if (!jp.parse(extra_data_bl.c_str(), extra_data_bl.length())) {
+ ldpp_dout(dpp, 0) << "failed to parse response extra data. len=" << extra_data_bl.length() << " data=" << extra_data_bl.c_str() << dendl;
+ return -EIO;
+ }
+
+ JSONDecoder::decode_json("attrs", src_attrs, &jp);
+
+ src_attrs.erase(RGW_ATTR_MANIFEST); // not interested in original object layout
+ }
+
+ if (src_mtime) {
+ *src_mtime = set_mtime;
+ }
+
+ if (petag) {
+ map<string, bufferlist>::iterator iter = src_attrs.find(RGW_ATTR_ETAG);
+ if (iter != src_attrs.end()) {
+ bufferlist& etagbl = iter->second;
+ *petag = etagbl.to_str();
+ while (petag->size() > 0 && (*petag)[petag->size() - 1] == '\0') {
+ *petag = petag->substr(0, petag->size() - 1);
+ }
+ }
+ }
+
+ if (pattrs) {
+ *pattrs = std::move(src_attrs);
+ }
+
+ return 0;
+}
+
+int RGWFetchObjFilter_Default::filter(CephContext *cct,
+ const rgw_obj_key& source_key,
+ const RGWBucketInfo& dest_bucket_info,
+ std::optional<rgw_placement_rule> dest_placement_rule,
+ const map<string, bufferlist>& obj_attrs,
+ std::optional<rgw_user> *poverride_owner,
+ const rgw_placement_rule **prule)
+{
+ const rgw_placement_rule *ptail_rule = (dest_placement_rule ? &(*dest_placement_rule) : nullptr);
+ if (!ptail_rule) {
+ auto iter = obj_attrs.find(RGW_ATTR_STORAGE_CLASS);
+ if (iter != obj_attrs.end()) {
+ dest_rule.storage_class = iter->second.to_str();
+ dest_rule.inherit_from(dest_bucket_info.placement_rule);
+ ptail_rule = &dest_rule;
+ } else {
+ ptail_rule = &dest_bucket_info.placement_rule;
+ }
+ }
+ *prule = ptail_rule;
+ return 0;
+}
+
+int RGWRados::fetch_remote_obj(RGWObjectCtx& obj_ctx,
+ const rgw_user& user_id,
+ req_info *info,
+ const rgw_zone_id& source_zone,
+ const rgw_obj& dest_obj,
+ const rgw_obj& src_obj,
+ RGWBucketInfo& dest_bucket_info,
+ RGWBucketInfo *src_bucket_info,
+ std::optional<rgw_placement_rule> dest_placement_rule,
+ real_time *src_mtime,
+ real_time *mtime,
+ const real_time *mod_ptr,
+ const real_time *unmod_ptr,
+ bool high_precision_time,
+ const char *if_match,
+ const char *if_nomatch,
+ AttrsMod attrs_mod,
+ bool copy_if_newer,
+ rgw::sal::Attrs& attrs,
+ RGWObjCategory category,
+ std::optional<uint64_t> olh_epoch,
+ real_time delete_at,
+ string *ptag,
+ string *petag,
+ void (*progress_cb)(off_t, void *),
+ void *progress_data,
+ const DoutPrefixProvider *dpp,
+ RGWFetchObjFilter *filter,
+ const rgw_zone_set_entry& source_trace_entry,
+ rgw_zone_set *zones_trace,
+ std::optional<uint64_t>* bytes_transferred)
+{
+ /* source is in a different zonegroup, copy from there */
+
+ RGWRESTStreamRWRequest *in_stream_req;
+ string tag;
+ int i;
+ append_rand_alpha(cct, tag, tag, 32);
+ obj_time_weight set_mtime_weight;
+ set_mtime_weight.high_precision = high_precision_time;
+ int ret;
+
+ rgw::BlockingAioThrottle aio(cct->_conf->rgw_put_obj_min_window_size);
+ using namespace rgw::putobj;
+ AtomicObjectProcessor processor(&aio, this, dest_bucket_info, nullptr,
+ user_id, obj_ctx, dest_obj, olh_epoch,
+ tag, dpp, null_yield);
+ RGWRESTConn *conn;
+ auto& zone_conn_map = svc.zone->get_zone_conn_map();
+ auto& zonegroup_conn_map = svc.zone->get_zonegroup_conn_map();
+ if (source_zone.empty()) {
+ if (!src_bucket_info || src_bucket_info->zonegroup.empty()) {
+ /* source is in the master zonegroup */
+ conn = svc.zone->get_master_conn();
+ } else {
+ map<string, RGWRESTConn *>::iterator iter = zonegroup_conn_map.find(src_bucket_info->zonegroup);
+ if (iter == zonegroup_conn_map.end()) {
+        ldpp_dout(dpp, 0) << "could not find zonegroup connection to zonegroup: " << src_bucket_info->zonegroup << dendl;
+ return -ENOENT;
+ }
+ conn = iter->second;
+ }
+ } else {
+ auto iter = zone_conn_map.find(source_zone);
+ if (iter == zone_conn_map.end()) {
+ ldpp_dout(dpp, 0) << "could not find zone connection to zone: " << source_zone << dendl;
+ return -ENOENT;
+ }
+ conn = iter->second;
+ }
+
+ boost::optional<RGWPutObj_Compress> compressor;
+ CompressorRef plugin;
+
+ RGWFetchObjFilter_Default source_filter;
+ if (!filter) {
+ filter = &source_filter;
+ }
+
+ std::optional<rgw_user> override_owner;
+
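+  // the attrs handler runs once the remote object's attrs arrive: it lets
+  // the fetch filter choose owner/placement, loads the compression plugin
+  // for the chosen placement's compression type, and prepares the processor
+  // before any data is written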
+ RGWRadosPutObj cb(dpp, cct, plugin, compressor, &processor, progress_cb, progress_data,
+ [&](map<string, bufferlist>& obj_attrs) {
+ const rgw_placement_rule *ptail_rule;
+
+ int ret = filter->filter(cct,
+ src_obj.key,
+ dest_bucket_info,
+ dest_placement_rule,
+ obj_attrs,
+ &override_owner,
+ &ptail_rule);
+ if (ret < 0) {
+ ldpp_dout(dpp, 5) << "Aborting fetch: source object filter returned ret=" << ret << dendl;
+ return ret;
+ }
+
+ processor.set_tail_placement(*ptail_rule);
+
+ const auto& compression_type = svc.zone->get_zone_params().get_compression_type(*ptail_rule);
+ if (compression_type != "none") {
+ plugin = Compressor::create(cct, compression_type);
+ if (!plugin) {
+ ldpp_dout(dpp, 1) << "Cannot load plugin for compression type "
+ << compression_type << dendl;
+ }
+ }
+
+ ret = processor.prepare(null_yield);
+ if (ret < 0) {
+ return ret;
+ }
+ return 0;
+ });
+
+ string etag;
+ real_time set_mtime;
+ uint64_t accounted_size = 0;
+
+ RGWObjState *dest_state = NULL;
+ RGWObjManifest *manifest = nullptr;
+
+ const real_time *pmod = mod_ptr;
+
+ obj_time_weight dest_mtime_weight;
+ rgw_zone_set_entry dst_zone_trace(svc.zone->get_zone().id, dest_bucket_info.bucket.get_key());
+
+ if (copy_if_newer) {
+ /* need to get mtime for destination */
+ ret = get_obj_state(dpp, &obj_ctx, dest_bucket_info, dest_obj, &dest_state, &manifest, false, null_yield);
+ if (ret < 0)
+ goto set_err_state;
+
+ if (!real_clock::is_zero(dest_state->mtime)) {
+ dest_mtime_weight.init(dest_state);
+ pmod = &dest_mtime_weight.mtime;
+ }
+ }
+
+ static constexpr bool prepend_meta = true;
+ static constexpr bool get_op = true;
+ static constexpr bool rgwx_stat = false;
+ static constexpr bool sync_manifest = true;
+ static constexpr bool skip_decrypt = true;
+ static constexpr bool sync_cloudtiered = true;
+ ret = conn->get_obj(dpp, user_id, info, src_obj, pmod, unmod_ptr,
+ dest_mtime_weight.zone_short_id, dest_mtime_weight.pg_ver,
+ prepend_meta, get_op, rgwx_stat,
+ sync_manifest, skip_decrypt, &dst_zone_trace,
+ sync_cloudtiered, true,
+ &cb, &in_stream_req);
+ if (ret < 0) {
+ goto set_err_state;
+ }
+
+ ret = conn->complete_request(in_stream_req, &etag, &set_mtime,
+ &accounted_size, nullptr, nullptr, null_yield);
+ if (ret < 0) {
+ goto set_err_state;
+ }
+ ret = cb.flush();
+ if (ret < 0) {
+ goto set_err_state;
+ }
+ if (cb.get_data_len() != accounted_size) {
+ ret = -EIO;
+ ldpp_dout(dpp, 0) << "ERROR: object truncated during fetching, expected "
+ << accounted_size << " bytes but received " << cb.get_data_len() << dendl;
+ goto set_err_state;
+ }
+
+ if (compressor && compressor->is_compressed()) {
+ bufferlist tmp;
+ RGWCompressionInfo cs_info;
+ cs_info.compression_type = plugin->get_type_name();
+ cs_info.orig_size = accounted_size;
+ cs_info.compressor_message = compressor->get_compressor_message();
+    cs_info.blocks = std::move(compressor->get_compression_blocks());
+ encode(cs_info, tmp);
+ cb.get_attrs()[RGW_ATTR_COMPRESSION] = tmp;
+ } else if (auto c = cb.get_attrs().find(RGW_ATTR_COMPRESSION);
+ c != cb.get_attrs().end()) {
+ // if the object was transferred in its compressed+encrypted form, use its
+ // original uncompressed size
+ try {
+ RGWCompressionInfo info;
+ auto p = c->second.cbegin();
+ decode(info, p);
+ accounted_size = info.orig_size;
+ } catch (const buffer::error&) {
+ ldpp_dout(dpp, 0) << "ERROR: could not decode compression attr for "
+ "replicated object " << dest_obj << dendl;
+ // decode error isn't fatal, but we might put the wrong size in the index
+ }
+ }
+
+ if (override_owner) {
+ processor.set_owner(*override_owner);
+
+ auto& obj_attrs = cb.get_attrs();
+
+ RGWUserInfo owner_info;
+ if (ctl.user->get_info_by_uid(dpp, *override_owner, &owner_info, null_yield) < 0) {
+ ldpp_dout(dpp, 10) << "owner info does not exist" << dendl;
+ return -EINVAL;
+ }
+
+ RGWAccessControlPolicy acl;
+
+ auto aiter = obj_attrs.find(RGW_ATTR_ACL);
+ if (aiter == obj_attrs.end()) {
+ ldpp_dout(dpp, 0) << "WARNING: " << __func__ << "(): object doesn't have ACL attribute, setting default ACLs" << dendl;
+ acl.create_default(owner_info.user_id, owner_info.display_name);
+ } else {
+ auto iter = aiter->second.cbegin();
+ try {
+ acl.decode(iter);
+ } catch (buffer::error& err) {
+ ldpp_dout(dpp, 0) << "ERROR: " << __func__ << "(): could not decode policy, caught buffer::error" << dendl;
+ return -EIO;
+ }
+ }
+
+ ACLOwner new_owner;
+ new_owner.set_id(*override_owner);
+ new_owner.set_name(owner_info.display_name);
+
+ acl.set_owner(new_owner);
+
+ bufferlist bl;
+ acl.encode(bl);
+ obj_attrs[RGW_ATTR_ACL] = std::move(bl);
+ }
+
+ if (source_zone.empty()) { /* need to preserve expiration if copy in the same zonegroup */
+ cb.get_attrs().erase(RGW_ATTR_DELETE_AT);
+ } else {
+ map<string, bufferlist>::iterator iter = cb.get_attrs().find(RGW_ATTR_DELETE_AT);
+ if (iter != cb.get_attrs().end()) {
+ try {
+ decode(delete_at, iter->second);
+ } catch (buffer::error& err) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to decode delete_at field in intra zone copy" << dendl;
+ }
+ }
+ }
+
+ if (src_mtime) {
+ *src_mtime = set_mtime;
+ }
+
+ if (petag) {
+ const auto iter = cb.get_attrs().find(RGW_ATTR_ETAG);
+ if (iter != cb.get_attrs().end()) {
+ *petag = iter->second.to_str();
+ }
+ }
+
+ //erase the append attr
+ cb.get_attrs().erase(RGW_ATTR_APPEND_PART_NUM);
+
+ { // add x-amz-replication-status=REPLICA
+ auto& bl = cb.get_attrs()[RGW_ATTR_OBJ_REPLICATION_STATUS];
+ bl.clear(); // overwrite source's status
+ bl.append("REPLICA");
+ }
+ { // update replication trace
+ std::vector<rgw_zone_set_entry> trace;
+ if (auto i = cb.get_attrs().find(RGW_ATTR_OBJ_REPLICATION_TRACE);
+ i != cb.get_attrs().end()) {
+ try {
+ decode(trace, i->second);
+ } catch (const buffer::error&) {}
+ }
+ // add the source entry to the end
+ trace.push_back(source_trace_entry);
+
+ bufferlist bl;
+ encode(trace, bl);
+ cb.get_attrs()[RGW_ATTR_OBJ_REPLICATION_TRACE] = std::move(bl);
+ }
+
+ if (source_zone.empty()) {
+ set_copy_attrs(cb.get_attrs(), attrs, attrs_mod);
+ } else {
+ attrs = cb.get_attrs();
+ }
+
+ if (copy_if_newer) {
+ uint64_t pg_ver = 0;
+ auto i = attrs.find(RGW_ATTR_PG_VER);
+ if (i != attrs.end() && i->second.length() > 0) {
+ auto iter = i->second.cbegin();
+ try {
+ decode(pg_ver, iter);
+ } catch (buffer::error& err) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to decode pg ver attribute, ignoring" << dendl;
+ /* non critical error */
+ }
+ }
+ set_mtime_weight.init(set_mtime, svc.zone->get_zone_short_id(), pg_ver);
+ }
+
+  /* Perform ETag verification if we have computed the object's MD5 sum at our end */
+ if (const auto& verifier_etag = cb.get_verifier_etag();
+ !verifier_etag.empty()) {
+ string trimmed_etag = etag;
+
+ /* Remove the leading and trailing double quotes from etag */
+ trimmed_etag.erase(std::remove(trimmed_etag.begin(), trimmed_etag.end(),'\"'),
+ trimmed_etag.end());
+
+ if (verifier_etag != trimmed_etag) {
+ ret = -EIO;
+ ldpp_dout(dpp, 0) << "ERROR: source and destination objects don't match. Expected etag:"
+ << trimmed_etag << " Computed etag:" << verifier_etag << dendl;
+ goto set_err_state;
+ }
+ }
+
+#define MAX_COMPLETE_RETRY 100
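+  // processor.complete() can be canceled if another writer raced us; with
+  // copy_if_newer, re-read the destination state and retry only while our
+  // source mtime still wins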
+ for (i = 0; i < MAX_COMPLETE_RETRY; i++) {
+ bool canceled = false;
+ ret = processor.complete(accounted_size, etag, mtime, set_mtime,
+ attrs, delete_at, nullptr, nullptr, nullptr,
+ zones_trace, &canceled, null_yield);
+ if (ret < 0) {
+ goto set_err_state;
+ }
+
+ if (copy_if_newer && canceled) {
+ ldpp_dout(dpp, 20) << "raced with another write of obj: " << dest_obj << dendl;
+ obj_ctx.invalidate(dest_obj); /* object was overwritten */
+ ret = get_obj_state(dpp, &obj_ctx, dest_bucket_info, dest_obj, &dest_state, &manifest, false, null_yield);
+ if (ret < 0) {
+        ldpp_dout(dpp, 0) << "ERROR: " << __func__ << ": get_obj_state() returned ret=" << ret << dendl;
+ goto set_err_state;
+ }
+ dest_mtime_weight.init(dest_state);
+ dest_mtime_weight.high_precision = high_precision_time;
+ if (!dest_state->exists ||
+ dest_mtime_weight < set_mtime_weight) {
+ ldpp_dout(dpp, 20) << "retrying writing object mtime=" << set_mtime << " dest_state->mtime=" << dest_state->mtime << " dest_state->exists=" << dest_state->exists << dendl;
+ continue;
+ } else {
+ ldpp_dout(dpp, 20) << "not retrying writing object mtime=" << set_mtime << " dest_state->mtime=" << dest_state->mtime << " dest_state->exists=" << dest_state->exists << dendl;
+ }
+ }
+ break;
+ }
+
+ if (i == MAX_COMPLETE_RETRY) {
+ ldpp_dout(dpp, 0) << "ERROR: retried object completion too many times, something is wrong!" << dendl;
+ ret = -EIO;
+ goto set_err_state;
+ }
+
+ if (bytes_transferred) {
+ *bytes_transferred = cb.get_data_len();
+ }
+ return 0;
+set_err_state:
+ if (copy_if_newer && ret == -ERR_NOT_MODIFIED) {
+ // we may have already fetched during sync of OP_ADD, but were waiting
+ // for OP_LINK_OLH to call set_olh() with a real olh_epoch
+ if (olh_epoch && *olh_epoch > 0) {
+ constexpr bool log_data_change = true;
+ ret = set_olh(dpp, obj_ctx, dest_bucket_info, dest_obj, false, nullptr,
+ *olh_epoch, real_time(), false, null_yield, zones_trace, log_data_change);
+ } else {
+ // we already have the latest copy
+ ret = 0;
+ }
+ }
+ return ret;
+}
+
+
+int RGWRados::copy_obj_to_remote_dest(const DoutPrefixProvider *dpp,
+ RGWObjState *astate,
+ map<string, bufferlist>& src_attrs,
+ RGWRados::Object::Read& read_op,
+ const rgw_user& user_id,
+ const rgw_obj& dest_obj,
+ real_time *mtime)
+{
+ string etag;
+
+ RGWRESTStreamS3PutObj *out_stream_req;
+
+ auto rest_master_conn = svc.zone->get_master_conn();
+
+ int ret = rest_master_conn->put_obj_async_init(dpp, user_id, dest_obj, src_attrs, &out_stream_req);
+ if (ret < 0) {
+ return ret;
+ }
+
+ out_stream_req->set_send_length(astate->size);
+
+ ret = RGWHTTP::send(out_stream_req);
+ if (ret < 0) {
+ delete out_stream_req;
+ return ret;
+ }
+
+ ret = read_op.iterate(dpp, 0, astate->size - 1, out_stream_req->get_out_cb(), null_yield);
+ if (ret < 0) {
+ delete out_stream_req;
+ return ret;
+ }
+
+ ret = rest_master_conn->complete_request(out_stream_req, etag, mtime, null_yield);
+ if (ret < 0)
+ return ret;
+
+ return 0;
+}
+
+/**
+ * Copy an object.
+ * dest_obj: the object to copy into
+ * src_obj: the object to copy from
+ * attrs: usage depends on attrs_mod parameter
+ * attrs_mod: the modification mode of the attrs, may have the following values:
+ * ATTRSMOD_NONE - the attributes of the source object will be
+ * copied without modifications, attrs parameter is ignored;
+ * ATTRSMOD_REPLACE - new object will have the attributes provided by attrs
+ * parameter, source object attributes are not copied;
+ * ATTRSMOD_MERGE - any conflicting meta keys on the source object's attributes
+ * are overwritten by values contained in attrs parameter.
+ * err: stores any errors resulting from the get of the original object
+ * Returns: 0 on success, -ERR# otherwise.
+ */
+int RGWRados::copy_obj(RGWObjectCtx& obj_ctx,
+ const rgw_user& user_id,
+ req_info *info,
+ const rgw_zone_id& source_zone,
+ const rgw_obj& dest_obj,
+ const rgw_obj& src_obj,
+ RGWBucketInfo& dest_bucket_info,
+ RGWBucketInfo& src_bucket_info,
+ const rgw_placement_rule& dest_placement,
+ real_time *src_mtime,
+ real_time *mtime,
+ const real_time *mod_ptr,
+ const real_time *unmod_ptr,
+ bool high_precision_time,
+ const char *if_match,
+ const char *if_nomatch,
+ AttrsMod attrs_mod,
+ bool copy_if_newer,
+ rgw::sal::Attrs& attrs,
+ RGWObjCategory category,
+ uint64_t olh_epoch,
+ real_time delete_at,
+ string *version_id,
+ string *ptag,
+ string *petag,
+ void (*progress_cb)(off_t, void *),
+ void *progress_data,
+ const DoutPrefixProvider *dpp,
+ optional_yield y)
+{
+ int ret;
+ uint64_t obj_size;
+ rgw_obj shadow_obj = dest_obj;
+ string shadow_oid;
+
+ bool remote_src;
+ bool remote_dest;
+
+ append_rand_alpha(cct, dest_obj.get_oid(), shadow_oid, 32);
+ shadow_obj.init_ns(dest_obj.bucket, shadow_oid, shadow_ns);
+
+ auto& zonegroup = svc.zone->get_zonegroup();
+
+ remote_dest = !zonegroup.equals(dest_bucket_info.zonegroup);
+ remote_src = !zonegroup.equals(src_bucket_info.zonegroup);
+
+ if (remote_src && remote_dest) {
+ ldpp_dout(dpp, 0) << "ERROR: can't copy object when both src and dest buckets are remote" << dendl;
+ return -EINVAL;
+ }
+
+ ldpp_dout(dpp, 5) << "Copy object " << src_obj.bucket << ":" << src_obj.get_oid() << " => " << dest_obj.bucket << ":" << dest_obj.get_oid() << dendl;
+
+ if (remote_src || !source_zone.empty()) {
+ rgw_zone_set_entry source_trace_entry{source_zone.id, std::nullopt};
+ return fetch_remote_obj(obj_ctx, user_id, info, source_zone,
+ dest_obj, src_obj, dest_bucket_info, &src_bucket_info,
+ dest_placement, src_mtime, mtime, mod_ptr,
+ unmod_ptr, high_precision_time,
+ if_match, if_nomatch, attrs_mod, copy_if_newer, attrs, category,
+ olh_epoch, delete_at, ptag, petag, progress_cb, progress_data, dpp,
+ nullptr /* filter */, source_trace_entry);
+ }
+
+ map<string, bufferlist> src_attrs;
+ RGWRados::Object src_op_target(this, src_bucket_info, obj_ctx, src_obj);
+ RGWRados::Object::Read read_op(&src_op_target);
+
+ read_op.conds.mod_ptr = mod_ptr;
+ read_op.conds.unmod_ptr = unmod_ptr;
+ read_op.conds.high_precision_time = high_precision_time;
+ read_op.conds.if_match = if_match;
+ read_op.conds.if_nomatch = if_nomatch;
+ read_op.params.attrs = &src_attrs;
+ read_op.params.lastmod = src_mtime;
+ read_op.params.obj_size = &obj_size;
+
+ ret = read_op.prepare(y, dpp);
+ if (ret < 0) {
+ return ret;
+ }
+ if (src_attrs.count(RGW_ATTR_CRYPT_MODE)) {
+    // The current implementation does not follow the S3 spec and may even
+    // silently corrupt data when copying multipart objects across pools,
+    // so reject COPY operations on encrypted objects until this is fully
+    // functional.
+ ldpp_dout(dpp, 0) << "ERROR: copy op for encrypted object " << src_obj
+ << " has not been implemented." << dendl;
+ return -ERR_NOT_IMPLEMENTED;
+ }
+
+ src_attrs[RGW_ATTR_ACL] = attrs[RGW_ATTR_ACL];
+ src_attrs.erase(RGW_ATTR_DELETE_AT);
+
+ src_attrs.erase(RGW_ATTR_OBJECT_RETENTION);
+ src_attrs.erase(RGW_ATTR_OBJECT_LEGAL_HOLD);
+ map<string, bufferlist>::iterator rt = attrs.find(RGW_ATTR_OBJECT_RETENTION);
+ if (rt != attrs.end())
+ src_attrs[RGW_ATTR_OBJECT_RETENTION] = rt->second;
+ map<string, bufferlist>::iterator lh = attrs.find(RGW_ATTR_OBJECT_LEGAL_HOLD);
+ if (lh != attrs.end())
+ src_attrs[RGW_ATTR_OBJECT_LEGAL_HOLD] = lh->second;
+
+ set_copy_attrs(src_attrs, attrs, attrs_mod);
+ attrs.erase(RGW_ATTR_ID_TAG);
+ attrs.erase(RGW_ATTR_PG_VER);
+ attrs.erase(RGW_ATTR_SOURCE_ZONE);
+ map<string, bufferlist>::iterator cmp = src_attrs.find(RGW_ATTR_COMPRESSION);
+ if (cmp != src_attrs.end())
+ attrs[RGW_ATTR_COMPRESSION] = cmp->second;
+
+ RGWObjManifest manifest;
+ RGWObjState *astate = NULL;
+ RGWObjManifest *amanifest = nullptr;
+
+ ret = get_obj_state(dpp, &obj_ctx, src_bucket_info, src_obj, &astate, &amanifest, y);
+ if (ret < 0) {
+ return ret;
+ }
+
+ vector<rgw_raw_obj> ref_objs;
+
+ if (remote_dest) {
+ /* dest is in a different zonegroup, copy it there */
+ return copy_obj_to_remote_dest(dpp, astate, attrs, read_op, user_id, dest_obj, mtime);
+ }
+ uint64_t max_chunk_size;
+
+ ret = get_max_chunk_size(dest_bucket_info.placement_rule, dest_obj, &max_chunk_size, dpp);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to get max_chunk_size() for bucket " << dest_obj.bucket << dendl;
+ return ret;
+ }
+
+ rgw_pool src_pool;
+ rgw_pool dest_pool;
+
+ const rgw_placement_rule *src_rule{nullptr};
+
+ if (amanifest) {
+ src_rule = &amanifest->get_tail_placement().placement_rule;
+ ldpp_dout(dpp, 20) << __func__ << "(): manifest src_rule=" << src_rule->to_str() << dendl;
+ }
+
+ if (!src_rule || src_rule->empty()) {
+ src_rule = &src_bucket_info.placement_rule;
+ }
+
+ if (!get_obj_data_pool(*src_rule, src_obj, &src_pool)) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to locate data pool for " << src_obj << dendl;
+ return -EIO;
+ }
+
+ if (!get_obj_data_pool(dest_placement, dest_obj, &dest_pool)) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to locate data pool for " << dest_obj << dendl;
+ return -EIO;
+ }
+
+ ldpp_dout(dpp, 20) << __func__ << "(): src_rule=" << src_rule->to_str() << " src_pool=" << src_pool
+ << " dest_rule=" << dest_placement.to_str() << " dest_pool=" << dest_pool << dendl;
+
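+  // the tail can only be shared by reference when the source has a manifest
+  // and source and destination use the same placement rule and data pool;
+  // otherwise the object data must be copied. if the head carries data that
+  // fits in max_chunk_size, only the head is copied and the tail is shared
+  // (copy_first)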
+ bool copy_data = (!amanifest) ||
+ (*src_rule != dest_placement) ||
+ (src_pool != dest_pool);
+
+ bool copy_first = false;
+ if (amanifest) {
+ if (!amanifest->has_tail()) {
+ copy_data = true;
+ } else {
+ uint64_t head_size = amanifest->get_head_size();
+
+ if (head_size > 0) {
+ if (head_size > max_chunk_size) {
+ copy_data = true;
+ } else {
+ copy_first = true;
+ }
+ }
+ }
+ }
+
+ if (petag) {
+ const auto iter = attrs.find(RGW_ATTR_ETAG);
+ if (iter != attrs.end()) {
+ *petag = iter->second.to_str();
+ }
+ }
+
+ if (copy_data) { /* refcounting tail wouldn't work here, just copy the data */
+ attrs.erase(RGW_ATTR_TAIL_TAG);
+ return copy_obj_data(obj_ctx, dest_bucket_info, dest_placement, read_op, obj_size - 1, dest_obj,
+ mtime, real_time(), attrs, olh_epoch, delete_at, petag, dpp, y);
+ }
+
+ /* This has been in for 2 years, so we can safely assume amanifest is not NULL */
+ RGWObjManifest::obj_iterator miter = amanifest->obj_begin(dpp);
+
+ if (copy_first) { // we need to copy first chunk, not increase refcount
+ ++miter;
+ }
+
+ bufferlist first_chunk;
+
+ const bool copy_itself = (dest_obj == src_obj);
+ RGWObjManifest *pmanifest;
+ ldpp_dout(dpp, 20) << "dest_obj=" << dest_obj << " src_obj=" << src_obj << " copy_itself=" << (int)copy_itself << dendl;
+
+ RGWRados::Object dest_op_target(this, dest_bucket_info, obj_ctx, dest_obj);
+ RGWRados::Object::Write write_op(&dest_op_target);
+
+ string tag;
+
+ if (ptag) {
+ tag = *ptag;
+ }
+
+ if (tag.empty()) {
+ append_rand_alpha(cct, tag, tag, 32);
+ }
+
+ std::unique_ptr<rgw::Aio> aio;
+ rgw::AioResultList all_results;
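+  // when copying to a different object, take a reference (cls_refcount_get)
+  // on every tail rados object under a fresh tag so both heads can share the
+  // same tail; any refs taken are rolled back in done_ret on failure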
+ if (!copy_itself) {
+ aio = rgw::make_throttle(cct->_conf->rgw_max_copy_obj_concurrent_io, y);
+ attrs.erase(RGW_ATTR_TAIL_TAG);
+ manifest = *amanifest;
+ const rgw_bucket_placement& tail_placement = manifest.get_tail_placement();
+ if (tail_placement.bucket.name.empty()) {
+ manifest.set_tail_placement(tail_placement.placement_rule, src_obj.bucket);
+ }
+ string ref_tag;
+ for (; miter != amanifest->obj_end(dpp); ++miter) {
+ ObjectWriteOperation op;
+ ref_tag = tag + '\0';
+ cls_refcount_get(op, ref_tag, true);
+
+ auto obj = svc.rados->obj(miter.get_location().get_raw_obj(this));
+ ret = obj.open(dpp);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "failed to open rados context for " << obj << dendl;
+ goto done_ret;
+ }
+
+ static constexpr uint64_t cost = 1; // 1 throttle unit per request
+ static constexpr uint64_t id = 0; // ids unused
+ rgw::AioResultList completed = aio->get(obj, rgw::Aio::librados_op(std::move(op), y), cost, id);
+ ret = rgw::check_for_errors(completed);
+ all_results.splice(all_results.end(), completed);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to copy obj=" << obj << ", the error code = " << ret << dendl;
+ goto done_ret;
+ }
+ }
+
+ rgw::AioResultList completed = aio->drain();
+ ret = rgw::check_for_errors(completed);
+ all_results.splice(all_results.end(), completed);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to drain ios, the error code = " << ret <<dendl;
+ goto done_ret;
+ }
+
+ pmanifest = &manifest;
+ } else {
+ pmanifest = amanifest;
+ /* don't send the object's tail for garbage collection */
+ astate->keep_tail = true;
+ }
+
+ if (copy_first) {
+ ret = read_op.read(0, max_chunk_size, first_chunk, y, dpp);
+ if (ret < 0) {
+ goto done_ret;
+ }
+
+ pmanifest->set_head(dest_bucket_info.placement_rule, dest_obj, first_chunk.length());
+ } else {
+ pmanifest->set_head(dest_bucket_info.placement_rule, dest_obj, 0);
+ }
+
+ write_op.meta.data = &first_chunk;
+ write_op.meta.manifest = pmanifest;
+ write_op.meta.ptag = &tag;
+ write_op.meta.owner = dest_bucket_info.owner;
+ write_op.meta.mtime = mtime;
+ write_op.meta.flags = PUT_OBJ_CREATE;
+ write_op.meta.category = category;
+ write_op.meta.olh_epoch = olh_epoch;
+ write_op.meta.delete_at = delete_at;
+ write_op.meta.modify_tail = !copy_itself;
+
+ ret = write_op.write_meta(dpp, obj_size, astate->accounted_size, attrs, y);
+ if (ret < 0) {
+ goto done_ret;
+ }
+
+ return 0;
+
+done_ret:
+ if (!copy_itself) {
+
+ /* wait all pending op done */
+ rgw::AioResultList completed = aio->drain();
+ all_results.splice(all_results.end(), completed);
+
+ /* rollback reference */
+ string ref_tag = tag + '\0';
+ int ret2 = 0;
+ for (auto& r : all_results) {
+ if (r.result < 0) {
+ continue; // skip errors
+ }
+ ObjectWriteOperation op;
+ cls_refcount_put(op, ref_tag, true);
+
+ static constexpr uint64_t cost = 1; // 1 throttle unit per request
+ static constexpr uint64_t id = 0; // ids unused
+ rgw::AioResultList completed = aio->get(r.obj, rgw::Aio::librados_op(std::move(op), y), cost, id);
+ ret2 = rgw::check_for_errors(completed);
+ if (ret2 < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: cleanup after error failed to drop reference on obj=" << r.obj << dendl;
+ }
+ }
+ completed = aio->drain();
+ ret2 = rgw::check_for_errors(completed);
+ if (ret2 < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to drain rollback ios, the error code = " << ret2 <<dendl;
+ }
+ }
+ return ret;
+}
+
+
+int RGWRados::copy_obj_data(RGWObjectCtx& obj_ctx,
+ RGWBucketInfo& dest_bucket_info,
+ const rgw_placement_rule& dest_placement,
+ RGWRados::Object::Read& read_op, off_t end,
+ const rgw_obj& dest_obj,
+ real_time *mtime,
+ real_time set_mtime,
+ rgw::sal::Attrs& attrs,
+ uint64_t olh_epoch,
+ real_time delete_at,
+ string *petag,
+ const DoutPrefixProvider *dpp,
+ optional_yield y)
+{
+ string tag;
+ append_rand_alpha(cct, tag, tag, 32);
+
+ rgw::BlockingAioThrottle aio(cct->_conf->rgw_put_obj_min_window_size);
+ using namespace rgw::putobj;
+ // do not change the null_yield in the initialization of this AtomicObjectProcessor
+ // it causes crashes in the ragweed tests
+ AtomicObjectProcessor processor(&aio, this, dest_bucket_info, &dest_placement,
+ dest_bucket_info.owner, obj_ctx,
+ dest_obj, olh_epoch, tag,
+ dpp, null_yield);
+ int ret = processor.prepare(y);
+ if (ret < 0)
+ return ret;
+
+ off_t ofs = 0;
+
+ do {
+ bufferlist bl;
+ ret = read_op.read(ofs, end, bl, y, dpp);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: fail to read object data, ret = " << ret << dendl;
+ return ret;
+ }
+
+ uint64_t read_len = ret;
+ ret = processor.process(std::move(bl), ofs);
+ if (ret < 0) {
+ return ret;
+ }
+
+ ofs += read_len;
+ } while (ofs <= end);
+
+ // flush
+ ret = processor.process({}, ofs);
+ if (ret < 0) {
+ return ret;
+ }
+
+ string etag;
+ auto iter = attrs.find(RGW_ATTR_ETAG);
+ if (iter != attrs.end()) {
+ bufferlist& bl = iter->second;
+ etag = bl.to_str();
+ if (petag) {
+ *petag = etag;
+ }
+ }
+
+ uint64_t accounted_size;
+ {
+ bool compressed{false};
+ RGWCompressionInfo cs_info;
+ ret = rgw_compression_info_from_attrset(attrs, compressed, cs_info);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to read compression info" << dendl;
+ return ret;
+ }
+ // pass original size if compressed
+ accounted_size = compressed ? cs_info.orig_size : ofs;
+ }
+
+ return processor.complete(accounted_size, etag, mtime, set_mtime, attrs, delete_at,
+ nullptr, nullptr, nullptr, nullptr, nullptr, y);
+}
+
+int RGWRados::transition_obj(RGWObjectCtx& obj_ctx,
+ RGWBucketInfo& bucket_info,
+ const rgw_obj& obj,
+ const rgw_placement_rule& placement_rule,
+ const real_time& mtime,
+ uint64_t olh_epoch,
+ const DoutPrefixProvider *dpp,
+ optional_yield y)
+{
+ rgw::sal::Attrs attrs;
+ real_time read_mtime;
+ uint64_t obj_size;
+
+ obj_ctx.set_atomic(obj);
+ RGWRados::Object op_target(this, bucket_info, obj_ctx, obj);
+ RGWRados::Object::Read read_op(&op_target);
+
+ read_op.params.attrs = &attrs;
+ read_op.params.lastmod = &read_mtime;
+ read_op.params.obj_size = &obj_size;
+
+ int ret = read_op.prepare(y, dpp);
+ if (ret < 0) {
+ return ret;
+ }
+
+ if (read_mtime != mtime) {
+ /* raced */
+ ldpp_dout(dpp, 0) << __func__ << " ERROR: failed to transition obj(" << obj.key << ") read_mtime = " << read_mtime << " doesn't match mtime = " << mtime << dendl;
+ return -ECANCELED;
+ }
+
+ attrs.erase(RGW_ATTR_ID_TAG);
+ attrs.erase(RGW_ATTR_TAIL_TAG);
+
+ ret = copy_obj_data(obj_ctx,
+ bucket_info,
+ placement_rule,
+ read_op,
+ obj_size - 1,
+ obj,
+ nullptr /* pmtime */,
+ mtime,
+ attrs,
+ olh_epoch,
+ real_time(),
+ nullptr /* petag */,
+ dpp,
+ y);
+ if (ret < 0) {
+ return ret;
+ }
+
+ return 0;
+}
+
+int RGWRados::check_bucket_empty(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, optional_yield y)
+{
+ constexpr uint NUM_ENTRIES = 1000u;
+
+ rgw_obj_index_key marker;
+ string prefix;
+ bool is_truncated;
+
+ do {
+ std::vector<rgw_bucket_dir_entry> ent_list;
+ ent_list.reserve(NUM_ENTRIES);
+
+ int r = cls_bucket_list_unordered(dpp,
+ bucket_info,
+ bucket_info.layout.current_index,
+ RGW_NO_SHARD,
+ marker,
+ prefix,
+ NUM_ENTRIES,
+ true,
+ ent_list,
+ &is_truncated,
+ &marker,
+ y);
+ if (r < 0) {
+ return r;
+ }
+
+ string ns;
+ for (auto const& dirent : ent_list) {
+ rgw_obj_key obj;
+
+ if (rgw_obj_key::oid_to_key_in_ns(dirent.key.name, &obj, ns)) {
+ return -ENOTEMPTY;
+ }
+ }
+ } while (is_truncated);
+
+ return 0;
+}
+
+/**
+ * Delete a bucket.
+ * bucket_info: the bucket to delete
+ * check_empty: if true, fail with -ENOTEMPTY when the bucket still contains objects
+ * Returns 0 on success, -ERR# otherwise.
+ */
+int RGWRados::delete_bucket(RGWBucketInfo& bucket_info, RGWObjVersionTracker& objv_tracker, optional_yield y, const DoutPrefixProvider *dpp, bool check_empty)
+{
+ const rgw_bucket& bucket = bucket_info.bucket;
+ RGWSI_RADOS::Pool index_pool;
+ map<int, string> bucket_objs;
+ int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, std::nullopt, bucket_info.layout.current_index, &index_pool, &bucket_objs, nullptr);
+ if (r < 0)
+ return r;
+
+ if (check_empty) {
+ r = check_bucket_empty(dpp, bucket_info, y);
+ if (r < 0) {
+ return r;
+ }
+ }
+
+ bool remove_ep = true;
+
+ if (objv_tracker.read_version.empty()) {
+ RGWBucketEntryPoint ep;
+ r = ctl.bucket->read_bucket_entrypoint_info(bucket_info.bucket,
+ &ep,
+ null_yield,
+ dpp,
+ RGWBucketCtl::Bucket::GetParams()
+ .set_objv_tracker(&objv_tracker));
+ if (r < 0 ||
+ (!bucket_info.bucket.bucket_id.empty() &&
+ ep.bucket.bucket_id != bucket_info.bucket.bucket_id)) {
+ if (r != -ENOENT) {
+ ldpp_dout(dpp, 0) << "ERROR: read_bucket_entrypoint_info() bucket=" << bucket_info.bucket << " returned error: r=" << r << dendl;
+ /* we have no idea what caused the error, will not try to remove it */
+ }
+ /*
+ * either failed to read bucket entrypoint, or it points to a different bucket instance than
+ * requested
+ */
+ remove_ep = false;
+ }
+ }
+
+ if (remove_ep) {
+ r = ctl.bucket->remove_bucket_entrypoint_info(bucket_info.bucket, null_yield, dpp,
+ RGWBucketCtl::Bucket::RemoveParams()
+ .set_objv_tracker(&objv_tracker));
+ if (r < 0)
+ return r;
+ }
+
+ /* if the bucket is not synced we can remove the meta file */
+ if (!svc.zone->is_syncing_bucket_meta(bucket)) {
+ RGWObjVersionTracker objv_tracker;
+ r = ctl.bucket->remove_bucket_instance_info(bucket, bucket_info, null_yield, dpp);
+ if (r < 0) {
+ return r;
+ }
+
+    /* remove bucket index objects asynchronously on a best-effort basis */
+ (void) CLSRGWIssueBucketIndexClean(index_pool.ioctx(),
+ bucket_objs,
+ cct->_conf->rgw_bucket_index_max_aio)();
+ }
+
+ return 0;
+}
+
+int RGWRados::set_bucket_owner(rgw_bucket& bucket, ACLOwner& owner, const DoutPrefixProvider *dpp)
+{
+ RGWBucketInfo info;
+ map<string, bufferlist> attrs;
+ int r;
+
+ if (bucket.bucket_id.empty()) {
+ r = get_bucket_info(&svc, bucket.tenant, bucket.name, info, NULL, null_yield, dpp, &attrs);
+ } else {
+ r = get_bucket_instance_info(bucket, info, nullptr, &attrs, null_yield, dpp);
+ }
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "NOTICE: get_bucket_info on bucket=" << bucket.name << " returned err=" << r << dendl;
+ return r;
+ }
+
+ info.owner = owner.get_id();
+
+ r = put_bucket_instance_info(info, false, real_time(), &attrs, dpp, null_yield);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "NOTICE: put_bucket_info on bucket=" << bucket.name << " returned err=" << r << dendl;
+ return r;
+ }
+
+ return 0;
+}
+
+
+int RGWRados::set_buckets_enabled(vector<rgw_bucket>& buckets, bool enabled, const DoutPrefixProvider *dpp)
+{
+ int ret = 0;
+
+ vector<rgw_bucket>::iterator iter;
+
+ for (iter = buckets.begin(); iter != buckets.end(); ++iter) {
+ rgw_bucket& bucket = *iter;
+ if (enabled) {
+ ldpp_dout(dpp, 20) << "enabling bucket name=" << bucket.name << dendl;
+ } else {
+ ldpp_dout(dpp, 20) << "disabling bucket name=" << bucket.name << dendl;
+ }
+
+ RGWBucketInfo info;
+ map<string, bufferlist> attrs;
+ int r = get_bucket_info(&svc, bucket.tenant, bucket.name, info, NULL, null_yield, dpp, &attrs);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "NOTICE: get_bucket_info on bucket=" << bucket.name << " returned err=" << r << ", skipping bucket" << dendl;
+ ret = r;
+ continue;
+ }
+ if (enabled) {
+ info.flags &= ~BUCKET_SUSPENDED;
+ } else {
+ info.flags |= BUCKET_SUSPENDED;
+ }
+
+ r = put_bucket_instance_info(info, false, real_time(), &attrs, dpp, null_yield);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "NOTICE: put_bucket_info on bucket=" << bucket.name << " returned err=" << r << ", skipping bucket" << dendl;
+ ret = r;
+ continue;
+ }
+ }
+ return ret;
+}
+
+int RGWRados::bucket_suspended(const DoutPrefixProvider *dpp, rgw_bucket& bucket, bool *suspended)
+{
+ RGWBucketInfo bucket_info;
+ int ret = get_bucket_info(&svc, bucket.tenant, bucket.name, bucket_info, NULL, null_yield, dpp);
+ if (ret < 0) {
+ return ret;
+ }
+
+ *suspended = ((bucket_info.flags & BUCKET_SUSPENDED) != 0);
+ return 0;
+}
+
+int RGWRados::Object::complete_atomic_modification(const DoutPrefixProvider *dpp)
+{
+  if (!manifest || state->keep_tail)
+ return 0;
+
+ cls_rgw_obj_chain chain;
+ store->update_gc_chain(dpp, obj, *manifest, &chain);
+
+ if (chain.empty()) {
+ return 0;
+ }
+
+ string tag = (state->tail_tag.length() > 0 ? state->tail_tag.to_str() : state->obj_tag.to_str());
+ if (store->gc == nullptr) {
+ ldpp_dout(dpp, 0) << "deleting objects inline since gc isn't initialized" << dendl;
+    // delete objects inline in case gc hasn't been initialized; this prevents crashes
+ store->delete_objs_inline(dpp, chain, tag);
+ } else {
+ auto [ret, leftover_chain] = store->gc->send_split_chain(chain, tag); // do it synchronously
+ if (ret < 0 && leftover_chain) {
+      // delete objects inline if sending the chain to gc fails
+ store->delete_objs_inline(dpp, *leftover_chain, tag);
+ }
+ }
+ return 0;
+}
+
+void RGWRados::update_gc_chain(const DoutPrefixProvider *dpp, rgw_obj head_obj, RGWObjManifest& manifest, cls_rgw_obj_chain *chain)
+{
+ RGWObjManifest::obj_iterator iter;
+ rgw_raw_obj raw_head;
+ obj_to_raw(manifest.get_head_placement_rule(), head_obj, &raw_head);
+ for (iter = manifest.obj_begin(dpp); iter != manifest.obj_end(dpp); ++iter) {
+ const rgw_raw_obj& mobj = iter.get_location().get_raw_obj(this);
+ if (mobj == raw_head)
+ continue;
+ cls_rgw_obj_key key(mobj.oid);
+ chain->push_obj(mobj.pool.to_str(), key, mobj.loc);
+ }
+}
+
+std::tuple<int, std::optional<cls_rgw_obj_chain>> RGWRados::send_chain_to_gc(cls_rgw_obj_chain& chain, const string& tag)
+{
+ if (chain.empty()) {
+ return {0, std::nullopt};
+ }
+
+ return gc->send_split_chain(chain, tag);
+}
+
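+// fallback used when gc is unavailable or enqueueing a chain fails:
+// synchronously drop the refcount taken under 'tag' on every tail object in
+// the chain so that unreferenced objects are removed right away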
+void RGWRados::delete_objs_inline(const DoutPrefixProvider *dpp, cls_rgw_obj_chain& chain, const string& tag)
+{
+ string last_pool;
+ std::unique_ptr<IoCtx> ctx(new IoCtx);
+ int ret = 0;
+ for (auto liter = chain.objs.begin(); liter != chain.objs.end(); ++liter) {
+ cls_rgw_obj& obj = *liter;
+ if (obj.pool != last_pool) {
+ ctx.reset(new IoCtx);
+ ret = rgw_init_ioctx(dpp, get_rados_handle(), obj.pool, *ctx);
+ if (ret < 0) {
+ last_pool = "";
+ ldpp_dout(dpp, 0) << "ERROR: failed to create ioctx pool=" <<
+ obj.pool << dendl;
+ continue;
+ }
+ last_pool = obj.pool;
+ }
+ ctx->locator_set_key(obj.loc);
+ const string& oid = obj.key.name; /* just stored raw oid there */
+ ldpp_dout(dpp, 5) << "delete_objs_inline: removing " << obj.pool <<
+ ":" << obj.key.name << dendl;
+ ObjectWriteOperation op;
+ cls_refcount_put(op, tag, true);
+ ret = ctx->operate(oid, &op);
+ if (ret < 0) {
+ ldpp_dout(dpp, 5) << "delete_objs_inline: refcount put returned error " << ret << dendl;
+ }
+ }
+}
+
+static void accumulate_raw_stats(const rgw_bucket_dir_header& header,
+ map<RGWObjCategory, RGWStorageStats>& stats)
+{
+ for (const auto& pair : header.stats) {
+ const RGWObjCategory category = static_cast<RGWObjCategory>(pair.first);
+ const rgw_bucket_category_stats& header_stats = pair.second;
+
+ RGWStorageStats& s = stats[category];
+
+ s.category = category;
+ s.size += header_stats.total_size;
+ s.size_rounded += header_stats.total_size_rounded;
+ s.size_utilized += header_stats.actual_size;
+ s.num_objects += header_stats.num_entries;
+ }
+}
+
+int RGWRados::bucket_check_index(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info,
+ map<RGWObjCategory, RGWStorageStats> *existing_stats,
+ map<RGWObjCategory, RGWStorageStats> *calculated_stats)
+{
+ RGWSI_RADOS::Pool index_pool;
+
+ // key - bucket index object id
+ // value - bucket index check OP returned result with the given bucket index object (shard)
+ map<int, string> oids;
+
+ int ret = svc.bi_rados->open_bucket_index(dpp, bucket_info, std::nullopt, bucket_info.layout.current_index, &index_pool, &oids, nullptr);
+ if (ret < 0) {
+ return ret;
+ }
+
+ // declare and pre-populate
+ map<int, struct rgw_cls_check_index_ret> bucket_objs_ret;
+ for (auto& iter : oids) {
+ bucket_objs_ret.emplace(iter.first, rgw_cls_check_index_ret());
+ }
+
+ ret = CLSRGWIssueBucketCheck(index_pool.ioctx(), oids, bucket_objs_ret, cct->_conf->rgw_bucket_index_max_aio)();
+ if (ret < 0) {
+ return ret;
+ }
+
+ // aggregate results (from different shards if there are any)
+ for (const auto& iter : bucket_objs_ret) {
+ accumulate_raw_stats(iter.second.existing_header, *existing_stats);
+ accumulate_raw_stats(iter.second.calculated_header, *calculated_stats);
+ }
+
+ return 0;
+}
+
+int RGWRados::bucket_rebuild_index(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info)
+{
+ RGWSI_RADOS::Pool index_pool;
+ map<int, string> bucket_objs;
+
+ int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, std::nullopt, bucket_info.layout.current_index, &index_pool, &bucket_objs, nullptr);
+ if (r < 0) {
+ return r;
+ }
+
+ return CLSRGWIssueBucketRebuild(index_pool.ioctx(), bucket_objs, cct->_conf->rgw_bucket_index_max_aio)();
+}
+
+static int resync_encrypted_multipart(const DoutPrefixProvider* dpp,
+ optional_yield y, RGWRados* store,
+ RGWBucketInfo& bucket_info,
+ RGWObjectCtx& obj_ctx,
+ const RGWObjState& state)
+{
+ // only overwrite if the tag hasn't changed
+ obj_ctx.set_atomic(state.obj);
+
+ // make a tiny adjustment to the existing mtime so that fetch_remote_obj()
+ // won't return ERR_NOT_MODIFIED when resyncing the object
+ const auto set_mtime = state.mtime + std::chrono::nanoseconds(1);
+
+ // use set_attrs() to update the mtime in a bucket index transaction so the
+ // change is recorded in bilog and datalog entries. this will cause any peer
+ // zones to resync the object
+ auto add_attrs = std::map<std::string, bufferlist>{
+ { RGW_ATTR_PREFIX "resync-encrypted-multipart", bufferlist{} },
+ };
+
+ return store->set_attrs(dpp, &obj_ctx, bucket_info, state.obj,
+ add_attrs, nullptr, y, set_mtime);
+}
+
+static void try_resync_encrypted_multipart(const DoutPrefixProvider* dpp,
+ optional_yield y, RGWRados* store,
+ RGWBucketInfo& bucket_info,
+ RGWObjectCtx& obj_ctx,
+ const rgw_bucket_dir_entry& dirent,
+ Formatter* f)
+{
+ const auto obj = rgw_obj{bucket_info.bucket, dirent.key};
+
+ RGWObjState* astate = nullptr;
+ RGWObjManifest* manifest = nullptr;
+ constexpr bool follow_olh = false; // dirent will have version ids
+ int ret = store->get_obj_state(dpp, &obj_ctx, bucket_info, obj,
+ &astate, &manifest, follow_olh, y);
+ if (ret < 0) {
+ ldpp_dout(dpp, 4) << obj << " does not exist" << dendl;
+ return;
+ }
+
+ // check whether the object is encrypted
+ if (auto i = astate->attrset.find(RGW_ATTR_CRYPT_MODE);
+ i == astate->attrset.end()) {
+ ldpp_dout(dpp, 4) << obj << " is not encrypted" << dendl;
+ return;
+ }
+
+ // check whether the object is multipart
+ if (!manifest) {
+ ldpp_dout(dpp, 4) << obj << " has no manifest so is not multipart" << dendl;
+ return;
+ }
+ const RGWObjManifest::obj_iterator end = manifest->obj_end(dpp);
+ if (end.get_cur_part_id() == 0) {
+ ldpp_dout(dpp, 4) << obj << " manifest is not multipart" << dendl;
+ return;
+ }
+
+ ret = resync_encrypted_multipart(dpp, y, store, bucket_info,
+ obj_ctx, *astate);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to update " << obj
+ << ": " << cpp_strerror(ret) << dendl;
+ return;
+ }
+
+ f->open_object_section("object");
+ encode_json("name", obj.key.name, f);
+ if (!obj.key.instance.empty()) {
+ encode_json("version", obj.key.instance, f);
+ }
+ encode_json("mtime", astate->mtime, f);
+ f->close_section(); // "object"
+}
+
+int RGWRados::bucket_resync_encrypted_multipart(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ rgw::sal::RadosStore* driver,
+ RGWBucketInfo& bucket_info,
+ const std::string& marker,
+ RGWFormatterFlusher& flusher)
+{
+ RGWRados::Bucket target(this, bucket_info);
+ RGWRados::Bucket::List list_op(&target);
+
+ list_op.params.marker.name = marker;
+ list_op.params.enforce_ns = true; // only empty ns
+ list_op.params.list_versions = true;
+ list_op.params.allow_unordered = true;
+
+ /* List bucket entries in chunks. */
+ static constexpr int MAX_LIST_OBJS = 100;
+ std::vector<rgw_bucket_dir_entry> entries;
+ entries.reserve(MAX_LIST_OBJS);
+
+ int processed = 0;
+ bool is_truncated = true;
+
+ Formatter* f = flusher.get_formatter();
+ f->open_array_section("progress");
+
+ do {
+ int ret = list_op.list_objects(dpp, MAX_LIST_OBJS, &entries, nullptr,
+ &is_truncated, y);
+ if (ret < 0) {
+ return ret;
+ }
+
+ f->open_object_section("batch");
+ f->open_array_section("modified");
+
+ for (const auto& dirent : entries) {
+ RGWObjectCtx obj_ctx{driver};
+ try_resync_encrypted_multipart(dpp, y, this, bucket_info,
+ obj_ctx, dirent, f);
+ }
+
+ f->close_section(); // "modified"
+
+ processed += entries.size();
+ encode_json("total processed", processed, f);
+ encode_json("marker", list_op.get_next_marker().name, f);
+ f->close_section(); // "batch"
+
+ flusher.flush(); // flush after each 'chunk'
+ } while (is_truncated);
+
+ f->close_section(); // "progress" array
+ return 0;
+}
+
+int RGWRados::bucket_set_reshard(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const cls_rgw_bucket_instance_entry& entry)
+{
+ RGWSI_RADOS::Pool index_pool;
+ map<int, string> bucket_objs;
+
+ int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, std::nullopt, bucket_info.layout.current_index, &index_pool, &bucket_objs, nullptr);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
+ ": unable to open bucket index, r=" << r << " (" <<
+ cpp_strerror(-r) << ")" << dendl;
+ return r;
+ }
+
+ r = CLSRGWIssueSetBucketResharding(index_pool.ioctx(), bucket_objs, entry, cct->_conf->rgw_bucket_index_max_aio)();
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
+ ": unable to issue set bucket resharding, r=" << r << " (" <<
+ cpp_strerror(-r) << ")" << dendl;
+ }
+ return r;
+}
+
+int RGWRados::defer_gc(const DoutPrefixProvider *dpp, RGWObjectCtx* rctx, RGWBucketInfo& bucket_info, const rgw_obj& obj, optional_yield y)
+{
+ std::string oid, key;
+ get_obj_bucket_and_oid_loc(obj, oid, key);
+ if (!rctx)
+ return 0;
+
+ RGWObjState *state = NULL;
+ RGWObjManifest *manifest = nullptr;
+
+ int r = get_obj_state(dpp, rctx, bucket_info, obj, &state, &manifest, false, y);
+ if (r < 0)
+ return r;
+
+ if (!state->is_atomic) {
+ ldpp_dout(dpp, 20) << "state for obj=" << obj << " is not atomic, not deferring gc operation" << dendl;
+ return -EINVAL;
+ }
+
+ string tag;
+
+ if (state->tail_tag.length() > 0) {
+ tag = state->tail_tag.c_str();
+ } else if (state->obj_tag.length() > 0) {
+ tag = state->obj_tag.c_str();
+ } else {
+ ldpp_dout(dpp, 20) << "state->obj_tag is empty, not deferring gc operation" << dendl;
+ return -EINVAL;
+ }
+
+ ldpp_dout(dpp, 0) << "defer chain tag=" << tag << dendl;
+
+ cls_rgw_obj_chain chain;
+ update_gc_chain(dpp, state->obj, *manifest, &chain);
+ return gc->async_defer_chain(tag, chain);
+}
+
+void RGWRados::remove_rgw_head_obj(ObjectWriteOperation& op)
+{
+ list<string> prefixes;
+ prefixes.push_back(RGW_ATTR_OLH_PREFIX);
+ cls_rgw_remove_obj(op, prefixes);
+}
+
+void RGWRados::cls_obj_check_prefix_exist(ObjectOperation& op, const string& prefix, bool fail_if_exist)
+{
+ cls_rgw_obj_check_attrs_prefix(op, prefix, fail_if_exist);
+}
+
+void RGWRados::cls_obj_check_mtime(ObjectOperation& op, const real_time& mtime, bool high_precision_time, RGWCheckMTimeType type)
+{
+ cls_rgw_obj_check_mtime(op, mtime, high_precision_time, type);
+}
+
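+// snapshot of an object's mtime/zone/pg_ver taken at delete time; stored in
+// the tombstone cache by delete_obj() below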
+struct tombstone_entry {
+ ceph::real_time mtime;
+ uint32_t zone_short_id;
+ uint64_t pg_ver;
+
+ tombstone_entry() = default;
+ explicit tombstone_entry(const RGWObjState& state)
+ : mtime(state.mtime), zone_short_id(state.zone_short_id),
+ pg_ver(state.pg_ver) {}
+};
+
+/**
+ * Delete an object.
+ * The bucket and object to delete, along with versioning and precondition
+ * parameters, come from the enclosing RGWRados::Object target and params.
+ * Returns: 0 on success, -ERR# otherwise.
+ */
+int RGWRados::Object::Delete::delete_obj(optional_yield y, const DoutPrefixProvider *dpp)
+{
+ RGWRados *store = target->get_store();
+ const rgw_obj& src_obj = target->get_obj();
+ const string& instance = src_obj.key.instance;
+ rgw_obj obj = target->get_obj();
+
+ if (instance == "null") {
+ obj.key.instance.clear();
+ }
+
+ bool explicit_marker_version = (!params.marker_version_id.empty());
+
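+  // on a versioned bucket, a delete without a version id (or with an explicit
+  // marker version id) creates a delete marker via set_olh(); deleting a
+  // specific version unlinks that instance from the OLH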
+ if (params.versioning_status & BUCKET_VERSIONED || explicit_marker_version) {
+ if (instance.empty() || explicit_marker_version) {
+ rgw_obj marker = obj;
+ marker.key.instance.clear();
+
+ if (!params.marker_version_id.empty()) {
+ if (params.marker_version_id != "null") {
+ marker.key.set_instance(params.marker_version_id);
+ }
+ } else if ((params.versioning_status & BUCKET_VERSIONS_SUSPENDED) == 0) {
+ store->gen_rand_obj_instance_name(&marker);
+ }
+
+ result.version_id = marker.key.instance;
+ if (result.version_id.empty())
+ result.version_id = "null";
+ result.delete_marker = true;
+
+ struct rgw_bucket_dir_entry_meta meta;
+
+ meta.owner = params.obj_owner.get_id().to_str();
+ meta.owner_display_name = params.obj_owner.get_display_name();
+
+ if (real_clock::is_zero(params.mtime)) {
+ meta.mtime = real_clock::now();
+ } else {
+ meta.mtime = params.mtime;
+ }
+
+ int r = store->set_olh(dpp, target->get_ctx(), target->get_bucket_info(), marker, true, &meta, params.olh_epoch, params.unmod_since, params.high_precision_time, y, params.zones_trace);
+ if (r < 0) {
+ return r;
+ }
+ } else {
+ rgw_bucket_dir_entry dirent;
+
+ int r = store->bi_get_instance(dpp, target->get_bucket_info(), obj, &dirent);
+ if (r < 0) {
+ return r;
+ }
+ result.delete_marker = dirent.is_delete_marker();
+ r = store->unlink_obj_instance(dpp, target->get_ctx(), target->get_bucket_info(), obj, params.olh_epoch, y, params.zones_trace);
+ if (r < 0) {
+ return r;
+ }
+ result.version_id = instance;
+ }
+
+ BucketShard *bs = nullptr;
+ int r = target->get_bucket_shard(&bs, dpp);
+ if (r < 0) {
+ ldpp_dout(dpp, 5) << "failed to get BucketShard object: r=" << r << dendl;
+ return r;
+ }
+
+ add_datalog_entry(dpp, store->svc.datalog_rados,
+ target->get_bucket_info(), bs->shard_id, y);
+
+ return 0;
+ }
+
+ rgw_rados_ref ref;
+ int r = store->get_obj_head_ref(dpp, target->get_bucket_info(), obj, &ref);
+ if (r < 0) {
+ return r;
+ }
+
+ RGWObjState *state;
+ RGWObjManifest *manifest = nullptr;
+ r = target->get_state(dpp, &state, &manifest, false, y);
+ if (r < 0)
+ return r;
+
+ ObjectWriteOperation op;
+
+ if (!real_clock::is_zero(params.unmod_since)) {
+ struct timespec ctime = ceph::real_clock::to_timespec(state->mtime);
+ struct timespec unmod = ceph::real_clock::to_timespec(params.unmod_since);
+ if (!params.high_precision_time) {
+ ctime.tv_nsec = 0;
+ unmod.tv_nsec = 0;
+ }
+
+ ldpp_dout(dpp, 10) << "If-UnModified-Since: " << params.unmod_since << " Last-Modified: " << ctime << dendl;
+ if (ctime > unmod) {
+ return -ERR_PRECONDITION_FAILED;
+ }
+
+ /* only delete object if mtime is less than or equal to params.unmod_since */
+ store->cls_obj_check_mtime(op, params.unmod_since, params.high_precision_time, CLS_RGW_CHECK_TIME_MTIME_LE);
+ }
+ uint64_t obj_accounted_size = state->accounted_size;
+
+  if (params.abortmp) {
+ obj_accounted_size = params.parts_accounted_size;
+ }
+
+ if (!real_clock::is_zero(params.expiration_time)) {
+ bufferlist bl;
+ real_time delete_at;
+
+ if (state->get_attr(RGW_ATTR_DELETE_AT, bl)) {
+ try {
+ auto iter = bl.cbegin();
+ decode(delete_at, iter);
+ } catch (buffer::error& err) {
+ ldpp_dout(dpp, 0) << "ERROR: couldn't decode RGW_ATTR_DELETE_AT" << dendl;
+ return -EIO;
+ }
+
+ if (params.expiration_time != delete_at) {
+ return -ERR_PRECONDITION_FAILED;
+ }
+ } else {
+ return -ERR_PRECONDITION_FAILED;
+ }
+ }
+
+ if (!state->exists) {
+ target->invalidate_state();
+ return -ENOENT;
+ }
+
+ r = target->prepare_atomic_modification(dpp, op, false, NULL, NULL, NULL, true, false, y);
+ if (r < 0)
+ return r;
+
+ RGWBucketInfo& bucket_info = target->get_bucket_info();
+
+ RGWRados::Bucket bop(store, bucket_info);
+ RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
+
+ index_op.set_zones_trace(params.zones_trace);
+ index_op.set_bilog_flags(params.bilog_flags);
+
+ r = index_op.prepare(dpp, CLS_RGW_OP_DEL, &state->write_tag, y);
+ if (r < 0)
+ return r;
+
+ store->remove_rgw_head_obj(op);
+
+ auto& ioctx = ref.pool.ioctx();
+ r = rgw_rados_operate(dpp, ioctx, ref.obj.oid, &op, y);
+
+ /* raced with another operation, object state is indeterminate */
+ const bool need_invalidate = (r == -ECANCELED);
+
+ int64_t poolid = ioctx.get_id();
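+  // On success, remember the deleted object in the tombstone cache and complete
+  // the bucket index transaction; on failure, cancel the pending index entry.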
+ if (r >= 0) {
+ tombstone_cache_t *obj_tombstone_cache = store->get_tombstone_cache();
+ if (obj_tombstone_cache) {
+ tombstone_entry entry{*state};
+ obj_tombstone_cache->add(obj, entry);
+ }
+ r = index_op.complete_del(dpp, poolid, ioctx.get_last_version(), state->mtime, params.remove_objs, y);
+
+ int ret = target->complete_atomic_modification(dpp);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: complete_atomic_modification returned ret=" << ret << dendl;
+ }
+ /* other than that, no need to propagate error */
+ } else {
+ int ret = index_op.cancel(dpp, params.remove_objs, y);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: index_op.cancel() returned ret=" << ret << dendl;
+ }
+ }
+
+ if (need_invalidate) {
+ target->invalidate_state();
+ }
+
+ if (r < 0)
+ return r;
+
+ /* update quota cache */
+ store->quota_handler->update_stats(params.bucket_owner, obj.bucket, -1, 0, obj_accounted_size);
+
+ return 0;
+}
+
+int RGWRados::delete_obj(const DoutPrefixProvider *dpp,
+ RGWObjectCtx& obj_ctx,
+ const RGWBucketInfo& bucket_info,
+ const rgw_obj& obj,
+ int versioning_status, // versioning flags defined in enum RGWBucketFlags
+ uint16_t bilog_flags,
+ const real_time& expiration_time,
+ rgw_zone_set *zones_trace)
+{
+ RGWRados::Object del_target(this, bucket_info, obj_ctx, obj);
+ RGWRados::Object::Delete del_op(&del_target);
+
+ del_op.params.bucket_owner = bucket_info.owner;
+ del_op.params.versioning_status = versioning_status;
+ del_op.params.bilog_flags = bilog_flags;
+ del_op.params.expiration_time = expiration_time;
+ del_op.params.zones_trace = zones_trace;
+
+ return del_op.delete_obj(null_yield, dpp);
+}
+
+int RGWRados::delete_raw_obj(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj)
+{
+ rgw_rados_ref ref;
+ int r = get_raw_obj_ref(dpp, obj, &ref);
+ if (r < 0) {
+ return r;
+ }
+
+ ObjectWriteOperation op;
+
+ op.remove();
+ r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
+ if (r < 0)
+ return r;
+
+ return 0;
+}
+
+int RGWRados::delete_obj_index(const rgw_obj& obj, ceph::real_time mtime,
+ const DoutPrefixProvider *dpp, optional_yield y)
+{
+ std::string oid, key;
+ get_obj_bucket_and_oid_loc(obj, oid, key);
+
+ RGWBucketInfo bucket_info;
+ int ret = get_bucket_instance_info(obj.bucket, bucket_info, NULL, NULL, null_yield, dpp);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: " << __func__ << "() get_bucket_instance_info(bucket=" << obj.bucket << ") returned ret=" << ret << dendl;
+ return ret;
+ }
+
+ RGWRados::Bucket bop(this, bucket_info);
+ RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
+
+ return index_op.complete_del(dpp, -1 /* pool */, 0, mtime, nullptr, y);
+}
+
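+// Synthesize a stand-in ID tag for an object whose RGW_ATTR_ID_TAG is missing:
+// it is derived from a tail object's oid (when present) plus an MD5 of the
+// manifest and etag, so the result is effectively unique for this object.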
+static void generate_fake_tag(const DoutPrefixProvider *dpp, RGWRados* store, map<string, bufferlist>& attrset, RGWObjManifest& manifest, bufferlist& manifest_bl, bufferlist& tag_bl)
+{
+ string tag;
+
+ RGWObjManifest::obj_iterator mi = manifest.obj_begin(dpp);
+ if (mi != manifest.obj_end(dpp)) {
+ if (manifest.has_tail()) // first object usually points at the head, let's skip to a more unique part
+ ++mi;
+ tag = mi.get_location().get_raw_obj(store).oid;
+ tag.append("_");
+ }
+
+ unsigned char md5[CEPH_CRYPTO_MD5_DIGESTSIZE];
+ char md5_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 1];
+ MD5 hash;
+ // Allow use of MD5 digest in FIPS mode for non-cryptographic purposes
+ hash.SetFlags(EVP_MD_CTX_FLAG_NON_FIPS_ALLOW);
+ hash.Update((const unsigned char *)manifest_bl.c_str(), manifest_bl.length());
+
+ map<string, bufferlist>::iterator iter = attrset.find(RGW_ATTR_ETAG);
+ if (iter != attrset.end()) {
+ bufferlist& bl = iter->second;
+ hash.Update((const unsigned char *)bl.c_str(), bl.length());
+ }
+
+ hash.Final(md5);
+ buf_to_hex(md5, CEPH_CRYPTO_MD5_DIGESTSIZE, md5_str);
+ tag.append(md5_str);
+
+ ldpp_dout(dpp, 10) << "generate_fake_tag new tag=" << tag << dendl;
+
+ tag_bl.append(tag.c_str(), tag.size() + 1);
+}
+
+static bool is_olh(map<string, bufferlist>& attrs)
+{
+ map<string, bufferlist>::iterator iter = attrs.find(RGW_ATTR_OLH_VER);
+ return (iter != attrs.end());
+}
+
+static bool has_olh_tag(map<string, bufferlist>& attrs)
+{
+ map<string, bufferlist>::iterator iter = attrs.find(RGW_ATTR_OLH_ID_TAG);
+ return (iter != attrs.end());
+}
+
+int RGWRados::get_olh_target_state(const DoutPrefixProvider *dpp, RGWObjectCtx&
+ obj_ctx, RGWBucketInfo& bucket_info,
+ const rgw_obj& obj, RGWObjState *olh_state,
+ RGWObjState **target_state,
+ RGWObjManifest **target_manifest, optional_yield y)
+{
+ ceph_assert(olh_state->is_olh);
+
+ rgw_obj target;
+ int r = RGWRados::follow_olh(dpp, bucket_info, obj_ctx, olh_state, obj, &target); /* might return -EAGAIN */
+ if (r < 0) {
+ return r;
+ }
+
+ r = get_obj_state(dpp, &obj_ctx, bucket_info, target, target_state,
+ target_manifest, false, y);
+ if (r < 0) {
+ return r;
+ }
+
+ return 0;
+}
+
+int RGWRados::get_obj_state_impl(const DoutPrefixProvider *dpp, RGWObjectCtx *rctx,
+ RGWBucketInfo& bucket_info, const rgw_obj& obj,
+ RGWObjState **state, RGWObjManifest** manifest,
+ bool follow_olh, optional_yield y, bool assume_noent)
+{
+ if (obj.empty()) {
+ return -EINVAL;
+ }
+
+ bool need_follow_olh = follow_olh && obj.key.instance.empty();
+ *manifest = nullptr;
+
+ RGWObjStateManifest *sm = rctx->get_state(obj);
+ RGWObjState *s = &(sm->state);
+ ldpp_dout(dpp, 20) << "get_obj_state: rctx=" << (void *)rctx << " obj=" << obj << " state=" << (void *)s << " s->prefetch_data=" << s->prefetch_data << dendl;
+ *state = s;
+ if (sm->manifest) {
+ *manifest = &(*sm->manifest);
+ }
+ if (s->has_attrs) {
+ if (s->is_olh && need_follow_olh) {
+ return get_olh_target_state(dpp, *rctx, bucket_info, obj, s, state, manifest, y);
+ }
+ return 0;
+ }
+
+ s->obj = obj;
+
+ rgw_raw_obj raw_obj;
+ obj_to_raw(bucket_info.placement_rule, obj, &raw_obj);
+
+ int r = -ENOENT;
+
+ if (!assume_noent) {
+ r = RGWRados::raw_obj_stat(dpp, raw_obj, &s->size, &s->mtime, &s->epoch, &s->attrset, (s->prefetch_data ? &s->data : NULL), NULL, y);
+ }
+
+ if (r == -ENOENT) {
+ s->exists = false;
+ s->has_attrs = true;
+ tombstone_entry entry;
+ if (obj_tombstone_cache && obj_tombstone_cache->find(obj, entry)) {
+ s->mtime = entry.mtime;
+ s->zone_short_id = entry.zone_short_id;
+ s->pg_ver = entry.pg_ver;
+ ldpp_dout(dpp, 20) << __func__ << "(): found obj in tombstone cache: obj=" << obj
+ << " mtime=" << s->mtime << " pgv=" << s->pg_ver << dendl;
+ } else {
+ s->mtime = real_time();
+ }
+ return 0;
+ }
+ if (r < 0)
+ return r;
+
+ s->exists = true;
+ s->has_attrs = true;
+ s->accounted_size = s->size;
+
+ auto iter = s->attrset.find(RGW_ATTR_ETAG);
+ if (iter != s->attrset.end()) {
+ /* get rid of extra null character at the end of the etag, as we used to store it like that */
+ bufferlist& bletag = iter->second;
+ if (bletag.length() > 0 && bletag[bletag.length() - 1] == '\0') {
+ bufferlist newbl;
+ bletag.splice(0, bletag.length() - 1, &newbl);
+ bletag = std::move(newbl);
+ }
+ }
+
+ iter = s->attrset.find(RGW_ATTR_COMPRESSION);
+ const bool compressed = (iter != s->attrset.end());
+ if (compressed) {
+ // use uncompressed size for accounted_size
+ try {
+ RGWCompressionInfo info;
+ auto p = iter->second.cbegin();
+ decode(info, p);
+ s->accounted_size = info.orig_size;
+ } catch (buffer::error&) {
+ ldpp_dout(dpp, 0) << "ERROR: could not decode compression info for object: " << obj << dendl;
+ return -EIO;
+ }
+ }
+
+ if (iter = s->attrset.find(RGW_ATTR_SHADOW_OBJ); iter != s->attrset.end()) {
+ const bufferlist& bl = iter->second;
+ auto it = bl.begin();
+ it.copy(bl.length(), s->shadow_obj);
+ s->shadow_obj[bl.length()] = '\0';
+ }
+ if (iter = s->attrset.find(RGW_ATTR_ID_TAG); iter != s->attrset.end()) {
+ s->obj_tag = iter->second;
+ }
+ if (iter = s->attrset.find(RGW_ATTR_TAIL_TAG); iter != s->attrset.end()) {
+ s->tail_tag = iter->second;
+ }
+
+ if (iter = s->attrset.find(RGW_ATTR_MANIFEST); iter != s->attrset.end()) {
+ bufferlist manifest_bl = iter->second;
+ auto miter = manifest_bl.cbegin();
+ try {
+ sm->manifest.emplace();
+ decode(*sm->manifest, miter);
+ sm->manifest->set_head(bucket_info.placement_rule, obj, s->size); /* patch manifest to reflect the head we just read, some manifests might be
+ broken due to old bugs */
+ s->size = sm->manifest->get_obj_size();
+ if (!compressed)
+ s->accounted_size = s->size;
+ } catch (buffer::error& err) {
+ ldpp_dout(dpp, 0) << "ERROR: couldn't decode manifest" << dendl;
+ return -EIO;
+ }
+ *manifest = &(*sm->manifest);
+ ldpp_dout(dpp, 10) << "manifest: total_size = " << sm->manifest->get_obj_size() << dendl;
+ if (cct->_conf->subsys.should_gather<ceph_subsys_rgw, 20>() && \
+ sm->manifest->has_explicit_objs()) {
+ RGWObjManifest::obj_iterator mi;
+ for (mi = sm->manifest->obj_begin(dpp); mi != sm->manifest->obj_end(dpp); ++mi) {
+ ldpp_dout(dpp, 20) << "manifest: ofs=" << mi.get_ofs() << " loc=" << mi.get_location().get_raw_obj(this) << dendl;
+ }
+ }
+
+ if (!s->obj_tag.length()) {
+ /*
+     * Uh oh, something's wrong: an object with a manifest should have a tag.
+     * Let's create one out of the manifest; it should be unique enough.
+ */
+ generate_fake_tag(dpp, this, s->attrset, *sm->manifest, manifest_bl, s->obj_tag);
+ s->fake_tag = true;
+ }
+ }
+ if (iter = s->attrset.find(RGW_ATTR_PG_VER); iter != s->attrset.end()) {
+ const bufferlist& pg_ver_bl = iter->second;
+ if (pg_ver_bl.length()) {
+ auto pgbl = pg_ver_bl.cbegin();
+ try {
+ decode(s->pg_ver, pgbl);
+ } catch (buffer::error& err) {
+ ldpp_dout(dpp, 0) << "ERROR: couldn't decode pg ver attr for object " << s->obj << ", non-critical error, ignoring" << dendl;
+ }
+ }
+ }
+ if (iter = s->attrset.find(RGW_ATTR_SOURCE_ZONE); iter != s->attrset.end()) {
+ const bufferlist& zone_short_id_bl = iter->second;
+ if (zone_short_id_bl.length()) {
+ auto zbl = zone_short_id_bl.cbegin();
+ try {
+ decode(s->zone_short_id, zbl);
+ } catch (buffer::error& err) {
+ ldpp_dout(dpp, 0) << "ERROR: couldn't decode zone short id attr for object " << s->obj << ", non-critical error, ignoring" << dendl;
+ }
+ }
+ }
+ if (s->obj_tag.length()) {
+ ldpp_dout(dpp, 20) << "get_obj_state: setting s->obj_tag to " << s->obj_tag.c_str() << dendl;
+ } else {
+ ldpp_dout(dpp, 20) << "get_obj_state: s->obj_tag was set empty" << dendl;
+ }
+
+  /* an object might not be an olh yet, but could already carry an olh id tag, so we should set
+   * it whenever it exists, and not only when is_olh() returns true
+ */
+ if (iter = s->attrset.find(RGW_ATTR_OLH_ID_TAG); iter != s->attrset.end()) {
+ s->olh_tag = iter->second;
+ }
+
+ if (is_olh(s->attrset)) {
+ s->is_olh = true;
+
+ ldpp_dout(dpp, 20) << __func__ << ": setting s->olh_tag to " << string(s->olh_tag.c_str(), s->olh_tag.length()) << dendl;
+
+ if (need_follow_olh) {
+ return get_olh_target_state(dpp, *rctx, bucket_info, obj, s, state, manifest, y);
+ } else if (obj.key.have_null_instance() && !sm->manifest) {
+ // read null version, and the head object only have olh info
+ s->exists = false;
+ return -ENOENT;
+ }
+ }
+
+ return 0;
+}
+
+int RGWRados::get_obj_state(const DoutPrefixProvider *dpp, RGWObjectCtx *rctx, RGWBucketInfo& bucket_info, const rgw_obj& obj, RGWObjState **state, RGWObjManifest** manifest,
+ bool follow_olh, optional_yield y, bool assume_noent)
+{
+ int ret;
+
+ do {
+ ret = get_obj_state_impl(dpp, rctx, bucket_info, obj, state, manifest, follow_olh, y, assume_noent);
+ } while (ret == -EAGAIN);
+
+ return ret;
+}
+
+int RGWRados::Object::get_manifest(const DoutPrefixProvider *dpp, RGWObjManifest **pmanifest, optional_yield y)
+{
+ RGWObjState *astate;
+ int r = get_state(dpp, &astate, pmanifest, true, y);
+ if (r < 0) {
+ return r;
+ }
+
+ return 0;
+}
+
+int RGWRados::Object::Read::get_attr(const DoutPrefixProvider *dpp, const char *name, bufferlist& dest, optional_yield y)
+{
+ RGWObjState *state;
+ RGWObjManifest *manifest = nullptr;
+ int r = source->get_state(dpp, &state, &manifest, true, y);
+ if (r < 0)
+ return r;
+ if (!state->exists)
+ return -ENOENT;
+ if (!state->get_attr(name, dest))
+ return -ENODATA;
+
+ return 0;
+}
+
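+// Start an asynchronous stat of the head object (size, mtime, xattrs), serving
+// the result from the cached object state when it is already populated.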
+int RGWRados::Object::Stat::stat_async(const DoutPrefixProvider *dpp)
+{
+ RGWObjectCtx& ctx = source->get_ctx();
+ rgw_obj& obj = source->get_obj();
+ RGWRados *store = source->get_store();
+
+ RGWObjStateManifest *sm = ctx.get_state(obj);
+ result.obj = obj;
+ if (sm->state.has_attrs) {
+ state.ret = 0;
+ result.size = sm->state.size;
+ result.mtime = ceph::real_clock::to_timespec(sm->state.mtime);
+ result.attrs = sm->state.attrset;
+ result.manifest = sm->manifest;
+ return 0;
+ }
+
+ string oid;
+ string loc;
+ get_obj_bucket_and_oid_loc(obj, oid, loc);
+
+ int r = store->get_obj_head_ioctx(dpp, source->get_bucket_info(), obj, &state.io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::ObjectReadOperation op;
+ op.stat2(&result.size, &result.mtime, NULL);
+ op.getxattrs(&result.attrs, NULL);
+ state.completion = librados::Rados::aio_create_completion(nullptr, nullptr);
+ state.io_ctx.locator_set_key(loc);
+ r = state.io_ctx.aio_operate(oid, state.completion, &op, NULL);
+ if (r < 0) {
+ ldpp_dout(dpp, 5) << __func__
+ << ": ERROR: aio_operate() returned ret=" << r
+ << dendl;
+ return r;
+ }
+
+ return 0;
+}
+
+
+int RGWRados::Object::Stat::wait(const DoutPrefixProvider *dpp)
+{
+ if (!state.completion) {
+ return state.ret;
+ }
+
+ state.completion->wait_for_complete();
+ state.ret = state.completion->get_return_value();
+ state.completion->release();
+
+ if (state.ret != 0) {
+ return state.ret;
+ }
+
+ return finish(dpp);
+}
+
+int RGWRados::Object::Stat::finish(const DoutPrefixProvider *dpp)
+{
+ map<string, bufferlist>::iterator iter = result.attrs.find(RGW_ATTR_MANIFEST);
+ if (iter != result.attrs.end()) {
+ bufferlist& bl = iter->second;
+ auto biter = bl.cbegin();
+ try {
+ result.manifest.emplace();
+ decode(*result.manifest, biter);
+ } catch (buffer::error& err) {
+ ldpp_dout(dpp, 0) << "ERROR: " << __func__ << ": failed to decode manifest" << dendl;
+ return -EIO;
+ }
+ }
+
+ return 0;
+}
+
+int RGWRados::append_atomic_test(const DoutPrefixProvider *dpp, RGWObjectCtx* rctx,
+ RGWBucketInfo& bucket_info, const rgw_obj& obj,
+ ObjectOperation& op, RGWObjState **pstate,
+ RGWObjManifest** pmanifest, optional_yield y)
+{
+ if (!rctx)
+ return 0;
+
+ int r = get_obj_state(dpp, rctx, bucket_info, obj, pstate, pmanifest, false, y);
+ if (r < 0)
+ return r;
+
+ return append_atomic_test(dpp, *pstate, op);
+}
+
+int RGWRados::append_atomic_test(const DoutPrefixProvider *dpp,
+ const RGWObjState* state,
+ librados::ObjectOperation& op)
+{
+ if (!state->is_atomic) {
+ ldpp_dout(dpp, 20) << "state for obj=" << state->obj << " is not atomic, not appending atomic test" << dendl;
+ return 0;
+ }
+
+ if (state->obj_tag.length() > 0 && !state->fake_tag) {// check for backward compatibility
+ op.cmpxattr(RGW_ATTR_ID_TAG, LIBRADOS_CMPXATTR_OP_EQ, state->obj_tag);
+ } else {
+ ldpp_dout(dpp, 20) << "state->obj_tag is empty, not appending atomic test" << dendl;
+ }
+ return 0;
+}
+
+int RGWRados::Object::get_state(const DoutPrefixProvider *dpp, RGWObjState **pstate, RGWObjManifest **pmanifest, bool follow_olh, optional_yield y, bool assume_noent)
+{
+ return store->get_obj_state(dpp, &ctx, bucket_info, obj, pstate, pmanifest, follow_olh, y, assume_noent);
+}
+
+void RGWRados::Object::invalidate_state()
+{
+ ctx.invalidate(obj);
+}
+
+int RGWRados::Object::prepare_atomic_modification(const DoutPrefixProvider *dpp,
+ ObjectWriteOperation& op, bool reset_obj, const string *ptag,
+ const char *if_match, const char *if_nomatch, bool removal_op,
+ bool modify_tail, optional_yield y)
+{
+ int r = get_state(dpp, &state, &manifest, false, y);
+ if (r < 0)
+ return r;
+
+ bool need_guard = ((manifest) || (state->obj_tag.length() != 0) ||
+ if_match != NULL || if_nomatch != NULL) &&
+ (!state->fake_tag);
+
+ if (!state->is_atomic) {
+ ldpp_dout(dpp, 20) << "prepare_atomic_modification: state is not atomic. state=" << (void *)state << dendl;
+
+ if (reset_obj) {
+ op.create(false);
+ store->remove_rgw_head_obj(op); // we're not dropping reference here, actually removing object
+ }
+
+ return 0;
+ }
+
+ if (need_guard) {
+    /* first verify that the object wasn't replaced under us */
+ if (if_nomatch == NULL || strcmp(if_nomatch, "*") != 0) {
+ op.cmpxattr(RGW_ATTR_ID_TAG, LIBRADOS_CMPXATTR_OP_EQ, state->obj_tag);
+ // FIXME: need to add FAIL_NOTEXIST_OK for racing deletion
+ }
+
+ if (if_match) {
+ if (strcmp(if_match, "*") == 0) {
+        // test that the object exists
+ if (!state->exists) {
+ return -ERR_PRECONDITION_FAILED;
+ }
+ } else {
+ bufferlist bl;
+ if (!state->get_attr(RGW_ATTR_ETAG, bl) ||
+ strncmp(if_match, bl.c_str(), bl.length()) != 0) {
+ return -ERR_PRECONDITION_FAILED;
+ }
+ }
+ }
+
+ if (if_nomatch) {
+ if (strcmp(if_nomatch, "*") == 0) {
+        // test that the object does NOT exist
+ if (state->exists) {
+ return -ERR_PRECONDITION_FAILED;
+ }
+ } else {
+ bufferlist bl;
+ if (!state->get_attr(RGW_ATTR_ETAG, bl) ||
+ strncmp(if_nomatch, bl.c_str(), bl.length()) == 0) {
+ return -ERR_PRECONDITION_FAILED;
+ }
+ }
+ }
+ }
+
+ if (reset_obj) {
+ if (state->exists) {
+ op.create(false);
+ store->remove_rgw_head_obj(op);
+ } else {
+ op.create(true);
+ }
+ }
+
+ if (removal_op) {
+ /* the object is being removed, no need to update its tag */
+ return 0;
+ }
+
+ if (ptag) {
+ state->write_tag = *ptag;
+ } else {
+ append_rand_alpha(store->ctx(), state->write_tag, state->write_tag, 32);
+ }
+ bufferlist bl;
+ bl.append(state->write_tag.c_str(), state->write_tag.size() + 1);
+
+ ldpp_dout(dpp, 10) << "setting object write_tag=" << state->write_tag << dendl;
+
+ op.setxattr(RGW_ATTR_ID_TAG, bl);
+ if (modify_tail) {
+ op.setxattr(RGW_ATTR_TAIL_TAG, bl);
+ }
+
+ return 0;
+}
+
+/**
+ * Set an attr on an object.
+ * bucket: name of the bucket holding the object
+ * obj: name of the object to set the attr on
+ * name: the attr to set
+ * bl: the contents of the attr
+ * Returns: 0 on success, -ERR# otherwise.
+ */
+int RGWRados::set_attr(const DoutPrefixProvider *dpp, RGWObjectCtx* rctx, RGWBucketInfo& bucket_info, const rgw_obj& obj, const char *name, bufferlist& bl)
+{
+ map<string, bufferlist> attrs;
+ attrs[name] = bl;
+ return set_attrs(dpp, rctx, bucket_info, obj, attrs, NULL, null_yield);
+}
+
+int RGWRados::set_attrs(const DoutPrefixProvider *dpp, RGWObjectCtx* rctx, RGWBucketInfo& bucket_info, const rgw_obj& src_obj,
+ map<string, bufferlist>& attrs,
+ map<string, bufferlist>* rmattrs,
+ optional_yield y,
+ ceph::real_time set_mtime /* = zero() */)
+{
+ rgw_obj obj = src_obj;
+ if (obj.key.instance == "null") {
+ obj.key.instance.clear();
+ }
+
+ rgw_rados_ref ref;
+ int r = get_obj_head_ref(dpp, bucket_info, obj, &ref);
+ if (r < 0) {
+ return r;
+ }
+
+ ObjectWriteOperation op;
+ RGWObjState *state = NULL;
+ RGWObjManifest *manifest = nullptr;
+
+ r = append_atomic_test(dpp, rctx, bucket_info, obj, op, &state, &manifest, y);
+ if (r < 0)
+ return r;
+
+  // ensure the null version object exists
+ if (src_obj.key.instance == "null" && !manifest) {
+ return -ENOENT;
+ }
+
+ map<string, bufferlist>::iterator iter;
+ if (rmattrs) {
+ for (iter = rmattrs->begin(); iter != rmattrs->end(); ++iter) {
+ const string& name = iter->first;
+ op.rmxattr(name.c_str());
+ }
+ }
+
+ const rgw_bucket& bucket = obj.bucket;
+
+ for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
+ const string& name = iter->first;
+ bufferlist& bl = iter->second;
+
+ if (!bl.length())
+ continue;
+
+ op.setxattr(name.c_str(), bl);
+
+ if (name.compare(RGW_ATTR_DELETE_AT) == 0) {
+ real_time ts;
+ try {
+ decode(ts, bl);
+
+ rgw_obj_index_key obj_key;
+ obj.key.get_index_key(&obj_key);
+
+ obj_expirer->hint_add(dpp, ts, bucket.tenant, bucket.name, bucket.bucket_id, obj_key);
+ } catch (buffer::error& err) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to decode " RGW_ATTR_DELETE_AT << " attr" << dendl;
+ }
+ }
+ }
+
+ if (!op.size())
+ return 0;
+
+ bufferlist bl;
+ RGWRados::Bucket bop(this, bucket_info);
+ RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
+
+ if (state) {
+ string tag;
+ append_rand_alpha(cct, tag, tag, 32);
+ state->write_tag = tag;
+ r = index_op.prepare(dpp, CLS_RGW_OP_ADD, &state->write_tag, y);
+
+ if (r < 0)
+ return r;
+
+ bl.append(tag.c_str(), tag.size() + 1);
+ op.setxattr(RGW_ATTR_ID_TAG, bl);
+ }
+
+
+ /* As per https://docs.aws.amazon.com/AmazonS3/latest/userguide/UsingMetadata.html,
+ * the only way for users to modify object metadata is to make a copy of the object and
+ * set the metadata.
+ * Hence do not update mtime for any other attr changes */
+ real_time mtime = state->mtime;
+ if (set_mtime != ceph::real_clock::zero()) {
+ mtime = set_mtime;
+ }
+ struct timespec mtime_ts = real_clock::to_timespec(mtime);
+ op.mtime2(&mtime_ts);
+ auto& ioctx = ref.pool.ioctx();
+ r = rgw_rados_operate(dpp, ioctx, ref.obj.oid, &op, null_yield);
+ if (state) {
+ if (r >= 0) {
+ bufferlist acl_bl;
+ if (iter = attrs.find(RGW_ATTR_ACL); iter != attrs.end()) {
+ acl_bl = iter->second;
+ }
+ std::string etag;
+ if (iter = attrs.find(RGW_ATTR_ETAG); iter != attrs.end()) {
+ etag = rgw_bl_str(iter->second);
+ }
+ std::string content_type;
+ if (iter = attrs.find(RGW_ATTR_CONTENT_TYPE); iter != attrs.end()) {
+ content_type = rgw_bl_str(iter->second);
+ }
+ string storage_class;
+ if (iter = attrs.find(RGW_ATTR_STORAGE_CLASS); iter != attrs.end()) {
+ storage_class = rgw_bl_str(iter->second);
+ }
+ uint64_t epoch = ioctx.get_last_version();
+ int64_t poolid = ioctx.get_id();
+ r = index_op.complete(dpp, poolid, epoch, state->size, state->accounted_size,
+ mtime, etag, content_type, storage_class, &acl_bl,
+ RGWObjCategory::Main, nullptr, y);
+ } else {
+ int ret = index_op.cancel(dpp, nullptr, y);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: complete_update_index_cancel() returned ret=" << ret << dendl;
+ }
+ }
+ }
+ if (r < 0)
+ return r;
+
+ if (state) {
+ state->obj_tag.swap(bl);
+ if (rmattrs) {
+ for (iter = rmattrs->begin(); iter != rmattrs->end(); ++iter) {
+ state->attrset.erase(iter->first);
+ }
+ }
+
+ for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
+ state->attrset[iter->first] = iter->second;
+ }
+
+ auto iter = state->attrset.find(RGW_ATTR_ID_TAG);
+ if (iter != state->attrset.end()) {
+ iter->second = state->obj_tag;
+ }
+
+ state->mtime = mtime;
+ }
+
+ return 0;
+}
+
+int RGWRados::Object::Read::prepare(optional_yield y, const DoutPrefixProvider *dpp)
+{
+ RGWRados *store = source->get_store();
+ CephContext *cct = store->ctx();
+
+ bufferlist etag;
+
+ map<string, bufferlist>::iterator iter;
+
+ RGWObjState *astate;
+ RGWObjManifest *manifest = nullptr;
+ int r = source->get_state(dpp, &astate, &manifest, true, y);
+ if (r < 0)
+ return r;
+
+ if (!astate->exists) {
+ return -ENOENT;
+ }
+
+ const RGWBucketInfo& bucket_info = source->get_bucket_info();
+
+ state.obj = astate->obj;
+ store->obj_to_raw(bucket_info.placement_rule, state.obj, &state.head_obj);
+
+ state.cur_pool = state.head_obj.pool;
+ state.cur_ioctx = &state.io_ctxs[state.cur_pool];
+
+ r = store->get_obj_head_ioctx(dpp, bucket_info, state.obj, state.cur_ioctx);
+ if (r < 0) {
+ return r;
+ }
+ if (params.target_obj) {
+ *params.target_obj = state.obj;
+ }
+ if (params.attrs) {
+ *params.attrs = astate->attrset;
+ if (cct->_conf->subsys.should_gather<ceph_subsys_rgw, 20>()) {
+ for (iter = params.attrs->begin(); iter != params.attrs->end(); ++iter) {
+ ldpp_dout(dpp, 20) << "Read xattr rgw_rados: " << iter->first << dendl;
+ }
+ }
+ }
+
+  /* Convert all times to GMT to make them comparable */
+ if (conds.mod_ptr || conds.unmod_ptr) {
+ obj_time_weight src_weight;
+ src_weight.init(astate);
+ src_weight.high_precision = conds.high_precision_time;
+
+ obj_time_weight dest_weight;
+ dest_weight.high_precision = conds.high_precision_time;
+
+ if (conds.mod_ptr && !conds.if_nomatch) {
+ dest_weight.init(*conds.mod_ptr, conds.mod_zone_id, conds.mod_pg_ver);
+ ldpp_dout(dpp, 10) << "If-Modified-Since: " << dest_weight << " Last-Modified: " << src_weight << dendl;
+ if (!(dest_weight < src_weight)) {
+ return -ERR_NOT_MODIFIED;
+ }
+ }
+
+ if (conds.unmod_ptr && !conds.if_match) {
+ dest_weight.init(*conds.unmod_ptr, conds.mod_zone_id, conds.mod_pg_ver);
+ ldpp_dout(dpp, 10) << "If-UnModified-Since: " << dest_weight << " Last-Modified: " << src_weight << dendl;
+ if (dest_weight < src_weight) {
+ return -ERR_PRECONDITION_FAILED;
+ }
+ }
+ }
+ if (conds.if_match || conds.if_nomatch) {
+ r = get_attr(dpp, RGW_ATTR_ETAG, etag, y);
+ if (r < 0)
+ return r;
+
+ if (conds.if_match) {
+ string if_match_str = rgw_string_unquote(conds.if_match);
+ ldpp_dout(dpp, 10) << "ETag: " << string(etag.c_str(), etag.length()) << " " << " If-Match: " << if_match_str << dendl;
+ if (if_match_str.compare(0, etag.length(), etag.c_str(), etag.length()) != 0) {
+ return -ERR_PRECONDITION_FAILED;
+ }
+ }
+
+ if (conds.if_nomatch) {
+ string if_nomatch_str = rgw_string_unquote(conds.if_nomatch);
+ ldpp_dout(dpp, 10) << "ETag: " << string(etag.c_str(), etag.length()) << " " << " If-NoMatch: " << if_nomatch_str << dendl;
+ if (if_nomatch_str.compare(0, etag.length(), etag.c_str(), etag.length()) == 0) {
+ return -ERR_NOT_MODIFIED;
+ }
+ }
+ }
+
+ if (params.obj_size)
+ *params.obj_size = astate->size;
+ if (params.lastmod)
+ *params.lastmod = astate->mtime;
+
+ return 0;
+}
+
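+// Normalize a byte range against the object size: a negative `ofs` means a
+// suffix range (the last -ofs bytes), a negative `end` means read to the end,
+// and both are clamped to the object's last byte. Returns -ERANGE when the
+// start offset lies beyond the end of a non-empty object.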
+int RGWRados::Object::Read::range_to_ofs(uint64_t obj_size, int64_t &ofs, int64_t &end)
+{
+ if (ofs < 0) {
+ ofs += obj_size;
+ if (ofs < 0)
+ ofs = 0;
+ end = obj_size - 1;
+ } else if (end < 0) {
+ end = obj_size - 1;
+ }
+
+ if (obj_size > 0) {
+ if (ofs >= (off_t)obj_size) {
+ return -ERANGE;
+ }
+ if (end >= (off_t)obj_size) {
+ end = obj_size - 1;
+ }
+ }
+ return 0;
+}
+
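+// Run a bucket-index operation, retrying while the index is being resharded:
+// on -ERR_BUSY_RESHARDING wait for the reshard to finish, refresh the bucket
+// shard, and retry the callback.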
+int RGWRados::Bucket::UpdateIndex::guard_reshard(const DoutPrefixProvider *dpp, const rgw_obj& obj_instance, BucketShard **pbs, std::function<int(BucketShard *)> call)
+{
+ RGWRados *store = target->get_store();
+ BucketShard *bs = nullptr;
+ int r;
+
+#define NUM_RESHARD_RETRIES 10
+ for (int i = 0; i < NUM_RESHARD_RETRIES; ++i) {
+ int ret = get_bucket_shard(&bs, dpp);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to get BucketShard object. obj=" <<
+ obj_instance.key << ". ret=" << ret << dendl;
+ return ret;
+ }
+
+ r = call(bs);
+ if (r != -ERR_BUSY_RESHARDING) {
+ break;
+ }
+
+ ldpp_dout(dpp, 10) <<
+ "NOTICE: resharding operation on bucket index detected, blocking. obj=" <<
+ obj_instance.key << dendl;
+
+ r = store->block_while_resharding(bs, obj_instance, target->bucket_info, null_yield, dpp);
+ if (r == -ERR_BUSY_RESHARDING) {
+ ldpp_dout(dpp, 10) << __func__ <<
+ " NOTICE: block_while_resharding() still busy. obj=" <<
+ obj_instance.key << dendl;
+ continue;
+ } else if (r < 0) {
+ ldpp_dout(dpp, 0) << __func__ <<
+ " ERROR: block_while_resharding() failed. obj=" <<
+ obj_instance.key << ". ret=" << cpp_strerror(-r) << dendl;
+ return r;
+ }
+
+ ldpp_dout(dpp, 20) << "reshard completion identified. obj=" << obj_instance.key << dendl;
+ i = 0; /* resharding is finished, make sure we can retry */
+ invalidate_bs();
+ } // for loop
+
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: bucket shard callback failed. obj=" <<
+ obj_instance.key << ". ret=" << cpp_strerror(-r) << dendl;
+ return r;
+ }
+
+ if (pbs) {
+ *pbs = bs;
+ }
+
+ return 0;
+}
+
+int RGWRados::Bucket::UpdateIndex::prepare(const DoutPrefixProvider *dpp, RGWModifyOp op, const string *write_tag, optional_yield y)
+{
+ if (blind) {
+ return 0;
+ }
+ RGWRados *store = target->get_store();
+
+ if (write_tag && write_tag->length()) {
+ optag = string(write_tag->c_str(), write_tag->length());
+ } else {
+ if (optag.empty()) {
+ append_rand_alpha(store->ctx(), optag, optag, 32);
+ }
+ }
+
+ int r = guard_reshard(dpp, obj, nullptr, [&](BucketShard *bs) -> int {
+ return store->cls_obj_prepare_op(dpp, *bs, op, optag, obj, bilog_flags, y, zones_trace);
+ });
+
+ if (r < 0) {
+ return r;
+ }
+ prepared = true;
+
+ return 0;
+}
+
+int RGWRados::Bucket::UpdateIndex::complete(const DoutPrefixProvider *dpp, int64_t poolid, uint64_t epoch,
+ uint64_t size, uint64_t accounted_size,
+ ceph::real_time& ut, const string& etag,
+ const string& content_type, const string& storage_class,
+ bufferlist *acl_bl,
+ RGWObjCategory category,
+ list<rgw_obj_index_key> *remove_objs,
+ optional_yield y,
+ const string *user_data,
+ bool appendable)
+{
+ if (blind) {
+ return 0;
+ }
+ RGWRados *store = target->get_store();
+ BucketShard *bs = nullptr;
+
+ int ret = get_bucket_shard(&bs, dpp);
+ if (ret < 0) {
+ ldpp_dout(dpp, 5) << "failed to get BucketShard object: ret=" << ret << dendl;
+ return ret;
+ }
+
+ rgw_bucket_dir_entry ent;
+ obj.key.get_index_key(&ent.key);
+ ent.meta.size = size;
+ ent.meta.accounted_size = accounted_size;
+ ent.meta.mtime = ut;
+ ent.meta.etag = etag;
+ ent.meta.storage_class = storage_class;
+ if (user_data)
+ ent.meta.user_data = *user_data;
+
+ ACLOwner owner;
+ if (acl_bl && acl_bl->length()) {
+ int ret = store->decode_policy(dpp, *acl_bl, &owner);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "WARNING: could not decode policy ret=" << ret << dendl;
+ }
+ }
+ ent.meta.owner = owner.get_id().to_str();
+ ent.meta.owner_display_name = owner.get_display_name();
+ ent.meta.content_type = content_type;
+ ent.meta.appendable = appendable;
+
+ ret = store->cls_obj_complete_add(*bs, obj, optag, poolid, epoch, ent, category, remove_objs, bilog_flags, zones_trace);
+
+ add_datalog_entry(dpp, store->svc.datalog_rados,
+ target->bucket_info, bs->shard_id, y);
+
+ return ret;
+}
+
+int RGWRados::Bucket::UpdateIndex::complete_del(const DoutPrefixProvider *dpp,
+ int64_t poolid, uint64_t epoch,
+ real_time& removed_mtime,
+ list<rgw_obj_index_key> *remove_objs,
+ optional_yield y)
+{
+ if (blind) {
+ return 0;
+ }
+ RGWRados *store = target->get_store();
+ BucketShard *bs = nullptr;
+
+ int ret = get_bucket_shard(&bs, dpp);
+ if (ret < 0) {
+ ldpp_dout(dpp, 5) << "failed to get BucketShard object: ret=" << ret << dendl;
+ return ret;
+ }
+
+ ret = store->cls_obj_complete_del(*bs, optag, poolid, epoch, obj, removed_mtime, remove_objs, bilog_flags, zones_trace);
+
+ add_datalog_entry(dpp, store->svc.datalog_rados,
+ target->bucket_info, bs->shard_id, y);
+
+ return ret;
+}
+
+
+int RGWRados::Bucket::UpdateIndex::cancel(const DoutPrefixProvider *dpp,
+ list<rgw_obj_index_key> *remove_objs,
+ optional_yield y)
+{
+ if (blind) {
+ return 0;
+ }
+ RGWRados *store = target->get_store();
+ BucketShard *bs;
+
+ int ret = guard_reshard(dpp, obj, &bs, [&](BucketShard *bs) -> int {
+ return store->cls_obj_complete_cancel(*bs, optag, obj, remove_objs, bilog_flags, zones_trace);
+ });
+
+ /*
+   * we need to update the data log regardless, so that whoever follows can advance its internal
+   * markers for this specific bucket shard's log. Otherwise followers end up staying behind, and
+   * users have no way to tell that they're fully caught up
+ */
+ add_datalog_entry(dpp, store->svc.datalog_rados,
+ target->bucket_info, bs->shard_id, y);
+
+ return ret;
+}
+
+/*
+ * Read up through index `end` inclusive. Number of bytes read is up
+ * to `end - ofs + 1`.
+ */
+int RGWRados::Object::Read::read(int64_t ofs, int64_t end,
+ bufferlist& bl, optional_yield y,
+ const DoutPrefixProvider *dpp)
+{
+ RGWRados *store = source->get_store();
+
+ rgw_raw_obj read_obj;
+ uint64_t read_ofs = ofs;
+ uint64_t len, read_len;
+ bool reading_from_head = true;
+ ObjectReadOperation op;
+
+ bool merge_bl = false;
+ bufferlist *pbl = &bl;
+ bufferlist read_bl;
+ uint64_t max_chunk_size;
+
+ RGWObjState *astate;
+ RGWObjManifest *manifest = nullptr;
+ int r = source->get_state(dpp, &astate, &manifest, true, y);
+ if (r < 0)
+ return r;
+
+ if (astate->size == 0) {
+ end = 0;
+ } else if (end >= (int64_t)astate->size) {
+ end = astate->size - 1;
+ }
+
+ if (end < 0)
+ len = 0;
+ else
+ len = end - ofs + 1;
+
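+  // When the manifest has tail stripes, map the logical offset onto the stripe
+  // that holds it and read from that raw object; otherwise read from the head.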
+ if (manifest && manifest->has_tail()) {
+ /* now get the relevant object part */
+ RGWObjManifest::obj_iterator iter = manifest->obj_find(dpp, ofs);
+
+ uint64_t stripe_ofs = iter.get_stripe_ofs();
+ read_obj = iter.get_location().get_raw_obj(store);
+ len = std::min(len, iter.get_stripe_size() - (ofs - stripe_ofs));
+ read_ofs = iter.location_ofs() + (ofs - stripe_ofs);
+ reading_from_head = (read_obj == state.head_obj);
+ } else {
+ read_obj = state.head_obj;
+ }
+
+ r = store->get_max_chunk_size(read_obj.pool, &max_chunk_size, dpp);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to get max_chunk_size() for pool " << read_obj.pool << dendl;
+ return r;
+ }
+
+ if (len > max_chunk_size)
+ len = max_chunk_size;
+
+
+ read_len = len;
+
+ if (reading_from_head) {
+ /* only when reading from the head object do we need to do the atomic test */
+ r = store->append_atomic_test(dpp, &source->get_ctx(), source->get_bucket_info(), state.obj, op, &astate, &manifest, y);
+ if (r < 0)
+ return r;
+
+ if (astate && astate->prefetch_data) {
+ if (!ofs && astate->data.length() >= len) {
+ bl = astate->data;
+ return bl.length();
+ }
+
+ if (ofs < astate->data.length()) {
+ unsigned copy_len = std::min((uint64_t)astate->data.length() - ofs, len);
+ astate->data.begin(ofs).copy(copy_len, bl);
+ read_len -= copy_len;
+ read_ofs += copy_len;
+ if (!read_len)
+ return bl.length();
+
+ merge_bl = true;
+ pbl = &read_bl;
+ }
+ }
+ }
+
+ ldpp_dout(dpp, 20) << "rados->read obj-ofs=" << ofs << " read_ofs=" << read_ofs << " read_len=" << read_len << dendl;
+ op.read(read_ofs, read_len, pbl, NULL);
+
+ if (state.cur_pool != read_obj.pool) {
+ auto iter = state.io_ctxs.find(read_obj.pool);
+ if (iter == state.io_ctxs.end()) {
+ state.cur_ioctx = &state.io_ctxs[read_obj.pool];
+ r = store->open_pool_ctx(dpp, read_obj.pool, *state.cur_ioctx, false, true);
+ if (r < 0) {
+ ldpp_dout(dpp, 20) << "ERROR: failed to open pool context for pool=" << read_obj.pool << " r=" << r << dendl;
+ return r;
+ }
+ } else {
+ state.cur_ioctx = &iter->second;
+ }
+ state.cur_pool = read_obj.pool;
+ }
+
+ state.cur_ioctx->locator_set_key(read_obj.loc);
+
+ r = state.cur_ioctx->operate(read_obj.oid, &op, NULL);
+ ldpp_dout(dpp, 20) << "rados->read r=" << r << " bl.length=" << bl.length() << dendl;
+
+ if (r < 0) {
+ return r;
+ }
+
+ if (merge_bl) {
+ bl.append(read_bl);
+ }
+
+ return bl.length();
+}
+
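+// Deliver completed AIO reads to the client callback strictly in offset order,
+// buffering any out-of-order completions until the preceding ranges arrive.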
+int get_obj_data::flush(rgw::AioResultList&& results) {
+ int r = rgw::check_for_errors(results);
+ if (r < 0) {
+ return r;
+ }
+ std::list<bufferlist> bl_list;
+
+ auto cmp = [](const auto& lhs, const auto& rhs) { return lhs.id < rhs.id; };
+ results.sort(cmp); // merge() requires results to be sorted first
+ completed.merge(results, cmp); // merge results in sorted order
+
+ while (!completed.empty() && completed.front().id == offset) {
+ auto bl = std::move(completed.front().data);
+
+ bl_list.push_back(bl);
+ offset += bl.length();
+ int r = client_cb->handle_data(bl, 0, bl.length());
+ if (r < 0) {
+ return r;
+ }
+
+ if (rgwrados->get_use_datacache()) {
+ const std::lock_guard l(d3n_get_data.d3n_lock);
+ auto oid = completed.front().obj.get_ref().obj.oid;
+ if (bl.length() <= g_conf()->rgw_get_obj_max_req_size && !d3n_bypass_cache_write) {
+ lsubdout(g_ceph_context, rgw_datacache, 10) << "D3nDataCache: " << __func__ << "(): bl.length <= rgw_get_obj_max_req_size (default 4MB) - write to datacache, bl.length=" << bl.length() << dendl;
+ rgwrados->d3n_data_cache->put(bl, bl.length(), oid);
+ } else {
+ lsubdout(g_ceph_context, rgw_datacache, 10) << "D3nDataCache: " << __func__ << "(): not writing to datacache - bl.length > rgw_get_obj_max_req_size (default 4MB), bl.length=" << bl.length() << " or d3n_bypass_cache_write=" << d3n_bypass_cache_write << dendl;
+ }
+ }
+ completed.pop_front_and_dispose(std::default_delete<rgw::AioResultEntry>{});
+ }
+ return 0;
+}
+
+static int _get_obj_iterate_cb(const DoutPrefixProvider *dpp,
+ const rgw_raw_obj& read_obj, off_t obj_ofs,
+ off_t read_ofs, off_t len, bool is_head_obj,
+ RGWObjState *astate, void *arg)
+{
+ struct get_obj_data* d = static_cast<struct get_obj_data*>(arg);
+ return d->rgwrados->get_obj_iterate_cb(dpp, read_obj, obj_ofs, read_ofs, len,
+ is_head_obj, astate, arg);
+}
+
+int RGWRados::get_obj_iterate_cb(const DoutPrefixProvider *dpp,
+ const rgw_raw_obj& read_obj, off_t obj_ofs,
+ off_t read_ofs, off_t len, bool is_head_obj,
+ RGWObjState *astate, void *arg)
+{
+ ObjectReadOperation op;
+ struct get_obj_data* d = static_cast<struct get_obj_data*>(arg);
+ string oid, key;
+
+ if (is_head_obj) {
+ /* only when reading from the head object do we need to do the atomic test */
+ int r = append_atomic_test(dpp, astate, op);
+ if (r < 0)
+ return r;
+
+ if (astate &&
+ obj_ofs < astate->data.length()) {
+ unsigned chunk_len = std::min((uint64_t)astate->data.length() - obj_ofs, (uint64_t)len);
+
+ r = d->client_cb->handle_data(astate->data, obj_ofs, chunk_len);
+ if (r < 0)
+ return r;
+
+ len -= chunk_len;
+ d->offset += chunk_len;
+ read_ofs += chunk_len;
+ obj_ofs += chunk_len;
+ if (!len)
+ return 0;
+ }
+ }
+
+ auto obj = d->rgwrados->svc.rados->obj(read_obj);
+ int r = obj.open(dpp);
+ if (r < 0) {
+ ldpp_dout(dpp, 4) << "failed to open rados context for " << read_obj << dendl;
+ return r;
+ }
+
+ ldpp_dout(dpp, 20) << "rados->get_obj_iterate_cb oid=" << read_obj.oid << " obj-ofs=" << obj_ofs << " read_ofs=" << read_ofs << " len=" << len << dendl;
+ op.read(read_ofs, len, nullptr, nullptr);
+
+ const uint64_t cost = len;
+ const uint64_t id = obj_ofs; // use logical object offset for sorting replies
+
+ auto completed = d->aio->get(obj, rgw::Aio::librados_op(std::move(op), d->yield), cost, id);
+
+ return d->flush(std::move(completed));
+}
+
+int RGWRados::Object::Read::iterate(const DoutPrefixProvider *dpp, int64_t ofs, int64_t end, RGWGetDataCB *cb,
+ optional_yield y)
+{
+ RGWRados *store = source->get_store();
+ CephContext *cct = store->ctx();
+ const uint64_t chunk_size = cct->_conf->rgw_get_obj_max_req_size;
+ const uint64_t window_size = cct->_conf->rgw_get_obj_window_size;
+
+ auto aio = rgw::make_throttle(window_size, y);
+ get_obj_data data(store, cb, &*aio, ofs, y);
+
+ int r = store->iterate_obj(dpp, source->get_ctx(), source->get_bucket_info(), state.obj,
+ ofs, end, chunk_size, _get_obj_iterate_cb, &data, y);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "iterate_obj() failed with " << r << dendl;
+ data.cancel(); // drain completions without writing back to client
+ return r;
+ }
+
+ return data.drain();
+}
+
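+// Walk an object's data from `ofs` to `end`, invoking `cb` once per chunk of at
+// most max_chunk_size bytes, following the manifest's stripes when present.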
+int RGWRados::iterate_obj(const DoutPrefixProvider *dpp, RGWObjectCtx& obj_ctx,
+ RGWBucketInfo& bucket_info, const rgw_obj& obj,
+ off_t ofs, off_t end, uint64_t max_chunk_size,
+ iterate_obj_cb cb, void *arg, optional_yield y)
+{
+ rgw_raw_obj head_obj;
+ rgw_raw_obj read_obj;
+ uint64_t read_ofs = ofs;
+ uint64_t len;
+ bool reading_from_head = true;
+ RGWObjState *astate = NULL;
+ RGWObjManifest *manifest = nullptr;
+
+ obj_to_raw(bucket_info.placement_rule, obj, &head_obj);
+
+ int r = get_obj_state(dpp, &obj_ctx, bucket_info, obj, &astate, &manifest, false, y);
+ if (r < 0) {
+ return r;
+ }
+
+ if (end < 0)
+ len = 0;
+ else
+ len = end - ofs + 1;
+
+ if (manifest) {
+ /* now get the relevant object stripe */
+ RGWObjManifest::obj_iterator iter = manifest->obj_find(dpp, ofs);
+
+ RGWObjManifest::obj_iterator obj_end = manifest->obj_end(dpp);
+
+ for (; iter != obj_end && ofs <= end; ++iter) {
+ off_t stripe_ofs = iter.get_stripe_ofs();
+ off_t next_stripe_ofs = stripe_ofs + iter.get_stripe_size();
+
+ while (ofs < next_stripe_ofs && ofs <= end) {
+ read_obj = iter.get_location().get_raw_obj(this);
+ uint64_t read_len = std::min(len, iter.get_stripe_size() - (ofs - stripe_ofs));
+ read_ofs = iter.location_ofs() + (ofs - stripe_ofs);
+
+ if (read_len > max_chunk_size) {
+ read_len = max_chunk_size;
+ }
+
+ reading_from_head = (read_obj == head_obj);
+ r = cb(dpp, read_obj, ofs, read_ofs, read_len, reading_from_head, astate, arg);
+ if (r < 0) {
+ return r;
+ }
+
+ len -= read_len;
+ ofs += read_len;
+ }
+ }
+ } else {
+ while (ofs <= end) {
+ read_obj = head_obj;
+ uint64_t read_len = std::min(len, max_chunk_size);
+
+ r = cb(dpp, read_obj, ofs, ofs, read_len, reading_from_head, astate, arg);
+ if (r < 0) {
+ return r;
+ }
+
+ len -= read_len;
+ ofs += read_len;
+ }
+ }
+
+ return 0;
+}
+
+int RGWRados::obj_operate(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, ObjectWriteOperation *op)
+{
+ rgw_rados_ref ref;
+ int r = get_obj_head_ref(dpp, bucket_info, obj, &ref);
+ if (r < 0) {
+ return r;
+ }
+
+ return rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, op, null_yield);
+}
+
+int RGWRados::obj_operate(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, ObjectReadOperation *op)
+{
+ rgw_rados_ref ref;
+ int r = get_obj_head_ref(dpp, bucket_info, obj, &ref);
+ if (r < 0) {
+ return r;
+ }
+
+ bufferlist outbl;
+
+ return rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, op, &outbl, null_yield);
+}
+
+void RGWRados::olh_cancel_modification(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info,
+ RGWObjState& state, const rgw_obj& olh_obj,
+ const std::string& op_tag, optional_yield y)
+{
+ if (cct->_conf->rgw_debug_inject_olh_cancel_modification_err) {
+ // simulate the scenario where we fail to remove the pending xattr
+ return;
+ }
+
+ rgw_rados_ref ref;
+ int r = get_obj_head_ref(dpp, bucket_info, olh_obj, &ref);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << __func__ << " target_obj=" << olh_obj << " get_obj_head_ref() returned " << r << dendl;
+ return;
+ }
+ string attr_name = RGW_ATTR_OLH_PENDING_PREFIX;
+ attr_name.append(op_tag);
+
+ // first remove the relevant pending prefix
+ ObjectWriteOperation op;
+ bucket_index_guard_olh_op(dpp, state, op);
+ op.rmxattr(attr_name.c_str());
+ r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, y);
+ if (r < 0) {
+ if (r != -ENOENT && r != -ECANCELED) {
+ ldpp_dout(dpp, 0) << __func__ << " target_obj=" << olh_obj << " rmxattr rgw_rados_operate() returned " << r << dendl;
+ }
+ return;
+ }
+
+ if (auto iter = state.attrset.find(RGW_ATTR_OLH_INFO); iter == state.attrset.end()) {
+ // attempt to remove the OLH object if there are no pending ops,
+ // its olh info attr is empty, and its tag hasn't changed
+ ObjectWriteOperation rm_op;
+ bucket_index_guard_olh_op(dpp, state, rm_op);
+ rm_op.cmpxattr(RGW_ATTR_OLH_INFO, CEPH_OSD_CMPXATTR_OP_EQ, bufferlist());
+ cls_obj_check_prefix_exist(rm_op, RGW_ATTR_OLH_PENDING_PREFIX, true);
+ rm_op.remove();
+ r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &rm_op, y);
+ }
+ if (r < 0 && (r != -ENOENT && r != -ECANCELED)) {
+ ldpp_dout(dpp, 0) << __func__ << " target_obj=" << olh_obj << " olh rm rgw_rados_operate() returned " << r << dendl;
+ }
+}
+
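+// Prepare the head/OLH object for a versioning operation: create it if needed,
+// initialize obj/olh tags on an object that is not yet an olh, and record a
+// time-ordered pending-operation xattr, returning its tag in *op_tag.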
+int RGWRados::olh_init_modification_impl(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, string *op_tag)
+{
+ ObjectWriteOperation op;
+
+ ceph_assert(olh_obj.key.instance.empty());
+
+ bool has_tag = (state.exists && has_olh_tag(state.attrset));
+
+ if (!state.exists) {
+ op.create(true);
+ } else {
+ op.assert_exists();
+ struct timespec mtime_ts = real_clock::to_timespec(state.mtime);
+ op.mtime2(&mtime_ts);
+ }
+
+ /*
+   * There are 3 possible cases: the olh object doesn't exist, it exists as an olh, or it exists
+   * as a regular object. If it exists as a regular object we'll need to transform it into an olh.
+   * We do that in two steps: first change its tag and set the olh pending attrs. Once the write
+   * is done we'll need to truncate it, remove extra attrs, and send it to garbage collection. The
+   * bucket index olh log will reflect that.
+   *
+   * We need to generate separate olh and obj tags, as the olh can be colocated with object data:
+   * obj_tag is used for the object data instance, olh_tag for the olh instance.
+ */
+ if (has_tag) {
+ /* guard against racing writes */
+ bucket_index_guard_olh_op(dpp, state, op);
+ } else if (state.exists) {
+ // This is the case where a null versioned object already exists for this key
+ // but it hasn't been initialized as an OLH object yet. We immediately add
+ // the RGW_ATTR_OLH_INFO attr so that the OLH points back to itself and
+ // therefore effectively makes this an unobservable modification.
+ op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, bufferlist());
+ RGWOLHInfo info;
+ info.target = olh_obj;
+ info.removed = false;
+ bufferlist bl;
+ encode(info, bl);
+ op.setxattr(RGW_ATTR_OLH_INFO, bl);
+ }
+
+ if (!has_tag) {
+ /* obj tag */
+ string obj_tag = gen_rand_alphanumeric_lower(cct, 32);
+
+ bufferlist bl;
+ bl.append(obj_tag.c_str(), obj_tag.size());
+ op.setxattr(RGW_ATTR_ID_TAG, bl);
+
+ state.attrset[RGW_ATTR_ID_TAG] = bl;
+ state.obj_tag = bl;
+
+ /* olh tag */
+ string olh_tag = gen_rand_alphanumeric_lower(cct, 32);
+
+ bufferlist olh_bl;
+ olh_bl.append(olh_tag.c_str(), olh_tag.size());
+ op.setxattr(RGW_ATTR_OLH_ID_TAG, olh_bl);
+
+ state.attrset[RGW_ATTR_OLH_ID_TAG] = olh_bl;
+ state.olh_tag = olh_bl;
+ state.is_olh = true;
+
+ bufferlist verbl;
+ op.setxattr(RGW_ATTR_OLH_VER, verbl);
+ }
+
+ bufferlist bl;
+ RGWOLHPendingInfo pending_info;
+ pending_info.time = real_clock::now();
+ encode(pending_info, bl);
+
+#define OLH_PENDING_TAG_LEN 32
+  /* the tag starts with the current time epoch so that entries are sorted by time */
+ char buf[32];
+ utime_t ut(pending_info.time);
+ snprintf(buf, sizeof(buf), "%016llx", (unsigned long long)ut.sec());
+ *op_tag = buf;
+
+ string s = gen_rand_alphanumeric_lower(cct, OLH_PENDING_TAG_LEN - op_tag->size());
+
+ op_tag->append(s);
+
+ string attr_name = RGW_ATTR_OLH_PENDING_PREFIX;
+ attr_name.append(*op_tag);
+
+ op.setxattr(attr_name.c_str(), bl);
+
+ int ret = obj_operate(dpp, bucket_info, olh_obj, &op);
+ if (ret < 0) {
+ return ret;
+ }
+
+ state.exists = true;
+ state.attrset[attr_name] = bl;
+
+ return 0;
+}
+
+int RGWRados::olh_init_modification(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& obj, string *op_tag)
+{
+ int ret;
+
+ ret = olh_init_modification_impl(dpp, bucket_info, state, obj, op_tag);
+ if (ret == -EEXIST) {
+ ret = -ECANCELED;
+ }
+
+ return ret;
+}
+
+int RGWRados::guard_reshard(const DoutPrefixProvider *dpp,
+ BucketShard *bs,
+ const rgw_obj& obj_instance,
+ RGWBucketInfo& bucket_info,
+ std::function<int(BucketShard *)> call)
+{
+ rgw_obj obj;
+ const rgw_obj *pobj = &obj_instance;
+ int r;
+
+ for (int i = 0; i < NUM_RESHARD_RETRIES; ++i) {
+ r = bs->init(pobj->bucket, *pobj, nullptr /* no RGWBucketInfo */, dpp);
+ if (r < 0) {
+ ldpp_dout(dpp, 5) << "bs.init() returned ret=" << r << dendl;
+ return r;
+ }
+
+ r = call(bs);
+ if (r != -ERR_BUSY_RESHARDING) {
+ break;
+ }
+
+ ldpp_dout(dpp, 10) <<
+ "NOTICE: resharding operation on bucket index detected, blocking. obj=" <<
+ obj_instance.key << dendl;
+
+ r = block_while_resharding(bs, obj_instance, bucket_info, null_yield, dpp);
+ if (r == -ERR_BUSY_RESHARDING) {
+ ldpp_dout(dpp, 10) << __func__ <<
+ " NOTICE: block_while_resharding() still busy. obj=" <<
+ obj_instance.key << dendl;
+ continue;
+ } else if (r < 0) {
+ ldpp_dout(dpp, 0) << __func__ <<
+ " ERROR: block_while_resharding() failed. obj=" <<
+ obj_instance.key << ". ret=" << cpp_strerror(-r) << dendl;
+ return r;
+ }
+
+ ldpp_dout(dpp, 20) << "reshard completion identified" << dendl;
+ i = 0; /* resharding is finished, make sure we can retry */
+ } // for loop
+
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: bucket shard callback failed. obj=" <<
+ obj_instance.key << ". ret=" << cpp_strerror(-r) << dendl;
+ return r;
+ }
+
+ return 0;
+}
+
+
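+// Wait for an in-progress reshard of this bucket shard to finish, refreshing
+// the cached bucket info once it does. If the bucket appears stuck in a
+// resharding state (e.g. after a crash), try to take the reshard lock and clear
+// the stale flags; give up with -ERR_BUSY_RESHARDING after a bounded number of
+// retries.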
+int RGWRados::block_while_resharding(RGWRados::BucketShard *bs,
+ const rgw_obj& obj_instance,
+ RGWBucketInfo& bucket_info,
+ optional_yield y,
+ const DoutPrefixProvider *dpp)
+{
+ int ret = 0;
+ cls_rgw_bucket_instance_entry entry;
+
+ // gets loaded by fetch_new_bucket_info; can be used by
+ // clear_resharding
+ std::map<std::string, bufferlist> bucket_attrs;
+
+ // since we want to run this recovery code from two distinct places,
+ // let's just put it in a lambda so we can easily re-use; if the
+  // lambda successfully refreshes the bucket info and re-initializes the
+  // bucket shard, it returns 0, otherwise it returns a negative error code
+ auto fetch_new_bucket_info =
+ [this, bs, &obj_instance, &bucket_info, &bucket_attrs, &y, dpp](const std::string& log_tag) -> int {
+ int ret = get_bucket_info(&svc, bs->bucket.tenant, bs->bucket.name,
+ bucket_info, nullptr, y, dpp, &bucket_attrs);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << __func__ <<
+ " ERROR: failed to refresh bucket info after reshard at " <<
+ log_tag << ": " << cpp_strerror(-ret) << dendl;
+ return ret;
+ }
+
+ ret = bs->init(dpp, bucket_info, obj_instance);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << __func__ <<
+ " ERROR: failed to refresh bucket shard generation after reshard at " <<
+ log_tag << ": " << cpp_strerror(-ret) << dendl;
+ return ret;
+ }
+
+ const auto gen = bucket_info.layout.logs.empty() ? -1 : bucket_info.layout.logs.back().gen;
+ ldpp_dout(dpp, 20) << __func__ <<
+ " INFO: refreshed bucket info after reshard at " <<
+ log_tag << ". new shard_id=" << bs->shard_id << ". gen=" << gen << dendl;
+
+ return 0;
+ }; // lambda fetch_new_bucket_info
+
+ constexpr int num_retries = 10;
+ for (int i = 1; i <= num_retries; i++) { // nb: 1-based for loop
+ auto& ref = bs->bucket_obj.get_ref();
+ ret = cls_rgw_get_bucket_resharding(ref.pool.ioctx(), ref.obj.oid, &entry);
+ if (ret == -ENOENT) {
+ ret = fetch_new_bucket_info("get_bucket_resharding_failed");
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
+ " failed to refresh bucket info after reshard when get bucket "
+ "resharding failed, error: " << cpp_strerror(-ret) << dendl;
+ return ret;
+ }
+ } else if (ret < 0) {
+ ldpp_dout(dpp, 0) << __func__ <<
+ " ERROR: failed to get bucket resharding : " << cpp_strerror(-ret) <<
+ dendl;
+ return ret;
+ }
+
+ if (!entry.resharding_in_progress()) {
+ ret = fetch_new_bucket_info("get_bucket_resharding_succeeded");
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
+ " failed to refresh bucket info after reshard when get bucket "
+ "resharding succeeded, error: " << cpp_strerror(-ret) << dendl;
+ return ret;
+      }
+      // resharding has finished and the bucket info was refreshed; we're done
+      return 0;
+    }
+
+ ldpp_dout(dpp, 20) << __func__ << " NOTICE: reshard still in progress; " <<
+ (i < num_retries ? "retrying" : "too many retries") << dendl;
+
+ if (i == num_retries) {
+ break;
+ }
+
+ // If bucket is erroneously marked as resharding (e.g., crash or
+ // other error) then fix it. If we can take the bucket reshard
+ // lock then it means no other resharding should be taking place,
+ // and we're free to clear the flags.
+ {
+ // since we expect to do this rarely, we'll do our work in a
+ // block and erase our work after each try
+
+ RGWObjectCtx obj_ctx(this->driver);
+ const rgw_bucket& b = bs->bucket;
+ std::string bucket_id = b.get_key();
+ RGWBucketReshardLock reshard_lock(this->driver, bucket_info, true);
+ ret = reshard_lock.lock(dpp);
+ if (ret == -ENOENT) {
+ continue;
+ } else if (ret < 0) {
+ ldpp_dout(dpp, 20) << __func__ <<
+ " ERROR: failed to take reshard lock for bucket " <<
+ bucket_id << "; expected if resharding underway" << dendl;
+ } else {
+ ldpp_dout(dpp, 10) << __func__ <<
+ " INFO: was able to take reshard lock for bucket " <<
+ bucket_id << dendl;
+ // the reshard may have finished, so call clear_resharding()
+ // with its current bucket info; ALSO this will load
+ // bucket_attrs for call to clear_resharding below
+ ret = fetch_new_bucket_info("trying_to_clear_resharding");
+ if (ret < 0) {
+ reshard_lock.unlock();
+ ldpp_dout(dpp, 0) << __func__ <<
+ " ERROR: failed to update bucket info before clear resharding for bucket " <<
+ bucket_id << dendl;
+ continue; // try again
+ }
+
+ ret = RGWBucketReshard::clear_resharding(this->driver, bucket_info, bucket_attrs, dpp);
+ reshard_lock.unlock();
+ if (ret == -ENOENT) {
+ ldpp_dout(dpp, 5) << __func__ <<
+ " INFO: no need to reset reshard flags; old shards apparently"
+ " removed after successful resharding of bucket " <<
+ bucket_id << dendl;
+ continue; // immediately test again
+ } else if (ret < 0) {
+ ldpp_dout(dpp, 0) << __func__ <<
+ " ERROR: failed to clear resharding flags for bucket " <<
+ bucket_id << ", " << cpp_strerror(-ret) << dendl;
+ // wait and then test again
+ } else {
+ ldpp_dout(dpp, 5) << __func__ <<
+ " INFO: apparently successfully cleared resharding flags for "
+ "bucket " << bucket_id << dendl;
+ continue; // if we apparently succeed immediately test again
+ } // if clear resharding succeeded
+ } // if taking of lock succeeded
+ } // block to encapsulate recovery from incomplete reshard
+
+ ret = reshard_wait->wait(y);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << __func__ <<
+ " ERROR: bucket is still resharding, please retry" << dendl;
+ return ret;
+ }
+ } // for loop
+
+ ldpp_dout(dpp, 0) << __func__ <<
+ " ERROR: bucket is still resharding, please retry" << dendl;
+ return -ERR_BUSY_RESHARDING;
+}
+
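+// Point the bucket index OLH entry at the given object instance (or record a
+// delete marker), guarded against concurrent resharding, and add a data log
+// entry for the shard when requested.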
+int RGWRados::bucket_index_link_olh(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info,
+ RGWObjState& olh_state, const rgw_obj& obj_instance,
+ bool delete_marker, const string& op_tag,
+ struct rgw_bucket_dir_entry_meta *meta,
+ uint64_t olh_epoch,
+ real_time unmod_since, bool high_precision_time,
+ optional_yield y,
+ rgw_zone_set *_zones_trace, bool log_data_change)
+{
+ rgw_rados_ref ref;
+ int r = get_obj_head_ref(dpp, bucket_info, obj_instance, &ref);
+ if (r < 0) {
+ return r;
+ }
+
+ rgw_zone_set zones_trace;
+ if (_zones_trace) {
+ zones_trace = *_zones_trace;
+ }
+ zones_trace.insert(svc.zone->get_zone().id, bucket_info.bucket.get_key());
+
+ BucketShard bs(this);
+
+ r = guard_reshard(dpp, &bs, obj_instance, bucket_info,
+ [&](BucketShard *bs) -> int {
+ cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), obj_instance.key.instance);
+ auto& ref = bs->bucket_obj.get_ref();
+ librados::ObjectWriteOperation op;
+ op.assert_exists(); // bucket index shard must exist
+ cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
+ cls_rgw_bucket_link_olh(op, key, olh_state.olh_tag,
+ delete_marker, op_tag, meta, olh_epoch,
+ unmod_since, high_precision_time,
+ svc.zone->need_to_log_data(), zones_trace);
+ return rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
+ });
+ if (r < 0) {
+ ldpp_dout(dpp, 20) << "rgw_rados_operate() after cls_rgw_bucket_link_olh() returned r=" << r << dendl;
+ return r;
+ }
+
+ if (log_data_change) {
+ add_datalog_entry(dpp, svc.datalog_rados, bucket_info, bs.shard_id, y);
+ }
+
+ return 0;
+}
+
+void RGWRados::bucket_index_guard_olh_op(const DoutPrefixProvider *dpp, RGWObjState& olh_state, ObjectOperation& op)
+{
+ ldpp_dout(dpp, 20) << __func__ << "(): olh_state.olh_tag=" << string(olh_state.olh_tag.c_str(), olh_state.olh_tag.length()) << dendl;
+ op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_state.olh_tag);
+}
+
+int RGWRados::bucket_index_unlink_instance(const DoutPrefixProvider *dpp,
+ RGWBucketInfo& bucket_info,
+ const rgw_obj& obj_instance,
+ const string& op_tag, const string& olh_tag,
+ uint64_t olh_epoch, rgw_zone_set *_zones_trace)
+{
+ rgw_rados_ref ref;
+ int r = get_obj_head_ref(dpp, bucket_info, obj_instance, &ref);
+ if (r < 0) {
+ return r;
+ }
+
+ rgw_zone_set zones_trace;
+ if (_zones_trace) {
+ zones_trace = *_zones_trace;
+ }
+ zones_trace.insert(svc.zone->get_zone().id, bucket_info.bucket.get_key());
+
+ BucketShard bs(this);
+
+ cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), obj_instance.key.instance);
+ r = guard_reshard(dpp, &bs, obj_instance, bucket_info,
+ [&](BucketShard *bs) -> int {
+ auto& ref = bs->bucket_obj.get_ref();
+ librados::ObjectWriteOperation op;
+ op.assert_exists(); // bucket index shard must exist
+ cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
+ cls_rgw_bucket_unlink_instance(op, key, op_tag,
+ olh_tag, olh_epoch, svc.zone->need_to_log_data(), zones_trace);
+ return rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
+ });
+ if (r < 0) {
+ ldpp_dout(dpp, 20) << "rgw_rados_operate() after cls_rgw_bucket_link_instance() returned r=" << r << dendl;
+ return r;
+ }
+
+ return 0;
+}
+
+int RGWRados::bucket_index_read_olh_log(const DoutPrefixProvider *dpp,
+ RGWBucketInfo& bucket_info, RGWObjState& state,
+ const rgw_obj& obj_instance, uint64_t ver_marker,
+ std::map<uint64_t, std::vector<rgw_bucket_olh_log_entry> > *log,
+ bool *is_truncated)
+{
+ rgw_rados_ref ref;
+ int r = get_obj_head_ref(dpp, bucket_info, obj_instance, &ref);
+ if (r < 0) {
+ return r;
+ }
+
+ BucketShard bs(this);
+ int ret =
+ bs.init(obj_instance.bucket, obj_instance, nullptr /* no RGWBucketInfo */, dpp);
+ if (ret < 0) {
+ ldpp_dout(dpp, 5) << "bs.init() returned ret=" << ret << dendl;
+ return ret;
+ }
+
+ string olh_tag(state.olh_tag.c_str(), state.olh_tag.length());
+
+ cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string());
+
+ auto& shard_ref = bs.bucket_obj.get_ref();
+ ObjectReadOperation op;
+
+ rgw_cls_read_olh_log_ret log_ret;
+ int op_ret = 0;
+ cls_rgw_get_olh_log(op, key, ver_marker, olh_tag, log_ret, op_ret);
+ bufferlist outbl;
+ r = rgw_rados_operate(dpp, shard_ref.pool.ioctx(), shard_ref.obj.oid, &op, &outbl, null_yield);
+ if (r < 0) {
+ return r;
+ }
+ if (op_ret < 0) {
+ ldpp_dout(dpp, 20) << "cls_rgw_get_olh_log() returned op_ret=" << op_ret << dendl;
+ return op_ret;
+ }
+
+ *log = std::move(log_ret.log);
+ *is_truncated = log_ret.is_truncated;
+
+ return 0;
+}
+
+// a multisite sync bug resulted in the OLH head attributes being overwritten by
+// the attributes from another zone, causing link_olh() to fail endlessly due to
+// olh_tag mismatch. this attempts to detect this case and reconstruct the OLH
+// attributes from the bucket index. see http://tracker.ceph.com/issues/37792
+int RGWRados::repair_olh(const DoutPrefixProvider *dpp, RGWObjState* state, const RGWBucketInfo& bucket_info,
+ const rgw_obj& obj)
+{
+ // fetch the current olh entry from the bucket index
+ rgw_bucket_olh_entry olh;
+ int r = bi_get_olh(dpp, bucket_info, obj, &olh);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "repair_olh failed to read olh entry for " << obj << dendl;
+ return r;
+ }
+ if (olh.tag == rgw_bl_str(state->olh_tag)) { // mismatch already resolved?
+ return 0;
+ }
+
+ ldpp_dout(dpp, 4) << "repair_olh setting olh_tag=" << olh.tag
+ << " key=" << olh.key << " delete_marker=" << olh.delete_marker << dendl;
+
+ // rewrite OLH_ID_TAG and OLH_INFO from current olh
+ ObjectWriteOperation op;
+ // assert this is the same olh tag we think we're fixing
+ bucket_index_guard_olh_op(dpp, *state, op);
+ // preserve existing mtime
+ struct timespec mtime_ts = ceph::real_clock::to_timespec(state->mtime);
+ op.mtime2(&mtime_ts);
+ {
+ bufferlist bl;
+ bl.append(olh.tag.c_str(), olh.tag.size());
+ op.setxattr(RGW_ATTR_OLH_ID_TAG, bl);
+ }
+ {
+ RGWOLHInfo info;
+ info.target = rgw_obj(bucket_info.bucket, olh.key);
+ info.removed = olh.delete_marker;
+ bufferlist bl;
+ encode(info, bl);
+ op.setxattr(RGW_ATTR_OLH_INFO, bl);
+ }
+ rgw_rados_ref ref;
+ r = get_obj_head_ref(dpp, bucket_info, obj, &ref);
+ if (r < 0) {
+ return r;
+ }
+ r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "repair_olh failed to write olh attributes with "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+ return 0;
+}
+
+int RGWRados::bucket_index_trim_olh_log(const DoutPrefixProvider *dpp,
+ RGWBucketInfo& bucket_info,
+ RGWObjState& state,
+ const rgw_obj& obj_instance, uint64_t ver)
+{
+ rgw_rados_ref ref;
+ int r = get_obj_head_ref(dpp, bucket_info, obj_instance, &ref);
+ if (r < 0) {
+ return r;
+ }
+
+ BucketShard bs(this);
+ int ret =
+ bs.init(obj_instance.bucket, obj_instance, nullptr /* no RGWBucketInfo */, dpp);
+ if (ret < 0) {
+ ldpp_dout(dpp, 5) << "bs.init() returned ret=" << ret << dendl;
+ return ret;
+ }
+
+ string olh_tag(state.olh_tag.c_str(), state.olh_tag.length());
+
+ cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string());
+
+ ret = guard_reshard(dpp, &bs, obj_instance, bucket_info,
+ [&](BucketShard *pbs) -> int {
+ ObjectWriteOperation op;
+ op.assert_exists(); // bucket index shard must exist
+ cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
+ cls_rgw_trim_olh_log(op, key, ver, olh_tag);
+ return pbs->bucket_obj.operate(dpp, &op, null_yield);
+ });
+ if (ret < 0) {
+ ldpp_dout(dpp, 20) << "cls_rgw_trim_olh_log() returned r=" << ret << dendl;
+ return ret;
+ }
+
+ return 0;
+}
+
+int RGWRados::bucket_index_clear_olh(const DoutPrefixProvider *dpp,
+ RGWBucketInfo& bucket_info,
+ const std::string& olh_tag,
+ const rgw_obj& obj_instance)
+{
+ rgw_rados_ref ref;
+ int r = get_obj_head_ref(dpp, bucket_info, obj_instance, &ref);
+ if (r < 0) {
+ return r;
+ }
+
+ BucketShard bs(this);
+
+ cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string());
+
+ int ret = guard_reshard(dpp, &bs, obj_instance, bucket_info,
+ [&](BucketShard *pbs) -> int {
+ ObjectWriteOperation op;
+ op.assert_exists(); // bucket index shard must exist
+ auto& ref = pbs->bucket_obj.get_ref();
+ cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
+ cls_rgw_clear_olh(op, key, olh_tag);
+ return rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
+ });
+ if (ret < 0) {
+ ldpp_dout(dpp, 5) << "rgw_rados_operate() after cls_rgw_clear_olh() returned ret=" << ret << dendl;
+ return ret;
+ }
+
+ return 0;
+}
+
+static int decode_olh_info(const DoutPrefixProvider *dpp, CephContext* cct, const bufferlist& bl, RGWOLHInfo *olh)
+{
+ try {
+ auto biter = bl.cbegin();
+ decode(*olh, biter);
+ return 0;
+ } catch (buffer::error& err) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to decode olh info" << dendl;
+ return -EIO;
+ }
+}
+
+int RGWRados::apply_olh_log(const DoutPrefixProvider *dpp,
+ RGWObjectCtx& obj_ctx,
+ RGWObjState& state,
+ RGWBucketInfo& bucket_info,
+ const rgw_obj& obj,
+ bufferlist& olh_tag,
+ std::map<uint64_t, std::vector<rgw_bucket_olh_log_entry> >& log,
+ uint64_t *plast_ver,
+ rgw_zone_set* zones_trace)
+{
+ if (log.empty()) {
+ return 0;
+ }
+
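+ // build one compound write on the OLH head object: assert the OLH tag is
+ // unchanged, record the newest log version in RGW_ATTR_OLH_VER, preserve the
+ // existing mtime, and clear the pending xattr of every log entry we replay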
+ librados::ObjectWriteOperation op;
+
+ uint64_t last_ver = log.rbegin()->first;
+ *plast_ver = last_ver;
+
+ map<uint64_t, vector<rgw_bucket_olh_log_entry> >::iterator iter = log.begin();
+
+ op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_tag);
+ op.cmpxattr(RGW_ATTR_OLH_VER, CEPH_OSD_CMPXATTR_OP_GTE, last_ver);
+
+ bufferlist ver_bl;
+ string last_ver_s = to_string(last_ver);
+ ver_bl.append(last_ver_s.c_str(), last_ver_s.size());
+ op.setxattr(RGW_ATTR_OLH_VER, ver_bl);
+
+ struct timespec mtime_ts = real_clock::to_timespec(state.mtime);
+ op.mtime2(&mtime_ts);
+
+ bool need_to_link = false;
+ uint64_t link_epoch = 0;
+ cls_rgw_obj_key key;
+ bool delete_marker = false;
+ list<cls_rgw_obj_key> remove_instances;
+ bool need_to_remove = false;
+
+ // decode current epoch and instance
+ auto olh_ver = state.attrset.find(RGW_ATTR_OLH_VER);
+ if (olh_ver != state.attrset.end()) {
+ std::string str = olh_ver->second.to_str();
+ std::string err;
+ link_epoch = strict_strtoll(str.c_str(), 10, &err);
+ }
+ auto olh_info = state.attrset.find(RGW_ATTR_OLH_INFO);
+ if (olh_info != state.attrset.end()) {
+ RGWOLHInfo info;
+ int r = decode_olh_info(dpp, cct, olh_info->second, &info);
+ if (r < 0) {
+ return r;
+ }
+ info.target.key.get_index_key(&key);
+ delete_marker = info.removed;
+ }
+
+ for (iter = log.begin(); iter != log.end(); ++iter) {
+ vector<rgw_bucket_olh_log_entry>::iterator viter = iter->second.begin();
+ for (; viter != iter->second.end(); ++viter) {
+ rgw_bucket_olh_log_entry& entry = *viter;
+
+ ldpp_dout(dpp, 20) << "olh_log_entry: epoch=" << iter->first << " op=" << (int)entry.op
+ << " key=" << entry.key.name << "[" << entry.key.instance << "] "
+ << (entry.delete_marker ? "(delete)" : "") << dendl;
+ switch (entry.op) {
+ case CLS_RGW_OLH_OP_REMOVE_INSTANCE:
+ remove_instances.push_back(entry.key);
+ break;
+ case CLS_RGW_OLH_OP_LINK_OLH:
+ // only overwrite a link of the same epoch if its key sorts before
+ if (link_epoch < iter->first || key.instance.empty() ||
+ key.instance > entry.key.instance) {
+ ldpp_dout(dpp, 20) << "apply_olh_log applying key=" << entry.key << " epoch=" << iter->first << " delete_marker=" << entry.delete_marker
+ << " over current=" << key << " epoch=" << link_epoch << " delete_marker=" << delete_marker << dendl;
+ need_to_link = true;
+ need_to_remove = false;
+ key = entry.key;
+ delete_marker = entry.delete_marker;
+ } else {
+ ldpp_dout(dpp, 20) << "apply_olh skipping key=" << entry.key<< " epoch=" << iter->first << " delete_marker=" << entry.delete_marker
+ << " before current=" << key << " epoch=" << link_epoch << " delete_marker=" << delete_marker << dendl;
+ }
+ break;
+ case CLS_RGW_OLH_OP_UNLINK_OLH:
+ need_to_remove = true;
+ need_to_link = false;
+ break;
+ default:
+ ldpp_dout(dpp, 0) << "ERROR: apply_olh_log: invalid op: " << (int)entry.op << dendl;
+ return -EIO;
+ }
+ string attr_name = RGW_ATTR_OLH_PENDING_PREFIX;
+ attr_name.append(entry.op_tag);
+ op.rmxattr(attr_name.c_str());
+ }
+ }
+
+ rgw_rados_ref ref;
+ int r = get_obj_head_ref(dpp, bucket_info, obj, &ref);
+ if (r < 0) {
+ return r;
+ }
+
+ const rgw_bucket& bucket = obj.bucket;
+
+ if (need_to_link) {
+ rgw_obj target(bucket, key);
+ RGWOLHInfo info;
+ info.target = target;
+ info.removed = delete_marker;
+ bufferlist bl;
+ encode(info, bl);
+ op.setxattr(RGW_ATTR_OLH_INFO, bl);
+ }
+
+ /* first remove object instances */
+ for (list<cls_rgw_obj_key>::iterator liter = remove_instances.begin();
+ liter != remove_instances.end(); ++liter) {
+ cls_rgw_obj_key& key = *liter;
+ rgw_obj obj_instance(bucket, key);
+ int ret = delete_obj(dpp, obj_ctx, bucket_info, obj_instance, 0, RGW_BILOG_FLAG_VERSIONED_OP, ceph::real_time(), zones_trace);
+ if (ret < 0 && ret != -ENOENT) {
+ ldpp_dout(dpp, 0) << "ERROR: delete_obj() returned " << ret << " obj_instance=" << obj_instance << dendl;
+ return ret;
+ }
+ }
+
+ /* update olh object */
+ r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: could not apply olh update, r=" << r << dendl;
+ return r;
+ }
+
+ if (need_to_remove) {
+ string olh_tag(state.olh_tag.c_str(), state.olh_tag.length());
+ r = clear_olh(dpp, obj_ctx, obj, bucket_info, ref, olh_tag, last_ver, null_yield);
+ if (r < 0 && r != -ECANCELED) {
+ ldpp_dout(dpp, 0) << "ERROR: could not clear olh, r=" << r << dendl;
+ return r;
+ }
+ } else {
+ r = bucket_index_trim_olh_log(dpp, bucket_info, state, obj, last_ver);
+ if (r < 0 && r != -ECANCELED) {
+ ldpp_dout(dpp, 0) << "ERROR: could not trim olh log, r=" << r << dendl;
+ return r;
+ }
+ }
+
+ return 0;
+}
+
+int RGWRados::clear_olh(const DoutPrefixProvider *dpp,
+ RGWObjectCtx& obj_ctx,
+ const rgw_obj& obj,
+ RGWBucketInfo& bucket_info,
+ const std::string& tag,
+ const uint64_t ver,
+ optional_yield y) {
+ rgw_rados_ref ref;
+ int r = get_obj_head_ref(dpp, bucket_info, obj, &ref);
+ if (r < 0) {
+ return r;
+ }
+ return clear_olh(dpp, obj_ctx, obj, bucket_info, ref, tag, ver, y);
+}
+
+int RGWRados::clear_olh(const DoutPrefixProvider *dpp,
+ RGWObjectCtx& obj_ctx,
+ const rgw_obj& obj,
+ RGWBucketInfo& bucket_info,
+ rgw_rados_ref& ref,
+ const std::string& tag,
+ const uint64_t ver,
+ optional_yield y) {
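+ // two steps: remove the OLH head object (guarded so it only happens while the
+ // tag and version are unchanged and no pending entries remain), then clear the
+ // corresponding OLH entries from the bucket index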
+ ObjectWriteOperation rm_op;
+
+ RGWObjManifest *manifest = nullptr;
+ RGWObjState *s = nullptr;
+
+ int r = get_obj_state(dpp, &obj_ctx, bucket_info, obj, &s, &manifest, false, y);
+ if (r < 0) {
+ return r;
+ }
+ map<string, bufferlist> pending_entries;
+ rgw_filter_attrset(s->attrset, RGW_ATTR_OLH_PENDING_PREFIX, &pending_entries);
+
+ map<string, bufferlist> rm_pending_entries;
+ check_pending_olh_entries(dpp, pending_entries, &rm_pending_entries);
+
+ if (!rm_pending_entries.empty()) {
+ r = remove_olh_pending_entries(dpp, bucket_info, *s, obj, rm_pending_entries);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: rm_pending_entries returned ret=" << r << dendl;
+ return r;
+ }
+ }
+
+ bufferlist tag_bl;
+ tag_bl.append(tag.c_str(), tag.length());
+ rm_op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, tag_bl);
+ rm_op.cmpxattr(RGW_ATTR_OLH_VER, CEPH_OSD_CMPXATTR_OP_EQ, ver);
+ cls_obj_check_prefix_exist(rm_op, RGW_ATTR_OLH_PENDING_PREFIX, true); /* fail if found one of these, pending modification */
+ rm_op.remove();
+
+ r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &rm_op, y);
+ if (r == -ECANCELED) {
+ return r; /* someone else made a modification in the meantime */
+ }
+ /*
+ * only clear the bucket index if the removal was successful; otherwise we might clobber pending operations on this object
+ */
+ r = bucket_index_clear_olh(dpp, bucket_info, tag, obj);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: could not clear bucket index olh entries r=" << r << dendl;
+ return r;
+ }
+ return 0;
+}
+
+/*
+ * read olh log and apply it
+ */
+int RGWRados::update_olh(const DoutPrefixProvider *dpp, RGWObjectCtx& obj_ctx, RGWObjState *state, RGWBucketInfo& bucket_info, const rgw_obj& obj, rgw_zone_set *zones_trace)
+{
+ map<uint64_t, vector<rgw_bucket_olh_log_entry> > log;
+ bool is_truncated;
+ uint64_t ver_marker = 0;
+
+ do {
+ int ret = bucket_index_read_olh_log(dpp, bucket_info, *state, obj, ver_marker, &log, &is_truncated);
+ if (ret < 0) {
+ return ret;
+ }
+ ret = apply_olh_log(dpp, obj_ctx, *state, bucket_info, obj, state->olh_tag, log, &ver_marker, zones_trace);
+ if (ret < 0) {
+ return ret;
+ }
+ } while (is_truncated);
+
+ return 0;
+}
+
+int RGWRados::set_olh(const DoutPrefixProvider *dpp, RGWObjectCtx& obj_ctx,
+ RGWBucketInfo& bucket_info,
+ const rgw_obj& target_obj, bool delete_marker,
+ rgw_bucket_dir_entry_meta *meta,
+ uint64_t olh_epoch, real_time unmod_since, bool high_precision_time,
+ optional_yield y, rgw_zone_set *zones_trace, bool log_data_change)
+{
+ string op_tag;
+
+ rgw_obj olh_obj = target_obj;
+ olh_obj.key.instance.clear();
+
+ RGWObjState *state = NULL;
+ RGWObjManifest *manifest = nullptr;
+
+ int ret = 0;
+ int i;
+
+#define MAX_ECANCELED_RETRY 100
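+ // -ECANCELED from the guarded index/head operations below means we raced with
+ // a concurrent OLH update; invalidate the cached object state and retry up to
+ // MAX_ECANCELED_RETRY times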
+ for (i = 0; i < MAX_ECANCELED_RETRY; i++) {
+ if (ret == -ECANCELED) {
+ obj_ctx.invalidate(olh_obj);
+ }
+
+ ret = get_obj_state(dpp, &obj_ctx, bucket_info, olh_obj, &state, &manifest, false, y); /* don't follow olh */
+ if (ret < 0) {
+ return ret;
+ }
+
+ ret = olh_init_modification(dpp, bucket_info, *state, olh_obj, &op_tag);
+ if (ret < 0) {
+ ldpp_dout(dpp, 20) << "olh_init_modification() target_obj=" << target_obj << " delete_marker=" << (int)delete_marker << " returned " << ret << dendl;
+ if (ret == -ECANCELED) {
+ continue;
+ }
+ return ret;
+ }
+ if (cct->_conf->rgw_debug_inject_set_olh_err) {
+ // fail here to simulate the scenario of an unlinked object instance
+ ret = -cct->_conf->rgw_debug_inject_set_olh_err;
+ } else {
+ ret = bucket_index_link_olh(dpp, bucket_info, *state, target_obj,
+ delete_marker, op_tag, meta, olh_epoch, unmod_since,
+ high_precision_time, y, zones_trace, log_data_change);
+ }
+ if (ret < 0) {
+ ldpp_dout(dpp, 20) << "bucket_index_link_olh() target_obj=" << target_obj << " delete_marker=" << (int)delete_marker << " returned " << ret << dendl;
+ olh_cancel_modification(dpp, bucket_info, *state, olh_obj, op_tag, y);
+ if (ret == -ECANCELED) {
+ // the bucket index rejected the link_olh() due to olh tag mismatch;
+ // attempt to reconstruct olh head attributes based on the bucket index
+ int r2 = repair_olh(dpp, state, bucket_info, olh_obj);
+ if (r2 < 0 && r2 != -ECANCELED) {
+ return r2;
+ }
+ continue;
+ }
+ // it's possible that the pending xattr from this op prevented the olh
+ // object from being cleaned by another thread that was deleting the last
+ // existing version. We invoke a best-effort update_olh here to handle this case.
+ int r = update_olh(dpp, obj_ctx, state, bucket_info, olh_obj);
+ if (r < 0 && r != -ECANCELED) {
+ ldpp_dout(dpp, 20) << "update_olh() target_obj=" << olh_obj << " returned " << r << dendl;
+ }
+ return ret;
+ }
+ break;
+ }
+
+ if (i == MAX_ECANCELED_RETRY) {
+ ldpp_dout(dpp, 0) << "ERROR: exceeded max ECANCELED retries, aborting (EIO)" << dendl;
+ return -EIO;
+ }
+
+ ret = update_olh(dpp, obj_ctx, state, bucket_info, olh_obj);
+ if (ret == -ECANCELED) { /* already did what we needed, no need to retry, raced with another user */
+ ret = 0;
+ }
+ if (ret < 0) {
+ ldpp_dout(dpp, 20) << "update_olh() target_obj=" << target_obj << " returned " << ret << dendl;
+ return ret;
+ }
+
+ return 0;
+}
+
+int RGWRados::unlink_obj_instance(const DoutPrefixProvider *dpp, RGWObjectCtx& obj_ctx, RGWBucketInfo& bucket_info, const rgw_obj& target_obj,
+ uint64_t olh_epoch, optional_yield y, rgw_zone_set *zones_trace)
+{
+ string op_tag;
+
+ rgw_obj olh_obj = target_obj;
+ olh_obj.key.instance.clear();
+
+ RGWObjState *state = NULL;
+ RGWObjManifest *manifest = NULL;
+
+ int ret = 0;
+ int i;
+
+ for (i = 0; i < MAX_ECANCELED_RETRY; i++) {
+ if (ret == -ECANCELED) {
+ obj_ctx.invalidate(olh_obj);
+ }
+
+ ret = get_obj_state(dpp, &obj_ctx, bucket_info, olh_obj, &state, &manifest, false, y); /* don't follow olh */
+ if (ret < 0)
+ return ret;
+
+ ret = olh_init_modification(dpp, bucket_info, *state, olh_obj, &op_tag);
+ if (ret < 0) {
+ ldpp_dout(dpp, 20) << "olh_init_modification() target_obj=" << target_obj << " returned " << ret << dendl;
+ if (ret == -ECANCELED) {
+ continue;
+ }
+ return ret;
+ }
+
+ string olh_tag(state->olh_tag.c_str(), state->olh_tag.length());
+
+ ret = bucket_index_unlink_instance(dpp, bucket_info, target_obj, op_tag, olh_tag, olh_epoch, zones_trace);
+ if (ret < 0) {
+ olh_cancel_modification(dpp, bucket_info, *state, olh_obj, op_tag, y);
+ ldpp_dout(dpp, 20) << "bucket_index_unlink_instance() target_obj=" << target_obj << " returned " << ret << dendl;
+ if (ret == -ECANCELED) {
+ continue;
+ }
+ // it's possible that the pending xattr from this op prevented the olh
+ // object from being cleaned by another thread that was deleting the last
+ // existing version. We invoke a best-effort update_olh here to handle this case.
+ int r = update_olh(dpp, obj_ctx, state, bucket_info, olh_obj, zones_trace);
+ if (r < 0 && r != -ECANCELED) {
+ ldpp_dout(dpp, 20) << "update_olh() target_obj=" << olh_obj << " returned " << r << dendl;
+ }
+ return ret;
+ }
+ break;
+ }
+
+ if (i == MAX_ECANCELED_RETRY) {
+ ldpp_dout(dpp, 0) << "ERROR: exceeded max ECANCELED retries, aborting (EIO)" << dendl;
+ return -EIO;
+ }
+
+ ret = update_olh(dpp, obj_ctx, state, bucket_info, olh_obj, zones_trace);
+ if (ret == -ECANCELED) { /* already did what we needed, no need to retry, raced with another user */
+ return 0;
+ }
+ if (ret < 0) {
+ ldpp_dout(dpp, 20) << "update_olh() target_obj=" << target_obj << " returned " << ret << dendl;
+ return ret;
+ }
+
+ return 0;
+}
+
+void RGWRados::gen_rand_obj_instance_name(rgw_obj_key *target_key)
+{
+#define OBJ_INSTANCE_LEN 32
+ char buf[OBJ_INSTANCE_LEN + 1];
+
+ gen_rand_alphanumeric_no_underscore(cct, buf, OBJ_INSTANCE_LEN); /* don't want it to get url escaped,
+ no underscore for instance name due to the way we encode the raw keys */
+
+ target_key->set_instance(buf);
+}
+
+void RGWRados::gen_rand_obj_instance_name(rgw_obj *target_obj)
+{
+ gen_rand_obj_instance_name(&target_obj->key);
+}
+
+int RGWRados::get_olh(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, const rgw_obj& obj, RGWOLHInfo *olh)
+{
+ map<string, bufferlist> attrset;
+
+ ObjectReadOperation op;
+ op.getxattrs(&attrset, NULL);
+
+ int r = obj_operate(dpp, bucket_info, obj, &op);
+ if (r < 0) {
+ return r;
+ }
+
+ auto iter = attrset.find(RGW_ATTR_OLH_VER);
+ if (iter == attrset.end()) { /* not an olh */
+ return -EINVAL;
+ }
+
+ // the olh target and removed flag are encoded in RGW_ATTR_OLH_INFO
+ iter = attrset.find(RGW_ATTR_OLH_INFO);
+ if (iter == attrset.end()) {
+ return -EINVAL;
+ }
+
+ return decode_olh_info(dpp, cct, iter->second, olh);
+}
+
+void RGWRados::check_pending_olh_entries(const DoutPrefixProvider *dpp,
+ map<string, bufferlist>& pending_entries,
+ map<string, bufferlist> *rm_pending_entries)
+{
+ map<string, bufferlist>::iterator iter = pending_entries.begin();
+
+ real_time now = real_clock::now();
+
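+ // move entries that have exceeded rgw_olh_pending_timeout_sec into
+ // rm_pending_entries for the caller to remove; stop at the first entry still
+ // within the timeout window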
+ while (iter != pending_entries.end()) {
+ auto biter = iter->second.cbegin();
+ RGWOLHPendingInfo pending_info;
+ try {
+ decode(pending_info, biter);
+ } catch (buffer::error& err) {
+ /* skipping bad entry, we could remove it but it might hide a bug */
+ ldpp_dout(dpp, 0) << "ERROR: failed to decode pending entry " << iter->first << dendl;
+ ++iter;
+ continue;
+ }
+
+ map<string, bufferlist>::iterator cur_iter = iter;
+ ++iter;
+ if (now - pending_info.time >= make_timespan(cct->_conf->rgw_olh_pending_timeout_sec)) {
+ (*rm_pending_entries)[cur_iter->first] = cur_iter->second;
+ pending_entries.erase(cur_iter);
+ } else {
+ /* entry names are sorted by time (rounded to a second), so the remaining entries have not expired either */
+ break;
+ }
+ }
+}
+
+int RGWRados::remove_olh_pending_entries(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, map<string, bufferlist>& pending_attrs)
+{
+ rgw_rados_ref ref;
+ int r = get_obj_head_ref(dpp, bucket_info, olh_obj, &ref);
+ if (r < 0) {
+ return r;
+ }
+
+ // trim no more than 1000 entries per osd op
+ constexpr int max_entries = 1000;
+
+ auto i = pending_attrs.begin();
+ while (i != pending_attrs.end()) {
+ ObjectWriteOperation op;
+ bucket_index_guard_olh_op(dpp, state, op);
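+ // the rmxattr calls below only take effect if the OLH tag still matches, so we
+ // never strip pending entries from an OLH head that has since been replaced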
+
+ for (int n = 0; n < max_entries && i != pending_attrs.end(); ++n, ++i) {
+ op.rmxattr(i->first.c_str());
+ }
+
+ r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
+ if (r == -ENOENT || r == -ECANCELED) {
+ /* raced with some other change, shouldn't sweat about it */
+ return 0;
+ }
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: could not apply olh update, r=" << r << dendl;
+ return r;
+ }
+ }
+ return 0;
+}
+
+int RGWRados::follow_olh(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, RGWObjectCtx& obj_ctx, RGWObjState *state, const rgw_obj& olh_obj, rgw_obj *target)
+{
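+ // resolve the OLH head to its current target: drop expired pending entries,
+ // apply any outstanding olh-log entries via update_olh(), then decode the
+ // target object from RGW_ATTR_OLH_INFO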
+ map<string, bufferlist> pending_entries;
+ rgw_filter_attrset(state->attrset, RGW_ATTR_OLH_PENDING_PREFIX, &pending_entries);
+
+ map<string, bufferlist> rm_pending_entries;
+ check_pending_olh_entries(dpp, pending_entries, &rm_pending_entries);
+
+ if (!rm_pending_entries.empty()) {
+ int ret = remove_olh_pending_entries(dpp, bucket_info, *state, olh_obj, rm_pending_entries);
+ if (ret < 0) {
+ ldpp_dout(dpp, 20) << "ERROR: rm_pending_entries returned ret=" << ret << dendl;
+ return ret;
+ }
+ }
+ if (!pending_entries.empty()) {
+ ldpp_dout(dpp, 20) << __func__ << "(): found pending entries, need to update_olh() on bucket=" << olh_obj.bucket << dendl;
+
+ int ret = update_olh(dpp, obj_ctx, state, bucket_info, olh_obj);
+ if (ret < 0) {
+ if (ret == -ECANCELED) {
+ // In this context, ECANCELED means that the OLH tag changed in either the bucket index entry or the OLH object.
+ // If the OLH tag changed, it indicates that a previous OLH entry was removed since this request started. We
+ // return ENOENT to indicate that the OLH object was removed.
+ ret = -ENOENT;
+ }
+ return ret;
+ }
+ }
+
+ auto iter = state->attrset.find(RGW_ATTR_OLH_VER);
+ if (iter == state->attrset.end()) {
+ return -EINVAL;
+ }
+ iter = state->attrset.find(RGW_ATTR_OLH_INFO);
+ if (iter == state->attrset.end()) {
+ return -ENOENT;
+ }
+
+ RGWOLHInfo olh;
+ int ret = decode_olh_info(dpp, cct, iter->second, &olh);
+ if (ret < 0) {
+ return ret;
+ }
+
+ if (olh.removed) {
+ return -ENOENT;
+ }
+
+ *target = olh.target;
+
+ return 0;
+}
+
+int RGWRados::raw_obj_stat(const DoutPrefixProvider *dpp,
+ rgw_raw_obj& obj, uint64_t *psize, real_time *pmtime, uint64_t *epoch,
+ map<string, bufferlist> *attrs, bufferlist *first_chunk,
+ RGWObjVersionTracker *objv_tracker, optional_yield y)
+{
+ rgw_rados_ref ref;
+ int r = get_raw_obj_ref(dpp, obj, &ref);
+ if (r < 0) {
+ return r;
+ }
+
+ map<string, bufferlist> unfiltered_attrset;
+ uint64_t size = 0;
+ struct timespec mtime_ts;
+
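+ // issue a single compound read; only the pieces the caller asked for (objv
+ // check, xattrs, stat, first chunk) are added to the op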
+ ObjectReadOperation op;
+ if (objv_tracker) {
+ objv_tracker->prepare_op_for_read(&op);
+ }
+ if (attrs) {
+ op.getxattrs(&unfiltered_attrset, NULL);
+ }
+ if (psize || pmtime) {
+ op.stat2(&size, &mtime_ts, NULL);
+ }
+ if (first_chunk) {
+ op.read(0, cct->_conf->rgw_max_chunk_size, first_chunk, NULL);
+ }
+ bufferlist outbl;
+ r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, &outbl, y);
+
+ if (epoch) {
+ *epoch = ref.pool.ioctx().get_last_version();
+ }
+
+ if (r < 0)
+ return r;
+
+ if (psize)
+ *psize = size;
+ if (pmtime)
+ *pmtime = ceph::real_clock::from_timespec(mtime_ts);
+ if (attrs) {
+ rgw_filter_attrset(unfiltered_attrset, RGW_ATTR_PREFIX, attrs);
+ }
+
+ return 0;
+}
+
+int RGWRados::get_bucket_stats(const DoutPrefixProvider *dpp,
+ RGWBucketInfo& bucket_info,
+ const rgw::bucket_index_layout_generation& idx_layout,
+ int shard_id, string *bucket_ver, string *master_ver,
+ map<RGWObjCategory, RGWStorageStats>& stats,
+ string *max_marker, bool *syncstopped)
+{
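+ // read each bucket index shard header, accumulate per-category stats, and fold
+ // the per-shard versions and markers into single strings via
+ // BucketIndexShardsManager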
+ vector<rgw_bucket_dir_header> headers;
+ map<int, string> bucket_instance_ids;
+ int r = cls_bucket_head(dpp, bucket_info, idx_layout, shard_id, headers, &bucket_instance_ids);
+ if (r < 0) {
+ return r;
+ }
+
+ ceph_assert(headers.size() == bucket_instance_ids.size());
+
+ auto iter = headers.begin();
+ map<int, string>::iterator viter = bucket_instance_ids.begin();
+ BucketIndexShardsManager ver_mgr;
+ BucketIndexShardsManager master_ver_mgr;
+ BucketIndexShardsManager marker_mgr;
+ char buf[64];
+ for(; iter != headers.end(); ++iter, ++viter) {
+ accumulate_raw_stats(*iter, stats);
+ snprintf(buf, sizeof(buf), "%lu", (unsigned long)iter->ver);
+ ver_mgr.add(viter->first, string(buf));
+ snprintf(buf, sizeof(buf), "%lu", (unsigned long)iter->master_ver);
+ master_ver_mgr.add(viter->first, string(buf));
+ if (shard_id >= 0) {
+ *max_marker = iter->max_marker;
+ } else {
+ marker_mgr.add(viter->first, iter->max_marker);
+ }
+ if (syncstopped != NULL)
+ *syncstopped = iter->syncstopped;
+ }
+ ver_mgr.to_string(bucket_ver);
+ master_ver_mgr.to_string(master_ver);
+ if (shard_id < 0) {
+ marker_mgr.to_string(max_marker);
+ }
+ return 0;
+}
+
+class RGWGetBucketStatsContext : public RGWGetDirHeader_CB {
+ RGWGetBucketStats_CB *cb;
+ uint32_t pendings;
+ map<RGWObjCategory, RGWStorageStats> stats;
+ int ret_code;
+ bool should_cb;
+ ceph::mutex lock = ceph::make_mutex("RGWGetBucketStatsContext");
+
+public:
+ RGWGetBucketStatsContext(RGWGetBucketStats_CB *_cb, uint32_t _pendings)
+ : cb(_cb), pendings(_pendings), stats(), ret_code(0), should_cb(true)
+ {}
+
+ void handle_response(int r, rgw_bucket_dir_header& header) override {
+ std::lock_guard l{lock};
+ if (should_cb) {
+ if ( r >= 0) {
+ accumulate_raw_stats(header, stats);
+ } else {
+ ret_code = r;
+ }
+
+ // Are we all done?
+ if (--pendings == 0) {
+ if (!ret_code) {
+ cb->set_response(&stats);
+ }
+ cb->handle_response(ret_code);
+ cb->put();
+ }
+ }
+ }
+
+ void unset_cb() {
+ std::lock_guard l{lock};
+ should_cb = false;
+ }
+};
+
+int RGWRados::get_bucket_stats_async(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, const rgw::bucket_index_layout_generation& idx_layout, int shard_id, RGWGetBucketStats_CB *ctx)
+{
+ int num_aio = 0;
+ RGWGetBucketStatsContext *get_ctx = new RGWGetBucketStatsContext(ctx, bucket_info.layout.current_index.layout.normal.num_shards ? : 1);
+ ceph_assert(get_ctx);
+ int r = cls_bucket_head_async(dpp, bucket_info, idx_layout, shard_id, get_ctx, &num_aio);
+ if (r < 0) {
+ ctx->put();
+ if (num_aio) {
+ get_ctx->unset_cb();
+ }
+ }
+ get_ctx->put();
+ return r;
+}
+
+int RGWRados::get_bucket_instance_info(const string& meta_key,
+ RGWBucketInfo& info,
+ real_time *pmtime,
+ map<string, bufferlist> *pattrs,
+ optional_yield y,
+ const DoutPrefixProvider *dpp)
+{
+ rgw_bucket bucket;
+ rgw_bucket_parse_bucket_key(cct, meta_key, &bucket, nullptr);
+
+ return get_bucket_instance_info(bucket, info, pmtime, pattrs, y, dpp);
+}
+
+int RGWRados::get_bucket_instance_info(const rgw_bucket& bucket, RGWBucketInfo& info,
+ real_time *pmtime, map<string, bufferlist> *pattrs, optional_yield y,
+ const DoutPrefixProvider *dpp)
+{
+ return ctl.bucket->read_bucket_instance_info(bucket, &info,
+ y,
+ dpp,
+ RGWBucketCtl::BucketInstance::GetParams()
+ .set_mtime(pmtime)
+ .set_attrs(pattrs));
+}
+
+int RGWRados::get_bucket_info(RGWServices *svc,
+ const string& tenant, const string& bucket_name,
+ RGWBucketInfo& info,
+ real_time *pmtime,
+ optional_yield y,
+ const DoutPrefixProvider *dpp, map<string, bufferlist> *pattrs)
+{
+ rgw_bucket bucket;
+ bucket.tenant = tenant;
+ bucket.name = bucket_name;
+ return ctl.bucket->read_bucket_info(bucket, &info, y, dpp,
+ RGWBucketCtl::BucketInstance::GetParams()
+ .set_mtime(pmtime)
+ .set_attrs(pattrs));
+}
+
+int RGWRados::try_refresh_bucket_info(RGWBucketInfo& info,
+ ceph::real_time *pmtime,
+ const DoutPrefixProvider *dpp,
+ map<string, bufferlist> *pattrs)
+{
+ rgw_bucket bucket = info.bucket;
+ bucket.bucket_id.clear();
+
+ auto rv = info.objv_tracker.read_version;
+
+ return ctl.bucket->read_bucket_info(bucket, &info, null_yield, dpp,
+ RGWBucketCtl::BucketInstance::GetParams()
+ .set_mtime(pmtime)
+ .set_attrs(pattrs)
+ .set_refresh_version(rv));
+}
+
+int RGWRados::put_bucket_instance_info(RGWBucketInfo& info, bool exclusive,
+ real_time mtime, map<string, bufferlist> *pattrs,
+ const DoutPrefixProvider *dpp, optional_yield y)
+{
+ return ctl.bucket->store_bucket_instance_info(info.bucket, info, y, dpp,
+ RGWBucketCtl::BucketInstance::PutParams()
+ .set_exclusive(exclusive)
+ .set_mtime(mtime)
+ .set_attrs(pattrs));
+}
+
+int RGWRados::put_linked_bucket_info(RGWBucketInfo& info, bool exclusive, real_time mtime, obj_version *pep_objv,
+ map<string, bufferlist> *pattrs, bool create_entry_point,
+ const DoutPrefixProvider *dpp, optional_yield y)
+{
+ bool create_head = !info.has_instance_obj || create_entry_point;
+
+ int ret = put_bucket_instance_info(info, exclusive, mtime, pattrs, dpp, y);
+ if (ret < 0) {
+ return ret;
+ }
+
+ if (!create_head)
+ return 0; /* done! */
+
+ RGWBucketEntryPoint entry_point;
+ entry_point.bucket = info.bucket;
+ entry_point.owner = info.owner;
+ entry_point.creation_time = info.creation_time;
+ entry_point.linked = true;
+ RGWObjVersionTracker ot;
+ if (pep_objv && !pep_objv->tag.empty()) {
+ ot.write_version = *pep_objv;
+ } else {
+ ot.generate_new_write_ver(cct);
+ if (pep_objv) {
+ *pep_objv = ot.write_version;
+ }
+ }
+ ret = ctl.bucket->store_bucket_entrypoint_info(info.bucket, entry_point, y, dpp, RGWBucketCtl::Bucket::PutParams()
+ .set_exclusive(exclusive)
+ .set_objv_tracker(&ot)
+ .set_mtime(mtime));
+ if (ret < 0)
+ return ret;
+
+ return 0;
+}
+
+int RGWRados::update_containers_stats(map<string, RGWBucketEnt>& m, const DoutPrefixProvider *dpp)
+{
+ map<string, RGWBucketEnt>::iterator iter;
+ for (iter = m.begin(); iter != m.end(); ++iter) {
+ RGWBucketEnt& ent = iter->second;
+ rgw_bucket& bucket = ent.bucket;
+ ent.count = 0;
+ ent.size = 0;
+ ent.size_rounded = 0;
+
+ vector<rgw_bucket_dir_header> headers;
+
+ RGWBucketInfo bucket_info;
+ int ret = get_bucket_instance_info(bucket, bucket_info, NULL, NULL, null_yield, dpp);
+ if (ret < 0) {
+ return ret;
+ }
+
+ int r = cls_bucket_head(dpp, bucket_info, bucket_info.layout.current_index, RGW_NO_SHARD, headers);
+ if (r < 0)
+ return r;
+
+ auto hiter = headers.begin();
+ for (; hiter != headers.end(); ++hiter) {
+ RGWObjCategory category = main_category;
+ auto iter = (hiter->stats).find(category);
+ if (iter != hiter->stats.end()) {
+ struct rgw_bucket_category_stats& stats = iter->second;
+ ent.count += stats.num_entries;
+ ent.size += stats.total_size;
+ ent.size_rounded += stats.total_size_rounded;
+ }
+ }
+
+ // fill in placement_rule from the bucket instance for use in swift's
+ // per-storage policy statistics
+ ent.placement_rule = std::move(bucket_info.placement_rule);
+ }
+
+ return m.size();
+}
+
+int RGWRados::append_async(const DoutPrefixProvider *dpp, rgw_raw_obj& obj, size_t size, bufferlist& bl)
+{
+ rgw_rados_ref ref;
+ int r = get_raw_obj_ref(dpp, obj, &ref);
+ if (r < 0) {
+ return r;
+ }
+ librados::Rados *rad = get_rados_handle();
+ librados::AioCompletion *completion = rad->aio_create_completion(nullptr, nullptr);
+
+ r = ref.pool.ioctx().aio_append(ref.obj.oid, completion, bl, size);
+ completion->release();
+ return r;
+}
+
+int RGWRados::pool_iterate_begin(const DoutPrefixProvider *dpp, const rgw_pool& pool, RGWPoolIterCtx& ctx)
+{
+ librados::IoCtx& io_ctx = ctx.io_ctx;
+ librados::NObjectIterator& iter = ctx.iter;
+
+ int r = open_pool_ctx(dpp, pool, io_ctx, false, false);
+ if (r < 0)
+ return r;
+
+ iter = io_ctx.nobjects_begin();
+
+ return 0;
+}
+
+int RGWRados::pool_iterate_begin(const DoutPrefixProvider *dpp, const rgw_pool& pool, const string& cursor, RGWPoolIterCtx& ctx)
+{
+ librados::IoCtx& io_ctx = ctx.io_ctx;
+ librados::NObjectIterator& iter = ctx.iter;
+
+ int r = open_pool_ctx(dpp, pool, io_ctx, false, false);
+ if (r < 0)
+ return r;
+
+ librados::ObjectCursor oc;
+ if (!oc.from_str(cursor)) {
+ ldpp_dout(dpp, 10) << "failed to parse cursor: " << cursor << dendl;
+ return -EINVAL;
+ }
+
+ try {
+ iter = io_ctx.nobjects_begin(oc);
+ return 0;
+ } catch (const std::system_error& e) {
+ r = -e.code().value();
+ ldpp_dout(dpp, 10) << "nobjects_begin threw " << e.what()
+ << ", returning " << r << dendl;
+ return r;
+ } catch (const std::exception& e) {
+ ldpp_dout(dpp, 10) << "nobjects_begin threw " << e.what()
+ << ", returning -5" << dendl;
+ return -EIO;
+ }
+}
+
+string RGWRados::pool_iterate_get_cursor(RGWPoolIterCtx& ctx)
+{
+ return ctx.iter.get_cursor().to_str();
+}
+
+static int do_pool_iterate(const DoutPrefixProvider *dpp, CephContext* cct, RGWPoolIterCtx& ctx, uint32_t num,
+ vector<rgw_bucket_dir_entry>& objs,
+ bool *is_truncated, RGWAccessListFilter *filter)
+{
+ librados::IoCtx& io_ctx = ctx.io_ctx;
+ librados::NObjectIterator& iter = ctx.iter;
+
+ if (iter == io_ctx.nobjects_end())
+ return -ENOENT;
+
+ uint32_t i;
+
+ for (i = 0; i < num && iter != io_ctx.nobjects_end(); ++i, ++iter) {
+ rgw_bucket_dir_entry e;
+
+ string oid = iter->get_oid();
+ ldpp_dout(dpp, 20) << "RGWRados::pool_iterate: got " << oid << dendl;
+
+ // fill it in with initial values; we may correct later
+ if (filter && !filter->filter(oid, oid))
+ continue;
+
+ e.key = oid;
+ objs.push_back(e);
+ }
+
+ if (is_truncated)
+ *is_truncated = (iter != io_ctx.nobjects_end());
+
+ return objs.size();
+}
+
+int RGWRados::pool_iterate(const DoutPrefixProvider *dpp, RGWPoolIterCtx& ctx, uint32_t num, vector<rgw_bucket_dir_entry>& objs,
+ bool *is_truncated, RGWAccessListFilter *filter)
+{
+ // catch exceptions from NObjectIterator::operator++()
+ try {
+ return do_pool_iterate(dpp, cct, ctx, num, objs, is_truncated, filter);
+ } catch (const std::system_error& e) {
+ int r = -e.code().value();
+ ldpp_dout(dpp, 10) << "NObjectIterator threw exception " << e.what()
+ << ", returning " << r << dendl;
+ return r;
+ } catch (const std::exception& e) {
+ ldpp_dout(dpp, 10) << "NObjectIterator threw exception " << e.what()
+ << ", returning -5" << dendl;
+ return -EIO;
+ }
+}
+
+int RGWRados::list_raw_objects_init(const DoutPrefixProvider *dpp, const rgw_pool& pool, const string& marker, RGWListRawObjsCtx *ctx)
+{
+ if (!ctx->initialized) {
+ int r = pool_iterate_begin(dpp, pool, marker, ctx->iter_ctx);
+ if (r < 0) {
+ ldpp_dout(dpp, 10) << "failed to list objects pool_iterate_begin() returned r=" << r << dendl;
+ return r;
+ }
+ ctx->initialized = true;
+ }
+ return 0;
+}
+
+int RGWRados::list_raw_objects_next(const DoutPrefixProvider *dpp, const string& prefix_filter, int max,
+ RGWListRawObjsCtx& ctx, list<string>& oids,
+ bool *is_truncated)
+{
+ if (!ctx.initialized) {
+ return -EINVAL;
+ }
+ RGWAccessListFilterPrefix filter(prefix_filter);
+ vector<rgw_bucket_dir_entry> objs;
+ int r = pool_iterate(dpp, ctx.iter_ctx, max, objs, is_truncated, &filter);
+ if (r < 0) {
+ if(r != -ENOENT)
+ ldpp_dout(dpp, 10) << "failed to list objects pool_iterate returned r=" << r << dendl;
+ return r;
+ }
+
+ vector<rgw_bucket_dir_entry>::iterator iter;
+ for (iter = objs.begin(); iter != objs.end(); ++iter) {
+ oids.push_back(iter->key.name);
+ }
+
+ return oids.size();
+}
+
+int RGWRados::list_raw_objects(const DoutPrefixProvider *dpp, const rgw_pool& pool, const string& prefix_filter,
+ int max, RGWListRawObjsCtx& ctx, list<string>& oids,
+ bool *is_truncated)
+{
+ if (!ctx.initialized) {
+ int r = list_raw_objects_init(dpp, pool, string(), &ctx);
+ if (r < 0) {
+ return r;
+ }
+ }
+
+ return list_raw_objects_next(dpp, prefix_filter, max, ctx, oids, is_truncated);
+}
+
+string RGWRados::list_raw_objs_get_cursor(RGWListRawObjsCtx& ctx)
+{
+ return pool_iterate_get_cursor(ctx.iter_ctx);
+}
+
+int RGWRados::bi_get_instance(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
+ rgw_bucket_dir_entry *dirent)
+{
+ rgw_cls_bi_entry bi_entry;
+ int r = bi_get(dpp, bucket_info, obj, BIIndexType::Instance, &bi_entry);
+ if (r < 0 && r != -ENOENT) {
+ ldpp_dout(dpp, 0) << "ERROR: bi_get() returned r=" << r << dendl;
+ }
+ if (r < 0) {
+ return r;
+ }
+ auto iter = bi_entry.data.cbegin();
+ try {
+ decode(*dirent, iter);
+ } catch (buffer::error& err) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to decode bi_entry()" << dendl;
+ return -EIO;
+ }
+
+ return 0;
+}
+
+int RGWRados::bi_get_olh(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
+ rgw_bucket_olh_entry *olh)
+{
+ rgw_cls_bi_entry bi_entry;
+ int r = bi_get(dpp, bucket_info, obj, BIIndexType::OLH, &bi_entry);
+ if (r < 0 && r != -ENOENT) {
+ ldpp_dout(dpp, 0) << "ERROR: bi_get() returned r=" << r << dendl;
+ }
+ if (r < 0) {
+ return r;
+ }
+ auto iter = bi_entry.data.cbegin();
+ try {
+ decode(*olh, iter);
+ } catch (buffer::error& err) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to decode bi_entry()" << dendl;
+ return -EIO;
+ }
+
+ return 0;
+}
+
+int RGWRados::bi_get(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
+ BIIndexType index_type, rgw_cls_bi_entry *entry)
+{
+ BucketShard bs(this);
+ int ret = bs.init(dpp, bucket_info, obj);
+ if (ret < 0) {
+ ldpp_dout(dpp, 5) << "bs.init() returned ret=" << ret << dendl;
+ return ret;
+ }
+
+ cls_rgw_obj_key key(obj.key.get_index_key_name(), obj.key.instance);
+
+ auto& ref = bs.bucket_obj.get_ref();
+
+ return cls_rgw_bi_get(ref.pool.ioctx(), ref.obj.oid, index_type, key, entry);
+}
+
+void RGWRados::bi_put(ObjectWriteOperation& op, BucketShard& bs, rgw_cls_bi_entry& entry)
+{
+ auto& ref = bs.bucket_obj.get_ref();
+ cls_rgw_bi_put(op, ref.obj.oid, entry);
+}
+
+int RGWRados::bi_put(BucketShard& bs, rgw_cls_bi_entry& entry)
+{
+ auto& ref = bs.bucket_obj.get_ref();
+ int ret = cls_rgw_bi_put(ref.pool.ioctx(), ref.obj.oid, entry);
+ if (ret < 0)
+ return ret;
+
+ return 0;
+}
+
+int RGWRados::bi_put(const DoutPrefixProvider *dpp, rgw_bucket& bucket, rgw_obj& obj, rgw_cls_bi_entry& entry)
+{
+ // make sure incomplete multipart uploads are hashed correctly
+ if (obj.key.ns == RGW_OBJ_NS_MULTIPART) {
+ RGWMPObj mp;
+ mp.from_meta(obj.key.name);
+ obj.index_hash_source = mp.get_key();
+ }
+ BucketShard bs(this);
+
+ int ret = bs.init(bucket, obj, nullptr /* no RGWBucketInfo */, dpp);
+ if (ret < 0) {
+ ldpp_dout(dpp, 5) << "bs.init() returned ret=" << ret << dendl;
+ return ret;
+ }
+
+ return bi_put(bs, entry);
+}
+
+int RGWRados::bi_list(const DoutPrefixProvider *dpp, rgw_bucket& bucket,
+ const string& obj_name_filter, const string& marker, uint32_t max,
+ list<rgw_cls_bi_entry> *entries, bool *is_truncated)
+{
+ rgw_obj obj(bucket, obj_name_filter);
+ BucketShard bs(this);
+ int ret = bs.init(bucket, obj, nullptr /* no RGWBucketInfo */, dpp);
+ if (ret < 0) {
+ ldpp_dout(dpp, 5) << "bs.init() returned ret=" << ret << dendl;
+ return ret;
+ }
+
+ auto& ref = bs.bucket_obj.get_ref();
+ ret = cls_rgw_bi_list(ref.pool.ioctx(), ref.obj.oid, obj_name_filter, marker, max, entries, is_truncated);
+ if (ret == -ENOENT) {
+ *is_truncated = false;
+ }
+ if (ret < 0)
+ return ret;
+
+ return 0;
+}
+
+int RGWRados::bi_list(BucketShard& bs, const string& obj_name_filter, const string& marker, uint32_t max,
+ list<rgw_cls_bi_entry> *entries, bool *is_truncated)
+{
+ auto& ref = bs.bucket_obj.get_ref();
+ int ret = cls_rgw_bi_list(ref.pool.ioctx(), ref.obj.oid, obj_name_filter, marker, max, entries, is_truncated);
+ if (ret < 0)
+ return ret;
+
+ return 0;
+}
+
+int RGWRados::bi_list(const DoutPrefixProvider *dpp,
+ const RGWBucketInfo& bucket_info, int shard_id, const string& obj_name_filter, const string& marker, uint32_t max,
+ list<rgw_cls_bi_entry> *entries, bool *is_truncated)
+{
+ BucketShard bs(this);
+ int ret = bs.init(dpp, bucket_info,
+ bucket_info.layout.current_index,
+ shard_id);
+ if (ret < 0) {
+ ldpp_dout(dpp, 5) << "bs.init() returned ret=" << ret << dendl;
+ return ret;
+ }
+
+ return bi_list(bs, obj_name_filter, marker, max, entries, is_truncated);
+}
+
+int RGWRados::bi_remove(const DoutPrefixProvider *dpp, BucketShard& bs)
+{
+ auto& ref = bs.bucket_obj.get_ref();
+ int ret = ref.pool.ioctx().remove(ref.obj.oid);
+ if (ret == -ENOENT) {
+ ret = 0;
+ }
+ if (ret < 0) {
+ ldpp_dout(dpp, 5) << "bs.index_ctx.remove(" << bs.bucket_obj << ") returned ret=" << ret << dendl;
+ return ret;
+ }
+
+ return 0;
+}
+
+int RGWRados::gc_operate(const DoutPrefixProvider *dpp, string& oid, librados::ObjectWriteOperation *op)
+{
+ return rgw_rados_operate(dpp, gc_pool_ctx, oid, op, null_yield);
+}
+
+int RGWRados::gc_aio_operate(const string& oid, librados::AioCompletion *c,
+ librados::ObjectWriteOperation *op)
+{
+ return gc_pool_ctx.aio_operate(oid, c, op);
+}
+
+int RGWRados::gc_operate(const DoutPrefixProvider *dpp, string& oid, librados::ObjectReadOperation *op, bufferlist *pbl)
+{
+ return rgw_rados_operate(dpp, gc_pool_ctx, oid, op, pbl, null_yield);
+}
+
+int RGWRados::list_gc_objs(int *index, string& marker, uint32_t max, bool expired_only, std::list<cls_rgw_gc_obj_info>& result, bool *truncated, bool& processing_queue)
+{
+ return gc->list(index, marker, max, expired_only, result, truncated, processing_queue);
+}
+
+int RGWRados::process_gc(bool expired_only)
+{
+ return gc->process(expired_only);
+}
+
+int RGWRados::list_lc_progress(string& marker, uint32_t max_entries,
+ vector<std::unique_ptr<rgw::sal::Lifecycle::LCEntry>>& progress_map,
+ int& index)
+{
+ return lc->list_lc_progress(marker, max_entries, progress_map, index);
+}
+
+int RGWRados::process_lc(const std::unique_ptr<rgw::sal::Bucket>& optional_bucket)
+{
+ RGWLC lc;
+ lc.initialize(cct, this->driver);
+ RGWLC::LCWorker worker(&lc, cct, &lc, 0);
+ auto ret = lc.process(&worker, optional_bucket, true /* once */);
+ lc.stop_processor(); // sets down_flag, but returns immediately
+ return ret;
+}
+
+bool RGWRados::process_expire_objects(const DoutPrefixProvider *dpp)
+{
+ return obj_expirer->inspect_all_shards(dpp, utime_t(), ceph_clock_now());
+}
+
+int RGWRados::cls_obj_prepare_op(const DoutPrefixProvider *dpp, BucketShard& bs, RGWModifyOp op, string& tag,
+ rgw_obj& obj, uint16_t bilog_flags, optional_yield y, rgw_zone_set *_zones_trace)
+{
+ const bool bitx = cct->_conf->rgw_bucket_index_transaction_instrumentation;
+ ldout_bitx(bitx, dpp, 10) << "ENTERING " << __func__ << ": bucket-shard=" << bs << " obj=" << obj << " tag=" << tag << " op=" << op << dendl_bitx;
+ ldout_bitx(bitx, dpp, 25) << "BACKTRACE: " << __func__ << ": " << ClibBackTrace(0) << dendl_bitx;
+
+ rgw_zone_set zones_trace;
+ if (_zones_trace) {
+ zones_trace = *_zones_trace;
+ }
+ zones_trace.insert(svc.zone->get_zone().id, bs.bucket.get_key());
+
+ ObjectWriteOperation o;
+ o.assert_exists(); // bucket index shard must exist
+
+ cls_rgw_obj_key key(obj.key.get_index_key_name(), obj.key.instance);
+ cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING);
+ cls_rgw_bucket_prepare_op(o, op, tag, key, obj.key.get_loc(), svc.zone->need_to_log_data(), bilog_flags, zones_trace);
+ int ret = bs.bucket_obj.operate(dpp, &o, y);
+ ldout_bitx(bitx, dpp, 10) << "EXITING " << __func__ << ": ret=" << ret << dendl_bitx;
+ return ret;
+}
+
+int RGWRados::cls_obj_complete_op(BucketShard& bs, const rgw_obj& obj, RGWModifyOp op, string& tag,
+ int64_t pool, uint64_t epoch,
+ rgw_bucket_dir_entry& ent, RGWObjCategory category,
+ list<rgw_obj_index_key> *remove_objs, uint16_t bilog_flags, rgw_zone_set *_zones_trace)
+{
+ const bool bitx = cct->_conf->rgw_bucket_index_transaction_instrumentation;
+ ldout_bitx_c(bitx, cct, 10) << "ENTERING " << __func__ << ": bucket-shard=" << bs <<
+ " obj=" << obj << " tag=" << tag << " op=" << op <<
+ ", remove_objs=" << (remove_objs ? *remove_objs : std::list<rgw_obj_index_key>()) << dendl_bitx;
+ ldout_bitx_c(bitx, cct, 25) << "BACKTRACE: " << __func__ << ": " << ClibBackTrace(0) << dendl_bitx;
+
+ ObjectWriteOperation o;
+ o.assert_exists(); // bucket index shard must exist
+
+ rgw_bucket_dir_entry_meta dir_meta;
+ dir_meta = ent.meta;
+ dir_meta.category = category;
+
+ rgw_zone_set zones_trace;
+ if (_zones_trace) {
+ zones_trace = *_zones_trace;
+ }
+ zones_trace.insert(svc.zone->get_zone().id, bs.bucket.get_key());
+
+ rgw_bucket_entry_ver ver;
+ ver.pool = pool;
+ ver.epoch = epoch;
+ cls_rgw_obj_key key(ent.key.name, ent.key.instance);
+ cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING);
+ cls_rgw_bucket_complete_op(o, op, tag, ver, key, dir_meta, remove_objs,
+ svc.zone->need_to_log_data(), bilog_flags, &zones_trace);
+ complete_op_data *arg;
+ index_completion_manager->create_completion(obj, op, tag, ver, key, dir_meta, remove_objs,
+ svc.zone->need_to_log_data(), bilog_flags, &zones_trace, &arg);
+ librados::AioCompletion *completion = arg->rados_completion;
+ int ret = bs.bucket_obj.aio_operate(arg->rados_completion, &o);
+ completion->release(); /* can't reference arg here, as it might have already been released */
+
+ ldout_bitx_c(bitx, cct, 10) << "EXITING " << __func__ << ": ret=" << ret << dendl_bitx;
+ return ret;
+}
+
+int RGWRados::cls_obj_complete_add(BucketShard& bs, const rgw_obj& obj, string& tag,
+ int64_t pool, uint64_t epoch,
+ rgw_bucket_dir_entry& ent, RGWObjCategory category,
+ list<rgw_obj_index_key> *remove_objs, uint16_t bilog_flags, rgw_zone_set *zones_trace)
+{
+ return cls_obj_complete_op(bs, obj, CLS_RGW_OP_ADD, tag, pool, epoch, ent, category, remove_objs, bilog_flags, zones_trace);
+}
+
+int RGWRados::cls_obj_complete_del(BucketShard& bs, string& tag,
+ int64_t pool, uint64_t epoch,
+ rgw_obj& obj,
+ real_time& removed_mtime,
+ list<rgw_obj_index_key> *remove_objs,
+ uint16_t bilog_flags,
+ rgw_zone_set *zones_trace)
+{
+ rgw_bucket_dir_entry ent;
+ ent.meta.mtime = removed_mtime;
+ obj.key.get_index_key(&ent.key);
+ return cls_obj_complete_op(bs, obj, CLS_RGW_OP_DEL, tag, pool, epoch,
+ ent, RGWObjCategory::None, remove_objs,
+ bilog_flags, zones_trace);
+}
+
+int RGWRados::cls_obj_complete_cancel(BucketShard& bs, string& tag, rgw_obj& obj,
+ list<rgw_obj_index_key> *remove_objs,
+ uint16_t bilog_flags, rgw_zone_set *zones_trace)
+{
+ rgw_bucket_dir_entry ent;
+ obj.key.get_index_key(&ent.key);
+ return cls_obj_complete_op(bs, obj, CLS_RGW_OP_CANCEL, tag,
+ -1 /* pool id */, 0, ent,
+ RGWObjCategory::None, remove_objs, bilog_flags,
+ zones_trace);
+}
+
+int RGWRados::cls_obj_set_bucket_tag_timeout(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, uint64_t timeout)
+{
+ RGWSI_RADOS::Pool index_pool;
+ map<int, string> bucket_objs;
+ int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, std::nullopt, bucket_info.layout.current_index, &index_pool, &bucket_objs, nullptr);
+ if (r < 0)
+ return r;
+
+ return CLSRGWIssueSetTagTimeout(index_pool.ioctx(), bucket_objs, cct->_conf->rgw_bucket_index_max_aio, timeout)();
+}
+
+
+// returns 0 if there is an error in calculation
+uint32_t RGWRados::calc_ordered_bucket_list_per_shard(uint32_t num_entries,
+ uint32_t num_shards)
+{
+ if (num_shards == 0) {
+ // we'll get a floating point exception since we divide by
+ // num_shards
+ return 0;
+ }
+
+ // We want to minimize the chances that when num_shards >>
+ // num_entries that we return much fewer than num_entries to the
+ // client. Given all the overhead of making a cls call to the osd,
+ // returning a few entries is not much more work than returning one
+ // entry. This minimum might be better tuned based on future
+ // experiments where num_shards >> num_entries. (Note: ">>" should
+ // be interpreted as "much greater than".)
+ constexpr uint32_t min_read = 8;
+
+ // The following is based on _"Balls into Bins" -- A Simple and
+ // Tight Analysis_ by Raab and Steger. We add 1 as a way to handle
+ // cases when num_shards >> num_entries (it almost serves as a
+ // ceiling calculation). We also assume alpha is 1.0 and extract it
+ // from the calculation. Future work could involve memoizing some of
+ // the transcendental functions to minimize repeatedly re-calling
+ // them with the same parameters, which we expect to be the case the
+ // majority of the time.
+ uint32_t calc_read =
+ 1 +
+ static_cast<uint32_t>((num_entries / num_shards) +
+ sqrt((2 * num_entries) *
+ log(num_shards) / num_shards));
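+ // e.g. with num_entries=1000 and num_shards=11: 1000/11 = 90 (integer
+ // division) and sqrt(2*1000*ln(11)/11) is roughly 20.9, so calc_read is
+ // 1 + 110 = 111 entries requested per shard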
+
+ return std::max(min_read, calc_read);
+}
+
+
+int RGWRados::cls_bucket_list_ordered(const DoutPrefixProvider *dpp,
+ RGWBucketInfo& bucket_info,
+ const rgw::bucket_index_layout_generation& idx_layout,
+ const int shard_id,
+ const rgw_obj_index_key& start_after,
+ const std::string& prefix,
+ const std::string& delimiter,
+ const uint32_t num_entries,
+ const bool list_versions,
+ const uint16_t expansion_factor,
+ ent_map_t& m,
+ bool* is_truncated,
+ bool* cls_filtered,
+ rgw_obj_index_key* last_entry,
+ optional_yield y,
+ RGWBucketListNameFilter force_check_filter)
+{
+ const bool bitx = cct->_conf->rgw_bucket_index_transaction_instrumentation;
+
+ /* expansion_factor allows the number of entries to read to grow
+ * exponentially; this is used when earlier reads are producing too
+ * few results, perhaps due to filtering or to a series of
+ * namespaced entries */
+
+ ldout_bitx(bitx, dpp, 10) << "ENTERING " << __func__ << ": " << bucket_info.bucket <<
+ " start_after=\"" << start_after.to_string() <<
+ "\", prefix=\"" << prefix <<
+ ", delimiter=\"" << delimiter <<
+ "\", shard_id=" << shard_id <<
+ "\", num_entries=" << num_entries <<
+ ", shard_id=" << shard_id <<
+ ", list_versions=" << list_versions <<
+ ", expansion_factor=" << expansion_factor <<
+ ", force_check_filter is " <<
+ (force_check_filter ? "set" : "unset") << dendl_bitx;
+ ldout_bitx(bitx, dpp, 25) << "BACKTRACE: " << __func__ << ": " << ClibBackTrace(0) << dendl_bitx;
+
+ m.clear();
+
+ RGWSI_RADOS::Pool index_pool;
+ // key - oid (for different shards if there is any)
+ // value - list result for the corresponding oid (shard), it is filled by
+ // the AIO callback
+ std::map<int, std::string> shard_oids;
+ int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, shard_id, idx_layout,
+ &index_pool, &shard_oids,
+ nullptr);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << __func__ <<
+ ": open_bucket_index for " << bucket_info.bucket << " failed" << dendl;
+ return r;
+ }
+
+ const uint32_t shard_count = shard_oids.size();
+ if (shard_count == 0) {
+ ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
+ ": the bucket index shard count appears to be 0, "
+ "which is an illegal value" << dendl;
+ return -ERR_INVALID_BUCKET_STATE;
+ }
+
+ uint32_t num_entries_per_shard;
+ if (expansion_factor == 0) {
+ num_entries_per_shard =
+ calc_ordered_bucket_list_per_shard(num_entries, shard_count);
+ } else if (expansion_factor <= 11) {
+ // we'll max out the exponential multiplication factor at 1024 (2<<10)
+ num_entries_per_shard =
+ std::min(num_entries,
+ (uint32_t(1 << (expansion_factor - 1)) *
+ calc_ordered_bucket_list_per_shard(num_entries, shard_count)));
+ } else {
+ num_entries_per_shard = num_entries;
+ }
+
+ if (num_entries_per_shard == 0) {
+ ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
+ ": unable to calculate the number of entries to read from each "
+ "bucket index shard" << dendl;
+ return -ERR_INVALID_BUCKET_STATE;
+ }
+
+ ldpp_dout(dpp, 10) << __func__ <<
+ ": request from each of " << shard_count <<
+ " shard(s) for " << num_entries_per_shard << " entries to get " <<
+ num_entries << " total entries" << dendl;
+
+ auto& ioctx = index_pool.ioctx();
+ std::map<int, rgw_cls_list_ret> shard_list_results;
+ cls_rgw_obj_key start_after_key(start_after.name, start_after.instance);
+ r = CLSRGWIssueBucketList(ioctx, start_after_key, prefix, delimiter,
+ num_entries_per_shard,
+ list_versions, shard_oids, shard_list_results,
+ cct->_conf->rgw_bucket_index_max_aio)();
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << __func__ <<
+ ": CLSRGWIssueBucketList for " << bucket_info.bucket <<
+ " failed" << dendl;
+ return r;
+ }
+
+ // to manage the iterators through each shard's list results
+ struct ShardTracker {
+ const size_t shard_idx;
+ rgw_cls_list_ret& result;
+ const std::string& oid_name;
+ RGWRados::ent_map_t::iterator cursor;
+ RGWRados::ent_map_t::iterator end;
+
+ // manages an iterator through a shard and provides other
+ // accessors
+ ShardTracker(size_t _shard_idx,
+ rgw_cls_list_ret& _result,
+ const std::string& _oid_name):
+ shard_idx(_shard_idx),
+ result(_result),
+ oid_name(_oid_name),
+ cursor(_result.dir.m.begin()),
+ end(_result.dir.m.end())
+ {}
+
+ inline const std::string& entry_name() const {
+ return cursor->first;
+ }
+ rgw_bucket_dir_entry& dir_entry() const {
+ return cursor->second;
+ }
+ inline bool is_truncated() const {
+ return result.is_truncated;
+ }
+ inline ShardTracker& advance() {
+ ++cursor;
+ // return a self-reference to allow for chaining of calls, such
+ // as x.advance().at_end()
+ return *this;
+ }
+ inline bool at_end() const {
+ return cursor == end;
+ }
+ }; // ShardTracker
+
+ // add this tracker's next entry, if any remain, to the candidates map
+ auto next_candidate = [] (CephContext *cct, ShardTracker& t,
+ std::multimap<std::string, size_t>& candidates,
+ size_t tracker_idx) {
+ if (!t.at_end()) {
+ candidates.emplace(t.entry_name(), tracker_idx);
+ }
+ return;
+ };
+
+ // one tracker per shard requested (may not be all shards)
+ std::vector<ShardTracker> results_trackers;
+ results_trackers.reserve(shard_list_results.size());
+ for (auto& r : shard_list_results) {
+ results_trackers.emplace_back(r.first, r.second, shard_oids[r.first]);
+
+ // if any *one* shard's result is truncated, the entire result is
+ // truncated
+ *is_truncated = *is_truncated || r.second.is_truncated;
+
+ // unless *all* shards are cls_filtered, the entire result is
+ // not filtered
+ *cls_filtered = *cls_filtered && r.second.cls_filtered;
+ }
+
+ // create a map to track the next candidate entry from ShardTracker
+ // (key=candidate, value=index into results_trackers); as we consume
+ // entries from shards, we replace them with the next entries in the
+ // shards until we run out
+ std::multimap<std::string, size_t> candidates;
+ size_t tracker_idx = 0;
+ std::vector<size_t> vidx;
+ vidx.reserve(shard_list_results.size());
+ for (auto& t : results_trackers) {
+ // it's important that the values in the map refer to the index
+ // into the results_trackers vector, which may not be the same
+ // as the shard number (i.e., when not all shards are requested)
+ next_candidate(cct, t, candidates, tracker_idx);
+ ++tracker_idx;
+ }
+
+ rgw_bucket_dir_entry*
+ last_entry_visited = nullptr; // to set last_entry (marker)
+ std::map<std::string, bufferlist> updates;
+ uint32_t count = 0;
+ while (count < num_entries && !candidates.empty()) {
+ r = 0;
+ // select the next entry in lexical order (first key in map);
+ // again tracker_idx is not necessarily shard number, but is index
+ // into results_trackers vector
+ tracker_idx = candidates.begin()->second;
+ auto& tracker = results_trackers.at(tracker_idx);
+
+ const std::string& name = tracker.entry_name();
+ rgw_bucket_dir_entry& dirent = tracker.dir_entry();
+
+ ldpp_dout(dpp, 20) << __func__ << ": currently processing " <<
+ dirent.key << " from shard " << tracker.shard_idx << dendl;
+
+ const bool force_check =
+ force_check_filter && force_check_filter(dirent.key.name);
+
+ if ((!dirent.exists &&
+ !dirent.is_delete_marker() &&
+ !dirent.is_common_prefix()) ||
+ !dirent.pending_map.empty() ||
+ force_check) {
+ /* there are uncommitted ops. We need to check the current
+ * state, and if the tags are old we need to do clean-up as
+ * well. */
+ librados::IoCtx sub_ctx;
+ sub_ctx.dup(ioctx);
+ ldout_bitx(bitx, dpp, 20) << "INFO: " << __func__ <<
+ " calling check_disk_state bucket=" << bucket_info.bucket <<
+ " entry=" << dirent.key << dendl_bitx;
+ r = check_disk_state(dpp, sub_ctx, bucket_info, dirent, dirent,
+ updates[tracker.oid_name], y);
+ if (r < 0 && r != -ENOENT) {
+ ldpp_dout(dpp, 0) << __func__ <<
+ ": check_disk_state for \"" << dirent.key <<
+ "\" failed with r=" << r << dendl;
+ return r;
+ }
+ } else {
+ r = 0;
+ }
+
+ // at this point either r >= 0 or r == -ENOENT
+ if (r >= 0) { // i.e., if r != -ENOENT
+ ldpp_dout(dpp, 10) << __func__ << ": got " <<
+ dirent.key << dendl;
+
+ auto [it, inserted] = m.insert_or_assign(name, std::move(dirent));
+ last_entry_visited = &it->second;
+ if (inserted) {
+ ++count;
+ } else {
+ ldpp_dout(dpp, 0) << "WARNING: " << __func__ <<
+ " reassigned map value at \"" << name <<
+ "\", which should not happen" << dendl;
+ }
+ } else {
+ ldpp_dout(dpp, 10) << __func__ << ": skipping " <<
+ dirent.key.name << "[" << dirent.key.instance << "]" << dendl;
+ last_entry_visited = &tracker.dir_entry();
+ }
+
+ // refresh the candidates map
+ vidx.clear();
+ bool need_to_stop = false;
+ auto range = candidates.equal_range(name);
+ for (auto i = range.first; i != range.second; ++i) {
+ vidx.push_back(i->second);
+ }
+ candidates.erase(range.first, range.second);
+ for (auto idx : vidx) {
+ auto& tracker_match = results_trackers.at(idx);
+ tracker_match.advance();
+ next_candidate(cct, tracker_match, candidates, idx);
+ if (tracker_match.at_end() && tracker_match.is_truncated()) {
+ need_to_stop = true;
+ break;
+ }
+ }
+ if (need_to_stop) {
+ // once we exhaust a shard that is truncated, we need to stop, as
+ // we cannot rule out that one of the next entries needs to come
+ // from that shard; the S3 and Swift protocols allow returning
+ // fewer entries than requested
+ ldpp_dout(dpp, 10) << __func__ <<
+ ": stopped accumulating results at count=" << count <<
+ ", dirent=\"" << dirent.key <<
+ "\", because its shard is truncated and exhausted" << dendl;
+ break;
+ }
+ } // while we haven't provided requested # of result entries
+
+ // suggest updates if there are any
+ for (auto& miter : updates) {
+ if (miter.second.length()) {
+ ObjectWriteOperation o;
+ cls_rgw_suggest_changes(o, miter.second);
+ // we don't care if we lose suggested updates, send them off blindly
+ AioCompletion *c =
+ librados::Rados::aio_create_completion(nullptr, nullptr);
+
+ ldout_bitx(bitx, dpp, 10) << "INFO: " << __func__ <<
+ ": doing dir_suggest on " << miter.first << dendl_bitx;
+ ioctx.aio_operate(miter.first, c, &o);
+ c->release();
+ }
+ } // updates loop
+
+ // determine truncation by checking if all the returned entries are
+ // consumed or not
+ *is_truncated = false;
+ for (const auto& t : results_trackers) {
+ if (!t.at_end() || t.is_truncated()) {
+ *is_truncated = true;
+ break;
+ }
+ }
+
+ ldpp_dout(dpp, 20) << __func__ <<
+ ": returning, count=" << count << ", is_truncated=" << *is_truncated <<
+ dendl;
+
+ if (*is_truncated && count < num_entries) {
+ ldpp_dout(dpp, 10) << __func__ <<
+ ": requested " << num_entries << " entries but returning " <<
+ count << ", which is truncated" << dendl;
+ }
+
+ if (last_entry_visited != nullptr && last_entry) {
+ *last_entry = last_entry_visited->key;
+ ldpp_dout(dpp, 20) << __func__ <<
+ ": returning, last_entry=" << *last_entry << dendl;
+ } else {
+ ldpp_dout(dpp, 20) << __func__ <<
+ ": returning, last_entry NOT SET" << dendl;
+ }
+
+ ldout_bitx(bitx, dpp, 10) << "EXITING " << __func__ << dendl_bitx;
+ return 0;
+} // RGWRados::cls_bucket_list_ordered
+
+
+// A helper function to retrieve the hash source from an incomplete
+// multipart entry by removing everything from the second-to-last
+// period onward.
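+// For example (illustrative only), a multipart meta entry such as
+// "myobj.2~upload_id.meta" (with the namespace already stripped) yields
+// the hash source "myobj"; this works even when the object name itself
+// contains periods, e.g. "a.b.2~upload_id.1" yields "a.b".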
+static int parse_index_hash_source(const std::string& oid_wo_ns, std::string *index_hash_source) {
+ std::size_t found = oid_wo_ns.rfind('.');
+ if (found == std::string::npos || found < 1) {
+ return -EINVAL;
+ }
+ found = oid_wo_ns.rfind('.', found - 1);
+ if (found == std::string::npos || found < 1) {
+ return -EINVAL;
+ }
+ *index_hash_source = oid_wo_ns.substr(0, found);
+ return 0;
+}
+
+
+int RGWRados::cls_bucket_list_unordered(const DoutPrefixProvider *dpp,
+ RGWBucketInfo& bucket_info,
+ const rgw::bucket_index_layout_generation& idx_layout,
+ int shard_id,
+ const rgw_obj_index_key& start_after,
+ const std::string& prefix,
+ uint32_t num_entries,
+ bool list_versions,
+ std::vector<rgw_bucket_dir_entry>& ent_list,
+ bool *is_truncated,
+ rgw_obj_index_key *last_entry,
+ optional_yield y,
+ RGWBucketListNameFilter force_check_filter) {
+ const bool bitx = cct->_conf->rgw_bucket_index_transaction_instrumentation;
+
+ ldout_bitx(bitx, dpp, 10) << "ENTERING " << __func__ << ": " << bucket_info.bucket <<
+ " start_after=\"" << start_after <<
+ "\", prefix=\"" << prefix <<
+ "\", shard_id=" << shard_id <<
+ "\", num_entries=" << num_entries <<
+ ", list_versions=" << list_versions <<
+ (force_check_filter ? "set" : "unset") << dendl_bitx;
+ ldout_bitx(bitx, dpp, 25) << "BACKTRACE: " << __func__ << ": " << ClibBackTrace(0) << dendl_bitx;
+
+ ent_list.clear();
+ static MultipartMetaFilter multipart_meta_filter;
+
+ *is_truncated = false;
+ RGWSI_RADOS::Pool index_pool;
+
+ std::map<int, std::string> oids;
+ int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, shard_id, idx_layout, &index_pool, &oids, nullptr);
+ if (r < 0) {
+ return r;
+ }
+
+ auto& ioctx = index_pool.ioctx();
+
+ const uint32_t num_shards = oids.size();
+
+ rgw_obj_index_key marker = start_after;
+ uint32_t current_shard;
+ if (shard_id >= 0) {
+ current_shard = shard_id;
+ } else if (start_after.empty()) {
+ current_shard = 0u;
+ } else {
+ // at this point we have a non-empty marker (start_after), so we
+ // need to determine which bucket index shard it maps to, so we
+ // can start reading from there
+
+
+ // now convert the key (oid) to an rgw_obj_key since that will
+ // separate out the namespace, name, and instance
+ rgw_obj_key obj_key;
+ bool parsed = rgw_obj_key::parse_raw_oid(start_after.name, &obj_key);
+ if (!parsed) {
+ ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
+ " received an invalid start marker: \"" << start_after << "\"" <<
+ dendl;
+ return -EINVAL;
+ } else if (obj_key.name.empty()) {
+ // if the name is empty that means the object name came in with
+ // a namespace only, and therefore we need to start our scan at
+ // the first bucket index shard
+ current_shard = 0u;
+ } else {
+ // so now we have the key used to compute the bucket index shard
+ // and can extract the specific shard from it
+ if (obj_key.ns == RGW_OBJ_NS_MULTIPART) {
+ // use obj_key.ns == RGW_OBJ_NS_MULTIPART rather than relying on
+ // MultipartMetaFilter, because MultipartMetaFilter only checks the
+ // ".meta" suffix; that would exclude multipart data entries and
+ // could mistakenly include regular objects whose names end in ".meta"
+ string index_hash_source;
+ r = parse_index_hash_source(obj_key.name, &index_hash_source);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
+ " parse_index_hash_source unable to parse \"" << obj_key.name <<
+ "\", r=" << r << dendl;
+ return r;
+ }
+ current_shard = svc.bi_rados->bucket_shard_index(index_hash_source, num_shards);
+ } else {
+ current_shard = svc.bi_rados->bucket_shard_index(obj_key.name, num_shards);
+ }
+ }
+ }
+
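+ // scan the shards one at a time, in shard order, accumulating
+ // entries until we have num_entries of them or run out of shards;
+ // the resulting list is therefore not in global lexical order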
+ uint32_t count = 0u;
+ std::map<std::string, bufferlist> updates;
+ rgw_obj_index_key last_added_entry;
+ while (count <= num_entries &&
+ ((shard_id >= 0 && current_shard == uint32_t(shard_id)) ||
+ current_shard < num_shards)) {
+ const std::string& oid = oids[current_shard];
+ rgw_cls_list_ret result;
+
+ librados::ObjectReadOperation op;
+ const std::string empty_delimiter;
+ cls_rgw_bucket_list_op(op, marker, prefix, empty_delimiter,
+ num_entries,
+ list_versions, &result);
+ r = rgw_rados_operate(dpp, ioctx, oid, &op, nullptr, null_yield);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
+ ": error in rgw_rados_operate (bucket list op), r=" << r << dendl;
+ return r;
+ }
+
+ for (auto& entry : result.dir.m) {
+ rgw_bucket_dir_entry& dirent = entry.second;
+
+ bool force_check = force_check_filter &&
+ force_check_filter(dirent.key.name);
+ if ((!dirent.exists && !dirent.is_delete_marker()) ||
+ !dirent.pending_map.empty() ||
+ force_check) {
+ /* there are uncommitted ops. We need to check the current state,
+ * and if the tags are old we need to do cleanup as well. */
+ librados::IoCtx sub_ctx;
+ sub_ctx.dup(ioctx);
+ ldout_bitx(bitx, dpp, 20) << "INFO: " << __func__ <<
+ ": calling check_disk_state bucket=" << bucket_info.bucket <<
+ " entry=" << dirent.key << dendl_bitx;
+ r = check_disk_state(dpp, sub_ctx, bucket_info, dirent, dirent, updates[oid], y);
+ if (r < 0 && r != -ENOENT) {
+ ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
+ ": error in check_disk_state, r=" << r << dendl;
+ return r;
+ }
+ } else {
+ r = 0;
+ }
+
+ // at this point either r >= 0 or r == -ENOENT
+ if (r >= 0) { // i.e., if r != -ENOENT
+ ldpp_dout(dpp, 10) << __func__ << ": got " <<
+ dirent.key << dendl;
+
+ if (count < num_entries) {
+ marker = last_added_entry = dirent.key; // double assign
+ ent_list.emplace_back(std::move(dirent));
+ ++count;
+ } else {
+ last_added_entry = dirent.key;
+ *is_truncated = true;
+ ldpp_dout(dpp, 10) << "INFO: " << __func__ <<
+ ": reached max entries (" << num_entries << ") to return at \"" <<
+ dirent.key << "\"" << dendl;
+ goto check_updates;
+ }
+ } else { // r == -ENOENT
+ // in the case of -ENOENT, make sure we're advancing the marker
+ // for the next listing operation on this shard
+ marker = dirent.key;
+ }
+ } // entry for loop
+
+ if (!result.is_truncated) {
+ // if we reached the end of this shard, move on to the next shard
+ ++current_shard;
+ marker = rgw_obj_index_key();
+ }
+ } // shard loop
+
+check_updates:
+
+ // suggest updates if there are any
+ std::map<std::string, bufferlist>::iterator miter = updates.begin();
+ for (; miter != updates.end(); ++miter) {
+ if (miter->second.length()) {
+ ObjectWriteOperation o;
+ cls_rgw_suggest_changes(o, miter->second);
+ // we don't care if we lose suggested updates, send them off blindly
+ AioCompletion *c = librados::Rados::aio_create_completion(nullptr, nullptr);
+
+ ldout_bitx(bitx, dpp, 10) << "INFO: " << __func__ <<
+ " doing dir_suggest on " << miter->first << dendl_bitx;
+ ioctx.aio_operate(miter->first, c, &o);
+ c->release();
+ }
+ }
+
+ if (last_entry && !ent_list.empty()) {
+ *last_entry = last_added_entry;
+ }
+
+ ldout_bitx(bitx, dpp, 10) << "EXITING " << __func__ << dendl_bitx;
+ return 0;
+} // RGWRados::cls_bucket_list_unordered
+
+
+int RGWRados::cls_obj_usage_log_add(const DoutPrefixProvider *dpp, const string& oid,
+ rgw_usage_log_info& info)
+{
+ rgw_raw_obj obj(svc.zone->get_zone_params().usage_log_pool, oid);
+
+ rgw_rados_ref ref;
+ int r = get_raw_obj_ref(dpp, obj, &ref);
+ if (r < 0) {
+ return r;
+ }
+
+ ObjectWriteOperation op;
+ cls_rgw_usage_log_add(op, info);
+
+ r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
+ return r;
+}
+
+int RGWRados::cls_obj_usage_log_read(const DoutPrefixProvider *dpp, const string& oid, const string& user, const string& bucket,
+ uint64_t start_epoch, uint64_t end_epoch, uint32_t max_entries,
+ string& read_iter, map<rgw_user_bucket, rgw_usage_log_entry>& usage,
+ bool *is_truncated)
+{
+ rgw_raw_obj obj(svc.zone->get_zone_params().usage_log_pool, oid);
+
+ rgw_rados_ref ref;
+ int r = get_raw_obj_ref(dpp, obj, &ref);
+ if (r < 0) {
+ return r;
+ }
+
+ *is_truncated = false;
+
+ r = cls_rgw_usage_log_read(ref.pool.ioctx(), ref.obj.oid, user, bucket, start_epoch, end_epoch,
+ max_entries, read_iter, usage, is_truncated);
+
+ return r;
+}
+
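+// Each cls trim call removes a bounded chunk of the usage log, so keep
+// re-issuing the trim until the class method reports -ENODATA, i.e.
+// nothing remains to trim in the requested epoch range.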
+static int cls_rgw_usage_log_trim_repeat(const DoutPrefixProvider *dpp, rgw_rados_ref ref, const string& user, const string& bucket, uint64_t start_epoch, uint64_t end_epoch)
+{
+ bool done = false;
+ do {
+ librados::ObjectWriteOperation op;
+ cls_rgw_usage_log_trim(op, user, bucket, start_epoch, end_epoch);
+ int r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
+ if (r == -ENODATA)
+ done = true;
+ else if (r < 0)
+ return r;
+ } while (!done);
+
+ return 0;
+}
+
+int RGWRados::cls_obj_usage_log_trim(const DoutPrefixProvider *dpp, const string& oid, const string& user, const string& bucket,
+ uint64_t start_epoch, uint64_t end_epoch)
+{
+ rgw_raw_obj obj(svc.zone->get_zone_params().usage_log_pool, oid);
+
+ rgw_rados_ref ref;
+ int r = get_raw_obj_ref(dpp, obj, &ref);
+ if (r < 0) {
+ return r;
+ }
+
+ r = cls_rgw_usage_log_trim_repeat(dpp, ref, user, bucket, start_epoch, end_epoch);
+ return r;
+}
+
+int RGWRados::cls_obj_usage_log_clear(const DoutPrefixProvider *dpp, string& oid)
+{
+ rgw_raw_obj obj(svc.zone->get_zone_params().usage_log_pool, oid);
+
+ rgw_rados_ref ref;
+ int r = get_raw_obj_ref(dpp, obj, &ref);
+ if (r < 0) {
+ return r;
+ }
+ librados::ObjectWriteOperation op;
+ cls_rgw_usage_log_clear(op);
+ r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
+ return r;
+}
+
+
+// note: this removes entries from the rados bucket index objects
+// without going through CLS; this is known to be called from
+// "radosgw-admin unlink" and "radosgw-admin bucket check --fix"
+int RGWRados::remove_objs_from_index(const DoutPrefixProvider *dpp,
+ RGWBucketInfo& bucket_info,
+ const std::list<rgw_obj_index_key>& entry_key_list)
+{
+ const bool bitx = cct->_conf->rgw_bucket_index_transaction_instrumentation;
+ ldout_bitx(bitx, dpp, 10) << "ENTERING " << __func__ << ": bucket=" << bucket_info.bucket <<
+ " entry_key_list.size()=" << entry_key_list.size() << dendl_bitx;
+ ldout_bitx(bitx, dpp, 25) << "BACKTRACE: " << __func__ << ": " << ClibBackTrace(0) << dendl_bitx;
+
+ const auto& current_index = bucket_info.get_current_index();
+ if (is_layout_indexless(current_index)) {
+ return -EINVAL;
+ }
+ const uint32_t num_shards = current_index.layout.normal.num_shards;
+
+ RGWSI_RADOS::Pool index_pool;
+ std::map<int, std::string> index_oids;
+ int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, std::nullopt,
+ bucket_info.layout.current_index,
+ &index_pool, &index_oids, nullptr);
+ if (r < 0) {
+ ldout_bitx(bitx, dpp, 0) << "ERROR: " << __func__ <<
+ " open_bucket_index returned " << r << dendl_bitx;
+ return r;
+ }
+
+ // split up removals by shard
+ std::map<int, std::set<std::string>> sharded_removals;
+ for (const auto& entry_key : entry_key_list) {
+ const rgw_obj_key obj_key(entry_key);
+ const uint32_t shard =
+ RGWSI_BucketIndex_RADOS::bucket_shard_index(obj_key, num_shards);
+
+ // entry_key already combines namespace and name, so we first have
+ // to break them apart before we can recombine them with the instance
+ std::string name;
+ std::string ns; // namespace
+ rgw_obj_key::parse_index_key(entry_key.name, &name, &ns);
+ rgw_obj_key full_key(name, entry_key.instance, ns);
+ std::string combined_key = full_key.get_oid();
+
+ sharded_removals[shard].insert(combined_key);
+
+ ldout_bitx(bitx, dpp, 20) << "INFO: " << __func__ <<
+ ": removal from bucket index, bucket=" << bucket_info.bucket <<
+ " key=" << combined_key << " designated for shard " << shard <<
+ dendl_bitx;
+ }
+
+ for (const auto& removals : sharded_removals) {
+ const int shard = removals.first;
+ const std::string& oid = index_oids[shard];
+
+ ldout_bitx(bitx, dpp, 10) << "INFO: " << __func__ <<
+ ": removal from bucket index, bucket=" << bucket_info.bucket <<
+ ", shard=" << shard << ", oid=" << oid << ", num_keys=" <<
+ removals.second.size() << dendl_bitx;
+
+ r = index_pool.ioctx().omap_rm_keys(oid, removals.second);
+ if (r < 0) {
+ ldout_bitx(bitx, dpp, 0) << "ERROR: " << __func__ <<
+ ": omap_rm_keys returned ret=" << r <<
+ dendl_bitx;
+ return r;
+ }
+ }
+
+ ldout_bitx(bitx, dpp, 5) <<
+ "EXITING " << __func__ << " and returning " << r << dendl_bitx;
+
+ return r;
+}
+
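+// Reconcile a bucket index entry (list_state) against the head object's
+// actual on-disk state: if the object is gone, append a suggested
+// removal to suggested_updates and return -ENOENT; otherwise refresh
+// the entry's metadata and append a suggested update.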
+int RGWRados::check_disk_state(const DoutPrefixProvider *dpp,
+ librados::IoCtx io_ctx,
+ RGWBucketInfo& bucket_info,
+ rgw_bucket_dir_entry& list_state,
+ rgw_bucket_dir_entry& object,
+ bufferlist& suggested_updates,
+ optional_yield y)
+{
+ const bool bitx = cct->_conf->rgw_bucket_index_transaction_instrumentation;
+ ldout_bitx(bitx, dpp, 10) << "ENTERING " << __func__ << ": bucket=" <<
+ bucket_info.bucket << " dir_entry=" << list_state.key << dendl_bitx;
+
+ uint8_t suggest_flag = (svc.zone->need_to_log_data() ? CEPH_RGW_DIR_SUGGEST_LOG_OP : 0);
+
+ std::string loc;
+
+ rgw_obj obj(bucket_info.bucket, list_state.key);
+
+ MultipartMetaFilter multipart_meta_filter;
+ string temp_key;
+ if (multipart_meta_filter.filter(list_state.key.name, temp_key)) {
+ obj.in_extra_data = true;
+ }
+
+ string oid;
+ get_obj_bucket_and_oid_loc(obj, oid, loc);
+
+ if (loc != list_state.locator) {
+ ldpp_dout(dpp, 0) << "WARNING: generated locator (" << loc << ") is different from listed locator (" << list_state.locator << ")" << dendl;
+ }
+
+ io_ctx.locator_set_key(list_state.locator);
+
+ RGWObjState *astate = NULL;
+ RGWObjManifest *manifest = nullptr;
+ RGWObjectCtx rctx(this->driver);
+ int r = get_obj_state(dpp, &rctx, bucket_info, obj, &astate, &manifest, false, y);
+ if (r < 0)
+ return r;
+
+ list_state.pending_map.clear(); // we don't need this and it inflates size
+ if (!list_state.is_delete_marker() && !astate->exists) {
+ ldout_bitx(bitx, dpp, 10) << "INFO: " << __func__ << ": disk state exists" << dendl_bitx;
+ /* object doesn't exist right now -- hopefully because it's
+ * marked as !exists and got deleted */
+ if (list_state.exists) {
+ ldout_bitx(bitx, dpp, 10) << "INFO: " << __func__ << ": index list state exists" << dendl_bitx;
+ /* FIXME: what should happen now? Work out if there are any
+ * non-bad ways this could happen (there probably are, but annoying
+ * to handle!) */
+ }
+
+ // encode a suggested removal of that key
+ list_state.ver.epoch = io_ctx.get_last_version();
+ list_state.ver.pool = io_ctx.get_id();
+ ldout_bitx(bitx, dpp, 10) << "INFO: " << __func__ << ": encoding remove of " << list_state.key << " on suggested_updates" << dendl_bitx;
+ cls_rgw_encode_suggestion(CEPH_RGW_REMOVE | suggest_flag, list_state, suggested_updates);
+ return -ENOENT;
+ }
+
+ string etag;
+ string content_type;
+ string storage_class;
+ ACLOwner owner;
+ bool appendable = false;
+
+ object.meta.size = astate->size;
+ object.meta.accounted_size = astate->accounted_size;
+ object.meta.mtime = astate->mtime;
+
+ map<string, bufferlist>::iterator iter = astate->attrset.find(RGW_ATTR_ETAG);
+ if (iter != astate->attrset.end()) {
+ etag = rgw_bl_str(iter->second);
+ }
+ iter = astate->attrset.find(RGW_ATTR_CONTENT_TYPE);
+ if (iter != astate->attrset.end()) {
+ content_type = rgw_bl_str(iter->second);
+ }
+ iter = astate->attrset.find(RGW_ATTR_STORAGE_CLASS);
+ if (iter != astate->attrset.end()) {
+ storage_class = rgw_bl_str(iter->second);
+ }
+ iter = astate->attrset.find(RGW_ATTR_ACL);
+ if (iter != astate->attrset.end()) {
+ r = decode_policy(dpp, iter->second, &owner);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "WARNING: could not decode policy for object: " << obj << dendl;
+ }
+ }
+ iter = astate->attrset.find(RGW_ATTR_APPEND_PART_NUM);
+ if (iter != astate->attrset.end()) {
+ appendable = true;
+ }
+
+ if (manifest) {
+ RGWObjManifest::obj_iterator miter;
+ for (miter = manifest->obj_begin(dpp); miter != manifest->obj_end(dpp); ++miter) {
+ const rgw_raw_obj& raw_loc = miter.get_location().get_raw_obj(this);
+ rgw_obj loc;
+ RGWSI_Tier_RADOS::raw_obj_to_obj(manifest->get_obj().bucket, raw_loc, &loc);
+
+ if (loc.key.ns == RGW_OBJ_NS_MULTIPART) {
+ ldout_bitx(bitx, dpp, 10) << "INFO: " << __func__ << " removing manifest part from index loc=" << loc << dendl_bitx;
+ r = delete_obj_index(loc, astate->mtime, dpp, y);
+ if (r < 0) {
+ ldout_bitx(bitx, dpp, 0) <<
+ "WARNING: " << __func__ << ": delete_obj_index returned r=" << r << dendl_bitx;
+ }
+ }
+ }
+ }
+
+ object.meta.etag = etag;
+ object.meta.content_type = content_type;
+ object.meta.storage_class = storage_class;
+ object.meta.owner = owner.get_id().to_str();
+ object.meta.owner_display_name = owner.get_display_name();
+ object.meta.appendable = appendable;
+
+ // encode suggested updates
+
+ list_state.meta.size = object.meta.size;
+ list_state.meta.accounted_size = object.meta.accounted_size;
+ list_state.meta.mtime = object.meta.mtime;
+ list_state.meta.category = main_category;
+ list_state.meta.etag = etag;
+ list_state.meta.appendable = appendable;
+ list_state.meta.content_type = content_type;
+ list_state.meta.storage_class = storage_class;
+
+ librados::IoCtx head_obj_ctx; // initialize to data pool so we can get pool id
+ r = get_obj_head_ioctx(dpp, bucket_info, obj, &head_obj_ctx);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << __func__ <<
+ " WARNING: unable to find head object data pool for \"" <<
+ obj << "\", not updating version pool/epoch" << dendl;
+ } else {
+ list_state.ver.pool = head_obj_ctx.get_id();
+ list_state.ver.epoch = astate->epoch;
+ }
+
+ if (astate->obj_tag.length() > 0) {
+ list_state.tag = astate->obj_tag.c_str();
+ }
+
+ list_state.meta.owner = owner.get_id().to_str();
+ list_state.meta.owner_display_name = owner.get_display_name();
+
+ list_state.exists = true;
+
+ ldout_bitx(bitx, dpp, 10) << "INFO: " << __func__ <<
+ ": encoding update of " << list_state.key << " on suggested_updates" << dendl_bitx;
+ cls_rgw_encode_suggestion(CEPH_RGW_UPDATE | suggest_flag, list_state, suggested_updates);
+
+ ldout_bitx(bitx, dpp, 10) << "EXITING " << __func__ << dendl_bitx;
+ return 0;
+} // RGWRados::check_disk_state
+
+int RGWRados::cls_bucket_head(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw::bucket_index_layout_generation& idx_layout, int shard_id, vector<rgw_bucket_dir_header>& headers, map<int, string> *bucket_instance_ids)
+{
+ RGWSI_RADOS::Pool index_pool;
+ map<int, string> oids;
+ map<int, struct rgw_cls_list_ret> list_results;
+ int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, shard_id, idx_layout, &index_pool, &oids, bucket_instance_ids);
+ if (r < 0) {
+ ldpp_dout(dpp, 20) << "cls_bucket_head: open_bucket_index() returned "
+ << r << dendl;
+ return r;
+ }
+
+ r = CLSRGWIssueGetDirHeader(index_pool.ioctx(), oids, list_results, cct->_conf->rgw_bucket_index_max_aio)();
+ if (r < 0) {
+ ldpp_dout(dpp, 20) << "cls_bucket_head: CLSRGWIssueGetDirHeader() returned "
+ << r << dendl;
+ return r;
+ }
+
+ map<int, struct rgw_cls_list_ret>::iterator iter = list_results.begin();
+ for(; iter != list_results.end(); ++iter) {
+ headers.push_back(std::move(iter->second.dir.header));
+ }
+ return 0;
+}
+
+int RGWRados::cls_bucket_head_async(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw::bucket_index_layout_generation& idx_layout, int shard_id, RGWGetDirHeader_CB *ctx, int *num_aio)
+{
+ RGWSI_RADOS::Pool index_pool;
+ map<int, string> bucket_objs;
+ int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, shard_id, idx_layout, &index_pool, &bucket_objs, nullptr);
+ if (r < 0)
+ return r;
+
+ map<int, string>::iterator iter = bucket_objs.begin();
+ for (; iter != bucket_objs.end(); ++iter) {
+ r = cls_rgw_get_dir_header_async(index_pool.ioctx(), iter->second, static_cast<RGWGetDirHeader_CB*>(ctx->get()));
+ if (r < 0) {
+ ctx->put();
+ break;
+ } else {
+ (*num_aio)++;
+ }
+ }
+ return r;
+}
+
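+// Decide whether this bucket needs dynamic resharding, based on the
+// current object count versus rgw_max_objs_per_shard and capped at
+// rgw_max_dynamic_shards; if so, queue the bucket on the reshard log.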
+int RGWRados::check_bucket_shards(const RGWBucketInfo& bucket_info,
+ const rgw_bucket& bucket,
+ uint64_t num_objs,
+ const DoutPrefixProvider *dpp)
+{
+ if (! cct->_conf.get_val<bool>("rgw_dynamic_resharding")) {
+ return 0;
+ }
+
+ bool need_resharding = false;
+ uint32_t num_source_shards = rgw::current_num_shards(bucket_info.layout);
+ const uint32_t max_dynamic_shards =
+ uint32_t(cct->_conf.get_val<uint64_t>("rgw_max_dynamic_shards"));
+
+ if (num_source_shards >= max_dynamic_shards) {
+ return 0;
+ }
+
+ uint32_t suggested_num_shards = 0;
+ const uint64_t max_objs_per_shard =
+ cct->_conf.get_val<uint64_t>("rgw_max_objs_per_shard");
+
+ // TODO: consider per-bucket sync policy here?
+ const bool is_multisite = svc.zone->need_to_log_data();
+
+ quota_handler->check_bucket_shards(dpp, max_objs_per_shard, num_source_shards,
+ num_objs, is_multisite, need_resharding,
+ &suggested_num_shards);
+ if (! need_resharding) {
+ return 0;
+ }
+
+ const uint32_t final_num_shards =
+ RGWBucketReshard::get_preferred_shards(suggested_num_shards,
+ max_dynamic_shards);
+ // final verification, so we don't reduce number of shards
+ if (final_num_shards <= num_source_shards) {
+ return 0;
+ }
+
+ ldpp_dout(dpp, 1) << "RGWRados::" << __func__ << " bucket " << bucket.name <<
+ " needs resharding; current num shards " << bucket_info.layout.current_index.layout.normal.num_shards <<
+ "; new num shards " << final_num_shards << " (suggested " <<
+ suggested_num_shards << ")" << dendl;
+
+ return add_bucket_to_reshard(dpp, bucket_info, final_num_shards);
+}
+
+int RGWRados::add_bucket_to_reshard(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, uint32_t new_num_shards)
+{
+ RGWReshard reshard(this->driver, dpp);
+
+ uint32_t num_source_shards = rgw::current_num_shards(bucket_info.layout);
+
+ new_num_shards = std::min(new_num_shards, get_max_bucket_shards());
+ if (new_num_shards <= num_source_shards) {
+ ldpp_dout(dpp, 20) << "not resharding bucket name=" << bucket_info.bucket.name << ", orig_num=" << num_source_shards << ", new_num_shards=" << new_num_shards << dendl;
+ return 0;
+ }
+
+ cls_rgw_reshard_entry entry;
+ entry.time = real_clock::now();
+ entry.tenant = bucket_info.owner.tenant;
+ entry.bucket_name = bucket_info.bucket.name;
+ entry.bucket_id = bucket_info.bucket.bucket_id;
+ entry.old_num_shards = num_source_shards;
+ entry.new_num_shards = new_num_shards;
+
+ return reshard.add(dpp, entry);
+}
+
+int RGWRados::check_quota(const DoutPrefixProvider *dpp, const rgw_user& bucket_owner, rgw_bucket& bucket,
+ RGWQuota& quota,
+ uint64_t obj_size, optional_yield y,
+ bool check_size_only)
+{
+ // if we only check size, pass num_objs as 0
+ if(check_size_only)
+ return quota_handler->check_quota(dpp, bucket_owner, bucket, quota, 0, obj_size, y);
+
+ return quota_handler->check_quota(dpp, bucket_owner, bucket, quota, 1, obj_size, y);
+}
+
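+// Map an object key to its bucket index shard for the given normal
+// index layout; an unsharded index (num_shards == 0) is reported as
+// shard_id -1, and hash types other than Mod are unsupported.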
+int RGWRados::get_target_shard_id(const rgw::bucket_index_normal_layout& layout, const string& obj_key,
+ int *shard_id)
+{
+ int r = 0;
+ switch (layout.hash_type) {
+ case rgw::BucketHashType::Mod:
+ if (!layout.num_shards) {
+ if (shard_id) {
+ *shard_id = -1;
+ }
+ } else {
+ uint32_t sid = svc.bi_rados->bucket_shard_index(obj_key, layout.num_shards);
+ if (shard_id) {
+ *shard_id = (int)sid;
+ }
+ }
+ break;
+ default:
+ r = -ENOTSUP;
+ }
+ return r;
+}
+
+uint64_t RGWRados::instance_id()
+{
+ return get_rados_handle()->get_instance_id();
+}
+
+uint64_t RGWRados::next_bucket_id()
+{
+ std::lock_guard l{bucket_id_lock};
+ return ++max_bucket_id;
+}
+
+librados::Rados* RGWRados::get_rados_handle()
+{
+ return &rados;
+}
+
+int RGWRados::delete_raw_obj_aio(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, list<librados::AioCompletion *>& handles)
+{
+ rgw_rados_ref ref;
+ int ret = get_raw_obj_ref(dpp, obj, &ref);
+ if (ret < 0) {
+ ldpp_dout(dpp, -1) << "ERROR: failed to get obj ref with ret=" << ret << dendl;
+ return ret;
+ }
+
+ ObjectWriteOperation op;
+ list<string> prefixes;
+ cls_rgw_remove_obj(op, prefixes);
+
+ AioCompletion *c = librados::Rados::aio_create_completion(nullptr, nullptr);
+ ret = ref.pool.ioctx().aio_operate(ref.obj.oid, c, &op);
+ if (ret < 0) {
+ ldpp_dout(dpp, -1) << "ERROR: AioOperate failed with ret=" << ret << dendl;
+ c->release();
+ return ret;
+ }
+
+ handles.push_back(c);
+
+ return 0;
+}
+
+int RGWRados::delete_obj_aio(const DoutPrefixProvider *dpp, const rgw_obj& obj,
+ RGWBucketInfo& bucket_info, RGWObjState *astate,
+ list<librados::AioCompletion *>& handles, bool keep_index_consistent,
+ optional_yield y)
+{
+ rgw_rados_ref ref;
+ int ret = get_obj_head_ref(dpp, bucket_info, obj, &ref);
+ if (ret < 0) {
+ ldpp_dout(dpp, -1) << "ERROR: failed to get obj ref with ret=" << ret << dendl;
+ return ret;
+ }
+
+ if (keep_index_consistent) {
+ RGWRados::Bucket bop(this, bucket_info);
+ RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
+
+ ret = index_op.prepare(dpp, CLS_RGW_OP_DEL, &astate->write_tag, y);
+ if (ret < 0) {
+ ldpp_dout(dpp, -1) << "ERROR: failed to prepare index op with ret=" << ret << dendl;
+ return ret;
+ }
+ }
+
+ ObjectWriteOperation op;
+ list<string> prefixes;
+ cls_rgw_remove_obj(op, prefixes);
+
+ AioCompletion *c = librados::Rados::aio_create_completion(nullptr, nullptr);
+ ret = ref.pool.ioctx().aio_operate(ref.obj.oid, c, &op);
+ if (ret < 0) {
+ ldpp_dout(dpp, -1) << "ERROR: AioOperate failed with ret=" << ret << dendl;
+ c->release();
+ return ret;
+ }
+
+ handles.push_back(c);
+
+ if (keep_index_consistent) {
+ ret = delete_obj_index(obj, astate->mtime, dpp, y);
+ if (ret < 0) {
+ ldpp_dout(dpp, -1) << "ERROR: failed to delete obj index with ret=" << ret << dendl;
+ return ret;
+ }
+ }
+ return ret;
+}
+
+void objexp_hint_entry::generate_test_instances(list<objexp_hint_entry*>& o)
+{
+ auto it = new objexp_hint_entry;
+ it->tenant = "tenant1";
+ it->bucket_name = "bucket1";
+ it->bucket_id = "1234";
+ it->obj_key = rgw_obj_key("obj");
+ o.push_back(it);
+ o.push_back(new objexp_hint_entry);
+}
+
+void objexp_hint_entry::dump(Formatter *f) const
+{
+ f->open_object_section("objexp_hint_entry");
+ encode_json("tenant", tenant, f);
+ encode_json("bucket_name", bucket_name, f);
+ encode_json("bucket_id", bucket_id, f);
+ encode_json("rgw_obj_key", obj_key, f);
+ utime_t ut(exp_time);
+ encode_json("exp_time", ut, f);
+ f->close_section();
+}
+
+void RGWOLHInfo::generate_test_instances(list<RGWOLHInfo*> &o)
+{
+ RGWOLHInfo *olh = new RGWOLHInfo;
+ olh->removed = false;
+ o.push_back(olh);
+ o.push_back(new RGWOLHInfo);
+}
+
+void RGWOLHInfo::dump(Formatter *f) const
+{
+ encode_json("target", target, f);
+}
+
+void RGWOLHPendingInfo::dump(Formatter *f) const
+{
+ utime_t ut(time);
+ encode_json("time", ut, f);
+}
+
diff --git a/src/rgw/driver/rados/rgw_rados.h b/src/rgw/driver/rados/rgw_rados.h
new file mode 100644
index 000000000..75a5e1b54
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_rados.h
@@ -0,0 +1,1661 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include <iostream>
+#include <functional>
+#include <boost/container/flat_map.hpp>
+#include <boost/container/flat_set.hpp>
+
+#include "include/rados/librados.hpp"
+#include "include/Context.h"
+#include "include/random.h"
+#include "common/RefCountedObj.h"
+#include "common/ceph_time.h"
+#include "common/Timer.h"
+#include "rgw_common.h"
+#include "cls/rgw/cls_rgw_types.h"
+#include "cls/version/cls_version_types.h"
+#include "cls/log/cls_log_types.h"
+#include "cls/timeindex/cls_timeindex_types.h"
+#include "cls/otp/cls_otp_types.h"
+#include "rgw_quota.h"
+#include "rgw_log.h"
+#include "rgw_metadata.h"
+#include "rgw_meta_sync_status.h"
+#include "rgw_period_puller.h"
+#include "rgw_obj_manifest.h"
+#include "rgw_sync_module.h"
+#include "rgw_trim_bilog.h"
+#include "rgw_service.h"
+#include "rgw_sal.h"
+#include "rgw_aio.h"
+#include "rgw_d3n_cacherequest.h"
+
+#include "services/svc_rados.h"
+#include "services/svc_bi_rados.h"
+#include "common/Throttle.h"
+#include "common/ceph_mutex.h"
+#include "rgw_cache.h"
+#include "rgw_sal_fwd.h"
+
+struct D3nDataCache;
+
+class RGWWatcher;
+class ACLOwner;
+class RGWGC;
+class RGWMetaNotifier;
+class RGWDataNotifier;
+class RGWLC;
+class RGWObjectExpirer;
+class RGWMetaSyncProcessorThread;
+class RGWDataSyncProcessorThread;
+class RGWSyncLogTrimThread;
+class RGWSyncTraceManager;
+struct RGWZoneGroup;
+struct RGWZoneParams;
+class RGWReshard;
+class RGWReshardWait;
+
+struct get_obj_data;
+
+/* flags for put_obj_meta() */
+#define PUT_OBJ_CREATE 0x01
+#define PUT_OBJ_EXCL 0x02
+#define PUT_OBJ_CREATE_EXCL (PUT_OBJ_CREATE | PUT_OBJ_EXCL)
+
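+// Head-object oids are namespaced by the bucket marker, e.g. (for
+// illustration) marker "mark1" and object oid "foo" produce "mark1_foo";
+// an empty marker or oid leaves the oid unchanged.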
+static inline void prepend_bucket_marker(const rgw_bucket& bucket, const std::string& orig_oid, std::string& oid)
+{
+ if (bucket.marker.empty() || orig_oid.empty()) {
+ oid = orig_oid;
+ } else {
+ oid = bucket.marker;
+ oid.append("_");
+ oid.append(orig_oid);
+ }
+}
+
+static inline void get_obj_bucket_and_oid_loc(const rgw_obj& obj, std::string& oid, std::string& locator)
+{
+ const rgw_bucket& bucket = obj.bucket;
+ prepend_bucket_marker(bucket, obj.get_oid(), oid);
+ const std::string& loc = obj.key.get_loc();
+ if (!loc.empty()) {
+ prepend_bucket_marker(bucket, loc, locator);
+ } else {
+ locator.clear();
+ }
+}
+
+struct RGWOLHInfo {
+ rgw_obj target;
+ bool removed;
+
+ RGWOLHInfo() : removed(false) {}
+
+ void encode(bufferlist& bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(target, bl);
+ encode(removed, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::const_iterator& bl) {
+ DECODE_START(1, bl);
+ decode(target, bl);
+ decode(removed, bl);
+ DECODE_FINISH(bl);
+ }
+ static void generate_test_instances(std::list<RGWOLHInfo*>& o);
+ void dump(Formatter *f) const;
+};
+WRITE_CLASS_ENCODER(RGWOLHInfo)
+
+struct RGWOLHPendingInfo {
+ ceph::real_time time;
+
+ RGWOLHPendingInfo() {}
+
+ void encode(bufferlist& bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(time, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::const_iterator& bl) {
+ DECODE_START(1, bl);
+ decode(time, bl);
+ DECODE_FINISH(bl);
+ }
+
+ void dump(Formatter *f) const;
+};
+WRITE_CLASS_ENCODER(RGWOLHPendingInfo)
+
+struct RGWUsageBatch {
+ std::map<ceph::real_time, rgw_usage_log_entry> m;
+
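+ // aggregate an entry under its timestamp; *account is set to true only
+ // when this timestamp is seen for the first time, so callers can count
+ // distinct records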
+ void insert(ceph::real_time& t, rgw_usage_log_entry& entry, bool *account) {
+ bool exists = m.find(t) != m.end();
+ *account = !exists;
+ m[t].aggregate(entry);
+ }
+};
+
+struct RGWCloneRangeInfo {
+ rgw_obj src;
+ off_t src_ofs;
+ off_t dst_ofs;
+ uint64_t len;
+};
+
+class RGWFetchObjFilter {
+public:
+ virtual ~RGWFetchObjFilter() {}
+
+ virtual int filter(CephContext *cct,
+ const rgw_obj_key& source_key,
+ const RGWBucketInfo& dest_bucket_info,
+ std::optional<rgw_placement_rule> dest_placement_rule,
+ const std::map<std::string, bufferlist>& obj_attrs,
+ std::optional<rgw_user> *poverride_owner,
+ const rgw_placement_rule **prule) = 0;
+};
+
+class RGWFetchObjFilter_Default : public RGWFetchObjFilter {
+protected:
+ rgw_placement_rule dest_rule;
+public:
+ RGWFetchObjFilter_Default() {}
+
+ int filter(CephContext *cct,
+ const rgw_obj_key& source_key,
+ const RGWBucketInfo& dest_bucket_info,
+ std::optional<rgw_placement_rule> dest_placement_rule,
+ const std::map<std::string, bufferlist>& obj_attrs,
+ std::optional<rgw_user> *poverride_owner,
+ const rgw_placement_rule **prule) override;
+};
+
+struct RGWObjStateManifest {
+ RGWObjState state;
+ std::optional<RGWObjManifest> manifest;
+};
+
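+// Per-request cache of object state and manifest keyed by rgw_obj,
+// guarded by a shared mutex; avoids re-reading head-object attributes
+// repeatedly within a single request.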
+class RGWObjectCtx {
+ rgw::sal::Driver* driver;
+ ceph::shared_mutex lock = ceph::make_shared_mutex("RGWObjectCtx");
+
+ std::map<rgw_obj, RGWObjStateManifest> objs_state;
+public:
+ explicit RGWObjectCtx(rgw::sal::Driver* _driver) : driver(_driver) {}
+ RGWObjectCtx(RGWObjectCtx& _o) {
+ std::unique_lock wl{lock};
+ this->driver = _o.driver;
+ this->objs_state = _o.objs_state;
+ }
+
+ rgw::sal::Driver* get_driver() {
+ return driver;
+ }
+
+ RGWObjStateManifest *get_state(const rgw_obj& obj);
+
+ void set_compressed(const rgw_obj& obj);
+ void set_atomic(const rgw_obj& obj);
+ void set_prefetch_data(const rgw_obj& obj);
+ void invalidate(const rgw_obj& obj);
+};
+
+
+struct RGWRawObjState {
+ rgw_raw_obj obj;
+ bool has_attrs{false};
+ bool exists{false};
+ uint64_t size{0};
+ ceph::real_time mtime;
+ uint64_t epoch{0};
+ bufferlist obj_tag;
+ bool has_data{false};
+ bufferlist data;
+ bool prefetch_data{false};
+ uint64_t pg_ver{0};
+
+ /* important! don't forget to update copy constructor */
+
+ RGWObjVersionTracker objv_tracker;
+
+ std::map<std::string, bufferlist> attrset;
+ RGWRawObjState() {}
+ RGWRawObjState(const RGWRawObjState& rhs) : obj (rhs.obj) {
+ has_attrs = rhs.has_attrs;
+ exists = rhs.exists;
+ size = rhs.size;
+ mtime = rhs.mtime;
+ epoch = rhs.epoch;
+ if (rhs.obj_tag.length()) {
+ obj_tag = rhs.obj_tag;
+ }
+ has_data = rhs.has_data;
+ if (rhs.data.length()) {
+ data = rhs.data;
+ }
+ prefetch_data = rhs.prefetch_data;
+ pg_ver = rhs.pg_ver;
+ objv_tracker = rhs.objv_tracker;
+ }
+};
+
+struct RGWPoolIterCtx {
+ librados::IoCtx io_ctx;
+ librados::NObjectIterator iter;
+};
+
+struct RGWListRawObjsCtx {
+ bool initialized;
+ RGWPoolIterCtx iter_ctx;
+
+ RGWListRawObjsCtx() : initialized(false) {}
+};
+
+struct objexp_hint_entry {
+ std::string tenant;
+ std::string bucket_name;
+ std::string bucket_id;
+ rgw_obj_key obj_key;
+ ceph::real_time exp_time;
+
+ void encode(bufferlist& bl) const {
+ ENCODE_START(2, 1, bl);
+ encode(bucket_name, bl);
+ encode(bucket_id, bl);
+ encode(obj_key, bl);
+ encode(exp_time, bl);
+ encode(tenant, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::const_iterator& bl) {
+ // XXX Do we want DECODE_START_LEGACY_COMPAT_LEN(2, 1, 1, bl); ?
+ DECODE_START(2, bl);
+ decode(bucket_name, bl);
+ decode(bucket_id, bl);
+ decode(obj_key, bl);
+ decode(exp_time, bl);
+ if (struct_v >= 2) {
+ decode(tenant, bl);
+ } else {
+ tenant.clear();
+ }
+ DECODE_FINISH(bl);
+ }
+
+ void dump(Formatter *f) const;
+ static void generate_test_instances(std::list<objexp_hint_entry*>& o);
+};
+WRITE_CLASS_ENCODER(objexp_hint_entry)
+
+class RGWMetaSyncStatusManager;
+class RGWDataSyncStatusManager;
+class RGWCoroutinesManagerRegistry;
+
+class RGWGetDirHeader_CB;
+class RGWGetUserHeader_CB;
+namespace rgw { namespace sal {
+ class RadosStore;
+ class MPRadosSerializer;
+ class LCRadosSerializer;
+} }
+
+class RGWAsyncRadosProcessor;
+
+template <class T>
+class RGWChainedCacheImpl;
+
+struct bucket_info_entry {
+ RGWBucketInfo info;
+ real_time mtime;
+ std::map<std::string, bufferlist> attrs;
+};
+
+struct tombstone_entry;
+
+template <class K, class V>
+class lru_map;
+using tombstone_cache_t = lru_map<rgw_obj, tombstone_entry>;
+
+class RGWIndexCompletionManager;
+
+class RGWRados
+{
+ friend class RGWGC;
+ friend class RGWMetaNotifier;
+ friend class RGWDataNotifier;
+ friend class RGWObjectExpirer;
+ friend class RGWMetaSyncProcessorThread;
+ friend class RGWDataSyncProcessorThread;
+ friend class RGWReshard;
+ friend class RGWBucketReshard;
+ friend class RGWBucketReshardLock;
+ friend class BucketIndexLockGuard;
+ friend class rgw::sal::MPRadosSerializer;
+ friend class rgw::sal::LCRadosSerializer;
+ friend class rgw::sal::RadosStore;
+
+ /** Open the pool used as root for this gateway */
+ int open_root_pool_ctx(const DoutPrefixProvider *dpp);
+ int open_gc_pool_ctx(const DoutPrefixProvider *dpp);
+ int open_lc_pool_ctx(const DoutPrefixProvider *dpp);
+ int open_objexp_pool_ctx(const DoutPrefixProvider *dpp);
+ int open_reshard_pool_ctx(const DoutPrefixProvider *dpp);
+ int open_notif_pool_ctx(const DoutPrefixProvider *dpp);
+
+ int open_pool_ctx(const DoutPrefixProvider *dpp, const rgw_pool& pool, librados::IoCtx& io_ctx,
+ bool mostly_omap, bool bulk);
+
+
+ ceph::mutex lock = ceph::make_mutex("rados_timer_lock");
+ SafeTimer *timer;
+
+ rgw::sal::RadosStore* driver = nullptr;
+ RGWGC *gc = nullptr;
+ RGWLC *lc;
+ RGWObjectExpirer *obj_expirer;
+ bool use_gc_thread;
+ bool use_lc_thread;
+ bool quota_threads;
+ bool run_sync_thread;
+ bool run_reshard_thread;
+
+ RGWMetaNotifier *meta_notifier;
+ RGWDataNotifier *data_notifier;
+ RGWMetaSyncProcessorThread *meta_sync_processor_thread;
+ RGWSyncTraceManager *sync_tracer = nullptr;
+ std::map<rgw_zone_id, RGWDataSyncProcessorThread *> data_sync_processor_threads;
+
+ boost::optional<rgw::BucketTrimManager> bucket_trim;
+ RGWSyncLogTrimThread *sync_log_trimmer{nullptr};
+
+ ceph::mutex meta_sync_thread_lock = ceph::make_mutex("meta_sync_thread_lock");
+ ceph::mutex data_sync_thread_lock = ceph::make_mutex("data_sync_thread_lock");
+
+ librados::IoCtx root_pool_ctx; // .rgw
+
+ ceph::mutex bucket_id_lock{ceph::make_mutex("rados_bucket_id")};
+
+ // This field represents the number of bucket index object shards
+ uint32_t bucket_index_max_shards;
+
+ std::string get_cluster_fsid(const DoutPrefixProvider *dpp, optional_yield y);
+
+ int get_obj_head_ref(const DoutPrefixProvider *dpp, const rgw_placement_rule& target_placement_rule, const rgw_obj& obj, rgw_rados_ref *ref);
+ int get_obj_head_ref(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, rgw_rados_ref *ref);
+ int get_system_obj_ref(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, rgw_rados_ref *ref);
+ uint64_t max_bucket_id;
+
+ int clear_olh(const DoutPrefixProvider *dpp,
+ RGWObjectCtx& obj_ctx,
+ const rgw_obj& obj,
+ RGWBucketInfo& bucket_info,
+ rgw_rados_ref& ref,
+ const std::string& tag,
+ const uint64_t ver,
+ optional_yield y);
+
+ int get_olh_target_state(const DoutPrefixProvider *dpp, RGWObjectCtx& rctx,
+ RGWBucketInfo& bucket_info, const rgw_obj& obj,
+ RGWObjState *olh_state, RGWObjState **target_state,
+ RGWObjManifest **target_manifest, optional_yield y);
+ int get_obj_state_impl(const DoutPrefixProvider *dpp, RGWObjectCtx *rctx, RGWBucketInfo& bucket_info, const rgw_obj& obj, RGWObjState **state, RGWObjManifest** manifest,
+ bool follow_olh, optional_yield y, bool assume_noent = false);
+ int append_atomic_test(const DoutPrefixProvider *dpp, RGWObjectCtx* rctx, RGWBucketInfo& bucket_info, const rgw_obj& obj,
+ librados::ObjectOperation& op, RGWObjState **state,
+ RGWObjManifest** pmanifest, optional_yield y);
+
+ int update_placement_map();
+ int store_bucket_info(RGWBucketInfo& info, std::map<std::string, bufferlist> *pattrs, RGWObjVersionTracker *objv_tracker, bool exclusive);
+
+ void remove_rgw_head_obj(librados::ObjectWriteOperation& op);
+ void cls_obj_check_prefix_exist(librados::ObjectOperation& op, const std::string& prefix, bool fail_if_exist);
+ void cls_obj_check_mtime(librados::ObjectOperation& op, const real_time& mtime, bool high_precision_time, RGWCheckMTimeType type);
+protected:
+ CephContext *cct;
+
+ librados::Rados rados;
+
+ using RGWChainedCacheImpl_bucket_info_entry = RGWChainedCacheImpl<bucket_info_entry>;
+ RGWChainedCacheImpl_bucket_info_entry *binfo_cache;
+
+ tombstone_cache_t *obj_tombstone_cache;
+
+ librados::IoCtx gc_pool_ctx; // .rgw.gc
+ librados::IoCtx lc_pool_ctx; // .rgw.lc
+ librados::IoCtx objexp_pool_ctx;
+ librados::IoCtx reshard_pool_ctx;
+ librados::IoCtx notif_pool_ctx; // .rgw.notif
+
+ bool pools_initialized;
+
+ RGWQuotaHandler *quota_handler;
+
+ RGWCoroutinesManagerRegistry *cr_registry;
+
+ RGWSyncModuleInstanceRef sync_module;
+ bool writeable_zone{false};
+
+ RGWIndexCompletionManager *index_completion_manager{nullptr};
+
+ bool use_cache{false};
+ bool use_gc{true};
+ bool use_datacache{false};
+
+ int get_obj_head_ioctx(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, librados::IoCtx *ioctx);
+public:
+ RGWRados(): timer(NULL),
+ gc(NULL), lc(NULL), obj_expirer(NULL), use_gc_thread(false), use_lc_thread(false), quota_threads(false),
+ run_sync_thread(false), run_reshard_thread(false), meta_notifier(NULL),
+ data_notifier(NULL), meta_sync_processor_thread(NULL),
+ bucket_index_max_shards(0),
+ max_bucket_id(0), cct(NULL),
+ binfo_cache(NULL), obj_tombstone_cache(nullptr),
+ pools_initialized(false),
+ quota_handler(NULL),
+ cr_registry(NULL),
+ pctl(&ctl),
+ reshard(NULL) {}
+
+ RGWRados& set_use_cache(bool status) {
+ use_cache = status;
+ return *this;
+ }
+
+ RGWRados& set_use_gc(bool status) {
+ use_gc = status;
+ return *this;
+ }
+
+ RGWRados& set_use_datacache(bool status) {
+ use_datacache = status;
+ return *this;
+ }
+
+ bool get_use_datacache() {
+ return use_datacache;
+ }
+
+ RGWLC *get_lc() {
+ return lc;
+ }
+
+ RGWGC *get_gc() {
+ return gc;
+ }
+
+ RGWRados& set_run_gc_thread(bool _use_gc_thread) {
+ use_gc_thread = _use_gc_thread;
+ return *this;
+ }
+
+ RGWRados& set_run_lc_thread(bool _use_lc_thread) {
+ use_lc_thread = _use_lc_thread;
+ return *this;
+ }
+
+ RGWRados& set_run_quota_threads(bool _run_quota_threads) {
+ quota_threads = _run_quota_threads;
+ return *this;
+ }
+
+ RGWRados& set_run_sync_thread(bool _run_sync_thread) {
+ run_sync_thread = _run_sync_thread;
+ return *this;
+ }
+
+ RGWRados& set_run_reshard_thread(bool _run_reshard_thread) {
+ run_reshard_thread = _run_reshard_thread;
+ return *this;
+ }
+
+ librados::IoCtx* get_lc_pool_ctx() {
+ return &lc_pool_ctx;
+ }
+
+ librados::IoCtx& get_notif_pool_ctx() {
+ return notif_pool_ctx;
+ }
+
+ void set_context(CephContext *_cct) {
+ cct = _cct;
+ }
+ void set_store(rgw::sal::RadosStore* _driver) {
+ driver = _driver;
+ }
+
+ RGWServices svc;
+ RGWCtl ctl;
+
+ RGWCtl *pctl{nullptr};
+
+ /**
+ * AmazonS3 errors contain a HostId string, but it is an opaque base64 blob; we
+ * try to be more transparent. This has a wrapper so we can update it when zonegroup/zone are changed.
+ */
+ std::string host_id;
+
+ RGWReshard *reshard;
+ std::shared_ptr<RGWReshardWait> reshard_wait;
+
+ virtual ~RGWRados() = default;
+
+ tombstone_cache_t *get_tombstone_cache() {
+ return obj_tombstone_cache;
+ }
+ const RGWSyncModuleInstanceRef& get_sync_module() {
+ return sync_module;
+ }
+ RGWSyncTraceManager *get_sync_tracer() {
+ return sync_tracer;
+ }
+
+ int get_required_alignment(const DoutPrefixProvider *dpp, const rgw_pool& pool, uint64_t *alignment);
+ void get_max_aligned_size(uint64_t size, uint64_t alignment, uint64_t *max_size);
+ int get_max_chunk_size(const rgw_pool& pool, uint64_t *max_chunk_size, const DoutPrefixProvider *dpp, uint64_t *palignment = nullptr);
+ int get_max_chunk_size(const rgw_placement_rule& placement_rule, const rgw_obj& obj, uint64_t *max_chunk_size, const DoutPrefixProvider *dpp, uint64_t *palignment = nullptr);
+
+ uint32_t get_max_bucket_shards() {
+ return RGWSI_BucketIndex_RADOS::shards_max();
+ }
+
+
+ int get_raw_obj_ref(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, rgw_rados_ref *ref);
+
+ int list_raw_objects_init(const DoutPrefixProvider *dpp, const rgw_pool& pool, const std::string& marker, RGWListRawObjsCtx *ctx);
+ int list_raw_objects_next(const DoutPrefixProvider *dpp, const std::string& prefix_filter, int max,
+ RGWListRawObjsCtx& ctx, std::list<std::string>& oids,
+ bool *is_truncated);
+ int list_raw_objects(const DoutPrefixProvider *dpp, const rgw_pool& pool, const std::string& prefix_filter, int max,
+ RGWListRawObjsCtx& ctx, std::list<std::string>& oids,
+ bool *is_truncated);
+ std::string list_raw_objs_get_cursor(RGWListRawObjsCtx& ctx);
+
+ CephContext *ctx() { return cct; }
+ /** do all necessary setup of the storage device */
+ int init_begin(CephContext *_cct, const DoutPrefixProvider *dpp) {
+ set_context(_cct);
+ return init_begin(dpp);
+ }
+ /** Initialize the RADOS instance and prepare to do other ops */
+ int init_svc(bool raw, const DoutPrefixProvider *dpp);
+ int init_ctl(const DoutPrefixProvider *dpp);
+ virtual int init_rados();
+ int init_begin(const DoutPrefixProvider *dpp);
+ int init_complete(const DoutPrefixProvider *dpp);
+ void finalize();
+
+ int register_to_service_map(const DoutPrefixProvider *dpp, const std::string& daemon_type, const std::map<std::string, std::string>& meta);
+ int update_service_map(const DoutPrefixProvider *dpp, std::map<std::string, std::string>&& status);
+
+ /// list logs
+ int log_list_init(const DoutPrefixProvider *dpp, const std::string& prefix, RGWAccessHandle *handle);
+ int log_list_next(RGWAccessHandle handle, std::string *name);
+
+ /// remove log
+ int log_remove(const DoutPrefixProvider *dpp, const std::string& name);
+
+ /// show log
+ int log_show_init(const DoutPrefixProvider *dpp, const std::string& name, RGWAccessHandle *handle);
+ int log_show_next(const DoutPrefixProvider *dpp, RGWAccessHandle handle, rgw_log_entry *entry);
+
+ // log bandwidth info
+ int log_usage(const DoutPrefixProvider *dpp, std::map<rgw_user_bucket, RGWUsageBatch>& usage_info);
+ int read_usage(const DoutPrefixProvider *dpp, const rgw_user& user, const std::string& bucket_name, uint64_t start_epoch, uint64_t end_epoch,
+ uint32_t max_entries, bool *is_truncated, RGWUsageIter& read_iter, std::map<rgw_user_bucket,
+ rgw_usage_log_entry>& usage);
+ int trim_usage(const DoutPrefixProvider *dpp, const rgw_user& user, const std::string& bucket_name, uint64_t start_epoch, uint64_t end_epoch);
+ int clear_usage(const DoutPrefixProvider *dpp);
+
+ int create_pool(const DoutPrefixProvider *dpp, const rgw_pool& pool);
+
+ void create_bucket_id(std::string *bucket_id);
+
+ bool get_obj_data_pool(const rgw_placement_rule& placement_rule, const rgw_obj& obj, rgw_pool *pool);
+ bool obj_to_raw(const rgw_placement_rule& placement_rule, const rgw_obj& obj, rgw_raw_obj *raw_obj);
+
+ int create_bucket(const RGWUserInfo& owner, rgw_bucket& bucket,
+ const std::string& zonegroup_id,
+ const rgw_placement_rule& placement_rule,
+ const std::string& swift_ver_location,
+ const RGWQuotaInfo * pquota_info,
+ std::map<std::string,bufferlist>& attrs,
+ RGWBucketInfo& bucket_info,
+ obj_version *pobjv,
+ obj_version *pep_objv,
+ ceph::real_time creation_time,
+ rgw_bucket *master_bucket,
+ uint32_t *master_num_shards,
+ optional_yield y,
+ const DoutPrefixProvider *dpp,
+ bool exclusive = true);
+
+ RGWCoroutinesManagerRegistry *get_cr_registry() { return cr_registry; }
+
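+ // Handle to a single bucket index shard: init() resolves a bucket and
+ // object key (or an explicit shard id) to the RADOS object holding
+ // that shard of the index.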
+ struct BucketShard {
+ RGWRados *store;
+ rgw_bucket bucket;
+ int shard_id;
+ RGWSI_RADOS::Obj bucket_obj;
+
+ explicit BucketShard(RGWRados *_store) : store(_store), shard_id(-1) {}
+ int init(const rgw_bucket& _bucket, const rgw_obj& obj,
+ RGWBucketInfo* out, const DoutPrefixProvider *dpp);
+ int init(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj);
+ int init(const DoutPrefixProvider *dpp,
+ const RGWBucketInfo& bucket_info,
+ const rgw::bucket_index_layout_generation& index, int sid);
+
+ friend std::ostream& operator<<(std::ostream& out, const BucketShard& bs) {
+ out << "BucketShard:{ bucket=" << bs.bucket <<
+ ", shard_id=" << bs.shard_id <<
+ ", bucket_ojb=" << bs.bucket_obj << "}";
+ return out;
+ }
+ };
+
+ class Object {
+ RGWRados *store;
+ RGWBucketInfo bucket_info;
+ RGWObjectCtx& ctx;
+ rgw_obj obj;
+
+ BucketShard bs;
+
+ RGWObjState *state;
+ RGWObjManifest *manifest;
+
+ bool versioning_disabled;
+
+ bool bs_initialized;
+
+ const rgw_placement_rule *pmeta_placement_rule;
+
+ protected:
+ int get_state(const DoutPrefixProvider *dpp, RGWObjState **pstate, RGWObjManifest **pmanifest, bool follow_olh, optional_yield y, bool assume_noent = false);
+ void invalidate_state();
+
+ int prepare_atomic_modification(const DoutPrefixProvider *dpp, librados::ObjectWriteOperation& op, bool reset_obj, const std::string *ptag,
+ const char *ifmatch, const char *ifnomatch, bool removal_op, bool modify_tail, optional_yield y);
+ int complete_atomic_modification(const DoutPrefixProvider *dpp);
+
+ public:
+ Object(RGWRados *_store, const RGWBucketInfo& _bucket_info, RGWObjectCtx& _ctx, const rgw_obj& _obj) : store(_store), bucket_info(_bucket_info),
+ ctx(_ctx), obj(_obj), bs(store),
+ state(NULL), manifest(nullptr), versioning_disabled(false),
+ bs_initialized(false),
+ pmeta_placement_rule(nullptr) {}
+
+ RGWRados *get_store() { return store; }
+ rgw_obj& get_obj() { return obj; }
+ RGWObjectCtx& get_ctx() { return ctx; }
+ RGWBucketInfo& get_bucket_info() { return bucket_info; }
+ //const std::string& get_instance() { return obj->get_instance(); }
+ //rgw::sal::Object* get_target() { return obj; }
+ int get_manifest(const DoutPrefixProvider *dpp, RGWObjManifest **pmanifest, optional_yield y);
+
+ int get_bucket_shard(BucketShard **pbs, const DoutPrefixProvider *dpp) {
+ if (!bs_initialized) {
+ int r =
+ bs.init(bucket_info.bucket, obj, nullptr /* no RGWBucketInfo */, dpp);
+ if (r < 0) {
+ return r;
+ }
+ bs_initialized = true;
+ }
+ *pbs = &bs;
+ return 0;
+ }
+
+ void set_versioning_disabled(bool status) {
+ versioning_disabled = status;
+ }
+
+ bool versioning_enabled() {
+ return (!versioning_disabled && bucket_info.versioning_enabled());
+ }
+
+ void set_meta_placement_rule(const rgw_placement_rule *p) {
+ pmeta_placement_rule = p;
+ }
+
+ const rgw_placement_rule& get_meta_placement_rule() {
+ return pmeta_placement_rule ? *pmeta_placement_rule : bucket_info.placement_rule;
+ }
+
+ struct Read {
+ RGWRados::Object *source;
+
+ struct GetObjState {
+ std::map<rgw_pool, librados::IoCtx> io_ctxs;
+ rgw_pool cur_pool;
+ librados::IoCtx *cur_ioctx{nullptr};
+ rgw_obj obj;
+ rgw_raw_obj head_obj;
+ } state;
+
+ struct ConditionParams {
+ const ceph::real_time *mod_ptr;
+ const ceph::real_time *unmod_ptr;
+ bool high_precision_time;
+ uint32_t mod_zone_id;
+ uint64_t mod_pg_ver;
+ const char *if_match;
+ const char *if_nomatch;
+
+ ConditionParams() :
+ mod_ptr(NULL), unmod_ptr(NULL), high_precision_time(false), mod_zone_id(0), mod_pg_ver(0),
+ if_match(NULL), if_nomatch(NULL) {}
+ } conds;
+
+ struct Params {
+ ceph::real_time *lastmod;
+ uint64_t *obj_size;
+ std::map<std::string, bufferlist> *attrs;
+ rgw_obj *target_obj;
+
+ Params() : lastmod(nullptr), obj_size(nullptr), attrs(nullptr),
+ target_obj(nullptr) {}
+ } params;
+
+ explicit Read(RGWRados::Object *_source) : source(_source) {}
+
+ int prepare(optional_yield y, const DoutPrefixProvider *dpp);
+ static int range_to_ofs(uint64_t obj_size, int64_t &ofs, int64_t &end);
+ int read(int64_t ofs, int64_t end, bufferlist& bl, optional_yield y, const DoutPrefixProvider *dpp);
+ int iterate(const DoutPrefixProvider *dpp, int64_t ofs, int64_t end, RGWGetDataCB *cb, optional_yield y);
+ int get_attr(const DoutPrefixProvider *dpp, const char *name, bufferlist& dest, optional_yield y);
+ };
+
+ struct Write {
+ RGWRados::Object *target;
+
+ struct MetaParams {
+ ceph::real_time *mtime;
+ std::map<std::string, bufferlist>* rmattrs;
+ const bufferlist *data;
+ RGWObjManifest *manifest;
+ const std::string *ptag;
+ std::list<rgw_obj_index_key> *remove_objs;
+ ceph::real_time set_mtime;
+ rgw_user owner;
+ RGWObjCategory category;
+ int flags;
+ const char *if_match;
+ const char *if_nomatch;
+ std::optional<uint64_t> olh_epoch;
+ ceph::real_time delete_at;
+ bool canceled;
+ const std::string *user_data;
+ rgw_zone_set *zones_trace;
+ bool modify_tail;
+ bool completeMultipart;
+ bool appendable;
+
+ MetaParams() : mtime(NULL), rmattrs(NULL), data(NULL), manifest(NULL), ptag(NULL),
+ remove_objs(NULL), category(RGWObjCategory::Main), flags(0),
+ if_match(NULL), if_nomatch(NULL), canceled(false), user_data(nullptr), zones_trace(nullptr),
+ modify_tail(false), completeMultipart(false), appendable(false) {}
+ } meta;
+
+ explicit Write(RGWRados::Object *_target) : target(_target) {}
+
+ int _do_write_meta(const DoutPrefixProvider *dpp,
+ uint64_t size, uint64_t accounted_size,
+ std::map<std::string, bufferlist>& attrs,
+ bool modify_tail, bool assume_noent,
+ void *index_op, optional_yield y);
+ int write_meta(const DoutPrefixProvider *dpp, uint64_t size, uint64_t accounted_size,
+ std::map<std::string, bufferlist>& attrs, optional_yield y);
+ int write_data(const char *data, uint64_t ofs, uint64_t len, bool exclusive);
+ const req_state* get_req_state() {
+ return nullptr; /* XXX dang Only used by LTTng, and it handles null anyway */
+ }
+ };
+
+ struct Delete {
+ RGWRados::Object *target;
+
+ struct DeleteParams {
+ rgw_user bucket_owner;
+ int versioning_status; // versioning flags defined in enum RGWBucketFlags
+ ACLOwner obj_owner; // needed for creation of deletion marker
+ uint64_t olh_epoch;
+ std::string marker_version_id;
+ uint32_t bilog_flags;
+ std::list<rgw_obj_index_key> *remove_objs;
+ ceph::real_time expiration_time;
+ ceph::real_time unmod_since;
+ ceph::real_time mtime; /* for setting delete marker mtime */
+ bool high_precision_time;
+ rgw_zone_set *zones_trace;
+ bool abortmp;
+ uint64_t parts_accounted_size;
+
+ DeleteParams() : versioning_status(0), olh_epoch(0), bilog_flags(0), remove_objs(NULL), high_precision_time(false), zones_trace(nullptr), abortmp(false), parts_accounted_size(0) {}
+ } params;
+
+ struct DeleteResult {
+ bool delete_marker;
+ std::string version_id;
+
+ DeleteResult() : delete_marker(false) {}
+ } result;
+
+ explicit Delete(RGWRados::Object *_target) : target(_target) {}
+
+ int delete_obj(optional_yield y, const DoutPrefixProvider *dpp);
+ };
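+
+ /* Editor's note: a minimal usage sketch for Delete (illustrative only),
+  * assuming an existing RGWRados::Object `op_target` and loaded
+  * `bucket_info`:
+  *
+  *   RGWRados::Object::Delete del_op(&op_target);
+  *   del_op.params.bucket_owner = bucket_info.owner;
+  *   del_op.params.versioning_status = bucket_info.versioning_status();
+  *   int r = del_op.delete_obj(y, dpp);
+  *   // on success, del_op.result says whether a delete marker was created
+  */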
+
+ struct Stat {
+ RGWRados::Object *source;
+
+ struct Result {
+ rgw_obj obj;
+ std::optional<RGWObjManifest> manifest;
+ uint64_t size{0};
+ struct timespec mtime {};
+ std::map<std::string, bufferlist> attrs;
+ } result;
+
+ struct State {
+ librados::IoCtx io_ctx;
+ librados::AioCompletion *completion;
+ int ret;
+
+ State() : completion(NULL), ret(0) {}
+ } state;
+
+
+ explicit Stat(RGWRados::Object *_source) : source(_source) {}
+
+ int stat_async(const DoutPrefixProvider *dpp);
+ int wait(const DoutPrefixProvider *dpp);
+ int stat();
+ private:
+ int finish(const DoutPrefixProvider *dpp);
+ };
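+
+ /* Editor's note: a minimal usage sketch for Stat (illustrative only),
+  * assuming an existing RGWRados::Object `op_target`:
+  *
+  *   RGWRados::Object::Stat stat_op(&op_target);
+  *   int r = stat_op.stat_async(dpp);
+  *   if (r >= 0) {
+  *     r = stat_op.wait(dpp);   // fills stat_op.result (size, mtime, attrs)
+  *   }
+  */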
+ };
+
+ class Bucket {
+ RGWRados *store;
+ RGWBucketInfo bucket_info;
+ rgw_bucket& bucket;
+ int shard_id;
+
+ public:
+ Bucket(RGWRados *_store, const RGWBucketInfo& _bucket_info) : store(_store), bucket_info(_bucket_info), bucket(bucket_info.bucket),
+ shard_id(RGW_NO_SHARD) {}
+ RGWRados *get_store() { return store; }
+ rgw_bucket& get_bucket() { return bucket; }
+ RGWBucketInfo& get_bucket_info() { return bucket_info; }
+
+ int update_bucket_id(const std::string& new_bucket_id, const DoutPrefixProvider *dpp);
+
+ int get_shard_id() { return shard_id; }
+ void set_shard_id(int id) {
+ shard_id = id;
+ }
+
+ class UpdateIndex {
+ RGWRados::Bucket *target;
+ std::string optag;
+ rgw_obj obj;
+ uint16_t bilog_flags{0};
+ BucketShard bs;
+ bool bs_initialized{false};
+ bool blind;
+ bool prepared{false};
+ rgw_zone_set *zones_trace{nullptr};
+
+ int init_bs(const DoutPrefixProvider *dpp) {
+ int r =
+ bs.init(target->get_bucket(), obj, &target->bucket_info, dpp);
+ if (r < 0) {
+ return r;
+ }
+ bs_initialized = true;
+ return 0;
+ }
+
+ void invalidate_bs() {
+ bs_initialized = false;
+ }
+
+ int guard_reshard(const DoutPrefixProvider *dpp, const rgw_obj& obj_instance, BucketShard **pbs, std::function<int(BucketShard *)> call);
+ public:
+
+ UpdateIndex(RGWRados::Bucket *_target, const rgw_obj& _obj) : target(_target), obj(_obj),
+ bs(target->get_store()) {
+ blind = (target->get_bucket_info().layout.current_index.layout.type == rgw::BucketIndexType::Indexless);
+ }
+
+ int get_bucket_shard(BucketShard **pbs, const DoutPrefixProvider *dpp) {
+ if (!bs_initialized) {
+ int r = init_bs(dpp);
+ if (r < 0) {
+ return r;
+ }
+ }
+ *pbs = &bs;
+ return 0;
+ }
+
+ void set_bilog_flags(uint16_t flags) {
+ bilog_flags = flags;
+ }
+
+ void set_zones_trace(rgw_zone_set *_zones_trace) {
+ zones_trace = _zones_trace;
+ }
+
+ int prepare(const DoutPrefixProvider *dpp, RGWModifyOp, const std::string *write_tag, optional_yield y);
+ int complete(const DoutPrefixProvider *dpp, int64_t poolid, uint64_t epoch, uint64_t size,
+ uint64_t accounted_size, ceph::real_time& ut,
+ const std::string& etag, const std::string& content_type,
+ const std::string& storage_class,
+ bufferlist *acl_bl, RGWObjCategory category,
+ std::list<rgw_obj_index_key> *remove_objs,
+ optional_yield y,
+ const std::string *user_data = nullptr,
+ bool appendable = false);
+ int complete_del(const DoutPrefixProvider *dpp,
+ int64_t poolid, uint64_t epoch,
+ ceph::real_time& removed_mtime, /* mtime of removed object */
+ std::list<rgw_obj_index_key> *remove_objs,
+ optional_yield y);
+ int cancel(const DoutPrefixProvider *dpp,
+ std::list<rgw_obj_index_key> *remove_objs,
+ optional_yield y);
+
+ const std::string *get_optag() { return &optag; }
+
+ bool is_prepared() { return prepared; }
+ }; // class UpdateIndex
+
+ class List {
+ protected:
+ // absolute maximum number of objects that
+ // list_objects_(un)ordered can return
+ static constexpr int64_t bucket_list_objects_absolute_max = 25000;
+
+ RGWRados::Bucket *target;
+ rgw_obj_key next_marker;
+
+ int list_objects_ordered(const DoutPrefixProvider *dpp,
+ int64_t max,
+ std::vector<rgw_bucket_dir_entry> *result,
+ std::map<std::string, bool> *common_prefixes,
+ bool *is_truncated,
+ optional_yield y);
+ int list_objects_unordered(const DoutPrefixProvider *dpp,
+ int64_t max,
+ std::vector<rgw_bucket_dir_entry> *result,
+ std::map<std::string, bool> *common_prefixes,
+ bool *is_truncated,
+ optional_yield y);
+
+ public:
+
+ struct Params {
+ std::string prefix;
+ std::string delim;
+ rgw_obj_key marker;
+ rgw_obj_key end_marker;
+ std::string ns;
+ bool enforce_ns;
+ RGWAccessListFilter* access_list_filter;
+ RGWBucketListNameFilter force_check_filter;
+ bool list_versions;
+ bool allow_unordered;
+
+ Params() :
+ enforce_ns(true),
+ access_list_filter(nullptr),
+ list_versions(false),
+ allow_unordered(false)
+ {}
+ } params;
+
+ explicit List(RGWRados::Bucket *_target) : target(_target) {}
+
+ int list_objects(const DoutPrefixProvider *dpp, int64_t max,
+ std::vector<rgw_bucket_dir_entry> *result,
+ std::map<std::string, bool> *common_prefixes,
+ bool *is_truncated,
+ optional_yield y) {
+ if (params.allow_unordered) {
+ return list_objects_unordered(dpp, max, result, common_prefixes,
+ is_truncated, y);
+ } else {
+ return list_objects_ordered(dpp, max, result, common_prefixes,
+ is_truncated, y);
+ }
+ }
+ rgw_obj_key& get_next_marker() {
+ return next_marker;
+ }
+ }; // class List
+ }; // class Bucket
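+
+ /* Editor's note: a minimal usage sketch for Bucket::List (illustrative
+  * only), assuming `store` is this RGWRados instance and `bucket_info`
+  * has already been loaded:
+  *
+  *   RGWRados::Bucket bucket_op(store, bucket_info);
+  *   RGWRados::Bucket::List list_op(&bucket_op);
+  *   list_op.params.prefix = "photos/";
+  *   list_op.params.delim = "/";
+  *   std::vector<rgw_bucket_dir_entry> results;
+  *   std::map<std::string, bool> common_prefixes;
+  *   bool truncated = false;
+  *   int r = list_op.list_objects(dpp, 1000, &results, &common_prefixes,
+  *                                &truncated, y);
+  */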
+
+ int on_last_entry_in_listing(const DoutPrefixProvider *dpp,
+ RGWBucketInfo& bucket_info,
+ const std::string& obj_prefix,
+ const std::string& obj_delim,
+ std::function<int(const rgw_bucket_dir_entry&)> handler);
+
+ bool swift_versioning_enabled(const RGWBucketInfo& bucket_info) const;
+
+ int swift_versioning_copy(RGWObjectCtx& obj_ctx, /* in/out */
+ const rgw_user& user, /* in */
+ RGWBucketInfo& bucket_info, /* in */
+ const rgw_obj& obj, /* in */
+ const DoutPrefixProvider *dpp, /* in */
+ optional_yield y); /* in */
+ int swift_versioning_restore(RGWObjectCtx& obj_ctx, /* in/out */
+ const rgw_user& user, /* in */
+ RGWBucketInfo& bucket_info, /* in */
+ rgw_obj& obj, /* in/out */
+ bool& restored, /* out */
+ const DoutPrefixProvider *dpp); /* in */
+ int copy_obj_to_remote_dest(const DoutPrefixProvider *dpp,
+ RGWObjState *astate,
+ std::map<std::string, bufferlist>& src_attrs,
+ RGWRados::Object::Read& read_op,
+ const rgw_user& user_id,
+ const rgw_obj& dest_obj,
+ ceph::real_time *mtime);
+
+ enum AttrsMod {
+ ATTRSMOD_NONE = 0,
+ ATTRSMOD_REPLACE = 1,
+ ATTRSMOD_MERGE = 2
+ };
+
+ D3nDataCache* d3n_data_cache{nullptr};
+
+ int rewrite_obj(RGWBucketInfo& dest_bucket_info, const rgw_obj& obj, const DoutPrefixProvider *dpp, optional_yield y);
+ int reindex_obj(const RGWBucketInfo& dest_bucket_info,
+ const rgw_obj& obj,
+ const DoutPrefixProvider* dpp,
+ optional_yield y);
+
+ int stat_remote_obj(const DoutPrefixProvider *dpp,
+ RGWObjectCtx& obj_ctx,
+ const rgw_user& user_id,
+ req_info *info,
+ const rgw_zone_id& source_zone,
+ const rgw_obj& src_obj,
+ const RGWBucketInfo *src_bucket_info,
+ real_time *src_mtime,
+ uint64_t *psize,
+ const real_time *mod_ptr,
+ const real_time *unmod_ptr,
+ bool high_precision_time,
+ const char *if_match,
+ const char *if_nomatch,
+ std::map<std::string, bufferlist> *pattrs,
+ std::map<std::string, std::string> *pheaders,
+ std::string *version_id,
+ std::string *ptag,
+ std::string *petag);
+
+ int fetch_remote_obj(RGWObjectCtx& obj_ctx,
+ const rgw_user& user_id,
+ req_info *info,
+ const rgw_zone_id& source_zone,
+ const rgw_obj& dest_obj,
+ const rgw_obj& src_obj,
+ RGWBucketInfo& dest_bucket_info,
+ RGWBucketInfo *src_bucket_info,
+ std::optional<rgw_placement_rule> dest_placement,
+ ceph::real_time *src_mtime,
+ ceph::real_time *mtime,
+ const ceph::real_time *mod_ptr,
+ const ceph::real_time *unmod_ptr,
+ bool high_precision_time,
+ const char *if_match,
+ const char *if_nomatch,
+ AttrsMod attrs_mod,
+ bool copy_if_newer,
+ rgw::sal::Attrs& attrs,
+ RGWObjCategory category,
+ std::optional<uint64_t> olh_epoch,
+ ceph::real_time delete_at,
+ std::string *ptag,
+ std::string *petag,
+ void (*progress_cb)(off_t, void *),
+ void *progress_data,
+ const DoutPrefixProvider *dpp,
+ RGWFetchObjFilter *filter,
+ const rgw_zone_set_entry& source_trace_entry,
+ rgw_zone_set *zones_trace = nullptr,
+ std::optional<uint64_t>* bytes_transferred = nullptr);
+ /**
+ * Copy an object.
+ * dest_obj: the object to copy into
+ * src_obj: the object to copy from
+ * attrs: usage depends on attrs_mod parameter
+ * attrs_mod: the modification mode of the attrs, may have the following values:
+ * ATTRSMOD_NONE - the attributes of the source object will be
+ * copied without modifications, attrs parameter is ignored;
+ * ATTRSMOD_REPLACE - new object will have the attributes provided by attrs
+ * parameter, source object attributes are not copied;
+ * ATTRSMOD_MERGE - any conflicting meta keys on the source object's attributes
+ * are overwritten by values contained in attrs parameter.
+ * Returns: 0 on success, -ERR# otherwise.
+ */
+ int copy_obj(RGWObjectCtx& obj_ctx,
+ const rgw_user& user_id,
+ req_info *info,
+ const rgw_zone_id& source_zone,
+ const rgw_obj& dest_obj,
+ const rgw_obj& src_obj,
+ RGWBucketInfo& dest_bucket_info,
+ RGWBucketInfo& src_bucket_info,
+ const rgw_placement_rule& dest_placement,
+ ceph::real_time *src_mtime,
+ ceph::real_time *mtime,
+ const ceph::real_time *mod_ptr,
+ const ceph::real_time *unmod_ptr,
+ bool high_precision_time,
+ const char *if_match,
+ const char *if_nomatch,
+ AttrsMod attrs_mod,
+ bool copy_if_newer,
+ std::map<std::string, bufferlist>& attrs,
+ RGWObjCategory category,
+ uint64_t olh_epoch,
+ ceph::real_time delete_at,
+ std::string *version_id,
+ std::string *ptag,
+ std::string *petag,
+ void (*progress_cb)(off_t, void *),
+ void *progress_data,
+ const DoutPrefixProvider *dpp,
+ optional_yield y);
+
+ int copy_obj_data(RGWObjectCtx& obj_ctx,
+ RGWBucketInfo& dest_bucket_info,
+ const rgw_placement_rule& dest_placement,
+ RGWRados::Object::Read& read_op, off_t end,
+ const rgw_obj& dest_obj,
+ ceph::real_time *mtime,
+ ceph::real_time set_mtime,
+ std::map<std::string, bufferlist>& attrs,
+ uint64_t olh_epoch,
+ ceph::real_time delete_at,
+ std::string *petag,
+ const DoutPrefixProvider *dpp,
+ optional_yield y);
+
+ int transition_obj(RGWObjectCtx& obj_ctx,
+ RGWBucketInfo& bucket_info,
+ const rgw_obj& obj,
+ const rgw_placement_rule& placement_rule,
+ const real_time& mtime,
+ uint64_t olh_epoch,
+ const DoutPrefixProvider *dpp,
+ optional_yield y);
+
+ int check_bucket_empty(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, optional_yield y);
+
+ /**
+ * Delete a bucket.
+ * bucket_info: the bucket to delete
+ * Returns 0 on success, -ERR# otherwise.
+ */
+ int delete_bucket(RGWBucketInfo& bucket_info, RGWObjVersionTracker& objv_tracker, optional_yield y, const DoutPrefixProvider *dpp, bool check_empty = true);
+
+ void wakeup_meta_sync_shards(std::set<int>& shard_ids);
+
+ void wakeup_data_sync_shards(const DoutPrefixProvider *dpp, const rgw_zone_id& source_zone, bc::flat_map<int, bc::flat_set<rgw_data_notify_entry> >& entries);
+
+ RGWMetaSyncStatusManager* get_meta_sync_manager();
+ RGWDataSyncStatusManager* get_data_sync_manager(const rgw_zone_id& source_zone);
+
+ int set_bucket_owner(rgw_bucket& bucket, ACLOwner& owner, const DoutPrefixProvider *dpp);
+ int set_buckets_enabled(std::vector<rgw_bucket>& buckets, bool enabled, const DoutPrefixProvider *dpp);
+ int bucket_suspended(const DoutPrefixProvider *dpp, rgw_bucket& bucket, bool *suspended);
+
+ /** Delete an object. */
+ int delete_obj(const DoutPrefixProvider *dpp,
+ RGWObjectCtx& obj_ctx,
+ const RGWBucketInfo& bucket_info,
+ const rgw_obj& obj,
+ int versioning_status, // versioning flags defined in enum RGWBucketFlags
+ uint16_t bilog_flags = 0,
+ const ceph::real_time& expiration_time = ceph::real_time(),
+ rgw_zone_set *zones_trace = nullptr);
+
+ int delete_raw_obj(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj);
+
+ /** Remove an object from the bucket index */
+ int delete_obj_index(const rgw_obj& obj, ceph::real_time mtime,
+ const DoutPrefixProvider *dpp, optional_yield y);
+
+ /**
+ * Set an attr on an object.
+ * bucket_info: bucket holding the object
+ * obj: the object to set the attr on
+ * name: the attr to set
+ * bl: the contents of the attr
+ * Returns: 0 on success, -ERR# otherwise.
+ */
+ int set_attr(const DoutPrefixProvider *dpp, RGWObjectCtx* ctx, RGWBucketInfo& bucket_info, const rgw_obj& obj, const char *name, bufferlist& bl);
+
+ int set_attrs(const DoutPrefixProvider *dpp, RGWObjectCtx* ctx, RGWBucketInfo& bucket_info, const rgw_obj& obj,
+ std::map<std::string, bufferlist>& attrs,
+ std::map<std::string, bufferlist>* rmattrs,
+ optional_yield y,
+ ceph::real_time set_mtime = ceph::real_clock::zero());
+
+ int get_obj_state(const DoutPrefixProvider *dpp, RGWObjectCtx *rctx, RGWBucketInfo& bucket_info, const rgw_obj& obj, RGWObjState **state, RGWObjManifest** manifest,
+ bool follow_olh, optional_yield y, bool assume_noent = false);
+ int get_obj_state(const DoutPrefixProvider *dpp, RGWObjectCtx *rctx, RGWBucketInfo& bucket_info, const rgw_obj& obj, RGWObjState **state, RGWObjManifest** manifest, optional_yield y) {
+ return get_obj_state(dpp, rctx, bucket_info, obj, state, manifest, true, y);
+ }
+
+ using iterate_obj_cb = int (*)(const DoutPrefixProvider*, const rgw_raw_obj&, off_t, off_t,
+ off_t, bool, RGWObjState*, void*);
+
+ int iterate_obj(const DoutPrefixProvider *dpp, RGWObjectCtx& ctx, RGWBucketInfo& bucket_info,
+ const rgw_obj& obj, off_t ofs, off_t end,
+ uint64_t max_chunk_size, iterate_obj_cb cb, void *arg,
+ optional_yield y);
+
+ int append_atomic_test(const DoutPrefixProvider *dpp, const RGWObjState* astate, librados::ObjectOperation& op);
+
+ virtual int get_obj_iterate_cb(const DoutPrefixProvider *dpp,
+ const rgw_raw_obj& read_obj, off_t obj_ofs,
+ off_t read_ofs, off_t len, bool is_head_obj,
+ RGWObjState *astate, void *arg);
+
+ /**
+ * a simple object read without keeping state
+ */
+
+ int raw_obj_stat(const DoutPrefixProvider *dpp,
+ rgw_raw_obj& obj, uint64_t *psize, ceph::real_time *pmtime, uint64_t *epoch,
+ std::map<std::string, bufferlist> *attrs, bufferlist *first_chunk,
+ RGWObjVersionTracker *objv_tracker, optional_yield y);
+
+ int obj_operate(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, librados::ObjectWriteOperation *op);
+ int obj_operate(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, librados::ObjectReadOperation *op);
+
+ int guard_reshard(const DoutPrefixProvider *dpp,
+ BucketShard *bs,
+ const rgw_obj& obj_instance,
+ RGWBucketInfo& bucket_info,
+ std::function<int(BucketShard *)> call);
+ int block_while_resharding(RGWRados::BucketShard *bs,
+ const rgw_obj& obj_instance,
+ RGWBucketInfo& bucket_info,
+ optional_yield y,
+ const DoutPrefixProvider *dpp);
+
+ void bucket_index_guard_olh_op(const DoutPrefixProvider *dpp, RGWObjState& olh_state, librados::ObjectOperation& op);
+ void olh_cancel_modification(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, const std::string& op_tag, optional_yield y);
+ int olh_init_modification(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, std::string *op_tag);
+ int olh_init_modification_impl(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, std::string *op_tag);
+ int bucket_index_link_olh(const DoutPrefixProvider *dpp,
+ RGWBucketInfo& bucket_info, RGWObjState& olh_state,
+ const rgw_obj& obj_instance, bool delete_marker,
+ const std::string& op_tag, struct rgw_bucket_dir_entry_meta *meta,
+ uint64_t olh_epoch,
+ ceph::real_time unmod_since, bool high_precision_time,
+ optional_yield y,
+ rgw_zone_set *zones_trace = nullptr,
+ bool log_data_change = false);
+ int bucket_index_unlink_instance(const DoutPrefixProvider *dpp,
+ RGWBucketInfo& bucket_info,
+ const rgw_obj& obj_instance,
+ const std::string& op_tag, const std::string& olh_tag,
+ uint64_t olh_epoch, rgw_zone_set *zones_trace = nullptr);
+ int bucket_index_read_olh_log(const DoutPrefixProvider *dpp,
+ RGWBucketInfo& bucket_info, RGWObjState& state,
+ const rgw_obj& obj_instance, uint64_t ver_marker,
+ std::map<uint64_t, std::vector<rgw_bucket_olh_log_entry> > *log, bool *is_truncated);
+ int bucket_index_trim_olh_log(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, RGWObjState& obj_state, const rgw_obj& obj_instance, uint64_t ver);
+ int bucket_index_clear_olh(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, const std::string& olh_tag, const rgw_obj& obj_instance);
+ int apply_olh_log(const DoutPrefixProvider *dpp, RGWObjectCtx& obj_ctx, RGWObjState& obj_state, RGWBucketInfo& bucket_info, const rgw_obj& obj,
+ bufferlist& obj_tag, std::map<uint64_t, std::vector<rgw_bucket_olh_log_entry> >& log,
+ uint64_t *plast_ver, rgw_zone_set *zones_trace = nullptr);
+ int update_olh(const DoutPrefixProvider *dpp, RGWObjectCtx& obj_ctx, RGWObjState *state, RGWBucketInfo& bucket_info, const rgw_obj& obj, rgw_zone_set *zones_trace = nullptr);
+ int clear_olh(const DoutPrefixProvider *dpp,
+ RGWObjectCtx& obj_ctx,
+ const rgw_obj& obj,
+ RGWBucketInfo& bucket_info,
+ const std::string& tag,
+ const uint64_t ver,
+ optional_yield y);
+ int set_olh(const DoutPrefixProvider *dpp, RGWObjectCtx& obj_ctx, RGWBucketInfo& bucket_info, const rgw_obj& target_obj, bool delete_marker, rgw_bucket_dir_entry_meta *meta,
+ uint64_t olh_epoch, ceph::real_time unmod_since, bool high_precision_time,
+ optional_yield y, rgw_zone_set *zones_trace = nullptr, bool log_data_change = false);
+ int repair_olh(const DoutPrefixProvider *dpp, RGWObjState* state, const RGWBucketInfo& bucket_info,
+ const rgw_obj& obj);
+ int unlink_obj_instance(const DoutPrefixProvider *dpp, RGWObjectCtx& obj_ctx, RGWBucketInfo& bucket_info, const rgw_obj& target_obj,
+ uint64_t olh_epoch, optional_yield y, rgw_zone_set *zones_trace = nullptr);
+
+ void check_pending_olh_entries(const DoutPrefixProvider *dpp, std::map<std::string, bufferlist>& pending_entries, std::map<std::string, bufferlist> *rm_pending_entries);
+ int remove_olh_pending_entries(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, std::map<std::string, bufferlist>& pending_attrs);
+ int follow_olh(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, RGWObjectCtx& ctx, RGWObjState *state, const rgw_obj& olh_obj, rgw_obj *target);
+ int get_olh(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, const rgw_obj& obj, RGWOLHInfo *olh);
+
+ void gen_rand_obj_instance_name(rgw_obj_key *target_key);
+ void gen_rand_obj_instance_name(rgw_obj *target);
+
+ int update_containers_stats(std::map<std::string, RGWBucketEnt>& m, const DoutPrefixProvider *dpp);
+ int append_async(const DoutPrefixProvider *dpp, rgw_raw_obj& obj, size_t size, bufferlist& bl);
+
+public:
+ void set_atomic(void *ctx, const rgw_obj& obj) {
+ RGWObjectCtx *rctx = static_cast<RGWObjectCtx *>(ctx);
+ rctx->set_atomic(obj);
+ }
+ void set_prefetch_data(void *ctx, const rgw_obj& obj) {
+ RGWObjectCtx *rctx = static_cast<RGWObjectCtx *>(ctx);
+ rctx->set_prefetch_data(obj);
+ }
+ void set_compressed(void *ctx, const rgw_obj& obj) {
+ RGWObjectCtx *rctx = static_cast<RGWObjectCtx *>(ctx);
+ rctx->set_compressed(obj);
+ }
+ int decode_policy(const DoutPrefixProvider *dpp, bufferlist& bl, ACLOwner *owner);
+ int get_bucket_stats(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, const rgw::bucket_index_layout_generation& idx_layout, int shard_id, std::string *bucket_ver, std::string *master_ver,
+ std::map<RGWObjCategory, RGWStorageStats>& stats, std::string *max_marker, bool* syncstopped = NULL);
+ int get_bucket_stats_async(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, const rgw::bucket_index_layout_generation& idx_layout, int shard_id, RGWGetBucketStats_CB *cb);
+
+ int put_bucket_instance_info(RGWBucketInfo& info, bool exclusive, ceph::real_time mtime, std::map<std::string, bufferlist> *pattrs, const DoutPrefixProvider *dpp, optional_yield y);
+ /* xxx dang obj_ctx -> svc */
+ int get_bucket_instance_info(const std::string& meta_key, RGWBucketInfo& info, ceph::real_time *pmtime, std::map<std::string, bufferlist> *pattrs, optional_yield y, const DoutPrefixProvider *dpp);
+ int get_bucket_instance_info(const rgw_bucket& bucket, RGWBucketInfo& info, ceph::real_time *pmtime, std::map<std::string, bufferlist> *pattrs, optional_yield y, const DoutPrefixProvider *dpp);
+
+ static void make_bucket_entry_name(const std::string& tenant_name, const std::string& bucket_name, std::string& bucket_entry);
+
+ int get_bucket_info(RGWServices *svc,
+ const std::string& tenant_name, const std::string& bucket_name,
+ RGWBucketInfo& info,
+ ceph::real_time *pmtime, optional_yield y,
+ const DoutPrefixProvider *dpp, std::map<std::string, bufferlist> *pattrs = NULL);
+
+ // Returns 0 on successful refresh. Returns an error code if there was
+ // an error, or if the version stored on the OSD is the same as the one
+ // presented in the BucketInfo structure.
+ int try_refresh_bucket_info(RGWBucketInfo& info,
+ ceph::real_time *pmtime,
+ const DoutPrefixProvider *dpp,
+ std::map<std::string, bufferlist> *pattrs = nullptr);
+
+ int put_linked_bucket_info(RGWBucketInfo& info, bool exclusive, ceph::real_time mtime, obj_version *pep_objv,
+ std::map<std::string, bufferlist> *pattrs, bool create_entry_point,
+ const DoutPrefixProvider *dpp, optional_yield y);
+
+ int cls_obj_prepare_op(const DoutPrefixProvider *dpp, BucketShard& bs, RGWModifyOp op, std::string& tag, rgw_obj& obj, uint16_t bilog_flags, optional_yield y, rgw_zone_set *zones_trace = nullptr);
+ int cls_obj_complete_op(BucketShard& bs, const rgw_obj& obj, RGWModifyOp op, std::string& tag, int64_t pool, uint64_t epoch,
+ rgw_bucket_dir_entry& ent, RGWObjCategory category, std::list<rgw_obj_index_key> *remove_objs, uint16_t bilog_flags, rgw_zone_set *zones_trace = nullptr);
+ int cls_obj_complete_add(BucketShard& bs, const rgw_obj& obj, std::string& tag, int64_t pool, uint64_t epoch, rgw_bucket_dir_entry& ent,
+ RGWObjCategory category, std::list<rgw_obj_index_key> *remove_objs, uint16_t bilog_flags, rgw_zone_set *zones_trace = nullptr);
+ int cls_obj_complete_del(BucketShard& bs, std::string& tag, int64_t pool, uint64_t epoch, rgw_obj& obj,
+ ceph::real_time& removed_mtime, std::list<rgw_obj_index_key> *remove_objs, uint16_t bilog_flags, rgw_zone_set *zones_trace = nullptr);
+ int cls_obj_complete_cancel(BucketShard& bs, std::string& tag, rgw_obj& obj,
+ std::list<rgw_obj_index_key> *remove_objs,
+ uint16_t bilog_flags, rgw_zone_set *zones_trace = nullptr);
+ int cls_obj_set_bucket_tag_timeout(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, uint64_t timeout);
+
+ using ent_map_t =
+ boost::container::flat_map<std::string, rgw_bucket_dir_entry>;
+
+ int cls_bucket_list_ordered(const DoutPrefixProvider *dpp,
+ RGWBucketInfo& bucket_info,
+ const rgw::bucket_index_layout_generation& idx_layout,
+ const int shard_id,
+ const rgw_obj_index_key& start_after,
+ const std::string& prefix,
+ const std::string& delimiter,
+ const uint32_t num_entries,
+ const bool list_versions,
+ const uint16_t exp_factor, // 0 means ignore
+ ent_map_t& m,
+ bool* is_truncated,
+ bool* cls_filtered,
+ rgw_obj_index_key *last_entry,
+ optional_yield y,
+ RGWBucketListNameFilter force_check_filter = {});
+ int cls_bucket_list_unordered(const DoutPrefixProvider *dpp,
+ RGWBucketInfo& bucket_info,
+ const rgw::bucket_index_layout_generation& idx_layout,
+ int shard_id,
+ const rgw_obj_index_key& start_after,
+ const std::string& prefix,
+ uint32_t num_entries,
+ bool list_versions,
+ std::vector<rgw_bucket_dir_entry>& ent_list,
+ bool *is_truncated,
+ rgw_obj_index_key *last_entry,
+ optional_yield y,
+ RGWBucketListNameFilter force_check_filter = {});
+ int cls_bucket_head(const DoutPrefixProvider *dpp,
+ const RGWBucketInfo& bucket_info,
+ const rgw::bucket_index_layout_generation& idx_layout,
+ int shard_id, std::vector<rgw_bucket_dir_header>& headers,
+ std::map<int, std::string> *bucket_instance_ids = NULL);
+ int cls_bucket_head_async(const DoutPrefixProvider *dpp,
+ const RGWBucketInfo& bucket_info,
+ const rgw::bucket_index_layout_generation& idx_layout,
+ int shard_id, RGWGetDirHeader_CB *ctx, int *num_aio);
+ int bi_get_instance(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, rgw_bucket_dir_entry *dirent);
+ int bi_get_olh(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, rgw_bucket_olh_entry *olh);
+ int bi_get(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, BIIndexType index_type, rgw_cls_bi_entry *entry);
+ void bi_put(librados::ObjectWriteOperation& op, BucketShard& bs, rgw_cls_bi_entry& entry);
+ int bi_put(BucketShard& bs, rgw_cls_bi_entry& entry);
+ int bi_put(const DoutPrefixProvider *dpp, rgw_bucket& bucket, rgw_obj& obj, rgw_cls_bi_entry& entry);
+ int bi_list(const DoutPrefixProvider *dpp,
+ const RGWBucketInfo& bucket_info,
+ int shard_id,
+ const std::string& filter_obj,
+ const std::string& marker,
+ uint32_t max,
+ std::list<rgw_cls_bi_entry> *entries,
+ bool *is_truncated);
+ int bi_list(BucketShard& bs, const std::string& filter_obj, const std::string& marker, uint32_t max, std::list<rgw_cls_bi_entry> *entries, bool *is_truncated);
+ int bi_list(const DoutPrefixProvider *dpp, rgw_bucket& bucket, const std::string& obj_name, const std::string& marker, uint32_t max,
+ std::list<rgw_cls_bi_entry> *entries, bool *is_truncated);
+ int bi_remove(const DoutPrefixProvider *dpp, BucketShard& bs);
+
+ int cls_obj_usage_log_add(const DoutPrefixProvider *dpp, const std::string& oid, rgw_usage_log_info& info);
+ int cls_obj_usage_log_read(const DoutPrefixProvider *dpp, const std::string& oid, const std::string& user, const std::string& bucket, uint64_t start_epoch,
+ uint64_t end_epoch, uint32_t max_entries, std::string& read_iter,
+ std::map<rgw_user_bucket, rgw_usage_log_entry>& usage, bool *is_truncated);
+ int cls_obj_usage_log_trim(const DoutPrefixProvider *dpp, const std::string& oid, const std::string& user, const std::string& bucket, uint64_t start_epoch,
+ uint64_t end_epoch);
+ int cls_obj_usage_log_clear(const DoutPrefixProvider *dpp, std::string& oid);
+
+ int get_target_shard_id(const rgw::bucket_index_normal_layout& layout, const std::string& obj_key, int *shard_id);
+
+ int lock_exclusive(const rgw_pool& pool, const std::string& oid, ceph::timespan& duration, rgw_zone_id& zone_id, std::string& owner_id);
+ int unlock(const rgw_pool& pool, const std::string& oid, rgw_zone_id& zone_id, std::string& owner_id);
+
+ void update_gc_chain(const DoutPrefixProvider *dpp, rgw_obj head_obj, RGWObjManifest& manifest, cls_rgw_obj_chain *chain);
+ std::tuple<int, std::optional<cls_rgw_obj_chain>> send_chain_to_gc(cls_rgw_obj_chain& chain, const std::string& tag);
+ void delete_objs_inline(const DoutPrefixProvider *dpp, cls_rgw_obj_chain& chain, const std::string& tag);
+ int gc_operate(const DoutPrefixProvider *dpp, std::string& oid, librados::ObjectWriteOperation *op);
+ int gc_aio_operate(const std::string& oid, librados::AioCompletion *c,
+ librados::ObjectWriteOperation *op);
+ int gc_operate(const DoutPrefixProvider *dpp, std::string& oid, librados::ObjectReadOperation *op, bufferlist *pbl);
+
+ int list_gc_objs(int *index, std::string& marker, uint32_t max, bool expired_only, std::list<cls_rgw_gc_obj_info>& result, bool *truncated, bool& processing_queue);
+ int process_gc(bool expired_only);
+ bool process_expire_objects(const DoutPrefixProvider *dpp);
+ int defer_gc(const DoutPrefixProvider *dpp, RGWObjectCtx* ctx, RGWBucketInfo& bucket_info, const rgw_obj& obj, optional_yield y);
+
+ int process_lc(const std::unique_ptr<rgw::sal::Bucket>& optional_bucket);
+ int list_lc_progress(std::string& marker, uint32_t max_entries,
+ std::vector<std::unique_ptr<rgw::sal::Lifecycle::LCEntry>>& progress_map,
+ int& index);
+
+ int bucket_check_index(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info,
+ std::map<RGWObjCategory, RGWStorageStats> *existing_stats,
+ std::map<RGWObjCategory, RGWStorageStats> *calculated_stats);
+ int bucket_rebuild_index(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info);
+
+ // Search the bucket for encrypted multipart uploads, and increase their mtime
+ // slightly to generate a bilog entry to trigger a resync to repair any
+ // corrupted replicas. See https://tracker.ceph.com/issues/46062
+ int bucket_resync_encrypted_multipart(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ rgw::sal::RadosStore* driver,
+ RGWBucketInfo& bucket_info,
+ const std::string& marker,
+ RGWFormatterFlusher& flusher);
+
+ int bucket_set_reshard(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const cls_rgw_bucket_instance_entry& entry);
+ int remove_objs_from_index(const DoutPrefixProvider *dpp,
+ RGWBucketInfo& bucket_info,
+ const std::list<rgw_obj_index_key>& oid_list);
+ int move_rados_obj(const DoutPrefixProvider *dpp,
+ librados::IoCtx& src_ioctx,
+ const std::string& src_oid, const std::string& src_locator,
+ librados::IoCtx& dst_ioctx,
+ const std::string& dst_oid, const std::string& dst_locator);
+ int fix_head_obj_locator(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, bool copy_obj, bool remove_bad, rgw_obj_key& key);
+ int fix_tail_obj_locator(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info,
+ rgw_obj_key& key, bool fix, bool *need_fix, optional_yield y);
+
+ int check_quota(const DoutPrefixProvider *dpp, const rgw_user& bucket_owner, rgw_bucket& bucket,
+ RGWQuota& quota, uint64_t obj_size,
+ optional_yield y, bool check_size_only = false);
+
+ int check_bucket_shards(const RGWBucketInfo& bucket_info, const rgw_bucket& bucket,
+ uint64_t num_objs, const DoutPrefixProvider *dpp);
+
+ int add_bucket_to_reshard(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, uint32_t new_num_shards);
+
+ uint64_t instance_id();
+
+ librados::Rados* get_rados_handle();
+
+ int delete_raw_obj_aio(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, std::list<librados::AioCompletion *>& handles);
+ int delete_obj_aio(const DoutPrefixProvider *dpp, const rgw_obj& obj, RGWBucketInfo& info, RGWObjState *astate,
+ std::list<librados::AioCompletion *>& handles, bool keep_index_consistent,
+ optional_yield y);
+
+ private:
+ /**
+ * Check the actual on-disk state of the object specified
+ * by list_state, and fill in the time and size of the object.
+ * Then append any changes to suggested_updates for
+ * the rgw class' dir_suggest_changes function.
+ *
+ * Note that this can maul list_state; don't use it afterwards. Also
+ * it expects object to already be filled in from list_state; it only
+ * sets the size and mtime.
+ *
+ * Returns 0 on success, -ENOENT if the object doesn't exist on disk,
+ * and -errno on other failures. (-ENOENT is not a failure, and it
+ * will encode that info as a suggested update.)
+ */
+ int check_disk_state(const DoutPrefixProvider *dpp,
+ librados::IoCtx io_ctx,
+ RGWBucketInfo& bucket_info,
+ rgw_bucket_dir_entry& list_state,
+ rgw_bucket_dir_entry& object,
+ bufferlist& suggested_updates,
+ optional_yield y);
+
+ /**
+ * Init pool iteration
+ * pool: pool to use for the ctx initialization
+ * ctx: context object to use for the iteration
+ * Returns: 0 on success, -ERR# otherwise.
+ */
+ int pool_iterate_begin(const DoutPrefixProvider *dpp, const rgw_pool& pool, RGWPoolIterCtx& ctx);
+
+ /**
+ * Init pool iteration
+ * pool: pool to use
+ * cursor: position to start iteration
+ * ctx: context object to use for the iteration
+ * Returns: 0 on success, -ERR# otherwise.
+ */
+ int pool_iterate_begin(const DoutPrefixProvider *dpp, const rgw_pool& pool, const std::string& cursor, RGWPoolIterCtx& ctx);
+
+ /**
+ * Get pool iteration position
+ * ctx: context object to use for the iteration
+ * Returns: std::string representation of position
+ */
+ std::string pool_iterate_get_cursor(RGWPoolIterCtx& ctx);
+
+ /**
+ * Iterate over a pool and return object names, with an optional filter
+ * ctx: iteration context, initialized with pool_iterate_begin()
+ * num: max number of objects to return
+ * objs: a vector that the results will be appended to
+ * is_truncated: if not NULL, will be set to true iff the listing was
+ * truncated (i.e., more objects remain to be iterated)
+ * filter: if not NULL, will be used to filter returned objects
+ * Returns: 0 on success, -ERR# otherwise.
+ */
+ int pool_iterate(const DoutPrefixProvider *dpp, RGWPoolIterCtx& ctx, uint32_t num,
+ std::vector<rgw_bucket_dir_entry>& objs,
+ bool *is_truncated, RGWAccessListFilter *filter);
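+
+ /* Editor's note: the two calls above are typically combined as follows
+  * (illustrative sketch only), given an rgw_pool `pool`:
+  *
+  *   RGWPoolIterCtx ictx;
+  *   int r = pool_iterate_begin(dpp, pool, ictx);
+  *   bool truncated = true;
+  *   while (r >= 0 && truncated) {
+  *     std::vector<rgw_bucket_dir_entry> objs;
+  *     r = pool_iterate(dpp, ictx, 100, objs, &truncated, nullptr);
+  *     // process objs...
+  *   }
+  */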
+
+ uint64_t next_bucket_id();
+
+ /**
+ * This is broken out to facilitate unit testing.
+ */
+ static uint32_t calc_ordered_bucket_list_per_shard(uint32_t num_entries,
+ uint32_t num_shards);
+};
+
+
+struct get_obj_data {
+ RGWRados* rgwrados;
+ RGWGetDataCB* client_cb = nullptr;
+ rgw::Aio* aio;
+ uint64_t offset; // next offset to write to client
+ rgw::AioResultList completed; // completed read results, sorted by offset
+ optional_yield yield;
+
+ get_obj_data(RGWRados* rgwrados, RGWGetDataCB* cb, rgw::Aio* aio,
+ uint64_t offset, optional_yield yield)
+ : rgwrados(rgwrados), client_cb(cb), aio(aio), offset(offset), yield(yield) {}
+ ~get_obj_data() {
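+ // Editor's note: taking and immediately releasing the d3n cache lock below
+ // appears intended as a barrier, so destruction waits for any concurrent
+ // d3n cache operation that still holds the lock.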
+ if (rgwrados->get_use_datacache()) {
+ const std::lock_guard l(d3n_get_data.d3n_lock);
+ }
+ }
+
+ D3nGetObjData d3n_get_data;
+ std::atomic_bool d3n_bypass_cache_write{false};
+
+ int flush(rgw::AioResultList&& results);
+
+ void cancel() {
+ // wait for all completions to drain and ignore the results
+ aio->drain();
+ }
+
+ int drain() {
+ auto c = aio->wait();
+ while (!c.empty()) {
+ int r = flush(std::move(c));
+ if (r < 0) {
+ cancel();
+ return r;
+ }
+ c = aio->wait();
+ }
+ return flush(std::move(c));
+ }
+};
diff --git a/src/rgw/driver/rados/rgw_reshard.cc b/src/rgw/driver/rados/rgw_reshard.cc
new file mode 100644
index 000000000..2abf02908
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_reshard.cc
@@ -0,0 +1,1419 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include <limits>
+#include <sstream>
+
+#include "rgw_zone.h"
+#include "driver/rados/rgw_bucket.h"
+#include "rgw_reshard.h"
+#include "rgw_sal.h"
+#include "rgw_sal_rados.h"
+#include "cls/rgw/cls_rgw_client.h"
+#include "cls/lock/cls_lock_client.h"
+#include "common/errno.h"
+#include "common/ceph_json.h"
+
+#include "common/dout.h"
+
+#include "services/svc_zone.h"
+#include "services/svc_sys_obj.h"
+#include "services/svc_tier_rados.h"
+#include "services/svc_bilog_rados.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+const string reshard_oid_prefix = "reshard.";
+const string reshard_lock_name = "reshard_process";
+const string bucket_instance_lock_name = "bucket_instance_lock";
+
+/* All primes up to 2000 used to attempt to make dynamic sharding use
+ * a prime number of shards. Note: this list also includes 1 for when
+ * 1 shard is the most appropriate, even though 1 is not prime.
+ */
+const std::initializer_list<uint16_t> RGWBucketReshard::reshard_primes = {
+ 1, 2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61,
+ 67, 71, 73, 79, 83, 89, 97, 101, 103, 107, 109, 113, 127, 131, 137,
+ 139, 149, 151, 157, 163, 167, 173, 179, 181, 191, 193, 197, 199, 211,
+ 223, 227, 229, 233, 239, 241, 251, 257, 263, 269, 271, 277, 281, 283,
+ 293, 307, 311, 313, 317, 331, 337, 347, 349, 353, 359, 367, 373, 379,
+ 383, 389, 397, 401, 409, 419, 421, 431, 433, 439, 443, 449, 457, 461,
+ 463, 467, 479, 487, 491, 499, 503, 509, 521, 523, 541, 547, 557, 563,
+ 569, 571, 577, 587, 593, 599, 601, 607, 613, 617, 619, 631, 641, 643,
+ 647, 653, 659, 661, 673, 677, 683, 691, 701, 709, 719, 727, 733, 739,
+ 743, 751, 757, 761, 769, 773, 787, 797, 809, 811, 821, 823, 827, 829,
+ 839, 853, 857, 859, 863, 877, 881, 883, 887, 907, 911, 919, 929, 937,
+ 941, 947, 953, 967, 971, 977, 983, 991, 997, 1009, 1013, 1019, 1021,
+ 1031, 1033, 1039, 1049, 1051, 1061, 1063, 1069, 1087, 1091, 1093,
+ 1097, 1103, 1109, 1117, 1123, 1129, 1151, 1153, 1163, 1171, 1181,
+ 1187, 1193, 1201, 1213, 1217, 1223, 1229, 1231, 1237, 1249, 1259,
+ 1277, 1279, 1283, 1289, 1291, 1297, 1301, 1303, 1307, 1319, 1321,
+ 1327, 1361, 1367, 1373, 1381, 1399, 1409, 1423, 1427, 1429, 1433,
+ 1439, 1447, 1451, 1453, 1459, 1471, 1481, 1483, 1487, 1489, 1493,
+ 1499, 1511, 1523, 1531, 1543, 1549, 1553, 1559, 1567, 1571, 1579,
+ 1583, 1597, 1601, 1607, 1609, 1613, 1619, 1621, 1627, 1637, 1657,
+ 1663, 1667, 1669, 1693, 1697, 1699, 1709, 1721, 1723, 1733, 1741,
+ 1747, 1753, 1759, 1777, 1783, 1787, 1789, 1801, 1811, 1823, 1831,
+ 1847, 1861, 1867, 1871, 1873, 1877, 1879, 1889, 1901, 1907, 1913,
+ 1931, 1933, 1949, 1951, 1973, 1979, 1987, 1993, 1997, 1999
+};
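+
+/* Editor's note: a sketch (not upstream code; the helper name `to_prime` is
+ * made up for illustration) of rounding a desired shard count up to the
+ * nearest listed prime with std::lower_bound (<algorithm>):
+ *
+ *   uint32_t to_prime(uint32_t desired) {
+ *     auto it = std::lower_bound(RGWBucketReshard::reshard_primes.begin(),
+ *                                RGWBucketReshard::reshard_primes.end(),
+ *                                desired);
+ *     return (it == RGWBucketReshard::reshard_primes.end())
+ *         ? desired   // beyond the table; fall back to the requested count
+ *         : *it;
+ *   }
+ */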
+
+class BucketReshardShard {
+ rgw::sal::RadosStore* store;
+ const RGWBucketInfo& bucket_info;
+ int shard_id;
+ RGWRados::BucketShard bs;
+ vector<rgw_cls_bi_entry> entries;
+ map<RGWObjCategory, rgw_bucket_category_stats> stats;
+ deque<librados::AioCompletion *>& aio_completions;
+ uint64_t max_aio_completions;
+ uint64_t reshard_shard_batch_size;
+
+ int wait_next_completion() {
+ librados::AioCompletion *c = aio_completions.front();
+ aio_completions.pop_front();
+
+ c->wait_for_complete();
+
+ int ret = c->get_return_value();
+ c->release();
+
+ if (ret < 0) {
+ derr << "ERROR: reshard rados operation failed: " << cpp_strerror(-ret) << dendl;
+ return ret;
+ }
+
+ return 0;
+ }
+
+ int get_completion(librados::AioCompletion **c) {
+ if (aio_completions.size() >= max_aio_completions) {
+ int ret = wait_next_completion();
+ if (ret < 0) {
+ return ret;
+ }
+ }
+
+ *c = librados::Rados::aio_create_completion(nullptr, nullptr);
+ aio_completions.push_back(*c);
+
+ return 0;
+ }
+
+public:
+ BucketReshardShard(const DoutPrefixProvider *dpp,
+ rgw::sal::RadosStore *_store, const RGWBucketInfo& _bucket_info,
+ const rgw::bucket_index_layout_generation& index,
+ int shard_id, deque<librados::AioCompletion *>& _completions) :
+ store(_store), bucket_info(_bucket_info), shard_id(shard_id),
+ bs(store->getRados()), aio_completions(_completions)
+ {
+ bs.init(dpp, bucket_info, index, shard_id);
+
+ max_aio_completions =
+ store->ctx()->_conf.get_val<uint64_t>("rgw_reshard_max_aio");
+ reshard_shard_batch_size =
+ store->ctx()->_conf.get_val<uint64_t>("rgw_reshard_batch_size");
+ }
+
+ int get_shard_id() const {
+ return shard_id;
+ }
+
+ int add_entry(rgw_cls_bi_entry& entry, bool account, RGWObjCategory category,
+ const rgw_bucket_category_stats& entry_stats) {
+ entries.push_back(entry);
+ if (account) {
+ rgw_bucket_category_stats& target = stats[category];
+ target.num_entries += entry_stats.num_entries;
+ target.total_size += entry_stats.total_size;
+ target.total_size_rounded += entry_stats.total_size_rounded;
+ target.actual_size += entry_stats.actual_size;
+ }
+ if (entries.size() >= reshard_shard_batch_size) {
+ int ret = flush();
+ if (ret < 0) {
+ return ret;
+ }
+ }
+
+ return 0;
+ }
+
+ int flush() {
+ if (entries.size() == 0) {
+ return 0;
+ }
+
+ librados::ObjectWriteOperation op;
+ for (auto& entry : entries) {
+ store->getRados()->bi_put(op, bs, entry);
+ }
+ cls_rgw_bucket_update_stats(op, false, stats);
+
+ librados::AioCompletion *c;
+ int ret = get_completion(&c);
+ if (ret < 0) {
+ return ret;
+ }
+ ret = bs.bucket_obj.aio_operate(c, &op);
+ if (ret < 0) {
+ derr << "ERROR: failed to store entries in target bucket shard (bs=" << bs.bucket << "/" << bs.shard_id << ") error=" << cpp_strerror(-ret) << dendl;
+ return ret;
+ }
+ entries.clear();
+ stats.clear();
+ return 0;
+ }
+
+ int wait_all_aio() {
+ int ret = 0;
+ while (!aio_completions.empty()) {
+ int r = wait_next_completion();
+ if (r < 0) {
+ ret = r;
+ }
+ }
+ return ret;
+ }
+}; // class BucketReshardShard
+
+
+class BucketReshardManager {
+ rgw::sal::RadosStore *store;
+ deque<librados::AioCompletion *> completions;
+ vector<BucketReshardShard> target_shards;
+
+public:
+ BucketReshardManager(const DoutPrefixProvider *dpp,
+ rgw::sal::RadosStore *_store,
+ const RGWBucketInfo& bucket_info,
+ const rgw::bucket_index_layout_generation& target)
+ : store(_store)
+ {
+ const uint32_t num_shards = rgw::num_shards(target.layout.normal);
+ target_shards.reserve(num_shards);
+ for (uint32_t i = 0; i < num_shards; ++i) {
+ target_shards.emplace_back(dpp, store, bucket_info, target, i, completions);
+ }
+ }
+
+ ~BucketReshardManager() {
+ for (auto& shard : target_shards) {
+ int ret = shard.wait_all_aio();
+ if (ret < 0) {
+ ldout(store->ctx(), 20) << __func__ <<
+ ": shard->wait_all_aio() returned ret=" << ret << dendl;
+ }
+ }
+ }
+
+ int add_entry(int shard_index,
+ rgw_cls_bi_entry& entry, bool account, RGWObjCategory category,
+ const rgw_bucket_category_stats& entry_stats) {
+ int ret = target_shards[shard_index].add_entry(entry, account, category,
+ entry_stats);
+ if (ret < 0) {
+ derr << "ERROR: target_shards.add_entry(" << entry.idx <<
+ ") returned error: " << cpp_strerror(-ret) << dendl;
+ return ret;
+ }
+
+ return 0;
+ }
+
+ int finish() {
+ int ret = 0;
+ for (auto& shard : target_shards) {
+ int r = shard.flush();
+ if (r < 0) {
+ derr << "ERROR: target_shards[" << shard.get_shard_id() << "].flush() returned error: " << cpp_strerror(-r) << dendl;
+ ret = r;
+ }
+ }
+ for (auto& shard : target_shards) {
+ int r = shard.wait_all_aio();
+ if (r < 0) {
+ derr << "ERROR: target_shards[" << shard.get_shard_id() << "].wait_all_aio() returned error: " << cpp_strerror(-r) << dendl;
+ ret = r;
+ }
+ }
+ target_shards.clear();
+ return ret;
+ }
+}; // class BucketReshardManager
+
+RGWBucketReshard::RGWBucketReshard(rgw::sal::RadosStore* _store,
+ const RGWBucketInfo& _bucket_info,
+ const std::map<std::string, bufferlist>& _bucket_attrs,
+ RGWBucketReshardLock* _outer_reshard_lock) :
+ store(_store), bucket_info(_bucket_info), bucket_attrs(_bucket_attrs),
+ reshard_lock(store, bucket_info, true),
+ outer_reshard_lock(_outer_reshard_lock)
+{ }
+
+// sets reshard status of bucket index shards for the current index layout
+static int set_resharding_status(const DoutPrefixProvider *dpp,
+ rgw::sal::RadosStore* store,
+ const RGWBucketInfo& bucket_info,
+ cls_rgw_reshard_status status)
+{
+ cls_rgw_bucket_instance_entry instance_entry;
+ instance_entry.set_status(status);
+
+ int ret = store->getRados()->bucket_set_reshard(dpp, bucket_info, instance_entry);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "RGWReshard::" << __func__ << " ERROR: error setting bucket resharding flag on bucket index: "
+ << cpp_strerror(-ret) << dendl;
+ return ret;
+ }
+ return 0;
+}
+
+static int remove_old_reshard_instance(rgw::sal::RadosStore* store,
+ const rgw_bucket& bucket,
+ const DoutPrefixProvider* dpp)
+{
+ RGWBucketInfo info;
+ int r = store->getRados()->get_bucket_instance_info(bucket, info, nullptr,
+ nullptr, null_yield, dpp);
+ if (r < 0) {
+ return r;
+ }
+
+ // delete its shard objects (ignore errors)
+ store->svc()->bi->clean_index(dpp, info, info.layout.current_index);
+ // delete the bucket instance metadata
+ return store->ctl()->bucket->remove_bucket_instance_info(bucket, info, null_yield, dpp);
+}
+
+// initialize the new bucket index shard objects
+static int init_target_index(rgw::sal::RadosStore* store,
+ RGWBucketInfo& bucket_info,
+ const rgw::bucket_index_layout_generation& index,
+ const DoutPrefixProvider* dpp)
+{
+ int ret = store->svc()->bi->init_index(dpp, bucket_info, index);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: " << __func__ << " failed to initialize "
+ "target index shard objects: " << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+
+ if (!bucket_info.datasync_flag_enabled()) {
+ // if bucket sync is disabled, disable it on each of the new shards too
+ auto log = rgw::log_layout_from_index(0, index);
+ ret = store->svc()->bilog_rados->log_stop(dpp, bucket_info, log, -1);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: " << __func__ << " failed to disable "
+ "bucket sync on the target index shard objects: "
+ << cpp_strerror(ret) << dendl;
+ store->svc()->bi->clean_index(dpp, bucket_info, index);
+ return ret;
+ }
+ }
+
+ return ret;
+}
+
+// initialize a target index layout, create its bucket index shard objects, and
+// write the target layout to the bucket instance metadata
+static int init_target_layout(rgw::sal::RadosStore* store,
+ RGWBucketInfo& bucket_info,
+ std::map<std::string, bufferlist>& bucket_attrs,
+ ReshardFaultInjector& fault,
+ uint32_t new_num_shards,
+ const DoutPrefixProvider* dpp)
+{
+ auto prev = bucket_info.layout; // make a copy for cleanup
+ const auto current = prev.current_index;
+
+ // initialize a new normal target index layout generation
+ rgw::bucket_index_layout_generation target;
+ target.layout.type = rgw::BucketIndexType::Normal;
+ target.layout.normal.num_shards = new_num_shards;
+ target.gen = current.gen + 1;
+
+ if (bucket_info.reshard_status == cls_rgw_reshard_status::IN_PROGRESS) {
+ // backward-compatible cleanup of old reshards, where the target was in a
+ // different bucket instance
+ if (!bucket_info.new_bucket_instance_id.empty()) {
+ rgw_bucket new_bucket = bucket_info.bucket;
+ new_bucket.bucket_id = bucket_info.new_bucket_instance_id;
+ ldout(store->ctx(), 10) << __func__ << " removing target bucket instance "
+ "from a previous reshard attempt" << dendl;
+ // ignore errors
+ remove_old_reshard_instance(store, new_bucket, dpp);
+ }
+ bucket_info.reshard_status = cls_rgw_reshard_status::NOT_RESHARDING;
+ }
+
+ if (bucket_info.layout.target_index) {
+ // a previous reshard failed or stalled, and its reshard lock dropped
+ ldpp_dout(dpp, 10) << __func__ << " removing existing target index "
+ "objects from a previous reshard attempt" << dendl;
+ // delete its existing shard objects (ignore errors)
+ store->svc()->bi->clean_index(dpp, bucket_info, *bucket_info.layout.target_index);
+ // don't reuse this same generation in the new target layout, in case
+ // something is still trying to operate on its shard objects
+ target.gen = bucket_info.layout.target_index->gen + 1;
+ }
+
+ // create the index shard objects
+ int ret = init_target_index(store, bucket_info, target, dpp);
+ if (ret < 0) {
+ return ret;
+ }
+
+ // retry in case of racing writes to the bucket instance metadata
+ static constexpr auto max_retries = 10;
+ int tries = 0;
+ do {
+ // update resharding state
+ bucket_info.layout.target_index = target;
+ bucket_info.layout.resharding = rgw::BucketReshardState::InProgress;
+
+ if (ret = fault.check("set_target_layout");
+ ret == 0) { // no fault injected, write the bucket instance metadata
+ ret = store->getRados()->put_bucket_instance_info(bucket_info, false,
+ real_time(), &bucket_attrs, dpp, null_yield);
+ } else if (ret == -ECANCELED) {
+ fault.clear(); // clear the fault so a retry can succeed
+ }
+
+ if (ret == -ECANCELED) {
+ // racing write detected, read the latest bucket info and try again
+ int ret2 = store->getRados()->get_bucket_instance_info(
+ bucket_info.bucket, bucket_info,
+ nullptr, &bucket_attrs, null_yield, dpp);
+ if (ret2 < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: " << __func__ << " failed to read "
+ "bucket info: " << cpp_strerror(ret2) << dendl;
+ ret = ret2;
+ break;
+ }
+
+ // check that we're still in the reshard state we started in
+ if (bucket_info.layout.resharding != rgw::BucketReshardState::None ||
+ bucket_info.layout.current_index != current) {
+ ldpp_dout(dpp, 1) << "WARNING: " << __func__ << " raced with "
+ "another reshard" << dendl;
+ break;
+ }
+
+ prev = bucket_info.layout; // update the copy
+ }
+ ++tries;
+ } while (ret == -ECANCELED && tries < max_retries);
+
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: " << __func__ << " failed to write "
+ "target index layout to bucket info: " << cpp_strerror(ret) << dendl;
+
+ bucket_info.layout = std::move(prev); // restore in-memory layout
+
+ // delete the target shard objects (ignore errors)
+ store->svc()->bi->clean_index(dpp, bucket_info, target);
+ return ret;
+ }
+ return 0;
+} // init_target_layout
+
+// delete the bucket index shards associated with the target layout and remove
+// it from the bucket instance metadata
+static int revert_target_layout(rgw::sal::RadosStore* store,
+ RGWBucketInfo& bucket_info,
+ std::map<std::string, bufferlist>& bucket_attrs,
+ ReshardFaultInjector& fault,
+ const DoutPrefixProvider* dpp)
+{
+ auto prev = bucket_info.layout; // make a copy for cleanup
+
+ // remove target index shard objects
+ int ret = store->svc()->bi->clean_index(dpp, bucket_info, *prev.target_index);
+ if (ret < 0) {
+ ldpp_dout(dpp, 1) << "WARNING: " << __func__ << " failed to remove "
+ "target index with: " << cpp_strerror(ret) << dendl;
+ ret = 0; // non-fatal error
+ }
+
+ // retry in case of racing writes to the bucket instance metadata
+ static constexpr auto max_retries = 10;
+ int tries = 0;
+ do {
+ // clear target_index and resharding state
+ bucket_info.layout.target_index = std::nullopt;
+ bucket_info.layout.resharding = rgw::BucketReshardState::None;
+
+ if (ret = fault.check("revert_target_layout");
+ ret == 0) { // no fault injected, revert the bucket instance metadata
+ ret = store->getRados()->put_bucket_instance_info(bucket_info, false,
+ real_time(),
+ &bucket_attrs, dpp, null_yield);
+ } else if (ret == -ECANCELED) {
+ fault.clear(); // clear the fault so a retry can succeed
+ }
+
+ if (ret == -ECANCELED) {
+ // racing write detected, read the latest bucket info and try again
+ int ret2 = store->getRados()->get_bucket_instance_info(
+ bucket_info.bucket, bucket_info,
+ nullptr, &bucket_attrs, null_yield, dpp);
+ if (ret2 < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: " << __func__ << " failed to read "
+ "bucket info: " << cpp_strerror(ret2) << dendl;
+ ret = ret2;
+ break;
+ }
+
+ // check that we're still in the reshard state we started in
+ if (bucket_info.layout.resharding == rgw::BucketReshardState::None) {
+ ldpp_dout(dpp, 1) << "WARNING: " << __func__ << " raced with "
+ "reshard cancel" << dendl;
+ return -ECANCELED;
+ }
+ if (bucket_info.layout.current_index != prev.current_index ||
+ bucket_info.layout.target_index != prev.target_index) {
+ ldpp_dout(dpp, 1) << "WARNING: " << __func__ << " raced with "
+ "another reshard" << dendl;
+ return -ECANCELED;
+ }
+
+ prev = bucket_info.layout; // update the copy
+ }
+ ++tries;
+ } while (ret == -ECANCELED && tries < max_retries);
+
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: " << __func__ << " failed to clear "
+ "target index layout in bucket info: " << cpp_strerror(ret) << dendl;
+
+ bucket_info.layout = std::move(prev); // restore in-memory layout
+ return ret;
+ }
+ return 0;
+} // revert_target_layout
+
+static int init_reshard(rgw::sal::RadosStore* store,
+ RGWBucketInfo& bucket_info,
+ std::map<std::string, bufferlist>& bucket_attrs,
+ ReshardFaultInjector& fault,
+ uint32_t new_num_shards,
+ const DoutPrefixProvider *dpp)
+{
+ if (new_num_shards == 0) {
+ ldpp_dout(dpp, 0) << "ERROR: " << __func__ << " got invalid new_num_shards=0" << dendl;
+ return -EINVAL;
+ }
+
+ int ret = init_target_layout(store, bucket_info, bucket_attrs, fault, new_num_shards, dpp);
+ if (ret < 0) {
+ return ret;
+ }
+
+ if (ret = fault.check("block_writes");
+ ret == 0) { // no fault injected, block writes to the current index shards
+ ret = set_resharding_status(dpp, store, bucket_info,
+ cls_rgw_reshard_status::IN_PROGRESS);
+ }
+
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: " << __func__ << " failed to pause "
+ "writes to the current index: " << cpp_strerror(ret) << dendl;
+ // clean up the target layout (ignore errors)
+ revert_target_layout(store, bucket_info, bucket_attrs, fault, dpp);
+ return ret;
+ }
+ return 0;
+} // init_reshard
+
+static int cancel_reshard(rgw::sal::RadosStore* store,
+ RGWBucketInfo& bucket_info,
+ std::map<std::string, bufferlist>& bucket_attrs,
+ ReshardFaultInjector& fault,
+ const DoutPrefixProvider *dpp)
+{
+ // unblock writes to the current index shard objects
+ int ret = set_resharding_status(dpp, store, bucket_info,
+ cls_rgw_reshard_status::NOT_RESHARDING);
+ if (ret < 0) {
+ ldpp_dout(dpp, 1) << "WARNING: " << __func__ << " failed to unblock "
+ "writes to current index objects: " << cpp_strerror(ret) << dendl;
+ ret = 0; // non-fatal error
+ }
+
+ if (bucket_info.layout.target_index) {
+ return revert_target_layout(store, bucket_info, bucket_attrs, fault, dpp);
+ }
+ // there is nothing to revert
+ return 0;
+} // cancel_reshard
+
+static int commit_target_layout(rgw::sal::RadosStore* store,
+ RGWBucketInfo& bucket_info,
+ std::map<std::string, bufferlist>& bucket_attrs,
+ ReshardFaultInjector& fault,
+ const DoutPrefixProvider *dpp)
+{
+ auto& layout = bucket_info.layout;
+ const auto next_log_gen = layout.logs.empty() ? 1 :
+ layout.logs.back().gen + 1;
+
+ if (!store->svc()->zone->need_to_log_data()) {
+ // if we're not syncing data, we can drop any existing logs
+ layout.logs.clear();
+ }
+
+ // use the new index layout as current
+ ceph_assert(layout.target_index);
+ layout.current_index = std::move(*layout.target_index);
+ layout.target_index = std::nullopt;
+ layout.resharding = rgw::BucketReshardState::None;
+ // add the in-index log layout
+ layout.logs.push_back(log_layout_from_index(next_log_gen, layout.current_index));
+
+ int ret = fault.check("commit_target_layout");
+ if (ret == 0) { // no fault injected, write the bucket instance metadata
+ ret = store->getRados()->put_bucket_instance_info(
+ bucket_info, false, real_time(), &bucket_attrs, dpp, null_yield);
+ } else if (ret == -ECANCELED) {
+ fault.clear(); // clear the fault so a retry can succeed
+ }
+ return ret;
+} // commit_target_layout
+
+static int commit_reshard(rgw::sal::RadosStore* store,
+ RGWBucketInfo& bucket_info,
+ std::map<std::string, bufferlist>& bucket_attrs,
+ ReshardFaultInjector& fault,
+ const DoutPrefixProvider *dpp)
+{
+ auto prev = bucket_info.layout; // make a copy for cleanup
+
+ // retry in case of racing writes to the bucket instance metadata
+ static constexpr auto max_retries = 10;
+ int tries = 0;
+ int ret = 0;
+ do {
+ ret = commit_target_layout(store, bucket_info, bucket_attrs, fault, dpp);
+ if (ret == -ECANCELED) {
+ // racing write detected, read the latest bucket info and try again
+ int ret2 = store->getRados()->get_bucket_instance_info(
+ bucket_info.bucket, bucket_info,
+ nullptr, &bucket_attrs, null_yield, dpp);
+ if (ret2 < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: " << __func__ << " failed to read "
+ "bucket info: " << cpp_strerror(ret2) << dendl;
+ ret = ret2;
+ break;
+ }
+
+ // check that we're still in the reshard state we started in
+ if (bucket_info.layout.resharding != rgw::BucketReshardState::InProgress) {
+ ldpp_dout(dpp, 1) << "WARNING: " << __func__ << " raced with "
+ "reshard cancel" << dendl;
+ return -ECANCELED; // whatever canceled us already did the cleanup
+ }
+ if (bucket_info.layout.current_index != prev.current_index ||
+ bucket_info.layout.target_index != prev.target_index) {
+ ldpp_dout(dpp, 1) << "WARNING: " << __func__ << " raced with "
+ "another reshard" << dendl;
+ return -ECANCELED; // whatever canceled us already did the cleanup
+ }
+
+ prev = bucket_info.layout; // update the copy
+ }
+ ++tries;
+ } while (ret == -ECANCELED && tries < max_retries);
+
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: " << __func__ << " failed to commit "
+ "target index layout: " << cpp_strerror(ret) << dendl;
+
+ bucket_info.layout = std::move(prev); // restore in-memory layout
+
+ // unblock writes to the current index shard objects
+ int ret2 = set_resharding_status(dpp, store, bucket_info,
+ cls_rgw_reshard_status::NOT_RESHARDING);
+ if (ret2 < 0) {
+ ldpp_dout(dpp, 1) << "WARNING: " << __func__ << " failed to unblock "
+ "writes to current index objects: " << cpp_strerror(ret2) << dendl;
+ // non-fatal error
+ }
+ return ret;
+ }
+
+ if (store->svc()->zone->need_to_log_data() && !prev.logs.empty() &&
+ prev.current_index.layout.type == rgw::BucketIndexType::Normal) {
+ // write a datalog entry for each shard of the previous index. triggering
+ // sync on the old shards will force them to detect the end-of-log for that
+ // generation, and eventually transition to the next
+ // TODO: use a log layout to support types other than BucketLogType::InIndex
+ for (uint32_t shard_id = 0; shard_id < rgw::num_shards(prev.current_index.layout.normal); ++shard_id) {
+ // This null_yield can stay, for now, since we're in our own thread
+ ret = store->svc()->datalog_rados->add_entry(dpp, bucket_info, prev.logs.back(), shard_id,
+ null_yield);
+ if (ret < 0) {
+ ldpp_dout(dpp, 1) << "WARNING: failed writing data log (bucket_info.bucket="
+ << bucket_info.bucket << ", shard_id=" << shard_id << " of generation="
+ << prev.logs.back().gen << ")" << dendl;
+ } // datalog error is not fatal
+ }
+ }
+
+ // check whether the old index objects are still needed for bilogs
+ const auto& logs = bucket_info.layout.logs;
+ auto log = std::find_if(logs.begin(), logs.end(),
+ [&prev] (const rgw::bucket_log_layout_generation& log) {
+ return log.layout.type == rgw::BucketLogType::InIndex
+ && log.layout.in_index.gen == prev.current_index.gen;
+ });
+ if (log == logs.end()) {
+ // delete the index objects (ignore errors)
+ store->svc()->bi->clean_index(dpp, bucket_info, prev.current_index);
+ }
+ return 0;
+} // commit_reshard
+
+int RGWBucketReshard::clear_resharding(rgw::sal::RadosStore* store,
+ RGWBucketInfo& bucket_info,
+ std::map<std::string, bufferlist>& bucket_attrs,
+ const DoutPrefixProvider* dpp)
+{
+ ReshardFaultInjector no_fault;
+ return cancel_reshard(store, bucket_info, bucket_attrs, no_fault, dpp);
+}
+
+int RGWBucketReshard::cancel(const DoutPrefixProvider* dpp)
+{
+ int ret = reshard_lock.lock(dpp);
+ if (ret < 0) {
+ return ret;
+ }
+
+ if (bucket_info.layout.resharding != rgw::BucketReshardState::InProgress) {
+ ldpp_dout(dpp, -1) << "ERROR: bucket is not resharding" << dendl;
+ ret = -EINVAL;
+ } else {
+ ret = clear_resharding(store, bucket_info, bucket_attrs, dpp);
+ }
+
+ reshard_lock.unlock();
+ return ret;
+}
+
+RGWBucketReshardLock::RGWBucketReshardLock(rgw::sal::RadosStore* _store,
+ const std::string& reshard_lock_oid,
+ bool _ephemeral) :
+ store(_store),
+ lock_oid(reshard_lock_oid),
+ ephemeral(_ephemeral),
+ internal_lock(reshard_lock_name)
+{
+ const int lock_dur_secs = store->ctx()->_conf.get_val<uint64_t>(
+ "rgw_reshard_bucket_lock_duration");
+ duration = std::chrono::seconds(lock_dur_secs);
+
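+ // generate a random cookie to identify this lock holder; renew and unlock
+ // only act on our own instance of the lock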
+#define COOKIE_LEN 16
+ char cookie_buf[COOKIE_LEN + 1];
+ gen_rand_alphanumeric(store->ctx(), cookie_buf, sizeof(cookie_buf) - 1);
+ cookie_buf[COOKIE_LEN] = '\0';
+
+ internal_lock.set_cookie(cookie_buf);
+ internal_lock.set_duration(duration);
+}
+
+int RGWBucketReshardLock::lock(const DoutPrefixProvider *dpp) {
+ internal_lock.set_must_renew(false);
+
+ int ret;
+ if (ephemeral) {
+ ret = internal_lock.lock_exclusive_ephemeral(&store->getRados()->reshard_pool_ctx,
+ lock_oid);
+ } else {
+ ret = internal_lock.lock_exclusive(&store->getRados()->reshard_pool_ctx, lock_oid);
+ }
+
+ if (ret == -EBUSY) {
+ ldout(store->ctx(), 0) << "INFO: RGWReshardLock::" << __func__ <<
+ " found lock on " << lock_oid <<
+ " to be held by another RGW process; skipping for now" << dendl;
+ return ret;
+ } else if (ret < 0) {
+ ldpp_dout(dpp, -1) << "ERROR: RGWReshardLock::" << __func__ <<
+ " failed to acquire lock on " << lock_oid << ": " <<
+ cpp_strerror(-ret) << dendl;
+ return ret;
+ }
+
+ reset_time(Clock::now());
+
+ return 0;
+}
+
+void RGWBucketReshardLock::unlock() {
+ int ret = internal_lock.unlock(&store->getRados()->reshard_pool_ctx, lock_oid);
+ if (ret < 0) {
+ ldout(store->ctx(), 0) << "WARNING: RGWBucketReshardLock::" << __func__ <<
+ " failed to drop lock on " << lock_oid << " ret=" << ret << dendl;
+ }
+}
+
+int RGWBucketReshardLock::renew(const Clock::time_point& now) {
+ internal_lock.set_must_renew(true);
+ int ret;
+ if (ephemeral) {
+ ret = internal_lock.lock_exclusive_ephemeral(&store->getRados()->reshard_pool_ctx,
+ lock_oid);
+ } else {
+ ret = internal_lock.lock_exclusive(&store->getRados()->reshard_pool_ctx, lock_oid);
+ }
+ if (ret < 0) { /* expired or already locked by another processor */
+ std::stringstream error_s;
+ if (-ENOENT == ret) {
+ error_s << "ENOENT (lock expired or never initially locked)";
+ } else {
+ error_s << ret << " (" << cpp_strerror(-ret) << ")";
+ }
+ ldout(store->ctx(), 5) << __func__ << "(): failed to renew lock on " <<
+ lock_oid << " with error " << error_s.str() << dendl;
+ return ret;
+ }
+ internal_lock.set_must_renew(false);
+
+ reset_time(now);
+ ldout(store->ctx(), 20) << __func__ << "(): successfully renewed lock on " <<
+ lock_oid << dendl;
+
+ return 0;
+}
+
+
+int RGWBucketReshard::do_reshard(const rgw::bucket_index_layout_generation& current,
+ const rgw::bucket_index_layout_generation& target,
+ int max_entries,
+ bool verbose,
+ ostream *out,
+ Formatter *formatter,
+ const DoutPrefixProvider *dpp)
+{
+ if (out) {
+ (*out) << "tenant: " << bucket_info.bucket.tenant << std::endl;
+ (*out) << "bucket name: " << bucket_info.bucket.name << std::endl;
+ }
+
+ /* update bucket info -- in progress */
+ list<rgw_cls_bi_entry> entries;
+
+ if (max_entries < 0) {
+ ldpp_dout(dpp, 0) << __func__ <<
+ ": can't reshard, negative max_entries" << dendl;
+ return -EINVAL;
+ }
+
+ BucketReshardManager target_shards_mgr(dpp, store, bucket_info, target);
+
+ bool verbose_json_out = verbose && (formatter != nullptr) && (out != nullptr);
+
+ if (verbose_json_out) {
+ formatter->open_array_section("entries");
+ }
+
+ uint64_t total_entries = 0;
+
+ if (!verbose_json_out && out) {
+ (*out) << "total entries:";
+ }
+
+ const uint32_t num_source_shards = rgw::num_shards(current.layout.normal);
+ string marker;
+ for (uint32_t i = 0; i < num_source_shards; ++i) {
+ bool is_truncated = true;
+ marker.clear();
+ const std::string null_object_filter; // empty string since we're not filtering by object
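+ // page through this source shard's index entries until the listing is no
+ // longer truncated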
+ while (is_truncated) {
+ entries.clear();
+ int ret = store->getRados()->bi_list(dpp, bucket_info, i, null_object_filter, marker, max_entries, &entries, &is_truncated);
+ if (ret == -ENOENT) {
+ ldpp_dout(dpp, 1) << "WARNING: " << __func__ << " failed to find shard "
+ << i << ", skipping" << dendl;
+ // break out of the is_truncated loop and move on to the next shard
+ break;
+ } else if (ret < 0) {
+ derr << "ERROR: bi_list(): " << cpp_strerror(-ret) << dendl;
+ return ret;
+ }
+
+ for (auto iter = entries.begin(); iter != entries.end(); ++iter) {
+ rgw_cls_bi_entry& entry = *iter;
+ if (verbose_json_out) {
+ formatter->open_object_section("entry");
+
+ encode_json("shard_id", i, formatter);
+ encode_json("num_entry", total_entries, formatter);
+ encode_json("entry", entry, formatter);
+ }
+ total_entries++;
+
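+ // remember the last index key seen so the next bi_list page picks up where this one ended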
+ marker = entry.idx;
+
+ int target_shard_id;
+ cls_rgw_obj_key cls_key;
+ RGWObjCategory category;
+ rgw_bucket_category_stats stats;
+ bool account = entry.get_info(&cls_key, &category, &stats);
+ rgw_obj_key key(cls_key);
+ if (entry.type == BIIndexType::OLH && key.empty()) {
+ // bogus entry created by https://tracker.ceph.com/issues/46456
+ // to fix, skip it so it doesn't get included in the new bucket instance
+ total_entries--;
+ ldpp_dout(dpp, 10) << "Dropping entry with empty name, idx=" << marker << dendl;
+ continue;
+ }
+ rgw_obj obj(bucket_info.bucket, key);
+ RGWMPObj mp;
+ if (key.ns == RGW_OBJ_NS_MULTIPART && mp.from_meta(key.name)) {
+ // place the multipart .meta object on the same shard as its head object
+ obj.index_hash_source = mp.get_key();
+ }
+ ret = store->getRados()->get_target_shard_id(bucket_info.layout.target_index->layout.normal,
+ obj.get_hash_object(), &target_shard_id);
+ if (ret < 0) {
+ ldpp_dout(dpp, -1) << "ERROR: get_target_shard_id() returned ret=" << ret << dendl;
+ return ret;
+ }
+
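+ // treat a non-positive target shard id (unsharded target layout) as shard index 0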
+ int shard_index = (target_shard_id > 0 ? target_shard_id : 0);
+
+ ret = target_shards_mgr.add_entry(shard_index, entry, account,
+ category, stats);
+ if (ret < 0) {
+ return ret;
+ }
+
+ Clock::time_point now = Clock::now();
+ if (reshard_lock.should_renew(now)) {
+ // assume outer locks have timespans at least as long as ours, so we
+ // can safely renew them inside this conditional
+ if (outer_reshard_lock) {
+ ret = outer_reshard_lock->renew(now);
+ if (ret < 0) {
+ return ret;
+ }
+ }
+ ret = reshard_lock.renew(now);
+ if (ret < 0) {
+ ldpp_dout(dpp, -1) << "Error renewing bucket lock: " << ret << dendl;
+ return ret;
+ }
+ }
+ if (verbose_json_out) {
+ formatter->close_section();
+ formatter->flush(*out);
+ } else if (out && !(total_entries % 1000)) {
+ (*out) << " " << total_entries;
+ }
+ } // entries loop
+ }
+ }
+
+ if (verbose_json_out) {
+ formatter->close_section();
+ formatter->flush(*out);
+ } else if (out) {
+ (*out) << " " << total_entries << std::endl;
+ }
+
+ int ret = target_shards_mgr.finish();
+ if (ret < 0) {
+ ldpp_dout(dpp, -1) << "ERROR: failed to reshard" << dendl;
+ return -EIO;
+ }
+ return 0;
+} // RGWBucketReshard::do_reshard
+
+int RGWBucketReshard::get_status(const DoutPrefixProvider *dpp, list<cls_rgw_bucket_instance_entry> *status)
+{
+ return store->svc()->bi_rados->get_reshard_status(dpp, bucket_info, status);
+}
+
+int RGWBucketReshard::execute(int num_shards,
+ ReshardFaultInjector& fault,
+ int max_op_entries,
+ const DoutPrefixProvider *dpp,
+ bool verbose, ostream *out,
+ Formatter *formatter,
+ RGWReshard* reshard_log)
+{
+ // take a reshard lock on the bucket
+ int ret = reshard_lock.lock(dpp);
+ if (ret < 0) {
+ return ret;
+ }
+ // unlock when scope exits
+ auto unlock = make_scope_guard([this] { reshard_lock.unlock(); });
+
+ if (reshard_log) {
+ ret = reshard_log->update(dpp, bucket_info);
+ if (ret < 0) {
+ return ret;
+ }
+ }
+
+ // prepare the target index and add its layout to the bucket info
+ ret = init_reshard(store, bucket_info, bucket_attrs, fault, num_shards, dpp);
+ if (ret < 0) {
+ return ret;
+ }
+
+ if (ret = fault.check("do_reshard");
+ ret == 0) { // no fault injected, do the reshard
+ ret = do_reshard(bucket_info.layout.current_index,
+ *bucket_info.layout.target_index,
+ max_op_entries, verbose, out, formatter, dpp);
+ }
+
+ if (ret < 0) {
+ cancel_reshard(store, bucket_info, bucket_attrs, fault, dpp);
+
+ ldpp_dout(dpp, 1) << __func__ << " INFO: reshard of bucket \""
+ << bucket_info.bucket.name << "\" canceled due to errors" << dendl;
+ return ret;
+ }
+
+ ret = commit_reshard(store, bucket_info, bucket_attrs, fault, dpp);
+ if (ret < 0) {
+ return ret;
+ }
+
+ ldpp_dout(dpp, 1) << __func__ << " INFO: reshard of bucket \""
+ << bucket_info.bucket.name << "\" completed successfully" << dendl;
+ return 0;
+} // execute
+
+bool RGWBucketReshard::can_reshard(const RGWBucketInfo& bucket,
+ const RGWSI_Zone* zone_svc)
+{
+ return !zone_svc->need_to_log_data() ||
+ bucket.layout.logs.size() < max_bilog_history;
+}
+
+
+RGWReshard::RGWReshard(rgw::sal::RadosStore* _store, bool _verbose, ostream *_out,
+ Formatter *_formatter) :
+ store(_store), instance_lock(bucket_instance_lock_name),
+ verbose(_verbose), out(_out), formatter(_formatter)
+{
+ num_logshards = store->ctx()->_conf.get_val<uint64_t>("rgw_reshard_num_logs");
+}
+
+string RGWReshard::get_logshard_key(const string& tenant,
+ const string& bucket_name)
+{
+ return tenant + ":" + bucket_name;
+}
+
+#define MAX_RESHARD_LOGSHARDS_PRIME 7877
+
+void RGWReshard::get_bucket_logshard_oid(const string& tenant, const string& bucket_name, string *oid)
+{
+ string key = get_logshard_key(tenant, bucket_name);
+
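+ // hash the tenant:bucket key, fold the low byte into the high bits, then
+ // reduce modulo a large prime and the configured logshard count to spread
+ // buckets evenly across the reshard logshards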
+ uint32_t sid = ceph_str_hash_linux(key.c_str(), key.size());
+ uint32_t sid2 = sid ^ ((sid & 0xFF) << 24);
+ sid = sid2 % MAX_RESHARD_LOGSHARDS_PRIME % num_logshards;
+
+ get_logshard_oid(int(sid), oid);
+}
+
+int RGWReshard::add(const DoutPrefixProvider *dpp, cls_rgw_reshard_entry& entry)
+{
+ if (!store->svc()->zone->can_reshard()) {
+ ldpp_dout(dpp, 20) << __func__ << " Resharding is disabled" << dendl;
+ return 0;
+ }
+
+ string logshard_oid;
+
+ get_bucket_logshard_oid(entry.tenant, entry.bucket_name, &logshard_oid);
+
+ librados::ObjectWriteOperation op;
+ cls_rgw_reshard_add(op, entry);
+
+ int ret = rgw_rados_operate(dpp, store->getRados()->reshard_pool_ctx, logshard_oid, &op, null_yield);
+ if (ret < 0) {
+ ldpp_dout(dpp, -1) << "ERROR: failed to add entry to reshard log, oid=" << logshard_oid << " tenant=" << entry.tenant << " bucket=" << entry.bucket_name << dendl;
+ return ret;
+ }
+ return 0;
+}
+
+int RGWReshard::update(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info)
+{
+ cls_rgw_reshard_entry entry;
+ entry.bucket_name = bucket_info.bucket.name;
+ entry.bucket_id = bucket_info.bucket.bucket_id;
+ entry.tenant = bucket_info.owner.tenant;
+
+ int ret = get(dpp, entry);
+ if (ret < 0) {
+ return ret;
+ }
+
+ ret = add(dpp, entry);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << __func__ << ":Error in updating entry bucket " << entry.bucket_name << ": " <<
+ cpp_strerror(-ret) << dendl;
+ }
+
+ return ret;
+}
+
+
+int RGWReshard::list(const DoutPrefixProvider *dpp, int logshard_num, string& marker, uint32_t max, std::list<cls_rgw_reshard_entry>& entries, bool *is_truncated)
+{
+ string logshard_oid;
+
+ get_logshard_oid(logshard_num, &logshard_oid);
+
+ int ret = cls_rgw_reshard_list(store->getRados()->reshard_pool_ctx, logshard_oid, marker, max, entries, is_truncated);
+
+ if (ret == -ENOENT) {
+ // these shard objects aren't created until we actually write something to
+ // them, so treat ENOENT as a successful empty listing
+ *is_truncated = false;
+ ret = 0;
+ } else if (ret == -EACCES) {
+ ldpp_dout(dpp, -1) << "ERROR: access denied to pool " << store->svc()->zone->get_zone_params().reshard_pool
+ << ". Fix the pool access permissions of your client" << dendl;
+ } else if (ret < 0) {
+ ldpp_dout(dpp, -1) << "ERROR: failed to list reshard log entries, oid="
+ << logshard_oid << " marker=" << marker << " " << cpp_strerror(ret) << dendl;
+ }
+
+ return ret;
+}
+
+int RGWReshard::get(const DoutPrefixProvider *dpp, cls_rgw_reshard_entry& entry)
+{
+ string logshard_oid;
+
+ get_bucket_logshard_oid(entry.tenant, entry.bucket_name, &logshard_oid);
+
+ int ret = cls_rgw_reshard_get(store->getRados()->reshard_pool_ctx, logshard_oid, entry);
+ if (ret < 0) {
+ if (ret != -ENOENT) {
+ ldpp_dout(dpp, -1) << "ERROR: failed to get entry from reshard log, oid=" << logshard_oid << " tenant=" << entry.tenant <<
+ " bucket=" << entry.bucket_name << dendl;
+ }
+ return ret;
+ }
+
+ return 0;
+}
+
+int RGWReshard::remove(const DoutPrefixProvider *dpp, const cls_rgw_reshard_entry& entry)
+{
+ string logshard_oid;
+
+ get_bucket_logshard_oid(entry.tenant, entry.bucket_name, &logshard_oid);
+
+ librados::ObjectWriteOperation op;
+ cls_rgw_reshard_remove(op, entry);
+
+ int ret = rgw_rados_operate(dpp, store->getRados()->reshard_pool_ctx, logshard_oid, &op, null_yield);
+ if (ret < 0) {
+ ldpp_dout(dpp, -1) << "ERROR: failed to remove entry from reshard log, oid=" << logshard_oid << " tenant=" << entry.tenant << " bucket=" << entry.bucket_name << dendl;
+ return ret;
+ }
+
+ return ret;
+}
+
+int RGWReshard::clear_bucket_resharding(const DoutPrefixProvider *dpp, const string& bucket_instance_oid, cls_rgw_reshard_entry& entry)
+{
+ int ret = cls_rgw_clear_bucket_resharding(store->getRados()->reshard_pool_ctx, bucket_instance_oid);
+ if (ret < 0) {
+ ldpp_dout(dpp, -1) << "ERROR: failed to clear bucket resharding, bucket_instance_oid=" << bucket_instance_oid << dendl;
+ return ret;
+ }
+
+ return 0;
+}
+
+int RGWReshardWait::wait(optional_yield y)
+{
+ std::unique_lock lock(mutex);
+
+ if (going_down) {
+ return -ECANCELED;
+ }
+
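+ // coroutine callers wait on an asio timer so they don't block the thread;
+ // synchronous callers fall through to the condition variable below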
+ if (y) {
+ auto& context = y.get_io_context();
+ auto& yield = y.get_yield_context();
+
+ Waiter waiter(context);
+ waiters.push_back(waiter);
+ lock.unlock();
+
+ waiter.timer.expires_after(duration);
+
+ boost::system::error_code ec;
+ waiter.timer.async_wait(yield[ec]);
+
+ lock.lock();
+ waiters.erase(waiters.iterator_to(waiter));
+ return -ec.value();
+ }
+
+ cond.wait_for(lock, duration);
+
+ if (going_down) {
+ return -ECANCELED;
+ }
+
+ return 0;
+}
+
+void RGWReshardWait::stop()
+{
+ std::scoped_lock lock(mutex);
+ going_down = true;
+ cond.notify_all();
+ for (auto& waiter : waiters) {
+ // unblock any waiters with ECANCELED
+ waiter.timer.cancel();
+ }
+}
+
+int RGWReshard::process_entry(const cls_rgw_reshard_entry& entry,
+ int max_entries, const DoutPrefixProvider *dpp)
+{
+ ldpp_dout(dpp, 20) << __func__ << " resharding " <<
+ entry.bucket_name << dendl;
+
+ rgw_bucket bucket;
+ RGWBucketInfo bucket_info;
+ std::map<std::string, bufferlist> bucket_attrs;
+
+ int ret = store->getRados()->get_bucket_info(store->svc(),
+ entry.tenant,
+ entry.bucket_name,
+ bucket_info, nullptr,
+ null_yield, dpp,
+ &bucket_attrs);
+ if (ret < 0 || bucket_info.bucket.bucket_id != entry.bucket_id) {
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << __func__ <<
+ ": Error in get_bucket_info for bucket " << entry.bucket_name <<
+ ": " << cpp_strerror(-ret) << dendl;
+ if (ret != -ENOENT) {
+ // any error other than ENOENT will abort
+ return ret;
+ }
+ } else {
+ ldpp_dout(dpp, 0) << __func__ <<
+ ": Bucket: " << entry.bucket_name <<
+ " already resharded by someone, skipping " << dendl;
+ }
+
+ // we've encountered a reshard queue entry for an apparently
+ // non-existent bucket; let's try to recover by cleaning up
+ ldpp_dout(dpp, 0) << __func__ <<
+ ": removing reshard queue entry for a resharded or non-existent bucket" <<
+ entry.bucket_name << dendl;
+
+ ret = remove(dpp, entry);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << __func__ <<
+ ": Error removing non-existent bucket " <<
+ entry.bucket_name << " from resharding queue: " <<
+ cpp_strerror(-ret) << dendl;
+ return ret;
+ }
+
+ // we cleaned up, move on to the next entry
+ return 0;
+ }
+
+ if (!RGWBucketReshard::can_reshard(bucket_info, store->svc()->zone)) {
+ ldpp_dout(dpp, 1) << "Bucket " << bucket_info.bucket << " is not "
+ "eligible for resharding until peer zones finish syncing one "
+ "or more of its old log generations" << dendl;
+ return remove(dpp, entry);
+ }
+
+ RGWBucketReshard br(store, bucket_info, bucket_attrs, nullptr);
+
+ ReshardFaultInjector f; // no fault injected
+ ret = br.execute(entry.new_num_shards, f, max_entries, dpp,
+ false, nullptr, nullptr, this);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << __func__ <<
+ ": Error during resharding bucket " << entry.bucket_name << ":" <<
+ cpp_strerror(-ret)<< dendl;
+ return ret;
+ }
+
+ ldpp_dout(dpp, 20) << __func__ <<
+ " removing reshard queue entry for bucket " << entry.bucket_name <<
+ dendl;
+
+ ret = remove(dpp, entry);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << __func__ << ": Error removing bucket " <<
+ entry.bucket_name << " from resharding queue: " <<
+ cpp_strerror(-ret) << dendl;
+ return ret;
+ }
+ return 0;
+}
+
+int RGWReshard::process_single_logshard(int logshard_num, const DoutPrefixProvider *dpp)
+{
+ string marker;
+ bool truncated = true;
+
+ constexpr uint32_t max_entries = 1000;
+
+ string logshard_oid;
+ get_logshard_oid(logshard_num, &logshard_oid);
+
+ RGWBucketReshardLock logshard_lock(store, logshard_oid, false);
+
+ int ret = logshard_lock.lock(dpp);
+ if (ret < 0) {
+ ldpp_dout(dpp, 5) << __func__ << "(): failed to acquire lock on " <<
+ logshard_oid << ", ret = " << ret <<dendl;
+ return ret;
+ }
+
+ do {
+ std::list<cls_rgw_reshard_entry> entries;
+ ret = list(dpp, logshard_num, marker, max_entries, entries, &truncated);
+ if (ret < 0) {
+ ldpp_dout(dpp, 10) << "cannot list all reshards in logshard oid=" <<
+ logshard_oid << dendl;
+ continue;
+ }
+
+ for (auto& entry : entries) { // logshard entries
+ ret = process_entry(entry, max_entries, dpp);
+ if (ret < 0) {
+ return ret;
+ }
+
+ Clock::time_point now = Clock::now();
+ if (logshard_lock.should_renew(now)) {
+ ret = logshard_lock.renew(now);
+ if (ret < 0) {
+ return ret;
+ }
+ }
+
+ entry.get_key(&marker);
+ } // entry for loop
+ } while (truncated);
+
+ logshard_lock.unlock();
+ return 0;
+}
+
+
+void RGWReshard::get_logshard_oid(int shard_num, string *logshard)
+{
+ char buf[32];
+ snprintf(buf, sizeof(buf), "%010u", (unsigned)shard_num);
+
+ string objname(reshard_oid_prefix);
+ *logshard = objname + buf;
+}
+
+int RGWReshard::process_all_logshards(const DoutPrefixProvider *dpp)
+{
+ int ret = 0;
+
+ for (int i = 0; i < num_logshards; i++) {
+ string logshard;
+ get_logshard_oid(i, &logshard);
+
+ ldpp_dout(dpp, 20) << "processing logshard = " << logshard << dendl;
+
+ ret = process_single_logshard(i, dpp);
+
+ ldpp_dout(dpp, 20) << "finish processing logshard = " << logshard << " , ret = " << ret << dendl;
+ }
+
+ return 0;
+}
+
+bool RGWReshard::going_down()
+{
+ return down_flag;
+}
+
+void RGWReshard::start_processor()
+{
+ worker = new ReshardWorker(store->ctx(), this);
+ worker->create("rgw_reshard");
+}
+
+void RGWReshard::stop_processor()
+{
+ down_flag = true;
+ if (worker) {
+ worker->stop();
+ worker->join();
+ }
+ delete worker;
+ worker = nullptr;
+}
+
+void *RGWReshard::ReshardWorker::entry() {
+ do {
+ utime_t start = ceph_clock_now();
+ reshard->process_all_logshards(this);
+
+ if (reshard->going_down())
+ break;
+
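+ // sleep for whatever remains of rgw_reshard_thread_interval, unless the
+ // pass already took longer than the interval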
+ utime_t end = ceph_clock_now();
+ end -= start;
+ int secs = cct->_conf.get_val<uint64_t>("rgw_reshard_thread_interval");
+
+ if (secs <= end.sec())
+ continue; // next round
+
+ secs -= end.sec();
+
+ std::unique_lock locker{lock};
+ cond.wait_for(locker, std::chrono::seconds(secs));
+ } while (!reshard->going_down());
+
+ return NULL;
+}
+
+void RGWReshard::ReshardWorker::stop()
+{
+ std::lock_guard l{lock};
+ cond.notify_all();
+}
+
+CephContext *RGWReshard::ReshardWorker::get_cct() const
+{
+ return cct;
+}
+
+unsigned RGWReshard::ReshardWorker::get_subsys() const
+{
+ return dout_subsys;
+}
+
+std::ostream& RGWReshard::ReshardWorker::gen_prefix(std::ostream& out) const
+{
+ return out << "rgw reshard worker thread: ";
+}
diff --git a/src/rgw/driver/rados/rgw_reshard.h b/src/rgw/driver/rados/rgw_reshard.h
new file mode 100644
index 000000000..59819f3a5
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_reshard.h
@@ -0,0 +1,274 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include <vector>
+#include <initializer_list>
+#include <functional>
+#include <iterator>
+#include <algorithm>
+
+#include <boost/intrusive/list.hpp>
+#include <boost/asio/basic_waitable_timer.hpp>
+
+#include "include/common_fwd.h"
+#include "include/rados/librados.hpp"
+#include "common/ceph_time.h"
+#include "common/async/yield_context.h"
+#include "cls/rgw/cls_rgw_types.h"
+#include "cls/lock/cls_lock_client.h"
+
+#include "rgw_common.h"
+#include "common/fault_injector.h"
+
+
+class RGWReshard;
+namespace rgw { namespace sal {
+ class RadosStore;
+} }
+
+using ReshardFaultInjector = FaultInjector<std::string_view>;
+
+class RGWBucketReshardLock {
+ using Clock = ceph::coarse_mono_clock;
+
+ rgw::sal::RadosStore* store;
+ const std::string lock_oid;
+ const bool ephemeral;
+ rados::cls::lock::Lock internal_lock;
+ std::chrono::seconds duration;
+
+ Clock::time_point start_time;
+ Clock::time_point renew_thresh;
+
+ void reset_time(const Clock::time_point& now) {
+ start_time = now;
+ renew_thresh = start_time + duration / 2;
+ }
+
+public:
+ RGWBucketReshardLock(rgw::sal::RadosStore* _store,
+ const std::string& reshard_lock_oid,
+ bool _ephemeral);
+ RGWBucketReshardLock(rgw::sal::RadosStore* _store,
+ const RGWBucketInfo& bucket_info,
+ bool _ephemeral) :
+ RGWBucketReshardLock(_store, bucket_info.bucket.get_key(':'), _ephemeral)
+ {}
+
+ int lock(const DoutPrefixProvider *dpp);
+ void unlock();
+ int renew(const Clock::time_point&);
+
+ bool should_renew(const Clock::time_point& now) const {
+ return now >= renew_thresh;
+ }
+}; // class RGWBucketReshardLock
+
+class RGWBucketReshard {
+ public:
+ using Clock = ceph::coarse_mono_clock;
+
+ private:
+ rgw::sal::RadosStore *store;
+ RGWBucketInfo bucket_info;
+ std::map<std::string, bufferlist> bucket_attrs;
+
+ RGWBucketReshardLock reshard_lock;
+ RGWBucketReshardLock* outer_reshard_lock;
+
+ // using an initializer_list as an array in contiguous memory
+ // allocated all at once
+ static const std::initializer_list<uint16_t> reshard_primes;
+
+ int do_reshard(const rgw::bucket_index_layout_generation& current,
+ const rgw::bucket_index_layout_generation& target,
+ int max_entries,
+ bool verbose,
+ std::ostream *os,
+ Formatter *formatter,
+ const DoutPrefixProvider *dpp);
+public:
+
+ // pass nullptr for the final parameter if no outer reshard lock to
+ // manage
+ RGWBucketReshard(rgw::sal::RadosStore* _store,
+ const RGWBucketInfo& _bucket_info,
+ const std::map<std::string, bufferlist>& _bucket_attrs,
+ RGWBucketReshardLock* _outer_reshard_lock);
+ int execute(int num_shards, ReshardFaultInjector& f,
+ int max_op_entries, const DoutPrefixProvider *dpp,
+ bool verbose = false, std::ostream *out = nullptr,
+ ceph::Formatter *formatter = nullptr,
+ RGWReshard *reshard_log = nullptr);
+ int get_status(const DoutPrefixProvider *dpp, std::list<cls_rgw_bucket_instance_entry> *status);
+ int cancel(const DoutPrefixProvider* dpp);
+
+ static int clear_resharding(rgw::sal::RadosStore* store,
+ RGWBucketInfo& bucket_info,
+ std::map<std::string, bufferlist>& bucket_attrs,
+ const DoutPrefixProvider* dpp);
+
+ static uint32_t get_max_prime_shards() {
+ return *std::crbegin(reshard_primes);
+ }
+
+ // returns the prime in our list less than or equal to the
+ // parameter; the lowest value that can be returned is 1
+ static uint32_t get_prime_shards_less_or_equal(uint32_t requested_shards) {
+ auto it = std::upper_bound(reshard_primes.begin(), reshard_primes.end(),
+ requested_shards);
+ if (it == reshard_primes.begin()) {
+ return 1;
+ } else {
+ return *(--it);
+ }
+ }
+
+ // returns the prime in our list greater than or equal to the
+ // parameter; if we do not have such a prime, 0 is returned
+ static uint32_t get_prime_shards_greater_or_equal(
+ uint32_t requested_shards)
+ {
+ auto it = std::lower_bound(reshard_primes.begin(), reshard_primes.end(),
+ requested_shards);
+ if (it == reshard_primes.end()) {
+ return 0;
+ } else {
+ return *it;
+ }
+ }
+
+ // returns a preferred number of shards given a calculated number of
+ // shards based on max_dynamic_shards and the list of prime values
+ static uint32_t get_preferred_shards(uint32_t suggested_shards,
+ uint32_t max_dynamic_shards) {
+
+ // use a prime if max is within our prime range, otherwise use
+ // specified max
+ const uint32_t absolute_max =
+ max_dynamic_shards >= get_max_prime_shards() ?
+ max_dynamic_shards :
+ get_prime_shards_less_or_equal(max_dynamic_shards);
+
+ // if we can use a prime number, use it, otherwise use suggested;
+ // note get_prime_shards_greater_or_equal will return 0 if no prime in
+ // prime range
+ const uint32_t prime_ish_num_shards =
+ std::max(get_prime_shards_greater_or_equal(suggested_shards),
+ suggested_shards);
+
+ // dynamic sharding cannot reshard more than defined maximum
+ const uint32_t final_num_shards =
+ std::min(prime_ish_num_shards, absolute_max);
+
+ return final_num_shards;
+ }
+
+ const std::map<std::string, bufferlist>& get_bucket_attrs() const {
+ return bucket_attrs;
+ }
+
+ // for multisite, the RGWBucketInfo keeps a history of old log generations
+ // until all peers are done with them. prevent this log history from growing
+ // too large by refusing to reshard the bucket until the old logs get trimmed
+ static constexpr size_t max_bilog_history = 4;
+
+ static bool can_reshard(const RGWBucketInfo& bucket,
+ const RGWSI_Zone* zone_svc);
+}; // RGWBucketReshard
+
+
+class RGWReshard {
+public:
+ using Clock = ceph::coarse_mono_clock;
+
+private:
+ rgw::sal::RadosStore* store;
+ std::string lock_name;
+ rados::cls::lock::Lock instance_lock;
+ int num_logshards;
+
+ bool verbose;
+ std::ostream *out;
+ Formatter *formatter;
+
+ void get_logshard_oid(int shard_num, std::string *shard);
+protected:
+ class ReshardWorker : public Thread, public DoutPrefixProvider {
+ CephContext *cct;
+ RGWReshard *reshard;
+ ceph::mutex lock = ceph::make_mutex("ReshardWorker");
+ ceph::condition_variable cond;
+
+ public:
+ ReshardWorker(CephContext * const _cct,
+ RGWReshard * const _reshard)
+ : cct(_cct),
+ reshard(_reshard) {}
+
+ void *entry() override;
+ void stop();
+
+ CephContext *get_cct() const override;
+ unsigned get_subsys() const override;
+ std::ostream& gen_prefix(std::ostream& out) const override;
+ };
+
+ ReshardWorker *worker = nullptr;
+ std::atomic<bool> down_flag = { false };
+
+ std::string get_logshard_key(const std::string& tenant, const std::string& bucket_name);
+ void get_bucket_logshard_oid(const std::string& tenant, const std::string& bucket_name, std::string *oid);
+
+public:
+ RGWReshard(rgw::sal::RadosStore* _store, bool _verbose = false, std::ostream *_out = nullptr, Formatter *_formatter = nullptr);
+ int add(const DoutPrefixProvider *dpp, cls_rgw_reshard_entry& entry);
+ int update(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info);
+ int get(const DoutPrefixProvider *dpp, cls_rgw_reshard_entry& entry);
+ int remove(const DoutPrefixProvider *dpp, const cls_rgw_reshard_entry& entry);
+ int list(const DoutPrefixProvider *dpp, int logshard_num, std::string& marker, uint32_t max, std::list<cls_rgw_reshard_entry>& entries, bool *is_truncated);
+ int clear_bucket_resharding(const DoutPrefixProvider *dpp, const std::string& bucket_instance_oid, cls_rgw_reshard_entry& entry);
+
+ /* reshard thread */
+ int process_entry(const cls_rgw_reshard_entry& entry, int max_entries,
+ const DoutPrefixProvider *dpp);
+ int process_single_logshard(int logshard_num, const DoutPrefixProvider *dpp);
+ int process_all_logshards(const DoutPrefixProvider *dpp);
+ bool going_down();
+ void start_processor();
+ void stop_processor();
+};
+
+class RGWReshardWait {
+ public:
+ // the blocking wait uses std::condition_variable::wait_for(), which uses the
+ // std::chrono::steady_clock. use that for the async waits as well
+ using Clock = std::chrono::steady_clock;
+ private:
+ const ceph::timespan duration;
+ ceph::mutex mutex = ceph::make_mutex("RGWReshardWait::lock");
+ ceph::condition_variable cond;
+
+ struct Waiter : boost::intrusive::list_base_hook<> {
+ using Executor = boost::asio::io_context::executor_type;
+ using Timer = boost::asio::basic_waitable_timer<Clock,
+ boost::asio::wait_traits<Clock>, Executor>;
+ Timer timer;
+ explicit Waiter(boost::asio::io_context& ioc) : timer(ioc) {}
+ };
+ boost::intrusive::list<Waiter> waiters;
+
+ bool going_down{false};
+
+public:
+ RGWReshardWait(ceph::timespan duration = std::chrono::seconds(5))
+ : duration(duration) {}
+ ~RGWReshardWait() {
+ ceph_assert(going_down);
+ }
+ int wait(optional_yield y);
+ // unblock any threads waiting on reshard
+ void stop();
+};
diff --git a/src/rgw/driver/rados/rgw_rest_bucket.cc b/src/rgw/driver/rados/rgw_rest_bucket.cc
new file mode 100644
index 000000000..ebe4e429c
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_rest_bucket.cc
@@ -0,0 +1,413 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "rgw_op.h"
+#include "driver/rados/rgw_bucket.h"
+#include "rgw_rest_bucket.h"
+#include "rgw_sal.h"
+
+#include "include/str_list.h"
+
+#include "services/svc_sys_obj.h"
+#include "services/svc_zone.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+class RGWOp_Bucket_Info : public RGWRESTOp {
+
+public:
+ RGWOp_Bucket_Info() {}
+
+ int check_caps(const RGWUserCaps& caps) override {
+ return caps.check_cap("buckets", RGW_CAP_READ);
+ }
+
+ void execute(optional_yield y) override;
+
+ const char* name() const override { return "get_bucket_info"; }
+};
+
+void RGWOp_Bucket_Info::execute(optional_yield y)
+{
+ RGWBucketAdminOpState op_state;
+
+ bool fetch_stats;
+
+ std::string bucket;
+
+ string uid_str;
+
+ RESTArgs::get_string(s, "uid", uid_str, &uid_str);
+ rgw_user uid(uid_str);
+
+ RESTArgs::get_string(s, "bucket", bucket, &bucket);
+ RESTArgs::get_bool(s, "stats", false, &fetch_stats);
+
+ op_state.set_user_id(uid);
+ op_state.set_bucket_name(bucket);
+ op_state.set_fetch_stats(fetch_stats);
+
+ op_ret = RGWBucketAdminOp::info(driver, op_state, flusher, y, this);
+}
+
+class RGWOp_Get_Policy : public RGWRESTOp {
+
+public:
+ RGWOp_Get_Policy() {}
+
+ int check_caps(const RGWUserCaps& caps) override {
+ return caps.check_cap("buckets", RGW_CAP_READ);
+ }
+
+ void execute(optional_yield y) override;
+
+ const char* name() const override { return "get_policy"; }
+};
+
+void RGWOp_Get_Policy::execute(optional_yield y)
+{
+ RGWBucketAdminOpState op_state;
+
+ std::string bucket;
+ std::string object;
+
+ RESTArgs::get_string(s, "bucket", bucket, &bucket);
+ RESTArgs::get_string(s, "object", object, &object);
+
+ op_state.set_bucket_name(bucket);
+ op_state.set_object(object);
+
+ op_ret = RGWBucketAdminOp::get_policy(driver, op_state, flusher, this);
+}
+
+class RGWOp_Check_Bucket_Index : public RGWRESTOp {
+
+public:
+ RGWOp_Check_Bucket_Index() {}
+
+ int check_caps(const RGWUserCaps& caps) override {
+ return caps.check_cap("buckets", RGW_CAP_WRITE);
+ }
+
+ void execute(optional_yield y) override;
+
+ const char* name() const override { return "check_bucket_index"; }
+};
+
+void RGWOp_Check_Bucket_Index::execute(optional_yield y)
+{
+ std::string bucket;
+
+ bool fix_index;
+ bool check_objects;
+
+ RGWBucketAdminOpState op_state;
+
+ RESTArgs::get_string(s, "bucket", bucket, &bucket);
+ RESTArgs::get_bool(s, "fix", false, &fix_index);
+ RESTArgs::get_bool(s, "check-objects", false, &check_objects);
+
+ op_state.set_bucket_name(bucket);
+ op_state.set_fix_index(fix_index);
+ op_state.set_check_objects(check_objects);
+
+ op_ret = RGWBucketAdminOp::check_index(driver, op_state, flusher, s->yield, s);
+}
+
+class RGWOp_Bucket_Link : public RGWRESTOp {
+
+public:
+ RGWOp_Bucket_Link() {}
+
+ int check_caps(const RGWUserCaps& caps) override {
+ return caps.check_cap("buckets", RGW_CAP_WRITE);
+ }
+
+ void execute(optional_yield y) override;
+
+ const char* name() const override { return "link_bucket"; }
+};
+
+void RGWOp_Bucket_Link::execute(optional_yield y)
+{
+ std::string uid_str;
+ std::string bucket;
+ std::string bucket_id;
+ std::string new_bucket_name;
+
+ RGWBucketAdminOpState op_state;
+
+ RESTArgs::get_string(s, "uid", uid_str, &uid_str);
+ RESTArgs::get_string(s, "bucket", bucket, &bucket);
+ RESTArgs::get_string(s, "bucket-id", bucket_id, &bucket_id);
+ RESTArgs::get_string(s, "new-bucket-name", new_bucket_name, &new_bucket_name);
+
+ rgw_user uid(uid_str);
+ op_state.set_user_id(uid);
+ op_state.set_bucket_name(bucket);
+ op_state.set_bucket_id(bucket_id);
+ op_state.set_new_bucket_name(new_bucket_name);
+
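+ // forward the request to the metadata master zone before applying the link locally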
+ bufferlist data;
+ op_ret = driver->forward_request_to_master(s, s->user.get(), nullptr, data, nullptr, s->info, y);
+ if (op_ret < 0) {
+ ldpp_dout(this, 0) << "forward_request_to_master returned ret=" << op_ret << dendl;
+ return;
+ }
+ op_ret = RGWBucketAdminOp::link(driver, op_state, s);
+}
+
+class RGWOp_Bucket_Unlink : public RGWRESTOp {
+
+public:
+ RGWOp_Bucket_Unlink() {}
+
+ int check_caps(const RGWUserCaps& caps) override {
+ return caps.check_cap("buckets", RGW_CAP_WRITE);
+ }
+
+ void execute(optional_yield y) override;
+
+ const char* name() const override { return "unlink_bucket"; }
+};
+
+void RGWOp_Bucket_Unlink::execute(optional_yield y)
+{
+ std::string uid_str;
+ std::string bucket;
+
+ RGWBucketAdminOpState op_state;
+
+ RESTArgs::get_string(s, "uid", uid_str, &uid_str);
+ rgw_user uid(uid_str);
+
+ RESTArgs::get_string(s, "bucket", bucket, &bucket);
+
+ op_state.set_user_id(uid);
+ op_state.set_bucket_name(bucket);
+
+ bufferlist data;
+ op_ret = driver->forward_request_to_master(s, s->user.get(), nullptr, data, nullptr, s->info, y);
+ if (op_ret < 0) {
+ ldpp_dout(this, 0) << "forward_request_to_master returned ret=" << op_ret << dendl;
+ return;
+ }
+ op_ret = RGWBucketAdminOp::unlink(driver, op_state, s);
+}
+
+class RGWOp_Bucket_Remove : public RGWRESTOp {
+
+public:
+ RGWOp_Bucket_Remove() {}
+
+ int check_caps(const RGWUserCaps& caps) override {
+ return caps.check_cap("buckets", RGW_CAP_WRITE);
+ }
+
+ void execute(optional_yield y) override;
+
+ const char* name() const override { return "remove_bucket"; }
+};
+
+void RGWOp_Bucket_Remove::execute(optional_yield y)
+{
+ std::string bucket_name;
+ bool delete_children;
+ std::unique_ptr<rgw::sal::Bucket> bucket;
+
+ RESTArgs::get_string(s, "bucket", bucket_name, &bucket_name);
+ RESTArgs::get_bool(s, "purge-objects", false, &delete_children);
+
+ /* FIXME We're abusing the owner of the bucket to pass the user, so that it can be forwarded to
+ * the master. This user is actually the OP caller, not the bucket owner. */
+ op_ret = driver->get_bucket(s, s->user.get(), string(), bucket_name, &bucket, y);
+ if (op_ret < 0) {
+ ldpp_dout(this, 0) << "get_bucket returned ret=" << op_ret << dendl;
+ if (op_ret == -ENOENT) {
+ op_ret = -ERR_NO_SUCH_BUCKET;
+ }
+ return;
+ }
+
+ op_ret = bucket->remove_bucket(s, delete_children, true, &s->info, s->yield);
+}
+
+class RGWOp_Set_Bucket_Quota : public RGWRESTOp {
+
+public:
+ RGWOp_Set_Bucket_Quota() {}
+
+ int check_caps(const RGWUserCaps& caps) override {
+ return caps.check_cap("buckets", RGW_CAP_WRITE);
+ }
+
+ void execute(optional_yield y) override;
+
+ const char* name() const override { return "set_bucket_quota"; }
+};
+
+#define QUOTA_INPUT_MAX_LEN 1024
+
+void RGWOp_Set_Bucket_Quota::execute(optional_yield y)
+{
+ bool uid_arg_existed = false;
+ std::string uid_str;
+ RESTArgs::get_string(s, "uid", uid_str, &uid_str, &uid_arg_existed);
+ if (! uid_arg_existed) {
+ op_ret = -EINVAL;
+ return;
+ }
+ rgw_user uid(uid_str);
+ bool bucket_arg_existed = false;
+ std::string bucket_name;
+ RESTArgs::get_string(s, "bucket", bucket_name, &bucket_name, &bucket_arg_existed);
+ if (! bucket_arg_existed) {
+ op_ret = -EINVAL;
+ return;
+ }
+
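+ // prefer a JSON request body when one was provided; otherwise (or when a
+ // chunked request carries no content) fall back to query-string parameters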
+ bool use_http_params;
+
+ if (s->content_length > 0) {
+ use_http_params = false;
+ } else {
+ const char *encoding = s->info.env->get("HTTP_TRANSFER_ENCODING");
+ use_http_params = (!encoding || strcmp(encoding, "chunked") != 0);
+ }
+ RGWQuotaInfo quota;
+ if (!use_http_params) {
+ bool empty;
+ op_ret = get_json_input(driver->ctx(), s, quota, QUOTA_INPUT_MAX_LEN, &empty);
+ if (op_ret < 0) {
+ if (!empty)
+ return;
+ /* was probably chunked input, but no content provided, configure via http params */
+ use_http_params = true;
+ }
+ }
+ if (use_http_params) {
+ std::unique_ptr<rgw::sal::Bucket> bucket;
+ op_ret = driver->get_bucket(s, nullptr, uid.tenant, bucket_name, &bucket, s->yield);
+ if (op_ret < 0) {
+ return;
+ }
+ RGWQuotaInfo *old_quota = &bucket->get_info().quota;
+ int64_t old_max_size_kb = rgw_rounded_kb(old_quota->max_size);
+ int64_t max_size_kb;
+ bool has_max_size_kb = false;
+ RESTArgs::get_int64(s, "max-objects", old_quota->max_objects, &quota.max_objects);
+ RESTArgs::get_int64(s, "max-size", old_quota->max_size, &quota.max_size);
+ RESTArgs::get_int64(s, "max-size-kb", old_max_size_kb, &max_size_kb, &has_max_size_kb);
+ if (has_max_size_kb)
+ quota.max_size = max_size_kb * 1024;
+ RESTArgs::get_bool(s, "enabled", old_quota->enabled, &quota.enabled);
+ }
+
+ RGWBucketAdminOpState op_state;
+ op_state.set_user_id(uid);
+ op_state.set_bucket_name(bucket_name);
+ op_state.set_quota(quota);
+
+ op_ret = RGWBucketAdminOp::set_quota(driver, op_state, s);
+}
+
+class RGWOp_Sync_Bucket : public RGWRESTOp {
+
+public:
+ RGWOp_Sync_Bucket() {}
+
+ int check_caps(const RGWUserCaps& caps) override {
+ return caps.check_cap("buckets", RGW_CAP_WRITE);
+ }
+
+ void execute(optional_yield y) override;
+
+ const char* name() const override { return "sync_bucket"; }
+};
+
+void RGWOp_Sync_Bucket::execute(optional_yield y)
+{
+ std::string bucket;
+ std::string tenant;
+ bool sync_bucket;
+
+ RGWBucketAdminOpState op_state;
+ RESTArgs::get_string(s, "bucket", bucket, &bucket);
+ RESTArgs::get_string(s, "tenant", tenant, &tenant);
+ RESTArgs::get_bool(s, "sync", true, &sync_bucket);
+
+ op_state.set_bucket_name(bucket);
+ op_state.set_tenant(tenant);
+ op_state.set_sync_bucket(sync_bucket);
+
+ op_ret = RGWBucketAdminOp::sync_bucket(driver, op_state, s);
+}
+
+class RGWOp_Object_Remove: public RGWRESTOp {
+
+public:
+ RGWOp_Object_Remove() {}
+
+ int check_caps(const RGWUserCaps& caps) override {
+ return caps.check_cap("buckets", RGW_CAP_WRITE);
+ }
+
+ void execute(optional_yield y) override;
+
+ const char* name() const override { return "remove_object"; }
+};
+
+void RGWOp_Object_Remove::execute(optional_yield y)
+{
+ std::string bucket;
+ std::string object;
+
+ RGWBucketAdminOpState op_state;
+
+ RESTArgs::get_string(s, "bucket", bucket, &bucket);
+ RESTArgs::get_string(s, "object", object, &object);
+
+ op_state.set_bucket_name(bucket);
+ op_state.set_object(object);
+
+ op_ret = RGWBucketAdminOp::remove_object(driver, op_state, s);
+}
+
+
+RGWOp *RGWHandler_Bucket::op_get()
+{
+
+ if (s->info.args.sub_resource_exists("policy"))
+ return new RGWOp_Get_Policy;
+
+ if (s->info.args.sub_resource_exists("index"))
+ return new RGWOp_Check_Bucket_Index;
+
+ return new RGWOp_Bucket_Info;
+}
+
+RGWOp *RGWHandler_Bucket::op_put()
+{
+ if (s->info.args.sub_resource_exists("quota"))
+ return new RGWOp_Set_Bucket_Quota;
+
+ if (s->info.args.sub_resource_exists("sync"))
+ return new RGWOp_Sync_Bucket;
+
+ return new RGWOp_Bucket_Link;
+}
+
+RGWOp *RGWHandler_Bucket::op_post()
+{
+ return new RGWOp_Bucket_Unlink;
+}
+
+RGWOp *RGWHandler_Bucket::op_delete()
+{
+ if (s->info.args.sub_resource_exists("object"))
+ return new RGWOp_Object_Remove;
+
+ return new RGWOp_Bucket_Remove;
+}
diff --git a/src/rgw/driver/rados/rgw_rest_bucket.h b/src/rgw/driver/rados/rgw_rest_bucket.h
new file mode 100644
index 000000000..00f0b6439
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_rest_bucket.h
@@ -0,0 +1,36 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include "rgw_rest.h"
+#include "rgw_rest_s3.h"
+
+
+class RGWHandler_Bucket : public RGWHandler_Auth_S3 {
+protected:
+ RGWOp *op_get() override;
+ RGWOp *op_put() override;
+ RGWOp *op_post() override;
+ RGWOp *op_delete() override;
+public:
+ using RGWHandler_Auth_S3::RGWHandler_Auth_S3;
+ ~RGWHandler_Bucket() override = default;
+
+ int read_permissions(RGWOp*, optional_yield y) override {
+ return 0;
+ }
+};
+
+class RGWRESTMgr_Bucket : public RGWRESTMgr {
+public:
+ RGWRESTMgr_Bucket() = default;
+ ~RGWRESTMgr_Bucket() override = default;
+
+ RGWHandler_REST* get_handler(rgw::sal::Driver* driver,
+ req_state*,
+ const rgw::auth::StrategyRegistry& auth_registry,
+ const std::string&) override {
+ return new RGWHandler_Bucket(auth_registry);
+ }
+};
diff --git a/src/rgw/driver/rados/rgw_rest_log.cc b/src/rgw/driver/rados/rgw_rest_log.cc
new file mode 100644
index 000000000..f4099807d
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_rest_log.cc
@@ -0,0 +1,1268 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 eNovance SAS <licensing@enovance.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "common/ceph_json.h"
+#include "common/strtol.h"
+#include "rgw_rest.h"
+#include "rgw_op.h"
+#include "rgw_rest_s3.h"
+#include "rgw_rest_log.h"
+#include "rgw_client_io.h"
+#include "rgw_sync.h"
+#include "rgw_data_sync.h"
+#include "rgw_common.h"
+#include "rgw_zone.h"
+#include "rgw_mdlog.h"
+#include "rgw_datalog_notify.h"
+#include "rgw_trim_bilog.h"
+
+#include "services/svc_zone.h"
+#include "services/svc_mdlog.h"
+#include "services/svc_bilog_rados.h"
+
+#include "common/errno.h"
+#include "include/ceph_assert.h"
+
+#define dout_context g_ceph_context
+#define LOG_CLASS_LIST_MAX_ENTRIES (1000)
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+void RGWOp_MDLog_List::execute(optional_yield y) {
+ string period = s->info.args.get("period");
+ string shard = s->info.args.get("id");
+ string max_entries_str = s->info.args.get("max-entries");
+ string marker = s->info.args.get("marker"),
+ err;
+ void *handle;
+ unsigned shard_id, max_entries = LOG_CLASS_LIST_MAX_ENTRIES;
+
+ if (s->info.args.exists("start-time") ||
+ s->info.args.exists("end-time")) {
+ ldpp_dout(this, 5) << "start-time and end-time are no longer accepted" << dendl;
+ op_ret = -EINVAL;
+ return;
+ }
+
+ shard_id = (unsigned)strict_strtol(shard.c_str(), 10, &err);
+ if (!err.empty()) {
+ ldpp_dout(this, 5) << "Error parsing shard_id " << shard << dendl;
+ op_ret = -EINVAL;
+ return;
+ }
+
+ if (!max_entries_str.empty()) {
+ max_entries = (unsigned)strict_strtol(max_entries_str.c_str(), 10, &err);
+ if (!err.empty()) {
+ ldpp_dout(this, 5) << "Error parsing max-entries " << max_entries_str << dendl;
+ op_ret = -EINVAL;
+ return;
+ }
+ if (max_entries > LOG_CLASS_LIST_MAX_ENTRIES) {
+ max_entries = LOG_CLASS_LIST_MAX_ENTRIES;
+ }
+ }
+
+ if (period.empty()) {
+ ldpp_dout(this, 5) << "Missing period id trying to use current" << dendl;
+ period = driver->get_zone()->get_current_period_id();
+ if (period.empty()) {
+ ldpp_dout(this, 5) << "Missing period id" << dendl;
+ op_ret = -EINVAL;
+ return;
+ }
+ }
+
+ RGWMetadataLog meta_log{s->cct, static_cast<rgw::sal::RadosStore*>(driver)->svc()->zone, static_cast<rgw::sal::RadosStore*>(driver)->svc()->cls, period};
+
+ meta_log.init_list_entries(shard_id, {}, {}, marker, &handle);
+
+ op_ret = meta_log.list_entries(this, handle, max_entries, entries,
+ &last_marker, &truncated);
+
+ meta_log.complete_list_entries(handle);
+}
+
+void RGWOp_MDLog_List::send_response() {
+ set_req_state_err(s, op_ret);
+ dump_errno(s);
+ end_header(s);
+
+ if (op_ret < 0)
+ return;
+
+ s->formatter->open_object_section("log_entries");
+ s->formatter->dump_string("marker", last_marker);
+ s->formatter->dump_bool("truncated", truncated);
+ {
+ s->formatter->open_array_section("entries");
+ for (list<cls_log_entry>::iterator iter = entries.begin();
+ iter != entries.end(); ++iter) {
+ cls_log_entry& entry = *iter;
+ static_cast<rgw::sal::RadosStore*>(driver)->ctl()->meta.mgr->dump_log_entry(entry, s->formatter);
+ flusher.flush();
+ }
+ s->formatter->close_section();
+ }
+ s->formatter->close_section();
+ flusher.flush();
+}
+
+void RGWOp_MDLog_Info::execute(optional_yield y) {
+ num_objects = s->cct->_conf->rgw_md_log_max_shards;
+ period = static_cast<rgw::sal::RadosStore*>(driver)->svc()->mdlog->read_oldest_log_period(y, s);
+ op_ret = period.get_error();
+}
+
+void RGWOp_MDLog_Info::send_response() {
+ set_req_state_err(s, op_ret);
+ dump_errno(s);
+ end_header(s);
+
+ s->formatter->open_object_section("mdlog");
+ s->formatter->dump_unsigned("num_objects", num_objects);
+ if (period) {
+ s->formatter->dump_string("period", period.get_period().get_id());
+ s->formatter->dump_unsigned("realm_epoch", period.get_epoch());
+ }
+ s->formatter->close_section();
+ flusher.flush();
+}
+
+void RGWOp_MDLog_ShardInfo::execute(optional_yield y) {
+ string period = s->info.args.get("period");
+ string shard = s->info.args.get("id");
+ string err;
+
+ unsigned shard_id = (unsigned)strict_strtol(shard.c_str(), 10, &err);
+ if (!err.empty()) {
+ ldpp_dout(this, 5) << "Error parsing shard_id " << shard << dendl;
+ op_ret = -EINVAL;
+ return;
+ }
+
+ if (period.empty()) {
+ ldpp_dout(this, 5) << "Missing period id trying to use current" << dendl;
+ period = driver->get_zone()->get_current_period_id();
+
+ if (period.empty()) {
+ ldpp_dout(this, 5) << "Missing period id" << dendl;
+ op_ret = -EINVAL;
+ return;
+ }
+ }
+ RGWMetadataLog meta_log{s->cct, static_cast<rgw::sal::RadosStore*>(driver)->svc()->zone, static_cast<rgw::sal::RadosStore*>(driver)->svc()->cls, period};
+
+ op_ret = meta_log.get_info(this, shard_id, &info);
+}
+
+void RGWOp_MDLog_ShardInfo::send_response() {
+ set_req_state_err(s, op_ret);
+ dump_errno(s);
+ end_header(s);
+
+ encode_json("info", info, s->formatter);
+ flusher.flush();
+}
+
+void RGWOp_MDLog_Delete::execute(optional_yield y) {
+ string marker = s->info.args.get("marker"),
+ period = s->info.args.get("period"),
+ shard = s->info.args.get("id"),
+ err;
+ unsigned shard_id;
+
+
+ if (s->info.args.exists("start-time") ||
+ s->info.args.exists("end-time")) {
+ ldpp_dout(this, 5) << "start-time and end-time are no longer accepted" << dendl;
+ op_ret = -EINVAL;
+ }
+
+ if (s->info.args.exists("start-marker")) {
+ ldpp_dout(this, 5) << "start-marker is no longer accepted" << dendl;
+ op_ret = -EINVAL;
+ }
+
+ if (s->info.args.exists("end-marker")) {
+ if (!s->info.args.exists("marker")) {
+ marker = s->info.args.get("end-marker");
+ } else {
+ ldpp_dout(this, 5) << "end-marker and marker cannot both be provided" << dendl;
+ op_ret = -EINVAL;
+ }
+ }
+
+ if (op_ret < 0) {
+   return;
+ }
+
+ shard_id = (unsigned)strict_strtol(shard.c_str(), 10, &err);
+ if (!err.empty()) {
+ ldpp_dout(this, 5) << "Error parsing shard_id " << shard << dendl;
+ op_ret = -EINVAL;
+ return;
+ }
+
+ if (marker.empty()) { /* bounding end */
+ op_ret = -EINVAL;
+ return;
+ }
+
+ if (period.empty()) {
+ ldpp_dout(this, 5) << "Missing period id trying to use current" << dendl;
+ period = driver->get_zone()->get_current_period_id();
+
+ if (period.empty()) {
+ ldpp_dout(this, 5) << "Missing period id" << dendl;
+ op_ret = -EINVAL;
+ return;
+ }
+ }
+ RGWMetadataLog meta_log{s->cct, static_cast<rgw::sal::RadosStore*>(driver)->svc()->zone, static_cast<rgw::sal::RadosStore*>(driver)->svc()->cls, period};
+
+ op_ret = meta_log.trim(this, shard_id, {}, {}, {}, marker);
+}
+
+void RGWOp_MDLog_Lock::execute(optional_yield y) {
+ string period, shard_id_str, duration_str, locker_id, zone_id;
+ unsigned shard_id;
+
+ op_ret = 0;
+
+ period = s->info.args.get("period");
+ shard_id_str = s->info.args.get("id");
+ duration_str = s->info.args.get("length");
+ locker_id = s->info.args.get("locker-id");
+ zone_id = s->info.args.get("zone-id");
+
+ if (period.empty()) {
+ ldpp_dout(this, 5) << "Missing period id trying to use current" << dendl;
+ period = driver->get_zone()->get_current_period_id();
+ }
+
+ if (period.empty() ||
+ shard_id_str.empty() ||
+ (duration_str.empty()) ||
+ locker_id.empty() ||
+ zone_id.empty()) {
+ ldpp_dout(this, 5) << "Error invalid parameter list" << dendl;
+ op_ret = -EINVAL;
+ return;
+ }
+
+ string err;
+ shard_id = (unsigned)strict_strtol(shard_id_str.c_str(), 10, &err);
+ if (!err.empty()) {
+ ldpp_dout(this, 5) << "Error parsing shard_id param " << shard_id_str << dendl;
+ op_ret = -EINVAL;
+ return;
+ }
+
+ RGWMetadataLog meta_log{s->cct, static_cast<rgw::sal::RadosStore*>(driver)->svc()->zone, static_cast<rgw::sal::RadosStore*>(driver)->svc()->cls, period};
+ unsigned dur;
+ dur = (unsigned)strict_strtol(duration_str.c_str(), 10, &err);
+ if (!err.empty() || dur <= 0) {
+ ldpp_dout(this, 5) << "invalid length param " << duration_str << dendl;
+ op_ret = -EINVAL;
+ return;
+ }
+ op_ret = meta_log.lock_exclusive(s, shard_id, make_timespan(dur), zone_id,
+ locker_id);
+ if (op_ret == -EBUSY)
+ op_ret = -ERR_LOCKED;
+}
+
+void RGWOp_MDLog_Unlock::execute(optional_yield y) {
+ string period, shard_id_str, locker_id, zone_id;
+ unsigned shard_id;
+
+ op_ret = 0;
+
+ period = s->info.args.get("period");
+ shard_id_str = s->info.args.get("id");
+ locker_id = s->info.args.get("locker-id");
+ zone_id = s->info.args.get("zone-id");
+
+ if (period.empty()) {
+ ldpp_dout(this, 5) << "Missing period id trying to use current" << dendl;
+ period = driver->get_zone()->get_current_period_id();
+ }
+
+ if (period.empty() ||
+ shard_id_str.empty() ||
+ locker_id.empty() ||
+ zone_id.empty()) {
+ ldpp_dout(this, 5) << "Error invalid parameter list" << dendl;
+ op_ret = -EINVAL;
+ return;
+ }
+
+ string err;
+ shard_id = (unsigned)strict_strtol(shard_id_str.c_str(), 10, &err);
+ if (!err.empty()) {
+ ldpp_dout(this, 5) << "Error parsing shard_id param " << shard_id_str << dendl;
+ op_ret = -EINVAL;
+ return;
+ }
+
+ RGWMetadataLog meta_log{s->cct, static_cast<rgw::sal::RadosStore*>(driver)->svc()->zone, static_cast<rgw::sal::RadosStore*>(driver)->svc()->cls, period};
+ op_ret = meta_log.unlock(s, shard_id, zone_id, locker_id);
+}
+
+void RGWOp_MDLog_Notify::execute(optional_yield y) {
+#define LARGE_ENOUGH_BUF (128 * 1024)
+
+ int r = 0;
+ bufferlist data;
+ std::tie(r, data) = read_all_input(s, LARGE_ENOUGH_BUF);
+ if (r < 0) {
+ op_ret = r;
+ return;
+ }
+
+ char* buf = data.c_str();
+ ldpp_dout(this, 20) << __func__ << "(): read data: " << buf << dendl;
+
+ JSONParser p;
+ r = p.parse(buf, data.length());
+ if (r < 0) {
+ ldpp_dout(this, 0) << "ERROR: failed to parse JSON" << dendl;
+ op_ret = r;
+ return;
+ }
+
+ set<int> updated_shards;
+ try {
+ decode_json_obj(updated_shards, &p);
+ } catch (JSONDecoder::err& err) {
+ ldpp_dout(this, 0) << "ERROR: failed to decode JSON" << dendl;
+ op_ret = -EINVAL;
+ return;
+ }
+
+ if (driver->ctx()->_conf->subsys.should_gather<ceph_subsys_rgw, 20>()) {
+ for (set<int>::iterator iter = updated_shards.begin(); iter != updated_shards.end(); ++iter) {
+ ldpp_dout(this, 20) << __func__ << "(): updated shard=" << *iter << dendl;
+ }
+ }
+
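+ // wake up metadata sync on the shards the peer reported as updated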
+ driver->wakeup_meta_sync_shards(updated_shards);
+
+ op_ret = 0;
+}
+
+void RGWOp_BILog_List::execute(optional_yield y) {
+ bool gen_specified = false;
+ string tenant_name = s->info.args.get("tenant"),
+ bucket_name = s->info.args.get("bucket"),
+ marker = s->info.args.get("marker"),
+ max_entries_str = s->info.args.get("max-entries"),
+ bucket_instance = s->info.args.get("bucket-instance"),
+ gen_str = s->info.args.get("generation", &gen_specified),
+ format_version_str = s->info.args.get("format-ver");
+ std::unique_ptr<rgw::sal::Bucket> bucket;
+ rgw_bucket b(rgw_bucket_key(tenant_name, bucket_name));
+
+ unsigned max_entries;
+
+ if (bucket_name.empty() && bucket_instance.empty()) {
+ ldpp_dout(this, 5) << "ERROR: neither bucket nor bucket instance specified" << dendl;
+ op_ret = -EINVAL;
+ return;
+ }
+
+ string err;
+ std::optional<uint64_t> gen;
+ if (gen_specified) {
+ gen = strict_strtoll(gen_str.c_str(), 10, &err);
+ if (!err.empty()) {
+ ldpp_dout(s, 5) << "Error parsing generation param " << gen_str << dendl;
+ op_ret = -EINVAL;
+ return;
+ }
+ }
+
+ if (!format_version_str.empty()) {
+ format_ver = strict_strtoll(format_version_str.c_str(), 10, &err);
+ if (!err.empty()) {
+ ldpp_dout(s, 5) << "Failed to parse format-ver param: " << format_ver << dendl;
+ op_ret = -EINVAL;
+ return;
+ }
+ }
+
+ int shard_id;
+ string bn;
+ op_ret = rgw_bucket_parse_bucket_instance(bucket_instance, &bn, &bucket_instance, &shard_id);
+ if (op_ret < 0) {
+ return;
+ }
+
+ if (!bucket_instance.empty()) {
+ b.name = bn;
+ b.bucket_id = bucket_instance;
+ }
+ op_ret = driver->get_bucket(s, nullptr, b, &bucket, y);
+ if (op_ret < 0) {
+ ldpp_dout(this, 5) << "could not get bucket info for bucket=" << bucket_name << dendl;
+ return;
+ }
+
+ const auto& logs = bucket->get_info().layout.logs;
+ if (logs.empty()) {
+ ldpp_dout(s, 5) << "ERROR: bucket=" << bucket_name << " has no log layouts" << dendl;
+ op_ret = -ENOENT;
+ return;
+ }
+
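+ // default to the most recent log generation unless the caller requested a
+ // specific one via the "generation" param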
+ auto log = std::prev(logs.end());
+ if (gen) {
+ log = std::find_if(logs.begin(), logs.end(), rgw::matches_gen(*gen));
+ if (log == logs.end()) {
+ ldpp_dout(s, 5) << "ERROR: no log layout with gen=" << *gen << dendl;
+ op_ret = -ENOENT;
+ return;
+ }
+ }
+ if (auto next = std::next(log); next != logs.end()) {
+ next_log_layout = *next; // get the next log after the current latest
+ }
+ auto& log_layout = *log; // current log layout for log listing
+
+ unsigned count = 0;
+
+
+ max_entries = (unsigned)strict_strtol(max_entries_str.c_str(), 10, &err);
+ if (!err.empty())
+ max_entries = LOG_CLASS_LIST_MAX_ENTRIES;
+
+ send_response();
+ do {
+ list<rgw_bi_log_entry> entries;
+ int ret = static_cast<rgw::sal::RadosStore*>(driver)->svc()->bilog_rados->log_list(s, bucket->get_info(), log_layout, shard_id,
+ marker, max_entries - count,
+ entries, &truncated);
+ if (ret < 0) {
+ ldpp_dout(this, 5) << "ERROR: list_bi_log_entries()" << dendl;
+ return;
+ }
+
+ count += entries.size();
+
+ send_response(entries, marker);
+ } while (truncated && count < max_entries);
+
+ send_response_end();
+}
+
+void RGWOp_BILog_List::send_response() {
+ if (sent_header)
+ return;
+
+ set_req_state_err(s, op_ret);
+ dump_errno(s);
+ end_header(s);
+
+ sent_header = true;
+
+ if (op_ret < 0)
+ return;
+
+ if (format_ver >= 2) {
+ s->formatter->open_object_section("result");
+ }
+
+ s->formatter->open_array_section("entries");
+}
+
+void RGWOp_BILog_List::send_response(list<rgw_bi_log_entry>& entries, string& marker)
+{
+ for (list<rgw_bi_log_entry>::iterator iter = entries.begin(); iter != entries.end(); ++iter) {
+ rgw_bi_log_entry& entry = *iter;
+ encode_json("entry", entry, s->formatter);
+
+ marker = entry.id;
+ flusher.flush();
+ }
+}
+
+void RGWOp_BILog_List::send_response_end() {
+ s->formatter->close_section();
+
+ if (format_ver >= 2) {
+ encode_json("truncated", truncated, s->formatter);
+
+ if (next_log_layout) {
+ s->formatter->open_object_section("next_log");
+ encode_json("generation", next_log_layout->gen, s->formatter);
+ encode_json("num_shards", rgw::num_shards(next_log_layout->layout.in_index.layout), s->formatter);
+ s->formatter->close_section(); // next_log
+ }
+
+ s->formatter->close_section(); // result
+ }
+
+ flusher.flush();
+}
+
+void RGWOp_BILog_Info::execute(optional_yield y) {
+ string tenant_name = s->info.args.get("tenant"),
+ bucket_name = s->info.args.get("bucket"),
+ bucket_instance = s->info.args.get("bucket-instance");
+ std::unique_ptr<rgw::sal::Bucket> bucket;
+ rgw_bucket b(rgw_bucket_key(tenant_name, bucket_name));
+
+ if (bucket_name.empty() && bucket_instance.empty()) {
+ ldpp_dout(this, 5) << "ERROR: neither bucket nor bucket instance specified" << dendl;
+ op_ret = -EINVAL;
+ return;
+ }
+
+ int shard_id;
+ string bn;
+ op_ret = rgw_bucket_parse_bucket_instance(bucket_instance, &bn, &bucket_instance, &shard_id);
+ if (op_ret < 0) {
+ return;
+ }
+
+ if (!bucket_instance.empty()) {
+ b.name = bn;
+ b.bucket_id = bucket_instance;
+ }
+ op_ret = driver->get_bucket(s, nullptr, b, &bucket, y);
+ if (op_ret < 0) {
+ ldpp_dout(this, 5) << "could not get bucket info for bucket=" << bucket_name << dendl;
+ return;
+ }
+
+ const auto& logs = bucket->get_info().layout.logs;
+ if (logs.empty()) {
+ ldpp_dout(s, 5) << "ERROR: bucket=" << bucket_name << " has no log layouts" << dendl;
+ op_ret = -ENOENT;
+ return;
+ }
+
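+  // bucket_ver/master_ver/max_marker come from the index layout of the newest
+  // log generation; per-generation shard counts are reported separately below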
+ map<RGWObjCategory, RGWStorageStats> stats;
+ const auto& index = log_to_index_layout(logs.back());
+
+ int ret = bucket->read_stats(s, index, shard_id, &bucket_ver, &master_ver, stats, &max_marker, &syncstopped);
+ if (ret < 0 && ret != -ENOENT) {
+ op_ret = ret;
+ return;
+ }
+
+ oldest_gen = logs.front().gen;
+ latest_gen = logs.back().gen;
+
+ for (auto& log : logs) {
+ uint32_t num_shards = rgw::num_shards(log.layout.in_index.layout);
+ generations.push_back({log.gen, num_shards});
+ }
+}
+
+void RGWOp_BILog_Info::send_response() {
+ set_req_state_err(s, op_ret);
+ dump_errno(s);
+ end_header(s);
+
+ if (op_ret < 0)
+ return;
+
+ s->formatter->open_object_section("info");
+ encode_json("bucket_ver", bucket_ver, s->formatter);
+ encode_json("master_ver", master_ver, s->formatter);
+ encode_json("max_marker", max_marker, s->formatter);
+ encode_json("syncstopped", syncstopped, s->formatter);
+ encode_json("oldest_gen", oldest_gen, s->formatter);
+ encode_json("latest_gen", latest_gen, s->formatter);
+ encode_json("generations", generations, s->formatter);
+ s->formatter->close_section();
+
+ flusher.flush();
+}
+
+void RGWOp_BILog_Delete::execute(optional_yield y) {
+ bool gen_specified = false;
+ string tenant_name = s->info.args.get("tenant"),
+ bucket_name = s->info.args.get("bucket"),
+ start_marker = s->info.args.get("start-marker"),
+ end_marker = s->info.args.get("end-marker"),
+ bucket_instance = s->info.args.get("bucket-instance"),
+ gen_str = s->info.args.get("generation", &gen_specified);
+
+ std::unique_ptr<rgw::sal::Bucket> bucket;
+ rgw_bucket b(rgw_bucket_key(tenant_name, bucket_name));
+
+ op_ret = 0;
+ if ((bucket_name.empty() && bucket_instance.empty()) ||
+ end_marker.empty()) {
+ ldpp_dout(this, 5) << "ERROR: one of bucket or bucket instance, and also end-marker is mandatory" << dendl;
+ op_ret = -EINVAL;
+ return;
+ }
+
+ string err;
+ uint64_t gen = 0;
+ if (gen_specified) {
+ gen = strict_strtoll(gen_str.c_str(), 10, &err);
+ if (!err.empty()) {
+ ldpp_dout(s, 5) << "Error parsing generation param " << gen_str << dendl;
+ op_ret = -EINVAL;
+ return;
+ }
+ }
+
+ int shard_id;
+ string bn;
+ op_ret = rgw_bucket_parse_bucket_instance(bucket_instance, &bn, &bucket_instance, &shard_id);
+ if (op_ret < 0) {
+ return;
+ }
+
+ if (!bucket_instance.empty()) {
+ b.name = bn;
+ b.bucket_id = bucket_instance;
+ }
+ op_ret = driver->get_bucket(s, nullptr, b, &bucket, y);
+ if (op_ret < 0) {
+ ldpp_dout(this, 5) << "could not get bucket info for bucket=" << bucket_name << dendl;
+ return;
+ }
+
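+  // trim bucket index log entries for the requested generation and shard,
+  // bounded by start-marker/end-marker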
+ op_ret = bilog_trim(this, static_cast<rgw::sal::RadosStore*>(driver),
+ bucket->get_info(), gen, shard_id,
+ start_marker, end_marker);
+ if (op_ret < 0) {
+ ldpp_dout(s, 5) << "bilog_trim failed with op_ret=" << op_ret << dendl;
+ }
+
+ return;
+}
+
+void RGWOp_DATALog_List::execute(optional_yield y) {
+ string shard = s->info.args.get("id");
+
+ string max_entries_str = s->info.args.get("max-entries"),
+ marker = s->info.args.get("marker"),
+ err;
+ unsigned shard_id, max_entries = LOG_CLASS_LIST_MAX_ENTRIES;
+
+ if (s->info.args.exists("start-time") ||
+ s->info.args.exists("end-time")) {
+ ldpp_dout(this, 5) << "start-time and end-time are no longer accepted" << dendl;
+ op_ret = -EINVAL;
+ }
+
+ s->info.args.get_bool("extra-info", &extra_info, false);
+
+ shard_id = (unsigned)strict_strtol(shard.c_str(), 10, &err);
+ if (!err.empty()) {
+ ldpp_dout(this, 5) << "Error parsing shard_id " << shard << dendl;
+ op_ret = -EINVAL;
+ return;
+ }
+
+ if (!max_entries_str.empty()) {
+ max_entries = (unsigned)strict_strtol(max_entries_str.c_str(), 10, &err);
+ if (!err.empty()) {
+ ldpp_dout(this, 5) << "Error parsing max-entries " << max_entries_str << dendl;
+ op_ret = -EINVAL;
+ return;
+ }
+ if (max_entries > LOG_CLASS_LIST_MAX_ENTRIES) {
+ max_entries = LOG_CLASS_LIST_MAX_ENTRIES;
+ }
+ }
+
+ // Note that last_marker is updated to be the marker of the last
+ // entry listed
+ op_ret = static_cast<rgw::sal::RadosStore*>(driver)->svc()->
+ datalog_rados->list_entries(this, shard_id, max_entries, entries,
+ marker, &last_marker, &truncated, y);
+}
+
+void RGWOp_DATALog_List::send_response() {
+ set_req_state_err(s, op_ret);
+ dump_errno(s);
+ end_header(s);
+
+ if (op_ret < 0)
+ return;
+
+ s->formatter->open_object_section("log_entries");
+ s->formatter->dump_string("marker", last_marker);
+ s->formatter->dump_bool("truncated", truncated);
+ {
+ s->formatter->open_array_section("entries");
+ for (const auto& entry : entries) {
+ if (!extra_info) {
+ encode_json("entry", entry.entry, s->formatter);
+ } else {
+ encode_json("entry", entry, s->formatter);
+ }
+ flusher.flush();
+ }
+ s->formatter->close_section();
+ }
+ s->formatter->close_section();
+ flusher.flush();
+}
+
+
+void RGWOp_DATALog_Info::execute(optional_yield y) {
+ num_objects = s->cct->_conf->rgw_data_log_num_shards;
+ op_ret = 0;
+}
+
+void RGWOp_DATALog_Info::send_response() {
+ set_req_state_err(s, op_ret);
+ dump_errno(s);
+ end_header(s);
+
+ s->formatter->open_object_section("num_objects");
+ s->formatter->dump_unsigned("num_objects", num_objects);
+ s->formatter->close_section();
+ flusher.flush();
+}
+
+void RGWOp_DATALog_ShardInfo::execute(optional_yield y) {
+ string shard = s->info.args.get("id");
+ string err;
+
+ unsigned shard_id = (unsigned)strict_strtol(shard.c_str(), 10, &err);
+ if (!err.empty()) {
+ ldpp_dout(this, 5) << "Error parsing shard_id " << shard << dendl;
+ op_ret = -EINVAL;
+ return;
+ }
+
+ op_ret = static_cast<rgw::sal::RadosStore*>(driver)->svc()->
+ datalog_rados->get_info(this, shard_id, &info, y);
+}
+
+void RGWOp_DATALog_ShardInfo::send_response() {
+ set_req_state_err(s, op_ret);
+ dump_errno(s);
+ end_header(s);
+
+ encode_json("info", info, s->formatter);
+ flusher.flush();
+}
+
+void RGWOp_DATALog_Notify::execute(optional_yield y) {
+ string source_zone = s->info.args.get("source-zone");
+#define LARGE_ENOUGH_BUF (128 * 1024)
+
+ int r = 0;
+ bufferlist data;
+ std::tie(r, data) = read_all_input(s, LARGE_ENOUGH_BUF);
+ if (r < 0) {
+ op_ret = r;
+ return;
+ }
+
+ char* buf = data.c_str();
+ ldpp_dout(this, 20) << __func__ << "(): read data: " << buf << dendl;
+
+ JSONParser p;
+ r = p.parse(buf, data.length());
+ if (r < 0) {
+ ldpp_dout(this, 0) << "ERROR: failed to parse JSON" << dendl;
+ op_ret = r;
+ return;
+ }
+
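+  // the v1 notify payload maps shard id -> set of modified bucket keys;
+  // rgw_data_notify_v1_decoder adapts it into rgw_data_notify_entry form
+  // before waking the data sync shards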
+ bc::flat_map<int, bc::flat_set<rgw_data_notify_entry>> updated_shards;
+ try {
+ auto decoder = rgw_data_notify_v1_decoder{updated_shards};
+ decode_json_obj(decoder, &p);
+ } catch (JSONDecoder::err& err) {
+ ldpp_dout(this, 0) << "ERROR: failed to decode JSON" << dendl;
+ op_ret = -EINVAL;
+ return;
+ }
+
+ if (driver->ctx()->_conf->subsys.should_gather<ceph_subsys_rgw, 20>()) {
+ for (bc::flat_map<int, bc::flat_set<rgw_data_notify_entry> >::iterator iter = updated_shards.begin(); iter != updated_shards.end(); ++iter) {
+ ldpp_dout(this, 20) << __func__ << "(): updated shard=" << iter->first << dendl;
+ bc::flat_set<rgw_data_notify_entry>& entries = iter->second;
+ for (const auto& [key, gen] : entries) {
+ ldpp_dout(this, 20) << __func__ << "(): modified key=" << key
+ << " of gen=" << gen << dendl;
+ }
+ }
+ }
+
+ driver->wakeup_data_sync_shards(this, source_zone, updated_shards);
+
+ op_ret = 0;
+}
+
+void RGWOp_DATALog_Notify2::execute(optional_yield y) {
+ string source_zone = s->info.args.get("source-zone");
+#define LARGE_ENOUGH_BUF (128 * 1024)
+
+ int r = 0;
+ bufferlist data;
+ std::tie(r, data) = rgw_rest_read_all_input(s, LARGE_ENOUGH_BUF);
+ if (r < 0) {
+ op_ret = r;
+ return;
+ }
+
+ char* buf = data.c_str();
+  ldpp_dout(this, 20) << __func__ << "(): read data: " << buf << dendl;
+
+ JSONParser p;
+ r = p.parse(buf, data.length());
+ if (r < 0) {
+    ldpp_dout(this, 0) << "ERROR: failed to parse JSON" << dendl;
+ op_ret = r;
+ return;
+ }
+
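+  // the v2 payload already carries rgw_data_notify_entry objects (key plus
+  // generation), so it decodes directly without an adapter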
+ bc::flat_map<int, bc::flat_set<rgw_data_notify_entry> > updated_shards;
+ try {
+ decode_json_obj(updated_shards, &p);
+ } catch (JSONDecoder::err& err) {
+ ldpp_dout(this, 0) << "ERROR: failed to decode JSON" << dendl;
+ op_ret = -EINVAL;
+ return;
+ }
+
+ if (driver->ctx()->_conf->subsys.should_gather<ceph_subsys_rgw, 20>()) {
+ for (bc::flat_map<int, bc::flat_set<rgw_data_notify_entry> >::iterator iter =
+ updated_shards.begin(); iter != updated_shards.end(); ++iter) {
+ ldpp_dout(this, 20) << __func__ << "(): updated shard=" << iter->first << dendl;
+ bc::flat_set<rgw_data_notify_entry>& entries = iter->second;
+ for (const auto& [key, gen] : entries) {
+ ldpp_dout(this, 20) << __func__ << "(): modified key=" << key <<
+ " of generation=" << gen << dendl;
+ }
+ }
+ }
+
+ driver->wakeup_data_sync_shards(this, source_zone, updated_shards);
+
+ op_ret = 0;
+}
+
+void RGWOp_DATALog_Delete::execute(optional_yield y) {
+ string marker = s->info.args.get("marker"),
+ shard = s->info.args.get("id"),
+ err;
+ unsigned shard_id;
+
+ op_ret = 0;
+
+ if (s->info.args.exists("start-time") ||
+ s->info.args.exists("end-time")) {
+ ldpp_dout(this, 5) << "start-time and end-time are no longer accepted" << dendl;
+ op_ret = -EINVAL;
+ }
+
+ if (s->info.args.exists("start-marker")) {
+ ldpp_dout(this, 5) << "start-marker is no longer accepted" << dendl;
+ op_ret = -EINVAL;
+ }
+
+ if (s->info.args.exists("end-marker")) {
+ if (!s->info.args.exists("marker")) {
+ marker = s->info.args.get("end-marker");
+ } else {
+ ldpp_dout(this, 5) << "end-marker and marker cannot both be provided" << dendl;
+ op_ret = -EINVAL;
+ }
+ }
+
+ shard_id = (unsigned)strict_strtol(shard.c_str(), 10, &err);
+ if (!err.empty()) {
+ ldpp_dout(this, 5) << "Error parsing shard_id " << shard << dendl;
+ op_ret = -EINVAL;
+ return;
+ }
+ if (marker.empty()) { /* bounding end */
+ op_ret = -EINVAL;
+ return;
+ }
+
+ op_ret = static_cast<rgw::sal::RadosStore*>(driver)->svc()->
+ datalog_rados->trim_entries(this, shard_id, marker, y);
+}
+
+// not in header to avoid pulling in rgw_sync.h
+class RGWOp_MDLog_Status : public RGWRESTOp {
+ rgw_meta_sync_status status;
+public:
+ int check_caps(const RGWUserCaps& caps) override {
+ return caps.check_cap("mdlog", RGW_CAP_READ);
+ }
+ int verify_permission(optional_yield) override {
+ return check_caps(s->user->get_caps());
+ }
+ void execute(optional_yield y) override;
+ void send_response() override;
+ const char* name() const override { return "get_metadata_log_status"; }
+};
+
+void RGWOp_MDLog_Status::execute(optional_yield y)
+{
+ auto sync = static_cast<rgw::sal::RadosStore*>(driver)->getRados()->get_meta_sync_manager();
+ if (sync == nullptr) {
+ ldpp_dout(this, 1) << "no sync manager" << dendl;
+ op_ret = -ENOENT;
+ return;
+ }
+ op_ret = sync->read_sync_status(this, &status);
+}
+
+void RGWOp_MDLog_Status::send_response()
+{
+ set_req_state_err(s, op_ret);
+ dump_errno(s);
+ end_header(s);
+
+ if (op_ret >= 0) {
+ encode_json("status", status, s->formatter);
+ }
+ flusher.flush();
+}
+
+// not in header to avoid pulling in rgw_data_sync.h
+class RGWOp_BILog_Status : public RGWRESTOp {
+ bilog_status_v2 status;
+ int version = 1;
+public:
+ int check_caps(const RGWUserCaps& caps) override {
+ return caps.check_cap("bilog", RGW_CAP_READ);
+ }
+ int verify_permission(optional_yield y) override {
+ return check_caps(s->user->get_caps());
+ }
+ void execute(optional_yield y) override;
+ void send_response() override;
+ const char* name() const override { return "get_bucket_index_log_status"; }
+};
+
+void RGWOp_BILog_Status::execute(optional_yield y)
+{
+ const auto options = s->info.args.get("options");
+ bool merge = (options == "merge");
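+  // 'merge' combines incremental sync status across every pipe replicating
+  // from the source zone into this bucket, keeping the least advanced marker
+  // per shard; otherwise only the single requested pipe is reported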
+ const auto source_zone = s->info.args.get("source-zone");
+ const auto source_key = s->info.args.get("source-bucket");
+ auto key = s->info.args.get("bucket");
+ op_ret = s->info.args.get_int("version", &version, 1);
+
+ if (key.empty()) {
+ key = source_key;
+ }
+ if (key.empty()) {
+ ldpp_dout(this, 4) << "no 'bucket' provided" << dendl;
+ op_ret = -EINVAL;
+ return;
+ }
+
+ rgw_bucket b;
+ int shard_id{-1}; // unused
+ op_ret = rgw_bucket_parse_bucket_key(s->cct, key, &b, &shard_id);
+ if (op_ret < 0) {
+ ldpp_dout(this, 4) << "invalid 'bucket' provided" << dendl;
+ op_ret = -EINVAL;
+ return;
+ }
+
+ // read the bucket instance info for num_shards
+ std::unique_ptr<rgw::sal::Bucket> bucket;
+ op_ret = driver->get_bucket(s, nullptr, b, &bucket, y);
+ if (op_ret < 0) {
+ ldpp_dout(this, 4) << "failed to read bucket info: " << cpp_strerror(op_ret) << dendl;
+ return;
+ }
+
+ rgw_bucket source_bucket;
+
+ if (source_key.empty() ||
+ source_key == key) {
+ source_bucket = bucket->get_key();
+ } else {
+ op_ret = rgw_bucket_parse_bucket_key(s->cct, source_key, &source_bucket, nullptr);
+ if (op_ret < 0) {
+ ldpp_dout(this, 4) << "invalid 'source-bucket' provided (key=" << source_key << ")" << dendl;
+ return;
+ }
+ }
+
+ const auto& local_zone_id = driver->get_zone()->get_id();
+
+ if (!merge) {
+ rgw_sync_bucket_pipe pipe;
+ pipe.source.zone = source_zone;
+ pipe.source.bucket = source_bucket;
+ pipe.dest.zone = local_zone_id;
+ pipe.dest.bucket = bucket->get_key();
+
+ ldpp_dout(this, 20) << "RGWOp_BILog_Status::execute(optional_yield y): getting sync status for pipe=" << pipe << dendl;
+
+ op_ret = rgw_read_bucket_full_sync_status(
+ this,
+ static_cast<rgw::sal::RadosStore*>(driver),
+ pipe,
+ &status.sync_status,
+ s->yield);
+ if (op_ret < 0) {
+ ldpp_dout(this, -1) << "ERROR: rgw_read_bucket_full_sync_status() on pipe=" << pipe << " returned ret=" << op_ret << dendl;
+ return;
+ }
+ status.inc_status.resize(status.sync_status.shards_done_with_gen.size());
+
+ op_ret = rgw_read_bucket_inc_sync_status(
+ this,
+ static_cast<rgw::sal::RadosStore*>(driver),
+ pipe,
+ status.sync_status.incremental_gen,
+ &status.inc_status);
+ if (op_ret < 0) {
+ ldpp_dout(this, -1) << "ERROR: rgw_read_bucket_inc_sync_status() on pipe=" << pipe << " returned ret=" << op_ret << dendl;
+ }
+ return;
+ }
+
+ rgw_zone_id source_zone_id(source_zone);
+
+ RGWBucketSyncPolicyHandlerRef source_handler;
+ op_ret = driver->get_sync_policy_handler(s, source_zone_id, source_bucket, &source_handler, y);
+ if (op_ret < 0) {
+ ldpp_dout(this, -1) << "could not get bucket sync policy handler (r=" << op_ret << ")" << dendl;
+ return;
+ }
+
+ auto local_dests = source_handler->get_all_dests_in_zone(local_zone_id);
+
+ std::vector<rgw_bucket_shard_sync_info> current_status;
+ for (auto& entry : local_dests) {
+ auto pipe = entry.second;
+
+ ldpp_dout(this, 20) << "RGWOp_BILog_Status::execute(optional_yield y): getting sync status for pipe=" << pipe << dendl;
+
+ RGWBucketInfo *pinfo = &bucket->get_info();
+ std::optional<RGWBucketInfo> opt_dest_info;
+
+ if (!pipe.dest.bucket) {
+ /* Uh oh, something went wrong */
+ ldpp_dout(this, 20) << "ERROR: RGWOp_BILog_Status::execute(optional_yield y): BUG: pipe.dest.bucket was not initialized" << pipe << dendl;
+ op_ret = -EIO;
+ return;
+ }
+
+ if (*pipe.dest.bucket != pinfo->bucket) {
+ opt_dest_info.emplace();
+ std::unique_ptr<rgw::sal::Bucket> dest_bucket;
+ op_ret = driver->get_bucket(s, nullptr, *pipe.dest.bucket, &dest_bucket, y);
+ if (op_ret < 0) {
+        ldpp_dout(this, 4) << "failed to read target bucket info: " << cpp_strerror(op_ret) << dendl;
+ return;
+ }
+
+ *opt_dest_info = dest_bucket->get_info();
+ pinfo = &(*opt_dest_info);
+ pipe.dest.bucket = pinfo->bucket;
+ }
+
+ op_ret = rgw_read_bucket_full_sync_status(
+ this,
+ static_cast<rgw::sal::RadosStore*>(driver),
+ pipe,
+ &status.sync_status,
+ s->yield);
+ if (op_ret < 0) {
+ ldpp_dout(this, -1) << "ERROR: rgw_read_bucket_full_sync_status() on pipe=" << pipe << " returned ret=" << op_ret << dendl;
+ return;
+ }
+
+ current_status.resize(status.sync_status.shards_done_with_gen.size());
+ int r = rgw_read_bucket_inc_sync_status(this, static_cast<rgw::sal::RadosStore*>(driver),
+ pipe, status.sync_status.incremental_gen, &current_status);
+ if (r < 0) {
+ ldpp_dout(this, -1) << "ERROR: rgw_read_bucket_inc_sync_status() on pipe=" << pipe << " returned ret=" << r << dendl;
+ op_ret = r;
+ return;
+ }
+
+ if (status.inc_status.empty()) {
+ status.inc_status = std::move(current_status);
+ } else {
+ if (current_status.size() != status.inc_status.size()) {
+ op_ret = -EINVAL;
+ ldpp_dout(this, -1) << "ERROR: different number of shards for sync status of buckets "
+ "syncing from the same source: status.size()= "
+ << status.inc_status.size()
+ << " current_status.size()="
+ << current_status.size() << dendl;
+ return;
+ }
+ auto m = status.inc_status.begin();
+ for (auto& cur_shard_status : current_status) {
+ auto& result_shard_status = *m++;
+ // always take the first marker, or any later marker that's smaller
+ if (cur_shard_status.inc_marker.position < result_shard_status.inc_marker.position) {
+ result_shard_status = std::move(cur_shard_status);
+ }
+ }
+ }
+ }
+}
+
+void RGWOp_BILog_Status::send_response()
+{
+ set_req_state_err(s, op_ret);
+ dump_errno(s);
+ end_header(s);
+
+ if (op_ret >= 0) {
+ if (version < 2) {
+ encode_json("status", status.inc_status, s->formatter);
+ } else {
+ encode_json("status", status, s->formatter);
+ }
+ }
+ flusher.flush();
+}
+
+// not in header to avoid pulling in rgw_data_sync.h
+class RGWOp_DATALog_Status : public RGWRESTOp {
+ rgw_data_sync_status status;
+public:
+ int check_caps(const RGWUserCaps& caps) override {
+ return caps.check_cap("datalog", RGW_CAP_READ);
+ }
+ int verify_permission(optional_yield y) override {
+ return check_caps(s->user->get_caps());
+ }
+  void execute(optional_yield y) override;
+ void send_response() override;
+ const char* name() const override { return "get_data_changes_log_status"; }
+};
+
+void RGWOp_DATALog_Status::execute(optional_yield y)
+{
+ const auto source_zone = s->info.args.get("source-zone");
+ auto sync = driver->get_data_sync_manager(source_zone);
+ if (sync == nullptr) {
+ ldpp_dout(this, 1) << "no sync manager for source-zone " << source_zone << dendl;
+ op_ret = -ENOENT;
+ return;
+ }
+ op_ret = sync->read_sync_status(this, &status);
+}
+
+void RGWOp_DATALog_Status::send_response()
+{
+ set_req_state_err(s, op_ret);
+ dump_errno(s);
+ end_header(s);
+
+ if (op_ret >= 0) {
+ encode_json("status", status, s->formatter);
+ }
+ flusher.flush();
+}
+
+
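+// dispatch GET requests by the 'type' query parameter (metadata, bucket-index
+// or data), then by the presence of the 'id', 'info' and 'status' args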
+RGWOp *RGWHandler_Log::op_get() {
+ bool exists;
+ string type = s->info.args.get("type", &exists);
+
+ if (!exists) {
+ return NULL;
+ }
+
+ if (type.compare("metadata") == 0) {
+ if (s->info.args.exists("id")) {
+ if (s->info.args.exists("info")) {
+ return new RGWOp_MDLog_ShardInfo;
+ } else {
+ return new RGWOp_MDLog_List;
+ }
+ } else if (s->info.args.exists("status")) {
+ return new RGWOp_MDLog_Status;
+ } else {
+ return new RGWOp_MDLog_Info;
+ }
+ } else if (type.compare("bucket-index") == 0) {
+ if (s->info.args.exists("info")) {
+ return new RGWOp_BILog_Info;
+ } else if (s->info.args.exists("status")) {
+ return new RGWOp_BILog_Status;
+ } else {
+ return new RGWOp_BILog_List;
+ }
+ } else if (type.compare("data") == 0) {
+ if (s->info.args.exists("id")) {
+ if (s->info.args.exists("info")) {
+ return new RGWOp_DATALog_ShardInfo;
+ } else {
+ return new RGWOp_DATALog_List;
+ }
+ } else if (s->info.args.exists("status")) {
+ return new RGWOp_DATALog_Status;
+ } else {
+ return new RGWOp_DATALog_Info;
+ }
+ }
+ return NULL;
+}
+
+RGWOp *RGWHandler_Log::op_delete() {
+ bool exists;
+ string type = s->info.args.get("type", &exists);
+
+ if (!exists) {
+ return NULL;
+ }
+
+ if (type.compare("metadata") == 0)
+ return new RGWOp_MDLog_Delete;
+ else if (type.compare("bucket-index") == 0)
+ return new RGWOp_BILog_Delete;
+ else if (type.compare("data") == 0)
+ return new RGWOp_DATALog_Delete;
+ return NULL;
+}
+
+RGWOp *RGWHandler_Log::op_post() {
+ bool exists;
+ string type = s->info.args.get("type", &exists);
+
+ if (!exists) {
+ return NULL;
+ }
+
+ if (type.compare("metadata") == 0) {
+ if (s->info.args.exists("lock"))
+ return new RGWOp_MDLog_Lock;
+ else if (s->info.args.exists("unlock"))
+ return new RGWOp_MDLog_Unlock;
+ else if (s->info.args.exists("notify"))
+ return new RGWOp_MDLog_Notify;
+ } else if (type.compare("data") == 0) {
+ if (s->info.args.exists("notify")) {
+ return new RGWOp_DATALog_Notify;
+ } else if (s->info.args.exists("notify2")) {
+ return new RGWOp_DATALog_Notify2;
+ }
+ }
+ return NULL;
+}
+
diff --git a/src/rgw/driver/rados/rgw_rest_log.h b/src/rgw/driver/rados/rgw_rest_log.h
new file mode 100644
index 000000000..02b1d133f
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_rest_log.h
@@ -0,0 +1,337 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 eNovance SAS <licensing@enovance.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include "rgw_datalog.h"
+#include "rgw_rest.h"
+#include "rgw_rest_s3.h"
+#include "rgw_metadata.h"
+#include "rgw_mdlog.h"
+#include "rgw_data_sync.h"
+
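+// lists bucket index log entries for a bucket or bucket instance shard;
+// dispatched from RGWHandler_Log::op_get() when type=bucket-index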
+class RGWOp_BILog_List : public RGWRESTOp {
+ bool sent_header;
+ uint32_t format_ver{0};
+ bool truncated{false};
+ std::optional<rgw::bucket_log_layout_generation> next_log_layout;
+
+public:
+ RGWOp_BILog_List() : sent_header(false) {}
+ ~RGWOp_BILog_List() override {}
+
+ int check_caps(const RGWUserCaps& caps) override {
+ return caps.check_cap("bilog", RGW_CAP_READ);
+ }
+ int verify_permission(optional_yield y) override {
+ return check_caps(s->user->get_caps());
+ }
+ void send_response() override;
+ virtual void send_response(std::list<rgw_bi_log_entry>& entries, std::string& marker);
+ virtual void send_response_end();
+ void execute(optional_yield y) override;
+ const char* name() const override {
+ return "list_bucket_index_log";
+ }
+};
+
+class RGWOp_BILog_Info : public RGWRESTOp {
+ std::string bucket_ver;
+ std::string master_ver;
+ std::string max_marker;
+ bool syncstopped;
+ uint64_t oldest_gen = 0;
+ uint64_t latest_gen = 0;
+ std::vector<store_gen_shards> generations;
+
+public:
+ RGWOp_BILog_Info() : bucket_ver(), master_ver(), syncstopped(false) {}
+ ~RGWOp_BILog_Info() override {}
+
+ int check_caps(const RGWUserCaps& caps) override {
+ return caps.check_cap("bilog", RGW_CAP_READ);
+ }
+ int verify_permission(optional_yield y) override {
+ return check_caps(s->user->get_caps());
+ }
+ void send_response() override;
+ void execute(optional_yield y) override;
+ const char* name() const override {
+ return "bucket_index_log_info";
+ }
+};
+
+class RGWOp_BILog_Delete : public RGWRESTOp {
+public:
+ RGWOp_BILog_Delete() {}
+ ~RGWOp_BILog_Delete() override {}
+
+ int check_caps(const RGWUserCaps& caps) override {
+ return caps.check_cap("bilog", RGW_CAP_WRITE);
+ }
+ void execute(optional_yield y) override;
+ const char* name() const override {
+ return "trim_bucket_index_log";
+ }
+};
+
+class RGWOp_MDLog_List : public RGWRESTOp {
+ std::list<cls_log_entry> entries;
+ std::string last_marker;
+ bool truncated;
+public:
+ RGWOp_MDLog_List() : truncated(false) {}
+ ~RGWOp_MDLog_List() override {}
+
+ int check_caps(const RGWUserCaps& caps) override {
+ return caps.check_cap("mdlog", RGW_CAP_READ);
+ }
+ int verify_permission(optional_yield y) override {
+ return check_caps(s->user->get_caps());
+ }
+ void execute(optional_yield y) override;
+ void send_response() override;
+ const char* name() const override {
+ return "list_metadata_log";
+ }
+};
+
+class RGWOp_MDLog_Info : public RGWRESTOp {
+ unsigned num_objects;
+ RGWPeriodHistory::Cursor period;
+public:
+ RGWOp_MDLog_Info() : num_objects(0) {}
+ ~RGWOp_MDLog_Info() override {}
+
+ int check_caps(const RGWUserCaps& caps) override {
+ return caps.check_cap("mdlog", RGW_CAP_READ);
+ }
+ int verify_permission(optional_yield y) override {
+ return check_caps(s->user->get_caps());
+ }
+ void execute(optional_yield y) override;
+ void send_response() override;
+ const char* name() const override {
+ return "get_metadata_log_info";
+ }
+};
+
+class RGWOp_MDLog_ShardInfo : public RGWRESTOp {
+ RGWMetadataLogInfo info;
+public:
+ RGWOp_MDLog_ShardInfo() {}
+ ~RGWOp_MDLog_ShardInfo() override {}
+
+ int check_caps(const RGWUserCaps& caps) override {
+ return caps.check_cap("mdlog", RGW_CAP_READ);
+ }
+ int verify_permission(optional_yield y) override {
+ return check_caps(s->user->get_caps());
+ }
+ void execute(optional_yield y) override;
+ void send_response() override;
+ const char* name() const override {
+ return "get_metadata_log_shard_info";
+ }
+};
+
+class RGWOp_MDLog_Lock : public RGWRESTOp {
+public:
+ RGWOp_MDLog_Lock() {}
+ ~RGWOp_MDLog_Lock() override {}
+
+ int check_caps(const RGWUserCaps& caps) override {
+ return caps.check_cap("mdlog", RGW_CAP_WRITE);
+ }
+ void execute(optional_yield y) override;
+ const char* name() const override {
+ return "lock_mdlog_object";
+ }
+};
+
+class RGWOp_MDLog_Unlock : public RGWRESTOp {
+public:
+ RGWOp_MDLog_Unlock() {}
+ ~RGWOp_MDLog_Unlock() override {}
+
+ int check_caps(const RGWUserCaps& caps) override {
+ return caps.check_cap("mdlog", RGW_CAP_WRITE);
+ }
+ void execute(optional_yield y) override;
+ const char* name() const override {
+ return "unlock_mdlog_object";
+ }
+};
+
+class RGWOp_MDLog_Notify : public RGWRESTOp {
+public:
+ RGWOp_MDLog_Notify() {}
+ ~RGWOp_MDLog_Notify() override {}
+
+ int check_caps(const RGWUserCaps& caps) override {
+ return caps.check_cap("mdlog", RGW_CAP_WRITE);
+ }
+ void execute(optional_yield y) override;
+ const char* name() const override {
+ return "mdlog_notify";
+ }
+ RGWOpType get_type() override { return RGW_OP_SYNC_MDLOG_NOTIFY; }
+};
+
+class RGWOp_MDLog_Delete : public RGWRESTOp {
+public:
+ RGWOp_MDLog_Delete() {}
+ ~RGWOp_MDLog_Delete() override {}
+
+ int check_caps(const RGWUserCaps& caps) override {
+ return caps.check_cap("mdlog", RGW_CAP_WRITE);
+ }
+ void execute(optional_yield y) override;
+ const char* name() const override {
+ return "trim_metadata_log";
+ }
+};
+
+class RGWOp_DATALog_List : public RGWRESTOp {
+ std::vector<rgw_data_change_log_entry> entries;
+ std::string last_marker;
+ bool truncated;
+ bool extra_info;
+public:
+ RGWOp_DATALog_List() : truncated(false), extra_info(false) {}
+ ~RGWOp_DATALog_List() override {}
+
+ int check_caps(const RGWUserCaps& caps) override {
+ return caps.check_cap("datalog", RGW_CAP_READ);
+ }
+ int verify_permission(optional_yield y) override {
+ return check_caps(s->user->get_caps());
+ }
+ void execute(optional_yield y) override;
+ void send_response() override;
+ const char* name() const override {
+ return "list_data_changes_log";
+ }
+};
+
+class RGWOp_DATALog_Info : public RGWRESTOp {
+ unsigned num_objects;
+public:
+ RGWOp_DATALog_Info() : num_objects(0) {}
+ ~RGWOp_DATALog_Info() override {}
+
+ int check_caps(const RGWUserCaps& caps) override {
+ return caps.check_cap("datalog", RGW_CAP_READ);
+ }
+ int verify_permission(optional_yield y) override {
+ return check_caps(s->user->get_caps());
+ }
+ void execute(optional_yield y) override;
+ void send_response() override;
+ const char* name() const override {
+ return "get_data_changes_log_info";
+ }
+};
+
+class RGWOp_DATALog_ShardInfo : public RGWRESTOp {
+ RGWDataChangesLogInfo info;
+public:
+ RGWOp_DATALog_ShardInfo() {}
+ ~RGWOp_DATALog_ShardInfo() override {}
+
+ int check_caps(const RGWUserCaps& caps) override {
+ return caps.check_cap("datalog", RGW_CAP_READ);
+ }
+ int verify_permission(optional_yield y) override {
+ return check_caps(s->user->get_caps());
+ }
+ void execute(optional_yield y) override;
+ void send_response() override;
+ const char* name() const override {
+ return "get_data_changes_log_shard_info";
+ }
+};
+
+class RGWOp_DATALog_Notify : public RGWRESTOp {
+public:
+ RGWOp_DATALog_Notify() {}
+ ~RGWOp_DATALog_Notify() override {}
+
+ int check_caps(const RGWUserCaps& caps) override {
+ return caps.check_cap("datalog", RGW_CAP_WRITE);
+ }
+ void execute(optional_yield y) override;
+ const char* name() const override {
+ return "datalog_notify";
+ }
+ RGWOpType get_type() override { return RGW_OP_SYNC_DATALOG_NOTIFY; }
+};
+
+class RGWOp_DATALog_Notify2 : public RGWRESTOp {
+ rgw_data_notify_entry data_notify;
+public:
+ RGWOp_DATALog_Notify2() {}
+ ~RGWOp_DATALog_Notify2() override {}
+
+ int check_caps(const RGWUserCaps& caps) override {
+ return caps.check_cap("datalog", RGW_CAP_WRITE);
+ }
+ void execute(optional_yield y) override;
+ const char* name() const override {
+ return "datalog_notify2";
+ }
+ RGWOpType get_type() override { return RGW_OP_SYNC_DATALOG_NOTIFY2; }
+};
+
+class RGWOp_DATALog_Delete : public RGWRESTOp {
+public:
+ RGWOp_DATALog_Delete() {}
+ ~RGWOp_DATALog_Delete() override {}
+
+ int check_caps(const RGWUserCaps& caps) override {
+ return caps.check_cap("datalog", RGW_CAP_WRITE);
+ }
+ void execute(optional_yield y) override;
+ const char* name() const override {
+ return "trim_data_changes_log";
+ }
+};
+
+class RGWHandler_Log : public RGWHandler_Auth_S3 {
+protected:
+ RGWOp *op_get() override;
+ RGWOp *op_delete() override;
+ RGWOp *op_post() override;
+
+ int read_permissions(RGWOp*, optional_yield) override {
+ return 0;
+ }
+public:
+ using RGWHandler_Auth_S3::RGWHandler_Auth_S3;
+ ~RGWHandler_Log() override = default;
+};
+
+class RGWRESTMgr_Log : public RGWRESTMgr {
+public:
+ RGWRESTMgr_Log() = default;
+ ~RGWRESTMgr_Log() override = default;
+
+ RGWHandler_REST* get_handler(rgw::sal::Driver* driver,
+ req_state* const,
+ const rgw::auth::StrategyRegistry& auth_registry,
+                               const std::string& frontend_prefix) override {
+ return new RGWHandler_Log(auth_registry);
+ }
+};
diff --git a/src/rgw/driver/rados/rgw_rest_pubsub.h b/src/rgw/driver/rados/rgw_rest_pubsub.h
new file mode 100644
index 000000000..27bde7a95
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_rest_pubsub.h
@@ -0,0 +1,38 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#pragma once
+
+#include "rgw_rest_s3.h"
+
+// s3 compliant notification handler factory
+class RGWHandler_REST_PSNotifs_S3 : public RGWHandler_REST_S3 {
+protected:
+ int init_permissions(RGWOp* op, optional_yield y) override {return 0;}
+ int read_permissions(RGWOp* op, optional_yield y) override {return 0;}
+ bool supports_quota() override {return false;}
+ RGWOp* op_get() override;
+ RGWOp* op_put() override;
+ RGWOp* op_delete() override;
+public:
+ using RGWHandler_REST_S3::RGWHandler_REST_S3;
+ virtual ~RGWHandler_REST_PSNotifs_S3() = default;
+ // following are used to generate the operations when invoked by another REST handler
+ static RGWOp* create_get_op();
+ static RGWOp* create_put_op();
+ static RGWOp* create_delete_op();
+};
+
+// AWS compliant topics handler factory
+class RGWHandler_REST_PSTopic_AWS : public RGWHandler_REST {
+ const rgw::auth::StrategyRegistry& auth_registry;
+protected:
+ RGWOp* op_post() override;
+public:
+ RGWHandler_REST_PSTopic_AWS(const rgw::auth::StrategyRegistry& _auth_registry) :
+ auth_registry(_auth_registry) {}
+ virtual ~RGWHandler_REST_PSTopic_AWS() = default;
+ int postauth_init(optional_yield) override { return 0; }
+ int authorize(const DoutPrefixProvider* dpp, optional_yield y) override;
+ static bool action_exists(const req_state* s);
+};
+
diff --git a/src/rgw/driver/rados/rgw_rest_realm.cc b/src/rgw/driver/rados/rgw_rest_realm.cc
new file mode 100644
index 000000000..79640a2a1
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_rest_realm.cc
@@ -0,0 +1,376 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "common/errno.h"
+#include "rgw_rest_realm.h"
+#include "rgw_rest_s3.h"
+#include "rgw_rest_config.h"
+#include "rgw_zone.h"
+#include "rgw_sal_rados.h"
+
+#include "services/svc_zone.h"
+#include "services/svc_mdlog.h"
+
+#include "include/ceph_assert.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+// reject 'period push' if we would have to fetch too many intermediate periods
+static const uint32_t PERIOD_HISTORY_FETCH_MAX = 64;
+
+// base period op, shared between Get and Post
+class RGWOp_Period_Base : public RGWRESTOp {
+ protected:
+ RGWPeriod period;
+ std::ostringstream error_stream;
+ public:
+ int verify_permission(optional_yield) override { return 0; }
+ void send_response() override;
+};
+
+// reply with the period object on success
+void RGWOp_Period_Base::send_response()
+{
+ set_req_state_err(s, op_ret, error_stream.str());
+ dump_errno(s);
+
+ if (op_ret < 0) {
+ if (!s->err.message.empty()) {
+ ldpp_dout(this, 4) << "Request failed with " << op_ret
+ << ": " << s->err.message << dendl;
+ }
+ end_header(s);
+ return;
+ }
+
+ encode_json("period", period, s->formatter);
+ end_header(s, NULL, "application/json", s->formatter->get_len());
+ flusher.flush();
+}
+
+// GET /admin/realm/period
+class RGWOp_Period_Get : public RGWOp_Period_Base {
+ public:
+ void execute(optional_yield y) override;
+ int check_caps(const RGWUserCaps& caps) override {
+ return caps.check_cap("zone", RGW_CAP_READ);
+ }
+ int verify_permission(optional_yield) override {
+ return check_caps(s->user->get_caps());
+ }
+ const char* name() const override { return "get_period"; }
+};
+
+void RGWOp_Period_Get::execute(optional_yield y)
+{
+ string realm_id, realm_name, period_id;
+ epoch_t epoch = 0;
+ RESTArgs::get_string(s, "realm_id", realm_id, &realm_id);
+ RESTArgs::get_string(s, "realm_name", realm_name, &realm_name);
+ RESTArgs::get_string(s, "period_id", period_id, &period_id);
+ RESTArgs::get_uint32(s, "epoch", 0, &epoch);
+
+ period.set_id(period_id);
+ period.set_epoch(epoch);
+
+ op_ret = period.init(this, driver->ctx(), static_cast<rgw::sal::RadosStore*>(driver)->svc()->sysobj, realm_id, y, realm_name);
+ if (op_ret < 0)
+ ldpp_dout(this, 5) << "failed to read period" << dendl;
+}
+
+// POST /admin/realm/period
+class RGWOp_Period_Post : public RGWOp_Period_Base {
+ public:
+ void execute(optional_yield y) override;
+ int check_caps(const RGWUserCaps& caps) override {
+ return caps.check_cap("zone", RGW_CAP_WRITE);
+ }
+ int verify_permission(optional_yield) override {
+ return check_caps(s->user->get_caps());
+ }
+ const char* name() const override { return "post_period"; }
+ RGWOpType get_type() override { return RGW_OP_PERIOD_POST; }
+};
+
+void RGWOp_Period_Post::execute(optional_yield y)
+{
+ auto cct = driver->ctx();
+
+ // initialize the period without reading from rados
+ period.init(this, cct, static_cast<rgw::sal::RadosStore*>(driver)->svc()->sysobj, y, false);
+
+ // decode the period from input
+ const auto max_size = cct->_conf->rgw_max_put_param_size;
+ bool empty;
+ op_ret = get_json_input(cct, s, period, max_size, &empty);
+ if (op_ret < 0) {
+ ldpp_dout(this, -1) << "failed to decode period" << dendl;
+ return;
+ }
+
+ // require period.realm_id to match our realm
+ if (period.get_realm() != static_cast<rgw::sal::RadosStore*>(driver)->svc()->zone->get_realm().get_id()) {
+ error_stream << "period with realm id " << period.get_realm()
+ << " doesn't match current realm " << static_cast<rgw::sal::RadosStore*>(driver)->svc()->zone->get_realm().get_id() << std::endl;
+ op_ret = -EINVAL;
+ return;
+ }
+
+ // load the realm and current period from rados; there may be a more recent
+ // period that we haven't restarted with yet. we also don't want to modify
+ // the objects in use by RGWRados
+ RGWRealm realm(period.get_realm());
+ op_ret = realm.init(this, cct, static_cast<rgw::sal::RadosStore*>(driver)->svc()->sysobj, y);
+ if (op_ret < 0) {
+ ldpp_dout(this, -1) << "failed to read current realm: "
+ << cpp_strerror(-op_ret) << dendl;
+ return;
+ }
+
+ RGWPeriod current_period;
+ op_ret = current_period.init(this, cct, static_cast<rgw::sal::RadosStore*>(driver)->svc()->sysobj, realm.get_id(), y);
+ if (op_ret < 0) {
+ ldpp_dout(this, -1) << "failed to read current period: "
+ << cpp_strerror(-op_ret) << dendl;
+ return;
+ }
+
+ // if period id is empty, handle as 'period commit'
+ if (period.get_id().empty()) {
+ op_ret = period.commit(this, driver, realm, current_period, error_stream, y);
+ if (op_ret < 0) {
+ ldpp_dout(this, -1) << "master zone failed to commit period" << dendl;
+ }
+ return;
+ }
+
+ // if it's not period commit, nobody is allowed to push to the master zone
+ if (period.get_master_zone() == static_cast<rgw::sal::RadosStore*>(driver)->svc()->zone->get_zone_params().get_id()) {
+ ldpp_dout(this, 10) << "master zone rejecting period id="
+ << period.get_id() << " epoch=" << period.get_epoch() << dendl;
+ op_ret = -EINVAL; // XXX: error code
+ return;
+ }
+
+ // write the period to rados
+ op_ret = period.store_info(this, false, y);
+ if (op_ret < 0) {
+ ldpp_dout(this, -1) << "failed to store period " << period.get_id() << dendl;
+ return;
+ }
+ // set as latest epoch
+ op_ret = period.update_latest_epoch(this, period.get_epoch(), y);
+ if (op_ret == -EEXIST) {
+ // already have this epoch (or a more recent one)
+ ldpp_dout(this, 4) << "already have epoch >= " << period.get_epoch()
+ << " for period " << period.get_id() << dendl;
+ op_ret = 0;
+ return;
+ }
+ if (op_ret < 0) {
+ ldpp_dout(this, -1) << "failed to set latest epoch" << dendl;
+ return;
+ }
+
+ auto period_history = static_cast<rgw::sal::RadosStore*>(driver)->svc()->mdlog->get_period_history();
+
+ // decide whether we can set_current_period() or set_latest_epoch()
+ if (period.get_id() != current_period.get_id()) {
+ auto current_epoch = current_period.get_realm_epoch();
+ // discard periods in the past
+ if (period.get_realm_epoch() < current_epoch) {
+ ldpp_dout(this, 10) << "discarding period " << period.get_id()
+ << " with realm epoch " << period.get_realm_epoch()
+ << " older than current epoch " << current_epoch << dendl;
+ // return success to ack that we have this period
+ return;
+ }
+ // discard periods too far in the future
+ if (period.get_realm_epoch() > current_epoch + PERIOD_HISTORY_FETCH_MAX) {
+ ldpp_dout(this, -1) << "discarding period " << period.get_id()
+ << " with realm epoch " << period.get_realm_epoch() << " too far in "
+ "the future from current epoch " << current_epoch << dendl;
+ op_ret = -ENOENT; // XXX: error code
+ return;
+ }
+ // attach a copy of the period into the period history
+ auto cursor = period_history->attach(this, RGWPeriod{period}, y);
+ if (!cursor) {
+ // we're missing some history between the new period and current_period
+ op_ret = cursor.get_error();
+ ldpp_dout(this, -1) << "failed to collect the periods between current period "
+ << current_period.get_id() << " (realm epoch " << current_epoch
+ << ") and the new period " << period.get_id()
+ << " (realm epoch " << period.get_realm_epoch()
+ << "): " << cpp_strerror(-op_ret) << dendl;
+ return;
+ }
+ if (cursor.has_next()) {
+ // don't switch if we have a newer period in our history
+ ldpp_dout(this, 4) << "attached period " << period.get_id()
+ << " to history, but the history contains newer periods" << dendl;
+ return;
+ }
+ // set as current period
+ op_ret = realm.set_current_period(this, period, y);
+ if (op_ret < 0) {
+ ldpp_dout(this, -1) << "failed to update realm's current period" << dendl;
+ return;
+ }
+ ldpp_dout(this, 4) << "period " << period.get_id()
+ << " is newer than current period " << current_period.get_id()
+ << ", updating realm's current period and notifying zone" << dendl;
+ realm.notify_new_period(this, period, y);
+ return;
+ }
+ // reflect the period into our local objects
+ op_ret = period.reflect(this, y);
+ if (op_ret < 0) {
+ ldpp_dout(this, -1) << "failed to update local objects: "
+ << cpp_strerror(-op_ret) << dendl;
+ return;
+ }
+ ldpp_dout(this, 4) << "period epoch " << period.get_epoch()
+ << " is newer than current epoch " << current_period.get_epoch()
+ << ", updating period's latest epoch and notifying zone" << dendl;
+ realm.notify_new_period(this, period, y);
+ // update the period history
+ period_history->insert(RGWPeriod{period});
+}
+
+class RGWHandler_Period : public RGWHandler_Auth_S3 {
+ protected:
+ using RGWHandler_Auth_S3::RGWHandler_Auth_S3;
+
+ RGWOp *op_get() override { return new RGWOp_Period_Get; }
+ RGWOp *op_post() override { return new RGWOp_Period_Post; }
+};
+
+class RGWRESTMgr_Period : public RGWRESTMgr {
+ public:
+ RGWHandler_REST* get_handler(rgw::sal::Driver* driver,
+ req_state*,
+ const rgw::auth::StrategyRegistry& auth_registry,
+ const std::string&) override {
+ return new RGWHandler_Period(auth_registry);
+ }
+};
+
+
+// GET /admin/realm
+class RGWOp_Realm_Get : public RGWRESTOp {
+ std::unique_ptr<RGWRealm> realm;
+public:
+ int check_caps(const RGWUserCaps& caps) override {
+ return caps.check_cap("zone", RGW_CAP_READ);
+ }
+ int verify_permission(optional_yield) override {
+ return check_caps(s->user->get_caps());
+ }
+ void execute(optional_yield y) override;
+ void send_response() override;
+ const char* name() const override { return "get_realm"; }
+};
+
+void RGWOp_Realm_Get::execute(optional_yield y)
+{
+ string id;
+ RESTArgs::get_string(s, "id", id, &id);
+ string name;
+ RESTArgs::get_string(s, "name", name, &name);
+
+ // read realm
+ realm.reset(new RGWRealm(id, name));
+ op_ret = realm->init(this, g_ceph_context, static_cast<rgw::sal::RadosStore*>(driver)->svc()->sysobj, y);
+ if (op_ret < 0)
+ ldpp_dout(this, -1) << "failed to read realm id=" << id
+ << " name=" << name << dendl;
+}
+
+void RGWOp_Realm_Get::send_response()
+{
+ set_req_state_err(s, op_ret);
+ dump_errno(s);
+
+ if (op_ret < 0) {
+ end_header(s);
+ return;
+ }
+
+ encode_json("realm", *realm, s->formatter);
+ end_header(s, NULL, "application/json", s->formatter->get_len());
+ flusher.flush();
+}
+
+// GET /admin/realm?list
+class RGWOp_Realm_List : public RGWRESTOp {
+ std::string default_id;
+ std::list<std::string> realms;
+public:
+ int check_caps(const RGWUserCaps& caps) override {
+ return caps.check_cap("zone", RGW_CAP_READ);
+ }
+ int verify_permission(optional_yield) override {
+ return check_caps(s->user->get_caps());
+ }
+ void execute(optional_yield y) override;
+ void send_response() override;
+ const char* name() const override { return "list_realms"; }
+};
+
+void RGWOp_Realm_List::execute(optional_yield y)
+{
+ {
+ // read default realm
+ RGWRealm realm(driver->ctx(), static_cast<rgw::sal::RadosStore*>(driver)->svc()->sysobj);
+ [[maybe_unused]] int ret = realm.read_default_id(this, default_id, y);
+ }
+ op_ret = static_cast<rgw::sal::RadosStore*>(driver)->svc()->zone->list_realms(this, realms);
+ if (op_ret < 0)
+ ldpp_dout(this, -1) << "failed to list realms" << dendl;
+}
+
+void RGWOp_Realm_List::send_response()
+{
+ set_req_state_err(s, op_ret);
+ dump_errno(s);
+
+ if (op_ret < 0) {
+ end_header(s);
+ return;
+ }
+
+ s->formatter->open_object_section("realms_list");
+ encode_json("default_info", default_id, s->formatter);
+ encode_json("realms", realms, s->formatter);
+ s->formatter->close_section();
+ end_header(s, NULL, "application/json", s->formatter->get_len());
+ flusher.flush();
+}
+
+class RGWHandler_Realm : public RGWHandler_Auth_S3 {
+protected:
+ using RGWHandler_Auth_S3::RGWHandler_Auth_S3;
+ RGWOp *op_get() override {
+ if (s->info.args.sub_resource_exists("list"))
+ return new RGWOp_Realm_List;
+ return new RGWOp_Realm_Get;
+ }
+};
+
+RGWRESTMgr_Realm::RGWRESTMgr_Realm()
+{
+ // add the /admin/realm/period resource
+ register_resource("period", new RGWRESTMgr_Period);
+}
+
+RGWHandler_REST*
+RGWRESTMgr_Realm::get_handler(rgw::sal::Driver* driver,
+ req_state*,
+ const rgw::auth::StrategyRegistry& auth_registry,
+ const std::string&)
+{
+ return new RGWHandler_Realm(auth_registry);
+}
diff --git a/src/rgw/driver/rados/rgw_rest_realm.h b/src/rgw/driver/rados/rgw_rest_realm.h
new file mode 100644
index 000000000..a0d1dc1c9
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_rest_realm.h
@@ -0,0 +1,16 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include "rgw_rest.h"
+
+class RGWRESTMgr_Realm : public RGWRESTMgr {
+public:
+ RGWRESTMgr_Realm();
+
+ RGWHandler_REST* get_handler(rgw::sal::Driver* driver,
+ req_state*,
+ const rgw::auth::StrategyRegistry& auth_registry,
+ const std::string&) override;
+};
diff --git a/src/rgw/driver/rados/rgw_rest_user.cc b/src/rgw/driver/rados/rgw_rest_user.cc
new file mode 100644
index 000000000..361ceb0f7
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_rest_user.cc
@@ -0,0 +1,1137 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "common/ceph_json.h"
+
+#include "rgw_op.h"
+#include "rgw_user.h"
+#include "rgw_rest_user.h"
+#include "rgw_sal.h"
+
+#include "include/str_list.h"
+#include "include/ceph_assert.h"
+
+#include "services/svc_zone.h"
+#include "services/svc_sys_obj.h"
+#include "rgw_zone.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
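+// On non-master zones, forward the user operation to the master zone and copy
+// the access keys from its reply into op_state, so the locally written user
+// entry carries the same keys that the master generated.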
+int fetch_access_keys_from_master(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver, RGWUserAdminOpState &op_state, req_state *s, optional_yield y) {
+ bufferlist data;
+ JSONParser jp;
+ RGWUserInfo ui;
+ int op_ret = driver->forward_request_to_master(s, s->user.get(), nullptr, data, &jp, s->info, y);
+ if (op_ret < 0) {
+ ldpp_dout(dpp, 0) << "forward_request_to_master returned ret=" << op_ret << dendl;
+ return op_ret;
+ }
+ ui.decode_json(&jp);
+ op_state.op_access_keys = std::move(ui.access_keys);
+
+ return 0;
+}
+
+class RGWOp_User_List : public RGWRESTOp {
+
+public:
+ RGWOp_User_List() {}
+
+ int check_caps(const RGWUserCaps& caps) override {
+ return caps.check_cap("users", RGW_CAP_READ);
+ }
+
+ void execute(optional_yield y) override;
+
+ const char* name() const override { return "list_user"; }
+};
+
+void RGWOp_User_List::execute(optional_yield y)
+{
+ RGWUserAdminOpState op_state(driver);
+
+ uint32_t max_entries;
+ std::string marker;
+ RESTArgs::get_uint32(s, "max-entries", 1000, &max_entries);
+ RESTArgs::get_string(s, "marker", marker, &marker);
+
+ op_state.max_entries = max_entries;
+ op_state.marker = marker;
+ op_ret = RGWUserAdminOp_User::list(this, driver, op_state, flusher);
+}
+
+class RGWOp_User_Info : public RGWRESTOp {
+
+public:
+ RGWOp_User_Info() {}
+
+ int check_caps(const RGWUserCaps& caps) override {
+ return caps.check_cap("users", RGW_CAP_READ);
+ }
+
+ void execute(optional_yield y) override;
+
+ const char* name() const override { return "get_user_info"; }
+};
+
+void RGWOp_User_Info::execute(optional_yield y)
+{
+ RGWUserAdminOpState op_state(driver);
+
+ std::string uid_str, access_key_str;
+ bool fetch_stats;
+ bool sync_stats;
+
+ RESTArgs::get_string(s, "uid", uid_str, &uid_str);
+ RESTArgs::get_string(s, "access-key", access_key_str, &access_key_str);
+
+  // if uid was not supplied as a REST argument, error out now; otherwise we'd
+  // end up initializing the anonymous user, for which keys.init would
+  // eventually return -EACCES
+ if (uid_str.empty() && access_key_str.empty()){
+ op_ret=-EINVAL;
+ return;
+ }
+
+ rgw_user uid(uid_str);
+
+ RESTArgs::get_bool(s, "stats", false, &fetch_stats);
+
+ RESTArgs::get_bool(s, "sync", false, &sync_stats);
+
+ op_state.set_user_id(uid);
+ op_state.set_access_key(access_key_str);
+ op_state.set_fetch_stats(fetch_stats);
+ op_state.set_sync_stats(sync_stats);
+
+ op_ret = RGWUserAdminOp_User::info(s, driver, op_state, flusher, y);
+}
+
+class RGWOp_User_Create : public RGWRESTOp {
+
+public:
+ RGWOp_User_Create() {}
+
+ int check_caps(const RGWUserCaps& caps) override {
+ return caps.check_cap("users", RGW_CAP_WRITE);
+ }
+
+ void execute(optional_yield y) override;
+
+ const char* name() const override { return "create_user"; }
+};
+
+void RGWOp_User_Create::execute(optional_yield y)
+{
+ std::string uid_str;
+ std::string display_name;
+ std::string email;
+ std::string access_key;
+ std::string secret_key;
+ std::string key_type_str;
+ std::string caps;
+ std::string tenant_name;
+ std::string op_mask_str;
+ std::string default_placement_str;
+ std::string placement_tags_str;
+
+ bool gen_key;
+ bool suspended;
+ bool system;
+ bool exclusive;
+
+ int32_t max_buckets;
+ const int32_t default_max_buckets =
+ s->cct->_conf.get_val<int64_t>("rgw_user_max_buckets");
+
+ RGWUserAdminOpState op_state(driver);
+
+ RESTArgs::get_string(s, "uid", uid_str, &uid_str);
+ rgw_user uid(uid_str);
+
+ RESTArgs::get_string(s, "display-name", display_name, &display_name);
+ RESTArgs::get_string(s, "email", email, &email);
+ RESTArgs::get_string(s, "access-key", access_key, &access_key);
+ RESTArgs::get_string(s, "secret-key", secret_key, &secret_key);
+ RESTArgs::get_string(s, "key-type", key_type_str, &key_type_str);
+ RESTArgs::get_string(s, "user-caps", caps, &caps);
+ RESTArgs::get_string(s, "tenant", tenant_name, &tenant_name);
+ RESTArgs::get_bool(s, "generate-key", true, &gen_key);
+ RESTArgs::get_bool(s, "suspended", false, &suspended);
+ RESTArgs::get_int32(s, "max-buckets", default_max_buckets, &max_buckets);
+ RESTArgs::get_bool(s, "system", false, &system);
+ RESTArgs::get_bool(s, "exclusive", false, &exclusive);
+ RESTArgs::get_string(s, "op-mask", op_mask_str, &op_mask_str);
+ RESTArgs::get_string(s, "default-placement", default_placement_str, &default_placement_str);
+ RESTArgs::get_string(s, "placement-tags", placement_tags_str, &placement_tags_str);
+
+ if (!s->user->get_info().system && system) {
+ ldpp_dout(this, 0) << "cannot set system flag by non-system user" << dendl;
+ op_ret = -EINVAL;
+ return;
+ }
+
+ if (!tenant_name.empty()) {
+ uid.tenant = tenant_name;
+ }
+
+  // TODO: validate that required args are passed in (e.g. uid and display_name here)
+ op_state.set_user_id(uid);
+ op_state.set_display_name(display_name);
+ op_state.set_user_email(email);
+ op_state.set_caps(caps);
+ op_state.set_access_key(access_key);
+ op_state.set_secret_key(secret_key);
+
+ if (!op_mask_str.empty()) {
+ uint32_t op_mask;
+ int ret = rgw_parse_op_type_list(op_mask_str, &op_mask);
+ if (ret < 0) {
+ ldpp_dout(this, 0) << "failed to parse op_mask: " << ret << dendl;
+ op_ret = -EINVAL;
+ return;
+ }
+ op_state.set_op_mask(op_mask);
+ }
+
+ if (!key_type_str.empty()) {
+ int32_t key_type = KEY_TYPE_UNDEFINED;
+ if (key_type_str.compare("swift") == 0)
+ key_type = KEY_TYPE_SWIFT;
+ else if (key_type_str.compare("s3") == 0)
+ key_type = KEY_TYPE_S3;
+
+ op_state.set_key_type(key_type);
+ }
+
+ if (max_buckets != default_max_buckets) {
+ if (max_buckets < 0) {
+ max_buckets = -1;
+ }
+ op_state.set_max_buckets(max_buckets);
+ }
+ if (s->info.args.exists("suspended"))
+ op_state.set_suspension(suspended);
+
+ if (s->info.args.exists("system"))
+ op_state.set_system(system);
+
+ if (s->info.args.exists("exclusive"))
+ op_state.set_exclusive(exclusive);
+
+ if (!default_placement_str.empty()) {
+ rgw_placement_rule target_rule;
+ target_rule.from_str(default_placement_str);
+ if (!driver->valid_placement(target_rule)) {
+ ldpp_dout(this, 0) << "NOTICE: invalid dest placement: " << target_rule.to_str() << dendl;
+ op_ret = -EINVAL;
+ return;
+ }
+ op_state.set_default_placement(target_rule);
+ }
+
+ if (!placement_tags_str.empty()) {
+ list<string> placement_tags_list;
+ get_str_list(placement_tags_str, ",", placement_tags_list);
+ op_state.set_placement_tags(placement_tags_list);
+ }
+
+ if(!(driver->is_meta_master())) {
+ op_ret = fetch_access_keys_from_master(this, driver, op_state, s, y);
+
+ if(op_ret < 0) {
+ return;
+ } else {
+      // don't call set_generate_key() when keys have already been fetched from the master zone
+ gen_key = false;
+ }
+ }
+
+ if (gen_key) {
+ op_state.set_generate_key();
+ }
+
+ op_ret = RGWUserAdminOp_User::create(s, driver, op_state, flusher, y);
+}
+
+class RGWOp_User_Modify : public RGWRESTOp {
+
+public:
+ RGWOp_User_Modify() {}
+
+ int check_caps(const RGWUserCaps& caps) override {
+ return caps.check_cap("users", RGW_CAP_WRITE);
+ }
+
+ void execute(optional_yield y) override;
+
+ const char* name() const override { return "modify_user"; }
+};
+
+void RGWOp_User_Modify::execute(optional_yield y)
+{
+ std::string uid_str;
+ std::string display_name;
+ std::string email;
+ std::string access_key;
+ std::string secret_key;
+ std::string key_type_str;
+ std::string op_mask_str;
+ std::string default_placement_str;
+ std::string placement_tags_str;
+
+ bool gen_key;
+ bool suspended;
+ bool system;
+ bool email_set;
+ bool quota_set;
+ int32_t max_buckets;
+
+ RGWUserAdminOpState op_state(driver);
+
+ RESTArgs::get_string(s, "uid", uid_str, &uid_str);
+ rgw_user uid(uid_str);
+
+ RESTArgs::get_string(s, "display-name", display_name, &display_name);
+ RESTArgs::get_string(s, "email", email, &email, &email_set);
+ RESTArgs::get_string(s, "access-key", access_key, &access_key);
+ RESTArgs::get_string(s, "secret-key", secret_key, &secret_key);
+ RESTArgs::get_bool(s, "generate-key", false, &gen_key);
+ RESTArgs::get_bool(s, "suspended", false, &suspended);
+ RESTArgs::get_int32(s, "max-buckets", RGW_DEFAULT_MAX_BUCKETS, &max_buckets, &quota_set);
+ RESTArgs::get_string(s, "key-type", key_type_str, &key_type_str);
+
+ RESTArgs::get_bool(s, "system", false, &system);
+ RESTArgs::get_string(s, "op-mask", op_mask_str, &op_mask_str);
+ RESTArgs::get_string(s, "default-placement", default_placement_str, &default_placement_str);
+ RESTArgs::get_string(s, "placement-tags", placement_tags_str, &placement_tags_str);
+
+ if (!s->user->get_info().system && system) {
+ ldpp_dout(this, 0) << "cannot set system flag by non-system user" << dendl;
+ op_ret = -EINVAL;
+ return;
+ }
+
+ op_state.set_user_id(uid);
+ op_state.set_display_name(display_name);
+
+ if (email_set)
+ op_state.set_user_email(email);
+
+ op_state.set_access_key(access_key);
+ op_state.set_secret_key(secret_key);
+
+ if (quota_set) {
+ if (max_buckets < 0 ) {
+ max_buckets = -1;
+ }
+ op_state.set_max_buckets(max_buckets);
+ }
+
+ if (!key_type_str.empty()) {
+ int32_t key_type = KEY_TYPE_UNDEFINED;
+ if (key_type_str.compare("swift") == 0)
+ key_type = KEY_TYPE_SWIFT;
+ else if (key_type_str.compare("s3") == 0)
+ key_type = KEY_TYPE_S3;
+
+ op_state.set_key_type(key_type);
+ }
+
+ if (!op_mask_str.empty()) {
+ uint32_t op_mask;
+ if (rgw_parse_op_type_list(op_mask_str, &op_mask) < 0) {
+ ldpp_dout(this, 0) << "failed to parse op_mask" << dendl;
+ op_ret = -EINVAL;
+ return;
+ }
+ op_state.set_op_mask(op_mask);
+ }
+
+ if (s->info.args.exists("suspended"))
+ op_state.set_suspension(suspended);
+
+ if (s->info.args.exists("system"))
+ op_state.set_system(system);
+
+ if (!op_mask_str.empty()) {
+ uint32_t op_mask;
+ int ret = rgw_parse_op_type_list(op_mask_str, &op_mask);
+ if (ret < 0) {
+ ldpp_dout(this, 0) << "failed to parse op_mask: " << ret << dendl;
+ op_ret = -EINVAL;
+ return;
+ }
+ op_state.set_op_mask(op_mask);
+ }
+
+ if (!default_placement_str.empty()) {
+ rgw_placement_rule target_rule;
+ target_rule.from_str(default_placement_str);
+ if (!driver->valid_placement(target_rule)) {
+ ldpp_dout(this, 0) << "NOTICE: invalid dest placement: " << target_rule.to_str() << dendl;
+ op_ret = -EINVAL;
+ return;
+ }
+ op_state.set_default_placement(target_rule);
+ }
+
+ if (!placement_tags_str.empty()) {
+ list<string> placement_tags_list;
+ get_str_list(placement_tags_str, ",", placement_tags_list);
+ op_state.set_placement_tags(placement_tags_list);
+ }
+
+ if(!(driver->is_meta_master())) {
+ op_ret = fetch_access_keys_from_master(this, driver, op_state, s, y);
+
+ if(op_ret < 0) {
+ return;
+ } else {
+ // don't call set_generate_key() if keys have already been fetched from the master zone
+ gen_key = false;
+ }
+ }
+
+ if (gen_key) {
+ op_state.set_generate_key();
+ }
+
+ op_ret = RGWUserAdminOp_User::modify(s, driver, op_state, flusher, y);
+}
+
+class RGWOp_User_Remove : public RGWRESTOp {
+
+public:
+ RGWOp_User_Remove() {}
+
+ int check_caps(const RGWUserCaps& caps) override {
+ return caps.check_cap("users", RGW_CAP_WRITE);
+ }
+
+ void execute(optional_yield y) override;
+
+ const char* name() const override { return "remove_user"; }
+};
+
+void RGWOp_User_Remove::execute(optional_yield y)
+{
+ std::string uid_str;
+ bool purge_data;
+
+ RGWUserAdminOpState op_state(driver);
+
+ RESTArgs::get_string(s, "uid", uid_str, &uid_str);
+ rgw_user uid(uid_str);
+
+ RESTArgs::get_bool(s, "purge-data", false, &purge_data);
+
+ // FIXME: no double checking
+ if (!uid.empty())
+ op_state.set_user_id(uid);
+
+ op_state.set_purge_data(purge_data);
+
+ bufferlist data;
+ op_ret = driver->forward_request_to_master(s, s->user.get(), nullptr, data, nullptr, s->info, y);
+ if (op_ret < 0) {
+ ldpp_dout(this, 0) << "forward_request_to_master returned ret=" << op_ret << dendl;
+ return;
+ }
+ op_ret = RGWUserAdminOp_User::remove(s, driver, op_state, flusher, s->yield);
+}
+
+class RGWOp_Subuser_Create : public RGWRESTOp {
+
+public:
+ RGWOp_Subuser_Create() {}
+
+ int check_caps(const RGWUserCaps& caps) override {
+ return caps.check_cap("users", RGW_CAP_WRITE);
+ }
+
+ void execute(optional_yield y) override;
+
+ const char* name() const override { return "create_subuser"; }
+};
+
+void RGWOp_Subuser_Create::execute(optional_yield y)
+{
+ std::string uid_str;
+ std::string subuser;
+ std::string secret_key;
+ std::string access_key;
+ std::string perm_str;
+ std::string key_type_str;
+
+ bool gen_subuser = false; // FIXME placeholder
+ bool gen_secret;
+ bool gen_access;
+
+ uint32_t perm_mask = 0;
+ int32_t key_type = KEY_TYPE_SWIFT;
+
+ RGWUserAdminOpState op_state(driver);
+
+ RESTArgs::get_string(s, "uid", uid_str, &uid_str);
+ rgw_user uid(uid_str);
+
+ RESTArgs::get_string(s, "subuser", subuser, &subuser);
+ RESTArgs::get_string(s, "access-key", access_key, &access_key);
+ RESTArgs::get_string(s, "secret-key", secret_key, &secret_key);
+ RESTArgs::get_string(s, "access", perm_str, &perm_str);
+ RESTArgs::get_string(s, "key-type", key_type_str, &key_type_str);
+ RESTArgs::get_bool(s, "generate-secret", false, &gen_secret);
+ RESTArgs::get_bool(s, "gen-access-key", false, &gen_access);
+
+ perm_mask = rgw_str_to_perm(perm_str.c_str());
+ op_state.set_perm(perm_mask);
+
+ op_state.set_user_id(uid);
+ op_state.set_subuser(subuser);
+ op_state.set_access_key(access_key);
+ op_state.set_secret_key(secret_key);
+ op_state.set_generate_subuser(gen_subuser);
+
+ if (gen_access)
+ op_state.set_gen_access();
+
+ if (gen_secret)
+ op_state.set_gen_secret();
+
+ if (!key_type_str.empty()) {
+ if (key_type_str.compare("swift") == 0)
+ key_type = KEY_TYPE_SWIFT;
+ else if (key_type_str.compare("s3") == 0)
+ key_type = KEY_TYPE_S3;
+ }
+ op_state.set_key_type(key_type);
+
+ bufferlist data;
+ op_ret = driver->forward_request_to_master(s, s->user.get(), nullptr, data, nullptr, s->info, y);
+ if (op_ret < 0) {
+ ldpp_dout(this, 0) << "forward_request_to_master returned ret=" << op_ret << dendl;
+ return;
+ }
+ op_ret = RGWUserAdminOp_Subuser::create(s, driver, op_state, flusher, y);
+}
+
+class RGWOp_Subuser_Modify : public RGWRESTOp {
+
+public:
+ RGWOp_Subuser_Modify() {}
+
+ int check_caps(const RGWUserCaps& caps) override {
+ return caps.check_cap("users", RGW_CAP_WRITE);
+ }
+
+ void execute(optional_yield y) override;
+
+ const char* name() const override { return "modify_subuser"; }
+};
+
+void RGWOp_Subuser_Modify::execute(optional_yield y)
+{
+ std::string uid_str;
+ std::string subuser;
+ std::string secret_key;
+ std::string key_type_str;
+ std::string perm_str;
+
+ RGWUserAdminOpState op_state(driver);
+
+ uint32_t perm_mask;
+ int32_t key_type = KEY_TYPE_SWIFT;
+
+ bool gen_secret;
+
+ RESTArgs::get_string(s, "uid", uid_str, &uid_str);
+ rgw_user uid(uid_str);
+
+ RESTArgs::get_string(s, "subuser", subuser, &subuser);
+ RESTArgs::get_string(s, "secret-key", secret_key, &secret_key);
+ RESTArgs::get_string(s, "access", perm_str, &perm_str);
+ RESTArgs::get_string(s, "key-type", key_type_str, &key_type_str);
+ RESTArgs::get_bool(s, "generate-secret", false, &gen_secret);
+
+ perm_mask = rgw_str_to_perm(perm_str.c_str());
+ op_state.set_perm(perm_mask);
+
+ op_state.set_user_id(uid);
+ op_state.set_subuser(subuser);
+
+ if (!secret_key.empty())
+ op_state.set_secret_key(secret_key);
+
+ if (gen_secret)
+ op_state.set_gen_secret();
+
+ if (!key_type_str.empty()) {
+ if (key_type_str.compare("swift") == 0)
+ key_type = KEY_TYPE_SWIFT;
+ else if (key_type_str.compare("s3") == 0)
+ key_type = KEY_TYPE_S3;
+ }
+ op_state.set_key_type(key_type);
+
+ bufferlist data;
+ op_ret = driver->forward_request_to_master(s, s->user.get(), nullptr, data, nullptr, s->info, y);
+ if (op_ret < 0) {
+ ldpp_dout(this, 0) << "forward_request_to_master returned ret=" << op_ret << dendl;
+ return;
+ }
+ op_ret = RGWUserAdminOp_Subuser::modify(s, driver, op_state, flusher, y);
+}
+
+class RGWOp_Subuser_Remove : public RGWRESTOp {
+
+public:
+ RGWOp_Subuser_Remove() {}
+
+ int check_caps(const RGWUserCaps& caps) override {
+ return caps.check_cap("users", RGW_CAP_WRITE);
+ }
+
+ void execute(optional_yield y) override;
+
+ const char* name() const override { return "remove_subuser"; }
+};
+
+void RGWOp_Subuser_Remove::execute(optional_yield y)
+{
+ std::string uid_str;
+ std::string subuser;
+ bool purge_keys;
+
+ RGWUserAdminOpState op_state(driver);
+
+ RESTArgs::get_string(s, "uid", uid_str, &uid_str);
+ rgw_user uid(uid_str);
+
+ RESTArgs::get_string(s, "subuser", subuser, &subuser);
+ RESTArgs::get_bool(s, "purge-keys", true, &purge_keys);
+
+ op_state.set_user_id(uid);
+ op_state.set_subuser(subuser);
+
+ if (purge_keys)
+ op_state.set_purge_keys();
+
+ bufferlist data;
+ op_ret = driver->forward_request_to_master(s, s->user.get(), nullptr, data, nullptr, s->info, y);
+ if (op_ret < 0) {
+ ldpp_dout(this, 0) << "forward_request_to_master returned ret=" << op_ret << dendl;
+ return;
+ }
+ op_ret = RGWUserAdminOp_Subuser::remove(s, driver, op_state, flusher, y);
+}
+
+class RGWOp_Key_Create : public RGWRESTOp {
+
+public:
+ RGWOp_Key_Create() {}
+
+ int check_caps(const RGWUserCaps& caps) override {
+ return caps.check_cap("users", RGW_CAP_WRITE);
+ }
+
+ void execute(optional_yield y) override;
+
+ const char* name() const override { return "create_access_key"; }
+};
+
+void RGWOp_Key_Create::execute(optional_yield y)
+{
+ std::string uid_str;
+ std::string subuser;
+ std::string access_key;
+ std::string secret_key;
+ std::string key_type_str;
+
+ bool gen_key;
+
+ RGWUserAdminOpState op_state(driver);
+
+ RESTArgs::get_string(s, "uid", uid_str, &uid_str);
+ rgw_user uid(uid_str);
+
+ RESTArgs::get_string(s, "subuser", subuser, &subuser);
+ RESTArgs::get_string(s, "access-key", access_key, &access_key);
+ RESTArgs::get_string(s, "secret-key", secret_key, &secret_key);
+ RESTArgs::get_string(s, "key-type", key_type_str, &key_type_str);
+ RESTArgs::get_bool(s, "generate-key", true, &gen_key);
+
+ op_state.set_user_id(uid);
+ op_state.set_subuser(subuser);
+ op_state.set_access_key(access_key);
+ op_state.set_secret_key(secret_key);
+
+ if (gen_key)
+ op_state.set_generate_key();
+
+ if (!key_type_str.empty()) {
+ int32_t key_type = KEY_TYPE_UNDEFINED;
+ if (key_type_str.compare("swift") == 0)
+ key_type = KEY_TYPE_SWIFT;
+ else if (key_type_str.compare("s3") == 0)
+ key_type = KEY_TYPE_S3;
+
+ op_state.set_key_type(key_type);
+ }
+
+ op_ret = RGWUserAdminOp_Key::create(s, driver, op_state, flusher, y);
+}
+
+class RGWOp_Key_Remove : public RGWRESTOp {
+
+public:
+ RGWOp_Key_Remove() {}
+
+ int check_caps(const RGWUserCaps& caps) override {
+ return caps.check_cap("users", RGW_CAP_WRITE);
+ }
+
+ void execute(optional_yield y) override;
+
+ const char* name() const override { return "remove_access_key"; }
+};
+
+void RGWOp_Key_Remove::execute(optional_yield y)
+{
+ std::string uid_str;
+ std::string subuser;
+ std::string access_key;
+ std::string key_type_str;
+
+ RGWUserAdminOpState op_state(driver);
+
+ RESTArgs::get_string(s, "uid", uid_str, &uid_str);
+ rgw_user uid(uid_str);
+
+ RESTArgs::get_string(s, "subuser", subuser, &subuser);
+ RESTArgs::get_string(s, "access-key", access_key, &access_key);
+ RESTArgs::get_string(s, "key-type", key_type_str, &key_type_str);
+
+ op_state.set_user_id(uid);
+ op_state.set_subuser(subuser);
+ op_state.set_access_key(access_key);
+
+ if (!key_type_str.empty()) {
+ int32_t key_type = KEY_TYPE_UNDEFINED;
+ if (key_type_str.compare("swift") == 0)
+ key_type = KEY_TYPE_SWIFT;
+ else if (key_type_str.compare("s3") == 0)
+ key_type = KEY_TYPE_S3;
+
+ op_state.set_key_type(key_type);
+ }
+
+ op_ret = RGWUserAdminOp_Key::remove(s, driver, op_state, flusher, y);
+}
+
+class RGWOp_Caps_Add : public RGWRESTOp {
+
+public:
+ RGWOp_Caps_Add() {}
+
+ int check_caps(const RGWUserCaps& caps) override {
+ return caps.check_cap("users", RGW_CAP_WRITE);
+ }
+
+ void execute(optional_yield y) override;
+
+ const char* name() const override { return "add_user_caps"; }
+};
+
+void RGWOp_Caps_Add::execute(optional_yield y)
+{
+ std::string uid_str;
+ std::string caps;
+
+ RGWUserAdminOpState op_state(driver);
+
+ RESTArgs::get_string(s, "uid", uid_str, &uid_str);
+ rgw_user uid(uid_str);
+
+ RESTArgs::get_string(s, "user-caps", caps, &caps);
+
+ op_state.set_user_id(uid);
+ op_state.set_caps(caps);
+
+ bufferlist data;
+ op_ret = driver->forward_request_to_master(s, s->user.get(), nullptr, data, nullptr, s->info, y);
+ if (op_ret < 0) {
+ ldpp_dout(this, 0) << "forward_request_to_master returned ret=" << op_ret << dendl;
+ return;
+ }
+ op_ret = RGWUserAdminOp_Caps::add(s, driver, op_state, flusher, y);
+}
+
+class RGWOp_Caps_Remove : public RGWRESTOp {
+
+public:
+ RGWOp_Caps_Remove() {}
+
+ int check_caps(const RGWUserCaps& caps) override {
+ return caps.check_cap("users", RGW_CAP_WRITE);
+ }
+
+ void execute(optional_yield y) override;
+
+ const char* name() const override { return "remove_user_caps"; }
+};
+
+void RGWOp_Caps_Remove::execute(optional_yield y)
+{
+ std::string uid_str;
+ std::string caps;
+
+ RGWUserAdminOpState op_state(driver);
+
+ RESTArgs::get_string(s, "uid", uid_str, &uid_str);
+ rgw_user uid(uid_str);
+
+ RESTArgs::get_string(s, "user-caps", caps, &caps);
+
+ op_state.set_user_id(uid);
+ op_state.set_caps(caps);
+
+ bufferlist data;
+ op_ret = driver->forward_request_to_master(s, s->user.get(), nullptr, data, nullptr, s->info, y);
+ if (op_ret < 0) {
+ ldpp_dout(this, 0) << "forward_request_to_master returned ret=" << op_ret << dendl;
+ return;
+ }
+ op_ret = RGWUserAdminOp_Caps::remove(s, driver, op_state, flusher, y);
+}
+
+struct UserQuotas {
+ RGWQuota quota;
+
+ UserQuotas() {}
+
+ explicit UserQuotas(RGWUserInfo& info){
+ quota.bucket_quota = info.quota.bucket_quota;
+ quota.user_quota = info.quota.user_quota;
+ }
+
+ void dump(Formatter *f) const {
+ encode_json("bucket_quota", quota.bucket_quota, f);
+ encode_json("user_quota", quota.user_quota, f);
+ }
+ void decode_json(JSONObj *obj) {
+ JSONDecoder::decode_json("bucket_quota", quota.bucket_quota, obj);
+ JSONDecoder::decode_json("user_quota", quota.user_quota, obj);
+ }
+};
+
+class RGWOp_Quota_Info : public RGWRESTOp {
+
+public:
+ RGWOp_Quota_Info() {}
+
+ int check_caps(const RGWUserCaps& caps) override {
+ return caps.check_cap("users", RGW_CAP_READ);
+ }
+
+ void execute(optional_yield y) override;
+
+ const char* name() const override { return "get_quota_info"; }
+};
+
+
+void RGWOp_Quota_Info::execute(optional_yield y)
+{
+ RGWUserAdminOpState op_state(driver);
+
+ std::string uid_str;
+ std::string quota_type;
+
+ RESTArgs::get_string(s, "uid", uid_str, &uid_str);
+ RESTArgs::get_string(s, "quota-type", quota_type, &quota_type);
+
+ if (uid_str.empty()) {
+ op_ret = -EINVAL;
+ return;
+ }
+
+ rgw_user uid(uid_str);
+
+ bool show_all = quota_type.empty();
+ bool show_bucket = show_all || (quota_type == "bucket");
+ bool show_user = show_all || (quota_type == "user");
+
+ if (!(show_all || show_bucket || show_user)) {
+ op_ret = -EINVAL;
+ return;
+ }
+
+ op_state.set_user_id(uid);
+
+ RGWUser user;
+ op_ret = user.init(s, driver, op_state, y);
+ if (op_ret < 0)
+ return;
+
+ if (!op_state.has_existing_user()) {
+ op_ret = -ERR_NO_SUCH_USER;
+ return;
+ }
+
+ RGWUserInfo info;
+ string err_msg;
+ op_ret = user.info(info, &err_msg);
+ if (op_ret < 0)
+ return;
+
+ flusher.start(0);
+ if (show_all) {
+ UserQuotas quotas(info);
+ encode_json("quota", quotas, s->formatter);
+ } else if (show_user) {
+ encode_json("user_quota", info.quota.user_quota, s->formatter);
+ } else {
+ encode_json("bucket_quota", info.quota.bucket_quota, s->formatter);
+ }
+
+ flusher.flush();
+}
+
+class RGWOp_Quota_Set : public RGWRESTOp {
+
+public:
+ RGWOp_Quota_Set() {}
+
+ int check_caps(const RGWUserCaps& caps) override {
+ return caps.check_cap("users", RGW_CAP_WRITE);
+ }
+
+ void execute(optional_yield y) override;
+
+ const char* name() const override { return "set_quota_info"; }
+};
+
+/**
+ * set quota
+ *
+ * two different ways to set the quota info: as json struct in the message body or via http params.
+ *
+ * as json:
+ *
+ * PUT /admin/user?quota&uid=<uid>[&quota-type=<type>]
+ *
+ * where quota-type is optional and is either user or bucket
+ *
+ * if quota-type is not specified then we expect to get a structure that contains both quotas,
+ * otherwise we'll only get the relevant configuration.
+ *
+ * E.g., if quota type not specified:
+ * {
+ * "user_quota" : {
+ * "max_size_kb" : 4096,
+ * "max_objects" : -1,
+ * "enabled" : false
+ * },
+ * "bucket_quota" : {
+ * "max_size_kb" : 1024,
+ * "max_objects" : -1,
+ * "enabled" : true
+ * }
+ * }
+ *
+ *
+ * or if quota type is specified:
+ * {
+ * "max_size_kb" : 4096,
+ * "max_objects" : -1,
+ * "enabled" : false
+ * }
+ *
+ * Another option is not to pass any body and set the following http params:
+ *
+ *
+ * max-size-kb=<size>
+ * max-objects=<max objects>
+ * enabled[={true,false}]
+ *
+ * all params are optional and default to the current settings. With this type of configuration the
+ * quota-type param is mandatory.
+ *
+ */
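+
+/*
+ * Illustrative examples only (uid and values are placeholders; the exact
+ * admin path depends on where RGWRESTMgr_User is mounted, typically /admin/user):
+ *
+ *   # set the user quota via http params
+ *   PUT /admin/user?quota&uid=johndoe&quota-type=user&max-size-kb=4096&enabled=true
+ *
+ *   # set both quotas at once via a json body (no quota-type given)
+ *   PUT /admin/user?quota&uid=johndoe
+ *   { "user_quota": { "max_size_kb": 4096, "max_objects": -1, "enabled": false },
+ *     "bucket_quota": { "max_size_kb": 1024, "max_objects": -1, "enabled": true } }
+ */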
+
+void RGWOp_Quota_Set::execute(optional_yield y)
+{
+ RGWUserAdminOpState op_state(driver);
+
+ std::string uid_str;
+ std::string quota_type;
+
+ RESTArgs::get_string(s, "uid", uid_str, &uid_str);
+ RESTArgs::get_string(s, "quota-type", quota_type, &quota_type);
+
+ if (uid_str.empty()) {
+ op_ret = -EINVAL;
+ return;
+ }
+
+ rgw_user uid(uid_str);
+
+ bool set_all = quota_type.empty();
+ bool set_bucket = set_all || (quota_type == "bucket");
+ bool set_user = set_all || (quota_type == "user");
+
+ if (!(set_all || set_bucket || set_user)) {
+ ldpp_dout(this, 20) << "invalid quota type" << dendl;
+ op_ret = -EINVAL;
+ return;
+ }
+
+ bool use_http_params;
+
+ if (s->content_length > 0) {
+ use_http_params = false;
+ } else {
+ const char *encoding = s->info.env->get("HTTP_TRANSFER_ENCODING");
+ use_http_params = (!encoding || strcmp(encoding, "chunked") != 0);
+ }
+
+ if (use_http_params && set_all) {
+ ldpp_dout(this, 20) << "quota type was not specified, can't set all quotas via http headers" << dendl;
+ op_ret = -EINVAL;
+ return;
+ }
+
+ op_state.set_user_id(uid);
+
+ RGWUser user;
+ op_ret = user.init(s, driver, op_state, y);
+ if (op_ret < 0) {
+ ldpp_dout(this, 20) << "failed initializing user info: " << op_ret << dendl;
+ return;
+ }
+
+ if (!op_state.has_existing_user()) {
+ op_ret = -ERR_NO_SUCH_USER;
+ return;
+ }
+
+#define QUOTA_INPUT_MAX_LEN 1024
+ if (set_all) {
+ UserQuotas quotas;
+
+ if ((op_ret = get_json_input(driver->ctx(), s, quotas, QUOTA_INPUT_MAX_LEN, NULL)) < 0) {
+ ldpp_dout(this, 20) << "failed to retrieve input" << dendl;
+ return;
+ }
+
+ op_state.set_user_quota(quotas.quota.user_quota);
+ op_state.set_bucket_quota(quotas.quota.bucket_quota);
+ } else {
+ RGWQuotaInfo quota;
+
+ if (!use_http_params) {
+ bool empty;
+ op_ret = get_json_input(driver->ctx(), s, quota, QUOTA_INPUT_MAX_LEN, &empty);
+ if (op_ret < 0) {
+ ldpp_dout(this, 20) << "failed to retrieve input" << dendl;
+ if (!empty)
+ return;
+
+ /* was probably chunked input, but no content provided, configure via http params */
+ use_http_params = true;
+ }
+ }
+
+ if (use_http_params) {
+ RGWUserInfo info;
+ string err_msg;
+ op_ret = user.info(info, &err_msg);
+ if (op_ret < 0) {
+ ldpp_dout(this, 20) << "failed to get user info: " << op_ret << dendl;
+ return;
+ }
+ RGWQuotaInfo *old_quota;
+ if (set_user) {
+ old_quota = &info.quota.user_quota;
+ } else {
+ old_quota = &info.quota.bucket_quota;
+ }
+
+ RESTArgs::get_int64(s, "max-objects", old_quota->max_objects, &quota.max_objects);
+ RESTArgs::get_int64(s, "max-size", old_quota->max_size, &quota.max_size);
+ int64_t max_size_kb;
+ bool has_max_size_kb = false;
+ RESTArgs::get_int64(s, "max-size-kb", 0, &max_size_kb, &has_max_size_kb);
+ if (has_max_size_kb) {
+ quota.max_size = max_size_kb * 1024;
+ }
+ RESTArgs::get_bool(s, "enabled", old_quota->enabled, &quota.enabled);
+ }
+
+ if (set_user) {
+ op_state.set_user_quota(quota);
+ } else {
+ op_state.set_bucket_quota(quota);
+ }
+ }
+
+ string err;
+ op_ret = user.modify(s, op_state, y, &err);
+ if (op_ret < 0) {
+ ldpp_dout(this, 20) << "failed updating user info: " << op_ret << ": " << err << dendl;
+ return;
+ }
+}
+
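+/*
+ * Dispatch summary for this handler (matches the op_* methods below):
+ *   GET    ?quota -> quota info, ?list -> user list, otherwise user info
+ *   PUT    ?subuser -> create subuser, ?key -> create key, ?caps -> add caps,
+ *          ?quota -> set quota, otherwise create user
+ *   POST   ?subuser -> modify subuser, otherwise modify user
+ *   DELETE ?subuser -> remove subuser, ?key -> remove key, ?caps -> remove caps,
+ *          otherwise remove user
+ */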
+RGWOp *RGWHandler_User::op_get()
+{
+ if (s->info.args.sub_resource_exists("quota"))
+ return new RGWOp_Quota_Info;
+
+ if (s->info.args.sub_resource_exists("list"))
+ return new RGWOp_User_List;
+
+ return new RGWOp_User_Info;
+}
+
+RGWOp *RGWHandler_User::op_put()
+{
+ if (s->info.args.sub_resource_exists("subuser"))
+ return new RGWOp_Subuser_Create;
+
+ if (s->info.args.sub_resource_exists("key"))
+ return new RGWOp_Key_Create;
+
+ if (s->info.args.sub_resource_exists("caps"))
+ return new RGWOp_Caps_Add;
+
+ if (s->info.args.sub_resource_exists("quota"))
+ return new RGWOp_Quota_Set;
+
+ return new RGWOp_User_Create;
+}
+
+RGWOp *RGWHandler_User::op_post()
+{
+ if (s->info.args.sub_resource_exists("subuser"))
+ return new RGWOp_Subuser_Modify;
+
+ return new RGWOp_User_Modify;
+}
+
+RGWOp *RGWHandler_User::op_delete()
+{
+ if (s->info.args.sub_resource_exists("subuser"))
+ return new RGWOp_Subuser_Remove;
+
+ if (s->info.args.sub_resource_exists("key"))
+ return new RGWOp_Key_Remove;
+
+ if (s->info.args.sub_resource_exists("caps"))
+ return new RGWOp_Caps_Remove;
+
+ return new RGWOp_User_Remove;
+}
+
diff --git a/src/rgw/driver/rados/rgw_rest_user.h b/src/rgw/driver/rados/rgw_rest_user.h
new file mode 100644
index 000000000..ee585be45
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_rest_user.h
@@ -0,0 +1,36 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include "rgw_rest.h"
+#include "rgw_rest_s3.h"
+
+
+class RGWHandler_User : public RGWHandler_Auth_S3 {
+protected:
+ RGWOp *op_get() override;
+ RGWOp *op_put() override;
+ RGWOp *op_post() override;
+ RGWOp *op_delete() override;
+public:
+ using RGWHandler_Auth_S3::RGWHandler_Auth_S3;
+ ~RGWHandler_User() override = default;
+
+ int read_permissions(RGWOp*, optional_yield) override {
+ return 0;
+ }
+};
+
+class RGWRESTMgr_User : public RGWRESTMgr {
+public:
+ RGWRESTMgr_User() = default;
+ ~RGWRESTMgr_User() override = default;
+
+ RGWHandler_REST *get_handler(rgw::sal::Driver* driver,
+ req_state*,
+ const rgw::auth::StrategyRegistry& auth_registry,
+ const std::string&) override {
+ return new RGWHandler_User(auth_registry);
+ }
+};
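+
+/* Note: this manager is typically wired into the admin API elsewhere in rgw,
+ * e.g. (illustrative) admin_resource->register_resource("user", new RGWRESTMgr_User);
+ * so that requests under /admin/user are handled by RGWHandler_User. */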
diff --git a/src/rgw/driver/rados/rgw_sal_rados.cc b/src/rgw/driver/rados/rgw_sal_rados.cc
new file mode 100644
index 000000000..9acdb79d3
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_sal_rados.cc
@@ -0,0 +1,3846 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2020 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <errno.h>
+#include <stdlib.h>
+#include <system_error>
+#include <filesystem>
+#include <unistd.h>
+#include <sstream>
+#include <boost/algorithm/string.hpp>
+#include <boost/process.hpp>
+
+#include "common/Clock.h"
+#include "common/errno.h"
+
+#include "rgw_sal.h"
+#include "rgw_sal_rados.h"
+#include "rgw_bucket.h"
+#include "rgw_multi.h"
+#include "rgw_acl.h"
+#include "rgw_acl_s3.h"
+#include "rgw_aio.h"
+#include "rgw_aio_throttle.h"
+#include "rgw_tracer.h"
+
+#include "rgw_zone.h"
+#include "rgw_rest_conn.h"
+#include "rgw_service.h"
+#include "rgw_lc.h"
+#include "rgw_lc_tier.h"
+#include "rgw_rest_admin.h"
+#include "rgw_rest_bucket.h"
+#include "rgw_rest_metadata.h"
+#include "rgw_rest_log.h"
+#include "rgw_rest_config.h"
+#include "rgw_rest_ratelimit.h"
+#include "rgw_rest_realm.h"
+#include "rgw_rest_user.h"
+#include "services/svc_sys_obj.h"
+#include "services/svc_meta.h"
+#include "services/svc_meta_be_sobj.h"
+#include "services/svc_cls.h"
+#include "services/svc_zone.h"
+#include "services/svc_tier_rados.h"
+#include "services/svc_quota.h"
+#include "services/svc_config_key.h"
+#include "services/svc_zone_utils.h"
+#include "services/svc_role_rados.h"
+#include "services/svc_user.h"
+#include "cls/rgw/cls_rgw_client.h"
+
+#include "rgw_pubsub.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+static string mp_ns = RGW_OBJ_NS_MULTIPART;
+
+namespace rgw::sal {
+
+// default number of entries to list with each bucket listing call
+// (use marker to bridge between calls)
+static constexpr size_t listing_max_entries = 1000;
+static std::string pubsub_oid_prefix = "pubsub.";
+
+static int decode_policy(CephContext* cct,
+ bufferlist& bl,
+ RGWAccessControlPolicy* policy)
+{
+ auto iter = bl.cbegin();
+ try {
+ policy->decode(iter);
+ } catch (buffer::error& err) {
+ ldout(cct, 0) << "ERROR: could not decode policy, caught buffer::error" << dendl;
+ return -EIO;
+ }
+ if (cct->_conf->subsys.should_gather<ceph_subsys_rgw, 15>()) {
+ ldout(cct, 15) << __func__ << " Read AccessControlPolicy";
+ RGWAccessControlPolicy_S3* s3policy = static_cast<RGWAccessControlPolicy_S3 *>(policy);
+ s3policy->to_xml(*_dout);
+ *_dout << dendl;
+ }
+ return 0;
+}
+
+static int rgw_op_get_bucket_policy_from_attr(const DoutPrefixProvider* dpp,
+ RadosStore* store,
+ User* user,
+ Attrs& bucket_attrs,
+ RGWAccessControlPolicy* policy,
+ optional_yield y)
+{
+ auto aiter = bucket_attrs.find(RGW_ATTR_ACL);
+
+ if (aiter != bucket_attrs.end()) {
+ int ret = decode_policy(store->ctx(), aiter->second, policy);
+ if (ret < 0)
+ return ret;
+ } else {
+ ldout(store->ctx(), 0) << "WARNING: couldn't find acl header for bucket, generating default" << dendl;
+ /* object exists, but policy is broken */
+ int r = user->load_user(dpp, y);
+ if (r < 0)
+ return r;
+
+ policy->create_default(user->get_id(), user->get_display_name());
+ }
+ return 0;
+}
+
+static int drain_aio(std::list<librados::AioCompletion*>& handles)
+{
+ int ret = 0;
+ while (!handles.empty()) {
+ librados::AioCompletion* handle = handles.front();
+ handles.pop_front();
+ handle->wait_for_complete();
+ int r = handle->get_return_value();
+ handle->release();
+ if (r < 0) {
+ ret = r;
+ }
+ }
+ return ret;
+}
+
+int RadosCompletions::drain()
+{
+ return drain_aio(handles);
+}
+
+int RadosUser::list_buckets(const DoutPrefixProvider* dpp, const std::string& marker,
+ const std::string& end_marker, uint64_t max, bool need_stats,
+ BucketList &buckets, optional_yield y)
+{
+ RGWUserBuckets ulist;
+ bool is_truncated = false;
+ int ret;
+
+ buckets.clear();
+ ret = store->ctl()->user->list_buckets(dpp, info.user_id, marker, end_marker, max,
+ need_stats, &ulist, &is_truncated, y);
+ if (ret < 0)
+ return ret;
+
+ buckets.set_truncated(is_truncated);
+ for (const auto& ent : ulist.get_buckets()) {
+ buckets.add(std::unique_ptr<Bucket>(new RadosBucket(this->store, ent.second, this)));
+ }
+
+ return 0;
+}
+
+int RadosUser::create_bucket(const DoutPrefixProvider* dpp,
+ const rgw_bucket& b,
+ const std::string& zonegroup_id,
+ rgw_placement_rule& placement_rule,
+ std::string& swift_ver_location,
+ const RGWQuotaInfo * pquota_info,
+ const RGWAccessControlPolicy& policy,
+ Attrs& attrs,
+ RGWBucketInfo& info,
+ obj_version& ep_objv,
+ bool exclusive,
+ bool obj_lock_enabled,
+ bool* existed,
+ req_info& req_info,
+ std::unique_ptr<Bucket>* bucket_out,
+ optional_yield y)
+{
+ int ret;
+ bufferlist in_data;
+ RGWBucketInfo master_info;
+ rgw_bucket* pmaster_bucket;
+ uint32_t* pmaster_num_shards;
+ real_time creation_time;
+ std::unique_ptr<Bucket> bucket;
+ obj_version objv,* pobjv = NULL;
+
+ /* If it exists, look it up; otherwise create it */
+ ret = store->get_bucket(dpp, this, b, &bucket, y);
+ if (ret < 0 && ret != -ENOENT)
+ return ret;
+
+ if (ret != -ENOENT) {
+ RGWAccessControlPolicy old_policy(store->ctx());
+ *existed = true;
+ if (swift_ver_location.empty()) {
+ swift_ver_location = bucket->get_info().swift_ver_location;
+ }
+ placement_rule.inherit_from(bucket->get_info().placement_rule);
+
+ // don't allow changes to the acl policy
+ int r = rgw_op_get_bucket_policy_from_attr(dpp, store, this, bucket->get_attrs(),
+ &old_policy, y);
+ if (r >= 0 && old_policy != policy) {
+ bucket_out->swap(bucket);
+ return -EEXIST;
+ }
+ } else {
+ bucket = std::unique_ptr<Bucket>(new RadosBucket(store, b, this));
+ *existed = false;
+ bucket->set_attrs(attrs);
+ }
+
+ if (!store->svc()->zone->is_meta_master()) {
+ JSONParser jp;
+ ret = store->forward_request_to_master(dpp, this, NULL, in_data, &jp, req_info, y);
+ if (ret < 0) {
+ return ret;
+ }
+
+ JSONDecoder::decode_json("entry_point_object_ver", ep_objv, &jp);
+ JSONDecoder::decode_json("object_ver", objv, &jp);
+ JSONDecoder::decode_json("bucket_info", master_info, &jp);
+ ldpp_dout(dpp, 20) << "parsed: objv.tag=" << objv.tag << " objv.ver=" << objv.ver << dendl;
+ std::time_t ctime = ceph::real_clock::to_time_t(master_info.creation_time);
+ ldpp_dout(dpp, 20) << "got creation time: << " << std::put_time(std::localtime(&ctime), "%F %T") << dendl;
+ pmaster_bucket= &master_info.bucket;
+ creation_time = master_info.creation_time;
+ pmaster_num_shards = &master_info.layout.current_index.layout.normal.num_shards;
+ pobjv = &objv;
+ if (master_info.obj_lock_enabled()) {
+ info.flags = BUCKET_VERSIONED | BUCKET_OBJ_LOCK_ENABLED;
+ }
+ } else {
+ pmaster_bucket = NULL;
+ pmaster_num_shards = NULL;
+ if (obj_lock_enabled)
+ info.flags = BUCKET_VERSIONED | BUCKET_OBJ_LOCK_ENABLED;
+ }
+
+ std::string zid = zonegroup_id;
+ if (zid.empty()) {
+ zid = store->svc()->zone->get_zonegroup().get_id();
+ }
+
+ if (*existed) {
+ rgw_placement_rule selected_placement_rule;
+ ret = store->svc()->zone->select_bucket_placement(dpp, this->get_info(),
+ zid, placement_rule,
+ &selected_placement_rule, nullptr, y);
+ if (selected_placement_rule != info.placement_rule) {
+ ret = -EEXIST;
+ bucket_out->swap(bucket);
+ return ret;
+ }
+ } else {
+
+ ret = store->getRados()->create_bucket(this->get_info(), bucket->get_key(),
+ zid, placement_rule, swift_ver_location, pquota_info,
+ attrs, info, pobjv, &ep_objv, creation_time,
+ pmaster_bucket, pmaster_num_shards, y, dpp,
+ exclusive);
+ if (ret == -EEXIST) {
+ *existed = true;
+ /* bucket already existed, might have raced with another bucket creation,
+ * or might be partial bucket creation that never completed. Read existing
+ * bucket info, verify that the reported bucket owner is the current user.
+ * If all is ok then update the user's list of buckets. Otherwise inform
+ * client about a name conflict.
+ */
+ if (info.owner.compare(this->get_id()) != 0) {
+ return -EEXIST;
+ }
+ ret = 0;
+ } else if (ret != 0) {
+ return ret;
+ }
+ }
+
+ bucket->set_version(ep_objv);
+ bucket->get_info() = info;
+
+ RadosBucket* rbucket = static_cast<RadosBucket*>(bucket.get());
+ ret = rbucket->link(dpp, this, y, false);
+ if (ret && !*existed && ret != -EEXIST) {
+ /* if it exists (or previously existed), don't remove it! */
+ ret = rbucket->unlink(dpp, this, y);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "WARNING: failed to unlink bucket: ret=" << ret
+ << dendl;
+ }
+ } else if (ret == -EEXIST || (ret == 0 && *existed)) {
+ ret = -ERR_BUCKET_EXISTS;
+ }
+
+ bucket_out->swap(bucket);
+
+ return ret;
+}
+
+int RadosUser::read_attrs(const DoutPrefixProvider* dpp, optional_yield y)
+{
+ return store->ctl()->user->get_attrs_by_uid(dpp, get_id(), &attrs, y, &objv_tracker);
+}
+
+int RadosUser::merge_and_store_attrs(const DoutPrefixProvider* dpp, Attrs& new_attrs, optional_yield y)
+{
+ for(auto& it : new_attrs) {
+ attrs[it.first] = it.second;
+ }
+ return store_user(dpp, y, false);
+}
+
+int RadosUser::read_stats(const DoutPrefixProvider *dpp,
+ optional_yield y, RGWStorageStats* stats,
+ ceph::real_time* last_stats_sync,
+ ceph::real_time* last_stats_update)
+{
+ return store->ctl()->user->read_stats(dpp, get_id(), stats, y, last_stats_sync, last_stats_update);
+}
+
+int RadosUser::read_stats_async(const DoutPrefixProvider *dpp, RGWGetUserStats_CB* cb)
+{
+ return store->svc()->user->read_stats_async(dpp, get_id(), cb);
+}
+
+int RadosUser::complete_flush_stats(const DoutPrefixProvider *dpp, optional_yield y)
+{
+ return store->svc()->user->complete_flush_stats(dpp, get_id(), y);
+}
+
+int RadosUser::read_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch,
+ uint32_t max_entries, bool* is_truncated,
+ RGWUsageIter& usage_iter,
+ map<rgw_user_bucket, rgw_usage_log_entry>& usage)
+{
+ std::string bucket_name;
+ return store->getRados()->read_usage(dpp, get_id(), bucket_name, start_epoch,
+ end_epoch, max_entries, is_truncated,
+ usage_iter, usage);
+}
+
+int RadosUser::trim_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch)
+{
+ std::string bucket_name;
+
+ return store->getRados()->trim_usage(dpp, get_id(), bucket_name, start_epoch, end_epoch);
+}
+
+int RadosUser::load_user(const DoutPrefixProvider* dpp, optional_yield y)
+{
+ return store->ctl()->user->get_info_by_uid(dpp, info.user_id, &info, y, RGWUserCtl::GetParams().set_objv_tracker(&objv_tracker).set_attrs(&attrs));
+}
+
+int RadosUser::store_user(const DoutPrefixProvider* dpp, optional_yield y, bool exclusive, RGWUserInfo* old_info)
+{
+ return store->ctl()->user->store_info(dpp, info, y,
+ RGWUserCtl::PutParams().set_objv_tracker(&objv_tracker)
+ .set_exclusive(exclusive)
+ .set_attrs(&attrs)
+ .set_old_info(old_info));
+}
+
+int RadosUser::remove_user(const DoutPrefixProvider* dpp, optional_yield y)
+{
+ return store->ctl()->user->remove_info(dpp, info, y,
+ RGWUserCtl::RemoveParams().set_objv_tracker(&objv_tracker));
+}
+
+int RadosUser::verify_mfa(const std::string& mfa_str, bool* verified,
+ const DoutPrefixProvider* dpp, optional_yield y)
+{
+ vector<string> params;
+ get_str_vec(mfa_str, " ", params);
+
+ if (params.size() != 2) {
+ ldpp_dout(dpp, 5) << "NOTICE: invalid mfa string provided: " << mfa_str << dendl;
+ return -EINVAL;
+ }
+
+ string& serial = params[0];
+ string& pin = params[1];
+
+ auto i = info.mfa_ids.find(serial);
+ if (i == info.mfa_ids.end()) {
+ ldpp_dout(dpp, 5) << "NOTICE: user does not have mfa device with serial=" << serial << dendl;
+ return -EACCES;
+ }
+
+ int ret = store->svc()->cls->mfa.check_mfa(dpp, info.user_id, serial, pin, y);
+ if (ret < 0) {
+ ldpp_dout(dpp, 20) << "NOTICE: failed to check MFA, serial=" << serial << dendl;
+ return -EACCES;
+ }
+
+ *verified = true;
+
+ return 0;
+}
+
+RadosBucket::~RadosBucket() {}
+
+int RadosBucket::remove_bucket(const DoutPrefixProvider* dpp,
+ bool delete_children,
+ bool forward_to_master,
+ req_info* req_info,
+ optional_yield y)
+{
+ int ret;
+
+ // Refresh info
+ ret = load_bucket(dpp, y);
+ if (ret < 0) {
+ return ret;
+ }
+
+ ListParams params;
+ params.list_versions = true;
+ params.allow_unordered = true;
+
+ ListResults results;
+
+ do {
+ results.objs.clear();
+
+ ret = list(dpp, params, 1000, results, y);
+ if (ret < 0) {
+ return ret;
+ }
+
+ if (!results.objs.empty() && !delete_children) {
+ ldpp_dout(dpp, -1) << "ERROR: could not remove non-empty bucket " << info.bucket.name <<
+ dendl;
+ return -ENOTEMPTY;
+ }
+
+ for (const auto& obj : results.objs) {
+ rgw_obj_key key(obj.key);
+ /* xxx dang */
+ ret = rgw_remove_object(dpp, store, this, key);
+ if (ret < 0 && ret != -ENOENT) {
+ return ret;
+ }
+ }
+ } while(results.is_truncated);
+
+ ret = abort_multiparts(dpp, store->ctx());
+ if (ret < 0) {
+ return ret;
+ }
+
+ // remove lifecycle config, if any (XXX note could be made generic)
+ (void) store->getRados()->get_lc()->remove_bucket_config(
+ this, get_attrs());
+
+ ret = store->ctl()->bucket->sync_user_stats(dpp, info.owner, info, y, nullptr);
+ if (ret < 0) {
+ ldout(store->ctx(), 1) << "WARNING: failed sync user stats before bucket delete. ret=" << ret << dendl;
+ }
+
+ RGWObjVersionTracker ot;
+
+ // if we deleted children above we will force delete, as any that
+ // remain are detritus from a prior bug
+ ret = store->getRados()->delete_bucket(info, ot, y, dpp, !delete_children);
+ if (ret < 0) {
+ ldpp_dout(dpp, -1) << "ERROR: could not remove bucket " <<
+ info.bucket.name << dendl;
+ return ret;
+ }
+
+ // if bucket has notification definitions associated with it
+ // they should be removed (note that any pending notifications on the bucket are still going to be sent)
+ const RGWPubSub ps(store, info.owner.tenant);
+ const RGWPubSub::Bucket ps_bucket(ps, this);
+ const auto ps_ret = ps_bucket.remove_notifications(dpp, y);
+ if (ps_ret < 0 && ps_ret != -ENOENT) {
+ ldpp_dout(dpp, -1) << "ERROR: unable to remove notifications from bucket. ret=" << ps_ret << dendl;
+ }
+
+ ret = store->ctl()->bucket->unlink_bucket(info.owner, info.bucket, y, dpp, false);
+ if (ret < 0) {
+ ldpp_dout(dpp, -1) << "ERROR: unable to remove user bucket information" << dendl;
+ }
+
+ if (forward_to_master) {
+ bufferlist in_data;
+ ret = store->forward_request_to_master(dpp, owner, &ot.read_version, in_data, nullptr, *req_info, y);
+ if (ret < 0) {
+ if (ret == -ENOENT) {
+ /* adjust error, we want to return with NoSuchBucket and not
+ * NoSuchKey */
+ ret = -ERR_NO_SUCH_BUCKET;
+ }
+ return ret;
+ }
+ }
+
+ return ret;
+}
+
+int RadosBucket::remove_bucket_bypass_gc(int concurrent_max, bool
+ keep_index_consistent,
+ optional_yield y, const
+ DoutPrefixProvider *dpp)
+{
+ int ret;
+ map<RGWObjCategory, RGWStorageStats> stats;
+ map<string, bool> common_prefixes;
+ RGWObjectCtx obj_ctx(store);
+ CephContext *cct = store->ctx();
+
+ string bucket_ver, master_ver;
+
+ ret = load_bucket(dpp, y);
+ if (ret < 0)
+ return ret;
+
+ const auto& index = info.get_current_index();
+ ret = read_stats(dpp, index, RGW_NO_SHARD, &bucket_ver, &master_ver, stats, NULL);
+ if (ret < 0)
+ return ret;
+
+ ret = abort_multiparts(dpp, cct);
+ if (ret < 0) {
+ return ret;
+ }
+
+ rgw::sal::Bucket::ListParams params;
+ rgw::sal::Bucket::ListResults results;
+
+ params.list_versions = true;
+ params.allow_unordered = true;
+
+ std::list<librados::AioCompletion*> handles;
+
+ int max_aio = concurrent_max;
+ results.is_truncated = true;
+
+ while (results.is_truncated) {
+ ret = list(dpp, params, listing_max_entries, results, y);
+ if (ret < 0)
+ return ret;
+
+ std::vector<rgw_bucket_dir_entry>::iterator it = results.objs.begin();
+ for (; it != results.objs.end(); ++it) {
+ RGWObjState *astate = NULL;
+ RGWObjManifest *amanifest = nullptr;
+ rgw_obj obj{get_key(), it->key};
+
+ ret = store->getRados()->get_obj_state(dpp, &obj_ctx, get_info(),
+ obj, &astate, &amanifest,
+ false, y);
+ if (ret == -ENOENT) {
+ ldpp_dout(dpp, 1) << "WARNING: cannot find obj state for obj " << obj << dendl;
+ continue;
+ }
+ if (ret < 0) {
+ ldpp_dout(dpp, -1) << "ERROR: get obj state returned with error " << ret << dendl;
+ return ret;
+ }
+
+ if (amanifest) {
+ RGWObjManifest& manifest = *amanifest;
+ RGWObjManifest::obj_iterator miter = manifest.obj_begin(dpp);
+ const rgw_obj head_obj = manifest.get_obj();
+ rgw_raw_obj raw_head_obj;
+ store->get_raw_obj(manifest.get_head_placement_rule(), head_obj, &raw_head_obj);
+
+ for (; miter != manifest.obj_end(dpp) && max_aio--; ++miter) {
+ if (!max_aio) {
+ ret = drain_aio(handles);
+ if (ret < 0) {
+ ldpp_dout(dpp, -1) << "ERROR: could not drain handles as aio completion returned with " << ret << dendl;
+ return ret;
+ }
+ max_aio = concurrent_max;
+ }
+
+ rgw_raw_obj last_obj = miter.get_location().get_raw_obj(store->getRados());
+ if (last_obj == raw_head_obj) {
+ // have the head obj deleted at the end
+ continue;
+ }
+
+ ret = store->getRados()->delete_raw_obj_aio(dpp, last_obj, handles);
+ if (ret < 0) {
+ ldpp_dout(dpp, -1) << "ERROR: delete obj aio failed with " << ret << dendl;
+ return ret;
+ }
+ } // for all shadow objs
+
+ ret = store->getRados()->delete_obj_aio(dpp, head_obj, get_info(), astate,
+ handles, keep_index_consistent, y);
+ if (ret < 0) {
+ ldpp_dout(dpp, -1) << "ERROR: delete obj aio failed with " << ret << dendl;
+ return ret;
+ }
+ }
+
+ if (!max_aio) {
+ ret = drain_aio(handles);
+ if (ret < 0) {
+ ldpp_dout(dpp, -1) << "ERROR: could not drain handles as aio completion returned with " << ret << dendl;
+ return ret;
+ }
+ max_aio = concurrent_max;
+ }
+ obj_ctx.invalidate(obj);
+ } // for all RGW objects in results
+ } // while is_truncated
+
+ ret = drain_aio(handles);
+ if (ret < 0) {
+ ldpp_dout(dpp, -1) << "ERROR: could not drain handles as aio completion returned with " << ret << dendl;
+ return ret;
+ }
+
+ ret = sync_user_stats(dpp, y);
+ if (ret < 0) {
+ ldpp_dout(dpp, 1) << "WARNING: failed sync user stats before bucket delete. ret=" << ret << dendl;
+ }
+
+ RGWObjVersionTracker objv_tracker;
+
+ // this function can only be run if caller wanted children to be
+ // deleted, so we can ignore the check for children as any that
+ // remain are detritus from a prior bug
+ ret = remove_bucket(dpp, true, false, nullptr, y);
+ if (ret < 0) {
+ ldpp_dout(dpp, -1) << "ERROR: could not remove bucket " << this << dendl;
+ return ret;
+ }
+
+ return ret;
+}
+
+int RadosBucket::load_bucket(const DoutPrefixProvider* dpp, optional_yield y, bool get_stats)
+{
+ int ret;
+
+ RGWSI_MetaBackend_CtxParams bectx_params = RGWSI_MetaBackend_CtxParams_SObj();
+ RGWObjVersionTracker ep_ot;
+ if (info.bucket.bucket_id.empty()) {
+ ret = store->ctl()->bucket->read_bucket_info(info.bucket, &info, y, dpp,
+ RGWBucketCtl::BucketInstance::GetParams()
+ .set_mtime(&mtime)
+ .set_attrs(&attrs)
+ .set_bectx_params(bectx_params),
+ &ep_ot);
+ } else {
+ ret = store->ctl()->bucket->read_bucket_instance_info(info.bucket, &info, y, dpp,
+ RGWBucketCtl::BucketInstance::GetParams()
+ .set_mtime(&mtime)
+ .set_attrs(&attrs)
+ .set_bectx_params(bectx_params));
+ }
+ if (ret != 0) {
+ return ret;
+ }
+
+ bucket_version = ep_ot.read_version;
+
+ if (get_stats) {
+ ret = store->ctl()->bucket->read_bucket_stats(info.bucket, &ent, y, dpp);
+ }
+
+ return ret;
+}
+
+int RadosBucket::read_stats(const DoutPrefixProvider *dpp,
+ const bucket_index_layout_generation& idx_layout,
+ int shard_id, std::string* bucket_ver, std::string* master_ver,
+ std::map<RGWObjCategory, RGWStorageStats>& stats,
+ std::string* max_marker, bool* syncstopped)
+{
+ return store->getRados()->get_bucket_stats(dpp, info, idx_layout, shard_id, bucket_ver, master_ver, stats, max_marker, syncstopped);
+}
+
+int RadosBucket::read_stats_async(const DoutPrefixProvider *dpp,
+ const bucket_index_layout_generation& idx_layout,
+ int shard_id, RGWGetBucketStats_CB* ctx)
+{
+ return store->getRados()->get_bucket_stats_async(dpp, get_info(), idx_layout, shard_id, ctx);
+}
+
+int RadosBucket::sync_user_stats(const DoutPrefixProvider *dpp, optional_yield y)
+{
+ return store->ctl()->bucket->sync_user_stats(dpp, owner->get_id(), info, y, &ent);
+}
+
+int RadosBucket::update_container_stats(const DoutPrefixProvider* dpp)
+{
+ int ret;
+ map<std::string, RGWBucketEnt> m;
+
+ m[info.bucket.name] = ent;
+ ret = store->getRados()->update_containers_stats(m, dpp);
+ if (!ret)
+ return -EEXIST;
+ if (ret < 0)
+ return ret;
+
+ map<std::string, RGWBucketEnt>::iterator iter = m.find(info.bucket.name);
+ if (iter == m.end())
+ return -EINVAL;
+
+ ent.count = iter->second.count;
+ ent.size = iter->second.size;
+ ent.size_rounded = iter->second.size_rounded;
+ ent.creation_time = iter->second.creation_time;
+ ent.placement_rule = std::move(iter->second.placement_rule);
+
+ info.creation_time = ent.creation_time;
+ info.placement_rule = ent.placement_rule;
+
+ return 0;
+}
+
+int RadosBucket::check_bucket_shards(const DoutPrefixProvider* dpp)
+{
+ return store->getRados()->check_bucket_shards(info, info.bucket, get_count(), dpp);
+}
+
+int RadosBucket::link(const DoutPrefixProvider* dpp, User* new_user, optional_yield y, bool update_entrypoint, RGWObjVersionTracker* objv)
+{
+ RGWBucketEntryPoint ep;
+ ep.bucket = info.bucket;
+ ep.owner = new_user->get_id();
+ ep.creation_time = get_creation_time();
+ ep.linked = true;
+ Attrs ep_attrs;
+ rgw_ep_info ep_data{ep, ep_attrs};
+
+ int r = store->ctl()->bucket->link_bucket(new_user->get_id(), info.bucket,
+ get_creation_time(), y, dpp, update_entrypoint,
+ &ep_data);
+ if (r < 0)
+ return r;
+
+ if (objv)
+ *objv = ep_data.ep_objv;
+
+ return r;
+}
+
+int RadosBucket::unlink(const DoutPrefixProvider* dpp, User* new_user, optional_yield y, bool update_entrypoint)
+{
+ return store->ctl()->bucket->unlink_bucket(new_user->get_id(), info.bucket, y, dpp, update_entrypoint);
+}
+
+int RadosBucket::chown(const DoutPrefixProvider* dpp, User& new_user, optional_yield y)
+{
+ std::string obj_marker;
+ int r;
+
+ if (!owner) {
+ ldpp_dout(dpp, 0) << __func__ << " Cannot chown without an owner " << dendl;
+ return -EINVAL;
+ }
+
+ r = this->unlink(dpp, owner, y);
+ if (r < 0) {
+ return r;
+ }
+
+ return this->link(dpp, &new_user, y);
+}
+
+int RadosBucket::put_info(const DoutPrefixProvider* dpp, bool exclusive, ceph::real_time _mtime)
+{
+ mtime = _mtime;
+ return store->getRados()->put_bucket_instance_info(info, exclusive, mtime, &attrs, dpp, null_yield);
+}
+
+/* Make sure to call get_bucket_info() if you need it first */
+bool RadosBucket::is_owner(User* user)
+{
+ return (info.owner.compare(user->get_id()) == 0);
+}
+
+int RadosBucket::check_empty(const DoutPrefixProvider* dpp, optional_yield y)
+{
+ return store->getRados()->check_bucket_empty(dpp, info, y);
+}
+
+int RadosBucket::check_quota(const DoutPrefixProvider *dpp, RGWQuota& quota, uint64_t obj_size,
+ optional_yield y, bool check_size_only)
+{
+ return store->getRados()->check_quota(dpp, info.owner, get_key(),
+ quota, obj_size, y, check_size_only);
+}
+
+int RadosBucket::merge_and_store_attrs(const DoutPrefixProvider* dpp, Attrs& new_attrs, optional_yield y)
+{
+ for(auto& it : new_attrs) {
+ attrs[it.first] = it.second;
+ }
+ return store->ctl()->bucket->set_bucket_instance_attrs(get_info(),
+ new_attrs, &get_info().objv_tracker, y, dpp);
+}
+
+int RadosBucket::try_refresh_info(const DoutPrefixProvider* dpp, ceph::real_time* pmtime)
+{
+ return store->getRados()->try_refresh_bucket_info(info, pmtime, dpp, &attrs);
+}
+
+int RadosBucket::read_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch,
+ uint32_t max_entries, bool* is_truncated,
+ RGWUsageIter& usage_iter,
+ map<rgw_user_bucket, rgw_usage_log_entry>& usage)
+{
+ return store->getRados()->read_usage(dpp, owner->get_id(), get_name(), start_epoch,
+ end_epoch, max_entries, is_truncated,
+ usage_iter, usage);
+}
+
+int RadosBucket::trim_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch)
+{
+ return store->getRados()->trim_usage(dpp, owner->get_id(), get_name(), start_epoch, end_epoch);
+}
+
+int RadosBucket::remove_objs_from_index(const DoutPrefixProvider *dpp, std::list<rgw_obj_index_key>& objs_to_unlink)
+{
+ return store->getRados()->remove_objs_from_index(dpp, info, objs_to_unlink);
+}
+
+int RadosBucket::check_index(const DoutPrefixProvider *dpp, std::map<RGWObjCategory, RGWStorageStats>& existing_stats, std::map<RGWObjCategory, RGWStorageStats>& calculated_stats)
+{
+ return store->getRados()->bucket_check_index(dpp, info, &existing_stats, &calculated_stats);
+}
+
+int RadosBucket::rebuild_index(const DoutPrefixProvider *dpp)
+{
+ return store->getRados()->bucket_rebuild_index(dpp, info);
+}
+
+int RadosBucket::set_tag_timeout(const DoutPrefixProvider *dpp, uint64_t timeout)
+{
+ return store->getRados()->cls_obj_set_bucket_tag_timeout(dpp, info, timeout);
+}
+
+int RadosBucket::purge_instance(const DoutPrefixProvider* dpp)
+{
+ int max_shards = (info.layout.current_index.layout.normal.num_shards > 0 ? info.layout.current_index.layout.normal.num_shards : 1);
+ for (int i = 0; i < max_shards; i++) {
+ RGWRados::BucketShard bs(store->getRados());
+ int shard_id = (info.layout.current_index.layout.normal.num_shards > 0 ? i : -1);
+ int ret = bs.init(dpp, info, info.layout.current_index, shard_id);
+ if (ret < 0) {
+ cerr << "ERROR: bs.init(bucket=" << info.bucket << ", shard=" << shard_id
+ << "): " << cpp_strerror(-ret) << std::endl;
+ return ret;
+ }
+ ret = store->getRados()->bi_remove(dpp, bs);
+ if (ret < 0) {
+ cerr << "ERROR: failed to remove bucket index object: "
+ << cpp_strerror(-ret) << std::endl;
+ return ret;
+ }
+ }
+ return 0;
+}
+
+int RadosBucket::set_acl(const DoutPrefixProvider* dpp, RGWAccessControlPolicy &acl, optional_yield y)
+{
+ bufferlist aclbl;
+
+ acls = acl;
+ acl.encode(aclbl);
+ map<string, bufferlist>& attrs = get_attrs();
+
+ attrs[RGW_ATTR_ACL] = aclbl;
+ info.owner = acl.get_owner().get_id();
+
+ int r = store->ctl()->bucket->store_bucket_instance_info(info.bucket,
+ info, y, dpp,
+ RGWBucketCtl::BucketInstance::PutParams().set_attrs(&attrs));
+ if (r < 0) {
+ cerr << "ERROR: failed to set bucket owner: " << cpp_strerror(-r) << std::endl;
+ return r;
+ }
+
+ return 0;
+}
+
+std::unique_ptr<Object> RadosBucket::get_object(const rgw_obj_key& k)
+{
+ return std::make_unique<RadosObject>(this->store, k, this);
+}
+
+int RadosBucket::list(const DoutPrefixProvider* dpp, ListParams& params, int max, ListResults& results, optional_yield y)
+{
+ RGWRados::Bucket target(store->getRados(), get_info());
+ if (params.shard_id >= 0) {
+ target.set_shard_id(params.shard_id);
+ }
+ RGWRados::Bucket::List list_op(&target);
+
+ list_op.params.prefix = params.prefix;
+ list_op.params.delim = params.delim;
+ list_op.params.marker = params.marker;
+ list_op.params.ns = params.ns;
+ list_op.params.end_marker = params.end_marker;
+ list_op.params.enforce_ns = params.enforce_ns;
+ list_op.params.access_list_filter = params.access_list_filter;
+ list_op.params.force_check_filter = params.force_check_filter;
+ list_op.params.list_versions = params.list_versions;
+ list_op.params.allow_unordered = params.allow_unordered;
+
+ int ret = list_op.list_objects(dpp, max, &results.objs, &results.common_prefixes, &results.is_truncated, y);
+ if (ret >= 0) {
+ results.next_marker = list_op.get_next_marker();
+ params.marker = results.next_marker;
+ }
+
+ return ret;
+}
+
+std::unique_ptr<MultipartUpload> RadosBucket::get_multipart_upload(
+ const std::string& oid,
+ std::optional<std::string> upload_id,
+ ACLOwner owner, ceph::real_time mtime)
+{
+ return std::make_unique<RadosMultipartUpload>(this->store, this, oid, upload_id,
+ std::move(owner), mtime);
+}
+
+int RadosBucket::list_multiparts(const DoutPrefixProvider *dpp,
+ const string& prefix,
+ string& marker,
+ const string& delim,
+ const int& max_uploads,
+ vector<std::unique_ptr<MultipartUpload>>& uploads,
+ map<string, bool> *common_prefixes,
+ bool *is_truncated)
+{
+ rgw::sal::Bucket::ListParams params;
+ rgw::sal::Bucket::ListResults results;
+ MultipartMetaFilter mp_filter;
+
+ params.prefix = prefix;
+ params.delim = delim;
+ params.marker = marker;
+ params.ns = RGW_OBJ_NS_MULTIPART;
+ params.access_list_filter = &mp_filter;
+
+ int ret = list(dpp, params, max_uploads, results, null_yield);
+
+ if (ret < 0)
+ return ret;
+
+ if (!results.objs.empty()) {
+ for (const rgw_bucket_dir_entry& dentry : results.objs) {
+ rgw_obj_key key(dentry.key);
+ ACLOwner owner(rgw_user(dentry.meta.owner));
+ owner.set_name(dentry.meta.owner_display_name);
+ uploads.push_back(this->get_multipart_upload(key.name,
+ std::nullopt, std::move(owner), dentry.meta.mtime));
+ }
+ }
+ if (common_prefixes) {
+ *common_prefixes = std::move(results.common_prefixes);
+ }
+ *is_truncated = results.is_truncated;
+ marker = params.marker.name;
+
+ return 0;
+}
+
+int RadosBucket::abort_multiparts(const DoutPrefixProvider* dpp,
+ CephContext* cct)
+{
+ constexpr int max = 1000;
+ int ret, num_deleted = 0;
+ vector<std::unique_ptr<MultipartUpload>> uploads;
+ string marker;
+ bool is_truncated;
+
+ const std::string empty_delim;
+ const std::string empty_prefix;
+
+ do {
+ ret = list_multiparts(dpp, empty_prefix, marker, empty_delim,
+ max, uploads, nullptr, &is_truncated);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << __func__ <<
+ " ERROR : calling list_bucket_multiparts; ret=" << ret <<
+ "; bucket=\"" << this << "\"" << dendl;
+ return ret;
+ }
+ ldpp_dout(dpp, 20) << __func__ <<
+ " INFO: aborting and cleaning up multipart upload(s); bucket=\"" <<
+ this << "\"; uploads.size()=" << uploads.size() <<
+ "; is_truncated=" << is_truncated << dendl;
+
+ if (!uploads.empty()) {
+ for (const auto& upload : uploads) {
+ ret = upload->abort(dpp, cct);
+ if (ret < 0) {
+ // we're doing a best-effort; if something cannot be found,
+ // log it and keep moving forward
+ if (ret != -ENOENT && ret != -ERR_NO_SUCH_UPLOAD) {
+ ldpp_dout(dpp, 0) << __func__ <<
+ " ERROR : failed to abort and clean-up multipart upload \"" <<
+ upload->get_meta() << "\"" << dendl;
+ return ret;
+ } else {
+ ldpp_dout(dpp, 10) << __func__ <<
+ " NOTE : unable to find part(s) of "
+ "aborted multipart upload of \"" << upload->get_meta() <<
+ "\" for cleaning up" << dendl;
+ }
+ }
+ num_deleted++;
+ }
+ if (num_deleted) {
+ ldpp_dout(dpp, 0) << __func__ <<
+ " WARNING : aborted " << num_deleted <<
+ " incomplete multipart uploads" << dendl;
+ }
+ }
+ } while (is_truncated);
+
+ return 0;
+}
+
+std::string RadosBucket::topics_oid() const {
+ return pubsub_oid_prefix + get_tenant() + ".bucket." + get_name() + "/" + get_marker();
+}
+
+int RadosBucket::read_topics(rgw_pubsub_bucket_topics& notifications,
+ RGWObjVersionTracker* objv_tracker, optional_yield y, const DoutPrefixProvider *dpp)
+{
+ bufferlist bl;
+ const int ret = rgw_get_system_obj(store->svc()->sysobj,
+ store->svc()->zone->get_zone_params().log_pool,
+ topics_oid(),
+ bl,
+ objv_tracker,
+ nullptr, y, dpp, nullptr);
+ if (ret < 0) {
+ return ret;
+ }
+
+ auto iter = bl.cbegin();
+ try {
+ decode(notifications, iter);
+ } catch (buffer::error& err) {
+ ldpp_dout(dpp, 20) << " failed to decode bucket notifications from oid: " << topics_oid() << ". for bucket: "
+ << get_name() << ". error: " << err.what() << dendl;
+ return -EIO;
+ }
+
+ return 0;
+}
+
+int RadosBucket::write_topics(const rgw_pubsub_bucket_topics& notifications,
+ RGWObjVersionTracker* objv_tracker, optional_yield y, const DoutPrefixProvider *dpp) {
+ bufferlist bl;
+ encode(notifications, bl);
+
+ return rgw_put_system_obj(dpp, store->svc()->sysobj,
+ store->svc()->zone->get_zone_params().log_pool,
+ topics_oid(),
+ bl, false, objv_tracker, real_time(), y);
+}
+
+int RadosBucket::remove_topics(RGWObjVersionTracker* objv_tracker,
+ optional_yield y, const DoutPrefixProvider *dpp) {
+ return rgw_delete_system_obj(dpp, store->svc()->sysobj,
+ store->svc()->zone->get_zone_params().log_pool,
+ topics_oid(),
+ objv_tracker, y);
+}
+
+std::unique_ptr<User> RadosStore::get_user(const rgw_user &u)
+{
+ return std::make_unique<RadosUser>(this, u);
+}
+
+std::string RadosStore::get_cluster_id(const DoutPrefixProvider* dpp, optional_yield y)
+{
+ return getRados()->get_cluster_fsid(dpp, y);
+}
+
+int RadosStore::get_user_by_access_key(const DoutPrefixProvider* dpp, const std::string& key, optional_yield y, std::unique_ptr<User>* user)
+{
+ RGWUserInfo uinfo;
+ User* u;
+ RGWObjVersionTracker objv_tracker;
+
+ int r = ctl()->user->get_info_by_access_key(dpp, key, &uinfo, y, RGWUserCtl::GetParams().set_objv_tracker(&objv_tracker));
+ if (r < 0)
+ return r;
+
+ u = new RadosUser(this, uinfo);
+ if (!u)
+ return -ENOMEM;
+
+ u->get_version_tracker() = objv_tracker;
+
+ user->reset(u);
+ return 0;
+}
+
+int RadosStore::get_user_by_email(const DoutPrefixProvider* dpp, const std::string& email, optional_yield y, std::unique_ptr<User>* user)
+{
+ RGWUserInfo uinfo;
+ User* u;
+ RGWObjVersionTracker objv_tracker;
+
+ int r = ctl()->user->get_info_by_email(dpp, email, &uinfo, y, RGWUserCtl::GetParams().set_objv_tracker(&objv_tracker));
+ if (r < 0)
+ return r;
+
+ u = new RadosUser(this, uinfo);
+ if (!u)
+ return -ENOMEM;
+
+ u->get_version_tracker() = objv_tracker;
+
+ user->reset(u);
+ return 0;
+}
+
+int RadosStore::get_user_by_swift(const DoutPrefixProvider* dpp, const std::string& user_str, optional_yield y, std::unique_ptr<User>* user)
+{
+ RGWUserInfo uinfo;
+ User* u;
+ RGWObjVersionTracker objv_tracker;
+
+ int r = ctl()->user->get_info_by_swift(dpp, user_str, &uinfo, y, RGWUserCtl::GetParams().set_objv_tracker(&objv_tracker));
+ if (r < 0)
+ return r;
+
+ u = new RadosUser(this, uinfo);
+ if (!u)
+ return -ENOMEM;
+
+ u->get_version_tracker() = objv_tracker;
+
+ user->reset(u);
+ return 0;
+}
+
+std::unique_ptr<Object> RadosStore::get_object(const rgw_obj_key& k)
+{
+ return std::make_unique<RadosObject>(this, k);
+}
+
+int RadosStore::get_bucket(const DoutPrefixProvider* dpp, User* u, const rgw_bucket& b, std::unique_ptr<Bucket>* bucket, optional_yield y)
+{
+ int ret;
+ Bucket* bp;
+
+ bp = new RadosBucket(this, b, u);
+ ret = bp->load_bucket(dpp, y);
+ if (ret < 0) {
+ delete bp;
+ return ret;
+ }
+
+ bucket->reset(bp);
+ return 0;
+}
+
+int RadosStore::get_bucket(User* u, const RGWBucketInfo& i, std::unique_ptr<Bucket>* bucket)
+{
+ Bucket* bp;
+
+ bp = new RadosBucket(this, i, u);
+ /* Don't need to fetch the bucket info, use the provided one */
+
+ bucket->reset(bp);
+ return 0;
+}
+
+int RadosStore::get_bucket(const DoutPrefixProvider* dpp, User* u, const std::string& tenant, const std::string& name, std::unique_ptr<Bucket>* bucket, optional_yield y)
+{
+ rgw_bucket b;
+
+ b.tenant = tenant;
+ b.name = name;
+
+ return get_bucket(dpp, u, b, bucket, y);
+}
+
+bool RadosStore::is_meta_master()
+{
+ return svc()->zone->is_meta_master();
+}
+
+int RadosStore::forward_request_to_master(const DoutPrefixProvider *dpp, User* user, obj_version* objv,
+ bufferlist& in_data,
+ JSONParser* jp, req_info& info,
+ optional_yield y)
+{
+ if (is_meta_master()) {
+ /* We're master, don't forward */
+ return 0;
+ }
+
+ if (!svc()->zone->get_master_conn()) {
+ ldpp_dout(dpp, 0) << "rest connection is invalid" << dendl;
+ return -EINVAL;
+ }
+ ldpp_dout(dpp, 0) << "sending request to master zonegroup" << dendl;
+ bufferlist response;
+ std::string uid_str = user->get_id().to_str();
+#define MAX_REST_RESPONSE (128 * 1024) // we expect a very small response
+ int ret = svc()->zone->get_master_conn()->forward(dpp, rgw_user(uid_str), info,
+ objv, MAX_REST_RESPONSE,
+ &in_data, &response, y);
+ if (ret < 0)
+ return ret;
+
+ ldpp_dout(dpp, 20) << "response: " << response.c_str() << dendl;
+ if (jp && !jp->parse(response.c_str(), response.length())) {
+ ldpp_dout(dpp, 0) << "failed parsing response from master zonegroup" << dendl;
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+int RadosStore::forward_iam_request_to_master(const DoutPrefixProvider *dpp, const RGWAccessKey& key, obj_version* objv,
+ bufferlist& in_data,
+ RGWXMLDecoder::XMLParser* parser, req_info& info,
+ optional_yield y)
+{
+ if (is_meta_master()) {
+ /* We're master, don't forward */
+ return 0;
+ }
+
+ if (!svc()->zone->get_master_conn()) {
+ ldpp_dout(dpp, 0) << "rest connection is invalid" << dendl;
+ return -EINVAL;
+ }
+ ldpp_dout(dpp, 0) << "sending request to master zonegroup" << dendl;
+ bufferlist response;
+#define MAX_REST_RESPONSE (128 * 1024) // we expect a very small response
+ int ret = svc()->zone->get_master_conn()->forward_iam_request(dpp, key, info,
+ objv, MAX_REST_RESPONSE,
+ &in_data, &response, y);
+ if (ret < 0)
+ return ret;
+
+ ldpp_dout(dpp, 20) << "response: " << response.c_str() << dendl;
+
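+ // The forwarded response may carry HTML-escaped quotes (&quot;); unescape them
+ // so the XML parser below can accept the payload.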
+ std::string r = response.c_str();
+ std::string str_to_search = "&quot;";
+ std::string str_to_replace = "\"";
+ boost::replace_all(r, str_to_search, str_to_replace);
+ ldpp_dout(dpp, 20) << "r: " << r.c_str() << dendl;
+
+ if (parser && !parser->parse(r.c_str(), r.length(), 1)) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to parse response from master zonegroup" << dendl;
+ return -EIO;
+ }
+
+ return 0;
+}
+
+std::string RadosStore::zone_unique_id(uint64_t unique_num)
+{
+ return svc()->zone_utils->unique_id(unique_num);
+}
+
+std::string RadosStore::zone_unique_trans_id(const uint64_t unique_num)
+{
+ return svc()->zone_utils->unique_trans_id(unique_num);
+}
+
+int RadosStore::get_zonegroup(const std::string& id,
+ std::unique_ptr<ZoneGroup>* zonegroup)
+{
+ ZoneGroup* zg;
+ RGWZoneGroup rzg;
+ int r = svc()->zone->get_zonegroup(id, rzg);
+ if (r < 0)
+ return r;
+
+ zg = new RadosZoneGroup(this, rzg);
+ if (!zg)
+ return -ENOMEM;
+
+ zonegroup->reset(zg);
+ return 0;
+}
+
+int RadosStore::list_all_zones(const DoutPrefixProvider* dpp, std::list<std::string>& zone_ids)
+{
+ return svc()->zone->list_zones(dpp, zone_ids);
+}
+
+int RadosStore::cluster_stat(RGWClusterStat& stats)
+{
+ rados_cluster_stat_t rados_stats;
+ int ret;
+
+ ret = rados->get_rados_handle()->cluster_stat(rados_stats);
+ if (ret < 0)
+ return ret;
+
+ stats.kb = rados_stats.kb;
+ stats.kb_used = rados_stats.kb_used;
+ stats.kb_avail = rados_stats.kb_avail;
+ stats.num_objects = rados_stats.num_objects;
+
+ return ret;
+}
+
+std::unique_ptr<Lifecycle> RadosStore::get_lifecycle(void)
+{
+ return std::make_unique<RadosLifecycle>(this);
+}
+
+std::unique_ptr<Completions> RadosStore::get_completions(void)
+{
+ return std::make_unique<RadosCompletions>();
+}
+
+std::unique_ptr<Notification> RadosStore::get_notification(
+ rgw::sal::Object* obj, rgw::sal::Object* src_obj, req_state* s, rgw::notify::EventType event_type, optional_yield y, const std::string* object_name)
+{
+ return std::make_unique<RadosNotification>(s, this, obj, src_obj, s, event_type, y, object_name);
+}
+
+std::unique_ptr<Notification> RadosStore::get_notification(const DoutPrefixProvider* dpp, rgw::sal::Object* obj, rgw::sal::Object* src_obj, rgw::notify::EventType event_type, rgw::sal::Bucket* _bucket, std::string& _user_id, std::string& _user_tenant, std::string& _req_id, optional_yield y)
+{
+ return std::make_unique<RadosNotification>(dpp, this, obj, src_obj, event_type, _bucket, _user_id, _user_tenant, _req_id, y);
+}
+
+std::string RadosStore::topics_oid(const std::string& tenant) const {
+ return pubsub_oid_prefix + tenant;
+}
+
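+// Topics for a tenant are stored as a single encoded system object in the zone's log pool.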
+int RadosStore::read_topics(const std::string& tenant, rgw_pubsub_topics& topics, RGWObjVersionTracker* objv_tracker,
+ optional_yield y, const DoutPrefixProvider *dpp) {
+ bufferlist bl;
+ const int ret = rgw_get_system_obj(svc()->sysobj,
+ svc()->zone->get_zone_params().log_pool,
+ topics_oid(tenant),
+ bl,
+ objv_tracker,
+ nullptr, y, dpp, nullptr);
+ if (ret < 0) {
+ return ret;
+ }
+
+ auto iter = bl.cbegin();
+ try {
+ decode(topics, iter);
+ } catch (buffer::error& err) {
+ ldpp_dout(dpp, 20) << " failed to decode topics from oid: " << topics_oid(tenant) <<
+ ". error: " << err.what() << dendl;
+ return -EIO;
+ }
+
+ return 0;
+}
+
+int RadosStore::write_topics(const std::string& tenant, const rgw_pubsub_topics& topics, RGWObjVersionTracker* objv_tracker,
+ optional_yield y, const DoutPrefixProvider *dpp) {
+ bufferlist bl;
+ encode(topics, bl);
+
+ return rgw_put_system_obj(dpp, svc()->sysobj,
+ svc()->zone->get_zone_params().log_pool,
+ topics_oid(tenant),
+ bl, false, objv_tracker, real_time(), y);
+}
+
+int RadosStore::remove_topics(const std::string& tenant, RGWObjVersionTracker* objv_tracker,
+ optional_yield y, const DoutPrefixProvider *dpp) {
+ return rgw_delete_system_obj(dpp, svc()->sysobj,
+ svc()->zone->get_zone_params().log_pool,
+ topics_oid(tenant),
+ objv_tracker, y);
+}
+
+int RadosStore::delete_raw_obj(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj)
+{
+ return rados->delete_raw_obj(dpp, obj);
+}
+
+int RadosStore::delete_raw_obj_aio(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, Completions* aio)
+{
+ RadosCompletions* raio = static_cast<RadosCompletions*>(aio);
+
+ return rados->delete_raw_obj_aio(dpp, obj, raio->handles);
+}
+
+void RadosStore::get_raw_obj(const rgw_placement_rule& placement_rule, const rgw_obj& obj, rgw_raw_obj* raw_obj)
+{
+ rados->obj_to_raw(placement_rule, obj, raw_obj);
+}
+
+int RadosStore::get_raw_chunk_size(const DoutPrefixProvider* dpp, const rgw_raw_obj& obj, uint64_t* chunk_size)
+{
+ return rados->get_max_chunk_size(obj.pool, chunk_size, dpp);
+}
+
+int RadosStore::initialize(CephContext *cct, const DoutPrefixProvider *dpp)
+{
+ std::unique_ptr<ZoneGroup> zg =
+ std::make_unique<RadosZoneGroup>(this, svc()->zone->get_zonegroup());
+ zone = make_unique<RadosZone>(this, std::move(zg));
+ return 0;
+}
+
+int RadosStore::log_usage(const DoutPrefixProvider *dpp, map<rgw_user_bucket, RGWUsageBatch>& usage_info)
+{
+ return rados->log_usage(dpp, usage_info);
+}
+
+int RadosStore::log_op(const DoutPrefixProvider *dpp, std::string& oid, bufferlist& bl)
+{
+ rgw_raw_obj obj(svc()->zone->get_zone_params().log_pool, oid);
+
+ int ret = rados->append_async(dpp, obj, bl.length(), bl);
+ if (ret == -ENOENT) {
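+ // the log pool may not exist yet; create it and retry the append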
+ ret = rados->create_pool(dpp, svc()->zone->get_zone_params().log_pool);
+ if (ret < 0)
+ return ret;
+ // retry
+ ret = rados->append_async(dpp, obj, bl.length(), bl);
+ }
+
+ return ret;
+}
+
+int RadosStore::register_to_service_map(const DoutPrefixProvider *dpp, const std::string& daemon_type,
+ const map<std::string, std::string>& meta)
+{
+ return rados->register_to_service_map(dpp, daemon_type, meta);
+}
+
+void RadosStore::get_quota(RGWQuota& quota)
+{
+ quota.bucket_quota = svc()->quota->get_bucket_quota();
+ quota.user_quota = svc()->quota->get_user_quota();
+}
+
+void RadosStore::get_ratelimit(RGWRateLimitInfo& bucket_ratelimit, RGWRateLimitInfo& user_ratelimit, RGWRateLimitInfo& anon_ratelimit)
+{
+ bucket_ratelimit = svc()->zone->get_current_period().get_config().bucket_ratelimit;
+ user_ratelimit = svc()->zone->get_current_period().get_config().user_ratelimit;
+ anon_ratelimit = svc()->zone->get_current_period().get_config().anon_ratelimit;
+}
+
+int RadosStore::set_buckets_enabled(const DoutPrefixProvider* dpp, vector<rgw_bucket>& buckets, bool enabled)
+{
+ return rados->set_buckets_enabled(buckets, enabled, dpp);
+}
+
+int RadosStore::get_sync_policy_handler(const DoutPrefixProvider* dpp,
+ std::optional<rgw_zone_id> zone,
+ std::optional<rgw_bucket> bucket,
+ RGWBucketSyncPolicyHandlerRef* phandler,
+ optional_yield y)
+{
+ return ctl()->bucket->get_sync_policy_handler(zone, bucket, phandler, y, dpp);
+}
+
+RGWDataSyncStatusManager* RadosStore::get_data_sync_manager(const rgw_zone_id& source_zone)
+{
+ return rados->get_data_sync_manager(source_zone);
+}
+
+int RadosStore::read_all_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch,
+ uint32_t max_entries, bool* is_truncated,
+ RGWUsageIter& usage_iter,
+ map<rgw_user_bucket, rgw_usage_log_entry>& usage)
+{
+ rgw_user uid;
+ std::string bucket_name;
+
+ return rados->read_usage(dpp, uid, bucket_name, start_epoch, end_epoch, max_entries,
+ is_truncated, usage_iter, usage);
+}
+
+int RadosStore::trim_all_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch)
+{
+ rgw_user uid;
+ std::string bucket_name;
+
+ return rados->trim_usage(dpp, uid, bucket_name, start_epoch, end_epoch);
+}
+
+int RadosStore::get_config_key_val(std::string name, bufferlist* bl)
+{
+ return svc()->config_key->get(name, true, bl);
+}
+
+int RadosStore::meta_list_keys_init(const DoutPrefixProvider *dpp, const std::string& section, const std::string& marker, void** phandle)
+{
+ return ctl()->meta.mgr->list_keys_init(dpp, section, marker, phandle);
+}
+
+int RadosStore::meta_list_keys_next(const DoutPrefixProvider *dpp, void* handle, int max, list<std::string>& keys, bool* truncated)
+{
+ return ctl()->meta.mgr->list_keys_next(dpp, handle, max, keys, truncated);
+}
+
+void RadosStore::meta_list_keys_complete(void* handle)
+{
+ ctl()->meta.mgr->list_keys_complete(handle);
+}
+
+std::string RadosStore::meta_get_marker(void* handle)
+{
+ return ctl()->meta.mgr->get_marker(handle);
+}
+
+int RadosStore::meta_remove(const DoutPrefixProvider* dpp, std::string& metadata_key, optional_yield y)
+{
+ return ctl()->meta.mgr->remove(metadata_key, y, dpp);
+}
+
+void RadosStore::finalize(void)
+{
+ if (rados)
+ rados->finalize();
+}
+
+void RadosStore::register_admin_apis(RGWRESTMgr* mgr)
+{
+ mgr->register_resource("user", new RGWRESTMgr_User);
+ mgr->register_resource("bucket", new RGWRESTMgr_Bucket);
+ /* Registering resource for /admin/metadata */
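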
+ mgr->register_resource("metadata", new RGWRESTMgr_Metadata);
+ mgr->register_resource("log", new RGWRESTMgr_Log);
+ /* XXX These may become global when cbodley is done with his zone work */
+ mgr->register_resource("config", new RGWRESTMgr_Config);
+ mgr->register_resource("realm", new RGWRESTMgr_Realm);
+ mgr->register_resource("ratelimit", new RGWRESTMgr_Ratelimit);
+}
+
+std::unique_ptr<LuaManager> RadosStore::get_lua_manager()
+{
+ return std::make_unique<RadosLuaManager>(this);
+}
+
+std::unique_ptr<RGWRole> RadosStore::get_role(std::string name,
+ std::string tenant,
+ std::string path,
+ std::string trust_policy,
+ std::string max_session_duration_str,
+ std::multimap<std::string,std::string> tags)
+{
+ return std::make_unique<RadosRole>(this, name, tenant, path, trust_policy, max_session_duration_str, tags);
+}
+
+std::unique_ptr<RGWRole> RadosStore::get_role(std::string id)
+{
+ return std::make_unique<RadosRole>(this, id);
+}
+
+std::unique_ptr<RGWRole> RadosStore::get_role(const RGWRoleInfo& info)
+{
+ return std::make_unique<RadosRole>(this, info);
+}
+
+int RadosStore::get_roles(const DoutPrefixProvider *dpp,
+ optional_yield y,
+ const std::string& path_prefix,
+ const std::string& tenant,
+ vector<std::unique_ptr<RGWRole>>& roles)
+{
+ auto pool = svc()->zone->get_zone_params().roles_pool;
+ std::string prefix;
+
+ // If a path prefix was given, filter on it; otherwise list all roles for the tenant
+ if (! path_prefix.empty()) {
+ prefix = tenant + RGWRole::role_path_oid_prefix + path_prefix;
+ } else {
+ prefix = tenant + RGWRole::role_path_oid_prefix;
+ }
+
+ //Get the filtered objects
+ list<std::string> result;
+ bool is_truncated;
+ RGWListRawObjsCtx ctx;
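+ // Page through the raw objects in the roles pool, 1000 at a time, collecting the oids that match the prefix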
+ do {
+ list<std::string> oids;
+ int r = rados->list_raw_objects(dpp, pool, prefix, 1000, ctx, oids, &is_truncated);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: listing filtered objects failed: "
+ << prefix << ": " << cpp_strerror(-r) << dendl;
+ return r;
+ }
+ for (const auto& iter : oids) {
+ result.push_back(iter.substr(RGWRole::role_path_oid_prefix.size()));
+ }
+ } while (is_truncated);
+
+ for (const auto& it : result) {
+ //Find the role oid prefix from the end
+ size_t pos = it.rfind(RGWRole::role_oid_prefix);
+ if (pos == std::string::npos) {
+ continue;
+ }
+ // Split the result into path and info_oid + id
+ std::string path = it.substr(0, pos);
+
+ /* Make sure that the prefix is part of the path (false positives could have been
+ returned because the role info oid + id is appended to the path) */
+ if(path_prefix.empty() || path.find(path_prefix) != std::string::npos) {
+ //Get id from info oid prefix + id
+ std::string id = it.substr(pos + RGWRole::role_oid_prefix.length());
+
+ std::unique_ptr<rgw::sal::RGWRole> role = get_role(id);
+ int ret = role->read_info(dpp, y);
+ if (ret < 0) {
+ return ret;
+ }
+ roles.push_back(std::move(role));
+ }
+ }
+
+ return 0;
+}
+
+std::unique_ptr<RGWOIDCProvider> RadosStore::get_oidc_provider()
+{
+ return std::make_unique<RadosOIDCProvider>(this);
+}
+
+int RadosStore::get_oidc_providers(const DoutPrefixProvider *dpp,
+ const std::string& tenant,
+ vector<std::unique_ptr<RGWOIDCProvider>>& providers)
+{
+ std::string prefix = tenant + RGWOIDCProvider::oidc_url_oid_prefix;
+ auto pool = svc()->zone->get_zone_params().oidc_pool;
+
+ //Get the filtered objects
+ list<std::string> result;
+ bool is_truncated;
+ RGWListRawObjsCtx ctx;
+ do {
+ list<std::string> oids;
+ int r = rados->list_raw_objects(dpp, pool, prefix, 1000, ctx, oids, &is_truncated);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: listing filtered objects failed: OIDC pool: "
+ << pool.name << ": " << prefix << ": " << cpp_strerror(-r) << dendl;
+ return r;
+ }
+ for (const auto& iter : oids) {
+ std::unique_ptr<rgw::sal::RGWOIDCProvider> provider = get_oidc_provider();
+ bufferlist bl;
+
+ r = rgw_get_system_obj(svc()->sysobj, pool, iter, bl, nullptr, nullptr, null_yield, dpp);
+ if (r < 0) {
+ return r;
+ }
+
+ try {
+ using ceph::decode;
+ auto iter = bl.cbegin();
+ decode(*provider, iter);
+ } catch (buffer::error& err) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to decode oidc provider info from pool: "
+ << pool.name << ": " << iter << dendl;
+ return -EIO;
+ }
+
+ providers.push_back(std::move(provider));
+ }
+ } while (is_truncated);
+
+ return 0;
+}
+
+std::unique_ptr<Writer> RadosStore::get_append_writer(const DoutPrefixProvider *dpp,
+ optional_yield y,
+ rgw::sal::Object* obj,
+ const rgw_user& owner,
+ const rgw_placement_rule *ptail_placement_rule,
+ const std::string& unique_tag,
+ uint64_t position,
+ uint64_t *cur_accounted_size)
+{
+ RGWBucketInfo& bucket_info = obj->get_bucket()->get_info();
+ RGWObjectCtx& obj_ctx = static_cast<RadosObject*>(obj)->get_ctx();
+ auto aio = rgw::make_throttle(ctx()->_conf->rgw_put_obj_min_window_size, y);
+ return std::make_unique<RadosAppendWriter>(dpp, y,
+ bucket_info, obj_ctx, obj->get_obj(),
+ this, std::move(aio), owner,
+ ptail_placement_rule,
+ unique_tag, position,
+ cur_accounted_size);
+}
+
+std::unique_ptr<Writer> RadosStore::get_atomic_writer(const DoutPrefixProvider *dpp,
+ optional_yield y,
+ rgw::sal::Object* obj,
+ const rgw_user& owner,
+ const rgw_placement_rule *ptail_placement_rule,
+ uint64_t olh_epoch,
+ const std::string& unique_tag)
+{
+ RGWBucketInfo& bucket_info = obj->get_bucket()->get_info();
+ RGWObjectCtx& obj_ctx = static_cast<RadosObject*>(obj)->get_ctx();
+ auto aio = rgw::make_throttle(ctx()->_conf->rgw_put_obj_min_window_size, y);
+ return std::make_unique<RadosAtomicWriter>(dpp, y,
+ bucket_info, obj_ctx, obj->get_obj(),
+ this, std::move(aio), owner,
+ ptail_placement_rule,
+ olh_epoch, unique_tag);
+}
+
+const std::string& RadosStore::get_compression_type(const rgw_placement_rule& rule)
+{
+ return svc()->zone->get_zone_params().get_compression_type(rule);
+}
+
+bool RadosStore::valid_placement(const rgw_placement_rule& rule)
+{
+ return svc()->zone->get_zone_params().valid_placement(rule);
+}
+
+int RadosStore::get_obj_head_ioctx(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, librados::IoCtx* ioctx)
+{
+ return rados->get_obj_head_ioctx(dpp, bucket_info, obj, ioctx);
+}
+
+RadosObject::~RadosObject()
+{
+ if (rados_ctx_owned)
+ delete rados_ctx;
+}
+
+int RadosObject::get_obj_state(const DoutPrefixProvider* dpp, RGWObjState **pstate, optional_yield y, bool follow_olh)
+{
+ int ret = store->getRados()->get_obj_state(dpp, rados_ctx, bucket->get_info(), get_obj(), pstate, &manifest, follow_olh, y);
+ if (ret < 0) {
+ return ret;
+ }
+
+ /* Don't overwrite obj, atomic, or prefetch */
+ rgw_obj obj = get_obj();
+ bool is_atomic = state.is_atomic;
+ bool prefetch_data = state.prefetch_data;
+
+ state = **pstate;
+
+ state.obj = obj;
+ state.is_atomic = is_atomic;
+ state.prefetch_data = prefetch_data;
+ return ret;
+}
+
+int RadosObject::read_attrs(const DoutPrefixProvider* dpp, RGWRados::Object::Read &read_op, optional_yield y, rgw_obj* target_obj)
+{
+ read_op.params.attrs = &state.attrset;
+ read_op.params.target_obj = target_obj;
+ read_op.params.obj_size = &state.size;
+ read_op.params.lastmod = &state.mtime;
+
+ return read_op.prepare(y, dpp);
+}
+
+int RadosObject::set_obj_attrs(const DoutPrefixProvider* dpp, Attrs* setattrs, Attrs* delattrs, optional_yield y)
+{
+ Attrs empty;
+ return store->getRados()->set_attrs(dpp, rados_ctx,
+ bucket->get_info(),
+ get_obj(),
+ setattrs ? *setattrs : empty,
+ delattrs ? delattrs : nullptr,
+ y);
+}
+
+int RadosObject::get_obj_attrs(optional_yield y, const DoutPrefixProvider* dpp, rgw_obj* target_obj)
+{
+ RGWRados::Object op_target(store->getRados(), bucket->get_info(), *rados_ctx, get_obj());
+ RGWRados::Object::Read read_op(&op_target);
+
+ return read_attrs(dpp, read_op, y, target_obj);
+}
+
+int RadosObject::modify_obj_attrs(const char* attr_name, bufferlist& attr_val, optional_yield y, const DoutPrefixProvider* dpp)
+{
+ rgw_obj target = get_obj();
+ rgw_obj save = get_obj();
+ int r = get_obj_attrs(y, dpp, &target);
+ if (r < 0) {
+ return r;
+ }
+
+ /* Temporarily set target */
+ state.obj = target;
+ set_atomic();
+ state.attrset[attr_name] = attr_val;
+ r = set_obj_attrs(dpp, &state.attrset, nullptr, y);
+ /* Restore target */
+ state.obj = save;
+
+ return r;
+}
+
+int RadosObject::delete_obj_attrs(const DoutPrefixProvider* dpp, const char* attr_name, optional_yield y)
+{
+ Attrs rmattr;
+ bufferlist bl;
+
+ set_atomic();
+ rmattr[attr_name] = bl;
+ return set_obj_attrs(dpp, nullptr, &rmattr, y);
+}
+
+bool RadosObject::is_expired() {
+ auto iter = state.attrset.find(RGW_ATTR_DELETE_AT);
+ if (iter == state.attrset.end()) {
+ return false;
+ }
+ utime_t delete_at;
+ try {
+ auto bufit = iter->second.cbegin();
+ decode(delete_at, bufit);
+ } catch (buffer::error& err) {
+ ldout(store->ctx(), 0) << "ERROR: " << __func__ << ": failed to decode " RGW_ATTR_DELETE_AT " attr" << dendl;
+ return false;
+ }
+
+ return delete_at <= ceph_clock_now() && !delete_at.is_zero();
+}
+
+void RadosObject::gen_rand_obj_instance_name()
+{
+ store->getRados()->gen_rand_obj_instance_name(&state.obj.key);
+}
+
+void RadosObject::raw_obj_to_obj(const rgw_raw_obj& raw_obj)
+{
+ rgw_obj tobj = get_obj();
+ RGWSI_Tier_RADOS::raw_obj_to_obj(get_bucket()->get_key(), raw_obj, &tobj);
+ set_key(tobj.key);
+}
+
+void RadosObject::get_raw_obj(rgw_raw_obj* raw_obj)
+{
+ store->getRados()->obj_to_raw((bucket->get_info()).placement_rule, get_obj(), raw_obj);
+}
+
+int RadosObject::omap_get_vals(const DoutPrefixProvider *dpp, const std::string& marker, uint64_t count,
+ std::map<std::string, bufferlist> *m,
+ bool* pmore, optional_yield y)
+{
+ rgw_raw_obj raw_obj;
+ get_raw_obj(&raw_obj);
+ auto sysobj = store->svc()->sysobj->get_obj(raw_obj);
+
+ return sysobj.omap().get_vals(dpp, marker, count, m, pmore, y);
+}
+
+int RadosObject::omap_get_all(const DoutPrefixProvider *dpp, std::map<std::string, bufferlist> *m,
+ optional_yield y)
+{
+ rgw_raw_obj raw_obj;
+ get_raw_obj(&raw_obj);
+ auto sysobj = store->svc()->sysobj->get_obj(raw_obj);
+
+ return sysobj.omap().get_all(dpp, m, y);
+}
+
+int RadosObject::omap_get_vals_by_keys(const DoutPrefixProvider *dpp, const std::string& oid,
+ const std::set<std::string>& keys,
+ Attrs* vals)
+{
+ int ret;
+ rgw_raw_obj head_obj;
+ librados::IoCtx cur_ioctx;
+ rgw_obj obj = get_obj();
+
+ store->getRados()->obj_to_raw(bucket->get_placement_rule(), obj, &head_obj);
+ ret = store->get_obj_head_ioctx(dpp, bucket->get_info(), obj, &cur_ioctx);
+ if (ret < 0) {
+ return ret;
+ }
+
+ return cur_ioctx.omap_get_vals_by_keys(oid, keys, vals);
+}
+
+int RadosObject::omap_set_val_by_key(const DoutPrefixProvider *dpp, const std::string& key, bufferlist& val,
+ bool must_exist, optional_yield y)
+{
+ rgw_raw_obj raw_meta_obj;
+ rgw_obj obj = get_obj();
+
+ store->getRados()->obj_to_raw(bucket->get_placement_rule(), obj, &raw_meta_obj);
+
+ auto sysobj = store->svc()->sysobj->get_obj(raw_meta_obj);
+
+ return sysobj.omap().set_must_exist(must_exist).set(dpp, key, val, y);
+}
+
+int RadosObject::chown(User& new_user, const DoutPrefixProvider* dpp, optional_yield y)
+{
+ int r = get_obj_attrs(y, dpp);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to read object attrs " << get_name() << cpp_strerror(-r) << dendl;
+ return r;
+ }
+
+ const auto& aiter = get_attrs().find(RGW_ATTR_ACL);
+ if (aiter == get_attrs().end()) {
+ ldpp_dout(dpp, 0) << "ERROR: no acls found for object " << get_name() << dendl;
+ return -EINVAL;
+ }
+
+ bufferlist& bl = aiter->second;
+ RGWAccessControlPolicy policy(store->ctx());
+ ACLOwner owner;
+ auto bliter = bl.cbegin();
+ try {
+ policy.decode(bliter);
+ owner = policy.get_owner();
+ } catch (buffer::error& err) {
+ ldpp_dout(dpp, 0) << "ERROR: decode policy failed" << err.what()
+ << dendl;
+ return -EIO;
+ }
+
+ //Get the ACL from the policy
+ RGWAccessControlList& acl = policy.get_acl();
+
+ //Remove grant that is set to old owner
+ acl.remove_canon_user_grant(owner.get_id());
+
+ //Create a grant and add grant
+ ACLGrant grant;
+ grant.set_canon(new_user.get_id(), new_user.get_display_name(), RGW_PERM_FULL_CONTROL);
+ acl.add_grant(&grant);
+
+ //Update the ACL owner to the new user
+ owner.set_id(new_user.get_id());
+ owner.set_name(new_user.get_display_name());
+ policy.set_owner(owner);
+
+ bl.clear();
+ encode(policy, bl);
+
+ set_atomic();
+ map<string, bufferlist> attrs;
+ attrs[RGW_ATTR_ACL] = bl;
+ r = set_obj_attrs(dpp, &attrs, nullptr, y);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: modify attr failed " << cpp_strerror(-r) << dendl;
+ return r;
+ }
+
+ return 0;
+}
+
+std::unique_ptr<MPSerializer> RadosObject::get_serializer(const DoutPrefixProvider *dpp, const std::string& lock_name)
+{
+ return std::make_unique<MPRadosSerializer>(dpp, store, this, lock_name);
+}
+
+int RadosObject::transition(Bucket* bucket,
+ const rgw_placement_rule& placement_rule,
+ const real_time& mtime,
+ uint64_t olh_epoch,
+ const DoutPrefixProvider* dpp,
+ optional_yield y)
+{
+ return store->getRados()->transition_obj(*rados_ctx, bucket->get_info(), get_obj(), placement_rule, mtime, olh_epoch, dpp, y);
+}
+
+int RadosObject::transition_to_cloud(Bucket* bucket,
+ rgw::sal::PlacementTier* tier,
+ rgw_bucket_dir_entry& o,
+ std::set<std::string>& cloud_targets,
+ CephContext* cct,
+ bool update_object,
+ const DoutPrefixProvider* dpp,
+ optional_yield y)
+{
+ /* init */
+ rgw::sal::RadosPlacementTier* rtier = static_cast<rgw::sal::RadosPlacementTier*>(tier);
+ string id = "cloudid";
+ string endpoint = rtier->get_rt().t.s3.endpoint;
+ RGWAccessKey key = rtier->get_rt().t.s3.key;
+ string region = rtier->get_rt().t.s3.region;
+ HostStyle host_style = rtier->get_rt().t.s3.host_style;
+ string bucket_name = rtier->get_rt().t.s3.target_path;
+ const rgw::sal::ZoneGroup& zonegroup = store->get_zone()->get_zonegroup();
+
+ if (bucket_name.empty()) {
+ bucket_name = "rgwx-" + zonegroup.get_name() + "-" + tier->get_storage_class() +
+ "-cloud-bucket";
+ boost::algorithm::to_lower(bucket_name);
+ }
+
+ /* Create RGW REST connection */
+ S3RESTConn conn(cct, id, { endpoint }, key, zonegroup.get_id(), region, host_style);
+
+ RGWLCCloudTierCtx tier_ctx(cct, dpp, o, store, bucket->get_info(),
+ this, conn, bucket_name,
+ rtier->get_rt().t.s3.target_storage_class);
+ tier_ctx.acl_mappings = rtier->get_rt().t.s3.acl_mappings;
+ tier_ctx.multipart_min_part_size = rtier->get_rt().t.s3.multipart_min_part_size;
+ tier_ctx.multipart_sync_threshold = rtier->get_rt().t.s3.multipart_sync_threshold;
+ tier_ctx.storage_class = tier->get_storage_class();
+
+ ldpp_dout(dpp, 0) << "Transitioning object(" << o.key << ") to the cloud endpoint(" << endpoint << ")" << dendl;
+
+ /* Transition object to cloud end point */
+ int ret = rgw_cloud_tier_transfer_object(tier_ctx, cloud_targets);
+
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to transfer object(" << o.key << ") to the cloud endpoint(" << endpoint << ") ret=" << ret << dendl;
+ return ret;
+ }
+
+ if (update_object) {
+ real_time read_mtime;
+
+ std::unique_ptr<rgw::sal::Object::ReadOp> read_op(get_read_op());
+ read_op->params.lastmod = &read_mtime;
+
+ ret = read_op->prepare(y, dpp);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: Updating tier object(" << o.key << ") failed ret=" << ret << dendl;
+ return ret;
+ }
+
+ if (read_mtime != tier_ctx.o.meta.mtime) {
+ /* raced */
+ ldpp_dout(dpp, 0) << "ERROR: Updating tier object(" << o.key << ") failed ret=" << -ECANCELED << dendl;
+ return -ECANCELED;
+ }
+
+ rgw_placement_rule target_placement;
+ target_placement.inherit_from(tier_ctx.bucket_info.placement_rule);
+ target_placement.storage_class = tier->get_storage_class();
+
+ ret = write_cloud_tier(dpp, y, tier_ctx.o.versioned_epoch,
+ tier, tier_ctx.is_multipart_upload,
+ target_placement, tier_ctx.obj);
+
+ }
+
+ return ret;
+}
+
+int RadosObject::write_cloud_tier(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ uint64_t olh_epoch,
+ PlacementTier* tier,
+ bool is_multipart_upload,
+ rgw_placement_rule& target_placement,
+ Object* head_obj)
+{
+ rgw::sal::RadosPlacementTier* rtier = static_cast<rgw::sal::RadosPlacementTier*>(tier);
+ map<string, bufferlist> attrs = get_attrs();
+ RGWRados::Object op_target(store->getRados(), bucket->get_info(), *rados_ctx, get_obj());
+ RGWRados::Object::Write obj_op(&op_target);
+
+ obj_op.meta.modify_tail = true;
+ obj_op.meta.flags = PUT_OBJ_CREATE;
+ obj_op.meta.category = RGWObjCategory::CloudTiered;
+ obj_op.meta.delete_at = real_time();
+ bufferlist blo;
+ obj_op.meta.data = &blo;
+ obj_op.meta.if_match = NULL;
+ obj_op.meta.if_nomatch = NULL;
+ obj_op.meta.user_data = NULL;
+ obj_op.meta.zones_trace = NULL;
+ obj_op.meta.delete_at = real_time();
+ obj_op.meta.olh_epoch = olh_epoch;
+
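+ // Build a zero-size manifest that records the cloud tier config; the head object
+ // written below is effectively a stub referencing the remote (cloud-tiered) copy.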
+ RGWObjManifest *pmanifest;
+ RGWObjManifest manifest;
+
+ pmanifest = &manifest;
+ RGWObjTier tier_config;
+ tier_config.name = tier->get_storage_class();
+ tier_config.tier_placement = rtier->get_rt();
+ tier_config.is_multipart_upload = is_multipart_upload;
+
+ pmanifest->set_tier_type("cloud-s3");
+ pmanifest->set_tier_config(tier_config);
+
+ /* check if it's necessary */
+ pmanifest->set_head(target_placement, head_obj->get_obj(), 0);
+ pmanifest->set_tail_placement(target_placement, head_obj->get_obj().bucket);
+ pmanifest->set_obj_size(0);
+ obj_op.meta.manifest = pmanifest;
+
+ /* update storage class */
+ bufferlist bl;
+ bl.append(tier->get_storage_class());
+ attrs[RGW_ATTR_STORAGE_CLASS] = bl;
+
+ attrs.erase(RGW_ATTR_ID_TAG);
+ attrs.erase(RGW_ATTR_TAIL_TAG);
+
+ return obj_op.write_meta(dpp, 0, 0, attrs, y);
+}
+
+int RadosObject::get_max_chunk_size(const DoutPrefixProvider* dpp, rgw_placement_rule placement_rule, uint64_t* max_chunk_size, uint64_t* alignment)
+{
+ return store->getRados()->get_max_chunk_size(placement_rule, get_obj(), max_chunk_size, dpp, alignment);
+}
+
+void RadosObject::get_max_aligned_size(uint64_t size, uint64_t alignment,
+ uint64_t* max_size)
+{
+ store->getRados()->get_max_aligned_size(size, alignment, max_size);
+}
+
+bool RadosObject::placement_rules_match(rgw_placement_rule& r1, rgw_placement_rule& r2)
+{
+ rgw_obj obj;
+ rgw_pool p1, p2;
+
+ obj = get_obj();
+
+ if (r1 == r2)
+ return true;
+
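+ // Different rules still match if they resolve to the same data pool.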
+ if (!store->getRados()->get_obj_data_pool(r1, obj, &p1)) {
+ return false;
+ }
+ if (!store->getRados()->get_obj_data_pool(r2, obj, &p2)) {
+ return false;
+ }
+
+ return p1 == p2;
+}
+
+int RadosObject::dump_obj_layout(const DoutPrefixProvider *dpp, optional_yield y, Formatter* f)
+{
+ int ret;
+ RGWObjManifest *amanifest{nullptr};
+ rgw_raw_obj head_obj;
+
+ RGWRados::Object op_target(store->getRados(), bucket->get_info(), *rados_ctx, get_obj());
+ RGWRados::Object::Read parent_op(&op_target);
+ uint64_t obj_size;
+
+ parent_op.params.obj_size = &obj_size;
+ parent_op.params.attrs = &get_attrs();
+
+ ret = parent_op.prepare(y, dpp);
+ if (ret < 0) {
+ return ret;
+ }
+
+ head_obj = parent_op.state.head_obj;
+
+ ret = op_target.get_manifest(dpp, &amanifest, y);
+ if (ret < 0) {
+ return ret;
+ }
+
+ ::encode_json("head", head_obj, f);
+ ::encode_json("manifest", *amanifest, f);
+ f->open_array_section("data_location");
+ for (auto miter = amanifest->obj_begin(dpp); miter != amanifest->obj_end(dpp); ++miter) {
+ f->open_object_section("obj");
+ rgw_raw_obj raw_loc = miter.get_location().get_raw_obj(store->getRados());
+ uint64_t ofs = miter.get_ofs();
+ uint64_t left = amanifest->get_obj_size() - ofs;
+ ::encode_json("ofs", miter.get_ofs(), f);
+ ::encode_json("loc", raw_loc, f);
+ ::encode_json("loc_ofs", miter.location_ofs(), f);
+ uint64_t loc_size = miter.get_stripe_size();
+ if (loc_size > left) {
+ loc_size = left;
+ }
+ ::encode_json("loc_size", loc_size, f);
+ f->close_section();
+ }
+ f->close_section();
+
+ return 0;
+}
+
+std::unique_ptr<Object::ReadOp> RadosObject::get_read_op()
+{
+ return std::make_unique<RadosObject::RadosReadOp>(this, rados_ctx);
+}
+
+RadosObject::RadosReadOp::RadosReadOp(RadosObject *_source, RGWObjectCtx *_rctx) :
+ source(_source),
+ rctx(_rctx),
+ op_target(_source->store->getRados(),
+ _source->get_bucket()->get_info(),
+ *static_cast<RGWObjectCtx *>(rctx),
+ _source->get_obj()),
+ parent_op(&op_target)
+{ }
+
+int RadosObject::RadosReadOp::prepare(optional_yield y, const DoutPrefixProvider* dpp)
+{
+ uint64_t obj_size;
+
+ parent_op.conds.mod_ptr = params.mod_ptr;
+ parent_op.conds.unmod_ptr = params.unmod_ptr;
+ parent_op.conds.high_precision_time = params.high_precision_time;
+ parent_op.conds.mod_zone_id = params.mod_zone_id;
+ parent_op.conds.mod_pg_ver = params.mod_pg_ver;
+ parent_op.conds.if_match = params.if_match;
+ parent_op.conds.if_nomatch = params.if_nomatch;
+ parent_op.params.lastmod = params.lastmod;
+ parent_op.params.target_obj = params.target_obj;
+ parent_op.params.obj_size = &obj_size;
+ parent_op.params.attrs = &source->get_attrs();
+
+ int ret = parent_op.prepare(y, dpp);
+ if (ret < 0)
+ return ret;
+
+ source->set_key(parent_op.state.obj.key);
+ source->set_obj_size(obj_size);
+
+ return ret;
+}
+
+int RadosObject::RadosReadOp::read(int64_t ofs, int64_t end, bufferlist& bl, optional_yield y, const DoutPrefixProvider* dpp)
+{
+ return parent_op.read(ofs, end, bl, y, dpp);
+}
+
+int RadosObject::RadosReadOp::get_attr(const DoutPrefixProvider* dpp, const char* name, bufferlist& dest, optional_yield y)
+{
+ return parent_op.get_attr(dpp, name, dest, y);
+}
+
+std::unique_ptr<Object::DeleteOp> RadosObject::get_delete_op()
+{
+ return std::make_unique<RadosObject::RadosDeleteOp>(this);
+}
+
+RadosObject::RadosDeleteOp::RadosDeleteOp(RadosObject *_source) :
+ source(_source),
+ op_target(_source->store->getRados(),
+ _source->get_bucket()->get_info(),
+ _source->get_ctx(),
+ _source->get_obj()),
+ parent_op(&op_target)
+{ }
+
+int RadosObject::RadosDeleteOp::delete_obj(const DoutPrefixProvider* dpp, optional_yield y)
+{
+ parent_op.params.bucket_owner = params.bucket_owner.get_id();
+ parent_op.params.versioning_status = params.versioning_status;
+ parent_op.params.obj_owner = params.obj_owner;
+ parent_op.params.olh_epoch = params.olh_epoch;
+ parent_op.params.marker_version_id = params.marker_version_id;
+ parent_op.params.bilog_flags = params.bilog_flags;
+ parent_op.params.remove_objs = params.remove_objs;
+ parent_op.params.expiration_time = params.expiration_time;
+ parent_op.params.unmod_since = params.unmod_since;
+ parent_op.params.mtime = params.mtime;
+ parent_op.params.high_precision_time = params.high_precision_time;
+ parent_op.params.zones_trace = params.zones_trace;
+ parent_op.params.abortmp = params.abortmp;
+ parent_op.params.parts_accounted_size = params.parts_accounted_size;
+
+ int ret = parent_op.delete_obj(y, dpp);
+ if (ret < 0)
+ return ret;
+
+ result.delete_marker = parent_op.result.delete_marker;
+ result.version_id = parent_op.result.version_id;
+
+ return ret;
+}
+
+int RadosObject::delete_object(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ bool prevent_versioning)
+{
+ RGWRados::Object del_target(store->getRados(), bucket->get_info(), *rados_ctx, get_obj());
+ RGWRados::Object::Delete del_op(&del_target);
+
+ del_op.params.bucket_owner = bucket->get_info().owner;
+ del_op.params.versioning_status = prevent_versioning ? 0 : bucket->get_info().versioning_status();
+
+ return del_op.delete_obj(y, dpp);
+}
+
+int RadosObject::delete_obj_aio(const DoutPrefixProvider* dpp, RGWObjState* astate,
+ Completions* aio, bool keep_index_consistent,
+ optional_yield y)
+{
+ RadosCompletions* raio = static_cast<RadosCompletions*>(aio);
+
+ return store->getRados()->delete_obj_aio(dpp, get_obj(), bucket->get_info(), astate,
+ raio->handles, keep_index_consistent, y);
+}
+
+int RadosObject::copy_object(User* user,
+ req_info* info,
+ const rgw_zone_id& source_zone,
+ rgw::sal::Object* dest_object,
+ rgw::sal::Bucket* dest_bucket,
+ rgw::sal::Bucket* src_bucket,
+ const rgw_placement_rule& dest_placement,
+ ceph::real_time* src_mtime,
+ ceph::real_time* mtime,
+ const ceph::real_time* mod_ptr,
+ const ceph::real_time* unmod_ptr,
+ bool high_precision_time,
+ const char* if_match,
+ const char* if_nomatch,
+ AttrsMod attrs_mod,
+ bool copy_if_newer,
+ Attrs& attrs,
+ RGWObjCategory category,
+ uint64_t olh_epoch,
+ boost::optional<ceph::real_time> delete_at,
+ std::string* version_id,
+ std::string* tag,
+ std::string* etag,
+ void (*progress_cb)(off_t, void *),
+ void* progress_data,
+ const DoutPrefixProvider* dpp,
+ optional_yield y)
+{
+ return store->getRados()->copy_obj(*rados_ctx,
+ user->get_id(),
+ info,
+ source_zone,
+ dest_object->get_obj(),
+ get_obj(),
+ dest_bucket->get_info(),
+ src_bucket->get_info(),
+ dest_placement,
+ src_mtime,
+ mtime,
+ mod_ptr,
+ unmod_ptr,
+ high_precision_time,
+ if_match,
+ if_nomatch,
+ static_cast<RGWRados::AttrsMod>(attrs_mod),
+ copy_if_newer,
+ attrs,
+ category,
+ olh_epoch,
+ (delete_at ? *delete_at : real_time()),
+ version_id,
+ tag,
+ etag,
+ progress_cb,
+ progress_data,
+ dpp,
+ y);
+}
+
+int RadosObject::RadosReadOp::iterate(const DoutPrefixProvider* dpp, int64_t ofs, int64_t end, RGWGetDataCB* cb, optional_yield y)
+{
+ return parent_op.iterate(dpp, ofs, end, cb, y);
+}
+
+int RadosObject::swift_versioning_restore(bool& restored,
+ const DoutPrefixProvider* dpp)
+{
+ rgw_obj obj = get_obj();
+ return store->getRados()->swift_versioning_restore(*rados_ctx,
+ bucket->get_owner()->get_id(),
+ bucket->get_info(),
+ obj,
+ restored,
+ dpp);
+}
+
+int RadosObject::swift_versioning_copy(const DoutPrefixProvider* dpp, optional_yield y)
+{
+ return store->getRados()->swift_versioning_copy(*rados_ctx,
+ bucket->get_info().owner,
+ bucket->get_info(),
+ get_obj(),
+ dpp,
+ y);
+}
+
+int RadosMultipartUpload::cleanup_part_history(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ RadosMultipartPart *part,
+ list<rgw_obj_index_key>& remove_objs)
+{
+ cls_rgw_obj_chain chain;
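+ // Each past prefix corresponds to an earlier upload of this part number; queue its
+ // bucket index entry and its tail rados objects for removal.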
+ for (auto& ppfx : part->get_past_prefixes()) {
+ rgw_obj past_obj;
+ past_obj.init_ns(bucket->get_key(), ppfx + "." + std::to_string(part->info.num), mp_ns);
+ rgw_obj_index_key past_key;
+ past_obj.key.get_index_key(&past_key);
+ // Remove past upload part objects from index, too.
+ remove_objs.push_back(past_key);
+
+ RGWObjManifest manifest = part->get_manifest();
+ manifest.set_prefix(ppfx);
+ RGWObjManifest::obj_iterator miter = manifest.obj_begin(dpp);
+ for (; miter != manifest.obj_end(dpp); ++miter) {
+ rgw_raw_obj raw_part_obj = miter.get_location().get_raw_obj(store->getRados());
+ cls_rgw_obj_key part_key(raw_part_obj.oid);
+ chain.push_obj(raw_part_obj.pool.to_str(), part_key, raw_part_obj.loc);
+ }
+ }
+ if (store->getRados()->get_gc() == nullptr) {
+ // Delete objects inline if gc hasn't been initialised (e.g. when bypass-gc is specified)
+ store->getRados()->delete_objs_inline(dpp, chain, mp_obj.get_upload_id());
+ } else {
+ // use upload id as tag and do it synchronously
+ auto [ret, leftover_chain] = store->getRados()->send_chain_to_gc(chain, mp_obj.get_upload_id());
+ if (ret < 0 && leftover_chain) {
+ ldpp_dout(dpp, 5) << __func__ << ": gc->send_chain() returned " << ret << dendl;
+ if (ret == -ENOENT) {
+ return -ERR_NO_SUCH_UPLOAD;
+ }
+ // Delete objects inline if send chain to gc fails
+ store->getRados()->delete_objs_inline(dpp, *leftover_chain, mp_obj.get_upload_id());
+ }
+ }
+ return 0;
+}
+
+
+int RadosMultipartUpload::abort(const DoutPrefixProvider *dpp, CephContext *cct)
+{
+ std::unique_ptr<rgw::sal::Object> meta_obj = get_meta_obj();
+ meta_obj->set_in_extra_data(true);
+ meta_obj->set_hash_source(mp_obj.get_key());
+ cls_rgw_obj_chain chain;
+ list<rgw_obj_index_key> remove_objs;
+ bool truncated;
+ int marker = 0;
+ int ret;
+ uint64_t parts_accounted_size = 0;
+
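+ // Walk all parts: delete or queue their rados objects for GC, and collect the
+ // bucket index entries that need to be removed along with the upload.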
+ do {
+ ret = list_parts(dpp, cct, 1000, marker, &marker, &truncated);
+ if (ret < 0) {
+ ldpp_dout(dpp, 20) << __func__ << ": RadosMultipartUpload::list_parts returned " <<
+ ret << dendl;
+ return (ret == -ENOENT) ? -ERR_NO_SUCH_UPLOAD : ret;
+ }
+
+ for (auto part_it = parts.begin();
+ part_it != parts.end();
+ ++part_it) {
+ RadosMultipartPart* obj_part = dynamic_cast<RadosMultipartPart*>(part_it->second.get());
+ if (obj_part->info.manifest.empty()) {
+ std::unique_ptr<rgw::sal::Object> obj = bucket->get_object(
+ rgw_obj_key(obj_part->oid, std::string(), RGW_OBJ_NS_MULTIPART));
+ obj->set_hash_source(mp_obj.get_key());
+ ret = obj->delete_object(dpp, null_yield);
+ if (ret < 0 && ret != -ENOENT)
+ return ret;
+ } else {
+ auto target = meta_obj->get_obj();
+ store->getRados()->update_gc_chain(dpp, target, obj_part->info.manifest, &chain);
+ RGWObjManifest::obj_iterator oiter = obj_part->info.manifest.obj_begin(dpp);
+ if (oiter != obj_part->info.manifest.obj_end(dpp)) {
+ std::unique_ptr<rgw::sal::Object> head = bucket->get_object(rgw_obj_key());
+ rgw_raw_obj raw_head = oiter.get_location().get_raw_obj(store->getRados());
+ dynamic_cast<rgw::sal::RadosObject*>(head.get())->raw_obj_to_obj(raw_head);
+
+ rgw_obj_index_key key;
+ head->get_key().get_index_key(&key);
+ remove_objs.push_back(key);
+
+ cleanup_part_history(dpp, null_yield, obj_part, remove_objs);
+ }
+ }
+ parts_accounted_size += obj_part->info.accounted_size;
+ }
+ } while (truncated);
+
+ if (store->getRados()->get_gc() == nullptr) {
+ //Delete objects inline if gc hasn't been initialised (e.g. when bypass-gc is specified)
+ store->getRados()->delete_objs_inline(dpp, chain, mp_obj.get_upload_id());
+ } else {
+ /* use upload id as tag and do it synchronously */
+ auto [ret, leftover_chain] = store->getRados()->send_chain_to_gc(chain, mp_obj.get_upload_id());
+ if (ret < 0 && leftover_chain) {
+ ldpp_dout(dpp, 5) << __func__ << ": gc->send_chain() returned " << ret << dendl;
+ if (ret == -ENOENT) {
+ return -ERR_NO_SUCH_UPLOAD;
+ }
+ //Delete objects inline if send chain to gc fails
+ store->getRados()->delete_objs_inline(dpp, *leftover_chain, mp_obj.get_upload_id());
+ }
+ }
+
+ std::unique_ptr<rgw::sal::Object::DeleteOp> del_op = meta_obj->get_delete_op();
+ del_op->params.bucket_owner = bucket->get_acl_owner();
+ del_op->params.versioning_status = 0;
+ if (!remove_objs.empty()) {
+ del_op->params.remove_objs = &remove_objs;
+ }
+
+ del_op->params.abortmp = true;
+ del_op->params.parts_accounted_size = parts_accounted_size;
+
+ // and also remove the metadata obj
+ ret = del_op->delete_obj(dpp, null_yield);
+ if (ret < 0) {
+ ldpp_dout(dpp, 20) << __func__ << ": del_op.delete_obj returned " <<
+ ret << dendl;
+ }
+ return (ret == -ENOENT) ? -ERR_NO_SUCH_UPLOAD : ret;
+}
+
+std::unique_ptr<rgw::sal::Object> RadosMultipartUpload::get_meta_obj()
+{
+ return bucket->get_object(rgw_obj_key(get_meta(), string(), mp_ns));
+}
+
+int RadosMultipartUpload::init(const DoutPrefixProvider *dpp, optional_yield y, ACLOwner& owner, rgw_placement_rule& dest_placement, rgw::sal::Attrs& attrs)
+{
+ int ret;
+ std::string oid = mp_obj.get_key();
+ RGWObjectCtx obj_ctx(store);
+
+ do {
+ char buf[33];
+ string tmp_obj_name;
+ std::unique_ptr<rgw::sal::Object> obj;
+ gen_rand_alphanumeric(store->ctx(), buf, sizeof(buf) - 1);
+ std::string upload_id = MULTIPART_UPLOAD_ID_PREFIX; /* v2 upload id */
+ upload_id.append(buf);
+
+ mp_obj.init(oid, upload_id);
+ tmp_obj_name = mp_obj.get_meta();
+
+ obj = bucket->get_object(rgw_obj_key(tmp_obj_name, string(), mp_ns));
+ // the meta object will be indexed with 0 size, we c
+ obj->set_in_extra_data(true);
+ obj->set_hash_source(oid);
+
+ RGWRados::Object op_target(store->getRados(),
+ obj->get_bucket()->get_info(),
+ obj_ctx, obj->get_obj());
+ RGWRados::Object::Write obj_op(&op_target);
+
+ op_target.set_versioning_disabled(true); /* no versioning for multipart meta */
+ obj_op.meta.owner = owner.get_id();
+ obj_op.meta.category = RGWObjCategory::MultiMeta;
+ obj_op.meta.flags = PUT_OBJ_CREATE_EXCL;
+ obj_op.meta.mtime = &mtime;
+
+ multipart_upload_info upload_info;
+ upload_info.dest_placement = dest_placement;
+
+ bufferlist bl;
+ encode(upload_info, bl);
+ obj_op.meta.data = &bl;
+
+ ret = obj_op.write_meta(dpp, bl.length(), 0, attrs, y);
+ } while (ret == -EEXIST);
+
+ return ret;
+}
+
+int RadosMultipartUpload::list_parts(const DoutPrefixProvider *dpp, CephContext *cct,
+ int num_parts, int marker,
+ int *next_marker, bool *truncated,
+ bool assume_unsorted)
+{
+ map<string, bufferlist> parts_map;
+ map<string, bufferlist>::iterator iter;
+
+ std::unique_ptr<rgw::sal::Object> obj = bucket->get_object(
+ rgw_obj_key(get_meta(), std::string(), RGW_OBJ_NS_MULTIPART));
+ obj->set_in_extra_data(true);
+
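+ // For v2 upload ids the part omap keys are zero-padded ("part.%08d") and therefore
+ // sort numerically, so listing can resume from a marker; otherwise read the whole omap.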
+ bool sorted_omap = is_v2_upload_id(get_upload_id()) && !assume_unsorted;
+
+ parts.clear();
+
+ int ret;
+ if (sorted_omap) {
+ string p;
+ p = "part.";
+ char buf[32];
+
+ snprintf(buf, sizeof(buf), "%08d", marker);
+ p.append(buf);
+
+ ret = obj->omap_get_vals(dpp, p, num_parts + 1, &parts_map,
+ nullptr, null_yield);
+ } else {
+ ret = obj->omap_get_all(dpp, &parts_map, null_yield);
+ }
+ if (ret < 0) {
+ return ret;
+ }
+
+ int i;
+ int last_num = 0;
+
+ uint32_t expected_next = marker + 1;
+
+ for (i = 0, iter = parts_map.begin();
+ (i < num_parts || !sorted_omap) && iter != parts_map.end();
+ ++iter, ++i) {
+ bufferlist& bl = iter->second;
+ auto bli = bl.cbegin();
+ std::unique_ptr<RadosMultipartPart> part = std::make_unique<RadosMultipartPart>();
+ try {
+ decode(part->info, bli);
+ } catch (buffer::error& err) {
+ ldpp_dout(dpp, 0) << "ERROR: could not part info, caught buffer::error" <<
+ dendl;
+ return -EIO;
+ }
+ if (sorted_omap) {
+ if (part->info.num != expected_next) {
+ /* ouch, we expected a specific part num here, but we got a
+ * different one. Either a part is missing, or it could be a
+ * case of mixed rgw versions working on the same upload,
+ * where one gateway doesn't support correctly sorted omap
+ * keys for multipart upload; in that case just assume the data is unsorted.
+ */
+ return list_parts(dpp, cct, num_parts, marker, next_marker, truncated, true);
+ }
+ expected_next++;
+ }
+ if (sorted_omap ||
+ (int)part->info.num > marker) {
+ last_num = part->info.num;
+ parts[part->info.num] = std::move(part);
+ }
+ }
+
+ if (sorted_omap) {
+ if (truncated) {
+ *truncated = (iter != parts_map.end());
+ }
+ } else {
+ /* rebuild a map with only num_parts entries */
+ std::map<uint32_t, std::unique_ptr<MultipartPart>> new_parts;
+ std::map<uint32_t, std::unique_ptr<MultipartPart>>::iterator piter;
+ for (i = 0, piter = parts.begin();
+ i < num_parts && piter != parts.end();
+ ++i, ++piter) {
+ last_num = piter->first;
+ new_parts[piter->first] = std::move(piter->second);
+ }
+
+ if (truncated) {
+ *truncated = (piter != parts.end());
+ }
+
+ parts.swap(new_parts);
+ }
+
+ if (next_marker) {
+ *next_marker = last_num;
+ }
+
+ return 0;
+}
+
+int RadosMultipartUpload::complete(const DoutPrefixProvider *dpp,
+ optional_yield y, CephContext* cct,
+ map<int, string>& part_etags,
+ list<rgw_obj_index_key>& remove_objs,
+ uint64_t& accounted_size, bool& compressed,
+ RGWCompressionInfo& cs_info, off_t& ofs,
+ std::string& tag, ACLOwner& owner,
+ uint64_t olh_epoch,
+ rgw::sal::Object* target_obj)
+{
+ char final_etag[CEPH_CRYPTO_MD5_DIGESTSIZE];
+ char final_etag_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 16];
+ std::string etag;
+ bufferlist etag_bl;
+ MD5 hash;
+ // Allow use of MD5 digest in FIPS mode for non-cryptographic purposes
+ hash.SetFlags(EVP_MD_CTX_FLAG_NON_FIPS_ALLOW);
+ bool truncated;
+ int ret;
+
+ int total_parts = 0;
+ int handled_parts = 0;
+ int max_parts = 1000;
+ int marker = 0;
+ uint64_t min_part_size = cct->_conf->rgw_multipart_min_part_size;
+ auto etags_iter = part_etags.begin();
+ rgw::sal::Attrs attrs = target_obj->get_attrs();
+
+ do {
+ ret = list_parts(dpp, cct, max_parts, marker, &marker, &truncated);
+ if (ret == -ENOENT) {
+ ret = -ERR_NO_SUCH_UPLOAD;
+ }
+ if (ret < 0)
+ return ret;
+
+ total_parts += parts.size();
+ if (!truncated && total_parts != (int)part_etags.size()) {
+ ldpp_dout(dpp, 0) << "NOTICE: total parts mismatch: have: " << total_parts
+ << " expected: " << part_etags.size() << dendl;
+ ret = -ERR_INVALID_PART;
+ return ret;
+ }
+
+ for (auto obj_iter = parts.begin(); etags_iter != part_etags.end() && obj_iter != parts.end(); ++etags_iter, ++obj_iter, ++handled_parts) {
+ RadosMultipartPart* part = dynamic_cast<rgw::sal::RadosMultipartPart*>(obj_iter->second.get());
+ uint64_t part_size = part->get_size();
+ if (handled_parts < (int)part_etags.size() - 1 &&
+ part_size < min_part_size) {
+ ret = -ERR_TOO_SMALL;
+ return ret;
+ }
+
+ char petag[CEPH_CRYPTO_MD5_DIGESTSIZE];
+ if (etags_iter->first != (int)obj_iter->first) {
+ ldpp_dout(dpp, 0) << "NOTICE: parts num mismatch: next requested: "
+ << etags_iter->first << " next uploaded: "
+ << obj_iter->first << dendl;
+ ret = -ERR_INVALID_PART;
+ return ret;
+ }
+ string part_etag = rgw_string_unquote(etags_iter->second);
+ if (part_etag.compare(part->get_etag()) != 0) {
+ ldpp_dout(dpp, 0) << "NOTICE: etag mismatch: part: " << etags_iter->first
+ << " etag: " << etags_iter->second << dendl;
+ ret = -ERR_INVALID_PART;
+ return ret;
+ }
+
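+ // The final multipart ETag is the MD5 of the concatenated binary part ETags,
+ // suffixed below with "-<part count>".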
+ hex_to_buf(part->get_etag().c_str(), petag,
+ CEPH_CRYPTO_MD5_DIGESTSIZE);
+ hash.Update((const unsigned char *)petag, sizeof(petag));
+
+ RGWUploadPartInfo& obj_part = part->info;
+
+ /* update manifest for part */
+ string oid = mp_obj.get_part(part->info.num);
+ rgw_obj src_obj;
+ src_obj.init_ns(bucket->get_key(), oid, mp_ns);
+
+ if (obj_part.manifest.empty()) {
+ ldpp_dout(dpp, 0) << "ERROR: empty manifest for object part: obj="
+ << src_obj << dendl;
+ ret = -ERR_INVALID_PART;
+ return ret;
+ } else {
+ manifest.append(dpp, obj_part.manifest, store->svc()->zone->get_zonegroup(), store->svc()->zone->get_zone_params());
+ auto manifest_prefix = part->info.manifest.get_prefix();
+ if (not manifest_prefix.empty()) {
+ // It has an explicit prefix. Override the default one.
+ src_obj.init_ns(bucket->get_key(), manifest_prefix + "." + std::to_string(part->info.num), mp_ns);
+ }
+ }
+
+ bool part_compressed = (obj_part.cs_info.compression_type != "none");
+ if ((handled_parts > 0) &&
+ ((part_compressed != compressed) ||
+ (cs_info.compression_type != obj_part.cs_info.compression_type))) {
+ ldpp_dout(dpp, 0) << "ERROR: compression type was changed during multipart upload ("
+ << cs_info.compression_type << ">>" << obj_part.cs_info.compression_type << ")" << dendl;
+ ret = -ERR_INVALID_PART;
+ return ret;
+ }
+
+ if (part_compressed) {
+ int64_t new_ofs; // offset in compression data for new part
+ if (cs_info.blocks.size() > 0)
+ new_ofs = cs_info.blocks.back().new_ofs + cs_info.blocks.back().len;
+ else
+ new_ofs = 0;
+ for (const auto& block : obj_part.cs_info.blocks) {
+ compression_block cb;
+ cb.old_ofs = block.old_ofs + cs_info.orig_size;
+ cb.new_ofs = new_ofs;
+ cb.len = block.len;
+ cs_info.blocks.push_back(cb);
+ new_ofs = cb.new_ofs + cb.len;
+ }
+ if (!compressed)
+ cs_info.compression_type = obj_part.cs_info.compression_type;
+ cs_info.orig_size += obj_part.cs_info.orig_size;
+ compressed = true;
+ }
+
+ rgw_obj_index_key remove_key;
+ src_obj.key.get_index_key(&remove_key);
+
+ remove_objs.push_back(remove_key);
+
+ cleanup_part_history(dpp, y, part, remove_objs);
+
+ ofs += obj_part.size;
+ accounted_size += obj_part.accounted_size;
+ }
+ } while (truncated);
+ hash.Final((unsigned char *)final_etag);
+
+ buf_to_hex((unsigned char *)final_etag, sizeof(final_etag), final_etag_str);
+ snprintf(&final_etag_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2],
+ sizeof(final_etag_str) - CEPH_CRYPTO_MD5_DIGESTSIZE * 2,
+ "-%lld", (long long)part_etags.size());
+ etag = final_etag_str;
+ ldpp_dout(dpp, 10) << "calculated etag: " << etag << dendl;
+
+ etag_bl.append(etag);
+
+ attrs[RGW_ATTR_ETAG] = etag_bl;
+
+ if (compressed) {
+ // write compression attribute to full object
+ bufferlist tmp;
+ encode(cs_info, tmp);
+ attrs[RGW_ATTR_COMPRESSION] = tmp;
+ }
+
+ target_obj->set_atomic();
+
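+ // Write the head of the completed object with the stitched-together manifest;
+ // remove_objs clears the individual part entries from the bucket index.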
+ RGWRados::Object op_target(store->getRados(),
+ target_obj->get_bucket()->get_info(),
+ dynamic_cast<RadosObject*>(target_obj)->get_ctx(),
+ target_obj->get_obj());
+ RGWRados::Object::Write obj_op(&op_target);
+
+ obj_op.meta.manifest = &manifest;
+ obj_op.meta.remove_objs = &remove_objs;
+
+ obj_op.meta.ptag = &tag; /* use req_id as operation tag */
+ obj_op.meta.owner = owner.get_id();
+ obj_op.meta.flags = PUT_OBJ_CREATE;
+ obj_op.meta.modify_tail = true;
+ obj_op.meta.completeMultipart = true;
+ obj_op.meta.olh_epoch = olh_epoch;
+
+ ret = obj_op.write_meta(dpp, ofs, accounted_size, attrs, y);
+ if (ret < 0)
+ return ret;
+
+ return ret;
+}
+
+int RadosMultipartUpload::get_info(const DoutPrefixProvider *dpp, optional_yield y, rgw_placement_rule** rule, rgw::sal::Attrs* attrs)
+{
+ if (!rule && !attrs) {
+ return 0;
+ }
+
+ if (rule) {
+ if (!placement.empty()) {
+ *rule = &placement;
+ if (!attrs) {
+ /* Don't need attrs, done */
+ return 0;
+ }
+ } else {
+ *rule = nullptr;
+ }
+ }
+
+ /* We need either attributes or placement, so we need a read */
+ std::unique_ptr<rgw::sal::Object> meta_obj;
+ meta_obj = get_meta_obj();
+ meta_obj->set_in_extra_data(true);
+
+ multipart_upload_info upload_info;
+ bufferlist headbl;
+
+ /* Read the obj head which contains the multipart_upload_info */
+ std::unique_ptr<rgw::sal::Object::ReadOp> read_op = meta_obj->get_read_op();
+ meta_obj->set_prefetch_data();
+
+ int ret = read_op->prepare(y, dpp);
+ if (ret < 0) {
+ if (ret == -ENOENT) {
+ return -ERR_NO_SUCH_UPLOAD;
+ }
+ return ret;
+ }
+
+ extract_span_context(meta_obj->get_attrs(), trace_ctx);
+
+ if (attrs) {
+ /* Attrs are filled in by prepare */
+ *attrs = meta_obj->get_attrs();
+ if (!rule || *rule != nullptr) {
+ /* placement was cached; don't actually read */
+ return 0;
+ }
+ }
+
+ /* Now read the placement from the head */
+ ret = read_op->read(0, store->ctx()->_conf->rgw_max_chunk_size, headbl, y, dpp);
+ if (ret < 0) {
+ if (ret == -ENOENT) {
+ return -ERR_NO_SUCH_UPLOAD;
+ }
+ return ret;
+ }
+
+ if (headbl.length() <= 0) {
+ return -ERR_NO_SUCH_UPLOAD;
+ }
+
+ /* Decode multipart_upload_info */
+ auto hiter = headbl.cbegin();
+ try {
+ decode(upload_info, hiter);
+ } catch (buffer::error& err) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to decode multipart upload info" << dendl;
+ return -EIO;
+ }
+ placement = upload_info.dest_placement;
+ *rule = &placement;
+
+ return 0;
+}
+
+std::unique_ptr<Writer> RadosMultipartUpload::get_writer(
+ const DoutPrefixProvider *dpp,
+ optional_yield y,
+ rgw::sal::Object* obj,
+ const rgw_user& owner,
+ const rgw_placement_rule *ptail_placement_rule,
+ uint64_t part_num,
+ const std::string& part_num_str)
+{
+ RGWBucketInfo& bucket_info = obj->get_bucket()->get_info();
+ RGWObjectCtx& obj_ctx = static_cast<RadosObject*>(obj)->get_ctx();
+ auto aio = rgw::make_throttle(store->ctx()->_conf->rgw_put_obj_min_window_size, y);
+ return std::make_unique<RadosMultipartWriter>(dpp, y, get_upload_id(),
+ bucket_info, obj_ctx,
+ obj->get_obj(), store, std::move(aio), owner,
+ ptail_placement_rule, part_num, part_num_str);
+}
+
+MPRadosSerializer::MPRadosSerializer(const DoutPrefixProvider *dpp, RadosStore* store, RadosObject* obj, const std::string& lock_name) :
+ lock(lock_name)
+{
+ rgw_pool meta_pool;
+ rgw_raw_obj raw_obj;
+
+ obj->get_raw_obj(&raw_obj);
+ oid = raw_obj.oid;
+ store->getRados()->get_obj_data_pool(obj->get_bucket()->get_placement_rule(),
+ obj->get_obj(), &meta_pool);
+ store->getRados()->open_pool_ctx(dpp, meta_pool, ioctx, true, true);
+}
+
+int MPRadosSerializer::try_lock(const DoutPrefixProvider *dpp, utime_t dur, optional_yield y)
+{
+ op.assert_exists();
+ lock.set_duration(dur);
+ lock.lock_exclusive(&op);
+ int ret = rgw_rados_operate(dpp, ioctx, oid, &op, y);
+ if (! ret) {
+ locked = true;
+ }
+ return ret;
+}
+
+LCRadosSerializer::LCRadosSerializer(RadosStore* store, const std::string& _oid, const std::string& lock_name, const std::string& cookie) :
+ StoreLCSerializer(_oid),
+ lock(lock_name)
+{
+ ioctx = &store->getRados()->lc_pool_ctx;
+ lock.set_cookie(cookie);
+}
+
+int LCRadosSerializer::try_lock(const DoutPrefixProvider *dpp, utime_t dur, optional_yield y)
+{
+ lock.set_duration(dur);
+ return lock.lock_exclusive(ioctx, oid);
+}
+
+int RadosLifecycle::get_entry(const std::string& oid, const std::string& marker,
+ std::unique_ptr<LCEntry>* entry)
+{
+ cls_rgw_lc_entry cls_entry;
+ int ret = cls_rgw_lc_get_entry(*store->getRados()->get_lc_pool_ctx(), oid, marker, cls_entry);
+ if (ret)
+ return ret;
+
+ LCEntry* e;
+ e = new StoreLCEntry(cls_entry.bucket, cls_entry.start_time, cls_entry.status);
+ if (!e)
+ return -ENOMEM;
+
+ entry->reset(e);
+ return 0;
+}
+
+int RadosLifecycle::get_next_entry(const std::string& oid, const std::string& marker,
+ std::unique_ptr<LCEntry>* entry)
+{
+ cls_rgw_lc_entry cls_entry;
+ int ret = cls_rgw_lc_get_next_entry(*store->getRados()->get_lc_pool_ctx(), oid, marker,
+ cls_entry);
+
+ if (ret)
+ return ret;
+
+ LCEntry* e;
+ e = new StoreLCEntry(cls_entry.bucket, cls_entry.start_time, cls_entry.status);
+ if (!e)
+ return -ENOMEM;
+
+ entry->reset(e);
+ return 0;
+}
+
+int RadosLifecycle::set_entry(const std::string& oid, LCEntry& entry)
+{
+ cls_rgw_lc_entry cls_entry;
+
+ cls_entry.bucket = entry.get_bucket();
+ cls_entry.start_time = entry.get_start_time();
+ cls_entry.status = entry.get_status();
+
+ return cls_rgw_lc_set_entry(*store->getRados()->get_lc_pool_ctx(), oid, cls_entry);
+}
+
+int RadosLifecycle::list_entries(const std::string& oid, const std::string& marker,
+ uint32_t max_entries, std::vector<std::unique_ptr<LCEntry>>& entries)
+{
+ entries.clear();
+
+ vector<cls_rgw_lc_entry> cls_entries;
+ int ret = cls_rgw_lc_list(*store->getRados()->get_lc_pool_ctx(), oid, marker, max_entries, cls_entries);
+
+ if (ret < 0)
+ return ret;
+
+ for (auto& entry : cls_entries) {
+ entries.push_back(std::make_unique<StoreLCEntry>(entry.bucket, oid,
+ entry.start_time, entry.status));
+ }
+
+ return ret;
+}
+
+int RadosLifecycle::rm_entry(const std::string& oid, LCEntry& entry)
+{
+ cls_rgw_lc_entry cls_entry;
+
+ cls_entry.bucket = entry.get_bucket();
+ cls_entry.start_time = entry.get_start_time();
+ cls_entry.status = entry.get_status();
+
+ return cls_rgw_lc_rm_entry(*store->getRados()->get_lc_pool_ctx(), oid, cls_entry);
+}
+
+int RadosLifecycle::get_head(const std::string& oid, std::unique_ptr<LCHead>* head)
+{
+ cls_rgw_lc_obj_head cls_head;
+ int ret = cls_rgw_lc_get_head(*store->getRados()->get_lc_pool_ctx(), oid, cls_head);
+ if (ret)
+ return ret;
+
+ LCHead* h;
+ h = new StoreLCHead(cls_head.start_date, cls_head.shard_rollover_date, cls_head.marker);
+ if (!h)
+ return -ENOMEM;
+
+ head->reset(h);
+ return 0;
+}
+
+int RadosLifecycle::put_head(const std::string& oid, LCHead& head)
+{
+ cls_rgw_lc_obj_head cls_head;
+
+ cls_head.marker = head.get_marker();
+ cls_head.start_date = head.get_start_date();
+ cls_head.shard_rollover_date = head.get_shard_rollover_date();
+
+ return cls_rgw_lc_put_head(*store->getRados()->get_lc_pool_ctx(), oid, cls_head);
+}
+
+std::unique_ptr<LCSerializer> RadosLifecycle::get_serializer(const std::string& lock_name,
+ const std::string& oid,
+ const std::string& cookie)
+{
+ return std::make_unique<LCRadosSerializer>(store, oid, lock_name, cookie);
+}
+
+int RadosNotification::publish_reserve(const DoutPrefixProvider *dpp, RGWObjTags* obj_tags)
+{
+ return rgw::notify::publish_reserve(dpp, event_type, res, obj_tags);
+}
+
+int RadosNotification::publish_commit(const DoutPrefixProvider* dpp, uint64_t size,
+ const ceph::real_time& mtime, const std::string& etag, const std::string& version)
+{
+ return rgw::notify::publish_commit(obj, size, mtime, etag, version, event_type, res, dpp);
+}
+
+int RadosAtomicWriter::prepare(optional_yield y)
+{
+ return processor.prepare(y);
+}
+
+int RadosAtomicWriter::process(bufferlist&& data, uint64_t offset)
+{
+ return processor.process(std::move(data), offset);
+}
+
+int RadosAtomicWriter::complete(size_t accounted_size, const std::string& etag,
+ ceph::real_time *mtime, ceph::real_time set_mtime,
+ std::map<std::string, bufferlist>& attrs,
+ ceph::real_time delete_at,
+ const char *if_match, const char *if_nomatch,
+ const std::string *user_data,
+ rgw_zone_set *zones_trace, bool *canceled,
+ optional_yield y)
+{
+ return processor.complete(accounted_size, etag, mtime, set_mtime, attrs, delete_at,
+ if_match, if_nomatch, user_data, zones_trace, canceled, y);
+}
+
+int RadosAppendWriter::prepare(optional_yield y)
+{
+ return processor.prepare(y);
+}
+
+int RadosAppendWriter::process(bufferlist&& data, uint64_t offset)
+{
+ return processor.process(std::move(data), offset);
+}
+
+int RadosAppendWriter::complete(size_t accounted_size, const std::string& etag,
+ ceph::real_time *mtime, ceph::real_time set_mtime,
+ std::map<std::string, bufferlist>& attrs,
+ ceph::real_time delete_at,
+ const char *if_match, const char *if_nomatch,
+ const std::string *user_data,
+ rgw_zone_set *zones_trace, bool *canceled,
+ optional_yield y)
+{
+ return processor.complete(accounted_size, etag, mtime, set_mtime, attrs, delete_at,
+ if_match, if_nomatch, user_data, zones_trace, canceled, y);
+}
+
+int RadosMultipartWriter::prepare(optional_yield y)
+{
+ return processor.prepare(y);
+}
+
+int RadosMultipartWriter::process(bufferlist&& data, uint64_t offset)
+{
+ return processor.process(std::move(data), offset);
+}
+
+int RadosMultipartWriter::complete(size_t accounted_size, const std::string& etag,
+ ceph::real_time *mtime, ceph::real_time set_mtime,
+ std::map<std::string, bufferlist>& attrs,
+ ceph::real_time delete_at,
+ const char *if_match, const char *if_nomatch,
+ const std::string *user_data,
+ rgw_zone_set *zones_trace, bool *canceled,
+ optional_yield y)
+{
+ return processor.complete(accounted_size, etag, mtime, set_mtime, attrs, delete_at,
+ if_match, if_nomatch, user_data, zones_trace, canceled, y);
+}
+
+const std::string& RadosZoneGroup::get_endpoint() const
+{
+ if (!group.endpoints.empty()) {
+ return group.endpoints.front();
+ } else {
+ // use zonegroup's master zone endpoints
+ auto z = group.zones.find(group.master_zone);
+ if (z != group.zones.end() && !z->second.endpoints.empty()) {
+ return z->second.endpoints.front();
+ }
+ }
+ return empty;
+}
+
+bool RadosZoneGroup::placement_target_exists(std::string& target) const
+{
+ return !!group.placement_targets.count(target);
+}
+
+void RadosZoneGroup::get_placement_target_names(std::set<std::string>& names) const
+{
+ for (const auto& target : group.placement_targets) {
+ names.emplace(target.second.name);
+ }
+}
+
+int RadosZoneGroup::get_placement_tier(const rgw_placement_rule& rule,
+ std::unique_ptr<PlacementTier>* tier)
+{
+ std::map<std::string, RGWZoneGroupPlacementTarget>::const_iterator titer;
+ titer = group.placement_targets.find(rule.name);
+ if (titer == group.placement_targets.end()) {
+ return -ENOENT;
+ }
+
+ const auto& target_rule = titer->second;
+ std::map<std::string, RGWZoneGroupPlacementTier>::const_iterator ttier;
+ ttier = target_rule.tier_targets.find(rule.storage_class);
+ if (ttier == target_rule.tier_targets.end()) {
+ // not found
+ return -ENOENT;
+ }
+
+ PlacementTier* t;
+ t = new RadosPlacementTier(store, ttier->second);
+ if (!t)
+ return -ENOMEM;
+
+ tier->reset(t);
+ return 0;
+}
+
+int RadosZoneGroup::get_zone_by_id(const std::string& id, std::unique_ptr<Zone>* zone)
+{
+ RGWZone* rz = store->svc()->zone->find_zone(id);
+ if (!rz)
+ return -ENOENT;
+
+ Zone* z = new RadosZone(store, clone(), *rz);
+ zone->reset(z);
+ return 0;
+}
+
+int RadosZoneGroup::get_zone_by_name(const std::string& name, std::unique_ptr<Zone>* zone)
+{
+ rgw_zone_id id;
+ int ret = store->svc()->zone->find_zone_id_by_name(name, &id);
+ if (ret < 0)
+ return ret;
+
+ RGWZone* rz = store->svc()->zone->find_zone(id.id);
+ if (!rz)
+ return -ENOENT;
+
+ Zone* z = new RadosZone(store, clone(), *rz);
+ zone->reset(z);
+ return 0;
+}
+
+int RadosZoneGroup::list_zones(std::list<std::string>& zone_ids)
+{
+ for (const auto& entry : group.zones)
+ {
+ zone_ids.push_back(entry.second.id);
+ }
+ return 0;
+}
+
+std::unique_ptr<Zone> RadosZone::clone()
+{
+ if (local_zone)
+ return std::make_unique<RadosZone>(store, group->clone());
+
+ return std::make_unique<RadosZone>(store, group->clone(), rgw_zone);
+}
+
+const std::string& RadosZone::get_id()
+{
+ if (local_zone)
+ return store->svc()->zone->zone_id().id;
+
+ return rgw_zone.id;
+}
+
+const std::string& RadosZone::get_name() const
+{
+ if (local_zone)
+ return store->svc()->zone->zone_name();
+
+ return rgw_zone.name;
+}
+
+bool RadosZone::is_writeable()
+{
+ if (local_zone)
+ return store->svc()->zone->zone_is_writeable();
+
+ return !rgw_zone.read_only;
+}
+
+bool RadosZone::get_redirect_endpoint(std::string* endpoint)
+{
+ if (local_zone)
+ return store->svc()->zone->get_redirect_zone_endpoint(endpoint);
+
+ *endpoint = rgw_zone.redirect_zone;
+ return true;
+}
+
+bool RadosZone::has_zonegroup_api(const std::string& api) const
+{
+ return store->svc()->zone->has_zonegroup_api(api);
+}
+
+const std::string& RadosZone::get_current_period_id()
+{
+ return store->svc()->zone->get_current_period_id();
+}
+
+const RGWAccessKey& RadosZone::get_system_key()
+{
+ return store->svc()->zone->get_zone_params().system_key;
+}
+
+const std::string& RadosZone::get_realm_name()
+{
+ return store->svc()->zone->get_realm().get_name();
+}
+
+const std::string& RadosZone::get_realm_id()
+{
+ return store->svc()->zone->get_realm().get_id();
+}
+
+const std::string_view RadosZone::get_tier_type()
+{
+ if (local_zone)
+ return store->svc()->zone->get_zone().tier_type;
+
+ return rgw_zone.tier_type;
+}
+
+RGWBucketSyncPolicyHandlerRef RadosZone::get_sync_policy_handler()
+{
+ return store->svc()->zone->get_sync_policy_handler(get_id());
+}
+
+RadosLuaManager::RadosLuaManager(RadosStore* _s) :
+ store(_s),
+ pool((store->svc() && store->svc()->zone) ? store->svc()->zone->get_zone_params().log_pool : rgw_pool())
+{ }
+
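+// Lua scripts are stored as system objects in the zone's log pool under the
+// given key; an unset pool (e.g. when zone services are unavailable) is
+// treated as "no script stored" rather than as an error.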
+int RadosLuaManager::get_script(const DoutPrefixProvider* dpp, optional_yield y, const std::string& key, std::string& script)
+{
+ if (pool.empty()) {
+ ldpp_dout(dpp, 10) << "WARNING: missing pool when reading lua script " << dendl;
+ return 0;
+ }
+ bufferlist bl;
+
+ int r = rgw_get_system_obj(store->svc()->sysobj, pool, key, bl, nullptr, nullptr, y, dpp);
+ if (r < 0) {
+ return r;
+ }
+
+ auto iter = bl.cbegin();
+ try {
+ ceph::decode(script, iter);
+ } catch (buffer::error& err) {
+ return -EIO;
+ }
+
+ return 0;
+}
+
+int RadosLuaManager::put_script(const DoutPrefixProvider* dpp, optional_yield y, const std::string& key, const std::string& script)
+{
+ if (pool.empty()) {
+ ldpp_dout(dpp, 10) << "WARNING: missing pool when writing lua script " << dendl;
+ return 0;
+ }
+ bufferlist bl;
+ ceph::encode(script, bl);
+
+ int r = rgw_put_system_obj(dpp, store->svc()->sysobj, pool, key, bl, false, nullptr, real_time(), y);
+ if (r < 0) {
+ return r;
+ }
+
+ return 0;
+}
+
+int RadosLuaManager::del_script(const DoutPrefixProvider* dpp, optional_yield y, const std::string& key)
+{
+ if (pool.empty()) {
+ ldpp_dout(dpp, 10) << "WARNING: missing pool when deleting lua script " << dendl;
+ return 0;
+ }
+ int r = rgw_delete_system_obj(dpp, store->svc()->sysobj, pool, key, nullptr, y);
+ if (r < 0 && r != -ENOENT) {
+ return r;
+ }
+
+ return 0;
+}
+
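+// The Lua package allowlist is kept as omap keys ("<name>" or
+// "<name> <version>") on a single object in the lc pool.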
+const std::string PACKAGE_LIST_OBJECT_NAME = "lua_package_allowlist";
+
+int RadosLuaManager::add_package(const DoutPrefixProvider *dpp, optional_yield y, const std::string& package_name)
+{
+ // add package to list
+ const bufferlist empty_bl;
+ std::map<std::string, bufferlist> new_package{{package_name, empty_bl}};
+ librados::ObjectWriteOperation op;
+ op.omap_set(new_package);
+ auto ret = rgw_rados_operate(dpp, *(store->getRados()->get_lc_pool_ctx()),
+ PACKAGE_LIST_OBJECT_NAME, &op, y);
+
+ if (ret < 0) {
+ return ret;
+ }
+ return 0;
+}
+
+int RadosLuaManager::remove_package(const DoutPrefixProvider *dpp, optional_yield y, const std::string& package_name)
+{
+ librados::ObjectWriteOperation op;
+ size_t pos = package_name.find(" ");
+ if (pos != package_name.npos) {
+ // remove a specific version of the package
+ op.omap_rm_keys(std::set<std::string>({package_name}));
+ auto ret = rgw_rados_operate(dpp, *(store->getRados()->get_lc_pool_ctx()),
+ PACKAGE_LIST_OBJECT_NAME, &op, y);
+ if (ret < 0) {
+ return ret;
+ }
+ return 0;
+ }
+ // otherwise, remove any existing versions of the package
+ rgw::lua::packages_t packages;
+ auto ret = list_packages(dpp, y, packages);
+ if (ret < 0 && ret != -ENOENT) {
+ return ret;
+ }
+ for(const auto& package : packages) {
+ const std::string package_no_version = package.substr(0, package.find(" "));
+ if (package_no_version.compare(package_name) == 0) {
+ op.omap_rm_keys(std::set<std::string>({package}));
+ ret = rgw_rados_operate(dpp, *(store->getRados()->get_lc_pool_ctx()),
+ PACKAGE_LIST_OBJECT_NAME, &op, y);
+ if (ret < 0) {
+ return ret;
+ }
+ }
+ }
+ return 0;
+}
+
+int RadosLuaManager::list_packages(const DoutPrefixProvider *dpp, optional_yield y, rgw::lua::packages_t& packages)
+{
+ constexpr auto max_chunk = 1024U;
+ std::string start_after;
+ bool more = true;
+ int rval;
+ while (more) {
+ librados::ObjectReadOperation op;
+ rgw::lua::packages_t packages_chunk;
+ op.omap_get_keys2(start_after, max_chunk, &packages_chunk, &more, &rval);
+ const auto ret = rgw_rados_operate(dpp, *(store->getRados()->get_lc_pool_ctx()),
+ PACKAGE_LIST_OBJECT_NAME, &op, nullptr, y);
+
+ if (ret < 0) {
+ return ret;
+ }
+
+ if (!packages_chunk.empty()) {
+ // advance the omap marker so the next omap_get_keys2() call continues
+ // past the keys already returned instead of re-reading the same chunk
+ start_after = *packages_chunk.rbegin();
+ }
+ packages.merge(packages_chunk);
+ }
+
+ return 0;
+}
+
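+// OIDC provider records are stored as system objects in the zone's oidc pool,
+// with object names of the form <tenant><url_oid_prefix><url>.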
+int RadosOIDCProvider::store_url(const DoutPrefixProvider *dpp, const std::string& url, bool exclusive, optional_yield y)
+{
+ auto sysobj = store->svc()->sysobj;
+ std::string oid = tenant + get_url_oid_prefix() + url;
+
+ bufferlist bl;
+ using ceph::encode;
+ encode(*this, bl);
+ return rgw_put_system_obj(dpp, sysobj, store->svc()->zone->get_zone_params().oidc_pool, oid, bl, exclusive, nullptr, real_time(), y);
+}
+
+int RadosOIDCProvider::read_url(const DoutPrefixProvider *dpp, const std::string& url, const std::string& tenant)
+{
+ auto sysobj = store->svc()->sysobj;
+ auto& pool = store->svc()->zone->get_zone_params().oidc_pool;
+ std::string oid = tenant + get_url_oid_prefix() + url;
+ bufferlist bl;
+
+ int ret = rgw_get_system_obj(sysobj, pool, oid, bl, nullptr, nullptr, null_yield, dpp);
+ if (ret < 0) {
+ return ret;
+ }
+
+ try {
+ using ceph::decode;
+ auto iter = bl.cbegin();
+ decode(*this, iter);
+ } catch (buffer::error& err) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to decode oidc provider info from pool: " << pool.name <<
+ ": " << url << dendl;
+ return -EIO;
+ }
+
+ return 0;
+}
+
+int RadosOIDCProvider::delete_obj(const DoutPrefixProvider *dpp, optional_yield y)
+{
+ auto& pool = store->svc()->zone->get_zone_params().oidc_pool;
+
+ std::string url, tenant;
+ auto ret = get_tenant_url_from_arn(tenant, url);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to parse arn" << dendl;
+ return -EINVAL;
+ }
+
+ if (this->tenant != tenant) {
+ ldpp_dout(dpp, 0) << "ERROR: tenant in arn doesn't match that of user " << this->tenant << ", "
+ << tenant << ": " << dendl;
+ return -EINVAL;
+ }
+
+ // Delete url
+ std::string oid = tenant + get_url_oid_prefix() + url;
+ ret = rgw_delete_system_obj(dpp, store->svc()->sysobj, pool, oid, nullptr, y);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: deleting oidc url from pool: " << pool.name << ": "
+ << provider_url << ": " << cpp_strerror(-ret) << dendl;
+ }
+
+ return ret;
+}
+
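+// Each role is persisted as three records: the role info (keyed by role id via
+// the metadata backend), a name-to-id mapping and a path entry, the latter two
+// stored as system objects in the roles pool.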
+int RadosRole::store_info(const DoutPrefixProvider *dpp, bool exclusive, optional_yield y)
+{
+ using ceph::encode;
+ std::string oid = info.id;
+
+ bufferlist bl;
+ encode(this->info, bl);
+
+ map<string, bufferlist> attrs;
+ map<string, bufferlist>* pattrs = nullptr;
+ if (!this->info.tags.empty()) {
+ bufferlist bl_tags;
+ encode(this->info.tags, bl_tags);
+ attrs.emplace("tagging", bl_tags);
+ pattrs = &attrs;
+ }
+
+ RGWSI_MBSObj_PutParams params(bl, pattrs, info.mtime, exclusive);
+ std::unique_ptr<RGWSI_MetaBackend::Context> ctx(store->svc()->role->svc.meta_be->alloc_ctx());
+ ctx->init(store->svc()->role->get_be_handler());
+ return store->svc()->role->svc.meta_be->put(ctx.get(), oid, params, &info.objv_tracker, y, dpp);
+}
+
+int RadosRole::store_name(const DoutPrefixProvider *dpp, bool exclusive, optional_yield y)
+{
+ auto sysobj = store->svc()->sysobj;
+ RGWNameToId nameToId;
+ nameToId.obj_id = info.id;
+
+ std::string oid = info.tenant + get_names_oid_prefix() + info.name;
+
+ bufferlist bl;
+ using ceph::encode;
+ encode(nameToId, bl);
+
+ return rgw_put_system_obj(dpp, sysobj, store->svc()->zone->get_zone_params().roles_pool, oid, bl, exclusive, &info.objv_tracker, real_time(), y);
+}
+
+int RadosRole::store_path(const DoutPrefixProvider *dpp, bool exclusive, optional_yield y)
+{
+ auto sysobj = store->svc()->sysobj;
+ std::string oid = info.tenant + get_path_oid_prefix() + info.path + get_info_oid_prefix() + info.id;
+
+ bufferlist bl;
+
+ return rgw_put_system_obj(dpp, sysobj, store->svc()->zone->get_zone_params().roles_pool, oid, bl, exclusive, &info.objv_tracker, real_time(), y);
+}
+
+int RadosRole::read_id(const DoutPrefixProvider *dpp, const std::string& role_name, const std::string& tenant, std::string& role_id, optional_yield y)
+{
+ auto sysobj = store->svc()->sysobj;
+ std::string oid = info.tenant + get_names_oid_prefix() + role_name;
+ bufferlist bl;
+
+ int ret = rgw_get_system_obj(sysobj, store->svc()->zone->get_zone_params().roles_pool, oid, bl, nullptr, nullptr, y, dpp);
+ if (ret < 0) {
+ return ret;
+ }
+
+ RGWNameToId nameToId;
+ try {
+ auto iter = bl.cbegin();
+ using ceph::decode;
+ decode(nameToId, iter);
+ } catch (buffer::error& err) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to decode role from Role pool: " << role_name << dendl;
+ return -EIO;
+ }
+ role_id = nameToId.obj_id;
+ return 0;
+}
+
+int RadosRole::read_name(const DoutPrefixProvider *dpp, optional_yield y)
+{
+ auto sysobj = store->svc()->sysobj;
+ std::string oid = info.tenant + get_names_oid_prefix() + info.name;
+ bufferlist bl;
+
+ int ret = rgw_get_system_obj(sysobj, store->svc()->zone->get_zone_params().roles_pool, oid, bl, nullptr, nullptr, y, dpp);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed reading role name from Role pool: " << info.name <<
+ ": " << cpp_strerror(-ret) << dendl;
+ return ret;
+ }
+
+ RGWNameToId nameToId;
+ try {
+ using ceph::decode;
+ auto iter = bl.cbegin();
+ decode(nameToId, iter);
+ } catch (buffer::error& err) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to decode role name from Role pool: " << info.name << dendl;
+ return -EIO;
+ }
+ info.id = nameToId.obj_id;
+ return 0;
+}
+
+int RadosRole::read_info(const DoutPrefixProvider *dpp, optional_yield y)
+{
+ std::string oid;
+
+ oid = info.id;
+ ldpp_dout(dpp, 20) << "INFO: oid in read_info is: " << oid << dendl;
+
+ bufferlist bl;
+
+ RGWSI_MBSObj_GetParams params(&bl, &info.attrs, &info.mtime);
+ std::unique_ptr<RGWSI_MetaBackend::Context> ctx(store->svc()->role->svc.meta_be->alloc_ctx());
+ ctx->init(store->svc()->role->get_be_handler());
+ int ret = store->svc()->role->svc.meta_be->get(ctx.get(), oid, params, &info.objv_tracker, y, dpp, true);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed reading role info from Role pool: " << info.id << ": " << cpp_strerror(-ret) << dendl;
+ return ret;
+ }
+
+ try {
+ using ceph::decode;
+ auto iter = bl.cbegin();
+ decode(this->info, iter);
+ } catch (buffer::error& err) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to decode role info from Role pool: " << info.id << dendl;
+ return -EIO;
+ }
+
+ auto it = info.attrs.find("tagging");
+ if (it != info.attrs.end()) {
+ bufferlist bl_tags = it->second;
+ try {
+ using ceph::decode;
+ auto iter = bl_tags.cbegin();
+ decode(info.tags, iter);
+ } catch (buffer::error& err) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to decode attrs" << info.id << dendl;
+ return -EIO;
+ }
+ }
+
+ return 0;
+}
+
+int RadosRole::create(const DoutPrefixProvider *dpp, bool exclusive, const std::string& role_id, optional_yield y)
+{
+ int ret;
+
+ if (! validate_input(dpp)) {
+ return -EINVAL;
+ }
+
+ if (!role_id.empty()) {
+ info.id = role_id;
+ }
+
+ /* check to see the name is not used */
+ ret = read_id(dpp, info.name, info.tenant, info.id, y);
+ if (exclusive && ret == 0) {
+ ldpp_dout(dpp, 0) << "ERROR: name " << info.name << " already in use for role id "
+ << info.id << dendl;
+ return -EEXIST;
+ } else if ( ret < 0 && ret != -ENOENT) {
+ ldpp_dout(dpp, 0) << "failed reading role id " << info.id << ": "
+ << cpp_strerror(-ret) << dendl;
+ return ret;
+ }
+
+ if (info.id.empty()) {
+ /* create unique id */
+ uuid_d new_uuid;
+ char uuid_str[37];
+ new_uuid.generate_random();
+ new_uuid.print(uuid_str);
+ info.id = uuid_str;
+ }
+
+ // construct the role ARN
+ info.arn = role_arn_prefix + info.tenant + ":role" + info.path + info.name;
+
+ // Creation time
+ real_clock::time_point t = real_clock::now();
+
+ struct timeval tv;
+ real_clock::to_timeval(t, tv);
+
+ char buf[30];
+ struct tm result;
+ gmtime_r(&tv.tv_sec, &result);
+ strftime(buf,30,"%Y-%m-%dT%H:%M:%S", &result);
+ sprintf(buf + strlen(buf),".%dZ",(int)tv.tv_usec/1000);
+ info.creation_date.assign(buf, strlen(buf));
+
+ auto& pool = store->svc()->zone->get_zone_params().roles_pool;
+ ret = store_info(dpp, exclusive, y);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: storing role info in Role pool: "
+ << info.id << ": " << cpp_strerror(-ret) << dendl;
+ return ret;
+ }
+
+ ret = store_name(dpp, exclusive, y);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: storing role name in Role pool: "
+ << info.name << ": " << cpp_strerror(-ret) << dendl;
+
+ //Delete the role info that was stored in the previous call
+ std::string oid = get_info_oid_prefix() + info.id;
+ int info_ret = rgw_delete_system_obj(dpp, store->svc()->sysobj, pool, oid, nullptr, y);
+ if (info_ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: cleanup of role id from Role pool: "
+ << info.id << ": " << cpp_strerror(-info_ret) << dendl;
+ }
+ return ret;
+ }
+
+ ret = store_path(dpp, exclusive, y);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: storing role path in Role pool: "
+ << info.path << ": " << cpp_strerror(-ret) << dendl;
+ //Delete the role info that was stored in the previous call
+ std::string oid = get_info_oid_prefix() + info.id;
+ int info_ret = rgw_delete_system_obj(dpp, store->svc()->sysobj, pool, oid, nullptr, y);
+ if (info_ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: cleanup of role id from Role pool: "
+ << info.id << ": " << cpp_strerror(-info_ret) << dendl;
+ }
+ //Delete role name that was stored in previous call
+ oid = info.tenant + get_names_oid_prefix() + info.name;
+ int name_ret = rgw_delete_system_obj(dpp, store->svc()->sysobj, pool, oid, nullptr, y);
+ if (name_ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: cleanup of role name from Role pool: "
+ << info.name << ": " << cpp_strerror(-name_ret) << dendl;
+ }
+ return ret;
+ }
+ return 0;
+}
+
+int RadosRole::delete_obj(const DoutPrefixProvider *dpp, optional_yield y)
+{
+ auto& pool = store->svc()->zone->get_zone_params().roles_pool;
+
+ int ret = read_name(dpp, y);
+ if (ret < 0) {
+ return ret;
+ }
+
+ ret = read_info(dpp, y);
+ if (ret < 0) {
+ return ret;
+ }
+
+ if (! info.perm_policy_map.empty()) {
+ return -ERR_DELETE_CONFLICT;
+ }
+
+ // Delete id
+ std::string oid = get_info_oid_prefix() + info.id;
+ ret = rgw_delete_system_obj(dpp, store->svc()->sysobj, pool, oid, nullptr, y);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: deleting role id from Role pool: "
+ << info.id << ": " << cpp_strerror(-ret) << dendl;
+ }
+
+ // Delete name
+ oid = info.tenant + get_names_oid_prefix() + info.name;
+ ret = rgw_delete_system_obj(dpp, store->svc()->sysobj, pool, oid, nullptr, y);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: deleting role name from Role pool: "
+ << info.name << ": " << cpp_strerror(-ret) << dendl;
+ }
+
+ // Delete path
+ oid = info.tenant + get_path_oid_prefix() + info.path + get_info_oid_prefix() + info.id;
+ ret = rgw_delete_system_obj(dpp, store->svc()->sysobj, pool, oid, nullptr, y);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: deleting role path from Role pool: "
+ << info.path << ": " << cpp_strerror(-ret) << dendl;
+ }
+ return ret;
+}
+
+} // namespace rgw::sal
+
+extern "C" {
+
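+// Factory entry point (presumably invoked by the SAL driver loader) that wires
+// up a RadosStore with a freshly allocated RGWRados instance.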
+void* newRadosStore(void)
+{
+ rgw::sal::RadosStore* store = new rgw::sal::RadosStore();
+ if (store) {
+ RGWRados* rados = new RGWRados();
+
+ if (!rados) {
+ delete store; store = nullptr;
+ } else {
+ store->setRados(rados);
+ rados->set_store(store);
+ }
+ }
+
+ return store;
+}
+
+}
diff --git a/src/rgw/driver/rados/rgw_sal_rados.h b/src/rgw/driver/rados/rgw_sal_rados.h
new file mode 100644
index 000000000..4d2dc9709
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_sal_rados.h
@@ -0,0 +1,978 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2020 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include "rgw_sal_store.h"
+#include "rgw_rados.h"
+#include "rgw_notify.h"
+#include "rgw_oidc_provider.h"
+#include "rgw_role.h"
+#include "rgw_multi.h"
+#include "rgw_putobj_processor.h"
+#include "services/svc_tier_rados.h"
+#include "cls/lock/cls_lock_client.h"
+
+namespace rgw { namespace sal {
+
+class RadosMultipartUpload;
+
+class RadosCompletions : public Completions {
+ public:
+ std::list<librados::AioCompletion*> handles;
+ RadosCompletions() {}
+ ~RadosCompletions() = default;
+ virtual int drain() override;
+};
+
+class RadosPlacementTier: public StorePlacementTier {
+ RadosStore* store;
+ RGWZoneGroupPlacementTier tier;
+public:
+ RadosPlacementTier(RadosStore* _store, const RGWZoneGroupPlacementTier& _tier) : store(_store), tier(_tier) {}
+ virtual ~RadosPlacementTier() = default;
+
+ virtual const std::string& get_tier_type() { return tier.tier_type; }
+ virtual const std::string& get_storage_class() { return tier.storage_class; }
+ virtual bool retain_head_object() { return tier.retain_head_object; }
+ RGWZoneGroupPlacementTier& get_rt() { return tier; }
+};
+
+class RadosZoneGroup : public StoreZoneGroup {
+ RadosStore* store;
+ const RGWZoneGroup group;
+ std::string empty;
+public:
+ RadosZoneGroup(RadosStore* _store, const RGWZoneGroup& _group) : store(_store), group(_group) {}
+ virtual ~RadosZoneGroup() = default;
+
+ virtual const std::string& get_id() const override { return group.get_id(); };
+ virtual const std::string& get_name() const override { return group.get_name(); };
+ virtual int equals(const std::string& other_zonegroup) const override {
+ return group.equals(other_zonegroup);
+ };
+ /** Get the endpoint from zonegroup, or from master zone if not set */
+ virtual const std::string& get_endpoint() const override;
+ virtual bool placement_target_exists(std::string& target) const override;
+ virtual bool is_master_zonegroup() const override {
+ return group.is_master_zonegroup();
+ };
+ virtual const std::string& get_api_name() const override { return group.api_name; };
+ virtual void get_placement_target_names(std::set<std::string>& names) const override;
+ virtual const std::string& get_default_placement_name() const override {
+ return group.default_placement.name; };
+ virtual int get_hostnames(std::list<std::string>& names) const override {
+ names = group.hostnames;
+ return 0;
+ };
+ virtual int get_s3website_hostnames(std::list<std::string>& names) const override {
+ names = group.hostnames_s3website;
+ return 0;
+ };
+ virtual int get_zone_count() const override {
+ return group.zones.size();
+ }
+ virtual int get_placement_tier(const rgw_placement_rule& rule, std::unique_ptr<PlacementTier>* tier);
+ virtual int get_zone_by_id(const std::string& id, std::unique_ptr<Zone>* zone) override;
+ virtual int get_zone_by_name(const std::string& name, std::unique_ptr<Zone>* zone) override;
+ virtual int list_zones(std::list<std::string>& zone_ids) override;
+ bool supports(std::string_view feature) const override {
+ return group.supports(feature);
+ }
+ virtual std::unique_ptr<ZoneGroup> clone() override {
+ return std::make_unique<RadosZoneGroup>(store, group);
+ }
+ const RGWZoneGroup& get_group() const { return group; }
+};
+
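+/* A zone viewed through SAL. Constructed without an RGWZone it represents the
+ * gateway's local zone and answers queries from the zone service; otherwise it
+ * reports values from the copied RGWZone entry of the owning zonegroup. */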
+class RadosZone : public StoreZone {
+ protected:
+ RadosStore* store;
+ std::unique_ptr<ZoneGroup> group;
+ RGWZone rgw_zone;
+ bool local_zone{false};
+ public:
+ RadosZone(RadosStore* _store, std::unique_ptr<ZoneGroup> _zg) : store(_store), group(std::move(_zg)), local_zone(true) {}
+ RadosZone(RadosStore* _store, std::unique_ptr<ZoneGroup> _zg, RGWZone& z) : store(_store), group(std::move(_zg)), rgw_zone(z) {}
+ ~RadosZone() = default;
+
+ virtual std::unique_ptr<Zone> clone() override;
+ virtual ZoneGroup& get_zonegroup() override { return *(group.get()); }
+ virtual const std::string& get_id() override;
+ virtual const std::string& get_name() const override;
+ virtual bool is_writeable() override;
+ virtual bool get_redirect_endpoint(std::string* endpoint) override;
+ virtual bool has_zonegroup_api(const std::string& api) const override;
+ virtual const std::string& get_current_period_id() override;
+ virtual const RGWAccessKey& get_system_key() override;
+ virtual const std::string& get_realm_name() override;
+ virtual const std::string& get_realm_id() override;
+ virtual const std::string_view get_tier_type() override;
+ virtual RGWBucketSyncPolicyHandlerRef get_sync_policy_handler() override;
+};
+
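+/* The RADOS-backed SAL driver. Owns the RGWRados instance and exposes users,
+ * buckets, objects, lifecycle, notifications and writers on top of it. */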
+class RadosStore : public StoreDriver {
+ private:
+ RGWRados* rados;
+ RGWUserCtl* user_ctl;
+ std::unique_ptr<RadosZone> zone;
+ std::string topics_oid(const std::string& tenant) const;
+
+ public:
+ RadosStore()
+ : rados(nullptr) {
+ }
+ ~RadosStore() {
+ delete rados;
+ }
+
+ virtual int initialize(CephContext *cct, const DoutPrefixProvider *dpp) override;
+ virtual const std::string get_name() const override {
+ return "rados";
+ }
+ virtual std::string get_cluster_id(const DoutPrefixProvider* dpp, optional_yield y) override;
+ virtual std::unique_ptr<User> get_user(const rgw_user& u) override;
+ virtual int get_user_by_access_key(const DoutPrefixProvider* dpp, const std::string& key, optional_yield y, std::unique_ptr<User>* user) override;
+ virtual int get_user_by_email(const DoutPrefixProvider* dpp, const std::string& email, optional_yield y, std::unique_ptr<User>* user) override;
+ virtual int get_user_by_swift(const DoutPrefixProvider* dpp, const std::string& user_str, optional_yield y, std::unique_ptr<User>* user) override;
+ virtual std::unique_ptr<Object> get_object(const rgw_obj_key& k) override;
+ virtual int get_bucket(const DoutPrefixProvider* dpp, User* u, const rgw_bucket& b, std::unique_ptr<Bucket>* bucket, optional_yield y) override;
+ virtual int get_bucket(User* u, const RGWBucketInfo& i, std::unique_ptr<Bucket>* bucket) override;
+ virtual int get_bucket(const DoutPrefixProvider* dpp, User* u, const std::string& tenant, const std::string& name, std::unique_ptr<Bucket>* bucket, optional_yield y) override;
+ virtual bool is_meta_master() override;
+ virtual int forward_request_to_master(const DoutPrefixProvider *dpp, User* user, obj_version* objv,
+ bufferlist& in_data, JSONParser* jp, req_info& info,
+ optional_yield y) override;
+ virtual int forward_iam_request_to_master(const DoutPrefixProvider *dpp, const RGWAccessKey& key, obj_version* objv,
+ bufferlist& in_data,
+ RGWXMLDecoder::XMLParser* parser, req_info& info,
+ optional_yield y) override;
+ virtual Zone* get_zone() { return zone.get(); }
+ virtual std::string zone_unique_id(uint64_t unique_num) override;
+ virtual std::string zone_unique_trans_id(const uint64_t unique_num) override;
+ virtual int get_zonegroup(const std::string& id, std::unique_ptr<ZoneGroup>* zonegroup) override;
+ virtual int list_all_zones(const DoutPrefixProvider* dpp, std::list<std::string>& zone_ids) override;
+ virtual int cluster_stat(RGWClusterStat& stats) override;
+ virtual std::unique_ptr<Lifecycle> get_lifecycle(void) override;
+ virtual std::unique_ptr<Completions> get_completions(void) override;
+ virtual std::unique_ptr<Notification> get_notification(rgw::sal::Object* obj, rgw::sal::Object* src_obj, req_state* s, rgw::notify::EventType event_type, optional_yield y, const std::string* object_name=nullptr) override;
+ virtual std::unique_ptr<Notification> get_notification(
+ const DoutPrefixProvider* dpp, rgw::sal::Object* obj, rgw::sal::Object* src_obj,
+ rgw::notify::EventType event_type, rgw::sal::Bucket* _bucket, std::string& _user_id, std::string& _user_tenant,
+ std::string& _req_id, optional_yield y) override;
+ int read_topics(const std::string& tenant, rgw_pubsub_topics& topics, RGWObjVersionTracker* objv_tracker,
+ optional_yield y, const DoutPrefixProvider *dpp) override;
+ int write_topics(const std::string& tenant, const rgw_pubsub_topics& topics, RGWObjVersionTracker* objv_tracker,
+ optional_yield y, const DoutPrefixProvider *dpp) override;
+ int remove_topics(const std::string& tenant, RGWObjVersionTracker* objv_tracker,
+ optional_yield y, const DoutPrefixProvider *dpp) override;
+ virtual RGWLC* get_rgwlc(void) override { return rados->get_lc(); }
+ virtual RGWCoroutinesManagerRegistry* get_cr_registry() override { return rados->get_cr_registry(); }
+
+ virtual int log_usage(const DoutPrefixProvider *dpp, std::map<rgw_user_bucket, RGWUsageBatch>& usage_info) override;
+ virtual int log_op(const DoutPrefixProvider *dpp, std::string& oid, bufferlist& bl) override;
+ virtual int register_to_service_map(const DoutPrefixProvider *dpp, const std::string& daemon_type,
+ const std::map<std::string, std::string>& meta) override;
+ virtual void get_quota(RGWQuota& quota) override;
+ virtual void get_ratelimit(RGWRateLimitInfo& bucket_ratelimit, RGWRateLimitInfo& user_ratelimit, RGWRateLimitInfo& anon_ratelimit) override;
+ virtual int set_buckets_enabled(const DoutPrefixProvider* dpp, std::vector<rgw_bucket>& buckets, bool enabled) override;
+ virtual int get_sync_policy_handler(const DoutPrefixProvider* dpp,
+ std::optional<rgw_zone_id> zone,
+ std::optional<rgw_bucket> bucket,
+ RGWBucketSyncPolicyHandlerRef* phandler,
+ optional_yield y) override;
+ virtual RGWDataSyncStatusManager* get_data_sync_manager(const rgw_zone_id& source_zone) override;
+ virtual void wakeup_meta_sync_shards(std::set<int>& shard_ids) override { rados->wakeup_meta_sync_shards(shard_ids); }
+ virtual void wakeup_data_sync_shards(const DoutPrefixProvider *dpp, const rgw_zone_id& source_zone, boost::container::flat_map<int, boost::container::flat_set<rgw_data_notify_entry>>& shard_ids) override { rados->wakeup_data_sync_shards(dpp, source_zone, shard_ids); }
+ virtual int clear_usage(const DoutPrefixProvider *dpp) override { return rados->clear_usage(dpp); }
+ virtual int read_all_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch,
+ uint32_t max_entries, bool* is_truncated,
+ RGWUsageIter& usage_iter,
+ std::map<rgw_user_bucket, rgw_usage_log_entry>& usage) override;
+ virtual int trim_all_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch) override;
+ virtual int get_config_key_val(std::string name, bufferlist* bl) override;
+ virtual int meta_list_keys_init(const DoutPrefixProvider *dpp, const std::string& section, const std::string& marker, void** phandle) override;
+ virtual int meta_list_keys_next(const DoutPrefixProvider *dpp, void* handle, int max, std::list<std::string>& keys, bool* truncated) override;
+ virtual void meta_list_keys_complete(void* handle) override;
+ virtual std::string meta_get_marker(void* handle) override;
+ virtual int meta_remove(const DoutPrefixProvider* dpp, std::string& metadata_key, optional_yield y) override;
+ virtual const RGWSyncModuleInstanceRef& get_sync_module() { return rados->get_sync_module(); }
+ virtual std::string get_host_id() { return rados->host_id; }
+ virtual std::unique_ptr<LuaManager> get_lua_manager() override;
+ virtual std::unique_ptr<RGWRole> get_role(std::string name,
+ std::string tenant,
+ std::string path="",
+ std::string trust_policy="",
+ std::string max_session_duration_str="",
+ std::multimap<std::string,std::string> tags={}) override;
+ virtual std::unique_ptr<RGWRole> get_role(std::string id) override;
+ virtual std::unique_ptr<RGWRole> get_role(const RGWRoleInfo& info) override;
+ virtual int get_roles(const DoutPrefixProvider *dpp,
+ optional_yield y,
+ const std::string& path_prefix,
+ const std::string& tenant,
+ std::vector<std::unique_ptr<RGWRole>>& roles) override;
+ virtual std::unique_ptr<RGWOIDCProvider> get_oidc_provider() override;
+ virtual int get_oidc_providers(const DoutPrefixProvider *dpp,
+ const std::string& tenant,
+ std::vector<std::unique_ptr<RGWOIDCProvider>>& providers) override;
+ virtual std::unique_ptr<Writer> get_append_writer(const DoutPrefixProvider *dpp,
+ optional_yield y,
+ rgw::sal::Object* obj,
+ const rgw_user& owner,
+ const rgw_placement_rule *ptail_placement_rule,
+ const std::string& unique_tag,
+ uint64_t position,
+ uint64_t *cur_accounted_size) override;
+ virtual std::unique_ptr<Writer> get_atomic_writer(const DoutPrefixProvider *dpp,
+ optional_yield y,
+ rgw::sal::Object* obj,
+ const rgw_user& owner,
+ const rgw_placement_rule *ptail_placement_rule,
+ uint64_t olh_epoch,
+ const std::string& unique_tag) override;
+ virtual const std::string& get_compression_type(const rgw_placement_rule& rule) override;
+ virtual bool valid_placement(const rgw_placement_rule& rule) override;
+
+ virtual void finalize(void) override;
+
+ virtual CephContext* ctx(void) override { return rados->ctx(); }
+
+ virtual void register_admin_apis(RGWRESTMgr* mgr) override;
+
+ /* Unique to RadosStore */
+ int get_obj_head_ioctx(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
+ librados::IoCtx* ioctx);
+ int delete_raw_obj(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj);
+ int delete_raw_obj_aio(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, Completions* aio);
+ void get_raw_obj(const rgw_placement_rule& placement_rule, const rgw_obj& obj, rgw_raw_obj* raw_obj);
+ int get_raw_chunk_size(const DoutPrefixProvider* dpp, const rgw_raw_obj& obj, uint64_t* chunk_size);
+
+ void setRados(RGWRados * st) { rados = st; }
+ RGWRados* getRados(void) { return rados; }
+
+ RGWServices* svc() { return &rados->svc; }
+ const RGWServices* svc() const { return &rados->svc; }
+ RGWCtl* ctl() { return &rados->ctl; }
+ const RGWCtl* ctl() const { return &rados->ctl; }
+
+ void setUserCtl(RGWUserCtl *_ctl) { user_ctl = _ctl; }
+};
+
+class RadosUser : public StoreUser {
+ private:
+ RadosStore* store;
+
+ public:
+ RadosUser(RadosStore *_st, const rgw_user& _u) : StoreUser(_u), store(_st) { }
+ RadosUser(RadosStore *_st, const RGWUserInfo& _i) : StoreUser(_i), store(_st) { }
+ RadosUser(RadosStore *_st) : store(_st) { }
+ RadosUser(RadosUser& _o) = default;
+
+ virtual std::unique_ptr<User> clone() override {
+ return std::unique_ptr<User>(new RadosUser(*this));
+ }
+ int list_buckets(const DoutPrefixProvider* dpp, const std::string& marker, const std::string& end_marker,
+ uint64_t max, bool need_stats, BucketList& buckets,
+ optional_yield y) override;
+ virtual int create_bucket(const DoutPrefixProvider* dpp,
+ const rgw_bucket& b,
+ const std::string& zonegroup_id,
+ rgw_placement_rule& placement_rule,
+ std::string& swift_ver_location,
+ const RGWQuotaInfo * pquota_info,
+ const RGWAccessControlPolicy& policy,
+ Attrs& attrs,
+ RGWBucketInfo& info,
+ obj_version& ep_objv,
+ bool exclusive,
+ bool obj_lock_enabled,
+ bool* existed,
+ req_info& req_info,
+ std::unique_ptr<Bucket>* bucket,
+ optional_yield y) override;
+ virtual int read_attrs(const DoutPrefixProvider* dpp, optional_yield y) override;
+ virtual int merge_and_store_attrs(const DoutPrefixProvider* dpp, Attrs& new_attrs, optional_yield y) override;
+ virtual int read_stats(const DoutPrefixProvider *dpp,
+ optional_yield y, RGWStorageStats* stats,
+ ceph::real_time* last_stats_sync = nullptr,
+ ceph::real_time* last_stats_update = nullptr) override;
+ virtual int read_stats_async(const DoutPrefixProvider *dpp, RGWGetUserStats_CB* cb) override;
+ virtual int complete_flush_stats(const DoutPrefixProvider *dpp, optional_yield y) override;
+ virtual int read_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch, uint32_t max_entries,
+ bool* is_truncated, RGWUsageIter& usage_iter,
+ std::map<rgw_user_bucket, rgw_usage_log_entry>& usage) override;
+ virtual int trim_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch) override;
+
+ virtual int load_user(const DoutPrefixProvider* dpp, optional_yield y) override;
+ virtual int store_user(const DoutPrefixProvider* dpp, optional_yield y, bool exclusive, RGWUserInfo* old_info = nullptr) override;
+ virtual int remove_user(const DoutPrefixProvider* dpp, optional_yield y) override;
+ virtual int verify_mfa(const std::string& mfa_str, bool* verified, const DoutPrefixProvider* dpp, optional_yield y) override;
+
+ friend class RadosBucket;
+};
+
+class RadosObject : public StoreObject {
+ private:
+ RadosStore* store;
+ RGWAccessControlPolicy acls;
+ RGWObjManifest *manifest{nullptr};
+ RGWObjectCtx* rados_ctx;
+ bool rados_ctx_owned;
+
+ public:
+
+ struct RadosReadOp : public ReadOp {
+ private:
+ RadosObject* source;
+ RGWObjectCtx* rctx;
+ RGWRados::Object op_target;
+ RGWRados::Object::Read parent_op;
+
+ public:
+ RadosReadOp(RadosObject *_source, RGWObjectCtx *_rctx);
+
+ virtual int prepare(optional_yield y, const DoutPrefixProvider* dpp) override;
+
+ /*
+ * Both `read` and `iterate` read up through index `end`
+ * *inclusive*. The number of bytes that could be returned is
+ * `end - ofs + 1`.
+ */
+ virtual int read(int64_t ofs, int64_t end,
+ bufferlist& bl, optional_yield y,
+ const DoutPrefixProvider* dpp) override;
+ virtual int iterate(const DoutPrefixProvider* dpp,
+ int64_t ofs, int64_t end,
+ RGWGetDataCB* cb, optional_yield y) override;
+
+ virtual int get_attr(const DoutPrefixProvider* dpp, const char* name, bufferlist& dest, optional_yield y) override;
+ };
+
+ struct RadosDeleteOp : public DeleteOp {
+ private:
+ RadosObject* source;
+ RGWRados::Object op_target;
+ RGWRados::Object::Delete parent_op;
+
+ public:
+ RadosDeleteOp(RadosObject* _source);
+
+ virtual int delete_obj(const DoutPrefixProvider* dpp, optional_yield y) override;
+ };
+
+ RadosObject(RadosStore *_st, const rgw_obj_key& _k)
+ : StoreObject(_k),
+ store(_st),
+ acls(),
+ rados_ctx(new RGWObjectCtx(dynamic_cast<Driver*>(store))),
+ rados_ctx_owned(true) {
+ }
+ RadosObject(RadosStore *_st, const rgw_obj_key& _k, Bucket* _b)
+ : StoreObject(_k, _b),
+ store(_st),
+ acls(),
+ rados_ctx(new RGWObjectCtx(dynamic_cast<Driver*>(store))) ,
+ rados_ctx_owned(true) {
+ }
+ RadosObject(RadosObject& _o) : StoreObject(_o) {
+ store = _o.store;
+ acls = _o.acls;
+ manifest = _o.manifest;
+ rados_ctx = _o.rados_ctx;
+ rados_ctx_owned = false;
+ }
+
+ virtual ~RadosObject();
+
+ virtual void invalidate() override {
+ StoreObject::invalidate();
+ rados_ctx->invalidate(get_obj());
+ }
+ virtual int delete_object(const DoutPrefixProvider* dpp,
+ optional_yield y, bool prevent_versioning) override;
+ virtual int delete_obj_aio(const DoutPrefixProvider* dpp, RGWObjState* astate, Completions* aio,
+ bool keep_index_consistent, optional_yield y) override;
+ virtual int copy_object(User* user,
+ req_info* info, const rgw_zone_id& source_zone,
+ rgw::sal::Object* dest_object, rgw::sal::Bucket* dest_bucket,
+ rgw::sal::Bucket* src_bucket,
+ const rgw_placement_rule& dest_placement,
+ ceph::real_time* src_mtime, ceph::real_time* mtime,
+ const ceph::real_time* mod_ptr, const ceph::real_time* unmod_ptr,
+ bool high_precision_time,
+ const char* if_match, const char* if_nomatch,
+ AttrsMod attrs_mod, bool copy_if_newer, Attrs& attrs,
+ RGWObjCategory category, uint64_t olh_epoch,
+ boost::optional<ceph::real_time> delete_at,
+ std::string* version_id, std::string* tag, std::string* etag,
+ void (*progress_cb)(off_t, void *), void* progress_data,
+ const DoutPrefixProvider* dpp, optional_yield y) override;
+ virtual RGWAccessControlPolicy& get_acl(void) override { return acls; }
+ virtual int set_acl(const RGWAccessControlPolicy& acl) override { acls = acl; return 0; }
+ virtual void set_atomic() override {
+ rados_ctx->set_atomic(state.obj);
+ StoreObject::set_atomic();
+ }
+ virtual void set_prefetch_data() override {
+ rados_ctx->set_prefetch_data(state.obj);
+ StoreObject::set_prefetch_data();
+ }
+ virtual void set_compressed() override {
+ rados_ctx->set_compressed(state.obj);
+ StoreObject::set_compressed();
+ }
+
+ virtual int get_obj_state(const DoutPrefixProvider* dpp, RGWObjState **state, optional_yield y, bool follow_olh = true) override;
+ virtual int set_obj_attrs(const DoutPrefixProvider* dpp, Attrs* setattrs, Attrs* delattrs, optional_yield y) override;
+ virtual int get_obj_attrs(optional_yield y, const DoutPrefixProvider* dpp, rgw_obj* target_obj = NULL) override;
+ virtual int modify_obj_attrs(const char* attr_name, bufferlist& attr_val, optional_yield y, const DoutPrefixProvider* dpp) override;
+ virtual int delete_obj_attrs(const DoutPrefixProvider* dpp, const char* attr_name, optional_yield y) override;
+ virtual bool is_expired() override;
+ virtual void gen_rand_obj_instance_name() override;
+ void get_raw_obj(rgw_raw_obj* raw_obj);
+ virtual std::unique_ptr<Object> clone() override {
+ return std::unique_ptr<Object>(new RadosObject(*this));
+ }
+ virtual std::unique_ptr<MPSerializer> get_serializer(const DoutPrefixProvider *dpp,
+ const std::string& lock_name) override;
+ virtual int transition(Bucket* bucket,
+ const rgw_placement_rule& placement_rule,
+ const real_time& mtime,
+ uint64_t olh_epoch,
+ const DoutPrefixProvider* dpp,
+ optional_yield y) override;
+ virtual int transition_to_cloud(Bucket* bucket,
+ rgw::sal::PlacementTier* tier,
+ rgw_bucket_dir_entry& o,
+ std::set<std::string>& cloud_targets,
+ CephContext* cct,
+ bool update_object,
+ const DoutPrefixProvider* dpp,
+ optional_yield y) override;
+ virtual bool placement_rules_match(rgw_placement_rule& r1, rgw_placement_rule& r2) override;
+ virtual int dump_obj_layout(const DoutPrefixProvider *dpp, optional_yield y, Formatter* f) override;
+
+ /* Swift versioning */
+ virtual int swift_versioning_restore(bool& restored,
+ const DoutPrefixProvider* dpp) override;
+ virtual int swift_versioning_copy(const DoutPrefixProvider* dpp,
+ optional_yield y) override;
+
+ /* OPs */
+ virtual std::unique_ptr<ReadOp> get_read_op() override;
+ virtual std::unique_ptr<DeleteOp> get_delete_op() override;
+
+ /* OMAP */
+ virtual int omap_get_vals(const DoutPrefixProvider *dpp, const std::string& marker, uint64_t count,
+ std::map<std::string, bufferlist> *m,
+ bool* pmore, optional_yield y) override;
+ virtual int omap_get_all(const DoutPrefixProvider *dpp, std::map<std::string, bufferlist> *m,
+ optional_yield y) override;
+ virtual int omap_get_vals_by_keys(const DoutPrefixProvider *dpp, const std::string& oid,
+ const std::set<std::string>& keys,
+ Attrs* vals) override;
+ virtual int omap_set_val_by_key(const DoutPrefixProvider *dpp, const std::string& key, bufferlist& val,
+ bool must_exist, optional_yield y) override;
+ virtual int chown(User& new_user, const DoutPrefixProvider* dpp, optional_yield y) override;
+
+ /* Internal to RadosStore */
+ int get_max_chunk_size(const DoutPrefixProvider* dpp,
+ rgw_placement_rule placement_rule,
+ uint64_t* max_chunk_size,
+ uint64_t* alignment = nullptr);
+ void get_max_aligned_size(uint64_t size, uint64_t alignment, uint64_t* max_size);
+ void raw_obj_to_obj(const rgw_raw_obj& raw_obj);
+ int write_cloud_tier(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ uint64_t olh_epoch,
+ rgw::sal::PlacementTier* tier,
+ bool is_multipart_upload,
+ rgw_placement_rule& target_placement,
+ Object* head_obj);
+ RGWObjManifest* get_manifest() { return manifest; }
+ RGWObjectCtx& get_ctx() { return *rados_ctx; }
+
+ private:
+ int read_attrs(const DoutPrefixProvider* dpp, RGWRados::Object::Read &read_op, optional_yield y, rgw_obj* target_obj = nullptr);
+};
+
+class RadosBucket : public StoreBucket {
+ private:
+ RadosStore* store;
+ RGWAccessControlPolicy acls;
+ std::string topics_oid() const;
+
+ public:
+ RadosBucket(RadosStore *_st)
+ : store(_st),
+ acls() {
+ }
+
+ RadosBucket(RadosStore *_st, User* _u)
+ : StoreBucket(_u),
+ store(_st),
+ acls() {
+ }
+
+ RadosBucket(RadosStore *_st, const rgw_bucket& _b)
+ : StoreBucket(_b),
+ store(_st),
+ acls() {
+ }
+
+ RadosBucket(RadosStore *_st, const RGWBucketEnt& _e)
+ : StoreBucket(_e),
+ store(_st),
+ acls() {
+ }
+
+ RadosBucket(RadosStore *_st, const RGWBucketInfo& _i)
+ : StoreBucket(_i),
+ store(_st),
+ acls() {
+ }
+
+ RadosBucket(RadosStore *_st, const rgw_bucket& _b, User* _u)
+ : StoreBucket(_b, _u),
+ store(_st),
+ acls() {
+ }
+
+ RadosBucket(RadosStore *_st, const RGWBucketEnt& _e, User* _u)
+ : StoreBucket(_e, _u),
+ store(_st),
+ acls() {
+ }
+
+ RadosBucket(RadosStore *_st, const RGWBucketInfo& _i, User* _u)
+ : StoreBucket(_i, _u),
+ store(_st),
+ acls() {
+ }
+
+ virtual ~RadosBucket();
+ virtual std::unique_ptr<Object> get_object(const rgw_obj_key& k) override;
+ virtual int list(const DoutPrefixProvider* dpp, ListParams&, int, ListResults&, optional_yield y) override;
+ virtual int remove_bucket(const DoutPrefixProvider* dpp, bool delete_children, bool forward_to_master, req_info* req_info, optional_yield y) override;
+ virtual int remove_bucket_bypass_gc(int concurrent_max, bool
+ keep_index_consistent,
+ optional_yield y, const
+ DoutPrefixProvider *dpp) override;
+ virtual RGWAccessControlPolicy& get_acl(void) override { return acls; }
+ virtual int set_acl(const DoutPrefixProvider* dpp, RGWAccessControlPolicy& acl, optional_yield y) override;
+ virtual int load_bucket(const DoutPrefixProvider* dpp, optional_yield y, bool get_stats = false) override;
+ virtual int read_stats(const DoutPrefixProvider *dpp,
+ const bucket_index_layout_generation& idx_layout,
+ int shard_id, std::string* bucket_ver, std::string* master_ver,
+ std::map<RGWObjCategory, RGWStorageStats>& stats,
+ std::string* max_marker = nullptr,
+ bool* syncstopped = nullptr) override;
+ virtual int read_stats_async(const DoutPrefixProvider *dpp,
+ const bucket_index_layout_generation& idx_layout,
+ int shard_id, RGWGetBucketStats_CB* ctx) override;
+ virtual int sync_user_stats(const DoutPrefixProvider *dpp, optional_yield y) override;
+ virtual int update_container_stats(const DoutPrefixProvider* dpp) override;
+ virtual int check_bucket_shards(const DoutPrefixProvider* dpp) override;
+ virtual int chown(const DoutPrefixProvider* dpp, User& new_user, optional_yield y) override;
+ virtual int put_info(const DoutPrefixProvider* dpp, bool exclusive, ceph::real_time mtime) override;
+ virtual bool is_owner(User* user) override;
+ virtual int check_empty(const DoutPrefixProvider* dpp, optional_yield y) override;
+ virtual int check_quota(const DoutPrefixProvider *dpp, RGWQuota& quota, uint64_t obj_size, optional_yield y, bool check_size_only = false) override;
+ virtual int merge_and_store_attrs(const DoutPrefixProvider* dpp, Attrs& attrs, optional_yield y) override;
+ virtual int try_refresh_info(const DoutPrefixProvider* dpp, ceph::real_time* pmtime) override;
+ virtual int read_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch, uint32_t max_entries,
+ bool* is_truncated, RGWUsageIter& usage_iter,
+ std::map<rgw_user_bucket, rgw_usage_log_entry>& usage) override;
+ virtual int trim_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch) override;
+ virtual int remove_objs_from_index(const DoutPrefixProvider *dpp, std::list<rgw_obj_index_key>& objs_to_unlink) override;
+ virtual int check_index(const DoutPrefixProvider *dpp, std::map<RGWObjCategory, RGWStorageStats>& existing_stats, std::map<RGWObjCategory, RGWStorageStats>& calculated_stats) override;
+ virtual int rebuild_index(const DoutPrefixProvider *dpp) override;
+ virtual int set_tag_timeout(const DoutPrefixProvider *dpp, uint64_t timeout) override;
+ virtual int purge_instance(const DoutPrefixProvider* dpp) override;
+ virtual std::unique_ptr<Bucket> clone() override {
+ return std::make_unique<RadosBucket>(*this);
+ }
+ virtual std::unique_ptr<MultipartUpload> get_multipart_upload(
+ const std::string& oid,
+ std::optional<std::string> upload_id=std::nullopt,
+ ACLOwner owner={}, ceph::real_time mtime=real_clock::now()) override;
+ virtual int list_multiparts(const DoutPrefixProvider *dpp,
+ const std::string& prefix,
+ std::string& marker,
+ const std::string& delim,
+ const int& max_uploads,
+ std::vector<std::unique_ptr<MultipartUpload>>& uploads,
+ std::map<std::string, bool> *common_prefixes,
+ bool *is_truncated) override;
+ virtual int abort_multiparts(const DoutPrefixProvider* dpp,
+ CephContext* cct) override;
+ int read_topics(rgw_pubsub_bucket_topics& notifications, RGWObjVersionTracker* objv_tracker,
+ optional_yield y, const DoutPrefixProvider *dpp) override;
+ int write_topics(const rgw_pubsub_bucket_topics& notifications, RGWObjVersionTracker* objv_tracker,
+ optional_yield y, const DoutPrefixProvider *dpp) override;
+ int remove_topics(RGWObjVersionTracker* objv_tracker,
+ optional_yield y, const DoutPrefixProvider *dpp) override;
+
+ private:
+ int link(const DoutPrefixProvider* dpp, User* new_user, optional_yield y, bool update_entrypoint = true, RGWObjVersionTracker* objv = nullptr);
+ int unlink(const DoutPrefixProvider* dpp, User* new_user, optional_yield y, bool update_entrypoint = true);
+ friend class RadosUser;
+};
+
+class RadosMultipartPart : public StoreMultipartPart {
+protected:
+ RGWUploadPartInfo info;
+
+public:
+ RadosMultipartPart() = default;
+ virtual ~RadosMultipartPart() = default;
+
+ virtual uint32_t get_num() { return info.num; }
+ virtual uint64_t get_size() { return info.accounted_size; }
+ virtual const std::string& get_etag() { return info.etag; }
+ virtual ceph::real_time& get_mtime() { return info.modified; }
+
+ /* For RadosStore code */
+ RGWObjManifest& get_manifest() { return info.manifest; }
+ const std::set<std::string>& get_past_prefixes() const { return info.past_prefixes; }
+
+ friend class RadosMultipartUpload;
+};
+
+class RadosMultipartUpload : public StoreMultipartUpload {
+ RadosStore* store;
+ RGWMPObj mp_obj;
+ ACLOwner owner;
+ ceph::real_time mtime;
+ rgw_placement_rule placement;
+ RGWObjManifest manifest;
+
+public:
+ RadosMultipartUpload(RadosStore* _store, Bucket* _bucket, const std::string& oid,
+ std::optional<std::string> upload_id, ACLOwner owner,
+ ceph::real_time _mtime)
+ : StoreMultipartUpload(_bucket), store(_store), mp_obj(oid, upload_id),
+ owner(owner), mtime(_mtime) {}
+ virtual ~RadosMultipartUpload() = default;
+
+ virtual const std::string& get_meta() const override { return mp_obj.get_meta(); }
+ virtual const std::string& get_key() const override { return mp_obj.get_key(); }
+ virtual const std::string& get_upload_id() const override { return mp_obj.get_upload_id(); }
+ virtual const ACLOwner& get_owner() const override { return owner; }
+ virtual ceph::real_time& get_mtime() override { return mtime; }
+ virtual std::unique_ptr<rgw::sal::Object> get_meta_obj() override;
+ virtual int init(const DoutPrefixProvider* dpp, optional_yield y, ACLOwner& owner, rgw_placement_rule& dest_placement, rgw::sal::Attrs& attrs) override;
+ virtual int list_parts(const DoutPrefixProvider* dpp, CephContext* cct,
+ int num_parts, int marker,
+ int* next_marker, bool* truncated,
+ bool assume_unsorted = false) override;
+ virtual int abort(const DoutPrefixProvider* dpp, CephContext* cct) override;
+ virtual int complete(const DoutPrefixProvider* dpp,
+ optional_yield y, CephContext* cct,
+ std::map<int, std::string>& part_etags,
+ std::list<rgw_obj_index_key>& remove_objs,
+ uint64_t& accounted_size, bool& compressed,
+ RGWCompressionInfo& cs_info, off_t& ofs,
+ std::string& tag, ACLOwner& owner,
+ uint64_t olh_epoch,
+ rgw::sal::Object* target_obj) override;
+ virtual int get_info(const DoutPrefixProvider *dpp, optional_yield y, rgw_placement_rule** rule, rgw::sal::Attrs* attrs = nullptr) override;
+ virtual std::unique_ptr<Writer> get_writer(const DoutPrefixProvider *dpp,
+ optional_yield y,
+ rgw::sal::Object* obj,
+ const rgw_user& owner,
+ const rgw_placement_rule *ptail_placement_rule,
+ uint64_t part_num,
+ const std::string& part_num_str) override;
+protected:
+ int cleanup_part_history(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ RadosMultipartPart* part,
+ std::list<rgw_obj_index_key>& remove_objs);
+};
+
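+/* Both serializers take a cls_lock advisory lock on a RADOS object:
+ * try_lock() acquires the named lock and unlock() releases it. */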
+class MPRadosSerializer : public StoreMPSerializer {
+ librados::IoCtx ioctx;
+ rados::cls::lock::Lock lock;
+ librados::ObjectWriteOperation op;
+
+public:
+ MPRadosSerializer(const DoutPrefixProvider *dpp, RadosStore* store, RadosObject* obj, const std::string& lock_name);
+
+ virtual int try_lock(const DoutPrefixProvider *dpp, utime_t dur, optional_yield y) override;
+ virtual int unlock() override {
+ return lock.unlock(&ioctx, oid);
+ }
+};
+
+class LCRadosSerializer : public StoreLCSerializer {
+ librados::IoCtx* ioctx;
+ rados::cls::lock::Lock lock;
+
+public:
+ LCRadosSerializer(RadosStore* store, const std::string& oid, const std::string& lock_name, const std::string& cookie);
+
+ virtual int try_lock(const DoutPrefixProvider *dpp, utime_t dur, optional_yield y) override;
+ virtual int unlock() override {
+ return lock.unlock(ioctx, oid);
+ }
+};
+
+class RadosLifecycle : public StoreLifecycle {
+ RadosStore* store;
+
+public:
+ RadosLifecycle(RadosStore* _st) : store(_st) {}
+
+ using StoreLifecycle::get_entry;
+ virtual int get_entry(const std::string& oid, const std::string& marker, std::unique_ptr<LCEntry>* entry) override;
+ virtual int get_next_entry(const std::string& oid, const std::string& marker, std::unique_ptr<LCEntry>* entry) override;
+ virtual int set_entry(const std::string& oid, LCEntry& entry) override;
+ virtual int list_entries(const std::string& oid, const std::string& marker,
+ uint32_t max_entries,
+ std::vector<std::unique_ptr<LCEntry>>& entries) override;
+ virtual int rm_entry(const std::string& oid, LCEntry& entry) override;
+ virtual int get_head(const std::string& oid, std::unique_ptr<LCHead>* head) override;
+ virtual int put_head(const std::string& oid, LCHead& head) override;
+ virtual std::unique_ptr<LCSerializer> get_serializer(const std::string& lock_name,
+ const std::string& oid,
+ const std::string& cookie) override;
+};
+
+class RadosNotification : public StoreNotification {
+ RadosStore* store;
+ /* XXX it feels incorrect to me that rgw::notify::reservation_t is
+ * currently RADOS-specific; instead, I think notification types such as
+ * reservation_t should be generally visible, whereas the internal
+ * notification behavior should be made portable (e.g., notification
+ * to non-RADOS message sinks) */
+ rgw::notify::reservation_t res;
+
+ public:
+ RadosNotification(const DoutPrefixProvider* _dpp, RadosStore* _store, Object* _obj, Object* _src_obj, req_state* _s, rgw::notify::EventType _type, optional_yield y, const std::string* object_name) :
+ StoreNotification(_obj, _src_obj, _type), store(_store), res(_dpp, _store, _s, _obj, _src_obj, object_name, y) { }
+
+ RadosNotification(const DoutPrefixProvider* _dpp, RadosStore* _store, Object* _obj, Object* _src_obj, rgw::notify::EventType _type, rgw::sal::Bucket* _bucket, std::string& _user_id, std::string& _user_tenant, std::string& _req_id, optional_yield y) :
+ StoreNotification(_obj, _src_obj, _type), store(_store), res(_dpp, _store, _obj, _src_obj, _bucket, _user_id, _user_tenant, _req_id, y) {}
+
+ ~RadosNotification() = default;
+
+ rgw::notify::reservation_t& get_reservation(void) {
+ return res;
+ }
+
+ virtual int publish_reserve(const DoutPrefixProvider *dpp, RGWObjTags* obj_tags = nullptr) override;
+ virtual int publish_commit(const DoutPrefixProvider* dpp, uint64_t size,
+ const ceph::real_time& mtime, const std::string& etag, const std::string& version) override;
+};
+
+class RadosAtomicWriter : public StoreWriter {
+protected:
+ rgw::sal::RadosStore* store;
+ std::unique_ptr<Aio> aio;
+ RGWObjectCtx& obj_ctx;
+ rgw::putobj::AtomicObjectProcessor processor;
+
+public:
+ RadosAtomicWriter(const DoutPrefixProvider *dpp,
+ optional_yield y,
+ RGWBucketInfo& bucket_info,
+ RGWObjectCtx& obj_ctx,
+ const rgw_obj& obj,
+ RadosStore* _store, std::unique_ptr<Aio> _aio,
+ const rgw_user& owner,
+ const rgw_placement_rule *ptail_placement_rule,
+ uint64_t olh_epoch,
+ const std::string& unique_tag) :
+ StoreWriter(dpp, y),
+ store(_store),
+ aio(std::move(_aio)),
+ obj_ctx(obj_ctx),
+ processor(&*aio, store->getRados(), bucket_info,
+ ptail_placement_rule, owner, obj_ctx,
+ obj, olh_epoch, unique_tag,
+ dpp, y)
+ {}
+ ~RadosAtomicWriter() = default;
+
+ // prepare to start processing object data
+ virtual int prepare(optional_yield y) override;
+
+ // Process a bufferlist
+ virtual int process(bufferlist&& data, uint64_t offset) override;
+
+ // complete the operation and make its result visible to clients
+ virtual int complete(size_t accounted_size, const std::string& etag,
+ ceph::real_time *mtime, ceph::real_time set_mtime,
+ std::map<std::string, bufferlist>& attrs,
+ ceph::real_time delete_at,
+ const char *if_match, const char *if_nomatch,
+ const std::string *user_data,
+ rgw_zone_set *zones_trace, bool *canceled,
+ optional_yield y) override;
+};
+
+class RadosAppendWriter : public StoreWriter {
+protected:
+ rgw::sal::RadosStore* store;
+ std::unique_ptr<Aio> aio;
+ RGWObjectCtx& obj_ctx;
+ rgw::putobj::AppendObjectProcessor processor;
+
+public:
+ RadosAppendWriter(const DoutPrefixProvider *dpp,
+ optional_yield y,
+ RGWBucketInfo& bucket_info,
+ RGWObjectCtx& obj_ctx,
+ const rgw_obj& obj,
+ RadosStore* _store, std::unique_ptr<Aio> _aio,
+ const rgw_user& owner,
+ const rgw_placement_rule *ptail_placement_rule,
+ const std::string& unique_tag,
+ uint64_t position,
+ uint64_t *cur_accounted_size) :
+ StoreWriter(dpp, y),
+ store(_store),
+ aio(std::move(_aio)),
+ obj_ctx(obj_ctx),
+ processor(&*aio, store->getRados(), bucket_info,
+ ptail_placement_rule, owner, obj_ctx,
+ obj, unique_tag, position,
+ cur_accounted_size, dpp, y)
+ {}
+ ~RadosAppendWriter() = default;
+
+ // prepare to start processing object data
+ virtual int prepare(optional_yield y) override;
+
+ // Process a bufferlist
+ virtual int process(bufferlist&& data, uint64_t offset) override;
+
+ // complete the operation and make its result visible to clients
+ virtual int complete(size_t accounted_size, const std::string& etag,
+ ceph::real_time *mtime, ceph::real_time set_mtime,
+ std::map<std::string, bufferlist>& attrs,
+ ceph::real_time delete_at,
+ const char *if_match, const char *if_nomatch,
+ const std::string *user_data,
+ rgw_zone_set *zones_trace, bool *canceled,
+ optional_yield y) override;
+};
+
+class RadosMultipartWriter : public StoreWriter {
+protected:
+ rgw::sal::RadosStore* store;
+ std::unique_ptr<Aio> aio;
+ RGWObjectCtx& obj_ctx;
+ rgw::putobj::MultipartObjectProcessor processor;
+
+public:
+ RadosMultipartWriter(const DoutPrefixProvider *dpp,
+ optional_yield y, const std::string& upload_id,
+ RGWBucketInfo& bucket_info,
+ RGWObjectCtx& obj_ctx,
+ const rgw_obj& obj,
+ RadosStore* _store, std::unique_ptr<Aio> _aio,
+ const rgw_user& owner,
+ const rgw_placement_rule *ptail_placement_rule,
+ uint64_t part_num, const std::string& part_num_str) :
+ StoreWriter(dpp, y),
+ store(_store),
+ aio(std::move(_aio)),
+ obj_ctx(obj_ctx),
+ processor(&*aio, store->getRados(), bucket_info,
+ ptail_placement_rule, owner, obj_ctx,
+ obj, upload_id,
+ part_num, part_num_str, dpp, y)
+ {}
+ ~RadosMultipartWriter() = default;
+
+ // prepare to start processing object data
+ virtual int prepare(optional_yield y) override;
+
+ // Process a bufferlist
+ virtual int process(bufferlist&& data, uint64_t offset) override;
+
+ // complete the operation and make its result visible to clients
+ virtual int complete(size_t accounted_size, const std::string& etag,
+ ceph::real_time *mtime, ceph::real_time set_mtime,
+ std::map<std::string, bufferlist>& attrs,
+ ceph::real_time delete_at,
+ const char *if_match, const char *if_nomatch,
+ const std::string *user_data,
+ rgw_zone_set *zones_trace, bool *canceled,
+ optional_yield y) override;
+};
+
+class RadosLuaManager : public StoreLuaManager {
+ RadosStore* const store;
+ rgw_pool pool;
+
+public:
+ RadosLuaManager(RadosStore* _s);
+ virtual ~RadosLuaManager() = default;
+
+ virtual int get_script(const DoutPrefixProvider* dpp, optional_yield y, const std::string& key, std::string& script);
+ virtual int put_script(const DoutPrefixProvider* dpp, optional_yield y, const std::string& key, const std::string& script);
+ virtual int del_script(const DoutPrefixProvider* dpp, optional_yield y, const std::string& key);
+ virtual int add_package(const DoutPrefixProvider* dpp, optional_yield y, const std::string& package_name);
+ virtual int remove_package(const DoutPrefixProvider* dpp, optional_yield y, const std::string& package_name);
+ virtual int list_packages(const DoutPrefixProvider* dpp, optional_yield y, rgw::lua::packages_t& packages);
+};
+
+class RadosOIDCProvider : public RGWOIDCProvider {
+ RadosStore* store;
+public:
+ RadosOIDCProvider(RadosStore* _store) : store(_store) {}
+ ~RadosOIDCProvider() = default;
+
+ virtual int store_url(const DoutPrefixProvider *dpp, const std::string& url, bool exclusive, optional_yield y) override;
+ virtual int read_url(const DoutPrefixProvider *dpp, const std::string& url, const std::string& tenant) override;
+ virtual int delete_obj(const DoutPrefixProvider *dpp, optional_yield y) override;
+ void encode(bufferlist& bl) const {
+ RGWOIDCProvider::encode(bl);
+ }
+ void decode(bufferlist::const_iterator& bl) {
+ RGWOIDCProvider::decode(bl);
+ }
+};
+
+class RadosRole : public RGWRole {
+ RadosStore* store;
+public:
+ RadosRole(RadosStore* _store, std::string name,
+ std::string tenant,
+ std::string path,
+ std::string trust_policy,
+ std::string max_session_duration,
+ std::multimap<std::string,std::string> tags) : RGWRole(name, tenant, path, trust_policy, max_session_duration, tags), store(_store) {}
+ RadosRole(RadosStore* _store, std::string id) : RGWRole(id), store(_store) {}
+ RadosRole(RadosStore* _store, const RGWRoleInfo& info) : RGWRole(info), store(_store) {}
+ RadosRole(RadosStore* _store) : store(_store) {}
+ ~RadosRole() = default;
+
+ virtual int store_info(const DoutPrefixProvider *dpp, bool exclusive, optional_yield y) override;
+ virtual int store_name(const DoutPrefixProvider *dpp, bool exclusive, optional_yield y) override;
+ virtual int store_path(const DoutPrefixProvider *dpp, bool exclusive, optional_yield y) override;
+ virtual int read_id(const DoutPrefixProvider *dpp, const std::string& role_name, const std::string& tenant, std::string& role_id, optional_yield y) override;
+ virtual int read_name(const DoutPrefixProvider *dpp, optional_yield y) override;
+ virtual int read_info(const DoutPrefixProvider *dpp, optional_yield y) override;
+ virtual int create(const DoutPrefixProvider *dpp, bool exclusive, const std::string& role_id, optional_yield y) override;
+ virtual int delete_obj(const DoutPrefixProvider *dpp, optional_yield y) override;
+};
+}} // namespace rgw::sal
+
+WRITE_CLASS_ENCODER(rgw::sal::RadosOIDCProvider)
diff --git a/src/rgw/driver/rados/rgw_service.cc b/src/rgw/driver/rados/rgw_service.cc
new file mode 100644
index 000000000..4fcb1ebde
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_service.cc
@@ -0,0 +1,476 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "rgw_service.h"
+
+#include "services/svc_finisher.h"
+#include "services/svc_bi_rados.h"
+#include "services/svc_bilog_rados.h"
+#include "services/svc_bucket_sobj.h"
+#include "services/svc_bucket_sync_sobj.h"
+#include "services/svc_cls.h"
+#include "services/svc_config_key_rados.h"
+#include "services/svc_mdlog.h"
+#include "services/svc_meta.h"
+#include "services/svc_meta_be.h"
+#include "services/svc_meta_be_sobj.h"
+#include "services/svc_meta_be_otp.h"
+#include "services/svc_notify.h"
+#include "services/svc_otp.h"
+#include "services/svc_rados.h"
+#include "services/svc_zone.h"
+#include "services/svc_zone_utils.h"
+#include "services/svc_quota.h"
+#include "services/svc_sync_modules.h"
+#include "services/svc_sys_obj.h"
+#include "services/svc_sys_obj_cache.h"
+#include "services/svc_sys_obj_core.h"
+#include "services/svc_user_rados.h"
+#include "services/svc_role_rados.h"
+
+#include "common/errno.h"
+
+#include "rgw_bucket.h"
+#include "rgw_datalog.h"
+#include "rgw_metadata.h"
+#include "rgw_otp.h"
+#include "rgw_user.h"
+#include "rgw_role.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+RGWServices_Def::RGWServices_Def() = default;
+RGWServices_Def::~RGWServices_Def()
+{
+ shutdown();
+}
+
+int RGWServices_Def::init(CephContext *cct,
+ bool have_cache,
+ bool raw,
+ bool run_sync,
+ optional_yield y,
+ const DoutPrefixProvider *dpp)
+{
+ finisher = std::make_unique<RGWSI_Finisher>(cct);
+ bucket_sobj = std::make_unique<RGWSI_Bucket_SObj>(cct);
+ bucket_sync_sobj = std::make_unique<RGWSI_Bucket_Sync_SObj>(cct);
+ bi_rados = std::make_unique<RGWSI_BucketIndex_RADOS>(cct);
+ bilog_rados = std::make_unique<RGWSI_BILog_RADOS>(cct);
+ cls = std::make_unique<RGWSI_Cls>(cct);
+ config_key_rados = std::make_unique<RGWSI_ConfigKey_RADOS>(cct);
+ datalog_rados = std::make_unique<RGWDataChangesLog>(cct);
+ mdlog = std::make_unique<RGWSI_MDLog>(cct, run_sync);
+ meta = std::make_unique<RGWSI_Meta>(cct);
+ meta_be_sobj = std::make_unique<RGWSI_MetaBackend_SObj>(cct);
+ meta_be_otp = std::make_unique<RGWSI_MetaBackend_OTP>(cct);
+ notify = std::make_unique<RGWSI_Notify>(cct);
+ otp = std::make_unique<RGWSI_OTP>(cct);
+ rados = std::make_unique<RGWSI_RADOS>(cct);
+ zone = std::make_unique<RGWSI_Zone>(cct);
+ zone_utils = std::make_unique<RGWSI_ZoneUtils>(cct);
+ quota = std::make_unique<RGWSI_Quota>(cct);
+ sync_modules = std::make_unique<RGWSI_SyncModules>(cct);
+ sysobj = std::make_unique<RGWSI_SysObj>(cct);
+ sysobj_core = std::make_unique<RGWSI_SysObj_Core>(cct);
+ user_rados = std::make_unique<RGWSI_User_RADOS>(cct);
+ role_rados = std::make_unique<RGWSI_Role_RADOS>(cct);
+
+ if (have_cache) {
+ sysobj_cache = std::make_unique<RGWSI_SysObj_Cache>(dpp, cct);
+ }
+
+ vector<RGWSI_MetaBackend *> meta_bes{meta_be_sobj.get(), meta_be_otp.get()};
+
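+  // wire up inter-service dependencies here; none of the services are actually
+  // started until the start() calls further below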
+ finisher->init();
+ bi_rados->init(zone.get(), rados.get(), bilog_rados.get(), datalog_rados.get());
+ bilog_rados->init(bi_rados.get());
+ bucket_sobj->init(zone.get(), sysobj.get(), sysobj_cache.get(),
+ bi_rados.get(), meta.get(), meta_be_sobj.get(),
+ sync_modules.get(), bucket_sync_sobj.get());
+ bucket_sync_sobj->init(zone.get(),
+ sysobj.get(),
+ sysobj_cache.get(),
+ bucket_sobj.get());
+ cls->init(zone.get(), rados.get());
+ config_key_rados->init(rados.get());
+ mdlog->init(rados.get(), zone.get(), sysobj.get(), cls.get());
+ meta->init(sysobj.get(), mdlog.get(), meta_bes);
+ meta_be_sobj->init(sysobj.get(), mdlog.get());
+ meta_be_otp->init(sysobj.get(), mdlog.get(), cls.get());
+ notify->init(zone.get(), rados.get(), finisher.get());
+ otp->init(zone.get(), meta.get(), meta_be_otp.get());
+ rados->init();
+ zone->init(sysobj.get(), rados.get(), sync_modules.get(), bucket_sync_sobj.get());
+ zone_utils->init(rados.get(), zone.get());
+ quota->init(zone.get());
+ sync_modules->init(zone.get());
+ sysobj_core->core_init(rados.get(), zone.get());
+ if (have_cache) {
+ sysobj_cache->init(rados.get(), zone.get(), notify.get());
+ sysobj->init(rados.get(), sysobj_cache.get());
+ } else {
+ sysobj->init(rados.get(), sysobj_core.get());
+ }
+ user_rados->init(rados.get(), zone.get(), sysobj.get(), sysobj_cache.get(),
+ meta.get(), meta_be_sobj.get(), sync_modules.get());
+ role_rados->init(zone.get(), meta.get(), meta_be_sobj.get(), sysobj.get());
+
+ can_shutdown = true;
+
+ int r = finisher->start(y, dpp);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to start finisher service (" << cpp_strerror(-r) << dendl;
+ return r;
+ }
+
+ if (!raw) {
+ r = notify->start(y, dpp);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to start notify service (" << cpp_strerror(-r) << dendl;
+ return r;
+ }
+ }
+
+ r = rados->start(y, dpp);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to start rados service (" << cpp_strerror(-r) << dendl;
+ return r;
+ }
+
+ if (!raw) {
+ r = zone->start(y, dpp);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to start zone service (" << cpp_strerror(-r) << dendl;
+ return r;
+ }
+
+ r = datalog_rados->start(dpp, &zone->get_zone(),
+ zone->get_zone_params(),
+ rados->get_rados_handle());
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to start datalog_rados service (" << cpp_strerror(-r) << dendl;
+ return r;
+ }
+
+ r = mdlog->start(y, dpp);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to start mdlog service (" << cpp_strerror(-r) << dendl;
+ return r;
+ }
+
+ r = sync_modules->start(y, dpp);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to start sync modules service (" << cpp_strerror(-r) << dendl;
+ return r;
+ }
+ }
+
+ r = cls->start(y, dpp);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to start cls service (" << cpp_strerror(-r) << dendl;
+ return r;
+ }
+
+ r = config_key_rados->start(y, dpp);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to start config_key service (" << cpp_strerror(-r) << dendl;
+ return r;
+ }
+
+ r = zone_utils->start(y, dpp);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to start zone_utils service (" << cpp_strerror(-r) << dendl;
+ return r;
+ }
+
+ r = quota->start(y, dpp);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to start quota service (" << cpp_strerror(-r) << dendl;
+ return r;
+ }
+
+ r = sysobj_core->start(y, dpp);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to start sysobj_core service (" << cpp_strerror(-r) << dendl;
+ return r;
+ }
+
+ if (have_cache) {
+ r = sysobj_cache->start(y, dpp);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to start sysobj_cache service (" << cpp_strerror(-r) << dendl;
+ return r;
+ }
+ }
+
+ r = sysobj->start(y, dpp);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to start sysobj service (" << cpp_strerror(-r) << dendl;
+ return r;
+ }
+
+ if (!raw) {
+ r = meta_be_sobj->start(y, dpp);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to start meta_be_sobj service (" << cpp_strerror(-r) << dendl;
+ return r;
+ }
+
+ r = meta->start(y, dpp);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to start meta service (" << cpp_strerror(-r) << dendl;
+ return r;
+ }
+
+ r = bucket_sobj->start(y, dpp);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to start bucket service (" << cpp_strerror(-r) << dendl;
+ return r;
+ }
+
+ r = bucket_sync_sobj->start(y, dpp);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to start bucket_sync service (" << cpp_strerror(-r) << dendl;
+ return r;
+ }
+
+ r = user_rados->start(y, dpp);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to start user_rados service (" << cpp_strerror(-r) << dendl;
+ return r;
+ }
+
+ r = otp->start(y, dpp);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to start otp service (" << cpp_strerror(-r) << dendl;
+ return r;
+ }
+
+ r = role_rados->start(y, dpp);
+ if (r < 0) {
+      ldpp_dout(dpp, 0) << "ERROR: failed to start role_rados service (" << cpp_strerror(-r) << dendl;
+ return r;
+ }
+
+ }
+
+ /* cache or core services will be started by sysobj */
+
+ return 0;
+}
+
+void RGWServices_Def::shutdown()
+{
+ if (!can_shutdown) {
+ return;
+ }
+
+ if (has_shutdown) {
+ return;
+ }
+
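+  // tear services down in roughly the reverse order of their initialization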
+ role_rados->shutdown();
+ datalog_rados.reset();
+ user_rados->shutdown();
+ sync_modules->shutdown();
+ otp->shutdown();
+ notify->shutdown();
+ meta_be_otp->shutdown();
+ meta_be_sobj->shutdown();
+ meta->shutdown();
+ mdlog->shutdown();
+ config_key_rados->shutdown();
+ cls->shutdown();
+ bilog_rados->shutdown();
+ bi_rados->shutdown();
+ bucket_sync_sobj->shutdown();
+ bucket_sobj->shutdown();
+ finisher->shutdown();
+
+ sysobj->shutdown();
+ sysobj_core->shutdown();
+ if (sysobj_cache) {
+ sysobj_cache->shutdown();
+ }
+ quota->shutdown();
+ zone_utils->shutdown();
+ zone->shutdown();
+ rados->shutdown();
+
+ has_shutdown = true;
+
+}
+
+
+int RGWServices::do_init(CephContext *_cct, bool have_cache, bool raw, bool run_sync, optional_yield y, const DoutPrefixProvider *dpp)
+{
+ cct = _cct;
+
+ int r = _svc.init(cct, have_cache, raw, run_sync, y, dpp);
+ if (r < 0) {
+ return r;
+ }
+
+ finisher = _svc.finisher.get();
+ bi_rados = _svc.bi_rados.get();
+ bi = bi_rados;
+ bilog_rados = _svc.bilog_rados.get();
+ bucket_sobj = _svc.bucket_sobj.get();
+ bucket = bucket_sobj;
+ bucket_sync_sobj = _svc.bucket_sync_sobj.get();
+ bucket_sync = bucket_sync_sobj;
+ cls = _svc.cls.get();
+ config_key_rados = _svc.config_key_rados.get();
+ config_key = config_key_rados;
+ datalog_rados = _svc.datalog_rados.get();
+ mdlog = _svc.mdlog.get();
+ meta = _svc.meta.get();
+ meta_be_sobj = _svc.meta_be_sobj.get();
+ meta_be_otp = _svc.meta_be_otp.get();
+ notify = _svc.notify.get();
+ otp = _svc.otp.get();
+ rados = _svc.rados.get();
+ zone = _svc.zone.get();
+ zone_utils = _svc.zone_utils.get();
+ quota = _svc.quota.get();
+ sync_modules = _svc.sync_modules.get();
+ sysobj = _svc.sysobj.get();
+ cache = _svc.sysobj_cache.get();
+ core = _svc.sysobj_core.get();
+ user = _svc.user_rados.get();
+ role = _svc.role_rados.get();
+
+ return 0;
+}
+
+RGWServiceInstance::~RGWServiceInstance() {}
+
+int RGWServiceInstance::start(optional_yield y, const DoutPrefixProvider *dpp)
+{
+ if (start_state != StateInit) {
+ return 0;
+ }
+
+  start_state = StateStarting; /* set the starting state prior to do_start() on purpose so that
+                                  circular references can call start() on each other */
+
+ int r = do_start(y, dpp);
+ if (r < 0) {
+ return r;
+ }
+
+ start_state = StateStarted;
+
+ return 0;
+}
+
+RGWCtlDef::RGWCtlDef() {}
+RGWCtlDef::~RGWCtlDef() {}
+RGWCtlDef::_meta::_meta() {}
+RGWCtlDef::_meta::~_meta() {}
+
+
+int RGWCtlDef::init(RGWServices& svc, rgw::sal::Driver* driver, const DoutPrefixProvider *dpp)
+{
+ meta.mgr.reset(new RGWMetadataManager(svc.meta));
+
+ meta.user.reset(RGWUserMetaHandlerAllocator::alloc(svc.user));
+
+ auto sync_module = svc.sync_modules->get_sync_module();
+ if (sync_module) {
+ meta.bucket.reset(sync_module->alloc_bucket_meta_handler());
+ meta.bucket_instance.reset(sync_module->alloc_bucket_instance_meta_handler(driver));
+ } else {
+ meta.bucket.reset(RGWBucketMetaHandlerAllocator::alloc());
+ meta.bucket_instance.reset(RGWBucketInstanceMetaHandlerAllocator::alloc(driver));
+ }
+
+ meta.otp.reset(RGWOTPMetaHandlerAllocator::alloc());
+ meta.role = std::make_unique<rgw::sal::RGWRoleMetadataHandler>(driver, svc.role);
+
+ user.reset(new RGWUserCtl(svc.zone, svc.user, (RGWUserMetadataHandler *)meta.user.get()));
+ bucket.reset(new RGWBucketCtl(svc.zone,
+ svc.bucket,
+ svc.bucket_sync,
+ svc.bi, svc.user));
+ otp.reset(new RGWOTPCtl(svc.zone, svc.otp));
+
+ RGWBucketMetadataHandlerBase *bucket_meta_handler = static_cast<RGWBucketMetadataHandlerBase *>(meta.bucket.get());
+ RGWBucketInstanceMetadataHandlerBase *bi_meta_handler = static_cast<RGWBucketInstanceMetadataHandlerBase *>(meta.bucket_instance.get());
+
+ bucket_meta_handler->init(svc.bucket, bucket.get());
+ bi_meta_handler->init(svc.zone, svc.bucket, svc.bi);
+
+ RGWOTPMetadataHandlerBase *otp_handler = static_cast<RGWOTPMetadataHandlerBase *>(meta.otp.get());
+ otp_handler->init(svc.zone, svc.meta_be_otp, svc.otp);
+
+ user->init(bucket.get());
+ bucket->init(user.get(),
+ (RGWBucketMetadataHandler *)bucket_meta_handler,
+ (RGWBucketInstanceMetadataHandler *)bi_meta_handler,
+ svc.datalog_rados,
+ dpp);
+
+ otp->init((RGWOTPMetadataHandler *)meta.otp.get());
+
+ return 0;
+}
+
+int RGWCtl::init(RGWServices *_svc, rgw::sal::Driver* driver, const DoutPrefixProvider *dpp)
+{
+ svc = _svc;
+ cct = svc->cct;
+
+ int r = _ctl.init(*svc, driver, dpp);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to start init ctls (" << cpp_strerror(-r) << dendl;
+ return r;
+ }
+
+ meta.mgr = _ctl.meta.mgr.get();
+ meta.user = _ctl.meta.user.get();
+ meta.bucket = _ctl.meta.bucket.get();
+ meta.bucket_instance = _ctl.meta.bucket_instance.get();
+ meta.otp = _ctl.meta.otp.get();
+ meta.role = _ctl.meta.role.get();
+
+ user = _ctl.user.get();
+ bucket = _ctl.bucket.get();
+ otp = _ctl.otp.get();
+
+ r = meta.user->attach(meta.mgr);
+ if (r < 0) {
+ ldout(cct, 0) << "ERROR: failed to start init meta.user ctl (" << cpp_strerror(-r) << dendl;
+ return r;
+ }
+
+ r = meta.bucket->attach(meta.mgr);
+ if (r < 0) {
+ ldout(cct, 0) << "ERROR: failed to start init meta.bucket ctl (" << cpp_strerror(-r) << dendl;
+ return r;
+ }
+
+ r = meta.bucket_instance->attach(meta.mgr);
+ if (r < 0) {
+ ldout(cct, 0) << "ERROR: failed to start init meta.bucket_instance ctl (" << cpp_strerror(-r) << dendl;
+ return r;
+ }
+
+ r = meta.otp->attach(meta.mgr);
+ if (r < 0) {
+ ldout(cct, 0) << "ERROR: failed to start init otp ctl (" << cpp_strerror(-r) << dendl;
+ return r;
+ }
+
+ r = meta.role->attach(meta.mgr);
+ if (r < 0) {
+    ldout(cct, 0) << "ERROR: failed to start init meta.role ctl (" << cpp_strerror(-r) << dendl;
+ return r;
+ }
+ return 0;
+}
+
diff --git a/src/rgw/driver/rados/rgw_service.h b/src/rgw/driver/rados/rgw_service.h
new file mode 100644
index 000000000..4c0b8d842
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_service.h
@@ -0,0 +1,215 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include <string>
+#include <vector>
+#include <memory>
+
+#include "common/async/yield_context.h"
+
+#include "rgw_common.h"
+
+struct RGWServices_Def;
+
+class RGWServiceInstance
+{
+ friend struct RGWServices_Def;
+
+protected:
+ CephContext *cct;
+
+ enum StartState {
+ StateInit = 0,
+ StateStarting = 1,
+ StateStarted = 2,
+ } start_state{StateInit};
+
+ virtual void shutdown() {}
+ virtual int do_start(optional_yield, const DoutPrefixProvider *dpp) {
+ return 0;
+ }
+public:
+ RGWServiceInstance(CephContext *_cct) : cct(_cct) {}
+ virtual ~RGWServiceInstance();
+
+ int start(optional_yield y, const DoutPrefixProvider *dpp);
+ bool is_started() {
+ return (start_state == StateStarted);
+ }
+
+ CephContext *ctx() {
+ return cct;
+ }
+};
+
+class RGWSI_Finisher;
+class RGWSI_Bucket;
+class RGWSI_Bucket_SObj;
+class RGWSI_Bucket_Sync;
+class RGWSI_Bucket_Sync_SObj;
+class RGWSI_BucketIndex;
+class RGWSI_BucketIndex_RADOS;
+class RGWSI_BILog_RADOS;
+class RGWSI_Cls;
+class RGWSI_ConfigKey;
+class RGWSI_ConfigKey_RADOS;
+class RGWSI_MDLog;
+class RGWSI_Meta;
+class RGWSI_MetaBackend;
+class RGWSI_MetaBackend_SObj;
+class RGWSI_MetaBackend_OTP;
+class RGWSI_Notify;
+class RGWSI_OTP;
+class RGWSI_RADOS;
+class RGWSI_Zone;
+class RGWSI_ZoneUtils;
+class RGWSI_Quota;
+class RGWSI_SyncModules;
+class RGWSI_SysObj;
+class RGWSI_SysObj_Core;
+class RGWSI_SysObj_Cache;
+class RGWSI_User;
+class RGWSI_User_RADOS;
+class RGWDataChangesLog;
+class RGWSI_Role_RADOS;
+
+struct RGWServices_Def
+{
+ bool can_shutdown{false};
+ bool has_shutdown{false};
+
+ std::unique_ptr<RGWSI_Finisher> finisher;
+ std::unique_ptr<RGWSI_Bucket_SObj> bucket_sobj;
+ std::unique_ptr<RGWSI_Bucket_Sync_SObj> bucket_sync_sobj;
+ std::unique_ptr<RGWSI_BucketIndex_RADOS> bi_rados;
+ std::unique_ptr<RGWSI_BILog_RADOS> bilog_rados;
+ std::unique_ptr<RGWSI_Cls> cls;
+ std::unique_ptr<RGWSI_ConfigKey_RADOS> config_key_rados;
+ std::unique_ptr<RGWSI_MDLog> mdlog;
+ std::unique_ptr<RGWSI_Meta> meta;
+ std::unique_ptr<RGWSI_MetaBackend_SObj> meta_be_sobj;
+ std::unique_ptr<RGWSI_MetaBackend_OTP> meta_be_otp;
+ std::unique_ptr<RGWSI_Notify> notify;
+ std::unique_ptr<RGWSI_OTP> otp;
+ std::unique_ptr<RGWSI_RADOS> rados;
+ std::unique_ptr<RGWSI_Zone> zone;
+ std::unique_ptr<RGWSI_ZoneUtils> zone_utils;
+ std::unique_ptr<RGWSI_Quota> quota;
+ std::unique_ptr<RGWSI_SyncModules> sync_modules;
+ std::unique_ptr<RGWSI_SysObj> sysobj;
+ std::unique_ptr<RGWSI_SysObj_Core> sysobj_core;
+ std::unique_ptr<RGWSI_SysObj_Cache> sysobj_cache;
+ std::unique_ptr<RGWSI_User_RADOS> user_rados;
+ std::unique_ptr<RGWDataChangesLog> datalog_rados;
+ std::unique_ptr<RGWSI_Role_RADOS> role_rados;
+
+ RGWServices_Def();
+ ~RGWServices_Def();
+
+ int init(CephContext *cct, bool have_cache, bool raw_storage, bool run_sync, optional_yield y, const DoutPrefixProvider *dpp);
+ void shutdown();
+};
+
+
+struct RGWServices
+{
+ RGWServices_Def _svc;
+
+ CephContext *cct;
+
+ RGWSI_Finisher *finisher{nullptr};
+ RGWSI_Bucket *bucket{nullptr};
+ RGWSI_Bucket_SObj *bucket_sobj{nullptr};
+ RGWSI_Bucket_Sync *bucket_sync{nullptr};
+ RGWSI_Bucket_Sync_SObj *bucket_sync_sobj{nullptr};
+ RGWSI_BucketIndex *bi{nullptr};
+ RGWSI_BucketIndex_RADOS *bi_rados{nullptr};
+ RGWSI_BILog_RADOS *bilog_rados{nullptr};
+ RGWSI_Cls *cls{nullptr};
+ RGWSI_ConfigKey_RADOS *config_key_rados{nullptr};
+ RGWSI_ConfigKey *config_key{nullptr};
+ RGWDataChangesLog *datalog_rados{nullptr};
+ RGWSI_MDLog *mdlog{nullptr};
+ RGWSI_Meta *meta{nullptr};
+ RGWSI_MetaBackend *meta_be_sobj{nullptr};
+ RGWSI_MetaBackend *meta_be_otp{nullptr};
+ RGWSI_Notify *notify{nullptr};
+ RGWSI_OTP *otp{nullptr};
+ RGWSI_RADOS *rados{nullptr};
+ RGWSI_Zone *zone{nullptr};
+ RGWSI_ZoneUtils *zone_utils{nullptr};
+ RGWSI_Quota *quota{nullptr};
+ RGWSI_SyncModules *sync_modules{nullptr};
+ RGWSI_SysObj *sysobj{nullptr};
+ RGWSI_SysObj_Cache *cache{nullptr};
+ RGWSI_SysObj_Core *core{nullptr};
+ RGWSI_User *user{nullptr};
+ RGWSI_Role_RADOS *role{nullptr};
+
+ int do_init(CephContext *cct, bool have_cache, bool raw_storage, bool run_sync, optional_yield y, const DoutPrefixProvider *dpp);
+
+ int init(CephContext *cct, bool have_cache, bool run_sync, optional_yield y, const DoutPrefixProvider *dpp) {
+ return do_init(cct, have_cache, false, run_sync, y, dpp);
+ }
+
+ int init_raw(CephContext *cct, bool have_cache, optional_yield y, const DoutPrefixProvider *dpp) {
+ return do_init(cct, have_cache, true, false, y, dpp);
+ }
+ void shutdown() {
+ _svc.shutdown();
+ }
+};
+
+class RGWMetadataManager;
+class RGWMetadataHandler;
+class RGWUserCtl;
+class RGWBucketCtl;
+class RGWOTPCtl;
+
+struct RGWCtlDef {
+ struct _meta {
+ std::unique_ptr<RGWMetadataManager> mgr;
+ std::unique_ptr<RGWMetadataHandler> bucket;
+ std::unique_ptr<RGWMetadataHandler> bucket_instance;
+ std::unique_ptr<RGWMetadataHandler> user;
+ std::unique_ptr<RGWMetadataHandler> otp;
+ std::unique_ptr<RGWMetadataHandler> role;
+
+ _meta();
+ ~_meta();
+ } meta;
+
+ std::unique_ptr<RGWUserCtl> user;
+ std::unique_ptr<RGWBucketCtl> bucket;
+ std::unique_ptr<RGWOTPCtl> otp;
+
+ RGWCtlDef();
+ ~RGWCtlDef();
+
+ int init(RGWServices& svc, rgw::sal::Driver* driver, const DoutPrefixProvider *dpp);
+};
+
+struct RGWCtl {
+ CephContext *cct{nullptr};
+ RGWServices *svc{nullptr};
+
+ RGWCtlDef _ctl;
+
+ struct _meta {
+ RGWMetadataManager *mgr{nullptr};
+
+ RGWMetadataHandler *bucket{nullptr};
+ RGWMetadataHandler *bucket_instance{nullptr};
+ RGWMetadataHandler *user{nullptr};
+ RGWMetadataHandler *otp{nullptr};
+ RGWMetadataHandler *role{nullptr};
+ } meta;
+
+ RGWUserCtl *user{nullptr};
+ RGWBucketCtl *bucket{nullptr};
+ RGWOTPCtl *otp{nullptr};
+
+ int init(RGWServices *_svc, rgw::sal::Driver* driver, const DoutPrefixProvider *dpp);
+};
diff --git a/src/rgw/driver/rados/rgw_sync.cc b/src/rgw/driver/rados/rgw_sync.cc
new file mode 100644
index 000000000..d0ec90796
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_sync.cc
@@ -0,0 +1,2568 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "rgw_sync.h"
+#include "rgw_rest_conn.h"
+#include "rgw_cr_rados.h"
+#include "rgw_cr_rest.h"
+
+#include "services/svc_zone.h"
+#include "services/svc_mdlog.h"
+#include "services/svc_cls.h"
+
+#include <boost/asio/yield.hpp>
+
+#define dout_subsys ceph_subsys_rgw
+
+#undef dout_prefix
+#define dout_prefix (*_dout << "meta sync: ")
+
+using namespace std;
+
+static string mdlog_sync_status_oid = "mdlog.sync-status";
+static string mdlog_sync_status_shard_prefix = "mdlog.sync-status.shard";
+static string mdlog_sync_full_sync_index_prefix = "meta.full-sync.index";
+
+RGWContinuousLeaseCR::~RGWContinuousLeaseCR() {}
+
+RGWSyncErrorLogger::RGWSyncErrorLogger(rgw::sal::RadosStore* _store, const string &oid_prefix, int _num_shards) : store(_store), num_shards(_num_shards) {
+ for (int i = 0; i < num_shards; i++) {
+ oids.push_back(get_shard_oid(oid_prefix, i));
+ }
+}
+string RGWSyncErrorLogger::get_shard_oid(const string& oid_prefix, int shard_id) {
+ char buf[oid_prefix.size() + 16];
+ snprintf(buf, sizeof(buf), "%s.%d", oid_prefix.c_str(), shard_id);
+ return string(buf);
+}
+
+RGWCoroutine *RGWSyncErrorLogger::log_error_cr(const DoutPrefixProvider *dpp, const string& source_zone, const string& section, const string& name, uint32_t error_code, const string& message) {
+ cls_log_entry entry;
+
+ rgw_sync_error_info info(source_zone, error_code, message);
+ bufferlist bl;
+ encode(info, bl);
+ store->svc()->cls->timelog.prepare_entry(entry, real_clock::now(), section, name, bl);
+
+ uint32_t shard_id = ++counter % num_shards;
+
+
+ return new RGWRadosTimelogAddCR(dpp, store, oids[shard_id], entry);
+}
+
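+// exponential backoff: double the wait interval on each retry, capped at max_secs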
+void RGWSyncBackoff::update_wait_time()
+{
+ if (cur_wait == 0) {
+ cur_wait = 1;
+ } else {
+ cur_wait = (cur_wait << 1);
+ }
+ if (cur_wait >= max_secs) {
+ cur_wait = max_secs;
+ }
+}
+
+void RGWSyncBackoff::backoff_sleep()
+{
+ update_wait_time();
+ sleep(cur_wait);
+}
+
+void RGWSyncBackoff::backoff(RGWCoroutine *op)
+{
+ update_wait_time();
+ op->wait(utime_t(cur_wait, 0));
+}
+
+int RGWBackoffControlCR::operate(const DoutPrefixProvider *dpp) {
+ reenter(this) {
+ // retry the operation until it succeeds
+ while (true) {
+ yield {
+ std::lock_guard l{lock};
+ cr = alloc_cr();
+ cr->get();
+ call(cr);
+ }
+ {
+ std::lock_guard l{lock};
+ cr->put();
+ cr = NULL;
+ }
+ if (retcode >= 0) {
+ break;
+ }
+ if (retcode != -EBUSY && retcode != -EAGAIN) {
+ ldout(cct, 0) << "ERROR: RGWBackoffControlCR called coroutine returned " << retcode << dendl;
+ if (exit_on_error) {
+ return set_cr_error(retcode);
+ }
+ }
+ if (reset_backoff) {
+ backoff.reset();
+ }
+ yield backoff.backoff(this);
+ }
+
+ // run an optional finisher
+ yield call(alloc_finisher_cr());
+ if (retcode < 0) {
+ ldout(cct, 0) << "ERROR: call to finisher_cr() failed: retcode=" << retcode << dendl;
+ return set_cr_error(retcode);
+ }
+ return set_cr_done();
+ }
+ return 0;
+}
+
+void rgw_mdlog_info::decode_json(JSONObj *obj) {
+ JSONDecoder::decode_json("num_objects", num_shards, obj);
+ JSONDecoder::decode_json("period", period, obj);
+ JSONDecoder::decode_json("realm_epoch", realm_epoch, obj);
+}
+
+void rgw_mdlog_entry::decode_json(JSONObj *obj) {
+ JSONDecoder::decode_json("id", id, obj);
+ JSONDecoder::decode_json("section", section, obj);
+ JSONDecoder::decode_json("name", name, obj);
+ utime_t ut;
+ JSONDecoder::decode_json("timestamp", ut, obj);
+ timestamp = ut.to_real_time();
+ JSONDecoder::decode_json("data", log_data, obj);
+}
+
+void rgw_mdlog_shard_data::decode_json(JSONObj *obj) {
+ JSONDecoder::decode_json("marker", marker, obj);
+ JSONDecoder::decode_json("truncated", truncated, obj);
+ JSONDecoder::decode_json("entries", entries, obj);
+}
+
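+// run up to max_concurrent child coroutines at a time, collecting completions as
+// they finish; a child failure (after handle_result()) fails the whole collection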
+int RGWShardCollectCR::operate(const DoutPrefixProvider *dpp) {
+ reenter(this) {
+ while (spawn_next()) {
+ current_running++;
+
+ if (current_running >= max_concurrent) {
+ int child_ret;
+ yield wait_for_child();
+ if (collect_next(&child_ret)) {
+ current_running--;
+ child_ret = handle_result(child_ret);
+ if (child_ret < 0) {
+ status = child_ret;
+ }
+ }
+ }
+ }
+ while (current_running > 0) {
+ int child_ret;
+ yield wait_for_child();
+ if (collect_next(&child_ret)) {
+ current_running--;
+ child_ret = handle_result(child_ret);
+ if (child_ret < 0) {
+ status = child_ret;
+ }
+ }
+ }
+ if (status < 0) {
+ return set_cr_error(status);
+ }
+ return set_cr_done();
+ }
+ return 0;
+}
+
+class RGWReadRemoteMDLogInfoCR : public RGWShardCollectCR {
+ RGWMetaSyncEnv *sync_env;
+
+ const std::string& period;
+ int num_shards;
+ map<int, RGWMetadataLogInfo> *mdlog_info;
+
+ int shard_id;
+#define READ_MDLOG_MAX_CONCURRENT 10
+
+ int handle_result(int r) override {
+ if (r == -ENOENT) { // ENOENT is not a fatal error
+ return 0;
+ }
+ if (r < 0) {
+ ldout(cct, 4) << "failed to fetch mdlog status: " << cpp_strerror(r) << dendl;
+ }
+ return r;
+ }
+public:
+ RGWReadRemoteMDLogInfoCR(RGWMetaSyncEnv *_sync_env,
+ const std::string& period, int _num_shards,
+ map<int, RGWMetadataLogInfo> *_mdlog_info) : RGWShardCollectCR(_sync_env->cct, READ_MDLOG_MAX_CONCURRENT),
+ sync_env(_sync_env),
+ period(period), num_shards(_num_shards),
+ mdlog_info(_mdlog_info), shard_id(0) {}
+ bool spawn_next() override;
+};
+
+class RGWListRemoteMDLogCR : public RGWShardCollectCR {
+ RGWMetaSyncEnv *sync_env;
+
+ const std::string& period;
+ map<int, string> shards;
+ int max_entries_per_shard;
+ map<int, rgw_mdlog_shard_data> *result;
+
+ map<int, string>::iterator iter;
+#define READ_MDLOG_MAX_CONCURRENT 10
+
+ int handle_result(int r) override {
+ if (r == -ENOENT) { // ENOENT is not a fatal error
+ return 0;
+ }
+ if (r < 0) {
+ ldout(cct, 4) << "failed to list remote mdlog shard: " << cpp_strerror(r) << dendl;
+ }
+ return r;
+ }
+public:
+ RGWListRemoteMDLogCR(RGWMetaSyncEnv *_sync_env,
+ const std::string& period, map<int, string>& _shards,
+ int _max_entries_per_shard,
+ map<int, rgw_mdlog_shard_data> *_result) : RGWShardCollectCR(_sync_env->cct, READ_MDLOG_MAX_CONCURRENT),
+ sync_env(_sync_env), period(period),
+ max_entries_per_shard(_max_entries_per_shard),
+ result(_result) {
+ shards.swap(_shards);
+ iter = shards.begin();
+ }
+ bool spawn_next() override;
+};
+
+int RGWRemoteMetaLog::read_log_info(const DoutPrefixProvider *dpp, rgw_mdlog_info *log_info)
+{
+ rgw_http_param_pair pairs[] = { { "type", "metadata" },
+ { NULL, NULL } };
+
+ int ret = conn->get_json_resource(dpp, "/admin/log", pairs, null_yield, *log_info);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to fetch mdlog info" << dendl;
+ return ret;
+ }
+
+ ldpp_dout(dpp, 20) << "remote mdlog, num_shards=" << log_info->num_shards << dendl;
+
+ return 0;
+}
+
+int RGWRemoteMetaLog::read_master_log_shards_info(const DoutPrefixProvider *dpp, const string &master_period, map<int, RGWMetadataLogInfo> *shards_info)
+{
+ if (store->svc()->zone->is_meta_master()) {
+ return 0;
+ }
+
+ rgw_mdlog_info log_info;
+ int ret = read_log_info(dpp, &log_info);
+ if (ret < 0) {
+ return ret;
+ }
+
+ return run(dpp, new RGWReadRemoteMDLogInfoCR(&sync_env, master_period, log_info.num_shards, shards_info));
+}
+
+int RGWRemoteMetaLog::read_master_log_shards_next(const DoutPrefixProvider *dpp, const string& period, map<int, string> shard_markers, map<int, rgw_mdlog_shard_data> *result)
+{
+ if (store->svc()->zone->is_meta_master()) {
+ return 0;
+ }
+
+ return run(dpp, new RGWListRemoteMDLogCR(&sync_env, period, shard_markers, 1, result));
+}
+
+int RGWRemoteMetaLog::init()
+{
+ conn = store->svc()->zone->get_master_conn();
+
+ int ret = http_manager.start();
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "failed in http_manager.start() ret=" << ret << dendl;
+ return ret;
+ }
+
+ error_logger = new RGWSyncErrorLogger(store, RGW_SYNC_ERROR_LOG_SHARD_PREFIX, ERROR_LOGGER_SHARDS);
+
+ init_sync_env(&sync_env);
+
+ tn = sync_env.sync_tracer->add_node(sync_env.sync_tracer->root_node, "meta");
+
+ return 0;
+}
+
+#define CLONE_MAX_ENTRIES 100
+
+int RGWMetaSyncStatusManager::init(const DoutPrefixProvider *dpp)
+{
+ if (store->svc()->zone->is_meta_master()) {
+ return 0;
+ }
+
+ if (!store->svc()->zone->get_master_conn()) {
+ ldpp_dout(dpp, -1) << "no REST connection to master zone" << dendl;
+ return -EIO;
+ }
+
+ int r = rgw_init_ioctx(dpp, store->getRados()->get_rados_handle(), store->svc()->zone->get_zone_params().log_pool, ioctx, true);
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << "ERROR: failed to open log pool (" << store->svc()->zone->get_zone_params().log_pool << " ret=" << r << dendl;
+ return r;
+ }
+
+ r = master_log.init();
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << "ERROR: failed to init remote log, r=" << r << dendl;
+ return r;
+ }
+
+ RGWMetaSyncEnv& sync_env = master_log.get_sync_env();
+
+ rgw_meta_sync_status sync_status;
+ r = read_sync_status(dpp, &sync_status);
+ if (r < 0 && r != -ENOENT) {
+ ldpp_dout(dpp, -1) << "ERROR: failed to read sync status, r=" << r << dendl;
+ return r;
+ }
+
+ int num_shards = sync_status.sync_info.num_shards;
+
+ for (int i = 0; i < num_shards; i++) {
+ shard_objs[i] = rgw_raw_obj(store->svc()->zone->get_zone_params().log_pool, sync_env.shard_obj_name(i));
+ }
+
+ std::unique_lock wl{ts_to_shard_lock};
+ for (int i = 0; i < num_shards; i++) {
+ clone_markers.push_back(string());
+ utime_shard ut;
+ ut.shard_id = i;
+ ts_to_shard[ut] = i;
+ }
+
+ return 0;
+}
+
+void RGWMetaSyncEnv::init(const DoutPrefixProvider *_dpp, CephContext *_cct, rgw::sal::RadosStore* _store, RGWRESTConn *_conn,
+ RGWAsyncRadosProcessor *_async_rados, RGWHTTPManager *_http_manager,
+ RGWSyncErrorLogger *_error_logger, RGWSyncTraceManager *_sync_tracer) {
+ dpp = _dpp;
+ cct = _cct;
+ store = _store;
+ conn = _conn;
+ async_rados = _async_rados;
+ http_manager = _http_manager;
+ error_logger = _error_logger;
+ sync_tracer = _sync_tracer;
+}
+
+string RGWMetaSyncEnv::status_oid()
+{
+ return mdlog_sync_status_oid;
+}
+
+string RGWMetaSyncEnv::shard_obj_name(int shard_id)
+{
+ char buf[mdlog_sync_status_shard_prefix.size() + 16];
+ snprintf(buf, sizeof(buf), "%s.%d", mdlog_sync_status_shard_prefix.c_str(), shard_id);
+
+ return string(buf);
+}
+
+class RGWAsyncReadMDLogEntries : public RGWAsyncRadosRequest {
+ const DoutPrefixProvider *dpp;
+ rgw::sal::RadosStore* store;
+ RGWMetadataLog *mdlog;
+ int shard_id;
+ int max_entries;
+
+protected:
+ int _send_request(const DoutPrefixProvider *dpp) override {
+ real_time from_time;
+ real_time end_time;
+
+ void *handle;
+
+ mdlog->init_list_entries(shard_id, from_time, end_time, marker, &handle);
+
+ int ret = mdlog->list_entries(dpp, handle, max_entries, entries, &marker, &truncated);
+
+ mdlog->complete_list_entries(handle);
+
+ return ret;
+ }
+public:
+ string marker;
+ list<cls_log_entry> entries;
+ bool truncated;
+
+ RGWAsyncReadMDLogEntries(const DoutPrefixProvider *dpp, RGWCoroutine *caller, RGWAioCompletionNotifier *cn, rgw::sal::RadosStore* _store,
+ RGWMetadataLog* mdlog, int _shard_id,
+ std::string _marker, int _max_entries)
+ : RGWAsyncRadosRequest(caller, cn), dpp(dpp), store(_store), mdlog(mdlog),
+ shard_id(_shard_id), max_entries(_max_entries), marker(std::move(_marker)) {}
+};
+
+class RGWReadMDLogEntriesCR : public RGWSimpleCoroutine {
+ RGWMetaSyncEnv *sync_env;
+ RGWMetadataLog *const mdlog;
+ int shard_id;
+ string marker;
+ string *pmarker;
+ int max_entries;
+ list<cls_log_entry> *entries;
+ bool *truncated;
+
+ RGWAsyncReadMDLogEntries *req{nullptr};
+
+public:
+ RGWReadMDLogEntriesCR(RGWMetaSyncEnv *_sync_env, RGWMetadataLog* mdlog,
+ int _shard_id, string*_marker, int _max_entries,
+ list<cls_log_entry> *_entries, bool *_truncated)
+ : RGWSimpleCoroutine(_sync_env->cct), sync_env(_sync_env), mdlog(mdlog),
+ shard_id(_shard_id), pmarker(_marker), max_entries(_max_entries),
+ entries(_entries), truncated(_truncated) {}
+
+ ~RGWReadMDLogEntriesCR() override {
+ if (req) {
+ req->finish();
+ }
+ }
+
+ int send_request(const DoutPrefixProvider *dpp) override {
+ marker = *pmarker;
+ req = new RGWAsyncReadMDLogEntries(dpp, this, stack->create_completion_notifier(),
+ sync_env->store, mdlog, shard_id, marker,
+ max_entries);
+ sync_env->async_rados->queue(req);
+ return 0;
+ }
+
+ int request_complete() override {
+ *pmarker = std::move(req->marker);
+ *entries = std::move(req->entries);
+ *truncated = req->truncated;
+ return req->get_ret_status();
+ }
+};
+
+
+class RGWReadRemoteMDLogShardInfoCR : public RGWCoroutine {
+ RGWMetaSyncEnv *env;
+ RGWRESTReadResource *http_op;
+
+ const std::string& period;
+ int shard_id;
+ RGWMetadataLogInfo *shard_info;
+
+public:
+ RGWReadRemoteMDLogShardInfoCR(RGWMetaSyncEnv *env, const std::string& period,
+ int _shard_id, RGWMetadataLogInfo *_shard_info)
+ : RGWCoroutine(env->store->ctx()), env(env), http_op(NULL),
+ period(period), shard_id(_shard_id), shard_info(_shard_info) {}
+
+ int operate(const DoutPrefixProvider *dpp) override {
+ auto store = env->store;
+ RGWRESTConn *conn = store->svc()->zone->get_master_conn();
+ reenter(this) {
+ yield {
+ char buf[16];
+ snprintf(buf, sizeof(buf), "%d", shard_id);
+ rgw_http_param_pair pairs[] = { { "type" , "metadata" },
+ { "id", buf },
+ { "period", period.c_str() },
+ { "info" , NULL },
+ { NULL, NULL } };
+
+ string p = "/admin/log/";
+
+ http_op = new RGWRESTReadResource(conn, p, pairs, NULL,
+ env->http_manager);
+
+ init_new_io(http_op);
+
+ int ret = http_op->aio_read(dpp);
+ if (ret < 0) {
+ ldpp_dout(env->dpp, 0) << "ERROR: failed to read from " << p << dendl;
+ log_error() << "failed to send http operation: " << http_op->to_str() << " ret=" << ret << std::endl;
+ http_op->put();
+ return set_cr_error(ret);
+ }
+
+ return io_block(0);
+ }
+ yield {
+ int ret = http_op->wait(shard_info, null_yield);
+ http_op->put();
+ if (ret < 0) {
+ return set_cr_error(ret);
+ }
+ return set_cr_done();
+ }
+ }
+ return 0;
+ }
+};
+
+RGWCoroutine* create_read_remote_mdlog_shard_info_cr(RGWMetaSyncEnv *env,
+ const std::string& period,
+ int shard_id,
+ RGWMetadataLogInfo* info)
+{
+ return new RGWReadRemoteMDLogShardInfoCR(env, period, shard_id, info);
+}
+
+class RGWListRemoteMDLogShardCR : public RGWSimpleCoroutine {
+ RGWMetaSyncEnv *sync_env;
+ RGWRESTReadResource *http_op;
+
+ const std::string& period;
+ int shard_id;
+ string marker;
+ uint32_t max_entries;
+ rgw_mdlog_shard_data *result;
+
+public:
+ RGWListRemoteMDLogShardCR(RGWMetaSyncEnv *env, const std::string& period,
+ int _shard_id, const string& _marker, uint32_t _max_entries,
+ rgw_mdlog_shard_data *_result)
+ : RGWSimpleCoroutine(env->store->ctx()), sync_env(env), http_op(NULL),
+ period(period), shard_id(_shard_id), marker(_marker), max_entries(_max_entries), result(_result) {}
+
+ int send_request(const DoutPrefixProvider *dpp) override {
+ RGWRESTConn *conn = sync_env->conn;
+
+ char buf[32];
+ snprintf(buf, sizeof(buf), "%d", shard_id);
+
+ char max_entries_buf[32];
+ snprintf(max_entries_buf, sizeof(max_entries_buf), "%d", (int)max_entries);
+
+ const char *marker_key = (marker.empty() ? "" : "marker");
+
+ rgw_http_param_pair pairs[] = { { "type", "metadata" },
+ { "id", buf },
+ { "period", period.c_str() },
+ { "max-entries", max_entries_buf },
+ { marker_key, marker.c_str() },
+ { NULL, NULL } };
+
+ string p = "/admin/log/";
+
+ http_op = new RGWRESTReadResource(conn, p, pairs, NULL, sync_env->http_manager);
+ init_new_io(http_op);
+
+ int ret = http_op->aio_read(dpp);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to read from " << p << dendl;
+ log_error() << "failed to send http operation: " << http_op->to_str() << " ret=" << ret << std::endl;
+ http_op->put();
+ return ret;
+ }
+
+ return 0;
+ }
+
+ int request_complete() override {
+ int ret = http_op->wait(result, null_yield);
+ http_op->put();
+ if (ret < 0 && ret != -ENOENT) {
+ ldpp_dout(sync_env->dpp, 0) << "ERROR: failed to list remote mdlog shard, ret=" << ret << dendl;
+ return ret;
+ }
+ return 0;
+ }
+};
+
+RGWCoroutine* create_list_remote_mdlog_shard_cr(RGWMetaSyncEnv *env,
+ const std::string& period,
+ int shard_id,
+ const std::string& marker,
+ uint32_t max_entries,
+ rgw_mdlog_shard_data *result)
+{
+ return new RGWListRemoteMDLogShardCR(env, period, shard_id, marker,
+ max_entries, result);
+}
+
+bool RGWReadRemoteMDLogInfoCR::spawn_next() {
+ if (shard_id >= num_shards) {
+ return false;
+ }
+ spawn(new RGWReadRemoteMDLogShardInfoCR(sync_env, period, shard_id, &(*mdlog_info)[shard_id]), false);
+ shard_id++;
+ return true;
+}
+
+bool RGWListRemoteMDLogCR::spawn_next() {
+ if (iter == shards.end()) {
+ return false;
+ }
+
+ spawn(new RGWListRemoteMDLogShardCR(sync_env, period, iter->first, iter->second, max_entries_per_shard, &(*result)[iter->first]), false);
+ ++iter;
+ return true;
+}
+
+class RGWInitSyncStatusCoroutine : public RGWCoroutine {
+ RGWMetaSyncEnv *sync_env;
+
+ rgw_meta_sync_info status;
+ vector<RGWMetadataLogInfo> shards_info;
+ boost::intrusive_ptr<RGWContinuousLeaseCR> lease_cr;
+ boost::intrusive_ptr<RGWCoroutinesStack> lease_stack;
+public:
+ RGWInitSyncStatusCoroutine(RGWMetaSyncEnv *_sync_env,
+ const rgw_meta_sync_info &status)
+ : RGWCoroutine(_sync_env->store->ctx()), sync_env(_sync_env),
+ status(status), shards_info(status.num_shards),
+ lease_cr(nullptr), lease_stack(nullptr) {}
+
+ ~RGWInitSyncStatusCoroutine() override {
+ if (lease_cr) {
+ lease_cr->abort();
+ }
+ }
+
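+  // take the sync lease, write the initial sync status and per-shard markers
+  // based on the remote mdlog positions, then move to StateBuildingFullSyncMaps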
+ int operate(const DoutPrefixProvider *dpp) override {
+ int ret;
+ reenter(this) {
+ yield {
+ set_status("acquiring sync lock");
+ uint32_t lock_duration = cct->_conf->rgw_sync_lease_period;
+ string lock_name = "sync_lock";
+ rgw::sal::RadosStore* store = sync_env->store;
+ lease_cr.reset(new RGWContinuousLeaseCR(sync_env->async_rados, store,
+ rgw_raw_obj(store->svc()->zone->get_zone_params().log_pool, sync_env->status_oid()),
+ lock_name, lock_duration, this, nullptr));
+ lease_stack.reset(spawn(lease_cr.get(), false));
+ }
+ while (!lease_cr->is_locked()) {
+ if (lease_cr->is_done()) {
+ ldpp_dout(dpp, 5) << "failed to take lease" << dendl;
+ set_status("lease lock failed, early abort");
+ return set_cr_error(lease_cr->get_ret_status());
+ }
+ set_sleeping(true);
+ yield;
+ }
+ yield {
+ set_status("writing sync status");
+ rgw::sal::RadosStore* store = sync_env->store;
+ call(new RGWSimpleRadosWriteCR<rgw_meta_sync_info>(dpp, store,
+ rgw_raw_obj(store->svc()->zone->get_zone_params().log_pool, sync_env->status_oid()),
+ status));
+ }
+
+ if (retcode < 0) {
+ set_status("failed to write sync status");
+ ldpp_dout(dpp, 0) << "ERROR: failed to write sync status, retcode=" << retcode << dendl;
+ yield lease_cr->go_down();
+ return set_cr_error(retcode);
+ }
+ /* fetch current position in logs */
+ set_status("fetching remote log position");
+ yield {
+ for (int i = 0; i < (int)status.num_shards; i++) {
+ spawn(new RGWReadRemoteMDLogShardInfoCR(sync_env, status.period, i,
+ &shards_info[i]), false);
+ }
+ }
+
+ drain_all_but_stack(lease_stack.get()); /* the lease cr still needs to run */
+
+ yield {
+ set_status("updating sync status");
+ for (int i = 0; i < (int)status.num_shards; i++) {
+ rgw_meta_sync_marker marker;
+ RGWMetadataLogInfo& info = shards_info[i];
+ marker.next_step_marker = info.marker;
+ marker.timestamp = info.last_update;
+ rgw::sal::RadosStore* store = sync_env->store;
+ spawn(new RGWSimpleRadosWriteCR<rgw_meta_sync_marker>(dpp,
+ store,
+ rgw_raw_obj(store->svc()->zone->get_zone_params().log_pool, sync_env->shard_obj_name(i)),
+ marker), true);
+ }
+ }
+ yield {
+ set_status("changing sync state: build full sync maps");
+ status.state = rgw_meta_sync_info::StateBuildingFullSyncMaps;
+ rgw::sal::RadosStore* store = sync_env->store;
+ call(new RGWSimpleRadosWriteCR<rgw_meta_sync_info>(dpp, store,
+ rgw_raw_obj(store->svc()->zone->get_zone_params().log_pool, sync_env->status_oid()),
+ status));
+ }
+ set_status("drop lock lease");
+ yield lease_cr->go_down();
+ while (collect(&ret, NULL)) {
+ if (ret < 0) {
+ return set_cr_error(ret);
+ }
+ yield;
+ }
+ drain_all();
+ return set_cr_done();
+ }
+ return 0;
+ }
+};
+
+class RGWReadSyncStatusMarkersCR : public RGWShardCollectCR {
+ static constexpr int MAX_CONCURRENT_SHARDS = 16;
+
+ RGWMetaSyncEnv *env;
+ const int num_shards;
+ int shard_id{0};
+ map<uint32_t, rgw_meta_sync_marker>& markers;
+
+ int handle_result(int r) override {
+ if (r == -ENOENT) { // ENOENT is not a fatal error
+ return 0;
+ }
+ if (r < 0) {
+ ldout(cct, 4) << "failed to read metadata sync markers: "
+ << cpp_strerror(r) << dendl;
+ }
+ return r;
+ }
+ public:
+ RGWReadSyncStatusMarkersCR(RGWMetaSyncEnv *env, int num_shards,
+ map<uint32_t, rgw_meta_sync_marker>& markers)
+ : RGWShardCollectCR(env->cct, MAX_CONCURRENT_SHARDS),
+ env(env), num_shards(num_shards), markers(markers)
+ {}
+ bool spawn_next() override;
+};
+
+bool RGWReadSyncStatusMarkersCR::spawn_next()
+{
+ if (shard_id >= num_shards) {
+ return false;
+ }
+ using CR = RGWSimpleRadosReadCR<rgw_meta_sync_marker>;
+ rgw_raw_obj obj{env->store->svc()->zone->get_zone_params().log_pool,
+ env->shard_obj_name(shard_id)};
+ spawn(new CR(env->dpp, env->store, obj, &markers[shard_id]), false);
+ shard_id++;
+ return true;
+}
+
+class RGWReadSyncStatusCoroutine : public RGWCoroutine {
+ RGWMetaSyncEnv *sync_env;
+ rgw_meta_sync_status *sync_status;
+
+public:
+ RGWReadSyncStatusCoroutine(RGWMetaSyncEnv *_sync_env,
+ rgw_meta_sync_status *_status)
+ : RGWCoroutine(_sync_env->cct), sync_env(_sync_env), sync_status(_status)
+ {}
+ int operate(const DoutPrefixProvider *dpp) override;
+};
+
+int RGWReadSyncStatusCoroutine::operate(const DoutPrefixProvider *dpp)
+{
+ reenter(this) {
+ // read sync info
+ using ReadInfoCR = RGWSimpleRadosReadCR<rgw_meta_sync_info>;
+ yield {
+ bool empty_on_enoent = false; // fail on ENOENT
+ rgw_raw_obj obj{sync_env->store->svc()->zone->get_zone_params().log_pool,
+ sync_env->status_oid()};
+ call(new ReadInfoCR(dpp, sync_env->store, obj,
+ &sync_status->sync_info, empty_on_enoent));
+ }
+ if (retcode < 0) {
+ ldpp_dout(dpp, 4) << "failed to read sync status info with "
+ << cpp_strerror(retcode) << dendl;
+ return set_cr_error(retcode);
+ }
+ // read shard markers
+ using ReadMarkersCR = RGWReadSyncStatusMarkersCR;
+ yield call(new ReadMarkersCR(sync_env, sync_status->sync_info.num_shards,
+ sync_status->sync_markers));
+ if (retcode < 0) {
+ ldpp_dout(dpp, 4) << "failed to read sync status markers with "
+ << cpp_strerror(retcode) << dendl;
+ return set_cr_error(retcode);
+ }
+ return set_cr_done();
+ }
+ return 0;
+}
+
+class RGWFetchAllMetaCR : public RGWCoroutine {
+ RGWMetaSyncEnv *sync_env;
+
+ int num_shards;
+
+
+ int ret_status;
+
+ list<string> sections;
+ list<string>::iterator sections_iter;
+
+ struct meta_list_result {
+ list<string> keys;
+ string marker;
+ uint64_t count{0};
+ bool truncated{false};
+
+ void decode_json(JSONObj *obj) {
+ JSONDecoder::decode_json("keys", keys, obj);
+ JSONDecoder::decode_json("marker", marker, obj);
+ JSONDecoder::decode_json("count", count, obj);
+ JSONDecoder::decode_json("truncated", truncated, obj);
+ }
+ } result;
+ list<string>::iterator iter;
+
+ std::unique_ptr<RGWShardedOmapCRManager> entries_index;
+
+ boost::intrusive_ptr<RGWContinuousLeaseCR> lease_cr;
+ boost::intrusive_ptr<RGWCoroutinesStack> lease_stack;
+ bool lost_lock;
+ bool failed;
+
+ string marker;
+
+ map<uint32_t, rgw_meta_sync_marker>& markers;
+
+ RGWSyncTraceNodeRef tn;
+
+public:
+ RGWFetchAllMetaCR(RGWMetaSyncEnv *_sync_env, int _num_shards,
+ map<uint32_t, rgw_meta_sync_marker>& _markers,
+ RGWSyncTraceNodeRef& _tn_parent) : RGWCoroutine(_sync_env->cct), sync_env(_sync_env),
+ num_shards(_num_shards),
+ ret_status(0), lease_cr(nullptr), lease_stack(nullptr),
+ lost_lock(false), failed(false), markers(_markers) {
+ tn = sync_env->sync_tracer->add_node(_tn_parent, "fetch_all_meta");
+ }
+
+ ~RGWFetchAllMetaCR() override {
+ }
+
+ void append_section_from_set(set<string>& all_sections, const string& name) {
+ set<string>::iterator iter = all_sections.find(name);
+ if (iter != all_sections.end()) {
+ sections.emplace_back(std::move(*iter));
+ all_sections.erase(iter);
+ }
+ }
+ /*
+ * meta sync should go in the following order: user, bucket.instance, bucket
+ * then whatever other sections exist (if any)
+ */
+ void rearrange_sections() {
+ set<string> all_sections;
+ std::move(sections.begin(), sections.end(),
+ std::inserter(all_sections, all_sections.end()));
+ sections.clear();
+
+ append_section_from_set(all_sections, "user");
+ append_section_from_set(all_sections, "bucket.instance");
+ append_section_from_set(all_sections, "bucket");
+ append_section_from_set(all_sections, "roles");
+
+ std::move(all_sections.begin(), all_sections.end(),
+ std::back_inserter(sections));
+ }
+
+ int operate(const DoutPrefixProvider *dpp) override {
+ RGWRESTConn *conn = sync_env->conn;
+
+ reenter(this) {
+ yield {
+ set_status(string("acquiring lock (") + sync_env->status_oid() + ")");
+ uint32_t lock_duration = cct->_conf->rgw_sync_lease_period;
+ string lock_name = "sync_lock";
+ lease_cr.reset(new RGWContinuousLeaseCR(sync_env->async_rados,
+ sync_env->store,
+ rgw_raw_obj(sync_env->store->svc()->zone->get_zone_params().log_pool, sync_env->status_oid()),
+ lock_name, lock_duration, this, nullptr));
+ lease_stack.reset(spawn(lease_cr.get(), false));
+ }
+ while (!lease_cr->is_locked()) {
+ if (lease_cr->is_done()) {
+ ldpp_dout(dpp, 5) << "failed to take lease" << dendl;
+ set_status("lease lock failed, early abort");
+ return set_cr_error(lease_cr->get_ret_status());
+ }
+ set_sleeping(true);
+ yield;
+ }
+ entries_index.reset(new RGWShardedOmapCRManager(sync_env->async_rados, sync_env->store, this, num_shards,
+ sync_env->store->svc()->zone->get_zone_params().log_pool,
+ mdlog_sync_full_sync_index_prefix));
+ yield {
+ call(new RGWReadRESTResourceCR<list<string> >(cct, conn, sync_env->http_manager,
+ "/admin/metadata", NULL, &sections));
+ }
+ if (get_ret_status() < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to fetch metadata sections" << dendl;
+ yield entries_index->finish();
+ yield lease_cr->go_down();
+ drain_all();
+ return set_cr_error(get_ret_status());
+ }
+ rearrange_sections();
+ sections_iter = sections.begin();
+ for (; sections_iter != sections.end(); ++sections_iter) {
+ do {
+ yield {
+#define META_FULL_SYNC_CHUNK_SIZE "1000"
+ string entrypoint = string("/admin/metadata/") + *sections_iter;
+ rgw_http_param_pair pairs[] = { { "max-entries", META_FULL_SYNC_CHUNK_SIZE },
+ { "marker", result.marker.c_str() },
+ { NULL, NULL } };
+ result.keys.clear();
+ call(new RGWReadRESTResourceCR<meta_list_result >(cct, conn, sync_env->http_manager,
+ entrypoint, pairs, &result));
+ }
+ ret_status = get_ret_status();
+ if (ret_status == -ENOENT) {
+ set_retcode(0); /* reset coroutine status so that we don't return it */
+ ret_status = 0;
+ }
+ if (ret_status < 0) {
+ tn->log(0, SSTR("ERROR: failed to fetch metadata section: " << *sections_iter));
+ yield entries_index->finish();
+ yield lease_cr->go_down();
+ drain_all();
+ return set_cr_error(ret_status);
+ }
+ iter = result.keys.begin();
+ for (; iter != result.keys.end(); ++iter) {
+ if (!lease_cr->is_locked()) {
+ lost_lock = true;
+ tn->log(1, "lease is lost, abort");
+ break;
+ }
+ yield; // allow entries_index consumer to make progress
+
+ tn->log(20, SSTR("list metadata: section=" << *sections_iter << " key=" << *iter));
+ string s = *sections_iter + ":" + *iter;
+ int shard_id;
+ rgw::sal::RadosStore* store = sync_env->store;
+ int ret = store->ctl()->meta.mgr->get_shard_id(*sections_iter, *iter, &shard_id);
+ if (ret < 0) {
+ tn->log(0, SSTR("ERROR: could not determine shard id for " << *sections_iter << ":" << *iter));
+ ret_status = ret;
+ break;
+ }
+ if (!entries_index->append(s, shard_id)) {
+ break;
+ }
+ }
+ } while (result.truncated);
+ }
+ yield {
+ if (!entries_index->finish()) {
+ failed = true;
+ }
+ }
+ if (!failed) {
+ for (map<uint32_t, rgw_meta_sync_marker>::iterator iter = markers.begin(); iter != markers.end(); ++iter) {
+ int shard_id = (int)iter->first;
+ rgw_meta_sync_marker& marker = iter->second;
+ marker.total_entries = entries_index->get_total_entries(shard_id);
+ spawn(new RGWSimpleRadosWriteCR<rgw_meta_sync_marker>(dpp, sync_env->store,
+ rgw_raw_obj(sync_env->store->svc()->zone->get_zone_params().log_pool, sync_env->shard_obj_name(shard_id)),
+ marker), true);
+ }
+ }
+
+ drain_all_but_stack(lease_stack.get()); /* the lease cr still needs to run */
+
+ yield lease_cr->go_down();
+
+ int ret;
+ while (collect(&ret, NULL)) {
+ if (ret < 0) {
+ return set_cr_error(ret);
+ }
+ yield;
+ }
+ drain_all();
+ if (failed) {
+ yield return set_cr_error(-EIO);
+ }
+ if (lost_lock) {
+ yield return set_cr_error(-EBUSY);
+ }
+
+ if (ret_status < 0) {
+ yield return set_cr_error(ret_status);
+ }
+
+ yield return set_cr_done();
+ }
+ return 0;
+ }
+};
+
+static string full_sync_index_shard_oid(int shard_id)
+{
+ char buf[mdlog_sync_full_sync_index_prefix.size() + 16];
+ snprintf(buf, sizeof(buf), "%s.%d", mdlog_sync_full_sync_index_prefix.c_str(), shard_id);
+ return string(buf);
+}
+
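+// read a single metadata entry from the remote (master) zone via
+// GET /admin/metadata/<section>/<key>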
+class RGWReadRemoteMetadataCR : public RGWCoroutine {
+ RGWMetaSyncEnv *sync_env;
+
+ RGWRESTReadResource *http_op;
+
+ string section;
+ string key;
+
+ bufferlist *pbl;
+
+ RGWSyncTraceNodeRef tn;
+
+public:
+ RGWReadRemoteMetadataCR(RGWMetaSyncEnv *_sync_env,
+ const string& _section, const string& _key, bufferlist *_pbl,
+ const RGWSyncTraceNodeRef& _tn_parent) : RGWCoroutine(_sync_env->cct), sync_env(_sync_env),
+ http_op(NULL),
+ section(_section),
+ key(_key),
+ pbl(_pbl) {
+ tn = sync_env->sync_tracer->add_node(_tn_parent, "read_remote_meta",
+ section + ":" + key);
+ }
+
+ int operate(const DoutPrefixProvider *dpp) override {
+ RGWRESTConn *conn = sync_env->conn;
+ reenter(this) {
+ yield {
+ string key_encode;
+ url_encode(key, key_encode);
+ rgw_http_param_pair pairs[] = { { "key" , key.c_str()},
+ { NULL, NULL } };
+
+ string p = string("/admin/metadata/") + section + "/" + key_encode;
+
+ http_op = new RGWRESTReadResource(conn, p, pairs, NULL, sync_env->http_manager);
+
+ init_new_io(http_op);
+
+ int ret = http_op->aio_read(dpp);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to fetch mdlog data" << dendl;
+ log_error() << "failed to send http operation: " << http_op->to_str() << " ret=" << ret << std::endl;
+ http_op->put();
+ return set_cr_error(ret);
+ }
+
+ return io_block(0);
+ }
+ yield {
+ int ret = http_op->wait(pbl, null_yield);
+ http_op->put();
+ if (ret < 0) {
+ return set_cr_error(ret);
+ }
+ return set_cr_done();
+ }
+ }
+ return 0;
+ }
+};
+
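+// apply a fetched metadata entry to the local metadata store; runs in the
+// async-rados thread pool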
+class RGWAsyncMetaStoreEntry : public RGWAsyncRadosRequest {
+ rgw::sal::RadosStore* store;
+ string raw_key;
+ bufferlist bl;
+ const DoutPrefixProvider *dpp;
+protected:
+ int _send_request(const DoutPrefixProvider *dpp) override {
+ int ret = store->ctl()->meta.mgr->put(raw_key, bl, null_yield, dpp, RGWMDLogSyncType::APPLY_ALWAYS, true);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: can't store key: " << raw_key << " ret=" << ret << dendl;
+ return ret;
+ }
+ return 0;
+ }
+public:
+ RGWAsyncMetaStoreEntry(RGWCoroutine *caller, RGWAioCompletionNotifier *cn, rgw::sal::RadosStore* _store,
+ const string& _raw_key,
+ bufferlist& _bl,
+ const DoutPrefixProvider *dpp) : RGWAsyncRadosRequest(caller, cn), store(_store),
+ raw_key(_raw_key), bl(_bl), dpp(dpp) {}
+};
+
+
+class RGWMetaStoreEntryCR : public RGWSimpleCoroutine {
+ RGWMetaSyncEnv *sync_env;
+ string raw_key;
+ bufferlist bl;
+
+ RGWAsyncMetaStoreEntry *req;
+
+public:
+ RGWMetaStoreEntryCR(RGWMetaSyncEnv *_sync_env,
+ const string& _raw_key,
+ bufferlist& _bl) : RGWSimpleCoroutine(_sync_env->cct), sync_env(_sync_env),
+ raw_key(_raw_key), bl(_bl), req(NULL) {
+ }
+
+ ~RGWMetaStoreEntryCR() override {
+ if (req) {
+ req->finish();
+ }
+ }
+
+ int send_request(const DoutPrefixProvider *dpp) override {
+ req = new RGWAsyncMetaStoreEntry(this, stack->create_completion_notifier(),
+ sync_env->store, raw_key, bl, dpp);
+ sync_env->async_rados->queue(req);
+ return 0;
+ }
+
+ int request_complete() override {
+ return req->get_ret_status();
+ }
+};
+
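+// remove a metadata entry from the local metadata store; runs in the
+// async-rados thread pool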
+class RGWAsyncMetaRemoveEntry : public RGWAsyncRadosRequest {
+ rgw::sal::RadosStore* store;
+ string raw_key;
+ const DoutPrefixProvider *dpp;
+protected:
+ int _send_request(const DoutPrefixProvider *dpp) override {
+ int ret = store->ctl()->meta.mgr->remove(raw_key, null_yield, dpp);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: can't remove key: " << raw_key << " ret=" << ret << dendl;
+ return ret;
+ }
+ return 0;
+ }
+public:
+ RGWAsyncMetaRemoveEntry(RGWCoroutine *caller, RGWAioCompletionNotifier *cn, rgw::sal::RadosStore* _store,
+ const string& _raw_key, const DoutPrefixProvider *dpp) : RGWAsyncRadosRequest(caller, cn), store(_store),
+ raw_key(_raw_key), dpp(dpp) {}
+};
+
+
+class RGWMetaRemoveEntryCR : public RGWSimpleCoroutine {
+ RGWMetaSyncEnv *sync_env;
+ string raw_key;
+
+ RGWAsyncMetaRemoveEntry *req;
+
+public:
+ RGWMetaRemoveEntryCR(RGWMetaSyncEnv *_sync_env,
+ const string& _raw_key) : RGWSimpleCoroutine(_sync_env->cct), sync_env(_sync_env),
+ raw_key(_raw_key), req(NULL) {
+ }
+
+ ~RGWMetaRemoveEntryCR() override {
+ if (req) {
+ req->finish();
+ }
+ }
+
+ int send_request(const DoutPrefixProvider *dpp) override {
+ req = new RGWAsyncMetaRemoveEntry(this, stack->create_completion_notifier(),
+ sync_env->store, raw_key, dpp);
+ sync_env->async_rados->queue(req);
+ return 0;
+ }
+
+ int request_complete() override {
+ int r = req->get_ret_status();
+ if (r == -ENOENT) {
+ r = 0;
+ }
+ return r;
+ }
+};
+
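+// number of completed entries to accumulate before a shard's sync marker is
+// flushed to RADOS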
+#define META_SYNC_UPDATE_MARKER_WINDOW 10
+
+
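+// ordering CR returned by allocate_order_control_cr(): if several marker
+// updates are queued while one is still being written, only the most recently
+// queued update is executed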
+int RGWLastCallerWinsCR::operate(const DoutPrefixProvider *dpp) {
+ RGWCoroutine *call_cr;
+ reenter(this) {
+ while (cr) {
+ call_cr = cr;
+ cr = nullptr;
+ yield call(call_cr);
+ /* cr might have been modified at this point */
+ if (retcode < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: RGWLastCallerWinsCR() failed: retcode=" << retcode << dendl;
+ return set_cr_error(retcode);
+ }
+ }
+ return set_cr_done();
+ }
+ return 0;
+}
+
+class RGWMetaSyncShardMarkerTrack : public RGWSyncShardMarkerTrack<string, string> {
+ RGWMetaSyncEnv *sync_env;
+
+ string marker_oid;
+ rgw_meta_sync_marker sync_marker;
+
+ RGWSyncTraceNodeRef tn;
+
+public:
+ RGWMetaSyncShardMarkerTrack(RGWMetaSyncEnv *_sync_env,
+ const string& _marker_oid,
+ const rgw_meta_sync_marker& _marker,
+ RGWSyncTraceNodeRef& _tn) : RGWSyncShardMarkerTrack(META_SYNC_UPDATE_MARKER_WINDOW),
+ sync_env(_sync_env),
+ marker_oid(_marker_oid),
+ sync_marker(_marker),
+ tn(_tn){}
+
+ RGWCoroutine *store_marker(const string& new_marker, uint64_t index_pos, const real_time& timestamp) override {
+ sync_marker.marker = new_marker;
+ if (index_pos > 0) {
+ sync_marker.pos = index_pos;
+ }
+
+ if (!real_clock::is_zero(timestamp)) {
+ sync_marker.timestamp = timestamp;
+ }
+
+ ldpp_dout(sync_env->dpp, 20) << __func__ << "(): updating marker marker_oid=" << marker_oid << " marker=" << new_marker << " realm_epoch=" << sync_marker.realm_epoch << dendl;
+ tn->log(20, SSTR("new marker=" << new_marker));
+ rgw::sal::RadosStore* store = sync_env->store;
+ return new RGWSimpleRadosWriteCR<rgw_meta_sync_marker>(sync_env->dpp, store,
+ rgw_raw_obj(store->svc()->zone->get_zone_params().log_pool, marker_oid),
+ sync_marker);
+ }
+
+ RGWOrderCallCR *allocate_order_control_cr() override {
+ return new RGWLastCallerWinsCR(sync_env->cct);
+ }
+};
+
+RGWMetaSyncSingleEntryCR::RGWMetaSyncSingleEntryCR(RGWMetaSyncEnv *_sync_env,
+ const string& _raw_key, const string& _entry_marker,
+ const RGWMDLogStatus& _op_status,
+ RGWMetaSyncShardMarkerTrack *_marker_tracker, const RGWSyncTraceNodeRef& _tn_parent) : RGWCoroutine(_sync_env->cct),
+ sync_env(_sync_env),
+ raw_key(_raw_key), entry_marker(_entry_marker),
+ op_status(_op_status),
+ pos(0), sync_status(0),
+ marker_tracker(_marker_tracker), tries(0) {
+ error_injection = (sync_env->cct->_conf->rgw_sync_meta_inject_err_probability > 0);
+ tn = sync_env->sync_tracer->add_node(_tn_parent, "entry", raw_key);
+}
+
+int RGWMetaSyncSingleEntryCR::operate(const DoutPrefixProvider *dpp) {
+ reenter(this) {
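+// number of attempts for fetching/storing an entry that hits a transient error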
+#define NUM_TRANSIENT_ERROR_RETRIES 10
+
+ if (error_injection &&
+ rand() % 10000 < cct->_conf->rgw_sync_meta_inject_err_probability * 10000.0) {
+ return set_cr_error(-EIO);
+ }
+
+ if (op_status != MDLOG_STATUS_COMPLETE) {
+ tn->log(20, "skipping pending operation");
+ yield call(marker_tracker->finish(entry_marker));
+ if (retcode < 0) {
+ return set_cr_error(retcode);
+ }
+ return set_cr_done();
+ }
+ tn->set_flag(RGW_SNS_FLAG_ACTIVE);
+ for (tries = 0; tries < NUM_TRANSIENT_ERROR_RETRIES; tries++) {
+ yield {
+ pos = raw_key.find(':');
+ section = raw_key.substr(0, pos);
+ key = raw_key.substr(pos + 1);
+ tn->log(10, SSTR("fetching remote metadata entry" << (tries == 0 ? "" : " (retry)")));
+ call(new RGWReadRemoteMetadataCR(sync_env, section, key, &md_bl, tn));
+ }
+
+ sync_status = retcode;
+
+ if (sync_status == -ENOENT) {
+ break;
+ }
+
+ if (sync_status < 0) {
+ if (tries < NUM_TRANSIENT_ERROR_RETRIES - 1) {
+ ldpp_dout(dpp, 20) << *this << ": failed to fetch remote metadata entry: " << section << ":" << key << ", will retry" << dendl;
+ continue;
+ }
+
+ tn->log(10, SSTR("failed to read remote metadata entry: section=" << section << " key=" << key << " status=" << sync_status));
+ log_error() << "failed to read remote metadata entry: section=" << section << " key=" << key << " status=" << sync_status << std::endl;
+ yield call(sync_env->error_logger->log_error_cr(dpp, sync_env->conn->get_remote_id(), section, key, -sync_status,
+ string("failed to read remote metadata entry: ") + cpp_strerror(-sync_status)));
+ return set_cr_error(sync_status);
+ }
+
+ break;
+ }
+
+ retcode = 0;
+ for (tries = 0; tries < NUM_TRANSIENT_ERROR_RETRIES; tries++) {
+ if (sync_status != -ENOENT) {
+ tn->log(10, SSTR("storing local metadata entry: " << section << ":" << key));
+ yield call(new RGWMetaStoreEntryCR(sync_env, raw_key, md_bl));
+ } else {
+ tn->log(10, SSTR("removing local metadata entry:" << section << ":" << key));
+ yield call(new RGWMetaRemoveEntryCR(sync_env, raw_key));
+ if (retcode == -ENOENT) {
+ retcode = 0;
+ break;
+ }
+ }
+ if ((retcode < 0) && (tries < NUM_TRANSIENT_ERROR_RETRIES - 1)) {
+ ldpp_dout(dpp, 20) << *this << ": failed to store metadata entry: " << section << ":" << key << ", got retcode=" << retcode << ", will retry" << dendl;
+ continue;
+ }
+ break;
+ }
+
+ sync_status = retcode;
+
+ if (sync_status == 0 && marker_tracker) {
+ /* update marker */
+ yield call(marker_tracker->finish(entry_marker));
+ sync_status = retcode;
+ }
+ if (sync_status < 0) {
+ tn->log(10, SSTR("failed, status=" << sync_status));
+ return set_cr_error(sync_status);
+ }
+ tn->log(10, "success");
+ return set_cr_done();
+ }
+ return 0;
+}
+
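+// copy a chunk of a remote metadata log shard into the local mdlog: read the
+// remote shard status, fetch entries over REST (/admin/log), then store them
+// locally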
+class RGWCloneMetaLogCoroutine : public RGWCoroutine {
+ RGWMetaSyncEnv *sync_env;
+ RGWMetadataLog *mdlog;
+
+ const std::string& period;
+ int shard_id;
+ string marker;
+ bool truncated = false;
+ string *new_marker;
+
+ int max_entries = CLONE_MAX_ENTRIES;
+
+ RGWRESTReadResource *http_op = nullptr;
+ boost::intrusive_ptr<RGWMetadataLogInfoCompletion> completion;
+
+ RGWMetadataLogInfo shard_info;
+ rgw_mdlog_shard_data data;
+
+public:
+ RGWCloneMetaLogCoroutine(RGWMetaSyncEnv *_sync_env, RGWMetadataLog* mdlog,
+ const std::string& period, int _id,
+ const string& _marker, string *_new_marker)
+ : RGWCoroutine(_sync_env->cct), sync_env(_sync_env), mdlog(mdlog),
+ period(period), shard_id(_id), marker(_marker), new_marker(_new_marker) {
+ if (new_marker) {
+ *new_marker = marker;
+ }
+ }
+ ~RGWCloneMetaLogCoroutine() override {
+ if (http_op) {
+ http_op->put();
+ }
+ if (completion) {
+ completion->cancel();
+ }
+ }
+
+ int operate(const DoutPrefixProvider *dpp) override;
+
+ int state_init();
+ int state_read_shard_status();
+ int state_read_shard_status_complete();
+ int state_send_rest_request(const DoutPrefixProvider *dpp);
+ int state_receive_rest_response();
+ int state_store_mdlog_entries();
+ int state_store_mdlog_entries_complete();
+};
+
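+// sync a single metadata log shard: full sync drains the per-shard omap index
+// of metadata keys, incremental sync follows the cloned mdlog from the stored
+// marker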
+class RGWMetaSyncShardCR : public RGWCoroutine {
+ RGWMetaSyncEnv *sync_env;
+
+ const rgw_pool& pool;
+ const std::string& period; //< currently syncing period id
+ const epoch_t realm_epoch; //< realm_epoch of period
+ RGWMetadataLog* mdlog; //< log of syncing period
+ uint32_t shard_id;
+ rgw_meta_sync_marker& sync_marker;
+ boost::optional<rgw_meta_sync_marker> temp_marker; //< for pending updates
+ string marker;
+ string max_marker;
+ const std::string& period_marker; //< max marker stored in next period
+
+ RGWRadosGetOmapKeysCR::ResultPtr omapkeys;
+ std::set<std::string> entries;
+ std::set<std::string>::iterator iter;
+
+ string oid;
+
+ RGWMetaSyncShardMarkerTrack *marker_tracker = nullptr;
+
+ list<cls_log_entry> log_entries;
+ list<cls_log_entry>::iterator log_iter;
+ bool truncated = false;
+
+ string mdlog_marker;
+ string raw_key;
+ rgw_mdlog_entry mdlog_entry;
+
+ ceph::mutex inc_lock = ceph::make_mutex("RGWMetaSyncShardCR::inc_lock");
+ ceph::condition_variable inc_cond;
+
+ boost::asio::coroutine incremental_cr;
+ boost::asio::coroutine full_cr;
+
+ boost::intrusive_ptr<RGWContinuousLeaseCR> lease_cr;
+ boost::intrusive_ptr<RGWCoroutinesStack> lease_stack;
+
+ bool lost_lock = false;
+
+ bool *reset_backoff;
+
+ // hold a reference to the cr stack while it's in the map
+ using StackRef = boost::intrusive_ptr<RGWCoroutinesStack>;
+ map<StackRef, string> stack_to_pos;
+ map<string, string> pos_to_prev;
+
+ bool can_adjust_marker = false;
+ bool done_with_period = false;
+
+ int total_entries = 0;
+
+ RGWSyncTraceNodeRef tn;
+public:
+ RGWMetaSyncShardCR(RGWMetaSyncEnv *_sync_env, const rgw_pool& _pool,
+ const std::string& period, epoch_t realm_epoch,
+ RGWMetadataLog* mdlog, uint32_t _shard_id,
+ rgw_meta_sync_marker& _marker,
+ const std::string& period_marker, bool *_reset_backoff,
+ RGWSyncTraceNodeRef& _tn)
+ : RGWCoroutine(_sync_env->cct), sync_env(_sync_env), pool(_pool),
+ period(period), realm_epoch(realm_epoch), mdlog(mdlog),
+ shard_id(_shard_id), sync_marker(_marker),
+ period_marker(period_marker),
+ reset_backoff(_reset_backoff), tn(_tn) {
+ *reset_backoff = false;
+ }
+
+ ~RGWMetaSyncShardCR() override {
+ delete marker_tracker;
+ if (lease_cr) {
+ lease_cr->abort();
+ }
+ }
+
+ void set_marker_tracker(RGWMetaSyncShardMarkerTrack *mt) {
+ delete marker_tracker;
+ marker_tracker = mt;
+ }
+
+ int operate(const DoutPrefixProvider *dpp) override {
+ int r;
+ while (true) {
+ switch (sync_marker.state) {
+ case rgw_meta_sync_marker::FullSync:
+ r = full_sync();
+ if (r < 0) {
+ ldpp_dout(dpp, 10) << "sync: full_sync: shard_id=" << shard_id << " r=" << r << dendl;
+ return set_cr_error(r);
+ }
+ return 0;
+ case rgw_meta_sync_marker::IncrementalSync:
+ r = incremental_sync();
+ if (r < 0) {
+ ldpp_dout(dpp, 10) << "sync: incremental_sync: shard_id=" << shard_id << " r=" << r << dendl;
+ return set_cr_error(r);
+ }
+ return 0;
+ }
+ }
+ /* unreachable */
+ return 0;
+ }
+
+ void collect_children()
+ {
+ int child_ret;
+ RGWCoroutinesStack *child;
+ while (collect_next(&child_ret, &child)) {
+ auto iter = stack_to_pos.find(child);
+ if (iter == stack_to_pos.end()) {
+ /* some other stack that we don't care about */
+ continue;
+ }
+
+ string& pos = iter->second;
+
+ if (child_ret < 0) {
+ ldpp_dout(sync_env->dpp, 0) << *this << ": child operation stack=" << child << " entry=" << pos << " returned " << child_ret << dendl;
+ // on any error code from RGWMetaSyncSingleEntryCR, we do not advance
+ // the sync status marker past this entry, and set
+ // can_adjust_marker=false to exit out of RGWMetaSyncShardCR.
+ // RGWMetaSyncShardControlCR will rerun RGWMetaSyncShardCR from the
+ // previous marker and retry
+ can_adjust_marker = false;
+ }
+
+ map<string, string>::iterator prev_iter = pos_to_prev.find(pos);
+ ceph_assert(prev_iter != pos_to_prev.end());
+
+ if (pos_to_prev.size() == 1) {
+ if (can_adjust_marker) {
+ sync_marker.marker = pos;
+ }
+ pos_to_prev.erase(prev_iter);
+ } else {
+ ceph_assert(pos_to_prev.size() > 1);
+ pos_to_prev.erase(prev_iter);
+ prev_iter = pos_to_prev.begin();
+ if (can_adjust_marker) {
+ sync_marker.marker = prev_iter->second;
+ }
+ }
+
+ ldpp_dout(sync_env->dpp, 4) << *this << ": adjusting marker pos=" << sync_marker.marker << dendl;
+ stack_to_pos.erase(iter);
+ }
+ }
+
+ int full_sync() {
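+// max omap keys to fetch per RGWRadosGetOmapKeysCR call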
+#define OMAP_GET_MAX_ENTRIES 100
+ int max_entries = OMAP_GET_MAX_ENTRIES;
+ reenter(&full_cr) {
+ set_status("full_sync");
+ tn->log(10, "start full sync");
+ oid = full_sync_index_shard_oid(shard_id);
+ can_adjust_marker = true;
+ /* grab lock */
+ yield {
+ uint32_t lock_duration = cct->_conf->rgw_sync_lease_period;
+ string lock_name = "sync_lock";
+ rgw::sal::RadosStore* store = sync_env->store;
+ lease_cr.reset(new RGWContinuousLeaseCR(sync_env->async_rados, store,
+ rgw_raw_obj(pool, sync_env->shard_obj_name(shard_id)),
+ lock_name, lock_duration, this, nullptr));
+ lease_stack.reset(spawn(lease_cr.get(), false));
+ lost_lock = false;
+ }
+ while (!lease_cr->is_locked()) {
+ if (lease_cr->is_done()) {
+ drain_all();
+ tn->log(5, "failed to take lease");
+ return lease_cr->get_ret_status();
+ }
+ set_sleeping(true);
+ yield;
+ }
+ tn->log(10, "took lease");
+
+ /* lock succeeded, a retry now should avoid previous backoff status */
+ *reset_backoff = true;
+
+ /* prepare marker tracker */
+ set_marker_tracker(new RGWMetaSyncShardMarkerTrack(sync_env,
+ sync_env->shard_obj_name(shard_id),
+ sync_marker, tn));
+
+ marker = sync_marker.marker;
+
+ total_entries = sync_marker.pos;
+
+ /* sync! */
+ do {
+ if (!lease_cr->is_locked()) {
+ tn->log(1, "lease is lost, abort");
+ lost_lock = true;
+ break;
+ }
+ omapkeys = std::make_shared<RGWRadosGetOmapKeysCR::Result>();
+ yield call(new RGWRadosGetOmapKeysCR(sync_env->store, rgw_raw_obj(pool, oid),
+ marker, max_entries, omapkeys));
+ if (retcode < 0) {
+ ldpp_dout(sync_env->dpp, 0) << "ERROR: " << __func__ << "(): RGWRadosGetOmapKeysCR() returned ret=" << retcode << dendl;
+ tn->log(0, SSTR("ERROR: failed to list omap keys, status=" << retcode));
+ yield lease_cr->go_down();
+ drain_all();
+ return retcode;
+ }
+ entries = std::move(omapkeys->entries);
+ tn->log(20, SSTR("retrieved " << entries.size() << " entries to sync"));
+ if (entries.size() > 0) {
+ tn->set_flag(RGW_SNS_FLAG_ACTIVE); /* actually have entries to sync */
+ }
+ iter = entries.begin();
+ for (; iter != entries.end(); ++iter) {
+ marker = *iter;
+ tn->log(20, SSTR("full sync: " << marker));
+ total_entries++;
+ if (!marker_tracker->start(marker, total_entries, real_time())) {
+ tn->log(0, SSTR("ERROR: cannot start syncing " << marker << ". Duplicate entry?"));
+ } else {
+ // fetch remote and write locally
+ yield {
+ RGWCoroutinesStack *stack = spawn(new RGWMetaSyncSingleEntryCR(sync_env, marker, marker, MDLOG_STATUS_COMPLETE, marker_tracker, tn), false);
+ // stack_to_pos holds a reference to the stack
+ stack_to_pos[stack] = marker;
+ pos_to_prev[marker] = marker;
+ }
+ // limit spawn window
+ while (num_spawned() > static_cast<size_t>(cct->_conf->rgw_meta_sync_spawn_window)) {
+ yield wait_for_child();
+ collect_children();
+ }
+ }
+ }
+ collect_children();
+ } while (omapkeys->more && can_adjust_marker);
+
+ tn->unset_flag(RGW_SNS_FLAG_ACTIVE); /* done listing entries for full sync */
+
+ while (num_spawned() > 1) {
+ yield wait_for_child();
+ collect_children();
+ }
+
+ if (!lost_lock) {
+ /* update marker to reflect we're done with full sync */
+ if (can_adjust_marker) {
+ // apply updates to a temporary marker, or operate() will send us
+ // to incremental_sync() after we yield
+ temp_marker = sync_marker;
+ temp_marker->state = rgw_meta_sync_marker::IncrementalSync;
+ temp_marker->marker = std::move(temp_marker->next_step_marker);
+ temp_marker->next_step_marker.clear();
+ temp_marker->realm_epoch = realm_epoch;
+ ldpp_dout(sync_env->dpp, 4) << *this << ": saving marker pos=" << temp_marker->marker << " realm_epoch=" << realm_epoch << dendl;
+
+ using WriteMarkerCR = RGWSimpleRadosWriteCR<rgw_meta_sync_marker>;
+ yield call(new WriteMarkerCR(sync_env->dpp, sync_env->store,
+ rgw_raw_obj(pool, sync_env->shard_obj_name(shard_id)),
+ *temp_marker));
+ }
+
+ if (retcode < 0) {
+ ldpp_dout(sync_env->dpp, 0) << "ERROR: failed to set sync marker: retcode=" << retcode << dendl;
+ yield lease_cr->go_down();
+ drain_all();
+ return retcode;
+ }
+ // clean up full sync index
+ yield {
+ auto oid = full_sync_index_shard_oid(shard_id);
+ call(new RGWRadosRemoveCR(sync_env->store, {pool, oid}));
+ }
+ }
+
+ /*
+ * both the lost-lock and the normal completion paths converge here: release
+ * the lease, drain any remaining children, then report the outcome below
+ */
+
+ yield lease_cr->go_down();
+
+ lease_cr.reset();
+
+ drain_all();
+
+ if (!can_adjust_marker) {
+ return -EAGAIN;
+ }
+
+ if (lost_lock) {
+ return -EBUSY;
+ }
+
+ tn->log(10, "full sync complete");
+
+ // apply the sync marker update
+ ceph_assert(temp_marker);
+ sync_marker = std::move(*temp_marker);
+ temp_marker = boost::none;
+ // must not yield after this point!
+ }
+ return 0;
+ }
+
+
+ int incremental_sync() {
+ reenter(&incremental_cr) {
+ set_status("incremental_sync");
+ tn->log(10, "start incremental sync");
+ can_adjust_marker = true;
+ /* grab lock */
+ if (!lease_cr) { /* could have had a lease_cr lock from previous state */
+ yield {
+ uint32_t lock_duration = cct->_conf->rgw_sync_lease_period;
+ string lock_name = "sync_lock";
+ rgw::sal::RadosStore* store = sync_env->store;
+ lease_cr.reset( new RGWContinuousLeaseCR(sync_env->async_rados, store,
+ rgw_raw_obj(pool, sync_env->shard_obj_name(shard_id)),
+ lock_name, lock_duration, this, nullptr));
+ lease_stack.reset(spawn(lease_cr.get(), false));
+ lost_lock = false;
+ }
+ while (!lease_cr->is_locked()) {
+ if (lease_cr->is_done()) {
+ drain_all();
+ tn->log(5, "failed to take lease");
+ return lease_cr->get_ret_status();
+ }
+ set_sleeping(true);
+ yield;
+ }
+ }
+ tn->log(10, "took lease");
+ // if the period has advanced, we can't use the existing marker
+ if (sync_marker.realm_epoch < realm_epoch) {
+ ldpp_dout(sync_env->dpp, 4) << "clearing marker=" << sync_marker.marker
+ << " from old realm_epoch=" << sync_marker.realm_epoch
+ << " (now " << realm_epoch << ')' << dendl;
+ sync_marker.realm_epoch = realm_epoch;
+ sync_marker.marker.clear();
+ }
+ mdlog_marker = sync_marker.marker;
+ set_marker_tracker(new RGWMetaSyncShardMarkerTrack(sync_env,
+ sync_env->shard_obj_name(shard_id),
+ sync_marker, tn));
+
+ /*
+ * mdlog_marker: the remote sync marker position
+ * sync_marker: the local sync marker position
+ * max_marker: the max mdlog position that we fetched
+ * marker: the current position we try to sync
+ * period_marker: the last marker before the next period begins (optional)
+ */
+ marker = max_marker = sync_marker.marker;
+ /* inc sync */
+ do {
+ if (!lease_cr->is_locked()) {
+ lost_lock = true;
+ tn->log(1, "lease is lost, abort");
+ break;
+ }
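+// max mdlog entries to read per RGWReadMDLogEntriesCR call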
+#define INCREMENTAL_MAX_ENTRIES 100
+ ldpp_dout(sync_env->dpp, 20) << __func__ << ":" << __LINE__ << ": shard_id=" << shard_id << " mdlog_marker=" << mdlog_marker << " sync_marker.marker=" << sync_marker.marker << " period_marker=" << period_marker << " truncated=" << truncated << dendl;
+ if (!period_marker.empty() && period_marker <= mdlog_marker) {
+ tn->log(10, SSTR("finished syncing current period: mdlog_marker=" << mdlog_marker << " sync_marker=" << sync_marker.marker << " period_marker=" << period_marker));
+ done_with_period = true;
+ break;
+ }
+ if (mdlog_marker <= max_marker || !truncated) {
+ /* we're at the tip, try to bring more entries */
+ ldpp_dout(sync_env->dpp, 20) << __func__ << ":" << __LINE__ << ": shard_id=" << shard_id << " syncing mdlog for shard_id=" << shard_id << dendl;
+ yield call(new RGWCloneMetaLogCoroutine(sync_env, mdlog,
+ period, shard_id,
+ mdlog_marker, &mdlog_marker));
+ }
+ if (retcode < 0) {
+ tn->log(10, SSTR(*this << ": failed to fetch more log entries, retcode=" << retcode));
+ yield lease_cr->go_down();
+ drain_all();
+ *reset_backoff = false; // back off and try again later
+ return retcode;
+ }
+ truncated = true;
+ *reset_backoff = true; /* if we got to this point, all systems function */
+ if (mdlog_marker > max_marker) {
+ tn->set_flag(RGW_SNS_FLAG_ACTIVE); /* actually have entries to sync */
+ tn->log(20, SSTR("mdlog_marker=" << mdlog_marker << " sync_marker=" << sync_marker.marker));
+ marker = max_marker;
+ yield call(new RGWReadMDLogEntriesCR(sync_env, mdlog, shard_id,
+ &max_marker, INCREMENTAL_MAX_ENTRIES,
+ &log_entries, &truncated));
+ if (retcode < 0) {
+ tn->log(10, SSTR("failed to list mdlog entries, retcode=" << retcode));
+ yield lease_cr->go_down();
+ drain_all();
+ *reset_backoff = false; // back off and try again later
+ return retcode;
+ }
+ for (log_iter = log_entries.begin(); log_iter != log_entries.end() && !done_with_period; ++log_iter) {
+ if (!period_marker.empty() && period_marker <= log_iter->id) {
+ done_with_period = true;
+ if (period_marker < log_iter->id) {
+ tn->log(10, SSTR("found key=" << log_iter->id
+ << " past period_marker=" << period_marker));
+ break;
+ }
+ ldpp_dout(sync_env->dpp, 10) << "found key at period_marker=" << period_marker << dendl;
+ // sync this entry, then return control to RGWMetaSyncCR
+ }
+ if (!mdlog_entry.convert_from(*log_iter)) {
+ tn->log(0, SSTR("ERROR: failed to convert mdlog entry, shard_id=" << shard_id << " log_entry: " << log_iter->id << ":" << log_iter->section << ":" << log_iter->name << ":" << log_iter->timestamp << " ... skipping entry"));
+ continue;
+ }
+ tn->log(20, SSTR("log_entry: " << log_iter->id << ":" << log_iter->section << ":" << log_iter->name << ":" << log_iter->timestamp));
+ if (!marker_tracker->start(log_iter->id, 0, log_iter->timestamp.to_real_time())) {
+ ldpp_dout(sync_env->dpp, 0) << "ERROR: cannot start syncing " << log_iter->id << ". Duplicate entry?" << dendl;
+ } else {
+ raw_key = log_iter->section + ":" + log_iter->name;
+ yield {
+ RGWCoroutinesStack *stack = spawn(new RGWMetaSyncSingleEntryCR(sync_env, raw_key, log_iter->id, mdlog_entry.log_data.status, marker_tracker, tn), false);
+ ceph_assert(stack);
+ // stack_to_pos holds a reference to the stack
+ stack_to_pos[stack] = log_iter->id;
+ pos_to_prev[log_iter->id] = marker;
+ }
+ // limit spawn window
+ while (num_spawned() > static_cast<size_t>(cct->_conf->rgw_meta_sync_spawn_window)) {
+ yield wait_for_child();
+ collect_children();
+ }
+ }
+ marker = log_iter->id;
+ }
+ }
+ collect_children();
+ ldpp_dout(sync_env->dpp, 20) << __func__ << ":" << __LINE__ << ": shard_id=" << shard_id << " mdlog_marker=" << mdlog_marker << " max_marker=" << max_marker << " sync_marker.marker=" << sync_marker.marker << " period_marker=" << period_marker << dendl;
+ if (done_with_period) {
+ // return control to RGWMetaSyncCR and advance to the next period
+ tn->log(10, SSTR(*this << ": done with period"));
+ break;
+ }
+ if (mdlog_marker == max_marker && can_adjust_marker) {
+ tn->unset_flag(RGW_SNS_FLAG_ACTIVE);
+ yield wait(utime_t(cct->_conf->rgw_meta_sync_poll_interval, 0));
+ }
+ } while (can_adjust_marker);
+
+ tn->unset_flag(RGW_SNS_FLAG_ACTIVE);
+
+ while (num_spawned() > 1) {
+ yield wait_for_child();
+ collect_children();
+ }
+
+ yield lease_cr->go_down();
+
+ drain_all();
+
+ if (lost_lock) {
+ return -EBUSY;
+ }
+
+ if (!can_adjust_marker) {
+ return -EAGAIN;
+ }
+
+ return set_cr_done();
+ }
+ /* TODO */
+ return 0;
+ }
+};
+
+class RGWMetaSyncShardControlCR : public RGWBackoffControlCR
+{
+ RGWMetaSyncEnv *sync_env;
+
+ const rgw_pool& pool;
+ const std::string& period;
+ epoch_t realm_epoch;
+ RGWMetadataLog* mdlog;
+ uint32_t shard_id;
+ rgw_meta_sync_marker sync_marker;
+ const std::string period_marker;
+
+ RGWSyncTraceNodeRef tn;
+
+ static constexpr bool exit_on_error = false; // retry on all errors
+public:
+ RGWMetaSyncShardControlCR(RGWMetaSyncEnv *_sync_env, const rgw_pool& _pool,
+ const std::string& period, epoch_t realm_epoch,
+ RGWMetadataLog* mdlog, uint32_t _shard_id,
+ const rgw_meta_sync_marker& _marker,
+ std::string&& period_marker,
+ RGWSyncTraceNodeRef& _tn_parent)
+ : RGWBackoffControlCR(_sync_env->cct, exit_on_error), sync_env(_sync_env),
+ pool(_pool), period(period), realm_epoch(realm_epoch), mdlog(mdlog),
+ shard_id(_shard_id), sync_marker(_marker),
+ period_marker(std::move(period_marker)) {
+ tn = sync_env->sync_tracer->add_node(_tn_parent, "shard",
+ std::to_string(shard_id));
+ }
+
+ RGWCoroutine *alloc_cr() override {
+ return new RGWMetaSyncShardCR(sync_env, pool, period, realm_epoch, mdlog,
+ shard_id, sync_marker, period_marker, backoff_ptr(), tn);
+ }
+
+ RGWCoroutine *alloc_finisher_cr() override {
+ rgw::sal::RadosStore* store = sync_env->store;
+ return new RGWSimpleRadosReadCR<rgw_meta_sync_marker>(sync_env->dpp, store,
+ rgw_raw_obj(pool, sync_env->shard_obj_name(shard_id)),
+ &sync_marker);
+ }
+};
+
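+// top-level metadata sync coroutine: spawns one RGWMetaSyncShardControlCR per
+// shard for the current period, waits for them to finish, then advances
+// through the period history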
+class RGWMetaSyncCR : public RGWCoroutine {
+ RGWMetaSyncEnv *sync_env;
+ const rgw_pool& pool;
+ RGWPeriodHistory::Cursor cursor; //< sync position in period history
+ RGWPeriodHistory::Cursor next; //< next period in history
+ rgw_meta_sync_status sync_status;
+ RGWSyncTraceNodeRef tn;
+
+ std::mutex mutex; //< protect access to shard_crs
+
+ // TODO: it should be enough to hold a reference on the stack only, as calling
+ // RGWCoroutinesStack::wakeup() doesn't refer to the RGWCoroutine if it has
+ // already completed
+ using ControlCRRef = boost::intrusive_ptr<RGWMetaSyncShardControlCR>;
+ using StackRef = boost::intrusive_ptr<RGWCoroutinesStack>;
+ using RefPair = std::pair<ControlCRRef, StackRef>;
+ map<int, RefPair> shard_crs;
+ int ret{0};
+
+public:
+ RGWMetaSyncCR(RGWMetaSyncEnv *_sync_env, const RGWPeriodHistory::Cursor &cursor,
+ const rgw_meta_sync_status& _sync_status, RGWSyncTraceNodeRef& _tn)
+ : RGWCoroutine(_sync_env->cct), sync_env(_sync_env),
+ pool(sync_env->store->svc()->zone->get_zone_params().log_pool),
+ cursor(cursor), sync_status(_sync_status), tn(_tn) {}
+
+ ~RGWMetaSyncCR() {
+ }
+
+ int operate(const DoutPrefixProvider *dpp) override {
+ reenter(this) {
+ // loop through one period at a time
+ tn->log(1, "start");
+ for (;;) {
+ if (cursor == sync_env->store->svc()->mdlog->get_period_history()->get_current()) {
+ next = RGWPeriodHistory::Cursor{};
+ if (cursor) {
+ ldpp_dout(dpp, 10) << "RGWMetaSyncCR on current period="
+ << cursor.get_period().get_id() << dendl;
+ } else {
+ ldpp_dout(dpp, 10) << "RGWMetaSyncCR with no period" << dendl;
+ }
+ } else {
+ next = cursor;
+ next.next();
+ ldpp_dout(dpp, 10) << "RGWMetaSyncCR on period="
+ << cursor.get_period().get_id() << ", next="
+ << next.get_period().get_id() << dendl;
+ }
+
+ yield {
+ // get the mdlog for the current period (may be empty)
+ auto& period_id = sync_status.sync_info.period;
+ auto realm_epoch = sync_status.sync_info.realm_epoch;
+ auto mdlog = sync_env->store->svc()->mdlog->get_log(period_id);
+
+ tn->log(1, SSTR("realm epoch=" << realm_epoch << " period id=" << period_id));
+
+ // prevent wakeup() from accessing shard_crs while we're spawning them
+ std::lock_guard<std::mutex> lock(mutex);
+
+ // sync this period on each shard
+ for (const auto& m : sync_status.sync_markers) {
+ uint32_t shard_id = m.first;
+ auto& marker = m.second;
+
+ std::string period_marker;
+ if (next) {
+ // read the maximum marker from the next period's sync status
+ period_marker = next.get_period().get_sync_status()[shard_id];
+ if (period_marker.empty()) {
+ // no metadata changes have occurred on this shard, skip it
+ ldpp_dout(dpp, 10) << "RGWMetaSyncCR: skipping shard " << shard_id
+ << " with empty period marker" << dendl;
+ continue;
+ }
+ }
+
+ using ShardCR = RGWMetaSyncShardControlCR;
+ auto cr = new ShardCR(sync_env, pool, period_id, realm_epoch,
+ mdlog, shard_id, marker,
+ std::move(period_marker), tn);
+ auto stack = spawn(cr, false);
+ shard_crs[shard_id] = RefPair{cr, stack};
+ }
+ }
+ // wait for each shard to complete
+ while (ret == 0 && num_spawned() > 0) {
+ yield wait_for_child();
+ collect(&ret, nullptr);
+ }
+ drain_all();
+ {
+ // drop shard cr refs under lock
+ std::lock_guard<std::mutex> lock(mutex);
+ shard_crs.clear();
+ }
+ if (ret < 0) {
+ return set_cr_error(ret);
+ }
+ // advance to the next period
+ ceph_assert(next);
+ cursor = next;
+
+ // write the updated sync info
+ sync_status.sync_info.period = cursor.get_period().get_id();
+ sync_status.sync_info.realm_epoch = cursor.get_epoch();
+ yield call(new RGWSimpleRadosWriteCR<rgw_meta_sync_info>(dpp, sync_env->store,
+ rgw_raw_obj(pool, sync_env->status_oid()),
+ sync_status.sync_info));
+ }
+ }
+ return 0;
+ }
+
+ void wakeup(int shard_id) {
+ std::lock_guard<std::mutex> lock(mutex);
+ auto iter = shard_crs.find(shard_id);
+ if (iter == shard_crs.end()) {
+ return;
+ }
+ iter->second.first->wakeup();
+ }
+};
+
+void RGWRemoteMetaLog::init_sync_env(RGWMetaSyncEnv *env) {
+ env->dpp = dpp;
+ env->cct = store->ctx();
+ env->store = store;
+ env->conn = conn;
+ env->async_rados = async_rados;
+ env->http_manager = &http_manager;
+ env->error_logger = error_logger;
+ env->sync_tracer = store->getRados()->get_sync_tracer();
+}
+
+int RGWRemoteMetaLog::read_sync_status(const DoutPrefixProvider *dpp, rgw_meta_sync_status *sync_status)
+{
+ if (store->svc()->zone->is_meta_master()) {
+ return 0;
+ }
+ // cannot run concurrently with run_sync(), so run in a separate manager
+ RGWCoroutinesManager crs(store->ctx(), store->getRados()->get_cr_registry());
+ RGWHTTPManager http_manager(store->ctx(), crs.get_completion_mgr());
+ int ret = http_manager.start();
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "failed in http_manager.start() ret=" << ret << dendl;
+ return ret;
+ }
+ RGWMetaSyncEnv sync_env_local = sync_env;
+ sync_env_local.http_manager = &http_manager;
+ tn->log(20, "read sync status");
+ ret = crs.run(dpp, new RGWReadSyncStatusCoroutine(&sync_env_local, sync_status));
+ http_manager.stop();
+ return ret;
+}
+
+int RGWRemoteMetaLog::init_sync_status(const DoutPrefixProvider *dpp)
+{
+ if (store->svc()->zone->is_meta_master()) {
+ return 0;
+ }
+
+ rgw_mdlog_info mdlog_info;
+ int r = read_log_info(dpp, &mdlog_info);
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << "ERROR: fail to fetch master log info (r=" << r << ")" << dendl;
+ return r;
+ }
+
+ rgw_meta_sync_info sync_info;
+ sync_info.num_shards = mdlog_info.num_shards;
+ auto cursor = store->svc()->mdlog->get_period_history()->get_current();
+ if (cursor) {
+ sync_info.period = cursor.get_period().get_id();
+ sync_info.realm_epoch = cursor.get_epoch();
+ }
+
+ return run(dpp, new RGWInitSyncStatusCoroutine(&sync_env, sync_info));
+}
+
+int RGWRemoteMetaLog::store_sync_info(const DoutPrefixProvider *dpp, const rgw_meta_sync_info& sync_info)
+{
+ tn->log(20, "store sync info");
+ return run(dpp, new RGWSimpleRadosWriteCR<rgw_meta_sync_info>(dpp, store,
+ rgw_raw_obj(store->svc()->zone->get_zone_params().log_pool, sync_env.status_oid()),
+ sync_info));
+}
+
+// return a cursor to the period at our sync position
+static RGWPeriodHistory::Cursor get_period_at(const DoutPrefixProvider *dpp,
+ rgw::sal::RadosStore* store,
+ const rgw_meta_sync_info& info,
+ optional_yield y)
+{
+ if (info.period.empty()) {
+ // return an empty cursor with error=0
+ return RGWPeriodHistory::Cursor{};
+ }
+
+ // look for an existing period in our history
+ auto cursor = store->svc()->mdlog->get_period_history()->lookup(info.realm_epoch);
+ if (cursor) {
+ // verify that the period ids match
+ auto& existing = cursor.get_period().get_id();
+ if (existing != info.period) {
+ ldpp_dout(dpp, -1) << "ERROR: sync status period=" << info.period
+ << " does not match period=" << existing
+ << " in history at realm epoch=" << info.realm_epoch << dendl;
+ return RGWPeriodHistory::Cursor{-EEXIST};
+ }
+ return cursor;
+ }
+
+ // read the period from rados or pull it from the master
+ RGWPeriod period;
+ int r = store->svc()->mdlog->pull_period(dpp, info.period, period, y);
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << "ERROR: failed to read period id "
+ << info.period << ": " << cpp_strerror(r) << dendl;
+ return RGWPeriodHistory::Cursor{r};
+ }
+ // attach the period to our history
+ cursor = store->svc()->mdlog->get_period_history()->attach(dpp, std::move(period), y);
+ if (!cursor) {
+ r = cursor.get_error();
+ ldpp_dout(dpp, -1) << "ERROR: failed to read period history back to "
+ << info.period << ": " << cpp_strerror(r) << dendl;
+ }
+ return cursor;
+}
+
+int RGWRemoteMetaLog::run_sync(const DoutPrefixProvider *dpp, optional_yield y)
+{
+ if (store->svc()->zone->is_meta_master()) {
+ return 0;
+ }
+
+ int r = 0;
+
+ // get shard count and oldest log period from master
+ rgw_mdlog_info mdlog_info;
+ for (;;) {
+ if (going_down) {
+ ldpp_dout(dpp, 1) << __func__ << "(): going down" << dendl;
+ return 0;
+ }
+ r = read_log_info(dpp, &mdlog_info);
+ if (r == -EIO || r == -ENOENT) {
+ // keep retrying if master isn't alive or hasn't initialized the log
+ ldpp_dout(dpp, 10) << __func__ << "(): waiting for master.." << dendl;
+ backoff.backoff_sleep();
+ continue;
+ }
+ backoff.reset();
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << "ERROR: fail to fetch master log info (r=" << r << ")" << dendl;
+ return r;
+ }
+ break;
+ }
+
+ rgw_meta_sync_status sync_status;
+ do {
+ if (going_down) {
+ ldpp_dout(dpp, 1) << __func__ << "(): going down" << dendl;
+ return 0;
+ }
+ r = run(dpp, new RGWReadSyncStatusCoroutine(&sync_env, &sync_status));
+ if (r < 0 && r != -ENOENT) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to fetch sync status r=" << r << dendl;
+ return r;
+ }
+
+ if (!mdlog_info.period.empty()) {
+ // restart sync if the remote has a period, but:
+ // a) our status does not, or
+ // b) our sync period comes before the remote's oldest log period
+ if (sync_status.sync_info.period.empty() ||
+ sync_status.sync_info.realm_epoch < mdlog_info.realm_epoch) {
+ sync_status.sync_info.state = rgw_meta_sync_info::StateInit;
+ string reason;
+ if (sync_status.sync_info.period.empty()) {
+ reason = "period is empty";
+ } else {
+ reason = SSTR("sync_info realm epoch is behind: " << sync_status.sync_info.realm_epoch << " < " << mdlog_info.realm_epoch);
+ }
+ tn->log(1, "initialize sync (reason: " + reason + ")");
+ ldpp_dout(dpp, 1) << "epoch=" << sync_status.sync_info.realm_epoch
+ << " in sync status comes before remote's oldest mdlog epoch="
+ << mdlog_info.realm_epoch << ", restarting sync" << dendl;
+ }
+ }
+
+ if (sync_status.sync_info.state == rgw_meta_sync_info::StateInit) {
+ ldpp_dout(dpp, 20) << __func__ << "(): init" << dendl;
+ sync_status.sync_info.num_shards = mdlog_info.num_shards;
+ auto cursor = store->svc()->mdlog->get_period_history()->get_current();
+ if (cursor) {
+ // run full sync, then start incremental from the current period/epoch
+ sync_status.sync_info.period = cursor.get_period().get_id();
+ sync_status.sync_info.realm_epoch = cursor.get_epoch();
+ }
+ r = run(dpp, new RGWInitSyncStatusCoroutine(&sync_env, sync_status.sync_info));
+ if (r == -EBUSY) {
+ backoff.backoff_sleep();
+ continue;
+ }
+ backoff.reset();
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to init sync status r=" << r << dendl;
+ return r;
+ }
+ }
+ } while (sync_status.sync_info.state == rgw_meta_sync_info::StateInit);
+
+ auto num_shards = sync_status.sync_info.num_shards;
+ if (num_shards != mdlog_info.num_shards) {
+ ldpp_dout(dpp, -1) << "ERROR: can't sync, mismatch between num shards, master num_shards=" << mdlog_info.num_shards << " local num_shards=" << num_shards << dendl;
+ return -EINVAL;
+ }
+
+ RGWPeriodHistory::Cursor cursor;
+ do {
+ r = run(dpp, new RGWReadSyncStatusCoroutine(&sync_env, &sync_status));
+ if (r < 0 && r != -ENOENT) {
+ tn->log(0, SSTR("ERROR: failed to fetch sync status r=" << r));
+ return r;
+ }
+
+ switch ((rgw_meta_sync_info::SyncState)sync_status.sync_info.state) {
+ case rgw_meta_sync_info::StateBuildingFullSyncMaps:
+ tn->log(20, "building full sync maps");
+ r = run(dpp, new RGWFetchAllMetaCR(&sync_env, num_shards, sync_status.sync_markers, tn));
+ if (r == -EBUSY || r == -EIO) {
+ backoff.backoff_sleep();
+ continue;
+ }
+ backoff.reset();
+ if (r < 0) {
+ tn->log(0, SSTR("ERROR: failed to fetch all metadata keys (r=" << r << ")"));
+ return r;
+ }
+
+ sync_status.sync_info.state = rgw_meta_sync_info::StateSync;
+ r = store_sync_info(dpp, sync_status.sync_info);
+ if (r < 0) {
+ tn->log(0, SSTR("ERROR: failed to update sync status (r=" << r << ")"));
+ return r;
+ }
+ /* fall through */
+ case rgw_meta_sync_info::StateSync:
+ tn->log(20, "sync");
+ // find our position in the period history (if any)
+ cursor = get_period_at(dpp, store, sync_status.sync_info, y);
+ r = cursor.get_error();
+ if (r < 0) {
+ return r;
+ }
+ meta_sync_cr = new RGWMetaSyncCR(&sync_env, cursor, sync_status, tn);
+ r = run(dpp, meta_sync_cr);
+ if (r < 0) {
+ tn->log(0, "ERROR: failed to fetch all metadata keys");
+ return r;
+ }
+ break;
+ default:
+ tn->log(0, "ERROR: bad sync state!");
+ return -EIO;
+ }
+ } while (!going_down);
+
+ return 0;
+}
+
+void RGWRemoteMetaLog::wakeup(int shard_id)
+{
+ if (!meta_sync_cr) {
+ return;
+ }
+ meta_sync_cr->wakeup(shard_id);
+}
+
+int RGWCloneMetaLogCoroutine::operate(const DoutPrefixProvider *dpp)
+{
+ reenter(this) {
+ do {
+ yield {
+ ldpp_dout(dpp, 20) << __func__ << ": shard_id=" << shard_id << ": init request" << dendl;
+ return state_init();
+ }
+ yield {
+ ldpp_dout(dpp, 20) << __func__ << ": shard_id=" << shard_id << ": reading shard status" << dendl;
+ return state_read_shard_status();
+ }
+ yield {
+ ldpp_dout(dpp, 20) << __func__ << ": shard_id=" << shard_id << ": reading shard status complete" << dendl;
+ return state_read_shard_status_complete();
+ }
+ yield {
+ ldpp_dout(dpp, 20) << __func__ << ": shard_id=" << shard_id << ": sending rest request" << dendl;
+ return state_send_rest_request(dpp);
+ }
+ yield {
+ ldpp_dout(dpp, 20) << __func__ << ": shard_id=" << shard_id << ": receiving rest response" << dendl;
+ return state_receive_rest_response();
+ }
+ yield {
+ ldpp_dout(dpp, 20) << __func__ << ": shard_id=" << shard_id << ": storing mdlog entries" << dendl;
+ return state_store_mdlog_entries();
+ }
+ } while (truncated);
+ yield {
+ ldpp_dout(dpp, 20) << __func__ << ": shard_id=" << shard_id << ": storing mdlog entries complete" << dendl;
+ return state_store_mdlog_entries_complete();
+ }
+ }
+
+ return 0;
+}
+
+int RGWCloneMetaLogCoroutine::state_init()
+{
+ data = rgw_mdlog_shard_data();
+
+ return 0;
+}
+
+int RGWCloneMetaLogCoroutine::state_read_shard_status()
+{
+ const bool add_ref = false; // default constructs with refs=1
+
+ completion.reset(new RGWMetadataLogInfoCompletion(
+ [this](int ret, const cls_log_header& header) {
+ if (ret < 0) {
+ if (ret != -ENOENT) {
+ ldpp_dout(sync_env->dpp, 1) << "ERROR: failed to read mdlog info with "
+ << cpp_strerror(ret) << dendl;
+ }
+ } else {
+ shard_info.marker = header.max_marker;
+ shard_info.last_update = header.max_time.to_real_time();
+ }
+ // wake up parent stack
+ io_complete();
+ }), add_ref);
+
+ int ret = mdlog->get_info_async(sync_env->dpp, shard_id, completion.get());
+ if (ret < 0) {
+ ldpp_dout(sync_env->dpp, 0) << "ERROR: mdlog->get_info_async() returned ret=" << ret << dendl;
+ return set_cr_error(ret);
+ }
+
+ return io_block(0);
+}
+
+int RGWCloneMetaLogCoroutine::state_read_shard_status_complete()
+{
+ completion.reset();
+
+ ldpp_dout(sync_env->dpp, 20) << "shard_id=" << shard_id << " marker=" << shard_info.marker << " last_update=" << shard_info.last_update << dendl;
+
+ marker = shard_info.marker;
+
+ return 0;
+}
+
+int RGWCloneMetaLogCoroutine::state_send_rest_request(const DoutPrefixProvider *dpp)
+{
+ RGWRESTConn *conn = sync_env->conn;
+
+ char buf[32];
+ snprintf(buf, sizeof(buf), "%d", shard_id);
+
+ char max_entries_buf[32];
+ snprintf(max_entries_buf, sizeof(max_entries_buf), "%d", max_entries);
+
+ const char *marker_key = (marker.empty() ? "" : "marker");
+
+ rgw_http_param_pair pairs[] = { { "type", "metadata" },
+ { "id", buf },
+ { "period", period.c_str() },
+ { "max-entries", max_entries_buf },
+ { marker_key, marker.c_str() },
+ { NULL, NULL } };
+
+ http_op = new RGWRESTReadResource(conn, "/admin/log", pairs, NULL, sync_env->http_manager);
+
+ init_new_io(http_op);
+
+ int ret = http_op->aio_read(dpp);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to fetch mdlog data" << dendl;
+ log_error() << "failed to send http operation: " << http_op->to_str() << " ret=" << ret << std::endl;
+ http_op->put();
+ http_op = NULL;
+ return set_cr_error(ret);
+ }
+
+ return io_block(0);
+}
+
+int RGWCloneMetaLogCoroutine::state_receive_rest_response()
+{
+ int ret = http_op->wait(&data, null_yield);
+ if (ret < 0) {
+ error_stream << "http operation failed: " << http_op->to_str() << " status=" << http_op->get_http_status() << std::endl;
+ ldpp_dout(sync_env->dpp, 5) << "failed to wait for op, ret=" << ret << dendl;
+ http_op->put();
+ http_op = NULL;
+ return set_cr_error(ret);
+ }
+ http_op->put();
+ http_op = NULL;
+
+ ldpp_dout(sync_env->dpp, 20) << "remote mdlog, shard_id=" << shard_id << " num of shard entries: " << data.entries.size() << dendl;
+
+ truncated = ((int)data.entries.size() == max_entries);
+
+ if (data.entries.empty()) {
+ if (new_marker) {
+ *new_marker = marker;
+ }
+ return set_cr_done();
+ }
+
+ if (new_marker) {
+ *new_marker = data.entries.back().id;
+ }
+
+ return 0;
+}
+
+
+int RGWCloneMetaLogCoroutine::state_store_mdlog_entries()
+{
+ list<cls_log_entry> dest_entries;
+
+ vector<rgw_mdlog_entry>::iterator iter;
+ for (iter = data.entries.begin(); iter != data.entries.end(); ++iter) {
+ rgw_mdlog_entry& entry = *iter;
+ ldpp_dout(sync_env->dpp, 20) << "entry: name=" << entry.name << dendl;
+
+ cls_log_entry dest_entry;
+ dest_entry.id = entry.id;
+ dest_entry.section = entry.section;
+ dest_entry.name = entry.name;
+ dest_entry.timestamp = utime_t(entry.timestamp);
+
+ encode(entry.log_data, dest_entry.data);
+
+ dest_entries.push_back(dest_entry);
+
+ marker = entry.id;
+ }
+
+ RGWAioCompletionNotifier *cn = stack->create_completion_notifier();
+
+ int ret = mdlog->store_entries_in_shard(sync_env->dpp, dest_entries, shard_id, cn->completion());
+ if (ret < 0) {
+ cn->put();
+ ldpp_dout(sync_env->dpp, 10) << "failed to store md log entries shard_id=" << shard_id << " ret=" << ret << dendl;
+ return set_cr_error(ret);
+ }
+ return io_block(0);
+}
+
+int RGWCloneMetaLogCoroutine::state_store_mdlog_entries_complete()
+{
+ return set_cr_done();
+}
+
+void rgw_meta_sync_info::decode_json(JSONObj *obj)
+{
+ string s;
+ JSONDecoder::decode_json("status", s, obj);
+ if (s == "init") {
+ state = StateInit;
+ } else if (s == "building-full-sync-maps") {
+ state = StateBuildingFullSyncMaps;
+ } else if (s == "sync") {
+ state = StateSync;
+ }
+ JSONDecoder::decode_json("num_shards", num_shards, obj);
+ JSONDecoder::decode_json("period", period, obj);
+ JSONDecoder::decode_json("realm_epoch", realm_epoch, obj);
+}
+
+void rgw_meta_sync_info::dump(Formatter *f) const
+{
+ string s;
+ switch ((SyncState)state) {
+ case StateInit:
+ s = "init";
+ break;
+ case StateBuildingFullSyncMaps:
+ s = "building-full-sync-maps";
+ break;
+ case StateSync:
+ s = "sync";
+ break;
+ default:
+ s = "unknown";
+ break;
+ }
+ encode_json("status", s, f);
+ encode_json("num_shards", num_shards, f);
+ encode_json("period", period, f);
+ encode_json("realm_epoch", realm_epoch, f);
+}
+
+
+void rgw_meta_sync_marker::decode_json(JSONObj *obj)
+{
+ int s;
+ JSONDecoder::decode_json("state", s, obj);
+ state = s;
+ JSONDecoder::decode_json("marker", marker, obj);
+ JSONDecoder::decode_json("next_step_marker", next_step_marker, obj);
+ JSONDecoder::decode_json("total_entries", total_entries, obj);
+ JSONDecoder::decode_json("pos", pos, obj);
+ utime_t ut;
+ JSONDecoder::decode_json("timestamp", ut, obj);
+ timestamp = ut.to_real_time();
+ JSONDecoder::decode_json("realm_epoch", realm_epoch, obj);
+}
+
+void rgw_meta_sync_marker::dump(Formatter *f) const
+{
+ encode_json("state", (int)state, f);
+ encode_json("marker", marker, f);
+ encode_json("next_step_marker", next_step_marker, f);
+ encode_json("total_entries", total_entries, f);
+ encode_json("pos", pos, f);
+ encode_json("timestamp", utime_t(timestamp), f);
+ encode_json("realm_epoch", realm_epoch, f);
+}
+
+void rgw_meta_sync_status::decode_json(JSONObj *obj)
+{
+ JSONDecoder::decode_json("info", sync_info, obj);
+ JSONDecoder::decode_json("markers", sync_markers, obj);
+}
+
+void rgw_meta_sync_status::dump(Formatter *f) const {
+ encode_json("info", sync_info, f);
+ encode_json("markers", sync_markers, f);
+}
+
+void rgw_sync_error_info::dump(Formatter *f) const {
+ encode_json("source_zone", source_zone, f);
+ encode_json("error_code", error_code, f);
+ encode_json("message", message, f);
+}
+
diff --git a/src/rgw/driver/rados/rgw_sync.h b/src/rgw/driver/rados/rgw_sync.h
new file mode 100644
index 000000000..e6c255cc6
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_sync.h
@@ -0,0 +1,547 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include <atomic>
+
+#include "include/stringify.h"
+
+#include "rgw_coroutine.h"
+#include "rgw_http_client.h"
+#include "rgw_metadata.h"
+#include "rgw_meta_sync_status.h"
+#include "rgw_sal.h"
+#include "rgw_sal_rados.h"
+#include "rgw_sync_trace.h"
+#include "rgw_mdlog.h"
+
+#define ERROR_LOGGER_SHARDS 32
+#define RGW_SYNC_ERROR_LOG_SHARD_PREFIX "sync.error-log"
+
+struct rgw_mdlog_info {
+ uint32_t num_shards;
+ std::string period; //< period id of the master's oldest metadata log
+ epoch_t realm_epoch; //< realm epoch of oldest metadata log
+
+ rgw_mdlog_info() : num_shards(0), realm_epoch(0) {}
+
+ void decode_json(JSONObj *obj);
+};
+
+
+struct rgw_mdlog_entry {
+ std::string id;
+ std::string section;
+ std::string name;
+ ceph::real_time timestamp;
+ RGWMetadataLogData log_data;
+
+ void decode_json(JSONObj *obj);
+
+ bool convert_from(cls_log_entry& le) {
+ id = le.id;
+ section = le.section;
+ name = le.name;
+ timestamp = le.timestamp.to_real_time();
+ try {
+ auto iter = le.data.cbegin();
+ decode(log_data, iter);
+ } catch (buffer::error& err) {
+ return false;
+ }
+ return true;
+ }
+};
+
+struct rgw_mdlog_shard_data {
+ std::string marker;
+ bool truncated;
+ std::vector<rgw_mdlog_entry> entries;
+
+ void decode_json(JSONObj *obj);
+};
+
+class RGWAsyncRadosProcessor;
+class RGWMetaSyncStatusManager;
+class RGWMetaSyncCR;
+class RGWRESTConn;
+class RGWSyncTraceManager;
+
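+// records sync errors into a set of sharded error-log objects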
+class RGWSyncErrorLogger {
+ rgw::sal::RadosStore* store;
+
+ std::vector<std::string> oids;
+ int num_shards;
+
+ std::atomic<int64_t> counter = { 0 };
+public:
+ RGWSyncErrorLogger(rgw::sal::RadosStore* _store, const std::string &oid_prefix, int _num_shards);
+ RGWCoroutine *log_error_cr(const DoutPrefixProvider *dpp, const std::string& source_zone, const std::string& section, const std::string& name, uint32_t error_code, const std::string& message);
+
+ static std::string get_shard_oid(const std::string& oid_prefix, int shard_id);
+};
+
+struct rgw_sync_error_info {
+ std::string source_zone;
+ uint32_t error_code;
+ std::string message;
+
+ rgw_sync_error_info() : error_code(0) {}
+ rgw_sync_error_info(const std::string& _source_zone, uint32_t _error_code, const std::string& _message) : source_zone(_source_zone), error_code(_error_code), message(_message) {}
+
+ void encode(bufferlist& bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(source_zone, bl);
+ encode(error_code, bl);
+ encode(message, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::const_iterator& bl) {
+ DECODE_START(1, bl);
+ decode(source_zone, bl);
+ decode(error_code, bl);
+ decode(message, bl);
+ DECODE_FINISH(bl);
+ }
+
+ void dump(Formatter *f) const;
+};
+WRITE_CLASS_ENCODER(rgw_sync_error_info)
+
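+// default cap, in seconds, on RGWSyncBackoff's sleep interval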
+#define DEFAULT_BACKOFF_MAX 30
+
+class RGWSyncBackoff {
+ int cur_wait;
+ int max_secs;
+
+ void update_wait_time();
+public:
+ explicit RGWSyncBackoff(int _max_secs = DEFAULT_BACKOFF_MAX) : cur_wait(0), max_secs(_max_secs) {}
+
+ void backoff_sleep();
+ void reset() {
+ cur_wait = 0;
+ }
+
+ void backoff(RGWCoroutine *op);
+};
+
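+// runs the coroutine returned by alloc_cr() in a retry loop, backing off
+// between attempts; the child can reset the backoff via backoff_ptr() to
+// signal that it is making progress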
+class RGWBackoffControlCR : public RGWCoroutine
+{
+ RGWCoroutine *cr;
+ ceph::mutex lock;
+
+ RGWSyncBackoff backoff;
+ bool reset_backoff;
+
+ bool exit_on_error;
+
+protected:
+ bool *backoff_ptr() {
+ return &reset_backoff;
+ }
+
+ ceph::mutex& cr_lock() {
+ return lock;
+ }
+
+ RGWCoroutine *get_cr() {
+ return cr;
+ }
+
+public:
+ RGWBackoffControlCR(CephContext *_cct, bool _exit_on_error)
+ : RGWCoroutine(_cct),
+ cr(nullptr),
+ lock(ceph::make_mutex("RGWBackoffControlCR::lock:" + stringify(this))),
+ reset_backoff(false), exit_on_error(_exit_on_error) {
+ }
+
+ ~RGWBackoffControlCR() override {
+ if (cr) {
+ cr->put();
+ }
+ }
+
+ virtual RGWCoroutine *alloc_cr() = 0;
+ virtual RGWCoroutine *alloc_finisher_cr() { return NULL; }
+
+ int operate(const DoutPrefixProvider *dpp) override;
+};
+
+struct RGWMetaSyncEnv {
+ const DoutPrefixProvider *dpp;
+ CephContext *cct{nullptr};
+ rgw::sal::RadosStore* store{nullptr};
+ RGWRESTConn *conn{nullptr};
+ RGWAsyncRadosProcessor *async_rados{nullptr};
+ RGWHTTPManager *http_manager{nullptr};
+ RGWSyncErrorLogger *error_logger{nullptr};
+ RGWSyncTraceManager *sync_tracer{nullptr};
+
+ RGWMetaSyncEnv() {}
+
+ void init(const DoutPrefixProvider *_dpp, CephContext *_cct, rgw::sal::RadosStore* _store, RGWRESTConn *_conn,
+ RGWAsyncRadosProcessor *_async_rados, RGWHTTPManager *_http_manager,
+ RGWSyncErrorLogger *_error_logger, RGWSyncTraceManager *_sync_tracer);
+
+ std::string shard_obj_name(int shard_id);
+ std::string status_oid();
+};
+
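+// drives metadata sync from the master zone: owns the sync environment, the
+// HTTP manager, and the long-running sync coroutines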
+class RGWRemoteMetaLog : public RGWCoroutinesManager {
+ const DoutPrefixProvider *dpp;
+ rgw::sal::RadosStore* store;
+ RGWRESTConn *conn;
+ RGWAsyncRadosProcessor *async_rados;
+
+ RGWHTTPManager http_manager;
+ RGWMetaSyncStatusManager *status_manager;
+ RGWSyncErrorLogger *error_logger{nullptr};
+ RGWSyncTraceManager *sync_tracer{nullptr};
+
+ RGWMetaSyncCR *meta_sync_cr{nullptr};
+
+ RGWSyncBackoff backoff;
+
+ RGWMetaSyncEnv sync_env;
+
+ void init_sync_env(RGWMetaSyncEnv *env);
+ int store_sync_info(const DoutPrefixProvider *dpp, const rgw_meta_sync_info& sync_info);
+
+ std::atomic<bool> going_down = { false };
+
+ RGWSyncTraceNodeRef tn;
+
+public:
+ RGWRemoteMetaLog(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* _store,
+ RGWAsyncRadosProcessor *async_rados,
+ RGWMetaSyncStatusManager *_sm)
+ : RGWCoroutinesManager(_store->ctx(), _store->getRados()->get_cr_registry()),
+ dpp(dpp), store(_store), conn(NULL), async_rados(async_rados),
+ http_manager(store->ctx(), completion_mgr),
+ status_manager(_sm) {}
+
+ virtual ~RGWRemoteMetaLog() override;
+
+ int init();
+ void finish();
+
+ int read_log_info(const DoutPrefixProvider *dpp, rgw_mdlog_info *log_info);
+ int read_master_log_shards_info(const DoutPrefixProvider *dpp, const std::string& master_period, std::map<int, RGWMetadataLogInfo> *shards_info);
+ int read_master_log_shards_next(const DoutPrefixProvider *dpp, const std::string& period, std::map<int, std::string> shard_markers, std::map<int, rgw_mdlog_shard_data> *result);
+ int read_sync_status(const DoutPrefixProvider *dpp, rgw_meta_sync_status *sync_status);
+ int init_sync_status(const DoutPrefixProvider *dpp);
+ int run_sync(const DoutPrefixProvider *dpp, optional_yield y);
+
+ void wakeup(int shard_id);
+
+ RGWMetaSyncEnv& get_sync_env() {
+ return sync_env;
+ }
+};
+
+class RGWMetaSyncStatusManager : public DoutPrefixProvider {
+ rgw::sal::RadosStore* store;
+ librados::IoCtx ioctx;
+
+ RGWRemoteMetaLog master_log;
+
+ std::map<int, rgw_raw_obj> shard_objs;
+
+ struct utime_shard {
+ real_time ts;
+ int shard_id;
+
+ utime_shard() : shard_id(-1) {}
+
+ bool operator<(const utime_shard& rhs) const {
+ if (ts == rhs.ts) {
+ return shard_id < rhs.shard_id;
+ }
+ return ts < rhs.ts;
+ }
+ };
+
+ ceph::shared_mutex ts_to_shard_lock = ceph::make_shared_mutex("ts_to_shard_lock");
+ std::map<utime_shard, int> ts_to_shard;
+ std::vector<std::string> clone_markers;
+
+public:
+ RGWMetaSyncStatusManager(rgw::sal::RadosStore* _store, RGWAsyncRadosProcessor *async_rados)
+ : store(_store), master_log(this, store, async_rados, this)
+ {}
+
+ virtual ~RGWMetaSyncStatusManager() override;
+
+ int init(const DoutPrefixProvider *dpp);
+
+ int read_sync_status(const DoutPrefixProvider *dpp, rgw_meta_sync_status *sync_status) {
+ return master_log.read_sync_status(dpp, sync_status);
+ }
+ int init_sync_status(const DoutPrefixProvider *dpp) { return master_log.init_sync_status(dpp); }
+ int read_log_info(const DoutPrefixProvider *dpp, rgw_mdlog_info *log_info) {
+ return master_log.read_log_info(dpp, log_info);
+ }
+ int read_master_log_shards_info(const DoutPrefixProvider *dpp, const std::string& master_period, std::map<int, RGWMetadataLogInfo> *shards_info) {
+ return master_log.read_master_log_shards_info(dpp, master_period, shards_info);
+ }
+ int read_master_log_shards_next(const DoutPrefixProvider *dpp, const std::string& period, std::map<int, std::string> shard_markers, std::map<int, rgw_mdlog_shard_data> *result) {
+ return master_log.read_master_log_shards_next(dpp, period, shard_markers, result);
+ }
+
+ int run(const DoutPrefixProvider *dpp, optional_yield y) { return master_log.run_sync(dpp, y); }
+
+
+ // implements DoutPrefixProvider
+ CephContext *get_cct() const override { return store->ctx(); }
+ unsigned get_subsys() const override;
+ std::ostream& gen_prefix(std::ostream& out) const override;
+
+ void wakeup(int shard_id) { return master_log.wakeup(shard_id); }
+ void stop() {
+ master_log.finish();
+ }
+};
+
+class RGWOrderCallCR : public RGWCoroutine
+{
+public:
+ RGWOrderCallCR(CephContext *cct) : RGWCoroutine(cct) {}
+
+ virtual void call_cr(RGWCoroutine *_cr) = 0;
+};
+
+class RGWLastCallerWinsCR : public RGWOrderCallCR
+{
+ RGWCoroutine *cr{nullptr};
+
+public:
+ explicit RGWLastCallerWinsCR(CephContext *cct) : RGWOrderCallCR(cct) {}
+ ~RGWLastCallerWinsCR() {
+ if (cr) {
+ cr->put();
+ }
+ }
+
+ int operate(const DoutPrefixProvider *dpp) override;
+
+ void call_cr(RGWCoroutine *_cr) override {
+ if (cr) {
+ cr->put();
+ }
+ cr = _cr;
+ }
+};
+
+template <class T, class K>
+class RGWSyncShardMarkerTrack {
+ struct marker_entry {
+ uint64_t pos;
+ real_time timestamp;
+
+ marker_entry() : pos(0) {}
+ marker_entry(uint64_t _p, const real_time& _ts) : pos(_p), timestamp(_ts) {}
+ };
+ typename std::map<T, marker_entry> pending;
+
+ std::map<T, marker_entry> finish_markers;
+
+ int window_size;
+ int updates_since_flush;
+
+ RGWOrderCallCR *order_cr{nullptr};
+
+protected:
+ typename std::set<K> need_retry_set;
+
+ virtual RGWCoroutine *store_marker(const T& new_marker, uint64_t index_pos, const real_time& timestamp) = 0;
+ virtual RGWOrderCallCR *allocate_order_control_cr() = 0;
+ virtual void handle_finish(const T& marker) { }
+
+public:
+ RGWSyncShardMarkerTrack(int _window_size) : window_size(_window_size), updates_since_flush(0) {}
+ virtual ~RGWSyncShardMarkerTrack() {
+ if (order_cr) {
+ order_cr->put();
+ }
+ }
+
+ bool start(const T& pos, int index_pos, const real_time& timestamp) {
+ if (pending.find(pos) != pending.end()) {
+ return false;
+ }
+ pending[pos] = marker_entry(index_pos, timestamp);
+ return true;
+ }
+
+ void try_update_high_marker(const T& pos, int index_pos, const real_time& timestamp) {
+ finish_markers[pos] = marker_entry(index_pos, timestamp);
+ }
+
+ RGWCoroutine *finish(const T& pos) {
+ if (pending.empty()) {
+ /* this can happen due to a bug that ended up creating multiple objects with the same name
+ * and version -- which can happen when versioning is enabled and the version is 'null'.
+ */
+ return NULL;
+ }
+
+ typename std::map<T, marker_entry>::iterator iter = pending.begin();
+
+ bool is_first = (pos == iter->first);
+
+ typename std::map<T, marker_entry>::iterator pos_iter = pending.find(pos);
+ if (pos_iter == pending.end()) {
+ /* see pending.empty() comment */
+ return NULL;
+ }
+
+ finish_markers[pos] = pos_iter->second;
+
+ pending.erase(pos);
+
+ handle_finish(pos);
+
+ updates_since_flush++;
+
+ if (is_first && (updates_since_flush >= window_size || pending.empty())) {
+ return flush();
+ }
+ return NULL;
+ }
+
+ RGWCoroutine *flush() {
+ if (finish_markers.empty()) {
+ return NULL;
+ }
+
+ typename std::map<T, marker_entry>::iterator i;
+
+ if (pending.empty()) {
+ i = finish_markers.end();
+ } else {
+ i = finish_markers.lower_bound(pending.begin()->first);
+ }
+ if (i == finish_markers.begin()) {
+ return NULL;
+ }
+ updates_since_flush = 0;
+
+ auto last = i;
+ --i;
+ const T& high_marker = i->first;
+ marker_entry& high_entry = i->second;
+ RGWCoroutine *cr = order(store_marker(high_marker, high_entry.pos, high_entry.timestamp));
+ finish_markers.erase(finish_markers.begin(), last);
+ return cr;
+ }
+
+ /*
+ * a key needs retry if it was being processed when another marker that points
+ * to the same bucket shard arrived. Instead of processing the new entry, we mark
+ * the key as need_retry so that when we finish processing the original, we
+ * retry the processing on the same bucket shard, in case there are more
+ * entries to process. This closes the race between the two markers.
+ */
+ bool need_retry(const K& key) {
+ return (need_retry_set.find(key) != need_retry_set.end());
+ }
+
+ void set_need_retry(const K& key) {
+ need_retry_set.insert(key);
+ }
+
+ void reset_need_retry(const K& key) {
+ need_retry_set.erase(key);
+ }
+
+ RGWCoroutine *order(RGWCoroutine *cr) {
+ /* either returns a new order-control coroutine (e.g. RGWLastCallerWinsCR), or updates the
+ * existing one, in which case it returns nullptr and the existing one will call the cr
+ */
+ if (order_cr && order_cr->is_done()) {
+ order_cr->put();
+ order_cr = nullptr;
+ }
+ if (!order_cr) {
+ order_cr = allocate_order_control_cr();
+ order_cr->get();
+ order_cr->call_cr(cr);
+ return order_cr;
+ }
+ order_cr->call_cr(cr);
+ return nullptr; /* don't call it a second time */
+ }
+};
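+
+/* A sketch of a concrete tracker (hypothetical subclass, for illustration only).
+ * The shard sync loop calls start() before handling an entry and finish() after;
+ * finish() occasionally returns a coroutine that persists the high-water marker:
+ *
+ *   class MyMarkerTrack : public RGWSyncShardMarkerTrack<std::string, std::string> {
+ *     CephContext *cct;
+ *     RGWCoroutine *store_marker(const std::string& m, uint64_t pos,
+ *                                const real_time& ts) override {
+ *       return new MyStoreMarkerCR(m, pos, ts);   // hypothetical CR that writes the marker
+ *     }
+ *     RGWOrderCallCR *allocate_order_control_cr() override {
+ *       return new RGWLastCallerWinsCR(cct);      // serialize marker writes, last caller wins
+ *     }
+ *   public:
+ *     explicit MyMarkerTrack(CephContext *_cct)
+ *       : RGWSyncShardMarkerTrack(10), cct(_cct) {}  // flush roughly every 10 completions
+ *   };
+ *
+ *   marker_tracker.start(entry_marker, index_pos, timestamp);
+ *   // ... process the entry ...
+ *   if (RGWCoroutine *cr = marker_tracker.finish(entry_marker)) {
+ *     spawn(cr, false);  // persist the furthest contiguous completed marker
+ *   }
+ */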
+
+class RGWMetaSyncShardMarkerTrack;
+
+class RGWMetaSyncSingleEntryCR : public RGWCoroutine {
+ RGWMetaSyncEnv *sync_env;
+
+ std::string raw_key;
+ std::string entry_marker;
+ RGWMDLogStatus op_status;
+
+ ssize_t pos;
+ std::string section;
+ std::string key;
+
+ int sync_status;
+
+ bufferlist md_bl;
+
+ RGWMetaSyncShardMarkerTrack *marker_tracker;
+
+ int tries;
+
+ bool error_injection;
+
+ RGWSyncTraceNodeRef tn;
+
+public:
+ RGWMetaSyncSingleEntryCR(RGWMetaSyncEnv *_sync_env,
+ const std::string& _raw_key, const std::string& _entry_marker,
+ const RGWMDLogStatus& _op_status,
+ RGWMetaSyncShardMarkerTrack *_marker_tracker, const RGWSyncTraceNodeRef& _tn_parent);
+
+ int operate(const DoutPrefixProvider *dpp) override;
+};
+
+class RGWShardCollectCR : public RGWCoroutine {
+ int current_running = 0;
+ protected:
+ int max_concurrent;
+ int status = 0;
+
+ // called with the result of each child. error codes can be ignored by
+ // returning 0. if handle_result() returns a negative value, it's
+ // treated as an error and stored in 'status'. the last such error is
+ // reported to the caller with set_cr_error()
+ virtual int handle_result(int r) = 0;
+ public:
+ RGWShardCollectCR(CephContext *_cct, int _max_concurrent)
+ : RGWCoroutine(_cct), max_concurrent(_max_concurrent)
+ {}
+
+ virtual bool spawn_next() = 0;
+ int operate(const DoutPrefixProvider *dpp) override;
+};
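+
+/* An illustrative subclass (not part of the original source): run one child
+ * coroutine per shard with at most max_concurrent in flight; spawn_next()
+ * returns false once every shard has been issued:
+ *
+ *   class ReadAllShardsCR : public RGWShardCollectCR {
+ *     int shard = 0;
+ *     int num_shards;
+ *     int handle_result(int r) override {
+ *       return (r == -ENOENT) ? 0 : r;   // e.g. tolerate missing shard objects
+ *     }
+ *   public:
+ *     ReadAllShardsCR(CephContext *cct, int n)
+ *       : RGWShardCollectCR(cct, 16), num_shards(n) {}  // 16 = max in-flight children
+ *     bool spawn_next() override {
+ *       if (shard >= num_shards) return false;
+ *       spawn(new ReadOneShardCR(shard++), false);      // hypothetical per-shard CR
+ *       return true;
+ *     }
+ *   };
+ */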
+
+// factory functions for meta sync coroutines needed in mdlog trimming
+
+RGWCoroutine* create_read_remote_mdlog_shard_info_cr(RGWMetaSyncEnv *env,
+ const std::string& period,
+ int shard_id,
+ RGWMetadataLogInfo* info);
+
+RGWCoroutine* create_list_remote_mdlog_shard_cr(RGWMetaSyncEnv *env,
+ const std::string& period,
+ int shard_id,
+ const std::string& marker,
+ uint32_t max_entries,
+ rgw_mdlog_shard_data *result);
+
diff --git a/src/rgw/driver/rados/rgw_sync_counters.cc b/src/rgw/driver/rados/rgw_sync_counters.cc
new file mode 100644
index 000000000..1d23d58dc
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_sync_counters.cc
@@ -0,0 +1,28 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "common/ceph_context.h"
+#include "rgw_sync_counters.h"
+
+namespace sync_counters {
+
+PerfCountersRef build(CephContext *cct, const std::string& name)
+{
+ PerfCountersBuilder b(cct, name, l_first, l_last);
+
+ // share these counters with ceph-mgr
+ b.set_prio_default(PerfCountersBuilder::PRIO_USEFUL);
+
+ b.add_u64_avg(l_fetch, "fetch_bytes", "Number of object bytes replicated");
+ b.add_u64_counter(l_fetch_not_modified, "fetch_not_modified", "Number of objects already replicated");
+ b.add_u64_counter(l_fetch_err, "fetch_errors", "Number of object replication errors");
+
+ b.add_time_avg(l_poll, "poll_latency", "Average latency of replication log requests");
+ b.add_u64_counter(l_poll_err, "poll_errors", "Number of replication log request errors");
+
+ auto logger = PerfCountersRef{ b.create_perf_counters(), cct };
+ cct->get_perfcounters_collection()->add(logger.get());
+ return logger;
+}
+
+} // namespace sync_counters
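+
+// A rough usage sketch (hypothetical caller; names are illustrative). The sync
+// machinery bumps the counters built above roughly like this:
+//
+//   auto counters = sync_counters::build(cct, "data-sync-from-" + zone_name);
+//   counters->inc(sync_counters::l_fetch, fetched_bytes);     // avg bytes per fetch
+//   counters->inc(sync_counters::l_fetch_err);                // count a fetch error
+//   counters->tinc(sync_counters::l_poll, request_duration);  // log-request latency
+//
+// The PerfCountersRef deleter removes the counters from the cct's collection
+// again when the reference is destroyed.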
diff --git a/src/rgw/driver/rados/rgw_sync_counters.h b/src/rgw/driver/rados/rgw_sync_counters.h
new file mode 100644
index 000000000..df3acc680
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_sync_counters.h
@@ -0,0 +1,25 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include "common/perf_counters_collection.h"
+
+namespace sync_counters {
+
+enum {
+ l_first = 805000,
+
+ l_fetch,
+ l_fetch_not_modified,
+ l_fetch_err,
+
+ l_poll,
+ l_poll_err,
+
+ l_last,
+};
+
+PerfCountersRef build(CephContext *cct, const std::string& name);
+
+} // namespace sync_counters
diff --git a/src/rgw/driver/rados/rgw_sync_error_repo.cc b/src/rgw/driver/rados/rgw_sync_error_repo.cc
new file mode 100644
index 000000000..44305b60b
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_sync_error_repo.cc
@@ -0,0 +1,205 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2020 Red Hat, Inc
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+#include "rgw_sync_error_repo.h"
+#include "rgw_coroutine.h"
+#include "rgw_sal.h"
+#include "services/svc_rados.h"
+#include "cls/cmpomap/client.h"
+
+namespace rgw::error_repo {
+
+// prefix for the binary encoding of keys. this particular value is not
+// valid as the first byte of a utf8 code point, so we use this to
+// differentiate the binary encoding from existing string keys for
+// backward-compatibility
+constexpr uint8_t binary_key_prefix = 0x80;
+
+struct key_type {
+ rgw_bucket_shard bs;
+ std::optional<uint64_t> gen;
+};
+
+void encode(const key_type& k, bufferlist& bl, uint64_t f=0)
+{
+ ENCODE_START(1, 1, bl);
+ encode(k.bs, bl);
+ encode(k.gen, bl);
+ ENCODE_FINISH(bl);
+}
+
+void decode(key_type& k, bufferlist::const_iterator& bl)
+{
+ DECODE_START(1, bl);
+ decode(k.bs, bl);
+ decode(k.gen, bl);
+ DECODE_FINISH(bl);
+}
+
+std::string encode_key(const rgw_bucket_shard& bs,
+ std::optional<uint64_t> gen)
+{
+ using ceph::encode;
+ const auto key = key_type{bs, gen};
+ bufferlist bl;
+ encode(binary_key_prefix, bl);
+ encode(key, bl);
+ return bl.to_str();
+}
+
+int decode_key(std::string encoded,
+ rgw_bucket_shard& bs,
+ std::optional<uint64_t>& gen)
+{
+ using ceph::decode;
+ key_type key;
+ const auto bl = bufferlist::static_from_string(encoded);
+ auto p = bl.cbegin();
+ try {
+ uint8_t prefix;
+ decode(prefix, p);
+ if (prefix != binary_key_prefix) {
+ return -EINVAL;
+ }
+ decode(key, p);
+ } catch (const buffer::error&) {
+ return -EIO;
+ }
+ if (!p.end()) {
+ return -EIO; // buffer contained unexpected bytes
+ }
+ bs = std::move(key.bs);
+ gen = key.gen;
+ return 0;
+}
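+
+// Round-trip sketch (illustration only, not called anywhere): keys produced by
+// encode_key() start with the 0x80 prefix byte and decode back to the same
+// shard and generation, while legacy plain-string keys fail with -EINVAL:
+//
+//   std::string k = encode_key(bs, 5);              // bs: some rgw_bucket_shard
+//   rgw_bucket_shard out;
+//   std::optional<uint64_t> gen;
+//   int r = decode_key(k, out, gen);                // r == 0, gen == 5
+//   r = decode_key("mybucket:shard-3", out, gen);   // r == -EINVAL (legacy key)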
+
+ceph::real_time decode_value(const bufferlist& bl)
+{
+ uint64_t value;
+ try {
+ using ceph::decode;
+ decode(value, bl);
+ } catch (const buffer::error&) {
+ value = 0; // empty buffer = 0
+ }
+ return ceph::real_clock::zero() + ceph::timespan(value);
+}
+
+int write(librados::ObjectWriteOperation& op,
+ const std::string& key,
+ ceph::real_time timestamp)
+{
+ // overwrite the existing timestamp if value is greater
+ const uint64_t value = timestamp.time_since_epoch().count();
+ using namespace ::cls::cmpomap;
+ const bufferlist zero = u64_buffer(0); // compare against 0 for missing keys
+ return cmp_set_vals(op, Mode::U64, Op::GT, {{key, u64_buffer(value)}}, zero);
+}
+
+int remove(librados::ObjectWriteOperation& op,
+ const std::string& key,
+ ceph::real_time timestamp)
+{
+ // remove the omap key if value >= existing
+ const uint64_t value = timestamp.time_since_epoch().count();
+ using namespace ::cls::cmpomap;
+ return cmp_rm_keys(op, Mode::U64, Op::GTE, {{key, u64_buffer(value)}});
+}
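+
+// How the two helpers compose (sketch; the caller and oid are hypothetical):
+// write() and remove() only queue cls_cmpomap steps on an ObjectWriteOperation,
+// so the caller still has to submit the op against the error-repo object:
+//
+//   librados::ObjectWriteOperation op;
+//   rgw::error_repo::write(op, key, ceph::real_clock::now());
+//   int r = ioctx.operate(error_repo_oid, &op);   // 0 on success, or a cls error
+//
+// A later remove() with a timestamp >= the stored one clears the entry again.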
+
+class RGWErrorRepoWriteCR : public RGWSimpleCoroutine {
+ RGWSI_RADOS::Obj obj;
+ std::string key;
+ ceph::real_time timestamp;
+
+ boost::intrusive_ptr<RGWAioCompletionNotifier> cn;
+ public:
+ RGWErrorRepoWriteCR(RGWSI_RADOS* rados, const rgw_raw_obj& raw_obj,
+ const std::string& key, ceph::real_time timestamp)
+ : RGWSimpleCoroutine(rados->ctx()),
+ obj(rados->obj(raw_obj)),
+ key(key), timestamp(timestamp)
+ {}
+
+ int send_request(const DoutPrefixProvider *dpp) override {
+ librados::ObjectWriteOperation op;
+ int r = write(op, key, timestamp);
+ if (r < 0) {
+ return r;
+ }
+ r = obj.open(dpp);
+ if (r < 0) {
+ return r;
+ }
+
+ cn = stack->create_completion_notifier();
+ return obj.aio_operate(cn->completion(), &op);
+ }
+
+ int request_complete() override {
+ return cn->completion()->get_return_value();
+ }
+};
+
+RGWCoroutine* write_cr(RGWSI_RADOS* rados,
+ const rgw_raw_obj& obj,
+ const std::string& key,
+ ceph::real_time timestamp)
+{
+ return new RGWErrorRepoWriteCR(rados, obj, key, timestamp);
+}
+
+
+class RGWErrorRepoRemoveCR : public RGWSimpleCoroutine {
+ RGWSI_RADOS::Obj obj;
+ std::string key;
+ ceph::real_time timestamp;
+
+ boost::intrusive_ptr<RGWAioCompletionNotifier> cn;
+ public:
+ RGWErrorRepoRemoveCR(RGWSI_RADOS* rados, const rgw_raw_obj& raw_obj,
+ const std::string& key, ceph::real_time timestamp)
+ : RGWSimpleCoroutine(rados->ctx()),
+ obj(rados->obj(raw_obj)),
+ key(key), timestamp(timestamp)
+ {}
+
+ int send_request(const DoutPrefixProvider *dpp) override {
+ librados::ObjectWriteOperation op;
+ int r = remove(op, key, timestamp);
+ if (r < 0) {
+ return r;
+ }
+ r = obj.open(dpp);
+ if (r < 0) {
+ return r;
+ }
+
+ cn = stack->create_completion_notifier();
+ return obj.aio_operate(cn->completion(), &op);
+ }
+
+ int request_complete() override {
+ return cn->completion()->get_return_value();
+ }
+};
+
+RGWCoroutine* remove_cr(RGWSI_RADOS* rados,
+ const rgw_raw_obj& obj,
+ const std::string& key,
+ ceph::real_time timestamp)
+{
+ return new RGWErrorRepoRemoveCR(rados, obj, key, timestamp);
+}
+
+} // namespace rgw::error_repo
diff --git a/src/rgw/driver/rados/rgw_sync_error_repo.h b/src/rgw/driver/rados/rgw_sync_error_repo.h
new file mode 100644
index 000000000..60525d281
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_sync_error_repo.h
@@ -0,0 +1,59 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2020 Red Hat, Inc
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+#pragma once
+
+#include <optional>
+#include "include/rados/librados_fwd.hpp"
+#include "include/buffer_fwd.h"
+#include "common/ceph_time.h"
+
+class RGWSI_RADOS;
+class RGWCoroutine;
+struct rgw_raw_obj;
+struct rgw_bucket_shard;
+
+namespace rgw::error_repo {
+
+// binary-encode a bucket/shard/gen and return it as a string
+std::string encode_key(const rgw_bucket_shard& bs,
+ std::optional<uint64_t> gen);
+
+// try to decode a key. returns -EINVAL if not in binary format
+int decode_key(std::string encoded,
+ rgw_bucket_shard& bs,
+ std::optional<uint64_t>& gen);
+
+// decode a timestamp as a uint64_t for CMPXATTR_MODE_U64
+ceph::real_time decode_value(const ceph::bufferlist& bl);
+
+// write an omap key iff the given timestamp is newer
+int write(librados::ObjectWriteOperation& op,
+ const std::string& key,
+ ceph::real_time timestamp);
+RGWCoroutine* write_cr(RGWSI_RADOS* rados,
+ const rgw_raw_obj& obj,
+ const std::string& key,
+ ceph::real_time timestamp);
+
+// remove an omap key iff there isn't a newer timestamp
+int remove(librados::ObjectWriteOperation& op,
+ const std::string& key,
+ ceph::real_time timestamp);
+RGWCoroutine* remove_cr(RGWSI_RADOS* rados,
+ const rgw_raw_obj& obj,
+ const std::string& key,
+ ceph::real_time timestamp);
+
+} // namespace rgw::error_repo
diff --git a/src/rgw/driver/rados/rgw_sync_module.cc b/src/rgw/driver/rados/rgw_sync_module.cc
new file mode 100644
index 000000000..5a1e70be3
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_sync_module.cc
@@ -0,0 +1,87 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "rgw_common.h"
+#include "rgw_coroutine.h"
+#include "rgw_cr_rados.h"
+#include "rgw_sync_module.h"
+#include "rgw_data_sync.h"
+#include "rgw_bucket.h"
+
+#include "rgw_sync_module_log.h"
+#include "rgw_sync_module_es.h"
+#include "rgw_sync_module_aws.h"
+
+#include <boost/asio/yield.hpp>
+
+#define dout_subsys ceph_subsys_rgw
+
+RGWMetadataHandler *RGWSyncModuleInstance::alloc_bucket_meta_handler()
+{
+ return RGWBucketMetaHandlerAllocator::alloc();
+}
+
+RGWBucketInstanceMetadataHandlerBase* RGWSyncModuleInstance::alloc_bucket_instance_meta_handler(rgw::sal::Driver* driver)
+{
+ return RGWBucketInstanceMetaHandlerAllocator::alloc(driver);
+}
+
+RGWStatRemoteObjCBCR::RGWStatRemoteObjCBCR(RGWDataSyncCtx *_sc,
+ rgw_bucket& _src_bucket, rgw_obj_key& _key) : RGWCoroutine(_sc->cct),
+ sc(_sc), sync_env(_sc->env),
+ src_bucket(_src_bucket), key(_key) {
+}
+
+RGWCallStatRemoteObjCR::RGWCallStatRemoteObjCR(RGWDataSyncCtx *_sc,
+ rgw_bucket& _src_bucket, rgw_obj_key& _key) : RGWCoroutine(_sc->cct),
+ sc(_sc), sync_env(_sc->env),
+ src_bucket(_src_bucket), key(_key) {
+}
+
+int RGWCallStatRemoteObjCR::operate(const DoutPrefixProvider *dpp) {
+ reenter(this) {
+ yield {
+ call(new RGWStatRemoteObjCR(sync_env->async_rados, sync_env->driver,
+ sc->source_zone,
+ src_bucket, key, &mtime, &size, &etag, &attrs, &headers));
+ }
+ if (retcode < 0) {
+ ldpp_dout(dpp, 10) << "RGWStatRemoteObjCR() returned " << retcode << dendl;
+ return set_cr_error(retcode);
+ }
+ ldpp_dout(dpp, 20) << "stat of remote obj: z=" << sc->source_zone
+ << " b=" << src_bucket << " k=" << key
+ << " size=" << size << " mtime=" << mtime << dendl;
+ yield {
+ RGWStatRemoteObjCBCR *cb = allocate_callback();
+ if (cb) {
+ cb->set_result(mtime, size, etag, std::move(attrs), std::move(headers));
+ call(cb);
+ }
+ }
+ if (retcode < 0) {
+ ldpp_dout(dpp, 10) << "RGWStatRemoteObjCR() callback returned " << retcode << dendl;
+ return set_cr_error(retcode);
+ }
+ return set_cr_done();
+ }
+ return 0;
+}
+
+void rgw_register_sync_modules(RGWSyncModulesManager *modules_manager)
+{
+ RGWSyncModuleRef default_module(std::make_shared<RGWDefaultSyncModule>());
+ modules_manager->register_module("rgw", default_module, true);
+
+ RGWSyncModuleRef archive_module(std::make_shared<RGWArchiveSyncModule>());
+ modules_manager->register_module("archive", archive_module);
+
+ RGWSyncModuleRef log_module(std::make_shared<RGWLogSyncModule>());
+ modules_manager->register_module("log", log_module);
+
+ RGWSyncModuleRef es_module(std::make_shared<RGWElasticSyncModule>());
+ modules_manager->register_module("elasticsearch", es_module);
+
+ RGWSyncModuleRef aws_module(std::make_shared<RGWAWSSyncModule>());
+ modules_manager->register_module("cloud", aws_module);
+}
diff --git a/src/rgw/driver/rados/rgw_sync_module.h b/src/rgw/driver/rados/rgw_sync_module.h
new file mode 100644
index 000000000..38abb3d1a
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_sync_module.h
@@ -0,0 +1,203 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include "rgw_common.h"
+#include "rgw_coroutine.h"
+
+class RGWBucketInfo;
+class RGWRemoteDataLog;
+struct RGWDataSyncCtx;
+struct RGWDataSyncEnv;
+struct rgw_bucket_entry_owner;
+struct rgw_obj_key;
+struct rgw_bucket_sync_pipe;
+
+
+class RGWDataSyncModule {
+public:
+ RGWDataSyncModule() {}
+ virtual ~RGWDataSyncModule() {}
+
+ virtual void init(RGWDataSyncCtx *sync_env, uint64_t instance_id) {}
+
+ virtual RGWCoroutine *init_sync(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc) {
+ return nullptr;
+ }
+
+ virtual RGWCoroutine *start_sync(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc) {
+ return nullptr;
+ }
+ virtual RGWCoroutine *sync_object(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc,
+ rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key,
+ std::optional<uint64_t> versioned_epoch,
+ const rgw_zone_set_entry& my_trace_entry,
+ rgw_zone_set *zones_trace) = 0;
+ virtual RGWCoroutine *remove_object(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& bucket_info, rgw_obj_key& key, real_time& mtime,
+ bool versioned, uint64_t versioned_epoch, rgw_zone_set *zones_trace) = 0;
+ virtual RGWCoroutine *create_delete_marker(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& bucket_info, rgw_obj_key& key, real_time& mtime,
+ rgw_bucket_entry_owner& owner, bool versioned, uint64_t versioned_epoch, rgw_zone_set *zones_trace) = 0;
+};
+
+class RGWRESTMgr;
+class RGWMetadataHandler;
+class RGWBucketInstanceMetadataHandlerBase;
+
+class RGWSyncModuleInstance {
+public:
+ RGWSyncModuleInstance() {}
+ virtual ~RGWSyncModuleInstance() {}
+ virtual RGWDataSyncModule *get_data_handler() = 0;
+ virtual RGWRESTMgr *get_rest_filter(int dialect, RGWRESTMgr *orig) {
+ return orig;
+ }
+ virtual bool supports_user_writes() {
+ return false;
+ }
+ virtual RGWMetadataHandler *alloc_bucket_meta_handler();
+ virtual RGWBucketInstanceMetadataHandlerBase *alloc_bucket_instance_meta_handler(rgw::sal::Driver* driver);
+
+ // indicates whether the sync module starts with full sync (default behavior);
+ // incremental sync would follow anyway
+ virtual bool should_full_sync() const {
+ return true;
+ }
+};
+
+typedef std::shared_ptr<RGWSyncModuleInstance> RGWSyncModuleInstanceRef;
+
+class JSONFormattable;
+
+class RGWSyncModule {
+
+public:
+ RGWSyncModule() {}
+ virtual ~RGWSyncModule() {}
+
+ virtual bool supports_writes() {
+ return false;
+ }
+ virtual bool supports_data_export() = 0;
+ virtual int create_instance(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& config, RGWSyncModuleInstanceRef *instance) = 0;
+};
+
+typedef std::shared_ptr<RGWSyncModule> RGWSyncModuleRef;
+
+
+class RGWSyncModulesManager {
+ ceph::mutex lock = ceph::make_mutex("RGWSyncModulesManager");
+
+ std::map<std::string, RGWSyncModuleRef> modules;
+public:
+ RGWSyncModulesManager() = default;
+
+ void register_module(const std::string& name, RGWSyncModuleRef& module, bool is_default = false) {
+ std::lock_guard l{lock};
+ modules[name] = module;
+ if (is_default) {
+ modules[std::string()] = module;
+ }
+ }
+
+ bool get_module(const std::string& name, RGWSyncModuleRef *module) {
+ std::lock_guard l{lock};
+ auto iter = modules.find(name);
+ if (iter == modules.end()) {
+ return false;
+ }
+ if (module != nullptr) {
+ *module = iter->second;
+ }
+ return true;
+ }
+
+
+ bool supports_data_export(const std::string& name) {
+ RGWSyncModuleRef module;
+ if (!get_module(name, &module)) {
+ return false;
+ }
+
+ return module->supports_data_export();
+ }
+
+ int create_instance(const DoutPrefixProvider *dpp, CephContext *cct, const std::string& name, const JSONFormattable& config, RGWSyncModuleInstanceRef *instance) {
+ RGWSyncModuleRef module;
+ if (!get_module(name, &module)) {
+ return -ENOENT;
+ }
+
+ return module.get()->create_instance(dpp, cct, config, instance);
+ }
+
+ std::vector<std::string> get_registered_module_names() const {
+ std::vector<std::string> names;
+ for (auto& i: modules) {
+ if (!i.first.empty()) {
+ names.push_back(i.first);
+ }
+ }
+ return names;
+ }
+};
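+
+/* A rough usage sketch (illustrative only; tier_config stands in for the zone's
+ * tier configuration): modules are registered once at startup (see
+ * rgw_register_sync_modules() below) and instantiated per zone by tier type:
+ *
+ *   RGWSyncModulesManager mgr;
+ *   rgw_register_sync_modules(&mgr);
+ *
+ *   RGWSyncModuleInstanceRef instance;
+ *   int r = mgr.create_instance(dpp, cct, "cloud", tier_config, &instance);
+ *   if (r == -ENOENT) { ... unknown tier type ... }
+ *   RGWDataSyncModule *handler = instance->get_data_handler();
+ */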
+
+class RGWStatRemoteObjCBCR : public RGWCoroutine {
+protected:
+ RGWDataSyncCtx *sc;
+ RGWDataSyncEnv *sync_env;
+
+ rgw_bucket src_bucket;
+ rgw_obj_key key;
+
+ ceph::real_time mtime;
+ uint64_t size = 0;
+ std::string etag;
+ std::map<std::string, bufferlist> attrs;
+ std::map<std::string, std::string> headers;
+public:
+ RGWStatRemoteObjCBCR(RGWDataSyncCtx *_sc,
+ rgw_bucket& _src_bucket, rgw_obj_key& _key);
+ ~RGWStatRemoteObjCBCR() override {}
+
+ void set_result(ceph::real_time& _mtime,
+ uint64_t _size,
+ const std::string& _etag,
+ std::map<std::string, bufferlist>&& _attrs,
+ std::map<std::string, std::string>&& _headers) {
+ mtime = _mtime;
+ size = _size;
+ etag = _etag;
+ attrs = std::move(_attrs);
+ headers = std::move(_headers);
+ }
+};
+
+class RGWCallStatRemoteObjCR : public RGWCoroutine {
+ ceph::real_time mtime;
+ uint64_t size{0};
+ std::string etag;
+ std::map<std::string, bufferlist> attrs;
+ std::map<std::string, std::string> headers;
+
+protected:
+ RGWDataSyncCtx *sc;
+ RGWDataSyncEnv *sync_env;
+
+ rgw_bucket src_bucket;
+ rgw_obj_key key;
+
+public:
+ RGWCallStatRemoteObjCR(RGWDataSyncCtx *_sc,
+ rgw_bucket& _src_bucket, rgw_obj_key& _key);
+
+ ~RGWCallStatRemoteObjCR() override {}
+
+ int operate(const DoutPrefixProvider *dpp) override;
+
+ virtual RGWStatRemoteObjCBCR *allocate_callback() {
+ return nullptr;
+ }
+};
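+
+/* The intended pattern (sketch; both subclasses are hypothetical): a sync module
+ * derives from these two classes and overrides allocate_callback() so the remote
+ * object's stat result is delivered to module-specific code via set_result():
+ *
+ *   class MyStatCB : public RGWStatRemoteObjCBCR {
+ *   public:
+ *     using RGWStatRemoteObjCBCR::RGWStatRemoteObjCBCR;
+ *     int operate(const DoutPrefixProvider *dpp) override {
+ *       // mtime/size/etag/attrs/headers were filled in by set_result()
+ *       return set_cr_done();
+ *     }
+ *   };
+ *
+ *   class MyCallStatCR : public RGWCallStatRemoteObjCR {
+ *   public:
+ *     using RGWCallStatRemoteObjCR::RGWCallStatRemoteObjCR;
+ *     RGWStatRemoteObjCBCR *allocate_callback() override {
+ *       return new MyStatCB(sc, src_bucket, key);
+ *     }
+ *   };
+ */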
+
+void rgw_register_sync_modules(RGWSyncModulesManager *modules_manager);
diff --git a/src/rgw/driver/rados/rgw_sync_module_aws.cc b/src/rgw/driver/rados/rgw_sync_module_aws.cc
new file mode 100644
index 000000000..cefcd9dd1
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_sync_module_aws.cc
@@ -0,0 +1,1823 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "common/errno.h"
+
+#include "rgw_common.h"
+#include "rgw_coroutine.h"
+#include "rgw_sync_module.h"
+#include "rgw_data_sync.h"
+#include "rgw_sync_module_aws.h"
+#include "rgw_cr_rados.h"
+#include "rgw_rest_conn.h"
+#include "rgw_cr_rest.h"
+#include "rgw_acl.h"
+#include "rgw_zone.h"
+
+#include "services/svc_zone.h"
+
+#include <boost/asio/yield.hpp>
+
+#define dout_subsys ceph_subsys_rgw
+
+
+#define DEFAULT_MULTIPART_SYNC_PART_SIZE (32 * 1024 * 1024)
+
+using namespace std;
+
+static string default_target_path = "rgw-${zonegroup}-${sid}/${bucket}";
+
+static string get_key_oid(const rgw_obj_key& key)
+{
+ string oid = key.name;
+ if (!key.instance.empty() &&
+ !key.have_null_instance()) {
+ oid += string(":") + key.instance;
+ }
+ return oid;
+}
+
+static string obj_to_aws_path(const rgw_obj& obj)
+{
+ return obj.bucket.name + "/" + get_key_oid(obj.key);
+}
+
+/*
+
+ json configuration definition:
+
+ {
+ "connection": {
+ "access_key": <access>,
+ "secret": <secret>,
+ "endpoint": <endpoint>,
+ "host_style": <path | virtual>,
+ },
+ "acls": [ { "type": <id | email | uri>,
+ "source_id": <source_id>,
+ "dest_id": <dest_id> } ... ], # optional, acl mappings, no mappings if does not exist
+ "target_path": <target_path>, # override default
+
+
+ # anything below here is for non-trivial configuration
+ # can be used in conjunction with the above
+
+ "default": {
+ "connection": {
+ "access_key": <access>,
+ "secret": <secret>,
+ "endpoint": <endpoint>,
+ "host_style" <path | virtual>,
+ },
+ "acls": [ # list of source uids and how they map into destination uids in the dest objects acls
+ {
+ "type" : <id | email | uri>, # optional, default is id
+ "source_id": <id>,
+ "dest_id": <id>
+ } ... ]
+ "target_path": "rgwx-${sid}/${bucket}" # how a bucket name is mapped to destination path,
+ # final object name will be target_path + "/" + obj
+ },
+ "connections": [
+ {
+ "id": <id>,
+ "access_key": <access>,
+ "secret": <secret>,
+ "endpoint": <endpoint>,
+ } ... ],
+ "acl_profiles": [
+ {
+ "id": <id>, # acl mappings
+ "acls": [ {
+ "type": <id | email | uri>,
+ "source_id": <id>,
+ "dest_id": <id>
+ } ... ]
+ }
+ ],
+ "profiles": [
+ {
+ "source_bucket": <source>, # can specify either specific bucket name (foo), or prefix (foo*)
+ "target_path": <dest>, # (override default)
+ "connection_id": <connection_id>, # optional, if empty references default connection
+ "acls_id": <mappings_id>, # optional, if empty references default mappings
+ } ... ],
+ }
+
+target path optional variables:
+
+(evaluated at init)
+sid: sync instance id, randomly generated by the sync process on first sync initialization
+zonegroup: zonegroup name
+zonegroup_id: zonegroup id
+zone: zone name
+zone_id: zone id
+
+(evaluated when syncing)
+bucket: bucket name
+owner: bucket owner
+
+*/
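+
+/*
+ A minimal concrete instance of the trivial form above (values are placeholders,
+ not a recommendation):
+
+ {
+ "connection": {
+ "access_key": "AKIA...",
+ "secret": "...",
+ "endpoint": "https://s3.amazonaws.com",
+ "host_style": "virtual"
+ },
+ "acls": [ { "type": "id", "source_id": "local-user", "dest_id": "remote-user" } ],
+ "target_path": "rgw-${zonegroup}-${sid}/${bucket}"
+ }
+
+*/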
+
+struct ACLMapping {
+ ACLGranteeTypeEnum type{ACL_TYPE_CANON_USER};
+ string source_id;
+ string dest_id;
+
+ ACLMapping() = default;
+
+ ACLMapping(ACLGranteeTypeEnum t,
+ const string& s,
+ const string& d) : type(t),
+ source_id(s),
+ dest_id(d) {}
+
+ void init(const JSONFormattable& config) {
+ const string& t = config["type"];
+
+ if (t == "email") {
+ type = ACL_TYPE_EMAIL_USER;
+ } else if (t == "uri") {
+ type = ACL_TYPE_GROUP;
+ } else {
+ type = ACL_TYPE_CANON_USER;
+ }
+
+ source_id = config["source_id"];
+ dest_id = config["dest_id"];
+ }
+
+ void dump_conf(CephContext *cct, JSONFormatter& jf) const {
+ Formatter::ObjectSection os(jf, "acl_mapping");
+ string s;
+ switch (type) {
+ case ACL_TYPE_EMAIL_USER:
+ s = "email";
+ break;
+ case ACL_TYPE_GROUP:
+ s = "uri";
+ break;
+ default:
+ s = "id";
+ break;
+ }
+ encode_json("type", s, &jf);
+ encode_json("source_id", source_id, &jf);
+ encode_json("dest_id", dest_id, &jf);
+ }
+};
+
+struct ACLMappings {
+ map<string, ACLMapping> acl_mappings;
+
+ void init(const JSONFormattable& config) {
+ for (auto& c : config.array()) {
+ ACLMapping m;
+ m.init(c);
+
+ acl_mappings.emplace(std::make_pair(m.source_id, m));
+ }
+ }
+ void dump_conf(CephContext *cct, JSONFormatter& jf) const {
+ Formatter::ArraySection os(jf, "acls");
+
+ for (auto& i : acl_mappings) {
+ i.second.dump_conf(cct, jf);
+ }
+ }
+};
+
+struct AWSSyncConfig_ACLProfiles {
+ map<string, std::shared_ptr<ACLMappings> > acl_profiles;
+
+ void init(const JSONFormattable& config) {
+ for (auto& c : config.array()) {
+ const string& profile_id = c["id"];
+
+ std::shared_ptr<ACLMappings> ap{new ACLMappings};
+ ap->init(c["acls"]);
+
+ acl_profiles[profile_id] = ap;
+ }
+ }
+ void dump_conf(CephContext *cct, JSONFormatter& jf) const {
+ Formatter::ArraySection section(jf, "acl_profiles");
+
+ for (auto& p : acl_profiles) {
+ Formatter::ObjectSection section(jf, "profile");
+ encode_json("id", p.first, &jf);
+ p.second->dump_conf(cct, jf);
+ }
+ }
+
+ bool find(const string& profile_id, ACLMappings *result) const {
+ auto iter = acl_profiles.find(profile_id);
+ if (iter == acl_profiles.end()) {
+ return false;
+ }
+ *result = *iter->second;
+ return true;
+ }
+};
+
+struct AWSSyncConfig_Connection {
+ string connection_id;
+ string endpoint;
+ RGWAccessKey key;
+ std::optional<string> region;
+ HostStyle host_style{PathStyle};
+
+ bool has_endpoint{false};
+ bool has_key{false};
+ bool has_host_style{false};
+
+ void init(const JSONFormattable& config) {
+ has_endpoint = config.exists("endpoint");
+ has_key = config.exists("access_key") || config.exists("secret");
+ has_host_style = config.exists("host_style");
+
+ connection_id = config["id"];
+ endpoint = config["endpoint"];
+
+ key = RGWAccessKey(config["access_key"], config["secret"]);
+
+ if (config.exists("region")) {
+ region = config["region"];
+ } else {
+ region.reset();
+ }
+
+ string host_style_str = config["host_style"];
+ if (host_style_str != "virtual") {
+ host_style = PathStyle;
+ } else {
+ host_style = VirtualStyle;
+ }
+ }
+ void dump_conf(CephContext *cct, JSONFormatter& jf) const {
+ Formatter::ObjectSection section(jf, "connection");
+ encode_json("id", connection_id, &jf);
+ encode_json("endpoint", endpoint, &jf);
+ string s = (host_style == PathStyle ? "path" : "virtual");
+ encode_json("region", region, &jf);
+ encode_json("host_style", s, &jf);
+
+ {
+ Formatter::ObjectSection os(jf, "key");
+ encode_json("access_key", key.id, &jf);
+ string secret = (key.key.empty() ? "" : "******");
+ encode_json("secret", secret, &jf);
+ }
+ }
+};
+
+static int conf_to_uint64(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& config, const string& key, uint64_t *pval)
+{
+ string sval;
+ if (config.find(key, &sval)) {
+ string err;
+ uint64_t val = strict_strtoll(sval.c_str(), 10, &err);
+ if (!err.empty()) {
+ ldpp_dout(dpp, 0) << "ERROR: could not parse configurable value for cloud sync module: " << key << ": " << sval << dendl;
+ return -EINVAL;
+ }
+ *pval = val;
+ }
+ return 0;
+}
+
+struct AWSSyncConfig_S3 {
+ uint64_t multipart_sync_threshold{DEFAULT_MULTIPART_SYNC_PART_SIZE};
+ uint64_t multipart_min_part_size{DEFAULT_MULTIPART_SYNC_PART_SIZE};
+
+ int init(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& config) {
+ int r = conf_to_uint64(dpp, cct, config, "multipart_sync_threshold", &multipart_sync_threshold);
+ if (r < 0) {
+ return r;
+ }
+
+ r = conf_to_uint64(dpp, cct, config, "multipart_min_part_size", &multipart_min_part_size);
+ if (r < 0) {
+ return r;
+ }
+#define MULTIPART_MIN_POSSIBLE_PART_SIZE (5 * 1024 * 1024)
+ if (multipart_min_part_size < MULTIPART_MIN_POSSIBLE_PART_SIZE) {
+ multipart_min_part_size = MULTIPART_MIN_POSSIBLE_PART_SIZE;
+ }
+ return 0;
+ }
+
+ void dump_conf(CephContext *cct, JSONFormatter& jf) const {
+ Formatter::ObjectSection section(jf, "s3");
+ encode_json("multipart_sync_threshold", multipart_sync_threshold, &jf);
+ encode_json("multipart_min_part_size", multipart_min_part_size, &jf);
+ }
+};
+
+struct AWSSyncConfig_Profile {
+ string source_bucket;
+ bool prefix{false};
+ string target_path;
+ string connection_id;
+ string acls_id;
+
+ std::shared_ptr<AWSSyncConfig_Connection> conn_conf;
+ std::shared_ptr<ACLMappings> acls;
+
+ std::shared_ptr<RGWRESTConn> conn;
+
+ void init(const JSONFormattable& config) {
+ source_bucket = config["source_bucket"];
+
+ prefix = (!source_bucket.empty() && source_bucket[source_bucket.size() - 1] == '*');
+
+ if (prefix) {
+ source_bucket = source_bucket.substr(0, source_bucket.size() - 1);
+ }
+
+ target_path = config["target_path"];
+ connection_id = config["connection_id"];
+ acls_id = config["acls_id"];
+
+ if (config.exists("connection")) {
+ conn_conf = make_shared<AWSSyncConfig_Connection>();
+ conn_conf->init(config["connection"]);
+ }
+
+ if (config.exists("acls")) {
+ acls = make_shared<ACLMappings>();
+ acls->init(config["acls"]);
+ }
+ }
+
+ void dump_conf(CephContext *cct, JSONFormatter& jf, const char *section = "config") const {
+ Formatter::ObjectSection config(jf, section);
+ string sb{source_bucket};
+ if (prefix) {
+ sb.append("*");
+ }
+ encode_json("source_bucket", sb, &jf);
+ encode_json("target_path", target_path, &jf);
+ encode_json("connection_id", connection_id, &jf);
+ encode_json("acls_id", acls_id, &jf);
+ if (conn_conf.get()) {
+ conn_conf->dump_conf(cct, jf);
+ }
+ if (acls.get()) {
+ acls->dump_conf(cct, jf);
+ }
+ }
+};
+
+static void find_and_replace(const string& src, const string& find, const string& replace, string *dest)
+{
+ string s = src;
+
+ size_t pos = s.find(find);
+ while (pos != string::npos) {
+ size_t next_ofs = pos + find.size();
+ s = s.substr(0, pos) + replace + s.substr(next_ofs);
+ pos = s.find(find, next_ofs);
+ }
+
+ *dest = s;
+}
+
+static void apply_meta_param(const string& src, const string& param, const string& val, string *dest)
+{
+ string s = string("${") + param + "}";
+ find_and_replace(src, s, val, dest);
+}
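+
+// Expansion example (illustrative values): with the default target path
+// "rgw-${zonegroup}-${sid}/${bucket}", a zonegroup "us", sync instance id
+// "1a2b3c" and bucket "photos" expand to the destination path
+// "rgw-us-1a2b3c/photos"; the object key is then appended after another '/'.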
+
+
+struct AWSSyncConfig {
+ AWSSyncConfig_Profile default_profile;
+ std::shared_ptr<AWSSyncConfig_Profile> root_profile;
+
+ map<string, std::shared_ptr<AWSSyncConfig_Connection> > connections;
+ AWSSyncConfig_ACLProfiles acl_profiles;
+
+ map<string, std::shared_ptr<AWSSyncConfig_Profile> > explicit_profiles;
+
+ AWSSyncConfig_S3 s3;
+
+ int init_profile(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& profile_conf, AWSSyncConfig_Profile& profile,
+ bool connection_must_exist) {
+ if (!profile.connection_id.empty()) {
+ if (profile.conn_conf) {
+ ldpp_dout(dpp, 0) << "ERROR: ambiguous profile connection configuration, connection_id=" << profile.connection_id << dendl;
+ return -EINVAL;
+ }
+ if (connections.find(profile.connection_id) == connections.end()) {
+ ldpp_dout(dpp, 0) << "ERROR: profile configuration reference non-existent connection_id=" << profile.connection_id << dendl;
+ return -EINVAL;
+ }
+ profile.conn_conf = connections[profile.connection_id];
+ } else if (!profile.conn_conf) {
+ profile.connection_id = default_profile.connection_id;
+ auto i = connections.find(profile.connection_id);
+ if (i != connections.end()) {
+ profile.conn_conf = i->second;
+ }
+ }
+
+ if (connection_must_exist && !profile.conn_conf) {
+ ldpp_dout(dpp, 0) << "ERROR: remote connection undefined for sync profile" << dendl;
+ return -EINVAL;
+ }
+
+ if (profile.conn_conf && default_profile.conn_conf) {
+ if (!profile.conn_conf->has_endpoint) {
+ profile.conn_conf->endpoint = default_profile.conn_conf->endpoint;
+ }
+ if (!profile.conn_conf->has_host_style) {
+ profile.conn_conf->host_style = default_profile.conn_conf->host_style;
+ }
+ if (!profile.conn_conf->has_key) {
+ profile.conn_conf->key = default_profile.conn_conf->key;
+ }
+ }
+
+ ACLMappings acl_mappings;
+
+ if (!profile.acls_id.empty()) {
+ if (!acl_profiles.find(profile.acls_id, &acl_mappings)) {
+ ldpp_dout(dpp, 0) << "ERROR: profile configuration reference non-existent acls id=" << profile.acls_id << dendl;
+ return -EINVAL;
+ }
+ profile.acls = acl_profiles.acl_profiles[profile.acls_id];
+ } else if (!profile.acls) {
+ if (default_profile.acls) {
+ profile.acls = default_profile.acls;
+ profile.acls_id = default_profile.acls_id;
+ }
+ }
+
+ if (profile.target_path.empty()) {
+ profile.target_path = default_profile.target_path;
+ }
+ if (profile.target_path.empty()) {
+ profile.target_path = default_target_path;
+ }
+
+ return 0;
+ }
+
+ int init_target(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& profile_conf, std::shared_ptr<AWSSyncConfig_Profile> *ptarget) {
+ std::shared_ptr<AWSSyncConfig_Profile> profile;
+ profile.reset(new AWSSyncConfig_Profile);
+ profile->init(profile_conf);
+
+ int ret = init_profile(dpp, cct, profile_conf, *profile, true);
+ if (ret < 0) {
+ return ret;
+ }
+
+ auto& sb = profile->source_bucket;
+
+ if (explicit_profiles.find(sb) != explicit_profiles.end()) {
+ ldpp_dout(dpp, 0) << "WARNING: duplicate target configuration in sync module" << dendl;
+ }
+
+ explicit_profiles[sb] = profile;
+ if (ptarget) {
+ *ptarget = profile;
+ }
+ return 0;
+ }
+
+ bool do_find_profile(const rgw_bucket bucket, std::shared_ptr<AWSSyncConfig_Profile> *result) {
+ const string& name = bucket.name;
+ auto iter = explicit_profiles.upper_bound(name);
+ if (iter == explicit_profiles.begin()) {
+ return false;
+ }
+
+ --iter;
+ if (iter->first.size() > name.size()) {
+ return false;
+ }
+ if (name.compare(0, iter->first.size(), iter->first) != 0) {
+ return false;
+ }
+
+ std::shared_ptr<AWSSyncConfig_Profile>& target = iter->second;
+
+ if (!target->prefix &&
+ name.size() != iter->first.size()) {
+ return false;
+ }
+
+ *result = target;
+ return true;
+ }
+
+ void find_profile(const rgw_bucket bucket, std::shared_ptr<AWSSyncConfig_Profile> *result) {
+ if (!do_find_profile(bucket, result)) {
+ *result = root_profile;
+ }
+ }
+
+ AWSSyncConfig() {}
+
+ int init(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& config) {
+ auto& default_conf = config["default"];
+
+ if (config.exists("default")) {
+ default_profile.init(default_conf);
+ init_profile(dpp, cct, default_conf, default_profile, false);
+ }
+
+ for (auto& conn : config["connections"].array()) {
+ auto new_conn = conn;
+
+ std::shared_ptr<AWSSyncConfig_Connection> c{new AWSSyncConfig_Connection};
+ c->init(new_conn);
+
+ connections[new_conn["id"]] = c;
+ }
+
+ acl_profiles.init(config["acl_profiles"]);
+
+ int r = s3.init(dpp, cct, config["s3"]);
+ if (r < 0) {
+ return r;
+ }
+
+ auto new_root_conf = config;
+
+ r = init_target(dpp, cct, new_root_conf, &root_profile); /* the root profile config */
+ if (r < 0) {
+ return r;
+ }
+
+ for (auto target_conf : config["profiles"].array()) {
+ int r = init_target(dpp, cct, target_conf, nullptr);
+ if (r < 0) {
+ return r;
+ }
+ }
+
+ JSONFormatter jf(true);
+ dump_conf(cct, jf);
+ stringstream ss;
+ jf.flush(ss);
+
+ ldpp_dout(dpp, 5) << "sync module config (parsed representation):\n" << ss.str() << dendl;
+
+ return 0;
+ }
+
+ void expand_target(RGWDataSyncCtx *sc, const string& sid, const string& path, string *dest) {
+ apply_meta_param(path, "sid", sid, dest);
+
+ const RGWZoneGroup& zg = sc->env->svc->zone->get_zonegroup();
+ apply_meta_param(path, "zonegroup", zg.get_name(), dest);
+ apply_meta_param(path, "zonegroup_id", zg.get_id(), dest);
+
+ const RGWZone& zone = sc->env->svc->zone->get_zone();
+ apply_meta_param(path, "zone", zone.name, dest);
+ apply_meta_param(path, "zone_id", zone.id, dest);
+ }
+
+ void update_config(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, const string& sid) {
+ expand_target(sc, sid, root_profile->target_path, &root_profile->target_path);
+ ldpp_dout(dpp, 20) << "updated target: (root) -> " << root_profile->target_path << dendl;
+ for (auto& t : explicit_profiles) {
+ expand_target(sc, sid, t.second->target_path, &t.second->target_path);
+ ldpp_dout(dpp, 20) << "updated target: " << t.first << " -> " << t.second->target_path << dendl;
+ }
+ }
+
+ void dump_conf(CephContext *cct, JSONFormatter& jf) const {
+ Formatter::ObjectSection config(jf, "config");
+ root_profile->dump_conf(cct, jf);
+ jf.open_array_section("connections");
+ for (auto c : connections) {
+ c.second->dump_conf(cct, jf);
+ }
+ jf.close_section();
+
+ acl_profiles.dump_conf(cct, jf);
+
+ { // targets
+ Formatter::ArraySection as(jf, "profiles");
+ for (auto& t : explicit_profiles) {
+ Formatter::ObjectSection target_section(jf, "profile");
+ encode_json("name", t.first, &jf);
+ t.second->dump_conf(cct, jf);
+ }
+ }
+ }
+
+ string get_path(std::shared_ptr<AWSSyncConfig_Profile>& profile,
+ const RGWBucketInfo& bucket_info,
+ const rgw_obj_key& obj) {
+ string bucket_str;
+ string owner;
+ if (!bucket_info.owner.tenant.empty()) {
+ bucket_str = owner = bucket_info.owner.tenant + "-";
+ owner += bucket_info.owner.id;
+ }
+ bucket_str += bucket_info.bucket.name;
+
+ const string& path = profile->target_path;
+
+ string new_path;
+ apply_meta_param(path, "bucket", bucket_str, &new_path);
+ apply_meta_param(new_path, "owner", owner, &new_path);
+
+ new_path += string("/") + get_key_oid(obj);
+
+ return new_path;
+ }
+
+ void get_target(std::shared_ptr<AWSSyncConfig_Profile>& profile,
+ const RGWBucketInfo& bucket_info,
+ const rgw_obj_key& obj,
+ string *bucket_name,
+ string *obj_name) {
+ string path = get_path(profile, bucket_info, obj);
+ size_t pos = path.find('/');
+
+ *bucket_name = path.substr(0, pos);
+ *obj_name = path.substr(pos + 1);
+ }
+
+ void init_conns(RGWDataSyncCtx *sc, const string& id) {
+ auto sync_env = sc->env;
+
+ update_config(sync_env->dpp, sc, id);
+
+ auto& root_conf = root_profile->conn_conf;
+
+ root_profile->conn.reset(new S3RESTConn(sc->cct,
+ id,
+ { root_conf->endpoint },
+ root_conf->key,
+ sync_env->svc->zone->get_zonegroup().get_id(),
+ root_conf->region,
+ root_conf->host_style));
+
+ for (auto i : explicit_profiles) {
+ auto& c = i.second;
+
+ c->conn.reset(new S3RESTConn(sc->cct,
+ id,
+ { c->conn_conf->endpoint },
+ c->conn_conf->key,
+ sync_env->svc->zone->get_zonegroup().get_id(),
+ c->conn_conf->region,
+ c->conn_conf->host_style));
+ }
+ }
+};
+
+
+struct AWSSyncInstanceEnv {
+ AWSSyncConfig conf;
+ string id;
+
+ explicit AWSSyncInstanceEnv(AWSSyncConfig& _conf) : conf(_conf) {}
+
+ void init(RGWDataSyncCtx *sc, uint64_t instance_id) {
+ char buf[32];
+ snprintf(buf, sizeof(buf), "%llx", (unsigned long long)instance_id);
+ id = buf;
+
+ conf.init_conns(sc, id);
+ }
+
+ void get_profile(const rgw_bucket& bucket, std::shared_ptr<AWSSyncConfig_Profile> *ptarget) {
+ conf.find_profile(bucket, ptarget);
+ ceph_assert(ptarget);
+ }
+};
+
+static int do_decode_rest_obj(const DoutPrefixProvider *dpp, CephContext *cct, map<string, bufferlist>& attrs, map<string, string>& headers, rgw_rest_obj *info)
+{
+ for (auto header : headers) {
+ const string& val = header.second;
+ if (header.first == "RGWX_OBJECT_SIZE") {
+ info->content_len = atoi(val.c_str());
+ } else {
+ info->attrs[header.first] = val;
+ }
+ }
+
+ info->acls.set_ctx(cct);
+ auto aiter = attrs.find(RGW_ATTR_ACL);
+ if (aiter != attrs.end()) {
+ bufferlist& bl = aiter->second;
+ auto bliter = bl.cbegin();
+ try {
+ info->acls.decode(bliter);
+ } catch (buffer::error& err) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to decode policy off attrs" << dendl;
+ return -EIO;
+ }
+ } else {
+ ldpp_dout(dpp, 0) << "WARNING: acl attrs not provided" << dendl;
+ }
+
+ return 0;
+}
+
+class RGWRESTStreamGetCRF : public RGWStreamReadHTTPResourceCRF
+{
+ RGWDataSyncCtx *sc;
+ RGWRESTConn *conn;
+ const rgw_obj& src_obj;
+ RGWRESTConn::get_obj_params req_params;
+
+ rgw_sync_aws_src_obj_properties src_properties;
+public:
+ RGWRESTStreamGetCRF(CephContext *_cct,
+ RGWCoroutinesEnv *_env,
+ RGWCoroutine *_caller,
+ RGWDataSyncCtx *_sc,
+ RGWRESTConn *_conn,
+ const rgw_obj& _src_obj,
+ const rgw_sync_aws_src_obj_properties& _src_properties) : RGWStreamReadHTTPResourceCRF(_cct, _env, _caller,
+ _sc->env->http_manager, _src_obj.key),
+ sc(_sc), conn(_conn), src_obj(_src_obj),
+ src_properties(_src_properties) {
+ }
+
+ int init(const DoutPrefixProvider *dpp) override {
+ /* init input connection */
+
+
+ req_params.get_op = true;
+ req_params.prepend_metadata = true;
+
+ req_params.unmod_ptr = &src_properties.mtime;
+ req_params.etag = src_properties.etag;
+ req_params.mod_zone_id = src_properties.zone_short_id;
+ req_params.mod_pg_ver = src_properties.pg_ver;
+
+ if (range.is_set) {
+ req_params.range_is_set = true;
+ req_params.range_start = range.ofs;
+ req_params.range_end = range.ofs + range.size - 1;
+ }
+
+ RGWRESTStreamRWRequest *in_req;
+ int ret = conn->get_obj(dpp, src_obj, req_params, false /* send */, &in_req);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: " << __func__ << "(): conn->get_obj() returned ret=" << ret << dendl;
+ return ret;
+ }
+
+ set_req(in_req);
+
+ return RGWStreamReadHTTPResourceCRF::init(dpp);
+ }
+
+ int decode_rest_obj(const DoutPrefixProvider *dpp, map<string, string>& headers, bufferlist& extra_data) override {
+ map<string, bufferlist> src_attrs;
+
+ ldpp_dout(dpp, 20) << __func__ << ":" << " headers=" << headers << " extra_data.length()=" << extra_data.length() << dendl;
+
+ if (extra_data.length() > 0) {
+ JSONParser jp;
+ if (!jp.parse(extra_data.c_str(), extra_data.length())) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to parse response extra data. len=" << extra_data.length() << " data=" << extra_data.c_str() << dendl;
+ return -EIO;
+ }
+
+ JSONDecoder::decode_json("attrs", src_attrs, &jp);
+ }
+ return do_decode_rest_obj(dpp, sc->cct, src_attrs, headers, &rest_obj);
+ }
+
+ bool need_extra_data() override {
+ return true;
+ }
+};
+
+static std::set<string> keep_headers = { "CONTENT_TYPE",
+ "CONTENT_ENCODING",
+ "CONTENT_DISPOSITION",
+ "CONTENT_LANGUAGE" };
+
+class RGWAWSStreamPutCRF : public RGWStreamWriteHTTPResourceCRF
+{
+ RGWDataSyncCtx *sc;
+ rgw_sync_aws_src_obj_properties src_properties;
+ std::shared_ptr<AWSSyncConfig_Profile> target;
+ const rgw_obj& dest_obj;
+ string etag;
+public:
+ RGWAWSStreamPutCRF(CephContext *_cct,
+ RGWCoroutinesEnv *_env,
+ RGWCoroutine *_caller,
+ RGWDataSyncCtx *_sc,
+ const rgw_sync_aws_src_obj_properties& _src_properties,
+ std::shared_ptr<AWSSyncConfig_Profile>& _target,
+ const rgw_obj& _dest_obj) : RGWStreamWriteHTTPResourceCRF(_cct, _env, _caller, _sc->env->http_manager),
+ sc(_sc), src_properties(_src_properties), target(_target), dest_obj(_dest_obj) {
+ }
+
+ int init() override {
+ /* init output connection */
+ RGWRESTStreamS3PutObj *out_req{nullptr};
+
+ if (multipart.is_multipart) {
+ char buf[32];
+ snprintf(buf, sizeof(buf), "%d", multipart.part_num);
+ rgw_http_param_pair params[] = { { "uploadId", multipart.upload_id.c_str() },
+ { "partNumber", buf },
+ { nullptr, nullptr } };
+ target->conn->put_obj_send_init(dest_obj, params, &out_req);
+ } else {
+ target->conn->put_obj_send_init(dest_obj, nullptr, &out_req);
+ }
+
+ set_req(out_req);
+
+ return RGWStreamWriteHTTPResourceCRF::init();
+ }
+
+ static bool keep_attr(const string& h) {
+ return (keep_headers.find(h) != keep_headers.end() ||
+ boost::algorithm::starts_with(h, "X_AMZ_"));
+ }
+
+ static void init_send_attrs(const DoutPrefixProvider *dpp,
+ CephContext *cct,
+ const rgw_rest_obj& rest_obj,
+ const rgw_sync_aws_src_obj_properties& src_properties,
+ const AWSSyncConfig_Profile *target,
+ map<string, string> *attrs) {
+ auto& new_attrs = *attrs;
+
+ new_attrs.clear();
+
+ for (auto& hi : rest_obj.attrs) {
+ if (keep_attr(hi.first)) {
+ new_attrs.insert(hi);
+ }
+ }
+
+ auto acl = rest_obj.acls.get_acl();
+
+ map<int, vector<string> > access_map;
+
+ if (target->acls) {
+ for (auto& grant : acl.get_grant_map()) {
+ auto& orig_grantee = grant.first;
+ auto& perm = grant.second;
+
+ string grantee;
+
+ const auto& am = target->acls->acl_mappings;
+
+ auto iter = am.find(orig_grantee);
+ if (iter == am.end()) {
+ ldpp_dout(dpp, 20) << "acl_mappings: Could not find " << orig_grantee << " .. ignoring" << dendl;
+ continue;
+ }
+
+ grantee = iter->second.dest_id;
+
+ string type;
+
+ switch (iter->second.type) {
+ case ACL_TYPE_CANON_USER:
+ type = "id";
+ break;
+ case ACL_TYPE_EMAIL_USER:
+ type = "emailAddress";
+ break;
+ case ACL_TYPE_GROUP:
+ type = "uri";
+ break;
+ default:
+ continue;
+ }
+
+ string tv = type + "=" + grantee;
+
+ int flags = perm.get_permission().get_permissions();
+ if ((flags & RGW_PERM_FULL_CONTROL) == RGW_PERM_FULL_CONTROL) {
+ access_map[flags].push_back(tv);
+ continue;
+ }
+
+ for (int i = 1; i <= RGW_PERM_WRITE_ACP; i <<= 1) {
+ if (flags & i) {
+ access_map[i].push_back(tv);
+ }
+ }
+ }
+ }
+
+ for (auto aiter : access_map) {
+ int grant_type = aiter.first;
+
+ string header_str("x-amz-grant-");
+
+ switch (grant_type) {
+ case RGW_PERM_READ:
+ header_str.append("read");
+ break;
+ case RGW_PERM_WRITE:
+ header_str.append("write");
+ break;
+ case RGW_PERM_READ_ACP:
+ header_str.append("read-acp");
+ break;
+ case RGW_PERM_WRITE_ACP:
+ header_str.append("write-acp");
+ break;
+ case RGW_PERM_FULL_CONTROL:
+ header_str.append("full-control");
+ break;
+ }
+
+ string s;
+
+ for (auto viter : aiter.second) {
+ if (!s.empty()) {
+ s.append(", ");
+ }
+ s.append(viter);
+ }
+
+ ldpp_dout(dpp, 20) << "acl_mappings: set acl: " << header_str << "=" << s << dendl;
+
+ new_attrs[header_str] = s;
+ }
+
+ char buf[32];
+ snprintf(buf, sizeof(buf), "%llu", (long long)src_properties.versioned_epoch);
+ new_attrs["x-amz-meta-rgwx-versioned-epoch"] = buf;
+
+ utime_t ut(src_properties.mtime);
+ snprintf(buf, sizeof(buf), "%lld.%09lld",
+ (long long)ut.sec(),
+ (long long)ut.nsec());
+
+ new_attrs["x-amz-meta-rgwx-source-mtime"] = buf;
+ new_attrs["x-amz-meta-rgwx-source-etag"] = src_properties.etag;
+ new_attrs["x-amz-meta-rgwx-source-key"] = rest_obj.key.name;
+ if (!rest_obj.key.instance.empty()) {
+ new_attrs["x-amz-meta-rgwx-source-version-id"] = rest_obj.key.instance;
+ }
+ }
+
+ void send_ready(const DoutPrefixProvider *dpp, const rgw_rest_obj& rest_obj) override {
+ RGWRESTStreamS3PutObj *r = static_cast<RGWRESTStreamS3PutObj *>(req);
+
+ map<string, string> new_attrs;
+ if (!multipart.is_multipart) {
+ init_send_attrs(dpp, sc->cct, rest_obj, src_properties, target.get(), &new_attrs);
+ }
+
+ r->set_send_length(rest_obj.content_len);
+
+ RGWAccessControlPolicy policy;
+
+ r->send_ready(dpp, target->conn->get_key(), new_attrs, policy);
+ }
+
+ void handle_headers(const map<string, string>& headers) {
+ for (auto h : headers) {
+ if (h.first == "ETAG") {
+ etag = h.second;
+ }
+ }
+ }
+
+ bool get_etag(string *petag) {
+ if (etag.empty()) {
+ return false;
+ }
+ *petag = etag;
+ return true;
+ }
+};
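+
+// Illustration of the mapping done in init_send_attrs() above (hypothetical
+// values): an ACL mapping { type=id, source_id="alice", dest_id="bob" } plus a
+// READ grant for "alice" on the source object produces the request header
+//   x-amz-grant-read: id=bob
+// and every object additionally carries x-amz-meta-rgwx-source-mtime,
+// x-amz-meta-rgwx-source-etag, x-amz-meta-rgwx-source-key and
+// x-amz-meta-rgwx-versioned-epoch so the source version can be traced back.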
+
+
+class RGWAWSStreamObjToCloudPlainCR : public RGWCoroutine {
+ RGWDataSyncCtx *sc;
+ RGWRESTConn *source_conn;
+ std::shared_ptr<AWSSyncConfig_Profile> target;
+ const rgw_obj& src_obj;
+ const rgw_obj& dest_obj;
+
+ rgw_sync_aws_src_obj_properties src_properties;
+
+ std::shared_ptr<RGWStreamReadHTTPResourceCRF> in_crf;
+ std::shared_ptr<RGWStreamWriteHTTPResourceCRF> out_crf;
+
+public:
+ RGWAWSStreamObjToCloudPlainCR(RGWDataSyncCtx *_sc,
+ RGWRESTConn *_source_conn,
+ const rgw_obj& _src_obj,
+ const rgw_sync_aws_src_obj_properties& _src_properties,
+ std::shared_ptr<AWSSyncConfig_Profile> _target,
+ const rgw_obj& _dest_obj) : RGWCoroutine(_sc->cct),
+ sc(_sc),
+ source_conn(_source_conn),
+ target(_target),
+ src_obj(_src_obj),
+ dest_obj(_dest_obj),
+ src_properties(_src_properties) {}
+
+ int operate(const DoutPrefixProvider *dpp) override {
+ reenter(this) {
+ /* init input */
+ in_crf.reset(new RGWRESTStreamGetCRF(cct, get_env(), this, sc,
+ source_conn, src_obj,
+ src_properties));
+
+ /* init output */
+ out_crf.reset(new RGWAWSStreamPutCRF(cct, get_env(), this, sc,
+ src_properties, target, dest_obj));
+
+ yield call(new RGWStreamSpliceCR(cct, sc->env->http_manager, in_crf, out_crf));
+ if (retcode < 0) {
+ return set_cr_error(retcode);
+ }
+
+ return set_cr_done();
+ }
+
+ return 0;
+ }
+};
+
+class RGWAWSStreamObjToCloudMultipartPartCR : public RGWCoroutine {
+ RGWDataSyncCtx *sc;
+ RGWRESTConn *source_conn;
+ std::shared_ptr<AWSSyncConfig_Profile> target;
+ const rgw_obj& src_obj;
+ const rgw_obj& dest_obj;
+
+ rgw_sync_aws_src_obj_properties src_properties;
+
+ string upload_id;
+
+ rgw_sync_aws_multipart_part_info part_info;
+
+ std::shared_ptr<RGWStreamReadHTTPResourceCRF> in_crf;
+ std::shared_ptr<RGWStreamWriteHTTPResourceCRF> out_crf;
+
+ string *petag;
+
+public:
+ RGWAWSStreamObjToCloudMultipartPartCR(RGWDataSyncCtx *_sc,
+ RGWRESTConn *_source_conn,
+ const rgw_obj& _src_obj,
+ std::shared_ptr<AWSSyncConfig_Profile>& _target,
+ const rgw_obj& _dest_obj,
+ const rgw_sync_aws_src_obj_properties& _src_properties,
+ const string& _upload_id,
+ const rgw_sync_aws_multipart_part_info& _part_info,
+ string *_petag) : RGWCoroutine(_sc->cct),
+ sc(_sc),
+ source_conn(_source_conn),
+ target(_target),
+ src_obj(_src_obj),
+ dest_obj(_dest_obj),
+ src_properties(_src_properties),
+ upload_id(_upload_id),
+ part_info(_part_info),
+ petag(_petag) {}
+
+ int operate(const DoutPrefixProvider *dpp) override {
+ reenter(this) {
+ /* init input */
+ in_crf.reset(new RGWRESTStreamGetCRF(cct, get_env(), this, sc,
+ source_conn, src_obj,
+ src_properties));
+
+ in_crf->set_range(part_info.ofs, part_info.size);
+
+ /* init output */
+ out_crf.reset(new RGWAWSStreamPutCRF(cct, get_env(), this, sc,
+ src_properties, target, dest_obj));
+
+ out_crf->set_multipart(upload_id, part_info.part_num, part_info.size);
+
+ yield call(new RGWStreamSpliceCR(cct, sc->env->http_manager, in_crf, out_crf));
+ if (retcode < 0) {
+ return set_cr_error(retcode);
+ }
+
+ if (!(static_cast<RGWAWSStreamPutCRF *>(out_crf.get()))->get_etag(petag)) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to get etag from PUT request" << dendl;
+ return set_cr_error(-EIO);
+ }
+
+ return set_cr_done();
+ }
+
+ return 0;
+ }
+};
+
+class RGWAWSAbortMultipartCR : public RGWCoroutine {
+ RGWDataSyncCtx *sc;
+ RGWRESTConn *dest_conn;
+ const rgw_obj& dest_obj;
+
+ string upload_id;
+
+public:
+ RGWAWSAbortMultipartCR(RGWDataSyncCtx *_sc,
+ RGWRESTConn *_dest_conn,
+ const rgw_obj& _dest_obj,
+ const string& _upload_id) : RGWCoroutine(_sc->cct),
+ sc(_sc),
+ dest_conn(_dest_conn),
+ dest_obj(_dest_obj),
+ upload_id(_upload_id) {}
+
+ int operate(const DoutPrefixProvider *dpp) override {
+ reenter(this) {
+
+ yield {
+ rgw_http_param_pair params[] = { { "uploadId", upload_id.c_str() }, {nullptr, nullptr} };
+ bufferlist bl;
+ call(new RGWDeleteRESTResourceCR(sc->cct, dest_conn, sc->env->http_manager,
+ obj_to_aws_path(dest_obj), params));
+ }
+
+ if (retcode < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to abort multipart upload for dest object=" << dest_obj << " (retcode=" << retcode << ")" << dendl;
+ return set_cr_error(retcode);
+ }
+
+ return set_cr_done();
+ }
+
+ return 0;
+ }
+};
+
+class RGWAWSInitMultipartCR : public RGWCoroutine {
+ RGWDataSyncCtx *sc;
+ RGWRESTConn *dest_conn;
+ const rgw_obj& dest_obj;
+
+ uint64_t obj_size;
+ map<string, string> attrs;
+
+ bufferlist out_bl;
+
+ string *upload_id;
+
+ struct InitMultipartResult {
+ string bucket;
+ string key;
+ string upload_id;
+
+ void decode_xml(XMLObj *obj) {
+ RGWXMLDecoder::decode_xml("Bucket", bucket, obj);
+ RGWXMLDecoder::decode_xml("Key", key, obj);
+ RGWXMLDecoder::decode_xml("UploadId", upload_id, obj);
+ }
+ } result;
+
+public:
+ RGWAWSInitMultipartCR(RGWDataSyncCtx *_sc,
+ RGWRESTConn *_dest_conn,
+ const rgw_obj& _dest_obj,
+ uint64_t _obj_size,
+ const map<string, string>& _attrs,
+ string *_upload_id) : RGWCoroutine(_sc->cct),
+ sc(_sc),
+ dest_conn(_dest_conn),
+ dest_obj(_dest_obj),
+ obj_size(_obj_size),
+ attrs(_attrs),
+ upload_id(_upload_id) {}
+
+ int operate(const DoutPrefixProvider *dpp) override {
+ reenter(this) {
+
+ yield {
+ rgw_http_param_pair params[] = { { "uploads", nullptr }, {nullptr, nullptr} };
+ bufferlist bl;
+ call(new RGWPostRawRESTResourceCR <bufferlist> (sc->cct, dest_conn, sc->env->http_manager,
+ obj_to_aws_path(dest_obj), params, &attrs, bl, &out_bl));
+ }
+
+ if (retcode < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to initialize multipart upload for dest object=" << dest_obj << dendl;
+ return set_cr_error(retcode);
+ }
+ {
+ /*
+ * If one of the following fails we cannot abort the upload, as we cannot
+ * extract the upload id. If one of these fails it's very likely that that's
+ * the least of our problems.
+ */
+ RGWXMLDecoder::XMLParser parser;
+ if (!parser.init()) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to initialize xml parser for parsing multipart init response from server" << dendl;
+ return set_cr_error(-EIO);
+ }
+
+ if (!parser.parse(out_bl.c_str(), out_bl.length(), 1)) {
+ string str(out_bl.c_str(), out_bl.length());
+ ldpp_dout(dpp, 5) << "ERROR: failed to parse xml: " << str << dendl;
+ return set_cr_error(-EIO);
+ }
+
+ try {
+ RGWXMLDecoder::decode_xml("InitiateMultipartUploadResult", result, &parser, true);
+ } catch (RGWXMLDecoder::err& err) {
+ string str(out_bl.c_str(), out_bl.length());
+ ldpp_dout(dpp, 5) << "ERROR: unexpected xml: " << str << dendl;
+ return set_cr_error(-EIO);
+ }
+ }
+
+ ldpp_dout(dpp, 20) << "init multipart result: bucket=" << result.bucket << " key=" << result.key << " upload_id=" << result.upload_id << dendl;
+
+ *upload_id = result.upload_id;
+
+ return set_cr_done();
+ }
+
+ return 0;
+ }
+};
+
+class RGWAWSCompleteMultipartCR : public RGWCoroutine {
+ RGWDataSyncCtx *sc;
+ RGWRESTConn *dest_conn;
+ const rgw_obj& dest_obj;
+
+ bufferlist out_bl;
+
+ string upload_id;
+
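+  /* serialized (via encode_xml() in operate() below) into the standard S3
+   * CompleteMultipartUpload request body, e.g. (illustrative):
+   *   <CompleteMultipartUpload>
+   *     <Part><PartNumber>1</PartNumber><ETag>"etag1"</ETag></Part>
+   *     <Part><PartNumber>2</PartNumber><ETag>"etag2"</ETag></Part>
+   *   </CompleteMultipartUpload>
+   */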
+ struct CompleteMultipartReq {
+ map<int, rgw_sync_aws_multipart_part_info> parts;
+
+ explicit CompleteMultipartReq(const map<int, rgw_sync_aws_multipart_part_info>& _parts) : parts(_parts) {}
+
+ void dump_xml(Formatter *f) const {
+      for (const auto& p : parts) {
+        f->open_object_section("Part");
+        encode_xml("PartNumber", p.first, f);
+        encode_xml("ETag", p.second.etag, f);
+        f->close_section();
+      }
+ }
+ } req_enc;
+
+ struct CompleteMultipartResult {
+ string location;
+ string bucket;
+ string key;
+ string etag;
+
+ void decode_xml(XMLObj *obj) {
+      RGWXMLDecoder::decode_xml("Location", location, obj);
+ RGWXMLDecoder::decode_xml("Bucket", bucket, obj);
+ RGWXMLDecoder::decode_xml("Key", key, obj);
+ RGWXMLDecoder::decode_xml("ETag", etag, obj);
+ }
+ } result;
+
+public:
+ RGWAWSCompleteMultipartCR(RGWDataSyncCtx *_sc,
+ RGWRESTConn *_dest_conn,
+ const rgw_obj& _dest_obj,
+ string _upload_id,
+ const map<int, rgw_sync_aws_multipart_part_info>& _parts) : RGWCoroutine(_sc->cct),
+ sc(_sc),
+ dest_conn(_dest_conn),
+ dest_obj(_dest_obj),
+ upload_id(_upload_id),
+ req_enc(_parts) {}
+
+ int operate(const DoutPrefixProvider *dpp) override {
+ reenter(this) {
+
+ yield {
+ rgw_http_param_pair params[] = { { "uploadId", upload_id.c_str() }, {nullptr, nullptr} };
+ stringstream ss;
+ XMLFormatter formatter;
+
+ encode_xml("CompleteMultipartUpload", req_enc, &formatter);
+
+ formatter.flush(ss);
+
+ bufferlist bl;
+ bl.append(ss.str());
+
+ call(new RGWPostRawRESTResourceCR <bufferlist> (sc->cct, dest_conn, sc->env->http_manager,
+ obj_to_aws_path(dest_obj), params, nullptr, bl, &out_bl));
+ }
+
+ if (retcode < 0) {
+        ldpp_dout(dpp, 0) << "ERROR: failed to complete multipart upload for dest object=" << dest_obj << dendl;
+ return set_cr_error(retcode);
+ }
+ {
+ /*
+       * If any of the following fails we cannot confirm whether the complete
+       * request actually succeeded on the remote end; if that happens, it is
+       * very likely the least of our problems.
+ */
+ RGWXMLDecoder::XMLParser parser;
+ if (!parser.init()) {
+          ldpp_dout(dpp, 0) << "ERROR: failed to initialize xml parser for parsing multipart complete response from server" << dendl;
+ return set_cr_error(-EIO);
+ }
+
+ if (!parser.parse(out_bl.c_str(), out_bl.length(), 1)) {
+ string str(out_bl.c_str(), out_bl.length());
+ ldpp_dout(dpp, 5) << "ERROR: failed to parse xml: " << str << dendl;
+ return set_cr_error(-EIO);
+ }
+
+ try {
+ RGWXMLDecoder::decode_xml("CompleteMultipartUploadResult", result, &parser, true);
+ } catch (RGWXMLDecoder::err& err) {
+ string str(out_bl.c_str(), out_bl.length());
+ ldpp_dout(dpp, 5) << "ERROR: unexpected xml: " << str << dendl;
+ return set_cr_error(-EIO);
+ }
+ }
+
+ ldpp_dout(dpp, 20) << "complete multipart result: location=" << result.location << " bucket=" << result.bucket << " key=" << result.key << " etag=" << result.etag << dendl;
+
+ return set_cr_done();
+ }
+
+ return 0;
+ }
+};
+
+
+class RGWAWSStreamAbortMultipartUploadCR : public RGWCoroutine {
+ RGWDataSyncCtx *sc;
+ RGWRESTConn *dest_conn;
+ const rgw_obj& dest_obj;
+ const rgw_raw_obj status_obj;
+
+ string upload_id;
+
+public:
+
+ RGWAWSStreamAbortMultipartUploadCR(RGWDataSyncCtx *_sc,
+ RGWRESTConn *_dest_conn,
+ const rgw_obj& _dest_obj,
+ const rgw_raw_obj& _status_obj,
+ const string& _upload_id) : RGWCoroutine(_sc->cct), sc(_sc),
+ dest_conn(_dest_conn),
+ dest_obj(_dest_obj),
+ status_obj(_status_obj),
+ upload_id(_upload_id) {}
+
+ int operate(const DoutPrefixProvider *dpp) override {
+ reenter(this) {
+ yield call(new RGWAWSAbortMultipartCR(sc, dest_conn, dest_obj, upload_id));
+ if (retcode < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to abort multipart upload dest obj=" << dest_obj << " upload_id=" << upload_id << " retcode=" << retcode << dendl;
+ /* ignore error, best effort */
+ }
+ yield call(new RGWRadosRemoveCR(sc->env->driver, status_obj));
+ if (retcode < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to remove sync status obj obj=" << status_obj << " retcode=" << retcode << dendl;
+ /* ignore error, best effort */
+ }
+ return set_cr_done();
+ }
+
+ return 0;
+ }
+};
+
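+/*
+ * Stream one object to the cloud endpoint as a resumable multipart upload:
+ * read any previously persisted upload state from the status RADOS object,
+ * start a fresh upload when there is none (or the source object changed),
+ * upload the remaining parts while persisting progress after each part, then
+ * complete the upload and remove the status object. On error the upload is
+ * aborted on a best-effort basis.
+ */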
+class RGWAWSStreamObjToCloudMultipartCR : public RGWCoroutine {
+ RGWDataSyncCtx *sc;
+ RGWDataSyncEnv *sync_env;
+ AWSSyncConfig& conf;
+ RGWRESTConn *source_conn;
+ std::shared_ptr<AWSSyncConfig_Profile> target;
+ const rgw_obj& src_obj;
+ const rgw_obj& dest_obj;
+
+ uint64_t obj_size;
+ string src_etag;
+ rgw_sync_aws_src_obj_properties src_properties;
+ rgw_rest_obj rest_obj;
+
+ rgw_sync_aws_multipart_upload_info status;
+
+ map<string, string> new_attrs;
+
+ rgw_sync_aws_multipart_part_info *pcur_part_info{nullptr};
+
+ int ret_err{0};
+
+ rgw_raw_obj status_obj;
+
+public:
+ RGWAWSStreamObjToCloudMultipartCR(RGWDataSyncCtx *_sc,
+ rgw_bucket_sync_pipe& _sync_pipe,
+ AWSSyncConfig& _conf,
+ RGWRESTConn *_source_conn,
+ const rgw_obj& _src_obj,
+ std::shared_ptr<AWSSyncConfig_Profile>& _target,
+ const rgw_obj& _dest_obj,
+ uint64_t _obj_size,
+ const rgw_sync_aws_src_obj_properties& _src_properties,
+ const rgw_rest_obj& _rest_obj) : RGWCoroutine(_sc->cct),
+ sc(_sc),
+ sync_env(_sc->env),
+ conf(_conf),
+ source_conn(_source_conn),
+ target(_target),
+ src_obj(_src_obj),
+ dest_obj(_dest_obj),
+ obj_size(_obj_size),
+ src_properties(_src_properties),
+ rest_obj(_rest_obj),
+ status_obj(sync_env->svc->zone->get_zone_params().log_pool,
+ RGWBucketPipeSyncStatusManager::obj_status_oid(_sync_pipe, sc->source_zone, src_obj)) {
+ }
+
+
+ int operate(const DoutPrefixProvider *dpp) override {
+ reenter(this) {
+ yield call(new RGWSimpleRadosReadCR<rgw_sync_aws_multipart_upload_info>(
+ dpp, sync_env->driver, status_obj, &status, false));
+
+ if (retcode < 0 && retcode != -ENOENT) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to read sync status of object " << src_obj << " retcode=" << retcode << dendl;
+ return retcode;
+ }
+
+ if (retcode >= 0) {
+ /* check here that mtime and size did not change */
+
+ if (status.src_properties.mtime != src_properties.mtime || status.obj_size != obj_size ||
+ status.src_properties.etag != src_properties.etag) {
+ yield call(new RGWAWSStreamAbortMultipartUploadCR(sc, target->conn.get(), dest_obj, status_obj, status.upload_id));
+ retcode = -ENOENT;
+ }
+ }
+
+ if (retcode == -ENOENT) {
+ RGWAWSStreamPutCRF::init_send_attrs(dpp, sc->cct, rest_obj, src_properties, target.get(), &new_attrs);
+
+ yield call(new RGWAWSInitMultipartCR(sc, target->conn.get(), dest_obj, status.obj_size, std::move(new_attrs), &status.upload_id));
+ if (retcode < 0) {
+ return set_cr_error(retcode);
+ }
+
+ status.obj_size = obj_size;
+ status.src_properties = src_properties;
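+          /* choose a part size that keeps the upload within the S3 limit of
+           * MULTIPART_MAX_PARTS parts while never going below the configured
+           * multipart_min_part_size */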
+#define MULTIPART_MAX_PARTS 10000
+ uint64_t min_part_size = obj_size / MULTIPART_MAX_PARTS;
+ status.part_size = std::max(conf.s3.multipart_min_part_size, min_part_size);
+ status.num_parts = (obj_size + status.part_size - 1) / status.part_size;
+ status.cur_part = 1;
+ }
+
+ for (; (uint32_t)status.cur_part <= status.num_parts; ++status.cur_part) {
+ yield {
+ rgw_sync_aws_multipart_part_info& cur_part_info = status.parts[status.cur_part];
+ cur_part_info.part_num = status.cur_part;
+ cur_part_info.ofs = status.cur_ofs;
+ cur_part_info.size = std::min((uint64_t)status.part_size, status.obj_size - status.cur_ofs);
+
+ pcur_part_info = &cur_part_info;
+
+ status.cur_ofs += status.part_size;
+
+ call(new RGWAWSStreamObjToCloudMultipartPartCR(sc,
+ source_conn, src_obj,
+ target,
+ dest_obj,
+ status.src_properties,
+ status.upload_id,
+ cur_part_info,
+ &cur_part_info.etag));
+ }
+
+ if (retcode < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to sync obj=" << src_obj << ", sync via multipart upload, upload_id=" << status.upload_id << " part number " << status.cur_part << " (error: " << cpp_strerror(-retcode) << ")" << dendl;
+ ret_err = retcode;
+ yield call(new RGWAWSStreamAbortMultipartUploadCR(sc, target->conn.get(), dest_obj, status_obj, status.upload_id));
+ return set_cr_error(ret_err);
+ }
+
+ yield call(new RGWSimpleRadosWriteCR<rgw_sync_aws_multipart_upload_info>(dpp, sync_env->driver, status_obj, status));
+ if (retcode < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to store multipart upload state, retcode=" << retcode << dendl;
+ /* continue with upload anyway */
+ }
+ ldpp_dout(dpp, 20) << "sync of object=" << src_obj << " via multipart upload, finished sending part #" << status.cur_part << " etag=" << pcur_part_info->etag << dendl;
+ }
+
+ yield call(new RGWAWSCompleteMultipartCR(sc, target->conn.get(), dest_obj, status.upload_id, status.parts));
+ if (retcode < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to complete multipart upload of obj=" << src_obj << " (error: " << cpp_strerror(-retcode) << ")" << dendl;
+ ret_err = retcode;
+ yield call(new RGWAWSStreamAbortMultipartUploadCR(sc, target->conn.get(), dest_obj, status_obj, status.upload_id));
+ return set_cr_error(ret_err);
+ }
+
+ /* remove status obj */
+ yield call(new RGWRadosRemoveCR(sync_env->driver, status_obj));
+ if (retcode < 0) {
+        ldpp_dout(dpp, 0) << "ERROR: failed to remove sync status obj=" << status_obj << " for obj=" << src_obj << " upload_id=" << status.upload_id << " (" << cpp_strerror(-retcode) << ")" << dendl;
+ /* ignore error, best effort */
+ }
+ return set_cr_done();
+ }
+
+ return 0;
+ }
+};
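+
+/* decode a single object xattr into *result, falling back to def_val when the
+ * attribute is missing or empty; returns -EIO if decoding fails */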
+template <class T>
+int decode_attr(map<string, bufferlist>& attrs, const char *attr_name, T *result, T def_val)
+{
+ map<string, bufferlist>::iterator iter = attrs.find(attr_name);
+ if (iter == attrs.end()) {
+ *result = def_val;
+ return 0;
+ }
+ bufferlist& bl = iter->second;
+ if (bl.length() == 0) {
+ *result = def_val;
+ return 0;
+ }
+ auto bliter = bl.cbegin();
+ try {
+ decode(*result, bliter);
+ } catch (buffer::error& err) {
+ return -EIO;
+ }
+ return 0;
+}
+
+// maybe use Fetch Remote Obj instead?
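+/*
+ * Callback run with the stat results of the remote object: decode the source
+ * pg-version/zone attrs, lazily create the destination bucket on the cloud
+ * endpoint (treating a BucketAlreadyOwnedByYou error as success), then stream
+ * the object with either a single PUT or a multipart upload, depending on
+ * multipart_sync_threshold.
+ */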
+class RGWAWSHandleRemoteObjCBCR: public RGWStatRemoteObjCBCR {
+ rgw_bucket_sync_pipe sync_pipe;
+ AWSSyncInstanceEnv& instance;
+
+ uint64_t versioned_epoch{0};
+
+ RGWRESTConn *source_conn{nullptr};
+ std::shared_ptr<AWSSyncConfig_Profile> target;
+ bufferlist res;
+ unordered_map <string, bool> bucket_created;
+ rgw_rest_obj rest_obj;
+ int ret{0};
+
+ uint32_t src_zone_short_id{0};
+ uint64_t src_pg_ver{0};
+
+ bufferlist out_bl;
+
+ struct CreateBucketResult {
+ string code;
+
+ void decode_xml(XMLObj *obj) {
+ RGWXMLDecoder::decode_xml("Code", code, obj);
+ }
+ } result;
+
+ rgw_obj src_obj;
+ rgw_obj dest_obj;
+
+public:
+ RGWAWSHandleRemoteObjCBCR(RGWDataSyncCtx *_sc,
+ rgw_bucket_sync_pipe& _sync_pipe,
+ rgw_obj_key& _key,
+ AWSSyncInstanceEnv& _instance,
+ uint64_t _versioned_epoch) : RGWStatRemoteObjCBCR(_sc, _sync_pipe.info.source_bs.bucket, _key),
+ sync_pipe(_sync_pipe),
+ instance(_instance), versioned_epoch(_versioned_epoch)
+ {}
+
+ ~RGWAWSHandleRemoteObjCBCR(){
+ }
+
+ int operate(const DoutPrefixProvider *dpp) override {
+ reenter(this) {
+ ret = decode_attr(attrs, RGW_ATTR_PG_VER, &src_pg_ver, (uint64_t)0);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to decode pg ver attr, ignoring" << dendl;
+ } else {
+ ret = decode_attr(attrs, RGW_ATTR_SOURCE_ZONE, &src_zone_short_id, (uint32_t)0);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to decode source zone short_id attr, ignoring" << dendl;
+ src_pg_ver = 0; /* all or nothing */
+ }
+ }
+ ldpp_dout(dpp, 4) << "AWS: download begin: z=" << sc->source_zone
+ << " b=" << src_bucket << " k=" << key << " size=" << size
+ << " mtime=" << mtime << " etag=" << etag
+ << " zone_short_id=" << src_zone_short_id << " pg_ver=" << src_pg_ver
+ << dendl;
+
+ source_conn = sync_env->svc->zone->get_zone_conn(sc->source_zone);
+ if (!source_conn) {
+ ldpp_dout(dpp, 0) << "ERROR: cannot find http connection to zone " << sc->source_zone << dendl;
+ return set_cr_error(-EINVAL);
+ }
+
+ instance.get_profile(sync_pipe.info.source_bs.bucket, &target);
+ instance.conf.get_target(target, sync_pipe.dest_bucket_info, key, &dest_obj.bucket.name, &dest_obj.key.name);
+
+ if (bucket_created.find(dest_obj.bucket.name) == bucket_created.end()){
+ yield {
+ ldpp_dout(dpp, 0) << "AWS: creating bucket " << dest_obj.bucket.name << dendl;
+ bufferlist bl;
+ call(new RGWPutRawRESTResourceCR <bufferlist> (sc->cct, target->conn.get(),
+ sync_env->http_manager,
+ dest_obj.bucket.name, nullptr, bl, &out_bl));
+ }
+ if (retcode < 0 ) {
+ RGWXMLDecoder::XMLParser parser;
+ if (!parser.init()) {
+            ldpp_dout(dpp, 0) << "ERROR: failed to initialize xml parser for parsing create bucket response from server" << dendl;
+ return set_cr_error(retcode);
+ }
+
+ if (!parser.parse(out_bl.c_str(), out_bl.length(), 1)) {
+ string str(out_bl.c_str(), out_bl.length());
+ ldpp_dout(dpp, 5) << "ERROR: failed to parse xml: " << str << dendl;
+ return set_cr_error(retcode);
+ }
+
+ try {
+ RGWXMLDecoder::decode_xml("Error", result, &parser, true);
+ } catch (RGWXMLDecoder::err& err) {
+ string str(out_bl.c_str(), out_bl.length());
+ ldpp_dout(dpp, 5) << "ERROR: unexpected xml: " << str << dendl;
+ return set_cr_error(retcode);
+ }
+
+ if (result.code != "BucketAlreadyOwnedByYou") {
+ return set_cr_error(retcode);
+ }
+ }
+
+ bucket_created[dest_obj.bucket.name] = true;
+ }
+
+ yield {
+ src_obj.bucket = src_bucket;
+ src_obj.key = key;
+
+ /* init output */
+ rgw_sync_aws_src_obj_properties src_properties;
+ src_properties.mtime = mtime;
+ src_properties.etag = etag;
+ src_properties.zone_short_id = src_zone_short_id;
+ src_properties.pg_ver = src_pg_ver;
+ src_properties.versioned_epoch = versioned_epoch;
+
+ if (size < instance.conf.s3.multipart_sync_threshold) {
+ call(new RGWAWSStreamObjToCloudPlainCR(sc, source_conn, src_obj,
+ src_properties,
+ target,
+ dest_obj));
+ } else {
+ rgw_rest_obj rest_obj;
+ rest_obj.init(key);
+ if (do_decode_rest_obj(dpp, sc->cct, attrs, headers, &rest_obj)) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to decode rest obj out of headers=" << headers << ", attrs=" << attrs << dendl;
+ return set_cr_error(-EINVAL);
+ }
+ call(new RGWAWSStreamObjToCloudMultipartCR(sc, sync_pipe, instance.conf, source_conn, src_obj,
+ target, dest_obj, size, src_properties, rest_obj));
+ }
+ }
+ if (retcode < 0) {
+ return set_cr_error(retcode);
+ }
+
+ return set_cr_done();
+ }
+
+ return 0;
+ }
+};
+
+class RGWAWSHandleRemoteObjCR : public RGWCallStatRemoteObjCR {
+ rgw_bucket_sync_pipe sync_pipe;
+ AWSSyncInstanceEnv& instance;
+ uint64_t versioned_epoch;
+public:
+ RGWAWSHandleRemoteObjCR(RGWDataSyncCtx *_sc,
+ rgw_bucket_sync_pipe& _sync_pipe, rgw_obj_key& _key,
+ AWSSyncInstanceEnv& _instance, uint64_t _versioned_epoch) : RGWCallStatRemoteObjCR(_sc, _sync_pipe.info.source_bs.bucket, _key),
+ sync_pipe(_sync_pipe),
+ instance(_instance), versioned_epoch(_versioned_epoch) {
+ }
+
+ ~RGWAWSHandleRemoteObjCR() {}
+
+ RGWStatRemoteObjCBCR *allocate_callback() override {
+ return new RGWAWSHandleRemoteObjCBCR(sc, sync_pipe, key, instance, versioned_epoch);
+ }
+};
+
+class RGWAWSRemoveRemoteObjCBCR : public RGWCoroutine {
+ RGWDataSyncCtx *sc;
+ std::shared_ptr<AWSSyncConfig_Profile> target;
+ rgw_bucket_sync_pipe sync_pipe;
+ rgw_obj_key key;
+ ceph::real_time mtime;
+ AWSSyncInstanceEnv& instance;
+ int ret{0};
+public:
+ RGWAWSRemoveRemoteObjCBCR(RGWDataSyncCtx *_sc,
+ rgw_bucket_sync_pipe& _sync_pipe, rgw_obj_key& _key, const ceph::real_time& _mtime,
+ AWSSyncInstanceEnv& _instance) : RGWCoroutine(_sc->cct), sc(_sc),
+ sync_pipe(_sync_pipe), key(_key),
+ mtime(_mtime), instance(_instance) {}
+ int operate(const DoutPrefixProvider *dpp) override {
+ reenter(this) {
+ ldpp_dout(dpp, 0) << ": remove remote obj: z=" << sc->source_zone
+ << " b=" <<sync_pipe.info.source_bs.bucket << " k=" << key << " mtime=" << mtime << dendl;
+ yield {
+ instance.get_profile(sync_pipe.info.source_bs.bucket, &target);
+ string path = instance.conf.get_path(target, sync_pipe.dest_bucket_info, key);
+        ldpp_dout(dpp, 0) << "AWS: removing aws object at " << path << dendl;
+
+ call(new RGWDeleteRESTResourceCR(sc->cct, target->conn.get(),
+ sc->env->http_manager,
+ path, nullptr /* params */));
+ }
+ if (retcode < 0) {
+ return set_cr_error(retcode);
+ }
+ return set_cr_done();
+ }
+ return 0;
+ }
+
+};
+
+
+class RGWAWSDataSyncModule: public RGWDataSyncModule {
+ CephContext *cct;
+ AWSSyncInstanceEnv instance;
+public:
+ RGWAWSDataSyncModule(CephContext *_cct, AWSSyncConfig& _conf) :
+ cct(_cct),
+ instance(_conf) {
+ }
+
+ void init(RGWDataSyncCtx *sc, uint64_t instance_id) override {
+ instance.init(sc, instance_id);
+ }
+
+ ~RGWAWSDataSyncModule() {}
+
+ RGWCoroutine *sync_object(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key,
+ std::optional<uint64_t> versioned_epoch,
+ const rgw_zone_set_entry& source_trace_entry,
+ rgw_zone_set *zones_trace) override {
+ ldout(sc->cct, 0) << instance.id << ": sync_object: b=" << sync_pipe.info.source_bs.bucket << " k=" << key << " versioned_epoch=" << versioned_epoch.value_or(0) << dendl;
+ return new RGWAWSHandleRemoteObjCR(sc, sync_pipe, key, instance, versioned_epoch.value_or(0));
+ }
+ RGWCoroutine *remove_object(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key, real_time& mtime, bool versioned, uint64_t versioned_epoch,
+ rgw_zone_set *zones_trace) override {
+ ldout(sc->cct, 0) <<"rm_object: b=" << sync_pipe.info.source_bs.bucket << " k=" << key << " mtime=" << mtime << " versioned=" << versioned << " versioned_epoch=" << versioned_epoch << dendl;
+ return new RGWAWSRemoveRemoteObjCBCR(sc, sync_pipe, key, mtime, instance);
+ }
+ RGWCoroutine *create_delete_marker(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key, real_time& mtime,
+ rgw_bucket_entry_owner& owner, bool versioned, uint64_t versioned_epoch,
+ rgw_zone_set *zones_trace) override {
+ ldout(sc->cct, 0) <<"AWS Not implemented: create_delete_marker: b=" << sync_pipe.info.source_bs.bucket << " k=" << key << " mtime=" << mtime
+ << " versioned=" << versioned << " versioned_epoch=" << versioned_epoch << dendl;
+ return NULL;
+ }
+};
+
+class RGWAWSSyncModuleInstance : public RGWSyncModuleInstance {
+ RGWAWSDataSyncModule data_handler;
+public:
+ RGWAWSSyncModuleInstance(CephContext *cct, AWSSyncConfig& _conf) : data_handler(cct, _conf) {}
+ RGWDataSyncModule *get_data_handler() override {
+ return &data_handler;
+ }
+};
+
+int RGWAWSSyncModule::create_instance(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& config, RGWSyncModuleInstanceRef *instance){
+ AWSSyncConfig conf;
+
+ int r = conf.init(dpp, cct, config);
+ if (r < 0) {
+ return r;
+ }
+
+ instance->reset(new RGWAWSSyncModuleInstance(cct, conf));
+ return 0;
+}
diff --git a/src/rgw/driver/rados/rgw_sync_module_aws.h b/src/rgw/driver/rados/rgw_sync_module_aws.h
new file mode 100644
index 000000000..92532ff00
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_sync_module_aws.h
@@ -0,0 +1,108 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include "rgw_sync_module.h"
+
+struct rgw_sync_aws_multipart_part_info {
+ int part_num{0};
+ uint64_t ofs{0};
+ uint64_t size{0};
+ std::string etag;
+
+ void encode(bufferlist& bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(part_num, bl);
+ encode(ofs, bl);
+ encode(size, bl);
+ encode(etag, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::const_iterator& bl) {
+ DECODE_START(1, bl);
+ decode(part_num, bl);
+ decode(ofs, bl);
+ decode(size, bl);
+ decode(etag, bl);
+ DECODE_FINISH(bl);
+ }
+};
+WRITE_CLASS_ENCODER(rgw_sync_aws_multipart_part_info)
+
+struct rgw_sync_aws_src_obj_properties {
+ ceph::real_time mtime;
+ std::string etag;
+ uint32_t zone_short_id{0};
+ uint64_t pg_ver{0};
+ uint64_t versioned_epoch{0};
+
+ void encode(bufferlist& bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(mtime, bl);
+ encode(etag, bl);
+ encode(zone_short_id, bl);
+ encode(pg_ver, bl);
+ encode(versioned_epoch, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::const_iterator& bl) {
+ DECODE_START(1, bl);
+ decode(mtime, bl);
+ decode(etag, bl);
+ decode(zone_short_id, bl);
+ decode(pg_ver, bl);
+ decode(versioned_epoch, bl);
+ DECODE_FINISH(bl);
+ }
+};
+WRITE_CLASS_ENCODER(rgw_sync_aws_src_obj_properties)
+
+struct rgw_sync_aws_multipart_upload_info {
+ std::string upload_id;
+  uint64_t obj_size{0};
+ rgw_sync_aws_src_obj_properties src_properties;
+ uint32_t part_size{0};
+ uint32_t num_parts{0};
+
+ int cur_part{0};
+ uint64_t cur_ofs{0};
+
+ std::map<int, rgw_sync_aws_multipart_part_info> parts;
+
+ void encode(bufferlist& bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(upload_id, bl);
+ encode(obj_size, bl);
+ encode(src_properties, bl);
+ encode(part_size, bl);
+ encode(num_parts, bl);
+ encode(cur_part, bl);
+ encode(cur_ofs, bl);
+ encode(parts, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::const_iterator& bl) {
+ DECODE_START(1, bl);
+ decode(upload_id, bl);
+ decode(obj_size, bl);
+ decode(src_properties, bl);
+ decode(part_size, bl);
+ decode(num_parts, bl);
+ decode(cur_part, bl);
+ decode(cur_ofs, bl);
+ decode(parts, bl);
+ DECODE_FINISH(bl);
+ }
+};
+WRITE_CLASS_ENCODER(rgw_sync_aws_multipart_upload_info)
+
+class RGWAWSSyncModule : public RGWSyncModule {
+ public:
+ RGWAWSSyncModule() {}
+ bool supports_data_export() override { return false;}
+ int create_instance(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& config, RGWSyncModuleInstanceRef *instance) override;
+};
diff --git a/src/rgw/driver/rados/rgw_sync_module_es.cc b/src/rgw/driver/rados/rgw_sync_module_es.cc
new file mode 100644
index 000000000..4e8eb6201
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_sync_module_es.cc
@@ -0,0 +1,962 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "rgw_b64.h"
+#include "rgw_common.h"
+#include "rgw_coroutine.h"
+#include "rgw_sync_module.h"
+#include "rgw_data_sync.h"
+#include "rgw_sync_module_es.h"
+#include "rgw_sync_module_es_rest.h"
+#include "rgw_rest_conn.h"
+#include "rgw_cr_rest.h"
+#include "rgw_op.h"
+#include "rgw_es_query.h"
+#include "rgw_zone.h"
+
+#include "services/svc_zone.h"
+
+#include "include/str_list.h"
+
+#include <boost/asio/yield.hpp>
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+/*
+ * allowlist utility. Config string is a list of entries, where an entry is either an item,
+ * a prefix, or a suffix. An item would be the name of the entity that we'd look up,
+ * a prefix would be a string ending with an asterisk, a suffix would be a string starting
+ * with an asterisk. For example:
+ *
+ * bucket1, bucket2, foo*, *bar
+ */
+class ItemList {
+ bool approve_all{false};
+
+ set<string> entries;
+ set<string> prefixes;
+ set<string> suffixes;
+
+ void parse(const string& str) {
+ list<string> l;
+
+ get_str_list(str, ",", l);
+
+ for (auto& entry : l) {
+ entry = rgw_trim_whitespace(entry);
+ if (entry.empty()) {
+ continue;
+ }
+
+ if (entry == "*") {
+ approve_all = true;
+ return;
+ }
+
+ if (entry[0] == '*') {
+ suffixes.insert(entry.substr(1));
+ continue;
+ }
+
+ if (entry.back() == '*') {
+ prefixes.insert(entry.substr(0, entry.size() - 1));
+ continue;
+ }
+
+ entries.insert(entry);
+ }
+ }
+
+public:
+ ItemList() {}
+ void init(const string& str, bool def_val) {
+ if (str.empty()) {
+ approve_all = def_val;
+ } else {
+ parse(str);
+ }
+ }
+
+ bool exists(const string& entry) {
+ if (approve_all) {
+ return true;
+ }
+
+ if (entries.find(entry) != entries.end()) {
+ return true;
+ }
+
+ auto i = prefixes.upper_bound(entry);
+ if (i != prefixes.begin()) {
+ --i;
+ if (boost::algorithm::starts_with(entry, *i)) {
+ return true;
+ }
+ }
+
+ for (i = suffixes.begin(); i != suffixes.end(); ++i) {
+ if (boost::algorithm::ends_with(entry, *i)) {
+ return true;
+ }
+ }
+
+ return false;
+ }
+};
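+
+/* usage sketch (illustrative only): with the rules described above,
+ *
+ *   ItemList filter;
+ *   filter.init("bucket1, foo*, *bar", false);
+ *   filter.exists("bucket1");  // true: exact entry
+ *   filter.exists("foobaz");   // true: matches prefix "foo"
+ *   filter.exists("mybar");    // true: matches suffix "bar"
+ *   filter.exists("other");    // false
+ */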
+
+#define ES_NUM_SHARDS_MIN 5
+
+#define ES_NUM_SHARDS_DEFAULT 16
+#define ES_NUM_REPLICAS_DEFAULT 1
+
+using ESVersion = std::pair<int,int>;
+static constexpr ESVersion ES_V5{5,0};
+static constexpr ESVersion ES_V7{7,0};
+
+struct ESInfo {
+ std::string name;
+ std::string cluster_name;
+ std::string cluster_uuid;
+ ESVersion version;
+
+ void decode_json(JSONObj *obj);
+
+ std::string get_version_str(){
+ return std::to_string(version.first) + "." + std::to_string(version.second);
+ }
+};
+
+// simple wrapper structure to wrap the es version nested type
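+// (the ES root endpoint returns e.g. {"version": {"number": "7.10.2", ...}};
+// parse_version() keeps only the major.minor pair, {7,10} in this example)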
+struct es_version_decoder {
+ ESVersion version;
+
+ int parse_version(const std::string& s) {
+ int major, minor;
+ int ret = sscanf(s.c_str(), "%d.%d", &major, &minor);
+ if (ret < 0) {
+ return ret;
+ }
+ version = std::make_pair(major,minor);
+ return 0;
+ }
+
+ void decode_json(JSONObj *obj) {
+ std::string s;
+ JSONDecoder::decode_json("number",s,obj);
+ if (parse_version(s) < 0)
+ throw JSONDecoder::err("Failed to parse ElasticVersion");
+ }
+};
+
+
+void ESInfo::decode_json(JSONObj *obj)
+{
+ JSONDecoder::decode_json("name", name, obj);
+ JSONDecoder::decode_json("cluster_name", cluster_name, obj);
+ JSONDecoder::decode_json("cluster_uuid", cluster_uuid, obj);
+ es_version_decoder esv;
+ JSONDecoder::decode_json("version", esv, obj);
+ version = std::move(esv.version);
+}
+
+struct ElasticConfig {
+ uint64_t sync_instance{0};
+ string id;
+ string index_path;
+ std::unique_ptr<RGWRESTConn> conn;
+ bool explicit_custom_meta{true};
+ string override_index_path;
+ ItemList index_buckets;
+ ItemList allow_owners;
+ uint32_t num_shards{0};
+ uint32_t num_replicas{0};
+ std::map <string,string> default_headers = {{ "Content-Type", "application/json" }};
+ ESInfo es_info;
+
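+  /*
+   * illustrative tier config (keys match the lookups in init() below, values
+   * are made up):
+   *   { "endpoint": "http://es-host:9200",
+   *     "explicit_custom_meta": "true",
+   *     "index_buckets_list": "bucket1, foo*",
+   *     "approved_owners_list": "*",
+   *     "num_shards": 16, "num_replicas": 1,
+   *     "username": "rgw", "password": "secret" }
+   */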
+ void init(CephContext *cct, const JSONFormattable& config) {
+ string elastic_endpoint = config["endpoint"];
+ id = string("elastic:") + elastic_endpoint;
+ conn.reset(new RGWRESTConn(cct, (rgw::sal::Driver*)nullptr, id, { elastic_endpoint }, nullopt /* region */ ));
+ explicit_custom_meta = config["explicit_custom_meta"](true);
+ index_buckets.init(config["index_buckets_list"], true); /* approve all buckets by default */
+ allow_owners.init(config["approved_owners_list"], true); /* approve all bucket owners by default */
+ override_index_path = config["override_index_path"];
+ num_shards = config["num_shards"](ES_NUM_SHARDS_DEFAULT);
+ if (num_shards < ES_NUM_SHARDS_MIN) {
+ num_shards = ES_NUM_SHARDS_MIN;
+ }
+ num_replicas = config["num_replicas"](ES_NUM_REPLICAS_DEFAULT);
+ if (string user = config["username"], pw = config["password"];
+ !user.empty() && !pw.empty()) {
+ auto auth_string = user + ":" + pw;
+ default_headers.emplace("AUTHORIZATION", "Basic " + rgw::to_base64(auth_string));
+ }
+
+ }
+
+ void init_instance(const RGWRealm& realm, uint64_t instance_id) {
+ sync_instance = instance_id;
+
+ if (!override_index_path.empty()) {
+ index_path = override_index_path;
+ return;
+ }
+
+ char buf[32];
+ snprintf(buf, sizeof(buf), "-%08x", (uint32_t)(sync_instance & 0xFFFFFFFF));
+
+ index_path = "/rgw-" + realm.get_name() + buf;
+ }
+
+ string get_index_path() {
+ return index_path;
+ }
+
+ map<string, string>& get_request_headers() {
+ return default_headers;
+ }
+
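+  /* e.g. "/rgw-myrealm-0a1b2c3d/_doc/<bucket_id>:myobj:null" on ES >= 7, or
+   * ".../object/..." on older versions (illustrative values) */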
+ string get_obj_path(const RGWBucketInfo& bucket_info, const rgw_obj_key& key) {
+ if (es_info.version >= ES_V7) {
+      return index_path + "/_doc/" + url_encode(bucket_info.bucket.bucket_id + ":" + key.name + ":" + (key.instance.empty() ? "null" : key.instance));
+ } else {
+ return index_path + "/object/" + url_encode(bucket_info.bucket.bucket_id + ":" + key.name + ":" + (key.instance.empty() ? "null" : key.instance));
+ }
+ }
+
+ bool should_handle_operation(RGWBucketInfo& bucket_info) {
+ return index_buckets.exists(bucket_info.bucket.name) &&
+ allow_owners.exists(bucket_info.owner.to_str());
+ }
+};
+
+using ElasticConfigRef = std::shared_ptr<ElasticConfig>;
+
+static const char *es_type_to_str(const ESType& t) {
+ switch (t) {
+ case ESType::String: return "string";
+ case ESType::Text: return "text";
+ case ESType::Keyword: return "keyword";
+ case ESType::Long: return "long";
+ case ESType::Integer: return "integer";
+ case ESType::Short: return "short";
+ case ESType::Byte: return "byte";
+ case ESType::Double: return "double";
+ case ESType::Float: return "float";
+ case ESType::Half_Float: return "half_float";
+ case ESType::Scaled_Float: return "scaled_float";
+ case ESType::Date: return "date";
+ case ESType::Boolean: return "boolean";
+ case ESType::Integer_Range: return "integer_range";
+  case ESType::Float_Range: return "float_range";
+  case ESType::Long_Range: return "long_range";
+  case ESType::Double_Range: return "double_range";
+  case ESType::Date_Range: return "date_range";
+ case ESType::Geo_Point: return "geo_point";
+ case ESType::Ip: return "ip";
+ default:
+ return "<unknown>";
+ }
+}
+
+struct es_type_v2 {
+ ESType estype;
+ const char *format{nullptr};
+ std::optional<bool> analyzed;
+
+ es_type_v2(ESType et) : estype(et) {}
+
+ void dump(Formatter *f) const {
+ const char *type_str = es_type_to_str(estype);
+ encode_json("type", type_str, f);
+ if (format) {
+ encode_json("format", format, f);
+ }
+
+ auto is_analyzed = analyzed;
+
+ if (estype == ESType::String &&
+ !is_analyzed) {
+ is_analyzed = false;
+ }
+
+ if (is_analyzed) {
+ encode_json("index", (is_analyzed.value() ? "analyzed" : "not_analyzed"), f);
+ }
+ }
+};
+
+struct es_type_v5 {
+ ESType estype;
+ const char *format{nullptr};
+ std::optional<bool> analyzed;
+ std::optional<bool> index;
+
+ es_type_v5(ESType et) : estype(et) {}
+
+ void dump(Formatter *f) const {
+ ESType new_estype;
+ if (estype != ESType::String) {
+ new_estype = estype;
+ } else {
+ bool is_analyzed = analyzed.value_or(false);
+ new_estype = (is_analyzed ? ESType::Text : ESType::Keyword);
+      /* We intentionally do not set index=true here: it is the default, and
+       * dumping a boolean value might be a problem when backporting this,
+       * because the value might get quoted.
+       */
+ }
+
+ const char *type_str = es_type_to_str(new_estype);
+ encode_json("type", type_str, f);
+ if (format) {
+ encode_json("format", format, f);
+ }
+ if (index) {
+ encode_json("index", index.value(), f);
+ }
+ }
+};
+
+template <class T>
+struct es_type : public T {
+ es_type(T t) : T(t) {}
+ es_type& set_format(const char *f) {
+ T::format = f;
+ return *this;
+ }
+
+ es_type& set_analyzed(bool a) {
+ T::analyzed = a;
+ return *this;
+ }
+};
+
+template <class T>
+struct es_index_mappings {
+ ESVersion es_version;
+ ESType string_type {ESType::String};
+
+ es_index_mappings(ESVersion esv):es_version(esv) {
+ }
+
+ es_type<T> est(ESType t) const {
+ return es_type<T>(t);
+ }
+
+ void dump_custom(const char *section, ESType type, const char *format, Formatter *f) const {
+ f->open_object_section(section);
+ ::encode_json("type", "nested", f);
+ f->open_object_section("properties");
+ encode_json("name", est(string_type), f);
+ encode_json("value", est(type).set_format(format), f);
+    f->close_section(); // properties
+    f->close_section(); // custom-* section
+ }
+
+ void dump(Formatter *f) const {
+ if (es_version <= ES_V7)
+ f->open_object_section("object");
+ f->open_object_section("properties");
+ encode_json("bucket", est(string_type), f);
+ encode_json("name", est(string_type), f);
+ encode_json("instance", est(string_type), f);
+ encode_json("versioned_epoch", est(ESType::Long), f);
+ f->open_object_section("meta");
+ f->open_object_section("properties");
+ encode_json("cache_control", est(string_type), f);
+ encode_json("content_disposition", est(string_type), f);
+ encode_json("content_encoding", est(string_type), f);
+ encode_json("content_language", est(string_type), f);
+ encode_json("content_type", est(string_type), f);
+ encode_json("storage_class", est(string_type), f);
+ encode_json("etag", est(string_type), f);
+ encode_json("expires", est(string_type), f);
+ encode_json("mtime", est(ESType::Date)
+ .set_format("strict_date_optional_time||epoch_millis"), f);
+ encode_json("size", est(ESType::Long), f);
+ dump_custom("custom-string", string_type, nullptr, f);
+ dump_custom("custom-int", ESType::Long, nullptr, f);
+ dump_custom("custom-date", ESType::Date, "strict_date_optional_time||epoch_millis", f);
+ f->close_section(); // properties
+ f->close_section(); // meta
+ f->close_section(); // properties
+
+ if (es_version <= ES_V7)
+ f->close_section(); // object
+ }
+};
+
+struct es_index_settings {
+ uint32_t num_replicas;
+ uint32_t num_shards;
+
+ es_index_settings(uint32_t _replicas, uint32_t _shards) : num_replicas(_replicas), num_shards(_shards) {}
+
+ void dump(Formatter *f) const {
+ encode_json("number_of_replicas", num_replicas, f);
+ encode_json("number_of_shards", num_shards, f);
+ }
+};
+
+struct es_index_config_base {
+ virtual ~es_index_config_base() {}
+ virtual void dump(Formatter *f) const = 0;
+};
+
+template <class T>
+struct es_index_config : public es_index_config_base {
+ es_index_settings settings;
+ es_index_mappings<T> mappings;
+
+ es_index_config(es_index_settings& _s, ESVersion esv) : settings(_s), mappings(esv) {
+ }
+
+ void dump(Formatter *f) const {
+ encode_json("settings", settings, f);
+ encode_json("mappings", mappings, f);
+ }
+};
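+
+/* the resulting index-creation request body has the shape (abbreviated,
+ * illustrative, for newer ES versions; older versions wrap the properties in
+ * an extra type section):
+ *   { "settings": { "number_of_replicas": 1, "number_of_shards": 16 },
+ *     "mappings": { "properties": { "bucket": { "type": "keyword" }, ... } } }
+ */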
+
+static bool is_sys_attr(const std::string& attr_name){
+ static constexpr std::initializer_list<const char*> rgw_sys_attrs =
+ {RGW_ATTR_PG_VER,
+ RGW_ATTR_SOURCE_ZONE,
+ RGW_ATTR_ID_TAG,
+ RGW_ATTR_TEMPURL_KEY1,
+ RGW_ATTR_TEMPURL_KEY2,
+ RGW_ATTR_UNIX1,
+ RGW_ATTR_UNIX_KEY1
+ };
+
+ return std::find(rgw_sys_attrs.begin(), rgw_sys_attrs.end(), attr_name) != rgw_sys_attrs.end();
+}
+
+static size_t attr_len(const bufferlist& val)
+{
+ size_t len = val.length();
+ if (len && val[len - 1] == '\0') {
+ --len;
+ }
+
+ return len;
+}
+
+struct es_obj_metadata {
+ const DoutPrefixProvider *dpp;
+ CephContext *cct;
+ ElasticConfigRef es_conf;
+ RGWBucketInfo bucket_info;
+ rgw_obj_key key;
+ ceph::real_time mtime;
+ uint64_t size;
+ map<string, bufferlist> attrs;
+ uint64_t versioned_epoch;
+
+ es_obj_metadata(CephContext *_cct, ElasticConfigRef _es_conf, const RGWBucketInfo& _bucket_info,
+ const rgw_obj_key& _key, ceph::real_time& _mtime, uint64_t _size,
+ map<string, bufferlist>& _attrs, uint64_t _versioned_epoch) : cct(_cct), es_conf(_es_conf), bucket_info(_bucket_info), key(_key),
+ mtime(_mtime), size(_size), attrs(std::move(_attrs)), versioned_epoch(_versioned_epoch) {}
+
+ void dump(Formatter *f) const {
+ map<string, string> out_attrs;
+ map<string, string> custom_meta;
+ RGWAccessControlPolicy policy;
+ set<string> permissions;
+ RGWObjTags obj_tags;
+
+ for (auto i : attrs) {
+ const string& attr_name = i.first;
+ bufferlist& val = i.second;
+
+ if (!boost::algorithm::starts_with(attr_name, RGW_ATTR_PREFIX)) {
+ continue;
+ }
+
+ if (boost::algorithm::starts_with(attr_name, RGW_ATTR_META_PREFIX)) {
+ custom_meta.emplace(attr_name.substr(sizeof(RGW_ATTR_META_PREFIX) - 1),
+ string(val.c_str(), attr_len(val)));
+ continue;
+ }
+
+ if (boost::algorithm::starts_with(attr_name, RGW_ATTR_CRYPT_PREFIX)) {
+ continue;
+ }
+
+ if (boost::algorithm::starts_with(attr_name, RGW_ATTR_OLH_PREFIX)) {
+ // skip versioned object olh info
+ continue;
+ }
+
+ if (attr_name == RGW_ATTR_ACL) {
+ try {
+ auto i = val.cbegin();
+ decode(policy, i);
+ } catch (buffer::error& err) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to decode acl for " << bucket_info.bucket << "/" << key << dendl;
+ continue;
+ }
+
+ const RGWAccessControlList& acl = policy.get_acl();
+
+ permissions.insert(policy.get_owner().get_id().to_str());
+ for (auto acliter : acl.get_grant_map()) {
+ const ACLGrant& grant = acliter.second;
+ if (grant.get_type().get_type() == ACL_TYPE_CANON_USER &&
+ ((uint32_t)grant.get_permission().get_permissions() & RGW_PERM_READ) != 0) {
+ rgw_user user;
+ if (grant.get_id(user)) {
+ permissions.insert(user.to_str());
+ }
+ }
+ }
+ } else if (attr_name == RGW_ATTR_TAGS) {
+ try {
+ auto tags_bl = val.cbegin();
+ decode(obj_tags, tags_bl);
+ } catch (buffer::error& err) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to decode obj tags for "
+ << bucket_info.bucket << "/" << key << dendl;
+ continue;
+ }
+ } else if (attr_name == RGW_ATTR_COMPRESSION) {
+ RGWCompressionInfo cs_info;
+ try {
+ auto vals_bl = val.cbegin();
+ decode(cs_info, vals_bl);
+ } catch (buffer::error& err) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to decode compression attr for "
+ << bucket_info.bucket << "/" << key << dendl;
+ continue;
+ }
+ out_attrs.emplace("compression",std::move(cs_info.compression_type));
+ } else {
+ if (!is_sys_attr(attr_name)) {
+ out_attrs.emplace(attr_name.substr(sizeof(RGW_ATTR_PREFIX) - 1),
+ std::string(val.c_str(), attr_len(val)));
+ }
+ }
+ }
+ ::encode_json("bucket", bucket_info.bucket.name, f);
+ ::encode_json("name", key.name, f);
+ string instance = key.instance;
+ if (instance.empty())
+ instance = "null";
+ ::encode_json("instance", instance, f);
+ ::encode_json("versioned_epoch", versioned_epoch, f);
+ ::encode_json("owner", policy.get_owner(), f);
+ ::encode_json("permissions", permissions, f);
+ f->open_object_section("meta");
+ ::encode_json("size", size, f);
+
+ string mtime_str;
+ rgw_to_iso8601(mtime, &mtime_str);
+ ::encode_json("mtime", mtime_str, f);
+ for (auto i : out_attrs) {
+ ::encode_json(i.first.c_str(), i.second, f);
+ }
+ map<string, string> custom_str;
+ map<string, string> custom_int;
+ map<string, string> custom_date;
+
+ for (auto i : custom_meta) {
+ auto config = bucket_info.mdsearch_config.find(i.first);
+ if (config == bucket_info.mdsearch_config.end()) {
+ if (!es_conf->explicit_custom_meta) {
+ /* default custom meta is of type string */
+ custom_str[i.first] = i.second;
+ } else {
+ ldpp_dout(dpp, 20) << "custom meta entry key=" << i.first << " not found in bucket mdsearch config: " << bucket_info.mdsearch_config << dendl;
+ }
+ continue;
+ }
+ switch (config->second) {
+ case ESEntityTypeMap::ES_ENTITY_DATE:
+ custom_date[i.first] = i.second;
+ break;
+ case ESEntityTypeMap::ES_ENTITY_INT:
+ custom_int[i.first] = i.second;
+ break;
+ default:
+ custom_str[i.first] = i.second;
+ }
+ }
+
+ if (!custom_str.empty()) {
+ f->open_array_section("custom-string");
+ for (auto i : custom_str) {
+ f->open_object_section("entity");
+ ::encode_json("name", i.first.c_str(), f);
+ ::encode_json("value", i.second, f);
+ f->close_section();
+ }
+ f->close_section();
+ }
+ if (!custom_int.empty()) {
+ f->open_array_section("custom-int");
+ for (auto i : custom_int) {
+ f->open_object_section("entity");
+ ::encode_json("name", i.first.c_str(), f);
+ ::encode_json("value", i.second, f);
+ f->close_section();
+ }
+ f->close_section();
+ }
+ if (!custom_date.empty()) {
+ f->open_array_section("custom-date");
+ for (auto i : custom_date) {
+ /*
+         * try to explicitly parse the date field, otherwise elasticsearch
+         * could reject the whole doc, which would end up failing the sync
+ */
+ real_time t;
+ int r = parse_time(i.second.c_str(), &t);
+ if (r < 0) {
+ ldpp_dout(dpp, 20) << __func__ << "(): failed to parse time (" << i.second << "), skipping encoding of custom date attribute" << dendl;
+ continue;
+ }
+
+ string time_str;
+ rgw_to_iso8601(t, &time_str);
+
+ f->open_object_section("entity");
+ ::encode_json("name", i.first.c_str(), f);
+ ::encode_json("value", time_str.c_str(), f);
+ f->close_section();
+ }
+ f->close_section();
+ }
+ f->close_section(); // meta
+ const auto& m = obj_tags.get_tags();
+ if (m.size() > 0){
+ f->open_array_section("tagging");
+ for (const auto &it : m) {
+ f->open_object_section("tag");
+ ::encode_json("key", it.first, f);
+ ::encode_json("value",it.second, f);
+ f->close_section();
+ }
+ f->close_section(); // tagging
+ }
+ }
+};
+
+class RGWElasticGetESInfoCBCR : public RGWCoroutine {
+public:
+ RGWElasticGetESInfoCBCR(RGWDataSyncCtx *_sc,
+ ElasticConfigRef _conf) : RGWCoroutine(_sc->cct),
+ sc(_sc), sync_env(_sc->env),
+ conf(_conf) {}
+ int operate(const DoutPrefixProvider *dpp) override {
+ reenter(this) {
+ ldpp_dout(dpp, 5) << conf->id << ": get elasticsearch info for zone: " << sc->source_zone << dendl;
+ yield call(new RGWReadRESTResourceCR<ESInfo> (sync_env->cct,
+ conf->conn.get(),
+ sync_env->http_manager,
+ "/", nullptr /*params*/,
+ &(conf->default_headers),
+ &(conf->es_info)));
+ if (retcode < 0) {
+        ldpp_dout(dpp, 5) << conf->id << ": failed to get elasticsearch info: " << retcode << dendl;
+ return set_cr_error(retcode);
+ }
+
+ ldpp_dout(dpp, 5) << conf->id << ": got elastic version=" << conf->es_info.get_version_str() << dendl;
+ return set_cr_done();
+ }
+ return 0;
+ }
+private:
+ RGWDataSyncCtx *sc;
+ RGWDataSyncEnv *sync_env;
+ ElasticConfigRef conf;
+};
+
+class RGWElasticPutIndexCBCR : public RGWCoroutine {
+public:
+ RGWElasticPutIndexCBCR(RGWDataSyncCtx *_sc,
+ ElasticConfigRef _conf) : RGWCoroutine(_sc->cct),
+ sc(_sc), sync_env(_sc->env),
+ conf(_conf) {}
+ int operate(const DoutPrefixProvider *dpp) override {
+ reenter(this) {
+ ldpp_dout(dpp, 5) << conf->id << ": put elasticsearch index for zone: " << sc->source_zone << dendl;
+
+ yield {
+ string path = conf->get_index_path();
+ es_index_settings settings(conf->num_replicas, conf->num_shards);
+ std::unique_ptr<es_index_config_base> index_conf;
+
+ if (conf->es_info.version >= ES_V5) {
+ ldpp_dout(dpp, 0) << "elasticsearch: index mapping: version >= 5" << dendl;
+ index_conf.reset(new es_index_config<es_type_v5>(settings, conf->es_info.version));
+ } else {
+ ldpp_dout(dpp, 0) << "elasticsearch: index mapping: version < 5" << dendl;
+ index_conf.reset(new es_index_config<es_type_v2>(settings, conf->es_info.version));
+ }
+ call(new RGWPutRESTResourceCR<es_index_config_base, int, _err_response> (sc->cct,
+ conf->conn.get(),
+ sync_env->http_manager,
+ path, nullptr /*params*/,
+ &(conf->default_headers),
+ *index_conf, nullptr, &err_response));
+ }
+ if (retcode < 0) {
+
+ if (err_response.error.type != "index_already_exists_exception" &&
+ err_response.error.type != "resource_already_exists_exception") {
+ ldpp_dout(dpp, 0) << "elasticsearch: failed to initialize index: response.type=" << err_response.error.type << " response.reason=" << err_response.error.reason << dendl;
+ return set_cr_error(retcode);
+ }
+
+ ldpp_dout(dpp, 0) << "elasticsearch: index already exists, assuming external initialization" << dendl;
+ }
+ return set_cr_done();
+ }
+ return 0;
+ }
+
+private:
+ RGWDataSyncCtx *sc;
+ RGWDataSyncEnv *sync_env;
+ ElasticConfigRef conf;
+
+ struct _err_response {
+ struct err_reason {
+ vector<err_reason> root_cause;
+ string type;
+ string reason;
+ string index;
+
+ void decode_json(JSONObj *obj) {
+ JSONDecoder::decode_json("root_cause", root_cause, obj);
+ JSONDecoder::decode_json("type", type, obj);
+ JSONDecoder::decode_json("reason", reason, obj);
+ JSONDecoder::decode_json("index", index, obj);
+ }
+ } error;
+
+ void decode_json(JSONObj *obj) {
+ JSONDecoder::decode_json("error", error, obj);
+ }
+ } err_response;
+};
+
+class RGWElasticInitConfigCBCR : public RGWCoroutine {
+ RGWDataSyncCtx *sc;
+ RGWDataSyncEnv *sync_env;
+ ElasticConfigRef conf;
+
+public:
+ RGWElasticInitConfigCBCR(RGWDataSyncCtx *_sc,
+ ElasticConfigRef _conf) : RGWCoroutine(_sc->cct),
+ sc(_sc), sync_env(_sc->env),
+ conf(_conf) {}
+ int operate(const DoutPrefixProvider *dpp) override {
+ reenter(this) {
+
+ yield call(new RGWElasticGetESInfoCBCR(sc, conf));
+
+ if (retcode < 0) {
+ return set_cr_error(retcode);
+ }
+
+ yield call(new RGWElasticPutIndexCBCR(sc, conf));
+ if (retcode < 0) {
+ return set_cr_error(retcode);
+ }
+ return set_cr_done();
+ }
+ return 0;
+ }
+
+};
+
+class RGWElasticHandleRemoteObjCBCR : public RGWStatRemoteObjCBCR {
+ rgw_bucket_sync_pipe sync_pipe;
+ ElasticConfigRef conf;
+ uint64_t versioned_epoch;
+public:
+ RGWElasticHandleRemoteObjCBCR(RGWDataSyncCtx *_sc,
+ rgw_bucket_sync_pipe& _sync_pipe, rgw_obj_key& _key,
+ ElasticConfigRef _conf, uint64_t _versioned_epoch) : RGWStatRemoteObjCBCR(_sc, _sync_pipe.info.source_bs.bucket, _key),
+ sync_pipe(_sync_pipe), conf(_conf),
+ versioned_epoch(_versioned_epoch) {}
+ int operate(const DoutPrefixProvider *dpp) override {
+ reenter(this) {
+ ldpp_dout(dpp, 10) << ": stat of remote obj: z=" << sc->source_zone
+ << " b=" << sync_pipe.info.source_bs.bucket << " k=" << key
+ << " size=" << size << " mtime=" << mtime << dendl;
+
+ yield {
+ string path = conf->get_obj_path(sync_pipe.dest_bucket_info, key);
+ es_obj_metadata doc(sync_env->cct, conf, sync_pipe.dest_bucket_info, key, mtime, size, attrs, versioned_epoch);
+
+ call(new RGWPutRESTResourceCR<es_obj_metadata, int>(sync_env->cct, conf->conn.get(),
+ sync_env->http_manager,
+ path, nullptr /* params */,
+ &(conf->default_headers),
+ doc, nullptr /* result */));
+
+ }
+ if (retcode < 0) {
+ return set_cr_error(retcode);
+ }
+ return set_cr_done();
+ }
+ return 0;
+ }
+};
+
+class RGWElasticHandleRemoteObjCR : public RGWCallStatRemoteObjCR {
+ rgw_bucket_sync_pipe sync_pipe;
+ ElasticConfigRef conf;
+ uint64_t versioned_epoch;
+public:
+ RGWElasticHandleRemoteObjCR(RGWDataSyncCtx *_sc,
+ rgw_bucket_sync_pipe& _sync_pipe, rgw_obj_key& _key,
+ ElasticConfigRef _conf, uint64_t _versioned_epoch) : RGWCallStatRemoteObjCR(_sc, _sync_pipe.info.source_bs.bucket, _key),
+ sync_pipe(_sync_pipe),
+ conf(_conf), versioned_epoch(_versioned_epoch) {
+ }
+
+ ~RGWElasticHandleRemoteObjCR() override {}
+
+ RGWStatRemoteObjCBCR *allocate_callback() override {
+ return new RGWElasticHandleRemoteObjCBCR(sc, sync_pipe, key, conf, versioned_epoch);
+ }
+};
+
+class RGWElasticRemoveRemoteObjCBCR : public RGWCoroutine {
+ RGWDataSyncCtx *sc;
+ RGWDataSyncEnv *sync_env;
+ rgw_bucket_sync_pipe sync_pipe;
+ rgw_obj_key key;
+ ceph::real_time mtime;
+ ElasticConfigRef conf;
+public:
+ RGWElasticRemoveRemoteObjCBCR(RGWDataSyncCtx *_sc,
+ rgw_bucket_sync_pipe& _sync_pipe, rgw_obj_key& _key, const ceph::real_time& _mtime,
+ ElasticConfigRef _conf) : RGWCoroutine(_sc->cct), sc(_sc), sync_env(_sc->env),
+ sync_pipe(_sync_pipe), key(_key),
+ mtime(_mtime), conf(_conf) {}
+ int operate(const DoutPrefixProvider *dpp) override {
+ reenter(this) {
+ ldpp_dout(dpp, 10) << ": remove remote obj: z=" << sc->source_zone
+ << " b=" << sync_pipe.info.source_bs.bucket << " k=" << key << " mtime=" << mtime << dendl;
+ yield {
+ string path = conf->get_obj_path(sync_pipe.dest_bucket_info, key);
+
+ call(new RGWDeleteRESTResourceCR(sync_env->cct, conf->conn.get(),
+ sync_env->http_manager,
+ path, nullptr /* params */));
+ }
+ if (retcode < 0) {
+ return set_cr_error(retcode);
+ }
+ return set_cr_done();
+ }
+ return 0;
+ }
+
+};
+
+class RGWElasticDataSyncModule : public RGWDataSyncModule {
+ ElasticConfigRef conf;
+public:
+ RGWElasticDataSyncModule(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& config) : conf(std::make_shared<ElasticConfig>()) {
+ conf->init(cct, config);
+ }
+ ~RGWElasticDataSyncModule() override {}
+
+ void init(RGWDataSyncCtx *sc, uint64_t instance_id) override {
+ conf->init_instance(sc->env->svc->zone->get_realm(), instance_id);
+ }
+
+ RGWCoroutine *init_sync(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc) override {
+ ldpp_dout(dpp, 5) << conf->id << ": init" << dendl;
+ return new RGWElasticInitConfigCBCR(sc, conf);
+ }
+
+ RGWCoroutine *start_sync(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc) override {
+ ldpp_dout(dpp, 5) << conf->id << ": start_sync" << dendl;
+ // try to get elastic search version
+ return new RGWElasticGetESInfoCBCR(sc, conf);
+ }
+
+ RGWCoroutine *sync_object(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key, std::optional<uint64_t> versioned_epoch, const rgw_zone_set_entry& source_trace_entry, rgw_zone_set *zones_trace) override {
+ ldpp_dout(dpp, 10) << conf->id << ": sync_object: b=" << sync_pipe.info.source_bs.bucket << " k=" << key << " versioned_epoch=" << versioned_epoch.value_or(0) << dendl;
+ if (!conf->should_handle_operation(sync_pipe.dest_bucket_info)) {
+ ldpp_dout(dpp, 10) << conf->id << ": skipping operation (bucket not approved)" << dendl;
+ return nullptr;
+ }
+ return new RGWElasticHandleRemoteObjCR(sc, sync_pipe, key, conf, versioned_epoch.value_or(0));
+ }
+ RGWCoroutine *remove_object(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key, real_time& mtime, bool versioned, uint64_t versioned_epoch, rgw_zone_set *zones_trace) override {
+ /* versioned and versioned epoch params are useless in the elasticsearch backend case */
+ ldpp_dout(dpp, 10) << conf->id << ": rm_object: b=" << sync_pipe.info.source_bs.bucket << " k=" << key << " mtime=" << mtime << " versioned=" << versioned << " versioned_epoch=" << versioned_epoch << dendl;
+ if (!conf->should_handle_operation(sync_pipe.dest_bucket_info)) {
+ ldpp_dout(dpp, 10) << conf->id << ": skipping operation (bucket not approved)" << dendl;
+ return nullptr;
+ }
+ return new RGWElasticRemoveRemoteObjCBCR(sc, sync_pipe, key, mtime, conf);
+ }
+ RGWCoroutine *create_delete_marker(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key, real_time& mtime,
+ rgw_bucket_entry_owner& owner, bool versioned, uint64_t versioned_epoch, rgw_zone_set *zones_trace) override {
+ ldpp_dout(dpp, 10) << conf->id << ": create_delete_marker: b=" << sync_pipe.info.source_bs.bucket << " k=" << key << " mtime=" << mtime
+ << " versioned=" << versioned << " versioned_epoch=" << versioned_epoch << dendl;
+ ldpp_dout(dpp, 10) << conf->id << ": skipping operation (not handled)" << dendl;
+ return NULL;
+ }
+ RGWRESTConn *get_rest_conn() {
+ return conf->conn.get();
+ }
+
+ string get_index_path() {
+ return conf->get_index_path();
+ }
+
+ map<string, string>& get_request_headers() {
+ return conf->get_request_headers();
+ }
+};
+
+RGWElasticSyncModuleInstance::RGWElasticSyncModuleInstance(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& config)
+{
+ data_handler = std::unique_ptr<RGWElasticDataSyncModule>(new RGWElasticDataSyncModule(dpp, cct, config));
+}
+
+RGWDataSyncModule *RGWElasticSyncModuleInstance::get_data_handler()
+{
+ return data_handler.get();
+}
+
+RGWRESTConn *RGWElasticSyncModuleInstance::get_rest_conn()
+{
+ return data_handler->get_rest_conn();
+}
+
+string RGWElasticSyncModuleInstance::get_index_path() {
+ return data_handler->get_index_path();
+}
+
+map<string, string>& RGWElasticSyncModuleInstance::get_request_headers() {
+ return data_handler->get_request_headers();
+}
+
+RGWRESTMgr *RGWElasticSyncModuleInstance::get_rest_filter(int dialect, RGWRESTMgr *orig) {
+ if (dialect != RGW_REST_S3) {
+ return orig;
+ }
+ delete orig;
+ return new RGWRESTMgr_MDSearch_S3();
+}
+
+int RGWElasticSyncModule::create_instance(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& config, RGWSyncModuleInstanceRef *instance) {
+ string endpoint = config["endpoint"];
+ instance->reset(new RGWElasticSyncModuleInstance(dpp, cct, config));
+ return 0;
+}
+
diff --git a/src/rgw/driver/rados/rgw_sync_module_es.h b/src/rgw/driver/rados/rgw_sync_module_es.h
new file mode 100644
index 000000000..c8c9fcc43
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_sync_module_es.h
@@ -0,0 +1,59 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include "rgw_sync_module.h"
+
+enum class ESType {
+ /* string datatypes */
+ String, /* Deprecated Since 5.X+ */
+ Text,
+ Keyword,
+
+ /* Numeric Types */
+ Long, Integer, Short, Byte, Double, Float, Half_Float, Scaled_Float,
+
+ /* Date Type */
+ Date,
+
+ /* Boolean */
+ Boolean,
+
+ /* Binary; Must Be Base64 Encoded */
+ Binary,
+
+ /* Range Types */
+ Integer_Range, Float_Range, Long_Range, Double_Range, Date_Range,
+
+ /* A Few Specialized Types */
+ Geo_Point,
+ Ip
+};
+
+
+class RGWElasticSyncModule : public RGWSyncModule {
+public:
+ RGWElasticSyncModule() {}
+ bool supports_data_export() override {
+ return false;
+ }
+ int create_instance(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& config, RGWSyncModuleInstanceRef *instance) override;
+};
+
+class RGWElasticDataSyncModule;
+class RGWRESTConn;
+
+class RGWElasticSyncModuleInstance : public RGWSyncModuleInstance {
+ std::unique_ptr<RGWElasticDataSyncModule> data_handler;
+public:
+ RGWElasticSyncModuleInstance(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& config);
+ RGWDataSyncModule *get_data_handler() override;
+ RGWRESTMgr *get_rest_filter(int dialect, RGWRESTMgr *orig) override;
+ RGWRESTConn *get_rest_conn();
+ std::string get_index_path();
+ std::map<std::string, std::string>& get_request_headers();
+ bool supports_user_writes() override {
+ return true;
+ }
+};
diff --git a/src/rgw/driver/rados/rgw_sync_module_es_rest.cc b/src/rgw/driver/rados/rgw_sync_module_es_rest.cc
new file mode 100644
index 000000000..db9d48adb
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_sync_module_es_rest.cc
@@ -0,0 +1,428 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "rgw_sync_module_es.h"
+#include "rgw_sync_module_es_rest.h"
+#include "rgw_es_query.h"
+#include "rgw_op.h"
+#include "rgw_rest.h"
+#include "rgw_rest_s3.h"
+#include "rgw_sal_rados.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+struct es_index_obj_response {
+ string bucket;
+ rgw_obj_key key;
+ uint64_t versioned_epoch{0};
+ ACLOwner owner;
+ set<string> read_permissions;
+
+ struct {
+ uint64_t size{0};
+ ceph::real_time mtime;
+ string etag;
+ string content_type;
+ string storage_class;
+ map<string, string> custom_str;
+ map<string, int64_t> custom_int;
+ map<string, string> custom_date;
+
+ template <class T>
+ struct _custom_entry {
+ string name;
+ T value;
+ void decode_json(JSONObj *obj) {
+ JSONDecoder::decode_json("name", name, obj);
+ JSONDecoder::decode_json("value", value, obj);
+ }
+ };
+
+ void decode_json(JSONObj *obj) {
+ JSONDecoder::decode_json("size", size, obj);
+ string mtime_str;
+ JSONDecoder::decode_json("mtime", mtime_str, obj);
+ parse_time(mtime_str.c_str(), &mtime);
+ JSONDecoder::decode_json("etag", etag, obj);
+ JSONDecoder::decode_json("content_type", content_type, obj);
+ JSONDecoder::decode_json("storage_class", storage_class, obj);
+ list<_custom_entry<string> > str_entries;
+ JSONDecoder::decode_json("custom-string", str_entries, obj);
+ for (auto& e : str_entries) {
+ custom_str[e.name] = e.value;
+ }
+ list<_custom_entry<int64_t> > int_entries;
+ JSONDecoder::decode_json("custom-int", int_entries, obj);
+ for (auto& e : int_entries) {
+ custom_int[e.name] = e.value;
+ }
+ list<_custom_entry<string> > date_entries;
+ JSONDecoder::decode_json("custom-date", date_entries, obj);
+ for (auto& e : date_entries) {
+ custom_date[e.name] = e.value;
+ }
+ }
+ } meta;
+
+ void decode_json(JSONObj *obj) {
+ JSONDecoder::decode_json("bucket", bucket, obj);
+ JSONDecoder::decode_json("name", key.name, obj);
+ JSONDecoder::decode_json("instance", key.instance, obj);
+ JSONDecoder::decode_json("versioned_epoch", versioned_epoch, obj);
+ JSONDecoder::decode_json("permissions", read_permissions, obj);
+ JSONDecoder::decode_json("owner", owner, obj);
+ JSONDecoder::decode_json("meta", meta, obj);
+ }
+};
+
+struct es_search_response {
+ uint32_t took;
+ bool timed_out;
+ struct {
+ uint32_t total;
+ uint32_t successful;
+ uint32_t failed;
+ void decode_json(JSONObj *obj) {
+ JSONDecoder::decode_json("total", total, obj);
+ JSONDecoder::decode_json("successful", successful, obj);
+ JSONDecoder::decode_json("failed", failed, obj);
+ }
+ } shards;
+ struct obj_hit {
+ string index;
+ string type;
+ string id;
+ // double score
+ es_index_obj_response source;
+ void decode_json(JSONObj *obj) {
+ JSONDecoder::decode_json("_index", index, obj);
+ JSONDecoder::decode_json("_type", type, obj);
+ JSONDecoder::decode_json("_id", id, obj);
+ JSONDecoder::decode_json("_source", source, obj);
+ }
+ };
+ struct {
+ uint32_t total;
+ // double max_score;
+ list<obj_hit> hits;
+ void decode_json(JSONObj *obj) {
+ JSONDecoder::decode_json("total", total, obj);
+ // JSONDecoder::decode_json("max_score", max_score, obj);
+ JSONDecoder::decode_json("hits", hits, obj);
+ }
+ } hits;
+ void decode_json(JSONObj *obj) {
+ JSONDecoder::decode_json("took", took, obj);
+ JSONDecoder::decode_json("timed_out", timed_out, obj);
+ JSONDecoder::decode_json("_shards", shards, obj);
+ JSONDecoder::decode_json("hits", hits, obj);
+ }
+};
+
+class RGWMetadataSearchOp : public RGWOp {
+ RGWSyncModuleInstanceRef sync_module_ref;
+ RGWElasticSyncModuleInstance *es_module;
+protected:
+ string expression;
+ string custom_prefix;
+#define MAX_KEYS_DEFAULT 100
+ uint64_t max_keys{MAX_KEYS_DEFAULT};
+ string marker_str;
+ uint64_t marker{0};
+ string next_marker;
+ bool is_truncated{false};
+ string err;
+
+ es_search_response response;
+
+public:
+ RGWMetadataSearchOp(const RGWSyncModuleInstanceRef& sync_module) : sync_module_ref(sync_module) {
+ es_module = static_cast<RGWElasticSyncModuleInstance *>(sync_module_ref.get());
+ }
+
+ int verify_permission(optional_yield) override {
+ return 0;
+ }
+ virtual int get_params() = 0;
+ void pre_exec() override;
+ void execute(optional_yield y) override;
+
+ const char* name() const override { return "metadata_search"; }
+ virtual RGWOpType get_type() override { return RGW_OP_METADATA_SEARCH; }
+ virtual uint32_t op_mask() override { return RGW_OP_TYPE_READ; }
+};
+
+void RGWMetadataSearchOp::pre_exec()
+{
+ rgw_bucket_object_pre_exec(s);
+}
+
+void RGWMetadataSearchOp::execute(optional_yield y)
+{
+ op_ret = get_params();
+ if (op_ret < 0)
+ return;
+
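+  // implicit query conditions: non-system users are limited to objects they
+  // have read permission on, and a bucket name in the request scopes the
+  // search to that bucket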
+ list<pair<string, string> > conds;
+
+ if (!s->user->get_info().system) {
+ conds.push_back(make_pair("permissions", s->user->get_id().to_str()));
+ }
+
+ if (!s->bucket_name.empty()) {
+ conds.push_back(make_pair("bucket", s->bucket_name));
+ }
+
+ ESQueryCompiler es_query(expression, &conds, custom_prefix);
+
+ static map<string, string, ltstr_nocase> aliases = {
+ { "bucket", "bucket" }, /* forces lowercase */
+ { "name", "name" },
+ { "key", "name" },
+ { "instance", "instance" },
+ { "etag", "meta.etag" },
+ { "size", "meta.size" },
+ { "mtime", "meta.mtime" },
+ { "lastmodified", "meta.mtime" },
+ { "last_modified", "meta.mtime" },
+ { "contenttype", "meta.content_type" },
+ { "content_type", "meta.content_type" },
+ { "storageclass", "meta.storage_class" },
+ { "storage_class", "meta.storage_class" },
+ };
+ es_query.set_field_aliases(&aliases);
+
+ static map<string, ESEntityTypeMap::EntityType> generic_map = { {"bucket", ESEntityTypeMap::ES_ENTITY_STR},
+ {"name", ESEntityTypeMap::ES_ENTITY_STR},
+ {"instance", ESEntityTypeMap::ES_ENTITY_STR},
+ {"permissions", ESEntityTypeMap::ES_ENTITY_STR},
+ {"meta.etag", ESEntityTypeMap::ES_ENTITY_STR},
+ {"meta.content_type", ESEntityTypeMap::ES_ENTITY_STR},
+ {"meta.mtime", ESEntityTypeMap::ES_ENTITY_DATE},
+ {"meta.size", ESEntityTypeMap::ES_ENTITY_INT},
+ {"meta.storage_class", ESEntityTypeMap::ES_ENTITY_STR} };
+ ESEntityTypeMap gm(generic_map);
+ es_query.set_generic_type_map(&gm);
+
+ static set<string> restricted_fields = { {"permissions"} };
+ es_query.set_restricted_fields(&restricted_fields);
+
+ map<string, ESEntityTypeMap::EntityType> custom_map;
+ for (auto& i : s->bucket->get_info().mdsearch_config) {
+ custom_map[i.first] = (ESEntityTypeMap::EntityType)i.second;
+ }
+
+ ESEntityTypeMap em(custom_map);
+ es_query.set_custom_type_map(&em);
+
+ bool valid = es_query.compile(&err);
+ if (!valid) {
+ ldpp_dout(this, 10) << "invalid query, failed generating request json" << dendl;
+ op_ret = -EINVAL;
+ return;
+ }
+
+ JSONFormatter f;
+ encode_json("root", es_query, &f);
+
+ RGWRESTConn *conn = es_module->get_rest_conn();
+
+ bufferlist in;
+ bufferlist out;
+
+ stringstream ss;
+
+ f.flush(ss);
+ in.append(ss.str());
+
+ string resource = es_module->get_index_path() + "/_search";
+ param_vec_t params;
+ static constexpr int BUFSIZE = 32;
+ char buf[BUFSIZE];
+ snprintf(buf, sizeof(buf), "%lld", (long long)max_keys);
+ params.push_back(param_pair_t("size", buf));
+ if (marker > 0) {
+ params.push_back(param_pair_t("from", marker_str.c_str()));
+ }
+ ldpp_dout(this, 20) << "sending request to elasticsearch, payload=" << string(in.c_str(), in.length()) << dendl;
+ auto& extra_headers = es_module->get_request_headers();
+ op_ret = conn->get_resource(s, resource, &params, &extra_headers,
+ out, &in, nullptr, y);
+ if (op_ret < 0) {
+ ldpp_dout(this, 0) << "ERROR: failed to fetch resource (r=" << resource << ", ret=" << op_ret << ")" << dendl;
+ return;
+ }
+
+ ldpp_dout(this, 20) << "response: " << string(out.c_str(), out.length()) << dendl;
+
+ JSONParser jparser;
+ if (!jparser.parse(out.c_str(), out.length())) {
+ ldpp_dout(this, 0) << "ERROR: failed to parse elasticsearch response" << dendl;
+ op_ret = -EINVAL;
+ return;
+ }
+
+ try {
+ decode_json_obj(response, &jparser);
+ } catch (const JSONDecoder::err& e) {
+ ldpp_dout(this, 0) << "ERROR: failed to decode JSON input: " << e.what() << dendl;
+ op_ret = -EINVAL;
+ return;
+ }
+
+}
+
+class RGWMetadataSearch_ObjStore_S3 : public RGWMetadataSearchOp {
+public:
+ explicit RGWMetadataSearch_ObjStore_S3(const RGWSyncModuleInstanceRef& _sync_module) : RGWMetadataSearchOp(_sync_module) {
+ custom_prefix = "x-amz-meta-";
+ }
+
+ int get_params() override {
+ expression = s->info.args.get("query");
+ bool exists;
+ string max_keys_str = s->info.args.get("max-keys", &exists);
+#define MAX_KEYS_MAX 10000
+ if (exists) {
+ string err;
+ max_keys = strict_strtoll(max_keys_str.c_str(), 10, &err);
+ if (!err.empty()) {
+ return -EINVAL;
+ }
+ if (max_keys > MAX_KEYS_MAX) {
+ max_keys = MAX_KEYS_MAX;
+ }
+ }
+ marker_str = s->info.args.get("marker", &exists);
+ if (exists) {
+ string err;
+ marker = strict_strtoll(marker_str.c_str(), 10, &err);
+ if (!err.empty()) {
+ return -EINVAL;
+ }
+ }
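+    // the marker is a numeric offset into the result set; precompute the next
+    // marker as offset + page size (the offset is passed to Elasticsearch as
+    // the "from" parameter)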
+ uint64_t nm = marker + max_keys;
+ static constexpr int BUFSIZE = 32;
+ char buf[BUFSIZE];
+ snprintf(buf, sizeof(buf), "%lld", (long long)nm);
+ next_marker = buf;
+ return 0;
+ }
+ void send_response() override {
+ if (op_ret) {
+ s->err.message = err;
+ set_req_state_err(s, op_ret);
+ }
+ dump_errno(s);
+ end_header(s, this, "application/xml");
+
+ if (op_ret < 0) {
+ return;
+ }
+
+ is_truncated = (response.hits.hits.size() >= max_keys);
+
+ s->formatter->open_object_section("SearchMetadataResponse");
+ s->formatter->dump_string("Marker", marker_str);
+ s->formatter->dump_string("IsTruncated", (is_truncated ? "true" : "false"));
+ if (is_truncated) {
+ s->formatter->dump_string("NextMarker", next_marker);
+ }
+ if (s->format == RGWFormat::JSON) {
+ s->formatter->open_array_section("Objects");
+ }
+ for (auto& i : response.hits.hits) {
+ s->formatter->open_object_section("Contents");
+ es_index_obj_response& e = i.source;
+ s->formatter->dump_string("Bucket", e.bucket);
+ s->formatter->dump_string("Key", e.key.name);
+ string instance = (!e.key.instance.empty() ? e.key.instance : "null");
+ s->formatter->dump_string("Instance", instance.c_str());
+ s->formatter->dump_int("VersionedEpoch", e.versioned_epoch);
+ dump_time(s, "LastModified", e.meta.mtime);
+ s->formatter->dump_int("Size", e.meta.size);
+ s->formatter->dump_format("ETag", "\"%s\"", e.meta.etag.c_str());
+ s->formatter->dump_string("ContentType", e.meta.content_type.c_str());
+ s->formatter->dump_string("StorageClass", e.meta.storage_class.c_str());
+ dump_owner(s, e.owner.get_id(), e.owner.get_display_name());
+ s->formatter->open_array_section("CustomMetadata");
+ for (auto& m : e.meta.custom_str) {
+ s->formatter->open_object_section("Entry");
+ s->formatter->dump_string("Name", m.first.c_str());
+ s->formatter->dump_string("Value", m.second);
+ s->formatter->close_section();
+ }
+ for (auto& m : e.meta.custom_int) {
+ s->formatter->open_object_section("Entry");
+ s->formatter->dump_string("Name", m.first.c_str());
+ s->formatter->dump_int("Value", m.second);
+ s->formatter->close_section();
+ }
+ for (auto& m : e.meta.custom_date) {
+ s->formatter->open_object_section("Entry");
+ s->formatter->dump_string("Name", m.first.c_str());
+ s->formatter->dump_string("Value", m.second);
+ s->formatter->close_section();
+ }
+ s->formatter->close_section();
+ rgw_flush_formatter(s, s->formatter);
+ s->formatter->close_section();
+    }
+ if (s->format == RGWFormat::JSON) {
+ s->formatter->close_section();
+ }
+ s->formatter->close_section();
+ rgw_flush_formatter_and_reset(s, s->formatter);
+ }
+};
+
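+// REST handler for the metadata search API: GET with a "query" parameter runs
+// a search, GET with "mdsearch" on a bucket returns its mdsearch configuration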
+class RGWHandler_REST_MDSearch_S3 : public RGWHandler_REST_S3 {
+protected:
+ RGWOp *op_get() override {
+ if (s->info.args.exists("query")) {
+ return new RGWMetadataSearch_ObjStore_S3(driver->get_sync_module());
+ }
+ if (!s->init_state.url_bucket.empty() &&
+ s->info.args.exists("mdsearch")) {
+ return new RGWGetBucketMetaSearch_ObjStore_S3;
+ }
+ return nullptr;
+ }
+ RGWOp *op_head() override {
+ return nullptr;
+ }
+ RGWOp *op_post() override {
+ return nullptr;
+ }
+public:
+ explicit RGWHandler_REST_MDSearch_S3(const rgw::auth::StrategyRegistry& auth_registry) : RGWHandler_REST_S3(auth_registry) {}
+ virtual ~RGWHandler_REST_MDSearch_S3() {}
+};
+
+
+RGWHandler_REST* RGWRESTMgr_MDSearch_S3::get_handler(rgw::sal::Driver* driver,
+ req_state* const s,
+ const rgw::auth::StrategyRegistry& auth_registry,
+ const std::string& frontend_prefix)
+{
+ int ret =
+ RGWHandler_REST_S3::init_from_header(driver, s,
+ RGWFormat::XML, true);
+ if (ret < 0) {
+ return nullptr;
+ }
+
+ if (!s->object->empty()) {
+ return nullptr;
+ }
+
+ RGWHandler_REST *handler = new RGWHandler_REST_MDSearch_S3(auth_registry);
+
+ ldpp_dout(s, 20) << __func__ << " handler=" << typeid(*handler).name()
+ << dendl;
+ return handler;
+}
+
diff --git a/src/rgw/driver/rados/rgw_sync_module_es_rest.h b/src/rgw/driver/rados/rgw_sync_module_es_rest.h
new file mode 100644
index 000000000..b18271a69
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_sync_module_es_rest.h
@@ -0,0 +1,18 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include "rgw_rest.h"
+
+class RGWElasticSyncModuleInstance;
+
+class RGWRESTMgr_MDSearch_S3 : public RGWRESTMgr {
+public:
+ explicit RGWRESTMgr_MDSearch_S3() {}
+
+ RGWHandler_REST *get_handler(rgw::sal::Driver* driver,
+ req_state* s,
+ const rgw::auth::StrategyRegistry& auth_registry,
+ const std::string& frontend_prefix) override;
+};
diff --git a/src/rgw/driver/rados/rgw_sync_module_log.cc b/src/rgw/driver/rados/rgw_sync_module_log.cc
new file mode 100644
index 000000000..9666ecc4c
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_sync_module_log.cc
@@ -0,0 +1,76 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "rgw_common.h"
+#include "rgw_coroutine.h"
+#include "rgw_cr_rados.h"
+#include "rgw_sync_module.h"
+#include "rgw_data_sync.h"
+#include "rgw_sync_module_log.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+class RGWLogStatRemoteObjCBCR : public RGWStatRemoteObjCBCR {
+public:
+ RGWLogStatRemoteObjCBCR(RGWDataSyncCtx *_sc,
+ rgw_bucket& _src_bucket, rgw_obj_key& _key) : RGWStatRemoteObjCBCR(_sc, _src_bucket, _key) {}
+ int operate(const DoutPrefixProvider *dpp) override {
+ ldpp_dout(dpp, 0) << "SYNC_LOG: stat of remote obj: z=" << sc->source_zone
+ << " b=" << src_bucket << " k=" << key << " size=" << size << " mtime=" << mtime
+ << " attrs=" << attrs << dendl;
+ return set_cr_done();
+ }
+
+};
+
+class RGWLogStatRemoteObjCR : public RGWCallStatRemoteObjCR {
+public:
+ RGWLogStatRemoteObjCR(RGWDataSyncCtx *_sc,
+ rgw_bucket& _src_bucket, rgw_obj_key& _key) : RGWCallStatRemoteObjCR(_sc, _src_bucket, _key) {
+ }
+
+ ~RGWLogStatRemoteObjCR() override {}
+
+ RGWStatRemoteObjCBCR *allocate_callback() override {
+ return new RGWLogStatRemoteObjCBCR(sc, src_bucket, key);
+ }
+};
+
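+// data sync module that only logs the requested operations: object sync is
+// reduced to a remote stat of the source object, while removals and delete
+// markers are logged and otherwise ignored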
+class RGWLogDataSyncModule : public RGWDataSyncModule {
+ string prefix;
+public:
+ explicit RGWLogDataSyncModule(const string& _prefix) : prefix(_prefix) {}
+
+ RGWCoroutine *sync_object(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key, std::optional<uint64_t> versioned_epoch, const rgw_zone_set_entry& source_trace_entry, rgw_zone_set *zones_trace) override {
+ ldpp_dout(dpp, 0) << prefix << ": SYNC_LOG: sync_object: b=" << sync_pipe.info.source_bs.bucket << " k=" << key << " versioned_epoch=" << versioned_epoch.value_or(0) << dendl;
+ return new RGWLogStatRemoteObjCR(sc, sync_pipe.info.source_bs.bucket, key);
+ }
+ RGWCoroutine *remove_object(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key, real_time& mtime, bool versioned, uint64_t versioned_epoch, rgw_zone_set *zones_trace) override {
+ ldpp_dout(dpp, 0) << prefix << ": SYNC_LOG: rm_object: b=" << sync_pipe.info.source_bs.bucket << " k=" << key << " mtime=" << mtime << " versioned=" << versioned << " versioned_epoch=" << versioned_epoch << dendl;
+ return NULL;
+ }
+ RGWCoroutine *create_delete_marker(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key, real_time& mtime,
+ rgw_bucket_entry_owner& owner, bool versioned, uint64_t versioned_epoch, rgw_zone_set *zones_trace) override {
+ ldpp_dout(dpp, 0) << prefix << ": SYNC_LOG: create_delete_marker: b=" << sync_pipe.info.source_bs.bucket << " k=" << key << " mtime=" << mtime
+ << " versioned=" << versioned << " versioned_epoch=" << versioned_epoch << dendl;
+ return NULL;
+ }
+};
+
+class RGWLogSyncModuleInstance : public RGWSyncModuleInstance {
+ RGWLogDataSyncModule data_handler;
+public:
+ explicit RGWLogSyncModuleInstance(const string& prefix) : data_handler(prefix) {}
+ RGWDataSyncModule *get_data_handler() override {
+ return &data_handler;
+ }
+};
+
+int RGWLogSyncModule::create_instance(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& config, RGWSyncModuleInstanceRef *instance) {
+ string prefix = config["prefix"];
+ instance->reset(new RGWLogSyncModuleInstance(prefix));
+ return 0;
+}
+
diff --git a/src/rgw/driver/rados/rgw_sync_module_log.h b/src/rgw/driver/rados/rgw_sync_module_log.h
new file mode 100644
index 000000000..ab475959d
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_sync_module_log.h
@@ -0,0 +1,15 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include "rgw_sync_module.h"
+
+class RGWLogSyncModule : public RGWSyncModule {
+public:
+ RGWLogSyncModule() {}
+ bool supports_data_export() override {
+ return false;
+ }
+ int create_instance(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& config, RGWSyncModuleInstanceRef *instance) override;
+};
diff --git a/src/rgw/driver/rados/rgw_sync_trace.cc b/src/rgw/driver/rados/rgw_sync_trace.cc
new file mode 100644
index 000000000..b34683593
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_sync_trace.cc
@@ -0,0 +1,290 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#ifndef CEPH_RGW_SYNC_TRACE_H
+#define CEPH_RGW_SYNC_TRACE_H
+
+#include <regex>
+
+#include "common/debug.h"
+#include "common/ceph_json.h"
+
+#include "rgw_sync_trace.h"
+#include "rgw_rados.h"
+#include "rgw_worker.h"
+
+#define dout_context g_ceph_context
+
+static constexpr auto dout_subsys = ceph_subsys_rgw;
+
+using namespace std;
+
+
+RGWSyncTraceNode::RGWSyncTraceNode(CephContext *_cct, uint64_t _handle,
+ const RGWSyncTraceNodeRef& _parent,
+ const string& _type, const string& _id) : cct(_cct),
+ parent(_parent),
+ type(_type),
+ id(_id),
+ handle(_handle),
+ history(cct->_conf->rgw_sync_trace_per_node_log_size)
+{
+ if (parent.get()) {
+ prefix = parent->get_prefix();
+ }
+
+ if (!type.empty()) {
+ prefix += type;
+ if (!id.empty()) {
+ prefix += "[" + id + "]";
+ }
+ prefix += ":";
+ }
+}
+
+void RGWSyncTraceNode::log(int level, const string& s)
+{
+ status = s;
+ history.push_back(status);
+  /* dump output on either rgw_sync or rgw -- but only once */
+ if (cct->_conf->subsys.should_gather(ceph_subsys_rgw_sync, level)) {
+ lsubdout(cct, rgw_sync,
+ ceph::dout::need_dynamic(level)) << "RGW-SYNC:" << to_str() << dendl;
+ } else {
+ lsubdout(cct, rgw,
+ ceph::dout::need_dynamic(level)) << "RGW-SYNC:" << to_str() << dendl;
+ }
+}
+
+
+class RGWSyncTraceServiceMapThread : public RGWRadosThread {
+ RGWRados *store;
+ RGWSyncTraceManager *manager;
+
+ uint64_t interval_msec() override {
+ return cct->_conf->rgw_sync_trace_servicemap_update_interval * 1000;
+ }
+public:
+ RGWSyncTraceServiceMapThread(RGWRados *_store, RGWSyncTraceManager *_manager)
+ : RGWRadosThread(_store, "sync-trace"), store(_store), manager(_manager) {}
+
+ int process(const DoutPrefixProvider *dpp) override;
+};
+
+int RGWSyncTraceServiceMapThread::process(const DoutPrefixProvider *dpp)
+{
+ map<string, string> status;
+ status["current_sync"] = manager->get_active_names();
+ int ret = store->update_service_map(dpp, std::move(status));
+ if (ret < 0) {
+ ldout(store->ctx(), 0) << "ERROR: update_service_map() returned ret=" << ret << dendl;
+ }
+ return 0;
+}
+
+RGWSyncTraceNodeRef RGWSyncTraceManager::add_node(const RGWSyncTraceNodeRef& parent,
+ const std::string& type,
+ const std::string& id)
+{
+ shunique_lock wl(lock, ceph::acquire_unique);
+ auto handle = alloc_handle();
+ RGWSyncTraceNodeRef& ref = nodes[handle];
+ ref.reset(new RGWSyncTraceNode(cct, handle, parent, type, id));
+ // return a separate shared_ptr that calls finish() on the node instead of
+ // deleting it. the lambda capture holds a reference to the original 'ref'
+ auto deleter = [ref, this] (RGWSyncTraceNode *node) { finish_node(node); };
+ return {ref.get(), deleter};
+}
+
+bool RGWSyncTraceNode::match(const string& search_term, bool search_history)
+{
+ try {
+ std::regex expr(search_term);
+ std::smatch m;
+
+ if (regex_search(prefix, m, expr)) {
+ return true;
+ }
+    if (regex_search(status, m, expr)) {
+ return true;
+ }
+ if (!search_history) {
+ return false;
+ }
+
+ for (auto h : history) {
+ if (regex_search(h, m, expr)) {
+ return true;
+ }
+ }
+ } catch (const std::regex_error& e) {
+    ldout(cct, 5) << "NOTICE: sync trace: invalid regex search term: " << e.what() << dendl;
+ }
+
+ return false;
+}
+
+void RGWSyncTraceManager::init(RGWRados *store)
+{
+ service_map_thread = new RGWSyncTraceServiceMapThread(store, this);
+ service_map_thread->start();
+}
+
+RGWSyncTraceManager::~RGWSyncTraceManager()
+{
+ cct->get_admin_socket()->unregister_commands(this);
+ service_map_thread->stop();
+ delete service_map_thread;
+
+ nodes.clear();
+}
+
+int RGWSyncTraceManager::hook_to_admin_command()
+{
+ AdminSocket *admin_socket = cct->get_admin_socket();
+
+ admin_commands = { { "sync trace show name=search,type=CephString,req=false", "sync trace show [filter_str]: show current multisite tracing information" },
+ { "sync trace history name=search,type=CephString,req=false", "sync trace history [filter_str]: show history of multisite tracing information" },
+ { "sync trace active name=search,type=CephString,req=false", "show active multisite sync entities information" },
+ { "sync trace active_short name=search,type=CephString,req=false", "show active multisite sync entities entries" } };
+ for (auto cmd : admin_commands) {
+ int r = admin_socket->register_command(cmd[0], this,
+ cmd[1]);
+ if (r < 0) {
+ lderr(cct) << "ERROR: fail to register admin socket command (r=" << r << ")" << dendl;
+ return r;
+ }
+ }
+ return 0;
+}
+
+static void dump_node(RGWSyncTraceNode *entry, bool show_history, Formatter *f)
+{
+ f->open_object_section("entry");
+ ::encode_json("status", entry->to_str(), f);
+ if (show_history) {
+ f->open_array_section("history");
+ for (auto h : entry->get_history()) {
+ ::encode_json("entry", h, f);
+ }
+ f->close_section();
+ }
+ f->close_section();
+}
+
+string RGWSyncTraceManager::get_active_names()
+{
+ shunique_lock rl(lock, ceph::acquire_shared);
+
+ stringstream ss;
+ JSONFormatter f;
+
+ f.open_array_section("result");
+ for (auto n : nodes) {
+ auto& entry = n.second;
+
+ if (!entry->test_flags(RGW_SNS_FLAG_ACTIVE)) {
+ continue;
+ }
+ const string& name = entry->get_resource_name();
+ if (!name.empty()) {
+ ::encode_json("entry", name, &f);
+ }
+ f.flush(ss);
+ }
+ f.close_section();
+ f.flush(ss);
+
+ return ss.str();
+}
+
+int RGWSyncTraceManager::call(std::string_view command, const cmdmap_t& cmdmap,
+ const bufferlist&,
+ Formatter *f,
+ std::ostream& ss,
+ bufferlist& out) {
+
+ bool show_history = (command == "sync trace history");
+ bool show_short = (command == "sync trace active_short");
+ bool show_active = (command == "sync trace active") || show_short;
+
+ string search;
+
+ auto si = cmdmap.find("search");
+ if (si != cmdmap.end()) {
+ search = boost::get<string>(si->second);
+ }
+
+ shunique_lock rl(lock, ceph::acquire_shared);
+
+ f->open_object_section("result");
+ f->open_array_section("running");
+ for (auto n : nodes) {
+ auto& entry = n.second;
+
+ if (!search.empty() && !entry->match(search, show_history)) {
+ continue;
+ }
+ if (show_active && !entry->test_flags(RGW_SNS_FLAG_ACTIVE)) {
+ continue;
+ }
+ if (show_short) {
+ const string& name = entry->get_resource_name();
+ if (!name.empty()) {
+ ::encode_json("entry", name, f);
+ }
+ } else {
+ dump_node(entry.get(), show_history, f);
+ }
+ f->flush(out);
+ }
+ f->close_section();
+
+ f->open_array_section("complete");
+ for (auto& entry : complete_nodes) {
+ if (!search.empty() && !entry->match(search, show_history)) {
+ continue;
+ }
+ if (show_active && !entry->test_flags(RGW_SNS_FLAG_ACTIVE)) {
+ continue;
+ }
+ dump_node(entry.get(), show_history, f);
+ f->flush(out);
+ }
+ f->close_section();
+
+ f->close_section();
+
+ return 0;
+}
+
+void RGWSyncTraceManager::finish_node(RGWSyncTraceNode *node)
+{
+ RGWSyncTraceNodeRef old_node;
+
+ {
+ shunique_lock wl(lock, ceph::acquire_unique);
+ if (!node) {
+ return;
+ }
+ auto iter = nodes.find(node->handle);
+ if (iter == nodes.end()) {
+ /* not found, already finished */
+ return;
+ }
+
+ if (complete_nodes.full()) {
+      /* take a reference to the entry that is about to be evicted;
+       * it must not be destroyed while the lock is held, otherwise
+       * it would call finish_node() and deadlock
+       */
+ old_node = complete_nodes.front();
+ }
+
+ complete_nodes.push_back(iter->second);
+ nodes.erase(iter);
+ }
+}
+
+#endif
+
diff --git a/src/rgw/driver/rados/rgw_sync_trace.h b/src/rgw/driver/rados/rgw_sync_trace.h
new file mode 100644
index 000000000..1fcc8bed8
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_sync_trace.h
@@ -0,0 +1,141 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include <atomic>
+
+#include "common/ceph_mutex.h"
+#include "common/shunique_lock.h"
+#include "common/admin_socket.h"
+
+#include <set>
+#include <ostream>
+#include <string>
+#include <shared_mutex>
+#include <boost/circular_buffer.hpp>
+
+#define SSTR(o) ({ \
+ std::stringstream ss; \
+ ss << o; \
+ ss.str(); \
+})
+
+#define RGW_SNS_FLAG_ACTIVE 1
+#define RGW_SNS_FLAG_ERROR 2
+
+class RGWRados;
+class RGWSyncTraceManager;
+class RGWSyncTraceNode;
+class RGWSyncTraceServiceMapThread;
+
+using RGWSyncTraceNodeRef = std::shared_ptr<RGWSyncTraceNode>;
+
+class RGWSyncTraceNode final {
+ friend class RGWSyncTraceManager;
+
+ CephContext *cct;
+ RGWSyncTraceNodeRef parent;
+
+ uint16_t state{0};
+ std::string status;
+
+ ceph::mutex lock = ceph::make_mutex("RGWSyncTraceNode::lock");
+
+ std::string type;
+ std::string id;
+
+ std::string prefix;
+
+ std::string resource_name;
+
+ uint64_t handle;
+
+ boost::circular_buffer<std::string> history;
+
+ // private constructor, create with RGWSyncTraceManager::add_node()
+ RGWSyncTraceNode(CephContext *_cct, uint64_t _handle,
+ const RGWSyncTraceNodeRef& _parent,
+ const std::string& _type, const std::string& _id);
+
+ public:
+ void set_resource_name(const std::string& s) {
+ resource_name = s;
+ }
+
+ const std::string& get_resource_name() {
+ return resource_name;
+ }
+
+ void set_flag(uint16_t s) {
+ state |= s;
+ }
+ void unset_flag(uint16_t s) {
+ state &= ~s;
+ }
+ bool test_flags(uint16_t f) {
+ return (state & f) == f;
+ }
+ void log(int level, const std::string& s);
+
+ std::string to_str() {
+ return prefix + " " + status;
+ }
+
+ const std::string& get_prefix() {
+ return prefix;
+ }
+
+ std::ostream& operator<<(std::ostream& os) {
+ os << to_str();
+ return os;
+ }
+
+ boost::circular_buffer<std::string>& get_history() {
+ return history;
+ }
+
+ bool match(const std::string& search_term, bool search_history);
+};
+
+class RGWSyncTraceManager : public AdminSocketHook {
+ friend class RGWSyncTraceNode;
+
+ mutable std::shared_timed_mutex lock;
+ using shunique_lock = ceph::shunique_lock<decltype(lock)>;
+
+ CephContext *cct;
+ RGWSyncTraceServiceMapThread *service_map_thread{nullptr};
+
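+  // live trace nodes indexed by handle; finished nodes move into the bounded
+  // complete_nodes buffer so recent history stays visible to admin commands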
+ std::map<uint64_t, RGWSyncTraceNodeRef> nodes;
+ boost::circular_buffer<RGWSyncTraceNodeRef> complete_nodes;
+
+ std::atomic<uint64_t> count = { 0 };
+
+ std::list<std::array<std::string, 3> > admin_commands;
+
+ uint64_t alloc_handle() {
+ return ++count;
+ }
+ void finish_node(RGWSyncTraceNode *node);
+
+public:
+ RGWSyncTraceManager(CephContext *_cct, int max_lru) : cct(_cct), complete_nodes(max_lru) {}
+ ~RGWSyncTraceManager();
+
+ void init(RGWRados *store);
+
+ const RGWSyncTraceNodeRef root_node;
+
+ RGWSyncTraceNodeRef add_node(const RGWSyncTraceNodeRef& parent,
+ const std::string& type,
+ const std::string& id = "");
+
+ int hook_to_admin_command();
+ int call(std::string_view command, const cmdmap_t& cmdmap,
+ const bufferlist&,
+ Formatter *f,
+ std::ostream& ss,
+ bufferlist& out) override;
+ std::string get_active_names();
+};
diff --git a/src/rgw/driver/rados/rgw_tools.cc b/src/rgw/driver/rados/rgw_tools.cc
new file mode 100644
index 000000000..66651da5c
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_tools.cc
@@ -0,0 +1,437 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "common/errno.h"
+#include "librados/librados_asio.h"
+
+#include "include/stringify.h"
+
+#include "rgw_tools.h"
+#include "rgw_acl_s3.h"
+#include "rgw_aio_throttle.h"
+#include "rgw_compression.h"
+#include "common/BackTrace.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+#define READ_CHUNK_LEN (512 * 1024)
+
+using namespace std;
+
+int rgw_init_ioctx(const DoutPrefixProvider *dpp,
+ librados::Rados *rados, const rgw_pool& pool,
+ librados::IoCtx& ioctx, bool create,
+ bool mostly_omap,
+ bool bulk)
+{
+ int r = rados->ioctx_create(pool.name.c_str(), ioctx);
+ if (r == -ENOENT && create) {
+ r = rados->pool_create(pool.name.c_str());
+ if (r == -ERANGE) {
+ ldpp_dout(dpp, 0)
+ << __func__
+ << " ERROR: librados::Rados::pool_create returned " << cpp_strerror(-r)
+ << " (this can be due to a pool or placement group misconfiguration, e.g."
+ << " pg_num < pgp_num or mon_max_pg_per_osd exceeded)"
+ << dendl;
+ }
+ if (r < 0 && r != -EEXIST) {
+ return r;
+ }
+
+ r = rados->ioctx_create(pool.name.c_str(), ioctx);
+ if (r < 0) {
+ return r;
+ }
+
+ r = ioctx.application_enable(pg_pool_t::APPLICATION_NAME_RGW, false);
+ if (r < 0 && r != -EOPNOTSUPP) {
+ return r;
+ }
+
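+    // tune the newly created pool via mon commands; failures here only log a
+    // warning and do not fail pool creation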
+ if (mostly_omap) {
+ // set pg_autoscale_bias
+ bufferlist inbl;
+ float bias = g_conf().get_val<double>("rgw_rados_pool_autoscale_bias");
+ int r = rados->mon_command(
+ "{\"prefix\": \"osd pool set\", \"pool\": \"" +
+ pool.name + "\", \"var\": \"pg_autoscale_bias\", \"val\": \"" +
+ stringify(bias) + "\"}",
+ inbl, NULL, NULL);
+ if (r < 0) {
+ ldpp_dout(dpp, 10) << __func__ << " warning: failed to set pg_autoscale_bias on "
+ << pool.name << dendl;
+ }
+ // set recovery_priority
+ int p = g_conf().get_val<uint64_t>("rgw_rados_pool_recovery_priority");
+ r = rados->mon_command(
+ "{\"prefix\": \"osd pool set\", \"pool\": \"" +
+        pool.name + "\", \"var\": \"recovery_priority\", \"val\": \"" +
+ stringify(p) + "\"}",
+ inbl, NULL, NULL);
+ if (r < 0) {
+ ldpp_dout(dpp, 10) << __func__ << " warning: failed to set recovery_priority on "
+ << pool.name << dendl;
+ }
+ }
+ if (bulk) {
+ // set bulk
+ bufferlist inbl;
+ int r = rados->mon_command(
+ "{\"prefix\": \"osd pool set\", \"pool\": \"" +
+ pool.name + "\", \"var\": \"bulk\", \"val\": \"true\"}",
+ inbl, NULL, NULL);
+ if (r < 0) {
+ ldpp_dout(dpp, 10) << __func__ << " warning: failed to set 'bulk' on "
+ << pool.name << dendl;
+ }
+ }
+ } else if (r < 0) {
+ return r;
+ }
+ if (!pool.ns.empty()) {
+ ioctx.set_namespace(pool.ns);
+ }
+ return 0;
+}
+
+map<string, bufferlist>* no_change_attrs() {
+ static map<string, bufferlist> no_change;
+ return &no_change;
+}
+
+int rgw_put_system_obj(const DoutPrefixProvider *dpp, RGWSI_SysObj* svc_sysobj,
+ const rgw_pool& pool, const string& oid, bufferlist& data, bool exclusive,
+ RGWObjVersionTracker *objv_tracker, real_time set_mtime, optional_yield y, map<string, bufferlist> *pattrs)
+{
+ map<string,bufferlist> no_attrs;
+ if (!pattrs) {
+ pattrs = &no_attrs;
+ }
+
+ rgw_raw_obj obj(pool, oid);
+
+ auto sysobj = svc_sysobj->get_obj(obj);
+ int ret;
+
+ if (pattrs != no_change_attrs()) {
+ ret = sysobj.wop()
+ .set_objv_tracker(objv_tracker)
+ .set_exclusive(exclusive)
+ .set_mtime(set_mtime)
+ .set_attrs(*pattrs)
+ .write(dpp, data, y);
+ } else {
+ ret = sysobj.wop()
+ .set_objv_tracker(objv_tracker)
+ .set_exclusive(exclusive)
+ .set_mtime(set_mtime)
+ .write_data(dpp, data, y);
+ }
+
+ return ret;
+}
+
+int rgw_stat_system_obj(const DoutPrefixProvider *dpp, RGWSI_SysObj* svc_sysobj,
+ const rgw_pool& pool, const std::string& key,
+ RGWObjVersionTracker *objv_tracker,
+ real_time *pmtime, optional_yield y,
+ std::map<std::string, bufferlist> *pattrs)
+{
+ rgw_raw_obj obj(pool, key);
+ auto sysobj = svc_sysobj->get_obj(obj);
+ return sysobj.rop()
+ .set_attrs(pattrs)
+ .set_last_mod(pmtime)
+ .stat(y, dpp);
+}
+
+
+int rgw_get_system_obj(RGWSI_SysObj* svc_sysobj, const rgw_pool& pool, const string& key, bufferlist& bl,
+ RGWObjVersionTracker *objv_tracker, real_time *pmtime, optional_yield y,
+ const DoutPrefixProvider *dpp, map<string, bufferlist> *pattrs,
+ rgw_cache_entry_info *cache_info,
+ boost::optional<obj_version> refresh_version, bool raw_attrs)
+{
+ const rgw_raw_obj obj(pool, key);
+ auto sysobj = svc_sysobj->get_obj(obj);
+ auto rop = sysobj.rop();
+ return rop.set_attrs(pattrs)
+ .set_last_mod(pmtime)
+ .set_objv_tracker(objv_tracker)
+ .set_raw_attrs(raw_attrs)
+ .set_cache_info(cache_info)
+ .set_refresh_version(refresh_version)
+ .read(dpp, &bl, y);
+}
+
+int rgw_delete_system_obj(const DoutPrefixProvider *dpp,
+ RGWSI_SysObj *sysobj_svc, const rgw_pool& pool, const string& oid,
+ RGWObjVersionTracker *objv_tracker, optional_yield y)
+{
+ auto sysobj = sysobj_svc->get_obj(rgw_raw_obj{pool, oid});
+ rgw_raw_obj obj(pool, oid);
+ return sysobj.wop()
+ .set_objv_tracker(objv_tracker)
+ .remove(dpp, y);
+}
+
+int rgw_rados_operate(const DoutPrefixProvider *dpp, librados::IoCtx& ioctx, const std::string& oid,
+ librados::ObjectReadOperation *op, bufferlist* pbl,
+ optional_yield y, int flags)
+{
+ // given a yield_context, call async_operate() to yield the coroutine instead
+ // of blocking
+ if (y) {
+ auto& context = y.get_io_context();
+ auto& yield = y.get_yield_context();
+ boost::system::error_code ec;
+ auto bl = librados::async_operate(
+ context, ioctx, oid, op, flags, yield[ec]);
+ if (pbl) {
+ *pbl = std::move(bl);
+ }
+ return -ec.value();
+ }
+ // work on asio threads should be asynchronous, so warn when they block
+ if (is_asio_thread) {
+ ldpp_dout(dpp, 20) << "WARNING: blocking librados call" << dendl;
+#ifdef _BACKTRACE_LOGGING
+ ldpp_dout(dpp, 20) << "BACKTRACE: " << __func__ << ": " << ClibBackTrace(0) << dendl;
+#endif
+ }
+ return ioctx.operate(oid, op, nullptr, flags);
+}
+
+int rgw_rados_operate(const DoutPrefixProvider *dpp, librados::IoCtx& ioctx, const std::string& oid,
+ librados::ObjectWriteOperation *op, optional_yield y,
+ int flags)
+{
+ if (y) {
+ auto& context = y.get_io_context();
+ auto& yield = y.get_yield_context();
+ boost::system::error_code ec;
+ librados::async_operate(context, ioctx, oid, op, flags, yield[ec]);
+ return -ec.value();
+ }
+ if (is_asio_thread) {
+ ldpp_dout(dpp, 20) << "WARNING: blocking librados call" << dendl;
+#ifdef _BACKTRACE_LOGGING
+ ldpp_dout(dpp, 20) << "BACKTRACE: " << __func__ << ": " << ClibBackTrace(0) << dendl;
+#endif
+ }
+ return ioctx.operate(oid, op, flags);
+}
+
+int rgw_rados_notify(const DoutPrefixProvider *dpp, librados::IoCtx& ioctx, const std::string& oid,
+ bufferlist& bl, uint64_t timeout_ms, bufferlist* pbl,
+ optional_yield y)
+{
+ if (y) {
+ auto& context = y.get_io_context();
+ auto& yield = y.get_yield_context();
+ boost::system::error_code ec;
+ auto reply = librados::async_notify(context, ioctx, oid,
+ bl, timeout_ms, yield[ec]);
+ if (pbl) {
+ *pbl = std::move(reply);
+ }
+ return -ec.value();
+ }
+ if (is_asio_thread) {
+ ldpp_dout(dpp, 20) << "WARNING: blocking librados call" << dendl;
+#ifdef _BACKTRACE_LOGGING
+ ldpp_dout(dpp, 20) << "BACKTRACE: " << __func__ << ": " << ClibBackTrace(0) << dendl;
+#endif
+ }
+ return ioctx.notify2(oid, bl, timeout_ms, pbl);
+}
+
+void rgw_filter_attrset(map<string, bufferlist>& unfiltered_attrset, const string& check_prefix,
+ map<string, bufferlist> *attrset)
+{
+ attrset->clear();
+ map<string, bufferlist>::iterator iter;
+ for (iter = unfiltered_attrset.lower_bound(check_prefix);
+ iter != unfiltered_attrset.end(); ++iter) {
+ if (!boost::algorithm::starts_with(iter->first, check_prefix))
+ break;
+ (*attrset)[iter->first] = iter->second;
+ }
+}
+
+RGWDataAccess::RGWDataAccess(rgw::sal::Driver* _driver) : driver(_driver)
+{
+}
+
+
+int RGWDataAccess::Bucket::finish_init()
+{
+ auto iter = attrs.find(RGW_ATTR_ACL);
+ if (iter == attrs.end()) {
+ return 0;
+ }
+
+ bufferlist::const_iterator bliter = iter->second.begin();
+ try {
+ policy.decode(bliter);
+ } catch (buffer::error& err) {
+ return -EIO;
+ }
+
+ return 0;
+}
+
+int RGWDataAccess::Bucket::init(const DoutPrefixProvider *dpp, optional_yield y)
+{
+ std::unique_ptr<rgw::sal::Bucket> bucket;
+ int ret = sd->driver->get_bucket(dpp, nullptr, tenant, name, &bucket, y);
+ if (ret < 0) {
+ return ret;
+ }
+
+ bucket_info = bucket->get_info();
+ mtime = bucket->get_modification_time();
+ attrs = bucket->get_attrs();
+
+ return finish_init();
+}
+
+int RGWDataAccess::Bucket::init(const RGWBucketInfo& _bucket_info,
+ const map<string, bufferlist>& _attrs)
+{
+ bucket_info = _bucket_info;
+ attrs = _attrs;
+
+ return finish_init();
+}
+
+int RGWDataAccess::Bucket::get_object(const rgw_obj_key& key,
+ ObjectRef *obj) {
+ obj->reset(new Object(sd, shared_from_this(), key));
+ return 0;
+}
+
+int RGWDataAccess::Object::put(bufferlist& data,
+ map<string, bufferlist>& attrs,
+ const DoutPrefixProvider *dpp,
+ optional_yield y)
+{
+ rgw::sal::Driver* driver = sd->driver;
+ CephContext *cct = driver->ctx();
+
+ string tag;
+ append_rand_alpha(cct, tag, tag, 32);
+
+ RGWBucketInfo& bucket_info = bucket->bucket_info;
+
+ rgw::BlockingAioThrottle aio(driver->ctx()->_conf->rgw_put_obj_min_window_size);
+
+ std::unique_ptr<rgw::sal::Bucket> b;
+ driver->get_bucket(NULL, bucket_info, &b);
+ std::unique_ptr<rgw::sal::Object> obj = b->get_object(key);
+
+ auto& owner = bucket->policy.get_owner();
+
+ string req_id = driver->zone_unique_id(driver->get_new_req_id());
+
+ std::unique_ptr<rgw::sal::Writer> processor;
+ processor = driver->get_atomic_writer(dpp, y, obj.get(),
+ owner.get_id(),
+ nullptr, olh_epoch, req_id);
+
+ int ret = processor->prepare(y);
+ if (ret < 0)
+ return ret;
+
+ rgw::sal::DataProcessor *filter = processor.get();
+
+ CompressorRef plugin;
+ boost::optional<RGWPutObj_Compress> compressor;
+
+ const auto& compression_type = driver->get_compression_type(bucket_info.placement_rule);
+ if (compression_type != "none") {
+ plugin = Compressor::create(driver->ctx(), compression_type);
+ if (!plugin) {
+ ldpp_dout(dpp, 1) << "Cannot load plugin for compression type "
+ << compression_type << dendl;
+ } else {
+ compressor.emplace(driver->ctx(), plugin, filter);
+ filter = &*compressor;
+ }
+ }
+
+ off_t ofs = 0;
+ auto obj_size = data.length();
+
+ RGWMD5Etag etag_calc;
+
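+  // feed the data through the processor chain (optionally via the compression
+  // filter) in rgw_max_chunk_size pieces, computing the MD5 etag along the way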
+ do {
+ size_t read_len = std::min(data.length(), (unsigned int)cct->_conf->rgw_max_chunk_size);
+
+ bufferlist bl;
+
+ data.splice(0, read_len, &bl);
+ etag_calc.update(bl);
+
+ ret = filter->process(std::move(bl), ofs);
+ if (ret < 0)
+ return ret;
+
+ ofs += read_len;
+ } while (data.length() > 0);
+
+ ret = filter->process({}, ofs);
+ if (ret < 0) {
+ return ret;
+ }
+ bool has_etag_attr = false;
+ auto iter = attrs.find(RGW_ATTR_ETAG);
+ if (iter != attrs.end()) {
+ bufferlist& bl = iter->second;
+ etag = bl.to_str();
+ has_etag_attr = true;
+ }
+
+ if (!aclbl) {
+ RGWAccessControlPolicy_S3 policy(cct);
+
+ policy.create_canned(bucket->policy.get_owner(), bucket->policy.get_owner(), string()); /* default private policy */
+
+ policy.encode(aclbl.emplace());
+ }
+
+ if (etag.empty()) {
+ etag_calc.finish(&etag);
+ }
+
+ if (!has_etag_attr) {
+ bufferlist etagbl;
+ etagbl.append(etag);
+ attrs[RGW_ATTR_ETAG] = etagbl;
+ }
+ attrs[RGW_ATTR_ACL] = *aclbl;
+
+ string *puser_data = nullptr;
+ if (user_data) {
+ puser_data = &(*user_data);
+ }
+
+ return processor->complete(obj_size, etag,
+ &mtime, mtime,
+ attrs, delete_at,
+ nullptr, nullptr,
+ puser_data,
+ nullptr, nullptr, y);
+}
+
+void RGWDataAccess::Object::set_policy(const RGWAccessControlPolicy& policy)
+{
+ policy.encode(aclbl.emplace());
+}
+
+void rgw_complete_aio_completion(librados::AioCompletion* c, int r) {
+ auto pc = c->pc;
+ librados::CB_AioCompleteAndSafe cb(pc);
+ cb(r);
+}
diff --git a/src/rgw/driver/rados/rgw_tools.h b/src/rgw/driver/rados/rgw_tools.h
new file mode 100644
index 000000000..66600856d
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_tools.h
@@ -0,0 +1,276 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include <string>
+
+#include "include/types.h"
+#include "include/ceph_hash.h"
+
+#include "common/ceph_time.h"
+
+#include "rgw_common.h"
+#include "rgw_sal_fwd.h"
+
+class RGWSI_SysObj;
+
+class RGWRados;
+struct RGWObjVersionTracker;
+class optional_yield;
+
+struct obj_version;
+
+
+int rgw_init_ioctx(const DoutPrefixProvider *dpp,
+ librados::Rados *rados, const rgw_pool& pool,
+ librados::IoCtx& ioctx,
+ bool create = false,
+ bool mostly_omap = false,
+ bool bulk = false);
+
+#define RGW_NO_SHARD -1
+
+#define RGW_SHARDS_PRIME_0 7877
+#define RGW_SHARDS_PRIME_1 65521
+
+extern const std::string MP_META_SUFFIX;
+
+inline int rgw_shards_max()
+{
+ return RGW_SHARDS_PRIME_1;
+}
+
+// only called by rgw_shard_id and rgw_bucket_shard_index
+static inline int rgw_shards_mod(unsigned hval, int max_shards)
+{
+ if (max_shards <= RGW_SHARDS_PRIME_0) {
+ return hval % RGW_SHARDS_PRIME_0 % max_shards;
+ }
+ return hval % RGW_SHARDS_PRIME_1 % max_shards;
+}
+
+// used for logging and tagging
+inline int rgw_shard_id(const std::string& key, int max_shards)
+{
+ return rgw_shards_mod(ceph_str_hash_linux(key.c_str(), key.size()),
+ max_shards);
+}
+
+void rgw_shard_name(const std::string& prefix, unsigned max_shards, const std::string& key, std::string& name, int *shard_id);
+void rgw_shard_name(const std::string& prefix, unsigned max_shards, const std::string& section, const std::string& key, std::string& name);
+void rgw_shard_name(const std::string& prefix, unsigned shard_id, std::string& name);
+
+int rgw_put_system_obj(const DoutPrefixProvider *dpp, RGWSI_SysObj* svc_sysobj,
+ const rgw_pool& pool, const std::string& oid,
+ bufferlist& data, bool exclusive,
+ RGWObjVersionTracker *objv_tracker,
+ real_time set_mtime, optional_yield y,
+ std::map<std::string, bufferlist> *pattrs = nullptr);
+int rgw_get_system_obj(RGWSI_SysObj* svc_sysobj, const rgw_pool& pool,
+ const std::string& key, bufferlist& bl,
+ RGWObjVersionTracker *objv_tracker, real_time *pmtime,
+ optional_yield y, const DoutPrefixProvider *dpp,
+ std::map<std::string, bufferlist> *pattrs = nullptr,
+ rgw_cache_entry_info *cache_info = nullptr,
+ boost::optional<obj_version> refresh_version = boost::none,
+ bool raw_attrs=false);
+int rgw_delete_system_obj(const DoutPrefixProvider *dpp,
+ RGWSI_SysObj *sysobj_svc, const rgw_pool& pool, const std::string& oid,
+ RGWObjVersionTracker *objv_tracker, optional_yield y);
+int rgw_stat_system_obj(const DoutPrefixProvider *dpp, RGWSI_SysObj* svc_sysobj,
+ const rgw_pool& pool, const std::string& key,
+ RGWObjVersionTracker *objv_tracker,
+ real_time *pmtime, optional_yield y,
+ std::map<std::string, bufferlist> *pattrs = nullptr);
+
+const char *rgw_find_mime_by_ext(std::string& ext);
+
+void rgw_filter_attrset(std::map<std::string, bufferlist>& unfiltered_attrset, const std::string& check_prefix,
+ std::map<std::string, bufferlist> *attrset);
+
+/// indicates whether the current thread is in boost::asio::io_context::run(),
+/// used to log warnings if synchronous librados calls are made
+extern thread_local bool is_asio_thread;
+
+/// perform the rados operation, using the yield context when given
+int rgw_rados_operate(const DoutPrefixProvider *dpp, librados::IoCtx& ioctx, const std::string& oid,
+ librados::ObjectReadOperation *op, bufferlist* pbl,
+ optional_yield y, int flags = 0);
+int rgw_rados_operate(const DoutPrefixProvider *dpp, librados::IoCtx& ioctx, const std::string& oid,
+ librados::ObjectWriteOperation *op, optional_yield y,
+ int flags = 0);
+int rgw_rados_notify(const DoutPrefixProvider *dpp, librados::IoCtx& ioctx, const std::string& oid,
+ bufferlist& bl, uint64_t timeout_ms, bufferlist* pbl,
+ optional_yield y);
+
+int rgw_tools_init(const DoutPrefixProvider *dpp, CephContext *cct);
+void rgw_tools_cleanup();
+
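+// Helper that hashes buffers/strings with H and renders the S-byte digest as
+// a hex string, e.g. for computing S3-style ETags.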
+template<class H, size_t S>
+class RGWEtag
+{
+ H hash;
+
+public:
+ RGWEtag() {
+ if constexpr (std::is_same_v<H, MD5>) {
+ // Allow use of MD5 digest in FIPS mode for non-cryptographic purposes
+ hash.SetFlags(EVP_MD_CTX_FLAG_NON_FIPS_ALLOW);
+ }
+ }
+
+ void update(const char *buf, size_t len) {
+ hash.Update((const unsigned char *)buf, len);
+ }
+
+ void update(bufferlist& bl) {
+ if (bl.length() > 0) {
+ update(bl.c_str(), bl.length());
+ }
+ }
+
+ void update(const std::string& s) {
+ if (!s.empty()) {
+ update(s.c_str(), s.size());
+ }
+ }
+ void finish(std::string *etag) {
+ char etag_buf[S];
+ char etag_buf_str[S * 2 + 16];
+
+ hash.Final((unsigned char *)etag_buf);
+ buf_to_hex((const unsigned char *)etag_buf, S,
+ etag_buf_str);
+
+ *etag = etag_buf_str;
+ }
+};
+
+using RGWMD5Etag = RGWEtag<MD5, CEPH_CRYPTO_MD5_DIGESTSIZE>;
+
+class RGWDataAccess
+{
+ rgw::sal::Driver* driver;
+
+public:
+ RGWDataAccess(rgw::sal::Driver* _driver);
+
+ class Object;
+ class Bucket;
+
+ using BucketRef = std::shared_ptr<Bucket>;
+ using ObjectRef = std::shared_ptr<Object>;
+
+ class Bucket : public std::enable_shared_from_this<Bucket> {
+ friend class RGWDataAccess;
+ friend class Object;
+
+ RGWDataAccess *sd{nullptr};
+ RGWBucketInfo bucket_info;
+ std::string tenant;
+ std::string name;
+ std::string bucket_id;
+ ceph::real_time mtime;
+ std::map<std::string, bufferlist> attrs;
+
+ RGWAccessControlPolicy policy;
+ int finish_init();
+
+ Bucket(RGWDataAccess *_sd,
+ const std::string& _tenant,
+ const std::string& _name,
+ const std::string& _bucket_id) : sd(_sd),
+ tenant(_tenant),
+ name(_name),
+ bucket_id(_bucket_id) {}
+ Bucket(RGWDataAccess *_sd) : sd(_sd) {}
+ int init(const DoutPrefixProvider *dpp, optional_yield y);
+ int init(const RGWBucketInfo& _bucket_info, const std::map<std::string, bufferlist>& _attrs);
+ public:
+ int get_object(const rgw_obj_key& key,
+ ObjectRef *obj);
+
+ };
+
+
+ class Object {
+ RGWDataAccess *sd{nullptr};
+ BucketRef bucket;
+ rgw_obj_key key;
+
+ ceph::real_time mtime;
+ std::string etag;
+ uint64_t olh_epoch{0};
+ ceph::real_time delete_at;
+ std::optional<std::string> user_data;
+
+ std::optional<bufferlist> aclbl;
+
+ Object(RGWDataAccess *_sd,
+ BucketRef&& _bucket,
+ const rgw_obj_key& _key) : sd(_sd),
+ bucket(_bucket),
+ key(_key) {}
+ public:
+ int put(bufferlist& data, std::map<std::string, bufferlist>& attrs, const DoutPrefixProvider *dpp, optional_yield y); /* might modify attrs */
+
+ void set_mtime(const ceph::real_time& _mtime) {
+ mtime = _mtime;
+ }
+
+ void set_etag(const std::string& _etag) {
+ etag = _etag;
+ }
+
+ void set_olh_epoch(uint64_t epoch) {
+ olh_epoch = epoch;
+ }
+
+ void set_delete_at(ceph::real_time _delete_at) {
+ delete_at = _delete_at;
+ }
+
+ void set_user_data(const std::string& _user_data) {
+ user_data = _user_data;
+ }
+
+ void set_policy(const RGWAccessControlPolicy& policy);
+
+ friend class Bucket;
+ };
+
+ int get_bucket(const DoutPrefixProvider *dpp,
+ const std::string& tenant,
+ const std::string name,
+ const std::string bucket_id,
+ BucketRef *bucket,
+ optional_yield y) {
+ bucket->reset(new Bucket(this, tenant, name, bucket_id));
+ return (*bucket)->init(dpp, y);
+ }
+
+ int get_bucket(const RGWBucketInfo& bucket_info,
+ const std::map<std::string, bufferlist>& attrs,
+ BucketRef *bucket) {
+ bucket->reset(new Bucket(this));
+ return (*bucket)->init(bucket_info, attrs);
+ }
+ friend class Bucket;
+ friend class Object;
+};
+
+using RGWDataAccessRef = std::shared_ptr<RGWDataAccess>;
+
+/// Complete an AioCompletion. To return error values or otherwise
+/// satisfy the caller. Useful for making complicated asynchronous
+/// calls and error handling.
+void rgw_complete_aio_completion(librados::AioCompletion* c, int r);
+
+/// This returns a static, non-NULL pointer, recognized only by
+/// rgw_put_system_obj(). When supplied instead of the attributes, the
+/// attributes will be unmodified.
+///
+/// (Currently providing nullptr will wipe all attributes.)
+
+std::map<std::string, ceph::buffer::list>* no_change_attrs();
diff --git a/src/rgw/driver/rados/rgw_trim_bilog.cc b/src/rgw/driver/rados/rgw_trim_bilog.cc
new file mode 100644
index 000000000..4e34abf51
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_trim_bilog.cc
@@ -0,0 +1,1445 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 Red Hat, Inc
+ *
+ * Author: Casey Bodley <cbodley@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+#include <mutex>
+#include <boost/circular_buffer.hpp>
+#include <boost/container/flat_map.hpp>
+
+#include "include/scope_guard.h"
+#include "common/bounded_key_counter.h"
+#include "common/errno.h"
+#include "rgw_trim_bilog.h"
+#include "rgw_cr_rados.h"
+#include "rgw_cr_rest.h"
+#include "rgw_cr_tools.h"
+#include "rgw_data_sync.h"
+#include "rgw_metadata.h"
+#include "rgw_sal.h"
+#include "rgw_zone.h"
+#include "rgw_sync.h"
+#include "rgw_bucket.h"
+
+#include "services/svc_zone.h"
+#include "services/svc_meta.h"
+#include "services/svc_bilog_rados.h"
+
+#include <boost/asio/yield.hpp>
+#include "include/ceph_assert.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+#undef dout_prefix
+#define dout_prefix (*_dout << "trim: ")
+
+using namespace std;
+
+using rgw::BucketTrimConfig;
+using BucketChangeCounter = BoundedKeyCounter<std::string, int>;
+
+const std::string rgw::BucketTrimStatus::oid = "bilog.trim";
+using rgw::BucketTrimStatus;
+
+
+// watch/notify api for gateways to coordinate about which buckets to trim
+enum TrimNotifyType {
+ NotifyTrimCounters = 0,
+ NotifyTrimComplete,
+};
+WRITE_RAW_ENCODER(TrimNotifyType);
+
+struct TrimNotifyHandler {
+ virtual ~TrimNotifyHandler() = default;
+
+ virtual void handle(bufferlist::const_iterator& input, bufferlist& output) = 0;
+};
+
+/// api to share the bucket trim counters between gateways in the same zone.
+/// each gateway will process different datalog shards, so the gateway that runs
+/// the trim process needs to accumulate their counters
+struct TrimCounters {
+ /// counter for a single bucket
+ struct BucketCounter {
+ std::string bucket; //< bucket instance metadata key
+ int count{0};
+
+ BucketCounter() = default;
+ BucketCounter(const std::string& bucket, int count)
+ : bucket(bucket), count(count) {}
+
+ void encode(bufferlist& bl) const;
+ void decode(bufferlist::const_iterator& p);
+ };
+ using Vector = std::vector<BucketCounter>;
+
+ /// request bucket trim counters from peer gateways
+ struct Request {
+ uint16_t max_buckets; //< maximum number of bucket counters to return
+
+ void encode(bufferlist& bl) const;
+ void decode(bufferlist::const_iterator& p);
+ };
+
+ /// return the current bucket trim counters
+ struct Response {
+ Vector bucket_counters;
+
+ void encode(bufferlist& bl) const;
+ void decode(bufferlist::const_iterator& p);
+ };
+
+ /// server interface to query the hottest buckets
+ struct Server {
+ virtual ~Server() = default;
+
+ virtual void get_bucket_counters(int count, Vector& counters) = 0;
+ virtual void reset_bucket_counters() = 0;
+ };
+
+ /// notify handler
+ class Handler : public TrimNotifyHandler {
+ Server *const server;
+ public:
+ explicit Handler(Server *server) : server(server) {}
+
+ void handle(bufferlist::const_iterator& input, bufferlist& output) override;
+ };
+};
+std::ostream& operator<<(std::ostream& out, const TrimCounters::BucketCounter& rhs)
+{
+ return out << rhs.bucket << ":" << rhs.count;
+}
+
+void TrimCounters::BucketCounter::encode(bufferlist& bl) const
+{
+ using ceph::encode;
+ // no versioning to save space
+ encode(bucket, bl);
+ encode(count, bl);
+}
+void TrimCounters::BucketCounter::decode(bufferlist::const_iterator& p)
+{
+ using ceph::decode;
+ decode(bucket, p);
+ decode(count, p);
+}
+WRITE_CLASS_ENCODER(TrimCounters::BucketCounter);
+
+void TrimCounters::Request::encode(bufferlist& bl) const
+{
+ ENCODE_START(1, 1, bl);
+ encode(max_buckets, bl);
+ ENCODE_FINISH(bl);
+}
+void TrimCounters::Request::decode(bufferlist::const_iterator& p)
+{
+ DECODE_START(1, p);
+ decode(max_buckets, p);
+ DECODE_FINISH(p);
+}
+WRITE_CLASS_ENCODER(TrimCounters::Request);
+
+void TrimCounters::Response::encode(bufferlist& bl) const
+{
+ ENCODE_START(1, 1, bl);
+ encode(bucket_counters, bl);
+ ENCODE_FINISH(bl);
+}
+void TrimCounters::Response::decode(bufferlist::const_iterator& p)
+{
+ DECODE_START(1, p);
+ decode(bucket_counters, p);
+ DECODE_FINISH(p);
+}
+WRITE_CLASS_ENCODER(TrimCounters::Response);
+
+void TrimCounters::Handler::handle(bufferlist::const_iterator& input,
+ bufferlist& output)
+{
+ Request request;
+ decode(request, input);
+ auto count = std::min<uint16_t>(request.max_buckets, 128);
+
+ Response response;
+ server->get_bucket_counters(count, response.bucket_counters);
+ encode(response, output);
+}
+
+/// api to notify peer gateways that trim has completed and their bucket change
+/// counters can be reset
+struct TrimComplete {
+ struct Request {
+ void encode(bufferlist& bl) const;
+ void decode(bufferlist::const_iterator& p);
+ };
+ struct Response {
+ void encode(bufferlist& bl) const;
+ void decode(bufferlist::const_iterator& p);
+ };
+
+ /// server interface to reset bucket counters
+ using Server = TrimCounters::Server;
+
+ /// notify handler
+ class Handler : public TrimNotifyHandler {
+ Server *const server;
+ public:
+ explicit Handler(Server *server) : server(server) {}
+
+ void handle(bufferlist::const_iterator& input, bufferlist& output) override;
+ };
+};
+
+void TrimComplete::Request::encode(bufferlist& bl) const
+{
+ ENCODE_START(1, 1, bl);
+ ENCODE_FINISH(bl);
+}
+void TrimComplete::Request::decode(bufferlist::const_iterator& p)
+{
+ DECODE_START(1, p);
+ DECODE_FINISH(p);
+}
+WRITE_CLASS_ENCODER(TrimComplete::Request);
+
+void TrimComplete::Response::encode(bufferlist& bl) const
+{
+ ENCODE_START(1, 1, bl);
+ ENCODE_FINISH(bl);
+}
+void TrimComplete::Response::decode(bufferlist::const_iterator& p)
+{
+ DECODE_START(1, p);
+ DECODE_FINISH(p);
+}
+WRITE_CLASS_ENCODER(TrimComplete::Response);
+
+void TrimComplete::Handler::handle(bufferlist::const_iterator& input,
+ bufferlist& output)
+{
+ Request request;
+ decode(request, input);
+
+ server->reset_bucket_counters();
+
+ Response response;
+ encode(response, output);
+}
+
+
+/// rados watcher for bucket trim notifications
+class BucketTrimWatcher : public librados::WatchCtx2 {
+ rgw::sal::RadosStore* const store;
+ const rgw_raw_obj& obj;
+ rgw_rados_ref ref;
+ uint64_t handle{0};
+
+ using HandlerPtr = std::unique_ptr<TrimNotifyHandler>;
+ boost::container::flat_map<TrimNotifyType, HandlerPtr> handlers;
+
+ public:
+ BucketTrimWatcher(rgw::sal::RadosStore* store, const rgw_raw_obj& obj,
+ TrimCounters::Server *counters)
+ : store(store), obj(obj) {
+ handlers.emplace(NotifyTrimCounters,
+ std::make_unique<TrimCounters::Handler>(counters));
+ handlers.emplace(NotifyTrimComplete,
+ std::make_unique<TrimComplete::Handler>(counters));
+ }
+
+ ~BucketTrimWatcher() {
+ stop();
+ }
+
+ int start(const DoutPrefixProvider *dpp) {
+ int r = store->getRados()->get_raw_obj_ref(dpp, obj, &ref);
+ if (r < 0) {
+ return r;
+ }
+
+ // register a watch on the realm's control object
+ r = ref.pool.ioctx().watch2(ref.obj.oid, &handle, this);
+ if (r == -ENOENT) {
+ constexpr bool exclusive = true;
+ r = ref.pool.ioctx().create(ref.obj.oid, exclusive);
+ if (r == -EEXIST || r == 0) {
+ r = ref.pool.ioctx().watch2(ref.obj.oid, &handle, this);
+ }
+ }
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << "Failed to watch " << ref.obj
+ << " with " << cpp_strerror(-r) << dendl;
+ ref.pool.ioctx().close();
+ return r;
+ }
+
+ ldpp_dout(dpp, 10) << "Watching " << ref.obj.oid << dendl;
+ return 0;
+ }
+
+ int restart() {
+ int r = ref.pool.ioctx().unwatch2(handle);
+ if (r < 0) {
+ lderr(store->ctx()) << "Failed to unwatch on " << ref.obj
+ << " with " << cpp_strerror(-r) << dendl;
+ }
+ r = ref.pool.ioctx().watch2(ref.obj.oid, &handle, this);
+ if (r < 0) {
+ lderr(store->ctx()) << "Failed to restart watch on " << ref.obj
+ << " with " << cpp_strerror(-r) << dendl;
+ ref.pool.ioctx().close();
+ }
+ return r;
+ }
+
+ void stop() {
+ if (handle) {
+ ref.pool.ioctx().unwatch2(handle);
+ ref.pool.ioctx().close();
+ }
+ }
+
+ /// respond to bucket trim notifications
+ void handle_notify(uint64_t notify_id, uint64_t cookie,
+ uint64_t notifier_id, bufferlist& bl) override {
+ if (cookie != handle) {
+ return;
+ }
+ bufferlist reply;
+ try {
+ auto p = bl.cbegin();
+ TrimNotifyType type;
+ decode(type, p);
+
+ auto handler = handlers.find(type);
+ if (handler != handlers.end()) {
+ handler->second->handle(p, reply);
+ } else {
+ lderr(store->ctx()) << "no handler for notify type " << type << dendl;
+ }
+ } catch (const buffer::error& e) {
+ lderr(store->ctx()) << "Failed to decode notification: " << e.what() << dendl;
+ }
+ ref.pool.ioctx().notify_ack(ref.obj.oid, notify_id, cookie, reply);
+ }
+
+ /// reestablish the watch if it gets disconnected
+ void handle_error(uint64_t cookie, int err) override {
+ if (cookie != handle) {
+ return;
+ }
+ if (err == -ENOTCONN) {
+ ldout(store->ctx(), 4) << "Disconnected watch on " << ref.obj << dendl;
+ restart();
+ }
+ }
+};
+
+
+/// Interface to communicate with the trim manager about completed operations
+struct BucketTrimObserver {
+ virtual ~BucketTrimObserver() = default;
+
+ virtual void on_bucket_trimmed(std::string&& bucket_instance) = 0;
+ virtual bool trimmed_recently(const std::string_view& bucket_instance) = 0;
+};
+
+/// trim each bilog shard to the given marker, while limiting the number of
+/// concurrent requests
+class BucketTrimShardCollectCR : public RGWShardCollectCR {
+ static constexpr int MAX_CONCURRENT_SHARDS = 16;
+ const DoutPrefixProvider *dpp;
+ rgw::sal::RadosStore* const store;
+ const RGWBucketInfo& bucket_info;
+ rgw::bucket_index_layout_generation generation;
+ const std::vector<std::string>& markers; //< shard markers to trim
+ size_t i{0}; //< index of current shard marker
+
+ int handle_result(int r) override {
+ if (r == -ENOENT) { // ENOENT is not a fatal error
+ return 0;
+ }
+ if (r < 0) {
+ ldout(cct, 4) << "failed to trim bilog shard: " << cpp_strerror(r) << dendl;
+ }
+ return r;
+ }
+ public:
+ BucketTrimShardCollectCR(const DoutPrefixProvider *dpp,
+ rgw::sal::RadosStore* store, const RGWBucketInfo& bucket_info,
+ const rgw::bucket_index_layout_generation& generation,
+ const std::vector<std::string>& markers)
+ : RGWShardCollectCR(store->ctx(), MAX_CONCURRENT_SHARDS),
+ dpp(dpp), store(store), bucket_info(bucket_info),
+ generation(generation), markers(markers)
+ {}
+ bool spawn_next() override;
+};
+
+bool BucketTrimShardCollectCR::spawn_next()
+{
+ while (i < markers.size()) {
+ const auto& marker = markers[i];
+ const auto shard_id = i++;
+
+ // skip empty markers
+ if (!marker.empty()) {
+ ldpp_dout(dpp, 10) << "trimming bilog shard " << shard_id
+ << " of " << bucket_info.bucket << " at marker " << marker << dendl;
+ spawn(new RGWRadosBILogTrimCR(dpp, store, bucket_info, shard_id,
+ generation, std::string{}, marker),
+ false);
+ return true;
+ }
+ }
+ return false;
+}
+
+/// Delete a bucket index generation, limiting the number of requests in flight.
+class BucketCleanIndexCollectCR : public RGWShardCollectCR {
+ static constexpr int MAX_CONCURRENT_SHARDS = 16;
+ const DoutPrefixProvider *dpp;
+ rgw::sal::RadosStore* const store;
+ const RGWBucketInfo& bucket_info;
+ rgw::bucket_index_layout_generation index;
+ uint32_t shard = 0;
+ const uint32_t num_shards = rgw::num_shards(index);
+
+ int handle_result(int r) override {
+ if (r == -ENOENT) { // ENOENT is not a fatal error
+ return 0;
+ }
+ if (r < 0) {
+ ldout(cct, 4) << "clean index: " << cpp_strerror(r) << dendl;
+ }
+ return r;
+ }
+ public:
+ BucketCleanIndexCollectCR(const DoutPrefixProvider *dpp,
+ rgw::sal::RadosStore* store,
+ const RGWBucketInfo& bucket_info,
+ rgw::bucket_index_layout_generation index)
+ : RGWShardCollectCR(store->ctx(), MAX_CONCURRENT_SHARDS),
+ dpp(dpp), store(store), bucket_info(bucket_info),
+ index(index)
+ {}
+ bool spawn_next() override {
+ if (shard < num_shards) {
+ RGWRados::BucketShard bs(store->getRados());
+ bs.init(dpp, bucket_info, index, shard);
+ spawn(new RGWRadosRemoveOidCR(store, std::move(bs.bucket_obj), nullptr),
+ false);
+ ++shard;
+ return true;
+ } else {
+ return false;
+ }
+ }
+};
+
+
+/// trim the bilog of all of the given bucket instance's shards
+class BucketTrimInstanceCR : public RGWCoroutine {
+ static constexpr auto MAX_RETRIES = 25u;
+ rgw::sal::RadosStore* const store;
+ RGWHTTPManager *const http;
+ BucketTrimObserver *const observer;
+ std::string bucket_instance;
+ rgw_bucket_get_sync_policy_params get_policy_params;
+ std::shared_ptr<rgw_bucket_get_sync_policy_result> source_policy;
+ rgw_bucket bucket;
+ const std::string& zone_id; //< my zone id
+ RGWBucketInfo _bucket_info;
+ const RGWBucketInfo *pbucket_info; //< pointer to bucket instance info to locate bucket indices
+ int child_ret = 0;
+ const DoutPrefixProvider *dpp;
+public:
+ struct StatusShards {
+ uint64_t generation = 0;
+ std::vector<rgw_bucket_shard_sync_info> shards;
+ };
+private:
+ std::vector<StatusShards> peer_status; //< sync status for each peer
+ std::vector<std::string> min_markers; //< min marker per shard
+
+ /// The log generation to trim
+ rgw::bucket_log_layout_generation totrim;
+
+  /// If set, the updated bucket info and the log generation to be cleaned up
+ std::optional<std::pair<RGWBucketInfo,
+ rgw::bucket_log_layout_generation>> clean_info;
+ /// Maximum number of times to attempt to put bucket info
+ unsigned retries = 0;
+
+ int take_min_generation() {
+ // Initialize the min_generation to the bucket's current
+ // generation, used in case we have no peers.
+ auto min_generation = pbucket_info->layout.logs.back().gen;
+
+ // Determine the minimum generation
+ if (auto m = std::min_element(peer_status.begin(),
+ peer_status.end(),
+ [](const StatusShards& l,
+ const StatusShards& r) {
+ return l.generation < r.generation;
+ }); m != peer_status.end()) {
+ min_generation = m->generation;
+ }
+
+ auto& logs = pbucket_info->layout.logs;
+ auto log = std::find_if(logs.begin(), logs.end(),
+ rgw::matches_gen(min_generation));
+ if (log == logs.end()) {
+ ldpp_dout(dpp, 5) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << "ERROR: No log layout for min_generation="
+ << min_generation << dendl;
+ return -ENOENT;
+ }
+
+ totrim = *log;
+ return 0;
+ }
+
+ /// If there is a generation below the minimum, prepare to clean it up.
+ int maybe_remove_generation() {
+ if (clean_info)
+ return 0;
+
+
+ if (pbucket_info->layout.logs.front().gen < totrim.gen) {
+ clean_info = {*pbucket_info, {}};
+ auto log = clean_info->first.layout.logs.cbegin();
+ clean_info->second = *log;
+
+ if (clean_info->first.layout.logs.size() == 1) {
+ ldpp_dout(dpp, -1)
+ << "Critical error! Attempt to remove only log generation! "
+ << "log.gen=" << log->gen << ", totrim.gen=" << totrim.gen
+ << dendl;
+ return -EIO;
+ }
+ clean_info->first.layout.logs.erase(log);
+ }
+ return 0;
+ }
+
+ public:
+ BucketTrimInstanceCR(rgw::sal::RadosStore* store, RGWHTTPManager *http,
+ BucketTrimObserver *observer,
+ const std::string& bucket_instance,
+ const DoutPrefixProvider *dpp)
+ : RGWCoroutine(store->ctx()), store(store),
+ http(http), observer(observer),
+ bucket_instance(bucket_instance),
+ zone_id(store->svc()->zone->get_zone().id),
+ dpp(dpp) {
+ rgw_bucket_parse_bucket_key(cct, bucket_instance, &bucket, nullptr);
+ source_policy = make_shared<rgw_bucket_get_sync_policy_result>();
+ }
+
+ int operate(const DoutPrefixProvider *dpp) override;
+};
+
+namespace {
+/// populate the status with the minimum stable marker of each shard
+int take_min_status(
+ CephContext *cct,
+ const uint64_t min_generation,
+ std::vector<BucketTrimInstanceCR::StatusShards>::const_iterator first,
+ std::vector<BucketTrimInstanceCR::StatusShards>::const_iterator last,
+ std::vector<std::string> *status) {
+ for (auto peer = first; peer != last; ++peer) {
+ // Peers on later generations don't get a say in the matter
+ if (peer->generation > min_generation) {
+ continue;
+ }
+ if (peer->shards.size() != status->size()) {
+ // all peers must agree on the number of shards
+ return -EINVAL;
+ }
+
+ auto m = status->begin();
+ for (auto& shard : peer->shards) {
+ auto& marker = *m++;
+ // always take the first marker, or any later marker that's smaller
+ if (peer == first || marker > shard.inc_marker.position) {
+ marker = std::move(shard.inc_marker.position);
+ }
+ }
+ }
+ return 0;
+}
+}
+
+template<>
+inline int parse_decode_json<BucketTrimInstanceCR::StatusShards>(
+ BucketTrimInstanceCR::StatusShards& s, bufferlist& bl)
+{
+ JSONParser p;
+ if (!p.parse(bl.c_str(), bl.length())) {
+ return -EINVAL;
+ }
+
+ try {
+ bilog_status_v2 v;
+ decode_json_obj(v, &p);
+ s.generation = v.sync_status.incremental_gen;
+ s.shards = std::move(v.inc_status);
+ } catch (JSONDecoder::err& e) {
+ try {
+ // Fall back if we're talking to an old node that can't give v2
+ // output.
+ s.generation = 0;
+ decode_json_obj(s.shards, &p);
+ } catch (JSONDecoder::err& e) {
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
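+
+// Note: when a peer runs an older release that only reports the legacy (v1)
+// status format, the fallback above leaves its generation at 0. Such peers
+// pin the minimum generation at 0, so trimming stays on the oldest log
+// generation until they are upgraded.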
+
+int BucketTrimInstanceCR::operate(const DoutPrefixProvider *dpp)
+{
+ reenter(this) {
+ ldpp_dout(dpp, 4) << "starting trim on bucket=" << bucket_instance << dendl;
+
+ get_policy_params.zone = zone_id;
+ get_policy_params.bucket = bucket;
+ yield call(new RGWBucketGetSyncPolicyHandlerCR(store->svc()->rados->get_async_processor(),
+ store,
+ get_policy_params,
+ source_policy,
+ dpp));
+ if (retcode < 0) {
+ if (retcode != -ENOENT) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to fetch policy handler for bucket=" << bucket << dendl;
+ }
+
+ return set_cr_error(retcode);
+ }
+
+ if (auto& opt_bucket_info = source_policy->policy_handler->get_bucket_info();
+ opt_bucket_info) {
+ pbucket_info = &(*opt_bucket_info);
+ } else {
+ /* this shouldn't really happen */
+ return set_cr_error(-ENOENT);
+ }
+
+ if (pbucket_info->layout.logs.empty()) {
+ return set_cr_done(); // no bilogs to trim
+ }
+
+ // query peers for sync status
+ set_status("fetching sync status from relevant peers");
+ yield {
+ const auto& all_dests = source_policy->policy_handler->get_all_dests();
+
+ vector<rgw_zone_id> zids;
+ rgw_zone_id last_zid;
+ for (auto& diter : all_dests) {
+ const auto& zid = diter.first;
+ if (zid == last_zid) {
+ continue;
+ }
+ last_zid = zid;
+ zids.push_back(zid);
+ }
+
+ peer_status.resize(zids.size());
+
+ auto& zone_conn_map = store->svc()->zone->get_zone_conn_map();
+
+ auto p = peer_status.begin();
+ for (auto& zid : zids) {
+ // query data sync status from each sync peer
+ rgw_http_param_pair params[] = {
+ { "type", "bucket-index" },
+ { "status", nullptr },
+ { "options", "merge" },
+ { "bucket", bucket_instance.c_str() }, /* equal to source-bucket when `options==merge` and source-bucket
+ param is not provided */
+ { "source-zone", zone_id.c_str() },
+ { "version", "2" },
+ { nullptr, nullptr }
+ };
+
+ auto ziter = zone_conn_map.find(zid);
+ if (ziter == zone_conn_map.end()) {
+ ldpp_dout(dpp, 0) << "WARNING: no connection to zone " << zid << ", can't trim bucket: " << bucket << dendl;
+ return set_cr_error(-ECANCELED);
+ }
+
+ using StatusCR = RGWReadRESTResourceCR<StatusShards>;
+ spawn(new StatusCR(cct, ziter->second, http, "/admin/log/", params, &*p),
+ false);
+ ++p;
+ }
+ }
+ // wait for a response from each peer. all must respond to attempt trim
+ while (num_spawned()) {
+ yield wait_for_child();
+ collect(&child_ret, nullptr);
+ if (child_ret < 0) {
+ drain_all();
+ return set_cr_error(child_ret);
+ }
+ }
+
+ // Determine the minimum generation
+ retcode = take_min_generation();
+ if (retcode < 0) {
+ ldpp_dout(dpp, 4) << "failed to find minimum generation" << dendl;
+ return set_cr_error(retcode);
+ }
+ retcode = maybe_remove_generation();
+ if (retcode < 0) {
+ ldpp_dout(dpp, 4) << "error removing old generation from log: "
+ << cpp_strerror(retcode) << dendl;
+ return set_cr_error(retcode);
+ }
+
+ if (clean_info) {
+ if (clean_info->second.layout.type != rgw::BucketLogType::InIndex) {
+ ldpp_dout(dpp, 0) << "Unable to convert log of unknown type "
+ << clean_info->second.layout.type
+ << " to rgw::bucket_index_layout_generation " << dendl;
+ return set_cr_error(-EINVAL);
+ }
+
+ yield call(new BucketCleanIndexCollectCR(dpp, store, clean_info->first,
+ clean_info->second.layout.in_index));
+ if (retcode < 0) {
+ ldpp_dout(dpp, 0) << "failed to remove previous generation: "
+ << cpp_strerror(retcode) << dendl;
+ return set_cr_error(retcode);
+ }
+ while (clean_info && retries < MAX_RETRIES) {
+ yield call(new RGWPutBucketInstanceInfoCR(
+ store->svc()->rados->get_async_processor(),
+ store, clean_info->first, false, {},
+ no_change_attrs(), dpp));
+
+ // Raced, try again.
+ if (retcode == -ECANCELED) {
+ yield call(new RGWGetBucketInstanceInfoCR(
+ store->svc()->rados->get_async_processor(),
+ store, clean_info->first.bucket,
+ &(clean_info->first), nullptr, dpp));
+ if (retcode < 0) {
+ ldpp_dout(dpp, 0) << "failed to get bucket info: "
+ << cpp_strerror(retcode) << dendl;
+ return set_cr_error(retcode);
+ }
+ if (clean_info->first.layout.logs.front().gen ==
+ clean_info->second.gen) {
+ clean_info->first.layout.logs.erase(
+ clean_info->first.layout.logs.begin());
+ ++retries;
+ continue;
+ }
+ // Raced, but someone else did what we needed to.
+ retcode = 0;
+ }
+
+ if (retcode < 0) {
+ ldpp_dout(dpp, 0) << "failed to put bucket info: "
+ << cpp_strerror(retcode) << dendl;
+ return set_cr_error(retcode);
+ }
+ clean_info = std::nullopt;
+ }
+ } else {
+ if (totrim.layout.type != rgw::BucketLogType::InIndex) {
+ ldpp_dout(dpp, 0) << "Unable to convert log of unknown type "
+ << totrim.layout.type
+ << " to rgw::bucket_index_layout_generation " << dendl;
+ return set_cr_error(-EINVAL);
+ }
+ // To avoid hammering the OSD too hard, either trim old
+ // generations OR trim the current one.
+
+ // determine the minimum marker for each shard
+
+ // initialize each shard with the maximum marker, which is only used when
+ // there are no peers syncing from us
+ min_markers.assign(std::max(1u, rgw::num_shards(totrim.layout.in_index)),
+ RGWSyncLogTrimCR::max_marker);
+
+
+ retcode = take_min_status(cct, totrim.gen, peer_status.cbegin(),
+ peer_status.cend(), &min_markers);
+ if (retcode < 0) {
+ ldpp_dout(dpp, 4) << "failed to correlate bucket sync status from peers" << dendl;
+ return set_cr_error(retcode);
+ }
+
+ // trim shards with a ShardCollectCR
+ ldpp_dout(dpp, 10) << "trimming bilogs for bucket=" << pbucket_info->bucket
+ << " markers=" << min_markers << ", shards=" << min_markers.size() << dendl;
+ set_status("trimming bilog shards");
+ yield call(new BucketTrimShardCollectCR(dpp, store, *pbucket_info, totrim.layout.in_index,
+ min_markers));
+ // ENODATA just means there were no keys to trim
+ if (retcode == -ENODATA) {
+ retcode = 0;
+ }
+ if (retcode < 0) {
+ ldpp_dout(dpp, 4) << "failed to trim bilog shards: "
+ << cpp_strerror(retcode) << dendl;
+ return set_cr_error(retcode);
+ }
+ }
+
+ observer->on_bucket_trimmed(std::move(bucket_instance));
+ return set_cr_done();
+ }
+ return 0;
+}
+
+/// trim each bucket instance while limiting the number of concurrent operations
+class BucketTrimInstanceCollectCR : public RGWShardCollectCR {
+ rgw::sal::RadosStore* const store;
+ RGWHTTPManager *const http;
+ BucketTrimObserver *const observer;
+ std::vector<std::string>::const_iterator bucket;
+ std::vector<std::string>::const_iterator end;
+ const DoutPrefixProvider *dpp;
+
+ int handle_result(int r) override {
+ if (r == -ENOENT) { // ENOENT is not a fatal error
+ return 0;
+ }
+ if (r < 0) {
+ ldout(cct, 4) << "failed to trim bucket instance: " << cpp_strerror(r) << dendl;
+ }
+ return r;
+ }
+ public:
+ BucketTrimInstanceCollectCR(rgw::sal::RadosStore* store, RGWHTTPManager *http,
+ BucketTrimObserver *observer,
+ const std::vector<std::string>& buckets,
+ int max_concurrent,
+ const DoutPrefixProvider *dpp)
+ : RGWShardCollectCR(store->ctx(), max_concurrent),
+ store(store), http(http), observer(observer),
+ bucket(buckets.begin()), end(buckets.end()),
+ dpp(dpp)
+ {}
+ bool spawn_next() override;
+};
+
+bool BucketTrimInstanceCollectCR::spawn_next()
+{
+ if (bucket == end) {
+ return false;
+ }
+ spawn(new BucketTrimInstanceCR(store, http, observer, *bucket, dpp), false);
+ ++bucket;
+ return true;
+}
+
+/// correlate the replies from each peer gateway into the given counter
+int accumulate_peer_counters(bufferlist& bl, BucketChangeCounter& counter)
+{
+ counter.clear();
+
+ try {
+ // decode notify responses
+ auto p = bl.cbegin();
+ std::map<std::pair<uint64_t, uint64_t>, bufferlist> replies;
+ std::set<std::pair<uint64_t, uint64_t>> timeouts;
+ decode(replies, p);
+ decode(timeouts, p);
+
+ for (auto& peer : replies) {
+ auto q = peer.second.cbegin();
+ TrimCounters::Response response;
+ decode(response, q);
+ for (const auto& b : response.bucket_counters) {
+ counter.insert(b.bucket, b.count);
+ }
+ }
+ } catch (const buffer::error& e) {
+ return -EIO;
+ }
+ return 0;
+}
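+
+// The bufferlist decoded above is the raw librados notify completion payload:
+// a map of reply bufferlists keyed by (notifier id, cookie) for each gateway
+// that acked, followed by a set of (notifier id, cookie) pairs that timed out.
+// Timed-out peers are simply ignored here; their buckets never reach the
+// counter.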
+
+/// metadata callback has the signature bool(string&& key, string&& marker)
+using MetadataListCallback = std::function<bool(std::string&&, std::string&&)>;
+
+/// lists metadata keys, passing each to a callback until it returns false.
+/// on reaching the end, it will restart at the beginning and list up to the
+/// initial marker
+class AsyncMetadataList : public RGWAsyncRadosRequest {
+ CephContext *const cct;
+ RGWMetadataManager *const mgr;
+ const std::string section;
+ const std::string start_marker;
+ MetadataListCallback callback;
+
+ int _send_request(const DoutPrefixProvider *dpp) override;
+ public:
+ AsyncMetadataList(CephContext *cct, RGWCoroutine *caller,
+ RGWAioCompletionNotifier *cn, RGWMetadataManager *mgr,
+ const std::string& section, const std::string& start_marker,
+ const MetadataListCallback& callback)
+ : RGWAsyncRadosRequest(caller, cn), cct(cct), mgr(mgr),
+ section(section), start_marker(start_marker), callback(callback)
+ {}
+};
+
+int AsyncMetadataList::_send_request(const DoutPrefixProvider *dpp)
+{
+ void* handle = nullptr;
+ std::list<std::string> keys;
+ bool truncated{false};
+ std::string marker;
+
+ // start a listing at the given marker
+ int r = mgr->list_keys_init(dpp, section, start_marker, &handle);
+ if (r == -EINVAL) {
+ // restart with empty marker below
+ } else if (r < 0) {
+ ldpp_dout(dpp, 10) << "failed to init metadata listing: "
+ << cpp_strerror(r) << dendl;
+ return r;
+ } else {
+ ldpp_dout(dpp, 20) << "starting metadata listing at " << start_marker << dendl;
+
+ // release the handle when scope exits
+ auto g = make_scope_guard([=, this] { mgr->list_keys_complete(handle); });
+
+ do {
+ // get the next key and marker
+ r = mgr->list_keys_next(dpp, handle, 1, keys, &truncated);
+ if (r < 0) {
+ ldpp_dout(dpp, 10) << "failed to list metadata: "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+ marker = mgr->get_marker(handle);
+
+ if (!keys.empty()) {
+ ceph_assert(keys.size() == 1);
+ auto& key = keys.front();
+ if (!callback(std::move(key), std::move(marker))) {
+ return 0;
+ }
+ }
+ } while (truncated);
+
+ if (start_marker.empty()) {
+ // already listed all keys
+ return 0;
+ }
+ }
+
+ // restart the listing from the beginning (empty marker)
+ handle = nullptr;
+
+ r = mgr->list_keys_init(dpp, section, "", &handle);
+ if (r < 0) {
+ ldpp_dout(dpp, 10) << "failed to restart metadata listing: "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+ ldpp_dout(dpp, 20) << "restarting metadata listing" << dendl;
+
+ // release the handle when scope exits
+ auto g = make_scope_guard([=, this] { mgr->list_keys_complete(handle); });
+ do {
+ // get the next key and marker
+ r = mgr->list_keys_next(dpp, handle, 1, keys, &truncated);
+ if (r < 0) {
+ ldpp_dout(dpp, 10) << "failed to list metadata: "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+ marker = mgr->get_marker(handle);
+
+ if (!keys.empty()) {
+ ceph_assert(keys.size() == 1);
+ auto& key = keys.front();
+ // stop at original marker
+ if (marker > start_marker) {
+ return 0;
+ }
+ if (!callback(std::move(key), std::move(marker))) {
+ return 0;
+ }
+ }
+ } while (truncated);
+
+ return 0;
+}
+
+/// coroutine wrapper for AsyncMetadataList
+class MetadataListCR : public RGWSimpleCoroutine {
+ RGWAsyncRadosProcessor *const async_rados;
+ RGWMetadataManager *const mgr;
+ const std::string& section;
+ const std::string& start_marker;
+ MetadataListCallback callback;
+ RGWAsyncRadosRequest *req{nullptr};
+ public:
+ MetadataListCR(CephContext *cct, RGWAsyncRadosProcessor *async_rados,
+ RGWMetadataManager *mgr, const std::string& section,
+ const std::string& start_marker,
+ const MetadataListCallback& callback)
+ : RGWSimpleCoroutine(cct), async_rados(async_rados), mgr(mgr),
+ section(section), start_marker(start_marker), callback(callback)
+ {}
+ ~MetadataListCR() override {
+ request_cleanup();
+ }
+
+ int send_request(const DoutPrefixProvider *dpp) override {
+ req = new AsyncMetadataList(cct, this, stack->create_completion_notifier(),
+ mgr, section, start_marker, callback);
+ async_rados->queue(req);
+ return 0;
+ }
+ int request_complete() override {
+ return req->get_ret_status();
+ }
+ void request_cleanup() override {
+ if (req) {
+ req->finish();
+ req = nullptr;
+ }
+ }
+};
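+
+// MetadataListCR drives AsyncMetadataList (above) on the async_rados thread
+// pool. The callback receives each metadata key together with its list
+// marker, and listing continues while it returns true; with a non-empty
+// start_marker the listing wraps around past the end and stops again at
+// start_marker, so each key is visited at most once per request. A minimal
+// callback sketch (illustrative only):
+//
+//   MetadataListCallback cb = [] (std::string&& key, std::string&& marker) {
+//     return true; // keep listing
+//   };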
+
+class BucketTrimCR : public RGWCoroutine {
+ rgw::sal::RadosStore* const store;
+ RGWHTTPManager *const http;
+ const BucketTrimConfig& config;
+ BucketTrimObserver *const observer;
+ const rgw_raw_obj& obj;
+ ceph::mono_time start_time;
+ bufferlist notify_replies;
+ BucketChangeCounter counter;
+ std::vector<std::string> buckets; //< buckets selected for trim
+ BucketTrimStatus status;
+ RGWObjVersionTracker objv; //< version tracker for trim status object
+ std::string last_cold_marker; //< position for next trim marker
+ const DoutPrefixProvider *dpp;
+
+ static const std::string section; //< metadata section for bucket instances
+ public:
+ BucketTrimCR(rgw::sal::RadosStore* store, RGWHTTPManager *http,
+ const BucketTrimConfig& config, BucketTrimObserver *observer,
+ const rgw_raw_obj& obj, const DoutPrefixProvider *dpp)
+ : RGWCoroutine(store->ctx()), store(store), http(http), config(config),
+ observer(observer), obj(obj), counter(config.counter_size), dpp(dpp)
+ {}
+
+ int operate(const DoutPrefixProvider *dpp) override;
+};
+
+const std::string BucketTrimCR::section{"bucket.instance"};
+
+int BucketTrimCR::operate(const DoutPrefixProvider *dpp)
+{
+ reenter(this) {
+ start_time = ceph::mono_clock::now();
+
+ if (config.buckets_per_interval) {
+ // query watch/notify for hot buckets
+ ldpp_dout(dpp, 10) << "fetching active bucket counters" << dendl;
+ set_status("fetching active bucket counters");
+ yield {
+ // request the top bucket counters from each peer gateway
+ const TrimNotifyType type = NotifyTrimCounters;
+ TrimCounters::Request request{32};
+ bufferlist bl;
+ encode(type, bl);
+ encode(request, bl);
+ call(new RGWRadosNotifyCR(store, obj, bl, config.notify_timeout_ms,
+ &notify_replies));
+ }
+ if (retcode < 0) {
+ ldpp_dout(dpp, 10) << "failed to fetch peer bucket counters" << dendl;
+ return set_cr_error(retcode);
+ }
+
+ // select the hottest buckets for trim
+ retcode = accumulate_peer_counters(notify_replies, counter);
+ if (retcode < 0) {
+ ldout(cct, 4) << "failed to correlate peer bucket counters" << dendl;
+ return set_cr_error(retcode);
+ }
+ buckets.reserve(config.buckets_per_interval);
+
+ const int max_count = config.buckets_per_interval -
+ config.min_cold_buckets_per_interval;
+ counter.get_highest(max_count,
+ [this] (const std::string& bucket, int count) {
+ buckets.push_back(bucket);
+ });
+ }
+
+ if (buckets.size() < config.buckets_per_interval) {
+ // read BucketTrimStatus for marker position
+ set_status("reading trim status");
+ using ReadStatus = RGWSimpleRadosReadCR<BucketTrimStatus>;
+ yield call(new ReadStatus(dpp, store, obj, &status, true, &objv));
+ if (retcode < 0) {
+ ldpp_dout(dpp, 10) << "failed to read bilog trim status: "
+ << cpp_strerror(retcode) << dendl;
+ return set_cr_error(retcode);
+ }
+ if (status.marker == "MAX") {
+ status.marker.clear(); // restart at the beginning
+ }
+ ldpp_dout(dpp, 10) << "listing cold buckets from marker="
+ << status.marker << dendl;
+
+ set_status("listing cold buckets for trim");
+ yield {
+ // capture a reference so 'this' remains valid in the callback
+ auto ref = boost::intrusive_ptr<RGWCoroutine>{this};
+ // list cold buckets to consider for trim
+ auto cb = [this, ref] (std::string&& bucket, std::string&& marker) {
+ // filter out keys that we trimmed recently
+ if (observer->trimmed_recently(bucket)) {
+ return true;
+ }
+ // filter out active buckets that we've already selected
+ auto i = std::find(buckets.begin(), buckets.end(), bucket);
+ if (i != buckets.end()) {
+ return true;
+ }
+ buckets.emplace_back(std::move(bucket));
+ // remember the last cold bucket spawned to update the status marker
+ last_cold_marker = std::move(marker);
+ // return true if there's room for more
+ return buckets.size() < config.buckets_per_interval;
+ };
+
+ call(new MetadataListCR(cct, store->svc()->rados->get_async_processor(),
+ store->ctl()->meta.mgr,
+ section, status.marker, cb));
+ }
+ if (retcode < 0) {
+ ldout(cct, 4) << "failed to list bucket instance metadata: "
+ << cpp_strerror(retcode) << dendl;
+ return set_cr_error(retcode);
+ }
+ }
+
+ // trim bucket instances with limited concurrency
+ set_status("trimming buckets");
+ ldpp_dout(dpp, 4) << "collected " << buckets.size() << " buckets for trim" << dendl;
+ yield call(new BucketTrimInstanceCollectCR(store, http, observer, buckets,
+ config.concurrent_buckets, dpp));
+ // ignore errors from individual buckets
+
+ // write updated trim status
+ if (!last_cold_marker.empty() && status.marker != last_cold_marker) {
+ set_status("writing updated trim status");
+ status.marker = std::move(last_cold_marker);
+ ldpp_dout(dpp, 20) << "writing bucket trim marker=" << status.marker << dendl;
+ using WriteStatus = RGWSimpleRadosWriteCR<BucketTrimStatus>;
+ yield call(new WriteStatus(dpp, store, obj, status, &objv));
+ if (retcode < 0) {
+ ldpp_dout(dpp, 4) << "failed to write updated trim status: "
+ << cpp_strerror(retcode) << dendl;
+ return set_cr_error(retcode);
+ }
+ }
+
+ // notify peers that trim completed
+ set_status("trim completed");
+ yield {
+ const TrimNotifyType type = NotifyTrimComplete;
+ TrimComplete::Request request;
+ bufferlist bl;
+ encode(type, bl);
+ encode(request, bl);
+ call(new RGWRadosNotifyCR(store, obj, bl, config.notify_timeout_ms,
+ nullptr));
+ }
+ if (retcode < 0) {
+ ldout(cct, 10) << "failed to notify peers of trim completion" << dendl;
+ return set_cr_error(retcode);
+ }
+
+ ldpp_dout(dpp, 4) << "bucket index log processing completed in "
+ << ceph::mono_clock::now() - start_time << dendl;
+ return set_cr_done();
+ }
+ return 0;
+}
+
+class BucketTrimPollCR : public RGWCoroutine {
+ rgw::sal::RadosStore* const store;
+ RGWHTTPManager *const http;
+ const BucketTrimConfig& config;
+ BucketTrimObserver *const observer;
+ const rgw_raw_obj& obj;
+ const std::string name{"trim"}; //< lock name
+ const std::string cookie;
+ const DoutPrefixProvider *dpp;
+
+ public:
+ BucketTrimPollCR(rgw::sal::RadosStore* store, RGWHTTPManager *http,
+ const BucketTrimConfig& config,
+ BucketTrimObserver *observer, const rgw_raw_obj& obj,
+ const DoutPrefixProvider *dpp)
+ : RGWCoroutine(store->ctx()), store(store), http(http),
+ config(config), observer(observer), obj(obj),
+ cookie(RGWSimpleRadosLockCR::gen_random_cookie(cct)),
+ dpp(dpp) {}
+
+ int operate(const DoutPrefixProvider *dpp) override;
+};
+
+int BucketTrimPollCR::operate(const DoutPrefixProvider *dpp)
+{
+ reenter(this) {
+ for (;;) {
+ set_status("sleeping");
+ wait(utime_t{static_cast<time_t>(config.trim_interval_sec), 0});
+
+ // prevent others from trimming for our entire wait interval
+ set_status("acquiring trim lock");
+ yield call(new RGWSimpleRadosLockCR(store->svc()->rados->get_async_processor(), store,
+ obj, name, cookie,
+ config.trim_interval_sec));
+ if (retcode < 0) {
+ ldout(cct, 4) << "failed to lock: " << cpp_strerror(retcode) << dendl;
+ continue;
+ }
+
+ set_status("trimming");
+ yield call(new BucketTrimCR(store, http, config, observer, obj, dpp));
+ if (retcode < 0) {
+ // on errors, unlock so other gateways can try
+ set_status("unlocking");
+ yield call(new RGWSimpleRadosUnlockCR(store->svc()->rados->get_async_processor(), store,
+ obj, name, cookie));
+ }
+ }
+ }
+ return 0;
+}
+
+/// tracks a bounded list of events with timestamps. old events can be expired,
+/// and recent events can be searched by key. expiration depends on events being
+/// inserted in temporal order
+template <typename T, typename Clock = ceph::coarse_mono_clock>
+class RecentEventList {
+ public:
+ using clock_type = Clock;
+ using time_point = typename clock_type::time_point;
+
+ RecentEventList(size_t max_size, const ceph::timespan& max_duration)
+ : events(max_size), max_duration(max_duration)
+ {}
+
+ /// insert an event at the given point in time. this time must be at least as
+ /// recent as the last inserted event
+ void insert(T&& value, const time_point& now) {
+ // ceph_assert(events.empty() || now >= events.back().time)
+ events.push_back(Event{std::move(value), now});
+ }
+
+ /// performs a linear search for an event matching the given key, whose type
+ /// U can be any that provides operator==(U, T)
+ template <typename U>
+ bool lookup(const U& key) const {
+ for (const auto& event : events) {
+ if (key == event.value) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ /// remove events that are no longer recent compared to the given point in time
+ void expire_old(const time_point& now) {
+ const auto expired_before = now - max_duration;
+ while (!events.empty() && events.front().time < expired_before) {
+ events.pop_front();
+ }
+ }
+
+ private:
+ struct Event {
+ T value;
+ time_point time;
+ };
+ boost::circular_buffer<Event> events;
+ const ceph::timespan max_duration;
+};
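+
+// Minimal usage sketch for RecentEventList (the size and duration mirror the
+// values chosen in configure_bucket_trim() below; the bucket key shown is
+// illustrative). The trim manager uses it to remember recently trimmed bucket
+// instances so they can be skipped when selecting trim candidates:
+//
+//   RecentEventList<std::string> recent(128, std::chrono::hours(2));
+//   recent.insert("some-bucket-instance",
+//                 RecentEventList<std::string>::clock_type::now());
+//   bool hit = recent.lookup("some-bucket-instance"); // linear search by key
+//   recent.expire_old(RecentEventList<std::string>::clock_type::now());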
+
+namespace rgw {
+
+// read bucket trim configuration from ceph context
+void configure_bucket_trim(CephContext *cct, BucketTrimConfig& config)
+{
+ const auto& conf = cct->_conf;
+
+ config.trim_interval_sec =
+ conf.get_val<int64_t>("rgw_sync_log_trim_interval");
+ config.counter_size = 512;
+ config.buckets_per_interval =
+ conf.get_val<int64_t>("rgw_sync_log_trim_max_buckets");
+ config.min_cold_buckets_per_interval =
+ conf.get_val<int64_t>("rgw_sync_log_trim_min_cold_buckets");
+ config.concurrent_buckets =
+ conf.get_val<int64_t>("rgw_sync_log_trim_concurrent_buckets");
+ config.notify_timeout_ms = 10000;
+ config.recent_size = 128;
+ config.recent_duration = std::chrono::hours(2);
+}
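+
+// Only the trim interval and the bucket-count limits come from configuration
+// options; counter_size, notify_timeout_ms, recent_size and recent_duration
+// are hard-coded here rather than read from the configuration.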
+
+class BucketTrimManager::Impl : public TrimCounters::Server,
+ public BucketTrimObserver {
+ public:
+ rgw::sal::RadosStore* const store;
+ const BucketTrimConfig config;
+
+ const rgw_raw_obj status_obj;
+
+ /// count frequency of bucket instance entries in the data changes log
+ BucketChangeCounter counter;
+
+ using RecentlyTrimmedBucketList = RecentEventList<std::string>;
+ using clock_type = RecentlyTrimmedBucketList::clock_type;
+ /// track recently trimmed buckets to focus trim activity elsewhere
+ RecentlyTrimmedBucketList trimmed;
+
+ /// serve the bucket trim watch/notify api
+ BucketTrimWatcher watcher;
+
+ /// protect data shared between data sync, trim, and watch/notify threads
+ std::mutex mutex;
+
+ Impl(rgw::sal::RadosStore* store, const BucketTrimConfig& config)
+ : store(store), config(config),
+ status_obj(store->svc()->zone->get_zone_params().log_pool, BucketTrimStatus::oid),
+ counter(config.counter_size),
+ trimmed(config.recent_size, config.recent_duration),
+ watcher(store, status_obj, this)
+ {}
+
+ /// TrimCounters::Server interface for watch/notify api
+ void get_bucket_counters(int count, TrimCounters::Vector& buckets) {
+ buckets.reserve(count);
+ std::lock_guard<std::mutex> lock(mutex);
+ counter.get_highest(count, [&buckets] (const std::string& key, int count) {
+ buckets.emplace_back(key, count);
+ });
+ ldout(store->ctx(), 20) << "get_bucket_counters: " << buckets << dendl;
+ }
+
+ void reset_bucket_counters() override {
+ ldout(store->ctx(), 20) << "bucket trim completed" << dendl;
+ std::lock_guard<std::mutex> lock(mutex);
+ counter.clear();
+ trimmed.expire_old(clock_type::now());
+ }
+
+ /// BucketTrimObserver interface to remember successfully-trimmed buckets
+ void on_bucket_trimmed(std::string&& bucket_instance) override {
+ ldout(store->ctx(), 20) << "trimmed bucket instance " << bucket_instance << dendl;
+ std::lock_guard<std::mutex> lock(mutex);
+ trimmed.insert(std::move(bucket_instance), clock_type::now());
+ }
+
+ bool trimmed_recently(const std::string_view& bucket_instance) override {
+ std::lock_guard<std::mutex> lock(mutex);
+ return trimmed.lookup(bucket_instance);
+ }
+};
+
+BucketTrimManager::BucketTrimManager(rgw::sal::RadosStore* store,
+ const BucketTrimConfig& config)
+ : impl(new Impl(store, config))
+{
+}
+BucketTrimManager::~BucketTrimManager() = default;
+
+int BucketTrimManager::init()
+{
+ return impl->watcher.start(this);
+}
+
+void BucketTrimManager::on_bucket_changed(const std::string_view& bucket)
+{
+ std::lock_guard<std::mutex> lock(impl->mutex);
+ // filter recently trimmed bucket instances out of bucket change counter
+ if (impl->trimmed.lookup(bucket)) {
+ return;
+ }
+ impl->counter.insert(std::string(bucket));
+}
+
+RGWCoroutine* BucketTrimManager::create_bucket_trim_cr(RGWHTTPManager *http)
+{
+ return new BucketTrimPollCR(impl->store, http, impl->config,
+ impl.get(), impl->status_obj, this);
+}
+
+RGWCoroutine* BucketTrimManager::create_admin_bucket_trim_cr(RGWHTTPManager *http)
+{
+ // return the trim coroutine without any polling
+ return new BucketTrimCR(impl->store, http, impl->config,
+ impl.get(), impl->status_obj, this);
+}
+
+CephContext* BucketTrimManager::get_cct() const
+{
+ return impl->store->ctx();
+}
+
+unsigned BucketTrimManager::get_subsys() const
+{
+ return dout_subsys;
+}
+
+std::ostream& BucketTrimManager::gen_prefix(std::ostream& out) const
+{
+ return out << "rgw bucket trim manager: ";
+}
+
+} // namespace rgw
+
+int bilog_trim(const DoutPrefixProvider* p, rgw::sal::RadosStore* store,
+ RGWBucketInfo& bucket_info, uint64_t gen, int shard_id,
+ std::string_view start_marker, std::string_view end_marker)
+{
+ auto& logs = bucket_info.layout.logs;
+ auto log = std::find_if(logs.begin(), logs.end(), rgw::matches_gen(gen));
+ if (log == logs.end()) {
+ ldpp_dout(p, 5) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << "ERROR: no log layout with gen=" << gen << dendl;
+ return -ENOENT;
+ }
+
+ auto log_layout = *log;
+
+ auto r = store->svc()->bilog_rados->log_trim(p, bucket_info, log_layout, shard_id, start_marker, end_marker);
+ if (r < 0) {
+ ldpp_dout(p, 5) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << "ERROR: bilog_rados->log_trim returned r=" << r << dendl;
+ }
+ return r;
+}
diff --git a/src/rgw/driver/rados/rgw_trim_bilog.h b/src/rgw/driver/rados/rgw_trim_bilog.h
new file mode 100644
index 000000000..6a11d2476
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_trim_bilog.h
@@ -0,0 +1,121 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 Red Hat, Inc
+ *
+ * Author: Casey Bodley <cbodley@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+#pragma once
+
+#include <memory>
+#include <string_view>
+
+#include "include/common_fwd.h"
+#include "include/encoding.h"
+#include "common/ceph_time.h"
+#include "common/dout.h"
+#include "rgw_common.h"
+
+class RGWCoroutine;
+class RGWHTTPManager;
+
+namespace rgw {
+
+namespace sal {
+ class RadosStore;
+}
+
+/// Interface to inform the trim process about which buckets are most active
+struct BucketChangeObserver {
+ virtual ~BucketChangeObserver() = default;
+
+ virtual void on_bucket_changed(const std::string_view& bucket_instance) = 0;
+};
+
+/// Configuration for BucketTrimManager
+struct BucketTrimConfig {
+ /// time interval in seconds between bucket trim attempts
+ uint32_t trim_interval_sec{0};
+ /// maximum number of buckets to track with BucketChangeObserver
+ size_t counter_size{0};
+ /// maximum number of buckets to process each trim interval
+ uint32_t buckets_per_interval{0};
+ /// minimum number of buckets to choose from the global bucket instance list
+ uint32_t min_cold_buckets_per_interval{0};
+ /// maximum number of buckets to process in parallel
+ uint32_t concurrent_buckets{0};
+ /// timeout in ms for bucket trim notify replies
+ uint64_t notify_timeout_ms{0};
+ /// maximum number of recently trimmed buckets to remember (should be small
+ /// enough for a linear search)
+ size_t recent_size{0};
+ /// maximum duration to consider a trim as 'recent' (should be some multiple
+ /// of the trim interval, at least)
+ ceph::timespan recent_duration{0};
+};
+
+/// fill out the BucketTrimConfig from the ceph context
+void configure_bucket_trim(CephContext *cct, BucketTrimConfig& config);
+
+/// Determines the buckets on which to focus trim activity, using two sources of
+/// input: the frequency of entries read from the data changes log, and a global
+/// listing of the bucket.instance metadata. This allows us to trim active
+/// buckets quickly, while also ensuring that all buckets are eventually trimmed.
+class BucketTrimManager : public BucketChangeObserver, public DoutPrefixProvider {
+ class Impl;
+ std::unique_ptr<Impl> impl;
+ public:
+ BucketTrimManager(sal::RadosStore *store, const BucketTrimConfig& config);
+ ~BucketTrimManager();
+
+ int init();
+
+ /// increment a counter for the given bucket instance
+ void on_bucket_changed(const std::string_view& bucket_instance) override;
+
+ /// create a coroutine to run the bucket trim process every trim interval
+ RGWCoroutine* create_bucket_trim_cr(RGWHTTPManager *http);
+
+ /// create a coroutine to trim buckets directly via radosgw-admin
+ RGWCoroutine* create_admin_bucket_trim_cr(RGWHTTPManager *http);
+
+ CephContext *get_cct() const override;
+ unsigned get_subsys() const;
+ std::ostream& gen_prefix(std::ostream& out) const;
+};
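+
+// Rough lifecycle sketch (not a verbatim call site): construct the manager
+// with a filled-in BucketTrimConfig, call init() to start the watch/notify
+// listener, feed it bucket activity through on_bucket_changed(), and run the
+// coroutine returned by create_bucket_trim_cr() (or
+// create_admin_bucket_trim_cr() for a one-shot trim from radosgw-admin) in a
+// coroutine manager alongside an RGWHTTPManager.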
+
+/// provides persistent storage for the trim manager's current position in the
+/// list of bucket instance metadata
+struct BucketTrimStatus {
+ std::string marker; //< metadata key of current bucket instance
+
+ void encode(bufferlist& bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(marker, bl);
+ ENCODE_FINISH(bl);
+ }
+ void decode(bufferlist::const_iterator& p) {
+ DECODE_START(1, p);
+ decode(marker, p);
+ DECODE_FINISH(p);
+ }
+
+ static const std::string oid;
+};
+
+} // namespace rgw
+
+WRITE_CLASS_ENCODER(rgw::BucketTrimStatus);
+
+int bilog_trim(const DoutPrefixProvider* p, rgw::sal::RadosStore* store,
+ RGWBucketInfo& bucket_info, uint64_t gen, int shard_id,
+ std::string_view start_marker, std::string_view end_marker);
diff --git a/src/rgw/driver/rados/rgw_trim_datalog.cc b/src/rgw/driver/rados/rgw_trim_datalog.cc
new file mode 100644
index 000000000..72a160039
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_trim_datalog.cc
@@ -0,0 +1,252 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include <vector>
+#include <string>
+
+#include "common/errno.h"
+
+#include "rgw_trim_datalog.h"
+#include "rgw_cr_rados.h"
+#include "rgw_cr_rest.h"
+#include "rgw_datalog.h"
+#include "rgw_data_sync.h"
+#include "rgw_zone.h"
+#include "rgw_bucket.h"
+
+#include "services/svc_zone.h"
+
+#include <boost/asio/yield.hpp>
+
+#define dout_subsys ceph_subsys_rgw
+
+#undef dout_prefix
+#define dout_prefix (*_dout << "data trim: ")
+
+namespace {
+
+class DatalogTrimImplCR : public RGWSimpleCoroutine {
+ const DoutPrefixProvider *dpp;
+ rgw::sal::RadosStore* store;
+ boost::intrusive_ptr<RGWAioCompletionNotifier> cn;
+ int shard;
+ std::string marker;
+ std::string* last_trim_marker;
+
+ public:
+ DatalogTrimImplCR(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* store, int shard,
+ const std::string& marker, std::string* last_trim_marker)
+ : RGWSimpleCoroutine(store->ctx()), dpp(dpp), store(store), shard(shard),
+ marker(marker), last_trim_marker(last_trim_marker) {
+ set_description() << "Datalog trim shard=" << shard
+ << " marker=" << marker;
+ }
+
+ int send_request(const DoutPrefixProvider *dpp) override {
+ set_status() << "sending request";
+ cn = stack->create_completion_notifier();
+ return store->svc()->datalog_rados->trim_entries(dpp, shard, marker,
+ cn->completion());
+ }
+ int request_complete() override {
+ int r = cn->completion()->get_return_value();
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << "(): trim of shard=" << shard
+ << " marker=" << marker << " returned r=" << r << dendl;
+
+ set_status() << "request complete; ret=" << r;
+ if (r != -ENODATA) {
+ return r;
+ }
+ // nothing left to trim, update last_trim_marker
+ if (*last_trim_marker < marker &&
+ marker != store->svc()->datalog_rados->max_marker()) {
+ *last_trim_marker = marker;
+ }
+ return 0;
+ }
+};
+
+/// return the marker that it's safe to trim up to
+const std::string& get_stable_marker(const rgw_data_sync_marker& m)
+{
+ return m.state == m.FullSync ? m.next_step_marker : m.marker;
+}
+
+/// populate the container starting with 'dest' with the minimum stable marker
+/// of each shard for all of the peers in [first, last)
+template <typename IterIn, typename IterOut>
+void take_min_markers(IterIn first, IterIn last, IterOut dest)
+{
+ if (first == last) {
+ return;
+ }
+ for (auto p = first; p != last; ++p) {
+ auto m = dest;
+ for (auto &shard : p->sync_markers) {
+ const auto& stable = get_stable_marker(shard.second);
+ if (*m > stable) {
+ *m = stable;
+ }
+ ++m;
+ }
+ }
+}
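+
+// The caller pre-fills the destination range with the data log's maximum
+// marker (see DataLogTrimCR's min_shard_markers below); each peer then lowers
+// a shard's marker whenever its stable position is smaller, leaving the
+// minimum stable marker across all peers.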
+
+} // anonymous namespace
+
+class DataLogTrimCR : public RGWCoroutine {
+ using TrimCR = DatalogTrimImplCR;
+ const DoutPrefixProvider *dpp;
+ rgw::sal::RadosStore* store;
+ RGWHTTPManager *http;
+ const int num_shards;
+ const std::string& zone_id; //< my zone id
+ std::vector<rgw_data_sync_status> peer_status; //< sync status for each peer
+ std::vector<std::string> min_shard_markers; //< min marker per shard
+ std::vector<std::string>& last_trim; //< last trimmed marker per shard
+ int ret{0};
+
+ public:
+ DataLogTrimCR(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* store, RGWHTTPManager *http,
+ int num_shards, std::vector<std::string>& last_trim)
+ : RGWCoroutine(store->ctx()), dpp(dpp), store(store), http(http),
+ num_shards(num_shards),
+ zone_id(store->svc()->zone->get_zone().id),
+ peer_status(store->svc()->zone->get_zone_data_notify_to_map().size()),
+ min_shard_markers(num_shards,
+ std::string(store->svc()->datalog_rados->max_marker())),
+ last_trim(last_trim)
+ {}
+
+ int operate(const DoutPrefixProvider *dpp) override;
+};
+
+int DataLogTrimCR::operate(const DoutPrefixProvider *dpp)
+{
+ reenter(this) {
+ ldpp_dout(dpp, 10) << "fetching sync status for zone " << zone_id << dendl;
+ set_status("fetching sync status");
+ yield {
+ // query data sync status from each sync peer
+ rgw_http_param_pair params[] = {
+ { "type", "data" },
+ { "status", nullptr },
+ { "source-zone", zone_id.c_str() },
+ { nullptr, nullptr }
+ };
+
+ auto p = peer_status.begin();
+ for (auto& c : store->svc()->zone->get_zone_data_notify_to_map()) {
+ ldpp_dout(dpp, 20) << "query sync status from " << c.first << dendl;
+ using StatusCR = RGWReadRESTResourceCR<rgw_data_sync_status>;
+ spawn(new StatusCR(cct, c.second, http, "/admin/log/", params, &*p),
+ false);
+ ++p;
+ }
+ }
+
+ // must get a successful reply from all peers to consider trimming
+ ret = 0;
+ while (ret == 0 && num_spawned() > 0) {
+ yield wait_for_child();
+ collect_next(&ret);
+ }
+ drain_all();
+
+ if (ret < 0) {
+ ldpp_dout(dpp, 4) << "failed to fetch sync status from all peers" << dendl;
+ return set_cr_error(ret);
+ }
+
+ ldpp_dout(dpp, 10) << "trimming log shards" << dendl;
+ set_status("trimming log shards");
+ yield {
+ // determine the minimum marker for each shard
+ take_min_markers(peer_status.begin(), peer_status.end(),
+ min_shard_markers.begin());
+
+ for (int i = 0; i < num_shards; i++) {
+ const auto& m = min_shard_markers[i];
+ if (m <= last_trim[i]) {
+ continue;
+ }
+ ldpp_dout(dpp, 10) << "trimming log shard " << i
+ << " at marker=" << m
+ << " last_trim=" << last_trim[i] << dendl;
+ spawn(new TrimCR(dpp, store, i, m, &last_trim[i]),
+ true);
+ }
+ }
+ return set_cr_done();
+ }
+ return 0;
+}
+
+RGWCoroutine* create_admin_data_log_trim_cr(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* store,
+ RGWHTTPManager *http,
+ int num_shards,
+ std::vector<std::string>& markers)
+{
+ return new DataLogTrimCR(dpp, store, http, num_shards, markers);
+}
+
+class DataLogTrimPollCR : public RGWCoroutine {
+ const DoutPrefixProvider *dpp;
+ rgw::sal::RadosStore* store;
+ RGWHTTPManager *http;
+ const int num_shards;
+ const utime_t interval; //< polling interval
+ const std::string lock_oid; //< use first data log shard for lock
+ const std::string lock_cookie;
+ std::vector<std::string> last_trim; //< last trimmed marker per shard
+
+ public:
+ DataLogTrimPollCR(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* store, RGWHTTPManager *http,
+ int num_shards, utime_t interval)
+ : RGWCoroutine(store->ctx()), dpp(dpp), store(store), http(http),
+ num_shards(num_shards), interval(interval),
+ lock_oid(store->svc()->datalog_rados->get_oid(0, 0)),
+ lock_cookie(RGWSimpleRadosLockCR::gen_random_cookie(cct)),
+ last_trim(num_shards)
+ {}
+
+ int operate(const DoutPrefixProvider *dpp) override;
+};
+
+int DataLogTrimPollCR::operate(const DoutPrefixProvider *dpp)
+{
+ reenter(this) {
+ for (;;) {
+ set_status("sleeping");
+ wait(interval);
+
+ // request a 'data_trim' lock that covers the entire wait interval to
+ // prevent other gateways from attempting to trim for the duration
+ set_status("acquiring trim lock");
+ yield call(new RGWSimpleRadosLockCR(store->svc()->rados->get_async_processor(), store,
+ rgw_raw_obj(store->svc()->zone->get_zone_params().log_pool, lock_oid),
+ "data_trim", lock_cookie,
+ interval.sec()));
+ if (retcode < 0) {
+ // if the lock is already held, go back to sleep and try again later
+ ldpp_dout(dpp, 4) << "failed to lock " << lock_oid << ", trying again in "
+ << interval.sec() << "s" << dendl;
+ continue;
+ }
+
+ set_status("trimming");
+ yield call(new DataLogTrimCR(dpp, store, http, num_shards, last_trim));
+
+ // note that the lock is not released. this is intentional, as it avoids
+ // duplicating this work in other gateways
+ }
+ }
+ return 0;
+}
+
+RGWCoroutine* create_data_log_trim_cr(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* store,
+ RGWHTTPManager *http,
+ int num_shards, utime_t interval)
+{
+ return new DataLogTrimPollCR(dpp, store, http, num_shards, interval);
+}
diff --git a/src/rgw/driver/rados/rgw_trim_datalog.h b/src/rgw/driver/rados/rgw_trim_datalog.h
new file mode 100644
index 000000000..9f5bf7252
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_trim_datalog.h
@@ -0,0 +1,28 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "common/dout.h"
+
+class RGWCoroutine;
+class RGWRados;
+class RGWHTTPManager;
+class utime_t;
+namespace rgw { namespace sal {
+ class RadosStore;
+} }
+
+// DataLogTrimCR factory function
+extern RGWCoroutine* create_data_log_trim_cr(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* store,
+ RGWHTTPManager *http,
+ int num_shards, utime_t interval);
+
+// factory function for datalog trim via radosgw-admin
+RGWCoroutine* create_admin_data_log_trim_cr(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* store,
+ RGWHTTPManager *http,
+ int num_shards,
+ std::vector<std::string>& markers);
diff --git a/src/rgw/driver/rados/rgw_trim_mdlog.cc b/src/rgw/driver/rados/rgw_trim_mdlog.cc
new file mode 100644
index 000000000..d8e19594a
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_trim_mdlog.cc
@@ -0,0 +1,795 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "common/errno.h"
+
+#include "rgw_trim_mdlog.h"
+#include "rgw_sync.h"
+#include "rgw_cr_rados.h"
+#include "rgw_cr_rest.h"
+#include "rgw_zone.h"
+#include "services/svc_zone.h"
+#include "services/svc_meta.h"
+#include "services/svc_mdlog.h"
+#include "services/svc_cls.h"
+
+#include <boost/asio/yield.hpp>
+
+#define dout_subsys ceph_subsys_rgw
+
+#undef dout_prefix
+#define dout_prefix (*_dout << "meta trim: ")
+
+/// purge all log shards for the given mdlog
+class PurgeLogShardsCR : public RGWShardCollectCR {
+ rgw::sal::RadosStore* const store;
+ const RGWMetadataLog* mdlog;
+ const int num_shards;
+ rgw_raw_obj obj;
+ int i{0};
+
+ static constexpr int max_concurrent = 16;
+
+ int handle_result(int r) override {
+ if (r == -ENOENT) { // ENOENT is not a fatal error
+ return 0;
+ }
+ if (r < 0) {
+ ldout(cct, 4) << "failed to remove mdlog shard: " << cpp_strerror(r) << dendl;
+ }
+ return r;
+ }
+ public:
+ PurgeLogShardsCR(rgw::sal::RadosStore* store, const RGWMetadataLog* mdlog,
+ const rgw_pool& pool, int num_shards)
+ : RGWShardCollectCR(store->ctx(), max_concurrent),
+ store(store), mdlog(mdlog), num_shards(num_shards), obj(pool, "")
+ {}
+
+ bool spawn_next() override {
+ if (i == num_shards) {
+ return false;
+ }
+ mdlog->get_shard_oid(i++, obj.oid);
+ spawn(new RGWRadosRemoveCR(store, obj), false);
+ return true;
+ }
+};
+
+using Cursor = RGWPeriodHistory::Cursor;
+
+/// purge mdlogs from the oldest up to (but not including) the given realm_epoch
+class PurgePeriodLogsCR : public RGWCoroutine {
+ struct Svc {
+ RGWSI_Zone *zone;
+ RGWSI_MDLog *mdlog;
+ } svc;
+ const DoutPrefixProvider *dpp;
+ rgw::sal::RadosStore* const store;
+ RGWMetadataManager *const metadata;
+ RGWObjVersionTracker objv;
+ Cursor cursor;
+ epoch_t realm_epoch;
+ epoch_t *last_trim_epoch; //< update last trim on success
+
+ public:
+ PurgePeriodLogsCR(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* store, epoch_t realm_epoch, epoch_t *last_trim)
+ : RGWCoroutine(store->ctx()), dpp(dpp), store(store), metadata(store->ctl()->meta.mgr),
+ realm_epoch(realm_epoch), last_trim_epoch(last_trim) {
+ svc.zone = store->svc()->zone;
+ svc.mdlog = store->svc()->mdlog;
+ }
+
+ int operate(const DoutPrefixProvider *dpp) override;
+};
+
+int PurgePeriodLogsCR::operate(const DoutPrefixProvider *dpp)
+{
+ reenter(this) {
+ // read our current oldest log period
+ yield call(svc.mdlog->read_oldest_log_period_cr(dpp, &cursor, &objv));
+ if (retcode < 0) {
+ return set_cr_error(retcode);
+ }
+ ceph_assert(cursor);
+ ldpp_dout(dpp, 20) << "oldest log realm_epoch=" << cursor.get_epoch()
+ << " period=" << cursor.get_period().get_id() << dendl;
+
+ // trim -up to- the given realm_epoch
+ while (cursor.get_epoch() < realm_epoch) {
+ ldpp_dout(dpp, 4) << "purging log shards for realm_epoch=" << cursor.get_epoch()
+ << " period=" << cursor.get_period().get_id() << dendl;
+ yield {
+ const auto mdlog = svc.mdlog->get_log(cursor.get_period().get_id());
+ const auto& pool = svc.zone->get_zone_params().log_pool;
+ auto num_shards = cct->_conf->rgw_md_log_max_shards;
+ call(new PurgeLogShardsCR(store, mdlog, pool, num_shards));
+ }
+ if (retcode < 0) {
+ ldpp_dout(dpp, 1) << "failed to remove log shards: "
+ << cpp_strerror(retcode) << dendl;
+ return set_cr_error(retcode);
+ }
+ ldpp_dout(dpp, 10) << "removed log shards for realm_epoch=" << cursor.get_epoch()
+ << " period=" << cursor.get_period().get_id() << dendl;
+
+ // update our mdlog history
+ yield call(svc.mdlog->trim_log_period_cr(dpp, cursor, &objv));
+ if (retcode == -ENOENT) {
+ // must have raced to update mdlog history. return success and allow the
+ // winner to continue purging
+ ldpp_dout(dpp, 10) << "already removed log shards for realm_epoch=" << cursor.get_epoch()
+ << " period=" << cursor.get_period().get_id() << dendl;
+ return set_cr_done();
+ } else if (retcode < 0) {
+ ldpp_dout(dpp, 1) << "failed to remove log shards for realm_epoch="
+ << cursor.get_epoch() << " period=" << cursor.get_period().get_id()
+ << " with: " << cpp_strerror(retcode) << dendl;
+ return set_cr_error(retcode);
+ }
+
+ if (*last_trim_epoch < cursor.get_epoch()) {
+ *last_trim_epoch = cursor.get_epoch();
+ }
+
+ ceph_assert(cursor.has_next()); // get_current() should always come after
+ cursor.next();
+ }
+ return set_cr_done();
+ }
+ return 0;
+}
+
+namespace {
+
+using connection_map = std::map<std::string, std::unique_ptr<RGWRESTConn>>;
+
+/// construct a RGWRESTConn for each zone in the realm
+template <typename Zonegroups>
+connection_map make_peer_connections(rgw::sal::RadosStore* store,
+ const Zonegroups& zonegroups)
+{
+ connection_map connections;
+ for (auto& g : zonegroups) {
+ for (auto& z : g.second.zones) {
+ std::unique_ptr<RGWRESTConn> conn{
+ new RGWRESTConn(store->ctx(), store, z.first.id, z.second.endpoints, g.second.api_name)};
+ connections.emplace(z.first.id, std::move(conn));
+ }
+ }
+ return connections;
+}
+
+/// return the marker that it's safe to trim up to
+const std::string& get_stable_marker(const rgw_meta_sync_marker& m)
+{
+ return m.state == m.FullSync ? m.next_step_marker : m.marker;
+}
+
+/// comparison operator for take_min_status()
+bool operator<(const rgw_meta_sync_marker& lhs, const rgw_meta_sync_marker& rhs)
+{
+ // sort by stable marker
+ return get_stable_marker(lhs) < get_stable_marker(rhs);
+}
+
+/// populate the status with the minimum stable marker of each shard for any
+/// peer whose realm_epoch matches the minimum realm_epoch in the input
+template <typename Iter>
+int take_min_status(CephContext *cct, Iter first, Iter last,
+ rgw_meta_sync_status *status)
+{
+ if (first == last) {
+ return -EINVAL;
+ }
+ const size_t num_shards = cct->_conf->rgw_md_log_max_shards;
+
+ status->sync_info.realm_epoch = std::numeric_limits<epoch_t>::max();
+ for (auto p = first; p != last; ++p) {
+ // validate peer's shard count
+ if (p->sync_markers.size() != num_shards) {
+ ldout(cct, 1) << "take_min_status got peer status with "
+ << p->sync_markers.size() << " shards, expected "
+ << num_shards << dendl;
+ return -EINVAL;
+ }
+ if (p->sync_info.realm_epoch < status->sync_info.realm_epoch) {
+ // earlier epoch, take its entire status
+ *status = std::move(*p);
+ } else if (p->sync_info.realm_epoch == status->sync_info.realm_epoch) {
+ // same epoch, take any earlier markers
+ auto m = status->sync_markers.begin();
+ for (auto& shard : p->sync_markers) {
+ if (shard.second < m->second) {
+ m->second = std::move(shard.second);
+ }
+ ++m;
+ }
+ }
+ }
+ return 0;
+}
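+
+// Example: with peers at realm_epoch 2 and 3, the epoch-2 peer's status is
+// copied wholesale and the epoch-3 peer's markers are ignored; if both peers
+// were at epoch 2, each shard would instead keep the smaller of the two
+// stable markers.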
+
+struct TrimEnv {
+ const DoutPrefixProvider *dpp;
+ rgw::sal::RadosStore* const store;
+ RGWHTTPManager *const http;
+ int num_shards;
+ const rgw_zone_id& zone;
+ Cursor current; //< cursor to current period
+ epoch_t last_trim_epoch{0}; //< epoch of last mdlog that was purged
+
+ TrimEnv(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* store, RGWHTTPManager *http, int num_shards)
+ : dpp(dpp), store(store), http(http), num_shards(num_shards),
+ zone(store->svc()->zone->zone_id()),
+ current(store->svc()->mdlog->get_period_history()->get_current())
+ {}
+};
+
+struct MasterTrimEnv : public TrimEnv {
+ connection_map connections; //< peer connections
+ std::vector<rgw_meta_sync_status> peer_status; //< sync status for each peer
+ /// last trim marker for each shard, only applies to current period's mdlog
+ std::vector<std::string> last_trim_markers;
+
+ MasterTrimEnv(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* store, RGWHTTPManager *http, int num_shards)
+ : TrimEnv(dpp, store, http, num_shards),
+ last_trim_markers(num_shards)
+ {
+ auto& period = current.get_period();
+ connections = make_peer_connections(store, period.get_map().zonegroups);
+ connections.erase(zone.id);
+ peer_status.resize(connections.size());
+ }
+};
+
+struct PeerTrimEnv : public TrimEnv {
+ /// last trim timestamp for each shard, only applies to current period's mdlog
+ std::vector<ceph::real_time> last_trim_timestamps;
+
+ PeerTrimEnv(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* store, RGWHTTPManager *http, int num_shards)
+ : TrimEnv(dpp, store, http, num_shards),
+ last_trim_timestamps(num_shards)
+ {}
+
+ void set_num_shards(int num_shards) {
+ this->num_shards = num_shards;
+ last_trim_timestamps.resize(num_shards);
+ }
+};
+
+} // anonymous namespace
+
+
+/// spawn a trim cr for each shard that needs it, while limiting the number
+/// of concurrent shards
+class MetaMasterTrimShardCollectCR : public RGWShardCollectCR {
+ private:
+ static constexpr int MAX_CONCURRENT_SHARDS = 16;
+
+ MasterTrimEnv& env;
+ RGWMetadataLog *mdlog;
+ int shard_id{0};
+ std::string oid;
+ const rgw_meta_sync_status& sync_status;
+
+ int handle_result(int r) override {
+ if (r == -ENOENT) { // ENOENT is not a fatal error
+ return 0;
+ }
+ if (r < 0) {
+ ldout(cct, 4) << "failed to trim mdlog shard: " << cpp_strerror(r) << dendl;
+ }
+ return r;
+ }
+ public:
+ MetaMasterTrimShardCollectCR(MasterTrimEnv& env, RGWMetadataLog *mdlog,
+ const rgw_meta_sync_status& sync_status)
+ : RGWShardCollectCR(env.store->ctx(), MAX_CONCURRENT_SHARDS),
+ env(env), mdlog(mdlog), sync_status(sync_status)
+ {}
+
+ bool spawn_next() override;
+};
+
+bool MetaMasterTrimShardCollectCR::spawn_next()
+{
+ while (shard_id < env.num_shards) {
+ auto m = sync_status.sync_markers.find(shard_id);
+ if (m == sync_status.sync_markers.end()) {
+ shard_id++;
+ continue;
+ }
+ auto& stable = get_stable_marker(m->second);
+ auto& last_trim = env.last_trim_markers[shard_id];
+
+ if (stable <= last_trim) {
+ // already trimmed
+ ldpp_dout(env.dpp, 20) << "skipping log shard " << shard_id
+ << " at marker=" << stable
+ << " last_trim=" << last_trim
+ << " realm_epoch=" << sync_status.sync_info.realm_epoch << dendl;
+ shard_id++;
+ continue;
+ }
+
+ mdlog->get_shard_oid(shard_id, oid);
+
+ ldpp_dout(env.dpp, 10) << "trimming log shard " << shard_id
+ << " at marker=" << stable
+ << " last_trim=" << last_trim
+ << " realm_epoch=" << sync_status.sync_info.realm_epoch << dendl;
+ spawn(new RGWSyncLogTrimCR(env.dpp, env.store, oid, stable, &last_trim), false);
+ shard_id++;
+ return true;
+ }
+ return false;
+}
+
+/// spawn rest requests to read each peer's sync status
+class MetaMasterStatusCollectCR : public RGWShardCollectCR {
+ static constexpr int MAX_CONCURRENT_SHARDS = 16;
+
+ MasterTrimEnv& env;
+ connection_map::iterator c;
+ std::vector<rgw_meta_sync_status>::iterator s;
+
+ int handle_result(int r) override {
+ if (r == -ENOENT) { // ENOENT is not a fatal error
+ return 0;
+ }
+ if (r < 0) {
+ ldout(cct, 4) << "failed to fetch metadata sync status: "
+ << cpp_strerror(r) << dendl;
+ }
+ return r;
+ }
+ public:
+ explicit MetaMasterStatusCollectCR(MasterTrimEnv& env)
+ : RGWShardCollectCR(env.store->ctx(), MAX_CONCURRENT_SHARDS),
+ env(env), c(env.connections.begin()), s(env.peer_status.begin())
+ {}
+
+ bool spawn_next() override {
+ if (c == env.connections.end()) {
+ return false;
+ }
+ static rgw_http_param_pair params[] = {
+ { "type", "metadata" },
+ { "status", nullptr },
+ { nullptr, nullptr }
+ };
+
+ ldout(cct, 20) << "query sync status from " << c->first << dendl;
+ auto conn = c->second.get();
+ using StatusCR = RGWReadRESTResourceCR<rgw_meta_sync_status>;
+ spawn(new StatusCR(cct, conn, env.http, "/admin/log/", params, &*s),
+ false);
+ ++c;
+ ++s;
+ return true;
+ }
+};
+
+class MetaMasterTrimCR : public RGWCoroutine {
+ MasterTrimEnv& env;
+ rgw_meta_sync_status min_status; //< minimum sync status of all peers
+ int ret{0};
+
+ public:
+ explicit MetaMasterTrimCR(MasterTrimEnv& env)
+ : RGWCoroutine(env.store->ctx()), env(env)
+ {}
+
+ int operate(const DoutPrefixProvider *dpp) override;
+};
+
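+// Master-side trim flow: collect the metadata sync status from every peer,
+// reduce it to the minimum realm epoch and per-shard markers, purge mdlog
+// periods older than that epoch, and (when the minimum epoch matches the
+// current period) trim the current period's shards up to the stable markers.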
+int MetaMasterTrimCR::operate(const DoutPrefixProvider *dpp)
+{
+ reenter(this) {
+ // TODO: detect this and fail before we spawn the trim thread?
+ if (env.connections.empty()) {
+ ldpp_dout(dpp, 4) << "no peers, exiting" << dendl;
+ return set_cr_done();
+ }
+
+ ldpp_dout(dpp, 10) << "fetching sync status for zone " << env.zone << dendl;
+ // query mdlog sync status from peers
+ yield call(new MetaMasterStatusCollectCR(env));
+
+ // must get a successful reply from all peers to consider trimming
+ if (ret < 0) {
+ ldpp_dout(dpp, 4) << "failed to fetch sync status from all peers" << dendl;
+ return set_cr_error(ret);
+ }
+
+ // determine the minimum epoch and markers
+ ret = take_min_status(env.store->ctx(), env.peer_status.begin(),
+ env.peer_status.end(), &min_status);
+ if (ret < 0) {
+ ldpp_dout(dpp, 4) << "failed to calculate min sync status from peers" << dendl;
+ return set_cr_error(ret);
+ }
+ yield {
+ auto store = env.store;
+ auto epoch = min_status.sync_info.realm_epoch;
+ ldpp_dout(dpp, 4) << "realm epoch min=" << epoch
+ << " current=" << env.current.get_epoch()<< dendl;
+ if (epoch > env.last_trim_epoch + 1) {
+ // delete any prior mdlog periods
+ spawn(new PurgePeriodLogsCR(dpp, store, epoch, &env.last_trim_epoch), true);
+ } else {
+ ldpp_dout(dpp, 10) << "mdlogs already purged up to realm_epoch "
+ << env.last_trim_epoch << dendl;
+ }
+
+ // if realm_epoch == current, trim mdlog based on markers
+ if (epoch == env.current.get_epoch()) {
+ auto mdlog = store->svc()->mdlog->get_log(env.current.get_period().get_id());
+ spawn(new MetaMasterTrimShardCollectCR(env, mdlog, min_status), true);
+ }
+ }
+ // ignore any errors during purge/trim because we want to hold the lock open
+ return set_cr_done();
+ }
+ return 0;
+}
+
+
+/// read the first entry of the master's mdlog shard and trim to that position
+class MetaPeerTrimShardCR : public RGWCoroutine {
+ RGWMetaSyncEnv& env;
+ RGWMetadataLog *mdlog;
+ const std::string& period_id;
+ const int shard_id;
+ RGWMetadataLogInfo info;
+ ceph::real_time stable; //< safe timestamp to trim, according to master
+ ceph::real_time *last_trim; //< last trimmed timestamp, updated on trim
+ rgw_mdlog_shard_data result; //< result from master's mdlog listing
+
+ public:
+ MetaPeerTrimShardCR(RGWMetaSyncEnv& env, RGWMetadataLog *mdlog,
+ const std::string& period_id, int shard_id,
+ ceph::real_time *last_trim)
+ : RGWCoroutine(env.store->ctx()), env(env), mdlog(mdlog),
+ period_id(period_id), shard_id(shard_id), last_trim(last_trim)
+ {}
+
+ int operate(const DoutPrefixProvider *dpp) override;
+};
+
+int MetaPeerTrimShardCR::operate(const DoutPrefixProvider *dpp)
+{
+ reenter(this) {
+ // query master's first mdlog entry for this shard
+ yield call(create_list_remote_mdlog_shard_cr(&env, period_id, shard_id,
+ "", 1, &result));
+ if (retcode < 0) {
+ ldpp_dout(dpp, 5) << "failed to read first entry from master's mdlog shard "
+ << shard_id << " for period " << period_id
+ << ": " << cpp_strerror(retcode) << dendl;
+ return set_cr_error(retcode);
+ }
+ if (result.entries.empty()) {
+ // if there are no mdlog entries, we don't have a timestamp to compare. we
+ // can't just trim everything, because there could be racing updates since
+ // this empty reply. query the mdlog shard info to read its max timestamp,
+ // then retry the listing to make sure it's still empty before trimming to
+ // that
+ ldpp_dout(dpp, 10) << "empty master mdlog shard " << shard_id
+ << ", reading last timestamp from shard info" << dendl;
+ // read the mdlog shard info for the last timestamp
+ yield call(create_read_remote_mdlog_shard_info_cr(&env, period_id, shard_id, &info));
+ if (retcode < 0) {
+ ldpp_dout(dpp, 5) << "failed to read info from master's mdlog shard "
+ << shard_id << " for period " << period_id
+ << ": " << cpp_strerror(retcode) << dendl;
+ return set_cr_error(retcode);
+ }
+ if (ceph::real_clock::is_zero(info.last_update)) {
+ return set_cr_done(); // nothing to trim
+ }
+ ldpp_dout(dpp, 10) << "got mdlog shard info with last update="
+ << info.last_update << dendl;
+ // re-read the master's first mdlog entry to make sure it hasn't changed
+ yield call(create_list_remote_mdlog_shard_cr(&env, period_id, shard_id,
+ "", 1, &result));
+ if (retcode < 0) {
+ ldpp_dout(dpp, 5) << "failed to read first entry from master's mdlog shard "
+ << shard_id << " for period " << period_id
+ << ": " << cpp_strerror(retcode) << dendl;
+ return set_cr_error(retcode);
+ }
+ // if the mdlog is still empty, trim to max marker
+ if (result.entries.empty()) {
+ stable = info.last_update;
+ } else {
+ stable = result.entries.front().timestamp;
+
+ // can only trim -up to- master's first timestamp, so subtract a second.
+ // (this is why we use timestamps instead of markers for the peers)
+ stable -= std::chrono::seconds(1);
+ }
+ } else {
+ stable = result.entries.front().timestamp;
+ stable -= std::chrono::seconds(1);
+ }
+
+ if (stable <= *last_trim) {
+ ldpp_dout(dpp, 10) << "skipping log shard " << shard_id
+ << " at timestamp=" << stable
+ << " last_trim=" << *last_trim << dendl;
+ return set_cr_done();
+ }
+
+ ldpp_dout(dpp, 10) << "trimming log shard " << shard_id
+ << " at timestamp=" << stable
+ << " last_trim=" << *last_trim << dendl;
+ yield {
+ std::string oid;
+ mdlog->get_shard_oid(shard_id, oid);
+ call(new RGWRadosTimelogTrimCR(dpp, env.store, oid, real_time{}, stable, "", ""));
+ }
+ if (retcode < 0 && retcode != -ENODATA) {
+ ldpp_dout(dpp, 1) << "failed to trim mdlog shard " << shard_id
+ << ": " << cpp_strerror(retcode) << dendl;
+ return set_cr_error(retcode);
+ }
+ *last_trim = stable;
+ return set_cr_done();
+ }
+ return 0;
+}
+
+class MetaPeerTrimShardCollectCR : public RGWShardCollectCR {
+ static constexpr int MAX_CONCURRENT_SHARDS = 16;
+
+ PeerTrimEnv& env;
+ RGWMetadataLog *mdlog;
+ const std::string& period_id;
+ RGWMetaSyncEnv meta_env; //< for RGWListRemoteMDLogShardCR
+ int shard_id{0};
+
+ int handle_result(int r) override {
+ if (r == -ENOENT) { // ENOENT is not a fatal error
+ return 0;
+ }
+ if (r < 0) {
+ ldout(cct, 4) << "failed to trim mdlog shard: " << cpp_strerror(r) << dendl;
+ }
+ return r;
+ }
+ public:
+ MetaPeerTrimShardCollectCR(PeerTrimEnv& env, RGWMetadataLog *mdlog)
+ : RGWShardCollectCR(env.store->ctx(), MAX_CONCURRENT_SHARDS),
+ env(env), mdlog(mdlog), period_id(env.current.get_period().get_id())
+ {
+ meta_env.init(env.dpp, cct, env.store, env.store->svc()->zone->get_master_conn(),
+ env.store->svc()->rados->get_async_processor(), env.http, nullptr,
+ env.store->getRados()->get_sync_tracer());
+ }
+
+ bool spawn_next() override;
+};
+
+bool MetaPeerTrimShardCollectCR::spawn_next()
+{
+ if (shard_id >= env.num_shards) {
+ return false;
+ }
+ auto& last_trim = env.last_trim_timestamps[shard_id];
+ spawn(new MetaPeerTrimShardCR(meta_env, mdlog, period_id, shard_id, &last_trim),
+ false);
+ shard_id++;
+ return true;
+}
+
+class MetaPeerTrimCR : public RGWCoroutine {
+ PeerTrimEnv& env;
+ rgw_mdlog_info mdlog_info; //< master's mdlog info
+
+ public:
+ explicit MetaPeerTrimCR(PeerTrimEnv& env) : RGWCoroutine(env.store->ctx()), env(env) {}
+
+ int operate(const DoutPrefixProvider *dpp) override;
+};
+
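+// Peer-side trim flow: read the master's mdlog info to learn its shard count
+// and oldest log period, purge local mdlog periods from before that epoch,
+// then trim each shard of the current period against the master's oldest
+// retained entry (see MetaPeerTrimShardCR above).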
+int MetaPeerTrimCR::operate(const DoutPrefixProvider *dpp)
+{
+ reenter(this) {
+ ldpp_dout(dpp, 10) << "fetching master mdlog info" << dendl;
+ yield {
+ // query mdlog_info from master for oldest_log_period
+ rgw_http_param_pair params[] = {
+ { "type", "metadata" },
+ { nullptr, nullptr }
+ };
+
+ using LogInfoCR = RGWReadRESTResourceCR<rgw_mdlog_info>;
+ call(new LogInfoCR(cct, env.store->svc()->zone->get_master_conn(), env.http,
+ "/admin/log/", params, &mdlog_info));
+ }
+ if (retcode < 0) {
+ ldpp_dout(dpp, 4) << "failed to read mdlog info from master" << dendl;
+ return set_cr_error(retcode);
+ }
+ // use the master's shard count instead of the locally configured one
+ env.set_num_shards(mdlog_info.num_shards);
+
+ if (mdlog_info.realm_epoch > env.last_trim_epoch + 1) {
+ // delete any prior mdlog periods
+ yield call(new PurgePeriodLogsCR(dpp, env.store, mdlog_info.realm_epoch,
+ &env.last_trim_epoch));
+ } else {
+ ldpp_dout(dpp, 10) << "mdlogs already purged through realm_epoch "
+ << env.last_trim_epoch << dendl;
+ }
+
+ // if realm_epoch == current, trim mdlog based on master's markers
+ if (mdlog_info.realm_epoch == env.current.get_epoch()) {
+ yield {
+ auto mdlog = env.store->svc()->mdlog->get_log(env.current.get_period().get_id());
+ call(new MetaPeerTrimShardCollectCR(env, mdlog));
+ // ignore any errors during purge/trim because we want to hold the lock open
+ }
+ }
+ return set_cr_done();
+ }
+ return 0;
+}
+
+class MetaTrimPollCR : public RGWCoroutine {
+ rgw::sal::RadosStore* const store;
+ const utime_t interval; //< polling interval
+ const rgw_raw_obj obj;
+ const std::string name{"meta_trim"}; //< lock name
+ const std::string cookie;
+
+ protected:
+ /// allocate the coroutine to run within the lease
+ virtual RGWCoroutine* alloc_cr() = 0;
+
+ public:
+ MetaTrimPollCR(rgw::sal::RadosStore* store, utime_t interval)
+ : RGWCoroutine(store->ctx()), store(store), interval(interval),
+ obj(store->svc()->zone->get_zone_params().log_pool, RGWMetadataLogHistory::oid),
+ cookie(RGWSimpleRadosLockCR::gen_random_cookie(cct))
+ {}
+
+ int operate(const DoutPrefixProvider *dpp) override;
+};
+
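+// Poll loop: sleep for the configured interval, take the trim lock for the
+// full interval so only one gateway trims per cycle, then run the trim cr.
+// On success the lock is left to expire; on failure it is released early so
+// another gateway can retry sooner.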
+int MetaTrimPollCR::operate(const DoutPrefixProvider *dpp)
+{
+ reenter(this) {
+ for (;;) {
+ set_status("sleeping");
+ wait(interval);
+
+ // prevent others from trimming for our entire wait interval
+ set_status("acquiring trim lock");
+ yield call(new RGWSimpleRadosLockCR(store->svc()->rados->get_async_processor(), store,
+ obj, name, cookie, interval.sec()));
+ if (retcode < 0) {
+ ldout(cct, 4) << "failed to lock: " << cpp_strerror(retcode) << dendl;
+ continue;
+ }
+
+ set_status("trimming");
+ yield call(alloc_cr());
+
+ if (retcode < 0) {
+ // on errors, unlock so other gateways can try
+ set_status("unlocking");
+ yield call(new RGWSimpleRadosUnlockCR(store->svc()->rados->get_async_processor(), store,
+ obj, name, cookie));
+ }
+ }
+ }
+ return 0;
+}
+
+class MetaMasterTrimPollCR : public MetaTrimPollCR {
+ MasterTrimEnv env; //< trim state to share between calls
+ RGWCoroutine* alloc_cr() override {
+ return new MetaMasterTrimCR(env);
+ }
+ public:
+ MetaMasterTrimPollCR(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* store, RGWHTTPManager *http,
+ int num_shards, utime_t interval)
+ : MetaTrimPollCR(store, interval),
+ env(dpp, store, http, num_shards)
+ {}
+};
+
+class MetaPeerTrimPollCR : public MetaTrimPollCR {
+ PeerTrimEnv env; //< trim state to share between calls
+ RGWCoroutine* alloc_cr() override {
+ return new MetaPeerTrimCR(env);
+ }
+ public:
+ MetaPeerTrimPollCR(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* store, RGWHTTPManager *http,
+ int num_shards, utime_t interval)
+ : MetaTrimPollCR(store, interval),
+ env(dpp, store, http, num_shards)
+ {}
+};
+
+namespace {
+bool sanity_check_endpoints(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* store) {
+ bool retval = true;
+ auto current = store->svc()->mdlog->get_period_history()->get_current();
+ const auto& period = current.get_period();
+ for (const auto& [_, zonegroup] : period.get_map().zonegroups) {
+ if (zonegroup.endpoints.empty()) {
+ ldpp_dout(dpp, -1)
+ << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " WARNING: Cluster is is misconfigured! "
+ << " Zonegroup " << zonegroup.get_name()
+ << " (" << zonegroup.get_id() << ") in Realm "
+ << period.get_realm_name() << " ( " << period.get_realm() << ") "
+ << " has no endpoints!" << dendl;
+ }
+ for (const auto& [_, zone] : zonegroup.zones) {
+ if (zone.endpoints.empty()) {
+ ldpp_dout(dpp, -1)
+ << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " ERROR: Cluster is is misconfigured! "
+ << " Zone " << zone.name << " (" << zone.id << ") in Zonegroup "
+ << zonegroup.get_name() << " ( " << zonegroup.get_id()
+ << ") in Realm " << period.get_realm_name()
+ << " ( " << period.get_realm() << ") "
+ << " has no endpoints! Trimming is impossible." << dendl;
+ retval = false;
+ }
+ }
+ }
+ return retval;
+}
+}
+
+RGWCoroutine* create_meta_log_trim_cr(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* store, RGWHTTPManager *http,
+ int num_shards, utime_t interval)
+{
+ if (!sanity_check_endpoints(dpp, store)) {
+ ldpp_dout(dpp, -1)
+ << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " ERROR: Cluster is is misconfigured! Refusing to trim." << dendl;
+ return nullptr;
+ }
+ if (store->svc()->zone->is_meta_master()) {
+ return new MetaMasterTrimPollCR(dpp, store, http, num_shards, interval);
+ }
+ return new MetaPeerTrimPollCR(dpp, store, http, num_shards, interval);
+}
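+
+/*
+ * A minimal usage sketch (illustrative only; the manager setup below is an
+ * assumption, not something defined in this file). The returned poll cr is
+ * normally handed to a coroutines manager and left running:
+ *
+ *   RGWCoroutinesManager crs(store->ctx(), store->getRados()->get_cr_registry());
+ *   RGWHTTPManager http(store->ctx(), crs.get_completion_mgr());
+ *   http.start();
+ *   if (auto* cr = create_meta_log_trim_cr(dpp, store, &http, num_shards,
+ *                                          utime_t(interval_secs, 0))) {
+ *     crs.run(dpp, cr);  // blocks until the poll loop exits
+ *   }
+ */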
+
+
+struct MetaMasterAdminTrimCR : private MasterTrimEnv, public MetaMasterTrimCR {
+ MetaMasterAdminTrimCR(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* store, RGWHTTPManager *http, int num_shards)
+ : MasterTrimEnv(dpp, store, http, num_shards),
+ MetaMasterTrimCR(*static_cast<MasterTrimEnv*>(this))
+ {}
+};
+
+struct MetaPeerAdminTrimCR : private PeerTrimEnv, public MetaPeerTrimCR {
+ MetaPeerAdminTrimCR(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* store, RGWHTTPManager *http, int num_shards)
+ : PeerTrimEnv(dpp, store, http, num_shards),
+ MetaPeerTrimCR(*static_cast<PeerTrimEnv*>(this))
+ {}
+};
+
+RGWCoroutine* create_admin_meta_log_trim_cr(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* store,
+ RGWHTTPManager *http,
+ int num_shards)
+{
+ if (!sanity_check_endpoints(dpp, store)) {
+ ldpp_dout(dpp, -1)
+ << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " ERROR: Cluster is is misconfigured! Refusing to trim." << dendl;
+ return nullptr;
+ }
+ if (store->svc()->zone->is_meta_master()) {
+ return new MetaMasterAdminTrimCR(dpp, store, http, num_shards);
+ }
+ return new MetaPeerAdminTrimCR(dpp, store, http, num_shards);
+}
diff --git a/src/rgw/driver/rados/rgw_trim_mdlog.h b/src/rgw/driver/rados/rgw_trim_mdlog.h
new file mode 100644
index 000000000..1dba8612b
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_trim_mdlog.h
@@ -0,0 +1,25 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+class RGWCoroutine;
+class DoutPrefixProvider;
+class RGWRados;
+class RGWHTTPManager;
+class utime_t;
+namespace rgw { namespace sal {
+ class RadosStore;
+} }
+
+// factory function for the periodic mdlog trim poll coroutine
+RGWCoroutine* create_meta_log_trim_cr(const DoutPrefixProvider *dpp,
+ rgw::sal::RadosStore* store,
+ RGWHTTPManager *http,
+ int num_shards, utime_t interval);
+
+// factory function for mdlog trim via radosgw-admin
+RGWCoroutine* create_admin_meta_log_trim_cr(const DoutPrefixProvider *dpp,
+ rgw::sal::RadosStore* store,
+ RGWHTTPManager *http,
+ int num_shards);
diff --git a/src/rgw/driver/rados/rgw_user.cc b/src/rgw/driver/rados/rgw_user.cc
new file mode 100644
index 000000000..51b38c082
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_user.cc
@@ -0,0 +1,2776 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "common/errno.h"
+
+#include "rgw_user.h"
+
+#include "rgw_bucket.h"
+#include "rgw_quota.h"
+
+#include "services/svc_user.h"
+#include "services/svc_meta.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+extern void op_type_to_str(uint32_t mask, char *buf, int len);
+
+static string key_type_to_str(int key_type) {
+ switch (key_type) {
+ case KEY_TYPE_SWIFT:
+ return "swift";
+ default:
+ return "s3";
+ }
+}
+
+static bool char_is_unreserved_url(char c)
+{
+ if (isalnum(c))
+ return true;
+
+ switch (c) {
+ case '-':
+ case '.':
+ case '_':
+ case '~':
+ return true;
+ default:
+ return false;
+ }
+}
+
+static bool validate_access_key(string& key)
+{
+ const char *p = key.c_str();
+ while (*p) {
+ if (!char_is_unreserved_url(*p))
+ return false;
+ p++;
+ }
+ return true;
+}
+
+static void set_err_msg(std::string *sink, std::string msg)
+{
+ if (sink && !msg.empty())
+ *sink = msg;
+}
+
+/*
+ * Dump either the full user info or a subset to a formatter.
+ *
+ * NOTE: It is the caller's responsibility to ensure that the
+ * formatter is flushed at the correct time.
+ */
+
+static void dump_subusers_info(Formatter *f, RGWUserInfo &info)
+{
+ map<string, RGWSubUser>::iterator uiter;
+
+ f->open_array_section("subusers");
+ for (uiter = info.subusers.begin(); uiter != info.subusers.end(); ++uiter) {
+ RGWSubUser& u = uiter->second;
+ f->open_object_section("user");
+ string s;
+ info.user_id.to_str(s);
+ f->dump_format("id", "%s:%s", s.c_str(), u.name.c_str());
+ char buf[256];
+ rgw_perm_to_str(u.perm_mask, buf, sizeof(buf));
+ f->dump_string("permissions", buf);
+ f->close_section();
+ }
+ f->close_section();
+}
+
+static void dump_access_keys_info(Formatter *f, RGWUserInfo &info)
+{
+ map<string, RGWAccessKey>::iterator kiter;
+ f->open_array_section("keys");
+ for (kiter = info.access_keys.begin(); kiter != info.access_keys.end(); ++kiter) {
+ RGWAccessKey& k = kiter->second;
+ const char *sep = (k.subuser.empty() ? "" : ":");
+ const char *subuser = (k.subuser.empty() ? "" : k.subuser.c_str());
+ f->open_object_section("key");
+ string s;
+ info.user_id.to_str(s);
+ f->dump_format("user", "%s%s%s", s.c_str(), sep, subuser);
+ f->dump_string("access_key", k.id);
+ f->dump_string("secret_key", k.key);
+ f->close_section();
+ }
+ f->close_section();
+}
+
+static void dump_swift_keys_info(Formatter *f, RGWUserInfo &info)
+{
+ map<string, RGWAccessKey>::iterator kiter;
+ f->open_array_section("swift_keys");
+ for (kiter = info.swift_keys.begin(); kiter != info.swift_keys.end(); ++kiter) {
+ RGWAccessKey& k = kiter->second;
+ const char *sep = (k.subuser.empty() ? "" : ":");
+ const char *subuser = (k.subuser.empty() ? "" : k.subuser.c_str());
+ f->open_object_section("key");
+ string s;
+ info.user_id.to_str(s);
+ f->dump_format("user", "%s%s%s", s.c_str(), sep, subuser);
+ f->dump_string("secret_key", k.key);
+ f->close_section();
+ }
+ f->close_section();
+}
+
+static void dump_user_info(Formatter *f, RGWUserInfo &info,
+ RGWStorageStats *stats = NULL)
+{
+ f->open_object_section("user_info");
+ encode_json("tenant", info.user_id.tenant, f);
+ encode_json("user_id", info.user_id.id, f);
+ encode_json("display_name", info.display_name, f);
+ encode_json("email", info.user_email, f);
+ encode_json("suspended", (int)info.suspended, f);
+ encode_json("max_buckets", (int)info.max_buckets, f);
+
+ dump_subusers_info(f, info);
+ dump_access_keys_info(f, info);
+ dump_swift_keys_info(f, info);
+
+ encode_json("caps", info.caps, f);
+
+ char buf[256];
+ op_type_to_str(info.op_mask, buf, sizeof(buf));
+ encode_json("op_mask", (const char *)buf, f);
+ encode_json("system", (bool)info.system, f);
+ encode_json("admin", (bool)info.admin, f);
+ encode_json("default_placement", info.default_placement.name, f);
+ encode_json("default_storage_class", info.default_placement.storage_class, f);
+ encode_json("placement_tags", info.placement_tags, f);
+ encode_json("bucket_quota", info.quota.bucket_quota, f);
+ encode_json("user_quota", info.quota.user_quota, f);
+ encode_json("temp_url_keys", info.temp_url_keys, f);
+
+ string user_source_type;
+ switch ((RGWIdentityType)info.type) {
+ case TYPE_RGW:
+ user_source_type = "rgw";
+ break;
+ case TYPE_KEYSTONE:
+ user_source_type = "keystone";
+ break;
+ case TYPE_LDAP:
+ user_source_type = "ldap";
+ break;
+ case TYPE_NONE:
+ user_source_type = "none";
+ break;
+ default:
+ user_source_type = "none";
+ break;
+ }
+ encode_json("type", user_source_type, f);
+ encode_json("mfa_ids", info.mfa_ids, f);
+ if (stats) {
+ encode_json("stats", *stats, f);
+ }
+ f->close_section();
+}
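+
+/*
+ * A minimal sketch of driving the dump helpers above (illustrative; assumes a
+ * loaded RGWUserInfo named `info`). Per the note above, flushing the
+ * formatter remains the caller's responsibility:
+ *
+ *   JSONFormatter f(true);
+ *   dump_user_info(&f, info);
+ *   f.flush(std::cout);
+ */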
+
+static int user_add_helper(RGWUserAdminOpState& op_state, std::string *err_msg)
+{
+ int ret = 0;
+ const rgw_user& uid = op_state.get_user_id();
+ std::string user_email = op_state.get_user_email();
+ std::string display_name = op_state.get_display_name();
+
+ // fail if the user exists already
+ if (op_state.has_existing_user()) {
+ if (op_state.found_by_email) {
+ set_err_msg(err_msg, "email: " + user_email +
+ " is the email address of an existing user");
+ ret = -ERR_EMAIL_EXIST;
+ } else if (op_state.found_by_key) {
+ set_err_msg(err_msg, "duplicate key provided");
+ ret = -ERR_KEY_EXIST;
+ } else {
+ set_err_msg(err_msg, "user: " + uid.to_str() + " exists");
+ ret = -EEXIST;
+ }
+ return ret;
+ }
+
+ // fail if the user_info has already been populated
+ if (op_state.is_populated()) {
+ set_err_msg(err_msg, "cannot overwrite already populated user");
+ return -EEXIST;
+ }
+
+ // fail if the display name was not included
+ if (display_name.empty()) {
+ set_err_msg(err_msg, "no display name specified");
+ return -EINVAL;
+ }
+
+ return ret;
+}
+
+RGWAccessKeyPool::RGWAccessKeyPool(RGWUser* usr)
+{
+ if (!usr) {
+ return;
+ }
+
+ user = usr;
+
+ driver = user->get_driver();
+}
+
+int RGWAccessKeyPool::init(RGWUserAdminOpState& op_state)
+{
+ if (!op_state.is_initialized()) {
+ keys_allowed = false;
+ return -EINVAL;
+ }
+
+ const rgw_user& uid = op_state.get_user_id();
+ if (uid.compare(RGW_USER_ANON_ID) == 0) {
+ keys_allowed = false;
+ return -EINVAL;
+ }
+
+ swift_keys = op_state.get_swift_keys();
+ access_keys = op_state.get_access_keys();
+
+ keys_allowed = true;
+
+ return 0;
+}
+
+RGWUserAdminOpState::RGWUserAdminOpState(rgw::sal::Driver* driver)
+{
+ user = driver->get_user(rgw_user(RGW_USER_ANON_ID));
+}
+
+void RGWUserAdminOpState::set_user_id(const rgw_user& id)
+{
+ if (id.empty())
+ return;
+
+ user->get_info().user_id = id;
+}
+
+void RGWUserAdminOpState::set_subuser(std::string& _subuser)
+{
+ if (_subuser.empty())
+ return;
+
+ size_t pos = _subuser.find(":");
+ if (pos != string::npos) {
+ rgw_user tmp_id;
+ tmp_id.from_str(_subuser.substr(0, pos));
+ if (tmp_id.tenant.empty()) {
+ user->get_info().user_id.id = tmp_id.id;
+ } else {
+ user->get_info().user_id = tmp_id;
+ }
+ subuser = _subuser.substr(pos+1);
+ } else {
+ subuser = _subuser;
+ }
+
+ subuser_specified = true;
+}
+
+void RGWUserAdminOpState::set_user_info(RGWUserInfo& user_info)
+{
+ user->get_info() = user_info;
+}
+
+void RGWUserAdminOpState::set_user_version_tracker(RGWObjVersionTracker& objv_tracker)
+{
+ user->get_version_tracker() = objv_tracker;
+}
+
+const rgw_user& RGWUserAdminOpState::get_user_id()
+{
+ return user->get_id();
+}
+
+RGWUserInfo& RGWUserAdminOpState::get_user_info()
+{
+ return user->get_info();
+}
+
+map<std::string, RGWAccessKey>* RGWUserAdminOpState::get_swift_keys()
+{
+ return &user->get_info().swift_keys;
+}
+
+map<std::string, RGWAccessKey>* RGWUserAdminOpState::get_access_keys()
+{
+ return &user->get_info().access_keys;
+}
+
+map<std::string, RGWSubUser>* RGWUserAdminOpState::get_subusers()
+{
+ return &user->get_info().subusers;
+}
+
+RGWUserCaps *RGWUserAdminOpState::get_caps_obj()
+{
+ return &user->get_info().caps;
+}
+
+std::string RGWUserAdminOpState::build_default_swift_kid()
+{
+ if (user->get_id().empty() || subuser.empty())
+ return "";
+
+ std::string kid;
+ user->get_id().to_str(kid);
+ kid.append(":");
+ kid.append(subuser);
+
+ return kid;
+}
+
+std::string RGWUserAdminOpState::generate_subuser() {
+ if (user->get_id().empty())
+ return "";
+
+ std::string generated_subuser;
+ user->get_id().to_str(generated_subuser);
+ std::string rand_suffix;
+
+ int sub_buf_size = RAND_SUBUSER_LEN + 1;
+ char sub_buf[RAND_SUBUSER_LEN + 1];
+
+ gen_rand_alphanumeric_upper(g_ceph_context, sub_buf, sub_buf_size);
+
+ rand_suffix = sub_buf;
+ if (rand_suffix.empty())
+ return "";
+
+ generated_subuser.append(rand_suffix);
+ subuser = generated_subuser;
+
+ return generated_subuser;
+}
+
+/*
+ * Do a fairly exhaustive search for an existing key matching the parameters
+ * given. Also handles the case where no key type was specified and updates
+ * the operation state if needed.
+ */
+
+bool RGWAccessKeyPool::check_existing_key(RGWUserAdminOpState& op_state)
+{
+ bool existing_key = false;
+
+ int key_type = op_state.get_key_type();
+ std::string kid = op_state.get_access_key();
+ std::map<std::string, RGWAccessKey>::iterator kiter;
+ std::string swift_kid = op_state.build_default_swift_kid();
+
+ RGWUserInfo dup_info;
+
+ if (kid.empty() && swift_kid.empty())
+ return false;
+
+ switch (key_type) {
+ case KEY_TYPE_SWIFT:
+ kiter = swift_keys->find(swift_kid);
+
+ existing_key = (kiter != swift_keys->end());
+ if (existing_key)
+ op_state.set_access_key(swift_kid);
+
+ break;
+ case KEY_TYPE_S3:
+ kiter = access_keys->find(kid);
+ existing_key = (kiter != access_keys->end());
+
+ break;
+ default:
+ kiter = access_keys->find(kid);
+
+ existing_key = (kiter != access_keys->end());
+ if (existing_key) {
+ op_state.set_key_type(KEY_TYPE_S3);
+ break;
+ }
+
+ kiter = swift_keys->find(kid);
+
+ existing_key = (kiter != swift_keys->end());
+ if (existing_key) {
+ op_state.set_key_type(KEY_TYPE_SWIFT);
+ break;
+ }
+
+ // handle the case where the access key was not provided in user:key format
+ if (swift_kid.empty())
+ return false;
+
+ kiter = swift_keys->find(swift_kid);
+
+ existing_key = (kiter != swift_keys->end());
+ if (existing_key) {
+ op_state.set_access_key(swift_kid);
+ op_state.set_key_type(KEY_TYPE_SWIFT);
+ }
+ }
+
+ op_state.set_existing_key(existing_key);
+
+ return existing_key;
+}
+
+int RGWAccessKeyPool::check_op(RGWUserAdminOpState& op_state,
+ std::string *err_msg)
+{
+ RGWUserInfo dup_info;
+
+ if (!op_state.is_populated()) {
+ set_err_msg(err_msg, "user info was not populated");
+ return -EINVAL;
+ }
+
+ if (!keys_allowed) {
+ set_err_msg(err_msg, "keys not allowed for this user");
+ return -EACCES;
+ }
+
+ int32_t key_type = op_state.get_key_type();
+
+ // if a key type wasn't specified
+ if (key_type < 0) {
+ if (op_state.has_subuser()) {
+ key_type = KEY_TYPE_SWIFT;
+ } else {
+ key_type = KEY_TYPE_S3;
+ }
+ }
+
+ op_state.set_key_type(key_type);
+
+ /* see if the access key was specified */
+ if (key_type == KEY_TYPE_S3 && !op_state.will_gen_access() &&
+ op_state.get_access_key().empty()) {
+ set_err_msg(err_msg, "empty access key");
+ return -ERR_INVALID_ACCESS_KEY;
+ }
+
+ // don't check for secret key because we may be doing a removal
+
+ if (check_existing_key(op_state)) {
+ op_state.set_access_key_exist();
+ }
+ return 0;
+}
+
+// Generate a new random key
+int RGWAccessKeyPool::generate_key(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state,
+ optional_yield y, std::string *err_msg)
+{
+ std::string id;
+ std::string key;
+
+ std::pair<std::string, RGWAccessKey> key_pair;
+ RGWAccessKey new_key;
+ std::unique_ptr<rgw::sal::User> duplicate_check;
+
+ int key_type = op_state.get_key_type();
+ bool gen_access = op_state.will_gen_access();
+ bool gen_secret = op_state.will_gen_secret();
+
+ if (!keys_allowed) {
+ set_err_msg(err_msg, "access keys not allowed for this user");
+ return -EACCES;
+ }
+
+ if (op_state.has_existing_key()) {
+ set_err_msg(err_msg, "cannot create existing key");
+ return -ERR_KEY_EXIST;
+ }
+
+ if (!gen_access) {
+ id = op_state.get_access_key();
+ }
+
+ if (!id.empty()) {
+ switch (key_type) {
+ case KEY_TYPE_SWIFT:
+ if (driver->get_user_by_swift(dpp, id, y, &duplicate_check) >= 0) {
+ set_err_msg(err_msg, "existing swift key in RGW system:" + id);
+ return -ERR_KEY_EXIST;
+ }
+ break;
+ case KEY_TYPE_S3:
+ if (driver->get_user_by_access_key(dpp, id, y, &duplicate_check) >= 0) {
+ set_err_msg(err_msg, "existing S3 key in RGW system:" + id);
+ return -ERR_KEY_EXIST;
+ }
+ }
+ }
+
+ //key's subuser
+ if (op_state.has_subuser()) {
+ // when creating a user and subuser at the same time, the user's s3 key should not be tied to the subuser
+ if (!op_state.key_type_setbycontext || (key_type == KEY_TYPE_SWIFT)) {
+ new_key.subuser = op_state.get_subuser();
+ }
+ }
+
+ //Secret key
+ if (!gen_secret) {
+ if (op_state.get_secret_key().empty()) {
+ set_err_msg(err_msg, "empty secret key");
+ return -ERR_INVALID_SECRET_KEY;
+ }
+
+ key = op_state.get_secret_key();
+ } else {
+ char secret_key_buf[SECRET_KEY_LEN + 1];
+ gen_rand_alphanumeric_plain(g_ceph_context, secret_key_buf, sizeof(secret_key_buf));
+ key = secret_key_buf;
+ }
+
+ // Generate the access key
+ if (key_type == KEY_TYPE_S3 && gen_access) {
+ char public_id_buf[PUBLIC_ID_LEN + 1];
+
+ do {
+ int id_buf_size = sizeof(public_id_buf);
+ gen_rand_alphanumeric_upper(g_ceph_context, public_id_buf, id_buf_size);
+ id = public_id_buf;
+ if (!validate_access_key(id))
+ continue;
+
+ } while (!driver->get_user_by_access_key(dpp, id, y, &duplicate_check));
+ }
+
+ if (key_type == KEY_TYPE_SWIFT) {
+ id = op_state.build_default_swift_kid();
+ if (id.empty()) {
+ set_err_msg(err_msg, "empty swift access key");
+ return -ERR_INVALID_ACCESS_KEY;
+ }
+
+ // check that the access key doesn't exist
+ if (driver->get_user_by_swift(dpp, id, y, &duplicate_check) >= 0) {
+ set_err_msg(err_msg, "cannot create existing swift key");
+ return -ERR_KEY_EXIST;
+ }
+ }
+
+ // finally create the new key
+ new_key.id = id;
+ new_key.key = key;
+
+ key_pair.first = id;
+ key_pair.second = new_key;
+
+ if (key_type == KEY_TYPE_S3) {
+ access_keys->insert(key_pair);
+ } else if (key_type == KEY_TYPE_SWIFT) {
+ swift_keys->insert(key_pair);
+ }
+
+ return 0;
+}
+
+// modify an existing key
+int RGWAccessKeyPool::modify_key(RGWUserAdminOpState& op_state, std::string *err_msg)
+{
+ std::string id;
+ std::string key = op_state.get_secret_key();
+ int key_type = op_state.get_key_type();
+
+ RGWAccessKey modify_key;
+
+ pair<string, RGWAccessKey> key_pair;
+ map<std::string, RGWAccessKey>::iterator kiter;
+
+ switch (key_type) {
+ case KEY_TYPE_S3:
+ id = op_state.get_access_key();
+ if (id.empty()) {
+ set_err_msg(err_msg, "no access key specified");
+ return -ERR_INVALID_ACCESS_KEY;
+ }
+ break;
+ case KEY_TYPE_SWIFT:
+ id = op_state.build_default_swift_kid();
+ if (id.empty()) {
+ set_err_msg(err_msg, "no subuser specified");
+ return -EINVAL;
+ }
+ break;
+ default:
+ set_err_msg(err_msg, "invalid key type");
+ return -ERR_INVALID_KEY_TYPE;
+ }
+
+ if (!op_state.has_existing_key()) {
+ set_err_msg(err_msg, "key does not exist");
+ return -ERR_INVALID_ACCESS_KEY;
+ }
+
+ key_pair.first = id;
+
+ if (key_type == KEY_TYPE_SWIFT) {
+ modify_key.id = id;
+ modify_key.subuser = op_state.get_subuser();
+ } else if (key_type == KEY_TYPE_S3) {
+ kiter = access_keys->find(id);
+ if (kiter != access_keys->end()) {
+ modify_key = kiter->second;
+ }
+ }
+
+ if (op_state.will_gen_secret()) {
+ char secret_key_buf[SECRET_KEY_LEN + 1];
+ int key_buf_size = sizeof(secret_key_buf);
+ gen_rand_alphanumeric_plain(g_ceph_context, secret_key_buf, key_buf_size);
+ key = secret_key_buf;
+ }
+
+ if (key.empty()) {
+ set_err_msg(err_msg, "empty secret key");
+ return -ERR_INVALID_SECRET_KEY;
+ }
+
+ // update the access key with the new secret key
+ modify_key.key = key;
+
+ key_pair.second = modify_key;
+
+
+ if (key_type == KEY_TYPE_S3) {
+ (*access_keys)[id] = modify_key;
+ } else if (key_type == KEY_TYPE_SWIFT) {
+ (*swift_keys)[id] = modify_key;
+ }
+
+ return 0;
+}
+
+int RGWAccessKeyPool::execute_add(const DoutPrefixProvider *dpp,
+ RGWUserAdminOpState& op_state,
+ std::string *err_msg, bool defer_user_update,
+ optional_yield y)
+{
+ int ret = 0;
+
+ std::string subprocess_msg;
+ int key_op = GENERATE_KEY;
+
+ // set the op
+ if (op_state.has_existing_key())
+ key_op = MODIFY_KEY;
+
+ switch (key_op) {
+ case GENERATE_KEY:
+ ret = generate_key(dpp, op_state, y, &subprocess_msg);
+ break;
+ case MODIFY_KEY:
+ ret = modify_key(op_state, &subprocess_msg);
+ break;
+ }
+
+ if (ret < 0) {
+ set_err_msg(err_msg, subprocess_msg);
+ return ret;
+ }
+
+ // store the updated info
+ if (!defer_user_update)
+ ret = user->update(dpp, op_state, err_msg, y);
+
+ if (ret < 0)
+ return ret;
+
+ return 0;
+}
+
+int RGWAccessKeyPool::add(const DoutPrefixProvider *dpp,
+ RGWUserAdminOpState& op_state, optional_yield y,
+ std::string *err_msg)
+{
+ return add(dpp, op_state, err_msg, false, y);
+}
+
+int RGWAccessKeyPool::add(const DoutPrefixProvider *dpp,
+ RGWUserAdminOpState& op_state, std::string *err_msg,
+ bool defer_user_update, optional_yield y)
+{
+ int ret;
+ std::string subprocess_msg;
+
+ ret = check_op(op_state, &subprocess_msg);
+ if (ret < 0) {
+ set_err_msg(err_msg, "unable to parse request, " + subprocess_msg);
+ return ret;
+ }
+
+ ret = execute_add(dpp, op_state, &subprocess_msg, defer_user_update, y);
+ if (ret < 0) {
+ set_err_msg(err_msg, "unable to add access key, " + subprocess_msg);
+ return ret;
+ }
+
+ return 0;
+}
+
+int RGWAccessKeyPool::execute_remove(const DoutPrefixProvider *dpp,
+ RGWUserAdminOpState& op_state,
+ std::string *err_msg,
+ bool defer_user_update,
+ optional_yield y)
+{
+ int ret = 0;
+
+ int key_type = op_state.get_key_type();
+ std::string id = op_state.get_access_key();
+ map<std::string, RGWAccessKey>::iterator kiter;
+ map<std::string, RGWAccessKey> *keys_map;
+
+ if (!op_state.has_existing_key()) {
+ set_err_msg(err_msg, "unable to find access key, with key type: " +
+ key_type_to_str(key_type));
+ return -ERR_INVALID_ACCESS_KEY;
+ }
+
+ if (key_type == KEY_TYPE_S3) {
+ keys_map = access_keys;
+ } else if (key_type == KEY_TYPE_SWIFT) {
+ keys_map = swift_keys;
+ } else {
+ keys_map = NULL;
+ set_err_msg(err_msg, "invalid access key");
+ return -ERR_INVALID_ACCESS_KEY;
+ }
+
+ kiter = keys_map->find(id);
+ if (kiter == keys_map->end()) {
+ set_err_msg(err_msg, "key not found");
+ return -ERR_INVALID_ACCESS_KEY;
+ }
+
+ keys_map->erase(kiter);
+
+ if (!defer_user_update)
+ ret = user->update(dpp, op_state, err_msg, y);
+
+ if (ret < 0)
+ return ret;
+
+ return 0;
+}
+
+int RGWAccessKeyPool::remove(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y,
+ std::string *err_msg)
+{
+ return remove(dpp, op_state, err_msg, false, y);
+}
+
+int RGWAccessKeyPool::remove(const DoutPrefixProvider *dpp,
+ RGWUserAdminOpState& op_state,
+ std::string *err_msg, bool defer_user_update,
+ optional_yield y)
+{
+ int ret;
+
+ std::string subprocess_msg;
+
+ ret = check_op(op_state, &subprocess_msg);
+ if (ret < 0) {
+ set_err_msg(err_msg, "unable to parse request, " + subprocess_msg);
+ return ret;
+ }
+
+ ret = execute_remove(dpp, op_state, &subprocess_msg, defer_user_update, y);
+ if (ret < 0) {
+ set_err_msg(err_msg, "unable to remove access key, " + subprocess_msg);
+ return ret;
+ }
+
+ return 0;
+}
+
+// remove all keys associated with a subuser
+int RGWAccessKeyPool::remove_subuser_keys(const DoutPrefixProvider *dpp,
+ RGWUserAdminOpState& op_state,
+ std::string *err_msg,
+ bool defer_user_update,
+ optional_yield y)
+{
+ int ret = 0;
+
+ if (!op_state.is_populated()) {
+ set_err_msg(err_msg, "user info was not populated");
+ return -EINVAL;
+ }
+
+ if (!op_state.has_subuser()) {
+ set_err_msg(err_msg, "no subuser specified");
+ return -EINVAL;
+ }
+
+ std::string swift_kid = op_state.build_default_swift_kid();
+ if (swift_kid.empty()) {
+ set_err_msg(err_msg, "empty swift access key");
+ return -EINVAL;
+ }
+
+ map<std::string, RGWAccessKey>::iterator kiter;
+ map<std::string, RGWAccessKey> *keys_map;
+
+ // a subuser can have at most one swift key
+ keys_map = swift_keys;
+ kiter = keys_map->find(swift_kid);
+ if (kiter != keys_map->end()) {
+ keys_map->erase(kiter);
+ }
+
+ // a subuser may have multiple s3 key pairs
+ std::string subuser_str = op_state.get_subuser();
+ keys_map = access_keys;
+ RGWUserInfo user_info = op_state.get_user_info();
+ auto user_kiter = user_info.access_keys.begin();
+ for (; user_kiter != user_info.access_keys.end(); ++user_kiter) {
+ if (user_kiter->second.subuser == subuser_str) {
+ kiter = keys_map->find(user_kiter->first);
+ if (kiter != keys_map->end()) {
+ keys_map->erase(kiter);
+ }
+ }
+ }
+
+ if (!defer_user_update)
+ ret = user->update(dpp, op_state, err_msg, y);
+
+ if (ret < 0)
+ return ret;
+
+ return 0;
+}
+
+RGWSubUserPool::RGWSubUserPool(RGWUser *usr)
+{
+ if (!usr) {
+ return;
+ }
+
+ user = usr;
+
+ subusers_allowed = true;
+ driver = user->get_driver();
+}
+
+int RGWSubUserPool::init(RGWUserAdminOpState& op_state)
+{
+ if (!op_state.is_initialized()) {
+ subusers_allowed = false;
+ return -EINVAL;
+ }
+
+ const rgw_user& uid = op_state.get_user_id();
+ if (uid.compare(RGW_USER_ANON_ID) == 0) {
+ subusers_allowed = false;
+ return -EACCES;
+ }
+
+ subuser_map = op_state.get_subusers();
+ if (subuser_map == NULL) {
+ subusers_allowed = false;
+ return -EINVAL;
+ }
+
+ subusers_allowed = true;
+
+ return 0;
+}
+
+bool RGWSubUserPool::exists(std::string subuser)
+{
+ if (subuser.empty())
+ return false;
+
+ if (!subuser_map)
+ return false;
+
+ if (subuser_map->count(subuser))
+ return true;
+
+ return false;
+}
+
+int RGWSubUserPool::check_op(RGWUserAdminOpState& op_state,
+ std::string *err_msg)
+{
+ bool existing = false;
+ std::string subuser = op_state.get_subuser();
+
+ if (!op_state.is_populated()) {
+ set_err_msg(err_msg, "user info was not populated");
+ return -EINVAL;
+ }
+
+ if (!subusers_allowed) {
+ set_err_msg(err_msg, "subusers not allowed for this user");
+ return -EACCES;
+ }
+
+ if (subuser.empty() && !op_state.will_gen_subuser()) {
+ set_err_msg(err_msg, "empty subuser name");
+ return -EINVAL;
+ }
+
+ if (op_state.get_subuser_perm() == RGW_PERM_INVALID) {
+ set_err_msg(err_msg, "invalid subuser access");
+ return -EINVAL;
+ }
+
+ // set key type when it is not set or was set by context
+ if ((op_state.get_key_type() < 0) || op_state.key_type_setbycontext) {
+ op_state.set_key_type(KEY_TYPE_SWIFT);
+ op_state.key_type_setbycontext = true;
+ }
+
+ // check if the subuser exists
+ if (!subuser.empty())
+ existing = exists(subuser);
+
+ op_state.set_existing_subuser(existing);
+
+ return 0;
+}
+
+int RGWSubUserPool::execute_add(const DoutPrefixProvider *dpp,
+ RGWUserAdminOpState& op_state,
+ std::string *err_msg, bool defer_user_update,
+ optional_yield y)
+{
+ int ret = 0;
+ std::string subprocess_msg;
+
+ RGWSubUser subuser;
+ std::pair<std::string, RGWSubUser> subuser_pair;
+ std::string subuser_str = op_state.get_subuser();
+
+ subuser_pair.first = subuser_str;
+
+ // assume a new key should be generated; switch to modify if one already exists
+ if (op_state.has_key_op()) {
+ ret = user->keys.add(dpp, op_state, &subprocess_msg, true, y);
+ if (ret < 0) {
+ set_err_msg(err_msg, "unable to create subuser key, " + subprocess_msg);
+ return ret;
+ }
+ }
+
+ // create the subuser
+ subuser.name = subuser_str;
+
+ if (op_state.has_subuser_perm())
+ subuser.perm_mask = op_state.get_subuser_perm();
+
+ // insert the subuser into user info
+ subuser_pair.second = subuser;
+ subuser_map->insert(subuser_pair);
+
+ // attempt to save the subuser
+ if (!defer_user_update)
+ ret = user->update(dpp, op_state, err_msg, y);
+
+ if (ret < 0)
+ return ret;
+
+ return 0;
+}
+
+int RGWSubUserPool::add(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y,
+ std::string *err_msg)
+{
+ return add(dpp, op_state, err_msg, false, y);
+}
+
+int RGWSubUserPool::add(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg, bool defer_user_update, optional_yield y)
+{
+ std::string subprocess_msg;
+ int ret;
+ int32_t key_type = op_state.get_key_type();
+
+ ret = check_op(op_state, &subprocess_msg);
+ if (ret < 0) {
+ set_err_msg(err_msg, "unable to parse request, " + subprocess_msg);
+ return ret;
+ }
+
+ if (op_state.get_access_key_exist()) {
+ set_err_msg(err_msg, "cannot create existing key");
+ return -ERR_KEY_EXIST;
+ }
+
+ if (key_type == KEY_TYPE_S3 && op_state.get_access_key().empty()) {
+ op_state.set_gen_access();
+ }
+
+ if (op_state.get_secret_key().empty()) {
+ op_state.set_gen_secret();
+ }
+
+ ret = execute_add(dpp, op_state, &subprocess_msg, defer_user_update, y);
+ if (ret < 0) {
+ set_err_msg(err_msg, "unable to create subuser, " + subprocess_msg);
+ return ret;
+ }
+
+ return 0;
+}
+
+int RGWSubUserPool::execute_remove(const DoutPrefixProvider *dpp,
+ RGWUserAdminOpState& op_state,
+ std::string *err_msg, bool defer_user_update,
+ optional_yield y)
+{
+ int ret = 0;
+ std::string subprocess_msg;
+
+ std::string subuser_str = op_state.get_subuser();
+
+ map<std::string, RGWSubUser>::iterator siter;
+ siter = subuser_map->find(subuser_str);
+ if (siter == subuser_map->end()){
+ set_err_msg(err_msg, "subuser not found: " + subuser_str);
+ return -ERR_NO_SUCH_SUBUSER;
+ }
+ if (!op_state.has_existing_subuser()) {
+ set_err_msg(err_msg, "subuser not found: " + subuser_str);
+ return -ERR_NO_SUCH_SUBUSER;
+ }
+
+ // always purge all associated keys
+ user->keys.remove_subuser_keys(dpp, op_state, &subprocess_msg, true, y);
+
+ // remove the subuser from the user info
+ subuser_map->erase(siter);
+
+ // attempt to save the subuser
+ if (!defer_user_update)
+ ret = user->update(dpp, op_state, err_msg, y);
+
+ if (ret < 0)
+ return ret;
+
+ return 0;
+}
+
+int RGWSubUserPool::remove(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y,
+ std::string *err_msg)
+{
+ return remove(dpp, op_state, err_msg, false, y);
+}
+
+int RGWSubUserPool::remove(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg,
+ bool defer_user_update, optional_yield y)
+{
+ std::string subprocess_msg;
+ int ret;
+
+ ret = check_op(op_state, &subprocess_msg);
+ if (ret < 0) {
+ set_err_msg(err_msg, "unable to parse request, " + subprocess_msg);
+ return ret;
+ }
+
+ ret = execute_remove(dpp, op_state, &subprocess_msg, defer_user_update, y);
+ if (ret < 0) {
+ set_err_msg(err_msg, "unable to remove subuser, " + subprocess_msg);
+ return ret;
+ }
+
+ return 0;
+}
+
+int RGWSubUserPool::execute_modify(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg, bool defer_user_update, optional_yield y)
+{
+ int ret = 0;
+ std::string subprocess_msg;
+ std::map<std::string, RGWSubUser>::iterator siter;
+ std::pair<std::string, RGWSubUser> subuser_pair;
+
+ std::string subuser_str = op_state.get_subuser();
+ RGWSubUser subuser;
+
+ if (!op_state.has_existing_subuser()) {
+ set_err_msg(err_msg, "subuser does not exist");
+ return -ERR_NO_SUCH_SUBUSER;
+ }
+
+ subuser_pair.first = subuser_str;
+
+ siter = subuser_map->find(subuser_str);
+ subuser = siter->second;
+
+ if (op_state.has_key_op()) {
+ ret = user->keys.add(dpp, op_state, &subprocess_msg, true, y);
+ if (ret < 0) {
+ set_err_msg(err_msg, "unable to create subuser keys, " + subprocess_msg);
+ return ret;
+ }
+ }
+
+ if (op_state.has_subuser_perm())
+ subuser.perm_mask = op_state.get_subuser_perm();
+
+ subuser_pair.second = subuser;
+
+ subuser_map->erase(siter);
+ subuser_map->insert(subuser_pair);
+
+ // attempt to save the subuser
+ if (!defer_user_update)
+ ret = user->update(dpp, op_state, err_msg, y);
+
+ if (ret < 0)
+ return ret;
+
+ return 0;
+}
+
+int RGWSubUserPool::modify(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y, std::string *err_msg)
+{
+ return RGWSubUserPool::modify(dpp, op_state, y, err_msg, false);
+}
+
+int RGWSubUserPool::modify(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y, std::string *err_msg, bool defer_user_update)
+{
+ std::string subprocess_msg;
+ int ret;
+
+ RGWSubUser subuser;
+
+ ret = check_op(op_state, &subprocess_msg);
+ if (ret < 0) {
+ set_err_msg(err_msg, "unable to parse request, " + subprocess_msg);
+ return ret;
+ }
+
+ ret = execute_modify(dpp, op_state, &subprocess_msg, defer_user_update, y);
+ if (ret < 0) {
+ set_err_msg(err_msg, "unable to modify subuser, " + subprocess_msg);
+ return ret;
+ }
+
+ return 0;
+}
+
+RGWUserCapPool::RGWUserCapPool(RGWUser *usr)
+{
+ if (!usr) {
+ return;
+ }
+ user = usr;
+ caps_allowed = true;
+}
+
+int RGWUserCapPool::init(RGWUserAdminOpState& op_state)
+{
+ if (!op_state.is_initialized()) {
+ caps_allowed = false;
+ return -EINVAL;
+ }
+
+ const rgw_user& uid = op_state.get_user_id();
+ if (uid.compare(RGW_USER_ANON_ID) == 0) {
+ caps_allowed = false;
+ return -EACCES;
+ }
+
+ caps = op_state.get_caps_obj();
+ if (!caps) {
+ caps_allowed = false;
+ return -ERR_INVALID_CAP;
+ }
+
+ caps_allowed = true;
+
+ return 0;
+}
+
+int RGWUserCapPool::add(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y,
+ std::string *err_msg)
+{
+ return add(dpp, op_state, err_msg, false, y);
+}
+
+int RGWUserCapPool::add(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg,
+ bool defer_save, optional_yield y)
+{
+ int ret = 0;
+ std::string caps_str = op_state.get_caps();
+
+ if (!op_state.is_populated()) {
+ set_err_msg(err_msg, "user info was not populated");
+ return -EINVAL;
+ }
+
+ if (!caps_allowed) {
+ set_err_msg(err_msg, "caps not allowed for this user");
+ return -EACCES;
+ }
+
+ if (caps_str.empty()) {
+ set_err_msg(err_msg, "empty user caps");
+ return -ERR_INVALID_CAP;
+ }
+
+ int r = caps->add_from_string(caps_str);
+ if (r < 0) {
+ set_err_msg(err_msg, "unable to add caps: " + caps_str);
+ return r;
+ }
+
+ if (!defer_save)
+ ret = user->update(dpp, op_state, err_msg, y);
+
+ if (ret < 0)
+ return ret;
+
+ return 0;
+}
+
+int RGWUserCapPool::remove(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y,
+ std::string *err_msg)
+{
+ return remove(dpp, op_state, err_msg, false, y);
+}
+
+int RGWUserCapPool::remove(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg,
+ bool defer_save, optional_yield y)
+{
+ int ret = 0;
+
+ std::string caps_str = op_state.get_caps();
+
+ if (!op_state.is_populated()) {
+ set_err_msg(err_msg, "user info was not populated");
+ return -EINVAL;
+ }
+
+ if (!caps_allowed) {
+ set_err_msg(err_msg, "caps not allowed for this user");
+ return -EACCES;
+ }
+
+ if (caps_str.empty()) {
+ set_err_msg(err_msg, "empty user caps");
+ return -ERR_INVALID_CAP;
+ }
+
+ int r = caps->remove_from_string(caps_str);
+ if (r < 0) {
+ set_err_msg(err_msg, "unable to remove caps: " + caps_str);
+ return r;
+ }
+
+ if (!defer_save)
+ ret = user->update(dpp, op_state, err_msg, y);
+
+ if (ret < 0)
+ return ret;
+
+ return 0;
+}
+
+RGWUser::RGWUser() : caps(this), keys(this), subusers(this)
+{
+ init_default();
+}
+
+int RGWUser::init(const DoutPrefixProvider *dpp, rgw::sal::Driver* _driver,
+ RGWUserAdminOpState& op_state, optional_yield y)
+{
+ init_default();
+ int ret = init_storage(_driver);
+ if (ret < 0)
+ return ret;
+
+ ret = init(dpp, op_state, y);
+ if (ret < 0)
+ return ret;
+
+ return 0;
+}
+
+void RGWUser::init_default()
+{
+ // use anonymous user info as a placeholder
+ rgw_get_anon_user(old_info);
+ user_id = RGW_USER_ANON_ID;
+
+ clear_populated();
+}
+
+int RGWUser::init_storage(rgw::sal::Driver* _driver)
+{
+ if (!_driver) {
+ return -EINVAL;
+ }
+
+ driver = _driver;
+
+ clear_populated();
+
+ /* API wrappers */
+ keys = RGWAccessKeyPool(this);
+ caps = RGWUserCapPool(this);
+ subusers = RGWSubUserPool(this);
+
+ return 0;
+}
+
+int RGWUser::init(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y)
+{
+ bool found = false;
+ std::string swift_user;
+ user_id = op_state.get_user_id();
+ std::string user_email = op_state.get_user_email();
+ std::string access_key = op_state.get_access_key();
+ std::string subuser = op_state.get_subuser();
+
+ int key_type = op_state.get_key_type();
+ if (key_type == KEY_TYPE_SWIFT) {
+ swift_user = op_state.get_access_key();
+ access_key.clear();
+ }
+
+ std::unique_ptr<rgw::sal::User> user;
+
+ clear_populated();
+
+ if (user_id.empty() && !subuser.empty()) {
+ size_t pos = subuser.find(':');
+ if (pos != string::npos) {
+ user_id = subuser.substr(0, pos);
+ op_state.set_user_id(user_id);
+ }
+ }
+
+ if (!user_id.empty() && (user_id.compare(RGW_USER_ANON_ID) != 0)) {
+ user = driver->get_user(user_id);
+ found = (user->load_user(dpp, y) >= 0);
+ op_state.found_by_uid = found;
+ }
+ if (driver->ctx()->_conf.get_val<bool>("rgw_user_unique_email")) {
+ if (!user_email.empty() && !found) {
+ found = (driver->get_user_by_email(dpp, user_email, y, &user) >= 0);
+ op_state.found_by_email = found;
+ }
+ }
+ if (!swift_user.empty() && !found) {
+ found = (driver->get_user_by_swift(dpp, swift_user, y, &user) >= 0);
+ op_state.found_by_key = found;
+ }
+ if (!access_key.empty() && !found) {
+ found = (driver->get_user_by_access_key(dpp, access_key, y, &user) >= 0);
+ op_state.found_by_key = found;
+ }
+
+ op_state.set_existing_user(found);
+ if (found) {
+ op_state.set_user_info(user->get_info());
+ op_state.set_populated();
+ op_state.objv = user->get_version_tracker();
+ op_state.set_user_version_tracker(user->get_version_tracker());
+
+ old_info = user->get_info();
+ set_populated();
+ }
+
+ if (user_id.empty()) {
+ user_id = user->get_id();
+ }
+ op_state.set_initialized();
+
+ // this may have been called by a helper object
+ int ret = init_members(op_state);
+ if (ret < 0)
+ return ret;
+
+ return 0;
+}
+
+int RGWUser::init_members(RGWUserAdminOpState& op_state)
+{
+ int ret = 0;
+
+ ret = keys.init(op_state);
+ if (ret < 0)
+ return ret;
+
+ ret = subusers.init(op_state);
+ if (ret < 0)
+ return ret;
+
+ ret = caps.init(op_state);
+ if (ret < 0)
+ return ret;
+
+ return 0;
+}
+
+int RGWUser::update(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg,
+ optional_yield y)
+{
+ int ret;
+ std::string subprocess_msg;
+ rgw::sal::User* user = op_state.get_user();
+
+ if (!driver) {
+ set_err_msg(err_msg, "couldn't initialize storage");
+ return -EINVAL;
+ }
+
+ // if op_state.op_access_keys is not empty, the most recent keys have been fetched from the master zone
+ if(!op_state.op_access_keys.empty()) {
+ auto user_access_keys = op_state.get_access_keys();
+ *(user_access_keys) = op_state.op_access_keys;
+ }
+
+ RGWUserInfo *pold_info = (is_populated() ? &old_info : nullptr);
+
+ ret = user->store_user(dpp, y, false, pold_info);
+ op_state.objv = user->get_version_tracker();
+ op_state.set_user_version_tracker(user->get_version_tracker());
+
+ if (ret < 0) {
+ set_err_msg(err_msg, "unable to store user info");
+ return ret;
+ }
+
+ old_info = user->get_info();
+ set_populated();
+
+ return 0;
+}
+
+int RGWUser::check_op(RGWUserAdminOpState& op_state, std::string *err_msg)
+{
+ int ret = 0;
+ const rgw_user& uid = op_state.get_user_id();
+
+ if (uid.compare(RGW_USER_ANON_ID) == 0) {
+ set_err_msg(err_msg, "unable to perform operations on the anonymous user");
+ return -EINVAL;
+ }
+
+ if (is_populated() && user_id.compare(uid) != 0) {
+ set_err_msg(err_msg, "user id mismatch, operation id: " + uid.to_str()
+ + " does not match: " + user_id.to_str());
+
+ return -EINVAL;
+ }
+
+ ret = rgw_validate_tenant_name(uid.tenant);
+ if (ret) {
+ set_err_msg(err_msg,
+ "invalid tenant only alphanumeric and _ characters are allowed");
+ return ret;
+ }
+
+ // set key type when it is not set or was set by context
+ if ((op_state.get_key_type() < 0) || op_state.key_type_setbycontext) {
+ op_state.set_key_type(KEY_TYPE_S3);
+ op_state.key_type_setbycontext = true;
+ }
+
+ return 0;
+}
+
+// update swift_keys with new user id
+static void rename_swift_keys(const rgw_user& user,
+ std::map<std::string, RGWAccessKey>& keys)
+{
+ std::string user_id;
+ user.to_str(user_id);
+
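+ // move the existing entries aside and rebuild the map so each key is
+ // re-keyed and re-labelled with the new "<user>:<subuser>" id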
+ auto modify_keys = std::move(keys);
+ for ([[maybe_unused]] auto& [k, key] : modify_keys) {
+ std::string id = user_id + ":" + key.subuser;
+ key.id = id;
+ keys[id] = std::move(key);
+ }
+}
+
+int RGWUser::execute_rename(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg, optional_yield y)
+{
+ int ret;
+ bool populated = op_state.is_populated();
+
+ if (!op_state.has_existing_user() && !populated) {
+ set_err_msg(err_msg, "user not found");
+ return -ENOENT;
+ }
+
+ if (!populated) {
+ ret = init(dpp, op_state, y);
+ if (ret < 0) {
+ set_err_msg(err_msg, "unable to retrieve user info");
+ return ret;
+ }
+ }
+
+ std::unique_ptr<rgw::sal::User> old_user = driver->get_user(op_state.get_user_info().user_id);
+ std::unique_ptr<rgw::sal::User> new_user = driver->get_user(op_state.get_new_uid());
+ if (old_user->get_tenant() != new_user->get_tenant()) {
+ set_err_msg(err_msg, "users have to be under the same tenant namespace "
+ + old_user->get_tenant() + " != " + new_user->get_tenant());
+ return -EINVAL;
+ }
+
+ // create a stub user and write only the uid index and buckets object
+ std::unique_ptr<rgw::sal::User> user;
+ user = driver->get_user(new_user->get_id());
+
+ const bool exclusive = !op_state.get_overwrite_new_user(); // overwrite if requested
+
+ ret = user->store_user(dpp, y, exclusive);
+ if (ret == -EEXIST) {
+ set_err_msg(err_msg, "user name given by --new-uid already exists");
+ return ret;
+ }
+ if (ret < 0) {
+ set_err_msg(err_msg, "unable to store new user info");
+ return ret;
+ }
+
+ RGWAccessControlPolicy policy_instance;
+ policy_instance.create_default(new_user->get_id(), old_user->get_display_name());
+
+ //unlink and link buckets to new user
+ string marker;
+ CephContext *cct = driver->ctx();
+ size_t max_buckets = cct->_conf->rgw_list_buckets_max_chunk;
+ rgw::sal::BucketList buckets;
+
+ do {
+ ret = old_user->list_buckets(dpp, marker, "", max_buckets, false, buckets, y);
+ if (ret < 0) {
+ set_err_msg(err_msg, "unable to list user buckets");
+ return ret;
+ }
+
+ auto& m = buckets.get_buckets();
+
+ for (auto it = m.begin(); it != m.end(); ++it) {
+ auto& bucket = it->second;
+ marker = it->first;
+
+ ret = bucket->load_bucket(dpp, y);
+ if (ret < 0) {
+ set_err_msg(err_msg, "failed to fetch bucket info for bucket=" + bucket->get_name());
+ return ret;
+ }
+
+ ret = bucket->set_acl(dpp, policy_instance, y);
+ if (ret < 0) {
+ set_err_msg(err_msg, "failed to set acl on bucket " + bucket->get_name());
+ return ret;
+ }
+
+ ret = rgw_chown_bucket_and_objects(driver, bucket.get(), new_user.get(),
+ std::string(), nullptr, dpp, y);
+ if (ret < 0) {
+ set_err_msg(err_msg, "failed to run bucket chown" + cpp_strerror(-ret));
+ return ret;
+ }
+ }
+
+ } while (buckets.is_truncated());
+
+ // update the 'stub user' with all of the other fields and rewrite all of the
+ // associated index objects
+ RGWUserInfo& user_info = op_state.get_user_info();
+ user_info.user_id = new_user->get_id();
+ op_state.objv = user->get_version_tracker();
+ op_state.set_user_version_tracker(user->get_version_tracker());
+
+ rename_swift_keys(new_user->get_id(), user_info.swift_keys);
+
+ return update(dpp, op_state, err_msg, y);
+}
+
+int RGWUser::execute_add(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg,
+ optional_yield y)
+{
+ const rgw_user& uid = op_state.get_user_id();
+ std::string user_email = op_state.get_user_email();
+ std::string display_name = op_state.get_display_name();
+
+ // set the user info
+ RGWUserInfo user_info;
+ user_id = uid;
+ user_info.user_id = user_id;
+ user_info.display_name = display_name;
+ user_info.type = TYPE_RGW;
+
+ if (!user_email.empty())
+ user_info.user_email = user_email;
+
+ CephContext *cct = driver->ctx();
+ if (op_state.max_buckets_specified) {
+ user_info.max_buckets = op_state.get_max_buckets();
+ } else {
+ user_info.max_buckets =
+ cct->_conf.get_val<int64_t>("rgw_user_max_buckets");
+ }
+
+ user_info.suspended = op_state.get_suspension_status();
+ user_info.admin = op_state.admin;
+ user_info.system = op_state.system;
+
+ if (op_state.op_mask_specified)
+ user_info.op_mask = op_state.get_op_mask();
+
+ if (op_state.has_bucket_quota()) {
+ user_info.quota.bucket_quota = op_state.get_bucket_quota();
+ } else {
+ rgw_apply_default_bucket_quota(user_info.quota.bucket_quota, cct->_conf);
+ }
+
+ if (op_state.temp_url_key_specified) {
+ map<int, string>::iterator iter;
+ for (iter = op_state.temp_url_keys.begin();
+ iter != op_state.temp_url_keys.end(); ++iter) {
+ user_info.temp_url_keys[iter->first] = iter->second;
+ }
+ }
+
+ if (op_state.has_user_quota()) {
+ user_info.quota.user_quota = op_state.get_user_quota();
+ } else {
+ rgw_apply_default_user_quota(user_info.quota.user_quota, cct->_conf);
+ }
+
+ if (op_state.default_placement_specified) {
+ user_info.default_placement = op_state.default_placement;
+ }
+
+ if (op_state.placement_tags_specified) {
+ user_info.placement_tags = op_state.placement_tags;
+ }
+
+ // update the request
+ op_state.set_user_info(user_info);
+ op_state.set_populated();
+
+ // update the helper objects
+ int ret = init_members(op_state);
+ if (ret < 0) {
+ set_err_msg(err_msg, "unable to initialize user");
+ return ret;
+ }
+
+ // see if we need to add an access key
+ std::string subprocess_msg;
+ bool defer_user_update = true;
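+ // defer the user info write; the final update() below persists keys, caps and info in one pass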
+ if (op_state.has_key_op()) {
+ ret = keys.add(dpp, op_state, &subprocess_msg, defer_user_update, y);
+ if (ret < 0) {
+ set_err_msg(err_msg, "unable to create access key, " + subprocess_msg);
+ return ret;
+ }
+ }
+
+ // see if we need to add some caps
+ if (op_state.has_caps_op()) {
+ ret = caps.add(dpp, op_state, &subprocess_msg, defer_user_update, y);
+ if (ret < 0) {
+ set_err_msg(err_msg, "unable to add user capabilities, " + subprocess_msg);
+ return ret;
+ }
+ }
+
+ ret = update(dpp, op_state, err_msg, y);
+ if (ret < 0)
+ return ret;
+
+ return 0;
+}
+
+int RGWUser::add(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y, std::string *err_msg)
+{
+ std::string subprocess_msg;
+ int ret = user_add_helper(op_state, &subprocess_msg);
+ if (ret != 0) {
+ set_err_msg(err_msg, "unable to parse parameters, " + subprocess_msg);
+ return ret;
+ }
+
+ ret = check_op(op_state, &subprocess_msg);
+ if (ret < 0) {
+ set_err_msg(err_msg, "unable to parse parameters, " + subprocess_msg);
+ return ret;
+ }
+
+ ret = execute_add(dpp, op_state, &subprocess_msg, y);
+ if (ret < 0) {
+ set_err_msg(err_msg, "unable to create user, " + subprocess_msg);
+ return ret;
+ }
+
+ return 0;
+}
+
+int RGWUser::rename(RGWUserAdminOpState& op_state, optional_yield y, const DoutPrefixProvider *dpp, std::string *err_msg)
+{
+ std::string subprocess_msg;
+ int ret;
+
+ ret = check_op(op_state, &subprocess_msg);
+ if (ret < 0) {
+ set_err_msg(err_msg, "unable to parse parameters, " + subprocess_msg);
+ return ret;
+ }
+
+ ret = execute_rename(dpp, op_state, &subprocess_msg, y);
+ if (ret < 0) {
+ set_err_msg(err_msg, "unable to rename user, " + subprocess_msg);
+ return ret;
+ }
+
+ return 0;
+}
+
+int RGWUser::execute_remove(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg, optional_yield y)
+{
+ int ret;
+
+ bool purge_data = op_state.will_purge_data();
+ rgw::sal::User* user = op_state.get_user();
+
+ if (!op_state.has_existing_user()) {
+ set_err_msg(err_msg, "user does not exist");
+ return -ENOENT;
+ }
+
+ rgw::sal::BucketList buckets;
+ string marker;
+ CephContext *cct = driver->ctx();
+ size_t max_buckets = cct->_conf->rgw_list_buckets_max_chunk;
+ do {
+ ret = user->list_buckets(dpp, marker, string(), max_buckets, false, buckets, y);
+ if (ret < 0) {
+ set_err_msg(err_msg, "unable to read user bucket info");
+ return ret;
+ }
+
+ auto& m = buckets.get_buckets();
+ if (!m.empty() && !purge_data) {
+ set_err_msg(err_msg, "must specify purge data to remove user with buckets");
+ return -EEXIST; // change to code that maps to 409: conflict
+ }
+
+ for (auto it = m.begin(); it != m.end(); ++it) {
+ ret = it->second->remove_bucket(dpp, true, false, nullptr, y);
+ if (ret < 0) {
+ set_err_msg(err_msg, "unable to delete user data");
+ return ret;
+ }
+
+ marker = it->first;
+ }
+
+ } while (buckets.is_truncated());
+
+ ret = user->remove_user(dpp, y);
+ if (ret < 0) {
+ set_err_msg(err_msg, "unable to remove user from RADOS");
+ return ret;
+ }
+
+ op_state.clear_populated();
+ clear_populated();
+
+ return 0;
+}
+
+int RGWUser::remove(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y, std::string *err_msg)
+{
+ std::string subprocess_msg;
+ int ret;
+
+ ret = check_op(op_state, &subprocess_msg);
+ if (ret < 0) {
+ set_err_msg(err_msg, "unable to parse parameters, " + subprocess_msg);
+ return ret;
+ }
+
+ ret = execute_remove(dpp, op_state, &subprocess_msg, y);
+ if (ret < 0) {
+ set_err_msg(err_msg, "unable to remove user, " + subprocess_msg);
+ return ret;
+ }
+
+ return 0;
+}
+
+int RGWUser::execute_modify(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg, optional_yield y)
+{
+ bool populated = op_state.is_populated();
+ int ret = 0;
+ std::string subprocess_msg;
+ std::string op_email = op_state.get_user_email();
+ std::string display_name = op_state.get_display_name();
+
+ RGWUserInfo user_info;
+ std::unique_ptr<rgw::sal::User> duplicate_check;
+
+ // ensure that the user info has been populated or is populate-able
+ if (!op_state.has_existing_user() && !populated) {
+ set_err_msg(err_msg, "user not found");
+ return -ENOENT;
+ }
+
+ // if the user info hasn't been populated yet, attempt to fetch it
+ if (!populated) {
+ ret = init(dpp, op_state, y);
+ if (ret < 0) {
+ set_err_msg(err_msg, "unable to retrieve user info");
+ return ret;
+ }
+ }
+
+ // ensure that we can modify the user's attributes
+ if (user_id.compare(RGW_USER_ANON_ID) == 0) {
+ set_err_msg(err_msg, "unable to modify anonymous user's info");
+ return -EACCES;
+ }
+
+ user_info = old_info;
+
+ std::string old_email = old_info.user_email;
+ if (!op_email.empty()) {
+ // make sure we are not adding a duplicate email
+ if (old_email != op_email) {
+ ret = driver->get_user_by_email(dpp, op_email, y, &duplicate_check);
+ if (ret >= 0 && duplicate_check->get_id().compare(user_id) != 0) {
+ set_err_msg(err_msg, "cannot add duplicate email");
+ return -ERR_EMAIL_EXIST;
+ }
+ }
+ user_info.user_email = op_email;
+ } else if (op_email.empty() && op_state.user_email_specified) {
+ ldpp_dout(dpp, 10) << "removing email index: " << user_info.user_email << dendl;
+ /* will be physically removed later when calling update() */
+ user_info.user_email.clear();
+ }
+
+ // update the remaining user info
+ if (!display_name.empty())
+ user_info.display_name = display_name;
+
+ if (op_state.max_buckets_specified)
+ user_info.max_buckets = op_state.get_max_buckets();
+
+ if (op_state.admin_specified)
+ user_info.admin = op_state.admin;
+
+ if (op_state.system_specified)
+ user_info.system = op_state.system;
+
+ if (op_state.temp_url_key_specified) {
+ map<int, string>::iterator iter;
+ for (iter = op_state.temp_url_keys.begin();
+ iter != op_state.temp_url_keys.end(); ++iter) {
+ user_info.temp_url_keys[iter->first] = iter->second;
+ }
+ }
+
+ if (op_state.op_mask_specified)
+ user_info.op_mask = op_state.get_op_mask();
+
+ if (op_state.has_bucket_quota())
+ user_info.quota.bucket_quota = op_state.get_bucket_quota();
+
+ if (op_state.has_user_quota())
+ user_info.quota.user_quota = op_state.get_user_quota();
+
+ if (op_state.has_suspension_op()) {
+ __u8 suspended = op_state.get_suspension_status();
+ user_info.suspended = suspended;
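+ // a suspension change also has to be propagated to every bucket owned by the user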
+
+ rgw::sal::BucketList buckets;
+
+ if (user_id.empty()) {
+ set_err_msg(err_msg, "empty user id passed...aborting");
+ return -EINVAL;
+ }
+
+ string marker;
+ CephContext *cct = driver->ctx();
+ size_t max_buckets = cct->_conf->rgw_list_buckets_max_chunk;
+ std::unique_ptr<rgw::sal::User> user = driver->get_user(user_id);
+ do {
+ ret = user->list_buckets(dpp, marker, string(), max_buckets, false, buckets, y);
+ if (ret < 0) {
+ set_err_msg(err_msg, "could not get buckets for uid: " + user_id.to_str());
+ return ret;
+ }
+
+ auto& m = buckets.get_buckets();
+
+ vector<rgw_bucket> bucket_names;
+ for (auto iter = m.begin(); iter != m.end(); ++iter) {
+ auto& bucket = iter->second;
+ bucket_names.push_back(bucket->get_key());
+
+ marker = iter->first;
+ }
+
+ ret = driver->set_buckets_enabled(dpp, bucket_names, !suspended);
+ if (ret < 0) {
+ set_err_msg(err_msg, "failed to modify bucket");
+ return ret;
+ }
+
+ } while (buckets.is_truncated());
+ }
+
+ if (op_state.mfa_ids_specified) {
+ user_info.mfa_ids = op_state.mfa_ids;
+ }
+
+ if (op_state.default_placement_specified) {
+ user_info.default_placement = op_state.default_placement;
+ }
+
+ if (op_state.placement_tags_specified) {
+ user_info.placement_tags = op_state.placement_tags;
+ }
+
+ op_state.set_user_info(user_info);
+
+ // if we're supposed to modify keys, do so
+ if (op_state.has_key_op()) {
+ ret = keys.add(dpp, op_state, &subprocess_msg, true, y);
+ if (ret < 0) {
+ set_err_msg(err_msg, "unable to create or modify keys, " + subprocess_msg);
+ return ret;
+ }
+ }
+
+ ret = update(dpp, op_state, err_msg, y);
+ if (ret < 0)
+ return ret;
+
+ return 0;
+}
+
+int RGWUser::modify(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y, std::string *err_msg)
+{
+ std::string subprocess_msg;
+ int ret;
+
+ ret = check_op(op_state, &subprocess_msg);
+ if (ret < 0) {
+ set_err_msg(err_msg, "unable to parse parameters, " + subprocess_msg);
+ return ret;
+ }
+
+ ret = execute_modify(dpp, op_state, &subprocess_msg, y);
+ if (ret < 0) {
+ set_err_msg(err_msg, "unable to modify user, " + subprocess_msg);
+ return ret;
+ }
+
+ return 0;
+}
+
+int RGWUser::info(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, RGWUserInfo& fetched_info,
+ optional_yield y, std::string *err_msg)
+{
+ int ret = init(dpp, op_state, y);
+ if (ret < 0) {
+ set_err_msg(err_msg, "unable to fetch user info");
+ return ret;
+ }
+
+ fetched_info = op_state.get_user_info();
+
+ return 0;
+}
+
+int RGWUser::info(RGWUserInfo& fetched_info, std::string *err_msg)
+{
+ if (!is_populated()) {
+ set_err_msg(err_msg, "no user info saved");
+ return -EINVAL;
+ }
+
+ fetched_info = old_info;
+
+ return 0;
+}
+
+int RGWUser::list(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, RGWFormatterFlusher& flusher)
+{
+ Formatter *formatter = flusher.get_formatter();
+ void *handle = nullptr;
+ std::string metadata_key = "user";
+ if (op_state.max_entries > 1000) {
+ op_state.max_entries = 1000;
+ }
+
+ int ret = driver->meta_list_keys_init(dpp, metadata_key, op_state.marker, &handle);
+ if (ret < 0) {
+ return ret;
+ }
+
+ bool truncated = false;
+ uint64_t count = 0;
+ uint64_t left = 0;
+ flusher.start(0);
+
+ // open the result object section
+ formatter->open_object_section("result");
+
+ // open the user id list array section
+ formatter->open_array_section("keys");
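+ // page through the "user" metadata keys until max_entries is reached or the listing is exhausted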
+ do {
+ std::list<std::string> keys;
+ left = op_state.max_entries - count;
+ ret = driver->meta_list_keys_next(dpp, handle, left, keys, &truncated);
+ if (ret < 0 && ret != -ENOENT) {
+ return ret;
+ }
+ if (ret != -ENOENT) {
+ for (std::list<std::string>::iterator iter = keys.begin(); iter != keys.end(); ++iter) {
+ formatter->dump_string("key", *iter);
+ ++count;
+ }
+ }
+ } while (truncated && left > 0);
+ // close user id list section
+ formatter->close_section();
+
+ formatter->dump_bool("truncated", truncated);
+ formatter->dump_int("count", count);
+ if (truncated) {
+ formatter->dump_string("marker", driver->meta_get_marker(handle));
+ }
+
+ // close result object section
+ formatter->close_section();
+
+ driver->meta_list_keys_complete(handle);
+
+ flusher.flush();
+ return 0;
+}
+
+int RGWUserAdminOp_User::list(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver, RGWUserAdminOpState& op_state,
+ RGWFormatterFlusher& flusher)
+{
+ RGWUser user;
+
+ int ret = user.init_storage(driver);
+ if (ret < 0)
+ return ret;
+
+ ret = user.list(dpp, op_state, flusher);
+ if (ret < 0)
+ return ret;
+
+ return 0;
+}
+
+int RGWUserAdminOp_User::info(const DoutPrefixProvider *dpp,
+ rgw::sal::Driver* driver, RGWUserAdminOpState& op_state,
+ RGWFormatterFlusher& flusher,
+ optional_yield y)
+{
+ RGWUserInfo info;
+ RGWUser user;
+ std::unique_ptr<rgw::sal::User> ruser;
+
+ int ret = user.init(dpp, driver, op_state, y);
+ if (ret < 0)
+ return ret;
+
+ if (!op_state.has_existing_user())
+ return -ERR_NO_SUCH_USER;
+
+ Formatter *formatter = flusher.get_formatter();
+
+ ret = user.info(info, NULL);
+ if (ret < 0)
+ return ret;
+
+ ruser = driver->get_user(info.user_id);
+
+ if (op_state.sync_stats) {
+ ret = rgw_user_sync_all_stats(dpp, driver, ruser.get(), y);
+ if (ret < 0) {
+ return ret;
+ }
+ }
+
+ RGWStorageStats stats;
+ RGWStorageStats *arg_stats = NULL;
+ if (op_state.fetch_stats) {
+ int ret = ruser->read_stats(dpp, y, &stats);
+ if (ret < 0 && ret != -ENOENT) {
+ return ret;
+ }
+
+ arg_stats = &stats;
+ }
+
+ if (formatter) {
+ flusher.start(0);
+
+ dump_user_info(formatter, info, arg_stats);
+ flusher.flush();
+ }
+
+ return 0;
+}
+
+int RGWUserAdminOp_User::create(const DoutPrefixProvider *dpp,
+ rgw::sal::Driver* driver,
+ RGWUserAdminOpState& op_state,
+ RGWFormatterFlusher& flusher, optional_yield y)
+{
+ RGWUserInfo info;
+ RGWUser user;
+ int ret = user.init(dpp, driver, op_state, y);
+ if (ret < 0)
+ return ret;
+
+ Formatter *formatter = flusher.get_formatter();
+
+ ret = user.add(dpp, op_state, y, NULL);
+ if (ret < 0) {
+ if (ret == -EEXIST)
+ ret = -ERR_USER_EXIST;
+ return ret;
+ }
+
+ ret = user.info(info, NULL);
+ if (ret < 0)
+ return ret;
+
+ if (formatter) {
+ flusher.start(0);
+
+ dump_user_info(formatter, info);
+ flusher.flush();
+ }
+
+ return 0;
+}
+
+int RGWUserAdminOp_User::modify(const DoutPrefixProvider *dpp,
+ rgw::sal::Driver* driver,
+ RGWUserAdminOpState& op_state,
+ RGWFormatterFlusher& flusher, optional_yield y)
+{
+ RGWUserInfo info;
+ RGWUser user;
+ int ret = user.init(dpp, driver, op_state, y);
+ if (ret < 0)
+ return ret;
+ Formatter *formatter = flusher.get_formatter();
+
+ ret = user.modify(dpp, op_state, y, NULL);
+ if (ret < 0) {
+ if (ret == -ENOENT)
+ ret = -ERR_NO_SUCH_USER;
+ return ret;
+ }
+
+ ret = user.info(info, NULL);
+ if (ret < 0)
+ return ret;
+
+ if (formatter) {
+ flusher.start(0);
+
+ dump_user_info(formatter, info);
+ flusher.flush();
+ }
+
+ return 0;
+}
+
+int RGWUserAdminOp_User::remove(const DoutPrefixProvider *dpp,
+ rgw::sal::Driver* driver, RGWUserAdminOpState& op_state,
+ RGWFormatterFlusher& flusher, optional_yield y)
+{
+ RGWUserInfo info;
+ RGWUser user;
+ int ret = user.init(dpp, driver, op_state, y);
+ if (ret < 0)
+ return ret;
+
+
+ ret = user.remove(dpp, op_state, y, NULL);
+
+ if (ret == -ENOENT)
+ ret = -ERR_NO_SUCH_USER;
+ return ret;
+}
+
+int RGWUserAdminOp_Subuser::create(const DoutPrefixProvider *dpp,
+ rgw::sal::Driver* driver,
+ RGWUserAdminOpState& op_state,
+ RGWFormatterFlusher& flusher,
+ optional_yield y)
+{
+ RGWUserInfo info;
+ RGWUser user;
+ int ret = user.init(dpp, driver, op_state, y);
+ if (ret < 0)
+ return ret;
+
+ if (!op_state.has_existing_user())
+ return -ERR_NO_SUCH_USER;
+
+ Formatter *formatter = flusher.get_formatter();
+
+ ret = user.subusers.add(dpp, op_state, y, NULL);
+ if (ret < 0)
+ return ret;
+
+ ret = user.info(info, NULL);
+ if (ret < 0)
+ return ret;
+
+ if (formatter) {
+ flusher.start(0);
+
+ dump_subusers_info(formatter, info);
+ flusher.flush();
+ }
+
+ return 0;
+}
+
+int RGWUserAdminOp_Subuser::modify(const DoutPrefixProvider *dpp,
+ rgw::sal::Driver* driver, RGWUserAdminOpState& op_state,
+ RGWFormatterFlusher& flusher, optional_yield y)
+{
+ RGWUserInfo info;
+ RGWUser user;
+ int ret = user.init(dpp, driver, op_state, y);
+ if (ret < 0)
+ return ret;
+
+ if (!op_state.has_existing_user())
+ return -ERR_NO_SUCH_USER;
+
+ Formatter *formatter = flusher.get_formatter();
+
+ ret = user.subusers.modify(dpp, op_state, y, NULL);
+ if (ret < 0)
+ return ret;
+
+ ret = user.info(info, NULL);
+ if (ret < 0)
+ return ret;
+
+ if (formatter) {
+ flusher.start(0);
+
+ dump_subusers_info(formatter, info);
+ flusher.flush();
+ }
+
+ return 0;
+}
+
+int RGWUserAdminOp_Subuser::remove(const DoutPrefixProvider *dpp,
+ rgw::sal::Driver* driver,
+ RGWUserAdminOpState& op_state,
+ RGWFormatterFlusher& flusher,
+ optional_yield y)
+{
+ RGWUserInfo info;
+ RGWUser user;
+ int ret = user.init(dpp, driver, op_state, y);
+ if (ret < 0)
+ return ret;
+
+
+ if (!op_state.has_existing_user())
+ return -ERR_NO_SUCH_USER;
+
+ ret = user.subusers.remove(dpp, op_state, y, NULL);
+ if (ret < 0)
+ return ret;
+
+ return 0;
+}
+
+int RGWUserAdminOp_Key::create(const DoutPrefixProvider *dpp,
+ rgw::sal::Driver* driver, RGWUserAdminOpState& op_state,
+ RGWFormatterFlusher& flusher,
+ optional_yield y)
+{
+ RGWUserInfo info;
+ RGWUser user;
+ int ret = user.init(dpp, driver, op_state, y);
+ if (ret < 0)
+ return ret;
+
+ if (!op_state.has_existing_user())
+ return -ERR_NO_SUCH_USER;
+
+ Formatter *formatter = flusher.get_formatter();
+
+ ret = user.keys.add(dpp, op_state, y, NULL);
+ if (ret < 0)
+ return ret;
+
+ ret = user.info(info, NULL);
+ if (ret < 0)
+ return ret;
+
+ if (formatter) {
+ flusher.start(0);
+
+ int key_type = op_state.get_key_type();
+
+ if (key_type == KEY_TYPE_SWIFT)
+ dump_swift_keys_info(formatter, info);
+
+ else if (key_type == KEY_TYPE_S3)
+ dump_access_keys_info(formatter, info);
+
+ flusher.flush();
+ }
+
+ return 0;
+}
+
+int RGWUserAdminOp_Key::remove(const DoutPrefixProvider *dpp,
+ rgw::sal::Driver* driver,
+ RGWUserAdminOpState& op_state,
+ RGWFormatterFlusher& flusher,
+ optional_yield y)
+{
+ RGWUserInfo info;
+ RGWUser user;
+ int ret = user.init(dpp, driver, op_state, y);
+ if (ret < 0)
+ return ret;
+
+ if (!op_state.has_existing_user())
+ return -ERR_NO_SUCH_USER;
+
+
+ ret = user.keys.remove(dpp, op_state, y, NULL);
+ if (ret < 0)
+ return ret;
+
+ return 0;
+}
+
+int RGWUserAdminOp_Caps::add(const DoutPrefixProvider *dpp,
+ rgw::sal::Driver* driver,
+ RGWUserAdminOpState& op_state,
+ RGWFormatterFlusher& flusher, optional_yield y)
+{
+ RGWUserInfo info;
+ RGWUser user;
+ int ret = user.init(dpp, driver, op_state, y);
+ if (ret < 0)
+ return ret;
+
+ if (!op_state.has_existing_user())
+ return -ERR_NO_SUCH_USER;
+
+ Formatter *formatter = flusher.get_formatter();
+
+ ret = user.caps.add(dpp, op_state, y, NULL);
+ if (ret < 0)
+ return ret;
+
+ ret = user.info(info, NULL);
+ if (ret < 0)
+ return ret;
+
+ if (formatter) {
+ flusher.start(0);
+
+ info.caps.dump(formatter);
+ flusher.flush();
+ }
+
+ return 0;
+}
+
+
+int RGWUserAdminOp_Caps::remove(const DoutPrefixProvider *dpp,
+ rgw::sal::Driver* driver,
+ RGWUserAdminOpState& op_state,
+ RGWFormatterFlusher& flusher, optional_yield y)
+{
+ RGWUserInfo info;
+ RGWUser user;
+ int ret = user.init(dpp, driver, op_state, y);
+ if (ret < 0)
+ return ret;
+
+ if (!op_state.has_existing_user())
+ return -ERR_NO_SUCH_USER;
+
+ Formatter *formatter = flusher.get_formatter();
+
+ ret = user.caps.remove(dpp, op_state, y, NULL);
+ if (ret < 0)
+ return ret;
+
+ ret = user.info(info, NULL);
+ if (ret < 0)
+ return ret;
+
+ if (formatter) {
+ flusher.start(0);
+
+ info.caps.dump(formatter);
+ flusher.flush();
+ }
+
+ return 0;
+}
+
+class RGWUserMetadataHandler : public RGWMetadataHandler_GenericMetaBE {
+public:
+ struct Svc {
+ RGWSI_User *user{nullptr};
+ } svc;
+
+ RGWUserMetadataHandler(RGWSI_User *user_svc) {
+ base_init(user_svc->ctx(), user_svc->get_be_handler());
+ svc.user = user_svc;
+ }
+
+ ~RGWUserMetadataHandler() {}
+
+ string get_type() override { return "user"; }
+
+ int do_get(RGWSI_MetaBackend_Handler::Op *op, string& entry, RGWMetadataObject **obj, optional_yield y, const DoutPrefixProvider *dpp) override {
+ RGWUserCompleteInfo uci;
+ RGWObjVersionTracker objv_tracker;
+ real_time mtime;
+
+ rgw_user user = RGWSI_User::user_from_meta_key(entry);
+
+ int ret = svc.user->read_user_info(op->ctx(), user, &uci.info, &objv_tracker,
+ &mtime, nullptr, &uci.attrs,
+ y, dpp);
+ if (ret < 0) {
+ return ret;
+ }
+
+ RGWUserMetadataObject *mdo = new RGWUserMetadataObject(uci, objv_tracker.read_version, mtime);
+ *obj = mdo;
+
+ return 0;
+ }
+
+ RGWMetadataObject *get_meta_obj(JSONObj *jo, const obj_version& objv, const ceph::real_time& mtime) override {
+ RGWUserCompleteInfo uci;
+
+ try {
+ decode_json_obj(uci, jo);
+ } catch (JSONDecoder::err& e) {
+ return nullptr;
+ }
+
+ return new RGWUserMetadataObject(uci, objv, mtime);
+ }
+
+ int do_put(RGWSI_MetaBackend_Handler::Op *op, string& entry,
+ RGWMetadataObject *obj,
+ RGWObjVersionTracker& objv_tracker,
+ optional_yield y, const DoutPrefixProvider *dpp,
+ RGWMDLogSyncType type, bool from_remote_zone) override;
+
+ int do_remove(RGWSI_MetaBackend_Handler::Op *op, string& entry, RGWObjVersionTracker& objv_tracker,
+ optional_yield y, const DoutPrefixProvider *dpp) override {
+ RGWUserInfo info;
+
+ rgw_user user = RGWSI_User::user_from_meta_key(entry);
+
+ int ret = svc.user->read_user_info(op->ctx(), user, &info, nullptr,
+ nullptr, nullptr, nullptr,
+ y, dpp);
+ if (ret < 0) {
+ return ret;
+ }
+
+ return svc.user->remove_user_info(op->ctx(), info, &objv_tracker,
+ y, dpp);
+ }
+};
+
+class RGWMetadataHandlerPut_User : public RGWMetadataHandlerPut_SObj
+{
+ RGWUserMetadataHandler *uhandler;
+ RGWUserMetadataObject *uobj;
+public:
+ RGWMetadataHandlerPut_User(RGWUserMetadataHandler *_handler,
+ RGWSI_MetaBackend_Handler::Op *op, string& entry,
+ RGWMetadataObject *obj, RGWObjVersionTracker& objv_tracker,
+ optional_yield y,
+ RGWMDLogSyncType type, bool from_remote_zone) : RGWMetadataHandlerPut_SObj(_handler, op, entry, obj, objv_tracker, y, type, from_remote_zone),
+ uhandler(_handler) {
+ uobj = static_cast<RGWUserMetadataObject *>(obj);
+ }
+
+ int put_checked(const DoutPrefixProvider *dpp) override;
+};
+
+int RGWUserMetadataHandler::do_put(RGWSI_MetaBackend_Handler::Op *op, string& entry,
+ RGWMetadataObject *obj,
+ RGWObjVersionTracker& objv_tracker,
+ optional_yield y, const DoutPrefixProvider *dpp,
+ RGWMDLogSyncType type, bool from_remote_zone)
+{
+ RGWMetadataHandlerPut_User put_op(this, op, entry, obj, objv_tracker, y, type, from_remote_zone);
+ return do_put_operate(&put_op, dpp);
+}
+
+int RGWMetadataHandlerPut_User::put_checked(const DoutPrefixProvider *dpp)
+{
+ RGWUserMetadataObject *orig_obj = static_cast<RGWUserMetadataObject *>(old_obj);
+ RGWUserCompleteInfo& uci = uobj->get_uci();
+
+ map<string, bufferlist> *pattrs{nullptr};
+ if (uci.has_attrs) {
+ pattrs = &uci.attrs;
+ }
+
+ RGWUserInfo *pold_info = (orig_obj ? &orig_obj->get_uci().info : nullptr);
+
+ auto mtime = obj->get_mtime();
+
+ int ret = uhandler->svc.user->store_user_info(op->ctx(), uci.info, pold_info,
+ &objv_tracker, mtime,
+ false, pattrs, y, dpp);
+ if (ret < 0) {
+ return ret;
+ }
+
+ return STATUS_APPLIED;
+}
+
+
+RGWUserCtl::RGWUserCtl(RGWSI_Zone *zone_svc,
+ RGWSI_User *user_svc,
+ RGWUserMetadataHandler *_umhandler) : umhandler(_umhandler) {
+ svc.zone = zone_svc;
+ svc.user = user_svc;
+ be_handler = umhandler->get_be_handler();
+}
+
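+// small helper: dereferences to the wrapped optional's value when it is set,
+// otherwise to a default-constructed T, so callers never need to check for nullopt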
+template <class T>
+class optional_default
+{
+ const std::optional<T>& opt;
+ std::optional<T> def;
+ const T *p;
+public:
+ optional_default(const std::optional<T>& _o) : opt(_o) {
+ if (opt) {
+ p = &(*opt);
+ } else {
+ def = T();
+ p = &(*def);
+ }
+ }
+
+ const T *operator->() {
+ return p;
+ }
+
+ const T& operator*() {
+ return *p;
+ }
+};
+
+int RGWUserCtl::get_info_by_uid(const DoutPrefixProvider *dpp,
+ const rgw_user& uid,
+ RGWUserInfo *info,
+ optional_yield y,
+ const GetParams& params)
+{
+ return be_handler->call([&](RGWSI_MetaBackend_Handler::Op *op) {
+ return svc.user->read_user_info(op->ctx(),
+ uid,
+ info,
+ params.objv_tracker,
+ params.mtime,
+ params.cache_info,
+ params.attrs,
+ y,
+ dpp);
+ });
+}
+
+int RGWUserCtl::get_info_by_email(const DoutPrefixProvider *dpp,
+ const string& email,
+ RGWUserInfo *info,
+ optional_yield y,
+ const GetParams& params)
+{
+ return be_handler->call([&](RGWSI_MetaBackend_Handler::Op *op) {
+ return svc.user->get_user_info_by_email(op->ctx(), email,
+ info,
+ params.objv_tracker,
+ params.mtime,
+ y,
+ dpp);
+ });
+}
+
+int RGWUserCtl::get_info_by_swift(const DoutPrefixProvider *dpp,
+ const string& swift_name,
+ RGWUserInfo *info,
+ optional_yield y,
+ const GetParams& params)
+{
+ return be_handler->call([&](RGWSI_MetaBackend_Handler::Op *op) {
+ return svc.user->get_user_info_by_swift(op->ctx(), swift_name,
+ info,
+ params.objv_tracker,
+ params.mtime,
+ y,
+ dpp);
+ });
+}
+
+int RGWUserCtl::get_info_by_access_key(const DoutPrefixProvider *dpp,
+ const string& access_key,
+ RGWUserInfo *info,
+ optional_yield y,
+ const GetParams& params)
+{
+ return be_handler->call([&](RGWSI_MetaBackend_Handler::Op *op) {
+ return svc.user->get_user_info_by_access_key(op->ctx(), access_key,
+ info,
+ params.objv_tracker,
+ params.mtime,
+ y,
+ dpp);
+ });
+}
+
+int RGWUserCtl::get_attrs_by_uid(const DoutPrefixProvider *dpp,
+ const rgw_user& user_id,
+ map<string, bufferlist> *pattrs,
+ optional_yield y,
+ RGWObjVersionTracker *objv_tracker)
+{
+ RGWUserInfo user_info;
+
+ return get_info_by_uid(dpp, user_id, &user_info, y, RGWUserCtl::GetParams()
+ .set_attrs(pattrs)
+ .set_objv_tracker(objv_tracker));
+}
+
+int RGWUserCtl::store_info(const DoutPrefixProvider *dpp,
+ const RGWUserInfo& info, optional_yield y,
+ const PutParams& params)
+{
+ string key = RGWSI_User::get_meta_key(info.user_id);
+
+ return be_handler->call([&](RGWSI_MetaBackend_Handler::Op *op) {
+ return svc.user->store_user_info(op->ctx(), info,
+ params.old_info,
+ params.objv_tracker,
+ params.mtime,
+ params.exclusive,
+ params.attrs,
+ y,
+ dpp);
+ });
+}
+
+int RGWUserCtl::remove_info(const DoutPrefixProvider *dpp,
+ const RGWUserInfo& info, optional_yield y,
+ const RemoveParams& params)
+{
+ string key = RGWSI_User::get_meta_key(info.user_id);
+
+ return be_handler->call([&](RGWSI_MetaBackend_Handler::Op *op) {
+ return svc.user->remove_user_info(op->ctx(), info,
+ params.objv_tracker,
+ y, dpp);
+ });
+}
+
+int RGWUserCtl::list_buckets(const DoutPrefixProvider *dpp,
+ const rgw_user& user,
+ const string& marker,
+ const string& end_marker,
+ uint64_t max,
+ bool need_stats,
+ RGWUserBuckets *buckets,
+ bool *is_truncated,
+ optional_yield y,
+ uint64_t default_max)
+{
+ if (!max) {
+ max = default_max;
+ }
+
+ int ret = svc.user->list_buckets(dpp, user, marker, end_marker,
+ max, buckets, is_truncated, y);
+ if (ret < 0) {
+ return ret;
+ }
+ if (need_stats) {
+ map<string, RGWBucketEnt>& m = buckets->get_buckets();
+ ret = ctl.bucket->read_buckets_stats(m, y, dpp);
+ if (ret < 0 && ret != -ENOENT) {
+ ldpp_dout(dpp, 0) << "ERROR: could not get stats for buckets" << dendl;
+ return ret;
+ }
+ }
+ return 0;
+}
+
+int RGWUserCtl::read_stats(const DoutPrefixProvider *dpp,
+ const rgw_user& user, RGWStorageStats *stats,
+ optional_yield y,
+ ceph::real_time *last_stats_sync,
+ ceph::real_time *last_stats_update)
+{
+ return be_handler->call([&](RGWSI_MetaBackend_Handler::Op *op) {
+ return svc.user->read_stats(dpp, op->ctx(), user, stats,
+ last_stats_sync, last_stats_update, y);
+ });
+}
+
+RGWMetadataHandler *RGWUserMetaHandlerAllocator::alloc(RGWSI_User *user_svc) {
+ return new RGWUserMetadataHandler(user_svc);
+}
+
+void rgw_user::dump(Formatter *f) const
+{
+ ::encode_json("user", *this, f);
+}
+
diff --git a/src/rgw/driver/rados/rgw_user.h b/src/rgw/driver/rados/rgw_user.h
new file mode 100644
index 000000000..ea05de806
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_user.h
@@ -0,0 +1,885 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include <string>
+#include <boost/algorithm/string.hpp>
+#include "include/ceph_assert.h"
+
+#include "include/types.h"
+#include "rgw_common.h"
+#include "rgw_tools.h"
+
+#include "rgw_string.h"
+
+#include "common/Formatter.h"
+#include "rgw_formats.h"
+#include "rgw_metadata.h"
+#include "rgw_sal_fwd.h"
+
+#define RGW_USER_ANON_ID "anonymous"
+
+#define SECRET_KEY_LEN 40
+#define PUBLIC_ID_LEN 20
+#define RAND_SUBUSER_LEN 5
+
+#define XMLNS_AWS_S3 "http://s3.amazonaws.com/doc/2006-03-01/"
+
+class RGWUserCtl;
+class RGWBucketCtl;
+class RGWUserBuckets;
+
+class RGWGetUserStats_CB;
+
+/**
+ * A string wrapper that includes encode/decode functions
+ * for easily accessing a UID in all forms
+ */
+struct RGWUID
+{
+ rgw_user user_id;
+ void encode(bufferlist& bl) const {
+ std::string s;
+ user_id.to_str(s);
+ using ceph::encode;
+ encode(s, bl);
+ }
+ void decode(bufferlist::const_iterator& bl) {
+ std::string s;
+ using ceph::decode;
+ decode(s, bl);
+ user_id.from_str(s);
+ }
+};
+WRITE_CLASS_ENCODER(RGWUID)
+
+/** Entry for bucket metadata collection */
+struct bucket_meta_entry {
+ size_t size;
+ size_t size_rounded;
+ ceph::real_time creation_time;
+ uint64_t count;
+};
+
+extern int rgw_user_sync_all_stats(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver, rgw::sal::User* user, optional_yield y);
+extern int rgw_user_get_all_buckets_stats(const DoutPrefixProvider *dpp,
+ rgw::sal::Driver* driver, rgw::sal::User* user,
+ std::map<std::string, bucket_meta_entry>& buckets_usage_map, optional_yield y);
+
+/**
+ * Get the anonymous (i.e., unauthenticated) user info.
+ */
+extern void rgw_get_anon_user(RGWUserInfo& info);
+
+extern void rgw_perm_to_str(uint32_t mask, char *buf, int len);
+extern uint32_t rgw_str_to_perm(const char *str);
+
+extern int rgw_validate_tenant_name(const std::string& t);
+
+enum ObjectKeyType {
+ KEY_TYPE_SWIFT,
+ KEY_TYPE_S3,
+ KEY_TYPE_UNDEFINED
+};
+
+enum RGWKeyPoolOp {
+ GENERATE_KEY,
+ MODIFY_KEY
+};
+
+enum RGWUserId {
+ RGW_USER_ID,
+ RGW_SWIFT_USERNAME,
+ RGW_USER_EMAIL,
+ RGW_ACCESS_KEY,
+};
+
+/*
+ * RGWUser and its supporting classes, created to back a
+ * RESTful administrative API for user management
+ */
+struct RGWUserAdminOpState {
+ // user attributes
+ std::unique_ptr<rgw::sal::User> user;
+ std::string user_email;
+ std::string display_name;
+ rgw_user new_user_id;
+ bool overwrite_new_user = false;
+ int32_t max_buckets{RGW_DEFAULT_MAX_BUCKETS};
+ __u8 suspended{0};
+ __u8 admin{0};
+ __u8 system{0};
+ __u8 exclusive{0};
+ __u8 fetch_stats{0};
+ __u8 sync_stats{0};
+ std::string caps;
+ RGWObjVersionTracker objv;
+ uint32_t op_mask{0};
+ std::map<int, std::string> temp_url_keys;
+
+ // subuser attributes
+ std::string subuser;
+ uint32_t perm_mask{RGW_PERM_NONE};
+
+ // key_attributes
+ std::string id; // access key
+ std::string key; // secret key
+ // access keys fetched for a user in the middle of an op
+ std::map<std::string, RGWAccessKey> op_access_keys;
+ int32_t key_type{-1};
+ bool access_key_exist = false;
+
+ std::set<std::string> mfa_ids;
+
+ // operation attributes
+ bool existing_user{false};
+ bool existing_key{false};
+ bool existing_subuser{false};
+ bool existing_email{false};
+ bool subuser_specified{false};
+ bool gen_secret{false};
+ bool gen_access{false};
+ bool gen_subuser{false};
+ bool id_specified{false};
+ bool key_specified{false};
+ bool type_specified{false};
+ bool key_type_setbycontext{false}; // key type set by user or subuser context
+ bool purge_data{false};
+ bool purge_keys{false};
+ bool display_name_specified{false};
+ bool user_email_specified{false};
+ bool max_buckets_specified{false};
+ bool perm_specified{false};
+ bool op_mask_specified{false};
+ bool caps_specified{false};
+ bool suspension_op{false};
+ bool admin_specified{false};
+ bool system_specified{false};
+ bool key_op{false};
+ bool temp_url_key_specified{false};
+ bool found_by_uid{false};
+ bool found_by_email{false};
+ bool found_by_key{false};
+ bool mfa_ids_specified{false};
+
+ // req parameters
+ bool populated{false};
+ bool initialized{false};
+ bool key_params_checked{false};
+ bool subuser_params_checked{false};
+ bool user_params_checked{false};
+
+ bool bucket_quota_specified{false};
+ bool user_quota_specified{false};
+ bool bucket_ratelimit_specified{false};
+ bool user_ratelimit_specified{false};
+
+ RGWQuota quota;
+ RGWRateLimitInfo user_ratelimit;
+ RGWRateLimitInfo bucket_ratelimit;
+
+ // req parameters for listing user
+ std::string marker{""};
+ uint32_t max_entries{1000};
+ rgw_placement_rule default_placement; // user default placement
+ bool default_placement_specified{false};
+
+ std::list<std::string> placement_tags; // user default placement_tags
+ bool placement_tags_specified{false};
+
+ void set_access_key(const std::string& access_key) {
+ if (access_key.empty())
+ return;
+
+ id = access_key;
+ id_specified = true;
+ gen_access = false;
+ key_op = true;
+ }
+
+ void set_secret_key(const std::string& secret_key) {
+ if (secret_key.empty())
+ return;
+
+ key = secret_key;
+ key_specified = true;
+ gen_secret = false;
+ key_op = true;
+ }
+
+ void set_user_id(const rgw_user& id);
+
+ void set_new_user_id(const rgw_user& id) {
+ if (id.empty())
+ return;
+
+ new_user_id = id;
+ }
+ void set_overwrite_new_user(bool b) {
+ overwrite_new_user = b;
+ }
+
+ void set_user_email(std::string& email) {
+ /* always lowercase email address */
+ boost::algorithm::to_lower(email);
+ user_email = email;
+ user_email_specified = true;
+ }
+
+ void set_display_name(const std::string& name) {
+ if (name.empty())
+ return;
+
+ display_name = name;
+ display_name_specified = true;
+ }
+
+ void set_subuser(std::string& _subuser);
+
+ void set_caps(const std::string& _caps) {
+ if (_caps.empty())
+ return;
+
+ caps = _caps;
+ caps_specified = true;
+ }
+
+ void set_perm(uint32_t perm) {
+ perm_mask = perm;
+ perm_specified = true;
+ }
+
+ void set_op_mask(uint32_t mask) {
+ op_mask = mask;
+ op_mask_specified = true;
+ }
+
+ void set_temp_url_key(const std::string& key, int index) {
+ temp_url_keys[index] = key;
+ temp_url_key_specified = true;
+ }
+
+ void set_key_type(int32_t type) {
+ key_type = type;
+ type_specified = true;
+ }
+
+ void set_access_key_exist() {
+ access_key_exist = true;
+ }
+
+ void set_suspension(__u8 is_suspended) {
+ suspended = is_suspended;
+ suspension_op = true;
+ }
+
+ void set_admin(__u8 is_admin) {
+ admin = is_admin;
+ admin_specified = true;
+ }
+
+ void set_system(__u8 is_system) {
+ system = is_system;
+ system_specified = true;
+ }
+
+ void set_exclusive(__u8 is_exclusive) {
+ exclusive = is_exclusive;
+ }
+
+ void set_fetch_stats(__u8 is_fetch_stats) {
+ fetch_stats = is_fetch_stats;
+ }
+
+ void set_sync_stats(__u8 is_sync_stats) {
+ sync_stats = is_sync_stats;
+ }
+
+ void set_user_info(RGWUserInfo& user_info);
+
+ void set_user_version_tracker(RGWObjVersionTracker& objv_tracker);
+
+ void set_max_buckets(int32_t mb) {
+ max_buckets = mb;
+ max_buckets_specified = true;
+ }
+
+ void set_gen_access() {
+ gen_access = true;
+ key_op = true;
+ }
+
+ void set_gen_secret() {
+ gen_secret = true;
+ key_op = true;
+ }
+
+ void set_generate_key() {
+ if (id.empty())
+ gen_access = true;
+ if (key.empty())
+ gen_secret = true;
+ key_op = true;
+ }
+
+ void clear_generate_key() {
+ gen_access = false;
+ gen_secret = false;
+ }
+
+ void set_purge_keys() {
+ purge_keys = true;
+ key_op = true;
+ }
+
+ void set_bucket_quota(RGWQuotaInfo& quotas) {
+ quota.bucket_quota = quotas;
+ bucket_quota_specified = true;
+ }
+
+ void set_user_quota(RGWQuotaInfo& quotas) {
+ quota.user_quota = quotas;
+ user_quota_specified = true;
+ }
+
+ void set_bucket_ratelimit(RGWRateLimitInfo& ratelimit) {
+ bucket_ratelimit = ratelimit;
+ bucket_ratelimit_specified = true;
+ }
+
+ void set_user_ratelimit(RGWRateLimitInfo& ratelimit) {
+ user_ratelimit = ratelimit;
+ user_ratelimit_specified = true;
+ }
+
+ void set_mfa_ids(const std::set<std::string>& ids) {
+ mfa_ids = ids;
+ mfa_ids_specified = true;
+ }
+
+ void set_default_placement(const rgw_placement_rule& _placement) {
+ default_placement = _placement;
+ default_placement_specified = true;
+ }
+
+ void set_placement_tags(const std::list<std::string>& _tags) {
+ placement_tags = _tags;
+ placement_tags_specified = true;
+ }
+
+ bool is_populated() { return populated; }
+ bool is_initialized() { return initialized; }
+ bool has_existing_user() { return existing_user; }
+ bool has_existing_key() { return existing_key; }
+ bool has_existing_subuser() { return existing_subuser; }
+ bool has_existing_email() { return existing_email; }
+ bool has_subuser() { return subuser_specified; }
+ bool has_key_op() { return key_op; }
+ bool has_caps_op() { return caps_specified; }
+ bool has_suspension_op() { return suspension_op; }
+ bool has_subuser_perm() { return perm_specified; }
+ bool has_op_mask() { return op_mask_specified; }
+ bool will_gen_access() { return gen_access; }
+ bool will_gen_secret() { return gen_secret; }
+ bool will_gen_subuser() { return gen_subuser; }
+ bool will_purge_keys() { return purge_keys; }
+ bool will_purge_data() { return purge_data; }
+ bool will_generate_subuser() { return gen_subuser; }
+ bool has_bucket_quota() { return bucket_quota_specified; }
+ bool has_user_quota() { return user_quota_specified; }
+ void set_populated() { populated = true; }
+ void clear_populated() { populated = false; }
+ void set_initialized() { initialized = true; }
+ void set_existing_user(bool flag) { existing_user = flag; }
+ void set_existing_key(bool flag) { existing_key = flag; }
+ void set_existing_subuser(bool flag) { existing_subuser = flag; }
+ void set_existing_email(bool flag) { existing_email = flag; }
+ void set_purge_data(bool flag) { purge_data = flag; }
+ void set_generate_subuser(bool flag) { gen_subuser = flag; }
+ __u8 get_suspension_status() { return suspended; }
+ int32_t get_key_type() {return key_type; }
+ bool get_access_key_exist() {return access_key_exist; }
+ uint32_t get_subuser_perm() { return perm_mask; }
+ int32_t get_max_buckets() { return max_buckets; }
+ uint32_t get_op_mask() { return op_mask; }
+ RGWQuotaInfo& get_bucket_quota() { return quota.bucket_quota; }
+ RGWQuotaInfo& get_user_quota() { return quota.user_quota; }
+ std::set<std::string>& get_mfa_ids() { return mfa_ids; }
+
+ rgw::sal::User* get_user() { return user.get(); }
+ const rgw_user& get_user_id();
+ std::string get_subuser() { return subuser; }
+ std::string get_access_key() { return id; }
+ std::string get_secret_key() { return key; }
+ std::string get_caps() { return caps; }
+ std::string get_user_email() { return user_email; }
+ std::string get_display_name() { return display_name; }
+ rgw_user& get_new_uid() { return new_user_id; }
+ bool get_overwrite_new_user() const { return overwrite_new_user; }
+ std::map<int, std::string>& get_temp_url_keys() { return temp_url_keys; }
+
+ RGWUserInfo& get_user_info();
+
+ std::map<std::string, RGWAccessKey>* get_swift_keys();
+ std::map<std::string, RGWAccessKey>* get_access_keys();
+ std::map<std::string, RGWSubUser>* get_subusers();
+
+ RGWUserCaps* get_caps_obj();
+
+ std::string build_default_swift_kid();
+
+ std::string generate_subuser();
+
+ RGWUserAdminOpState(rgw::sal::Driver* driver);
+};
+
+class RGWUser;
+
+class RGWAccessKeyPool
+{
+ RGWUser *user{nullptr};
+
+ std::map<std::string, int, ltstr_nocase> key_type_map;
+ rgw_user user_id;
+ rgw::sal::Driver* driver{nullptr};
+
+ std::map<std::string, RGWAccessKey> *swift_keys{nullptr};
+ std::map<std::string, RGWAccessKey> *access_keys{nullptr};
+
+ // we don't want to allow keys for the anonymous user or a null user
+ bool keys_allowed{false};
+
+private:
+ int create_key(RGWUserAdminOpState& op_state, std::string *err_msg = NULL);
+ int generate_key(const DoutPrefixProvider *dpp,
+ RGWUserAdminOpState& op_state, optional_yield y,
+ std::string *err_msg = NULL);
+ int modify_key(RGWUserAdminOpState& op_state, std::string *err_msg = NULL);
+
+ int check_key_owner(RGWUserAdminOpState& op_state);
+ bool check_existing_key(RGWUserAdminOpState& op_state);
+ int check_op(RGWUserAdminOpState& op_state, std::string *err_msg = NULL);
+
+ /* API Contract Fulfillment */
+ int execute_add(const DoutPrefixProvider *dpp,
+ RGWUserAdminOpState& op_state, std::string *err_msg,
+ bool defer_save, optional_yield y);
+ int execute_remove(const DoutPrefixProvider *dpp,
+ RGWUserAdminOpState& op_state, std::string *err_msg,
+ bool defer_save, optional_yield y);
+ int remove_subuser_keys(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg,
+ bool defer_save, optional_yield y);
+
+ int add(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg, bool defer_save,
+ optional_yield y);
+ int remove(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg,
+ bool defer_save, optional_yield y);
+public:
+ explicit RGWAccessKeyPool(RGWUser* usr);
+
+ int init(RGWUserAdminOpState& op_state);
+
+ /* API Contracted Methods */
+ int add(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y,
+ std::string *err_msg = NULL);
+ int remove(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y,
+ std::string *err_msg = NULL);
+
+ friend class RGWUser;
+ friend class RGWSubUserPool;
+};
+
+class RGWSubUserPool
+{
+ RGWUser *user{nullptr};
+
+ rgw_user user_id;
+ rgw::sal::Driver* driver{nullptr};
+ bool subusers_allowed{false};
+
+ std::map<std::string, RGWSubUser> *subuser_map{nullptr};
+
+private:
+ int check_op(RGWUserAdminOpState& op_state, std::string *err_msg = NULL);
+
+ /* API Contract Fulfillment */
+ int execute_add(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg, bool defer_save, optional_yield y);
+ int execute_remove(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg, bool defer_save, optional_yield y);
+ int execute_modify(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg, bool defer_save, optional_yield y);
+
+ int add(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg, bool defer_save,
+ optional_yield y);
+ int remove(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg, bool defer_save, optional_yield y);
+ int modify(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y, std::string *err_msg, bool defer_save);
+public:
+ explicit RGWSubUserPool(RGWUser *user);
+
+ bool exists(std::string subuser);
+ int init(RGWUserAdminOpState& op_state);
+
+ /* API contracted methods */
+ int add(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y,
+ std::string *err_msg = NULL);
+ int remove(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y, std::string *err_msg = NULL);
+ int modify(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y, std::string *err_msg = NULL);
+
+ friend class RGWUser;
+};
+
+class RGWUserCapPool
+{
+ RGWUserCaps *caps{nullptr};
+ bool caps_allowed{false};
+ RGWUser *user{nullptr};
+
+private:
+ int add(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg, bool defer_save,
+ optional_yield y);
+ int remove(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg, bool defer_save,
+ optional_yield y);
+
+public:
+ explicit RGWUserCapPool(RGWUser *user);
+
+ int init(RGWUserAdminOpState& op_state);
+
+ /* API contracted methods */
+ int add(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y,
+ std::string *err_msg = NULL);
+ int remove(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y, std::string *err_msg = NULL);
+
+ friend class RGWUser;
+};
+
+class RGWUser
+{
+
+private:
+ RGWUserInfo old_info;
+ rgw::sal::Driver* driver{nullptr};
+
+ rgw_user user_id;
+ bool info_stored{false};
+
+ void set_populated() { info_stored = true; }
+ void clear_populated() { info_stored = false; }
+ bool is_populated() { return info_stored; }
+
+ int check_op(RGWUserAdminOpState& req, std::string *err_msg);
+ int update(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg, optional_yield y);
+
+ void clear_members();
+ void init_default();
+
+ /* API Contract Fulfillment */
+ int execute_add(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg,
+ optional_yield y);
+ int execute_remove(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state,
+ std::string *err_msg, optional_yield y);
+ int execute_modify(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg, optional_yield y);
+ int execute_rename(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg, optional_yield y);
+
+public:
+ RGWUser();
+
+ int init(const DoutPrefixProvider *dpp, rgw::sal::Driver* storage, RGWUserAdminOpState& op_state,
+ optional_yield y);
+
+ int init_storage(rgw::sal::Driver* storage);
+ int init(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y);
+ int init_members(RGWUserAdminOpState& op_state);
+
+ rgw::sal::Driver* get_driver() { return driver; }
+
+ /* API Contracted Members */
+ RGWUserCapPool caps;
+ RGWAccessKeyPool keys;
+ RGWSubUserPool subusers;
+
+ /* API Contracted Methods */
+ int add(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y, std::string *err_msg = NULL);
+
+ int remove(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y, std::string *err_msg = NULL);
+
+ int rename(RGWUserAdminOpState& op_state, optional_yield y, const DoutPrefixProvider *dpp, std::string *err_msg = NULL);
+
+ /* remove an already populated RGWUser */
+ int remove(std::string *err_msg = NULL);
+
+ int modify(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y, std::string *err_msg = NULL);
+
+ /* retrieve info from an existing user in the RGW system */
+ int info(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, RGWUserInfo& fetched_info, optional_yield y,
+ std::string *err_msg = NULL);
+
+ /* info from an already populated RGWUser */
+ int info(RGWUserInfo& fetched_info, std::string *err_msg = NULL);
+
+ /* list the existing users */
+ int list(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, RGWFormatterFlusher& flusher);
+
+ friend class RGWAccessKeyPool;
+ friend class RGWSubUserPool;
+ friend class RGWUserCapPool;
+};
+
+/* Wrappers for admin API functionality */
+
+class RGWUserAdminOp_User
+{
+public:
+ static int list(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver,
+ RGWUserAdminOpState& op_state, RGWFormatterFlusher& flusher);
+
+ static int info(const DoutPrefixProvider *dpp,
+ rgw::sal::Driver* driver,
+ RGWUserAdminOpState& op_state, RGWFormatterFlusher& flusher,
+ optional_yield y);
+
+ static int create(const DoutPrefixProvider *dpp,
+ rgw::sal::Driver* driver,
+ RGWUserAdminOpState& op_state, RGWFormatterFlusher& flusher,
+ optional_yield y);
+
+ static int modify(const DoutPrefixProvider *dpp,
+ rgw::sal::Driver* driver,
+ RGWUserAdminOpState& op_state, RGWFormatterFlusher& flusher, optional_yield y);
+
+ static int remove(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver,
+ RGWUserAdminOpState& op_state, RGWFormatterFlusher& flusher, optional_yield y);
+};
+
+class RGWUserAdminOp_Subuser
+{
+public:
+ static int create(const DoutPrefixProvider *dpp,
+ rgw::sal::Driver* driver,
+ RGWUserAdminOpState& op_state, RGWFormatterFlusher& flusher,
+ optional_yield y);
+
+ static int modify(const DoutPrefixProvider *dpp,
+ rgw::sal::Driver* driver,
+ RGWUserAdminOpState& op_state, RGWFormatterFlusher& flusher,
+ optional_yield y);
+
+ static int remove(const DoutPrefixProvider *dpp,
+ rgw::sal::Driver* driver,
+ RGWUserAdminOpState& op_state, RGWFormatterFlusher& flusher,
+ optional_yield y);
+};
+
+class RGWUserAdminOp_Key
+{
+public:
+ static int create(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver,
+ RGWUserAdminOpState& op_state, RGWFormatterFlusher& flusher,
+ optional_yield y);
+
+ static int remove(const DoutPrefixProvider *dpp,
+ rgw::sal::Driver* driver,
+ RGWUserAdminOpState& op_state, RGWFormatterFlusher& flusher,
+ optional_yield y);
+};
+
+class RGWUserAdminOp_Caps
+{
+public:
+ static int add(const DoutPrefixProvider *dpp,
+ rgw::sal::Driver* driver,
+ RGWUserAdminOpState& op_state, RGWFormatterFlusher& flusher,
+ optional_yield y);
+
+ static int remove(const DoutPrefixProvider *dpp,
+ rgw::sal::Driver* driver,
+ RGWUserAdminOpState& op_state, RGWFormatterFlusher& flusher,
+ optional_yield y);
+};
+
+struct RGWUserCompleteInfo {
+ RGWUserInfo info;
+ std::map<std::string, bufferlist> attrs;
+ bool has_attrs{false};
+
+ void dump(Formatter * const f) const {
+ info.dump(f);
+ encode_json("attrs", attrs, f);
+ }
+
+ void decode_json(JSONObj *obj) {
+ decode_json_obj(info, obj);
+ has_attrs = JSONDecoder::decode_json("attrs", attrs, obj);
+ }
+};
+
+class RGWUserMetadataObject : public RGWMetadataObject {
+ RGWUserCompleteInfo uci;
+public:
+ RGWUserMetadataObject() {}
+ RGWUserMetadataObject(const RGWUserCompleteInfo& _uci, const obj_version& v, real_time m)
+ : uci(_uci) {
+ objv = v;
+ mtime = m;
+ }
+
+ void dump(Formatter *f) const override {
+ uci.dump(f);
+ }
+
+ RGWUserCompleteInfo& get_uci() {
+ return uci;
+ }
+};
+
+class RGWUserMetadataHandler;
+
+class RGWUserCtl
+{
+ struct Svc {
+ RGWSI_Zone *zone{nullptr};
+ RGWSI_User *user{nullptr};
+ } svc;
+
+ struct Ctl {
+ RGWBucketCtl *bucket{nullptr};
+ } ctl;
+
+ RGWUserMetadataHandler *umhandler;
+ RGWSI_MetaBackend_Handler *be_handler{nullptr};
+
+public:
+ RGWUserCtl(RGWSI_Zone *zone_svc,
+ RGWSI_User *user_svc,
+ RGWUserMetadataHandler *_umhandler);
+
+ void init(RGWBucketCtl *bucket_ctl) {
+ ctl.bucket = bucket_ctl;
+ }
+
+ RGWBucketCtl *get_bucket_ctl() {
+ return ctl.bucket;
+ }
+
+ struct GetParams {
+ RGWObjVersionTracker *objv_tracker{nullptr};
+ ceph::real_time *mtime{nullptr};
+ rgw_cache_entry_info *cache_info{nullptr};
+ std::map<std::string, bufferlist> *attrs{nullptr};
+
+ GetParams() {}
+
+ GetParams& set_objv_tracker(RGWObjVersionTracker *_objv_tracker) {
+ objv_tracker = _objv_tracker;
+ return *this;
+ }
+
+ GetParams& set_mtime(ceph::real_time *_mtime) {
+ mtime = _mtime;
+ return *this;
+ }
+
+ GetParams& set_cache_info(rgw_cache_entry_info *_cache_info) {
+ cache_info = _cache_info;
+ return *this;
+ }
+
+ GetParams& set_attrs(std::map<std::string, bufferlist> *_attrs) {
+ attrs = _attrs;
+ return *this;
+ }
+ };
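+ // illustrative use of the setter chaining (names here are only examples):
+ //   std::map<std::string, bufferlist> attrs;
+ //   user_ctl->get_info_by_uid(dpp, uid, &info, y,
+ //                             RGWUserCtl::GetParams().set_attrs(&attrs));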
+
+ struct PutParams {
+ RGWUserInfo *old_info{nullptr};
+ RGWObjVersionTracker *objv_tracker{nullptr};
+ ceph::real_time mtime;
+ bool exclusive{false};
+ std::map<std::string, bufferlist> *attrs{nullptr};
+
+ PutParams() {}
+
+ PutParams& set_old_info(RGWUserInfo *_info) {
+ old_info = _info;
+ return *this;
+ }
+
+ PutParams& set_objv_tracker(RGWObjVersionTracker *_objv_tracker) {
+ objv_tracker = _objv_tracker;
+ return *this;
+ }
+
+ PutParams& set_mtime(const ceph::real_time& _mtime) {
+ mtime = _mtime;
+ return *this;
+ }
+
+ PutParams& set_exclusive(bool _exclusive) {
+ exclusive = _exclusive;
+ return *this;
+ }
+
+ PutParams& set_attrs(std::map<std::string, bufferlist> *_attrs) {
+ attrs = _attrs;
+ return *this;
+ }
+ };
+
+ struct RemoveParams {
+ RGWObjVersionTracker *objv_tracker{nullptr};
+
+ RemoveParams() {}
+
+ RemoveParams& set_objv_tracker(RGWObjVersionTracker *_objv_tracker) {
+ objv_tracker = _objv_tracker;
+ return *this;
+ }
+ };
+
+ int get_info_by_uid(const DoutPrefixProvider *dpp,
+ const rgw_user& uid, RGWUserInfo *info,
+ optional_yield y, const GetParams& params = {});
+ int get_info_by_email(const DoutPrefixProvider *dpp,
+ const std::string& email, RGWUserInfo *info,
+ optional_yield y, const GetParams& params = {});
+ int get_info_by_swift(const DoutPrefixProvider *dpp,
+ const std::string& swift_name, RGWUserInfo *info,
+ optional_yield y, const GetParams& params = {});
+ int get_info_by_access_key(const DoutPrefixProvider *dpp,
+ const std::string& access_key, RGWUserInfo *info,
+ optional_yield y, const GetParams& params = {});
+
+ int get_attrs_by_uid(const DoutPrefixProvider *dpp,
+ const rgw_user& user_id,
+ std::map<std::string, bufferlist> *attrs,
+ optional_yield y,
+ RGWObjVersionTracker *objv_tracker = nullptr);
+
+ int store_info(const DoutPrefixProvider *dpp,
+ const RGWUserInfo& info, optional_yield y,
+ const PutParams& params = {});
+ int remove_info(const DoutPrefixProvider *dpp,
+ const RGWUserInfo& info, optional_yield y,
+ const RemoveParams& params = {});
+
+ int list_buckets(const DoutPrefixProvider *dpp,
+ const rgw_user& user,
+ const std::string& marker,
+ const std::string& end_marker,
+ uint64_t max,
+ bool need_stats,
+ RGWUserBuckets *buckets,
+ bool *is_truncated,
+ optional_yield y,
+ uint64_t default_max = 1000);
+
+ int read_stats(const DoutPrefixProvider *dpp,
+ const rgw_user& user, RGWStorageStats *stats,
+ optional_yield y,
+ ceph::real_time *last_stats_sync = nullptr, /* last time a full stats sync completed */
+ ceph::real_time *last_stats_update = nullptr); /* last time a stats update was done */
+};
+
+class RGWUserMetaHandlerAllocator {
+public:
+ static RGWMetadataHandler *alloc(RGWSI_User *user_svc);
+};
diff --git a/src/rgw/driver/rados/rgw_zone.cc b/src/rgw/driver/rados/rgw_zone.cc
new file mode 100644
index 000000000..ed09f24f6
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_zone.cc
@@ -0,0 +1,1288 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "rgw_zone.h"
+#include "rgw_realm_watcher.h"
+#include "rgw_sal_config.h"
+#include "rgw_sync.h"
+
+#include "services/svc_zone.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+using namespace rgw_zone_defaults;
+
+RGWMetaSyncStatusManager::~RGWMetaSyncStatusManager(){}
+
+#define FIRST_EPOCH 1
+
+struct RGWAccessKey;
+
+/// Generate a random uuid for realm/period/zonegroup/zone ids
+static std::string gen_random_uuid()
+{
+ uuid_d uuid;
+ uuid.generate_random();
+ return uuid.to_string();
+}
+
+void RGWDefaultZoneGroupInfo::dump(Formatter *f) const {
+ encode_json("default_zonegroup", default_zonegroup, f);
+}
+
+void RGWDefaultZoneGroupInfo::decode_json(JSONObj *obj) {
+
+ JSONDecoder::decode_json("default_zonegroup", default_zonegroup, obj);
+ /* backward compatibility with region */
+ if (default_zonegroup.empty()) {
+ JSONDecoder::decode_json("default_region", default_zonegroup, obj);
+ }
+}
+
+int RGWZoneGroup::create_default(const DoutPrefixProvider *dpp, optional_yield y, bool old_format)
+{
+ name = default_zonegroup_name;
+ api_name = default_zonegroup_name;
+ is_master = true;
+
+ RGWZoneGroupPlacementTarget placement_target;
+ placement_target.name = "default-placement";
+ placement_targets[placement_target.name] = placement_target;
+ default_placement.name = "default-placement";
+
+ RGWZoneParams zone_params(default_zone_name);
+
+ int r = zone_params.init(dpp, cct, sysobj_svc, y, false);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "create_default: error initializing zone params: " << cpp_strerror(-r) << dendl;
+ return r;
+ }
+
+ r = zone_params.create_default(dpp, y);
+ if (r < 0 && r != -EEXIST) {
+ ldpp_dout(dpp, 0) << "create_default: error in create_default zone params: " << cpp_strerror(-r) << dendl;
+ return r;
+ } else if (r == -EEXIST) {
+ ldpp_dout(dpp, 10) << "zone_params::create_default() returned -EEXIST, we raced with another default zone_params creation" << dendl;
+ zone_params.clear_id();
+ r = zone_params.init(dpp, cct, sysobj_svc, y);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "create_default: error in init existing zone params: " << cpp_strerror(-r) << dendl;
+ return r;
+ }
+ ldpp_dout(dpp, 20) << "zone_params::create_default() " << zone_params.get_name() << " id " << zone_params.get_id()
+ << dendl;
+ }
+
+ RGWZone& default_zone = zones[zone_params.get_id()];
+ default_zone.name = zone_params.get_name();
+ default_zone.id = zone_params.get_id();
+ master_zone = default_zone.id;
+
+ // initialize supported zone features
+ default_zone.supported_features.insert(rgw::zone_features::supported.begin(),
+ rgw::zone_features::supported.end());
+ // enable default zonegroup features
+ enabled_features.insert(rgw::zone_features::enabled.begin(),
+ rgw::zone_features::enabled.end());
+
+ r = create(dpp, y);
+ if (r < 0 && r != -EEXIST) {
+ ldpp_dout(dpp, 0) << "error storing zone group info: " << cpp_strerror(-r) << dendl;
+ return r;
+ }
+
+ if (r == -EEXIST) {
+ ldpp_dout(dpp, 10) << "create_default() returned -EEXIST, we raced with another zonegroup creation" << dendl;
+ id.clear();
+ r = init(dpp, cct, sysobj_svc, y);
+ if (r < 0) {
+ return r;
+ }
+ }
+
+ if (old_format) {
+ name = id;
+ }
+
+ post_process_params(dpp, y);
+
+ return 0;
+}
+
+int RGWZoneGroup::equals(const string& other_zonegroup) const
+{
+ if (is_master && other_zonegroup.empty())
+ return true;
+
+ return (id == other_zonegroup);
+}
+
+int RGWZoneGroup::add_zone(const DoutPrefixProvider *dpp,
+ const RGWZoneParams& zone_params, bool *is_master, bool *read_only,
+ const list<string>& endpoints, const string *ptier_type,
+ bool *psync_from_all, list<string>& sync_from, list<string>& sync_from_rm,
+ string *predirect_zone, std::optional<int> bucket_index_max_shards,
+ RGWSyncModulesManager *sync_mgr,
+ const rgw::zone_features::set& enable_features,
+ const rgw::zone_features::set& disable_features,
+ optional_yield y)
+{
+ auto& zone_id = zone_params.get_id();
+ auto& zone_name = zone_params.get_name();
+
+ // check for duplicate zone name on insert
+ if (!zones.count(zone_id)) {
+ for (const auto& zone : zones) {
+ if (zone.second.name == zone_name) {
+ ldpp_dout(dpp, 0) << "ERROR: found existing zone name " << zone_name
+ << " (" << zone.first << ") in zonegroup " << get_name() << dendl;
+ return -EEXIST;
+ }
+ }
+ }
+
+ if (is_master) {
+ if (*is_master) {
+ if (!master_zone.empty() && master_zone != zone_id) {
+ ldpp_dout(dpp, 0) << "NOTICE: overriding master zone: " << master_zone << dendl;
+ }
+ master_zone = zone_id;
+ } else if (master_zone == zone_id) {
+ master_zone.clear();
+ }
+ }
+
+ RGWZone& zone = zones[zone_id];
+ zone.name = zone_name;
+ zone.id = zone_id;
+ if (!endpoints.empty()) {
+ zone.endpoints = endpoints;
+ }
+ if (read_only) {
+ zone.read_only = *read_only;
+ }
+ if (ptier_type) {
+ zone.tier_type = *ptier_type;
+ if (!sync_mgr->get_module(*ptier_type, nullptr)) {
+ ldpp_dout(dpp, 0) << "ERROR: could not found sync module: " << *ptier_type
+ << ", valid sync modules: "
+ << sync_mgr->get_registered_module_names()
+ << dendl;
+ return -ENOENT;
+ }
+ }
+
+ if (psync_from_all) {
+ zone.sync_from_all = *psync_from_all;
+ }
+
+ if (predirect_zone) {
+ zone.redirect_zone = *predirect_zone;
+ }
+
+ if (bucket_index_max_shards) {
+ zone.bucket_index_max_shards = *bucket_index_max_shards;
+ }
+
+ for (auto add : sync_from) {
+ zone.sync_from.insert(add);
+ }
+
+ for (auto rm : sync_from_rm) {
+ zone.sync_from.erase(rm);
+ }
+
+ zone.supported_features.insert(enable_features.begin(),
+ enable_features.end());
+
+ for (const auto& feature : disable_features) {
+ if (enabled_features.contains(feature)) {
+ lderr(cct) << "ERROR: Cannot disable zone feature \"" << feature
+ << "\" until it's been disabled in zonegroup " << name << dendl;
+ return -EINVAL;
+ }
+ auto i = zone.supported_features.find(feature);
+ if (i == zone.supported_features.end()) {
+ ldout(cct, 1) << "WARNING: zone feature \"" << feature
+ << "\" was not enabled in zone " << zone.name << dendl;
+ continue;
+ }
+ zone.supported_features.erase(i);
+ }
+
+ post_process_params(dpp, y);
+
+ return update(dpp, y);
+}
+
+
+int RGWZoneGroup::rename_zone(const DoutPrefixProvider *dpp,
+ const RGWZoneParams& zone_params,
+ optional_yield y)
+{
+ RGWZone& zone = zones[zone_params.get_id()];
+ zone.name = zone_params.get_name();
+
+ return update(dpp, y);
+}
+
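+// derive per-zone settings: enable the data changes log only when the
+// zonegroup has more than one zone, pick a master zone if none is set, and
+// import placement targets found in each zone's placement_pools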
+void RGWZoneGroup::post_process_params(const DoutPrefixProvider *dpp, optional_yield y)
+{
+ bool log_data = zones.size() > 1;
+
+ if (master_zone.empty()) {
+ auto iter = zones.begin();
+ if (iter != zones.end()) {
+ master_zone = iter->first;
+ }
+ }
+
+ for (auto& item : zones) {
+ RGWZone& zone = item.second;
+ zone.log_data = log_data;
+
+ RGWZoneParams zone_params(zone.id, zone.name);
+ int ret = zone_params.init(dpp, cct, sysobj_svc, y);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "WARNING: could not read zone params for zone id=" << zone.id << " name=" << zone.name << dendl;
+ continue;
+ }
+
+ for (auto& pitem : zone_params.placement_pools) {
+ const string& placement_name = pitem.first;
+ if (placement_targets.find(placement_name) == placement_targets.end()) {
+ RGWZoneGroupPlacementTarget placement_target;
+ placement_target.name = placement_name;
+ placement_targets[placement_name] = placement_target;
+ }
+ }
+ }
+
+ if (default_placement.empty() && !placement_targets.empty()) {
+ default_placement.init(placement_targets.begin()->first, RGW_STORAGE_CLASS_STANDARD);
+ }
+}
+
+int RGWZoneGroup::remove_zone(const DoutPrefixProvider *dpp, const std::string& zone_id, optional_yield y)
+{
+ auto iter = zones.find(zone_id);
+ if (iter == zones.end()) {
+ ldpp_dout(dpp, 0) << "zone id " << zone_id << " is not a part of zonegroup "
+ << name << dendl;
+ return -ENOENT;
+ }
+
+ zones.erase(iter);
+
+ post_process_params(dpp, y);
+
+ return update(dpp, y);
+}
+
+void RGWDefaultSystemMetaObjInfo::dump(Formatter *f) const {
+ encode_json("default_id", default_id, f);
+}
+
+void RGWDefaultSystemMetaObjInfo::decode_json(JSONObj *obj) {
+ JSONDecoder::decode_json("default_id", default_id, obj);
+}
+
+int RGWSystemMetaObj::rename(const DoutPrefixProvider *dpp, const string& new_name, optional_yield y)
+{
+ string new_id;
+ int ret = read_id(dpp, new_name, new_id, y);
+ if (!ret) {
+ return -EEXIST;
+ }
+ if (ret < 0 && ret != -ENOENT) {
+ ldpp_dout(dpp, 0) << "Error read_id " << new_name << ": " << cpp_strerror(-ret) << dendl;
+ return ret;
+ }
+ string old_name = name;
+ name = new_name;
+ ret = update(dpp, y);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "Error storing new obj info " << new_name << ": " << cpp_strerror(-ret) << dendl;
+ return ret;
+ }
+ ret = store_name(dpp, true, y);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "Error storing new name " << new_name << ": " << cpp_strerror(-ret) << dendl;
+ return ret;
+ }
+ /* delete old name */
+ rgw_pool pool(get_pool(cct));
+ string oid = get_names_oid_prefix() + old_name;
+ rgw_raw_obj old_name_obj(pool, oid);
+ auto sysobj = sysobj_svc->get_obj(old_name_obj);
+ ret = sysobj.wop().remove(dpp, y);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "Error delete old obj name " << old_name << ": " << cpp_strerror(-ret) << dendl;
+ return ret;
+ }
+
+ return ret;
+}
+
+int RGWSystemMetaObj::read(const DoutPrefixProvider *dpp, optional_yield y)
+{
+ int ret = read_id(dpp, name, id, y);
+ if (ret < 0) {
+ return ret;
+ }
+
+ return read_info(dpp, id, y);
+}
+
+int RGWZoneParams::create_default(const DoutPrefixProvider *dpp, optional_yield y, bool old_format)
+{
+ name = default_zone_name;
+
+ int r = create(dpp, y);
+ if (r < 0) {
+ return r;
+ }
+
+ if (old_format) {
+ name = id;
+ }
+
+ return r;
+}
+
+const string& RGWZoneParams::get_compression_type(const rgw_placement_rule& placement_rule) const
+{
+ static const std::string NONE{"none"};
+ auto p = placement_pools.find(placement_rule.name);
+ if (p == placement_pools.end()) {
+ return NONE;
+ }
+ const auto& type = p->second.get_compression_type(placement_rule.get_storage_class());
+ return !type.empty() ? type : NONE;
+}
+
+// run an MD5 hash on the zone_id and return the first 32 bits
+static uint32_t gen_short_zone_id(const std::string& zone_id)
+{
+ unsigned char md5[CEPH_CRYPTO_MD5_DIGESTSIZE];
+ MD5 hash;
+ // Allow use of MD5 digest in FIPS mode for non-cryptographic purposes
+ hash.SetFlags(EVP_MD_CTX_FLAG_NON_FIPS_ALLOW);
+ hash.Update((const unsigned char *)zone_id.c_str(), zone_id.size());
+ hash.Final(md5);
+
+ uint32_t short_id;
+ memcpy((char *)&short_id, md5, sizeof(short_id));
+ return std::max(short_id, 1u);
+}
+
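+// merge a zonegroup into the period map: reject a second master zonegroup,
+// keep zonegroups_by_api and master_zonegroup consistent, and assign each new
+// zone a unique short id (failing with -EEXIST on a short id collision)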
+int RGWPeriodMap::update(const RGWZoneGroup& zonegroup, CephContext *cct)
+{
+ if (zonegroup.is_master_zonegroup() && (!master_zonegroup.empty() && zonegroup.get_id() != master_zonegroup)) {
+ ldout(cct,0) << "Error updating periodmap, multiple master zonegroups configured "<< dendl;
+ ldout(cct,0) << "master zonegroup: " << master_zonegroup << " and " << zonegroup.get_id() <<dendl;
+ return -EINVAL;
+ }
+ map<string, RGWZoneGroup>::iterator iter = zonegroups.find(zonegroup.get_id());
+ if (iter != zonegroups.end()) {
+ RGWZoneGroup& old_zonegroup = iter->second;
+ if (!old_zonegroup.api_name.empty()) {
+ zonegroups_by_api.erase(old_zonegroup.api_name);
+ }
+ }
+ zonegroups[zonegroup.get_id()] = zonegroup;
+
+ if (!zonegroup.api_name.empty()) {
+ zonegroups_by_api[zonegroup.api_name] = zonegroup;
+ }
+
+ if (zonegroup.is_master_zonegroup()) {
+ master_zonegroup = zonegroup.get_id();
+ } else if (master_zonegroup == zonegroup.get_id()) {
+ master_zonegroup = "";
+ }
+
+ for (auto& i : zonegroup.zones) {
+ auto& zone = i.second;
+ if (short_zone_ids.find(zone.id) != short_zone_ids.end()) {
+ continue;
+ }
+ // calculate the zone's short id
+ uint32_t short_id = gen_short_zone_id(zone.id);
+
+ // search for an existing zone with the same short id
+ for (auto& s : short_zone_ids) {
+ if (s.second == short_id) {
+ ldout(cct, 0) << "New zone '" << zone.name << "' (" << zone.id
+ << ") generates the same short_zone_id " << short_id
+ << " as existing zone id " << s.first << dendl;
+ return -EEXIST;
+ }
+ }
+
+ short_zone_ids[zone.id] = short_id;
+ }
+
+ return 0;
+}
+
+uint32_t RGWPeriodMap::get_zone_short_id(const string& zone_id) const
+{
+ auto i = short_zone_ids.find(zone_id);
+ if (i == short_zone_ids.end()) {
+ return 0;
+ }
+ return i->second;
+}
+
+bool RGWPeriodMap::find_zone_by_name(const string& zone_name,
+ RGWZoneGroup *zonegroup,
+ RGWZone *zone) const
+{
+ for (auto& iter : zonegroups) {
+ auto& zg = iter.second;
+ for (auto& ziter : zg.zones) {
+ auto& z = ziter.second;
+
+ if (z.name == zone_name) {
+ *zonegroup = zg;
+ *zone = z;
+ return true;
+ }
+ }
+ }
+
+ return false;
+}
+
+namespace rgw {
+
+int read_realm(const DoutPrefixProvider* dpp, optional_yield y,
+ sal::ConfigStore* cfgstore,
+ std::string_view realm_id,
+ std::string_view realm_name,
+ RGWRealm& info,
+ std::unique_ptr<sal::RealmWriter>* writer)
+{
+ if (!realm_id.empty()) {
+ return cfgstore->read_realm_by_id(dpp, y, realm_id, info, writer);
+ }
+ if (!realm_name.empty()) {
+ return cfgstore->read_realm_by_name(dpp, y, realm_name, info, writer);
+ }
+ return cfgstore->read_default_realm(dpp, y, info, writer);
+}
+
+int create_realm(const DoutPrefixProvider* dpp, optional_yield y,
+ sal::ConfigStore* cfgstore, bool exclusive,
+ RGWRealm& info,
+ std::unique_ptr<sal::RealmWriter>* writer_out)
+{
+ if (info.name.empty()) {
+ ldpp_dout(dpp, -1) << __func__ << " requires a realm name" << dendl;
+ return -EINVAL;
+ }
+ if (info.id.empty()) {
+ info.id = gen_random_uuid();
+ }
+
+ // if the realm already has a current_period, just make sure it exists
+ std::optional<RGWPeriod> period;
+ if (!info.current_period.empty()) {
+ period.emplace();
+ int r = cfgstore->read_period(dpp, y, info.current_period,
+ std::nullopt, *period);
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << __func__ << " failed to read realm's current_period="
+ << info.current_period << " with " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ }
+
+ // create the realm
+ std::unique_ptr<sal::RealmWriter> writer;
+ int r = cfgstore->create_realm(dpp, y, exclusive, info, &writer);
+ if (r < 0) {
+ return r;
+ }
+
+ if (!period) {
+ // initialize and exclusive-create the initial period
+ period.emplace();
+ period->id = gen_random_uuid();
+ period->period_map.id = period->id;
+ period->epoch = FIRST_EPOCH;
+ period->realm_id = info.id;
+ period->realm_name = info.name;
+
+ r = cfgstore->create_period(dpp, y, true, *period);
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << __func__ << " failed to create the initial period id="
+ << period->id << " for realm " << info.name
+ << " with " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ }
+
+ // update the realm's current_period
+ r = realm_set_current_period(dpp, y, cfgstore, *writer, info, *period);
+ if (r < 0) {
+ return r;
+ }
+
+ // try to set as default. may race with another create, so pass exclusive=true
+ // so we don't override an existing default
+ r = set_default_realm(dpp, y, cfgstore, info, true);
+ if (r < 0 && r != -EEXIST) {
+ ldpp_dout(dpp, 0) << "WARNING: failed to set realm as default: "
+ << cpp_strerror(r) << dendl;
+ }
+
+ if (writer_out) {
+ *writer_out = std::move(writer);
+ }
+ return 0;
+}
+
+int set_default_realm(const DoutPrefixProvider* dpp, optional_yield y,
+ sal::ConfigStore* cfgstore, const RGWRealm& info,
+ bool exclusive)
+{
+ return cfgstore->write_default_realm_id(dpp, y, exclusive, info.id);
+}
+
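+// advance the realm to the given period after checking that the period's
+// realm_epoch is consistent with the realm's current epoch, then reflect the
+// period's zonegroups and period config locally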
+int realm_set_current_period(const DoutPrefixProvider* dpp, optional_yield y,
+ sal::ConfigStore* cfgstore,
+ sal::RealmWriter& writer, RGWRealm& realm,
+ const RGWPeriod& period)
+{
+ // update realm epoch to match the period's
+ if (realm.epoch > period.realm_epoch) {
+ ldpp_dout(dpp, -1) << __func__ << " with old realm epoch "
+ << period.realm_epoch << ", current epoch=" << realm.epoch << dendl;
+ return -EINVAL;
+ }
+ if (realm.epoch == period.realm_epoch && realm.current_period != period.id) {
+ ldpp_dout(dpp, -1) << __func__ << " with same realm epoch "
+ << period.realm_epoch << ", but different period id "
+ << period.id << " != " << realm.current_period << dendl;
+ return -EINVAL;
+ }
+
+ realm.epoch = period.realm_epoch;
+ realm.current_period = period.id;
+
+ // update the realm object
+ int r = writer.write(dpp, y, realm);
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << __func__ << " failed to overwrite realm "
+ << realm.name << " with " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ // reflect the zonegroup and period config
+ (void) reflect_period(dpp, y, cfgstore, period);
+ return 0;
+}
+
+int reflect_period(const DoutPrefixProvider* dpp, optional_yield y,
+ sal::ConfigStore* cfgstore, const RGWPeriod& info)
+{
+ // overwrite the local period config and zonegroup objects
+ constexpr bool exclusive = false;
+
+ int r = cfgstore->write_period_config(dpp, y, exclusive, info.realm_id,
+ info.period_config);
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << __func__ << " failed to store period config for realm id="
+ << info.realm_id << " with " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ for (auto& [zonegroup_id, zonegroup] : info.period_map.zonegroups) {
+ r = cfgstore->create_zonegroup(dpp, y, exclusive, zonegroup, nullptr);
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << __func__ << " failed to store zonegroup id="
+ << zonegroup_id << " with " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ if (zonegroup.is_master) {
+ // set master as default if no default exists
+ constexpr bool exclusive = true;
+ r = set_default_zonegroup(dpp, y, cfgstore, zonegroup, exclusive);
+ if (r == 0) {
+ ldpp_dout(dpp, 1) << "Set the period's master zonegroup "
+ << zonegroup.name << " as the default" << dendl;
+ }
+ }
+ }
+ return 0;
+}
+
+std::string get_staging_period_id(std::string_view realm_id)
+{
+ return string_cat_reserve(realm_id, ":staging");
+}
+
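+// turn the current period into the realm's staging period: remember the old
+// id as predecessor_uuid, switch to the "<realm_id>:staging" id, reset the
+// period map and bump realm_epoch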
+void fork_period(const DoutPrefixProvider* dpp, RGWPeriod& info)
+{
+ ldpp_dout(dpp, 20) << __func__ << " realm id=" << info.realm_id
+ << " period id=" << info.id << dendl;
+
+ info.predecessor_uuid = std::move(info.id);
+ info.id = get_staging_period_id(info.realm_id);
+ info.period_map.reset();
+ info.realm_epoch++;
+}
+
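+// rebuild the period's map from every zonegroup that belongs to the realm,
+// requiring each zonegroup to have an existing master zone, and reload the
+// realm's period config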
+int update_period(const DoutPrefixProvider* dpp, optional_yield y,
+ sal::ConfigStore* cfgstore, RGWPeriod& info)
+{
+ // clear zone short ids of removed zones. period_map.update() will add the
+ // remaining zones back
+ info.period_map.short_zone_ids.clear();
+
+ // list all zonegroups in the realm
+ rgw::sal::ListResult<std::string> listing;
+ std::array<std::string, 1000> zonegroup_names; // list in pages of 1000
+ do {
+ int ret = cfgstore->list_zonegroup_names(dpp, y, listing.next,
+ zonegroup_names, listing);
+ if (ret < 0) {
+ std::cerr << "failed to list zonegroups: " << cpp_strerror(-ret) << std::endl;
+ return ret;
+ }
+ for (const auto& name : listing.entries) {
+ RGWZoneGroup zg;
+ ret = cfgstore->read_zonegroup_by_name(dpp, y, name, zg, nullptr);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "WARNING: failed to read zonegroup "
+ << name << ": " << cpp_strerror(-ret) << dendl;
+ continue;
+ }
+
+ if (zg.realm_id != info.realm_id) {
+ ldpp_dout(dpp, 20) << "skipping zonegroup " << zg.get_name()
+ << " with realm id " << zg.realm_id
+ << ", not on our realm " << info.realm_id << dendl;
+ continue;
+ }
+
+ if (zg.master_zone.empty()) {
+ ldpp_dout(dpp, 0) << "ERROR: zonegroup " << zg.get_name() << " should have a master zone " << dendl;
+ return -EINVAL;
+ }
+
+ if (zg.zones.find(zg.master_zone) == zg.zones.end()) {
+ ldpp_dout(dpp, 0) << "ERROR: zonegroup " << zg.get_name()
+ << " has a non existent master zone "<< dendl;
+ return -EINVAL;
+ }
+
+ if (zg.is_master_zonegroup()) {
+ info.master_zonegroup = zg.get_id();
+ info.master_zone = zg.master_zone;
+ }
+
+ ret = info.period_map.update(zg, dpp->get_cct());
+ if (ret < 0) {
+ return ret;
+ }
+ } // foreach name in listing.entries
+ } while (!listing.next.empty());
+
+ // read the realm's current period config
+ int ret = cfgstore->read_period_config(dpp, y, info.realm_id,
+ info.period_config);
+ if (ret < 0 && ret != -ENOENT) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to read period config: "
+ << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+
+ return 0;
+}
+
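+// commit a staging period on the metadata master zone: if the master zone
+// changed, record the metadata sync status and create a brand new period;
+// otherwise store the next epoch of the current period, then notify the realm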
+int commit_period(const DoutPrefixProvider* dpp, optional_yield y,
+ sal::ConfigStore* cfgstore, sal::Driver* driver,
+ RGWRealm& realm, sal::RealmWriter& realm_writer,
+ const RGWPeriod& current_period,
+ RGWPeriod& info, std::ostream& error_stream,
+ bool force_if_stale)
+{
+ auto zone_svc = static_cast<rgw::sal::RadosStore*>(driver)->svc()->zone; // XXX
+
+ ldpp_dout(dpp, 20) << __func__ << " realm " << realm.id
+ << " period " << current_period.id << dendl;
+ // gateway must be in the master zone to commit
+ if (info.master_zone != zone_svc->get_zone_params().id) {
+ error_stream << "Cannot commit period on zone "
+ << zone_svc->get_zone_params().id << ", it must be sent to "
+ "the period's master zone " << info.master_zone << '.' << std::endl;
+ return -EINVAL;
+ }
+ // period predecessor must match current period
+ if (info.predecessor_uuid != current_period.id) {
+ error_stream << "Period predecessor " << info.predecessor_uuid
+ << " does not match current period " << current_period.id
+ << ". Use 'period pull' to get the latest period from the master, "
+ "reapply your changes, and try again." << std::endl;
+ return -EINVAL;
+ }
+ // realm epoch must be 1 greater than current period
+ if (info.realm_epoch != current_period.realm_epoch + 1) {
+ error_stream << "Period's realm epoch " << info.realm_epoch
+ << " does not come directly after current realm epoch "
+ << current_period.realm_epoch << ". Use 'realm pull' to get the "
+ "latest realm and period from the master zone, reapply your changes, "
+ "and try again." << std::endl;
+ return -EINVAL;
+ }
+ // did the master zone change?
+ if (info.master_zone != current_period.master_zone) {
+ // store the current metadata sync status in the period
+ int r = info.update_sync_status(dpp, driver, current_period,
+ error_stream, force_if_stale);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "failed to update metadata sync status: "
+ << cpp_strerror(-r) << dendl;
+ return r;
+ }
+ // create an object with a new period id
+ info.period_map.id = info.id = gen_random_uuid();
+ info.epoch = FIRST_EPOCH;
+
+ constexpr bool exclusive = true;
+ r = cfgstore->create_period(dpp, y, exclusive, info);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "failed to create new period: " << cpp_strerror(-r) << dendl;
+ return r;
+ }
+ // set as current period
+ r = realm_set_current_period(dpp, y, cfgstore, realm_writer, realm, info);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "failed to update realm's current period: "
+ << cpp_strerror(-r) << dendl;
+ return r;
+ }
+ ldpp_dout(dpp, 4) << "Promoted to master zone and committed new period "
+ << info.id << dendl;
+ (void) cfgstore->realm_notify_new_period(dpp, y, info);
+ return 0;
+ }
+ // period must be based on current epoch
+ if (info.epoch != current_period.epoch) {
+ error_stream << "Period epoch " << info.epoch << " does not match "
+ "predecessor epoch " << current_period.epoch << ". Use "
+ "'period pull' to get the latest epoch from the master zone, "
+ "reapply your changes, and try again." << std::endl;
+ return -EINVAL;
+ }
+ // set period as next epoch
+ info.id = current_period.id;
+ info.epoch = current_period.epoch + 1;
+ info.predecessor_uuid = current_period.predecessor_uuid;
+ info.realm_epoch = current_period.realm_epoch;
+ // write the period
+ constexpr bool exclusive = true;
+ int r = cfgstore->create_period(dpp, y, exclusive, info);
+ if (r == -EEXIST) {
+ // already have this epoch (or a more recent one)
+ return 0;
+ }
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "failed to store period: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ r = reflect_period(dpp, y, cfgstore, info);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "failed to update local objects: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ ldpp_dout(dpp, 4) << "Committed new epoch " << info.epoch
+ << " for period " << info.id << dendl;
+ (void) cfgstore->realm_notify_new_period(dpp, y, info);
+ return 0;
+}
+
+
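+// look up a zonegroup by id, else by name; with neither, fall back to the
+// default zonegroup of the default realm, or the zonegroup named
+// default_zonegroup_name when no default realm exists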
+int read_zonegroup(const DoutPrefixProvider* dpp, optional_yield y,
+ sal::ConfigStore* cfgstore,
+ std::string_view zonegroup_id,
+ std::string_view zonegroup_name,
+ RGWZoneGroup& info,
+ std::unique_ptr<sal::ZoneGroupWriter>* writer)
+{
+ if (!zonegroup_id.empty()) {
+ return cfgstore->read_zonegroup_by_id(dpp, y, zonegroup_id, info, writer);
+ }
+ if (!zonegroup_name.empty()) {
+ return cfgstore->read_zonegroup_by_name(dpp, y, zonegroup_name, info, writer);
+ }
+
+ std::string realm_id;
+ int r = cfgstore->read_default_realm_id(dpp, y, realm_id);
+ if (r == -ENOENT) {
+ return cfgstore->read_zonegroup_by_name(dpp, y, default_zonegroup_name,
+ info, writer);
+ }
+ if (r < 0) {
+ return r;
+ }
+ return cfgstore->read_default_zonegroup(dpp, y, realm_id, info, writer);
+}
+
+int create_zonegroup(const DoutPrefixProvider* dpp, optional_yield y,
+ sal::ConfigStore* cfgstore, bool exclusive,
+ RGWZoneGroup& info)
+{
+ if (info.name.empty()) {
+ ldpp_dout(dpp, -1) << __func__ << " requires a zonegroup name" << dendl;
+ return -EINVAL;
+ }
+ if (info.id.empty()) {
+ info.id = gen_random_uuid();
+ }
+
+ // insert the default placement target if it doesn't exist
+ constexpr std::string_view default_placement_name = "default-placement";
+
+ RGWZoneGroupPlacementTarget placement_target;
+ placement_target.name = default_placement_name;
+
+ info.placement_targets.emplace(default_placement_name, placement_target);
+ if (info.default_placement.name.empty()) {
+ info.default_placement.name = default_placement_name;
+ }
+
+ int r = cfgstore->create_zonegroup(dpp, y, exclusive, info, nullptr);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "failed to create zonegroup with "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ // try to set as default. may race with another create, so pass exclusive=true
+ // so we don't override an existing default
+ r = set_default_zonegroup(dpp, y, cfgstore, info, true);
+ if (r < 0 && r != -EEXIST) {
+ ldpp_dout(dpp, 0) << "WARNING: failed to set zonegroup as default: "
+ << cpp_strerror(r) << dendl;
+ }
+
+ return 0;
+}
+
+int set_default_zonegroup(const DoutPrefixProvider* dpp, optional_yield y,
+ sal::ConfigStore* cfgstore, const RGWZoneGroup& info,
+ bool exclusive)
+{
+ return cfgstore->write_default_zonegroup_id(
+ dpp, y, exclusive, info.realm_id, info.id);
+}
+
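+// drop a zone from the given zonegroup, promoting another zone to master if
+// the removed zone was the master and recomputing per-zone log_data; returns
+// -ENOENT if the zone is not a member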
+int remove_zone_from_group(const DoutPrefixProvider* dpp,
+ RGWZoneGroup& zonegroup,
+ const rgw_zone_id& zone_id)
+{
+ auto z = zonegroup.zones.find(zone_id);
+ if (z == zonegroup.zones.end()) {
+ return -ENOENT;
+ }
+ zonegroup.zones.erase(z);
+
+ if (zonegroup.master_zone == zone_id) {
+ // choose a new master zone
+ auto m = zonegroup.zones.begin();
+ if (m != zonegroup.zones.end()) {
+ zonegroup.master_zone = m->first;
+ ldpp_dout(dpp, 0) << "NOTICE: promoted " << m->second.name
+ << " as new master_zone of zonegroup " << zonegroup.name << dendl;
+ } else {
+ ldpp_dout(dpp, 0) << "NOTICE: removed master_zone of zonegroup "
+ << zonegroup.name << dendl;
+ }
+ }
+
+ const bool log_data = zonegroup.zones.size() > 1;
+ for (auto& [id, zone] : zonegroup.zones) {
+ zone.log_data = log_data;
+ }
+
+ return 0;
+}
+
+// try to remove the given zone id from every zonegroup in the cluster
+static int remove_zone_from_groups(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ sal::ConfigStore* cfgstore,
+ const rgw_zone_id& zone_id)
+{
+ std::array<std::string, 128> zonegroup_names;
+ sal::ListResult<std::string> listing;
+ do {
+ int r = cfgstore->list_zonegroup_names(dpp, y, listing.next,
+ zonegroup_names, listing);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "failed to list zonegroups with "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ for (const auto& name : listing.entries) {
+ RGWZoneGroup zonegroup;
+ std::unique_ptr<sal::ZoneGroupWriter> writer;
+ r = cfgstore->read_zonegroup_by_name(dpp, y, name, zonegroup, &writer);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "WARNING: failed to load zonegroup " << name
+ << " with " << cpp_strerror(r) << dendl;
+ continue;
+ }
+
+ r = remove_zone_from_group(dpp, zonegroup, zone_id);
+ if (r < 0) {
+ continue;
+ }
+
+ // write the updated zonegroup
+ r = writer->write(dpp, y, zonegroup);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "WARNING: failed to write zonegroup " << name
+ << " with " << cpp_strerror(r) << dendl;
+ continue;
+ }
+ ldpp_dout(dpp, 0) << "Removed zone from zonegroup " << name << dendl;
+ }
+ } while (!listing.next.empty());
+
+ return 0;
+}
+
+
+int read_zone(const DoutPrefixProvider* dpp, optional_yield y,
+ sal::ConfigStore* cfgstore,
+ std::string_view zone_id,
+ std::string_view zone_name,
+ RGWZoneParams& info,
+ std::unique_ptr<sal::ZoneWriter>* writer)
+{
+ if (!zone_id.empty()) {
+ return cfgstore->read_zone_by_id(dpp, y, zone_id, info, writer);
+ }
+ if (!zone_name.empty()) {
+ return cfgstore->read_zone_by_name(dpp, y, zone_name, info, writer);
+ }
+
+ std::string realm_id;
+ int r = cfgstore->read_default_realm_id(dpp, y, realm_id);
+ if (r == -ENOENT) {
+ return cfgstore->read_zone_by_name(dpp, y, default_zone_name, info, writer);
+ }
+ if (r < 0) {
+ return r;
+ }
+ return cfgstore->read_default_zone(dpp, y, realm_id, info, writer);
+}
+
+extern int get_zones_pool_set(const DoutPrefixProvider *dpp, optional_yield y,
+ rgw::sal::ConfigStore* cfgstore,
+ std::string_view my_zone_id,
+ std::set<rgw_pool>& pools);
+
+int create_zone(const DoutPrefixProvider* dpp, optional_yield y,
+ sal::ConfigStore* cfgstore, bool exclusive,
+ RGWZoneParams& info, std::unique_ptr<sal::ZoneWriter>* writer)
+{
+ if (info.name.empty()) {
+ ldpp_dout(dpp, -1) << __func__ << " requires a zone name" << dendl;
+ return -EINVAL;
+ }
+ if (info.id.empty()) {
+ info.id = gen_random_uuid();
+ }
+
+ // add default placement with empty pool name
+ rgw_pool pool;
+ auto& placement = info.placement_pools["default-placement"];
+ placement.storage_classes.set_storage_class(
+ RGW_STORAGE_CLASS_STANDARD, &pool, nullptr);
+
+ // build a set of all pool names used by other zones
+ std::set<rgw_pool> pools;
+ int r = get_zones_pool_set(dpp, y, cfgstore, info.id, pools);
+ if (r < 0) {
+ return r;
+ }
+
+ // initialize pool names with the zone name prefix
+ r = init_zone_pool_names(dpp, y, pools, info);
+ if (r < 0) {
+ return r;
+ }
+
+ r = cfgstore->create_zone(dpp, y, exclusive, info, nullptr);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "failed to create zone with "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ // try to set as default. may race with another create, so pass exclusive=true
+ // so we don't override an existing default
+ r = set_default_zone(dpp, y, cfgstore, info, true);
+ if (r < 0 && r != -EEXIST) {
+ ldpp_dout(dpp, 0) << "WARNING: failed to set zone as default: "
+ << cpp_strerror(r) << dendl;
+ }
+
+ return 0;
+}
+
+int set_default_zone(const DoutPrefixProvider* dpp, optional_yield y,
+ sal::ConfigStore* cfgstore, const RGWZoneParams& info,
+ bool exclusive)
+{
+ return cfgstore->write_default_zone_id(
+ dpp, y, exclusive, info.realm_id, info.id);
+}
+
+int delete_zone(const DoutPrefixProvider* dpp, optional_yield y,
+ sal::ConfigStore* cfgstore, const RGWZoneParams& info,
+ sal::ZoneWriter& writer)
+{
+ // remove this zone from any zonegroups that contain it
+ int r = remove_zone_from_groups(dpp, y, cfgstore, info.id);
+ if (r < 0) {
+ return r;
+ }
+
+ return writer.remove(dpp, y);
+}
+
+} // namespace rgw
+
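+// parse config[key] as a base-10 integer into *pval; *pval is left untouched
+// when the key is absent, and -EINVAL is returned when the value is present
+// but not numeric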
+static inline int conf_to_uint64(const JSONFormattable& config, const string& key, uint64_t *pval)
+{
+ string sval;
+ if (config.find(key, &sval)) {
+ string err;
+ uint64_t val = strict_strtoll(sval.c_str(), 10, &err);
+ if (!err.empty()) {
+ return -EINVAL;
+ }
+ *pval = val;
+ }
+ return 0;
+}
+
+int RGWZoneGroupPlacementTier::update_params(const JSONFormattable& config)
+{
+ int r = -1;
+
+ if (config.exists("retain_head_object")) {
+ string s = config["retain_head_object"];
+ if (s == "true") {
+ retain_head_object = true;
+ } else {
+ retain_head_object = false;
+ }
+ }
+
+ if (tier_type == "cloud-s3") {
+ r = t.s3.update_params(config);
+ }
+
+ return r;
+}
+
+int RGWZoneGroupPlacementTier::clear_params(const JSONFormattable& config)
+{
+ if (config.exists("retain_head_object")) {
+ retain_head_object = false;
+ }
+
+ if (tier_type == "cloud-s3") {
+ t.s3.clear_params(config);
+ }
+
+ return 0;
+}
+
+int RGWZoneGroupPlacementTierS3::update_params(const JSONFormattable& config)
+{
+ int r = -1;
+
+ if (config.exists("endpoint")) {
+ endpoint = config["endpoint"];
+ }
+ if (config.exists("target_path")) {
+ target_path = config["target_path"];
+ }
+ if (config.exists("region")) {
+ region = config["region"];
+ }
+ if (config.exists("host_style")) {
+ string s;
+ s = config["host_style"];
+ if (s != "virtual") {
+ host_style = PathStyle;
+ } else {
+ host_style = VirtualStyle;
+ }
+ }
+ if (config.exists("target_storage_class")) {
+ target_storage_class = config["target_storage_class"];
+ }
+ if (config.exists("access_key")) {
+ key.id = config["access_key"];
+ }
+ if (config.exists("secret")) {
+ key.key = config["secret"];
+ }
+ if (config.exists("multipart_sync_threshold")) {
+ r = conf_to_uint64(config, "multipart_sync_threshold", &multipart_sync_threshold);
+ if (r < 0) {
+ multipart_sync_threshold = DEFAULT_MULTIPART_SYNC_PART_SIZE;
+ }
+ }
+
+ if (config.exists("multipart_min_part_size")) {
+ r = conf_to_uint64(config, "multipart_min_part_size", &multipart_min_part_size);
+ if (r < 0) {
+ multipart_min_part_size = DEFAULT_MULTIPART_SYNC_PART_SIZE;
+ }
+ }
+
+ if (config.exists("acls")) {
+ const JSONFormattable& cc = config["acls"];
+ if (cc.is_array()) {
+ for (auto& c : cc.array()) {
+ RGWTierACLMapping m;
+ m.init(c);
+ if (!m.source_id.empty()) {
+ acl_mappings[m.source_id] = m;
+ }
+ }
+ } else {
+ RGWTierACLMapping m;
+ m.init(cc);
+ if (!m.source_id.empty()) {
+ acl_mappings[m.source_id] = m;
+ }
+ }
+ }
+ return 0;
+}
+
+int RGWZoneGroupPlacementTierS3::clear_params(const JSONFormattable& config)
+{
+ if (config.exists("endpoint")) {
+ endpoint.clear();
+ }
+ if (config.exists("target_path")) {
+ target_path.clear();
+ }
+ if (config.exists("region")) {
+ region.clear();
+ }
+ if (config.exists("host_style")) {
+ /* default */
+ host_style = PathStyle;
+ }
+ if (config.exists("target_storage_class")) {
+ target_storage_class.clear();
+ }
+ if (config.exists("access_key")) {
+ key.id.clear();
+ }
+ if (config.exists("secret")) {
+ key.key.clear();
+ }
+ if (config.exists("multipart_sync_threshold")) {
+ multipart_sync_threshold = DEFAULT_MULTIPART_SYNC_PART_SIZE;
+ }
+ if (config.exists("multipart_min_part_size")) {
+ multipart_min_part_size = DEFAULT_MULTIPART_SYNC_PART_SIZE;
+ }
+ if (config.exists("acls")) {
+ const JSONFormattable& cc = config["acls"];
+ if (cc.is_array()) {
+ for (auto& c : cc.array()) {
+ RGWTierACLMapping m;
+ m.init(c);
+ acl_mappings.erase(m.source_id);
+ }
+ } else {
+ RGWTierACLMapping m;
+ m.init(cc);
+ acl_mappings.erase(m.source_id);
+ }
+ }
+ return 0;
+}
+
+void rgw_meta_sync_info::generate_test_instances(list<rgw_meta_sync_info*>& o)
+{
+ auto info = new rgw_meta_sync_info;
+ info->state = rgw_meta_sync_info::StateBuildingFullSyncMaps;
+ info->period = "periodid";
+ info->realm_epoch = 5;
+ o.push_back(info);
+ o.push_back(new rgw_meta_sync_info);
+}
+
+void rgw_meta_sync_marker::generate_test_instances(list<rgw_meta_sync_marker*>& o)
+{
+ auto marker = new rgw_meta_sync_marker;
+ marker->state = rgw_meta_sync_marker::IncrementalSync;
+ marker->marker = "01234";
+ marker->realm_epoch = 5;
+ o.push_back(marker);
+ o.push_back(new rgw_meta_sync_marker);
+}
+
+void rgw_meta_sync_status::generate_test_instances(list<rgw_meta_sync_status*>& o)
+{
+ o.push_back(new rgw_meta_sync_status);
+}
+
+void RGWZoneParams::generate_test_instances(list<RGWZoneParams*> &o)
+{
+ o.push_back(new RGWZoneParams);
+ o.push_back(new RGWZoneParams);
+}
+
+void RGWPeriodLatestEpochInfo::generate_test_instances(list<RGWPeriodLatestEpochInfo*> &o)
+{
+ RGWPeriodLatestEpochInfo *z = new RGWPeriodLatestEpochInfo;
+ o.push_back(z);
+ o.push_back(new RGWPeriodLatestEpochInfo);
+}
+
+void RGWZoneGroup::generate_test_instances(list<RGWZoneGroup*>& o)
+{
+ RGWZoneGroup *r = new RGWZoneGroup;
+ o.push_back(r);
+ o.push_back(new RGWZoneGroup);
+}
+
+void RGWPeriodLatestEpochInfo::dump(Formatter *f) const {
+ encode_json("latest_epoch", epoch, f);
+}
+
+void RGWPeriodLatestEpochInfo::decode_json(JSONObj *obj) {
+ JSONDecoder::decode_json("latest_epoch", epoch, obj);
+}
+
+void RGWNameToId::dump(Formatter *f) const {
+ encode_json("obj_id", obj_id, f);
+}
+
+void RGWNameToId::decode_json(JSONObj *obj) {
+ JSONDecoder::decode_json("obj_id", obj_id, obj);
+}
+
diff --git a/src/rgw/driver/rados/rgw_zone.h b/src/rgw/driver/rados/rgw_zone.h
new file mode 100644
index 000000000..2d69d5f1c
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_zone.h
@@ -0,0 +1,943 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include <ostream>
+#include "rgw_zone_types.h"
+#include "rgw_common.h"
+#include "rgw_sal_fwd.h"
+#include "rgw_sync_policy.h"
+
+
+class RGWSyncModulesManager;
+
+class RGWSI_SysObj;
+class RGWSI_Zone;
+
+class RGWSystemMetaObj {
+public:
+ std::string id;
+ std::string name;
+
+ CephContext *cct{nullptr};
+ RGWSI_SysObj *sysobj_svc{nullptr};
+ RGWSI_Zone *zone_svc{nullptr};
+
+ int store_name(const DoutPrefixProvider *dpp, bool exclusive, optional_yield y);
+ int store_info(const DoutPrefixProvider *dpp, bool exclusive, optional_yield y);
+ int read_info(const DoutPrefixProvider *dpp, const std::string& obj_id, optional_yield y, bool old_format = false);
+ int read_id(const DoutPrefixProvider *dpp, const std::string& obj_name, std::string& obj_id, optional_yield y);
+ int read_default(const DoutPrefixProvider *dpp,
+ RGWDefaultSystemMetaObjInfo& default_info,
+ const std::string& oid,
+ optional_yield y);
+ /* read and use default id */
+ int use_default(const DoutPrefixProvider *dpp, optional_yield y, bool old_format = false);
+
+public:
+ RGWSystemMetaObj() {}
+ RGWSystemMetaObj(const std::string& _name): name(_name) {}
+ RGWSystemMetaObj(const std::string& _id, const std::string& _name) : id(_id), name(_name) {}
+ RGWSystemMetaObj(CephContext *_cct, RGWSI_SysObj *_sysobj_svc) {
+ reinit_instance(_cct, _sysobj_svc);
+ }
+ RGWSystemMetaObj(const std::string& _name, CephContext *_cct, RGWSI_SysObj *_sysobj_svc): name(_name) {
+ reinit_instance(_cct, _sysobj_svc);
+ }
+
+ const std::string& get_name() const { return name; }
+ const std::string& get_id() const { return id; }
+
+ void set_name(const std::string& _name) { name = _name;}
+ void set_id(const std::string& _id) { id = _id;}
+ void clear_id() { id.clear(); }
+
+ virtual ~RGWSystemMetaObj() {}
+
+ virtual void encode(bufferlist& bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(id, bl);
+ encode(name, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ virtual void decode(bufferlist::const_iterator& bl) {
+ DECODE_START(1, bl);
+ decode(id, bl);
+ decode(name, bl);
+ DECODE_FINISH(bl);
+ }
+
+ void reinit_instance(CephContext *_cct, RGWSI_SysObj *_sysobj_svc);
+ int init(const DoutPrefixProvider *dpp, CephContext *_cct, RGWSI_SysObj *_sysobj_svc,
+ optional_yield y,
+ bool setup_obj = true, bool old_format = false);
+ virtual int read_default_id(const DoutPrefixProvider *dpp, std::string& default_id, optional_yield y,
+ bool old_format = false);
+ virtual int set_as_default(const DoutPrefixProvider *dpp, optional_yield y, bool exclusive = false);
+ int delete_default();
+ virtual int create(const DoutPrefixProvider *dpp, optional_yield y, bool exclusive = true);
+ int delete_obj(const DoutPrefixProvider *dpp, optional_yield y, bool old_format = false);
+ int rename(const DoutPrefixProvider *dpp, const std::string& new_name, optional_yield y);
+ int update(const DoutPrefixProvider *dpp, optional_yield y) { return store_info(dpp, false, y);}
+ int update_name(const DoutPrefixProvider *dpp, optional_yield y) { return store_name(dpp, false, y);}
+ int read(const DoutPrefixProvider *dpp, optional_yield y);
+ int write(const DoutPrefixProvider *dpp, bool exclusive, optional_yield y);
+
+ virtual rgw_pool get_pool(CephContext *cct) const = 0;
+ virtual const std::string get_default_oid(bool old_format = false) const = 0;
+ virtual const std::string& get_names_oid_prefix() const = 0;
+ virtual const std::string& get_info_oid_prefix(bool old_format = false) const = 0;
+ virtual std::string get_predefined_id(CephContext *cct) const = 0;
+ virtual const std::string& get_predefined_name(CephContext *cct) const = 0;
+
+ void dump(Formatter *f) const;
+ void decode_json(JSONObj *obj);
+};
+WRITE_CLASS_ENCODER(RGWSystemMetaObj)
+
+struct RGWZoneParams : RGWSystemMetaObj {
+ rgw_pool domain_root;
+ rgw_pool control_pool;
+ rgw_pool gc_pool;
+ rgw_pool lc_pool;
+ rgw_pool log_pool;
+ rgw_pool intent_log_pool;
+ rgw_pool usage_log_pool;
+ rgw_pool user_keys_pool;
+ rgw_pool user_email_pool;
+ rgw_pool user_swift_pool;
+ rgw_pool user_uid_pool;
+ rgw_pool roles_pool;
+ rgw_pool reshard_pool;
+ rgw_pool otp_pool;
+ rgw_pool oidc_pool;
+ rgw_pool notif_pool;
+
+ RGWAccessKey system_key;
+
+ std::map<std::string, RGWZonePlacementInfo> placement_pools;
+
+ std::string realm_id;
+
+ JSONFormattable tier_config;
+
+ RGWZoneParams() : RGWSystemMetaObj() {}
+ explicit RGWZoneParams(const std::string& name) : RGWSystemMetaObj(name){}
+ RGWZoneParams(const rgw_zone_id& id, const std::string& name) : RGWSystemMetaObj(id.id, name) {}
+ RGWZoneParams(const rgw_zone_id& id, const std::string& name, const std::string& _realm_id)
+ : RGWSystemMetaObj(id.id, name), realm_id(_realm_id) {}
+ virtual ~RGWZoneParams();
+
+ rgw_pool get_pool(CephContext *cct) const override;
+ const std::string get_default_oid(bool old_format = false) const override;
+ const std::string& get_names_oid_prefix() const override;
+ const std::string& get_info_oid_prefix(bool old_format = false) const override;
+ std::string get_predefined_id(CephContext *cct) const override;
+ const std::string& get_predefined_name(CephContext *cct) const override;
+
+ int init(const DoutPrefixProvider *dpp,
+ CephContext *_cct, RGWSI_SysObj *_sysobj_svc, optional_yield y,
+ bool setup_obj = true, bool old_format = false);
+ using RGWSystemMetaObj::init;
+ int read_default_id(const DoutPrefixProvider *dpp, std::string& default_id, optional_yield y, bool old_format = false) override;
+ int set_as_default(const DoutPrefixProvider *dpp, optional_yield y, bool exclusive = false) override;
+ int create_default(const DoutPrefixProvider *dpp, optional_yield y, bool old_format = false);
+ int create(const DoutPrefixProvider *dpp, optional_yield y, bool exclusive = true) override;
+ int fix_pool_names(const DoutPrefixProvider *dpp, optional_yield y);
+
+ const std::string& get_compression_type(const rgw_placement_rule& placement_rule) const;
+
+ void encode(bufferlist& bl) const override {
+ ENCODE_START(14, 1, bl);
+ encode(domain_root, bl);
+ encode(control_pool, bl);
+ encode(gc_pool, bl);
+ encode(log_pool, bl);
+ encode(intent_log_pool, bl);
+ encode(usage_log_pool, bl);
+ encode(user_keys_pool, bl);
+ encode(user_email_pool, bl);
+ encode(user_swift_pool, bl);
+ encode(user_uid_pool, bl);
+ RGWSystemMetaObj::encode(bl);
+ encode(system_key, bl);
+ encode(placement_pools, bl);
+ rgw_pool unused_metadata_heap;
+ encode(unused_metadata_heap, bl);
+ encode(realm_id, bl);
+ encode(lc_pool, bl);
+ std::map<std::string, std::string, ltstr_nocase> old_tier_config;
+ encode(old_tier_config, bl);
+ encode(roles_pool, bl);
+ encode(reshard_pool, bl);
+ encode(otp_pool, bl);
+ encode(tier_config, bl);
+ encode(oidc_pool, bl);
+ encode(notif_pool, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::const_iterator& bl) override {
+ DECODE_START(14, bl);
+ decode(domain_root, bl);
+ decode(control_pool, bl);
+ decode(gc_pool, bl);
+ decode(log_pool, bl);
+ decode(intent_log_pool, bl);
+ decode(usage_log_pool, bl);
+ decode(user_keys_pool, bl);
+ decode(user_email_pool, bl);
+ decode(user_swift_pool, bl);
+ decode(user_uid_pool, bl);
+ if (struct_v >= 6) {
+ RGWSystemMetaObj::decode(bl);
+ } else if (struct_v >= 2) {
+ decode(name, bl);
+ id = name;
+ }
+ if (struct_v >= 3)
+ decode(system_key, bl);
+ if (struct_v >= 4)
+ decode(placement_pools, bl);
+ if (struct_v >= 5) {
+ rgw_pool unused_metadata_heap;
+ decode(unused_metadata_heap, bl);
+ }
+ if (struct_v >= 6) {
+ decode(realm_id, bl);
+ }
+ if (struct_v >= 7) {
+ decode(lc_pool, bl);
+ } else {
+ lc_pool = log_pool.name + ":lc";
+ }
+ std::map<std::string, std::string, ltstr_nocase> old_tier_config;
+ if (struct_v >= 8) {
+ decode(old_tier_config, bl);
+ }
+ if (struct_v >= 9) {
+ decode(roles_pool, bl);
+ } else {
+ roles_pool = name + ".rgw.meta:roles";
+ }
+ if (struct_v >= 10) {
+ decode(reshard_pool, bl);
+ } else {
+ reshard_pool = log_pool.name + ":reshard";
+ }
+ if (struct_v >= 11) {
+ ::decode(otp_pool, bl);
+ } else {
+ otp_pool = name + ".rgw.otp";
+ }
+ if (struct_v >= 12) {
+ ::decode(tier_config, bl);
+ } else {
+ for (auto& kv : old_tier_config) {
+ tier_config.set(kv.first, kv.second);
+ }
+ }
+ if (struct_v >= 13) {
+ ::decode(oidc_pool, bl);
+ } else {
+ oidc_pool = name + ".rgw.meta:oidc";
+ }
+ if (struct_v >= 14) {
+ decode(notif_pool, bl);
+ } else {
+ notif_pool = log_pool.name + ":notif";
+ }
+ DECODE_FINISH(bl);
+ }
+ void dump(Formatter *f) const;
+ void decode_json(JSONObj *obj);
+ static void generate_test_instances(std::list<RGWZoneParams*>& o);
+
+ bool get_placement(const std::string& placement_id, RGWZonePlacementInfo *placement) const {
+ auto iter = placement_pools.find(placement_id);
+ if (iter == placement_pools.end()) {
+ return false;
+ }
+ *placement = iter->second;
+ return true;
+ }
+
+ /*
+ * return data pool of the head object
+ */
+ bool get_head_data_pool(const rgw_placement_rule& placement_rule, const rgw_obj& obj, rgw_pool* pool) const {
+ const rgw_data_placement_target& explicit_placement = obj.bucket.explicit_placement;
+ if (!explicit_placement.data_pool.empty()) {
+ if (!obj.in_extra_data) {
+ *pool = explicit_placement.data_pool;
+ } else {
+ *pool = explicit_placement.get_data_extra_pool();
+ }
+ return true;
+ }
+ if (placement_rule.empty()) {
+ return false;
+ }
+ auto iter = placement_pools.find(placement_rule.name);
+ if (iter == placement_pools.end()) {
+ return false;
+ }
+ if (!obj.in_extra_data) {
+ *pool = iter->second.get_data_pool(placement_rule.storage_class);
+ } else {
+ *pool = iter->second.get_data_extra_pool();
+ }
+ return true;
+ }
+
+ bool valid_placement(const rgw_placement_rule& rule) const {
+ auto iter = placement_pools.find(rule.name);
+ if (iter == placement_pools.end()) {
+ return false;
+ }
+ return iter->second.storage_class_exists(rule.storage_class);
+ }
+};
+WRITE_CLASS_ENCODER(RGWZoneParams)
+
+struct RGWZoneGroup : public RGWSystemMetaObj {
+ std::string api_name;
+ std::list<std::string> endpoints;
+ bool is_master = false;
+
+ rgw_zone_id master_zone;
+ std::map<rgw_zone_id, RGWZone> zones;
+
+ std::map<std::string, RGWZoneGroupPlacementTarget> placement_targets;
+ rgw_placement_rule default_placement;
+
+ std::list<std::string> hostnames;
+ std::list<std::string> hostnames_s3website;
+ // TODO: Maybe convert hostnames to a map<std::string,std::list<std::string>> for
+ // endpoint_type->hostnames
+/*
+20:05 < _robbat21irssi> maybe I do someting like: if (hostname_map.empty()) { populate all map keys from hostnames; };
+20:05 < _robbat21irssi> but that's a later compatability migration planning bit
+20:06 < yehudasa> more like if (!hostnames.empty()) {
+20:06 < yehudasa> for (std::list<std::string>::iterator iter = hostnames.begin(); iter != hostnames.end(); ++iter) {
+20:06 < yehudasa> hostname_map["s3"].append(iter->second);
+20:07 < yehudasa> hostname_map["s3website"].append(iter->second);
+20:07 < yehudasa> s/append/push_back/g
+20:08 < _robbat21irssi> inner loop over APIs
+20:08 < yehudasa> yeah, probably
+20:08 < _robbat21irssi> s3, s3website, swift, swith_auth, swift_website
+*/
+ std::map<std::string, std::list<std::string> > api_hostname_map;
+ std::map<std::string, std::list<std::string> > api_endpoints_map;
+
+ std::string realm_id;
+
+ rgw_sync_policy_info sync_policy;
+ rgw::zone_features::set enabled_features;
+
+ RGWZoneGroup(): is_master(false){}
+ RGWZoneGroup(const std::string &id, const std::string &name):RGWSystemMetaObj(id, name) {}
+ explicit RGWZoneGroup(const std::string &_name):RGWSystemMetaObj(_name) {}
+ RGWZoneGroup(const std::string &_name, bool _is_master, CephContext *cct, RGWSI_SysObj* sysobj_svc,
+ const std::string& _realm_id, const std::list<std::string>& _endpoints)
+ : RGWSystemMetaObj(_name, cct , sysobj_svc), endpoints(_endpoints), is_master(_is_master),
+ realm_id(_realm_id) {}
+ virtual ~RGWZoneGroup();
+
+ bool is_master_zonegroup() const { return is_master;}
+ void update_master(const DoutPrefixProvider *dpp, bool _is_master, optional_yield y) {
+ is_master = _is_master;
+ post_process_params(dpp, y);
+ }
+ void post_process_params(const DoutPrefixProvider *dpp, optional_yield y);
+
+ void encode(bufferlist& bl) const override {
+ ENCODE_START(6, 1, bl);
+ encode(name, bl);
+ encode(api_name, bl);
+ encode(is_master, bl);
+ encode(endpoints, bl);
+ encode(master_zone, bl);
+ encode(zones, bl);
+ encode(placement_targets, bl);
+ encode(default_placement, bl);
+ encode(hostnames, bl);
+ encode(hostnames_s3website, bl);
+ RGWSystemMetaObj::encode(bl);
+ encode(realm_id, bl);
+ encode(sync_policy, bl);
+ encode(enabled_features, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::const_iterator& bl) override {
+ DECODE_START(6, bl);
+ decode(name, bl);
+ decode(api_name, bl);
+ decode(is_master, bl);
+ decode(endpoints, bl);
+ decode(master_zone, bl);
+ decode(zones, bl);
+ decode(placement_targets, bl);
+ decode(default_placement, bl);
+ if (struct_v >= 2) {
+ decode(hostnames, bl);
+ }
+ if (struct_v >= 3) {
+ decode(hostnames_s3website, bl);
+ }
+ if (struct_v >= 4) {
+ RGWSystemMetaObj::decode(bl);
+ decode(realm_id, bl);
+ } else {
+ id = name;
+ }
+ if (struct_v >= 5) {
+ decode(sync_policy, bl);
+ }
+ if (struct_v >= 6) {
+ decode(enabled_features, bl);
+ }
+ DECODE_FINISH(bl);
+ }
+
+ int read_default_id(const DoutPrefixProvider *dpp, std::string& default_id, optional_yield y, bool old_format = false) override;
+ int set_as_default(const DoutPrefixProvider *dpp, optional_yield y, bool exclusive = false) override;
+ int create_default(const DoutPrefixProvider *dpp, optional_yield y, bool old_format = false);
+ int equals(const std::string& other_zonegroup) const;
+ int add_zone(const DoutPrefixProvider *dpp,
+ const RGWZoneParams& zone_params, bool *is_master, bool *read_only,
+ const std::list<std::string>& endpoints, const std::string *ptier_type,
+ bool *psync_from_all, std::list<std::string>& sync_from,
+ std::list<std::string>& sync_from_rm, std::string *predirect_zone,
+ std::optional<int> bucket_index_max_shards, RGWSyncModulesManager *sync_mgr,
+ const rgw::zone_features::set& enable_features,
+ const rgw::zone_features::set& disable_features,
+ optional_yield y);
+ int remove_zone(const DoutPrefixProvider *dpp, const std::string& zone_id, optional_yield y);
+ int rename_zone(const DoutPrefixProvider *dpp, const RGWZoneParams& zone_params, optional_yield y);
+ rgw_pool get_pool(CephContext *cct) const override;
+ const std::string get_default_oid(bool old_region_format = false) const override;
+ const std::string& get_info_oid_prefix(bool old_region_format = false) const override;
+ const std::string& get_names_oid_prefix() const override;
+ std::string get_predefined_id(CephContext *cct) const override;
+ const std::string& get_predefined_name(CephContext *cct) const override;
+
+ void dump(Formatter *f) const;
+ void decode_json(JSONObj *obj);
+ static void generate_test_instances(std::list<RGWZoneGroup*>& o);
+
+ bool supports(std::string_view feature) const {
+ return enabled_features.contains(feature);
+ }
+};
+WRITE_CLASS_ENCODER(RGWZoneGroup)
+
+struct RGWPeriodMap
+{
+ std::string id;
+ std::map<std::string, RGWZoneGroup> zonegroups;
+ std::map<std::string, RGWZoneGroup> zonegroups_by_api;
+ std::map<std::string, uint32_t> short_zone_ids;
+
+ std::string master_zonegroup;
+
+ void encode(bufferlist& bl) const;
+ void decode(bufferlist::const_iterator& bl);
+
+ int update(const RGWZoneGroup& zonegroup, CephContext *cct);
+
+ void dump(Formatter *f) const;
+ void decode_json(JSONObj *obj);
+
+ void reset() {
+ zonegroups.clear();
+ zonegroups_by_api.clear();
+ master_zonegroup.clear();
+ }
+
+ uint32_t get_zone_short_id(const std::string& zone_id) const;
+
+ bool find_zone_by_id(const rgw_zone_id& zone_id,
+ RGWZoneGroup *zonegroup,
+ RGWZone *zone) const;
+ bool find_zone_by_name(const std::string& zone_id,
+ RGWZoneGroup *zonegroup,
+ RGWZone *zone) const;
+};
+WRITE_CLASS_ENCODER(RGWPeriodMap)
+
+struct RGWPeriodConfig
+{
+ RGWQuota quota;
+ RGWRateLimitInfo user_ratelimit;
+ RGWRateLimitInfo bucket_ratelimit;
+ // rate limit unauthenticated user
+ RGWRateLimitInfo anon_ratelimit;
+
+ void encode(bufferlist& bl) const {
+ ENCODE_START(2, 1, bl);
+ encode(quota.bucket_quota, bl);
+ encode(quota.user_quota, bl);
+ encode(bucket_ratelimit, bl);
+ encode(user_ratelimit, bl);
+ encode(anon_ratelimit, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::const_iterator& bl) {
+ DECODE_START(2, bl);
+ decode(quota.bucket_quota, bl);
+ decode(quota.user_quota, bl);
+ if (struct_v >= 2) {
+ decode(bucket_ratelimit, bl);
+ decode(user_ratelimit, bl);
+ decode(anon_ratelimit, bl);
+ }
+ DECODE_FINISH(bl);
+ }
+
+ void dump(Formatter *f) const;
+ void decode_json(JSONObj *obj);
+
+ // the period config must be stored in a local object outside of the period,
+ // so that it can be used in a default configuration where no realm/period
+ // exists
+ int read(const DoutPrefixProvider *dpp, RGWSI_SysObj *sysobj_svc, const std::string& realm_id, optional_yield y);
+ int write(const DoutPrefixProvider *dpp, RGWSI_SysObj *sysobj_svc, const std::string& realm_id, optional_yield y);
+
+ static std::string get_oid(const std::string& realm_id);
+ static rgw_pool get_pool(CephContext *cct);
+};
+WRITE_CLASS_ENCODER(RGWPeriodConfig)
+
+class RGWRealm;
+class RGWPeriod;
+
+class RGWRealm : public RGWSystemMetaObj
+{
+public:
+ std::string current_period;
+ epoch_t epoch{0}; //< realm epoch, incremented for each new period
+
+ int create_control(const DoutPrefixProvider *dpp, bool exclusive, optional_yield y);
+ int delete_control(const DoutPrefixProvider *dpp, optional_yield y);
+public:
+ RGWRealm() {}
+ RGWRealm(const std::string& _id, const std::string& _name = "") : RGWSystemMetaObj(_id, _name) {}
+ RGWRealm(CephContext *_cct, RGWSI_SysObj *_sysobj_svc): RGWSystemMetaObj(_cct, _sysobj_svc) {}
+ RGWRealm(const std::string& _name, CephContext *_cct, RGWSI_SysObj *_sysobj_svc): RGWSystemMetaObj(_name, _cct, _sysobj_svc){}
+ virtual ~RGWRealm() override;
+
+ void encode(bufferlist& bl) const override {
+ ENCODE_START(1, 1, bl);
+ RGWSystemMetaObj::encode(bl);
+ encode(current_period, bl);
+ encode(epoch, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::const_iterator& bl) override {
+ DECODE_START(1, bl);
+ RGWSystemMetaObj::decode(bl);
+ decode(current_period, bl);
+ decode(epoch, bl);
+ DECODE_FINISH(bl);
+ }
+
+ int create(const DoutPrefixProvider *dpp, optional_yield y, bool exclusive = true) override;
+ int delete_obj(const DoutPrefixProvider *dpp, optional_yield y);
+ rgw_pool get_pool(CephContext *cct) const override;
+ const std::string get_default_oid(bool old_format = false) const override;
+ const std::string& get_names_oid_prefix() const override;
+ const std::string& get_info_oid_prefix(bool old_format = false) const override;
+ std::string get_predefined_id(CephContext *cct) const override;
+ const std::string& get_predefined_name(CephContext *cct) const override;
+
+ using RGWSystemMetaObj::read_id; // expose as public for radosgw-admin
+
+ void dump(Formatter *f) const;
+ void decode_json(JSONObj *obj);
+ static void generate_test_instances(std::list<RGWRealm*>& o);
+
+ const std::string& get_current_period() const {
+ return current_period;
+ }
+ int set_current_period(const DoutPrefixProvider *dpp, RGWPeriod& period, optional_yield y);
+ void clear_current_period_and_epoch() {
+ current_period.clear();
+ epoch = 0;
+ }
+ epoch_t get_epoch() const { return epoch; }
+
+ std::string get_control_oid() const;
+ /// send a notify on the realm control object
+ int notify_zone(const DoutPrefixProvider *dpp, bufferlist& bl, optional_yield y);
+ /// notify the zone of a new period
+ int notify_new_period(const DoutPrefixProvider *dpp, const RGWPeriod& period, optional_yield y);
+
+ int find_zone(const DoutPrefixProvider *dpp,
+ const rgw_zone_id& zid,
+ RGWPeriod *pperiod,
+ RGWZoneGroup *pzonegroup,
+ bool *pfound,
+ optional_yield y) const;
+};
+WRITE_CLASS_ENCODER(RGWRealm)
+
+struct RGWPeriodLatestEpochInfo {
+ epoch_t epoch = 0;
+
+ void encode(bufferlist& bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(epoch, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::const_iterator& bl) {
+ DECODE_START(1, bl);
+ decode(epoch, bl);
+ DECODE_FINISH(bl);
+ }
+
+ void dump(Formatter *f) const;
+ void decode_json(JSONObj *obj);
+ static void generate_test_instances(std::list<RGWPeriodLatestEpochInfo*>& o);
+};
+WRITE_CLASS_ENCODER(RGWPeriodLatestEpochInfo)
+
+
+/*
+ * The RGWPeriod object contains the entire configuration of a
+ * RGWRealm, including its RGWZoneGroups and RGWZones. Consistency of
+ * this configuration is maintained across all zones by passing around
+ * the RGWPeriod object in its JSON representation.
+ *
+ * If a new configuration changes which zone is the metadata master
+ * zone (i.e., master zone of the master zonegroup), then a new
+ * RGWPeriod::id (a uuid) is generated, its RGWPeriod::realm_epoch is
+ * incremented, and the RGWRealm object is updated to reflect that new
+ * current_period id and epoch. If the configuration changes BUT which
+ * zone is the metadata master does NOT change, then only the
+ * RGWPeriod::epoch is incremented (and the RGWPeriod::id remains the
+ * same).
+ *
+ * When a new RGWPeriod is created with a new RGWPeriod::id (uuid), it
+ * is linked back to its predecessor RGWPeriod through the
+ * RGWPeriod::predecessor_uuid field, thus creating a "linked
+ * list"-like structure of RGWPeriods back to the cluster's creation.
+ */
+class RGWPeriod
+{
+public:
+ std::string id; //< a uuid
+ epoch_t epoch{0};
+ std::string predecessor_uuid;
+ std::vector<std::string> sync_status;
+ RGWPeriodMap period_map;
+ RGWPeriodConfig period_config;
+ std::string master_zonegroup;
+ rgw_zone_id master_zone;
+
+ std::string realm_id;
+ std::string realm_name;
+ epoch_t realm_epoch{1}; //< realm epoch when period was made current
+
+ CephContext *cct{nullptr};
+ RGWSI_SysObj *sysobj_svc{nullptr};
+
+ int read_info(const DoutPrefixProvider *dpp, optional_yield y);
+ int read_latest_epoch(const DoutPrefixProvider *dpp,
+ RGWPeriodLatestEpochInfo& epoch_info,
+ optional_yield y,
+ RGWObjVersionTracker *objv = nullptr);
+ int use_latest_epoch(const DoutPrefixProvider *dpp, optional_yield y);
+ int use_current_period();
+
+ const std::string get_period_oid() const;
+ const std::string get_period_oid_prefix() const;
+
+ // gather the metadata sync status for each shard; only for use on master zone
+ int update_sync_status(const DoutPrefixProvider *dpp,
+ rgw::sal::Driver* driver,
+ const RGWPeriod &current_period,
+ std::ostream& error_stream, bool force_if_stale);
+
+public:
+ RGWPeriod() {}
+
+ explicit RGWPeriod(const std::string& period_id, epoch_t _epoch = 0)
+ : id(period_id), epoch(_epoch) {}
+
+ const std::string& get_id() const { return id; }
+ epoch_t get_epoch() const { return epoch; }
+ epoch_t get_realm_epoch() const { return realm_epoch; }
+ const std::string& get_predecessor() const { return predecessor_uuid; }
+ const rgw_zone_id& get_master_zone() const { return master_zone; }
+ const std::string& get_master_zonegroup() const { return master_zonegroup; }
+ const std::string& get_realm() const { return realm_id; }
+ const std::string& get_realm_name() const { return realm_name; }
+ const RGWPeriodMap& get_map() const { return period_map; }
+ RGWPeriodConfig& get_config() { return period_config; }
+ const RGWPeriodConfig& get_config() const { return period_config; }
+ const std::vector<std::string>& get_sync_status() const { return sync_status; }
+ rgw_pool get_pool(CephContext *cct) const;
+ const std::string& get_latest_epoch_oid() const;
+ const std::string& get_info_oid_prefix() const;
+
+ void set_user_quota(RGWQuotaInfo& user_quota) {
+ period_config.quota.user_quota = user_quota;
+ }
+
+ void set_bucket_quota(RGWQuotaInfo& bucket_quota) {
+ period_config.quota.bucket_quota = bucket_quota;
+ }
+
+ void set_id(const std::string& _id) {
+ this->id = _id;
+ period_map.id = _id;
+ }
+ void set_epoch(epoch_t epoch) { this->epoch = epoch; }
+ void set_realm_epoch(epoch_t epoch) { realm_epoch = epoch; }
+
+ void set_predecessor(const std::string& predecessor)
+ {
+ predecessor_uuid = predecessor;
+ }
+
+ void set_realm_id(const std::string& _realm_id) {
+ realm_id = _realm_id;
+ }
+
+ int reflect(const DoutPrefixProvider *dpp, optional_yield y);
+
+ int get_zonegroup(RGWZoneGroup& zonegroup,
+ const std::string& zonegroup_id) const;
+
+ bool is_single_zonegroup() const
+ {
+ return (period_map.zonegroups.size() <= 1);
+ }
+
+ /*
+ returns true if more than one zonegroup contains at least one zone
+ */
+ bool is_multi_zonegroups_with_zones() const
+ {
+ int count = 0;
+ for (const auto& zg: period_map.zonegroups) {
+ if (zg.second.zones.size() > 0) {
+ if (count++ > 0) {
+ return true;
+ }
+ }
+ }
+ return false;
+ }
+
+ bool find_zone(const DoutPrefixProvider *dpp,
+ const rgw_zone_id& zid,
+ RGWZoneGroup *pzonegroup,
+ optional_yield y) const;
+
+ int get_latest_epoch(const DoutPrefixProvider *dpp, epoch_t& epoch, optional_yield y);
+ int set_latest_epoch(const DoutPrefixProvider *dpp, optional_yield y,
+ epoch_t epoch, bool exclusive = false,
+ RGWObjVersionTracker *objv = nullptr);
+ // update latest_epoch if the given epoch is higher, else return -EEXIST
+ int update_latest_epoch(const DoutPrefixProvider *dpp, epoch_t epoch, optional_yield y);
+
+ int init(const DoutPrefixProvider *dpp, CephContext *_cct, RGWSI_SysObj *_sysobj_svc, const std::string &period_realm_id, optional_yield y,
+ const std::string &period_realm_name = "", bool setup_obj = true);
+ int init(const DoutPrefixProvider *dpp, CephContext *_cct, RGWSI_SysObj *_sysobj_svc, optional_yield y, bool setup_obj = true);
+
+ int create(const DoutPrefixProvider *dpp, optional_yield y, bool exclusive = true);
+ int delete_obj(const DoutPrefixProvider *dpp, optional_yield y);
+ int store_info(const DoutPrefixProvider *dpp, bool exclusive, optional_yield y);
+ int add_zonegroup(const DoutPrefixProvider *dpp, const RGWZoneGroup& zonegroup, optional_yield y);
+
+ void fork();
+ int update(const DoutPrefixProvider *dpp, optional_yield y);
+
+ // commit a staging period; only for use on master zone
+ int commit(const DoutPrefixProvider *dpp,
+ rgw::sal::Driver* driver,
+ RGWRealm& realm, const RGWPeriod &current_period,
+ std::ostream& error_stream, optional_yield y,
+ bool force_if_stale = false);
+
+ void encode(bufferlist& bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(id, bl);
+ encode(epoch, bl);
+ encode(realm_epoch, bl);
+ encode(predecessor_uuid, bl);
+ encode(sync_status, bl);
+ encode(period_map, bl);
+ encode(master_zone, bl);
+ encode(master_zonegroup, bl);
+ encode(period_config, bl);
+ encode(realm_id, bl);
+ encode(realm_name, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::const_iterator& bl) {
+ DECODE_START(1, bl);
+ decode(id, bl);
+ decode(epoch, bl);
+ decode(realm_epoch, bl);
+ decode(predecessor_uuid, bl);
+ decode(sync_status, bl);
+ decode(period_map, bl);
+ decode(master_zone, bl);
+ decode(master_zonegroup, bl);
+ decode(period_config, bl);
+ decode(realm_id, bl);
+ decode(realm_name, bl);
+ DECODE_FINISH(bl);
+ }
+ void dump(Formatter *f) const;
+ void decode_json(JSONObj *obj);
+ static void generate_test_instances(std::list<RGWPeriod*>& o);
+
+ static std::string get_staging_id(const std::string& realm_id) {
+ return realm_id + ":staging";
+ }
+};
+WRITE_CLASS_ENCODER(RGWPeriod)
+
+namespace rgw {
+
+/// Look up a realm by its id. If no id is given, look it up by name.
+/// If no name is given, fall back to the cluster's default realm.
+int read_realm(const DoutPrefixProvider* dpp, optional_yield y,
+ sal::ConfigStore* cfgstore,
+ std::string_view realm_id,
+ std::string_view realm_name,
+ RGWRealm& info,
+ std::unique_ptr<sal::RealmWriter>* writer = nullptr);
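+
+// For example (sketch): read_realm(dpp, y, cfgstore, {}, {}, info) loads the
+// cluster's default realm, since both the id and the name are empty.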
+
+/// Create a realm and its initial period. If the info.id is empty, a
+/// random uuid will be generated.
+int create_realm(const DoutPrefixProvider* dpp, optional_yield y,
+ sal::ConfigStore* cfgstore, bool exclusive,
+ RGWRealm& info,
+ std::unique_ptr<sal::RealmWriter>* writer = nullptr);
+
+/// Set the given realm as the cluster's default realm.
+int set_default_realm(const DoutPrefixProvider* dpp, optional_yield y,
+ sal::ConfigStore* cfgstore, const RGWRealm& info,
+ bool exclusive = false);
+
+/// Update the current_period of an existing realm.
+int realm_set_current_period(const DoutPrefixProvider* dpp, optional_yield y,
+ sal::ConfigStore* cfgstore,
+ sal::RealmWriter& writer, RGWRealm& realm,
+ const RGWPeriod& period);
+
+/// Overwrite the local zonegroup and period config objects with the new
+/// configuration contained in the given period.
+int reflect_period(const DoutPrefixProvider* dpp, optional_yield y,
+ sal::ConfigStore* cfgstore, const RGWPeriod& info);
+
+/// Return the staging period id for the given realm.
+std::string get_staging_period_id(std::string_view realm_id);
+
+/// Convert the given period into a separate staging period, where
+/// radosgw-admin can make changes to it without affecting the running
+/// configuration.
+void fork_period(const DoutPrefixProvider* dpp, RGWPeriod& info);
+
+/// Read all zonegroups in the period's realm and add them to the period.
+int update_period(const DoutPrefixProvider* dpp, optional_yield y,
+ sal::ConfigStore* cfgstore, RGWPeriod& info);
+
+/// Validate the given 'staging' period and try to commit it as the
+/// realm's new current period.
+int commit_period(const DoutPrefixProvider* dpp, optional_yield y,
+ sal::ConfigStore* cfgstore, sal::Driver* driver,
+ RGWRealm& realm, sal::RealmWriter& realm_writer,
+ const RGWPeriod& current_period,
+ RGWPeriod& info, std::ostream& error_stream,
+ bool force_if_stale);
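+
+// A minimal sketch (not part of this header) of how fork_period(),
+// update_period() and commit_period() compose into the usual staging
+// workflow; 'dpp', 'y', 'cfgstore', 'driver', 'realm', 'realm_writer'
+// and 'current_period' are assumed to be set up by the caller:
+//
+//   RGWPeriod staging = current_period;
+//   fork_period(dpp, staging);                  // becomes the realm's staging period
+//   update_period(dpp, y, cfgstore, staging);   // pull in the realm's zonegroups
+//   std::ostringstream err;
+//   int r = commit_period(dpp, y, cfgstore, driver, realm, realm_writer,
+//                         current_period, staging, err, false);
+//   if (r < 0) {
+//     // err explains why the staging period was rejected
+//   }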
+
+
+/// Look up a zonegroup by its id. If no id is given, look it up by name.
+/// If no name is given, fall back to the cluster's default zonegroup.
+int read_zonegroup(const DoutPrefixProvider* dpp, optional_yield y,
+ sal::ConfigStore* cfgstore,
+ std::string_view zonegroup_id,
+ std::string_view zonegroup_name,
+ RGWZoneGroup& info,
+ std::unique_ptr<sal::ZoneGroupWriter>* writer = nullptr);
+
+/// Initialize and create the given zonegroup. If the given info.id is empty,
+/// a random uuid will be generated. May fail with -EEXIST.
+int create_zonegroup(const DoutPrefixProvider* dpp, optional_yield y,
+ sal::ConfigStore* cfgstore, bool exclusive,
+ RGWZoneGroup& info);
+
+/// Set the given zonegroup as its realm's default zonegroup.
+int set_default_zonegroup(const DoutPrefixProvider* dpp, optional_yield y,
+ sal::ConfigStore* cfgstore, const RGWZoneGroup& info,
+ bool exclusive = false);
+
+/// Add a zone to the zonegroup, or update an existing zone entry.
+int add_zone_to_group(const DoutPrefixProvider* dpp,
+ RGWZoneGroup& zonegroup,
+ const RGWZoneParams& zone_params,
+ const bool *pis_master, const bool *pread_only,
+ const std::list<std::string>& endpoints,
+ const std::string *ptier_type,
+ const bool *psync_from_all,
+ const std::list<std::string>& sync_from,
+ const std::list<std::string>& sync_from_rm,
+ const std::string *predirect_zone,
+ std::optional<int> bucket_index_max_shards,
+ const rgw::zone_features::set& enable_features,
+ const rgw::zone_features::set& disable_features);
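+
+// Illustrative call (a sketch, not taken from this file) that adds a zone
+// and sets only the master flag, leaving every other optional field
+// untouched; 'dpp', 'zonegroup' and 'zone_params' are assumed to exist:
+//
+//   bool is_master = true;
+//   int r = add_zone_to_group(dpp, zonegroup, zone_params,
+//                             &is_master, nullptr,   // is_master / read_only
+//                             {},                     // endpoints
+//                             nullptr, nullptr,       // tier_type / sync_from_all
+//                             {}, {},                 // sync_from / sync_from_rm
+//                             nullptr, std::nullopt,  // redirect_zone / bucket_index_max_shards
+//                             {}, {});                // features to enable / disable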
+
+/// Remove a zone by id from its zonegroup, promoting a new master zone if
+/// necessary.
+int remove_zone_from_group(const DoutPrefixProvider* dpp,
+ RGWZoneGroup& info,
+ const rgw_zone_id& zone_id);
+
+
+/// Look up a zone by its id. If no id is given, look it up by name. If no name
+/// is given, fall back to the realm's default zone.
+int read_zone(const DoutPrefixProvider* dpp, optional_yield y,
+ sal::ConfigStore* cfgstore,
+ std::string_view zone_id,
+ std::string_view zone_name,
+ RGWZoneParams& info,
+ std::unique_ptr<sal::ZoneWriter>* writer = nullptr);
+
+/// Initialize and create a new zone. If the given info.id is empty, a random
+/// uuid will be generated. Pool names are initialized with the zone name as a
+/// prefix. If any pool names conflict with existing zones, a random suffix is
+/// added.
+int create_zone(const DoutPrefixProvider* dpp, optional_yield y,
+ sal::ConfigStore* cfgstore, bool exclusive,
+ RGWZoneParams& info,
+ std::unique_ptr<sal::ZoneWriter>* writer = nullptr);
+
+/// Initialize the zone's pool names using the zone name as a prefix. If a pool
+/// name conflicts with an existing zone's pool, add a unique suffix.
+int init_zone_pool_names(const DoutPrefixProvider *dpp, optional_yield y,
+ const std::set<rgw_pool>& pools, RGWZoneParams& info);
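+
+// Illustrative only: a zone named "us-east" would typically get pools
+// prefixed with "us-east." (e.g. "us-east.rgw.buckets.data"); if such a
+// pool already belongs to another zone, a unique suffix is appended as
+// described above.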
+
+/// Set the given zone as its realm's default zone.
+int set_default_zone(const DoutPrefixProvider* dpp, optional_yield y,
+ sal::ConfigStore* cfgstore, const RGWZoneParams& info,
+ bool exclusive = false);
+
+/// Delete an existing zone and remove it from any zonegroups that contain it.
+int delete_zone(const DoutPrefixProvider* dpp, optional_yield y,
+ sal::ConfigStore* cfgstore, const RGWZoneParams& info,
+ sal::ZoneWriter& writer);
+
+} // namespace rgw