Diffstat (limited to 'src/rgw/rgw_sync.cc')
-rw-r--r--  src/rgw/rgw_sync.cc  3136
1 file changed, 3136 insertions(+), 0 deletions(-)
diff --git a/src/rgw/rgw_sync.cc b/src/rgw/rgw_sync.cc
new file mode 100644
index 00000000..b0e95959
--- /dev/null
+++ b/src/rgw/rgw_sync.cc
@@ -0,0 +1,3136 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <boost/optional.hpp>
+
+#include "common/ceph_json.h"
+#include "common/RWLock.h"
+#include "common/RefCountedObj.h"
+#include "common/WorkQueue.h"
+#include "common/Throttle.h"
+#include "common/admin_socket.h"
+#include "common/errno.h"
+
+#include "rgw_common.h"
+#include "rgw_rados.h"
+#include "rgw_zone.h"
+#include "rgw_sync.h"
+#include "rgw_metadata.h"
+#include "rgw_rest_conn.h"
+#include "rgw_tools.h"
+#include "rgw_cr_rados.h"
+#include "rgw_cr_rest.h"
+#include "rgw_http_client.h"
+#include "rgw_sync_trace.h"
+
+#include "cls/lock/cls_lock_client.h"
+
+#include "services/svc_zone.h"
+
+#include <boost/asio/yield.hpp>
+
+#define dout_subsys ceph_subsys_rgw
+
+#undef dout_prefix
+#define dout_prefix (*_dout << "meta sync: ")
+
+static string mdlog_sync_status_oid = "mdlog.sync-status";
+static string mdlog_sync_status_shard_prefix = "mdlog.sync-status.shard";
+static string mdlog_sync_full_sync_index_prefix = "meta.full-sync.index";
+
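+/* RGWSyncErrorLogger spreads error-log entries round-robin across a set of
+ * timelog shard objects, so that error logging does not contend on a single
+ * RADOS object. */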
+RGWSyncErrorLogger::RGWSyncErrorLogger(RGWRados *_store, const string &oid_prefix, int _num_shards) : store(_store), num_shards(_num_shards) {
+ for (int i = 0; i < num_shards; i++) {
+ oids.push_back(get_shard_oid(oid_prefix, i));
+ }
+}
+string RGWSyncErrorLogger::get_shard_oid(const string& oid_prefix, int shard_id) {
+ char buf[oid_prefix.size() + 16];
+ snprintf(buf, sizeof(buf), "%s.%d", oid_prefix.c_str(), shard_id);
+ return string(buf);
+}
+
+RGWCoroutine *RGWSyncErrorLogger::log_error_cr(const string& source_zone, const string& section, const string& name, uint32_t error_code, const string& message) {
+ cls_log_entry entry;
+
+ rgw_sync_error_info info(source_zone, error_code, message);
+ bufferlist bl;
+ encode(info, bl);
+ store->time_log_prepare_entry(entry, real_clock::now(), section, name, bl);
+
+ uint32_t shard_id = ++counter % num_shards;
+
+ return new RGWRadosTimelogAddCR(store, oids[shard_id], entry);
+}
+
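+/* exponential backoff: double cur_wait on every retry, capped at max_secs */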
+void RGWSyncBackoff::update_wait_time()
+{
+ if (cur_wait == 0) {
+ cur_wait = 1;
+ } else {
+ cur_wait = (cur_wait << 1);
+ }
+ if (cur_wait >= max_secs) {
+ cur_wait = max_secs;
+ }
+}
+
+void RGWSyncBackoff::backoff_sleep()
+{
+ update_wait_time();
+ sleep(cur_wait);
+}
+
+void RGWSyncBackoff::backoff(RGWCoroutine *op)
+{
+ update_wait_time();
+ op->wait(utime_t(cur_wait, 0));
+}
+
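+/* Run the coroutine returned by alloc_cr(), backing off and retrying on
+ * transient errors (-EBUSY, -EAGAIN). Other errors abort if exit_on_error
+ * is set; once the call succeeds, an optional finisher coroutine runs. */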
+int RGWBackoffControlCR::operate() {
+ reenter(this) {
+ // retry the operation until it succeeds
+ while (true) {
+ yield {
+ Mutex::Locker l(lock);
+ cr = alloc_cr();
+ cr->get();
+ call(cr);
+ }
+ {
+ Mutex::Locker l(lock);
+ cr->put();
+ cr = NULL;
+ }
+ if (retcode >= 0) {
+ break;
+ }
+ if (retcode != -EBUSY && retcode != -EAGAIN) {
+ ldout(cct, 0) << "ERROR: RGWBackoffControlCR called coroutine returned " << retcode << dendl;
+ if (exit_on_error) {
+ return set_cr_error(retcode);
+ }
+ }
+ if (reset_backoff) {
+ backoff.reset();
+ }
+ yield backoff.backoff(this);
+ }
+
+ // run an optional finisher
+ yield call(alloc_finisher_cr());
+ if (retcode < 0) {
+ ldout(cct, 0) << "ERROR: call to finisher_cr() failed: retcode=" << retcode << dendl;
+ return set_cr_error(retcode);
+ }
+ return set_cr_done();
+ }
+ return 0;
+}
+
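+/* note: the remote /admin/log endpoint reports the mdlog shard count under
+ * the "num_objects" key */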
+void rgw_mdlog_info::decode_json(JSONObj *obj) {
+ JSONDecoder::decode_json("num_objects", num_shards, obj);
+ JSONDecoder::decode_json("period", period, obj);
+ JSONDecoder::decode_json("realm_epoch", realm_epoch, obj);
+}
+
+void rgw_mdlog_entry::decode_json(JSONObj *obj) {
+ JSONDecoder::decode_json("id", id, obj);
+ JSONDecoder::decode_json("section", section, obj);
+ JSONDecoder::decode_json("name", name, obj);
+ utime_t ut;
+ JSONDecoder::decode_json("timestamp", ut, obj);
+ timestamp = ut.to_real_time();
+ JSONDecoder::decode_json("data", log_data, obj);
+}
+
+void rgw_mdlog_shard_data::decode_json(JSONObj *obj) {
+ JSONDecoder::decode_json("marker", marker, obj);
+ JSONDecoder::decode_json("truncated", truncated, obj);
+ JSONDecoder::decode_json("entries", entries, obj);
+}
+
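+/* Fan out children from spawn_next() with at most max_concurrent in flight,
+ * collecting completions as they arrive; any failure other than -ENOENT is
+ * recorded and returned once all children have finished. */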
+int RGWShardCollectCR::operate() {
+ reenter(this) {
+ while (spawn_next()) {
+ current_running++;
+
+ while (current_running >= max_concurrent) {
+ int child_ret;
+ yield wait_for_child();
+ if (collect_next(&child_ret)) {
+ current_running--;
+ if (child_ret < 0 && child_ret != -ENOENT) {
+ ldout(cct, 10) << __func__ << ": failed to fetch log status, ret=" << child_ret << dendl;
+ status = child_ret;
+ }
+ }
+ }
+ }
+ while (current_running > 0) {
+ int child_ret;
+ yield wait_for_child();
+ if (collect_next(&child_ret)) {
+ current_running--;
+ if (child_ret < 0 && child_ret != -ENOENT) {
+ ldout(cct, 10) << __func__ << ": failed to fetch log status, ret=" << child_ret << dendl;
+ status = child_ret;
+ }
+ }
+ }
+ if (status < 0) {
+ return set_cr_error(status);
+ }
+ return set_cr_done();
+ }
+ return 0;
+}
+
+class RGWReadRemoteMDLogInfoCR : public RGWShardCollectCR {
+ RGWMetaSyncEnv *sync_env;
+
+ const std::string& period;
+ int num_shards;
+ map<int, RGWMetadataLogInfo> *mdlog_info;
+
+ int shard_id;
+#define READ_MDLOG_MAX_CONCURRENT 10
+
+public:
+ RGWReadRemoteMDLogInfoCR(RGWMetaSyncEnv *_sync_env,
+ const std::string& period, int _num_shards,
+ map<int, RGWMetadataLogInfo> *_mdlog_info) : RGWShardCollectCR(_sync_env->cct, READ_MDLOG_MAX_CONCURRENT),
+ sync_env(_sync_env),
+ period(period), num_shards(_num_shards),
+ mdlog_info(_mdlog_info), shard_id(0) {}
+ bool spawn_next() override;
+};
+
+class RGWListRemoteMDLogCR : public RGWShardCollectCR {
+ RGWMetaSyncEnv *sync_env;
+
+ const std::string& period;
+ map<int, string> shards;
+ int max_entries_per_shard;
+ map<int, rgw_mdlog_shard_data> *result;
+
+ map<int, string>::iterator iter;
+#define READ_MDLOG_MAX_CONCURRENT 10
+
+public:
+ RGWListRemoteMDLogCR(RGWMetaSyncEnv *_sync_env,
+ const std::string& period, map<int, string>& _shards,
+ int _max_entries_per_shard,
+ map<int, rgw_mdlog_shard_data> *_result) : RGWShardCollectCR(_sync_env->cct, READ_MDLOG_MAX_CONCURRENT),
+ sync_env(_sync_env), period(period),
+ max_entries_per_shard(_max_entries_per_shard),
+ result(_result) {
+ shards.swap(_shards);
+ iter = shards.begin();
+ }
+ bool spawn_next() override;
+};
+
+RGWRemoteMetaLog::~RGWRemoteMetaLog()
+{
+ delete error_logger;
+}
+
+int RGWRemoteMetaLog::read_log_info(rgw_mdlog_info *log_info)
+{
+ rgw_http_param_pair pairs[] = { { "type", "metadata" },
+ { NULL, NULL } };
+
+ int ret = conn->get_json_resource("/admin/log", pairs, *log_info);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to fetch mdlog info" << dendl;
+ return ret;
+ }
+
+ ldpp_dout(dpp, 20) << "remote mdlog, num_shards=" << log_info->num_shards << dendl;
+
+ return 0;
+}
+
+int RGWRemoteMetaLog::read_master_log_shards_info(const string &master_period, map<int, RGWMetadataLogInfo> *shards_info)
+{
+ if (store->svc.zone->is_meta_master()) {
+ return 0;
+ }
+
+ rgw_mdlog_info log_info;
+ int ret = read_log_info(&log_info);
+ if (ret < 0) {
+ return ret;
+ }
+
+ return run(new RGWReadRemoteMDLogInfoCR(&sync_env, master_period, log_info.num_shards, shards_info));
+}
+
+int RGWRemoteMetaLog::read_master_log_shards_next(const string& period, map<int, string> shard_markers, map<int, rgw_mdlog_shard_data> *result)
+{
+ if (store->svc.zone->is_meta_master()) {
+ return 0;
+ }
+
+ return run(new RGWListRemoteMDLogCR(&sync_env, period, shard_markers, 1, result));
+}
+
+int RGWRemoteMetaLog::init()
+{
+ conn = store->svc.zone->get_master_conn();
+
+ int ret = http_manager.start();
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "failed in http_manager.start() ret=" << ret << dendl;
+ return ret;
+ }
+
+ error_logger = new RGWSyncErrorLogger(store, RGW_SYNC_ERROR_LOG_SHARD_PREFIX, ERROR_LOGGER_SHARDS);
+
+ init_sync_env(&sync_env);
+
+ tn = sync_env.sync_tracer->add_node(sync_env.sync_tracer->root_node, "meta");
+
+ return 0;
+}
+
+void RGWRemoteMetaLog::finish()
+{
+ going_down = true;
+ stop();
+}
+
+#define CLONE_MAX_ENTRIES 100
+
+int RGWMetaSyncStatusManager::init()
+{
+ if (store->svc.zone->is_meta_master()) {
+ return 0;
+ }
+
+ if (!store->svc.zone->get_master_conn()) {
+ lderr(store->ctx()) << "no REST connection to master zone" << dendl;
+ return -EIO;
+ }
+
+ int r = rgw_init_ioctx(store->get_rados_handle(), store->svc.zone->get_zone_params().log_pool, ioctx, true);
+ if (r < 0) {
+ lderr(store->ctx()) << "ERROR: failed to open log pool (" << store->svc.zone->get_zone_params().log_pool << " ret=" << r << dendl;
+ return r;
+ }
+
+ r = master_log.init();
+ if (r < 0) {
+ lderr(store->ctx()) << "ERROR: failed to init remote log, r=" << r << dendl;
+ return r;
+ }
+
+ RGWMetaSyncEnv& sync_env = master_log.get_sync_env();
+
+ rgw_meta_sync_status sync_status;
+ r = read_sync_status(&sync_status);
+ if (r < 0 && r != -ENOENT) {
+ lderr(store->ctx()) << "ERROR: failed to read sync status, r=" << r << dendl;
+ return r;
+ }
+
+ int num_shards = sync_status.sync_info.num_shards;
+
+ for (int i = 0; i < num_shards; i++) {
+ shard_objs[i] = rgw_raw_obj(store->svc.zone->get_zone_params().log_pool, sync_env.shard_obj_name(i));
+ }
+
+ RWLock::WLocker wl(ts_to_shard_lock);
+ for (int i = 0; i < num_shards; i++) {
+ clone_markers.push_back(string());
+ utime_shard ut;
+ ut.shard_id = i;
+ ts_to_shard[ut] = i;
+ }
+
+ return 0;
+}
+
+unsigned RGWMetaSyncStatusManager::get_subsys() const
+{
+ return dout_subsys;
+}
+
+std::ostream& RGWMetaSyncStatusManager::gen_prefix(std::ostream& out) const
+{
+ return out << "meta sync: ";
+}
+
+void RGWMetaSyncEnv::init(const DoutPrefixProvider *_dpp, CephContext *_cct, RGWRados *_store, RGWRESTConn *_conn,
+ RGWAsyncRadosProcessor *_async_rados, RGWHTTPManager *_http_manager,
+ RGWSyncErrorLogger *_error_logger, RGWSyncTraceManager *_sync_tracer) {
+ dpp = _dpp;
+ cct = _cct;
+ store = _store;
+ conn = _conn;
+ async_rados = _async_rados;
+ http_manager = _http_manager;
+ error_logger = _error_logger;
+ sync_tracer = _sync_tracer;
+}
+
+string RGWMetaSyncEnv::status_oid()
+{
+ return mdlog_sync_status_oid;
+}
+
+string RGWMetaSyncEnv::shard_obj_name(int shard_id)
+{
+ char buf[mdlog_sync_status_shard_prefix.size() + 16];
+ snprintf(buf, sizeof(buf), "%s.%d", mdlog_sync_status_shard_prefix.c_str(), shard_id);
+
+ return string(buf);
+}
+
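+/* Read a batch of entries from a local mdlog shard on the async-rados
+ * thread pool, keeping the blocking read off the coroutine manager's
+ * thread. */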
+class RGWAsyncReadMDLogEntries : public RGWAsyncRadosRequest {
+ RGWRados *store;
+ RGWMetadataLog *mdlog;
+ int shard_id;
+ int max_entries;
+
+protected:
+ int _send_request() override {
+ real_time from_time;
+ real_time end_time;
+
+ void *handle;
+
+ mdlog->init_list_entries(shard_id, from_time, end_time, marker, &handle);
+
+ int ret = mdlog->list_entries(handle, max_entries, entries, &marker, &truncated);
+
+ mdlog->complete_list_entries(handle);
+
+ return ret;
+ }
+public:
+ string marker;
+ list<cls_log_entry> entries;
+ bool truncated;
+
+ RGWAsyncReadMDLogEntries(RGWCoroutine *caller, RGWAioCompletionNotifier *cn, RGWRados *_store,
+ RGWMetadataLog* mdlog, int _shard_id,
+ std::string _marker, int _max_entries)
+ : RGWAsyncRadosRequest(caller, cn), store(_store), mdlog(mdlog),
+ shard_id(_shard_id), max_entries(_max_entries), marker(std::move(_marker)) {}
+};
+
+class RGWReadMDLogEntriesCR : public RGWSimpleCoroutine {
+ RGWMetaSyncEnv *sync_env;
+ RGWMetadataLog *const mdlog;
+ int shard_id;
+ string marker;
+ string *pmarker;
+ int max_entries;
+ list<cls_log_entry> *entries;
+ bool *truncated;
+
+ RGWAsyncReadMDLogEntries *req{nullptr};
+
+public:
+ RGWReadMDLogEntriesCR(RGWMetaSyncEnv *_sync_env, RGWMetadataLog* mdlog,
+ int _shard_id, string*_marker, int _max_entries,
+ list<cls_log_entry> *_entries, bool *_truncated)
+ : RGWSimpleCoroutine(_sync_env->cct), sync_env(_sync_env), mdlog(mdlog),
+ shard_id(_shard_id), pmarker(_marker), max_entries(_max_entries),
+ entries(_entries), truncated(_truncated) {}
+
+ ~RGWReadMDLogEntriesCR() override {
+ if (req) {
+ req->finish();
+ }
+ }
+
+ int send_request() override {
+ marker = *pmarker;
+ req = new RGWAsyncReadMDLogEntries(this, stack->create_completion_notifier(),
+ sync_env->store, mdlog, shard_id, marker,
+ max_entries);
+ sync_env->async_rados->queue(req);
+ return 0;
+ }
+
+ int request_complete() override {
+ *pmarker = std::move(req->marker);
+ *entries = std::move(req->entries);
+ *truncated = req->truncated;
+ return req->get_ret_status();
+ }
+};
+
+
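+/* Fetch a single remote mdlog shard's state (marker, last update time) with
+ * a GET on /admin/log?type=metadata&id=<shard>&period=<period>&info. */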
+class RGWReadRemoteMDLogShardInfoCR : public RGWCoroutine {
+ RGWMetaSyncEnv *env;
+ RGWRESTReadResource *http_op;
+
+ const std::string& period;
+ int shard_id;
+ RGWMetadataLogInfo *shard_info;
+
+public:
+ RGWReadRemoteMDLogShardInfoCR(RGWMetaSyncEnv *env, const std::string& period,
+ int _shard_id, RGWMetadataLogInfo *_shard_info)
+ : RGWCoroutine(env->store->ctx()), env(env), http_op(NULL),
+ period(period), shard_id(_shard_id), shard_info(_shard_info) {}
+
+ int operate() override {
+ auto store = env->store;
+ RGWRESTConn *conn = store->svc.zone->get_master_conn();
+ reenter(this) {
+ yield {
+ char buf[16];
+ snprintf(buf, sizeof(buf), "%d", shard_id);
+ rgw_http_param_pair pairs[] = { { "type" , "metadata" },
+ { "id", buf },
+ { "period", period.c_str() },
+ { "info" , NULL },
+ { NULL, NULL } };
+
+ string p = "/admin/log/";
+
+ http_op = new RGWRESTReadResource(conn, p, pairs, NULL,
+ env->http_manager);
+
+ init_new_io(http_op);
+
+ int ret = http_op->aio_read();
+ if (ret < 0) {
+ ldpp_dout(env->dpp, 0) << "ERROR: failed to read from " << p << dendl;
+ log_error() << "failed to send http operation: " << http_op->to_str() << " ret=" << ret << std::endl;
+ http_op->put();
+ return set_cr_error(ret);
+ }
+
+ return io_block(0);
+ }
+ yield {
+ int ret = http_op->wait(shard_info);
+ http_op->put();
+ if (ret < 0) {
+ return set_cr_error(ret);
+ }
+ return set_cr_done();
+ }
+ }
+ return 0;
+ }
+};
+
+class RGWListRemoteMDLogShardCR : public RGWSimpleCoroutine {
+ RGWMetaSyncEnv *sync_env;
+ RGWRESTReadResource *http_op;
+
+ const std::string& period;
+ int shard_id;
+ string marker;
+ uint32_t max_entries;
+ rgw_mdlog_shard_data *result;
+
+public:
+ RGWListRemoteMDLogShardCR(RGWMetaSyncEnv *env, const std::string& period,
+ int _shard_id, const string& _marker, uint32_t _max_entries,
+ rgw_mdlog_shard_data *_result)
+ : RGWSimpleCoroutine(env->store->ctx()), sync_env(env), http_op(NULL),
+ period(period), shard_id(_shard_id), marker(_marker), max_entries(_max_entries), result(_result) {}
+
+ int send_request() override {
+ RGWRESTConn *conn = sync_env->conn;
+
+ char buf[32];
+ snprintf(buf, sizeof(buf), "%d", shard_id);
+
+ char max_entries_buf[32];
+ snprintf(max_entries_buf, sizeof(max_entries_buf), "%d", (int)max_entries);
+
+ const char *marker_key = (marker.empty() ? "" : "marker");
+
+ rgw_http_param_pair pairs[] = { { "type", "metadata" },
+ { "id", buf },
+ { "period", period.c_str() },
+ { "max-entries", max_entries_buf },
+ { marker_key, marker.c_str() },
+ { NULL, NULL } };
+
+ string p = "/admin/log/";
+
+ http_op = new RGWRESTReadResource(conn, p, pairs, NULL, sync_env->http_manager);
+ init_new_io(http_op);
+
+ int ret = http_op->aio_read();
+ if (ret < 0) {
+ ldpp_dout(sync_env->dpp, 0) << "ERROR: failed to read from " << p << dendl;
+ log_error() << "failed to send http operation: " << http_op->to_str() << " ret=" << ret << std::endl;
+ http_op->put();
+ return ret;
+ }
+
+ return 0;
+ }
+
+ int request_complete() override {
+ int ret = http_op->wait(result);
+ http_op->put();
+ if (ret < 0 && ret != -ENOENT) {
+ ldpp_dout(sync_env->dpp, 0) << "ERROR: failed to list remote mdlog shard, ret=" << ret << dendl;
+ return ret;
+ }
+ return 0;
+ }
+};
+
+bool RGWReadRemoteMDLogInfoCR::spawn_next() {
+ if (shard_id >= num_shards) {
+ return false;
+ }
+ spawn(new RGWReadRemoteMDLogShardInfoCR(sync_env, period, shard_id, &(*mdlog_info)[shard_id]), false);
+ shard_id++;
+ return true;
+}
+
+bool RGWListRemoteMDLogCR::spawn_next() {
+ if (iter == shards.end()) {
+ return false;
+ }
+
+ spawn(new RGWListRemoteMDLogShardCR(sync_env, period, iter->first, iter->second, max_entries_per_shard, &(*result)[iter->first]), false);
+ ++iter;
+ return true;
+}
+
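+/* Initialize the sync status objects: take the sync lease, write the global
+ * status object, read each remote mdlog shard's current position, write the
+ * per-shard markers, and finally flip the state to
+ * StateBuildingFullSyncMaps. */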
+class RGWInitSyncStatusCoroutine : public RGWCoroutine {
+ RGWMetaSyncEnv *sync_env;
+
+ rgw_meta_sync_info status;
+ vector<RGWMetadataLogInfo> shards_info;
+ boost::intrusive_ptr<RGWContinuousLeaseCR> lease_cr;
+ boost::intrusive_ptr<RGWCoroutinesStack> lease_stack;
+public:
+ RGWInitSyncStatusCoroutine(RGWMetaSyncEnv *_sync_env,
+ const rgw_meta_sync_info &status)
+ : RGWCoroutine(_sync_env->store->ctx()), sync_env(_sync_env),
+ status(status), shards_info(status.num_shards),
+ lease_cr(nullptr), lease_stack(nullptr) {}
+
+ ~RGWInitSyncStatusCoroutine() override {
+ if (lease_cr) {
+ lease_cr->abort();
+ }
+ }
+
+ int operate() override {
+ int ret;
+ reenter(this) {
+ yield {
+ set_status("acquiring sync lock");
+ uint32_t lock_duration = cct->_conf->rgw_sync_lease_period;
+ string lock_name = "sync_lock";
+ RGWRados *store = sync_env->store;
+ lease_cr.reset(new RGWContinuousLeaseCR(sync_env->async_rados, store,
+ rgw_raw_obj(store->svc.zone->get_zone_params().log_pool, sync_env->status_oid()),
+ lock_name, lock_duration, this));
+ lease_stack.reset(spawn(lease_cr.get(), false));
+ }
+ while (!lease_cr->is_locked()) {
+ if (lease_cr->is_done()) {
+ ldpp_dout(sync_env->dpp, 5) << "lease cr failed, done early " << dendl;
+ set_status("lease lock failed, early abort");
+ return set_cr_error(lease_cr->get_ret_status());
+ }
+ set_sleeping(true);
+ yield;
+ }
+ yield {
+ set_status("writing sync status");
+ RGWRados *store = sync_env->store;
+ call(new RGWSimpleRadosWriteCR<rgw_meta_sync_info>(sync_env->async_rados, store->svc.sysobj,
+ rgw_raw_obj(store->svc.zone->get_zone_params().log_pool, sync_env->status_oid()),
+ status));
+ }
+
+ if (retcode < 0) {
+ set_status("failed to write sync status");
+ ldpp_dout(sync_env->dpp, 0) << "ERROR: failed to write sync status, retcode=" << retcode << dendl;
+ yield lease_cr->go_down();
+ return set_cr_error(retcode);
+ }
+ /* fetch current position in logs */
+ set_status("fetching remote log position");
+ yield {
+ for (int i = 0; i < (int)status.num_shards; i++) {
+ spawn(new RGWReadRemoteMDLogShardInfoCR(sync_env, status.period, i,
+ &shards_info[i]), false);
+ }
+ }
+
+ drain_all_but_stack(lease_stack.get()); /* the lease cr still needs to run */
+
+ yield {
+ set_status("updating sync status");
+ for (int i = 0; i < (int)status.num_shards; i++) {
+ rgw_meta_sync_marker marker;
+ RGWMetadataLogInfo& info = shards_info[i];
+ marker.next_step_marker = info.marker;
+ marker.timestamp = info.last_update;
+ RGWRados *store = sync_env->store;
+ spawn(new RGWSimpleRadosWriteCR<rgw_meta_sync_marker>(sync_env->async_rados,
+ store->svc.sysobj,
+ rgw_raw_obj(store->svc.zone->get_zone_params().log_pool, sync_env->shard_obj_name(i)),
+ marker), true);
+ }
+ }
+ yield {
+ set_status("changing sync state: build full sync maps");
+ status.state = rgw_meta_sync_info::StateBuildingFullSyncMaps;
+ RGWRados *store = sync_env->store;
+ call(new RGWSimpleRadosWriteCR<rgw_meta_sync_info>(sync_env->async_rados, store->svc.sysobj,
+ rgw_raw_obj(store->svc.zone->get_zone_params().log_pool, sync_env->status_oid()),
+ status));
+ }
+ set_status("drop lock lease");
+ yield lease_cr->go_down();
+ while (collect(&ret, NULL)) {
+ if (ret < 0) {
+ return set_cr_error(ret);
+ }
+ yield;
+ }
+ drain_all();
+ return set_cr_done();
+ }
+ return 0;
+ }
+};
+
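+/* Read every shard's rgw_meta_sync_marker object, with at most
+ * MAX_CONCURRENT_SHARDS reads in flight at a time. */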
+class RGWReadSyncStatusMarkersCR : public RGWShardCollectCR {
+ static constexpr int MAX_CONCURRENT_SHARDS = 16;
+
+ RGWMetaSyncEnv *env;
+ const int num_shards;
+ int shard_id{0};
+ map<uint32_t, rgw_meta_sync_marker>& markers;
+
+ public:
+ RGWReadSyncStatusMarkersCR(RGWMetaSyncEnv *env, int num_shards,
+ map<uint32_t, rgw_meta_sync_marker>& markers)
+ : RGWShardCollectCR(env->cct, MAX_CONCURRENT_SHARDS),
+ env(env), num_shards(num_shards), markers(markers)
+ {}
+ bool spawn_next() override;
+};
+
+bool RGWReadSyncStatusMarkersCR::spawn_next()
+{
+ if (shard_id >= num_shards) {
+ return false;
+ }
+ using CR = RGWSimpleRadosReadCR<rgw_meta_sync_marker>;
+ rgw_raw_obj obj{env->store->svc.zone->get_zone_params().log_pool,
+ env->shard_obj_name(shard_id)};
+ spawn(new CR(env->async_rados, env->store->svc.sysobj, obj, &markers[shard_id]), false);
+ shard_id++;
+ return true;
+}
+
+class RGWReadSyncStatusCoroutine : public RGWCoroutine {
+ RGWMetaSyncEnv *sync_env;
+ rgw_meta_sync_status *sync_status;
+
+public:
+ RGWReadSyncStatusCoroutine(RGWMetaSyncEnv *_sync_env,
+ rgw_meta_sync_status *_status)
+ : RGWCoroutine(_sync_env->cct), sync_env(_sync_env), sync_status(_status)
+ {}
+ int operate() override;
+};
+
+int RGWReadSyncStatusCoroutine::operate()
+{
+ reenter(this) {
+ // read sync info
+ using ReadInfoCR = RGWSimpleRadosReadCR<rgw_meta_sync_info>;
+ yield {
+ bool empty_on_enoent = false; // fail on ENOENT
+ rgw_raw_obj obj{sync_env->store->svc.zone->get_zone_params().log_pool,
+ sync_env->status_oid()};
+ call(new ReadInfoCR(sync_env->async_rados, sync_env->store->svc.sysobj, obj,
+ &sync_status->sync_info, empty_on_enoent));
+ }
+ if (retcode < 0) {
+ ldpp_dout(sync_env->dpp, 4) << "failed to read sync status info with "
+ << cpp_strerror(retcode) << dendl;
+ return set_cr_error(retcode);
+ }
+ // read shard markers
+ using ReadMarkersCR = RGWReadSyncStatusMarkersCR;
+ yield call(new ReadMarkersCR(sync_env, sync_status->sync_info.num_shards,
+ sync_status->sync_markers));
+ if (retcode < 0) {
+ ldpp_dout(sync_env->dpp, 4) << "failed to read sync status markers with "
+ << cpp_strerror(retcode) << dendl;
+ return set_cr_error(retcode);
+ }
+ return set_cr_done();
+ }
+ return 0;
+}
+
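+/* Full-sync preparation: list every metadata section and key on the master
+ * and append "section:key" entries into the sharded full-sync omap index,
+ * recording per-shard totals in the sync markers. */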
+class RGWFetchAllMetaCR : public RGWCoroutine {
+ RGWMetaSyncEnv *sync_env;
+
+ int num_shards;
+
+
+ int ret_status;
+
+ list<string> sections;
+ list<string>::iterator sections_iter;
+
+ struct meta_list_result {
+ list<string> keys;
+ string marker;
+ uint64_t count{0};
+ bool truncated{false};
+
+ void decode_json(JSONObj *obj) {
+ JSONDecoder::decode_json("keys", keys, obj);
+ JSONDecoder::decode_json("marker", marker, obj);
+ JSONDecoder::decode_json("count", count, obj);
+ JSONDecoder::decode_json("truncated", truncated, obj);
+ }
+ } result;
+ list<string>::iterator iter;
+
+ std::unique_ptr<RGWShardedOmapCRManager> entries_index;
+
+ boost::intrusive_ptr<RGWContinuousLeaseCR> lease_cr;
+ boost::intrusive_ptr<RGWCoroutinesStack> lease_stack;
+ bool lost_lock;
+ bool failed;
+
+ string marker;
+
+ map<uint32_t, rgw_meta_sync_marker>& markers;
+
+ RGWSyncTraceNodeRef tn;
+
+public:
+ RGWFetchAllMetaCR(RGWMetaSyncEnv *_sync_env, int _num_shards,
+ map<uint32_t, rgw_meta_sync_marker>& _markers,
+ RGWSyncTraceNodeRef& _tn_parent) : RGWCoroutine(_sync_env->cct), sync_env(_sync_env),
+ num_shards(_num_shards),
+ ret_status(0), lease_cr(nullptr), lease_stack(nullptr),
+ lost_lock(false), failed(false), markers(_markers) {
+ tn = sync_env->sync_tracer->add_node(_tn_parent, "fetch_all_meta");
+ }
+
+ ~RGWFetchAllMetaCR() override {
+ }
+
+ void append_section_from_set(set<string>& all_sections, const string& name) {
+ set<string>::iterator iter = all_sections.find(name);
+ if (iter != all_sections.end()) {
+ sections.emplace_back(std::move(*iter));
+ all_sections.erase(iter);
+ }
+ }
+ /*
+ * meta sync should go in the following order: user, bucket.instance, bucket
+ * then whatever other sections exist (if any)
+ */
+ void rearrange_sections() {
+ set<string> all_sections;
+ std::move(sections.begin(), sections.end(),
+ std::inserter(all_sections, all_sections.end()));
+ sections.clear();
+
+ append_section_from_set(all_sections, "user");
+ append_section_from_set(all_sections, "bucket.instance");
+ append_section_from_set(all_sections, "bucket");
+
+ std::move(all_sections.begin(), all_sections.end(),
+ std::back_inserter(sections));
+ }
+
+ int operate() override {
+ RGWRESTConn *conn = sync_env->conn;
+
+ reenter(this) {
+ yield {
+ set_status(string("acquiring lock (") + sync_env->status_oid() + ")");
+ uint32_t lock_duration = cct->_conf->rgw_sync_lease_period;
+ string lock_name = "sync_lock";
+ lease_cr.reset(new RGWContinuousLeaseCR(sync_env->async_rados,
+ sync_env->store,
+ rgw_raw_obj(sync_env->store->svc.zone->get_zone_params().log_pool, sync_env->status_oid()),
+ lock_name, lock_duration, this));
+ lease_stack.reset(spawn(lease_cr.get(), false));
+ }
+ while (!lease_cr->is_locked()) {
+ if (lease_cr->is_done()) {
+ ldpp_dout(sync_env->dpp, 5) << "lease cr failed, done early " << dendl;
+ set_status("failed acquiring lock");
+ return set_cr_error(lease_cr->get_ret_status());
+ }
+ set_sleeping(true);
+ yield;
+ }
+ entries_index.reset(new RGWShardedOmapCRManager(sync_env->async_rados, sync_env->store, this, num_shards,
+ sync_env->store->svc.zone->get_zone_params().log_pool,
+ mdlog_sync_full_sync_index_prefix));
+ yield {
+ call(new RGWReadRESTResourceCR<list<string> >(cct, conn, sync_env->http_manager,
+ "/admin/metadata", NULL, &sections));
+ }
+ if (get_ret_status() < 0) {
+ ldpp_dout(sync_env->dpp, 0) << "ERROR: failed to fetch metadata sections" << dendl;
+ yield entries_index->finish();
+ yield lease_cr->go_down();
+ drain_all();
+ return set_cr_error(get_ret_status());
+ }
+ rearrange_sections();
+ sections_iter = sections.begin();
+ for (; sections_iter != sections.end(); ++sections_iter) {
+ do {
+ yield {
+#define META_FULL_SYNC_CHUNK_SIZE "1000"
+ string entrypoint = string("/admin/metadata/") + *sections_iter;
+ rgw_http_param_pair pairs[] = { { "max-entries", META_FULL_SYNC_CHUNK_SIZE },
+ { "marker", result.marker.c_str() },
+ { NULL, NULL } };
+ result.keys.clear();
+ call(new RGWReadRESTResourceCR<meta_list_result >(cct, conn, sync_env->http_manager,
+ entrypoint, pairs, &result));
+ }
+ ret_status = get_ret_status();
+ if (ret_status == -ENOENT) {
+ set_retcode(0); /* reset coroutine status so that we don't return it */
+ ret_status = 0;
+ }
+ if (ret_status < 0) {
+ tn->log(0, SSTR("ERROR: failed to fetch metadata section: " << *sections_iter));
+ yield entries_index->finish();
+ yield lease_cr->go_down();
+ drain_all();
+ return set_cr_error(ret_status);
+ }
+ iter = result.keys.begin();
+ for (; iter != result.keys.end(); ++iter) {
+ if (!lease_cr->is_locked()) {
+ lost_lock = true;
+ break;
+ }
+ yield; // allow entries_index consumer to make progress
+
+ tn->log(20, SSTR("list metadata: section=" << *sections_iter << " key=" << *iter));
+ string s = *sections_iter + ":" + *iter;
+ int shard_id;
+ RGWRados *store = sync_env->store;
+ int ret = store->meta_mgr->get_log_shard_id(*sections_iter, *iter, &shard_id);
+ if (ret < 0) {
+ tn->log(0, SSTR("ERROR: could not determine shard id for " << *sections_iter << ":" << *iter));
+ ret_status = ret;
+ break;
+ }
+ if (!entries_index->append(s, shard_id)) {
+ break;
+ }
+ }
+ } while (result.truncated);
+ }
+ yield {
+ if (!entries_index->finish()) {
+ failed = true;
+ }
+ }
+ if (!failed) {
+ for (map<uint32_t, rgw_meta_sync_marker>::iterator iter = markers.begin(); iter != markers.end(); ++iter) {
+ int shard_id = (int)iter->first;
+ rgw_meta_sync_marker& marker = iter->second;
+ marker.total_entries = entries_index->get_total_entries(shard_id);
+ spawn(new RGWSimpleRadosWriteCR<rgw_meta_sync_marker>(sync_env->async_rados, sync_env->store->svc.sysobj,
+ rgw_raw_obj(sync_env->store->svc.zone->get_zone_params().log_pool, sync_env->shard_obj_name(shard_id)),
+ marker), true);
+ }
+ }
+
+ drain_all_but_stack(lease_stack.get()); /* the lease cr still needs to run */
+
+ yield lease_cr->go_down();
+
+ int ret;
+ while (collect(&ret, NULL)) {
+ if (ret < 0) {
+ return set_cr_error(ret);
+ }
+ yield;
+ }
+ drain_all();
+ if (failed) {
+ yield return set_cr_error(-EIO);
+ }
+ if (lost_lock) {
+ yield return set_cr_error(-EBUSY);
+ }
+
+ if (ret_status < 0) {
+ yield return set_cr_error(ret_status);
+ }
+
+ yield return set_cr_done();
+ }
+ return 0;
+ }
+};
+
+static string full_sync_index_shard_oid(int shard_id)
+{
+ char buf[mdlog_sync_full_sync_index_prefix.size() + 16];
+ snprintf(buf, sizeof(buf), "%s.%d", mdlog_sync_full_sync_index_prefix.c_str(), shard_id);
+ return string(buf);
+}
+
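+/* Fetch a single metadata entry from the master zone with a GET on
+ * /admin/metadata/<section>/<key>. */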
+class RGWReadRemoteMetadataCR : public RGWCoroutine {
+ RGWMetaSyncEnv *sync_env;
+
+ RGWRESTReadResource *http_op;
+
+ string section;
+ string key;
+
+ bufferlist *pbl;
+
+ RGWSyncTraceNodeRef tn;
+
+public:
+ RGWReadRemoteMetadataCR(RGWMetaSyncEnv *_sync_env,
+ const string& _section, const string& _key, bufferlist *_pbl,
+ const RGWSyncTraceNodeRef& _tn_parent) : RGWCoroutine(_sync_env->cct), sync_env(_sync_env),
+ http_op(NULL),
+ section(_section),
+ key(_key),
+ pbl(_pbl) {
+ tn = sync_env->sync_tracer->add_node(_tn_parent, "read_remote_meta",
+ section + ":" + key);
+ }
+
+ int operate() override {
+ RGWRESTConn *conn = sync_env->conn;
+ reenter(this) {
+ yield {
+ rgw_http_param_pair pairs[] = { { "key" , key.c_str()},
+ { NULL, NULL } };
+
+ string p = string("/admin/metadata/") + section + "/" + key;
+
+ http_op = new RGWRESTReadResource(conn, p, pairs, NULL, sync_env->http_manager);
+
+ init_new_io(http_op);
+
+ int ret = http_op->aio_read();
+ if (ret < 0) {
+ ldpp_dout(sync_env->dpp, 0) << "ERROR: failed to fetch mdlog data" << dendl;
+ log_error() << "failed to send http operation: " << http_op->to_str() << " ret=" << ret << std::endl;
+ http_op->put();
+ return set_cr_error(ret);
+ }
+
+ return io_block(0);
+ }
+ yield {
+ int ret = http_op->wait(pbl);
+ http_op->put();
+ if (ret < 0) {
+ return set_cr_error(ret);
+ }
+ return set_cr_done();
+ }
+ }
+ return 0;
+ }
+};
+
+class RGWAsyncMetaStoreEntry : public RGWAsyncRadosRequest {
+ RGWRados *store;
+ string raw_key;
+ bufferlist bl;
+protected:
+ int _send_request() override {
+ int ret = store->meta_mgr->put(raw_key, bl, RGWMetadataHandler::APPLY_ALWAYS);
+ if (ret < 0) {
+ ldout(store->ctx(), 0) << "ERROR: can't store key: " << raw_key << " ret=" << ret << dendl;
+ return ret;
+ }
+ return 0;
+ }
+public:
+ RGWAsyncMetaStoreEntry(RGWCoroutine *caller, RGWAioCompletionNotifier *cn, RGWRados *_store,
+ const string& _raw_key,
+ bufferlist& _bl) : RGWAsyncRadosRequest(caller, cn), store(_store),
+ raw_key(_raw_key), bl(_bl) {}
+};
+
+
+class RGWMetaStoreEntryCR : public RGWSimpleCoroutine {
+ RGWMetaSyncEnv *sync_env;
+ string raw_key;
+ bufferlist bl;
+
+ RGWAsyncMetaStoreEntry *req;
+
+public:
+ RGWMetaStoreEntryCR(RGWMetaSyncEnv *_sync_env,
+ const string& _raw_key,
+ bufferlist& _bl) : RGWSimpleCoroutine(_sync_env->cct), sync_env(_sync_env),
+ raw_key(_raw_key), bl(_bl), req(NULL) {
+ }
+
+ ~RGWMetaStoreEntryCR() override {
+ if (req) {
+ req->finish();
+ }
+ }
+
+ int send_request() override {
+ req = new RGWAsyncMetaStoreEntry(this, stack->create_completion_notifier(),
+ sync_env->store, raw_key, bl);
+ sync_env->async_rados->queue(req);
+ return 0;
+ }
+
+ int request_complete() override {
+ return req->get_ret_status();
+ }
+};
+
+class RGWAsyncMetaRemoveEntry : public RGWAsyncRadosRequest {
+ RGWRados *store;
+ string raw_key;
+protected:
+ int _send_request() override {
+ int ret = store->meta_mgr->remove(raw_key);
+ if (ret < 0) {
+ ldout(store->ctx(), 0) << "ERROR: can't remove key: " << raw_key << " ret=" << ret << dendl;
+ return ret;
+ }
+ return 0;
+ }
+public:
+ RGWAsyncMetaRemoveEntry(RGWCoroutine *caller, RGWAioCompletionNotifier *cn, RGWRados *_store,
+ const string& _raw_key) : RGWAsyncRadosRequest(caller, cn), store(_store),
+ raw_key(_raw_key) {}
+};
+
+
+class RGWMetaRemoveEntryCR : public RGWSimpleCoroutine {
+ RGWMetaSyncEnv *sync_env;
+ string raw_key;
+
+ RGWAsyncMetaRemoveEntry *req;
+
+public:
+ RGWMetaRemoveEntryCR(RGWMetaSyncEnv *_sync_env,
+ const string& _raw_key) : RGWSimpleCoroutine(_sync_env->cct), sync_env(_sync_env),
+ raw_key(_raw_key), req(NULL) {
+ }
+
+ ~RGWMetaRemoveEntryCR() override {
+ if (req) {
+ req->finish();
+ }
+ }
+
+ int send_request() override {
+ req = new RGWAsyncMetaRemoveEntry(this, stack->create_completion_notifier(),
+ sync_env->store, raw_key);
+ sync_env->async_rados->queue(req);
+ return 0;
+ }
+
+ int request_complete() override {
+ int r = req->get_ret_status();
+ if (r == -ENOENT) {
+ r = 0;
+ }
+ return r;
+ }
+};
+
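+/* marker-tracker update window: bounds how frequently the shard marker
+ * object is rewritten as entries complete */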
+#define META_SYNC_UPDATE_MARKER_WINDOW 10
+
+
+int RGWLastCallerWinsCR::operate() {
+ RGWCoroutine *call_cr;
+ reenter(this) {
+ while (cr) {
+ call_cr = cr;
+ cr = nullptr;
+ yield call(call_cr);
+ /* cr might have been modified at this point */
+ }
+ return set_cr_done();
+ }
+ return 0;
+}
+
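+/* Per-shard marker tracker: persists rgw_meta_sync_marker updates to the
+ * shard's status object, resolving concurrent marker writes with
+ * last-caller-wins ordering (RGWLastCallerWinsCR). */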
+class RGWMetaSyncShardMarkerTrack : public RGWSyncShardMarkerTrack<string, string> {
+ RGWMetaSyncEnv *sync_env;
+
+ string marker_oid;
+ rgw_meta_sync_marker sync_marker;
+
+ RGWSyncTraceNodeRef tn;
+
+public:
+ RGWMetaSyncShardMarkerTrack(RGWMetaSyncEnv *_sync_env,
+ const string& _marker_oid,
+ const rgw_meta_sync_marker& _marker,
+ RGWSyncTraceNodeRef& _tn) : RGWSyncShardMarkerTrack(META_SYNC_UPDATE_MARKER_WINDOW),
+ sync_env(_sync_env),
+ marker_oid(_marker_oid),
+ sync_marker(_marker),
+ tn(_tn){}
+
+ RGWCoroutine *store_marker(const string& new_marker, uint64_t index_pos, const real_time& timestamp) override {
+ sync_marker.marker = new_marker;
+ if (index_pos > 0) {
+ sync_marker.pos = index_pos;
+ }
+
+ if (!real_clock::is_zero(timestamp)) {
+ sync_marker.timestamp = timestamp;
+ }
+
+ ldpp_dout(sync_env->dpp, 20) << __func__ << "(): updating marker marker_oid=" << marker_oid << " marker=" << new_marker << " realm_epoch=" << sync_marker.realm_epoch << dendl;
+ tn->log(20, SSTR("new marker=" << new_marker));
+ RGWRados *store = sync_env->store;
+ return new RGWSimpleRadosWriteCR<rgw_meta_sync_marker>(sync_env->async_rados,
+ store->svc.sysobj,
+ rgw_raw_obj(store->svc.zone->get_zone_params().log_pool, marker_oid),
+ sync_marker);
+ }
+
+ RGWOrderCallCR *allocate_order_control_cr() override {
+ return new RGWLastCallerWinsCR(sync_env->cct);
+ }
+};
+
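+/* Sync a single metadata entry: fetch it from the master over REST, then
+ * store it locally (or remove it, if the master returned ENOENT), retrying
+ * transient errors a bounded number of times. */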
+RGWMetaSyncSingleEntryCR::RGWMetaSyncSingleEntryCR(RGWMetaSyncEnv *_sync_env,
+ const string& _raw_key, const string& _entry_marker,
+ const RGWMDLogStatus& _op_status,
+ RGWMetaSyncShardMarkerTrack *_marker_tracker, const RGWSyncTraceNodeRef& _tn_parent) : RGWCoroutine(_sync_env->cct),
+ sync_env(_sync_env),
+ raw_key(_raw_key), entry_marker(_entry_marker),
+ op_status(_op_status),
+ pos(0), sync_status(0),
+ marker_tracker(_marker_tracker), tries(0) {
+ error_injection = (sync_env->cct->_conf->rgw_sync_meta_inject_err_probability > 0);
+ tn = sync_env->sync_tracer->add_node(_tn_parent, "entry", raw_key);
+}
+
+int RGWMetaSyncSingleEntryCR::operate() {
+ reenter(this) {
+#define NUM_TRANSIENT_ERROR_RETRIES 10
+
+ if (error_injection &&
+ rand() % 10000 < cct->_conf->rgw_sync_meta_inject_err_probability * 10000.0) {
+ ldpp_dout(sync_env->dpp, 0) << __FILE__ << ":" << __LINE__ << ": injecting meta sync error on key=" << raw_key << dendl;
+ return set_cr_error(-EIO);
+ }
+
+ if (op_status != MDLOG_STATUS_COMPLETE) {
+ tn->log(20, "skipping pending operation");
+ yield call(marker_tracker->finish(entry_marker));
+ if (retcode < 0) {
+ return set_cr_error(retcode);
+ }
+ return set_cr_done();
+ }
+ tn->set_flag(RGW_SNS_FLAG_ACTIVE);
+ for (tries = 0; tries < NUM_TRANSIENT_ERROR_RETRIES; tries++) {
+ yield {
+ pos = raw_key.find(':');
+ section = raw_key.substr(0, pos);
+ key = raw_key.substr(pos + 1);
+ tn->log(10, SSTR("fetching remote metadata entry" << (tries == 0 ? "" : " (retry)")));
+ call(new RGWReadRemoteMetadataCR(sync_env, section, key, &md_bl, tn));
+ }
+
+ sync_status = retcode;
+
+ if (sync_status == -ENOENT) {
+ /* FIXME: do we need to remove the entry from the local zone? */
+ break;
+ }
+
+ if ((sync_status == -EAGAIN || sync_status == -ECANCELED) && (tries < NUM_TRANSIENT_ERROR_RETRIES - 1)) {
+ ldpp_dout(sync_env->dpp, 20) << *this << ": failed to fetch remote metadata: " << section << ":" << key << ", will retry" << dendl;
+ continue;
+ }
+
+ if (sync_status < 0) {
+ tn->log(10, SSTR("failed to send read remote metadata entry: section=" << section << " key=" << key << " status=" << sync_status));
+ log_error() << "failed to send read remote metadata entry: section=" << section << " key=" << key << " status=" << sync_status << std::endl;
+ yield call(sync_env->error_logger->log_error_cr(sync_env->conn->get_remote_id(), section, key, -sync_status,
+ string("failed to read remote metadata entry: ") + cpp_strerror(-sync_status)));
+ return set_cr_error(sync_status);
+ }
+
+ break;
+ }
+
+ retcode = 0;
+ for (tries = 0; tries < NUM_TRANSIENT_ERROR_RETRIES; tries++) {
+ if (sync_status != -ENOENT) {
+ tn->log(10, SSTR("storing local metadata entry"));
+ yield call(new RGWMetaStoreEntryCR(sync_env, raw_key, md_bl));
+ } else {
+ tn->log(10, SSTR("removing local metadata entry"));
+ yield call(new RGWMetaRemoveEntryCR(sync_env, raw_key));
+ }
+ if ((retcode == -EAGAIN || retcode == -ECANCELED) && (tries < NUM_TRANSIENT_ERROR_RETRIES - 1)) {
+ ldpp_dout(sync_env->dpp, 20) << *this << ": failed to store metadata: " << section << ":" << key << ", got retcode=" << retcode << dendl;
+ continue;
+ }
+ break;
+ }
+
+ sync_status = retcode;
+
+ if (sync_status == 0 && marker_tracker) {
+ /* update marker */
+ yield call(marker_tracker->finish(entry_marker));
+ sync_status = retcode;
+ }
+ if (sync_status < 0) {
+ tn->log(10, SSTR("failed, status=" << sync_status));
+ return set_cr_error(sync_status);
+ }
+ tn->log(10, "success");
+ return set_cr_done();
+ }
+ return 0;
+}
+
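+/* Pull up to CLONE_MAX_ENTRIES entries from a remote mdlog shard over REST
+ * and store them in the local mdlog for the same period, advancing
+ * new_marker past the cloned entries. */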
+class RGWCloneMetaLogCoroutine : public RGWCoroutine {
+ RGWMetaSyncEnv *sync_env;
+ RGWMetadataLog *mdlog;
+
+ const std::string& period;
+ int shard_id;
+ string marker;
+ bool truncated = false;
+ string *new_marker;
+
+ int max_entries = CLONE_MAX_ENTRIES;
+
+ RGWRESTReadResource *http_op = nullptr;
+ boost::intrusive_ptr<RGWMetadataLogInfoCompletion> completion;
+
+ RGWMetadataLogInfo shard_info;
+ rgw_mdlog_shard_data data;
+
+public:
+ RGWCloneMetaLogCoroutine(RGWMetaSyncEnv *_sync_env, RGWMetadataLog* mdlog,
+ const std::string& period, int _id,
+ const string& _marker, string *_new_marker)
+ : RGWCoroutine(_sync_env->cct), sync_env(_sync_env), mdlog(mdlog),
+ period(period), shard_id(_id), marker(_marker), new_marker(_new_marker) {
+ if (new_marker) {
+ *new_marker = marker;
+ }
+ }
+ ~RGWCloneMetaLogCoroutine() override {
+ if (http_op) {
+ http_op->put();
+ }
+ if (completion) {
+ completion->cancel();
+ }
+ }
+
+ int operate() override;
+
+ int state_init();
+ int state_read_shard_status();
+ int state_read_shard_status_complete();
+ int state_send_rest_request();
+ int state_receive_rest_response();
+ int state_store_mdlog_entries();
+ int state_store_mdlog_entries_complete();
+};
+
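+/* Syncs a single mdlog shard. Runs as a state machine: full_sync() drains
+ * the full-sync omap index built by RGWFetchAllMetaCR, then the marker
+ * state switches to IncrementalSync and incremental_sync() follows the
+ * mdlog tail. */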
+class RGWMetaSyncShardCR : public RGWCoroutine {
+ RGWMetaSyncEnv *sync_env;
+
+ const rgw_pool& pool;
+  const std::string& period; ///< currently syncing period id
+  const epoch_t realm_epoch; ///< realm_epoch of period
+  RGWMetadataLog* mdlog; ///< log of syncing period
+ uint32_t shard_id;
+ rgw_meta_sync_marker& sync_marker;
+  boost::optional<rgw_meta_sync_marker> temp_marker; ///< for pending updates
+ string marker;
+ string max_marker;
+  const std::string& period_marker; ///< max marker stored in next period
+
+ RGWRadosGetOmapKeysCR::ResultPtr omapkeys;
+ std::set<std::string> entries;
+ std::set<std::string>::iterator iter;
+
+ string oid;
+
+ RGWMetaSyncShardMarkerTrack *marker_tracker = nullptr;
+
+ list<cls_log_entry> log_entries;
+ list<cls_log_entry>::iterator log_iter;
+ bool truncated = false;
+
+ string mdlog_marker;
+ string raw_key;
+ rgw_mdlog_entry mdlog_entry;
+
+ Mutex inc_lock;
+ Cond inc_cond;
+
+ boost::asio::coroutine incremental_cr;
+ boost::asio::coroutine full_cr;
+
+ boost::intrusive_ptr<RGWContinuousLeaseCR> lease_cr;
+ boost::intrusive_ptr<RGWCoroutinesStack> lease_stack;
+
+ bool lost_lock = false;
+
+ bool *reset_backoff;
+
+ // hold a reference to the cr stack while it's in the map
+ using StackRef = boost::intrusive_ptr<RGWCoroutinesStack>;
+ map<StackRef, string> stack_to_pos;
+ map<string, string> pos_to_prev;
+
+ bool can_adjust_marker = false;
+ bool done_with_period = false;
+
+ int total_entries = 0;
+
+ RGWSyncTraceNodeRef tn;
+public:
+ RGWMetaSyncShardCR(RGWMetaSyncEnv *_sync_env, const rgw_pool& _pool,
+ const std::string& period, epoch_t realm_epoch,
+ RGWMetadataLog* mdlog, uint32_t _shard_id,
+ rgw_meta_sync_marker& _marker,
+ const std::string& period_marker, bool *_reset_backoff,
+ RGWSyncTraceNodeRef& _tn)
+ : RGWCoroutine(_sync_env->cct), sync_env(_sync_env), pool(_pool),
+ period(period), realm_epoch(realm_epoch), mdlog(mdlog),
+ shard_id(_shard_id), sync_marker(_marker),
+ period_marker(period_marker), inc_lock("RGWMetaSyncShardCR::inc_lock"),
+ reset_backoff(_reset_backoff), tn(_tn) {
+ *reset_backoff = false;
+ }
+
+ ~RGWMetaSyncShardCR() override {
+ delete marker_tracker;
+ if (lease_cr) {
+ lease_cr->abort();
+ }
+ }
+
+ void set_marker_tracker(RGWMetaSyncShardMarkerTrack *mt) {
+ delete marker_tracker;
+ marker_tracker = mt;
+ }
+
+ int operate() override {
+ int r;
+ while (true) {
+ switch (sync_marker.state) {
+ case rgw_meta_sync_marker::FullSync:
+ r = full_sync();
+ if (r < 0) {
+ ldpp_dout(sync_env->dpp, 10) << "sync: full_sync: shard_id=" << shard_id << " r=" << r << dendl;
+ return set_cr_error(r);
+ }
+ return 0;
+ case rgw_meta_sync_marker::IncrementalSync:
+ r = incremental_sync();
+ if (r < 0) {
+ ldpp_dout(sync_env->dpp, 10) << "sync: incremental_sync: shard_id=" << shard_id << " r=" << r << dendl;
+ return set_cr_error(r);
+ }
+ return 0;
+ }
+ }
+ /* unreachable */
+ return 0;
+ }
+
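+  /* Reap completed child stacks and advance sync_marker to the newest
+   * position whose predecessors have all completed. A child failing with
+   * -EAGAIN clears can_adjust_marker, so the marker stops advancing and the
+   * entry is retried on the next pass. */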
+ void collect_children()
+ {
+ int child_ret;
+ RGWCoroutinesStack *child;
+ while (collect_next(&child_ret, &child)) {
+ auto iter = stack_to_pos.find(child);
+ if (iter == stack_to_pos.end()) {
+ /* some other stack that we don't care about */
+ continue;
+ }
+
+ string& pos = iter->second;
+
+ if (child_ret < 0) {
+ ldpp_dout(sync_env->dpp, 0) << *this << ": child operation stack=" << child << " entry=" << pos << " returned " << child_ret << dendl;
+ }
+
+ map<string, string>::iterator prev_iter = pos_to_prev.find(pos);
+ ceph_assert(prev_iter != pos_to_prev.end());
+
+ /*
+ * we should get -EAGAIN for transient errors, for which we want to retry, so we don't
+ * update the marker and abort. We'll get called again for these. Permanent errors will be
+ * handled by marking the entry at the error log shard, so that we retry on it separately
+ */
+ if (child_ret == -EAGAIN) {
+ can_adjust_marker = false;
+ }
+
+ if (pos_to_prev.size() == 1) {
+ if (can_adjust_marker) {
+ sync_marker.marker = pos;
+ }
+ pos_to_prev.erase(prev_iter);
+ } else {
+ ceph_assert(pos_to_prev.size() > 1);
+ pos_to_prev.erase(prev_iter);
+ prev_iter = pos_to_prev.begin();
+ if (can_adjust_marker) {
+ sync_marker.marker = prev_iter->second;
+ }
+ }
+
+ ldpp_dout(sync_env->dpp, 4) << *this << ": adjusting marker pos=" << sync_marker.marker << dendl;
+ stack_to_pos.erase(iter);
+ }
+ }
+
+ int full_sync() {
+#define OMAP_GET_MAX_ENTRIES 100
+ int max_entries = OMAP_GET_MAX_ENTRIES;
+ reenter(&full_cr) {
+ set_status("full_sync");
+ tn->log(10, "start full sync");
+ oid = full_sync_index_shard_oid(shard_id);
+ can_adjust_marker = true;
+ /* grab lock */
+ yield {
+ uint32_t lock_duration = cct->_conf->rgw_sync_lease_period;
+ string lock_name = "sync_lock";
+ RGWRados *store = sync_env->store;
+ lease_cr.reset(new RGWContinuousLeaseCR(sync_env->async_rados, store,
+ rgw_raw_obj(pool, sync_env->shard_obj_name(shard_id)),
+ lock_name, lock_duration, this));
+ lease_stack.reset(spawn(lease_cr.get(), false));
+ lost_lock = false;
+ }
+ while (!lease_cr->is_locked()) {
+ if (lease_cr->is_done()) {
+ drain_all();
+ tn->log(5, "failed to take lease");
+ return lease_cr->get_ret_status();
+ }
+ set_sleeping(true);
+ yield;
+ }
+ tn->log(10, "took lease");
+
+ /* lock succeeded, a retry now should avoid previous backoff status */
+ *reset_backoff = true;
+
+ /* prepare marker tracker */
+ set_marker_tracker(new RGWMetaSyncShardMarkerTrack(sync_env,
+ sync_env->shard_obj_name(shard_id),
+ sync_marker, tn));
+
+ marker = sync_marker.marker;
+
+ total_entries = sync_marker.pos;
+
+ /* sync! */
+ do {
+ if (!lease_cr->is_locked()) {
+ tn->log(10, "lost lease");
+ lost_lock = true;
+ break;
+ }
+ omapkeys = std::make_shared<RGWRadosGetOmapKeysCR::Result>();
+ yield call(new RGWRadosGetOmapKeysCR(sync_env->store, rgw_raw_obj(pool, oid),
+ marker, max_entries, omapkeys));
+ if (retcode < 0) {
+ ldpp_dout(sync_env->dpp, 0) << "ERROR: " << __func__ << "(): RGWRadosGetOmapKeysCR() returned ret=" << retcode << dendl;
+ tn->log(0, SSTR("ERROR: failed to list omap keys, status=" << retcode));
+ yield lease_cr->go_down();
+ drain_all();
+ return retcode;
+ }
+ entries = std::move(omapkeys->entries);
+ tn->log(20, SSTR("retrieved " << entries.size() << " entries to sync"));
+ if (entries.size() > 0) {
+ tn->set_flag(RGW_SNS_FLAG_ACTIVE); /* actually have entries to sync */
+ }
+ iter = entries.begin();
+ for (; iter != entries.end(); ++iter) {
+ marker = *iter;
+ tn->log(20, SSTR("full sync: " << marker));
+ total_entries++;
+ if (!marker_tracker->start(marker, total_entries, real_time())) {
+ tn->log(0, SSTR("ERROR: cannot start syncing " << marker << ". Duplicate entry?"));
+ } else {
+ // fetch remote and write locally
+ yield {
+ RGWCoroutinesStack *stack = spawn(new RGWMetaSyncSingleEntryCR(sync_env, marker, marker, MDLOG_STATUS_COMPLETE, marker_tracker, tn), false);
+ // stack_to_pos holds a reference to the stack
+ stack_to_pos[stack] = marker;
+ pos_to_prev[marker] = marker;
+ }
+ }
+ }
+ collect_children();
+ } while (omapkeys->more && can_adjust_marker);
+
+      tn->unset_flag(RGW_SNS_FLAG_ACTIVE); /* done processing this shard's full-sync entries */
+
+ while (num_spawned() > 1) {
+ yield wait_for_child();
+ collect_children();
+ }
+
+ if (!lost_lock) {
+ /* update marker to reflect we're done with full sync */
+ if (can_adjust_marker) {
+ // apply updates to a temporary marker, or operate() will send us
+ // to incremental_sync() after we yield
+ temp_marker = sync_marker;
+ temp_marker->state = rgw_meta_sync_marker::IncrementalSync;
+ temp_marker->marker = std::move(temp_marker->next_step_marker);
+ temp_marker->next_step_marker.clear();
+ temp_marker->realm_epoch = realm_epoch;
+ ldpp_dout(sync_env->dpp, 4) << *this << ": saving marker pos=" << temp_marker->marker << " realm_epoch=" << realm_epoch << dendl;
+
+ using WriteMarkerCR = RGWSimpleRadosWriteCR<rgw_meta_sync_marker>;
+ yield call(new WriteMarkerCR(sync_env->async_rados, sync_env->store->svc.sysobj,
+ rgw_raw_obj(pool, sync_env->shard_obj_name(shard_id)),
+ *temp_marker));
+ }
+
+ if (retcode < 0) {
+ ldpp_dout(sync_env->dpp, 0) << "ERROR: failed to set sync marker: retcode=" << retcode << dendl;
+ yield lease_cr->go_down();
+ drain_all();
+ return retcode;
+ }
+ }
+
+      /*
+       * release the lease and drain all spawned stacks before reporting
+       * failures (lost lock, non-adjustable marker) or applying the
+       * temp_marker update below
+       */
+
+ yield lease_cr->go_down();
+
+ lease_cr.reset();
+
+ drain_all();
+
+ if (!can_adjust_marker) {
+ return -EAGAIN;
+ }
+
+ if (lost_lock) {
+ return -EBUSY;
+ }
+
+ tn->log(10, "full sync complete");
+
+ // apply the sync marker update
+ ceph_assert(temp_marker);
+ sync_marker = std::move(*temp_marker);
+ temp_marker = boost::none;
+ // must not yield after this point!
+ }
+ return 0;
+ }
+
+
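+  /* Incremental sync: repeatedly clone remote mdlog entries into the local
+   * mdlog, replay them by spawning one RGWMetaSyncSingleEntryCR per entry,
+   * and stop at period_marker once the current period has been consumed. */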
+ int incremental_sync() {
+ reenter(&incremental_cr) {
+ set_status("incremental_sync");
+ tn->log(10, "start incremental sync");
+ can_adjust_marker = true;
+ /* grab lock */
+ if (!lease_cr) { /* could have had a lease_cr lock from previous state */
+ yield {
+ uint32_t lock_duration = cct->_conf->rgw_sync_lease_period;
+ string lock_name = "sync_lock";
+ RGWRados *store = sync_env->store;
+ lease_cr.reset( new RGWContinuousLeaseCR(sync_env->async_rados, store,
+ rgw_raw_obj(pool, sync_env->shard_obj_name(shard_id)),
+ lock_name, lock_duration, this));
+ lease_stack.reset(spawn(lease_cr.get(), false));
+ lost_lock = false;
+ }
+ while (!lease_cr->is_locked()) {
+ if (lease_cr->is_done()) {
+ drain_all();
+ tn->log(10, "failed to take lease");
+ return lease_cr->get_ret_status();
+ }
+ set_sleeping(true);
+ yield;
+ }
+ }
+ tn->log(10, "took lease");
+ // if the period has advanced, we can't use the existing marker
+ if (sync_marker.realm_epoch < realm_epoch) {
+ ldpp_dout(sync_env->dpp, 4) << "clearing marker=" << sync_marker.marker
+ << " from old realm_epoch=" << sync_marker.realm_epoch
+ << " (now " << realm_epoch << ')' << dendl;
+ sync_marker.realm_epoch = realm_epoch;
+ sync_marker.marker.clear();
+ }
+ mdlog_marker = sync_marker.marker;
+ set_marker_tracker(new RGWMetaSyncShardMarkerTrack(sync_env,
+ sync_env->shard_obj_name(shard_id),
+ sync_marker, tn));
+
+ /*
+       * mdlog_marker: the remote sync marker position
+ * sync_marker: the local sync marker position
+ * max_marker: the max mdlog position that we fetched
+ * marker: the current position we try to sync
+ * period_marker: the last marker before the next period begins (optional)
+ */
+ marker = max_marker = sync_marker.marker;
+ /* inc sync */
+ do {
+ if (!lease_cr->is_locked()) {
+ lost_lock = true;
+ tn->log(10, "lost lease");
+ break;
+ }
+#define INCREMENTAL_MAX_ENTRIES 100
+ ldpp_dout(sync_env->dpp, 20) << __func__ << ":" << __LINE__ << ": shard_id=" << shard_id << " mdlog_marker=" << mdlog_marker << " sync_marker.marker=" << sync_marker.marker << " period_marker=" << period_marker << dendl;
+ if (!period_marker.empty() && period_marker <= mdlog_marker) {
+ tn->log(10, SSTR("finished syncing current period: mdlog_marker=" << mdlog_marker << " sync_marker=" << sync_marker.marker << " period_marker=" << period_marker));
+ done_with_period = true;
+ break;
+ }
+ if (mdlog_marker <= max_marker) {
+ /* we're at the tip, try to bring more entries */
+          ldpp_dout(sync_env->dpp, 20) << __func__ << ":" << __LINE__ << ": syncing mdlog for shard_id=" << shard_id << dendl;
+ yield call(new RGWCloneMetaLogCoroutine(sync_env, mdlog,
+ period, shard_id,
+ mdlog_marker, &mdlog_marker));
+ }
+ if (retcode < 0) {
+ tn->log(10, SSTR(*this << ": failed to fetch more log entries, retcode=" << retcode));
+ yield lease_cr->go_down();
+ drain_all();
+ *reset_backoff = false; // back off and try again later
+ return retcode;
+ }
+ *reset_backoff = true; /* if we got to this point, all systems function */
+ if (mdlog_marker > max_marker) {
+ tn->set_flag(RGW_SNS_FLAG_ACTIVE); /* actually have entries to sync */
+ tn->log(20, SSTR("mdlog_marker=" << mdlog_marker << " sync_marker=" << sync_marker.marker));
+ marker = max_marker;
+ yield call(new RGWReadMDLogEntriesCR(sync_env, mdlog, shard_id,
+ &max_marker, INCREMENTAL_MAX_ENTRIES,
+ &log_entries, &truncated));
+ if (retcode < 0) {
+ tn->log(10, SSTR("failed to list mdlog entries, retcode=" << retcode));
+ yield lease_cr->go_down();
+ drain_all();
+ *reset_backoff = false; // back off and try again later
+ return retcode;
+ }
+ for (log_iter = log_entries.begin(); log_iter != log_entries.end() && !done_with_period; ++log_iter) {
+ if (!period_marker.empty() && period_marker <= log_iter->id) {
+ done_with_period = true;
+ if (period_marker < log_iter->id) {
+ tn->log(10, SSTR("found key=" << log_iter->id
+ << " past period_marker=" << period_marker));
+ break;
+ }
+ ldpp_dout(sync_env->dpp, 10) << "found key at period_marker=" << period_marker << dendl;
+ // sync this entry, then return control to RGWMetaSyncCR
+ }
+ if (!mdlog_entry.convert_from(*log_iter)) {
+ tn->log(0, SSTR("ERROR: failed to convert mdlog entry, shard_id=" << shard_id << " log_entry: " << log_iter->id << ":" << log_iter->section << ":" << log_iter->name << ":" << log_iter->timestamp << " ... skipping entry"));
+ continue;
+ }
+ tn->log(20, SSTR("log_entry: " << log_iter->id << ":" << log_iter->section << ":" << log_iter->name << ":" << log_iter->timestamp));
+ if (!marker_tracker->start(log_iter->id, 0, log_iter->timestamp.to_real_time())) {
+ ldpp_dout(sync_env->dpp, 0) << "ERROR: cannot start syncing " << log_iter->id << ". Duplicate entry?" << dendl;
+ } else {
+ raw_key = log_iter->section + ":" + log_iter->name;
+ yield {
+ RGWCoroutinesStack *stack = spawn(new RGWMetaSyncSingleEntryCR(sync_env, raw_key, log_iter->id, mdlog_entry.log_data.status, marker_tracker, tn), false);
+ ceph_assert(stack);
+ // stack_to_pos holds a reference to the stack
+ stack_to_pos[stack] = log_iter->id;
+ pos_to_prev[log_iter->id] = marker;
+ }
+ }
+ marker = log_iter->id;
+ }
+ }
+ collect_children();
+ ldpp_dout(sync_env->dpp, 20) << __func__ << ":" << __LINE__ << ": shard_id=" << shard_id << " mdlog_marker=" << mdlog_marker << " max_marker=" << max_marker << " sync_marker.marker=" << sync_marker.marker << " period_marker=" << period_marker << dendl;
+ if (done_with_period) {
+ // return control to RGWMetaSyncCR and advance to the next period
+ tn->log(10, SSTR(*this << ": done with period"));
+ break;
+ }
+ if (mdlog_marker == max_marker && can_adjust_marker) {
+ tn->unset_flag(RGW_SNS_FLAG_ACTIVE);
+#define INCREMENTAL_INTERVAL 20
+ yield wait(utime_t(INCREMENTAL_INTERVAL, 0));
+ }
+ } while (can_adjust_marker);
+
+ tn->unset_flag(RGW_SNS_FLAG_ACTIVE);
+
+ while (num_spawned() > 1) {
+ yield wait_for_child();
+ collect_children();
+ }
+
+ yield lease_cr->go_down();
+
+ drain_all();
+
+ if (lost_lock) {
+ return -EBUSY;
+ }
+
+ if (!can_adjust_marker) {
+ return -EAGAIN;
+ }
+
+ return set_cr_done();
+ }
+ /* TODO */
+ return 0;
+ }
+};
+
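+/* Backoff/retry wrapper around RGWMetaSyncShardCR; retries on any error
+ * (exit_on_error = false) and re-reads the shard's marker object once the
+ * shard sync completes. */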
+class RGWMetaSyncShardControlCR : public RGWBackoffControlCR
+{
+ RGWMetaSyncEnv *sync_env;
+
+ const rgw_pool& pool;
+ const std::string& period;
+ epoch_t realm_epoch;
+ RGWMetadataLog* mdlog;
+ uint32_t shard_id;
+ rgw_meta_sync_marker sync_marker;
+ const std::string period_marker;
+
+ RGWSyncTraceNodeRef tn;
+
+ static constexpr bool exit_on_error = false; // retry on all errors
+public:
+ RGWMetaSyncShardControlCR(RGWMetaSyncEnv *_sync_env, const rgw_pool& _pool,
+ const std::string& period, epoch_t realm_epoch,
+ RGWMetadataLog* mdlog, uint32_t _shard_id,
+ const rgw_meta_sync_marker& _marker,
+ std::string&& period_marker,
+ RGWSyncTraceNodeRef& _tn_parent)
+ : RGWBackoffControlCR(_sync_env->cct, exit_on_error), sync_env(_sync_env),
+ pool(_pool), period(period), realm_epoch(realm_epoch), mdlog(mdlog),
+ shard_id(_shard_id), sync_marker(_marker),
+ period_marker(std::move(period_marker)) {
+ tn = sync_env->sync_tracer->add_node(_tn_parent, "shard",
+ std::to_string(shard_id));
+ }
+
+ RGWCoroutine *alloc_cr() override {
+ return new RGWMetaSyncShardCR(sync_env, pool, period, realm_epoch, mdlog,
+ shard_id, sync_marker, period_marker, backoff_ptr(), tn);
+ }
+
+ RGWCoroutine *alloc_finisher_cr() override {
+ RGWRados *store = sync_env->store;
+ return new RGWSimpleRadosReadCR<rgw_meta_sync_marker>(sync_env->async_rados, store->svc.sysobj,
+ rgw_raw_obj(pool, sync_env->shard_obj_name(shard_id)),
+ &sync_marker);
+ }
+};
+
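+/* Top-level meta sync coroutine: walks the period history one period at a
+ * time, spawning an RGWMetaSyncShardControlCR per shard, and advances the
+ * stored sync_info to the next period once all shards finish. */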
+class RGWMetaSyncCR : public RGWCoroutine {
+ RGWMetaSyncEnv *sync_env;
+ const rgw_pool& pool;
+  RGWPeriodHistory::Cursor cursor; ///< sync position in period history
+  RGWPeriodHistory::Cursor next; ///< next period in history
+ rgw_meta_sync_status sync_status;
+ RGWSyncTraceNodeRef tn;
+
+  std::mutex mutex; ///< protect access to shard_crs
+
+ // TODO: it should be enough to hold a reference on the stack only, as calling
+ // RGWCoroutinesStack::wakeup() doesn't refer to the RGWCoroutine if it has
+ // already completed
+ using ControlCRRef = boost::intrusive_ptr<RGWMetaSyncShardControlCR>;
+ using StackRef = boost::intrusive_ptr<RGWCoroutinesStack>;
+ using RefPair = std::pair<ControlCRRef, StackRef>;
+ map<int, RefPair> shard_crs;
+ int ret{0};
+
+public:
+ RGWMetaSyncCR(RGWMetaSyncEnv *_sync_env, const RGWPeriodHistory::Cursor &cursor,
+ const rgw_meta_sync_status& _sync_status, RGWSyncTraceNodeRef& _tn)
+ : RGWCoroutine(_sync_env->cct), sync_env(_sync_env),
+ pool(sync_env->store->svc.zone->get_zone_params().log_pool),
+ cursor(cursor), sync_status(_sync_status), tn(_tn) {}
+
+ ~RGWMetaSyncCR() {
+ }
+
+ int operate() override {
+ reenter(this) {
+ // loop through one period at a time
+ tn->log(1, "start");
+ for (;;) {
+ if (cursor == sync_env->store->period_history->get_current()) {
+ next = RGWPeriodHistory::Cursor{};
+ if (cursor) {
+ ldpp_dout(sync_env->dpp, 10) << "RGWMetaSyncCR on current period="
+ << cursor.get_period().get_id() << dendl;
+ } else {
+ ldpp_dout(sync_env->dpp, 10) << "RGWMetaSyncCR with no period" << dendl;
+ }
+ } else {
+ next = cursor;
+ next.next();
+ ldpp_dout(sync_env->dpp, 10) << "RGWMetaSyncCR on period="
+ << cursor.get_period().get_id() << ", next="
+ << next.get_period().get_id() << dendl;
+ }
+
+ yield {
+ // get the mdlog for the current period (may be empty)
+ auto& period_id = sync_status.sync_info.period;
+ auto realm_epoch = sync_status.sync_info.realm_epoch;
+ auto mdlog = sync_env->store->meta_mgr->get_log(period_id);
+
+ tn->log(1, SSTR("realm epoch=" << realm_epoch << " period id=" << period_id));
+
+ // prevent wakeup() from accessing shard_crs while we're spawning them
+ std::lock_guard<std::mutex> lock(mutex);
+
+ // sync this period on each shard
+ for (const auto& m : sync_status.sync_markers) {
+ uint32_t shard_id = m.first;
+ auto& marker = m.second;
+
+ std::string period_marker;
+ if (next) {
+ // read the maximum marker from the next period's sync status
+ period_marker = next.get_period().get_sync_status()[shard_id];
+ if (period_marker.empty()) {
+ // no metadata changes have occurred on this shard, skip it
+ ldpp_dout(sync_env->dpp, 10) << "RGWMetaSyncCR: skipping shard " << shard_id
+ << " with empty period marker" << dendl;
+ continue;
+ }
+ }
+
+ using ShardCR = RGWMetaSyncShardControlCR;
+ auto cr = new ShardCR(sync_env, pool, period_id, realm_epoch,
+ mdlog, shard_id, marker,
+ std::move(period_marker), tn);
+ auto stack = spawn(cr, false);
+ shard_crs[shard_id] = RefPair{cr, stack};
+ }
+ }
+ // wait for each shard to complete
+ while (ret == 0 && num_spawned() > 0) {
+ yield wait_for_child();
+ collect(&ret, nullptr);
+ }
+ drain_all();
+ {
+ // drop shard cr refs under lock
+ std::lock_guard<std::mutex> lock(mutex);
+ shard_crs.clear();
+ }
+ if (ret < 0) {
+ return set_cr_error(ret);
+ }
+ // advance to the next period
+ ceph_assert(next);
+ cursor = next;
+
+ // write the updated sync info
+ sync_status.sync_info.period = cursor.get_period().get_id();
+ sync_status.sync_info.realm_epoch = cursor.get_epoch();
+ yield call(new RGWSimpleRadosWriteCR<rgw_meta_sync_info>(sync_env->async_rados,
+ sync_env->store->svc.sysobj,
+ rgw_raw_obj(pool, sync_env->status_oid()),
+ sync_status.sync_info));
+ }
+ }
+ return 0;
+ }
+
+ void wakeup(int shard_id) {
+ std::lock_guard<std::mutex> lock(mutex);
+ auto iter = shard_crs.find(shard_id);
+ if (iter == shard_crs.end()) {
+ return;
+ }
+ iter->second.first->wakeup();
+ }
+};
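+
+/* Illustrative note on wakeup(): mdlog change notifications are expected to
+ * arrive via RGWRemoteMetaLog::wakeup(shard_id) (defined below), which simply
+ * forwards here; the lookup under `mutex` then pokes the matching shard
+ * control CR so its shard coroutine can cut short the INCREMENTAL_INTERVAL
+ * wait and re-read the log immediately. */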
+
+void RGWRemoteMetaLog::init_sync_env(RGWMetaSyncEnv *env) {
+ env->dpp = dpp;
+ env->cct = store->ctx();
+ env->store = store;
+ env->conn = conn;
+ env->async_rados = async_rados;
+ env->http_manager = &http_manager;
+ env->error_logger = error_logger;
+ env->sync_tracer = store->get_sync_tracer();
+}
+
+int RGWRemoteMetaLog::read_sync_status(rgw_meta_sync_status *sync_status)
+{
+ if (store->svc.zone->is_meta_master()) {
+ return 0;
+ }
+ // cannot run concurrently with run_sync(), so run in a separate manager
+ RGWCoroutinesManager crs(store->ctx(), store->get_cr_registry());
+ RGWHTTPManager http_manager(store->ctx(), crs.get_completion_mgr());
+ int ret = http_manager.start();
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "failed in http_manager.start() ret=" << ret << dendl;
+ return ret;
+ }
+ RGWMetaSyncEnv sync_env_local = sync_env;
+ sync_env_local.http_manager = &http_manager;
+ tn->log(20, "read sync status");
+ ret = crs.run(new RGWReadSyncStatusCoroutine(&sync_env_local, sync_status));
+ http_manager.stop();
+ return ret;
+}
+
+int RGWRemoteMetaLog::init_sync_status()
+{
+ if (store->svc.zone->is_meta_master()) {
+ return 0;
+ }
+
+ rgw_mdlog_info mdlog_info;
+ int r = read_log_info(&mdlog_info);
+ if (r < 0) {
+ lderr(store->ctx()) << "ERROR: fail to fetch master log info (r=" << r << ")" << dendl;
+ return r;
+ }
+
+ rgw_meta_sync_info sync_info;
+ sync_info.num_shards = mdlog_info.num_shards;
+ auto cursor = store->period_history->get_current();
+ if (cursor) {
+ sync_info.period = cursor.get_period().get_id();
+ sync_info.realm_epoch = cursor.get_epoch();
+ }
+
+ return run(new RGWInitSyncStatusCoroutine(&sync_env, sync_info));
+}
+
+int RGWRemoteMetaLog::store_sync_info(const rgw_meta_sync_info& sync_info)
+{
+ tn->log(20, "store sync info");
+ return run(new RGWSimpleRadosWriteCR<rgw_meta_sync_info>(async_rados, store->svc.sysobj,
+ rgw_raw_obj(store->svc.zone->get_zone_params().log_pool, sync_env.status_oid()),
+ sync_info));
+}
+
+// return a cursor to the period at our sync position
+static RGWPeriodHistory::Cursor get_period_at(RGWRados* store,
+ const rgw_meta_sync_info& info)
+{
+ if (info.period.empty()) {
+ // return an empty cursor with error=0
+ return RGWPeriodHistory::Cursor{};
+ }
+
+ // look for an existing period in our history
+ auto cursor = store->period_history->lookup(info.realm_epoch);
+ if (cursor) {
+ // verify that the period ids match
+ auto& existing = cursor.get_period().get_id();
+ if (existing != info.period) {
+ lderr(store->ctx()) << "ERROR: sync status period=" << info.period
+ << " does not match period=" << existing
+ << " in history at realm epoch=" << info.realm_epoch << dendl;
+ return RGWPeriodHistory::Cursor{-EEXIST};
+ }
+ return cursor;
+ }
+
+ // read the period from rados or pull it from the master
+ RGWPeriod period;
+ int r = store->period_puller->pull(info.period, period);
+ if (r < 0) {
+ lderr(store->ctx()) << "ERROR: failed to read period id "
+ << info.period << ": " << cpp_strerror(r) << dendl;
+ return RGWPeriodHistory::Cursor{r};
+ }
+ // attach the period to our history
+ cursor = store->period_history->attach(std::move(period));
+ if (!cursor) {
+ r = cursor.get_error();
+ lderr(store->ctx()) << "ERROR: failed to read period history back to "
+ << info.period << ": " << cpp_strerror(r) << dendl;
+ }
+ return cursor;
+}
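+
+/* Sketch of the three get_period_at() outcomes (hypothetical inputs): an empty
+ * info.period yields an empty cursor with error=0 (nothing synced yet); a
+ * realm_epoch already present in period_history yields a cursor after checking
+ * that the stored period id matches; anything else pulls the period from the
+ * master and attach()es it, extending the local history back to info.period. */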
+
+int RGWRemoteMetaLog::run_sync()
+{
+ if (store->svc.zone->is_meta_master()) {
+ return 0;
+ }
+
+ int r = 0;
+
+ // get shard count and oldest log period from master
+ rgw_mdlog_info mdlog_info;
+ for (;;) {
+ if (going_down) {
+ ldpp_dout(dpp, 1) << __func__ << "(): going down" << dendl;
+ return 0;
+ }
+ r = read_log_info(&mdlog_info);
+ if (r == -EIO || r == -ENOENT) {
+ // keep retrying if master isn't alive or hasn't initialized the log
+      ldpp_dout(dpp, 10) << __func__ << "(): waiting for master..." << dendl;
+ backoff.backoff_sleep();
+ continue;
+ }
+ backoff.reset();
+ if (r < 0) {
+ lderr(store->ctx()) << "ERROR: fail to fetch master log info (r=" << r << ")" << dendl;
+ return r;
+ }
+ break;
+ }
+
+ rgw_meta_sync_status sync_status;
+ do {
+ if (going_down) {
+ ldpp_dout(dpp, 1) << __func__ << "(): going down" << dendl;
+ return 0;
+ }
+ r = run(new RGWReadSyncStatusCoroutine(&sync_env, &sync_status));
+ if (r < 0 && r != -ENOENT) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to fetch sync status r=" << r << dendl;
+ return r;
+ }
+
+ if (!mdlog_info.period.empty()) {
+ // restart sync if the remote has a period, but:
+ // a) our status does not, or
+ // b) our sync period comes before the remote's oldest log period
+ if (sync_status.sync_info.period.empty() ||
+ sync_status.sync_info.realm_epoch < mdlog_info.realm_epoch) {
+ sync_status.sync_info.state = rgw_meta_sync_info::StateInit;
+ string reason;
+ if (sync_status.sync_info.period.empty()) {
+ reason = "period is empty";
+ } else {
+ reason = SSTR("sync_info realm epoch is behind: " << sync_status.sync_info.realm_epoch << " < " << mdlog_info.realm_epoch);
+ }
+ tn->log(1, "initialize sync (reason: " + reason + ")");
+ ldpp_dout(dpp, 1) << "epoch=" << sync_status.sync_info.realm_epoch
+ << " in sync status comes before remote's oldest mdlog epoch="
+ << mdlog_info.realm_epoch << ", restarting sync" << dendl;
+ }
+ }
+
+ if (sync_status.sync_info.state == rgw_meta_sync_info::StateInit) {
+ ldpp_dout(dpp, 20) << __func__ << "(): init" << dendl;
+ sync_status.sync_info.num_shards = mdlog_info.num_shards;
+ auto cursor = store->period_history->get_current();
+ if (cursor) {
+ // run full sync, then start incremental from the current period/epoch
+ sync_status.sync_info.period = cursor.get_period().get_id();
+ sync_status.sync_info.realm_epoch = cursor.get_epoch();
+ }
+ r = run(new RGWInitSyncStatusCoroutine(&sync_env, sync_status.sync_info));
+ if (r == -EBUSY) {
+ backoff.backoff_sleep();
+ continue;
+ }
+ backoff.reset();
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to init sync status r=" << r << dendl;
+ return r;
+ }
+ }
+ } while (sync_status.sync_info.state == rgw_meta_sync_info::StateInit);
+
+ auto num_shards = sync_status.sync_info.num_shards;
+ if (num_shards != mdlog_info.num_shards) {
+ lderr(store->ctx()) << "ERROR: can't sync, mismatch between num shards, master num_shards=" << mdlog_info.num_shards << " local num_shards=" << num_shards << dendl;
+ return -EINVAL;
+ }
+
+ RGWPeriodHistory::Cursor cursor;
+ do {
+ r = run(new RGWReadSyncStatusCoroutine(&sync_env, &sync_status));
+ if (r < 0 && r != -ENOENT) {
+ tn->log(0, SSTR("ERROR: failed to fetch sync status r=" << r));
+ return r;
+ }
+
+ switch ((rgw_meta_sync_info::SyncState)sync_status.sync_info.state) {
+ case rgw_meta_sync_info::StateBuildingFullSyncMaps:
+ tn->log(20, "building full sync maps");
+ r = run(new RGWFetchAllMetaCR(&sync_env, num_shards, sync_status.sync_markers, tn));
+ if (r == -EBUSY || r == -EAGAIN) {
+ backoff.backoff_sleep();
+ continue;
+ }
+ backoff.reset();
+ if (r < 0) {
+ tn->log(0, SSTR("ERROR: failed to fetch all metadata keys (r=" << r << ")"));
+ return r;
+ }
+
+ sync_status.sync_info.state = rgw_meta_sync_info::StateSync;
+ r = store_sync_info(sync_status.sync_info);
+ if (r < 0) {
+ tn->log(0, SSTR("ERROR: failed to update sync status (r=" << r << ")"));
+ return r;
+ }
+ /* fall through */
+ case rgw_meta_sync_info::StateSync:
+ tn->log(20, "sync");
+ // find our position in the period history (if any)
+ cursor = get_period_at(store, sync_status.sync_info);
+ r = cursor.get_error();
+ if (r < 0) {
+ return r;
+ }
+ meta_sync_cr = new RGWMetaSyncCR(&sync_env, cursor, sync_status, tn);
+ r = run(meta_sync_cr);
+ if (r < 0) {
+ tn->log(0, "ERROR: failed to fetch all metadata keys");
+ return r;
+ }
+ break;
+ default:
+ tn->log(0, "ERROR: bad sync state!");
+ return -EIO;
+ }
+ } while (!going_down);
+
+ return 0;
+}
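+
+/* Summary of the loop above (as implemented): read_log_info() is retried until
+ * the master responds; StateInit writes a fresh status anchored at the current
+ * period; StateBuildingFullSyncMaps lists every metadata key via
+ * RGWFetchAllMetaCR and falls through; StateSync runs RGWMetaSyncCR period by
+ * period until going_down. -EBUSY/-EAGAIN trigger backoff_sleep() and a retry
+ * rather than an error return. */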
+
+void RGWRemoteMetaLog::wakeup(int shard_id)
+{
+ if (!meta_sync_cr) {
+ return;
+ }
+ meta_sync_cr->wakeup(shard_id);
+}
+
+int RGWCloneMetaLogCoroutine::operate()
+{
+ reenter(this) {
+ do {
+ yield {
+ ldpp_dout(sync_env->dpp, 20) << __func__ << ": shard_id=" << shard_id << ": init request" << dendl;
+ return state_init();
+ }
+ yield {
+ ldpp_dout(sync_env->dpp, 20) << __func__ << ": shard_id=" << shard_id << ": reading shard status" << dendl;
+ return state_read_shard_status();
+ }
+ yield {
+ ldpp_dout(sync_env->dpp, 20) << __func__ << ": shard_id=" << shard_id << ": reading shard status complete" << dendl;
+ return state_read_shard_status_complete();
+ }
+ yield {
+ ldpp_dout(sync_env->dpp, 20) << __func__ << ": shard_id=" << shard_id << ": sending rest request" << dendl;
+ return state_send_rest_request();
+ }
+ yield {
+ ldpp_dout(sync_env->dpp, 20) << __func__ << ": shard_id=" << shard_id << ": receiving rest response" << dendl;
+ return state_receive_rest_response();
+ }
+ yield {
+ ldpp_dout(sync_env->dpp, 20) << __func__ << ": shard_id=" << shard_id << ": storing mdlog entries" << dendl;
+ return state_store_mdlog_entries();
+ }
+ } while (truncated);
+ yield {
+ ldpp_dout(sync_env->dpp, 20) << __func__ << ": shard_id=" << shard_id << ": storing mdlog entries complete" << dendl;
+ return state_store_mdlog_entries_complete();
+ }
+ }
+
+ return 0;
+}
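+
+/* Note on the pattern above: with boost::asio's stackless coroutines, `yield`
+ * records the resume point before executing its statement, so each
+ * `return state_X()` hands that state's result back to the coroutine manager
+ * for this tick, and the next call to operate() re-enters just past the yield.
+ * The states that start async work (reading shard status, the REST request,
+ * storing entries) end in io_block(0); io_complete() from the callback or
+ * completion notifier is what wakes the stack for the next tick. */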
+
+int RGWCloneMetaLogCoroutine::state_init()
+{
+ data = rgw_mdlog_shard_data();
+
+ return 0;
+}
+
+int RGWCloneMetaLogCoroutine::state_read_shard_status()
+{
+ const bool add_ref = false; // default constructs with refs=1
+
+ completion.reset(new RGWMetadataLogInfoCompletion(
+ [this](int ret, const cls_log_header& header) {
+ if (ret < 0) {
+ if (ret != -ENOENT) {
+ ldpp_dout(sync_env->dpp, 1) << "ERROR: failed to read mdlog info with "
+ << cpp_strerror(ret) << dendl;
+ }
+ } else {
+ shard_info.marker = header.max_marker;
+ shard_info.last_update = header.max_time.to_real_time();
+ }
+ // wake up parent stack
+ io_complete();
+ }), add_ref);
+
+ int ret = mdlog->get_info_async(shard_id, completion.get());
+ if (ret < 0) {
+ ldpp_dout(sync_env->dpp, 0) << "ERROR: mdlog->get_info_async() returned ret=" << ret << dendl;
+ return set_cr_error(ret);
+ }
+
+ return io_block(0);
+}
+
+int RGWCloneMetaLogCoroutine::state_read_shard_status_complete()
+{
+ completion.reset();
+
+ ldpp_dout(sync_env->dpp, 20) << "shard_id=" << shard_id << " marker=" << shard_info.marker << " last_update=" << shard_info.last_update << dendl;
+
+ marker = shard_info.marker;
+
+ return 0;
+}
+
+int RGWCloneMetaLogCoroutine::state_send_rest_request()
+{
+ RGWRESTConn *conn = sync_env->conn;
+
+ char buf[32];
+ snprintf(buf, sizeof(buf), "%d", shard_id);
+
+ char max_entries_buf[32];
+ snprintf(max_entries_buf, sizeof(max_entries_buf), "%d", max_entries);
+
+ const char *marker_key = (marker.empty() ? "" : "marker");
+
+ rgw_http_param_pair pairs[] = { { "type", "metadata" },
+ { "id", buf },
+ { "period", period.c_str() },
+ { "max-entries", max_entries_buf },
+ { marker_key, marker.c_str() },
+ { NULL, NULL } };
+
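+  // Shape of the resulting request (hypothetical values):
+  //   GET /admin/log?type=metadata&id=5&period=<period_id>&max-entries=<max_entries>&marker=...
+  // When no marker has been stored yet, the pair's key and value are both
+  // empty strings, which the request builder is assumed to skip.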
+ http_op = new RGWRESTReadResource(conn, "/admin/log", pairs, NULL, sync_env->http_manager);
+
+ init_new_io(http_op);
+
+ int ret = http_op->aio_read();
+ if (ret < 0) {
+ ldpp_dout(sync_env->dpp, 0) << "ERROR: failed to fetch mdlog data" << dendl;
+ log_error() << "failed to send http operation: " << http_op->to_str() << " ret=" << ret << std::endl;
+ http_op->put();
+ http_op = NULL;
+ return set_cr_error(ret);
+ }
+
+ return io_block(0);
+}
+
+int RGWCloneMetaLogCoroutine::state_receive_rest_response()
+{
+ int ret = http_op->wait(&data);
+ if (ret < 0) {
+ error_stream << "http operation failed: " << http_op->to_str() << " status=" << http_op->get_http_status() << std::endl;
+ ldpp_dout(sync_env->dpp, 5) << "failed to wait for op, ret=" << ret << dendl;
+ http_op->put();
+ http_op = NULL;
+ return set_cr_error(ret);
+ }
+ http_op->put();
+ http_op = NULL;
+
+ ldpp_dout(sync_env->dpp, 20) << "remote mdlog, shard_id=" << shard_id << " num of shard entries: " << data.entries.size() << dendl;
+
+ truncated = ((int)data.entries.size() == max_entries);
+
+ if (data.entries.empty()) {
+ if (new_marker) {
+ *new_marker = marker;
+ }
+ return set_cr_done();
+ }
+
+ if (new_marker) {
+ *new_marker = data.entries.back().id;
+ }
+
+ return 0;
+}
+
+
+int RGWCloneMetaLogCoroutine::state_store_mdlog_entries()
+{
+ list<cls_log_entry> dest_entries;
+
+ vector<rgw_mdlog_entry>::iterator iter;
+ for (iter = data.entries.begin(); iter != data.entries.end(); ++iter) {
+ rgw_mdlog_entry& entry = *iter;
+ ldpp_dout(sync_env->dpp, 20) << "entry: name=" << entry.name << dendl;
+
+ cls_log_entry dest_entry;
+ dest_entry.id = entry.id;
+ dest_entry.section = entry.section;
+ dest_entry.name = entry.name;
+ dest_entry.timestamp = utime_t(entry.timestamp);
+
+ encode(entry.log_data, dest_entry.data);
+
+ dest_entries.push_back(dest_entry);
+
+ marker = entry.id;
+ }
+
+ RGWAioCompletionNotifier *cn = stack->create_completion_notifier();
+
+ int ret = mdlog->store_entries_in_shard(dest_entries, shard_id, cn->completion());
+ if (ret < 0) {
+ cn->put();
+ ldpp_dout(sync_env->dpp, 10) << "failed to store md log entries shard_id=" << shard_id << " ret=" << ret << dendl;
+ return set_cr_error(ret);
+ }
+ return io_block(0);
+}
+
+int RGWCloneMetaLogCoroutine::state_store_mdlog_entries_complete()
+{
+ return set_cr_done();
+}
+
+
+// TODO: move into rgw_sync_trim.cc
+#undef dout_prefix
+#define dout_prefix (*_dout << "meta trim: ")
+
+/// purge all log shards for the given mdlog
+class PurgeLogShardsCR : public RGWShardCollectCR {
+ RGWRados *const store;
+ const RGWMetadataLog* mdlog;
+ const int num_shards;
+ rgw_raw_obj obj;
+ int i{0};
+
+ static constexpr int max_concurrent = 16;
+
+ public:
+ PurgeLogShardsCR(RGWRados *store, const RGWMetadataLog* mdlog,
+ const rgw_pool& pool, int num_shards)
+ : RGWShardCollectCR(store->ctx(), max_concurrent),
+ store(store), mdlog(mdlog), num_shards(num_shards), obj(pool, "")
+ {}
+
+ bool spawn_next() override {
+ if (i == num_shards) {
+ return false;
+ }
+ mdlog->get_shard_oid(i++, obj.oid);
+ spawn(new RGWRadosRemoveCR(store, obj), false);
+ return true;
+ }
+};
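+
+/* Note: the base RGWShardCollectCR is assumed to call spawn_next() until it
+ * returns false, keeping at most max_concurrent (16) removals in flight, so a
+ * whole period's mdlog shard objects are deleted with bounded parallelism. */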
+
+using Cursor = RGWPeriodHistory::Cursor;
+
+/// purge mdlogs from the oldest up to (but not including) the given realm_epoch
+class PurgePeriodLogsCR : public RGWCoroutine {
+ RGWRados *const store;
+ RGWMetadataManager *const metadata;
+ RGWObjVersionTracker objv;
+ Cursor cursor;
+ epoch_t realm_epoch;
+ epoch_t *last_trim_epoch; //< update last trim on success
+
+ public:
+ PurgePeriodLogsCR(RGWRados *store, epoch_t realm_epoch, epoch_t *last_trim)
+ : RGWCoroutine(store->ctx()), store(store), metadata(store->meta_mgr),
+ realm_epoch(realm_epoch), last_trim_epoch(last_trim)
+ {}
+
+ int operate() override;
+};
+
+int PurgePeriodLogsCR::operate()
+{
+ reenter(this) {
+ // read our current oldest log period
+ yield call(metadata->read_oldest_log_period_cr(&cursor, &objv));
+ if (retcode < 0) {
+ return set_cr_error(retcode);
+ }
+ ceph_assert(cursor);
+ ldout(cct, 20) << "oldest log realm_epoch=" << cursor.get_epoch()
+ << " period=" << cursor.get_period().get_id() << dendl;
+
+ // trim -up to- the given realm_epoch
+ while (cursor.get_epoch() < realm_epoch) {
+ ldout(cct, 4) << "purging log shards for realm_epoch=" << cursor.get_epoch()
+ << " period=" << cursor.get_period().get_id() << dendl;
+ yield {
+ const auto mdlog = metadata->get_log(cursor.get_period().get_id());
+ const auto& pool = store->svc.zone->get_zone_params().log_pool;
+ auto num_shards = cct->_conf->rgw_md_log_max_shards;
+ call(new PurgeLogShardsCR(store, mdlog, pool, num_shards));
+ }
+ if (retcode < 0) {
+ ldout(cct, 1) << "failed to remove log shards: "
+ << cpp_strerror(retcode) << dendl;
+ return set_cr_error(retcode);
+ }
+ ldout(cct, 10) << "removed log shards for realm_epoch=" << cursor.get_epoch()
+ << " period=" << cursor.get_period().get_id() << dendl;
+
+ // update our mdlog history
+ yield call(metadata->trim_log_period_cr(cursor, &objv));
+ if (retcode == -ENOENT) {
+ // must have raced to update mdlog history. return success and allow the
+ // winner to continue purging
+ ldout(cct, 10) << "already removed log shards for realm_epoch=" << cursor.get_epoch()
+ << " period=" << cursor.get_period().get_id() << dendl;
+ return set_cr_done();
+ } else if (retcode < 0) {
+ ldout(cct, 1) << "failed to remove log shards for realm_epoch="
+ << cursor.get_epoch() << " period=" << cursor.get_period().get_id()
+ << " with: " << cpp_strerror(retcode) << dendl;
+ return set_cr_error(retcode);
+ }
+
+ if (*last_trim_epoch < cursor.get_epoch()) {
+ *last_trim_epoch = cursor.get_epoch();
+ }
+
+ ceph_assert(cursor.has_next()); // get_current() should always come after
+ cursor.next();
+ }
+ return set_cr_done();
+ }
+ return 0;
+}
+
+namespace {
+
+using connection_map = std::map<std::string, std::unique_ptr<RGWRESTConn>>;
+
+/// construct a RGWRESTConn for each zone in the realm
+template <typename Zonegroups>
+connection_map make_peer_connections(RGWRados *store,
+ const Zonegroups& zonegroups)
+{
+ connection_map connections;
+ for (auto& g : zonegroups) {
+ for (auto& z : g.second.zones) {
+ std::unique_ptr<RGWRESTConn> conn{
+ new RGWRESTConn(store->ctx(), store->svc.zone, z.first, z.second.endpoints)};
+ connections.emplace(z.first, std::move(conn));
+ }
+ }
+ return connections;
+}
+
+/// return the marker that it's safe to trim up to
+const std::string& get_stable_marker(const rgw_meta_sync_marker& m)
+{
+ return m.state == m.FullSync ? m.next_step_marker : m.marker;
+}
+
+/// comparison operator for take_min_status()
+bool operator<(const rgw_meta_sync_marker& lhs, const rgw_meta_sync_marker& rhs)
+{
+ // sort by stable marker
+ return get_stable_marker(lhs) < get_stable_marker(rhs);
+}
+
+/// populate the status with the minimum stable marker of each shard for any
+/// peer whose realm_epoch matches the minimum realm_epoch in the input
+template <typename Iter>
+int take_min_status(CephContext *cct, Iter first, Iter last,
+ rgw_meta_sync_status *status)
+{
+ if (first == last) {
+ return -EINVAL;
+ }
+ const size_t num_shards = cct->_conf->rgw_md_log_max_shards;
+
+ status->sync_info.realm_epoch = std::numeric_limits<epoch_t>::max();
+ for (auto p = first; p != last; ++p) {
+ // validate peer's shard count
+ if (p->sync_markers.size() != num_shards) {
+ ldout(cct, 1) << "take_min_status got peer status with "
+ << p->sync_markers.size() << " shards, expected "
+ << num_shards << dendl;
+ return -EINVAL;
+ }
+ if (p->sync_info.realm_epoch < status->sync_info.realm_epoch) {
+ // earlier epoch, take its entire status
+ *status = std::move(*p);
+ } else if (p->sync_info.realm_epoch == status->sync_info.realm_epoch) {
+ // same epoch, take any earlier markers
+ auto m = status->sync_markers.begin();
+ for (auto& shard : p->sync_markers) {
+ if (shard.second < m->second) {
+ m->second = std::move(shard.second);
+ }
+ ++m;
+ }
+ }
+ }
+ return 0;
+}
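+
+/* Worked example (hypothetical values, 2 shards): peers A and B both at
+ * realm_epoch 5 with stable markers {"0007","0003"} and {"0005","0009"} yield
+ * min_status markers {"0005","0003"}; a third peer still at realm_epoch 4
+ * would replace the whole status instead, since trimming may never pass the
+ * slowest peer's epoch. */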
+
+struct TrimEnv {
+ const DoutPrefixProvider *dpp;
+ RGWRados *const store;
+ RGWHTTPManager *const http;
+ int num_shards;
+ const std::string& zone;
+ Cursor current; //< cursor to current period
+ epoch_t last_trim_epoch{0}; //< epoch of last mdlog that was purged
+
+ TrimEnv(const DoutPrefixProvider *dpp, RGWRados *store, RGWHTTPManager *http, int num_shards)
+ : dpp(dpp), store(store), http(http), num_shards(num_shards),
+ zone(store->svc.zone->get_zone_params().get_id()),
+ current(store->period_history->get_current())
+ {}
+};
+
+struct MasterTrimEnv : public TrimEnv {
+ connection_map connections; //< peer connections
+ std::vector<rgw_meta_sync_status> peer_status; //< sync status for each peer
+ /// last trim marker for each shard, only applies to current period's mdlog
+ std::vector<std::string> last_trim_markers;
+
+ MasterTrimEnv(const DoutPrefixProvider *dpp, RGWRados *store, RGWHTTPManager *http, int num_shards)
+ : TrimEnv(dpp, store, http, num_shards),
+ last_trim_markers(num_shards)
+ {
+ auto& period = current.get_period();
+ connections = make_peer_connections(store, period.get_map().zonegroups);
+ connections.erase(zone);
+ peer_status.resize(connections.size());
+ }
+};
+
+struct PeerTrimEnv : public TrimEnv {
+ /// last trim timestamp for each shard, only applies to current period's mdlog
+ std::vector<ceph::real_time> last_trim_timestamps;
+
+ PeerTrimEnv(const DoutPrefixProvider *dpp, RGWRados *store, RGWHTTPManager *http, int num_shards)
+ : TrimEnv(dpp, store, http, num_shards),
+ last_trim_timestamps(num_shards)
+ {}
+
+ void set_num_shards(int num_shards) {
+ this->num_shards = num_shards;
+ last_trim_timestamps.resize(num_shards);
+ }
+};
+
+} // anonymous namespace
+
+
+/// spawn a trim cr for each shard that needs it, while limiting the number
+/// of concurrent shards
+class MetaMasterTrimShardCollectCR : public RGWShardCollectCR {
+ private:
+ static constexpr int MAX_CONCURRENT_SHARDS = 16;
+
+ MasterTrimEnv& env;
+ RGWMetadataLog *mdlog;
+ int shard_id{0};
+ std::string oid;
+ const rgw_meta_sync_status& sync_status;
+
+ public:
+ MetaMasterTrimShardCollectCR(MasterTrimEnv& env, RGWMetadataLog *mdlog,
+ const rgw_meta_sync_status& sync_status)
+ : RGWShardCollectCR(env.store->ctx(), MAX_CONCURRENT_SHARDS),
+ env(env), mdlog(mdlog), sync_status(sync_status)
+ {}
+
+ bool spawn_next() override;
+};
+
+bool MetaMasterTrimShardCollectCR::spawn_next()
+{
+ while (shard_id < env.num_shards) {
+ auto m = sync_status.sync_markers.find(shard_id);
+ if (m == sync_status.sync_markers.end()) {
+ shard_id++;
+ continue;
+ }
+ auto& stable = get_stable_marker(m->second);
+ auto& last_trim = env.last_trim_markers[shard_id];
+
+ if (stable <= last_trim) {
+ // already trimmed
+ ldout(cct, 20) << "skipping log shard " << shard_id
+ << " at marker=" << stable
+ << " last_trim=" << last_trim
+ << " realm_epoch=" << sync_status.sync_info.realm_epoch << dendl;
+ shard_id++;
+ continue;
+ }
+
+ mdlog->get_shard_oid(shard_id, oid);
+
+ ldout(cct, 10) << "trimming log shard " << shard_id
+ << " at marker=" << stable
+ << " last_trim=" << last_trim
+ << " realm_epoch=" << sync_status.sync_info.realm_epoch << dendl;
+ spawn(new RGWSyncLogTrimCR(env.store, oid, stable, &last_trim), false);
+ shard_id++;
+ return true;
+ }
+ return false;
+}
+
+/// spawn rest requests to read each peer's sync status
+class MetaMasterStatusCollectCR : public RGWShardCollectCR {
+ static constexpr int MAX_CONCURRENT_SHARDS = 16;
+
+ MasterTrimEnv& env;
+ connection_map::iterator c;
+ std::vector<rgw_meta_sync_status>::iterator s;
+ public:
+ explicit MetaMasterStatusCollectCR(MasterTrimEnv& env)
+ : RGWShardCollectCR(env.store->ctx(), MAX_CONCURRENT_SHARDS),
+ env(env), c(env.connections.begin()), s(env.peer_status.begin())
+ {}
+
+ bool spawn_next() override {
+ if (c == env.connections.end()) {
+ return false;
+ }
+ static rgw_http_param_pair params[] = {
+ { "type", "metadata" },
+ { "status", nullptr },
+ { nullptr, nullptr }
+ };
+
+ ldout(cct, 20) << "query sync status from " << c->first << dendl;
+ auto conn = c->second.get();
+ using StatusCR = RGWReadRESTResourceCR<rgw_meta_sync_status>;
+ spawn(new StatusCR(cct, conn, env.http, "/admin/log/", params, &*s),
+ false);
+ ++c;
+ ++s;
+ return true;
+ }
+};
+
+class MetaMasterTrimCR : public RGWCoroutine {
+ MasterTrimEnv& env;
+ rgw_meta_sync_status min_status; //< minimum sync status of all peers
+ int ret{0};
+
+ public:
+ explicit MetaMasterTrimCR(MasterTrimEnv& env)
+ : RGWCoroutine(env.store->ctx()), env(env)
+ {}
+
+ int operate() override;
+};
+
+int MetaMasterTrimCR::operate()
+{
+ reenter(this) {
+ // TODO: detect this and fail before we spawn the trim thread?
+ if (env.connections.empty()) {
+ ldout(cct, 4) << "no peers, exiting" << dendl;
+ return set_cr_done();
+ }
+
+ ldout(cct, 10) << "fetching sync status for zone " << env.zone << dendl;
+ // query mdlog sync status from peers
+ yield call(new MetaMasterStatusCollectCR(env));
+
+ // must get a successful reply from all peers to consider trimming
+    // (the collect CR reports failure through retcode; the member ret is only
+    // set later by take_min_status())
+    if (retcode < 0) {
+      ldout(cct, 4) << "failed to fetch sync status from all peers" << dendl;
+      return set_cr_error(retcode);
+    }
+
+ // determine the minimum epoch and markers
+ ret = take_min_status(env.store->ctx(), env.peer_status.begin(),
+ env.peer_status.end(), &min_status);
+ if (ret < 0) {
+ ldout(cct, 4) << "failed to calculate min sync status from peers" << dendl;
+ return set_cr_error(ret);
+ }
+ yield {
+ auto store = env.store;
+ auto epoch = min_status.sync_info.realm_epoch;
+ ldout(cct, 4) << "realm epoch min=" << epoch
+ << " current=" << env.current.get_epoch()<< dendl;
+ if (epoch > env.last_trim_epoch + 1) {
+ // delete any prior mdlog periods
+ spawn(new PurgePeriodLogsCR(store, epoch, &env.last_trim_epoch), true);
+ } else {
+ ldout(cct, 10) << "mdlogs already purged up to realm_epoch "
+ << env.last_trim_epoch << dendl;
+ }
+
+ // if realm_epoch == current, trim mdlog based on markers
+ if (epoch == env.current.get_epoch()) {
+ auto mdlog = store->meta_mgr->get_log(env.current.get_period().get_id());
+ spawn(new MetaMasterTrimShardCollectCR(env, mdlog, min_status), true);
+ }
+ }
+ // ignore any errors during purge/trim because we want to hold the lock open
+ return set_cr_done();
+ }
+ return 0;
+}
+
+
+/// read the first entry of the master's mdlog shard and trim to that position
+class MetaPeerTrimShardCR : public RGWCoroutine {
+ RGWMetaSyncEnv& env;
+ RGWMetadataLog *mdlog;
+ const std::string& period_id;
+ const int shard_id;
+ RGWMetadataLogInfo info;
+ ceph::real_time stable; //< safe timestamp to trim, according to master
+ ceph::real_time *last_trim; //< last trimmed timestamp, updated on trim
+ rgw_mdlog_shard_data result; //< result from master's mdlog listing
+
+ public:
+ MetaPeerTrimShardCR(RGWMetaSyncEnv& env, RGWMetadataLog *mdlog,
+ const std::string& period_id, int shard_id,
+ ceph::real_time *last_trim)
+ : RGWCoroutine(env.store->ctx()), env(env), mdlog(mdlog),
+ period_id(period_id), shard_id(shard_id), last_trim(last_trim)
+ {}
+
+ int operate() override;
+};
+
+int MetaPeerTrimShardCR::operate()
+{
+ reenter(this) {
+ // query master's first mdlog entry for this shard
+ yield call(new RGWListRemoteMDLogShardCR(&env, period_id, shard_id,
+ "", 1, &result));
+ if (retcode < 0) {
+ ldpp_dout(env.dpp, 5) << "failed to read first entry from master's mdlog shard "
+ << shard_id << " for period " << period_id
+ << ": " << cpp_strerror(retcode) << dendl;
+ return set_cr_error(retcode);
+ }
+ if (result.entries.empty()) {
+ // if there are no mdlog entries, we don't have a timestamp to compare. we
+ // can't just trim everything, because there could be racing updates since
+ // this empty reply. query the mdlog shard info to read its max timestamp,
+ // then retry the listing to make sure it's still empty before trimming to
+ // that
+ ldpp_dout(env.dpp, 10) << "empty master mdlog shard " << shard_id
+ << ", reading last timestamp from shard info" << dendl;
+ // read the mdlog shard info for the last timestamp
+ using ShardInfoCR = RGWReadRemoteMDLogShardInfoCR;
+ yield call(new ShardInfoCR(&env, period_id, shard_id, &info));
+ if (retcode < 0) {
+ ldpp_dout(env.dpp, 5) << "failed to read info from master's mdlog shard "
+ << shard_id << " for period " << period_id
+ << ": " << cpp_strerror(retcode) << dendl;
+ return set_cr_error(retcode);
+ }
+ if (ceph::real_clock::is_zero(info.last_update)) {
+ return set_cr_done(); // nothing to trim
+ }
+ ldpp_dout(env.dpp, 10) << "got mdlog shard info with last update="
+ << info.last_update << dendl;
+ // re-read the master's first mdlog entry to make sure it hasn't changed
+ yield call(new RGWListRemoteMDLogShardCR(&env, period_id, shard_id,
+ "", 1, &result));
+ if (retcode < 0) {
+ ldpp_dout(env.dpp, 5) << "failed to read first entry from master's mdlog shard "
+ << shard_id << " for period " << period_id
+ << ": " << cpp_strerror(retcode) << dendl;
+ return set_cr_error(retcode);
+ }
+ // if the mdlog is still empty, trim to max marker
+ if (result.entries.empty()) {
+ stable = info.last_update;
+ } else {
+ stable = result.entries.front().timestamp;
+
+ // can only trim -up to- master's first timestamp, so subtract a second.
+ // (this is why we use timestamps instead of markers for the peers)
+ stable -= std::chrono::seconds(1);
+ }
+ } else {
+ stable = result.entries.front().timestamp;
+ stable -= std::chrono::seconds(1);
+ }
+
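+    // e.g. (hypothetical): if the master's first retained entry is stamped
+    // 12:00:05, `stable` becomes 12:00:04, so this peer trims everything at
+    // or before 12:00:04 and can never pass the master's first entry.
+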
+ if (stable <= *last_trim) {
+ ldpp_dout(env.dpp, 10) << "skipping log shard " << shard_id
+ << " at timestamp=" << stable
+ << " last_trim=" << *last_trim << dendl;
+ return set_cr_done();
+ }
+
+ ldpp_dout(env.dpp, 10) << "trimming log shard " << shard_id
+ << " at timestamp=" << stable
+ << " last_trim=" << *last_trim << dendl;
+ yield {
+ std::string oid;
+ mdlog->get_shard_oid(shard_id, oid);
+ call(new RGWRadosTimelogTrimCR(env.store, oid, real_time{}, stable, "", ""));
+ }
+ if (retcode < 0 && retcode != -ENODATA) {
+ ldpp_dout(env.dpp, 1) << "failed to trim mdlog shard " << shard_id
+ << ": " << cpp_strerror(retcode) << dendl;
+ return set_cr_error(retcode);
+ }
+ *last_trim = stable;
+ return set_cr_done();
+ }
+ return 0;
+}
+
+class MetaPeerTrimShardCollectCR : public RGWShardCollectCR {
+ static constexpr int MAX_CONCURRENT_SHARDS = 16;
+
+ PeerTrimEnv& env;
+ RGWMetadataLog *mdlog;
+ const std::string& period_id;
+ RGWMetaSyncEnv meta_env; //< for RGWListRemoteMDLogShardCR
+ int shard_id{0};
+
+ public:
+ MetaPeerTrimShardCollectCR(PeerTrimEnv& env, RGWMetadataLog *mdlog)
+ : RGWShardCollectCR(env.store->ctx(), MAX_CONCURRENT_SHARDS),
+ env(env), mdlog(mdlog), period_id(env.current.get_period().get_id())
+ {
+ meta_env.init(env.dpp, cct, env.store, env.store->svc.zone->get_master_conn(),
+ env.store->get_async_rados(), env.http, nullptr,
+ env.store->get_sync_tracer());
+ }
+
+ bool spawn_next() override;
+};
+
+bool MetaPeerTrimShardCollectCR::spawn_next()
+{
+ if (shard_id >= env.num_shards) {
+ return false;
+ }
+ auto& last_trim = env.last_trim_timestamps[shard_id];
+ spawn(new MetaPeerTrimShardCR(meta_env, mdlog, period_id, shard_id, &last_trim),
+ false);
+ shard_id++;
+ return true;
+}
+
+class MetaPeerTrimCR : public RGWCoroutine {
+ PeerTrimEnv& env;
+ rgw_mdlog_info mdlog_info; //< master's mdlog info
+
+ public:
+ explicit MetaPeerTrimCR(PeerTrimEnv& env) : RGWCoroutine(env.store->ctx()), env(env) {}
+
+ int operate() override;
+};
+
+int MetaPeerTrimCR::operate()
+{
+ reenter(this) {
+ ldout(cct, 10) << "fetching master mdlog info" << dendl;
+ yield {
+ // query mdlog_info from master for oldest_log_period
+ rgw_http_param_pair params[] = {
+ { "type", "metadata" },
+ { nullptr, nullptr }
+ };
+
+ using LogInfoCR = RGWReadRESTResourceCR<rgw_mdlog_info>;
+ call(new LogInfoCR(cct, env.store->svc.zone->get_master_conn(), env.http,
+ "/admin/log/", params, &mdlog_info));
+ }
+ if (retcode < 0) {
+ ldout(cct, 4) << "failed to read mdlog info from master" << dendl;
+ return set_cr_error(retcode);
+ }
+    // adopt the master's shard count in place of the locally configured one
+ env.set_num_shards(mdlog_info.num_shards);
+
+ if (mdlog_info.realm_epoch > env.last_trim_epoch + 1) {
+ // delete any prior mdlog periods
+ yield call(new PurgePeriodLogsCR(env.store, mdlog_info.realm_epoch,
+ &env.last_trim_epoch));
+ } else {
+ ldout(cct, 10) << "mdlogs already purged through realm_epoch "
+ << env.last_trim_epoch << dendl;
+ }
+
+ // if realm_epoch == current, trim mdlog based on master's markers
+ if (mdlog_info.realm_epoch == env.current.get_epoch()) {
+ yield {
+ auto meta_mgr = env.store->meta_mgr;
+ auto mdlog = meta_mgr->get_log(env.current.get_period().get_id());
+ call(new MetaPeerTrimShardCollectCR(env, mdlog));
+ // ignore any errors during purge/trim because we want to hold the lock open
+ }
+ }
+ return set_cr_done();
+ }
+ return 0;
+}
+
+class MetaTrimPollCR : public RGWCoroutine {
+ RGWRados *const store;
+ const utime_t interval; //< polling interval
+ const rgw_raw_obj obj;
+ const std::string name{"meta_trim"}; //< lock name
+ const std::string cookie;
+
+ protected:
+ /// allocate the coroutine to run within the lease
+ virtual RGWCoroutine* alloc_cr() = 0;
+
+ public:
+ MetaTrimPollCR(RGWRados *store, utime_t interval)
+ : RGWCoroutine(store->ctx()), store(store), interval(interval),
+ obj(store->svc.zone->get_zone_params().log_pool, RGWMetadataLogHistory::oid),
+ cookie(RGWSimpleRadosLockCR::gen_random_cookie(cct))
+ {}
+
+ int operate() override;
+};
+
+int MetaTrimPollCR::operate()
+{
+ reenter(this) {
+ for (;;) {
+ set_status("sleeping");
+ wait(interval);
+
+ // prevent others from trimming for our entire wait interval
+ set_status("acquiring trim lock");
+ yield call(new RGWSimpleRadosLockCR(store->get_async_rados(), store,
+ obj, name, cookie, interval.sec()));
+ if (retcode < 0) {
+ ldout(cct, 4) << "failed to lock: " << cpp_strerror(retcode) << dendl;
+ continue;
+ }
+
+ set_status("trimming");
+ yield call(alloc_cr());
+
+ if (retcode < 0) {
+ // on errors, unlock so other gateways can try
+ set_status("unlocking");
+ yield call(new RGWSimpleRadosUnlockCR(store->get_async_rados(), store,
+ obj, name, cookie));
+ }
+ }
+ }
+ return 0;
+}
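+
+/* Sketch of the cycle above: every `interval` the gateways race to take the
+ * `meta_trim` lock on the mdlog-history object for the full interval; the
+ * winner trims while the losers' lock calls fail and they go back to sleep.
+ * On success the lock is deliberately left to expire, so no one re-trims
+ * within the same interval; on error it is released early so another gateway
+ * can try sooner. */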
+
+class MetaMasterTrimPollCR : public MetaTrimPollCR {
+ MasterTrimEnv env; //< trim state to share between calls
+ RGWCoroutine* alloc_cr() override {
+ return new MetaMasterTrimCR(env);
+ }
+ public:
+ MetaMasterTrimPollCR(const DoutPrefixProvider *dpp, RGWRados *store, RGWHTTPManager *http,
+ int num_shards, utime_t interval)
+ : MetaTrimPollCR(store, interval),
+ env(dpp, store, http, num_shards)
+ {}
+};
+
+class MetaPeerTrimPollCR : public MetaTrimPollCR {
+ PeerTrimEnv env; //< trim state to share between calls
+ RGWCoroutine* alloc_cr() override {
+ return new MetaPeerTrimCR(env);
+ }
+ public:
+ MetaPeerTrimPollCR(const DoutPrefixProvider *dpp, RGWRados *store, RGWHTTPManager *http,
+ int num_shards, utime_t interval)
+ : MetaTrimPollCR(store, interval),
+ env(dpp, store, http, num_shards)
+ {}
+};
+
+RGWCoroutine* create_meta_log_trim_cr(const DoutPrefixProvider *dpp, RGWRados *store, RGWHTTPManager *http,
+ int num_shards, utime_t interval)
+{
+ if (store->svc.zone->is_meta_master()) {
+ return new MetaMasterTrimPollCR(dpp, store, http, num_shards, interval);
+ }
+ return new MetaPeerTrimPollCR(dpp, store, http, num_shards, interval);
+}
+
+
+struct MetaMasterAdminTrimCR : private MasterTrimEnv, public MetaMasterTrimCR {
+ MetaMasterAdminTrimCR(const DoutPrefixProvider *dpp, RGWRados *store, RGWHTTPManager *http, int num_shards)
+ : MasterTrimEnv(dpp, store, http, num_shards),
+ MetaMasterTrimCR(*static_cast<MasterTrimEnv*>(this))
+ {}
+};
+
+struct MetaPeerAdminTrimCR : private PeerTrimEnv, public MetaPeerTrimCR {
+ MetaPeerAdminTrimCR(const DoutPrefixProvider *dpp, RGWRados *store, RGWHTTPManager *http, int num_shards)
+ : PeerTrimEnv(dpp, store, http, num_shards),
+ MetaPeerTrimCR(*static_cast<PeerTrimEnv*>(this))
+ {}
+};
+
+RGWCoroutine* create_admin_meta_log_trim_cr(const DoutPrefixProvider *dpp, RGWRados *store,
+ RGWHTTPManager *http,
+ int num_shards)
+{
+ if (store->svc.zone->is_meta_master()) {
+ return new MetaMasterAdminTrimCR(dpp, store, http, num_shards);
+ }
+ return new MetaPeerAdminTrimCR(dpp, store, http, num_shards);
+}