Diffstat (limited to 'src/rgw/rgw_reshard.h')
-rw-r--r--  src/rgw/rgw_reshard.h  286
1 file changed, 286 insertions, 0 deletions
diff --git a/src/rgw/rgw_reshard.h b/src/rgw/rgw_reshard.h
new file mode 100644
index 000000000..ecb18690f
--- /dev/null
+++ b/src/rgw/rgw_reshard.h
@@ -0,0 +1,286 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#ifndef RGW_RESHARD_H
+#define RGW_RESHARD_H
+
+#include <vector>
+#include <initializer_list>
+#include <functional>
+#include <iterator>
+#include <algorithm>
+
+#include <boost/intrusive/list.hpp>
+#include <boost/asio/basic_waitable_timer.hpp>
+
+#include "include/common_fwd.h"
+#include "include/rados/librados.hpp"
+#include "common/ceph_time.h"
+#include "common/async/yield_context.h"
+#include "cls/rgw/cls_rgw_types.h"
+#include "cls/lock/cls_lock_client.h"
+
+#include "rgw_common.h"
+
+
+class RGWReshard;
+namespace rgw { namespace sal {
+ class RGWRadosStore;
+} }
+
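+// Serializes resharding of a single bucket: wraps a rados::cls::lock::Lock
+// held on a per-bucket reshard object and tracks when that lock should be
+// renewed. With _ephemeral set, the lock object is only expected to exist
+// for the lifetime of the lock.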
+class RGWBucketReshardLock {
+ using Clock = ceph::coarse_mono_clock;
+
+ rgw::sal::RGWRadosStore* store;
+ const std::string lock_oid;
+ const bool ephemeral;
+ rados::cls::lock::Lock internal_lock;
+ std::chrono::seconds duration;
+
+ Clock::time_point start_time;
+ Clock::time_point renew_thresh;
+
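+  // record when the lock was (re)acquired and schedule renewal at the
+  // midpoint of the lock duration, so renewal happens well before expiry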
+ void reset_time(const Clock::time_point& now) {
+ start_time = now;
+ renew_thresh = start_time + duration / 2;
+ }
+
+public:
+ RGWBucketReshardLock(rgw::sal::RGWRadosStore* _store,
+ const std::string& reshard_lock_oid,
+ bool _ephemeral);
+ RGWBucketReshardLock(rgw::sal::RGWRadosStore* _store,
+ const RGWBucketInfo& bucket_info,
+ bool _ephemeral) :
+ RGWBucketReshardLock(_store, bucket_info.bucket.get_key(':'), _ephemeral)
+ {}
+
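+  // typical sequence (illustrative): lock(), then renew(now) whenever
+  // should_renew(now) becomes true, and finally unlock(); lock() and
+  // renew() return 0 on success or a negative error code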
+ int lock();
+ void unlock();
+ int renew(const Clock::time_point&);
+
+ bool should_renew(const Clock::time_point& now) const {
+ return now >= renew_thresh;
+ }
+}; // class RGWBucketReshardLock
+
+class RGWBucketReshard {
+public:
+
+ friend class RGWReshard;
+
+ using Clock = ceph::coarse_mono_clock;
+
+private:
+
+ rgw::sal::RGWRadosStore *store;
+ RGWBucketInfo bucket_info;
+ std::map<string, bufferlist> bucket_attrs;
+
+ RGWBucketReshardLock reshard_lock;
+ RGWBucketReshardLock* outer_reshard_lock;
+
+  // using an initializer_list as an array in contiguous memory
+  // allocated all at once
+ static const std::initializer_list<uint16_t> reshard_primes;
+
+ int create_new_bucket_instance(int new_num_shards,
+ RGWBucketInfo& new_bucket_info,
+ const DoutPrefixProvider *dpp);
+ int do_reshard(int num_shards,
+ RGWBucketInfo& new_bucket_info,
+ int max_entries,
+ bool verbose,
+ ostream *os,
+ Formatter *formatter,
+ const DoutPrefixProvider *dpp);
+public:
+
+ // pass nullptr for the final parameter if no outer reshard lock to
+ // manage
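+  //
+  // example construction and use (hypothetical caller, error handling
+  // elided):
+  //   RGWBucketReshard br(store, bucket_info, bucket_attrs, nullptr);
+  //   int ret = br.execute(new_num_shards, max_op_entries, dpp);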
+ RGWBucketReshard(rgw::sal::RGWRadosStore *_store,
+ const RGWBucketInfo& _bucket_info,
+ const std::map<string, bufferlist>& _bucket_attrs,
+ RGWBucketReshardLock* _outer_reshard_lock);
+ int execute(int num_shards, int max_op_entries,
+ const DoutPrefixProvider *dpp,
+ bool verbose = false, ostream *out = nullptr,
+ Formatter *formatter = nullptr,
+ RGWReshard *reshard_log = nullptr);
+ int get_status(const DoutPrefixProvider *dpp, std::list<cls_rgw_bucket_instance_entry> *status);
+ int cancel(const DoutPrefixProvider *dpp);
+ static int clear_resharding(const DoutPrefixProvider *dpp,
+ rgw::sal::RGWRadosStore* store,
+ const RGWBucketInfo& bucket_info);
+ int clear_resharding(const DoutPrefixProvider *dpp) {
+ return clear_resharding(dpp, store, bucket_info);
+ }
+ static int clear_index_shard_reshard_status(const DoutPrefixProvider *dpp,
+ rgw::sal::RGWRadosStore* store,
+ const RGWBucketInfo& bucket_info);
+ int clear_index_shard_reshard_status(const DoutPrefixProvider *dpp) {
+ return clear_index_shard_reshard_status(dpp, store, bucket_info);
+ }
+ static int set_resharding_status(const DoutPrefixProvider *dpp,
+ rgw::sal::RGWRadosStore* store,
+ const RGWBucketInfo& bucket_info,
+ const string& new_instance_id,
+ int32_t num_shards,
+ cls_rgw_reshard_status status);
+ int set_resharding_status(const DoutPrefixProvider *dpp, const string& new_instance_id,
+ int32_t num_shards,
+ cls_rgw_reshard_status status) {
+ return set_resharding_status(dpp, store, bucket_info,
+ new_instance_id, num_shards, status);
+ }
+
+ static uint32_t get_max_prime_shards() {
+ return *std::crbegin(reshard_primes);
+ }
+
+ // returns the prime in our list less than or equal to the
+ // parameter; the lowest value that can be returned is 1
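+  // (for example, if reshard_primes contained ... 97, 101 ... -- values
+  // used only for illustration -- a request of 100 would return 97 and a
+  // request of 101 would return 101)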
+ static uint32_t get_prime_shards_less_or_equal(uint32_t requested_shards) {
+ auto it = std::upper_bound(reshard_primes.begin(), reshard_primes.end(),
+ requested_shards);
+ if (it == reshard_primes.begin()) {
+ return 1;
+ } else {
+ return *(--it);
+ }
+ }
+
+ // returns the prime in our list greater than or equal to the
+ // parameter; if we do not have such a prime, 0 is returned
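+  // (with the same illustrative values, a request of 100 would return 101,
+  // and a request larger than get_max_prime_shards() returns 0)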
+ static uint32_t get_prime_shards_greater_or_equal(
+ uint32_t requested_shards)
+ {
+ auto it = std::lower_bound(reshard_primes.begin(), reshard_primes.end(),
+ requested_shards);
+ if (it == reshard_primes.end()) {
+ return 0;
+ } else {
+ return *it;
+ }
+ }
+
+  // returns a preferred number of shards, given a suggested (calculated)
+  // number of shards, based on max_dynamic_shards and the list of prime
+  // values
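+  // (worked example with illustrative numbers: if suggested_shards is 100
+  // and a prime such as 101 is the smallest listed prime >= 100, 101 is
+  // preferred; if that exceeds the allowed maximum, the result is capped
+  // at absolute_max)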
+ static uint32_t get_preferred_shards(uint32_t suggested_shards,
+ uint32_t max_dynamic_shards) {
+
+ // use a prime if max is within our prime range, otherwise use
+ // specified max
+ const uint32_t absolute_max =
+ max_dynamic_shards >= get_max_prime_shards() ?
+ max_dynamic_shards :
+ get_prime_shards_less_or_equal(max_dynamic_shards);
+
+    // if we can use a prime number, use it, otherwise use the suggested
+    // value; note that get_prime_shards_greater_or_equal returns 0 when
+    // there is no prime in our list >= suggested_shards
+ const uint32_t prime_ish_num_shards =
+ std::max(get_prime_shards_greater_or_equal(suggested_shards),
+ suggested_shards);
+
+ // dynamic sharding cannot reshard more than defined maximum
+ const uint32_t final_num_shards =
+ std::min(prime_ish_num_shards, absolute_max);
+
+ return final_num_shards;
+ }
+}; // RGWBucketReshard
+
+
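+// Manages the cluster-wide reshard log: buckets scheduled for resharding are
+// recorded as cls_rgw_reshard_entry objects spread across num_logshards log
+// shard objects, which the ReshardWorker thread processes in the background.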
+class RGWReshard {
+public:
+ using Clock = ceph::coarse_mono_clock;
+
+private:
+ rgw::sal::RGWRadosStore *store;
+ string lock_name;
+ rados::cls::lock::Lock instance_lock;
+ int num_logshards;
+
+ bool verbose;
+ ostream *out;
+ Formatter *formatter;
+
+ void get_logshard_oid(int shard_num, string *shard);
+protected:
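+  // background thread that wakes up periodically and calls
+  // process_all_logshards() until stop() is requested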
+ class ReshardWorker : public Thread, public DoutPrefixProvider {
+ CephContext *cct;
+ RGWReshard *reshard;
+ ceph::mutex lock = ceph::make_mutex("ReshardWorker");
+ ceph::condition_variable cond;
+
+ public:
+ ReshardWorker(CephContext * const _cct,
+ RGWReshard * const _reshard)
+ : cct(_cct),
+ reshard(_reshard) {}
+
+ void *entry() override;
+ void stop();
+
+ CephContext *get_cct() const override;
+ unsigned get_subsys() const;
+ std::ostream& gen_prefix(std::ostream& out) const;
+ };
+
+ ReshardWorker *worker = nullptr;
+ std::atomic<bool> down_flag = { false };
+
+ string get_logshard_key(const string& tenant, const string& bucket_name);
+ void get_bucket_logshard_oid(const string& tenant, const string& bucket_name, string *oid);
+
+public:
+ RGWReshard(rgw::sal::RGWRadosStore* _store, bool _verbose = false, ostream *_out = nullptr, Formatter *_formatter = nullptr);
+ int add(const DoutPrefixProvider *dpp, cls_rgw_reshard_entry& entry);
+ int update(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const RGWBucketInfo& new_bucket_info);
+ int get(cls_rgw_reshard_entry& entry);
+ int remove(const DoutPrefixProvider *dpp, cls_rgw_reshard_entry& entry);
+ int list(int logshard_num, string& marker, uint32_t max, std::list<cls_rgw_reshard_entry>& entries, bool *is_truncated);
+ int clear_bucket_resharding(const string& bucket_instance_oid, cls_rgw_reshard_entry& entry);
+
+ /* reshard thread */
+ int process_single_logshard(int logshard_num, const DoutPrefixProvider *dpp);
+ int process_all_logshards(const DoutPrefixProvider *dpp);
+ bool going_down();
+ void start_processor();
+ void stop_processor();
+};
+
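+// Lets callers that encounter a bucket mid-reshard back off and retry:
+// wait() blocks the calling thread (or suspends the coroutine, via
+// optional_yield) for up to `duration`, and stop() wakes all waiters at
+// shutdown.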
+class RGWReshardWait {
+ public:
+ // the blocking wait uses std::condition_variable::wait_for(), which uses the
+ // std::chrono::steady_clock. use that for the async waits as well
+ using Clock = std::chrono::steady_clock;
+ private:
+ const ceph::timespan duration;
+ ceph::mutex mutex = ceph::make_mutex("RGWReshardWait::lock");
+ ceph::condition_variable cond;
+
+ struct Waiter : boost::intrusive::list_base_hook<> {
+ using Executor = boost::asio::io_context::executor_type;
+ using Timer = boost::asio::basic_waitable_timer<Clock,
+ boost::asio::wait_traits<Clock>, Executor>;
+ Timer timer;
+ explicit Waiter(boost::asio::io_context& ioc) : timer(ioc) {}
+ };
+ boost::intrusive::list<Waiter> waiters;
+
+ bool going_down{false};
+
+public:
+ RGWReshardWait(ceph::timespan duration = std::chrono::seconds(5))
+ : duration(duration) {}
+ ~RGWReshardWait() {
+ ceph_assert(going_down);
+ }
+ int wait(optional_yield y);
+ // unblock any threads waiting on reshard
+ void stop();
+};
+
+#endif