summaryrefslogtreecommitdiffstats
path: root/src/librbd/io
diff options
context:
space:
mode:
Diffstat (limited to 'src/librbd/io')
-rw-r--r--src/librbd/io/AioCompletion.cc294
-rw-r--r--src/librbd/io/AioCompletion.h203
-rw-r--r--src/librbd/io/AsyncOperation.cc94
-rw-r--r--src/librbd/io/AsyncOperation.h52
-rw-r--r--src/librbd/io/CopyupRequest.cc773
-rw-r--r--src/librbd/io/CopyupRequest.h145
-rw-r--r--src/librbd/io/Dispatcher.h252
-rw-r--r--src/librbd/io/DispatcherInterface.h37
-rw-r--r--src/librbd/io/FlushTracker.cc126
-rw-r--r--src/librbd/io/FlushTracker.h61
-rw-r--r--src/librbd/io/ImageDispatch.cc200
-rw-r--r--src/librbd/io/ImageDispatch.h95
-rw-r--r--src/librbd/io/ImageDispatchInterface.h87
-rw-r--r--src/librbd/io/ImageDispatchSpec.cc54
-rw-r--r--src/librbd/io/ImageDispatchSpec.h254
-rw-r--r--src/librbd/io/ImageDispatcher.cc324
-rw-r--r--src/librbd/io/ImageDispatcher.h77
-rw-r--r--src/librbd/io/ImageDispatcherInterface.h41
-rw-r--r--src/librbd/io/ImageRequest.cc909
-rw-r--r--src/librbd/io/ImageRequest.h377
-rw-r--r--src/librbd/io/IoOperations.cc101
-rw-r--r--src/librbd/io/IoOperations.h18
-rw-r--r--src/librbd/io/ObjectDispatch.cc161
-rw-r--r--src/librbd/io/ObjectDispatch.h115
-rw-r--r--src/librbd/io/ObjectDispatchInterface.h102
-rw-r--r--src/librbd/io/ObjectDispatchSpec.cc47
-rw-r--r--src/librbd/io/ObjectDispatchSpec.h295
-rw-r--r--src/librbd/io/ObjectDispatcher.cc208
-rw-r--r--src/librbd/io/ObjectDispatcher.h60
-rw-r--r--src/librbd/io/ObjectDispatcherInterface.h35
-rw-r--r--src/librbd/io/ObjectRequest.cc1073
-rw-r--r--src/librbd/io/ObjectRequest.h505
-rw-r--r--src/librbd/io/QosImageDispatch.cc328
-rw-r--r--src/librbd/io/QosImageDispatch.h135
-rw-r--r--src/librbd/io/QueueImageDispatch.cc154
-rw-r--r--src/librbd/io/QueueImageDispatch.h110
-rw-r--r--src/librbd/io/ReadResult.cc262
-rw-r--r--src/librbd/io/ReadResult.h129
-rw-r--r--src/librbd/io/RefreshImageDispatch.cc166
-rw-r--r--src/librbd/io/RefreshImageDispatch.h101
-rw-r--r--src/librbd/io/SimpleSchedulerObjectDispatch.cc565
-rw-r--r--src/librbd/io/SimpleSchedulerObjectDispatch.h227
-rw-r--r--src/librbd/io/TypeTraits.h20
-rw-r--r--src/librbd/io/Types.cc49
-rw-r--r--src/librbd/io/Types.h328
-rw-r--r--src/librbd/io/Utils.cc249
-rw-r--r--src/librbd/io/Utils.h83
-rw-r--r--src/librbd/io/WriteBlockImageDispatch.cc270
-rw-r--r--src/librbd/io/WriteBlockImageDispatch.h134
49 files changed, 10485 insertions, 0 deletions
diff --git a/src/librbd/io/AioCompletion.cc b/src/librbd/io/AioCompletion.cc
new file mode 100644
index 000000000..c04b80770
--- /dev/null
+++ b/src/librbd/io/AioCompletion.cc
@@ -0,0 +1,294 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/io/AioCompletion.h"
+#include <errno.h>
+
+#include "common/ceph_context.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "common/perf_counters.h"
+
+#include "librbd/AsioEngine.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/internal.h"
+#include "librbd/Journal.h"
+#include "librbd/Types.h"
+#include <boost/asio/dispatch.hpp>
+#include <boost/asio/post.hpp>
+
+#ifdef WITH_LTTNG
+#include "tracing/librbd.h"
+#else
+#define tracepoint(...)
+#endif
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::io::AioCompletion: " << this \
+ << " " << __func__ << ": "
+
+namespace librbd {
+namespace io {
+
+// Block the calling thread until this completion reaches
+// AIO_STATE_COMPLETE (signalled by notify_callbacks_complete()).
+// Always returns 0; the I/O result is fetched via get_return_value().
+int AioCompletion::wait_for_complete() {
+  tracepoint(librbd, aio_wait_for_complete_enter, this);
+  {
+    std::unique_lock<std::mutex> locker(lock);
+    // wait in a loop to tolerate spurious condvar wakeups
+    while (state != AIO_STATE_COMPLETE) {
+      cond.wait(locker);
+    }
+  }
+  tracepoint(librbd, aio_wait_for_complete_exit, 0);
+  return 0;
+}
+
+// Fold any latched error code into rval and, for successful reads,
+// assemble the scattered read buffers into the user-visible result.
+// Called once all outstanding sub-requests have completed.
+void AioCompletion::finalize() {
+  ceph_assert(ictx != nullptr);
+  CephContext *cct = ictx->cct;
+
+  // finalize any pending error results since we won't be
+  // atomically incrementing rval anymore
+  int err_r = error_rval;
+  if (err_r < 0) {
+    rval = err_r;
+  }
+
+  ssize_t r = rval;
+  ldout(cct, 20) << "r=" << r << dendl;
+  if (r >= 0 && aio_type == AIO_TYPE_READ) {
+    read_result.assemble_result(cct);
+  }
+}
+
+// Record per-op latency, transition to AIO_STATE_CALLBACK, and fire the
+// user callback / event socket / internal completion notifications.
+// For close (or failed open) the ImageCtx is already being torn down, so
+// it is detached first and no perf counters are touched.
+void AioCompletion::complete() {
+  ceph_assert(ictx != nullptr);
+
+  ssize_t r = rval;
+  if ((aio_type == AIO_TYPE_CLOSE) || (aio_type == AIO_TYPE_OPEN && r < 0)) {
+    // ImageCtx is unusable past this point; also force the in-thread
+    // callback path below by clearing external_callback
+    ictx = nullptr;
+    external_callback = false;
+  } else {
+    CephContext *cct = ictx->cct;
+
+    tracepoint(librbd, aio_complete_enter, this, r);
+    if (ictx->perfcounter != nullptr) {
+      ceph::timespan elapsed = coarse_mono_clock::now() - start_time;
+      switch (aio_type) {
+      case AIO_TYPE_GENERIC:
+      case AIO_TYPE_OPEN:
+        break;
+      case AIO_TYPE_READ:
+        ictx->perfcounter->tinc(l_librbd_rd_latency, elapsed); break;
+      case AIO_TYPE_WRITE:
+        ictx->perfcounter->tinc(l_librbd_wr_latency, elapsed); break;
+      case AIO_TYPE_DISCARD:
+        ictx->perfcounter->tinc(l_librbd_discard_latency, elapsed); break;
+      case AIO_TYPE_FLUSH:
+        ictx->perfcounter->tinc(l_librbd_flush_latency, elapsed); break;
+      case AIO_TYPE_WRITESAME:
+        ictx->perfcounter->tinc(l_librbd_ws_latency, elapsed); break;
+      case AIO_TYPE_COMPARE_AND_WRITE:
+        ictx->perfcounter->tinc(l_librbd_cmp_latency, elapsed); break;
+      default:
+        lderr(cct) << "completed invalid aio_type: " << aio_type << dendl;
+        break;
+      }
+    }
+  }
+
+  state = AIO_STATE_CALLBACK;
+  if (complete_cb) {
+    if (external_callback) {
+      // defer to the API strand so external callbacks never run concurrently
+      complete_external_callback();
+    } else {
+      complete_cb(rbd_comp, complete_arg);
+      complete_event_socket();
+      notify_callbacks_complete();
+    }
+  } else {
+    complete_event_socket();
+    notify_callbacks_complete();
+  }
+
+  tracepoint(librbd, aio_complete_exit);
+}
+
+// Bind this completion to an image and AIO type and record the start
+// timestamp used for latency perf counters. Idempotent: a second call
+// (ictx already set) is a no-op.
+void AioCompletion::init_time(ImageCtx *i, aio_type_t t) {
+  if (ictx == nullptr) {
+    ictx = i;
+    aio_type = t;
+    start_time = coarse_mono_clock::now();
+  }
+}
+
+// Register this request with the image's in-flight async-op tracking
+// (used to sequence flushes). Open/close are exempt since they manage
+// the ImageCtx lifecycle itself.
+void AioCompletion::start_op() {
+  ceph_assert(ictx != nullptr);
+
+  if (aio_type == AIO_TYPE_OPEN || aio_type == AIO_TYPE_CLOSE) {
+    // no need to track async open/close operations
+    return;
+  }
+
+  ceph_assert(!async_op.started());
+  async_op.start_op(*ictx);
+}
+
+// Queue the final completion onto the ASIO API strand instead of firing
+// it inline. A synthetic pending request is added (pending_count 0 -> 1)
+// and completed from the strand, so the callback runs without any of the
+// caller's locks held.
+void AioCompletion::queue_complete() {
+  uint32_t zero = 0;
+  // must transition 0 -> 1; on failure 'zero' holds the observed value
+  pending_count.compare_exchange_strong(zero, 1);
+  ceph_assert(zero == 0);
+
+  add_request();
+
+  // ensure completion fires in clean lock context
+  boost::asio::post(ictx->asio_engine->get_api_strand(), [this]() {
+    complete_request(0);
+  });
+}
+
+// Artificially hold the completion open (extra ref + pending count)
+// until a matching unblock(). Only valid before the completion is armed
+// via fail()/set_request_count().
+void AioCompletion::block(CephContext* cct) {
+  ldout(cct, 20) << dendl;
+  ceph_assert(!was_armed);
+
+  get();
+  ++pending_count;
+}
+
+// Release a block() hold. If this was the last outstanding pending
+// count and the completion has been armed, queue the final completion.
+void AioCompletion::unblock(CephContext* cct) {
+  ldout(cct, 20) << dendl;
+  ceph_assert(was_armed);
+
+  // post-decrement: previous value tells us if we were the last holder
+  uint32_t previous_pending_count = pending_count--;
+  ceph_assert(previous_pending_count > 0);
+
+  if (previous_pending_count == 1) {
+    queue_complete();
+  }
+  put();
+}
+
+// Arm the completion with an error result (r < 0). If nothing is
+// pending, complete immediately: via the strand for normal I/O, or
+// inline for open/close where the ImageCtx may already be destructed
+// and queueing is not possible.
+void AioCompletion::fail(int r)
+{
+  ceph_assert(ictx != nullptr);
+  ceph_assert(r < 0);
+
+  bool queue_required = true;
+  if (aio_type == AIO_TYPE_CLOSE || aio_type == AIO_TYPE_OPEN) {
+    // executing from a safe context and the ImageCtx has been destructed
+    queue_required = false;
+  } else {
+    CephContext *cct = ictx->cct;
+    lderr(cct) << cpp_strerror(r) << dendl;
+  }
+
+  ceph_assert(!was_armed);
+  was_armed = true;
+
+  rval = r;
+
+  // if requests are still pending (e.g. block()), the last
+  // complete_request()/unblock() will finish the completion instead
+  uint32_t previous_pending_count = pending_count.load();
+  if (previous_pending_count == 0) {
+    if (queue_required) {
+      queue_complete();
+    } else {
+      complete();
+    }
+  }
+}
+
+// Arm the completion with the number of expected sub-requests. If no
+// sub-requests were issued and none are otherwise pending, the
+// completion is queued immediately.
+void AioCompletion::set_request_count(uint32_t count) {
+  ceph_assert(ictx != nullptr);
+  CephContext *cct = ictx->cct;
+
+  ceph_assert(!was_armed);
+  was_armed = true;
+
+  ldout(cct, 20) << "pending=" << count << dendl;
+  uint32_t previous_pending_count = pending_count.fetch_add(count);
+  if (previous_pending_count == 0 && count == 0) {
+    queue_complete();
+  }
+}
+
+// Record the result of one sub-request and, when the last one lands,
+// finalize and fire the overall completion. Positive results accumulate
+// into rval (byte counts); the first negative result (other than
+// -EEXIST) is latched into error_rval.
+void AioCompletion::complete_request(ssize_t r)
+{
+  ceph_assert(ictx != nullptr);
+  CephContext *cct = ictx->cct;
+
+  if (r > 0) {
+    rval += r;
+  } else if (r < 0 && r != -EEXIST) {
+    // might race w/ another thread setting an error code but
+    // first one wins
+    int zero = 0;
+    error_rval.compare_exchange_strong(zero, r);
+  }
+
+  uint32_t previous_pending_count = pending_count--;
+  ceph_assert(previous_pending_count > 0);
+  // NOTE(review): this local intentionally(?) shadows the member
+  // 'pending_count' with the post-decrement value
+  auto pending_count = previous_pending_count - 1;
+
+  ldout(cct, 20) << "cb=" << complete_cb << ", "
+                 << "pending=" << pending_count << dendl;
+  if (pending_count == 0) {
+    finalize();
+    complete();
+  }
+  put();  // drop the ref taken by add_request()
+}
+
+// Non-blocking check: true once the completion has left
+// AIO_STATE_PENDING (i.e. the callback phase has at least started).
+bool AioCompletion::is_complete() {
+  tracepoint(librbd, aio_is_complete_enter, this);
+  bool done = (this->state != AIO_STATE_PENDING);
+  tracepoint(librbd, aio_is_complete_exit, done);
+  return done;
+}
+
+// Return the accumulated I/O result (bytes on success, -errno on
+// failure). Only meaningful once the completion has fired.
+ssize_t AioCompletion::get_return_value() {
+  tracepoint(librbd, aio_get_return_value_enter, this);
+  ssize_t r = rval;
+  tracepoint(librbd, aio_get_return_value_exit, r);
+  return r;
+}
+
+// Run the user-supplied callback on the API strand so external users
+// never see concurrent callbacks from multiple internal threads. Holds
+// an extra ref across the dispatch.
+void AioCompletion::complete_external_callback() {
+  get();
+
+  // ensure librbd external users never experience concurrent callbacks
+  // from multiple librbd-internal threads.
+  boost::asio::dispatch(ictx->asio_engine->get_api_strand(), [this]() {
+    complete_cb(rbd_comp, complete_arg);
+    complete_event_socket();
+    notify_callbacks_complete();
+    put();
+  });
+}
+
+// If event notification was requested and the image's event socket is
+// valid, enqueue this completion and poke the socket so pollers wake.
+void AioCompletion::complete_event_socket() {
+  if (ictx != nullptr && event_notify && ictx->event_socket.is_valid()) {
+    ictx->event_socket_completions.push(this);
+    ictx->event_socket.notify();
+  }
+}
+
+// Final transition to AIO_STATE_COMPLETE: wake wait_for_complete()
+// waiters, notify the image dispatcher continuation, and retire the
+// tracked async op.
+void AioCompletion::notify_callbacks_complete() {
+  state = AIO_STATE_COMPLETE;
+
+  {
+    // lock only guards the condvar handshake; state itself is atomic
+    std::unique_lock<std::mutex> locker(lock);
+    cond.notify_all();
+  }
+
+  if (image_dispatcher_ctx != nullptr) {
+    image_dispatcher_ctx->complete(rval);
+  }
+
+  // note: possible for image to be closed after op marked finished
+  if (async_op.started()) {
+    async_op.finish_op();
+  }
+}
+
+} // namespace io
+} // namespace librbd
diff --git a/src/librbd/io/AioCompletion.h b/src/librbd/io/AioCompletion.h
new file mode 100644
index 000000000..4ae93fe36
--- /dev/null
+++ b/src/librbd/io/AioCompletion.h
@@ -0,0 +1,203 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IO_AIO_COMPLETION_H
+#define CEPH_LIBRBD_IO_AIO_COMPLETION_H
+
+#include "common/ceph_time.h"
+#include "include/common_fwd.h"
+#include "include/Context.h"
+#include "include/utime.h"
+#include "include/rbd/librbd.hpp"
+
+#include "librbd/ImageCtx.h"
+#include "librbd/io/AsyncOperation.h"
+#include "librbd/io/ReadResult.h"
+#include "librbd/io/Types.h"
+
+#include <atomic>
+#include <condition_variable>
+#include <mutex>
+
+struct Context;
+
+namespace librbd {
+namespace io {
+
+/**
+ * AioCompletion is the overall completion for a single
+ * rbd I/O request. It may be composed of many AioObjectRequests,
+ * which each go to a single object.
+ *
+ * The retrying of individual requests is handled at a lower level,
+ * so all AioCompletion cares about is the count of outstanding
+ * requests. The number of expected individual requests should be
+ * set initially using set_request_count() prior to issuing the
+ * requests. This ensures that the completion will not be completed
+ * within the caller's thread of execution (instead via a librados
+ * context or via a thread pool context for cache read hits).
+ */
+struct AioCompletion {
+  // lifecycle: PENDING -> CALLBACK (user cb running) -> COMPLETE
+  typedef enum {
+    AIO_STATE_PENDING = 0,
+    AIO_STATE_CALLBACK,
+    AIO_STATE_COMPLETE,
+  } aio_state_t;
+
+  mutable std::mutex lock;
+  std::condition_variable cond;
+
+  callback_t complete_cb = nullptr;   ///< optional user completion callback
+  void *complete_arg = nullptr;       ///< opaque argument for complete_cb
+  rbd_completion_t rbd_comp = nullptr;
+
+  /// note: only using atomic for built-in memory barrier
+  std::atomic<aio_state_t> state{AIO_STATE_PENDING};
+
+  std::atomic<ssize_t> rval{0};       ///< accumulated result (bytes or -errno)
+  std::atomic<int> error_rval{0};     ///< first error seen, folded in finalize()
+  std::atomic<uint32_t> ref{1};       ///< intrusive refcount (get()/put())
+  std::atomic<uint32_t> pending_count{0}; ///< number of requests/blocks
+  std::atomic<bool> released{false};  ///< guards double release()
+
+  ImageCtx *ictx = nullptr;
+  coarse_mono_time start_time;        ///< set by init_time() for latency stats
+  aio_type_t aio_type = AIO_TYPE_NONE;
+
+  ReadResult read_result;
+
+  AsyncOperation async_op;
+
+  bool event_notify = false;
+  bool was_armed = false;             ///< set once fail()/set_request_count() ran
+  bool external_callback = false;
+
+  Context* image_dispatcher_ctx = nullptr;
+
+  // adapt a member function taking (int) into the C callback signature;
+  // releases the completion after invoking the member
+  template <typename T, void (T::*MF)(int)>
+  static void callback_adapter(completion_t cb, void *arg) {
+    AioCompletion *comp = reinterpret_cast<AioCompletion *>(cb);
+    T *t = reinterpret_cast<T *>(arg);
+    (t->*MF)(comp->get_return_value());
+    comp->release();
+  }
+
+  static AioCompletion *create(void *cb_arg, callback_t cb_complete,
+                               rbd_completion_t rbd_comp) {
+    AioCompletion *comp = new AioCompletion();
+    comp->set_complete_cb(cb_arg, cb_complete);
+    // default the public handle to the completion itself
+    comp->rbd_comp = (rbd_comp != nullptr ? rbd_comp : comp);
+    return comp;
+  }
+
+  template <typename T, void (T::*MF)(int) = &T::complete>
+  static AioCompletion *create(T *obj) {
+    AioCompletion *comp = new AioCompletion();
+    comp->set_complete_cb(obj, &callback_adapter<T, MF>);
+    comp->rbd_comp = comp;
+    return comp;
+  }
+
+  // convenience factory: create, bind to image/type, and start op tracking
+  template <typename T, void (T::*MF)(int) = &T::complete>
+  static AioCompletion *create_and_start(T *obj, ImageCtx *image_ctx,
+                                         aio_type_t type) {
+    AioCompletion *comp = create<T, MF>(obj);
+    comp->init_time(image_ctx, type);
+    comp->start_op();
+    return comp;
+  }
+
+  AioCompletion() {
+  }
+
+  ~AioCompletion() {
+  }
+
+  int wait_for_complete();
+
+  void finalize();
+
+  inline bool is_initialized(aio_type_t type) const {
+    std::unique_lock<std::mutex> locker(lock);
+    return ((ictx != nullptr) && (aio_type == type));
+  }
+  inline bool is_started() const {
+    std::unique_lock<std::mutex> locker(lock);
+    return async_op.started();
+  }
+
+  void block(CephContext* cct);
+  void unblock(CephContext* cct);
+
+  void init_time(ImageCtx *i, aio_type_t t);
+  void start_op();
+  void fail(int r);
+
+  void complete();
+
+  void set_complete_cb(void *cb_arg, callback_t cb) {
+    complete_cb = cb;
+    complete_arg = cb_arg;
+  }
+
+  void set_request_count(uint32_t num);
+  // register one more in-flight sub-request (must already be armed/pending)
+  void add_request() {
+    ceph_assert(pending_count > 0);
+    get();
+  }
+  void complete_request(ssize_t r);
+
+  bool is_complete();
+
+  ssize_t get_return_value();
+
+  void get() {
+    ceph_assert(ref > 0);
+    ++ref;
+  }
+  // drop the user's reference; only legal once per completion
+  void release() {
+    bool previous_released = released.exchange(true);
+    ceph_assert(!previous_released);
+    put();
+  }
+  void put() {
+    uint32_t previous_ref = ref--;
+    ceph_assert(previous_ref > 0);
+
+    if (previous_ref == 1) {
+      delete this;
+    }
+  }
+
+  void set_event_notify(bool s) {
+    event_notify = s;
+  }
+
+  void *get_arg() {
+    return complete_arg;
+  }
+
+private:
+  void queue_complete();
+  void complete_external_callback();
+  void complete_event_socket();
+  void notify_callbacks_complete();
+};
+
+// Context adapter tying a single sub-request to an AioCompletion:
+// registers itself on construction and reports its result on finish().
+class C_AioRequest : public Context {
+public:
+  C_AioRequest(AioCompletion *completion) : m_completion(completion) {
+    m_completion->add_request();
+  }
+  ~C_AioRequest() override {}
+  void finish(int r) override {
+    m_completion->complete_request(r);
+  }
+protected:
+  AioCompletion *m_completion;
+};
+
+} // namespace io
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_IO_AIO_COMPLETION_H
diff --git a/src/librbd/io/AsyncOperation.cc b/src/librbd/io/AsyncOperation.cc
new file mode 100644
index 000000000..18db2410e
--- /dev/null
+++ b/src/librbd/io/AsyncOperation.cc
@@ -0,0 +1,94 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/io/AsyncOperation.h"
+#include "include/ceph_assert.h"
+#include "common/dout.h"
+#include "librbd/AsioEngine.h"
+#include "librbd/ImageCtx.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::io::AsyncOperation: "
+
+namespace librbd {
+namespace io {
+
+namespace {
+
+// One-shot context that fires a batch of deferred flush callbacks
+// (each with result 0) while holding the image owner_lock shared.
+struct C_CompleteFlushes : public Context {
+  ImageCtx *image_ctx;
+  std::list<Context *> flush_contexts;
+
+  explicit C_CompleteFlushes(ImageCtx *image_ctx,
+                             std::list<Context *> &&flush_contexts)
+    : image_ctx(image_ctx), flush_contexts(std::move(flush_contexts)) {
+  }
+  void finish(int r) override {
+    std::shared_lock owner_locker{image_ctx->owner_lock};
+    while (!flush_contexts.empty()) {
+      Context *flush_ctx = flush_contexts.front();
+      flush_contexts.pop_front();
+
+      ldout(image_ctx->cct, 20) << "completed flush: " << flush_ctx << dendl;
+      flush_ctx->complete(0);
+    }
+  }
+};
+
+} // anonymous namespace
+
+// Register this operation on the image's in-flight op list. The list is
+// kept newest-first (push_front); may only be called once per instance.
+void AsyncOperation::start_op(ImageCtx &image_ctx) {
+  ceph_assert(m_image_ctx == NULL);
+  m_image_ctx = &image_ctx;
+
+  ldout(m_image_ctx->cct, 20) << this << " " << __func__ << dendl;
+  std::lock_guard l{m_image_ctx->async_ops_lock};
+  m_image_ctx->async_ops.push_front(&m_xlist_item);
+}
+
+// Remove this operation from the in-flight list. Any flush waiters
+// attached to it are handed to the next-older in-flight op (so they
+// still wait for everything issued before them); if there is no older
+// op, they are completed asynchronously via C_CompleteFlushes.
+void AsyncOperation::finish_op() {
+  ldout(m_image_ctx->cct, 20) << this << " " << __func__ << dendl;
+
+  {
+    std::lock_guard l{m_image_ctx->async_ops_lock};
+    // advance to the next-older op before unlinking ourselves
+    xlist<AsyncOperation *>::iterator iter(&m_xlist_item);
+    ++iter;
+    ceph_assert(m_xlist_item.remove_myself());
+
+    // linked list stored newest -> oldest ops
+    if (!iter.end() && !m_flush_contexts.empty()) {
+      ldout(m_image_ctx->cct, 20) << "moving flush contexts to previous op: "
+                                  << *iter << dendl;
+      (*iter)->m_flush_contexts.insert((*iter)->m_flush_contexts.end(),
+                                       m_flush_contexts.begin(),
+                                       m_flush_contexts.end());
+      return;
+    }
+  }
+
+  if (!m_flush_contexts.empty()) {
+    C_CompleteFlushes *ctx = new C_CompleteFlushes(m_image_ctx,
+                                                   std::move(m_flush_contexts));
+    m_image_ctx->asio_engine->post(ctx, 0);
+  }
+}
+
+// Invoke on_finish once every op older than this one has finished.
+// If an older op exists the callback is parked on it; otherwise it is
+// completed asynchronously right away.
+void AsyncOperation::flush(Context* on_finish) {
+  {
+    std::lock_guard locker{m_image_ctx->async_ops_lock};
+    xlist<AsyncOperation *>::iterator iter(&m_xlist_item);
+    ++iter;
+
+    // linked list stored newest -> oldest ops
+    if (!iter.end()) {
+      (*iter)->m_flush_contexts.push_back(on_finish);
+      return;
+    }
+  }
+
+  m_image_ctx->asio_engine->post(on_finish, 0);
+}
+
+} // namespace io
+} // namespace librbd
diff --git a/src/librbd/io/AsyncOperation.h b/src/librbd/io/AsyncOperation.h
new file mode 100644
index 000000000..b0a37c4b8
--- /dev/null
+++ b/src/librbd/io/AsyncOperation.h
@@ -0,0 +1,52 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef LIBRBD_IO_ASYNC_OPERATION_H
+#define LIBRBD_IO_ASYNC_OPERATION_H
+
+#include "include/ceph_assert.h"
+#include "include/xlist.h"
+#include <list>
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace io {
+
+// Tracks one in-flight async I/O operation on an image via an intrusive
+// xlist node, so flushes can wait on all older in-flight operations.
+class AsyncOperation {
+public:
+
+  AsyncOperation()
+    : m_image_ctx(NULL), m_xlist_item(this)
+  {
+  }
+
+  ~AsyncOperation()
+  {
+    // finish_op() must have unlinked us before destruction
+    ceph_assert(!m_xlist_item.is_on_list());
+  }
+
+  // true between start_op() and finish_op()
+  inline bool started() const {
+    return m_xlist_item.is_on_list();
+  }
+
+  void start_op(ImageCtx &image_ctx);
+  void finish_op();
+
+  // run on_finish once all older in-flight ops have completed
+  void flush(Context *on_finish);
+
+private:
+
+  ImageCtx *m_image_ctx;
+  xlist<AsyncOperation *>::item m_xlist_item;
+  std::list<Context *> m_flush_contexts;  // waiters parked on this op
+
+};
+
+} // namespace io
+} // namespace librbd
+
+#endif // LIBRBD_IO_ASYNC_OPERATION_H
diff --git a/src/librbd/io/CopyupRequest.cc b/src/librbd/io/CopyupRequest.cc
new file mode 100644
index 000000000..228f95980
--- /dev/null
+++ b/src/librbd/io/CopyupRequest.cc
@@ -0,0 +1,773 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/io/CopyupRequest.h"
+#include "include/neorados/RADOS.hpp"
+#include "common/ceph_context.h"
+#include "common/ceph_mutex.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/AsioEngine.h"
+#include "librbd/AsyncObjectThrottle.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/Utils.h"
+#include "librbd/asio/ContextWQ.h"
+#include "librbd/asio/Utils.h"
+#include "librbd/deep_copy/ObjectCopyRequest.h"
+#include "librbd/io/AioCompletion.h"
+#include "librbd/io/ImageDispatchSpec.h"
+#include "librbd/io/ObjectDispatcherInterface.h"
+#include "librbd/io/ObjectRequest.h"
+#include "librbd/io/ReadResult.h"
+#include "librbd/io/Utils.h"
+
+#include <boost/lambda/bind.hpp>
+#include <boost/lambda/construct.hpp>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::io::CopyupRequest: " << this \
+ << " " << __func__ << ": " \
+ << data_object_name(m_image_ctx, m_object_no) << " "
+
+namespace librbd {
+namespace io {
+
+using librbd::util::data_object_name;
+
+namespace {
+
+// Throttled sub-task that updates the object-map state of one object
+// for one snapshot id (m_snap_ids[m_snap_id_idx]); CEPH_NOSNAP selects
+// the HEAD revision. Returns 1 from send() to skip when there is no
+// exclusive lock or object map.
+template <typename I>
+class C_UpdateObjectMap : public C_AsyncObjectThrottle<I> {
+public:
+  C_UpdateObjectMap(AsyncObjectThrottle<I> &throttle, I *image_ctx,
+                    uint64_t object_no, uint8_t head_object_map_state,
+                    const std::vector<uint64_t> *snap_ids,
+                    bool first_snap_is_clean, const ZTracer::Trace &trace,
+                    size_t snap_id_idx)
+    : C_AsyncObjectThrottle<I>(throttle, *image_ctx), m_object_no(object_no),
+      m_head_object_map_state(head_object_map_state), m_snap_ids(*snap_ids),
+      m_first_snap_is_clean(first_snap_is_clean), m_trace(trace),
+      m_snap_id_idx(snap_id_idx)
+  {
+  }
+
+  int send() override {
+    auto& image_ctx = this->m_image_ctx;
+    ceph_assert(ceph_mutex_is_locked(image_ctx.owner_lock));
+    if (image_ctx.exclusive_lock == nullptr) {
+      return 1;
+    }
+    ceph_assert(image_ctx.exclusive_lock->is_lock_owner());
+
+    std::shared_lock image_locker{image_ctx.image_lock};
+    if (image_ctx.object_map == nullptr) {
+      return 1;
+    }
+
+    uint64_t snap_id = m_snap_ids[m_snap_id_idx];
+    if (snap_id == CEPH_NOSNAP) {
+      return update_head();
+    } else {
+      return update_snapshot(snap_id);
+    }
+  }
+
+  int update_head() {
+    auto& image_ctx = this->m_image_ctx;
+    ceph_assert(ceph_mutex_is_locked(image_ctx.image_lock));
+
+    // non-forced update: may be skipped (sent == false) by the object map
+    bool sent = image_ctx.object_map->template aio_update<Context>(
+      CEPH_NOSNAP, m_object_no, m_head_object_map_state, {}, m_trace, false,
+      this);
+    return (sent ? 0 : 1);
+  }
+
+  int update_snapshot(uint64_t snap_id) {
+    auto& image_ctx = this->m_image_ctx;
+    ceph_assert(ceph_mutex_is_locked(image_ctx.image_lock));
+
+    uint8_t state = OBJECT_EXISTS;
+    if (image_ctx.test_features(RBD_FEATURE_FAST_DIFF, image_ctx.image_lock) &&
+        (m_snap_id_idx > 0 || m_first_snap_is_clean)) {
+      // first snapshot should be exists+dirty since it contains
+      // the copyup data -- later snapshots inherit the data.
+      state = OBJECT_EXISTS_CLEAN;
+    }
+
+    // forced update: must always be sent for snapshot revisions
+    bool sent = image_ctx.object_map->template aio_update<Context>(
+      snap_id, m_object_no, state, {}, m_trace, true, this);
+    ceph_assert(sent);
+    return 0;
+  }
+
+private:
+  uint64_t m_object_no;
+  uint8_t m_head_object_map_state;
+  const std::vector<uint64_t> &m_snap_ids;
+  bool m_first_snap_is_clean;
+  const ZTracer::Trace &m_trace;
+  size_t m_snap_id_idx;
+};
+
+} // anonymous namespace
+
+// Construct a copyup for one object; the image extents describe the
+// parent range to read. Registers itself as an in-flight async op so
+// flushes wait on the copyup.
+template <typename I>
+CopyupRequest<I>::CopyupRequest(I *ictx, uint64_t objectno,
+                                Extents &&image_extents, ImageArea area,
+                                const ZTracer::Trace &parent_trace)
+  : m_image_ctx(ictx), m_object_no(objectno),
+    m_image_extents(std::move(image_extents)), m_image_area(area),
+    m_trace(librbd::util::create_trace(*m_image_ctx, "copy-up", parent_trace))
+{
+  ceph_assert(m_image_ctx->data_ctx.is_valid());
+  m_async_op.start_op(*librbd::util::get_image_ctx(m_image_ctx));
+}
+
+// All appended write requests must have been restarted or completed
+// before destruction; retires the tracked async op.
+template <typename I>
+CopyupRequest<I>::~CopyupRequest() {
+  ceph_assert(m_pending_requests.empty());
+  m_async_op.finish_op();
+}
+
+// Attach a pending object write-op to this copyup so it can be merged
+// into the copyup RADOS op. Once appends are disabled (copyup already
+// past the read phase) the request is queued for restart instead.
+template <typename I>
+void CopyupRequest<I>::append_request(AbstractObjectWriteRequest<I> *req,
+                                      const Extents& object_extents) {
+  std::lock_guard locker{m_lock};
+
+  auto cct = m_image_ctx->cct;
+  ldout(cct, 20) << "object_request=" << req << ", "
+                 << "append=" << m_append_request_permitted << dendl;
+  if (m_append_request_permitted) {
+    m_pending_requests.push_back(req);
+
+    // track which parts of the object the writes cover
+    for (auto [offset, length] : object_extents) {
+      if (length > 0) {
+        m_write_object_extents.union_insert(offset, length);
+      }
+    }
+  } else {
+    m_restart_requests.push_back(req);
+  }
+}
+
+// Entry point of the state machine: start by reading the parent data.
+template <typename I>
+void CopyupRequest<I>::send() {
+  read_from_parent();
+}
+
+// Issue the parent read covering m_image_extents. If the parent was
+// detached, short-circuit with -ENOENT (posted to avoid re-entrancy);
+// if this is a migration source, switch to the deep-copy path instead.
+template <typename I>
+void CopyupRequest<I>::read_from_parent() {
+  auto cct = m_image_ctx->cct;
+  std::shared_lock image_locker{m_image_ctx->image_lock};
+
+  if (m_image_ctx->parent == nullptr) {
+    ldout(cct, 5) << "parent detached" << dendl;
+
+    m_image_ctx->asio_engine->post(
+      [this]() { handle_read_from_parent(-ENOENT); });
+    return;
+  } else if (is_deep_copy()) {
+    deep_copy();
+    return;
+  }
+
+  auto comp = AioCompletion::create_and_start<
+    CopyupRequest<I>,
+    &CopyupRequest<I>::handle_read_from_parent>(
+      this, librbd::util::get_image_ctx(m_image_ctx->parent), AIO_TYPE_READ);
+
+  ldout(cct, 20) << "completion=" << comp
+                 << " image_extents=" << m_image_extents
+                 << " area=" << m_image_area << dendl;
+  // sparse read result lands in m_copyup_extent_map / m_copyup_data
+  auto req = io::ImageDispatchSpec::create_read(
+    *m_image_ctx->parent, io::IMAGE_DISPATCH_LAYER_INTERNAL_START, comp,
+    std::move(m_image_extents), m_image_area,
+    ReadResult{&m_copyup_extent_map, &m_copyup_data},
+    m_image_ctx->parent->get_data_io_context(), 0, 0, m_trace);
+  req->send();
+}
+
+// Handle parent read completion: freeze request appends, prepare the
+// copyup payload, decide whether a copyup is needed at all, and collect
+// snapshot ids before moving to the object-map update stage.
+// Lock order throughout: image_lock (shared) before m_lock.
+template <typename I>
+void CopyupRequest<I>::handle_read_from_parent(int r) {
+  auto cct = m_image_ctx->cct;
+  ldout(cct, 20) << "r=" << r << dendl;
+
+  // -ENOENT (no parent data) is not an error for copyup
+  if (r < 0 && r != -ENOENT) {
+    m_lock.lock();
+    disable_append_requests();
+    m_lock.unlock();
+
+    lderr(cct) << "error reading from parent: " << cpp_strerror(r) << dendl;
+    finish(r);
+    return;
+  }
+
+  convert_copyup_extent_map();
+
+  m_image_ctx->image_lock.lock_shared();
+  m_lock.lock();
+  disable_append_requests();
+
+  r = prepare_copyup_data();
+  if (r < 0) {
+    m_lock.unlock();
+    m_image_ctx->image_lock.unlock_shared();
+
+    lderr(m_image_ctx->cct) << "failed to prepare copyup data: "
+                            << cpp_strerror(r) << dendl;
+    finish(r);
+    return;
+  }
+
+  m_copyup_is_zero = m_copyup_data.is_zero();
+  m_copyup_required = is_copyup_required();
+  if (!m_copyup_required) {
+    m_lock.unlock();
+    m_image_ctx->image_lock.unlock_shared();
+
+    ldout(cct, 20) << "no-op, skipping" << dendl;
+    finish(0);
+    return;
+  }
+
+  // copyup() will affect snapshots only if parent data is not all
+  // zeros.
+  if (!m_copyup_is_zero) {
+    m_snap_ids.insert(m_snap_ids.end(), m_image_ctx->snaps.rbegin(),
+                      m_image_ctx->snaps.rend());
+  }
+
+  m_lock.unlock();
+  m_image_ctx->image_lock.unlock_shared();
+
+  update_object_maps();
+}
+
+// Migration path: deep-copy the object from the migration source
+// instead of a plain parent read. Flatten is forced when a copyup is
+// already required; otherwise it follows the migration settings.
+template <typename I>
+void CopyupRequest<I>::deep_copy() {
+  auto cct = m_image_ctx->cct;
+  ceph_assert(ceph_mutex_is_locked(m_image_ctx->image_lock));
+  ceph_assert(m_image_ctx->parent != nullptr);
+
+  m_lock.lock();
+  m_deep_copied = true;
+  m_flatten = is_copyup_required() ? true : m_image_ctx->migration_info.flatten;
+  m_lock.unlock();
+
+  ldout(cct, 20) << "flatten=" << m_flatten << dendl;
+
+  uint32_t flags = deep_copy::OBJECT_COPY_REQUEST_FLAG_MIGRATION;
+  if (m_flatten) {
+    flags |= deep_copy::OBJECT_COPY_REQUEST_FLAG_FLATTEN;
+  }
+
+  auto ctx = librbd::util::create_context_callback<
+    CopyupRequest<I>, &CopyupRequest<I>::handle_deep_copy>(this);
+  auto req = deep_copy::ObjectCopyRequest<I>::create(
+    m_image_ctx->parent, m_image_ctx, 0, 0,
+    m_image_ctx->migration_info.snap_map, m_object_no, flags, nullptr, ctx);
+
+  req->send();
+}
+
+// Handle deep-copy completion. -ENOENT with a now-required copyup and
+// no flatten restarts the whole request with flatten enabled; otherwise
+// proceed to object-map updates (or finish early when neither copyup
+// nor an object-map update is needed).
+template <typename I>
+void CopyupRequest<I>::handle_deep_copy(int r) {
+  auto cct = m_image_ctx->cct;
+  ldout(cct, 20) << "r=" << r << dendl;
+
+  m_image_ctx->image_lock.lock_shared();
+  m_lock.lock();
+  m_copyup_required = is_copyup_required();
+  if (r == -ENOENT && !m_flatten && m_copyup_required) {
+    m_lock.unlock();
+    m_image_ctx->image_lock.unlock_shared();
+
+    ldout(cct, 10) << "restart deep-copy with flatten" << dendl;
+    send();
+    return;
+  }
+
+  disable_append_requests();
+
+  if (r < 0 && r != -ENOENT) {
+    m_lock.unlock();
+    m_image_ctx->image_lock.unlock_shared();
+
+    lderr(cct) << "error encountered during deep-copy: " << cpp_strerror(r)
+               << dendl;
+    finish(r);
+    return;
+  }
+
+  if (!m_copyup_required && !is_update_object_map_required(r)) {
+    m_lock.unlock();
+    m_image_ctx->image_lock.unlock_shared();
+
+    if (r == -ENOENT) {
+      r = 0;
+    }
+
+    ldout(cct, 20) << "skipping" << dendl;
+    finish(r);
+    return;
+  }
+
+  // For deep-copy, copyup() will never affect snapshots. However,
+  // this state machine is responsible for updating object maps for
+  // snapshots that have been created on destination image after
+  // migration started.
+  if (r != -ENOENT) {
+    compute_deep_copy_snap_ids();
+  }
+
+  m_lock.unlock();
+  m_image_ctx->image_lock.unlock_shared();
+
+  update_object_maps();
+}
+
+// Update the object map for every collected snapshot id (plus HEAD if
+// its state needs to change), one throttled C_UpdateObjectMap per id,
+// then continue to copyup(). Skips straight to copyup() when the image
+// has no object map.
+template <typename I>
+void CopyupRequest<I>::update_object_maps() {
+  std::shared_lock owner_locker{m_image_ctx->owner_lock};
+  std::shared_lock image_locker{m_image_ctx->image_lock};
+  if (m_image_ctx->object_map == nullptr) {
+    image_locker.unlock();
+    owner_locker.unlock();
+
+    copyup();
+    return;
+  }
+
+  auto cct = m_image_ctx->cct;
+  ldout(cct, 20) << dendl;
+
+  // no pending write-ops implies this copyup was triggered by a read
+  bool copy_on_read = m_pending_requests.empty();
+  uint8_t head_object_map_state = OBJECT_EXISTS;
+  if (copy_on_read && !m_snap_ids.empty() &&
+      m_image_ctx->test_features(RBD_FEATURE_FAST_DIFF,
+                                 m_image_ctx->image_lock)) {
+    // HEAD is non-dirty since data is tied to first snapshot
+    head_object_map_state = OBJECT_EXISTS_CLEAN;
+  }
+
+  auto r_it = m_pending_requests.rbegin();
+  if (r_it != m_pending_requests.rend()) {
+    // last write-op determines the final object map state
+    head_object_map_state = (*r_it)->get_pre_write_object_map_state();
+  }
+
+  if ((*m_image_ctx->object_map)[m_object_no] != head_object_map_state) {
+    // (maybe) need to update the HEAD object map state
+    m_snap_ids.push_back(CEPH_NOSNAP);
+  }
+  image_locker.unlock();
+
+  ceph_assert(m_image_ctx->exclusive_lock->is_lock_owner());
+  typename AsyncObjectThrottle<I>::ContextFactory context_factory(
+    boost::lambda::bind(boost::lambda::new_ptr<C_UpdateObjectMap<I>>(),
+    boost::lambda::_1, m_image_ctx, m_object_no, head_object_map_state,
+    &m_snap_ids, m_first_snap_is_clean, m_trace, boost::lambda::_2));
+  auto ctx = librbd::util::create_context_callback<
+    CopyupRequest<I>, &CopyupRequest<I>::handle_update_object_maps>(this);
+  auto throttle = new AsyncObjectThrottle<I>(
+    nullptr, *m_image_ctx, context_factory, ctx, nullptr, 0, m_snap_ids.size());
+  throttle->start_ops(
+    m_image_ctx->config.template get_val<uint64_t>("rbd_concurrent_management_ops"));
+}
+
+// Object-map updates finished: fail the request on error, otherwise
+// issue the actual copyup RADOS op(s).
+template <typename I>
+void CopyupRequest<I>::handle_update_object_maps(int r) {
+  auto cct = m_image_ctx->cct;
+  ldout(cct, 20) << "r=" << r << dendl;
+
+  if (r < 0) {
+    lderr(m_image_ctx->cct) << "failed to update object map: "
+                            << cpp_strerror(r) << dendl;
+
+    finish(r);
+    return;
+  }
+
+  copyup();
+}
+
+// Build and issue the copyup RADOS op(s). Depending on whether this is
+// copy-on-read or the object has snapshots ("deep copyup"), the copyup
+// payload is sent either as its own op with an empty snapshot context
+// or merged into a single op with the pending write requests.
+// m_pending_copyups counts the ops issued; handle_copyup() joins them.
+template <typename I>
+void CopyupRequest<I>::copyup() {
+  auto cct = m_image_ctx->cct;
+  m_image_ctx->image_lock.lock_shared();
+  auto snapc = m_image_ctx->snapc;
+  auto io_context = m_image_ctx->get_data_io_context();
+  m_image_ctx->image_lock.unlock_shared();
+
+  m_lock.lock();
+  if (!m_copyup_required) {
+    m_lock.unlock();
+
+    ldout(cct, 20) << "skipping copyup" << dendl;
+    finish(0);
+    return;
+  }
+
+  ldout(cct, 20) << dendl;
+
+  bool copy_on_read = m_pending_requests.empty() && !m_deep_copied;
+  bool deep_copyup = !snapc.snaps.empty() && !m_copyup_is_zero;
+  if (m_copyup_is_zero) {
+    // all-zero parent data: send an empty copyup payload
+    m_copyup_data.clear();
+    m_copyup_extent_map.clear();
+  }
+
+  neorados::WriteOp copyup_op;
+  neorados::WriteOp write_op;
+  neorados::WriteOp* op;
+  if (copy_on_read || deep_copyup) {
+    // copyup-op will use its own request issued to the initial object revision
+    op = &copyup_op;
+    ++m_pending_copyups;
+  } else {
+    // copyup-op can be combined with the write-ops (if any)
+    op = &write_op;
+  }
+
+  if (m_image_ctx->enable_sparse_copyup) {
+    cls_client::sparse_copyup(op, m_copyup_extent_map, m_copyup_data);
+  } else {
+    // convert the sparse read back into a standard (thick) read
+    Striper::StripedReadResult destriper;
+    destriper.add_partial_sparse_result(
+      cct, std::move(m_copyup_data), m_copyup_extent_map, 0,
+      {{0, m_image_ctx->layout.object_size}});
+
+    bufferlist thick_bl;
+    destriper.assemble_result(cct, thick_bl, false);
+    cls_client::copyup(op, thick_bl);
+  }
+  ObjectRequest<I>::add_write_hint(*m_image_ctx, op);
+
+  if (!copy_on_read) {
+    // merge all pending write ops into this single RADOS op
+    for (auto req : m_pending_requests) {
+      ldout(cct, 20) << "add_copyup_ops " << req << dendl;
+      req->add_copyup_ops(&write_op);
+    }
+
+    if (write_op.size() > 0) {
+      ++m_pending_copyups;
+    }
+  }
+  m_lock.unlock();
+
+  // issue librados ops at the end to simplify test cases
+  auto object = neorados::Object{data_object_name(m_image_ctx, m_object_no)};
+  if (copyup_op.size() > 0) {
+    // send only the copyup request with a blank snapshot context so that
+    // all snapshots are detected from the parent for this object. If
+    // this is a CoW request, a second request will be created for the
+    // actual modification.
+    ldout(cct, 20) << "copyup with empty snapshot context" << dendl;
+
+    auto copyup_io_context = *io_context;
+    copyup_io_context.write_snap_context({});
+
+    m_image_ctx->rados_api.execute(
+      object, copyup_io_context, std::move(copyup_op),
+      librbd::asio::util::get_callback_adapter(
+        [this](int r) { handle_copyup(r); }), nullptr,
+      (this->m_trace.valid() ? this->m_trace.get_info() : nullptr));
+  }
+
+  if (write_op.size() > 0) {
+    // compare-and-write doesn't add any write ops (copyup+cmpext+write
+    // can't be executed in the same RADOS op because, unless the object
+    // was already present in the clone, cmpext wouldn't see it)
+    ldout(cct, 20) << (!deep_copyup && write_op.size() > 2 ?
+                         "copyup + ops" : !deep_copyup ? "copyup" : "ops")
+                   << " with current snapshot context" << dendl;
+
+    m_image_ctx->rados_api.execute(
+      object, *io_context, std::move(write_op),
+      librbd::asio::util::get_callback_adapter(
+        [this](int r) { handle_copyup(r); }), nullptr,
+      (this->m_trace.valid() ? this->m_trace.get_info() : nullptr));
+  }
+}
+
+template <typename I>
+void CopyupRequest<I>::handle_copyup(int r) {
+ auto cct = m_image_ctx->cct;
+ unsigned pending_copyups;
+ int copyup_ret_val = r;
+ {
+ std::lock_guard locker{m_lock};
+ ceph_assert(m_pending_copyups > 0);
+ pending_copyups = --m_pending_copyups;
+ if (m_copyup_ret_val < 0) {
+ copyup_ret_val = m_copyup_ret_val;
+ } else if (r < 0) {
+ m_copyup_ret_val = r;
+ }
+ }
+
+ ldout(cct, 20) << "r=" << r << ", "
+ << "pending=" << pending_copyups << dendl;
+
+ if (pending_copyups == 0) {
+ if (copyup_ret_val < 0 && copyup_ret_val != -ENOENT) {
+ lderr(cct) << "failed to copyup object: " << cpp_strerror(copyup_ret_val)
+ << dendl;
+ complete_requests(false, copyup_ret_val);
+ }
+
+ finish(0);
+ }
+}
+
+template <typename I>
+void CopyupRequest<I>::finish(int r) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << "r=" << r << dendl;
+
+ complete_requests(true, r);
+ delete this;
+}
+
+template <typename I>
+void CopyupRequest<I>::complete_requests(bool override_restart_retval, int r) {
+ auto cct = m_image_ctx->cct;
+ remove_from_list();
+
+ while (!m_pending_requests.empty()) {
+ auto it = m_pending_requests.begin();
+ auto req = *it;
+ ldout(cct, 20) << "completing request " << req << dendl;
+ req->handle_copyup(r);
+ m_pending_requests.erase(it);
+ }
+
+ if (override_restart_retval) {
+ r = -ERESTART;
+ }
+
+ while (!m_restart_requests.empty()) {
+ auto it = m_restart_requests.begin();
+ auto req = *it;
+ ldout(cct, 20) << "restarting request " << req << dendl;
+ req->handle_copyup(r);
+ m_restart_requests.erase(it);
+ }
+}
+
+template <typename I>
+void CopyupRequest<I>::disable_append_requests() {
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+ m_append_request_permitted = false;
+}
+
+template <typename I>
+void CopyupRequest<I>::remove_from_list() {
+ std::lock_guard copyup_list_locker{m_image_ctx->copyup_list_lock};
+
+ auto it = m_image_ctx->copyup_list.find(m_object_no);
+ if (it != m_image_ctx->copyup_list.end()) {
+ m_image_ctx->copyup_list.erase(it);
+ }
+}
+
+template <typename I>
+bool CopyupRequest<I>::is_copyup_required() {
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+
+ bool copy_on_read = m_pending_requests.empty();
+ if (copy_on_read) {
+ // always force a copyup if CoR enabled
+ return true;
+ }
+
+ if (!m_copyup_is_zero) {
+ return true;
+ }
+
+ for (auto req : m_pending_requests) {
+ if (!req->is_empty_write_op()) {
+ return true;
+ }
+ }
+ return false;
+}
+
+template <typename I>
+bool CopyupRequest<I>::is_deep_copy() const {
+ ceph_assert(ceph_mutex_is_locked(m_image_ctx->image_lock));
+ return !m_image_ctx->migration_info.empty();
+}
+
+template <typename I>
+bool CopyupRequest<I>::is_update_object_map_required(int r) {
+ ceph_assert(ceph_mutex_is_locked(m_image_ctx->image_lock));
+
+ if (r < 0) {
+ return false;
+ }
+
+ if (m_image_ctx->object_map == nullptr) {
+ return false;
+ }
+
+ if (m_image_ctx->migration_info.empty()) {
+ // migration might have completed while IO was in-flight,
+ // assume worst-case and perform an object map update
+ return true;
+ }
+
+ auto it = m_image_ctx->migration_info.snap_map.find(CEPH_NOSNAP);
+ ceph_assert(it != m_image_ctx->migration_info.snap_map.end());
+ return it->second[0] != CEPH_NOSNAP;
+}
+
+template <typename I>
+void CopyupRequest<I>::compute_deep_copy_snap_ids() {
+ ceph_assert(ceph_mutex_is_locked(m_image_ctx->image_lock));
+
+ // don't copy ids for snaps that were already updated by the object
+ // deep copy or for snaps that don't overlap with the parent
+ std::set<uint64_t> deep_copied;
+ for (auto &it : m_image_ctx->migration_info.snap_map) {
+ if (it.first != CEPH_NOSNAP) {
+ deep_copied.insert(it.second.front());
+ }
+ }
+ ldout(m_image_ctx->cct, 15) << "deep_copied=" << deep_copied << dendl;
+
+ std::copy_if(m_image_ctx->snaps.rbegin(), m_image_ctx->snaps.rend(),
+ std::back_inserter(m_snap_ids),
+ [this, cct=m_image_ctx->cct, &deep_copied](uint64_t snap_id) {
+ if (deep_copied.count(snap_id)) {
+ m_first_snap_is_clean = true;
+ return false;
+ }
+
+ uint64_t raw_overlap = 0;
+ uint64_t object_overlap = 0;
+ int r = m_image_ctx->get_parent_overlap(snap_id, &raw_overlap);
+ if (r < 0) {
+ ldout(cct, 5) << "failed getting parent overlap for snap_id: "
+ << snap_id << ": " << cpp_strerror(r) << dendl;
+ } else if (raw_overlap > 0) {
+ auto [parent_extents, area] = util::object_to_area_extents(
+ m_image_ctx, m_object_no, {{0, m_image_ctx->layout.object_size}});
+ object_overlap = m_image_ctx->prune_parent_extents(parent_extents, area,
+ raw_overlap, false);
+ }
+ return object_overlap > 0;
+ });
+}
+
+template <typename I>
+void CopyupRequest<I>::convert_copyup_extent_map() {
+ auto cct = m_image_ctx->cct;
+
+ Extents image_extent_map;
+ image_extent_map.swap(m_copyup_extent_map);
+ m_copyup_extent_map.reserve(image_extent_map.size());
+
+ // convert the image-extent extent map to object-extents
+ for (auto [image_offset, image_length] : image_extent_map) {
+ striper::LightweightObjectExtents object_extents;
+ util::area_to_object_extents(m_image_ctx, image_offset, image_length,
+ m_image_area, 0, &object_extents);
+ for (auto& object_extent : object_extents) {
+ m_copyup_extent_map.emplace_back(
+ object_extent.offset, object_extent.length);
+ }
+ }
+
+ ldout(cct, 20) << "image_extents=" << image_extent_map << ", "
+ << "object_extents=" << m_copyup_extent_map << dendl;
+}
+
+template <typename I>
+int CopyupRequest<I>::prepare_copyup_data() {
+ ceph_assert(ceph_mutex_is_locked(m_image_ctx->image_lock));
+ auto cct = m_image_ctx->cct;
+
+ SnapshotSparseBufferlist snapshot_sparse_bufferlist;
+ auto& sparse_bufferlist = snapshot_sparse_bufferlist[0];
+
+ bool copy_on_read = m_pending_requests.empty();
+ bool maybe_deep_copyup = !m_image_ctx->snapc.snaps.empty();
+ if (copy_on_read || maybe_deep_copyup) {
+ // stand-alone copyup that will not be overwritten until HEAD revision
+ ldout(cct, 20) << "processing full copy-up" << dendl;
+
+ uint64_t buffer_offset = 0;
+ for (auto [object_offset, object_length] : m_copyup_extent_map) {
+ bufferlist sub_bl;
+ sub_bl.substr_of(m_copyup_data, buffer_offset, object_length);
+ buffer_offset += object_length;
+
+ sparse_bufferlist.insert(
+ object_offset, object_length,
+ {SPARSE_EXTENT_STATE_DATA, object_length, std::move(sub_bl)});
+ }
+ } else {
+ // copyup that will be concurrently written to the HEAD revision with the
+ // associated write-ops, so only process partial extents
+ uint64_t buffer_offset = 0;
+ for (auto [object_offset, object_length] : m_copyup_extent_map) {
+ interval_set<uint64_t> copyup_object_extents;
+ copyup_object_extents.insert(object_offset, object_length);
+
+ interval_set<uint64_t> intersection;
+ intersection.intersection_of(copyup_object_extents,
+ m_write_object_extents);
+
+ // extract only portions of the parent copyup data that have not
+ // been overwritten by write-ops
+ copyup_object_extents.subtract(intersection);
+ for (auto [copyup_offset, copyup_length] : copyup_object_extents) {
+ bufferlist sub_bl;
+ sub_bl.substr_of(
+ m_copyup_data, buffer_offset + (copyup_offset - object_offset),
+ copyup_length);
+ ceph_assert(sub_bl.length() == copyup_length);
+
+ sparse_bufferlist.insert(
+ copyup_offset, copyup_length,
+ {SPARSE_EXTENT_STATE_DATA, copyup_length, std::move(sub_bl)});
+ }
+ buffer_offset += object_length;
+ }
+
+ ldout(cct, 20) << "processing partial copy-up: " << sparse_bufferlist
+ << dendl;
+ }
+
+ // Let dispatch layers have a chance to process the data
+ auto r = m_image_ctx->io_object_dispatcher->prepare_copyup(
+ m_object_no, &snapshot_sparse_bufferlist);
+ if (r < 0) {
+ return r;
+ }
+
+ // Convert sparse extents back to extent map
+ m_copyup_data.clear();
+ m_copyup_extent_map.clear();
+ m_copyup_extent_map.reserve(sparse_bufferlist.ext_count());
+ for (auto& extent : sparse_bufferlist) {
+ auto& sbe = extent.get_val();
+ if (sbe.state == SPARSE_EXTENT_STATE_DATA) {
+ m_copyup_extent_map.emplace_back(extent.get_off(), extent.get_len());
+ m_copyup_data.append(sbe.bl);
+ }
+ }
+
+ return 0;
+}
+
+} // namespace io
+} // namespace librbd
+
+template class librbd::io::CopyupRequest<librbd::ImageCtx>;
diff --git a/src/librbd/io/CopyupRequest.h b/src/librbd/io/CopyupRequest.h
new file mode 100644
index 000000000..a94139421
--- /dev/null
+++ b/src/librbd/io/CopyupRequest.h
@@ -0,0 +1,145 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IO_COPYUP_REQUEST_H
+#define CEPH_LIBRBD_IO_COPYUP_REQUEST_H
+
+#include "include/int_types.h"
+#include "include/buffer.h"
+#include "include/interval_set.h"
+#include "common/ceph_mutex.h"
+#include "common/zipkin_trace.h"
+#include "librbd/io/AsyncOperation.h"
+#include "librbd/io/Types.h"
+
+#include <map>
+#include <string>
+#include <vector>
+
+namespace ZTracer { struct Trace; }
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace io {
+
+template <typename I> class AbstractObjectWriteRequest;
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class CopyupRequest {
+public:
+ static CopyupRequest* create(ImageCtxT *ictx, uint64_t objectno,
+ Extents &&image_extents, ImageArea area,
+ const ZTracer::Trace &parent_trace) {
+ return new CopyupRequest(ictx, objectno, std::move(image_extents), area,
+ parent_trace);
+ }
+
+ CopyupRequest(ImageCtxT *ictx, uint64_t objectno,
+ Extents &&image_extents, ImageArea area,
+ const ZTracer::Trace &parent_trace);
+ ~CopyupRequest();
+
+ void append_request(AbstractObjectWriteRequest<ImageCtxT> *req,
+ const Extents& object_extents);
+
+ void send();
+
+private:
+ /**
+ * Copyup requests go through the following state machine to read from the
+ * parent image, update the object map, and copyup the object:
+ *
+ *
+ * @verbatim
+ *
+ * <start>
+ * |
+ * /---------/ \---------\
+ * | |
+ * v v
+ * READ_FROM_PARENT DEEP_COPY
+ * | |
+ * \---------\ /---------/
+ * |
+ * v (skip if not needed)
+ * UPDATE_OBJECT_MAPS
+ * |
+ * v (skip if not needed)
+ * COPYUP
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ *
+ * The UPDATE_OBJECT_MAPS state is skipped if the object map isn't enabled
+ * or if an object map update isn't required. The COPYUP state is skipped
+ * if no data was read from the parent *and* there are no additional ops.
+ */
+
+ typedef std::vector<AbstractObjectWriteRequest<ImageCtxT> *> WriteRequests;
+
+ ImageCtxT *m_image_ctx;
+ uint64_t m_object_no;
+ Extents m_image_extents;
+ ImageArea m_image_area;
+ ZTracer::Trace m_trace;
+
+ bool m_flatten = false;
+ bool m_copyup_required = true;
+ bool m_copyup_is_zero = true;
+ bool m_deep_copied = false;
+
+ Extents m_copyup_extent_map;
+ ceph::bufferlist m_copyup_data;
+
+ AsyncOperation m_async_op;
+
+ std::vector<uint64_t> m_snap_ids;
+ bool m_first_snap_is_clean = false;
+
+ ceph::mutex m_lock = ceph::make_mutex("CopyupRequest", false);
+ WriteRequests m_pending_requests;
+ unsigned m_pending_copyups = 0;
+ int m_copyup_ret_val = 0;
+
+ WriteRequests m_restart_requests;
+ bool m_append_request_permitted = true;
+
+ interval_set<uint64_t> m_write_object_extents;
+
+ void read_from_parent();
+ void handle_read_from_parent(int r);
+
+ void deep_copy();
+ void handle_deep_copy(int r);
+
+ void update_object_maps();
+ void handle_update_object_maps(int r);
+
+ void copyup();
+ void handle_copyup(int r);
+
+ void finish(int r);
+ void complete_requests(bool override_restart_retval, int r);
+
+ void disable_append_requests();
+ void remove_from_list();
+
+ bool is_copyup_required();
+ bool is_update_object_map_required(int r);
+ bool is_deep_copy() const;
+
+ void compute_deep_copy_snap_ids();
+ void convert_copyup_extent_map();
+ int prepare_copyup_data();
+};
+
+} // namespace io
+} // namespace librbd
+
+extern template class librbd::io::CopyupRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_IO_COPYUP_REQUEST_H
diff --git a/src/librbd/io/Dispatcher.h b/src/librbd/io/Dispatcher.h
new file mode 100644
index 000000000..cb64e11b2
--- /dev/null
+++ b/src/librbd/io/Dispatcher.h
@@ -0,0 +1,252 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IO_DISPATCHER_H
+#define CEPH_LIBRBD_IO_DISPATCHER_H
+
+#include "include/int_types.h"
+#include "include/Context.h"
+#include "common/ceph_mutex.h"
+#include "common/dout.h"
+#include "common/AsyncOpTracker.h"
+#include "librbd/Utils.h"
+#include "librbd/io/DispatcherInterface.h"
+#include "librbd/io/Types.h"
+#include <map>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::io::Dispatcher: " << this \
+ << " " << __func__ << ": "
+
+namespace librbd {
+namespace io {
+
+template <typename ImageCtxT, typename DispatchInterfaceT>
+class Dispatcher : public DispatchInterfaceT {
+public:
+ typedef typename DispatchInterfaceT::Dispatch Dispatch;
+ typedef typename DispatchInterfaceT::DispatchLayer DispatchLayer;
+ typedef typename DispatchInterfaceT::DispatchSpec DispatchSpec;
+
+ Dispatcher(ImageCtxT* image_ctx)
+ : m_image_ctx(image_ctx),
+ m_lock(ceph::make_shared_mutex(
+ librbd::util::unique_lock_name("librbd::io::Dispatcher::lock",
+ this))) {
+ }
+
+ virtual ~Dispatcher() {
+ ceph_assert(m_dispatches.empty());
+ }
+
+ void shut_down(Context* on_finish) override {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 5) << dendl;
+
+ std::map<DispatchLayer, DispatchMeta> dispatches;
+ {
+ std::unique_lock locker{m_lock};
+ std::swap(dispatches, m_dispatches);
+ }
+
+ for (auto it : dispatches) {
+ shut_down_dispatch(it.second, &on_finish);
+ }
+ on_finish->complete(0);
+ }
+
+ void register_dispatch(Dispatch* dispatch) override {
+ auto cct = m_image_ctx->cct;
+ auto type = dispatch->get_dispatch_layer();
+ ldout(cct, 5) << "dispatch_layer=" << type << dendl;
+
+ std::unique_lock locker{m_lock};
+
+ auto result = m_dispatches.insert(
+ {type, {dispatch, new AsyncOpTracker()}});
+ ceph_assert(result.second);
+ }
+
+ bool exists(DispatchLayer dispatch_layer) override {
+ std::unique_lock locker{m_lock};
+ return m_dispatches.find(dispatch_layer) != m_dispatches.end();
+ }
+
+ void shut_down_dispatch(DispatchLayer dispatch_layer,
+ Context* on_finish) override {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 5) << "dispatch_layer=" << dispatch_layer << dendl;
+
+ DispatchMeta dispatch_meta;
+ {
+ std::unique_lock locker{m_lock};
+ auto it = m_dispatches.find(dispatch_layer);
+ if (it == m_dispatches.end()) {
+ on_finish->complete(0);
+ return;
+ }
+
+ dispatch_meta = it->second;
+ m_dispatches.erase(it);
+ }
+
+ shut_down_dispatch(dispatch_meta, &on_finish);
+ on_finish->complete(0);
+ }
+
+ void send(DispatchSpec* dispatch_spec) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << "dispatch_spec=" << dispatch_spec << dendl;
+
+ auto dispatch_layer = dispatch_spec->dispatch_layer;
+
+ // apply the IO request to all layers -- this method will be re-invoked
+ // by the dispatch layer if continuing / restarting the IO
+ while (true) {
+ m_lock.lock_shared();
+ dispatch_layer = dispatch_spec->dispatch_layer;
+ auto it = m_dispatches.upper_bound(dispatch_layer);
+ if (it == m_dispatches.end()) {
+ // the request is complete if handled by all layers
+ dispatch_spec->dispatch_result = DISPATCH_RESULT_COMPLETE;
+ m_lock.unlock_shared();
+ break;
+ }
+
+ auto& dispatch_meta = it->second;
+ auto dispatch = dispatch_meta.dispatch;
+ auto async_op_tracker = dispatch_meta.async_op_tracker;
+ dispatch_spec->dispatch_result = DISPATCH_RESULT_INVALID;
+
+ // prevent recursive locking back into the dispatcher while handling IO
+ async_op_tracker->start_op();
+ m_lock.unlock_shared();
+
+ // advance to next layer in case we skip or continue
+ dispatch_spec->dispatch_layer = dispatch->get_dispatch_layer();
+
+ bool handled = send_dispatch(dispatch, dispatch_spec);
+ async_op_tracker->finish_op();
+
+ // handled ops will resume when the dispatch ctx is invoked
+ if (handled) {
+ return;
+ }
+ }
+
+ // skipped through to the last layer
+ dispatch_spec->dispatcher_ctx.complete(0);
+ }
+
+protected:
+ struct DispatchMeta {
+ Dispatch* dispatch = nullptr;
+ AsyncOpTracker* async_op_tracker = nullptr;
+
+ DispatchMeta() {
+ }
+ DispatchMeta(Dispatch* dispatch, AsyncOpTracker* async_op_tracker)
+ : dispatch(dispatch), async_op_tracker(async_op_tracker) {
+ }
+ };
+
+ ImageCtxT* m_image_ctx;
+
+ ceph::shared_mutex m_lock;
+ std::map<DispatchLayer, DispatchMeta> m_dispatches;
+
+ virtual bool send_dispatch(Dispatch* dispatch,
+ DispatchSpec* dispatch_spec) = 0;
+
+protected:
+ struct C_LayerIterator : public Context {
+ Dispatcher* dispatcher;
+ Context* on_finish;
+ DispatchLayer dispatch_layer;
+
+ C_LayerIterator(Dispatcher* dispatcher,
+ DispatchLayer start_layer,
+ Context* on_finish)
+ : dispatcher(dispatcher), on_finish(on_finish), dispatch_layer(start_layer) {
+ }
+
+ void complete(int r) override {
+ while (true) {
+ dispatcher->m_lock.lock_shared();
+ auto it = dispatcher->m_dispatches.upper_bound(dispatch_layer);
+ if (it == dispatcher->m_dispatches.end()) {
+ dispatcher->m_lock.unlock_shared();
+ Context::complete(r);
+ return;
+ }
+
+ auto& dispatch_meta = it->second;
+ auto dispatch = dispatch_meta.dispatch;
+
+ // prevent recursive locking back into the dispatcher while handling IO
+ dispatch_meta.async_op_tracker->start_op();
+ dispatcher->m_lock.unlock_shared();
+
+ // next loop should start after current layer
+ dispatch_layer = dispatch->get_dispatch_layer();
+
+ auto handled = execute(dispatch, this);
+ dispatch_meta.async_op_tracker->finish_op();
+
+ if (handled) {
+ break;
+ }
+ }
+ }
+
+ void finish(int r) override {
+ on_finish->complete(0);
+ }
+ virtual bool execute(Dispatch* dispatch,
+ Context* on_finish) = 0;
+ };
+
+ struct C_InvalidateCache : public C_LayerIterator {
+ C_InvalidateCache(Dispatcher* dispatcher, DispatchLayer start_layer, Context* on_finish)
+ : C_LayerIterator(dispatcher, start_layer, on_finish) {
+ }
+
+ bool execute(Dispatch* dispatch,
+ Context* on_finish) override {
+ return dispatch->invalidate_cache(on_finish);
+ }
+ };
+
+private:
+ void shut_down_dispatch(DispatchMeta& dispatch_meta,
+ Context** on_finish) {
+ auto dispatch = dispatch_meta.dispatch;
+ auto async_op_tracker = dispatch_meta.async_op_tracker;
+
+ auto ctx = *on_finish;
+ ctx = new LambdaContext(
+ [dispatch, async_op_tracker, ctx](int r) {
+ delete dispatch;
+ delete async_op_tracker;
+
+ ctx->complete(r);
+ });
+ ctx = new LambdaContext([dispatch, ctx](int r) {
+ dispatch->shut_down(ctx);
+ });
+ *on_finish = new LambdaContext([async_op_tracker, ctx](int r) {
+ async_op_tracker->wait_for_ops(ctx);
+ });
+ }
+
+};
+
+} // namespace io
+} // namespace librbd
+
+#undef dout_subsys
+#undef dout_prefix
+#define dout_prefix *_dout
+
+#endif // CEPH_LIBRBD_IO_DISPATCHER_H
diff --git a/src/librbd/io/DispatcherInterface.h b/src/librbd/io/DispatcherInterface.h
new file mode 100644
index 000000000..2bac9ee75
--- /dev/null
+++ b/src/librbd/io/DispatcherInterface.h
@@ -0,0 +1,37 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IO_DISPATCHER_INTERFACE_H
+#define CEPH_LIBRBD_IO_DISPATCHER_INTERFACE_H
+
+#include "include/int_types.h"
+
+struct Context;
+
+namespace librbd {
+namespace io {
+
+template <typename DispatchT>
+struct DispatcherInterface {
+public:
+ typedef DispatchT Dispatch;
+ typedef typename DispatchT::DispatchLayer DispatchLayer;
+ typedef typename DispatchT::DispatchSpec DispatchSpec;
+
+ virtual ~DispatcherInterface() {
+ }
+
+ virtual void shut_down(Context* on_finish) = 0;
+
+ virtual void register_dispatch(Dispatch* dispatch) = 0;
+ virtual bool exists(DispatchLayer dispatch_layer) = 0;
+ virtual void shut_down_dispatch(DispatchLayer dispatch_layer,
+ Context* on_finish) = 0;
+
+ virtual void send(DispatchSpec* dispatch_spec) = 0;
+};
+
+} // namespace io
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_IO_DISPATCHER_INTERFACE_H
diff --git a/src/librbd/io/FlushTracker.cc b/src/librbd/io/FlushTracker.cc
new file mode 100644
index 000000000..b6e2ed658
--- /dev/null
+++ b/src/librbd/io/FlushTracker.cc
@@ -0,0 +1,126 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/io/FlushTracker.h"
+#include "common/dout.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Utils.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::io::FlushTracker: " << this \
+ << " " << __func__ << ": "
+
+namespace librbd {
+namespace io {
+
+template <typename I>
+FlushTracker<I>::FlushTracker(I* image_ctx)
+ : m_image_ctx(image_ctx),
+ m_lock(ceph::make_shared_mutex(
+ util::unique_lock_name("librbd::io::FlushTracker::m_lock", this))) {
+}
+
+template <typename I>
+FlushTracker<I>::~FlushTracker() {
+ std::unique_lock locker{m_lock};
+ ceph_assert(m_flush_contexts.empty());
+}
+
+template <typename I>
+void FlushTracker<I>::shut_down() {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << dendl;
+
+ std::unique_lock locker{m_lock};
+ Contexts flush_ctxs;
+ for (auto& [flush_tid, ctxs] : m_flush_contexts) {
+ flush_ctxs.insert(flush_ctxs.end(), ctxs.begin(), ctxs.end());
+ }
+ m_flush_contexts.clear();
+ locker.unlock();
+
+ for (auto ctx : flush_ctxs) {
+ ctx->complete(0);
+ }
+}
+
+template <typename I>
+uint64_t FlushTracker<I>::start_io(uint64_t tid) {
+ auto cct = m_image_ctx->cct;
+
+ std::unique_lock locker{m_lock};
+ auto [it, inserted] = m_tid_to_flush_tid.insert({tid, ++m_next_flush_tid});
+ auto flush_tid = it->second;
+ m_in_flight_flush_tids.insert(flush_tid);
+ locker.unlock();
+
+ ldout(cct, 20) << "tid=" << tid << ", flush_tid=" << flush_tid << dendl;
+ return flush_tid;
+}
+
+template <typename I>
+void FlushTracker<I>::finish_io(uint64_t tid) {
+ auto cct = m_image_ctx->cct;
+
+ std::unique_lock locker{m_lock};
+ auto tid_to_flush_tid_it = m_tid_to_flush_tid.find(tid);
+ if (tid_to_flush_tid_it == m_tid_to_flush_tid.end()) {
+ return;
+ }
+
+ auto flush_tid = tid_to_flush_tid_it->second;
+ m_tid_to_flush_tid.erase(tid_to_flush_tid_it);
+ m_in_flight_flush_tids.erase(flush_tid);
+
+ ldout(cct, 20) << "tid=" << tid << ", flush_tid=" << flush_tid << dendl;
+ auto oldest_flush_tid = std::numeric_limits<uint64_t>::max();
+ if (!m_in_flight_flush_tids.empty()) {
+ oldest_flush_tid = *m_in_flight_flush_tids.begin();
+ }
+
+ // all flushes tagged before the oldest tid should be completed
+ Contexts flush_ctxs;
+ auto flush_contexts_it = m_flush_contexts.begin();
+ while (flush_contexts_it != m_flush_contexts.end()) {
+ if (flush_contexts_it->first >= oldest_flush_tid) {
+ ldout(cct, 20) << "pending IOs: [" << m_in_flight_flush_tids << "], "
+ << "pending flushes=" << m_flush_contexts << dendl;
+ break;
+ }
+
+ auto& ctxs = flush_contexts_it->second;
+ flush_ctxs.insert(flush_ctxs.end(), ctxs.begin(), ctxs.end());
+ flush_contexts_it = m_flush_contexts.erase(flush_contexts_it);
+ }
+ locker.unlock();
+
+ if (!flush_ctxs.empty()) {
+ ldout(cct, 20) << "completing flushes: " << flush_ctxs << dendl;
+ for (auto ctx : flush_ctxs) {
+ ctx->complete(0);
+ }
+ }
+}
+
+template <typename I>
+void FlushTracker<I>::flush(Context* on_finish) {
+ auto cct = m_image_ctx->cct;
+
+ std::unique_lock locker{m_lock};
+ if (m_in_flight_flush_tids.empty()) {
+ locker.unlock();
+ on_finish->complete(0);
+ return;
+ }
+
+ auto flush_tid = *m_in_flight_flush_tids.rbegin();
+ m_flush_contexts[flush_tid].push_back(on_finish);
+ ldout(cct, 20) << "flush_tid=" << flush_tid << ", ctx=" << on_finish << ", "
+ << "flush_contexts=" << m_flush_contexts << dendl;
+}
+
+} // namespace io
+} // namespace librbd
+
+template class librbd::io::FlushTracker<librbd::ImageCtx>;
diff --git a/src/librbd/io/FlushTracker.h b/src/librbd/io/FlushTracker.h
new file mode 100644
index 000000000..cc7fcd9ae
--- /dev/null
+++ b/src/librbd/io/FlushTracker.h
@@ -0,0 +1,61 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IO_FLUSH_TRACKER_H
+#define CEPH_LIBRBD_IO_FLUSH_TRACKER_H
+
+#include "include/int_types.h"
+#include "common/ceph_mutex.h"
+#include <atomic>
+#include <list>
+#include <map>
+#include <set>
+#include <unordered_map>
+
+struct Context;
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace io {
+
+struct AioCompletion;
+
+template <typename ImageCtxT>
+class FlushTracker {
+public:
+ FlushTracker(ImageCtxT* image_ctx);
+ ~FlushTracker();
+
+ void shut_down();
+
+ uint64_t start_io(uint64_t tid);
+ void finish_io(uint64_t tid);
+
+ void flush(Context* on_finish);
+
+private:
+ typedef std::list<Context*> Contexts;
+ typedef std::map<uint64_t, Contexts> FlushContexts;
+ typedef std::set<uint64_t> Tids;
+ typedef std::unordered_map<uint64_t, uint64_t> TidToFlushTid;
+
+ ImageCtxT* m_image_ctx;
+
+ std::atomic<uint32_t> m_next_flush_tid{0};
+
+ mutable ceph::shared_mutex m_lock;
+ TidToFlushTid m_tid_to_flush_tid;
+
+ Tids m_in_flight_flush_tids;
+ FlushContexts m_flush_contexts;
+
+};
+
+} // namespace io
+} // namespace librbd
+
+extern template class librbd::io::FlushTracker<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_IO_FLUSH_TRACKER_H
diff --git a/src/librbd/io/ImageDispatch.cc b/src/librbd/io/ImageDispatch.cc
new file mode 100644
index 000000000..12c55cb0c
--- /dev/null
+++ b/src/librbd/io/ImageDispatch.cc
@@ -0,0 +1,200 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/io/ImageDispatch.h"
+#include "common/dout.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/io/AioCompletion.h"
+#include "librbd/io/ImageRequest.h"
+#include "librbd/io/ObjectDispatcherInterface.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::io::ImageDispatch: " << this << " " \
+ << __func__ << ": "
+
+namespace librbd {
+namespace io {
+
+namespace {
+
+void start_in_flight_io(AioCompletion* aio_comp) {
+ // TODO remove AsyncOperation from AioCompletion
+ if (!aio_comp->async_op.started()) {
+ aio_comp->start_op();
+ }
+}
+
+ImageArea get_area(const std::atomic<uint32_t>* image_dispatch_flags) {
+ return (*image_dispatch_flags & IMAGE_DISPATCH_FLAG_CRYPTO_HEADER ?
+ ImageArea::CRYPTO_HEADER : ImageArea::DATA);
+}
+
+} // anonymous namespace
+
+template <typename I>
+void ImageDispatch<I>::shut_down(Context* on_finish) {
+ on_finish->complete(0);
+}
+
+template <typename I>
+bool ImageDispatch<I>::read(
+ AioCompletion* aio_comp, Extents &&image_extents, ReadResult &&read_result,
+ IOContext io_context, int op_flags, int read_flags,
+ const ZTracer::Trace &parent_trace, uint64_t tid,
+ std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ auto area = get_area(image_dispatch_flags);
+ ldout(cct, 20) << "image_extents=" << image_extents
+ << " area=" << area << dendl;
+
+ start_in_flight_io(aio_comp);
+
+ *dispatch_result = DISPATCH_RESULT_COMPLETE;
+ ImageRequest<I>::aio_read(m_image_ctx, aio_comp, std::move(image_extents),
+ area, std::move(read_result), io_context, op_flags,
+ read_flags, parent_trace);
+ return true;
+}
+
+template <typename I>
+bool ImageDispatch<I>::write(
+ AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&bl,
+ int op_flags, const ZTracer::Trace &parent_trace,
+ uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ auto area = get_area(image_dispatch_flags);
+ ldout(cct, 20) << "image_extents=" << image_extents
+ << " area=" << area << dendl;
+
+ start_in_flight_io(aio_comp);
+
+ *dispatch_result = DISPATCH_RESULT_COMPLETE;
+ ImageRequest<I>::aio_write(m_image_ctx, aio_comp, std::move(image_extents),
+ area, std::move(bl), op_flags, parent_trace);
+ return true;
+}
+
+template <typename I>
+bool ImageDispatch<I>::discard(
+ AioCompletion* aio_comp, Extents &&image_extents,
+ uint32_t discard_granularity_bytes, const ZTracer::Trace &parent_trace,
+ uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ auto area = get_area(image_dispatch_flags);
+ ldout(cct, 20) << "image_extents=" << image_extents
+ << " area=" << area << dendl;
+
+ start_in_flight_io(aio_comp);
+
+ *dispatch_result = DISPATCH_RESULT_COMPLETE;
+ ImageRequest<I>::aio_discard(m_image_ctx, aio_comp, std::move(image_extents),
+ area, discard_granularity_bytes, parent_trace);
+ return true;
+}
+
+template <typename I>
+bool ImageDispatch<I>::write_same(
+ AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&bl,
+ int op_flags, const ZTracer::Trace &parent_trace,
+ uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ auto area = get_area(image_dispatch_flags);
+ ldout(cct, 20) << "image_extents=" << image_extents
+ << " area=" << area << dendl;
+
+ start_in_flight_io(aio_comp);
+
+ *dispatch_result = DISPATCH_RESULT_COMPLETE;
+ ImageRequest<I>::aio_writesame(m_image_ctx, aio_comp,
+ std::move(image_extents), area, std::move(bl),
+ op_flags, parent_trace);
+ return true;
+}
+
+template <typename I>
+bool ImageDispatch<I>::compare_and_write(
+ AioCompletion* aio_comp, Extents &&image_extents,
+ bufferlist &&cmp_bl, bufferlist &&bl, uint64_t *mismatch_offset,
+ int op_flags, const ZTracer::Trace &parent_trace,
+ uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ auto area = get_area(image_dispatch_flags);
+ ldout(cct, 20) << "image_extents=" << image_extents
+ << " area=" << area << dendl;
+
+ start_in_flight_io(aio_comp);
+
+ *dispatch_result = DISPATCH_RESULT_COMPLETE;
+ ImageRequest<I>::aio_compare_and_write(m_image_ctx, aio_comp,
+ std::move(image_extents), area,
+ std::move(cmp_bl), std::move(bl),
+ mismatch_offset, op_flags,
+ parent_trace);
+ return true;
+}
+
+template <typename I>
+bool ImageDispatch<I>::flush(
+ AioCompletion* aio_comp, FlushSource flush_source,
+ const ZTracer::Trace &parent_trace, uint64_t tid,
+ std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << dendl;
+
+ start_in_flight_io(aio_comp);
+
+ *dispatch_result = DISPATCH_RESULT_COMPLETE;
+ ImageRequest<I>::aio_flush(m_image_ctx, aio_comp, flush_source, parent_trace);
+ return true;
+}
+
+template <typename I>
+bool ImageDispatch<I>::list_snaps(
+ AioCompletion* aio_comp, Extents&& image_extents, SnapIds&& snap_ids,
+ int list_snaps_flags, SnapshotDelta* snapshot_delta,
+ const ZTracer::Trace &parent_trace, uint64_t tid,
+ std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ auto area = get_area(image_dispatch_flags);
+ ldout(cct, 20) << "image_extents=" << image_extents
+ << " area=" << area << dendl;
+
+ start_in_flight_io(aio_comp);
+
+ *dispatch_result = DISPATCH_RESULT_COMPLETE;
+ ImageListSnapsRequest<I> req(*m_image_ctx, aio_comp, std::move(image_extents),
+ area, std::move(snap_ids), list_snaps_flags,
+ snapshot_delta, parent_trace);
+ req.send();
+ return true;
+}
+
+template <typename I>
+bool ImageDispatch<I>::invalidate_cache(Context* on_finish) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << dendl;
+
+ std::shared_lock owner_lock{m_image_ctx->owner_lock};
+ m_image_ctx->io_object_dispatcher->invalidate_cache(on_finish);
+ return true;
+}
+
+} // namespace io
+} // namespace librbd
+
+template class librbd::io::ImageDispatch<librbd::ImageCtx>;
diff --git a/src/librbd/io/ImageDispatch.h b/src/librbd/io/ImageDispatch.h
new file mode 100644
index 000000000..4a89c6054
--- /dev/null
+++ b/src/librbd/io/ImageDispatch.h
@@ -0,0 +1,95 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IO_IMAGE_DISPATCH_H
+#define CEPH_LIBRBD_IO_IMAGE_DISPATCH_H
+
+#include "librbd/io/ImageDispatchInterface.h"
+#include "include/int_types.h"
+#include "include/buffer.h"
+#include "common/zipkin_trace.h"
+#include "librbd/io/ReadResult.h"
+#include "librbd/io/Types.h"
+
+struct Context;
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace io {
+
+struct AioCompletion;
+
+// Core (bottom-most) layer of the image dispatch chain.  Converts the
+// generic dispatched IO into concrete ImageRequest operations against the
+// object dispatcher.  Registered at IMAGE_DISPATCH_LAYER_CORE.
+template <typename ImageCtxT>
+class ImageDispatch : public ImageDispatchInterface {
+public:
+  // NOTE(review): single-argument ctor is implicit; consider marking explicit.
+  ImageDispatch(ImageCtxT* image_ctx) : m_image_ctx(image_ctx) {
+  }
+
+  ImageDispatchLayer get_dispatch_layer() const override {
+    return IMAGE_DISPATCH_LAYER_CORE;
+  }
+
+  void shut_down(Context* on_finish) override;
+
+  // Per-operation handlers (see ImageDispatchInterface for the contract):
+  // each starts in-flight IO tracking, sets *dispatch_result to COMPLETE and
+  // issues the matching ImageRequest.
+  bool read(
+      AioCompletion* aio_comp, Extents &&image_extents,
+      ReadResult &&read_result, IOContext io_context, int op_flags,
+      int read_flags, const ZTracer::Trace &parent_trace, uint64_t tid,
+      std::atomic<uint32_t>* image_dispatch_flags,
+      DispatchResult* dispatch_result, Context** on_finish,
+      Context* on_dispatched) override;
+  bool write(
+      AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&bl,
+      int op_flags, const ZTracer::Trace &parent_trace,
+      uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
+      DispatchResult* dispatch_result, Context** on_finish,
+      Context* on_dispatched) override;
+  bool discard(
+      AioCompletion* aio_comp, Extents &&image_extents,
+      uint32_t discard_granularity_bytes, const ZTracer::Trace &parent_trace,
+      uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
+      DispatchResult* dispatch_result, Context** on_finish,
+      Context* on_dispatched) override;
+  bool write_same(
+      AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&bl,
+      int op_flags, const ZTracer::Trace &parent_trace,
+      uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
+      DispatchResult* dispatch_result, Context** on_finish,
+      Context* on_dispatched) override;
+  bool compare_and_write(
+      AioCompletion* aio_comp, Extents &&image_extents,
+      bufferlist &&cmp_bl, bufferlist &&bl, uint64_t *mismatch_offset,
+      int op_flags, const ZTracer::Trace &parent_trace,
+      uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
+      DispatchResult* dispatch_result, Context** on_finish,
+      Context* on_dispatched) override;
+  bool flush(
+      AioCompletion* aio_comp, FlushSource flush_source,
+      const ZTracer::Trace &parent_trace, uint64_t tid,
+      std::atomic<uint32_t>* image_dispatch_flags,
+      DispatchResult* dispatch_result, Context** on_finish,
+      Context* on_dispatched) override;
+
+  bool list_snaps(
+      AioCompletion* aio_comp, Extents&& image_extents, SnapIds&& snap_ids,
+      int list_snaps_flags, SnapshotDelta* snapshot_delta,
+      const ZTracer::Trace &parent_trace, uint64_t tid,
+      std::atomic<uint32_t>* image_dispatch_flags,
+      DispatchResult* dispatch_result, Context** on_finish,
+      Context* on_dispatched) override;
+
+  bool invalidate_cache(Context* on_finish) override;
+
+private:
+  // borrowed pointer; lifetime owned by the enclosing image context machinery
+  ImageCtxT* m_image_ctx;
+
+};
+
+} // namespace io
+} // namespace librbd
+
+extern template class librbd::io::ImageDispatch<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_IO_IMAGE_DISPATCH_H
diff --git a/src/librbd/io/ImageDispatchInterface.h b/src/librbd/io/ImageDispatchInterface.h
new file mode 100644
index 000000000..e479f7eef
--- /dev/null
+++ b/src/librbd/io/ImageDispatchInterface.h
@@ -0,0 +1,87 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IO_IMAGE_DISPATCH_INTERFACE_H
+#define CEPH_LIBRBD_IO_IMAGE_DISPATCH_INTERFACE_H
+
+#include "include/int_types.h"
+#include "include/buffer.h"
+#include "common/zipkin_trace.h"
+#include "librbd/Types.h"
+#include "librbd/io/ReadResult.h"
+#include "librbd/io/Types.h"
+#include <atomic>
+
+struct Context;
+
+namespace librbd {
+namespace io {
+
+struct AioCompletion;
+struct ImageDispatchSpec;
+
+// Interface implemented by every layer of the image IO dispatch chain
+// (core, queue, QoS, refresh, write-block, crypto, ...).  Each handler
+// returns true if the layer took ownership of the request, false to let
+// the dispatcher pass it to the next layer.  'dispatch_result' tells the
+// dispatcher how to continue (COMPLETE / CONTINUE / RESTART), and
+// 'on_dispatched' is the continuation to fire when the layer finishes.
+struct ImageDispatchInterface {
+  typedef ImageDispatchLayer DispatchLayer;
+  typedef ImageDispatchSpec DispatchSpec;
+
+  virtual ~ImageDispatchInterface() {
+  }
+
+  // position of this layer within the ordered dispatch chain
+  virtual ImageDispatchLayer get_dispatch_layer() const = 0;
+
+  virtual void shut_down(Context* on_finish) = 0;
+
+  virtual bool read(
+      AioCompletion* aio_comp, Extents &&image_extents,
+      ReadResult &&read_result, IOContext io_context, int op_flags,
+      int read_flags, const ZTracer::Trace &parent_trace, uint64_t tid,
+      std::atomic<uint32_t>* image_dispatch_flags,
+      DispatchResult* dispatch_result, Context** on_finish,
+      Context* on_dispatched) = 0;
+  virtual bool write(
+      AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&bl,
+      int op_flags, const ZTracer::Trace &parent_trace,
+      uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
+      DispatchResult* dispatch_result, Context** on_finish,
+      Context* on_dispatched) = 0;
+  virtual bool discard(
+      AioCompletion* aio_comp, Extents &&image_extents,
+      uint32_t discard_granularity_bytes, const ZTracer::Trace &parent_trace,
+      uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
+      DispatchResult* dispatch_result, Context** on_finish,
+      Context* on_dispatched) = 0;
+  virtual bool write_same(
+      AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&bl,
+      int op_flags, const ZTracer::Trace &parent_trace,
+      uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
+      DispatchResult* dispatch_result, Context** on_finish,
+      Context* on_dispatched) = 0;
+  virtual bool compare_and_write(
+      AioCompletion* aio_comp, Extents &&image_extents,
+      bufferlist &&cmp_bl, bufferlist &&bl, uint64_t *mismatch_offset,
+      int op_flags, const ZTracer::Trace &parent_trace,
+      uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
+      DispatchResult* dispatch_result, Context** on_finish,
+      Context* on_dispatched) = 0;
+  virtual bool flush(
+      AioCompletion* aio_comp, FlushSource flush_source,
+      const ZTracer::Trace &parent_trace, uint64_t tid,
+      std::atomic<uint32_t>* image_dispatch_flags,
+      DispatchResult* dispatch_result, Context** on_finish,
+      Context* on_dispatched) = 0;
+
+  virtual bool list_snaps(
+      AioCompletion* aio_comp, Extents&& image_extents, SnapIds&& snap_ids,
+      int list_snaps_flags, SnapshotDelta* snapshot_delta,
+      const ZTracer::Trace &parent_trace, uint64_t tid,
+      std::atomic<uint32_t>* image_dispatch_flags,
+      DispatchResult* dispatch_result, Context** on_finish,
+      Context* on_dispatched) = 0;
+
+  virtual bool invalidate_cache(Context* on_finish) = 0;
+};
+
+} // namespace io
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_IO_IMAGE_DISPATCH_INTERFACE_H
diff --git a/src/librbd/io/ImageDispatchSpec.cc b/src/librbd/io/ImageDispatchSpec.cc
new file mode 100644
index 000000000..95d8224ae
--- /dev/null
+++ b/src/librbd/io/ImageDispatchSpec.cc
@@ -0,0 +1,54 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/io/ImageDispatchSpec.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/io/AioCompletion.h"
+#include "librbd/io/ImageRequest.h"
+#include "librbd/io/ImageDispatcherInterface.h"
+#include <boost/variant.hpp>
+
+namespace librbd {
+namespace io {
+
+// Invoked when a dispatch layer finishes handling the spec.  Interprets the
+// layer's dispatch_result:
+//  * RESTART  - step back one layer, then fall through to CONTINUE handling
+//  * CONTINUE - on error, fail the whole IO; otherwise re-send the spec so
+//               the next layer gets a chance to process it
+//  * COMPLETE - IO is done; finish() destroys the spec
+// Note: complete() is overridden so the base Context does NOT delete 'this';
+// the spec (which embeds this context) is deleted only via finish().
+void ImageDispatchSpec::C_Dispatcher::complete(int r) {
+  switch (image_dispatch_spec->dispatch_result) {
+  case DISPATCH_RESULT_RESTART:
+    // cannot restart below the first layer
+    ceph_assert(image_dispatch_spec->dispatch_layer != 0);
+    image_dispatch_spec->dispatch_layer = static_cast<ImageDispatchLayer>(
+      image_dispatch_spec->dispatch_layer - 1);
+    [[fallthrough]];
+  case DISPATCH_RESULT_CONTINUE:
+    if (r < 0) {
+      // bubble dispatch failure through AioCompletion
+      image_dispatch_spec->dispatch_result = DISPATCH_RESULT_COMPLETE;
+      image_dispatch_spec->fail(r);
+      return;
+    }
+
+    image_dispatch_spec->send();
+    break;
+  case DISPATCH_RESULT_COMPLETE:
+    finish(r);
+    break;
+  case DISPATCH_RESULT_INVALID:
+    ceph_abort();
+    break;
+  }
+}
+
+// Destroys the owning spec (and therefore this embedded context as well);
+// the spec's destructor drops the AioCompletion reference taken on create.
+void ImageDispatchSpec::C_Dispatcher::finish(int r) {
+  delete image_dispatch_spec;
+}
+
+// (Re-)submit this spec to the image dispatcher, which routes it to the
+// next applicable layer.
+void ImageDispatchSpec::send() {
+  image_dispatcher->send(this);
+}
+
+// Abort the IO with error 'r': mark the chain complete and propagate the
+// failure through the AioCompletion.
+void ImageDispatchSpec::fail(int r) {
+  dispatch_result = DISPATCH_RESULT_COMPLETE;
+  aio_comp->fail(r);
+}
+
+} // namespace io
+} // namespace librbd
diff --git a/src/librbd/io/ImageDispatchSpec.h b/src/librbd/io/ImageDispatchSpec.h
new file mode 100644
index 000000000..9323f9879
--- /dev/null
+++ b/src/librbd/io/ImageDispatchSpec.h
@@ -0,0 +1,254 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IO_IMAGE_DISPATCH_SPEC_H
+#define CEPH_LIBRBD_IO_IMAGE_DISPATCH_SPEC_H
+
+#include "include/int_types.h"
+#include "include/buffer.h"
+#include "include/Context.h"
+#include "common/zipkin_trace.h"
+#include "librbd/io/AioCompletion.h"
+#include "librbd/io/Types.h"
+#include "librbd/io/ReadResult.h"
+#include <boost/variant/variant.hpp>
+#include <atomic>
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace io {
+
+struct ImageDispatcherInterface;
+
+// A single in-flight image IO request as it travels through the layered
+// image dispatch chain.  Holds the operation payload (as a boost::variant),
+// the extents it covers, tracing/QoS metadata, and the embedded completion
+// context used to advance between layers.  Created via the static create_*
+// factories; self-deletes when the dispatch chain completes.
+class ImageDispatchSpec {
+private:
+  // helper to avoid extra heap allocation per object IO
+  struct C_Dispatcher : public Context {
+    ImageDispatchSpec* image_dispatch_spec;
+
+    C_Dispatcher(ImageDispatchSpec* image_dispatch_spec)
+      : image_dispatch_spec(image_dispatch_spec) {
+    }
+
+    void complete(int r) override;
+    void finish(int r) override;
+  };
+
+public:
+  // ---- per-operation payload types (one variant alternative each) ----
+
+  struct Read {
+    ReadResult read_result;
+    int read_flags;
+
+    Read(ReadResult &&read_result, int read_flags)
+      : read_result(std::move(read_result)), read_flags(read_flags) {
+    }
+  };
+
+  struct Discard {
+    uint32_t discard_granularity_bytes;
+
+    Discard(uint32_t discard_granularity_bytes)
+      : discard_granularity_bytes(discard_granularity_bytes) {
+    }
+  };
+
+  struct Write {
+    bufferlist bl;
+
+    Write(bufferlist&& bl) : bl(std::move(bl)) {
+    }
+  };
+
+  struct WriteSame {
+    bufferlist bl;
+
+    WriteSame(bufferlist&& bl) : bl(std::move(bl)) {
+    }
+  };
+
+  struct CompareAndWrite {
+    bufferlist cmp_bl;
+    bufferlist bl;
+    uint64_t *mismatch_offset;
+
+    CompareAndWrite(bufferlist&& cmp_bl, bufferlist&& bl,
+                    uint64_t *mismatch_offset)
+      : cmp_bl(std::move(cmp_bl)), bl(std::move(bl)),
+        mismatch_offset(mismatch_offset) {
+    }
+  };
+
+  struct Flush {
+    FlushSource flush_source;
+
+    Flush(FlushSource flush_source) : flush_source(flush_source) {
+    }
+  };
+
+  struct ListSnaps {
+    SnapIds snap_ids;
+    int list_snaps_flags;
+    SnapshotDelta* snapshot_delta;
+
+    ListSnaps(SnapIds&& snap_ids, int list_snaps_flags,
+              SnapshotDelta* snapshot_delta)
+      : snap_ids(std::move(snap_ids)), list_snaps_flags(list_snaps_flags),
+        snapshot_delta(snapshot_delta) {
+    }
+  };
+
+  typedef boost::variant<Read,
+                         Discard,
+                         Write,
+                         WriteSame,
+                         CompareAndWrite,
+                         Flush,
+                         ListSnaps> Request;
+
+  // embedded continuation fired by each dispatch layer
+  C_Dispatcher dispatcher_ctx;
+
+  ImageDispatcherInterface* image_dispatcher;
+  // layer currently responsible for this spec; decremented on RESTART
+  ImageDispatchLayer dispatch_layer;
+  std::atomic<uint32_t> image_dispatch_flags = 0;
+  DispatchResult dispatch_result = DISPATCH_RESULT_INVALID;
+
+  AioCompletion* aio_comp;
+  Extents image_extents;
+  Request request;
+  IOContext io_context;
+  int op_flags;
+  ZTracer::Trace parent_trace;
+  // assigned by the dispatcher on first dispatch (0 == not yet dispatched)
+  uint64_t tid = 0;
+
+  // ---- factory methods: one per operation type ----
+
+  template <typename ImageCtxT = ImageCtx>
+  static ImageDispatchSpec* create_read(
+      ImageCtxT &image_ctx, ImageDispatchLayer image_dispatch_layer,
+      AioCompletion *aio_comp, Extents &&image_extents, ImageArea area,
+      ReadResult &&read_result, IOContext io_context, int op_flags,
+      int read_flags, const ZTracer::Trace &parent_trace) {
+    return new ImageDispatchSpec(image_ctx.io_image_dispatcher,
+                                 image_dispatch_layer, aio_comp,
+                                 std::move(image_extents), area,
+                                 Read{std::move(read_result), read_flags},
+                                 io_context, op_flags, parent_trace);
+  }
+
+  template <typename ImageCtxT = ImageCtx>
+  static ImageDispatchSpec* create_discard(
+      ImageCtxT &image_ctx, ImageDispatchLayer image_dispatch_layer,
+      AioCompletion *aio_comp, Extents &&image_extents, ImageArea area,
+      uint32_t discard_granularity_bytes, const ZTracer::Trace &parent_trace) {
+    return new ImageDispatchSpec(image_ctx.io_image_dispatcher,
+                                 image_dispatch_layer, aio_comp,
+                                 std::move(image_extents), area,
+                                 Discard{discard_granularity_bytes},
+                                 {}, 0, parent_trace);
+  }
+
+  template <typename ImageCtxT = ImageCtx>
+  static ImageDispatchSpec* create_write(
+      ImageCtxT &image_ctx, ImageDispatchLayer image_dispatch_layer,
+      AioCompletion *aio_comp, Extents &&image_extents, ImageArea area,
+      bufferlist &&bl, int op_flags, const ZTracer::Trace &parent_trace) {
+    return new ImageDispatchSpec(image_ctx.io_image_dispatcher,
+                                 image_dispatch_layer, aio_comp,
+                                 std::move(image_extents), area,
+                                 Write{std::move(bl)},
+                                 {}, op_flags, parent_trace);
+  }
+
+  template <typename ImageCtxT = ImageCtx>
+  static ImageDispatchSpec* create_write_same(
+      ImageCtxT &image_ctx, ImageDispatchLayer image_dispatch_layer,
+      AioCompletion *aio_comp, Extents &&image_extents, ImageArea area,
+      bufferlist &&bl, int op_flags, const ZTracer::Trace &parent_trace) {
+    return new ImageDispatchSpec(image_ctx.io_image_dispatcher,
+                                 image_dispatch_layer, aio_comp,
+                                 std::move(image_extents), area,
+                                 WriteSame{std::move(bl)},
+                                 {}, op_flags, parent_trace);
+  }
+
+  template <typename ImageCtxT = ImageCtx>
+  static ImageDispatchSpec* create_compare_and_write(
+      ImageCtxT &image_ctx, ImageDispatchLayer image_dispatch_layer,
+      AioCompletion *aio_comp, Extents &&image_extents, ImageArea area,
+      bufferlist &&cmp_bl, bufferlist &&bl, uint64_t *mismatch_offset,
+      int op_flags, const ZTracer::Trace &parent_trace) {
+    return new ImageDispatchSpec(image_ctx.io_image_dispatcher,
+                                 image_dispatch_layer, aio_comp,
+                                 std::move(image_extents), area,
+                                 CompareAndWrite{std::move(cmp_bl),
+                                                 std::move(bl),
+                                                 mismatch_offset},
+                                 {}, op_flags, parent_trace);
+  }
+
+  template <typename ImageCtxT = ImageCtx>
+  static ImageDispatchSpec* create_flush(
+      ImageCtxT &image_ctx, ImageDispatchLayer image_dispatch_layer,
+      AioCompletion *aio_comp, FlushSource flush_source,
+      const ZTracer::Trace &parent_trace) {
+    return new ImageDispatchSpec(image_ctx.io_image_dispatcher,
+                                 image_dispatch_layer, aio_comp, {},
+                                 ImageArea::DATA /* dummy for {} */,
+                                 Flush{flush_source}, {}, 0, parent_trace);
+  }
+
+  template <typename ImageCtxT = ImageCtx>
+  static ImageDispatchSpec* create_list_snaps(
+      ImageCtxT &image_ctx, ImageDispatchLayer image_dispatch_layer,
+      AioCompletion *aio_comp, Extents &&image_extents, ImageArea area,
+      SnapIds&& snap_ids, int list_snaps_flags, SnapshotDelta* snapshot_delta,
+      const ZTracer::Trace &parent_trace) {
+    return new ImageDispatchSpec(image_ctx.io_image_dispatcher,
+                                 image_dispatch_layer, aio_comp,
+                                 std::move(image_extents), area,
+                                 ListSnaps{std::move(snap_ids),
+                                           list_snaps_flags, snapshot_delta},
+                                 {}, 0, parent_trace);
+  }
+
+  // drops the AioCompletion reference acquired in the constructor
+  ~ImageDispatchSpec() {
+    aio_comp->put();
+  }
+
+  void send();
+  void fail(int r);
+
+private:
+  struct SendVisitor;
+  struct IsWriteOpVisitor;
+  struct TokenRequestedVisitor;
+
+  // Private: instances must be created through the create_* factories.
+  // Registers the embedded dispatcher context with the completion, takes a
+  // completion reference, and records the image area in the dispatch flags.
+  ImageDispatchSpec(ImageDispatcherInterface* image_dispatcher,
+                    ImageDispatchLayer image_dispatch_layer,
+                    AioCompletion* aio_comp, Extents&& image_extents,
+                    ImageArea area, Request&& request, IOContext io_context,
+                    int op_flags, const ZTracer::Trace& parent_trace)
+    : dispatcher_ctx(this), image_dispatcher(image_dispatcher),
+      dispatch_layer(image_dispatch_layer), aio_comp(aio_comp),
+      image_extents(std::move(image_extents)), request(std::move(request)),
+      io_context(io_context), op_flags(op_flags), parent_trace(parent_trace) {
+    // a completion may be bound to at most one dispatch spec at a time
+    ceph_assert(aio_comp->image_dispatcher_ctx == nullptr);
+    aio_comp->image_dispatcher_ctx = &dispatcher_ctx;
+    aio_comp->get();
+
+    switch (area) {
+    case ImageArea::DATA:
+      break;
+    case ImageArea::CRYPTO_HEADER:
+      image_dispatch_flags |= IMAGE_DISPATCH_FLAG_CRYPTO_HEADER;
+      break;
+    default:
+      ceph_abort();
+    }
+  }
+};
+
+} // namespace io
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_IO_IMAGE_DISPATCH_SPEC_H
diff --git a/src/librbd/io/ImageDispatcher.cc b/src/librbd/io/ImageDispatcher.cc
new file mode 100644
index 000000000..4aa7929b2
--- /dev/null
+++ b/src/librbd/io/ImageDispatcher.cc
@@ -0,0 +1,324 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/io/ImageDispatcher.h"
+#include "include/Context.h"
+#include "common/AsyncOpTracker.h"
+#include "common/dout.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/crypto/CryptoImageDispatch.h"
+#include "librbd/io/ImageDispatch.h"
+#include "librbd/io/ImageDispatchInterface.h"
+#include "librbd/io/ImageDispatchSpec.h"
+#include "librbd/io/QueueImageDispatch.h"
+#include "librbd/io/QosImageDispatch.h"
+#include "librbd/io/RefreshImageDispatch.h"
+#include "librbd/io/Utils.h"
+#include "librbd/io/WriteBlockImageDispatch.h"
+#include <boost/variant.hpp>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::io::ImageDispatcher: " << this \
+ << " " << __func__ << ": "
+
+namespace librbd {
+namespace io {
+
+// Variant visitor that unpacks an ImageDispatchSpec's Request payload and
+// invokes the matching typed handler on a dispatch layer.  Extents and
+// buffers are moved out of the spec into the layer call; each handler's
+// return value (handled / pass-through) is propagated to the dispatcher.
+template <typename I>
+struct ImageDispatcher<I>::SendVisitor : public boost::static_visitor<bool> {
+  ImageDispatchInterface* image_dispatch;
+  ImageDispatchSpec* image_dispatch_spec;
+
+  SendVisitor(ImageDispatchInterface* image_dispatch,
+              ImageDispatchSpec* image_dispatch_spec)
+    : image_dispatch(image_dispatch),
+      image_dispatch_spec(image_dispatch_spec) {
+  }
+
+  bool operator()(ImageDispatchSpec::Read& read) const {
+    return image_dispatch->read(
+      image_dispatch_spec->aio_comp,
+      std::move(image_dispatch_spec->image_extents),
+      std::move(read.read_result), image_dispatch_spec->io_context,
+      image_dispatch_spec->op_flags, read.read_flags,
+      image_dispatch_spec->parent_trace, image_dispatch_spec->tid,
+      &image_dispatch_spec->image_dispatch_flags,
+      &image_dispatch_spec->dispatch_result,
+      &image_dispatch_spec->aio_comp->image_dispatcher_ctx,
+      &image_dispatch_spec->dispatcher_ctx);
+  }
+
+  bool operator()(ImageDispatchSpec::Discard& discard) const {
+    return image_dispatch->discard(
+      image_dispatch_spec->aio_comp,
+      std::move(image_dispatch_spec->image_extents),
+      discard.discard_granularity_bytes, image_dispatch_spec->parent_trace,
+      image_dispatch_spec->tid, &image_dispatch_spec->image_dispatch_flags,
+      &image_dispatch_spec->dispatch_result,
+      &image_dispatch_spec->aio_comp->image_dispatcher_ctx,
+      &image_dispatch_spec->dispatcher_ctx);
+  }
+
+  bool operator()(ImageDispatchSpec::Write& write) const {
+    return image_dispatch->write(
+      image_dispatch_spec->aio_comp,
+      std::move(image_dispatch_spec->image_extents), std::move(write.bl),
+      image_dispatch_spec->op_flags, image_dispatch_spec->parent_trace,
+      image_dispatch_spec->tid, &image_dispatch_spec->image_dispatch_flags,
+      &image_dispatch_spec->dispatch_result,
+      &image_dispatch_spec->aio_comp->image_dispatcher_ctx,
+      &image_dispatch_spec->dispatcher_ctx);
+  }
+
+  bool operator()(ImageDispatchSpec::WriteSame& write_same) const {
+    return image_dispatch->write_same(
+      image_dispatch_spec->aio_comp,
+      std::move(image_dispatch_spec->image_extents), std::move(write_same.bl),
+      image_dispatch_spec->op_flags, image_dispatch_spec->parent_trace,
+      image_dispatch_spec->tid, &image_dispatch_spec->image_dispatch_flags,
+      &image_dispatch_spec->dispatch_result,
+      &image_dispatch_spec->aio_comp->image_dispatcher_ctx,
+      &image_dispatch_spec->dispatcher_ctx);
+  }
+
+  bool operator()(
+      ImageDispatchSpec::CompareAndWrite& compare_and_write) const {
+    return image_dispatch->compare_and_write(
+      image_dispatch_spec->aio_comp,
+      std::move(image_dispatch_spec->image_extents),
+      std::move(compare_and_write.cmp_bl), std::move(compare_and_write.bl),
+      compare_and_write.mismatch_offset,
+      image_dispatch_spec->op_flags, image_dispatch_spec->parent_trace,
+      image_dispatch_spec->tid, &image_dispatch_spec->image_dispatch_flags,
+      &image_dispatch_spec->dispatch_result,
+      &image_dispatch_spec->aio_comp->image_dispatcher_ctx,
+      &image_dispatch_spec->dispatcher_ctx);
+  }
+
+  bool operator()(ImageDispatchSpec::Flush& flush) const {
+    return image_dispatch->flush(
+      image_dispatch_spec->aio_comp, flush.flush_source,
+      image_dispatch_spec->parent_trace, image_dispatch_spec->tid,
+      &image_dispatch_spec->image_dispatch_flags,
+      &image_dispatch_spec->dispatch_result,
+      &image_dispatch_spec->aio_comp->image_dispatcher_ctx,
+      &image_dispatch_spec->dispatcher_ctx);
+  }
+
+  bool operator()(ImageDispatchSpec::ListSnaps& list_snaps) const {
+    return image_dispatch->list_snaps(
+      image_dispatch_spec->aio_comp,
+      std::move(image_dispatch_spec->image_extents),
+      std::move(list_snaps.snap_ids), list_snaps.list_snaps_flags,
+      list_snaps.snapshot_delta, image_dispatch_spec->parent_trace,
+      image_dispatch_spec->tid, &image_dispatch_spec->image_dispatch_flags,
+      &image_dispatch_spec->dispatch_result,
+      &image_dispatch_spec->aio_comp->image_dispatcher_ctx,
+      &image_dispatch_spec->dispatcher_ctx);
+  }
+};
+
+// Variant visitor run once per spec (on first dispatch) before any layer
+// sees it.  Clips extents to the image size and rejects modifying ops on
+// read-only images/snapshots.  Returns true if the request was finished
+// (failed) during preprocessing, false to proceed with dispatch.
+template <typename I>
+struct ImageDispatcher<I>::PreprocessVisitor
+  : public boost::static_visitor<bool> {
+  ImageDispatcher<I>* image_dispatcher;
+  ImageDispatchSpec* image_dispatch_spec;
+
+  PreprocessVisitor(ImageDispatcher<I>* image_dispatcher,
+                    ImageDispatchSpec* image_dispatch_spec)
+    : image_dispatcher(image_dispatcher),
+      image_dispatch_spec(image_dispatch_spec) {
+  }
+
+  // Clip the spec's extents to the bounds of the relevant image area;
+  // fails the spec (and returns true) if clipping reports an error.
+  bool clip_request() const {
+    auto area = (image_dispatch_spec->image_dispatch_flags &
+                   IMAGE_DISPATCH_FLAG_CRYPTO_HEADER ? ImageArea::CRYPTO_HEADER :
+                                                       ImageArea::DATA);
+    int r = util::clip_request(image_dispatcher->m_image_ctx,
+                               &image_dispatch_spec->image_extents, area);
+    if (r < 0) {
+      image_dispatch_spec->fail(r);
+      return true;
+    }
+    return false;
+  }
+
+  bool operator()(ImageDispatchSpec::Read& read) const {
+    // some callers (e.g. internal reads) explicitly opt out of clipping
+    if ((read.read_flags & READ_FLAG_DISABLE_CLIPPING) != 0) {
+      return false;
+    }
+    return clip_request();
+  }
+
+  bool operator()(ImageDispatchSpec::Flush&) const {
+    return clip_request();
+  }
+
+  bool operator()(ImageDispatchSpec::ListSnaps&) const {
+    // snapshot listing is never clipped nor write-checked
+    return false;
+  }
+
+  // all remaining (modifying) operations: clip, then refuse writes to a
+  // snapshot or a read-only image
+  template <typename T>
+  bool operator()(T&) const {
+    if (clip_request()) {
+      return true;
+    }
+
+    std::shared_lock image_locker{image_dispatcher->m_image_ctx->image_lock};
+    if (image_dispatcher->m_image_ctx->snap_id != CEPH_NOSNAP ||
+        image_dispatcher->m_image_ctx->read_only) {
+      image_dispatch_spec->fail(-EROFS);
+      return true;
+    }
+    return false;
+  }
+};
+
+// Constructs the dispatcher and registers the built-in layers: core
+// (terminal handler), queue, QoS, refresh and write-block.  QoS and
+// write-block instances are kept as members so their controls can be
+// delegated to later.
+template <typename I>
+ImageDispatcher<I>::ImageDispatcher(I* image_ctx)
+  : Dispatcher<I, ImageDispatcherInterface>(image_ctx) {
+  // configure the core image dispatch handler on startup
+  auto image_dispatch = new ImageDispatch(image_ctx);
+  this->register_dispatch(image_dispatch);
+
+  auto queue_image_dispatch = new QueueImageDispatch(image_ctx);
+  this->register_dispatch(queue_image_dispatch);
+
+  m_qos_image_dispatch = new QosImageDispatch<I>(image_ctx);
+  this->register_dispatch(m_qos_image_dispatch);
+
+  auto refresh_image_dispatch = new RefreshImageDispatch(image_ctx);
+  this->register_dispatch(refresh_image_dispatch);
+
+  m_write_block_dispatch = new WriteBlockImageDispatch<I>(image_ctx);
+  this->register_dispatch(m_write_block_dispatch);
+}
+
+// Walks every registered layer's invalidate_cache() via the base class
+// C_InvalidateCache helper, starting past IMAGE_DISPATCH_LAYER_NONE;
+// complete(0) kicks off the iteration.
+template <typename I>
+void ImageDispatcher<I>::invalidate_cache(Context* on_finish) {
+  auto image_ctx = this->m_image_ctx;
+  auto cct = image_ctx->cct;
+  ldout(cct, 5) << dendl;
+
+  auto ctx = new C_InvalidateCache(
+    this, IMAGE_DISPATCH_LAYER_NONE, on_finish);
+  ctx->complete(0);
+}
+
+// Shut down the dispatcher: first wait for untracked async ops (read-ahead /
+// copy-on-read) to drain via a sentinel AsyncOperation, then shut down the
+// layered dispatcher itself.  The two LambdaContexts chain in reverse order
+// of declaration: flush -> base shut_down -> cleanup -> caller's on_finish.
+template <typename I>
+void ImageDispatcher<I>::shut_down(Context* on_finish) {
+  // TODO ensure all IOs are executed via a dispatcher
+  // ensure read-ahead / copy-on-read ops are finished since they are
+  // currently outside dispatcher tracking
+  auto async_op = new AsyncOperation();
+
+  on_finish = new LambdaContext([async_op, on_finish](int r) {
+    async_op->finish_op();
+    delete async_op;
+    // shut-down result is reported as 0 regardless of r (intentional-looking;
+    // NOTE(review): confirm callers never need the real status)
+    on_finish->complete(0);
+  });
+  on_finish = new LambdaContext([this, on_finish](int r) {
+    Dispatcher<I, ImageDispatcherInterface>::shut_down(on_finish);
+  });
+  async_op->start_op(*this->m_image_ctx);
+  async_op->flush(on_finish);
+}
+
+// The following methods are thin delegations to the QoS and write-block
+// dispatch layers registered in the constructor.
+
+template <typename I>
+void ImageDispatcher<I>::apply_qos_schedule_tick_min(uint64_t tick) {
+  m_qos_image_dispatch->apply_qos_schedule_tick_min(tick);
+}
+
+template <typename I>
+void ImageDispatcher<I>::apply_qos_limit(uint64_t flag, uint64_t limit,
+                                         uint64_t burst, uint64_t burst_seconds) {
+  m_qos_image_dispatch->apply_qos_limit(flag, limit, burst, burst_seconds);
+}
+
+template <typename I>
+void ImageDispatcher<I>::apply_qos_exclude_ops(uint64_t exclude_ops) {
+  m_qos_image_dispatch->apply_qos_exclude_ops(exclude_ops);
+}
+
+template <typename I>
+bool ImageDispatcher<I>::writes_blocked() const {
+  return m_write_block_dispatch->writes_blocked();
+}
+
+// synchronous variant: returns the write-block layer's status code
+template <typename I>
+int ImageDispatcher<I>::block_writes() {
+  return m_write_block_dispatch->block_writes();
+}
+
+// asynchronous variant: 'on_blocked' fires once writes are blocked
+template <typename I>
+void ImageDispatcher<I>::block_writes(Context *on_blocked) {
+  m_write_block_dispatch->block_writes(on_blocked);
+}
+
+template <typename I>
+void ImageDispatcher<I>::unblock_writes() {
+  m_write_block_dispatch->unblock_writes();
+}
+
+template <typename I>
+void ImageDispatcher<I>::wait_on_writes_unblocked(Context *on_unblocked) {
+  m_write_block_dispatch->wait_on_writes_unblocked(on_unblocked);
+}
+
+// Translate logical image extents (within 'area') into physical offsets via
+// the crypto layer, if one is registered.  Without a crypto layer there is
+// no remapping and only the DATA area is valid.
+template <typename I>
+void ImageDispatcher<I>::remap_to_physical(Extents& image_extents,
+                                           ImageArea area) {
+  std::shared_lock locker{this->m_lock};
+  auto it = this->m_dispatches.find(IMAGE_DISPATCH_LAYER_CRYPTO);
+  if (it == this->m_dispatches.end()) {
+    ceph_assert(area == ImageArea::DATA);
+    return;
+  }
+  // static_cast is safe: only a CryptoImageDispatch registers at this layer
+  auto crypto_image_dispatch = static_cast<crypto::CryptoImageDispatch*>(
+    it->second.dispatch);
+  crypto_image_dispatch->remap_to_physical(image_extents, area);
+}
+
+// Inverse of remap_to_physical(): returns which logical area the (possibly
+// adjusted) extents belong to.  Identity mapping (DATA) without crypto.
+template <typename I>
+ImageArea ImageDispatcher<I>::remap_to_logical(Extents& image_extents) {
+  std::shared_lock locker{this->m_lock};
+  auto it = this->m_dispatches.find(IMAGE_DISPATCH_LAYER_CRYPTO);
+  if (it == this->m_dispatches.end()) {
+    return ImageArea::DATA;
+  }
+  auto crypto_image_dispatch = static_cast<crypto::CryptoImageDispatch*>(
+    it->second.dispatch);
+  return crypto_image_dispatch->remap_to_logical(image_extents);
+}
+
+// Hand one spec to one layer.  On the very first dispatch (tid == 0) a
+// transaction id is assigned and the spec is preprocessed (extent clipping,
+// read-only checks); preprocessing may finish the request outright.
+// Returns true if the layer (or preprocessing) consumed the request.
+template <typename I>
+bool ImageDispatcher<I>::send_dispatch(
+    ImageDispatchInterface* image_dispatch,
+    ImageDispatchSpec* image_dispatch_spec) {
+  if (image_dispatch_spec->tid == 0) {
+    image_dispatch_spec->tid = ++m_next_tid;
+
+    bool finished = preprocess(image_dispatch_spec);
+    if (finished) {
+      return true;
+    }
+  }
+
+  // unpack the variant payload and call the layer's typed handler
+  return boost::apply_visitor(
+    SendVisitor{image_dispatch, image_dispatch_spec},
+    image_dispatch_spec->request);
+}
+
+// Run the one-time PreprocessVisitor over the spec's request payload;
+// returns true if the request was completed (failed) during preprocessing.
+template <typename I>
+bool ImageDispatcher<I>::preprocess(
+    ImageDispatchSpec* image_dispatch_spec) {
+  return boost::apply_visitor(
+    PreprocessVisitor{this, image_dispatch_spec},
+    image_dispatch_spec->request);
+}
+
+} // namespace io
+} // namespace librbd
+
+template class librbd::io::ImageDispatcher<librbd::ImageCtx>;
diff --git a/src/librbd/io/ImageDispatcher.h b/src/librbd/io/ImageDispatcher.h
new file mode 100644
index 000000000..5d5fb0535
--- /dev/null
+++ b/src/librbd/io/ImageDispatcher.h
@@ -0,0 +1,77 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IO_IMAGE_DISPATCHER_H
+#define CEPH_LIBRBD_IO_IMAGE_DISPATCHER_H
+
+#include "include/int_types.h"
+#include "common/ceph_mutex.h"
+#include "librbd/io/Dispatcher.h"
+#include "librbd/io/ImageDispatchInterface.h"
+#include "librbd/io/ImageDispatchSpec.h"
+#include "librbd/io/ImageDispatcherInterface.h"
+#include "librbd/io/Types.h"
+#include <atomic>
+#include <map>
+
+struct Context;
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace io {
+
+template <typename> struct QosImageDispatch;
+template <typename> struct WriteBlockImageDispatch;
+
+// Concrete image-level IO dispatcher: owns the ordered chain of
+// ImageDispatchInterface layers (core, queue, QoS, refresh, write-block,
+// optional crypto) and routes each ImageDispatchSpec through them.  Also
+// exposes QoS tuning, write blocking and crypto extent remapping controls.
+template <typename ImageCtxT = ImageCtx>
+class ImageDispatcher : public Dispatcher<ImageCtxT, ImageDispatcherInterface> {
+public:
+  ImageDispatcher(ImageCtxT* image_ctx);
+
+  void invalidate_cache(Context* on_finish) override;
+
+  void shut_down(Context* on_finish) override;
+
+  // QoS controls -- delegated to the QosImageDispatch layer
+  void apply_qos_schedule_tick_min(uint64_t tick) override;
+  void apply_qos_limit(uint64_t flag, uint64_t limit, uint64_t burst,
+                       uint64_t burst_seconds) override;
+  void apply_qos_exclude_ops(uint64_t exclude_ops) override;
+
+  // Write blocking -- delegated to the WriteBlockImageDispatch layer
+  bool writes_blocked() const override;
+  int block_writes() override;
+  void block_writes(Context *on_blocked) override;
+
+  void unblock_writes() override;
+  void wait_on_writes_unblocked(Context *on_unblocked) override;
+
+  // Crypto extent remapping (no-op without a registered crypto layer)
+  void remap_to_physical(Extents& image_extents, ImageArea area) override;
+  ImageArea remap_to_logical(Extents& image_extents) override;
+
+protected:
+  // per-layer dispatch hook invoked by the base Dispatcher
+  bool send_dispatch(
+    ImageDispatchInterface* image_dispatch,
+    ImageDispatchSpec* image_dispatch_spec) override;
+
+private:
+  struct SendVisitor;
+  struct PreprocessVisitor;
+
+  using typename Dispatcher<ImageCtxT, ImageDispatcherInterface>::C_InvalidateCache;
+
+  // monotonically increasing transaction-id source for dispatched specs
+  std::atomic<uint64_t> m_next_tid{0};
+
+  // borrowed pointers; ownership lies with the base class layer registry
+  QosImageDispatch<ImageCtxT>* m_qos_image_dispatch = nullptr;
+  WriteBlockImageDispatch<ImageCtxT>* m_write_block_dispatch = nullptr;
+
+  bool preprocess(ImageDispatchSpec* image_dispatch_spec);
+
+};
+
+} // namespace io
+} // namespace librbd
+
+extern template class librbd::io::ImageDispatcher<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_IO_IMAGE_DISPATCHER_H
diff --git a/src/librbd/io/ImageDispatcherInterface.h b/src/librbd/io/ImageDispatcherInterface.h
new file mode 100644
index 000000000..dcff3d96a
--- /dev/null
+++ b/src/librbd/io/ImageDispatcherInterface.h
@@ -0,0 +1,41 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IO_IMAGE_DISPATCHER_INTERFACE_H
+#define CEPH_LIBRBD_IO_IMAGE_DISPATCHER_INTERFACE_H
+
+#include "include/int_types.h"
+#include "librbd/io/DispatcherInterface.h"
+#include "librbd/io/ImageDispatchInterface.h"
+#include "librbd/io/Types.h"
+
+struct Context;
+
+namespace librbd {
+namespace io {
+
+// Abstract dispatcher-level interface (as opposed to the per-layer
+// ImageDispatchInterface): adds QoS, write-blocking, cache invalidation
+// and crypto extent-remapping controls on top of the generic
+// DispatcherInterface plumbing.
+struct ImageDispatcherInterface
+  : public DispatcherInterface<ImageDispatchInterface> {
+public:
+  virtual void apply_qos_schedule_tick_min(uint64_t tick) = 0;
+  virtual void apply_qos_limit(uint64_t flag, uint64_t limit,
+                               uint64_t burst, uint64_t burst_seconds) = 0;
+  virtual void apply_qos_exclude_ops(uint64_t exclude_ops) = 0;
+
+  virtual bool writes_blocked() const = 0;
+  virtual int block_writes() = 0;
+  virtual void block_writes(Context *on_blocked) = 0;
+
+  virtual void unblock_writes() = 0;
+  virtual void wait_on_writes_unblocked(Context *on_unblocked) = 0;
+
+  virtual void invalidate_cache(Context* on_finish) = 0;
+
+  virtual void remap_to_physical(Extents& image_extents, ImageArea area) = 0;
+  virtual ImageArea remap_to_logical(Extents& image_extents) = 0;
+};
+
+} // namespace io
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_IO_IMAGE_DISPATCHER_INTERFACE_H
diff --git a/src/librbd/io/ImageRequest.cc b/src/librbd/io/ImageRequest.cc
new file mode 100644
index 000000000..e4c41c229
--- /dev/null
+++ b/src/librbd/io/ImageRequest.cc
@@ -0,0 +1,909 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/io/ImageRequest.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/internal.h"
+#include "librbd/Journal.h"
+#include "librbd/Types.h"
+#include "librbd/Utils.h"
+#include "librbd/asio/ContextWQ.h"
+#include "librbd/io/AioCompletion.h"
+#include "librbd/io/AsyncOperation.h"
+#include "librbd/io/ObjectDispatchInterface.h"
+#include "librbd/io/ObjectDispatchSpec.h"
+#include "librbd/io/ObjectDispatcherInterface.h"
+#include "librbd/io/Utils.h"
+#include "librbd/journal/Types.h"
+#include "include/rados/librados.hpp"
+#include "common/errno.h"
+#include "common/perf_counters.h"
+#include "osdc/Striper.h"
+#include <algorithm>
+#include <functional>
+#include <map>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::io::ImageRequest: " << __func__ << ": "
+
+namespace librbd {
+namespace io {
+
+using librbd::util::data_object_name;
+using librbd::util::get_image_ctx;
+
+namespace {
+
+template <typename I>
+struct C_AssembleSnapshotDeltas : public C_AioRequest {
+ I* image_ctx;
+ SnapshotDelta* snapshot_delta;
+
+ ceph::mutex lock = ceph::make_mutex(
+ "librbd::io::C_AssembleSnapshotDeltas::lock", false);
+ std::map<uint64_t, SnapshotDelta> object_snapshot_delta;
+
+ C_AssembleSnapshotDeltas(I* image_ctx, AioCompletion* aio_comp,
+ SnapshotDelta* snapshot_delta)
+ : C_AioRequest(aio_comp),
+ image_ctx(image_ctx), snapshot_delta(snapshot_delta) {
+ }
+
+ SnapshotDelta* get_snapshot_delta(uint64_t object_no) {
+ std::unique_lock locker{lock};
+ return &object_snapshot_delta[object_no];
+ }
+
+ void finish(int r) override {
+ auto cct = image_ctx->cct;
+
+ if (r < 0) {
+ lderr(cct) << "C_AssembleSnapshotDeltas: list snaps failed: "
+ << cpp_strerror(r) << dendl;
+ C_AioRequest::finish(r);
+ return;
+ }
+
+ std::unique_lock locker{lock};
+ *snapshot_delta = {};
+ for (auto& [object_no, object_snapshot_delta] : object_snapshot_delta) {
+ SnapshotDelta image_snapshot_delta;
+ object_to_image_intervals(object_no, object_snapshot_delta,
+ &image_snapshot_delta, snapshot_delta);
+
+ ldout(cct, 20) << "object_no=" << object_no << ", "
+ << "object_snapshot_delta="
+ << object_snapshot_delta << ", "
+ << "image_snapshot_delta=" << image_snapshot_delta
+ << dendl;
+ }
+
+ ldout(cct, 20) << "snapshot_delta=" << *snapshot_delta << dendl;
+ C_AioRequest::finish(0);
+ }
+
+ void object_to_image_intervals(
+ uint64_t object_no, const SnapshotDelta& object_snapshot_delta,
+ SnapshotDelta* image_snapshot_delta,
+ SnapshotDelta* assembled_image_snapshot_delta) {
+ for (auto& [key, object_extents] : object_snapshot_delta) {
+ for (auto& object_extent : object_extents) {
+ auto [image_extents, _] = io::util::object_to_area_extents(
+ image_ctx, object_no,
+ {{object_extent.get_off(), object_extent.get_len()}});
+
+ auto& intervals = (*image_snapshot_delta)[key];
+ auto& assembled_intervals = (*assembled_image_snapshot_delta)[key];
+ for (auto [image_offset, image_length] : image_extents) {
+ SparseExtent sparse_extent{object_extent.get_val().state,
+ image_length};
+ intervals.insert(image_offset, image_length, sparse_extent);
+ assembled_intervals.insert(image_offset, image_length,
+ sparse_extent);
+ }
+ }
+ }
+ }
+};
+
+// Completion for a fire-and-forget readahead object read: the read
+// data is discarded; the only bookkeeping is the pending-readahead
+// counter maintained on the image's readahead state.
+template <typename I>
+struct C_RBD_Readahead : public Context {
+  I *ictx;
+  uint64_t object_no;
+  io::ReadExtents extents;  // single extent; receives the (ignored) data
+
+  C_RBD_Readahead(I *ictx, uint64_t object_no, uint64_t offset, uint64_t length)
+    : ictx(ictx), object_no(object_no), extents({{offset, length}}) {
+    ictx->readahead.inc_pending();
+  }
+
+  void finish(int r) override {
+    ceph_assert(extents.size() == 1);
+    auto& extent = extents.front();
+    ldout(ictx->cct, 20) << "C_RBD_Readahead on "
+                         << data_object_name(ictx, object_no) << ": "
+                         << extent.offset << "~" << extent.length << dendl;
+    ictx->readahead.dec_pending();
+  }
+};
+
+// Heuristic readahead: feeds the current read pattern into the image's
+// readahead state and, if it yields a prefetch window, issues
+// fire-and-forget object reads covering it.  Readahead is abandoned
+// once total bytes read exceed readahead_disable_after_bytes (if set).
+template <typename I>
+void readahead(I *ictx, const Extents& image_extents, IOContext io_context) {
+  uint64_t total_bytes = 0;
+  for (auto& image_extent : image_extents) {
+    total_bytes += image_extent.second;
+  }
+
+  ictx->image_lock.lock_shared();
+  auto total_bytes_read = ictx->total_bytes_read.fetch_add(total_bytes);
+  bool abort = (
+    ictx->readahead_disable_after_bytes != 0 &&
+    total_bytes_read > ictx->readahead_disable_after_bytes);
+  if (abort) {
+    // early exit must still release the manually-acquired lock
+    ictx->image_lock.unlock_shared();
+    return;
+  }
+
+  uint64_t data_size = ictx->get_area_size(ImageArea::DATA);
+  ictx->image_lock.unlock_shared();
+
+  auto readahead_extent = ictx->readahead.update(image_extents, data_size);
+  uint64_t readahead_offset = readahead_extent.first;
+  uint64_t readahead_length = readahead_extent.second;
+
+  if (readahead_length > 0) {
+    ldout(ictx->cct, 20) << "(readahead logical) " << readahead_offset << "~"
+                         << readahead_length << dendl;
+    LightweightObjectExtents readahead_object_extents;
+    io::util::area_to_object_extents(ictx, readahead_offset, readahead_length,
+                                     ImageArea::DATA, 0,
+                                     &readahead_object_extents);
+    for (auto& object_extent : readahead_object_extents) {
+      ldout(ictx->cct, 20) << "(readahead) "
+                           << data_object_name(ictx,
+                                               object_extent.object_no) << " "
+                           << object_extent.offset << "~"
+                           << object_extent.length << dendl;
+
+      // the completion owns the destination buffer and discards the data
+      auto req_comp = new C_RBD_Readahead<I>(ictx, object_extent.object_no,
+                                             object_extent.offset,
+                                             object_extent.length);
+      auto req = io::ObjectDispatchSpec::create_read(
+        ictx, io::OBJECT_DISPATCH_LAYER_NONE, object_extent.object_no,
+        &req_comp->extents, io_context, 0, 0, {}, nullptr, req_comp);
+      req->send();
+    }
+
+    ictx->perfcounter->inc(l_librbd_readahead);
+    ictx->perfcounter->inc(l_librbd_readahead_bytes, readahead_length);
+  }
+}
+
+// Fire-and-forget update of the image header's modify (m_modify=true)
+// or access timestamp.  Wrapped in an AsyncOperation so the image
+// cannot shut down while the RADOS op is in flight; any error from the
+// update is deliberately ignored.
+template <typename I>
+struct C_UpdateTimestamp : public Context {
+public:
+  I& m_image_ctx;
+  bool m_modify; // if modify set to 'true', modify timestamp is updated,
+                 // access timestamp otherwise
+  AsyncOperation m_async_op;
+
+  C_UpdateTimestamp(I& ictx, bool m) : m_image_ctx(ictx), m_modify(m) {
+    m_async_op.start_op(*get_image_ctx(&m_image_ctx));
+  }
+  ~C_UpdateTimestamp() override {
+    m_async_op.finish_op();
+  }
+
+  void send() {
+    librados::ObjectWriteOperation op;
+    if (m_modify) {
+      cls_client::set_modify_timestamp(&op);
+    } else {
+      cls_client::set_access_timestamp(&op);
+    }
+
+    // completion deletes 'this' via Context::complete()
+    auto comp = librbd::util::create_rados_callback(this);
+    int r = m_image_ctx.md_ctx.aio_operate(m_image_ctx.header_oid, comp, &op);
+    ceph_assert(r == 0);
+    comp->release();
+  }
+
+  void finish(int r) override {
+    // ignore errors updating timestamp
+  }
+};
+
+// Returns true when a periodic timestamp refresh is due: 'interval'
+// must be non-zero (zero disables updates) and at least 'interval'
+// seconds must have elapsed since 'timestamp'.
+bool should_update_timestamp(const utime_t& now, const utime_t& timestamp,
+                             uint64_t interval) {
+  if (interval == 0) {
+    return false;
+  }
+  return static_cast<uint64_t>(now.sec()) >= interval + timestamp;
+}
+
+} // anonymous namespace
+
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::io::ImageRequest: " << this \
+ << " " << __func__ << ": "
+
+// Convenience entry point: construct an ImageReadRequest on the stack
+// and dispatch it synchronously on the caller's thread.
+template <typename I>
+void ImageRequest<I>::aio_read(I *ictx, AioCompletion *c,
+                               Extents &&image_extents, ImageArea area,
+                               ReadResult &&read_result, IOContext io_context,
+                               int op_flags, int read_flags,
+                               const ZTracer::Trace &parent_trace) {
+  ImageReadRequest<I> req(*ictx, c, std::move(image_extents), area,
+                          std::move(read_result), io_context, op_flags,
+                          read_flags, parent_trace);
+  req.send();
+}
+
+// Convenience entry point for a plain write; see aio_read().
+template <typename I>
+void ImageRequest<I>::aio_write(I *ictx, AioCompletion *c,
+                                Extents &&image_extents, ImageArea area,
+                                bufferlist &&bl, int op_flags,
+                                const ZTracer::Trace &parent_trace) {
+  ImageWriteRequest<I> req(*ictx, c, std::move(image_extents), area,
+                           std::move(bl), op_flags, parent_trace);
+  req.send();
+}
+
+// Convenience entry point for a discard; see aio_read().
+template <typename I>
+void ImageRequest<I>::aio_discard(I *ictx, AioCompletion *c,
+                                  Extents &&image_extents, ImageArea area,
+                                  uint32_t discard_granularity_bytes,
+                                  const ZTracer::Trace &parent_trace) {
+  ImageDiscardRequest<I> req(*ictx, c, std::move(image_extents), area,
+                             discard_granularity_bytes, parent_trace);
+  req.send();
+}
+
+// Convenience entry point for a flush; see aio_read().
+template <typename I>
+void ImageRequest<I>::aio_flush(I *ictx, AioCompletion *c,
+                                FlushSource flush_source,
+                                const ZTracer::Trace &parent_trace) {
+  ImageFlushRequest<I> req(*ictx, c, flush_source, parent_trace);
+  req.send();
+}
+
+// Convenience entry point for a write-same; see aio_read().
+template <typename I>
+void ImageRequest<I>::aio_writesame(I *ictx, AioCompletion *c,
+                                    Extents &&image_extents, ImageArea area,
+                                    bufferlist &&bl, int op_flags,
+                                    const ZTracer::Trace &parent_trace) {
+  ImageWriteSameRequest<I> req(*ictx, c, std::move(image_extents), area,
+                               std::move(bl), op_flags, parent_trace);
+  req.send();
+}
+
+// Convenience entry point for a compare-and-write; see aio_read().
+template <typename I>
+void ImageRequest<I>::aio_compare_and_write(I *ictx, AioCompletion *c,
+                                            Extents &&image_extents,
+                                            ImageArea area,
+                                            bufferlist &&cmp_bl,
+                                            bufferlist &&bl,
+                                            uint64_t *mismatch_offset,
+                                            int op_flags,
+                                            const ZTracer::Trace &parent_trace) {
+  ImageCompareAndWriteRequest<I> req(*ictx, c, std::move(image_extents), area,
+                                     std::move(cmp_bl), std::move(bl),
+                                     mismatch_offset, op_flags, parent_trace);
+  req.send();
+}
+
+// Common dispatch: validates the completion's state, updates the
+// image timestamp if due, then hands off to the type-specific
+// send_request() implementation.
+template <typename I>
+void ImageRequest<I>::send() {
+  I &image_ctx = this->m_image_ctx;
+  // the completion must have been initialized and started upstream
+  ceph_assert(m_aio_comp->is_initialized(get_aio_type()));
+  ceph_assert(m_aio_comp->is_started());
+
+  CephContext *cct = image_ctx.cct;
+  AioCompletion *aio_comp = this->m_aio_comp;
+  ldout(cct, 20) << get_request_type() << ": ictx=" << &image_ctx << ", "
+                 << "completion=" << aio_comp << dendl;
+
+  update_timestamp();
+  send_request();
+}
+
+// Refreshes the image's modify (for writes) or access (for reads)
+// timestamp if the configured update interval has elapsed.  Uses a
+// double-check: a cheap test under the shared lock first, then a
+// re-test under the exclusive lock so only one thread issues the
+// header update per interval.
+template <typename I>
+void ImageRequest<I>::update_timestamp() {
+  bool modify = (get_aio_type() != AIO_TYPE_READ);
+  uint64_t update_interval;
+  if (modify) {
+    update_interval = m_image_ctx.mtime_update_interval;
+  } else {
+    update_interval = m_image_ctx.atime_update_interval;
+  }
+
+  // interval of zero disables timestamp updates
+  if (update_interval == 0) {
+    return;
+  }
+
+  // pick the matching getter/setter pair once up front
+  utime_t (I::*get_timestamp_fn)() const;
+  void (I::*set_timestamp_fn)(utime_t);
+  if (modify) {
+    get_timestamp_fn = &I::get_modify_timestamp;
+    set_timestamp_fn = &I::set_modify_timestamp;
+  } else {
+    get_timestamp_fn = &I::get_access_timestamp;
+    set_timestamp_fn = &I::set_access_timestamp;
+  }
+
+  utime_t ts = ceph_clock_now();
+  {
+    std::shared_lock timestamp_locker{m_image_ctx.timestamp_lock};
+    if(!should_update_timestamp(ts, std::invoke(get_timestamp_fn, m_image_ctx),
+                                update_interval)) {
+      return;
+    }
+  }
+
+  {
+    std::unique_lock timestamp_locker{m_image_ctx.timestamp_lock};
+    // re-check: another thread may have updated it since the shared
+    // lock was dropped
+    bool update = should_update_timestamp(
+      ts, std::invoke(get_timestamp_fn, m_image_ctx), update_interval);
+    if (!update) {
+      return;
+    }
+
+    std::invoke(set_timestamp_fn, m_image_ctx, ts);
+  }
+
+  // TODO we fire and forget this outside the IO path to prevent
+  // potential race conditions with librbd client IO callbacks
+  // between different threads (e.g. librados and object cacher)
+  ldout(m_image_ctx.cct, 10) << get_request_type() << dendl;
+  auto req = new C_UpdateTimestamp<I>(m_image_ctx, modify);
+  req->send();
+}
+
+template <typename I>
+ImageReadRequest<I>::ImageReadRequest(I &image_ctx, AioCompletion *aio_comp,
+                                      Extents &&image_extents, ImageArea area,
+                                      ReadResult &&read_result,
+                                      IOContext io_context, int op_flags,
+                                      int read_flags,
+                                      const ZTracer::Trace &parent_trace)
+  : ImageRequest<I>(image_ctx, aio_comp, std::move(image_extents), area,
+                    "read", parent_trace),
+    m_io_context(io_context), m_op_flags(op_flags), m_read_flags(read_flags) {
+  // the completion owns the destination for the read payload
+  aio_comp->read_result = std::move(read_result);
+}
+
+// Maps the image extents onto object extents and issues one object
+// read per extent; ReadResult stitches the pieces back together via
+// the recorded buffer extents.
+template <typename I>
+void ImageReadRequest<I>::send_request() {
+  I &image_ctx = this->m_image_ctx;
+  CephContext *cct = image_ctx.cct;
+
+  // data-area cached reads without the FADVISE_RANDOM hint may trigger
+  // speculative prefetch
+  auto &image_extents = this->m_image_extents;
+  if (this->m_image_area == ImageArea::DATA &&
+      image_ctx.cache && image_ctx.readahead_max_bytes > 0 &&
+      !(m_op_flags & LIBRADOS_OP_FLAG_FADVISE_RANDOM)) {
+    readahead(get_image_ctx(&image_ctx), image_extents, m_io_context);
+  }
+
+  // map image extents to object extents
+  LightweightObjectExtents object_extents;
+  uint64_t buffer_ofs = 0;  // doubles as the running total byte count
+  for (auto &extent : image_extents) {
+    if (extent.second == 0) {
+      continue;
+    }
+
+    util::area_to_object_extents(&image_ctx, extent.first, extent.second,
+                                 this->m_image_area, buffer_ofs,
+                                 &object_extents);
+    buffer_ofs += extent.second;
+  }
+
+  AioCompletion *aio_comp = this->m_aio_comp;
+  aio_comp->read_result.set_image_extents(image_extents);
+
+  // issue the requests
+  aio_comp->set_request_count(object_extents.size());
+  for (auto &oe : object_extents) {
+    ldout(cct, 20) << data_object_name(&image_ctx, oe.object_no) << " "
+                   << oe.offset << "~" << oe.length << " from "
+                   << oe.buffer_extents << dendl;
+
+    auto req_comp = new io::ReadResult::C_ObjectReadRequest(
+      aio_comp, {{oe.offset, oe.length, std::move(oe.buffer_extents)}});
+    auto req = ObjectDispatchSpec::create_read(
+      &image_ctx, OBJECT_DISPATCH_LAYER_NONE, oe.object_no,
+      &req_comp->extents, m_io_context, m_op_flags, m_read_flags,
+      this->m_trace, nullptr, req_comp);
+    req->send();
+  }
+
+  image_ctx.perfcounter->inc(l_librbd_rd);
+  image_ctx.perfcounter->inc(l_librbd_rd_bytes, buffer_ofs);
+}
+
+// Common write-family send path: maps image extents to object extents,
+// lets the concrete request type prune/adjust them, optionally appends
+// a journal event, and finally issues the per-object requests.
+template <typename I>
+void AbstractImageWriteRequest<I>::send_request() {
+  I &image_ctx = this->m_image_ctx;
+
+  bool journaling = false;
+
+  AioCompletion *aio_comp = this->m_aio_comp;
+  {
+    // prevent image size from changing between computing clip and recording
+    // pending async operation
+    std::shared_lock image_locker{image_ctx.image_lock};
+    journaling = (image_ctx.journal != nullptr &&
+                  image_ctx.journal->is_journal_appending());
+  }
+
+  uint64_t clip_len = 0;  // total bytes, also used as buffer offset
+  LightweightObjectExtents object_extents;
+  for (auto &extent : this->m_image_extents) {
+    if (extent.second == 0) {
+      continue;
+    }
+
+    // map to object extents
+    io::util::area_to_object_extents(&image_ctx, extent.first, extent.second,
+                                     this->m_image_area, clip_len,
+                                     &object_extents);
+    clip_len += extent.second;
+  }
+
+  // subclass hook: <0 fails the request, 1 signals the extents changed
+  int ret = prune_object_extents(&object_extents);
+  if (ret < 0) {
+    aio_comp->fail(ret);
+    return;
+  }
+
+  // reflect changes in object_extents back to m_image_extents
+  if (ret == 1) {
+    this->m_image_extents.clear();
+    for (auto& object_extent : object_extents) {
+      auto [image_extents, _] = io::util::object_to_area_extents(
+        &image_ctx, object_extent.object_no,
+        {{object_extent.offset, object_extent.length}});
+      this->m_image_extents.insert(this->m_image_extents.end(),
+                                   image_extents.begin(), image_extents.end());
+    }
+  }
+
+  aio_comp->set_request_count(object_extents.size());
+  if (!object_extents.empty()) {
+    uint64_t journal_tid = 0;
+    if (journaling) {
+      // in-flight ops are flushed prior to closing the journal
+      ceph_assert(image_ctx.journal != NULL);
+      journal_tid = append_journal_event(m_synchronous);
+    }
+
+    // it's very important that IOContext is captured here instead of
+    // e.g. at the API layer so that an up-to-date snap context is used
+    // when owning the exclusive lock
+    send_object_requests(object_extents, image_ctx.get_data_io_context(),
+                         journal_tid);
+  }
+
+  update_stats(clip_len);
+}
+
+// Issues one object request per extent, each completing into the
+// shared AioCompletion via its own C_AioRequest.
+template <typename I>
+void AbstractImageWriteRequest<I>::send_object_requests(
+    const LightweightObjectExtents &object_extents, IOContext io_context,
+    uint64_t journal_tid) {
+  I &image_ctx = this->m_image_ctx;
+  CephContext *cct = image_ctx.cct;
+
+  AioCompletion *aio_comp = this->m_aio_comp;
+  // single_extent lets subclasses apply zero-copy optimizations
+  bool single_extent = (object_extents.size() == 1);
+  for (auto& oe : object_extents) {
+    ldout(cct, 20) << data_object_name(&image_ctx, oe.object_no) << " "
+                   << oe.offset << "~" << oe.length << " from "
+                   << oe.buffer_extents << dendl;
+    C_AioRequest *req_comp = new C_AioRequest(aio_comp);
+    auto request = create_object_request(oe, io_context, journal_tid,
+                                         single_extent, req_comp);
+    request->send();
+  }
+}
+
+// Copies this object extent's slices out of the request-wide buffer
+// list into a per-object buffer list.
+template <typename I>
+void ImageWriteRequest<I>::assemble_extent(
+    const LightweightObjectExtent &object_extent, bufferlist *bl) {
+  for (auto q = object_extent.buffer_extents.begin();
+       q != object_extent.buffer_extents.end(); ++q) {
+    bufferlist sub_bl;
+    sub_bl.substr_of(m_bl, q->first, q->second);
+    bl->claim_append(sub_bl);
+  }
+}
+
+// Appends one journal write event per image extent; returns the tid of
+// the last event appended.
+template <typename I>
+uint64_t ImageWriteRequest<I>::append_journal_event(bool synchronous) {
+  I &image_ctx = this->m_image_ctx;
+
+  uint64_t tid = 0;
+  uint64_t buffer_offset = 0;
+  ceph_assert(!this->m_image_extents.empty());
+  for (auto &extent : this->m_image_extents) {
+    bufferlist sub_bl;
+    sub_bl.substr_of(m_bl, buffer_offset, extent.second);
+    buffer_offset += extent.second;
+
+    tid = image_ctx.journal->append_write_event(extent.first, extent.second,
+                                                sub_bl, synchronous);
+  }
+
+  return tid;
+}
+
+// Builds the per-object write dispatch spec for one extent.
+template <typename I>
+ObjectDispatchSpec *ImageWriteRequest<I>::create_object_request(
+    const LightweightObjectExtent &object_extent, IOContext io_context,
+    uint64_t journal_tid, bool single_extent, Context *on_finish) {
+  I &image_ctx = this->m_image_ctx;
+
+  bufferlist bl;
+  if (single_extent && object_extent.buffer_extents.size() == 1 &&
+      m_bl.length() == object_extent.length) {
+    // optimization for single object/buffer extent writes; moving m_bl
+    // is safe because single_extent implies this is the only call
+    bl = std::move(m_bl);
+  } else {
+    assemble_extent(object_extent, &bl);
+  }
+
+  auto req = ObjectDispatchSpec::create_write(
+    &image_ctx, OBJECT_DISPATCH_LAYER_NONE, object_extent.object_no,
+    object_extent.offset, std::move(bl), io_context, m_op_flags, 0,
+    std::nullopt, journal_tid, this->m_trace, on_finish);
+  return req;
+}
+
+// Per-image write perf counters (op count + bytes).
+template <typename I>
+void ImageWriteRequest<I>::update_stats(size_t length) {
+  I &image_ctx = this->m_image_ctx;
+  image_ctx.perfcounter->inc(l_librbd_wr);
+  image_ctx.perfcounter->inc(l_librbd_wr_bytes, length);
+}
+
+// Appends one journal discard event per image extent; returns the tid
+// of the last event appended.
+template <typename I>
+uint64_t ImageDiscardRequest<I>::append_journal_event(bool synchronous) {
+  I &image_ctx = this->m_image_ctx;
+
+  uint64_t tid = 0;
+  ceph_assert(!this->m_image_extents.empty());
+  for (auto &extent : this->m_image_extents) {
+    journal::EventEntry event_entry(
+      journal::AioDiscardEvent(extent.first,
+                               extent.second,
+                               this->m_discard_granularity_bytes));
+    tid = image_ctx.journal->append_io_event(std::move(event_entry),
+                                             extent.first, extent.second,
+                                             synchronous, 0);
+  }
+
+  return tid;
+}
+
+// Builds the per-object discard dispatch spec for one extent.
+template <typename I>
+ObjectDispatchSpec *ImageDiscardRequest<I>::create_object_request(
+    const LightweightObjectExtent &object_extent, IOContext io_context,
+    uint64_t journal_tid, bool single_extent, Context *on_finish) {
+  I &image_ctx = this->m_image_ctx;
+  auto req = ObjectDispatchSpec::create_discard(
+    &image_ctx, OBJECT_DISPATCH_LAYER_NONE, object_extent.object_no,
+    object_extent.offset, object_extent.length, io_context,
+    OBJECT_DISCARD_FLAG_DISABLE_CLONE_REMOVE, journal_tid, this->m_trace,
+    on_finish);
+  return req;
+}
+
+// Per-image discard perf counters (op count + bytes).
+template <typename I>
+void ImageDiscardRequest<I>::update_stats(size_t length) {
+  I &image_ctx = this->m_image_ctx;
+  image_ctx.perfcounter->inc(l_librbd_discard);
+  image_ctx.perfcounter->inc(l_librbd_discard_bytes, length);
+}
+
+// Aligns each object extent to the discard granularity and drops
+// extents that become empty.  Returns 1 when the extents were
+// modified (caller must rebuild its image extents), 0 otherwise.
+template <typename I>
+int ImageDiscardRequest<I>::prune_object_extents(
+    LightweightObjectExtents* object_extents) const {
+  if (m_discard_granularity_bytes == 0) {
+    return 0;
+  }
+
+  // Align the range to discard_granularity_bytes boundary and skip
+  // any discards that are too small to free up any space.
+  //
+  // discard_granularity_bytes >= object_size && tail truncation
+  // is a special case for filestore
+  bool prune_required = false;
+  bool length_modified = false;
+  auto object_size = this->m_image_ctx.layout.object_size;
+  auto discard_granularity_bytes = std::min(m_discard_granularity_bytes,
+                                            object_size);
+  auto xform_lambda =
+    [discard_granularity_bytes, object_size, &prune_required, &length_modified]
+    (LightweightObjectExtent& object_extent) {
+      auto& offset = object_extent.offset;
+      auto& length = object_extent.length;
+      auto next_offset = offset + length;
+
+      if ((discard_granularity_bytes < object_size) ||
+          (next_offset < object_size)) {
+        // shrink to the aligned interior of the extent
+        offset = p2roundup<uint64_t>(offset, discard_granularity_bytes);
+        next_offset = p2align<uint64_t>(next_offset, discard_granularity_bytes);
+        if (offset >= next_offset) {
+          // extent too small to cover a full granularity unit
+          prune_required = true;
+          length = 0;
+        } else {
+          auto new_length = next_offset - offset;
+          if (length != new_length) {
+            length_modified = true;
+            length = new_length;
+          }
+        }
+      }
+    };
+  std::for_each(object_extents->begin(), object_extents->end(),
+                xform_lambda);
+
+  if (prune_required) {
+    // one or more object extents were skipped
+    auto remove_lambda =
+      [](const LightweightObjectExtent& object_extent) {
+        return (object_extent.length == 0);
+      };
+    object_extents->erase(
+      std::remove_if(object_extents->begin(), object_extents->end(),
+                     remove_lambda),
+      object_extents->end());
+  }
+
+  // object extents were modified, image extents needs updating
+  if (length_modified || prune_required) {
+    return 1;
+  }
+
+  return 0;
+}
+
+// Issues a flush through the object dispatcher.  For user-initiated
+// flushes the event is journaled and all in-flight async ops are
+// drained first; writeback-triggered flushes dispatch immediately.
+template <typename I>
+void ImageFlushRequest<I>::send_request() {
+  I &image_ctx = this->m_image_ctx;
+
+  bool journaling = false;
+  {
+    std::shared_lock image_locker{image_ctx.image_lock};
+    journaling = (m_flush_source == FLUSH_SOURCE_USER &&
+                  image_ctx.journal != nullptr &&
+                  image_ctx.journal->is_journal_appending());
+  }
+
+  AioCompletion *aio_comp = this->m_aio_comp;
+  aio_comp->set_request_count(1);
+
+  Context *ctx = new C_AioRequest(aio_comp);
+
+  // ensure no locks are held when flush is complete
+  ctx = librbd::util::create_async_context_callback(image_ctx, ctx);
+
+  uint64_t journal_tid = 0;
+  if (journaling) {
+    // in-flight ops are flushed prior to closing the journal
+    ceph_assert(image_ctx.journal != NULL);
+    journal_tid = image_ctx.journal->append_io_event(
+      journal::EventEntry(journal::AioFlushEvent()), 0, 0, false, 0);
+    image_ctx.journal->user_flushed();
+  }
+
+  // wrap the dispatch so it can be deferred behind the async-op flush
+  auto object_dispatch_spec = ObjectDispatchSpec::create_flush(
+    &image_ctx, OBJECT_DISPATCH_LAYER_NONE, m_flush_source, journal_tid,
+    this->m_trace, ctx);
+  ctx = new LambdaContext([object_dispatch_spec](int r) {
+      object_dispatch_spec->send();
+    });
+
+  // ensure all in-flight IOs are settled if non-user flush request
+  if (m_flush_source == FLUSH_SOURCE_WRITEBACK) {
+    ctx->complete(0);
+  } else {
+    aio_comp->async_op.flush(ctx);
+  }
+
+  // might be flushing during image shutdown
+  if (image_ctx.perfcounter != nullptr) {
+    image_ctx.perfcounter->inc(l_librbd_flush);
+  }
+}
+
+// Appends one journal write-same event per image extent; returns the
+// tid of the last event appended.
+template <typename I>
+uint64_t ImageWriteSameRequest<I>::append_journal_event(bool synchronous) {
+  I &image_ctx = this->m_image_ctx;
+
+  uint64_t tid = 0;
+  ceph_assert(!this->m_image_extents.empty());
+  for (auto &extent : this->m_image_extents) {
+    journal::EventEntry event_entry(journal::AioWriteSameEvent(extent.first,
+                                                               extent.second,
+                                                               m_data_bl));
+    tid = image_ctx.journal->append_io_event(std::move(event_entry),
+                                             extent.first, extent.second,
+                                             synchronous, 0);
+  }
+
+  return tid;
+}
+
+// Builds the per-object dispatch spec for one extent.  NOTE(review):
+// assemble_write_same_extent() appears to decide whether the extent
+// can be issued as a native write-same (true) or must fall back to a
+// regular write of the assembled buffer (false) — confirm against
+// io::util's implementation.
+template <typename I>
+ObjectDispatchSpec *ImageWriteSameRequest<I>::create_object_request(
+    const LightweightObjectExtent &object_extent, IOContext io_context,
+    uint64_t journal_tid, bool single_extent, Context *on_finish) {
+  I &image_ctx = this->m_image_ctx;
+
+  bufferlist bl;
+  ObjectDispatchSpec *req;
+
+  if (util::assemble_write_same_extent(object_extent, m_data_bl, &bl, false)) {
+    auto buffer_extents{object_extent.buffer_extents};
+
+    req = ObjectDispatchSpec::create_write_same(
+      &image_ctx, OBJECT_DISPATCH_LAYER_NONE, object_extent.object_no,
+      object_extent.offset, object_extent.length, std::move(buffer_extents),
+      std::move(bl), io_context, m_op_flags, journal_tid,
+      this->m_trace, on_finish);
+    return req;
+  }
+  req = ObjectDispatchSpec::create_write(
+    &image_ctx, OBJECT_DISPATCH_LAYER_NONE, object_extent.object_no,
+    object_extent.offset, std::move(bl), io_context, m_op_flags, 0,
+    std::nullopt, journal_tid, this->m_trace, on_finish);
+  return req;
+}
+
+// Per-image write-same perf counters (op count + bytes).
+template <typename I>
+void ImageWriteSameRequest<I>::update_stats(size_t length) {
+  I &image_ctx = this->m_image_ctx;
+  image_ctx.perfcounter->inc(l_librbd_ws);
+  image_ctx.perfcounter->inc(l_librbd_ws_bytes, length);
+}
+
+// Appends the single journal compare-and-write event (the request is
+// restricted to one image extent — see prune_object_extents()).
+template <typename I>
+uint64_t ImageCompareAndWriteRequest<I>::append_journal_event(
+    bool synchronous) {
+  I &image_ctx = this->m_image_ctx;
+
+  uint64_t tid = 0;
+  ceph_assert(this->m_image_extents.size() == 1);
+  auto &extent = this->m_image_extents.front();
+  tid = image_ctx.journal->append_compare_and_write_event(extent.first,
+                                                          extent.second,
+                                                          m_cmp_bl,
+                                                          m_bl,
+                                                          synchronous);
+
+  return tid;
+}
+
+// Copies this object extent's slices from both the write and the
+// compare buffer lists into per-object buffer lists.
+template <typename I>
+void ImageCompareAndWriteRequest<I>::assemble_extent(
+    const LightweightObjectExtent &object_extent, bufferlist *bl,
+    bufferlist *cmp_bl) {
+  for (auto q = object_extent.buffer_extents.begin();
+       q != object_extent.buffer_extents.end(); ++q) {
+    bufferlist sub_bl;
+    sub_bl.substr_of(m_bl, q->first, q->second);
+    bl->claim_append(sub_bl);
+
+    bufferlist sub_cmp_bl;
+    sub_cmp_bl.substr_of(m_cmp_bl, q->first, q->second);
+    cmp_bl->claim_append(sub_cmp_bl);
+  }
+}
+
+// Builds the per-object compare-and-write dispatch spec.
+template <typename I>
+ObjectDispatchSpec *ImageCompareAndWriteRequest<I>::create_object_request(
+    const LightweightObjectExtent &object_extent, IOContext io_context,
+    uint64_t journal_tid, bool single_extent, Context *on_finish) {
+  I &image_ctx = this->m_image_ctx;
+
+  bufferlist bl;
+  bufferlist cmp_bl;
+  assemble_extent(object_extent, &bl, &cmp_bl);
+  auto req = ObjectDispatchSpec::create_compare_and_write(
+    &image_ctx, OBJECT_DISPATCH_LAYER_NONE, object_extent.object_no,
+    object_extent.offset, std::move(cmp_bl), std::move(bl), io_context,
+    m_mismatch_offset, m_op_flags, journal_tid, this->m_trace, on_finish);
+  return req;
+}
+
+// Per-image compare-and-write perf counters (op count + bytes).
+template <typename I>
+void ImageCompareAndWriteRequest<I>::update_stats(size_t length) {
+  I &image_ctx = this->m_image_ctx;
+  image_ctx.perfcounter->inc(l_librbd_cmp);
+  image_ctx.perfcounter->inc(l_librbd_cmp_bytes, length);
+}
+
+// Validates that the compare-and-write maps onto at most one object
+// extent that does not cross a stripe unit boundary.
+//
+// Returns 0 when valid (or when there is nothing to do) and -EINVAL
+// when the request cannot be expressed as a single object operation.
+template <typename I>
+int ImageCompareAndWriteRequest<I>::prune_object_extents(
+    LightweightObjectExtents* object_extents) const {
+  if (object_extents->size() > 1)
+    return -EINVAL;
+
+  // all image extents may have been zero-length and skipped during
+  // mapping; front() below would be undefined on an empty vector
+  if (object_extents->empty())
+    return 0;
+
+  I &image_ctx = this->m_image_ctx;
+  uint64_t su = image_ctx.layout.stripe_unit;
+  auto& object_extent = object_extents->front();
+  if (su == 0 || (object_extent.offset % su + object_extent.length > su))
+    return -EINVAL;
+
+  return 0;
+}
+
+template <typename I>
+ImageListSnapsRequest<I>::ImageListSnapsRequest(
+    I& image_ctx, AioCompletion* aio_comp, Extents&& image_extents,
+    ImageArea area, SnapIds&& snap_ids, int list_snaps_flags,
+    SnapshotDelta* snapshot_delta, const ZTracer::Trace& parent_trace)
+  : ImageRequest<I>(image_ctx, aio_comp, std::move(image_extents), area,
+                    "list-snaps", parent_trace),
+    m_snap_ids(std::move(snap_ids)), m_list_snaps_flags(list_snaps_flags),
+    m_snapshot_delta(snapshot_delta) {
+}
+
+// Groups the image extents by object, issues one list-snaps request
+// per object under a sub-completion, and lets the sub-completion's
+// callback (C_AssembleSnapshotDeltas) merge the per-object results
+// back into image-extent form.
+template <typename I>
+void ImageListSnapsRequest<I>::send_request() {
+  I &image_ctx = this->m_image_ctx;
+  CephContext *cct = image_ctx.cct;
+
+  // map image extents to object extents
+  auto &image_extents = this->m_image_extents;
+  std::map<uint64_t, Extents> object_number_extents;
+  for (auto& image_extent : image_extents) {
+    if (image_extent.second == 0) {
+      continue;
+    }
+
+    striper::LightweightObjectExtents object_extents;
+    io::util::area_to_object_extents(&image_ctx, image_extent.first,
+                                     image_extent.second, this->m_image_area, 0,
+                                     &object_extents);
+    for (auto& object_extent : object_extents) {
+      object_number_extents[object_extent.object_no].emplace_back(
+        object_extent.offset, object_extent.length);
+    }
+  }
+
+  // reassemble the deltas back into image-extents when complete
+  auto aio_comp = this->m_aio_comp;
+  aio_comp->set_request_count(1);
+  auto assemble_ctx = new C_AssembleSnapshotDeltas<I>(
+    &image_ctx, aio_comp, m_snapshot_delta);
+  auto sub_aio_comp = AioCompletion::create_and_start<
+    Context, &Context::complete>(assemble_ctx, get_image_ctx(&image_ctx),
+                                 AIO_TYPE_GENERIC);
+
+  // issue the requests
+  sub_aio_comp->set_request_count(object_number_extents.size());
+  for (auto& oe : object_number_extents) {
+    ldout(cct, 20) << data_object_name(&image_ctx, oe.first) << " "
+                   << oe.second << dendl;
+    auto ctx = new C_AioRequest(sub_aio_comp);
+    auto req = ObjectDispatchSpec::create_list_snaps(
+      &image_ctx, OBJECT_DISPATCH_LAYER_NONE, oe.first, std::move(oe.second),
+      SnapIds{m_snap_ids}, m_list_snaps_flags, this->m_trace,
+      assemble_ctx->get_snapshot_delta(oe.first), ctx);
+    req->send();
+  }
+}
+
+} // namespace io
+} // namespace librbd
+
+// explicit template instantiations for the concrete librbd::ImageCtx type
+template class librbd::io::ImageRequest<librbd::ImageCtx>;
+template class librbd::io::ImageReadRequest<librbd::ImageCtx>;
+template class librbd::io::AbstractImageWriteRequest<librbd::ImageCtx>;
+template class librbd::io::ImageWriteRequest<librbd::ImageCtx>;
+template class librbd::io::ImageDiscardRequest<librbd::ImageCtx>;
+template class librbd::io::ImageFlushRequest<librbd::ImageCtx>;
+template class librbd::io::ImageWriteSameRequest<librbd::ImageCtx>;
+template class librbd::io::ImageCompareAndWriteRequest<librbd::ImageCtx>;
+template class librbd::io::ImageListSnapsRequest<librbd::ImageCtx>;
diff --git a/src/librbd/io/ImageRequest.h b/src/librbd/io/ImageRequest.h
new file mode 100644
index 000000000..2668c1acb
--- /dev/null
+++ b/src/librbd/io/ImageRequest.h
@@ -0,0 +1,377 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IO_IMAGE_REQUEST_H
+#define CEPH_LIBRBD_IO_IMAGE_REQUEST_H
+
+#include "include/int_types.h"
+#include "include/buffer_fwd.h"
+#include "common/zipkin_trace.h"
+#include "osd/osd_types.h"
+#include "librbd/Utils.h"
+#include "librbd/Types.h"
+#include "librbd/io/Types.h"
+#include <list>
+#include <utility>
+#include <vector>
+
+namespace librbd {
+class ImageCtx;
+
+namespace io {
+
+class AioCompletion;
+class ObjectDispatchSpec;
+class ReadResult;
+
+template <typename ImageCtxT = ImageCtx>
+class ImageRequest {  // base type for all image-extent-level IO requests
+public:
+ virtual ~ImageRequest() {
+ m_trace.event("finish");  // close the trace span opened in the ctor
+ }
+
+ // factory methods: construct the matching request subclass and send it
+ static void aio_read(ImageCtxT *ictx, AioCompletion *c,
+ Extents &&image_extents, ImageArea area,
+ ReadResult &&read_result, IOContext io_context,
+ int op_flags, int read_flags,
+ const ZTracer::Trace &parent_trace);
+ static void aio_write(ImageCtxT *ictx, AioCompletion *c,
+ Extents &&image_extents, ImageArea area,
+ bufferlist &&bl, int op_flags,
+ const ZTracer::Trace &parent_trace);
+ static void aio_discard(ImageCtxT *ictx, AioCompletion *c,
+ Extents &&image_extents, ImageArea area,
+ uint32_t discard_granularity_bytes,
+ const ZTracer::Trace &parent_trace);
+ static void aio_flush(ImageCtxT *ictx, AioCompletion *c,
+ FlushSource flush_source,
+ const ZTracer::Trace &parent_trace);
+ static void aio_writesame(ImageCtxT *ictx, AioCompletion *c,
+ Extents &&image_extents, ImageArea area,
+ bufferlist &&bl, int op_flags,
+ const ZTracer::Trace &parent_trace);
+ static void aio_compare_and_write(ImageCtxT *ictx, AioCompletion *c,
+ Extents &&image_extents, ImageArea area,
+ bufferlist &&cmp_bl, bufferlist &&bl,
+ uint64_t *mismatch_offset, int op_flags,
+ const ZTracer::Trace &parent_trace);
+
+ void send();
+
+ inline const ZTracer::Trace &get_trace() const {
+ return m_trace;
+ }
+
+protected:
+ typedef std::list<ObjectDispatchSpec*> ObjectRequests;
+
+ ImageCtxT &m_image_ctx;
+ AioCompletion *m_aio_comp;
+ Extents m_image_extents;  // extents are interpreted relative to m_image_area
+ ImageArea m_image_area;
+ ZTracer::Trace m_trace;
+
+ ImageRequest(ImageCtxT &image_ctx, AioCompletion *aio_comp,
+ Extents &&image_extents, ImageArea area, const char *trace_name,
+ const ZTracer::Trace &parent_trace)
+ : m_image_ctx(image_ctx), m_aio_comp(aio_comp),
+ m_image_extents(std::move(image_extents)), m_image_area(area),
+ m_trace(librbd::util::create_trace(image_ctx, trace_name, parent_trace)) {
+ m_trace.event("start");  // trace span runs from ctor to dtor
+ }
+
+ virtual void update_timestamp();
+ virtual void send_request() = 0;  // subclass-specific dispatch logic
+
+ virtual aio_type_t get_aio_type() const = 0;
+ virtual const char *get_request_type() const = 0;  // short name used for tracing/logging
+};
+
+template <typename ImageCtxT = ImageCtx>
+class ImageReadRequest : public ImageRequest<ImageCtxT> {  // image-extent read request
+public:
+ ImageReadRequest(ImageCtxT &image_ctx, AioCompletion *aio_comp,
+ Extents &&image_extents, ImageArea area,
+ ReadResult &&read_result, IOContext io_context, int op_flags,
+ int read_flags, const ZTracer::Trace &parent_trace);
+
+protected:
+ void send_request() override;
+
+ aio_type_t get_aio_type() const override {
+ return AIO_TYPE_READ;
+ }
+ const char *get_request_type() const override {
+ return "aio_read";
+ }
+
+private:
+ IOContext m_io_context;  // snapshot/namespace context for the read
+ int m_op_flags;
+ int m_read_flags;
+};
+
+template <typename ImageCtxT = ImageCtx>
+class AbstractImageWriteRequest : public ImageRequest<ImageCtxT> {  // common logic for all modifying ops
+public:
+ inline void flag_synchronous() {
+ m_synchronous = true;  // passed to append_journal_event() by the sender
+ }
+
+protected:
+ using typename ImageRequest<ImageCtxT>::ObjectRequests;
+
+ AbstractImageWriteRequest(ImageCtxT &image_ctx, AioCompletion *aio_comp,
+ Extents &&image_extents, ImageArea area,
+ const char *trace_name,
+ const ZTracer::Trace &parent_trace)
+ : ImageRequest<ImageCtxT>(image_ctx, aio_comp, std::move(image_extents),
+ area, trace_name, parent_trace),
+ m_synchronous(false) {
+ }
+
+ void send_request() override;
+
+ // hook for subclasses to drop/adjust extents before dispatch; default keeps all
+ virtual int prune_object_extents(
+ LightweightObjectExtents* object_extents) const {
+ return 0;
+ }
+
+ void send_object_requests(const LightweightObjectExtents &object_extents,
+ IOContext io_context, uint64_t journal_tid);
+ virtual ObjectDispatchSpec *create_object_request(
+ const LightweightObjectExtent &object_extent, IOContext io_context,
+ uint64_t journal_tid, bool single_extent, Context *on_finish) = 0;
+
+ virtual uint64_t append_journal_event(bool synchronous) = 0;
+ virtual void update_stats(size_t length) = 0;  // per-op perf-counter update
+
+private:
+ bool m_synchronous;
+};
+
+template <typename ImageCtxT = ImageCtx>
+class ImageWriteRequest : public AbstractImageWriteRequest<ImageCtxT> {  // plain write of a bufferlist
+public:
+ ImageWriteRequest(ImageCtxT &image_ctx, AioCompletion *aio_comp,
+ Extents &&image_extents, ImageArea area, bufferlist &&bl,
+ int op_flags, const ZTracer::Trace &parent_trace)
+ : AbstractImageWriteRequest<ImageCtxT>(
+ image_ctx, aio_comp, std::move(image_extents), area,
+ "write", parent_trace),
+ m_bl(std::move(bl)), m_op_flags(op_flags) {
+ }
+
+protected:
+ using typename ImageRequest<ImageCtxT>::ObjectRequests;
+
+ aio_type_t get_aio_type() const override {
+ return AIO_TYPE_WRITE;
+ }
+ const char *get_request_type() const override {
+ return "aio_write";
+ }
+
+ // slice the per-object portion of m_bl for one object extent
+ void assemble_extent(const LightweightObjectExtent &object_extent,
+ bufferlist *bl);
+
+ ObjectDispatchSpec *create_object_request(
+ const LightweightObjectExtent &object_extent, IOContext io_context,
+ uint64_t journal_tid, bool single_extent, Context *on_finish) override;
+
+ uint64_t append_journal_event(bool synchronous) override;
+ void update_stats(size_t length) override;
+
+private:
+ bufferlist m_bl;  // payload spanning all image extents
+ int m_op_flags;
+};
+
+template <typename ImageCtxT = ImageCtx>
+class ImageDiscardRequest : public AbstractImageWriteRequest<ImageCtxT> {  // discard/trim request
+public:
+ ImageDiscardRequest(ImageCtxT &image_ctx, AioCompletion *aio_comp,
+ Extents&& image_extents, ImageArea area,
+ uint32_t discard_granularity_bytes,
+ const ZTracer::Trace &parent_trace)
+ : AbstractImageWriteRequest<ImageCtxT>(
+ image_ctx, aio_comp, std::move(image_extents), area,
+ "discard", parent_trace),
+ m_discard_granularity_bytes(discard_granularity_bytes) {
+ }
+
+protected:
+ using typename ImageRequest<ImageCtxT>::ObjectRequests;
+
+ aio_type_t get_aio_type() const override {
+ return AIO_TYPE_DISCARD;
+ }
+ const char *get_request_type() const override {
+ return "aio_discard";
+ }
+
+ ObjectDispatchSpec *create_object_request(
+ const LightweightObjectExtent &object_extent, IOContext io_context,
+ uint64_t journal_tid, bool single_extent, Context *on_finish) override;
+
+ uint64_t append_journal_event(bool synchronous) override;
+ void update_stats(size_t length) override;
+
+ // overridden to honor the configured discard granularity
+ int prune_object_extents(
+ LightweightObjectExtents* object_extents) const override;
+
+private:
+ uint32_t m_discard_granularity_bytes;
+};
+
+template <typename ImageCtxT = ImageCtx>
+class ImageFlushRequest : public ImageRequest<ImageCtxT> {  // flush request; carries no extents
+public:
+ ImageFlushRequest(ImageCtxT &image_ctx, AioCompletion *aio_comp,
+ FlushSource flush_source,
+ const ZTracer::Trace &parent_trace)
+ : ImageRequest<ImageCtxT>(image_ctx, aio_comp, {},
+ ImageArea::DATA /* dummy for {} */,
+ "flush", parent_trace),
+ m_flush_source(flush_source) {
+ }
+
+protected:
+ using typename ImageRequest<ImageCtxT>::ObjectRequests;
+
+ void update_timestamp() override {  // no-op: flush does not modify image data
+ }
+ void send_request() override;
+
+ aio_type_t get_aio_type() const override {
+ return AIO_TYPE_FLUSH;
+ }
+ const char *get_request_type() const override {
+ return "aio_flush";
+ }
+
+private:
+ FlushSource m_flush_source;  // who triggered the flush (user vs internal)
+
+};
+
+template <typename ImageCtxT = ImageCtx>
+class ImageWriteSameRequest : public AbstractImageWriteRequest<ImageCtxT> {  // replicate one buffer across extents
+public:
+ ImageWriteSameRequest(ImageCtxT &image_ctx, AioCompletion *aio_comp,
+ Extents&& image_extents, ImageArea area,
+ bufferlist &&bl, int op_flags,
+ const ZTracer::Trace &parent_trace)
+ : AbstractImageWriteRequest<ImageCtxT>(
+ image_ctx, aio_comp, std::move(image_extents), area,
+ "writesame", parent_trace),
+ m_data_bl(std::move(bl)), m_op_flags(op_flags) {
+ }
+
+protected:
+ using typename ImageRequest<ImageCtxT>::ObjectRequests;
+
+ aio_type_t get_aio_type() const override {
+ return AIO_TYPE_WRITESAME;
+ }
+ const char *get_request_type() const override {
+ return "aio_writesame";
+ }
+
+ ObjectDispatchSpec *create_object_request(
+ const LightweightObjectExtent &object_extent, IOContext io_context,
+ uint64_t journal_tid, bool single_extent, Context *on_finish) override;
+
+ uint64_t append_journal_event(bool synchronous) override;
+ void update_stats(size_t length) override;
+private:
+ bufferlist m_data_bl;  // the repeated pattern (not the full payload)
+ int m_op_flags;
+};
+
+template <typename ImageCtxT = ImageCtx>
+class ImageCompareAndWriteRequest : public AbstractImageWriteRequest<ImageCtxT> {  // atomic compare-then-write
+public:
+ using typename ImageRequest<ImageCtxT>::ObjectRequests;
+
+ ImageCompareAndWriteRequest(ImageCtxT &image_ctx, AioCompletion *aio_comp,
+ Extents &&image_extents, ImageArea area,
+ bufferlist &&cmp_bl, bufferlist &&bl,
+ uint64_t *mismatch_offset, int op_flags,
+ const ZTracer::Trace &parent_trace)
+ : AbstractImageWriteRequest<ImageCtxT>(
+ image_ctx, aio_comp, std::move(image_extents), area,
+ "compare_and_write", parent_trace),
+ m_cmp_bl(std::move(cmp_bl)), m_bl(std::move(bl)),
+ m_mismatch_offset(mismatch_offset), m_op_flags(op_flags) {
+ }
+
+protected:
+ void assemble_extent(const LightweightObjectExtent &object_extent,
+ bufferlist *bl, bufferlist *cmp_bl);
+
+ ObjectDispatchSpec *create_object_request(
+ const LightweightObjectExtent &object_extent, IOContext io_context,
+ uint64_t journal_tid, bool single_extent, Context *on_finish) override;
+
+ uint64_t append_journal_event(bool synchronous) override;
+ void update_stats(size_t length) override;
+
+ aio_type_t get_aio_type() const override {
+ return AIO_TYPE_COMPARE_AND_WRITE;
+ }
+ const char *get_request_type() const override {
+ return "aio_compare_and_write";
+ }
+
+ int prune_object_extents(
+ LightweightObjectExtents* object_extents) const override;
+
+private:
+ bufferlist m_cmp_bl;  // expected on-disk contents
+ bufferlist m_bl;  // replacement data written on match
+ uint64_t *m_mismatch_offset;  // out-param: first differing offset on mismatch
+ int m_op_flags;
+};
+
+template <typename ImageCtxT = ImageCtx>
+class ImageListSnapsRequest : public ImageRequest<ImageCtxT> {  // computes per-snapshot deltas for extents
+public:
+ ImageListSnapsRequest(
+ ImageCtxT& image_ctx, AioCompletion* aio_comp,
+ Extents&& image_extents, ImageArea area, SnapIds&& snap_ids,
+ int list_snaps_flags, SnapshotDelta* snapshot_delta,
+ const ZTracer::Trace& parent_trace);
+
+protected:
+ void update_timestamp() override {}  // no-op: listing snaps is read-only
+ void send_request() override;
+
+ aio_type_t get_aio_type() const override {
+ return AIO_TYPE_GENERIC;
+ }
+ const char *get_request_type() const override {
+ return "list-snaps";
+ }
+
+private:
+ SnapIds m_snap_ids;
+ int m_list_snaps_flags;
+ SnapshotDelta* m_snapshot_delta;  // out-param: assembled result for the caller
+};
+
+} // namespace io
+} // namespace librbd
+
+extern template class librbd::io::ImageRequest<librbd::ImageCtx>;
+extern template class librbd::io::ImageReadRequest<librbd::ImageCtx>;
+extern template class librbd::io::AbstractImageWriteRequest<librbd::ImageCtx>;
+extern template class librbd::io::ImageWriteRequest<librbd::ImageCtx>;
+extern template class librbd::io::ImageDiscardRequest<librbd::ImageCtx>;
+extern template class librbd::io::ImageFlushRequest<librbd::ImageCtx>;
+extern template class librbd::io::ImageWriteSameRequest<librbd::ImageCtx>;
+extern template class librbd::io::ImageCompareAndWriteRequest<librbd::ImageCtx>;
+extern template class librbd::io::ImageListSnapsRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_IO_IMAGE_REQUEST_H
diff --git a/src/librbd/io/IoOperations.cc b/src/librbd/io/IoOperations.cc
new file mode 100644
index 000000000..7db7e7a80
--- /dev/null
+++ b/src/librbd/io/IoOperations.cc
@@ -0,0 +1,101 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <boost/lexical_cast.hpp>
+#include <boost/algorithm/string.hpp>
+
+#include "librbd/io/Types.h"
+#include "librbd/io/IoOperations.h"
+
+#include <map>
+#include <vector>
+
+namespace librbd {
+namespace io {
+
+#define RBD_IO_OPERATION_NAME_READ "read"
+#define RBD_IO_OPERATION_NAME_WRITE "write"
+#define RBD_IO_OPERATION_NAME_DISCARD "discard"
+#define RBD_IO_OPERATION_NAME_WRITE_SAME "write_same"
+#define RBD_IO_OPERATION_NAME_COMPARE_AND_WRITE "compare_and_write"
+
+// user-visible operation name -> operation bit, used by both converters below
+static const std::map<std::string, uint64_t> RBD_IO_OPERATION_MAP = {
+ {RBD_IO_OPERATION_NAME_READ, RBD_IO_OPERATION_READ},
+ {RBD_IO_OPERATION_NAME_WRITE, RBD_IO_OPERATION_WRITE},
+ {RBD_IO_OPERATION_NAME_DISCARD, RBD_IO_OPERATION_DISCARD},
+ {RBD_IO_OPERATION_NAME_WRITE_SAME, RBD_IO_OPERATION_WRITE_SAME},
+ {RBD_IO_OPERATION_NAME_COMPARE_AND_WRITE, RBD_IO_OPERATION_COMPARE_AND_WRITE},
+};
+static_assert((RBD_IO_OPERATION_COMPARE_AND_WRITE << 1) > RBD_IO_OPERATIONS_ALL,
+ "new RBD io operation added");  // forces this map to be updated when a new op bit appears
+
+std::string rbd_io_operations_to_string(uint64_t operations,
+ std::ostream *err)
+{  // renders an op bitmask as a comma-separated name list; unknown bits reported via err
+ std::string r;
+ for (auto& i : RBD_IO_OPERATION_MAP) {
+ if (operations & i.second) {
+ if (!r.empty()) {
+ r += ",";
+ }
+ r += i.first;
+ operations &= ~i.second;  // clear handled bit; leftovers are unknown
+ }
+ }
+ if (err && operations) {
+ *err << "ignoring unknown io operation mask 0x"
+ << std::hex << operations << std::dec;
+ }
+ return r;
+}
+
+uint64_t rbd_io_operations_from_string(const std::string& orig_value,
+ std::ostream *err)
+{  // parses either a numeric mask or a comma-separated op-name list into an op bitmask
+ uint64_t operations = 0;
+ std::string value = orig_value;
+ boost::trim(value);
+
+ // empty string means default operations
+ if (!value.size()) {
+ return RBD_IO_OPERATIONS_DEFAULT;
+ }
+
+ try {
+ // numeric?
+ operations = boost::lexical_cast<uint64_t>(value);
+
+ // drop unrecognized bits
+ uint64_t unsupported_operations = (operations & ~RBD_IO_OPERATIONS_ALL);
+ if (unsupported_operations != 0ull) {
+ operations &= RBD_IO_OPERATIONS_ALL;
+ if (err) {
+ *err << "ignoring unknown operation mask 0x"
+ << std::hex << unsupported_operations << std::dec;
+ }
+ }
+ } catch (boost::bad_lexical_cast&) {
+ // operation name list?
+ bool errors = false;
+ std::vector<std::string> operation_names;
+ boost::split(operation_names, value, boost::is_any_of(","));
+ for (auto operation_name: operation_names) {
+ boost::trim(operation_name);
+ auto operation_it = RBD_IO_OPERATION_MAP.find(operation_name);
+ if (operation_it != RBD_IO_OPERATION_MAP.end()) {
+ operations |= operation_it->second;  // |= not +=: a duplicated name must not carry into other bits
+ } else if (err) {
+ if (errors) {
+ *err << ", ";
+ } else {
+ errors = true;
+ }
+ *err << "ignoring unknown operation " << operation_name;
+ }
+ }
+ }
+ return operations;
+}
+
+} // namespace io
+} // namespace librbd
diff --git a/src/librbd/io/IoOperations.h b/src/librbd/io/IoOperations.h
new file mode 100644
index 000000000..93d3ef4fe
--- /dev/null
+++ b/src/librbd/io/IoOperations.h
@@ -0,0 +1,18 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <string>
+#include <ostream>
+
+namespace librbd {
+namespace io {
+
+ // render an operation bitmask as comma-separated names; unknown bits go to *err
+ std::string rbd_io_operations_to_string(uint64_t ops,
+ std::ostream *err);
+ // parse a numeric mask or comma-separated name list; parse issues go to *err
+ uint64_t rbd_io_operations_from_string(const std::string& value,
+ std::ostream *err);
+
+} // namespace io
+} // namespace librbd
diff --git a/src/librbd/io/ObjectDispatch.cc b/src/librbd/io/ObjectDispatch.cc
new file mode 100644
index 000000000..a31cc74ea
--- /dev/null
+++ b/src/librbd/io/ObjectDispatch.cc
@@ -0,0 +1,161 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/io/ObjectDispatch.h"
+#include "common/dout.h"
+#include "librbd/AsioEngine.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Utils.h"
+#include "librbd/io/ObjectRequest.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::io::ObjectDispatch: " << this \
+ << " " << __func__ << ": "
+
+namespace librbd {
+namespace io {
+
+using librbd::util::data_object_name;
+
+template <typename I>
+ObjectDispatch<I>::ObjectDispatch(I* image_ctx)  // non-owning; image_ctx must outlive this layer
+ : m_image_ctx(image_ctx) {
+}
+
+template <typename I>
+void ObjectDispatch<I>::shut_down(Context* on_finish) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 5) << dendl;
+
+ m_image_ctx->asio_engine->post(on_finish, 0);  // nothing to tear down; just complete asynchronously
+}
+
+template <typename I>
+bool ObjectDispatch<I>::read(
+ uint64_t object_no, ReadExtents* extents, IOContext io_context,
+ int op_flags, int read_flags, const ZTracer::Trace &parent_trace,
+ uint64_t* version, int* object_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << "object_no=" << object_no << " " << *extents << dendl;
+
+ // terminal layer: issue the actual object read; no further dispatching
+ *dispatch_result = DISPATCH_RESULT_COMPLETE;
+ auto req = new ObjectReadRequest<I>(m_image_ctx, object_no, extents,
+ io_context, op_flags, read_flags,
+ parent_trace, version, on_dispatched);
+ req->send();
+ return true;
+}
+
+template <typename I>
+bool ObjectDispatch<I>::discard(
+ uint64_t object_no, uint64_t object_off, uint64_t object_len,
+ IOContext io_context, int discard_flags,
+ const ZTracer::Trace &parent_trace, int* object_dispatch_flags,
+ uint64_t* journal_tid, DispatchResult* dispatch_result,
+ Context** on_finish, Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << data_object_name(m_image_ctx, object_no) << " "
+ << object_off << "~" << object_len << dendl;
+
+ // terminal layer: issue the actual object discard; no further dispatching
+ *dispatch_result = DISPATCH_RESULT_COMPLETE;
+ auto req = new ObjectDiscardRequest<I>(m_image_ctx, object_no, object_off,
+ object_len, io_context, discard_flags,
+ parent_trace, on_dispatched);
+ req->send();
+ return true;
+}
+
+template <typename I>
+bool ObjectDispatch<I>::write(
+ uint64_t object_no, uint64_t object_off, ceph::bufferlist&& data,
+ IOContext io_context, int op_flags, int write_flags,
+ std::optional<uint64_t> assert_version,
+ const ZTracer::Trace &parent_trace, int* object_dispatch_flags,
+ uint64_t* journal_tid, DispatchResult* dispatch_result,
+ Context** on_finish, Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << data_object_name(m_image_ctx, object_no) << " "
+ << object_off << "~" << data.length() << dendl;
+
+ // terminal layer: issue the actual object write; no further dispatching
+ *dispatch_result = DISPATCH_RESULT_COMPLETE;
+ auto req = new ObjectWriteRequest<I>(m_image_ctx, object_no, object_off,
+ std::move(data), io_context, op_flags,
+ write_flags, assert_version,
+ parent_trace, on_dispatched);
+ req->send();
+ return true;
+}
+
+template <typename I>
+bool ObjectDispatch<I>::write_same(
+ uint64_t object_no, uint64_t object_off, uint64_t object_len,
+ LightweightBufferExtents&& buffer_extents, ceph::bufferlist&& data,
+ IOContext io_context, int op_flags,
+ const ZTracer::Trace &parent_trace, int* object_dispatch_flags,
+ uint64_t* journal_tid, DispatchResult* dispatch_result,
+ Context** on_finish, Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << data_object_name(m_image_ctx, object_no) << " "
+ << object_off << "~" << object_len << dendl;
+
+ // terminal layer: issue the actual object write-same; buffer_extents unused here
+ *dispatch_result = DISPATCH_RESULT_COMPLETE;
+ auto req = new ObjectWriteSameRequest<I>(m_image_ctx, object_no,
+ object_off, object_len,
+ std::move(data), io_context,
+ op_flags, parent_trace,
+ on_dispatched);
+ req->send();
+ return true;
+}
+
+template <typename I>
+bool ObjectDispatch<I>::compare_and_write(
+ uint64_t object_no, uint64_t object_off, ceph::bufferlist&& cmp_data,
+ ceph::bufferlist&& write_data, IOContext io_context, int op_flags,
+ const ZTracer::Trace &parent_trace, uint64_t* mismatch_offset,
+ int* object_dispatch_flags, uint64_t* journal_tid,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << data_object_name(m_image_ctx, object_no) << " "
+ << object_off << "~" << write_data.length() << dendl;
+
+ // terminal layer: issue the actual compare-and-write; no further dispatching
+ *dispatch_result = DISPATCH_RESULT_COMPLETE;
+ auto req = new ObjectCompareAndWriteRequest<I>(m_image_ctx, object_no,
+ object_off,
+ std::move(cmp_data),
+ std::move(write_data),
+ io_context, mismatch_offset,
+ op_flags, parent_trace,
+ on_dispatched);
+ req->send();
+ return true;
+}
+
+template <typename I>
+bool ObjectDispatch<I>::list_snaps(
+ uint64_t object_no, io::Extents&& extents, SnapIds&& snap_ids,
+ int list_snap_flags, const ZTracer::Trace &parent_trace,
+ SnapshotDelta* snapshot_delta, int* object_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << data_object_name(m_image_ctx, object_no) << " "
+ << "extents=" << extents << ", "
+ << "snap_ids=" << snap_ids << dendl;
+
+ // terminal layer: issue the actual list-snaps request; no further dispatching
+ *dispatch_result = DISPATCH_RESULT_COMPLETE;
+ auto req = ObjectListSnapsRequest<I>::create(
+ m_image_ctx, object_no, std::move(extents), std::move(snap_ids),
+ list_snap_flags, parent_trace, snapshot_delta, on_dispatched);
+ req->send();
+ return true;
+}
+
+} // namespace io
+} // namespace librbd
+
+template class librbd::io::ObjectDispatch<librbd::ImageCtx>;
diff --git a/src/librbd/io/ObjectDispatch.h b/src/librbd/io/ObjectDispatch.h
new file mode 100644
index 000000000..dd1f7261d
--- /dev/null
+++ b/src/librbd/io/ObjectDispatch.h
@@ -0,0 +1,115 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IO_OBJECT_DISPATCH_H
+#define CEPH_LIBRBD_IO_OBJECT_DISPATCH_H
+
+#include "include/int_types.h"
+#include "include/buffer.h"
+#include "include/rados/librados.hpp"
+#include "common/zipkin_trace.h"
+#include "librbd/io/Types.h"
+#include "librbd/io/ObjectDispatchInterface.h"
+
+struct Context;
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace io {
+
+struct AioCompletion;
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class ObjectDispatch : public ObjectDispatchInterface {  // core layer: issues the real object IO requests
+public:
+ ObjectDispatch(ImageCtxT* image_ctx);
+
+ ObjectDispatchLayer get_dispatch_layer() const override {
+ return OBJECT_DISPATCH_LAYER_CORE;
+ }
+
+ void shut_down(Context* on_finish) override;
+
+ bool read(
+ uint64_t object_no, ReadExtents* extents, IOContext io_context,
+ int op_flags, int read_flags, const ZTracer::Trace &parent_trace,
+ uint64_t* version, int* object_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override;
+
+ bool discard(
+ uint64_t object_no, uint64_t object_off, uint64_t object_len,
+ IOContext io_context, int discard_flags,
+ const ZTracer::Trace &parent_trace, int* object_dispatch_flags,
+ uint64_t* journal_tid, DispatchResult* dispatch_result,
+ Context** on_finish, Context* on_dispatched) override;
+
+ bool write(
+ uint64_t object_no, uint64_t object_off, ceph::bufferlist&& data,
+ IOContext io_context, int op_flags, int write_flags,
+ std::optional<uint64_t> assert_version,
+ const ZTracer::Trace &parent_trace, int* object_dispatch_flags,
+ uint64_t* journal_tid, DispatchResult* dispatch_result,
+ Context** on_finish, Context* on_dispatched) override;
+
+ bool write_same(
+ uint64_t object_no, uint64_t object_off, uint64_t object_len,
+ LightweightBufferExtents&& buffer_extents, ceph::bufferlist&& data,
+ IOContext io_context, int op_flags,
+ const ZTracer::Trace &parent_trace, int* object_dispatch_flags,
+ uint64_t* journal_tid, DispatchResult* dispatch_result,
+ Context** on_finish, Context* on_dispatched) override;
+
+ bool compare_and_write(
+ uint64_t object_no, uint64_t object_off, ceph::bufferlist&& cmp_data,
+ ceph::bufferlist&& write_data, IOContext io_context, int op_flags,
+ const ZTracer::Trace &parent_trace, uint64_t* mismatch_offset,
+ int* object_dispatch_flags, uint64_t* journal_tid,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override;
+
+ bool flush(
+ FlushSource flush_source, const ZTracer::Trace &parent_trace,
+ uint64_t* journal_tid, DispatchResult* dispatch_result,
+ Context** on_finish, Context* on_dispatched) override {
+ return false;  // not handled at this layer
+ }
+
+ bool list_snaps(
+ uint64_t object_no, io::Extents&& extents, SnapIds&& snap_ids,
+ int list_snap_flags, const ZTracer::Trace &parent_trace,
+ SnapshotDelta* snapshot_delta, int* object_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override;
+
+ bool invalidate_cache(Context* on_finish) override {
+ return false;  // no cache at this layer
+ }
+ bool reset_existence_cache(Context* on_finish) override {
+ return false;  // no existence cache at this layer
+ }
+
+ void extent_overwritten(
+ uint64_t object_no, uint64_t object_off, uint64_t object_len,
+ uint64_t journal_tid, uint64_t new_journal_tid) override {
+ }
+
+ int prepare_copyup(
+ uint64_t object_no,
+ SnapshotSparseBufferlist* snapshot_sparse_bufferlist) override {
+ return 0;  // nothing to adjust at this layer
+ }
+
+private:
+ ImageCtxT* m_image_ctx;  // non-owning back-pointer
+
+};
+
+} // namespace io
+} // namespace librbd
+
+extern template class librbd::io::ObjectDispatch<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_IO_OBJECT_DISPATCH_H
diff --git a/src/librbd/io/ObjectDispatchInterface.h b/src/librbd/io/ObjectDispatchInterface.h
new file mode 100644
index 000000000..2e9dd1300
--- /dev/null
+++ b/src/librbd/io/ObjectDispatchInterface.h
@@ -0,0 +1,102 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IO_OBJECT_DISPATCH_INTERFACE_H
+#define CEPH_LIBRBD_IO_OBJECT_DISPATCH_INTERFACE_H
+
+#include "include/int_types.h"
+#include "include/buffer.h"
+#include "include/rados/librados.hpp"
+#include "common/zipkin_trace.h"
+#include "librbd/Types.h"
+#include "librbd/io/Types.h"
+
+struct Context;
+struct RWLock;
+
+namespace librbd {
+namespace io {
+
+struct AioCompletion;
+struct ObjectDispatchInterface;
+struct ObjectDispatchSpec;
+
+struct ObjectDispatchInterface {  // contract implemented by each object-dispatch layer
+ // aliases consumed by the generic Dispatcher template
+ typedef ObjectDispatchInterface Dispatch;
+ typedef ObjectDispatchLayer DispatchLayer;
+ typedef ObjectDispatchSpec DispatchSpec;
+
+ virtual ~ObjectDispatchInterface() {
+ }
+
+ virtual ObjectDispatchLayer get_dispatch_layer() const = 0;
+
+ virtual void shut_down(Context* on_finish) = 0;
+
+ // each IO hook returns true if the layer handled (or queued) the IO;
+ // dispatch_result tells the dispatcher whether to continue to lower layers
+ virtual bool read(
+ uint64_t object_no, ReadExtents* extents, IOContext io_context,
+ int op_flags, int read_flags, const ZTracer::Trace &parent_trace,
+ uint64_t* version, int* object_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) = 0;
+
+ virtual bool discard(
+ uint64_t object_no, uint64_t object_off, uint64_t object_len,
+ IOContext io_context, int discard_flags,
+ const ZTracer::Trace &parent_trace, int* object_dispatch_flags,
+ uint64_t* journal_tid, DispatchResult* dispatch_result,
+ Context**on_finish, Context* on_dispatched) = 0;
+
+ virtual bool write(
+ uint64_t object_no, uint64_t object_off, ceph::bufferlist&& data,
+ IOContext io_context, int op_flags, int write_flags,
+ std::optional<uint64_t> assert_version,
+ const ZTracer::Trace &parent_trace, int* object_dispatch_flags,
+ uint64_t* journal_tid, DispatchResult* dispatch_result,
+ Context**on_finish, Context* on_dispatched) = 0;
+
+ virtual bool write_same(
+ uint64_t object_no, uint64_t object_off, uint64_t object_len,
+ LightweightBufferExtents&& buffer_extents, ceph::bufferlist&& data,
+ IOContext io_context, int op_flags, const ZTracer::Trace &parent_trace,
+ int* object_dispatch_flags, uint64_t* journal_tid,
+ DispatchResult* dispatch_result, Context**on_finish,
+ Context* on_dispatched) = 0;
+
+ virtual bool compare_and_write(
+ uint64_t object_no, uint64_t object_off, ceph::bufferlist&& cmp_data,
+ ceph::bufferlist&& write_data, IOContext io_context, int op_flags,
+ const ZTracer::Trace &parent_trace, uint64_t* mismatch_offset,
+ int* object_dispatch_flags, uint64_t* journal_tid,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) = 0;
+
+ virtual bool flush(
+ FlushSource flush_source, const ZTracer::Trace &parent_trace,
+ uint64_t* journal_tid, DispatchResult* dispatch_result,
+ Context** on_finish, Context* on_dispatched) = 0;
+
+ virtual bool list_snaps(
+ uint64_t object_no, Extents&& extents, SnapIds&& snap_ids,
+ int list_snap_flags, const ZTracer::Trace &parent_trace,
+ SnapshotDelta* snapshot_delta, int* object_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) = 0;
+
+ virtual bool invalidate_cache(Context* on_finish) = 0;
+ virtual bool reset_existence_cache(Context* on_finish) = 0;
+
+ // notification that a lower layer overwrote part of an object
+ virtual void extent_overwritten(
+ uint64_t object_no, uint64_t object_off, uint64_t object_len,
+ uint64_t journal_tid, uint64_t new_journal_tid) = 0;
+
+ virtual int prepare_copyup(
+ uint64_t object_no,
+ SnapshotSparseBufferlist* snapshot_sparse_bufferlist) = 0;
+
+};
+
+} // namespace io
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_IO_OBJECT_DISPATCH_INTERFACE_H
diff --git a/src/librbd/io/ObjectDispatchSpec.cc b/src/librbd/io/ObjectDispatchSpec.cc
new file mode 100644
index 000000000..3efff9774
--- /dev/null
+++ b/src/librbd/io/ObjectDispatchSpec.cc
@@ -0,0 +1,47 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/io/ObjectDispatchSpec.h"
+#include "include/Context.h"
+#include "librbd/io/ObjectDispatcherInterface.h"
+#include <boost/variant.hpp>
+
+namespace librbd {
+namespace io {
+
+void ObjectDispatchSpec::C_Dispatcher::complete(int r) {
+ if (r < 0) {
+ finish(r);  // errors short-circuit regardless of dispatch_result
+ return;
+ }
+
+ switch (object_dispatch_spec->dispatch_result) {
+ case DISPATCH_RESULT_CONTINUE:
+ object_dispatch_spec->send();  // re-dispatch to the next layer
+ break;
+ case DISPATCH_RESULT_COMPLETE:
+ finish(r);  // IO fully handled; complete and destroy the spec
+ break;
+ case DISPATCH_RESULT_INVALID:
+ case DISPATCH_RESULT_RESTART:
+ ceph_abort();  // these states must never reach this completion path
+ break;
+ }
+}
+
+void ObjectDispatchSpec::C_Dispatcher::finish(int r) {
+ on_finish->complete(r);
+ delete object_dispatch_spec;  // spec owns this context, so this also destroys *this
+}
+
+void ObjectDispatchSpec::send() {
+ object_dispatcher->send(this);  // hand the spec to the layered dispatcher
+}
+
+void ObjectDispatchSpec::fail(int r) {
+ ceph_assert(r < 0);
+ dispatcher_ctx.complete(r);  // routes through C_Dispatcher::complete, which finishes on error
+}
+
+} // namespace io
+} // namespace librbd
diff --git a/src/librbd/io/ObjectDispatchSpec.h b/src/librbd/io/ObjectDispatchSpec.h
new file mode 100644
index 000000000..a0d4b49a4
--- /dev/null
+++ b/src/librbd/io/ObjectDispatchSpec.h
@@ -0,0 +1,295 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IO_OBJECT_DISPATCH_SPEC_H
+#define CEPH_LIBRBD_IO_OBJECT_DISPATCH_SPEC_H
+
+#include "include/int_types.h"
+#include "include/buffer.h"
+#include "include/Context.h"
+#include "include/rados/librados.hpp"
+#include "common/zipkin_trace.h"
+#include "librbd/Types.h"
+#include "librbd/io/Types.h"
+#include <boost/variant/variant.hpp>
+
+namespace librbd {
+namespace io {
+
+struct ObjectDispatcherInterface;
+
+// A single dispatchable object IO request: bundles the request payload (as
+// a boost::variant), per-request dispatch state, and the completion chain
+// used to walk the object dispatch layers.  Instances are heap-allocated by
+// the create_* factories and self-destruct via C_Dispatcher::finish.
+struct ObjectDispatchSpec {
+private:
+  // helper to avoid extra heap allocation per object IO
+  struct C_Dispatcher : public Context {
+    ObjectDispatchSpec* object_dispatch_spec;
+    Context* on_finish;
+
+    C_Dispatcher(ObjectDispatchSpec* object_dispatch_spec, Context* on_finish)
+      : object_dispatch_spec(object_dispatch_spec), on_finish(on_finish) {
+    }
+
+    void complete(int r) override;
+    void finish(int r) override;
+  };
+
+public:
+  // common base for all per-object request payloads
+  struct RequestBase {
+    uint64_t object_no;
+
+    RequestBase(uint64_t object_no)
+      : object_no(object_no) {
+    }
+  };
+
+  // read one or more extents from an object; results land in *extents
+  struct ReadRequest : public RequestBase {
+    ReadExtents* extents;
+    int read_flags;
+    uint64_t* version;
+
+    ReadRequest(uint64_t object_no, ReadExtents* extents, int read_flags,
+                uint64_t* version)
+      : RequestBase(object_no), extents(extents), read_flags(read_flags),
+        version(version) {
+    }
+  };
+
+  // common base for all mutating requests; journal_tid is passed by
+  // reference into the dispatch layers so they can update it
+  struct WriteRequestBase : public RequestBase {
+    uint64_t object_off;
+    uint64_t journal_tid;
+
+    WriteRequestBase(uint64_t object_no, uint64_t object_off,
+                     uint64_t journal_tid)
+      : RequestBase(object_no), object_off(object_off),
+        journal_tid(journal_tid) {
+    }
+  };
+
+  struct DiscardRequest : public WriteRequestBase {
+    uint64_t object_len;
+    int discard_flags;
+
+    DiscardRequest(uint64_t object_no, uint64_t object_off, uint64_t object_len,
+                   int discard_flags, uint64_t journal_tid)
+      : WriteRequestBase(object_no, object_off, journal_tid),
+        object_len(object_len), discard_flags(discard_flags) {
+    }
+  };
+
+  struct WriteRequest : public WriteRequestBase {
+    ceph::bufferlist data;
+    int write_flags;
+    // if set, write only succeeds when the object is at this version
+    std::optional<uint64_t> assert_version;
+
+    WriteRequest(uint64_t object_no, uint64_t object_off,
+                 ceph::bufferlist&& data, int write_flags,
+                 std::optional<uint64_t> assert_version, uint64_t journal_tid)
+      : WriteRequestBase(object_no, object_off, journal_tid),
+        data(std::move(data)), write_flags(write_flags),
+        assert_version(assert_version) {
+    }
+  };
+
+  struct WriteSameRequest : public WriteRequestBase {
+    uint64_t object_len;
+    LightweightBufferExtents buffer_extents;
+    ceph::bufferlist data;
+
+    WriteSameRequest(uint64_t object_no, uint64_t object_off,
+                     uint64_t object_len,
+                     LightweightBufferExtents&& buffer_extents,
+                     ceph::bufferlist&& data, uint64_t journal_tid)
+      : WriteRequestBase(object_no, object_off, journal_tid),
+        object_len(object_len), buffer_extents(std::move(buffer_extents)),
+        data(std::move(data)) {
+    }
+  };
+
+  struct CompareAndWriteRequest : public WriteRequestBase {
+    ceph::bufferlist cmp_data;
+    ceph::bufferlist data;
+    // receives the offset of the first mismatch on comparison failure
+    uint64_t* mismatch_offset;
+
+    CompareAndWriteRequest(uint64_t object_no, uint64_t object_off,
+                           ceph::bufferlist&& cmp_data, ceph::bufferlist&& data,
+                           uint64_t* mismatch_offset,
+                           uint64_t journal_tid)
+      : WriteRequestBase(object_no, object_off, journal_tid),
+        cmp_data(std::move(cmp_data)), data(std::move(data)),
+        mismatch_offset(mismatch_offset) {
+    }
+  };
+
+  // note: flush is not tied to a specific object and has no RequestBase
+  struct FlushRequest {
+    FlushSource flush_source;
+    uint64_t journal_tid;
+
+    FlushRequest(FlushSource flush_source, uint64_t journal_tid)
+      : flush_source(flush_source), journal_tid(journal_tid) {
+    }
+  };
+
+  struct ListSnapsRequest : public RequestBase {
+    Extents extents;
+    SnapIds snap_ids;
+    int list_snaps_flags;
+    SnapshotDelta* snapshot_delta;
+
+    ListSnapsRequest(uint64_t object_no, Extents&& extents,
+                     SnapIds&& snap_ids, int list_snaps_flags,
+                     SnapshotDelta* snapshot_delta)
+      : RequestBase(object_no), extents(std::move(extents)),
+        snap_ids(std::move(snap_ids)),list_snaps_flags(list_snaps_flags),
+        snapshot_delta(snapshot_delta) {
+    }
+  };
+
+  typedef boost::variant<ReadRequest,
+                         DiscardRequest,
+                         WriteRequest,
+                         WriteSameRequest,
+                         CompareAndWriteRequest,
+                         FlushRequest,
+                         ListSnapsRequest> Request;
+
+  // completion chain shared by all dispatch layers; completing it either
+  // advances to the next layer or finishes (and deletes) this spec
+  C_Dispatcher dispatcher_ctx;
+
+  ObjectDispatcherInterface* object_dispatcher;
+  // the layer that should next receive the request
+  ObjectDispatchLayer dispatch_layer;
+  int object_dispatch_flags = 0;
+  // set by each layer before completing dispatcher_ctx
+  DispatchResult dispatch_result = DISPATCH_RESULT_INVALID;
+
+  Request request;
+  IOContext io_context;
+  int op_flags;
+  ZTracer::Trace parent_trace;
+
+  // Factory helpers: each allocates a spec owned by the dispatch chain
+  // (freed by C_Dispatcher::finish once on_finish has been completed).
+  template <typename ImageCtxT>
+  static ObjectDispatchSpec* create_read(
+      ImageCtxT* image_ctx, ObjectDispatchLayer object_dispatch_layer,
+      uint64_t object_no, ReadExtents* extents, IOContext io_context,
+      int op_flags, int read_flags, const ZTracer::Trace &parent_trace,
+      uint64_t* version, Context* on_finish) {
+    return new ObjectDispatchSpec(image_ctx->io_object_dispatcher,
+                                  object_dispatch_layer,
+                                  ReadRequest{object_no, extents,
+                                              read_flags, version},
+                                  io_context, op_flags, parent_trace,
+                                  on_finish);
+  }
+
+  template <typename ImageCtxT>
+  static ObjectDispatchSpec* create_discard(
+      ImageCtxT* image_ctx, ObjectDispatchLayer object_dispatch_layer,
+      uint64_t object_no, uint64_t object_off, uint64_t object_len,
+      IOContext io_context, int discard_flags, uint64_t journal_tid,
+      const ZTracer::Trace &parent_trace, Context *on_finish) {
+    return new ObjectDispatchSpec(image_ctx->io_object_dispatcher,
+                                  object_dispatch_layer,
+                                  DiscardRequest{object_no, object_off,
+                                                 object_len, discard_flags,
+                                                 journal_tid},
+                                  io_context, 0, parent_trace, on_finish);
+  }
+
+  template <typename ImageCtxT>
+  static ObjectDispatchSpec* create_write(
+      ImageCtxT* image_ctx, ObjectDispatchLayer object_dispatch_layer,
+      uint64_t object_no, uint64_t object_off, ceph::bufferlist&& data,
+      IOContext io_context, int op_flags, int write_flags,
+      std::optional<uint64_t> assert_version, uint64_t journal_tid,
+      const ZTracer::Trace &parent_trace, Context *on_finish) {
+    return new ObjectDispatchSpec(image_ctx->io_object_dispatcher,
+                                  object_dispatch_layer,
+                                  WriteRequest{object_no, object_off,
+                                               std::move(data), write_flags,
+                                               assert_version, journal_tid},
+                                  io_context, op_flags, parent_trace,
+                                  on_finish);
+  }
+
+  template <typename ImageCtxT>
+  static ObjectDispatchSpec* create_write_same(
+      ImageCtxT* image_ctx, ObjectDispatchLayer object_dispatch_layer,
+      uint64_t object_no, uint64_t object_off, uint64_t object_len,
+      LightweightBufferExtents&& buffer_extents, ceph::bufferlist&& data,
+      IOContext io_context, int op_flags, uint64_t journal_tid,
+      const ZTracer::Trace &parent_trace, Context *on_finish) {
+    return new ObjectDispatchSpec(image_ctx->io_object_dispatcher,
+                                  object_dispatch_layer,
+                                  WriteSameRequest{object_no, object_off,
+                                                   object_len,
+                                                   std::move(buffer_extents),
+                                                   std::move(data),
+                                                   journal_tid},
+                                  io_context, op_flags, parent_trace,
+                                  on_finish);
+  }
+
+  template <typename ImageCtxT>
+  static ObjectDispatchSpec* create_compare_and_write(
+      ImageCtxT* image_ctx, ObjectDispatchLayer object_dispatch_layer,
+      uint64_t object_no, uint64_t object_off, ceph::bufferlist&& cmp_data,
+      ceph::bufferlist&& write_data, IOContext io_context,
+      uint64_t *mismatch_offset, int op_flags, uint64_t journal_tid,
+      const ZTracer::Trace &parent_trace, Context *on_finish) {
+    return new ObjectDispatchSpec(image_ctx->io_object_dispatcher,
+                                  object_dispatch_layer,
+                                  CompareAndWriteRequest{object_no,
+                                                         object_off,
+                                                         std::move(cmp_data),
+                                                         std::move(write_data),
+                                                         mismatch_offset,
+                                                         journal_tid},
+                                  io_context, op_flags, parent_trace,
+                                  on_finish);
+  }
+
+  template <typename ImageCtxT>
+  static ObjectDispatchSpec* create_flush(
+      ImageCtxT* image_ctx, ObjectDispatchLayer object_dispatch_layer,
+      FlushSource flush_source, uint64_t journal_tid,
+      const ZTracer::Trace &parent_trace, Context *on_finish) {
+    return new ObjectDispatchSpec(image_ctx->io_object_dispatcher,
+                                  object_dispatch_layer,
+                                  FlushRequest{flush_source, journal_tid},
+                                  {}, 0, parent_trace, on_finish);
+  }
+
+  template <typename ImageCtxT>
+  static ObjectDispatchSpec* create_list_snaps(
+      ImageCtxT* image_ctx, ObjectDispatchLayer object_dispatch_layer,
+      uint64_t object_no, Extents&& extents, SnapIds&& snap_ids,
+      int list_snaps_flags, const ZTracer::Trace &parent_trace,
+      SnapshotDelta* snapshot_delta, Context* on_finish) {
+    return new ObjectDispatchSpec(image_ctx->io_object_dispatcher,
+                                  object_dispatch_layer,
+                                  ListSnapsRequest{object_no,
+                                                   std::move(extents),
+                                                   std::move(snap_ids),
+                                                   list_snaps_flags,
+                                                   snapshot_delta},
+                                  {}, 0, parent_trace, on_finish);
+  }
+
+  void send();
+  void fail(int r);
+
+private:
+  template <typename> friend class ObjectDispatcher;
+
+  // private: instances must be created through the factories above
+  ObjectDispatchSpec(ObjectDispatcherInterface* object_dispatcher,
+                     ObjectDispatchLayer object_dispatch_layer,
+                     Request&& request, IOContext io_context, int op_flags,
+                     const ZTracer::Trace& parent_trace, Context* on_finish)
+    : dispatcher_ctx(this, on_finish), object_dispatcher(object_dispatcher),
+      dispatch_layer(object_dispatch_layer), request(std::move(request)),
+      io_context(io_context), op_flags(op_flags), parent_trace(parent_trace) {
+  }
+
+};
+
+} // namespace io
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_IO_OBJECT_DISPATCH_SPEC_H
diff --git a/src/librbd/io/ObjectDispatcher.cc b/src/librbd/io/ObjectDispatcher.cc
new file mode 100644
index 000000000..b66c6bb18
--- /dev/null
+++ b/src/librbd/io/ObjectDispatcher.cc
@@ -0,0 +1,208 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/io/ObjectDispatcher.h"
+#include "include/Context.h"
+#include "common/AsyncOpTracker.h"
+#include "common/dout.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Utils.h"
+#include "librbd/asio/ContextWQ.h"
+#include "librbd/io/ObjectDispatch.h"
+#include "librbd/io/ObjectDispatchSpec.h"
+#include <boost/variant.hpp>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::io::ObjectDispatcher: " << this \
+ << " " << __func__ << ": "
+
+namespace librbd {
+namespace io {
+
+// Layer iterator that visits every object dispatch layer and asks each one
+// to reset its object existence cache.
+template <typename I>
+struct ObjectDispatcher<I>::C_ResetExistenceCache : public C_LayerIterator {
+  C_ResetExistenceCache(ObjectDispatcher* object_dispatcher, Context* on_finish)
+    : C_LayerIterator(object_dispatcher, OBJECT_DISPATCH_LAYER_NONE, on_finish) {
+  }
+
+  bool execute(ObjectDispatchInterface* object_dispatch,
+               Context* on_finish) override {
+    return object_dispatch->reset_existence_cache(on_finish);
+  }
+};
+
+// Variant visitor that translates the stored request payload into a call on
+// the corresponding ObjectDispatchInterface method, wiring the spec's
+// dispatch state (flags, result, journal tid, completion context) back into
+// the layer by pointer.  Note: bufferlists and extent containers are moved
+// out of the request on dispatch.
+template <typename I>
+struct ObjectDispatcher<I>::SendVisitor : public boost::static_visitor<bool> {
+  ObjectDispatchInterface* object_dispatch;
+  ObjectDispatchSpec* object_dispatch_spec;
+
+  SendVisitor(ObjectDispatchInterface* object_dispatch,
+              ObjectDispatchSpec* object_dispatch_spec)
+    : object_dispatch(object_dispatch),
+      object_dispatch_spec(object_dispatch_spec) {
+  }
+
+  bool operator()(ObjectDispatchSpec::ReadRequest& read) const {
+    return object_dispatch->read(
+      read.object_no, read.extents, object_dispatch_spec->io_context,
+      object_dispatch_spec->op_flags, read.read_flags,
+      object_dispatch_spec->parent_trace, read.version,
+      &object_dispatch_spec->object_dispatch_flags,
+      &object_dispatch_spec->dispatch_result,
+      &object_dispatch_spec->dispatcher_ctx.on_finish,
+      &object_dispatch_spec->dispatcher_ctx);
+  }
+
+  bool operator()(ObjectDispatchSpec::DiscardRequest& discard) const {
+    return object_dispatch->discard(
+      discard.object_no, discard.object_off, discard.object_len,
+      object_dispatch_spec->io_context, discard.discard_flags,
+      object_dispatch_spec->parent_trace,
+      &object_dispatch_spec->object_dispatch_flags, &discard.journal_tid,
+      &object_dispatch_spec->dispatch_result,
+      &object_dispatch_spec->dispatcher_ctx.on_finish,
+      &object_dispatch_spec->dispatcher_ctx);
+  }
+
+  bool operator()(ObjectDispatchSpec::WriteRequest& write) const {
+    // write.data is moved: it is consumed by the first layer that uses it
+    return object_dispatch->write(
+      write.object_no, write.object_off, std::move(write.data),
+      object_dispatch_spec->io_context, object_dispatch_spec->op_flags,
+      write.write_flags, write.assert_version,
+      object_dispatch_spec->parent_trace,
+      &object_dispatch_spec->object_dispatch_flags, &write.journal_tid,
+      &object_dispatch_spec->dispatch_result,
+      &object_dispatch_spec->dispatcher_ctx.on_finish,
+      &object_dispatch_spec->dispatcher_ctx);
+  }
+
+  bool operator()(ObjectDispatchSpec::WriteSameRequest& write_same) const {
+    return object_dispatch->write_same(
+      write_same.object_no, write_same.object_off, write_same.object_len,
+      std::move(write_same.buffer_extents), std::move(write_same.data),
+      object_dispatch_spec->io_context, object_dispatch_spec->op_flags,
+      object_dispatch_spec->parent_trace,
+      &object_dispatch_spec->object_dispatch_flags, &write_same.journal_tid,
+      &object_dispatch_spec->dispatch_result,
+      &object_dispatch_spec->dispatcher_ctx.on_finish,
+      &object_dispatch_spec->dispatcher_ctx);
+  }
+
+  bool operator()(
+      ObjectDispatchSpec::CompareAndWriteRequest& compare_and_write) const {
+    return object_dispatch->compare_and_write(
+      compare_and_write.object_no, compare_and_write.object_off,
+      std::move(compare_and_write.cmp_data), std::move(compare_and_write.data),
+      object_dispatch_spec->io_context, object_dispatch_spec->op_flags,
+      object_dispatch_spec->parent_trace, compare_and_write.mismatch_offset,
+      &object_dispatch_spec->object_dispatch_flags,
+      &compare_and_write.journal_tid,
+      &object_dispatch_spec->dispatch_result,
+      &object_dispatch_spec->dispatcher_ctx.on_finish,
+      &object_dispatch_spec->dispatcher_ctx);
+  }
+
+  bool operator()(ObjectDispatchSpec::FlushRequest& flush) const {
+    return object_dispatch->flush(
+      flush.flush_source, object_dispatch_spec->parent_trace,
+      &flush.journal_tid,
+      &object_dispatch_spec->dispatch_result,
+      &object_dispatch_spec->dispatcher_ctx.on_finish,
+      &object_dispatch_spec->dispatcher_ctx);
+  }
+
+  bool operator()(ObjectDispatchSpec::ListSnapsRequest& list_snaps) const {
+    return object_dispatch->list_snaps(
+      list_snaps.object_no, std::move(list_snaps.extents),
+      std::move(list_snaps.snap_ids), list_snaps.list_snaps_flags,
+      object_dispatch_spec->parent_trace, list_snaps.snapshot_delta,
+      &object_dispatch_spec->object_dispatch_flags,
+      &object_dispatch_spec->dispatch_result,
+      &object_dispatch_spec->dispatcher_ctx.on_finish,
+      &object_dispatch_spec->dispatcher_ctx);
+  }
+};
+
+// Construct the dispatcher and install the core ObjectDispatch layer, which
+// issues the actual backing-store operations.
+template <typename I>
+ObjectDispatcher<I>::ObjectDispatcher(I* image_ctx)
+  : Dispatcher<I, ObjectDispatcherInterface>(image_ctx) {
+  // configure the core object dispatch handler on startup
+  auto object_dispatch = new ObjectDispatch(image_ctx);
+  this->register_dispatch(object_dispatch);
+}
+
+// Ask every dispatch layer to invalidate its cache.  The iteration is
+// kicked off by completing a C_InvalidateCache layer iterator; on_finish is
+// wrapped in an async callback so it fires from the image's context thread.
+template <typename I>
+void ObjectDispatcher<I>::invalidate_cache(Context* on_finish) {
+  auto image_ctx = this->m_image_ctx;
+  auto cct = image_ctx->cct;
+  ldout(cct, 5) << dendl;
+
+  on_finish = util::create_async_context_callback(*image_ctx, on_finish);
+  auto ctx = new C_InvalidateCache(
+    this, OBJECT_DISPATCH_LAYER_NONE, on_finish);
+  ctx->complete(0);
+}
+
+// Ask every dispatch layer to reset its object existence cache; mirrors
+// invalidate_cache but drives the C_ResetExistenceCache iterator instead.
+template <typename I>
+void ObjectDispatcher<I>::reset_existence_cache(Context* on_finish) {
+  auto image_ctx = this->m_image_ctx;
+  auto cct = image_ctx->cct;
+  ldout(cct, 5) << dendl;
+
+  on_finish = util::create_async_context_callback(*image_ctx, on_finish);
+  auto ctx = new C_ResetExistenceCache(this, on_finish);
+  ctx->complete(0);
+}
+
+// Notify every registered dispatch layer (synchronously, under a shared
+// lock) that the given object extent was overwritten with a new journal
+// transaction id.
+template <typename I>
+void ObjectDispatcher<I>::extent_overwritten(
+    uint64_t object_no, uint64_t object_off, uint64_t object_len,
+    uint64_t journal_tid, uint64_t new_journal_tid) {
+  auto cct = this->m_image_ctx->cct;
+  ldout(cct, 20) << object_no << " " << object_off << "~" << object_len
+                 << dendl;
+
+  std::shared_lock locker{this->m_lock};
+  // iterate by reference: copying the map's value_type per iteration was
+  // needless overhead (clang-tidy performance-for-range-copy)
+  for (auto& it : this->m_dispatches) {
+    auto& object_dispatch_meta = it.second;
+    auto object_dispatch = object_dispatch_meta.dispatch;
+    object_dispatch->extent_overwritten(object_no, object_off, object_len,
+                                        journal_tid, new_journal_tid);
+  }
+}
+
+// Invoke prepare_copyup on every registered dispatch layer, stopping at the
+// first layer that reports an error.  Returns 0 on success or the first
+// negative error code.
+template <typename I>
+int ObjectDispatcher<I>::prepare_copyup(
+    uint64_t object_no,
+    SnapshotSparseBufferlist* snapshot_sparse_bufferlist) {
+  auto cct = this->m_image_ctx->cct;
+  ldout(cct, 20) << "object_no=" << object_no << dendl;
+
+  std::shared_lock locker{this->m_lock};
+  // iterate by reference: copying the map's value_type per iteration was
+  // needless overhead (clang-tidy performance-for-range-copy)
+  for (auto& it : this->m_dispatches) {
+    auto& object_dispatch_meta = it.second;
+    auto object_dispatch = object_dispatch_meta.dispatch;
+    auto r = object_dispatch->prepare_copyup(
+      object_no, snapshot_sparse_bufferlist);
+    if (r < 0) {
+      return r;
+    }
+  }
+
+  return 0;
+}
+
+// Per-layer dispatch entry point invoked by the base Dispatcher: applies
+// SendVisitor to the request variant, invoking the matching interface
+// method on the target layer.  Returns the layer's handled/queued result.
+template <typename I>
+bool ObjectDispatcher<I>::send_dispatch(
+    ObjectDispatchInterface* object_dispatch,
+    ObjectDispatchSpec* object_dispatch_spec) {
+  return boost::apply_visitor(
+    SendVisitor{object_dispatch, object_dispatch_spec},
+    object_dispatch_spec->request);
+}
+
+} // namespace io
+} // namespace librbd
+
+template class librbd::io::ObjectDispatcher<librbd::ImageCtx>;
diff --git a/src/librbd/io/ObjectDispatcher.h b/src/librbd/io/ObjectDispatcher.h
new file mode 100644
index 000000000..1e5e78d8b
--- /dev/null
+++ b/src/librbd/io/ObjectDispatcher.h
@@ -0,0 +1,60 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IO_OBJECT_DISPATCHER_H
+#define CEPH_LIBRBD_IO_OBJECT_DISPATCHER_H
+
+#include "include/int_types.h"
+#include "common/ceph_mutex.h"
+#include "librbd/io/Dispatcher.h"
+#include "librbd/io/ObjectDispatchInterface.h"
+#include "librbd/io/ObjectDispatchSpec.h"
+#include "librbd/io/ObjectDispatcherInterface.h"
+#include "librbd/io/Types.h"
+#include <map>
+
+struct Context;
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace io {
+
+// Routes object-level IO requests (ObjectDispatchSpec) through the stack of
+// registered ObjectDispatchInterface layers; the generic plumbing lives in
+// the Dispatcher base class.
+template <typename ImageCtxT = ImageCtx>
+class ObjectDispatcher
+  : public Dispatcher<ImageCtxT, ObjectDispatcherInterface> {
+public:
+  ObjectDispatcher(ImageCtxT* image_ctx);
+
+  // broadcast cache maintenance operations to all layers
+  void invalidate_cache(Context* on_finish) override;
+  void reset_existence_cache(Context* on_finish) override;
+
+  // notify layers of an extent overwrite (journal tid bookkeeping)
+  void extent_overwritten(
+    uint64_t object_no, uint64_t object_off, uint64_t object_len,
+    uint64_t journal_tid, uint64_t new_journal_tid) override;
+
+  // give each layer a chance to adjust pending copyup data; returns the
+  // first error reported by a layer
+  int prepare_copyup(
+    uint64_t object_no,
+    SnapshotSparseBufferlist* snapshot_sparse_bufferlist) override;
+
+  using typename Dispatcher<ImageCtxT, ObjectDispatcherInterface>::C_LayerIterator;
+
+  using typename Dispatcher<ImageCtxT, ObjectDispatcherInterface>::C_InvalidateCache;
+
+protected:
+  // invoked by the base Dispatcher for each layer a request visits
+  bool send_dispatch(ObjectDispatchInterface* object_dispatch,
+                     ObjectDispatchSpec* object_dispatch_spec) override;
+
+private:
+  struct C_ResetExistenceCache;
+  struct SendVisitor;
+
+};
+
+} // namespace io
+} // namespace librbd
+
+extern template class librbd::io::ObjectDispatcher<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_IO_OBJECT_DISPATCHER_H
diff --git a/src/librbd/io/ObjectDispatcherInterface.h b/src/librbd/io/ObjectDispatcherInterface.h
new file mode 100644
index 000000000..0f3d33330
--- /dev/null
+++ b/src/librbd/io/ObjectDispatcherInterface.h
@@ -0,0 +1,35 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IO_OBJECT_DISPATCHER_INTERFACE_H
+#define CEPH_LIBRBD_IO_OBJECT_DISPATCHER_INTERFACE_H
+
+#include "include/int_types.h"
+#include "librbd/io/DispatcherInterface.h"
+#include "librbd/io/ObjectDispatchInterface.h"
+
+struct Context;
+
+namespace librbd {
+namespace io {
+
+// Pure interface for the object dispatcher: the operations a dispatcher
+// exposes on top of the per-layer DispatcherInterface plumbing.
+struct ObjectDispatcherInterface
+  : public DispatcherInterface<ObjectDispatchInterface> {
+public:
+  // broadcast cache maintenance to all registered layers
+  virtual void invalidate_cache(Context* on_finish) = 0;
+  virtual void reset_existence_cache(Context* on_finish) = 0;
+
+  // notify layers that an object extent was overwritten
+  virtual void extent_overwritten(
+    uint64_t object_no, uint64_t object_off, uint64_t object_len,
+    uint64_t journal_tid, uint64_t new_journal_tid) = 0;
+
+  // let layers adjust pending copyup data; returns first layer error or 0
+  virtual int prepare_copyup(
+    uint64_t object_no,
+    SnapshotSparseBufferlist* snapshot_sparse_bufferlist) = 0;
+
+};
+
+} // namespace io
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_IO_OBJECT_DISPATCHER_INTERFACE_H
diff --git a/src/librbd/io/ObjectRequest.cc b/src/librbd/io/ObjectRequest.cc
new file mode 100644
index 000000000..6d246cdf3
--- /dev/null
+++ b/src/librbd/io/ObjectRequest.cc
@@ -0,0 +1,1073 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/io/ObjectRequest.h"
+#include "common/ceph_context.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "common/ceph_mutex.h"
+#include "include/Context.h"
+#include "include/err.h"
+#include "include/neorados/RADOS.hpp"
+#include "osd/osd_types.h"
+#include "librados/snap_set_diff.h"
+#include "librbd/AsioEngine.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/Utils.h"
+#include "librbd/asio/Utils.h"
+#include "librbd/io/AioCompletion.h"
+#include "librbd/io/CopyupRequest.h"
+#include "librbd/io/ImageRequest.h"
+#include "librbd/io/Utils.h"
+
+#include <boost/optional.hpp>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::io::ObjectRequest: " << this \
+ << " " << __func__ << ": " \
+ << data_object_name(this->m_ictx, \
+ this->m_object_no) << " "
+
+namespace librbd {
+namespace io {
+
+using librbd::util::data_object_name;
+using librbd::util::create_context_callback;
+using librbd::util::create_trace;
+
+namespace {
+
+// True when a read should trigger copy-on-read: the image is a writable
+// clone with copy-on-read enabled, the read targets HEAD (CEPH_NOSNAP),
+// and this client either owns the exclusive lock or the image has none.
+template <typename I>
+inline bool is_copy_on_read(I *ictx, const IOContext& io_context) {
+  std::shared_lock image_locker{ictx->image_lock};
+  return (ictx->clone_copy_on_read && !ictx->read_only &&
+          io_context->read_snap().value_or(CEPH_NOSNAP) == CEPH_NOSNAP &&
+          (ictx->exclusive_lock == nullptr ||
+           ictx->exclusive_lock->is_lock_owner()));
+}
+
+// Field-wise translation of a snap set between two structurally-compatible
+// representations (source and destination clone entries share cloneid,
+// snaps, overlap and size members).
+template <typename S, typename D>
+void convert_snap_set(const S& src_snap_set,
+                      D* dst_snap_set) {
+  dst_snap_set->seq = src_snap_set.seq;
+  dst_snap_set->clones.reserve(src_snap_set.clones.size());
+  for (const auto& src : src_snap_set.clones) {
+    // C++17 emplace_back returns a reference to the new element
+    auto& dst = dst_snap_set->clones.emplace_back();
+    dst.cloneid = src.cloneid;
+    dst.snaps = src.snaps;
+    dst.overlap = src.overlap;
+    dst.size = src.size;
+  }
+}
+
+} // anonymous namespace
+
+// Factory: allocate an ObjectWriteRequest wrapping the supplied payload.
+// The returned request is owned by the caller until send() completes it.
+template <typename I>
+ObjectRequest<I>*
+ObjectRequest<I>::create_write(
+    I *ictx, uint64_t object_no, uint64_t object_off, ceph::bufferlist&& data,
+    IOContext io_context, int op_flags, int write_flags,
+    std::optional<uint64_t> assert_version,
+    const ZTracer::Trace &parent_trace, Context *completion) {
+  auto* req = new ObjectWriteRequest<I>(
+    ictx, object_no, object_off, std::move(data), io_context, op_flags,
+    write_flags, assert_version, parent_trace, completion);
+  return req;
+}
+
+// Factory: allocate an ObjectDiscardRequest for the given object extent.
+template <typename I>
+ObjectRequest<I>*
+ObjectRequest<I>::create_discard(
+    I *ictx, uint64_t object_no, uint64_t object_off, uint64_t object_len,
+    IOContext io_context, int discard_flags,
+    const ZTracer::Trace &parent_trace, Context *completion) {
+  auto* req = new ObjectDiscardRequest<I>(
+    ictx, object_no, object_off, object_len, io_context, discard_flags,
+    parent_trace, completion);
+  return req;
+}
+
+// Factory: allocate an ObjectWriteSameRequest replicating `data` across the
+// target extent.
+template <typename I>
+ObjectRequest<I>*
+ObjectRequest<I>::create_write_same(
+    I *ictx, uint64_t object_no, uint64_t object_off, uint64_t object_len,
+    ceph::bufferlist&& data, IOContext io_context, int op_flags,
+    const ZTracer::Trace &parent_trace, Context *completion) {
+  auto* req = new ObjectWriteSameRequest<I>(
+    ictx, object_no, object_off, object_len, std::move(data), io_context,
+    op_flags, parent_trace, completion);
+  return req;
+}
+
+// Factory: allocate an ObjectCompareAndWriteRequest; on comparison failure
+// the request reports the failing offset through *mismatch_offset.
+template <typename I>
+ObjectRequest<I>*
+ObjectRequest<I>::create_compare_and_write(
+    I *ictx, uint64_t object_no, uint64_t object_off,
+    ceph::bufferlist&& cmp_data, ceph::bufferlist&& write_data,
+    IOContext io_context, uint64_t *mismatch_offset, int op_flags,
+    const ZTracer::Trace &parent_trace, Context *completion) {
+  auto* req = new ObjectCompareAndWriteRequest<I>(
+    ictx, object_no, object_off, std::move(cmp_data), std::move(write_data),
+    io_context, mismatch_offset, op_flags, parent_trace, completion);
+  return req;
+}
+
+// Base constructor: records the target object and IO context and, when
+// tracing is enabled, names the trace span after the operation and the
+// backing data object.
+template <typename I>
+ObjectRequest<I>::ObjectRequest(
+    I *ictx, uint64_t objectno, IOContext io_context,
+    const char *trace_name, const ZTracer::Trace &trace, Context *completion)
+  : m_ictx(ictx), m_object_no(objectno), m_io_context(io_context),
+    m_completion(completion),
+    m_trace(create_trace(*ictx, "", trace)) {
+  // object requests require an open data pool connection
+  ceph_assert(m_ictx->data_ctx.is_valid());
+  if (m_trace.valid()) {
+    m_trace.copy_name(trace_name + std::string(" ") +
+                      data_object_name(ictx, objectno));
+    m_trace.event("start");
+  }
+}
+
+// Attach an allocation hint to the write op: full object-size hints when
+// hinting is enabled, otherwise only forward any explicit hint flags.
+template <typename I>
+void ObjectRequest<I>::add_write_hint(I& image_ctx, neorados::WriteOp* wr) {
+  auto alloc_hint_flags = static_cast<neorados::alloc_hint::alloc_hint_t>(
+    image_ctx.alloc_hint_flags);
+  if (!image_ctx.enable_alloc_hint) {
+    if (image_ctx.alloc_hint_flags != 0U) {
+      // no size hint, but still propagate the raw hint flags
+      wr->set_alloc_hint(0, 0, alloc_hint_flags);
+    }
+    return;
+  }
+  wr->set_alloc_hint(image_ctx.get_object_size(),
+                     image_ctx.get_object_size(),
+                     alloc_hint_flags);
+}
+
+// Compute the portion of this object that overlaps the parent image.
+// Populates *parent_extents/*area and sets m_has_parent; returns true iff
+// a non-empty overlap exists.  Caller must hold image_lock.
+template <typename I>
+bool ObjectRequest<I>::compute_parent_extents(Extents *parent_extents,
+                                              ImageArea *area,
+                                              bool read_request) {
+  ceph_assert(ceph_mutex_is_locked(m_ictx->image_lock));
+
+  m_has_parent = false;
+  parent_extents->clear();
+  *area = ImageArea::DATA;
+
+  uint64_t raw_overlap;
+  int r = m_ictx->get_parent_overlap(
+    m_io_context->read_snap().value_or(CEPH_NOSNAP), &raw_overlap);
+  if (r < 0) {
+    // NOTE: it's possible for a snapshot to be deleted while we are
+    // still reading from it
+    lderr(m_ictx->cct) << "failed to retrieve parent overlap: "
+                       << cpp_strerror(r) << dendl;
+    return false;
+  }
+  // writes during a live migration use the migration source's overlap
+  bool migration_write = !read_request && !m_ictx->migration_info.empty();
+  if (migration_write) {
+    raw_overlap = m_ictx->migration_info.overlap;
+  }
+  if (raw_overlap == 0) {
+    // image does not overlap the parent at all
+    return false;
+  }
+
+  // map the full object back to image-area extents, then clip to overlap
+  std::tie(*parent_extents, *area) = io::util::object_to_area_extents(
+    m_ictx, m_object_no, {{0, m_ictx->layout.object_size}});
+  uint64_t object_overlap = m_ictx->prune_parent_extents(
+    *parent_extents, *area, raw_overlap, migration_write);
+  if (object_overlap > 0) {
+    m_has_parent = true;
+    return true;
+  }
+  return false;
+}
+
+// Complete the request asynchronously by posting finish(r) to the image's
+// ASIO engine rather than invoking the callback inline.
+template <typename I>
+void ObjectRequest<I>::async_finish(int r) {
+  ldout(m_ictx->cct, 20) << "r=" << r << dendl;
+  m_ictx->asio_engine->post([this, r]() { finish(r); });
+}
+
+// Fire the user completion and self-destruct; the request must not be
+// touched after this returns.
+template <typename I>
+void ObjectRequest<I>::finish(int r) {
+  ldout(m_ictx->cct, 20) << "r=" << r << dendl;
+  m_completion->complete(r);
+  delete this;
+}
+
+/** read **/
+
+// Read request: records the caller-owned extent list (results are written
+// into it), read flags, and an optional out-pointer for the object version.
+template <typename I>
+ObjectReadRequest<I>::ObjectReadRequest(
+    I *ictx, uint64_t objectno, ReadExtents* extents,
+    IOContext io_context, int op_flags, int read_flags,
+    const ZTracer::Trace &parent_trace, uint64_t* version,
+    Context *completion)
+  : ObjectRequest<I>(ictx, objectno, io_context, "read", parent_trace,
+                     completion),
+    m_extents(extents), m_op_flags(op_flags),m_read_flags(read_flags),
+    m_version(version) {
+}
+
+// Entry point: kicks off the object read state machine.
+template <typename I>
+void ObjectReadRequest<I>::send() {
+  I *image_ctx = this->m_ictx;
+  ldout(image_ctx->cct, 20) << dendl;
+
+  read_object();
+}
+
+// Issue the read against the backing object.  If the object map says the
+// object cannot exist at the read snapshot, skip straight to reading from
+// the parent (posted async to avoid recursing while holding the lock).
+template <typename I>
+void ObjectReadRequest<I>::read_object() {
+  I *image_ctx = this->m_ictx;
+
+  std::shared_lock image_locker{image_ctx->image_lock};
+  auto read_snap_id = this->m_io_context->read_snap().value_or(CEPH_NOSNAP);
+  if (read_snap_id == image_ctx->snap_id &&
+      image_ctx->object_map != nullptr &&
+      !image_ctx->object_map->object_may_exist(this->m_object_no)) {
+    // object map fast-path: object absent, fall back to the parent
+    image_ctx->asio_engine->post([this]() { read_parent(); });
+    return;
+  }
+  image_locker.unlock();
+
+  ldout(image_ctx->cct, 20) << "snap_id=" << read_snap_id << dendl;
+
+  neorados::ReadOp read_op;
+  for (auto& extent: *this->m_extents) {
+    // use sparse reads for large extents to avoid transferring zeroed holes
+    if (extent.length >= image_ctx->sparse_read_threshold_bytes) {
+      read_op.sparse_read(extent.offset, extent.length, &extent.bl,
+                          &extent.extent_map);
+    } else {
+      read_op.read(extent.offset, extent.length, &extent.bl);
+    }
+  }
+  util::apply_op_flags(
+    m_op_flags, image_ctx->get_read_flags(read_snap_id), &read_op);
+
+  image_ctx->rados_api.execute(
+    {data_object_name(this->m_ictx, this->m_object_no)},
+    *this->m_io_context, std::move(read_op), nullptr,
+    librbd::asio::util::get_callback_adapter(
+      [this](int r) { handle_read_object(r); }), m_version,
+    (this->m_trace.valid() ? this->m_trace.get_info() : nullptr));
+}
+
+// Read callback: a missing object (-ENOENT) falls back to the parent
+// image; any other error fails the request; success completes it.
+template <typename I>
+void ObjectReadRequest<I>::handle_read_object(int r) {
+  I *image_ctx = this->m_ictx;
+  ldout(image_ctx->cct, 20) << "r=" << r << dendl;
+  if (m_version != nullptr) {
+    ldout(image_ctx->cct, 20) << "version=" << *m_version << dendl;
+  }
+
+  if (r == -ENOENT) {
+    // object doesn't exist -- attempt to read from the parent
+    read_parent();
+    return;
+  } else if (r < 0) {
+    lderr(image_ctx->cct) << "failed to read from object: "
+                          << cpp_strerror(r) << dendl;
+    this->finish(r);
+    return;
+  }
+
+  this->finish(0);
+}
+
+// Read the requested extents from the parent image, unless the caller
+// explicitly disabled parent fall-back via READ_FLAG_DISABLE_READ_FROM_PARENT.
+template <typename I>
+void ObjectReadRequest<I>::read_parent() {
+  if ((m_read_flags & READ_FLAG_DISABLE_READ_FROM_PARENT) != 0) {
+    // caller opted out of parent reads: report the object as missing
+    this->finish(-ENOENT);
+    return;
+  }
+
+  I *image_ctx = this->m_ictx;
+  ldout(image_ctx->cct, 20) << dendl;
+
+  auto ctx = create_context_callback<
+    ObjectReadRequest<I>, &ObjectReadRequest<I>::handle_read_parent>(this);
+
+  io::util::read_parent<I>(
+    image_ctx, this->m_object_no, this->m_extents,
+    this->m_io_context->read_snap().value_or(CEPH_NOSNAP), this->m_trace,
+    ctx);
+}
+
+// Parent-read callback: propagate errors (including -ENOENT) to the
+// completion; on success, consider triggering a copy-on-read.
+template <typename I>
+void ObjectReadRequest<I>::handle_read_parent(int r) {
+  I *image_ctx = this->m_ictx;
+  ldout(image_ctx->cct, 20) << "r=" << r << dendl;
+
+  if (r == -ENOENT) {
+    this->finish(r);
+    return;
+  } else if (r < 0) {
+    lderr(image_ctx->cct) << "failed to read parent extents: "
+                          << cpp_strerror(r) << dendl;
+    this->finish(r);
+    return;
+  }
+
+  copyup();
+}
+
+// After a successful parent read, optionally kick off an asynchronous
+// copy-on-read: if no CopyupRequest is already in flight for this object,
+// create and send one.  The read itself always completes with success here.
+// Lock order: owner_lock -> image_lock -> copyup_list_lock (manual
+// lock/unlock because the unlock points differ per branch).
+template <typename I>
+void ObjectReadRequest<I>::copyup() {
+  I *image_ctx = this->m_ictx;
+  if (!is_copy_on_read(image_ctx, this->m_io_context)) {
+    this->finish(0);
+    return;
+  }
+
+  image_ctx->owner_lock.lock_shared();
+  image_ctx->image_lock.lock_shared();
+  Extents parent_extents;
+  ImageArea area;
+  // skip copyup if there is no parent overlap or we lost the exclusive lock
+  if (!this->compute_parent_extents(&parent_extents, &area, true) ||
+      (image_ctx->exclusive_lock != nullptr &&
+       !image_ctx->exclusive_lock->is_lock_owner())) {
+    image_ctx->image_lock.unlock_shared();
+    image_ctx->owner_lock.unlock_shared();
+    this->finish(0);
+    return;
+  }
+
+  ldout(image_ctx->cct, 20) << dendl;
+
+  image_ctx->copyup_list_lock.lock();
+  auto it = image_ctx->copyup_list.find(this->m_object_no);
+  if (it == image_ctx->copyup_list.end()) {
+    // create and kick off a CopyupRequest
+    auto new_req = CopyupRequest<I>::create(
+      image_ctx, this->m_object_no, std::move(parent_extents), area,
+      this->m_trace);
+
+    image_ctx->copyup_list[this->m_object_no] = new_req;
+    image_ctx->copyup_list_lock.unlock();
+    image_ctx->image_lock.unlock_shared();
+    new_req->send();
+  } else {
+    // a copyup for this object is already in flight -- nothing to do
+    image_ctx->copyup_list_lock.unlock();
+    image_ctx->image_lock.unlock_shared();
+  }
+
+  image_ctx->owner_lock.unlock_shared();
+  this->finish(0);
+}
+
+/** write **/
+
+// Base constructor for all mutating object requests: detects full-object
+// writes, precomputes parent overlap state, and flags writes that must be
+// guarded while a migration is in progress.
+template <typename I>
+AbstractObjectWriteRequest<I>::AbstractObjectWriteRequest(
+    I *ictx, uint64_t object_no, uint64_t object_off, uint64_t len,
+    IOContext io_context, const char *trace_name,
+    const ZTracer::Trace &parent_trace, Context *completion)
+  : ObjectRequest<I>(ictx, object_no, io_context, trace_name, parent_trace,
+                     completion),
+    m_object_off(object_off), m_object_len(len)
+{
+  // a write covering the whole object may not need a copyup
+  if (this->m_object_off == 0 &&
+      this->m_object_len == ictx->get_object_size()) {
+    m_full_object = true;
+  }
+
+  compute_parent_info();
+
+  ictx->image_lock.lock_shared();
+  if (!ictx->migration_info.empty()) {
+    m_guarding_migration_write = true;
+  }
+  ictx->image_lock.unlock_shared();
+}
+
+// Cache the parent-overlap extents for this object and decide whether a
+// copyup is needed: it is skipped when there is no parent, or when a
+// full-object write with no snapshot context would fully overwrite the
+// object anyway (unless a post-copyup write is still required).
+template <typename I>
+void AbstractObjectWriteRequest<I>::compute_parent_info() {
+  I *image_ctx = this->m_ictx;
+  std::shared_lock image_locker{image_ctx->image_lock};
+
+  this->compute_parent_extents(&m_parent_extents, &m_image_area, false);
+
+  if (!this->has_parent() ||
+      (m_full_object &&
+       !this->m_io_context->write_snap_context() &&
+       !is_post_copyup_write_required())) {
+    m_copyup_enabled = false;
+  }
+}
+
+/**
+ * Attach an allocation hint to the write op via the base-class helper.
+ * Skipped when the object map indicates the object already exists and no
+ * explicit alloc-hint flags are configured, since the hint only matters
+ * for newly allocated objects.
+ */
+template <typename I>
+void AbstractObjectWriteRequest<I>::add_write_hint(
+    neorados::WriteOp *wr) {
+  I *image_ctx = this->m_ictx;
+  std::shared_lock image_locker{image_ctx->image_lock};
+  if (image_ctx->object_map == nullptr || !this->m_object_may_exist ||
+      image_ctx->alloc_hint_flags != 0U) {
+    ObjectRequest<I>::add_write_hint(*image_ctx, wr);
+  }
+}
+
+/**
+ * Entry point for all write-type object requests.
+ *
+ * Consults the object map (when enabled) to learn whether the object may
+ * already exist; requests that would be no-ops against a nonexistent
+ * object complete immediately, otherwise the pre-write object map update
+ * state machine is started.
+ */
+template <typename I>
+void AbstractObjectWriteRequest<I>::send() {
+  I *image_ctx = this->m_ictx;
+  ldout(image_ctx->cct, 20) << this->get_op_type() << " "
+                            << this->m_object_off << "~" << this->m_object_len
+                            << dendl;
+  {
+    std::shared_lock image_lock{image_ctx->image_lock};
+    if (image_ctx->object_map == nullptr) {
+      // no object map -- must assume the object exists
+      m_object_may_exist = true;
+    } else {
+      // should have been flushed prior to releasing lock
+      ceph_assert(image_ctx->exclusive_lock->is_lock_owner());
+      m_object_may_exist = image_ctx->object_map->object_may_exist(
+        this->m_object_no);
+    }
+  }
+
+  // e.g. discarding an object that does not exist (and has no parent)
+  if (!m_object_may_exist && is_no_op_for_nonexistent_object()) {
+    ldout(image_ctx->cct, 20) << "skipping no-op on nonexistent object"
+                              << dendl;
+    this->async_finish(0);
+    return;
+  }
+
+  pre_write_object_map_update();
+}
+
+/**
+ * Flag the object's pre-write state in the object map (e.g. EXISTS, or
+ * PENDING for removals) before issuing the write.
+ *
+ * Note: the image_lock is managed manually here because it must remain
+ * held across the aio_update() call but be released before dispatching
+ * write_object()/copyup().
+ */
+template <typename I>
+void AbstractObjectWriteRequest<I>::pre_write_object_map_update() {
+  I *image_ctx = this->m_ictx;
+
+  image_ctx->image_lock.lock_shared();
+  if (image_ctx->object_map == nullptr || !is_object_map_update_enabled()) {
+    image_ctx->image_lock.unlock_shared();
+    write_object();
+    return;
+  }
+
+  if (!m_object_may_exist && m_copyup_enabled) {
+    // optimization: copyup required
+    image_ctx->image_lock.unlock_shared();
+    copyup();
+    return;
+  }
+
+  uint8_t new_state = this->get_pre_write_object_map_state();
+  ldout(image_ctx->cct, 20) << this->m_object_off << "~" << this->m_object_len
+                            << dendl;
+
+  // a true return means an update was queued and the completion handler
+  // continues the request; false means no state change was needed
+  if (image_ctx->object_map->template aio_update<
+        AbstractObjectWriteRequest<I>,
+        &AbstractObjectWriteRequest<I>::handle_pre_write_object_map_update>(
+          CEPH_NOSNAP, this->m_object_no, new_state, {}, this->m_trace, false,
+          this)) {
+    image_ctx->image_lock.unlock_shared();
+    return;
+  }
+
+  image_ctx->image_lock.unlock_shared();
+  write_object();
+}
+
+// Completion handler for the pre-write object map update: abort the
+// request on failure, otherwise proceed to issue the object write.
+template <typename I>
+void AbstractObjectWriteRequest<I>::handle_pre_write_object_map_update(int r) {
+  auto ictx = this->m_ictx;
+  ldout(ictx->cct, 20) << "r=" << r << dendl;
+
+  if (r >= 0) {
+    write_object();
+    return;
+  }
+
+  lderr(ictx->cct) << "failed to update object map: "
+                   << cpp_strerror(r) << dendl;
+  this->finish(r);
+}
+
+/**
+ * Build and dispatch the rados write op.
+ *
+ * While copyup remains a possibility the op is guarded: assert_snapc_seq
+ * during migration (fails with -ERANGE until the object is copied up) or
+ * assert_exists otherwise (fails with -ENOENT); handle_write_object()
+ * redirects those failures to copyup().
+ */
+template <typename I>
+void AbstractObjectWriteRequest<I>::write_object() {
+  I *image_ctx = this->m_ictx;
+  ldout(image_ctx->cct, 20) << dendl;
+
+  neorados::WriteOp write_op;
+  if (m_copyup_enabled) {
+    if (m_guarding_migration_write) {
+      auto snap_seq = (this->m_io_context->write_snap_context() ?
+          this->m_io_context->write_snap_context()->first : 0);
+      ldout(image_ctx->cct, 20) << "guarding write: snap_seq=" << snap_seq
+                                << dendl;
+
+      cls_client::assert_snapc_seq(
+        &write_op, snap_seq, cls::rbd::ASSERT_SNAPC_SEQ_LE_SNAPSET_SEQ);
+    } else {
+      ldout(image_ctx->cct, 20) << "guarding write" << dendl;
+      write_op.assert_exists();
+    }
+  }
+
+  add_write_hint(&write_op);
+  add_write_ops(&write_op);
+  ceph_assert(write_op.size() != 0);
+
+  image_ctx->rados_api.execute(
+    {data_object_name(this->m_ictx, this->m_object_no)},
+    *this->m_io_context, std::move(write_op),
+    librbd::asio::util::get_callback_adapter(
+      [this](int r) { handle_write_object(r); }), nullptr,
+    (this->m_trace.valid() ? this->m_trace.get_info() : nullptr));
+}
+
+/**
+ * Handle completion of the (possibly guarded) write:
+ *   -ENOENT -> object absent: perform copyup if enabled, otherwise fall
+ *              through to the post-write object map update
+ *   -ERANGE -> migration guard tripped: copyup, or recompute parent info
+ *              and restart the write if the migration has since finished
+ *   -EILSEQ -> compare-and-write mismatch: propagate to the caller
+ *   other r < 0 -> fail; success -> post-write object map update
+ */
+template <typename I>
+void AbstractObjectWriteRequest<I>::handle_write_object(int r) {
+  I *image_ctx = this->m_ictx;
+  ldout(image_ctx->cct, 20) << "r=" << r << dendl;
+
+  r = filter_write_result(r);
+  if (r == -ENOENT) {
+    if (m_copyup_enabled) {
+      copyup();
+      return;
+    }
+  } else if (r == -ERANGE && m_guarding_migration_write) {
+    // re-check whether the migration is still in progress
+    image_ctx->image_lock.lock_shared();
+    m_guarding_migration_write = !image_ctx->migration_info.empty();
+    image_ctx->image_lock.unlock_shared();
+
+    if (m_guarding_migration_write) {
+      copyup();
+    } else {
+      ldout(image_ctx->cct, 10) << "migration parent gone, restart io" << dendl;
+      compute_parent_info();
+      write_object();
+    }
+    return;
+  } else if (r == -EILSEQ) {
+    ldout(image_ctx->cct, 10) << "failed to write object" << dendl;
+    this->finish(r);
+    return;
+  } else if (r < 0) {
+    lderr(image_ctx->cct) << "failed to write object: " << cpp_strerror(r)
+                          << dendl;
+    this->finish(r);
+    return;
+  }
+
+  post_write_object_map_update();
+}
+
+/**
+ * Kick off (or join) a copyup for this object.
+ *
+ * If a CopyupRequest for this object number is already pending, this
+ * write is appended to it; otherwise a new CopyupRequest is created,
+ * registered in the copyup list and dispatched.  handle_copyup() fires
+ * once the copyup completes.
+ *
+ * Fix: get_copyup_overwrite_extents() returns by value, so wrapping the
+ * call in std::move() was a pessimizing move on a prvalue (compilers warn
+ * via -Wpessimizing-move); the redundant moves were dropped.
+ */
+template <typename I>
+void AbstractObjectWriteRequest<I>::copyup() {
+  I *image_ctx = this->m_ictx;
+  ldout(image_ctx->cct, 20) << dendl;
+
+  ceph_assert(!m_copyup_in_progress);
+  m_copyup_in_progress = true;
+
+  image_ctx->copyup_list_lock.lock();
+  auto it = image_ctx->copyup_list.find(this->m_object_no);
+  if (it == image_ctx->copyup_list.end()) {
+    auto new_req = CopyupRequest<I>::create(
+      image_ctx, this->m_object_no, std::move(this->m_parent_extents),
+      m_image_area, this->m_trace);
+    // defensive: leave the moved-from extents in a well-defined state
+    this->m_parent_extents.clear();
+
+    // make sure to wait on this CopyupRequest
+    new_req->append_request(this, get_copyup_overwrite_extents());
+    image_ctx->copyup_list[this->m_object_no] = new_req;
+
+    image_ctx->copyup_list_lock.unlock();
+    new_req->send();
+  } else {
+    it->second->append_request(this, get_copyup_overwrite_extents());
+    image_ctx->copyup_list_lock.unlock();
+  }
+}
+
+// Completion handler for copyup: retry the write on -ERESTART or when a
+// post-copyup write is required; fail on any other error; otherwise move
+// straight to the post-write object map update.
+template <typename I>
+void AbstractObjectWriteRequest<I>::handle_copyup(int r) {
+  auto ictx = this->m_ictx;
+  ldout(ictx->cct, 20) << "r=" << r << dendl;
+
+  ceph_assert(m_copyup_in_progress);
+  m_copyup_in_progress = false;
+
+  const bool restart = (r == -ERESTART);
+  if (r < 0 && !restart) {
+    lderr(ictx->cct) << "failed to copyup object: " << cpp_strerror(r)
+                     << dendl;
+    this->finish(r);
+    return;
+  }
+
+  if (restart || is_post_copyup_write_required()) {
+    write_object();
+  } else {
+    post_write_object_map_update();
+  }
+}
+
+/**
+ * After a successful remove-type op, transition the object map entry from
+ * OBJECT_PENDING to OBJECT_NONEXISTENT; all other ops complete directly.
+ */
+template <typename I>
+void AbstractObjectWriteRequest<I>::post_write_object_map_update() {
+  I *image_ctx = this->m_ictx;
+
+  image_ctx->image_lock.lock_shared();
+  if (image_ctx->object_map == nullptr || !is_object_map_update_enabled() ||
+      !is_non_existent_post_write_object_map_state()) {
+    image_ctx->image_lock.unlock_shared();
+    this->finish(0);
+    return;
+  }
+
+  ldout(image_ctx->cct, 20) << dendl;
+
+  // should have been flushed prior to releasing lock
+  ceph_assert(image_ctx->exclusive_lock->is_lock_owner());
+  // a true return means the update was queued and the completion handler
+  // finishes the request; false means no state change was necessary
+  if (image_ctx->object_map->template aio_update<
+        AbstractObjectWriteRequest<I>,
+        &AbstractObjectWriteRequest<I>::handle_post_write_object_map_update>(
+          CEPH_NOSNAP, this->m_object_no, OBJECT_NONEXISTENT, OBJECT_PENDING,
+          this->m_trace, false, this)) {
+    image_ctx->image_lock.unlock_shared();
+    return;
+  }
+
+  image_ctx->image_lock.unlock_shared();
+  this->finish(0);
+}
+
+// Completion handler for the post-write object map update; propagates a
+// failure, otherwise completes the request successfully.
+template <typename I>
+void AbstractObjectWriteRequest<I>::handle_post_write_object_map_update(int r) {
+  auto ictx = this->m_ictx;
+  ldout(ictx->cct, 20) << "r=" << r << dendl;
+
+  if (r >= 0) {
+    this->finish(0);
+    return;
+  }
+
+  lderr(ictx->cct) << "failed to update object map: "
+                   << cpp_strerror(r) << dendl;
+  this->finish(r);
+}
+
+// Prepend exclusive-create / version-assert guards before delegating to
+// the base-class allocation-hint logic.
+template <typename I>
+void ObjectWriteRequest<I>::add_write_hint(neorados::WriteOp* wr) {
+  const bool create_exclusive =
+    ((m_write_flags & OBJECT_WRITE_FLAG_CREATE_EXCLUSIVE) != 0);
+  if (create_exclusive) {
+    wr->create(true);
+  } else if (m_assert_version.has_value()) {
+    wr->assert_version(*m_assert_version);
+  }
+  AbstractObjectWriteRequest<I>::add_write_hint(wr);
+}
+
+// Queue the write payload: full-object requests use write_full() so any
+// prior object contents are replaced wholesale.
+template <typename I>
+void ObjectWriteRequest<I>::add_write_ops(neorados::WriteOp* wr) {
+  bufferlist payload{m_write_data};
+  if (this->m_full_object) {
+    wr->write_full(std::move(payload));
+  } else {
+    wr->write(this->m_object_off, std::move(payload));
+  }
+  util::apply_op_flags(m_op_flags, 0U, wr);
+}
+
+// Translate the discard action chosen at construction time into the
+// corresponding rados op(s).
+template <typename I>
+void ObjectDiscardRequest<I>::add_write_ops(neorados::WriteOp* wr) {
+  switch (m_discard_action) {
+  case DISCARD_ACTION_REMOVE:
+    wr->remove();
+    break;
+  case DISCARD_ACTION_REMOVE_TRUNCATE:
+    // ensure the (whiteout) object exists, then truncate it
+    wr->create(false);
+    wr->truncate(this->m_object_off);
+    break;
+  case DISCARD_ACTION_TRUNCATE:
+    wr->truncate(this->m_object_off);
+    break;
+  case DISCARD_ACTION_ZERO:
+    wr->zero(this->m_object_off, this->m_object_len);
+    break;
+  default:
+    ceph_abort();
+    break;
+  }
+}
+
+// Queue a writesame op that replicates the data buffer across the extent.
+template <typename I>
+void ObjectWriteSameRequest<I>::add_write_ops(neorados::WriteOp* wr) {
+  bufferlist payload{m_write_data};
+  wr->writesame(this->m_object_off, this->m_object_len, std::move(payload));
+  util::apply_op_flags(m_op_flags, 0U, wr);
+}
+
+// Queue the compare guard followed by the conditional write payload.
+template <typename I>
+void ObjectCompareAndWriteRequest<I>::add_write_ops(neorados::WriteOp* wr) {
+  wr->cmpext(this->m_object_off, bufferlist{m_cmp_bl}, nullptr);
+
+  bufferlist payload{m_write_bl};
+  if (this->m_full_object) {
+    wr->write_full(std::move(payload));
+  } else {
+    wr->write(this->m_object_off, std::move(payload));
+  }
+  util::apply_op_flags(m_op_flags, 0U, wr);
+}
+
+/**
+ * Decode the OSD's cmpext failure encoding.
+ *
+ * On a compare mismatch the OSD encodes the object offset of the first
+ * differing byte as -(MAX_ERRNO + offset); translate that into -EILSEQ
+ * and surface the image-area-relative offset via m_mismatch_offset.
+ */
+template <typename I>
+int ObjectCompareAndWriteRequest<I>::filter_write_result(int r) const {
+  if (r <= -MAX_ERRNO) {
+    I *image_ctx = this->m_ictx;
+
+    // object extent compare mismatch
+    uint64_t offset = -MAX_ERRNO - r;
+    auto [image_extents, _] = io::util::object_to_area_extents(
+      image_ctx, this->m_object_no, {{offset, this->m_object_len}});
+    ceph_assert(image_extents.size() == 1);
+
+    if (m_mismatch_offset) {
+      *m_mismatch_offset = image_extents[0].first;
+    }
+    r = -EILSEQ;
+  }
+  return r;
+}
+
+/**
+ * Construct a snap-list request for a single object.
+ *
+ * A duplicated data IO context is used so the read snapshot can be set to
+ * CEPH_SNAPDIR, making the rados list-snaps op see every clone of the
+ * object without disturbing the image's shared context.
+ */
+template <typename I>
+ObjectListSnapsRequest<I>::ObjectListSnapsRequest(
+    I *ictx, uint64_t objectno, Extents&& object_extents, SnapIds&& snap_ids,
+    int list_snaps_flags, const ZTracer::Trace &parent_trace,
+    SnapshotDelta* snapshot_delta, Context *completion)
+  : ObjectRequest<I>(
+      ictx, objectno, ictx->duplicate_data_io_context(), "snap_list",
+      parent_trace, completion),
+    m_object_extents(std::move(object_extents)),
+    m_snap_ids(std::move(snap_ids)), m_list_snaps_flags(list_snaps_flags),
+    m_snapshot_delta(snapshot_delta) {
+  this->m_io_context->read_snap(CEPH_SNAPDIR);
+}
+
+// Validate the snap id set (requires at least a start and an end id)
+// before issuing the object snap listing.
+template <typename I>
+void ObjectListSnapsRequest<I>::send() {
+  auto ictx = this->m_ictx;
+  ldout(ictx->cct, 20) << dendl;
+
+  if (m_snap_ids.size() >= 2) {
+    list_snaps();
+    return;
+  }
+
+  lderr(ictx->cct) << "invalid snap ids: " << m_snap_ids << dendl;
+  this->async_finish(-EINVAL);
+}
+
+/**
+ * Issue the rados list-snaps read op for this object.  The snap set is
+ * captured into m_snap_set and any op-level error into m_ec; completion
+ * is routed to handle_list_snaps().
+ */
+template <typename I>
+void ObjectListSnapsRequest<I>::list_snaps() {
+  I *image_ctx = this->m_ictx;
+  ldout(image_ctx->cct, 20) << dendl;
+
+  neorados::ReadOp read_op;
+  read_op.list_snaps(&m_snap_set, &m_ec);
+
+  image_ctx->rados_api.execute(
+    {data_object_name(this->m_ictx, this->m_object_no)},
+    *this->m_io_context, std::move(read_op), nullptr,
+    librbd::asio::util::get_callback_adapter(
+      [this](int r) { handle_list_snaps(r); }), nullptr,
+    (this->m_trace.valid() ? this->m_trace.get_info() : nullptr));
+}
+
+/**
+ * Convert the raw rados snap set into a SnapshotDelta.
+ *
+ * For each consecutive pair of requested snapshot ids, compute the
+ * changed (DATA) and truncated/removed (ZEROED) intervals, clipped to the
+ * requested object extents.  Falls back to listing from the parent image
+ * when the object does not exist or produced no deltas.
+ *
+ * Fix: removed the local 'initial_written_extents' interval set, which
+ * was declared but never read or written anywhere in the function.
+ */
+template <typename I>
+void ObjectListSnapsRequest<I>::handle_list_snaps(int r) {
+  I *image_ctx = this->m_ictx;
+  auto cct = image_ctx->cct;
+
+  // the op-level error code (m_ec) supersedes a successful dispatch
+  if (r >= 0) {
+    r = -m_ec.value();
+  }
+
+  ldout(cct, 20) << "r=" << r << dendl;
+
+  m_snapshot_delta->clear();
+  auto& snapshot_delta = *m_snapshot_delta;
+
+  ceph_assert(!m_snap_ids.empty());
+  librados::snap_t start_snap_id = 0;
+  librados::snap_t first_snap_id = *m_snap_ids.begin();
+  librados::snap_t last_snap_id = *m_snap_ids.rbegin();
+
+  if (r == -ENOENT) {
+    // the object does not exist -- mark the missing extents
+    zero_extent(first_snap_id, true);
+    list_from_parent();
+    return;
+  } else if (r < 0) {
+    lderr(cct) << "failed to retrieve object snapshot list: " << cpp_strerror(r)
+               << dendl;
+    this->finish(r);
+    return;
+  }
+
+  // helper function requires the librados legacy data structure
+  librados::snap_set_t snap_set;
+  convert_snap_set(m_snap_set, &snap_set);
+
+  bool initial_extents_written = false;
+
+  interval_set<uint64_t> object_interval;
+  for (auto& object_extent : m_object_extents) {
+    object_interval.insert(object_extent.first, object_extent.second);
+  }
+  ldout(cct, 20) << "object_interval=" << object_interval << dendl;
+
+  // loop through all expected snapshots and build interval sets for
+  // data and zeroed ranges for each snapshot
+  uint64_t prev_end_size = 0;
+  for (auto end_snap_id : m_snap_ids) {
+    if (start_snap_id == end_snap_id) {
+      continue;
+    } else if (end_snap_id > last_snap_id) {
+      break;
+    }
+
+    interval_set<uint64_t> diff;
+    uint64_t end_size;
+    bool exists;
+    librados::snap_t clone_end_snap_id;
+    bool read_whole_object;
+    calc_snap_set_diff(cct, snap_set, start_snap_id,
+                       end_snap_id, &diff, &end_size, &exists,
+                       &clone_end_snap_id, &read_whole_object);
+
+    if (read_whole_object ||
+        (!diff.empty() &&
+         ((m_list_snaps_flags & LIST_SNAPS_FLAG_WHOLE_OBJECT) != 0))) {
+      ldout(cct, 1) << "need to read full object" << dendl;
+      diff.clear();
+      diff.insert(0, image_ctx->layout.object_size);
+      end_size = image_ctx->layout.object_size;
+      clone_end_snap_id = end_snap_id;
+    } else if (!exists) {
+      end_size = 0;
+    }
+
+    if (exists) {
+      // reads should be issued against the newest (existing) snapshot within
+      // the associated snapshot object clone. writes should be issued
+      // against the oldest snapshot in the snap_map.
+      ceph_assert(clone_end_snap_id >= end_snap_id);
+      if (clone_end_snap_id > last_snap_id) {
+        // do not read past the copy point snapshot
+        clone_end_snap_id = last_snap_id;
+      }
+    }
+
+    // clip diff to current object extent
+    interval_set<uint64_t> diff_interval;
+    diff_interval.intersection_of(object_interval, diff);
+
+    // clip diff to size of object (in case it was truncated)
+    interval_set<uint64_t> zero_interval;
+    if (end_size < prev_end_size) {
+      zero_interval.insert(end_size, prev_end_size - end_size);
+      zero_interval.intersection_of(object_interval);
+
+      interval_set<uint64_t> trunc_interval;
+      trunc_interval.intersection_of(zero_interval, diff_interval);
+      if (!trunc_interval.empty()) {
+        diff_interval.subtract(trunc_interval);
+        ldout(cct, 20) << "clearing truncate diff: " << trunc_interval << dendl;
+      }
+    }
+
+    ldout(cct, 20) << "start_snap_id=" << start_snap_id << ", "
+                   << "end_snap_id=" << end_snap_id << ", "
+                   << "clone_end_snap_id=" << clone_end_snap_id << ", "
+                   << "diff=" << diff << ", "
+                   << "diff_interval=" << diff_interval<< ", "
+                   << "zero_interval=" << zero_interval<< ", "
+                   << "end_size=" << end_size << ", "
+                   << "prev_end_size=" << prev_end_size << ", "
+                   << "exists=" << exists << ", "
+                   << "whole_object=" << read_whole_object << dendl;
+
+    // check if object exists prior to start of incremental snap delta so that
+    // we don't DNE the object if no additional deltas exist
+    if (exists && start_snap_id == 0 &&
+        (!diff_interval.empty() || !zero_interval.empty())) {
+      ldout(cct, 20) << "object exists at snap id " << end_snap_id << dendl;
+      initial_extents_written = true;
+    }
+
+    prev_end_size = end_size;
+    start_snap_id = end_snap_id;
+
+    if (end_snap_id <= first_snap_id) {
+      // don't include deltas from the starting snapshots, but we iterate over
+      // it to track its existence and size
+      ldout(cct, 20) << "skipping prior snapshot " << dendl;
+      continue;
+    }
+
+    if (exists) {
+      for (auto& interval : diff_interval) {
+        snapshot_delta[{end_snap_id, clone_end_snap_id}].insert(
+          interval.first, interval.second,
+          SparseExtent(SPARSE_EXTENT_STATE_DATA, interval.second));
+      }
+    } else {
+      zero_interval.union_of(diff_interval);
+    }
+
+    if ((m_list_snaps_flags & LIST_SNAPS_FLAG_IGNORE_ZEROED_EXTENTS) == 0) {
+      for (auto& interval : zero_interval) {
+        snapshot_delta[{end_snap_id, end_snap_id}].insert(
+          interval.first, interval.second,
+          SparseExtent(SPARSE_EXTENT_STATE_ZEROED, interval.second));
+      }
+    }
+  }
+
+  bool snapshot_delta_empty = snapshot_delta.empty();
+  if (!initial_extents_written) {
+    zero_extent(first_snap_id, first_snap_id > 0);
+  }
+  ldout(cct, 20) << "snapshot_delta=" << snapshot_delta << dendl;
+
+  if (snapshot_delta_empty) {
+    list_from_parent();
+    return;
+  }
+
+  this->finish(0);
+}
+
+/**
+ * Fall back to listing snapshot deltas from the parent image.
+ *
+ * Only applies when the delta starts at snap id 0 (i.e. includes the
+ * pre-clone state), a parent exists, listing from the parent isn't
+ * disabled by flag, and this object actually overlaps the parent.
+ */
+template <typename I>
+void ObjectListSnapsRequest<I>::list_from_parent() {
+  I *image_ctx = this->m_ictx;
+  auto cct = image_ctx->cct;
+
+  ceph_assert(!m_snap_ids.empty());
+  librados::snap_t snap_id_start = *m_snap_ids.begin();
+  librados::snap_t snap_id_end = *m_snap_ids.rbegin();
+
+  std::unique_lock image_locker{image_ctx->image_lock};
+  if ((snap_id_start > 0) || (image_ctx->parent == nullptr) ||
+      ((m_list_snaps_flags & LIST_SNAPS_FLAG_DISABLE_LIST_FROM_PARENT) != 0)) {
+    image_locker.unlock();
+
+    this->finish(0);
+    return;
+  }
+
+  Extents parent_extents;
+  uint64_t raw_overlap = 0;
+  uint64_t object_overlap = 0;
+  image_ctx->get_parent_overlap(snap_id_end, &raw_overlap);
+  if (raw_overlap > 0) {
+    // calculate reverse mapping onto the parent image
+    std::tie(parent_extents, m_image_area) = io::util::object_to_area_extents(
+      image_ctx, this->m_object_no, m_object_extents);
+    object_overlap = image_ctx->prune_parent_extents(
+      parent_extents, m_image_area, raw_overlap, false);
+  }
+  if (object_overlap == 0) {
+    image_locker.unlock();
+
+    this->finish(0);
+    return;
+  }
+
+  auto ctx = create_context_callback<
+    ObjectListSnapsRequest<I>,
+    &ObjectListSnapsRequest<I>::handle_list_from_parent>(this);
+  auto aio_comp = AioCompletion::create_and_start(
+    ctx, librbd::util::get_image_ctx(image_ctx->parent), AIO_TYPE_GENERIC);
+  ldout(cct, 20) << "completion=" << aio_comp
+                 << " parent_extents=" << parent_extents
+                 << " area=" << m_image_area << dendl;
+
+  // zeroed extents in the parent are irrelevant to the child's delta
+  auto list_snaps_flags = (
+    m_list_snaps_flags | LIST_SNAPS_FLAG_IGNORE_ZEROED_EXTENTS);
+
+  ImageListSnapsRequest<I> req(
+    *image_ctx->parent, aio_comp, std::move(parent_extents), m_image_area,
+    {0, image_ctx->parent->snap_id}, list_snaps_flags, &m_parent_snapshot_delta,
+    this->m_trace);
+  req.send();
+}
+
+/**
+ * Fold the parent image's snapshot delta back into this object's delta.
+ *
+ * Parent image extents are reverse-mapped onto this object's extents and
+ * recorded under the INITIAL_WRITE_READ_SNAP_IDS key.
+ *
+ * NOTE(review): the result code 'r' is logged but not checked here --
+ * confirm that parent listing failures are surfaced elsewhere.
+ */
+template <typename I>
+void ObjectListSnapsRequest<I>::handle_list_from_parent(int r) {
+  I *image_ctx = this->m_ictx;
+  auto cct = image_ctx->cct;
+
+  ldout(cct, 20) << "r=" << r << ", "
+                 << "parent_snapshot_delta=" << m_parent_snapshot_delta
+                 << dendl;
+
+  // ignore special-case of fully empty dataset (we ignore zeroes)
+  if (m_parent_snapshot_delta.empty()) {
+    this->finish(0);
+    return;
+  }
+
+  // the write/read snapshot id key is not useful for parent images so
+  // map the special-case INITIAL_WRITE_READ_SNAP_IDS key
+  *m_snapshot_delta = {};
+  auto& intervals = (*m_snapshot_delta)[INITIAL_WRITE_READ_SNAP_IDS];
+  for (auto& [key, image_extents] : m_parent_snapshot_delta) {
+    for (auto image_extent : image_extents) {
+      auto state = image_extent.get_val().state;
+
+      // map image-extents back to this object
+      striper::LightweightObjectExtents object_extents;
+      io::util::area_to_object_extents(image_ctx, image_extent.get_off(),
+                                       image_extent.get_len(), m_image_area, 0,
+                                       &object_extents);
+      for (auto& object_extent : object_extents) {
+        ceph_assert(object_extent.object_no == this->m_object_no);
+        intervals.insert(
+          object_extent.offset, object_extent.length,
+          {state, object_extent.length});
+      }
+    }
+  }
+
+  ldout(cct, 20) << "snapshot_delta=" << *m_snapshot_delta << dendl;
+  this->finish(0);
+}
+
+// Record the requested object extents as DNE (object absent) or zeroed
+// for the given snapshot id, unless the caller asked to suppress
+// zeroed-extent reporting.
+template <typename I>
+void ObjectListSnapsRequest<I>::zero_extent(uint64_t snap_id, bool dne) {
+  auto ictx = this->m_ictx;
+  auto cct = ictx->cct;
+
+  if ((m_list_snaps_flags & LIST_SNAPS_FLAG_IGNORE_ZEROED_EXTENTS) != 0) {
+    return;
+  }
+
+  // coalesce the request extents, then emit one sparse extent per range
+  interval_set<uint64_t> coalesced;
+  for (auto [object_offset, object_length] : m_object_extents) {
+    coalesced.insert(object_offset, object_length);
+  }
+
+  auto state = (dne ? SPARSE_EXTENT_STATE_DNE : SPARSE_EXTENT_STATE_ZEROED);
+  for (auto [offset, length] : coalesced) {
+    ldout(cct, 20) << "snapshot " << snap_id << ": "
+                   << (dne ? "DNE" : "zeroed") << " extent "
+                   << offset << "~" << length << dendl;
+    (*m_snapshot_delta)[{snap_id, snap_id}].insert(
+      offset, length, SparseExtent(state, length));
+  }
+}
+
+} // namespace io
+} // namespace librbd
+
+// explicit instantiations for the concrete ImageCtx type; matching
+// 'extern template' declarations live in ObjectRequest.h
+template class librbd::io::ObjectRequest<librbd::ImageCtx>;
+template class librbd::io::ObjectReadRequest<librbd::ImageCtx>;
+template class librbd::io::AbstractObjectWriteRequest<librbd::ImageCtx>;
+template class librbd::io::ObjectWriteRequest<librbd::ImageCtx>;
+template class librbd::io::ObjectDiscardRequest<librbd::ImageCtx>;
+template class librbd::io::ObjectWriteSameRequest<librbd::ImageCtx>;
+template class librbd::io::ObjectCompareAndWriteRequest<librbd::ImageCtx>;
+template class librbd::io::ObjectListSnapsRequest<librbd::ImageCtx>;
diff --git a/src/librbd/io/ObjectRequest.h b/src/librbd/io/ObjectRequest.h
new file mode 100644
index 000000000..caf644023
--- /dev/null
+++ b/src/librbd/io/ObjectRequest.h
@@ -0,0 +1,505 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IO_OBJECT_REQUEST_H
+#define CEPH_LIBRBD_IO_OBJECT_REQUEST_H
+
+#include "include/int_types.h"
+#include "include/buffer.h"
+#include "include/neorados/RADOS.hpp"
+#include "include/rados/librados.hpp"
+#include "common/zipkin_trace.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/Types.h"
+#include "librbd/io/Types.h"
+#include <map>
+
+class Context;
+class ObjectExtent;
+
+namespace neorados { struct WriteOp; }
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace io {
+
+struct AioCompletion;
+template <typename> class CopyupRequest;
+
+/**
+ * This class represents an I/O operation to a single RBD data object.
+ * Its subclasses encapsulate logic for dealing with special cases
+ * for I/O due to layering.
+ */
+template <typename ImageCtxT = ImageCtx>
+class ObjectRequest {
+public:
+  // factory helpers for the concrete write-type requests
+  static ObjectRequest* create_write(
+    ImageCtxT *ictx, uint64_t object_no, uint64_t object_off,
+    ceph::bufferlist&& data, IOContext io_context, int op_flags,
+    int write_flags, std::optional<uint64_t> assert_version,
+    const ZTracer::Trace &parent_trace, Context *completion);
+  static ObjectRequest* create_discard(
+    ImageCtxT *ictx, uint64_t object_no, uint64_t object_off,
+    uint64_t object_len, IOContext io_context, int discard_flags,
+    const ZTracer::Trace &parent_trace, Context *completion);
+  static ObjectRequest* create_write_same(
+    ImageCtxT *ictx, uint64_t object_no, uint64_t object_off,
+    uint64_t object_len, ceph::bufferlist&& data, IOContext io_context,
+    int op_flags, const ZTracer::Trace &parent_trace, Context *completion);
+  static ObjectRequest* create_compare_and_write(
+    ImageCtxT *ictx, uint64_t object_no, uint64_t object_off,
+    ceph::bufferlist&& cmp_data, ceph::bufferlist&& write_data,
+    IOContext io_context, uint64_t *mismatch_offset, int op_flags,
+    const ZTracer::Trace &parent_trace, Context *completion);
+
+  ObjectRequest(ImageCtxT *ictx, uint64_t objectno, IOContext io_context,
+                const char *trace_name, const ZTracer::Trace &parent_trace,
+                Context *completion);
+  virtual ~ObjectRequest() {
+    m_trace.event("finish");
+  }
+
+  // attach an allocation hint to the write op (defined in ObjectRequest.cc)
+  static void add_write_hint(ImageCtxT& image_ctx,
+                             neorados::WriteOp *wr);
+
+  virtual void send() = 0;
+
+  // whether the image had parent overlap when this request was prepared
+  bool has_parent() const {
+    return m_has_parent;
+  }
+
+  // short human-readable op name, used for logging
+  virtual const char *get_op_type() const = 0;
+
+protected:
+  // map this object onto the parent image (defined in ObjectRequest.cc);
+  // NOTE(review): appears to also record the result in m_has_parent --
+  // confirm against the implementation
+  bool compute_parent_extents(Extents *parent_extents, ImageArea *area,
+                              bool read_request);
+
+  ImageCtxT *m_ictx;
+  uint64_t m_object_no;
+  IOContext m_io_context;
+  Context *m_completion;
+  ZTracer::Trace m_trace;
+
+  // complete the request asynchronously / synchronously with result r
+  void async_finish(int r);
+  void finish(int r);
+
+private:
+  bool m_has_parent = false;
+};
+
+/**
+ * Reads one or more extents from a single data object, optionally falling
+ * back to the parent image (and triggering copy-on-read) for layered
+ * images.  See the state diagram below.
+ */
+template <typename ImageCtxT = ImageCtx>
+class ObjectReadRequest : public ObjectRequest<ImageCtxT> {
+public:
+  static ObjectReadRequest* create(
+    ImageCtxT *ictx, uint64_t objectno, ReadExtents* extents,
+    IOContext io_context, int op_flags, int read_flags,
+    const ZTracer::Trace &parent_trace, uint64_t* version,
+    Context *completion) {
+    return new ObjectReadRequest(ictx, objectno, extents, io_context, op_flags,
+                                 read_flags, parent_trace, version, completion);
+  }
+
+  ObjectReadRequest(
+    ImageCtxT *ictx, uint64_t objectno, ReadExtents* extents,
+    IOContext io_context, int op_flags, int read_flags,
+    const ZTracer::Trace &parent_trace, uint64_t* version,
+    Context *completion);
+
+  void send() override;
+
+  const char *get_op_type() const override {
+    return "read";
+  }
+
+private:
+  /**
+   * @verbatim
+   *
+   * <start>
+   *    |
+   *    |
+   *    v
+   * READ_OBJECT
+   *    |
+   *    v (skip if not needed)
+   * READ_PARENT
+   *    |
+   *    v (skip if not needed)
+   * COPYUP
+   *    |
+   *    v
+   * <finish>
+   *
+   * @endverbatim
+   */
+
+  ReadExtents* m_extents;   // caller-owned output extents
+  int m_op_flags;
+  int m_read_flags;
+  uint64_t* m_version;      // optional object version output (may be null)
+
+  void read_object();
+  void handle_read_object(int r);
+
+  void read_parent();
+  void handle_read_parent(int r);
+
+  void copyup();
+};
+
+/**
+ * Common state machine for all mutating object requests (write, discard,
+ * writesame, compare-and-write): object map updates, guarded writes and
+ * copyup handling.  Subclasses customize behavior through the protected
+ * virtual hooks.
+ */
+template <typename ImageCtxT = ImageCtx>
+class AbstractObjectWriteRequest : public ObjectRequest<ImageCtxT> {
+public:
+  AbstractObjectWriteRequest(
+    ImageCtxT *ictx, uint64_t object_no, uint64_t object_off, uint64_t len,
+    IOContext io_context, const char *trace_name,
+    const ZTracer::Trace &parent_trace, Context *completion);
+
+  // true when the request carries no payload (e.g. zero-length write)
+  virtual bool is_empty_write_op() const {
+    return false;
+  }
+
+  // object map state to flag before the write is issued
+  virtual uint8_t get_pre_write_object_map_state() const {
+    return OBJECT_EXISTS;
+  }
+
+  // ops to bundle into a copyup request (defaults to the write ops)
+  virtual void add_copyup_ops(neorados::WriteOp *wr) {
+    add_write_ops(wr);
+  }
+
+  void handle_copyup(int r);
+
+  void send() override;
+
+protected:
+  uint64_t m_object_off;
+  uint64_t m_object_len;
+  bool m_full_object = false;     // request covers the entire object
+  bool m_copyup_enabled = true;   // cleared by compute_parent_info()
+
+  // e.g. discarding a nonexistent object without a parent is a no-op
+  virtual bool is_no_op_for_nonexistent_object() const {
+    return false;
+  }
+  virtual bool is_object_map_update_enabled() const {
+    return true;
+  }
+  // op must be re-issued against the object after copyup completes
+  virtual bool is_post_copyup_write_required() const {
+    return false;
+  }
+  // op leaves the object nonexistent (drives the post-write map update)
+  virtual bool is_non_existent_post_write_object_map_state() const {
+    return false;
+  }
+
+  virtual void add_write_hint(neorados::WriteOp *wr);
+  virtual void add_write_ops(neorados::WriteOp *wr) = 0;
+
+  // hook for translating op-specific result encodings (see
+  // ObjectCompareAndWriteRequest::filter_write_result)
+  virtual int filter_write_result(int r) const {
+    return r;
+  }
+
+  // extents this op will overwrite; a copyup may skip copying them
+  virtual Extents get_copyup_overwrite_extents() const {
+    return {{m_object_off, m_object_len}};
+  }
+
+private:
+  /**
+   * @verbatim
+   *
+   * <start>
+   *    |
+   *    v           (no-op write request)
+   * DETECT_NO_OP . . . . . . . . . . . . . . . . . . .
+   *    |                                             .
+   *    v (skip if not required/disabled)             .
+   * PRE_UPDATE_OBJECT_MAP                            .
+   *    |          .                                  .
+   *    |          . (child dne)                      .
+   *    |          . . . . . . . . .                  .
+   *    |                          .                  .
+   *    |   (post-copyup write)    .                  .
+   *    | . . . . . . . . . . . .  .                  .
+   *    | .                     .  .                  .
+   *    v v                     .  v                  .
+   *   WRITE . . . . . . . . > COPYUP (if required)   .
+   *    |                       |                     .
+   *    |/----------------------/                     .
+   *    |                                             .
+   *    v (skip if not required/disabled)             .
+   * POST_UPDATE_OBJECT_MAP                           .
+   *    |                                             .
+   *    v                                             .
+   * <finish> < . . . . . . . . . . . . . . . . . . . .
+   *
+   * @endverbatim
+   */
+
+  Extents m_parent_extents;
+  ImageArea m_image_area = ImageArea::DATA;
+  bool m_object_may_exist = false;
+  bool m_copyup_in_progress = false;
+  bool m_guarding_migration_write = false;
+
+  void compute_parent_info();
+
+  void pre_write_object_map_update();
+  void handle_pre_write_object_map_update(int r);
+
+  void write_object();
+  void handle_write_object(int r);
+
+  void copyup();
+
+  void post_write_object_map_update();
+  void handle_post_write_object_map_update(int r);
+
+};
+
+/**
+ * Plain object write; supports exclusive-create and assert-version write
+ * guards (see add_write_hint()).
+ */
+template <typename ImageCtxT = ImageCtx>
+class ObjectWriteRequest : public AbstractObjectWriteRequest<ImageCtxT> {
+public:
+  ObjectWriteRequest(
+    ImageCtxT *ictx, uint64_t object_no, uint64_t object_off,
+    ceph::bufferlist&& data, IOContext io_context, int op_flags,
+    int write_flags, std::optional<uint64_t> assert_version,
+    const ZTracer::Trace &parent_trace, Context *completion)
+    : AbstractObjectWriteRequest<ImageCtxT>(ictx, object_no, object_off,
+                                            data.length(), io_context, "write",
+                                            parent_trace, completion),
+    m_write_data(std::move(data)), m_op_flags(op_flags),
+    m_write_flags(write_flags), m_assert_version(assert_version) {
+  }
+
+  // a zero-length write carries no payload and is treated as a no-op
+  bool is_empty_write_op() const override {
+    return (m_write_data.length() == 0);
+  }
+
+  const char *get_op_type() const override {
+    return "write";
+  }
+
+protected:
+  void add_write_ops(neorados::WriteOp *wr) override;
+  void add_write_hint(neorados::WriteOp *wr) override;
+
+private:
+  ceph::bufferlist m_write_data;
+  int m_op_flags;
+  int m_write_flags;                        // OBJECT_WRITE_FLAG_*
+  std::optional<uint64_t> m_assert_version; // optional version write guard
+};
+
+/**
+ * Discards an object extent.  The constructor selects one of four
+ * concrete actions (remove, create+truncate, truncate, zero) based on the
+ * extent coverage, parent overlap and discard flags.
+ */
+template <typename ImageCtxT = ImageCtx>
+class ObjectDiscardRequest : public AbstractObjectWriteRequest<ImageCtxT> {
+public:
+  ObjectDiscardRequest(
+    ImageCtxT *ictx, uint64_t object_no, uint64_t object_off,
+    uint64_t object_len, IOContext io_context, int discard_flags,
+    const ZTracer::Trace &parent_trace, Context *completion)
+    : AbstractObjectWriteRequest<ImageCtxT>(ictx, object_no, object_off,
+                                            object_len, io_context, "discard",
+                                            parent_trace, completion),
+      m_discard_flags(discard_flags) {
+    if (this->m_full_object) {
+      // full-object discard: removal unless clone removal is disabled
+      // and a parent could otherwise become visible
+      if ((m_discard_flags & OBJECT_DISCARD_FLAG_DISABLE_CLONE_REMOVE) != 0 &&
+          this->has_parent()) {
+        if (!this->m_copyup_enabled) {
+          // need to hide the parent object instead of child object
+          m_discard_action = DISCARD_ACTION_REMOVE_TRUNCATE;
+        } else {
+          m_discard_action = DISCARD_ACTION_TRUNCATE;
+        }
+      } else {
+        m_discard_action = DISCARD_ACTION_REMOVE;
+      }
+    } else if (object_off + object_len == ictx->layout.object_size) {
+      // discard extends to the end of the object: truncate suffices
+      m_discard_action = DISCARD_ACTION_TRUNCATE;
+    } else {
+      m_discard_action = DISCARD_ACTION_ZERO;
+    }
+  }
+
+  const char* get_op_type() const override {
+    switch (m_discard_action) {
+    case DISCARD_ACTION_REMOVE:
+      return "remove";
+    case DISCARD_ACTION_REMOVE_TRUNCATE:
+      return "remove (create+truncate)";
+    case DISCARD_ACTION_TRUNCATE:
+      return "truncate";
+    case DISCARD_ACTION_ZERO:
+      return "zero";
+    }
+    ceph_abort();
+    return nullptr;
+  }
+
+  // removals flag the object PENDING first; the post-write update then
+  // transitions it to NONEXISTENT
+  uint8_t get_pre_write_object_map_state() const override {
+    if (m_discard_action == DISCARD_ACTION_REMOVE) {
+      return OBJECT_PENDING;
+    }
+    return OBJECT_EXISTS;
+  }
+
+protected:
+  // discarding a nonexistent object is a no-op unless a parent overlaps
+  bool is_no_op_for_nonexistent_object() const override {
+    return (!this->has_parent());
+  }
+  bool is_object_map_update_enabled() const override {
+    return (
+      (m_discard_flags & OBJECT_DISCARD_FLAG_DISABLE_OBJECT_MAP_UPDATE) == 0);
+  }
+  bool is_non_existent_post_write_object_map_state() const override {
+    return (m_discard_action == DISCARD_ACTION_REMOVE);
+  }
+
+  void add_write_hint(neorados::WriteOp *wr) override {
+    // no hint for discard
+  }
+
+  void add_write_ops(neorados::WriteOp *wr) override;
+
+private:
+  enum DiscardAction {
+    DISCARD_ACTION_REMOVE,
+    DISCARD_ACTION_REMOVE_TRUNCATE,
+    DISCARD_ACTION_TRUNCATE,
+    DISCARD_ACTION_ZERO
+  };
+
+  DiscardAction m_discard_action;
+  int m_discard_flags;
+
+};
+
+/**
+ * Replicates a small data buffer across an object extent via the rados
+ * writesame op.
+ */
+template <typename ImageCtxT = ImageCtx>
+class ObjectWriteSameRequest : public AbstractObjectWriteRequest<ImageCtxT> {
+public:
+  ObjectWriteSameRequest(
+    ImageCtxT *ictx, uint64_t object_no, uint64_t object_off,
+    uint64_t object_len, ceph::bufferlist&& data, IOContext io_context,
+    int op_flags, const ZTracer::Trace &parent_trace, Context *completion)
+    : AbstractObjectWriteRequest<ImageCtxT>(ictx, object_no, object_off,
+                                            object_len, io_context, "writesame",
+                                            parent_trace, completion),
+    m_write_data(std::move(data)), m_op_flags(op_flags) {
+  }
+
+  const char *get_op_type() const override {
+    return "writesame";
+  }
+
+protected:
+  void add_write_ops(neorados::WriteOp *wr) override;
+
+private:
+  ceph::bufferlist m_write_data;  // pattern replicated across the extent
+  int m_op_flags;
+};
+
+/**
+ * Atomically verifies the existing object contents match m_cmp_bl before
+ * applying m_write_bl.  A mismatch is reported as -EILSEQ with the
+ * image-relative offset surfaced via the optional mismatch_offset output
+ * (see filter_write_result()).
+ *
+ * Fix: is_post_copyup_write_required() was re-declared 'virtual' without
+ * 'override'; the explicit override specifier now guarantees it matches
+ * the AbstractObjectWriteRequest hook.
+ */
+template <typename ImageCtxT = ImageCtx>
+class ObjectCompareAndWriteRequest : public AbstractObjectWriteRequest<ImageCtxT> {
+public:
+  ObjectCompareAndWriteRequest(
+    ImageCtxT *ictx, uint64_t object_no, uint64_t object_off,
+    ceph::bufferlist&& cmp_bl, ceph::bufferlist&& write_bl,
+    IOContext io_context, uint64_t *mismatch_offset, int op_flags,
+    const ZTracer::Trace &parent_trace, Context *completion)
+    : AbstractObjectWriteRequest<ImageCtxT>(ictx, object_no, object_off,
+                                            cmp_bl.length(), io_context,
+                                            "compare_and_write", parent_trace,
+                                            completion),
+    m_cmp_bl(std::move(cmp_bl)), m_write_bl(std::move(write_bl)),
+    m_mismatch_offset(mismatch_offset), m_op_flags(op_flags) {
+  }
+
+  const char *get_op_type() const override {
+    return "compare_and_write";
+  }
+
+  void add_copyup_ops(neorados::WriteOp *wr) override {
+    // no-op on copyup
+  }
+
+protected:
+  // the compare+write must run against the object after copyup populates
+  // it, so the write is re-issued once copyup completes
+  bool is_post_copyup_write_required() const override {
+    return true;
+  }
+
+  void add_write_ops(neorados::WriteOp *wr) override;
+
+  int filter_write_result(int r) const override;
+
+  // nothing is unconditionally overwritten, so no copyup extents are
+  // excluded
+  Extents get_copyup_overwrite_extents() const override {
+    return {};
+  }
+
+private:
+  ceph::bufferlist m_cmp_bl;     // expected current object contents
+  ceph::bufferlist m_write_bl;   // replacement data on successful compare
+  uint64_t *m_mismatch_offset;   // optional output (may be null)
+  int m_op_flags;
+};
+
+/**
+ * Computes a per-snapshot delta (data/zeroed/DNE sparse extents) for the
+ * requested extents of a single object, optionally merging in deltas
+ * reverse-mapped from the parent image.
+ */
+template <typename ImageCtxT = ImageCtx>
+class ObjectListSnapsRequest : public ObjectRequest<ImageCtxT> {
+public:
+  static ObjectListSnapsRequest* create(
+    ImageCtxT *ictx, uint64_t objectno, Extents&& object_extents,
+    SnapIds&& snap_ids, int list_snaps_flags,
+    const ZTracer::Trace &parent_trace, SnapshotDelta* snapshot_delta,
+    Context *completion) {
+    return new ObjectListSnapsRequest(ictx, objectno,
+                                      std::move(object_extents),
+                                      std::move(snap_ids), list_snaps_flags,
+                                      parent_trace, snapshot_delta, completion);
+  }
+
+  ObjectListSnapsRequest(
+    ImageCtxT *ictx, uint64_t objectno, Extents&& object_extents,
+    SnapIds&& snap_ids, int list_snaps_flags,
+    const ZTracer::Trace &parent_trace, SnapshotDelta* snapshot_delta,
+    Context *completion);
+
+  void send() override;
+
+  const char *get_op_type() const override {
+    return "snap_list";
+  }
+
+private:
+  Extents m_object_extents;          // object-relative extents of interest
+  SnapIds m_snap_ids;                // ordered snap ids bounding the delta
+  int m_list_snaps_flags;            // LIST_SNAPS_FLAG_*
+  SnapshotDelta* m_snapshot_delta;   // caller-owned output
+
+  neorados::SnapSet m_snap_set;      // raw rados list-snaps result
+  boost::system::error_code m_ec;    // op-level error from list-snaps
+
+  ImageArea m_image_area = ImageArea::DATA;
+  SnapshotDelta m_parent_snapshot_delta;
+
+  void list_snaps();
+  void handle_list_snaps(int r);
+
+  void list_from_parent();
+  void handle_list_from_parent(int r);
+
+  void zero_extent(uint64_t snap_id, bool dne);
+};
+
+} // namespace io
+} // namespace librbd
+
+extern template class librbd::io::ObjectRequest<librbd::ImageCtx>;
+extern template class librbd::io::ObjectReadRequest<librbd::ImageCtx>;
+extern template class librbd::io::AbstractObjectWriteRequest<librbd::ImageCtx>;
+extern template class librbd::io::ObjectWriteRequest<librbd::ImageCtx>;
+extern template class librbd::io::ObjectDiscardRequest<librbd::ImageCtx>;
+extern template class librbd::io::ObjectWriteSameRequest<librbd::ImageCtx>;
+extern template class librbd::io::ObjectCompareAndWriteRequest<librbd::ImageCtx>;
+extern template class librbd::io::ObjectListSnapsRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_IO_OBJECT_REQUEST_H
diff --git a/src/librbd/io/QosImageDispatch.cc b/src/librbd/io/QosImageDispatch.cc
new file mode 100644
index 000000000..ea1d5dbb5
--- /dev/null
+++ b/src/librbd/io/QosImageDispatch.cc
@@ -0,0 +1,328 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/io/QosImageDispatch.h"
+#include "common/dout.h"
+#include "librbd/AsioEngine.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/io/FlushTracker.h"
+#include <utility>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::io::QosImageDispatch: " << this << " " \
+ << __func__ << ": "
+
+namespace librbd {
+namespace io {
+
+namespace {
+
+// Returns the total byte length covered by the supplied image extents.
+uint64_t get_extent_length(const Extents& extents) {
+ uint64_t length = 0;
+ for (auto& extent : extents) {
+ length += extent.second;
+ }
+ return length;
+}
+
+// Token cost of an IO against the throttle identified by 'flag': zero when
+// the throttle applies only to the opposite IO direction (read vs write),
+// the extent byte count for bandwidth (bps) throttles, or a single token
+// for iops throttles.
+uint64_t calculate_tokens(bool read_op, uint64_t extent_length, uint64_t flag) {
+ if (read_op && ((flag & IMAGE_DISPATCH_FLAG_QOS_WRITE_MASK) != 0)) {
+ return 0;
+ } else if (!read_op && ((flag & IMAGE_DISPATCH_FLAG_QOS_READ_MASK) != 0)) {
+ return 0;
+ }
+
+ return (((flag & IMAGE_DISPATCH_FLAG_QOS_BPS_MASK) != 0) ? extent_length : 1);
+}
+
+// Dispatch-flag bit paired with the throttle's name (the name matches the
+// corresponding rbd_qos_* configuration option).
+static const std::pair<uint64_t, const char*> throttle_flags[] = {
+ {IMAGE_DISPATCH_FLAG_QOS_IOPS_THROTTLE, "rbd_qos_iops_throttle" },
+ {IMAGE_DISPATCH_FLAG_QOS_BPS_THROTTLE, "rbd_qos_bps_throttle" },
+ {IMAGE_DISPATCH_FLAG_QOS_READ_IOPS_THROTTLE, "rbd_qos_read_iops_throttle" },
+ {IMAGE_DISPATCH_FLAG_QOS_WRITE_IOPS_THROTTLE, "rbd_qos_write_iops_throttle" },
+ {IMAGE_DISPATCH_FLAG_QOS_READ_BPS_THROTTLE, "rbd_qos_read_bps_throttle" },
+ {IMAGE_DISPATCH_FLAG_QOS_WRITE_BPS_THROTTLE, "rbd_qos_write_bps_throttle" }
+};
+
+} // anonymous namespace
+
+// Creates one TokenBucketThrottle per supported QOS flag. All throttles
+// share the image's global timer instance and start with a zero (disabled)
+// limit until apply_qos_limit() is invoked.
+template <typename I>
+QosImageDispatch<I>::QosImageDispatch(I* image_ctx)
+ : m_image_ctx(image_ctx), m_flush_tracker(new FlushTracker<I>(image_ctx)) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 5) << "ictx=" << image_ctx << dendl;
+
+ SafeTimer *timer;
+ ceph::mutex *timer_lock;
+ ImageCtx::get_timer_instance(cct, &timer, &timer_lock);
+ for (auto [flag, name] : throttle_flags) {
+ m_throttles.emplace_back(
+ flag,
+ new TokenBucketThrottle(cct, name, 0, 0, timer, timer_lock));
+ }
+}
+
+template <typename I>
+QosImageDispatch<I>::~QosImageDispatch() {
+ // m_throttles holds raw owning pointers to the throttles
+ for (auto t : m_throttles) {
+ delete t.second;
+ }
+}
+
+template <typename I>
+void QosImageDispatch<I>::shut_down(Context* on_finish) {
+ // stop tracking in-flight flushes; shut down completes synchronously
+ m_flush_tracker->shut_down();
+ on_finish->complete(0);
+}
+
+// Propagates the minimum scheduler tick to every throttle.
+template <typename I>
+void QosImageDispatch<I>::apply_qos_schedule_tick_min(uint64_t tick) {
+ for (auto pair : m_throttles) {
+ pair.second->set_schedule_tick_min(tick);
+ }
+}
+
+// Applies a limit/burst configuration to the throttle matching 'flag' and
+// records whether that throttle is now enabled (non-zero limit) in
+// m_qos_enabled_flag. 'flag' must name a known throttle.
+template <typename I>
+void QosImageDispatch<I>::apply_qos_limit(uint64_t flag, uint64_t limit,
+ uint64_t burst, uint64_t burst_seconds) {
+ auto cct = m_image_ctx->cct;
+ TokenBucketThrottle *throttle = nullptr;
+ for (auto pair : m_throttles) {
+ if (flag == pair.first) {
+ throttle = pair.second;
+ break;
+ }
+ }
+ ceph_assert(throttle != nullptr);
+
+ int r = throttle->set_limit(limit, burst, burst_seconds);
+ if (r < 0) {
+ lderr(cct) << throttle->get_name() << ": invalid qos parameter: "
+ << "burst(" << burst << ") is less than "
+ << "limit(" << limit << ")" << dendl;
+ // if apply failed, we should at least make sure the limit works.
+ throttle->set_limit(limit, 0, 1);
+ }
+
+ if (limit) {
+ m_qos_enabled_flag |= flag;
+ } else {
+ m_qos_enabled_flag &= ~flag;
+ }
+}
+
+// Records which IO operation types (RBD_IO_OPERATION_*) bypass QOS entirely.
+template <typename I>
+void QosImageDispatch<I>::apply_qos_exclude_ops(uint64_t exclude_ops) {
+ m_qos_exclude_ops = exclude_ops;
+}
+
+// Each IO entry point below returns true when the IO was intercepted by a
+// throttle (dispatch resumes asynchronously) and false to let the IO
+// continue down the dispatch chain unimpeded.
+template <typename I>
+bool QosImageDispatch<I>::read(
+ AioCompletion* aio_comp, Extents &&image_extents, ReadResult &&read_result,
+ IOContext io_context, int op_flags, int read_flags,
+ const ZTracer::Trace &parent_trace, uint64_t tid,
+ std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << "tid=" << tid << ", image_extents=" << image_extents
+ << dendl;
+
+ if (m_qos_exclude_ops & RBD_IO_OPERATION_READ) {
+ return false;
+ }
+
+ if (needs_throttle(true, image_extents, tid, image_dispatch_flags,
+ dispatch_result, on_finish, on_dispatched)) {
+ return true;
+ }
+
+ return false;
+}
+
+template <typename I>
+bool QosImageDispatch<I>::write(
+ AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&bl,
+ int op_flags, const ZTracer::Trace &parent_trace,
+ uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << "tid=" << tid << ", image_extents=" << image_extents
+ << dendl;
+
+ if (m_qos_exclude_ops & RBD_IO_OPERATION_WRITE) {
+ return false;
+ }
+
+ if (needs_throttle(false, image_extents, tid, image_dispatch_flags,
+ dispatch_result, on_finish, on_dispatched)) {
+ return true;
+ }
+
+ return false;
+}
+
+template <typename I>
+bool QosImageDispatch<I>::discard(
+ AioCompletion* aio_comp, Extents &&image_extents,
+ uint32_t discard_granularity_bytes, const ZTracer::Trace &parent_trace,
+ uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << "tid=" << tid << ", image_extents=" << image_extents
+ << dendl;
+
+ if (m_qos_exclude_ops & RBD_IO_OPERATION_DISCARD) {
+ return false;
+ }
+
+ if (needs_throttle(false, image_extents, tid, image_dispatch_flags,
+ dispatch_result, on_finish, on_dispatched)) {
+ return true;
+ }
+
+ return false;
+}
+
+template <typename I>
+bool QosImageDispatch<I>::write_same(
+ AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&bl,
+ int op_flags, const ZTracer::Trace &parent_trace,
+ uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << "tid=" << tid << ", image_extents=" << image_extents
+ << dendl;
+
+ if (m_qos_exclude_ops & RBD_IO_OPERATION_WRITE_SAME) {
+ return false;
+ }
+
+ if (needs_throttle(false, image_extents, tid, image_dispatch_flags,
+ dispatch_result, on_finish, on_dispatched)) {
+ return true;
+ }
+
+ return false;
+}
+
+template <typename I>
+bool QosImageDispatch<I>::compare_and_write(
+ AioCompletion* aio_comp, Extents &&image_extents,
+ bufferlist &&cmp_bl, bufferlist &&bl, uint64_t *mismatch_offset,
+ int op_flags, const ZTracer::Trace &parent_trace,
+ uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << "tid=" << tid << ", image_extents=" << image_extents
+ << dendl;
+
+ if (m_qos_exclude_ops & RBD_IO_OPERATION_COMPARE_AND_WRITE) {
+ return false;
+ }
+
+ if (needs_throttle(false, image_extents, tid, image_dispatch_flags,
+ dispatch_result, on_finish, on_dispatched)) {
+ return true;
+ }
+
+ return false;
+}
+
+// Flushes are never throttled; they are routed through the flush tracker so
+// they complete only after previously tracked (write) IOs have finished.
+template <typename I>
+bool QosImageDispatch<I>::flush(
+ AioCompletion* aio_comp, FlushSource flush_source,
+ const ZTracer::Trace &parent_trace, uint64_t tid,
+ std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << "tid=" << tid << dendl;
+
+ *dispatch_result = DISPATCH_RESULT_CONTINUE;
+ m_flush_tracker->flush(on_dispatched);
+ return true;
+}
+
+// Invoked (via the wrapped on_finish in needs_throttle) when a tracked
+// write IO completes; retires the IO from the flush tracker.
+template <typename I>
+void QosImageDispatch<I>::handle_finished(int r, uint64_t tid) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << "tid=" << tid << dendl;
+
+ m_flush_tracker->finish_io(tid);
+}
+
+// Atomically ORs 'flag' into the IO's dispatch flags; returns true once
+// every bit of IMAGE_DISPATCH_FLAG_QOS_MASK is set, i.e. all throttles
+// have released this IO.
+template <typename I>
+bool QosImageDispatch<I>::set_throttle_flag(
+ std::atomic<uint32_t>* image_dispatch_flags, uint32_t flag) {
+ uint32_t expected = image_dispatch_flags->load();
+ uint32_t desired;
+ do {
+ desired = expected | flag;
+ } while (!image_dispatch_flags->compare_exchange_weak(expected, desired));
+
+ return ((desired & IMAGE_DISPATCH_FLAG_QOS_MASK) ==
+ IMAGE_DISPATCH_FLAG_QOS_MASK);
+}
+
+// Offers the IO to every throttle. Returns true when at least one throttle
+// queued the IO (its flag will be set later by handle_throttle_ready(), and
+// whichever call completes the QOS mask re-dispatches the IO); returns
+// false when every flag was set synchronously and dispatch may continue
+// inline.
+template <typename I>
+bool QosImageDispatch<I>::needs_throttle(
+ bool read_op, const Extents& image_extents, uint64_t tid,
+ std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ auto extent_length = get_extent_length(image_extents);
+ bool all_qos_flags_set = false;
+
+ if (!read_op) {
+ // track in-flight write IOs so flush() can order behind them; chain the
+ // caller's on_finish so handle_finished() runs upon completion
+ m_flush_tracker->start_io(tid);
+ *on_finish = new LambdaContext([this, tid, on_finish=*on_finish](int r) {
+ handle_finished(r, tid);
+ on_finish->complete(r);
+ });
+ }
+ *dispatch_result = DISPATCH_RESULT_CONTINUE;
+
+ auto qos_enabled_flag = m_qos_enabled_flag;
+ for (auto [flag, throttle] : m_throttles) {
+ if ((qos_enabled_flag & flag) == 0) {
+ // disabled throttle: mark its flag as satisfied immediately
+ all_qos_flags_set = set_throttle_flag(image_dispatch_flags, flag);
+ continue;
+ }
+
+ auto tokens = calculate_tokens(read_op, extent_length, flag);
+ if (tokens > 0 &&
+ throttle->get(tokens, this, &QosImageDispatch<I>::handle_throttle_ready,
+ Tag{image_dispatch_flags, on_dispatched}, flag)) {
+ // insufficient tokens -- the throttle queued the request and will
+ // invoke handle_throttle_ready() when tokens become available
+ ldout(cct, 15) << "on_dispatched=" << on_dispatched << ", "
+ << "flag=" << flag << dendl;
+ all_qos_flags_set = false;
+ } else {
+ all_qos_flags_set = set_throttle_flag(image_dispatch_flags, flag);
+ }
+ }
+ return !all_qos_flags_set;
+}
+
+// Throttle callback: sets the throttle's flag and, if this was the last
+// outstanding throttle, re-dispatches the IO.
+template <typename I>
+void QosImageDispatch<I>::handle_throttle_ready(Tag&& tag, uint64_t flag) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 15) << "on_dispatched=" << tag.on_dispatched << ", "
+ << "flag=" << flag << dendl;
+
+ if (set_throttle_flag(tag.image_dispatch_flags, flag)) {
+ // timer_lock is held -- so dispatch from outside the timer thread
+ m_image_ctx->asio_engine->post(tag.on_dispatched, 0);
+ }
+}
+
+} // namespace io
+} // namespace librbd
+
+template class librbd::io::QosImageDispatch<librbd::ImageCtx>;
diff --git a/src/librbd/io/QosImageDispatch.h b/src/librbd/io/QosImageDispatch.h
new file mode 100644
index 000000000..f5e08940a
--- /dev/null
+++ b/src/librbd/io/QosImageDispatch.h
@@ -0,0 +1,135 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IO_QOS_IMAGE_DISPATCH_H
+#define CEPH_LIBRBD_IO_QOS_IMAGE_DISPATCH_H
+
+#include <list>
+#include <memory>
+
+#include "librbd/io/ImageDispatchInterface.h"
+#include "include/int_types.h"
+#include "include/buffer.h"
+#include "common/zipkin_trace.h"
+#include "common/Throttle.h"
+#include "librbd/io/ReadResult.h"
+#include "librbd/io/Types.h"
+
+struct Context;
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace io {
+
+struct AioCompletion;
+template <typename> class FlushTracker;
+
+// Image dispatch layer that applies token-bucket QOS throttling (iops/bps,
+// split by read/write) to image-level IO before it continues down the
+// dispatch chain.
+template <typename ImageCtxT>
+class QosImageDispatch : public ImageDispatchInterface {
+public:
+ // Per-IO state handed to a queued throttle's ready callback: the IO's
+ // shared dispatch-flag word plus the continuation to invoke once all
+ // throttles have released the IO.
+ struct Tag {
+ std::atomic<uint32_t>* image_dispatch_flags;
+ Context* on_dispatched;
+
+ Tag(std::atomic<uint32_t>* image_dispatch_flags, Context* on_dispatched)
+ : image_dispatch_flags(image_dispatch_flags),
+ on_dispatched(on_dispatched) {
+ }
+ };
+
+ QosImageDispatch(ImageCtxT* image_ctx);
+ ~QosImageDispatch() override;
+
+ ImageDispatchLayer get_dispatch_layer() const override {
+ return IMAGE_DISPATCH_LAYER_QOS;
+ }
+
+ void shut_down(Context* on_finish) override;
+
+ // runtime (re)configuration of the per-flag throttles
+ void apply_qos_schedule_tick_min(uint64_t tick);
+ void apply_qos_limit(uint64_t flag, uint64_t limit, uint64_t burst,
+ uint64_t burst_seconds);
+ void apply_qos_exclude_ops(uint64_t exclude_ops);
+
+ // IO entry points: return true when the IO was throttled (dispatch
+ // resumes asynchronously via on_dispatched), false to pass through
+ bool read(
+ AioCompletion* aio_comp, Extents &&image_extents,
+ ReadResult &&read_result, IOContext io_context, int op_flags,
+ int read_flags, const ZTracer::Trace &parent_trace, uint64_t tid,
+ std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override;
+ bool write(
+ AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&bl,
+ int op_flags, const ZTracer::Trace &parent_trace,
+ uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override;
+ bool discard(
+ AioCompletion* aio_comp, Extents &&image_extents,
+ uint32_t discard_granularity_bytes, const ZTracer::Trace &parent_trace,
+ uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override;
+ bool write_same(
+ AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&bl,
+ int op_flags, const ZTracer::Trace &parent_trace,
+ uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override;
+ bool compare_and_write(
+ AioCompletion* aio_comp, Extents &&image_extents,
+ bufferlist &&cmp_bl, bufferlist &&bl, uint64_t *mismatch_offset,
+ int op_flags, const ZTracer::Trace &parent_trace,
+ uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override;
+ bool flush(
+ AioCompletion* aio_comp, FlushSource flush_source,
+ const ZTracer::Trace &parent_trace, uint64_t tid,
+ std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override;
+
+ // snapshot listing is not subject to QOS
+ bool list_snaps(
+ AioCompletion* aio_comp, Extents&& image_extents, SnapIds&& snap_ids,
+ int list_snaps_flags, SnapshotDelta* snapshot_delta,
+ const ZTracer::Trace &parent_trace, uint64_t tid,
+ std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override {
+ return false;
+ }
+
+ bool invalidate_cache(Context* on_finish) override {
+ return false;
+ }
+
+private:
+ ImageCtxT* m_image_ctx;
+
+ // dispatch-flag bit -> owned throttle (deleted in the destructor)
+ std::list<std::pair<uint64_t, TokenBucketThrottle*> > m_throttles;
+ // bitmask of throttles currently configured with a non-zero limit
+ uint64_t m_qos_enabled_flag = 0;
+ // RBD_IO_OPERATION_* bits that bypass QOS entirely
+ uint64_t m_qos_exclude_ops = 0;
+
+ // orders flushes behind in-flight (tracked) write IOs
+ std::unique_ptr<FlushTracker<ImageCtxT>> m_flush_tracker;
+
+ void handle_finished(int r, uint64_t tid);
+
+ bool set_throttle_flag(std::atomic<uint32_t>* image_dispatch_flags,
+ uint32_t flag);
+ bool needs_throttle(bool read_op, const Extents& image_extents, uint64_t tid,
+ std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched);
+ void handle_throttle_ready(Tag&& tag, uint64_t flag);
+
+};
+
+} // namespace io
+} // namespace librbd
+
+extern template class librbd::io::QosImageDispatch<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_IO_QOS_IMAGE_DISPATCH_H
diff --git a/src/librbd/io/QueueImageDispatch.cc b/src/librbd/io/QueueImageDispatch.cc
new file mode 100644
index 000000000..ea5ed63b4
--- /dev/null
+++ b/src/librbd/io/QueueImageDispatch.cc
@@ -0,0 +1,154 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/io/QueueImageDispatch.h"
+#include "common/dout.h"
+#include "common/Cond.h"
+#include "librbd/AsioEngine.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Utils.h"
+#include "librbd/io/AioCompletion.h"
+#include "librbd/io/FlushTracker.h"
+#include "librbd/io/ImageDispatchSpec.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::io::QueueImageDispatch: " << this \
+ << " " << __func__ << ": "
+
+namespace librbd {
+namespace io {
+
+template <typename I>
+QueueImageDispatch<I>::QueueImageDispatch(I* image_ctx)
+ : m_image_ctx(image_ctx), m_flush_tracker(new FlushTracker<I>(image_ctx)) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 5) << "ictx=" << image_ctx << dendl;
+}
+
+template <typename I>
+QueueImageDispatch<I>::~QueueImageDispatch() {
+ // m_flush_tracker is a raw owning pointer in this layer
+ // NOTE(review): QosImageDispatch holds its tracker in a unique_ptr --
+ // consider aligning the two for consistency
+ delete m_flush_tracker;
+}
+
+template <typename I>
+void QueueImageDispatch<I>::shut_down(Context* on_finish) {
+ // stop tracking in-flight flushes; shut down completes synchronously
+ m_flush_tracker->shut_down();
+ on_finish->complete(0);
+}
+
+// Each IO entry point defers the IO to the AsioEngine (via enqueue) when
+// non-blocking AIO is enabled; returns false to dispatch inline otherwise.
+template <typename I>
+bool QueueImageDispatch<I>::read(
+ AioCompletion* aio_comp, Extents &&image_extents, ReadResult &&read_result,
+ IOContext io_context, int op_flags, int read_flags,
+ const ZTracer::Trace &parent_trace, uint64_t tid,
+ std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << "tid=" << tid << dendl;
+
+ return enqueue(true, tid, dispatch_result, on_finish, on_dispatched);
+}
+
+template <typename I>
+bool QueueImageDispatch<I>::write(
+ AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&bl,
+ int op_flags, const ZTracer::Trace &parent_trace,
+ uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << "tid=" << tid << dendl;
+
+ return enqueue(false, tid, dispatch_result, on_finish, on_dispatched);
+}
+
+template <typename I>
+bool QueueImageDispatch<I>::discard(
+ AioCompletion* aio_comp, Extents &&image_extents,
+ uint32_t discard_granularity_bytes, const ZTracer::Trace &parent_trace,
+ uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << "tid=" << tid << dendl;
+
+ return enqueue(false, tid, dispatch_result, on_finish, on_dispatched);
+}
+
+template <typename I>
+bool QueueImageDispatch<I>::write_same(
+ AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&bl,
+ int op_flags, const ZTracer::Trace &parent_trace,
+ uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << "tid=" << tid << dendl;
+
+ return enqueue(false, tid, dispatch_result, on_finish, on_dispatched);
+}
+
+template <typename I>
+bool QueueImageDispatch<I>::compare_and_write(
+ AioCompletion* aio_comp, Extents &&image_extents,
+ bufferlist &&cmp_bl, bufferlist &&bl, uint64_t *mismatch_offset,
+ int op_flags, const ZTracer::Trace &parent_trace,
+ uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << "tid=" << tid << dendl;
+
+ return enqueue(false, tid, dispatch_result, on_finish, on_dispatched);
+}
+
+// Flushes are never queued directly; the flush tracker invokes
+// on_dispatched once previously tracked (write) IOs have completed.
+template <typename I>
+bool QueueImageDispatch<I>::flush(
+ AioCompletion* aio_comp, FlushSource flush_source,
+ const ZTracer::Trace &parent_trace, uint64_t tid,
+ std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << "tid=" << tid << dendl;
+
+ *dispatch_result = DISPATCH_RESULT_CONTINUE;
+ m_flush_tracker->flush(on_dispatched);
+ return true;
+}
+
+// Invoked via the wrapped on_finish when a tracked write IO completes.
+template <typename I>
+void QueueImageDispatch<I>::handle_finished(int r, uint64_t tid) {
+ auto cct = m_image_ctx->cct;
+ ldout(cct, 20) << "tid=" << tid << dendl;
+
+ m_flush_tracker->finish_io(tid);
+}
+
+// Returns false (pass-through) when non-blocking AIO is disabled;
+// otherwise registers write IOs with the flush tracker, wraps on_finish so
+// the tracker is notified on completion, and posts on_dispatched to the
+// AsioEngine so the submitting thread never blocks.
+template <typename I>
+bool QueueImageDispatch<I>::enqueue(
+ bool read_op, uint64_t tid, DispatchResult* dispatch_result,
+ Context** on_finish, Context* on_dispatched) {
+ if (!m_image_ctx->non_blocking_aio) {
+ return false;
+ }
+
+ if (!read_op) {
+ m_flush_tracker->start_io(tid);
+ *on_finish = new LambdaContext([this, tid, on_finish=*on_finish](int r) {
+ handle_finished(r, tid);
+ on_finish->complete(r);
+ });
+ }
+
+ *dispatch_result = DISPATCH_RESULT_CONTINUE;
+ m_image_ctx->asio_engine->post(on_dispatched, 0);
+ return true;
+}
+
+} // namespace io
+} // namespace librbd
+
+template class librbd::io::QueueImageDispatch<librbd::ImageCtx>;
diff --git a/src/librbd/io/QueueImageDispatch.h b/src/librbd/io/QueueImageDispatch.h
new file mode 100644
index 000000000..9a41927ba
--- /dev/null
+++ b/src/librbd/io/QueueImageDispatch.h
@@ -0,0 +1,110 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IO_QUEUE_IMAGE_DISPATCH_H
+#define CEPH_LIBRBD_IO_QUEUE_IMAGE_DISPATCH_H
+
+#include "librbd/io/ImageDispatchInterface.h"
+#include "include/int_types.h"
+#include "include/buffer.h"
+#include "common/zipkin_trace.h"
+#include "common/Throttle.h"
+#include "librbd/io/ReadResult.h"
+#include "librbd/io/Types.h"
+#include <list>
+#include <set>
+
+struct Context;
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace io {
+
+struct AioCompletion;
+template <typename> class FlushTracker;
+
+// Image dispatch layer that re-queues IOs onto the image's AsioEngine when
+// non-blocking AIO is enabled, so the submitting thread returns without
+// performing the IO inline; flushes are ordered behind in-flight writes.
+template <typename ImageCtxT>
+class QueueImageDispatch : public ImageDispatchInterface {
+public:
+ QueueImageDispatch(ImageCtxT* image_ctx);
+ ~QueueImageDispatch();
+
+ ImageDispatchLayer get_dispatch_layer() const override {
+ return IMAGE_DISPATCH_LAYER_QUEUE;
+ }
+
+ void shut_down(Context* on_finish) override;
+
+ // IO entry points: return true when the IO was queued (dispatch resumes
+ // asynchronously via on_dispatched), false to pass through inline
+ bool read(
+ AioCompletion* aio_comp, Extents &&image_extents,
+ ReadResult &&read_result, IOContext io_context, int op_flags,
+ int read_flags, const ZTracer::Trace &parent_trace, uint64_t tid,
+ std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override;
+ bool write(
+ AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&bl,
+ int op_flags, const ZTracer::Trace &parent_trace,
+ uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override;
+ bool discard(
+ AioCompletion* aio_comp, Extents &&image_extents,
+ uint32_t discard_granularity_bytes, const ZTracer::Trace &parent_trace,
+ uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override;
+ bool write_same(
+ AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&bl,
+ int op_flags, const ZTracer::Trace &parent_trace,
+ uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override;
+ bool compare_and_write(
+ AioCompletion* aio_comp, Extents &&image_extents,
+ bufferlist &&cmp_bl, bufferlist &&bl, uint64_t *mismatch_offset,
+ int op_flags, const ZTracer::Trace &parent_trace,
+ uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override;
+ bool flush(
+ AioCompletion* aio_comp, FlushSource flush_source,
+ const ZTracer::Trace &parent_trace, uint64_t tid,
+ std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override;
+
+ // snapshot listing is never queued by this layer
+ bool list_snaps(
+ AioCompletion* aio_comp, Extents&& image_extents, SnapIds&& snap_ids,
+ int list_snaps_flags, SnapshotDelta* snapshot_delta,
+ const ZTracer::Trace &parent_trace, uint64_t tid,
+ std::atomic<uint32_t>* image_dispatch_flags,
+ DispatchResult* dispatch_result, Context** on_finish,
+ Context* on_dispatched) override {
+ return false;
+ }
+
+ bool invalidate_cache(Context* on_finish) override {
+ return false;
+ }
+
+private:
+ ImageCtxT* m_image_ctx;
+
+ // owned; orders flushes behind in-flight (tracked) write IOs
+ FlushTracker<ImageCtxT>* m_flush_tracker;
+
+ void handle_finished(int r, uint64_t tid);
+
+ bool enqueue(bool read_op, uint64_t tid, DispatchResult* dispatch_result,
+ Context** on_finish, Context* on_dispatched);
+
+};
+
+} // namespace io
+} // namespace librbd
+
+extern template class librbd::io::QueueImageDispatch<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_IO_QUEUE_IMAGE_DISPATCH_H
diff --git a/src/librbd/io/ReadResult.cc b/src/librbd/io/ReadResult.cc
new file mode 100644
index 000000000..c4053fee6
--- /dev/null
+++ b/src/librbd/io/ReadResult.cc
@@ -0,0 +1,262 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/io/ReadResult.h"
+#include "include/buffer.h"
+#include "common/dout.h"
+#include "librbd/io/AioCompletion.h"
+#include "librbd/io/Utils.h"
+#include <boost/variant/apply_visitor.hpp>
+#include <boost/variant/static_visitor.hpp>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::io::ReadResult: " << this \
+ << " " << __func__ << ": "
+
+namespace librbd {
+namespace io {
+
+// Records the image extents of the read on the destination buffer variant:
+// a Linear buffer is clamped to the actual read length, a SparseBufferlist
+// remembers the extents for later extent-map translation; other variants
+// need no adjustment.
+struct ReadResult::SetImageExtentsVisitor : public boost::static_visitor<void> {
+ Extents image_extents;
+
+ explicit SetImageExtentsVisitor(const Extents& image_extents)
+ : image_extents(image_extents) {
+ }
+
+ void operator()(Linear &linear) const {
+ uint64_t length = util::get_extents_length(image_extents);
+
+ // the caller-provided buffer must be large enough for the read
+ ceph_assert(length <= linear.buf_len);
+ linear.buf_len = length;
+ }
+
+ void operator()(SparseBufferlist &sbl) const {
+ sbl.image_extents = image_extents;
+ }
+
+ template <typename T>
+ void operator()(T &t) const {
+ }
+};
+
+// Copies the destriped read result into whichever destination variant the
+// caller supplied (raw buffer, iovec array, bufferlist, or sparse
+// bufferlist with extent map).
+struct ReadResult::AssembleResultVisitor : public boost::static_visitor<void> {
+ CephContext *cct;
+ Striper::StripedReadResult &destriper;
+
+ AssembleResultVisitor(CephContext *cct, Striper::StripedReadResult &destriper)
+ : cct(cct), destriper(destriper) {
+ }
+
+ void operator()(Empty &empty) const {
+ // no destination buffer -- result is intentionally discarded
+ ldout(cct, 20) << "dropping read result" << dendl;
+ }
+
+ void operator()(Linear &linear) const {
+ ldout(cct, 20) << "copying resulting bytes to "
+ << reinterpret_cast<void*>(linear.buf) << dendl;
+ destriper.assemble_result(cct, linear.buf, linear.buf_len);
+ }
+
+ void operator()(Vector &vector) const {
+ bufferlist bl;
+ destriper.assemble_result(cct, bl, true);
+
+ ldout(cct, 20) << "copying resulting " << bl.length() << " bytes to iovec "
+ << reinterpret_cast<const void*>(vector.iov) << dendl;
+
+ // scatter the assembled bytes across the caller's iovec entries
+ bufferlist::iterator it = bl.begin();
+ size_t length = bl.length();
+ size_t offset = 0;
+ int idx = 0;
+ for (; offset < length && idx < vector.iov_count; idx++) {
+ size_t len = std::min(vector.iov[idx].iov_len, length - offset);
+ it.copy(len, static_cast<char *>(vector.iov[idx].iov_base));
+ offset += len;
+ }
+ // the iovec must have had capacity for the entire result
+ ceph_assert(offset == bl.length());
+ }
+
+ void operator()(Bufferlist &bufferlist) const {
+ bufferlist.bl->clear();
+ destriper.assemble_result(cct, *bufferlist.bl, true);
+
+ ldout(cct, 20) << "moved resulting " << bufferlist.bl->length() << " "
+ << "bytes to bl " << reinterpret_cast<void*>(bufferlist.bl)
+ << dendl;
+ }
+
+ void operator()(SparseBufferlist &sparse_bufferlist) const {
+ sparse_bufferlist.bl->clear();
+
+ ExtentMap buffer_extent_map;
+ auto buffer_extents_length = destriper.assemble_result(
+ cct, &buffer_extent_map, sparse_bufferlist.bl);
+
+ ldout(cct, 20) << "image_extents="
+ << sparse_bufferlist.image_extents << ", "
+ << "buffer_extent_map=" << buffer_extent_map << dendl;
+
+ sparse_bufferlist.extent_map->clear();
+ sparse_bufferlist.extent_map->reserve(buffer_extent_map.size());
+
+ // The extent-map is logically addressed by buffer-extents not image- or
+ // object-extents. Translate this address mapping to image-extent
+ // logical addressing since it's tied to an image-extent read
+ uint64_t buffer_offset = 0;
+ auto bem_it = buffer_extent_map.begin();
+ for (auto [image_offset, image_length] : sparse_bufferlist.image_extents) {
+ while (bem_it != buffer_extent_map.end()) {
+ auto [buffer_extent_offset, buffer_extent_length] = *bem_it;
+
+ if (buffer_offset + image_length <= buffer_extent_offset) {
+ // skip any image extent that is not included in the results
+ break;
+ }
+
+ // current buffer-extent should be within the current image-extent
+ ceph_assert(buffer_offset <= buffer_extent_offset &&
+ buffer_offset + image_length >=
+ buffer_extent_offset + buffer_extent_length);
+ auto image_extent_offset =
+ image_offset + (buffer_extent_offset - buffer_offset);
+ ldout(cct, 20) << "mapping buffer extent " << buffer_extent_offset
+ << "~" << buffer_extent_length << " to image extent "
+ << image_extent_offset << "~" << buffer_extent_length
+ << dendl;
+ sparse_bufferlist.extent_map->emplace_back(
+ image_extent_offset, buffer_extent_length);
+ ++bem_it;
+ }
+
+ buffer_offset += image_length;
+ }
+ // every buffer extent must have been consumed by the translation
+ ceph_assert(buffer_offset == buffer_extents_length);
+ ceph_assert(bem_it == buffer_extent_map.end());
+
+ ldout(cct, 20) << "moved resulting " << *sparse_bufferlist.extent_map
+ << " extents of total " << sparse_bufferlist.bl->length()
+ << " bytes to bl "
+ << reinterpret_cast<void*>(sparse_bufferlist.bl) << dendl;
+ }
+};
+
+// Registers itself as an outstanding request on the AioCompletion; 'bl' is
+// filled by the IO layer before finish() runs.
+ReadResult::C_ImageReadRequest::C_ImageReadRequest(
+ AioCompletion *aio_completion, uint64_t buffer_offset,
+ const Extents image_extents)
+ : aio_completion(aio_completion), buffer_offset(buffer_offset),
+ image_extents(image_extents) {
+ aio_completion->add_request();
+}
+
+// On success (or ENOENT when ignore_enoent is set) feeds the read data into
+// the completion's destriper and reports the byte count as the result.
+void ReadResult::C_ImageReadRequest::finish(int r) {
+ CephContext *cct = aio_completion->ictx->cct;
+ ldout(cct, 10) << "C_ImageReadRequest: r=" << r
+ << dendl;
+ if (r >= 0 || (ignore_enoent && r == -ENOENT)) {
+ // map each image extent onto its slice of the destination buffer
+ striper::LightweightBufferExtents buffer_extents;
+ size_t length = 0;
+ for (auto &image_extent : image_extents) {
+ buffer_extents.emplace_back(buffer_offset + length, image_extent.second);
+ length += image_extent.second;
+ }
+ ceph_assert(r == -ENOENT || length == bl.length());
+
+ // m_destriper is guarded by the completion lock
+ aio_completion->lock.lock();
+ aio_completion->read_result.m_destriper.add_partial_result(
+ cct, std::move(bl), buffer_extents);
+ aio_completion->lock.unlock();
+ r = length;
+ }
+
+ aio_completion->complete_request(r);
+}
+
+ReadResult::C_ObjectReadRequest::C_ObjectReadRequest(
+ AioCompletion *aio_completion, ReadExtents&& extents)
+ : aio_completion(aio_completion), extents(std::move(extents)) {
+ aio_completion->add_request();
+}
+
+// Feeds each per-object sparse read into the completion's destriper;
+// ENOENT (non-existent object) is treated as a successful zero-length read.
+void ReadResult::C_ObjectReadRequest::finish(int r) {
+ CephContext *cct = aio_completion->ictx->cct;
+ ldout(cct, 10) << "C_ObjectReadRequest: r=" << r
+ << dendl;
+
+ if (r == -ENOENT) {
+ r = 0;
+ }
+ if (r >= 0) {
+ uint64_t object_len = 0;
+ aio_completion->lock.lock();
+ for (auto& extent: extents) {
+ ldout(cct, 10) << " got " << extent.extent_map
+ << " for " << extent.buffer_extents
+ << " bl " << extent.bl.length() << dendl;
+
+ aio_completion->read_result.m_destriper.add_partial_sparse_result(
+ cct, std::move(extent.bl), extent.extent_map, extent.offset,
+ extent.buffer_extents);
+
+ object_len += extent.length;
+ }
+ aio_completion->lock.unlock();
+ // report the total requested object length as the result
+ r = object_len;
+ }
+
+ aio_completion->complete_request(r);
+}
+
+ReadResult::C_ObjectReadMergedExtents::C_ObjectReadMergedExtents(
+ CephContext* cct, ReadExtents* extents, Context* on_finish)
+ : cct(cct), extents(extents), on_finish(on_finish) {
+}
+
+// Splits the single merged bufferlist back into the per-extent buffers,
+// failing with -EIO if the merged length does not exactly match the sum of
+// the extent lengths.
+void ReadResult::C_ObjectReadMergedExtents::finish(int r) {
+ if (r >= 0) {
+ for (auto& extent: *extents) {
+ if (bl.length() < extent.length) {
+ lderr(cct) << "Merged extents length is less than expected" << dendl;
+ r = -EIO;
+ break;
+ }
+ bl.splice(0, extent.length, &extent.bl);
+ }
+ if (bl.length() != 0) {
+ lderr(cct) << "Merged extents length is greater than expected" << dendl;
+ r = -EIO;
+ }
+ }
+ on_finish->complete(r);
+}
+
+// The constructors below select the destination-buffer variant.
+ReadResult::ReadResult() : m_buffer(Empty()) {
+}
+
+ReadResult::ReadResult(char *buf, size_t buf_len)
+ : m_buffer(Linear(buf, buf_len)) {
+}
+
+ReadResult::ReadResult(const struct iovec *iov, int iov_count)
+ : m_buffer(Vector(iov, iov_count)) {
+}
+
+ReadResult::ReadResult(ceph::bufferlist *bl)
+ : m_buffer(Bufferlist(bl)) {
+}
+
+ReadResult::ReadResult(Extents* extent_map, ceph::bufferlist* bl)
+ : m_buffer(SparseBufferlist(extent_map, bl)) {
+}
+
+void ReadResult::set_image_extents(const Extents& image_extents) {
+ boost::apply_visitor(SetImageExtentsVisitor(image_extents), m_buffer);
+}
+
+void ReadResult::assemble_result(CephContext *cct) {
+ boost::apply_visitor(AssembleResultVisitor(cct, m_destriper), m_buffer);
+}
+
+} // namespace io
+} // namespace librbd
+
diff --git a/src/librbd/io/ReadResult.h b/src/librbd/io/ReadResult.h
new file mode 100644
index 000000000..12a1e78cc
--- /dev/null
+++ b/src/librbd/io/ReadResult.h
@@ -0,0 +1,129 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IO_READ_RESULT_H
+#define CEPH_LIBRBD_IO_READ_RESULT_H
+
+#include "include/common_fwd.h"
+#include "include/int_types.h"
+#include "include/buffer_fwd.h"
+#include "include/Context.h"
+#include "librbd/io/Types.h"
+#include "osdc/Striper.h"
+#include <sys/uio.h>
+#include <boost/variant/variant.hpp>
+
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace io {
+
+struct AioCompletion;
+template <typename> struct ObjectReadRequest;
+
+// Describes the destination of an image read.  The caller picks one of
+// several destination "buffer" variants at construction time (none, raw
+// buffer, iovec array, bufferlist, sparse bufferlist); completion helper
+// contexts feed partial object results into m_destriper, and
+// assemble_result() finally copies the destriped data into the chosen
+// destination.
+class ReadResult {
+public:
+  // Tracks one image-extent read on behalf of an AioCompletion.
+  struct C_ImageReadRequest : public Context {
+    AioCompletion *aio_completion;
+    uint64_t buffer_offset = 0;      // offset into the destination buffer
+    Extents image_extents;           // image extents covered by this read
+    bufferlist bl;                   // data read for these extents
+    bool ignore_enoent = false;      // treat missing object as zeroes
+
+    C_ImageReadRequest(AioCompletion *aio_completion,
+                       uint64_t buffer_offset,
+                       const Extents image_extents);
+
+    void finish(int r) override;
+  };
+
+  // Tracks one object read (possibly multiple extents) on behalf of an
+  // AioCompletion.
+  struct C_ObjectReadRequest : public Context {
+    AioCompletion *aio_completion;
+    ReadExtents extents;
+
+    C_ObjectReadRequest(AioCompletion *aio_completion, ReadExtents&& extents);
+
+    void finish(int r) override;
+  };
+
+  // Re-splits a read that merged several extents into one OSD op back
+  // into the individual extent buffers.
+  struct C_ObjectReadMergedExtents : public Context {
+    CephContext* cct;
+    ReadExtents* extents;
+    Context *on_finish;
+    bufferlist bl;                   // merged data returned by the OSD
+
+    C_ObjectReadMergedExtents(CephContext* cct, ReadExtents* extents,
+                              Context* on_finish);
+
+    void finish(int r) override;
+  };
+
+  ReadResult();
+  ReadResult(char *buf, size_t buf_len);
+  ReadResult(const struct iovec *iov, int iov_count);
+  ReadResult(ceph::bufferlist *bl);
+  ReadResult(Extents* extent_map, ceph::bufferlist* bl);
+
+  // Record the image extents covered by this read (sparse variant only).
+  void set_image_extents(const Extents& image_extents);
+
+  // Copy the destriped data into the destination buffer variant.
+  void assemble_result(CephContext *cct);
+
+private:
+  // read data is discarded
+  struct Empty {
+  };
+
+  // caller-provided linear buffer
+  struct Linear {
+    char *buf;
+    size_t buf_len;
+
+    Linear(char *buf, size_t buf_len) : buf(buf), buf_len(buf_len) {
+    }
+  };
+
+  // caller-provided iovec array
+  struct Vector {
+    const struct iovec *iov;
+    int iov_count;
+
+    Vector(const struct iovec *iov, int iov_count)
+      : iov(iov), iov_count(iov_count) {
+    }
+  };
+
+  // caller-provided bufferlist
+  struct Bufferlist {
+    ceph::bufferlist *bl;
+
+    Bufferlist(ceph::bufferlist *bl) : bl(bl) {
+    }
+  };
+
+  // caller-provided sparse extent map + data bufferlist
+  struct SparseBufferlist {
+    Extents *extent_map;
+    ceph::bufferlist *bl;
+
+    Extents image_extents;           // set via set_image_extents()
+
+    SparseBufferlist(Extents* extent_map, ceph::bufferlist* bl)
+      : extent_map(extent_map), bl(bl) {
+    }
+  };
+
+  typedef boost::variant<Empty,
+                         Linear,
+                         Vector,
+                         Bufferlist,
+                         SparseBufferlist> Buffer;
+  struct SetImageExtentsVisitor;
+  struct AssembleResultVisitor;
+
+  Buffer m_buffer;
+  Striper::StripedReadResult m_destriper;
+
+};
+
+} // namespace io
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_IO_READ_RESULT_H
+
diff --git a/src/librbd/io/RefreshImageDispatch.cc b/src/librbd/io/RefreshImageDispatch.cc
new file mode 100644
index 000000000..3141faf25
--- /dev/null
+++ b/src/librbd/io/RefreshImageDispatch.cc
@@ -0,0 +1,166 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/io/RefreshImageDispatch.h"
+#include "common/dout.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include <map>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::io::RefreshImageDispatch: " << this \
+ << " " << __func__ << ": "
+
+namespace librbd {
+namespace io {
+
+// Image dispatch layer that forces a pending image refresh to complete
+// before an I/O request continues down the dispatch chain.
+template <typename I>
+RefreshImageDispatch<I>::RefreshImageDispatch(I* image_ctx)
+  : m_image_ctx(image_ctx) {
+  auto cct = m_image_ctx->cct;
+  ldout(cct, 5) << "ictx=" << image_ctx << dendl;
+}
+
+// This layer holds no state that needs tearing down; complete immediately.
+template <typename I>
+void RefreshImageDispatch<I>::shut_down(Context* on_finish) {
+  on_finish->complete(0);
+}
+
+// Gate reads behind a pending image refresh.  Returns true when the
+// request was intercepted (refresh started, re-driven via on_dispatched),
+// false to pass it through to the next layer.
+template <typename I>
+bool RefreshImageDispatch<I>::read(
+    AioCompletion* aio_comp, Extents &&image_extents, ReadResult &&read_result,
+    IOContext io_context, int op_flags, int read_flags,
+    const ZTracer::Trace &parent_trace, uint64_t tid,
+    std::atomic<uint32_t>* image_dispatch_flags,
+    DispatchResult* dispatch_result, Context** on_finish,
+    Context* on_dispatched) {
+  auto cct = m_image_ctx->cct;
+  ldout(cct, 20) << "tid=" << tid << ", image_extents=" << image_extents
+                 << dendl;
+
+  return needs_refresh(dispatch_result, on_dispatched);
+}
+
+// Gate writes behind a pending image refresh; see read() for semantics.
+template <typename I>
+bool RefreshImageDispatch<I>::write(
+    AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&bl,
+    int op_flags, const ZTracer::Trace &parent_trace,
+    uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
+    DispatchResult* dispatch_result, Context** on_finish,
+    Context* on_dispatched) {
+  auto cct = m_image_ctx->cct;
+  ldout(cct, 20) << "tid=" << tid << ", image_extents=" << image_extents
+                 << dendl;
+
+  return needs_refresh(dispatch_result, on_dispatched);
+}
+
+// Gate discards behind a pending image refresh; see read() for semantics.
+template <typename I>
+bool RefreshImageDispatch<I>::discard(
+    AioCompletion* aio_comp, Extents &&image_extents,
+    uint32_t discard_granularity_bytes, const ZTracer::Trace &parent_trace,
+    uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
+    DispatchResult* dispatch_result, Context** on_finish,
+    Context* on_dispatched) {
+  auto cct = m_image_ctx->cct;
+  ldout(cct, 20) << "tid=" << tid << ", image_extents=" << image_extents
+                 << dendl;
+
+  return needs_refresh(dispatch_result, on_dispatched);
+}
+
+// Gate write-same ops behind a pending image refresh; see read().
+template <typename I>
+bool RefreshImageDispatch<I>::write_same(
+    AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&bl,
+    int op_flags, const ZTracer::Trace &parent_trace,
+    uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
+    DispatchResult* dispatch_result, Context** on_finish,
+    Context* on_dispatched) {
+  auto cct = m_image_ctx->cct;
+  ldout(cct, 20) << "tid=" << tid << ", image_extents=" << image_extents
+                 << dendl;
+
+  return needs_refresh(dispatch_result, on_dispatched);
+}
+
+// Gate compare-and-write ops behind a pending image refresh; see read().
+template <typename I>
+bool RefreshImageDispatch<I>::compare_and_write(
+    AioCompletion* aio_comp, Extents &&image_extents,
+    bufferlist &&cmp_bl, bufferlist &&bl, uint64_t *mismatch_offset,
+    int op_flags, const ZTracer::Trace &parent_trace,
+    uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
+    DispatchResult* dispatch_result, Context** on_finish,
+    Context* on_dispatched) {
+  auto cct = m_image_ctx->cct;
+  ldout(cct, 20) << "tid=" << tid << ", image_extents=" << image_extents
+                 << dendl;
+
+  return needs_refresh(dispatch_result, on_dispatched);
+}
+
+// Gate flushes behind a pending image refresh, except for flushes that
+// originate from the refresh/exclusive-lock/shutdown machinery itself.
+template <typename I>
+bool RefreshImageDispatch<I>::flush(
+    AioCompletion* aio_comp, FlushSource flush_source,
+    const ZTracer::Trace &parent_trace, uint64_t tid,
+    std::atomic<uint32_t>* image_dispatch_flags,
+    DispatchResult* dispatch_result, Context** on_finish,
+    Context* on_dispatched) {
+  auto cct = m_image_ctx->cct;
+  ldout(cct, 20) << "tid=" << tid << dendl;
+
+  // The refresh state machine can initiate a flush, and enabling the
+  // exclusive-lock will also attempt to flush -- such internal flushes
+  // must not themselves trigger another refresh.
+  switch (flush_source) {
+  case FLUSH_SOURCE_REFRESH:
+  case FLUSH_SOURCE_EXCLUSIVE_LOCK_SKIP_REFRESH:
+  case FLUSH_SOURCE_SHUTDOWN:
+    return false;
+  default:
+    break;
+  }
+
+  return needs_refresh(dispatch_result, on_dispatched);
+}
+
+// Start an image refresh when one is required.  Returns true when the
+// caller's dispatch was intercepted and will be re-driven through
+// on_dispatched once the refresh completes.
+template <typename I>
+bool RefreshImageDispatch<I>::needs_refresh(
+    DispatchResult* dispatch_result, Context* on_dispatched) {
+  auto cct = m_image_ctx->cct;
+
+  if (!m_image_ctx->state->is_refresh_required()) {
+    return false;
+  }
+
+  ldout(cct, 15) << "on_dispatched=" << on_dispatched << dendl;
+
+  *dispatch_result = DISPATCH_RESULT_CONTINUE;
+  m_image_ctx->state->refresh(on_dispatched);
+  return true;
+}
+
+} // namespace io
+} // namespace librbd
+
+template class librbd::io::RefreshImageDispatch<librbd::ImageCtx>;
diff --git a/src/librbd/io/RefreshImageDispatch.h b/src/librbd/io/RefreshImageDispatch.h
new file mode 100644
index 000000000..668dec419
--- /dev/null
+++ b/src/librbd/io/RefreshImageDispatch.h
@@ -0,0 +1,101 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IO_REFRESH_IMAGE_DISPATCH_H
+#define CEPH_LIBRBD_IO_REFRESH_IMAGE_DISPATCH_H
+
+#include "librbd/io/ImageDispatchInterface.h"
+#include "include/int_types.h"
+#include "include/buffer.h"
+#include "common/zipkin_trace.h"
+#include "common/Throttle.h"
+#include "librbd/io/ReadResult.h"
+#include "librbd/io/Types.h"
+
+struct Context;
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace io {
+
+struct AioCompletion;
+
+// Image dispatch layer that intercepts I/O while an image refresh is
+// required: each hook starts the refresh and re-queues the request via
+// on_dispatched once the refresh completes (see the .cc implementation).
+template <typename ImageCtxT>
+class RefreshImageDispatch : public ImageDispatchInterface {
+public:
+  RefreshImageDispatch(ImageCtxT* image_ctx);
+
+  ImageDispatchLayer get_dispatch_layer() const override {
+    return IMAGE_DISPATCH_LAYER_REFRESH;
+  }
+
+  void shut_down(Context* on_finish) override;
+
+  // Each I/O hook returns true when the request was intercepted pending a
+  // refresh, false to pass it to the next dispatch layer.
+  bool read(
+      AioCompletion* aio_comp, Extents &&image_extents,
+      ReadResult &&read_result, IOContext io_context, int op_flags,
+      int read_flags, const ZTracer::Trace &parent_trace, uint64_t tid,
+      std::atomic<uint32_t>* image_dispatch_flags,
+      DispatchResult* dispatch_result, Context** on_finish,
+      Context* on_dispatched) override;
+  bool write(
+      AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&bl,
+      int op_flags, const ZTracer::Trace &parent_trace,
+      uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
+      DispatchResult* dispatch_result, Context** on_finish,
+      Context* on_dispatched) override;
+  bool discard(
+      AioCompletion* aio_comp, Extents &&image_extents,
+      uint32_t discard_granularity_bytes, const ZTracer::Trace &parent_trace,
+      uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
+      DispatchResult* dispatch_result, Context** on_finish,
+      Context* on_dispatched) override;
+  bool write_same(
+      AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&bl,
+      int op_flags, const ZTracer::Trace &parent_trace,
+      uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
+      DispatchResult* dispatch_result, Context** on_finish,
+      Context* on_dispatched) override;
+  bool compare_and_write(
+      AioCompletion* aio_comp, Extents &&image_extents,
+      bufferlist &&cmp_bl, bufferlist &&bl, uint64_t *mismatch_offset,
+      int op_flags, const ZTracer::Trace &parent_trace,
+      uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
+      DispatchResult* dispatch_result, Context** on_finish,
+      Context* on_dispatched) override;
+  bool flush(
+      AioCompletion* aio_comp, FlushSource flush_source,
+      const ZTracer::Trace &parent_trace, uint64_t tid,
+      std::atomic<uint32_t>* image_dispatch_flags,
+      DispatchResult* dispatch_result, Context** on_finish,
+      Context* on_dispatched) override;
+
+  // snapshot listing never requires a refresh gate; always pass through
+  bool list_snaps(
+      AioCompletion* aio_comp, Extents&& image_extents, SnapIds&& snap_ids,
+      int list_snaps_flags, SnapshotDelta* snapshot_delta,
+      const ZTracer::Trace &parent_trace, uint64_t tid,
+      std::atomic<uint32_t>* image_dispatch_flags,
+      DispatchResult* dispatch_result, Context** on_finish,
+      Context* on_dispatched) override {
+    return false;
+  }
+
+  bool invalidate_cache(Context* on_finish) override {
+    return false;
+  }
+
+private:
+  ImageCtxT* m_image_ctx;
+
+  // returns true when a refresh was started and the request intercepted
+  bool needs_refresh(DispatchResult* dispatch_result, Context* on_dispatched);
+
+};
+
+} // namespace io
+} // namespace librbd
+
+extern template class librbd::io::RefreshImageDispatch<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_IO_REFRESH_IMAGE_DISPATCH_H
diff --git a/src/librbd/io/SimpleSchedulerObjectDispatch.cc b/src/librbd/io/SimpleSchedulerObjectDispatch.cc
new file mode 100644
index 000000000..cd2ffb197
--- /dev/null
+++ b/src/librbd/io/SimpleSchedulerObjectDispatch.cc
@@ -0,0 +1,565 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/io/SimpleSchedulerObjectDispatch.h"
+#include "include/neorados/RADOS.hpp"
+#include "common/ceph_time.h"
+#include "common/Timer.h"
+#include "common/errno.h"
+#include "librbd/AsioEngine.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Utils.h"
+#include "librbd/io/FlushTracker.h"
+#include "librbd/io/ObjectDispatchSpec.h"
+#include "librbd/io/ObjectDispatcher.h"
+#include "librbd/io/Utils.h"
+
+#include <boost/accumulators/accumulators.hpp>
+#include <boost/accumulators/statistics/rolling_count.hpp>
+#include <boost/accumulators/statistics/rolling_sum.hpp>
+#include <boost/accumulators/statistics/stats.hpp>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::io::SimpleSchedulerObjectDispatch: " \
+ << this << " " << __func__ << ": "
+
+namespace librbd {
+namespace io {
+
+using namespace boost::accumulators;
+using ceph::operator<<;
+using librbd::util::data_object_name;
+
+static const int LATENCY_STATS_WINDOW_SIZE = 10;
+
+// Rolling-window latency statistics used to size the write-batching delay
+// when rbd_io_scheduler_simple_max_delay is not configured.
+class LatencyStats {
+private:
+  accumulator_set<uint64_t, stats<tag::rolling_count, tag::rolling_sum>> m_acc;
+
+public:
+  LatencyStats()
+    : m_acc(tag::rolling_window::window_size = LATENCY_STATS_WINDOW_SIZE) {
+  }
+
+  // true once a full window of samples has been collected
+  bool is_ready() const {
+    return rolling_count(m_acc) == LATENCY_STATS_WINDOW_SIZE;
+  }
+
+  // record one latency sample (nanoseconds)
+  void add(uint64_t latency) {
+    m_acc(latency);
+  }
+
+  // average latency over the rolling window, or 0 if no samples yet
+  uint64_t avg() const {
+    auto count = rolling_count(m_acc);
+
+    if (count > 0) {
+      // divide by the sample count: returning the raw rolling_sum would
+      // report the window *sum* (up to window-size times too large) and
+      // inflate the scheduled dispatch delay accordingly
+      return rolling_sum(m_acc) / count;
+    }
+    return 0;
+  }
+};
+
+// Try to queue a write for batching with writes already delayed for this
+// object.  Returns false (caller must dispatch directly) when the write is
+// incompatible with the pending batch: different io_context or op_flags, a
+// zero-length write mixed with others, or overlap with an already-delayed
+// range.  Otherwise the write is recorded and, where possible, merged with
+// an adjacent pending write.
+template <typename I>
+bool SimpleSchedulerObjectDispatch<I>::ObjectRequests::try_delay_request(
+    uint64_t object_off, ceph::bufferlist&& data, IOContext io_context,
+    int op_flags, int object_dispatch_flags, Context* on_dispatched) {
+  if (!m_delayed_requests.empty()) {
+    // batching is only safe when the I/O parameters match and the new
+    // write does not overlap any delayed range
+    if (!m_io_context || *m_io_context != *io_context ||
+        op_flags != m_op_flags || data.length() == 0 ||
+        intersects(object_off, data.length())) {
+      return false;
+    }
+  } else {
+    // first delayed request establishes the batch's I/O parameters
+    m_io_context = io_context;
+    m_op_flags = op_flags;
+  }
+
+  if (data.length() == 0) {
+    // a zero length write is usually a special case,
+    // and we don't want it to be merged with others
+    ceph_assert(m_delayed_requests.empty());
+    m_delayed_request_extents.insert(0, UINT64_MAX);
+  } else {
+    m_delayed_request_extents.insert(object_off, data.length());
+  }
+  m_object_dispatch_flags |= object_dispatch_flags;
+
+  if (!m_delayed_requests.empty()) {
+    // try to merge front to an existing request
+    auto iter = m_delayed_requests.find(object_off + data.length());
+    if (iter != m_delayed_requests.end()) {
+      // prepend this write's data to the request that starts where it ends
+      auto new_iter = m_delayed_requests.insert({object_off, {}}).first;
+      new_iter->second.data = std::move(data);
+      new_iter->second.data.append(std::move(iter->second.data));
+      new_iter->second.requests = std::move(iter->second.requests);
+      new_iter->second.requests.push_back(on_dispatched);
+      m_delayed_requests.erase(iter);
+
+      // the combined request may now abut its predecessor as well
+      if (new_iter != m_delayed_requests.begin()) {
+        auto prev = new_iter;
+        try_merge_delayed_requests(--prev, new_iter);
+      }
+      return true;
+    }
+
+    // try to merge back to an existing request
+    iter = m_delayed_requests.lower_bound(object_off);
+    if (iter != m_delayed_requests.begin() &&
+        (iter == m_delayed_requests.end() || iter->first > object_off)) {
+      iter--;
+    }
+    if (iter != m_delayed_requests.end() &&
+        iter->first + iter->second.data.length() == object_off) {
+      // append to the request that ends exactly where this write begins
+      iter->second.data.append(std::move(data));
+      iter->second.requests.push_back(on_dispatched);
+
+      // the grown request may now abut its successor as well
+      auto next = iter;
+      if (++next != m_delayed_requests.end()) {
+        try_merge_delayed_requests(iter, next);
+      }
+      return true;
+    }
+  }
+
+  // create a new request
+  auto iter = m_delayed_requests.insert({object_off, {}}).first;
+  iter->second.data = std::move(data);
+  iter->second.requests.push_back(on_dispatched);
+  return true;
+}
+
+// Merge two adjacent delayed requests (iter2 erased into iter1) when
+// iter1's data ends exactly where iter2's begins; no-op otherwise.
+// NOTE: iter2 is invalidated on merge.
+template <typename I>
+void SimpleSchedulerObjectDispatch<I>::ObjectRequests::try_merge_delayed_requests(
+    typename std::map<uint64_t, MergedRequests>::iterator &iter1,
+    typename std::map<uint64_t, MergedRequests>::iterator &iter2) {
+  if (iter1->first + iter1->second.data.length() != iter2->first) {
+    return;
+  }
+
+  iter1->second.data.append(std::move(iter2->second.data));
+  iter1->second.requests.insert(iter1->second.requests.end(),
+                                iter2->second.requests.begin(),
+                                iter2->second.requests.end());
+  m_delayed_requests.erase(iter2);
+}
+
+// Issue one merged object write per delayed-request run and, on each
+// write's completion, record its latency (if stats are enabled) and then
+// complete all of the per-request dispatch callbacks that were batched
+// into it.
+template <typename I>
+void SimpleSchedulerObjectDispatch<I>::ObjectRequests::dispatch_delayed_requests(
+    I *image_ctx, LatencyStats *latency_stats, ceph::mutex *latency_stats_lock) {
+  for (auto &it : m_delayed_requests) {
+    auto offset = it.first;
+    auto &merged_requests = it.second;
+
+    auto ctx = new LambdaContext(
+        [requests=std::move(merged_requests.requests), latency_stats,
+         latency_stats_lock, start_time=ceph_clock_now()](int r) {
+          if (latency_stats) {
+            std::lock_guard locker{*latency_stats_lock};
+            auto latency = ceph_clock_now() - start_time;
+            latency_stats->add(latency.to_nsec());
+          }
+          // fan the single write result out to every batched request
+          for (auto on_dispatched : requests) {
+            on_dispatched->complete(r);
+          }
+        });
+
+    auto req = ObjectDispatchSpec::create_write(
+        image_ctx, OBJECT_DISPATCH_LAYER_SCHEDULER,
+        m_object_no, offset, std::move(merged_requests.data), m_io_context,
+        m_op_flags, 0, std::nullopt, 0, {}, ctx);
+
+    req->object_dispatch_flags = m_object_dispatch_flags;
+    req->send();
+  }
+
+  // clear the scheduled-dispatch marker (see is_scheduled_dispatch())
+  m_dispatch_time = {};
+}
+
+// Object dispatch layer that briefly delays small writes so adjacent ones
+// can be merged into fewer OSD ops.  The delay is either the configured
+// rbd_io_scheduler_simple_max_delay, or (when that is 0) derived from
+// measured write latency via LatencyStats.
+template <typename I>
+SimpleSchedulerObjectDispatch<I>::SimpleSchedulerObjectDispatch(
+    I* image_ctx)
+  : m_image_ctx(image_ctx),
+    m_flush_tracker(new FlushTracker<I>(image_ctx)),
+    m_lock(ceph::make_mutex(librbd::util::unique_lock_name(
+      "librbd::io::SimpleSchedulerObjectDispatch::lock", this))),
+    m_max_delay(image_ctx->config.template get_val<uint64_t>(
+      "rbd_io_scheduler_simple_max_delay")) {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 5) << "ictx=" << image_ctx << dendl;
+
+  I::get_timer_instance(cct, &m_timer, &m_timer_lock);
+
+  // no explicit delay configured: derive it from observed write latency
+  if (m_max_delay == 0) {
+    m_latency_stats = std::make_unique<LatencyStats>();
+  }
+}
+
+template <typename I>
+SimpleSchedulerObjectDispatch<I>::~SimpleSchedulerObjectDispatch() {
+  // m_flush_tracker is owned via raw pointer (allocated in the ctor)
+  delete m_flush_tracker;
+}
+
+// Register this layer with the image's object dispatcher chain.
+template <typename I>
+void SimpleSchedulerObjectDispatch<I>::init() {
+  auto cct = m_image_ctx->cct;
+  ldout(cct, 5) << dendl;
+
+  // add ourself to the IO object dispatcher chain
+  m_image_ctx->io_object_dispatcher->register_dispatch(this);
+}
+
+// Stop tracking in-flight I/O and complete immediately; any writes still
+// delayed are owned by their dispatch contexts.
+template <typename I>
+void SimpleSchedulerObjectDispatch<I>::shut_down(Context* on_finish) {
+  auto cct = m_image_ctx->cct;
+  ldout(cct, 5) << dendl;
+
+  m_flush_tracker->shut_down();
+  on_finish->complete(0);
+}
+
+// Reads are never delayed, but any delayed writes that overlap the read
+// must be flushed out first to preserve read-after-write ordering.
+// Always passes the read through to the next layer (returns false).
+template <typename I>
+bool SimpleSchedulerObjectDispatch<I>::read(
+    uint64_t object_no, ReadExtents* extents, IOContext io_context,
+    int op_flags, int read_flags, const ZTracer::Trace &parent_trace,
+    uint64_t* version, int* object_dispatch_flags,
+    DispatchResult* dispatch_result, Context** on_finish,
+    Context* on_dispatched) {
+  auto cct = m_image_ctx->cct;
+  ldout(cct, 20) << data_object_name(m_image_ctx, object_no) << " " << extents
+                 << dendl;
+
+  std::lock_guard locker{m_lock};
+  for (auto& extent : *extents) {
+    if (intersects(object_no, extent.offset, extent.length)) {
+      // one dispatch flushes all delayed writes for this object
+      dispatch_delayed_requests(object_no);
+      break;
+    }
+  }
+
+  return false;
+}
+
+// Discards are never delayed: flush any pending writes for the object,
+// then track the discard in-flight (so later writes cannot be reordered
+// ahead of it) and pass it through (returns false).
+template <typename I>
+bool SimpleSchedulerObjectDispatch<I>::discard(
+    uint64_t object_no, uint64_t object_off, uint64_t object_len,
+    IOContext io_context, int discard_flags,
+    const ZTracer::Trace &parent_trace, int* object_dispatch_flags,
+    uint64_t* journal_tid, DispatchResult* dispatch_result,
+    Context** on_finish, Context* on_dispatched) {
+  auto cct = m_image_ctx->cct;
+  ldout(cct, 20) << data_object_name(m_image_ctx, object_no) << " "
+                 << object_off << "~" << object_len << dendl;
+
+  std::lock_guard locker{m_lock};
+  dispatch_delayed_requests(object_no);
+  // empty start_time: discards do not feed the latency stats
+  register_in_flight_request(object_no, {}, on_finish);
+
+  return false;
+}
+
+// Try to delay the write for batching.  If delayed, the request is marked
+// complete here (DISPATCH_RESULT_COMPLETE) and will be issued later by the
+// scheduler; otherwise pending writes are flushed and the write proceeds
+// in-flight to the next layer.
+template <typename I>
+bool SimpleSchedulerObjectDispatch<I>::write(
+    uint64_t object_no, uint64_t object_off, ceph::bufferlist&& data,
+    IOContext io_context, int op_flags, int write_flags,
+    std::optional<uint64_t> assert_version,
+    const ZTracer::Trace &parent_trace, int* object_dispatch_flags,
+    uint64_t* journal_tid, DispatchResult* dispatch_result,
+    Context** on_finish, Context* on_dispatched) {
+  auto cct = m_image_ctx->cct;
+  ldout(cct, 20) << data_object_name(m_image_ctx, object_no) << " "
+                 << object_off << "~" << data.length() << dendl;
+
+  std::lock_guard locker{m_lock};
+
+  // don't try to batch assert version writes
+  if (assert_version.has_value() ||
+      (write_flags & OBJECT_WRITE_FLAG_CREATE_EXCLUSIVE) != 0) {
+    dispatch_delayed_requests(object_no);
+    return false;
+  }
+
+  if (try_delay_write(object_no, object_off, std::move(data), io_context,
+                      op_flags, *object_dispatch_flags, on_dispatched)) {
+
+    // keep the flush tracker aware of the delayed write until it finishes
+    auto dispatch_seq = ++m_dispatch_seq;
+    m_flush_tracker->start_io(dispatch_seq);
+    *on_finish = new LambdaContext(
+      [this, dispatch_seq, ctx=*on_finish](int r) {
+        ctx->complete(r);
+        m_flush_tracker->finish_io(dispatch_seq);
+      });
+
+    *dispatch_result = DISPATCH_RESULT_COMPLETE;
+    return true;
+  }
+
+  // could not batch: flush pending writes and dispatch this one directly,
+  // timing it (start_time) so the latency stats stay fresh
+  dispatch_delayed_requests(object_no);
+  register_in_flight_request(object_no, ceph_clock_now(), on_finish);
+
+  return false;
+}
+
+// Write-same ops are never delayed: flush pending writes for the object,
+// track the op in-flight and pass it through (returns false).
+template <typename I>
+bool SimpleSchedulerObjectDispatch<I>::write_same(
+    uint64_t object_no, uint64_t object_off, uint64_t object_len,
+    LightweightBufferExtents&& buffer_extents, ceph::bufferlist&& data,
+    IOContext io_context, int op_flags,
+    const ZTracer::Trace &parent_trace, int* object_dispatch_flags,
+    uint64_t* journal_tid, DispatchResult* dispatch_result,
+    Context** on_finish, Context* on_dispatched) {
+  auto cct = m_image_ctx->cct;
+  ldout(cct, 20) << data_object_name(m_image_ctx, object_no) << " "
+                 << object_off << "~" << object_len << dendl;
+
+  std::lock_guard locker{m_lock};
+  dispatch_delayed_requests(object_no);
+  register_in_flight_request(object_no, {}, on_finish);
+
+  return false;
+}
+
+// Compare-and-write ops are never delayed: flush pending writes for the
+// object, track the op in-flight and pass it through (returns false).
+template <typename I>
+bool SimpleSchedulerObjectDispatch<I>::compare_and_write(
+    uint64_t object_no, uint64_t object_off, ceph::bufferlist&& cmp_data,
+    ceph::bufferlist&& write_data, IOContext io_context, int op_flags,
+    const ZTracer::Trace &parent_trace, uint64_t* mismatch_offset,
+    int* object_dispatch_flags, uint64_t* journal_tid,
+    DispatchResult* dispatch_result, Context** on_finish,
+    Context* on_dispatched) {
+  auto cct = m_image_ctx->cct;
+  ldout(cct, 20) << data_object_name(m_image_ctx, object_no) << " "
+                 << object_off << "~" << cmp_data.length() << dendl;
+
+  std::lock_guard locker{m_lock};
+  dispatch_delayed_requests(object_no);
+  register_in_flight_request(object_no, {}, on_finish);
+
+  return false;
+}
+
+// Dispatch every delayed write, then hand the flush to the flush tracker
+// which invokes on_dispatched once all tracked I/O has drained.  Returns
+// true (the flush is handled asynchronously by this layer).
+template <typename I>
+bool SimpleSchedulerObjectDispatch<I>::flush(
+    FlushSource flush_source, const ZTracer::Trace &parent_trace,
+    uint64_t* journal_tid, DispatchResult* dispatch_result,
+    Context** on_finish, Context* on_dispatched) {
+  auto cct = m_image_ctx->cct;
+  ldout(cct, 20) << dendl;
+
+  {
+    std::lock_guard locker{m_lock};
+    dispatch_all_delayed_requests();
+  }
+
+  *dispatch_result = DISPATCH_RESULT_CONTINUE;
+  m_flush_tracker->flush(on_dispatched);
+
+  return true;
+}
+
+// Check whether [object_off, object_off + len) overlaps any write
+// currently delayed for the given object.  Caller must hold m_lock.
+template <typename I>
+bool SimpleSchedulerObjectDispatch<I>::intersects(
+    uint64_t object_no, uint64_t object_off, uint64_t len) const {
+  ceph_assert(ceph_mutex_is_locked(m_lock));
+  auto cct = m_image_ctx->cct;
+
+  bool intersects = false;
+  auto req_it = m_requests.find(object_no);
+  if (req_it != m_requests.end()) {
+    intersects = req_it->second->intersects(object_off, len);
+  }
+
+  ldout(cct, 20) << intersects << dendl;
+
+  return intersects;
+}
+
+// Attempt to add the write to the object's delayed batch.  Returns false
+// when batching is not possible (no latency data yet, no pending batch
+// for the object, or the batch rejected the write).  On the first delayed
+// write of a batch, schedules the future dispatch time.
+template <typename I>
+bool SimpleSchedulerObjectDispatch<I>::try_delay_write(
+    uint64_t object_no, uint64_t object_off, ceph::bufferlist&& data,
+    IOContext io_context, int op_flags, int object_dispatch_flags,
+    Context* on_dispatched) {
+  ceph_assert(ceph_mutex_is_locked(m_lock));
+  auto cct = m_image_ctx->cct;
+
+  if (m_latency_stats && !m_latency_stats->is_ready()) {
+    ldout(cct, 20) << "latency stats not collected yet" << dendl;
+    return false;
+  }
+
+  auto it = m_requests.find(object_no);
+  if (it == m_requests.end()) {
+    ldout(cct, 20) << "no pending requests" << dendl;
+    return false;
+  }
+
+  auto &object_requests = it->second;
+  bool delayed = object_requests->try_delay_request(
+      object_off, std::move(data), io_context, op_flags, object_dispatch_flags,
+      on_dispatched);
+
+  ldout(cct, 20) << "delayed: " << delayed << dendl;
+
+  // schedule dispatch on the first request added
+  if (delayed && !object_requests->is_scheduled_dispatch()) {
+    auto dispatch_time = ceph::real_clock::now();
+    if (m_latency_stats) {
+      // delay by half the average write latency
+      dispatch_time += std::chrono::nanoseconds(m_latency_stats->avg() / 2);
+    } else {
+      dispatch_time += std::chrono::milliseconds(m_max_delay);
+    }
+    object_requests->set_scheduled_dispatch(dispatch_time);
+    m_dispatch_queue.push_back(object_requests);
+    // only (re)arm the timer when this entry became the queue head
+    if (m_dispatch_queue.front() == object_requests) {
+      schedule_dispatch_delayed_requests();
+    }
+  }
+
+  return delayed;
+}
+
+// Dispatch the delayed writes of every tracked object (used on flush).
+// Caller must hold m_lock.
+template <typename I>
+void SimpleSchedulerObjectDispatch<I>::dispatch_all_delayed_requests() {
+  ceph_assert(ceph_mutex_is_locked(m_lock));
+  auto cct = m_image_ctx->cct;
+  ldout(cct, 20) << dendl;
+
+  while (!m_requests.empty()) {
+    auto it = m_requests.begin();
+    dispatch_delayed_requests(it->second);
+    m_requests.erase(it);
+  }
+}
+
+// Track a directly-dispatched request for the object and wrap *on_finish
+// so that, on completion, the latency sample is recorded (when start_time
+// was supplied), any writes that were delayed behind this request are
+// dispatched, and the flush tracker is notified.  Caller must hold m_lock;
+// the object must not already be tracked.
+template <typename I>
+void SimpleSchedulerObjectDispatch<I>::register_in_flight_request(
+    uint64_t object_no, const utime_t &start_time, Context **on_finish) {
+  auto res = m_requests.insert(
+      {object_no, std::make_shared<ObjectRequests>(object_no)});
+  ceph_assert(res.second);
+  auto it = res.first;
+
+  auto dispatch_seq = ++m_dispatch_seq;
+  m_flush_tracker->start_io(dispatch_seq);
+
+  it->second->set_dispatch_seq(dispatch_seq);
+  *on_finish = new LambdaContext(
+    [this, object_no, dispatch_seq, start_time, ctx=*on_finish](int r) {
+      ctx->complete(r);
+
+      std::unique_lock locker{m_lock};
+      if (m_latency_stats && start_time != utime_t()) {
+        auto latency = ceph_clock_now() - start_time;
+        m_latency_stats->add(latency.to_nsec());
+      }
+
+      // a newer request may have replaced our entry; only dispatch when
+      // the tracked sequence still matches ours
+      auto it = m_requests.find(object_no);
+      if (it == m_requests.end() ||
+          it->second->get_dispatch_seq() != dispatch_seq) {
+        ldout(m_image_ctx->cct, 20) << "already dispatched" << dendl;
+      } else {
+        dispatch_delayed_requests(it->second);
+        m_requests.erase(it);
+      }
+      locker.unlock();
+
+      m_flush_tracker->finish_io(dispatch_seq);
+    });
+}
+
+// Dispatch (and stop tracking) any delayed requests for the given object.
+// Caller must hold m_lock.
+template <typename I>
+void SimpleSchedulerObjectDispatch<I>::dispatch_delayed_requests(
+    uint64_t object_no) {
+  ceph_assert(ceph_mutex_is_locked(m_lock));
+  auto cct = m_image_ctx->cct;
+
+  auto req_it = m_requests.find(object_no);
+  if (req_it != m_requests.end()) {
+    dispatch_delayed_requests(req_it->second);
+    m_requests.erase(req_it);
+    return;
+  }
+
+  ldout(cct, 20) << "object_no=" << object_no << ": not found" << dendl;
+}
+
+// Issue the object's delayed writes now (if a dispatch is scheduled) and,
+// when the object was at the head of the dispatch queue, re-arm the timer
+// for the next queued object.  Caller must hold m_lock.
+template <typename I>
+void SimpleSchedulerObjectDispatch<I>::dispatch_delayed_requests(
+    ObjectRequestsRef object_requests) {
+  ceph_assert(ceph_mutex_is_locked(m_lock));
+  auto cct = m_image_ctx->cct;
+
+  ldout(cct, 20) << "object_no=" << object_requests->get_object_no() << ", "
+                 << object_requests->delayed_requests_size() << " requests, "
+                 << "dispatch_time=" << object_requests->get_dispatch_time()
+                 << dendl;
+
+  if (!object_requests->is_scheduled_dispatch()) {
+    // nothing pending (already dispatched); queue entry is left to be
+    // garbage collected by schedule_dispatch_delayed_requests()
+    return;
+  }
+
+  object_requests->dispatch_delayed_requests(m_image_ctx, m_latency_stats.get(),
+                                             &m_lock);
+
+  ceph_assert(!m_dispatch_queue.empty());
+  if (m_dispatch_queue.front() == object_requests) {
+    m_dispatch_queue.pop_front();
+    schedule_dispatch_delayed_requests();
+  }
+}
+
+// (Re)arm the timer for the dispatch-queue head: cancel any armed timer
+// task, drop queue entries whose dispatch already happened, then schedule
+// a task that dispatches the head object's delayed writes at its recorded
+// dispatch time.  Caller must hold m_lock.
+template <typename I>
+void SimpleSchedulerObjectDispatch<I>::schedule_dispatch_delayed_requests() {
+  ceph_assert(ceph_mutex_is_locked(m_lock));
+  auto cct = m_image_ctx->cct;
+
+  std::lock_guard timer_locker{*m_timer_lock};
+
+  if (m_timer_task != nullptr) {
+    ldout(cct, 20) << "canceling task " << m_timer_task << dendl;
+
+    bool canceled = m_timer->cancel_event(m_timer_task);
+    ceph_assert(canceled);
+    m_timer_task = nullptr;
+  }
+
+  if (m_dispatch_queue.empty()) {
+    ldout(cct, 20) << "nothing to schedule" << dendl;
+    return;
+  }
+
+  auto object_requests = m_dispatch_queue.front().get();
+
+  // skip entries that were dispatched out-of-band (scheduled flag cleared)
+  while (!object_requests->is_scheduled_dispatch()) {
+    ldout(cct, 20) << "garbage collecting " << object_requests << dendl;
+    m_dispatch_queue.pop_front();
+
+    if (m_dispatch_queue.empty()) {
+      ldout(cct, 20) << "nothing to schedule" << dendl;
+      return;
+    }
+    object_requests = m_dispatch_queue.front().get();
+  }
+
+  m_timer_task = new LambdaContext(
+    [this, object_no=object_requests->get_object_no()](int r) {
+      ceph_assert(ceph_mutex_is_locked(*m_timer_lock));
+      auto cct = m_image_ctx->cct;
+      ldout(cct, 20) << "running timer task " << m_timer_task << dendl;
+
+      m_timer_task = nullptr;
+      // m_lock cannot be taken under the timer lock: bounce the dispatch
+      // to the asio engine
+      m_image_ctx->asio_engine->post(
+        [this, object_no]() {
+          std::lock_guard locker{m_lock};
+          dispatch_delayed_requests(object_no);
+        });
+    });
+
+  ldout(cct, 20) << "scheduling task " << m_timer_task << " at "
+                 << object_requests->get_dispatch_time() << dendl;
+
+  m_timer->add_event_at(object_requests->get_dispatch_time(), m_timer_task);
+}
+
+} // namespace io
+} // namespace librbd
+
+template class librbd::io::SimpleSchedulerObjectDispatch<librbd::ImageCtx>;
diff --git a/src/librbd/io/SimpleSchedulerObjectDispatch.h b/src/librbd/io/SimpleSchedulerObjectDispatch.h
new file mode 100644
index 000000000..ca8a57f3a
--- /dev/null
+++ b/src/librbd/io/SimpleSchedulerObjectDispatch.h
@@ -0,0 +1,227 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IO_SIMPLE_SCHEDULER_OBJECT_DISPATCH_H
+#define CEPH_LIBRBD_IO_SIMPLE_SCHEDULER_OBJECT_DISPATCH_H
+
+#include "common/ceph_mutex.h"
+#include "include/interval_set.h"
+#include "include/utime.h"
+
+#include "librbd/io/ObjectDispatchInterface.h"
+#include "librbd/io/TypeTraits.h"
+
+#include <list>
+#include <map>
+#include <memory>
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace io {
+
+template <typename> class FlushTracker;
+class LatencyStats;
+
+/**
+ * Simple scheduler plugin for object dispatcher layer.
+ */
+template <typename ImageCtxT = ImageCtx>
+class SimpleSchedulerObjectDispatch : public ObjectDispatchInterface {
+private:
+  // mock unit testing support
+  typedef ::librbd::io::TypeTraits<ImageCtxT> TypeTraits;
+  typedef typename TypeTraits::SafeTimer SafeTimer;
+public:
+  static SimpleSchedulerObjectDispatch* create(ImageCtxT* image_ctx) {
+    return new SimpleSchedulerObjectDispatch(image_ctx);
+  }
+
+  SimpleSchedulerObjectDispatch(ImageCtxT* image_ctx);
+  ~SimpleSchedulerObjectDispatch() override;
+
+  ObjectDispatchLayer get_dispatch_layer() const override {
+    return OBJECT_DISPATCH_LAYER_SCHEDULER;
+  }
+
+  void init();
+  void shut_down(Context* on_finish) override;
+
+  // I/O hooks (implemented in the .cc); per the dispatch interface, a
+  // 'true' return means this layer handled/intercepted the request
+  bool read(
+      uint64_t object_no, ReadExtents* extents, IOContext io_context,
+      int op_flags, int read_flags, const ZTracer::Trace &parent_trace,
+      uint64_t* version, int* object_dispatch_flags,
+      DispatchResult* dispatch_result, Context** on_finish,
+      Context* on_dispatched) override;
+
+  bool discard(
+      uint64_t object_no, uint64_t object_off, uint64_t object_len,
+      IOContext io_context, int discard_flags,
+      const ZTracer::Trace &parent_trace, int* object_dispatch_flags,
+      uint64_t* journal_tid, DispatchResult* dispatch_result,
+      Context** on_finish, Context* on_dispatched) override;
+
+  bool write(
+      uint64_t object_no, uint64_t object_off, ceph::bufferlist&& data,
+      IOContext io_context, int op_flags, int write_flags,
+      std::optional<uint64_t> assert_version,
+      const ZTracer::Trace &parent_trace, int* object_dispatch_flags,
+      uint64_t* journal_tid, DispatchResult* dispatch_result,
+      Context** on_finish, Context* on_dispatched) override;
+
+  bool write_same(
+      uint64_t object_no, uint64_t object_off, uint64_t object_len,
+      LightweightBufferExtents&& buffer_extents, ceph::bufferlist&& data,
+      IOContext io_context, int op_flags,
+      const ZTracer::Trace &parent_trace, int* object_dispatch_flags,
+      uint64_t* journal_tid, DispatchResult* dispatch_result,
+      Context** on_finish, Context* on_dispatched) override;
+
+  bool compare_and_write(
+      uint64_t object_no, uint64_t object_off, ceph::bufferlist&& cmp_data,
+      ceph::bufferlist&& write_data, IOContext io_context, int op_flags,
+      const ZTracer::Trace &parent_trace, uint64_t* mismatch_offset,
+      int* object_dispatch_flags, uint64_t* journal_tid,
+      DispatchResult* dispatch_result, Context** on_finish,
+      Context* on_dispatched) override;
+
+  bool flush(
+      FlushSource flush_source, const ZTracer::Trace &parent_trace,
+      uint64_t* journal_tid, DispatchResult* dispatch_result,
+      Context** on_finish, Context* on_dispatched) override;
+
+  // pass-through operations: 'false' hands the request to the next layer
+  bool list_snaps(
+      uint64_t object_no, io::Extents&& extents, SnapIds&& snap_ids,
+      int list_snap_flags, const ZTracer::Trace &parent_trace,
+      SnapshotDelta* snapshot_delta, int* object_dispatch_flags,
+      DispatchResult* dispatch_result, Context** on_finish,
+      Context* on_dispatched) override {
+    return false;
+  }
+
+  bool invalidate_cache(Context* on_finish) override {
+    return false;
+  }
+  bool reset_existence_cache(Context* on_finish) override {
+    return false;
+  }
+
+  void extent_overwritten(
+      uint64_t object_no, uint64_t object_off, uint64_t object_len,
+      uint64_t journal_tid, uint64_t new_journal_tid) override {
+  }
+
+  int prepare_copyup(
+      uint64_t object_no,
+      SnapshotSparseBufferlist* snapshot_sparse_bufferlist) override {
+    return 0;
+  }
+
+private:
+  // data buffer plus the completions of the writes merged into it
+  struct MergedRequests {
+    ceph::bufferlist data;
+    std::list<Context *> requests;
+  };
+
+  // per-object accumulator of delayed write requests awaiting dispatch
+  class ObjectRequests {
+  public:
+    using clock_t = ceph::real_clock;
+
+    ObjectRequests(uint64_t object_no) : m_object_no(object_no) {
+    }
+
+    uint64_t get_object_no() const {
+      return m_object_no;
+    }
+
+    void set_dispatch_seq(uint64_t dispatch_seq) {
+      m_dispatch_seq = dispatch_seq;
+    }
+
+    uint64_t get_dispatch_seq() const {
+      return m_dispatch_seq;
+    }
+
+    clock_t::time_point get_dispatch_time() const {
+      return m_dispatch_time;
+    }
+
+    void set_scheduled_dispatch(const clock_t::time_point &dispatch_time) {
+      m_dispatch_time = dispatch_time;
+    }
+
+    // a zero time point indicates no dispatch has been scheduled yet
+    bool is_scheduled_dispatch() const {
+      return !clock_t::is_zero(m_dispatch_time);
+    }
+
+    size_t delayed_requests_size() const {
+      return m_delayed_requests.size();
+    }
+
+    bool intersects(uint64_t object_off, uint64_t len) const {
+      return m_delayed_request_extents.intersects(object_off, len);
+    }
+
+    bool try_delay_request(uint64_t object_off, ceph::bufferlist&& data,
+                           IOContext io_context, int op_flags,
+                           int object_dispatch_flags, Context* on_dispatched);
+
+    void dispatch_delayed_requests(ImageCtxT *image_ctx,
+                                   LatencyStats *latency_stats,
+                                   ceph::mutex *latency_stats_lock);
+
+  private:
+    uint64_t m_object_no;
+    uint64_t m_dispatch_seq = 0;
+    clock_t::time_point m_dispatch_time;
+    IOContext m_io_context;
+    int m_op_flags = 0;
+    int m_object_dispatch_flags = 0;
+    // delayed writes keyed by object offset; extents tracked for overlap tests
+    std::map<uint64_t, MergedRequests> m_delayed_requests;
+    interval_set<uint64_t> m_delayed_request_extents;
+
+    void try_merge_delayed_requests(
+        typename std::map<uint64_t, MergedRequests>::iterator &iter,
+        typename std::map<uint64_t, MergedRequests>::iterator &iter2);
+  };
+
+  typedef std::shared_ptr<ObjectRequests> ObjectRequestsRef;
+  typedef std::map<uint64_t, ObjectRequestsRef> Requests;
+
+  ImageCtxT *m_image_ctx;
+
+  FlushTracker<ImageCtxT>* m_flush_tracker;
+
+  // m_lock guards the request maps; the timer has its own external lock
+  ceph::mutex m_lock;
+  SafeTimer *m_timer;
+  ceph::mutex *m_timer_lock;
+  uint64_t m_max_delay;
+  uint64_t m_dispatch_seq = 0;
+
+  Requests m_requests;
+  std::list<ObjectRequestsRef> m_dispatch_queue;
+  Context *m_timer_task = nullptr;
+  std::unique_ptr<LatencyStats> m_latency_stats;
+
+  bool try_delay_write(uint64_t object_no, uint64_t object_off,
+                       ceph::bufferlist&& data, IOContext io_context,
+                       int op_flags, int object_dispatch_flags,
+                       Context* on_dispatched);
+  bool intersects(uint64_t object_no, uint64_t object_off, uint64_t len) const;
+
+  void dispatch_all_delayed_requests();
+  void dispatch_delayed_requests(uint64_t object_no);
+  void dispatch_delayed_requests(ObjectRequestsRef object_requests);
+  void register_in_flight_request(uint64_t object_no, const utime_t &start_time,
+                                  Context** on_finish);
+
+  void schedule_dispatch_delayed_requests();
+};
+
+} // namespace io
+} // namespace librbd
+
+extern template class librbd::io::SimpleSchedulerObjectDispatch<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_IO_SIMPLE_SCHEDULER_OBJECT_DISPATCH_H
diff --git a/src/librbd/io/TypeTraits.h b/src/librbd/io/TypeTraits.h
new file mode 100644
index 000000000..2f3a6b7ef
--- /dev/null
+++ b/src/librbd/io/TypeTraits.h
@@ -0,0 +1,20 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IO_TYPE_TRAITS_H
+#define CEPH_LIBRBD_IO_TYPE_TRAITS_H
+
+#include "common/Timer.h"
+
+namespace librbd {
+namespace io {
+
+// Indirection point for types that unit tests need to replace: mocks supply
+// an alternate TypeTraits specialization (e.g. with a MockSafeTimer).
+template <typename IoCtxT>
+struct TypeTraits {
+  typedef ::SafeTimer SafeTimer;
+};
+
+} // namespace io
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_IO_TYPE_TRAITS_H
diff --git a/src/librbd/io/Types.cc b/src/librbd/io/Types.cc
new file mode 100644
index 000000000..19fcc6b89
--- /dev/null
+++ b/src/librbd/io/Types.cc
@@ -0,0 +1,49 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/io/Types.h"
+#include <iostream>
+
+namespace librbd {
+namespace io {
+
+const WriteReadSnapIds INITIAL_WRITE_READ_SNAP_IDS{0, 0};
+
+// Human-readable form of a SparseExtentState for logging; aborts on an
+// out-of-range value.
+std::ostream& operator<<(std::ostream& os, SparseExtentState state) {
+  switch (state) {
+  case SPARSE_EXTENT_STATE_DNE:
+    os << "dne";
+    break;
+  case SPARSE_EXTENT_STATE_ZEROED:
+    os << "zeroed";
+    break;
+  case SPARSE_EXTENT_STATE_DATA:
+    os << "data";
+    break;
+  default:
+    ceph_abort();
+    break;
+  }
+  return os;
+}
+
+// Debug formatter: "[state=<state>, length=<length>]".
+std::ostream& operator<<(std::ostream& os, const SparseExtent& se) {
+  os << "["
+     << "state=" << se.state << ", "
+     << "length=" << se.length << "]";
+  return os;
+}
+
+// Human-readable form of an ImageArea for logging; aborts on an
+// out-of-range value.
+std::ostream& operator<<(std::ostream& os, ImageArea area) {
+  switch (area) {
+  case ImageArea::DATA:
+    return os << "data";
+  case ImageArea::CRYPTO_HEADER:
+    return os << "crypto_header";
+  default:
+    ceph_abort();
+  }
+}
+
+} // namespace io
+} // namespace librbd
diff --git a/src/librbd/io/Types.h b/src/librbd/io/Types.h
new file mode 100644
index 000000000..7c70986c5
--- /dev/null
+++ b/src/librbd/io/Types.h
@@ -0,0 +1,328 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IO_TYPES_H
+#define CEPH_LIBRBD_IO_TYPES_H
+
+#include "include/int_types.h"
+#include "include/rados/rados_types.hpp"
+#include "common/interval_map.h"
+#include "osdc/StriperTypes.h"
+#include <iosfwd>
+#include <map>
+#include <vector>
+
+struct Context;
+
+namespace librbd {
+namespace io {
+
+typedef enum {
+ AIO_TYPE_NONE = 0,
+ AIO_TYPE_GENERIC,
+ AIO_TYPE_OPEN,
+ AIO_TYPE_CLOSE,
+ AIO_TYPE_READ,
+ AIO_TYPE_WRITE,
+ AIO_TYPE_DISCARD,
+ AIO_TYPE_FLUSH,
+ AIO_TYPE_WRITESAME,
+ AIO_TYPE_COMPARE_AND_WRITE,
+} aio_type_t;
+
+enum FlushSource {
+ FLUSH_SOURCE_USER,
+ FLUSH_SOURCE_INTERNAL,
+ FLUSH_SOURCE_SHUTDOWN,
+ FLUSH_SOURCE_EXCLUSIVE_LOCK,
+ FLUSH_SOURCE_EXCLUSIVE_LOCK_SKIP_REFRESH,
+ FLUSH_SOURCE_REFRESH,
+ FLUSH_SOURCE_WRITEBACK,
+ FLUSH_SOURCE_WRITE_BLOCK,
+};
+
+enum Direction {
+ DIRECTION_READ,
+ DIRECTION_WRITE,
+ DIRECTION_BOTH
+};
+
+enum DispatchResult {
+ DISPATCH_RESULT_INVALID,
+ DISPATCH_RESULT_RESTART,
+ DISPATCH_RESULT_CONTINUE,
+ DISPATCH_RESULT_COMPLETE
+};
+
+enum ImageDispatchLayer {
+ IMAGE_DISPATCH_LAYER_NONE = 0,
+ IMAGE_DISPATCH_LAYER_API_START = IMAGE_DISPATCH_LAYER_NONE,
+ IMAGE_DISPATCH_LAYER_QUEUE,
+ IMAGE_DISPATCH_LAYER_QOS,
+ IMAGE_DISPATCH_LAYER_EXCLUSIVE_LOCK,
+ IMAGE_DISPATCH_LAYER_REFRESH,
+ IMAGE_DISPATCH_LAYER_INTERNAL_START = IMAGE_DISPATCH_LAYER_REFRESH,
+ IMAGE_DISPATCH_LAYER_MIGRATION,
+ IMAGE_DISPATCH_LAYER_JOURNAL,
+ IMAGE_DISPATCH_LAYER_WRITE_BLOCK,
+ IMAGE_DISPATCH_LAYER_WRITEBACK_CACHE,
+ IMAGE_DISPATCH_LAYER_CRYPTO,
+ IMAGE_DISPATCH_LAYER_CORE,
+ IMAGE_DISPATCH_LAYER_LAST
+};
+
+enum {
+ IMAGE_DISPATCH_FLAG_QOS_IOPS_THROTTLE = 1 << 0,
+ IMAGE_DISPATCH_FLAG_QOS_BPS_THROTTLE = 1 << 1,
+ IMAGE_DISPATCH_FLAG_QOS_READ_IOPS_THROTTLE = 1 << 2,
+ IMAGE_DISPATCH_FLAG_QOS_WRITE_IOPS_THROTTLE = 1 << 3,
+ IMAGE_DISPATCH_FLAG_QOS_READ_BPS_THROTTLE = 1 << 4,
+ IMAGE_DISPATCH_FLAG_QOS_WRITE_BPS_THROTTLE = 1 << 5,
+ IMAGE_DISPATCH_FLAG_QOS_BPS_MASK = (
+ IMAGE_DISPATCH_FLAG_QOS_BPS_THROTTLE |
+ IMAGE_DISPATCH_FLAG_QOS_READ_BPS_THROTTLE |
+ IMAGE_DISPATCH_FLAG_QOS_WRITE_BPS_THROTTLE),
+ IMAGE_DISPATCH_FLAG_QOS_IOPS_MASK = (
+ IMAGE_DISPATCH_FLAG_QOS_IOPS_THROTTLE |
+ IMAGE_DISPATCH_FLAG_QOS_READ_IOPS_THROTTLE |
+ IMAGE_DISPATCH_FLAG_QOS_WRITE_IOPS_THROTTLE),
+ IMAGE_DISPATCH_FLAG_QOS_READ_MASK = (
+ IMAGE_DISPATCH_FLAG_QOS_READ_IOPS_THROTTLE |
+ IMAGE_DISPATCH_FLAG_QOS_READ_BPS_THROTTLE),
+ IMAGE_DISPATCH_FLAG_QOS_WRITE_MASK = (
+ IMAGE_DISPATCH_FLAG_QOS_WRITE_IOPS_THROTTLE |
+ IMAGE_DISPATCH_FLAG_QOS_WRITE_BPS_THROTTLE),
+ IMAGE_DISPATCH_FLAG_QOS_MASK = (
+ IMAGE_DISPATCH_FLAG_QOS_BPS_MASK |
+ IMAGE_DISPATCH_FLAG_QOS_IOPS_MASK),
+
+ // TODO: pass area through ImageDispatchInterface and remove
+ // this flag
+ IMAGE_DISPATCH_FLAG_CRYPTO_HEADER = 1 << 6
+};
+
+enum {
+ RBD_IO_OPERATIONS_DEFAULT = 0,
+ RBD_IO_OPERATION_READ = 1 << 0,
+ RBD_IO_OPERATION_WRITE = 1 << 1,
+ RBD_IO_OPERATION_DISCARD = 1 << 2,
+ RBD_IO_OPERATION_WRITE_SAME = 1 << 3,
+ RBD_IO_OPERATION_COMPARE_AND_WRITE = 1 << 4,
+ RBD_IO_OPERATIONS_ALL = (
+ RBD_IO_OPERATION_READ |
+ RBD_IO_OPERATION_WRITE |
+ RBD_IO_OPERATION_DISCARD |
+ RBD_IO_OPERATION_WRITE_SAME |
+ RBD_IO_OPERATION_COMPARE_AND_WRITE)
+};
+
+enum ObjectDispatchLayer {
+ OBJECT_DISPATCH_LAYER_NONE = 0,
+ OBJECT_DISPATCH_LAYER_CACHE,
+ OBJECT_DISPATCH_LAYER_CRYPTO,
+ OBJECT_DISPATCH_LAYER_JOURNAL,
+ OBJECT_DISPATCH_LAYER_PARENT_CACHE,
+ OBJECT_DISPATCH_LAYER_SCHEDULER,
+ OBJECT_DISPATCH_LAYER_CORE,
+ OBJECT_DISPATCH_LAYER_LAST
+};
+
+enum {
+ READ_FLAG_DISABLE_READ_FROM_PARENT = 1UL << 0,
+ READ_FLAG_DISABLE_CLIPPING = 1UL << 1,
+};
+
+enum {
+ OBJECT_WRITE_FLAG_CREATE_EXCLUSIVE = 1UL << 0
+};
+
+enum {
+ OBJECT_DISCARD_FLAG_DISABLE_CLONE_REMOVE = 1UL << 0,
+ OBJECT_DISCARD_FLAG_DISABLE_OBJECT_MAP_UPDATE = 1UL << 1
+};
+
+enum {
+ OBJECT_DISPATCH_FLAG_FLUSH = 1UL << 0,
+ OBJECT_DISPATCH_FLAG_WILL_RETRY_ON_ERROR = 1UL << 1
+};
+
+enum {
+ LIST_SNAPS_FLAG_DISABLE_LIST_FROM_PARENT = 1UL << 0,
+ LIST_SNAPS_FLAG_WHOLE_OBJECT = 1UL << 1,
+ LIST_SNAPS_FLAG_IGNORE_ZEROED_EXTENTS = 1UL << 2,
+};
+
+enum SparseExtentState {
+ SPARSE_EXTENT_STATE_DNE, /* does not exist */
+ SPARSE_EXTENT_STATE_ZEROED,
+ SPARSE_EXTENT_STATE_DATA
+};
+
+std::ostream& operator<<(std::ostream& os, SparseExtentState state);
+
+// One run of a sparse read result: its state (dne/zeroed/data) and byte
+// length.  Used as the mapped value of the SparseExtents interval_map.
+struct SparseExtent {
+  SparseExtentState state;
+  uint64_t length;
+
+  SparseExtent(SparseExtentState state, uint64_t length)
+    : state(state), length(length) {
+  }
+
+  // implicit conversion lets callers compare an extent directly to a state
+  operator SparseExtentState() const {
+    return state;
+  }
+
+  bool operator==(const SparseExtent& rhs) const {
+    return state == rhs.state && length == rhs.length;
+  }
+};
+
+std::ostream& operator<<(std::ostream& os, const SparseExtent& state);
+
+// Split/merge policy used by the SparseExtents interval_map.
+struct SparseExtentSplitMerge {
+  // NOTE(review): 'offset'/'length' are ignored here and the split extent
+  // keeps the source length; presumably the interval_map tracks interval
+  // lengths itself -- confirm no consumer reads SparseExtent::length of a
+  // split extent expecting the sub-interval length.
+  SparseExtent split(uint64_t offset, uint64_t length, SparseExtent &se) const {
+    return SparseExtent(se.state, se.length);
+  }
+
+  // adjacent extents merge only when they share the same state
+  bool can_merge(const SparseExtent& left, const SparseExtent& right) const {
+    return left.state == right.state;
+  }
+
+  SparseExtent merge(SparseExtent&& left, SparseExtent&& right) const {
+    SparseExtent se(left);
+    se.length += right.length;
+    return se;
+  }
+
+  uint64_t length(const SparseExtent& se) const {
+    return se.length;
+  }
+};
+
+typedef interval_map<uint64_t,
+ SparseExtent,
+ SparseExtentSplitMerge> SparseExtents;
+
+typedef std::vector<uint64_t> SnapIds;
+
+typedef std::pair<librados::snap_t, librados::snap_t> WriteReadSnapIds;
+extern const WriteReadSnapIds INITIAL_WRITE_READ_SNAP_IDS;
+
+typedef std::map<WriteReadSnapIds, SparseExtents> SnapshotDelta;
+
+// SparseExtent that also carries the payload bytes for DATA extents.
+struct SparseBufferlistExtent : public SparseExtent {
+  ceph::bufferlist bl;
+
+  // data-less constructor: only valid for non-DATA states
+  SparseBufferlistExtent(SparseExtentState state, uint64_t length)
+    : SparseExtent(state, length) {
+    ceph_assert(state != SPARSE_EXTENT_STATE_DATA);
+  }
+  // DATA extents must supply exactly 'length' bytes of payload
+  SparseBufferlistExtent(SparseExtentState state, uint64_t length,
+                         ceph::bufferlist&& bl_)
+    : SparseExtent(state, length), bl(std::move(bl_)) {
+    ceph_assert(state != SPARSE_EXTENT_STATE_DATA || length == bl.length());
+  }
+
+  bool operator==(const SparseBufferlistExtent& rhs) const {
+    return (state == rhs.state &&
+            length == rhs.length &&
+            bl.contents_equal(rhs.bl));
+  }
+};
+
+// Split/merge policy used by the SparseBufferlist interval_map: extents
+// carry their payload, so splitting must slice the data and merging must
+// concatenate it.
+struct SparseBufferlistExtentSplitMerge {
+  // Produce the [offset, offset+length) sub-extent of 'sbe'.  For DATA
+  // extents the matching byte range is sliced out of the source payload;
+  // ZEROED/DNE extents carry no data.
+  SparseBufferlistExtent split(uint64_t offset, uint64_t length,
+                               SparseBufferlistExtent& sbe) const {
+    ceph::bufferlist bl;
+    if (sbe.state == SPARSE_EXTENT_STATE_DATA) {
+      // slice from the source extent's payload (not the empty local 'bl',
+      // which would yield a DATA extent with no data)
+      bl.substr_of(sbe.bl, offset, length);
+    }
+    return SparseBufferlistExtent(sbe.state, length, std::move(bl));
+  }
+
+  // adjacent extents merge only when they share the same state
+  bool can_merge(const SparseBufferlistExtent& left,
+                 const SparseBufferlistExtent& right) const {
+    return left.state == right.state;
+  }
+
+  // combine two adjacent extents; DATA extents append their payloads
+  SparseBufferlistExtent merge(SparseBufferlistExtent&& left,
+                               SparseBufferlistExtent&& right) const {
+    if (left.state == SPARSE_EXTENT_STATE_DATA) {
+      ceph::bufferlist bl{std::move(left.bl)};
+      bl.claim_append(std::move(right.bl));
+      return SparseBufferlistExtent(SPARSE_EXTENT_STATE_DATA,
+                                    bl.length(), std::move(bl));
+    } else {
+      return SparseBufferlistExtent(left.state, left.length + right.length, {});
+    }
+  }
+
+  uint64_t length(const SparseBufferlistExtent& sbe) const {
+    return sbe.length;
+  }
+};
+
+typedef interval_map<uint64_t,
+ SparseBufferlistExtent,
+ SparseBufferlistExtentSplitMerge> SparseBufferlist;
+typedef std::map<uint64_t, SparseBufferlist> SnapshotSparseBufferlist;
+
+using striper::LightweightBufferExtents;
+using striper::LightweightObjectExtent;
+using striper::LightweightObjectExtents;
+
+typedef std::pair<uint64_t,uint64_t> Extent;
+typedef std::vector<Extent> Extents;
+
+enum class ImageArea {
+ DATA,
+ CRYPTO_HEADER
+};
+
+std::ostream& operator<<(std::ostream& os, ImageArea area);
+
+// One object-level read extent: the requested (offset, length), the
+// image-buffer extents it maps back onto, and the resulting data plus
+// sparse extent map once the read completes.
+struct ReadExtent {
+  const uint64_t offset;
+  const uint64_t length;
+  const LightweightBufferExtents buffer_extents;
+  ceph::bufferlist bl;
+  Extents extent_map;
+
+  ReadExtent(uint64_t offset,
+             uint64_t length) : offset(offset), length(length) {};
+  ReadExtent(uint64_t offset,
+             uint64_t length,
+             const LightweightBufferExtents&& buffer_extents)
+    : offset(offset),
+      length(length),
+      buffer_extents(buffer_extents) {}
+  ReadExtent(uint64_t offset,
+             uint64_t length,
+             const LightweightBufferExtents&& buffer_extents,
+             ceph::bufferlist&& bl,
+             Extents&& extent_map) : offset(offset),
+                                     length(length),
+                                     buffer_extents(buffer_extents),
+                                     bl(bl),
+                                     extent_map(extent_map) {};
+
+  friend inline std::ostream& operator<<(
+          std::ostream& os,
+          const ReadExtent &extent) {
+    os << "offset=" << extent.offset << ", "
+       << "length=" << extent.length << ", "
+       << "buffer_extents=" << extent.buffer_extents << ", "
+       << "bl.length=" << extent.bl.length() << ", "
+       << "extent_map=" << extent.extent_map;
+    return os;
+  }
+};
+
+typedef std::vector<ReadExtent> ReadExtents;
+
+typedef std::map<uint64_t, uint64_t> ExtentMap;
+
+} // namespace io
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_IO_TYPES_H
diff --git a/src/librbd/io/Utils.cc b/src/librbd/io/Utils.cc
new file mode 100644
index 000000000..63d587206
--- /dev/null
+++ b/src/librbd/io/Utils.cc
@@ -0,0 +1,249 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/io/Utils.h"
+#include "common/dout.h"
+#include "include/buffer.h"
+#include "include/rados/librados.hpp"
+#include "include/neorados/RADOS.hpp"
+#include "librbd/internal.h"
+#include "librbd/Utils.h"
+#include "librbd/io/AioCompletion.h"
+#include "librbd/io/ImageDispatchSpec.h"
+#include "librbd/io/ObjectRequest.h"
+#include "librbd/io/ImageDispatcherInterface.h"
+#include "osd/osd_types.h"
+#include "osdc/Striper.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::io::util: " << __func__ << ": "
+
+namespace librbd {
+namespace io {
+namespace util {
+
+// Translate librados fadvise op flags and read-placement operation flags
+// onto a neorados op.
+void apply_op_flags(uint32_t op_flags, uint32_t flags, neorados::Op* op) {
+  if (op_flags & LIBRADOS_OP_FLAG_FADVISE_RANDOM)
+    op->set_fadvise_random();
+  if (op_flags & LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL)
+    op->set_fadvise_sequential();
+  if (op_flags & LIBRADOS_OP_FLAG_FADVISE_WILLNEED)
+    op->set_fadvise_willneed();
+  if (op_flags & LIBRADOS_OP_FLAG_FADVISE_DONTNEED)
+    op->set_fadvise_dontneed();
+  if (op_flags & LIBRADOS_OP_FLAG_FADVISE_NOCACHE)
+    op->set_fadvise_nocache();
+
+  if (flags & librados::OPERATION_BALANCE_READS)
+    op->balance_reads();
+  if (flags & librados::OPERATION_LOCALIZE_READS)
+    op->localize_reads();
+}
+
+// Build the data buffer for one write-same object extent.  Returns true when
+// the pattern can be sent as a single OSD write-same op (every buffer extent
+// starts and ends on a pattern boundary, and force_write is unset) with
+// *ws_data holding just the pattern; otherwise expands the pattern across
+// all buffer extents into *ws_data and returns false (plain write needed).
+// NOTE(review): assumes data is non-empty -- a zero data_len would divide by
+// zero below; confirm callers guarantee this.
+bool assemble_write_same_extent(
+    const LightweightObjectExtent &object_extent, const ceph::bufferlist& data,
+    ceph::bufferlist *ws_data, bool force_write) {
+  size_t data_len = data.length();
+
+  if (!force_write) {
+    bool may_writesame = true;
+    for (auto& q : object_extent.buffer_extents) {
+      if (!(q.first % data_len == 0 && q.second % data_len == 0)) {
+        may_writesame = false;
+        break;
+      }
+    }
+
+    if (may_writesame) {
+      ws_data->append(data);
+      return true;
+    }
+  }
+
+  // replicate the pattern to cover each buffer extent, honoring the phase
+  // offset of the first (possibly partial) repetition
+  for (auto& q : object_extent.buffer_extents) {
+    bufferlist sub_bl;
+    uint64_t sub_off = q.first % data_len;
+    uint64_t sub_len = data_len - sub_off;
+    uint64_t extent_left = q.second;
+    while (extent_left >= sub_len) {
+      sub_bl.substr_of(data, sub_off, sub_len);
+      ws_data->claim_append(sub_bl);
+      extent_left -= sub_len;
+      if (sub_off) {
+        sub_off = 0;
+        sub_len = data_len;
+      }
+    }
+    if (extent_left) {
+      sub_bl.substr_of(data, sub_off, extent_left);
+      ws_data->claim_append(sub_bl);
+    }
+  }
+  return false;
+}
+
+// Issue a read against the parent image for the portions of the given
+// object extents that fall within the parent overlap.  Completes on_finish
+// with -ENOENT when there is no parent overlap for this object; otherwise
+// dispatches an image-level read on the parent and completes on_finish with
+// its result.
+template <typename I>
+void read_parent(I *image_ctx, uint64_t object_no, ReadExtents* read_extents,
+                 librados::snap_t snap_id, const ZTracer::Trace &trace,
+                 Context* on_finish) {
+
+  auto cct = image_ctx->cct;
+
+  std::shared_lock image_locker{image_ctx->image_lock};
+
+  Extents parent_extents;
+  ImageArea area;
+  uint64_t raw_overlap = 0;
+  uint64_t object_overlap = 0;
+  image_ctx->get_parent_overlap(snap_id, &raw_overlap);
+  if (raw_overlap > 0) {
+    // calculate reverse mapping onto the parent image
+    Extents extents;
+    for (const auto& extent : *read_extents) {
+      extents.emplace_back(extent.offset, extent.length);
+    }
+    std::tie(parent_extents, area) = object_to_area_extents(image_ctx,
+                                                            object_no, extents);
+    object_overlap = image_ctx->prune_parent_extents(parent_extents, area,
+                                                     raw_overlap, false);
+  }
+  if (object_overlap == 0) {
+    // drop the lock before completing to avoid re-entrant lock acquisition
+    image_locker.unlock();
+
+    on_finish->complete(-ENOENT);
+    return;
+  }
+
+  ldout(cct, 20) << dendl;
+
+  // multiple extents read into a merging completion's buffer and are split
+  // back out on completion; a single extent reads directly into its buffer
+  ceph::bufferlist* parent_read_bl;
+  if (read_extents->size() > 1) {
+    auto parent_comp = new ReadResult::C_ObjectReadMergedExtents(
+      cct, read_extents, on_finish);
+    parent_read_bl = &parent_comp->bl;
+    on_finish = parent_comp;
+  } else {
+    parent_read_bl = &read_extents->front().bl;
+  }
+
+  auto comp = AioCompletion::create_and_start(on_finish, image_ctx->parent,
+                                              AIO_TYPE_READ);
+  ldout(cct, 20) << "completion=" << comp
+                 << " parent_extents=" << parent_extents
+                 << " area=" << area << dendl;
+  auto req = io::ImageDispatchSpec::create_read(
+    *image_ctx->parent, io::IMAGE_DISPATCH_LAYER_INTERNAL_START, comp,
+    std::move(parent_extents), area, ReadResult{parent_read_bl},
+    image_ctx->parent->get_data_io_context(), 0, 0, trace);
+  req->send();
+}
+
+// Clip each extent's length to the image bounds (via clip_io); returns the
+// first clip_io error, or 0 on success with *image_extents updated in place.
+template <typename I>
+int clip_request(I* image_ctx, Extents* image_extents, ImageArea area) {
+  std::shared_lock image_locker{image_ctx->image_lock};
+  for (auto &image_extent : *image_extents) {
+    auto clip_len = image_extent.second;
+    int r = clip_io(librbd::util::get_image_ctx(image_ctx),
+                    image_extent.first, &clip_len, area);
+    if (r < 0) {
+      return r;
+    }
+
+    image_extent.second = clip_len;
+  }
+  return 0;
+}
+
+// Expand a sparse read result into a dense buffer of out_bl_len bytes:
+// extents absent from extent_map are zero-filled by the destriper.
+// *bl is replaced with the dense result.
+void unsparsify(CephContext* cct, ceph::bufferlist* bl,
+                const Extents& extent_map, uint64_t bl_off,
+                uint64_t out_bl_len) {
+  Striper::StripedReadResult destriper;
+  bufferlist out_bl;
+
+  destriper.add_partial_sparse_result(cct, std::move(*bl), extent_map, bl_off,
+                                      {{0, out_bl_len}});
+  destriper.assemble_result(cct, out_bl, true);
+  *bl = out_bl;
+}
+
+// Force a copyup of the given object by issuing a zero-length write to it.
+// Returns false (without dispatching) when the object has no parent data,
+// true once the write has been sent; on_finish fires on completion.
+template <typename I>
+bool trigger_copyup(I* image_ctx, uint64_t object_no, IOContext io_context,
+                    Context* on_finish) {
+  bufferlist bl;
+  auto req = new ObjectWriteRequest<I>(
+    image_ctx, object_no, 0, std::move(bl), io_context, 0, 0,
+    std::nullopt, {}, on_finish);
+  if (!req->has_parent()) {
+    delete req;
+    return false;
+  }
+
+  req->send();
+  return true;
+}
+
+// Map an (offset, length) range in the given image area to per-object
+// extents: first remapped to raw/physical image offsets, then striped.
+template <typename I>
+void area_to_object_extents(I* image_ctx, uint64_t offset, uint64_t length,
+                            ImageArea area, uint64_t buffer_offset,
+                            striper::LightweightObjectExtents* object_extents) {
+  Extents extents = {{offset, length}};
+  image_ctx->io_image_dispatcher->remap_to_physical(extents, area);
+  for (auto [off, len] : extents) {
+    Striper::file_to_extents(image_ctx->cct, &image_ctx->layout, off, len, 0,
+                             buffer_offset, object_extents);
+  }
+}
+
+// Reverse mapping of area_to_object_extents(): destripe per-object extents
+// back to raw image extents, then remap to logical offsets, returning the
+// extents together with the image area they belong to.
+template <typename I>
+std::pair<Extents, ImageArea> object_to_area_extents(
+    I* image_ctx, uint64_t object_no, const Extents& object_extents) {
+  Extents extents;
+  for (auto [off, len] : object_extents) {
+    Striper::extent_to_file(image_ctx->cct, &image_ctx->layout, object_no, off,
+                            len, extents);
+  }
+  auto area = image_ctx->io_image_dispatcher->remap_to_logical(extents);
+  return {std::move(extents), area};
+}
+
+// Convert a single logical offset within an image area to its raw/physical
+// image offset (zero-length extent used purely for the remap).
+template <typename I>
+uint64_t area_to_raw_offset(const I& image_ctx, uint64_t offset,
+                            ImageArea area) {
+  Extents extents = {{offset, 0}};
+  image_ctx.io_image_dispatcher->remap_to_physical(extents, area);
+  return extents[0].first;
+}
+
+// Inverse of area_to_raw_offset(): convert a raw image offset back to a
+// logical offset plus the image area it falls in.
+template <typename I>
+std::pair<uint64_t, ImageArea> raw_to_area_offset(const I& image_ctx,
+                                                  uint64_t offset) {
+  Extents extents = {{offset, 0}};
+  auto area = image_ctx.io_image_dispatcher->remap_to_logical(extents);
+  return {extents[0].first, area};
+}
+
+} // namespace util
+} // namespace io
+} // namespace librbd
+
+template void librbd::io::util::read_parent(
+ librbd::ImageCtx *image_ctx, uint64_t object_no, ReadExtents* extents,
+ librados::snap_t snap_id, const ZTracer::Trace &trace, Context* on_finish);
+template int librbd::io::util::clip_request(
+ librbd::ImageCtx* image_ctx, Extents* image_extents, ImageArea area);
+template bool librbd::io::util::trigger_copyup(
+ librbd::ImageCtx *image_ctx, uint64_t object_no, IOContext io_context,
+ Context* on_finish);
+template void librbd::io::util::area_to_object_extents(
+ librbd::ImageCtx* image_ctx, uint64_t offset, uint64_t length,
+ ImageArea area, uint64_t buffer_offset,
+ striper::LightweightObjectExtents* object_extents);
+template auto librbd::io::util::object_to_area_extents(
+ librbd::ImageCtx* image_ctx, uint64_t object_no, const Extents& extents)
+ -> std::pair<Extents, ImageArea>;
+template uint64_t librbd::io::util::area_to_raw_offset(
+ const librbd::ImageCtx& image_ctx, uint64_t offset, ImageArea area);
+template auto librbd::io::util::raw_to_area_offset(
+ const librbd::ImageCtx& image_ctx, uint64_t offset)
+ -> std::pair<uint64_t, ImageArea>;
diff --git a/src/librbd/io/Utils.h b/src/librbd/io/Utils.h
new file mode 100644
index 000000000..efb79b6a6
--- /dev/null
+++ b/src/librbd/io/Utils.h
@@ -0,0 +1,83 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IO_UTILS_H
+#define CEPH_LIBRBD_IO_UTILS_H
+
+#include "include/int_types.h"
+#include "include/buffer_fwd.h"
+#include "include/rados/rados_types.hpp"
+#include "common/zipkin_trace.h"
+#include "librbd/Types.h"
+#include "librbd/io/Types.h"
+#include <map>
+
+class ObjectExtent;
+
+namespace neorados { struct Op; }
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace io {
+namespace util {
+
+void apply_op_flags(uint32_t op_flags, uint32_t flags, neorados::Op* op);
+
+bool assemble_write_same_extent(const LightweightObjectExtent &object_extent,
+ const ceph::bufferlist& data,
+ ceph::bufferlist *ws_data,
+ bool force_write);
+
+template <typename ImageCtxT = librbd::ImageCtx>
+void read_parent(ImageCtxT *image_ctx, uint64_t object_no,
+ ReadExtents* read_extents, librados::snap_t snap_id,
+ const ZTracer::Trace &trace, Context* on_finish);
+
+template <typename ImageCtxT = librbd::ImageCtx>
+int clip_request(ImageCtxT* image_ctx, Extents* image_extents, ImageArea area);
+
+// Total byte count covered by the extents (sum of all lengths).
+inline uint64_t get_extents_length(const Extents &extents) {
+  uint64_t total_bytes = 0;
+  for (auto [_, extent_length] : extents) {
+    total_bytes += extent_length;
+  }
+  return total_bytes;
+}
+
+void unsparsify(CephContext* cct, ceph::bufferlist* bl,
+ const Extents& extent_map, uint64_t bl_off,
+ uint64_t out_bl_len);
+
+template <typename ImageCtxT = librbd::ImageCtx>
+bool trigger_copyup(ImageCtxT *image_ctx, uint64_t object_no,
+ IOContext io_context, Context* on_finish);
+
+template <typename ImageCtxT = librbd::ImageCtx>
+void area_to_object_extents(ImageCtxT* image_ctx, uint64_t offset,
+ uint64_t length, ImageArea area,
+ uint64_t buffer_offset,
+ striper::LightweightObjectExtents* object_extents);
+
+template <typename ImageCtxT = librbd::ImageCtx>
+std::pair<Extents, ImageArea> object_to_area_extents(
+ ImageCtxT* image_ctx, uint64_t object_no, const Extents& object_extents);
+
+template <typename ImageCtxT = librbd::ImageCtx>
+uint64_t area_to_raw_offset(const ImageCtxT& image_ctx, uint64_t offset,
+ ImageArea area);
+
+template <typename ImageCtxT = librbd::ImageCtx>
+std::pair<uint64_t, ImageArea> raw_to_area_offset(const ImageCtxT& image_ctx,
+ uint64_t offset);
+
+// Dispatch layers are ordered enumerators; the previous layer is simply the
+// numerically preceding value.
+inline ObjectDispatchLayer get_previous_layer(ObjectDispatchLayer layer) {
+  return (ObjectDispatchLayer)(((int)layer) - 1);
+}
+
+} // namespace util
+} // namespace io
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_IO_UTILS_H
diff --git a/src/librbd/io/WriteBlockImageDispatch.cc b/src/librbd/io/WriteBlockImageDispatch.cc
new file mode 100644
index 000000000..57d181d20
--- /dev/null
+++ b/src/librbd/io/WriteBlockImageDispatch.cc
@@ -0,0 +1,270 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/io/WriteBlockImageDispatch.h"
+#include "common/dout.h"
+#include "common/Cond.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Utils.h"
+#include "librbd/asio/ContextWQ.h"
+#include "librbd/io/AioCompletion.h"
+#include "librbd/io/ImageDispatchSpec.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::io::WriteBlockImageDispatch: " << this \
+ << " " << __func__ << ": "
+
+namespace librbd {
+namespace io {
+
+// Completion context fired once all in-flight writes have been flushed;
+// forwards to handle_blocked_writes() so queued block_writes() callers
+// can be notified.
+template <typename I>
+struct WriteBlockImageDispatch<I>::C_BlockedWrites : public Context {
+  WriteBlockImageDispatch *dispatch;
+  explicit C_BlockedWrites(WriteBlockImageDispatch *dispatch)
+    : dispatch(dispatch) {
+  }
+
+  void finish(int r) override {
+    dispatch->handle_blocked_writes(r);
+  }
+};
+
+// Construct the write-block dispatch layer for the given image context.
+// The shared mutex is given a per-instance name (keyed on `this`) for
+// lockdep tracking.
+template <typename I>
+WriteBlockImageDispatch<I>::WriteBlockImageDispatch(I* image_ctx)
+  : m_image_ctx(image_ctx),
+    m_lock(ceph::make_shared_mutex(
+      util::unique_lock_name("librbd::io::WriteBlockImageDispatch::m_lock",
+                             this))) {
+  auto cct = m_image_ctx->cct;
+  ldout(cct, 5) << "ictx=" << image_ctx << dendl;
+}
+
+// No internal state requires asynchronous tear-down; complete the
+// callback immediately.
+template <typename I>
+void WriteBlockImageDispatch<I>::shut_down(Context* on_finish) {
+  on_finish->complete(0);
+}
+
+// Synchronous variant of block_writes(): blocks the calling thread until
+// all in-flight writes have drained.  Returns the flush result.
+template <typename I>
+int WriteBlockImageDispatch<I>::block_writes() {
+  C_SaferCond cond_ctx;
+  block_writes(&cond_ctx);
+  return cond_ctx.wait();
+}
+
+// Register a write blocker and invoke *on_blocked once every in-flight
+// write has completed and lower layers have been flushed.  The caller
+// must hold owner_lock; the callback is bounced through the async
+// context callback so owner_lock is not held when it fires.
+template <typename I>
+void WriteBlockImageDispatch<I>::block_writes(Context *on_blocked) {
+  ceph_assert(ceph_mutex_is_locked(m_image_ctx->owner_lock));
+  auto cct = m_image_ctx->cct;
+
+  // ensure owner lock is not held after block_writes completes
+  on_blocked = util::create_async_context_callback(
+    *m_image_ctx, on_blocked);
+
+  {
+    std::unique_lock locker{m_lock};
+    ++m_write_blockers;
+    ldout(cct, 5) << m_image_ctx << ", "
+                  << "num=" << m_write_blockers << dendl;
+    if (!m_write_blocker_contexts.empty() || m_in_flight_writes > 0) {
+      // defer notification: handle_finished() flushes and then fires
+      // these contexts via C_BlockedWrites once the last write drains
+      ldout(cct, 5) << "waiting for in-flight writes to complete: "
+                    << "in_flight_writes=" << m_in_flight_writes << dendl;
+      m_write_blocker_contexts.push_back(on_blocked);
+      return;
+    }
+  }
+
+  // nothing in flight -- flush lower layers before notifying
+  flush_io(on_blocked);
+}
+
+// Release one write blocker.  When the final blocker is removed, every
+// context waiting for writes to become unblocked and every queued write
+// dispatch is completed -- outside the lock, since completions may
+// re-enter this dispatcher.
+template <typename I>
+void WriteBlockImageDispatch<I>::unblock_writes() {
+  auto cct = m_image_ctx->cct;
+
+  Contexts ready_contexts;
+  {
+    std::unique_lock locker{m_lock};
+    ceph_assert(m_write_blockers > 0);
+    --m_write_blockers;
+
+    ldout(cct, 5) << m_image_ctx << ", "
+                  << "num=" << m_write_blockers << dendl;
+    if (m_write_blockers == 0) {
+      // preserve ordering: unblock waiters fire before queued dispatches
+      ready_contexts.splice(ready_contexts.end(),
+                            m_unblocked_write_waiter_contexts);
+      ready_contexts.splice(ready_contexts.end(), m_on_dispatches);
+    }
+  }
+
+  for (auto ctx : ready_contexts) {
+    ctx->complete(0);
+  }
+}
+
+// Invoke on_unblocked once no write blockers remain.  If blockers (or
+// earlier waiters) exist the context is queued and later completed by
+// unblock_writes(); otherwise it fires immediately, outside the lock.
+// The caller must hold owner_lock.
+template <typename I>
+void WriteBlockImageDispatch<I>::wait_on_writes_unblocked(
+    Context *on_unblocked) {
+  ceph_assert(ceph_mutex_is_locked(m_image_ctx->owner_lock));
+  auto cct = m_image_ctx->cct;
+
+  {
+    std::unique_lock locker{m_lock};
+    ldout(cct, 20) << m_image_ctx << ", "
+                   << "write_blockers=" << m_write_blockers << dendl;
+    // also queue behind earlier waiters to preserve FIFO notification
+    if (!m_unblocked_write_waiter_contexts.empty() || m_write_blockers > 0) {
+      m_unblocked_write_waiter_contexts.push_back(on_unblocked);
+      return;
+    }
+  }
+
+  on_unblocked->complete(0);
+}
+
+// Writes are subject to write blocking: delegate to process_io(), which
+// either queues the request for restart or tracks it as in-flight.
+template <typename I>
+bool WriteBlockImageDispatch<I>::write(
+    AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&bl,
+    int op_flags, const ZTracer::Trace &parent_trace,
+    uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
+    DispatchResult* dispatch_result, Context** on_finish,
+    Context* on_dispatched) {
+  auto cct = m_image_ctx->cct;
+  ldout(cct, 20) << "tid=" << tid << dendl;
+
+  return process_io(tid, dispatch_result, on_finish, on_dispatched);
+}
+
+// Discards modify the image and therefore honor write blocking via
+// process_io().
+template <typename I>
+bool WriteBlockImageDispatch<I>::discard(
+    AioCompletion* aio_comp, Extents &&image_extents,
+    uint32_t discard_granularity_bytes, const ZTracer::Trace &parent_trace,
+    uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
+    DispatchResult* dispatch_result, Context** on_finish,
+    Context* on_dispatched) {
+  auto cct = m_image_ctx->cct;
+  ldout(cct, 20) << "tid=" << tid << dendl;
+
+  return process_io(tid, dispatch_result, on_finish, on_dispatched);
+}
+
+// Write-same ops modify the image and therefore honor write blocking via
+// process_io().
+template <typename I>
+bool WriteBlockImageDispatch<I>::write_same(
+    AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&bl,
+    int op_flags, const ZTracer::Trace &parent_trace,
+    uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
+    DispatchResult* dispatch_result, Context** on_finish,
+    Context* on_dispatched) {
+  auto cct = m_image_ctx->cct;
+  ldout(cct, 20) << "tid=" << tid << dendl;
+
+  return process_io(tid, dispatch_result, on_finish, on_dispatched);
+}
+
+// Compare-and-write ops modify the image and therefore honor write
+// blocking via process_io().
+template <typename I>
+bool WriteBlockImageDispatch<I>::compare_and_write(
+    AioCompletion* aio_comp, Extents &&image_extents,
+    bufferlist &&cmp_bl, bufferlist &&bl, uint64_t *mismatch_offset,
+    int op_flags, const ZTracer::Trace &parent_trace,
+    uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
+    DispatchResult* dispatch_result, Context** on_finish,
+    Context* on_dispatched) {
+  auto cct = m_image_ctx->cct;
+  ldout(cct, 20) << "tid=" << tid << dendl;
+
+  return process_io(tid, dispatch_result, on_finish, on_dispatched);
+}
+
+// Only user-initiated flushes are subject to write blocking; internal
+// flushes (including our own FLUSH_SOURCE_WRITE_BLOCK flush issued by
+// flush_io()) pass straight through to avoid self-deadlock.
+template <typename I>
+bool WriteBlockImageDispatch<I>::flush(
+    AioCompletion* aio_comp, FlushSource flush_source,
+    const ZTracer::Trace &parent_trace, uint64_t tid,
+    std::atomic<uint32_t>* image_dispatch_flags,
+    DispatchResult* dispatch_result, Context** on_finish,
+    Context* on_dispatched) {
+  auto cct = m_image_ctx->cct;
+  ldout(cct, 20) << "tid=" << tid << dendl;
+
+  if (flush_source != FLUSH_SOURCE_USER) {
+    return false;
+  }
+
+  return process_io(tid, dispatch_result, on_finish, on_dispatched);
+}
+
+// Completion callback chained onto every tracked write by process_io().
+// Decrements the in-flight count; if writes are blocked and this was the
+// final in-flight write, flushes lower layers and then (via
+// C_BlockedWrites) notifies the queued block_writes() waiters.
+template <typename I>
+void WriteBlockImageDispatch<I>::handle_finished(int r, uint64_t tid) {
+  auto cct = m_image_ctx->cct;
+  ldout(cct, 20) << "r=" << r << ", tid=" << tid << dendl;
+
+  std::unique_lock locker{m_lock};
+  ceph_assert(m_in_flight_writes > 0);
+  --m_in_flight_writes;
+
+  bool writes_blocked = false;
+  if (m_write_blockers > 0 && m_in_flight_writes == 0) {
+    ldout(cct, 10) << "flushing all in-flight IO for blocked writes" << dendl;
+    writes_blocked = true;
+  }
+  // drop the lock before issuing the flush to avoid re-entrant locking
+  locker.unlock();
+
+  if (writes_blocked) {
+    flush_io(new C_BlockedWrites(this));
+  }
+}
+
+// Common dispatch path for all blockable ops.  Returns true (op handled
+// here) with DISPATCH_RESULT_RESTART when writes are blocked or earlier
+// IO is already queued -- the op is re-dispatched by unblock_writes().
+// Otherwise the write is counted as in-flight, *on_finish is wrapped so
+// handle_finished() runs before the caller's completion, and false is
+// returned so the dispatcher continues to the next layer.
+template <typename I>
+bool WriteBlockImageDispatch<I>::process_io(
+    uint64_t tid, DispatchResult* dispatch_result, Context** on_finish,
+    Context* on_dispatched) {
+  std::unique_lock locker{m_lock};
+  if (m_write_blockers > 0 || !m_on_dispatches.empty()) {
+    *dispatch_result = DISPATCH_RESULT_RESTART;
+    m_on_dispatches.push_back(on_dispatched);
+    return true;
+  }
+
+  ++m_in_flight_writes;
+  // capture the original on_finish by value before overwriting the slot
+  *on_finish = new LambdaContext([this, tid, on_finish=*on_finish](int r) {
+      handle_finished(r, tid);
+      on_finish->complete(r);
+    });
+  return false;
+}
+
+// Issue an internal flush (FLUSH_SOURCE_WRITE_BLOCK, which our own
+// flush() handler passes through) against the layers below this one so
+// all in-flight IO is persisted; on_finish completes with the flush
+// result via the AioCompletion.
+template <typename I>
+void WriteBlockImageDispatch<I>::flush_io(Context* on_finish) {
+  auto cct = m_image_ctx->cct;
+  ldout(cct, 10) << dendl;
+
+  // ensure that all in-flight IO is flushed
+  auto aio_comp = AioCompletion::create_and_start(
+    on_finish, util::get_image_ctx(m_image_ctx), librbd::io::AIO_TYPE_FLUSH);
+  auto req = ImageDispatchSpec::create_flush(
+    *m_image_ctx, IMAGE_DISPATCH_LAYER_WRITE_BLOCK, aio_comp,
+    FLUSH_SOURCE_WRITE_BLOCK, {});
+  req->send();
+}
+
+// All in-flight writes have been flushed: notify every context queued by
+// block_writes() that writes are now fully blocked.  Contexts complete
+// outside the lock; the flush result is ignored.
+template <typename I>
+void WriteBlockImageDispatch<I>::handle_blocked_writes(int r) {
+  auto cct = m_image_ctx->cct;
+  ldout(cct, 10) << dendl;
+
+  Contexts blocked_ctxs;
+  {
+    std::unique_lock locker{m_lock};
+    blocked_ctxs.swap(m_write_blocker_contexts);
+  }
+
+  for (auto ctx : blocked_ctxs) {
+    ctx->complete(0);
+  }
+}
+
+} // namespace io
+} // namespace librbd
+
+template class librbd::io::WriteBlockImageDispatch<librbd::ImageCtx>;
diff --git a/src/librbd/io/WriteBlockImageDispatch.h b/src/librbd/io/WriteBlockImageDispatch.h
new file mode 100644
index 000000000..b1d0ddb0e
--- /dev/null
+++ b/src/librbd/io/WriteBlockImageDispatch.h
@@ -0,0 +1,134 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IO_WRITE_BLOCK_IMAGE_DISPATCH_H
+#define CEPH_LIBRBD_IO_WRITE_BLOCK_IMAGE_DISPATCH_H
+
+#include "librbd/io/ImageDispatchInterface.h"
+#include "include/int_types.h"
+#include "include/buffer.h"
+#include "common/ceph_mutex.h"
+#include "common/zipkin_trace.h"
+#include "common/Throttle.h"
+#include "librbd/io/ReadResult.h"
+#include "librbd/io/Types.h"
+#include <list>
+
+struct Context;
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace io {
+
+struct AioCompletion;
+
+// Image dispatch layer implementing write blocking: while one or more
+// blockers are registered, modifying ops (writes, discards, write-same,
+// compare-and-write) and user flushes are queued and re-dispatched once
+// the final blocker is removed.  Reads and snapshot listings pass
+// straight through.
+template <typename ImageCtxT>
+class WriteBlockImageDispatch : public ImageDispatchInterface {
+public:
+  WriteBlockImageDispatch(ImageCtxT* image_ctx);
+
+  ImageDispatchLayer get_dispatch_layer() const override {
+    return IMAGE_DISPATCH_LAYER_WRITE_BLOCK;
+  }
+
+  void shut_down(Context* on_finish) override;
+
+  // synchronous: waits until all in-flight writes have drained
+  int block_writes();
+  // asynchronous: on_blocked fires after in-flight writes are flushed
+  void block_writes(Context *on_blocked);
+  void unblock_writes();
+
+  // true while at least one write blocker is registered
+  inline bool writes_blocked() const {
+    std::shared_lock locker{m_lock};
+    return (m_write_blockers > 0);
+  }
+
+  void wait_on_writes_unblocked(Context *on_unblocked);
+
+  // reads are never blocked: pass through to the next layer
+  bool read(
+      AioCompletion* aio_comp, Extents &&image_extents,
+      ReadResult &&read_result, IOContext io_context, int op_flags,
+      int read_flags, const ZTracer::Trace &parent_trace, uint64_t tid,
+      std::atomic<uint32_t>* image_dispatch_flags,
+      DispatchResult* dispatch_result, Context** on_finish,
+      Context* on_dispatched) override {
+    return false;
+  }
+  bool write(
+      AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&bl,
+      int op_flags, const ZTracer::Trace &parent_trace,
+      uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
+      DispatchResult* dispatch_result, Context** on_finish,
+      Context* on_dispatched) override;
+  bool discard(
+      AioCompletion* aio_comp, Extents &&image_extents,
+      uint32_t discard_granularity_bytes, const ZTracer::Trace &parent_trace,
+      uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
+      DispatchResult* dispatch_result, Context** on_finish,
+      Context* on_dispatched) override;
+  bool write_same(
+      AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&bl,
+      int op_flags, const ZTracer::Trace &parent_trace,
+      uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
+      DispatchResult* dispatch_result, Context** on_finish,
+      Context* on_dispatched) override;
+  bool compare_and_write(
+      AioCompletion* aio_comp, Extents &&image_extents,
+      bufferlist &&cmp_bl, bufferlist &&bl, uint64_t *mismatch_offset,
+      int op_flags, const ZTracer::Trace &parent_trace,
+      uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
+      DispatchResult* dispatch_result, Context** on_finish,
+      Context* on_dispatched) override;
+  bool flush(
+      AioCompletion* aio_comp, FlushSource flush_source,
+      const ZTracer::Trace &parent_trace, uint64_t tid,
+      std::atomic<uint32_t>* image_dispatch_flags,
+      DispatchResult* dispatch_result, Context** on_finish,
+      Context* on_dispatched) override;
+
+  // snapshot listing does not modify the image: pass through
+  bool list_snaps(
+      AioCompletion* aio_comp, Extents&& image_extents, SnapIds&& snap_ids,
+      int list_snaps_flags, SnapshotDelta* snapshot_delta,
+      const ZTracer::Trace &parent_trace, uint64_t tid,
+      std::atomic<uint32_t>* image_dispatch_flags,
+      DispatchResult* dispatch_result, Context** on_finish,
+      Context* on_dispatched) override {
+    return false;
+  }
+
+private:
+  struct C_BlockedWrites;
+
+  typedef std::list<Context*> Contexts;
+
+  ImageCtxT* m_image_ctx;
+
+  // guards all state below
+  mutable ceph::shared_mutex m_lock;
+  // ops queued for restart while writes were blocked
+  Contexts m_on_dispatches;
+
+  // number of active block_writes() callers
+  uint32_t m_write_blockers = 0;
+  // block_writes() callbacks awaiting in-flight write drain
+  Contexts m_write_blocker_contexts;
+  // wait_on_writes_unblocked() callbacks awaiting blocker count == 0
+  Contexts m_unblocked_write_waiter_contexts;
+  uint64_t m_in_flight_writes = 0;
+
+  void handle_finished(int r, uint64_t tid);
+
+  bool process_io(uint64_t tid, DispatchResult* dispatch_result,
+                  Context** on_finish, Context* on_dispatched);
+  void flush_io(Context* on_finish);
+
+  bool invalidate_cache(Context* on_finish) override {
+    return false;
+  }
+
+  void handle_blocked_writes(int r);
+
+};
+
+} // namespace io
+} // namespace librbd
+
+extern template class librbd::io::WriteBlockImageDispatch<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_IO_WRITE_BLOCK_IMAGE_DISPATCH_H