diff options
Diffstat (limited to '')
49 files changed, 10485 insertions, 0 deletions
diff --git a/src/librbd/io/AioCompletion.cc b/src/librbd/io/AioCompletion.cc new file mode 100644 index 000000000..c04b80770 --- /dev/null +++ b/src/librbd/io/AioCompletion.cc @@ -0,0 +1,294 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/io/AioCompletion.h" +#include <errno.h> + +#include "common/ceph_context.h" +#include "common/dout.h" +#include "common/errno.h" +#include "common/perf_counters.h" + +#include "librbd/AsioEngine.h" +#include "librbd/ImageCtx.h" +#include "librbd/internal.h" +#include "librbd/Journal.h" +#include "librbd/Types.h" +#include <boost/asio/dispatch.hpp> +#include <boost/asio/post.hpp> + +#ifdef WITH_LTTNG +#include "tracing/librbd.h" +#else +#define tracepoint(...) +#endif + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::io::AioCompletion: " << this \ + << " " << __func__ << ": " + +namespace librbd { +namespace io { + +int AioCompletion::wait_for_complete() { + tracepoint(librbd, aio_wait_for_complete_enter, this); + { + std::unique_lock<std::mutex> locker(lock); + while (state != AIO_STATE_COMPLETE) { + cond.wait(locker); + } + } + tracepoint(librbd, aio_wait_for_complete_exit, 0); + return 0; +} + +void AioCompletion::finalize() { + ceph_assert(ictx != nullptr); + CephContext *cct = ictx->cct; + + // finalize any pending error results since we won't be + // atomically incrementing rval anymore + int err_r = error_rval; + if (err_r < 0) { + rval = err_r; + } + + ssize_t r = rval; + ldout(cct, 20) << "r=" << r << dendl; + if (r >= 0 && aio_type == AIO_TYPE_READ) { + read_result.assemble_result(cct); + } +} + +void AioCompletion::complete() { + ceph_assert(ictx != nullptr); + + ssize_t r = rval; + if ((aio_type == AIO_TYPE_CLOSE) || (aio_type == AIO_TYPE_OPEN && r < 0)) { + ictx = nullptr; + external_callback = false; + } else { + CephContext *cct = ictx->cct; + + tracepoint(librbd, aio_complete_enter, this, r); + 
if (ictx->perfcounter != nullptr) { + ceph::timespan elapsed = coarse_mono_clock::now() - start_time; + switch (aio_type) { + case AIO_TYPE_GENERIC: + case AIO_TYPE_OPEN: + break; + case AIO_TYPE_READ: + ictx->perfcounter->tinc(l_librbd_rd_latency, elapsed); break; + case AIO_TYPE_WRITE: + ictx->perfcounter->tinc(l_librbd_wr_latency, elapsed); break; + case AIO_TYPE_DISCARD: + ictx->perfcounter->tinc(l_librbd_discard_latency, elapsed); break; + case AIO_TYPE_FLUSH: + ictx->perfcounter->tinc(l_librbd_flush_latency, elapsed); break; + case AIO_TYPE_WRITESAME: + ictx->perfcounter->tinc(l_librbd_ws_latency, elapsed); break; + case AIO_TYPE_COMPARE_AND_WRITE: + ictx->perfcounter->tinc(l_librbd_cmp_latency, elapsed); break; + default: + lderr(cct) << "completed invalid aio_type: " << aio_type << dendl; + break; + } + } + } + + state = AIO_STATE_CALLBACK; + if (complete_cb) { + if (external_callback) { + complete_external_callback(); + } else { + complete_cb(rbd_comp, complete_arg); + complete_event_socket(); + notify_callbacks_complete(); + } + } else { + complete_event_socket(); + notify_callbacks_complete(); + } + + tracepoint(librbd, aio_complete_exit); +} + +void AioCompletion::init_time(ImageCtx *i, aio_type_t t) { + if (ictx == nullptr) { + ictx = i; + aio_type = t; + start_time = coarse_mono_clock::now(); + } +} + +void AioCompletion::start_op() { + ceph_assert(ictx != nullptr); + + if (aio_type == AIO_TYPE_OPEN || aio_type == AIO_TYPE_CLOSE) { + // no need to track async open/close operations + return; + } + + ceph_assert(!async_op.started()); + async_op.start_op(*ictx); +} + +void AioCompletion::queue_complete() { + uint32_t zero = 0; + pending_count.compare_exchange_strong(zero, 1); + ceph_assert(zero == 0); + + add_request(); + + // ensure completion fires in clean lock context + boost::asio::post(ictx->asio_engine->get_api_strand(), [this]() { + complete_request(0); + }); +} + +void AioCompletion::block(CephContext* cct) { + ldout(cct, 20) << dendl; + 
ceph_assert(!was_armed); + + get(); + ++pending_count; +} + +void AioCompletion::unblock(CephContext* cct) { + ldout(cct, 20) << dendl; + ceph_assert(was_armed); + + uint32_t previous_pending_count = pending_count--; + ceph_assert(previous_pending_count > 0); + + if (previous_pending_count == 1) { + queue_complete(); + } + put(); +} + +void AioCompletion::fail(int r) +{ + ceph_assert(ictx != nullptr); + ceph_assert(r < 0); + + bool queue_required = true; + if (aio_type == AIO_TYPE_CLOSE || aio_type == AIO_TYPE_OPEN) { + // executing from a safe context and the ImageCtx has been destructed + queue_required = false; + } else { + CephContext *cct = ictx->cct; + lderr(cct) << cpp_strerror(r) << dendl; + } + + ceph_assert(!was_armed); + was_armed = true; + + rval = r; + + uint32_t previous_pending_count = pending_count.load(); + if (previous_pending_count == 0) { + if (queue_required) { + queue_complete(); + } else { + complete(); + } + } +} + +void AioCompletion::set_request_count(uint32_t count) { + ceph_assert(ictx != nullptr); + CephContext *cct = ictx->cct; + + ceph_assert(!was_armed); + was_armed = true; + + ldout(cct, 20) << "pending=" << count << dendl; + uint32_t previous_pending_count = pending_count.fetch_add(count); + if (previous_pending_count == 0 && count == 0) { + queue_complete(); + } +} + +void AioCompletion::complete_request(ssize_t r) +{ + ceph_assert(ictx != nullptr); + CephContext *cct = ictx->cct; + + if (r > 0) { + rval += r; + } else if (r < 0 && r != -EEXIST) { + // might race w/ another thread setting an error code but + // first one wins + int zero = 0; + error_rval.compare_exchange_strong(zero, r); + } + + uint32_t previous_pending_count = pending_count--; + ceph_assert(previous_pending_count > 0); + auto pending_count = previous_pending_count - 1; + + ldout(cct, 20) << "cb=" << complete_cb << ", " + << "pending=" << pending_count << dendl; + if (pending_count == 0) { + finalize(); + complete(); + } + put(); +} + +bool 
AioCompletion::is_complete() { + tracepoint(librbd, aio_is_complete_enter, this); + bool done = (this->state != AIO_STATE_PENDING); + tracepoint(librbd, aio_is_complete_exit, done); + return done; +} + +ssize_t AioCompletion::get_return_value() { + tracepoint(librbd, aio_get_return_value_enter, this); + ssize_t r = rval; + tracepoint(librbd, aio_get_return_value_exit, r); + return r; +} + +void AioCompletion::complete_external_callback() { + get(); + + // ensure librbd external users never experience concurrent callbacks + // from multiple librbd-internal threads. + boost::asio::dispatch(ictx->asio_engine->get_api_strand(), [this]() { + complete_cb(rbd_comp, complete_arg); + complete_event_socket(); + notify_callbacks_complete(); + put(); + }); +} + +void AioCompletion::complete_event_socket() { + if (ictx != nullptr && event_notify && ictx->event_socket.is_valid()) { + ictx->event_socket_completions.push(this); + ictx->event_socket.notify(); + } +} + +void AioCompletion::notify_callbacks_complete() { + state = AIO_STATE_COMPLETE; + + { + std::unique_lock<std::mutex> locker(lock); + cond.notify_all(); + } + + if (image_dispatcher_ctx != nullptr) { + image_dispatcher_ctx->complete(rval); + } + + // note: possible for image to be closed after op marked finished + if (async_op.started()) { + async_op.finish_op(); + } +} + +} // namespace io +} // namespace librbd diff --git a/src/librbd/io/AioCompletion.h b/src/librbd/io/AioCompletion.h new file mode 100644 index 000000000..4ae93fe36 --- /dev/null +++ b/src/librbd/io/AioCompletion.h @@ -0,0 +1,203 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_IO_AIO_COMPLETION_H +#define CEPH_LIBRBD_IO_AIO_COMPLETION_H + +#include "common/ceph_time.h" +#include "include/common_fwd.h" +#include "include/Context.h" +#include "include/utime.h" +#include "include/rbd/librbd.hpp" + +#include "librbd/ImageCtx.h" +#include "librbd/io/AsyncOperation.h" +#include 
"librbd/io/ReadResult.h" +#include "librbd/io/Types.h" + +#include <atomic> +#include <condition_variable> +#include <mutex> + +struct Context; + +namespace librbd { +namespace io { + +/** + * AioCompletion is the overall completion for a single + * rbd I/O request. It may be composed of many AioObjectRequests, + * which each go to a single object. + * + * The retrying of individual requests is handled at a lower level, + * so all AioCompletion cares about is the count of outstanding + * requests. The number of expected individual requests should be + * set initially using set_request_count() prior to issuing the + * requests. This ensures that the completion will not be completed + * within the caller's thread of execution (instead via a librados + * context or via a thread pool context for cache read hits). + */ +struct AioCompletion { + typedef enum { + AIO_STATE_PENDING = 0, + AIO_STATE_CALLBACK, + AIO_STATE_COMPLETE, + } aio_state_t; + + mutable std::mutex lock; + std::condition_variable cond; + + callback_t complete_cb = nullptr; + void *complete_arg = nullptr; + rbd_completion_t rbd_comp = nullptr; + + /// note: only using atomic for built-in memory barrier + std::atomic<aio_state_t> state{AIO_STATE_PENDING}; + + std::atomic<ssize_t> rval{0}; + std::atomic<int> error_rval{0}; + std::atomic<uint32_t> ref{1}; + std::atomic<uint32_t> pending_count{0}; ///< number of requests/blocks + std::atomic<bool> released{false}; + + ImageCtx *ictx = nullptr; + coarse_mono_time start_time; + aio_type_t aio_type = AIO_TYPE_NONE; + + ReadResult read_result; + + AsyncOperation async_op; + + bool event_notify = false; + bool was_armed = false; + bool external_callback = false; + + Context* image_dispatcher_ctx = nullptr; + + template <typename T, void (T::*MF)(int)> + static void callback_adapter(completion_t cb, void *arg) { + AioCompletion *comp = reinterpret_cast<AioCompletion *>(cb); + T *t = reinterpret_cast<T *>(arg); + (t->*MF)(comp->get_return_value()); + 
comp->release(); + } + + static AioCompletion *create(void *cb_arg, callback_t cb_complete, + rbd_completion_t rbd_comp) { + AioCompletion *comp = new AioCompletion(); + comp->set_complete_cb(cb_arg, cb_complete); + comp->rbd_comp = (rbd_comp != nullptr ? rbd_comp : comp); + return comp; + } + + template <typename T, void (T::*MF)(int) = &T::complete> + static AioCompletion *create(T *obj) { + AioCompletion *comp = new AioCompletion(); + comp->set_complete_cb(obj, &callback_adapter<T, MF>); + comp->rbd_comp = comp; + return comp; + } + + template <typename T, void (T::*MF)(int) = &T::complete> + static AioCompletion *create_and_start(T *obj, ImageCtx *image_ctx, + aio_type_t type) { + AioCompletion *comp = create<T, MF>(obj); + comp->init_time(image_ctx, type); + comp->start_op(); + return comp; + } + + AioCompletion() { + } + + ~AioCompletion() { + } + + int wait_for_complete(); + + void finalize(); + + inline bool is_initialized(aio_type_t type) const { + std::unique_lock<std::mutex> locker(lock); + return ((ictx != nullptr) && (aio_type == type)); + } + inline bool is_started() const { + std::unique_lock<std::mutex> locker(lock); + return async_op.started(); + } + + void block(CephContext* cct); + void unblock(CephContext* cct); + + void init_time(ImageCtx *i, aio_type_t t); + void start_op(); + void fail(int r); + + void complete(); + + void set_complete_cb(void *cb_arg, callback_t cb) { + complete_cb = cb; + complete_arg = cb_arg; + } + + void set_request_count(uint32_t num); + void add_request() { + ceph_assert(pending_count > 0); + get(); + } + void complete_request(ssize_t r); + + bool is_complete(); + + ssize_t get_return_value(); + + void get() { + ceph_assert(ref > 0); + ++ref; + } + void release() { + bool previous_released = released.exchange(true); + ceph_assert(!previous_released); + put(); + } + void put() { + uint32_t previous_ref = ref--; + ceph_assert(previous_ref > 0); + + if (previous_ref == 1) { + delete this; + } + } + + void 
set_event_notify(bool s) { + event_notify = s; + } + + void *get_arg() { + return complete_arg; + } + +private: + void queue_complete(); + void complete_external_callback(); + void complete_event_socket(); + void notify_callbacks_complete(); +}; + +class C_AioRequest : public Context { +public: + C_AioRequest(AioCompletion *completion) : m_completion(completion) { + m_completion->add_request(); + } + ~C_AioRequest() override {} + void finish(int r) override { + m_completion->complete_request(r); + } +protected: + AioCompletion *m_completion; +}; + +} // namespace io +} // namespace librbd + +#endif // CEPH_LIBRBD_IO_AIO_COMPLETION_H diff --git a/src/librbd/io/AsyncOperation.cc b/src/librbd/io/AsyncOperation.cc new file mode 100644 index 000000000..18db2410e --- /dev/null +++ b/src/librbd/io/AsyncOperation.cc @@ -0,0 +1,94 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/io/AsyncOperation.h" +#include "include/ceph_assert.h" +#include "common/dout.h" +#include "librbd/AsioEngine.h" +#include "librbd/ImageCtx.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::io::AsyncOperation: " + +namespace librbd { +namespace io { + +namespace { + +struct C_CompleteFlushes : public Context { + ImageCtx *image_ctx; + std::list<Context *> flush_contexts; + + explicit C_CompleteFlushes(ImageCtx *image_ctx, + std::list<Context *> &&flush_contexts) + : image_ctx(image_ctx), flush_contexts(std::move(flush_contexts)) { + } + void finish(int r) override { + std::shared_lock owner_locker{image_ctx->owner_lock}; + while (!flush_contexts.empty()) { + Context *flush_ctx = flush_contexts.front(); + flush_contexts.pop_front(); + + ldout(image_ctx->cct, 20) << "completed flush: " << flush_ctx << dendl; + flush_ctx->complete(0); + } + } +}; + +} // anonymous namespace + +void AsyncOperation::start_op(ImageCtx &image_ctx) { + ceph_assert(m_image_ctx == NULL); + m_image_ctx 
= &image_ctx; + + ldout(m_image_ctx->cct, 20) << this << " " << __func__ << dendl; + std::lock_guard l{m_image_ctx->async_ops_lock}; + m_image_ctx->async_ops.push_front(&m_xlist_item); +} + +void AsyncOperation::finish_op() { + ldout(m_image_ctx->cct, 20) << this << " " << __func__ << dendl; + + { + std::lock_guard l{m_image_ctx->async_ops_lock}; + xlist<AsyncOperation *>::iterator iter(&m_xlist_item); + ++iter; + ceph_assert(m_xlist_item.remove_myself()); + + // linked list stored newest -> oldest ops + if (!iter.end() && !m_flush_contexts.empty()) { + ldout(m_image_ctx->cct, 20) << "moving flush contexts to previous op: " + << *iter << dendl; + (*iter)->m_flush_contexts.insert((*iter)->m_flush_contexts.end(), + m_flush_contexts.begin(), + m_flush_contexts.end()); + return; + } + } + + if (!m_flush_contexts.empty()) { + C_CompleteFlushes *ctx = new C_CompleteFlushes(m_image_ctx, + std::move(m_flush_contexts)); + m_image_ctx->asio_engine->post(ctx, 0); + } +} + +void AsyncOperation::flush(Context* on_finish) { + { + std::lock_guard locker{m_image_ctx->async_ops_lock}; + xlist<AsyncOperation *>::iterator iter(&m_xlist_item); + ++iter; + + // linked list stored newest -> oldest ops + if (!iter.end()) { + (*iter)->m_flush_contexts.push_back(on_finish); + return; + } + } + + m_image_ctx->asio_engine->post(on_finish, 0); +} + +} // namespace io +} // namespace librbd diff --git a/src/librbd/io/AsyncOperation.h b/src/librbd/io/AsyncOperation.h new file mode 100644 index 000000000..b0a37c4b8 --- /dev/null +++ b/src/librbd/io/AsyncOperation.h @@ -0,0 +1,52 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef LIBRBD_IO_ASYNC_OPERATION_H +#define LIBRBD_IO_ASYNC_OPERATION_H + +#include "include/ceph_assert.h" +#include "include/xlist.h" +#include <list> + +class Context; + +namespace librbd { + +class ImageCtx; + +namespace io { + +class AsyncOperation { +public: + + AsyncOperation() + : m_image_ctx(NULL), 
m_xlist_item(this) + { + } + + ~AsyncOperation() + { + ceph_assert(!m_xlist_item.is_on_list()); + } + + inline bool started() const { + return m_xlist_item.is_on_list(); + } + + void start_op(ImageCtx &image_ctx); + void finish_op(); + + void flush(Context *on_finish); + +private: + + ImageCtx *m_image_ctx; + xlist<AsyncOperation *>::item m_xlist_item; + std::list<Context *> m_flush_contexts; + +}; + +} // namespace io +} // namespace librbd + +#endif // LIBRBD_IO_ASYNC_OPERATION_H diff --git a/src/librbd/io/CopyupRequest.cc b/src/librbd/io/CopyupRequest.cc new file mode 100644 index 000000000..228f95980 --- /dev/null +++ b/src/librbd/io/CopyupRequest.cc @@ -0,0 +1,773 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/io/CopyupRequest.h" +#include "include/neorados/RADOS.hpp" +#include "common/ceph_context.h" +#include "common/ceph_mutex.h" +#include "common/dout.h" +#include "common/errno.h" +#include "librbd/AsioEngine.h" +#include "librbd/AsyncObjectThrottle.h" +#include "librbd/ExclusiveLock.h" +#include "librbd/ImageCtx.h" +#include "librbd/ObjectMap.h" +#include "librbd/Utils.h" +#include "librbd/asio/ContextWQ.h" +#include "librbd/asio/Utils.h" +#include "librbd/deep_copy/ObjectCopyRequest.h" +#include "librbd/io/AioCompletion.h" +#include "librbd/io/ImageDispatchSpec.h" +#include "librbd/io/ObjectDispatcherInterface.h" +#include "librbd/io/ObjectRequest.h" +#include "librbd/io/ReadResult.h" +#include "librbd/io/Utils.h" + +#include <boost/lambda/bind.hpp> +#include <boost/lambda/construct.hpp> + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::io::CopyupRequest: " << this \ + << " " << __func__ << ": " \ + << data_object_name(m_image_ctx, m_object_no) << " " + +namespace librbd { +namespace io { + +using librbd::util::data_object_name; + +namespace { + +template <typename I> +class C_UpdateObjectMap : public C_AsyncObjectThrottle<I> { 
+public: + C_UpdateObjectMap(AsyncObjectThrottle<I> &throttle, I *image_ctx, + uint64_t object_no, uint8_t head_object_map_state, + const std::vector<uint64_t> *snap_ids, + bool first_snap_is_clean, const ZTracer::Trace &trace, + size_t snap_id_idx) + : C_AsyncObjectThrottle<I>(throttle, *image_ctx), m_object_no(object_no), + m_head_object_map_state(head_object_map_state), m_snap_ids(*snap_ids), + m_first_snap_is_clean(first_snap_is_clean), m_trace(trace), + m_snap_id_idx(snap_id_idx) + { + } + + int send() override { + auto& image_ctx = this->m_image_ctx; + ceph_assert(ceph_mutex_is_locked(image_ctx.owner_lock)); + if (image_ctx.exclusive_lock == nullptr) { + return 1; + } + ceph_assert(image_ctx.exclusive_lock->is_lock_owner()); + + std::shared_lock image_locker{image_ctx.image_lock}; + if (image_ctx.object_map == nullptr) { + return 1; + } + + uint64_t snap_id = m_snap_ids[m_snap_id_idx]; + if (snap_id == CEPH_NOSNAP) { + return update_head(); + } else { + return update_snapshot(snap_id); + } + } + + int update_head() { + auto& image_ctx = this->m_image_ctx; + ceph_assert(ceph_mutex_is_locked(image_ctx.image_lock)); + + bool sent = image_ctx.object_map->template aio_update<Context>( + CEPH_NOSNAP, m_object_no, m_head_object_map_state, {}, m_trace, false, + this); + return (sent ? 0 : 1); + } + + int update_snapshot(uint64_t snap_id) { + auto& image_ctx = this->m_image_ctx; + ceph_assert(ceph_mutex_is_locked(image_ctx.image_lock)); + + uint8_t state = OBJECT_EXISTS; + if (image_ctx.test_features(RBD_FEATURE_FAST_DIFF, image_ctx.image_lock) && + (m_snap_id_idx > 0 || m_first_snap_is_clean)) { + // first snapshot should be exists+dirty since it contains + // the copyup data -- later snapshots inherit the data. 
+ state = OBJECT_EXISTS_CLEAN; + } + + bool sent = image_ctx.object_map->template aio_update<Context>( + snap_id, m_object_no, state, {}, m_trace, true, this); + ceph_assert(sent); + return 0; + } + +private: + uint64_t m_object_no; + uint8_t m_head_object_map_state; + const std::vector<uint64_t> &m_snap_ids; + bool m_first_snap_is_clean; + const ZTracer::Trace &m_trace; + size_t m_snap_id_idx; +}; + +} // anonymous namespace + +template <typename I> +CopyupRequest<I>::CopyupRequest(I *ictx, uint64_t objectno, + Extents &&image_extents, ImageArea area, + const ZTracer::Trace &parent_trace) + : m_image_ctx(ictx), m_object_no(objectno), + m_image_extents(std::move(image_extents)), m_image_area(area), + m_trace(librbd::util::create_trace(*m_image_ctx, "copy-up", parent_trace)) +{ + ceph_assert(m_image_ctx->data_ctx.is_valid()); + m_async_op.start_op(*librbd::util::get_image_ctx(m_image_ctx)); +} + +template <typename I> +CopyupRequest<I>::~CopyupRequest() { + ceph_assert(m_pending_requests.empty()); + m_async_op.finish_op(); +} + +template <typename I> +void CopyupRequest<I>::append_request(AbstractObjectWriteRequest<I> *req, + const Extents& object_extents) { + std::lock_guard locker{m_lock}; + + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "object_request=" << req << ", " + << "append=" << m_append_request_permitted << dendl; + if (m_append_request_permitted) { + m_pending_requests.push_back(req); + + for (auto [offset, length] : object_extents) { + if (length > 0) { + m_write_object_extents.union_insert(offset, length); + } + } + } else { + m_restart_requests.push_back(req); + } +} + +template <typename I> +void CopyupRequest<I>::send() { + read_from_parent(); +} + +template <typename I> +void CopyupRequest<I>::read_from_parent() { + auto cct = m_image_ctx->cct; + std::shared_lock image_locker{m_image_ctx->image_lock}; + + if (m_image_ctx->parent == nullptr) { + ldout(cct, 5) << "parent detached" << dendl; + + m_image_ctx->asio_engine->post( + [this]() { 
handle_read_from_parent(-ENOENT); }); + return; + } else if (is_deep_copy()) { + deep_copy(); + return; + } + + auto comp = AioCompletion::create_and_start< + CopyupRequest<I>, + &CopyupRequest<I>::handle_read_from_parent>( + this, librbd::util::get_image_ctx(m_image_ctx->parent), AIO_TYPE_READ); + + ldout(cct, 20) << "completion=" << comp + << " image_extents=" << m_image_extents + << " area=" << m_image_area << dendl; + auto req = io::ImageDispatchSpec::create_read( + *m_image_ctx->parent, io::IMAGE_DISPATCH_LAYER_INTERNAL_START, comp, + std::move(m_image_extents), m_image_area, + ReadResult{&m_copyup_extent_map, &m_copyup_data}, + m_image_ctx->parent->get_data_io_context(), 0, 0, m_trace); + req->send(); +} + +template <typename I> +void CopyupRequest<I>::handle_read_from_parent(int r) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "r=" << r << dendl; + + if (r < 0 && r != -ENOENT) { + m_lock.lock(); + disable_append_requests(); + m_lock.unlock(); + + lderr(cct) << "error reading from parent: " << cpp_strerror(r) << dendl; + finish(r); + return; + } + + convert_copyup_extent_map(); + + m_image_ctx->image_lock.lock_shared(); + m_lock.lock(); + disable_append_requests(); + + r = prepare_copyup_data(); + if (r < 0) { + m_lock.unlock(); + m_image_ctx->image_lock.unlock_shared(); + + lderr(m_image_ctx->cct) << "failed to prepare copyup data: " + << cpp_strerror(r) << dendl; + finish(r); + return; + } + + m_copyup_is_zero = m_copyup_data.is_zero(); + m_copyup_required = is_copyup_required(); + if (!m_copyup_required) { + m_lock.unlock(); + m_image_ctx->image_lock.unlock_shared(); + + ldout(cct, 20) << "no-op, skipping" << dendl; + finish(0); + return; + } + + // copyup() will affect snapshots only if parent data is not all + // zeros. 
+ if (!m_copyup_is_zero) { + m_snap_ids.insert(m_snap_ids.end(), m_image_ctx->snaps.rbegin(), + m_image_ctx->snaps.rend()); + } + + m_lock.unlock(); + m_image_ctx->image_lock.unlock_shared(); + + update_object_maps(); +} + +template <typename I> +void CopyupRequest<I>::deep_copy() { + auto cct = m_image_ctx->cct; + ceph_assert(ceph_mutex_is_locked(m_image_ctx->image_lock)); + ceph_assert(m_image_ctx->parent != nullptr); + + m_lock.lock(); + m_deep_copied = true; + m_flatten = is_copyup_required() ? true : m_image_ctx->migration_info.flatten; + m_lock.unlock(); + + ldout(cct, 20) << "flatten=" << m_flatten << dendl; + + uint32_t flags = deep_copy::OBJECT_COPY_REQUEST_FLAG_MIGRATION; + if (m_flatten) { + flags |= deep_copy::OBJECT_COPY_REQUEST_FLAG_FLATTEN; + } + + auto ctx = librbd::util::create_context_callback< + CopyupRequest<I>, &CopyupRequest<I>::handle_deep_copy>(this); + auto req = deep_copy::ObjectCopyRequest<I>::create( + m_image_ctx->parent, m_image_ctx, 0, 0, + m_image_ctx->migration_info.snap_map, m_object_no, flags, nullptr, ctx); + + req->send(); +} + +template <typename I> +void CopyupRequest<I>::handle_deep_copy(int r) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "r=" << r << dendl; + + m_image_ctx->image_lock.lock_shared(); + m_lock.lock(); + m_copyup_required = is_copyup_required(); + if (r == -ENOENT && !m_flatten && m_copyup_required) { + m_lock.unlock(); + m_image_ctx->image_lock.unlock_shared(); + + ldout(cct, 10) << "restart deep-copy with flatten" << dendl; + send(); + return; + } + + disable_append_requests(); + + if (r < 0 && r != -ENOENT) { + m_lock.unlock(); + m_image_ctx->image_lock.unlock_shared(); + + lderr(cct) << "error encountered during deep-copy: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + + if (!m_copyup_required && !is_update_object_map_required(r)) { + m_lock.unlock(); + m_image_ctx->image_lock.unlock_shared(); + + if (r == -ENOENT) { + r = 0; + } + + ldout(cct, 20) << "skipping" << dendl; + finish(r); 
+ return; + } + + // For deep-copy, copyup() will never affect snapshots. However, + // this state machine is responsible for updating object maps for + // snapshots that have been created on destination image after + // migration started. + if (r != -ENOENT) { + compute_deep_copy_snap_ids(); + } + + m_lock.unlock(); + m_image_ctx->image_lock.unlock_shared(); + + update_object_maps(); +} + +template <typename I> +void CopyupRequest<I>::update_object_maps() { + std::shared_lock owner_locker{m_image_ctx->owner_lock}; + std::shared_lock image_locker{m_image_ctx->image_lock}; + if (m_image_ctx->object_map == nullptr) { + image_locker.unlock(); + owner_locker.unlock(); + + copyup(); + return; + } + + auto cct = m_image_ctx->cct; + ldout(cct, 20) << dendl; + + bool copy_on_read = m_pending_requests.empty(); + uint8_t head_object_map_state = OBJECT_EXISTS; + if (copy_on_read && !m_snap_ids.empty() && + m_image_ctx->test_features(RBD_FEATURE_FAST_DIFF, + m_image_ctx->image_lock)) { + // HEAD is non-dirty since data is tied to first snapshot + head_object_map_state = OBJECT_EXISTS_CLEAN; + } + + auto r_it = m_pending_requests.rbegin(); + if (r_it != m_pending_requests.rend()) { + // last write-op determines the final object map state + head_object_map_state = (*r_it)->get_pre_write_object_map_state(); + } + + if ((*m_image_ctx->object_map)[m_object_no] != head_object_map_state) { + // (maybe) need to update the HEAD object map state + m_snap_ids.push_back(CEPH_NOSNAP); + } + image_locker.unlock(); + + ceph_assert(m_image_ctx->exclusive_lock->is_lock_owner()); + typename AsyncObjectThrottle<I>::ContextFactory context_factory( + boost::lambda::bind(boost::lambda::new_ptr<C_UpdateObjectMap<I>>(), + boost::lambda::_1, m_image_ctx, m_object_no, head_object_map_state, + &m_snap_ids, m_first_snap_is_clean, m_trace, boost::lambda::_2)); + auto ctx = librbd::util::create_context_callback< + CopyupRequest<I>, &CopyupRequest<I>::handle_update_object_maps>(this); + auto throttle = new 
AsyncObjectThrottle<I>( + nullptr, *m_image_ctx, context_factory, ctx, nullptr, 0, m_snap_ids.size()); + throttle->start_ops( + m_image_ctx->config.template get_val<uint64_t>("rbd_concurrent_management_ops")); +} + +template <typename I> +void CopyupRequest<I>::handle_update_object_maps(int r) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_image_ctx->cct) << "failed to update object map: " + << cpp_strerror(r) << dendl; + + finish(r); + return; + } + + copyup(); +} + +template <typename I> +void CopyupRequest<I>::copyup() { + auto cct = m_image_ctx->cct; + m_image_ctx->image_lock.lock_shared(); + auto snapc = m_image_ctx->snapc; + auto io_context = m_image_ctx->get_data_io_context(); + m_image_ctx->image_lock.unlock_shared(); + + m_lock.lock(); + if (!m_copyup_required) { + m_lock.unlock(); + + ldout(cct, 20) << "skipping copyup" << dendl; + finish(0); + return; + } + + ldout(cct, 20) << dendl; + + bool copy_on_read = m_pending_requests.empty() && !m_deep_copied; + bool deep_copyup = !snapc.snaps.empty() && !m_copyup_is_zero; + if (m_copyup_is_zero) { + m_copyup_data.clear(); + m_copyup_extent_map.clear(); + } + + neorados::WriteOp copyup_op; + neorados::WriteOp write_op; + neorados::WriteOp* op; + if (copy_on_read || deep_copyup) { + // copyup-op will use its own request issued to the initial object revision + op = ©up_op; + ++m_pending_copyups; + } else { + // copyup-op can be combined with the write-ops (if any) + op = &write_op; + } + + if (m_image_ctx->enable_sparse_copyup) { + cls_client::sparse_copyup(op, m_copyup_extent_map, m_copyup_data); + } else { + // convert the sparse read back into a standard (thick) read + Striper::StripedReadResult destriper; + destriper.add_partial_sparse_result( + cct, std::move(m_copyup_data), m_copyup_extent_map, 0, + {{0, m_image_ctx->layout.object_size}}); + + bufferlist thick_bl; + destriper.assemble_result(cct, thick_bl, false); + cls_client::copyup(op, thick_bl); + } + 
ObjectRequest<I>::add_write_hint(*m_image_ctx, op); + + if (!copy_on_read) { + // merge all pending write ops into this single RADOS op + for (auto req : m_pending_requests) { + ldout(cct, 20) << "add_copyup_ops " << req << dendl; + req->add_copyup_ops(&write_op); + } + + if (write_op.size() > 0) { + ++m_pending_copyups; + } + } + m_lock.unlock(); + + // issue librados ops at the end to simplify test cases + auto object = neorados::Object{data_object_name(m_image_ctx, m_object_no)}; + if (copyup_op.size() > 0) { + // send only the copyup request with a blank snapshot context so that + // all snapshots are detected from the parent for this object. If + // this is a CoW request, a second request will be created for the + // actual modification. + ldout(cct, 20) << "copyup with empty snapshot context" << dendl; + + auto copyup_io_context = *io_context; + copyup_io_context.write_snap_context({}); + + m_image_ctx->rados_api.execute( + object, copyup_io_context, std::move(copyup_op), + librbd::asio::util::get_callback_adapter( + [this](int r) { handle_copyup(r); }), nullptr, + (this->m_trace.valid() ? this->m_trace.get_info() : nullptr)); + } + + if (write_op.size() > 0) { + // compare-and-write doesn't add any write ops (copyup+cmpext+write + // can't be executed in the same RADOS op because, unless the object + // was already present in the clone, cmpext wouldn't see it) + ldout(cct, 20) << (!deep_copyup && write_op.size() > 2 ? + "copyup + ops" : !deep_copyup ? "copyup" : "ops") + << " with current snapshot context" << dendl; + + m_image_ctx->rados_api.execute( + object, *io_context, std::move(write_op), + librbd::asio::util::get_callback_adapter( + [this](int r) { handle_copyup(r); }), nullptr, + (this->m_trace.valid() ? 
this->m_trace.get_info() : nullptr)); + } +} + +template <typename I> +void CopyupRequest<I>::handle_copyup(int r) { + auto cct = m_image_ctx->cct; + unsigned pending_copyups; + int copyup_ret_val = r; + { + std::lock_guard locker{m_lock}; + ceph_assert(m_pending_copyups > 0); + pending_copyups = --m_pending_copyups; + if (m_copyup_ret_val < 0) { + copyup_ret_val = m_copyup_ret_val; + } else if (r < 0) { + m_copyup_ret_val = r; + } + } + + ldout(cct, 20) << "r=" << r << ", " + << "pending=" << pending_copyups << dendl; + + if (pending_copyups == 0) { + if (copyup_ret_val < 0 && copyup_ret_val != -ENOENT) { + lderr(cct) << "failed to copyup object: " << cpp_strerror(copyup_ret_val) + << dendl; + complete_requests(false, copyup_ret_val); + } + + finish(0); + } +} + +template <typename I> +void CopyupRequest<I>::finish(int r) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "r=" << r << dendl; + + complete_requests(true, r); + delete this; +} + +template <typename I> +void CopyupRequest<I>::complete_requests(bool override_restart_retval, int r) { + auto cct = m_image_ctx->cct; + remove_from_list(); + + while (!m_pending_requests.empty()) { + auto it = m_pending_requests.begin(); + auto req = *it; + ldout(cct, 20) << "completing request " << req << dendl; + req->handle_copyup(r); + m_pending_requests.erase(it); + } + + if (override_restart_retval) { + r = -ERESTART; + } + + while (!m_restart_requests.empty()) { + auto it = m_restart_requests.begin(); + auto req = *it; + ldout(cct, 20) << "restarting request " << req << dendl; + req->handle_copyup(r); + m_restart_requests.erase(it); + } +} + +template <typename I> +void CopyupRequest<I>::disable_append_requests() { + ceph_assert(ceph_mutex_is_locked(m_lock)); + m_append_request_permitted = false; +} + +template <typename I> +void CopyupRequest<I>::remove_from_list() { + std::lock_guard copyup_list_locker{m_image_ctx->copyup_list_lock}; + + auto it = m_image_ctx->copyup_list.find(m_object_no); + if (it != 
m_image_ctx->copyup_list.end()) { + m_image_ctx->copyup_list.erase(it); + } +} + +template <typename I> +bool CopyupRequest<I>::is_copyup_required() { + ceph_assert(ceph_mutex_is_locked(m_lock)); + + bool copy_on_read = m_pending_requests.empty(); + if (copy_on_read) { + // always force a copyup if CoR enabled + return true; + } + + if (!m_copyup_is_zero) { + return true; + } + + for (auto req : m_pending_requests) { + if (!req->is_empty_write_op()) { + return true; + } + } + return false; +} + +template <typename I> +bool CopyupRequest<I>::is_deep_copy() const { + ceph_assert(ceph_mutex_is_locked(m_image_ctx->image_lock)); + return !m_image_ctx->migration_info.empty(); +} + +template <typename I> +bool CopyupRequest<I>::is_update_object_map_required(int r) { + ceph_assert(ceph_mutex_is_locked(m_image_ctx->image_lock)); + + if (r < 0) { + return false; + } + + if (m_image_ctx->object_map == nullptr) { + return false; + } + + if (m_image_ctx->migration_info.empty()) { + // migration might have completed while IO was in-flight, + // assume worst-case and perform an object map update + return true; + } + + auto it = m_image_ctx->migration_info.snap_map.find(CEPH_NOSNAP); + ceph_assert(it != m_image_ctx->migration_info.snap_map.end()); + return it->second[0] != CEPH_NOSNAP; +} + +template <typename I> +void CopyupRequest<I>::compute_deep_copy_snap_ids() { + ceph_assert(ceph_mutex_is_locked(m_image_ctx->image_lock)); + + // don't copy ids for the snaps updated by object deep copy or + // that don't overlap + std::set<uint64_t> deep_copied; + for (auto &it : m_image_ctx->migration_info.snap_map) { + if (it.first != CEPH_NOSNAP) { + deep_copied.insert(it.second.front()); + } + } + ldout(m_image_ctx->cct, 15) << "deep_copied=" << deep_copied << dendl; + + std::copy_if(m_image_ctx->snaps.rbegin(), m_image_ctx->snaps.rend(), + std::back_inserter(m_snap_ids), + [this, cct=m_image_ctx->cct, &deep_copied](uint64_t snap_id) { + if (deep_copied.count(snap_id)) { + 
m_first_snap_is_clean = true; + return false; + } + + uint64_t raw_overlap = 0; + uint64_t object_overlap = 0; + int r = m_image_ctx->get_parent_overlap(snap_id, &raw_overlap); + if (r < 0) { + ldout(cct, 5) << "failed getting parent overlap for snap_id: " + << snap_id << ": " << cpp_strerror(r) << dendl; + } else if (raw_overlap > 0) { + auto [parent_extents, area] = util::object_to_area_extents( + m_image_ctx, m_object_no, {{0, m_image_ctx->layout.object_size}}); + object_overlap = m_image_ctx->prune_parent_extents(parent_extents, area, + raw_overlap, false); + } + return object_overlap > 0; + }); +} + +template <typename I> +void CopyupRequest<I>::convert_copyup_extent_map() { + auto cct = m_image_ctx->cct; + + Extents image_extent_map; + image_extent_map.swap(m_copyup_extent_map); + m_copyup_extent_map.reserve(image_extent_map.size()); + + // convert the image-extent extent map to object-extents + for (auto [image_offset, image_length] : image_extent_map) { + striper::LightweightObjectExtents object_extents; + util::area_to_object_extents(m_image_ctx, image_offset, image_length, + m_image_area, 0, &object_extents); + for (auto& object_extent : object_extents) { + m_copyup_extent_map.emplace_back( + object_extent.offset, object_extent.length); + } + } + + ldout(cct, 20) << "image_extents=" << image_extent_map << ", " + << "object_extents=" << m_copyup_extent_map << dendl; +} + +template <typename I> +int CopyupRequest<I>::prepare_copyup_data() { + ceph_assert(ceph_mutex_is_locked(m_image_ctx->image_lock)); + auto cct = m_image_ctx->cct; + + SnapshotSparseBufferlist snapshot_sparse_bufferlist; + auto& sparse_bufferlist = snapshot_sparse_bufferlist[0]; + + bool copy_on_read = m_pending_requests.empty(); + bool maybe_deep_copyup = !m_image_ctx->snapc.snaps.empty(); + if (copy_on_read || maybe_deep_copyup) { + // stand-alone copyup that will not be overwritten until HEAD revision + ldout(cct, 20) << "processing full copy-up" << dendl; + + uint64_t buffer_offset = 0; 
+ for (auto [object_offset, object_length] : m_copyup_extent_map) { + bufferlist sub_bl; + sub_bl.substr_of(m_copyup_data, buffer_offset, object_length); + buffer_offset += object_length; + + sparse_bufferlist.insert( + object_offset, object_length, + {SPARSE_EXTENT_STATE_DATA, object_length, std::move(sub_bl)}); + } + } else { + // copyup that will concurrently written to the HEAD revision with the + // associated write-ops so only process partial extents + uint64_t buffer_offset = 0; + for (auto [object_offset, object_length] : m_copyup_extent_map) { + interval_set<uint64_t> copyup_object_extents; + copyup_object_extents.insert(object_offset, object_length); + + interval_set<uint64_t> intersection; + intersection.intersection_of(copyup_object_extents, + m_write_object_extents); + + // extract only portions of the parent copyup data that have not + // been overwritten by write-ops + copyup_object_extents.subtract(intersection); + for (auto [copyup_offset, copyup_length] : copyup_object_extents) { + bufferlist sub_bl; + sub_bl.substr_of( + m_copyup_data, buffer_offset + (copyup_offset - object_offset), + copyup_length); + ceph_assert(sub_bl.length() == copyup_length); + + sparse_bufferlist.insert( + copyup_offset, copyup_length, + {SPARSE_EXTENT_STATE_DATA, copyup_length, std::move(sub_bl)}); + } + buffer_offset += object_length; + } + + ldout(cct, 20) << "processing partial copy-up: " << sparse_bufferlist + << dendl; + } + + // Let dispatch layers have a chance to process the data + auto r = m_image_ctx->io_object_dispatcher->prepare_copyup( + m_object_no, &snapshot_sparse_bufferlist); + if (r < 0) { + return r; + } + + // Convert sparse extents back to extent map + m_copyup_data.clear(); + m_copyup_extent_map.clear(); + m_copyup_extent_map.reserve(sparse_bufferlist.ext_count()); + for (auto& extent : sparse_bufferlist) { + auto& sbe = extent.get_val(); + if (sbe.state == SPARSE_EXTENT_STATE_DATA) { + m_copyup_extent_map.emplace_back(extent.get_off(), 
extent.get_len());
      m_copyup_data.append(sbe.bl);
    }
  }

  return 0;
}

} // namespace io
} // namespace librbd

template class librbd::io::CopyupRequest<librbd::ImageCtx>;
diff --git a/src/librbd/io/CopyupRequest.h b/src/librbd/io/CopyupRequest.h
new file mode 100644
index 000000000..a94139421
--- /dev/null
+++ b/src/librbd/io/CopyupRequest.h
@@ -0,0 +1,145 @@
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab

#ifndef CEPH_LIBRBD_IO_COPYUP_REQUEST_H
#define CEPH_LIBRBD_IO_COPYUP_REQUEST_H

#include "include/int_types.h"
#include "include/buffer.h"
#include "include/interval_set.h"
#include "common/ceph_mutex.h"
#include "common/zipkin_trace.h"
#include "librbd/io/AsyncOperation.h"
#include "librbd/io/Types.h"

#include <map>
#include <string>
#include <vector>

namespace ZTracer { struct Trace; }

namespace librbd {

struct ImageCtx;

namespace io {

template <typename I> class AbstractObjectWriteRequest;

// Copies a backing object's data from the parent image (or deep-copies it
// during migration) before associated write requests modify the clone.
// Instances are heap allocated and self-destruct on completion.
template <typename ImageCtxT = librbd::ImageCtx>
class CopyupRequest {
public:
  static CopyupRequest* create(ImageCtxT *ictx, uint64_t objectno,
                               Extents &&image_extents, ImageArea area,
                               const ZTracer::Trace &parent_trace) {
    return new CopyupRequest(ictx, objectno, std::move(image_extents), area,
                             parent_trace);
  }

  CopyupRequest(ImageCtxT *ictx, uint64_t objectno,
                Extents &&image_extents, ImageArea area,
                const ZTracer::Trace &parent_trace);
  ~CopyupRequest();

  // fold an additional write request (covering object_extents) into this
  // in-flight copyup
  void append_request(AbstractObjectWriteRequest<ImageCtxT> *req,
                      const Extents& object_extents);

  void send();

private:
  /**
   * Copyup requests go through the following state machine to read from the
   * parent image, update the object map, and copyup the object:
   *
   *
   * @verbatim
   *
   *              <start>
   *                 |
   *      /---------/ \---------\
   *      |                     |
   *      v                     v
   * READ_FROM_PARENT      DEEP_COPY
   *      |                     |
   *      \---------\ /---------/
   *                 |
   *                 v (skip if not needed)
   *         UPDATE_OBJECT_MAPS
   *                 |
   *                 v (skip if not needed)
   *              COPYUP
   *                 |
   *                 v
   *              <finish>
   *
   * @endverbatim
   *
   * The OBJECT_MAP state is skipped if the object map isn't enabled or if
   * an object map update isn't required. The COPYUP state is skipped if
   * no data was read from the parent *and* there are no additional ops.
   */

  typedef std::vector<AbstractObjectWriteRequest<ImageCtxT> *> WriteRequests;

  ImageCtxT *m_image_ctx;
  uint64_t m_object_no;
  Extents m_image_extents;
  ImageArea m_image_area;
  ZTracer::Trace m_trace;

  bool m_flatten = false;
  bool m_copyup_required = true;
  // true while the data read from the parent is all zeroes
  bool m_copyup_is_zero = true;
  bool m_deep_copied = false;

  // extents/data of the parent copyup payload
  Extents m_copyup_extent_map;
  ceph::bufferlist m_copyup_data;

  AsyncOperation m_async_op;

  std::vector<uint64_t> m_snap_ids;
  bool m_first_snap_is_clean = false;

  // protects the request lists and completion bookkeeping below
  ceph::mutex m_lock = ceph::make_mutex("CopyupRequest", false);
  WriteRequests m_pending_requests;
  unsigned m_pending_copyups = 0;
  int m_copyup_ret_val = 0;

  // requests that arrived too late to append; completed with -ERESTART
  WriteRequests m_restart_requests;
  bool m_append_request_permitted = true;

  interval_set<uint64_t> m_write_object_extents;

  void read_from_parent();
  void handle_read_from_parent(int r);

  void deep_copy();
  void handle_deep_copy(int r);

  void update_object_maps();
  void handle_update_object_maps(int r);

  void copyup();
  void handle_copyup(int r);

  void finish(int r);
  void complete_requests(bool override_restart_retval, int r);

  void disable_append_requests();
  void remove_from_list();

  bool is_copyup_required();
  bool is_update_object_map_required(int r);
  bool is_deep_copy() const;

  void compute_deep_copy_snap_ids();
  void convert_copyup_extent_map();
  int prepare_copyup_data();
};

} // namespace io
} // namespace librbd

extern template class librbd::io::CopyupRequest<librbd::ImageCtx>;

#endif // CEPH_LIBRBD_IO_COPYUP_REQUEST_H
diff --git a/src/librbd/io/Dispatcher.h b/src/librbd/io/Dispatcher.h
new file mode 100644
index 000000000..cb64e11b2
--- /dev/null
+++ 
b/src/librbd/io/Dispatcher.h @@ -0,0 +1,252 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_IO_DISPATCHER_H +#define CEPH_LIBRBD_IO_DISPATCHER_H + +#include "include/int_types.h" +#include "include/Context.h" +#include "common/ceph_mutex.h" +#include "common/dout.h" +#include "common/AsyncOpTracker.h" +#include "librbd/Utils.h" +#include "librbd/io/DispatcherInterface.h" +#include "librbd/io/Types.h" +#include <map> + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::io::Dispatcher: " << this \ + << " " << __func__ << ": " + +namespace librbd { +namespace io { + +template <typename ImageCtxT, typename DispatchInterfaceT> +class Dispatcher : public DispatchInterfaceT { +public: + typedef typename DispatchInterfaceT::Dispatch Dispatch; + typedef typename DispatchInterfaceT::DispatchLayer DispatchLayer; + typedef typename DispatchInterfaceT::DispatchSpec DispatchSpec; + + Dispatcher(ImageCtxT* image_ctx) + : m_image_ctx(image_ctx), + m_lock(ceph::make_shared_mutex( + librbd::util::unique_lock_name("librbd::io::Dispatcher::lock", + this))) { + } + + virtual ~Dispatcher() { + ceph_assert(m_dispatches.empty()); + } + + void shut_down(Context* on_finish) override { + auto cct = m_image_ctx->cct; + ldout(cct, 5) << dendl; + + std::map<DispatchLayer, DispatchMeta> dispatches; + { + std::unique_lock locker{m_lock}; + std::swap(dispatches, m_dispatches); + } + + for (auto it : dispatches) { + shut_down_dispatch(it.second, &on_finish); + } + on_finish->complete(0); + } + + void register_dispatch(Dispatch* dispatch) override { + auto cct = m_image_ctx->cct; + auto type = dispatch->get_dispatch_layer(); + ldout(cct, 5) << "dispatch_layer=" << type << dendl; + + std::unique_lock locker{m_lock}; + + auto result = m_dispatches.insert( + {type, {dispatch, new AsyncOpTracker()}}); + ceph_assert(result.second); + } + + bool exists(DispatchLayer dispatch_layer) 
override { + std::unique_lock locker{m_lock}; + return m_dispatches.find(dispatch_layer) != m_dispatches.end(); + } + + void shut_down_dispatch(DispatchLayer dispatch_layer, + Context* on_finish) override { + auto cct = m_image_ctx->cct; + ldout(cct, 5) << "dispatch_layer=" << dispatch_layer << dendl; + + DispatchMeta dispatch_meta; + { + std::unique_lock locker{m_lock}; + auto it = m_dispatches.find(dispatch_layer); + if (it == m_dispatches.end()) { + on_finish->complete(0); + return; + } + + dispatch_meta = it->second; + m_dispatches.erase(it); + } + + shut_down_dispatch(dispatch_meta, &on_finish); + on_finish->complete(0); + } + + void send(DispatchSpec* dispatch_spec) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "dispatch_spec=" << dispatch_spec << dendl; + + auto dispatch_layer = dispatch_spec->dispatch_layer; + + // apply the IO request to all layers -- this method will be re-invoked + // by the dispatch layer if continuing / restarting the IO + while (true) { + m_lock.lock_shared(); + dispatch_layer = dispatch_spec->dispatch_layer; + auto it = m_dispatches.upper_bound(dispatch_layer); + if (it == m_dispatches.end()) { + // the request is complete if handled by all layers + dispatch_spec->dispatch_result = DISPATCH_RESULT_COMPLETE; + m_lock.unlock_shared(); + break; + } + + auto& dispatch_meta = it->second; + auto dispatch = dispatch_meta.dispatch; + auto async_op_tracker = dispatch_meta.async_op_tracker; + dispatch_spec->dispatch_result = DISPATCH_RESULT_INVALID; + + // prevent recursive locking back into the dispatcher while handling IO + async_op_tracker->start_op(); + m_lock.unlock_shared(); + + // advance to next layer in case we skip or continue + dispatch_spec->dispatch_layer = dispatch->get_dispatch_layer(); + + bool handled = send_dispatch(dispatch, dispatch_spec); + async_op_tracker->finish_op(); + + // handled ops will resume when the dispatch ctx is invoked + if (handled) { + return; + } + } + + // skipped through to the last layer + 
dispatch_spec->dispatcher_ctx.complete(0);
  }

protected:
  // bookkeeping for one registered layer: the layer itself plus a tracker
  // for IOs currently executing inside it
  struct DispatchMeta {
    Dispatch* dispatch = nullptr;
    AsyncOpTracker* async_op_tracker = nullptr;

    DispatchMeta() {
    }
    DispatchMeta(Dispatch* dispatch, AsyncOpTracker* async_op_tracker)
      : dispatch(dispatch), async_op_tracker(async_op_tracker) {
    }
  };

  ImageCtxT* m_image_ctx;

  // protects m_dispatches
  ceph::shared_mutex m_lock;
  std::map<DispatchLayer, DispatchMeta> m_dispatches;

  // invoke the layer-specific handler; returns true if the layer took
  // ownership of the IO
  virtual bool send_dispatch(Dispatch* dispatch,
                             DispatchSpec* dispatch_spec) = 0;

protected:
  // Context that visits every layer above dispatch_layer, invoking
  // execute() on each; re-invoked (via complete) when a layer finishes
  // asynchronously.
  struct C_LayerIterator : public Context {
    Dispatcher* dispatcher;
    Context* on_finish;
    DispatchLayer dispatch_layer;

    C_LayerIterator(Dispatcher* dispatcher,
                    DispatchLayer start_layer,
                    Context* on_finish)
      : dispatcher(dispatcher), on_finish(on_finish), dispatch_layer(start_layer) {
    }

    void complete(int r) override {
      while (true) {
        dispatcher->m_lock.lock_shared();
        auto it = dispatcher->m_dispatches.upper_bound(dispatch_layer);
        if (it == dispatcher->m_dispatches.end()) {
          // visited every layer -- run finish() and self-delete
          dispatcher->m_lock.unlock_shared();
          Context::complete(r);
          return;
        }

        auto& dispatch_meta = it->second;
        auto dispatch = dispatch_meta.dispatch;

        // prevent recursive locking back into the dispatcher while handling IO
        dispatch_meta.async_op_tracker->start_op();
        dispatcher->m_lock.unlock_shared();

        // next loop should start after current layer
        dispatch_layer = dispatch->get_dispatch_layer();

        auto handled = execute(dispatch, this);
        dispatch_meta.async_op_tracker->finish_op();

        if (handled) {
          break;
        }
      }
    }

    void finish(int r) override {
      // NOTE(review): r is discarded -- the caller always observes
      // success; confirm this is intentional
      on_finish->complete(0);
    }
    virtual bool execute(Dispatch* dispatch,
                         Context* on_finish) = 0;
  };

  // layer iterator that asks every layer to invalidate its cache
  struct C_InvalidateCache : public C_LayerIterator {
    C_InvalidateCache(Dispatcher* dispatcher, DispatchLayer start_layer, Context* on_finish)
      : C_LayerIterator(dispatcher, start_layer, on_finish) {
    }

    bool execute(Dispatch* dispatch,
                 Context* on_finish) override {
      return dispatch->invalidate_cache(on_finish);
    }
  };

private:
  // Builds, in front of *on_finish, the chain:
  //   wait for in-flight ops -> dispatch->shut_down() ->
  //   delete dispatch + tracker -> original *on_finish
  void shut_down_dispatch(DispatchMeta& dispatch_meta,
                          Context** on_finish) {
    auto dispatch = dispatch_meta.dispatch;
    auto async_op_tracker = dispatch_meta.async_op_tracker;

    auto ctx = *on_finish;
    ctx = new LambdaContext(
      [dispatch, async_op_tracker, ctx](int r) {
        delete dispatch;
        delete async_op_tracker;

        ctx->complete(r);
      });
    ctx = new LambdaContext([dispatch, ctx](int r) {
        dispatch->shut_down(ctx);
      });
    *on_finish = new LambdaContext([async_op_tracker, ctx](int r) {
        async_op_tracker->wait_for_ops(ctx);
      });
  }

};

} // namespace io
} // namespace librbd

#undef dout_subsys
#undef dout_prefix
#define dout_prefix *_dout

#endif // CEPH_LIBRBD_IO_DISPATCHER_H
diff --git a/src/librbd/io/DispatcherInterface.h b/src/librbd/io/DispatcherInterface.h
new file mode 100644
index 000000000..2bac9ee75
--- /dev/null
+++ b/src/librbd/io/DispatcherInterface.h
@@ -0,0 +1,37 @@
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab

#ifndef CEPH_LIBRBD_IO_DISPATCHER_INTERFACE_H
#define CEPH_LIBRBD_IO_DISPATCHER_INTERFACE_H

#include "include/int_types.h"

struct Context;

namespace librbd {
namespace io {

// pure-virtual interface implemented by Dispatcher<>, parameterized on
// the per-layer dispatch type
template <typename DispatchT>
struct DispatcherInterface {
public:
  typedef DispatchT Dispatch;
  typedef typename DispatchT::DispatchLayer DispatchLayer;
  typedef typename DispatchT::DispatchSpec DispatchSpec;

  virtual ~DispatcherInterface() {
  }

  virtual void shut_down(Context* on_finish) = 0;

  virtual void register_dispatch(Dispatch* dispatch) = 0;
  virtual bool exists(DispatchLayer dispatch_layer) = 0;
  virtual void shut_down_dispatch(DispatchLayer dispatch_layer,
                                  Context* on_finish) = 0;

  virtual void send(DispatchSpec* dispatch_spec) = 0;
};

} // namespace io
} // namespace librbd

#endif // CEPH_LIBRBD_IO_DISPATCHER_INTERFACE_H
diff --git 
a/src/librbd/io/FlushTracker.cc b/src/librbd/io/FlushTracker.cc
new file mode 100644
index 000000000..b6e2ed658
--- /dev/null
+++ b/src/librbd/io/FlushTracker.cc
@@ -0,0 +1,126 @@
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab

#include "librbd/io/FlushTracker.h"
#include "common/dout.h"
#include "librbd/ImageCtx.h"
#include "librbd/Utils.h"

#define dout_subsys ceph_subsys_rbd
#undef dout_prefix
#define dout_prefix *_dout << "librbd::io::FlushTracker: " << this \
                           << " " << __func__ << ": "

namespace librbd {
namespace io {

template <typename I>
FlushTracker<I>::FlushTracker(I* image_ctx)
  : m_image_ctx(image_ctx),
    m_lock(ceph::make_shared_mutex(
      util::unique_lock_name("librbd::io::FlushTracker::m_lock", this))) {
}

template <typename I>
FlushTracker<I>::~FlushTracker() {
  // all flush listeners must have been completed (or shut down) by now
  std::unique_lock locker{m_lock};
  ceph_assert(m_flush_contexts.empty());
}

// Complete every pending flush callback with success.
template <typename I>
void FlushTracker<I>::shut_down() {
  auto cct = m_image_ctx->cct;
  ldout(cct, 20) << dendl;

  std::unique_lock locker{m_lock};
  Contexts flush_ctxs;
  for (auto& [flush_tid, ctxs] : m_flush_contexts) {
    flush_ctxs.insert(flush_ctxs.end(), ctxs.begin(), ctxs.end());
  }
  m_flush_contexts.clear();
  locker.unlock();

  // complete outside the lock to avoid re-entrant locking from callbacks
  for (auto ctx : flush_ctxs) {
    ctx->complete(0);
  }
}

// Tag an in-flight IO with a monotonically increasing flush tid; repeated
// calls for the same tid reuse the originally assigned flush tid.
template <typename I>
uint64_t FlushTracker<I>::start_io(uint64_t tid) {
  auto cct = m_image_ctx->cct;

  std::unique_lock locker{m_lock};
  auto [it, inserted] = m_tid_to_flush_tid.insert({tid, ++m_next_flush_tid});
  auto flush_tid = it->second;
  m_in_flight_flush_tids.insert(flush_tid);
  locker.unlock();

  ldout(cct, 20) << "tid=" << tid << ", flush_tid=" << flush_tid << dendl;
  return flush_tid;
}

// Retire an in-flight IO and fire the flush callbacks that were waiting
// only on flush tids older than the oldest still-in-flight one.
template <typename I>
void FlushTracker<I>::finish_io(uint64_t tid) {
  auto cct = m_image_ctx->cct;

  std::unique_lock locker{m_lock};
  auto tid_to_flush_tid_it = m_tid_to_flush_tid.find(tid);
  if (tid_to_flush_tid_it == m_tid_to_flush_tid.end()) {
    // unknown tid -- nothing was tracked for it
    return;
  }

  auto flush_tid = tid_to_flush_tid_it->second;
  m_tid_to_flush_tid.erase(tid_to_flush_tid_it);
  m_in_flight_flush_tids.erase(flush_tid);

  ldout(cct, 20) << "tid=" << tid << ", flush_tid=" << flush_tid << dendl;
  auto oldest_flush_tid = std::numeric_limits<uint64_t>::max();
  if (!m_in_flight_flush_tids.empty()) {
    oldest_flush_tid = *m_in_flight_flush_tids.begin();
  }

  // all flushes tagged before the oldest tid should be completed
  Contexts flush_ctxs;
  auto flush_contexts_it = m_flush_contexts.begin();
  while (flush_contexts_it != m_flush_contexts.end()) {
    if (flush_contexts_it->first >= oldest_flush_tid) {
      ldout(cct, 20) << "pending IOs: [" << m_in_flight_flush_tids << "], "
                     << "pending flushes=" << m_flush_contexts << dendl;
      break;
    }

    auto& ctxs = flush_contexts_it->second;
    flush_ctxs.insert(flush_ctxs.end(), ctxs.begin(), ctxs.end());
    flush_contexts_it = m_flush_contexts.erase(flush_contexts_it);
  }
  locker.unlock();

  if (!flush_ctxs.empty()) {
    ldout(cct, 20) << "completing flushes: " << flush_ctxs << dendl;
    for (auto ctx : flush_ctxs) {
      ctx->complete(0);
    }
  }
}

// Register a flush callback: it fires once every IO currently in flight
// has finished; completes immediately if nothing is in flight.
template <typename I>
void FlushTracker<I>::flush(Context* on_finish) {
  auto cct = m_image_ctx->cct;

  std::unique_lock locker{m_lock};
  if (m_in_flight_flush_tids.empty()) {
    locker.unlock();
    on_finish->complete(0);
    return;
  }

  // wait on the most recently issued flush tid
  auto flush_tid = *m_in_flight_flush_tids.rbegin();
  m_flush_contexts[flush_tid].push_back(on_finish);
  ldout(cct, 20) << "flush_tid=" << flush_tid << ", ctx=" << on_finish << ", "
                 << "flush_contexts=" << m_flush_contexts << dendl;
}

} // namespace io
} // namespace librbd

template class librbd::io::FlushTracker<librbd::ImageCtx>;
diff --git a/src/librbd/io/FlushTracker.h b/src/librbd/io/FlushTracker.h
new file mode 100644
index 000000000..cc7fcd9ae
--- /dev/null
+++ b/src/librbd/io/FlushTracker.h
@@ -0,0 +1,61 @@
// -*- mode:C++; tab-width:8; 
c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_IO_FLUSH_TRACKER_H +#define CEPH_LIBRBD_IO_FLUSH_TRACKER_H + +#include "include/int_types.h" +#include "common/ceph_mutex.h" +#include <atomic> +#include <list> +#include <map> +#include <set> +#include <unordered_map> + +struct Context; + +namespace librbd { + +struct ImageCtx; + +namespace io { + +struct AioCompletion; + +template <typename ImageCtxT> +class FlushTracker { +public: + FlushTracker(ImageCtxT* image_ctx); + ~FlushTracker(); + + void shut_down(); + + uint64_t start_io(uint64_t tid); + void finish_io(uint64_t tid); + + void flush(Context* on_finish); + +private: + typedef std::list<Context*> Contexts; + typedef std::map<uint64_t, Contexts> FlushContexts; + typedef std::set<uint64_t> Tids; + typedef std::unordered_map<uint64_t, uint64_t> TidToFlushTid; + + ImageCtxT* m_image_ctx; + + std::atomic<uint32_t> m_next_flush_tid{0}; + + mutable ceph::shared_mutex m_lock; + TidToFlushTid m_tid_to_flush_tid; + + Tids m_in_flight_flush_tids; + FlushContexts m_flush_contexts; + +}; + +} // namespace io +} // namespace librbd + +extern template class librbd::io::FlushTracker<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_IO_FLUSH_TRACKER_H diff --git a/src/librbd/io/ImageDispatch.cc b/src/librbd/io/ImageDispatch.cc new file mode 100644 index 000000000..12c55cb0c --- /dev/null +++ b/src/librbd/io/ImageDispatch.cc @@ -0,0 +1,200 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/io/ImageDispatch.h" +#include "common/dout.h" +#include "librbd/ImageCtx.h" +#include "librbd/io/AioCompletion.h" +#include "librbd/io/ImageRequest.h" +#include "librbd/io/ObjectDispatcherInterface.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::io::ImageDispatch: " << this << " " \ + << __func__ << ": " + +namespace librbd { +namespace io { + +namespace { + +void 
start_in_flight_io(AioCompletion* aio_comp) {
  // TODO remove AsyncOperation from AioCompletion
  if (!aio_comp->async_op.started()) {
    aio_comp->start_op();
  }
}

// derive the target image area (crypto header vs. data) from the
// per-request dispatch flags
ImageArea get_area(const std::atomic<uint32_t>* image_dispatch_flags) {
  return (*image_dispatch_flags & IMAGE_DISPATCH_FLAG_CRYPTO_HEADER ?
          ImageArea::CRYPTO_HEADER : ImageArea::DATA);
}

} // anonymous namespace

// the core layer has no shut-down work of its own
template <typename I>
void ImageDispatch<I>::shut_down(Context* on_finish) {
  on_finish->complete(0);
}

// terminal layer: convert the dispatched read into an ImageRequest
template <typename I>
bool ImageDispatch<I>::read(
    AioCompletion* aio_comp, Extents &&image_extents, ReadResult &&read_result,
    IOContext io_context, int op_flags, int read_flags,
    const ZTracer::Trace &parent_trace, uint64_t tid,
    std::atomic<uint32_t>* image_dispatch_flags,
    DispatchResult* dispatch_result, Context** on_finish,
    Context* on_dispatched) {
  auto cct = m_image_ctx->cct;
  auto area = get_area(image_dispatch_flags);
  ldout(cct, 20) << "image_extents=" << image_extents
                 << " area=" << area << dendl;

  start_in_flight_io(aio_comp);

  *dispatch_result = DISPATCH_RESULT_COMPLETE;
  ImageRequest<I>::aio_read(m_image_ctx, aio_comp, std::move(image_extents),
                            area, std::move(read_result), io_context, op_flags,
                            read_flags, parent_trace);
  return true;
}

// terminal layer: issue the write as an ImageRequest
template <typename I>
bool ImageDispatch<I>::write(
    AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&bl,
    int op_flags, const ZTracer::Trace &parent_trace,
    uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
    DispatchResult* dispatch_result, Context** on_finish,
    Context* on_dispatched) {
  auto cct = m_image_ctx->cct;
  auto area = get_area(image_dispatch_flags);
  ldout(cct, 20) << "image_extents=" << image_extents
                 << " area=" << area << dendl;

  start_in_flight_io(aio_comp);

  *dispatch_result = DISPATCH_RESULT_COMPLETE;
  ImageRequest<I>::aio_write(m_image_ctx, aio_comp, std::move(image_extents),
                             area, std::move(bl), op_flags, parent_trace);
  return true;
}

// terminal layer: issue the discard as an ImageRequest
template <typename I>
bool ImageDispatch<I>::discard(
    AioCompletion* aio_comp, Extents &&image_extents,
    uint32_t discard_granularity_bytes, const ZTracer::Trace &parent_trace,
    uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
    DispatchResult* dispatch_result, Context** on_finish,
    Context* on_dispatched) {
  auto cct = m_image_ctx->cct;
  auto area = get_area(image_dispatch_flags);
  ldout(cct, 20) << "image_extents=" << image_extents
                 << " area=" << area << dendl;

  start_in_flight_io(aio_comp);

  *dispatch_result = DISPATCH_RESULT_COMPLETE;
  ImageRequest<I>::aio_discard(m_image_ctx, aio_comp, std::move(image_extents),
                               area, discard_granularity_bytes, parent_trace);
  return true;
}

// terminal layer: issue the write-same as an ImageRequest
template <typename I>
bool ImageDispatch<I>::write_same(
    AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&bl,
    int op_flags, const ZTracer::Trace &parent_trace,
    uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
    DispatchResult* dispatch_result, Context** on_finish,
    Context* on_dispatched) {
  auto cct = m_image_ctx->cct;
  auto area = get_area(image_dispatch_flags);
  ldout(cct, 20) << "image_extents=" << image_extents
                 << " area=" << area << dendl;

  start_in_flight_io(aio_comp);

  *dispatch_result = DISPATCH_RESULT_COMPLETE;
  ImageRequest<I>::aio_writesame(m_image_ctx, aio_comp,
                                 std::move(image_extents), area, std::move(bl),
                                 op_flags, parent_trace);
  return true;
}

// terminal layer: issue the compare-and-write as an ImageRequest
template <typename I>
bool ImageDispatch<I>::compare_and_write(
    AioCompletion* aio_comp, Extents &&image_extents,
    bufferlist &&cmp_bl, bufferlist &&bl, uint64_t *mismatch_offset,
    int op_flags, const ZTracer::Trace &parent_trace,
    uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
    DispatchResult* dispatch_result, Context** on_finish,
    Context* on_dispatched) {
  auto cct = m_image_ctx->cct;
  auto area = get_area(image_dispatch_flags);
  ldout(cct, 20) << "image_extents=" << image_extents
                 << " area=" << area << dendl;

  start_in_flight_io(aio_comp);

  *dispatch_result = DISPATCH_RESULT_COMPLETE;
  ImageRequest<I>::aio_compare_and_write(m_image_ctx, aio_comp,
                                         std::move(image_extents), area,
                                         std::move(cmp_bl), std::move(bl),
                                         mismatch_offset, op_flags,
                                         parent_trace);
  return true;
}

// terminal layer: issue the flush as an ImageRequest
template <typename I>
bool ImageDispatch<I>::flush(
    AioCompletion* aio_comp, FlushSource flush_source,
    const ZTracer::Trace &parent_trace, uint64_t tid,
    std::atomic<uint32_t>* image_dispatch_flags,
    DispatchResult* dispatch_result, Context** on_finish,
    Context* on_dispatched) {
  auto cct = m_image_ctx->cct;
  ldout(cct, 20) << dendl;

  start_in_flight_io(aio_comp);

  *dispatch_result = DISPATCH_RESULT_COMPLETE;
  ImageRequest<I>::aio_flush(m_image_ctx, aio_comp, flush_source, parent_trace);
  return true;
}

// terminal layer: issue the snapshot listing request
template <typename I>
bool ImageDispatch<I>::list_snaps(
    AioCompletion* aio_comp, Extents&& image_extents, SnapIds&& snap_ids,
    int list_snaps_flags, SnapshotDelta* snapshot_delta,
    const ZTracer::Trace &parent_trace, uint64_t tid,
    std::atomic<uint32_t>* image_dispatch_flags,
    DispatchResult* dispatch_result, Context** on_finish,
    Context* on_dispatched) {
  auto cct = m_image_ctx->cct;
  auto area = get_area(image_dispatch_flags);
  ldout(cct, 20) << "image_extents=" << image_extents
                 << " area=" << area << dendl;

  start_in_flight_io(aio_comp);

  *dispatch_result = DISPATCH_RESULT_COMPLETE;
  ImageListSnapsRequest<I> req(*m_image_ctx, aio_comp, std::move(image_extents),
                               area, std::move(snap_ids), list_snaps_flags,
                               snapshot_delta, parent_trace);
  req.send();
  return true;
}

// delegate cache invalidation to the object dispatcher
template <typename I>
bool ImageDispatch<I>::invalidate_cache(Context* on_finish) {
  auto cct = m_image_ctx->cct;
  ldout(cct, 20) << dendl;

  std::shared_lock owner_lock{m_image_ctx->owner_lock};
  m_image_ctx->io_object_dispatcher->invalidate_cache(on_finish);
  return true;
}

} // namespace io
} // namespace librbd

template class 
librbd::io::ImageDispatch<librbd::ImageCtx>;
diff --git a/src/librbd/io/ImageDispatch.h b/src/librbd/io/ImageDispatch.h
new file mode 100644
index 000000000..4a89c6054
--- /dev/null
+++ b/src/librbd/io/ImageDispatch.h
@@ -0,0 +1,95 @@
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab

#ifndef CEPH_LIBRBD_IO_IMAGE_DISPATCH_H
#define CEPH_LIBRBD_IO_IMAGE_DISPATCH_H

#include "librbd/io/ImageDispatchInterface.h"
#include "include/int_types.h"
#include "include/buffer.h"
#include "common/zipkin_trace.h"
#include "librbd/io/ReadResult.h"
#include "librbd/io/Types.h"

struct Context;

namespace librbd {

struct ImageCtx;

namespace io {

struct AioCompletion;

// core (terminal) image dispatch layer: converts dispatched IO into
// ImageRequest operations (see ImageDispatch.cc)
template <typename ImageCtxT>
class ImageDispatch : public ImageDispatchInterface {
public:
  ImageDispatch(ImageCtxT* image_ctx) : m_image_ctx(image_ctx) {
  }

  ImageDispatchLayer get_dispatch_layer() const override {
    return IMAGE_DISPATCH_LAYER_CORE;
  }

  void shut_down(Context* on_finish) override;

  bool read(
    AioCompletion* aio_comp, Extents &&image_extents,
    ReadResult &&read_result, IOContext io_context, int op_flags,
    int read_flags, const ZTracer::Trace &parent_trace, uint64_t tid,
    std::atomic<uint32_t>* image_dispatch_flags,
    DispatchResult* dispatch_result, Context** on_finish,
    Context* on_dispatched) override;
  bool write(
    AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&bl,
    int op_flags, const ZTracer::Trace &parent_trace,
    uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
    DispatchResult* dispatch_result, Context** on_finish,
    Context* on_dispatched) override;
  bool discard(
    AioCompletion* aio_comp, Extents &&image_extents,
    uint32_t discard_granularity_bytes, const ZTracer::Trace &parent_trace,
    uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
    DispatchResult* dispatch_result, Context** on_finish,
    Context* on_dispatched) override;
  bool write_same(
    AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&bl,
    int op_flags, const ZTracer::Trace &parent_trace,
    uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
    DispatchResult* dispatch_result, Context** on_finish,
    Context* on_dispatched) override;
  bool compare_and_write(
    AioCompletion* aio_comp, Extents &&image_extents,
    bufferlist &&cmp_bl, bufferlist &&bl, uint64_t *mismatch_offset,
    int op_flags, const ZTracer::Trace &parent_trace,
    uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
    DispatchResult* dispatch_result, Context** on_finish,
    Context* on_dispatched) override;
  bool flush(
    AioCompletion* aio_comp, FlushSource flush_source,
    const ZTracer::Trace &parent_trace, uint64_t tid,
    std::atomic<uint32_t>* image_dispatch_flags,
    DispatchResult* dispatch_result, Context** on_finish,
    Context* on_dispatched) override;

  bool list_snaps(
    AioCompletion* aio_comp, Extents&& image_extents, SnapIds&& snap_ids,
    int list_snaps_flags, SnapshotDelta* snapshot_delta,
    const ZTracer::Trace &parent_trace, uint64_t tid,
    std::atomic<uint32_t>* image_dispatch_flags,
    DispatchResult* dispatch_result, Context** on_finish,
    Context* on_dispatched) override;

  bool invalidate_cache(Context* on_finish) override;

private:
  ImageCtxT* m_image_ctx;

};

} // namespace io
} // namespace librbd

extern template class librbd::io::ImageDispatch<librbd::ImageCtx>;

#endif // CEPH_LIBRBD_IO_IMAGE_DISPATCH_H
diff --git a/src/librbd/io/ImageDispatchInterface.h b/src/librbd/io/ImageDispatchInterface.h
new file mode 100644
index 000000000..e479f7eef
--- /dev/null
+++ b/src/librbd/io/ImageDispatchInterface.h
@@ -0,0 +1,87 @@
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab

#ifndef CEPH_LIBRBD_IO_IMAGE_DISPATCH_INTERFACE_H
#define CEPH_LIBRBD_IO_IMAGE_DISPATCH_INTERFACE_H

#include "include/int_types.h"
#include "include/buffer.h"
#include "common/zipkin_trace.h"
#include "librbd/Types.h"
#include "librbd/io/ReadResult.h"
#include "librbd/io/Types.h"
#include <atomic>

struct Context;

namespace librbd {
namespace io {

struct AioCompletion;
struct ImageDispatchSpec;

// interface implemented by every image dispatch layer; each method
// returns true when the layer handled the IO
struct ImageDispatchInterface {
  typedef ImageDispatchLayer DispatchLayer;
  typedef ImageDispatchSpec DispatchSpec;

  virtual ~ImageDispatchInterface() {
  }

  virtual ImageDispatchLayer get_dispatch_layer() const = 0;

  virtual void shut_down(Context* on_finish) = 0;

  virtual bool read(
    AioCompletion* aio_comp, Extents &&image_extents,
    ReadResult &&read_result, IOContext io_context, int op_flags,
    int read_flags, const ZTracer::Trace &parent_trace, uint64_t tid,
    std::atomic<uint32_t>* image_dispatch_flags,
    DispatchResult* dispatch_result, Context** on_finish,
    Context* on_dispatched) = 0;
  virtual bool write(
    AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&bl,
    int op_flags, const ZTracer::Trace &parent_trace,
    uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
    DispatchResult* dispatch_result, Context** on_finish,
    Context* on_dispatched) = 0;
  virtual bool discard(
    AioCompletion* aio_comp, Extents &&image_extents,
    uint32_t discard_granularity_bytes, const ZTracer::Trace &parent_trace,
    uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
    DispatchResult* dispatch_result, Context** on_finish,
    Context* on_dispatched) = 0;
  virtual bool write_same(
    AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&bl,
    int op_flags, const ZTracer::Trace &parent_trace,
    uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
    DispatchResult* dispatch_result, Context** on_finish,
    Context* on_dispatched) = 0;
  virtual bool compare_and_write(
    AioCompletion* aio_comp, Extents &&image_extents,
    bufferlist &&cmp_bl, bufferlist &&bl, uint64_t *mismatch_offset,
    int op_flags, const ZTracer::Trace &parent_trace,
    uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
    DispatchResult* dispatch_result, Context** on_finish,
    Context* on_dispatched) = 0;
  virtual bool flush(
    AioCompletion* aio_comp, FlushSource flush_source,
    const ZTracer::Trace &parent_trace, uint64_t tid,
    std::atomic<uint32_t>* image_dispatch_flags,
    DispatchResult* dispatch_result, Context** on_finish,
    Context* on_dispatched) = 0;

  virtual bool list_snaps(
    AioCompletion* aio_comp, Extents&& image_extents, SnapIds&& snap_ids,
    int list_snaps_flags, SnapshotDelta* snapshot_delta,
    const ZTracer::Trace &parent_trace, uint64_t tid,
    std::atomic<uint32_t>* image_dispatch_flags,
    DispatchResult* dispatch_result, Context** on_finish,
    Context* on_dispatched) = 0;

  virtual bool invalidate_cache(Context* on_finish) = 0;
};

} // namespace io
} // namespace librbd

#endif // CEPH_LIBRBD_IO_IMAGE_DISPATCH_INTERFACE_H
diff --git a/src/librbd/io/ImageDispatchSpec.cc b/src/librbd/io/ImageDispatchSpec.cc
new file mode 100644
index 000000000..95d8224ae
--- /dev/null
+++ b/src/librbd/io/ImageDispatchSpec.cc
@@ -0,0 +1,54 @@
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab

#include "librbd/io/ImageDispatchSpec.h"
#include "librbd/ImageCtx.h"
#include "librbd/io/AioCompletion.h"
#include "librbd/io/ImageRequest.h"
#include "librbd/io/ImageDispatcherInterface.h"
#include <boost/variant.hpp>

namespace librbd {
namespace io {

// dispatch-layer callback: advance (or restart) the spec through the
// image dispatcher according to the recorded dispatch result
void ImageDispatchSpec::C_Dispatcher::complete(int r) {
  switch (image_dispatch_spec->dispatch_result) {
  case DISPATCH_RESULT_RESTART:
    // re-enter the previous layer
    ceph_assert(image_dispatch_spec->dispatch_layer != 0);
    image_dispatch_spec->dispatch_layer = static_cast<ImageDispatchLayer>(
      image_dispatch_spec->dispatch_layer - 1);
    [[fallthrough]];
  case DISPATCH_RESULT_CONTINUE:
    if (r < 0) {
      // bubble dispatch failure through AioCompletion
      image_dispatch_spec->dispatch_result = DISPATCH_RESULT_COMPLETE;
      image_dispatch_spec->fail(r);
      return;
    }

    image_dispatch_spec->send();
+ break; + case DISPATCH_RESULT_COMPLETE: + finish(r); + break; + case DISPATCH_RESULT_INVALID: + ceph_abort(); + break; + } +} + +void ImageDispatchSpec::C_Dispatcher::finish(int r) { + delete image_dispatch_spec; +} + +void ImageDispatchSpec::send() { + image_dispatcher->send(this); +} + +void ImageDispatchSpec::fail(int r) { + dispatch_result = DISPATCH_RESULT_COMPLETE; + aio_comp->fail(r); +} + +} // namespace io +} // namespace librbd diff --git a/src/librbd/io/ImageDispatchSpec.h b/src/librbd/io/ImageDispatchSpec.h new file mode 100644 index 000000000..9323f9879 --- /dev/null +++ b/src/librbd/io/ImageDispatchSpec.h @@ -0,0 +1,254 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_IO_IMAGE_DISPATCH_SPEC_H +#define CEPH_LIBRBD_IO_IMAGE_DISPATCH_SPEC_H + +#include "include/int_types.h" +#include "include/buffer.h" +#include "include/Context.h" +#include "common/zipkin_trace.h" +#include "librbd/io/AioCompletion.h" +#include "librbd/io/Types.h" +#include "librbd/io/ReadResult.h" +#include <boost/variant/variant.hpp> +#include <atomic> + +namespace librbd { + +class ImageCtx; + +namespace io { + +struct ImageDispatcherInterface; + +class ImageDispatchSpec { +private: + // helper to avoid extra heap allocation per object IO + struct C_Dispatcher : public Context { + ImageDispatchSpec* image_dispatch_spec; + + C_Dispatcher(ImageDispatchSpec* image_dispatch_spec) + : image_dispatch_spec(image_dispatch_spec) { + } + + void complete(int r) override; + void finish(int r) override; + }; + +public: + struct Read { + ReadResult read_result; + int read_flags; + + Read(ReadResult &&read_result, int read_flags) + : read_result(std::move(read_result)), read_flags(read_flags) { + } + }; + + struct Discard { + uint32_t discard_granularity_bytes; + + Discard(uint32_t discard_granularity_bytes) + : discard_granularity_bytes(discard_granularity_bytes) { + } + }; + + struct Write { + bufferlist bl; + + 
Write(bufferlist&& bl) : bl(std::move(bl)) { + } + }; + + struct WriteSame { + bufferlist bl; + + WriteSame(bufferlist&& bl) : bl(std::move(bl)) { + } + }; + + struct CompareAndWrite { + bufferlist cmp_bl; + bufferlist bl; + uint64_t *mismatch_offset; + + CompareAndWrite(bufferlist&& cmp_bl, bufferlist&& bl, + uint64_t *mismatch_offset) + : cmp_bl(std::move(cmp_bl)), bl(std::move(bl)), + mismatch_offset(mismatch_offset) { + } + }; + + struct Flush { + FlushSource flush_source; + + Flush(FlushSource flush_source) : flush_source(flush_source) { + } + }; + + struct ListSnaps { + SnapIds snap_ids; + int list_snaps_flags; + SnapshotDelta* snapshot_delta; + + ListSnaps(SnapIds&& snap_ids, int list_snaps_flags, + SnapshotDelta* snapshot_delta) + : snap_ids(std::move(snap_ids)), list_snaps_flags(list_snaps_flags), + snapshot_delta(snapshot_delta) { + } + }; + + typedef boost::variant<Read, + Discard, + Write, + WriteSame, + CompareAndWrite, + Flush, + ListSnaps> Request; + + C_Dispatcher dispatcher_ctx; + + ImageDispatcherInterface* image_dispatcher; + ImageDispatchLayer dispatch_layer; + std::atomic<uint32_t> image_dispatch_flags = 0; + DispatchResult dispatch_result = DISPATCH_RESULT_INVALID; + + AioCompletion* aio_comp; + Extents image_extents; + Request request; + IOContext io_context; + int op_flags; + ZTracer::Trace parent_trace; + uint64_t tid = 0; + + template <typename ImageCtxT = ImageCtx> + static ImageDispatchSpec* create_read( + ImageCtxT &image_ctx, ImageDispatchLayer image_dispatch_layer, + AioCompletion *aio_comp, Extents &&image_extents, ImageArea area, + ReadResult &&read_result, IOContext io_context, int op_flags, + int read_flags, const ZTracer::Trace &parent_trace) { + return new ImageDispatchSpec(image_ctx.io_image_dispatcher, + image_dispatch_layer, aio_comp, + std::move(image_extents), area, + Read{std::move(read_result), read_flags}, + io_context, op_flags, parent_trace); + } + + template <typename ImageCtxT = ImageCtx> + static ImageDispatchSpec* 
create_discard( + ImageCtxT &image_ctx, ImageDispatchLayer image_dispatch_layer, + AioCompletion *aio_comp, Extents &&image_extents, ImageArea area, + uint32_t discard_granularity_bytes, const ZTracer::Trace &parent_trace) { + return new ImageDispatchSpec(image_ctx.io_image_dispatcher, + image_dispatch_layer, aio_comp, + std::move(image_extents), area, + Discard{discard_granularity_bytes}, + {}, 0, parent_trace); + } + + template <typename ImageCtxT = ImageCtx> + static ImageDispatchSpec* create_write( + ImageCtxT &image_ctx, ImageDispatchLayer image_dispatch_layer, + AioCompletion *aio_comp, Extents &&image_extents, ImageArea area, + bufferlist &&bl, int op_flags, const ZTracer::Trace &parent_trace) { + return new ImageDispatchSpec(image_ctx.io_image_dispatcher, + image_dispatch_layer, aio_comp, + std::move(image_extents), area, + Write{std::move(bl)}, + {}, op_flags, parent_trace); + } + + template <typename ImageCtxT = ImageCtx> + static ImageDispatchSpec* create_write_same( + ImageCtxT &image_ctx, ImageDispatchLayer image_dispatch_layer, + AioCompletion *aio_comp, Extents &&image_extents, ImageArea area, + bufferlist &&bl, int op_flags, const ZTracer::Trace &parent_trace) { + return new ImageDispatchSpec(image_ctx.io_image_dispatcher, + image_dispatch_layer, aio_comp, + std::move(image_extents), area, + WriteSame{std::move(bl)}, + {}, op_flags, parent_trace); + } + + template <typename ImageCtxT = ImageCtx> + static ImageDispatchSpec* create_compare_and_write( + ImageCtxT &image_ctx, ImageDispatchLayer image_dispatch_layer, + AioCompletion *aio_comp, Extents &&image_extents, ImageArea area, + bufferlist &&cmp_bl, bufferlist &&bl, uint64_t *mismatch_offset, + int op_flags, const ZTracer::Trace &parent_trace) { + return new ImageDispatchSpec(image_ctx.io_image_dispatcher, + image_dispatch_layer, aio_comp, + std::move(image_extents), area, + CompareAndWrite{std::move(cmp_bl), + std::move(bl), + mismatch_offset}, + {}, op_flags, parent_trace); + } + + template 
<typename ImageCtxT = ImageCtx> + static ImageDispatchSpec* create_flush( + ImageCtxT &image_ctx, ImageDispatchLayer image_dispatch_layer, + AioCompletion *aio_comp, FlushSource flush_source, + const ZTracer::Trace &parent_trace) { + return new ImageDispatchSpec(image_ctx.io_image_dispatcher, + image_dispatch_layer, aio_comp, {}, + ImageArea::DATA /* dummy for {} */, + Flush{flush_source}, {}, 0, parent_trace); + } + + template <typename ImageCtxT = ImageCtx> + static ImageDispatchSpec* create_list_snaps( + ImageCtxT &image_ctx, ImageDispatchLayer image_dispatch_layer, + AioCompletion *aio_comp, Extents &&image_extents, ImageArea area, + SnapIds&& snap_ids, int list_snaps_flags, SnapshotDelta* snapshot_delta, + const ZTracer::Trace &parent_trace) { + return new ImageDispatchSpec(image_ctx.io_image_dispatcher, + image_dispatch_layer, aio_comp, + std::move(image_extents), area, + ListSnaps{std::move(snap_ids), + list_snaps_flags, snapshot_delta}, + {}, 0, parent_trace); + } + + ~ImageDispatchSpec() { + aio_comp->put(); + } + + void send(); + void fail(int r); + +private: + struct SendVisitor; + struct IsWriteOpVisitor; + struct TokenRequestedVisitor; + + ImageDispatchSpec(ImageDispatcherInterface* image_dispatcher, + ImageDispatchLayer image_dispatch_layer, + AioCompletion* aio_comp, Extents&& image_extents, + ImageArea area, Request&& request, IOContext io_context, + int op_flags, const ZTracer::Trace& parent_trace) + : dispatcher_ctx(this), image_dispatcher(image_dispatcher), + dispatch_layer(image_dispatch_layer), aio_comp(aio_comp), + image_extents(std::move(image_extents)), request(std::move(request)), + io_context(io_context), op_flags(op_flags), parent_trace(parent_trace) { + ceph_assert(aio_comp->image_dispatcher_ctx == nullptr); + aio_comp->image_dispatcher_ctx = &dispatcher_ctx; + aio_comp->get(); + + switch (area) { + case ImageArea::DATA: + break; + case ImageArea::CRYPTO_HEADER: + image_dispatch_flags |= IMAGE_DISPATCH_FLAG_CRYPTO_HEADER; + break; + 
default: + ceph_abort(); + } + } +}; + +} // namespace io +} // namespace librbd + +#endif // CEPH_LIBRBD_IO_IMAGE_DISPATCH_SPEC_H diff --git a/src/librbd/io/ImageDispatcher.cc b/src/librbd/io/ImageDispatcher.cc new file mode 100644 index 000000000..4aa7929b2 --- /dev/null +++ b/src/librbd/io/ImageDispatcher.cc @@ -0,0 +1,324 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/io/ImageDispatcher.h" +#include "include/Context.h" +#include "common/AsyncOpTracker.h" +#include "common/dout.h" +#include "librbd/ImageCtx.h" +#include "librbd/crypto/CryptoImageDispatch.h" +#include "librbd/io/ImageDispatch.h" +#include "librbd/io/ImageDispatchInterface.h" +#include "librbd/io/ImageDispatchSpec.h" +#include "librbd/io/QueueImageDispatch.h" +#include "librbd/io/QosImageDispatch.h" +#include "librbd/io/RefreshImageDispatch.h" +#include "librbd/io/Utils.h" +#include "librbd/io/WriteBlockImageDispatch.h" +#include <boost/variant.hpp> + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::io::ImageDispatcher: " << this \ + << " " << __func__ << ": " + +namespace librbd { +namespace io { + +template <typename I> +struct ImageDispatcher<I>::SendVisitor : public boost::static_visitor<bool> { + ImageDispatchInterface* image_dispatch; + ImageDispatchSpec* image_dispatch_spec; + + SendVisitor(ImageDispatchInterface* image_dispatch, + ImageDispatchSpec* image_dispatch_spec) + : image_dispatch(image_dispatch), + image_dispatch_spec(image_dispatch_spec) { + } + + bool operator()(ImageDispatchSpec::Read& read) const { + return image_dispatch->read( + image_dispatch_spec->aio_comp, + std::move(image_dispatch_spec->image_extents), + std::move(read.read_result), image_dispatch_spec->io_context, + image_dispatch_spec->op_flags, read.read_flags, + image_dispatch_spec->parent_trace, image_dispatch_spec->tid, + &image_dispatch_spec->image_dispatch_flags, + 
&image_dispatch_spec->dispatch_result, + &image_dispatch_spec->aio_comp->image_dispatcher_ctx, + &image_dispatch_spec->dispatcher_ctx); + } + + bool operator()(ImageDispatchSpec::Discard& discard) const { + return image_dispatch->discard( + image_dispatch_spec->aio_comp, + std::move(image_dispatch_spec->image_extents), + discard.discard_granularity_bytes, image_dispatch_spec->parent_trace, + image_dispatch_spec->tid, &image_dispatch_spec->image_dispatch_flags, + &image_dispatch_spec->dispatch_result, + &image_dispatch_spec->aio_comp->image_dispatcher_ctx, + &image_dispatch_spec->dispatcher_ctx); + } + + bool operator()(ImageDispatchSpec::Write& write) const { + return image_dispatch->write( + image_dispatch_spec->aio_comp, + std::move(image_dispatch_spec->image_extents), std::move(write.bl), + image_dispatch_spec->op_flags, image_dispatch_spec->parent_trace, + image_dispatch_spec->tid, &image_dispatch_spec->image_dispatch_flags, + &image_dispatch_spec->dispatch_result, + &image_dispatch_spec->aio_comp->image_dispatcher_ctx, + &image_dispatch_spec->dispatcher_ctx); + } + + bool operator()(ImageDispatchSpec::WriteSame& write_same) const { + return image_dispatch->write_same( + image_dispatch_spec->aio_comp, + std::move(image_dispatch_spec->image_extents), std::move(write_same.bl), + image_dispatch_spec->op_flags, image_dispatch_spec->parent_trace, + image_dispatch_spec->tid, &image_dispatch_spec->image_dispatch_flags, + &image_dispatch_spec->dispatch_result, + &image_dispatch_spec->aio_comp->image_dispatcher_ctx, + &image_dispatch_spec->dispatcher_ctx); + } + + bool operator()( + ImageDispatchSpec::CompareAndWrite& compare_and_write) const { + return image_dispatch->compare_and_write( + image_dispatch_spec->aio_comp, + std::move(image_dispatch_spec->image_extents), + std::move(compare_and_write.cmp_bl), std::move(compare_and_write.bl), + compare_and_write.mismatch_offset, + image_dispatch_spec->op_flags, image_dispatch_spec->parent_trace, + image_dispatch_spec->tid, 
&image_dispatch_spec->image_dispatch_flags, + &image_dispatch_spec->dispatch_result, + &image_dispatch_spec->aio_comp->image_dispatcher_ctx, + &image_dispatch_spec->dispatcher_ctx); + } + + bool operator()(ImageDispatchSpec::Flush& flush) const { + return image_dispatch->flush( + image_dispatch_spec->aio_comp, flush.flush_source, + image_dispatch_spec->parent_trace, image_dispatch_spec->tid, + &image_dispatch_spec->image_dispatch_flags, + &image_dispatch_spec->dispatch_result, + &image_dispatch_spec->aio_comp->image_dispatcher_ctx, + &image_dispatch_spec->dispatcher_ctx); + } + + bool operator()(ImageDispatchSpec::ListSnaps& list_snaps) const { + return image_dispatch->list_snaps( + image_dispatch_spec->aio_comp, + std::move(image_dispatch_spec->image_extents), + std::move(list_snaps.snap_ids), list_snaps.list_snaps_flags, + list_snaps.snapshot_delta, image_dispatch_spec->parent_trace, + image_dispatch_spec->tid, &image_dispatch_spec->image_dispatch_flags, + &image_dispatch_spec->dispatch_result, + &image_dispatch_spec->aio_comp->image_dispatcher_ctx, + &image_dispatch_spec->dispatcher_ctx); + } +}; + +template <typename I> +struct ImageDispatcher<I>::PreprocessVisitor + : public boost::static_visitor<bool> { + ImageDispatcher<I>* image_dispatcher; + ImageDispatchSpec* image_dispatch_spec; + + PreprocessVisitor(ImageDispatcher<I>* image_dispatcher, + ImageDispatchSpec* image_dispatch_spec) + : image_dispatcher(image_dispatcher), + image_dispatch_spec(image_dispatch_spec) { + } + + bool clip_request() const { + auto area = (image_dispatch_spec->image_dispatch_flags & + IMAGE_DISPATCH_FLAG_CRYPTO_HEADER ? 
ImageArea::CRYPTO_HEADER : + ImageArea::DATA); + int r = util::clip_request(image_dispatcher->m_image_ctx, + &image_dispatch_spec->image_extents, area); + if (r < 0) { + image_dispatch_spec->fail(r); + return true; + } + return false; + } + + bool operator()(ImageDispatchSpec::Read& read) const { + if ((read.read_flags & READ_FLAG_DISABLE_CLIPPING) != 0) { + return false; + } + return clip_request(); + } + + bool operator()(ImageDispatchSpec::Flush&) const { + return clip_request(); + } + + bool operator()(ImageDispatchSpec::ListSnaps&) const { + return false; + } + + template <typename T> + bool operator()(T&) const { + if (clip_request()) { + return true; + } + + std::shared_lock image_locker{image_dispatcher->m_image_ctx->image_lock}; + if (image_dispatcher->m_image_ctx->snap_id != CEPH_NOSNAP || + image_dispatcher->m_image_ctx->read_only) { + image_dispatch_spec->fail(-EROFS); + return true; + } + return false; + } +}; + +template <typename I> +ImageDispatcher<I>::ImageDispatcher(I* image_ctx) + : Dispatcher<I, ImageDispatcherInterface>(image_ctx) { + // configure the core image dispatch handler on startup + auto image_dispatch = new ImageDispatch(image_ctx); + this->register_dispatch(image_dispatch); + + auto queue_image_dispatch = new QueueImageDispatch(image_ctx); + this->register_dispatch(queue_image_dispatch); + + m_qos_image_dispatch = new QosImageDispatch<I>(image_ctx); + this->register_dispatch(m_qos_image_dispatch); + + auto refresh_image_dispatch = new RefreshImageDispatch(image_ctx); + this->register_dispatch(refresh_image_dispatch); + + m_write_block_dispatch = new WriteBlockImageDispatch<I>(image_ctx); + this->register_dispatch(m_write_block_dispatch); +} + +template <typename I> +void ImageDispatcher<I>::invalidate_cache(Context* on_finish) { + auto image_ctx = this->m_image_ctx; + auto cct = image_ctx->cct; + ldout(cct, 5) << dendl; + + auto ctx = new C_InvalidateCache( + this, IMAGE_DISPATCH_LAYER_NONE, on_finish); + ctx->complete(0); +} + 
+template <typename I> +void ImageDispatcher<I>::shut_down(Context* on_finish) { + // TODO ensure all IOs are executed via a dispatcher + // ensure read-ahead / copy-on-read ops are finished since they are + // currently outside dispatcher tracking + auto async_op = new AsyncOperation(); + + on_finish = new LambdaContext([async_op, on_finish](int r) { + async_op->finish_op(); + delete async_op; + on_finish->complete(0); + }); + on_finish = new LambdaContext([this, on_finish](int r) { + Dispatcher<I, ImageDispatcherInterface>::shut_down(on_finish); + }); + async_op->start_op(*this->m_image_ctx); + async_op->flush(on_finish); +} + +template <typename I> +void ImageDispatcher<I>::apply_qos_schedule_tick_min(uint64_t tick) { + m_qos_image_dispatch->apply_qos_schedule_tick_min(tick); +} + +template <typename I> +void ImageDispatcher<I>::apply_qos_limit(uint64_t flag, uint64_t limit, + uint64_t burst, uint64_t burst_seconds) { + m_qos_image_dispatch->apply_qos_limit(flag, limit, burst, burst_seconds); +} + +template <typename I> +void ImageDispatcher<I>::apply_qos_exclude_ops(uint64_t exclude_ops) { + m_qos_image_dispatch->apply_qos_exclude_ops(exclude_ops); +} + +template <typename I> +bool ImageDispatcher<I>::writes_blocked() const { + return m_write_block_dispatch->writes_blocked(); +} + +template <typename I> +int ImageDispatcher<I>::block_writes() { + return m_write_block_dispatch->block_writes(); +} + +template <typename I> +void ImageDispatcher<I>::block_writes(Context *on_blocked) { + m_write_block_dispatch->block_writes(on_blocked); +} + +template <typename I> +void ImageDispatcher<I>::unblock_writes() { + m_write_block_dispatch->unblock_writes(); +} + +template <typename I> +void ImageDispatcher<I>::wait_on_writes_unblocked(Context *on_unblocked) { + m_write_block_dispatch->wait_on_writes_unblocked(on_unblocked); +} + +template <typename I> +void ImageDispatcher<I>::remap_to_physical(Extents& image_extents, + ImageArea area) { + std::shared_lock 
locker{this->m_lock}; + auto it = this->m_dispatches.find(IMAGE_DISPATCH_LAYER_CRYPTO); + if (it == this->m_dispatches.end()) { + ceph_assert(area == ImageArea::DATA); + return; + } + auto crypto_image_dispatch = static_cast<crypto::CryptoImageDispatch*>( + it->second.dispatch); + crypto_image_dispatch->remap_to_physical(image_extents, area); +} + +template <typename I> +ImageArea ImageDispatcher<I>::remap_to_logical(Extents& image_extents) { + std::shared_lock locker{this->m_lock}; + auto it = this->m_dispatches.find(IMAGE_DISPATCH_LAYER_CRYPTO); + if (it == this->m_dispatches.end()) { + return ImageArea::DATA; + } + auto crypto_image_dispatch = static_cast<crypto::CryptoImageDispatch*>( + it->second.dispatch); + return crypto_image_dispatch->remap_to_logical(image_extents); +} + +template <typename I> +bool ImageDispatcher<I>::send_dispatch( + ImageDispatchInterface* image_dispatch, + ImageDispatchSpec* image_dispatch_spec) { + if (image_dispatch_spec->tid == 0) { + image_dispatch_spec->tid = ++m_next_tid; + + bool finished = preprocess(image_dispatch_spec); + if (finished) { + return true; + } + } + + return boost::apply_visitor( + SendVisitor{image_dispatch, image_dispatch_spec}, + image_dispatch_spec->request); +} + +template <typename I> +bool ImageDispatcher<I>::preprocess( + ImageDispatchSpec* image_dispatch_spec) { + return boost::apply_visitor( + PreprocessVisitor{this, image_dispatch_spec}, + image_dispatch_spec->request); +} + +} // namespace io +} // namespace librbd + +template class librbd::io::ImageDispatcher<librbd::ImageCtx>; diff --git a/src/librbd/io/ImageDispatcher.h b/src/librbd/io/ImageDispatcher.h new file mode 100644 index 000000000..5d5fb0535 --- /dev/null +++ b/src/librbd/io/ImageDispatcher.h @@ -0,0 +1,77 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_IO_IMAGE_DISPATCHER_H +#define CEPH_LIBRBD_IO_IMAGE_DISPATCHER_H + +#include "include/int_types.h" +#include 
"common/ceph_mutex.h" +#include "librbd/io/Dispatcher.h" +#include "librbd/io/ImageDispatchInterface.h" +#include "librbd/io/ImageDispatchSpec.h" +#include "librbd/io/ImageDispatcherInterface.h" +#include "librbd/io/Types.h" +#include <atomic> +#include <map> + +struct Context; + +namespace librbd { + +struct ImageCtx; + +namespace io { + +template <typename> struct QosImageDispatch; +template <typename> struct WriteBlockImageDispatch; + +template <typename ImageCtxT = ImageCtx> +class ImageDispatcher : public Dispatcher<ImageCtxT, ImageDispatcherInterface> { +public: + ImageDispatcher(ImageCtxT* image_ctx); + + void invalidate_cache(Context* on_finish) override; + + void shut_down(Context* on_finish) override; + + void apply_qos_schedule_tick_min(uint64_t tick) override; + void apply_qos_limit(uint64_t flag, uint64_t limit, uint64_t burst, + uint64_t burst_seconds) override; + void apply_qos_exclude_ops(uint64_t exclude_ops) override; + + bool writes_blocked() const override; + int block_writes() override; + void block_writes(Context *on_blocked) override; + + void unblock_writes() override; + void wait_on_writes_unblocked(Context *on_unblocked) override; + + void remap_to_physical(Extents& image_extents, ImageArea area) override; + ImageArea remap_to_logical(Extents& image_extents) override; + +protected: + bool send_dispatch( + ImageDispatchInterface* image_dispatch, + ImageDispatchSpec* image_dispatch_spec) override; + +private: + struct SendVisitor; + struct PreprocessVisitor; + + using typename Dispatcher<ImageCtxT, ImageDispatcherInterface>::C_InvalidateCache; + + std::atomic<uint64_t> m_next_tid{0}; + + QosImageDispatch<ImageCtxT>* m_qos_image_dispatch = nullptr; + WriteBlockImageDispatch<ImageCtxT>* m_write_block_dispatch = nullptr; + + bool preprocess(ImageDispatchSpec* image_dispatch_spec); + +}; + +} // namespace io +} // namespace librbd + +extern template class librbd::io::ImageDispatcher<librbd::ImageCtx>; + +#endif // 
CEPH_LIBRBD_IO_IMAGE_DISPATCHER_H diff --git a/src/librbd/io/ImageDispatcherInterface.h b/src/librbd/io/ImageDispatcherInterface.h new file mode 100644 index 000000000..dcff3d96a --- /dev/null +++ b/src/librbd/io/ImageDispatcherInterface.h @@ -0,0 +1,41 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_IO_IMAGE_DISPATCHER_INTERFACE_H +#define CEPH_LIBRBD_IO_IMAGE_DISPATCHER_INTERFACE_H + +#include "include/int_types.h" +#include "librbd/io/DispatcherInterface.h" +#include "librbd/io/ImageDispatchInterface.h" +#include "librbd/io/Types.h" + +struct Context; + +namespace librbd { +namespace io { + +struct ImageDispatcherInterface + : public DispatcherInterface<ImageDispatchInterface> { +public: + virtual void apply_qos_schedule_tick_min(uint64_t tick) = 0; + virtual void apply_qos_limit(uint64_t flag, uint64_t limit, + uint64_t burst, uint64_t burst_seconds) = 0; + virtual void apply_qos_exclude_ops(uint64_t exclude_ops) = 0; + + virtual bool writes_blocked() const = 0; + virtual int block_writes() = 0; + virtual void block_writes(Context *on_blocked) = 0; + + virtual void unblock_writes() = 0; + virtual void wait_on_writes_unblocked(Context *on_unblocked) = 0; + + virtual void invalidate_cache(Context* on_finish) = 0; + + virtual void remap_to_physical(Extents& image_extents, ImageArea area) = 0; + virtual ImageArea remap_to_logical(Extents& image_extents) = 0; +}; + +} // namespace io +} // namespace librbd + +#endif // CEPH_LIBRBD_IO_IMAGE_DISPATCHER_INTERFACE_H diff --git a/src/librbd/io/ImageRequest.cc b/src/librbd/io/ImageRequest.cc new file mode 100644 index 000000000..e4c41c229 --- /dev/null +++ b/src/librbd/io/ImageRequest.cc @@ -0,0 +1,909 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/io/ImageRequest.h" +#include "librbd/ImageCtx.h" +#include "librbd/internal.h" +#include "librbd/Journal.h" +#include 
"librbd/Types.h" +#include "librbd/Utils.h" +#include "librbd/asio/ContextWQ.h" +#include "librbd/io/AioCompletion.h" +#include "librbd/io/AsyncOperation.h" +#include "librbd/io/ObjectDispatchInterface.h" +#include "librbd/io/ObjectDispatchSpec.h" +#include "librbd/io/ObjectDispatcherInterface.h" +#include "librbd/io/Utils.h" +#include "librbd/journal/Types.h" +#include "include/rados/librados.hpp" +#include "common/errno.h" +#include "common/perf_counters.h" +#include "osdc/Striper.h" +#include <algorithm> +#include <functional> +#include <map> + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::io::ImageRequest: " << __func__ << ": " + +namespace librbd { +namespace io { + +using librbd::util::data_object_name; +using librbd::util::get_image_ctx; + +namespace { + +template <typename I> +struct C_AssembleSnapshotDeltas : public C_AioRequest { + I* image_ctx; + SnapshotDelta* snapshot_delta; + + ceph::mutex lock = ceph::make_mutex( + "librbd::io::C_AssembleSnapshotDeltas::lock", false); + std::map<uint64_t, SnapshotDelta> object_snapshot_delta; + + C_AssembleSnapshotDeltas(I* image_ctx, AioCompletion* aio_comp, + SnapshotDelta* snapshot_delta) + : C_AioRequest(aio_comp), + image_ctx(image_ctx), snapshot_delta(snapshot_delta) { + } + + SnapshotDelta* get_snapshot_delta(uint64_t object_no) { + std::unique_lock locker{lock}; + return &object_snapshot_delta[object_no]; + } + + void finish(int r) override { + auto cct = image_ctx->cct; + + if (r < 0) { + lderr(cct) << "C_AssembleSnapshotDeltas: list snaps failed: " + << cpp_strerror(r) << dendl; + C_AioRequest::finish(r); + return; + } + + std::unique_lock locker{lock}; + *snapshot_delta = {}; + for (auto& [object_no, object_snapshot_delta] : object_snapshot_delta) { + SnapshotDelta image_snapshot_delta; + object_to_image_intervals(object_no, object_snapshot_delta, + &image_snapshot_delta, snapshot_delta); + + ldout(cct, 20) << "object_no=" << object_no << ", " + << 
"object_snapshot_delta=" + << object_snapshot_delta << ", " + << "image_snapshot_delta=" << image_snapshot_delta + << dendl; + } + + ldout(cct, 20) << "snapshot_delta=" << *snapshot_delta << dendl; + C_AioRequest::finish(0); + } + + void object_to_image_intervals( + uint64_t object_no, const SnapshotDelta& object_snapshot_delta, + SnapshotDelta* image_snapshot_delta, + SnapshotDelta* assembled_image_snapshot_delta) { + for (auto& [key, object_extents] : object_snapshot_delta) { + for (auto& object_extent : object_extents) { + auto [image_extents, _] = io::util::object_to_area_extents( + image_ctx, object_no, + {{object_extent.get_off(), object_extent.get_len()}}); + + auto& intervals = (*image_snapshot_delta)[key]; + auto& assembled_intervals = (*assembled_image_snapshot_delta)[key]; + for (auto [image_offset, image_length] : image_extents) { + SparseExtent sparse_extent{object_extent.get_val().state, + image_length}; + intervals.insert(image_offset, image_length, sparse_extent); + assembled_intervals.insert(image_offset, image_length, + sparse_extent); + } + } + } + } +}; + +template <typename I> +struct C_RBD_Readahead : public Context { + I *ictx; + uint64_t object_no; + io::ReadExtents extents; + + C_RBD_Readahead(I *ictx, uint64_t object_no, uint64_t offset, uint64_t length) + : ictx(ictx), object_no(object_no), extents({{offset, length}}) { + ictx->readahead.inc_pending(); + } + + void finish(int r) override { + ceph_assert(extents.size() == 1); + auto& extent = extents.front(); + ldout(ictx->cct, 20) << "C_RBD_Readahead on " + << data_object_name(ictx, object_no) << ": " + << extent.offset << "~" << extent.length << dendl; + ictx->readahead.dec_pending(); + } +}; + +template <typename I> +void readahead(I *ictx, const Extents& image_extents, IOContext io_context) { + uint64_t total_bytes = 0; + for (auto& image_extent : image_extents) { + total_bytes += image_extent.second; + } + + ictx->image_lock.lock_shared(); + auto total_bytes_read = 
ictx->total_bytes_read.fetch_add(total_bytes); + bool abort = ( + ictx->readahead_disable_after_bytes != 0 && + total_bytes_read > ictx->readahead_disable_after_bytes); + if (abort) { + ictx->image_lock.unlock_shared(); + return; + } + + uint64_t data_size = ictx->get_area_size(ImageArea::DATA); + ictx->image_lock.unlock_shared(); + + auto readahead_extent = ictx->readahead.update(image_extents, data_size); + uint64_t readahead_offset = readahead_extent.first; + uint64_t readahead_length = readahead_extent.second; + + if (readahead_length > 0) { + ldout(ictx->cct, 20) << "(readahead logical) " << readahead_offset << "~" + << readahead_length << dendl; + LightweightObjectExtents readahead_object_extents; + io::util::area_to_object_extents(ictx, readahead_offset, readahead_length, + ImageArea::DATA, 0, + &readahead_object_extents); + for (auto& object_extent : readahead_object_extents) { + ldout(ictx->cct, 20) << "(readahead) " + << data_object_name(ictx, + object_extent.object_no) << " " + << object_extent.offset << "~" + << object_extent.length << dendl; + + auto req_comp = new C_RBD_Readahead<I>(ictx, object_extent.object_no, + object_extent.offset, + object_extent.length); + auto req = io::ObjectDispatchSpec::create_read( + ictx, io::OBJECT_DISPATCH_LAYER_NONE, object_extent.object_no, + &req_comp->extents, io_context, 0, 0, {}, nullptr, req_comp); + req->send(); + } + + ictx->perfcounter->inc(l_librbd_readahead); + ictx->perfcounter->inc(l_librbd_readahead_bytes, readahead_length); + } +} + +template <typename I> +struct C_UpdateTimestamp : public Context { +public: + I& m_image_ctx; + bool m_modify; // if modify set to 'true', modify timestamp is updated, + // access timestamp otherwise + AsyncOperation m_async_op; + + C_UpdateTimestamp(I& ictx, bool m) : m_image_ctx(ictx), m_modify(m) { + m_async_op.start_op(*get_image_ctx(&m_image_ctx)); + } + ~C_UpdateTimestamp() override { + m_async_op.finish_op(); + } + + void send() { + librados::ObjectWriteOperation op; 
+ if (m_modify) { + cls_client::set_modify_timestamp(&op); + } else { + cls_client::set_access_timestamp(&op); + } + + auto comp = librbd::util::create_rados_callback(this); + int r = m_image_ctx.md_ctx.aio_operate(m_image_ctx.header_oid, comp, &op); + ceph_assert(r == 0); + comp->release(); + } + + void finish(int r) override { + // ignore errors updating timestamp + } +}; + +bool should_update_timestamp(const utime_t& now, const utime_t& timestamp, + uint64_t interval) { + return (interval && + (static_cast<uint64_t>(now.sec()) >= interval + timestamp)); +} + +} // anonymous namespace + +#undef dout_prefix +#define dout_prefix *_dout << "librbd::io::ImageRequest: " << this \ + << " " << __func__ << ": " + +template <typename I> +void ImageRequest<I>::aio_read(I *ictx, AioCompletion *c, + Extents &&image_extents, ImageArea area, + ReadResult &&read_result, IOContext io_context, + int op_flags, int read_flags, + const ZTracer::Trace &parent_trace) { + ImageReadRequest<I> req(*ictx, c, std::move(image_extents), area, + std::move(read_result), io_context, op_flags, + read_flags, parent_trace); + req.send(); +} + +template <typename I> +void ImageRequest<I>::aio_write(I *ictx, AioCompletion *c, + Extents &&image_extents, ImageArea area, + bufferlist &&bl, int op_flags, + const ZTracer::Trace &parent_trace) { + ImageWriteRequest<I> req(*ictx, c, std::move(image_extents), area, + std::move(bl), op_flags, parent_trace); + req.send(); +} + +template <typename I> +void ImageRequest<I>::aio_discard(I *ictx, AioCompletion *c, + Extents &&image_extents, ImageArea area, + uint32_t discard_granularity_bytes, + const ZTracer::Trace &parent_trace) { + ImageDiscardRequest<I> req(*ictx, c, std::move(image_extents), area, + discard_granularity_bytes, parent_trace); + req.send(); +} + +template <typename I> +void ImageRequest<I>::aio_flush(I *ictx, AioCompletion *c, + FlushSource flush_source, + const ZTracer::Trace &parent_trace) { + ImageFlushRequest<I> req(*ictx, c, flush_source, 
parent_trace); + req.send(); +} + +template <typename I> +void ImageRequest<I>::aio_writesame(I *ictx, AioCompletion *c, + Extents &&image_extents, ImageArea area, + bufferlist &&bl, int op_flags, + const ZTracer::Trace &parent_trace) { + ImageWriteSameRequest<I> req(*ictx, c, std::move(image_extents), area, + std::move(bl), op_flags, parent_trace); + req.send(); +} + +template <typename I> +void ImageRequest<I>::aio_compare_and_write(I *ictx, AioCompletion *c, + Extents &&image_extents, + ImageArea area, + bufferlist &&cmp_bl, + bufferlist &&bl, + uint64_t *mismatch_offset, + int op_flags, + const ZTracer::Trace &parent_trace) { + ImageCompareAndWriteRequest<I> req(*ictx, c, std::move(image_extents), area, + std::move(cmp_bl), std::move(bl), + mismatch_offset, op_flags, parent_trace); + req.send(); +} + +template <typename I> +void ImageRequest<I>::send() { + I &image_ctx = this->m_image_ctx; + ceph_assert(m_aio_comp->is_initialized(get_aio_type())); + ceph_assert(m_aio_comp->is_started()); + + CephContext *cct = image_ctx.cct; + AioCompletion *aio_comp = this->m_aio_comp; + ldout(cct, 20) << get_request_type() << ": ictx=" << &image_ctx << ", " + << "completion=" << aio_comp << dendl; + + update_timestamp(); + send_request(); +} + +template <typename I> +void ImageRequest<I>::update_timestamp() { + bool modify = (get_aio_type() != AIO_TYPE_READ); + uint64_t update_interval; + if (modify) { + update_interval = m_image_ctx.mtime_update_interval; + } else { + update_interval = m_image_ctx.atime_update_interval; + } + + if (update_interval == 0) { + return; + } + + utime_t (I::*get_timestamp_fn)() const; + void (I::*set_timestamp_fn)(utime_t); + if (modify) { + get_timestamp_fn = &I::get_modify_timestamp; + set_timestamp_fn = &I::set_modify_timestamp; + } else { + get_timestamp_fn = &I::get_access_timestamp; + set_timestamp_fn = &I::set_access_timestamp; + } + + utime_t ts = ceph_clock_now(); + { + std::shared_lock timestamp_locker{m_image_ctx.timestamp_lock}; + 
if(!should_update_timestamp(ts, std::invoke(get_timestamp_fn, m_image_ctx), + update_interval)) { + return; + } + } + + { + std::unique_lock timestamp_locker{m_image_ctx.timestamp_lock}; + bool update = should_update_timestamp( + ts, std::invoke(get_timestamp_fn, m_image_ctx), update_interval); + if (!update) { + return; + } + + std::invoke(set_timestamp_fn, m_image_ctx, ts); + } + + // TODO we fire and forget this outside the IO path to prevent + // potential race conditions with librbd client IO callbacks + // between different threads (e.g. librados and object cacher) + ldout(m_image_ctx.cct, 10) << get_request_type() << dendl; + auto req = new C_UpdateTimestamp<I>(m_image_ctx, modify); + req->send(); +} + +template <typename I> +ImageReadRequest<I>::ImageReadRequest(I &image_ctx, AioCompletion *aio_comp, + Extents &&image_extents, ImageArea area, + ReadResult &&read_result, + IOContext io_context, int op_flags, + int read_flags, + const ZTracer::Trace &parent_trace) + : ImageRequest<I>(image_ctx, aio_comp, std::move(image_extents), area, + "read", parent_trace), + m_io_context(io_context), m_op_flags(op_flags), m_read_flags(read_flags) { + aio_comp->read_result = std::move(read_result); +} + +template <typename I> +void ImageReadRequest<I>::send_request() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + + auto &image_extents = this->m_image_extents; + if (this->m_image_area == ImageArea::DATA && + image_ctx.cache && image_ctx.readahead_max_bytes > 0 && + !(m_op_flags & LIBRADOS_OP_FLAG_FADVISE_RANDOM)) { + readahead(get_image_ctx(&image_ctx), image_extents, m_io_context); + } + + // map image extents to object extents + LightweightObjectExtents object_extents; + uint64_t buffer_ofs = 0; + for (auto &extent : image_extents) { + if (extent.second == 0) { + continue; + } + + util::area_to_object_extents(&image_ctx, extent.first, extent.second, + this->m_image_area, buffer_ofs, + &object_extents); + buffer_ofs += extent.second; + } + + 
AioCompletion *aio_comp = this->m_aio_comp; + aio_comp->read_result.set_image_extents(image_extents); + + // issue the requests + aio_comp->set_request_count(object_extents.size()); + for (auto &oe : object_extents) { + ldout(cct, 20) << data_object_name(&image_ctx, oe.object_no) << " " + << oe.offset << "~" << oe.length << " from " + << oe.buffer_extents << dendl; + + auto req_comp = new io::ReadResult::C_ObjectReadRequest( + aio_comp, {{oe.offset, oe.length, std::move(oe.buffer_extents)}}); + auto req = ObjectDispatchSpec::create_read( + &image_ctx, OBJECT_DISPATCH_LAYER_NONE, oe.object_no, + &req_comp->extents, m_io_context, m_op_flags, m_read_flags, + this->m_trace, nullptr, req_comp); + req->send(); + } + + image_ctx.perfcounter->inc(l_librbd_rd); + image_ctx.perfcounter->inc(l_librbd_rd_bytes, buffer_ofs); +} + +template <typename I> +void AbstractImageWriteRequest<I>::send_request() { + I &image_ctx = this->m_image_ctx; + + bool journaling = false; + + AioCompletion *aio_comp = this->m_aio_comp; + { + // prevent image size from changing between computing clip and recording + // pending async operation + std::shared_lock image_locker{image_ctx.image_lock}; + journaling = (image_ctx.journal != nullptr && + image_ctx.journal->is_journal_appending()); + } + + uint64_t clip_len = 0; + LightweightObjectExtents object_extents; + for (auto &extent : this->m_image_extents) { + if (extent.second == 0) { + continue; + } + + // map to object extents + io::util::area_to_object_extents(&image_ctx, extent.first, extent.second, + this->m_image_area, clip_len, + &object_extents); + clip_len += extent.second; + } + + int ret = prune_object_extents(&object_extents); + if (ret < 0) { + aio_comp->fail(ret); + return; + } + + // reflect changes in object_extents back to m_image_extents + if (ret == 1) { + this->m_image_extents.clear(); + for (auto& object_extent : object_extents) { + auto [image_extents, _] = io::util::object_to_area_extents( + &image_ctx, 
object_extent.object_no, + {{object_extent.offset, object_extent.length}}); + this->m_image_extents.insert(this->m_image_extents.end(), + image_extents.begin(), image_extents.end()); + } + } + + aio_comp->set_request_count(object_extents.size()); + if (!object_extents.empty()) { + uint64_t journal_tid = 0; + if (journaling) { + // in-flight ops are flushed prior to closing the journal + ceph_assert(image_ctx.journal != NULL); + journal_tid = append_journal_event(m_synchronous); + } + + // it's very important that IOContext is captured here instead of + // e.g. at the API layer so that an up-to-date snap context is used + // when owning the exclusive lock + send_object_requests(object_extents, image_ctx.get_data_io_context(), + journal_tid); + } + + update_stats(clip_len); +} + +template <typename I> +void AbstractImageWriteRequest<I>::send_object_requests( + const LightweightObjectExtents &object_extents, IOContext io_context, + uint64_t journal_tid) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + + AioCompletion *aio_comp = this->m_aio_comp; + bool single_extent = (object_extents.size() == 1); + for (auto& oe : object_extents) { + ldout(cct, 20) << data_object_name(&image_ctx, oe.object_no) << " " + << oe.offset << "~" << oe.length << " from " + << oe.buffer_extents << dendl; + C_AioRequest *req_comp = new C_AioRequest(aio_comp); + auto request = create_object_request(oe, io_context, journal_tid, + single_extent, req_comp); + request->send(); + } +} + +template <typename I> +void ImageWriteRequest<I>::assemble_extent( + const LightweightObjectExtent &object_extent, bufferlist *bl) { + for (auto q = object_extent.buffer_extents.begin(); + q != object_extent.buffer_extents.end(); ++q) { + bufferlist sub_bl; + sub_bl.substr_of(m_bl, q->first, q->second); + bl->claim_append(sub_bl); + } +} + +template <typename I> +uint64_t ImageWriteRequest<I>::append_journal_event(bool synchronous) { + I &image_ctx = this->m_image_ctx; + + uint64_t tid = 
0; + uint64_t buffer_offset = 0; + ceph_assert(!this->m_image_extents.empty()); + for (auto &extent : this->m_image_extents) { + bufferlist sub_bl; + sub_bl.substr_of(m_bl, buffer_offset, extent.second); + buffer_offset += extent.second; + + tid = image_ctx.journal->append_write_event(extent.first, extent.second, + sub_bl, synchronous); + } + + return tid; +} + +template <typename I> +ObjectDispatchSpec *ImageWriteRequest<I>::create_object_request( + const LightweightObjectExtent &object_extent, IOContext io_context, + uint64_t journal_tid, bool single_extent, Context *on_finish) { + I &image_ctx = this->m_image_ctx; + + bufferlist bl; + if (single_extent && object_extent.buffer_extents.size() == 1 && + m_bl.length() == object_extent.length) { + // optimization for single object/buffer extent writes + bl = std::move(m_bl); + } else { + assemble_extent(object_extent, &bl); + } + + auto req = ObjectDispatchSpec::create_write( + &image_ctx, OBJECT_DISPATCH_LAYER_NONE, object_extent.object_no, + object_extent.offset, std::move(bl), io_context, m_op_flags, 0, + std::nullopt, journal_tid, this->m_trace, on_finish); + return req; +} + +template <typename I> +void ImageWriteRequest<I>::update_stats(size_t length) { + I &image_ctx = this->m_image_ctx; + image_ctx.perfcounter->inc(l_librbd_wr); + image_ctx.perfcounter->inc(l_librbd_wr_bytes, length); +} + +template <typename I> +uint64_t ImageDiscardRequest<I>::append_journal_event(bool synchronous) { + I &image_ctx = this->m_image_ctx; + + uint64_t tid = 0; + ceph_assert(!this->m_image_extents.empty()); + for (auto &extent : this->m_image_extents) { + journal::EventEntry event_entry( + journal::AioDiscardEvent(extent.first, + extent.second, + this->m_discard_granularity_bytes)); + tid = image_ctx.journal->append_io_event(std::move(event_entry), + extent.first, extent.second, + synchronous, 0); + } + + return tid; +} + +template <typename I> +ObjectDispatchSpec *ImageDiscardRequest<I>::create_object_request( + const 
    LightweightObjectExtent &object_extent, IOContext io_context,
    uint64_t journal_tid, bool single_extent, Context *on_finish) {
  I &image_ctx = this->m_image_ctx;
  auto req = ObjectDispatchSpec::create_discard(
    &image_ctx, OBJECT_DISPATCH_LAYER_NONE, object_extent.object_no,
    object_extent.offset, object_extent.length, io_context,
    OBJECT_DISCARD_FLAG_DISABLE_CLONE_REMOVE, journal_tid, this->m_trace,
    on_finish);
  return req;
}

// Bump the discard op/byte perf counters with the clipped length.
template <typename I>
void ImageDiscardRequest<I>::update_stats(size_t length) {
  I &image_ctx = this->m_image_ctx;
  image_ctx.perfcounter->inc(l_librbd_discard);
  image_ctx.perfcounter->inc(l_librbd_discard_bytes, length);
}

template <typename I>
int ImageDiscardRequest<I>::prune_object_extents(
    LightweightObjectExtents* object_extents) const {
  // granularity of zero means no alignment/pruning is requested
  if (m_discard_granularity_bytes == 0) {
    return 0;
  }

  // Align the range to discard_granularity_bytes boundary and skip
  // any discards that are too small to free up any space.
  //
  // discard_granularity_bytes >= object_size && tail truncation
  // is a special case for filestore
  bool prune_required = false;
  bool length_modified = false;
  auto object_size = this->m_image_ctx.layout.object_size;
  // granularity is capped at the object size so whole-object discards
  // (the filestore special case above) are never pruned away
  auto discard_granularity_bytes = std::min(m_discard_granularity_bytes,
                                            object_size);
  auto xform_lambda =
    [discard_granularity_bytes, object_size, &prune_required, &length_modified]
    (LightweightObjectExtent& object_extent) {
      auto& offset = object_extent.offset;
      auto& length = object_extent.length;
      auto next_offset = offset + length;

      if ((discard_granularity_bytes < object_size) ||
          (next_offset < object_size)) {
        // shrink [offset, next_offset) inward to the enclosed
        // granularity-aligned sub-range
        offset = p2roundup<uint64_t>(offset, discard_granularity_bytes);
        next_offset = p2align<uint64_t>(next_offset, discard_granularity_bytes);
        if (offset >= next_offset) {
          // no aligned range remains -- flag the extent for removal below
          prune_required = true;
          length = 0;
        } else {
          auto new_length = next_offset - offset;
          if (length != new_length) {
            length_modified = true;
            length = new_length;
          }
        }
      }
    };
  std::for_each(object_extents->begin(), object_extents->end(),
                xform_lambda);

  if (prune_required) {
    // one or more object extents were skipped -- drop the zero-length
    // entries via the erase-remove idiom
    auto remove_lambda =
      [](const LightweightObjectExtent& object_extent) {
        return (object_extent.length == 0);
      };
    object_extents->erase(
      std::remove_if(object_extents->begin(), object_extents->end(),
                     remove_lambda),
      object_extents->end());
  }

  // object extents were modified, image extents need updating
  // (return 1 tells the caller to rebuild m_image_extents)
  if (length_modified || prune_required) {
    return 1;
  }

  return 0;
}

template <typename I>
void ImageFlushRequest<I>::send_request() {
  I &image_ctx = this->m_image_ctx;

  // only user-initiated flushes are recorded in the journal
  bool journaling = false;
  {
    std::shared_lock image_locker{image_ctx.image_lock};
    journaling = (m_flush_source == FLUSH_SOURCE_USER &&
                  image_ctx.journal != nullptr &&
                  image_ctx.journal->is_journal_appending());
  }

  AioCompletion *aio_comp = this->m_aio_comp;
  aio_comp->set_request_count(1);

  Context *ctx = new C_AioRequest(aio_comp);

  // ensure no locks are held when flush is complete
  ctx = librbd::util::create_async_context_callback(image_ctx, ctx);

  uint64_t journal_tid = 0;
  if (journaling) {
    // in-flight ops are flushed prior to closing the journal
    ceph_assert(image_ctx.journal != NULL);
    journal_tid = image_ctx.journal->append_io_event(
      journal::EventEntry(journal::AioFlushEvent()), 0, 0, false, 0);
    image_ctx.journal->user_flushed();
  }

  // wrap the dispatch in a Context so it can be deferred until the
  // async_op flush below has settled all in-flight IO
  auto object_dispatch_spec = ObjectDispatchSpec::create_flush(
    &image_ctx, OBJECT_DISPATCH_LAYER_NONE, m_flush_source, journal_tid,
    this->m_trace, ctx);
  ctx = new LambdaContext([object_dispatch_spec](int r) {
    object_dispatch_spec->send();
  });

  // ensure all in-flight IOs are settled if non-user flush request
  if (m_flush_source == FLUSH_SOURCE_WRITEBACK) {
    ctx->complete(0);
  } else {
    aio_comp->async_op.flush(ctx);
  }

  // might be flushing during image shutdown
  if (image_ctx.perfcounter != nullptr) {
image_ctx.perfcounter->inc(l_librbd_flush); + } +} + +template <typename I> +uint64_t ImageWriteSameRequest<I>::append_journal_event(bool synchronous) { + I &image_ctx = this->m_image_ctx; + + uint64_t tid = 0; + ceph_assert(!this->m_image_extents.empty()); + for (auto &extent : this->m_image_extents) { + journal::EventEntry event_entry(journal::AioWriteSameEvent(extent.first, + extent.second, + m_data_bl)); + tid = image_ctx.journal->append_io_event(std::move(event_entry), + extent.first, extent.second, + synchronous, 0); + } + + return tid; +} + +template <typename I> +ObjectDispatchSpec *ImageWriteSameRequest<I>::create_object_request( + const LightweightObjectExtent &object_extent, IOContext io_context, + uint64_t journal_tid, bool single_extent, Context *on_finish) { + I &image_ctx = this->m_image_ctx; + + bufferlist bl; + ObjectDispatchSpec *req; + + if (util::assemble_write_same_extent(object_extent, m_data_bl, &bl, false)) { + auto buffer_extents{object_extent.buffer_extents}; + + req = ObjectDispatchSpec::create_write_same( + &image_ctx, OBJECT_DISPATCH_LAYER_NONE, object_extent.object_no, + object_extent.offset, object_extent.length, std::move(buffer_extents), + std::move(bl), io_context, m_op_flags, journal_tid, + this->m_trace, on_finish); + return req; + } + req = ObjectDispatchSpec::create_write( + &image_ctx, OBJECT_DISPATCH_LAYER_NONE, object_extent.object_no, + object_extent.offset, std::move(bl), io_context, m_op_flags, 0, + std::nullopt, journal_tid, this->m_trace, on_finish); + return req; +} + +template <typename I> +void ImageWriteSameRequest<I>::update_stats(size_t length) { + I &image_ctx = this->m_image_ctx; + image_ctx.perfcounter->inc(l_librbd_ws); + image_ctx.perfcounter->inc(l_librbd_ws_bytes, length); +} + +template <typename I> +uint64_t ImageCompareAndWriteRequest<I>::append_journal_event( + bool synchronous) { + I &image_ctx = this->m_image_ctx; + + uint64_t tid = 0; + ceph_assert(this->m_image_extents.size() == 1); + auto &extent = 
this->m_image_extents.front();
  tid = image_ctx.journal->append_compare_and_write_event(extent.first,
                                                          extent.second,
                                                          m_cmp_bl,
                                                          m_bl,
                                                          synchronous);

  return tid;
}

// Slice both the write payload (m_bl) and the compare buffer (m_cmp_bl)
// for a single object extent; the same buffer-extent offsets apply to both.
template <typename I>
void ImageCompareAndWriteRequest<I>::assemble_extent(
    const LightweightObjectExtent &object_extent, bufferlist *bl,
    bufferlist *cmp_bl) {
  for (auto q = object_extent.buffer_extents.begin();
       q != object_extent.buffer_extents.end(); ++q) {
    bufferlist sub_bl;
    sub_bl.substr_of(m_bl, q->first, q->second);
    bl->claim_append(sub_bl);

    bufferlist sub_cmp_bl;
    sub_cmp_bl.substr_of(m_cmp_bl, q->first, q->second);
    cmp_bl->claim_append(sub_cmp_bl);
  }
}

template <typename I>
ObjectDispatchSpec *ImageCompareAndWriteRequest<I>::create_object_request(
    const LightweightObjectExtent &object_extent, IOContext io_context,
    uint64_t journal_tid, bool single_extent, Context *on_finish) {
  I &image_ctx = this->m_image_ctx;

  bufferlist bl;
  bufferlist cmp_bl;
  assemble_extent(object_extent, &bl, &cmp_bl);
  auto req = ObjectDispatchSpec::create_compare_and_write(
    &image_ctx, OBJECT_DISPATCH_LAYER_NONE, object_extent.object_no,
    object_extent.offset, std::move(cmp_bl), std::move(bl), io_context,
    m_mismatch_offset, m_op_flags, journal_tid, this->m_trace, on_finish);
  return req;
}

// Bump the compare-and-write op/byte perf counters.
template <typename I>
void ImageCompareAndWriteRequest<I>::update_stats(size_t length) {
  I &image_ctx = this->m_image_ctx;
  image_ctx.perfcounter->inc(l_librbd_cmp);
  image_ctx.perfcounter->inc(l_librbd_cmp_bytes, length);
}

// Compare-and-write is restricted to a single object extent that does not
// cross a stripe-unit boundary; anything else is rejected with -EINVAL.
template <typename I>
int ImageCompareAndWriteRequest<I>::prune_object_extents(
    LightweightObjectExtents* object_extents) const {
  if (object_extents->size() > 1)
    return -EINVAL;

  // NOTE(review): front() is reached without an explicit empty() check --
  // presumably callers never submit an all-zero-length extent set; confirm
  // against the API-layer validation.
  I &image_ctx = this->m_image_ctx;
  uint64_t su = image_ctx.layout.stripe_unit;
  auto& object_extent = object_extents->front();
  if (su == 0 || (object_extent.offset % su + object_extent.length > su))
    return -EINVAL;

  return 0;
}

template
<typename I> +ImageListSnapsRequest<I>::ImageListSnapsRequest( + I& image_ctx, AioCompletion* aio_comp, Extents&& image_extents, + ImageArea area, SnapIds&& snap_ids, int list_snaps_flags, + SnapshotDelta* snapshot_delta, const ZTracer::Trace& parent_trace) + : ImageRequest<I>(image_ctx, aio_comp, std::move(image_extents), area, + "list-snaps", parent_trace), + m_snap_ids(std::move(snap_ids)), m_list_snaps_flags(list_snaps_flags), + m_snapshot_delta(snapshot_delta) { +} + +template <typename I> +void ImageListSnapsRequest<I>::send_request() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + + // map image extents to object extents + auto &image_extents = this->m_image_extents; + std::map<uint64_t, Extents> object_number_extents; + for (auto& image_extent : image_extents) { + if (image_extent.second == 0) { + continue; + } + + striper::LightweightObjectExtents object_extents; + io::util::area_to_object_extents(&image_ctx, image_extent.first, + image_extent.second, this->m_image_area, 0, + &object_extents); + for (auto& object_extent : object_extents) { + object_number_extents[object_extent.object_no].emplace_back( + object_extent.offset, object_extent.length); + } + } + + // reassemble the deltas back into image-extents when complete + auto aio_comp = this->m_aio_comp; + aio_comp->set_request_count(1); + auto assemble_ctx = new C_AssembleSnapshotDeltas<I>( + &image_ctx, aio_comp, m_snapshot_delta); + auto sub_aio_comp = AioCompletion::create_and_start< + Context, &Context::complete>(assemble_ctx, get_image_ctx(&image_ctx), + AIO_TYPE_GENERIC); + + // issue the requests + sub_aio_comp->set_request_count(object_number_extents.size()); + for (auto& oe : object_number_extents) { + ldout(cct, 20) << data_object_name(&image_ctx, oe.first) << " " + << oe.second << dendl; + auto ctx = new C_AioRequest(sub_aio_comp); + auto req = ObjectDispatchSpec::create_list_snaps( + &image_ctx, OBJECT_DISPATCH_LAYER_NONE, oe.first, std::move(oe.second), + 
SnapIds{m_snap_ids}, m_list_snaps_flags, this->m_trace, + assemble_ctx->get_snapshot_delta(oe.first), ctx); + req->send(); + } +} + +} // namespace io +} // namespace librbd + +template class librbd::io::ImageRequest<librbd::ImageCtx>; +template class librbd::io::ImageReadRequest<librbd::ImageCtx>; +template class librbd::io::AbstractImageWriteRequest<librbd::ImageCtx>; +template class librbd::io::ImageWriteRequest<librbd::ImageCtx>; +template class librbd::io::ImageDiscardRequest<librbd::ImageCtx>; +template class librbd::io::ImageFlushRequest<librbd::ImageCtx>; +template class librbd::io::ImageWriteSameRequest<librbd::ImageCtx>; +template class librbd::io::ImageCompareAndWriteRequest<librbd::ImageCtx>; +template class librbd::io::ImageListSnapsRequest<librbd::ImageCtx>; diff --git a/src/librbd/io/ImageRequest.h b/src/librbd/io/ImageRequest.h new file mode 100644 index 000000000..2668c1acb --- /dev/null +++ b/src/librbd/io/ImageRequest.h @@ -0,0 +1,377 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_IO_IMAGE_REQUEST_H +#define CEPH_LIBRBD_IO_IMAGE_REQUEST_H + +#include "include/int_types.h" +#include "include/buffer_fwd.h" +#include "common/zipkin_trace.h" +#include "osd/osd_types.h" +#include "librbd/Utils.h" +#include "librbd/Types.h" +#include "librbd/io/Types.h" +#include <list> +#include <utility> +#include <vector> + +namespace librbd { +class ImageCtx; + +namespace io { + +class AioCompletion; +class ObjectDispatchSpec; +class ReadResult; + +template <typename ImageCtxT = ImageCtx> +class ImageRequest { +public: + virtual ~ImageRequest() { + m_trace.event("finish"); + } + + static void aio_read(ImageCtxT *ictx, AioCompletion *c, + Extents &&image_extents, ImageArea area, + ReadResult &&read_result, IOContext io_context, + int op_flags, int read_flags, + const ZTracer::Trace &parent_trace); + static void aio_write(ImageCtxT *ictx, AioCompletion *c, + Extents &&image_extents, 
ImageArea area, + bufferlist &&bl, int op_flags, + const ZTracer::Trace &parent_trace); + static void aio_discard(ImageCtxT *ictx, AioCompletion *c, + Extents &&image_extents, ImageArea area, + uint32_t discard_granularity_bytes, + const ZTracer::Trace &parent_trace); + static void aio_flush(ImageCtxT *ictx, AioCompletion *c, + FlushSource flush_source, + const ZTracer::Trace &parent_trace); + static void aio_writesame(ImageCtxT *ictx, AioCompletion *c, + Extents &&image_extents, ImageArea area, + bufferlist &&bl, int op_flags, + const ZTracer::Trace &parent_trace); + static void aio_compare_and_write(ImageCtxT *ictx, AioCompletion *c, + Extents &&image_extents, ImageArea area, + bufferlist &&cmp_bl, bufferlist &&bl, + uint64_t *mismatch_offset, int op_flags, + const ZTracer::Trace &parent_trace); + + void send(); + + inline const ZTracer::Trace &get_trace() const { + return m_trace; + } + +protected: + typedef std::list<ObjectDispatchSpec*> ObjectRequests; + + ImageCtxT &m_image_ctx; + AioCompletion *m_aio_comp; + Extents m_image_extents; + ImageArea m_image_area; + ZTracer::Trace m_trace; + + ImageRequest(ImageCtxT &image_ctx, AioCompletion *aio_comp, + Extents &&image_extents, ImageArea area, const char *trace_name, + const ZTracer::Trace &parent_trace) + : m_image_ctx(image_ctx), m_aio_comp(aio_comp), + m_image_extents(std::move(image_extents)), m_image_area(area), + m_trace(librbd::util::create_trace(image_ctx, trace_name, parent_trace)) { + m_trace.event("start"); + } + + virtual void update_timestamp(); + virtual void send_request() = 0; + + virtual aio_type_t get_aio_type() const = 0; + virtual const char *get_request_type() const = 0; +}; + +template <typename ImageCtxT = ImageCtx> +class ImageReadRequest : public ImageRequest<ImageCtxT> { +public: + ImageReadRequest(ImageCtxT &image_ctx, AioCompletion *aio_comp, + Extents &&image_extents, ImageArea area, + ReadResult &&read_result, IOContext io_context, int op_flags, + int read_flags, const ZTracer::Trace 
&parent_trace); + +protected: + void send_request() override; + + aio_type_t get_aio_type() const override { + return AIO_TYPE_READ; + } + const char *get_request_type() const override { + return "aio_read"; + } + +private: + IOContext m_io_context; + int m_op_flags; + int m_read_flags; +}; + +template <typename ImageCtxT = ImageCtx> +class AbstractImageWriteRequest : public ImageRequest<ImageCtxT> { +public: + inline void flag_synchronous() { + m_synchronous = true; + } + +protected: + using typename ImageRequest<ImageCtxT>::ObjectRequests; + + AbstractImageWriteRequest(ImageCtxT &image_ctx, AioCompletion *aio_comp, + Extents &&image_extents, ImageArea area, + const char *trace_name, + const ZTracer::Trace &parent_trace) + : ImageRequest<ImageCtxT>(image_ctx, aio_comp, std::move(image_extents), + area, trace_name, parent_trace), + m_synchronous(false) { + } + + void send_request() override; + + virtual int prune_object_extents( + LightweightObjectExtents* object_extents) const { + return 0; + } + + void send_object_requests(const LightweightObjectExtents &object_extents, + IOContext io_context, uint64_t journal_tid); + virtual ObjectDispatchSpec *create_object_request( + const LightweightObjectExtent &object_extent, IOContext io_context, + uint64_t journal_tid, bool single_extent, Context *on_finish) = 0; + + virtual uint64_t append_journal_event(bool synchronous) = 0; + virtual void update_stats(size_t length) = 0; + +private: + bool m_synchronous; +}; + +template <typename ImageCtxT = ImageCtx> +class ImageWriteRequest : public AbstractImageWriteRequest<ImageCtxT> { +public: + ImageWriteRequest(ImageCtxT &image_ctx, AioCompletion *aio_comp, + Extents &&image_extents, ImageArea area, bufferlist &&bl, + int op_flags, const ZTracer::Trace &parent_trace) + : AbstractImageWriteRequest<ImageCtxT>( + image_ctx, aio_comp, std::move(image_extents), area, + "write", parent_trace), + m_bl(std::move(bl)), m_op_flags(op_flags) { + } + +protected: + using typename 
ImageRequest<ImageCtxT>::ObjectRequests; + + aio_type_t get_aio_type() const override { + return AIO_TYPE_WRITE; + } + const char *get_request_type() const override { + return "aio_write"; + } + + void assemble_extent(const LightweightObjectExtent &object_extent, + bufferlist *bl); + + ObjectDispatchSpec *create_object_request( + const LightweightObjectExtent &object_extent, IOContext io_context, + uint64_t journal_tid, bool single_extent, Context *on_finish) override; + + uint64_t append_journal_event(bool synchronous) override; + void update_stats(size_t length) override; + +private: + bufferlist m_bl; + int m_op_flags; +}; + +template <typename ImageCtxT = ImageCtx> +class ImageDiscardRequest : public AbstractImageWriteRequest<ImageCtxT> { +public: + ImageDiscardRequest(ImageCtxT &image_ctx, AioCompletion *aio_comp, + Extents&& image_extents, ImageArea area, + uint32_t discard_granularity_bytes, + const ZTracer::Trace &parent_trace) + : AbstractImageWriteRequest<ImageCtxT>( + image_ctx, aio_comp, std::move(image_extents), area, + "discard", parent_trace), + m_discard_granularity_bytes(discard_granularity_bytes) { + } + +protected: + using typename ImageRequest<ImageCtxT>::ObjectRequests; + + aio_type_t get_aio_type() const override { + return AIO_TYPE_DISCARD; + } + const char *get_request_type() const override { + return "aio_discard"; + } + + ObjectDispatchSpec *create_object_request( + const LightweightObjectExtent &object_extent, IOContext io_context, + uint64_t journal_tid, bool single_extent, Context *on_finish) override; + + uint64_t append_journal_event(bool synchronous) override; + void update_stats(size_t length) override; + + int prune_object_extents( + LightweightObjectExtents* object_extents) const override; + +private: + uint32_t m_discard_granularity_bytes; +}; + +template <typename ImageCtxT = ImageCtx> +class ImageFlushRequest : public ImageRequest<ImageCtxT> { +public: + ImageFlushRequest(ImageCtxT &image_ctx, AioCompletion *aio_comp, + 
FlushSource flush_source, + const ZTracer::Trace &parent_trace) + : ImageRequest<ImageCtxT>(image_ctx, aio_comp, {}, + ImageArea::DATA /* dummy for {} */, + "flush", parent_trace), + m_flush_source(flush_source) { + } + +protected: + using typename ImageRequest<ImageCtxT>::ObjectRequests; + + void update_timestamp() override { + } + void send_request() override; + + aio_type_t get_aio_type() const override { + return AIO_TYPE_FLUSH; + } + const char *get_request_type() const override { + return "aio_flush"; + } + +private: + FlushSource m_flush_source; + +}; + +template <typename ImageCtxT = ImageCtx> +class ImageWriteSameRequest : public AbstractImageWriteRequest<ImageCtxT> { +public: + ImageWriteSameRequest(ImageCtxT &image_ctx, AioCompletion *aio_comp, + Extents&& image_extents, ImageArea area, + bufferlist &&bl, int op_flags, + const ZTracer::Trace &parent_trace) + : AbstractImageWriteRequest<ImageCtxT>( + image_ctx, aio_comp, std::move(image_extents), area, + "writesame", parent_trace), + m_data_bl(std::move(bl)), m_op_flags(op_flags) { + } + +protected: + using typename ImageRequest<ImageCtxT>::ObjectRequests; + + aio_type_t get_aio_type() const override { + return AIO_TYPE_WRITESAME; + } + const char *get_request_type() const override { + return "aio_writesame"; + } + + ObjectDispatchSpec *create_object_request( + const LightweightObjectExtent &object_extent, IOContext io_context, + uint64_t journal_tid, bool single_extent, Context *on_finish) override; + + uint64_t append_journal_event(bool synchronous) override; + void update_stats(size_t length) override; +private: + bufferlist m_data_bl; + int m_op_flags; +}; + +template <typename ImageCtxT = ImageCtx> +class ImageCompareAndWriteRequest : public AbstractImageWriteRequest<ImageCtxT> { +public: + using typename ImageRequest<ImageCtxT>::ObjectRequests; + + ImageCompareAndWriteRequest(ImageCtxT &image_ctx, AioCompletion *aio_comp, + Extents &&image_extents, ImageArea area, + bufferlist &&cmp_bl, bufferlist 
&&bl, + uint64_t *mismatch_offset, int op_flags, + const ZTracer::Trace &parent_trace) + : AbstractImageWriteRequest<ImageCtxT>( + image_ctx, aio_comp, std::move(image_extents), area, + "compare_and_write", parent_trace), + m_cmp_bl(std::move(cmp_bl)), m_bl(std::move(bl)), + m_mismatch_offset(mismatch_offset), m_op_flags(op_flags) { + } + +protected: + void assemble_extent(const LightweightObjectExtent &object_extent, + bufferlist *bl, bufferlist *cmp_bl); + + ObjectDispatchSpec *create_object_request( + const LightweightObjectExtent &object_extent, IOContext io_context, + uint64_t journal_tid, bool single_extent, Context *on_finish) override; + + uint64_t append_journal_event(bool synchronous) override; + void update_stats(size_t length) override; + + aio_type_t get_aio_type() const override { + return AIO_TYPE_COMPARE_AND_WRITE; + } + const char *get_request_type() const override { + return "aio_compare_and_write"; + } + + int prune_object_extents( + LightweightObjectExtents* object_extents) const override; + +private: + bufferlist m_cmp_bl; + bufferlist m_bl; + uint64_t *m_mismatch_offset; + int m_op_flags; +}; + +template <typename ImageCtxT = ImageCtx> +class ImageListSnapsRequest : public ImageRequest<ImageCtxT> { +public: + ImageListSnapsRequest( + ImageCtxT& image_ctx, AioCompletion* aio_comp, + Extents&& image_extents, ImageArea area, SnapIds&& snap_ids, + int list_snaps_flags, SnapshotDelta* snapshot_delta, + const ZTracer::Trace& parent_trace); + +protected: + void update_timestamp() override {} + void send_request() override; + + aio_type_t get_aio_type() const override { + return AIO_TYPE_GENERIC; + } + const char *get_request_type() const override { + return "list-snaps"; + } + +private: + SnapIds m_snap_ids; + int m_list_snaps_flags; + SnapshotDelta* m_snapshot_delta; +}; + +} // namespace io +} // namespace librbd + +extern template class librbd::io::ImageRequest<librbd::ImageCtx>; +extern template class 
librbd::io::ImageReadRequest<librbd::ImageCtx>; +extern template class librbd::io::AbstractImageWriteRequest<librbd::ImageCtx>; +extern template class librbd::io::ImageWriteRequest<librbd::ImageCtx>; +extern template class librbd::io::ImageDiscardRequest<librbd::ImageCtx>; +extern template class librbd::io::ImageFlushRequest<librbd::ImageCtx>; +extern template class librbd::io::ImageWriteSameRequest<librbd::ImageCtx>; +extern template class librbd::io::ImageCompareAndWriteRequest<librbd::ImageCtx>; +extern template class librbd::io::ImageListSnapsRequest<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_IO_IMAGE_REQUEST_H diff --git a/src/librbd/io/IoOperations.cc b/src/librbd/io/IoOperations.cc new file mode 100644 index 000000000..7db7e7a80 --- /dev/null +++ b/src/librbd/io/IoOperations.cc @@ -0,0 +1,101 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <boost/lexical_cast.hpp> +#include <boost/algorithm/string.hpp> + +#include "librbd/io/Types.h" +#include "librbd/io/IoOperations.h" + +#include <map> +#include <vector> + +namespace librbd { +namespace io { + +#define RBD_IO_OPERATION_NAME_READ "read" +#define RBD_IO_OPERATION_NAME_WRITE "write" +#define RBD_IO_OPERATION_NAME_DISCARD "discard" +#define RBD_IO_OPERATION_NAME_WRITE_SAME "write_same" +#define RBD_IO_OPERATION_NAME_COMPARE_AND_WRITE "compare_and_write" + +static const std::map<std::string, uint64_t> RBD_IO_OPERATION_MAP = { + {RBD_IO_OPERATION_NAME_READ, RBD_IO_OPERATION_READ}, + {RBD_IO_OPERATION_NAME_WRITE, RBD_IO_OPERATION_WRITE}, + {RBD_IO_OPERATION_NAME_DISCARD, RBD_IO_OPERATION_DISCARD}, + {RBD_IO_OPERATION_NAME_WRITE_SAME, RBD_IO_OPERATION_WRITE_SAME}, + {RBD_IO_OPERATION_NAME_COMPARE_AND_WRITE, RBD_IO_OPERATION_COMPARE_AND_WRITE}, +}; +static_assert((RBD_IO_OPERATION_COMPARE_AND_WRITE << 1) > RBD_IO_OPERATIONS_ALL, + "new RBD io operation added"); + +std::string rbd_io_operations_to_string(uint64_t operations, + std::ostream *err) +{ 
  // Emit the name of every recognized flag present in `operations`,
  // comma-separated, clearing each flag as it is consumed so that only
  // unrecognized bits remain afterwards.
  std::string r;
  for (auto& i : RBD_IO_OPERATION_MAP) {
    if (operations & i.second) {
      if (!r.empty()) {
        r += ",";
      }
      r += i.first;
      operations &= ~i.second;
    }
  }
  // Whatever bits survived the loop do not map to a known operation name.
  if (err && operations) {
    *err << "ignoring unknown io operation mask 0x"
         << std::hex << operations << std::dec;
  }
  return r;
}

// Parse an operation mask from either a plain number or a comma-separated
// list of operation names. An empty/whitespace-only string yields the
// default mask; unrecognized bits/names are reported via *err (if non-null)
// and otherwise ignored.
uint64_t rbd_io_operations_from_string(const std::string& orig_value,
                                       std::ostream *err)
{
  uint64_t operations = 0;
  std::string value = orig_value;
  boost::trim(value);

  // empty string means default operations
  if (!value.size()) {
    return RBD_IO_OPERATIONS_DEFAULT;
  }

  try {
    // numeric?
    operations = boost::lexical_cast<uint64_t>(value);

    // drop unrecognized bits
    uint64_t unsupported_operations = (operations & ~RBD_IO_OPERATIONS_ALL);
    if (unsupported_operations != 0ull) {
      operations &= RBD_IO_OPERATIONS_ALL;
      if (err) {
        *err << "ignoring unknown operation mask 0x"
             << std::hex << unsupported_operations << std::dec;
      }
    }
  } catch (boost::bad_lexical_cast&) {
    // operation name list?
+ bool errors = false; + std::vector<std::string> operation_names; + boost::split(operation_names, value, boost::is_any_of(",")); + for (auto operation_name: operation_names) { + boost::trim(operation_name); + auto operation_it = RBD_IO_OPERATION_MAP.find(operation_name); + if (operation_it != RBD_IO_OPERATION_MAP.end()) { + operations += operation_it->second; + } else if (err) { + if (errors) { + *err << ", "; + } else { + errors = true; + } + *err << "ignoring unknown operation " << operation_name; + } + } + } + return operations; +} + +} // namespace io +} // namespace librbd diff --git a/src/librbd/io/IoOperations.h b/src/librbd/io/IoOperations.h new file mode 100644 index 000000000..93d3ef4fe --- /dev/null +++ b/src/librbd/io/IoOperations.h @@ -0,0 +1,18 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <string> +#include <ostream> + +namespace librbd { +namespace io { + + std::string rbd_io_operations_to_string(uint64_t ops, + std::ostream *err); + uint64_t rbd_io_operations_from_string(const std::string& value, + std::ostream *err); + +} // namespace io +} // namespace librbd diff --git a/src/librbd/io/ObjectDispatch.cc b/src/librbd/io/ObjectDispatch.cc new file mode 100644 index 000000000..a31cc74ea --- /dev/null +++ b/src/librbd/io/ObjectDispatch.cc @@ -0,0 +1,161 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/io/ObjectDispatch.h" +#include "common/dout.h" +#include "librbd/AsioEngine.h" +#include "librbd/ImageCtx.h" +#include "librbd/Utils.h" +#include "librbd/io/ObjectRequest.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::io::ObjectDispatch: " << this \ + << " " << __func__ << ": " + +namespace librbd { +namespace io { + +using librbd::util::data_object_name; + +template <typename I> +ObjectDispatch<I>::ObjectDispatch(I* image_ctx) + : 
m_image_ctx(image_ctx) {
}

// Shut down is asynchronous only for interface symmetry: nothing to tear
// down here, so simply post the completion on the image's asio engine.
template <typename I>
void ObjectDispatch<I>::shut_down(Context* on_finish) {
  auto cct = m_image_ctx->cct;
  ldout(cct, 5) << dendl;

  m_image_ctx->asio_engine->post(on_finish, 0);
}

// Issue an object read. As the terminal layer, always marks the dispatch
// COMPLETE and fires a self-completing ObjectReadRequest; returns true to
// indicate the IO was handled.
template <typename I>
bool ObjectDispatch<I>::read(
    uint64_t object_no, ReadExtents* extents, IOContext io_context,
    int op_flags, int read_flags, const ZTracer::Trace &parent_trace,
    uint64_t* version, int* object_dispatch_flags,
    DispatchResult* dispatch_result, Context** on_finish,
    Context* on_dispatched) {
  auto cct = m_image_ctx->cct;
  ldout(cct, 20) << "object_no=" << object_no << " " << *extents << dendl;

  *dispatch_result = DISPATCH_RESULT_COMPLETE;
  auto req = new ObjectReadRequest<I>(m_image_ctx, object_no, extents,
                                      io_context, op_flags, read_flags,
                                      parent_trace, version, on_dispatched);
  req->send();
  return true;
}

// Issue an object discard; same terminal-layer pattern as read().
template <typename I>
bool ObjectDispatch<I>::discard(
    uint64_t object_no, uint64_t object_off, uint64_t object_len,
    IOContext io_context, int discard_flags,
    const ZTracer::Trace &parent_trace, int* object_dispatch_flags,
    uint64_t* journal_tid, DispatchResult* dispatch_result,
    Context** on_finish, Context* on_dispatched) {
  auto cct = m_image_ctx->cct;
  ldout(cct, 20) << data_object_name(m_image_ctx, object_no) << " "
                 << object_off << "~" << object_len << dendl;

  *dispatch_result = DISPATCH_RESULT_COMPLETE;
  auto req = new ObjectDiscardRequest<I>(m_image_ctx, object_no, object_off,
                                         object_len, io_context, discard_flags,
                                         parent_trace, on_dispatched);
  req->send();
  return true;
}

// Issue an object write, consuming the data bufferlist; optional
// assert_version is forwarded to the request for conditional writes.
template <typename I>
bool ObjectDispatch<I>::write(
    uint64_t object_no, uint64_t object_off, ceph::bufferlist&& data,
    IOContext io_context, int op_flags, int write_flags,
    std::optional<uint64_t> assert_version,
    const ZTracer::Trace &parent_trace, int* object_dispatch_flags,
    uint64_t* journal_tid, DispatchResult* dispatch_result,
    Context** on_finish, Context* on_dispatched) {
  auto
cct = m_image_ctx->cct;
  ldout(cct, 20) << data_object_name(m_image_ctx, object_no) << " "
                 << object_off << "~" << data.length() << dendl;

  // Terminal layer: mark the dispatch complete and hand off to a
  // self-completing object request.
  *dispatch_result = DISPATCH_RESULT_COMPLETE;
  auto req = new ObjectWriteRequest<I>(m_image_ctx, object_no, object_off,
                                       std::move(data), io_context, op_flags,
                                       write_flags, assert_version,
                                       parent_trace, on_dispatched);
  req->send();
  return true;
}

// Issue an object write-same: replicate `data` across object_off~object_len,
// consuming both the data and the buffer-extent mapping.
template <typename I>
bool ObjectDispatch<I>::write_same(
    uint64_t object_no, uint64_t object_off, uint64_t object_len,
    LightweightBufferExtents&& buffer_extents, ceph::bufferlist&& data,
    IOContext io_context, int op_flags,
    const ZTracer::Trace &parent_trace, int* object_dispatch_flags,
    uint64_t* journal_tid, DispatchResult* dispatch_result,
    Context** on_finish, Context* on_dispatched) {
  auto cct = m_image_ctx->cct;
  ldout(cct, 20) << data_object_name(m_image_ctx, object_no) << " "
                 << object_off << "~" << object_len << dendl;

  *dispatch_result = DISPATCH_RESULT_COMPLETE;
  auto req = new ObjectWriteSameRequest<I>(m_image_ctx, object_no,
                                           object_off, object_len,
                                           std::move(data), io_context,
                                           op_flags, parent_trace,
                                           on_dispatched);
  req->send();
  return true;
}

// Issue an object compare-and-write; mismatch_offset is populated by the
// request when the comparison fails.
template <typename I>
bool ObjectDispatch<I>::compare_and_write(
    uint64_t object_no, uint64_t object_off, ceph::bufferlist&& cmp_data,
    ceph::bufferlist&& write_data, IOContext io_context, int op_flags,
    const ZTracer::Trace &parent_trace, uint64_t* mismatch_offset,
    int* object_dispatch_flags, uint64_t* journal_tid,
    DispatchResult* dispatch_result, Context** on_finish,
    Context* on_dispatched) {
  auto cct = m_image_ctx->cct;
  ldout(cct, 20) << data_object_name(m_image_ctx, object_no) << " "
                 << object_off << "~" << write_data.length() << dendl;

  *dispatch_result = DISPATCH_RESULT_COMPLETE;
  auto req = new ObjectCompareAndWriteRequest<I>(m_image_ctx, object_no,
                                                 object_off,
                                                 std::move(cmp_data),
                                                 std::move(write_data),
                                                 io_context, mismatch_offset,
                                                 op_flags,
parent_trace,
                                                 on_dispatched);
  req->send();
  return true;
}

// Issue an object snapshot-diff request; the computed SnapshotDelta is
// written through the caller-supplied pointer on completion.
template <typename I>
bool ObjectDispatch<I>::list_snaps(
    uint64_t object_no, io::Extents&& extents, SnapIds&& snap_ids,
    int list_snap_flags, const ZTracer::Trace &parent_trace,
    SnapshotDelta* snapshot_delta, int* object_dispatch_flags,
    DispatchResult* dispatch_result, Context** on_finish,
    Context* on_dispatched) {
  auto cct = m_image_ctx->cct;
  ldout(cct, 20) << data_object_name(m_image_ctx, object_no) << " "
                 << "extents=" << extents << ", "
                 << "snap_ids=" << snap_ids << dendl;

  *dispatch_result = DISPATCH_RESULT_COMPLETE;
  auto req = ObjectListSnapsRequest<I>::create(
      m_image_ctx, object_no, std::move(extents), std::move(snap_ids),
      list_snap_flags, parent_trace, snapshot_delta, on_dispatched);
  req->send();
  return true;
}

} // namespace io
} // namespace librbd

template class librbd::io::ObjectDispatch<librbd::ImageCtx>;
diff --git a/src/librbd/io/ObjectDispatch.h b/src/librbd/io/ObjectDispatch.h
new file mode 100644
index 000000000..dd1f7261d
--- /dev/null
+++ b/src/librbd/io/ObjectDispatch.h
@@ -0,0 +1,115 @@
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab

#ifndef CEPH_LIBRBD_IO_OBJECT_DISPATCH_H
#define CEPH_LIBRBD_IO_OBJECT_DISPATCH_H

#include "include/int_types.h"
#include "include/buffer.h"
#include "include/rados/librados.hpp"
#include "common/zipkin_trace.h"
#include "librbd/io/Types.h"
#include "librbd/io/ObjectDispatchInterface.h"

struct Context;

namespace librbd {

struct ImageCtx;

namespace io {

struct AioCompletion;

// Core object dispatch layer (OBJECT_DISPATCH_LAYER_CORE): the terminal
// handler that turns dispatched object IOs into backing-store requests.
template <typename ImageCtxT = librbd::ImageCtx>
class ObjectDispatch : public ObjectDispatchInterface {
public:
  ObjectDispatch(ImageCtxT* image_ctx);

  ObjectDispatchLayer get_dispatch_layer() const override {
    return OBJECT_DISPATCH_LAYER_CORE;
  }

  void shut_down(Context* on_finish) override;

  bool read(
      uint64_t object_no, ReadExtents*
extents, IOContext io_context, + int op_flags, int read_flags, const ZTracer::Trace &parent_trace, + uint64_t* version, int* object_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override; + + bool discard( + uint64_t object_no, uint64_t object_off, uint64_t object_len, + IOContext io_context, int discard_flags, + const ZTracer::Trace &parent_trace, int* object_dispatch_flags, + uint64_t* journal_tid, DispatchResult* dispatch_result, + Context** on_finish, Context* on_dispatched) override; + + bool write( + uint64_t object_no, uint64_t object_off, ceph::bufferlist&& data, + IOContext io_context, int op_flags, int write_flags, + std::optional<uint64_t> assert_version, + const ZTracer::Trace &parent_trace, int* object_dispatch_flags, + uint64_t* journal_tid, DispatchResult* dispatch_result, + Context** on_finish, Context* on_dispatched) override; + + bool write_same( + uint64_t object_no, uint64_t object_off, uint64_t object_len, + LightweightBufferExtents&& buffer_extents, ceph::bufferlist&& data, + IOContext io_context, int op_flags, + const ZTracer::Trace &parent_trace, int* object_dispatch_flags, + uint64_t* journal_tid, DispatchResult* dispatch_result, + Context** on_finish, Context* on_dispatched) override; + + bool compare_and_write( + uint64_t object_no, uint64_t object_off, ceph::bufferlist&& cmp_data, + ceph::bufferlist&& write_data, IOContext io_context, int op_flags, + const ZTracer::Trace &parent_trace, uint64_t* mismatch_offset, + int* object_dispatch_flags, uint64_t* journal_tid, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override; + + bool flush( + FlushSource flush_source, const ZTracer::Trace &parent_trace, + uint64_t* journal_tid, DispatchResult* dispatch_result, + Context** on_finish, Context* on_dispatched) override { + return false; + } + + bool list_snaps( + uint64_t object_no, io::Extents&& extents, SnapIds&& snap_ids, + int list_snap_flags, const 
ZTracer::Trace &parent_trace, + SnapshotDelta* snapshot_delta, int* object_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override; + + bool invalidate_cache(Context* on_finish) override { + return false; + } + bool reset_existence_cache(Context* on_finish) override { + return false; + } + + void extent_overwritten( + uint64_t object_no, uint64_t object_off, uint64_t object_len, + uint64_t journal_tid, uint64_t new_journal_tid) override { + } + + int prepare_copyup( + uint64_t object_no, + SnapshotSparseBufferlist* snapshot_sparse_bufferlist) override { + return 0; + } + +private: + ImageCtxT* m_image_ctx; + +}; + +} // namespace io +} // namespace librbd + +extern template class librbd::io::ObjectDispatch<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_IO_OBJECT_DISPATCH_H diff --git a/src/librbd/io/ObjectDispatchInterface.h b/src/librbd/io/ObjectDispatchInterface.h new file mode 100644 index 000000000..2e9dd1300 --- /dev/null +++ b/src/librbd/io/ObjectDispatchInterface.h @@ -0,0 +1,102 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_IO_OBJECT_DISPATCH_INTERFACE_H +#define CEPH_LIBRBD_IO_OBJECT_DISPATCH_INTERFACE_H + +#include "include/int_types.h" +#include "include/buffer.h" +#include "include/rados/librados.hpp" +#include "common/zipkin_trace.h" +#include "librbd/Types.h" +#include "librbd/io/Types.h" + +struct Context; +struct RWLock; + +namespace librbd { +namespace io { + +struct AioCompletion; +struct ObjectDispatchInterface; +struct ObjectDispatchSpec; + +struct ObjectDispatchInterface { + typedef ObjectDispatchInterface Dispatch; + typedef ObjectDispatchLayer DispatchLayer; + typedef ObjectDispatchSpec DispatchSpec; + + virtual ~ObjectDispatchInterface() { + } + + virtual ObjectDispatchLayer get_dispatch_layer() const = 0; + + virtual void shut_down(Context* on_finish) = 0; + + virtual bool read( + uint64_t object_no, ReadExtents* 
extents, IOContext io_context, + int op_flags, int read_flags, const ZTracer::Trace &parent_trace, + uint64_t* version, int* object_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) = 0; + + virtual bool discard( + uint64_t object_no, uint64_t object_off, uint64_t object_len, + IOContext io_context, int discard_flags, + const ZTracer::Trace &parent_trace, int* object_dispatch_flags, + uint64_t* journal_tid, DispatchResult* dispatch_result, + Context**on_finish, Context* on_dispatched) = 0; + + virtual bool write( + uint64_t object_no, uint64_t object_off, ceph::bufferlist&& data, + IOContext io_context, int op_flags, int write_flags, + std::optional<uint64_t> assert_version, + const ZTracer::Trace &parent_trace, int* object_dispatch_flags, + uint64_t* journal_tid, DispatchResult* dispatch_result, + Context**on_finish, Context* on_dispatched) = 0; + + virtual bool write_same( + uint64_t object_no, uint64_t object_off, uint64_t object_len, + LightweightBufferExtents&& buffer_extents, ceph::bufferlist&& data, + IOContext io_context, int op_flags, const ZTracer::Trace &parent_trace, + int* object_dispatch_flags, uint64_t* journal_tid, + DispatchResult* dispatch_result, Context**on_finish, + Context* on_dispatched) = 0; + + virtual bool compare_and_write( + uint64_t object_no, uint64_t object_off, ceph::bufferlist&& cmp_data, + ceph::bufferlist&& write_data, IOContext io_context, int op_flags, + const ZTracer::Trace &parent_trace, uint64_t* mismatch_offset, + int* object_dispatch_flags, uint64_t* journal_tid, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) = 0; + + virtual bool flush( + FlushSource flush_source, const ZTracer::Trace &parent_trace, + uint64_t* journal_tid, DispatchResult* dispatch_result, + Context** on_finish, Context* on_dispatched) = 0; + + virtual bool list_snaps( + uint64_t object_no, Extents&& extents, SnapIds&& snap_ids, + int list_snap_flags, const ZTracer::Trace 
&parent_trace,
      SnapshotDelta* snapshot_delta, int* object_dispatch_flags,
      DispatchResult* dispatch_result, Context** on_finish,
      Context* on_dispatched) = 0;

  virtual bool invalidate_cache(Context* on_finish) = 0;
  virtual bool reset_existence_cache(Context* on_finish) = 0;

  virtual void extent_overwritten(
      uint64_t object_no, uint64_t object_off, uint64_t object_len,
      uint64_t journal_tid, uint64_t new_journal_tid) = 0;

  virtual int prepare_copyup(
      uint64_t object_no,
      SnapshotSparseBufferlist* snapshot_sparse_bufferlist) = 0;

};

} // namespace io
} // namespace librbd

#endif // CEPH_LIBRBD_IO_OBJECT_DISPATCH_INTERFACE_H
diff --git a/src/librbd/io/ObjectDispatchSpec.cc b/src/librbd/io/ObjectDispatchSpec.cc
new file mode 100644
index 000000000..3efff9774
--- /dev/null
+++ b/src/librbd/io/ObjectDispatchSpec.cc
@@ -0,0 +1,47 @@
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab

#include "librbd/io/ObjectDispatchSpec.h"
#include "include/Context.h"
#include "librbd/io/ObjectDispatcherInterface.h"
#include <boost/variant.hpp>

namespace librbd {
namespace io {

// Completion hook fired after a dispatch layer handles the request. On
// error the spec finishes immediately; otherwise the recorded
// dispatch_result decides whether to re-dispatch (CONTINUE, i.e. pass on to
// the next layer) or finish (COMPLETE). INVALID/RESTART must never reach
// this point.
void ObjectDispatchSpec::C_Dispatcher::complete(int r) {
  if (r < 0) {
    finish(r);
    return;
  }

  switch (object_dispatch_spec->dispatch_result) {
  case DISPATCH_RESULT_CONTINUE:
    object_dispatch_spec->send();
    break;
  case DISPATCH_RESULT_COMPLETE:
    finish(r);
    break;
  case DISPATCH_RESULT_INVALID:
  case DISPATCH_RESULT_RESTART:
    ceph_abort();
    break;
  }
}

// Final completion: invokes the user callback and then destroys the owning
// spec -- the spec must not be touched after finish() returns.
void ObjectDispatchSpec::C_Dispatcher::finish(int r) {
  on_finish->complete(r);
  delete object_dispatch_spec;
}

// (Re-)submit this spec to the dispatcher pipeline.
void ObjectDispatchSpec::send() {
  object_dispatcher->send(this);
}

// Abort the spec with an error; completes (and thereby deletes) it.
void ObjectDispatchSpec::fail(int r) {
  ceph_assert(r < 0);
  dispatcher_ctx.complete(r);
}

} // namespace io
} // namespace librbd
diff --git a/src/librbd/io/ObjectDispatchSpec.h b/src/librbd/io/ObjectDispatchSpec.h
new file mode 100644
index
000000000..a0d4b49a4 --- /dev/null +++ b/src/librbd/io/ObjectDispatchSpec.h @@ -0,0 +1,295 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_IO_OBJECT_DISPATCH_SPEC_H +#define CEPH_LIBRBD_IO_OBJECT_DISPATCH_SPEC_H + +#include "include/int_types.h" +#include "include/buffer.h" +#include "include/Context.h" +#include "include/rados/librados.hpp" +#include "common/zipkin_trace.h" +#include "librbd/Types.h" +#include "librbd/io/Types.h" +#include <boost/variant/variant.hpp> + +namespace librbd { +namespace io { + +struct ObjectDispatcherInterface; + +struct ObjectDispatchSpec { +private: + // helper to avoid extra heap allocation per object IO + struct C_Dispatcher : public Context { + ObjectDispatchSpec* object_dispatch_spec; + Context* on_finish; + + C_Dispatcher(ObjectDispatchSpec* object_dispatch_spec, Context* on_finish) + : object_dispatch_spec(object_dispatch_spec), on_finish(on_finish) { + } + + void complete(int r) override; + void finish(int r) override; + }; + +public: + struct RequestBase { + uint64_t object_no; + + RequestBase(uint64_t object_no) + : object_no(object_no) { + } + }; + + struct ReadRequest : public RequestBase { + ReadExtents* extents; + int read_flags; + uint64_t* version; + + ReadRequest(uint64_t object_no, ReadExtents* extents, int read_flags, + uint64_t* version) + : RequestBase(object_no), extents(extents), read_flags(read_flags), + version(version) { + } + }; + + struct WriteRequestBase : public RequestBase { + uint64_t object_off; + uint64_t journal_tid; + + WriteRequestBase(uint64_t object_no, uint64_t object_off, + uint64_t journal_tid) + : RequestBase(object_no), object_off(object_off), + journal_tid(journal_tid) { + } + }; + + struct DiscardRequest : public WriteRequestBase { + uint64_t object_len; + int discard_flags; + + DiscardRequest(uint64_t object_no, uint64_t object_off, uint64_t object_len, + int discard_flags, uint64_t journal_tid) + : 
WriteRequestBase(object_no, object_off, journal_tid), + object_len(object_len), discard_flags(discard_flags) { + } + }; + + struct WriteRequest : public WriteRequestBase { + ceph::bufferlist data; + int write_flags; + std::optional<uint64_t> assert_version; + + WriteRequest(uint64_t object_no, uint64_t object_off, + ceph::bufferlist&& data, int write_flags, + std::optional<uint64_t> assert_version, uint64_t journal_tid) + : WriteRequestBase(object_no, object_off, journal_tid), + data(std::move(data)), write_flags(write_flags), + assert_version(assert_version) { + } + }; + + struct WriteSameRequest : public WriteRequestBase { + uint64_t object_len; + LightweightBufferExtents buffer_extents; + ceph::bufferlist data; + + WriteSameRequest(uint64_t object_no, uint64_t object_off, + uint64_t object_len, + LightweightBufferExtents&& buffer_extents, + ceph::bufferlist&& data, uint64_t journal_tid) + : WriteRequestBase(object_no, object_off, journal_tid), + object_len(object_len), buffer_extents(std::move(buffer_extents)), + data(std::move(data)) { + } + }; + + struct CompareAndWriteRequest : public WriteRequestBase { + ceph::bufferlist cmp_data; + ceph::bufferlist data; + uint64_t* mismatch_offset; + + CompareAndWriteRequest(uint64_t object_no, uint64_t object_off, + ceph::bufferlist&& cmp_data, ceph::bufferlist&& data, + uint64_t* mismatch_offset, + uint64_t journal_tid) + : WriteRequestBase(object_no, object_off, journal_tid), + cmp_data(std::move(cmp_data)), data(std::move(data)), + mismatch_offset(mismatch_offset) { + } + }; + + struct FlushRequest { + FlushSource flush_source; + uint64_t journal_tid; + + FlushRequest(FlushSource flush_source, uint64_t journal_tid) + : flush_source(flush_source), journal_tid(journal_tid) { + } + }; + + struct ListSnapsRequest : public RequestBase { + Extents extents; + SnapIds snap_ids; + int list_snaps_flags; + SnapshotDelta* snapshot_delta; + + ListSnapsRequest(uint64_t object_no, Extents&& extents, + SnapIds&& snap_ids, int 
list_snaps_flags, + SnapshotDelta* snapshot_delta) + : RequestBase(object_no), extents(std::move(extents)), + snap_ids(std::move(snap_ids)),list_snaps_flags(list_snaps_flags), + snapshot_delta(snapshot_delta) { + } + }; + + typedef boost::variant<ReadRequest, + DiscardRequest, + WriteRequest, + WriteSameRequest, + CompareAndWriteRequest, + FlushRequest, + ListSnapsRequest> Request; + + C_Dispatcher dispatcher_ctx; + + ObjectDispatcherInterface* object_dispatcher; + ObjectDispatchLayer dispatch_layer; + int object_dispatch_flags = 0; + DispatchResult dispatch_result = DISPATCH_RESULT_INVALID; + + Request request; + IOContext io_context; + int op_flags; + ZTracer::Trace parent_trace; + + template <typename ImageCtxT> + static ObjectDispatchSpec* create_read( + ImageCtxT* image_ctx, ObjectDispatchLayer object_dispatch_layer, + uint64_t object_no, ReadExtents* extents, IOContext io_context, + int op_flags, int read_flags, const ZTracer::Trace &parent_trace, + uint64_t* version, Context* on_finish) { + return new ObjectDispatchSpec(image_ctx->io_object_dispatcher, + object_dispatch_layer, + ReadRequest{object_no, extents, + read_flags, version}, + io_context, op_flags, parent_trace, + on_finish); + } + + template <typename ImageCtxT> + static ObjectDispatchSpec* create_discard( + ImageCtxT* image_ctx, ObjectDispatchLayer object_dispatch_layer, + uint64_t object_no, uint64_t object_off, uint64_t object_len, + IOContext io_context, int discard_flags, uint64_t journal_tid, + const ZTracer::Trace &parent_trace, Context *on_finish) { + return new ObjectDispatchSpec(image_ctx->io_object_dispatcher, + object_dispatch_layer, + DiscardRequest{object_no, object_off, + object_len, discard_flags, + journal_tid}, + io_context, 0, parent_trace, on_finish); + } + + template <typename ImageCtxT> + static ObjectDispatchSpec* create_write( + ImageCtxT* image_ctx, ObjectDispatchLayer object_dispatch_layer, + uint64_t object_no, uint64_t object_off, ceph::bufferlist&& data, + IOContext 
io_context, int op_flags, int write_flags, + std::optional<uint64_t> assert_version, uint64_t journal_tid, + const ZTracer::Trace &parent_trace, Context *on_finish) { + return new ObjectDispatchSpec(image_ctx->io_object_dispatcher, + object_dispatch_layer, + WriteRequest{object_no, object_off, + std::move(data), write_flags, + assert_version, journal_tid}, + io_context, op_flags, parent_trace, + on_finish); + } + + template <typename ImageCtxT> + static ObjectDispatchSpec* create_write_same( + ImageCtxT* image_ctx, ObjectDispatchLayer object_dispatch_layer, + uint64_t object_no, uint64_t object_off, uint64_t object_len, + LightweightBufferExtents&& buffer_extents, ceph::bufferlist&& data, + IOContext io_context, int op_flags, uint64_t journal_tid, + const ZTracer::Trace &parent_trace, Context *on_finish) { + return new ObjectDispatchSpec(image_ctx->io_object_dispatcher, + object_dispatch_layer, + WriteSameRequest{object_no, object_off, + object_len, + std::move(buffer_extents), + std::move(data), + journal_tid}, + io_context, op_flags, parent_trace, + on_finish); + } + + template <typename ImageCtxT> + static ObjectDispatchSpec* create_compare_and_write( + ImageCtxT* image_ctx, ObjectDispatchLayer object_dispatch_layer, + uint64_t object_no, uint64_t object_off, ceph::bufferlist&& cmp_data, + ceph::bufferlist&& write_data, IOContext io_context, + uint64_t *mismatch_offset, int op_flags, uint64_t journal_tid, + const ZTracer::Trace &parent_trace, Context *on_finish) { + return new ObjectDispatchSpec(image_ctx->io_object_dispatcher, + object_dispatch_layer, + CompareAndWriteRequest{object_no, + object_off, + std::move(cmp_data), + std::move(write_data), + mismatch_offset, + journal_tid}, + io_context, op_flags, parent_trace, + on_finish); + } + + template <typename ImageCtxT> + static ObjectDispatchSpec* create_flush( + ImageCtxT* image_ctx, ObjectDispatchLayer object_dispatch_layer, + FlushSource flush_source, uint64_t journal_tid, + const ZTracer::Trace 
&parent_trace, Context *on_finish) { + return new ObjectDispatchSpec(image_ctx->io_object_dispatcher, + object_dispatch_layer, + FlushRequest{flush_source, journal_tid}, + {}, 0, parent_trace, on_finish); + } + + template <typename ImageCtxT> + static ObjectDispatchSpec* create_list_snaps( + ImageCtxT* image_ctx, ObjectDispatchLayer object_dispatch_layer, + uint64_t object_no, Extents&& extents, SnapIds&& snap_ids, + int list_snaps_flags, const ZTracer::Trace &parent_trace, + SnapshotDelta* snapshot_delta, Context* on_finish) { + return new ObjectDispatchSpec(image_ctx->io_object_dispatcher, + object_dispatch_layer, + ListSnapsRequest{object_no, + std::move(extents), + std::move(snap_ids), + list_snaps_flags, + snapshot_delta}, + {}, 0, parent_trace, on_finish); + } + + void send(); + void fail(int r); + +private: + template <typename> friend class ObjectDispatcher; + + ObjectDispatchSpec(ObjectDispatcherInterface* object_dispatcher, + ObjectDispatchLayer object_dispatch_layer, + Request&& request, IOContext io_context, int op_flags, + const ZTracer::Trace& parent_trace, Context* on_finish) + : dispatcher_ctx(this, on_finish), object_dispatcher(object_dispatcher), + dispatch_layer(object_dispatch_layer), request(std::move(request)), + io_context(io_context), op_flags(op_flags), parent_trace(parent_trace) { + } + +}; + +} // namespace io +} // namespace librbd + +#endif // CEPH_LIBRBD_IO_OBJECT_DISPATCH_SPEC_H diff --git a/src/librbd/io/ObjectDispatcher.cc b/src/librbd/io/ObjectDispatcher.cc new file mode 100644 index 000000000..b66c6bb18 --- /dev/null +++ b/src/librbd/io/ObjectDispatcher.cc @@ -0,0 +1,208 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/io/ObjectDispatcher.h" +#include "include/Context.h" +#include "common/AsyncOpTracker.h" +#include "common/dout.h" +#include "librbd/ImageCtx.h" +#include "librbd/Utils.h" +#include "librbd/asio/ContextWQ.h" +#include 
"librbd/io/ObjectDispatch.h" +#include "librbd/io/ObjectDispatchSpec.h" +#include <boost/variant.hpp> + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::io::ObjectDispatcher: " << this \ + << " " << __func__ << ": " + +namespace librbd { +namespace io { + +template <typename I> +struct ObjectDispatcher<I>::C_ResetExistenceCache : public C_LayerIterator { + C_ResetExistenceCache(ObjectDispatcher* object_dispatcher, Context* on_finish) + : C_LayerIterator(object_dispatcher, OBJECT_DISPATCH_LAYER_NONE, on_finish) { + } + + bool execute(ObjectDispatchInterface* object_dispatch, + Context* on_finish) override { + return object_dispatch->reset_existence_cache(on_finish); + } +}; + +template <typename I> +struct ObjectDispatcher<I>::SendVisitor : public boost::static_visitor<bool> { + ObjectDispatchInterface* object_dispatch; + ObjectDispatchSpec* object_dispatch_spec; + + SendVisitor(ObjectDispatchInterface* object_dispatch, + ObjectDispatchSpec* object_dispatch_spec) + : object_dispatch(object_dispatch), + object_dispatch_spec(object_dispatch_spec) { + } + + bool operator()(ObjectDispatchSpec::ReadRequest& read) const { + return object_dispatch->read( + read.object_no, read.extents, object_dispatch_spec->io_context, + object_dispatch_spec->op_flags, read.read_flags, + object_dispatch_spec->parent_trace, read.version, + &object_dispatch_spec->object_dispatch_flags, + &object_dispatch_spec->dispatch_result, + &object_dispatch_spec->dispatcher_ctx.on_finish, + &object_dispatch_spec->dispatcher_ctx); + } + + bool operator()(ObjectDispatchSpec::DiscardRequest& discard) const { + return object_dispatch->discard( + discard.object_no, discard.object_off, discard.object_len, + object_dispatch_spec->io_context, discard.discard_flags, + object_dispatch_spec->parent_trace, + &object_dispatch_spec->object_dispatch_flags, &discard.journal_tid, + &object_dispatch_spec->dispatch_result, + &object_dispatch_spec->dispatcher_ctx.on_finish, + 
&object_dispatch_spec->dispatcher_ctx); + } + + bool operator()(ObjectDispatchSpec::WriteRequest& write) const { + return object_dispatch->write( + write.object_no, write.object_off, std::move(write.data), + object_dispatch_spec->io_context, object_dispatch_spec->op_flags, + write.write_flags, write.assert_version, + object_dispatch_spec->parent_trace, + &object_dispatch_spec->object_dispatch_flags, &write.journal_tid, + &object_dispatch_spec->dispatch_result, + &object_dispatch_spec->dispatcher_ctx.on_finish, + &object_dispatch_spec->dispatcher_ctx); + } + + bool operator()(ObjectDispatchSpec::WriteSameRequest& write_same) const { + return object_dispatch->write_same( + write_same.object_no, write_same.object_off, write_same.object_len, + std::move(write_same.buffer_extents), std::move(write_same.data), + object_dispatch_spec->io_context, object_dispatch_spec->op_flags, + object_dispatch_spec->parent_trace, + &object_dispatch_spec->object_dispatch_flags, &write_same.journal_tid, + &object_dispatch_spec->dispatch_result, + &object_dispatch_spec->dispatcher_ctx.on_finish, + &object_dispatch_spec->dispatcher_ctx); + } + + bool operator()( + ObjectDispatchSpec::CompareAndWriteRequest& compare_and_write) const { + return object_dispatch->compare_and_write( + compare_and_write.object_no, compare_and_write.object_off, + std::move(compare_and_write.cmp_data), std::move(compare_and_write.data), + object_dispatch_spec->io_context, object_dispatch_spec->op_flags, + object_dispatch_spec->parent_trace, compare_and_write.mismatch_offset, + &object_dispatch_spec->object_dispatch_flags, + &compare_and_write.journal_tid, + &object_dispatch_spec->dispatch_result, + &object_dispatch_spec->dispatcher_ctx.on_finish, + &object_dispatch_spec->dispatcher_ctx); + } + + bool operator()(ObjectDispatchSpec::FlushRequest& flush) const { + return object_dispatch->flush( + flush.flush_source, object_dispatch_spec->parent_trace, + &flush.journal_tid, + &object_dispatch_spec->dispatch_result, + 
&object_dispatch_spec->dispatcher_ctx.on_finish, + &object_dispatch_spec->dispatcher_ctx); + } + + bool operator()(ObjectDispatchSpec::ListSnapsRequest& list_snaps) const { + return object_dispatch->list_snaps( + list_snaps.object_no, std::move(list_snaps.extents), + std::move(list_snaps.snap_ids), list_snaps.list_snaps_flags, + object_dispatch_spec->parent_trace, list_snaps.snapshot_delta, + &object_dispatch_spec->object_dispatch_flags, + &object_dispatch_spec->dispatch_result, + &object_dispatch_spec->dispatcher_ctx.on_finish, + &object_dispatch_spec->dispatcher_ctx); + } +}; + +template <typename I> +ObjectDispatcher<I>::ObjectDispatcher(I* image_ctx) + : Dispatcher<I, ObjectDispatcherInterface>(image_ctx) { + // configure the core object dispatch handler on startup + auto object_dispatch = new ObjectDispatch(image_ctx); + this->register_dispatch(object_dispatch); +} + +template <typename I> +void ObjectDispatcher<I>::invalidate_cache(Context* on_finish) { + auto image_ctx = this->m_image_ctx; + auto cct = image_ctx->cct; + ldout(cct, 5) << dendl; + + on_finish = util::create_async_context_callback(*image_ctx, on_finish); + auto ctx = new C_InvalidateCache( + this, OBJECT_DISPATCH_LAYER_NONE, on_finish); + ctx->complete(0); +} + +template <typename I> +void ObjectDispatcher<I>::reset_existence_cache(Context* on_finish) { + auto image_ctx = this->m_image_ctx; + auto cct = image_ctx->cct; + ldout(cct, 5) << dendl; + + on_finish = util::create_async_context_callback(*image_ctx, on_finish); + auto ctx = new C_ResetExistenceCache(this, on_finish); + ctx->complete(0); +} + +template <typename I> +void ObjectDispatcher<I>::extent_overwritten( + uint64_t object_no, uint64_t object_off, uint64_t object_len, + uint64_t journal_tid, uint64_t new_journal_tid) { + auto cct = this->m_image_ctx->cct; + ldout(cct, 20) << object_no << " " << object_off << "~" << object_len + << dendl; + + std::shared_lock locker{this->m_lock}; + for (auto it : this->m_dispatches) { + auto& 
object_dispatch_meta = it.second; + auto object_dispatch = object_dispatch_meta.dispatch; + object_dispatch->extent_overwritten(object_no, object_off, object_len, + journal_tid, new_journal_tid); + } +} + +template <typename I> +int ObjectDispatcher<I>::prepare_copyup( + uint64_t object_no, + SnapshotSparseBufferlist* snapshot_sparse_bufferlist) { + auto cct = this->m_image_ctx->cct; + ldout(cct, 20) << "object_no=" << object_no << dendl; + + std::shared_lock locker{this->m_lock}; + for (auto it : this->m_dispatches) { + auto& object_dispatch_meta = it.second; + auto object_dispatch = object_dispatch_meta.dispatch; + auto r = object_dispatch->prepare_copyup( + object_no, snapshot_sparse_bufferlist); + if (r < 0) { + return r; + } + } + + return 0; +} + +template <typename I> +bool ObjectDispatcher<I>::send_dispatch( + ObjectDispatchInterface* object_dispatch, + ObjectDispatchSpec* object_dispatch_spec) { + return boost::apply_visitor( + SendVisitor{object_dispatch, object_dispatch_spec}, + object_dispatch_spec->request); +} + +} // namespace io +} // namespace librbd + +template class librbd::io::ObjectDispatcher<librbd::ImageCtx>; diff --git a/src/librbd/io/ObjectDispatcher.h b/src/librbd/io/ObjectDispatcher.h new file mode 100644 index 000000000..1e5e78d8b --- /dev/null +++ b/src/librbd/io/ObjectDispatcher.h @@ -0,0 +1,60 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_IO_OBJECT_DISPATCHER_H +#define CEPH_LIBRBD_IO_OBJECT_DISPATCHER_H + +#include "include/int_types.h" +#include "common/ceph_mutex.h" +#include "librbd/io/Dispatcher.h" +#include "librbd/io/ObjectDispatchInterface.h" +#include "librbd/io/ObjectDispatchSpec.h" +#include "librbd/io/ObjectDispatcherInterface.h" +#include "librbd/io/Types.h" +#include <map> + +struct Context; + +namespace librbd { + +struct ImageCtx; + +namespace io { + +template <typename ImageCtxT = ImageCtx> +class ObjectDispatcher + : public 
Dispatcher<ImageCtxT, ObjectDispatcherInterface> { +public: + ObjectDispatcher(ImageCtxT* image_ctx); + + void invalidate_cache(Context* on_finish) override; + void reset_existence_cache(Context* on_finish) override; + + void extent_overwritten( + uint64_t object_no, uint64_t object_off, uint64_t object_len, + uint64_t journal_tid, uint64_t new_journal_tid) override; + + int prepare_copyup( + uint64_t object_no, + SnapshotSparseBufferlist* snapshot_sparse_bufferlist) override; + + using typename Dispatcher<ImageCtxT, ObjectDispatcherInterface>::C_LayerIterator; + + using typename Dispatcher<ImageCtxT, ObjectDispatcherInterface>::C_InvalidateCache; + +protected: + bool send_dispatch(ObjectDispatchInterface* object_dispatch, + ObjectDispatchSpec* object_dispatch_spec) override; + +private: + struct C_ResetExistenceCache; + struct SendVisitor; + +}; + +} // namespace io +} // namespace librbd + +extern template class librbd::io::ObjectDispatcher<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_IO_OBJECT_DISPATCHER_H diff --git a/src/librbd/io/ObjectDispatcherInterface.h b/src/librbd/io/ObjectDispatcherInterface.h new file mode 100644 index 000000000..0f3d33330 --- /dev/null +++ b/src/librbd/io/ObjectDispatcherInterface.h @@ -0,0 +1,35 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_IO_OBJECT_DISPATCHER_INTERFACE_H +#define CEPH_LIBRBD_IO_OBJECT_DISPATCHER_INTERFACE_H + +#include "include/int_types.h" +#include "librbd/io/DispatcherInterface.h" +#include "librbd/io/ObjectDispatchInterface.h" + +struct Context; + +namespace librbd { +namespace io { + +struct ObjectDispatcherInterface + : public DispatcherInterface<ObjectDispatchInterface> { +public: + virtual void invalidate_cache(Context* on_finish) = 0; + virtual void reset_existence_cache(Context* on_finish) = 0; + + virtual void extent_overwritten( + uint64_t object_no, uint64_t object_off, uint64_t object_len, + uint64_t journal_tid, uint64_t 
new_journal_tid) = 0; + + virtual int prepare_copyup( + uint64_t object_no, + SnapshotSparseBufferlist* snapshot_sparse_bufferlist) = 0; + +}; + +} // namespace io +} // namespace librbd + +#endif // CEPH_LIBRBD_IO_OBJECT_DISPATCHER_INTERFACE_H diff --git a/src/librbd/io/ObjectRequest.cc b/src/librbd/io/ObjectRequest.cc new file mode 100644 index 000000000..6d246cdf3 --- /dev/null +++ b/src/librbd/io/ObjectRequest.cc @@ -0,0 +1,1073 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/io/ObjectRequest.h" +#include "common/ceph_context.h" +#include "common/dout.h" +#include "common/errno.h" +#include "common/ceph_mutex.h" +#include "include/Context.h" +#include "include/err.h" +#include "include/neorados/RADOS.hpp" +#include "osd/osd_types.h" +#include "librados/snap_set_diff.h" +#include "librbd/AsioEngine.h" +#include "librbd/ExclusiveLock.h" +#include "librbd/ImageCtx.h" +#include "librbd/ObjectMap.h" +#include "librbd/Utils.h" +#include "librbd/asio/Utils.h" +#include "librbd/io/AioCompletion.h" +#include "librbd/io/CopyupRequest.h" +#include "librbd/io/ImageRequest.h" +#include "librbd/io/Utils.h" + +#include <boost/optional.hpp> + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::io::ObjectRequest: " << this \ + << " " << __func__ << ": " \ + << data_object_name(this->m_ictx, \ + this->m_object_no) << " " + +namespace librbd { +namespace io { + +using librbd::util::data_object_name; +using librbd::util::create_context_callback; +using librbd::util::create_trace; + +namespace { + +template <typename I> +inline bool is_copy_on_read(I *ictx, const IOContext& io_context) { + std::shared_lock image_locker{ictx->image_lock}; + return (ictx->clone_copy_on_read && !ictx->read_only && + io_context->read_snap().value_or(CEPH_NOSNAP) == CEPH_NOSNAP && + (ictx->exclusive_lock == nullptr || + ictx->exclusive_lock->is_lock_owner())); +} + +template <typename 
+S, typename D>
+// Field-by-field copy between two structurally-identical snap-set
+// representations; handle_list_snaps() uses it to convert the neorados
+// result into the legacy librados::snap_set_t expected by
+// calc_snap_set_diff().
+void convert_snap_set(const S& src_snap_set,
+                      D* dst_snap_set) {
+  dst_snap_set->seq = src_snap_set.seq;
+  dst_snap_set->clones.reserve(src_snap_set.clones.size());
+  for (auto& src_clone : src_snap_set.clones) {
+    dst_snap_set->clones.emplace_back();
+    auto& dst_clone = dst_snap_set->clones.back();
+    dst_clone.cloneid = src_clone.cloneid;
+    dst_clone.snaps = src_clone.snaps;
+    dst_clone.overlap = src_clone.overlap;
+    dst_clone.size = src_clone.size;
+  }
+}
+
+} // anonymous namespace
+
+// Factory: construct a write request behind the generic ObjectRequest
+// interface.  Ownership of 'data' is transferred; 'completion' fires once
+// the object (and any required copyup / object-map updates) are done.
+template <typename I>
+ObjectRequest<I>*
+ObjectRequest<I>::create_write(
+    I *ictx, uint64_t object_no, uint64_t object_off, ceph::bufferlist&& data,
+    IOContext io_context, int op_flags, int write_flags,
+    std::optional<uint64_t> assert_version,
+    const ZTracer::Trace &parent_trace, Context *completion) {
+  return new ObjectWriteRequest<I>(ictx, object_no, object_off,
+                                   std::move(data), io_context, op_flags,
+                                   write_flags, assert_version,
+                                   parent_trace, completion);
+}
+
+// Factory: discard (remove / truncate / zero) a range of one object.
+template <typename I>
+ObjectRequest<I>*
+ObjectRequest<I>::create_discard(
+    I *ictx, uint64_t object_no, uint64_t object_off, uint64_t object_len,
+    IOContext io_context, int discard_flags,
+    const ZTracer::Trace &parent_trace, Context *completion) {
+  return new ObjectDiscardRequest<I>(ictx, object_no, object_off,
+                                     object_len, io_context, discard_flags,
+                                     parent_trace, completion);
+}
+
+// Factory: replicate 'data' across the range [object_off, object_off+len).
+template <typename I>
+ObjectRequest<I>*
+ObjectRequest<I>::create_write_same(
+    I *ictx, uint64_t object_no, uint64_t object_off, uint64_t object_len,
+    ceph::bufferlist&& data, IOContext io_context, int op_flags,
+    const ZTracer::Trace &parent_trace, Context *completion) {
+  return new ObjectWriteSameRequest<I>(ictx, object_no, object_off,
+                                       object_len, std::move(data), io_context,
+                                       op_flags, parent_trace, completion);
+}
+
+// Factory: atomic compare-and-write; on payload mismatch the mismatch
+// offset is reported through 'mismatch_offset'.
+template <typename I>
+ObjectRequest<I>*
+ObjectRequest<I>::create_compare_and_write(
+    I *ictx, uint64_t object_no, uint64_t object_off,
+    ceph::bufferlist&& cmp_data, 
ceph::bufferlist&& write_data, + IOContext io_context, uint64_t *mismatch_offset, int op_flags, + const ZTracer::Trace &parent_trace, Context *completion) { + return new ObjectCompareAndWriteRequest<I>(ictx, object_no, object_off, + std::move(cmp_data), + std::move(write_data), io_context, + mismatch_offset, op_flags, + parent_trace, completion); +} + +template <typename I> +ObjectRequest<I>::ObjectRequest( + I *ictx, uint64_t objectno, IOContext io_context, + const char *trace_name, const ZTracer::Trace &trace, Context *completion) + : m_ictx(ictx), m_object_no(objectno), m_io_context(io_context), + m_completion(completion), + m_trace(create_trace(*ictx, "", trace)) { + ceph_assert(m_ictx->data_ctx.is_valid()); + if (m_trace.valid()) { + m_trace.copy_name(trace_name + std::string(" ") + + data_object_name(ictx, objectno)); + m_trace.event("start"); + } +} + +template <typename I> +void ObjectRequest<I>::add_write_hint(I& image_ctx, neorados::WriteOp* wr) { + auto alloc_hint_flags = static_cast<neorados::alloc_hint::alloc_hint_t>( + image_ctx.alloc_hint_flags); + if (image_ctx.enable_alloc_hint) { + wr->set_alloc_hint(image_ctx.get_object_size(), + image_ctx.get_object_size(), + alloc_hint_flags); + } else if (image_ctx.alloc_hint_flags != 0U) { + wr->set_alloc_hint(0, 0, alloc_hint_flags); + } +} + +template <typename I> +bool ObjectRequest<I>::compute_parent_extents(Extents *parent_extents, + ImageArea *area, + bool read_request) { + ceph_assert(ceph_mutex_is_locked(m_ictx->image_lock)); + + m_has_parent = false; + parent_extents->clear(); + *area = ImageArea::DATA; + + uint64_t raw_overlap; + int r = m_ictx->get_parent_overlap( + m_io_context->read_snap().value_or(CEPH_NOSNAP), &raw_overlap); + if (r < 0) { + // NOTE: it's possible for a snapshot to be deleted while we are + // still reading from it + lderr(m_ictx->cct) << "failed to retrieve parent overlap: " + << cpp_strerror(r) << dendl; + return false; + } + bool migration_write = !read_request && 
+      !m_ictx->migration_info.empty();
+  if (migration_write) {
+    // writes issued while a migration is in flight must honour the
+    // migration overlap rather than the snapshot's parent overlap
+    raw_overlap = m_ictx->migration_info.overlap;
+  }
+  if (raw_overlap == 0) {
+    // no parent overlap -- nothing can be read/copied from the parent
+    return false;
+  }
+
+  std::tie(*parent_extents, *area) = io::util::object_to_area_extents(
+    m_ictx, m_object_no, {{0, m_ictx->layout.object_size}});
+  uint64_t object_overlap = m_ictx->prune_parent_extents(
+    *parent_extents, *area, raw_overlap, migration_write);
+  if (object_overlap > 0) {
+    m_has_parent = true;
+    return true;
+  }
+  return false;
+}
+
+// Complete the request from the ASIO engine's context rather than the
+// caller's stack (avoids re-entrancy while locks may still be held).
+template <typename I>
+void ObjectRequest<I>::async_finish(int r) {
+  ldout(m_ictx->cct, 20) << "r=" << r << dendl;
+  m_ictx->asio_engine->post([this, r]() { finish(r); });
+}
+
+// Fire the user completion and destroy this request.  No member may be
+// touched after m_completion->complete() -- 'this' is deleted below.
+template <typename I>
+void ObjectRequest<I>::finish(int r) {
+  ldout(m_ictx->cct, 20) << "r=" << r << dendl;
+  m_completion->complete(r);
+  delete this;
+}
+
+/** read **/
+
+template <typename I>
+ObjectReadRequest<I>::ObjectReadRequest(
+    I *ictx, uint64_t objectno, ReadExtents* extents,
+    IOContext io_context, int op_flags, int read_flags,
+    const ZTracer::Trace &parent_trace, uint64_t* version,
+    Context *completion)
+  : ObjectRequest<I>(ictx, objectno, io_context, "read", parent_trace,
+                     completion),
+    m_extents(extents), m_op_flags(op_flags),m_read_flags(read_flags),
+    m_version(version) {
+}
+
+template <typename I>
+void ObjectReadRequest<I>::send() {
+  I *image_ctx = this->m_ictx;
+  ldout(image_ctx->cct, 20) << dendl;
+
+  read_object();
+}
+
+// Issue the RADOS read against the child object; falls back to
+// read_parent() when the object map proves the object cannot exist.
+template <typename I>
+void ObjectReadRequest<I>::read_object() {
+  I *image_ctx = this->m_ictx;
+
+  std::shared_lock image_locker{image_ctx->image_lock};
+  auto read_snap_id = this->m_io_context->read_snap().value_or(CEPH_NOSNAP);
+  if (read_snap_id == image_ctx->snap_id &&
+      image_ctx->object_map != nullptr &&
+      !image_ctx->object_map->object_may_exist(this->m_object_no)) {
+    // post instead of calling directly so read_parent() doesn't run with
+    // image_lock held
+    image_ctx->asio_engine->post([this]() { read_parent(); });
+    return;
+  }
+  image_locker.unlock();
+
+  ldout(image_ctx->cct, 20) << "snap_id=" << read_snap_id << dendl;
+
+  neorados::ReadOp read_op;
+  for (auto& extent: *this->m_extents) {
+    if (extent.length >= image_ctx->sparse_read_threshold_bytes) {
+      // large extents: sparse-read so zeroed holes aren't transferred
+      read_op.sparse_read(extent.offset, extent.length, &extent.bl,
+                          &extent.extent_map);
+    } else {
+      read_op.read(extent.offset, extent.length, &extent.bl);
+    }
+  }
+  util::apply_op_flags(
+    m_op_flags, image_ctx->get_read_flags(read_snap_id), &read_op);
+
+  image_ctx->rados_api.execute(
+    {data_object_name(this->m_ictx, this->m_object_no)},
+    *this->m_io_context, std::move(read_op), nullptr,
+    librbd::asio::util::get_callback_adapter(
+      [this](int r) { handle_read_object(r); }), m_version,
+    (this->m_trace.valid() ? this->m_trace.get_info() : nullptr));
+}
+
+template <typename I>
+void ObjectReadRequest<I>::handle_read_object(int r) {
+  I *image_ctx = this->m_ictx;
+  ldout(image_ctx->cct, 20) << "r=" << r << dendl;
+  if (m_version != nullptr) {
+    ldout(image_ctx->cct, 20) << "version=" << *m_version << dendl;
+  }
+
+  if (r == -ENOENT) {
+    // object doesn't exist in the child image -- try the parent
+    read_parent();
+    return;
+  } else if (r < 0) {
+    lderr(image_ctx->cct) << "failed to read from object: "
+                          << cpp_strerror(r) << dendl;
+    this->finish(r);
+    return;
+  }
+
+  this->finish(0);
+}
+
+// Satisfy the read from the parent image (unless the caller disabled
+// parent reads via READ_FLAG_DISABLE_READ_FROM_PARENT).
+template <typename I>
+void ObjectReadRequest<I>::read_parent() {
+  if ((m_read_flags & READ_FLAG_DISABLE_READ_FROM_PARENT) != 0) {
+    this->finish(-ENOENT);
+    return;
+  }
+
+  I *image_ctx = this->m_ictx;
+  ldout(image_ctx->cct, 20) << dendl;
+
+  auto ctx = create_context_callback<
+    ObjectReadRequest<I>, &ObjectReadRequest<I>::handle_read_parent>(this);
+
+  io::util::read_parent<I>(
+    image_ctx, this->m_object_no, this->m_extents,
+    this->m_io_context->read_snap().value_or(CEPH_NOSNAP), this->m_trace,
+    ctx);
+}
+
+template <typename I>
+void ObjectReadRequest<I>::handle_read_parent(int r) {
+  I *image_ctx = this->m_ictx;
+  ldout(image_ctx->cct, 20) << "r=" << r << dendl;
+
+  if (r == -ENOENT) {
+    this->finish(r);
+    return;
+  } else if (r < 0) {
+    lderr(image_ctx->cct) << "failed to read parent extents: "
+                          << cpp_strerror(r) << dendl;
+    this->finish(r);
+    return;
+  }
+
+  copyup();
+}
+
+// Copy-on-read: after a successful parent read, optionally kick off a
+// CopyupRequest so future reads hit the child object directly.  This
+// request always completes with 0 here -- the copyup is fire-and-forget.
+template <typename I>
+void ObjectReadRequest<I>::copyup() {
+  I *image_ctx = this->m_ictx;
+  if (!is_copy_on_read(image_ctx, this->m_io_context)) {
+    this->finish(0);
+    return;
+  }
+
+  // lock order: owner_lock, then image_lock, then copyup_list_lock
+  image_ctx->owner_lock.lock_shared();
+  image_ctx->image_lock.lock_shared();
+  Extents parent_extents;
+  ImageArea area;
+  if (!this->compute_parent_extents(&parent_extents, &area, true) ||
+      (image_ctx->exclusive_lock != nullptr &&
+       !image_ctx->exclusive_lock->is_lock_owner())) {
+    // no parent overlap, or we no longer own the exclusive lock
+    image_ctx->image_lock.unlock_shared();
+    image_ctx->owner_lock.unlock_shared();
+    this->finish(0);
+    return;
+  }
+
+  ldout(image_ctx->cct, 20) << dendl;
+
+  image_ctx->copyup_list_lock.lock();
+  auto it = image_ctx->copyup_list.find(this->m_object_no);
+  if (it == image_ctx->copyup_list.end()) {
+    // create and kick off a CopyupRequest
+    auto new_req = CopyupRequest<I>::create(
+      image_ctx, this->m_object_no, std::move(parent_extents), area,
+      this->m_trace);
+
+    image_ctx->copyup_list[this->m_object_no] = new_req;
+    image_ctx->copyup_list_lock.unlock();
+    image_ctx->image_lock.unlock_shared();
+    new_req->send();
+  } else {
+    // a copyup for this object is already in flight -- nothing to add
+    image_ctx->copyup_list_lock.unlock();
+    image_ctx->image_lock.unlock_shared();
+  }
+
+  image_ctx->owner_lock.unlock_shared();
+  this->finish(0);
+}
+
+/** write **/
+
+template <typename I>
+AbstractObjectWriteRequest<I>::AbstractObjectWriteRequest(
+    I *ictx, uint64_t object_no, uint64_t object_off, uint64_t len,
+    IOContext io_context, const char *trace_name,
+    const ZTracer::Trace &parent_trace, Context *completion)
+  : ObjectRequest<I>(ictx, object_no, io_context, trace_name, parent_trace,
+                     completion),
+    m_object_off(object_off), m_object_len(len)
+{
+  // a write spanning the whole object lets us skip copyup/guards below
+  if (this->m_object_off == 0 &&
+      this->m_object_len == ictx->get_object_size()) {
+    m_full_object = true;
+  }
+
+  compute_parent_info();
+
+  ictx->image_lock.lock_shared();
+  if (!ictx->migration_info.empty()) {
+    // writes must be guarded while a migration is in progress
+    m_guarding_migration_write = true; 
+ } + ictx->image_lock.unlock_shared(); +} + +template <typename I> +void AbstractObjectWriteRequest<I>::compute_parent_info() { + I *image_ctx = this->m_ictx; + std::shared_lock image_locker{image_ctx->image_lock}; + + this->compute_parent_extents(&m_parent_extents, &m_image_area, false); + + if (!this->has_parent() || + (m_full_object && + !this->m_io_context->write_snap_context() && + !is_post_copyup_write_required())) { + m_copyup_enabled = false; + } +} + +template <typename I> +void AbstractObjectWriteRequest<I>::add_write_hint( + neorados::WriteOp *wr) { + I *image_ctx = this->m_ictx; + std::shared_lock image_locker{image_ctx->image_lock}; + if (image_ctx->object_map == nullptr || !this->m_object_may_exist || + image_ctx->alloc_hint_flags != 0U) { + ObjectRequest<I>::add_write_hint(*image_ctx, wr); + } +} + +template <typename I> +void AbstractObjectWriteRequest<I>::send() { + I *image_ctx = this->m_ictx; + ldout(image_ctx->cct, 20) << this->get_op_type() << " " + << this->m_object_off << "~" << this->m_object_len + << dendl; + { + std::shared_lock image_lock{image_ctx->image_lock}; + if (image_ctx->object_map == nullptr) { + m_object_may_exist = true; + } else { + // should have been flushed prior to releasing lock + ceph_assert(image_ctx->exclusive_lock->is_lock_owner()); + m_object_may_exist = image_ctx->object_map->object_may_exist( + this->m_object_no); + } + } + + if (!m_object_may_exist && is_no_op_for_nonexistent_object()) { + ldout(image_ctx->cct, 20) << "skipping no-op on nonexistent object" + << dendl; + this->async_finish(0); + return; + } + + pre_write_object_map_update(); +} + +template <typename I> +void AbstractObjectWriteRequest<I>::pre_write_object_map_update() { + I *image_ctx = this->m_ictx; + + image_ctx->image_lock.lock_shared(); + if (image_ctx->object_map == nullptr || !is_object_map_update_enabled()) { + image_ctx->image_lock.unlock_shared(); + write_object(); + return; + } + + if (!m_object_may_exist && m_copyup_enabled) { + // 
optimization: copyup required + image_ctx->image_lock.unlock_shared(); + copyup(); + return; + } + + uint8_t new_state = this->get_pre_write_object_map_state(); + ldout(image_ctx->cct, 20) << this->m_object_off << "~" << this->m_object_len + << dendl; + + if (image_ctx->object_map->template aio_update< + AbstractObjectWriteRequest<I>, + &AbstractObjectWriteRequest<I>::handle_pre_write_object_map_update>( + CEPH_NOSNAP, this->m_object_no, new_state, {}, this->m_trace, false, + this)) { + image_ctx->image_lock.unlock_shared(); + return; + } + + image_ctx->image_lock.unlock_shared(); + write_object(); +} + +template <typename I> +void AbstractObjectWriteRequest<I>::handle_pre_write_object_map_update(int r) { + I *image_ctx = this->m_ictx; + ldout(image_ctx->cct, 20) << "r=" << r << dendl; + if (r < 0) { + lderr(image_ctx->cct) << "failed to update object map: " + << cpp_strerror(r) << dendl; + this->finish(r); + return; + } + + write_object(); +} + +template <typename I> +void AbstractObjectWriteRequest<I>::write_object() { + I *image_ctx = this->m_ictx; + ldout(image_ctx->cct, 20) << dendl; + + neorados::WriteOp write_op; + if (m_copyup_enabled) { + if (m_guarding_migration_write) { + auto snap_seq = (this->m_io_context->write_snap_context() ? + this->m_io_context->write_snap_context()->first : 0); + ldout(image_ctx->cct, 20) << "guarding write: snap_seq=" << snap_seq + << dendl; + + cls_client::assert_snapc_seq( + &write_op, snap_seq, cls::rbd::ASSERT_SNAPC_SEQ_LE_SNAPSET_SEQ); + } else { + ldout(image_ctx->cct, 20) << "guarding write" << dendl; + write_op.assert_exists(); + } + } + + add_write_hint(&write_op); + add_write_ops(&write_op); + ceph_assert(write_op.size() != 0); + + image_ctx->rados_api.execute( + {data_object_name(this->m_ictx, this->m_object_no)}, + *this->m_io_context, std::move(write_op), + librbd::asio::util::get_callback_adapter( + [this](int r) { handle_write_object(r); }), nullptr, + (this->m_trace.valid() ? 
+      this->m_trace.get_info() : nullptr));
+}
+
+// Dispatch on the guarded-write result: retry via copyup, restart after a
+// finished migration, or surface the error.
+template <typename I>
+void AbstractObjectWriteRequest<I>::handle_write_object(int r) {
+  I *image_ctx = this->m_ictx;
+  ldout(image_ctx->cct, 20) << "r=" << r << dendl;
+
+  r = filter_write_result(r);
+  if (r == -ENOENT) {
+    if (m_copyup_enabled) {
+      // assert_exists guard failed -- object needs a copyup first
+      copyup();
+      return;
+    }
+    // NOTE(review): -ENOENT with copyup disabled falls through to the
+    // object-map update -- presumably unreachable without the
+    // assert_exists guard; confirm
+  } else if (r == -ERANGE && m_guarding_migration_write) {
+    // assert_snapc_seq guard failed -- re-check whether the migration
+    // is still in progress before choosing copyup vs. plain retry
+    image_ctx->image_lock.lock_shared();
+    m_guarding_migration_write = !image_ctx->migration_info.empty();
+    image_ctx->image_lock.unlock_shared();
+
+    if (m_guarding_migration_write) {
+      copyup();
+    } else {
+      ldout(image_ctx->cct, 10) << "migration parent gone, restart io" << dendl;
+      compute_parent_info();
+      write_object();
+    }
+    return;
+  } else if (r == -EILSEQ) {
+    // compare-and-write payload mismatch (see filter_write_result)
+    ldout(image_ctx->cct, 10) << "failed to write object" << dendl;
+    this->finish(r);
+    return;
+  } else if (r < 0) {
+    lderr(image_ctx->cct) << "failed to write object: " << cpp_strerror(r)
+                          << dendl;
+    this->finish(r);
+    return;
+  }
+
+  post_write_object_map_update();
+}
+
+// Route this write through a CopyupRequest, coalescing with any copyup
+// already in flight for the same object.  handle_copyup() resumes the
+// state machine when the copyup completes.
+template <typename I>
+void AbstractObjectWriteRequest<I>::copyup() {
+  I *image_ctx = this->m_ictx;
+  ldout(image_ctx->cct, 20) << dendl;
+
+  ceph_assert(!m_copyup_in_progress);
+  m_copyup_in_progress = true;
+
+  image_ctx->copyup_list_lock.lock();
+  auto it = image_ctx->copyup_list.find(this->m_object_no);
+  if (it == image_ctx->copyup_list.end()) {
+    auto new_req = CopyupRequest<I>::create(
+      image_ctx, this->m_object_no, std::move(this->m_parent_extents),
+      m_image_area, this->m_trace);
+    this->m_parent_extents.clear();
+
+    // make sure to wait on this CopyupRequest
+    new_req->append_request(this, std::move(get_copyup_overwrite_extents()));
+    image_ctx->copyup_list[this->m_object_no] = new_req;
+
+    image_ctx->copyup_list_lock.unlock();
+    new_req->send();
+  } else {
+    it->second->append_request(this, std::move(get_copyup_overwrite_extents()));
+    image_ctx->copyup_list_lock.unlock();
+  }
+}
+
+template <typename I>
+void 
+AbstractObjectWriteRequest<I>::handle_copyup(int r) {
+  I *image_ctx = this->m_ictx;
+  ldout(image_ctx->cct, 20) << "r=" << r << dendl;
+
+  ceph_assert(m_copyup_in_progress);
+  m_copyup_in_progress = false;
+
+  if (r < 0 && r != -ERESTART) {
+    lderr(image_ctx->cct) << "failed to copyup object: " << cpp_strerror(r)
+                          << dendl;
+    this->finish(r);
+    return;
+  }
+
+  // NOTE(review): -ERESTART appears to request re-issuing the guarded
+  // write after the copyup raced -- confirm against CopyupRequest
+  if (r == -ERESTART || is_post_copyup_write_required()) {
+    write_object();
+    return;
+  }
+
+  post_write_object_map_update();
+}
+
+// Flip the object-map state back (PENDING -> NONEXISTENT) after a write
+// that removed the object; completes the request either way.
+template <typename I>
+void AbstractObjectWriteRequest<I>::post_write_object_map_update() {
+  I *image_ctx = this->m_ictx;
+
+  image_ctx->image_lock.lock_shared();
+  if (image_ctx->object_map == nullptr || !is_object_map_update_enabled() ||
+      !is_non_existent_post_write_object_map_state()) {
+    image_ctx->image_lock.unlock_shared();
+    this->finish(0);
+    return;
+  }
+
+  ldout(image_ctx->cct, 20) << dendl;
+
+  // should have been flushed prior to releasing lock
+  ceph_assert(image_ctx->exclusive_lock->is_lock_owner());
+  // aio_update returning true means an async update was scheduled and the
+  // member callback below will complete this request
+  if (image_ctx->object_map->template aio_update<
+        AbstractObjectWriteRequest<I>,
+        &AbstractObjectWriteRequest<I>::handle_post_write_object_map_update>(
+          CEPH_NOSNAP, this->m_object_no, OBJECT_NONEXISTENT, OBJECT_PENDING,
+          this->m_trace, false, this)) {
+    image_ctx->image_lock.unlock_shared();
+    return;
+  }
+
+  image_ctx->image_lock.unlock_shared();
+  this->finish(0);
+}
+
+template <typename I>
+void AbstractObjectWriteRequest<I>::handle_post_write_object_map_update(int r) {
+  I *image_ctx = this->m_ictx;
+  ldout(image_ctx->cct, 20) << "r=" << r << dendl;
+  if (r < 0) {
+    lderr(image_ctx->cct) << "failed to update object map: "
+                          << cpp_strerror(r) << dendl;
+    this->finish(r);
+    return;
+  }
+
+  this->finish(0);
+}
+
+// Prepend creation/version guards before the base-class allocation hint.
+template <typename I>
+void ObjectWriteRequest<I>::add_write_hint(neorados::WriteOp* wr) {
+  if ((m_write_flags & OBJECT_WRITE_FLAG_CREATE_EXCLUSIVE) != 0) {
+    wr->create(true);
+  } else if (m_assert_version.has_value()) {
+    
wr->assert_version(m_assert_version.value()); + } + AbstractObjectWriteRequest<I>::add_write_hint(wr); +} + +template <typename I> +void ObjectWriteRequest<I>::add_write_ops(neorados::WriteOp* wr) { + if (this->m_full_object) { + wr->write_full(bufferlist{m_write_data}); + } else { + wr->write(this->m_object_off, bufferlist{m_write_data}); + } + util::apply_op_flags(m_op_flags, 0U, wr); +} + +template <typename I> +void ObjectDiscardRequest<I>::add_write_ops(neorados::WriteOp* wr) { + switch (m_discard_action) { + case DISCARD_ACTION_REMOVE: + wr->remove(); + break; + case DISCARD_ACTION_REMOVE_TRUNCATE: + wr->create(false); + // fall through + case DISCARD_ACTION_TRUNCATE: + wr->truncate(this->m_object_off); + break; + case DISCARD_ACTION_ZERO: + wr->zero(this->m_object_off, this->m_object_len); + break; + default: + ceph_abort(); + break; + } +} + +template <typename I> +void ObjectWriteSameRequest<I>::add_write_ops(neorados::WriteOp* wr) { + wr->writesame(this->m_object_off, this->m_object_len, + bufferlist{m_write_data}); + util::apply_op_flags(m_op_flags, 0U, wr); +} + +template <typename I> +void ObjectCompareAndWriteRequest<I>::add_write_ops(neorados::WriteOp* wr) { + wr->cmpext(this->m_object_off, bufferlist{m_cmp_bl}, nullptr); + + if (this->m_full_object) { + wr->write_full(bufferlist{m_write_bl}); + } else { + wr->write(this->m_object_off, bufferlist{m_write_bl}); + } + util::apply_op_flags(m_op_flags, 0U, wr); +} + +template <typename I> +int ObjectCompareAndWriteRequest<I>::filter_write_result(int r) const { + if (r <= -MAX_ERRNO) { + I *image_ctx = this->m_ictx; + + // object extent compare mismatch + uint64_t offset = -MAX_ERRNO - r; + auto [image_extents, _] = io::util::object_to_area_extents( + image_ctx, this->m_object_no, {{offset, this->m_object_len}}); + ceph_assert(image_extents.size() == 1); + + if (m_mismatch_offset) { + *m_mismatch_offset = image_extents[0].first; + } + r = -EILSEQ; + } + return r; +} + +template <typename I> 
+ObjectListSnapsRequest<I>::ObjectListSnapsRequest( + I *ictx, uint64_t objectno, Extents&& object_extents, SnapIds&& snap_ids, + int list_snaps_flags, const ZTracer::Trace &parent_trace, + SnapshotDelta* snapshot_delta, Context *completion) + : ObjectRequest<I>( + ictx, objectno, ictx->duplicate_data_io_context(), "snap_list", + parent_trace, completion), + m_object_extents(std::move(object_extents)), + m_snap_ids(std::move(snap_ids)), m_list_snaps_flags(list_snaps_flags), + m_snapshot_delta(snapshot_delta) { + this->m_io_context->read_snap(CEPH_SNAPDIR); +} + +template <typename I> +void ObjectListSnapsRequest<I>::send() { + I *image_ctx = this->m_ictx; + ldout(image_ctx->cct, 20) << dendl; + + if (m_snap_ids.size() < 2) { + lderr(image_ctx->cct) << "invalid snap ids: " << m_snap_ids << dendl; + this->async_finish(-EINVAL); + return; + } + + list_snaps(); +} + +template <typename I> +void ObjectListSnapsRequest<I>::list_snaps() { + I *image_ctx = this->m_ictx; + ldout(image_ctx->cct, 20) << dendl; + + neorados::ReadOp read_op; + read_op.list_snaps(&m_snap_set, &m_ec); + + image_ctx->rados_api.execute( + {data_object_name(this->m_ictx, this->m_object_no)}, + *this->m_io_context, std::move(read_op), nullptr, + librbd::asio::util::get_callback_adapter( + [this](int r) { handle_list_snaps(r); }), nullptr, + (this->m_trace.valid() ? 
this->m_trace.get_info() : nullptr)); +} + +template <typename I> +void ObjectListSnapsRequest<I>::handle_list_snaps(int r) { + I *image_ctx = this->m_ictx; + auto cct = image_ctx->cct; + + if (r >= 0) { + r = -m_ec.value(); + } + + ldout(cct, 20) << "r=" << r << dendl; + + m_snapshot_delta->clear(); + auto& snapshot_delta = *m_snapshot_delta; + + ceph_assert(!m_snap_ids.empty()); + librados::snap_t start_snap_id = 0; + librados::snap_t first_snap_id = *m_snap_ids.begin(); + librados::snap_t last_snap_id = *m_snap_ids.rbegin(); + + if (r == -ENOENT) { + // the object does not exist -- mark the missing extents + zero_extent(first_snap_id, true); + list_from_parent(); + return; + } else if (r < 0) { + lderr(cct) << "failed to retrieve object snapshot list: " << cpp_strerror(r) + << dendl; + this->finish(r); + return; + } + + // helper function requires the librados legacy data structure + librados::snap_set_t snap_set; + convert_snap_set(m_snap_set, &snap_set); + + bool initial_extents_written = false; + + interval_set<uint64_t> object_interval; + for (auto& object_extent : m_object_extents) { + object_interval.insert(object_extent.first, object_extent.second); + } + ldout(cct, 20) << "object_interval=" << object_interval << dendl; + + // loop through all expected snapshots and build interval sets for + // data and zeroed ranges for each snapshot + uint64_t prev_end_size = 0; + interval_set<uint64_t> initial_written_extents; + for (auto end_snap_id : m_snap_ids) { + if (start_snap_id == end_snap_id) { + continue; + } else if (end_snap_id > last_snap_id) { + break; + } + + interval_set<uint64_t> diff; + uint64_t end_size; + bool exists; + librados::snap_t clone_end_snap_id; + bool read_whole_object; + calc_snap_set_diff(cct, snap_set, start_snap_id, + end_snap_id, &diff, &end_size, &exists, + &clone_end_snap_id, &read_whole_object); + + if (read_whole_object || + (!diff.empty() && + ((m_list_snaps_flags & LIST_SNAPS_FLAG_WHOLE_OBJECT) != 0))) { + ldout(cct, 1) << 
"need to read full object" << dendl; + diff.clear(); + diff.insert(0, image_ctx->layout.object_size); + end_size = image_ctx->layout.object_size; + clone_end_snap_id = end_snap_id; + } else if (!exists) { + end_size = 0; + } + + if (exists) { + // reads should be issued against the newest (existing) snapshot within + // the associated snapshot object clone. writes should be issued + // against the oldest snapshot in the snap_map. + ceph_assert(clone_end_snap_id >= end_snap_id); + if (clone_end_snap_id > last_snap_id) { + // do not read past the copy point snapshot + clone_end_snap_id = last_snap_id; + } + } + + // clip diff to current object extent + interval_set<uint64_t> diff_interval; + diff_interval.intersection_of(object_interval, diff); + + // clip diff to size of object (in case it was truncated) + interval_set<uint64_t> zero_interval; + if (end_size < prev_end_size) { + zero_interval.insert(end_size, prev_end_size - end_size); + zero_interval.intersection_of(object_interval); + + interval_set<uint64_t> trunc_interval; + trunc_interval.intersection_of(zero_interval, diff_interval); + if (!trunc_interval.empty()) { + diff_interval.subtract(trunc_interval); + ldout(cct, 20) << "clearing truncate diff: " << trunc_interval << dendl; + } + } + + ldout(cct, 20) << "start_snap_id=" << start_snap_id << ", " + << "end_snap_id=" << end_snap_id << ", " + << "clone_end_snap_id=" << clone_end_snap_id << ", " + << "diff=" << diff << ", " + << "diff_interval=" << diff_interval<< ", " + << "zero_interval=" << zero_interval<< ", " + << "end_size=" << end_size << ", " + << "prev_end_size=" << prev_end_size << ", " + << "exists=" << exists << ", " + << "whole_object=" << read_whole_object << dendl; + + // check if object exists prior to start of incremental snap delta so that + // we don't DNE the object if no additional deltas exist + if (exists && start_snap_id == 0 && + (!diff_interval.empty() || !zero_interval.empty())) { + ldout(cct, 20) << "object exists at snap id " << 
end_snap_id << dendl; + initial_extents_written = true; + } + + prev_end_size = end_size; + start_snap_id = end_snap_id; + + if (end_snap_id <= first_snap_id) { + // don't include deltas from the starting snapshots, but we iterate over + // it to track its existence and size + ldout(cct, 20) << "skipping prior snapshot " << dendl; + continue; + } + + if (exists) { + for (auto& interval : diff_interval) { + snapshot_delta[{end_snap_id, clone_end_snap_id}].insert( + interval.first, interval.second, + SparseExtent(SPARSE_EXTENT_STATE_DATA, interval.second)); + } + } else { + zero_interval.union_of(diff_interval); + } + + if ((m_list_snaps_flags & LIST_SNAPS_FLAG_IGNORE_ZEROED_EXTENTS) == 0) { + for (auto& interval : zero_interval) { + snapshot_delta[{end_snap_id, end_snap_id}].insert( + interval.first, interval.second, + SparseExtent(SPARSE_EXTENT_STATE_ZEROED, interval.second)); + } + } + } + + bool snapshot_delta_empty = snapshot_delta.empty(); + if (!initial_extents_written) { + zero_extent(first_snap_id, first_snap_id > 0); + } + ldout(cct, 20) << "snapshot_delta=" << snapshot_delta << dendl; + + if (snapshot_delta_empty) { + list_from_parent(); + return; + } + + this->finish(0); +} + +template <typename I> +void ObjectListSnapsRequest<I>::list_from_parent() { + I *image_ctx = this->m_ictx; + auto cct = image_ctx->cct; + + ceph_assert(!m_snap_ids.empty()); + librados::snap_t snap_id_start = *m_snap_ids.begin(); + librados::snap_t snap_id_end = *m_snap_ids.rbegin(); + + std::unique_lock image_locker{image_ctx->image_lock}; + if ((snap_id_start > 0) || (image_ctx->parent == nullptr) || + ((m_list_snaps_flags & LIST_SNAPS_FLAG_DISABLE_LIST_FROM_PARENT) != 0)) { + image_locker.unlock(); + + this->finish(0); + return; + } + + Extents parent_extents; + uint64_t raw_overlap = 0; + uint64_t object_overlap = 0; + image_ctx->get_parent_overlap(snap_id_end, &raw_overlap); + if (raw_overlap > 0) { + // calculate reverse mapping onto the parent image + std::tie(parent_extents, 
m_image_area) = io::util::object_to_area_extents( + image_ctx, this->m_object_no, m_object_extents); + object_overlap = image_ctx->prune_parent_extents( + parent_extents, m_image_area, raw_overlap, false); + } + if (object_overlap == 0) { + image_locker.unlock(); + + this->finish(0); + return; + } + + auto ctx = create_context_callback< + ObjectListSnapsRequest<I>, + &ObjectListSnapsRequest<I>::handle_list_from_parent>(this); + auto aio_comp = AioCompletion::create_and_start( + ctx, librbd::util::get_image_ctx(image_ctx->parent), AIO_TYPE_GENERIC); + ldout(cct, 20) << "completion=" << aio_comp + << " parent_extents=" << parent_extents + << " area=" << m_image_area << dendl; + + auto list_snaps_flags = ( + m_list_snaps_flags | LIST_SNAPS_FLAG_IGNORE_ZEROED_EXTENTS); + + ImageListSnapsRequest<I> req( + *image_ctx->parent, aio_comp, std::move(parent_extents), m_image_area, + {0, image_ctx->parent->snap_id}, list_snaps_flags, &m_parent_snapshot_delta, + this->m_trace); + req.send(); +} + +template <typename I> +void ObjectListSnapsRequest<I>::handle_list_from_parent(int r) { + I *image_ctx = this->m_ictx; + auto cct = image_ctx->cct; + + ldout(cct, 20) << "r=" << r << ", " + << "parent_snapshot_delta=" << m_parent_snapshot_delta + << dendl; + + // ignore special-case of fully empty dataset (we ignore zeroes) + if (m_parent_snapshot_delta.empty()) { + this->finish(0); + return; + } + + // the write/read snapshot id key is not useful for parent images so + // map the the special-case INITIAL_WRITE_READ_SNAP_IDS key + *m_snapshot_delta = {}; + auto& intervals = (*m_snapshot_delta)[INITIAL_WRITE_READ_SNAP_IDS]; + for (auto& [key, image_extents] : m_parent_snapshot_delta) { + for (auto image_extent : image_extents) { + auto state = image_extent.get_val().state; + + // map image-extents back to this object + striper::LightweightObjectExtents object_extents; + io::util::area_to_object_extents(image_ctx, image_extent.get_off(), + image_extent.get_len(), m_image_area, 0, + 
&object_extents); + for (auto& object_extent : object_extents) { + ceph_assert(object_extent.object_no == this->m_object_no); + intervals.insert( + object_extent.offset, object_extent.length, + {state, object_extent.length}); + } + } + } + + ldout(cct, 20) << "snapshot_delta=" << *m_snapshot_delta << dendl; + this->finish(0); +} + +template <typename I> +void ObjectListSnapsRequest<I>::zero_extent(uint64_t snap_id, bool dne) { + I *image_ctx = this->m_ictx; + auto cct = image_ctx->cct; + + // the object does not exist or is (partially) under whiteout -- mark the + // missing extents which would be any portion of the object that does not + // have data in the initial snapshot set + if ((m_list_snaps_flags & LIST_SNAPS_FLAG_IGNORE_ZEROED_EXTENTS) == 0) { + interval_set<uint64_t> interval; + for (auto [object_offset, object_length] : m_object_extents) { + interval.insert(object_offset, object_length); + } + + for (auto [offset, length] : interval) { + ldout(cct, 20) << "snapshot " << snap_id << ": " + << (dne ? "DNE" : "zeroed") << " extent " + << offset << "~" << length << dendl; + (*m_snapshot_delta)[{snap_id, snap_id}].insert( + offset, length, + SparseExtent( + (dne ? 
SPARSE_EXTENT_STATE_DNE : SPARSE_EXTENT_STATE_ZEROED), + length)); + } + } +} + +} // namespace io +} // namespace librbd + +template class librbd::io::ObjectRequest<librbd::ImageCtx>; +template class librbd::io::ObjectReadRequest<librbd::ImageCtx>; +template class librbd::io::AbstractObjectWriteRequest<librbd::ImageCtx>; +template class librbd::io::ObjectWriteRequest<librbd::ImageCtx>; +template class librbd::io::ObjectDiscardRequest<librbd::ImageCtx>; +template class librbd::io::ObjectWriteSameRequest<librbd::ImageCtx>; +template class librbd::io::ObjectCompareAndWriteRequest<librbd::ImageCtx>; +template class librbd::io::ObjectListSnapsRequest<librbd::ImageCtx>; diff --git a/src/librbd/io/ObjectRequest.h b/src/librbd/io/ObjectRequest.h new file mode 100644 index 000000000..caf644023 --- /dev/null +++ b/src/librbd/io/ObjectRequest.h @@ -0,0 +1,505 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_IO_OBJECT_REQUEST_H +#define CEPH_LIBRBD_IO_OBJECT_REQUEST_H + +#include "include/int_types.h" +#include "include/buffer.h" +#include "include/neorados/RADOS.hpp" +#include "include/rados/librados.hpp" +#include "common/zipkin_trace.h" +#include "librbd/ObjectMap.h" +#include "librbd/Types.h" +#include "librbd/io/Types.h" +#include <map> + +class Context; +class ObjectExtent; + +namespace neorados { struct WriteOp; } + +namespace librbd { + +struct ImageCtx; + +namespace io { + +struct AioCompletion; +template <typename> class CopyupRequest; + +/** + * This class represents an I/O operation to a single RBD data object. + * Its subclasses encapsulate logic for dealing with special cases + * for I/O due to layering. 
+ */ +template <typename ImageCtxT = ImageCtx> +class ObjectRequest { +public: + static ObjectRequest* create_write( + ImageCtxT *ictx, uint64_t object_no, uint64_t object_off, + ceph::bufferlist&& data, IOContext io_context, int op_flags, + int write_flags, std::optional<uint64_t> assert_version, + const ZTracer::Trace &parent_trace, Context *completion); + static ObjectRequest* create_discard( + ImageCtxT *ictx, uint64_t object_no, uint64_t object_off, + uint64_t object_len, IOContext io_context, int discard_flags, + const ZTracer::Trace &parent_trace, Context *completion); + static ObjectRequest* create_write_same( + ImageCtxT *ictx, uint64_t object_no, uint64_t object_off, + uint64_t object_len, ceph::bufferlist&& data, IOContext io_context, + int op_flags, const ZTracer::Trace &parent_trace, Context *completion); + static ObjectRequest* create_compare_and_write( + ImageCtxT *ictx, uint64_t object_no, uint64_t object_off, + ceph::bufferlist&& cmp_data, ceph::bufferlist&& write_data, + IOContext io_context, uint64_t *mismatch_offset, int op_flags, + const ZTracer::Trace &parent_trace, Context *completion); + + ObjectRequest(ImageCtxT *ictx, uint64_t objectno, IOContext io_context, + const char *trace_name, const ZTracer::Trace &parent_trace, + Context *completion); + virtual ~ObjectRequest() { + m_trace.event("finish"); + } + + static void add_write_hint(ImageCtxT& image_ctx, + neorados::WriteOp *wr); + + virtual void send() = 0; + + bool has_parent() const { + return m_has_parent; + } + + virtual const char *get_op_type() const = 0; + +protected: + bool compute_parent_extents(Extents *parent_extents, ImageArea *area, + bool read_request); + + ImageCtxT *m_ictx; + uint64_t m_object_no; + IOContext m_io_context; + Context *m_completion; + ZTracer::Trace m_trace; + + void async_finish(int r); + void finish(int r); + +private: + bool m_has_parent = false; +}; + +template <typename ImageCtxT = ImageCtx> +class ObjectReadRequest : public ObjectRequest<ImageCtxT> { 
+public: + static ObjectReadRequest* create( + ImageCtxT *ictx, uint64_t objectno, ReadExtents* extents, + IOContext io_context, int op_flags, int read_flags, + const ZTracer::Trace &parent_trace, uint64_t* version, + Context *completion) { + return new ObjectReadRequest(ictx, objectno, extents, io_context, op_flags, + read_flags, parent_trace, version, completion); + } + + ObjectReadRequest( + ImageCtxT *ictx, uint64_t objectno, ReadExtents* extents, + IOContext io_context, int op_flags, int read_flags, + const ZTracer::Trace &parent_trace, uint64_t* version, + Context *completion); + + void send() override; + + const char *get_op_type() const override { + return "read"; + } + +private: + /** + * @verbatim + * + * <start> + * | + * | + * v + * READ_OBJECT + * | + * v (skip if not needed) + * READ_PARENT + * | + * v (skip if not needed) + * COPYUP + * | + * v + * <finish> + * + * @endverbatim + */ + + ReadExtents* m_extents; + int m_op_flags; + int m_read_flags; + uint64_t* m_version; + + void read_object(); + void handle_read_object(int r); + + void read_parent(); + void handle_read_parent(int r); + + void copyup(); +}; + +template <typename ImageCtxT = ImageCtx> +class AbstractObjectWriteRequest : public ObjectRequest<ImageCtxT> { +public: + AbstractObjectWriteRequest( + ImageCtxT *ictx, uint64_t object_no, uint64_t object_off, uint64_t len, + IOContext io_context, const char *trace_name, + const ZTracer::Trace &parent_trace, Context *completion); + + virtual bool is_empty_write_op() const { + return false; + } + + virtual uint8_t get_pre_write_object_map_state() const { + return OBJECT_EXISTS; + } + + virtual void add_copyup_ops(neorados::WriteOp *wr) { + add_write_ops(wr); + } + + void handle_copyup(int r); + + void send() override; + +protected: + uint64_t m_object_off; + uint64_t m_object_len; + bool m_full_object = false; + bool m_copyup_enabled = true; + + virtual bool is_no_op_for_nonexistent_object() const { + return false; + } + virtual bool 
is_object_map_update_enabled() const { + return true; + } + virtual bool is_post_copyup_write_required() const { + return false; + } + virtual bool is_non_existent_post_write_object_map_state() const { + return false; + } + + virtual void add_write_hint(neorados::WriteOp *wr); + virtual void add_write_ops(neorados::WriteOp *wr) = 0; + + virtual int filter_write_result(int r) const { + return r; + } + + virtual Extents get_copyup_overwrite_extents() const { + return {{m_object_off, m_object_len}}; + } + +private: + /** + * @verbatim + * + * <start> + * | + * v (no-op write request) + * DETECT_NO_OP . . . . . . . . . . . . . . . . . . . + * | . + * v (skip if not required/disabled) . + * PRE_UPDATE_OBJECT_MAP . + * | . . + * | . (child dne) . + * | . . . . . . . . . . + * | . . + * | (post-copyup write) . . + * | . . . . . . . . . . . . . . + * | . . . . + * v v . v . + * WRITE . . . . . . . . > COPYUP (if required) . + * | | . + * |/----------------------/ . + * | . + * v (skip if not required/disabled) . + * POST_UPDATE_OBJECT_MAP . + * | . + * v . + * <finish> < . . . . . . . . . . . . . . . . . . . . 
+ * + * @endverbatim + */ + + Extents m_parent_extents; + ImageArea m_image_area = ImageArea::DATA; + bool m_object_may_exist = false; + bool m_copyup_in_progress = false; + bool m_guarding_migration_write = false; + + void compute_parent_info(); + + void pre_write_object_map_update(); + void handle_pre_write_object_map_update(int r); + + void write_object(); + void handle_write_object(int r); + + void copyup(); + + void post_write_object_map_update(); + void handle_post_write_object_map_update(int r); + +}; + +template <typename ImageCtxT = ImageCtx> +class ObjectWriteRequest : public AbstractObjectWriteRequest<ImageCtxT> { +public: + ObjectWriteRequest( + ImageCtxT *ictx, uint64_t object_no, uint64_t object_off, + ceph::bufferlist&& data, IOContext io_context, int op_flags, + int write_flags, std::optional<uint64_t> assert_version, + const ZTracer::Trace &parent_trace, Context *completion) + : AbstractObjectWriteRequest<ImageCtxT>(ictx, object_no, object_off, + data.length(), io_context, "write", + parent_trace, completion), + m_write_data(std::move(data)), m_op_flags(op_flags), + m_write_flags(write_flags), m_assert_version(assert_version) { + } + + bool is_empty_write_op() const override { + return (m_write_data.length() == 0); + } + + const char *get_op_type() const override { + return "write"; + } + +protected: + void add_write_ops(neorados::WriteOp *wr) override; + void add_write_hint(neorados::WriteOp *wr) override; + +private: + ceph::bufferlist m_write_data; + int m_op_flags; + int m_write_flags; + std::optional<uint64_t> m_assert_version; +}; + +template <typename ImageCtxT = ImageCtx> +class ObjectDiscardRequest : public AbstractObjectWriteRequest<ImageCtxT> { +public: + ObjectDiscardRequest( + ImageCtxT *ictx, uint64_t object_no, uint64_t object_off, + uint64_t object_len, IOContext io_context, int discard_flags, + const ZTracer::Trace &parent_trace, Context *completion) + : AbstractObjectWriteRequest<ImageCtxT>(ictx, object_no, object_off, + 
object_len, io_context, "discard", + parent_trace, completion), + m_discard_flags(discard_flags) { + if (this->m_full_object) { + if ((m_discard_flags & OBJECT_DISCARD_FLAG_DISABLE_CLONE_REMOVE) != 0 && + this->has_parent()) { + if (!this->m_copyup_enabled) { + // need to hide the parent object instead of child object + m_discard_action = DISCARD_ACTION_REMOVE_TRUNCATE; + } else { + m_discard_action = DISCARD_ACTION_TRUNCATE; + } + } else { + m_discard_action = DISCARD_ACTION_REMOVE; + } + } else if (object_off + object_len == ictx->layout.object_size) { + m_discard_action = DISCARD_ACTION_TRUNCATE; + } else { + m_discard_action = DISCARD_ACTION_ZERO; + } + } + + const char* get_op_type() const override { + switch (m_discard_action) { + case DISCARD_ACTION_REMOVE: + return "remove"; + case DISCARD_ACTION_REMOVE_TRUNCATE: + return "remove (create+truncate)"; + case DISCARD_ACTION_TRUNCATE: + return "truncate"; + case DISCARD_ACTION_ZERO: + return "zero"; + } + ceph_abort(); + return nullptr; + } + + uint8_t get_pre_write_object_map_state() const override { + if (m_discard_action == DISCARD_ACTION_REMOVE) { + return OBJECT_PENDING; + } + return OBJECT_EXISTS; + } + +protected: + bool is_no_op_for_nonexistent_object() const override { + return (!this->has_parent()); + } + bool is_object_map_update_enabled() const override { + return ( + (m_discard_flags & OBJECT_DISCARD_FLAG_DISABLE_OBJECT_MAP_UPDATE) == 0); + } + bool is_non_existent_post_write_object_map_state() const override { + return (m_discard_action == DISCARD_ACTION_REMOVE); + } + + void add_write_hint(neorados::WriteOp *wr) override { + // no hint for discard + } + + void add_write_ops(neorados::WriteOp *wr) override; + +private: + enum DiscardAction { + DISCARD_ACTION_REMOVE, + DISCARD_ACTION_REMOVE_TRUNCATE, + DISCARD_ACTION_TRUNCATE, + DISCARD_ACTION_ZERO + }; + + DiscardAction m_discard_action; + int m_discard_flags; + +}; + +template <typename ImageCtxT = ImageCtx> +class ObjectWriteSameRequest : public 
AbstractObjectWriteRequest<ImageCtxT> { +public: + ObjectWriteSameRequest( + ImageCtxT *ictx, uint64_t object_no, uint64_t object_off, + uint64_t object_len, ceph::bufferlist&& data, IOContext io_context, + int op_flags, const ZTracer::Trace &parent_trace, Context *completion) + : AbstractObjectWriteRequest<ImageCtxT>(ictx, object_no, object_off, + object_len, io_context, "writesame", + parent_trace, completion), + m_write_data(std::move(data)), m_op_flags(op_flags) { + } + + const char *get_op_type() const override { + return "writesame"; + } + +protected: + void add_write_ops(neorados::WriteOp *wr) override; + +private: + ceph::bufferlist m_write_data; + int m_op_flags; +}; + +template <typename ImageCtxT = ImageCtx> +class ObjectCompareAndWriteRequest : public AbstractObjectWriteRequest<ImageCtxT> { +public: + ObjectCompareAndWriteRequest( + ImageCtxT *ictx, uint64_t object_no, uint64_t object_off, + ceph::bufferlist&& cmp_bl, ceph::bufferlist&& write_bl, + IOContext io_context, uint64_t *mismatch_offset, int op_flags, + const ZTracer::Trace &parent_trace, Context *completion) + : AbstractObjectWriteRequest<ImageCtxT>(ictx, object_no, object_off, + cmp_bl.length(), io_context, + "compare_and_write", parent_trace, + completion), + m_cmp_bl(std::move(cmp_bl)), m_write_bl(std::move(write_bl)), + m_mismatch_offset(mismatch_offset), m_op_flags(op_flags) { + } + + const char *get_op_type() const override { + return "compare_and_write"; + } + + void add_copyup_ops(neorados::WriteOp *wr) override { + // no-op on copyup + } + +protected: + virtual bool is_post_copyup_write_required() const { + return true; + } + + void add_write_ops(neorados::WriteOp *wr) override; + + int filter_write_result(int r) const override; + + Extents get_copyup_overwrite_extents() const override { + return {}; + } + +private: + ceph::bufferlist m_cmp_bl; + ceph::bufferlist m_write_bl; + uint64_t *m_mismatch_offset; + int m_op_flags; +}; + +template <typename ImageCtxT = ImageCtx> +class 
ObjectListSnapsRequest : public ObjectRequest<ImageCtxT> { +public: + static ObjectListSnapsRequest* create( + ImageCtxT *ictx, uint64_t objectno, Extents&& object_extents, + SnapIds&& snap_ids, int list_snaps_flags, + const ZTracer::Trace &parent_trace, SnapshotDelta* snapshot_delta, + Context *completion) { + return new ObjectListSnapsRequest(ictx, objectno, + std::move(object_extents), + std::move(snap_ids), list_snaps_flags, + parent_trace, snapshot_delta, completion); + } + + ObjectListSnapsRequest( + ImageCtxT *ictx, uint64_t objectno, Extents&& object_extents, + SnapIds&& snap_ids, int list_snaps_flags, + const ZTracer::Trace &parent_trace, SnapshotDelta* snapshot_delta, + Context *completion); + + void send() override; + + const char *get_op_type() const override { + return "snap_list"; + } + +private: + Extents m_object_extents; + SnapIds m_snap_ids; + int m_list_snaps_flags; + SnapshotDelta* m_snapshot_delta; + + neorados::SnapSet m_snap_set; + boost::system::error_code m_ec; + + ImageArea m_image_area = ImageArea::DATA; + SnapshotDelta m_parent_snapshot_delta; + + void list_snaps(); + void handle_list_snaps(int r); + + void list_from_parent(); + void handle_list_from_parent(int r); + + void zero_extent(uint64_t snap_id, bool dne); +}; + +} // namespace io +} // namespace librbd + +extern template class librbd::io::ObjectRequest<librbd::ImageCtx>; +extern template class librbd::io::ObjectReadRequest<librbd::ImageCtx>; +extern template class librbd::io::AbstractObjectWriteRequest<librbd::ImageCtx>; +extern template class librbd::io::ObjectWriteRequest<librbd::ImageCtx>; +extern template class librbd::io::ObjectDiscardRequest<librbd::ImageCtx>; +extern template class librbd::io::ObjectWriteSameRequest<librbd::ImageCtx>; +extern template class librbd::io::ObjectCompareAndWriteRequest<librbd::ImageCtx>; +extern template class librbd::io::ObjectListSnapsRequest<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_IO_OBJECT_REQUEST_H diff --git 
a/src/librbd/io/QosImageDispatch.cc b/src/librbd/io/QosImageDispatch.cc new file mode 100644 index 000000000..ea1d5dbb5 --- /dev/null +++ b/src/librbd/io/QosImageDispatch.cc @@ -0,0 +1,328 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/io/QosImageDispatch.h" +#include "common/dout.h" +#include "librbd/AsioEngine.h" +#include "librbd/ImageCtx.h" +#include "librbd/io/FlushTracker.h" +#include <utility> + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::io::QosImageDispatch: " << this << " " \ + << __func__ << ": " + +namespace librbd { +namespace io { + +namespace { + +uint64_t get_extent_length(const Extents& extents) { + uint64_t length = 0; + for (auto& extent : extents) { + length += extent.second; + } + return length; +} + +uint64_t calculate_tokens(bool read_op, uint64_t extent_length, uint64_t flag) { + if (read_op && ((flag & IMAGE_DISPATCH_FLAG_QOS_WRITE_MASK) != 0)) { + return 0; + } else if (!read_op && ((flag & IMAGE_DISPATCH_FLAG_QOS_READ_MASK) != 0)) { + return 0; + } + + return (((flag & IMAGE_DISPATCH_FLAG_QOS_BPS_MASK) != 0) ? 
extent_length : 1); +} + +static const std::pair<uint64_t, const char*> throttle_flags[] = { + {IMAGE_DISPATCH_FLAG_QOS_IOPS_THROTTLE, "rbd_qos_iops_throttle" }, + {IMAGE_DISPATCH_FLAG_QOS_BPS_THROTTLE, "rbd_qos_bps_throttle" }, + {IMAGE_DISPATCH_FLAG_QOS_READ_IOPS_THROTTLE, "rbd_qos_read_iops_throttle" }, + {IMAGE_DISPATCH_FLAG_QOS_WRITE_IOPS_THROTTLE, "rbd_qos_write_iops_throttle" }, + {IMAGE_DISPATCH_FLAG_QOS_READ_BPS_THROTTLE, "rbd_qos_read_bps_throttle" }, + {IMAGE_DISPATCH_FLAG_QOS_WRITE_BPS_THROTTLE, "rbd_qos_write_bps_throttle" } +}; + +} // anonymous namespace + +template <typename I> +QosImageDispatch<I>::QosImageDispatch(I* image_ctx) + : m_image_ctx(image_ctx), m_flush_tracker(new FlushTracker<I>(image_ctx)) { + auto cct = m_image_ctx->cct; + ldout(cct, 5) << "ictx=" << image_ctx << dendl; + + SafeTimer *timer; + ceph::mutex *timer_lock; + ImageCtx::get_timer_instance(cct, &timer, &timer_lock); + for (auto [flag, name] : throttle_flags) { + m_throttles.emplace_back( + flag, + new TokenBucketThrottle(cct, name, 0, 0, timer, timer_lock)); + } +} + +template <typename I> +QosImageDispatch<I>::~QosImageDispatch() { + for (auto t : m_throttles) { + delete t.second; + } +} + +template <typename I> +void QosImageDispatch<I>::shut_down(Context* on_finish) { + m_flush_tracker->shut_down(); + on_finish->complete(0); +} + +template <typename I> +void QosImageDispatch<I>::apply_qos_schedule_tick_min(uint64_t tick) { + for (auto pair : m_throttles) { + pair.second->set_schedule_tick_min(tick); + } +} + +template <typename I> +void QosImageDispatch<I>::apply_qos_limit(uint64_t flag, uint64_t limit, + uint64_t burst, uint64_t burst_seconds) { + auto cct = m_image_ctx->cct; + TokenBucketThrottle *throttle = nullptr; + for (auto pair : m_throttles) { + if (flag == pair.first) { + throttle = pair.second; + break; + } + } + ceph_assert(throttle != nullptr); + + int r = throttle->set_limit(limit, burst, burst_seconds); + if (r < 0) { + lderr(cct) << throttle->get_name() << 
": invalid qos parameter: " + << "burst(" << burst << ") is less than " + << "limit(" << limit << ")" << dendl; + // if apply failed, we should at least make sure the limit works. + throttle->set_limit(limit, 0, 1); + } + + if (limit) { + m_qos_enabled_flag |= flag; + } else { + m_qos_enabled_flag &= ~flag; + } +} + +template <typename I> +void QosImageDispatch<I>::apply_qos_exclude_ops(uint64_t exclude_ops) { + m_qos_exclude_ops = exclude_ops; +} + +template <typename I> +bool QosImageDispatch<I>::read( + AioCompletion* aio_comp, Extents &&image_extents, ReadResult &&read_result, + IOContext io_context, int op_flags, int read_flags, + const ZTracer::Trace &parent_trace, uint64_t tid, + std::atomic<uint32_t>* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "tid=" << tid << ", image_extents=" << image_extents + << dendl; + + if (m_qos_exclude_ops & RBD_IO_OPERATION_READ) { + return false; + } + + if (needs_throttle(true, image_extents, tid, image_dispatch_flags, + dispatch_result, on_finish, on_dispatched)) { + return true; + } + + return false; +} + +template <typename I> +bool QosImageDispatch<I>::write( + AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&bl, + int op_flags, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "tid=" << tid << ", image_extents=" << image_extents + << dendl; + + if (m_qos_exclude_ops & RBD_IO_OPERATION_WRITE) { + return false; + } + + if (needs_throttle(false, image_extents, tid, image_dispatch_flags, + dispatch_result, on_finish, on_dispatched)) { + return true; + } + + return false; +} + +template <typename I> +bool QosImageDispatch<I>::discard( + AioCompletion* aio_comp, Extents &&image_extents, + uint32_t discard_granularity_bytes, 
const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "tid=" << tid << ", image_extents=" << image_extents + << dendl; + + if (m_qos_exclude_ops & RBD_IO_OPERATION_DISCARD) { + return false; + } + + if (needs_throttle(false, image_extents, tid, image_dispatch_flags, + dispatch_result, on_finish, on_dispatched)) { + return true; + } + + return false; +} + +template <typename I> +bool QosImageDispatch<I>::write_same( + AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&bl, + int op_flags, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "tid=" << tid << ", image_extents=" << image_extents + << dendl; + + if (m_qos_exclude_ops & RBD_IO_OPERATION_WRITE_SAME) { + return false; + } + + if (needs_throttle(false, image_extents, tid, image_dispatch_flags, + dispatch_result, on_finish, on_dispatched)) { + return true; + } + + return false; +} + +template <typename I> +bool QosImageDispatch<I>::compare_and_write( + AioCompletion* aio_comp, Extents &&image_extents, + bufferlist &&cmp_bl, bufferlist &&bl, uint64_t *mismatch_offset, + int op_flags, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "tid=" << tid << ", image_extents=" << image_extents + << dendl; + + if (m_qos_exclude_ops & RBD_IO_OPERATION_COMPARE_AND_WRITE) { + return false; + } + + if (needs_throttle(false, image_extents, tid, image_dispatch_flags, + dispatch_result, on_finish, on_dispatched)) { + return true; + } + + return false; +} + +template <typename I> +bool 
QosImageDispatch<I>::flush( + AioCompletion* aio_comp, FlushSource flush_source, + const ZTracer::Trace &parent_trace, uint64_t tid, + std::atomic<uint32_t>* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "tid=" << tid << dendl; + + *dispatch_result = DISPATCH_RESULT_CONTINUE; + m_flush_tracker->flush(on_dispatched); + return true; +} + +template <typename I> +void QosImageDispatch<I>::handle_finished(int r, uint64_t tid) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "tid=" << tid << dendl; + + m_flush_tracker->finish_io(tid); +} + +template <typename I> +bool QosImageDispatch<I>::set_throttle_flag( + std::atomic<uint32_t>* image_dispatch_flags, uint32_t flag) { + uint32_t expected = image_dispatch_flags->load(); + uint32_t desired; + do { + desired = expected | flag; + } while (!image_dispatch_flags->compare_exchange_weak(expected, desired)); + + return ((desired & IMAGE_DISPATCH_FLAG_QOS_MASK) == + IMAGE_DISPATCH_FLAG_QOS_MASK); +} + +template <typename I> +bool QosImageDispatch<I>::needs_throttle( + bool read_op, const Extents& image_extents, uint64_t tid, + std::atomic<uint32_t>* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) { + auto cct = m_image_ctx->cct; + auto extent_length = get_extent_length(image_extents); + bool all_qos_flags_set = false; + + if (!read_op) { + m_flush_tracker->start_io(tid); + *on_finish = new LambdaContext([this, tid, on_finish=*on_finish](int r) { + handle_finished(r, tid); + on_finish->complete(r); + }); + } + *dispatch_result = DISPATCH_RESULT_CONTINUE; + + auto qos_enabled_flag = m_qos_enabled_flag; + for (auto [flag, throttle] : m_throttles) { + if ((qos_enabled_flag & flag) == 0) { + all_qos_flags_set = set_throttle_flag(image_dispatch_flags, flag); + continue; + } + + auto tokens = calculate_tokens(read_op, extent_length, flag); + if (tokens > 0 && + 
throttle->get(tokens, this, &QosImageDispatch<I>::handle_throttle_ready, + Tag{image_dispatch_flags, on_dispatched}, flag)) { + ldout(cct, 15) << "on_dispatched=" << on_dispatched << ", " + << "flag=" << flag << dendl; + all_qos_flags_set = false; + } else { + all_qos_flags_set = set_throttle_flag(image_dispatch_flags, flag); + } + } + return !all_qos_flags_set; +} + +template <typename I> +void QosImageDispatch<I>::handle_throttle_ready(Tag&& tag, uint64_t flag) { + auto cct = m_image_ctx->cct; + ldout(cct, 15) << "on_dispatched=" << tag.on_dispatched << ", " + << "flag=" << flag << dendl; + + if (set_throttle_flag(tag.image_dispatch_flags, flag)) { + // timer_lock is held -- so dispatch from outside the timer thread + m_image_ctx->asio_engine->post(tag.on_dispatched, 0); + } +} + +} // namespace io +} // namespace librbd + +template class librbd::io::QosImageDispatch<librbd::ImageCtx>; diff --git a/src/librbd/io/QosImageDispatch.h b/src/librbd/io/QosImageDispatch.h new file mode 100644 index 000000000..f5e08940a --- /dev/null +++ b/src/librbd/io/QosImageDispatch.h @@ -0,0 +1,135 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_IO_QOS_IMAGE_DISPATCH_H +#define CEPH_LIBRBD_IO_QOS_IMAGE_DISPATCH_H + +#include <list> +#include <memory> + +#include "librbd/io/ImageDispatchInterface.h" +#include "include/int_types.h" +#include "include/buffer.h" +#include "common/zipkin_trace.h" +#include "common/Throttle.h" +#include "librbd/io/ReadResult.h" +#include "librbd/io/Types.h" + +struct Context; + +namespace librbd { + +struct ImageCtx; + +namespace io { + +struct AioCompletion; +template <typename> class FlushTracker; + +template <typename ImageCtxT> +class QosImageDispatch : public ImageDispatchInterface { +public: + struct Tag { + std::atomic<uint32_t>* image_dispatch_flags; + Context* on_dispatched; + + Tag(std::atomic<uint32_t>* image_dispatch_flags, Context* on_dispatched) + : 
image_dispatch_flags(image_dispatch_flags), + on_dispatched(on_dispatched) { + } + }; + + QosImageDispatch(ImageCtxT* image_ctx); + ~QosImageDispatch() override; + + ImageDispatchLayer get_dispatch_layer() const override { + return IMAGE_DISPATCH_LAYER_QOS; + } + + void shut_down(Context* on_finish) override; + + void apply_qos_schedule_tick_min(uint64_t tick); + void apply_qos_limit(uint64_t flag, uint64_t limit, uint64_t burst, + uint64_t burst_seconds); + void apply_qos_exclude_ops(uint64_t exclude_ops); + + bool read( + AioCompletion* aio_comp, Extents &&image_extents, + ReadResult &&read_result, IOContext io_context, int op_flags, + int read_flags, const ZTracer::Trace &parent_trace, uint64_t tid, + std::atomic<uint32_t>* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override; + bool write( + AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&bl, + int op_flags, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override; + bool discard( + AioCompletion* aio_comp, Extents &&image_extents, + uint32_t discard_granularity_bytes, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override; + bool write_same( + AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&bl, + int op_flags, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override; + bool compare_and_write( + AioCompletion* aio_comp, Extents &&image_extents, + bufferlist &&cmp_bl, bufferlist &&bl, uint64_t *mismatch_offset, + int op_flags, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags, + 
DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override; + bool flush( + AioCompletion* aio_comp, FlushSource flush_source, + const ZTracer::Trace &parent_trace, uint64_t tid, + std::atomic<uint32_t>* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override; + + bool list_snaps( + AioCompletion* aio_comp, Extents&& image_extents, SnapIds&& snap_ids, + int list_snaps_flags, SnapshotDelta* snapshot_delta, + const ZTracer::Trace &parent_trace, uint64_t tid, + std::atomic<uint32_t>* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override { + return false; + } + + bool invalidate_cache(Context* on_finish) override { + return false; + } + +private: + ImageCtxT* m_image_ctx; + + std::list<std::pair<uint64_t, TokenBucketThrottle*> > m_throttles; + uint64_t m_qos_enabled_flag = 0; + uint64_t m_qos_exclude_ops = 0; + + std::unique_ptr<FlushTracker<ImageCtxT>> m_flush_tracker; + + void handle_finished(int r, uint64_t tid); + + bool set_throttle_flag(std::atomic<uint32_t>* image_dispatch_flags, + uint32_t flag); + bool needs_throttle(bool read_op, const Extents& image_extents, uint64_t tid, + std::atomic<uint32_t>* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched); + void handle_throttle_ready(Tag&& tag, uint64_t flag); + +}; + +} // namespace io +} // namespace librbd + +extern template class librbd::io::QosImageDispatch<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_IO_QOS_IMAGE_DISPATCH_H diff --git a/src/librbd/io/QueueImageDispatch.cc b/src/librbd/io/QueueImageDispatch.cc new file mode 100644 index 000000000..ea5ed63b4 --- /dev/null +++ b/src/librbd/io/QueueImageDispatch.cc @@ -0,0 +1,154 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/io/QueueImageDispatch.h" +#include "common/dout.h" +#include 
"common/Cond.h" +#include "librbd/AsioEngine.h" +#include "librbd/ImageCtx.h" +#include "librbd/Utils.h" +#include "librbd/io/AioCompletion.h" +#include "librbd/io/FlushTracker.h" +#include "librbd/io/ImageDispatchSpec.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::io::QueueImageDispatch: " << this \ + << " " << __func__ << ": " + +namespace librbd { +namespace io { + +template <typename I> +QueueImageDispatch<I>::QueueImageDispatch(I* image_ctx) + : m_image_ctx(image_ctx), m_flush_tracker(new FlushTracker<I>(image_ctx)) { + auto cct = m_image_ctx->cct; + ldout(cct, 5) << "ictx=" << image_ctx << dendl; +} + +template <typename I> +QueueImageDispatch<I>::~QueueImageDispatch() { + delete m_flush_tracker; +} + +template <typename I> +void QueueImageDispatch<I>::shut_down(Context* on_finish) { + m_flush_tracker->shut_down(); + on_finish->complete(0); +} + +template <typename I> +bool QueueImageDispatch<I>::read( + AioCompletion* aio_comp, Extents &&image_extents, ReadResult &&read_result, + IOContext io_context, int op_flags, int read_flags, + const ZTracer::Trace &parent_trace, uint64_t tid, + std::atomic<uint32_t>* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "tid=" << tid << dendl; + + return enqueue(true, tid, dispatch_result, on_finish, on_dispatched); +} + +template <typename I> +bool QueueImageDispatch<I>::write( + AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&bl, + int op_flags, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "tid=" << tid << dendl; + + return enqueue(false, tid, dispatch_result, on_finish, on_dispatched); +} + +template <typename I> +bool QueueImageDispatch<I>::discard( + 
AioCompletion* aio_comp, Extents &&image_extents, + uint32_t discard_granularity_bytes, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "tid=" << tid << dendl; + + return enqueue(false, tid, dispatch_result, on_finish, on_dispatched); +} + +template <typename I> +bool QueueImageDispatch<I>::write_same( + AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&bl, + int op_flags, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "tid=" << tid << dendl; + + return enqueue(false, tid, dispatch_result, on_finish, on_dispatched); +} + +template <typename I> +bool QueueImageDispatch<I>::compare_and_write( + AioCompletion* aio_comp, Extents &&image_extents, + bufferlist &&cmp_bl, bufferlist &&bl, uint64_t *mismatch_offset, + int op_flags, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "tid=" << tid << dendl; + + return enqueue(false, tid, dispatch_result, on_finish, on_dispatched); +} + +template <typename I> +bool QueueImageDispatch<I>::flush( + AioCompletion* aio_comp, FlushSource flush_source, + const ZTracer::Trace &parent_trace, uint64_t tid, + std::atomic<uint32_t>* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "tid=" << tid << dendl; + + *dispatch_result = DISPATCH_RESULT_CONTINUE; + m_flush_tracker->flush(on_dispatched); + return true; +} + +template <typename I> +void 
QueueImageDispatch<I>::handle_finished(int r, uint64_t tid) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "tid=" << tid << dendl; + + m_flush_tracker->finish_io(tid); +} + +template <typename I> +bool QueueImageDispatch<I>::enqueue( + bool read_op, uint64_t tid, DispatchResult* dispatch_result, + Context** on_finish, Context* on_dispatched) { + if (!m_image_ctx->non_blocking_aio) { + return false; + } + + if (!read_op) { + m_flush_tracker->start_io(tid); + *on_finish = new LambdaContext([this, tid, on_finish=*on_finish](int r) { + handle_finished(r, tid); + on_finish->complete(r); + }); + } + + *dispatch_result = DISPATCH_RESULT_CONTINUE; + m_image_ctx->asio_engine->post(on_dispatched, 0); + return true; +} + +} // namespace io +} // namespace librbd + +template class librbd::io::QueueImageDispatch<librbd::ImageCtx>; diff --git a/src/librbd/io/QueueImageDispatch.h b/src/librbd/io/QueueImageDispatch.h new file mode 100644 index 000000000..9a41927ba --- /dev/null +++ b/src/librbd/io/QueueImageDispatch.h @@ -0,0 +1,110 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_IO_QUEUE_IMAGE_DISPATCH_H +#define CEPH_LIBRBD_IO_QUEUE_IMAGE_DISPATCH_H + +#include "librbd/io/ImageDispatchInterface.h" +#include "include/int_types.h" +#include "include/buffer.h" +#include "common/zipkin_trace.h" +#include "common/Throttle.h" +#include "librbd/io/ReadResult.h" +#include "librbd/io/Types.h" +#include <list> +#include <set> + +struct Context; + +namespace librbd { + +struct ImageCtx; + +namespace io { + +struct AioCompletion; +template <typename> class FlushTracker; + +template <typename ImageCtxT> +class QueueImageDispatch : public ImageDispatchInterface { +public: + QueueImageDispatch(ImageCtxT* image_ctx); + ~QueueImageDispatch(); + + ImageDispatchLayer get_dispatch_layer() const override { + return IMAGE_DISPATCH_LAYER_QUEUE; + } + + void shut_down(Context* on_finish) override; + + bool read( + 
AioCompletion* aio_comp, Extents &&image_extents, + ReadResult &&read_result, IOContext io_context, int op_flags, + int read_flags, const ZTracer::Trace &parent_trace, uint64_t tid, + std::atomic<uint32_t>* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override; + bool write( + AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&bl, + int op_flags, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override; + bool discard( + AioCompletion* aio_comp, Extents &&image_extents, + uint32_t discard_granularity_bytes, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override; + bool write_same( + AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&bl, + int op_flags, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override; + bool compare_and_write( + AioCompletion* aio_comp, Extents &&image_extents, + bufferlist &&cmp_bl, bufferlist &&bl, uint64_t *mismatch_offset, + int op_flags, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override; + bool flush( + AioCompletion* aio_comp, FlushSource flush_source, + const ZTracer::Trace &parent_trace, uint64_t tid, + std::atomic<uint32_t>* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override; + + bool list_snaps( + AioCompletion* aio_comp, Extents&& image_extents, SnapIds&& snap_ids, + int list_snaps_flags, SnapshotDelta* snapshot_delta, + const ZTracer::Trace &parent_trace, uint64_t 
tid, + std::atomic<uint32_t>* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override { + return false; + } + + bool invalidate_cache(Context* on_finish) override { + return false; + } + +private: + ImageCtxT* m_image_ctx; + + FlushTracker<ImageCtxT>* m_flush_tracker; + + void handle_finished(int r, uint64_t tid); + + bool enqueue(bool read_op, uint64_t tid, DispatchResult* dispatch_result, + Context** on_finish, Context* on_dispatched); + +}; + +} // namespace io +} // namespace librbd + +extern template class librbd::io::QueueImageDispatch<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_IO_QUEUE_IMAGE_DISPATCH_H diff --git a/src/librbd/io/ReadResult.cc b/src/librbd/io/ReadResult.cc new file mode 100644 index 000000000..c4053fee6 --- /dev/null +++ b/src/librbd/io/ReadResult.cc @@ -0,0 +1,262 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/io/ReadResult.h" +#include "include/buffer.h" +#include "common/dout.h" +#include "librbd/io/AioCompletion.h" +#include "librbd/io/Utils.h" +#include <boost/variant/apply_visitor.hpp> +#include <boost/variant/static_visitor.hpp> + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::io::ReadResult: " << this \ + << " " << __func__ << ": " + +namespace librbd { +namespace io { + +struct ReadResult::SetImageExtentsVisitor : public boost::static_visitor<void> { + Extents image_extents; + + explicit SetImageExtentsVisitor(const Extents& image_extents) + : image_extents(image_extents) { + } + + void operator()(Linear &linear) const { + uint64_t length = util::get_extents_length(image_extents); + + ceph_assert(length <= linear.buf_len); + linear.buf_len = length; + } + + void operator()(SparseBufferlist &sbl) const { + sbl.image_extents = image_extents; + } + + template <typename T> + void operator()(T &t) const { + } +}; + +struct ReadResult::AssembleResultVisitor : 
public boost::static_visitor<void> { + CephContext *cct; + Striper::StripedReadResult &destriper; + + AssembleResultVisitor(CephContext *cct, Striper::StripedReadResult &destriper) + : cct(cct), destriper(destriper) { + } + + void operator()(Empty &empty) const { + ldout(cct, 20) << "dropping read result" << dendl; + } + + void operator()(Linear &linear) const { + ldout(cct, 20) << "copying resulting bytes to " + << reinterpret_cast<void*>(linear.buf) << dendl; + destriper.assemble_result(cct, linear.buf, linear.buf_len); + } + + void operator()(Vector &vector) const { + bufferlist bl; + destriper.assemble_result(cct, bl, true); + + ldout(cct, 20) << "copying resulting " << bl.length() << " bytes to iovec " + << reinterpret_cast<const void*>(vector.iov) << dendl; + + bufferlist::iterator it = bl.begin(); + size_t length = bl.length(); + size_t offset = 0; + int idx = 0; + for (; offset < length && idx < vector.iov_count; idx++) { + size_t len = std::min(vector.iov[idx].iov_len, length - offset); + it.copy(len, static_cast<char *>(vector.iov[idx].iov_base)); + offset += len; + } + ceph_assert(offset == bl.length()); + } + + void operator()(Bufferlist &bufferlist) const { + bufferlist.bl->clear(); + destriper.assemble_result(cct, *bufferlist.bl, true); + + ldout(cct, 20) << "moved resulting " << bufferlist.bl->length() << " " + << "bytes to bl " << reinterpret_cast<void*>(bufferlist.bl) + << dendl; + } + + void operator()(SparseBufferlist &sparse_bufferlist) const { + sparse_bufferlist.bl->clear(); + + ExtentMap buffer_extent_map; + auto buffer_extents_length = destriper.assemble_result( + cct, &buffer_extent_map, sparse_bufferlist.bl); + + ldout(cct, 20) << "image_extents=" + << sparse_bufferlist.image_extents << ", " + << "buffer_extent_map=" << buffer_extent_map << dendl; + + sparse_bufferlist.extent_map->clear(); + sparse_bufferlist.extent_map->reserve(buffer_extent_map.size()); + + // The extent-map is logically addressed by buffer-extents not image- or + // 
object-extents. Translate this address mapping to image-extent + // logical addressing since it's tied to an image-extent read + uint64_t buffer_offset = 0; + auto bem_it = buffer_extent_map.begin(); + for (auto [image_offset, image_length] : sparse_bufferlist.image_extents) { + while (bem_it != buffer_extent_map.end()) { + auto [buffer_extent_offset, buffer_extent_length] = *bem_it; + + if (buffer_offset + image_length <= buffer_extent_offset) { + // skip any image extent that is not included in the results + break; + } + + // current buffer-extent should be within the current image-extent + ceph_assert(buffer_offset <= buffer_extent_offset && + buffer_offset + image_length >= + buffer_extent_offset + buffer_extent_length); + auto image_extent_offset = + image_offset + (buffer_extent_offset - buffer_offset); + ldout(cct, 20) << "mapping buffer extent " << buffer_extent_offset + << "~" << buffer_extent_length << " to image extent " + << image_extent_offset << "~" << buffer_extent_length + << dendl; + sparse_bufferlist.extent_map->emplace_back( + image_extent_offset, buffer_extent_length); + ++bem_it; + } + + buffer_offset += image_length; + } + ceph_assert(buffer_offset == buffer_extents_length); + ceph_assert(bem_it == buffer_extent_map.end()); + + ldout(cct, 20) << "moved resulting " << *sparse_bufferlist.extent_map + << " extents of total " << sparse_bufferlist.bl->length() + << " bytes to bl " + << reinterpret_cast<void*>(sparse_bufferlist.bl) << dendl; + } +}; + +ReadResult::C_ImageReadRequest::C_ImageReadRequest( + AioCompletion *aio_completion, uint64_t buffer_offset, + const Extents image_extents) + : aio_completion(aio_completion), buffer_offset(buffer_offset), + image_extents(image_extents) { + aio_completion->add_request(); +} + +void ReadResult::C_ImageReadRequest::finish(int r) { + CephContext *cct = aio_completion->ictx->cct; + ldout(cct, 10) << "C_ImageReadRequest: r=" << r + << dendl; + if (r >= 0 || (ignore_enoent && r == -ENOENT)) { + 
striper::LightweightBufferExtents buffer_extents; + size_t length = 0; + for (auto &image_extent : image_extents) { + buffer_extents.emplace_back(buffer_offset + length, image_extent.second); + length += image_extent.second; + } + ceph_assert(r == -ENOENT || length == bl.length()); + + aio_completion->lock.lock(); + aio_completion->read_result.m_destriper.add_partial_result( + cct, std::move(bl), buffer_extents); + aio_completion->lock.unlock(); + r = length; + } + + aio_completion->complete_request(r); +} + +ReadResult::C_ObjectReadRequest::C_ObjectReadRequest( + AioCompletion *aio_completion, ReadExtents&& extents) + : aio_completion(aio_completion), extents(std::move(extents)) { + aio_completion->add_request(); +} + +void ReadResult::C_ObjectReadRequest::finish(int r) { + CephContext *cct = aio_completion->ictx->cct; + ldout(cct, 10) << "C_ObjectReadRequest: r=" << r + << dendl; + + if (r == -ENOENT) { + r = 0; + } + if (r >= 0) { + uint64_t object_len = 0; + aio_completion->lock.lock(); + for (auto& extent: extents) { + ldout(cct, 10) << " got " << extent.extent_map + << " for " << extent.buffer_extents + << " bl " << extent.bl.length() << dendl; + + aio_completion->read_result.m_destriper.add_partial_sparse_result( + cct, std::move(extent.bl), extent.extent_map, extent.offset, + extent.buffer_extents); + + object_len += extent.length; + } + aio_completion->lock.unlock(); + r = object_len; + } + + aio_completion->complete_request(r); +} + +ReadResult::C_ObjectReadMergedExtents::C_ObjectReadMergedExtents( + CephContext* cct, ReadExtents* extents, Context* on_finish) + : cct(cct), extents(extents), on_finish(on_finish) { +} + +void ReadResult::C_ObjectReadMergedExtents::finish(int r) { + if (r >= 0) { + for (auto& extent: *extents) { + if (bl.length() < extent.length) { + lderr(cct) << "Merged extents length is less than expected" << dendl; + r = -EIO; + break; + } + bl.splice(0, extent.length, &extent.bl); + } + if (bl.length() != 0) { + lderr(cct) << "Merged 
extents length is greater than expected" << dendl; + r = -EIO; + } + } + on_finish->complete(r); +} + +ReadResult::ReadResult() : m_buffer(Empty()) { +} + +ReadResult::ReadResult(char *buf, size_t buf_len) + : m_buffer(Linear(buf, buf_len)) { +} + +ReadResult::ReadResult(const struct iovec *iov, int iov_count) + : m_buffer(Vector(iov, iov_count)) { +} + +ReadResult::ReadResult(ceph::bufferlist *bl) + : m_buffer(Bufferlist(bl)) { +} + +ReadResult::ReadResult(Extents* extent_map, ceph::bufferlist* bl) + : m_buffer(SparseBufferlist(extent_map, bl)) { +} + +void ReadResult::set_image_extents(const Extents& image_extents) { + boost::apply_visitor(SetImageExtentsVisitor(image_extents), m_buffer); +} + +void ReadResult::assemble_result(CephContext *cct) { + boost::apply_visitor(AssembleResultVisitor(cct, m_destriper), m_buffer); +} + +} // namespace io +} // namespace librbd + diff --git a/src/librbd/io/ReadResult.h b/src/librbd/io/ReadResult.h new file mode 100644 index 000000000..12a1e78cc --- /dev/null +++ b/src/librbd/io/ReadResult.h @@ -0,0 +1,129 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_IO_READ_RESULT_H +#define CEPH_LIBRBD_IO_READ_RESULT_H + +#include "include/common_fwd.h" +#include "include/int_types.h" +#include "include/buffer_fwd.h" +#include "include/Context.h" +#include "librbd/io/Types.h" +#include "osdc/Striper.h" +#include <sys/uio.h> +#include <boost/variant/variant.hpp> + + +namespace librbd { + +struct ImageCtx; + +namespace io { + +struct AioCompletion; +template <typename> struct ObjectReadRequest; + +class ReadResult { +public: + struct C_ImageReadRequest : public Context { + AioCompletion *aio_completion; + uint64_t buffer_offset = 0; + Extents image_extents; + bufferlist bl; + bool ignore_enoent = false; + + C_ImageReadRequest(AioCompletion *aio_completion, + uint64_t buffer_offset, + const Extents image_extents); + + void finish(int r) override; + }; + + struct 
C_ObjectReadRequest : public Context { + AioCompletion *aio_completion; + ReadExtents extents; + + C_ObjectReadRequest(AioCompletion *aio_completion, ReadExtents&& extents); + + void finish(int r) override; + }; + + struct C_ObjectReadMergedExtents : public Context { + CephContext* cct; + ReadExtents* extents; + Context *on_finish; + bufferlist bl; + + C_ObjectReadMergedExtents(CephContext* cct, ReadExtents* extents, + Context* on_finish); + + void finish(int r) override; + }; + + ReadResult(); + ReadResult(char *buf, size_t buf_len); + ReadResult(const struct iovec *iov, int iov_count); + ReadResult(ceph::bufferlist *bl); + ReadResult(Extents* extent_map, ceph::bufferlist* bl); + + void set_image_extents(const Extents& image_extents); + + void assemble_result(CephContext *cct); + +private: + struct Empty { + }; + + struct Linear { + char *buf; + size_t buf_len; + + Linear(char *buf, size_t buf_len) : buf(buf), buf_len(buf_len) { + } + }; + + struct Vector { + const struct iovec *iov; + int iov_count; + + Vector(const struct iovec *iov, int iov_count) + : iov(iov), iov_count(iov_count) { + } + }; + + struct Bufferlist { + ceph::bufferlist *bl; + + Bufferlist(ceph::bufferlist *bl) : bl(bl) { + } + }; + + struct SparseBufferlist { + Extents *extent_map; + ceph::bufferlist *bl; + + Extents image_extents; + + SparseBufferlist(Extents* extent_map, ceph::bufferlist* bl) + : extent_map(extent_map), bl(bl) { + } + }; + + typedef boost::variant<Empty, + Linear, + Vector, + Bufferlist, + SparseBufferlist> Buffer; + struct SetImageExtentsVisitor; + struct AssembleResultVisitor; + + Buffer m_buffer; + Striper::StripedReadResult m_destriper; + +}; + +} // namespace io +} // namespace librbd + +#endif // CEPH_LIBRBD_IO_READ_RESULT_H + diff --git a/src/librbd/io/RefreshImageDispatch.cc b/src/librbd/io/RefreshImageDispatch.cc new file mode 100644 index 000000000..3141faf25 --- /dev/null +++ b/src/librbd/io/RefreshImageDispatch.cc @@ -0,0 +1,166 @@ +// -*- mode:C++; tab-width:8; 
c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/io/RefreshImageDispatch.h" +#include "common/dout.h" +#include "librbd/ImageCtx.h" +#include "librbd/ImageState.h" +#include <map> + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::io::RefreshImageDispatch: " << this \ + << " " << __func__ << ": " + +namespace librbd { +namespace io { + +template <typename I> +RefreshImageDispatch<I>::RefreshImageDispatch(I* image_ctx) + : m_image_ctx(image_ctx) { + auto cct = m_image_ctx->cct; + ldout(cct, 5) << "ictx=" << image_ctx << dendl; +} + +template <typename I> +void RefreshImageDispatch<I>::shut_down(Context* on_finish) { + on_finish->complete(0); +} + +template <typename I> +bool RefreshImageDispatch<I>::read( + AioCompletion* aio_comp, Extents &&image_extents, ReadResult &&read_result, + IOContext io_context, int op_flags, int read_flags, + const ZTracer::Trace &parent_trace, uint64_t tid, + std::atomic<uint32_t>* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "tid=" << tid << ", image_extents=" << image_extents + << dendl; + + if (needs_refresh(dispatch_result, on_dispatched)) { + return true; + } + + return false; +} + +template <typename I> +bool RefreshImageDispatch<I>::write( + AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&bl, + int op_flags, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "tid=" << tid << ", image_extents=" << image_extents + << dendl; + + if (needs_refresh(dispatch_result, on_dispatched)) { + return true; + } + + return false; +} + +template <typename I> +bool RefreshImageDispatch<I>::discard( + AioCompletion* aio_comp, Extents &&image_extents, + uint32_t 
discard_granularity_bytes, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "tid=" << tid << ", image_extents=" << image_extents + << dendl; + + if (needs_refresh(dispatch_result, on_dispatched)) { + return true; + } + + return false; +} + +template <typename I> +bool RefreshImageDispatch<I>::write_same( + AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&bl, + int op_flags, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "tid=" << tid << ", image_extents=" << image_extents + << dendl; + + if (needs_refresh(dispatch_result, on_dispatched)) { + return true; + } + + return false; +} + +template <typename I> +bool RefreshImageDispatch<I>::compare_and_write( + AioCompletion* aio_comp, Extents &&image_extents, + bufferlist &&cmp_bl, bufferlist &&bl, uint64_t *mismatch_offset, + int op_flags, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "tid=" << tid << ", image_extents=" << image_extents + << dendl; + + if (needs_refresh(dispatch_result, on_dispatched)) { + return true; + } + + return false; +} + +template <typename I> +bool RefreshImageDispatch<I>::flush( + AioCompletion* aio_comp, FlushSource flush_source, + const ZTracer::Trace &parent_trace, uint64_t tid, + std::atomic<uint32_t>* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "tid=" << tid << dendl; + + // The refresh state machine can initiate a flush 
and it can + // enable the exclusive-lock which will also attempt to flush. + if (flush_source == FLUSH_SOURCE_REFRESH || + flush_source == FLUSH_SOURCE_EXCLUSIVE_LOCK_SKIP_REFRESH || + flush_source == FLUSH_SOURCE_SHUTDOWN) { + return false; + } + + if (needs_refresh(dispatch_result, on_dispatched)) { + return true; + } + + return false; +} + +template <typename I> +bool RefreshImageDispatch<I>::needs_refresh( + DispatchResult* dispatch_result, Context* on_dispatched) { + auto cct = m_image_ctx->cct; + + if (m_image_ctx->state->is_refresh_required()) { + ldout(cct, 15) << "on_dispatched=" << on_dispatched << dendl; + + *dispatch_result = DISPATCH_RESULT_CONTINUE; + m_image_ctx->state->refresh(on_dispatched); + return true; + } + + return false; +} + +} // namespace io +} // namespace librbd + +template class librbd::io::RefreshImageDispatch<librbd::ImageCtx>; diff --git a/src/librbd/io/RefreshImageDispatch.h b/src/librbd/io/RefreshImageDispatch.h new file mode 100644 index 000000000..668dec419 --- /dev/null +++ b/src/librbd/io/RefreshImageDispatch.h @@ -0,0 +1,101 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_IO_REFRESH_IMAGE_DISPATCH_H +#define CEPH_LIBRBD_IO_REFRESH_IMAGE_DISPATCH_H + +#include "librbd/io/ImageDispatchInterface.h" +#include "include/int_types.h" +#include "include/buffer.h" +#include "common/zipkin_trace.h" +#include "common/Throttle.h" +#include "librbd/io/ReadResult.h" +#include "librbd/io/Types.h" + +struct Context; + +namespace librbd { + +struct ImageCtx; + +namespace io { + +struct AioCompletion; + +template <typename ImageCtxT> +class RefreshImageDispatch : public ImageDispatchInterface { +public: + RefreshImageDispatch(ImageCtxT* image_ctx); + + ImageDispatchLayer get_dispatch_layer() const override { + return IMAGE_DISPATCH_LAYER_REFRESH; + } + + void shut_down(Context* on_finish) override; + + bool read( + AioCompletion* aio_comp, Extents &&image_extents, + 
ReadResult &&read_result, IOContext io_context, int op_flags, + int read_flags, const ZTracer::Trace &parent_trace, uint64_t tid, + std::atomic<uint32_t>* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override; + bool write( + AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&bl, + int op_flags, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override; + bool discard( + AioCompletion* aio_comp, Extents &&image_extents, + uint32_t discard_granularity_bytes, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override; + bool write_same( + AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&bl, + int op_flags, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override; + bool compare_and_write( + AioCompletion* aio_comp, Extents &&image_extents, + bufferlist &&cmp_bl, bufferlist &&bl, uint64_t *mismatch_offset, + int op_flags, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override; + bool flush( + AioCompletion* aio_comp, FlushSource flush_source, + const ZTracer::Trace &parent_trace, uint64_t tid, + std::atomic<uint32_t>* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override; + + bool list_snaps( + AioCompletion* aio_comp, Extents&& image_extents, SnapIds&& snap_ids, + int list_snaps_flags, SnapshotDelta* snapshot_delta, + const ZTracer::Trace &parent_trace, uint64_t tid, + std::atomic<uint32_t>* image_dispatch_flags, 
+ DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override { + return false; + } + + bool invalidate_cache(Context* on_finish) override { + return false; + } + +private: + ImageCtxT* m_image_ctx; + + bool needs_refresh(DispatchResult* dispatch_result, Context* on_dispatched); + +}; + +} // namespace io +} // namespace librbd + +extern template class librbd::io::RefreshImageDispatch<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_IO_REFRESH_IMAGE_DISPATCH_H diff --git a/src/librbd/io/SimpleSchedulerObjectDispatch.cc b/src/librbd/io/SimpleSchedulerObjectDispatch.cc new file mode 100644 index 000000000..cd2ffb197 --- /dev/null +++ b/src/librbd/io/SimpleSchedulerObjectDispatch.cc @@ -0,0 +1,565 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/io/SimpleSchedulerObjectDispatch.h" +#include "include/neorados/RADOS.hpp" +#include "common/ceph_time.h" +#include "common/Timer.h" +#include "common/errno.h" +#include "librbd/AsioEngine.h" +#include "librbd/ImageCtx.h" +#include "librbd/Utils.h" +#include "librbd/io/FlushTracker.h" +#include "librbd/io/ObjectDispatchSpec.h" +#include "librbd/io/ObjectDispatcher.h" +#include "librbd/io/Utils.h" + +#include <boost/accumulators/accumulators.hpp> +#include <boost/accumulators/statistics/rolling_count.hpp> +#include <boost/accumulators/statistics/rolling_sum.hpp> +#include <boost/accumulators/statistics/stats.hpp> + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::io::SimpleSchedulerObjectDispatch: " \ + << this << " " << __func__ << ": " + +namespace librbd { +namespace io { + +using namespace boost::accumulators; +using ceph::operator<<; +using librbd::util::data_object_name; + +static const int LATENCY_STATS_WINDOW_SIZE = 10; + +class LatencyStats { +private: + accumulator_set<uint64_t, stats<tag::rolling_count, tag::rolling_sum>> m_acc; + +public: + LatencyStats() + : 
m_acc(tag::rolling_window::window_size = LATENCY_STATS_WINDOW_SIZE) { + } + + bool is_ready() const { + return rolling_count(m_acc) == LATENCY_STATS_WINDOW_SIZE; + } + + void add(uint64_t latency) { + m_acc(latency); + } + + uint64_t avg() const { + auto count = rolling_count(m_acc); + + if (count > 0) { + return rolling_sum(m_acc); + } + return 0; + } +}; + +template <typename I> +bool SimpleSchedulerObjectDispatch<I>::ObjectRequests::try_delay_request( + uint64_t object_off, ceph::bufferlist&& data, IOContext io_context, + int op_flags, int object_dispatch_flags, Context* on_dispatched) { + if (!m_delayed_requests.empty()) { + if (!m_io_context || *m_io_context != *io_context || + op_flags != m_op_flags || data.length() == 0 || + intersects(object_off, data.length())) { + return false; + } + } else { + m_io_context = io_context; + m_op_flags = op_flags; + } + + if (data.length() == 0) { + // a zero length write is usually a special case, + // and we don't want it to be merged with others + ceph_assert(m_delayed_requests.empty()); + m_delayed_request_extents.insert(0, UINT64_MAX); + } else { + m_delayed_request_extents.insert(object_off, data.length()); + } + m_object_dispatch_flags |= object_dispatch_flags; + + if (!m_delayed_requests.empty()) { + // try to merge front to an existing request + auto iter = m_delayed_requests.find(object_off + data.length()); + if (iter != m_delayed_requests.end()) { + auto new_iter = m_delayed_requests.insert({object_off, {}}).first; + new_iter->second.data = std::move(data); + new_iter->second.data.append(std::move(iter->second.data)); + new_iter->second.requests = std::move(iter->second.requests); + new_iter->second.requests.push_back(on_dispatched); + m_delayed_requests.erase(iter); + + if (new_iter != m_delayed_requests.begin()) { + auto prev = new_iter; + try_merge_delayed_requests(--prev, new_iter); + } + return true; + } + + // try to merge back to an existing request + iter = m_delayed_requests.lower_bound(object_off); + 
if (iter != m_delayed_requests.begin() && + (iter == m_delayed_requests.end() || iter->first > object_off)) { + iter--; + } + if (iter != m_delayed_requests.end() && + iter->first + iter->second.data.length() == object_off) { + iter->second.data.append(std::move(data)); + iter->second.requests.push_back(on_dispatched); + + auto next = iter; + if (++next != m_delayed_requests.end()) { + try_merge_delayed_requests(iter, next); + } + return true; + } + } + + // create a new request + auto iter = m_delayed_requests.insert({object_off, {}}).first; + iter->second.data = std::move(data); + iter->second.requests.push_back(on_dispatched); + return true; +} + +template <typename I> +void SimpleSchedulerObjectDispatch<I>::ObjectRequests::try_merge_delayed_requests( + typename std::map<uint64_t, MergedRequests>::iterator &iter1, + typename std::map<uint64_t, MergedRequests>::iterator &iter2) { + if (iter1->first + iter1->second.data.length() != iter2->first) { + return; + } + + iter1->second.data.append(std::move(iter2->second.data)); + iter1->second.requests.insert(iter1->second.requests.end(), + iter2->second.requests.begin(), + iter2->second.requests.end()); + m_delayed_requests.erase(iter2); +} + +template <typename I> +void SimpleSchedulerObjectDispatch<I>::ObjectRequests::dispatch_delayed_requests( + I *image_ctx, LatencyStats *latency_stats, ceph::mutex *latency_stats_lock) { + for (auto &it : m_delayed_requests) { + auto offset = it.first; + auto &merged_requests = it.second; + + auto ctx = new LambdaContext( + [requests=std::move(merged_requests.requests), latency_stats, + latency_stats_lock, start_time=ceph_clock_now()](int r) { + if (latency_stats) { + std::lock_guard locker{*latency_stats_lock}; + auto latency = ceph_clock_now() - start_time; + latency_stats->add(latency.to_nsec()); + } + for (auto on_dispatched : requests) { + on_dispatched->complete(r); + } + }); + + auto req = ObjectDispatchSpec::create_write( + image_ctx, OBJECT_DISPATCH_LAYER_SCHEDULER, + 
m_object_no, offset, std::move(merged_requests.data), m_io_context, + m_op_flags, 0, std::nullopt, 0, {}, ctx); + + req->object_dispatch_flags = m_object_dispatch_flags; + req->send(); + } + + m_dispatch_time = {}; +} + +template <typename I> +SimpleSchedulerObjectDispatch<I>::SimpleSchedulerObjectDispatch( + I* image_ctx) + : m_image_ctx(image_ctx), + m_flush_tracker(new FlushTracker<I>(image_ctx)), + m_lock(ceph::make_mutex(librbd::util::unique_lock_name( + "librbd::io::SimpleSchedulerObjectDispatch::lock", this))), + m_max_delay(image_ctx->config.template get_val<uint64_t>( + "rbd_io_scheduler_simple_max_delay")) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 5) << "ictx=" << image_ctx << dendl; + + I::get_timer_instance(cct, &m_timer, &m_timer_lock); + + if (m_max_delay == 0) { + m_latency_stats = std::make_unique<LatencyStats>(); + } +} + +template <typename I> +SimpleSchedulerObjectDispatch<I>::~SimpleSchedulerObjectDispatch() { + delete m_flush_tracker; +} + +template <typename I> +void SimpleSchedulerObjectDispatch<I>::init() { + auto cct = m_image_ctx->cct; + ldout(cct, 5) << dendl; + + // add ourself to the IO object dispatcher chain + m_image_ctx->io_object_dispatcher->register_dispatch(this); +} + +template <typename I> +void SimpleSchedulerObjectDispatch<I>::shut_down(Context* on_finish) { + auto cct = m_image_ctx->cct; + ldout(cct, 5) << dendl; + + m_flush_tracker->shut_down(); + on_finish->complete(0); +} + +template <typename I> +bool SimpleSchedulerObjectDispatch<I>::read( + uint64_t object_no, ReadExtents* extents, IOContext io_context, + int op_flags, int read_flags, const ZTracer::Trace &parent_trace, + uint64_t* version, int* object_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << data_object_name(m_image_ctx, object_no) << " " << extents + << dendl; + + std::lock_guard locker{m_lock}; + for (auto& extent : *extents) { + if 
(intersects(object_no, extent.offset, extent.length)) { + dispatch_delayed_requests(object_no); + break; + } + } + + return false; +} + +template <typename I> +bool SimpleSchedulerObjectDispatch<I>::discard( + uint64_t object_no, uint64_t object_off, uint64_t object_len, + IOContext io_context, int discard_flags, + const ZTracer::Trace &parent_trace, int* object_dispatch_flags, + uint64_t* journal_tid, DispatchResult* dispatch_result, + Context** on_finish, Context* on_dispatched) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << data_object_name(m_image_ctx, object_no) << " " + << object_off << "~" << object_len << dendl; + + std::lock_guard locker{m_lock}; + dispatch_delayed_requests(object_no); + register_in_flight_request(object_no, {}, on_finish); + + return false; +} + +template <typename I> +bool SimpleSchedulerObjectDispatch<I>::write( + uint64_t object_no, uint64_t object_off, ceph::bufferlist&& data, + IOContext io_context, int op_flags, int write_flags, + std::optional<uint64_t> assert_version, + const ZTracer::Trace &parent_trace, int* object_dispatch_flags, + uint64_t* journal_tid, DispatchResult* dispatch_result, + Context** on_finish, Context* on_dispatched) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << data_object_name(m_image_ctx, object_no) << " " + << object_off << "~" << data.length() << dendl; + + std::lock_guard locker{m_lock}; + + // don't try to batch assert version writes + if (assert_version.has_value() || + (write_flags & OBJECT_WRITE_FLAG_CREATE_EXCLUSIVE) != 0) { + dispatch_delayed_requests(object_no); + return false; + } + + if (try_delay_write(object_no, object_off, std::move(data), io_context, + op_flags, *object_dispatch_flags, on_dispatched)) { + + auto dispatch_seq = ++m_dispatch_seq; + m_flush_tracker->start_io(dispatch_seq); + *on_finish = new LambdaContext( + [this, dispatch_seq, ctx=*on_finish](int r) { + ctx->complete(r); + m_flush_tracker->finish_io(dispatch_seq); + }); + + *dispatch_result = 
DISPATCH_RESULT_COMPLETE; + return true; + } + + dispatch_delayed_requests(object_no); + register_in_flight_request(object_no, ceph_clock_now(), on_finish); + + return false; +} + +template <typename I> +bool SimpleSchedulerObjectDispatch<I>::write_same( + uint64_t object_no, uint64_t object_off, uint64_t object_len, + LightweightBufferExtents&& buffer_extents, ceph::bufferlist&& data, + IOContext io_context, int op_flags, + const ZTracer::Trace &parent_trace, int* object_dispatch_flags, + uint64_t* journal_tid, DispatchResult* dispatch_result, + Context** on_finish, Context* on_dispatched) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << data_object_name(m_image_ctx, object_no) << " " + << object_off << "~" << object_len << dendl; + + std::lock_guard locker{m_lock}; + dispatch_delayed_requests(object_no); + register_in_flight_request(object_no, {}, on_finish); + + return false; +} + +template <typename I> +bool SimpleSchedulerObjectDispatch<I>::compare_and_write( + uint64_t object_no, uint64_t object_off, ceph::bufferlist&& cmp_data, + ceph::bufferlist&& write_data, IOContext io_context, int op_flags, + const ZTracer::Trace &parent_trace, uint64_t* mismatch_offset, + int* object_dispatch_flags, uint64_t* journal_tid, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << data_object_name(m_image_ctx, object_no) << " " + << object_off << "~" << cmp_data.length() << dendl; + + std::lock_guard locker{m_lock}; + dispatch_delayed_requests(object_no); + register_in_flight_request(object_no, {}, on_finish); + + return false; +} + +template <typename I> +bool SimpleSchedulerObjectDispatch<I>::flush( + FlushSource flush_source, const ZTracer::Trace &parent_trace, + uint64_t* journal_tid, DispatchResult* dispatch_result, + Context** on_finish, Context* on_dispatched) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << dendl; + + { + std::lock_guard locker{m_lock}; + 
dispatch_all_delayed_requests(); + } + + *dispatch_result = DISPATCH_RESULT_CONTINUE; + m_flush_tracker->flush(on_dispatched); + + return true; +} + +template <typename I> +bool SimpleSchedulerObjectDispatch<I>::intersects( + uint64_t object_no, uint64_t object_off, uint64_t len) const { + ceph_assert(ceph_mutex_is_locked(m_lock)); + auto cct = m_image_ctx->cct; + + auto it = m_requests.find(object_no); + bool intersects = (it != m_requests.end()) && + it->second->intersects(object_off, len); + + ldout(cct, 20) << intersects << dendl; + + return intersects; +} + +template <typename I> +bool SimpleSchedulerObjectDispatch<I>::try_delay_write( + uint64_t object_no, uint64_t object_off, ceph::bufferlist&& data, + IOContext io_context, int op_flags, int object_dispatch_flags, + Context* on_dispatched) { + ceph_assert(ceph_mutex_is_locked(m_lock)); + auto cct = m_image_ctx->cct; + + if (m_latency_stats && !m_latency_stats->is_ready()) { + ldout(cct, 20) << "latency stats not collected yet" << dendl; + return false; + } + + auto it = m_requests.find(object_no); + if (it == m_requests.end()) { + ldout(cct, 20) << "no pending requests" << dendl; + return false; + } + + auto &object_requests = it->second; + bool delayed = object_requests->try_delay_request( + object_off, std::move(data), io_context, op_flags, object_dispatch_flags, + on_dispatched); + + ldout(cct, 20) << "delayed: " << delayed << dendl; + + // schedule dispatch on the first request added + if (delayed && !object_requests->is_scheduled_dispatch()) { + auto dispatch_time = ceph::real_clock::now(); + if (m_latency_stats) { + dispatch_time += std::chrono::nanoseconds(m_latency_stats->avg() / 2); + } else { + dispatch_time += std::chrono::milliseconds(m_max_delay); + } + object_requests->set_scheduled_dispatch(dispatch_time); + m_dispatch_queue.push_back(object_requests); + if (m_dispatch_queue.front() == object_requests) { + schedule_dispatch_delayed_requests(); + } + } + + return delayed; +} + +template 
<typename I> +void SimpleSchedulerObjectDispatch<I>::dispatch_all_delayed_requests() { + ceph_assert(ceph_mutex_is_locked(m_lock)); + auto cct = m_image_ctx->cct; + ldout(cct, 20) << dendl; + + while (!m_requests.empty()) { + auto it = m_requests.begin(); + dispatch_delayed_requests(it->second); + m_requests.erase(it); + } +} + +template <typename I> +void SimpleSchedulerObjectDispatch<I>::register_in_flight_request( + uint64_t object_no, const utime_t &start_time, Context **on_finish) { + auto res = m_requests.insert( + {object_no, std::make_shared<ObjectRequests>(object_no)}); + ceph_assert(res.second); + auto it = res.first; + + auto dispatch_seq = ++m_dispatch_seq; + m_flush_tracker->start_io(dispatch_seq); + + it->second->set_dispatch_seq(dispatch_seq); + *on_finish = new LambdaContext( + [this, object_no, dispatch_seq, start_time, ctx=*on_finish](int r) { + ctx->complete(r); + + std::unique_lock locker{m_lock}; + if (m_latency_stats && start_time != utime_t()) { + auto latency = ceph_clock_now() - start_time; + m_latency_stats->add(latency.to_nsec()); + } + + auto it = m_requests.find(object_no); + if (it == m_requests.end() || + it->second->get_dispatch_seq() != dispatch_seq) { + ldout(m_image_ctx->cct, 20) << "already dispatched" << dendl; + } else { + dispatch_delayed_requests(it->second); + m_requests.erase(it); + } + locker.unlock(); + + m_flush_tracker->finish_io(dispatch_seq); + }); +} + +template <typename I> +void SimpleSchedulerObjectDispatch<I>::dispatch_delayed_requests( + uint64_t object_no) { + ceph_assert(ceph_mutex_is_locked(m_lock)); + auto cct = m_image_ctx->cct; + + auto it = m_requests.find(object_no); + if (it == m_requests.end()) { + ldout(cct, 20) << "object_no=" << object_no << ": not found" << dendl; + return; + } + + dispatch_delayed_requests(it->second); + m_requests.erase(it); +} + +template <typename I> +void SimpleSchedulerObjectDispatch<I>::dispatch_delayed_requests( + ObjectRequestsRef object_requests) { + 
ceph_assert(ceph_mutex_is_locked(m_lock)); + auto cct = m_image_ctx->cct; + + ldout(cct, 20) << "object_no=" << object_requests->get_object_no() << ", " + << object_requests->delayed_requests_size() << " requests, " + << "dispatch_time=" << object_requests->get_dispatch_time() + << dendl; + + if (!object_requests->is_scheduled_dispatch()) { + return; + } + + object_requests->dispatch_delayed_requests(m_image_ctx, m_latency_stats.get(), + &m_lock); + + ceph_assert(!m_dispatch_queue.empty()); + if (m_dispatch_queue.front() == object_requests) { + m_dispatch_queue.pop_front(); + schedule_dispatch_delayed_requests(); + } +} + +template <typename I> +void SimpleSchedulerObjectDispatch<I>::schedule_dispatch_delayed_requests() { + ceph_assert(ceph_mutex_is_locked(m_lock)); + auto cct = m_image_ctx->cct; + + std::lock_guard timer_locker{*m_timer_lock}; + + if (m_timer_task != nullptr) { + ldout(cct, 20) << "canceling task " << m_timer_task << dendl; + + bool canceled = m_timer->cancel_event(m_timer_task); + ceph_assert(canceled); + m_timer_task = nullptr; + } + + if (m_dispatch_queue.empty()) { + ldout(cct, 20) << "nothing to schedule" << dendl; + return; + } + + auto object_requests = m_dispatch_queue.front().get(); + + while (!object_requests->is_scheduled_dispatch()) { + ldout(cct, 20) << "garbage collecting " << object_requests << dendl; + m_dispatch_queue.pop_front(); + + if (m_dispatch_queue.empty()) { + ldout(cct, 20) << "nothing to schedule" << dendl; + return; + } + object_requests = m_dispatch_queue.front().get(); + } + + m_timer_task = new LambdaContext( + [this, object_no=object_requests->get_object_no()](int r) { + ceph_assert(ceph_mutex_is_locked(*m_timer_lock)); + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "running timer task " << m_timer_task << dendl; + + m_timer_task = nullptr; + m_image_ctx->asio_engine->post( + [this, object_no]() { + std::lock_guard locker{m_lock}; + dispatch_delayed_requests(object_no); + }); + }); + + ldout(cct, 20) << 
"scheduling task " << m_timer_task << " at " + << object_requests->get_dispatch_time() << dendl; + + m_timer->add_event_at(object_requests->get_dispatch_time(), m_timer_task); +} + +} // namespace io +} // namespace librbd + +template class librbd::io::SimpleSchedulerObjectDispatch<librbd::ImageCtx>; diff --git a/src/librbd/io/SimpleSchedulerObjectDispatch.h b/src/librbd/io/SimpleSchedulerObjectDispatch.h new file mode 100644 index 000000000..ca8a57f3a --- /dev/null +++ b/src/librbd/io/SimpleSchedulerObjectDispatch.h @@ -0,0 +1,227 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_IO_SIMPLE_SCHEDULER_OBJECT_DISPATCH_H +#define CEPH_LIBRBD_IO_SIMPLE_SCHEDULER_OBJECT_DISPATCH_H + +#include "common/ceph_mutex.h" +#include "include/interval_set.h" +#include "include/utime.h" + +#include "librbd/io/ObjectDispatchInterface.h" +#include "librbd/io/TypeTraits.h" + +#include <list> +#include <map> +#include <memory> + +namespace librbd { + +class ImageCtx; + +namespace io { + +template <typename> class FlushTracker; +class LatencyStats; + +/** + * Simple scheduler plugin for object dispatcher layer. 
+ */ +template <typename ImageCtxT = ImageCtx> +class SimpleSchedulerObjectDispatch : public ObjectDispatchInterface { +private: + // mock unit testing support + typedef ::librbd::io::TypeTraits<ImageCtxT> TypeTraits; + typedef typename TypeTraits::SafeTimer SafeTimer; +public: + static SimpleSchedulerObjectDispatch* create(ImageCtxT* image_ctx) { + return new SimpleSchedulerObjectDispatch(image_ctx); + } + + SimpleSchedulerObjectDispatch(ImageCtxT* image_ctx); + ~SimpleSchedulerObjectDispatch() override; + + ObjectDispatchLayer get_dispatch_layer() const override { + return OBJECT_DISPATCH_LAYER_SCHEDULER; + } + + void init(); + void shut_down(Context* on_finish) override; + + bool read( + uint64_t object_no, ReadExtents* extents, IOContext io_context, + int op_flags, int read_flags, const ZTracer::Trace &parent_trace, + uint64_t* version, int* object_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override; + + bool discard( + uint64_t object_no, uint64_t object_off, uint64_t object_len, + IOContext io_context, int discard_flags, + const ZTracer::Trace &parent_trace, int* object_dispatch_flags, + uint64_t* journal_tid, DispatchResult* dispatch_result, + Context** on_finish, Context* on_dispatched) override; + + bool write( + uint64_t object_no, uint64_t object_off, ceph::bufferlist&& data, + IOContext io_context, int op_flags, int write_flags, + std::optional<uint64_t> assert_version, + const ZTracer::Trace &parent_trace, int* object_dispatch_flags, + uint64_t* journal_tid, DispatchResult* dispatch_result, + Context** on_finish, Context* on_dispatched) override; + + bool write_same( + uint64_t object_no, uint64_t object_off, uint64_t object_len, + LightweightBufferExtents&& buffer_extents, ceph::bufferlist&& data, + IOContext io_context, int op_flags, + const ZTracer::Trace &parent_trace, int* object_dispatch_flags, + uint64_t* journal_tid, DispatchResult* dispatch_result, + Context** on_finish, Context* 
on_dispatched) override; + + bool compare_and_write( + uint64_t object_no, uint64_t object_off, ceph::bufferlist&& cmp_data, + ceph::bufferlist&& write_data, IOContext io_context, int op_flags, + const ZTracer::Trace &parent_trace, uint64_t* mismatch_offset, + int* object_dispatch_flags, uint64_t* journal_tid, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override; + + bool flush( + FlushSource flush_source, const ZTracer::Trace &parent_trace, + uint64_t* journal_tid, DispatchResult* dispatch_result, + Context** on_finish, Context* on_dispatched) override; + + bool list_snaps( + uint64_t object_no, io::Extents&& extents, SnapIds&& snap_ids, + int list_snap_flags, const ZTracer::Trace &parent_trace, + SnapshotDelta* snapshot_delta, int* object_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override { + return false; + } + + bool invalidate_cache(Context* on_finish) override { + return false; + } + bool reset_existence_cache(Context* on_finish) override { + return false; + } + + void extent_overwritten( + uint64_t object_no, uint64_t object_off, uint64_t object_len, + uint64_t journal_tid, uint64_t new_journal_tid) override { + } + + int prepare_copyup( + uint64_t object_no, + SnapshotSparseBufferlist* snapshot_sparse_bufferlist) override { + return 0; + } + +private: + struct MergedRequests { + ceph::bufferlist data; + std::list<Context *> requests; + }; + + class ObjectRequests { + public: + using clock_t = ceph::real_clock; + + ObjectRequests(uint64_t object_no) : m_object_no(object_no) { + } + + uint64_t get_object_no() const { + return m_object_no; + } + + void set_dispatch_seq(uint64_t dispatch_seq) { + m_dispatch_seq = dispatch_seq; + } + + uint64_t get_dispatch_seq() const { + return m_dispatch_seq; + } + + clock_t::time_point get_dispatch_time() const { + return m_dispatch_time; + } + + void set_scheduled_dispatch(const clock_t::time_point &dispatch_time) { + 
m_dispatch_time = dispatch_time; + } + + bool is_scheduled_dispatch() const { + return !clock_t::is_zero(m_dispatch_time); + } + + size_t delayed_requests_size() const { + return m_delayed_requests.size(); + } + + bool intersects(uint64_t object_off, uint64_t len) const { + return m_delayed_request_extents.intersects(object_off, len); + } + + bool try_delay_request(uint64_t object_off, ceph::bufferlist&& data, + IOContext io_context, int op_flags, + int object_dispatch_flags, Context* on_dispatched); + + void dispatch_delayed_requests(ImageCtxT *image_ctx, + LatencyStats *latency_stats, + ceph::mutex *latency_stats_lock); + + private: + uint64_t m_object_no; + uint64_t m_dispatch_seq = 0; + clock_t::time_point m_dispatch_time; + IOContext m_io_context; + int m_op_flags = 0; + int m_object_dispatch_flags = 0; + std::map<uint64_t, MergedRequests> m_delayed_requests; + interval_set<uint64_t> m_delayed_request_extents; + + void try_merge_delayed_requests( + typename std::map<uint64_t, MergedRequests>::iterator &iter, + typename std::map<uint64_t, MergedRequests>::iterator &iter2); + }; + + typedef std::shared_ptr<ObjectRequests> ObjectRequestsRef; + typedef std::map<uint64_t, ObjectRequestsRef> Requests; + + ImageCtxT *m_image_ctx; + + FlushTracker<ImageCtxT>* m_flush_tracker; + + ceph::mutex m_lock; + SafeTimer *m_timer; + ceph::mutex *m_timer_lock; + uint64_t m_max_delay; + uint64_t m_dispatch_seq = 0; + + Requests m_requests; + std::list<ObjectRequestsRef> m_dispatch_queue; + Context *m_timer_task = nullptr; + std::unique_ptr<LatencyStats> m_latency_stats; + + bool try_delay_write(uint64_t object_no, uint64_t object_off, + ceph::bufferlist&& data, IOContext io_context, + int op_flags, int object_dispatch_flags, + Context* on_dispatched); + bool intersects(uint64_t object_no, uint64_t object_off, uint64_t len) const; + + void dispatch_all_delayed_requests(); + void dispatch_delayed_requests(uint64_t object_no); + void dispatch_delayed_requests(ObjectRequestsRef 
object_requests); + void register_in_flight_request(uint64_t object_no, const utime_t &start_time, + Context** on_finish); + + void schedule_dispatch_delayed_requests(); +}; + +} // namespace io +} // namespace librbd + +extern template class librbd::io::SimpleSchedulerObjectDispatch<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_CACHE_SIMPLE_SCHEDULER_OBJECT_DISPATCH_H diff --git a/src/librbd/io/TypeTraits.h b/src/librbd/io/TypeTraits.h new file mode 100644 index 000000000..2f3a6b7ef --- /dev/null +++ b/src/librbd/io/TypeTraits.h @@ -0,0 +1,20 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_IO_TYPE_TRAITS_H +#define CEPH_LIBRBD_IO_TYPE_TRAITS_H + +#include "common/Timer.h" + +namespace librbd { +namespace io { + +template <typename IoCtxT> +struct TypeTraits { + typedef ::SafeTimer SafeTimer; +}; + +} // namespace io +} // namespace librbd + +#endif // CEPH_LIBRBD_IO_TYPE_TRAITS_H diff --git a/src/librbd/io/Types.cc b/src/librbd/io/Types.cc new file mode 100644 index 000000000..19fcc6b89 --- /dev/null +++ b/src/librbd/io/Types.cc @@ -0,0 +1,49 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/io/Types.h" +#include <iostream> + +namespace librbd { +namespace io { + +const WriteReadSnapIds INITIAL_WRITE_READ_SNAP_IDS{0, 0}; + +std::ostream& operator<<(std::ostream& os, SparseExtentState state) { + switch (state) { + case SPARSE_EXTENT_STATE_DNE: + os << "dne"; + break; + case SPARSE_EXTENT_STATE_ZEROED: + os << "zeroed"; + break; + case SPARSE_EXTENT_STATE_DATA: + os << "data"; + break; + default: + ceph_abort(); + break; + } + return os; +} + +std::ostream& operator<<(std::ostream& os, const SparseExtent& se) { + os << "[" + << "state=" << se.state << ", " + << "length=" << se.length << "]"; + return os; +} + +std::ostream& operator<<(std::ostream& os, ImageArea area) { + switch (area) { + case ImageArea::DATA: + return 
os << "data"; + case ImageArea::CRYPTO_HEADER: + return os << "crypto_header"; + default: + ceph_abort(); + } +} + +} // namespace io +} // namespace librbd diff --git a/src/librbd/io/Types.h b/src/librbd/io/Types.h new file mode 100644 index 000000000..7c70986c5 --- /dev/null +++ b/src/librbd/io/Types.h @@ -0,0 +1,328 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_IO_TYPES_H +#define CEPH_LIBRBD_IO_TYPES_H + +#include "include/int_types.h" +#include "include/rados/rados_types.hpp" +#include "common/interval_map.h" +#include "osdc/StriperTypes.h" +#include <iosfwd> +#include <map> +#include <vector> + +struct Context; + +namespace librbd { +namespace io { + +typedef enum { + AIO_TYPE_NONE = 0, + AIO_TYPE_GENERIC, + AIO_TYPE_OPEN, + AIO_TYPE_CLOSE, + AIO_TYPE_READ, + AIO_TYPE_WRITE, + AIO_TYPE_DISCARD, + AIO_TYPE_FLUSH, + AIO_TYPE_WRITESAME, + AIO_TYPE_COMPARE_AND_WRITE, +} aio_type_t; + +enum FlushSource { + FLUSH_SOURCE_USER, + FLUSH_SOURCE_INTERNAL, + FLUSH_SOURCE_SHUTDOWN, + FLUSH_SOURCE_EXCLUSIVE_LOCK, + FLUSH_SOURCE_EXCLUSIVE_LOCK_SKIP_REFRESH, + FLUSH_SOURCE_REFRESH, + FLUSH_SOURCE_WRITEBACK, + FLUSH_SOURCE_WRITE_BLOCK, +}; + +enum Direction { + DIRECTION_READ, + DIRECTION_WRITE, + DIRECTION_BOTH +}; + +enum DispatchResult { + DISPATCH_RESULT_INVALID, + DISPATCH_RESULT_RESTART, + DISPATCH_RESULT_CONTINUE, + DISPATCH_RESULT_COMPLETE +}; + +enum ImageDispatchLayer { + IMAGE_DISPATCH_LAYER_NONE = 0, + IMAGE_DISPATCH_LAYER_API_START = IMAGE_DISPATCH_LAYER_NONE, + IMAGE_DISPATCH_LAYER_QUEUE, + IMAGE_DISPATCH_LAYER_QOS, + IMAGE_DISPATCH_LAYER_EXCLUSIVE_LOCK, + IMAGE_DISPATCH_LAYER_REFRESH, + IMAGE_DISPATCH_LAYER_INTERNAL_START = IMAGE_DISPATCH_LAYER_REFRESH, + IMAGE_DISPATCH_LAYER_MIGRATION, + IMAGE_DISPATCH_LAYER_JOURNAL, + IMAGE_DISPATCH_LAYER_WRITE_BLOCK, + IMAGE_DISPATCH_LAYER_WRITEBACK_CACHE, + IMAGE_DISPATCH_LAYER_CRYPTO, + IMAGE_DISPATCH_LAYER_CORE, + IMAGE_DISPATCH_LAYER_LAST +}; 
+ +enum { + IMAGE_DISPATCH_FLAG_QOS_IOPS_THROTTLE = 1 << 0, + IMAGE_DISPATCH_FLAG_QOS_BPS_THROTTLE = 1 << 1, + IMAGE_DISPATCH_FLAG_QOS_READ_IOPS_THROTTLE = 1 << 2, + IMAGE_DISPATCH_FLAG_QOS_WRITE_IOPS_THROTTLE = 1 << 3, + IMAGE_DISPATCH_FLAG_QOS_READ_BPS_THROTTLE = 1 << 4, + IMAGE_DISPATCH_FLAG_QOS_WRITE_BPS_THROTTLE = 1 << 5, + IMAGE_DISPATCH_FLAG_QOS_BPS_MASK = ( + IMAGE_DISPATCH_FLAG_QOS_BPS_THROTTLE | + IMAGE_DISPATCH_FLAG_QOS_READ_BPS_THROTTLE | + IMAGE_DISPATCH_FLAG_QOS_WRITE_BPS_THROTTLE), + IMAGE_DISPATCH_FLAG_QOS_IOPS_MASK = ( + IMAGE_DISPATCH_FLAG_QOS_IOPS_THROTTLE | + IMAGE_DISPATCH_FLAG_QOS_READ_IOPS_THROTTLE | + IMAGE_DISPATCH_FLAG_QOS_WRITE_IOPS_THROTTLE), + IMAGE_DISPATCH_FLAG_QOS_READ_MASK = ( + IMAGE_DISPATCH_FLAG_QOS_READ_IOPS_THROTTLE | + IMAGE_DISPATCH_FLAG_QOS_READ_BPS_THROTTLE), + IMAGE_DISPATCH_FLAG_QOS_WRITE_MASK = ( + IMAGE_DISPATCH_FLAG_QOS_WRITE_IOPS_THROTTLE | + IMAGE_DISPATCH_FLAG_QOS_WRITE_BPS_THROTTLE), + IMAGE_DISPATCH_FLAG_QOS_MASK = ( + IMAGE_DISPATCH_FLAG_QOS_BPS_MASK | + IMAGE_DISPATCH_FLAG_QOS_IOPS_MASK), + + // TODO: pass area through ImageDispatchInterface and remove + // this flag + IMAGE_DISPATCH_FLAG_CRYPTO_HEADER = 1 << 6 +}; + +enum { + RBD_IO_OPERATIONS_DEFAULT = 0, + RBD_IO_OPERATION_READ = 1 << 0, + RBD_IO_OPERATION_WRITE = 1 << 1, + RBD_IO_OPERATION_DISCARD = 1 << 2, + RBD_IO_OPERATION_WRITE_SAME = 1 << 3, + RBD_IO_OPERATION_COMPARE_AND_WRITE = 1 << 4, + RBD_IO_OPERATIONS_ALL = ( + RBD_IO_OPERATION_READ | + RBD_IO_OPERATION_WRITE | + RBD_IO_OPERATION_DISCARD | + RBD_IO_OPERATION_WRITE_SAME | + RBD_IO_OPERATION_COMPARE_AND_WRITE) +}; + +enum ObjectDispatchLayer { + OBJECT_DISPATCH_LAYER_NONE = 0, + OBJECT_DISPATCH_LAYER_CACHE, + OBJECT_DISPATCH_LAYER_CRYPTO, + OBJECT_DISPATCH_LAYER_JOURNAL, + OBJECT_DISPATCH_LAYER_PARENT_CACHE, + OBJECT_DISPATCH_LAYER_SCHEDULER, + OBJECT_DISPATCH_LAYER_CORE, + OBJECT_DISPATCH_LAYER_LAST +}; + +enum { + READ_FLAG_DISABLE_READ_FROM_PARENT = 1UL << 0, + READ_FLAG_DISABLE_CLIPPING = 1UL << 
1, +}; + +enum { + OBJECT_WRITE_FLAG_CREATE_EXCLUSIVE = 1UL << 0 +}; + +enum { + OBJECT_DISCARD_FLAG_DISABLE_CLONE_REMOVE = 1UL << 0, + OBJECT_DISCARD_FLAG_DISABLE_OBJECT_MAP_UPDATE = 1UL << 1 +}; + +enum { + OBJECT_DISPATCH_FLAG_FLUSH = 1UL << 0, + OBJECT_DISPATCH_FLAG_WILL_RETRY_ON_ERROR = 1UL << 1 +}; + +enum { + LIST_SNAPS_FLAG_DISABLE_LIST_FROM_PARENT = 1UL << 0, + LIST_SNAPS_FLAG_WHOLE_OBJECT = 1UL << 1, + LIST_SNAPS_FLAG_IGNORE_ZEROED_EXTENTS = 1UL << 2, +}; + +enum SparseExtentState { + SPARSE_EXTENT_STATE_DNE, /* does not exist */ + SPARSE_EXTENT_STATE_ZEROED, + SPARSE_EXTENT_STATE_DATA +}; + +std::ostream& operator<<(std::ostream& os, SparseExtentState state); + +struct SparseExtent { + SparseExtentState state; + uint64_t length; + + SparseExtent(SparseExtentState state, uint64_t length) + : state(state), length(length) { + } + + operator SparseExtentState() const { + return state; + } + + bool operator==(const SparseExtent& rhs) const { + return state == rhs.state && length == rhs.length; + } +}; + +std::ostream& operator<<(std::ostream& os, const SparseExtent& state); + +struct SparseExtentSplitMerge { + SparseExtent split(uint64_t offset, uint64_t length, SparseExtent &se) const { + return SparseExtent(se.state, se.length); + } + + bool can_merge(const SparseExtent& left, const SparseExtent& right) const { + return left.state == right.state; + } + + SparseExtent merge(SparseExtent&& left, SparseExtent&& right) const { + SparseExtent se(left); + se.length += right.length; + return se; + } + + uint64_t length(const SparseExtent& se) const { + return se.length; + } +}; + +typedef interval_map<uint64_t, + SparseExtent, + SparseExtentSplitMerge> SparseExtents; + +typedef std::vector<uint64_t> SnapIds; + +typedef std::pair<librados::snap_t, librados::snap_t> WriteReadSnapIds; +extern const WriteReadSnapIds INITIAL_WRITE_READ_SNAP_IDS; + +typedef std::map<WriteReadSnapIds, SparseExtents> SnapshotDelta; + +struct SparseBufferlistExtent : public SparseExtent { 
+ ceph::bufferlist bl; + + SparseBufferlistExtent(SparseExtentState state, uint64_t length) + : SparseExtent(state, length) { + ceph_assert(state != SPARSE_EXTENT_STATE_DATA); + } + SparseBufferlistExtent(SparseExtentState state, uint64_t length, + ceph::bufferlist&& bl_) + : SparseExtent(state, length), bl(std::move(bl_)) { + ceph_assert(state != SPARSE_EXTENT_STATE_DATA || length == bl.length()); + } + + bool operator==(const SparseBufferlistExtent& rhs) const { + return (state == rhs.state && + length == rhs.length && + bl.contents_equal(rhs.bl)); + } +}; + +struct SparseBufferlistExtentSplitMerge { + SparseBufferlistExtent split(uint64_t offset, uint64_t length, + SparseBufferlistExtent& sbe) const { + ceph::bufferlist bl; + if (sbe.state == SPARSE_EXTENT_STATE_DATA) { + bl.substr_of(bl, offset, length); + } + return SparseBufferlistExtent(sbe.state, length, std::move(bl)); + } + + bool can_merge(const SparseBufferlistExtent& left, + const SparseBufferlistExtent& right) const { + return left.state == right.state; + } + + SparseBufferlistExtent merge(SparseBufferlistExtent&& left, + SparseBufferlistExtent&& right) const { + if (left.state == SPARSE_EXTENT_STATE_DATA) { + ceph::bufferlist bl{std::move(left.bl)}; + bl.claim_append(std::move(right.bl)); + return SparseBufferlistExtent(SPARSE_EXTENT_STATE_DATA, + bl.length(), std::move(bl)); + } else { + return SparseBufferlistExtent(left.state, left.length + right.length, {}); + } + } + + uint64_t length(const SparseBufferlistExtent& sbe) const { + return sbe.length; + } +}; + +typedef interval_map<uint64_t, + SparseBufferlistExtent, + SparseBufferlistExtentSplitMerge> SparseBufferlist; +typedef std::map<uint64_t, SparseBufferlist> SnapshotSparseBufferlist; + +using striper::LightweightBufferExtents; +using striper::LightweightObjectExtent; +using striper::LightweightObjectExtents; + +typedef std::pair<uint64_t,uint64_t> Extent; +typedef std::vector<Extent> Extents; + +enum class ImageArea { + DATA, + CRYPTO_HEADER 
+}; + +std::ostream& operator<<(std::ostream& os, ImageArea area); + +struct ReadExtent { + const uint64_t offset; + const uint64_t length; + const LightweightBufferExtents buffer_extents; + ceph::bufferlist bl; + Extents extent_map; + + ReadExtent(uint64_t offset, + uint64_t length) : offset(offset), length(length) {}; + ReadExtent(uint64_t offset, + uint64_t length, + const LightweightBufferExtents&& buffer_extents) + : offset(offset), + length(length), + buffer_extents(buffer_extents) {} + ReadExtent(uint64_t offset, + uint64_t length, + const LightweightBufferExtents&& buffer_extents, + ceph::bufferlist&& bl, + Extents&& extent_map) : offset(offset), + length(length), + buffer_extents(buffer_extents), + bl(bl), + extent_map(extent_map) {}; + + friend inline std::ostream& operator<<( + std::ostream& os, + const ReadExtent &extent) { + os << "offset=" << extent.offset << ", " + << "length=" << extent.length << ", " + << "buffer_extents=" << extent.buffer_extents << ", " + << "bl.length=" << extent.bl.length() << ", " + << "extent_map=" << extent.extent_map; + return os; + } +}; + +typedef std::vector<ReadExtent> ReadExtents; + +typedef std::map<uint64_t, uint64_t> ExtentMap; + +} // namespace io +} // namespace librbd + +#endif // CEPH_LIBRBD_IO_TYPES_H diff --git a/src/librbd/io/Utils.cc b/src/librbd/io/Utils.cc new file mode 100644 index 000000000..63d587206 --- /dev/null +++ b/src/librbd/io/Utils.cc @@ -0,0 +1,249 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/io/Utils.h" +#include "common/dout.h" +#include "include/buffer.h" +#include "include/rados/librados.hpp" +#include "include/neorados/RADOS.hpp" +#include "librbd/internal.h" +#include "librbd/Utils.h" +#include "librbd/io/AioCompletion.h" +#include "librbd/io/ImageDispatchSpec.h" +#include "librbd/io/ObjectRequest.h" +#include "librbd/io/ImageDispatcherInterface.h" +#include "osd/osd_types.h" +#include "osdc/Striper.h" + 
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::io::util: " << __func__ << ": "
+
+namespace librbd {
+namespace io {
+namespace util {
+
+// Translate librados fadvise op flags and read-placement operation flags
+// onto the equivalent per-op settings of a neorados op.
+void apply_op_flags(uint32_t op_flags, uint32_t flags, neorados::Op* op) {
+  if (op_flags & LIBRADOS_OP_FLAG_FADVISE_RANDOM)
+    op->set_fadvise_random();
+  if (op_flags & LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL)
+    op->set_fadvise_sequential();
+  if (op_flags & LIBRADOS_OP_FLAG_FADVISE_WILLNEED)
+    op->set_fadvise_willneed();
+  if (op_flags & LIBRADOS_OP_FLAG_FADVISE_DONTNEED)
+    op->set_fadvise_dontneed();
+  if (op_flags & LIBRADOS_OP_FLAG_FADVISE_NOCACHE)
+    op->set_fadvise_nocache();
+
+  if (flags & librados::OPERATION_BALANCE_READS)
+    op->balance_reads();
+  if (flags & librados::OPERATION_LOCALIZE_READS)
+    op->localize_reads();
+}
+
+// Build the payload for a write-same object request.  Returns true when
+// every buffer extent is aligned to the pattern length, in which case
+// ws_data receives a single copy of the pattern and the OSD can replicate
+// it; otherwise (or when force_write is set) the repeated pattern is
+// expanded into ws_data for a plain write and false is returned.
+// NOTE(review): data_len is used as a modulo/divisor below, so this assumes
+// data is non-empty -- confirm callers guarantee that.
+bool assemble_write_same_extent(
+    const LightweightObjectExtent &object_extent, const ceph::bufferlist& data,
+    ceph::bufferlist *ws_data, bool force_write) {
+  size_t data_len = data.length();
+
+  if (!force_write) {
+    // write-same is only possible if every buffer extent starts and ends on
+    // a whole number of pattern repetitions
+    bool may_writesame = true;
+    for (auto& q : object_extent.buffer_extents) {
+      if (!(q.first % data_len == 0 && q.second % data_len == 0)) {
+        may_writesame = false;
+        break;
+      }
+    }
+
+    if (may_writesame) {
+      ws_data->append(data);
+      return true;
+    }
+  }
+
+  // fall back: tile the pattern across each buffer extent, honouring the
+  // phase (q.first % data_len) at which the extent enters the pattern
+  for (auto& q : object_extent.buffer_extents) {
+    bufferlist sub_bl;
+    uint64_t sub_off = q.first % data_len;
+    uint64_t sub_len = data_len - sub_off;
+    uint64_t extent_left = q.second;
+    while (extent_left >= sub_len) {
+      sub_bl.substr_of(data, sub_off, sub_len);
+      ws_data->claim_append(sub_bl);
+      extent_left -= sub_len;
+      if (sub_off) {
+        // after the first (possibly partial) repetition, copy whole patterns
+        sub_off = 0;
+        sub_len = data_len;
+      }
+    }
+    if (extent_left) {
+      // trailing partial repetition
+      sub_bl.substr_of(data, sub_off, extent_left);
+      ws_data->claim_append(sub_bl);
+    }
+  }
+  return false;
+}
+
+// Redirect a read of object object_no to the parent image for the portion
+// covered by the parent overlap; completes on_finish with -ENOENT when
+// there is no overlap for this object.
+template <typename I>
+void read_parent(I *image_ctx, uint64_t object_no, ReadExtents* read_extents,
+                 librados::snap_t snap_id, const ZTracer::Trace &trace,
+                 Context*
on_finish) { + + auto cct = image_ctx->cct; + + std::shared_lock image_locker{image_ctx->image_lock}; + + Extents parent_extents; + ImageArea area; + uint64_t raw_overlap = 0; + uint64_t object_overlap = 0; + image_ctx->get_parent_overlap(snap_id, &raw_overlap); + if (raw_overlap > 0) { + // calculate reverse mapping onto the parent image + Extents extents; + for (const auto& extent : *read_extents) { + extents.emplace_back(extent.offset, extent.length); + } + std::tie(parent_extents, area) = object_to_area_extents(image_ctx, + object_no, extents); + object_overlap = image_ctx->prune_parent_extents(parent_extents, area, + raw_overlap, false); + } + if (object_overlap == 0) { + image_locker.unlock(); + + on_finish->complete(-ENOENT); + return; + } + + ldout(cct, 20) << dendl; + + ceph::bufferlist* parent_read_bl; + if (read_extents->size() > 1) { + auto parent_comp = new ReadResult::C_ObjectReadMergedExtents( + cct, read_extents, on_finish); + parent_read_bl = &parent_comp->bl; + on_finish = parent_comp; + } else { + parent_read_bl = &read_extents->front().bl; + } + + auto comp = AioCompletion::create_and_start(on_finish, image_ctx->parent, + AIO_TYPE_READ); + ldout(cct, 20) << "completion=" << comp + << " parent_extents=" << parent_extents + << " area=" << area << dendl; + auto req = io::ImageDispatchSpec::create_read( + *image_ctx->parent, io::IMAGE_DISPATCH_LAYER_INTERNAL_START, comp, + std::move(parent_extents), area, ReadResult{parent_read_bl}, + image_ctx->parent->get_data_io_context(), 0, 0, trace); + req->send(); +} + +template <typename I> +int clip_request(I* image_ctx, Extents* image_extents, ImageArea area) { + std::shared_lock image_locker{image_ctx->image_lock}; + for (auto &image_extent : *image_extents) { + auto clip_len = image_extent.second; + int r = clip_io(librbd::util::get_image_ctx(image_ctx), + image_extent.first, &clip_len, area); + if (r < 0) { + return r; + } + + image_extent.second = clip_len; + } + return 0; +} + +void 
unsparsify(CephContext* cct, ceph::bufferlist* bl,
+                const Extents& extent_map, uint64_t bl_off,
+                uint64_t out_bl_len) {
+  // Expand a sparse read result (data in *bl, layout described by
+  // extent_map, starting at bl_off) into a dense out_bl_len-byte buffer,
+  // replacing *bl in place.  The holes are presumably zero-filled by
+  // assemble_result -- see Striper::StripedReadResult.
+  Striper::StripedReadResult destriper;
+  bufferlist out_bl;
+
+  destriper.add_partial_sparse_result(cct, std::move(*bl), extent_map, bl_off,
+                                      {{0, out_bl_len}});
+  destriper.assemble_result(cct, out_bl, true);
+  *bl = out_bl;
+}
+
+// Kick off a copyup of object_no from the parent image by issuing an empty
+// write to offset 0.  Returns false (and sends nothing) when the object has
+// no parent to copy from.
+template <typename I>
+bool trigger_copyup(I* image_ctx, uint64_t object_no, IOContext io_context,
+                    Context* on_finish) {
+  bufferlist bl;
+  auto req = new ObjectWriteRequest<I>(
+    image_ctx, object_no, 0, std::move(bl), io_context, 0, 0,
+    std::nullopt, {}, on_finish);
+  if (!req->has_parent()) {
+    delete req;
+    return false;
+  }
+
+  req->send();
+  return true;
+}
+
+// Map an image-area extent to per-object extents: first remap the logical
+// (area-relative) extent to a raw/physical image extent, then stripe it
+// across objects.
+template <typename I>
+void area_to_object_extents(I* image_ctx, uint64_t offset, uint64_t length,
+                            ImageArea area, uint64_t buffer_offset,
+                            striper::LightweightObjectExtents* object_extents) {
+  Extents extents = {{offset, length}};
+  image_ctx->io_image_dispatcher->remap_to_physical(extents, area);
+  for (auto [off, len] : extents) {
+    Striper::file_to_extents(image_ctx->cct, &image_ctx->layout, off, len, 0,
+                             buffer_offset, object_extents);
+  }
+}
+
+// Inverse of area_to_object_extents: map object-relative extents back to
+// raw image extents, then to logical extents plus the area they fall in.
+template <typename I>
+std::pair<Extents, ImageArea> object_to_area_extents(
+    I* image_ctx, uint64_t object_no, const Extents& object_extents) {
+  Extents extents;
+  for (auto [off, len] : object_extents) {
+    Striper::extent_to_file(image_ctx->cct, &image_ctx->layout, object_no, off,
+                            len, extents);
+  }
+  auto area = image_ctx->io_image_dispatcher->remap_to_logical(extents);
+  return {std::move(extents), area};
+}
+
+// Translate a single area-relative offset to its raw image offset
+// (zero-length extent used purely for the offset remap).
+template <typename I>
+uint64_t area_to_raw_offset(const I& image_ctx, uint64_t offset,
+                            ImageArea area) {
+  Extents extents = {{offset, 0}};
+  image_ctx.io_image_dispatcher->remap_to_physical(extents, area);
+  return extents[0].first;
+}
+
+// Translate a raw image offset back to (area-relative offset, area).
+template <typename I>
+std::pair<uint64_t, ImageArea> raw_to_area_offset(const I& image_ctx,
+                                                  uint64_t offset) {
+  Extents extents = {{offset,
0}}; + auto area = image_ctx.io_image_dispatcher->remap_to_logical(extents); + return {extents[0].first, area}; +} + +} // namespace util +} // namespace io +} // namespace librbd + +template void librbd::io::util::read_parent( + librbd::ImageCtx *image_ctx, uint64_t object_no, ReadExtents* extents, + librados::snap_t snap_id, const ZTracer::Trace &trace, Context* on_finish); +template int librbd::io::util::clip_request( + librbd::ImageCtx* image_ctx, Extents* image_extents, ImageArea area); +template bool librbd::io::util::trigger_copyup( + librbd::ImageCtx *image_ctx, uint64_t object_no, IOContext io_context, + Context* on_finish); +template void librbd::io::util::area_to_object_extents( + librbd::ImageCtx* image_ctx, uint64_t offset, uint64_t length, + ImageArea area, uint64_t buffer_offset, + striper::LightweightObjectExtents* object_extents); +template auto librbd::io::util::object_to_area_extents( + librbd::ImageCtx* image_ctx, uint64_t object_no, const Extents& extents) + -> std::pair<Extents, ImageArea>; +template uint64_t librbd::io::util::area_to_raw_offset( + const librbd::ImageCtx& image_ctx, uint64_t offset, ImageArea area); +template auto librbd::io::util::raw_to_area_offset( + const librbd::ImageCtx& image_ctx, uint64_t offset) + -> std::pair<uint64_t, ImageArea>; diff --git a/src/librbd/io/Utils.h b/src/librbd/io/Utils.h new file mode 100644 index 000000000..efb79b6a6 --- /dev/null +++ b/src/librbd/io/Utils.h @@ -0,0 +1,83 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_IO_UTILS_H +#define CEPH_LIBRBD_IO_UTILS_H + +#include "include/int_types.h" +#include "include/buffer_fwd.h" +#include "include/rados/rados_types.hpp" +#include "common/zipkin_trace.h" +#include "librbd/Types.h" +#include "librbd/io/Types.h" +#include <map> + +class ObjectExtent; + +namespace neorados { struct Op; } + +namespace librbd { + +struct ImageCtx; + +namespace io { +namespace util { + +void 
apply_op_flags(uint32_t op_flags, uint32_t flags, neorados::Op* op); + +bool assemble_write_same_extent(const LightweightObjectExtent &object_extent, + const ceph::bufferlist& data, + ceph::bufferlist *ws_data, + bool force_write); + +template <typename ImageCtxT = librbd::ImageCtx> +void read_parent(ImageCtxT *image_ctx, uint64_t object_no, + ReadExtents* read_extents, librados::snap_t snap_id, + const ZTracer::Trace &trace, Context* on_finish); + +template <typename ImageCtxT = librbd::ImageCtx> +int clip_request(ImageCtxT* image_ctx, Extents* image_extents, ImageArea area); + +inline uint64_t get_extents_length(const Extents &extents) { + uint64_t total_bytes = 0; + for (auto [_, extent_length] : extents) { + total_bytes += extent_length; + } + return total_bytes; +} + +void unsparsify(CephContext* cct, ceph::bufferlist* bl, + const Extents& extent_map, uint64_t bl_off, + uint64_t out_bl_len); + +template <typename ImageCtxT = librbd::ImageCtx> +bool trigger_copyup(ImageCtxT *image_ctx, uint64_t object_no, + IOContext io_context, Context* on_finish); + +template <typename ImageCtxT = librbd::ImageCtx> +void area_to_object_extents(ImageCtxT* image_ctx, uint64_t offset, + uint64_t length, ImageArea area, + uint64_t buffer_offset, + striper::LightweightObjectExtents* object_extents); + +template <typename ImageCtxT = librbd::ImageCtx> +std::pair<Extents, ImageArea> object_to_area_extents( + ImageCtxT* image_ctx, uint64_t object_no, const Extents& object_extents); + +template <typename ImageCtxT = librbd::ImageCtx> +uint64_t area_to_raw_offset(const ImageCtxT& image_ctx, uint64_t offset, + ImageArea area); + +template <typename ImageCtxT = librbd::ImageCtx> +std::pair<uint64_t, ImageArea> raw_to_area_offset(const ImageCtxT& image_ctx, + uint64_t offset); + +inline ObjectDispatchLayer get_previous_layer(ObjectDispatchLayer layer) { + return (ObjectDispatchLayer)(((int)layer) - 1); +} + +} // namespace util +} // namespace io +} // namespace librbd + +#endif // 
CEPH_LIBRBD_IO_UTILS_H diff --git a/src/librbd/io/WriteBlockImageDispatch.cc b/src/librbd/io/WriteBlockImageDispatch.cc new file mode 100644 index 000000000..57d181d20 --- /dev/null +++ b/src/librbd/io/WriteBlockImageDispatch.cc @@ -0,0 +1,270 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/io/WriteBlockImageDispatch.h" +#include "common/dout.h" +#include "common/Cond.h" +#include "librbd/ImageCtx.h" +#include "librbd/Utils.h" +#include "librbd/asio/ContextWQ.h" +#include "librbd/io/AioCompletion.h" +#include "librbd/io/ImageDispatchSpec.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::io::WriteBlockImageDispatch: " << this \ + << " " << __func__ << ": " + +namespace librbd { +namespace io { + +template <typename I> +struct WriteBlockImageDispatch<I>::C_BlockedWrites : public Context { + WriteBlockImageDispatch *dispatch; + explicit C_BlockedWrites(WriteBlockImageDispatch *dispatch) + : dispatch(dispatch) { + } + + void finish(int r) override { + dispatch->handle_blocked_writes(r); + } +}; + +template <typename I> +WriteBlockImageDispatch<I>::WriteBlockImageDispatch(I* image_ctx) + : m_image_ctx(image_ctx), + m_lock(ceph::make_shared_mutex( + util::unique_lock_name("librbd::io::WriteBlockImageDispatch::m_lock", + this))) { + auto cct = m_image_ctx->cct; + ldout(cct, 5) << "ictx=" << image_ctx << dendl; +} + +template <typename I> +void WriteBlockImageDispatch<I>::shut_down(Context* on_finish) { + on_finish->complete(0); +} + +template <typename I> +int WriteBlockImageDispatch<I>::block_writes() { + C_SaferCond cond_ctx; + block_writes(&cond_ctx); + return cond_ctx.wait(); +} + +template <typename I> +void WriteBlockImageDispatch<I>::block_writes(Context *on_blocked) { + ceph_assert(ceph_mutex_is_locked(m_image_ctx->owner_lock)); + auto cct = m_image_ctx->cct; + + // ensure owner lock is not held after block_writes completes + on_blocked = 
util::create_async_context_callback( + *m_image_ctx, on_blocked); + + { + std::unique_lock locker{m_lock}; + ++m_write_blockers; + ldout(cct, 5) << m_image_ctx << ", " + << "num=" << m_write_blockers << dendl; + if (!m_write_blocker_contexts.empty() || m_in_flight_writes > 0) { + ldout(cct, 5) << "waiting for in-flight writes to complete: " + << "in_flight_writes=" << m_in_flight_writes << dendl; + m_write_blocker_contexts.push_back(on_blocked); + return; + } + } + + flush_io(on_blocked); +}; + +template <typename I> +void WriteBlockImageDispatch<I>::unblock_writes() { + auto cct = m_image_ctx->cct; + + Contexts waiter_contexts; + Contexts dispatch_contexts; + { + std::unique_lock locker{m_lock}; + ceph_assert(m_write_blockers > 0); + --m_write_blockers; + + ldout(cct, 5) << m_image_ctx << ", " + << "num=" << m_write_blockers << dendl; + if (m_write_blockers == 0) { + std::swap(waiter_contexts, m_unblocked_write_waiter_contexts); + std::swap(dispatch_contexts, m_on_dispatches); + } + } + + for (auto ctx : waiter_contexts) { + ctx->complete(0); + } + + for (auto ctx : dispatch_contexts) { + ctx->complete(0); + } +} + +template <typename I> +void WriteBlockImageDispatch<I>::wait_on_writes_unblocked( + Context *on_unblocked) { + ceph_assert(ceph_mutex_is_locked(m_image_ctx->owner_lock)); + auto cct = m_image_ctx->cct; + + { + std::unique_lock locker{m_lock}; + ldout(cct, 20) << m_image_ctx << ", " + << "write_blockers=" << m_write_blockers << dendl; + if (!m_unblocked_write_waiter_contexts.empty() || m_write_blockers > 0) { + m_unblocked_write_waiter_contexts.push_back(on_unblocked); + return; + } + } + + on_unblocked->complete(0); +} + +template <typename I> +bool WriteBlockImageDispatch<I>::write( + AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&bl, + int op_flags, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) { + auto 
cct = m_image_ctx->cct; + ldout(cct, 20) << "tid=" << tid << dendl; + + return process_io(tid, dispatch_result, on_finish, on_dispatched); +} + +template <typename I> +bool WriteBlockImageDispatch<I>::discard( + AioCompletion* aio_comp, Extents &&image_extents, + uint32_t discard_granularity_bytes, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "tid=" << tid << dendl; + + return process_io(tid, dispatch_result, on_finish, on_dispatched); +} + +template <typename I> +bool WriteBlockImageDispatch<I>::write_same( + AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&bl, + int op_flags, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "tid=" << tid << dendl; + + return process_io(tid, dispatch_result, on_finish, on_dispatched); +} + +template <typename I> +bool WriteBlockImageDispatch<I>::compare_and_write( + AioCompletion* aio_comp, Extents &&image_extents, + bufferlist &&cmp_bl, bufferlist &&bl, uint64_t *mismatch_offset, + int op_flags, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "tid=" << tid << dendl; + + return process_io(tid, dispatch_result, on_finish, on_dispatched); +} + +template <typename I> +bool WriteBlockImageDispatch<I>::flush( + AioCompletion* aio_comp, FlushSource flush_source, + const ZTracer::Trace &parent_trace, uint64_t tid, + std::atomic<uint32_t>* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) { + auto cct = m_image_ctx->cct; + 
ldout(cct, 20) << "tid=" << tid << dendl;
+
+  // only user-initiated flushes are subject to write blocking; internal
+  // flushes (e.g. the one issued by flush_io below) must pass through
+  if (flush_source != FLUSH_SOURCE_USER) {
+    return false;
+  }
+
+  return process_io(tid, dispatch_result, on_finish, on_dispatched);
+}
+
+// Completion hook installed by process_io: runs when a tracked write-ish IO
+// finishes and decrements the in-flight counter.
+template <typename I>
+void WriteBlockImageDispatch<I>::handle_finished(int r, uint64_t tid) {
+  auto cct = m_image_ctx->cct;
+  ldout(cct, 20) << "r=" << r << ", tid=" << tid << dendl;
+
+  std::unique_lock locker{m_lock};
+  ceph_assert(m_in_flight_writes > 0);
+  --m_in_flight_writes;
+
+  bool writes_blocked = false;
+  if (m_write_blockers > 0 && m_in_flight_writes == 0) {
+    ldout(cct, 10) << "flushing all in-flight IO for blocked writes" << dendl;
+    writes_blocked = true;
+  }
+  // drop the lock before issuing the flush
+  locker.unlock();
+
+  if (writes_blocked) {
+    // last in-flight write just drained: flush, then complete the contexts
+    // queued by block_writes() (via C_BlockedWrites -> handle_blocked_writes)
+    flush_io(new C_BlockedWrites(this));
+  }
+}
+
+// Common dispatch path for write/discard/write_same/compare_and_write and
+// user flush.  Returns true when the IO was queued for a later restart
+// (writes blocked), false to let it continue to the next dispatch layer.
+template <typename I>
+bool WriteBlockImageDispatch<I>::process_io(
+    uint64_t tid, DispatchResult* dispatch_result, Context** on_finish,
+    Context* on_dispatched) {
+  std::unique_lock locker{m_lock};
+  if (m_write_blockers > 0 || !m_on_dispatches.empty()) {
+    // the m_on_dispatches check keeps dispatch order: queue behind earlier
+    // blocked IOs even if the blocker count has already dropped to zero
+    *dispatch_result = DISPATCH_RESULT_RESTART;
+    m_on_dispatches.push_back(on_dispatched);
+    return true;
+  }
+
+  ++m_in_flight_writes;
+  // wrap the caller's on_finish so completion first decrements the
+  // in-flight counter, then chains to the original context
+  *on_finish = new LambdaContext([this, tid, on_finish=*on_finish](int r) {
+      handle_finished(r, tid);
+      on_finish->complete(r);
+    });
+  return false;
+}
+
+// Issue an internal flush from the WRITE_BLOCK layer; it is not itself
+// write-blocked because flush() above only blocks FLUSH_SOURCE_USER.
+template <typename I>
+void WriteBlockImageDispatch<I>::flush_io(Context* on_finish) {
+  auto cct = m_image_ctx->cct;
+  ldout(cct, 10) << dendl;
+
+  // ensure that all in-flight IO is flushed
+  auto aio_comp = AioCompletion::create_and_start(
+    on_finish, util::get_image_ctx(m_image_ctx), librbd::io::AIO_TYPE_FLUSH);
+  auto req = ImageDispatchSpec::create_flush(
+    *m_image_ctx, IMAGE_DISPATCH_LAYER_WRITE_BLOCK, aio_comp,
+    FLUSH_SOURCE_WRITE_BLOCK, {});
+  req->send();
+}
+
+// Flush completion for block_writes(): wake every context waiting for
+// writes to become blocked.
+template <typename I>
+void WriteBlockImageDispatch<I>::handle_blocked_writes(int r) {
+  auto cct = m_image_ctx->cct;
+  ldout(cct, 10) << dendl;
+
+  Contexts write_blocker_contexts;
+  {
+    std::unique_lock
locker{m_lock}; + std::swap(write_blocker_contexts, m_write_blocker_contexts); + } + + for (auto ctx : write_blocker_contexts) { + ctx->complete(0); + } +} + +} // namespace io +} // namespace librbd + +template class librbd::io::WriteBlockImageDispatch<librbd::ImageCtx>; diff --git a/src/librbd/io/WriteBlockImageDispatch.h b/src/librbd/io/WriteBlockImageDispatch.h new file mode 100644 index 000000000..b1d0ddb0e --- /dev/null +++ b/src/librbd/io/WriteBlockImageDispatch.h @@ -0,0 +1,134 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_IO_WRITE_BLOCK_IMAGE_DISPATCH_H +#define CEPH_LIBRBD_IO_WRITE_BLOCK_IMAGE_DISPATCH_H + +#include "librbd/io/ImageDispatchInterface.h" +#include "include/int_types.h" +#include "include/buffer.h" +#include "common/ceph_mutex.h" +#include "common/zipkin_trace.h" +#include "common/Throttle.h" +#include "librbd/io/ReadResult.h" +#include "librbd/io/Types.h" +#include <list> + +struct Context; + +namespace librbd { + +struct ImageCtx; + +namespace io { + +struct AioCompletion; + +template <typename ImageCtxT> +class WriteBlockImageDispatch : public ImageDispatchInterface { +public: + WriteBlockImageDispatch(ImageCtxT* image_ctx); + + ImageDispatchLayer get_dispatch_layer() const override { + return IMAGE_DISPATCH_LAYER_WRITE_BLOCK; + } + + void shut_down(Context* on_finish) override; + + int block_writes(); + void block_writes(Context *on_blocked); + void unblock_writes(); + + inline bool writes_blocked() const { + std::shared_lock locker{m_lock}; + return (m_write_blockers > 0); + } + + void wait_on_writes_unblocked(Context *on_unblocked); + + bool read( + AioCompletion* aio_comp, Extents &&image_extents, + ReadResult &&read_result, IOContext io_context, int op_flags, + int read_flags, const ZTracer::Trace &parent_trace, uint64_t tid, + std::atomic<uint32_t>* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) 
override { + return false; + } + bool write( + AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&bl, + int op_flags, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override; + bool discard( + AioCompletion* aio_comp, Extents &&image_extents, + uint32_t discard_granularity_bytes, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override; + bool write_same( + AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&bl, + int op_flags, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override; + bool compare_and_write( + AioCompletion* aio_comp, Extents &&image_extents, + bufferlist &&cmp_bl, bufferlist &&bl, uint64_t *mismatch_offset, + int op_flags, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override; + bool flush( + AioCompletion* aio_comp, FlushSource flush_source, + const ZTracer::Trace &parent_trace, uint64_t tid, + std::atomic<uint32_t>* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override; + + bool list_snaps( + AioCompletion* aio_comp, Extents&& image_extents, SnapIds&& snap_ids, + int list_snaps_flags, SnapshotDelta* snapshot_delta, + const ZTracer::Trace &parent_trace, uint64_t tid, + std::atomic<uint32_t>* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override { + return false; + } + +private: + struct C_BlockedWrites; + + typedef std::list<Context*> Contexts; + + ImageCtxT* m_image_ctx; + + mutable 
ceph::shared_mutex m_lock; + Contexts m_on_dispatches; + + uint32_t m_write_blockers = 0; + Contexts m_write_blocker_contexts; + Contexts m_unblocked_write_waiter_contexts; + uint64_t m_in_flight_writes = 0; + + void handle_finished(int r, uint64_t tid); + + bool process_io(uint64_t tid, DispatchResult* dispatch_result, + Context** on_finish, Context* on_dispatched); + void flush_io(Context* on_finish); + + bool invalidate_cache(Context* on_finish) override { + return false; + } + + void handle_blocked_writes(int r); + +}; + +} // namespace io +} // namespace librbd + +extern template class librbd::io::WriteBlockImageDispatch<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_IO_WRITE_BLOCK_IMAGE_DISPATCH_H |