From 389020e14594e4894e28d1eb9103c210b142509e Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Thu, 23 May 2024 18:45:13 +0200 Subject: Adding upstream version 18.2.3. Signed-off-by: Daniel Baumann --- src/librbd/ImageCtx.h | 2 + src/librbd/Journal.cc | 86 +++++-- src/librbd/Journal.h | 23 +- src/librbd/ObjectMap.h | 6 + src/librbd/api/DiffIterate.cc | 130 +++++++++-- src/librbd/api/DiffIterate.h | 7 +- src/librbd/api/Snapshot.cc | 4 +- src/librbd/deep_copy/ImageCopyRequest.cc | 7 +- src/librbd/io/ImageRequest.cc | 54 +---- src/librbd/io/ImageRequest.h | 21 +- src/librbd/io/ObjectRequest.cc | 18 +- src/librbd/io/Types.h | 20 +- src/librbd/object_map/DiffRequest.cc | 382 ++++++++++++++++++++----------- src/librbd/object_map/DiffRequest.h | 29 +-- src/librbd/object_map/Types.h | 15 +- 15 files changed, 526 insertions(+), 278 deletions(-) (limited to 'src/librbd') diff --git a/src/librbd/ImageCtx.h b/src/librbd/ImageCtx.h index 9a432c764..066651ba4 100644 --- a/src/librbd/ImageCtx.h +++ b/src/librbd/ImageCtx.h @@ -148,6 +148,7 @@ namespace librbd { // encryption_format ceph::shared_mutex timestamp_lock; // protects (create/access/modify)_timestamp + // and internal diff_iterate_lock_timestamp ceph::mutex async_ops_lock; // protects async_ops and async_requests ceph::mutex copyup_list_lock; // protects copyup_waiting_list @@ -173,6 +174,7 @@ namespace librbd { utime_t create_timestamp; utime_t access_timestamp; utime_t modify_timestamp; + utime_t diff_iterate_lock_timestamp; file_layout_t layout; diff --git a/src/librbd/Journal.cc b/src/librbd/Journal.cc index 8ddce2e8f..1b37a30c1 100644 --- a/src/librbd/Journal.cc +++ b/src/librbd/Journal.cc @@ -39,6 +39,7 @@ using util::create_async_context_callback; using util::create_context_callback; using journal::util::C_DecodeTag; using journal::util::C_DecodeTags; +using io::Extents; namespace { @@ -760,36 +761,87 @@ void Journal::user_flushed() { } template -uint64_t Journal::append_write_event(uint64_t offset, size_t length, - const bufferlist &bl, - bool flush_entry) { +void Journal::add_write_event_entries(uint64_t offset, size_t length, + const bufferlist &bl, + uint64_t buffer_offset, + Bufferlists *bufferlists) { ceph_assert(m_max_append_size > journal::AioWriteEvent::get_fixed_size()); - uint64_t max_write_data_size = + const uint64_t max_write_data_size = m_max_append_size - journal::AioWriteEvent::get_fixed_size(); // ensure that the write event fits within the journal entry - Bufferlists bufferlists; uint64_t bytes_remaining = length; uint64_t event_offset = 0; do { uint64_t event_length = std::min(bytes_remaining, max_write_data_size); bufferlist event_bl; - event_bl.substr_of(bl, event_offset, event_length); + event_bl.substr_of(bl, buffer_offset + event_offset, event_length); journal::EventEntry event_entry(journal::AioWriteEvent(offset + event_offset, event_length, event_bl), ceph_clock_now()); - bufferlists.emplace_back(); - encode(event_entry, bufferlists.back()); + bufferlists->emplace_back(); + encode(event_entry, bufferlists->back()); event_offset += event_length; bytes_remaining -= event_length; } while (bytes_remaining > 0); +} - return append_io_events(journal::EVENT_TYPE_AIO_WRITE, bufferlists, offset, - length, flush_entry, 0); +template +uint64_t Journal::append_write_event(const Extents &image_extents, + const bufferlist &bl, + bool flush_entry) { + Bufferlists bufferlists; + uint64_t buffer_offset = 0; + for (auto &extent : image_extents) { + add_write_event_entries(extent.first, extent.second, bl, buffer_offset, + &bufferlists); + + buffer_offset += extent.second; + } + + return append_io_events(journal::EVENT_TYPE_AIO_WRITE, bufferlists, + image_extents, flush_entry, 0); +} + +template +uint64_t Journal::append_write_same_event(const Extents &image_extents, + const bufferlist &bl, + bool flush_entry) { + Bufferlists bufferlists; + for (auto &extent : image_extents) { + journal::EventEntry event_entry( + journal::AioWriteSameEvent(extent.first, extent.second, bl), + ceph_clock_now()); + + bufferlists.emplace_back(); + encode(event_entry, bufferlists.back()); + } + + return append_io_events(journal::EVENT_TYPE_AIO_WRITESAME, bufferlists, + image_extents, flush_entry, 0); +} + +template +uint64_t Journal::append_discard_event(const Extents &image_extents, + uint32_t discard_granularity_bytes, + bool flush_entry) { + Bufferlists bufferlists; + for (auto &extent : image_extents) { + journal::EventEntry event_entry( + journal::AioDiscardEvent(extent.first, extent.second, + discard_granularity_bytes), + ceph_clock_now()); + + bufferlists.emplace_back(); + encode(event_entry, bufferlists.back()); + } + + return append_io_events(journal::EVENT_TYPE_AIO_DISCARD, bufferlists, + image_extents, flush_entry, 0); } template @@ -832,7 +884,8 @@ uint64_t Journal::append_compare_and_write_event(uint64_t offset, } while (bytes_remaining > 0); return append_io_events(journal::EVENT_TYPE_AIO_COMPARE_AND_WRITE, - bufferlists, offset, length, flush_entry, -EILSEQ); + bufferlists, {{offset, length}}, flush_entry, + -EILSEQ); } template @@ -842,14 +895,14 @@ uint64_t Journal::append_io_event(journal::EventEntry &&event_entry, bufferlist bl; event_entry.timestamp = ceph_clock_now(); encode(event_entry, bl); - return append_io_events(event_entry.get_event_type(), {bl}, offset, length, - flush_entry, filter_ret_val); + return append_io_events(event_entry.get_event_type(), {bl}, + {{offset, length}}, flush_entry, filter_ret_val); } template uint64_t Journal::append_io_events(journal::EventType event_type, const Bufferlists &bufferlists, - uint64_t offset, size_t length, + const Extents &image_extents, bool flush_entry, int filter_ret_val) { ceph_assert(!bufferlists.empty()); @@ -870,14 +923,13 @@ uint64_t Journal::append_io_events(journal::EventType event_type, { std::lock_guard event_locker{m_event_lock}; - m_events[tid] = Event(futures, offset, length, filter_ret_val); + m_events[tid] = Event(futures, image_extents, filter_ret_val); } CephContext *cct = m_image_ctx.cct; ldout(cct, 20) << this << " " << __func__ << ": " << "event=" << event_type << ", " - << "offset=" << offset << ", " - << "length=" << length << ", " + << "image_extents=" << image_extents << ", " << "flush=" << flush_entry << ", tid=" << tid << dendl; Context *on_safe = create_async_context_callback( diff --git a/src/librbd/Journal.h b/src/librbd/Journal.h index 1ef9ffa88..5327adac7 100644 --- a/src/librbd/Journal.h +++ b/src/librbd/Journal.h @@ -18,6 +18,7 @@ #include "journal/ReplayHandler.h" #include "librbd/Utils.h" #include "librbd/asio/ContextWQ.h" +#include "librbd/io/Types.h" #include "librbd/journal/Types.h" #include "librbd/journal/TypeTraits.h" @@ -133,14 +134,20 @@ public: void user_flushed(); - uint64_t append_write_event(uint64_t offset, size_t length, + uint64_t append_write_event(const io::Extents &image_extents, const bufferlist &bl, bool flush_entry); + uint64_t append_write_same_event(const io::Extents &image_extents, + const bufferlist &bl, + bool flush_entry); uint64_t append_compare_and_write_event(uint64_t offset, size_t length, const bufferlist &cmp_bl, const bufferlist &write_bl, bool flush_entry); + uint64_t append_discard_event(const io::Extents &image_extents, + uint32_t discard_granularity_bytes, + bool flush_entry); uint64_t append_io_event(journal::EventEntry &&event_entry, uint64_t offset, size_t length, bool flush_entry, int filter_ret_val); @@ -200,11 +207,13 @@ private: Event() { } - Event(const Futures &_futures, uint64_t offset, size_t length, + Event(const Futures &_futures, const io::Extents &image_extents, int filter_ret_val) : futures(_futures), filter_ret_val(filter_ret_val) { - if (length > 0) { - pending_extents.insert(offset, length); + for (auto &extent : image_extents) { + if (extent.second > 0) { + pending_extents.insert(extent.first, extent.second); + } } } }; @@ -322,9 +331,13 @@ private: bool is_journal_replaying(const ceph::mutex &) const; bool is_tag_owner(const ceph::mutex &) const; + void add_write_event_entries(uint64_t offset, size_t length, + const bufferlist &bl, + uint64_t buffer_offset, + Bufferlists *bufferlists); uint64_t append_io_events(journal::EventType event_type, const Bufferlists &bufferlists, - uint64_t offset, size_t length, bool flush_entry, + const io::Extents &extents, bool flush_entry, int filter_ret_val); Future wait_event(ceph::mutex &lock, uint64_t tid, Context *on_safe); diff --git a/src/librbd/ObjectMap.h b/src/librbd/ObjectMap.h index 8b5b352ef..35ea4cb88 100644 --- a/src/librbd/ObjectMap.h +++ b/src/librbd/ObjectMap.h @@ -45,6 +45,12 @@ public: return m_object_map.size(); } + template + auto with_object_map(F&& f, Args&&... args) const { + std::shared_lock locker(m_lock); + return std::forward(f)(m_object_map, std::forward(args)...); + } + inline void set_state(uint64_t object_no, uint8_t new_state, const boost::optional ¤t_state) { std::unique_lock locker{m_lock}; diff --git a/src/librbd/api/DiffIterate.cc b/src/librbd/api/DiffIterate.cc index b400b5d5a..717110bd3 100644 --- a/src/librbd/api/DiffIterate.cc +++ b/src/librbd/api/DiffIterate.cc @@ -2,6 +2,7 @@ // vim: ts=8 sw=2 smarttab #include "librbd/api/DiffIterate.h" +#include "librbd/ExclusiveLock.h" #include "librbd/ImageCtx.h" #include "librbd/ImageState.h" #include "librbd/ObjectMap.h" @@ -30,6 +31,8 @@ namespace api { namespace { +constexpr uint32_t LOCK_INTERVAL_SECONDS = 5; + struct DiffContext { DiffIterate<>::Callback callback; void *callback_arg; @@ -149,12 +152,42 @@ private: } }; -int simple_diff_cb(uint64_t off, size_t len, int exists, void *arg) { - // it's possible for a discard to create a hole in the parent image -- ignore - if (exists) { - interval_set *diff = static_cast *>(arg); - diff->insert(off, len); +template +bool should_try_acquire_lock(I* image_ctx) { + if (image_ctx->exclusive_lock == nullptr || + image_ctx->exclusive_lock->is_lock_owner()) { + return false; + } + if ((image_ctx->features & RBD_FEATURE_FAST_DIFF) == 0) { + return false; + } + + utime_t now = ceph_clock_now(); + utime_t cutoff = now - utime_t(LOCK_INTERVAL_SECONDS, 0); + + { + std::shared_lock timestamp_locker{image_ctx->timestamp_lock}; + if (image_ctx->diff_iterate_lock_timestamp > cutoff) { + return false; + } + } + + std::unique_lock timestamp_locker{image_ctx->timestamp_lock}; + if (image_ctx->diff_iterate_lock_timestamp > cutoff) { + return false; } + + image_ctx->diff_iterate_lock_timestamp = now; + return true; +} + +int simple_diff_cb(uint64_t off, size_t len, int exists, void *arg) { + // This reads the existing extents in a parent from the beginning + // of time. Since images are thin-provisioned, the extents will + // always represent data, not holes. + ceph_assert(exists); + auto diff = static_cast*>(arg); + diff->insert(off, len); return 0; } @@ -167,10 +200,14 @@ int DiffIterate::diff_iterate(I *ictx, uint64_t off, uint64_t len, bool include_parent, bool whole_object, int (*cb)(uint64_t, size_t, int, void *), - void *arg) -{ - ldout(ictx->cct, 20) << "diff_iterate " << ictx << " off = " << off - << " len = " << len << dendl; + void *arg) { + ldout(ictx->cct, 10) << "from_snap_namespace=" << from_snap_namespace + << ", fromsnapname=" << (fromsnapname ?: "") + << ", off=" << off + << ", len=" << len + << ", include_parent=" << include_parent + << ", whole_object=" << whole_object + << dendl; if (!ictx->data_ctx.is_valid()) { return -ENODEV; @@ -197,11 +234,28 @@ int DiffIterate::diff_iterate(I *ictx, return r; } - ictx->image_lock.lock_shared(); - r = clip_io(ictx, off, &len, io::ImageArea::DATA); - ictx->image_lock.unlock_shared(); - if (r < 0) { - return r; + { + std::shared_lock owner_locker{ictx->owner_lock}; + std::shared_lock image_locker{ictx->image_lock}; + + r = clip_io(ictx, off, &len, io::ImageArea::DATA); + if (r < 0) { + return r; + } + + // optimization: hang onto the only object map needed to run fast + // diff against the beginning of time -- it's loaded when exclusive + // lock is acquired + // acquire exclusive lock only if not busy (i.e. don't request), + // throttle acquisition attempts and ignore errors + if (fromsnapname == nullptr && whole_object && + should_try_acquire_lock(ictx)) { + C_SaferCond lock_ctx; + ictx->exclusive_lock->try_acquire_lock(&lock_ctx); + image_locker.unlock(); + owner_locker.unlock(); + lock_ctx.wait(); + } } DiffIterate command(*ictx, from_snap_namespace, fromsnapname, off, len, @@ -210,6 +264,29 @@ int DiffIterate::diff_iterate(I *ictx, return r; } +template +std::pair DiffIterate::calc_object_diff_range() { + uint64_t period = m_image_ctx.get_stripe_period(); + uint64_t first_period_off = round_down_to(m_offset, period); + uint64_t last_period_off = round_down_to(m_offset + m_length - 1, period); + + striper::LightweightObjectExtents object_extents; + if (first_period_off != last_period_off) { + // map only the tail of the first period and the front of the last + // period instead of the entire range for efficiency + Striper::file_to_extents(m_image_ctx.cct, &m_image_ctx.layout, + m_offset, first_period_off + period - m_offset, + 0, 0, &object_extents); + Striper::file_to_extents(m_image_ctx.cct, &m_image_ctx.layout, + last_period_off, m_offset + m_length - last_period_off, + 0, 0, &object_extents); + } else { + Striper::file_to_extents(m_image_ctx.cct, &m_image_ctx.layout, m_offset, + m_length, 0, 0, &object_extents); + } + return {object_extents.front().object_no, object_extents.back().object_no + 1}; +} + template int DiffIterate::execute() { CephContext* cct = m_image_ctx.cct; @@ -244,20 +321,24 @@ int DiffIterate::execute() { int r; bool fast_diff_enabled = false; + uint64_t start_object_no, end_object_no; BitVector<2> object_diff_state; interval_set parent_diff; if (m_whole_object) { + std::tie(start_object_no, end_object_no) = calc_object_diff_range(); + C_SaferCond ctx; auto req = object_map::DiffRequest::create(&m_image_ctx, from_snap_id, - end_snap_id, + end_snap_id, start_object_no, + end_object_no, &object_diff_state, &ctx); req->send(); - r = ctx.wait(); if (r < 0) { ldout(cct, 5) << "fast diff disabled" << dendl; } else { ldout(cct, 5) << "fast diff enabled" << dendl; + ceph_assert(object_diff_state.size() == end_object_no - start_object_no); fast_diff_enabled = true; // check parent overlap only if we are comparing to the beginning of time @@ -265,12 +346,14 @@ int DiffIterate::execute() { std::shared_lock image_locker{m_image_ctx.image_lock}; uint64_t raw_overlap = 0; m_image_ctx.get_parent_overlap(m_image_ctx.snap_id, &raw_overlap); - auto overlap = m_image_ctx.reduce_parent_overlap(raw_overlap, false); - if (overlap.first > 0 && overlap.second == io::ImageArea::DATA) { + io::Extents parent_extents = {{m_offset, m_length}}; + if (m_image_ctx.prune_parent_extents(parent_extents, io::ImageArea::DATA, + raw_overlap, false) > 0) { ldout(cct, 10) << " first getting parent diff" << dendl; - DiffIterate diff_parent(*m_image_ctx.parent, {}, nullptr, 0, - overlap.first, true, true, &simple_diff_cb, - &parent_diff); + DiffIterate diff_parent(*m_image_ctx.parent, {}, nullptr, + parent_extents[0].first, + parent_extents[0].second, true, true, + &simple_diff_cb, &parent_diff); r = diff_parent.execute(); if (r < 0) { return r; @@ -292,7 +375,7 @@ int DiffIterate::execute() { uint64_t left = m_length; while (left > 0) { - uint64_t period_off = off - (off % period); + uint64_t period_off = round_down_to(off, period); uint64_t read_len = std::min(period_off + period - off, left); if (fast_diff_enabled) { @@ -307,7 +390,8 @@ int DiffIterate::execute() { io::SparseExtents aggregate_sparse_extents; for (auto& [object, extents] : object_extents) { const uint64_t object_no = extents.front().objectno; - uint8_t diff_state = object_diff_state[object_no]; + ceph_assert(object_no >= start_object_no && object_no < end_object_no); + uint8_t diff_state = object_diff_state[object_no - start_object_no]; ldout(cct, 20) << "object " << object << ": diff_state=" << (int)diff_state << dendl; diff --git a/src/librbd/api/DiffIterate.h b/src/librbd/api/DiffIterate.h index e6074d9cb..c53b0e995 100644 --- a/src/librbd/api/DiffIterate.h +++ b/src/librbd/api/DiffIterate.h @@ -7,6 +7,7 @@ #include "include/int_types.h" #include "common/bit_vector.hpp" #include "cls/rbd/cls_rbd_types.h" +#include namespace librbd { @@ -51,11 +52,9 @@ private: { } - int execute(); - - int diff_object_map(uint64_t from_snap_id, uint64_t to_snap_id, - BitVector<2>* object_diff_state); + std::pair calc_object_diff_range(); + int execute(); }; } // namespace api diff --git a/src/librbd/api/Snapshot.cc b/src/librbd/api/Snapshot.cc index 03cefbd1c..306ddb593 100644 --- a/src/librbd/api/Snapshot.cc +++ b/src/librbd/api/Snapshot.cc @@ -378,7 +378,9 @@ int Snapshot::remove(I *ictx, const char *snap_name, uint32_t flags, template int Snapshot::get_timestamp(I *ictx, uint64_t snap_id, struct timespec *timestamp) { auto snap_it = ictx->snap_info.find(snap_id); - ceph_assert(snap_it != ictx->snap_info.end()); + if (snap_it == ictx->snap_info.end()) { + return -ENOENT; + } utime_t time = snap_it->second.timestamp; time.to_timespec(timestamp); return 0; diff --git a/src/librbd/deep_copy/ImageCopyRequest.cc b/src/librbd/deep_copy/ImageCopyRequest.cc index 08e959dd5..668808340 100644 --- a/src/librbd/deep_copy/ImageCopyRequest.cc +++ b/src/librbd/deep_copy/ImageCopyRequest.cc @@ -101,9 +101,10 @@ void ImageCopyRequest::compute_diff() { auto ctx = create_context_callback< ImageCopyRequest, &ImageCopyRequest::handle_compute_diff>(this); - auto req = object_map::DiffRequest::create(m_src_image_ctx, m_src_snap_id_start, - m_src_snap_id_end, &m_object_diff_state, - ctx); + auto req = object_map::DiffRequest::create(m_src_image_ctx, + m_src_snap_id_start, + m_src_snap_id_end, 0, UINT64_MAX, + &m_object_diff_state, ctx); req->send(); } diff --git a/src/librbd/io/ImageRequest.cc b/src/librbd/io/ImageRequest.cc index e4c41c229..fb9f8944e 100644 --- a/src/librbd/io/ImageRequest.cc +++ b/src/librbd/io/ImageRequest.cc @@ -473,7 +473,7 @@ void AbstractImageWriteRequest::send_request() { if (journaling) { // in-flight ops are flushed prior to closing the journal ceph_assert(image_ctx.journal != NULL); - journal_tid = append_journal_event(m_synchronous); + journal_tid = append_journal_event(); } // it's very important that IOContext is captured here instead of @@ -518,22 +518,12 @@ void ImageWriteRequest::assemble_extent( } template -uint64_t ImageWriteRequest::append_journal_event(bool synchronous) { +uint64_t ImageWriteRequest::append_journal_event() { I &image_ctx = this->m_image_ctx; - uint64_t tid = 0; - uint64_t buffer_offset = 0; ceph_assert(!this->m_image_extents.empty()); - for (auto &extent : this->m_image_extents) { - bufferlist sub_bl; - sub_bl.substr_of(m_bl, buffer_offset, extent.second); - buffer_offset += extent.second; - - tid = image_ctx.journal->append_write_event(extent.first, extent.second, - sub_bl, synchronous); - } - - return tid; + return image_ctx.journal->append_write_event( + this->m_image_extents, m_bl, false); } template @@ -566,22 +556,12 @@ void ImageWriteRequest::update_stats(size_t length) { } template -uint64_t ImageDiscardRequest::append_journal_event(bool synchronous) { +uint64_t ImageDiscardRequest::append_journal_event() { I &image_ctx = this->m_image_ctx; - uint64_t tid = 0; ceph_assert(!this->m_image_extents.empty()); - for (auto &extent : this->m_image_extents) { - journal::EventEntry event_entry( - journal::AioDiscardEvent(extent.first, - extent.second, - this->m_discard_granularity_bytes)); - tid = image_ctx.journal->append_io_event(std::move(event_entry), - extent.first, extent.second, - synchronous, 0); - } - - return tid; + return image_ctx.journal->append_discard_event( + this->m_image_extents, m_discard_granularity_bytes, false); } template @@ -717,21 +697,12 @@ void ImageFlushRequest::send_request() { } template -uint64_t ImageWriteSameRequest::append_journal_event(bool synchronous) { +uint64_t ImageWriteSameRequest::append_journal_event() { I &image_ctx = this->m_image_ctx; - uint64_t tid = 0; ceph_assert(!this->m_image_extents.empty()); - for (auto &extent : this->m_image_extents) { - journal::EventEntry event_entry(journal::AioWriteSameEvent(extent.first, - extent.second, - m_data_bl)); - tid = image_ctx.journal->append_io_event(std::move(event_entry), - extent.first, extent.second, - synchronous, 0); - } - - return tid; + return image_ctx.journal->append_write_same_event( + this->m_image_extents, m_data_bl, false); } template @@ -768,8 +739,7 @@ void ImageWriteSameRequest::update_stats(size_t length) { } template -uint64_t ImageCompareAndWriteRequest::append_journal_event( - bool synchronous) { +uint64_t ImageCompareAndWriteRequest::append_journal_event() { I &image_ctx = this->m_image_ctx; uint64_t tid = 0; @@ -779,7 +749,7 @@ uint64_t ImageCompareAndWriteRequest::append_journal_event( extent.second, m_cmp_bl, m_bl, - synchronous); + false); return tid; } diff --git a/src/librbd/io/ImageRequest.h b/src/librbd/io/ImageRequest.h index 2668c1acb..996c90a11 100644 --- a/src/librbd/io/ImageRequest.h +++ b/src/librbd/io/ImageRequest.h @@ -114,11 +114,6 @@ private: template class AbstractImageWriteRequest : public ImageRequest { -public: - inline void flag_synchronous() { - m_synchronous = true; - } - protected: using typename ImageRequest::ObjectRequests; @@ -127,8 +122,7 @@ protected: const char *trace_name, const ZTracer::Trace &parent_trace) : ImageRequest(image_ctx, aio_comp, std::move(image_extents), - area, trace_name, parent_trace), - m_synchronous(false) { + area, trace_name, parent_trace) { } void send_request() override; @@ -144,11 +138,8 @@ protected: const LightweightObjectExtent &object_extent, IOContext io_context, uint64_t journal_tid, bool single_extent, Context *on_finish) = 0; - virtual uint64_t append_journal_event(bool synchronous) = 0; + virtual uint64_t append_journal_event() = 0; virtual void update_stats(size_t length) = 0; - -private: - bool m_synchronous; }; template @@ -180,7 +171,7 @@ protected: const LightweightObjectExtent &object_extent, IOContext io_context, uint64_t journal_tid, bool single_extent, Context *on_finish) override; - uint64_t append_journal_event(bool synchronous) override; + uint64_t append_journal_event() override; void update_stats(size_t length) override; private: @@ -215,7 +206,7 @@ protected: const LightweightObjectExtent &object_extent, IOContext io_context, uint64_t journal_tid, bool single_extent, Context *on_finish) override; - uint64_t append_journal_event(bool synchronous) override; + uint64_t append_journal_event() override; void update_stats(size_t length) override; int prune_object_extents( @@ -283,7 +274,7 @@ protected: const LightweightObjectExtent &object_extent, IOContext io_context, uint64_t journal_tid, bool single_extent, Context *on_finish) override; - uint64_t append_journal_event(bool synchronous) override; + uint64_t append_journal_event() override; void update_stats(size_t length) override; private: bufferlist m_data_bl; @@ -315,7 +306,7 @@ protected: const LightweightObjectExtent &object_extent, IOContext io_context, uint64_t journal_tid, bool single_extent, Context *on_finish) override; - uint64_t append_journal_event(bool synchronous) override; + uint64_t append_journal_event() override; void update_stats(size_t length) override; aio_type_t get_aio_type() const override { diff --git a/src/librbd/io/ObjectRequest.cc b/src/librbd/io/ObjectRequest.cc index 6d246cdf3..fc1a96858 100644 --- a/src/librbd/io/ObjectRequest.cc +++ b/src/librbd/io/ObjectRequest.cc @@ -834,16 +834,17 @@ void ObjectListSnapsRequest::handle_list_snaps(int r) { end_snap_id, &diff, &end_size, &exists, &clone_end_snap_id, &read_whole_object); - if (read_whole_object || - (!diff.empty() && - ((m_list_snaps_flags & LIST_SNAPS_FLAG_WHOLE_OBJECT) != 0))) { + if (read_whole_object) { ldout(cct, 1) << "need to read full object" << dendl; - diff.clear(); diff.insert(0, image_ctx->layout.object_size); + exists = true; end_size = image_ctx->layout.object_size; clone_end_snap_id = end_snap_id; - } else if (!exists) { - end_size = 0; + } else if ((m_list_snaps_flags & LIST_SNAPS_FLAG_WHOLE_OBJECT) != 0 && + !diff.empty()) { + ldout(cct, 20) << "expanding diff from " << diff << dendl; + diff.clear(); + diff.insert(0, image_ctx->layout.object_size); } if (exists) { @@ -863,7 +864,8 @@ void ObjectListSnapsRequest::handle_list_snaps(int r) { // clip diff to size of object (in case it was truncated) interval_set zero_interval; - if (end_size < prev_end_size) { + if (end_size < prev_end_size && + (m_list_snaps_flags & LIST_SNAPS_FLAG_WHOLE_OBJECT) == 0) { zero_interval.insert(end_size, prev_end_size - end_size); zero_interval.intersection_of(object_interval); @@ -884,7 +886,7 @@ void ObjectListSnapsRequest::handle_list_snaps(int r) { << "end_size=" << end_size << ", " << "prev_end_size=" << prev_end_size << ", " << "exists=" << exists << ", " - << "whole_object=" << read_whole_object << dendl; + << "read_whole_object=" << read_whole_object << dendl; // check if object exists prior to start of incremental snap delta so that // we don't DNE the object if no additional deltas exist diff --git a/src/librbd/io/Types.h b/src/librbd/io/Types.h index 7c70986c5..03e9ffa3b 100644 --- a/src/librbd/io/Types.h +++ b/src/librbd/io/Types.h @@ -180,8 +180,9 @@ struct SparseExtent { std::ostream& operator<<(std::ostream& os, const SparseExtent& state); struct SparseExtentSplitMerge { - SparseExtent split(uint64_t offset, uint64_t length, SparseExtent &se) const { - return SparseExtent(se.state, se.length); + SparseExtent split(uint64_t offset, uint64_t length, + const SparseExtent& se) const { + return SparseExtent(se.state, length); } bool can_merge(const SparseExtent& left, const SparseExtent& right) const { @@ -232,10 +233,10 @@ struct SparseBufferlistExtent : public SparseExtent { struct SparseBufferlistExtentSplitMerge { SparseBufferlistExtent split(uint64_t offset, uint64_t length, - SparseBufferlistExtent& sbe) const { + const SparseBufferlistExtent& sbe) const { ceph::bufferlist bl; if (sbe.state == SPARSE_EXTENT_STATE_DATA) { - bl.substr_of(bl, offset, length); + bl.substr_of(sbe.bl, offset, length); } return SparseBufferlistExtent(sbe.state, length, std::move(bl)); } @@ -247,14 +248,13 @@ struct SparseBufferlistExtentSplitMerge { SparseBufferlistExtent merge(SparseBufferlistExtent&& left, SparseBufferlistExtent&& right) const { + ceph::bufferlist bl; if (left.state == SPARSE_EXTENT_STATE_DATA) { - ceph::bufferlist bl{std::move(left.bl)}; - bl.claim_append(std::move(right.bl)); - return SparseBufferlistExtent(SPARSE_EXTENT_STATE_DATA, - bl.length(), std::move(bl)); - } else { - return SparseBufferlistExtent(left.state, left.length + right.length, {}); + bl.claim_append(left.bl); + bl.claim_append(right.bl); } + return SparseBufferlistExtent(left.state, left.length + right.length, + std::move(bl)); } uint64_t length(const SparseBufferlistExtent& sbe) const { diff --git a/src/librbd/object_map/DiffRequest.cc b/src/librbd/object_map/DiffRequest.cc index 606d48bbf..acaf31a39 100644 --- a/src/librbd/object_map/DiffRequest.cc +++ b/src/librbd/object_map/DiffRequest.cc @@ -20,6 +20,193 @@ namespace object_map { using util::create_rados_callback; +template +DiffRequest::DiffRequest(I* image_ctx, + uint64_t snap_id_start, uint64_t snap_id_end, + uint64_t start_object_no, uint64_t end_object_no, + BitVector<2>* object_diff_state, + Context* on_finish) + : m_image_ctx(image_ctx), m_snap_id_start(snap_id_start), + m_snap_id_end(snap_id_end), m_start_object_no(start_object_no), + m_end_object_no(end_object_no), m_object_diff_state(object_diff_state), + m_on_finish(on_finish) { + auto cct = m_image_ctx->cct; + ldout(cct, 10) << "snap_id_start=" << m_snap_id_start + << ", snap_id_end=" << m_snap_id_end + << ", start_object_no=" << m_start_object_no + << ", end_object_no=" << m_end_object_no + << dendl; +} + +template +bool DiffRequest::is_diff_iterate() const { + return m_start_object_no != 0 || m_end_object_no != UINT64_MAX; +} + +template +int DiffRequest::prepare_for_object_map() { + ceph_assert(ceph_mutex_is_locked(m_image_ctx->image_lock)); + + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "snap_id=" << m_current_snap_id << dendl; + + if ((m_image_ctx->features & RBD_FEATURE_FAST_DIFF) == 0) { + ldout(cct, 10) << "fast-diff feature not enabled" << dendl; + return -EINVAL; + } + + if (m_current_snap_id == CEPH_NOSNAP) { + m_current_size = m_image_ctx->size; + } else { + auto snap_it = m_image_ctx->snap_info.find(m_current_snap_id); + if (snap_it == m_image_ctx->snap_info.end()) { + ldout(cct, 10) << "snapshot " << m_current_snap_id << " does not exist" + << dendl; + return -ENOENT; + } + m_current_size = snap_it->second.size; + } + + uint64_t flags; + int r = m_image_ctx->get_flags(m_current_snap_id, &flags); + ceph_assert(r == 0); + + if ((flags & RBD_FLAG_FAST_DIFF_INVALID) != 0) { + ldout(cct, 1) << "cannot perform fast diff on invalid object map" + << dendl; + return -EINVAL; + } + + return 0; +} + +template +int DiffRequest::process_object_map(const BitVector<2>& object_map) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "snap_id=" << m_current_snap_id << dendl; + + uint64_t num_objs = Striper::get_num_objects(m_image_ctx->layout, + m_current_size); + if (object_map.size() < num_objs) { + ldout(cct, 1) << "object map too small: " + << object_map.size() << " < " << num_objs << dendl; + return -EINVAL; + } + + uint64_t start_object_no, end_object_no; + uint64_t prev_object_diff_state_size = m_object_diff_state->size(); + if (is_diff_iterate()) { + start_object_no = std::min(m_start_object_no, num_objs); + end_object_no = std::min(m_end_object_no, num_objs); + uint64_t num_objs_in_range = end_object_no - start_object_no; + if (m_object_diff_state->size() != num_objs_in_range) { + m_object_diff_state->resize(num_objs_in_range); + } + } else { + // for deep-copy, the object diff state should be the largest of + // all versions in the set, so it's only ever grown + // shrink is handled by flagging trimmed objects as non-existent + // and comparing against the previous object diff state as usual + if (m_object_diff_state->size() < num_objs) { + m_object_diff_state->resize(num_objs); + } + start_object_no = 0; + end_object_no = m_object_diff_state->size(); + } + + uint64_t overlap = std::min(m_object_diff_state->size(), + prev_object_diff_state_size); + auto it = object_map.begin() + start_object_no; + auto diff_it = m_object_diff_state->begin(); + uint64_t ono = start_object_no; + for (; ono < start_object_no + overlap; ++diff_it, ++ono) { + uint8_t object_map_state = (ono < num_objs ? *it++ : OBJECT_NONEXISTENT); + uint8_t prev_object_diff_state = *diff_it; + switch (prev_object_diff_state) { + case DIFF_STATE_HOLE: + if (object_map_state != OBJECT_NONEXISTENT) { + // stay in HOLE on intermediate snapshots for diff-iterate + if (!is_diff_iterate() || m_current_snap_id == m_snap_id_end) { + *diff_it = DIFF_STATE_DATA_UPDATED; + } + } + break; + case DIFF_STATE_DATA: + if (object_map_state == OBJECT_NONEXISTENT) { + *diff_it = DIFF_STATE_HOLE_UPDATED; + } else if (object_map_state != OBJECT_EXISTS_CLEAN) { + *diff_it = DIFF_STATE_DATA_UPDATED; + } + break; + case DIFF_STATE_HOLE_UPDATED: + if (object_map_state != OBJECT_NONEXISTENT) { + *diff_it = DIFF_STATE_DATA_UPDATED; + } + break; + case DIFF_STATE_DATA_UPDATED: + if (object_map_state == OBJECT_NONEXISTENT) { + *diff_it = DIFF_STATE_HOLE_UPDATED; + } + break; + default: + ceph_abort(); + } + + ldout(cct, 20) << "object state: " << ono << " " + << static_cast(prev_object_diff_state) + << "->" << static_cast(*diff_it) << " (" + << static_cast(object_map_state) << ")" + << dendl; + } + ldout(cct, 20) << "computed overlap diffs" << dendl; + + ceph_assert(diff_it == m_object_diff_state->end() || + end_object_no <= num_objs); + for (; ono < end_object_no; ++it, ++diff_it, ++ono) { + uint8_t object_map_state = *it; + if (object_map_state == OBJECT_NONEXISTENT) { + *diff_it = DIFF_STATE_HOLE; + } else if (m_current_snap_id != m_snap_id_start) { + // diffing against the beginning of time or image was grown + // (implicit) starting state is HOLE, this is the first object + // map after + if (is_diff_iterate()) { + // for diff-iterate, if the object is discarded prior to or + // in the end version, result should be HOLE + // since DATA_UPDATED can transition only to HOLE_UPDATED, + // stay in HOLE on intermediate snapshots -- another way to + // put this is that when starting with a hole, intermediate + // snapshots can be ignored as the result depends only on the + // end version + if (m_current_snap_id == m_snap_id_end) { + *diff_it = DIFF_STATE_DATA_UPDATED; + } else { + *diff_it = DIFF_STATE_HOLE; + } + } else { + // for deep-copy, if the object is discarded prior to or + // in the end version, result should be HOLE_UPDATED + *diff_it = DIFF_STATE_DATA_UPDATED; + } + } else { + // diffing against a snapshot, this is its object map + if (object_map_state != OBJECT_PENDING) { + *diff_it = DIFF_STATE_DATA; + } else { + *diff_it = DIFF_STATE_DATA_UPDATED; + } + } + + ldout(cct, 20) << "object state: " << ono << " " + << "->" << static_cast(*diff_it) << " (" + << static_cast(*it) << ")" << dendl; + } + ldout(cct, 20) << "computed resize diffs" << dendl; + + ceph_assert(diff_it == m_object_diff_state->end()); + return 0; +} + template void DiffRequest::send() { auto cct = m_image_ctx->cct; @@ -30,24 +217,62 @@ void DiffRequest::send() { << "snap_id_end=" << m_snap_id_end << dendl; finish(-EINVAL); return; - } else if (m_snap_id_start == m_snap_id_end) { - // no delta between the same snapshot - finish(0); + } + if (m_start_object_no == UINT64_MAX || m_start_object_no > m_end_object_no || + (m_start_object_no != 0 && m_end_object_no == UINT64_MAX)) { + lderr(cct) << "invalid start/end object numbers: " + << "start_object_no=" << m_start_object_no << ", " + << "end_object_no=" << m_end_object_no << dendl; + finish(-EINVAL); return; } m_object_diff_state->clear(); - // collect all the snap ids in the provided range (inclusive) - if (m_snap_id_start != 0) { - m_snap_ids.insert(m_snap_id_start); + if (m_snap_id_start == m_snap_id_end) { + // no delta between the same snapshot + finish(0); + return; + } + if (m_start_object_no == m_end_object_no) { + // no objects in the provided range (half-open) + finish(0); + return; } std::shared_lock image_locker{m_image_ctx->image_lock}; - auto snap_info_it = m_image_ctx->snap_info.upper_bound(m_snap_id_start); - auto snap_info_it_end = m_image_ctx->snap_info.lower_bound(m_snap_id_end); - for (; snap_info_it != snap_info_it_end; ++snap_info_it) { - m_snap_ids.insert(snap_info_it->first); + if (is_diff_iterate() && + m_snap_id_start == 0 && + m_snap_id_end == m_image_ctx->snap_id && + m_image_ctx->object_map != nullptr) { + ldout(cct, 10) << "using in-memory object map" << dendl; + m_current_snap_id = m_snap_id_end; + + int r = prepare_for_object_map(); + if (r == 0) { + r = m_image_ctx->object_map->with_object_map( + [this](const BitVector<2>& object_map) { + return process_object_map(object_map); + }); + } + image_locker.unlock(); + + finish(r); + return; + } + + // collect all the snap ids in the provided range (inclusive) unless + // this is diff-iterate against the beginning of time, in which case + // only the end version matters + if (!is_diff_iterate() || m_snap_id_start != 0) { + if (m_snap_id_start != 0) { + m_snap_ids.insert(m_snap_id_start); + } + auto snap_info_it = m_image_ctx->snap_info.upper_bound(m_snap_id_start); + auto snap_info_it_end = m_image_ctx->snap_info.lower_bound(m_snap_id_end); + for (; snap_info_it != snap_info_it_end; ++snap_info_it) { + m_snap_ids.insert(snap_info_it->first); + } } m_snap_ids.insert(m_snap_id_end); @@ -72,59 +297,23 @@ void DiffRequest::load_object_map( auto cct = m_image_ctx->cct; ldout(cct, 10) << "snap_id=" << m_current_snap_id << dendl; - if ((m_image_ctx->features & RBD_FEATURE_FAST_DIFF) == 0) { - image_locker->unlock(); - - ldout(cct, 10) << "fast-diff feature not enabled" << dendl; - finish(-EINVAL); - return; - } - // ignore ENOENT with intermediate snapshots since deleted // snaps will get merged with later snapshots m_ignore_enoent = (m_current_snap_id != m_snap_id_start && m_current_snap_id != m_snap_id_end); - if (m_current_snap_id == CEPH_NOSNAP) { - m_current_size = m_image_ctx->size; - } else { - auto snap_it = m_image_ctx->snap_info.find(m_current_snap_id); - if (snap_it == m_image_ctx->snap_info.end()) { - ldout(cct, 10) << "snapshot " << m_current_snap_id << " does not exist" - << dendl; - if (!m_ignore_enoent) { - image_locker->unlock(); - - finish(-ENOENT); - return; - } - - load_object_map(image_locker); - return; - } - - m_current_size = snap_it->second.size; - } - - uint64_t flags = 0; - int r = m_image_ctx->get_flags(m_current_snap_id, &flags); - if (r < 0) { + int r = prepare_for_object_map(); + if (r == -ENOENT && m_ignore_enoent) { + load_object_map(image_locker); + return; + } else if (r < 0) { image_locker->unlock(); - lderr(cct) << "failed to retrieve image flags: " << cpp_strerror(r) - << dendl; finish(r); return; } image_locker->unlock(); - if ((flags & RBD_FLAG_FAST_DIFF_INVALID) != 0) { - ldout(cct, 1) << "cannot perform fast diff on invalid object map" - << dendl; - finish(-EINVAL); - return; - } - std::string oid(ObjectMap<>::object_map_name(m_image_ctx->id, m_current_snap_id)); @@ -144,100 +333,27 @@ void DiffRequest::handle_load_object_map(int r) { auto cct = m_image_ctx->cct; ldout(cct, 10) << "r=" << r << dendl; + BitVector<2> object_map; + std::string oid(ObjectMap<>::object_map_name(m_image_ctx->id, + m_current_snap_id)); + if (r == 0) { auto bl_it = m_out_bl.cbegin(); - r = cls_client::object_map_load_finish(&bl_it, &m_object_map); + r = cls_client::object_map_load_finish(&bl_it, &object_map); } - - std::string oid(ObjectMap<>::object_map_name(m_image_ctx->id, - m_current_snap_id)); if (r == -ENOENT && m_ignore_enoent) { ldout(cct, 10) << "object map " << oid << " does not exist" << dendl; - - std::shared_lock image_locker{m_image_ctx->image_lock}; - load_object_map(&image_locker); - return; } else if (r < 0) { lderr(cct) << "failed to load object map: " << oid << dendl; finish(r); return; - } - ldout(cct, 20) << "loaded object map " << oid << dendl; - - uint64_t num_objs = Striper::get_num_objects(m_image_ctx->layout, - m_current_size); - if (m_object_map.size() < num_objs) { - ldout(cct, 1) << "object map too small: " - << m_object_map.size() << " < " << num_objs << dendl; - finish(-EINVAL); - return; } else { - m_object_map.resize(num_objs); - } - - uint64_t prev_object_diff_state_size = m_object_diff_state->size(); - if (prev_object_diff_state_size < num_objs) { - // the diff state should be the largest of all snapshots in the set - m_object_diff_state->resize(num_objs); - } - if (m_object_map.size() < m_object_diff_state->size()) { - // the image was shrunk so expanding the object map will flag end objects - // as non-existent and they will be compared against the previous object - // diff state - m_object_map.resize(m_object_diff_state->size()); - } - - uint64_t overlap = std::min(m_object_map.size(), prev_object_diff_state_size); - auto it = m_object_map.begin(); - auto overlap_end_it = it + overlap; - auto diff_it = m_object_diff_state->begin(); - uint64_t i = 0; - for (; it != overlap_end_it; ++it, ++diff_it, ++i) { - uint8_t object_map_state = *it; - uint8_t prev_object_diff_state = *diff_it; - if (object_map_state == OBJECT_EXISTS || - object_map_state == OBJECT_PENDING || - (object_map_state == OBJECT_EXISTS_CLEAN && - prev_object_diff_state != DIFF_STATE_DATA && - prev_object_diff_state != DIFF_STATE_DATA_UPDATED)) { - *diff_it = DIFF_STATE_DATA_UPDATED; - } else if (object_map_state == OBJECT_NONEXISTENT && - prev_object_diff_state != DIFF_STATE_HOLE && - prev_object_diff_state != DIFF_STATE_HOLE_UPDATED) { - *diff_it = DIFF_STATE_HOLE_UPDATED; - } - - ldout(cct, 20) << "object state: " << i << " " - << static_cast(prev_object_diff_state) - << "->" << static_cast(*diff_it) << " (" - << static_cast(object_map_state) << ")" - << dendl; - } - ldout(cct, 20) << "computed overlap diffs" << dendl; - - bool diff_from_start = (m_snap_id_start == 0); - auto end_it = m_object_map.end(); - if (m_object_map.size() > prev_object_diff_state_size) { - for (; it != end_it; ++it,++diff_it, ++i) { - uint8_t object_map_state = *it; - if (object_map_state == OBJECT_NONEXISTENT) { - *diff_it = DIFF_STATE_HOLE; - } else if (diff_from_start || - (m_object_diff_state_valid && - object_map_state != OBJECT_EXISTS_CLEAN)) { - *diff_it = DIFF_STATE_DATA_UPDATED; - } else { - *diff_it = DIFF_STATE_DATA; - } - - ldout(cct, 20) << "object state: " << i << " " - << "->" << static_cast(*diff_it) << " (" - << static_cast(*it) << ")" << dendl; + r = process_object_map(object_map); + if (r < 0) { + finish(r); + return; } } - ldout(cct, 20) << "computed resize diffs" << dendl; - - m_object_diff_state_valid = true; std::shared_lock image_locker{m_image_ctx->image_lock}; load_object_map(&image_locker); diff --git a/src/librbd/object_map/DiffRequest.h b/src/librbd/object_map/DiffRequest.h index e83a1629e..740f4e02a 100644 --- a/src/librbd/object_map/DiffRequest.h +++ b/src/librbd/object_map/DiffRequest.h @@ -21,21 +21,20 @@ namespace object_map { template class DiffRequest { public: - static DiffRequest* create(ImageCtxT* image_ctx, uint64_t snap_id_start, - uint64_t snap_id_end, + static DiffRequest* create(ImageCtxT* image_ctx, + uint64_t snap_id_start, uint64_t snap_id_end, + uint64_t start_object_no, uint64_t end_object_no, BitVector<2>* object_diff_state, Context* on_finish) { return new DiffRequest(image_ctx, snap_id_start, snap_id_end, - object_diff_state, on_finish); + start_object_no, end_object_no, object_diff_state, + on_finish); } - DiffRequest(ImageCtxT* image_ctx, uint64_t snap_id_start, - uint64_t snap_id_end, BitVector<2>* object_diff_state, - Context* on_finish) - : m_image_ctx(image_ctx), m_snap_id_start(snap_id_start), - m_snap_id_end(snap_id_end), m_object_diff_state(object_diff_state), - m_on_finish(on_finish) { - } + DiffRequest(ImageCtxT* image_ctx, + uint64_t snap_id_start, uint64_t snap_id_end, + uint64_t start_object_no, uint64_t end_object_no, + BitVector<2>* object_diff_state, Context* on_finish); void send(); @@ -58,6 +57,8 @@ private: ImageCtxT* m_image_ctx; uint64_t m_snap_id_start; uint64_t m_snap_id_end; + uint64_t m_start_object_no; + uint64_t m_end_object_no; BitVector<2>* m_object_diff_state; Context* m_on_finish; @@ -67,11 +68,13 @@ private: uint64_t m_current_size = 0; - BitVector<2> m_object_map; - bool m_object_diff_state_valid = false; - bufferlist m_out_bl; + bool is_diff_iterate() const; + + int prepare_for_object_map(); + int process_object_map(const BitVector<2>& object_map); + void load_object_map(std::shared_lock* image_locker); void handle_load_object_map(int r); diff --git a/src/librbd/object_map/Types.h b/src/librbd/object_map/Types.h index 0ce91bd96..576ea0e4b 100644 --- a/src/librbd/object_map/Types.h +++ b/src/librbd/object_map/Types.h @@ -8,10 +8,17 @@ namespace librbd { namespace object_map { enum DiffState { - DIFF_STATE_HOLE = 0, /* unchanged hole */ - DIFF_STATE_DATA = 1, /* unchanged data */ - DIFF_STATE_HOLE_UPDATED = 2, /* new hole */ - DIFF_STATE_DATA_UPDATED = 3 /* new data */ + // diff-iterate: hole with or without data captured in intermediate snapshot + // deep-copy: hole without data captured in intermediate snapshot + DIFF_STATE_HOLE = 0, + // diff-iterate, deep-copy: unchanged data + DIFF_STATE_DATA = 1, + // diff-iterate: new hole (data -> hole) + // deep-copy: new hole (data -> hole) or hole with data captured in + // intermediate snapshot + DIFF_STATE_HOLE_UPDATED = 2, + // diff-iterate, deep-copy: new data (hole -> data) or changed data + DIFF_STATE_DATA_UPDATED = 3 }; } // namespace object_map -- cgit v1.2.3