diff options
Diffstat (limited to 'src/librbd/journal/Replay.cc')
-rw-r--r-- | src/librbd/journal/Replay.cc | 1190 |
1 files changed, 1190 insertions, 0 deletions
diff --git a/src/librbd/journal/Replay.cc b/src/librbd/journal/Replay.cc new file mode 100644 index 00000000..f96153e7 --- /dev/null +++ b/src/librbd/journal/Replay.cc @@ -0,0 +1,1190 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/journal/Replay.h" +#include "common/dout.h" +#include "common/errno.h" +#include "common/WorkQueue.h" +#include "librbd/ExclusiveLock.h" +#include "librbd/ImageCtx.h" +#include "librbd/ImageState.h" +#include "librbd/internal.h" +#include "librbd/Operations.h" +#include "librbd/Utils.h" +#include "librbd/io/AioCompletion.h" +#include "librbd/io/ImageRequest.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::journal::Replay: " << this << " " + +namespace librbd { +namespace journal { + +namespace { + +static const uint64_t IN_FLIGHT_IO_LOW_WATER_MARK(32); +static const uint64_t IN_FLIGHT_IO_HIGH_WATER_MARK(64); + +static NoOpProgressContext no_op_progress_callback; + +template <typename I, typename E> +struct ExecuteOp : public Context { + I &image_ctx; + E event; + Context *on_op_complete; + + ExecuteOp(I &image_ctx, const E &event, Context *on_op_complete) + : image_ctx(image_ctx), event(event), on_op_complete(on_op_complete) { + } + + void execute(const journal::SnapCreateEvent &_) { + image_ctx.operations->execute_snap_create(event.snap_namespace, + event.snap_name, + on_op_complete, + event.op_tid, false); + } + + void execute(const journal::SnapRemoveEvent &_) { + image_ctx.operations->execute_snap_remove(event.snap_namespace, + event.snap_name, + on_op_complete); + } + + void execute(const journal::SnapRenameEvent &_) { + image_ctx.operations->execute_snap_rename(event.snap_id, + event.dst_snap_name, + on_op_complete); + } + + void execute(const journal::SnapProtectEvent &_) { + image_ctx.operations->execute_snap_protect(event.snap_namespace, + event.snap_name, + on_op_complete); + } + + void execute(const journal::SnapUnprotectEvent &_) { + image_ctx.operations->execute_snap_unprotect(event.snap_namespace, + event.snap_name, + on_op_complete); + } + + void execute(const journal::SnapRollbackEvent &_) { + image_ctx.operations->execute_snap_rollback(event.snap_namespace, + event.snap_name, + no_op_progress_callback, + on_op_complete); + } + + void execute(const journal::RenameEvent &_) { + image_ctx.operations->execute_rename(event.image_name, + on_op_complete); + } + + void execute(const journal::ResizeEvent &_) { + image_ctx.operations->execute_resize(event.size, true, no_op_progress_callback, + on_op_complete, event.op_tid); + } + + void execute(const journal::FlattenEvent &_) { + image_ctx.operations->execute_flatten(no_op_progress_callback, + on_op_complete); + } + + void execute(const journal::SnapLimitEvent &_) { + image_ctx.operations->execute_snap_set_limit(event.limit, on_op_complete); + } + + void execute(const journal::UpdateFeaturesEvent &_) { + image_ctx.operations->execute_update_features(event.features, event.enabled, + on_op_complete, event.op_tid); + } + + void execute(const journal::MetadataSetEvent &_) { + image_ctx.operations->execute_metadata_set(event.key, event.value, + on_op_complete); + } + + void execute(const journal::MetadataRemoveEvent &_) { + image_ctx.operations->execute_metadata_remove(event.key, on_op_complete); + } + + void finish(int r) override { + CephContext *cct = image_ctx.cct; + if (r < 0) { + lderr(cct) << ": ExecuteOp::" << __func__ << ": r=" << r << dendl; + on_op_complete->complete(r); + return; + } + + ldout(cct, 20) << ": ExecuteOp::" << __func__ << dendl; + RWLock::RLocker owner_locker(image_ctx.owner_lock); + + if (image_ctx.exclusive_lock == nullptr || + !image_ctx.exclusive_lock->accept_ops()) { + ldout(cct, 5) << ": lost exclusive lock -- skipping op" << dendl; + on_op_complete->complete(-ECANCELED); + return; + } + + execute(event); + } +}; + +template <typename I> +struct C_RefreshIfRequired : public Context { + I &image_ctx; + Context *on_finish; + + C_RefreshIfRequired(I &image_ctx, Context *on_finish) + : image_ctx(image_ctx), on_finish(on_finish) { + } + ~C_RefreshIfRequired() override { + delete on_finish; + } + + void finish(int r) override { + CephContext *cct = image_ctx.cct; + Context *ctx = on_finish; + on_finish = nullptr; + + if (r < 0) { + lderr(cct) << ": C_RefreshIfRequired::" << __func__ << ": r=" << r << dendl; + image_ctx.op_work_queue->queue(ctx, r); + return; + } + + if (image_ctx.state->is_refresh_required()) { + ldout(cct, 20) << ": C_RefreshIfRequired::" << __func__ << ": " + << "refresh required" << dendl; + image_ctx.state->refresh(ctx); + return; + } + + image_ctx.op_work_queue->queue(ctx, 0); + } +}; + +} // anonymous namespace + +#undef dout_prefix +#define dout_prefix *_dout << "librbd::journal::Replay: " << this << " " \ + << __func__ + +template <typename I> +Replay<I>::Replay(I &image_ctx) + : m_image_ctx(image_ctx), m_lock("Replay<I>::m_lock") { +} + +template <typename I> +Replay<I>::~Replay() { + ceph_assert(m_in_flight_aio_flush == 0); + ceph_assert(m_in_flight_aio_modify == 0); + ceph_assert(m_aio_modify_unsafe_contexts.empty()); + ceph_assert(m_aio_modify_safe_contexts.empty()); + ceph_assert(m_op_events.empty()); + ceph_assert(m_in_flight_op_events == 0); +} + +template <typename I> +int Replay<I>::decode(bufferlist::const_iterator *it, EventEntry *event_entry) { + try { + using ceph::decode; + decode(*event_entry, *it); + } catch (const buffer::error &err) { + return -EBADMSG; + } + return 0; +} + +template <typename I> +void Replay<I>::process(const EventEntry &event_entry, + Context *on_ready, Context *on_safe) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << ": on_ready=" << on_ready << ", on_safe=" << on_safe + << dendl; + + on_ready = util::create_async_context_callback(m_image_ctx, on_ready); + + RWLock::RLocker owner_lock(m_image_ctx.owner_lock); + if (m_image_ctx.exclusive_lock == nullptr || + !m_image_ctx.exclusive_lock->accept_ops()) { + ldout(cct, 5) << ": lost exclusive lock -- skipping event" << dendl; + m_image_ctx.op_work_queue->queue(on_safe, -ECANCELED); + on_ready->complete(0); + return; + } + + boost::apply_visitor(EventVisitor(this, on_ready, on_safe), + event_entry.event); +} + +template <typename I> +void Replay<I>::shut_down(bool cancel_ops, Context *on_finish) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << dendl; + + io::AioCompletion *flush_comp = nullptr; + on_finish = util::create_async_context_callback( + m_image_ctx, on_finish); + + { + Mutex::Locker locker(m_lock); + + // safely commit any remaining AIO modify operations + if ((m_in_flight_aio_flush + m_in_flight_aio_modify) != 0) { + flush_comp = create_aio_flush_completion(nullptr); + ceph_assert(flush_comp != nullptr); + } + + for (auto &op_event_pair : m_op_events) { + OpEvent &op_event = op_event_pair.second; + if (cancel_ops) { + // cancel ops that are waiting to start (waiting for + // OpFinishEvent or waiting for ready) + if (op_event.on_start_ready == nullptr && + op_event.on_op_finish_event != nullptr) { + Context *on_op_finish_event = nullptr; + std::swap(on_op_finish_event, op_event.on_op_finish_event); + m_image_ctx.op_work_queue->queue(on_op_finish_event, -ERESTART); + } + } else if (op_event.on_op_finish_event != nullptr) { + // start ops waiting for OpFinishEvent + Context *on_op_finish_event = nullptr; + std::swap(on_op_finish_event, op_event.on_op_finish_event); + m_image_ctx.op_work_queue->queue(on_op_finish_event, 0); + } else if (op_event.on_start_ready != nullptr) { + // waiting for op ready + op_event_pair.second.finish_on_ready = true; + } + } + + ceph_assert(!m_shut_down); + m_shut_down = true; + + ceph_assert(m_flush_ctx == nullptr); + if (m_in_flight_op_events > 0 || flush_comp != nullptr) { + std::swap(m_flush_ctx, on_finish); + } + } + + // execute the following outside of lock scope + if (flush_comp != nullptr) { + RWLock::RLocker owner_locker(m_image_ctx.owner_lock); + io::ImageRequest<I>::aio_flush(&m_image_ctx, flush_comp, + io::FLUSH_SOURCE_INTERNAL, {}); + } + if (on_finish != nullptr) { + on_finish->complete(0); + } +} + +template <typename I> +void Replay<I>::flush(Context *on_finish) { + io::AioCompletion *aio_comp; + { + Mutex::Locker locker(m_lock); + aio_comp = create_aio_flush_completion( + util::create_async_context_callback(m_image_ctx, on_finish)); + if (aio_comp == nullptr) { + return; + } + } + + RWLock::RLocker owner_locker(m_image_ctx.owner_lock); + io::ImageRequest<I>::aio_flush(&m_image_ctx, aio_comp, + io::FLUSH_SOURCE_INTERNAL, {}); +} + +template <typename I> +void Replay<I>::replay_op_ready(uint64_t op_tid, Context *on_resume) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << ": op_tid=" << op_tid << dendl; + + Mutex::Locker locker(m_lock); + auto op_it = m_op_events.find(op_tid); + ceph_assert(op_it != m_op_events.end()); + + OpEvent &op_event = op_it->second; + ceph_assert(op_event.op_in_progress && + op_event.on_op_finish_event == nullptr && + op_event.on_finish_ready == nullptr && + op_event.on_finish_safe == nullptr); + + // resume processing replay events + Context *on_start_ready = nullptr; + std::swap(on_start_ready, op_event.on_start_ready); + on_start_ready->complete(0); + + // cancel has been requested -- send error to paused state machine + if (!op_event.finish_on_ready && m_flush_ctx != nullptr) { + m_image_ctx.op_work_queue->queue(on_resume, -ERESTART); + return; + } + + // resume the op state machine once the associated OpFinishEvent + // is processed + op_event.on_op_finish_event = new FunctionContext( + [on_resume](int r) { + on_resume->complete(r); + }); + + // shut down request -- don't expect OpFinishEvent + if (op_event.finish_on_ready) { + m_image_ctx.op_work_queue->queue(on_resume, 0); + } +} + +template <typename I> +void Replay<I>::handle_event(const journal::AioDiscardEvent &event, + Context *on_ready, Context *on_safe) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << ": AIO discard event" << dendl; + + bool flush_required; + auto aio_comp = create_aio_modify_completion(on_ready, on_safe, + io::AIO_TYPE_DISCARD, + &flush_required, + {}); + if (aio_comp == nullptr) { + return; + } + + if (!clipped_io(event.offset, aio_comp)) { + io::ImageRequest<I>::aio_discard(&m_image_ctx, aio_comp, + {{event.offset, event.length}}, + event.discard_granularity_bytes, {}); + } + + if (flush_required) { + m_lock.Lock(); + auto flush_comp = create_aio_flush_completion(nullptr); + m_lock.Unlock(); + + if (flush_comp != nullptr) { + io::ImageRequest<I>::aio_flush(&m_image_ctx, flush_comp, + io::FLUSH_SOURCE_INTERNAL, {}); + } + } +} + +template <typename I> +void Replay<I>::handle_event(const journal::AioWriteEvent &event, + Context *on_ready, Context *on_safe) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << ": AIO write event" << dendl; + + bufferlist data = event.data; + bool flush_required; + auto aio_comp = create_aio_modify_completion(on_ready, on_safe, + io::AIO_TYPE_WRITE, + &flush_required, + {}); + if (aio_comp == nullptr) { + return; + } + + if (!clipped_io(event.offset, aio_comp)) { + io::ImageRequest<I>::aio_write(&m_image_ctx, aio_comp, + {{event.offset, event.length}}, + std::move(data), 0, {}); + } + + if (flush_required) { + m_lock.Lock(); + auto flush_comp = create_aio_flush_completion(nullptr); + m_lock.Unlock(); + + if (flush_comp != nullptr) { + io::ImageRequest<I>::aio_flush(&m_image_ctx, flush_comp, + io::FLUSH_SOURCE_INTERNAL, {}); + } + } +} + +template <typename I> +void Replay<I>::handle_event(const journal::AioFlushEvent &event, + Context *on_ready, Context *on_safe) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << ": AIO flush event" << dendl; + + io::AioCompletion *aio_comp; + { + Mutex::Locker locker(m_lock); + aio_comp = create_aio_flush_completion(on_safe); + } + + if (aio_comp != nullptr) { + io::ImageRequest<I>::aio_flush(&m_image_ctx, aio_comp, + io::FLUSH_SOURCE_INTERNAL, {}); + } + on_ready->complete(0); +} + +template <typename I> +void Replay<I>::handle_event(const journal::AioWriteSameEvent &event, + Context *on_ready, Context *on_safe) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << ": AIO writesame event" << dendl; + + bufferlist data = event.data; + bool flush_required; + auto aio_comp = create_aio_modify_completion(on_ready, on_safe, + io::AIO_TYPE_WRITESAME, + &flush_required, + {}); + if (aio_comp == nullptr) { + return; + } + + if (!clipped_io(event.offset, aio_comp)) { + io::ImageRequest<I>::aio_writesame(&m_image_ctx, aio_comp, + {{event.offset, event.length}}, + std::move(data), 0, {}); + } + + if (flush_required) { + m_lock.Lock(); + auto flush_comp = create_aio_flush_completion(nullptr); + m_lock.Unlock(); + + if (flush_comp != nullptr) { + io::ImageRequest<I>::aio_flush(&m_image_ctx, flush_comp, + io::FLUSH_SOURCE_INTERNAL, {}); + } + } +} + + template <typename I> + void Replay<I>::handle_event(const journal::AioCompareAndWriteEvent &event, + Context *on_ready, Context *on_safe) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << ": AIO CompareAndWrite event" << dendl; + + bufferlist cmp_data = event.cmp_data; + bufferlist write_data = event.write_data; + bool flush_required; + auto aio_comp = create_aio_modify_completion(on_ready, on_safe, + io::AIO_TYPE_COMPARE_AND_WRITE, + &flush_required, + {-EILSEQ}); + + if (!clipped_io(event.offset, aio_comp)) { + io::ImageRequest<I>::aio_compare_and_write(&m_image_ctx, aio_comp, + {{event.offset, event.length}}, + std::move(cmp_data), + std::move(write_data), + nullptr, 0, {}); + } + + if (flush_required) { + m_lock.Lock(); + auto flush_comp = create_aio_flush_completion(nullptr); + m_lock.Unlock(); + + io::ImageRequest<I>::aio_flush(&m_image_ctx, flush_comp, + io::FLUSH_SOURCE_INTERNAL, {}); + } +} + +template <typename I> +void Replay<I>::handle_event(const journal::OpFinishEvent &event, + Context *on_ready, Context *on_safe) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << ": Op finish event: " + << "op_tid=" << event.op_tid << dendl; + + bool op_in_progress; + bool filter_ret_val; + Context *on_op_complete = nullptr; + Context *on_op_finish_event = nullptr; + { + Mutex::Locker locker(m_lock); + auto op_it = m_op_events.find(event.op_tid); + if (op_it == m_op_events.end()) { + ldout(cct, 10) << ": unable to locate associated op: assuming previously " + << "committed." << dendl; + on_ready->complete(0); + m_image_ctx.op_work_queue->queue(on_safe, 0); + return; + } + + OpEvent &op_event = op_it->second; + ceph_assert(op_event.on_finish_safe == nullptr); + op_event.on_finish_ready = on_ready; + op_event.on_finish_safe = on_safe; + op_in_progress = op_event.op_in_progress; + std::swap(on_op_complete, op_event.on_op_complete); + std::swap(on_op_finish_event, op_event.on_op_finish_event); + + // special errors which indicate op never started but was recorded + // as failed in the journal + filter_ret_val = (op_event.op_finish_error_codes.count(event.r) != 0); + } + + if (event.r < 0) { + if (op_in_progress) { + // bubble the error up to the in-progress op to cancel it + on_op_finish_event->complete(event.r); + } else { + // op hasn't been started -- bubble the error up since + // our image is now potentially in an inconsistent state + // since simple errors should have been caught before + // creating the op event + delete on_op_complete; + delete on_op_finish_event; + handle_op_complete(event.op_tid, filter_ret_val ? 0 : event.r); + } + return; + } + + // journal recorded success -- apply the op now + on_op_finish_event->complete(0); +} + +template <typename I> +void Replay<I>::handle_event(const journal::SnapCreateEvent &event, + Context *on_ready, Context *on_safe) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << ": Snap create event" << dendl; + + Mutex::Locker locker(m_lock); + OpEvent *op_event; + Context *on_op_complete = create_op_context_callback(event.op_tid, on_ready, + on_safe, &op_event); + if (on_op_complete == nullptr) { + return; + } + + // ignore errors caused due to replay + op_event->ignore_error_codes = {-EEXIST}; + + // avoid lock cycles + m_image_ctx.op_work_queue->queue(new C_RefreshIfRequired<I>( + m_image_ctx, new ExecuteOp<I, journal::SnapCreateEvent>(m_image_ctx, event, + on_op_complete)), + 0); + + // do not process more events until the state machine is ready + // since it will affect IO + op_event->op_in_progress = true; + op_event->on_start_ready = on_ready; +} + +template <typename I> +void Replay<I>::handle_event(const journal::SnapRemoveEvent &event, + Context *on_ready, Context *on_safe) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << ": Snap remove event" << dendl; + + Mutex::Locker locker(m_lock); + OpEvent *op_event; + Context *on_op_complete = create_op_context_callback(event.op_tid, on_ready, + on_safe, &op_event); + if (on_op_complete == nullptr) { + return; + } + + op_event->on_op_finish_event = new C_RefreshIfRequired<I>( + m_image_ctx, new ExecuteOp<I, journal::SnapRemoveEvent>(m_image_ctx, event, + on_op_complete)); + + // ignore errors caused due to replay + op_event->ignore_error_codes = {-ENOENT}; + + on_ready->complete(0); +} + +template <typename I> +void Replay<I>::handle_event(const journal::SnapRenameEvent &event, + Context *on_ready, Context *on_safe) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << ": Snap rename event" << dendl; + + Mutex::Locker locker(m_lock); + OpEvent *op_event; + Context *on_op_complete = create_op_context_callback(event.op_tid, on_ready, + on_safe, &op_event); + if (on_op_complete == nullptr) { + return; + } + + op_event->on_op_finish_event = new C_RefreshIfRequired<I>( + m_image_ctx, new ExecuteOp<I, journal::SnapRenameEvent>(m_image_ctx, event, + on_op_complete)); + + // ignore errors caused due to replay + op_event->ignore_error_codes = {-EEXIST}; + + on_ready->complete(0); +} + +template <typename I> +void Replay<I>::handle_event(const journal::SnapProtectEvent &event, + Context *on_ready, Context *on_safe) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << ": Snap protect event" << dendl; + + Mutex::Locker locker(m_lock); + OpEvent *op_event; + Context *on_op_complete = create_op_context_callback(event.op_tid, on_ready, + on_safe, &op_event); + if (on_op_complete == nullptr) { + return; + } + + op_event->on_op_finish_event = new C_RefreshIfRequired<I>( + m_image_ctx, new ExecuteOp<I, journal::SnapProtectEvent>(m_image_ctx, event, + on_op_complete)); + + // ignore errors caused due to replay + op_event->ignore_error_codes = {-EBUSY}; + + on_ready->complete(0); +} + +template <typename I> +void Replay<I>::handle_event(const journal::SnapUnprotectEvent &event, + Context *on_ready, Context *on_safe) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << ": Snap unprotect event" << dendl; + + Mutex::Locker locker(m_lock); + OpEvent *op_event; + Context *on_op_complete = create_op_context_callback(event.op_tid, on_ready, + on_safe, &op_event); + if (on_op_complete == nullptr) { + return; + } + + op_event->on_op_finish_event = new C_RefreshIfRequired<I>( + m_image_ctx, new ExecuteOp<I, journal::SnapUnprotectEvent>(m_image_ctx, + event, + on_op_complete)); + + // ignore errors recorded in the journal + op_event->op_finish_error_codes = {-EBUSY}; + + // ignore errors caused due to replay + op_event->ignore_error_codes = {-EINVAL}; + + on_ready->complete(0); +} + +template <typename I> +void Replay<I>::handle_event(const journal::SnapRollbackEvent &event, + Context *on_ready, Context *on_safe) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << ": Snap rollback start event" << dendl; + + Mutex::Locker locker(m_lock); + OpEvent *op_event; + Context *on_op_complete = create_op_context_callback(event.op_tid, on_ready, + on_safe, &op_event); + if (on_op_complete == nullptr) { + return; + } + + op_event->on_op_finish_event = new C_RefreshIfRequired<I>( + m_image_ctx, new ExecuteOp<I, journal::SnapRollbackEvent>(m_image_ctx, + event, + on_op_complete)); + + on_ready->complete(0); +} + +template <typename I> +void Replay<I>::handle_event(const journal::RenameEvent &event, + Context *on_ready, Context *on_safe) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << ": Rename event" << dendl; + + Mutex::Locker locker(m_lock); + OpEvent *op_event; + Context *on_op_complete = create_op_context_callback(event.op_tid, on_ready, + on_safe, &op_event); + if (on_op_complete == nullptr) { + return; + } + + op_event->on_op_finish_event = new C_RefreshIfRequired<I>( + m_image_ctx, new ExecuteOp<I, journal::RenameEvent>(m_image_ctx, event, + on_op_complete)); + + // ignore errors caused due to replay + op_event->ignore_error_codes = {-EEXIST}; + + on_ready->complete(0); +} + +template <typename I> +void Replay<I>::handle_event(const journal::ResizeEvent &event, + Context *on_ready, Context *on_safe) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << ": Resize start event" << dendl; + + Mutex::Locker locker(m_lock); + OpEvent *op_event; + Context *on_op_complete = create_op_context_callback(event.op_tid, on_ready, + on_safe, &op_event); + if (on_op_complete == nullptr) { + return; + } + + // avoid lock cycles + m_image_ctx.op_work_queue->queue(new C_RefreshIfRequired<I>( + m_image_ctx, new ExecuteOp<I, journal::ResizeEvent>(m_image_ctx, event, + on_op_complete)), 0); + + // do not process more events until the state machine is ready + // since it will affect IO + op_event->op_in_progress = true; + op_event->on_start_ready = on_ready; +} + +template <typename I> +void Replay<I>::handle_event(const journal::FlattenEvent &event, + Context *on_ready, Context *on_safe) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << ": Flatten start event" << dendl; + + Mutex::Locker locker(m_lock); + OpEvent *op_event; + Context *on_op_complete = create_op_context_callback(event.op_tid, on_ready, + on_safe, &op_event); + if (on_op_complete == nullptr) { + return; + } + + op_event->on_op_finish_event = new C_RefreshIfRequired<I>( + m_image_ctx, new ExecuteOp<I, journal::FlattenEvent>(m_image_ctx, event, + on_op_complete)); + + // ignore errors caused due to replay + op_event->ignore_error_codes = {-EINVAL}; + + on_ready->complete(0); +} + +template <typename I> +void Replay<I>::handle_event(const journal::DemotePromoteEvent &event, + Context *on_ready, Context *on_safe) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << ": Demote/Promote event" << dendl; + on_ready->complete(0); + on_safe->complete(0); +} + +template <typename I> +void Replay<I>::handle_event(const journal::SnapLimitEvent &event, + Context *on_ready, Context *on_safe) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << ": Snap limit event" << dendl; + + Mutex::Locker locker(m_lock); + OpEvent *op_event; + Context *on_op_complete = create_op_context_callback(event.op_tid, on_ready, + on_safe, &op_event); + if (on_op_complete == nullptr) { + return; + } + + op_event->on_op_finish_event = new C_RefreshIfRequired<I>( + m_image_ctx, new ExecuteOp<I, journal::SnapLimitEvent>(m_image_ctx, + event, + on_op_complete)); + + op_event->ignore_error_codes = {-ERANGE}; + + on_ready->complete(0); +} + +template <typename I> +void Replay<I>::handle_event(const journal::UpdateFeaturesEvent &event, + Context *on_ready, Context *on_safe) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << ": Update features event" << dendl; + + Mutex::Locker locker(m_lock); + OpEvent *op_event; + Context *on_op_complete = create_op_context_callback(event.op_tid, on_ready, + on_safe, &op_event); + if (on_op_complete == nullptr) { + return; + } + + // avoid lock cycles + m_image_ctx.op_work_queue->queue(new C_RefreshIfRequired<I>( + m_image_ctx, new ExecuteOp<I, journal::UpdateFeaturesEvent>( + m_image_ctx, event, on_op_complete)), 0); + + // do not process more events until the state machine is ready + // since it will affect IO + op_event->op_in_progress = true; + op_event->on_start_ready = on_ready; +} + +template <typename I> +void Replay<I>::handle_event(const journal::MetadataSetEvent &event, + Context *on_ready, Context *on_safe) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << ": Metadata set event" << dendl; + + Mutex::Locker locker(m_lock); + OpEvent *op_event; + Context *on_op_complete = create_op_context_callback(event.op_tid, on_ready, + on_safe, &op_event); + if (on_op_complete == nullptr) { + return; + } + + on_op_complete = new C_RefreshIfRequired<I>(m_image_ctx, on_op_complete); + op_event->on_op_finish_event = util::create_async_context_callback( + m_image_ctx, new ExecuteOp<I, journal::MetadataSetEvent>( + m_image_ctx, event, on_op_complete)); + + on_ready->complete(0); +} + +template <typename I> +void Replay<I>::handle_event(const journal::MetadataRemoveEvent &event, + Context *on_ready, Context *on_safe) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << ": Metadata remove event" << dendl; + + Mutex::Locker locker(m_lock); + OpEvent *op_event; + Context *on_op_complete = create_op_context_callback(event.op_tid, on_ready, + on_safe, &op_event); + if (on_op_complete == nullptr) { + return; + } + + on_op_complete = new C_RefreshIfRequired<I>(m_image_ctx, on_op_complete); + op_event->on_op_finish_event = util::create_async_context_callback( + m_image_ctx, new ExecuteOp<I, journal::MetadataRemoveEvent>( + m_image_ctx, event, on_op_complete)); + + // ignore errors caused due to replay + op_event->ignore_error_codes = {-ENOENT}; + + on_ready->complete(0); +} + +template <typename I> +void Replay<I>::handle_event(const journal::UnknownEvent &event, + Context *on_ready, Context *on_safe) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << ": unknown event" << dendl; + on_ready->complete(0); + on_safe->complete(0); +} + +template <typename I> +void Replay<I>::handle_aio_modify_complete(Context *on_ready, Context *on_safe, + int r, std::set<int> &filters, + bool writeback_cache_enabled) { + Mutex::Locker locker(m_lock); + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << ": on_ready=" << on_ready << ", " + << "on_safe=" << on_safe << ", r=" << r << dendl; + + if (on_ready != nullptr) { + on_ready->complete(0); + } + + if (filters.find(r) != filters.end()) + r = 0; + + if (r < 0) { + lderr(cct) << ": AIO modify op failed: " << cpp_strerror(r) << dendl; + m_image_ctx.op_work_queue->queue(on_safe, r); + return; + } + + if (writeback_cache_enabled) { + // will be completed after next flush operation completes + m_aio_modify_safe_contexts.insert(on_safe); + } else { + // IO is safely stored on disk + ceph_assert(m_in_flight_aio_modify > 0); + --m_in_flight_aio_modify; + + if (m_on_aio_ready != nullptr) { + ldout(cct, 10) << ": resuming paused AIO" << dendl; + m_on_aio_ready->complete(0); + m_on_aio_ready = nullptr; + } + + ldout(cct, 20) << ": completing safe context: " << on_safe << dendl; + m_image_ctx.op_work_queue->queue(on_safe, 0); + } +} + +template <typename I> +void Replay<I>::handle_aio_flush_complete(Context *on_flush_safe, + Contexts &on_safe_ctxs, int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << ": r=" << r << dendl; + + if (r < 0) { + lderr(cct) << ": AIO flush failed: " << cpp_strerror(r) << dendl; + } + + Context *on_aio_ready = nullptr; + Context *on_flush = nullptr; + { + Mutex::Locker locker(m_lock); + ceph_assert(m_in_flight_aio_flush > 0); + ceph_assert(m_in_flight_aio_modify >= on_safe_ctxs.size()); + --m_in_flight_aio_flush; + m_in_flight_aio_modify -= on_safe_ctxs.size(); + + std::swap(on_aio_ready, m_on_aio_ready); + if (m_in_flight_op_events == 0 && + (m_in_flight_aio_flush + m_in_flight_aio_modify) == 0) { + on_flush = m_flush_ctx; + } + + // strip out previously failed on_safe contexts + for (auto it = on_safe_ctxs.begin(); it != on_safe_ctxs.end(); ) { + if (m_aio_modify_safe_contexts.erase(*it)) { + ++it; + } else { + it = on_safe_ctxs.erase(it); + } + } + } + + if (on_aio_ready != nullptr) { + ldout(cct, 10) << ": resuming paused AIO" << dendl; + on_aio_ready->complete(0); + } + + if (on_flush_safe != nullptr) { + on_safe_ctxs.push_back(on_flush_safe); + } + for (auto ctx : on_safe_ctxs) { + ldout(cct, 20) << ": completing safe context: " << ctx << dendl; + ctx->complete(r); + } + + if (on_flush != nullptr) { + ldout(cct, 20) << ": completing flush context: " << on_flush << dendl; + on_flush->complete(r); + } +} + +template <typename I> +Context *Replay<I>::create_op_context_callback(uint64_t op_tid, + Context *on_ready, + Context *on_safe, + OpEvent **op_event) { + CephContext *cct = m_image_ctx.cct; + if (m_shut_down) { + ldout(cct, 5) << ": ignoring event after shut down" << dendl; + on_ready->complete(0); + m_image_ctx.op_work_queue->queue(on_safe, -ESHUTDOWN); + return nullptr; + } + + ceph_assert(m_lock.is_locked()); + if (m_op_events.count(op_tid) != 0) { + lderr(cct) << ": duplicate op tid detected: " << op_tid << dendl; + + // on_ready is already async but on failure invoke on_safe async + // as well + on_ready->complete(0); + m_image_ctx.op_work_queue->queue(on_safe, -EINVAL); + return nullptr; + } + + ++m_in_flight_op_events; + *op_event = &m_op_events[op_tid]; + (*op_event)->on_start_safe = on_safe; + + Context *on_op_complete = new C_OpOnComplete(this, op_tid); + (*op_event)->on_op_complete = on_op_complete; + return on_op_complete; +} + +template <typename I> +void Replay<I>::handle_op_complete(uint64_t op_tid, int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << ": op_tid=" << op_tid << ", " + << "r=" << r << dendl; + + OpEvent op_event; + bool shutting_down = false; + { + Mutex::Locker locker(m_lock); + auto op_it = m_op_events.find(op_tid); + ceph_assert(op_it != m_op_events.end()); + + op_event = std::move(op_it->second); + m_op_events.erase(op_it); + + if (m_shut_down) { + ceph_assert(m_flush_ctx != nullptr); + shutting_down = true; + } + } + + ceph_assert(op_event.on_start_ready == nullptr || (r < 0 && r != -ERESTART)); + if (op_event.on_start_ready != nullptr) { + // blocking op event failed before it became ready + ceph_assert(op_event.on_finish_ready == nullptr && + op_event.on_finish_safe == nullptr); + + op_event.on_start_ready->complete(0); + } else { + // event kicked off by OpFinishEvent + ceph_assert((op_event.on_finish_ready != nullptr && + op_event.on_finish_safe != nullptr) || shutting_down); + } + + if (op_event.on_op_finish_event != nullptr) { + op_event.on_op_finish_event->complete(r); + } + + if (op_event.on_finish_ready != nullptr) { + op_event.on_finish_ready->complete(0); + } + + // filter out errors caused by replay of the same op + if (r < 0 && op_event.ignore_error_codes.count(r) != 0) { + r = 0; + } + + op_event.on_start_safe->complete(r); + if (op_event.on_finish_safe != nullptr) { + op_event.on_finish_safe->complete(r); + } + + // shut down request might have occurred while lock was + // dropped -- handle if pending + Context *on_flush = nullptr; + { + Mutex::Locker locker(m_lock); + ceph_assert(m_in_flight_op_events > 0); + --m_in_flight_op_events; + if (m_in_flight_op_events == 0 && + (m_in_flight_aio_flush + m_in_flight_aio_modify) == 0) { + on_flush = m_flush_ctx; + } + } + if (on_flush != nullptr) { + m_image_ctx.op_work_queue->queue(on_flush, 0); + } +} + +template <typename I> +io::AioCompletion * +Replay<I>::create_aio_modify_completion(Context *on_ready, + Context *on_safe, + io::aio_type_t aio_type, + bool *flush_required, + std::set<int> &&filters) { + Mutex::Locker locker(m_lock); + CephContext *cct = m_image_ctx.cct; + ceph_assert(m_on_aio_ready == nullptr); + + if (m_shut_down) { + ldout(cct, 5) << ": ignoring event after shut down" << dendl; + on_ready->complete(0); + m_image_ctx.op_work_queue->queue(on_safe, -ESHUTDOWN); + return nullptr; + } + + ++m_in_flight_aio_modify; + + bool writeback_cache_enabled = m_image_ctx.is_writeback_cache_enabled(); + if (writeback_cache_enabled) { + m_aio_modify_unsafe_contexts.push_back(on_safe); + } + + // FLUSH if we hit the low-water mark -- on_safe contexts are + // completed by flushes-only so that we don't move the journal + // commit position until safely on-disk + + *flush_required = (writeback_cache_enabled && + m_aio_modify_unsafe_contexts.size() == + IN_FLIGHT_IO_LOW_WATER_MARK); + if (*flush_required) { + ldout(cct, 10) << ": hit AIO replay low-water mark: scheduling flush" + << dendl; + } + + // READY for more events if: + // * not at high-water mark for IO + // * in-flight ops are at a consistent point (snap create has IO flushed, + // shrink has adjusted clip boundary, etc) -- should have already been + // flagged not-ready + if (m_in_flight_aio_modify == IN_FLIGHT_IO_HIGH_WATER_MARK) { + ldout(cct, 10) << ": hit AIO replay high-water mark: pausing replay" + << dendl; + ceph_assert(m_on_aio_ready == nullptr); + std::swap(m_on_aio_ready, on_ready); + } + + // when the modification is ACKed by librbd, we can process the next + // event. when flushed, the completion of the next flush will fire the + // on_safe callback + auto aio_comp = io::AioCompletion::create_and_start<Context>( + new C_AioModifyComplete(this, on_ready, on_safe, std::move(filters), + writeback_cache_enabled), + util::get_image_ctx(&m_image_ctx), aio_type); + return aio_comp; +} + +template <typename I> +io::AioCompletion *Replay<I>::create_aio_flush_completion(Context *on_safe) { + ceph_assert(m_lock.is_locked()); + + CephContext *cct = m_image_ctx.cct; + if (m_shut_down) { + ldout(cct, 5) << ": ignoring event after shut down" << dendl; + if (on_safe != nullptr) { + m_image_ctx.op_work_queue->queue(on_safe, -ESHUTDOWN); + } + return nullptr; + } + + ++m_in_flight_aio_flush; + + // associate all prior write/discard ops to this flush request + auto aio_comp = io::AioCompletion::create_and_start<Context>( + new C_AioFlushComplete(this, on_safe, + std::move(m_aio_modify_unsafe_contexts)), + util::get_image_ctx(&m_image_ctx), io::AIO_TYPE_FLUSH); + m_aio_modify_unsafe_contexts.clear(); + return aio_comp; +} + +template <typename I> +bool Replay<I>::clipped_io(uint64_t image_offset, io::AioCompletion *aio_comp) { + CephContext *cct = m_image_ctx.cct; + + m_image_ctx.snap_lock.get_read(); + size_t image_size = m_image_ctx.size; + m_image_ctx.snap_lock.put_read(); + + if (image_offset >= image_size) { + // rbd-mirror image sync might race an IO event w/ associated resize between + // the point the peer is registered and the sync point is created, so no-op + // IO events beyond the current image extents since under normal conditions + // it wouldn't have been recorded in the journal + ldout(cct, 5) << ": no-op IO event beyond image size" << dendl; + aio_comp->get(); + aio_comp->set_request_count(0); + aio_comp->put(); + return true; + } + + return false; +} + +} // namespace journal +} // namespace librbd + +template class librbd::journal::Replay<librbd::ImageCtx>; |